diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index 4e8a1794f50a..c39fe782ba67 100644
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig
@@ -140,4 +140,6 @@ source "drivers/staging/netlogic/Kconfig"
 
 source "drivers/staging/dwc2/Kconfig"
 
+source "drivers/staging/lustre/Kconfig"
+
 endif # STAGING
diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
index 415772ea306d..110c59754dda 100644
--- a/drivers/staging/Makefile
+++ b/drivers/staging/Makefile
@@ -62,3 +62,4 @@ obj-$(CONFIG_FIREWIRE_SERIAL)	+= fwserial/
 obj-$(CONFIG_ZCACHE)		+= zcache/
 obj-$(CONFIG_GOLDFISH)		+= goldfish/
 obj-$(CONFIG_USB_DWC2)		+= dwc2/
+obj-$(CONFIG_LUSTRE_FS)		+= lustre/
diff --git a/drivers/staging/lustre/Kconfig b/drivers/staging/lustre/Kconfig
new file mode 100644
index 000000000000..a224d88bf43d
--- /dev/null
+++ b/drivers/staging/lustre/Kconfig
@@ -0,0 +1,3 @@
+source "drivers/staging/lustre/lustre/Kconfig"
+
+source "drivers/staging/lustre/lnet/Kconfig"
diff --git a/drivers/staging/lustre/Makefile b/drivers/staging/lustre/Makefile
new file mode 100644
index 000000000000..26162893fd20
--- /dev/null
+++ b/drivers/staging/lustre/Makefile
@@ -0,0 +1,4 @@
+subdir-ccflags-y := -I$(src)/include/
+
+obj-$(CONFIG_LUSTRE_FS)		+= lustre/
+obj-$(CONFIG_LNET)		+= lnet/
diff --git a/drivers/staging/lustre/TODO b/drivers/staging/lustre/TODO
new file mode 100644
index 000000000000..22742d6d62a8
--- /dev/null
+++ b/drivers/staging/lustre/TODO
@@ -0,0 +1,13 @@
+* Possible remaining coding style fix.
+* Remove deadcode.
+* Separate client/server functionality. Functions only used by server can be
+  removed from client.
+* Clean up libcfs layer. Ideally we can remove include/linux/libcfs entirely.
+* Clean up CLIO layer. Lustre client readahead/writeback control needs to better
+  suit what the kernel provides.
+* Add documents in Documentation.
+* Other minor misc cleanups...
+
+Please send any patches to Greg Kroah-Hartman <greg@kroah.com>, Andreas Dilger
+<andreas.dilger@intel.com> and Peng Tao <tao.peng@emc.com>. CCing
+hpdd-discuss <hpdd-discuss@lists.01.org> would be great too.
diff --git a/drivers/staging/lustre/include/linux/libcfs/bitmap.h b/drivers/staging/lustre/include/linux/libcfs/bitmap.h
new file mode 100644
index 000000000000..3f1c37b4bb7a
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/bitmap.h
@@ -0,0 +1,111 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#ifndef _LIBCFS_BITMAP_H_
+#define _LIBCFS_BITMAP_H_
+
+/* Bitmap header; size is the number of bits tracked in data[]. */
+typedef struct {
+	int	     size;
+	unsigned long   data[0];
+} cfs_bitmap_t;
+
+#define CFS_BITMAP_SIZE(nbits) \
+     ((((nbits)/BITS_PER_LONG)+1)*sizeof(long)+sizeof(cfs_bitmap_t))
+
+/* Allocate a bitmap for @size bits; NULL when OBD_ALLOC leaves ptr NULL. */
+static inline cfs_bitmap_t *CFS_ALLOCATE_BITMAP(int size)
+{
+	cfs_bitmap_t *ptr;
+
+	OBD_ALLOC(ptr, CFS_BITMAP_SIZE(size));
+
+	/* only a successful allocation has a size field to fill in */
+	if (ptr != NULL)
+		ptr->size = size;
+
+	RETURN(ptr);
+}
+
+#define CFS_FREE_BITMAP(ptr)	OBD_FREE(ptr, CFS_BITMAP_SIZE((ptr)->size))
+
+static inline
+void cfs_bitmap_set(cfs_bitmap_t *bitmap, int nbit)
+{
+	set_bit(nbit, bitmap->data);
+}
+
+static inline
+void cfs_bitmap_clear(cfs_bitmap_t *bitmap, int nbit)
+{
+	test_and_clear_bit(nbit, bitmap->data);
+}
+
+static inline
+int cfs_bitmap_check(cfs_bitmap_t *bitmap, int nbit)
+{
+	return test_bit(nbit, bitmap->data);
+}
+
+static inline
+int cfs_bitmap_test_and_clear(cfs_bitmap_t *bitmap, int nbit)
+{
+	return test_and_clear_bit(nbit, bitmap->data);
+}
+
+/* return non-zero if the bitmap has no bits set, 0 otherwise */
+static inline
+int cfs_bitmap_check_empty(cfs_bitmap_t *bitmap)
+{
+	return find_first_bit(bitmap->data, bitmap->size) == bitmap->size;
+}
+
+/* Copy the bits of @old into @new; @new keeps its own (>= old) size. */
+static inline void cfs_bitmap_copy(cfs_bitmap_t *new, cfs_bitmap_t *old)
+{
+	int newsize = new->size;
+
+	LASSERT(new->size >= old->size);
+	/* the memcpy() below also overwrites new->size, so restore it */
+	memcpy(new, old, CFS_BITMAP_SIZE(old->size));
+	new->size = newsize;
+}
+
+#define cfs_foreach_bit(bitmap, pos)					\
+	for ((pos) = find_first_bit((bitmap)->data, (bitmap)->size);	\
+	     (pos) < (bitmap)->size;					\
+	     (pos) = find_next_bit((bitmap)->data, (bitmap)->size, (pos) + 1))
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/curproc.h b/drivers/staging/lustre/include/linux/libcfs/curproc.h
new file mode 100644
index 000000000000..90d7ce630e94
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/curproc.h
@@ -0,0 +1,110 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/curproc.h
+ *
+ * Lustre curproc API declaration
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#ifndef __LIBCFS_CURPROC_H__
+#define __LIBCFS_CURPROC_H__
+
+/*
+ * Portable API to access common characteristics of "current" UNIX process.
+ *
+ * Implemented in portals/include/libcfs/<os>/
+ */
+int    cfs_curproc_groups_nr(void);
+int    current_is_in_group(gid_t group);
+void   cfs_curproc_groups_dump(gid_t *array, int size);
+
+/*
+ * Plus, platform-specific constant
+ *
+ * CFS_CURPROC_COMM_MAX,
+ *
+ * and opaque scalar type
+ *
+ * kernel_cap_t
+ */
+
+/* check if task is running in compat mode.*/
+int current_is_32bit(void);
+#define current_pid()		(current->pid)
+#define current_comm()		(current->comm)
+int cfs_get_environ(const char *key, char *value, int *val_len);
+
+typedef __u32 cfs_cap_t;
+
+#define CFS_CAP_CHOWN		   0
+#define CFS_CAP_DAC_OVERRIDE	    1
+#define CFS_CAP_DAC_READ_SEARCH	 2
+#define CFS_CAP_FOWNER		  3
+#define CFS_CAP_FSETID		  4
+#define CFS_CAP_LINUX_IMMUTABLE	 9
+#define CFS_CAP_SYS_ADMIN	      21
+#define CFS_CAP_SYS_BOOT	       23
+#define CFS_CAP_SYS_RESOURCE	   24
+
+#define CFS_CAP_FS_MASK ((1 << CFS_CAP_CHOWN) |		 \
+			 (1 << CFS_CAP_DAC_OVERRIDE) |	  \
+			 (1 << CFS_CAP_DAC_READ_SEARCH) |       \
+			 (1 << CFS_CAP_FOWNER) |		\
+			 (1 << CFS_CAP_FSETID ) |	       \
+			 (1 << CFS_CAP_LINUX_IMMUTABLE) |       \
+			 (1 << CFS_CAP_SYS_ADMIN) |	     \
+			 (1 << CFS_CAP_SYS_BOOT) |	      \
+			 (1 << CFS_CAP_SYS_RESOURCE))
+
+void cfs_cap_raise(cfs_cap_t cap);
+void cfs_cap_lower(cfs_cap_t cap);
+int cfs_cap_raised(cfs_cap_t cap);
+cfs_cap_t cfs_curproc_cap_pack(void);
+void cfs_curproc_cap_unpack(cfs_cap_t cap);
+int cfs_capable(cfs_cap_t cap);
+
+/* __LIBCFS_CURPROC_H__ */
+#endif
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 80
+ * scroll-step: 1
+ * End:
+ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs.h b/drivers/staging/lustre/include/linux/libcfs/libcfs.h
new file mode 100644
index 000000000000..6dd5a7d27827
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs.h
@@ -0,0 +1,286 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_LIBCFS_H__
+#define __LIBCFS_LIBCFS_H__
+
+#if !__GNUC__
+#define __attribute__(x)
+#endif
+
+#include <linux/libcfs/linux/libcfs.h>
+
+#include "curproc.h"
+
+#ifndef offsetof
+# define offsetof(typ,memb) ((long)(long_ptr_t)((char *)&(((typ *)0)->memb)))
+#endif
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(a) ((sizeof (a)) / (sizeof ((a)[0])))
+#endif
+
+#if !defined(swap)
+#define swap(x,y) do { typeof(x) __tmp = (x); (x) = (y); (y) = __tmp; } while (0)
+#endif
+
+#if !defined(container_of)
+/* given a pointer @ptr to the field @member embedded into type (usually
+ * struct) @type, return pointer to the embedding instance of @type. */
+#define container_of(ptr, type, member) \
+	((type *)((char *)(ptr)-(char *)(&((type *)0)->member)))
+#endif
+
+static inline int __is_po2(unsigned long long val)
+{
+	return !(val & (val - 1));
+}
+
+#define IS_PO2(val) __is_po2((unsigned long long)(val))
+
+#define LOWEST_BIT_SET(x)       ((x) & ~((x) - 1))
+
+/*
+ * Lustre Error Checksum: calculates a 4-bit checksum of a hex number
+ * by XORing its three lowest nibbles (hex digits) together.
+ */
+#define LERRCHKSUM(hexnum) (((hexnum) & 0xf) ^ ((hexnum) >> 4 & 0xf) ^ \
+			   ((hexnum) >> 8 & 0xf))
+
+
+/*
+ * Some (nomina odiosa sunt) platforms define NULL as naked 0. This confuses
+ * Lustre RETURN(NULL) macro.
+ */
+#if defined(NULL)
+#undef NULL
+#endif
+
+#define NULL ((void *)0)
+
+#define LUSTRE_SRV_LNET_PID      LUSTRE_LNET_PID
+
+
+#include <linux/list.h>
+
+#ifndef cfs_for_each_possible_cpu
+#  error cfs_for_each_possible_cpu is not supported by kernel!
+#endif
+
+/* libcfs tcpip */
+int libcfs_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask);
+int libcfs_ipif_enumerate(char ***names);
+void libcfs_ipif_free_enumeration(char **names, int n);
+int libcfs_sock_listen(socket_t **sockp, __u32 ip, int port, int backlog);
+int libcfs_sock_accept(socket_t **newsockp, socket_t *sock);
+void libcfs_sock_abort_accept(socket_t *sock);
+int libcfs_sock_connect(socket_t **sockp, int *fatal,
+			__u32 local_ip, int local_port,
+			__u32 peer_ip, int peer_port);
+int libcfs_sock_setbuf(socket_t *socket, int txbufsize, int rxbufsize);
+int libcfs_sock_getbuf(socket_t *socket, int *txbufsize, int *rxbufsize);
+int libcfs_sock_getaddr(socket_t *socket, int remote, __u32 *ip, int *port);
+int libcfs_sock_write(socket_t *sock, void *buffer, int nob, int timeout);
+int libcfs_sock_read(socket_t *sock, void *buffer, int nob, int timeout);
+void libcfs_sock_release(socket_t *sock);
+
+/* libcfs watchdogs */
+struct lc_watchdog;
+
+/* Add a watchdog which fires after "time" milliseconds of delay.  You have to
+ * touch it once to enable it. */
+struct lc_watchdog *lc_watchdog_add(int time,
+				    void (*cb)(pid_t pid, void *),
+				    void *data);
+
+/* Enables a watchdog and resets its timer. */
+void lc_watchdog_touch(struct lc_watchdog *lcw, int timeout);
+#define CFS_GET_TIMEOUT(svc) (max_t(int, obd_timeout,		   \
+			  AT_OFF ? 0 : at_get(&(svc)->srv_at_estimate)) * \
+			  (svc)->srv_watchdog_factor)
+
+/* Disable a watchdog; touch it to restart it. */
+void lc_watchdog_disable(struct lc_watchdog *lcw);
+
+/* Clean up the watchdog */
+void lc_watchdog_delete(struct lc_watchdog *lcw);
+
+/* Dump a debug log */
+void lc_watchdog_dumplog(pid_t pid, void *data);
+
+
+/* need both kernel and user-land acceptor */
+#define LNET_ACCEPTOR_MIN_RESERVED_PORT    512
+#define LNET_ACCEPTOR_MAX_RESERVED_PORT    1023
+
+/*
+ * libcfs pseudo device operations
+ *
+ * struct psdev_t and
+ * misc_register() and
+ * misc_deregister() are declared in
+ * libcfs/<os>/<os>-prim.h
+ *
+ * It's just draft now.
+ */
+
+struct cfs_psdev_file {
+	unsigned long   off;
+	void	    *private_data;
+	unsigned long   reserved1;
+	unsigned long   reserved2;
+};
+
+struct cfs_psdev_ops {
+	int (*p_open)(unsigned long, void *);
+	int (*p_close)(unsigned long, void *);
+	int (*p_read)(struct cfs_psdev_file *, char *, unsigned long);
+	int (*p_write)(struct cfs_psdev_file *, char *, unsigned long);
+	int (*p_ioctl)(struct cfs_psdev_file *, unsigned long, void *);
+};
+
+/*
+ * Drop into debugger, if possible. Implementation is provided by platform.
+ */
+
+void cfs_enter_debugger(void);
+
+/*
+ * Defined by platform
+ */
+int unshare_fs_struct(void);
+sigset_t cfs_get_blocked_sigs(void);
+sigset_t cfs_block_allsigs(void);
+sigset_t cfs_block_sigs(unsigned long sigs);
+sigset_t cfs_block_sigsinv(unsigned long sigs);
+void cfs_restore_sigs(sigset_t);
+int cfs_signal_pending(void);
+void cfs_clear_sigpending(void);
+
+int convert_server_error(__u64 ecode);
+int convert_client_oflag(int cflag, int *result);
+
+/*
+ * Stack-tracing filling.
+ */
+
+/*
+ * Platform-dependent data-type to hold stack frames.
+ */
+struct cfs_stack_trace;
+
+/*
+ * Fill @trace with current back-trace.
+ */
+void cfs_stack_trace_fill(struct cfs_stack_trace *trace);
+
+/*
+ * Return instruction pointer for frame @frame_no. NULL if @frame_no is
+ * invalid.
+ */
+void *cfs_stack_trace_frame(struct cfs_stack_trace *trace, int frame_no);
+
+#ifndef O_NOACCESS
+#define O_NOACCESS O_NONBLOCK
+#endif
+
+/*
+ * Universal open flags.
+ */
+#define CFS_O_NOACCESS	  0003
+#define CFS_O_ACCMODE	   CFS_O_NOACCESS
+#define CFS_O_CREAT	     0100
+#define CFS_O_EXCL	      0200
+#define CFS_O_NOCTTY	    0400
+#define CFS_O_TRUNC	     01000
+#define CFS_O_APPEND	    02000
+#define CFS_O_NONBLOCK	  04000
+#define CFS_O_NDELAY	    CFS_O_NONBLOCK
+#define CFS_O_SYNC	      010000
+#define CFS_O_ASYNC	     020000
+#define CFS_O_DIRECT	    040000
+#define CFS_O_LARGEFILE	 0100000
+#define CFS_O_DIRECTORY	 0200000
+#define CFS_O_NOFOLLOW	  0400000
+#define CFS_O_NOATIME	   01000000
+
+/* convert local open flags to universal open flags */
+int cfs_oflags2univ(int flags);
+/* convert universal open flags to local open flags */
+int cfs_univ2oflags(int flags);
+
+/*
+ * Random number handling
+ */
+
+/* returns a random 32-bit integer */
+unsigned int cfs_rand(void);
+/* seed the generator */
+void cfs_srand(unsigned int, unsigned int);
+void cfs_get_random_bytes(void *buf, int size);
+
+#include <linux/libcfs/libcfs_debug.h>
+#include <linux/libcfs/libcfs_cpu.h>
+#include <linux/libcfs/libcfs_private.h>
+#include <linux/libcfs/libcfs_ioctl.h>
+#include <linux/libcfs/libcfs_prim.h>
+#include <linux/libcfs/libcfs_time.h>
+#include <linux/libcfs/libcfs_string.h>
+#include <linux/libcfs/libcfs_kernelcomm.h>
+#include <linux/libcfs/libcfs_workitem.h>
+#include <linux/libcfs/libcfs_hash.h>
+#include <linux/libcfs/libcfs_heap.h>
+#include <linux/libcfs/libcfs_fail.h>
+#include <linux/libcfs/params_tree.h>
+#include <linux/libcfs/libcfs_crypto.h>
+
+/* container_of depends on "likely" which is defined in libcfs_private.h */
+static inline void *__container_of(void *ptr, unsigned long shift)
+{
+	/* error pointers and NULL are passed through untouched */
+	if (unlikely(IS_ERR(ptr) || ptr == NULL))
+		return ptr;
+	return (char *)ptr - shift;
+}
+
+#define container_of0(ptr, type, member) \
+	((type *)__container_of((void *)(ptr), offsetof(type, member)))
+
+#define SET_BUT_UNUSED(a) do { } while(sizeof(a) - sizeof(a))
+
+#define _LIBCFS_H
+
+#endif /* __LIBCFS_LIBCFS_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h
new file mode 100644
index 000000000000..6ae7415a3b99
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h
@@ -0,0 +1,214 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_cpu.h
+ *
+ * CPU partition
+ *   . CPU partition is virtual processing unit
+ *
+ *   . CPU partition can present 1-N cores, or 1-N NUMA nodes,
+ *     in other words, CPU partition is a processors pool.
+ *
+ * CPU Partition Table (CPT)
+ *   . a set of CPU partitions
+ *
+ *   . There are two modes for CPT: CFS_CPU_MODE_NUMA and CFS_CPU_MODE_SMP
+ *
+ *   . User can specify total number of CPU partitions while creating a
+ *     CPT; CPU partition IDs always start from 0.
+ *
+ *     Example: if there are 8 cores on the system, while creating a CPT
+ *     with cpu_npartitions=4:
+ *	      core[0, 1] = partition[0], core[2, 3] = partition[1]
+ *	      core[4, 5] = partition[2], core[6, 7] = partition[3]
+ *
+ *	  cpu_npartitions=1:
+ *	      core[0, 1, ... 7] = partition[0]
+ *
+ *   . User can also specify CPU partitions by string pattern
+ *
+ *     Examples: cpu_partitions="0[0,1], 1[2,3]"
+ *	       cpu_partitions="N 0[0-3], 1[4-8]"
+ *
+ *     The first character "N" means following numbers are numa ID
+ *
+ *   . NUMA allocators, CPU affinity threads are built over CPU partitions,
+ *     instead of HW CPUs or HW nodes.
+ *
+ *   . By default, Lustre modules should refer to the global cfs_cpt_table,
+ *     instead of accessing HW CPUs directly, so concurrency of Lustre can be
+ *     configured by cpu_npartitions of the global cfs_cpt_table
+ *
+ *   . If cpu_npartitions=1(all CPUs in one pool), lustre should work the
+ *     same way as 2.2 or earlier versions
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#ifndef __LIBCFS_CPU_H__
+#define __LIBCFS_CPU_H__
+
+#ifndef HAVE_LIBCFS_CPT
+
+typedef unsigned long		cpumask_t;
+typedef unsigned long		nodemask_t;
+
+struct cfs_cpt_table {
+	/* # of CPU partitions */
+	int			ctb_nparts;
+	/* cpu mask */
+	cpumask_t		ctb_mask;
+	/* node mask */
+	nodemask_t		ctb_nodemask;
+	/* version */
+	__u64			ctb_version;
+};
+
+#endif /* !HAVE_LIBCFS_CPT */
+
+/* any CPU partition */
+#define CFS_CPT_ANY		(-1)
+
+extern struct cfs_cpt_table	*cfs_cpt_table;
+
+/**
+ * destroy a CPU partition table
+ */
+void cfs_cpt_table_free(struct cfs_cpt_table *cptab);
+/**
+ * create a cfs_cpt_table with \a ncpt number of partitions
+ */
+struct cfs_cpt_table *cfs_cpt_table_alloc(unsigned int ncpt);
+/**
+ * print string information of cpt-table
+ */
+int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len);
+/**
+ * return total number of CPU partitions in \a cptab
+ */
+int
+cfs_cpt_number(struct cfs_cpt_table *cptab);
+/**
+ * return number of HW cores or hyper-threads in a CPU partition \a cpt
+ */
+int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * is there any online CPU in CPU partition \a cpt
+ */
+int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * return cpumask of CPU partition \a cpt
+ */
+cpumask_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * return nodemask of CPU partition \a cpt
+ */
+nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * shadow current HW processor ID to CPU-partition ID of \a cptab
+ */
+int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap);
+/**
+ * shadow HW processor ID \a CPU to CPU-partition ID by \a cptab
+ */
+int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu);
+/**
+ * bind current thread on a CPU-partition \a cpt of \a cptab
+ */
+int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * add \a cpu to CPU partition \a cpt of \a cptab, return 1 for success,
+ * otherwise 0 is returned
+ */
+int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu);
+/**
+ * remove \a cpu from CPU partition \a cpt of \a cptab
+ */
+void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu);
+/**
+ * add all cpus in \a mask to CPU partition \a cpt
+ * return 1 if successfully set all CPUs, otherwise return 0
+ */
+int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab,
+			int cpt, cpumask_t *mask);
+/**
+ * remove all cpus in \a mask from CPU partition \a cpt
+ */
+void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab,
+			   int cpt, cpumask_t *mask);
+/**
+ * add all cpus in NUMA node \a node to CPU partition \a cpt
+ * return 1 if successfully set all CPUs, otherwise return 0
+ */
+int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node);
+/**
+ * remove all cpus in NUMA node \a node from CPU partition \a cpt
+ */
+void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node);
+
+/**
+ * add all cpus in node mask \a mask to CPU partition \a cpt
+ * return 1 if successfully set all CPUs, otherwise return 0
+ */
+int cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab,
+			 int cpt, nodemask_t *mask);
+/**
+ * remove all cpus in node mask \a mask from CPU partition \a cpt
+ */
+void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab,
+			    int cpt, nodemask_t *mask);
+/**
+ * unset all cpus for CPU partition \a cpt
+ */
+void cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * convert partition id \a cpt to numa node id, if there are more than one
+ * nodes in this partition, it might return a different node id each time.
+ */
+int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt);
+
+/**
+ * iterate over all CPU partitions in \a cptab
+ */
+#define cfs_cpt_for_each(i, cptab)	\
+	for ((i) = 0; (i) < cfs_cpt_number(cptab); (i)++)
+
+#ifndef __read_mostly
+# define __read_mostly
+#endif
+
+#ifndef ____cacheline_aligned
+#define ____cacheline_aligned
+#endif
+
+int  cfs_cpu_init(void);
+void cfs_cpu_fini(void);
+
+#endif /* __LIBCFS_CPU_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h
new file mode 100644
index 000000000000..64ca62f0cc93
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h
@@ -0,0 +1,201 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ */
+
+#ifndef _LIBCFS_CRYPTO_H
+#define _LIBCFS_CRYPTO_H
+
+struct cfs_crypto_hash_type {
+	char		*cht_name;      /**< hash algorithm name, equal to
+					 * format name for crypto api */
+	unsigned int    cht_key;	/**< init key by default (valid for
+					 * 4 bytes context like crc32, adler) */
+	unsigned int    cht_size;       /**< hash digest size */
+};
+
+enum cfs_crypto_hash_alg {
+	CFS_HASH_ALG_NULL       = 0,
+	CFS_HASH_ALG_ADLER32,
+	CFS_HASH_ALG_CRC32,
+	CFS_HASH_ALG_MD5,
+	CFS_HASH_ALG_SHA1,
+	CFS_HASH_ALG_SHA256,
+	CFS_HASH_ALG_SHA384,
+	CFS_HASH_ALG_SHA512,
+	CFS_HASH_ALG_CRC32C,
+	CFS_HASH_ALG_MAX
+};
+
+static struct cfs_crypto_hash_type hash_types[] = {
+	[CFS_HASH_ALG_NULL]    = { "null",     0,      0 },
+	[CFS_HASH_ALG_ADLER32] = { "adler32",  1,      4 },
+	[CFS_HASH_ALG_CRC32]   = { "crc32",   ~0,      4 },
+	[CFS_HASH_ALG_CRC32C]  = { "crc32c",  ~0,      4 },
+	[CFS_HASH_ALG_MD5]     = { "md5",      0,     16 },
+	[CFS_HASH_ALG_SHA1]    = { "sha1",     0,     20 },
+	[CFS_HASH_ALG_SHA256]  = { "sha256",   0,     32 },
+	[CFS_HASH_ALG_SHA384]  = { "sha384",   0,     48 },
+	[CFS_HASH_ALG_SHA512]  = { "sha512",   0,     64 },
+};
+
+/**
+ * Look up the descriptor of hash algorithm \a hash_alg.
+ * NULL is returned for an out-of-range id or a table slot without a name.
+ */
+static inline const struct cfs_crypto_hash_type *
+		    cfs_crypto_hash_type(unsigned char hash_alg)
+{
+	if (hash_alg >= CFS_HASH_ALG_MAX)
+		return NULL;
+	if (hash_types[hash_alg].cht_name == NULL)
+		return NULL;
+	return &hash_types[hash_alg];
+}
+
+/**
+ * Map a hash algorithm identifier to its name.
+ * Identifiers rejected by cfs_crypto_hash_type() yield "unknown".
+ */
+static inline const char *cfs_crypto_hash_name(unsigned char hash_alg)
+{
+	const struct cfs_crypto_hash_type *type;
+
+	type = cfs_crypto_hash_type(hash_alg);
+	return type != NULL ? type->cht_name : "unknown";
+}
+
+/**
+ * Map a hash algorithm identifier to its digest size.
+ * Identifiers rejected by cfs_crypto_hash_type() yield 0.
+ */
+static inline int cfs_crypto_hash_digestsize(unsigned char hash_alg)
+{
+	const struct cfs_crypto_hash_type *type;
+
+	type = cfs_crypto_hash_type(hash_alg);
+	return type != NULL ? type->cht_size : 0;
+}
+
+/**     Map a hash algorithm name to its identifier, or 0xFF if none matches */
+static inline unsigned char cfs_crypto_hash_alg(const char *algname)
+{
+	unsigned char   alg;
+
+	for (alg = 0; alg < CFS_HASH_ALG_MAX; alg++)
+		if (strcmp(hash_types[alg].cht_name, algname) == 0)
+			return alg;
+	return 0xFF;
+}
+
+/**     Calculate hash digest for buffer.
+ *      @param alg	    id of hash algorithm
+ *      @param buf	    buffer of data
+ *      @param buf_len	buffer len
+ *      @param key	    initial value for algorithm, if it is NULL,
+ *			    default initial value should be used.
+ *      @param key_len	len of initial value
+ *      @param hash	   [out] pointer to hash, if it is NULL, hash_len is
+ *			    set to valid digest size in bytes, retval -ENOSPC.
+ *      @param hash_len       [in,out] size of hash buffer
+ *      @returns	      status of operation
+ *      @retval -EINVAL       if buf, buf_len, hash_len or alg_id is invalid
+ *      @retval -ENODEV       if this algorithm is unsupported
+ *      @retval -ENOSPC       if pointer to hash is NULL, or hash_len less than
+ *			    digest size
+ *      @retval 0	     for success
+ *      @retval < 0	   other errors from lower layers.
+ */
+int cfs_crypto_hash_digest(unsigned char alg,
+			   const void *buf, unsigned int buf_len,
+			   unsigned char *key, unsigned int key_len,
+			   unsigned char *hash, unsigned int *hash_len);
+
+/* cfs crypto hash descriptor */
+struct cfs_crypto_hash_desc;
+
+/**     Allocate and initialize descriptor for hash algorithm.
+ *      @param alg	    algorithm id
+ *      @param key	    initial value for algorithm, if it is NULL,
+ *			    default initial value should be used.
+ *      @param key_len	len of initial value
+ *      @returns	      pointer to descriptor of hash instance
+ *      @retval ERR_PTR(error) when errors occurred.
+ */
+struct cfs_crypto_hash_desc*
+	cfs_crypto_hash_init(unsigned char alg,
+			     unsigned char *key, unsigned int key_len);
+
+/**    Update digest by part of data.
+ *     @param desc	      hash descriptor
+ *     @param page	      data page
+ *     @param offset	    data offset
+ *     @param len	       data len
+ *     @returns		 status of operation
+ *     @retval 0		for success.
+ */
+int cfs_crypto_hash_update_page(struct cfs_crypto_hash_desc *desc,
+				struct page *page, unsigned int offset,
+				unsigned int len);
+
+/**    Update digest by part of data.
+ *     @param desc	      hash descriptor
+ *     @param buf	       pointer to data buffer
+ *     @param buf_len	   size of data at buffer
+ *     @returns		 status of operation
+ *     @retval 0		for success.
+ */
+int cfs_crypto_hash_update(struct cfs_crypto_hash_desc *desc, const void *buf,
+			   unsigned int buf_len);
+
+/**    Finalize hash calculation, copy hash digest to buffer, destroy hash
+ *     descriptor.
+ *     @param desc	      hash descriptor
+ *     @param hash	      buffer pointer to store hash digest
+ *     @param hash_len	  pointer to hash buffer size, if NULL
+ *			      destroy hash descriptor
+ *     @returns		 status of operation
+ *     @retval -ENOSPC	  if hash is NULL, or *hash_len less than
+ *			      digest size
+ *     @retval 0		for success
+ *     @retval < 0	      other errors from lower layers.
+ */
+int cfs_crypto_hash_final(struct cfs_crypto_hash_desc *desc,
+			  unsigned char *hash, unsigned int *hash_len);
+/**
+ *      Register crypto hash algorithms
+ */
+int cfs_crypto_register(void);
+
+/**
+ *      Unregister
+ */
+void cfs_crypto_unregister(void);
+
+/**     Return hash speed in Mbytes per second for valid hash algorithm
+ *      identifier. If the test was unsuccessful, -1 is returned.
+ */
+int cfs_crypto_hash_speed(unsigned char hash_alg);
+#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h
new file mode 100644
index 000000000000..dd8ac2f52c9f
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h
@@ -0,0 +1,350 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_debug.h
+ *
+ * Debug messages and assertions
+ *
+ */
+
+#ifndef __LIBCFS_DEBUG_H__
+#define __LIBCFS_DEBUG_H__
+
+/*
+ *  Debugging
+ */
+extern unsigned int libcfs_subsystem_debug;
+extern unsigned int libcfs_stack;
+extern unsigned int libcfs_debug;
+extern unsigned int libcfs_printk;
+extern unsigned int libcfs_console_ratelimit;
+extern unsigned int libcfs_watchdog_ratelimit;
+extern unsigned int libcfs_console_max_delay;
+extern unsigned int libcfs_console_min_delay;
+extern unsigned int libcfs_console_backoff;
+extern unsigned int libcfs_debug_binary;
+extern char libcfs_debug_file_path_arr[PATH_MAX];
+
+int libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys);
+int libcfs_debug_str2mask(int *mask, const char *str, int is_subsys);
+
+/* Has there been an LBUG? */
+extern unsigned int libcfs_catastrophe;
+extern unsigned int libcfs_panic_on_lbug;
+
+/**
+ * Format for debug message headers (packed: no padding between the fields)
+ */
+struct ptldebug_header {
+	__u32 ph_len;
+	__u32 ph_flags;
+	__u32 ph_subsys;
+	__u32 ph_mask;
+	__u16 ph_cpu_id;
+	__u16 ph_type;
+	__u32 ph_sec;
+	__u64 ph_usec;
+	__u32 ph_stack;
+	__u32 ph_pid;
+	__u32 ph_extern_pid;
+	__u32 ph_line_num;
+} __attribute__((packed));
+
+
+#define PH_FLAG_FIRST_RECORD 1
+
+/* Debugging subsystems (32 bits, non-overlapping) */
+/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */
+#define S_UNDEFINED   0x00000001
+#define S_MDC	 0x00000002
+#define S_MDS	 0x00000004
+#define S_OSC	 0x00000008
+#define S_OST	 0x00000010
+#define S_CLASS       0x00000020
+#define S_LOG	 0x00000040
+#define S_LLITE       0x00000080
+#define S_RPC	 0x00000100
+#define S_MGMT	0x00000200
+#define S_LNET	0x00000400
+#define S_LND	 0x00000800 /* ALL LNDs */
+#define S_PINGER      0x00001000
+#define S_FILTER      0x00002000
+/* unused */
+#define S_ECHO	0x00008000
+#define S_LDLM	0x00010000
+#define S_LOV	 0x00020000
+#define S_LQUOTA      0x00040000
+#define S_OSD		0x00080000
+/* unused */
+/* unused */
+/* unused */
+#define S_LMV	 0x00800000 /* b_new_cmd */
+/* unused */
+#define S_SEC	 0x02000000 /* upcall cache */
+#define S_GSS	 0x04000000 /* b_new_cmd */
+/* unused */
+#define S_MGC	 0x10000000
+#define S_MGS	 0x20000000
+#define S_FID	 0x40000000 /* b_new_cmd */
+#define S_FLD	 0x80000000 /* b_new_cmd */
+/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */
+
+/* Debugging masks (32 bits, non-overlapping) */
+/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */
+#define D_TRACE       0x00000001 /* ENTRY/EXIT markers */
+#define D_INODE       0x00000002
+#define D_SUPER       0x00000004
+#define D_EXT2	0x00000008 /* anything from ext2_debug */
+#define D_MALLOC      0x00000010 /* print malloc, free information */
+#define D_CACHE       0x00000020 /* cache-related items */
+#define D_INFO	0x00000040 /* general information */
+#define D_IOCTL       0x00000080 /* ioctl related information */
+#define D_NETERROR    0x00000100 /* network errors */
+#define D_NET	 0x00000200 /* network communications */
+#define D_WARNING     0x00000400 /* CWARN(...) == CDEBUG (D_WARNING, ...) */
+#define D_BUFFS       0x00000800
+#define D_OTHER       0x00001000
+#define D_DENTRY      0x00002000
+#define D_NETTRACE    0x00004000
+#define D_PAGE	0x00008000 /* bulk page handling */
+#define D_DLMTRACE    0x00010000
+#define D_ERROR       0x00020000 /* CERROR(...) == CDEBUG (D_ERROR, ...) */
+#define D_EMERG       0x00040000 /* CEMERG(...) == CDEBUG (D_EMERG, ...) */
+#define D_HA	  0x00080000 /* recovery and failover */
+#define D_RPCTRACE    0x00100000 /* for distributed debugging */
+#define D_VFSTRACE    0x00200000
+#define D_READA       0x00400000 /* read-ahead */
+#define D_MMAP	0x00800000
+#define D_CONFIG      0x01000000
+#define D_CONSOLE     0x02000000
+#define D_QUOTA       0x04000000
+#define D_SEC	 0x08000000
+#define D_LFSCK	      0x10000000 /* For both OI scrub and LFSCK */
+/* keep these in sync with lnet/{utils,libcfs}/debug.c */
+
+#define D_HSM	 D_TRACE
+
+#define D_CANTMASK   (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE)
+
+#ifndef DEBUG_SUBSYSTEM
+# define DEBUG_SUBSYSTEM S_UNDEFINED
+#endif
+
+#define CDEBUG_DEFAULT_MAX_DELAY (cfs_time_seconds(600))	 /* jiffies */
+#define CDEBUG_DEFAULT_MIN_DELAY ((cfs_time_seconds(1) + 1) / 2) /* jiffies */
+#define CDEBUG_DEFAULT_BACKOFF   2
+typedef struct {
+	cfs_time_t      cdls_next;
+	unsigned int    cdls_delay;
+	int	     cdls_count;
+} cfs_debug_limit_state_t;
+
+struct libcfs_debug_msg_data {
+	const char	       *msg_file;
+	const char	       *msg_fn;
+	int		      msg_subsys;
+	int		      msg_line;
+	int		      msg_mask;
+	cfs_debug_limit_state_t  *msg_cdls;
+};
+
+#define LIBCFS_DEBUG_MSG_DATA_INIT(data, mask, cdls)	\
+do {							\
+	(data)->msg_subsys = DEBUG_SUBSYSTEM;	       \
+	(data)->msg_file   = __FILE__;		      \
+	(data)->msg_fn     = __FUNCTION__;		  \
+	(data)->msg_line   = __LINE__;		      \
+	(data)->msg_cdls   = (cdls);			\
+	(data)->msg_mask   = (mask);			\
+} while (0)
+
+#define LIBCFS_DEBUG_MSG_DATA_DECL(dataname, mask, cdls)    \
+	static struct libcfs_debug_msg_data dataname = {    \
+	       .msg_subsys = DEBUG_SUBSYSTEM,	       \
+	       .msg_file   = __FILE__,		      \
+	       .msg_fn     = __FUNCTION__,		  \
+	       .msg_line   = __LINE__,		      \
+	       .msg_cdls   = (cdls)	 };	      \
+	dataname.msg_mask   = (mask);
+
+
+
+/**
+ * Non-zero if @mask hits D_CANTMASK or both @mask and @subsystem are enabled.
+ */
+static inline int cfs_cdebug_show(unsigned int mask, unsigned int subsystem)
+{
+	return (mask & D_CANTMASK) ? 1 :
+		((libcfs_debug & mask) && (libcfs_subsystem_debug & subsystem));
+}
+
+#define __CDEBUG(cdls, mask, format, ...)			       \
+do {								    \
+	static struct libcfs_debug_msg_data msgdata;		    \
+									\
+	CFS_CHECK_STACK(&msgdata, mask, cdls);			  \
+									\
+	if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {		   \
+		LIBCFS_DEBUG_MSG_DATA_INIT(&msgdata, mask, cdls);       \
+		libcfs_debug_msg(&msgdata, format, ## __VA_ARGS__);     \
+	}							       \
+} while (0)
+
+#define CDEBUG(mask, format, ...) __CDEBUG(NULL, mask, format, ## __VA_ARGS__)
+
+#define CDEBUG_LIMIT(mask, format, ...)	 \
+do {					    \
+	static cfs_debug_limit_state_t cdls;    \
+						\
+	__CDEBUG(&cdls, mask, format, ## __VA_ARGS__);\
+} while (0)
+
+
+
+
+#define CWARN(format, ...)	  CDEBUG_LIMIT(D_WARNING, format, ## __VA_ARGS__)
+#define CERROR(format, ...)	 CDEBUG_LIMIT(D_ERROR, format, ## __VA_ARGS__)
+#define CNETERR(format, ...)	CDEBUG_LIMIT(D_NETERROR, format, ## __VA_ARGS__)
+#define CEMERG(format, ...)	 CDEBUG_LIMIT(D_EMERG, format, ## __VA_ARGS__)
+
+#define LCONSOLE(mask, format, ...) CDEBUG(D_CONSOLE | (mask), format, ## __VA_ARGS__)
+#define LCONSOLE_INFO(format, ...)  CDEBUG_LIMIT(D_CONSOLE, format, ## __VA_ARGS__)
+#define LCONSOLE_WARN(format, ...)  CDEBUG_LIMIT(D_CONSOLE | D_WARNING, format, ## __VA_ARGS__)
+#define LCONSOLE_ERROR_MSG(errnum, format, ...) CDEBUG_LIMIT(D_CONSOLE | D_ERROR, \
+			   "%x-%x: " format, errnum, LERRCHKSUM(errnum), ## __VA_ARGS__)
+#define LCONSOLE_ERROR(format, ...) LCONSOLE_ERROR_MSG(0x00, format, ## __VA_ARGS__)
+
+#define LCONSOLE_EMERG(format, ...) CDEBUG(D_CONSOLE | D_EMERG, format, ## __VA_ARGS__)
+
+
+void libcfs_log_goto(struct libcfs_debug_msg_data *, const char *, long_ptr_t);
+#define GOTO(label, rc)						 \
+do {								    \
+	if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) {		\
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_TRACE, NULL);     \
+		libcfs_log_goto(&msgdata, #label, (long_ptr_t)(rc));    \
+	} else {							\
+		(void)(rc);					     \
+	}							       \
+	goto label;						     \
+} while (0)
+
+
+/*
+ * if rc == NULL, we need to code as RETURN((void *)NULL), otherwise
+ * there will be a warning in osx.
+ */
+#if defined(__GNUC__)
+
+long libcfs_log_return(struct libcfs_debug_msg_data *, long rc);
+#if BITS_PER_LONG > 32
+#define RETURN(rc)							\
+do {									\
+	EXIT_NESTING;							\
+	if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) {		\
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_TRACE, NULL);	\
+		return (typeof(rc))libcfs_log_return(&msgdata,		\
+						     (long)(rc));	\
+	}								\
+									\
+	return (rc);							\
+} while (0)
+#else /* BITS_PER_LONG == 32 */
+/* We need an on-stack variable, because we cannot cast a 32-bit pointer
+ * directly to (long long) without generating a compiler warning/error, yet
+ * casting directly to (long) will truncate 64-bit return values. The log
+ * values will print as 32-bit values, but they always have been. LU-1436
+ */
+#define RETURN(rc)							\
+do {									\
+	EXIT_NESTING;							\
+	if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) {		\
+		typeof(rc) __rc = (rc);					\
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_TRACE, NULL);	\
+		libcfs_log_return(&msgdata, (long_ptr_t)__rc);		\
+		return __rc;						\
+	}								\
+									\
+	return (rc);							\
+} while (0)
+#endif /* BITS_PER_LONG > 32 */
+
+#elif defined(_MSC_VER)
+#define RETURN(rc)						      \
+do {								    \
+	CDEBUG(D_TRACE, "Process leaving.\n");			  \
+	EXIT_NESTING;						   \
+	return (rc);						    \
+} while (0)
+#else
+# error "Unkown compiler"
+#endif /* __GNUC__ */
+
+#define ENTRY							   \
+ENTRY_NESTING;							  \
+do {								    \
+	CDEBUG(D_TRACE, "Process entered\n");			   \
+} while (0)
+
+#define EXIT							    \
+do {								    \
+	CDEBUG(D_TRACE, "Process leaving\n");			   \
+	EXIT_NESTING;						   \
+} while(0)
+
+#define RETURN_EXIT							\
+do {									\
+	EXIT;								\
+	return;								\
+} while (0)
+
+extern int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata,
+			    const char *format1, ...)
+	__attribute__ ((format (printf, 2, 3)));
+
+extern int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata,
+			      const char *format1,
+			      va_list args, const char *format2, ...)
+	__attribute__ ((format (printf, 4, 5)));
+
+/* other external symbols that tracefile provides: */
+extern int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob,
+				   const char *usr_buffer, int usr_buffer_nob);
+extern int cfs_trace_copyout_string(char *usr_buffer, int usr_buffer_nob,
+				    const char *knl_buffer, char *append);
+
+#define LIBCFS_DEBUG_FILE_PATH_DEFAULT "/tmp/lustre-log"
+
+#endif	/* __LIBCFS_DEBUG_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h
new file mode 100644
index 000000000000..8393c2703ce6
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h
@@ -0,0 +1,170 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please contact Oracle Corporation, Inc., 500 Oracle Parkway, Redwood Shores,
+ * CA 94065 USA or visit www.oracle.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Oracle Corporation, Inc.
+ */
+
+#ifndef _LIBCFS_FAIL_H
+#define _LIBCFS_FAIL_H
+
+extern unsigned long cfs_fail_loc;
+extern unsigned int cfs_fail_val;
+
+extern wait_queue_head_t cfs_race_waitq;
+extern int cfs_race_state;
+
+int __cfs_fail_check_set(__u32 id, __u32 value, int set);
+int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set);
+
+enum {
+	CFS_FAIL_LOC_NOSET      = 0,
+	CFS_FAIL_LOC_ORSET      = 1,
+	CFS_FAIL_LOC_RESET      = 2,
+	CFS_FAIL_LOC_VALUE      = 3
+};
+
+/* Failure injection control */
+#define CFS_FAIL_MASK_SYS    0x0000FF00
+#define CFS_FAIL_MASK_LOC   (0x000000FF | CFS_FAIL_MASK_SYS)
+
+#define CFS_FAILED_BIT       30
+/* CFS_FAILED is 0x40000000 */
+#define CFS_FAILED	  (1 << CFS_FAILED_BIT)
+
+#define CFS_FAIL_ONCE_BIT    31
+/* CFS_FAIL_ONCE is 0x80000000; 1U because 1 << 31 overflows a signed int */
+#define CFS_FAIL_ONCE       (1U << CFS_FAIL_ONCE_BIT)
+
+/* The following flags aren't made to be combined */
+#define CFS_FAIL_SKIP	0x20000000 /* skip N times then fail */
+#define CFS_FAIL_SOME	0x10000000 /* only fail N times */
+#define CFS_FAIL_RAND	0x08000000 /* fail 1/N of the times */
+#define CFS_FAIL_USR1	0x04000000 /* user flag */
+
+#define CFS_FAIL_PRECHECK(id) (cfs_fail_loc &&				\
+			      (cfs_fail_loc & CFS_FAIL_MASK_LOC) ==	   \
+			      ((id) & CFS_FAIL_MASK_LOC))
+
+static inline int cfs_fail_check_set(__u32 id, __u32 value,
+				     int set, int quiet)
+{
+	int hit = 0;
+
+	/* cheap pre-check first; the out-of-line helper decides a real hit */
+	if (unlikely(CFS_FAIL_PRECHECK(id)))
+		hit = __cfs_fail_check_set(id, value, set);
+	if (!hit)
+		return 0;
+
+	if (quiet)
+		CDEBUG(D_INFO, "*** cfs_fail_loc=%x, val=%u***\n", id, value);
+	else
+		LCONSOLE_INFO("*** cfs_fail_loc=%x, val=%u***\n", id, value);
+
+	return hit;
+}
+
+/* If id hit cfs_fail_loc, return 1, otherwise return 0 */
+#define CFS_FAIL_CHECK(id) \
+	cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET, 0)
+#define CFS_FAIL_CHECK_QUIET(id) \
+	cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET, 1)
+
+/* If id hit cfs_fail_loc and cfs_fail_val == (-1 or value) return 1,
+ * otherwise return 0 */
+#define CFS_FAIL_CHECK_VALUE(id, value) \
+	cfs_fail_check_set(id, value, CFS_FAIL_LOC_VALUE, 0)
+#define CFS_FAIL_CHECK_VALUE_QUIET(id, value) \
+	cfs_fail_check_set(id, value, CFS_FAIL_LOC_VALUE, 1)
+
+/* If id hit cfs_fail_loc, cfs_fail_loc |= value and return 1,
+ * otherwise return 0 */
+#define CFS_FAIL_CHECK_ORSET(id, value) \
+	cfs_fail_check_set(id, value, CFS_FAIL_LOC_ORSET, 0)
+#define CFS_FAIL_CHECK_ORSET_QUIET(id, value) \
+	cfs_fail_check_set(id, value, CFS_FAIL_LOC_ORSET, 1)
+
+/* If id hit cfs_fail_loc, cfs_fail_loc = value and return 1,
+ * otherwise return 0 */
+#define CFS_FAIL_CHECK_RESET(id, value) \
+	cfs_fail_check_set(id, value, CFS_FAIL_LOC_RESET, 0)
+#define CFS_FAIL_CHECK_RESET_QUIET(id, value) \
+	cfs_fail_check_set(id, value, CFS_FAIL_LOC_RESET, 1)
+
+static inline int cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set)
+{
+	/* 0 unless the pre-check matches @id; then the helper's result */
+	int matched = unlikely(CFS_FAIL_PRECHECK(id));
+
+	return matched ? __cfs_fail_timeout_set(id, value, ms, set) : 0;
+}
+
+/* If id hit cfs_fail_loc, sleep for seconds or milliseconds */
+#define CFS_FAIL_TIMEOUT(id, secs) \
+	cfs_fail_timeout_set(id, 0, (secs) * 1000, CFS_FAIL_LOC_NOSET)
+
+#define CFS_FAIL_TIMEOUT_MS(id, ms) \
+	cfs_fail_timeout_set(id, 0, ms, CFS_FAIL_LOC_NOSET)
+
+/* If id hit cfs_fail_loc, cfs_fail_loc |= value and
+ * sleep seconds or milliseconds */
+#define CFS_FAIL_TIMEOUT_ORSET(id, value, secs) \
+	cfs_fail_timeout_set(id, value, (secs) * 1000, CFS_FAIL_LOC_ORSET)
+
+#define CFS_FAIL_TIMEOUT_MS_ORSET(id, value, ms) \
+	cfs_fail_timeout_set(id, value, ms, CFS_FAIL_LOC_ORSET)
+
+/* The idea here is to synchronise two threads to force a race. The
+ * first thread that calls this with a matching fail_loc is put to
+ * sleep. The next thread that calls with the same fail_loc wakes up
+ * the first and continues. */
+static inline void cfs_race(__u32 id)
+{
+	/* nothing happens unless @id passes the cfs_fail_loc pre-check */
+	if (CFS_FAIL_PRECHECK(id)) {
+		if (unlikely(__cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET))) {
+			int rc; /* presumably filled in by the wait macro */
+			cfs_race_state = 0;
+			CERROR("cfs_race id %x sleeping\n", id);
+			cfs_wait_event_interruptible(cfs_race_waitq,
+						     cfs_race_state != 0, rc);
+			CERROR("cfs_fail_race id %x awake, rc=%d\n", id, rc);
+		} else {
+			CERROR("cfs_fail_race id %x waking\n", id);
+			cfs_race_state = 1; /* condition the sleeper waits on */
+			wake_up(&cfs_race_waitq);
+		}
+	}
+}
+#define CFS_RACE(id) cfs_race(id)
+
+#endif /* _LIBCFS_FAIL_H */
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h
new file mode 100644
index 000000000000..c5b371569da8
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h
@@ -0,0 +1,850 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_hash.h
+ *
+ * Hashing routines
+ *
+ */
+
+#ifndef __LIBCFS_HASH_H__
+#define __LIBCFS_HASH_H__
+/*
+ * Knuth recommends primes in approximately golden ratio to the maximum
+ * integer representable by a machine word for multiplicative hashing.
+ * Chuck Lever verified the effectiveness of this technique:
+ * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
+ *
+ * These primes are chosen to be bit-sparse, that is operations on
+ * them can use shifts and additions instead of multiplications for
+ * machines where multiplications are slow.
+ */
+/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
+#define CFS_GOLDEN_RATIO_PRIME_32 0x9e370001UL
+/*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
+#define CFS_GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001ULL
+
+/*
+ * Ideally we would use HAVE_HASH_LONG for this, but on linux we configure
+ * the linux kernel and user space at the same time, so we need to differentiate
+ * between them explicitly. If this is not needed on other architectures, then
+ * we'll need to move the functions to architecture-specific headers.
+ */
+
+#include <linux/hash.h>
+
+#define cfs_hash_long(val, bits)    hash_long(val, bits)
+
+/** disable debug */
+#define CFS_HASH_DEBUG_NONE	 0
+/** record hash depth and output to console when it's too deep,
+ *  computing overhead is low but consume more memory */
+#define CFS_HASH_DEBUG_1	    1
+/** expensive, check key validation */
+#define CFS_HASH_DEBUG_2	    2
+
+#define CFS_HASH_DEBUG_LEVEL	CFS_HASH_DEBUG_NONE
+
+struct cfs_hash_ops;
+struct cfs_hash_lock_ops;
+struct cfs_hash_hlist_ops;
+
+typedef union {
+	rwlock_t		rw;		/**< rwlock */
+	spinlock_t		spin;		/**< spinlock */
+} cfs_hash_lock_t;
+
+/**
+ * cfs_hash_bucket is a container of:
+ * - lock, counter ...
+ * - array of hash-head starting from hsb_head[0], hash-head can be one of
+ *   . cfs_hash_head_t
+ *   . cfs_hash_head_dep_t
+ *   . cfs_hash_dhead_t
+ *   . cfs_hash_dhead_dep_t
+ *   which depends on requirement of user
+ * - some extra bytes (caller can require it while creating hash)
+ */
+typedef struct cfs_hash_bucket {
+	cfs_hash_lock_t		hsb_lock;	/**< bucket lock */
+	__u32			hsb_count;	/**< current entries */
+	__u32			hsb_version;	/**< change version */
+	unsigned int		hsb_index;	/**< index of bucket */
+	int			hsb_depmax;	/**< max depth on bucket */
+	long			hsb_head[0];	/**< hash-head array */
+} cfs_hash_bucket_t;
+
+/**
+ * cfs_hash bucket descriptor, it's normally in stack of caller
+ */
+typedef struct cfs_hash_bd {
+	cfs_hash_bucket_t	  *bd_bucket;      /**< address of bucket */
+	unsigned int		bd_offset;      /**< offset in bucket */
+} cfs_hash_bd_t;
+
+#define CFS_HASH_NAME_LEN	   16      /**< default name length */
+#define CFS_HASH_BIGNAME_LEN	64      /**< bigname for param tree */
+
+#define CFS_HASH_BKT_BITS	   3       /**< default bits of bucket */
+#define CFS_HASH_BITS_MAX	   30      /**< max bits of bucket */
+#define CFS_HASH_BITS_MIN	   CFS_HASH_BKT_BITS
+
+/**
+ * common hash attributes.
+ */
+enum cfs_hash_tag {
+	/**
+	 * don't need any lock, caller will protect operations with its
+	 * own lock. With this flag:
+	 *  . CFS_HASH_NO_BKTLOCK, CFS_HASH_RW_BKTLOCK, CFS_HASH_SPIN_BKTLOCK
+	 *    will be ignored.
+	 *  . Some functions will be disabled with this flag, i.e:
+	 *    cfs_hash_for_each_empty, cfs_hash_rehash
+	 */
+	CFS_HASH_NO_LOCK	= 1 << 0,
+	/** no bucket lock, use one spinlock to protect the whole hash */
+	CFS_HASH_NO_BKTLOCK     = 1 << 1,
+	/** rwlock to protect bucket */
+	CFS_HASH_RW_BKTLOCK     = 1 << 2,
+	/** spinlock to protect bucket */
+	CFS_HASH_SPIN_BKTLOCK   = 1 << 3,
+	/** always add new item to tail */
+	CFS_HASH_ADD_TAIL       = 1 << 4,
+	/** hash-table doesn't have refcount on item */
+	CFS_HASH_NO_ITEMREF     = 1 << 5,
+	/** big name for param-tree */
+	CFS_HASH_BIGNAME	= 1 << 6,
+	/** track global count */
+	CFS_HASH_COUNTER	= 1 << 7,
+	/** rehash item by new key */
+	CFS_HASH_REHASH_KEY     = 1 << 8,
+	/** Enable dynamic hash resizing */
+	CFS_HASH_REHASH	 = 1 << 9,
+	/** can shrink hash-size */
+	CFS_HASH_SHRINK	 = 1 << 10,
+	/** assert hash is empty on exit */
+	CFS_HASH_ASSERT_EMPTY   = 1 << 11,
+	/** record hlist depth */
+	CFS_HASH_DEPTH	  = 1 << 12,
+	/**
+	 * rehash is always scheduled in a different thread, so current
+	 * change on hash table is non-blocking
+	 */
+	CFS_HASH_NBLK_CHANGE    = 1 << 13,
+	/** NB, we typed hs_flags as  __u16, please change it
+	 * if you need to extend >=16 flags */
+};
+
+/** most used attributes */
+#define CFS_HASH_DEFAULT       (CFS_HASH_RW_BKTLOCK | \
+				CFS_HASH_COUNTER | CFS_HASH_REHASH)
+
+/**
+ * cfs_hash is a hash-table implementation for general purpose, it can support:
+ *    . two refcount modes
+ *      hash-table with & without refcount
+ *    . four lock modes
+ *      nolock, one-spinlock, rw-bucket-lock, spin-bucket-lock
+ *    . general operations
+ *      lookup, add(add_tail or add_head), delete
+ *    . rehash
+ *      grows or shrink
+ *    . iteration
+ *      locked iteration and unlocked iteration
+ *    . bigname
+ *      support long name hash
+ *    . debug
+ *      trace max searching depth
+ *
+ * Rehash:
+ * When the htable grows or shrinks, a separate task (cfs_hash_rehash_worker)
+ * is spawned to handle the rehash in the background, it's possible that other
+ * processes can concurrently perform additions, deletions, and lookups
+ * without being blocked on rehash completion, because rehash will release
+ * the global wrlock for each bucket.
+ *
+ * rehash and iteration can't run at the same time because it's too tricky
+ * to keep both of them safe and correct.
+ * As they are relatively rare operations, so:
+ *   . if iteration is in progress while we try to launch rehash, then
+ *     it just gives up; the iterator will launch rehash at the end.
+ *   . if rehash is in progress while we try to iterate the hash table,
+ *     then we just wait (shouldn't be very long time), anyway, nobody
+ *     should expect iteration of whole hash-table to be non-blocking.
+ *
+ * During rehashing, a (key,object) pair may be in one of two buckets,
+ * depending on whether the worker task has yet to transfer the object
+ * to its new location in the table. Lookups and deletions need to search both
+ * locations; additions must take care to only insert into the new bucket.
+ */
+
+typedef struct cfs_hash {
+	/** serialize with rehash, or serialize all operations if
+	 * the hash-table has CFS_HASH_NO_BKTLOCK */
+	cfs_hash_lock_t	     hs_lock;
+	/** hash operations */
+	struct cfs_hash_ops	*hs_ops;
+	/** hash lock operations */
+	struct cfs_hash_lock_ops   *hs_lops;
+	/** hash list operations */
+	struct cfs_hash_hlist_ops  *hs_hops;
+	/** hash buckets-table */
+	cfs_hash_bucket_t	 **hs_buckets;
+	/** total number of items on this hash-table */
+	atomic_t		hs_count;
+	/** hash flags, see cfs_hash_tag for detail */
+	__u16		       hs_flags;
+	/** # of extra-bytes for bucket, for user saving extended attributes */
+	__u16		       hs_extra_bytes;
+	/** wants to iterate */
+	__u8			hs_iterating;
+	/** hash-table is dying */
+	__u8			hs_exiting;
+	/** current hash bits */
+	__u8			hs_cur_bits;
+	/** min hash bits */
+	__u8			hs_min_bits;
+	/** max hash bits */
+	__u8			hs_max_bits;
+	/** bits for rehash */
+	__u8			hs_rehash_bits;
+	/** bits for each bucket */
+	__u8			hs_bkt_bits;
+	/** resize min threshold */
+	__u16		       hs_min_theta;
+	/** resize max threshold */
+	__u16		       hs_max_theta;
+	/** resize count */
+	__u32		       hs_rehash_count;
+	/** # of iterators (caller of cfs_hash_for_each_*) */
+	__u32		       hs_iterators;
+	/** rehash workitem */
+	cfs_workitem_t	      hs_rehash_wi;
+	/** refcount on this hash table */
+	atomic_t		hs_refcount;
+	/** rehash buckets-table */
+	cfs_hash_bucket_t	 **hs_rehash_buckets;
+#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
+	/** serialize debug members */
+	spinlock_t			hs_dep_lock;
+	/** max depth */
+	unsigned int		hs_dep_max;
+	/** id of the deepest bucket */
+	unsigned int		hs_dep_bkt;
+	/** offset in the deepest bucket */
+	unsigned int		hs_dep_off;
+	/** bits when we found the max depth */
+	unsigned int		hs_dep_bits;
+	/** workitem to output max depth */
+	cfs_workitem_t	      hs_dep_wi;
+#endif
+	/** name of htable */
+	char			hs_name[0];
+} cfs_hash_t;
+
+typedef struct cfs_hash_lock_ops {
+	/** lock the hash table */
+	void    (*hs_lock)(cfs_hash_lock_t *lock, int exclusive);
+	/** unlock the hash table */
+	void    (*hs_unlock)(cfs_hash_lock_t *lock, int exclusive);
+	/** lock the hash bucket */
+	void    (*hs_bkt_lock)(cfs_hash_lock_t *lock, int exclusive);
+	/** unlock the hash bucket */
+	void    (*hs_bkt_unlock)(cfs_hash_lock_t *lock, int exclusive);
+} cfs_hash_lock_ops_t;
+
+typedef struct cfs_hash_hlist_ops {
+	/** return hlist_head of hash-head of @bd */
+	struct hlist_head *(*hop_hhead)(cfs_hash_t *hs, cfs_hash_bd_t *bd);
+	/** return hash-head size */
+	int (*hop_hhead_size)(cfs_hash_t *hs);
+	/** add @hnode to hash-head of @bd */
+	int (*hop_hnode_add)(cfs_hash_t *hs,
+			     cfs_hash_bd_t *bd, struct hlist_node *hnode);
+	/** remove @hnode from hash-head of @bd */
+	int (*hop_hnode_del)(cfs_hash_t *hs,
+			     cfs_hash_bd_t *bd, struct hlist_node *hnode);
+} cfs_hash_hlist_ops_t;
+
+typedef struct cfs_hash_ops {
+	/** return hashed value from @key */
+	unsigned (*hs_hash)(cfs_hash_t *hs, const void *key, unsigned mask);
+	/** return key address of @hnode */
+	void *   (*hs_key)(struct hlist_node *hnode);
+	/** copy key from @hnode to @key */
+	void     (*hs_keycpy)(struct hlist_node *hnode, void *key);
+	/**
+	 *  compare @key with key of @hnode
+	 *  returns 1 on a match
+	 */
+	int      (*hs_keycmp)(const void *key, struct hlist_node *hnode);
+	/** return object address of @hnode, i.e: container_of(...hnode) */
+	void *   (*hs_object)(struct hlist_node *hnode);
+	/** get refcount of item, always called with holding bucket-lock */
+	void     (*hs_get)(cfs_hash_t *hs, struct hlist_node *hnode);
+	/** release refcount of item */
+	void     (*hs_put)(cfs_hash_t *hs, struct hlist_node *hnode);
+	/** release refcount of item, always called with holding bucket-lock */
+	void     (*hs_put_locked)(cfs_hash_t *hs, struct hlist_node *hnode);
+	/** it's called before removing of @hnode */
+	void     (*hs_exit)(cfs_hash_t *hs, struct hlist_node *hnode);
+} cfs_hash_ops_t;
+
+/** total number of buckets in @hs */
+#define CFS_HASH_NBKT(hs)       \
+	(1U << ((hs)->hs_cur_bits - (hs)->hs_bkt_bits))
+
+/** total number of buckets in @hs while rehashing */
+#define CFS_HASH_RH_NBKT(hs)    \
+	(1U << ((hs)->hs_rehash_bits - (hs)->hs_bkt_bits))
+
+/** number of hlist for in bucket */
+#define CFS_HASH_BKT_NHLIST(hs) (1U << (hs)->hs_bkt_bits)
+
+/** total number of hlist in @hs */
+#define CFS_HASH_NHLIST(hs)     (1U << (hs)->hs_cur_bits)
+
+/** total number of hlist in @hs while rehashing */
+#define CFS_HASH_RH_NHLIST(hs)  (1U << (hs)->hs_rehash_bits)
+
+static inline int
+cfs_hash_with_no_lock(cfs_hash_t *hs)
+{
+	/* CFS_HASH_NO_LOCK: the caller serializes every operation itself */
+	return !!(hs->hs_flags & CFS_HASH_NO_LOCK);
+}
+
+static inline int
+cfs_hash_with_no_bktlock(cfs_hash_t *hs)
+{
+	/* CFS_HASH_NO_BKTLOCK: one lock for the whole table, none per bucket */
+	return !!(hs->hs_flags & CFS_HASH_NO_BKTLOCK);
+}
+
+static inline int
+cfs_hash_with_rw_bktlock(cfs_hash_t *hs)
+{
+	/* CFS_HASH_RW_BKTLOCK: each bucket is protected by an rwlock */
+	return !!(hs->hs_flags & CFS_HASH_RW_BKTLOCK);
+}
+
+static inline int
+cfs_hash_with_spin_bktlock(cfs_hash_t *hs)
+{
+	/* CFS_HASH_SPIN_BKTLOCK: each bucket is protected by a spinlock */
+	return !!(hs->hs_flags & CFS_HASH_SPIN_BKTLOCK);
+}
+
+static inline int
+cfs_hash_with_add_tail(cfs_hash_t *hs)
+{	/* CFS_HASH_ADD_TAIL: new items are always added at the tail */
+	return !!(hs->hs_flags & CFS_HASH_ADD_TAIL);
+}
+
+static inline int
+cfs_hash_with_no_itemref(cfs_hash_t *hs)
+{
+	/* CFS_HASH_NO_ITEMREF: the table holds no refcount on its items,
+	 * so an item can't be removed from the hash unless its refcount
+	 * is ZERO */
+	return !!(hs->hs_flags & CFS_HASH_NO_ITEMREF);
+}
+
+static inline int
+cfs_hash_with_bigname(cfs_hash_t *hs)
+{	/* CFS_HASH_BIGNAME: big name for param-tree */
+	return !!(hs->hs_flags & CFS_HASH_BIGNAME);
+}
+
+static inline int
+cfs_hash_with_counter(cfs_hash_t *hs)
+{	/* CFS_HASH_COUNTER: the table tracks a global item count */
+	return !!(hs->hs_flags & CFS_HASH_COUNTER);
+}
+
+static inline int
+cfs_hash_with_rehash(cfs_hash_t *hs)
+{	/* CFS_HASH_REHASH: dynamic hash resizing is enabled */
+	return !!(hs->hs_flags & CFS_HASH_REHASH);
+}
+
+static inline int
+cfs_hash_with_rehash_key(cfs_hash_t *hs)
+{	/* CFS_HASH_REHASH_KEY: items may be rehashed by a new key */
+	return !!(hs->hs_flags & CFS_HASH_REHASH_KEY);
+}
+
+static inline int
+cfs_hash_with_shrink(cfs_hash_t *hs)
+{	/* CFS_HASH_SHRINK: the hash size is allowed to shrink */
+	return !!(hs->hs_flags & CFS_HASH_SHRINK);
+}
+
+static inline int
+cfs_hash_with_assert_empty(cfs_hash_t *hs)
+{	/* CFS_HASH_ASSERT_EMPTY: assert the hash is empty on exit */
+	return !!(hs->hs_flags & CFS_HASH_ASSERT_EMPTY);
+}
+
+static inline int
+cfs_hash_with_depth(cfs_hash_t *hs)
+{	/* CFS_HASH_DEPTH: hlist depth is being recorded */
+	return !!(hs->hs_flags & CFS_HASH_DEPTH);
+}
+
+static inline int
+cfs_hash_with_nblk_change(cfs_hash_t *hs)
+{
+	return (hs->hs_flags & CFS_HASH_NBLK_CHANGE) != 0;
+}
+
+static inline int
+cfs_hash_is_exiting(cfs_hash_t *hs)
+{       /* cfs_hash_destroy is called */
+	return hs->hs_exiting;
+}
+
+static inline int
+cfs_hash_is_rehashing(cfs_hash_t *hs)
+{       /* rehash is launched */
+	return hs->hs_rehash_bits != 0;
+}
+
+static inline int
+cfs_hash_is_iterating(cfs_hash_t *hs)
+{       /* someone is calling cfs_hash_for_each_* */
+	return hs->hs_iterating || hs->hs_iterators != 0;
+}
+
+static inline int
+cfs_hash_bkt_size(cfs_hash_t *hs)
+{
+	return offsetof(cfs_hash_bucket_t, hsb_head[0]) +
+	       hs->hs_hops->hop_hhead_size(hs) * CFS_HASH_BKT_NHLIST(hs) +
+	       hs->hs_extra_bytes;
+}
+
+#define CFS_HOP(hs, op)	   (hs)->hs_ops->hs_ ## op
+
+static inline unsigned
+cfs_hash_id(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+	return CFS_HOP(hs, hash)(hs, key, mask);
+}
+
+static inline void *
+cfs_hash_key(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	return CFS_HOP(hs, key)(hnode);
+}
+
+static inline void
+cfs_hash_keycpy(cfs_hash_t *hs, struct hlist_node *hnode, void *key)
+{
+	if (CFS_HOP(hs, keycpy) != NULL)
+		CFS_HOP(hs, keycpy)(hnode, key);
+}
+
+/**
+ * Returns 1 on a match,
+ */
+static inline int
+cfs_hash_keycmp(cfs_hash_t *hs, const void *key, struct hlist_node *hnode)
+{
+	return CFS_HOP(hs, keycmp)(key, hnode);
+}
+
+static inline void *
+cfs_hash_object(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	return CFS_HOP(hs, object)(hnode);
+}
+
+static inline void
+cfs_hash_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	CFS_HOP(hs, get)(hs, hnode);	/* void op: call it, don't "return" it */
+}
+
+static inline void
+cfs_hash_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	LASSERT(CFS_HOP(hs, put_locked) != NULL);
+	/* the op returns void, so invoke it as a statement, not "return expr" */
+	CFS_HOP(hs, put_locked)(hs, hnode);
+}
+
+static inline void
+cfs_hash_put(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	LASSERT(CFS_HOP(hs, put) != NULL);
+	/* the op returns void, so invoke it as a statement, not "return expr" */
+	CFS_HOP(hs, put)(hs, hnode);
+}
+
+static inline void
+cfs_hash_exit(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	if (CFS_HOP(hs, exit))
+		CFS_HOP(hs, exit)(hs, hnode);
+}
+
+static inline void cfs_hash_lock(cfs_hash_t *hs, int excl)
+{
+	hs->hs_lops->hs_lock(&hs->hs_lock, excl);
+}
+
+static inline void cfs_hash_unlock(cfs_hash_t *hs, int excl)
+{
+	hs->hs_lops->hs_unlock(&hs->hs_lock, excl);
+}
+
+static inline int cfs_hash_dec_and_lock(cfs_hash_t *hs,
+					atomic_t *condition)
+{
+	LASSERT(cfs_hash_with_no_bktlock(hs));
+	return atomic_dec_and_lock(condition, &hs->hs_lock.spin);
+}
+
+static inline void cfs_hash_bd_lock(cfs_hash_t *hs,
+				    cfs_hash_bd_t *bd, int excl)
+{
+	hs->hs_lops->hs_bkt_lock(&bd->bd_bucket->hsb_lock, excl);
+}
+
+static inline void cfs_hash_bd_unlock(cfs_hash_t *hs,
+				      cfs_hash_bd_t *bd, int excl)
+{
+	hs->hs_lops->hs_bkt_unlock(&bd->bd_bucket->hsb_lock, excl);
+}
+
+/**
+ * operations on cfs_hash bucket (bd: bucket descriptor),
+ * they are normally for hash-table without rehash
+ */
+void cfs_hash_bd_get(cfs_hash_t *hs, const void *key, cfs_hash_bd_t *bd);
+
+static inline void cfs_hash_bd_get_and_lock(cfs_hash_t *hs, const void *key,
+					    cfs_hash_bd_t *bd, int excl)
+{
+	cfs_hash_bd_get(hs, key, bd);
+	cfs_hash_bd_lock(hs, bd, excl);
+}
+
+static inline unsigned cfs_hash_bd_index_get(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+	return bd->bd_offset | (bd->bd_bucket->hsb_index << hs->hs_bkt_bits);
+}
+
+static inline void cfs_hash_bd_index_set(cfs_hash_t *hs,
+					 unsigned index, cfs_hash_bd_t *bd)
+{
+	bd->bd_bucket = hs->hs_buckets[index >> hs->hs_bkt_bits];
+	bd->bd_offset = index & (CFS_HASH_BKT_NHLIST(hs) - 1U);
+}
+
+static inline void *
+cfs_hash_bd_extra_get(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+	return (void *)bd->bd_bucket +
+	       cfs_hash_bkt_size(hs) - hs->hs_extra_bytes;
+}
+
+static inline __u32
+cfs_hash_bd_version_get(cfs_hash_bd_t *bd)
+{
+	/* need hold cfs_hash_bd_lock */
+	return bd->bd_bucket->hsb_version;
+}
+
+static inline __u32
+cfs_hash_bd_count_get(cfs_hash_bd_t *bd)
+{
+	/* need hold cfs_hash_bd_lock */
+	return bd->bd_bucket->hsb_count;
+}
+
+static inline int
+cfs_hash_bd_depmax_get(cfs_hash_bd_t *bd)
+{
+	return bd->bd_bucket->hsb_depmax;
+}
+
+static inline int
+cfs_hash_bd_compare(cfs_hash_bd_t *bd1, cfs_hash_bd_t *bd2)
+{
+	if (bd1->bd_bucket->hsb_index != bd2->bd_bucket->hsb_index)
+		return bd1->bd_bucket->hsb_index - bd2->bd_bucket->hsb_index;
+
+	if (bd1->bd_offset != bd2->bd_offset)
+		return bd1->bd_offset - bd2->bd_offset;
+
+	return 0;
+}
+
+void cfs_hash_bd_add_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			    struct hlist_node *hnode);
+void cfs_hash_bd_del_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			    struct hlist_node *hnode);
+void cfs_hash_bd_move_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd_old,
+			     cfs_hash_bd_t *bd_new, struct hlist_node *hnode);
+
+static inline int cfs_hash_bd_dec_and_lock(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+					   atomic_t *condition)
+{
+	LASSERT(cfs_hash_with_spin_bktlock(hs));
+	return atomic_dec_and_lock(condition,
+				       &bd->bd_bucket->hsb_lock.spin);
+}
+
+static inline struct hlist_head *cfs_hash_bd_hhead(cfs_hash_t *hs,
+						  cfs_hash_bd_t *bd)
+{
+	return hs->hs_hops->hop_hhead(hs, bd);
+}
+
+struct hlist_node *cfs_hash_bd_lookup_locked(cfs_hash_t *hs,
+					    cfs_hash_bd_t *bd, const void *key);
+struct hlist_node *cfs_hash_bd_peek_locked(cfs_hash_t *hs,
+					  cfs_hash_bd_t *bd, const void *key);
+struct hlist_node *cfs_hash_bd_findadd_locked(cfs_hash_t *hs,
+					     cfs_hash_bd_t *bd, const void *key,
+					     struct hlist_node *hnode,
+					     int insist_add);
+struct hlist_node *cfs_hash_bd_finddel_locked(cfs_hash_t *hs,
+					     cfs_hash_bd_t *bd, const void *key,
+					     struct hlist_node *hnode);
+
+/**
+ * operations on cfs_hash bucket (bd: bucket descriptor),
+ * they are safe for hash-table with rehash
+ */
+void cfs_hash_dual_bd_get(cfs_hash_t *hs, const void *key, cfs_hash_bd_t *bds);
+void cfs_hash_dual_bd_lock(cfs_hash_t *hs, cfs_hash_bd_t *bds, int excl);
+void cfs_hash_dual_bd_unlock(cfs_hash_t *hs, cfs_hash_bd_t *bds, int excl);
+
+static inline void cfs_hash_dual_bd_get_and_lock(cfs_hash_t *hs, const void *key,
+						 cfs_hash_bd_t *bds, int excl)
+{
+	cfs_hash_dual_bd_get(hs, key, bds);
+	cfs_hash_dual_bd_lock(hs, bds, excl);
+}
+
+struct hlist_node *cfs_hash_dual_bd_lookup_locked(cfs_hash_t *hs,
+						 cfs_hash_bd_t *bds,
+						 const void *key);
+struct hlist_node *cfs_hash_dual_bd_findadd_locked(cfs_hash_t *hs,
+						  cfs_hash_bd_t *bds,
+						  const void *key,
+						  struct hlist_node *hnode,
+						  int insist_add);
+struct hlist_node *cfs_hash_dual_bd_finddel_locked(cfs_hash_t *hs,
+						  cfs_hash_bd_t *bds,
+						  const void *key,
+						  struct hlist_node *hnode);
+
+/* Hash init/cleanup functions */
+cfs_hash_t *cfs_hash_create(char *name, unsigned cur_bits, unsigned max_bits,
+			    unsigned bkt_bits, unsigned extra_bytes,
+			    unsigned min_theta, unsigned max_theta,
+			    cfs_hash_ops_t *ops, unsigned flags);
+
+cfs_hash_t *cfs_hash_getref(cfs_hash_t *hs);
+void cfs_hash_putref(cfs_hash_t *hs);
+
+/* Hash addition functions */
+void cfs_hash_add(cfs_hash_t *hs, const void *key,
+		  struct hlist_node *hnode);
+int cfs_hash_add_unique(cfs_hash_t *hs, const void *key,
+			struct hlist_node *hnode);
+void *cfs_hash_findadd_unique(cfs_hash_t *hs, const void *key,
+			      struct hlist_node *hnode);
+
+/* Hash deletion functions */
+void *cfs_hash_del(cfs_hash_t *hs, const void *key, struct hlist_node *hnode);
+void *cfs_hash_del_key(cfs_hash_t *hs, const void *key);
+
+/* Hash lookup/for_each functions */
+#define CFS_HASH_LOOP_HOG       1024
+
+typedef int (*cfs_hash_for_each_cb_t)(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+				      struct hlist_node *node, void *data);
+void *cfs_hash_lookup(cfs_hash_t *hs, const void *key);
+void cfs_hash_for_each(cfs_hash_t *hs, cfs_hash_for_each_cb_t, void *data);
+void cfs_hash_for_each_safe(cfs_hash_t *hs, cfs_hash_for_each_cb_t, void *data);
+int  cfs_hash_for_each_nolock(cfs_hash_t *hs,
+			      cfs_hash_for_each_cb_t, void *data);
+int  cfs_hash_for_each_empty(cfs_hash_t *hs,
+			     cfs_hash_for_each_cb_t, void *data);
+void cfs_hash_for_each_key(cfs_hash_t *hs, const void *key,
+			   cfs_hash_for_each_cb_t, void *data);
+typedef int (*cfs_hash_cond_opt_cb_t)(void *obj, void *data);
+void cfs_hash_cond_del(cfs_hash_t *hs, cfs_hash_cond_opt_cb_t, void *data);
+
+void cfs_hash_hlist_for_each(cfs_hash_t *hs, unsigned hindex,
+			     cfs_hash_for_each_cb_t, void *data);
+int  cfs_hash_is_empty(cfs_hash_t *hs);
+__u64 cfs_hash_size_get(cfs_hash_t *hs);
+
+/*
+ * Rehash - Theta is calculated to be the average chained
+ * hash depth assuming a perfectly uniform hash function.
+ */
+void cfs_hash_rehash_cancel_locked(cfs_hash_t *hs);
+void cfs_hash_rehash_cancel(cfs_hash_t *hs);
+int  cfs_hash_rehash(cfs_hash_t *hs, int do_rehash);
+void cfs_hash_rehash_key(cfs_hash_t *hs, const void *old_key,
+			 void *new_key, struct hlist_node *hnode);
+
+#if CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1
+/* Validate hnode references the correct key */
+static inline void
+cfs_hash_key_validate(cfs_hash_t *hs, const void *key,
+		      struct hlist_node *hnode)
+{
+	LASSERT(cfs_hash_keycmp(hs, key, hnode));
+}
+
+/* Validate hnode is in the correct bucket */
+static inline void
+cfs_hash_bucket_validate(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			 struct hlist_node *hnode)
+{
+	cfs_hash_bd_t   bds[2];
+
+	cfs_hash_dual_bd_get(hs, cfs_hash_key(hs, hnode), bds);
+	LASSERT(bds[0].bd_bucket == bd->bd_bucket ||
+		bds[1].bd_bucket == bd->bd_bucket);
+}
+
+#else /* CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1 */
+
+static inline void
+cfs_hash_key_validate(cfs_hash_t *hs, const void *key,
+		      struct hlist_node *hnode) {}
+
+static inline void
+cfs_hash_bucket_validate(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			 struct hlist_node *hnode) {}
+
+#endif /* CFS_HASH_DEBUG_LEVEL */
+
+#define CFS_HASH_THETA_BITS  10
+#define CFS_HASH_MIN_THETA  (1U << (CFS_HASH_THETA_BITS - 1))
+#define CFS_HASH_MAX_THETA  (1U << (CFS_HASH_THETA_BITS + 1))
+
+/* Return integer component of theta (theta >> CFS_HASH_THETA_BITS) */
+static inline int __cfs_hash_theta_int(int theta)
+{
+	return (theta >> CFS_HASH_THETA_BITS);
+}
+
+/* Return the fraction (0..999); masks first so theta * 1000 can't overflow */
+static inline int __cfs_hash_theta_frac(int theta)
+{
+	return ((theta & ((1 << CFS_HASH_THETA_BITS) - 1)) * 1000) >>
+	       CFS_HASH_THETA_BITS;
+}
+
+static inline int __cfs_hash_theta(cfs_hash_t *hs)
+{
+	return (atomic_read(&hs->hs_count) <<
+		CFS_HASH_THETA_BITS) >> hs->hs_cur_bits;
+}
+
+static inline void __cfs_hash_set_theta(cfs_hash_t *hs, int min, int max)
+{
+	LASSERT(min < max);
+	hs->hs_min_theta = (__u16)min;
+	hs->hs_max_theta = (__u16)max;
+}
+
+/* Generic debug formatting routines mainly for proc handler */
+int cfs_hash_debug_header(char *str, int size);
+int cfs_hash_debug_str(cfs_hash_t *hs, char *str, int size);
+
+/*
+ * djb2 hash of @size bytes at @key, "& mask"; bytes are read as plain char.
+ */
+static inline unsigned
+cfs_hash_djb2_hash(const void *key, size_t size, unsigned mask)
+{
+	const char *p = key;	/* keep the const qualifier of @key */
+	unsigned hash = 5381;
+	size_t i;		/* as wide as @size, so the loop always ends */
+
+	LASSERT(key != NULL);
+	for (i = 0; i < size; i++)
+		hash = hash * 33 + p[i];
+	return (hash & mask);
+}
+
+/*
+ * Generic u32 hash algorithm.
+ */
+static inline unsigned
+cfs_hash_u32_hash(const __u32 key, unsigned mask)
+{
+	return ((key * CFS_GOLDEN_RATIO_PRIME_32) & mask);
+}
+
+/*
+ * Generic u64 hash algorithm.
+ */
+static inline unsigned
+cfs_hash_u64_hash(const __u64 key, unsigned mask)
+{
+	return ((unsigned)(key * CFS_GOLDEN_RATIO_PRIME_64) & mask);
+}
+
+/** iterate @i over the first @n entries of @bds, stopping at a NULL bucket */
+#define cfs_hash_for_each_bd(bds, n, i) \
+	for ((i) = 0; (i) < (n) && (bds)[i].bd_bucket != NULL; (i)++)
+
+/** walk @pos over hs_buckets[] of @hs, loading each into @bd; stops at NULL */
+#define cfs_hash_for_each_bucket(hs, bd, pos)		   \
+	for ((pos) = 0;					   \
+	     (pos) < CFS_HASH_NBKT(hs) &&			 \
+	     ((bd)->bd_bucket = (hs)->hs_buckets[pos]) != NULL; (pos)++)
+
+/** iterate over all hlist of bucket @bd */
+#define cfs_hash_bd_for_each_hlist(hs, bd, hlist)	       \
+	for ((bd)->bd_offset = 0;			       \
+	     (bd)->bd_offset < CFS_HASH_BKT_NHLIST(hs) &&       \
+	     (hlist = cfs_hash_bd_hhead(hs, bd)) != NULL;       \
+	     (bd)->bd_offset++)
+
+/* !__LIBCFS__HASH_H__ */
+#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_heap.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_heap.h
new file mode 100644
index 000000000000..bfa6d7b245ea
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_heap.h
@@ -0,0 +1,200 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ */
+/*
+ * libcfs/include/libcfs/heap.h
+ *
+ * Author: Eric Barton	<eeb@whamcloud.com>
+ *	   Liang Zhen	<liang@whamcloud.com>
+ */
+
+#ifndef __LIBCFS_HEAP_H__
+#define __LIBCFS_HEAP_H__
+
+/** \defgroup heap Binary heap
+ *
+ * The binary heap is a scalable data structure created using a binary tree. It
+ * is capable of maintaining large sets of elements sorted usually by one or
+ * more element properties, but really based on anything that can be used as a
+ * binary predicate in order to determine the relevant ordering of any two nodes
+ * that belong to the set. There is no search operation, rather the intention is
+ * for the element of the lowest priority which will always be at the root of
+ * the tree (as this is an implementation of a min-heap) to be removed by users
+ * for consumption.
+ *
+ * Users of the heap should embed a \e cfs_binheap_node_t object instance on
+ * every object of the set that they wish the binary heap instance to handle,
+ * and (at a minimum) provide a cfs_binheap_ops_t::hop_compare() implementation
+ * which is used by the heap as the binary predicate during its internal sorting
+ * operations.
+ *
+ * The current implementation enforces no locking scheme, and so assumes the
+ * user caters for locking between calls to insert, delete and lookup
+ * operations. Since the only consumer for the data structure at this point
+ * are NRS policies, and these operate on a per-CPT basis, binary heap instances
+ * are tied to a specific CPT.
+ * @{
+ */
+
+/**
+ * Binary heap node.
+ *
+ * Objects of this type are embedded into objects of the ordered set that is to
+ * be maintained by a \e cfs_binheap_t instance.
+ */
+typedef struct {
+	/** Index into the binary tree */
+	unsigned int	chn_index;
+} cfs_binheap_node_t;
+
+#define CBH_SHIFT	9
+#define CBH_SIZE       (1 << CBH_SHIFT)		    /* # ptrs per level */
+#define CBH_MASK       (CBH_SIZE - 1)
+#define CBH_NOB	(CBH_SIZE * sizeof(cfs_binheap_node_t *))
+
+#define CBH_POISON	0xdeadbeef
+
+/**
+ * Binary heap flags.
+ */
+enum {
+	CBH_FLAG_ATOMIC_GROW	= 1,
+};
+
+struct cfs_binheap;
+
+/**
+ * Binary heap operations.
+ */
+typedef struct {
+	/**
+	 * Called right before inserting a node into the binary heap.
+	 *
+	 * Implementing this operation is optional.
+	 *
+	 * \param[in] h The heap
+	 * \param[in] e The node
+	 *
+	 * \retval 0 success
+	 * \retval != 0 error
+	 */
+	int		(*hop_enter)(struct cfs_binheap *h,
+				     cfs_binheap_node_t *e);
+	/**
+	 * Called right after removing a node from the binary heap.
+	 *
+	 * Implementing this operation is optional.
+	 *
+	 * \param[in] h The heap
+	 * \param[in] e The node
+	 */
+	void		(*hop_exit)(struct cfs_binheap *h,
+				    cfs_binheap_node_t *e);
+	/**
+	 * A binary predicate which is called during internal heap sorting
+	 * operations, and used in order to determine the relevant ordering of
+	 * two heap nodes.
+	 *
+	 * Implementing this operation is mandatory.
+	 *
+	 * \param[in] a The first heap node
+	 * \param[in] b The second heap node
+	 *
+	 * \retval 0 Node a > node b
+	 * \retval 1 Node a < node b
+	 *
+	 * \see cfs_binheap_bubble()
+	 * \see cfs_binheap_sink()
+	 */
+	int		(*hop_compare)(cfs_binheap_node_t *a,
+				       cfs_binheap_node_t *b);
+} cfs_binheap_ops_t;
+
+/**
+ * Binary heap object.
+ *
+ * Sorts elements of type \e cfs_binheap_node_t
+ */
+typedef struct cfs_binheap {
+	/** Triple indirect */
+	cfs_binheap_node_t  ****cbh_elements3;
+	/** double indirect */
+	cfs_binheap_node_t   ***cbh_elements2;
+	/** single indirect */
+	cfs_binheap_node_t    **cbh_elements1;
+	/** # elements referenced */
+	unsigned int		cbh_nelements;
+	/** high water mark */
+	unsigned int		cbh_hwm;
+	/** user flags */
+	unsigned int		cbh_flags;
+	/** operations table */
+	cfs_binheap_ops_t      *cbh_ops;
+	/** private data */
+	void		       *cbh_private;
+	/** associated CPT table */
+	struct cfs_cpt_table   *cbh_cptab;
+	/** associated CPT id of this cfs_binheap_t::cbh_cptab */
+	int			cbh_cptid;
+} cfs_binheap_t;
+
+void cfs_binheap_destroy(cfs_binheap_t *h);
+cfs_binheap_t *cfs_binheap_create(cfs_binheap_ops_t *ops, unsigned int flags,
+				  unsigned count, void *arg,
+				  struct cfs_cpt_table *cptab, int cptid);
+cfs_binheap_node_t *cfs_binheap_find(cfs_binheap_t *h, unsigned int idx);
+int cfs_binheap_insert(cfs_binheap_t *h, cfs_binheap_node_t *e);
+void cfs_binheap_remove(cfs_binheap_t *h, cfs_binheap_node_t *e);
+
+static inline int
+cfs_binheap_size(cfs_binheap_t *h)
+{
+	return h->cbh_nelements;
+}
+
+static inline int
+cfs_binheap_is_empty(cfs_binheap_t *h)
+{
+	return h->cbh_nelements == 0;
+}
+
+static inline cfs_binheap_node_t *
+cfs_binheap_root(cfs_binheap_t *h)
+{
+	return cfs_binheap_find(h, 0);
+}
+
+static inline cfs_binheap_node_t *
+cfs_binheap_remove_root(cfs_binheap_t *h)
+{
+	cfs_binheap_node_t *e = cfs_binheap_find(h, 0);
+
+	if (e != NULL)
+		cfs_binheap_remove(h, e);
+	return e;
+}
+
+/** @} heap */
+
+#endif /* __LIBCFS_HEAP_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_ioctl.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_ioctl.h
new file mode 100644
index 000000000000..5be367973508
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_ioctl.h
@@ -0,0 +1,222 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_ioctl.h
+ *
+ * Low-level ioctl data structures. Kernel ioctl functions declared here,
+ * and user space functions are in libcfsutil_ioctl.h.
+ *
+ */
+
+#ifndef __LIBCFS_IOCTL_H__
+#define __LIBCFS_IOCTL_H__
+
+
+#define LIBCFS_IOCTL_VERSION 0x0001000a
+
+struct libcfs_ioctl_data {
+	__u32 ioc_len;
+	__u32 ioc_version;
+
+	__u64 ioc_nid;
+	__u64 ioc_u64[1];
+
+	__u32 ioc_flags;
+	__u32 ioc_count;
+	__u32 ioc_net;
+	__u32 ioc_u32[7];
+
+	__u32 ioc_inllen1;
+	char *ioc_inlbuf1;
+	__u32 ioc_inllen2;
+	char *ioc_inlbuf2;
+
+	__u32 ioc_plen1; /* buffers in userspace */
+	char *ioc_pbuf1;
+	__u32 ioc_plen2; /* buffers in userspace */
+	char *ioc_pbuf2;
+
+	char ioc_bulk[0];
+};
+
+
+struct libcfs_ioctl_hdr {
+	__u32 ioc_len;
+	__u32 ioc_version;
+};
+
+/* libcfs_ioctl_hdr followed by two unsigned words: subs and debug */
+struct libcfs_debug_ioctl_data {
+	struct libcfs_ioctl_hdr hdr;
+	unsigned int subs;
+	unsigned int debug;
+};
+
+/* zero @data (an lvalue), then stamp its ioc_version and ioc_len fields */
+#define LIBCFS_IOC_INIT(data)			   \
+do {						    \
+	memset(&(data), 0, sizeof(data));	       \
+	(data).ioc_version = LIBCFS_IOCTL_VERSION;      \
+	(data).ioc_len = sizeof(data);		  \
+} while (0)
+
+struct libcfs_ioctl_handler {
+	struct list_head item;
+	int (*handle_ioctl)(unsigned int cmd, struct libcfs_ioctl_data *data);
+};
+
+/* define handler @ident: item set via LIST_HEAD_INIT, callback set to @func */
+#define DECLARE_IOCTL_HANDLER(ident, func)		      \
+	struct libcfs_ioctl_handler ident = {		   \
+		.item = LIST_HEAD_INIT(ident.item),	     \
+		.handle_ioctl = func			    \
+	}
+
+/* FIXME check conflict with lustre_lib.h */
+#define LIBCFS_IOC_DEBUG_MASK	     _IOWR('f', 250, long)
+
+
+/* ioctls for manipulating snapshots 30- */
+#define IOC_LIBCFS_TYPE		   'e'
+#define IOC_LIBCFS_MIN_NR		 30
+/* libcfs ioctls */
+#define IOC_LIBCFS_PANIC		   _IOWR('e', 30, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_CLEAR_DEBUG	     _IOWR('e', 31, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_MARK_DEBUG	      _IOWR('e', 32, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LWT_CONTROL	     _IOWR('e', 33, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LWT_SNAPSHOT	    _IOWR('e', 34, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LWT_LOOKUP_STRING       _IOWR('e', 35, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_MEMHOG		  _IOWR('e', 36, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_PING_TEST	       _IOWR('e', 37, IOCTL_LIBCFS_TYPE)
+/* lnet ioctls */
+#define IOC_LIBCFS_GET_NI		  _IOWR('e', 50, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_FAIL_NID		_IOWR('e', 51, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_ADD_ROUTE	       _IOWR('e', 52, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_DEL_ROUTE	       _IOWR('e', 53, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_ROUTE	       _IOWR('e', 54, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_NOTIFY_ROUTER	   _IOWR('e', 55, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_UNCONFIGURE	     _IOWR('e', 56, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_PORTALS_COMPATIBILITY   _IOWR('e', 57, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LNET_DIST	       _IOWR('e', 58, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_CONFIGURE	       _IOWR('e', 59, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_TESTPROTOCOMPAT	 _IOWR('e', 60, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_PING		    _IOWR('e', 61, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_DEBUG_PEER	      _IOWR('e', 62, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LNETST		  _IOWR('e', 63, IOCTL_LIBCFS_TYPE)
+/* lnd ioctls */
+#define IOC_LIBCFS_REGISTER_MYNID	  _IOWR('e', 70, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_CLOSE_CONNECTION	_IOWR('e', 71, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_PUSH_CONNECTION	 _IOWR('e', 72, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_CONN		_IOWR('e', 73, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_DEL_PEER		_IOWR('e', 74, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_ADD_PEER		_IOWR('e', 75, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_PEER		_IOWR('e', 76, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_TXDESC	      _IOWR('e', 77, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_ADD_INTERFACE	   _IOWR('e', 78, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_DEL_INTERFACE	   _IOWR('e', 79, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_INTERFACE	   _IOWR('e', 80, IOCTL_LIBCFS_TYPE)
+
+#define IOC_LIBCFS_MAX_NR			     80
+
+static inline int libcfs_ioctl_packlen(struct libcfs_ioctl_data *data)
+{
+	/* sum in __u32: two inline lengths of 1<<30 would overflow an int */
+	__u32 len = sizeof(*data) + cfs_size_round(data->ioc_inllen1);
+
+	return (int)(len + cfs_size_round(data->ioc_inllen2));
+}
+
+/* Validate @data; returns 1 after a CERROR() on the first failed check, else 0 */
+static inline int libcfs_ioctl_is_invalid(struct libcfs_ioctl_data *data)
+{
+	if (data->ioc_len > (1<<30)) {
+		CERROR("LIBCFS ioctl: ioc_len larger than 1<<30\n");
+		return 1;
+	}
+	if (data->ioc_inllen1 > (1<<30)) {
+		CERROR("LIBCFS ioctl: ioc_inllen1 larger than 1<<30\n");
+		return 1;
+	}
+	if (data->ioc_inllen2 > (1<<30)) {
+		CERROR("LIBCFS ioctl: ioc_inllen2 larger than 1<<30\n");
+		return 1;
+	}
+	if (data->ioc_inlbuf1 && !data->ioc_inllen1) {
+		CERROR("LIBCFS ioctl: inlbuf1 pointer but 0 length\n");
+		return 1;
+	}
+	if (data->ioc_inlbuf2 && !data->ioc_inllen2) {
+		CERROR("LIBCFS ioctl: inlbuf2 pointer but 0 length\n");
+		return 1;
+	}
+	if (data->ioc_pbuf1 && !data->ioc_plen1) {
+		CERROR("LIBCFS ioctl: pbuf1 pointer but 0 length\n");
+		return 1;
+	}
+	if (data->ioc_pbuf2 && !data->ioc_plen2) {
+		CERROR("LIBCFS ioctl: pbuf2 pointer but 0 length\n");
+		return 1;
+	}
+	if (data->ioc_plen1 && !data->ioc_pbuf1) {
+		CERROR("LIBCFS ioctl: plen1 nonzero but no pbuf1 pointer\n");
+		return 1;
+	}
+	if (data->ioc_plen2 && !data->ioc_pbuf2) {
+		CERROR("LIBCFS ioctl: plen2 nonzero but no pbuf2 pointer\n");
+		return 1;
+	}
+	if ((__u32)libcfs_ioctl_packlen(data) != data->ioc_len) {
+		CERROR("LIBCFS ioctl: packlen != ioc_len\n");
+		return 1;
+	}
+	if (data->ioc_inllen1 &&
+	    data->ioc_bulk[data->ioc_inllen1 - 1] != '\0') {
+		CERROR("LIBCFS ioctl: inlbuf1 not 0 terminated\n");
+		return 1;
+	}
+	if (data->ioc_inllen2 &&
+	    data->ioc_bulk[cfs_size_round(data->ioc_inllen1) +
+			   data->ioc_inllen2 - 1] != '\0') {
+		CERROR("LIBCFS ioctl: inlbuf2 not 0 terminated\n");
+		return 1;
+	}
+	return 0;
+}
+
+extern int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand);
+extern int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand);
+extern int libcfs_ioctl_getdata(char *buf, char *end, void *arg);
+extern int libcfs_ioctl_popdata(void *arg, void *buf, int size);
+
+
+#endif /* __LIBCFS_IOCTL_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_kernelcomm.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_kernelcomm.h
new file mode 100644
index 000000000000..596a15fc8996
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_kernelcomm.h
@@ -0,0 +1,117 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: Nathan Rutman <nathan.rutman@sun.com>
+ *
+ * libcfs/include/libcfs/libcfs_kernelcomm.h
+ *
+ * Kernel <-> userspace communication routines.
+ * The definitions below are used in the kernel and userspace.
+ *
+ */
+
+#ifndef __LIBCFS_KERNELCOMM_H__
+#define __LIBCFS_KERNELCOMM_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+/* KUC message header.
+ * All current and future KUC messages should use this header.
+ * To avoid having to include Lustre headers from libcfs, define this here.
+ */
+struct kuc_hdr {
+	__u16 kuc_magic;
+	__u8  kuc_transport;  /* Each new Lustre feature should use a different
+				 transport */
+	__u8  kuc_flags;
+	__u16 kuc_msgtype;    /* Message type or opcode, transport-specific */
+	__u16 kuc_msglen;     /* Including header */
+} __attribute__((aligned(sizeof(__u64))));
+
+#define KUC_MAGIC  0x191C /*Lustre9etLinC */
+#define KUC_FL_BLOCK 0x01   /* Wait for send */
+
+/* kuc_msgtype values are defined in each transport */
+enum kuc_transport_type {
+	KUC_TRANSPORT_GENERIC   = 1,
+	KUC_TRANSPORT_HSM       = 2,
+	KUC_TRANSPORT_CHANGELOG = 3,
+};
+
+enum kuc_generic_message_type {
+	KUC_MSG_SHUTDOWN = 1,
+};
+
+/* prototype for callback function on kuc groups */
+typedef int (*libcfs_kkuc_cb_t)(__u32 data, void *cb_arg);
+
+/* KUC Broadcast Groups. This determines which userspace process hears which
+ * messages.  Multiple transports may be used within a group, or multiple
+ * groups may use the same transport.  Broadcast
+ * groups need not be used if e.g. a UID is specified instead;
+ * use group 0 to signify unicast.
+ */
+#define KUC_GRP_HSM	   0x02
+#define KUC_GRP_MAX	   KUC_GRP_HSM
+
+/* Kernel methods */
+extern int libcfs_kkuc_msg_put(struct file *fp, void *payload);
+extern int libcfs_kkuc_group_put(int group, void *payload);
+extern int libcfs_kkuc_group_add(struct file *fp, int uid, int group,
+				 __u32 data);
+extern int libcfs_kkuc_group_rem(int uid, int group);
+extern int libcfs_kkuc_group_foreach(int group, libcfs_kkuc_cb_t cb_func,
+				     void *cb_arg);
+
+#define LK_FLG_STOP 0x01
+
+/* kernelcomm control structure, passed from userspace to kernel */
+typedef struct lustre_kernelcomm {
+	__u32 lk_wfd;
+	__u32 lk_rfd;
+	__u32 lk_uid;
+	__u32 lk_group;
+	__u32 lk_data;
+	__u32 lk_flags;
+} __attribute__((packed)) lustre_kernelcomm;
+
+/* Userspace methods */
+extern int libcfs_ukuc_start(lustre_kernelcomm *l, int groups);
+extern int libcfs_ukuc_stop(lustre_kernelcomm *l);
+extern int libcfs_ukuc_msg_get(lustre_kernelcomm *l, char *buf, int maxsize,
+			       int transport);
+
+#endif /* __LIBCFS_KERNELCOMM_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h
new file mode 100644
index 000000000000..9c40ed904da5
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h
@@ -0,0 +1,101 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_prim.h
+ *
+ * General primitives.
+ *
+ */
+
+#ifndef __LIBCFS_PRIM_H__
+#define __LIBCFS_PRIM_H__
+
+#ifndef EXPORT_SYMBOL
+# define EXPORT_SYMBOL(s)
+#endif
+
+/*
+ * Schedule
+ */
+void cfs_pause(cfs_duration_t ticks);
+
+/*
+ * Timer
+ */
+typedef  void (cfs_timer_func_t)(ulong_ptr_t);
+void schedule_timeout_and_set_state(cfs_task_state_t, int64_t);
+
+void init_waitqueue_entry_current(wait_queue_t *link);
+int64_t waitq_timedwait(wait_queue_t *, cfs_task_state_t, int64_t);
+void waitq_wait(wait_queue_t *, cfs_task_state_t);
+void add_wait_queue_exclusive_head(wait_queue_head_t *, wait_queue_t *);
+
+void cfs_init_timer(timer_list_t *t);
+void cfs_timer_init(timer_list_t *t, cfs_timer_func_t *func, void *arg);
+void cfs_timer_done(timer_list_t *t);
+void cfs_timer_arm(timer_list_t *t, cfs_time_t deadline);
+void cfs_timer_disarm(timer_list_t *t);
+int  cfs_timer_is_armed(timer_list_t *t);
+cfs_time_t cfs_timer_deadline(timer_list_t *t);
+
+/*
+ * Memory
+ */
+#ifndef memory_pressure_get
+#define memory_pressure_get() (0)
+#endif
+#ifndef memory_pressure_set
+#define memory_pressure_set() do {} while (0)
+#endif
+#ifndef memory_pressure_clr
+#define memory_pressure_clr() do {} while (0)
+#endif
+
+static inline int cfs_memory_pressure_get_and_set(void)
+{
+	int old = memory_pressure_get();
+
+	if (!old)
+		memory_pressure_set();
+	return old;
+}
+
+static inline void cfs_memory_pressure_restore(int old)
+{
+	if (old)
+		memory_pressure_set();
+	else
+		memory_pressure_clr();
+	return;
+}
+#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h
new file mode 100644
index 000000000000..62bf32f8539d
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h
@@ -0,0 +1,568 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_private.h
+ *
+ * Various defines for libcfs.
+ *
+ */
+
+#ifndef __LIBCFS_PRIVATE_H__
+#define __LIBCFS_PRIVATE_H__
+
+/* XXX this layering violation is for nidstrings */
+#include <linux/lnet/types.h>
+
+#ifndef DEBUG_SUBSYSTEM
+# define DEBUG_SUBSYSTEM S_UNDEFINED
+#endif
+
+
+
+/*
+ * When this is on, LASSERT macro includes check for assignment used instead
+ * of equality check, but doesn't have unlikely(). Turn this on from time to
+ * time to make test-builds. This shouldn't be on for production release.
+ */
+#define LASSERT_CHECKED (0)
+
+
+#define LASSERTF(cond, fmt, ...)					\
+do {									\
+	if (unlikely(!(cond))) {					\
+		LIBCFS_DEBUG_MSG_DATA_DECL(__msg_data, D_EMERG, NULL);	\
+		libcfs_debug_msg(&__msg_data,				\
+				 "ASSERTION( %s ) failed: " fmt, #cond,	\
+				 ## __VA_ARGS__);			\
+		lbug_with_loc(&__msg_data);				\
+	}								\
+} while (0)
+
+#define LASSERT(cond) LASSERTF(cond, "\n")
+
+# define LINVRNT(exp) ((void)sizeof!!(exp))
+
+#define KLASSERT(e) LASSERT(e)
+
+void lbug_with_loc(struct libcfs_debug_msg_data *) __attribute__((noreturn));
+
+#define LBUG()							  \
+do {								    \
+	LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_EMERG, NULL);	     \
+	lbug_with_loc(&msgdata);					\
+} while(0)
+
+extern atomic_t libcfs_kmemory;
+/*
+ * Memory
+ */
+
+# define libcfs_kmem_inc(ptr, size)		\
+do {						\
+	atomic_add(size, &libcfs_kmemory);	\
+} while (0)
+
+# define libcfs_kmem_dec(ptr, size)		\
+do {						\
+	atomic_sub(size, &libcfs_kmemory);	\
+} while (0)
+
+# define libcfs_kmem_read()			\
+	atomic_read(&libcfs_kmemory)
+
+
+#ifndef LIBCFS_VMALLOC_SIZE
+#define LIBCFS_VMALLOC_SIZE	(2 << PAGE_CACHE_SHIFT) /* 2 pages */
+#endif
+
+#define LIBCFS_ALLOC_PRE(size, mask)					    \
+do {									    \
+	LASSERT(!in_interrupt() ||					    \
+		((size) <= LIBCFS_VMALLOC_SIZE &&			    \
+		 ((mask) & GFP_ATOMIC)) != 0);			    \
+} while (0)
+
+#define LIBCFS_ALLOC_POST(ptr, size)					    \
+do {									    \
+	if (unlikely((ptr) == NULL)) {					    \
+		CERROR("LNET: out of memory at %s:%d (tried to alloc '"	    \
+		       #ptr "' = %d)\n", __FILE__, __LINE__, (int)(size));  \
+		CERROR("LNET: %d total bytes allocated by lnet\n",	    \
+		       libcfs_kmem_read());				    \
+	} else {							    \
+		memset((ptr), 0, (size));				    \
+		libcfs_kmem_inc((ptr), (size));				    \
+		CDEBUG(D_MALLOC, "alloc '" #ptr "': %d at %p (tot %d).\n",  \
+		       (int)(size), (ptr), libcfs_kmem_read());		    \
+	}								   \
+} while (0)
+
+/**
+ * allocate memory with GFP flags @mask
+ */
+#define LIBCFS_ALLOC_GFP(ptr, size, mask)				    \
+do {									    \
+	LIBCFS_ALLOC_PRE((size), (mask));				    \
+	(ptr) = (size) <= LIBCFS_VMALLOC_SIZE ?				    \
+		kmalloc((size), (mask)) : vmalloc(size);	    \
+	LIBCFS_ALLOC_POST((ptr), (size));				    \
+} while (0)
+
+/**
+ * default allocator
+ */
+#define LIBCFS_ALLOC(ptr, size) \
+	LIBCFS_ALLOC_GFP(ptr, size, __GFP_IO)
+
+/**
+ * non-sleeping allocator
+ */
+#define LIBCFS_ALLOC_ATOMIC(ptr, size) \
+	LIBCFS_ALLOC_GFP(ptr, size, GFP_ATOMIC)
+
+/**
+ * allocate memory for specified CPU partition
+ *   \a cptab != NULL, \a cpt is CPU partition id of \a cptab
+ *   \a cptab == NULL, \a cpt is HW NUMA node id
+ */
+#define LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, mask)		    \
+do {									    \
+	LIBCFS_ALLOC_PRE((size), (mask));				    \
+	(ptr) = (size) <= LIBCFS_VMALLOC_SIZE ?				    \
+		cfs_cpt_malloc((cptab), (cpt), (size), (mask)) :	    \
+		cfs_cpt_vmalloc((cptab), (cpt), (size));		    \
+	LIBCFS_ALLOC_POST((ptr), (size));				    \
+} while (0)
+
+/** default numa allocator */
+#define LIBCFS_CPT_ALLOC(ptr, cptab, cpt, size)				    \
+	LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, __GFP_IO)
+
+#define LIBCFS_FREE(ptr, size)					  \
+do {								    \
+	int __libcfs_sz = (size); /* not "s": avoids capturing caller's s */ \
+	if (unlikely((ptr) == NULL)) {				  \
+		CERROR("LIBCFS: free NULL '" #ptr "' (%d bytes) at "    \
+		       "%s:%d\n", __libcfs_sz, __FILE__, __LINE__);     \
+		break;						  \
+	}							       \
+	libcfs_kmem_dec((ptr), __libcfs_sz);			      \
+	CDEBUG(D_MALLOC, "kfreed '" #ptr "': %d at %p (tot %d).\n",     \
+	       __libcfs_sz, (ptr), libcfs_kmem_read());			\
+	if (unlikely(__libcfs_sz > LIBCFS_VMALLOC_SIZE))	  \
+		vfree(ptr);	/* mirrors the vmalloc() size cut-off */    \
+	else							    \
+		kfree(ptr);					  \
+} while (0)
+
+/******************************************************************************/
+
+/* htonl hack - either this, or compile with -O2. Stupid byteorder/generic.h */
+#if defined(__GNUC__) && (__GNUC__ >= 2) && !defined(__OPTIMIZE__)
+#define ___htonl(x) __cpu_to_be32(x)
+#define ___htons(x) __cpu_to_be16(x)
+#define ___ntohl(x) __be32_to_cpu(x)
+#define ___ntohs(x) __be16_to_cpu(x)
+#define htonl(x) ___htonl(x)
+#define ntohl(x) ___ntohl(x)
+#define htons(x) ___htons(x)
+#define ntohs(x) ___ntohs(x)
+#endif
+
+void libcfs_debug_dumpstack(task_t *tsk);
+void libcfs_run_upcall(char **argv);
+void libcfs_run_lbug_upcall(struct libcfs_debug_msg_data *);
+void libcfs_debug_dumplog(void);
+int libcfs_debug_init(unsigned long bufsize);
+int libcfs_debug_cleanup(void);
+int libcfs_debug_clear_buffer(void);
+int libcfs_debug_mark_buffer(const char *text);
+
+void libcfs_debug_set_level(unsigned int debug_level);
+
+
+/*
+ * allocate per-cpu-partition data, returned value is an array of pointers,
+ * variable can be indexed by CPU ID.
+ *	cptable != NULL: size of array is number of CPU partitions
+ *	cptable == NULL: size of array is number of HW cores
+ */
+void *cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size);
+/*
+ * destroy per-cpu-partition variable
+ */
+void  cfs_percpt_free(void *vars);
+int   cfs_percpt_number(void *vars);
+void *cfs_percpt_current(void *vars);
+void *cfs_percpt_index(void *vars, int idx);
+
+#define cfs_percpt_for_each(var, i, vars)		\
+	for (i = 0; i < cfs_percpt_number(vars) &&	\
+		    ((var) = (vars)[i]) != NULL; i++)
+
+/*
+ * allocate a variable array, returned value is an array of pointers.
+ * Caller can specify length of array by count.
+ */
+void *cfs_array_alloc(int count, unsigned int size);
+void  cfs_array_free(void *vars);
+
+#define LASSERT_ATOMIC_ENABLED	  (1)
+
+#if LASSERT_ATOMIC_ENABLED
+
+/** assert value of @a is equal to @v */
+#define LASSERT_ATOMIC_EQ(a, v)				 \
+do {							    \
+	LASSERTF(atomic_read(a) == (v),		       \
+		 "value: %d\n", atomic_read((a)));	  \
+} while (0)
+
+/** assert value of @a is not equal to @v */
+#define LASSERT_ATOMIC_NE(a, v)				 \
+do {							    \
+	LASSERTF(atomic_read(a) != (v),		       \
+		 "value: %d\n", atomic_read((a)));	  \
+} while (0)
+
+/** assert value of @a is less than @v */
+#define LASSERT_ATOMIC_LT(a, v)				 \
+do {							    \
+	LASSERTF(atomic_read(a) < (v),			\
+		 "value: %d\n", atomic_read((a)));	  \
+} while (0)
+
+/** assert value of @a is less than or equal to @v */
+#define LASSERT_ATOMIC_LE(a, v)				 \
+do {							    \
+	LASSERTF(atomic_read(a) <= (v),		       \
+		 "value: %d\n", atomic_read((a)));	  \
+} while (0)
+
+/** assert value of @a is greater than @v */
+#define LASSERT_ATOMIC_GT(a, v)				 \
+do {							    \
+	LASSERTF(atomic_read(a) > (v),			\
+		 "value: %d\n", atomic_read((a)));	  \
+} while (0)
+
+/** assert value of @a is greater than or equal to @v */
+#define LASSERT_ATOMIC_GE(a, v)				 \
+do {							    \
+	LASSERTF(atomic_read(a) >= (v),		       \
+		 "value: %d\n", atomic_read((a)));	  \
+} while (0)
+
+/** assert value of @a is greater than @v1 and less than @v2 */
+#define LASSERT_ATOMIC_GT_LT(a, v1, v2)			 \
+do {							    \
+	int __v = atomic_read(a);	/* read once: test and report agree */ \
+	LASSERTF(__v > (v1) && __v < (v2), "value: %d\n", __v); \
+} while (0)
+
+/** assert value of @a is greater than @v1 and less than or equal to @v2 */
+#define LASSERT_ATOMIC_GT_LE(a, v1, v2)			 \
+do {							    \
+	int __v = atomic_read(a);	/* read once: test and report agree */ \
+	LASSERTF(__v > (v1) && __v <= (v2), "value: %d\n", __v); \
+} while (0)
+
+/** assert value of @a is greater than or equal to @v1 and less than @v2 */
+#define LASSERT_ATOMIC_GE_LT(a, v1, v2)			 \
+do {							    \
+	int __v = atomic_read(a);	/* read once: test and report agree */ \
+	LASSERTF(__v >= (v1) && __v < (v2), "value: %d\n", __v); \
+} while (0)
+
+/** assert value of @a is within the closed range [@v1, @v2] */
+#define LASSERT_ATOMIC_GE_LE(a, v1, v2)			 \
+do {							    \
+	int __v = atomic_read(a);	/* read once: test and report agree */ \
+	LASSERTF(__v >= (v1) && __v <= (v2), "value: %d\n", __v); \
+} while (0)
+
+#else /* !LASSERT_ATOMIC_ENABLED */
+
+#define LASSERT_ATOMIC_EQ(a, v)		 do {} while (0)
+#define LASSERT_ATOMIC_NE(a, v)		 do {} while (0)
+#define LASSERT_ATOMIC_LT(a, v)		 do {} while (0)
+#define LASSERT_ATOMIC_LE(a, v)		 do {} while (0)
+#define LASSERT_ATOMIC_GT(a, v)		 do {} while (0)
+#define LASSERT_ATOMIC_GE(a, v)		 do {} while (0)
+#define LASSERT_ATOMIC_GT_LT(a, v1, v2)	 do {} while (0)
+#define LASSERT_ATOMIC_GT_LE(a, v1, v2)	 do {} while (0)
+#define LASSERT_ATOMIC_GE_LT(a, v1, v2)	 do {} while (0)
+#define LASSERT_ATOMIC_GE_LE(a, v1, v2)	 do {} while (0)
+
+#endif /* LASSERT_ATOMIC_ENABLED */
+
+#define LASSERT_ATOMIC_ZERO(a)		  LASSERT_ATOMIC_EQ(a, 0)
+#define LASSERT_ATOMIC_POS(a)		   LASSERT_ATOMIC_GT(a, 0)
+
+#define CFS_ALLOC_PTR(ptr)      LIBCFS_ALLOC(ptr, sizeof(*(ptr))) /* caller adds ';' */
+#define CFS_FREE_PTR(ptr)       LIBCFS_FREE(ptr, sizeof(*(ptr)))  /* caller adds ';' */
+
+/*
+ * percpu partition lock
+ *
+ * There are some use-cases like this in Lustre:
+ * . each CPU partition has its own private data which is frequently changed,
+ *   and mostly by the local CPU partition.
+ * . all CPU partitions share some global data, these data are rarely changed.
+ *
+ * LNet is typical example.
+ * CPU partition lock is designed for this kind of use-cases:
+ * . each CPU partition has its own private lock
+ * . change on private data just needs to take the private lock
+ * . read on shared data just needs to take _any_ of private locks
+ * . change on shared data needs to take _all_ private locks,
+ *   which is slow and should be really rare.
+ */
+
+enum {
+	CFS_PERCPT_LOCK_EX	= -1, /* negative */
+};
+
+
+struct cfs_percpt_lock {
+	/* cpu-partition-table for this lock */
+	struct cfs_cpt_table	*pcl_cptab;
+	/* exclusively locked */
+	unsigned int		pcl_locked;
+	/* private lock table */
+	spinlock_t		**pcl_locks;
+};
+
+/* return number of private locks */
+static inline int
+cfs_percpt_lock_num(struct cfs_percpt_lock *pcl)
+{
+	return cfs_cpt_number(pcl->pcl_cptab);
+}
+
+
+/*
+ * create a cpu-partition lock based on CPU partition table \a cptab,
+ * each private lock has extra \a psize bytes padding data
+ */
+struct cfs_percpt_lock *cfs_percpt_lock_alloc(struct cfs_cpt_table *cptab);
+/* destroy a cpu-partition lock */
+void cfs_percpt_lock_free(struct cfs_percpt_lock *pcl);
+
+/* lock private lock \a index of \a pcl */
+void cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index);
+/* unlock private lock \a index of \a pcl */
+void cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index);
+/* create percpt (atomic) refcount based on @cptab */
+atomic_t **cfs_percpt_atomic_alloc(struct cfs_cpt_table *cptab, int val);
+/* destroy percpt refcount */
+void cfs_percpt_atomic_free(atomic_t **refs);
+/* return sum of all percpu refs */
+int cfs_percpt_atomic_summary(atomic_t **refs);
+
+
+/** Compile-time assertion.
+
+ * Check an invariant described by a constant expression at compile time by
+ * forcing a compiler error if it does not hold.  \a cond must be a constant
+ * expression as defined by the ISO C Standard:
+ *
+ *       6.8.4.2  The switch statement
+ *       ....
+ *       [#3] The expression of each case label shall be  an  integer
+ *       constant   expression  and  no  two  of  the  case  constant
+ *       expressions in the same switch statement shall have the same
+ *       value  after  conversion...
+ *
+ */
+#define CLASSERT(cond) do {switch(42) {case (cond): case 0: break;}} while (0)
+
+/* support decl needed both by kernel and liblustre */
+int	 libcfs_isknown_lnd(int type);
+char       *libcfs_lnd2modname(int type);
+char       *libcfs_lnd2str(int type);
+int	 libcfs_str2lnd(const char *str);
+char       *libcfs_net2str(__u32 net);
+char       *libcfs_nid2str(lnet_nid_t nid);
+__u32       libcfs_str2net(const char *str);
+lnet_nid_t  libcfs_str2nid(const char *str);
+int	 libcfs_str2anynid(lnet_nid_t *nid, const char *str);
+char       *libcfs_id2str(lnet_process_id_t id);
+void	cfs_free_nidlist(struct list_head *list);
+int	 cfs_parse_nidlist(char *str, int len, struct list_head *list);
+int	 cfs_match_nid(lnet_nid_t nid, struct list_head *list);
+
+/** \addtogroup lnet_addr
+ * @{ */
+/* how an LNET NID encodes net:address */
+/** extract the address part of an lnet_nid_t */
+#define LNET_NIDADDR(nid)      ((__u32)((nid) & 0xffffffff))
+/** extract the network part (upper 32 bits) of an lnet_nid_t */
+#define LNET_NIDNET(nid)       ((__u32)(((nid) >> 32) & 0xffffffff))
+/** make an lnet_nid_t from a network part and an address part */
+#define LNET_MKNID(net,addr)   ((((__u64)(net))<<32)|((__u64)(addr)))
+/* how net encodes type:number */
+#define LNET_NETNUM(net)       ((net) & 0xffff)
+#define LNET_NETTYP(net)       (((net) >> 16) & 0xffff)
+#define LNET_MKNET(typ,num)    ((((__u32)(typ))<<16)|((__u32)(num)))
+/** @} lnet_addr */
+
+/* max value for numeric network address */
+#define MAX_NUMERIC_VALUE 0xffffffff
+
+/* implication */
+#define ergo(a, b) (!(a) || (b))
+/* logical equivalence */
+#define equi(a, b) (!!(a) == !!(b))
+
+#ifndef CFS_CURRENT_TIME
+# define CFS_CURRENT_TIME time(0)
+#endif
+
+/* --------------------------------------------------------------------
+ * Light-weight trace
+ * Support for temporary event tracing with minimal Heisenberg effect.
+ * All stuff about lwt are put in arch/kp30.h
+ * -------------------------------------------------------------------- */
+
+struct libcfs_device_userstate
+{
+	int	   ldu_memhog_pages;
+	struct page   *ldu_memhog_root_page;
+};
+
+/* what used to be in portals_lib.h */
+#ifndef MIN
+# define MIN(a,b) (((a)<(b)) ? (a): (b))
+#endif
+#ifndef MAX
+# define MAX(a,b) (((a)>(b)) ? (a): (b))
+#endif
+
+#define MKSTR(ptr) ((ptr) ? (ptr) : "")	/* NULL-safe string: "" when ptr is NULL */
+
+static inline int cfs_size_round4 (int val)
+{
+	return (val + 3) & (~0x3);
+}
+
+#ifndef HAVE_CFS_SIZE_ROUND
+static inline int cfs_size_round (int val)
+{
+	return (val + 7) & (~0x7);
+}
+#define HAVE_CFS_SIZE_ROUND
+#endif
+
+static inline int cfs_size_round16(int val)
+{
+	return (val + 0xf) & (~0xf);
+}
+
+static inline int cfs_size_round32(int val)
+{
+	return (val + 0x1f) & (~0x1f);
+}
+
+static inline int cfs_size_round0(int val)
+{
+	if (!val)
+		return 0;
+	return (val + 1 + 7) & (~0x7);
+}
+
+static inline size_t cfs_round_strlen(const char *fset)
+{
+	return (size_t)cfs_size_round((int)strlen(fset) + 1); /* +1: NUL byte */
+}
+
+/* roundup \a val to power2 */
+static inline unsigned int cfs_power2_roundup(unsigned int val)
+{
+	if (val != LOWEST_BIT_SET(val)) { /* not a power of 2 already */
+		do {
+			val &= ~LOWEST_BIT_SET(val);
+		} while (val != LOWEST_BIT_SET(val));
+		/* ...and round up */
+		val <<= 1;
+	}
+	return val;
+}
+
+#define LOGL(var,len,ptr)	/* copy var into ptr (if set), advance ptr */ \
+do {							    \
+	if (var)						\
+		memcpy((char *)(ptr), (const char *)(var), (len)); \
+	(ptr) += cfs_size_round(len);			     \
+} while (0)
+
+#define LOGU(var,len,ptr)	/* copy ptr out to var (if set), advance ptr */ \
+do {							    \
+	if (var)						\
+		memcpy((char *)(var), (const char *)(ptr), (len)); \
+	(ptr) += cfs_size_round(len);			     \
+} while (0)
+
+#define LOGL0(var,len,ptr)	/* like LOGL, but NUL-terminates the copy */ \
+do {						    \
+	if (!(len))				       \
+		break;				  \
+	memcpy((char *)(ptr), (const char *)(var), (len)); \
+	*((char *)(ptr) + (len)) = 0;		     \
+	(ptr) += cfs_size_round((len) + 1);		 \
+} while (0)
+
+/**
+ *  Lustre Network Driver types.
+ */
+enum {
+	/* Only add to these values (i.e. don't ever change or redefine them):
+	 * network addresses depend on them... */
+	QSWLND    = 1,
+	SOCKLND   = 2,
+	GMLND     = 3, /* obsolete, keep it so that libcfs_nid2str works */
+	PTLLND    = 4,
+	O2IBLND   = 5,
+	CIBLND    = 6,
+	OPENIBLND = 7,
+	IIBLND    = 8,
+	LOLND     = 9,
+	RALND     = 10,
+	VIBLND    = 11,
+	MXLND     = 12,
+	GNILND    = 13,
+};
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_string.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_string.h
new file mode 100644
index 000000000000..a6bac9c36339
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_string.h
@@ -0,0 +1,137 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_string.h
+ *
+ * Generic string manipulation functions.
+ *
+ * Author: Nathan Rutman <nathan.rutman@sun.com>
+ */
+
+#ifndef __LIBCFS_STRING_H__
+#define __LIBCFS_STRING_H__
+
+/* libcfs_string.c */
+/* string comparison ignoring case */
+int cfs_strncasecmp(const char *s1, const char *s2, size_t n);
+/* Convert a text string to a bitmask */
+int cfs_str2mask(const char *str, const char *(*bit2str)(int bit),
+		 int *oldmask, int minmask, int allmask);
+
+/* Allocate space for and copy an existing string.
+ * Must free with kfree().
+ */
+char *cfs_strdup(const char *str, u_int32_t flags);
+
+/* safe vsnprintf */
+int cfs_vsnprintf(char *buf, size_t size, const char *fmt, va_list args);
+
+/* safe snprintf */
+int cfs_snprintf(char *buf, size_t size, const char *fmt, ...);
+
+/* trim leading and trailing space characters */
+char *cfs_firststr(char *str, size_t size);
+
+/**
+ * Structure to represent NULL-less strings.
+ */
+struct cfs_lstr {
+	char		*ls_str;
+	int		ls_len;
+};
+
+/*
+ * Structure to represent \<range_expr\> token of the syntax.
+ */
+struct cfs_range_expr {
+	/*
+	 * Link to cfs_expr_list::el_exprs.
+	 */
+	struct list_head	re_link;
+	__u32		re_lo;
+	__u32		re_hi;
+	__u32		re_stride;
+};
+
+struct cfs_expr_list {
+	struct list_head	el_link;
+	struct list_head	el_exprs;
+};
+
+static inline int
+cfs_iswhite(char c)
+{
+	switch (c) {
+	case ' ':
+	case '\t':
+	case '\n':
+	case '\r':
+		return 1;
+	default:
+		break;
+	}
+	return 0;
+}
+
+char *cfs_trimwhite(char *str);
+int cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res);
+int cfs_str2num_check(char *str, int nob, unsigned *num,
+		      unsigned min, unsigned max);
+int cfs_range_expr_parse(struct cfs_lstr *src, unsigned min, unsigned max,
+			 int single_tok, struct cfs_range_expr **expr);
+int cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list);
+int cfs_expr_list_values(struct cfs_expr_list *expr_list,
+			 int max, __u32 **values);
+static inline void
+cfs_expr_list_values_free(__u32 *values, int num)
+{
+	/* This array is allocated by LIBCFS_ALLOC(), so it shouldn't be freed
+	 * by OBD_FREE() if it's called by module other than libcfs & LNet,
+	 * otherwise we will see fake memory leak */
+	LIBCFS_FREE(values, num * sizeof(values[0]));
+}
+
+void cfs_expr_list_free(struct cfs_expr_list *expr_list);
+void cfs_expr_list_print(struct cfs_expr_list *expr_list);
+int cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max,
+			struct cfs_expr_list **elpp);
+void cfs_expr_list_free_list(struct list_head *list);
+int cfs_ip_addr_parse(char *str, int len, struct list_head *list);
+int cfs_ip_addr_match(__u32 addr, struct list_head *list);
+void cfs_ip_addr_free(struct list_head *list);
+
+#define	strtoul(str, endp, base)	simple_strtoul(str, endp, base)
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_time.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_time.h
new file mode 100644
index 000000000000..4bdd77163d5e
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_time.h
@@ -0,0 +1,132 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_time.h
+ *
+ * Time functions.
+ *
+ */
+
+#ifndef __LIBCFS_TIME_H__
+#define __LIBCFS_TIME_H__
+/*
+ * generic time manipulation functions.
+ */
+
+static inline cfs_time_t cfs_time_add(cfs_time_t t, cfs_duration_t d)
+{
+	return (cfs_time_t)(t + d);
+}
+
+static inline cfs_duration_t cfs_time_sub(cfs_time_t t1, cfs_time_t t2)
+{
+	return (cfs_duration_t)(t1 - t2);	/* difference is a duration */
+}
+
+static inline int cfs_time_after(cfs_time_t t1, cfs_time_t t2)
+{
+	return cfs_time_before(t2, t1);
+}
+
+static inline int cfs_time_aftereq(cfs_time_t t1, cfs_time_t t2)
+{
+	return cfs_time_beforeq(t2, t1);
+}
+
+
+static inline cfs_time_t cfs_time_shift(int seconds)
+{
+	return cfs_time_add(cfs_time_current(), cfs_time_seconds(seconds));
+}
+
+static inline long cfs_timeval_sub(struct timeval *large, struct timeval *small,
+				   struct timeval *result)
+{
+	long r = (long) (
+		(large->tv_sec - small->tv_sec) * ONE_MILLION +
+		(large->tv_usec - small->tv_usec));
+	if (result != NULL) {
+		result->tv_usec = r % ONE_MILLION;
+		result->tv_sec = r / ONE_MILLION;
+	}
+	return r;
+}
+
+static inline void cfs_slow_warning(cfs_time_t now, int seconds, char *msg)
+{
+	if (cfs_time_after(cfs_time_current(), /* threshold was hard-coded 15 */
+			   cfs_time_add(now, cfs_time_seconds(seconds))))
+		CERROR("slow %s "CFS_TIME_T" sec\n", msg,
+		       cfs_duration_sec(cfs_time_sub(cfs_time_current(),now)));
+}
+
+#define CFS_RATELIMIT(seconds)				  \
+({							      \
+	/*						      \
+	 * XXX nikita: non-portable initializer		 \
+	 */						     \
+	static time_t __next_message = 0;		       \
+	int result;					     \
+								\
+	if (cfs_time_after(cfs_time_current(), __next_message)) \
+		result = 1;				     \
+	else {						  \
+		__next_message = cfs_time_shift(seconds);       \
+		result = 0;				     \
+	}						       \
+	result;						 \
+})
+
+/*
+ * helper function similar to do_gettimeofday() of Linux kernel
+ */
+static inline void cfs_fs_timeval(struct timeval *tv)
+{
+	cfs_fs_time_t time;
+
+	cfs_fs_time_current(&time);
+	cfs_fs_time_usec(&time, tv);
+}
+
+/*
+ * return valid time-out based on user supplied one. Currently we only check
+ * that time-out is not shorter than allowed.
+ */
+static inline cfs_duration_t cfs_timeout_cap(cfs_duration_t timeout)
+{
+	if (timeout < CFS_TICK)
+		timeout = CFS_TICK;
+	return timeout;
+}
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_workitem.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_workitem.h
new file mode 100644
index 000000000000..5cc64f327a87
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_workitem.h
@@ -0,0 +1,110 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_workitem.h
+ *
+ * Author: Isaac Huang  <he.h.huang@oracle.com>
+ *	 Liang Zhen   <zhen.liang@sun.com>
+ *
+ * A workitem is deferred work with these semantics:
+ * - a workitem always runs in thread context.
+ * - a workitem can be concurrent with other workitems but is strictly
+ *   serialized with respect to itself.
+ * - no CPU affinity, a workitem does not necessarily run on the same CPU
+ *   that schedules it. However, this might change in the future.
+ * - if a workitem is scheduled again before it has a chance to run, it
+ *   runs only once.
+ * - if a workitem is scheduled while it runs, it runs again after it
+ *   completes; this ensures that events occurring while other events are
+ *   being processed receive due attention. This behavior also allows a
+ *   workitem to reschedule itself.
+ *
+ * Usage notes:
+ * - a workitem can sleep but it should be aware of how that sleep might
+ *   affect others.
+ * - a workitem runs inside a kernel thread so there's no user space to access.
+ * - do not use a workitem if the scheduling latency can't be tolerated.
+ *
+ * When wi_action returns non-zero, it means the workitem has either been
+ * freed or reused and workitem scheduler won't touch it any more.
+ */
+
+#ifndef __LIBCFS_WORKITEM_H__
+#define __LIBCFS_WORKITEM_H__
+
+struct cfs_wi_sched;
+
+void cfs_wi_sched_destroy(struct cfs_wi_sched *);
+int cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab, int cpt,
+			int nthrs, struct cfs_wi_sched **);
+
+struct cfs_workitem;
+
+typedef int (*cfs_wi_action_t) (struct cfs_workitem *);
+typedef struct cfs_workitem {
+	/** chain on runq or rerunq */
+	struct list_head       wi_list;
+	/** working function */
+	cfs_wi_action_t  wi_action;
+	/** arg for working function */
+	void	    *wi_data;
+	/** in running */
+	unsigned short   wi_running:1;
+	/** scheduled */
+	unsigned short   wi_scheduled:1;
+} cfs_workitem_t;
+
+static inline void
+cfs_wi_init(cfs_workitem_t *wi, void *data, cfs_wi_action_t action)
+{
+	INIT_LIST_HEAD(&wi->wi_list);
+
+	wi->wi_running   = 0;
+	wi->wi_scheduled = 0;
+	wi->wi_data      = data;
+	wi->wi_action    = action;
+}
+
+void cfs_wi_schedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi);
+int  cfs_wi_deschedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi);
+void cfs_wi_exit(struct cfs_wi_sched *sched, cfs_workitem_t *wi);
+
+int  cfs_wi_startup(void);
+void cfs_wi_shutdown(void);
+
+/** # workitem scheduler loops before reschedule */
+#define CFS_WI_RESCHED    128
+
+#endif /* __LIBCFS_WORKITEM_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/kp30.h b/drivers/staging/lustre/include/linux/libcfs/linux/kp30.h
new file mode 100644
index 000000000000..4b7ae1c5bd3b
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/kp30.h
@@ -0,0 +1,286 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_LINUX_KP30_H__
+#define __LIBCFS_LINUX_KP30_H__
+
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/kmod.h>
+#include <linux/notifier.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/time.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/version.h>
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+#include <linux/rwsem.h>
+#include <linux/proc_fs.h>
+#include <linux/file.h>
+#include <linux/smp.h>
+#include <linux/ctype.h>
+#include <linux/compiler.h>
+#ifdef HAVE_MM_INLINE
+# include <linux/mm_inline.h>
+#endif
+#include <linux/kallsyms.h>
+#include <linux/moduleparam.h>
+#include <linux/scatterlist.h>
+
+#include <linux/libcfs/linux/portals_compat25.h>
+
+
+#define prepare_work(wq,cb,cbdata)					    \
+do {									  \
+	INIT_WORK((wq), (void *)(cb));					\
+} while (0)
+
+#define cfs_get_work_data(type,field,data) container_of(data,type,field)
+
+
+#define our_recalc_sigpending(current) recalc_sigpending()
+#define strtok(a,b) strpbrk(a, b)
+#define work_struct_t      struct work_struct
+
+#ifdef CONFIG_SMP
+#else
+#endif
+
+
+#define SEM_COUNT(sem)	  ((sem)->count)
+
+
+/* ------------------------------------------------------------------- */
+
+#define PORTAL_SYMBOL_REGISTER(x)
+#define PORTAL_SYMBOL_UNREGISTER(x)
+
+
+
+
+/******************************************************************************/
+/* Module parameter support */
+#define CFS_MODULE_PARM(name, t, type, perm, desc) \
+	module_param(name, type, perm);\
+	MODULE_PARM_DESC(name, desc)
+
+#define CFS_SYSFS_MODULE_PARM  1 /* module parameters accessible via sysfs */
+
+/******************************************************************************/
+
+#if (__GNUC__)
+/* Use the special GNU C __attribute__ hack to have the compiler check the
+ * printf style argument string against the actual argument count and
+ * types.
+ */
+#ifdef printf
+# warning printf has been defined as a macro...
+# undef printf
+#endif
+
+#endif /* __GNUC__ */
+
+# define fprintf(a, format, b...) CDEBUG(D_OTHER, format , ## b)
+# define printf(format, b...) CDEBUG(D_OTHER, format , ## b)
+# define time(a) CURRENT_TIME
+
+# define cfs_num_present_cpus()  num_present_cpus()
+
+/******************************************************************************/
+/* Light-weight trace
+ * Support for temporary event tracing with minimal Heisenberg effect. */
+#define LWT_SUPPORT  0
+
+#define LWT_MEMORY   (16<<20)
+
+#ifndef KLWT_SUPPORT
+#  if !defined(BITS_PER_LONG)
+#   error "BITS_PER_LONG not defined"
+#  endif
+
+/* kernel hasn't defined this? */
+typedef struct {
+	long long   lwte_when;
+	char       *lwte_where;
+	void       *lwte_task;
+	long	lwte_p1;
+	long	lwte_p2;
+	long	lwte_p3;
+	long	lwte_p4;
+# if BITS_PER_LONG > 32
+	long	lwte_pad;
+# endif
+} lwt_event_t;
+#endif /* !KLWT_SUPPORT */
+
+#if LWT_SUPPORT
+#  if !KLWT_SUPPORT
+
+typedef struct _lwt_page {
+	struct list_head	       lwtp_list;
+	struct page	     *lwtp_page;
+	lwt_event_t	     *lwtp_events;
+} lwt_page_t;
+
+typedef struct {
+	int		lwtc_current_index;
+	lwt_page_t	*lwtc_current_page;
+} lwt_cpu_t;
+
+extern int       lwt_enabled;
+extern lwt_cpu_t lwt_cpus[];
+
+/* Note that we _don't_ define LWT_EVENT at all if LWT_SUPPORT isn't set.
+ * This stuff is meant for finding specific problems; it never stays in
+ * production code... */
+
+#define LWTSTR(n)       #n
+#define LWTWHERE(f,l)   f ":" LWTSTR(l)
+#define LWT_EVENTS_PER_PAGE (PAGE_CACHE_SIZE / sizeof (lwt_event_t))
+
+#define LWT_EVENT(p1, p2, p3, p4)				       \
+do {								    \
+	unsigned long    flags;					 \
+	lwt_cpu_t       *cpu;					   \
+	lwt_page_t      *p;					     \
+	lwt_event_t     *e;					     \
+									\
+	if (lwt_enabled) {					      \
+		local_irq_save (flags);				 \
+									\
+		cpu = &lwt_cpus[smp_processor_id()];		    \
+		p = cpu->lwtc_current_page;			     \
+		e = &p->lwtp_events[cpu->lwtc_current_index++];	 \
+									\
+		if (cpu->lwtc_current_index >= LWT_EVENTS_PER_PAGE) {   \
+			cpu->lwtc_current_page =			\
+				list_entry (p->lwtp_list.next,      \
+						lwt_page_t, lwtp_list); \
+			cpu->lwtc_current_index = 0;		    \
+		}						       \
+									\
+		e->lwte_when  = get_cycles();			   \
+		e->lwte_where = LWTWHERE(__FILE__,__LINE__);	    \
+		e->lwte_task  = current;				\
+		e->lwte_p1    = (long)(p1);			     \
+		e->lwte_p2    = (long)(p2);			     \
+		e->lwte_p3    = (long)(p3);			     \
+		e->lwte_p4    = (long)(p4);			     \
+									\
+		local_irq_restore (flags);			      \
+	}							       \
+} while (0)
+
+#endif /* !KLWT_SUPPORT */
+
+extern int  lwt_init (void);
+extern void lwt_fini (void);
+extern int  lwt_lookup_string (int *size, char *knlptr,
+			       char *usrptr, int usrsize);
+extern int  lwt_control (int enable, int clear);
+extern int  lwt_snapshot (cfs_cycles_t *now, int *ncpu, int *total_size,
+			  void *user_ptr, int user_size);
+#endif /* LWT_SUPPORT */
+
+/* ------------------------------------------------------------------ */
+
+#define IOCTL_LIBCFS_TYPE long
+
+#ifdef __CYGWIN__
+# ifndef BITS_PER_LONG
+#   define BITS_PER_LONG 64
+# endif
+#endif
+
+# define LI_POISON ((int)0x5a5a5a5a5a5a5a5a)
+# define LL_POISON ((long)0x5a5a5a5a5a5a5a5a)
+# define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a)
+
+/* this is a bit chunky */
+
+#define _LWORDSIZE BITS_PER_LONG
+
+# define LPU64 "%llu"
+# define LPD64 "%lld"
+# define LPX64 "%#llx"
+# define LPX64i "%llx"
+# define LPO64 "%#llo"
+# define LPF64 "L"
+
+/*
+ * long_ptr_t & ulong_ptr_t, same as "long" for gcc
+ */
+# define LPLU "%lu"
+# define LPLD "%ld"
+# define LPLX "%#lx"
+
+/*
+ * pid_t
+ */
+# define LPPID "%d"
+
+
+#undef _LWORDSIZE
+
+/* compat macros */
+
+
+#ifndef get_cpu
+# ifdef CONFIG_PREEMPT
+#  define get_cpu()  ({ preempt_disable(); smp_processor_id(); })
+#  define put_cpu()  preempt_enable()
+# else
+#  define get_cpu()  smp_processor_id()
+#  define put_cpu()
+# endif
+#else
+#endif /* get_cpu & put_cpu */
+
+#define INIT_CTL_NAME(a)
+#define INIT_STRATEGY(a)
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/libcfs.h b/drivers/staging/lustre/include/linux/libcfs/linux/libcfs.h
new file mode 100644
index 000000000000..757e6dcaaf9a
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/libcfs.h
@@ -0,0 +1,131 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_LINUX_LIBCFS_H__
+#define __LIBCFS_LINUX_LIBCFS_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+
+#include <stdarg.h>
+#include <linux/libcfs/linux/linux-cpu.h>
+#include <linux/libcfs/linux/linux-time.h>
+#include <linux/libcfs/linux/linux-mem.h>
+#include <linux/libcfs/linux/linux-prim.h>
+#include <linux/libcfs/linux/linux-lock.h>
+#include <linux/libcfs/linux/linux-fs.h>
+#include <linux/libcfs/linux/linux-tcpip.h>
+#include <linux/libcfs/linux/linux-bitops.h>
+#include <linux/libcfs/linux/linux-types.h>
+#include <linux/libcfs/linux/kp30.h>
+
+#include <asm/types.h>
+#include <linux/types.h>
+#include <asm/timex.h>
+#include <linux/sched.h> /* THREAD_SIZE */
+#include <linux/rbtree.h>
+
+#define LUSTRE_TRACE_SIZE (THREAD_SIZE >> 5)
+
+#if !defined(__x86_64__)
+# ifdef  __ia64__
+#  define CDEBUG_STACK() (THREAD_SIZE -				 \
+			  ((unsigned long)__builtin_dwarf_cfa() &       \
+			   (THREAD_SIZE - 1)))
+# else
+#  define CDEBUG_STACK() (THREAD_SIZE -				 \
+			  ((unsigned long)__builtin_frame_address(0) &  \
+			   (THREAD_SIZE - 1)))
+# endif /* __ia64__ */
+
+#define __CHECK_STACK(msgdata, mask, cdls)			      \
+do {								    \
+	if (unlikely(CDEBUG_STACK() > libcfs_stack)) {		  \
+		LIBCFS_DEBUG_MSG_DATA_INIT(msgdata, D_WARNING, NULL);   \
+		libcfs_stack = CDEBUG_STACK();			  \
+		libcfs_debug_msg(msgdata,			       \
+				 "maximum lustre stack %lu\n",	  \
+				 CDEBUG_STACK());		       \
+		(msgdata)->msg_mask = mask;			     \
+		(msgdata)->msg_cdls = cdls;			     \
+		dump_stack();					   \
+	      /*panic("LBUG");*/					\
+	}							       \
+} while (0)
+#define CFS_CHECK_STACK(msgdata, mask, cdls)  __CHECK_STACK(msgdata, mask, cdls)
+#else /* __x86_64__ */
+#define CFS_CHECK_STACK(msgdata, mask, cdls) do {} while(0)
+#define CDEBUG_STACK() (0L)
+#endif /* __x86_64__ */
+
+/* initial pid  */
+#define LUSTRE_LNET_PID	  12345
+
+#define ENTRY_NESTING_SUPPORT (1)
+#define ENTRY_NESTING   do {;} while (0)
+#define EXIT_NESTING   do {;} while (0)
+#define __current_nesting_level() (0)
+
+/**
+ * Platform specific declarations for cfs_curproc API (libcfs/curproc.h)
+ *
+ * Implementation is in linux-curproc.c
+ */
+#define CFS_CURPROC_COMM_MAX (sizeof ((struct task_struct *)0)->comm)
+
+#include <linux/capability.h>
+
+/*
+ * No stack-back-tracing in Linux for now.
+ */
+struct cfs_stack_trace {
+};
+
+/* long integer with size equal to pointer */
+typedef unsigned long ulong_ptr_t;
+typedef long long_ptr_t;
+
+#ifndef WITH_WATCHDOG
+#define WITH_WATCHDOG
+#endif
+
+
+
+
+#endif /* __LIBCFS_LINUX_LIBCFS_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-bitops.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-bitops.h
new file mode 100644
index 000000000000..43936e349dd4
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-bitops.h
@@ -0,0 +1,38 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-bitops.h
+ */
+#include <linux/bitops.h>
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-cpu.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-cpu.h
new file mode 100644
index 000000000000..224371c92f7c
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-cpu.h
@@ -0,0 +1,175 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-cpu.h
+ *
+ * Basic library routines.
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#ifndef __LIBCFS_LINUX_CPU_H__
+#define __LIBCFS_LINUX_CPU_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/topology.h>
+#include <linux/version.h>
+
+
+#ifdef CONFIG_SMP
+
+#define HAVE_LIBCFS_CPT
+
+/** virtual processing unit */
+struct cfs_cpu_partition {
+	/* CPUs mask for this partition */
+	cpumask_t			*cpt_cpumask;
+	/* nodes mask for this partition */
+	nodemask_t			*cpt_nodemask;
+	/* spread rotor for NUMA allocator */
+	unsigned			cpt_spread_rotor;
+};
+
+/** descriptor for CPU partitions */
+struct cfs_cpt_table {
+	/* version, reserved for hotplug */
+	unsigned			ctb_version;
+	/* spread rotor for NUMA allocator */
+	unsigned			ctb_spread_rotor;
+	/* # of CPU partitions */
+	unsigned			ctb_nparts;
+	/* partitions tables */
+	struct cfs_cpu_partition	*ctb_parts;
+	/* shadow HW CPU to CPU partition ID */
+	int				*ctb_cpu2cpt;
+	/* all cpus in this partition table */
+	cpumask_t			*ctb_cpumask;
+	/* all nodes in this partition table */
+	nodemask_t			*ctb_nodemask;
+};
+
+void cfs_cpu_core_siblings(int cpu, cpumask_t *mask);
+void cfs_cpu_ht_siblings(int cpu, cpumask_t *mask);
+void cfs_node_to_cpumask(int node, cpumask_t *mask);
+int cfs_cpu_core_nsiblings(int cpu);
+int cfs_cpu_ht_nsiblings(int cpu);
+
+/**
+ * commented-out definitions of the compatibility layer
+ * #define CFS_CPU_NR			  NR_CPUS
+ *
+ * typedef cpumask_t			   cfs_cpumask_t;
+ *
+ * #define cfs_cpu_current()		   smp_processor_id()
+ * #define cfs_cpu_online(i)		   cpu_online(i)
+ * #define cfs_cpu_online_num()		num_online_cpus()
+ * #define cfs_cpu_online_for_each(i)	  for_each_online_cpu(i)
+ * #define cfs_cpu_possible_num()	      num_possible_cpus()
+ * #define cfs_cpu_possible_for_each(i)	for_each_possible_cpu(i)
+ *
+ * #ifdef CONFIG_CPUMASK_SIZE
+ * #define cfs_cpu_mask_size()		 cpumask_size()
+ * #else
+ * #define cfs_cpu_mask_size()		 sizeof(cfs_cpumask_t)
+ * #endif
+ *
+ * #define cfs_cpu_mask_set(i, mask)	   cpu_set(i, mask)
+ * #define cfs_cpu_mask_unset(i, mask)	 cpu_clear(i, mask)
+ * #define cfs_cpu_mask_isset(i, mask)	 cpu_isset(i, mask)
+ * #define cfs_cpu_mask_clear(mask)	    cpus_clear(mask)
+ * #define cfs_cpu_mask_empty(mask)	    cpus_empty(mask)
+ * #define cfs_cpu_mask_weight(mask)	   cpus_weight(mask)
+ * #define cfs_cpu_mask_first(mask)	    first_cpu(mask)
+ * #define cfs_cpu_mask_any_online(mask)      (any_online_cpu(mask) != NR_CPUS)
+ * #define cfs_cpu_mask_for_each(i, mask)      for_each_cpu_mask(i, mask)
+ * #define cfs_cpu_mask_bind(t, mask)	  set_cpus_allowed(t, mask)
+ *
+ * #ifdef HAVE_CPUMASK_COPY
+ * #define cfs_cpu_mask_copy(dst, src)	 cpumask_copy(dst, src)
+ * #else
+ * #define cfs_cpu_mask_copy(dst, src)	 memcpy(dst, src, sizeof(*src))
+ * #endif
+ *
+ * static inline void
+ * cfs_cpu_mask_of_online(cfs_cpumask_t *mask)
+ * {
+ * cfs_cpu_mask_copy(mask, &cpu_online_map);
+ * }
+ *
+ * #ifdef CONFIG_NUMA
+ *
+ * #define CFS_NODE_NR			 MAX_NUMNODES
+ *
+ * typedef nodemask_t			  cfs_node_mask_t;
+ *
+ * #define cfs_node_of_cpu(cpu)		cpu_to_node(cpu)
+ * #define cfs_node_online(i)		  node_online(i)
+ * #define cfs_node_online_num()	       num_online_nodes()
+ * #define cfs_node_online_for_each(i)	 for_each_online_node(i)
+ * #define cfs_node_possible_num()	     num_possible_nodes()
+ * #define cfs_node_possible_for_each(i)       for_each_node(i)
+ *
+ * static inline void cfs_node_to_cpumask(int node, cfs_cpumask_t *mask)
+ * {
+ * #if defined(HAVE_NODE_TO_CPUMASK)
+ *      *mask = node_to_cpumask(node);
+ * #elif defined(HAVE_CPUMASK_OF_NODE)
+ *      cfs_cpu_mask_copy(mask, cpumask_of_node(node));
+ * #else
+ * # error "Needs node_to_cpumask or cpumask_of_node"
+ * #endif
+ * }
+ *
+ * #define cfs_node_mask_set(i, mask)	  node_set(i, mask)
+ * #define cfs_node_mask_unset(i, mask)	node_clear(i, mask)
+ * #define cfs_node_mask_isset(i, mask)	node_isset(i, mask)
+ * #define cfs_node_mask_clear(mask)	   nodes_reset(mask)
+ * #define cfs_node_mask_empty(mask)	   nodes_empty(mask)
+ * #define cfs_node_mask_weight(mask)	  nodes_weight(mask)
+ * #define cfs_node_mask_for_each(i, mask)     for_each_node_mask(i, mask)
+ * #define cfs_node_mask_copy(dst, src)	memcpy(dst, src, sizeof(*src))
+ *
+ * static inline void
+ * cfs_node_mask_of_online(cfs_node_mask_t *mask)
+ * {
+ *       cfs_node_mask_copy(mask, &node_online_map);
+ * }
+ *
+ * #endif
+ */
+
+#endif /* CONFIG_SMP */
+#endif /* __LIBCFS_LINUX_CPU_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-crypto.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-crypto.h
new file mode 100644
index 000000000000..97c771cf691f
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-crypto.h
@@ -0,0 +1,49 @@
+ /*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ */
+
+/**
+ * Linux crypto hash specific functions.
+ */
+
+/**
+ * Functions for start/stop shash CRC32 algorithm.
+ */
+int cfs_crypto_crc32_register(void);
+void cfs_crypto_crc32_unregister(void);
+
+/**
+ * Functions for start/stop shash adler32 algorithm.
+ */
+int cfs_crypto_adler32_register(void);
+void cfs_crypto_adler32_unregister(void);
+
+/**
+ * Functions for start/stop shash crc32 pclmulqdq
+ */
+int cfs_crypto_crc32_pclmul_register(void);
+void cfs_crypto_crc32_pclmul_unregister(void);
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-fs.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-fs.h
new file mode 100644
index 000000000000..90ff47a18924
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-fs.h
@@ -0,0 +1,95 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-fs.h
+ *
+ * Basic library routines.
+ */
+
+#ifndef __LIBCFS_LINUX_CFS_FS_H__
+#define __LIBCFS_LINUX_CFS_FS_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/mount.h>
+#include <linux/backing-dev.h>
+#include <linux/posix_acl_xattr.h>
+
+#define filp_size(f)					\
+	(i_size_read((f)->f_dentry->d_inode))
+#define filp_poff(f)					\
+	(&(f)->f_pos)
+
+# define do_fsync(fp, flag)				\
+	((fp)->f_op->fsync(fp, 0, LLONG_MAX, flag))
+
+#define filp_read(fp, buf, size, pos)			\
+	((fp)->f_op->read((fp), (buf), (size), pos))
+
+#define filp_write(fp, buf, size, pos)			\
+	((fp)->f_op->write((fp), (buf), (size), pos))
+
+#define filp_fsync(fp)					\
+	do_fsync(fp, 1)
+
+#define flock_type(fl)			((fl)->fl_type)
+#define flock_set_type(fl, type)	do { (fl)->fl_type = (type); } while (0)
+#define flock_pid(fl)			((fl)->fl_pid)
+#define flock_set_pid(fl, pid)		do { (fl)->fl_pid = (pid); } while (0)
+#define flock_start(fl)			((fl)->fl_start)
+#define flock_set_start(fl, st)		do { (fl)->fl_start = (st); } while (0)
+#define flock_end(fl)			((fl)->fl_end)
+#define flock_set_end(fl, end)		do { (fl)->fl_end = (end); } while (0)
+
+ssize_t filp_user_write(struct file *filp, const void *buf, size_t count,
+			loff_t *offset);
+
+#ifndef IFSHIFT
+#define IFSHIFT			12
+#endif
+
+#ifndef IFTODT
+#define IFTODT(type)		(((type) & S_IFMT) >> IFSHIFT)
+#endif
+#ifndef DTTOIF
+#define DTTOIF(dirtype)		((dirtype) << IFSHIFT)
+#endif
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-lock.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-lock.h
new file mode 100644
index 000000000000..6fbcbf3ab0d3
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-lock.h
@@ -0,0 +1,204 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-lock.h
+ *
+ * Basic library routines.
+ */
+
+#ifndef __LIBCFS_LINUX_CFS_LOCK_H__
+#define __LIBCFS_LINUX_CFS_LOCK_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+#include <linux/mutex.h>
+
+/*
+ * IMPORTANT !!!!!!!!
+ *
+ * No lock declaration is guaranteed to be initialized, although some
+ * of them are initialized in Linux. All locks declared by CFS_DECL_*
+ * should be initialized explicitly.
+ */
+
+/*
+ * spin_lock "implementation" (use Linux kernel's primitives)
+ *
+ * - spin_lock_init(x)
+ * - spin_lock(x)
+ * - spin_lock_bh(x)
+ * - spin_lock_bh_init(x)
+ * - spin_unlock(x)
+ * - spin_unlock_bh(x)
+ * - spin_trylock(x)
+ * - spin_is_locked(x)
+ *
+ * - spin_lock_irq(x)
+ * - spin_lock_irqsave(x, f)
+ * - spin_unlock_irqrestore(x, f)
+ * - read_lock_irqsave(lock, f)
+ * - write_lock_irqsave(lock, f)
+ * - write_unlock_irqrestore(lock, f)
+ */
+
+/*
+ * spinlock "implementation"
+ */
+
+
+
+
+/*
+ * rw_semaphore "implementation" (use Linux kernel's primitives)
+ *
+ * - sema_init(x)
+ * - init_rwsem(x)
+ * - down_read(x)
+ * - up_read(x)
+ * - down_write(x)
+ * - up_write(x)
+ */
+
+
+#define fini_rwsem(s)		do {} while (0)
+
+
+/*
+ * rwlock_t "implementation" (use Linux kernel's primitives)
+ *
+ * - rwlock_init(x)
+ * - read_lock(x)
+ * - read_unlock(x)
+ * - write_lock(x)
+ * - write_unlock(x)
+ * - write_lock_bh(x)
+ * - write_unlock_bh(x)
+ *
+ * - RW_LOCK_UNLOCKED
+ */
+
+
+#ifndef DEFINE_RWLOCK
+#define DEFINE_RWLOCK(lock)	rwlock_t lock = __RW_LOCK_UNLOCKED(lock)
+#endif
+
+/*
+ * completion "implementation" (use Linux kernel's primitives)
+ *
+ * - DECLARE_COMPLETION(work)
+ * - INIT_COMPLETION(c)
+ * - COMPLETION_INITIALIZER(work)
+ * - init_completion(c)
+ * - complete(c)
+ * - wait_for_completion(c)
+ * - wait_for_completion_interruptible(c)
+ * - fini_completion(c)
+ */
+#define fini_completion(c) do { } while (0)
+
+/*
+ * semaphore "implementation" (use Linux kernel's primitives)
+ * - DEFINE_SEMAPHORE(name)
+ * - sema_init(sem, val)
+ * - up(sem)
+ * - down(sem)
+ * - down_interruptible(sem)
+ * - down_trylock(sem)
+ */
+
+/*
+ * mutex "implementation" (use Linux kernel's primitives)
+ *
+ * - DEFINE_MUTEX(name)
+ * - mutex_init(x)
+ * - mutex_lock(x)
+ * - mutex_unlock(x)
+ * - mutex_trylock(x)
+ * - mutex_is_locked(x)
+ * - mutex_destroy(x)
+ */
+
+#ifndef lockdep_set_class
+
+/**************************************************************************
+ *
+ * Lockdep "implementation". Also see liblustre.h
+ *
+ **************************************************************************/
+
+struct lock_class_key {
+	;
+};
+
+#define lockdep_set_class(lock, key) \
+	do { (void)sizeof(lock); (void)sizeof(key); } while (0)
+/* This has to be a macro, so that `subclass' can be undefined in kernels
+ * that do not support lockdep. */
+
+
+static inline void lockdep_off(void)
+{
+}
+
+static inline void lockdep_on(void)
+{
+}
+#else
+
+#endif /* lockdep_set_class */
+
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+#ifndef mutex_lock_nested
+#define mutex_lock_nested(mutex, subclass) mutex_lock(mutex)
+#endif
+
+#ifndef spin_lock_nested
+#define spin_lock_nested(lock, subclass) spin_lock(lock)
+#endif
+
+#ifndef down_read_nested
+#define down_read_nested(lock, subclass) down_read(lock)
+#endif
+
+#ifndef down_write_nested
+#define down_write_nested(lock, subclass) down_write(lock)
+#endif
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+
+
+#endif /* __LIBCFS_LINUX_CFS_LOCK_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h
new file mode 100644
index 000000000000..f6cb4635ef4e
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h
@@ -0,0 +1,139 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-mem.h
+ *
+ * Basic library routines.
+ */
+
+#ifndef __LIBCFS_LINUX_CFS_MEM_H__
+#define __LIBCFS_LINUX_CFS_MEM_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+
+#define CFS_PAGE_MASK		   (~((__u64)PAGE_CACHE_SIZE-1))
+#define page_index(p)       ((p)->index)
+
+#define memory_pressure_get() (current->flags & PF_MEMALLOC)
+#define memory_pressure_set() do { current->flags |= PF_MEMALLOC; } while (0)
+#define memory_pressure_clr() do { current->flags &= ~PF_MEMALLOC; } while (0)
+
+#if BITS_PER_LONG == 32
+/* limit to 3/4 of 1GB (counted in pages) of lowmem on 32-bit systems */
+#define NUM_CACHEPAGES \
+	min(num_physpages, (1UL << (30 - PAGE_CACHE_SHIFT)) * 3 / 4)
+#else
+#define NUM_CACHEPAGES num_physpages
+#endif
+
+/*
+ * In Linux there is no way to determine whether current execution context is
+ * blockable.
+ */
+#define ALLOC_ATOMIC_TRY   GFP_ATOMIC
+
+#define DECL_MMSPACE		mm_segment_t __oldfs
+#define MMSPACE_OPEN \
+	do { __oldfs = get_fs(); set_fs(get_ds());} while(0)
+#define MMSPACE_CLOSE	       set_fs(__oldfs)
+
+
+/*
+ * NUMA allocators
+ *
+ * NB: we will rename these functions in a separate patch:
+ * - rename kmalloc to cfs_malloc
+ * - rename kmalloc/free_page to cfs_page_alloc/free
+ * - rename kmalloc/free_large to cfs_vmalloc/vfree
+ */
+extern void *cfs_cpt_malloc(struct cfs_cpt_table *cptab, int cpt,
+			    size_t nr_bytes, unsigned int flags);
+extern void *cfs_cpt_vmalloc(struct cfs_cpt_table *cptab, int cpt,
+			     size_t nr_bytes);
+extern struct page *cfs_page_cpt_alloc(struct cfs_cpt_table *cptab,
+				      int cpt, unsigned int flags);
+extern void *cfs_mem_cache_cpt_alloc(struct kmem_cache *cachep,
+				     struct cfs_cpt_table *cptab,
+				     int cpt, unsigned int flags);
+
+/*
+ * Shrinker
+ */
+
+# define SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)  \
+		       struct shrinker *shrinker, \
+		       struct shrink_control *sc
+# define shrink_param(sc, var) ((sc)->var)
+
+typedef int (*shrinker_t)(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask));
+
+/* Allocate a shrinker with ->shrink = func and ->seeks = seek and register
+ * it.  Returns the new shrinker, or NULL if the allocation fails. */
+static inline
+struct shrinker *set_shrinker(int seek, shrinker_t func)
+{
+	/* zeroed, so fields not assigned below never hold stale heap data */
+	struct shrinker *s = kzalloc(sizeof(*s), GFP_KERNEL);
+
+	if (s == NULL)
+		return NULL;
+
+	s->shrink = func;
+	s->seeks = seek;
+	register_shrinker(s);
+	return s;
+}
+
+/* Unregister and free a shrinker; a NULL pointer is ignored. */
+static inline
+void remove_shrinker(struct shrinker *shrinker)
+{
+	if (!shrinker)
+		return;
+	unregister_shrinker(shrinker);
+	kfree(shrinker);
+}
+
+#endif /* __LIBCFS_LINUX_CFS_MEM_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-prim.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-prim.h
new file mode 100644
index 000000000000..c346bcdf05bb
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-prim.h
@@ -0,0 +1,243 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-prim.h
+ *
+ * Basic library routines.
+ */
+
+#ifndef __LIBCFS_LINUX_CFS_PRIM_H__
+#define __LIBCFS_LINUX_CFS_PRIM_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/version.h>
+#include <linux/proc_fs.h>
+#include <linux/mm.h>
+#include <linux/timer.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include <linux/random.h>
+
+#include <linux/miscdevice.h>
+#include <linux/libcfs/linux/portals_compat25.h>
+#include <asm/div64.h>
+
+#include <linux/libcfs/linux/linux-time.h>
+
+
+/*
+ * CPU
+ */
+#ifdef for_each_possible_cpu
+#define cfs_for_each_possible_cpu(cpu) for_each_possible_cpu(cpu)
+#elif defined(for_each_cpu)
+#define cfs_for_each_possible_cpu(cpu) for_each_cpu(cpu)
+#endif
+
+#ifdef NR_CPUS
+#else
+#define NR_CPUS     1
+#endif
+
+#define cfs_set_cpus_allowed(t, mask)  set_cpus_allowed(t, mask)
+
+/*
+ * cache
+ */
+
+/*
+ * IRQs
+ */
+
+
+/*
+ * Pseudo device register
+ */
+typedef struct miscdevice		psdev_t;
+
+/*
+ * Sysctl register
+ */
+typedef struct ctl_table		ctl_table_t;
+typedef struct ctl_table_header		ctl_table_header_t;
+
+#define cfs_register_sysctl_table(t, a) register_sysctl_table(t)
+
+#define DECLARE_PROC_HANDLER(name)		      \
+static int					      \
+LL_PROC_PROTO(name)				     \
+{						       \
+	DECLARE_LL_PROC_PPOS_DECL;		      \
+							\
+	return proc_call_handler(table->data, write,    \
+				 ppos, buffer, lenp,    \
+				 __##name);	     \
+}
+
+/*
+ * Symbol register
+ */
+#define cfs_symbol_register(s, p)       do {} while(0)
+#define cfs_symbol_unregister(s)	do {} while(0)
+#define cfs_symbol_get(s)	       symbol_get(s)
+#define cfs_symbol_put(s)	       symbol_put(s)
+
+typedef struct module module_t;
+
+/*
+ * Proc file system APIs
+ */
+typedef struct proc_dir_entry	   proc_dir_entry_t;
+
+/*
+ * Wait Queue
+ */
+
+
+typedef long			    cfs_task_state_t;
+
+#define CFS_DECL_WAITQ(wq)		DECLARE_WAIT_QUEUE_HEAD(wq)
+
+/*
+ * Task struct
+ */
+typedef struct task_struct	      task_t;
+#define DECL_JOURNAL_DATA	   void *journal_info
+#define PUSH_JOURNAL		do {    \
+	journal_info = current->journal_info;   \
+	current->journal_info = NULL;	   \
+	} while(0)
+#define POP_JOURNAL		 do {    \
+	current->journal_info = journal_info;   \
+	} while(0)
+
+/* Module interfaces */
+#define cfs_module(name, version, init, fini) \
+	module_init(init);		    \
+	module_exit(fini)
+
+/*
+ * Signal
+ */
+
+/*
+ * Timer
+ */
+typedef struct timer_list timer_list_t;
+
+
+#ifndef wait_event_timeout /* Only for RHEL3 2.4.21 kernel */
+#define __wait_event_timeout(wq, condition, timeout, ret)	\
+do {							     \
+	int __ret = 0;					   \
+	if (!(condition)) {				      \
+		wait_queue_t __wait;			     \
+		unsigned long expire;			    \
+								 \
+		init_waitqueue_entry(&__wait, current);	  \
+		expire = timeout + jiffies;		      \
+		add_wait_queue(&wq, &__wait);		    \
+		for (;;) {				       \
+			set_current_state(TASK_UNINTERRUPTIBLE); \
+			if (condition)			   \
+				break;			   \
+			if (jiffies > expire) {		  \
+				ret = jiffies - expire;	  \
+				break;			   \
+			}					\
+			schedule_timeout(timeout);	       \
+		}						\
+		current->state = TASK_RUNNING;		   \
+		remove_wait_queue(&wq, &__wait);		 \
+	}							\
+} while (0)
+/*
+   Fallback macro: retval == 0 means condition met; retval > 0 means timed out.
+   NB: wait_event_timeout() (the #else case) returns 0 on timeout instead.
+*/
+#define cfs_waitq_wait_event_timeout(wq, condition, timeout, ret)    \
+do {								 \
+	ret = 0;						     \
+	if (!(condition))					    \
+		__wait_event_timeout(wq, condition, timeout, ret);   \
+} while (0)
+#else
+#define cfs_waitq_wait_event_timeout(wq, condition, timeout, ret)    \
+	ret = wait_event_timeout(wq, condition, timeout)
+#endif
+
+#define cfs_waitq_wait_event_interruptible_timeout(wq, c, timeout, ret) \
+	ret = wait_event_interruptible_timeout(wq, c, timeout)
+
+/*
+ * atomic
+ */
+
+
+#define cfs_atomic_add_unless(atom, a, u)    atomic_add_unless(atom, a, u)
+#define cfs_atomic_cmpxchg(atom, old, nv)    atomic_cmpxchg(atom, old, nv)
+
+/*
+ * membar
+ */
+
+
+/*
+ * interrupt
+ */
+
+
+/*
+ * might_sleep
+ */
+
+/*
+ * group_info
+ */
+typedef struct group_info group_info_t;
+
+
+/*
+ * Random bytes
+ */
+#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-tcpip.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-tcpip.h
new file mode 100644
index 000000000000..687f33f4e8a7
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-tcpip.h
@@ -0,0 +1,87 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-tcpip.h
+ *
+ * Basic library routines.
+ */
+
+#ifndef __LIBCFS_LINUX_CFS_TCP_H__
+#define __LIBCFS_LINUX_CFS_TCP_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+#include <net/sock.h>
+
+#ifndef HIPQUAD
+// XXX Should just kill all users
+#if defined(__LITTLE_ENDIAN)
+#define HIPQUAD(addr) \
+	((unsigned char *)&addr)[3], \
+	((unsigned char *)&addr)[2], \
+	((unsigned char *)&addr)[1], \
+	((unsigned char *)&addr)[0]
+#elif defined(__BIG_ENDIAN)
+#define HIPQUAD NIPQUAD
+#else
+#error "Please fix asm/byteorder.h"
+#endif /* __LITTLE_ENDIAN */
+#endif
+
+typedef struct socket   socket_t;
+
+#define SOCK_SNDBUF(so)	 ((so)->sk->sk_sndbuf)
+#define SOCK_TEST_NOSPACE(so)   test_bit(SOCK_NOSPACE, &(so)->flags)
+
+static inline int
+cfs_sock_error(struct socket *sock)
+{
+	return sock->sk->sk_err;
+}
+
+static inline int
+cfs_sock_wmem_queued(struct socket *sock)
+{
+	return sock->sk->sk_wmem_queued;
+}
+
+#define cfs_sk_sleep(sk)	sk_sleep(sk)
+
+#define DEFAULT_NET	(&init_net)
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-time.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-time.h
new file mode 100644
index 000000000000..4a48b914b42a
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-time.h
@@ -0,0 +1,275 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-time.h
+ *
+ * Implementation of portable time API for Linux (kernel and user-level).
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#ifndef __LIBCFS_LINUX_LINUX_TIME_H__
+#define __LIBCFS_LINUX_LINUX_TIME_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+/* Portable time API */
+
+/*
+ * Platform provides three opaque data-types:
+ *
+ *  cfs_time_t	represents point in time. This is internal kernel
+ *		    time rather than "wall clock". This time bears no
+ *		    relation to gettimeofday().
+ *
+ *  cfs_duration_t    represents time interval with resolution of internal
+ *		    platform clock
+ *
+ *  cfs_fs_time_t     represents an instant in world-visible time. This is
+ *		    used in file-system time-stamps
+ *
+ *  cfs_time_t     cfs_time_current(void);
+ *  cfs_time_t     cfs_time_add    (cfs_time_t, cfs_duration_t);
+ *  cfs_duration_t cfs_time_sub    (cfs_time_t, cfs_time_t);
+ *  int	    cfs_time_before (cfs_time_t, cfs_time_t);
+ *  int	    cfs_time_beforeq(cfs_time_t, cfs_time_t);
+ *
+ *  cfs_duration_t cfs_duration_build(int64_t);
+ *
+ *  time_t	 cfs_duration_sec (cfs_duration_t);
+ *  void	   cfs_duration_usec(cfs_duration_t, struct timeval *);
+ *  void	   cfs_duration_nsec(cfs_duration_t, struct timespec *);
+ *
+ *  void	   cfs_fs_time_current(cfs_fs_time_t *);
+ *  time_t	 cfs_fs_time_sec    (cfs_fs_time_t *);
+ *  void	   cfs_fs_time_usec   (cfs_fs_time_t *, struct timeval *);
+ *  void	   cfs_fs_time_nsec   (cfs_fs_time_t *, struct timespec *);
+ *  int	    cfs_fs_time_before (cfs_fs_time_t *, cfs_fs_time_t *);
+ *  int	    cfs_fs_time_beforeq(cfs_fs_time_t *, cfs_fs_time_t *);
+ *
+ *  CFS_TIME_FORMAT
+ *  CFS_DURATION_FORMAT
+ *
+ */
+
+#define ONE_BILLION ((u_int64_t)1000000000)
+#define ONE_MILLION 1000000
+
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/version.h>
+#include <linux/time.h>
+#include <asm/div64.h>
+
+#include <linux/libcfs/linux/portals_compat25.h>
+
+/*
+ * post 2.5 kernels.
+ */
+
+#include <linux/jiffies.h>
+
+typedef struct timespec cfs_fs_time_t;
+
+static inline void cfs_fs_time_usec(cfs_fs_time_t *t, struct timeval *v)
+{
+	v->tv_sec  = t->tv_sec;
+	v->tv_usec = t->tv_nsec / 1000;
+}
+
+static inline void cfs_fs_time_nsec(cfs_fs_time_t *t, struct timespec *s)
+{
+	*s = *t;
+}
+
+/*
+ * internal helper function used by cfs_fs_time_before*()
+ */
+static inline unsigned long long __cfs_fs_time_flat(cfs_fs_time_t *t)
+{
+	return (unsigned long long)t->tv_sec * ONE_BILLION + t->tv_nsec;
+}
+
+
+/*
+ * Generic kernel stuff
+ */
+
+typedef unsigned long cfs_time_t;      /* jiffies */
+typedef long cfs_duration_t;
+typedef cycles_t cfs_cycles_t;
+
+static inline int cfs_time_before(cfs_time_t t1, cfs_time_t t2)
+{
+	return time_before(t1, t2);
+}
+
+static inline int cfs_time_beforeq(cfs_time_t t1, cfs_time_t t2)
+{
+	return time_before_eq(t1, t2);
+}
+
+static inline cfs_time_t cfs_time_current(void)
+{
+	return jiffies;
+}
+
+static inline time_t cfs_time_current_sec(void)
+{
+	return get_seconds();
+}
+
+static inline void cfs_fs_time_current(cfs_fs_time_t *t)
+{
+	*t = CURRENT_TIME;
+}
+
+static inline time_t cfs_fs_time_sec(cfs_fs_time_t *t)
+{
+	return t->tv_sec;
+}
+
+static inline int cfs_fs_time_before(cfs_fs_time_t *t1, cfs_fs_time_t *t2)
+{
+	return __cfs_fs_time_flat(t1) <  __cfs_fs_time_flat(t2);
+}
+
+static inline int cfs_fs_time_beforeq(cfs_fs_time_t *t1, cfs_fs_time_t *t2)
+{
+	return __cfs_fs_time_flat(t1) <= __cfs_fs_time_flat(t2);
+}
+
+#if 0
+static inline cfs_duration_t cfs_duration_build(int64_t nano)
+{
+#if (BITS_PER_LONG == 32)
+	/* We cannot use do_div(t, ONE_BILLION), do_div can only process
+	 * 64 bits n and 32 bits base */
+	int64_t  t = nano * HZ;
+	do_div(t, 1000);
+	do_div(t, 1000000);
+	return (cfs_duration_t)t;
+#else
+	return (nano * HZ / ONE_BILLION);
+#endif
+}
+#endif
+
+static inline cfs_duration_t cfs_time_seconds(int seconds)
+{
+	return ((cfs_duration_t)seconds) * HZ;
+}
+
+static inline time_t cfs_duration_sec(cfs_duration_t d)
+{
+	return d / HZ;
+}
+
+static inline void cfs_duration_usec(cfs_duration_t d, struct timeval *s)
+{
+#if (BITS_PER_LONG == 32) && (HZ > 4096)
+	__u64 t;
+
+	s->tv_sec = d / HZ;
+	t = (d - (cfs_duration_t)s->tv_sec * HZ) * ONE_MILLION;
+	do_div(t, HZ);
+	s->tv_usec = t;
+#else
+	s->tv_sec = d / HZ;
+	s->tv_usec = ((d - (cfs_duration_t)s->tv_sec * HZ) * \
+		ONE_MILLION) / HZ;
+#endif
+}
+
+static inline void cfs_duration_nsec(cfs_duration_t d, struct timespec *s)
+{
+#if (BITS_PER_LONG == 32)
+	__u64 t;
+
+	s->tv_sec = d / HZ;
+	t = (d - s->tv_sec * HZ) * ONE_BILLION;
+	do_div(t, HZ);
+	s->tv_nsec = t;
+#else
+	s->tv_sec = d / HZ;
+	s->tv_nsec = ((d - s->tv_sec * HZ) * ONE_BILLION) / HZ;
+#endif
+}
+
+#define cfs_time_current_64 get_jiffies_64
+
+static inline __u64 cfs_time_add_64(__u64 t, __u64 d)
+{
+	return t + d;
+}
+
+static inline __u64 cfs_time_shift_64(int seconds)
+{
+	return cfs_time_add_64(cfs_time_current_64(),
+			       cfs_time_seconds(seconds));
+}
+
+static inline int cfs_time_before_64(__u64 t1, __u64 t2)
+{
+	return (__s64)(t2 - t1) > 0; /* subtract unsigned, then take the sign */
+}
+
+static inline int cfs_time_beforeq_64(__u64 t1, __u64 t2)
+{
+	return (__s64)(t2 - t1) >= 0; /* subtract unsigned, then take the sign */
+}
+
+
+/*
+ * One jiffy
+ */
+#define CFS_TICK		(1)
+
+#define CFS_TIME_T	      "%lu"
+#define CFS_DURATION_T	  "%ld"
+
+
+#endif /* __LIBCFS_LINUX_LINUX_TIME_H__ */
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 80
+ * scroll-step: 1
+ * End:
+ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-types.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-types.h
new file mode 100644
index 000000000000..142394925567
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-types.h
@@ -0,0 +1,36 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-types.h
+ */
+#include <linux/types.h>
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/portals_compat25.h b/drivers/staging/lustre/include/linux/libcfs/linux/portals_compat25.h
new file mode 100644
index 000000000000..2b9487267596
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/portals_compat25.h
@@ -0,0 +1,116 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_LINUX_PORTALS_COMPAT_H__
+#define __LIBCFS_LINUX_PORTALS_COMPAT_H__
+
+// XXX BUG 1511 -- remove this stanza and all callers when bug 1511 is resolved
+#if defined(SPINLOCK_DEBUG) && SPINLOCK_DEBUG
+#  define SIGNAL_MASK_ASSERT() \
+   LASSERT(current->sighand->siglock.magic == SPINLOCK_MAGIC)
+#else
+# define SIGNAL_MASK_ASSERT()
+#endif
+// XXX BUG 1511 -- remove this stanza and all callers when bug 1511 is resolved
+
+#define SIGNAL_MASK_LOCK(task, flags)				  \
+	spin_lock_irqsave(&task->sighand->siglock, flags)
+#define SIGNAL_MASK_UNLOCK(task, flags)				\
+	spin_unlock_irqrestore(&task->sighand->siglock, flags)
+#define USERMODEHELPER(path, argv, envp)			       \
+	call_usermodehelper(path, argv, envp, 1)
+#define clear_tsk_thread_flag(current, TIF_SIGPENDING)	  clear_tsk_thread_flag(current,       \
+							TIF_SIGPENDING)
+# define smp_num_cpus	      num_online_cpus()
+
+#define cfs_wait_event_interruptible(wq, condition, ret)	       \
+	ret = wait_event_interruptible(wq, condition)
+#define cfs_wait_event_interruptible_exclusive(wq, condition, ret)     \
+	ret = wait_event_interruptible_exclusive(wq, condition)
+
+#define THREAD_NAME(comm, len, fmt, a...)			      \
+	snprintf(comm, len, fmt, ## a)
+
+/* 2.6 alloc_page users can use page->lru */
+#define PAGE_LIST_ENTRY lru
+#define PAGE_LIST(page) ((page)->lru)
+
+#ifndef __user
+#define __user
+#endif
+
+#ifndef __fls
+#define __cfs_fls fls
+#else
+#define __cfs_fls __fls
+#endif
+
+/* filp is accepted but ignored: it is not forwarded to the proc_* handlers */
+#define ll_proc_dointvec(table, write, filp, buffer, lenp, ppos)	\
+	proc_dointvec(table, write, buffer, lenp, ppos)
+#define ll_proc_dolongvec(table, write, filp, buffer, lenp, ppos)	\
+	proc_doulongvec_minmax(table, write, buffer, lenp, ppos)
+#define ll_proc_dostring(table, write, filp, buffer, lenp, ppos)	\
+	proc_dostring(table, write, buffer, lenp, ppos)
+#define LL_PROC_PROTO(name)					     \
+	name(ctl_table_t *table, int write,		      \
+	     void __user *buffer, size_t *lenp, loff_t *ppos)
+#define DECLARE_LL_PROC_PPOS_DECL
+
+/* helper for sysctl handlers */
+int proc_call_handler(void *data, int write,
+		      loff_t *ppos, void *buffer, size_t *lenp,
+		      int (*handler)(void *data, int write,
+				     loff_t pos, void *buffer, int len));
+/*
+ * CPU
+ */
+#ifdef for_each_possible_cpu
+#define cfs_for_each_possible_cpu(cpu) for_each_possible_cpu(cpu)
+#elif defined(for_each_cpu)
+#define cfs_for_each_possible_cpu(cpu) for_each_cpu(cpu)
+#endif
+
+#ifdef NR_CPUS
+#else
+#define NR_CPUS     1
+#endif
+
+#define cfs_set_cpus_allowed(t, mask)  set_cpus_allowed(t, mask)
+
+#define cfs_register_sysctl_table(t, a) register_sysctl_table(t)
+
+#endif /* __LIBCFS_LINUX_PORTALS_COMPAT_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/lucache.h b/drivers/staging/lustre/include/linux/libcfs/lucache.h
new file mode 100644
index 000000000000..7ae36fc88d77
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/lucache.h
@@ -0,0 +1,162 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUCACHE_H
+#define _LUCACHE_H
+
+#include <linux/libcfs/libcfs.h>
+
+/** \defgroup ucache ucache
+ *
+ * @{
+ */
+
+#define UC_CACHE_NEW	    0x01
+#define UC_CACHE_ACQUIRING      0x02
+#define UC_CACHE_INVALID	0x04
+#define UC_CACHE_EXPIRED	0x08
+
+#define UC_CACHE_IS_NEW(i)	  ((i)->ue_flags & UC_CACHE_NEW)
+#define UC_CACHE_IS_INVALID(i)      ((i)->ue_flags & UC_CACHE_INVALID)
+#define UC_CACHE_IS_ACQUIRING(i)    ((i)->ue_flags & UC_CACHE_ACQUIRING)
+#define UC_CACHE_IS_EXPIRED(i)      ((i)->ue_flags & UC_CACHE_EXPIRED)
+#define UC_CACHE_IS_VALID(i)	((i)->ue_flags == 0)
+
+#define UC_CACHE_SET_NEW(i)	 (i)->ue_flags |= UC_CACHE_NEW
+#define UC_CACHE_SET_INVALID(i)     (i)->ue_flags |= UC_CACHE_INVALID
+#define UC_CACHE_SET_ACQUIRING(i)   (i)->ue_flags |= UC_CACHE_ACQUIRING
+#define UC_CACHE_SET_EXPIRED(i)     (i)->ue_flags |= UC_CACHE_EXPIRED
+#define UC_CACHE_SET_VALID(i)       (i)->ue_flags = 0
+
+#define UC_CACHE_CLEAR_NEW(i)       (i)->ue_flags &= ~UC_CACHE_NEW
+#define UC_CACHE_CLEAR_ACQUIRING(i) (i)->ue_flags &= ~UC_CACHE_ACQUIRING
+#define UC_CACHE_CLEAR_INVALID(i)   (i)->ue_flags &= ~UC_CACHE_INVALID
+#define UC_CACHE_CLEAR_EXPIRED(i)   (i)->ue_flags &= ~UC_CACHE_EXPIRED
+
+struct upcall_cache_entry;
+
+struct md_perm {
+	lnet_nid_t      mp_nid;
+	__u32	   mp_perm;
+};
+
+struct md_identity {
+	struct upcall_cache_entry *mi_uc_entry;
+	uid_t		      mi_uid;
+	gid_t		      mi_gid;
+	group_info_t	  *mi_ginfo;
+	int			mi_nperms;
+	struct md_perm	    *mi_perms;
+};
+
+struct upcall_cache_entry {
+	struct list_head	      ue_hash;
+	__u64		   ue_key;
+	atomic_t	    ue_refcount;
+	int		     ue_flags;
+	wait_queue_head_t	     ue_waitq;
+	cfs_time_t	      ue_acquire_expire;
+	cfs_time_t	      ue_expire;
+	union {
+		struct md_identity     identity;
+	} u;
+};
+
+#define UC_CACHE_HASH_SIZE	(128)
+#define UC_CACHE_HASH_INDEX(id)   ((id) & (UC_CACHE_HASH_SIZE - 1))
+#define UC_CACHE_UPCALL_MAXPATH   (1024UL)
+
+struct upcall_cache;
+
+struct upcall_cache_ops {
+	void	    (*init_entry)(struct upcall_cache_entry *, void *args);
+	void	    (*free_entry)(struct upcall_cache *,
+				      struct upcall_cache_entry *);
+	int	     (*upcall_compare)(struct upcall_cache *,
+					  struct upcall_cache_entry *,
+					  __u64 key, void *args);
+	int	     (*downcall_compare)(struct upcall_cache *,
+					    struct upcall_cache_entry *,
+					    __u64 key, void *args);
+	int	     (*do_upcall)(struct upcall_cache *,
+				     struct upcall_cache_entry *);
+	int	     (*parse_downcall)(struct upcall_cache *,
+					  struct upcall_cache_entry *, void *);
+};
+
+struct upcall_cache {
+	struct list_head		uc_hashtable[UC_CACHE_HASH_SIZE];
+	spinlock_t		uc_lock;
+	rwlock_t		uc_upcall_rwlock;
+
+	char			uc_name[40];		/* for upcall */
+	char			uc_upcall[UC_CACHE_UPCALL_MAXPATH];
+	int			uc_acquire_expire;	/* seconds */
+	int			uc_entry_expire;	/* seconds */
+	struct upcall_cache_ops	*uc_ops;
+};
+
+struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *cache,
+						  __u64 key, void *args);
+void upcall_cache_put_entry(struct upcall_cache *cache,
+			    struct upcall_cache_entry *entry);
+int upcall_cache_downcall(struct upcall_cache *cache, __u32 err, __u64 key,
+			  void *args);
+void upcall_cache_flush_idle(struct upcall_cache *cache);
+void upcall_cache_flush_all(struct upcall_cache *cache);
+void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key, void *args);
+struct upcall_cache *upcall_cache_init(const char *name, const char *upcall,
+				       struct upcall_cache_ops *ops);
+void upcall_cache_cleanup(struct upcall_cache *cache);
+
+#if 0
+struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *hash,
+						  __u64 key, __u32 primary,
+						  __u32 ngroups, __u32 *groups);
+void upcall_cache_put_entry(struct upcall_cache *hash,
+			    struct upcall_cache_entry *entry);
+int upcall_cache_downcall(struct upcall_cache *hash, __u32 err, __u64 key,
+			  __u32 primary, __u32 ngroups, __u32 *groups);
+void upcall_cache_flush_idle(struct upcall_cache *cache);
+void upcall_cache_flush_all(struct upcall_cache *cache);
+struct upcall_cache *upcall_cache_init(const char *name);
+void upcall_cache_cleanup(struct upcall_cache *hash);
+
+#endif
+
+/** @} ucache */
+
+#endif /* _LUCACHE_H */
diff --git a/drivers/staging/lustre/include/linux/libcfs/params_tree.h b/drivers/staging/lustre/include/linux/libcfs/params_tree.h
new file mode 100644
index 000000000000..6551f4b030d9
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/params_tree.h
@@ -0,0 +1,230 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * API and structure definitions for params_tree.
+ *
+ * Author: LiuYing <emoly.liu@oracle.com>
+ */
+#ifndef __PARAMS_TREE_H__
+#define __PARAMS_TREE_H__
+
+#include <linux/libcfs/libcfs.h>
+
+#undef LPROCFS
+#if  defined(CONFIG_PROC_FS)
+# define LPROCFS
+#endif
+
+#ifdef LPROCFS
+typedef struct file			     cfs_param_file_t;
+typedef struct inode			    cfs_inode_t;
+typedef struct proc_inode		       cfs_proc_inode_t;
+typedef struct seq_file			 cfs_seq_file_t;
+typedef struct seq_operations		   cfs_seq_ops_t;
+typedef struct file_operations		  cfs_param_file_ops_t;
+typedef module_t			   *cfs_param_module_t;
+typedef struct proc_dir_entry		   cfs_param_dentry_t;
+typedef struct poll_table_struct		cfs_poll_table_t;
+/* LPROCFS build: cfs_* parameter helpers are thin aliases of procfs/seq_file. */
+#define CFS_PARAM_MODULE		THIS_MODULE
+#define CFS_PDE(value)			PDE(value)
+#define cfs_file_private(file)		((file)->private_data)
+#define cfs_dentry_data(dentry)		((dentry)->data)
+#define cfs_proc_inode_pde(proc_inode)	((proc_inode)->pde)
+#define cfs_proc_inode(proc_inode)	((proc_inode)->vfs_inode)
+#define cfs_seq_read_common		seq_read
+#define cfs_seq_lseek_common		seq_lseek
+#define cfs_seq_private(seq)		((seq)->private)
+#define cfs_seq_printf(seq, format, ...) seq_printf(seq, format, ## __VA_ARGS__)
+#define cfs_seq_release(inode, file)	seq_release(inode, file)
+#define cfs_seq_puts(seq, s)		seq_puts(seq, s)
+#define cfs_seq_putc(seq, s)		seq_putc(seq, s)
+#define cfs_seq_read(file, buf, count, ppos, rc)	\
+	((rc) = seq_read(file, buf, count, ppos))
+#define cfs_seq_open(file, ops, rc)	((rc) = seq_open(file, ops))
+
+/* in lprocfs_stat.c, to protect the private data for proc entries */
+extern struct rw_semaphore		_lprocfs_lock;
+
+/*
+ * Since 2.6.23, procfs wraps a caller's file_operations (here Lustre's
+ * lprocfs_generic_fops) in its own proc_reg_file_ops, which introduces a
+ * user count in proc_dir_entry (pde_users) to protect the proc entry from
+ * being deleted.  The protection lock defined by Lustre (_lprocfs_lock) is
+ * therefore no longer necessary for lprocfs_generic_fops (e.g.
+ * lprocfs_fops_read), and both hooks are no-ops.  See bug19706 for details.
+ */
+#define LPROCFS_ENTRY()	do { } while (0)
+#define LPROCFS_EXIT()	do { } while (0)
+
+/* Probe proc entry @dp; a NULL proc_fops is treated as "deleted".
+ * Returns -ENODEV in that case and 0 otherwise.  proc_fops is read under
+ * dp->pde_unload_lock. */
+static inline int LPROCFS_ENTRY_AND_CHECK(struct proc_dir_entry *dp)
+{
+	int rc;
+
+	spin_lock(&dp->pde_unload_lock);
+	rc = (dp->proc_fops == NULL) ? -ENODEV : 0;
+	spin_unlock(&dp->pde_unload_lock);
+
+	return rc;
+}
+/*
+ * Helpers around _lprocfs_lock, the rw_semaphore declared earlier in this
+ * header:
+ *  - LPROCFS_SRCH_ENTRY() / LPROCFS_SRCH_EXIT() take and release it for
+ *    reading (down_read / up_read);
+ *  - LPROCFS_WRITE_ENTRY() / LPROCFS_WRITE_EXIT() take and release it for
+ *    writing (down_write / up_write).
+ */
+#define LPROCFS_SRCH_ENTRY()		\
+	do { down_read(&_lprocfs_lock); } while (0)
+
+#define LPROCFS_SRCH_EXIT()		\
+	do { up_read(&_lprocfs_lock); } while (0)
+
+#define LPROCFS_WRITE_ENTRY()		\
+	do { down_write(&_lprocfs_lock); } while (0)
+
+#define LPROCFS_WRITE_EXIT()		\
+	do { up_write(&_lprocfs_lock); } while (0)
+#else /* !LPROCFS */
+
+typedef struct cfs_params_file {
+	void	   *param_private;
+	loff_t	  param_pos;
+	unsigned int    param_flags;
+} cfs_param_file_t;
+
+typedef struct cfs_param_inode {
+	void    *param_private;
+} cfs_inode_t;
+
+typedef struct cfs_param_dentry {
+	void *param_data;
+} cfs_param_dentry_t;
+
+typedef struct cfs_proc_inode {
+	cfs_param_dentry_t *param_pde;
+	cfs_inode_t	 param_inode;
+} cfs_proc_inode_t;
+
+struct cfs_seq_operations;
+typedef struct cfs_seq_file {
+	char		      *buf;
+	size_t		     size;
+	size_t		     from;
+	size_t		     count;
+	loff_t		     index;
+	loff_t		     version;
+	struct mutex			lock;
+	struct cfs_seq_operations *op;
+	void		      *private;
+} cfs_seq_file_t;
+
+typedef struct cfs_seq_operations {
+	void *(*start) (cfs_seq_file_t *m, loff_t *pos);
+	void  (*stop) (cfs_seq_file_t *m, void *v);
+	void *(*next) (cfs_seq_file_t *m, void *v, loff_t *pos);
+	int   (*show) (cfs_seq_file_t *m, void *v);
+} cfs_seq_ops_t;
+
+typedef void *cfs_param_module_t;
+typedef void *cfs_poll_table_t;
+
+typedef struct cfs_param_file_ops {
+	cfs_param_module_t owner;
+	int (*open) (cfs_inode_t *, struct file *);
+	loff_t (*llseek)(struct file *, loff_t, int);
+	int (*release) (cfs_inode_t *, cfs_param_file_t *);
+	unsigned int (*poll) (struct file *, cfs_poll_table_t *);
+	ssize_t (*write) (struct file *, const char *, size_t, loff_t *);
+	ssize_t (*read)(struct file *, char *, size_t, loff_t *);
+} cfs_param_file_ops_t;
+typedef cfs_param_file_ops_t *cfs_lproc_filep_t;
+
+static inline cfs_proc_inode_t *FAKE_PROC_I(const cfs_inode_t *inode)
+{
+	return container_of(inode, cfs_proc_inode_t, param_inode);
+}
+
+static inline cfs_param_dentry_t *FAKE_PDE(cfs_inode_t *inode)
+{
+	return FAKE_PROC_I(inode)->param_pde;
+}
+
+#define CFS_PARAM_MODULE		NULL	/* !LPROCFS: no owning module */
+#define CFS_PDE(value)			FAKE_PDE(value)
+#define cfs_file_private(file)		((file)->param_private)
+#define cfs_dentry_data(dentry)		((dentry)->param_data)
+#define cfs_proc_inode(proc_inode)	((proc_inode)->param_inode)
+#define cfs_proc_inode_pde(proc_inode)	((proc_inode)->param_pde)
+#define cfs_seq_read_common		NULL	/* no seq_read() here */
+#define cfs_seq_lseek_common		NULL	/* no seq_lseek() here */
+#define cfs_seq_private(seq)		((seq)->private)
+#define cfs_seq_read(file, buf, count, ppos, rc) do {} while (0) /* no-op */
+#define cfs_seq_open(file, ops, rc)	/* !LPROCFS stand-in for seq_open() */ \
+do {								\
+	cfs_seq_file_t *_sf = cfs_file_private(file);		\
+	if (_sf == NULL) {	/* nothing attached yet: allocate */ \
+		LIBCFS_ALLOC(_sf, sizeof(*_sf));		\
+		if (_sf == NULL) {				\
+			(rc) = -ENOMEM;				\
+			break;					\
+		}						\
+		cfs_file_private(file) = _sf;			\
+	}							\
+	memset(_sf, 0, sizeof(*_sf));	/* always start cleared */ \
+	_sf->op = (ops);					\
+	(rc) = 0;						\
+} while (0)
+
+#define LPROCFS_ENTRY()	     do {} while(0)
+#define LPROCFS_EXIT()	      do {} while(0)
+static inline
+int LPROCFS_ENTRY_AND_CHECK(cfs_param_dentry_t *dp)
+{
+	LPROCFS_ENTRY();
+	return 0;
+}
+#define LPROCFS_WRITE_ENTRY()       do {} while(0)
+#define LPROCFS_WRITE_EXIT()	do {} while(0)
+
+#endif /* LPROCFS */
+
+/* XXX: params_tree APIs */
+
+#endif  /* __PARAMS_TREE_H__ */
diff --git a/drivers/staging/lustre/include/linux/lnet/api-support.h b/drivers/staging/lustre/include/linux/lnet/api-support.h
new file mode 100644
index 000000000000..a8d91dbe6060
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/api-support.h
@@ -0,0 +1,44 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_API_SUPPORT_H__
+#define __LNET_API_SUPPORT_H__
+
+#include <linux/lnet/linux/api-support.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/types.h>
+#include <linux/lnet/lnet.h>
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/api.h b/drivers/staging/lustre/include/linux/lnet/api.h
new file mode 100644
index 000000000000..e8642e33860d
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/api.h
@@ -0,0 +1,220 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_API_H__
+#define __LNET_API_H__
+
+/** \defgroup lnet LNet
+ *
+ * The Lustre Networking subsystem.
+ *
+ * LNet is an asynchronous message-passing API, which provides an unreliable
+ * connectionless service that can't guarantee any order. It supports OFA IB,
+ * TCP/IP, and Cray Portals, and routes between heterogeneous networks.
+ *
+ * LNet can run both in OS kernel space and in userspace as a library.
+ * @{
+ */
+
+#include <linux/lnet/types.h>
+
+/** \defgroup lnet_init_fini Initialization and cleanup
+ * The LNet must be properly initialized before any LNet calls can be made.
+ * @{ */
+int LNetInit(void);
+void LNetFini(void);
+
+int LNetNIInit(lnet_pid_t requested_pid);
+int LNetNIFini(void);
+/** @} lnet_init_fini */
+
+/** \defgroup lnet_addr LNet addressing and basic types
+ *
+ * Addressing scheme and basic data types of LNet.
+ *
+ * The LNet API is memory-oriented, so LNet must be able to address not only
+ * end-points but also memory region within a process address space.
+ * An ::lnet_nid_t addresses an end-point. An ::lnet_pid_t identifies a process
+ * in a node. A portal represents an opening in the address space of a
+ * process. Match bits is criteria to identify a region of memory inside a
+ * portal, and offset specifies an offset within the memory region.
+ *
+ * LNet creates a table of portals for each process during initialization.
+ * This table has MAX_PORTALS entries and its size can't be dynamically
+ * changed. A portal stays empty until the owning process starts to add
+ * memory regions to it. A portal is sometimes called an index because
+ * it's an entry in the portals table of a process.
+ *
+ * \see LNetMEAttach
+ * @{ */
+int LNetGetId(unsigned int index, lnet_process_id_t *id);
+int LNetDist(lnet_nid_t nid, lnet_nid_t *srcnid, __u32 *order);
+void LNetSnprintHandle(char *str, int str_len, lnet_handle_any_t handle);
+
+/** @} lnet_addr */
+
+
+/** \defgroup lnet_me Match entries
+ *
+ * A match entry (abbreviated as ME) describes a set of criteria to accept
+ * incoming requests.
+ *
+ * A portal is essentially a match list plus a set of attributes. A match
+ * list is a chain of MEs. Each ME includes a pointer to a memory descriptor
+ * and a set of match criteria. The match criteria can be used to reject
+ * incoming requests based on process ID or the match bits provided in the
+ * request. MEs can be dynamically inserted into a match list by LNetMEAttach()
+ * and LNetMEInsert(), and removed from its list by LNetMEUnlink().
+ * @{ */
+int LNetMEAttach(unsigned int      portal,
+		 lnet_process_id_t match_id_in,
+		 __u64	     match_bits_in,
+		 __u64	     ignore_bits_in,
+		 lnet_unlink_t     unlink_in,
+		 lnet_ins_pos_t    pos_in,
+		 lnet_handle_me_t *handle_out);
+
+int LNetMEInsert(lnet_handle_me_t  current_in,
+		 lnet_process_id_t match_id_in,
+		 __u64	     match_bits_in,
+		 __u64	     ignore_bits_in,
+		 lnet_unlink_t     unlink_in,
+		 lnet_ins_pos_t    position_in,
+		 lnet_handle_me_t *handle_out);
+
+int LNetMEUnlink(lnet_handle_me_t current_in);
+/** @} lnet_me */
+
+/** \defgroup lnet_md Memory descriptors
+ *
+ * A memory descriptor contains information about a region of a user's
+ * memory (either in kernel or user space) and optionally points to an
+ * event queue where information about the operations performed on the
+ * memory descriptor are recorded. Memory descriptor is abbreviated as
+ * MD and can be used interchangeably with the memory region it describes.
+ *
+ * The LNet API provides two operations to create MDs: LNetMDAttach()
+ * and LNetMDBind(); one operation to unlink and release the resources
+ * associated with a MD: LNetMDUnlink().
+ * @{ */
+int LNetMDAttach(lnet_handle_me_t  current_in,
+		 lnet_md_t	 md_in,
+		 lnet_unlink_t     unlink_in,
+		 lnet_handle_md_t *handle_out);
+
+int LNetMDBind(lnet_md_t	 md_in,
+	       lnet_unlink_t     unlink_in,
+	       lnet_handle_md_t *handle_out);
+
+int LNetMDUnlink(lnet_handle_md_t md_in);
+/** @} lnet_md */
+
+/** \defgroup lnet_eq Events and event queues
+ *
+ * Event queues (abbreviated as EQ) are used to log operations performed on
+ * local MDs. In particular, they signal the completion of a data transmission
+ * into or out of a MD. They can also be used to hold acknowledgments for
+ * completed PUT operations and indicate when a MD has been unlinked. Multiple
+ * MDs can share a single EQ. An EQ may have an optional event handler
+ * associated with it. If an event handler exists, it will be run for each
+ * event that is deposited into the EQ.
+ *
+ * In addition to the lnet_handle_eq_t, the LNet API defines two types
+ * associated with events: The ::lnet_event_kind_t defines the kinds of events
+ * that can be stored in an EQ. The lnet_event_t defines a structure that
+ * holds the information about an event.
+ *
+ * There are five functions for dealing with EQs: LNetEQAlloc() is used to
+ * create an EQ and allocate the resources needed, while LNetEQFree()
+ * releases these resources and frees the EQ. LNetEQGet() retrieves the next
+ * event from an EQ, and LNetEQWait() can be used to block a process until
+ * an EQ has at least one event. LNetEQPoll() can be used to test or wait
+ * on multiple EQs.
+ * @{ */
+int LNetEQAlloc(unsigned int       count_in,
+		lnet_eq_handler_t  handler,
+		lnet_handle_eq_t  *handle_out);
+
+int LNetEQFree(lnet_handle_eq_t eventq_in);
+
+int LNetEQGet(lnet_handle_eq_t  eventq_in,
+	      lnet_event_t     *event_out);
+
+
+int LNetEQWait(lnet_handle_eq_t  eventq_in,
+	       lnet_event_t     *event_out);
+
+int LNetEQPoll(lnet_handle_eq_t *eventqs_in,
+	       int	       neq_in,
+	       int	       timeout_ms,
+	       lnet_event_t     *event_out,
+	       int	      *which_eq_out);
+/** @} lnet_eq */
+
+/** \defgroup lnet_data Data movement operations
+ *
+ * The LNet API provides two data movement operations: LNetPut()
+ * and LNetGet().
+ * @{ */
+int LNetPut(lnet_nid_t	self,
+	    lnet_handle_md_t  md_in,
+	    lnet_ack_req_t    ack_req_in,
+	    lnet_process_id_t target_in,
+	    unsigned int      portal_in,
+	    __u64	     match_bits_in,
+	    unsigned int      offset_in,
+	    __u64	     hdr_data_in);
+
+int LNetGet(lnet_nid_t	self,
+	    lnet_handle_md_t  md_in,
+	    lnet_process_id_t target_in,
+	    unsigned int      portal_in,
+	    __u64	     match_bits_in,
+	    unsigned int      offset_in);
+/** @} lnet_data */
+
+
+/** \defgroup lnet_misc Miscellaneous operations.
+ * Miscellaneous operations.
+ * @{ */
+
+int LNetSetLazyPortal(int portal);
+int LNetClearLazyPortal(int portal);
+int LNetCtl(unsigned int cmd, void *arg);
+int LNetSetAsync(lnet_process_id_t id, int nasync);
+
+/** @} lnet_misc */
+
+/** @} lnet */
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/lib-lnet.h b/drivers/staging/lustre/include/linux/lnet/lib-lnet.h
new file mode 100644
index 000000000000..59bff0bea816
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/lib-lnet.h
@@ -0,0 +1,874 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/include/lnet/lib-lnet.h
+ *
+ * Top level include for library side routines
+ */
+
+#ifndef __LNET_LIB_LNET_H__
+#define __LNET_LIB_LNET_H__
+
+#include <linux/lnet/linux/lib-lnet.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/types.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-types.h>
+
+extern lnet_t  the_lnet;			/* THE network */
+
+#if  defined(LNET_USE_LIB_FREELIST)
+/* 1 CPT, simplify implementation... */
+# define LNET_CPT_MAX_BITS      0
+
+#else /* KERNEL and no freelist */
+
+# if (BITS_PER_LONG == 32)
+/* 2 CPTs, allowing more CPTs might make us under memory pressure */
+#  define LNET_CPT_MAX_BITS     1
+
+# else /* 64-bit system */
+/*
+ * 256 CPTs for thousands of CPUs, allowing more CPTs might make us
+ * under risk of consuming all lh_cookie.
+ */
+#  define LNET_CPT_MAX_BITS     8
+# endif /* BITS_PER_LONG == 32 */
+#endif
+
+/* max allowed CPT number */
+#define LNET_CPT_MAX	    (1 << LNET_CPT_MAX_BITS)
+
+#define LNET_CPT_NUMBER	 (the_lnet.ln_cpt_number)
+#define LNET_CPT_BITS	   (the_lnet.ln_cpt_bits)
+#define LNET_CPT_MASK	   ((1ULL << LNET_CPT_BITS) - 1)
+
+/** exclusive lock */
+#define LNET_LOCK_EX	    CFS_PERCPT_LOCK_EX
+
+static inline int lnet_is_wire_handle_none (lnet_handle_wire_t *wh)
+{
+	return (wh->wh_interface_cookie == LNET_WIRE_HANDLE_COOKIE_NONE &&
+		wh->wh_object_cookie == LNET_WIRE_HANDLE_COOKIE_NONE);
+}
+
+/* MD is spent: threshold is 0, or MAX_SIZE is set and max_size no longer fits. */
+static inline int lnet_md_exhausted (lnet_libmd_t *md)
+{
+	return md->md_threshold == 0 || ((md->md_options & LNET_MD_MAX_SIZE) &&
+		md->md_offset + md->md_max_size > md->md_length);
+}
+
+/*
+ * An MD may be unlinked once md_refcount is 0 and either:
+ *  - it is flagged LNET_MD_FLAG_ZOMBIE, i.e. marked for deletion (by auto
+ *    unlink or LNetM[DE]Unlink; in the latter case it need not be exhausted);
+ *  - LNET_MD_FLAG_AUTO_UNLINK is set and the MD is exhausted.
+ * Returns nonzero if unlinkable, 0 otherwise.
+ */
+static inline int lnet_md_unlinkable (lnet_libmd_t *md)
+{
+	if (md->md_refcount != 0)
+		return 0;
+	if (md->md_flags & LNET_MD_FLAG_ZOMBIE)
+		return 1;
+	return (md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) &&
+	       lnet_md_exhausted(md);
+}
+
+#define lnet_cpt_table()	(the_lnet.ln_cpt_table)
+#define lnet_cpt_current()	cfs_cpt_current(the_lnet.ln_cpt_table, 1)
+
+/* Decode the CPT index carried in @cookie, folded below LNET_CPT_NUMBER. */
+static inline int lnet_cpt_of_cookie(__u64 cookie)
+{
+	unsigned int idx = (cookie >> LNET_COOKIE_TYPE_BITS) & LNET_CPT_MASK;
+
+	/* LNET_CPT_NUMBER need not be a power of two, so an invalid cookie
+	 * may decode to an out-of-range index; fold it back with a modulo. */
+	return idx >= LNET_CPT_NUMBER ? idx % LNET_CPT_NUMBER : idx;
+}
+
+/* Lock the_lnet.ln_res_lock for @cpt via cfs_percpt_lock(). */
+static inline void lnet_res_lock(int cpt)
+{
+	cfs_percpt_lock(the_lnet.ln_res_lock, cpt);
+}
+
+/* Unlock the_lnet.ln_res_lock for @cpt via cfs_percpt_unlock(). */
+static inline void lnet_res_unlock(int cpt)
+{
+	cfs_percpt_unlock(the_lnet.ln_res_lock, cpt);
+}
+
+/* Lock the resource lock of lnet_cpt_current() and return that CPT. */
+static inline int lnet_res_lock_current(void)
+{
+	int cur = lnet_cpt_current();
+
+	lnet_res_lock(cur);
+	return cur;
+}
+
+/* Lock the_lnet.ln_net_lock for @cpt via cfs_percpt_lock(). */
+static inline void lnet_net_lock(int cpt)
+{
+	cfs_percpt_lock(the_lnet.ln_net_lock, cpt);
+}
+
+/* Unlock the_lnet.ln_net_lock for @cpt via cfs_percpt_unlock(). */
+static inline void lnet_net_unlock(int cpt)
+{
+	cfs_percpt_unlock(the_lnet.ln_net_lock, cpt);
+}
+
+/* Lock the network lock of lnet_cpt_current() and return that CPT. */
+static inline int lnet_net_lock_current(void)
+{
+	int cur = lnet_cpt_current();
+
+	lnet_net_lock(cur);
+	return cur;
+}
+
+#define LNET_LOCK()		lnet_net_lock(LNET_LOCK_EX)
+#define LNET_UNLOCK()		lnet_net_unlock(LNET_LOCK_EX)
+
+
+#define lnet_ptl_lock(ptl)	spin_lock(&(ptl)->ptl_lock)
+#define lnet_ptl_unlock(ptl)	spin_unlock(&(ptl)->ptl_lock)
+#define lnet_eq_wait_lock()	spin_lock(&the_lnet.ln_eq_wait_lock)
+#define lnet_eq_wait_unlock()	spin_unlock(&the_lnet.ln_eq_wait_lock)
+#define lnet_ni_lock(ni)	spin_lock(&(ni)->ni_lock)
+#define lnet_ni_unlock(ni)	spin_unlock(&(ni)->ni_lock)
+#define LNET_MUTEX_LOCK(m)	mutex_lock(m)
+#define LNET_MUTEX_UNLOCK(m)	mutex_unlock(m)
+
+
+#define MAX_PORTALS     64
+
+/* these are only used by code with LNET_USE_LIB_FREELIST, but we still
+ * export them to !LNET_USE_LIB_FREELIST for easy implementation */
+#define LNET_FL_MAX_MES		2048
+#define LNET_FL_MAX_MDS		2048
+#define LNET_FL_MAX_EQS		512
+#define LNET_FL_MAX_MSGS	2048    /* Outstanding messages */
+
+#ifdef LNET_USE_LIB_FREELIST
+
+int lnet_freelist_init(lnet_freelist_t *fl, int n, int size);
+void lnet_freelist_fini(lnet_freelist_t *fl);
+
+/* Pop the first object off @fl and return its fo_contents, or NULL if the
+ * freelist is empty.  ALWAYS called with liblock held. */
+static inline void *lnet_freelist_alloc (lnet_freelist_t *fl)
+{
+	lnet_freeobj_t *obj;
+
+	if (list_empty(&fl->fl_list))
+		return NULL;
+
+	obj = list_entry(fl->fl_list.next, lnet_freeobj_t, fo_list);
+	list_del(&obj->fo_list);
+	return (void *)&obj->fo_contents;
+}
+
+/* Put @obj back at the head of @fl; ALWAYS called with liblock held. */
+static inline void lnet_freelist_free (lnet_freelist_t *fl, void *obj)
+{
+	lnet_freeobj_t *owner;
+
+	owner = list_entry(obj, lnet_freeobj_t, fo_contents);
+	list_add(&owner->fo_list, &fl->fl_list);
+}
+
+
+/*
+ * Take an EQ from the EQ container's freelist (NULL if none is left).
+ * NEVER called with resource lock held: it takes resource lock 0 itself.
+ */
+static inline lnet_eq_t *lnet_eq_alloc (void)
+{
+	lnet_eq_t *eq;
+
+	LASSERT(LNET_CPT_NUMBER == 1);
+
+	lnet_res_lock(0);
+	eq = lnet_freelist_alloc(&the_lnet.ln_eq_container.rec_freelist);
+	lnet_res_unlock(0);
+	return eq;
+}
+
+/*
+ * Give @eq back to the EQ container's freelist.
+ * ALWAYS called with resource lock held.
+ */
+static inline void lnet_eq_free_locked(lnet_eq_t *eq)
+{
+	LASSERT(LNET_CPT_NUMBER == 1);
+	lnet_freelist_free(&the_lnet.ln_eq_container.rec_freelist, eq);
+}
+
+/* Unlocked variant: wraps lnet_eq_free_locked() in resource lock 0. */
+static inline void lnet_eq_free(lnet_eq_t *eq)
+{
+	lnet_res_lock(0);
+	lnet_eq_free_locked(eq);
+	lnet_res_unlock(0);
+}
+
+/* Take an MD from the freelist of MD container 0 and initialise its md_list;
+ * returns NULL if the freelist is empty.  @umd is unused in this variant.
+ * NEVER called with resource lock held. */
+static inline lnet_libmd_t *lnet_md_alloc (lnet_md_t *umd)
+{
+	struct lnet_res_container *cont = the_lnet.ln_md_containers[0];
+	lnet_libmd_t *md;
+
+	LASSERT(LNET_CPT_NUMBER == 1);
+	lnet_res_lock(0);
+	md = lnet_freelist_alloc(&cont->rec_freelist);
+	lnet_res_unlock(0);
+
+	if (md == NULL)
+		return NULL;
+	INIT_LIST_HEAD(&md->md_list);
+	return md;
+}
+
+/* Give @md back to the freelist of MD container 0.
+ * ALWAYS called with resource lock held. */
+static inline void lnet_md_free_locked(lnet_libmd_t *md)
+{
+	struct lnet_res_container *cont = the_lnet.ln_md_containers[0];
+
+	LASSERT(LNET_CPT_NUMBER == 1);
+	lnet_freelist_free(&cont->rec_freelist, md);
+}
+
+/* Unlocked variant: wraps lnet_md_free_locked() in resource lock 0. */
+static inline void lnet_md_free(lnet_libmd_t *md)
+{
+	lnet_res_lock(0);
+	lnet_md_free_locked(md);
+	lnet_res_unlock(0);
+}
+
+/* Take an ME from the freelist of ME container 0 (NULL if none is left).
+ * NEVER called with resource lock held: it takes resource lock 0 itself. */
+static inline lnet_me_t *lnet_me_alloc(void)
+{
+	struct lnet_res_container *cont = the_lnet.ln_me_containers[0];
+	lnet_me_t *me;
+
+	LASSERT(LNET_CPT_NUMBER == 1);
+
+	lnet_res_lock(0);
+	me = lnet_freelist_alloc(&cont->rec_freelist);
+	lnet_res_unlock(0);
+
+	return me;
+}
+
+/* Give @me back to the freelist of ME container 0.
+ * ALWAYS called with resource lock held. */
+static inline void lnet_me_free_locked(lnet_me_t *me)
+{
+	struct lnet_res_container *cont = the_lnet.ln_me_containers[0];
+
+	LASSERT(LNET_CPT_NUMBER == 1);
+	lnet_freelist_free(&cont->rec_freelist, me);
+}
+
+/* Unlocked variant: wraps lnet_me_free_locked() in resource lock 0. */
+static inline void lnet_me_free(lnet_me_t *me)
+{
+	lnet_res_lock(0);
+	lnet_me_free_locked(me);
+	lnet_res_unlock(0);
+}
+
+/*
+ * Take a message from the freelist of message container 0 and zero it
+ * (NULL if the freelist is empty).  NEVER called with network lock held.
+ */
+static inline lnet_msg_t *lnet_msg_alloc (void)
+{
+	struct lnet_msg_container *cont = the_lnet.ln_msg_containers[0];
+	lnet_msg_t *msg;
+
+	LASSERT(LNET_CPT_NUMBER == 1);
+	lnet_net_lock(0);
+	msg = lnet_freelist_alloc(&cont->msc_freelist);
+	lnet_net_unlock(0);
+
+	/* NULL pointers, clear flags etc */
+	if (msg != NULL)
+		memset(msg, 0, sizeof(*msg));
+	return msg;
+}
+
+/* Give @msg back to the freelist of message container 0; asserts that
+ * msg_onactivelist is clear.  ALWAYS called with network lock held. */
+static inline void lnet_msg_free_locked(lnet_msg_t *msg)
+{
+	struct lnet_msg_container *cont = the_lnet.ln_msg_containers[0];
+
+	LASSERT(LNET_CPT_NUMBER == 1);
+	LASSERT(!msg->msg_onactivelist);
+	lnet_freelist_free(&cont->msc_freelist, msg);
+}
+
+/* Unlocked variant: wraps lnet_msg_free_locked() in network lock 0. */
+static inline void lnet_msg_free (lnet_msg_t *msg)
+{
+	lnet_net_lock(0);
+	lnet_msg_free_locked(msg);
+	lnet_net_unlock(0);
+}
+
+#else /* !LNET_USE_LIB_FREELIST */
+
+/* Allocate one lnet_eq_t via LIBCFS_ALLOC() and return it.
+ * NEVER called with liblock held. */
+static inline lnet_eq_t *lnet_eq_alloc (void)
+{
+	lnet_eq_t *new_eq;
+
+	LIBCFS_ALLOC(new_eq, sizeof(*new_eq));
+	return new_eq;
+}
+
+/* Release @eq with LIBCFS_FREE().
+ * ALWAYS called with resource lock held. */
+static inline void lnet_eq_free(lnet_eq_t *eq)
+{
+	LIBCFS_FREE(eq, sizeof(*eq));
+}
+
+/* Allocate a library MD with room for @umd's fragments: umd->length of them
+ * for LNET_MD_KIOV or LNET_MD_IOVEC, otherwise one.  The size is the offset
+ * just past that many md_iov.kiov[] (KIOV) or md_iov.iov[] entries.
+ * NEVER called with liblock held. */
+static inline lnet_libmd_t *lnet_md_alloc (lnet_md_t *umd)
+{
+	lnet_libmd_t *md;
+	unsigned int nfrags = 1;
+	unsigned int bytes;
+
+	if ((umd->options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0)
+		nfrags = umd->length;
+
+	if ((umd->options & LNET_MD_KIOV) != 0)
+		bytes = offsetof(lnet_libmd_t, md_iov.kiov[nfrags]);
+	else
+		bytes = offsetof(lnet_libmd_t, md_iov.iov[nfrags]);
+
+	LIBCFS_ALLOC(md, bytes);
+	if (md == NULL)
+		return NULL;
+
+	/* Set here in case of early free */
+	md->md_options = umd->options;
+	md->md_niov = nfrags;
+	INIT_LIST_HEAD(&md->md_list);
+	return md;
+}
+
+/*
+ * Free @md, recomputing its allocation size from md_options and md_niov.
+ * ALWAYS called with resource lock held.
+ */
+static inline void lnet_md_free(lnet_libmd_t *md)
+{
+	int is_kiov = (md->md_options & LNET_MD_KIOV) != 0;
+	unsigned int bytes;
+
+	bytes = is_kiov ? offsetof(lnet_libmd_t, md_iov.kiov[md->md_niov]) :
+			  offsetof(lnet_libmd_t, md_iov.iov[md->md_niov]);
+	LIBCFS_FREE(md, bytes);
+}
+
+/* Allocate one lnet_me_t via LIBCFS_ALLOC() and return it.
+ * NEVER called with liblock held. */
+static inline lnet_me_t *lnet_me_alloc (void)
+{
+	lnet_me_t *new_me;
+
+	LIBCFS_ALLOC(new_me, sizeof(*new_me));
+	return new_me;
+}
+
+/* Release @me with LIBCFS_FREE().
+ * ALWAYS called with resource lock held. */
+static inline void lnet_me_free(lnet_me_t *me)
+{
+	LIBCFS_FREE(me, sizeof(*me));
+}
+
+/* Allocate one lnet_msg_t via LIBCFS_ALLOC() and return it.
+ * NEVER called with liblock held. */
+static inline lnet_msg_t *lnet_msg_alloc(void)
+{
+	lnet_msg_t *new_msg;
+
+	/* no explicit zeroing: LIBCFS_ALLOC already does it for us */
+	LIBCFS_ALLOC(new_msg, sizeof(*new_msg));
+
+	return new_msg;
+}
+
+/* Release @msg with LIBCFS_FREE(); asserts that msg_onactivelist is clear.
+ * ALWAYS called with network lock held. */
+static inline void lnet_msg_free(lnet_msg_t *msg)
+{
+	LASSERT(!msg->msg_onactivelist);
+	LIBCFS_FREE(msg, sizeof(*msg));
+}
+
+#define lnet_eq_free_locked(eq)		lnet_eq_free(eq)
+#define lnet_md_free_locked(md)		lnet_md_free(md)
+#define lnet_me_free_locked(me)		lnet_me_free(me)
+#define lnet_msg_free_locked(msg)	lnet_msg_free(msg)
+
+#endif /* LNET_USE_LIB_FREELIST */
+
+lnet_libhandle_t *lnet_res_lh_lookup(struct lnet_res_container *rec,
+				     __u64 cookie);
+void lnet_res_lh_initialize(struct lnet_res_container *rec,
+			    lnet_libhandle_t *lh);
+/* Unhash @lh by removing it from its lh_hash_chain list.
+ * ALWAYS called with resource lock held. */
+static inline void lnet_res_lh_invalidate(lnet_libhandle_t *lh)
+{
+	/* NB: cookie is still useful, don't reset it */
+	list_del(&lh->lh_hash_chain);
+}
+
+/* Fill @handle with @eq's cookie; for a NULL @eq the handle is invalidated
+ * with LNetInvalidateHandle() instead. */
+static inline void lnet_eq2handle (lnet_handle_eq_t *handle, lnet_eq_t *eq)
+{
+	if (eq != NULL) {
+		handle->cookie = eq->eq_lh.lh_cookie;
+		return;
+	}
+	LNetInvalidateHandle(handle);
+}
+
+/*
+ * Look @handle's cookie up in the EQ container; returns the owning EQ, or
+ * NULL if the lookup fails.  ALWAYS called with resource lock held.
+ */
+static inline lnet_eq_t *lnet_handle2eq(lnet_handle_eq_t *handle)
+{
+	lnet_libhandle_t *found;
+
+	found = lnet_res_lh_lookup(&the_lnet.ln_eq_container, handle->cookie);
+
+	return found != NULL ? lh_entry(found, lnet_eq_t, eq_lh) : NULL;
+}
+
+/* Copy @md's handle cookie (md_lh.lh_cookie) into @handle. */
+static inline void lnet_md2handle (lnet_handle_md_t *handle, lnet_libmd_t *md)
+{
+	handle->cookie = md->md_lh.lh_cookie;
+}
+
+/*
+ * Resolve @handle to its MD: pick the MD container from the CPT encoded in
+ * the cookie, then look the cookie up there.  Returns NULL if not found.
+ * ALWAYS called with resource lock held.
+ */
+static inline lnet_libmd_t *lnet_handle2md(lnet_handle_md_t *handle)
+{
+	int cpt = lnet_cpt_of_cookie(handle->cookie);
+	lnet_libhandle_t *found;
+
+	found = lnet_res_lh_lookup(the_lnet.ln_md_containers[cpt],
+				   handle->cookie);
+
+	return found != NULL ? lh_entry(found, lnet_libmd_t, md_lh) : NULL;
+}
+
+/* Resolve wire handle @wh to an MD.  Returns NULL when wh_interface_cookie
+ * does not match the_lnet.ln_interface_cookie, or when wh_object_cookie is
+ * not found in the MD container of the CPT encoded in it.
+ * ALWAYS called with resource lock held.
+ */
+static inline lnet_libmd_t *lnet_wire_handle2md(lnet_handle_wire_t *wh)
+{
+	lnet_libhandle_t *found;
+	int cpt;
+
+	if (wh->wh_interface_cookie != the_lnet.ln_interface_cookie)
+		return NULL;
+
+	cpt = lnet_cpt_of_cookie(wh->wh_object_cookie);
+	found = lnet_res_lh_lookup(the_lnet.ln_md_containers[cpt],
+				   wh->wh_object_cookie);
+	return found != NULL ? lh_entry(found, lnet_libmd_t, md_lh) : NULL;
+}
+
+/* Copy @me's handle cookie (me_lh.lh_cookie) into @handle. */
+static inline void lnet_me2handle (lnet_handle_me_t *handle, lnet_me_t *me)
+{
+	handle->cookie = me->me_lh.lh_cookie;
+}
+
+/*
+ * Resolve @handle to its ME: pick the ME container from the CPT encoded in
+ * the cookie, then look the cookie up there.  Returns NULL if not found.
+ * ALWAYS called with resource lock held.
+ */
+static inline lnet_me_t *lnet_handle2me(lnet_handle_me_t *handle)
+{
+	int cpt = lnet_cpt_of_cookie(handle->cookie);
+	lnet_libhandle_t *found;
+
+	found = lnet_res_lh_lookup(the_lnet.ln_me_containers[cpt],
+				   handle->cookie);
+
+	return found != NULL ? lh_entry(found, lnet_me_t, me_lh) : NULL;
+}
+
+/* Take one more reference on @lp; asserts it already has at least one. */
+static inline void lnet_peer_addref_locked(lnet_peer_t *lp)
+{
+	LASSERT(lp->lp_refcount > 0);
+	++lp->lp_refcount;
+}
+
+extern void lnet_destroy_peer_locked(lnet_peer_t *lp);
+
+/* Drop one reference on @lp; when the count reaches 0 the peer is handed
+ * to lnet_destroy_peer_locked(). */
+static inline void lnet_peer_decref_locked(lnet_peer_t *lp)
+{
+	LASSERT(lp->lp_refcount > 0);
+	if (--lp->lp_refcount == 0)
+		lnet_destroy_peer_locked(lp);
+}
+
+/* Nonzero when @lp has a non-zero lp_rtr_refcount. */
+static inline int lnet_isrouter(lnet_peer_t *lp)
+{
+	return lp->lp_rtr_refcount != 0;
+}
+
+/* Increment @ni's reference counter for partition @cpt (range-asserted). */
+static inline void lnet_ni_addref_locked(lnet_ni_t *ni, int cpt)
+{
+	LASSERT(cpt >= 0 && cpt < LNET_CPT_NUMBER);
+	LASSERT(*ni->ni_refs[cpt] >= 0);
+
+	++*ni->ni_refs[cpt];
+}
+
+/* Take a reference on @ni in slot 0, bracketed by lnet_net_lock(0). */
+static inline void lnet_ni_addref(lnet_ni_t *ni)
+{
+	lnet_net_lock(0);
+	lnet_ni_addref_locked(ni, 0);
+	lnet_net_unlock(0);
+}
+
+/* Decrement the reference counter that @ni keeps for partition @cpt. */
+static inline void lnet_ni_decref_locked(lnet_ni_t *ni, int cpt)
+{
+	LASSERT(cpt >= 0 && cpt < LNET_CPT_NUMBER);
+	LASSERT(*ni->ni_refs[cpt] > 0);
+
+	*ni->ni_refs[cpt] -= 1;
+}
+
+/* Drop a reference on @ni in slot 0, bracketed by lnet_net_lock(0). */
+static inline void lnet_ni_decref(lnet_ni_t *ni)
+{
+	lnet_net_lock(0);
+	lnet_ni_decref_locked(ni, 0);
+	lnet_net_unlock(0);
+}
+
+void lnet_ni_free(lnet_ni_t *ni);
+
+/* Hash @nid with cfs_hash_long() using LNET_PEER_HASH_BITS bits. */
+static inline int lnet_nid2peerhash(lnet_nid_t nid)
+{
+	return cfs_hash_long(nid, LNET_PEER_HASH_BITS);
+}
+
+/* Remote-net hash bucket for @net: (NETNUM + NETTYP) masked to hbits bits. */
+static inline struct list_head *lnet_net2rnethash(__u32 net)
+{
+	return the_lnet.ln_remote_nets_hash +
+	       ((LNET_NETNUM(net) + LNET_NETTYP(net)) &
+		((1U << the_lnet.ln_remote_nets_hbits) - 1));
+}
+
+extern lnd_t the_lolnd;
+
+
+extern int lnet_cpt_of_nid_locked(lnet_nid_t nid);
+extern int lnet_cpt_of_nid(lnet_nid_t nid);
+extern lnet_ni_t *lnet_nid2ni_locked(lnet_nid_t nid, int cpt);
+extern lnet_ni_t *lnet_net2ni_locked(__u32 net, int cpt);
+extern lnet_ni_t *lnet_net2ni(__u32 net);
+
+int lnet_notify(lnet_ni_t *ni, lnet_nid_t peer, int alive, cfs_time_t when);
+void lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, cfs_time_t when);
+int lnet_add_route(__u32 net, unsigned int hops, lnet_nid_t gateway_nid);
+int lnet_check_routes(void);
+int lnet_del_route(__u32 net, lnet_nid_t gw_nid);
+void lnet_destroy_routes(void);
+int lnet_get_route(int idx, __u32 *net, __u32 *hops,
+		   lnet_nid_t *gateway, __u32 *alive);
+void lnet_proc_init(void);
+void lnet_proc_fini(void);
+int  lnet_rtrpools_alloc(int im_a_router);
+void lnet_rtrpools_free(void);
+lnet_remotenet_t *lnet_find_net_locked (__u32 net);
+
+int lnet_islocalnid(lnet_nid_t nid);
+int lnet_islocalnet(__u32 net);
+
+void lnet_msg_attach_md(lnet_msg_t *msg, lnet_libmd_t *md,
+			unsigned int offset, unsigned int mlen);
+void lnet_msg_detach_md(lnet_msg_t *msg, int status);
+void lnet_build_unlink_event(lnet_libmd_t *md, lnet_event_t *ev);
+void lnet_build_msg_event(lnet_msg_t *msg, lnet_event_kind_t ev_type);
+void lnet_msg_commit(lnet_msg_t *msg, int cpt);
+void lnet_msg_decommit(lnet_msg_t *msg, int cpt, int status);
+
+void lnet_eq_enqueue_event(lnet_eq_t *eq, lnet_event_t *ev);
+void lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target,
+		    unsigned int offset, unsigned int len);
+int lnet_send(lnet_nid_t nid, lnet_msg_t *msg, lnet_nid_t rtr_nid);
+void lnet_return_tx_credits_locked(lnet_msg_t *msg);
+void lnet_return_rx_credits_locked(lnet_msg_t *msg);
+
+/* portals functions */
+/* portals attributes */
+/* Returns 1 when LNET_PTL_LAZY is set in @ptl's options, else 0. */
+static inline int lnet_ptl_is_lazy(lnet_portal_t *ptl)
+{
+	return (ptl->ptl_options & LNET_PTL_LAZY) != 0;
+}
+
+/* Returns 1 when LNET_PTL_MATCH_UNIQUE is set in @ptl's options, else 0. */
+static inline int lnet_ptl_is_unique(lnet_portal_t *ptl)
+{
+	return (ptl->ptl_options & LNET_PTL_MATCH_UNIQUE) != 0;
+}
+
+/* Returns 1 when LNET_PTL_MATCH_WILDCARD is set in @ptl's options, else 0. */
+static inline int lnet_ptl_is_wildcard(lnet_portal_t *ptl)
+{
+	return (ptl->ptl_options & LNET_PTL_MATCH_WILDCARD) != 0;
+}
+
+/* Turn on every bit of @opt in @ptl's option flags. */
+static inline void lnet_ptl_setopt(lnet_portal_t *ptl, int opt)
+{
+	ptl->ptl_options = ptl->ptl_options | opt;
+}
+
+/* Turn off every bit of @opt in @ptl's option flags. */
+static inline void lnet_ptl_unsetopt(lnet_portal_t *ptl, int opt)
+{
+	ptl->ptl_options = ptl->ptl_options & ~opt;
+}
+
+/* match-table functions */
+struct list_head *lnet_mt_match_head(struct lnet_match_table *mtable,
+			       lnet_process_id_t id, __u64 mbits);
+struct lnet_match_table *lnet_mt_of_attach(unsigned int index,
+					   lnet_process_id_t id, __u64 mbits,
+					   __u64 ignore_bits,
+					   lnet_ins_pos_t pos);
+int lnet_mt_match_md(struct lnet_match_table *mtable,
+		     struct lnet_match_info *info, struct lnet_msg *msg);
+
+/* portals match/attach functions */
+void lnet_ptl_attach_md(lnet_me_t *me, lnet_libmd_t *md,
+			struct list_head *matches, struct list_head *drops);
+void lnet_ptl_detach_md(lnet_me_t *me, lnet_libmd_t *md);
+int lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg);
+
+/* initialize and finalize portals */
+int lnet_portals_create(void);
+void lnet_portals_destroy(void);
+
+/* message functions */
+int lnet_parse (lnet_ni_t *ni, lnet_hdr_t *hdr,
+		lnet_nid_t fromnid, void *private, int rdma_req);
+void lnet_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
+	       unsigned int offset, unsigned int mlen, unsigned int rlen);
+lnet_msg_t *lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *get_msg);
+void lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *msg, unsigned int len);
+void lnet_finalize(lnet_ni_t *ni, lnet_msg_t *msg, int rc);
+void lnet_drop_delayed_msg_list(struct list_head *head, char *reason);
+void lnet_recv_delayed_msg_list(struct list_head *head);
+
+int lnet_msg_container_setup(struct lnet_msg_container *container, int cpt);
+void lnet_msg_container_cleanup(struct lnet_msg_container *container);
+void lnet_msg_containers_destroy(void);
+int lnet_msg_containers_create(void);
+
+char *lnet_msgtyp2str (int type);
+void lnet_print_hdr (lnet_hdr_t * hdr);
+int lnet_fail_nid(lnet_nid_t nid, unsigned int threshold);
+
+void lnet_counters_get(lnet_counters_t *counters);
+void lnet_counters_reset(void);
+
+unsigned int lnet_iov_nob (unsigned int niov, struct iovec *iov);
+int lnet_extract_iov (int dst_niov, struct iovec *dst,
+		      int src_niov, struct iovec *src,
+		      unsigned int offset, unsigned int len);
+
+unsigned int lnet_kiov_nob (unsigned int niov, lnet_kiov_t *iov);
+int lnet_extract_kiov (int dst_niov, lnet_kiov_t *dst,
+		      int src_niov, lnet_kiov_t *src,
+		      unsigned int offset, unsigned int len);
+
+void lnet_copy_iov2iov (unsigned int ndiov, struct iovec *diov,
+			unsigned int doffset,
+			unsigned int nsiov, struct iovec *siov,
+			unsigned int soffset, unsigned int nob);
+void lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov,
+			 unsigned int iovoffset,
+			 unsigned int nkiov, lnet_kiov_t *kiov,
+			 unsigned int kiovoffset, unsigned int nob);
+void lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov,
+			 unsigned int kiovoffset,
+			 unsigned int niov, struct iovec *iov,
+			 unsigned int iovoffset, unsigned int nob);
+void lnet_copy_kiov2kiov (unsigned int ndkiov, lnet_kiov_t *dkiov,
+			  unsigned int doffset,
+			  unsigned int nskiov, lnet_kiov_t *skiov,
+			  unsigned int soffset, unsigned int nob);
+
+/* Present flat buffer @dest (@dlen bytes) as a 1-entry iovec destination. */
+static inline void
+lnet_copy_iov2flat(int dlen, void *dest, unsigned int doffset,
+		   unsigned int nsiov, struct iovec *siov, unsigned int soffset,
+		   unsigned int nob)
+{
+	struct iovec diov = { .iov_base = dest, .iov_len = dlen };
+
+	lnet_copy_iov2iov(1, &diov, doffset, nsiov, siov, soffset, nob);
+}
+
+/* Present flat buffer @dest (@dlen bytes) as a 1-entry iovec destination. */
+static inline void
+lnet_copy_kiov2flat(int dlen, void *dest, unsigned int doffset,
+		    unsigned int nsiov, lnet_kiov_t *skiov, unsigned int soffset,
+		    unsigned int nob)
+{
+	struct iovec diov = { .iov_base = dest, .iov_len = dlen };
+
+	lnet_copy_kiov2iov(1, &diov, doffset, nsiov, skiov, soffset, nob);
+}
+
+/* Present flat buffer @src (@slen bytes) as a 1-entry iovec source. */
+static inline void
+lnet_copy_flat2iov(unsigned int ndiov, struct iovec *diov, unsigned int doffset,
+		   int slen, void *src, unsigned int soffset, unsigned int nob)
+{
+	struct iovec siov = { .iov_base = src, .iov_len = slen };
+	lnet_copy_iov2iov(ndiov, diov, doffset, 1, &siov, soffset, nob);
+}
+
+/* Present flat buffer @src (@slen bytes) as a 1-entry iovec source. */
+static inline void
+lnet_copy_flat2kiov(unsigned int ndiov, lnet_kiov_t *dkiov, unsigned int doffset,
+		    int slen, void *src, unsigned int soffset, unsigned int nob)
+{
+	struct iovec siov = { .iov_base = src, .iov_len = slen };
+	lnet_copy_iov2kiov(ndiov, dkiov, doffset, 1, &siov, soffset, nob);
+}
+
+void lnet_me_unlink(lnet_me_t *me);
+
+void lnet_md_unlink(lnet_libmd_t *md);
+void lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd);
+
+void lnet_register_lnd(lnd_t *lnd);
+void lnet_unregister_lnd(lnd_t *lnd);
+int lnet_set_ip_niaddr (lnet_ni_t *ni);
+
+/* connection setup */
+int lnet_connect(socket_t **sockp, lnet_nid_t peer_nid,
+		 __u32 local_ip, __u32 peer_ip, int peer_port);
+void lnet_connect_console_error(int rc, lnet_nid_t peer_nid,
+				__u32 peer_ip, int port);
+
+/* acceptor */
+int lnet_count_acceptor_nis(void);
+int lnet_acceptor_timeout(void);
+int lnet_acceptor_port(void);
+
+int lnet_acceptor_start(void);
+void lnet_acceptor_stop(void);
+
+void lnet_get_tunables(void);
+int lnet_peers_start_down(void);
+int lnet_peer_buffer_credits(lnet_ni_t *ni);
+
+int lnet_router_checker_start(void);
+void lnet_router_checker_stop(void);
+void lnet_swap_pinginfo(lnet_ping_info_t *info);
+
+int lnet_ping_target_init(void);
+void lnet_ping_target_fini(void);
+int lnet_ping(lnet_process_id_t id, int timeout_ms,
+	      lnet_process_id_t *ids, int n_ids);
+
+int lnet_parse_ip2nets (char **networksp, char *ip2nets);
+int lnet_parse_routes (char *route_str, int *im_a_router);
+int lnet_parse_networks (struct list_head *nilist, char *networks);
+
+int lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt);
+lnet_peer_t *lnet_find_peer_locked(struct lnet_peer_table *ptable,
+				   lnet_nid_t nid);
+void lnet_peer_tables_cleanup(void);
+void lnet_peer_tables_destroy(void);
+int lnet_peer_tables_create(void);
+void lnet_debug_peer(lnet_nid_t nid);
+
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/lib-types.h b/drivers/staging/lustre/include/linux/lnet/lib-types.h
new file mode 100644
index 000000000000..86428d4b993e
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/lib-types.h
@@ -0,0 +1,765 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/include/lnet/lib-types.h
+ *
+ * Types used by the library side routines that do not need to be
+ * exposed to the user application
+ */
+
+#ifndef __LNET_LIB_TYPES_H__
+#define __LNET_LIB_TYPES_H__
+
+#include <linux/lnet/linux/lib-types.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/list.h>
+#include <linux/lnet/types.h>
+
+#define WIRE_ATTR       __attribute__((packed))
+
+/* Packed version of lnet_process_id_t to transfer via network */
+typedef struct {
+	lnet_nid_t nid;
+	lnet_pid_t pid;   /* node id / process id */
+} WIRE_ATTR lnet_process_id_packed_t;
+
+/* The wire handle's interface cookie only matches one network interface in
+ * one epoch (i.e. new cookie when the interface restarts or the node
+ * reboots).  The object cookie only matches one object on that interface
+ * during that object's lifetime (i.e. no cookie re-use). */
+typedef struct {
+	__u64 wh_interface_cookie;
+	__u64 wh_object_cookie;
+} WIRE_ATTR lnet_handle_wire_t;
+
+typedef enum {
+	LNET_MSG_ACK = 0,
+	LNET_MSG_PUT,
+	LNET_MSG_GET,
+	LNET_MSG_REPLY,
+	LNET_MSG_HELLO,
+} lnet_msg_type_t;
+
+/* The variant fields of the portals message header are aligned on an 8
+ * byte boundary in the message header.  Note that all types used in these
+ * wire structs MUST be fixed size and the smaller types are placed at the
+ * end. */
+typedef struct lnet_ack {
+	lnet_handle_wire_t  dst_wmd;
+	__u64	       match_bits;
+	__u32	       mlength;
+} WIRE_ATTR lnet_ack_t;
+
+typedef struct lnet_put {
+	lnet_handle_wire_t  ack_wmd;
+	__u64	       match_bits;
+	__u64	       hdr_data;
+	__u32	       ptl_index;
+	__u32	       offset;
+} WIRE_ATTR lnet_put_t;
+
+typedef struct lnet_get {
+	lnet_handle_wire_t  return_wmd;
+	__u64	       match_bits;
+	__u32	       ptl_index;
+	__u32	       src_offset;
+	__u32	       sink_length;
+} WIRE_ATTR lnet_get_t;
+
+typedef struct lnet_reply {
+	lnet_handle_wire_t  dst_wmd;
+} WIRE_ATTR lnet_reply_t;
+
+typedef struct lnet_hello {
+	__u64	      incarnation;
+	__u32	      type;
+} WIRE_ATTR lnet_hello_t;
+
+typedef struct {
+	lnet_nid_t	  dest_nid;
+	lnet_nid_t	  src_nid;
+	lnet_pid_t	  dest_pid;
+	lnet_pid_t	  src_pid;
+	__u32	       type;	       /* lnet_msg_type_t */
+	__u32	       payload_length;     /* payload data to follow */
+	/*<------__u64 aligned------->*/
+	union {
+		lnet_ack_t   ack;
+		lnet_put_t   put;
+		lnet_get_t   get;
+		lnet_reply_t reply;
+		lnet_hello_t hello;
+	} msg;
+} WIRE_ATTR lnet_hdr_t;
+
+/* A HELLO message contains a magic number and protocol version
+ * code in the header's dest_nid, the peer's NID in the src_nid, and
+ * LNET_MSG_HELLO in the type field.  All other common fields are zero
+ * (including payload_length; i.e. no payload).
+ * This is for use by byte-stream LNDs (e.g. TCP/IP) to check the peer is
+ * running the same protocol and to find out its NID. These LNDs should
+ * exchange HELLO messages when a connection is first established.  Individual
+ * LNDs can put whatever else they fancy in lnet_hdr_t::msg.
+ */
+typedef struct {
+	__u32   magic;			  /* LNET_PROTO_TCP_MAGIC */
+	__u16   version_major;		  /* increment on incompatible change */
+	__u16   version_minor;		  /* increment on compatible change */
+} WIRE_ATTR lnet_magicversion_t;
+
+/* PROTO MAGIC for LNDs */
+#define LNET_PROTO_IB_MAGIC		 0x0be91b91
+#define LNET_PROTO_RA_MAGIC		 0x0be91b92
+#define LNET_PROTO_QSW_MAGIC		0x0be91b93
+#define LNET_PROTO_GNI_MAGIC		0xb00fbabe /* ask Kim */
+#define LNET_PROTO_TCP_MAGIC		0xeebc0ded
+#define LNET_PROTO_PTL_MAGIC		0x50746C4E /* 'PtlN' unique magic */
+#define LNET_PROTO_MX_MAGIC		 0x4d583130 /* 'MX10'! */
+#define LNET_PROTO_ACCEPTOR_MAGIC	   0xacce7100
+#define LNET_PROTO_PING_MAGIC	       0x70696E67 /* 'ping' */
+
+/* Placeholder for a future "unified" protocol across all LNDs */
+/* Current LNDs that receive a request with this magic will respond with a
+ * "stub" reply using their current protocol */
+#define LNET_PROTO_MAGIC		    0x45726963 /* ! */
+
+
+#define LNET_PROTO_TCP_VERSION_MAJOR	1
+#define LNET_PROTO_TCP_VERSION_MINOR	0
+
+/* Acceptor connection request */
+typedef struct {
+	__u32       acr_magic;		  /* LNET_PROTO_ACCEPTOR_MAGIC */
+	__u32       acr_version;		/* protocol version */
+	__u64       acr_nid;		    /* target NID */
+} WIRE_ATTR lnet_acceptor_connreq_t;
+
+#define LNET_PROTO_ACCEPTOR_VERSION       1
+
+/* forward refs */
+struct lnet_libmd;
+
+typedef struct lnet_msg {
+	struct list_head	    msg_activelist;
+	struct list_head	    msg_list;	   /* Q for credits/MD */
+
+	lnet_process_id_t     msg_target;
+	/* where is it from, it's only for building event */
+	lnet_nid_t		msg_from;
+	__u32			msg_type;
+
+	/* committed for sending */
+	unsigned int		msg_tx_committed:1;
+	/* CPT # this message committed for sending */
+	unsigned int		msg_tx_cpt:15;
+	/* committed for receiving */
+	unsigned int		msg_rx_committed:1;
+	/* CPT # this message committed for receiving */
+	unsigned int		msg_rx_cpt:15;
+	/* queued for tx credit */
+	unsigned int		msg_tx_delayed:1;
+	/* queued for RX buffer */
+	unsigned int		msg_rx_delayed:1;
+	/* ready for pending on RX delay list */
+	unsigned int		msg_rx_ready_delay:1;
+
+	unsigned int	  msg_vmflush:1;      /* VM trying to free memory */
+	unsigned int	  msg_target_is_router:1; /* sending to a router */
+	unsigned int	  msg_routing:1;      /* being forwarded */
+	unsigned int	  msg_ack:1;	  /* ack on finalize (PUT) */
+	unsigned int	  msg_sending:1;      /* outgoing message */
+	unsigned int	  msg_receiving:1;    /* being received */
+	unsigned int	  msg_txcredit:1;     /* taken an NI send credit */
+	unsigned int	  msg_peertxcredit:1; /* taken a peer send credit */
+	unsigned int	  msg_rtrcredit:1;    /* taken a global router credit */
+	unsigned int	  msg_peerrtrcredit:1; /* taken a peer router credit */
+	unsigned int	  msg_onactivelist:1; /* on the activelist */
+
+	struct lnet_peer     *msg_txpeer;	 /* peer I'm sending to */
+	struct lnet_peer     *msg_rxpeer;	 /* peer I received from */
+
+	void		 *msg_private;
+	struct lnet_libmd    *msg_md;
+
+	unsigned int	  msg_len;
+	unsigned int	  msg_wanted;
+	unsigned int	  msg_offset;
+	unsigned int	  msg_niov;
+	struct iovec	 *msg_iov;
+	lnet_kiov_t	  *msg_kiov;
+
+	lnet_event_t	  msg_ev;
+	lnet_hdr_t	    msg_hdr;
+} lnet_msg_t;
+
+
+typedef struct lnet_libhandle {
+	struct list_head	    lh_hash_chain;
+	__u64		 lh_cookie;
+} lnet_libhandle_t;
+
+#define lh_entry(ptr, type, member)	/* handle -> enclosing object */ \
+	((type *)((char *)(ptr) - offsetof(type, member)))
+
+typedef struct lnet_eq {
+	struct list_head		eq_list;
+	lnet_libhandle_t	eq_lh;
+	lnet_seq_t		eq_enq_seq;
+	lnet_seq_t		eq_deq_seq;
+	unsigned int		eq_size;
+	lnet_eq_handler_t	eq_callback;
+	lnet_event_t		*eq_events;
+	int			**eq_refs;	/* percpt refcount for EQ */
+} lnet_eq_t;
+
+typedef struct lnet_me {
+	struct list_head	     me_list;
+	lnet_libhandle_t       me_lh;
+	lnet_process_id_t      me_match_id;
+	unsigned int	   me_portal;
+	unsigned int	   me_pos;		/* hash offset in mt_hash */
+	__u64		  me_match_bits;
+	__u64		  me_ignore_bits;
+	lnet_unlink_t	  me_unlink;
+	struct lnet_libmd     *me_md;
+} lnet_me_t;
+
+typedef struct lnet_libmd {
+	struct list_head	    md_list;
+	lnet_libhandle_t      md_lh;
+	lnet_me_t	    *md_me;
+	char		 *md_start;
+	unsigned int	  md_offset;
+	unsigned int	  md_length;
+	unsigned int	  md_max_size;
+	int		   md_threshold;
+	int		   md_refcount;
+	unsigned int	  md_options;
+	unsigned int	  md_flags;
+	void		 *md_user_ptr;
+	lnet_eq_t	    *md_eq;
+	unsigned int	  md_niov;		/* # frags */
+	union {
+		struct iovec  iov[LNET_MAX_IOV];
+		lnet_kiov_t   kiov[LNET_MAX_IOV];
+	} md_iov;
+} lnet_libmd_t;
+
+#define LNET_MD_FLAG_ZOMBIE	   (1 << 0)
+#define LNET_MD_FLAG_AUTO_UNLINK      (1 << 1)
+
+#ifdef LNET_USE_LIB_FREELIST
+typedef struct
+{
+	void		  *fl_objs;	  /* single contiguous array of objects */
+	int		    fl_nobjs;	 /* the number of them */
+	int		    fl_objsize;       /* the size (including overhead) of each of them */
+	struct list_head	     fl_list;	  /* where they are enqueued */
+} lnet_freelist_t;
+
+typedef struct
+{
+	struct list_head	     fo_list;	     /* enqueue on fl_list */
+	void		  *fo_contents;	 /* aligned contents */
+} lnet_freeobj_t;
+#endif
+
+typedef struct {
+	/* info about peers we are trying to fail */
+	struct list_head	     tp_list;	     /* ln_test_peers */
+	lnet_nid_t	     tp_nid;	      /* matching nid */
+	unsigned int	   tp_threshold;	/* # failures to simulate */
+} lnet_test_peer_t;
+
+#define LNET_COOKIE_TYPE_MD    1
+#define LNET_COOKIE_TYPE_ME    2
+#define LNET_COOKIE_TYPE_EQ    3
+#define LNET_COOKIE_TYPE_BITS  2
+#define LNET_COOKIE_MASK	((1ULL << LNET_COOKIE_TYPE_BITS) - 1ULL)
+
+struct lnet_ni;				  /* forward ref */
+
+typedef struct lnet_lnd
+{
+	/* fields managed by portals */
+	struct list_head	    lnd_list;	     /* stash in the LND table */
+	int		   lnd_refcount;	 /* # active instances */
+
+	/* fields initialised by the LND */
+	unsigned int	  lnd_type;
+
+	int  (*lnd_startup) (struct lnet_ni *ni);
+	void (*lnd_shutdown) (struct lnet_ni *ni);
+	int  (*lnd_ctl)(struct lnet_ni *ni, unsigned int cmd, void *arg);
+
+	/* In data movement APIs below, payload buffers are described as a set
+	 * of 'niov' fragments which are...
+	 * EITHER
+	 *    in virtual memory (struct iovec *iov != NULL)
+	 * OR
+	 *    in pages (kernel only: lnet_kiov_t *kiov != NULL).
+	 * The LND may NOT overwrite these fragment descriptors.
+	 * An 'offset' may specify a byte offset within the set of
+	 * fragments to start from.
+	 */
+
+	/* Start sending a preformatted message.  'private' is NULL for PUT and
+	 * GET messages; otherwise this is a response to an incoming message
+	 * and 'private' is the 'private' passed to lnet_parse().  Return
+	 * non-zero for immediate failure, otherwise complete later with
+	 * lnet_finalize() */
+	int (*lnd_send)(struct lnet_ni *ni, void *private, lnet_msg_t *msg);
+
+	/* Start receiving 'mlen' bytes of payload data, skipping the following
+	 * 'rlen' - 'mlen' bytes. 'private' is the 'private' passed to
+	 * lnet_parse().  Return non-zero for immediate failure, otherwise
+	 * complete later with lnet_finalize().  This also gives back a receive
+	 * credit if the LND does flow control. */
+	int (*lnd_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg,
+			int delayed, unsigned int niov,
+			struct iovec *iov, lnet_kiov_t *kiov,
+			unsigned int offset, unsigned int mlen, unsigned int rlen);
+
+	/* lnet_parse() has had to delay processing of this message
+	 * (e.g. waiting for a forwarding buffer or send credits).  Give the
+	 * LND a chance to free urgently needed resources.  If called, return 0
+	 * for success and do NOT give back a receive credit; that has to wait
+	 * until lnd_recv() gets called.  On failure return < 0 and
+	 * release resources; lnd_recv() will not be called. */
+	int (*lnd_eager_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg,
+			      void **new_privatep);
+
+	/* notification of peer health */
+	void (*lnd_notify)(struct lnet_ni *ni, lnet_nid_t peer, int alive);
+
+	/* query of peer aliveness */
+	void (*lnd_query)(struct lnet_ni *ni, lnet_nid_t peer, cfs_time_t *when);
+
+	/* accept a new connection */
+	int (*lnd_accept)(struct lnet_ni *ni, socket_t *sock);
+
+} lnd_t;
+
+#define LNET_NI_STATUS_UP      0x15aac0de
+#define LNET_NI_STATUS_DOWN    0xdeadface
+#define LNET_NI_STATUS_INVALID 0x00000000
+typedef struct {
+	lnet_nid_t ns_nid;
+	__u32      ns_status;
+	__u32      ns_unused;
+} WIRE_ATTR lnet_ni_status_t;
+
+struct lnet_tx_queue {
+	int			tq_credits;	/* # tx credits free */
+	int			tq_credits_min;	/* lowest it's been */
+	int			tq_credits_max;	/* total # tx credits */
+	struct list_head		tq_delayed;	/* delayed TXs */
+};
+
+#define LNET_MAX_INTERFACES   16
+
+typedef struct lnet_ni {
+	spinlock_t		ni_lock;
+	struct list_head		ni_list;	/* chain on ln_nis */
+	struct list_head		ni_cptlist;	/* chain on ln_nis_cpt */
+	int			ni_maxtxcredits; /* # tx credits  */
+	/* # per-peer send credits */
+	int			ni_peertxcredits;
+	/* # per-peer router buffer credits */
+	int			ni_peerrtrcredits;
+	/* seconds to consider peer dead */
+	int			ni_peertimeout;
+	int			ni_ncpts;	/* number of CPTs */
+	__u32			*ni_cpts;	/* bond NI on some CPTs */
+	lnet_nid_t		ni_nid;		/* interface's NID */
+	void			*ni_data;	/* instance-specific data */
+	lnd_t			*ni_lnd;	/* procedural interface */
+	struct lnet_tx_queue	**ni_tx_queues;	/* percpt TX queues */
+	int			**ni_refs;	/* percpt reference count */
+	long			ni_last_alive;	/* when I was last alive */
+	lnet_ni_status_t	*ni_status;	/* my health status */
+	/* equivalent interfaces to use */
+	char			*ni_interfaces[LNET_MAX_INTERFACES];
+} lnet_ni_t;
+
+#define LNET_PROTO_PING_MATCHBITS	0x8000000000000000LL
+
+/* NB: value of these features equal to LNET_PROTO_PING_VERSION_x
+ * of old LNet, so there shouldn't be any compatibility issue */
+#define LNET_PING_FEAT_INVAL		(0)		/* no feature */
+#define LNET_PING_FEAT_BASE		(1 << 0)	/* just a ping */
+#define LNET_PING_FEAT_NI_STATUS	(1 << 1)	/* return NI status */
+
+#define LNET_PING_FEAT_MASK		(LNET_PING_FEAT_BASE | \
+					 LNET_PING_FEAT_NI_STATUS)
+
+typedef struct {
+	__u32			pi_magic;
+	__u32			pi_features;
+	lnet_pid_t		pi_pid;
+	__u32			pi_nnis;
+	lnet_ni_status_t	pi_ni[0];
+} WIRE_ATTR lnet_ping_info_t;
+
+/* router checker data, per router */
+#define LNET_MAX_RTR_NIS   16
+#define LNET_PINGINFO_SIZE offsetof(lnet_ping_info_t, pi_ni[LNET_MAX_RTR_NIS])
+typedef struct {
+	/* chain on the_lnet.ln_zombie_rcd or ln_deathrow_rcd */
+	struct list_head		rcd_list;
+	lnet_handle_md_t	rcd_mdh;	/* ping buffer MD */
+	struct lnet_peer	*rcd_gateway;	/* reference to gateway */
+	lnet_ping_info_t	*rcd_pinginfo;	/* ping buffer */
+} lnet_rc_data_t;
+
+typedef struct lnet_peer {
+	struct list_head	lp_hashlist;	  /* chain on peer hash */
+	struct list_head	lp_txq;	       /* messages blocking for tx credits */
+	struct list_head	lp_rtrq;	      /* messages blocking for router credits */
+	struct list_head	lp_rtr_list;	  /* chain on router list */
+	int	       lp_txcredits;	 /* # tx credits available */
+	int	       lp_mintxcredits;      /* low water mark */
+	int	       lp_rtrcredits;	/* # router credits */
+	int	       lp_minrtrcredits;     /* low water mark */
+	unsigned int      lp_alive:1;	   /* alive/dead? */
+	unsigned int      lp_notify:1;	  /* notification outstanding? */
+	unsigned int      lp_notifylnd:1;       /* outstanding notification for LND? */
+	unsigned int      lp_notifying:1;       /* some thread is handling notification */
+	unsigned int      lp_ping_notsent;      /* SEND event outstanding from ping */
+	int	       lp_alive_count;       /* # times router went dead<->alive */
+	long	      lp_txqnob;	    /* bytes queued for sending */
+	cfs_time_t	lp_timestamp;	 /* time of last aliveness news */
+	cfs_time_t	lp_ping_timestamp;    /* time of last ping attempt */
+	cfs_time_t	lp_ping_deadline;     /* != 0 if ping reply expected */
+	cfs_time_t	lp_last_alive;	/* when I was last alive */
+	cfs_time_t	lp_last_query;	/* when lp_ni was queried last time */
+	lnet_ni_t	*lp_ni;		/* interface peer is on */
+	lnet_nid_t	lp_nid;	       /* peer's NID */
+	int	       lp_refcount;	  /* # refs */
+	int			lp_cpt;		/* CPT this peer attached on */
+	/* # refs from lnet_route_t::lr_gateway */
+	int			lp_rtr_refcount;
+	/* returned RC ping features */
+	unsigned int		lp_ping_feats;
+	struct list_head		lp_routes;	/* routers on this peer */
+	lnet_rc_data_t		*lp_rcd;	/* router checker state */
+} lnet_peer_t;
+
+
+/* peer hash size */
+#define LNET_PEER_HASH_BITS     9
+#define LNET_PEER_HASH_SIZE     (1 << LNET_PEER_HASH_BITS)
+
+/* peer hash table */
+struct lnet_peer_table {
+	int			pt_version;	/* /proc validity stamp */
+	int			pt_number;	/* # peers extant */
+	struct list_head		pt_deathrow;	/* zombie peers */
+	struct list_head		*pt_hash;	/* NID->peer hash */
+};
+
+/* peer aliveness is enabled only on routers for peers in a network where the
+ * lnet_ni_t::ni_peertimeout has been set to a positive value */
+#define lnet_peer_aliveness_enabled(lp) (the_lnet.ln_routing != 0 && \
+					 (lp)->lp_ni->ni_peertimeout > 0)
+
+typedef struct {
+	struct list_head		lr_list;	/* chain on net */
+	struct list_head		lr_gwlist;	/* chain on gateway */
+	lnet_peer_t		*lr_gateway;	/* router node */
+	__u32			lr_net;		/* remote network number */
+	int			lr_seq;		/* sequence for round-robin */
+	unsigned int		lr_downis;	/* number of down NIs */
+	unsigned int		lr_hops;	/* how far I am */
+} lnet_route_t;
+
+#define LNET_REMOTE_NETS_HASH_DEFAULT	(1U << 7)
+#define LNET_REMOTE_NETS_HASH_MAX	(1U << 16)
+#define LNET_REMOTE_NETS_HASH_SIZE	(1 << the_lnet.ln_remote_nets_hbits)
+
+typedef struct {
+	struct list_head	      lrn_list;       /* chain on ln_remote_nets_hash */
+	struct list_head	      lrn_routes;     /* routes to me */
+	__u32		   lrn_net;	/* my net number */
+} lnet_remotenet_t;
+
+typedef struct {
+	struct list_head rbp_bufs;	     /* my free buffer pool */
+	struct list_head rbp_msgs;	     /* messages blocking for a buffer */
+	int	rbp_npages;	   /* # pages in each buffer */
+	int	rbp_nbuffers;	 /* # buffers */
+	int	rbp_credits;	  /* # free buffers / blocked messages */
+	int	rbp_mincredits;       /* low water mark */
+} lnet_rtrbufpool_t;
+
+typedef struct {
+	struct list_head	     rb_list;	     /* chain on rbp_bufs */
+	lnet_rtrbufpool_t     *rb_pool;	     /* owning pool */
+	lnet_kiov_t	    rb_kiov[0];	  /* the buffer space */
+} lnet_rtrbuf_t;
+
+typedef struct {
+	__u32	msgs_alloc;
+	__u32	msgs_max;
+	__u32	errors;
+	__u32	send_count;
+	__u32	recv_count;
+	__u32	route_count;
+	__u32	drop_count;
+	__u64	send_length;
+	__u64	recv_length;
+	__u64	route_length;
+	__u64	drop_length;
+} WIRE_ATTR lnet_counters_t;
+
+#define LNET_PEER_HASHSIZE   503		/* prime! */
+
+#define LNET_NRBPOOLS	 3		 /* # different router buffer pools */
+
+enum {
+	/* Didn't match anything */
+	LNET_MATCHMD_NONE	= (1 << 0),
+	/* Matched OK */
+	LNET_MATCHMD_OK		= (1 << 1),
+	/* Must be discarded */
+	LNET_MATCHMD_DROP	= (1 << 2),
+	/* match and buffer is exhausted */
+	LNET_MATCHMD_EXHAUSTED  = (1 << 3),
+	/* match or drop */
+	LNET_MATCHMD_FINISH     = (LNET_MATCHMD_OK | LNET_MATCHMD_DROP),
+};
+
+/* Options for lnet_portal_t::ptl_options */
+#define LNET_PTL_LAZY	       (1 << 0)
+#define LNET_PTL_MATCH_UNIQUE       (1 << 1)    /* unique match, for RDMA */
+#define LNET_PTL_MATCH_WILDCARD     (1 << 2)    /* wildcard match, request portal */
+
+/* parameter for matching operations (GET, PUT) */
+struct lnet_match_info {
+	__u64			mi_mbits;
+	lnet_process_id_t	mi_id;
+	unsigned int		mi_opc;
+	unsigned int		mi_portal;
+	unsigned int		mi_rlength;
+	unsigned int		mi_roffset;
+};
+
+/* ME hash of RDMA portal */
+#define LNET_MT_HASH_BITS		8
+#define LNET_MT_HASH_SIZE		(1 << LNET_MT_HASH_BITS)
+#define LNET_MT_HASH_MASK		(LNET_MT_HASH_SIZE - 1)
+/* we allocate (LNET_MT_HASH_SIZE + 1) entries for lnet_match_table::mt_mhash,
+ * the last entry is reserved for MEs with ignore-bits */
+#define LNET_MT_HASH_IGNORE		LNET_MT_HASH_SIZE
+/* __u64 has 2^6 bits, so need 2^(LNET_MT_HASH_BITS - LNET_MT_BITS_U64) which
+ * is 4 __u64s as bit-map, and add an extra __u64 (only use one bit) for the
+ * ME-list with ignore-bits, which is mtable::mt_mhash[LNET_MT_HASH_IGNORE] */
+#define LNET_MT_BITS_U64		6	/* 2^6 bits */
+#define LNET_MT_EXHAUSTED_BITS		(LNET_MT_HASH_BITS - LNET_MT_BITS_U64)
+#define LNET_MT_EXHAUSTED_BMAP		((1 << LNET_MT_EXHAUSTED_BITS) + 1)
+
+/* portal match table */
+struct lnet_match_table {
+	/* reserved for upcoming patches, CPU partition ID */
+	unsigned int		mt_cpt;
+	unsigned int		mt_portal;      /* portal index */
+	/* match table is set as "enabled" if there's non-exhausted MD
+	 * attached on mt_mhash, it's only valid for wildcard portal */
+	unsigned int		mt_enabled;
+	/* bitmap to flag whether MEs on mt_mhash are exhausted or not */
+	__u64			mt_exhausted[LNET_MT_EXHAUSTED_BMAP];
+	struct list_head		*mt_mhash;      /* matching hash */
+};
+
+/* these are only useful for wildcard portal */
+/* Turn off message rotor for wildcard portals */
+#define	LNET_PTL_ROTOR_OFF	0
+/* round-robin dispatch all PUT messages for wildcard portals */
+#define	LNET_PTL_ROTOR_ON	1
+/* round-robin dispatch routed PUT message for wildcard portals */
+#define	LNET_PTL_ROTOR_RR_RT	2
+/* dispatch routed PUT message by hashing source NID for wildcard portals */
+#define	LNET_PTL_ROTOR_HASH_RT	3
+
+typedef struct lnet_portal {
+	spinlock_t		ptl_lock;
+	unsigned int		ptl_index;	/* portal ID, reserved */
+	/* flags on this portal: lazy, unique... */
+	unsigned int		ptl_options;
+	/* list of messages which are stealing buffer */
+	struct list_head		ptl_msg_stealing;
+	/* messages blocking for MD */
+	struct list_head		ptl_msg_delayed;
+	/* Match table for each CPT */
+	struct lnet_match_table	**ptl_mtables;
+	/* spread rotor of incoming "PUT" */
+	int			ptl_rotor;
+	/* # active entries for this portal */
+	int		     ptl_mt_nmaps;
+	/* array of active entries' cpu-partition-id */
+	int		     ptl_mt_maps[0];
+} lnet_portal_t;
+
+#define LNET_LH_HASH_BITS	12
+#define LNET_LH_HASH_SIZE	(1ULL << LNET_LH_HASH_BITS)
+#define LNET_LH_HASH_MASK	(LNET_LH_HASH_SIZE - 1)
+
+/* resource container (ME, MD, EQ) */
+struct lnet_res_container {
+	unsigned int		rec_type;	/* container type */
+	__u64			rec_lh_cookie;	/* cookie generator */
+	struct list_head		rec_active;	/* active resource list */
+	struct list_head		*rec_lh_hash;	/* handle hash */
+#ifdef LNET_USE_LIB_FREELIST
+	lnet_freelist_t		rec_freelist;	/* freelist for resources */
+#endif
+};
+
+/* message container */
+struct lnet_msg_container {
+	int			msc_init;	/* initialized or not */
+	/* max # threads finalizing */
+	int			msc_nfinalizers;
+	/* msgs waiting to complete finalizing */
+	struct list_head		msc_finalizing;
+	struct list_head		msc_active;	/* active message list */
+	/* threads doing finalization */
+	void			**msc_finalizers;
+#ifdef LNET_USE_LIB_FREELIST
+	lnet_freelist_t		msc_freelist;	/* freelist for messages */
+#endif
+};
+
+/* Router Checker states */
+#define LNET_RC_STATE_SHUTDOWN		0	/* not started */
+#define LNET_RC_STATE_RUNNING		1	/* started up OK */
+#define LNET_RC_STATE_STOPPING		2	/* telling thread to stop */
+
+typedef struct
+{
+	/* CPU partition table of LNet */
+	struct cfs_cpt_table		*ln_cpt_table;
+	/* number of CPTs in ln_cpt_table */
+	unsigned int			ln_cpt_number;
+	unsigned int			ln_cpt_bits;
+
+	/* protect LNet resources (ME/MD/EQ) */
+	struct cfs_percpt_lock		*ln_res_lock;
+	/* # portals */
+	int				ln_nportals;
+	/* the vector of portals */
+	lnet_portal_t			**ln_portals;
+	/* percpt ME containers */
+	struct lnet_res_container	**ln_me_containers;
+	/* percpt MD container */
+	struct lnet_res_container	**ln_md_containers;
+
+	/* Event Queue container */
+	struct lnet_res_container	ln_eq_container;
+	wait_queue_head_t			ln_eq_waitq;
+	spinlock_t			ln_eq_wait_lock;
+	unsigned int			ln_remote_nets_hbits;
+
+	/* protect NI, peer table, credits, routers, rtrbuf... */
+	struct cfs_percpt_lock		*ln_net_lock;
+	/* percpt message containers for active/finalizing/freed message */
+	struct lnet_msg_container	**ln_msg_containers;
+	lnet_counters_t			**ln_counters;
+	struct lnet_peer_table		**ln_peer_tables;
+	/* failure simulation */
+	struct list_head			ln_test_peers;
+
+	struct list_head			ln_nis;		/* LND instances */
+	/* NIs bound to specific CPT(s) */
+	struct list_head			ln_nis_cpt;
+	/* dying LND instances */
+	struct list_head			ln_nis_zombie;
+	lnet_ni_t			*ln_loni;	/* the loopback NI */
+	/* NI to wait for events in */
+	lnet_ni_t			*ln_eq_waitni;
+
+	/* remote networks with routes to them */
+	struct list_head			*ln_remote_nets_hash;
+	/* validity stamp */
+	__u64				ln_remote_nets_version;
+	/* list of all known routers */
+	struct list_head			ln_routers;
+	/* validity stamp */
+	__u64				ln_routers_version;
+	/* percpt router buffer pools */
+	lnet_rtrbufpool_t		**ln_rtrpools;
+
+	lnet_handle_md_t		ln_ping_target_md;
+	lnet_handle_eq_t		ln_ping_target_eq;
+	lnet_ping_info_t		*ln_ping_info;
+
+	/* router checker startup/shutdown state */
+	int				ln_rc_state;
+	/* router checker's event queue */
+	lnet_handle_eq_t		ln_rc_eqh;
+	/* rcd still pending on net */
+	struct list_head			ln_rcd_deathrow;
+	/* rcd ready for free */
+	struct list_head			ln_rcd_zombie;
+	/* serialise startup/shutdown */
+	struct semaphore		ln_rc_signal;
+
+	struct mutex			ln_api_mutex;
+	struct mutex			ln_lnd_mutex;
+	int				ln_init;	/* LNetInit() called? */
+	/* Have I called LNetNIInit myself? */
+	int				ln_niinit_self;
+	/* LNetNIInit/LNetNIFini counter */
+	int				ln_refcount;
+	/* shutdown in progress */
+	int				ln_shutdown;
+
+	int				ln_routing;	/* am I a router? */
+	lnet_pid_t			ln_pid;		/* requested pid */
+	/* uniquely identifies this ni in this epoch */
+	__u64				ln_interface_cookie;
+	/* registered LNDs */
+	struct list_head			ln_lnds;
+
+	/* space for network names */
+	char				*ln_network_tokens;
+	int				ln_network_tokens_nob;
+	/* test protocol compatibility flags */
+	int				ln_testprotocompat;
+
+} lnet_t;
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/linux/api-support.h b/drivers/staging/lustre/include/linux/lnet/linux/api-support.h
new file mode 100644
index 000000000000..ca78a0a4e908
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/linux/api-support.h
@@ -0,0 +1,43 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_API_SUPPORT_H__
+#define __LINUX_API_SUPPORT_H__
+
+#ifndef __LNET_API_SUPPORT_H__
+#error Do not #include this file directly. #include <lnet /api-support.h> instead
+#endif
+
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/linux/lib-lnet.h b/drivers/staging/lustre/include/linux/lnet/linux/lib-lnet.h
new file mode 100644
index 000000000000..d2c0a70f1f7e
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/linux/lib-lnet.h
@@ -0,0 +1,72 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_LINUX_LIB_LNET_H__
+#define __LNET_LINUX_LIB_LNET_H__
+
+#ifndef __LNET_LIB_LNET_H__
+#error Do not #include this file directly. #include <linux/lnet/lib-lnet.h> instead
+#endif
+
+# include <asm/page.h>
+# include <linux/string.h>
+# include <asm/io.h>
+# include <linux/libcfs/libcfs.h>
+
+static inline __u64
+lnet_page2phys (struct page *p)
+{
+	/* sizeof() is a constant, so the optimizer can elide unused cases */
+
+	switch (sizeof(typeof(page_to_phys(p)))) {
+	case 4:
+		/* page_to_phys returns a 32 bit physical address.  This must
+		 * be a 32 bit machine with <= 4G memory; cast to an unsigned
+		 * type so widening to __u64 does not sign extend. */
+		return (unsigned long)page_to_phys(p);
+
+	case 8:
+		/* page_to_phys already returns a 64 bit physical address */
+		return page_to_phys(p);
+
+	default:
+		LBUG();
+		return 0;
+	}
+}
+
+
+#define LNET_ROUTER
+
+#endif /* __LNET_LINUX_LIB_LNET_H__ */
diff --git a/drivers/staging/lustre/include/linux/lnet/linux/lib-types.h b/drivers/staging/lustre/include/linux/lnet/linux/lib-types.h
new file mode 100644
index 000000000000..669e8c038534
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/linux/lib-types.h
@@ -0,0 +1,45 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_LINUX_LIB_TYPES_H__
+#define __LNET_LINUX_LIB_TYPES_H__
+
+#ifndef __LNET_LIB_TYPES_H__
+#error Do not #include this file directly. #include <linux/lnet/lib-types.h> instead
+#endif
+
+# include <linux/uio.h>
+# include <linux/types.h>
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/linux/lnet.h b/drivers/staging/lustre/include/linux/lnet/linux/lnet.h
new file mode 100644
index 000000000000..1e888f1efc45
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/linux/lnet.h
@@ -0,0 +1,56 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_LINUX_LNET_H__
+#define __LNET_LINUX_LNET_H__
+
+#ifndef __LNET_H__
+#error Do not #include this file directly. #include <linux/lnet/lnet.h> instead
+#endif
+
+/*
+ * lnet.h
+ *
+ * User application interface file
+ */
+
+#include <linux/uio.h>
+#include <linux/types.h>
+
+#define cfs_tcp_sendpage(sk, page, offset, size, flags) \
+	tcp_sendpage(sk, page, offset, size, flags)
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/lnet-sysctl.h b/drivers/staging/lustre/include/linux/lnet/lnet-sysctl.h
new file mode 100644
index 000000000000..1bde44ebb911
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/lnet-sysctl.h
@@ -0,0 +1,51 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_SYSCTL_H__
+#define __LNET_SYSCTL_H__
+
+#if defined(CONFIG_SYSCTL)
+
+
+#define CTL_KRANAL      201
+#define CTL_O2IBLND     205
+#define CTL_PTLLND      206
+#define CTL_QSWNAL      207
+#define CTL_SOCKLND     208
+#define CTL_GNILND      210
+
+
+#endif
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/lnet.h b/drivers/staging/lustre/include/linux/lnet/lnet.h
new file mode 100644
index 000000000000..c532b15d7643
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/lnet.h
@@ -0,0 +1,51 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_H__
+#define __LNET_H__
+
+/*
+ * lnet.h
+ *
+ * User application interface file
+ */
+#include <linux/lnet/linux/lnet.h>
+
+#include <linux/lnet/types.h>
+#include <linux/lnet/api.h>
+
+#define LNET_NIDSTR_COUNT  1024    /* # of nidstrings */
+#define LNET_NIDSTR_SIZE   32      /* size of each one (see below for usage) */
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/lnetctl.h b/drivers/staging/lustre/include/linux/lnet/lnetctl.h
new file mode 100644
index 000000000000..b22daa234255
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/lnetctl.h
@@ -0,0 +1,80 @@
+/*
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * header for libptlctl.a
+ */
+#ifndef _PTLCTL_H_
+#define _PTLCTL_H_
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/types.h>
+
+#define LNET_DEV_ID 0
+#define LNET_DEV_PATH "/dev/lnet"
+#define LNET_DEV_MAJOR 10
+#define LNET_DEV_MINOR 240
+#define OBD_DEV_ID 1
+#define OBD_DEV_NAME "obd"
+#define OBD_DEV_PATH "/dev/" OBD_DEV_NAME
+#define OBD_DEV_MAJOR 10
+#define OBD_DEV_MINOR 241
+#define SMFS_DEV_ID  2
+#define SMFS_DEV_PATH "/dev/snapdev"
+#define SMFS_DEV_MAJOR 10
+#define SMFS_DEV_MINOR 242
+
+int ptl_initialize(int argc, char **argv);
+int jt_ptl_network(int argc, char **argv);
+int jt_ptl_list_nids(int argc, char **argv);
+int jt_ptl_which_nid(int argc, char **argv);
+int jt_ptl_print_interfaces(int argc, char **argv);
+int jt_ptl_add_interface(int argc, char **argv);
+int jt_ptl_del_interface(int argc, char **argv);
+int jt_ptl_print_peers (int argc, char **argv);
+int jt_ptl_add_peer (int argc, char **argv);
+int jt_ptl_del_peer (int argc, char **argv);
+int jt_ptl_print_connections (int argc, char **argv);
+int jt_ptl_disconnect(int argc, char **argv);
+int jt_ptl_push_connection(int argc, char **argv);
+int jt_ptl_print_active_txs(int argc, char **argv);
+int jt_ptl_ping(int argc, char **argv);
+int jt_ptl_mynid(int argc, char **argv);
+int jt_ptl_add_uuid(int argc, char **argv);
+int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility  */
+int jt_ptl_close_uuid(int argc, char **argv);
+int jt_ptl_del_uuid(int argc, char **argv);
+int jt_ptl_add_route (int argc, char **argv);
+int jt_ptl_del_route (int argc, char **argv);
+int jt_ptl_notify_router (int argc, char **argv);
+int jt_ptl_print_routes (int argc, char **argv);
+int jt_ptl_fail_nid (int argc, char **argv);
+int jt_ptl_lwt(int argc, char **argv);
+int jt_ptl_testprotocompat(int argc, char **argv);
+int jt_ptl_memhog(int argc, char **argv);
+
+int dbg_initialize(int argc, char **argv);
+int jt_dbg_filter(int argc, char **argv);
+int jt_dbg_show(int argc, char **argv);
+int jt_dbg_list(int argc, char **argv);
+int jt_dbg_debug_kernel(int argc, char **argv);
+int jt_dbg_debug_daemon(int argc, char **argv);
+int jt_dbg_debug_file(int argc, char **argv);
+int jt_dbg_clear_debug_buf(int argc, char **argv);
+int jt_dbg_mark_debug_buf(int argc, char **argv);
+int jt_dbg_modules(int argc, char **argv);
+int jt_dbg_panic(int argc, char **argv);
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/lnetst.h b/drivers/staging/lustre/include/linux/lnet/lnetst.h
new file mode 100644
index 000000000000..d90f94e94601
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/lnetst.h
@@ -0,0 +1,491 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/include/lnet/lnetst.h
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#ifndef __LNET_ST_H__
+#define __LNET_ST_H__
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-types.h>
+
+#define LST_FEAT_NONE		(0)
+#define LST_FEAT_BULK_LEN	(1 << 0)	/* enable variable page size */
+
+#define LST_FEATS_EMPTY		(LST_FEAT_NONE)
+#define LST_FEATS_MASK		(LST_FEAT_NONE | LST_FEAT_BULK_LEN)
+
+#define LST_NAME_SIZE	   32	      /* max name buffer length */
+
+#define LSTIO_DEBUG	     0xC00	   /* debug */
+#define LSTIO_SESSION_NEW       0xC01	   /* create session */
+#define LSTIO_SESSION_END       0xC02	   /* end session */
+#define LSTIO_SESSION_INFO      0xC03	   /* query session */
+#define LSTIO_GROUP_ADD	 0xC10	   /* add group */
+#define LSTIO_GROUP_LIST	0xC11	   /* list all groups in session */
+#define LSTIO_GROUP_INFO	0xC12	   /* query default information of specified group */
+#define LSTIO_GROUP_DEL	 0xC13	   /* delete group */
+#define LSTIO_NODES_ADD	 0xC14	   /* add nodes to specified group */
+#define LSTIO_GROUP_UPDATE      0xC15	   /* update group */
+#define LSTIO_BATCH_ADD	 0xC20	   /* add batch */
+#define LSTIO_BATCH_START       0xC21	   /* start batch */
+#define LSTIO_BATCH_STOP	0xC22	   /* stop batch */
+#define LSTIO_BATCH_DEL	 0xC23	   /* delete batch */
+#define LSTIO_BATCH_LIST	0xC24	   /* show all batches in the session */
+#define LSTIO_BATCH_INFO	0xC25	   /* show detail of specified batch */
+#define LSTIO_TEST_ADD	  0xC26	   /* add test (to batch) */
+#define LSTIO_BATCH_QUERY       0xC27	   /* query batch status */
+#define LSTIO_STAT_QUERY	0xC30	   /* get stats */
+
+typedef struct {
+	lnet_nid_t	      ses_nid;		/* nid of console node */
+	__u64		   ses_stamp;	      /* time stamp */
+} lst_sid_t;					    /*** session id */
+
+extern lst_sid_t LST_INVALID_SID;
+
+typedef struct {
+	__u64		   bat_id;		 /* unique id in session */
+} lst_bid_t;					    /*** batch id (group of tests) */
+
+/* Status of test node */
+#define LST_NODE_ACTIVE	 0x1		     /* node in this session */
+#define LST_NODE_BUSY	   0x2		     /* node is taken by other session */
+#define LST_NODE_DOWN	   0x4		     /* node is down */
+#define LST_NODE_UNKNOWN	0x8		     /* node not in session */
+
+typedef struct {
+	lnet_process_id_t       nde_id;		 /* id of node */
+	int		     nde_state;	      /* state of node */
+} lstcon_node_ent_t;				    /*** node entry, for list_group command */
+
+typedef struct {
+	int		     nle_nnode;	      /* # of nodes */
+	int		     nle_nactive;	    /* # of active nodes */
+	int		     nle_nbusy;	      /* # of busy nodes */
+	int		     nle_ndown;	      /* # of down nodes */
+	int		     nle_nunknown;	   /* # of unknown nodes */
+} lstcon_ndlist_ent_t;				  /*** node_list entry, for list_batch command */
+
+typedef struct {
+	int		     tse_type;	       /* test type */
+	int		     tse_loop;	       /* loop count */
+	int		     tse_concur;	     /* concurrency of test */
+} lstcon_test_ent_t;				    /*** test summary entry, for list_batch command */
+
+typedef struct {
+	int		     bae_state;	      /* batch status */
+	int		     bae_timeout;	    /* batch timeout */
+	int		     bae_ntest;	      /* # of tests in the batch */
+} lstcon_batch_ent_t;				   /*** batch summary entry, for list_batch command */
+
+typedef struct {
+	lstcon_ndlist_ent_t     tbe_cli_nle;	    /* client (group) node_list entry */
+	lstcon_ndlist_ent_t     tbe_srv_nle;	    /* server (group) node_list entry */
+	union {
+		lstcon_test_ent_t  tbe_test;	    /* test entry */
+		lstcon_batch_ent_t tbe_batch;	   /* batch entry */
+	} u;
+} lstcon_test_batch_ent_t;			      /*** test/batch verbose information entry,
+							 *** for list_batch command */
+
+typedef struct {
+	struct list_head	      rpe_link;	       /* link chain */
+	lnet_process_id_t       rpe_peer;	       /* peer's id */
+	struct timeval	  rpe_stamp;	      /* time stamp of RPC */
+	int		     rpe_state;	      /* peer's state */
+	int		     rpe_rpc_errno;	  /* RPC errno */
+
+	lst_sid_t	       rpe_sid;		/* peer's session id */
+	int		     rpe_fwk_errno;	  /* framework errno */
+	int		     rpe_priv[4];	    /* private data */
+	char		    rpe_payload[0];	 /* private reply payload */
+} lstcon_rpc_ent_t;
+
+typedef struct {
+	int		     trs_rpc_stat[4];	/* RPCs stat (0: total, 1: success, 2: failure, 3: reserved) */
+	int		     trs_rpc_errno;	  /* RPC errno */
+	int		     trs_fwk_stat[8];	/* framework stat */
+	int		     trs_fwk_errno;	  /* errno of the first remote error */
+	void		   *trs_fwk_private;	/* private framework stat */
+} lstcon_trans_stat_t;
+
+static inline int
+lstcon_rpc_stat_total(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_rpc_stat[0] : stat->trs_rpc_stat[0];
+}
+
+static inline int
+lstcon_rpc_stat_success(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_rpc_stat[1] : stat->trs_rpc_stat[1];
+}
+
+static inline int
+lstcon_rpc_stat_failure(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_rpc_stat[2] : stat->trs_rpc_stat[2];
+}
+
+static inline int
+lstcon_sesop_stat_success(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0];
+}
+
+static inline int
+lstcon_sesop_stat_failure(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1];
+}
+
+static inline int
+lstcon_sesqry_stat_active(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0];
+}
+
+static inline int
+lstcon_sesqry_stat_busy(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1];
+}
+
+static inline int
+lstcon_sesqry_stat_unknown(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_fwk_stat[2] : stat->trs_fwk_stat[2];
+}
+
+static inline int
+lstcon_tsbop_stat_success(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0];
+}
+
+static inline int
+lstcon_tsbop_stat_failure(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1];
+}
+
+static inline int
+lstcon_tsbqry_stat_idle(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0];
+}
+
+static inline int
+lstcon_tsbqry_stat_run(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1];
+}
+
+static inline int
+lstcon_tsbqry_stat_failure(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_fwk_stat[2] : stat->trs_fwk_stat[2];
+}
+
+static inline int
+lstcon_statqry_stat_success(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0];
+}
+
+static inline int
+lstcon_statqry_stat_failure(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1];
+}
+
+/* create a session */
+typedef struct {
+	int		     lstio_ses_key;	  /* IN: local key */
+	int		     lstio_ses_timeout;      /* IN: session timeout */
+	int		     lstio_ses_force;	/* IN: force create ? */
+	/** IN: session features */
+	unsigned		lstio_ses_feats;
+	lst_sid_t	      *lstio_ses_idp;	  /* OUT: session id */
+	int		     lstio_ses_nmlen;	/* IN: name length */
+	char		   *lstio_ses_namep;	/* IN: session name */
+} lstio_session_new_args_t;
+
+/* query current session */
+typedef struct {
+	lst_sid_t	      *lstio_ses_idp;	  /* OUT: session id */
+	int		    *lstio_ses_keyp;	 /* OUT: local key */
+	/** OUT: session features */
+	unsigned	       *lstio_ses_featp;
+	lstcon_ndlist_ent_t    *lstio_ses_ndinfo;       /* OUT: */
+	int		     lstio_ses_nmlen;	/* IN: name length */
+	char		   *lstio_ses_namep;	/* OUT: session name */
+} lstio_session_info_args_t;
+
+/* delete a session */
+typedef struct {
+	int		     lstio_ses_key;	  /* IN: session key */
+} lstio_session_end_args_t;
+
+#define LST_OPC_SESSION	 1
+#define LST_OPC_GROUP	   2
+#define LST_OPC_NODES	   3
+#define LST_OPC_BATCHCLI	4
+#define LST_OPC_BATCHSRV	5
+
+typedef struct {
+	int		     lstio_dbg_key;	  /* IN: session key */
+	int		     lstio_dbg_type;	 /* IN: debug session|batch|group|nodes list */
+	int		     lstio_dbg_flags;	/* IN: reserved debug flags */
+	int		     lstio_dbg_timeout;      /* IN: timeout of debug */
+
+	int		     lstio_dbg_nmlen;	/* IN: len of name */
+	char		   *lstio_dbg_namep;	/* IN: name of group|batch */
+	int		     lstio_dbg_count;	/* IN: # of test nodes to debug */
+	lnet_process_id_t      *lstio_dbg_idsp;	 /* IN: id of test nodes */
+	struct list_head	     *lstio_dbg_resultp;      /* OUT: list head of result buffer */
+} lstio_debug_args_t;
+
+typedef struct {
+	int		     lstio_grp_key;	  /* IN: session key */
+	int		     lstio_grp_nmlen;	/* IN: name length */
+	char		   *lstio_grp_namep;	/* IN: group name */
+} lstio_group_add_args_t;
+
+typedef struct {
+	int		     lstio_grp_key;	  /* IN: session key */
+	int		     lstio_grp_nmlen;	/* IN: name length */
+	char		   *lstio_grp_namep;	/* IN: group name */
+} lstio_group_del_args_t;
+
+#define LST_GROUP_CLEAN	 1		       /* remove inactive nodes in the group */
+#define LST_GROUP_REFRESH       2		       /* refresh inactive nodes in the group */
+#define LST_GROUP_RMND	  3		       /* delete nodes from the group */
+
+typedef struct {
+	int		     lstio_grp_key;	  /* IN: session key */
+	int		     lstio_grp_opc;	  /* IN: OPC */
+	int		     lstio_grp_args;	 /* IN: arguments */
+	int		     lstio_grp_nmlen;	/* IN: name length */
+	char		   *lstio_grp_namep;	/* IN: group name */
+	int		     lstio_grp_count;	/* IN: # of nodes id */
+	lnet_process_id_t      *lstio_grp_idsp;	 /* IN: array of nodes */
+	struct list_head	     *lstio_grp_resultp;      /* OUT: list head of result buffer */
+} lstio_group_update_args_t;
+
+typedef struct {
+	int		     lstio_grp_key;	  /* IN: session key */
+	int		     lstio_grp_nmlen;	/* IN: name length */
+	char		   *lstio_grp_namep;	/* IN: group name */
+	int		     lstio_grp_count;	/* IN: # of nodes */
+	/** OUT: session features */
+	unsigned	       *lstio_grp_featp;
+	lnet_process_id_t      *lstio_grp_idsp;	 /* IN: nodes */
+	struct list_head	     *lstio_grp_resultp;      /* OUT: list head of result buffer */
+} lstio_group_nodes_args_t;
+
+typedef struct {
+	int		     lstio_grp_key;	  /* IN: session key */
+	int		     lstio_grp_idx;	  /* IN: group idx */
+	int		     lstio_grp_nmlen;	/* IN: name len */
+	char		   *lstio_grp_namep;	/* OUT: name */
+} lstio_group_list_args_t;
+
+typedef struct {
+	int		     lstio_grp_key;	  /* IN: session key */
+	int		     lstio_grp_nmlen;	/* IN: name len */
+	char		   *lstio_grp_namep;	/* IN: name */
+	lstcon_ndlist_ent_t    *lstio_grp_entp;	 /* OUT: description of group */
+
+	int		    *lstio_grp_idxp;	 /* IN/OUT: node index */
+	int		    *lstio_grp_ndentp;       /* IN/OUT: # of nodent */
+	lstcon_node_ent_t      *lstio_grp_dentsp;       /* OUT: nodent array */
+} lstio_group_info_args_t;
+
+#define LST_DEFAULT_BATCH       "batch"		 /* default batch name */
+
+typedef struct {
+	int		     lstio_bat_key;	  /* IN: session key */
+	int		     lstio_bat_nmlen;	/* IN: name length */
+	char		   *lstio_bat_namep;	/* IN: batch name */
+} lstio_batch_add_args_t;
+
+typedef struct {
+	int		     lstio_bat_key;	  /* IN: session key */
+	int		     lstio_bat_nmlen;	/* IN: name length */
+	char		   *lstio_bat_namep;	/* IN: batch name */
+} lstio_batch_del_args_t;
+
+typedef struct {
+	int		     lstio_bat_key;	  /* IN: session key */
+	int		     lstio_bat_timeout;      /* IN: timeout for the batch */
+	int		     lstio_bat_nmlen;	/* IN: name length */
+	char		   *lstio_bat_namep;	/* IN: batch name */
+	struct list_head	     *lstio_bat_resultp;      /* OUT: list head of result buffer */
+} lstio_batch_run_args_t;
+
+typedef struct {
+	int		     lstio_bat_key;	  /* IN: session key */
+	int		     lstio_bat_force;	/* IN: abort unfinished test RPC */
+	int		     lstio_bat_nmlen;	/* IN: name length */
+	char		   *lstio_bat_namep;	/* IN: batch name */
+	struct list_head	     *lstio_bat_resultp;      /* OUT: list head of result buffer */
+} lstio_batch_stop_args_t;
+
+typedef struct {
+	int		     lstio_bat_key;	  /* IN: session key */
+	int		     lstio_bat_testidx;      /* IN: test index */
+	int		     lstio_bat_client;       /* IN: is test client? */
+	int		     lstio_bat_timeout;      /* IN: timeout for waiting */
+	int		     lstio_bat_nmlen;	/* IN: name length */
+	char		   *lstio_bat_namep;	/* IN: batch name */
+	struct list_head	     *lstio_bat_resultp;      /* OUT: list head of result buffer */
+} lstio_batch_query_args_t;
+
+typedef struct {
+	int		     lstio_bat_key;	  /* IN: session key */
+	int		     lstio_bat_idx;	  /* IN: index */
+	int		     lstio_bat_nmlen;	/* IN: name length */
+	char		   *lstio_bat_namep;	/* IN: batch name */
+} lstio_batch_list_args_t;
+
+typedef struct {
+	int		     lstio_bat_key;	  /* IN: session key */
+	int		     lstio_bat_nmlen;	/* IN: name length */
+	char		   *lstio_bat_namep;	/* IN: name */
+	int		     lstio_bat_server;       /* IN: query server or not */
+	int		     lstio_bat_testidx;      /* IN: test index */
+	lstcon_test_batch_ent_t *lstio_bat_entp;	/* OUT: batch ent */
+
+	int		    *lstio_bat_idxp;	 /* IN/OUT: index of node */
+	int		    *lstio_bat_ndentp;       /* IN/OUT: # of nodent */
+	lstcon_node_ent_t      *lstio_bat_dentsp;       /* array of nodent */
+} lstio_batch_info_args_t;
+
+/* add stat in session */
+typedef struct {
+	int		     lstio_sta_key;	  /* IN: session key */
+	int		     lstio_sta_timeout;      /* IN: timeout for stat request */
+	int		     lstio_sta_nmlen;	/* IN: group name length */
+	char		   *lstio_sta_namep;	/* IN: group name */
+	int		     lstio_sta_count;	/* IN: # of pid */
+	lnet_process_id_t      *lstio_sta_idsp;	 /* IN: pid */
+	struct list_head	     *lstio_sta_resultp;      /* OUT: list head of result buffer */
+} lstio_stat_args_t;
+
+typedef enum {
+	LST_TEST_BULK   = 1,
+	LST_TEST_PING   = 2
+} lst_test_type_t;
+
+/* create a test in a batch */
+#define LST_MAX_CONCUR	  1024		    /* Max concurrency of test */
+
+typedef struct {
+	int		     lstio_tes_key;	  /* IN: session key */
+	int		     lstio_tes_bat_nmlen;    /* IN: batch name len */
+	char		   *lstio_tes_bat_name;     /* IN: batch name */
+	int		     lstio_tes_type;	 /* IN: test type */
+	int		     lstio_tes_oneside;      /* IN: one sided test */
+	int		     lstio_tes_loop;	 /* IN: loop count */
+	int		     lstio_tes_concur;       /* IN: concurrency */
+
+	int		     lstio_tes_dist;	 /* IN: node distribution in destination groups */
+	int		     lstio_tes_span;	 /* IN: node span in destination groups */
+	int		     lstio_tes_sgrp_nmlen;   /* IN: source group name length */
+	char		   *lstio_tes_sgrp_name;    /* IN: group name */
+	int		     lstio_tes_dgrp_nmlen;   /* IN: destination group name length */
+	char		   *lstio_tes_dgrp_name;    /* IN: group name */
+
+	int		     lstio_tes_param_len;    /* IN: param buffer len */
+	void		   *lstio_tes_param;	/* IN: parameter for specified test:
+							       lstio_bulk_param_t,
+							       lstio_ping_param_t,
+							       ... more */
+	int		    *lstio_tes_retp;	 /* OUT: private returned value */
+	struct list_head	     *lstio_tes_resultp;      /* OUT: list head of result buffer */
+} lstio_test_args_t;
+
+typedef enum {
+	LST_BRW_READ    = 1,
+	LST_BRW_WRITE   = 2
+} lst_brw_type_t;
+
+typedef enum {
+	LST_BRW_CHECK_NONE   = 1,
+	LST_BRW_CHECK_SIMPLE = 2,
+	LST_BRW_CHECK_FULL   = 3
+} lst_brw_flags_t;
+
+typedef struct {
+	int		     blk_opc;		/* bulk operation code */
+	int		     blk_size;	       /* size (bytes) */
+	int		     blk_time;	       /* time of running the test*/
+	int		     blk_flags;	      /* reserved flags */
+} lst_test_bulk_param_t;
+
+typedef struct {
+	int		     png_size;	       /* size of ping message */
+	int		     png_time;	       /* time */
+	int		     png_loop;	       /* loop */
+	int		     png_flags;	      /* reserved flags */
+} lst_test_ping_param_t;
+
+/* more tests */
+typedef struct {
+	__u32 errors;
+	__u32 rpcs_sent;
+	__u32 rpcs_rcvd;
+	__u32 rpcs_dropped;
+	__u32 rpcs_expired;
+	__u64 bulk_get;
+	__u64 bulk_put;
+} WIRE_ATTR srpc_counters_t;
+
+typedef struct {
+	/** milliseconds since current session started */
+	__u32 running_ms;
+	__u32 active_batches;
+	__u32 zombie_sessions;
+	__u32 brw_errors;
+	__u32 ping_errors;
+} WIRE_ATTR sfw_counters_t;
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/ptllnd.h b/drivers/staging/lustre/include/linux/lnet/ptllnd.h
new file mode 100644
index 000000000000..fc1ce8ed1f8b
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/ptllnd.h
@@ -0,0 +1,94 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/include/lnet/ptllnd.h
+ *
+ * Author: PJ Kirner <pjkirner@clusterfs.com>
+ */
+
+/*
+ * The PTLLND was designed to support Portals with
+ * Lustre and non-lustre UNLINK semantics.
+ * However for now the two targets are Cray Portals
+ * on the XT3 and Lustre Portals (for testing) both
+ * have Lustre UNLINK semantics, so this is defined
+ * by default.
+ */
+#define LUSTRE_PORTALS_UNLINK_SEMANTICS
+
+
+#ifdef _USING_LUSTRE_PORTALS_
+
+/* NIDs are 64-bits on Lustre Portals */
+#define FMT_NID LPU64
+#define FMT_PID "%d"
+
+/* When using Lustre Portals Lustre completion semantics are implicit */
+#define PTL_MD_LUSTRE_COMPLETION_SEMANTICS      0
+
+#else /* _USING_CRAY_PORTALS_ */
+
+/* NIDs are integers on Cray Portals */
+#define FMT_NID "%u"
+#define FMT_PID "%d"
+
+/* When using Cray Portals this is defined in the Cray Portals Header */
+/*#define PTL_MD_LUSTRE_COMPLETION_SEMANTICS */
+
+/* Can compare handles directly on Cray Portals */
+#define PtlHandleIsEqual(a,b) ((a) == (b))
+
+/* Different error types on Cray Portals */
+#define ptl_err_t ptl_ni_fail_t
+
+/*
+ * The Cray Portals has no maximum number of IOVs.  The
+ * maximum is limited only by memory and size of the
+ * int parameters (2^31-1).
+ * Lustre only really requires the underlying
+ * implementation to support at least LNET_MAX_IOV,
+ * so for Cray portals we can safely just use that
+ * value here.
+ *
+ */
+#define PTL_MD_MAX_IOV	  LNET_MAX_IOV
+
+#endif
+
+#define FMT_PTLID "ptlid:"FMT_PID"-"FMT_NID
+
+/* Align incoming small request messages to an 8 byte boundary if this is
+ * supported to avoid alignment issues on some architectures */
+#ifndef PTL_MD_LOCAL_ALIGN8
+# define PTL_MD_LOCAL_ALIGN8 0
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/ptllnd_wire.h b/drivers/staging/lustre/include/linux/lnet/ptllnd_wire.h
new file mode 100644
index 000000000000..7d12b3a23a96
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/ptllnd_wire.h
@@ -0,0 +1,124 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/include/lnet/ptllnd_wire.h
+ *
+ * Author: PJ Kirner <pjkirner@clusterfs.com>
+ */
+
+/* Minimum buffer size that any peer will post to receive ptllnd messages */
+#define PTLLND_MIN_BUFFER_SIZE  256
+
+/************************************************************************
+ * Tunable defaults that {u,k}lnds/ptllnd should have in common.
+ */
+
+#define PTLLND_PORTAL	   9	  /* The same portal PTLRPC used when talking to cray portals */
+#define PTLLND_PID	      9	  /* The Portals PID */
+#define PTLLND_PEERCREDITS      8	  /* concurrent sends to 1 peer */
+
+/* Default buffer size for kernel ptllnds (guaranteed eager) */
+#define PTLLND_MAX_KLND_MSG_SIZE 512
+
+/* Default buffer size for catamount ptllnds (not guaranteed eager) - large
+ * enough to avoid RDMA for anything sent while control is not in liblustre */
+#define PTLLND_MAX_ULND_MSG_SIZE 512
+
+
+/************************************************************************
+ * Portals LND Wire message format.
+ * These are sent in sender's byte order (i.e. receiver flips).
+ */
+
+#define PTL_RESERVED_MATCHBITS  0x100	/* below this value is reserved
+					 * above is for bulk data transfer */
+#define LNET_MSG_MATCHBITS       0      /* the value for the message channel */
+
+typedef struct
+{
+	lnet_hdr_t	kptlim_hdr;	     /* portals header */
+	char	      kptlim_payload[0];      /* piggy-backed payload */
+} WIRE_ATTR kptl_immediate_msg_t;
+
+typedef struct
+{
+	lnet_hdr_t	kptlrm_hdr;	     /* portals header */
+	__u64	     kptlrm_matchbits;       /* matchbits */
+} WIRE_ATTR kptl_rdma_msg_t;
+
+typedef struct
+{
+	__u64	     kptlhm_matchbits;       /* matchbits */
+	__u32	     kptlhm_max_msg_size;    /* max message size */
+} WIRE_ATTR kptl_hello_msg_t;
+
+typedef struct
+{
+	/* First 2 fields fixed FOR ALL TIME */
+	__u32	   ptlm_magic;     /* I'm a Portals LND message */
+	__u16	   ptlm_version;   /* this is my version number */
+	__u8	    ptlm_type;      /* the message type */
+	__u8	    ptlm_credits;   /* returned credits */
+	__u32	   ptlm_nob;       /* # bytes in whole message */
+	__u32	   ptlm_cksum;     /* checksum (0 == no checksum) */
+	__u64	   ptlm_srcnid;    /* sender's NID */
+	__u64	   ptlm_srcstamp;  /* sender's incarnation */
+	__u64	   ptlm_dstnid;    /* destination's NID */
+	__u64	   ptlm_dststamp;  /* destination's incarnation */
+	__u32	   ptlm_srcpid;    /* sender's PID */
+	__u32	   ptlm_dstpid;    /* destination's PID */
+
+	 union {
+		kptl_immediate_msg_t    immediate;
+		kptl_rdma_msg_t	 rdma;
+		kptl_hello_msg_t	hello;
+	} WIRE_ATTR ptlm_u;
+
+} kptl_msg_t;
+
+/* kptl_msg_t::ptlm_credits is only a __u8 */
+#define PTLLND_MSG_MAX_CREDITS ((typeof(((kptl_msg_t*) 0)->ptlm_credits)) -1)
+
+#define PTLLND_MSG_MAGIC		LNET_PROTO_PTL_MAGIC
+#define PTLLND_MSG_VERSION	      0x04
+
+#define PTLLND_RDMA_OK		  0x00
+#define PTLLND_RDMA_FAIL		0x01
+
+#define PTLLND_MSG_TYPE_INVALID	 0x00
+#define PTLLND_MSG_TYPE_PUT	     0x01
+#define PTLLND_MSG_TYPE_GET	     0x02
+#define PTLLND_MSG_TYPE_IMMEDIATE       0x03    /* No bulk data xfer*/
+#define PTLLND_MSG_TYPE_NOOP	    0x04
+#define PTLLND_MSG_TYPE_HELLO	   0x05
+#define PTLLND_MSG_TYPE_NAK	     0x06
diff --git a/drivers/staging/lustre/include/linux/lnet/socklnd.h b/drivers/staging/lustre/include/linux/lnet/socklnd.h
new file mode 100644
index 000000000000..bacc74933a39
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/socklnd.h
@@ -0,0 +1,103 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/include/lnet/socklnd.h
+ *
+ * #defines shared between socknal implementation and utilities
+ */
+#ifndef __LNET_LNET_SOCKLND_H__
+#define __LNET_LNET_SOCKLND_H__
+
+#include <linux/lnet/types.h>
+#include <linux/lnet/lib-types.h>
+
+#define SOCKLND_CONN_NONE     (-1)
+#define SOCKLND_CONN_ANY	0
+#define SOCKLND_CONN_CONTROL    1
+#define SOCKLND_CONN_BULK_IN    2
+#define SOCKLND_CONN_BULK_OUT   3
+#define SOCKLND_CONN_NTYPES     4
+
+#define SOCKLND_CONN_ACK	SOCKLND_CONN_BULK_IN
+
+typedef struct {
+	__u32		   kshm_magic;     /* magic number of socklnd message */
+	__u32		   kshm_version;   /* version of socklnd message */
+	lnet_nid_t	      kshm_src_nid;   /* sender's nid */
+	lnet_nid_t	      kshm_dst_nid;   /* destination nid */
+	lnet_pid_t	      kshm_src_pid;   /* sender's pid */
+	lnet_pid_t	      kshm_dst_pid;   /* destination pid */
+	__u64		   kshm_src_incarnation; /* sender's incarnation */
+	__u64		   kshm_dst_incarnation; /* destination's incarnation */
+	__u32		   kshm_ctype;     /* connection type */
+	__u32		   kshm_nips;      /* # IP addrs */
+	__u32		   kshm_ips[0];    /* IP addrs */
+} WIRE_ATTR ksock_hello_msg_t;
+
+typedef struct {
+	lnet_hdr_t	      ksnm_hdr;       /* lnet hdr */
+
+	/*
+	 * ksnm_payload is removed because of winnt compiler's limitation:
+	 * zero-sized array can only be placed at the tail of [nested]
+	 * structure definitions. lnet payload will be stored just after
+	 * the body of structure ksock_lnet_msg_t
+	 */
+} WIRE_ATTR ksock_lnet_msg_t;
+
+typedef struct {
+	__u32		   ksm_type;       /* type of socklnd message */
+	__u32		   ksm_csum;       /* checksum if != 0 */
+	__u64		   ksm_zc_cookies[2]; /* Zero-Copy request/ACK cookie */
+	union {
+		ksock_lnet_msg_t lnetmsg;       /* lnet message, it's empty if it's NOOP */
+	} WIRE_ATTR ksm_u;
+} WIRE_ATTR ksock_msg_t;
+
+/* Set \a msg's type to \a type; clear its checksum and both ZC cookies. */
+static inline void socklnd_init_msg(ksock_msg_t *msg, int type)
+{
+	msg->ksm_type = type;
+	msg->ksm_csum = 0;
+	msg->ksm_zc_cookies[1] = msg->ksm_zc_cookies[0] = 0;
+}
+
+#define KSOCK_MSG_NOOP	  0xc0	    /* ksm_u empty */
+#define KSOCK_MSG_LNET	  0xc1	    /* lnet msg */
+
+/* We need to know this number to parse hello msg from ksocklnd in
+ * other LND (usocklnd, for example) */
+#define KSOCK_PROTO_V2	  2
+#define KSOCK_PROTO_V3	  3
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/types.h b/drivers/staging/lustre/include/linux/lnet/types.h
new file mode 100644
index 000000000000..4f63b7acb9d7
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/types.h
@@ -0,0 +1,503 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_TYPES_H__
+#define __LNET_TYPES_H__
+
+/** \addtogroup lnet
+ * @{ */
+
+#include <linux/libcfs/libcfs.h>
+
+/** \addtogroup lnet_addr
+ * @{ */
+
+/** Portal reserved for LNet's own use.
+ * \see lustre/include/lustre/lustre_idl.h for Lustre portal assignments.
+ */
+#define LNET_RESERVED_PORTAL      0
+
+/**
+ * Address of an end-point in an LNet network.
+ *
+ * A node can have multiple end-points and hence multiple addresses.
+ * An LNet network can be a simple network (e.g. tcp0) or a network of
+ * LNet networks connected by LNet routers. Therefore an end-point address
+ * has two parts: network ID, and address within a network.
+ *
+ * \see LNET_NIDNET, LNET_NIDADDR, and LNET_MKNID.
+ */
+typedef __u64 lnet_nid_t;
+/**
+ * ID of a process in a node. Shortened as PID to distinguish from
+ * lnet_process_id_t, the global process ID.
+ */
+typedef __u32 lnet_pid_t;
+
+/** wildcard NID that matches any end-point address */
+#define LNET_NID_ANY      ((lnet_nid_t) -1)
+/** wildcard PID that matches any lnet_pid_t */
+#define LNET_PID_ANY      ((lnet_pid_t) -1)
+
+#define LNET_PID_RESERVED 0xf0000000 /* reserved bits in PID */
+#define LNET_PID_USERFLAG 0x80000000 /* set in userspace peers */
+
+#define LNET_TIME_FOREVER    (-1)
+
+/**
+ * Objects maintained by the LNet are accessed through handles. Handle types
+ * have names of the form lnet_handle_xx_t, where xx is one of the two letter
+ * object type codes ('eq' for event queue, 'md' for memory descriptor, and
+ * 'me' for match entry).
+ * Each type of object is given a unique handle type to enhance type checking.
+ * The type lnet_handle_any_t can be used when a generic handle is needed.
+ * Every handle value can be converted into a value of type lnet_handle_any_t
+ * without loss of information.
+ */
+typedef struct {
+	__u64	 cookie;
+} lnet_handle_any_t;
+
+typedef lnet_handle_any_t lnet_handle_eq_t;
+typedef lnet_handle_any_t lnet_handle_md_t;
+typedef lnet_handle_any_t lnet_handle_me_t;
+
+#define LNET_WIRE_HANDLE_COOKIE_NONE   (-1)
+
+/** Mark handle \a h invalid by storing the reserved "none" cookie
+ * (LNET_WIRE_HANDLE_COOKIE_NONE) in it. */
+static inline void
+LNetInvalidateHandle(lnet_handle_any_t *h)
+{
+	h->cookie = LNET_WIRE_HANDLE_COOKIE_NONE;
+}
+
+/**
+ * Test whether handles \a h1 and \a h2 carry the same cookie.
+ * \return 1 when the cookies match, 0 when they differ.
+ */
+static inline int
+LNetHandleIsEqual(lnet_handle_any_t h1, lnet_handle_any_t h2)
+{
+	return h1.cookie == h2.cookie;
+}
+
+/**
+ * Test whether handle \a h holds the reserved "none" cookie.
+ * \return 1 if the handle is invalid, 0 if it is valid.
+ */
+static inline int
+LNetHandleIsInvalid(lnet_handle_any_t h)
+{
+	return h.cookie == LNET_WIRE_HANDLE_COOKIE_NONE;
+}
+
+/**
+ * Global process ID.
+ */
+typedef struct {
+	/** node id */
+	lnet_nid_t nid;
+	/** process id */
+	lnet_pid_t pid;
+} lnet_process_id_t;
+/** @} lnet_addr */
+
+/** \addtogroup lnet_me
+ * @{ */
+
+/**
+ * Specifies whether the match entry or memory descriptor should be unlinked
+ * automatically (LNET_UNLINK) or not (LNET_RETAIN).
+ */
+typedef enum {
+	LNET_RETAIN = 0,
+	LNET_UNLINK
+} lnet_unlink_t;
+
+/**
+ * Values of the type lnet_ins_pos_t are used to control where a new match
+ * entry is inserted. The value LNET_INS_BEFORE is used to insert the new
+ * entry before the current entry or before the head of the list. The value
+ * LNET_INS_AFTER is used to insert the new entry after the current entry
+ * or after the last item in the list.
+ */
+typedef enum {
+	/** insert ME before current position or head of the list */
+	LNET_INS_BEFORE,
+	/** insert ME after current position or tail of the list */
+	LNET_INS_AFTER,
+	/** attach ME at tail of local CPU partition ME list */
+	LNET_INS_LOCAL
+} lnet_ins_pos_t;
+
+/** @} lnet_me */
+
+/** \addtogroup lnet_md
+ * @{ */
+
+/**
+ * Defines the visible parts of a memory descriptor. Values of this type
+ * are used to initialize memory descriptors.
+ */
+typedef struct {
+	/**
+	 * Specify the memory region associated with the memory descriptor.
+	 * If the options field has:
+	 * - LNET_MD_KIOV bit set: The start field points to the starting
+	 * address of an array of lnet_kiov_t and the length field specifies
+	 * the number of entries in the array. The length can't be bigger
+	 * than LNET_MAX_IOV. The lnet_kiov_t is used to describe page-based
+	 * fragments that are not necessarily mapped in virtual memory.
+	 * - LNET_MD_IOVEC bit set: The start field points to the starting
+	 * address of an array of struct iovec and the length field specifies
+	 * the number of entries in the array. The length can't be bigger
+	 * than LNET_MAX_IOV. The struct iovec is used to describe fragments
+	 * that have virtual addresses.
+	 * - Otherwise: The memory region is contiguous. The start field
+	 * specifies the starting address for the memory region and the
+	 * length field specifies its length.
+	 *
+	 * When the memory region is fragmented, all fragments but the first
+	 * one must start on page boundary, and all but the last must end on
+	 * page boundary.
+	 */
+	void	    *start;
+	unsigned int     length;
+	/**
+	 * Specifies the maximum number of operations that can be performed
+	 * on the memory descriptor. An operation is any action that could
+	 * possibly generate an event. In the usual case, the threshold value
+	 * is decremented for each operation on the MD. When the threshold
+	 * drops to zero, the MD becomes inactive and does not respond to
+	 * operations. A threshold value of LNET_MD_THRESH_INF indicates that
+	 * there is no bound on the number of operations that may be applied
+	 * to a MD.
+	 */
+	int	      threshold;
+	/**
+	 * Specifies the largest incoming request that the memory descriptor
+	 * should respond to. When the unused portion of a MD (length -
+	 * local offset) falls below this value, the MD becomes inactive and
+	 * does not respond to further operations. This value is only used
+	 * if the LNET_MD_MAX_SIZE option is set.
+	 */
+	int	      max_size;
+	/**
+	 * Specifies the behavior of the memory descriptor. A bitwise OR
+	 * of the following values can be used:
+	 * - LNET_MD_OP_PUT: The LNet PUT operation is allowed on this MD.
+	 * - LNET_MD_OP_GET: The LNet GET operation is allowed on this MD.
+	 * - LNET_MD_MANAGE_REMOTE: The offset used in accessing the memory
+	 *   region is provided by the incoming request. By default, the
+	 *   offset is maintained locally. When maintained locally, the
+	 *   offset is incremented by the length of the request so that
+	 *   the next operation (PUT or GET) will access the next part of
+	 *   the memory region. Note that only one offset variable exists
+	 *   per memory descriptor. If both PUT and GET operations are
+	 *   performed on a memory descriptor, the offset is updated each time.
+	 * - LNET_MD_TRUNCATE: The length provided in the incoming request can
+	 *   be reduced to match the memory available in the region (determined
+	 *   by subtracting the offset from the length of the memory region).
+	 *   By default, if the length in the incoming operation is greater
+	 *   than the amount of memory available, the operation is rejected.
+	 * - LNET_MD_ACK_DISABLE: An acknowledgment should not be sent for
+	 *   incoming PUT operations, even if requested. By default,
+	 *   acknowledgments are sent for PUT operations that request an
+	 *   acknowledgment. Acknowledgments are never sent for GET operations.
+	 *   The data sent in the REPLY serves as an implicit acknowledgment.
+	 * - LNET_MD_KIOV: The start and length fields specify an array of
+	 *   lnet_kiov_t.
+	 * - LNET_MD_IOVEC: The start and length fields specify an array of
+	 *   struct iovec.
+	 * - LNET_MD_MAX_SIZE: The max_size field is valid.
+	 *
+	 * Note:
+	 * - LNET_MD_KIOV or LNET_MD_IOVEC allows for a scatter/gather
+	 *   capability for memory descriptors. They can't be both set.
+	 * - When LNET_MD_MAX_SIZE is set, the total length of the memory
+	 *   region (i.e. sum of all fragment lengths) must not be less than
+	 *   \a max_size.
+	 */
+	unsigned int     options;
+	/**
+	 * A user-specified value that is associated with the memory
+	 * descriptor. The value does not need to be a pointer, but must fit
+	 * in the space used by a pointer. This value is recorded in events
+	 * associated with operations on this MD.
+	 */
+	void	    *user_ptr;
+	/**
+	 * A handle for the event queue used to log the operations performed on
+	 * the memory region. If this argument is a NULL handle (i.e. nullified
+	 * by LNetInvalidateHandle()), operations performed on this memory
+	 * descriptor are not logged.
+	 */
+	lnet_handle_eq_t eq_handle;
+} lnet_md_t;
+
+/* Max Transfer Unit (minimum supported everywhere).
+ * CAVEAT EMPTOR, with multinet (i.e. routers forwarding between networks)
+ * these limits are system wide and not interface-local. */
+#define LNET_MTU_BITS	20
+#define LNET_MTU	(1 << LNET_MTU_BITS)
+
+/** limit on the number of fragments in discontiguous MDs */
+#define LNET_MAX_IOV    256
+
+/* Max payload size, taken from Kconfig and range-checked at compile time */
+# define LNET_MAX_PAYLOAD	CONFIG_LNET_MAX_PAYLOAD
+# if (LNET_MAX_PAYLOAD < LNET_MTU)
+#  error "LNET_MAX_PAYLOAD too small - check CONFIG_LNET_MAX_PAYLOAD"
+# else
+#  if (LNET_MAX_PAYLOAD > (PAGE_SIZE * LNET_MAX_IOV))
+/*  PAGE_SIZE is a constant, so this upper bound can be checked by cpp too */
+#   error "LNET_MAX_PAYLOAD too large - check CONFIG_LNET_MAX_PAYLOAD"
+#  endif
+# endif
+
+/**
+ * Options for the MD structure. See lnet_md_t::options.
+ */
+#define LNET_MD_OP_PUT	       (1 << 0)
+/** See lnet_md_t::options. */
+#define LNET_MD_OP_GET	       (1 << 1)
+/** See lnet_md_t::options. */
+#define LNET_MD_MANAGE_REMOTE	(1 << 2)
+/* unused			    (1 << 3) */
+/** See lnet_md_t::options. */
+#define LNET_MD_TRUNCATE	     (1 << 4)
+/** See lnet_md_t::options. */
+#define LNET_MD_ACK_DISABLE	  (1 << 5)
+/** See lnet_md_t::options. */
+#define LNET_MD_IOVEC		(1 << 6)
+/** See lnet_md_t::options. */
+#define LNET_MD_MAX_SIZE	     (1 << 7)
+/** See lnet_md_t::options. */
+#define LNET_MD_KIOV		 (1 << 8)
+
+/* For compatibility with Cray Portals */
+#define LNET_MD_PHYS			 0
+
+/** Infinite threshold on MD operations. See lnet_md_t::threshold */
+#define LNET_MD_THRESH_INF       (-1)
+
+/* NB lustre portals uses struct iovec internally! */
+typedef struct iovec lnet_md_iovec_t;
+
+/**
+ * A page-based fragment of a MD.
+ */
+typedef struct {
+	/** Pointer to the page where the fragment resides */
+	struct page      *kiov_page;
+	/** Length in bytes of the fragment */
+	unsigned int     kiov_len;
+	/**
+	 * Starting offset of the fragment within the page. Note that the
+	 * end of the fragment must not pass the end of the page; i.e.,
+	 * kiov_len + kiov_offset <= PAGE_CACHE_SIZE.
+	 */
+	unsigned int     kiov_offset;
+} lnet_kiov_t;
+/** @} lnet_md */
+
+/** \addtogroup lnet_eq
+ * @{ */
+
+/**
+ * Six types of events can be logged in an event queue.
+ */
+typedef enum {
+	/** An incoming GET operation has completed on the MD. */
+	LNET_EVENT_GET		= 1,
+	/**
+	 * An incoming PUT operation has completed on the MD. The
+	 * underlying layers will not alter the memory (on behalf of this
+	 * operation) once this event has been logged.
+	 */
+	LNET_EVENT_PUT,
+	/**
+	 * A REPLY operation has completed. This event is logged after the
+	 * data (if any) from the REPLY has been written into the MD.
+	 */
+	LNET_EVENT_REPLY,
+	/** An acknowledgment has been received. */
+	LNET_EVENT_ACK,
+	/**
+	 * An outgoing send (PUT or GET) operation has completed. This event
+	 * is logged after the entire buffer has been sent and it is safe for
+	 * the caller to reuse the buffer.
+	 *
+	 * Note:
+	 * - The LNET_EVENT_SEND doesn't guarantee message delivery. It can
+	 *   happen even when the message has not yet been put out on wire.
+	 * - It's unsafe to assume that in an outgoing GET operation
+	 *   the LNET_EVENT_SEND event would happen before the
+	 *   LNET_EVENT_REPLY event. The same holds for LNET_EVENT_SEND and
+	 *   LNET_EVENT_ACK events in an outgoing PUT operation.
+	 */
+	LNET_EVENT_SEND,
+	/**
+	 * A MD has been unlinked. Note that LNetMDUnlink() does not
+	 * necessarily trigger an LNET_EVENT_UNLINK event.
+	 * \see LNetMDUnlink
+	 */
+	LNET_EVENT_UNLINK,
+} lnet_event_kind_t;
+
+#define LNET_SEQ_BASETYPE       long
+typedef unsigned LNET_SEQ_BASETYPE lnet_seq_t;
+#define LNET_SEQ_GT(a,b)	(((signed LNET_SEQ_BASETYPE)((a) - (b))) > 0)
+
+/* XXX
+ * cygwin need the pragma line, not clear if it's needed in other places.
+ * checking!!!
+ */
+#ifdef __CYGWIN__
+#pragma pack(push, 4)
+#endif
+
+/**
+ * Information about an event on a MD.
+ */
+typedef struct {
+	/** The identifier (nid, pid) of the target. */
+	lnet_process_id_t   target;
+	/** The identifier (nid, pid) of the initiator. */
+	lnet_process_id_t   initiator;
+	/**
+	 * The NID of the immediate sender. If the request has been forwarded
+	 * by routers, this is the NID of the last hop; otherwise it's the
+	 * same as the initiator.
+	 */
+	lnet_nid_t	  sender;
+	/** Indicates the type of the event. */
+	lnet_event_kind_t   type;
+	/** The portal table index specified in the request */
+	unsigned int	pt_index;
+	/** A copy of the match bits specified in the request. */
+	__u64	       match_bits;
+	/** The length (in bytes) specified in the request. */
+	unsigned int	rlength;
+	/**
+	 * The length (in bytes) of the data that was manipulated by the
+	 * operation. For truncated operations, the manipulated length will be
+	 * the number of bytes specified by the MD (possibly with an offset,
+	 * see lnet_md_t). For all other operations, the manipulated length
+	 * will be the length of the requested operation, i.e. rlength.
+	 */
+	unsigned int	mlength;
+	/**
+	 * The handle to the MD associated with the event. The handle may be
+	 * invalid if the MD has been unlinked.
+	 */
+	lnet_handle_md_t    md_handle;
+	/**
+	 * A snapshot of the state of the MD immediately after the event has
+	 * been processed. In particular, the threshold field in md will
+	 * reflect the value of the threshold after the operation occurred.
+	 */
+	lnet_md_t	   md;
+	/**
+	 * 64 bits of out-of-band user data. Only valid for LNET_EVENT_PUT.
+	 * \see LNetPut
+	 */
+	__u64	       hdr_data;
+	/**
+	 * Indicates the completion status of the operation. It's 0 for
+	 * successful operations, otherwise it's an error code.
+	 */
+	int		 status;
+	/**
+	 * Indicates whether the MD has been unlinked. Note that:
+	 * - An event with unlinked set is the last event on the MD.
+	 * - This field is also set for an explicit LNET_EVENT_UNLINK event.
+	 * \see LNetMDUnlink
+	 */
+	int		 unlinked;
+	/**
+	 * The displacement (in bytes) into the memory region that the
+	 * operation used. The offset can be determined by the operation for
+	 * a remote managed MD or by the local MD.
+	 * \see lnet_md_t::options
+	 */
+	unsigned int	offset;
+	/**
+	 * The sequence number for this event. Sequence numbers are unique
+	 * to each event.
+	 */
+	volatile lnet_seq_t sequence;
+} lnet_event_t;
+#ifdef __CYGWIN__ /* undo the pack(push, 4) applied before lnet_event_t */
+#pragma pack(pop)
+#endif
+
+/**
+ * Event queue handler function type.
+ *
+ * The EQ handler runs for each event that is deposited into the EQ. The
+ * handler is supplied with a pointer to the event that triggered the
+ * handler invocation.
+ *
+ * The handler must not block, must be reentrant, and must not call any LNet
+ * API functions. It should return as quickly as possible.
+ */
+typedef void (*lnet_eq_handler_t)(lnet_event_t *event);
+#define LNET_EQ_HANDLER_NONE NULL
+/** @} lnet_eq */
+
+/** \addtogroup lnet_data
+ * @{ */
+
+/**
+ * Specify whether an acknowledgment should be sent by target when the PUT
+ * operation completes (i.e., when the data has been written to a MD of the
+ * target process).
+ *
+ * \see lnet_md_t::options for the discussion on LNET_MD_ACK_DISABLE by which
+ * acknowledgments can be disabled for a MD.
+ */
+typedef enum {
+	/** Request an acknowledgment */
+	LNET_ACK_REQ,
+	/** Request that no acknowledgment should be generated. */
+	LNET_NOACK_REQ
+} lnet_ack_req_t;
+/** @} lnet_data */
+
+/** @} lnet */
+#endif
diff --git a/drivers/staging/lustre/lnet/Kconfig b/drivers/staging/lustre/lnet/Kconfig
new file mode 100644
index 000000000000..00850eeb6a8c
--- /dev/null
+++ b/drivers/staging/lustre/lnet/Kconfig
@@ -0,0 +1,40 @@
+config LNET
+	tristate "Lustre networking subsystem"
+	depends on LUSTRE_FS
+
+config LNET_MAX_PAYLOAD
+	int "Lustre lnet max transfer payload (default 1MB)"
+	depends on LUSTRE_FS
+	default "1048576"
+	help
+	  This option defines the maximum size of payload in bytes that lnet
+	  can put into its transport.
+
+	  If unsure, use default.
+
+config LNET_SELFTEST
+	tristate "Lustre networking self testing"
+	depends on LNET
+	help
+	  Choose Y here if you want to do lnet self testing.
+
+	  To compile this as a kernel module, choose M here and it will be
+	  called lnet_selftest.
+
+	  If unsure, say N.
+
+	  See also http://wiki.lustre.org/
+
+config LNET_XPRT_IB
+	tristate "LNET infiniband support"
+	depends on LNET && INFINIBAND && INFINIBAND_ADDR_TRANS
+	default LNET && INFINIBAND
+	help
+	  This option allows the LNET users to use infiniband as an
+	  RDMA-enabled transport.
+
+	  To compile this as a kernel module, choose M here and it will be
+	  called ko2iblnd.
+
+	  If unsure, say N.
diff --git a/drivers/staging/lustre/lnet/Makefile b/drivers/staging/lustre/lnet/Makefile
new file mode 100644
index 000000000000..374212b1555a
--- /dev/null
+++ b/drivers/staging/lustre/lnet/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_LNET) := klnds/ lnet/ selftest/
diff --git a/drivers/staging/lustre/lnet/klnds/Makefile b/drivers/staging/lustre/lnet/klnds/Makefile
new file mode 100644
index 000000000000..c23e4f67f837
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_LNET) += o2iblnd/  socklnd/
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile b/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile
new file mode 100644
index 000000000000..71b7d8418357
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LNET_XPRT_IB) += ko2iblnd.o
+ko2iblnd-y := o2iblnd.o o2iblnd_cb.o o2iblnd_modparams.o
+
+
+ccflags-y := -I$(src)/../../include
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
new file mode 100644
index 000000000000..f4b958bbe5a3
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
@@ -0,0 +1,3256 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd.c
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include <linux/math64.h>
+
+#include "o2iblnd.h"
+
+/*
+ * LND descriptor for this driver: identifies it as O2IBLND and lists the
+ * kiblnd_* entry points for startup, shutdown, control requests, peer
+ * queries, sends and receives.
+ */
+lnd_t the_o2iblnd = {
+	.lnd_type       = O2IBLND,
+	.lnd_startup    = kiblnd_startup,
+	.lnd_shutdown   = kiblnd_shutdown,
+	.lnd_ctl	= kiblnd_ctl,
+	.lnd_query      = kiblnd_query,
+	.lnd_send       = kiblnd_send,
+	.lnd_recv       = kiblnd_recv,
+};
+
+/* Driver-wide state: the peer hash table (kib_peers), the global rwlock
+ * (kib_global_lock), the per-CPT schedulers (kib_scheds), the failed device
+ * list, etc. */
+kib_data_t	      kiblnd_data;
+
+/*
+ * Compute the message checksum over 'nob' bytes starting at 'ptr'.
+ *
+ * Each step rotates the running sum left by one bit and adds the next byte
+ * (read through a plain 'char' pointer).  A computed value of 0 is reported
+ * as 1, because 0 is reserved to mean "no checksum".
+ */
+__u32
+kiblnd_cksum (void *ptr, int nob)
+{
+	char  *bytes = ptr;
+	__u32  acc = 0;
+	int    idx;
+
+	for (idx = 0; idx < nob; idx++)
+		acc = ((acc << 1) | (acc >> 31)) + bytes[idx];
+
+	/* never hand back 0: that value means "not checksummed" */
+	if (acc == 0)
+		return 1;
+
+	return acc;
+}
+
+/* Map an IBLND_MSG_* type to a printable name; unknown types give "???". */
+static char *
+kiblnd_msgtype2str(int type)
+{
+	char *name;
+
+	switch (type) {
+	case IBLND_MSG_CONNREQ:
+		name = "CONNREQ";
+		break;
+	case IBLND_MSG_CONNACK:
+		name = "CONNACK";
+		break;
+	case IBLND_MSG_NOOP:
+		name = "NOOP";
+		break;
+	case IBLND_MSG_IMMEDIATE:
+		name = "IMMEDIATE";
+		break;
+	case IBLND_MSG_PUT_REQ:
+		name = "PUT_REQ";
+		break;
+	case IBLND_MSG_PUT_NAK:
+		name = "PUT_NAK";
+		break;
+	case IBLND_MSG_PUT_ACK:
+		name = "PUT_ACK";
+		break;
+	case IBLND_MSG_PUT_DONE:
+		name = "PUT_DONE";
+		break;
+	case IBLND_MSG_GET_REQ:
+		name = "GET_REQ";
+		break;
+	case IBLND_MSG_GET_DONE:
+		name = "GET_DONE";
+		break;
+	default:
+		name = "???";
+		break;
+	}
+
+	return name;
+}
+
+/*
+ * Return the minimum number of bytes a message of the given type occupies:
+ * the common header (everything before ibm_u) plus the fixed part of the
+ * type-specific body.  Unknown types return -1.
+ */
+static int
+kiblnd_msgtype2size(int type)
+{
+	int body;
+
+	switch (type) {
+	case IBLND_MSG_IMMEDIATE:
+		/* header plus immediate header, up to the start of payload */
+		return offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]);
+
+	case IBLND_MSG_NOOP:
+		body = 0;
+		break;
+
+	case IBLND_MSG_CONNREQ:
+	case IBLND_MSG_CONNACK:
+		body = sizeof(kib_connparams_t);
+		break;
+
+	case IBLND_MSG_PUT_REQ:
+		body = sizeof(kib_putreq_msg_t);
+		break;
+
+	case IBLND_MSG_PUT_ACK:
+		body = sizeof(kib_putack_msg_t);
+		break;
+
+	case IBLND_MSG_GET_REQ:
+		body = sizeof(kib_get_msg_t);
+		break;
+
+	case IBLND_MSG_PUT_NAK:
+	case IBLND_MSG_PUT_DONE:
+	case IBLND_MSG_GET_DONE:
+		body = sizeof(kib_completion_msg_t);
+		break;
+
+	default:
+		return -1;
+	}
+
+	return offsetof(kib_msg_t, ibm_u) + body;
+}
+
+/*
+ * Validate (and byte-swap if 'flip' is set) the RDMA descriptor carried by a
+ * GET_REQ or PUT_ACK message.
+ *
+ * Returns 0 if the descriptor is acceptable, 1 if the fragment count is out
+ * of range or the message is too short to hold that many fragments.
+ */
+static int
+kiblnd_unpack_rd(kib_msg_t *msg, int flip)
+{
+	kib_rdma_desc_t   *rd;
+	int		nfrags;
+	int		min_nob;
+	int		idx;
+
+	LASSERT (msg->ibm_type == IBLND_MSG_GET_REQ ||
+		 msg->ibm_type == IBLND_MSG_PUT_ACK);
+
+	if (msg->ibm_type == IBLND_MSG_GET_REQ)
+		rd = &msg->ibm_u.get.ibgm_rd;
+	else
+		rd = &msg->ibm_u.putack.ibpam_rd;
+
+	/* fix up the fixed fields first: rd_nfrags is needed for validation */
+	if (flip) {
+		__swab32s(&rd->rd_key);
+		__swab32s(&rd->rd_nfrags);
+	}
+
+	nfrags = rd->rd_nfrags;
+
+	if (nfrags <= 0 || nfrags > IBLND_MAX_RDMA_FRAGS) {
+		CERROR("Bad nfrags: %d, should be 0 < n <= %d\n",
+		       nfrags, IBLND_MAX_RDMA_FRAGS);
+		return 1;
+	}
+
+	min_nob = offsetof (kib_msg_t, ibm_u) +
+		  kiblnd_rd_msg_size(rd, msg->ibm_type, nfrags);
+
+	if (msg->ibm_nob < min_nob) {
+		CERROR("Short %s: %d(%d)\n",
+		       kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, min_nob);
+		return 1;
+	}
+
+	/* only touch the fragments once the count is known to fit */
+	if (flip) {
+		for (idx = 0; idx < nfrags; idx++) {
+			__swab32s(&rd->rd_frags[idx].rf_nob);
+			__swab64s(&rd->rd_frags[idx].rf_addr);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Fill in the common header of an outgoing message and, if the kib_cksum
+ * tunable is enabled, checksum it.
+ *
+ * ibm_type, ibm_nob and the type-specific body are NOT set here; the caller
+ * must have initialised them beforehand.
+ */
+void
+kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg, int version,
+		 int credits, lnet_nid_t dstnid, __u64 dststamp)
+{
+	kib_net_t *net = ni->ni_data;
+
+	msg->ibm_magic    = IBLND_MSG_MAGIC;
+	msg->ibm_version  = version;
+	msg->ibm_credits  = credits;
+	msg->ibm_srcnid   = ni->ni_nid;
+	msg->ibm_srcstamp = net->ibn_incarnation;
+	msg->ibm_dstnid   = dstnid;
+	msg->ibm_dststamp = dststamp;
+
+	/* the checksum is always computed with ibm_cksum itself zeroed */
+	msg->ibm_cksum    = 0;
+	if (!*kiblnd_tunables.kib_cksum)
+		return;
+
+	msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob);
+}
+
+/*
+ * Validate a received message and convert it to host byte order in place.
+ *
+ * \param msg  the received message buffer
+ * \param nob  number of bytes actually received
+ *
+ * The peer's endianness is detected from ibm_magic; if it is byte-swapped,
+ * every multi-byte header field except ibm_magic, plus the type-specific
+ * fields handled in the switch below, is swabbed.
+ *
+ * \return 0 on success, -EPROTO if the message is too short, has a bad
+ * magic/version/checksum/source NID, or is of an unknown type.
+ */
+int
+kiblnd_unpack_msg(kib_msg_t *msg, int nob)
+{
+	const int hdr_size = offsetof(kib_msg_t, ibm_u);
+	__u32     msg_cksum;
+	__u16     version;
+	int       msg_nob;
+	int       flip;
+
+	/* 6 bytes are enough to have received magic + version */
+	if (nob < 6) {
+		CERROR("Short message: %d\n", nob);
+		return -EPROTO;
+	}
+
+	/* a byte-swapped magic means the sender has the opposite endianness */
+	if (msg->ibm_magic == IBLND_MSG_MAGIC) {
+		flip = 0;
+	} else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) {
+		flip = 1;
+	} else {
+		CERROR("Bad magic: %08x\n", msg->ibm_magic);
+		return -EPROTO;
+	}
+
+	version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
+	if (version != IBLND_MSG_VERSION &&
+	    version != IBLND_MSG_VERSION_1) {
+		CERROR("Bad version: %x\n", version);
+		return -EPROTO;
+	}
+
+	/* the rest of the header is only read once it is known to be there */
+	if (nob < hdr_size) {
+		CERROR("Short message: %d\n", nob);
+		return -EPROTO;
+	}
+
+	/* the sender's idea of the length must fit in what was received */
+	msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
+	if (msg_nob > nob) {
+		CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
+		return -EPROTO;
+	}
+
+	/* checksum must be computed with ibm_cksum zero and BEFORE anything
+	 * gets flipped */
+	msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
+	msg->ibm_cksum = 0;
+	/* a received checksum of 0 means the sender did not checksum */
+	if (msg_cksum != 0 &&
+	    msg_cksum != kiblnd_cksum(msg, msg_nob)) {
+		CERROR("Bad checksum\n");
+		return -EPROTO;
+	}
+
+	/* restore the (host byte order) checksum zeroed above */
+	msg->ibm_cksum = msg_cksum;
+
+	if (flip) {
+		/* leave magic unflipped as a clue to peer endianness */
+		msg->ibm_version = version;
+		/* single-byte fields need no swabbing */
+		CLASSERT (sizeof(msg->ibm_type) == 1);
+		CLASSERT (sizeof(msg->ibm_credits) == 1);
+		msg->ibm_nob     = msg_nob;
+		__swab64s(&msg->ibm_srcnid);
+		__swab64s(&msg->ibm_srcstamp);
+		__swab64s(&msg->ibm_dstnid);
+		__swab64s(&msg->ibm_dststamp);
+	}
+
+	if (msg->ibm_srcnid == LNET_NID_ANY) {
+		CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
+		return -EPROTO;
+	}
+
+	/* NB kiblnd_msgtype2size() returns -1 for unknown types, so those
+	 * fall through to the switch's default case below */
+	if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) {
+		CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type),
+		       msg_nob, kiblnd_msgtype2size(msg->ibm_type));
+		return -EPROTO;
+	}
+
+	/* type-specific validation and byte swapping */
+	switch (msg->ibm_type) {
+	default:
+		CERROR("Unknown message type %x\n", msg->ibm_type);
+		return -EPROTO;
+
+	case IBLND_MSG_NOOP:
+	case IBLND_MSG_IMMEDIATE:
+	case IBLND_MSG_PUT_REQ:
+		break;
+
+	case IBLND_MSG_PUT_ACK:
+	case IBLND_MSG_GET_REQ:
+		/* these carry an RDMA descriptor that needs its own checks */
+		if (kiblnd_unpack_rd(msg, flip))
+			return -EPROTO;
+		break;
+
+	case IBLND_MSG_PUT_NAK:
+	case IBLND_MSG_PUT_DONE:
+	case IBLND_MSG_GET_DONE:
+		if (flip)
+			__swab32s(&msg->ibm_u.completion.ibcm_status);
+		break;
+
+	case IBLND_MSG_CONNREQ:
+	case IBLND_MSG_CONNACK:
+		if (flip) {
+			__swab16s(&msg->ibm_u.connparams.ibcp_queue_depth);
+			__swab16s(&msg->ibm_u.connparams.ibcp_max_frags);
+			__swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
+		}
+		break;
+	}
+	return 0;
+}
+
+/*
+ * Allocate and initialise a new peer for 'nid' on interface 'ni'.
+ *
+ * \param ni     network interface the peer belongs to
+ * \param peerp  where the new peer is returned on success
+ * \param nid    NID of the peer; must not be LNET_NID_ANY
+ *
+ * The new peer is returned with one reference (owned by the caller) and is
+ * not linked into the peer table.  The net's peer count is incremented.
+ *
+ * \return 0 on success, -ENOMEM if the peer cannot be allocated.
+ */
+int
+kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid)
+{
+	kib_peer_t	*peer;
+	kib_net_t	*net = ni->ni_data;
+	int		cpt = lnet_cpt_of_nid(nid);
+	unsigned long   flags;
+
+	LASSERT(net != NULL);
+	LASSERT(nid != LNET_NID_ANY);
+
+	/* allocate on the CPU partition that 'nid' maps to */
+	LIBCFS_CPT_ALLOC(peer, lnet_cpt_table(), cpt, sizeof(*peer));
+	if (peer == NULL) {
+		CERROR("Cannot allocate peer\n");
+		return -ENOMEM;
+	}
+
+	memset(peer, 0, sizeof(*peer));	 /* zero flags etc */
+
+	peer->ibp_ni = ni;
+	peer->ibp_nid = nid;
+	peer->ibp_error = 0;
+	peer->ibp_last_alive = 0;
+	atomic_set(&peer->ibp_refcount, 1);  /* 1 ref for caller */
+
+	INIT_LIST_HEAD(&peer->ibp_list);     /* not in the peer table yet */
+	INIT_LIST_HEAD(&peer->ibp_conns);
+	INIT_LIST_HEAD(&peer->ibp_tx_queue);
+
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	/* always called with a ref on ni, which prevents ni being shutdown */
+	LASSERT (net->ibn_shutdown == 0);
+
+	/* npeers only grows with the global lock held */
+	atomic_inc(&net->ibn_npeers);
+
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	*peerp = peer;
+	return 0;
+}
+
+/*
+ * Free a peer whose reference count has dropped to zero.
+ *
+ * The peer must already be unlinked from the peer table, have no
+ * connections, no connection attempts in progress and no queued
+ * transmits.  The owning net's peer count is decremented after the peer
+ * memory is released.
+ */
+void
+kiblnd_destroy_peer (kib_peer_t *peer)
+{
+	/* fetched up front: 'peer' is freed before 'net' is last used */
+	kib_net_t *net = peer->ibp_ni->ni_data;
+
+	LASSERT (net != NULL);
+	LASSERT (atomic_read(&peer->ibp_refcount) == 0);
+	LASSERT (!kiblnd_peer_active(peer));
+	LASSERT (peer->ibp_connecting == 0);
+	LASSERT (peer->ibp_accepting == 0);
+	LASSERT (list_empty(&peer->ibp_conns));
+	LASSERT (list_empty(&peer->ibp_tx_queue));
+
+	LIBCFS_FREE(peer, sizeof(*peer));
+
+	/* NB a peer's connections keep a reference on their peer until
+	 * they are destroyed, so we can be assured that _all_ state to do
+	 * with this peer has been cleaned up when its refcount drops to
+	 * zero. */
+	atomic_dec(&net->ibn_npeers);
+}
+
+/*
+ * Look up the peer with the given NID in the peer hash table.
+ *
+ * Returns the peer, or NULL if there is none.  No reference is taken; the
+ * caller is responsible for accounting any additional reference.
+ */
+kib_peer_t *
+kiblnd_find_peer_locked (lnet_nid_t nid)
+{
+	struct list_head *bucket = kiblnd_nid2peerlist(nid);
+	kib_peer_t	 *peer;
+
+	list_for_each_entry(peer, bucket, ibp_list) {
+		/* every peer in the table is connecting, accepting or
+		 * connected */
+		LASSERT (peer->ibp_connecting > 0 || /* creating conns */
+			 peer->ibp_accepting > 0 ||
+			 !list_empty(&peer->ibp_conns));  /* active conn */
+
+		if (peer->ibp_nid == nid) {
+			CDEBUG(D_NET, "got peer [%p] -> %s (%d) version: %x\n",
+			       peer, libcfs_nid2str(nid),
+			       atomic_read(&peer->ibp_refcount),
+			       peer->ibp_version);
+			return peer;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Remove a peer with no connections from the peer table and drop the
+ * table's reference on it.  The peer must currently be active (see
+ * kiblnd_peer_active()).
+ */
+void
+kiblnd_unlink_peer_locked (kib_peer_t *peer)
+{
+	LASSERT (list_empty(&peer->ibp_conns));
+
+	LASSERT (kiblnd_peer_active(peer));
+	/* list_del_init() leaves ibp_list as an empty, self-linked list */
+	list_del_init(&peer->ibp_list);
+	/* lose peerlist's ref */
+	kiblnd_peer_decref(peer);
+}
+
+/*
+ * Report the NID and reference count of the index'th peer on 'ni'.
+ *
+ * Peers are counted in hash-table order, skipping those that belong to
+ * other interfaces.  Returns 0 with *nidp and *count filled in, or -ENOENT
+ * if 'ni' has no more than 'index' peers.
+ */
+int
+kiblnd_get_peer_info (lnet_ni_t *ni, int index,
+		      lnet_nid_t *nidp, int *count)
+{
+	kib_peer_t	*peer;
+	unsigned long	 flags;
+	int		 bucket;
+	int		 rc = -ENOENT;
+
+	read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	for (bucket = 0; bucket < kiblnd_data.kib_peer_hash_size; bucket++) {
+		list_for_each_entry(peer, &kiblnd_data.kib_peers[bucket],
+				    ibp_list) {
+			LASSERT (peer->ibp_connecting > 0 ||
+				 peer->ibp_accepting > 0 ||
+				 !list_empty(&peer->ibp_conns));
+
+			/* NB index only counts down over peers of this ni */
+			if (peer->ibp_ni != ni || index-- > 0)
+				continue;
+
+			*nidp = peer->ibp_nid;
+			*count = atomic_read(&peer->ibp_refcount);
+			rc = 0;
+			goto out;
+		}
+	}
+
+ out:
+	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+	return rc;
+}
+
+/*
+ * Get rid of a peer: unlink it directly if it has no connections,
+ * otherwise close every connection it has.
+ *
+ * NB on return the peer is unlinked and might even have been freed, if the
+ * peer table held the last reference on it.
+ */
+void
+kiblnd_del_peer_locked (kib_peer_t *peer)
+{
+	kib_conn_t	*conn;
+	kib_conn_t	*next;
+
+	if (list_empty(&peer->ibp_conns)) {
+		kiblnd_unlink_peer_locked(peer);
+		return;
+	}
+
+	/* NB closing peer's last conn unlinks it. */
+	list_for_each_entry_safe(conn, next, &peer->ibp_conns, ibc_list)
+		kiblnd_close_conn_locked(conn, 0);
+}
+
+/*
+ * Delete the peer with the given NID on 'ni', or every peer on 'ni' if
+ * 'nid' is LNET_NID_ANY.
+ *
+ * Transmits still queued on a deleted peer are completed with -EIO once the
+ * global lock has been dropped.  Returns 0 if at least one peer matched,
+ * -ENOENT otherwise.
+ */
+int
+kiblnd_del_peer (lnet_ni_t *ni, lnet_nid_t nid)
+{
+	LIST_HEAD	(zombies);
+	kib_peer_t	*peer;
+	kib_peer_t	*next;
+	unsigned long	 flags;
+	int		 first;
+	int		 last;
+	int		 bucket;
+	int		 rc = -ENOENT;
+
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	/* a specific NID lives in exactly one hash bucket */
+	if (nid == LNET_NID_ANY) {
+		first = 0;
+		last = kiblnd_data.kib_peer_hash_size - 1;
+	} else {
+		first = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
+		last = first;
+	}
+
+	for (bucket = first; bucket <= last; bucket++) {
+		list_for_each_entry_safe(peer, next,
+					 &kiblnd_data.kib_peers[bucket],
+					 ibp_list) {
+			LASSERT (peer->ibp_connecting > 0 ||
+				 peer->ibp_accepting > 0 ||
+				 !list_empty(&peer->ibp_conns));
+
+			if (peer->ibp_ni != ni)
+				continue;
+
+			if (nid != LNET_NID_ANY && peer->ibp_nid != nid)
+				continue;
+
+			/* steal any queued transmits so they can be
+			 * completed outside the lock */
+			if (!list_empty(&peer->ibp_tx_queue)) {
+				LASSERT (list_empty(&peer->ibp_conns));
+
+				list_splice_init(&peer->ibp_tx_queue,
+						 &zombies);
+			}
+
+			kiblnd_del_peer_locked(peer);
+			rc = 0;	 /* matched something */
+		}
+	}
+
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	kiblnd_txlist_done(ni, &zombies, -EIO);
+
+	return rc;
+}
+
+/*
+ * Return the index'th connection on 'ni', counting across all of ni's peers
+ * in hash-table order, or NULL if there are not that many.
+ *
+ * The returned conn carries a reference taken with kiblnd_conn_addref(),
+ * which the caller must drop.
+ */
+kib_conn_t *
+kiblnd_get_conn_by_idx (lnet_ni_t *ni, int index)
+{
+	kib_peer_t	*peer;
+	kib_conn_t	*conn;
+	kib_conn_t	*found = NULL;
+	unsigned long	 flags;
+	int		 bucket;
+
+	read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	for (bucket = 0; bucket < kiblnd_data.kib_peer_hash_size; bucket++) {
+		list_for_each_entry(peer, &kiblnd_data.kib_peers[bucket],
+				    ibp_list) {
+			LASSERT (peer->ibp_connecting > 0 ||
+				 peer->ibp_accepting > 0 ||
+				 !list_empty(&peer->ibp_conns));
+
+			if (peer->ibp_ni != ni)
+				continue;
+
+			list_for_each_entry(conn, &peer->ibp_conns, ibc_list) {
+				if (index-- > 0)
+					continue;
+
+				kiblnd_conn_addref(conn);
+				found = conn;
+				goto out;
+			}
+		}
+	}
+
+ out:
+	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+	return found;
+}
+
+/* Dump one receive descriptor (status, message type, credits) to the
+ * console debug log. */
+void
+kiblnd_debug_rx (kib_rx_t *rx)
+{
+	kib_msg_t *msg = rx->rx_msg;
+
+	CDEBUG(D_CONSOLE, "      %p status %d msg_type %x cred %d\n",
+	       rx, rx->rx_status, msg->ibm_type, msg->ibm_credits);
+}
+
+/* Dump one transmit descriptor to the console debug log.  Each of the two
+ * lnet message slots is shown as "-" when empty and "!" when set. */
+void
+kiblnd_debug_tx (kib_tx_t *tx)
+{
+	kib_msg_t *msg = tx->tx_msg;
+
+	CDEBUG(D_CONSOLE, "      %p snd %d q %d w %d rc %d dl %lx "
+	       "cookie "LPX64" msg %s%s type %x cred %d\n",
+	       tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting,
+	       tx->tx_status, tx->tx_deadline, tx->tx_cookie,
+	       tx->tx_lntmsg[0] == NULL ? "-" : "!",
+	       tx->tx_lntmsg[1] == NULL ? "-" : "!",
+	       msg->ibm_type, msg->ibm_credits);
+}
+
+/*
+ * Dump the state of a connection to the console debug log: its counters,
+ * every queued/active transmit and every receive descriptor.  The whole
+ * dump is done under conn->ibc_lock.
+ */
+void
+kiblnd_debug_conn (kib_conn_t *conn)
+{
+	kib_rx_t	*rx;
+	kib_tx_t	*tx;
+	int		 idx;
+
+	spin_lock(&conn->ibc_lock);
+
+	CDEBUG(D_CONSOLE, "conn[%d] %p [version %x] -> %s: \n",
+	       atomic_read(&conn->ibc_refcount), conn,
+	       conn->ibc_version, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+	CDEBUG(D_CONSOLE, "   state %d nposted %d/%d cred %d o_cred %d r_cred %d\n",
+	       conn->ibc_state, conn->ibc_noops_posted,
+	       conn->ibc_nsends_posted, conn->ibc_credits,
+	       conn->ibc_outstanding_credits, conn->ibc_reserved_credits);
+	CDEBUG(D_CONSOLE, "   comms_err %d\n", conn->ibc_comms_error);
+
+	CDEBUG(D_CONSOLE, "   early_rxs:\n");
+	list_for_each_entry(rx, &conn->ibc_early_rxs, rx_list)
+		kiblnd_debug_rx(rx);
+
+	CDEBUG(D_CONSOLE, "   tx_noops:\n");
+	list_for_each_entry(tx, &conn->ibc_tx_noops, tx_list)
+		kiblnd_debug_tx(tx);
+
+	CDEBUG(D_CONSOLE, "   tx_queue_nocred:\n");
+	list_for_each_entry(tx, &conn->ibc_tx_queue_nocred, tx_list)
+		kiblnd_debug_tx(tx);
+
+	CDEBUG(D_CONSOLE, "   tx_queue_rsrvd:\n");
+	list_for_each_entry(tx, &conn->ibc_tx_queue_rsrvd, tx_list)
+		kiblnd_debug_tx(tx);
+
+	CDEBUG(D_CONSOLE, "   tx_queue:\n");
+	list_for_each_entry(tx, &conn->ibc_tx_queue, tx_list)
+		kiblnd_debug_tx(tx);
+
+	CDEBUG(D_CONSOLE, "   active_txs:\n");
+	list_for_each_entry(tx, &conn->ibc_active_txs, tx_list)
+		kiblnd_debug_tx(tx);
+
+	/* the rx descriptors live in an array, not on a list */
+	CDEBUG(D_CONSOLE, "   rxs:\n");
+	for (idx = 0; idx < IBLND_RX_MSGS(conn->ibc_version); idx++)
+		kiblnd_debug_rx(&conn->ibc_rxs[idx]);
+
+	spin_unlock(&conn->ibc_lock);
+}
+
+/*
+ * Translate an MTU given in bytes into the matching IB_MTU_* value.
+ *
+ * 0 is passed through as 0; any byte count other than 0, 256, 512, 1024,
+ * 2048 or 4096 yields -1.
+ */
+int
+kiblnd_translate_mtu(int value)
+{
+	switch (value) {
+	case 0:
+		return 0;
+	case 256:
+		return IB_MTU_256;
+	case 512:
+		return IB_MTU_512;
+	case 1024:
+		return IB_MTU_1024;
+	case 2048:
+		return IB_MTU_2048;
+	case 4096:
+		return IB_MTU_4096;
+	default:
+		break;
+	}
+
+	return -1;
+}
+
+/*
+ * Apply the kib_ib_mtu tunable to the path record of 'cmid', if it has one.
+ * A translated MTU of 0 leaves the path record untouched.
+ */
+static void
+kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid)
+{
+	int	   mtu;
+
+	/* XXX There is no path record for iWARP, set by netdev->change_mtu? */
+	if (cmid->route.path_rec != NULL) {
+		mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu);
+		LASSERT (mtu >= 0);
+
+		if (mtu != 0)
+			cmid->route.path_rec->mtu = mtu;
+	}
+}
+
+/*
+ * Choose the completion vector for the CQ of 'conn'.
+ *
+ * \param conn  connection whose CQ is being created
+ * \param cpt   CPU partition the connection is bound to
+ *
+ * Returns 0 if the device has at most one completion vector.  Otherwise the
+ * peer's NID is hashed onto one of the CPUs in partition 'cpt' and that CPU
+ * id, modulo the number of vectors, is returned.
+ */
+static int
+kiblnd_get_completion_vector(kib_conn_t *conn, int cpt)
+{
+	cpumask_t	*mask;
+	__u64		nid;
+	int		vectors;
+	int		off;
+	int		i;
+
+	vectors = conn->ibc_cmid->device->num_comp_vectors;
+	if (vectors <= 1)
+		return 0;
+
+	mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt);
+
+	/* hash NID to CPU id in this partition...
+	 * NB the NID is 64 bits wide; a native '%' on it needs a libgcc
+	 * helper that 32-bit kernels do not provide, so use do_div(),
+	 * which returns the remainder (and leaves the quotient in 'nid') */
+	nid = conn->ibc_peer->ibp_nid;
+	off = do_div(nid, cpus_weight(*mask));
+	for_each_cpu_mask(i, *mask) {
+		if (off-- == 0)
+			return i % vectors;
+	}
+
+	/* not reached: 'off' is always smaller than the CPU count of 'mask' */
+	LBUG();
+	return 1;
+}
+
+/*
+ * Allocate and set up a new connection to 'peer' over 'cmid'.
+ *
+ * Allocates the conn, its in-progress connection state and its receive
+ * descriptors and pages, creates the CQ and QP, and posts all receive
+ * buffers.
+ *
+ * \param peer     peer the connection is for
+ * \param cmid     RDMA CM id the connection runs over
+ * \param state    state to leave the conn in on success; must be
+ *                 IBLND_CONN_ACTIVE_CONNECT or IBLND_CONN_PASSIVE_WAIT
+ * \param version  protocol version; determines the number of receive
+ *                 messages, CQ entries and work requests
+ *
+ * \return the new conn (holding one reference for the caller), or NULL on
+ * any failure.  See the CAVEAT EMPTOR below for who owns 'peer' and 'cmid'
+ * in each case.
+ */
+kib_conn_t *
+kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
+		   int state, int version)
+{
+	/* CAVEAT EMPTOR:
+	 * If the new conn is created successfully it takes over the caller's
+	 * ref on 'peer'.  It also "owns" 'cmid' and destroys it when it itself
+	 * is destroyed.  On failure, the caller's ref on 'peer' remains and
+	 * she must dispose of 'cmid'.  (Actually I'd block forever if I tried
+	 * to destroy 'cmid' here since I'm called from the CM which still has
+	 * its ref on 'cmid'). */
+	rwlock_t		*glock = &kiblnd_data.kib_global_lock;
+	kib_net_t	      *net = peer->ibp_ni->ni_data;
+	kib_dev_t	      *dev;
+	struct ib_qp_init_attr *init_qp_attr;
+	struct kib_sched_info	*sched;
+	kib_conn_t		*conn;
+	struct ib_cq		*cq;
+	unsigned long		flags;
+	int			cpt;
+	int			rc;
+	int			i;
+
+	LASSERT(net != NULL);
+	LASSERT(!in_interrupt());
+
+	dev = net->ibn_dev;
+
+	/* everything for this conn is allocated on the peer's partition */
+	cpt = lnet_cpt_of_nid(peer->ibp_nid);
+	sched = kiblnd_data.kib_scheds[cpt];
+
+	LASSERT(sched->ibs_nthreads > 0);
+
+	LIBCFS_CPT_ALLOC(init_qp_attr, lnet_cpt_table(), cpt,
+			 sizeof(*init_qp_attr));
+	if (init_qp_attr == NULL) {
+		CERROR("Can't allocate qp_attr for %s\n",
+		       libcfs_nid2str(peer->ibp_nid));
+		goto failed_0;
+	}
+
+	LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn));
+	if (conn == NULL) {
+		CERROR("Can't allocate connection for %s\n",
+		       libcfs_nid2str(peer->ibp_nid));
+		goto failed_1;
+	}
+
+	/* NB while the state is IBLND_CONN_INIT, kiblnd_destroy_conn()
+	 * leaves 'peer' and 'cmid' alone */
+	conn->ibc_state = IBLND_CONN_INIT;
+	conn->ibc_version = version;
+	conn->ibc_peer = peer;		  /* I take the caller's ref */
+	cmid->context = conn;		   /* for future CM callbacks */
+	conn->ibc_cmid = cmid;
+
+	INIT_LIST_HEAD(&conn->ibc_early_rxs);
+	INIT_LIST_HEAD(&conn->ibc_tx_noops);
+	INIT_LIST_HEAD(&conn->ibc_tx_queue);
+	INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd);
+	INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred);
+	INIT_LIST_HEAD(&conn->ibc_active_txs);
+	spin_lock_init(&conn->ibc_lock);
+
+	LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt,
+			 sizeof(*conn->ibc_connvars));
+	if (conn->ibc_connvars == NULL) {
+		CERROR("Can't allocate in-progress connection state\n");
+		goto failed_2;
+	}
+
+	write_lock_irqsave(glock, flags);
+	if (dev->ibd_failover) {
+		write_unlock_irqrestore(glock, flags);
+		CERROR("%s: failover in progress\n", dev->ibd_ifname);
+		goto failed_2;
+	}
+
+	if (dev->ibd_hdev->ibh_ibdev != cmid->device) {
+		/* wakeup failover thread and teardown connection */
+		if (kiblnd_dev_can_failover(dev)) {
+			list_add_tail(&dev->ibd_fail_list,
+				      &kiblnd_data.kib_failed_devs);
+			wake_up(&kiblnd_data.kib_failover_waitq);
+		}
+
+		write_unlock_irqrestore(glock, flags);
+		CERROR("cmid HCA(%s), kib_dev(%s) need failover\n",
+		       cmid->device->name, dev->ibd_ifname);
+		goto failed_2;
+	}
+
+	/* the conn holds its own reference on the HCA device */
+	kiblnd_hdev_addref_locked(dev->ibd_hdev);
+	conn->ibc_hdev = dev->ibd_hdev;
+
+	kiblnd_setup_mtu_locked(cmid);
+
+	write_unlock_irqrestore(glock, flags);
+
+	LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt,
+			 IBLND_RX_MSGS(version) * sizeof(kib_rx_t));
+	if (conn->ibc_rxs == NULL) {
+		CERROR("Cannot allocate RX buffers\n");
+		goto failed_2;
+	}
+
+	rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt,
+				IBLND_RX_MSG_PAGES(version));
+	if (rc != 0)
+		goto failed_2;
+
+	kiblnd_map_rx_descs(conn);
+
+	cq = ib_create_cq(cmid->device,
+			  kiblnd_cq_completion, kiblnd_cq_event, conn,
+			  IBLND_CQ_ENTRIES(version),
+			  kiblnd_get_completion_vector(conn, cpt));
+	if (IS_ERR(cq)) {
+		CERROR("Can't create CQ: %ld, cqe: %d\n",
+		       PTR_ERR(cq), IBLND_CQ_ENTRIES(version));
+		goto failed_2;
+	}
+
+	conn->ibc_cq = cq;
+
+	rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+	if (rc != 0) {
+		CERROR("Can't request completion notification: %d\n", rc);
+		goto failed_2;
+	}
+
+	/* one CQ serves both the send and the receive side of the QP */
+	init_qp_attr->event_handler = kiblnd_qp_event;
+	init_qp_attr->qp_context = conn;
+	init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(version);
+	init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(version);
+	init_qp_attr->cap.max_send_sge = 1;
+	init_qp_attr->cap.max_recv_sge = 1;
+	init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+	init_qp_attr->qp_type = IB_QPT_RC;
+	init_qp_attr->send_cq = cq;
+	init_qp_attr->recv_cq = cq;
+
+	conn->ibc_sched = sched;
+
+	rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr);
+	if (rc != 0) {
+		CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n",
+		       rc, init_qp_attr->cap.max_send_wr,
+		       init_qp_attr->cap.max_recv_wr);
+		goto failed_2;
+	}
+
+	/* NB from here on the failed_* labels must not be used:
+	 * init_qp_attr is gone and the conn is about to gain references */
+	LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
+
+	/* 1 ref for caller and each rxmsg */
+	atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(version));
+	conn->ibc_nrx = IBLND_RX_MSGS(version);
+
+	/* post receives */
+	for (i = 0; i < IBLND_RX_MSGS(version); i++) {
+		rc = kiblnd_post_rx(&conn->ibc_rxs[i],
+				    IBLND_POSTRX_NO_CREDIT);
+		if (rc != 0) {
+			CERROR("Can't post rxmsg: %d\n", rc);
+
+			/* Make posted receives complete */
+			kiblnd_abort_receives(conn);
+
+			/* correct # of posted buffers
+			 * NB locking needed now I'm racing with completion */
+			spin_lock_irqsave(&sched->ibs_lock, flags);
+			conn->ibc_nrx -= IBLND_RX_MSGS(version) - i;
+			spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+			/* cmid will be destroyed by CM(ofed) after cm_callback
+			 * returned, so we can't refer it anymore
+			 * (by kiblnd_connd()->kiblnd_destroy_conn) */
+			rdma_destroy_qp(conn->ibc_cmid);
+			conn->ibc_cmid = NULL;
+
+			/* Drop my own and unused rxbuffer refcounts */
+			while (i++ <= IBLND_RX_MSGS(version))
+				kiblnd_conn_decref(conn);
+
+			return NULL;
+		}
+	}
+
+	/* Init successful! */
+	LASSERT (state == IBLND_CONN_ACTIVE_CONNECT ||
+		 state == IBLND_CONN_PASSIVE_WAIT);
+	conn->ibc_state = state;
+
+	/* 1 more conn */
+	atomic_inc(&net->ibn_nconns);
+	return conn;
+
+ failed_2:
+	/* conn is still in IBLND_CONN_INIT with a zero refcount, so this
+	 * releases only what was set up above */
+	kiblnd_destroy_conn(conn);
+ failed_1:
+	LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
+ failed_0:
+	return NULL;
+}
+
+/*
+ * Release everything owned by a connection and free it.
+ *
+ * The conn must have a zero reference count, empty queues, no posted sends
+ * and be in state IBLND_CONN_INIT (never fully created) or
+ * IBLND_CONN_DISCONNECTED.  Only in the latter case are the peer reference
+ * and the cmid released and the net's connection count decremented.
+ */
+void
+kiblnd_destroy_conn (kib_conn_t *conn)
+{
+	struct rdma_cm_id *cmid = conn->ibc_cmid;
+	kib_peer_t	*peer = conn->ibc_peer;
+	int		rc;
+
+	LASSERT (!in_interrupt());
+	LASSERT (atomic_read(&conn->ibc_refcount) == 0);
+	LASSERT (list_empty(&conn->ibc_early_rxs));
+	LASSERT (list_empty(&conn->ibc_tx_noops));
+	LASSERT (list_empty(&conn->ibc_tx_queue));
+	LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd));
+	LASSERT (list_empty(&conn->ibc_tx_queue_nocred));
+	LASSERT (list_empty(&conn->ibc_active_txs));
+	LASSERT (conn->ibc_noops_posted == 0);
+	LASSERT (conn->ibc_nsends_posted == 0);
+
+	switch (conn->ibc_state) {
+	default:
+		/* conn must be completely disengaged from the network */
+		LBUG();
+
+	case IBLND_CONN_DISCONNECTED:
+		/* connvars should have been freed already */
+		LASSERT (conn->ibc_connvars == NULL);
+		break;
+
+	case IBLND_CONN_INIT:
+		break;
+	}
+
+	/* conn->ibc_cmid might be destroyed by CM already */
+	if (cmid != NULL && cmid->qp != NULL)
+		rdma_destroy_qp(cmid);
+
+	if (conn->ibc_cq != NULL) {
+		rc = ib_destroy_cq(conn->ibc_cq);
+		if (rc != 0)
+			CWARN("Error destroying CQ: %d\n", rc);
+	}
+
+	/* NB kiblnd_unmap_rx_descs() also frees the rx pages, and needs
+	 * ibc_rxs and ibc_hdev, which are only released further down */
+	if (conn->ibc_rx_pages != NULL)
+		kiblnd_unmap_rx_descs(conn);
+
+	if (conn->ibc_rxs != NULL) {
+		LIBCFS_FREE(conn->ibc_rxs,
+			    IBLND_RX_MSGS(conn->ibc_version) * sizeof(kib_rx_t));
+	}
+
+	if (conn->ibc_connvars != NULL)
+		LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+
+	if (conn->ibc_hdev != NULL)
+		kiblnd_hdev_decref(conn->ibc_hdev);
+
+	/* See CAVEAT EMPTOR above in kiblnd_create_conn */
+	if (conn->ibc_state != IBLND_CONN_INIT) {
+		kib_net_t *net = peer->ibp_ni->ni_data;
+
+		kiblnd_peer_decref(peer);
+		rdma_destroy_id(cmid);
+		atomic_dec(&net->ibn_nconns);
+	}
+
+	LIBCFS_FREE(conn, sizeof(*conn));
+}
+
+/*
+ * Close every connection of 'peer', passing 'why' as the reason.
+ * Returns the number of connections closed.
+ */
+int
+kiblnd_close_peer_conns_locked (kib_peer_t *peer, int why)
+{
+	kib_conn_t	*conn;
+	kib_conn_t	*next;
+	int		 nclosed = 0;
+
+	list_for_each_entry_safe(conn, next, &peer->ibp_conns, ibc_list) {
+		CDEBUG(D_NET, "Closing conn -> %s, "
+			      "version: %x, reason: %d\n",
+		       libcfs_nid2str(peer->ibp_nid),
+		       conn->ibc_version, why);
+
+		kiblnd_close_conn_locked(conn, why);
+		nclosed++;
+	}
+
+	return nclosed;
+}
+
+/*
+ * Close, with -ESTALE, every connection of 'peer' whose version or
+ * incarnation differs from the ones given.
+ * Returns the number of connections closed.
+ */
+int
+kiblnd_close_stale_conns_locked (kib_peer_t *peer,
+				 int version, __u64 incarnation)
+{
+	kib_conn_t	*conn;
+	kib_conn_t	*next;
+	int		 nclosed = 0;
+
+	list_for_each_entry_safe(conn, next, &peer->ibp_conns, ibc_list) {
+		/* a conn is current only if both fields match */
+		if (conn->ibc_version     == version &&
+		    conn->ibc_incarnation == incarnation)
+			continue;
+
+		CDEBUG(D_NET, "Closing stale conn -> %s version: %x, "
+			      "incarnation:"LPX64"(%x, "LPX64")\n",
+		       libcfs_nid2str(peer->ibp_nid),
+		       conn->ibc_version, conn->ibc_incarnation,
+		       version, incarnation);
+
+		kiblnd_close_conn_locked(conn, -ESTALE);
+		nclosed++;
+	}
+
+	return nclosed;
+}
+
+/*
+ * Close all connections to the peer with the given NID on 'ni', or to every
+ * peer on 'ni' if 'nid' is LNET_NID_ANY.
+ *
+ * Returns 0 for a wildcard NID, or if at least one connection was closed;
+ * -ENOENT if a specific NID had no connections to close.
+ */
+int
+kiblnd_close_matching_conns (lnet_ni_t *ni, lnet_nid_t nid)
+{
+	kib_peer_t	*peer;
+	kib_peer_t	*next;
+	unsigned long	 flags;
+	int		 first;
+	int		 last;
+	int		 bucket;
+	int		 nclosed = 0;
+
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	/* a specific NID lives in exactly one hash bucket */
+	if (nid == LNET_NID_ANY) {
+		first = 0;
+		last = kiblnd_data.kib_peer_hash_size - 1;
+	} else {
+		first = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
+		last = first;
+	}
+
+	for (bucket = first; bucket <= last; bucket++) {
+		list_for_each_entry_safe(peer, next,
+					 &kiblnd_data.kib_peers[bucket],
+					 ibp_list) {
+			LASSERT (peer->ibp_connecting > 0 ||
+				 peer->ibp_accepting > 0 ||
+				 !list_empty(&peer->ibp_conns));
+
+			if (peer->ibp_ni != ni)
+				continue;
+
+			if (nid != LNET_NID_ANY && nid != peer->ibp_nid)
+				continue;
+
+			nclosed += kiblnd_close_peer_conns_locked(peer, 0);
+		}
+	}
+
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	/* wildcards always succeed */
+	if (nid != LNET_NID_ANY && nclosed == 0)
+		return -ENOENT;
+
+	return 0;
+}
+
+/*
+ * Handle a control request for this LND.
+ *
+ * 'arg' is a struct libcfs_ioctl_data carrying the request parameters and
+ * receiving the results.  Returns the result of the handler for the four
+ * commands handled here, -EINVAL for any other command.
+ */
+int
+kiblnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+	struct libcfs_ioctl_data *data = arg;
+
+	switch(cmd) {
+	case IOC_LIBCFS_GET_PEER: {
+		lnet_nid_t   nid = 0;
+		int	  count = 0;
+		int	  rc;
+
+		/* ioc_count is the peer index on input, refcount on output */
+		rc = kiblnd_get_peer_info(ni, data->ioc_count,
+					  &nid, &count);
+		data->ioc_nid    = nid;
+		data->ioc_count  = count;
+		return rc;
+	}
+
+	case IOC_LIBCFS_DEL_PEER:
+		return kiblnd_del_peer(ni, data->ioc_nid);
+
+	case IOC_LIBCFS_GET_CONN: {
+		kib_conn_t *conn;
+
+		conn = kiblnd_get_conn_by_idx(ni, data->ioc_count);
+		if (conn == NULL)
+			return -ENOENT;
+
+		LASSERT (conn->ibc_cmid != NULL);
+		data->ioc_nid = conn->ibc_peer->ibp_nid;
+		if (conn->ibc_cmid->route.path_rec == NULL)
+			data->ioc_u32[0] = 0; /* iWarp has no path MTU */
+		else
+			data->ioc_u32[0] =
+			ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu);
+
+		/* drop the ref taken by kiblnd_get_conn_by_idx() */
+		kiblnd_conn_decref(conn);
+		return 0;
+	}
+
+	case IOC_LIBCFS_CLOSE_CONNECTION:
+		return kiblnd_close_matching_conns(ni, data->ioc_nid);
+
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * Report when the peer with the given NID was last known to be alive.
+ *
+ * *when is only written if the peer exists and has a non-zero
+ * ibp_last_alive.  If no peer is found, kiblnd_launch_tx() is called with a
+ * NULL tx to trigger peer creation and connection establishment.
+ */
+void
+kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
+{
+	cfs_time_t	now = cfs_time_current();
+	cfs_time_t	last_alive = 0;
+	kib_peer_t	*peer;
+	unsigned long	flags;
+
+	read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	peer = kiblnd_find_peer_locked(nid);
+	if (peer != NULL) {
+		LASSERT (peer->ibp_connecting > 0 || /* creating conns */
+			 peer->ibp_accepting > 0 ||
+			 !list_empty(&peer->ibp_conns));  /* active conn */
+		last_alive = peer->ibp_last_alive;
+	}
+
+	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	/* NB from here on 'peer' is only compared and printed, never
+	 * dereferenced: no reference is held on it */
+	if (last_alive != 0)
+		*when = last_alive;
+
+	/* peer is not persistent in hash, trigger peer creation
+	 * and connection establishment with a NULL tx */
+	if (peer == NULL)
+		kiblnd_launch_tx(ni, NULL, nid);
+
+	CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n",
+	       libcfs_nid2str(nid), peer,
+	       last_alive ? cfs_duration_sec(now - last_alive) : -1);
+}
+
+/*
+ * Free every page referenced by the descriptor @p, then the descriptor
+ * itself.  Empty (NULL) slots in ibp_pages[] are skipped.
+ */
+void
+kiblnd_free_pages(kib_pages_t *p)
+{
+	int	count = p->ibp_npages;
+	int	idx;
+
+	for (idx = 0; idx < count; idx++) {
+		struct page *pg = p->ibp_pages[idx];
+
+		if (pg == NULL)
+			continue;
+
+		__free_page(pg);
+	}
+
+	/* the descriptor was sized for 'count' trailing page pointers */
+	LIBCFS_FREE(p, offsetof(kib_pages_t, ibp_pages[count]));
+}
+
+/*
+ * Allocate a page descriptor together with @npages pages on CPT @cpt.
+ *
+ * Returns 0 and stores the descriptor in *@pp on success.  Returns -ENOMEM
+ * on failure; pages allocated so far are released and *@pp is not written.
+ */
+int
+kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages)
+{
+	kib_pages_t	*p;
+	int		n;
+
+	LIBCFS_CPT_ALLOC(p, lnet_cpt_table(), cpt,
+			 offsetof(kib_pages_t, ibp_pages[npages]));
+	if (p == NULL) {
+		CERROR("Can't allocate descriptor for %d pages\n", npages);
+		return -ENOMEM;
+	}
+
+	memset(p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
+	p->ibp_npages = npages;
+
+	for (n = 0; n < npages; n++) {
+		p->ibp_pages[n] = cfs_page_cpt_alloc(lnet_cpt_table(), cpt,
+						     __GFP_IO);
+		if (p->ibp_pages[n] != NULL)
+			continue;
+
+		/* unfilled slots are NULL, which kiblnd_free_pages() skips */
+		CERROR("Can't allocate page %d of %d\n", n, npages);
+		kiblnd_free_pages(p);
+		return -ENOMEM;
+	}
+
+	*pp = p;
+	return 0;
+}
+
+/*
+ * DMA-unmap the message buffer of every receive descriptor of @conn, then
+ * free the receive pages and clear conn->ibc_rx_pages.
+ *
+ * conn->ibc_rxs and conn->ibc_hdev must both be set.
+ */
+void
+kiblnd_unmap_rx_descs(kib_conn_t *conn)
+{
+	int	idx;
+
+	LASSERT (conn->ibc_rxs != NULL);
+	LASSERT (conn->ibc_hdev != NULL);
+
+	for (idx = 0; idx < IBLND_RX_MSGS(conn->ibc_version); idx++) {
+		kib_rx_t *rx = &conn->ibc_rxs[idx];
+
+		LASSERT (rx->rx_nob >= 0); /* not posted */
+
+		kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev,
+					KIBLND_UNMAP_ADDR(rx, rx_msgunmap,
+							  rx->rx_msgaddr),
+					IBLND_MSG_SIZE, DMA_FROM_DEVICE);
+	}
+
+	/* the buffers are unmapped; release the pages that back them */
+	kiblnd_free_pages(conn->ibc_rx_pages);
+	conn->ibc_rx_pages = NULL;
+}
+
+/*
+ * Give every receive descriptor of @conn a message buffer and map it for
+ * DMA (DMA_FROM_DEVICE).
+ *
+ * Buffers are carved out of conn->ibc_rx_pages in IBLND_MSG_SIZE steps; the
+ * walk moves to the next page whenever the offset reaches PAGE_SIZE.  A DMA
+ * mapping error trips an LASSERT, so no status is returned.
+ */
+void
+kiblnd_map_rx_descs(kib_conn_t *conn)
+{
+	kib_rx_t       *rx;
+	struct page    *pg;
+	int	     pg_off;	/* byte offset of the next buffer in 'pg' */
+	int	     ipg;	/* index of the current page */
+	int	     i;
+
+	for (pg_off = ipg = i = 0;
+	     i < IBLND_RX_MSGS(conn->ibc_version); i++) {
+		pg = conn->ibc_rx_pages->ibp_pages[ipg];
+		rx = &conn->ibc_rxs[i];
+
+		rx->rx_conn = conn;
+		rx->rx_msg = (kib_msg_t *)(((char *)page_address(pg)) + pg_off);
+
+		rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev,
+						       rx->rx_msg, IBLND_MSG_SIZE,
+						       DMA_FROM_DEVICE);
+		LASSERT (!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev,
+						   rx->rx_msgaddr));
+		/* remember the address that has to be handed to unmap later */
+		KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr);
+
+		CDEBUG(D_NET,"rx %d: %p "LPX64"("LPX64")\n",
+		       i, rx->rx_msg, rx->rx_msgaddr,
+		       lnet_page2phys(pg) + pg_off);
+
+		pg_off += IBLND_MSG_SIZE;
+		LASSERT (pg_off <= PAGE_SIZE);
+
+		/* current page is used up: continue at the start of the next */
+		if (pg_off == PAGE_SIZE) {
+			pg_off = 0;
+			ipg++;
+			LASSERT (ipg <= IBLND_RX_MSG_PAGES(conn->ibc_version));
+		}
+	}
+}
+
+/*
+ * DMA-unmap the message buffer of every tx descriptor in @tpo and drop the
+ * pool's reference on its HCA device.
+ *
+ * Does nothing beyond the assertion when the pool has no device attached.
+ * No descriptor of the pool may still be allocated.
+ */
+static void
+kiblnd_unmap_tx_pool(kib_tx_pool_t *tpo)
+{
+	kib_hca_dev_t	*hdev = tpo->tpo_hdev;
+	int		n;
+
+	LASSERT (tpo->tpo_pool.po_allocated == 0);
+
+	if (hdev != NULL) {
+		for (n = 0; n < tpo->tpo_pool.po_size; n++) {
+			kib_tx_t *tx = &tpo->tpo_tx_descs[n];
+
+			kiblnd_dma_unmap_single(hdev->ibh_ibdev,
+						KIBLND_UNMAP_ADDR(tx,
+								  tx_msgunmap,
+								  tx->tx_msgaddr),
+						IBLND_MSG_SIZE, DMA_TO_DEVICE);
+		}
+
+		kiblnd_hdev_decref(hdev);
+		tpo->tpo_hdev = NULL;
+	}
+}
+
+/*
+ * Return the HCA device currently attached to @dev, with a reference taken
+ * on it through kiblnd_hdev_addref_locked().
+ *
+ * While dev->ibd_failover is set the function polls: it drops the global
+ * lock, sleeps for cfs_time_seconds(1) / 100, retakes the lock and checks
+ * again, logging a debug message on every 50th pass.  The caller must be
+ * allowed to sleep.
+ */
+static kib_hca_dev_t *
+kiblnd_current_hdev(kib_dev_t *dev)
+{
+	kib_hca_dev_t *hdev;
+	unsigned long  flags;
+	int	    i = 0;
+
+	read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+	while (dev->ibd_failover) {
+		read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+		if (i++ % 50 == 0)
+			CDEBUG(D_NET, "%s: Wait for failover\n",
+			       dev->ibd_ifname);
+		/* A bare schedule_timeout() returns at once while the task
+		 * is still TASK_RUNNING, which turned this wait into a busy
+		 * loop; this variant sets the task state before sleeping. */
+		schedule_timeout_uninterruptible(cfs_time_seconds(1) / 100);
+
+		read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+	}
+
+	/* take the reference while the lock still pins dev->ibd_hdev */
+	kiblnd_hdev_addref_locked(dev->ibd_hdev);
+	hdev = dev->ibd_hdev;
+
+	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	return hdev;
+}
+
+/*
+ * Give every tx descriptor of @tpo a message buffer, map it for DMA
+ * (DMA_TO_DEVICE) and put the descriptor on the pool's free list.
+ *
+ * The pool takes a reference on the current HCA device of its net
+ * (stored in tpo->tpo_hdev).  Buffers are carved out of tpo->tpo_tx_pages
+ * in IBLND_MSG_SIZE steps.  A DMA mapping error trips an LASSERT, so no
+ * status is returned.
+ */
+static void
+kiblnd_map_tx_pool(kib_tx_pool_t *tpo)
+{
+	kib_pages_t    *txpgs = tpo->tpo_tx_pages;
+	kib_pool_t     *pool  = &tpo->tpo_pool;
+	kib_net_t      *net   = pool->po_owner->ps_net;
+	kib_dev_t      *dev;
+	struct page    *page;
+	kib_tx_t       *tx;
+	int	     page_offset;	/* byte offset of next buffer in 'page' */
+	int	     ipage;		/* index of the current page */
+	int	     i;
+
+	LASSERT (net != NULL);
+
+	dev = net->ibn_dev;
+
+	/* pre-mapped messages are not bigger than 1 page */
+	CLASSERT (IBLND_MSG_SIZE <= PAGE_SIZE);
+
+	/* No fancy arithmetic when we do the buffer calculations */
+	CLASSERT (PAGE_SIZE % IBLND_MSG_SIZE == 0);
+
+	/* returns with a reference held; may wait for a failover to finish */
+	tpo->tpo_hdev = kiblnd_current_hdev(dev);
+
+	for (ipage = page_offset = i = 0; i < pool->po_size; i++) {
+		page = txpgs->ibp_pages[ipage];
+		tx = &tpo->tpo_tx_descs[i];
+
+		tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) +
+					   page_offset);
+
+		tx->tx_msgaddr = kiblnd_dma_map_single(
+			tpo->tpo_hdev->ibh_ibdev, tx->tx_msg,
+			IBLND_MSG_SIZE, DMA_TO_DEVICE);
+		LASSERT (!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev,
+						   tx->tx_msgaddr));
+		/* remember the address that has to be handed to unmap later */
+		KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr);
+
+		list_add(&tx->tx_list, &pool->po_free_list);
+
+		page_offset += IBLND_MSG_SIZE;
+		LASSERT (page_offset <= PAGE_SIZE);
+
+		/* current page is used up: continue at the start of the next */
+		if (page_offset == PAGE_SIZE) {
+			page_offset = 0;
+			ipage++;
+			LASSERT (ipage <= txpgs->ibp_npages);
+		}
+	}
+}
+
+/*
+ * Look up the pre-registered MR that covers [@addr, @addr + @size).
+ *
+ * With a single MR that MR is always returned.  Otherwise the address is
+ * turned into an MR index by shifting with hdev->ibh_mr_shift; NULL is
+ * returned when the index is outside the MR array or when the first and
+ * last byte of the range fall into different MRs.
+ */
+struct ib_mr *
+kiblnd_find_dma_mr(kib_hca_dev_t *hdev, __u64 addr, __u64 size)
+{
+	__u64   first;
+	__u64   last;
+
+	LASSERT (hdev->ibh_mrs[0] != NULL);
+
+	if (hdev->ibh_nmrs == 1)
+		return hdev->ibh_mrs[0];
+
+	first = addr >> hdev->ibh_mr_shift;
+	last  = (addr + size - 1) >> hdev->ibh_mr_shift;
+
+	if (first >= hdev->ibh_nmrs || first != last)
+		return NULL;
+
+	return hdev->ibh_mrs[first];
+}
+
+/*
+ * Find one pre-registered MR that covers every fragment of @rd.
+ *
+ * Returns NULL when map-on-demand is enabled and @rd has at least that many
+ * fragments, when @rd has no fragments, when some fragment is not covered
+ * by a single MR, or when two fragments are covered by different MRs.
+ */
+struct ib_mr *
+kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, kib_rdma_desc_t *rd)
+{
+	struct ib_mr *common = NULL;
+	int	   frag;
+
+	LASSERT (hdev->ibh_mrs[0] != NULL);
+
+	if (*kiblnd_tunables.kib_map_on_demand > 0 &&
+	    *kiblnd_tunables.kib_map_on_demand <= rd->rd_nfrags)
+		return NULL;
+
+	if (hdev->ibh_nmrs == 1)
+		return hdev->ibh_mrs[0];
+
+	for (frag = 0; frag < rd->rd_nfrags; frag++) {
+		struct ib_mr *cur;
+
+		cur = kiblnd_find_dma_mr(hdev,
+					 rd->rd_frags[frag].rf_addr,
+					 rd->rd_frags[frag].rf_nob);
+		if (cur == NULL)
+			return NULL;
+
+		if (common == NULL)
+			common = cur;
+		else if (common != cur)
+			return NULL; /* can't be covered by one single MR */
+	}
+
+	return common;
+}
+
+/*
+ * Destroy one FMR pool: destroy the underlying ib_fmr_pool if it exists,
+ * drop the reference on the pool's HCA device if one is attached, and free
+ * the pool descriptor.  The pool must have no mappings outstanding.
+ */
+void
+kiblnd_destroy_fmr_pool(kib_fmr_pool_t *pool)
+{
+	LASSERT (pool->fpo_map_count == 0);
+
+	if (pool->fpo_fmr_pool != NULL) {
+		ib_destroy_fmr_pool(pool->fpo_fmr_pool);
+	}
+
+	if (pool->fpo_hdev != NULL) {
+		kiblnd_hdev_decref(pool->fpo_hdev);
+	}
+
+	LIBCFS_FREE(pool, sizeof(kib_fmr_pool_t));
+}
+
+/*
+ * Unlink and destroy every FMR pool on the list @head, leaving the list
+ * empty.
+ */
+void
+kiblnd_destroy_fmr_pool_list(struct list_head *head)
+{
+	struct list_head *node;
+
+	/* always take the current first entry until only the head is left */
+	for (node = head->next; node != head; node = head->next) {
+		kib_fmr_pool_t *pool = list_entry(node, kib_fmr_pool_t,
+						  fpo_list);
+
+		list_del(&pool->fpo_list);
+		kiblnd_destroy_fmr_pool(pool);
+	}
+}
+
+/*
+ * Per-CPT FMR pool size: the kib_fmr_pool_size tunable divided by @ncpts,
+ * but never less than IBLND_FMR_POOL.
+ */
+static int kiblnd_fmr_pool_size(int ncpts)
+{
+	int per_cpt = *kiblnd_tunables.kib_fmr_pool_size / ncpts;
+
+	if (per_cpt < IBLND_FMR_POOL)
+		return IBLND_FMR_POOL;
+
+	return per_cpt;
+}
+
+/*
+ * Per-CPT FMR flush trigger: the kib_fmr_flush_trigger tunable divided by
+ * @ncpts, but never less than IBLND_FMR_POOL_FLUSH.
+ */
+static int kiblnd_fmr_flush_trigger(int ncpts)
+{
+	int per_cpt = *kiblnd_tunables.kib_fmr_flush_trigger / ncpts;
+
+	if (per_cpt < IBLND_FMR_POOL_FLUSH)
+		return IBLND_FMR_POOL_FLUSH;
+
+	return per_cpt;
+}
+
+/*
+ * Create one FMR pool for the pool set @fps.
+ *
+ * The pool is sized from fps->fps_pool_size / fps->fps_flush_trigger and
+ * is created on the protection domain of the net's current HCA device, on
+ * which the pool holds a reference.
+ *
+ * Returns 0 and stores the pool in *@pp_fpo on success.  Returns -ENOMEM
+ * when the descriptor can't be allocated, or the error reported by
+ * ib_create_fmr_pool(); in both cases nothing is left allocated.
+ */
+int
+kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t **pp_fpo)
+{
+	/* FMR pool for RDMA */
+	struct ib_fmr_pool_param param = {
+		.max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE,
+		.page_shift	= PAGE_SHIFT,
+		.access	    = (IB_ACCESS_LOCAL_WRITE |
+				      IB_ACCESS_REMOTE_WRITE),
+		.pool_size	   = fps->fps_pool_size,
+		.dirty_watermark   = fps->fps_flush_trigger,
+		.flush_function    = NULL,
+		.flush_arg	 = NULL,
+		.cache	     = !!*kiblnd_tunables.kib_fmr_cache};
+	kib_dev_t	*dev = fps->fps_net->ibn_dev;
+	kib_fmr_pool_t	*fpo;
+	int		err;
+
+	LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo));
+	if (fpo == NULL)
+		return -ENOMEM;
+
+	fpo->fpo_hdev = kiblnd_current_hdev(dev);
+	fpo->fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, &param);
+
+	if (!IS_ERR(fpo->fpo_fmr_pool)) {
+		fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+		fpo->fpo_owner    = fps;
+		*pp_fpo = fpo;
+		return 0;
+	}
+
+	err = PTR_ERR(fpo->fpo_fmr_pool);
+	CERROR("Failed to create FMR pool: %d\n", err);
+
+	/* undo the device reference and the descriptor allocation */
+	kiblnd_hdev_decref(fpo->fpo_hdev);
+	LIBCFS_FREE(fpo, sizeof(kib_fmr_pool_t));
+	return err;
+}
+
+/*
+ * Mark every pool of @fps as failed and take it off the active list.
+ *
+ * Pools without outstanding mappings are moved to @zombies; pools that are
+ * still in use go to fps->fps_failed_pool_list.  A pool set whose fps_net
+ * is NULL is treated as never initialized and left alone.
+ */
+static void
+kiblnd_fail_fmr_poolset(kib_fmr_poolset_t *fps, struct list_head *zombies)
+{
+	kib_fmr_pool_t *fpo;
+
+	if (fps->fps_net == NULL) /* initialized? */
+		return;
+
+	spin_lock(&fps->fps_lock);
+
+	while (!list_empty(&fps->fps_pool_list)) {
+		fpo = list_entry(fps->fps_pool_list.next,
+				 kib_fmr_pool_t, fpo_list);
+		fpo->fpo_failed = 1;
+
+		if (fpo->fpo_map_count == 0)
+			list_move(&fpo->fpo_list, zombies);
+		else
+			list_move(&fpo->fpo_list, &fps->fps_failed_pool_list);
+	}
+
+	spin_unlock(&fps->fps_lock);
+}
+
+/*
+ * Destroy every pool of @fps, both on the failed list and on the active
+ * list.  A pool set whose fps_net is NULL is treated as never initialized.
+ */
+static void
+kiblnd_fini_fmr_poolset(kib_fmr_poolset_t *fps)
+{
+	if (fps->fps_net == NULL) /* never initialized */
+		return;
+
+	kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list);
+	kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list);
+}
+
+/*
+ * Initialize the FMR pool set @fps for CPT @cpt of @net and create its
+ * first pool.
+ *
+ * Returns 0 on success or the error from kiblnd_create_fmr_pool(); in the
+ * error case the pool set is initialized but holds no pool.
+ */
+static int
+kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt, kib_net_t *net,
+			int pool_size, int flush_trigger)
+{
+	kib_fmr_pool_t *first;
+	int	     rc;
+
+	memset(fps, 0, sizeof(kib_fmr_poolset_t));
+
+	spin_lock_init(&fps->fps_lock);
+	INIT_LIST_HEAD(&fps->fps_pool_list);
+	INIT_LIST_HEAD(&fps->fps_failed_pool_list);
+
+	fps->fps_net	   = net;
+	fps->fps_cpt	   = cpt;
+	fps->fps_pool_size     = pool_size;
+	fps->fps_flush_trigger = flush_trigger;
+
+	rc = kiblnd_create_fmr_pool(fps, &first);
+	if (rc != 0)
+		return rc;
+
+	list_add_tail(&first->fpo_list, &fps->fps_pool_list);
+	return 0;
+}
+
+/*
+ * Tell whether the FMR pool @fpo may be reclaimed at time @now.
+ *
+ * A pool with outstanding mappings is never idle.  Otherwise a failed pool
+ * is idle immediately, and a healthy one once @now has reached its
+ * deadline.
+ */
+static int
+kiblnd_fmr_pool_is_idle(kib_fmr_pool_t *fpo, cfs_time_t now)
+{
+	if (fpo->fpo_map_count != 0) /* still in use */
+		return 0;
+
+	if (!fpo->fpo_failed)
+		return cfs_time_aftereq(now, fpo->fpo_deadline);
+
+	return 1;
+}
+
+/*
+ * Release the FMR mapping described by @fmr.
+ *
+ * The mapping is returned to its ib_fmr_pool; when @status is non-zero the
+ * pool is flushed as well.  The owning pool's map count is dropped and, in
+ * the same locked section, every idle pool of the pool set except the
+ * first one is collected and afterwards destroyed outside the lock.
+ */
+void
+kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status)
+{
+	LIST_HEAD     (zombies);
+	kib_fmr_pool_t    *fpo = fmr->fmr_pool;
+	kib_fmr_poolset_t *fps = fpo->fpo_owner;
+	cfs_time_t	 now = cfs_time_current();
+	kib_fmr_pool_t    *tmp;
+	int		rc;
+
+	rc = ib_fmr_pool_unmap(fmr->fmr_pfmr);
+	LASSERT (rc == 0);
+
+	if (status != 0) {
+		rc = ib_flush_fmr_pool(fpo->fpo_fmr_pool);
+		LASSERT (rc == 0);
+	}
+
+	fmr->fmr_pool = NULL;
+	fmr->fmr_pfmr = NULL;
+
+	spin_lock(&fps->fps_lock);
+	fpo->fpo_map_count --;  /* decref the pool */
+
+	/* from here on 'fpo' is reused as the list cursor */
+	list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) {
+		/* the first pool is persistent */
+		if (fps->fps_pool_list.next == &fpo->fpo_list)
+			continue;
+
+		if (kiblnd_fmr_pool_is_idle(fpo, now)) {
+			list_move(&fpo->fpo_list, &zombies);
+			/* lets kiblnd_fmr_pool_map() notice the list changed */
+			fps->fps_version ++;
+		}
+	}
+	spin_unlock(&fps->fps_lock);
+
+	/* destroy the reclaimed pools without holding the lock */
+	if (!list_empty(&zombies))
+		kiblnd_destroy_fmr_pool_list(&zombies);
+}
+
+/*
+ * Map @npages physical pages (@pages, I/O virtual address @iov) through
+ * one of the FMR pools of @fps.
+ *
+ * Each pool on the active list is tried in turn.  When every pool answers
+ * -EAGAIN a new pool is created (only one thread does so at a time) and
+ * the search restarts.
+ *
+ * Returns 0 with @fmr filled in on success.  Returns -EAGAIN when all
+ * pools are exhausted and a pool creation failed recently, or any other
+ * error reported by ib_fmr_pool_map_phys().
+ */
+int
+kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, int npages,
+		    __u64 iov, kib_fmr_t *fmr)
+{
+	struct ib_pool_fmr *pfmr;
+	kib_fmr_pool_t     *fpo;
+	__u64	       version;
+	int		 rc;
+
+ again:
+	spin_lock(&fps->fps_lock);
+	/* snapshot used to detect list changes while the lock is dropped */
+	version = fps->fps_version;
+	list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) {
+		fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+		/* counted as in use before the lock is dropped for mapping */
+		fpo->fpo_map_count++;
+		spin_unlock(&fps->fps_lock);
+
+		pfmr = ib_fmr_pool_map_phys(fpo->fpo_fmr_pool,
+					    pages, npages, iov);
+		if (likely(!IS_ERR(pfmr))) {
+			fmr->fmr_pool = fpo;
+			fmr->fmr_pfmr = pfmr;
+			return 0;
+		}
+
+		spin_lock(&fps->fps_lock);
+		fpo->fpo_map_count--;
+		if (PTR_ERR(pfmr) != -EAGAIN) {
+			spin_unlock(&fps->fps_lock);
+			return PTR_ERR(pfmr);
+		}
+
+		/* EAGAIN and ... */
+		/* ... the list changed meanwhile: restart the walk */
+		if (version != fps->fps_version) {
+			spin_unlock(&fps->fps_lock);
+			goto again;
+		}
+	}
+
+	/* every pool returned -EAGAIN; the lock is still held here */
+	if (fps->fps_increasing) {
+		spin_unlock(&fps->fps_lock);
+		CDEBUG(D_NET, "Another thread is allocating new "
+		       "FMR pool, waiting for her to complete\n");
+		schedule();
+		goto again;
+
+	}
+
+	if (cfs_time_before(cfs_time_current(), fps->fps_next_retry)) {
+		/* someone failed recently */
+		spin_unlock(&fps->fps_lock);
+		return -EAGAIN;
+	}
+
+	/* this thread becomes the one that grows the pool set */
+	fps->fps_increasing = 1;
+	spin_unlock(&fps->fps_lock);
+
+	CDEBUG(D_NET, "Allocate new FMR pool\n");
+	rc = kiblnd_create_fmr_pool(fps, &fpo);
+	spin_lock(&fps->fps_lock);
+	fps->fps_increasing = 0;
+	if (rc == 0) {
+		fps->fps_version++;
+		list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
+	} else {
+		/* back off: callers get -EAGAIN until this time has passed */
+		fps->fps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
+	}
+	spin_unlock(&fps->fps_lock);
+
+	goto again;
+}
+
+/*
+ * Common final step of destroying a pool: check that the free list is
+ * empty and that no node is still allocated, and log the event.  Nothing
+ * is freed here.
+ */
+static void
+kiblnd_fini_pool(kib_pool_t *pool)
+{
+	LASSERT (list_empty(&pool->po_free_list));
+	LASSERT (pool->po_allocated == 0);
+
+	CDEBUG(D_NET, "Finalize %s pool\n",
+	       pool->po_owner->ps_name);
+}
+
+/*
+ * Common first step of creating a pool: zero @pool, give it an empty free
+ * list, record its owner @ps and its @size, and set a fresh deadline.
+ */
+static void
+kiblnd_init_pool(kib_poolset_t *ps, kib_pool_t *pool, int size)
+{
+	CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name);
+
+	memset(pool, 0, sizeof(kib_pool_t));
+
+	pool->po_owner    = ps;
+	pool->po_size     = size;
+	pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+	INIT_LIST_HEAD(&pool->po_free_list);
+}
+
+/*
+ * Unlink every pool on the list @head and destroy it through the
+ * ps_pool_destroy callback of its owning pool set, leaving the list empty.
+ */
+void
+kiblnd_destroy_pool_list(struct list_head *head)
+{
+	struct list_head *node;
+
+	/* always take the current first entry until only the head is left */
+	for (node = head->next; node != head; node = head->next) {
+		kib_pool_t *pool = list_entry(node, kib_pool_t, po_list);
+
+		list_del(&pool->po_list);
+
+		LASSERT (pool->po_owner != NULL);
+		pool->po_owner->ps_pool_destroy(pool);
+	}
+}
+
+/*
+ * Mark every pool of @ps as failed and take it off the active list.
+ *
+ * Pools without allocated nodes are moved to @zombies; pools that are
+ * still in use go to ps->ps_failed_pool_list.  A pool set whose ps_net is
+ * NULL is treated as never initialized and left alone.
+ */
+static void
+kiblnd_fail_poolset(kib_poolset_t *ps, struct list_head *zombies)
+{
+	kib_pool_t *po;
+
+	if (ps->ps_net == NULL) /* initialized? */
+		return;
+
+	spin_lock(&ps->ps_lock);
+
+	while (!list_empty(&ps->ps_pool_list)) {
+		po = list_entry(ps->ps_pool_list.next, kib_pool_t, po_list);
+		po->po_failed = 1;
+
+		if (po->po_allocated == 0)
+			list_move(&po->po_list, zombies);
+		else
+			list_move(&po->po_list, &ps->ps_failed_pool_list);
+	}
+
+	spin_unlock(&ps->ps_lock);
+}
+
+/*
+ * Destroy every pool of @ps, both on the failed list and on the active
+ * list.  A pool set whose ps_net is NULL is treated as never initialized.
+ */
+static void
+kiblnd_fini_poolset(kib_poolset_t *ps)
+{
+	if (ps->ps_net == NULL) /* never initialized */
+		return;
+
+	kiblnd_destroy_pool_list(&ps->ps_failed_pool_list);
+	kiblnd_destroy_pool_list(&ps->ps_pool_list);
+}
+
+/*
+ * Initialize the generic pool set @ps for CPT @cpt of @net and create its
+ * first pool.
+ *
+ * @name:	name of the pool set; must fit into ps->ps_name
+ * @size:	number of nodes per pool
+ * @po_create:	callback that creates a pool
+ * @po_destroy:	callback that destroys a pool
+ * @nd_init:	optional callback run when a node is handed out
+ * @nd_fini:	optional callback run when a node is returned
+ *
+ * Returns 0 on success, -E2BIG when @name is too long, or the error
+ * returned by @po_create.  In every error case the pool set is left in a
+ * state that kiblnd_fini_poolset() can handle.
+ */
+static int
+kiblnd_init_poolset(kib_poolset_t *ps, int cpt,
+		    kib_net_t *net, char *name, int size,
+		    kib_ps_pool_create_t po_create,
+		    kib_ps_pool_destroy_t po_destroy,
+		    kib_ps_node_init_t nd_init,
+		    kib_ps_node_fini_t nd_fini)
+{
+	kib_pool_t	*pool;
+	int		rc;
+
+	memset(ps, 0, sizeof(kib_poolset_t));
+
+	/* Set up the lock and both lists before anything can fail:
+	 * kiblnd_fini_poolset() takes a non-NULL ps_net as "initialized"
+	 * and walks the lists, which must not be zero-filled then. */
+	spin_lock_init(&ps->ps_lock);
+	INIT_LIST_HEAD(&ps->ps_pool_list);
+	INIT_LIST_HEAD(&ps->ps_failed_pool_list);
+
+	ps->ps_cpt	    = cpt;
+	ps->ps_net	  = net;
+	ps->ps_pool_create  = po_create;
+	ps->ps_pool_destroy = po_destroy;
+	ps->ps_node_init    = nd_init;
+	ps->ps_node_fini    = nd_fini;
+	ps->ps_pool_size    = size;
+	if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name))
+	    >= sizeof(ps->ps_name))
+		return -E2BIG;
+
+	rc = ps->ps_pool_create(ps, size, &pool);
+	if (rc == 0)
+		list_add(&pool->po_list, &ps->ps_pool_list);
+	else
+		CERROR("Failed to create the first pool for %s\n", ps->ps_name);
+
+	return rc;
+}
+
+/*
+ * Tell whether @pool may be reclaimed at time @now.
+ *
+ * A pool with allocated nodes is never idle.  Otherwise a failed pool is
+ * idle immediately, and a healthy one once @now has reached its deadline.
+ */
+static int
+kiblnd_pool_is_idle(kib_pool_t *pool, cfs_time_t now)
+{
+	if (pool->po_allocated != 0) /* still in use */
+		return 0;
+
+	if (!pool->po_failed)
+		return cfs_time_aftereq(now, pool->po_deadline);
+
+	return 1;
+}
+
+/*
+ * Return @node to the free list of @pool.
+ *
+ * The pool set's optional ps_node_fini callback is run on the node first.
+ * In the same locked section every idle pool of the pool set except the
+ * first one is collected; the collected pools are destroyed after the lock
+ * has been dropped.
+ */
+void
+kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node)
+{
+	LIST_HEAD  (zombies);
+	kib_poolset_t  *ps = pool->po_owner;
+	kib_pool_t     *tmp;
+	cfs_time_t      now = cfs_time_current();
+
+	spin_lock(&ps->ps_lock);
+
+	/* the callback runs with ps_lock held */
+	if (ps->ps_node_fini != NULL)
+		ps->ps_node_fini(pool, node);
+
+	LASSERT (pool->po_allocated > 0);
+	list_add(node, &pool->po_free_list);
+	pool->po_allocated --;
+
+	/* from here on 'pool' is reused as the list cursor */
+	list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) {
+		/* the first pool is persistent */
+		if (ps->ps_pool_list.next == &pool->po_list)
+			continue;
+
+		if (kiblnd_pool_is_idle(pool, now))
+			list_move(&pool->po_list, &zombies);
+	}
+	spin_unlock(&ps->ps_lock);
+
+	/* destroy the reclaimed pools without holding the lock */
+	if (!list_empty(&zombies))
+		kiblnd_destroy_pool_list(&zombies);
+}
+
+/*
+ * Take one node from the pool set @ps.
+ *
+ * The first pool with a non-empty free list supplies the node; the pool
+ * set's optional ps_node_init callback is run on it while ps_lock is still
+ * held.  When every pool is exhausted a new pool is created (only one
+ * thread does so at a time) and the search restarts.
+ *
+ * Returns the node, or NULL when all pools are exhausted and a pool
+ * creation failed recently.
+ */
+struct list_head *
+kiblnd_pool_alloc_node(kib_poolset_t *ps)
+{
+	struct list_head	    *node;
+	kib_pool_t	    *pool;
+	int		    rc;
+
+ again:
+	spin_lock(&ps->ps_lock);
+	list_for_each_entry(pool, &ps->ps_pool_list, po_list) {
+		if (list_empty(&pool->po_free_list))
+			continue;
+
+		pool->po_allocated ++;
+		/* the pool was just used: push its reclaim deadline out */
+		pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+		node = pool->po_free_list.next;
+		list_del(node);
+
+		if (ps->ps_node_init != NULL) {
+			/* still hold the lock */
+			ps->ps_node_init(pool, node);
+		}
+		spin_unlock(&ps->ps_lock);
+		return node;
+	}
+
+	/* no available tx pool and ... */
+	if (ps->ps_increasing) {
+		/* another thread is allocating a new pool */
+		spin_unlock(&ps->ps_lock);
+		CDEBUG(D_NET, "Another thread is allocating new "
+		       "%s pool, waiting for her to complete\n",
+		       ps->ps_name);
+		schedule();
+		goto again;
+	}
+
+	if (cfs_time_before(cfs_time_current(), ps->ps_next_retry)) {
+		/* someone failed recently */
+		spin_unlock(&ps->ps_lock);
+		return NULL;
+	}
+
+	/* this thread becomes the one that grows the pool set */
+	ps->ps_increasing = 1;
+	spin_unlock(&ps->ps_lock);
+
+	CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name);
+
+	rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool);
+
+	spin_lock(&ps->ps_lock);
+	ps->ps_increasing = 0;
+	if (rc == 0) {
+		list_add_tail(&pool->po_list, &ps->ps_pool_list);
+	} else {
+		/* back off: callers get NULL until this time has passed */
+		ps->ps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
+		CERROR("Can't allocate new %s pool because out of memory\n",
+		       ps->ps_name);
+	}
+	spin_unlock(&ps->ps_lock);
+
+	goto again;
+}
+
+/*
+ * Release the physical MR descriptor @pmr: detach its memory region, hand
+ * the descriptor back to its pool and then deregister the region, if one
+ * was attached.
+ */
+void
+kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr)
+{
+	struct ib_mr	*region = pmr->pmr_mr;
+	kib_pmr_pool_t	*owner  = pmr->pmr_pool;
+
+	/* detach first: a descriptor on the free list carries no MR */
+	pmr->pmr_mr = NULL;
+	kiblnd_pool_free_node(&owner->ppo_pool, &pmr->pmr_list);
+
+	if (region == NULL)
+		return;
+
+	ib_dereg_mr(region);
+}
+
+/*
+ * Register the fragments of @rd as one physical MR on @hdev, using a
+ * descriptor taken from the pool set @pps.
+ *
+ * Returns 0 on success; the descriptor is stored in *@pp_pmr and the I/O
+ * virtual address in *@iova is recorded in it.  Returns -ENOMEM when no
+ * descriptor is available, -EAGAIN when the descriptor's pool belongs to a
+ * different HCA device than @hdev, or the error from ib_reg_phys_mr().  On
+ * every error the descriptor is back in its pool.
+ */
+int
+kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_hca_dev_t *hdev,
+		    kib_rdma_desc_t *rd, __u64 *iova, kib_phys_mr_t **pp_pmr)
+{
+	struct list_head    *node;
+	kib_phys_mr_t *pmr;
+	int	    frag;
+	int	    rc;
+
+	node = kiblnd_pool_alloc_node(&pps->pps_poolset);
+	if (node == NULL) {
+		CERROR("Failed to allocate PMR descriptor\n");
+		return -ENOMEM;
+	}
+
+	pmr = container_of(node, kib_phys_mr_t, pmr_list);
+	if (pmr->pmr_pool->ppo_hdev != hdev) {
+		kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node);
+		return -EAGAIN;
+	}
+
+	/* describe each fragment as one physical buffer */
+	for (frag = 0; frag < rd->rd_nfrags; frag++) {
+		pmr->pmr_ipb[frag].addr = rd->rd_frags[frag].rf_addr;
+		pmr->pmr_ipb[frag].size = rd->rd_frags[frag].rf_nob;
+	}
+
+	pmr->pmr_mr = ib_reg_phys_mr(hdev->ibh_pd,
+				     pmr->pmr_ipb, rd->rd_nfrags,
+				     IB_ACCESS_LOCAL_WRITE |
+				     IB_ACCESS_REMOTE_WRITE,
+				     iova);
+	if (IS_ERR(pmr->pmr_mr)) {
+		rc = PTR_ERR(pmr->pmr_mr);
+		CERROR("Failed ib_reg_phys_mr: %d\n", rc);
+
+		pmr->pmr_mr = NULL;
+		kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node);
+		return rc;
+	}
+
+	pmr->pmr_iova = *iova;
+	*pp_pmr = pmr;
+	return 0;
+}
+
+/*
+ * Destroy a PMR pool: free every descriptor on its free list together
+ * with the descriptor's physical-buffer array, drop the reference on the
+ * pool's HCA device if one is attached, and free the pool itself.
+ *
+ * No descriptor may still be allocated, and no descriptor on the free
+ * list may still carry a memory region.
+ */
+static void
+kiblnd_destroy_pmr_pool(kib_pool_t *pool)
+{
+	kib_pmr_pool_t *ppo = container_of(pool, kib_pmr_pool_t, ppo_pool);
+	kib_phys_mr_t  *pmr;
+	struct list_head *node;
+
+	LASSERT (pool->po_allocated == 0);
+
+	for (node = pool->po_free_list.next; node != &pool->po_free_list;
+	     node = pool->po_free_list.next) {
+		pmr = list_entry(node, kib_phys_mr_t, pmr_list);
+
+		LASSERT (pmr->pmr_mr == NULL);
+		list_del(&pmr->pmr_list);
+
+		if (pmr->pmr_ipb != NULL) {
+			LIBCFS_FREE(pmr->pmr_ipb,
+				    IBLND_MAX_RDMA_FRAGS *
+				    sizeof(struct ib_phys_buf));
+		}
+
+		LIBCFS_FREE(pmr, sizeof(kib_phys_mr_t));
+	}
+
+	kiblnd_fini_pool(pool);
+
+	if (ppo->ppo_hdev != NULL) {
+		kiblnd_hdev_decref(ppo->ppo_hdev);
+	}
+
+	LIBCFS_FREE(ppo, sizeof(kib_pmr_pool_t));
+}
+
+/*
+ * Per-CPT PMR pool size: the kib_pmr_pool_size tunable divided by @ncpts,
+ * but never less than IBLND_PMR_POOL.
+ */
+static inline int kiblnd_pmr_pool_size(int ncpts)
+{
+	int per_cpt = *kiblnd_tunables.kib_pmr_pool_size / ncpts;
+
+	if (per_cpt < IBLND_PMR_POOL)
+		return IBLND_PMR_POOL;
+
+	return per_cpt;
+}
+
+/*
+ * Create a PMR pool of @size descriptors for the pool set @ps.
+ *
+ * Every descriptor gets an array of IBLND_MAX_RDMA_FRAGS physical-buffer
+ * entries and is put on the pool's free list.  On success the pool takes a
+ * reference on the net's current HCA device.
+ *
+ * Returns 0 and stores the pool in *@pp_po on success, -ENOMEM otherwise;
+ * on failure everything allocated so far is released.
+ */
+static int
+kiblnd_create_pmr_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
+{
+	struct kib_pmr_pool	*ppo;
+	struct kib_pool		*pool;
+	kib_phys_mr_t		*pmr;
+	int			i;
+
+	LIBCFS_CPT_ALLOC(ppo, lnet_cpt_table(),
+			 ps->ps_cpt, sizeof(kib_pmr_pool_t));
+	if (ppo == NULL) {
+		CERROR("Failed to allocate PMR pool\n");
+		return -ENOMEM;
+	}
+
+	pool = &ppo->ppo_pool;
+	kiblnd_init_pool(ps, pool, size);
+
+	for (i = 0; i < size; i++) {
+		LIBCFS_CPT_ALLOC(pmr, lnet_cpt_table(),
+				 ps->ps_cpt, sizeof(kib_phys_mr_t));
+		if (pmr == NULL)
+			break;
+
+		pmr->pmr_pool = ppo;
+		LIBCFS_CPT_ALLOC(pmr->pmr_ipb, lnet_cpt_table(), ps->ps_cpt,
+				 IBLND_MAX_RDMA_FRAGS * sizeof(*pmr->pmr_ipb));
+		if (pmr->pmr_ipb == NULL) {
+			/* this descriptor is not on the free list yet, so
+			 * the destroy callback below would not find it:
+			 * free it here instead of leaking it */
+			LIBCFS_FREE(pmr, sizeof(kib_phys_mr_t));
+			break;
+		}
+
+		list_add(&pmr->pmr_list, &pool->po_free_list);
+	}
+
+	if (i < size) {
+		/* frees the descriptors on the free list and the pool */
+		ps->ps_pool_destroy(pool);
+		return -ENOMEM;
+	}
+
+	ppo->ppo_hdev = kiblnd_current_hdev(ps->ps_net->ibn_dev);
+	*pp_po = pool;
+	return 0;
+}
+
+/*
+ * Destroy a tx pool: unmap and free its message pages, free the per-tx
+ * arrays of every descriptor, the descriptor array and the pool itself.
+ *
+ * Also used by kiblnd_create_tx_pool() to clean up a partially built
+ * pool, so every member is checked before it is released.  No descriptor
+ * may still be allocated.
+ */
+static void
+kiblnd_destroy_tx_pool(kib_pool_t *pool)
+{
+	kib_tx_pool_t  *tpo = container_of(pool, kib_tx_pool_t, tpo_pool);
+	int	     i;
+
+	LASSERT (pool->po_allocated == 0);
+
+	if (tpo->tpo_tx_pages != NULL) {
+		kiblnd_unmap_tx_pool(tpo);
+		kiblnd_free_pages(tpo->tpo_tx_pages);
+	}
+
+	if (tpo->tpo_tx_descs == NULL)
+		goto out;
+
+	for (i = 0; i < pool->po_size; i++) {
+		kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+
+		/* Descriptors are linked onto the free list only by
+		 * kiblnd_map_tx_pool().  When kiblnd_create_tx_pool() fails
+		 * before that point they are still zero-filled, and
+		 * list_del() on such an entry would dereference NULL. */
+		if (tx->tx_list.next != NULL)
+			list_del(&tx->tx_list);
+		if (tx->tx_pages != NULL)
+			LIBCFS_FREE(tx->tx_pages,
+				    LNET_MAX_IOV *
+				    sizeof(*tx->tx_pages));
+		if (tx->tx_frags != NULL)
+			LIBCFS_FREE(tx->tx_frags,
+				    IBLND_MAX_RDMA_FRAGS *
+					    sizeof(*tx->tx_frags));
+		if (tx->tx_wrq != NULL)
+			LIBCFS_FREE(tx->tx_wrq,
+				    (1 + IBLND_MAX_RDMA_FRAGS) *
+				    sizeof(*tx->tx_wrq));
+		if (tx->tx_sge != NULL)
+			LIBCFS_FREE(tx->tx_sge,
+				    (1 + IBLND_MAX_RDMA_FRAGS) *
+				    sizeof(*tx->tx_sge));
+		if (tx->tx_rd != NULL)
+			LIBCFS_FREE(tx->tx_rd,
+				    offsetof(kib_rdma_desc_t,
+					     rd_frags[IBLND_MAX_RDMA_FRAGS]));
+	}
+
+	LIBCFS_FREE(tpo->tpo_tx_descs,
+		    pool->po_size * sizeof(kib_tx_t));
+out:
+	kiblnd_fini_pool(pool);
+	LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t));
+}
+
+/*
+ * Per-CPT tx pool size: the kib_ntx tunable divided by @ncpts, but never
+ * less than IBLND_TX_POOL.
+ */
+static int kiblnd_tx_pool_size(int ncpts)
+{
+	int per_cpt = *kiblnd_tunables.kib_ntx / ncpts;
+
+	if (per_cpt < IBLND_TX_POOL)
+		return IBLND_TX_POOL;
+
+	return per_cpt;
+}
+
+/*
+ * Create a tx pool of @size descriptors for the pool set @ps.
+ *
+ * Allocates the message pages, the descriptor array and the per-tx arrays
+ * (tx_pages only when the net has FMR pool sets; tx_frags, tx_wrq, tx_sge
+ * and tx_rd always), then maps the message buffers with
+ * kiblnd_map_tx_pool().
+ *
+ * Returns 0 and stores the pool in *@pp_po on success, -ENOMEM otherwise;
+ * on failure everything allocated so far is released.
+ */
+static int
+kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
+{
+	int	    i;
+	int	    npg;	/* pages needed for 'size' message buffers */
+	kib_pool_t    *pool;
+	kib_tx_pool_t *tpo;
+
+	LIBCFS_CPT_ALLOC(tpo, lnet_cpt_table(), ps->ps_cpt, sizeof(*tpo));
+	if (tpo == NULL) {
+		CERROR("Failed to allocate TX pool\n");
+		return -ENOMEM;
+	}
+
+	pool = &tpo->tpo_pool;
+	kiblnd_init_pool(ps, pool, size);
+	tpo->tpo_tx_descs = NULL;
+	tpo->tpo_tx_pages = NULL;
+
+	/* round up to whole pages */
+	npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE;
+	if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg) != 0) {
+		CERROR("Can't allocate tx pages: %d\n", npg);
+		/* nothing but the pool descriptor exists at this point */
+		LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t));
+		return -ENOMEM;
+	}
+
+	LIBCFS_CPT_ALLOC(tpo->tpo_tx_descs, lnet_cpt_table(), ps->ps_cpt,
+			 size * sizeof(kib_tx_t));
+	if (tpo->tpo_tx_descs == NULL) {
+		CERROR("Can't allocate %d tx descriptors\n", size);
+		ps->ps_pool_destroy(pool);
+		return -ENOMEM;
+	}
+
+	memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t));
+
+	/* an allocation failure leaves the loop early with i < size */
+	for (i = 0; i < size; i++) {
+		kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+
+		tx->tx_pool = tpo;
+		/* tx_pages is only allocated when FMR pool sets exist */
+		if (ps->ps_net->ibn_fmr_ps != NULL) {
+			LIBCFS_CPT_ALLOC(tx->tx_pages,
+					 lnet_cpt_table(), ps->ps_cpt,
+					 LNET_MAX_IOV * sizeof(*tx->tx_pages));
+			if (tx->tx_pages == NULL)
+				break;
+		}
+
+		LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt,
+				 IBLND_MAX_RDMA_FRAGS * sizeof(*tx->tx_frags));
+		if (tx->tx_frags == NULL)
+			break;
+
+		sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS);
+
+		LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt,
+				 (1 + IBLND_MAX_RDMA_FRAGS) *
+				 sizeof(*tx->tx_wrq));
+		if (tx->tx_wrq == NULL)
+			break;
+
+		LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt,
+				 (1 + IBLND_MAX_RDMA_FRAGS) *
+				 sizeof(*tx->tx_sge));
+		if (tx->tx_sge == NULL)
+			break;
+
+		LIBCFS_CPT_ALLOC(tx->tx_rd, lnet_cpt_table(), ps->ps_cpt,
+				 offsetof(kib_rdma_desc_t,
+					  rd_frags[IBLND_MAX_RDMA_FRAGS]));
+		if (tx->tx_rd == NULL)
+			break;
+	}
+
+	/* all descriptors complete: map the buffers and publish the pool */
+	if (i == size) {
+		kiblnd_map_tx_pool(tpo);
+		*pp_po = pool;
+		return 0;
+	}
+
+	/* the destroy callback releases whatever has been allocated */
+	ps->ps_pool_destroy(pool);
+	return -ENOMEM;
+}
+
+/*
+ * Node-init callback of the tx pool set: give the tx behind @node the
+ * next cookie of the pool set that owns @pool, and advance that counter.
+ */
+static void
+kiblnd_tx_init(kib_pool_t *pool, struct list_head *node)
+{
+	kib_tx_t	 *tx = list_entry(node, kib_tx_t, tx_list);
+	kib_tx_poolset_t *tps;
+
+	tps = container_of(pool->po_owner, kib_tx_poolset_t, tps_poolset);
+
+	tx->tx_cookie = tps->tps_next_tx_cookie;
+	tps->tps_next_tx_cookie++;
+}
+
+/*
+ * Tear down all pool sets of @net.
+ *
+ * For every CPT the tx, FMR and PMR pool sets are finalized, as far as the
+ * corresponding per-CPT array exists; afterwards the arrays themselves are
+ * freed and their pointers cleared.
+ */
+void
+kiblnd_net_fini_pools(kib_net_t *net)
+{
+	int	cpt;
+
+	cfs_cpt_for_each(cpt, lnet_cpt_table()) {
+		if (net->ibn_tx_ps != NULL)
+			kiblnd_fini_poolset(&net->ibn_tx_ps[cpt]->tps_poolset);
+
+		if (net->ibn_fmr_ps != NULL)
+			kiblnd_fini_fmr_poolset(net->ibn_fmr_ps[cpt]);
+
+		if (net->ibn_pmr_ps != NULL)
+			kiblnd_fini_poolset(&net->ibn_pmr_ps[cpt]->pps_poolset);
+	}
+
+	if (net->ibn_tx_ps != NULL) {
+		cfs_percpt_free(net->ibn_tx_ps);
+		net->ibn_tx_ps = NULL;
+	}
+
+	if (net->ibn_fmr_ps != NULL) {
+		cfs_percpt_free(net->ibn_fmr_ps);
+		net->ibn_fmr_ps = NULL;
+	}
+
+	if (net->ibn_pmr_ps != NULL) {
+		cfs_percpt_free(net->ibn_pmr_ps);
+		net->ibn_pmr_ps = NULL;
+	}
+}
+
+/*
+ * Create the per-CPT pool sets of @net.
+ *
+ * @cpts:  array of @ncpts CPT ids, or NULL to use CPTs 0 .. @ncpts - 1
+ *
+ * When map-on-demand is disabled and the device has exactly one MR, only
+ * the tx pool sets are created.  Otherwise FMR pool sets are created
+ * first; if the very first one reports -ENOSYS the FMR array is dropped
+ * and PMR pool sets are created instead.  The tx pool sets always come
+ * last.
+ *
+ * Returns 0 on success or a negative errno; on failure everything created
+ * so far is torn down with kiblnd_net_fini_pools().
+ */
+int
+kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts)
+{
+	unsigned long	flags;
+	int		cpt;
+	int		rc;
+	int		i;
+
+	read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+	/* no map-on-demand and a single MR: FMR/PMR pools are skipped */
+	if (*kiblnd_tunables.kib_map_on_demand == 0 &&
+	    net->ibn_dev->ibd_hdev->ibh_nmrs == 1) {
+		read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+					   flags);
+		goto create_tx_pool;
+	}
+
+	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	if (*kiblnd_tunables.kib_fmr_pool_size <
+	    *kiblnd_tunables.kib_ntx / 4) {
+		CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n",
+		       *kiblnd_tunables.kib_fmr_pool_size,
+		       *kiblnd_tunables.kib_ntx / 4);
+		rc = -EINVAL;
+		goto failed;
+	}
+
+	/* TX pool must be created later than FMR/PMR, see LU-2268
+	 * for details */
+	LASSERT(net->ibn_tx_ps == NULL);
+
+	/* premapping can fail if ibd_nmr > 1, so we always create
+	 * FMR/PMR pool and map-on-demand if premapping failed */
+
+	net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(),
+					   sizeof(kib_fmr_poolset_t));
+	if (net->ibn_fmr_ps == NULL) {
+		CERROR("Failed to allocate FMR pool array\n");
+		rc = -ENOMEM;
+		goto failed;
+	}
+
+	for (i = 0; i < ncpts; i++) {
+		cpt = (cpts == NULL) ? i : cpts[i];
+		rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, net,
+					     kiblnd_fmr_pool_size(ncpts),
+					     kiblnd_fmr_flush_trigger(ncpts));
+		/* -ENOSYS is only tolerated for the first pool set */
+		if (rc == -ENOSYS && i == 0) /* no FMR */
+			break; /* create PMR pool */
+
+		if (rc != 0) { /* a real error */
+			CERROR("Can't initialize FMR pool for CPT %d: %d\n",
+			       cpt, rc);
+			goto failed;
+		}
+	}
+
+	/* i > 0 means the loop ran to completion: FMR is in use */
+	if (i > 0) {
+		LASSERT(i == ncpts);
+		goto create_tx_pool;
+	}
+
+	/* no FMR support: drop the FMR array and fall back to PMR */
+	cfs_percpt_free(net->ibn_fmr_ps);
+	net->ibn_fmr_ps = NULL;
+
+	CWARN("Device does not support FMR, failing back to PMR\n");
+
+	if (*kiblnd_tunables.kib_pmr_pool_size <
+	    *kiblnd_tunables.kib_ntx / 4) {
+		CERROR("Can't set pmr pool size (%d) < ntx / 4(%d)\n",
+		       *kiblnd_tunables.kib_pmr_pool_size,
+		       *kiblnd_tunables.kib_ntx / 4);
+		rc = -EINVAL;
+		goto failed;
+	}
+
+	net->ibn_pmr_ps = cfs_percpt_alloc(lnet_cpt_table(),
+					   sizeof(kib_pmr_poolset_t));
+	if (net->ibn_pmr_ps == NULL) {
+		CERROR("Failed to allocate PMR pool array\n");
+		rc = -ENOMEM;
+		goto failed;
+	}
+
+	for (i = 0; i < ncpts; i++) {
+		cpt = (cpts == NULL) ? i : cpts[i];
+		rc = kiblnd_init_poolset(&net->ibn_pmr_ps[cpt]->pps_poolset,
+					 cpt, net, "PMR",
+					 kiblnd_pmr_pool_size(ncpts),
+					 kiblnd_create_pmr_pool,
+					 kiblnd_destroy_pmr_pool, NULL, NULL);
+		if (rc != 0) {
+			CERROR("Can't initialize PMR pool for CPT %d: %d\n",
+			       cpt, rc);
+			goto failed;
+		}
+	}
+
+ create_tx_pool:
+	net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(),
+					  sizeof(kib_tx_poolset_t));
+	if (net->ibn_tx_ps == NULL) {
+		CERROR("Failed to allocate tx pool array\n");
+		rc = -ENOMEM;
+		goto failed;
+	}
+
+	for (i = 0; i < ncpts; i++) {
+		cpt = (cpts == NULL) ? i : cpts[i];
+		rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset,
+					 cpt, net, "TX",
+					 kiblnd_tx_pool_size(ncpts),
+					 kiblnd_create_tx_pool,
+					 kiblnd_destroy_tx_pool,
+					 kiblnd_tx_init, NULL);
+		if (rc != 0) {
+			CERROR("Can't initialize TX pool for CPT %d: %d\n",
+			       cpt, rc);
+			goto failed;
+		}
+	}
+
+	return 0;
+ failed:
+	/* tears down whatever subset of pool sets exists */
+	kiblnd_net_fini_pools(net);
+	LASSERT(rc != 0);
+	return rc;
+}
+
+/*
+ * Fill in the page geometry and the MR size parameters of @hdev.
+ *
+ * The page shift/size/mask are taken from the system page size.  The
+ * maximum MR size is queried from the device, and ibh_mr_shift is set to
+ * 64 when that size is ~0ULL, or else to the shift s for which the size
+ * equals 2^s or 2^s - 1.
+ *
+ * Returns 0 on success, -ENOMEM when the attribute buffer can't be
+ * allocated, the error from ib_query_device(), or -EINVAL when the MR size
+ * matches no shift.
+ */
+static int
+kiblnd_hdev_get_attr(kib_hca_dev_t *hdev)
+{
+	struct ib_device_attr *attr;
+	int		    err;
+
+	/* It's safe to assume a HCA can handle a page size
+	 * matching that of the native system */
+	hdev->ibh_page_shift = PAGE_SHIFT;
+	hdev->ibh_page_size  = 1 << PAGE_SHIFT;
+	hdev->ibh_page_mask  = ~((__u64)hdev->ibh_page_size - 1);
+
+	LIBCFS_ALLOC(attr, sizeof(*attr));
+	if (attr == NULL) {
+		CERROR("Out of memory\n");
+		return -ENOMEM;
+	}
+
+	err = ib_query_device(hdev->ibh_ibdev, attr);
+	if (err == 0)
+		hdev->ibh_mr_size = attr->max_mr_size;
+
+	/* the buffer is only needed for the query itself */
+	LIBCFS_FREE(attr, sizeof(*attr));
+
+	if (err != 0) {
+		CERROR("Failed to query IB device: %d\n", err);
+		return err;
+	}
+
+	if (hdev->ibh_mr_size == ~0ULL) {
+		hdev->ibh_mr_shift = 64;
+		return 0;
+	}
+
+	hdev->ibh_mr_shift = 0;
+	while (hdev->ibh_mr_shift < 64) {
+		unsigned long long span = 1ULL << hdev->ibh_mr_shift;
+
+		if (hdev->ibh_mr_size == span ||
+		    hdev->ibh_mr_size == span - 1)
+			return 0;
+
+		hdev->ibh_mr_shift++;
+	}
+
+	CERROR("Invalid mr size: "LPX64"\n", hdev->ibh_mr_size);
+	return -EINVAL;
+}
+
+/*
+ * Deregister the MRs of @hdev and free the MR table.
+ *
+ * Deregistration stops at the first NULL slot, so a partially filled
+ * table is handled.  Does nothing when there is no table.
+ */
+void
+kiblnd_hdev_cleanup_mrs(kib_hca_dev_t *hdev)
+{
+	int     n;
+
+	if (hdev->ibh_nmrs == 0 || hdev->ibh_mrs == NULL)
+		return;
+
+	for (n = 0; n < hdev->ibh_nmrs && hdev->ibh_mrs[n] != NULL; n++)
+		ib_dereg_mr(hdev->ibh_mrs[n]);
+
+	LIBCFS_FREE(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
+	hdev->ibh_mrs  = NULL;
+	hdev->ibh_nmrs = 0;
+}
+
+/*
+ * Destroy the HCA device descriptor @hdev: release its MRs, then its
+ * protection domain and its CM id where present, and finally free the
+ * descriptor.
+ */
+void
+kiblnd_hdev_destroy(kib_hca_dev_t *hdev)
+{
+	/* the MRs are released before the protection domain */
+	kiblnd_hdev_cleanup_mrs(hdev);
+
+	if (hdev->ibh_pd != NULL) {
+		ib_dealloc_pd(hdev->ibh_pd);
+	}
+
+	if (hdev->ibh_cmid != NULL) {
+		rdma_destroy_id(hdev->ibh_cmid);
+	}
+
+	LIBCFS_FREE(hdev, sizeof(*hdev));
+}
+
+/*
+ * Register the global memory region(s) of @hdev.
+ *
+ * When the device reports an unlimited MR size (ibh_mr_shift == 64) a
+ * single DMA MR is obtained with ib_get_dma_mr().  Otherwise an array of
+ * physical MRs, each 2^ibh_mr_shift bytes apart, is registered to cover
+ * the range from PAGE_OFFSET up to high_memory.
+ *
+ * Returns 0 on success or a negative errno; MRs registered before a
+ * failure are cleaned up again.
+ */
+int
+kiblnd_hdev_setup_mrs(kib_hca_dev_t *hdev)
+{
+	struct ib_mr *mr;
+	int	   i;
+	int	   rc;
+	__u64	 mm_size;	/* bytes of memory to be covered */
+	__u64	 mr_size;	/* distance between two MRs */
+	int	   acflags = IB_ACCESS_LOCAL_WRITE |
+				IB_ACCESS_REMOTE_WRITE;
+
+	rc = kiblnd_hdev_get_attr(hdev);
+	if (rc != 0)
+		return rc;
+
+	/* unlimited MR size: one DMA MR covers everything */
+	if (hdev->ibh_mr_shift == 64) {
+		LIBCFS_ALLOC(hdev->ibh_mrs, 1 * sizeof(*hdev->ibh_mrs));
+		if (hdev->ibh_mrs == NULL) {
+			CERROR("Failed to allocate MRs table\n");
+			return -ENOMEM;
+		}
+
+		hdev->ibh_mrs[0] = NULL;
+		hdev->ibh_nmrs   = 1;
+
+		mr = ib_get_dma_mr(hdev->ibh_pd, acflags);
+		if (IS_ERR(mr)) {
+			CERROR("Failed ib_get_dma_mr : %ld\n", PTR_ERR(mr));
+			kiblnd_hdev_cleanup_mrs(hdev);
+			return PTR_ERR(mr);
+		}
+
+		hdev->ibh_mrs[0] = mr;
+
+		goto out;
+	}
+
+	mr_size = (1ULL << hdev->ibh_mr_shift);
+	mm_size = (unsigned long)high_memory - PAGE_OFFSET;
+
+	/* number of MRs needed to cover mm_size, rounded up */
+	hdev->ibh_nmrs = (int)((mm_size + mr_size - 1) >> hdev->ibh_mr_shift);
+
+	if (hdev->ibh_mr_shift < 32 || hdev->ibh_nmrs > 1024) {
+		/* it's 4T..., assume we will re-code at that time */
+		CERROR("Can't support memory size: x"LPX64
+		       " with MR size: x"LPX64"\n", mm_size, mr_size);
+		return -EINVAL;
+	}
+
+	/* create an array of MRs to cover all memory */
+	LIBCFS_ALLOC(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
+	if (hdev->ibh_mrs == NULL) {
+		CERROR("Failed to allocate MRs' table\n");
+		return -ENOMEM;
+	}
+
+	/* NULL slots let kiblnd_hdev_cleanup_mrs() stop at the first
+	 * unregistered entry */
+	memset(hdev->ibh_mrs, 0, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
+
+	for (i = 0; i < hdev->ibh_nmrs; i++) {
+		struct ib_phys_buf ipb;
+		__u64	      iova;
+
+		ipb.size = hdev->ibh_mr_size;
+		ipb.addr = i * mr_size;
+		iova     = ipb.addr;
+
+		mr = ib_reg_phys_mr(hdev->ibh_pd, &ipb, 1, acflags, &iova);
+		if (IS_ERR(mr)) {
+			CERROR("Failed ib_reg_phys_mr addr "LPX64
+			       " size "LPX64" : %ld\n",
+			       ipb.addr, ipb.size, PTR_ERR(mr));
+			kiblnd_hdev_cleanup_mrs(hdev);
+			return PTR_ERR(mr);
+		}
+
+		/* the region must be mapped at the address asked for */
+		LASSERT (iova == ipb.addr);
+
+		hdev->ibh_mrs[i] = mr;
+	}
+
+out:
+	/* only announce the MR array when it is not the plain single-MR,
+	 * unlimited-size case */
+	if (hdev->ibh_mr_size != ~0ULL || hdev->ibh_nmrs != 1)
+		LCONSOLE_INFO("Register global MR array, MR size: "
+			      LPX64", array size: %d\n",
+			      hdev->ibh_mr_size, hdev->ibh_nmrs);
+	return 0;
+}
+
+static int kiblnd_dummy_callback(struct rdma_cm_id *cmid,
+				 struct rdma_cm_event *event)
+{
+	return 0;	/* placeholder handler: every CM event is ignored */
+}
+
+/* Probe whether @dev's IPoIB address is now served by a different HCA.
+ * Returns 1 when failover is needed, 0 when it is not, else an error code. */
+static int
+kiblnd_dev_need_failover(kib_dev_t *dev)
+{
+	struct rdma_cm_id  *cmid;
+	struct sockaddr_in  srcaddr;
+	struct sockaddr_in  dstaddr;
+	int		 rc;
+
+	if (dev->ibd_hdev == NULL || /* initializing */
+	    dev->ibd_hdev->ibh_cmid == NULL || /* listener is dead */
+	    *kiblnd_tunables.kib_dev_failover > 1) /* debugging */
+		return 1;
+
+	/* XXX: it's UGLY, but I don't have better way to find
+	 * ib-bonding HCA failover because:
+	 *
+	 * a. no reliable CM event for HCA failover...
+	 * b. no OFED API to get ib_device for current net_device...
+	 *
+	 * We have only two choices at this point:
+	 *
+	 * a. rdma_bind_addr(), it will conflict with listener cmid
+	 * b. rdma_resolve_addr() to zero addr */
+	cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP,
+				     IB_QPT_RC);
+	if (IS_ERR(cmid)) {
+		rc = PTR_ERR(cmid);
+		CERROR("Failed to create cmid for failover: %d\n", rc);
+		return rc;
+	}
+
+	memset(&srcaddr, 0, sizeof(srcaddr));
+	srcaddr.sin_family      = AF_INET;
+	srcaddr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
+
+	memset(&dstaddr, 0, sizeof(dstaddr));
+	dstaddr.sin_family = AF_INET;
+	rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr,
+			       (struct sockaddr *)&dstaddr, 1);
+	if (rc != 0 || cmid->device == NULL) {
+		CERROR("Failed to bind %s:%u.%u.%u.%u to device(%p): %d\n",
+		       dev->ibd_ifname, HIPQUAD(dev->ibd_ifip),
+		       cmid->device, rc);
+		rdma_destroy_id(cmid);
+		return rc;
+	}
+
+	/* Failover is needed iff the address now binds to another HCA.  The
+	 * probe cmid is released on both outcomes so it is never leaked. */
+	rc = dev->ibd_hdev->ibh_ibdev != cmid->device;
+	rdma_destroy_id(cmid);
+	return rc;
+}
+
+/* (Re)create the listener and HCA state of @dev on whichever HCA now serves
+ * its IPoIB address, then fail the pools built on the previous HCA.
+ * Returns 0 (also when no failover is needed) or an error code. */
+int
+kiblnd_dev_failover(kib_dev_t *dev)
+{
+	LIST_HEAD      (zombie_tpo);
+	LIST_HEAD      (zombie_ppo);
+	LIST_HEAD      (zombie_fpo);
+	struct rdma_cm_id  *cmid  = NULL;
+	kib_hca_dev_t      *hdev  = NULL;
+	kib_hca_dev_t      *old;
+	struct ib_pd       *pd;
+	kib_net_t	  *net;
+	struct sockaddr_in  addr;
+	unsigned long       flags;
+	int		 rc = 0;
+	int		    i;
+
+	LASSERT (*kiblnd_tunables.kib_dev_failover > 1 ||
+		 dev->ibd_can_failover ||
+		 dev->ibd_hdev == NULL);
+
+	rc = kiblnd_dev_need_failover(dev);
+	if (rc <= 0)
+		goto out;
+
+	if (dev->ibd_hdev != NULL &&
+	    dev->ibd_hdev->ibh_cmid != NULL) {
+		/* XXX it's not good to close old listener at here,
+		 * because we can fail to create new listener.
+		 * But we have to close it now, otherwise rdma_bind_addr
+		 * will return EADDRINUSE... How crap! */
+		write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+		cmid = dev->ibd_hdev->ibh_cmid;
+		/* make next schedule of kiblnd_dev_need_failover()
+		 * return 1 for me */
+		dev->ibd_hdev->ibh_cmid  = NULL;
+		write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+		rdma_destroy_id(cmid);
+	}
+
+	cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP,
+				     IB_QPT_RC);
+	if (IS_ERR(cmid)) {
+		rc = PTR_ERR(cmid);
+		CERROR("Failed to create cmid for failover: %d\n", rc);
+		goto out;
+	}
+
+	memset(&addr, 0, sizeof(addr));
+	addr.sin_family      = AF_INET;
+	addr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
+	addr.sin_port	= htons(*kiblnd_tunables.kib_service);
+
+	/* Bind to failover device or port */
+	rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr);
+	if (rc != 0 || cmid->device == NULL) {
+		CERROR("Failed to bind %s:%u.%u.%u.%u to device(%p): %d\n",
+		       dev->ibd_ifname, HIPQUAD(dev->ibd_ifip),
+		       cmid->device, rc);
+		rdma_destroy_id(cmid);
+		rc = rc != 0 ? rc : -ENODEV; /* no device is a failure too */
+		goto out;
+	}
+
+	LIBCFS_ALLOC(hdev, sizeof(*hdev));
+	if (hdev == NULL) {
+		CERROR("Failed to allocate kib_hca_dev\n");
+		rdma_destroy_id(cmid);
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	atomic_set(&hdev->ibh_ref, 1);
+	hdev->ibh_dev   = dev;
+	hdev->ibh_cmid  = cmid;
+	hdev->ibh_ibdev = cmid->device;
+	/* hdev carries cmid now; later errors drop this initial ref at out: */
+	pd = ib_alloc_pd(cmid->device);
+	if (IS_ERR(pd)) {
+		rc = PTR_ERR(pd);
+		CERROR("Can't allocate PD: %d\n", rc);
+		goto out;
+	}
+	hdev->ibh_pd = pd;
+
+	rc = rdma_listen(cmid, 0);
+	if (rc != 0) {
+		CERROR("Can't start new listener: %d\n", rc);
+		goto out;
+	}
+
+	rc = kiblnd_hdev_setup_mrs(hdev);
+	if (rc != 0) {
+		CERROR("Can't setup device: %d\n", rc);
+		goto out;
+	}
+
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+	old = dev->ibd_hdev;
+	dev->ibd_hdev = hdev; /* take over the refcount */
+	hdev = old;
+
+	list_for_each_entry(net, &dev->ibd_nets, ibn_list) {
+		cfs_cpt_for_each(i, lnet_cpt_table()) {
+			kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset,
+					    &zombie_tpo);
+
+			if (net->ibn_fmr_ps != NULL) {
+				kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i],
+							&zombie_fpo);
+
+			} else if (net->ibn_pmr_ps != NULL) {
+				kiblnd_fail_poolset(&net->ibn_pmr_ps[i]->
+						    pps_poolset, &zombie_ppo);
+			}
+		}
+	}
+
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+ out:
+	if (!list_empty(&zombie_tpo))
+		kiblnd_destroy_pool_list(&zombie_tpo);
+	if (!list_empty(&zombie_ppo))
+		kiblnd_destroy_pool_list(&zombie_ppo);
+	if (!list_empty(&zombie_fpo))
+		kiblnd_destroy_fmr_pool_list(&zombie_fpo);
+	if (hdev != NULL)
+		kiblnd_hdev_decref(hdev);
+
+	if (rc != 0)
+		dev->ibd_failed_failover++;
+	else
+		dev->ibd_failed_failover = 0;
+
+	return rc;
+}
+
+/* Unlink @dev from the kib_devs and failed-device chains, drop its reference
+ * on the HCA device (if it has one) and free it.  @dev must have no nets. */
+void kiblnd_destroy_dev(kib_dev_t *dev)
+{
+	LASSERT(dev->ibd_nnets == 0);
+	LASSERT(list_empty(&dev->ibd_nets));
+
+	list_del(&dev->ibd_list);
+	list_del(&dev->ibd_fail_list);
+	if (dev->ibd_hdev)
+		kiblnd_hdev_decref(dev->ibd_hdev);
+
+	LIBCFS_FREE(dev, sizeof(*dev));
+}
+
+/* Build a kib_dev_t for IPoIB interface @ifname (which must be up), bind it
+ * via kiblnd_dev_failover() and append it to kiblnd_data.kib_devs.  Returns
+ * NULL on any failure.  @ifname must fit in ibd_ifname (copied unchecked). */
+kib_dev_t *kiblnd_create_dev(char *ifname)
+{
+	struct net_device *ndev;
+	kib_dev_t	  *dev;
+	__u32		   mask;
+	__u32		   addr;
+	int		   is_up;
+	int		   err;
+
+	err = libcfs_ipif_query(ifname, &is_up, &addr, &mask);
+	if (err != 0) {
+		CERROR("Can't query IPoIB interface %s: %d\n",
+		       ifname, err);
+		return NULL;
+	}
+	if (!is_up) {
+		CERROR("Can't query IPoIB interface %s: it's down\n", ifname);
+		return NULL;
+	}
+
+	LIBCFS_ALLOC(dev, sizeof(*dev));
+	if (dev == NULL)
+		return NULL;
+	memset(dev, 0, sizeof(*dev));
+
+	/* ibd_can_failover: set only when the interface is IFF_MASTER */
+	ndev = dev_get_by_name(&init_net, ifname);
+	if (ndev != NULL) {
+		dev->ibd_can_failover = !!(ndev->flags & IFF_MASTER);
+		dev_put(ndev);
+	} else {
+		dev->ibd_can_failover = 0;
+	}
+
+	INIT_LIST_HEAD(&dev->ibd_nets);
+	INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */
+	INIT_LIST_HEAD(&dev->ibd_fail_list);
+	dev->ibd_ifip = addr;
+	strcpy(&dev->ibd_ifname[0], ifname);
+
+	/* the first "failover" performs the initial bind of the device */
+	err = kiblnd_dev_failover(dev);
+	if (err != 0) {
+		CERROR("Can't initialize device: %d\n", err);
+		LIBCFS_FREE(dev, sizeof(*dev));
+		return NULL;
+	}
+	list_add_tail(&dev->ibd_list, &kiblnd_data.kib_devs);
+	return dev;
+}
+
+/* Stop every LND thread and free the module-wide state that
+ * kiblnd_base_startup() created.  All devices must already be gone. */
+void
+kiblnd_base_shutdown(void)
+{
+	struct kib_sched_info	*sched;
+	int			i;
+
+	LASSERT (list_empty(&kiblnd_data.kib_devs));
+	CDEBUG(D_MALLOC, "before LND base cleanup: kmem %d\n",
+	       atomic_read(&libcfs_kmemory));
+
+	switch (kiblnd_data.kib_init) {
+	default:
+		LBUG();
+
+	case IBLND_INIT_ALL:
+	case IBLND_INIT_DATA:
+		LASSERT (kiblnd_data.kib_peers != NULL);
+		for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
+			LASSERT (list_empty(&kiblnd_data.kib_peers[i]));
+		}
+		LASSERT (list_empty(&kiblnd_data.kib_connd_zombies));
+		LASSERT (list_empty(&kiblnd_data.kib_connd_conns));
+
+		/* flag threads to terminate; wake and wait for them to die */
+		kiblnd_data.kib_shutdown = 1;
+
+		/* NB: we really want to stop scheduler threads net by net
+		 * instead of the whole module, this should be improved
+		 * with dynamic configuration LNet */
+		cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds)
+			wake_up_all(&sched->ibs_waitq);
+
+		wake_up_all(&kiblnd_data.kib_connd_waitq);
+		wake_up_all(&kiblnd_data.kib_failover_waitq);
+		/* poll once a second; D_WARNING only when i is a power of 2 */
+		i = 2;
+		while (atomic_read(&kiblnd_data.kib_nthreads) != 0) {
+			i++;
+			CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+			       "Waiting for %d threads to terminate\n",
+			       atomic_read(&kiblnd_data.kib_nthreads));
+			cfs_pause(cfs_time_seconds(1));
+		}
+		/* fall through */
+
+	case IBLND_INIT_NOTHING:
+		break;
+	}
+
+	if (kiblnd_data.kib_peers != NULL) {
+		LIBCFS_FREE(kiblnd_data.kib_peers,
+			    sizeof(struct list_head) *
+			    kiblnd_data.kib_peer_hash_size);
+	}
+
+	if (kiblnd_data.kib_scheds != NULL)
+		cfs_percpt_free(kiblnd_data.kib_scheds);
+
+	CDEBUG(D_MALLOC, "after LND base cleanup: kmem %d\n",
+	       atomic_read(&libcfs_kmemory));
+
+	kiblnd_data.kib_init = IBLND_INIT_NOTHING;
+	module_put(THIS_MODULE); /* pairs with kiblnd_base_startup() */
+}
+
+/* LND shutdown for @ni: disconnect and wait for all peers of its net, free
+ * the net, and release the device / module-wide state once unused. */
+void
+kiblnd_shutdown (lnet_ni_t *ni)
+{
+	kib_net_t	*net = ni->ni_data;
+	rwlock_t     *g_lock = &kiblnd_data.kib_global_lock;
+	int	       i;
+	unsigned long     flags;
+
+	LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL);
+	if (net == NULL)
+		goto out;
+
+	CDEBUG(D_MALLOC, "before LND net cleanup: kmem %d\n",
+	       atomic_read(&libcfs_kmemory));
+
+	write_lock_irqsave(g_lock, flags);
+	net->ibn_shutdown = 1;
+	write_unlock_irqrestore(g_lock, flags);
+
+	switch (net->ibn_init) {
+	default:
+		LBUG();
+
+	case IBLND_INIT_ALL:
+		/* nuke all existing peers within this net */
+		kiblnd_del_peer(ni, LNET_NID_ANY);
+
+		/* Wait for all peer state to clean up */
+		i = 2;
+		while (atomic_read(&net->ibn_npeers) != 0) {
+			i++;
+			CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */
+			       "%s: waiting for %d peers to disconnect\n",
+			       libcfs_nid2str(ni->ni_nid),
+			       atomic_read(&net->ibn_npeers));
+			cfs_pause(cfs_time_seconds(1));
+		}
+
+		kiblnd_net_fini_pools(net);
+
+		write_lock_irqsave(g_lock, flags);
+		LASSERT(net->ibn_dev->ibd_nnets > 0);
+		net->ibn_dev->ibd_nnets--;
+		list_del(&net->ibn_list);
+		write_unlock_irqrestore(g_lock, flags);
+		/* fall through */
+
+	case IBLND_INIT_NOTHING:
+		LASSERT (atomic_read(&net->ibn_nconns) == 0);
+		/* the last net on a device destroys the device as well */
+		if (net->ibn_dev != NULL &&
+		    net->ibn_dev->ibd_nnets == 0)
+			kiblnd_destroy_dev(net->ibn_dev);
+
+		break;
+	}
+
+	CDEBUG(D_MALLOC, "after LND net cleanup: kmem %d\n",
+	       atomic_read(&libcfs_kmemory));
+
+	net->ibn_init = IBLND_INIT_NOTHING;
+	ni->ni_data = NULL;
+
+	LIBCFS_FREE(net, sizeof(*net));
+
+out:
+	if (list_empty(&kiblnd_data.kib_devs)) /* no devices left at all */
+		kiblnd_base_shutdown();
+	return;
+}
+
+/* Module-wide initialisation: peer hash table, per-CPT scheduler state and
+ * the connd (optionally failover) threads.  Returns 0 or -ENETDOWN. */
+int
+kiblnd_base_startup(void)
+{
+	struct kib_sched_info	*sched;
+	int			rc;
+	int			i;
+
+	LASSERT (kiblnd_data.kib_init == IBLND_INIT_NOTHING);
+	try_module_get(THIS_MODULE); /* NOTE(review): result is not checked */
+	memset(&kiblnd_data, 0, sizeof(kiblnd_data)); /* zero pointers, flags etc */
+
+	rwlock_init(&kiblnd_data.kib_global_lock);
+	INIT_LIST_HEAD(&kiblnd_data.kib_devs);
+	INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs);
+
+	kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE;
+	LIBCFS_ALLOC(kiblnd_data.kib_peers,
+		     sizeof(struct list_head) *
+			    kiblnd_data.kib_peer_hash_size);
+	if (kiblnd_data.kib_peers == NULL) {
+		goto failed;
+	}
+	for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++)
+		INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]);
+
+	spin_lock_init(&kiblnd_data.kib_connd_lock);
+	INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns);
+	INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies);
+	init_waitqueue_head(&kiblnd_data.kib_connd_waitq);
+	init_waitqueue_head(&kiblnd_data.kib_failover_waitq);
+
+	kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(),
+						  sizeof(*sched));
+	if (kiblnd_data.kib_scheds == NULL)
+		goto failed;
+
+	cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) {
+		int	nthrs;
+
+		spin_lock_init(&sched->ibs_lock);
+		INIT_LIST_HEAD(&sched->ibs_conns);
+		init_waitqueue_head(&sched->ibs_waitq);
+
+		nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
+		if (*kiblnd_tunables.kib_nscheds > 0) {
+			nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds);
+		} else {
+			/* max to half of CPUs, another half is reserved for
+			 * upper layer modules */
+			nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
+		}
+
+		sched->ibs_nthreads_max = nthrs;
+		sched->ibs_cpt = i;
+	}
+
+	kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR;
+
+	/* lists/ptrs/locks initialised */
+	kiblnd_data.kib_init = IBLND_INIT_DATA;
+	/*****************************************************/
+
+	rc = kiblnd_thread_start(kiblnd_connd, NULL, "kiblnd_connd");
+	if (rc != 0) {
+		CERROR("Can't spawn o2iblnd connd: %d\n", rc);
+		goto failed;
+	}
+
+	if (*kiblnd_tunables.kib_dev_failover != 0)
+		rc = kiblnd_thread_start(kiblnd_failover_thread, NULL,
+					 "kiblnd_failover");
+	/* rc is still 0 here when the failover thread is disabled */
+	if (rc != 0) {
+		CERROR("Can't spawn o2iblnd failover thread: %d\n", rc);
+		goto failed;
+	}
+
+	/* flag everything initialised */
+	kiblnd_data.kib_init = IBLND_INIT_ALL;
+	/*****************************************************/
+
+	return 0;
+
+ failed:
+	kiblnd_base_shutdown();
+	return -ENETDOWN;
+}
+
+/* Start scheduler threads for @sched: the initial batch if none are running,
+ * otherwise at most one more.  Returns 0 or the thread-start error. */
+int
+kiblnd_start_schedulers(struct kib_sched_info *sched)
+{
+	int	rc = 0;
+	int	nthrs, i;
+
+	if (sched->ibs_nthreads == 0) {
+		if (*kiblnd_tunables.kib_nscheds > 0) {
+			nthrs = sched->ibs_nthreads_max;
+		} else {
+			nthrs = cfs_cpt_weight(lnet_cpt_table(),
+					       sched->ibs_cpt);
+			nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
+			nthrs = min(IBLND_N_SCHED_HIGH, nthrs);
+		}
+	} else {
+		LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max);
+		/* increase one thread if there is new interface */
+		nthrs = (sched->ibs_nthreads < sched->ibs_nthreads_max);
+	}
+
+	for (i = 0; i < nthrs; i++) {
+		long	id;
+		char	name[20];
+		id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i);
+		snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld",
+			 KIB_THREAD_CPT(id), KIB_THREAD_TID(id));
+		rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, name);
+		if (rc != 0) {
+			CERROR("Can't spawn thread %d for scheduler[%d]: %d\n",
+			       sched->ibs_nthreads + i, sched->ibs_cpt, rc);
+			break;
+		}
+	}
+	/* i counts only the threads that did start, also after a failure */
+	sched->ibs_nthreads += i;
+	return rc;
+}
+
+/* Make sure scheduler threads run on each of the @ncpts CPTs in @cpts (CPTs
+ * 0..@ncpts-1 when @cpts is NULL).  For an existing device (!@newdev), CPTs
+ * that already have threads are skipped.  Returns 0 or the first error. */
+int
+kiblnd_dev_start_threads(kib_dev_t *dev, int newdev, __u32 *cpts, int ncpts)
+{
+	int idx;
+
+	for (idx = 0; idx < ncpts; idx++) {
+		int			 cpt   = cpts ? cpts[idx] : idx;
+		struct kib_sched_info	*sched = kiblnd_data.kib_scheds[cpt];
+		int			 err;
+
+		if (!newdev && sched->ibs_nthreads > 0)
+			continue;	/* this CPT already has schedulers */
+
+		err = kiblnd_start_schedulers(sched);
+		if (err == 0)
+			continue;
+		CERROR("Failed to start scheduler threads for %s\n",
+		       dev->ibd_ifname);
+		return err;
+	}
+	return 0;
+}
+
+/* Find the device named @ifname; failing an exact match, the first device whose
+ * name is equal once both names are cut at their first ':'.  NULL if none. */
+kib_dev_t *kiblnd_dev_search(char *ifname)
+{
+	char		*req_colon = strchr(ifname, ':');
+	kib_dev_t	*fallback  = NULL;
+	kib_dev_t	*dev;
+
+	list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
+		char *dev_colon;
+
+		if (strcmp(&dev->ibd_ifname[0], ifname) == 0)
+			return dev;
+		if (fallback != NULL)
+			continue;
+
+		/* compare again with both names truncated at their ':' */
+		dev_colon = strchr(dev->ibd_ifname, ':');
+		if (req_colon != NULL)
+			*req_colon = 0;
+		if (dev_colon != NULL)
+			*dev_colon = 0;
+		if (strcmp(&dev->ibd_ifname[0], ifname) == 0)
+			fallback = dev;
+		/* put the separators back */
+		if (req_colon != NULL)
+			*req_colon = ':';
+		if (dev_colon != NULL)
+			*dev_colon = ':';
+	}
+	return fallback;
+}
+
+/* LND startup for @ni: bring up the module-wide state on first use, then
+ * attach a new net to the kib_dev_t of its IPoIB interface.  0 or an error. */
+int
+kiblnd_startup (lnet_ni_t *ni)
+{
+	char		     *ifname;
+	kib_dev_t		*ibdev = NULL;
+	kib_net_t		*net;
+	struct timeval	    tv;
+	unsigned long	     flags;
+	int		       rc;
+	int			  newdev;
+
+	LASSERT (ni->ni_lnd == &the_o2iblnd);
+	if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) {
+		rc = kiblnd_base_startup();
+		if (rc != 0)
+			return rc;
+	}
+
+	LIBCFS_ALLOC(net, sizeof(*net));
+	ni->ni_data = net;
+	if (net == NULL)
+		goto failed;
+
+	memset(net, 0, sizeof(*net));
+
+	do_gettimeofday(&tv);
+	net->ibn_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+
+	ni->ni_peertimeout    = *kiblnd_tunables.kib_peertimeout;
+	ni->ni_maxtxcredits   = *kiblnd_tunables.kib_credits;
+	ni->ni_peertxcredits  = *kiblnd_tunables.kib_peertxcredits;
+	ni->ni_peerrtrcredits = *kiblnd_tunables.kib_peerrtrcredits;
+
+	if (ni->ni_interfaces[0] != NULL) {
+		/* Use the IPoIB interface specified in 'networks=' */
+		CLASSERT (LNET_MAX_INTERFACES > 1);
+		if (ni->ni_interfaces[1] != NULL) {
+			CERROR("Multiple interfaces not supported\n");
+			goto failed;
+		}
+
+		ifname = ni->ni_interfaces[0];
+	} else {
+		ifname = *kiblnd_tunables.kib_default_ipif;
+	}
+
+	if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) {
+		CERROR("IPoIB interface name too long: %s\n", ifname);
+		goto failed;
+	}
+
+	ibdev = kiblnd_dev_search(ifname);
+
+	newdev = ibdev == NULL;
+	/* hmm...create kib_dev even for alias */
+	if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0)
+		ibdev = kiblnd_create_dev(ifname);
+
+	if (ibdev == NULL)
+		goto failed;
+
+	net->ibn_dev = ibdev;
+	ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
+
+	rc = kiblnd_dev_start_threads(ibdev, newdev,
+				      ni->ni_cpts, ni->ni_ncpts);
+	if (rc != 0)
+		goto failed;
+
+	rc = kiblnd_net_init_pools(net, ni->ni_cpts, ni->ni_ncpts);
+	if (rc != 0) {
+		CERROR("Failed to initialize NI pools: %d\n", rc);
+		goto failed;
+	}
+
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+	ibdev->ibd_nnets++;
+	list_add_tail(&net->ibn_list, &ibdev->ibd_nets);
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	net->ibn_init = IBLND_INIT_ALL;
+
+	return 0;
+
+failed:
+	/* net is NULL when its allocation failed; kiblnd_shutdown() copes */
+	if (net != NULL && net->ibn_dev == NULL && ibdev != NULL)
+		kiblnd_destroy_dev(ibdev);
+
+	kiblnd_shutdown(ni);
+	CDEBUG(D_NET, "kiblnd_startup failed\n");
+	return -ENETDOWN;
+}
+
+/* Module exit: unregister the LND from LNet, then release the tunables. */
+void __exit kiblnd_module_fini(void)
+{
+	lnet_unregister_lnd(&the_o2iblnd);
+	kiblnd_tunables_fini();
+}
+
+/* Module entry: check at build time that kib_msg_t and its largest fragment
+ * arrays fit in IBLND_MSG_SIZE, then set up tunables and register the LND. */
+int __init kiblnd_module_init(void)
+{
+	int    err;
+
+	CLASSERT (sizeof(kib_msg_t) <= IBLND_MSG_SIZE);
+	CLASSERT (offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
+		  <= IBLND_MSG_SIZE);
+	CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
+		  <= IBLND_MSG_SIZE);
+
+	/* the LND is registered only once the tunables are in place */
+	err = kiblnd_tunables_init();
+	if (err == 0)
+		lnet_register_lnd(&the_o2iblnd);
+
+	return err;
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Kernel OpenIB gen2 LND v2.00");
+MODULE_LICENSE("GPL");
+/* kiblnd_module_init()/kiblnd_module_fini() are the load/unload entry points */
+module_init(kiblnd_module_init);
+module_exit(kiblnd_module_fini);
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
new file mode 100644
index 000000000000..e4626bf82fc7
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
@@ -0,0 +1,1057 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd.h
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/uio.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+#include <linux/pci.h>
+
+#include <net/sock.h>
+#include <linux/in.h>
+
+#define DEBUG_SUBSYSTEM S_LND
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/lnet-sysctl.h>
+
+#include <rdma/rdma_cm.h>
+#include <rdma/ib_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_fmr_pool.h>
+
+#define IBLND_PEER_HASH_SIZE		101	/* # peer lists */
+/* # scheduler loops before reschedule */
+#define IBLND_RESCHED			100
+
+#define IBLND_N_SCHED			2	/* floor for default # threads/CPT */
+#define IBLND_N_SCHED_HIGH		4	/* cap on the initial default batch */
+
+typedef struct	/* LND tunables; each member points at the tunable's value */
+{
+	int	      *kib_dev_failover;     /* HCA failover */
+	unsigned int     *kib_service;	  /* IB service number */
+	int	      *kib_min_reconnect_interval; /* first failed connection retry... */
+	int	      *kib_max_reconnect_interval; /* ...exponentially increasing to this */
+	int	      *kib_cksum;	    /* checksum kib_msg_t? */
+	int	      *kib_timeout;	  /* comms timeout (seconds) */
+	int	      *kib_keepalive;	/* keepalive timeout (seconds) */
+	int	      *kib_ntx;	      /* # tx descs */
+	int	      *kib_credits;	  /* # concurrent sends */
+	int	      *kib_peertxcredits;    /* # concurrent sends to 1 peer */
+	int	      *kib_peerrtrcredits;   /* # per-peer router buffer credits */
+	int	      *kib_peercredits_hiw;  /* # when eagerly to return credits */
+	int	      *kib_peertimeout;      /* seconds to consider peer dead */
+	char	    **kib_default_ipif;     /* default IPoIB interface */
+	int	      *kib_retry_count;      /* NOTE(review): undocumented - confirm */
+	int	      *kib_rnr_retry_count;  /* NOTE(review): undocumented - confirm */
+	int	      *kib_concurrent_sends; /* send work queue sizing */
+	int		 *kib_ib_mtu;		/* IB MTU */
+	int	      *kib_map_on_demand;    /* map-on-demand if RD has more fragments
+						 * than this value, 0 disable map-on-demand */
+	int	      *kib_pmr_pool_size;    /* # physical MR in pool */
+	int	      *kib_fmr_pool_size;    /* # FMRs in pool */
+	int	      *kib_fmr_flush_trigger; /* When to trigger FMR flush */
+	int	      *kib_fmr_cache;	/* enable FMR pool cache? */
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+	ctl_table_header_t *kib_sysctl;  /* sysctl interface */
+#endif
+	int	      *kib_require_priv_port;/* accept only privileged ports */
+	int	      *kib_use_priv_port;    /* use privileged port for active connect */
+	/* # threads on each CPT */
+	int		 *kib_nscheds;
+} kib_tunables_t;
+
+extern kib_tunables_t  kiblnd_tunables;
+
+#define IBLND_MSG_QUEUE_SIZE_V1      8	  /* V1 only : # messages/RDMAs in-flight */
+#define IBLND_CREDIT_HIGHWATER_V1    7	  /* V1 only : when eagerly to return credits */
+
+#define IBLND_CREDITS_DEFAULT	8	  /* default # of peer credits */
+#define IBLND_CREDITS_MAX	  ((typeof(((kib_msg_t*) 0)->ibm_credits)) - 1)  /* Max # of peer credits: -1 cast to the type of ibm_credits */
+
+#define IBLND_MSG_QUEUE_SIZE(v)    ((v) == IBLND_MSG_VERSION_1 ? \
+				     IBLND_MSG_QUEUE_SIZE_V1 :   \
+				     *kiblnd_tunables.kib_peertxcredits) /* # messages/RDMAs in-flight */
+#define IBLND_CREDITS_HIGHWATER(v) ((v) == IBLND_MSG_VERSION_1 ? \
+				     IBLND_CREDIT_HIGHWATER_V1 : \
+				     *kiblnd_tunables.kib_peercredits_hiw) /* when eagerly to return credits */
+/* plain pass-through to rdma_create_id() */
+#define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(cb, dev, ps, qpt)
+
+/* kib_concurrent_sends clamped to [QUEUE_SIZE_V1 / 2, QUEUE_SIZE_V1 * 2] */
+static inline int kiblnd_concurrent_sends_v1(void)
+{
+	int sends = *kiblnd_tunables.kib_concurrent_sends;
+
+	if (sends > IBLND_MSG_QUEUE_SIZE_V1 * 2)
+		return IBLND_MSG_QUEUE_SIZE_V1 * 2;
+	if (sends < IBLND_MSG_QUEUE_SIZE_V1 / 2)
+		return IBLND_MSG_QUEUE_SIZE_V1 / 2;
+	return sends;
+}
+
+#define IBLND_CONCURRENT_SENDS(v)  ((v) == IBLND_MSG_VERSION_1 ? \
+				     kiblnd_concurrent_sends_v1() : \
+				     *kiblnd_tunables.kib_concurrent_sends) /* V1 uses the clamped value */
+/* 2 OOB shall suffice for 1 keepalive and 1 returning credits */
+#define IBLND_OOB_CAPABLE(v)       ((v) != IBLND_MSG_VERSION_1)
+#define IBLND_OOB_MSGS(v)	   (IBLND_OOB_CAPABLE(v) ? 2 : 0)
+
+#define IBLND_MSG_SIZE	      (4<<10)		 /* max size of queued messages (inc hdr) */
+#define IBLND_MAX_RDMA_FRAGS	 LNET_MAX_IOV	   /* max # of fragments supported */
+#define IBLND_CFG_RDMA_FRAGS       (*kiblnd_tunables.kib_map_on_demand != 0 ? \
+				    *kiblnd_tunables.kib_map_on_demand :      \
+				     IBLND_MAX_RDMA_FRAGS)  /* max # of fragments configured by user */
+#define IBLND_RDMA_FRAGS(v)	((v) == IBLND_MSG_VERSION_1 ? \
+				     IBLND_MAX_RDMA_FRAGS : IBLND_CFG_RDMA_FRAGS)
+
+/************************/
+/* derived constants... */
+/* Pools (shared by connections on each CPT) */
+/* These pools can grow at runtime, so there is no need for very large values */
+#define IBLND_TX_POOL			256
+#define IBLND_PMR_POOL			256
+#define IBLND_FMR_POOL			256
+#define IBLND_FMR_POOL_FLUSH		192
+
+/* TX messages (shared by all connections) */
+#define IBLND_TX_MSGS()	    (*kiblnd_tunables.kib_ntx)
+
+/* RX messages (per connection) */
+#define IBLND_RX_MSGS(v)	    (IBLND_MSG_QUEUE_SIZE(v) * 2 + IBLND_OOB_MSGS(v))
+#define IBLND_RX_MSG_BYTES(v)       (IBLND_RX_MSGS(v) * IBLND_MSG_SIZE)
+#define IBLND_RX_MSG_PAGES(v)      ((IBLND_RX_MSG_BYTES(v) + PAGE_SIZE - 1) / PAGE_SIZE)
+
+/* WRs and CQEs (per connection) */
+#define IBLND_RECV_WRS(v)	    IBLND_RX_MSGS(v)
+#define IBLND_SEND_WRS(v)	  ((IBLND_RDMA_FRAGS(v) + 1) * IBLND_CONCURRENT_SENDS(v))
+#define IBLND_CQ_ENTRIES(v)	 (IBLND_RECV_WRS(v) + IBLND_SEND_WRS(v))
+
+struct kib_hca_dev;
+
+/* o2iblnd can run over an aliased interface: size names by IFALIASZ if defined */
+#ifdef IFALIASZ
+#define KIB_IFNAME_SIZE	      IFALIASZ
+#else
+#define KIB_IFNAME_SIZE	      256
+#endif
+
+typedef struct	/* one IPoIB interface used by the LND */
+{
+	struct list_head	   ibd_list;	  /* chain on kib_devs */
+	struct list_head	   ibd_fail_list;     /* chain on kib_failed_devs */
+	__u32		ibd_ifip;	  /* IPoIB interface IP */
+	/** IPoIB interface name */
+	char		 ibd_ifname[KIB_IFNAME_SIZE];
+	int		  ibd_nnets;	 /* # nets extant */
+
+	cfs_time_t	   ibd_next_failover; /* NOTE(review): presumably time of next failover attempt - confirm */
+	int		  ibd_failed_failover; /* # failover failures */
+	unsigned int	 ibd_failover;      /* failover in progress */
+	unsigned int	 ibd_can_failover;  /* IPoIB interface is a bonding master */
+	struct list_head	   ibd_nets;	  /* nets on this device (kib_net_t::ibn_list) */
+	struct kib_hca_dev  *ibd_hdev;	  /* current HCA state, NULL while initializing */
+} kib_dev_t;
+
+typedef struct kib_hca_dev	/* refcounted HCA state behind a kib_dev_t */
+{
+	struct rdma_cm_id   *ibh_cmid;	  /* listener cmid */
+	struct ib_device    *ibh_ibdev;	 /* IB device */
+	int		  ibh_page_shift;    /* page shift of current HCA */
+	int		  ibh_page_size;     /* page size of current HCA */
+	__u64		ibh_page_mask;     /* page mask of current HCA */
+	int		  ibh_mr_shift;      /* bits shift of max MR size */
+	__u64		ibh_mr_size;       /* size of MR */
+	int		  ibh_nmrs;	  /* # of global MRs */
+	struct ib_mr       **ibh_mrs;	   /* global MR table (ibh_nmrs entries) */
+	struct ib_pd	*ibh_pd;	    /* PD */
+	kib_dev_t	   *ibh_dev;	   /* owner */
+	atomic_t	 ibh_ref;	   /* refcount */
+} kib_hca_dev_t;
+
+/** # of seconds to keep pool alive */
+#define IBLND_POOL_DEADLINE     300
+/** # of seconds to wait before retrying a failed pool allocation */
+#define IBLND_POOL_RETRY	1
+
+typedef struct	/* counted array of pages */
+{
+	int		     ibp_npages;	     /* # pages */
+	struct page	    *ibp_pages[0];	   /* page array (zero-length trailing member) */
+} kib_pages_t;
+
+struct kib_pmr_pool;
+
+typedef struct {				/* one physical MR handed out by a PMR pool */
+	struct list_head	      pmr_list;	       /* chain node */
+	struct ib_phys_buf     *pmr_ipb;		/* physical buffer */
+	struct ib_mr	   *pmr_mr;		 /* IB MR */
+	struct kib_pmr_pool    *pmr_pool;	       /* owner of this MR */
+	__u64		   pmr_iova;	       /* Virtual I/O address */
+	int		     pmr_refcount;	   /* reference count */
+} kib_phys_mr_t;
+
+struct kib_pool;
+struct kib_poolset;
+/* callback types stored in the ps_* members of struct kib_poolset */
+typedef int  (*kib_ps_pool_create_t)(struct kib_poolset *ps,
+				     int inc, struct kib_pool **pp_po);
+typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po);
+typedef void (*kib_ps_node_init_t)(struct kib_pool *po, struct list_head *node);
+typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, struct list_head *node);
+
+struct kib_net;
+
+#define IBLND_POOL_NAME_LEN     32
+/* generic pool-set: a list of pools plus the hooks used to manage them */
+typedef struct kib_poolset
+{
+	spinlock_t		ps_lock;		/* serialize */
+	struct kib_net	 *ps_net;		 /* network it belongs to */
+	char		    ps_name[IBLND_POOL_NAME_LEN]; /* pool set name */
+	struct list_head	      ps_pool_list;	   /* list of pools */
+	struct list_head	      ps_failed_pool_list;    /* failed pool list */
+	cfs_time_t	      ps_next_retry;	  /* time stamp for retry if failed to allocate */
+	int		     ps_increasing;	  /* is allocating new pool */
+	int		     ps_pool_size;	   /* new pool size */
+	int			ps_cpt;			/* CPT id */
+
+	kib_ps_pool_create_t    ps_pool_create;	 /* create a new pool */
+	kib_ps_pool_destroy_t   ps_pool_destroy;	/* destroy a pool */
+	kib_ps_node_init_t      ps_node_init;	   /* initialize new allocated node */
+	kib_ps_node_fini_t      ps_node_fini;	   /* finalize node */
+} kib_poolset_t;
+
+typedef struct kib_pool			/* one pool inside a kib_poolset_t */
+{
+	struct list_head	      po_list;		/* chain on pool list */
+	struct list_head	      po_free_list;	   /* pre-allocated node */
+	kib_poolset_t	  *po_owner;	       /* pool_set of this pool */
+	cfs_time_t	      po_deadline;	    /* deadline of this pool */
+	int		     po_allocated;	   /* # of elements in use */
+	int		     po_failed;	      /* pool is created on failed HCA */
+	int		     po_size;		/* # of pre-allocated elements */
+} kib_pool_t;
+
+typedef struct {				/* TX pool-set: generic pool-set plus TX cookie */
+	kib_poolset_t	   tps_poolset;	    /* pool-set */
+	__u64		   tps_next_tx_cookie;     /* cookie of TX */
+} kib_tx_poolset_t;
+
+typedef struct {				/* TX pool: generic pool plus TX descriptors/pages */
+	kib_pool_t	      tpo_pool;	       /* pool */
+	struct kib_hca_dev     *tpo_hdev;	       /* device for this pool */
+	struct kib_tx	  *tpo_tx_descs;	   /* all the tx descriptors */
+	kib_pages_t	    *tpo_tx_pages;	   /* premapped tx msg pages */
+} kib_tx_pool_t;
+
+typedef struct {				/* PMR pool-set: wraps the generic pool-set */
+	kib_poolset_t	   pps_poolset;	    /* pool-set */
+} kib_pmr_poolset_t;
+
+typedef struct kib_pmr_pool {			/* PMR pool: generic pool plus its HCA device */
+	struct kib_hca_dev     *ppo_hdev;	       /* device for this pool */
+	kib_pool_t	      ppo_pool;	       /* pool */
+} kib_pmr_pool_t;
+
+typedef struct	/* FMR pool-set */
+{
+	spinlock_t		fps_lock;		/* serialize */
+	struct kib_net	 *fps_net;		/* IB network */
+	struct list_head	      fps_pool_list;	  /* FMR pool list */
+	struct list_head	      fps_failed_pool_list;   /* failed FMR pool list */
+	__u64		   fps_version;	    /* validity stamp */
+	int			fps_cpt;		/* CPT id */
+	int			fps_pool_size;
+	int			fps_flush_trigger;
+	/* is allocating new pool */
+	int			fps_increasing;
+	/* time stamp for retry if failed to allocate */
+	cfs_time_t		fps_next_retry;
+} kib_fmr_poolset_t;
+
+typedef struct	/* one FMR pool inside a kib_fmr_poolset_t */
+{
+	struct list_head	      fpo_list;	       /* chain on pool list */
+	struct kib_hca_dev     *fpo_hdev;	       /* device for this pool */
+	kib_fmr_poolset_t      *fpo_owner;	      /* owner of this pool */
+	struct ib_fmr_pool     *fpo_fmr_pool;	   /* IB FMR pool */
+	cfs_time_t	      fpo_deadline;	   /* deadline of this pool */
+	int		     fpo_failed;	     /* fmr pool is failed */
+	int		     fpo_map_count;	  /* # of mapped FMR */
+} kib_fmr_pool_t;
+
+typedef struct {				/* an FMR together with the pool it belongs to */
+	struct ib_pool_fmr     *fmr_pfmr;	       /* IB pool fmr */
+	kib_fmr_pool_t	 *fmr_pool;	       /* pool of FMR */
+} kib_fmr_t;
+
+typedef struct kib_net
+{
+	struct list_head	   ibn_list;	  /* chain on kib_dev_t::ibd_nets */
+	__u64		ibn_incarnation;   /* my epoch */
+	int		  ibn_init;	  /* initialisation state */
+	int		  ibn_shutdown;      /* shutting down? */
+
+	atomic_t		ibn_npeers;	/* # peers extant */
+	atomic_t		ibn_nconns;	/* # connections extant */
+
+	kib_tx_poolset_t	**ibn_tx_ps;	/* tx pool-sets; kiblnd_get_idle_tx() indexes by lnet_cpt_of_nid() */
+	kib_fmr_poolset_t	**ibn_fmr_ps;	/* fmr pool-set (presumably also one per CPT) */
+	kib_pmr_poolset_t	**ibn_pmr_ps;	/* pmr pool-set (presumably also one per CPT) */
+
+	kib_dev_t		*ibn_dev;	/* underlying IB device */
+} kib_net_t;
+
+#define KIB_THREAD_SHIFT		16	/* tid lives below this bit, cpt above */
+#define KIB_THREAD_ID(cpt, tid)		(((cpt) << KIB_THREAD_SHIFT) | (tid))
+#define KIB_THREAD_CPT(id)		((id) >> KIB_THREAD_SHIFT)
+#define KIB_THREAD_TID(id)		((id) & ~(~0UL << KIB_THREAD_SHIFT))
+
+struct kib_sched_info {
+	/* serialise; kiblnd_drop_rx() holds it (irqsave) around conn->ibc_nrx-- */
+	spinlock_t		ibs_lock;
+	/* schedulers sleep here */
+	wait_queue_head_t		ibs_waitq;
+	/* conns to check for rx completions */
+	struct list_head		ibs_conns;
+	/* number of scheduler threads */
+	int			ibs_nthreads;
+	/* max allowed scheduler threads */
+	int			ibs_nthreads_max;
+	int			ibs_cpt;	/* CPT id */
+};
+
+typedef struct
+{
+	int			kib_init;	/* initialisation state */
+	int			kib_shutdown;	/* shut down? */
+	struct list_head		kib_devs;	/* IB devices extant */
+	/* list head of failed devices */
+	struct list_head		kib_failed_devs;
+	/* schedulers sleep here -- NOTE(review): presumably the failover thread, not the schedulers */
+	wait_queue_head_t		kib_failover_waitq;
+	atomic_t		kib_nthreads;	/* # live threads */
+	/* stabilize net/dev/peer/conn ops */
+	rwlock_t		kib_global_lock;
+	/* hash table of all my known peers; bucket chosen by kiblnd_nid2peerlist() */
+	struct list_head		*kib_peers;
+	/* size of kib_peers (the modulus used by kiblnd_nid2peerlist()) */
+	int			kib_peer_hash_size;
+	/* the connd task (serialisation assertions) */
+	void			*kib_connd;
+	/* connections to setup/teardown */
+	struct list_head		kib_connd_conns;
+	/* connections with zero refcount; kiblnd_conn_decref() queues them here */
+	struct list_head		kib_connd_zombies;
+	/* connection daemon sleeps here; woken by kiblnd_conn_decref() */
+	wait_queue_head_t		kib_connd_waitq;
+	spinlock_t		kib_connd_lock;	/* serialise (held while queueing zombies) */
+	struct ib_qp_attr	kib_error_qpa;	/* QP->ERROR; used by kiblnd_abort_receives() */
+	/* percpt data for schedulers */
+	struct kib_sched_info	**kib_scheds;
+} kib_data_t;
+
+#define IBLND_INIT_NOTHING	 0	/* init-state values (presumably for kib_init / ibn_init; confirm) */
+#define IBLND_INIT_DATA	    1
+#define IBLND_INIT_ALL	     2
+
+/************************************************************************
+ * IB Wire message format.
+ * These are sent in sender's byte order (i.e. receiver flips).
+ */
+
+typedef struct kib_connparams
+{	/* carried in kib_msg_t::ibm_u.connparams and in kib_rej_t::ibr_cp */
+	__u16	     ibcp_queue_depth;
+	__u16	     ibcp_max_frags;
+	__u32	     ibcp_max_msg_size;
+} WIRE_ATTR kib_connparams_t;
+
+typedef struct
+{
+	lnet_hdr_t	ibim_hdr;	     /* portals header */
+	char	      ibim_payload[0];      /* piggy-backed payload (zero-length trailing array) */
+} WIRE_ATTR kib_immediate_msg_t;
+
+typedef struct
+{
+	__u32	     rf_nob;	       /* # bytes this frag; reduced by kiblnd_rd_consume_frag() */
+	__u64	     rf_addr;	      /* CAVEAT EMPTOR: misaligned!! (a __u64 directly after a __u32) */
+} WIRE_ATTR kib_rdma_frag_t;
+
+typedef struct
+{
+	__u32	     rd_key;	       /* local/remote key; one per descriptor (see kiblnd_rd_frag_key) */
+	__u32	     rd_nfrags;	    /* # fragments; loop bound in kiblnd_rd_size() */
+	kib_rdma_frag_t   rd_frags[0];	  /* buffer frags */
+} WIRE_ATTR kib_rdma_desc_t;
+
+typedef struct
+{	/* member 'putreq' of kib_msg_t::ibm_u */
+	lnet_hdr_t	ibprm_hdr;	    /* portals header */
+	__u64	     ibprm_cookie;	 /* opaque completion cookie */
+} WIRE_ATTR kib_putreq_msg_t;
+
+typedef struct
+{
+	__u64	     ibpam_src_cookie;     /* reflected completion cookie */
+	__u64	     ibpam_dst_cookie;     /* opaque completion cookie */
+	kib_rdma_desc_t   ibpam_rd;	     /* sender's sink buffer; must stay last (ends in rd_frags[]) */
+} WIRE_ATTR kib_putack_msg_t;
+
+typedef struct
+{
+	lnet_hdr_t	ibgm_hdr;	     /* portals header */
+	__u64	     ibgm_cookie;	  /* opaque completion cookie */
+	kib_rdma_desc_t   ibgm_rd;	      /* rdma descriptor; must stay last (ends in rd_frags[]) */
+} WIRE_ATTR kib_get_msg_t;
+
+typedef struct
+{	/* filled in by kiblnd_send_completion() */
+	__u64	     ibcm_cookie;	  /* opaque completion cookie */
+	__s32	     ibcm_status;	  /* < 0 failure: >= 0 length */
+} WIRE_ATTR kib_completion_msg_t;
+
+typedef struct
+{
+	/* First 2 fields fixed FOR ALL TIME */
+	__u32	     ibm_magic;	    /* I'm an ibnal message */
+	__u16	     ibm_version;	  /* this is my version number */
+
+	__u8	      ibm_type;	     /* msg type (IBLND_MSG_*); set by kiblnd_init_msg() */
+	__u8	      ibm_credits;	  /* returned credits */
+	__u32	     ibm_nob;	      /* # bytes in whole message: offsetof(ibm_u) + body size */
+	__u32	     ibm_cksum;	    /* checksum (0 == no checksum) */
+	__u64	     ibm_srcnid;	   /* sender's NID */
+	__u64	     ibm_srcstamp;	 /* sender's incarnation */
+	__u64	     ibm_dstnid;	   /* destination's NID */
+	__u64	     ibm_dststamp;	 /* destination's incarnation */
+
+	union {	/* message body; which member applies presumably follows ibm_type */
+		kib_connparams_t      connparams;
+		kib_immediate_msg_t   immediate;
+		kib_putreq_msg_t      putreq;
+		kib_putack_msg_t      putack;
+		kib_get_msg_t	 get;
+		kib_completion_msg_t  completion;
+	} WIRE_ATTR ibm_u;
+} WIRE_ATTR kib_msg_t;
+
+#define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC	/* unique magic */
+/* protocol versions (presumably the values carried in kib_msg_t::ibm_version) */
+#define IBLND_MSG_VERSION_1	 0x11
+#define IBLND_MSG_VERSION_2	 0x12
+#define IBLND_MSG_VERSION	   IBLND_MSG_VERSION_2
+/* message types, stored in kib_msg_t::ibm_type */
+#define IBLND_MSG_CONNREQ	   0xc0	/* connection request */
+#define IBLND_MSG_CONNACK	   0xc1	/* connection acknowledge */
+#define IBLND_MSG_NOOP	      0xd0	/* nothing (just credits) */
+#define IBLND_MSG_IMMEDIATE	 0xd1	/* immediate */
+#define IBLND_MSG_PUT_REQ	   0xd2	/* putreq (src->sink) */
+#define IBLND_MSG_PUT_NAK	   0xd3	/* completion (sink->src) */
+#define IBLND_MSG_PUT_ACK	   0xd4	/* putack (sink->src); carries an rdma descriptor */
+#define IBLND_MSG_PUT_DONE	  0xd5	/* completion (src->sink) */
+#define IBLND_MSG_GET_REQ	   0xd6	/* getreq (sink->src); carries an rdma descriptor */
+#define IBLND_MSG_GET_DONE	  0xd7	/* completion (src->sink: all OK) */
+
+typedef struct {
+	__u32	    ibr_magic;	     /* sender's magic */
+	__u16	    ibr_version;	   /* sender's version */
+	__u8	     ibr_why;	       /* reject reason (presumably an IBLND_REJECT_* value) */
+	__u8	     ibr_padding;	   /* padding */
+	__u64	    ibr_incarnation;       /* incarnation of peer */
+	kib_connparams_t ibr_cp;		/* connection parameters */
+} WIRE_ATTR kib_rej_t;
+
+/* connection rejection reasons (presumably the values of kib_rej_t::ibr_why) */
+#define IBLND_REJECT_CONN_RACE       1	  /* You lost connection race */
+#define IBLND_REJECT_NO_RESOURCES    2	  /* Out of memory/conns etc */
+#define IBLND_REJECT_FATAL	   3	  /* Anything else */
+
+#define IBLND_REJECT_CONN_UNCOMPAT   4	  /* incompatible version peer */
+#define IBLND_REJECT_CONN_STALE      5	  /* stale peer */
+
+#define IBLND_REJECT_RDMA_FRAGS      6	  /* Fatal: peer's rdma frags can't match mine */
+#define IBLND_REJECT_MSG_QUEUE_SIZE  7	  /* Fatal: peer's msg queue size can't match mine */
+
+/***********************************************************************/
+
+typedef struct kib_rx			   /* receive message */
+{
+	struct list_head		rx_list;      /* queue for attention */
+	struct kib_conn	  *rx_conn;      /* owning conn */
+	int		       rx_nob;       /* # bytes received (-1 while posted; 0 after a failed post) */
+	enum ib_wc_status	 rx_status;    /* completion status */
+	kib_msg_t		*rx_msg;       /* message buffer (host vaddr) */
+	__u64		     rx_msgaddr;   /* message buffer (I/O addr); becomes rx_sge.addr */
+	DECLARE_PCI_UNMAP_ADDR   (rx_msgunmap); /* for dma_unmap_single() */
+	struct ib_recv_wr	 rx_wrq;       /* receive work item... */
+	struct ib_sge	     rx_sge;       /* ...and its memory */
+} kib_rx_t;
+
+#define IBLND_POSTRX_DONT_POST    0	     /* don't post (kiblnd_post_rx() asserts it never gets this) */
+#define IBLND_POSTRX_NO_CREDIT    1	     /* post: no credits */
+#define IBLND_POSTRX_PEER_CREDIT  2	     /* post: give peer back 1 credit (ibc_outstanding_credits++) */
+#define IBLND_POSTRX_RSRVD_CREDIT 3	     /* post: give myself back 1 reserved credit (ibc_reserved_credits++) */
+
+typedef struct kib_tx			   /* transmit message */
+{
+	struct list_head		tx_list;      /* queue on idle_txs ibc_tx_queue etc. */
+	kib_tx_pool_t	    *tx_pool;      /* pool I'm from */
+	struct kib_conn	  *tx_conn;      /* owning conn; kiblnd_tx_done() drops a conn ref and clears it */
+	short		     tx_sending;   /* # tx callbacks outstanding */
+	short		     tx_queued;    /* queued for sending */
+	short		     tx_waiting;   /* waiting for peer; cleared by kiblnd_handle_completion() */
+	int		       tx_status;    /* LNET completion status; handed to lnet_finalize() */
+	unsigned long	     tx_deadline;  /* completion deadline */
+	__u64		     tx_cookie;    /* completion cookie; matched in kiblnd_find_waiting_tx_locked() */
+	lnet_msg_t	       *tx_lntmsg[2]; /* lnet msgs to finalize on completion */
+	kib_msg_t		*tx_msg;       /* message buffer (host vaddr) */
+	__u64		     tx_msgaddr;   /* message buffer (I/O addr) */
+	DECLARE_PCI_UNMAP_ADDR   (tx_msgunmap); /* for dma_unmap_single() */
+	int		       tx_nwrq;      /* # send work items; reset to 0 by kiblnd_tx_done() */
+	struct ib_send_wr	*tx_wrq;       /* send work items... */
+	struct ib_sge	    *tx_sge;       /* ...and their memory */
+	kib_rdma_desc_t	  *tx_rd;	/* rdma descriptor */
+	int		       tx_nfrags;    /* # entries in... */
+	struct scatterlist       *tx_frags;     /* dma_map_sg descriptor */
+	__u64		    *tx_pages;     /* rdma phys page addrs */
+	union {
+		kib_phys_mr_t      *pmr;	/* MR for physical buffer; asserted NULL on an idle tx */
+		kib_fmr_t	   fmr;	/* FMR */
+	}			 tx_u;
+	int		       tx_dmadir;    /* dma direction */
+} kib_tx_t;
+
+typedef struct kib_connvars
+{
+	/* connection-in-progress variables (reached through kib_conn_t::ibc_connvars) */
+	kib_msg_t		 cv_msg;
+} kib_connvars_t;
+
+typedef struct kib_conn
+{
+	struct kib_sched_info *ibc_sched;	/* scheduler information */
+	struct kib_peer     *ibc_peer;	  /* owning peer */
+	kib_hca_dev_t       *ibc_hdev;	  /* HCA bound on */
+	struct list_head	   ibc_list;	  /* stash on peer's conn list; also links onto kib_connd_zombies */
+	struct list_head	   ibc_sched_list;    /* schedule for attention */
+	__u16		ibc_version;       /* version of connection */
+	__u64		ibc_incarnation;   /* which instance of the peer */
+	atomic_t	 ibc_refcount;      /* # users; see kiblnd_conn_addref()/kiblnd_conn_decref() */
+	int		  ibc_state;	 /* what's happening (IBLND_CONN_*) */
+	int		  ibc_nsends_posted; /* # uncompleted sends */
+	int		  ibc_noops_posted;  /* # uncompleted NOOPs */
+	int		  ibc_credits;       /* # credits I have */
+	int		  ibc_outstanding_credits; /* # credits to return */
+	int		  ibc_reserved_credits;/* # ACK/DONE msg credits */
+	int		  ibc_comms_error;   /* set on comms error */
+	unsigned int	     ibc_nrx:16;	/* receive buffers owned; changed under ibc_sched->ibs_lock */
+	unsigned int	     ibc_scheduled:1;   /* scheduled for attention */
+	unsigned int	     ibc_ready:1;       /* CQ callback fired */
+	/* time of last send (compared against jiffies in kiblnd_send_keepalive()) */
+	unsigned long	ibc_last_send;
+	/** link chain for kiblnd_check_conns only */
+	struct list_head	   ibc_connd_list;
+	/** rxs completed before ESTABLISHED */
+	struct list_head	   ibc_early_rxs;
+	/** IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */
+	struct list_head	   ibc_tx_noops;
+	struct list_head	   ibc_tx_queue;       /* sends that need a credit */
+	struct list_head	   ibc_tx_queue_nocred;/* sends that don't need a credit */
+	struct list_head	   ibc_tx_queue_rsrvd; /* sends that need to reserve an ACK/DONE msg */
+	struct list_head	   ibc_active_txs;     /* active tx awaiting completion */
+	spinlock_t	     ibc_lock;		 /* serialise; held around credit updates and active-tx lookup */
+	kib_rx_t	    *ibc_rxs;	    /* the rx descs */
+	kib_pages_t	 *ibc_rx_pages;       /* premapped rx msg pages */
+
+	struct rdma_cm_id   *ibc_cmid;	   /* CM id; its ->qp is what receives are posted on */
+	struct ib_cq	*ibc_cq;	     /* completion queue */
+
+	kib_connvars_t      *ibc_connvars;       /* in-progress connection state */
+} kib_conn_t;
+
+#define IBLND_CONN_INIT	       0	 /* being initialised */
+#define IBLND_CONN_ACTIVE_CONNECT     1	 /* active sending req */
+#define IBLND_CONN_PASSIVE_WAIT       2	 /* passive waiting for rtu */
+#define IBLND_CONN_ESTABLISHED	3	 /* connection established; states are compared with < and >, so order matters */
+#define IBLND_CONN_CLOSING	    4	 /* being closed */
+#define IBLND_CONN_DISCONNECTED       5	 /* disconnected */
+
+typedef struct kib_peer
+{
+	struct list_head	   ibp_list;	   /* stash on global peer list; empty == not in the hash table */
+	lnet_nid_t	   ibp_nid;	    /* who's on the other end(s) */
+	lnet_ni_t	   *ibp_ni;	     /* LNet interface */
+	atomic_t	 ibp_refcount;       /* # users; kiblnd_peer_decref() destroys the peer at zero */
+	struct list_head	   ibp_conns;	  /* all active connections; kiblnd_get_conn_locked() takes the first */
+	struct list_head	   ibp_tx_queue;       /* msgs waiting for a conn */
+	__u16		ibp_version;	/* version of peer */
+	__u64		ibp_incarnation;    /* incarnation of peer */
+	int		  ibp_connecting;     /* current active connection attempts */
+	int		  ibp_accepting;      /* current passive connection attempts */
+	int		  ibp_error;	  /* errno on closing this peer */
+	cfs_time_t	   ibp_last_alive;     /* when (in jiffies) I was last alive */
+} kib_peer_t;
+
+extern kib_data_t      kiblnd_data;
+
+extern void kiblnd_hdev_destroy(kib_hca_dev_t *hdev);
+
+static inline void
+kiblnd_hdev_addref_locked(kib_hca_dev_t *hdev)
+{
+	LASSERT (atomic_read(&hdev->ibh_ref) > 0);	/* device must already be referenced */
+	atomic_inc(&hdev->ibh_ref);			/* take one more reference */
+}
+
+static inline void kiblnd_hdev_decref(kib_hca_dev_t *hdev)
+{
+	LASSERT (atomic_read(&hdev->ibh_ref) > 0);
+	if (!atomic_dec_and_test(&hdev->ibh_ref))
+		return;			/* other references remain */
+	kiblnd_hdev_destroy(hdev);	/* that was the last reference */
+}
+
+static inline int kiblnd_dev_can_failover(kib_dev_t *dev)
+{
+	/* a device already linked on a fail list has been scheduled */
+	if (!list_empty(&dev->ibd_fail_list))
+		return 0;
+
+	/* tunable: 0 switches failover off, anything above 1 forces it */
+	if (*kiblnd_tunables.kib_dev_failover == 0)
+		return 0;
+	else if (*kiblnd_tunables.kib_dev_failover > 1)
+		return 1;
+
+	return dev->ibd_can_failover;	/* otherwise the device's own flag */
+}
+
+#define kiblnd_conn_addref(conn)				\
+do {	/* NB: 'conn' is expanded more than once */	    \
+	CDEBUG(D_NET, "conn[%p] (%d)++\n",		      \
+	       (conn), atomic_read(&(conn)->ibc_refcount)); \
+	atomic_inc(&(conn)->ibc_refcount);		  \
+} while (0)
+
+#define kiblnd_conn_decref(conn)					\
+do {	/* NB: 'conn' is expanded more than once */			\
+	unsigned long flags;	/* hides any 'flags' of the caller */	\
+	/* last ref gone: queue conn as a zombie and wake kib_connd_waitq */ \
+	CDEBUG(D_NET, "conn[%p] (%d)--\n",				\
+	       (conn), atomic_read(&(conn)->ibc_refcount));		\
+	LASSERT_ATOMIC_POS(&(conn)->ibc_refcount);			\
+	if (atomic_dec_and_test(&(conn)->ibc_refcount)) {		\
+		spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);	\
+		list_add_tail(&(conn)->ibc_list,			\
+				  &kiblnd_data.kib_connd_zombies);	\
+		wake_up(&kiblnd_data.kib_connd_waitq);		\
+		spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);\
+	}								\
+} while (0)
+
+#define kiblnd_peer_addref(peer)				\
+do {	/* NB: 'peer' is expanded more than once */	    \
+	CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n",		\
+	       (peer), libcfs_nid2str((peer)->ibp_nid),	 \
+	       atomic_read (&(peer)->ibp_refcount));	\
+	atomic_inc(&(peer)->ibp_refcount);		  \
+} while (0)
+
+#define kiblnd_peer_decref(peer)				\
+do {	/* the last reference calls kiblnd_destroy_peer() */ \
+	CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n",		\
+	       (peer), libcfs_nid2str((peer)->ibp_nid),	 \
+	       atomic_read (&(peer)->ibp_refcount));	\
+	LASSERT_ATOMIC_POS(&(peer)->ibp_refcount);	      \
+	if (atomic_dec_and_test(&(peer)->ibp_refcount))     \
+		kiblnd_destroy_peer(peer);		      \
+} while (0)
+
+static inline struct list_head *kiblnd_nid2peerlist (lnet_nid_t nid)
+{
+	/* bucket = NID truncated to unsigned int, modulo the table size */
+	unsigned int bucket = (unsigned int)nid;
+
+	bucket %= kiblnd_data.kib_peer_hash_size;
+	return &kiblnd_data.kib_peers[bucket];
+}
+
+static inline int kiblnd_peer_active (kib_peer_t *peer)
+{
+	/* a peer whose ibp_list is linked is in the peer hash table */
+	int unlinked = list_empty(&peer->ibp_list);
+	return !unlinked;
+}
+
+static inline kib_conn_t *kiblnd_get_conn_locked (kib_peer_t *peer)
+{
+	struct list_head *first = peer->ibp_conns.next;
+
+	LASSERT (!list_empty(&peer->ibp_conns));
+
+	return list_entry(first, kib_conn_t, ibc_list);	/* first conn on the list */
+}
+
+static inline int kiblnd_send_keepalive(kib_conn_t *conn)
+{
+	if (*kiblnd_tunables.kib_keepalive <= 0)
+		return 0;	/* keepalive tunable is off */
+	return cfs_time_after(jiffies, conn->ibc_last_send +
+			      *kiblnd_tunables.kib_keepalive*HZ) != 0;
+}
+
+static inline int
+kiblnd_need_noop(kib_conn_t *conn)
+{
+	LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+	/* returns non-zero when a NOOP message is needed, 0 otherwise */
+	if (conn->ibc_outstanding_credits <
+	    IBLND_CREDITS_HIGHWATER(conn->ibc_version) &&
+	    !kiblnd_send_keepalive(conn))
+		return 0; /* No need to send NOOP */
+	/* past this point: credits reached the highwater mark or a keepalive is due */
+	if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
+		if (!list_empty(&conn->ibc_tx_queue_nocred))
+			return 0; /* NOOP can be piggybacked */
+
+		/* No tx to piggyback NOOP onto or no credit to send a tx */
+		return (list_empty(&conn->ibc_tx_queue) ||
+			conn->ibc_credits == 0);
+	}
+	/* connection versions that are not OOB capable */
+	if (!list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */
+	    !list_empty(&conn->ibc_tx_queue_nocred) || /* piggyback NOOP */
+	    conn->ibc_credits == 0)		    /* no credit */
+		return 0;
+
+	if (conn->ibc_credits == 1 &&      /* last credit reserved for */
+	    conn->ibc_outstanding_credits == 0) /* giving back credits */
+		return 0;
+
+	/* No tx to piggyback NOOP onto or no credit to send a tx */
+	return (list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1);
+}
+
+static inline void kiblnd_abort_receives(kib_conn_t *conn)
+{
+	struct ib_qp *qp = conn->ibc_cmid->qp;
+
+	ib_modify_qp(qp, &kiblnd_data.kib_error_qpa, IB_QP_STATE); /* result ignored */
+}
+
+static inline const char *
+kiblnd_queue2str (kib_conn_t *conn, struct list_head *q)
+{
+	const char *name = NULL;	/* stays NULL for an unknown list */
+
+	if (q == &conn->ibc_tx_queue)
+		name = "tx_queue";
+	else if (q == &conn->ibc_tx_queue_rsrvd)
+		name = "tx_queue_rsrvd";
+	else if (q == &conn->ibc_tx_queue_nocred)
+		name = "tx_queue_nocred";
+	else if (q == &conn->ibc_active_txs)
+		name = "active_txs";
+	else
+		LBUG();			/* not one of this conn's tx lists */
+
+	return name;
+}
+
+/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the
+ * lowest bits of the work request id to stash the work item type. */
+/* kiblnd_ptr2wreqid() asserts the descriptor's IBLND_WID_MASK bits are clear */
+#define IBLND_WID_TX    0
+#define IBLND_WID_RDMA  1
+#define IBLND_WID_RX    2
+#define IBLND_WID_MASK  3UL
+
+static inline __u64
+kiblnd_ptr2wreqid (void *ptr, int type)
+{
+	unsigned long lptr = (unsigned long)ptr;
+	/* fold 'type' into the IBLND_WID_MASK bits of the descriptor address */
+	LASSERT ((lptr & IBLND_WID_MASK) == 0);	/* those bits must be free */
+	LASSERT ((type & ~IBLND_WID_MASK) == 0);	/* type must fit in the mask */
+	return (__u64)(lptr | type);
+}
+
+static inline void *kiblnd_wreqid2ptr (__u64 wreqid)
+{
+	unsigned long addr = (unsigned long)wreqid;	/* id with type bits */
+	return (void *)(addr & ~IBLND_WID_MASK);	/* descriptor address */
+}
+
+static inline int kiblnd_wreqid2type (__u64 wreqid)
+{
+	__u64 type_bits = wreqid & IBLND_WID_MASK;	/* the stashed type */
+	return (int)type_bits;
+}
+
+static inline void
+kiblnd_set_conn_state (kib_conn_t *conn, int state)
+{
+	conn->ibc_state = state;	/* presumably one of IBLND_CONN_* */
+	mb();				/* memory barrier issued right after the store */
+}
+
+static inline void kiblnd_init_msg (kib_msg_t *msg, int type, int body_nob)
+{
+	/* total size = everything in front of ibm_u plus the body */
+	msg->ibm_nob  = offsetof(kib_msg_t, ibm_u) + body_nob;
+	msg->ibm_type = type;
+}
+
+static inline int kiblnd_rd_size (kib_rdma_desc_t *rd)
+{
+	/* add up the byte counts of all rd_nfrags fragments */
+	int   total = 0;
+	int   idx = 0;
+
+	while (idx < rd->rd_nfrags)
+		total += rd->rd_frags[idx++].rf_nob;
+
+	return total;
+}
+
+static inline __u64 kiblnd_rd_frag_addr(kib_rdma_desc_t *rd, int index)
+{
+	kib_rdma_frag_t *frag = &rd->rd_frags[index];	/* index is not checked */
+	return frag->rf_addr;
+}
+
+static inline __u32 kiblnd_rd_frag_size(kib_rdma_desc_t *rd, int index)
+{
+	kib_rdma_frag_t *frag = &rd->rd_frags[index];	/* index is not checked */
+	return frag->rf_nob;
+}
+
+static inline __u32
+kiblnd_rd_frag_key(kib_rdma_desc_t *rd, int index)
+{
+	return rd->rd_key;	/* one key per descriptor: 'index' is not used */
+}
+
+static inline int
+kiblnd_rd_consume_frag(kib_rdma_desc_t *rd, int index, __u32 nob)
+{
+	kib_rdma_frag_t *frag = &rd->rd_frags[index];
+
+	if (nob >= frag->rf_nob)	/* fragment used up: move to the next */
+		return index + 1;
+
+	frag->rf_nob  -= nob;		/* partial: shrink the fragment ... */
+	frag->rf_addr += nob;		/* ... and advance its start */
+	return index;
+}
+
+static inline int
+kiblnd_rd_msg_size(kib_rdma_desc_t *rd, int msgtype, int n)
+{
+	LASSERT (msgtype == IBLND_MSG_GET_REQ ||
+		 msgtype == IBLND_MSG_PUT_ACK);
+
+	if (msgtype == IBLND_MSG_GET_REQ)	/* body ends where frag n would start */
+		return offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]);
+	return offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
+}
+
+
+static inline __u64
+kiblnd_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+	return ib_dma_mapping_error(dev, dma_addr);	/* plain pass-through, returned as __u64 */
+}
+
+static inline __u64 kiblnd_dma_map_single(struct ib_device *dev,
+					  void *msg, size_t size,
+					  enum dma_data_direction direction)
+{
+	return ib_dma_map_single(dev, msg, size, direction);	/* plain pass-through */
+}
+
+static inline void kiblnd_dma_unmap_single(struct ib_device *dev,
+					   __u64 addr, size_t size,
+					  enum dma_data_direction direction)
+{
+	ib_dma_unmap_single(dev, addr, size, direction);	/* plain pass-through */
+}
+
+#define KIBLND_UNMAP_ADDR_SET(p, m, a)  do {} while (0)	/* expands to nothing */
+#define KIBLND_UNMAP_ADDR(p, m, a)      (a)	/* yields 'a'; 'p' and 'm' are ignored */
+
+static inline int kiblnd_dma_map_sg(struct ib_device *dev,
+				    struct scatterlist *sg, int nents,
+				    enum dma_data_direction direction)
+{
+	return ib_dma_map_sg(dev, sg, nents, direction);	/* plain pass-through */
+}
+
+static inline void kiblnd_dma_unmap_sg(struct ib_device *dev,
+				       struct scatterlist *sg, int nents,
+				       enum dma_data_direction direction)
+{
+	ib_dma_unmap_sg(dev, sg, nents, direction);	/* plain pass-through */
+}
+
+static inline __u64 kiblnd_sg_dma_address(struct ib_device *dev,
+					  struct scatterlist *sg)
+{
+	return ib_sg_dma_address(dev, sg);	/* plain pass-through */
+}
+
+static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev,
+					     struct scatterlist *sg)
+{
+	return ib_sg_dma_len(dev, sg);	/* plain pass-through */
+}
+
+/* XXX We use KIBLND_CONN_PARAM(e) as a writable buffer. That is not strictly
+ * right, because OFED 1.2 declares it const; callers that write through it
+ * have to add a (void *) cast to get rid of the "const". */
+/* 'e' is dereferenced as (e)->param.conn, i.e. an event carrying conn params */
+#define KIBLND_CONN_PARAM(e)	    ((e)->param.conn.private_data)
+#define KIBLND_CONN_PARAM_LEN(e)	((e)->param.conn.private_data_len)
+
+
+struct ib_mr *kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev,
+				    kib_rdma_desc_t *rd);
+struct ib_mr *kiblnd_find_dma_mr(kib_hca_dev_t *hdev,
+				 __u64 addr, __u64 size);
+void kiblnd_map_rx_descs(kib_conn_t *conn);
+void kiblnd_unmap_rx_descs(kib_conn_t *conn);
+int kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx,
+		  kib_rdma_desc_t *rd, int nfrags);
+void kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx);
+void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node);
+struct list_head *kiblnd_pool_alloc_node(kib_poolset_t *ps);
+
+int  kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages,
+			 int npages, __u64 iov, kib_fmr_t *fmr);
+void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status);
+
+int  kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_hca_dev_t *hdev,
+			 kib_rdma_desc_t *rd, __u64 *iova, kib_phys_mr_t **pp_pmr);
+void kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr);
+
+int  kiblnd_startup (lnet_ni_t *ni);
+void kiblnd_shutdown (lnet_ni_t *ni);
+int  kiblnd_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg);
+void kiblnd_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when);
+
+int  kiblnd_tunables_init(void);
+void kiblnd_tunables_fini(void);
+
+int  kiblnd_connd (void *arg);
+int  kiblnd_scheduler(void *arg);
+int  kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name);
+int  kiblnd_failover_thread (void *arg);
+
+int  kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages);
+void kiblnd_free_pages (kib_pages_t *p);
+
+int  kiblnd_cm_callback(struct rdma_cm_id *cmid,
+			struct rdma_cm_event *event);
+int  kiblnd_translate_mtu(int value);
+
+int  kiblnd_dev_failover(kib_dev_t *dev);
+int  kiblnd_create_peer (lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid);
+void kiblnd_destroy_peer (kib_peer_t *peer);
+void kiblnd_destroy_dev (kib_dev_t *dev);
+void kiblnd_unlink_peer_locked (kib_peer_t *peer);
+void kiblnd_peer_alive (kib_peer_t *peer);
+kib_peer_t *kiblnd_find_peer_locked (lnet_nid_t nid);
+void kiblnd_peer_connect_failed (kib_peer_t *peer, int active, int error);
+int  kiblnd_close_stale_conns_locked (kib_peer_t *peer,
+				      int version, __u64 incarnation);
+int  kiblnd_close_peer_conns_locked (kib_peer_t *peer, int why);
+
+void kiblnd_connreq_done(kib_conn_t *conn, int status);
+kib_conn_t *kiblnd_create_conn (kib_peer_t *peer, struct rdma_cm_id *cmid,
+				int state, int version);
+void kiblnd_destroy_conn (kib_conn_t *conn);
+void kiblnd_close_conn (kib_conn_t *conn, int error);
+void kiblnd_close_conn_locked (kib_conn_t *conn, int error);
+
+int  kiblnd_init_rdma (kib_conn_t *conn, kib_tx_t *tx, int type,
+		       int nob, kib_rdma_desc_t *dstrd, __u64 dstcookie);
+
+void kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid);
+void kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn);
+void kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn);
+void kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob);
+void kiblnd_txlist_done (lnet_ni_t *ni, struct list_head *txlist,
+			 int status);
+void kiblnd_check_sends (kib_conn_t *conn);
+
+void kiblnd_qp_event(struct ib_event *event, void *arg);
+void kiblnd_cq_event(struct ib_event *event, void *arg);
+void kiblnd_cq_completion(struct ib_cq *cq, void *arg);
+
+void kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg, int version,
+		      int credits, lnet_nid_t dstnid, __u64 dststamp);
+int  kiblnd_unpack_msg(kib_msg_t *msg, int nob);
+int  kiblnd_post_rx (kib_rx_t *rx, int credit);
+
+int  kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int  kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+		 unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+		 unsigned int offset, unsigned int mlen, unsigned int rlen);
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
new file mode 100644
index 000000000000..cc6232126dd0
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
@@ -0,0 +1,3529 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd_cb.c
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "o2iblnd.h"
+
+void
+kiblnd_tx_done (lnet_ni_t *ni, kib_tx_t *tx)
+{
+	lnet_msg_t *lntmsg[2];
+	kib_net_t  *net = ni->ni_data;
+	int	 status;
+	int	 n;
+
+	LASSERT (net != NULL);
+	LASSERT (!in_interrupt());
+	LASSERT (!tx->tx_queued);	       /* not on any send queue */
+	LASSERT (tx->tx_sending == 0);	  /* no send callbacks pending */
+	LASSERT (!tx->tx_waiting);	      /* not expecting a peer reply */
+	LASSERT (tx->tx_pool != NULL);
+
+	kiblnd_unmap_tx(ni, tx);
+
+	/* detach the (at most two) lnet msgs and remember the final status */
+	for (n = 0; n < 2; n++) {
+		lntmsg[n] = tx->tx_lntmsg[n];
+		tx->tx_lntmsg[n] = NULL;
+	}
+	status = tx->tx_status;
+
+	if (tx->tx_conn != NULL) {
+		LASSERT (ni == tx->tx_conn->ibc_peer->ibp_ni);
+
+		kiblnd_conn_decref(tx->tx_conn);	/* drop the conn reference */
+		tx->tx_conn = NULL;
+	}
+
+	tx->tx_status = 0;
+	tx->tx_nwrq = 0;
+
+	/* give the descriptor back to its pool first ... */
+	kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list);
+
+	/* ... and only then finalize whatever lnet msgs were attached */
+	for (n = 0; n < 2; n++)
+		if (lntmsg[n] != NULL)
+			lnet_finalize(ni, lntmsg[n], status);
+}
+
+void
+kiblnd_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int status)
+{
+	/* take txs off the front one at a time, completing each with 'status' */
+	for (;;) {
+		kib_tx_t *tx;
+		if (list_empty (txlist))
+			return;
+		tx = list_entry (txlist->next, kib_tx_t, tx_list);
+		list_del(&tx->tx_list);
+		tx->tx_status = status;
+		tx->tx_waiting = 0;		/* complete now */
+		kiblnd_tx_done(ni, tx);
+	}
+}
+
+kib_tx_t *
+kiblnd_get_idle_tx(lnet_ni_t *ni, lnet_nid_t target)
+{
+	kib_net_t		*net = (kib_net_t *)ni->ni_data;
+	kib_tx_poolset_t	*tps = net->ibn_tx_ps[lnet_cpt_of_nid(target)];
+	struct list_head		*node;
+	kib_tx_t		*tx;
+
+	/* allocate from the tx pool-set selected by the target NID's CPT */
+	node = kiblnd_pool_alloc_node(&tps->tps_poolset);
+	if (node == NULL)
+		return NULL;
+	tx = container_of(node, kib_tx_t, tx_list);
+
+	/* a descriptor coming out of the pool must be completely idle */
+	LASSERT (tx->tx_nwrq == 0);
+	LASSERT (!tx->tx_queued);
+	LASSERT (tx->tx_sending == 0);
+	LASSERT (!tx->tx_waiting);
+	LASSERT (tx->tx_status == 0);
+	LASSERT (tx->tx_conn == NULL);
+	LASSERT (tx->tx_lntmsg[0] == NULL);
+	LASSERT (tx->tx_lntmsg[1] == NULL);
+	LASSERT (tx->tx_u.pmr == NULL);
+	LASSERT (tx->tx_nfrags == 0);
+	return tx;
+}
+
+void
+kiblnd_drop_rx(kib_rx_t *rx)
+{
+	kib_conn_t		*conn	= rx->rx_conn;
+	unsigned long		irqflags;
+
+	/* ibc_nrx is updated under the lock of the conn's scheduler */
+	spin_lock_irqsave(&conn->ibc_sched->ibs_lock, irqflags);
+	LASSERT(conn->ibc_nrx > 0);
+	conn->ibc_nrx--;
+	spin_unlock_irqrestore(&conn->ibc_sched->ibs_lock, irqflags);
+
+	kiblnd_conn_decref(conn);
+}
+
+/* Post @rx for receive on its conn's QP.  @credit (IBLND_POSTRX_*) selects the
+ * credit counter bumped afterwards.  Returns 0 or the ib_post_recv() error. */
+int kiblnd_post_rx (kib_rx_t *rx, int credit)
+{
+	kib_conn_t	 *conn = rx->rx_conn;
+	kib_net_t	  *net = conn->ibc_peer->ibp_ni->ni_data;
+	struct ib_recv_wr  *bad_wrq = NULL;
+	struct ib_mr       *mr;
+	int		 rc;
+
+	LASSERT (net != NULL);
+	LASSERT (!in_interrupt());
+	LASSERT (credit == IBLND_POSTRX_NO_CREDIT ||
+		 credit == IBLND_POSTRX_PEER_CREDIT ||
+		 credit == IBLND_POSTRX_RSRVD_CREDIT);
+
+	mr = kiblnd_find_dma_mr(conn->ibc_hdev, rx->rx_msgaddr, IBLND_MSG_SIZE);
+	LASSERT (mr != NULL);
+	rx->rx_sge.lkey   = mr->lkey;
+	rx->rx_sge.addr   = rx->rx_msgaddr;
+	rx->rx_sge.length = IBLND_MSG_SIZE;
+	rx->rx_wrq.next = NULL;
+	rx->rx_wrq.sg_list = &rx->rx_sge;
+	rx->rx_wrq.num_sge = 1;
+	rx->rx_wrq.wr_id = kiblnd_ptr2wreqid(rx, IBLND_WID_RX);
+
+	LASSERT (conn->ibc_state >= IBLND_CONN_INIT);
+	LASSERT (rx->rx_nob >= 0);	      /* not posted */
+
+	if (conn->ibc_state > IBLND_CONN_ESTABLISHED) {
+		kiblnd_drop_rx(rx);	     /* No more posts for this rx */
+		return 0;
+	}
+
+	rx->rx_nob = -1;			/* flag posted */
+	/* a posted rx may complete and drop its conn ref at once: pin the conn */
+	kiblnd_conn_addref(conn);
+	rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, &bad_wrq);
+	if (rc != 0) {
+		CERROR("Can't post rx for %s: %d, bad_wrq: %p\n",
+		       libcfs_nid2str(conn->ibc_peer->ibp_nid), rc, bad_wrq);
+		rx->rx_nob = 0;
+	}
+
+	if (conn->ibc_state < IBLND_CONN_ESTABLISHED) /* Initial post */
+		goto out;
+
+	if (rc != 0) {
+		kiblnd_close_conn(conn, rc);
+		kiblnd_drop_rx(rx);	     /* No more posts for this rx */
+		goto out;
+	}
+	if (credit == IBLND_POSTRX_NO_CREDIT)
+		goto out;
+
+	spin_lock(&conn->ibc_lock);
+	if (credit == IBLND_POSTRX_PEER_CREDIT)
+		conn->ibc_outstanding_credits++;
+	else
+		conn->ibc_reserved_credits++;
+	spin_unlock(&conn->ibc_lock);
+	kiblnd_check_sends(conn);
+out:
+	kiblnd_conn_decref(conn);	/* release the pin taken before posting */
+	return rc;
+}
+
+kib_tx_t *
+kiblnd_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie)
+{
+	kib_tx_t *tx;
+
+	/* scan the active txs for a waiting one with this cookie and type */
+	list_for_each_entry(tx, &conn->ibc_active_txs, tx_list) {
+		LASSERT (!tx->tx_queued);
+		LASSERT (tx->tx_sending != 0 || tx->tx_waiting);
+
+		if (tx->tx_cookie != cookie)
+			continue;
+
+		if (tx->tx_waiting &&
+		    tx->tx_msg->ibm_type == txtype)
+			return tx;
+
+		/* cookie matched, but the tx is not waiting or has another type */
+		CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
+		      tx->tx_waiting ? "" : "NOT ",
+		      tx->tx_msg->ibm_type, txtype);
+	}
+	return NULL;
+}
+
+void
+kiblnd_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
+{
+	lnet_ni_t   *ni = conn->ibc_peer->ibp_ni;
+	kib_tx_t    *tx;
+	int	  done;
+
+	spin_lock(&conn->ibc_lock);
+
+	tx = kiblnd_find_waiting_tx_locked(conn, txtype, cookie);
+	if (tx == NULL) {
+		spin_unlock(&conn->ibc_lock);
+
+		/* no tx is waiting for this completion: close the conn */
+		CWARN("Unmatched completion type %x cookie "LPX64" from %s\n",
+		      txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+		kiblnd_close_conn(conn, -EPROTO);
+		return;
+	}
+
+	/* an earlier failure is kept; a good GET completion carries the length */
+	if (tx->tx_status == 0 && status < 0)
+		tx->tx_status = status;
+	else if (tx->tx_status == 0 && txtype == IBLND_MSG_GET_REQ)
+		lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status);
+
+	tx->tx_waiting = 0;
+
+	/* unlink only when the tx is neither queued nor still being sent */
+	done = !tx->tx_queued && (tx->tx_sending == 0);
+	if (done)
+		list_del(&tx->tx_list);
+
+	spin_unlock(&conn->ibc_lock);
+
+	if (done)
+		kiblnd_tx_done(ni, tx);
+}
+
+void
+kiblnd_send_completion(kib_conn_t *conn, int type, int status, __u64 cookie)
+{
+	lnet_nid_t   nid = conn->ibc_peer->ibp_nid;
+	lnet_ni_t   *ni = conn->ibc_peer->ibp_ni;
+	kib_tx_t    *tx = kiblnd_get_idle_tx(ni, nid);
+
+	if (tx == NULL) {
+		CERROR("Can't get tx for completion %x for %s\n",
+		       type, libcfs_nid2str(nid));
+		return;		/* no descriptor: the completion is not sent */
+	}
+
+	tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
+	tx->tx_msg->ibm_u.completion.ibcm_status = status;
+	kiblnd_init_tx_msg(ni, tx, type, sizeof(kib_completion_msg_t));
+	kiblnd_queue_tx(tx, conn);
+}
+
+void
+kiblnd_handle_rx (kib_rx_t *rx)
+{
+	kib_msg_t    *msg = rx->rx_msg;
+	kib_conn_t   *conn = rx->rx_conn;
+	lnet_ni_t    *ni = conn->ibc_peer->ibp_ni;
+	int	   credits = msg->ibm_credits;
+	kib_tx_t     *tx;
+	int	   rc = 0;
+	int	   rc2;
+	int	   post_credit;
+	/* Handle one received msg and decide how its rx buffer is re-posted */
+	LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+	CDEBUG (D_NET, "Received %x[%d] from %s\n",
+		msg->ibm_type, credits,
+		libcfs_nid2str(conn->ibc_peer->ibp_nid));
+
+	if (credits != 0) {
+		/* Have I received credits that will let me send? */
+		spin_lock(&conn->ibc_lock);
+
+		if (conn->ibc_credits + credits >
+		    IBLND_MSG_QUEUE_SIZE(conn->ibc_version)) {
+			rc2 = conn->ibc_credits; /* snapshot for the log below */
+			spin_unlock(&conn->ibc_lock);
+
+			CERROR("Bad credits from %s: %d + %d > %d\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid),
+			       rc2, credits,
+			       IBLND_MSG_QUEUE_SIZE(conn->ibc_version));
+
+			kiblnd_close_conn(conn, -EPROTO);
+			kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT);
+			return;
+		}
+
+		conn->ibc_credits += credits;
+
+		/* This ensures the credit taken by NOOP can be returned */
+		if (msg->ibm_type == IBLND_MSG_NOOP &&
+		    !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */
+			conn->ibc_outstanding_credits++;
+
+		spin_unlock(&conn->ibc_lock);
+		kiblnd_check_sends(conn);
+	}
+	/* post_credit picks which credit count, if any, the re-post bumps */
+	switch (msg->ibm_type) {
+	default:
+		CERROR("Bad IBLND message type %x from %s\n",
+		       msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+		post_credit = IBLND_POSTRX_NO_CREDIT;
+		rc = -EPROTO;
+		break;
+
+	case IBLND_MSG_NOOP:
+		if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
+			post_credit = IBLND_POSTRX_NO_CREDIT;
+			break;
+		}
+
+		if (credits != 0) /* credit already posted */
+			post_credit = IBLND_POSTRX_NO_CREDIT;
+		else	      /* a keepalive NOOP */
+			post_credit = IBLND_POSTRX_PEER_CREDIT;
+		break;
+
+	case IBLND_MSG_IMMEDIATE:
+		post_credit = IBLND_POSTRX_DONT_POST;
+		rc = lnet_parse(ni, &msg->ibm_u.immediate.ibim_hdr,
+				msg->ibm_srcnid, rx, 0);
+		if (rc < 0)		     /* repost on error */
+			post_credit = IBLND_POSTRX_PEER_CREDIT;
+		break;
+
+	case IBLND_MSG_PUT_REQ:
+		post_credit = IBLND_POSTRX_DONT_POST;
+		rc = lnet_parse(ni, &msg->ibm_u.putreq.ibprm_hdr,
+				msg->ibm_srcnid, rx, 1);
+		if (rc < 0)		     /* repost on error */
+			post_credit = IBLND_POSTRX_PEER_CREDIT;
+		break;
+
+	case IBLND_MSG_PUT_NAK:
+		CWARN ("PUT_NACK from %s\n",
+		       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+		post_credit = IBLND_POSTRX_RSRVD_CREDIT;
+		kiblnd_handle_completion(conn, IBLND_MSG_PUT_REQ,
+					 msg->ibm_u.completion.ibcm_status,
+					 msg->ibm_u.completion.ibcm_cookie);
+		break;
+
+	case IBLND_MSG_PUT_ACK:
+		post_credit = IBLND_POSTRX_RSRVD_CREDIT;
+		/* take the matching PUT_REQ tx off ibc_active_txs under the lock */
+		spin_lock(&conn->ibc_lock);
+		tx = kiblnd_find_waiting_tx_locked(conn, IBLND_MSG_PUT_REQ,
+					msg->ibm_u.putack.ibpam_src_cookie);
+		if (tx != NULL)
+			list_del(&tx->tx_list);
+		spin_unlock(&conn->ibc_lock);
+
+		if (tx == NULL) {
+			CERROR("Unmatched PUT_ACK from %s\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+			rc = -EPROTO;
+			break;
+		}
+
+		LASSERT (tx->tx_waiting);
+		/* CAVEAT EMPTOR: I could be racing with tx_complete, but...
+		 * (a) I can overwrite tx_msg since my peer has received it!
+		 * (b) tx_waiting set tells tx_complete() it's not done. */
+
+		tx->tx_nwrq = 0;		/* overwrite PUT_REQ */
+
+		rc2 = kiblnd_init_rdma(conn, tx, IBLND_MSG_PUT_DONE,
+				       kiblnd_rd_size(&msg->ibm_u.putack.ibpam_rd),
+				       &msg->ibm_u.putack.ibpam_rd,
+				       msg->ibm_u.putack.ibpam_dst_cookie);
+		if (rc2 < 0)
+			CERROR("Can't setup rdma for PUT to %s: %d\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2);
+		/* the tx is queued even when rc2 < 0; the failure is only logged */
+		spin_lock(&conn->ibc_lock);
+		tx->tx_waiting = 0;	/* clear waiting and queue atomically */
+		kiblnd_queue_tx_locked(tx, conn);
+		spin_unlock(&conn->ibc_lock);
+		break;
+
+	case IBLND_MSG_PUT_DONE:
+		post_credit = IBLND_POSTRX_PEER_CREDIT;
+		kiblnd_handle_completion(conn, IBLND_MSG_PUT_ACK,
+					 msg->ibm_u.completion.ibcm_status,
+					 msg->ibm_u.completion.ibcm_cookie);
+		break;
+
+	case IBLND_MSG_GET_REQ:
+		post_credit = IBLND_POSTRX_DONT_POST;
+		rc = lnet_parse(ni, &msg->ibm_u.get.ibgm_hdr,
+				msg->ibm_srcnid, rx, 1);
+		if (rc < 0)		     /* repost on error */
+			post_credit = IBLND_POSTRX_PEER_CREDIT;
+		break;
+
+	case IBLND_MSG_GET_DONE:
+		post_credit = IBLND_POSTRX_RSRVD_CREDIT;
+		kiblnd_handle_completion(conn, IBLND_MSG_GET_REQ,
+					 msg->ibm_u.completion.ibcm_status,
+					 msg->ibm_u.completion.ibcm_cookie);
+		break;
+	}
+
+	if (rc < 0)			     /* protocol error */
+		kiblnd_close_conn(conn, rc);
+	/* DONT_POST: rx was passed to lnet_parse() and is not re-posted here */
+	if (post_credit != IBLND_POSTRX_DONT_POST)
+		kiblnd_post_rx(rx, post_credit);
+}
+
+void
+kiblnd_rx_complete (kib_rx_t *rx, int status, int nob)
+{
+	kib_msg_t    *msg = rx->rx_msg;
+	kib_conn_t   *conn = rx->rx_conn;
+	lnet_ni_t    *ni = conn->ibc_peer->ibp_ni;
+	kib_net_t    *net = ni->ni_data;
+	int	   rc;
+	int	   err = -EIO;
+	/* Receive completion: validate the message, then handle or park it */
+	LASSERT (net != NULL);
+	LASSERT (rx->rx_nob < 0);	       /* was posted */
+	rx->rx_nob = 0;			 /* isn't now */
+
+	if (conn->ibc_state > IBLND_CONN_ESTABLISHED)
+		goto ignore;
+
+	if (status != IB_WC_SUCCESS) {
+		CNETERR("Rx from %s failed: %d\n",
+			libcfs_nid2str(conn->ibc_peer->ibp_nid), status);
+		goto failed;
+	}
+
+	LASSERT (nob >= 0);
+	rx->rx_nob = nob;
+
+	rc = kiblnd_unpack_msg(msg, rx->rx_nob);
+	if (rc != 0) {
+		CERROR ("Error %d unpacking rx from %s\n",
+			rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+		goto failed;
+	}
+	/* source/destination NIDs and incarnation stamps must match this conn */
+	if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
+	    msg->ibm_dstnid != ni->ni_nid ||
+	    msg->ibm_srcstamp != conn->ibc_incarnation ||
+	    msg->ibm_dststamp != net->ibn_incarnation) {
+		CERROR ("Stale rx from %s\n",
+			libcfs_nid2str(conn->ibc_peer->ibp_nid));
+		err = -ESTALE;
+		goto failed;
+	}
+
+	/* set time last known alive */
+	kiblnd_peer_alive(conn->ibc_peer);
+
+	/* racing with connection establishment/teardown! */
+
+	if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+		rwlock_t  *g_lock = &kiblnd_data.kib_global_lock;
+		unsigned long  flags;
+
+		write_lock_irqsave(g_lock, flags);
+		/* must check holding global lock to eliminate race */
+		if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+			list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
+			write_unlock_irqrestore(g_lock, flags);
+			return;	/* rx stays parked on ibc_early_rxs */
+		}
+		write_unlock_irqrestore(g_lock, flags);
+	}
+	kiblnd_handle_rx(rx);
+	return;
+
+ failed:
+	CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
+	kiblnd_close_conn(conn, err);
+ ignore:
+	kiblnd_drop_rx(rx);		     /* Don't re-post rx. */
+}
+
+struct page *
+kiblnd_kvaddr_to_page (unsigned long vaddr)
+{
+	struct page *page;
+
+	/* Translate a kernel virtual address to its struct page: vmalloc
+	 * space goes through vmalloc_to_page(), everything else through
+	 * virt_to_page(); an address in the PKMAP window is fatal. */
+	if (vaddr < VMALLOC_START || vaddr >= VMALLOC_END) {
+#ifdef CONFIG_HIGHMEM
+		if (vaddr >= PKMAP_BASE &&
+		    vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
+			CERROR("find page for address in highmem\n");
+			LBUG();
+		}
+#endif
+		page = virt_to_page (vaddr);
+	} else {
+		page = vmalloc_to_page ((void *)vaddr);
+	}
+	LASSERT (page != NULL);
+	return page;
+}
+
+static int
+kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob)
+{
+	kib_hca_dev_t		*hdev;
+	__u64			*pages = tx->tx_pages;
+	kib_fmr_poolset_t	*fps;
+	int			npages;
+	int			size;
+	int			cpt;
+	int			rc;
+	int			i;
+	/* Map rd's fragments with an FMR from net's pool set and collapse rd */
+	LASSERT(tx->tx_pool != NULL);
+	LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL);
+
+	hdev  = tx->tx_pool->tpo_hdev;
+	/* list each fragment as ibh_page_size-strided, page-masked addresses */
+	for (i = 0, npages = 0; i < rd->rd_nfrags; i++) {
+		for (size = 0; size <  rd->rd_frags[i].rf_nob;
+			       size += hdev->ibh_page_size) {
+			pages[npages ++] = (rd->rd_frags[i].rf_addr &
+					    hdev->ibh_page_mask) + size;
+		}
+	}
+
+	cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt;
+	/* the FMR pool set is selected by the tx pool owner's ps_cpt */
+	fps = net->ibn_fmr_ps[cpt];
+	rc = kiblnd_fmr_pool_map(fps, pages, npages, 0, &tx->tx_u.fmr);
+	if (rc != 0) {
+		CERROR ("Can't map %d pages: %d\n", npages, rc);
+		return rc;
+	}
+
+	/* If rd is not tx_rd, it's going to get sent to a peer, who will need
+	 * the rkey */
+	rd->rd_key = (rd != tx->tx_rd) ? tx->tx_u.fmr.fmr_pfmr->fmr->rkey :
+					 tx->tx_u.fmr.fmr_pfmr->fmr->lkey;
+	rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask;
+	rd->rd_frags[0].rf_nob   = nob;
+	rd->rd_nfrags = 1;	/* rd now describes the mapping as one fragment */
+
+	return 0;
+}
+
+static int
+kiblnd_pmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob)
+{
+	kib_hca_dev_t		*hdev;
+	kib_pmr_poolset_t	*pps;
+	__u64			iova;
+	int			cpt;
+	int			rc;
+	/* Map rd with an MR from net's PMR pool set and collapse rd to 1 frag */
+	LASSERT(tx->tx_pool != NULL);
+	LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL);
+
+	hdev = tx->tx_pool->tpo_hdev;
+	/* seed iova with the first fragment's bits outside ibh_page_mask */
+	iova = rd->rd_frags[0].rf_addr & ~hdev->ibh_page_mask;
+
+	cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt;
+	/* the PMR pool set is selected by the tx pool owner's ps_cpt */
+	pps = net->ibn_pmr_ps[cpt];
+	rc = kiblnd_pmr_pool_map(pps, hdev, rd, &iova, &tx->tx_u.pmr);
+	if (rc != 0) {
+		CERROR("Failed to create MR by phybuf: %d\n", rc);
+		return rc;
+	}
+
+	/* If rd is not tx_rd, it's going to get sent to a peer, who will need
+	 * the rkey */
+	rd->rd_key = (rd != tx->tx_rd) ? tx->tx_u.pmr->pmr_mr->rkey :
+					 tx->tx_u.pmr->pmr_mr->lkey;
+	rd->rd_nfrags = 1;
+	rd->rd_frags[0].rf_addr = iova;	/* iova as left by the pool map call */
+	rd->rd_frags[0].rf_nob  = nob;
+
+	return 0;
+}
+
+void
+kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx)
+{
+	kib_net_t  *net = ni->ni_data;
+	/* Release the tx's FMR or PMR mapping, then its scatterlist DMA mapping */
+	LASSERT(net != NULL);
+
+	if (net->ibn_fmr_ps != NULL && tx->tx_u.fmr.fmr_pfmr != NULL) {
+		kiblnd_fmr_pool_unmap(&tx->tx_u.fmr, tx->tx_status);
+		tx->tx_u.fmr.fmr_pfmr = NULL;
+
+	} else if (net->ibn_pmr_ps != NULL && tx->tx_u.pmr != NULL) {
+		kiblnd_pmr_pool_unmap(tx->tx_u.pmr);
+		tx->tx_u.pmr = NULL;
+	}
+	/* tx_nfrags is zeroed afterwards, so a repeated call skips the unmap */
+	if (tx->tx_nfrags != 0) {
+		kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev,
+				    tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
+		tx->tx_nfrags = 0;
+	}
+}
+
+int
+kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx,
+	      kib_rdma_desc_t *rd, int nfrags)
+{
+	kib_hca_dev_t      *hdev  = tx->tx_pool->tpo_hdev;
+	kib_net_t	  *net   = ni->ni_data;
+	struct ib_mr       *mr    = NULL;
+	__u32	       nob;
+	int		 i;
+	/* DMA-map the first nfrags entries of tx_frags into rd and set rd_key */
+	/* If rd is not tx_rd, it's going to get sent to a peer and I'm the
+	 * RDMA sink */
+	tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+	tx->tx_nfrags = nfrags;
+
+	rd->rd_nfrags =
+		kiblnd_dma_map_sg(hdev->ibh_ibdev,
+				  tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
+	/* copy each mapped entry's length/address into rd and total the bytes */
+	for (i = 0, nob = 0; i < rd->rd_nfrags; i++) {
+		rd->rd_frags[i].rf_nob  = kiblnd_sg_dma_len(
+			hdev->ibh_ibdev, &tx->tx_frags[i]);
+		rd->rd_frags[i].rf_addr = kiblnd_sg_dma_address(
+			hdev->ibh_ibdev, &tx->tx_frags[i]);
+		nob += rd->rd_frags[i].rf_nob;
+	}
+
+	/* looking for pre-mapping MR */
+	mr = kiblnd_find_rd_dma_mr(hdev, rd);
+	if (mr != NULL) {
+		/* found pre-mapping MR */
+		rd->rd_key = (rd != tx->tx_rd) ? mr->rkey : mr->lkey;
+		return 0;
+	}
+	/* no pre-mapped MR found for rd: map it via the FMR or PMR pool set */
+	if (net->ibn_fmr_ps != NULL)
+		return kiblnd_fmr_map_tx(net, tx, rd, nob);
+	else if (net->ibn_pmr_ps != NULL)
+		return kiblnd_pmr_map_tx(net, tx, rd, nob);
+	/* net has neither an FMR nor a PMR pool set */
+	return -EINVAL;
+}
+
+
+int
+kiblnd_setup_rd_iov(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
+		    unsigned int niov, struct iovec *iov, int offset, int nob)
+{
+	kib_net_t	  *net = ni->ni_data;
+	struct page	*page;
+	struct scatterlist *sg;
+	unsigned long       vaddr;
+	int		 fragnob;
+	int		 page_offset;
+	/* Fill tx_frags from a kernel-vaddr iovec, then map it with rd */
+	LASSERT (nob > 0);
+	LASSERT (niov > 0);
+	LASSERT (net != NULL);
+	/* skip whole iovec entries that lie before 'offset' */
+	while (offset >= iov->iov_len) {
+		offset -= iov->iov_len;
+		niov--;
+		iov++;
+		LASSERT (niov > 0);
+	}
+
+	sg = tx->tx_frags;
+	do {
+		LASSERT (niov > 0);
+
+		vaddr = ((unsigned long)iov->iov_base) + offset;
+		page_offset = vaddr & (PAGE_SIZE - 1);
+		page = kiblnd_kvaddr_to_page(vaddr);
+		if (page == NULL) {
+			CERROR ("Can't find page\n");
+			return -EFAULT;
+		}
+		/* a fragment stops at the end of the iovec entry, payload or page */
+		fragnob = min((int)(iov->iov_len - offset), nob);
+		fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);
+
+		sg_set_page(sg, page, fragnob, page_offset);
+		sg++;
+		/* stay in this iovec entry until all of it has been consumed */
+		if (offset + fragnob < iov->iov_len) {
+			offset += fragnob;
+		} else {
+			offset = 0;
+			iov++;
+			niov--;
+		}
+		nob -= fragnob;
+	} while (nob > 0);
+
+	return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
+}
+
+int
+kiblnd_setup_rd_kiov (lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
+		      int nkiov, lnet_kiov_t *kiov, int offset, int nob)
+{
+	kib_net_t	  *net = ni->ni_data;
+	struct scatterlist *sg;
+	int		 fragnob;
+	/* Fill tx_frags from page-based kiov entries, then map it with rd */
+	CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
+
+	LASSERT (nob > 0);
+	LASSERT (nkiov > 0);
+	LASSERT (net != NULL);
+	/* skip whole kiov entries that lie before 'offset' */
+	while (offset >= kiov->kiov_len) {
+		offset -= kiov->kiov_len;
+		nkiov--;
+		kiov++;
+		LASSERT (nkiov > 0);
+	}
+
+	sg = tx->tx_frags;
+	do {
+		LASSERT (nkiov > 0);
+
+		fragnob = min((int)(kiov->kiov_len - offset), nob);
+
+		sg_set_page(sg, kiov->kiov_page, fragnob,
+			    kiov->kiov_offset + offset);
+		sg++;
+		/* only the first fragment can start part-way into its kiov */
+		offset = 0;
+		kiov++;
+		nkiov--;
+		nob -= fragnob;
+	} while (nob > 0);
+
+	return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
+}
+
+int
+kiblnd_post_tx_locked (kib_conn_t *conn, kib_tx_t *tx, int credit)
+{
+	kib_msg_t	 *msg = tx->tx_msg;
+	kib_peer_t	*peer = conn->ibc_peer;
+	int		ver = conn->ibc_version;
+	int		rc;
+	int		done;
+	struct ib_send_wr *bad_wrq;
+	/* Post a queued tx.  Called with ibc_lock held; may drop and retake it. */
+	LASSERT (tx->tx_queued);
+	/* We rely on this for QP sizing */
+	LASSERT (tx->tx_nwrq > 0);
+	LASSERT (tx->tx_nwrq <= 1 + IBLND_RDMA_FRAGS(ver));
+	/* Returns 0 (posted, or NOOP dropped), -EAGAIN (must wait) or -EIO */
+	LASSERT (credit == 0 || credit == 1);
+	LASSERT (conn->ibc_outstanding_credits >= 0);
+	LASSERT (conn->ibc_outstanding_credits <= IBLND_MSG_QUEUE_SIZE(ver));
+	LASSERT (conn->ibc_credits >= 0);
+	LASSERT (conn->ibc_credits <= IBLND_MSG_QUEUE_SIZE(ver));
+
+	if (conn->ibc_nsends_posted == IBLND_CONCURRENT_SENDS(ver)) {
+		/* tx completions outstanding... */
+		CDEBUG(D_NET, "%s: posted enough\n",
+		       libcfs_nid2str(peer->ibp_nid));
+		return -EAGAIN;
+	}
+
+	if (credit != 0 && conn->ibc_credits == 0) {   /* no credits */
+		CDEBUG(D_NET, "%s: no credits\n",
+		       libcfs_nid2str(peer->ibp_nid));
+		return -EAGAIN;
+	}
+
+	if (credit != 0 && !IBLND_OOB_CAPABLE(ver) &&
+	    conn->ibc_credits == 1 &&   /* last credit reserved */
+	    msg->ibm_type != IBLND_MSG_NOOP) {      /* for NOOP */
+		CDEBUG(D_NET, "%s: not using last credit\n",
+		       libcfs_nid2str(peer->ibp_nid));
+		return -EAGAIN;
+	}
+
+	/* NB don't drop ibc_lock before bumping tx_sending */
+	list_del(&tx->tx_list);
+	tx->tx_queued = 0;
+
+	if (msg->ibm_type == IBLND_MSG_NOOP &&
+	    (!kiblnd_need_noop(conn) ||     /* redundant NOOP */
+	     (IBLND_OOB_CAPABLE(ver) && /* posted enough NOOP */
+	      conn->ibc_noops_posted == IBLND_OOB_MSGS(ver)))) {
+		/* OK to drop when posted enough NOOPs, since
+		 * kiblnd_check_sends will queue NOOP again when
+		 * posted NOOPs complete */
+		spin_unlock(&conn->ibc_lock);
+		kiblnd_tx_done(peer->ibp_ni, tx);
+		spin_lock(&conn->ibc_lock);
+		CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n",
+		       libcfs_nid2str(peer->ibp_nid),
+		       conn->ibc_noops_posted);
+		return 0;
+	}
+
+	kiblnd_pack_msg(peer->ibp_ni, msg, ver, conn->ibc_outstanding_credits,
+			peer->ibp_nid, conn->ibc_incarnation);
+	/* ibc_outstanding_credits was passed to kiblnd_pack_msg(), so reset it */
+	conn->ibc_credits -= credit;
+	conn->ibc_outstanding_credits = 0;
+	conn->ibc_nsends_posted++;
+	if (msg->ibm_type == IBLND_MSG_NOOP)
+		conn->ibc_noops_posted++;
+
+	/* CAVEAT EMPTOR!  This tx could be the PUT_DONE of an RDMA
+	 * PUT.  If so, it was first queued here as a PUT_REQ, sent and
+	 * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
+	 * and then re-queued here.  It's (just) possible that
+	 * tx_sending is non-zero if we've not done the tx_complete()
+	 * from the first send; hence the ++ rather than = below. */
+	tx->tx_sending++;
+	list_add(&tx->tx_list, &conn->ibc_active_txs);
+
+	/* I'm still holding ibc_lock! */
+	if (conn->ibc_state != IBLND_CONN_ESTABLISHED) {
+		rc = -ECONNABORTED;
+	} else if (tx->tx_pool->tpo_pool.po_failed ||
+		 conn->ibc_hdev != tx->tx_pool->tpo_hdev) {
+		/* close_conn will launch failover */
+		rc = -ENETDOWN;
+	} else {
+		rc = ib_post_send(conn->ibc_cmid->qp,
+				  tx->tx_wrq, &bad_wrq);
+	}
+
+	conn->ibc_last_send = jiffies;
+
+	if (rc == 0)
+		return 0;
+
+	/* NB credits are transferred in the actual
+	 * message, which can only be the last work item */
+	conn->ibc_credits += credit;
+	conn->ibc_outstanding_credits += msg->ibm_credits;
+	conn->ibc_nsends_posted--;
+	if (msg->ibm_type == IBLND_MSG_NOOP)
+		conn->ibc_noops_posted--;
+	/* fail the tx; it is freed below only if no other send is in flight */
+	tx->tx_status = rc;
+	tx->tx_waiting = 0;
+	tx->tx_sending--;
+
+	done = (tx->tx_sending == 0);
+	if (done)
+		list_del(&tx->tx_list);
+
+	spin_unlock(&conn->ibc_lock);
+
+	if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
+		CERROR("Error %d posting transmit to %s\n",
+		       rc, libcfs_nid2str(peer->ibp_nid));
+	else
+		CDEBUG(D_NET, "Error %d posting transmit to %s\n",
+		       rc, libcfs_nid2str(peer->ibp_nid));
+
+	kiblnd_close_conn(conn, rc);
+
+	if (done)
+		kiblnd_tx_done(peer->ibp_ni, tx);
+	/* retake ibc_lock so it is held again when control returns */
+	spin_lock(&conn->ibc_lock);
+
+	return -EIO;
+}
+
+void
+kiblnd_check_sends (kib_conn_t *conn)
+{
+	int	ver = conn->ibc_version;
+	lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
+	kib_tx_t  *tx;
+	/* Post queued txs on conn until the queues drain or a post is refused */
+	/* Don't send anything until after the connection is established */
+	if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+		CDEBUG(D_NET, "%s too soon\n",
+		       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+		return;
+	}
+
+	spin_lock(&conn->ibc_lock);
+
+	LASSERT (conn->ibc_nsends_posted <= IBLND_CONCURRENT_SENDS(ver));
+	LASSERT (!IBLND_OOB_CAPABLE(ver) ||
+		 conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver));
+	LASSERT (conn->ibc_reserved_credits >= 0);
+	/* each reserved credit moves one tx from the rsrvd queue to tx_queue */
+	while (conn->ibc_reserved_credits > 0 &&
+	       !list_empty(&conn->ibc_tx_queue_rsrvd)) {
+		tx = list_entry(conn->ibc_tx_queue_rsrvd.next,
+				    kib_tx_t, tx_list);
+		list_del(&tx->tx_list);
+		list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
+		conn->ibc_reserved_credits--;
+	}
+	/* the NOOP tx is fetched with ibc_lock dropped, then queued under it */
+	if (kiblnd_need_noop(conn)) {
+		spin_unlock(&conn->ibc_lock);
+
+		tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
+		if (tx != NULL)
+			kiblnd_init_tx_msg(ni, tx, IBLND_MSG_NOOP, 0);
+
+		spin_lock(&conn->ibc_lock);
+		if (tx != NULL)
+			kiblnd_queue_tx_locked(tx, conn);
+	}
+
+	kiblnd_conn_addref(conn); /* 1 ref for me.... (see b21911) */
+
+	for (;;) {
+		int credit;
+		/* queues are served in order: nocred, noops, then ibc_tx_queue */
+		if (!list_empty(&conn->ibc_tx_queue_nocred)) {
+			credit = 0;
+			tx = list_entry(conn->ibc_tx_queue_nocred.next,
+					    kib_tx_t, tx_list);
+		} else if (!list_empty(&conn->ibc_tx_noops)) {
+			LASSERT (!IBLND_OOB_CAPABLE(ver));
+			credit = 1;
+			tx = list_entry(conn->ibc_tx_noops.next,
+					kib_tx_t, tx_list);
+		} else if (!list_empty(&conn->ibc_tx_queue)) {
+			credit = 1;
+			tx = list_entry(conn->ibc_tx_queue.next,
+					    kib_tx_t, tx_list);
+		} else
+			break;
+		/* stop as soon as a post is refused (-EAGAIN) or fails (-EIO) */
+		if (kiblnd_post_tx_locked(conn, tx, credit) != 0)
+			break;
+	}
+
+	spin_unlock(&conn->ibc_lock);
+
+	kiblnd_conn_decref(conn); /* ...until here */
+}
+
+void
+kiblnd_tx_complete (kib_tx_t *tx, int status)
+{
+	int	   failed = (status != IB_WC_SUCCESS);
+	kib_conn_t   *conn = tx->tx_conn;
+	int	   idle;
+	/* A send posted for tx has completed; free tx if that leaves it idle */
+	LASSERT (tx->tx_sending > 0);
+
+	if (failed) {
+		if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
+			CNETERR("Tx -> %s cookie "LPX64
+				" sending %d waiting %d: failed %d\n",
+				libcfs_nid2str(conn->ibc_peer->ibp_nid),
+				tx->tx_cookie, tx->tx_sending, tx->tx_waiting,
+				status);
+
+		kiblnd_close_conn(conn, -EIO);
+	} else {
+		kiblnd_peer_alive(conn->ibc_peer);
+	}
+
+	spin_lock(&conn->ibc_lock);
+
+	/* I could be racing with rdma completion.  Whoever makes 'tx' idle
+	 * gets to free it, which also drops its ref on 'conn'. */
+
+	tx->tx_sending--;
+	conn->ibc_nsends_posted--;
+	if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP)
+		conn->ibc_noops_posted--;
+
+	if (failed) {
+		tx->tx_waiting = 0;	     /* don't wait for peer */
+		tx->tx_status = -EIO;
+	}
+
+	idle = (tx->tx_sending == 0) &&	 /* This is the final callback */
+	       !tx->tx_waiting &&	       /* Not waiting for peer */
+	       !tx->tx_queued;		  /* Not re-queued (PUT_DONE) */
+	if (idle)
+		list_del(&tx->tx_list);
+
+	kiblnd_conn_addref(conn);	       /* 1 ref for me.... */
+
+	spin_unlock(&conn->ibc_lock);
+
+	if (idle)
+		kiblnd_tx_done(conn->ibc_peer->ibp_ni, tx);
+	/* ibc_nsends_posted just dropped, so queued txs may now be postable */
+	kiblnd_check_sends(conn);
+
+	kiblnd_conn_decref(conn);	       /* ...until here */
+}
+
+void
+kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob)
+{
+	kib_hca_dev_t     *hdev = tx->tx_pool->tpo_hdev;
+	struct ib_sge     *sge = &tx->tx_sge[tx->tx_nwrq];
+	struct ib_send_wr *wrq = &tx->tx_wrq[tx->tx_nwrq];
+	int		nob = offsetof (kib_msg_t, ibm_u) + body_nob;
+	struct ib_mr      *mr;
+	/* Append the SEND work request for tx_msg (header + body_nob bytes) */
+	LASSERT (tx->tx_nwrq >= 0);
+	LASSERT (tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1);
+	LASSERT (nob <= IBLND_MSG_SIZE);
+
+	kiblnd_init_msg(tx->tx_msg, type, body_nob);
+
+	mr = kiblnd_find_dma_mr(hdev, tx->tx_msgaddr, nob);
+	LASSERT (mr != NULL);
+	/* a single SGE spans the first nob bytes at tx_msgaddr */
+	sge->lkey   = mr->lkey;
+	sge->addr   = tx->tx_msgaddr;
+	sge->length = nob;
+
+	memset(wrq, 0, sizeof(*wrq));
+	/* next == NULL: this request ends the chain built in tx_wrq[] */
+	wrq->next       = NULL;
+	wrq->wr_id      = kiblnd_ptr2wreqid(tx, IBLND_WID_TX);
+	wrq->sg_list    = sge;
+	wrq->num_sge    = 1;
+	wrq->opcode     = IB_WR_SEND;
+	wrq->send_flags = IB_SEND_SIGNALED;
+
+	tx->tx_nwrq++;
+}
+
+int
+kiblnd_init_rdma (kib_conn_t *conn, kib_tx_t *tx, int type,
+		  int resid, kib_rdma_desc_t *dstrd, __u64 dstcookie)
+{
+	kib_msg_t	 *ibmsg = tx->tx_msg;
+	kib_rdma_desc_t   *srcrd = tx->tx_rd;
+	struct ib_sge     *sge = &tx->tx_sge[0];
+	struct ib_send_wr *wrq = &tx->tx_wrq[0];
+	int		rc  = resid;
+	int		srcidx;
+	int		dstidx;
+	int		wrknob;
+	/* Set up RDMA writes of resid bytes from tx_rd to dstrd + a 'type' msg */
+	LASSERT (!in_interrupt());
+	LASSERT (tx->tx_nwrq == 0);
+	LASSERT (type == IBLND_MSG_GET_DONE ||
+		 type == IBLND_MSG_PUT_DONE);
+	/* Returns resid as passed in, or a negative errno (also sent as status) */
+	srcidx = dstidx = 0;
+
+	while (resid > 0) {
+		if (srcidx >= srcrd->rd_nfrags) {
+			CERROR("Src buffer exhausted: %d frags\n", srcidx);
+			rc = -EPROTO;
+			break;
+		}
+
+		if (dstidx == dstrd->rd_nfrags) {
+			CERROR("Dst buffer exhausted: %d frags\n", dstidx);
+			rc = -EPROTO;
+			break;
+		}
+
+		if (tx->tx_nwrq == IBLND_RDMA_FRAGS(conn->ibc_version)) {
+			CERROR("RDMA too fragmented for %s (%d): "
+			       "%d/%d src %d/%d dst frags\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid),
+			       IBLND_RDMA_FRAGS(conn->ibc_version),
+			       srcidx, srcrd->rd_nfrags,
+			       dstidx, dstrd->rd_nfrags);
+			rc = -EMSGSIZE;
+			break;
+		}
+		/* each request moves at most the smaller of the src/dst frags */
+		wrknob = MIN(MIN(kiblnd_rd_frag_size(srcrd, srcidx),
+				 kiblnd_rd_frag_size(dstrd, dstidx)), resid);
+
+		sge = &tx->tx_sge[tx->tx_nwrq];
+		sge->addr   = kiblnd_rd_frag_addr(srcrd, srcidx);
+		sge->lkey   = kiblnd_rd_frag_key(srcrd, srcidx);
+		sge->length = wrknob;
+
+		wrq = &tx->tx_wrq[tx->tx_nwrq];
+		/* links to the next tx_wrq[] slot, filled by a later step */
+		wrq->next       = wrq + 1;
+		wrq->wr_id      = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA);
+		wrq->sg_list    = sge;
+		wrq->num_sge    = 1;
+		wrq->opcode     = IB_WR_RDMA_WRITE;
+		wrq->send_flags = 0;
+
+		wrq->wr.rdma.remote_addr = kiblnd_rd_frag_addr(dstrd, dstidx);
+		wrq->wr.rdma.rkey	= kiblnd_rd_frag_key(dstrd, dstidx);
+
+		srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, wrknob);
+		dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, wrknob);
+
+		resid -= wrknob;
+
+		tx->tx_nwrq++;
+		wrq++;
+		sge++;
+	}
+
+	if (rc < 0)			     /* no RDMA if completing with failure */
+		tx->tx_nwrq = 0;
+	/* the completion message is always appended, carrying rc as status */
+	ibmsg->ibm_u.completion.ibcm_status = rc;
+	ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
+	kiblnd_init_tx_msg(conn->ibc_peer->ibp_ni, tx,
+			   type, sizeof (kib_completion_msg_t));
+
+	return rc;
+}
+
+void
+kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
+{
+	struct list_head   *q;
+	/* Add tx to the tail of the conn queue chosen by its message type */
+	LASSERT (tx->tx_nwrq > 0);	      /* work items set up */
+	LASSERT (!tx->tx_queued);	       /* not queued for sending already */
+	LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+	tx->tx_queued = 1;
+	tx->tx_deadline = jiffies + (*kiblnd_tunables.kib_timeout * HZ);
+	/* first queueing takes a conn ref for the tx; a re-queue is PUT_DONE */
+	if (tx->tx_conn == NULL) {
+		kiblnd_conn_addref(conn);
+		tx->tx_conn = conn;
+		LASSERT (tx->tx_msg->ibm_type != IBLND_MSG_PUT_DONE);
+	} else {
+		/* PUT_DONE first attached to conn as a PUT_REQ */
+		LASSERT (tx->tx_conn == conn);
+		LASSERT (tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE);
+	}
+
+	switch (tx->tx_msg->ibm_type) {
+	default:
+		LBUG();
+
+	case IBLND_MSG_PUT_REQ:
+	case IBLND_MSG_GET_REQ:
+		q = &conn->ibc_tx_queue_rsrvd;
+		break;
+
+	case IBLND_MSG_PUT_NAK:
+	case IBLND_MSG_PUT_ACK:
+	case IBLND_MSG_PUT_DONE:
+	case IBLND_MSG_GET_DONE:
+		q = &conn->ibc_tx_queue_nocred;
+		break;
+
+	case IBLND_MSG_NOOP:
+		if (IBLND_OOB_CAPABLE(conn->ibc_version))
+			q = &conn->ibc_tx_queue_nocred;
+		else
+			q = &conn->ibc_tx_noops;
+		break;
+
+	case IBLND_MSG_IMMEDIATE:
+		q = &conn->ibc_tx_queue;
+		break;
+	}
+
+	list_add_tail(&tx->tx_list, q);
+}
+
+void
+kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
+{	/* Queue tx on conn under ibc_lock, then try to post queued txs */
+	spin_lock(&conn->ibc_lock);
+	kiblnd_queue_tx_locked(tx, conn);
+	spin_unlock(&conn->ibc_lock);
+	/* kiblnd_check_sends() takes ibc_lock itself, so call it unlocked */
+	kiblnd_check_sends(conn);
+}
+
+static int kiblnd_resolve_addr(struct rdma_cm_id *cmid,
+			       struct sockaddr_in *srcaddr,
+			       struct sockaddr_in *dstaddr,
+			       int timeout_ms)
+{
+	unsigned short candidate = PROT_SOCK - 1;
+	int err;
+
+	/* Enable address reuse on the cmid, then start resolution from the
+	 * first source port, counting down from PROT_SOCK - 1, that binds. */
+	err = rdma_set_reuseaddr(cmid, 1);
+	if (err != 0) {
+		CERROR("Unable to set reuse on cmid: %d\n", err);
+		return err;
+	}
+
+	while (candidate > 0) {
+		srcaddr->sin_port = htons(candidate);
+		err = rdma_resolve_addr(cmid, (struct sockaddr *)srcaddr,
+					(struct sockaddr *)dstaddr, timeout_ms);
+		if (err == 0) {
+			CDEBUG(D_NET, "bound to port %hu\n", candidate);
+			return 0;
+		}
+
+		/* only "port busy/unavailable" is worth another attempt */
+		if (err != -EADDRINUSE && err != -EADDRNOTAVAIL)
+			return err;
+
+		CDEBUG(D_NET, "bind to port %hu failed: %d\n", candidate, err);
+		candidate--;
+	}
+
+	CERROR("Failed to bind to a free privileged port\n");
+	return err;
+}
+
+void
+kiblnd_connect_peer (kib_peer_t *peer)
+{
+	struct rdma_cm_id *cmid;
+	kib_dev_t	 *dev;
+	kib_net_t	 *net = peer->ibp_ni->ni_data;
+	struct sockaddr_in srcaddr;
+	struct sockaddr_in dstaddr;
+	int		rc;
+	/* Start connecting to peer: create a cmid and resolve its address */
+	LASSERT (net != NULL);
+	LASSERT (peer->ibp_connecting > 0);
+
+	cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, peer, RDMA_PS_TCP,
+				     IB_QPT_RC);
+
+	if (IS_ERR(cmid)) {
+		CERROR("Can't create CMID for %s: %ld\n",
+		       libcfs_nid2str(peer->ibp_nid), PTR_ERR(cmid));
+		rc = PTR_ERR(cmid);
+		goto failed;
+	}
+	/* source is the local device's IP; the port is left 0 at this point */
+	dev = net->ibn_dev;
+	memset(&srcaddr, 0, sizeof(srcaddr));
+	srcaddr.sin_family = AF_INET;
+	srcaddr.sin_addr.s_addr = htonl(dev->ibd_ifip);
+	/* destination is the address part of the peer NID + the service port */
+	memset(&dstaddr, 0, sizeof(dstaddr));
+	dstaddr.sin_family = AF_INET;
+	dstaddr.sin_port = htons(*kiblnd_tunables.kib_service);
+	dstaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer->ibp_nid));
+
+	kiblnd_peer_addref(peer);	       /* cmid's ref */
+	/* kib_use_priv_port selects the privileged-source-port helper */
+	if (*kiblnd_tunables.kib_use_priv_port) {
+		rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr,
+					 *kiblnd_tunables.kib_timeout * 1000);
+	} else {
+		rc = rdma_resolve_addr(cmid,
+				       (struct sockaddr *)&srcaddr,
+				       (struct sockaddr *)&dstaddr,
+				       *kiblnd_tunables.kib_timeout * 1000);
+	}
+	if (rc != 0) {
+		/* Can't initiate address resolution:  */
+		CERROR("Can't resolve addr for %s: %d\n",
+		       libcfs_nid2str(peer->ibp_nid), rc);
+		goto failed2;
+	}
+
+	LASSERT (cmid->device != NULL);
+	CDEBUG(D_NET, "%s: connection bound to %s:%u.%u.%u.%u:%s\n",
+	       libcfs_nid2str(peer->ibp_nid), dev->ibd_ifname,
+	       HIPQUAD(dev->ibd_ifip), cmid->device->name);
+
+	return;
+
+ failed2:
+	kiblnd_peer_decref(peer);	       /* cmid's ref */
+	rdma_destroy_id(cmid);
+ failed:
+	kiblnd_peer_connect_failed(peer, 1, rc);
+}
+
+void
+kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid)
+{
+	kib_peer_t	*peer;
+	kib_peer_t	*peer2;
+	kib_conn_t	*conn;
+	rwlock_t	*g_lock = &kiblnd_data.kib_global_lock;
+	unsigned long      flags;
+	int		rc;
+	/* Send tx to nid, creating and connecting the peer first if needed */
+	/* If I get here, I've committed to send, so I complete the tx with
+	 * failure on any problems */
+
+	LASSERT (tx == NULL || tx->tx_conn == NULL); /* only set when assigned a conn */
+	LASSERT (tx == NULL || tx->tx_nwrq > 0);     /* work items have been set up */
+
+	/* First time, just use a read lock since I expect to find my peer
+	 * connected */
+	read_lock_irqsave(g_lock, flags);
+
+	peer = kiblnd_find_peer_locked(nid);
+	if (peer != NULL && !list_empty(&peer->ibp_conns)) {
+		/* Found a peer with an established connection */
+		conn = kiblnd_get_conn_locked(peer);
+		kiblnd_conn_addref(conn); /* 1 ref for me... */
+
+		read_unlock_irqrestore(g_lock, flags);
+
+		if (tx != NULL)
+			kiblnd_queue_tx(tx, conn);
+		kiblnd_conn_decref(conn); /* ...to here */
+		return;
+	}
+	/* plain unlock/lock: 'flags' saved above is restored by the unlocks below */
+	read_unlock(g_lock);
+	/* Re-try with a write lock */
+	write_lock(g_lock);
+
+	peer = kiblnd_find_peer_locked(nid);
+	if (peer != NULL) {
+		if (list_empty(&peer->ibp_conns)) {
+			/* found a peer, but it's still connecting... */
+			LASSERT (peer->ibp_connecting != 0 ||
+				 peer->ibp_accepting != 0);
+			if (tx != NULL)
+				list_add_tail(&tx->tx_list,
+						  &peer->ibp_tx_queue);
+			write_unlock_irqrestore(g_lock, flags);
+		} else {
+			conn = kiblnd_get_conn_locked(peer);
+			kiblnd_conn_addref(conn); /* 1 ref for me... */
+
+			write_unlock_irqrestore(g_lock, flags);
+
+			if (tx != NULL)
+				kiblnd_queue_tx(tx, conn);
+			kiblnd_conn_decref(conn); /* ...to here */
+		}
+		return;
+	}
+
+	write_unlock_irqrestore(g_lock, flags);
+
+	/* Allocate a peer ready to add to the peer table and retry */
+	rc = kiblnd_create_peer(ni, &peer, nid);
+	if (rc != 0) {
+		CERROR("Can't create peer %s\n", libcfs_nid2str(nid));
+		if (tx != NULL) {
+			tx->tx_status = -EHOSTUNREACH;
+			tx->tx_waiting = 0;
+			kiblnd_tx_done(ni, tx);
+		}
+		return;
+	}
+
+	write_lock_irqsave(g_lock, flags);
+	/* the nid may have gained a peer while no lock was held: look again */
+	peer2 = kiblnd_find_peer_locked(nid);
+	if (peer2 != NULL) {
+		if (list_empty(&peer2->ibp_conns)) {
+			/* found a peer, but it's still connecting... */
+			LASSERT (peer2->ibp_connecting != 0 ||
+				 peer2->ibp_accepting != 0);
+			if (tx != NULL)
+				list_add_tail(&tx->tx_list,
+						  &peer2->ibp_tx_queue);
+			write_unlock_irqrestore(g_lock, flags);
+		} else {
+			conn = kiblnd_get_conn_locked(peer2);
+			kiblnd_conn_addref(conn); /* 1 ref for me... */
+
+			write_unlock_irqrestore(g_lock, flags);
+
+			if (tx != NULL)
+				kiblnd_queue_tx(tx, conn);
+			kiblnd_conn_decref(conn); /* ...to here */
+		}
+		/* peer2 is used instead; drop the ref on the peer just created */
+		kiblnd_peer_decref(peer);
+		return;
+	}
+
+	/* Brand new peer */
+	LASSERT (peer->ibp_connecting == 0);
+	peer->ibp_connecting = 1;
+
+	/* always called with a ref on ni, which prevents ni being shutdown */
+	LASSERT (((kib_net_t *)ni->ni_data)->ibn_shutdown == 0);
+
+	if (tx != NULL)
+		list_add_tail(&tx->tx_list, &peer->ibp_tx_queue);
+	/* ref taken here; one ref is dropped after kiblnd_connect_peer() below */
+	kiblnd_peer_addref(peer);
+	list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
+
+	write_unlock_irqrestore(g_lock, flags);
+
+	kiblnd_connect_peer(peer);
+	kiblnd_peer_decref(peer);
+}
+
+int
+kiblnd_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+	lnet_hdr_t       *hdr = &lntmsg->msg_hdr;
+	int	       type = lntmsg->msg_type;
+	lnet_process_id_t target = lntmsg->msg_target;
+	int	       target_is_router = lntmsg->msg_target_is_router;
+	int	       routing = lntmsg->msg_routing;
+	unsigned int      payload_niov = lntmsg->msg_niov;
+	struct iovec     *payload_iov = lntmsg->msg_iov;
+	lnet_kiov_t      *payload_kiov = lntmsg->msg_kiov;
+	unsigned int      payload_offset = lntmsg->msg_offset;
+	unsigned int      payload_nob = lntmsg->msg_len;
+	kib_msg_t	*ibmsg;
+	kib_tx_t	 *tx;
+	int	       nob;
+	int	       rc;
+	/* Send lntmsg to msg_target: 0 once a tx is launched, else -errno. */
+	/* NB 'private' is different depending on what we're sending.... */
+
+	CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
+	       payload_nob, payload_niov, libcfs_id2str(target));
+
+	LASSERT (payload_nob == 0 || payload_niov > 0);
+	LASSERT (payload_niov <= LNET_MAX_IOV);
+
+	/* Thread context */
+	LASSERT (!in_interrupt());
+	/* payload is either all vaddrs or all pages */
+	LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+	/* choose the wire protocol; 'break' below means "send IMMEDIATE" */
+	switch (type) {
+	default:
+		LBUG();
+		return (-EIO);
+
+	case LNET_MSG_ACK:
+		LASSERT (payload_nob == 0);
+		break;
+
+	case LNET_MSG_GET:
+		if (routing || target_is_router)
+			break;		  /* send IMMEDIATE */
+
+		/* is the REPLY message too small for RDMA? */
+		nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
+		if (nob <= IBLND_MSG_SIZE)
+			break;		  /* send IMMEDIATE */
+		/* reply is too big to come back inline: send a GET_REQ instead */
+		tx = kiblnd_get_idle_tx(ni, target.nid);
+		if (tx == NULL) {
+			CERROR("Can't allocate txd for GET to %s\n",
+			       libcfs_nid2str(target.nid));
+			return -ENOMEM;
+		}
+		/* describe the local MD as the GET sink inside the request */
+		ibmsg = tx->tx_msg;
+
+		if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
+			rc = kiblnd_setup_rd_iov(ni, tx,
+						 &ibmsg->ibm_u.get.ibgm_rd,
+						 lntmsg->msg_md->md_niov,
+						 lntmsg->msg_md->md_iov.iov,
+						 0, lntmsg->msg_md->md_length);
+		else
+			rc = kiblnd_setup_rd_kiov(ni, tx,
+						  &ibmsg->ibm_u.get.ibgm_rd,
+						  lntmsg->msg_md->md_niov,
+						  lntmsg->msg_md->md_iov.kiov,
+						  0, lntmsg->msg_md->md_length);
+		if (rc != 0) {
+			CERROR("Can't setup GET sink for %s: %d\n",
+			       libcfs_nid2str(target.nid), rc);
+			kiblnd_tx_done(ni, tx);
+			return -EIO;
+		}
+		/* message size covers only the tx_nfrags fragments in use */
+		nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[tx->tx_nfrags]);
+		ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
+		ibmsg->ibm_u.get.ibgm_hdr = *hdr;
+
+		kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob);
+		/* the REPLY lnet msg is created now and rides on the same tx */
+		tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg);
+		if (tx->tx_lntmsg[1] == NULL) {
+			CERROR("Can't create reply for GET -> %s\n",
+			       libcfs_nid2str(target.nid));
+			kiblnd_tx_done(ni, tx);
+			return -EIO;
+		}
+
+		tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg[0,1] on completion */
+		tx->tx_waiting = 1;	     /* waiting for GET_DONE */
+		kiblnd_launch_tx(ni, tx, target.nid);
+		return 0;
+
+	case LNET_MSG_REPLY:
+	case LNET_MSG_PUT:
+		/* Is the payload small enough not to need RDMA? */
+		nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+		if (nob <= IBLND_MSG_SIZE)
+			break;		  /* send IMMEDIATE */
+
+		tx = kiblnd_get_idle_tx(ni, target.nid);
+		if (tx == NULL) {
+			CERROR("Can't allocate %s txd for %s\n",
+			       type == LNET_MSG_PUT ? "PUT" : "REPLY",
+			       libcfs_nid2str(target.nid));
+			return -ENOMEM;
+		}
+		/* describe the payload as the PUT source in tx->tx_rd */
+		if (payload_kiov == NULL)
+			rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd,
+						 payload_niov, payload_iov,
+						 payload_offset, payload_nob);
+		else
+			rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
+						  payload_niov, payload_kiov,
+						  payload_offset, payload_nob);
+		if (rc != 0) {
+			CERROR("Can't setup PUT src for %s: %d\n",
+			       libcfs_nid2str(target.nid), rc);
+			kiblnd_tx_done(ni, tx);
+			return -EIO;
+		}
+		/* the PUT_REQ carries the LNet header and this tx's cookie */
+		ibmsg = tx->tx_msg;
+		ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
+		ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
+		kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));
+
+		tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
+		tx->tx_waiting = 1;	     /* waiting for PUT_{ACK,NAK} */
+		kiblnd_launch_tx(ni, tx, target.nid);
+		return 0;
+	}
+
+	/* send IMMEDIATE */
+	/* every path that broke out of the switch must fit in one message */
+	LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
+		 <= IBLND_MSG_SIZE);
+
+	tx = kiblnd_get_idle_tx(ni, target.nid);
+	if (tx == NULL) {
+		CERROR ("Can't send %d to %s: tx descs exhausted\n",
+			type, libcfs_nid2str(target.nid));
+		return -ENOMEM;
+	}
+	/* LNet header first, then the payload copied in at ibim_payload */
+	ibmsg = tx->tx_msg;
+	ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
+
+	if (payload_kiov != NULL)
+		lnet_copy_kiov2flat(IBLND_MSG_SIZE, ibmsg,
+				    offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+				    payload_niov, payload_kiov,
+				    payload_offset, payload_nob);
+	else
+		lnet_copy_iov2flat(IBLND_MSG_SIZE, ibmsg,
+				   offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+				   payload_niov, payload_iov,
+				   payload_offset, payload_nob);
+
+	nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
+	kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob);
+
+	tx->tx_lntmsg[0] = lntmsg;	      /* finalise lntmsg on completion */
+	kiblnd_launch_tx(ni, tx, target.nid);
+	return 0;
+}
+
+void
+kiblnd_reply (lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg)
+{
+	/* Answer the GET_REQ held in 'rx' with lntmsg's payload and a
+	 * GET_DONE; on any failure lntmsg is finalized with -EIO. */
+	lnet_process_id_t target = lntmsg->msg_target;
+	kib_conn_t	 *conn = rx->rx_conn;
+	kib_msg_t	 *getmsg = rx->rx_msg;
+	unsigned int	  len = lntmsg->msg_len;
+	kib_tx_t	 *tx;
+	int		  rc = 0;
+
+	tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
+	if (tx == NULL) {
+		CERROR("Can't get tx for REPLY to %s\n",
+		       libcfs_nid2str(target.nid));
+		goto failed_0;
+	}
+
+	/* an empty reply needs no source descriptor at all */
+	if (len != 0) {
+		if (lntmsg->msg_kiov != NULL)
+			rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
+						  lntmsg->msg_niov, lntmsg->msg_kiov,
+						  lntmsg->msg_offset, len);
+		else
+			rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd,
+						 lntmsg->msg_niov, lntmsg->msg_iov,
+						 lntmsg->msg_offset, len);
+	}
+	if (rc != 0) {
+		CERROR("Can't setup GET src for %s: %d\n",
+		       libcfs_nid2str(target.nid), rc);
+		goto failed_1;
+	}
+
+	rc = kiblnd_init_rdma(conn, tx, IBLND_MSG_GET_DONE, len,
+			      &getmsg->ibm_u.get.ibgm_rd,
+			      getmsg->ibm_u.get.ibgm_cookie);
+	if (rc < 0) {
+		CERROR("Can't setup rdma for GET from %s: %d\n",
+		       libcfs_nid2str(target.nid), rc);
+		goto failed_1;
+	}
+
+	if (len != 0) {
+		/* RDMA: lnet_finalize(lntmsg) when it completes */
+		tx->tx_lntmsg[0] = lntmsg;
+	} else {
+		/* No RDMA: local completion may happen now! */
+		lnet_finalize(ni, lntmsg, 0);
+	}
+
+	kiblnd_queue_tx(tx, conn);
+	return;
+
+ failed_1:
+	kiblnd_tx_done(ni, tx);
+ failed_0:
+	lnet_finalize(ni, lntmsg, -EIO);
+}
+
+int
+kiblnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+	     unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+	     unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+	kib_rx_t    *rx = private;
+	kib_msg_t   *rxmsg = rx->rx_msg;
+	kib_conn_t  *conn = rx->rx_conn;
+	kib_tx_t    *tx;
+	kib_msg_t   *txmsg;
+	int	  nob;
+	int	  post_credit = IBLND_POSTRX_PEER_CREDIT;
+	int	  rc = 0;
+	/* Consume rx 'private' for lntmsg and repost its buffer; 0 or -errno */
+	LASSERT (mlen <= rlen);
+	LASSERT (!in_interrupt());
+	/* Either all pages or all vaddrs */
+	LASSERT (!(kiov != NULL && iov != NULL));
+	/* dispatch on the type of the message sitting in the rx buffer */
+	switch (rxmsg->ibm_type) {
+	default:
+		LBUG();
+	/* NOTE(review): no break above; relies on LBUG() not returning */
+	case IBLND_MSG_IMMEDIATE:
+		nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
+		if (nob > rx->rx_nob) {
+			CERROR ("Immediate message from %s too big: %d(%d)\n",
+				libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid),
+				nob, rx->rx_nob);
+			rc = -EPROTO;
+			break;
+		}
+		/* payload is inline in the rx buffer: copy mlen bytes out */
+		if (kiov != NULL)
+			lnet_copy_flat2kiov(niov, kiov, offset,
+					    IBLND_MSG_SIZE, rxmsg,
+					    offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+					    mlen);
+		else
+			lnet_copy_flat2iov(niov, iov, offset,
+					   IBLND_MSG_SIZE, rxmsg,
+					   offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+					   mlen);
+		lnet_finalize (ni, lntmsg, 0);
+		break;
+
+	case IBLND_MSG_PUT_REQ:
+		if (mlen == 0) {
+			lnet_finalize(ni, lntmsg, 0);
+			kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, 0,
+					       rxmsg->ibm_u.putreq.ibprm_cookie);
+			break;
+		}
+		/* data wanted: answer with a PUT_ACK describing my sink */
+		tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
+		if (tx == NULL) {
+			CERROR("Can't allocate tx for %s\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+			/* Not replying will break the connection */
+			rc = -ENOMEM;
+			break;
+		}
+
+		txmsg = tx->tx_msg;
+		if (kiov == NULL)
+			rc = kiblnd_setup_rd_iov(ni, tx,
+						 &txmsg->ibm_u.putack.ibpam_rd,
+						 niov, iov, offset, mlen);
+		else
+			rc = kiblnd_setup_rd_kiov(ni, tx,
+						  &txmsg->ibm_u.putack.ibpam_rd,
+						  niov, kiov, offset, mlen);
+		if (rc != 0) {
+			CERROR("Can't setup PUT sink for %s: %d\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+			kiblnd_tx_done(ni, tx);
+			/* tell peer it's over */
+			kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, rc,
+					       rxmsg->ibm_u.putreq.ibprm_cookie);
+			break;
+		}
+		/* src cookie echoes the PUT_REQ; dst cookie names this tx */
+		nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[tx->tx_nfrags]);
+		txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
+		txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
+
+		kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_ACK, nob);
+
+		tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
+		tx->tx_waiting = 1;	     /* waiting for PUT_DONE */
+		kiblnd_queue_tx(tx, conn);
+
+		/* reposted buffer reserved for PUT_DONE */
+		post_credit = IBLND_POSTRX_NO_CREDIT;
+		break;
+
+	case IBLND_MSG_GET_REQ:
+		if (lntmsg != NULL) {
+			/* Optimized GET; RDMA lntmsg's payload */
+			kiblnd_reply(ni, rx, lntmsg);
+		} else {
+			/* GET didn't match anything */
+			kiblnd_send_completion(rx->rx_conn, IBLND_MSG_GET_DONE,
+					       -ENODATA,
+					       rxmsg->ibm_u.get.ibgm_cookie);
+		}
+		break;
+	}
+	/* every case, including the error ones, reposts the rx buffer */
+	kiblnd_post_rx(rx, post_credit);
+	return rc;
+}
+
+int
+kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name)
+{
+	/* Spawn fn(arg) as a kthread; 'name' is data, never a format string */
+	task_t *task = kthread_run(fn, arg, "%s", name);
+
+	if (IS_ERR(task))
+		return PTR_ERR(task);	/* -errno from kthread_run() */
+	atomic_inc(&kiblnd_data.kib_nthreads);	/* undone by thread_fini */
+	return 0;
+}
+
+void
+kiblnd_thread_fini(void)
+{
+	atomic_dec(&kiblnd_data.kib_nthreads);	/* one counted thread fewer */
+}
+
+void
+kiblnd_peer_alive(kib_peer_t *peer)
+{
+	/* Unlocked on purpose: concurrent writers all store the current time */
+	peer->ibp_last_alive = cfs_time_current();
+	mb();	/* full barrier right after the timestamp store */
+}
+
+void
+kiblnd_peer_notify (kib_peer_t *peer)
+{
+	/* If 'peer' is idle (no conns, nothing connecting or accepting) and
+	 * has an error recorded, consume it and pass it on via lnet_notify() */
+	rwlock_t	*glock = &kiblnd_data.kib_global_lock;
+	cfs_time_t	 when = 0;
+	unsigned long	 flags;
+	int		 err = 0;
+	int		 idle;
+
+	read_lock_irqsave(glock, flags);
+	idle = list_empty(&peer->ibp_conns) &&
+	       peer->ibp_accepting == 0 && peer->ibp_connecting == 0;
+	if (idle && peer->ibp_error != 0) {
+		when = peer->ibp_last_alive;
+		err = peer->ibp_error;
+		peer->ibp_error = 0;
+	}
+	read_unlock_irqrestore(glock, flags);
+
+	if (err == 0)
+		return;
+	lnet_notify(peer->ibp_ni, peer->ibp_nid, 0, when);
+}
+
+void
+kiblnd_close_conn_locked (kib_conn_t *conn, int error)
+{
+	/* This just does the immediate housekeeping.  'error' is zero for a
+	 * normal shutdown which can happen only after the connection has been
+	 * established.  If the connection is established, schedule the
+	 * connection to be finished off by the connd.  Otherwise the connd is
+	 * already dealing with it (either to set it up or tear it down).
+	 * Caller holds kib_global_lock exclusively in irq context */
+	kib_peer_t       *peer = conn->ibc_peer;
+	kib_dev_t	*dev;
+	unsigned long     flags;
+
+	LASSERT (error != 0 || conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+	/* only the first error reported against this conn is remembered */
+	if (error != 0 && conn->ibc_comms_error == 0)
+		conn->ibc_comms_error = error;
+
+	if (conn->ibc_state != IBLND_CONN_ESTABLISHED)
+		return; /* already being handled  */
+	/* quiet log only for an error-free close with every tx list empty */
+	if (error == 0 &&
+	    list_empty(&conn->ibc_tx_noops) &&
+	    list_empty(&conn->ibc_tx_queue) &&
+	    list_empty(&conn->ibc_tx_queue_rsrvd) &&
+	    list_empty(&conn->ibc_tx_queue_nocred) &&
+	    list_empty(&conn->ibc_active_txs)) {
+		CDEBUG(D_NET, "closing conn to %s\n",
+		       libcfs_nid2str(peer->ibp_nid));
+	} else {
+		CNETERR("Closing conn to %s: error %d%s%s%s%s%s\n",
+		       libcfs_nid2str(peer->ibp_nid), error,
+		       list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
+		       list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)",
+		       list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
+		       list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
+		       list_empty(&conn->ibc_active_txs) ? "" : "(waiting)");
+	}
+
+	dev = ((kib_net_t *)peer->ibp_ni->ni_data)->ibn_dev;
+	list_del(&conn->ibc_list);
+	/* connd (see below) takes over ibc_list's ref */
+	/* that was the peer's last conn?  then the peer leaves the table too */
+	if (list_empty (&peer->ibp_conns) &&    /* no more conns */
+	    kiblnd_peer_active(peer)) {	 /* still in peer table */
+		kiblnd_unlink_peer_locked(peer);
+
+		/* set/clear error on last conn */
+		peer->ibp_error = conn->ibc_comms_error;
+	}
+
+	kiblnd_set_conn_state(conn, IBLND_CONN_CLOSING);
+	/* on error, queue a failover-capable device on kib_failed_devs */
+	if (error != 0 &&
+	    kiblnd_dev_can_failover(dev)) {
+		list_add_tail(&dev->ibd_fail_list,
+			      &kiblnd_data.kib_failed_devs);
+		wake_up(&kiblnd_data.kib_failover_waitq);
+	}
+	/* kib_connd_conns has its own spinlock, nested inside the global lock */
+	spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+
+	list_add_tail(&conn->ibc_list, &kiblnd_data.kib_connd_conns);
+	wake_up(&kiblnd_data.kib_connd_waitq);
+
+	spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+}
+
+void
+kiblnd_close_conn(kib_conn_t *conn, int error)
+{
+	/* Wrapper: run kiblnd_close_conn_locked() under the global write lock */
+	rwlock_t	*glock = &kiblnd_data.kib_global_lock;
+	unsigned long	 irqflags;
+
+	write_lock_irqsave(glock, irqflags);
+	kiblnd_close_conn_locked(conn, error);
+	write_unlock_irqrestore(glock, irqflags);
+}
+
+void
+kiblnd_handle_early_rxs(kib_conn_t *conn)
+{
+	/* Drain conn->ibc_early_rxs, handling each rx with the lock dropped */
+	rwlock_t	 *glock = &kiblnd_data.kib_global_lock;
+	struct list_head *early = &conn->ibc_early_rxs;
+	unsigned long	  irqflags;
+
+	LASSERT(!in_interrupt());
+	LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+	write_lock_irqsave(glock, irqflags);
+	while (!list_empty(early)) {
+		kib_rx_t *rx = list_entry(early->next, kib_rx_t, rx_list);
+
+		list_del(&rx->rx_list);
+		write_unlock_irqrestore(glock, irqflags);
+		kiblnd_handle_rx(rx);
+		write_lock_irqsave(glock, irqflags);
+	}
+	write_unlock_irqrestore(glock, irqflags);
+}
+
+void
+kiblnd_abort_txs(kib_conn_t *conn, struct list_head *txs)
+{
+	/* Mark every tx on 'txs' as failed with -ECONNABORTED.  Those with no
+	 * send outstanding are pulled off the list and completed here; the
+	 * rest stay on 'txs' with the error status already recorded. */
+	int		 is_active = (txs == &conn->ibc_active_txs);
+	kib_tx_t	*tx;
+	kib_tx_t	*next;
+	LIST_HEAD(done);
+
+	spin_lock(&conn->ibc_lock);
+
+	list_for_each_entry_safe(tx, next, txs, tx_list) {
+		if (!is_active) {
+			LASSERT (tx->tx_queued);
+		} else {
+			LASSERT (!tx->tx_queued);
+			LASSERT (tx->tx_waiting ||
+				 tx->tx_sending != 0);
+		}
+
+		tx->tx_waiting = 0;
+		tx->tx_status = -ECONNABORTED;
+		if (tx->tx_sending != 0)
+			continue;	/* a send is still outstanding */
+
+		tx->tx_queued = 0;
+		list_move(&tx->tx_list, &done);
+	}
+
+	spin_unlock(&conn->ibc_lock);
+
+	kiblnd_txlist_done(conn->ibc_peer->ibp_ni, &done, -ECONNABORTED);
+}
+
+void
+kiblnd_finalise_conn (kib_conn_t *conn)
+{
+	struct list_head *txqs[] = {	/* aborted in exactly this order */
+		&conn->ibc_tx_noops, &conn->ibc_tx_queue,
+		&conn->ibc_tx_queue_rsrvd, &conn->ibc_tx_queue_nocred,
+		&conn->ibc_active_txs,
+	};
+	int i;
+
+	LASSERT (!in_interrupt());
+	LASSERT (conn->ibc_state > IBLND_CONN_INIT);
+	kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED);
+
+	/* Moves the QP to IB_QPS_ERR; only needed for conns that never got
+	 * connected, since rdma_disconnect() does this for free. */
+	kiblnd_abort_receives(conn);
+
+	/* QP has changed state, so RDMA should be over: complete every tx
+	 * desc that isn't waiting for a send to complete. */
+	for (i = 0; i < (int)(sizeof(txqs) / sizeof(txqs[0])); i++)
+		kiblnd_abort_txs(conn, txqs[i]);
+	kiblnd_handle_early_rxs(conn);
+}
+
+void
+kiblnd_peer_connect_failed (kib_peer_t *peer, int active, int error)
+{
+	LIST_HEAD    (zombies);
+	unsigned long     flags;
+	/* One connection attempt involving 'peer' failed with 'error' (!= 0) */
+	LASSERT (error != 0);
+	LASSERT (!in_interrupt());
+
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+	/* 'active' says which in-progress counter this attempt was held in */
+	if (active) {
+		LASSERT (peer->ibp_connecting > 0);
+		peer->ibp_connecting--;
+	} else {
+		LASSERT (peer->ibp_accepting > 0);
+		peer->ibp_accepting--;
+	}
+
+	if (peer->ibp_connecting != 0 ||
+	    peer->ibp_accepting != 0) {
+		/* another connection attempt under way... */
+		write_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+					    flags);
+		return;
+	}
+
+	if (list_empty(&peer->ibp_conns)) {
+		/* Take peer's blocked transmits to complete with error */
+		list_add(&zombies, &peer->ibp_tx_queue);
+		list_del_init(&peer->ibp_tx_queue);
+		/* (new head linked in, old head unlinked: queue is now empty) */
+		if (kiblnd_peer_active(peer))
+			kiblnd_unlink_peer_locked(peer);
+
+		peer->ibp_error = error;
+	} else {
+		/* Can't have blocked transmits if there are connections */
+		LASSERT (list_empty(&peer->ibp_tx_queue));
+	}
+
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+	/* lock dropped: notification and tx completion happen outside it */
+	kiblnd_peer_notify(peer);
+
+	if (list_empty (&zombies))
+		return;
+
+	CNETERR("Deleting messages for %s: connection failed\n",
+		libcfs_nid2str(peer->ibp_nid));
+
+	kiblnd_txlist_done(peer->ibp_ni, &zombies, -EHOSTUNREACH);
+}
+
+void
+kiblnd_connreq_done(kib_conn_t *conn, int status)
+{
+	kib_peer_t	*peer = conn->ibc_peer;
+	kib_tx_t	  *tx;
+	struct list_head	 txs;
+	unsigned long      flags;
+	int		active;
+	/* Finish the connection attempt on 'conn'; status 0 == established */
+	active = (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
+
+	CDEBUG(D_NET,"%s: active(%d), version(%x), status(%d)\n",
+	       libcfs_nid2str(peer->ibp_nid), active,
+	       conn->ibc_version, status);
+
+	LASSERT (!in_interrupt());
+	LASSERT ((conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT &&
+		  peer->ibp_connecting > 0) ||
+		 (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT &&
+		  peer->ibp_accepting > 0));
+	/* the connection variables are released on success and failure alike */
+	LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+	conn->ibc_connvars = NULL;
+
+	if (status != 0) {
+		/* failed to establish connection */
+		kiblnd_peer_connect_failed(peer, active, status);
+		kiblnd_finalise_conn(conn);
+		return;
+	}
+
+	/* connection established */
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	conn->ibc_last_send = jiffies;
+	kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED);
+	kiblnd_peer_alive(peer);
+
+	/* Add conn to peer's list and nuke any dangling conns from a different
+	 * peer instance... */
+	kiblnd_conn_addref(conn);	       /* +1 ref for ibc_list */
+	list_add(&conn->ibc_list, &peer->ibp_conns);
+	if (active)
+		peer->ibp_connecting--;
+	else
+		peer->ibp_accepting--;
+	/* version 0 means "not set yet": adopt this conn's identity */
+	if (peer->ibp_version == 0) {
+		peer->ibp_version     = conn->ibc_version;
+		peer->ibp_incarnation = conn->ibc_incarnation;
+	}
+	/* identity changed: close the stale conns, then adopt the new one */
+	if (peer->ibp_version     != conn->ibc_version ||
+	    peer->ibp_incarnation != conn->ibc_incarnation) {
+		kiblnd_close_stale_conns_locked(peer, conn->ibc_version,
+						conn->ibc_incarnation);
+		peer->ibp_version     = conn->ibc_version;
+		peer->ibp_incarnation = conn->ibc_incarnation;
+	}
+
+	/* grab pending txs while I have the lock */
+	list_add(&txs, &peer->ibp_tx_queue);
+	list_del_init(&peer->ibp_tx_queue);
+	/* 'txs' now heads the old queue and peer->ibp_tx_queue is empty */
+	if (!kiblnd_peer_active(peer) ||	/* peer has been deleted */
+	    conn->ibc_comms_error != 0) {       /* error has happened already */
+		lnet_ni_t *ni = peer->ibp_ni;
+
+		/* start to shut down connection */
+		kiblnd_close_conn_locked(conn, -ECONNABORTED);
+		write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+		kiblnd_txlist_done(ni, &txs, -ECONNABORTED);
+
+		return;
+	}
+
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	/* Schedule blocked txs */
+	spin_lock(&conn->ibc_lock);
+	while (!list_empty(&txs)) {
+		tx = list_entry(txs.next, kib_tx_t, tx_list);
+		list_del(&tx->tx_list);
+
+		kiblnd_queue_tx_locked(tx, conn);
+	}
+	spin_unlock(&conn->ibc_lock);
+
+	kiblnd_check_sends(conn);
+
+	/* schedule blocked rxs */
+	kiblnd_handle_early_rxs(conn);
+}
+
+void
+kiblnd_reject(struct rdma_cm_id *cmid, kib_rej_t *rej)
+{
+	/* Hand the reject blob to rdma_reject(); a failure is only logged */
+	int err = rdma_reject(cmid, rej, sizeof(*rej));
+
+	if (err == 0)
+		return;
+	CWARN("Error %d sending reject\n", err);
+}
+
+int
+kiblnd_passive_connect (struct rdma_cm_id *cmid, void *priv, int priv_nob)
+{
+	rwlock_t		*g_lock = &kiblnd_data.kib_global_lock;
+	kib_msg_t	     *reqmsg = priv;
+	kib_msg_t	     *ackmsg;
+	kib_dev_t	     *ibdev;
+	kib_peer_t	    *peer;
+	kib_peer_t	    *peer2;
+	kib_conn_t	    *conn;
+	lnet_ni_t	     *ni  = NULL;
+	kib_net_t	     *net = NULL;
+	lnet_nid_t	     nid;
+	struct rdma_conn_param cp;
+	kib_rej_t	      rej;
+	int		    version = IBLND_MSG_VERSION;
+	unsigned long	  flags;
+	int		    rc;
+	struct sockaddr_in    *peer_addr;
+	LASSERT (!in_interrupt());
+	/* Validate the connreq in 'priv', then rdma_accept() it or reject it */
+	/* cmid inherits 'context' from the corresponding listener id */
+	ibdev = (kib_dev_t *)cmid->context;
+	LASSERT (ibdev != NULL);
+	/* pre-fill the reject reply sent by every 'failed' exit below */
+	memset(&rej, 0, sizeof(rej));
+	rej.ibr_magic		= IBLND_MSG_MAGIC;
+	rej.ibr_why		  = IBLND_REJECT_FATAL;
+	rej.ibr_cp.ibcp_max_msg_size = IBLND_MSG_SIZE;
+	/* NOTE(review): dst_addr is read as sockaddr_in, family not checked */
+	peer_addr = (struct sockaddr_in *)&(cmid->route.addr.dst_addr);
+	if (*kiblnd_tunables.kib_require_priv_port &&
+	    ntohs(peer_addr->sin_port) >= PROT_SOCK) {
+		__u32 ip = ntohl(peer_addr->sin_addr.s_addr);
+		CERROR("Peer's port (%u.%u.%u.%u:%hu) is not privileged\n",
+		       HIPQUAD(ip), ntohs(peer_addr->sin_port));
+		goto failed;
+	}
+
+	if (priv_nob < offsetof(kib_msg_t, ibm_type)) {
+		CERROR("Short connection request\n");
+		goto failed;
+	}
+
+	/* Future protocol version compatibility support!  If the
+	 * o2iblnd-specific protocol changes, or when LNET unifies
+	 * protocols over all LNDs, the initial connection will
+	 * negotiate a protocol version.  I trap this here to avoid
+	 * console errors; the reject tells the peer which protocol I
+	 * speak. */
+	if (reqmsg->ibm_magic == LNET_PROTO_MAGIC ||
+	    reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
+		goto failed;
+	if (reqmsg->ibm_magic == IBLND_MSG_MAGIC &&
+	    reqmsg->ibm_version != IBLND_MSG_VERSION &&
+	    reqmsg->ibm_version != IBLND_MSG_VERSION_1)
+		goto failed;
+	if (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) &&
+	    reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION) &&
+	    reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION_1))
+		goto failed;
+
+	rc = kiblnd_unpack_msg(reqmsg, priv_nob);
+	if (rc != 0) {
+		CERROR("Can't parse connection request: %d\n", rc);
+		goto failed;
+	}
+
+	nid = reqmsg->ibm_srcnid;
+	ni  = lnet_net2ni(LNET_NIDNET(reqmsg->ibm_dstnid));
+	/* a non-NULL ni is released with lnet_ni_decref() on both exit paths */
+	if (ni != NULL) {
+		net = (kib_net_t *)ni->ni_data;
+		rej.ibr_incarnation = net->ibn_incarnation;
+	}
+
+	if (ni == NULL ||			 /* no matching net */
+	    ni->ni_nid != reqmsg->ibm_dstnid ||   /* right NET, wrong NID! */
+	    net->ibn_dev != ibdev) {	      /* wrong device */
+		CERROR("Can't accept %s on %s (%s:%d:%u.%u.%u.%u): "
+		       "bad dst nid %s\n", libcfs_nid2str(nid),
+		       ni == NULL ? "NA" : libcfs_nid2str(ni->ni_nid),
+		       ibdev->ibd_ifname, ibdev->ibd_nnets,
+		       HIPQUAD(ibdev->ibd_ifip),
+		       libcfs_nid2str(reqmsg->ibm_dstnid));
+
+		goto failed;
+	}
+
+	/* check time stamp as soon as possible */
+	if (reqmsg->ibm_dststamp != 0 &&
+	    reqmsg->ibm_dststamp != net->ibn_incarnation) {
+		CWARN("Stale connection request\n");
+		rej.ibr_why = IBLND_REJECT_CONN_STALE;
+		goto failed;
+	}
+
+	/* I can accept peer's version */
+	version = reqmsg->ibm_version;
+
+	if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) {
+		CERROR("Unexpected connreq msg type: %x from %s\n",
+		       reqmsg->ibm_type, libcfs_nid2str(nid));
+		goto failed;
+	}
+
+	if (reqmsg->ibm_u.connparams.ibcp_queue_depth !=
+	    IBLND_MSG_QUEUE_SIZE(version)) {
+		CERROR("Can't accept %s: incompatible queue depth %d (%d wanted)\n",
+		       libcfs_nid2str(nid), reqmsg->ibm_u.connparams.ibcp_queue_depth,
+		       IBLND_MSG_QUEUE_SIZE(version));
+
+		if (version == IBLND_MSG_VERSION)
+			rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE;
+
+		goto failed;
+	}
+
+	if (reqmsg->ibm_u.connparams.ibcp_max_frags !=
+	    IBLND_RDMA_FRAGS(version)) {
+		CERROR("Can't accept %s(version %x): "
+		       "incompatible max_frags %d (%d wanted)\n",
+		       libcfs_nid2str(nid), version,
+		       reqmsg->ibm_u.connparams.ibcp_max_frags,
+		       IBLND_RDMA_FRAGS(version));
+
+		if (version == IBLND_MSG_VERSION)
+			rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
+
+		goto failed;
+
+	}
+
+	if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
+		CERROR("Can't accept %s: message size %d too big (%d max)\n",
+		       libcfs_nid2str(nid),
+		       reqmsg->ibm_u.connparams.ibcp_max_msg_size,
+		       IBLND_MSG_SIZE);
+		goto failed;
+	}
+
+	/* assume 'nid' is a new peer; create  */
+	rc = kiblnd_create_peer(ni, &peer, nid);
+	if (rc != 0) {
+		CERROR("Can't create peer for %s\n", libcfs_nid2str(nid));
+		rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
+		goto failed;
+	}
+
+	write_lock_irqsave(g_lock, flags);
+	/* if nid is already in the table, the 'peer' made above is dropped */
+	peer2 = kiblnd_find_peer_locked(nid);
+	if (peer2 != NULL) {
+		if (peer2->ibp_version == 0) {
+			peer2->ibp_version     = version;
+			peer2->ibp_incarnation = reqmsg->ibm_srcstamp;
+		}
+
+		/* not the guy I've talked with */
+		if (peer2->ibp_incarnation != reqmsg->ibm_srcstamp ||
+		    peer2->ibp_version     != version) {
+			kiblnd_close_peer_conns_locked(peer2, -ESTALE);
+			write_unlock_irqrestore(g_lock, flags);
+			/* NOTE(review): peer2 is read below with g_lock dropped */
+			CWARN("Conn stale %s [old ver: %x, new ver: %x]\n",
+			      libcfs_nid2str(nid), peer2->ibp_version, version);
+
+			kiblnd_peer_decref(peer);
+			rej.ibr_why = IBLND_REJECT_CONN_STALE;
+			goto failed;
+		}
+
+		/* tie-break connection race in favour of the higher NID */
+		if (peer2->ibp_connecting != 0 &&
+		    nid < ni->ni_nid) {
+			write_unlock_irqrestore(g_lock, flags);
+
+			CWARN("Conn race %s\n", libcfs_nid2str(peer2->ibp_nid));
+
+			kiblnd_peer_decref(peer);
+			rej.ibr_why = IBLND_REJECT_CONN_RACE;
+			goto failed;
+		}
+		/* use the existing peer: count this accept and take my own ref */
+		peer2->ibp_accepting++;
+		kiblnd_peer_addref(peer2);
+
+		write_unlock_irqrestore(g_lock, flags);
+		kiblnd_peer_decref(peer);
+		peer = peer2;
+	} else {
+		/* Brand new peer */
+		LASSERT (peer->ibp_accepting == 0);
+		LASSERT (peer->ibp_version == 0 &&
+			 peer->ibp_incarnation == 0);
+
+		peer->ibp_accepting   = 1;
+		peer->ibp_version     = version;
+		peer->ibp_incarnation = reqmsg->ibm_srcstamp;
+
+		/* I have a ref on ni that prevents it being shutdown */
+		LASSERT (net->ibn_shutdown == 0);
+
+		kiblnd_peer_addref(peer);
+		list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
+
+		write_unlock_irqrestore(g_lock, flags);
+	}
+
+	conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_PASSIVE_WAIT, version);
+	if (conn == NULL) {
+		kiblnd_peer_connect_failed(peer, 0, -ENOMEM);
+		kiblnd_peer_decref(peer);
+		rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
+		goto failed;
+	}
+
+	/* conn now "owns" cmid, so I return success from here on to ensure the
+	 * CM callback doesn't destroy cmid. */
+
+	conn->ibc_incarnation      = reqmsg->ibm_srcstamp;
+	conn->ibc_credits	  = IBLND_MSG_QUEUE_SIZE(version);
+	conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(version);
+	LASSERT (conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(version)
+		 <= IBLND_RX_MSGS(version));
+	/* the CONNACK is built in the conn's connvars and sent by rdma_accept() */
+	ackmsg = &conn->ibc_connvars->cv_msg;
+	memset(ackmsg, 0, sizeof(*ackmsg));
+
+	kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK,
+			sizeof(ackmsg->ibm_u.connparams));
+	ackmsg->ibm_u.connparams.ibcp_queue_depth  = IBLND_MSG_QUEUE_SIZE(version);
+	ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
+	ackmsg->ibm_u.connparams.ibcp_max_frags    = IBLND_RDMA_FRAGS(version);
+
+	kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp);
+
+	memset(&cp, 0, sizeof(cp));
+	cp.private_data	= ackmsg;
+	cp.private_data_len    = ackmsg->ibm_nob;
+	cp.responder_resources = 0;	     /* No atomic ops or RDMA reads */
+	cp.initiator_depth     = 0;
+	cp.flow_control	= 1;
+	cp.retry_count	 = *kiblnd_tunables.kib_retry_count;
+	cp.rnr_retry_count     = *kiblnd_tunables.kib_rnr_retry_count;
+
+	CDEBUG(D_NET, "Accept %s\n", libcfs_nid2str(nid));
+
+	rc = rdma_accept(cmid, &cp);
+	if (rc != 0) {
+		CERROR("Can't accept %s: %d\n", libcfs_nid2str(nid), rc);
+		rej.ibr_version = version;
+		rej.ibr_why     = IBLND_REJECT_FATAL;
+		/* reject, fail the connreq on conn, then drop a conn ref */
+		kiblnd_reject(cmid, &rej);
+		kiblnd_connreq_done(conn, rc);
+		kiblnd_conn_decref(conn);
+	}
+
+	lnet_ni_decref(ni);
+	return 0;
+
+ failed:
+	if (ni != NULL)
+		lnet_ni_decref(ni);
+
+	rej.ibr_version = version;
+	rej.ibr_cp.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version);
+	rej.ibr_cp.ibcp_max_frags   = IBLND_RDMA_FRAGS(version);
+	kiblnd_reject(cmid, &rej);
+
+	return -ECONNREFUSED;
+}
+
+void
+kiblnd_reconnect (kib_conn_t *conn, int version,
+		  __u64 incarnation, int why, kib_connparams_t *cp)
+{
+	/* An active connect on 'conn' was rejected for reason 'why'.  If a
+	 * connection to the peer is still needed and 'conn' was the only
+	 * attempt under way, adopt 'version'/'incarnation' for the peer and
+	 * start a fresh attempt.  'cp' may be NULL; it is only logged. */
+	rwlock_t	*glock = &kiblnd_data.kib_global_lock;
+	kib_peer_t	*peer = conn->ibc_peer;
+	char		*reason;
+	unsigned long	 flags;
+	int		 needed;
+
+	LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
+	LASSERT (peer->ibp_connecting > 0);     /* 'conn' at least */
+
+	write_lock_irqsave(glock, flags);
+
+	/* Still needed if txs are blocked on the peer.  NB it is also needed
+	 * with an empty ibp_tx_queue when ibp_version != version, because
+	 * the reconnect may be initiated by kiblnd_query() */
+	needed = !list_empty(&peer->ibp_tx_queue) ||
+		 peer->ibp_version != version;
+
+	/* ...and no other attempt (active or passive) may be in progress */
+	if (!needed ||
+	    peer->ibp_connecting != 1 ||
+	    peer->ibp_accepting != 0) {
+		write_unlock_irqrestore(glock, flags);
+		return;
+	}
+
+	/* count the retry and switch the peer over while still locked */
+	peer->ibp_connecting++;
+	peer->ibp_version     = version;
+	peer->ibp_incarnation = incarnation;
+
+	write_unlock_irqrestore(glock, flags);
+
+	/* 'why' is only turned into text for the message below */
+	if (why == IBLND_REJECT_CONN_STALE)
+		reason = "stale";
+	else if (why == IBLND_REJECT_CONN_RACE)
+		reason = "conn race";
+	else if (why == IBLND_REJECT_CONN_UNCOMPAT)
+		reason = "version negotiation";
+	else
+		reason = "Unknown";
+
+	CNETERR("%s: retrying (%s), %x, %x, "
+		"queue_dep: %d, max_frag: %d, msg_size: %d\n",
+		libcfs_nid2str(peer->ibp_nid), reason,
+		IBLND_MSG_VERSION, version,
+		cp == NULL ? IBLND_MSG_QUEUE_SIZE(version) :
+			     cp->ibcp_queue_depth,
+		cp == NULL ? IBLND_RDMA_FRAGS(version) :
+			     cp->ibcp_max_frags,
+		cp == NULL ? IBLND_MSG_SIZE :
+			     cp->ibcp_max_msg_size);
+
+	kiblnd_connect_peer(peer);
+}
+
+void
+kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob)
+{
+	kib_peer_t    *peer = conn->ibc_peer;
+	/* Active connect on 'conn' was rejected: retry if sensible, then fail it */
+	LASSERT (!in_interrupt());
+	LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
+
+	switch (reason) {
+	case IB_CM_REJ_STALE_CONN:
+		kiblnd_reconnect(conn, IBLND_MSG_VERSION, 0,
+				 IBLND_REJECT_CONN_STALE, NULL);
+		break;
+
+	case IB_CM_REJ_INVALID_SERVICE_ID:
+		CNETERR("%s rejected: no listener at %d\n",
+			libcfs_nid2str(peer->ibp_nid),
+			*kiblnd_tunables.kib_service);
+		break;
+
+	case IB_CM_REJ_CONSUMER_DEFINED:
+		if (priv_nob >= offsetof(kib_rej_t, ibr_padding)) {
+			kib_rej_t	*rej	 = priv;
+			kib_connparams_t *cp	  = NULL;
+			int	       flip	= 0;
+			__u64	     incarnation = -1;
+
+			/* NB. default incarnation is -1 because:
+			 * a) V1 ignores the dst incarnation in a connreq;
+			 * b) V2 supplies its incarnation when rejecting me,
+			 *    which overwrites the -1 below.
+			 * So if a V1 peer rejects my V2 connreq and is then
+			 * upgraded to V2 without my noticing, my V1 retry
+			 * carries incarnation -1 and the upgraded peer can
+			 * tell I am addressing its old self and reject me.
+			 */
+
+			if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) ||
+			    rej->ibr_magic == __swab32(LNET_PROTO_MAGIC)) {
+				__swab32s(&rej->ibr_magic);
+				__swab16s(&rej->ibr_version);
+				flip = 1;
+			}
+			/* 'cp' stays NULL unless the reject is full-size and > V1 */
+			if (priv_nob >= sizeof(kib_rej_t) &&
+			    rej->ibr_version > IBLND_MSG_VERSION_1) {
+				/* priv_nob is always 148 in current version
+				 * of OFED, so we still need to check version.
+				 * (define of IB_CM_REJ_PRIVATE_DATA_SIZE) */
+				cp = &rej->ibr_cp;
+
+				if (flip) {
+					__swab64s(&rej->ibr_incarnation);
+					__swab16s(&cp->ibcp_queue_depth);
+					__swab16s(&cp->ibcp_max_frags);
+					__swab32s(&cp->ibcp_max_msg_size);
+				}
+
+				incarnation = rej->ibr_incarnation;
+			}
+
+			if (rej->ibr_magic != IBLND_MSG_MAGIC &&
+			    rej->ibr_magic != LNET_PROTO_MAGIC) {
+				CERROR("%s rejected: consumer defined fatal error\n",
+				       libcfs_nid2str(peer->ibp_nid));
+				break;
+			}
+
+			if (rej->ibr_version != IBLND_MSG_VERSION &&
+			    rej->ibr_version != IBLND_MSG_VERSION_1) {
+				CERROR("%s rejected: o2iblnd version %x error\n",
+				       libcfs_nid2str(peer->ibp_nid),
+				       rej->ibr_version);
+				break;
+			}
+
+			if (rej->ibr_why     == IBLND_REJECT_FATAL &&
+			    rej->ibr_version == IBLND_MSG_VERSION_1) {
+				CDEBUG(D_NET, "rejected by old version peer %s: %x\n",
+				       libcfs_nid2str(peer->ibp_nid), rej->ibr_version);
+
+				if (conn->ibc_version != IBLND_MSG_VERSION_1)
+					rej->ibr_why = IBLND_REJECT_CONN_UNCOMPAT;
+			}
+
+			switch (rej->ibr_why) {
+			case IBLND_REJECT_CONN_RACE:
+			case IBLND_REJECT_CONN_STALE:
+			case IBLND_REJECT_CONN_UNCOMPAT:
+				kiblnd_reconnect(conn, rej->ibr_version,
+						 incarnation, rej->ibr_why, cp);
+				break;
+			/* ibr_why is peer-supplied and cp may be NULL: log -1 then */
+			case IBLND_REJECT_MSG_QUEUE_SIZE:
+				CERROR("%s rejected: incompatible message queue depth %d, %d\n",
+				       libcfs_nid2str(peer->ibp_nid),
+				       cp != NULL ? cp->ibcp_queue_depth : -1,
+				       IBLND_MSG_QUEUE_SIZE(conn->ibc_version));
+				break;
+
+			case IBLND_REJECT_RDMA_FRAGS:
+				CERROR("%s rejected: incompatible # of RDMA fragments %d, %d\n",
+				       libcfs_nid2str(peer->ibp_nid),
+				       cp != NULL ? cp->ibcp_max_frags : -1,
+				       IBLND_RDMA_FRAGS(conn->ibc_version));
+				break;
+
+			case IBLND_REJECT_NO_RESOURCES:
+				CERROR("%s rejected: o2iblnd no resources\n",
+				       libcfs_nid2str(peer->ibp_nid));
+				break;
+
+			case IBLND_REJECT_FATAL:
+				CERROR("%s rejected: o2iblnd fatal error\n",
+				       libcfs_nid2str(peer->ibp_nid));
+				break;
+
+			default:
+				CERROR("%s rejected: o2iblnd reason %d\n",
+				       libcfs_nid2str(peer->ibp_nid),
+				       rej->ibr_why);
+				break;
+			}
+			break;
+		}
+		/* fall through */
+	default:
+		CNETERR("%s rejected: reason %d, size %d\n",
+			libcfs_nid2str(peer->ibp_nid), reason, priv_nob);
+		break;
+	}
+
+	kiblnd_connreq_done(conn, -ECONNREFUSED);
+}
+
+void	/* Active side: vet the peer's CONNACK, then complete the connect. */
+kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob)
+{
+	kib_peer_t    *peer  = conn->ibc_peer;
+	lnet_ni_t     *ni    = peer->ibp_ni;
+	kib_net_t     *net   = ni->ni_data;
+	kib_msg_t     *reply = priv;
+	int	    ver   = conn->ibc_version;
+	int	    rc    = kiblnd_unpack_msg(reply, priv_nob);
+	unsigned long  flags;
+
+	LASSERT (net != NULL);
+
+	/* conn is the connection being set up; priv is the reply message
+	 * handed over by the caller and priv_nob its length, already run
+	 * through kiblnd_unpack_msg() above.
+	 *
+	 * The checks below form one if/else-if chain: the first one to fail
+	 * logs why and leaves a non-zero rc, and the remaining ones are
+	 * skipped.  Either way the function ends in
+	 * kiblnd_connreq_done(conn, 0). */
+	if (rc != 0) {
+		CERROR("Can't unpack connack from %s: %d\n",
+		       libcfs_nid2str(peer->ibp_nid), rc);
+	} else if (reply->ibm_type != IBLND_MSG_CONNACK) {
+		/* only a CONNACK is acceptable here */
+		CERROR("Unexpected message %d from %s\n",
+		       reply->ibm_type, libcfs_nid2str(peer->ibp_nid));
+		rc = -EPROTO;
+	} else if (ver != reply->ibm_version) {
+		/* the reply must carry the version that was requested */
+		CERROR("%s replied version %x is different with "
+		       "requested version %x\n",
+		       libcfs_nid2str(peer->ibp_nid), reply->ibm_version, ver);
+		rc = -EPROTO;
+	} else if (reply->ibm_u.connparams.ibcp_queue_depth !=
+		   IBLND_MSG_QUEUE_SIZE(ver)) {
+		/* queue depth must match exactly */
+		CERROR("%s has incompatible queue depth %d(%d wanted)\n",
+		       libcfs_nid2str(peer->ibp_nid),
+		       reply->ibm_u.connparams.ibcp_queue_depth,
+		       IBLND_MSG_QUEUE_SIZE(ver));
+		rc = -EPROTO;
+	} else if (reply->ibm_u.connparams.ibcp_max_frags !=
+		   IBLND_RDMA_FRAGS(ver)) {
+		/* so must the number of RDMA fragments */
+		CERROR("%s has incompatible max_frags %d (%d wanted)\n",
+		       libcfs_nid2str(peer->ibp_nid),
+		       reply->ibm_u.connparams.ibcp_max_frags,
+		       IBLND_RDMA_FRAGS(ver));
+		rc = -EPROTO;
+	} else if (reply->ibm_u.connparams.ibcp_max_msg_size >
+		   IBLND_MSG_SIZE) {
+		/* the peer's message size is only bounded from above */
+		CERROR("%s max message size %d too big (%d max)\n",
+		       libcfs_nid2str(peer->ibp_nid),
+		       reply->ibm_u.connparams.ibcp_max_msg_size,
+		       IBLND_MSG_SIZE);
+		rc = -EPROTO;
+	} else {
+		/* The reply must name this NI's NID and the incarnation stamp
+		 * of its net; both are compared under the global lock. */
+		read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+		rc = (reply->ibm_dstnid == ni->ni_nid &&
+		      reply->ibm_dststamp == net->ibn_incarnation) ?
+		     0 : -ESTALE;
+		read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+		if (rc != 0) {
+			CERROR("Bad connection reply from %s, rc = %d, "
+			       "version: %x max_frags: %d\n",
+			       libcfs_nid2str(peer->ibp_nid), rc,
+			       reply->ibm_version,
+			       reply->ibm_u.connparams.ibcp_max_frags);
+		}
+	}
+
+	if (rc != 0) {
+		/* NB My QP has already established itself, so I handle
+		 * anything going wrong here by setting ibc_comms_error.
+		 * kiblnd_connreq_done(0) moves the conn state to ESTABLISHED,
+		 * but then immediately tears it down. */
+		LASSERT (rc != 0);
+		conn->ibc_comms_error = rc;
+		kiblnd_connreq_done(conn, 0);
+		return;
+	}
+
+	/* Accepted: record the peer's source stamp and start with a full
+	 * queue's worth of both normal and reserved credits. */
+	conn->ibc_incarnation      = reply->ibm_srcstamp;
+	conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(ver);
+	conn->ibc_credits          = conn->ibc_reserved_credits;
+	LASSERT (conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(ver)
+		 <= IBLND_RX_MSGS(ver));
+
+	kiblnd_connreq_done(conn, 0);
+}
+
+int	/* Route is resolved: build the conn and send CONNREQ via rdma_connect(). */
+kiblnd_active_connect (struct rdma_cm_id *cmid)
+{
+	kib_peer_t		*peer = (kib_peer_t *)cmid->context;
+	struct rdma_conn_param	 param;
+	unsigned long		 flags;
+	kib_conn_t		*conn;
+	kib_msg_t		*req;
+	__u64			 stamp;
+	int			 version;
+	int			 rc;
+
+	/* Snapshot the peer's incarnation and protocol version under the
+	 * global lock; a zero ibp_version falls back to IBLND_MSG_VERSION. */
+	read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+	stamp   = peer->ibp_incarnation;
+	version = peer->ibp_version;
+	if (version == 0)
+		version = IBLND_MSG_VERSION;
+	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_ACTIVE_CONNECT, version);
+	if (conn == NULL) {
+		kiblnd_peer_connect_failed(peer, 1, -ENOMEM);
+		kiblnd_peer_decref(peer); /* lose cmid's ref */
+		return -ENOMEM;
+	}
+
+	/* conn "owns" cmid now, so I return success from here on to ensure the
+	 * CM callback doesn't destroy cmid. conn also takes over cmid's ref
+	 * on peer */
+
+	req = &conn->ibc_connvars->cv_msg;
+	memset(req, 0, sizeof(*req));
+	kiblnd_init_msg(req, IBLND_MSG_CONNREQ, sizeof(req->ibm_u.connparams));
+	req->ibm_u.connparams.ibcp_queue_depth  = IBLND_MSG_QUEUE_SIZE(version);
+	req->ibm_u.connparams.ibcp_max_frags    = IBLND_RDMA_FRAGS(version);
+	req->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
+	kiblnd_pack_msg(peer->ibp_ni, req, version, 0, peer->ibp_nid, stamp);
+
+	/* The packed CONNREQ travels as the CM private data. */
+	memset(&param, 0, sizeof(param));
+	param.private_data        = req;
+	param.private_data_len    = req->ibm_nob;
+	param.responder_resources = 0;	/* No atomic ops or RDMA reads */
+	param.initiator_depth     = 0;
+	param.flow_control        = 1;
+	param.retry_count         = *kiblnd_tunables.kib_retry_count;
+	param.rnr_retry_count     = *kiblnd_tunables.kib_rnr_retry_count;
+
+	LASSERT(cmid->context == (void *)conn);
+	LASSERT(conn->ibc_cmid == cmid);
+
+	rc = rdma_connect(cmid, &param);
+	if (rc == 0)
+		return 0;
+
+	/* rdma_connect() itself failed: fail the request and drop a conn ref */
+	CERROR("Can't connect to %s: %d\n",
+	       libcfs_nid2str(peer->ibp_nid), rc);
+	kiblnd_connreq_done(conn, rc);
+	kiblnd_conn_decref(conn);
+	return 0;
+}
+
+int	/* RDMA CM event handler; a non-zero return gets cmid destroyed. */
+kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
+{
+	kib_peer_t  *peer;
+	kib_conn_t  *conn;
+	int	  rc;
+
+	switch (event->event) {
+	default:	/* any event without its own case below */
+		CERROR("Unexpected event: %d, status: %d\n",
+		       event->event, event->status);
+		LBUG();
+
+	case RDMA_CM_EVENT_CONNECT_REQUEST:
+		/* destroy cmid on failure */
+		rc = kiblnd_passive_connect(cmid,
+					    (void *)KIBLND_CONN_PARAM(event),
+					    KIBLND_CONN_PARAM_LEN(event));
+		CDEBUG(D_NET, "connreq: %d\n", rc);
+		return rc;
+
+	case RDMA_CM_EVENT_ADDR_ERROR:
+		peer = (kib_peer_t *)cmid->context;	/* context is used as the peer here */
+		CNETERR("%s: ADDR ERROR %d\n",
+		       libcfs_nid2str(peer->ibp_nid), event->status);
+		kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
+		kiblnd_peer_decref(peer);
+		return -EHOSTUNREACH;      /* rc != 0 destroys cmid */
+
+	case RDMA_CM_EVENT_ADDR_RESOLVED:
+		peer = (kib_peer_t *)cmid->context;
+
+		CDEBUG(D_NET,"%s Addr resolved: %d\n",
+		       libcfs_nid2str(peer->ibp_nid), event->status);
+
+		if (event->status != 0) {
+			CNETERR("Can't resolve address for %s: %d\n",
+				libcfs_nid2str(peer->ibp_nid), event->status);
+			rc = event->status;
+		} else {
+			rc = rdma_resolve_route(	/* timeout tunable is scaled by 1000 */
+				cmid, *kiblnd_tunables.kib_timeout * 1000);
+			if (rc == 0)
+				return 0;	/* wait for the route event */
+			/* Can't initiate route resolution */
+			CERROR("Can't resolve route for %s: %d\n",
+			       libcfs_nid2str(peer->ibp_nid), rc);
+		}
+		kiblnd_peer_connect_failed(peer, 1, rc);	/* both failure paths end up here */
+		kiblnd_peer_decref(peer);
+		return rc;		      /* rc != 0 destroys cmid */
+
+	case RDMA_CM_EVENT_ROUTE_ERROR:
+		peer = (kib_peer_t *)cmid->context;
+		CNETERR("%s: ROUTE ERROR %d\n",
+			libcfs_nid2str(peer->ibp_nid), event->status);
+		kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
+		kiblnd_peer_decref(peer);
+		return -EHOSTUNREACH;	   /* rc != 0 destroys cmid */
+
+	case RDMA_CM_EVENT_ROUTE_RESOLVED:
+		peer = (kib_peer_t *)cmid->context;
+		CDEBUG(D_NET,"%s Route resolved: %d\n",
+		       libcfs_nid2str(peer->ibp_nid), event->status);
+
+		if (event->status == 0)	/* its return value becomes this handler's */
+			return kiblnd_active_connect(cmid);
+
+		CNETERR("Can't resolve route for %s: %d\n",
+		       libcfs_nid2str(peer->ibp_nid), event->status);
+		kiblnd_peer_connect_failed(peer, 1, event->status);
+		kiblnd_peer_decref(peer);
+		return event->status;	   /* rc != 0 destroys cmid */
+
+	case RDMA_CM_EVENT_UNREACHABLE:
+		conn = (kib_conn_t *)cmid->context;	/* from here on context is used as a conn */
+		LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
+			conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
+		CNETERR("%s: UNREACHABLE %d\n",
+		       libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
+		kiblnd_connreq_done(conn, -ENETDOWN);
+		kiblnd_conn_decref(conn);
+		return 0;
+
+	case RDMA_CM_EVENT_CONNECT_ERROR:
+		conn = (kib_conn_t *)cmid->context;
+		LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
+			conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
+		CNETERR("%s: CONNECT ERROR %d\n",
+			libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
+		kiblnd_connreq_done(conn, -ENOTCONN);
+		kiblnd_conn_decref(conn);
+		return 0;
+
+	case RDMA_CM_EVENT_REJECTED:
+		conn = (kib_conn_t *)cmid->context;
+		switch (conn->ibc_state) {
+		default:
+			LBUG();
+
+		case IBLND_CONN_PASSIVE_WAIT:
+			CERROR ("%s: REJECTED %d\n",
+				libcfs_nid2str(conn->ibc_peer->ibp_nid),
+				event->status);
+			kiblnd_connreq_done(conn, -ECONNRESET);
+			break;
+
+		case IBLND_CONN_ACTIVE_CONNECT:	/* event->status is passed on as the reject reason */
+			kiblnd_rejected(conn, event->status,
+					(void *)KIBLND_CONN_PARAM(event),
+					KIBLND_CONN_PARAM_LEN(event));
+			break;
+		}
+		kiblnd_conn_decref(conn);
+		return 0;
+
+	case RDMA_CM_EVENT_ESTABLISHED:
+		conn = (kib_conn_t *)cmid->context;
+		switch (conn->ibc_state) {
+		default:
+			LBUG();
+
+		case IBLND_CONN_PASSIVE_WAIT:
+			CDEBUG(D_NET, "ESTABLISHED (passive): %s\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+			kiblnd_connreq_done(conn, 0);
+			break;
+
+		case IBLND_CONN_ACTIVE_CONNECT:	/* the event's private data is checked as the reply */
+			CDEBUG(D_NET, "ESTABLISHED(active): %s\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+			kiblnd_check_connreply(conn,
+					       (void *)KIBLND_CONN_PARAM(event),
+					       KIBLND_CONN_PARAM_LEN(event));
+			break;
+		}
+		/* net keeps its ref on conn! */
+		return 0;
+
+	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+		CDEBUG(D_NET, "Ignore TIMEWAIT_EXIT event\n");
+		return 0;
+	case RDMA_CM_EVENT_DISCONNECTED:
+		conn = (kib_conn_t *)cmid->context;
+		if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {	/* still connecting: fail the request */
+			CERROR("%s DISCONNECTED\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+			kiblnd_connreq_done(conn, -ECONNRESET);
+		} else {
+			kiblnd_close_conn(conn, 0);
+		}
+		kiblnd_conn_decref(conn);
+		cmid->context = NULL;	/* context is cleared once the ref is dropped */
+		return 0;
+
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+		LCONSOLE_ERROR_MSG(0x131,
+				   "Received notification of device removal\n"
+				   "Please shutdown LNET to allow this to proceed\n");
+		/* Can't remove network from underneath LNET for now, so I have
+		 * to ignore this */
+		return 0;
+
+	case RDMA_CM_EVENT_ADDR_CHANGE:
+		LCONSOLE_INFO("Physical link changed (eg hca/port)\n");
+		return 0;
+	}
+}
+
+static int	/* Return 1 if a tx on list 'txs' has reached its deadline, else 0. */
+kiblnd_check_txs_locked(kib_conn_t *conn, struct list_head *txs)
+{
+	const int	 on_active = (txs == &conn->ibc_active_txs);
+	kib_tx_t	*tx;
+
+	list_for_each_entry(tx, txs, tx_list) {
+		/* a tx on ibc_active_txs must be off-queue and either waiting
+		 * or sending; on every other list it must be queued */
+		if (on_active) {
+			LASSERT (!tx->tx_queued);
+			LASSERT (tx->tx_waiting || tx->tx_sending != 0);
+		} else {
+			LASSERT (tx->tx_queued);
+		}
+
+		if (!cfs_time_aftereq (jiffies, tx->tx_deadline))
+			continue;
+		CERROR("Timed out tx: %s, %lu seconds\n",
+		       kiblnd_queue2str(conn, txs),
+		       cfs_duration_sec(jiffies - tx->tx_deadline));
+		return 1;
+	}
+
+	return 0;
+}
+
+static int	/* Non-zero iff a tx on any of conn's five tx lists has timed out. */
+kiblnd_conn_timed_out_locked(kib_conn_t *conn)
+{
+	return  kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue) ||	/* || stops at the first hit */
+		kiblnd_check_txs_locked(conn, &conn->ibc_tx_noops) ||
+		kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_rsrvd) ||
+		kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_nocred) ||
+		kiblnd_check_txs_locked(conn, &conn->ibc_active_txs);	/* checked last */
+}
+
+void	/* Scan kib_peers[idx]: close timed-out conns, check sends on the others. */
+kiblnd_check_conns (int idx)
+{
+	LIST_HEAD (closes);	/* conns found with a timed-out tx */
+	LIST_HEAD (checksends);	/* conns that only need kiblnd_check_sends() */
+	struct list_head    *peers = &kiblnd_data.kib_peers[idx];
+	struct list_head    *ptmp;
+	kib_peer_t    *peer;
+	kib_conn_t    *conn;
+	struct list_head    *ctmp;
+	unsigned long  flags;
+
+	/* NB. We expect to have a look at all the peers and not find any
+	 * RDMAs to time out, so we just use a shared lock while we
+	 * take a look... */
+	read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	list_for_each (ptmp, peers) {
+		peer = list_entry (ptmp, kib_peer_t, ibp_list);
+
+		list_for_each (ctmp, &peer->ibp_conns) {
+			int timedout;
+			int sendnoop;
+
+			conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+			LASSERT (conn->ibc_state == IBLND_CONN_ESTABLISHED);
+
+			spin_lock(&conn->ibc_lock);	/* nested inside the global read lock */
+
+			sendnoop = kiblnd_need_noop(conn);
+			timedout = kiblnd_conn_timed_out_locked(conn);
+			if (!sendnoop && !timedout) {	/* nothing to do for this conn */
+				spin_unlock(&conn->ibc_lock);
+				continue;
+			}
+
+			if (timedout) {	/* a timeout takes precedence over a pending noop */
+				CERROR("Timed out RDMA with %s (%lu): "
+				       "c: %u, oc: %u, rc: %u\n",
+				       libcfs_nid2str(peer->ibp_nid),
+				       cfs_duration_sec(cfs_time_current() -
+							peer->ibp_last_alive),
+				       conn->ibc_credits,
+				       conn->ibc_outstanding_credits,
+				       conn->ibc_reserved_credits);
+				list_add(&conn->ibc_connd_list, &closes);
+			} else {
+				list_add(&conn->ibc_connd_list,
+					     &checksends);
+			}
+			/* +ref for 'closes' or 'checksends' */
+			kiblnd_conn_addref(conn);
+
+			spin_unlock(&conn->ibc_lock);
+		}
+	}
+
+	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+	/* Both private lists are worked through only now, with no lock held. */
+	/* Handle timeout by closing the whole
+	 * connection. We can only be sure RDMA activity
+	 * has ceased once the QP has been modified. */
+	while (!list_empty(&closes)) {
+		conn = list_entry(closes.next,
+				      kib_conn_t, ibc_connd_list);
+		list_del(&conn->ibc_connd_list);
+		kiblnd_close_conn(conn, -ETIMEDOUT);
+		kiblnd_conn_decref(conn);	/* drop the list's ref taken above */
+	}
+
+	/* In case we have enough credits to return via a
+	 * NOOP, but there were no non-blocking tx descs
+	 * free to do it last time... */
+	while (!list_empty(&checksends)) {
+		conn = list_entry(checksends.next,
+				      kib_conn_t, ibc_connd_list);
+		list_del(&conn->ibc_connd_list);
+		kiblnd_check_sends(conn);
+		kiblnd_conn_decref(conn);	/* drop the list's ref taken above */
+	}
+}
+
+void	/* Disconnect and finalise a CLOSING conn; connd thread only. */
+kiblnd_disconnect_conn (kib_conn_t *conn)
+{
+	LASSERT (!in_interrupt());
+	LASSERT (current == kiblnd_data.kib_connd);	/* kiblnd_connd() records itself there */
+	LASSERT (conn->ibc_state == IBLND_CONN_CLOSING);
+
+	rdma_disconnect(conn->ibc_cmid);	/* return value is not checked */
+	kiblnd_finalise_conn(conn);
+
+	kiblnd_peer_notify(conn->ibc_peer);
+}
+
+int	/* Connection daemon thread; loops until kib_shutdown is set. */
+kiblnd_connd (void *arg)
+{
+	wait_queue_t     wait;
+	unsigned long      flags;
+	kib_conn_t	*conn;
+	int		timeout;
+	int		i;
+	int		dropped_lock;	/* kib_connd_lock was released this pass */
+	int		peer_index = 0;	/* next kib_peers[] bucket to check */
+	unsigned long      deadline = jiffies;	/* when the next timeout scan is due */
+
+	cfs_block_allsigs ();
+
+	init_waitqueue_entry_current (&wait);
+	kiblnd_data.kib_connd = current;	/* asserted by kiblnd_disconnect_conn() */
+
+	spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+
+	while (!kiblnd_data.kib_shutdown) {
+		/* Each pass takes at most one entry off each of the two lists. */
+		dropped_lock = 0;
+
+		if (!list_empty (&kiblnd_data.kib_connd_zombies)) {
+			conn = list_entry(kiblnd_data. \
+					      kib_connd_zombies.next,
+					      kib_conn_t, ibc_list);
+			list_del(&conn->ibc_list);
+
+			spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock,
+					       flags);
+			dropped_lock = 1;
+
+			kiblnd_destroy_conn(conn);	/* done with kib_connd_lock dropped */
+
+			spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+		}
+
+		if (!list_empty(&kiblnd_data.kib_connd_conns)) {
+			conn = list_entry(kiblnd_data.kib_connd_conns.next,
+					      kib_conn_t, ibc_list);
+			list_del(&conn->ibc_list);
+
+			spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock,
+					       flags);
+			dropped_lock = 1;
+
+			kiblnd_disconnect_conn(conn);	/* done with kib_connd_lock dropped */
+			kiblnd_conn_decref(conn);
+
+			spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+		}
+
+		/* careful with the jiffy wrap... */
+		timeout = (int)(deadline - jiffies);
+		if (timeout <= 0) {	/* deadline reached: run a timeout scan */
+			const int n = 4;
+			const int p = 1;
+			int       chunk = kiblnd_data.kib_peer_hash_size;
+
+			spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+			dropped_lock = 1;
+
+			/* Time to check for RDMA timeouts on a few more
+			 * peers: I do checks every 'p' seconds on a
+			 * proportion of the peer table and I need to check
+			 * every connection 'n' times within a timeout
+			 * interval, to ensure I detect a timeout on any
+			 * connection within (n+1)/n times the timeout
+			 * interval. */
+
+			if (*kiblnd_tunables.kib_timeout > n * p)
+				chunk = (chunk * n * p) /
+					*kiblnd_tunables.kib_timeout;
+			if (chunk == 0)	/* always check at least one bucket */
+				chunk = 1;
+
+			for (i = 0; i < chunk; i++) {
+				kiblnd_check_conns(peer_index);
+				peer_index = (peer_index + 1) %	/* wraps around the peer table */
+					     kiblnd_data.kib_peer_hash_size;
+			}
+
+			deadline += p * HZ;
+			spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+		}
+
+		if (dropped_lock)	/* the lock was dropped: start another pass, don't sleep */
+			continue;
+
+		/* Nothing to do for 'timeout'  */
+		set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue(&kiblnd_data.kib_connd_waitq, &wait);
+		spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+
+		waitq_timedwait(&wait, TASK_INTERRUPTIBLE, timeout);
+
+		set_current_state(TASK_RUNNING);
+		remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait);
+		spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+	}
+
+	spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+
+	kiblnd_thread_fini();
+	return 0;
+}
+
+void	/* QP async event handler: it only logs the event. */
+kiblnd_qp_event(struct ib_event *event, void *arg)
+{
+	kib_conn_t *conn = arg;
+
+	if (event->event == IB_EVENT_COMM_EST) {
+		/* the one event given a debug-level trace instead of an error */
+		CDEBUG(D_NET, "%s established\n",
+		       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+		return;
+	}
+
+	/* every other event type is reported via CERROR; nothing else is
+	 * done about it here */
+	CERROR("%s: Async QP event type %d\n",
+	       libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
+}
+
+void	/* Hand one work completion to the handler for its wr_id type. */
+kiblnd_complete (struct ib_wc *wc)
+{
+	switch (kiblnd_wreqid2type(wc->wr_id)) {
+	case IBLND_WID_RX:
+		kiblnd_rx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status,
+				   wc->byte_len);
+		break;
+
+	case IBLND_WID_TX:
+		kiblnd_tx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status);
+		break;
+
+	default:
+		LBUG();		/* wr_id type is none of the three known ones */
+		/* fall through */
+	case IBLND_WID_RDMA:
+		/* We only get RDMA completion notification if it fails.  All
+		 * subsequent work items, including the final SEND will fail
+		 * too.  However we can't print out any more info about the
+		 * failing RDMA because 'tx' might be back on the idle list or
+		 * even reused already if we didn't manage to post all our work
+		 * items */
+		CNETERR("RDMA (tx: %p) failed: %d\n",
+			kiblnd_wreqid2ptr(wc->wr_id), wc->status);
+		break;
+	}
+}
+
+void	/* CQ completion handler: queue conn on its scheduler if work is pending. */
+kiblnd_cq_completion(struct ib_cq *cq, void *arg)
+{
+	/* NB I'm not allowed to schedule this conn once its refcount has
+	 * reached 0.  Since fundamentally I'm racing with scheduler threads
+	 * consuming my CQ I could be called after all completions have
+	 * occurred.  But in this case, ibc_nrx == 0 && ibc_nsends_posted == 0
+	 * and this CQ is about to be destroyed so I NOOP. */
+	kib_conn_t		*conn = arg;
+	struct kib_sched_info	*sched = conn->ibc_sched;
+	unsigned long		 flags;
+
+	LASSERT(cq == conn->ibc_cq);
+
+	spin_lock_irqsave(&sched->ibs_lock, flags);
+
+	conn->ibc_ready = 1;
+	/* nothing more to do if conn is already queued, or if neither
+	 * ibc_nrx nor ibc_nsends_posted is positive */
+	if (conn->ibc_scheduled ||
+	    !(conn->ibc_nrx > 0 || conn->ibc_nsends_posted > 0))
+		goto out;
+
+	kiblnd_conn_addref(conn); /* +1 ref for sched_conns */
+	conn->ibc_scheduled = 1;
+	list_add_tail(&conn->ibc_sched_list, &sched->ibs_conns);
+	if (waitqueue_active(&sched->ibs_waitq))
+		wake_up(&sched->ibs_waitq);
+out:
+	spin_unlock_irqrestore(&sched->ibs_lock, flags);
+}
+
+void	/* CQ async event handler: logs the event type and does nothing else. */
+kiblnd_cq_event(struct ib_event *event, void *arg)
+{
+	kib_conn_t *conn = arg;	/* 'arg' is used as the conn the event is about */
+
+	CERROR("%s: async CQ event type %d\n",
+	       libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
+}
+
+int	/* Scheduler thread: polls the CQs of conns queued on its kib_sched_info. */
+kiblnd_scheduler(void *arg)
+{
+	long			id = (long)arg;	/* KIB_THREAD_CPT(id) picks the scheduler */
+	struct kib_sched_info	*sched;
+	kib_conn_t		*conn;
+	wait_queue_t		wait;
+	unsigned long		flags;
+	struct ib_wc		wc;
+	int			did_something;
+	int			busy_loops = 0;	/* passes since the last sleep or resched */
+	int			rc;
+
+	cfs_block_allsigs();
+
+	init_waitqueue_entry_current(&wait);
+
+	sched = kiblnd_data.kib_scheds[KIB_THREAD_CPT(id)];
+
+	rc = cfs_cpt_bind(lnet_cpt_table(), sched->ibs_cpt);
+	if (rc != 0) {	/* not fatal: only a warning is printed */
+		CWARN("Failed to bind on CPT %d, please verify whether "
+		      "all CPUs are healthy and reload modules if necessary, "
+		      "otherwise your system might under risk of low "
+		      "performance\n", sched->ibs_cpt);
+	}
+
+	spin_lock_irqsave(&sched->ibs_lock, flags);
+
+	while (!kiblnd_data.kib_shutdown) {
+		if (busy_loops++ >= IBLND_RESCHED) {	/* busy for a while: drop the lock and yield */
+			spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+			cond_resched();
+			busy_loops = 0;
+
+			spin_lock_irqsave(&sched->ibs_lock, flags);
+		}
+
+		did_something = 0;
+
+		if (!list_empty(&sched->ibs_conns)) {
+			conn = list_entry(sched->ibs_conns.next,
+					      kib_conn_t, ibc_sched_list);
+			/* take over kib_sched_conns' ref on conn... */
+			LASSERT(conn->ibc_scheduled);
+			list_del(&conn->ibc_sched_list);
+			conn->ibc_ready = 0;	/* kiblnd_cq_completion() sets it to 1 again */
+
+			spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+			rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
+			if (rc == 0) {	/* nothing polled: re-arm the CQ, then poll once more */
+				rc = ib_req_notify_cq(conn->ibc_cq,
+						      IB_CQ_NEXT_COMP);
+				if (rc < 0) {
+					CWARN("%s: ib_req_notify_cq failed: %d, "
+					      "closing connection\n",
+					      libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+					kiblnd_close_conn(conn, -EIO);
+					kiblnd_conn_decref(conn);
+					spin_lock_irqsave(&sched->ibs_lock,
+							      flags);
+					continue;
+				}
+
+				rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
+			}
+
+			if (rc < 0) {	/* either poll above may have failed */
+				CWARN("%s: ib_poll_cq failed: %d, "
+				      "closing connection\n",
+				      libcfs_nid2str(conn->ibc_peer->ibp_nid),
+				      rc);
+				kiblnd_close_conn(conn, -EIO);
+				kiblnd_conn_decref(conn);
+				spin_lock_irqsave(&sched->ibs_lock, flags);
+				continue;
+			}
+
+			spin_lock_irqsave(&sched->ibs_lock, flags);
+
+			if (rc != 0 || conn->ibc_ready) {
+				/* There may be another completion waiting; get
+				 * another scheduler to check while I handle
+				 * this one... */
+				/* +1 ref for sched_conns */
+				kiblnd_conn_addref(conn);
+				list_add_tail(&conn->ibc_sched_list,
+						  &sched->ibs_conns);
+				if (waitqueue_active(&sched->ibs_waitq))
+					wake_up(&sched->ibs_waitq);
+			} else {
+				conn->ibc_scheduled = 0;	/* kiblnd_cq_completion() may queue conn again */
+			}
+
+			if (rc != 0) {	/* a completion was polled: handle it with ibs_lock dropped */
+				spin_unlock_irqrestore(&sched->ibs_lock, flags);
+				kiblnd_complete(&wc);
+
+				spin_lock_irqsave(&sched->ibs_lock, flags);
+			}
+
+			kiblnd_conn_decref(conn); /* ...drop my ref from above */
+			did_something = 1;
+		}
+
+		if (did_something)	/* look for more work before sleeping */
+			continue;
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue_exclusive(&sched->ibs_waitq, &wait);
+		spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+		waitq_wait(&wait, TASK_INTERRUPTIBLE);
+		busy_loops = 0;
+
+		remove_wait_queue(&sched->ibs_waitq, &wait);
+		set_current_state(TASK_RUNNING);
+		spin_lock_irqsave(&sched->ibs_lock, flags);
+	}
+
+	spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+	kiblnd_thread_fini();
+	return 0;
+}
+
+int	/* Device failover thread; loops until kib_shutdown is set. */
+kiblnd_failover_thread(void *arg)
+{
+	rwlock_t		*glock = &kiblnd_data.kib_global_lock;
+	kib_dev_t	 *dev;
+	wait_queue_t     wait;
+	unsigned long      flags;
+	int		rc;
+
+	LASSERT (*kiblnd_tunables.kib_dev_failover != 0);
+
+	cfs_block_allsigs ();
+
+	init_waitqueue_entry_current(&wait);
+	write_lock_irqsave(glock, flags);	/* dropped only around failover and sleep */
+
+	while (!kiblnd_data.kib_shutdown) {
+		int     do_failover = 0;
+		int     long_sleep;
+		/* pick the first failed device whose ibd_next_failover time has come */
+		list_for_each_entry(dev, &kiblnd_data.kib_failed_devs,
+				    ibd_fail_list) {
+			if (cfs_time_before(cfs_time_current(),
+					    dev->ibd_next_failover))
+				continue;
+			do_failover = 1;
+			break;
+		}
+
+		if (do_failover) {	/* 'dev' is the entry the loop above stopped at */
+			list_del_init(&dev->ibd_fail_list);
+			dev->ibd_failover = 1;
+			write_unlock_irqrestore(glock, flags);
+
+			rc = kiblnd_dev_failover(dev);	/* called with glock dropped */
+
+			write_lock_irqsave(glock, flags);
+
+			LASSERT (dev->ibd_failover);
+			dev->ibd_failover = 0;
+			if (rc >= 0) { /* Device is OK or failover succeed */
+				dev->ibd_next_failover = cfs_time_shift(3);
+				continue;
+			}
+
+			/* failed to failover, retry later */
+			dev->ibd_next_failover =	/* retry delay grows with failures, capped at 10 */
+				cfs_time_shift(min(dev->ibd_failed_failover, 10));
+			if (kiblnd_dev_can_failover(dev)) {
+				list_add_tail(&dev->ibd_fail_list,
+					      &kiblnd_data.kib_failed_devs);
+			}
+
+			continue;
+		}
+
+		/* long sleep if no more pending failover */
+		long_sleep = list_empty(&kiblnd_data.kib_failed_devs);
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue(&kiblnd_data.kib_failover_waitq, &wait);
+		write_unlock_irqrestore(glock, flags);
+		/* schedule_timeout() returns the time left, so rc != 0 means an early wake-up */
+		rc = schedule_timeout(long_sleep ? cfs_time_seconds(10) :
+						   cfs_time_seconds(1));
+		set_current_state(TASK_RUNNING);
+		remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait);
+		write_lock_irqsave(glock, flags);
+
+		if (!long_sleep || rc != 0)	/* only a full long sleep leads to the routine check */
+			continue;
+
+		/* have a long sleep, routine check all active devices,
+		 * we need checking like this because if there is not active
+		 * connection on the dev and no SEND from local, we may listen
+		 * on wrong HCA for ever while there is a bonding failover */
+		list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
+			if (kiblnd_dev_can_failover(dev)) {
+				list_add_tail(&dev->ibd_fail_list,
+					      &kiblnd_data.kib_failed_devs);
+			}
+		}
+	}
+
+	write_unlock_irqrestore(glock, flags);
+
+	kiblnd_thread_fini();
+	return 0;
+}
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c
new file mode 100644
index 000000000000..e21028b72302
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c
@@ -0,0 +1,493 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd_modparams.c
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "o2iblnd.h"
+
+static int service = 987;
+CFS_MODULE_PARM(service, "i", int, 0444,
+		"service number (within RDMA_PS_TCP)");
+
+static int cksum = 0;
+CFS_MODULE_PARM(cksum, "i", int, 0644,
+		"set non-zero to enable message (not RDMA) checksums");
+
+static int timeout = 50;
+CFS_MODULE_PARM(timeout, "i", int, 0644,
+		"timeout (seconds)");
+
+/* Number of threads in each scheduler pool (there is one pool per CPT);
+ * if it is zero, a reasonable value is estimated from the number of CPUs. */
+static int nscheds;
+CFS_MODULE_PARM(nscheds, "i", int, 0444,
+		"number of threads in each scheduler pool");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int ntx = 512;
+CFS_MODULE_PARM(ntx, "i", int, 0444,
+		"# of message descriptors allocated for each pool");
+
+/* NB: this value is shared by all CPTs */
+static int credits = 256;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+		"# concurrent sends");
+
+static int peer_credits = 8;
+CFS_MODULE_PARM(peer_credits, "i", int, 0444,
+		"# concurrent sends to 1 peer");
+
+static int peer_credits_hiw = 0;
+CFS_MODULE_PARM(peer_credits_hiw, "i", int, 0444,
+		"when eagerly to return credits");
+
+static int peer_buffer_credits = 0;
+CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444,
+		"# per-peer router buffer credits");
+
+static int peer_timeout = 180;
+CFS_MODULE_PARM(peer_timeout, "i", int, 0444,
+		"Seconds without aliveness news to declare peer dead (<=0 to disable)");
+
+static char *ipif_name = "ib0";
+CFS_MODULE_PARM(ipif_name, "s", charp, 0444,
+		"IPoIB interface name");
+
+static int retry_count = 5;
+CFS_MODULE_PARM(retry_count, "i", int, 0644,
+		"Retransmissions when no ACK received");
+
+static int rnr_retry_count = 6;
+CFS_MODULE_PARM(rnr_retry_count, "i", int, 0644,
+		"RNR retransmissions");
+
+static int keepalive = 100;
+CFS_MODULE_PARM(keepalive, "i", int, 0644,
+		"Idle time in seconds before sending a keepalive");
+
+static int ib_mtu = 0;
+CFS_MODULE_PARM(ib_mtu, "i", int, 0444,
+		"IB MTU 256/512/1024/2048/4096");
+
+static int concurrent_sends = 0;
+CFS_MODULE_PARM(concurrent_sends, "i", int, 0444,
+		"send work-queue sizing");
+
+static int map_on_demand = 0;
+CFS_MODULE_PARM(map_on_demand, "i", int, 0444,
+		"map on demand");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int fmr_pool_size = 512;
+CFS_MODULE_PARM(fmr_pool_size, "i", int, 0444,
+		"size of fmr pool on each CPT (>= ntx / 4)");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int fmr_flush_trigger = 384;
+CFS_MODULE_PARM(fmr_flush_trigger, "i", int, 0444,
+		"# dirty FMRs that triggers pool flush");
+
+static int fmr_cache = 1;
+CFS_MODULE_PARM(fmr_cache, "i", int, 0444,
+		"non-zero to enable FMR caching");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int pmr_pool_size = 512;
+CFS_MODULE_PARM(pmr_pool_size, "i", int, 0444,
+		"size of MR cache pmr pool on each CPT");
+
+/* 0: disable failover
+ * 1: enable failover if necessary
+ * 2: force to failover (for debug)
+ * NOTE(review): the description string below documents only 0 and 1 and
+ * calls other values reserved - confirm which of the two is intended. */
+static int dev_failover = 0;
+CFS_MODULE_PARM(dev_failover, "i", int, 0444,
+	       "HCA failover for bonding (0 off, 1 on, other values reserved)");
+
+/* Privileged-port policy for accepted and for initiated connections */
+static int require_privileged_port = 0;
+CFS_MODULE_PARM(require_privileged_port, "i", int, 0644,
+		"require privileged port when accepting connection");
+
+static int use_privileged_port = 1;
+CFS_MODULE_PARM(use_privileged_port, "i", int, 0644,
+		"use privileged port when initiating connection");
+
+kib_tunables_t kiblnd_tunables = {	/* pointers to the module parameters */
+	.kib_dev_failover      = &dev_failover,
+	.kib_service           = &service,
+	.kib_cksum             = &cksum,
+	.kib_timeout           = &timeout,
+	.kib_keepalive         = &keepalive,
+	.kib_ntx               = &ntx,
+	.kib_credits           = &credits,
+	.kib_peertxcredits     = &peer_credits,
+	.kib_peercredits_hiw   = &peer_credits_hiw,
+	.kib_peerrtrcredits    = &peer_buffer_credits,
+	.kib_peertimeout       = &peer_timeout,
+	.kib_default_ipif      = &ipif_name,
+	.kib_retry_count       = &retry_count,
+	.kib_rnr_retry_count   = &rnr_retry_count,
+	.kib_concurrent_sends  = &concurrent_sends,
+	.kib_ib_mtu            = &ib_mtu,
+	.kib_map_on_demand     = &map_on_demand,
+	.kib_fmr_pool_size     = &fmr_pool_size,
+	.kib_fmr_flush_trigger = &fmr_flush_trigger,
+	.kib_fmr_cache         = &fmr_cache,
+	.kib_pmr_pool_size     = &pmr_pool_size,
+	.kib_require_priv_port = &require_privileged_port,
+	.kib_use_priv_port     = &use_privileged_port,
+	.kib_nscheds           = &nscheds,
+};
+
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+
+static char ipif_basename_space[32];	/* copy of ipif_name exported via sysctl */
+
+/* ctl_name values for the o2iblnd sysctl entries, one per exported tunable */
+enum {
+	O2IBLND_SERVICE  = 1,
+	O2IBLND_CKSUM,
+	O2IBLND_TIMEOUT,
+	O2IBLND_NTX,
+	O2IBLND_CREDITS,
+	O2IBLND_PEER_TXCREDITS,
+	O2IBLND_PEER_CREDITS_HIW,
+	O2IBLND_PEER_RTRCREDITS,
+	O2IBLND_PEER_TIMEOUT,
+	O2IBLND_IPIF_BASENAME,
+	O2IBLND_RETRY_COUNT,
+	O2IBLND_RNR_RETRY_COUNT,
+	O2IBLND_KEEPALIVE,
+	O2IBLND_CONCURRENT_SENDS,
+	O2IBLND_IB_MTU,
+	O2IBLND_MAP_ON_DEMAND,
+	O2IBLND_FMR_POOL_SIZE,
+	O2IBLND_FMR_FLUSH_TRIGGER,
+	O2IBLND_FMR_CACHE,
+	O2IBLND_PMR_POOL_SIZE,
+	O2IBLND_DEV_FAILOVER
+};
+
+static ctl_table_t kiblnd_ctl_table[] = {	/* entries follow the enum order */
+	{
+		.ctl_name = O2IBLND_SERVICE,
+		.procname = "service",
+		.data     = &service,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_CKSUM,
+		.procname = "cksum",
+		.data     = &cksum,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_TIMEOUT,
+		.procname = "timeout",
+		.data     = &timeout,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_NTX,
+		.procname = "ntx",
+		.data     = &ntx,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_CREDITS,
+		.procname = "credits",
+		.data     = &credits,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_PEER_TXCREDITS,
+		.procname = "peer_credits",
+		.data     = &peer_credits,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_PEER_CREDITS_HIW,
+		.procname = "peer_credits_hiw",
+		.data     = &peer_credits_hiw,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_PEER_RTRCREDITS,
+		.procname = "peer_buffer_credits",
+		.data     = &peer_buffer_credits,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_PEER_TIMEOUT,
+		.procname = "peer_timeout",
+		.data     = &peer_timeout,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_IPIF_BASENAME,
+		.procname = "ipif_name",
+		.data     = ipif_basename_space,	/* filled by kiblnd_sysctl_init() */
+		.maxlen   = sizeof(ipif_basename_space),
+		.mode     = 0444,
+		.proc_handler = &proc_dostring
+	},
+	{
+		.ctl_name = O2IBLND_RETRY_COUNT,
+		.procname = "retry_count",
+		.data     = &retry_count,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_RNR_RETRY_COUNT,
+		.procname = "rnr_retry_count",
+		.data     = &rnr_retry_count,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_KEEPALIVE,
+		.procname = "keepalive",
+		.data     = &keepalive,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_CONCURRENT_SENDS,
+		.procname = "concurrent_sends",
+		.data     = &concurrent_sends,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_IB_MTU,
+		.procname = "ib_mtu",
+		.data     = &ib_mtu,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_MAP_ON_DEMAND,
+		.procname = "map_on_demand",
+		.data     = &map_on_demand,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	/* FMR/PMR pool sizing, then HCA failover */
+	{
+		.ctl_name = O2IBLND_FMR_POOL_SIZE,
+		.procname = "fmr_pool_size",
+		.data     = &fmr_pool_size,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_FMR_FLUSH_TRIGGER,
+		.procname = "fmr_flush_trigger",
+		.data     = &fmr_flush_trigger,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_FMR_CACHE,
+		.procname = "fmr_cache",
+		.data     = &fmr_cache,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_PMR_POOL_SIZE,
+		.procname = "pmr_pool_size",
+		.data     = &pmr_pool_size,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_DEV_FAILOVER,
+		.procname = "dev_failover",
+		.data     = &dev_failover,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{0}	/* all-zero entry ends the table */
+};
+
+/* Parent directory entry that hangs kiblnd_ctl_table under "o2iblnd".
+ * Members not named here (.data, .maxlen) are implicitly zero. */
+static ctl_table_t kiblnd_top_ctl_table[] = {
+	{
+		.procname = "o2iblnd",
+		.ctl_name = CTL_O2IBLND,
+		.mode     = 0555,
+		.child    = kiblnd_ctl_table
+	},
+	{0}
+};
+
+/* Copy 'str' into the 'size'-byte buffer 'space', truncating if necessary. */
+void kiblnd_initstrtunable(char *space, char *str, int size)
+{
+	strncpy(space, str, size);
+	space[size - 1] = '\0';	/* strncpy() doesn't terminate on truncation */
+}
+
+/* Register the o2iblnd sysctl tree; a failure is only warned about. */
+void kiblnd_sysctl_init(void)
+{
+	/* the "ipif_name" entry reads from this buffer, so fill it first */
+	kiblnd_initstrtunable(ipif_basename_space, ipif_name,
+			      sizeof(ipif_basename_space));
+
+	kiblnd_tunables.kib_sysctl =
+		cfs_register_sysctl_table(kiblnd_top_ctl_table, 0);
+	if (!kiblnd_tunables.kib_sysctl)
+		CWARN("Can't setup /proc tunables\n");
+}
+
+/* Unregister the sysctl tree if kiblnd_sysctl_init() managed to set it up. */
+void kiblnd_sysctl_fini(void)
+{
+	if (kiblnd_tunables.kib_sysctl)
+		unregister_sysctl_table(kiblnd_tunables.kib_sysctl);
+}
+
+#else
+
+/* Stub for builds without the sysctl interface: nothing to register. */
+void kiblnd_sysctl_init(void)
+{
+}
+
+/* Stub for builds without the sysctl interface: nothing to unregister. */
+void kiblnd_sysctl_fini(void)
+{
+}
+
+#endif
+
+/* Validate the module parameters, clamp them into mutually consistent
+ * ranges and register the sysctl interface.
+ * Returns 0, or -EINVAL when ib_mtu is not one of the supported MTUs. */
+int kiblnd_tunables_init(void)
+{
+	int *txcredits = kiblnd_tunables.kib_peertxcredits;
+	int *hiw       = kiblnd_tunables.kib_peercredits_hiw;
+	int *map       = kiblnd_tunables.kib_map_on_demand;
+	int *sends     = kiblnd_tunables.kib_concurrent_sends;
+
+	if (kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu) < 0) {
+		CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n",
+		       *kiblnd_tunables.kib_ib_mtu);
+		return -EINVAL;
+	}
+
+	/* peer credits: clamp to [DEFAULT, MAX], then cap at total credits */
+	if (*txcredits < IBLND_CREDITS_DEFAULT)
+		*txcredits = IBLND_CREDITS_DEFAULT;
+	if (*txcredits > IBLND_CREDITS_MAX)
+		*txcredits = IBLND_CREDITS_MAX;
+	if (*txcredits > *kiblnd_tunables.kib_credits)
+		*txcredits = *kiblnd_tunables.kib_credits;
+
+	/* high-water mark: clamp to [txcredits / 2, txcredits - 1] */
+	if (*hiw < *txcredits / 2)
+		*hiw = *txcredits / 2;
+	if (*hiw >= *txcredits)
+		*hiw = *txcredits - 1;
+
+	if (*map < 0 || *map > IBLND_MAX_RDMA_FRAGS)
+		*map = 0;	/* disable map-on-demand */
+	if (*map == 1)
+		*map = 2;	/* a map for a single fragment is pointless */
+
+	if (*sends == 0) {
+		if (*map > 0 && *map <= IBLND_MAX_RDMA_FRAGS / 8)
+			*sends = *txcredits * 2;
+		else
+			*sends = *txcredits;
+	}
+	/* concurrent sends: clamp to [txcredits / 2, txcredits * 2] */
+	if (*sends > *txcredits * 2)
+		*sends = *txcredits * 2;
+	if (*sends < *txcredits / 2)
+		*sends = *txcredits / 2;
+
+	if (*sends < *txcredits)
+		CWARN("Concurrent sends %d is lower than message queue size: %d, "
+		      "performance may drop slightly.\n", *sends, *txcredits);
+
+	kiblnd_sysctl_init();
+	return 0;
+}
+
+/* Counterpart of kiblnd_tunables_init(): tear down the sysctl interface. */
+void kiblnd_tunables_fini(void)
+{
+	kiblnd_sysctl_fini();
+}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/Makefile b/drivers/staging/lustre/lnet/klnds/socklnd/Makefile
new file mode 100644
index 000000000000..6494b2bada05
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/Makefile
@@ -0,0 +1,7 @@
+obj-$(CONFIG_LNET) += ksocklnd.o
+
+ksocklnd-y := socklnd.o socklnd_cb.o socklnd_proto.o socklnd_modparams.o socklnd_lib-linux.o
+
+
+
+ccflags-y := -I$(src)/../../include
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
new file mode 100644
index 000000000000..c826bf9d49ac
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
@@ -0,0 +1,2902 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/socklnd/socklnd.c
+ *
+ * Author: Zach Brown <zab@zabbo.net>
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "socklnd.h"
+
+lnd_t		   the_ksocklnd;	/* presumably the LND ops table - confirm */
+ksock_nal_data_t	ksocknal_data;	/* global state: peer hash, global lock, schedulers */
+
+/* Return the interface of ni's net whose address equals ip, or NULL. */
+ksock_interface_t *ksocknal_ip2iface(lnet_ni_t *ni, __u32 ip)
+{
+	ksock_net_t	*ksnet = ni->ni_data;
+	int		 idx;
+
+	for (idx = 0; idx < ksnet->ksnn_ninterfaces; idx++) {
+		ksock_interface_t *cand;
+
+		LASSERT(idx < LNET_MAX_INTERFACES);
+		cand = &ksnet->ksnn_interfaces[idx];
+		if (cand->ksni_ipaddr == ip)
+			return cand;
+	}
+
+	return NULL;
+}
+
+/* Allocate a route to ipaddr:port with one reference held for the caller. */
+ksock_route_t *ksocknal_create_route(__u32 ipaddr, int port)
+{
+	ksock_route_t *rt;
+
+	LIBCFS_ALLOC(rt, sizeof(*rt));
+	if (rt == NULL)
+		return NULL;
+
+	rt->ksnr_ipaddr = ipaddr;
+	rt->ksnr_port = port;
+	rt->ksnr_peer = NULL;
+	atomic_set(&rt->ksnr_refcount, 1);
+	rt->ksnr_retry_interval = 0;	/* OK to connect at any time */
+	rt->ksnr_scheduled = 0;
+	rt->ksnr_connecting = 0;
+	rt->ksnr_connected = 0;
+	rt->ksnr_deleted = 0;
+	rt->ksnr_conn_count = 0;
+	rt->ksnr_share_count = 0;
+
+	return rt;
+}
+
+/* Free a route whose refcount reached zero, dropping its ref on the peer. */
+void ksocknal_destroy_route(ksock_route_t *route)
+{
+	LASSERT(atomic_read(&route->ksnr_refcount) == 0);
+
+	if (route->ksnr_peer)
+		ksocknal_peer_decref(route->ksnr_peer);
+
+	LIBCFS_FREE(route, sizeof(*route));
+}
+
+/* Create a peer for 'id' on 'ni'; *peerp receives the caller's reference. */
+int ksocknal_create_peer(ksock_peer_t **peerp, lnet_ni_t *ni,
+			 lnet_process_id_t id)
+{
+	ksock_net_t	*ksnet = ni->ni_data;
+	ksock_peer_t	*new_peer;
+	int		 shutting_down;
+
+	LASSERT(id.nid != LNET_NID_ANY);
+	LASSERT(id.pid != LNET_PID_ANY);
+	LASSERT(!in_interrupt());
+
+	LIBCFS_ALLOC(new_peer, sizeof(*new_peer));
+	if (new_peer == NULL)
+		return -ENOMEM;
+
+	memset(new_peer, 0, sizeof(*new_peer));	/* NULL pointers/clear flags etc */
+
+	new_peer->ksnp_ni = ni;
+	new_peer->ksnp_id = id;
+	atomic_set(&new_peer->ksnp_refcount, 1);	/* 1 ref for caller */
+	new_peer->ksnp_closing = 0;
+	new_peer->ksnp_accepting = 0;
+	new_peer->ksnp_proto = NULL;
+	new_peer->ksnp_last_alive = 0;
+	new_peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1;
+
+	spin_lock_init(&new_peer->ksnp_lock);
+	INIT_LIST_HEAD(&new_peer->ksnp_conns);
+	INIT_LIST_HEAD(&new_peer->ksnp_routes);
+	INIT_LIST_HEAD(&new_peer->ksnp_tx_queue);
+	INIT_LIST_HEAD(&new_peer->ksnp_zc_req_list);
+
+	spin_lock_bh(&ksnet->ksnn_lock);
+	shutting_down = (ksnet->ksnn_shutdown != 0);
+	if (!shutting_down)
+		ksnet->ksnn_npeers++;
+	spin_unlock_bh(&ksnet->ksnn_lock);
+
+	if (shutting_down) {
+		LIBCFS_FREE(new_peer, sizeof(*new_peer));
+		CERROR("Can't create peer: network shutdown\n");
+		return -ESHUTDOWN;
+	}
+
+	*peerp = new_peer;
+	return 0;
+}
+
+/* Final teardown of a peer whose last reference has been dropped. */
+void ksocknal_destroy_peer(ksock_peer_t *peer)
+{
+	ksock_net_t *ksnet = peer->ksnp_ni->ni_data;
+
+	CDEBUG(D_NET, "peer %s %p deleted\n",
+	       libcfs_id2str(peer->ksnp_id), peer);
+
+	LASSERT(atomic_read(&peer->ksnp_refcount) == 0);
+	LASSERT(peer->ksnp_accepting == 0);
+	LASSERT(list_empty(&peer->ksnp_conns));
+	LASSERT(list_empty(&peer->ksnp_routes));
+	LASSERT(list_empty(&peer->ksnp_tx_queue));
+	LASSERT(list_empty(&peer->ksnp_zc_req_list));
+
+	LIBCFS_FREE(peer, sizeof(*peer));
+
+	/* Connections and routes each hold a reference on their peer until
+	 * they are destroyed themselves, so a zero refcount means every
+	 * piece of state tied to this peer is already gone; only the
+	 * per-net peer count is left to update. */
+	spin_lock_bh(&ksnet->ksnn_lock);
+	ksnet->ksnn_npeers--;
+	spin_unlock_bh(&ksnet->ksnn_lock);
+}
+
+/*
+ * Look up the peer matching (ni, id) on the hash chain selected by id.nid.
+ * The callers in this file hold ksocknal_data.ksnd_global_lock around the
+ * call.
+ * Returns the peer without taking a reference, or NULL if none matches.
+ */
+ksock_peer_t *ksocknal_find_peer_locked(lnet_ni_t *ni, lnet_process_id_t id)
+{
+	struct list_head *chain = ksocknal_nid2peerlist(id.nid);
+	ksock_peer_t	 *peer;
+
+	list_for_each_entry(peer, chain, ksnp_list) {
+		LASSERT(!peer->ksnp_closing);
+
+		if (peer->ksnp_ni != ni ||
+		    peer->ksnp_id.nid != id.nid ||
+		    peer->ksnp_id.pid != id.pid)
+			continue;
+
+		CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n",
+		       peer, libcfs_id2str(id),
+		       atomic_read(&peer->ksnp_refcount));
+		return peer;
+	}
+
+	return NULL;
+}
+
+/* Find (ni, id) under the global lock and take a reference for the caller. */
+ksock_peer_t *ksocknal_find_peer(lnet_ni_t *ni, lnet_process_id_t id)
+{
+	ksock_peer_t *found;
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+	found = ksocknal_find_peer_locked(ni, id);
+	if (found)
+		ksocknal_peer_addref(found);	/* +1 ref for caller */
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+
+	return found;
+}
+
+/* Remove a peer with no conns or routes from the peer table: fix up the
+ * interface peer counts, mark it closing and drop the table's reference. */
+void ksocknal_unlink_peer_locked(ksock_peer_t *peer)
+{
+	ksock_interface_t *iface;
+	int		   n;
+
+	for (n = 0; n < peer->ksnp_n_passive_ips; n++) {
+		LASSERT(n < LNET_MAX_INTERFACES);
+
+		/* All IPs in peer->ksnp_passive_ips[] come from the
+		 * interface list, therefore the lookup must succeed. */
+		iface = ksocknal_ip2iface(peer->ksnp_ni,
+					  peer->ksnp_passive_ips[n]);
+		LASSERT(iface != NULL);
+
+		/* NOTE(review): logs ksni_nroutes although ksni_npeers changes */
+		CDEBUG(D_NET, "peer=%p iface=%p ksni_nroutes=%d\n",
+		       peer, iface, iface->ksni_nroutes);
+		iface->ksni_npeers--;
+	}
+
+	LASSERT(list_empty(&peer->ksnp_conns));
+	LASSERT(list_empty(&peer->ksnp_routes));
+	LASSERT(!peer->ksnp_closing);
+	peer->ksnp_closing = 1;
+	list_del(&peer->ksnp_list);
+	ksocknal_peer_decref(peer);	/* lose peerlist's ref */
+}
+
+/* Report the index'th (peer, passive IP or route) tuple known on 'ni'.
+ * A peer contributes one entry per passive IP and one per route, or a
+ * single all-zero entry when it has neither.  Returns 0 with the output
+ * parameters filled in, or -ENOENT when 'index' is past the last entry. */
+int ksocknal_get_peer_info(lnet_ni_t *ni, int index,
+			   lnet_process_id_t *id, __u32 *myip, __u32 *peer_ip,
+			   int *port, int *conn_count, int *share_count)
+{
+	ksock_peer_t	*peer;
+	ksock_route_t	*route;
+	int		 bucket;
+	int		 n;
+	int		 rc = -ENOENT;
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	for (bucket = 0; bucket < ksocknal_data.ksnd_peer_hash_size; bucket++) {
+		list_for_each_entry(peer, &ksocknal_data.ksnd_peers[bucket],
+				    ksnp_list) {
+			if (peer->ksnp_ni != ni)
+				continue;
+
+			/* a peer with neither passive IPs nor routes */
+			if (peer->ksnp_n_passive_ips == 0 &&
+			    list_empty(&peer->ksnp_routes)) {
+				if (index-- > 0)
+					continue;
+
+				*id = peer->ksnp_id;
+				*myip = 0;
+				*peer_ip = 0;
+				*port = 0;
+				*conn_count = 0;
+				*share_count = 0;
+				rc = 0;
+				goto out;
+			}
+
+			/* one entry per passive IP */
+			for (n = 0; n < peer->ksnp_n_passive_ips; n++) {
+				if (index-- > 0)
+					continue;
+
+				*id = peer->ksnp_id;
+				*myip = peer->ksnp_passive_ips[n];
+				*peer_ip = 0;
+				*port = 0;
+				*conn_count = 0;
+				*share_count = 0;
+				rc = 0;
+				goto out;
+			}
+
+			/* one entry per route */
+			list_for_each_entry(route, &peer->ksnp_routes,
+					    ksnr_list) {
+				if (index-- > 0)
+					continue;
+
+				*id = peer->ksnp_id;
+				*myip = route->ksnr_myipaddr;
+				*peer_ip = route->ksnr_ipaddr;
+				*port = route->ksnr_port;
+				*conn_count = route->ksnr_conn_count;
+				*share_count = route->ksnr_share_count;
+				rc = 0;
+				goto out;
+			}
+		}
+	}
+ out:
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+	return rc;
+}
+
+/* Bind 'conn' to 'route': the conn takes a route reference, the route is
+ * (re)bound to the conn's local IP and marked connected for the conn type. */
+void ksocknal_associate_route_conn_locked(ksock_route_t *route,
+					  ksock_conn_t *conn)
+{
+	ksock_peer_t	  *peer = route->ksnr_peer;
+	int		   type = conn->ksnc_type;
+	ksock_interface_t *iface;
+
+	conn->ksnc_route = route;
+	ksocknal_route_addref(route);
+
+	if (route->ksnr_myipaddr != conn->ksnc_myipaddr) {
+		if (route->ksnr_myipaddr == 0) {
+			/* route wasn't bound locally yet (the initial route) */
+			CDEBUG(D_NET, "Binding %s %u.%u.%u.%u to %u.%u.%u.%u\n",
+			       libcfs_id2str(peer->ksnp_id),
+			       HIPQUAD(route->ksnr_ipaddr),
+			       HIPQUAD(conn->ksnc_myipaddr));
+		} else {
+			CDEBUG(D_NET, "Rebinding %s %u.%u.%u.%u from "
+			       "%u.%u.%u.%u to %u.%u.%u.%u\n",
+			       libcfs_id2str(peer->ksnp_id),
+			       HIPQUAD(route->ksnr_ipaddr),
+			       HIPQUAD(route->ksnr_myipaddr),
+			       HIPQUAD(conn->ksnc_myipaddr));
+
+			iface = ksocknal_ip2iface(peer->ksnp_ni,
+						  route->ksnr_myipaddr);
+			if (iface != NULL)
+				iface->ksni_nroutes--;
+		}
+		route->ksnr_myipaddr = conn->ksnc_myipaddr;
+		iface = ksocknal_ip2iface(peer->ksnp_ni, route->ksnr_myipaddr);
+		if (iface != NULL)
+			iface->ksni_nroutes++;
+	}
+
+	route->ksnr_connected |= (1 << type);
+	route->ksnr_conn_count++;
+
+	/* Successful connection => further attempts can proceed immediately */
+	route->ksnr_retry_interval = 0;
+}
+
+/*
+ * Attach 'route' to 'peer'.  The peer's route list takes over the caller's
+ * reference on the route, the route takes a reference on the peer, and
+ * every existing conn to the same remote address is associated with it.
+ */
+void ksocknal_add_route_locked(ksock_peer_t *peer, ksock_route_t *route)
+{
+	ksock_route_t	*other;
+	ksock_conn_t	*conn;
+
+	/* peer must be live; the route new: unattached, idle, unconnected */
+	LASSERT(!peer->ksnp_closing);
+	LASSERT(route->ksnr_peer == NULL);
+	LASSERT(!route->ksnr_scheduled);
+	LASSERT(!route->ksnr_connecting);
+	LASSERT(route->ksnr_connected == 0);
+
+	/* LASSERT(unique) */
+	list_for_each_entry(other, &peer->ksnp_routes, ksnr_list) {
+		if (other->ksnr_ipaddr != route->ksnr_ipaddr)
+			continue;
+
+		CERROR ("Duplicate route %s %u.%u.%u.%u\n",
+			libcfs_id2str(peer->ksnp_id),
+			HIPQUAD(route->ksnr_ipaddr));
+		LBUG();
+	}
+
+	route->ksnr_peer = peer;
+	ksocknal_peer_addref(peer);
+
+	/* peer's routelist takes over my ref on 'route' */
+	list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
+
+	list_for_each_entry(conn, &peer->ksnp_conns, ksnc_list) {
+		/* no break on a match: keep going (typed routes) */
+		if (conn->ksnc_ipaddr == route->ksnr_ipaddr)
+			ksocknal_associate_route_conn_locked(route, conn);
+	}
+}
+
+/*
+ * Delete 'route' from its peer: close the conns that use it, release its
+ * interface route count, unlink it and drop the peer's reference on it.
+ * The peer itself is unlinked once it has neither routes nor conns left.
+ */
+void ksocknal_del_route_locked(ksock_route_t *route)
+{
+	ksock_peer_t	  *peer = route->ksnr_peer;
+	ksock_interface_t *iface;
+	ksock_conn_t	  *conn;
+	ksock_conn_t	  *next;
+
+	LASSERT(!route->ksnr_deleted);
+
+	/* Close associated conns */
+	list_for_each_entry_safe(conn, next, &peer->ksnp_conns, ksnc_list) {
+		if (conn->ksnc_route == route)
+			ksocknal_close_conn_locked(conn, 0);
+	}
+
+	/* a locally bound route is counted against its interface */
+	if (route->ksnr_myipaddr != 0) {
+		iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
+					  route->ksnr_myipaddr);
+		if (iface != NULL)
+			iface->ksni_nroutes--;
+	}
+
+	route->ksnr_deleted = 1;
+	list_del(&route->ksnr_list);
+	ksocknal_route_decref(route);		/* drop peer's ref */
+
+	if (list_empty(&peer->ksnp_routes) &&
+	    list_empty(&peer->ksnp_conns)) {
+		/* I've just removed the last route to a peer with no active
+		 * connections */
+		ksocknal_unlink_peer_locked(peer);
+	}
+}
+
+/* Add a route to ipaddr:port for peer 'id' on 'ni', creating the peer if it
+ * is not in the peer table yet.  If the peer already has a route to ipaddr,
+ * only that route's share count is bumped.  Returns 0, -EINVAL for a
+ * wildcard id, the error from ksocknal_create_peer(), or -ENOMEM. */
+int ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ipaddr,
+		      int port)
+{
+	ksock_peer_t	*peer;
+	ksock_peer_t	*known_peer;
+	ksock_route_t	*route;
+	ksock_route_t	*scan;
+	ksock_route_t	*known_route = NULL;
+	int		 rc;
+
+	if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY)
+		return -EINVAL;
+
+	/* Have a brand new peer ready... */
+	rc = ksocknal_create_peer(&peer, ni, id);
+	if (rc != 0)
+		return rc;
+
+	route = ksocknal_create_route(ipaddr, port);
+	if (route == NULL) {
+		ksocknal_peer_decref(peer);
+		return -ENOMEM;
+	}
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	/* always called with a ref on ni, so shutdown can't have started */
+	LASSERT(((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0);
+
+	known_peer = ksocknal_find_peer_locked(ni, id);
+	if (known_peer == NULL) {
+		/* peer table takes my ref on peer */
+		list_add_tail(&peer->ksnp_list, ksocknal_nid2peerlist(id.nid));
+	} else {
+		ksocknal_peer_decref(peer);
+		peer = known_peer;
+	}
+
+	list_for_each_entry(scan, &peer->ksnp_routes, ksnr_list) {
+		if (scan->ksnr_ipaddr == ipaddr) {
+			known_route = scan;
+			break;
+		}
+	}
+
+	if (known_route == NULL) {
+		ksocknal_add_route_locked(peer, route);
+		route->ksnr_share_count++;
+	} else {
+		ksocknal_route_decref(route);
+		known_route->ksnr_share_count++;
+	}
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	return 0;
+}
+
+/*
+ * Delete the routes of 'peer' that match 'ip' (all routes when ip == 0).
+ * If no explicitly shared route remains afterwards, the remaining
+ * auto-created routes are deleted and all conns are closed as well.
+ */
+void ksocknal_del_peer_locked(ksock_peer_t *peer, __u32 ip)
+{
+	ksock_conn_t	*conn;
+	ksock_conn_t	*next_conn;
+	ksock_route_t	*route;
+	ksock_route_t	*next_route;
+	int		 nshared = 0;
+
+	LASSERT(!peer->ksnp_closing);
+
+	/* Extra ref prevents peer disappearing until I'm done with it */
+	ksocknal_peer_addref(peer);
+
+	list_for_each_entry_safe(route, next_route, &peer->ksnp_routes,
+				 ksnr_list) {
+		/* no match */
+		if (ip != 0 && route->ksnr_ipaddr != ip)
+			continue;
+
+		route->ksnr_share_count = 0;
+		/* This deletes associated conns too */
+		ksocknal_del_route_locked(route);
+	}
+
+	/* count the shares still held on the routes that are left */
+	list_for_each_entry(route, &peer->ksnp_routes, ksnr_list) {
+		nshared += route->ksnr_share_count;
+	}
+
+	if (nshared == 0) {
+		/* remove everything else if there are no explicit entries
+		 * left */
+
+		list_for_each_entry_safe(route, next_route, &peer->ksnp_routes,
+					 ksnr_list) {
+			/* we should only be removing auto-entries */
+			LASSERT(route->ksnr_share_count == 0);
+			ksocknal_del_route_locked(route);
+		}
+
+		list_for_each_entry_safe(conn, next_conn, &peer->ksnp_conns,
+					 ksnc_list) {
+			ksocknal_close_conn_locked(conn, 0);
+		}
+	}
+
+	ksocknal_peer_decref(peer);
+	/* NB peer unlinks itself when last conn/route is removed */
+}
+
+/* Delete the peers on 'ni' matching 'id' (LNET_NID_ANY / LNET_PID_ANY act as
+ * wildcards), restricted to routes to 'ip' unless ip == 0.  Messages still
+ * queued on a peer that ends up closing are completed after the global
+ * lock has been dropped.  Returns 0 if any peer matched, else -ENOENT. */
+int ksocknal_del_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ip)
+{
+	LIST_HEAD(zombies);
+	ksock_peer_t	*peer;
+	ksock_peer_t	*next;
+	int		 first;
+	int		 last;
+	int		 bucket;
+	int		 rc = -ENOENT;
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	if (id.nid == LNET_NID_ANY) {
+		first = 0;
+		last = ksocknal_data.ksnd_peer_hash_size - 1;
+	} else {
+		first = (int)(ksocknal_nid2peerlist(id.nid) -
+			      ksocknal_data.ksnd_peers);
+		last = first;
+	}
+
+	for (bucket = first; bucket <= last; bucket++) {
+		list_for_each_entry_safe(peer, next,
+				&ksocknal_data.ksnd_peers[bucket], ksnp_list) {
+			if (peer->ksnp_ni != ni ||
+			    (id.nid != LNET_NID_ANY &&
+			     peer->ksnp_id.nid != id.nid) ||
+			    (id.pid != LNET_PID_ANY &&
+			     peer->ksnp_id.pid != id.pid))
+				continue;
+
+			ksocknal_peer_addref(peer);	/* a ref for me... */
+			ksocknal_del_peer_locked(peer, ip);
+
+			if (peer->ksnp_closing &&
+			    !list_empty(&peer->ksnp_tx_queue)) {
+				LASSERT(list_empty(&peer->ksnp_conns));
+				LASSERT(list_empty(&peer->ksnp_routes));
+
+				list_splice_init(&peer->ksnp_tx_queue,
+						 &zombies);
+			}
+
+			ksocknal_peer_decref(peer);	/* ...till here */
+			rc = 0;				/* matched! */
+		}
+	}
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	ksocknal_txlist_done(ni, &zombies, 1);
+
+	return rc;
+}
+
+/*
+ * Return the index'th conn on 'ni' (in peer-table order) with a reference
+ * taken for the caller, or NULL when there are not that many conns.
+ */
+ksock_conn_t *ksocknal_get_conn_by_idx(lnet_ni_t *ni, int index)
+{
+	ksock_peer_t	*peer;
+	ksock_conn_t	*conn;
+	ksock_conn_t	*found = NULL;
+	int		 bucket;
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	for (bucket = 0; bucket < ksocknal_data.ksnd_peer_hash_size; bucket++) {
+		list_for_each_entry(peer, &ksocknal_data.ksnd_peers[bucket],
+				    ksnp_list) {
+			LASSERT(!peer->ksnp_closing);
+
+			if (peer->ksnp_ni != ni)
+				continue;
+
+			list_for_each_entry(conn, &peer->ksnp_conns,
+					    ksnc_list) {
+				if (index-- > 0)
+					continue;
+
+				ksocknal_conn_addref(conn);
+				found = conn;
+				goto out;
+			}
+		}
+	}
+
+ out:
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+	return found;
+}
+
+/* Pick the scheduler of CPT 'cpt' with the fewest conns; first one wins ties. */
+ksock_sched_t *ksocknal_choose_scheduler_locked(unsigned int cpt)
+{
+	struct ksock_sched_info	*info = ksocknal_data.ksnd_sched_info[cpt];
+	ksock_sched_t		*best;
+	int			 idx;
+
+	LASSERT(info->ksi_nthreads > 0);
+
+	/*
+	 * NB: it's safe so far, but info->ksi_nthreads could be changed
+	 * at runtime when we have dynamic LNet configuration, then we
+	 * need to take care of this.
+	 */
+	best = &info->ksi_scheds[0];
+	for (idx = 1; idx < info->ksi_nthreads; idx++) {
+		if (info->ksi_scheds[idx].kss_nconns < best->kss_nconns)
+			best = &info->ksi_scheds[idx];
+	}
+
+	return best;
+}
+
+/* Copy this net's interface addresses into ipaddrs[] and return how many
+ * were copied; returns 0 without copying when there are fewer than two. */
+int ksocknal_local_ipvec(lnet_ni_t *ni, __u32 *ipaddrs)
+{
+	ksock_net_t	*ksnet = ni->ni_data;
+	int		 count;
+	int		 idx;
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	count = ksnet->ksnn_ninterfaces;
+	LASSERT(count <= LNET_MAX_INTERFACES);
+
+	/* Only offer interfaces for additional connections if I have
+	 * more than one. */
+	if (count < 2)
+		count = 0;
+
+	for (idx = 0; idx < count; idx++) {
+		ipaddrs[idx] = ksnet->ksnn_interfaces[idx].ksni_ipaddr;
+		LASSERT(ipaddrs[idx] != 0);
+	}
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+
+	return count;
+}
+
+/*
+ * Return the index of the entry in ips[0..nips-1] that best matches 'iface'.
+ *
+ * Entries equal to 0 are ignored.  An address inside iface's subnet (its XOR
+ * with ksni_ipaddr has no bits set under ksni_netmask) beats one outside it;
+ * among candidates with the same subnet match, the one whose XOR with the
+ * interface address is numerically smallest wins.
+ *
+ * At least one entry must be non-zero; this is asserted.
+ */
+int
+ksocknal_match_peerip (ksock_interface_t *iface, __u32 *ips, int nips)
+{
+	int   best_netmatch = 0;
+	__u32 best_xor      = 0;
+	int   best	  = -1;
+	__u32 this_xor;
+	int   this_netmatch;
+	int   i;
+
+	for (i = 0; i < nips; i++) {
+		if (ips[i] == 0)
+			continue;
+
+		/* Keep the XOR distance unsigned: held in a signed int, any
+		 * address differing from the interface in the top bit turns
+		 * negative and would be ranked as the closest match. */
+		this_xor = (ips[i] ^ iface->ksni_ipaddr);
+		this_netmatch = ((this_xor & iface->ksni_netmask) == 0) ? 1 : 0;
+
+		if (!(best < 0 ||
+		      best_netmatch < this_netmatch ||
+		      (best_netmatch == this_netmatch &&
+		       best_xor > this_xor)))
+			continue;
+
+		best = i;
+		best_netmatch = this_netmatch;
+		best_xor = this_xor;
+	}
+
+	LASSERT (best >= 0);
+	return (best);
+}
+
+/*
+ * Select the local interface addresses to offer 'peer' for additional
+ * connections, given the peer's own address list.
+ *
+ * peer:      peer whose ksnp_passive_ips[] / ksnp_n_passive_ips are extended
+ * peerips:   in: the peer's addresses; entries are zeroed as they get
+ *            matched.  out: the first n_ips entries are overwritten with
+ *            peer->ksnp_passive_ips[]
+ * n_peerips: number of entries in peerips[] (<= LNET_MAX_INTERFACES)
+ *
+ * Returns the number of addresses written back to peerips[]: 0 when the net
+ * has fewer than two interfaces, otherwise the smaller of n_peerips and the
+ * number of local interfaces.
+ */
+int
+ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips)
+{
+	rwlock_t		*global_lock = &ksocknal_data.ksnd_global_lock;
+	ksock_net_t	*net = peer->ksnp_ni->ni_data;
+	ksock_interface_t  *iface;
+	ksock_interface_t  *best_iface;
+	int		 n_ips;
+	int		 i;
+	int		 j;
+	int		 k;
+	__u32	       ip;
+	__u32	       xor;
+	int		 this_netmatch;
+	int		 best_netmatch;
+	int		 best_npeers;
+
+	/* CAVEAT EMPTOR: We do all our interface matching with an
+	 * exclusive hold of global lock at IRQ priority.  We're only
+	 * expecting to be dealing with small numbers of interfaces, so the
+	 * O(n**3)-ness shouldn't matter */
+	/* NOTE(review): the lock is actually taken with write_lock_bh()
+	 * below; "IRQ priority" above may be historical - confirm. */
+
+	/* Also note that I'm not going to return more than n_peerips
+	 * interfaces, even if I have more myself */
+
+	write_lock_bh(global_lock);
+
+	LASSERT (n_peerips <= LNET_MAX_INTERFACES);
+	LASSERT (net->ksnn_ninterfaces <= LNET_MAX_INTERFACES);
+
+	/* Only match interfaces for additional connections
+	 * if I have > 1 interface */
+	n_ips = (net->ksnn_ninterfaces < 2) ? 0 :
+		MIN(n_peerips, net->ksnn_ninterfaces);
+
+	/* The loop ends once the peer has n_ips passive IPs recorded; 'i'
+	 * first walks the already-recorded entries, then appends new ones. */
+	for (i = 0; peer->ksnp_n_passive_ips < n_ips; i++) {
+		/*	      ^ yes really... */
+
+		/* If we have any new interfaces, first tick off all the
+		 * peer IPs that match old interfaces, then choose new
+		 * interfaces to match the remaining peer IPS.
+		 * We don't forget interfaces we've stopped using; we might
+		 * start using them again... */
+
+		if (i < peer->ksnp_n_passive_ips) {
+			/* Old interface. */
+			ip = peer->ksnp_passive_ips[i];
+			best_iface = ksocknal_ip2iface(peer->ksnp_ni, ip);
+
+			/* peer passive ips are kept up to date */
+			LASSERT(best_iface != NULL);
+		} else {
+			/* choose a new interface */
+			LASSERT (i == peer->ksnp_n_passive_ips);
+
+			best_iface = NULL;
+			best_netmatch = 0;
+			best_npeers = 0;
+
+			for (j = 0; j < net->ksnn_ninterfaces; j++) {
+				iface = &net->ksnn_interfaces[j];
+				ip = iface->ksni_ipaddr;
+
+				for (k = 0; k < peer->ksnp_n_passive_ips; k++)
+					if (peer->ksnp_passive_ips[k] == ip)
+						break;
+
+				if (k < peer->ksnp_n_passive_ips) /* using it already */
+					continue;
+
+				/* Rank this interface by its best remaining
+				 * peer IP: same subnet first, then the
+				 * interface with the fewest peers. */
+				k = ksocknal_match_peerip(iface, peerips, n_peerips);
+				xor = (ip ^ peerips[k]);
+				this_netmatch = ((xor & iface->ksni_netmask) == 0) ? 1 : 0;
+
+				if (!(best_iface == NULL ||
+				      best_netmatch < this_netmatch ||
+				      (best_netmatch == this_netmatch &&
+				       best_npeers > iface->ksni_npeers)))
+					continue;
+
+				best_iface = iface;
+				best_netmatch = this_netmatch;
+				best_npeers = iface->ksni_npeers;
+			}
+
+			/* NOTE(review): best_iface is dereferenced here before
+			 * the LASSERT below; this relies on n_ips never
+			 * exceeding the number of local interfaces. */
+			best_iface->ksni_npeers++;
+			ip = best_iface->ksni_ipaddr;
+			peer->ksnp_passive_ips[i] = ip;
+			peer->ksnp_n_passive_ips = i+1;
+		}
+
+		LASSERT (best_iface != NULL);
+
+		/* mark the best matching peer IP used */
+		j = ksocknal_match_peerip(best_iface, peerips, n_peerips);
+		peerips[j] = 0;
+	}
+
+	/* Overwrite input peer IP addresses */
+	memcpy(peerips, peer->ksnp_passive_ips, n_ips * sizeof(*peerips));
+
+	write_unlock_bh(global_lock);
+
+	return (n_ips);
+}
+
+/*
+ * Add routes from local interfaces to the addresses a peer advertised.
+ *
+ * peer:          peer to add routes to
+ * port:          port passed to ksocknal_create_route() for each new route
+ * peer_ipaddrs:  the peer's addresses
+ * npeer_ipaddrs: number of entries in peer_ipaddrs[] (<= LNET_MAX_INTERFACES)
+ *
+ * Nothing is done when the net has fewer than two interfaces.  For every
+ * peer address that has no route yet, a local interface not already used by
+ * one of the peer's routes is chosen - preferring one on the same subnet,
+ * then the one with the fewest routes - and a route bound to it is added.
+ * Processing stops early if a route cannot be allocated or the peer is
+ * found to be closing.
+ */
+void
+ksocknal_create_routes(ksock_peer_t *peer, int port,
+		       __u32 *peer_ipaddrs, int npeer_ipaddrs)
+{
+	ksock_route_t       *newroute = NULL;
+	rwlock_t		*global_lock = &ksocknal_data.ksnd_global_lock;
+	lnet_ni_t	   *ni = peer->ksnp_ni;
+	ksock_net_t	 *net = ni->ni_data;
+	struct list_head	  *rtmp;
+	ksock_route_t       *route;
+	ksock_interface_t   *iface;
+	ksock_interface_t   *best_iface;
+	int		  best_netmatch;
+	int		  this_netmatch;
+	int		  best_nroutes;
+	int		  i;
+	int		  j;
+
+	/* CAVEAT EMPTOR: We do all our interface matching with an
+	 * exclusive hold of global lock at IRQ priority.  We're only
+	 * expecting to be dealing with small numbers of interfaces, so the
+	 * O(n**3)-ness here shouldn't matter */
+
+	write_lock_bh(global_lock);
+
+	if (net->ksnn_ninterfaces < 2) {
+		/* Only create additional connections
+		 * if I have > 1 interface */
+		write_unlock_bh(global_lock);
+		return;
+	}
+
+	LASSERT (npeer_ipaddrs <= LNET_MAX_INTERFACES);
+
+	for (i = 0; i < npeer_ipaddrs; i++) {
+		if (newroute != NULL) {
+			/* a route allocated on an earlier pass was not used;
+			 * retarget it instead of allocating again */
+			newroute->ksnr_ipaddr = peer_ipaddrs[i];
+		} else {
+			/* the global lock is dropped around the allocation */
+			write_unlock_bh(global_lock);
+
+			newroute = ksocknal_create_route(peer_ipaddrs[i], port);
+			if (newroute == NULL)
+				return;
+
+			write_lock_bh(global_lock);
+		}
+
+		if (peer->ksnp_closing) {
+			/* peer got closed under me */
+			break;
+		}
+
+		/* Already got a route? */
+		route = NULL;
+		list_for_each(rtmp, &peer->ksnp_routes) {
+			route = list_entry(rtmp, ksock_route_t, ksnr_list);
+
+			if (route->ksnr_ipaddr == newroute->ksnr_ipaddr)
+				break;
+
+			route = NULL;
+		}
+		if (route != NULL)
+			continue;
+
+		best_iface = NULL;
+		best_nroutes = 0;
+		best_netmatch = 0;
+
+		LASSERT (net->ksnn_ninterfaces <= LNET_MAX_INTERFACES);
+
+		/* Select interface to connect from */
+		for (j = 0; j < net->ksnn_ninterfaces; j++) {
+			iface = &net->ksnn_interfaces[j];
+
+			/* Using this interface already? */
+			list_for_each(rtmp, &peer->ksnp_routes) {
+				route = list_entry(rtmp, ksock_route_t,
+						       ksnr_list);
+
+				if (route->ksnr_myipaddr == iface->ksni_ipaddr)
+					break;
+
+				route = NULL;
+			}
+			if (route != NULL)
+				continue;
+
+			/* 1 if the peer address lies in this interface's
+			 * subnet, 0 otherwise */
+			this_netmatch = (((iface->ksni_ipaddr ^
+					   newroute->ksnr_ipaddr) &
+					   iface->ksni_netmask) == 0) ? 1 : 0;
+
+			if (!(best_iface == NULL ||
+			      best_netmatch < this_netmatch ||
+			      (best_netmatch == this_netmatch &&
+			       best_nroutes > iface->ksni_nroutes)))
+				continue;
+
+			best_iface = iface;
+			best_netmatch = this_netmatch;
+			best_nroutes = iface->ksni_nroutes;
+		}
+
+		/* every local interface already has a route to this peer */
+		if (best_iface == NULL)
+			continue;
+
+		newroute->ksnr_myipaddr = best_iface->ksni_ipaddr;
+		best_iface->ksni_nroutes++;
+
+		/* newroute is handed over to the peer; clear it so the next
+		 * pass allocates a fresh one */
+		ksocknal_add_route_locked(peer, newroute);
+		newroute = NULL;
+	}
+
+	write_unlock_bh(global_lock);
+	/* drop a route that was allocated but never attached */
+	if (newroute != NULL)
+		ksocknal_route_decref(newroute);
+}
+
+/*
+ * Queue an incoming socket for the connection daemon.
+ *
+ * A connection request holding a reference on 'ni' and the socket is
+ * appended to ksnd_connd_connreqs and the connd wait queue is woken.
+ * Returns 0 on success or -ENOMEM if the request cannot be allocated.
+ */
+int
+ksocknal_accept (lnet_ni_t *ni, socket_t *sock)
+{
+	ksock_connreq_t	*cr;
+	__u32		 peer_ip;
+	int		 peer_port;
+	int		 rc;
+
+	rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
+	LASSERT(rc == 0);		/* we succeeded before */
+
+	LIBCFS_ALLOC(cr, sizeof(*cr));
+	if (cr != NULL) {
+		lnet_ni_addref(ni);
+		cr->ksncr_ni   = ni;
+		cr->ksncr_sock = sock;
+
+		spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+		list_add_tail(&cr->ksncr_list,
+			      &ksocknal_data.ksnd_connd_connreqs);
+		wake_up(&ksocknal_data.ksnd_connd_waitq);
+		spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+
+		return 0;
+	}
+
+	LCONSOLE_ERROR_MSG(0x12f, "Dropping connection request from "
+			   "%u.%u.%u.%u: memory exhausted\n",
+			   HIPQUAD(peer_ip));
+	return -ENOMEM;
+}
+
+/*
+ * Report the ksnr_connecting state of the first route of 'peer' whose
+ * address is 'ipaddr'; 0 when the peer has no such route.
+ */
+int
+ksocknal_connecting (ksock_peer_t *peer, __u32 ipaddr)
+{
+	ksock_route_t	*route;
+	int		 connecting = 0;
+
+	list_for_each_entry(route, &peer->ksnp_routes, ksnr_list) {
+		if (route->ksnr_ipaddr != ipaddr)
+			continue;
+
+		connecting = route->ksnr_connecting;
+		break;
+	}
+
+	return connecting;
+}
+
+/*
+ * Turn a connected socket into a socklnd connection.
+ *
+ * ni:    network interface the connection belongs to
+ * route: route that initiated the connection, or NULL for a passive
+ *        (accepted) connection
+ * sock:  the connected socket; it is released with libcfs_sock_release()
+ *        on the failed_* paths
+ * type:  connection type; asserted to be SOCKLND_CONN_NONE exactly when
+ *        route is NULL
+ *
+ * An active connection sends its HELLO first and then reads the peer's; a
+ * passive one reads the peer's HELLO, finds or creates the peer, and
+ * replies once the connection has been added to the peer.
+ *
+ * Returns 0 on success; a negative errno on failure; or a positive value
+ * (EALREADY, EPROTO, ESTALE) when the connection was deliberately not
+ * created.  For passive connections a positive value also causes a HELLO
+ * of type SOCKLND_CONN_NONE to be sent back.  If the reply HELLO or socket
+ * setup fails after the connection has been added to the peer, the
+ * connection is closed through ksocknal_close_conn_locked() and that error
+ * is returned.
+ */
+int
+ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
+		      socket_t *sock, int type)
+{
+	rwlock_t		*global_lock = &ksocknal_data.ksnd_global_lock;
+	LIST_HEAD     (zombies);
+	lnet_process_id_t  peerid;
+	struct list_head	*tmp;
+	__u64	      incarnation;
+	ksock_conn_t      *conn;
+	ksock_conn_t      *conn2;
+	ksock_peer_t      *peer = NULL;
+	ksock_peer_t      *peer2;
+	ksock_sched_t     *sched;
+	ksock_hello_msg_t *hello;
+	int		   cpt;
+	ksock_tx_t	*tx;
+	ksock_tx_t	*txtmp;
+	int		rc;
+	int		active;
+	char	      *warn = NULL;
+
+	active = (route != NULL);
+
+	LASSERT (active == (type != SOCKLND_CONN_NONE));
+
+	LIBCFS_ALLOC(conn, sizeof(*conn));
+	if (conn == NULL) {
+		rc = -ENOMEM;
+		goto failed_0;
+	}
+
+	memset (conn, 0, sizeof (*conn));
+
+	conn->ksnc_peer = NULL;
+	conn->ksnc_route = NULL;
+	conn->ksnc_sock = sock;
+	/* 2 ref, 1 for conn, another extra ref prevents socket
+	 * being closed before establishment of connection */
+	atomic_set (&conn->ksnc_sock_refcount, 2);
+	conn->ksnc_type = type;
+	ksocknal_lib_save_callback(sock, conn);
+	atomic_set (&conn->ksnc_conn_refcount, 1); /* 1 ref for me */
+
+	conn->ksnc_rx_ready = 0;
+	conn->ksnc_rx_scheduled = 0;
+
+	INIT_LIST_HEAD (&conn->ksnc_tx_queue);
+	conn->ksnc_tx_ready = 0;
+	conn->ksnc_tx_scheduled = 0;
+	conn->ksnc_tx_carrier = NULL;
+	atomic_set (&conn->ksnc_tx_nob, 0);
+
+	/* the HELLO buffer is sized for the maximum number of interface
+	 * addresses */
+	LIBCFS_ALLOC(hello, offsetof(ksock_hello_msg_t,
+				     kshm_ips[LNET_MAX_INTERFACES]));
+	if (hello == NULL) {
+		rc = -ENOMEM;
+		goto failed_1;
+	}
+
+	/* stash conn's local and remote addrs */
+	rc = ksocknal_lib_get_conn_addrs (conn);
+	if (rc != 0)
+		goto failed_1;
+
+	/* Find out/confirm peer's NID and connection type and get the
+	 * vector of interfaces she's willing to let me connect to.
+	 * Passive connections use the listener timeout since the peer sends
+	 * eagerly */
+
+	if (active) {
+		peer = route->ksnr_peer;
+		LASSERT(ni == peer->ksnp_ni);
+
+		/* Active connection sends HELLO eagerly */
+		hello->kshm_nips = ksocknal_local_ipvec(ni, hello->kshm_ips);
+		peerid = peer->ksnp_id;
+
+		write_lock_bh(global_lock);
+		conn->ksnc_proto = peer->ksnp_proto;
+		write_unlock_bh(global_lock);
+
+		/* no protocol negotiated with this peer yet: start with v3 */
+		if (conn->ksnc_proto == NULL) {
+			 conn->ksnc_proto = &ksocknal_protocol_v3x;
+#if SOCKNAL_VERSION_DEBUG
+			 if (*ksocknal_tunables.ksnd_protocol == 2)
+				 conn->ksnc_proto = &ksocknal_protocol_v2x;
+			 else if (*ksocknal_tunables.ksnd_protocol == 1)
+				 conn->ksnc_proto = &ksocknal_protocol_v1x;
+#endif
+		}
+
+		rc = ksocknal_send_hello (ni, conn, peerid.nid, hello);
+		if (rc != 0)
+			goto failed_1;
+	} else {
+		peerid.nid = LNET_NID_ANY;
+		peerid.pid = LNET_PID_ANY;
+
+		/* Passive, get protocol from peer */
+		conn->ksnc_proto = NULL;
+	}
+
+	rc = ksocknal_recv_hello (ni, conn, hello, &peerid, &incarnation);
+	if (rc < 0)
+		goto failed_1;
+
+	/* a positive rc is only expected on active connections; it is
+	 * dispatched by the switch further down */
+	LASSERT (rc == 0 || active);
+	LASSERT (conn->ksnc_proto != NULL);
+	LASSERT (peerid.nid != LNET_NID_ANY);
+
+	cpt = lnet_cpt_of_nid(peerid.nid);
+
+	if (active) {
+		ksocknal_peer_addref(peer);
+		write_lock_bh(global_lock);
+	} else {
+		rc = ksocknal_create_peer(&peer, ni, peerid);
+		if (rc != 0)
+			goto failed_1;
+
+		write_lock_bh(global_lock);
+
+		/* called with a ref on ni, so shutdown can't have started */
+		LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0);
+
+		peer2 = ksocknal_find_peer_locked(ni, peerid);
+		if (peer2 == NULL) {
+			/* NB this puts an "empty" peer in the peer
+			 * table (which takes my ref) */
+			list_add_tail(&peer->ksnp_list,
+					  ksocknal_nid2peerlist(peerid.nid));
+		} else {
+			/* the peer already exists: drop the one just created
+			 * and use the existing one */
+			ksocknal_peer_decref(peer);
+			peer = peer2;
+		}
+
+		/* +1 ref for me */
+		ksocknal_peer_addref(peer);
+		peer->ksnp_accepting++;
+
+		/* Am I already connecting to this guy?  Resolve in
+		 * favour of higher NID... */
+		if (peerid.nid < ni->ni_nid &&
+		    ksocknal_connecting(peer, conn->ksnc_ipaddr)) {
+			rc = EALREADY;
+			warn = "connection race resolution";
+			goto failed_2;
+		}
+	}
+
+	/* From here until the unlock below the global lock is held for
+	 * writing; failed_2 is entered with it held. */
+	if (peer->ksnp_closing ||
+	    (active && route->ksnr_deleted)) {
+		/* peer/route got closed under me */
+		rc = -ESTALE;
+		warn = "peer/route removed";
+		goto failed_2;
+	}
+
+	if (peer->ksnp_proto == NULL) {
+		/* Never connected before.
+		 * NB recv_hello may have returned EPROTO to signal my peer
+		 * wants a different protocol than the one I asked for.
+		 */
+		LASSERT (list_empty(&peer->ksnp_conns));
+
+		peer->ksnp_proto = conn->ksnc_proto;
+		peer->ksnp_incarnation = incarnation;
+	}
+
+	if (peer->ksnp_proto != conn->ksnc_proto ||
+	    peer->ksnp_incarnation != incarnation) {
+		/* Peer rebooted or I've got the wrong protocol version */
+		ksocknal_close_peer_conns_locked(peer, 0, 0);
+
+		peer->ksnp_proto = NULL;
+		rc = ESTALE;
+		warn = peer->ksnp_incarnation != incarnation ?
+		       "peer rebooted" :
+		       "wrong proto version";
+		goto failed_2;
+	}
+
+	/* rc still holds the result of ksocknal_recv_hello() here */
+	switch (rc) {
+	default:
+		LBUG();
+	case 0:
+		break;
+	case EALREADY:
+		warn = "lost conn race";
+		goto failed_2;
+	case EPROTO:
+		warn = "retry with different protocol version";
+		goto failed_2;
+	}
+
+	/* Refuse to duplicate an existing connection, unless this is a
+	 * loopback connection */
+	if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) {
+		list_for_each(tmp, &peer->ksnp_conns) {
+			conn2 = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+			if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr ||
+			    conn2->ksnc_myipaddr != conn->ksnc_myipaddr ||
+			    conn2->ksnc_type != conn->ksnc_type)
+				continue;
+
+			/* Reply on a passive connection attempt so the peer
+			 * realises we're connected. */
+			LASSERT (rc == 0);
+			if (!active)
+				rc = EALREADY;
+
+			warn = "duplicate";
+			goto failed_2;
+		}
+	}
+
+	/* If the connection created by this route didn't bind to the IP
+	 * address the route connected to, the connection/route matching
+	 * code below probably isn't going to work. */
+	if (active &&
+	    route->ksnr_ipaddr != conn->ksnc_ipaddr) {
+		CERROR("Route %s %u.%u.%u.%u connected to %u.%u.%u.%u\n",
+		       libcfs_id2str(peer->ksnp_id),
+		       HIPQUAD(route->ksnr_ipaddr),
+		       HIPQUAD(conn->ksnc_ipaddr));
+	}
+
+	/* Search for a route corresponding to the new connection and
+	 * create an association.  This allows incoming connections created
+	 * by routes in my peer to match my own route entries so I don't
+	 * continually create duplicate routes. */
+	list_for_each (tmp, &peer->ksnp_routes) {
+		route = list_entry(tmp, ksock_route_t, ksnr_list);
+
+		if (route->ksnr_ipaddr != conn->ksnc_ipaddr)
+			continue;
+
+		ksocknal_associate_route_conn_locked(route, conn);
+		break;
+	}
+
+	conn->ksnc_peer = peer;		 /* conn takes my ref on peer */
+	peer->ksnp_last_alive = cfs_time_current();
+	peer->ksnp_send_keepalive = 0;
+	peer->ksnp_error = 0;
+
+	sched = ksocknal_choose_scheduler_locked(cpt);
+	sched->kss_nconns++;
+	conn->ksnc_scheduler = sched;
+
+	conn->ksnc_tx_last_post = cfs_time_current();
+	/* Set the deadline for the outgoing HELLO to drain */
+	conn->ksnc_tx_bufnob = cfs_sock_wmem_queued(sock);
+	conn->ksnc_tx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+	mb();   /* order with adding to peer's conn list */
+
+	/* the peer's connection list takes its own reference on conn */
+	list_add (&conn->ksnc_list, &peer->ksnp_conns);
+	ksocknal_conn_addref(conn);
+
+	ksocknal_new_packet(conn, 0);
+
+	conn->ksnc_zc_capable = ksocknal_lib_zc_capable(conn);
+
+	/* Take packets blocking for this connection. */
+	list_for_each_entry_safe(tx, txtmp, &peer->ksnp_tx_queue, tx_list) {
+		if (conn->ksnc_proto->pro_match_tx(conn, tx, tx->tx_nonblk) == SOCKNAL_MATCH_NO)
+				continue;
+
+		list_del (&tx->tx_list);
+		ksocknal_queue_tx_locked (tx, conn);
+	}
+
+	write_unlock_bh(global_lock);
+
+	/* We've now got a new connection.  Any errors from here on are just
+	 * like "normal" comms errors and we close the connection normally.
+	 * NB (a) we still have to send the reply HELLO for passive
+	 *	connections,
+	 *    (b) normal I/O on the conn is blocked until I setup and call the
+	 *	socket callbacks.
+	 */
+
+	CDEBUG(D_NET, "New conn %s p %d.x %u.%u.%u.%u -> %u.%u.%u.%u/%d"
+	       " incarnation:"LPD64" sched[%d:%d]\n",
+	       libcfs_id2str(peerid), conn->ksnc_proto->pro_version,
+	       HIPQUAD(conn->ksnc_myipaddr), HIPQUAD(conn->ksnc_ipaddr),
+	       conn->ksnc_port, incarnation, cpt,
+	       (int)(sched - &sched->kss_info->ksi_scheds[0]));
+
+	if (active) {
+		/* additional routes after interface exchange? */
+		ksocknal_create_routes(peer, conn->ksnc_port,
+				       hello->kshm_ips, hello->kshm_nips);
+	} else {
+		/* reply with the local interfaces selected for this peer */
+		hello->kshm_nips = ksocknal_select_ips(peer, hello->kshm_ips,
+						       hello->kshm_nips);
+		rc = ksocknal_send_hello(ni, conn, peerid.nid, hello);
+	}
+
+	LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t,
+				    kshm_ips[LNET_MAX_INTERFACES]));
+
+	/* setup the socket AFTER I've received hello (it disables
+	 * SO_LINGER).  I might call back to the acceptor who may want
+	 * to send a protocol version response and then close the
+	 * socket; this ensures the socket only tears down after the
+	 * response has been sent. */
+	if (rc == 0)
+		rc = ksocknal_lib_setup_sock(sock);
+
+	write_lock_bh(global_lock);
+
+	/* NB my callbacks block while I hold ksnd_global_lock */
+	ksocknal_lib_set_callback(sock, conn);
+
+	if (!active)
+		peer->ksnp_accepting--;
+
+	write_unlock_bh(global_lock);
+
+	if (rc != 0) {
+		write_lock_bh(global_lock);
+		if (!conn->ksnc_closing) {
+			/* could be closed by another thread */
+			ksocknal_close_conn_locked(conn, rc);
+		}
+		write_unlock_bh(global_lock);
+	} else if (ksocknal_connsock_addref(conn) == 0) {
+		/* Allow I/O to proceed. */
+		ksocknal_read_callback(conn);
+		ksocknal_write_callback(conn);
+		ksocknal_connsock_decref(conn);
+	}
+
+	/* drop the extra socket ref and the conn ref taken at creation */
+	ksocknal_connsock_decref(conn);
+	ksocknal_conn_decref(conn);
+	return rc;
+
+	/* failed_2: entered with the global lock held and a ref on peer */
+ failed_2:
+	if (!peer->ksnp_closing &&
+	    list_empty (&peer->ksnp_conns) &&
+	    list_empty (&peer->ksnp_routes)) {
+		/* move the peer's queued TXs onto the local zombies list:
+		 * insert the zombies head into the queue, then remove the
+		 * queue's own head */
+		list_add(&zombies, &peer->ksnp_tx_queue);
+		list_del_init(&peer->ksnp_tx_queue);
+		ksocknal_unlink_peer_locked(peer);
+	}
+
+	write_unlock_bh(global_lock);
+
+	if (warn != NULL) {
+		if (rc < 0)
+			CERROR("Not creating conn %s type %d: %s\n",
+			       libcfs_id2str(peerid), conn->ksnc_type, warn);
+		else
+			CDEBUG(D_NET, "Not creating conn %s type %d: %s\n",
+			      libcfs_id2str(peerid), conn->ksnc_type, warn);
+	}
+
+	if (!active) {
+		if (rc > 0) {
+			/* Request retry by replying with CONN_NONE
+			 * ksnc_proto has been set already */
+			conn->ksnc_type = SOCKLND_CONN_NONE;
+			hello->kshm_nips = 0;
+			ksocknal_send_hello(ni, conn, peerid.nid, hello);
+		}
+
+		write_lock_bh(global_lock);
+		peer->ksnp_accepting--;
+		write_unlock_bh(global_lock);
+	}
+
+	ksocknal_txlist_done(ni, &zombies, 1);
+	ksocknal_peer_decref(peer);
+
+	/* failed_1: conn is allocated; hello may or may not be */
+ failed_1:
+	if (hello != NULL)
+		LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t,
+					    kshm_ips[LNET_MAX_INTERFACES]));
+
+	LIBCFS_FREE (conn, sizeof(*conn));
+
+	/* failed_0: only the socket needs releasing */
+ failed_0:
+	libcfs_sock_release(sock);
+	return rc;
+}
+
+/*
+ * Begin closing 'conn': mark it closing, detach it from its peer and route,
+ * and queue it on ksnd_deathrow_conns for the reaper.
+ *
+ * conn:  connection to close; must not already be closing (asserted)
+ * error: stored in peer->ksnp_error when this was the peer's last
+ *        connection
+ */
+void
+ksocknal_close_conn_locked (ksock_conn_t *conn, int error)
+{
+	/* This just does the immediate housekeeping, and queues the
+	 * connection for the reaper to terminate.
+	 * Caller holds ksnd_global_lock exclusively in irq context */
+	ksock_peer_t      *peer = conn->ksnc_peer;
+	ksock_route_t     *route;
+	ksock_conn_t      *conn2;
+	struct list_head	*tmp;
+
+	LASSERT (peer->ksnp_error == 0);
+	LASSERT (!conn->ksnc_closing);
+	conn->ksnc_closing = 1;
+
+	/* ksnd_deathrow_conns takes over peer's ref */
+	list_del (&conn->ksnc_list);
+
+	route = conn->ksnc_route;
+	if (route != NULL) {
+		/* dissociate conn from route... */
+		LASSERT (!route->ksnr_deleted);
+		LASSERT ((route->ksnr_connected & (1 << conn->ksnc_type)) != 0);
+
+		/* look for another conn of the same type on this route */
+		conn2 = NULL;
+		list_for_each(tmp, &peer->ksnp_conns) {
+			conn2 = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+			if (conn2->ksnc_route == route &&
+			    conn2->ksnc_type == conn->ksnc_type)
+				break;
+
+			conn2 = NULL;
+		}
+		/* none left: clear this type's bit in the route's
+		 * connected mask */
+		if (conn2 == NULL)
+			route->ksnr_connected &= ~(1 << conn->ksnc_type);
+
+		conn->ksnc_route = NULL;
+
+#if 0	   /* irrelevant with only eager routes */
+		/* make route least favourite */
+		list_del (&route->ksnr_list);
+		list_add_tail (&route->ksnr_list, &peer->ksnp_routes);
+#endif
+		ksocknal_route_decref(route);     /* drop conn's ref on route */
+	}
+
+	if (list_empty (&peer->ksnp_conns)) {
+		/* No more connections to this peer */
+
+		if (!list_empty(&peer->ksnp_tx_queue)) {
+			ksock_tx_t *tx;
+
+			LASSERT (conn->ksnc_proto == &ksocknal_protocol_v3x);
+
+			/* throw them to the last connection...,
+			 * these TXs will be sent to /dev/null by scheduler */
+			list_for_each_entry(tx, &peer->ksnp_tx_queue,
+						tx_list)
+				ksocknal_tx_prep(conn, tx);
+
+			spin_lock_bh(&conn->ksnc_scheduler->kss_lock);
+			list_splice_init(&peer->ksnp_tx_queue,
+					     &conn->ksnc_tx_queue);
+			spin_unlock_bh(&conn->ksnc_scheduler->kss_lock);
+		}
+
+		peer->ksnp_proto = NULL;	/* renegotiate protocol version */
+		peer->ksnp_error = error;       /* stash last conn close reason */
+
+		if (list_empty (&peer->ksnp_routes)) {
+			/* I've just closed last conn belonging to a
+			 * peer with no routes to it */
+			ksocknal_unlink_peer_locked (peer);
+		}
+	}
+
+	/* hand the conn to the reaper and wake it */
+	spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+	list_add_tail(&conn->ksnc_list,
+			  &ksocknal_data.ksnd_deathrow_conns);
+	wake_up(&ksocknal_data.ksnd_reaper_waitq);
+
+	spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+}
+
+/*
+ * React to a connection failure or comms error on 'peer'.
+ *
+ * LNET is only told the peer is dead when the peer is another kernel
+ * (LNET_PID_USERFLAG clear in its pid) and it has no connections, no
+ * accepts in progress and no route currently connecting.
+ */
+void
+ksocknal_peer_failed (ksock_peer_t *peer)
+{
+	cfs_time_t	last_alive = 0;
+	int		dead;
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	dead = (peer->ksnp_id.pid & LNET_PID_USERFLAG) == 0 &&
+	       list_empty(&peer->ksnp_conns) &&
+	       peer->ksnp_accepting == 0 &&
+	       ksocknal_find_connecting_route_locked(peer) == NULL;
+	if (dead)
+		last_alive = peer->ksnp_last_alive;
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+
+	if (!dead)
+		return;
+
+	/* notification is issued after the lock has been dropped */
+	lnet_notify(peer->ksnp_ni, peer->ksnp_id.nid, 0, last_alive);
+}
+
+/*
+ * Abort every zero-copy request of conn's peer that was sent on 'conn'.
+ *
+ * Matching TXs have their first ZC cookie cleared, are flagged as aborted
+ * and are moved to a private list under the peer's lock; their references
+ * are dropped once the lock has been released.
+ */
+void
+ksocknal_finalize_zcreq(ksock_conn_t *conn)
+{
+	ksock_peer_t	*peer = conn->ksnc_peer;
+	LIST_HEAD	(aborted);
+	ksock_tx_t	*tx;
+	ksock_tx_t	*next;
+
+	/* NB finalizing the TXs is safe because closing the socket
+	 * aborts all buffered data */
+	LASSERT(conn->ksnc_sock == NULL);
+
+	spin_lock(&peer->ksnp_lock);
+
+	list_for_each_entry_safe(tx, next, &peer->ksnp_zc_req_list,
+				 tx_zc_list) {
+		if (tx->tx_conn == conn) {
+			LASSERT(tx->tx_msg.ksm_zc_cookies[0] != 0);
+
+			tx->tx_msg.ksm_zc_cookies[0] = 0;
+			tx->tx_zc_aborted = 1; /* mark it as not-acked */
+			list_del(&tx->tx_zc_list);
+			list_add(&tx->tx_zc_list, &aborted);
+		}
+	}
+
+	spin_unlock(&peer->ksnp_lock);
+
+	/* release the collected TXs, always taking the list head */
+	while (!list_empty(&aborted)) {
+		tx = list_first_entry(&aborted, ksock_tx_t, tx_zc_list);
+
+		list_del(&tx->tx_zc_list);
+		ksocknal_tx_decref(tx);
+	}
+}
+
+/*
+ * Called by the reaper (guaranteed thread context) to disengage a closing
+ * connection's socket from its callbacks and drop the connection's socket
+ * reference.  The conn itself is destroyed later by the reaper, once
+ * ksnc_refcount reaches zero.
+ */
+void
+ksocknal_terminate_conn (ksock_conn_t *conn)
+{
+	ksock_peer_t	*peer = conn->ksnc_peer;
+	ksock_sched_t	*sched = conn->ksnc_scheduler;
+	int		 notify_failure;
+
+	LASSERT(conn->ksnc_closing);
+
+	/* kick the scheduler so it "sends" whatever is still queued to
+	 * /dev/null */
+	spin_lock_bh(&sched->kss_lock);
+
+	/* a closing conn is always ready to tx */
+	conn->ksnc_tx_ready = 1;
+
+	if (!(conn->ksnc_tx_scheduled || list_empty(&conn->ksnc_tx_queue))) {
+		/* extra ref for scheduler */
+		ksocknal_conn_addref(conn);
+		conn->ksnc_tx_scheduled = 1;
+		list_add_tail(&conn->ksnc_tx_list, &sched->kss_tx_conns);
+
+		wake_up(&sched->kss_waitq);
+	}
+
+	spin_unlock_bh(&sched->kss_lock);
+
+	/* serialise with callbacks */
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	ksocknal_lib_reset_callback(conn->ksnc_sock, conn);
+
+	/* the conn may not be fully disengaged from its scheduler yet,
+	 * but it _has_ committed to terminate... */
+	conn->ksnc_scheduler->kss_nconns--;
+
+	notify_failure = (peer->ksnp_error != 0);
+	if (notify_failure) {
+		/* peer's last conn closed in error */
+		LASSERT(list_empty(&peer->ksnp_conns));
+		peer->ksnp_error = 0;	/* avoid multiple notifications */
+	}
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	if (notify_failure)
+		ksocknal_peer_failed(peer);
+
+	/* The socket is closed on the final put; either here, or in
+	 * ksocknal_{send,recv}msg().  Since the linger2 option was set up
+	 * when the connection was established, this closes the socket
+	 * immediately, aborting anything buffered in it, so any hung
+	 * zero-copy transmits complete in finite time. */
+	ksocknal_connsock_decref(conn);
+}
+
+/*
+ * Hand a connection whose refcount has reached zero to the reaper: append
+ * it to ksnd_zombie_conns and wake the reaper thread.
+ */
+void
+ksocknal_queue_zombie_conn (ksock_conn_t *conn)
+{
+	struct list_head *zombies = &ksocknal_data.ksnd_zombie_conns;
+
+	LASSERT(atomic_read(&conn->ksnc_conn_refcount) == 0);
+
+	spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+	list_add_tail(&conn->ksnc_list, zombies);
+	wake_up(&ksocknal_data.ksnd_reaper_waitq);
+	spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+}
+
+/*
+ * Final destruction of a connection by the reaper.
+ *
+ * The conn must be completely idle (no refs, no socket, no route, nothing
+ * scheduled or queued).  An interrupted receive is reported, and a partial
+ * LNET payload is finalized with -EIO, before the conn's peer reference is
+ * dropped and the conn freed.
+ */
+void
+ksocknal_destroy_conn (ksock_conn_t *conn)
+{
+	ksock_peer_t	*owner;
+	cfs_time_t	 last_rcv;
+
+	/* Final coup-de-grace of the reaper */
+	CDEBUG (D_NET, "connection %p\n", conn);
+
+	LASSERT(atomic_read(&conn->ksnc_conn_refcount) == 0);
+	LASSERT(atomic_read(&conn->ksnc_sock_refcount) == 0);
+	LASSERT(conn->ksnc_sock == NULL);
+	LASSERT(conn->ksnc_route == NULL);
+	LASSERT(!conn->ksnc_tx_scheduled);
+	LASSERT(!conn->ksnc_rx_scheduled);
+	LASSERT(list_empty(&conn->ksnc_tx_queue));
+
+	owner = conn->ksnc_peer;
+
+	/* complete current receive if any */
+	switch (conn->ksnc_rx_state) {
+	case SOCKNAL_RX_LNET_PAYLOAD:
+		/* derive the time of the last receive from the rx deadline */
+		last_rcv = conn->ksnc_rx_deadline -
+			   cfs_time_seconds(*ksocknal_tunables.ksnd_timeout);
+		CERROR("Completing partial receive from %s[%d]"
+		       ", ip %d.%d.%d.%d:%d, with error, wanted: %d, left: %d, "
+		       "last alive is %ld secs ago\n",
+		       libcfs_id2str(owner->ksnp_id), conn->ksnc_type,
+		       HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port,
+		       conn->ksnc_rx_nob_wanted, conn->ksnc_rx_nob_left,
+		       cfs_duration_sec(cfs_time_sub(cfs_time_current(),
+					last_rcv)));
+		lnet_finalize(owner->ksnp_ni, conn->ksnc_cookie, -EIO);
+		break;
+
+	case SOCKNAL_RX_LNET_HEADER:
+		if (!conn->ksnc_rx_started)
+			break;
+		CERROR("Incomplete receive of lnet header from %s"
+		       ", ip %d.%d.%d.%d:%d, with error, protocol: %d.x.\n",
+		       libcfs_id2str(owner->ksnp_id),
+		       HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port,
+		       conn->ksnc_proto->pro_version);
+		break;
+
+	case SOCKNAL_RX_KSM_HEADER:
+		if (!conn->ksnc_rx_started)
+			break;
+		CERROR("Incomplete receive of ksock message from %s"
+		       ", ip %d.%d.%d.%d:%d, with error, protocol: %d.x.\n",
+		       libcfs_id2str(owner->ksnp_id),
+		       HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port,
+		       conn->ksnc_proto->pro_version);
+		break;
+
+	case SOCKNAL_RX_SLOP:
+		if (!conn->ksnc_rx_started)
+			break;
+		CERROR("Incomplete receive of slops from %s"
+		       ", ip %d.%d.%d.%d:%d, with error\n",
+		       libcfs_id2str(owner->ksnp_id),
+		       HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+		break;
+
+	default:
+		LBUG ();
+		break;
+	}
+
+	ksocknal_peer_decref(owner);
+
+	LIBCFS_FREE (conn, sizeof (*conn));
+}
+
+/*
+ * Close the connections of 'peer' whose remote address is 'ipaddr', or all
+ * of them when ipaddr is 0.  'why' is passed to ksocknal_close_conn_locked()
+ * as the close reason.  Returns the number of connections closed.
+ */
+int
+ksocknal_close_peer_conns_locked (ksock_peer_t *peer, __u32 ipaddr, int why)
+{
+	ksock_conn_t	*conn;
+	ksock_conn_t	*next;
+	int		 closed = 0;
+
+	/* closing unlinks the conn from the list being walked, hence the
+	 * _safe iterator */
+	list_for_each_entry_safe(conn, next, &peer->ksnp_conns, ksnc_list) {
+		if (ipaddr != 0 && conn->ksnc_ipaddr != ipaddr)
+			continue;
+
+		closed++;
+		ksocknal_close_conn_locked(conn, why);
+	}
+
+	return closed;
+}
+
+/*
+ * Close 'conn' together with every other connection of its peer that
+ * shares conn's remote address.  Takes the global lock for writing itself
+ * and returns the number of connections closed.
+ */
+int
+ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why)
+{
+	rwlock_t	*glock = &ksocknal_data.ksnd_global_lock;
+	ksock_peer_t	*owner = conn->ksnc_peer;
+	__u32		 remote = conn->ksnc_ipaddr;
+	int		 closed;
+
+	write_lock_bh(glock);
+	closed = ksocknal_close_peer_conns_locked(owner, remote, why);
+	write_unlock_bh(glock);
+
+	return closed;
+}
+
+/*
+ * Close the connections of every peer matching 'id' (LNET_NID_ANY and
+ * LNET_PID_ANY act as wildcards) whose remote address is 'ipaddr' (0
+ * matches any address).
+ *
+ * Returns 0 if any wildcard was used or at least one connection was
+ * closed, -ENOENT otherwise.
+ */
+int
+ksocknal_close_matching_conns (lnet_process_id_t id, __u32 ipaddr)
+{
+	rwlock_t	*glock = &ksocknal_data.ksnd_global_lock;
+	ksock_peer_t	*peer;
+	ksock_peer_t	*next;
+	int		 first;
+	int		 last;
+	int		 bucket;
+	int		 closed = 0;
+
+	write_lock_bh(glock);
+
+	if (id.nid == LNET_NID_ANY) {
+		/* wildcard NID: scan the whole hash table */
+		first = 0;
+		last = ksocknal_data.ksnd_peer_hash_size - 1;
+	} else {
+		/* a specific NID lives in exactly one bucket */
+		first = last = (int)(ksocknal_nid2peerlist(id.nid) -
+				     ksocknal_data.ksnd_peers);
+	}
+
+	for (bucket = first; bucket <= last; bucket++) {
+		list_for_each_entry_safe(peer, next,
+					 &ksocknal_data.ksnd_peers[bucket],
+					 ksnp_list) {
+			if (id.nid != LNET_NID_ANY &&
+			    id.nid != peer->ksnp_id.nid)
+				continue;
+
+			if (id.pid != LNET_PID_ANY &&
+			    id.pid != peer->ksnp_id.pid)
+				continue;
+
+			closed += ksocknal_close_peer_conns_locked(peer,
+								   ipaddr, 0);
+		}
+	}
+
+	write_unlock_bh(glock);
+
+	/* wildcards always succeed */
+	if (closed != 0 ||
+	    id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY || ipaddr == 0)
+		return 0;
+
+	return -ENOENT;
+}
+
+/*
+ * Router callback reporting a change in the state of gateway 'gw_nid'.
+ *
+ * When the gateway went down all connections to it (any pid, any address)
+ * are closed; nothing is done when it came up.
+ */
+void
+ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive)
+{
+	lnet_process_id_t  id = { .nid = gw_nid, .pid = LNET_PID_ANY };
+
+	CDEBUG (D_NET, "gw %s %s\n", libcfs_nid2str(gw_nid),
+		alive ? "up" : "down");
+
+	/* Nothing to do for a live gateway: new connections can only be
+	 * established through autoroutes, and those connect on demand. */
+	if (alive)
+		return;
+
+	/* The gateway crashed: close all open connections to it. */
+	ksocknal_close_matching_conns(id, 0);
+}
+
+/*
+ * Report when peer 'nid' was last known to be alive and, if appropriate,
+ * start connecting to it.
+ *
+ * ni:   network interface the peer is looked up on
+ * nid:  NID of the peer (the pid used is LUSTRE_SRV_LNET_PID)
+ * when: set to the peer's last-alive time, but only if that is non-zero
+ *
+ * Unless the peer exists and has no connectable route, the peer is added
+ * (ksocknal_add_peer) and all its connections are launched.
+ */
+void
+ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
+{
+	lnet_process_id_t  id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID};
+	rwlock_t	  *glock = &ksocknal_data.ksnd_global_lock;
+	cfs_time_t	   now = cfs_time_current();
+	cfs_time_t	   last_alive = 0;
+	ksock_peer_t	  *peer;
+	ksock_conn_t	  *conn;
+	int		   connect = 1;
+	int		   bufnob;
+
+	read_lock(glock);
+
+	peer = ksocknal_find_peer_locked(ni, id);
+	if (peer != NULL) {
+		list_for_each_entry(conn, &peer->ksnp_conns, ksnc_list) {
+			bufnob = cfs_sock_wmem_queued(conn->ksnc_sock);
+			if (bufnob >= conn->ksnc_tx_bufnob)
+				continue;
+
+			/* the send buffer shrank: something got ACKed */
+			conn->ksnc_tx_deadline =
+				cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+			peer->ksnp_last_alive = now;
+			conn->ksnc_tx_bufnob = bufnob;
+		}
+
+		last_alive = peer->ksnp_last_alive;
+		connect = (ksocknal_find_connectable_route_locked(peer) !=
+			   NULL);
+	}
+
+	read_unlock(glock);
+
+	if (last_alive != 0)
+		*when = last_alive;
+
+	CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago, connect %d\n",
+	       libcfs_nid2str(nid), peer,
+	       last_alive ? cfs_duration_sec(now - last_alive) : -1,
+	       connect);
+
+	if (!connect)
+		return;
+
+	ksocknal_add_peer(ni, id, LNET_NIDADDR(nid), lnet_acceptor_port());
+
+	write_lock_bh(glock);
+
+	/* look the peer up again now that the write lock is held */
+	peer = ksocknal_find_peer_locked(ni, id);
+	if (peer != NULL)
+		ksocknal_launch_all_connections_locked(peer);
+
+	write_unlock_bh(glock);
+}
+
+/*
+ * Call ksocknal_lib_push_conn() on each connection of 'peer' in turn.
+ *
+ * The global lock is only held while locating the next connection by its
+ * position in the list; the push itself runs unlocked with a reference
+ * held on the connection.
+ */
+void
+ksocknal_push_peer (ksock_peer_t *peer)
+{
+	ksock_conn_t	*conn;
+	ksock_conn_t	*target;
+	int		 index;
+	int		 pos;
+
+	for (index = 0; ; index++) {
+		read_lock(&ksocknal_data.ksnd_global_lock);
+
+		pos = 0;
+		target = NULL;
+
+		list_for_each_entry(conn, &peer->ksnp_conns, ksnc_list) {
+			if (pos++ != index)
+				continue;
+
+			ksocknal_conn_addref(conn);
+			target = conn;
+			break;
+		}
+
+		read_unlock(&ksocknal_data.ksnd_global_lock);
+
+		/* ran off the end of the list: every conn has been pushed */
+		if (target == NULL)
+			return;
+
+		ksocknal_lib_push_conn(target);
+		ksocknal_conn_decref(target);
+	}
+}
+
+/*
+ * Push all connections of every peer matching @id.
+ *
+ * id.nid == LNET_NID_ANY and id.pid == LNET_PID_ANY act as wildcards.
+ * Each hash bucket is re-scanned under the read lock to pick the j-th
+ * matching peer, which is then pushed with the lock dropped and a peer
+ * reference held.
+ *
+ * Returns 0 if at least one peer matched, -ENOENT otherwise.
+ *
+ * Fixes over the previous version:
+ *  - the per-bucket loop now terminates once no j-th match exists (it used
+ *    to spin forever);
+ *  - 'peer' is only set for the entry a reference was taken on, so a
+ *    matching peer at the wrong position is never pushed or decref'd
+ *    without a reference.
+ */
+int
+ksocknal_push (lnet_ni_t *ni, lnet_process_id_t id)
+{
+	ksock_peer_t      *peer;
+	struct list_head	*tmp;
+	int		index;
+	int		i;
+	int		j;
+	int		rc = -ENOENT;
+
+	for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+		for (j = 0; ; j++) {
+			read_lock(&ksocknal_data.ksnd_global_lock);
+
+			index = 0;
+			peer = NULL;
+
+			list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) {
+				ksock_peer_t *cand = list_entry(tmp,
+								ksock_peer_t,
+								ksnp_list);
+
+				if (id.nid != LNET_NID_ANY &&
+				    id.nid != cand->ksnp_id.nid)
+					continue;
+
+				if (id.pid != LNET_PID_ANY &&
+				    id.pid != cand->ksnp_id.pid)
+					continue;
+
+				if (index++ == j) {
+					/* only a referenced peer leaves
+					 * the locked section */
+					peer = cand;
+					ksocknal_peer_addref(peer);
+					break;
+				}
+			}
+
+			read_unlock(&ksocknal_data.ksnd_global_lock);
+
+			/* fewer than j+1 matches in this bucket: next one */
+			if (peer == NULL)
+				break;
+
+			rc = 0;
+			ksocknal_push_peer (peer);
+			ksocknal_peer_decref(peer);
+		}
+	}
+
+	return (rc);
+}
+
+/*
+ * Register a local interface (@ipaddress/@netmask) with the net behind @ni.
+ *
+ * The new entry's peer and route counters are seeded by counting, across
+ * the whole peer hash, the passive IPs and routes already bound to
+ * @ipaddress.
+ *
+ * Returns 0 on success or if the address is already registered, -EINVAL
+ * if either argument is zero, -ENOSPC if LNET_MAX_INTERFACES entries are
+ * already in use.
+ */
+int
+ksocknal_add_interface(lnet_ni_t *ni, __u32 ipaddress, __u32 netmask)
+{
+	ksock_net_t	  *net = ni->ni_data;
+	ksock_interface_t *iface;
+	ksock_peer_t	  *peer;
+	ksock_route_t	  *route;
+	int		   bucket;
+	int		   k;
+	int		   rc = 0;
+
+	if (ipaddress == 0 || netmask == 0)
+		return -EINVAL;
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	/* silently ignore dups */
+	if (ksocknal_ip2iface(ni, ipaddress) != NULL)
+		goto out;
+
+	if (net->ksnn_ninterfaces == LNET_MAX_INTERFACES) {
+		rc = -ENOSPC;
+		goto out;
+	}
+
+	iface = &net->ksnn_interfaces[net->ksnn_ninterfaces++];
+
+	iface->ksni_ipaddr = ipaddress;
+	iface->ksni_netmask = netmask;
+	iface->ksni_nroutes = 0;
+	iface->ksni_npeers = 0;
+
+	for (bucket = 0; bucket < ksocknal_data.ksnd_peer_hash_size; bucket++) {
+		list_for_each_entry(peer, &ksocknal_data.ksnd_peers[bucket],
+				    ksnp_list) {
+			for (k = 0; k < peer->ksnp_n_passive_ips; k++) {
+				if (peer->ksnp_passive_ips[k] == ipaddress)
+					iface->ksni_npeers++;
+			}
+
+			list_for_each_entry(route, &peer->ksnp_routes,
+					    ksnr_list) {
+				if (route->ksnr_myipaddr == ipaddress)
+					iface->ksni_nroutes++;
+			}
+		}
+	}
+
+	/* NB only new connections will pay attention to the new interface! */
+out:
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	return rc;
+}
+
+/*
+ * Detach @peer from the local interface address @ipaddr:
+ *  - drop the first matching entry from the peer's passive IP array;
+ *  - for routes bound to @ipaddr, unbind those with a non-zero share count
+ *    and delete the others;
+ *  - close every connection whose local address is @ipaddr.
+ *
+ * The "_locked" suffix and the only visible caller indicate this runs with
+ * ksnd_global_lock write-held.
+ */
+void
+ksocknal_peer_del_interface_locked(ksock_peer_t *peer, __u32 ipaddr)
+{
+	ksock_route_t	*route;
+	ksock_route_t	*route_nxt;
+	ksock_conn_t	*conn;
+	ksock_conn_t	*conn_nxt;
+	int		 slot;
+	int		 k;
+
+	for (slot = 0; slot < peer->ksnp_n_passive_ips; slot++) {
+		if (peer->ksnp_passive_ips[slot] != ipaddr)
+			continue;
+
+		/* close the gap left by the removed entry */
+		for (k = slot + 1; k < peer->ksnp_n_passive_ips; k++)
+			peer->ksnp_passive_ips[k - 1] =
+				peer->ksnp_passive_ips[k];
+		peer->ksnp_n_passive_ips--;
+		break;
+	}
+
+	list_for_each_entry_safe(route, route_nxt, &peer->ksnp_routes,
+				 ksnr_list) {
+		if (route->ksnr_myipaddr != ipaddr)
+			continue;
+
+		if (route->ksnr_share_count == 0) {
+			ksocknal_del_route_locked(route);
+			continue;
+		}
+
+		/* Manually created; keep, but unbind */
+		route->ksnr_myipaddr = 0;
+	}
+
+	list_for_each_entry_safe(conn, conn_nxt, &peer->ksnp_conns,
+				 ksnc_list) {
+		if (conn->ksnc_myipaddr == ipaddr)
+			ksocknal_close_conn_locked (conn, 0);
+	}
+}
+
+/*
+ * Remove local interface @ipaddress from the net behind @ni, or every
+ * interface when @ipaddress is 0, and detach all of this NI's peers from
+ * each removed address.
+ *
+ * Returns 0 if at least one interface was removed, -ENOENT otherwise.
+ *
+ * Fix over the previous version: removing entry i shifts the following
+ * entries down by one, so the index must not advance after a removal.
+ * The old loop always incremented it and therefore skipped the entry that
+ * had just moved into slot i (with the wildcard, every other interface
+ * survived).
+ */
+int
+ksocknal_del_interface(lnet_ni_t *ni, __u32 ipaddress)
+{
+	ksock_net_t       *net = ni->ni_data;
+	int		rc = -ENOENT;
+	struct list_head	*tmp;
+	struct list_head	*nxt;
+	ksock_peer_t      *peer;
+	__u32	      this_ip;
+	int		i;
+	int		j;
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	i = 0;
+	while (i < net->ksnn_ninterfaces) {
+		this_ip = net->ksnn_interfaces[i].ksni_ipaddr;
+
+		if (ipaddress != 0 && ipaddress != this_ip) {
+			i++;
+			continue;
+		}
+
+		rc = 0;
+
+		/* compact the array over the removed slot */
+		for (j = i+1; j < net->ksnn_ninterfaces; j++)
+			net->ksnn_interfaces[j-1] =
+				net->ksnn_interfaces[j];
+
+		net->ksnn_ninterfaces--;
+
+		for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) {
+			list_for_each_safe(tmp, nxt,
+					       &ksocknal_data.ksnd_peers[j]) {
+				peer = list_entry(tmp, ksock_peer_t,
+						      ksnp_list);
+
+				if (peer->ksnp_ni != ni)
+					continue;
+
+				ksocknal_peer_del_interface_locked(peer, this_ip);
+			}
+		}
+
+		/* i is deliberately not advanced: slot i now holds the
+		 * entry that followed the one just removed */
+	}
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	return (rc);
+}
+
+/*
+ * ioctl-style control entry point (installed as the_ksocklnd.lnd_ctl).
+ *
+ * ni:  network interface the command applies to
+ * cmd: one of the IOC_LIBCFS_* commands handled below
+ * arg: struct libcfs_ioctl_data carrying the inputs and receiving outputs
+ *
+ * Returns 0 on success or a negative errno; unknown commands yield -EINVAL.
+ */
+int
+ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+	struct libcfs_ioctl_data *data = arg;
+	lnet_process_id_t id = {0};
+	int rc = -EINVAL;
+
+	switch (cmd) {
+	case IOC_LIBCFS_GET_INTERFACE: {
+		/* ioc_count selects the interface slot to report */
+		ksock_net_t       *net = ni->ni_data;
+		ksock_interface_t *iface;
+
+		read_lock(&ksocknal_data.ksnd_global_lock);
+
+		if (data->ioc_count < (__u32)net->ksnn_ninterfaces) {
+			iface = &net->ksnn_interfaces[data->ioc_count];
+
+			data->ioc_u32[0] = iface->ksni_ipaddr;
+			data->ioc_u32[1] = iface->ksni_netmask;
+			data->ioc_u32[2] = iface->ksni_npeers;
+			data->ioc_u32[3] = iface->ksni_nroutes;
+			rc = 0;
+		} else {
+			rc = -ENOENT;
+		}
+
+		read_unlock(&ksocknal_data.ksnd_global_lock);
+		break;
+	}
+
+	case IOC_LIBCFS_ADD_INTERFACE:
+		rc = ksocknal_add_interface(ni,
+					    data->ioc_u32[0], /* IP address */
+					    data->ioc_u32[1]); /* net mask */
+		break;
+
+	case IOC_LIBCFS_DEL_INTERFACE:
+		rc = ksocknal_del_interface(ni,
+					    data->ioc_u32[0]); /* IP address */
+		break;
+
+	case IOC_LIBCFS_GET_PEER: {
+		/* ioc_count is the lookup index in, the share count out */
+		__u32 myip = 0;
+		__u32 ip = 0;
+		int   port = 0;
+		int   conn_count = 0;
+		int   share_count = 0;
+
+		rc = ksocknal_get_peer_info(ni, data->ioc_count,
+					    &id, &myip, &ip, &port,
+					    &conn_count,  &share_count);
+		if (rc != 0)
+			break;
+
+		data->ioc_nid    = id.nid;
+		data->ioc_count  = share_count;
+		data->ioc_u32[0] = ip;
+		data->ioc_u32[1] = port;
+		data->ioc_u32[2] = myip;
+		data->ioc_u32[3] = conn_count;
+		data->ioc_u32[4] = id.pid;
+		rc = 0;
+		break;
+	}
+
+	case IOC_LIBCFS_ADD_PEER:
+		id.nid = data->ioc_nid;
+		id.pid = LUSTRE_SRV_LNET_PID;
+		rc = ksocknal_add_peer (ni, id,
+					data->ioc_u32[0], /* IP */
+					data->ioc_u32[1]); /* port */
+		break;
+
+	case IOC_LIBCFS_DEL_PEER:
+		id.nid = data->ioc_nid;
+		id.pid = LNET_PID_ANY;
+		rc = ksocknal_del_peer (ni, id,
+					data->ioc_u32[0]); /* IP */
+		break;
+
+	case IOC_LIBCFS_GET_CONN: {
+		/* ioc_count is the connection index in, txmem out */
+		ksock_conn_t *conn;
+		int	      txmem;
+		int	      rxmem;
+		int	      nagle;
+
+		conn = ksocknal_get_conn_by_idx (ni, data->ioc_count);
+		if (conn == NULL) {
+			rc = -ENOENT;
+			break;
+		}
+
+		ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle);
+
+		data->ioc_count  = txmem;
+		data->ioc_nid    = conn->ksnc_peer->ksnp_id.nid;
+		data->ioc_flags  = nagle;
+		data->ioc_u32[0] = conn->ksnc_ipaddr;
+		data->ioc_u32[1] = conn->ksnc_port;
+		data->ioc_u32[2] = conn->ksnc_myipaddr;
+		data->ioc_u32[3] = conn->ksnc_type;
+		data->ioc_u32[4] = conn->ksnc_scheduler->kss_info->ksi_cpt;
+		data->ioc_u32[5] = rxmem;
+		data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid;
+		/* release the connection once its fields are copied out */
+		ksocknal_conn_decref(conn);
+		rc = 0;
+		break;
+	}
+
+	case IOC_LIBCFS_CLOSE_CONNECTION:
+		id.nid = data->ioc_nid;
+		id.pid = LNET_PID_ANY;
+		rc = ksocknal_close_matching_conns (id,
+						    data->ioc_u32[0]);
+		break;
+
+	case IOC_LIBCFS_REGISTER_MYNID:
+		/* Ignore if this is a noop */
+		if (data->ioc_nid == ni->ni_nid) {
+			rc = 0;
+			break;
+		}
+
+		CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
+		       libcfs_nid2str(data->ioc_nid),
+		       libcfs_nid2str(ni->ni_nid));
+		rc = -EINVAL;
+		break;
+
+	case IOC_LIBCFS_PUSH_CONNECTION:
+		id.nid = data->ioc_nid;
+		id.pid = LNET_PID_ANY;
+		rc = ksocknal_push(ni, id);
+		break;
+
+	default:
+		rc = -EINVAL;
+		break;
+	}
+
+	return rc;
+}
+
+/*
+ * Free the module-wide buffers: the per-CPT scheduler arrays, the peer
+ * hash table and every cached idle noop tx descriptor.
+ *
+ * Asserts that no tx is active on entry.
+ */
+void
+ksocknal_free_buffers (void)
+{
+	struct list_head	 doomed;
+	ksock_tx_t		*tx;
+
+	LASSERT (atomic_read(&ksocknal_data.ksnd_nactive_txs) == 0);
+
+	if (ksocknal_data.ksnd_sched_info != NULL) {
+		struct ksock_sched_info	*info;
+		int			 cpt;
+
+		cfs_percpt_for_each(info, cpt, ksocknal_data.ksnd_sched_info) {
+			if (info->ksi_scheds == NULL)
+				continue;
+
+			LIBCFS_FREE(info->ksi_scheds,
+				    info->ksi_nthreads_max *
+				    sizeof(info->ksi_scheds[0]));
+		}
+		cfs_percpt_free(ksocknal_data.ksnd_sched_info);
+	}
+
+	LIBCFS_FREE (ksocknal_data.ksnd_peers,
+		     sizeof (struct list_head) *
+		     ksocknal_data.ksnd_peer_hash_size);
+
+	spin_lock(&ksocknal_data.ksnd_tx_lock);
+
+	if (list_empty(&ksocknal_data.ksnd_idle_noop_txs)) {
+		spin_unlock(&ksocknal_data.ksnd_tx_lock);
+		return;
+	}
+
+	/* splice the whole idle list onto a private head so the descriptors
+	 * can be freed after the lock is dropped */
+	list_add(&doomed, &ksocknal_data.ksnd_idle_noop_txs);
+	list_del_init(&ksocknal_data.ksnd_idle_noop_txs);
+	spin_unlock(&ksocknal_data.ksnd_tx_lock);
+
+	while (!list_empty(&doomed)) {
+		tx = list_entry(doomed.next, ksock_tx_t, tx_list);
+		list_del(&tx->tx_list);
+		LIBCFS_FREE(tx, tx->tx_desc_size);
+	}
+}
+
+/*
+ * Tear down the state shared by all socklnd nets.
+ *
+ * Order matters: first assert that every peer/conn/route list is empty,
+ * then raise ksnd_shuttingdown and wake every waitqueue, poll once a
+ * second until ksnd_nthreads drops to zero, and only then free the
+ * buffers and reset ksnd_init.  Finishes by dropping the module reference.
+ *
+ * Must only be called with no nets left (asserted).
+ */
+void
+ksocknal_base_shutdown(void)
+{
+	struct ksock_sched_info *info;
+	ksock_sched_t		*sched;
+	int			 cpt;
+	int			 idx;
+	int			 polls;
+
+	CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+	       atomic_read (&libcfs_kmemory));
+	LASSERT (ksocknal_data.ksnd_nnets == 0);
+
+	/* any init state other than DATA or ALL is unexpected here */
+	if (ksocknal_data.ksnd_init != SOCKNAL_INIT_ALL &&
+	    ksocknal_data.ksnd_init != SOCKNAL_INIT_DATA) {
+		LASSERT (0);
+	}
+
+	LASSERT (ksocknal_data.ksnd_peers != NULL);
+	for (idx = 0; idx < ksocknal_data.ksnd_peer_hash_size; idx++)
+		LASSERT (list_empty (&ksocknal_data.ksnd_peers[idx]));
+
+	LASSERT(list_empty(&ksocknal_data.ksnd_nets));
+	LASSERT (list_empty (&ksocknal_data.ksnd_enomem_conns));
+	LASSERT (list_empty (&ksocknal_data.ksnd_zombie_conns));
+	LASSERT (list_empty (&ksocknal_data.ksnd_connd_connreqs));
+	LASSERT (list_empty (&ksocknal_data.ksnd_connd_routes));
+
+	if (ksocknal_data.ksnd_sched_info != NULL) {
+		cfs_percpt_for_each(info, cpt, ksocknal_data.ksnd_sched_info) {
+			if (info->ksi_scheds == NULL)
+				continue;
+
+			for (idx = 0; idx < info->ksi_nthreads_max; idx++) {
+				sched = &info->ksi_scheds[idx];
+
+				LASSERT(list_empty(&sched->kss_tx_conns));
+				LASSERT(list_empty(&sched->kss_rx_conns));
+				LASSERT(list_empty(&sched->kss_zombie_noop_txs));
+				LASSERT(sched->kss_nconns == 0);
+			}
+		}
+	}
+
+	/* flag threads to terminate; wake and wait for them to die */
+	ksocknal_data.ksnd_shuttingdown = 1;
+	wake_up_all(&ksocknal_data.ksnd_connd_waitq);
+	wake_up_all(&ksocknal_data.ksnd_reaper_waitq);
+
+	if (ksocknal_data.ksnd_sched_info != NULL) {
+		cfs_percpt_for_each(info, cpt, ksocknal_data.ksnd_sched_info) {
+			if (info->ksi_scheds == NULL)
+				continue;
+
+			for (idx = 0; idx < info->ksi_nthreads_max; idx++) {
+				sched = &info->ksi_scheds[idx];
+				wake_up_all(&sched->kss_waitq);
+			}
+		}
+	}
+
+	/* the counter only selects the log level: a warning is emitted
+	 * each time it reaches a power of two */
+	polls = 4;
+	read_lock(&ksocknal_data.ksnd_global_lock);
+	while (ksocknal_data.ksnd_nthreads != 0) {
+		polls++;
+		CDEBUG(((polls & (-polls)) == polls) ? D_WARNING : D_NET, /* power of 2? */
+		       "waiting for %d threads to terminate\n",
+			ksocknal_data.ksnd_nthreads);
+		read_unlock(&ksocknal_data.ksnd_global_lock);
+		cfs_pause(cfs_time_seconds(1));
+		read_lock(&ksocknal_data.ksnd_global_lock);
+	}
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+
+	ksocknal_free_buffers();
+
+	ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING;
+
+	CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+	       atomic_read (&libcfs_kmemory));
+
+	module_put(THIS_MODULE);
+}
+
+/*
+ * Produce an incarnation number: the current wall-clock time expressed
+ * in microseconds.
+ *
+ * The value is used to tell one instance of the socknal from another, so
+ * it only needs to differ between two successive starts.
+ */
+__u64
+ksocknal_new_incarnation (void)
+{
+	struct timeval now;
+	__u64	       usecs;
+
+	do_gettimeofday(&now);
+
+	usecs = (__u64)now.tv_sec;
+	usecs *= 1000000;
+	usecs += now.tv_usec;
+
+	return usecs;
+}
+
+/*
+ * One-time initialisation of the state shared by all socklnd nets:
+ * the peer hash table, locks, lists and waitqueues, the per-CPT scheduler
+ * descriptors, the connection daemons and the reaper thread.
+ *
+ * Returns 0 on success, -ENOMEM if the peer hash cannot be allocated, or
+ * -ENETDOWN after undoing partial setup via ksocknal_base_shutdown().
+ *
+ * Fix over the previous version: when nconnds_max is below nconnds the
+ * *value* of nconnds is now copied into nconnds_max.  The old code
+ * assigned the pointer itself, which aliased the two tunables and left
+ * the storage behind nconnds_max holding the stale value.
+ */
+int
+ksocknal_base_startup(void)
+{
+	struct ksock_sched_info	*info;
+	int			rc;
+	int			i;
+
+	LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
+	LASSERT (ksocknal_data.ksnd_nnets == 0);
+
+	memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */
+
+	ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE;
+	LIBCFS_ALLOC (ksocknal_data.ksnd_peers,
+		      sizeof (struct list_head) *
+		      ksocknal_data.ksnd_peer_hash_size);
+	if (ksocknal_data.ksnd_peers == NULL)
+		return -ENOMEM;
+
+	for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++)
+		INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]);
+
+	rwlock_init(&ksocknal_data.ksnd_global_lock);
+	INIT_LIST_HEAD(&ksocknal_data.ksnd_nets);
+
+	spin_lock_init(&ksocknal_data.ksnd_reaper_lock);
+	INIT_LIST_HEAD (&ksocknal_data.ksnd_enomem_conns);
+	INIT_LIST_HEAD (&ksocknal_data.ksnd_zombie_conns);
+	INIT_LIST_HEAD (&ksocknal_data.ksnd_deathrow_conns);
+	init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq);
+
+	spin_lock_init(&ksocknal_data.ksnd_connd_lock);
+	INIT_LIST_HEAD (&ksocknal_data.ksnd_connd_connreqs);
+	INIT_LIST_HEAD (&ksocknal_data.ksnd_connd_routes);
+	init_waitqueue_head(&ksocknal_data.ksnd_connd_waitq);
+
+	spin_lock_init(&ksocknal_data.ksnd_tx_lock);
+	INIT_LIST_HEAD (&ksocknal_data.ksnd_idle_noop_txs);
+
+	/* NB memset above zeros whole of ksocknal_data */
+
+	/* flag lists/ptrs/locks initialised; from here on every failure
+	 * goes through ksocknal_base_shutdown(), which drops the module
+	 * reference taken below */
+	ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA;
+	try_module_get(THIS_MODULE);
+
+	ksocknal_data.ksnd_sched_info = cfs_percpt_alloc(lnet_cpt_table(),
+							 sizeof(*info));
+	if (ksocknal_data.ksnd_sched_info == NULL)
+		goto failed;
+
+	cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) {
+		ksock_sched_t	*sched;
+		int		nthrs;
+
+		nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
+		if (*ksocknal_tunables.ksnd_nscheds > 0) {
+			nthrs = min(nthrs, *ksocknal_tunables.ksnd_nscheds);
+		} else {
+			/* max to half of CPUs, assume another half should be
+			 * reserved for upper layer modules */
+			nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs);
+		}
+
+		info->ksi_nthreads_max = nthrs;
+		info->ksi_cpt = i;
+
+		LIBCFS_CPT_ALLOC(info->ksi_scheds, lnet_cpt_table(), i,
+				 info->ksi_nthreads_max * sizeof(*sched));
+		if (info->ksi_scheds == NULL)
+			goto failed;
+
+		/* initialise every scheduler slot, last to first */
+		for (; nthrs > 0; nthrs--) {
+			sched = &info->ksi_scheds[nthrs - 1];
+
+			sched->kss_info = info;
+			spin_lock_init(&sched->kss_lock);
+			INIT_LIST_HEAD(&sched->kss_rx_conns);
+			INIT_LIST_HEAD(&sched->kss_tx_conns);
+			INIT_LIST_HEAD(&sched->kss_zombie_noop_txs);
+			init_waitqueue_head(&sched->kss_waitq);
+		}
+	}
+
+	ksocknal_data.ksnd_connd_starting	 = 0;
+	ksocknal_data.ksnd_connd_failed_stamp     = 0;
+	ksocknal_data.ksnd_connd_starting_stamp   = cfs_time_current_sec();
+	/* must have at least 2 connds to remain responsive to accepts while
+	 * connecting */
+	if (*ksocknal_tunables.ksnd_nconnds < SOCKNAL_CONND_RESV + 1)
+		*ksocknal_tunables.ksnd_nconnds = SOCKNAL_CONND_RESV + 1;
+
+	/* the maximum may never be below the initial count; copy the value,
+	 * not the pointer */
+	if (*ksocknal_tunables.ksnd_nconnds_max <
+	    *ksocknal_tunables.ksnd_nconnds) {
+		*ksocknal_tunables.ksnd_nconnds_max =
+			*ksocknal_tunables.ksnd_nconnds;
+	}
+
+	for (i = 0; i < *ksocknal_tunables.ksnd_nconnds; i++) {
+		char name[16];
+
+		/* count the thread as starting before it is spawned */
+		spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+		ksocknal_data.ksnd_connd_starting++;
+		spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+
+		snprintf(name, sizeof(name), "socknal_cd%02d", i);
+		rc = ksocknal_thread_start(ksocknal_connd,
+					   (void *)((ulong_ptr_t)i), name);
+		if (rc != 0) {
+			spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+			ksocknal_data.ksnd_connd_starting--;
+			spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+			CERROR("Can't spawn socknal connd: %d\n", rc);
+			goto failed;
+		}
+	}
+
+	rc = ksocknal_thread_start(ksocknal_reaper, NULL, "socknal_reaper");
+	if (rc != 0) {
+		CERROR ("Can't spawn socknal reaper: %d\n", rc);
+		goto failed;
+	}
+
+	/* flag everything initialised */
+	ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL;
+
+	return 0;
+
+ failed:
+	ksocknal_base_shutdown();
+	return -ENETDOWN;
+}
+
+/*
+ * Debug helper: find the first peer in the hash that still belongs to
+ * @ni and log its state together with all of its routes and connections.
+ * Logs nothing when no such peer exists.
+ *
+ * Fix over the previous version: the bucket scan now stops as soon as a
+ * peer is found.  Previously only the inner list walk was left, so a
+ * later non-empty bucket without a match reset 'peer' to NULL and the
+ * peer that had been found was never reported.
+ */
+void
+ksocknal_debug_peerhash (lnet_ni_t *ni)
+{
+	ksock_peer_t	*peer = NULL;
+	struct list_head	*tmp;
+	int		i;
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	for (i = 0; peer == NULL &&
+		    i < ksocknal_data.ksnd_peer_hash_size; i++) {
+		list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) {
+			peer = list_entry (tmp, ksock_peer_t, ksnp_list);
+
+			if (peer->ksnp_ni == ni)
+				break;
+
+			/* not ours: make sure a walk that runs off the
+			 * end of the bucket leaves peer == NULL */
+			peer = NULL;
+		}
+	}
+
+	if (peer != NULL) {
+		ksock_route_t *route;
+		ksock_conn_t  *conn;
+
+		CWARN ("Active peer on shutdown: %s, ref %d, scnt %d, "
+		       "closing %d, accepting %d, err %d, zcookie "LPU64", "
+		       "txq %d, zc_req %d\n", libcfs_id2str(peer->ksnp_id),
+		       atomic_read(&peer->ksnp_refcount),
+		       peer->ksnp_sharecount, peer->ksnp_closing,
+		       peer->ksnp_accepting, peer->ksnp_error,
+		       peer->ksnp_zc_next_cookie,
+		       !list_empty(&peer->ksnp_tx_queue),
+		       !list_empty(&peer->ksnp_zc_req_list));
+
+		list_for_each (tmp, &peer->ksnp_routes) {
+			route = list_entry(tmp, ksock_route_t, ksnr_list);
+			CWARN ("Route: ref %d, schd %d, conn %d, cnted %d, "
+			       "del %d\n", atomic_read(&route->ksnr_refcount),
+			       route->ksnr_scheduled, route->ksnr_connecting,
+			       route->ksnr_connected, route->ksnr_deleted);
+		}
+
+		list_for_each (tmp, &peer->ksnp_conns) {
+			conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+			CWARN ("Conn: ref %d, sref %d, t %d, c %d\n",
+			       atomic_read(&conn->ksnc_conn_refcount),
+			       atomic_read(&conn->ksnc_sock_refcount),
+			       conn->ksnc_type, conn->ksnc_closing);
+		}
+	}
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+}
+
+/*
+ * Shut down the net attached to @ni (installed as
+ * the_ksocklnd.lnd_shutdown).
+ *
+ * Marks the net as shutting down, deletes all of its peers, polls once a
+ * second until the peer count reaches zero, then unlinks and frees the
+ * net.  When this was the last net the shared state is torn down too.
+ */
+void
+ksocknal_shutdown (lnet_ni_t *ni)
+{
+	ksock_net_t	  *net = ni->ni_data;
+	lnet_process_id_t  wildcard = {0};
+	int		   polls = 2;
+	int		   drained;
+	int		   n;
+
+	wildcard.nid =  LNET_NID_ANY;
+	wildcard.pid =  LNET_PID_ANY;
+
+	LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL);
+	LASSERT(ksocknal_data.ksnd_nnets > 0);
+
+	spin_lock_bh(&net->ksnn_lock);
+	net->ksnn_shutdown = 1;		 /* prevent new peers */
+	spin_unlock_bh(&net->ksnn_lock);
+
+	/* Delete all peers */
+	ksocknal_del_peer(ni, wildcard, 0);
+
+	/* Wait for all peer state to clean up */
+	for (;;) {
+		spin_lock_bh(&net->ksnn_lock);
+		drained = (net->ksnn_npeers == 0);
+		spin_unlock_bh(&net->ksnn_lock);
+
+		if (drained)
+			break;
+
+		/* the counter only selects the log level */
+		polls++;
+		CDEBUG(((polls & (-polls)) == polls) ? D_WARNING : D_NET, /* power of 2? */
+		       "waiting for %d peers to disconnect\n",
+		       net->ksnn_npeers);
+		cfs_pause(cfs_time_seconds(1));
+
+		ksocknal_debug_peerhash(ni);
+	}
+
+	for (n = 0; n < net->ksnn_ninterfaces; n++) {
+		LASSERT (net->ksnn_interfaces[n].ksni_npeers == 0);
+		LASSERT (net->ksnn_interfaces[n].ksni_nroutes == 0);
+	}
+
+	list_del(&net->ksnn_list);
+	LIBCFS_FREE(net, sizeof(*net));
+
+	ksocknal_data.ksnd_nnets--;
+	if (ksocknal_data.ksnd_nnets == 0)
+		ksocknal_base_shutdown();
+}
+
+/*
+ * Fill net->ksnn_interfaces[] with every IP interface that is up, except
+ * the loopback "lo", recording address, netmask and name.  Interfaces
+ * beyond LNET_MAX_INTERFACES are ignored with a warning.
+ *
+ * Returns the number of interfaces recorded (0 if none were usable), or
+ * the non-positive result of libcfs_ipif_enumerate() if enumeration
+ * failed.  net->ksnn_ninterfaces is NOT set here.
+ */
+int
+ksocknal_enumerate_interfaces(ksock_net_t *net)
+{
+	char	**names;
+	int	  nnames;
+	int	  nfound = 0;
+	int	  idx;
+	int	  rc;
+
+	nnames = libcfs_ipif_enumerate(&names);
+	if (nnames <= 0) {
+		CERROR("Can't enumerate interfaces: %d\n", nnames);
+		return nnames;
+	}
+
+	for (idx = 0; idx < nnames; idx++) {
+		ksock_interface_t *iface;
+		int		   up;
+		__u32		   ip;
+		__u32		   mask;
+
+		/* skip the loopback IF */
+		if (strcmp(names[idx], "lo") == 0)
+			continue;
+
+		rc = libcfs_ipif_query(names[idx], &up, &ip, &mask);
+		if (rc != 0) {
+			CWARN("Can't get interface %s info: %d\n",
+			      names[idx], rc);
+			continue;
+		}
+
+		if (!up) {
+			CWARN("Ignoring interface %s (down)\n",
+			      names[idx]);
+			continue;
+		}
+
+		if (nfound == LNET_MAX_INTERFACES) {
+			CWARN("Ignoring interface %s (too many interfaces)\n",
+			      names[idx]);
+			continue;
+		}
+
+		iface = &net->ksnn_interfaces[nfound];
+		iface->ksni_ipaddr = ip;
+		iface->ksni_netmask = mask;
+		strncpy(&iface->ksni_name[0], names[idx], IFNAMSIZ);
+		nfound++;
+	}
+
+	libcfs_ipif_free_enumeration(names, nnames);
+
+	if (nfound == 0)
+		CERROR("Can't find any usable interfaces\n");
+
+	return nfound;
+}
+
+/*
+ * Count the interfaces of @net whose device name is not used by any net
+ * already on ksocknal_data.ksnd_nets.
+ *
+ * Names are compared with any ":alias" suffix ignored.  The suffix is cut
+ * by temporarily writing a NUL over the colon in the stored name and is
+ * restored right after the comparison.
+ */
+int
+ksocknal_search_new_ipif(ksock_net_t *net)
+{
+	int	nnew = 0;
+	int	i;
+
+	for (i = 0; i < net->ksnn_ninterfaces; i++) {
+		char		*name = &net->ksnn_interfaces[i].ksni_name[0];
+		char		*alias = strchr(name, ':');
+		ksock_net_t	*other;
+		int		 known = 0;
+		int		 k;
+
+		if (alias != NULL) /* ignore alias device */
+			*alias = 0;
+
+		list_for_each_entry(other, &ksocknal_data.ksnd_nets,
+				    ksnn_list) {
+			for (k = 0; k < other->ksnn_ninterfaces; k++) {
+				char *name2 =
+					&other->ksnn_interfaces[k].ksni_name[0];
+				char *alias2 = strchr(name2, ':');
+
+				if (alias2 != NULL)
+					*alias2 = 0;
+
+				known = (strcmp(name, name2) == 0);
+
+				if (alias2 != NULL)
+					*alias2 = ':';
+
+				if (known)
+					break;
+			}
+
+			if (known)
+				break;
+		}
+
+		if (!known)
+			nnew++;
+
+		if (alias != NULL)
+			*alias = ':';
+	}
+
+	return nnew;
+}
+
+/*
+ * Start scheduler threads for the CPT described by @info.
+ *
+ * On the first call for a CPT (no threads yet) the thread count comes
+ * from the nscheds tunable or, when that is not positive, from the CPT
+ * weight, and is capped at ksi_nthreads_max.  On later calls at most two
+ * more threads are added, never exceeding ksi_nthreads_max.
+ *
+ * info->ksi_nthreads is advanced by the number of threads that actually
+ * started.  Returns 0, or the error of the first thread that failed.
+ */
+int
+ksocknal_start_schedulers(struct ksock_sched_info *info)
+{
+	int	nthrs;
+	int	started;
+	int	rc = 0;
+
+	if (info->ksi_nthreads != 0) {
+		LASSERT(info->ksi_nthreads <= info->ksi_nthreads_max);
+		/* increase two threads if there is new interface */
+		nthrs = min(2, info->ksi_nthreads_max - info->ksi_nthreads);
+	} else {
+		if (*ksocknal_tunables.ksnd_nscheds > 0) {
+			nthrs = info->ksi_nthreads_max;
+		} else {
+			nthrs = cfs_cpt_weight(lnet_cpt_table(),
+					       info->ksi_cpt);
+			nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs);
+			nthrs = min(SOCKNAL_NSCHEDS_HIGH, nthrs);
+		}
+		nthrs = min(nthrs, info->ksi_nthreads_max);
+	}
+
+	for (started = 0; started < nthrs; started++) {
+		ksock_sched_t	*sched;
+		char		 name[20];
+		long		 id;
+
+		id = KSOCK_THREAD_ID(info->ksi_cpt,
+				     info->ksi_nthreads + started);
+		sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)];
+		snprintf(name, sizeof(name), "socknal_sd%02d_%02d",
+			 info->ksi_cpt, (int)(sched - &info->ksi_scheds[0]));
+
+		rc = ksocknal_thread_start(ksocknal_scheduler,
+					   (void *)id, name);
+		if (rc != 0) {
+			CERROR("Can't spawn thread %d for scheduler[%d]: %d\n",
+			       info->ksi_cpt, info->ksi_nthreads + started,
+			       rc);
+			break;
+		}
+	}
+
+	/* account only for the threads that were really spawned */
+	info->ksi_nthreads += started;
+	return rc;
+}
+
+/*
+ * Make sure scheduler threads are running on every CPT that @net uses.
+ *
+ * net:   net being started; its interfaces decide whether it brings a
+ *        device not used by any registered net
+ * cpts:  array of @ncpts CPT ids, or NULL to use CPTs 0 .. @ncpts - 1
+ *
+ * A CPT that already runs threads only gets more when the net brings a
+ * new interface.  Returns 0, or the first ksocknal_start_schedulers()
+ * error.
+ */
+int
+ksocknal_net_start_threads(ksock_net_t *net, __u32 *cpts, int ncpts)
+{
+	int	have_new_if = ksocknal_search_new_ipif(net);
+	int	idx;
+	int	rc;
+
+	LASSERT(ncpts > 0 && ncpts <= cfs_cpt_number(lnet_cpt_table()));
+
+	for (idx = 0; idx < ncpts; idx++) {
+		struct ksock_sched_info	*info;
+		int			 cpt = idx;
+
+		if (cpts != NULL)
+			cpt = cpts[idx];
+
+		LASSERT(cpt < cfs_cpt_number(lnet_cpt_table()));
+		info = ksocknal_data.ksnd_sched_info[cpt];
+
+		if (have_new_if || info->ksi_nthreads <= 0) {
+			rc = ksocknal_start_schedulers(info);
+			if (rc != 0)
+				return rc;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Bring up a socklnd net on @ni (installed as the_ksocklnd.lnd_startup).
+ *
+ * Initialises the shared state on first use, allocates the per-net
+ * structure, copies the tunables into @ni, resolves the local interfaces
+ * (those named in ni->ni_interfaces, or the first usable one found by
+ * enumeration), starts scheduler threads and finally derives ni->ni_nid
+ * from the first interface's address.
+ *
+ * Returns 0 on success, the ksocknal_base_startup() error if that step
+ * fails, and -ENETDOWN for every other failure.
+ */
+int
+ksocknal_startup (lnet_ni_t *ni)
+{
+	ksock_net_t	*net;
+	int		 nifs;
+	int		 rc;
+
+	LASSERT (ni->ni_lnd == &the_ksocklnd);
+
+	if (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING) {
+		rc = ksocknal_base_startup();
+		if (rc != 0)
+			return rc;
+	}
+
+	LIBCFS_ALLOC(net, sizeof(*net));
+	if (net == NULL)
+		goto out_base;
+
+	spin_lock_init(&net->ksnn_lock);
+	net->ksnn_incarnation = ksocknal_new_incarnation();
+	ni->ni_data = net;
+	ni->ni_peertimeout    = *ksocknal_tunables.ksnd_peertimeout;
+	ni->ni_maxtxcredits   = *ksocknal_tunables.ksnd_credits;
+	ni->ni_peertxcredits  = *ksocknal_tunables.ksnd_peertxcredits;
+	ni->ni_peerrtrcredits = *ksocknal_tunables.ksnd_peerrtrcredits;
+
+	if (ni->ni_interfaces[0] != NULL) {
+		/* explicit interface list: every entry must exist and be up */
+		for (nifs = 0;
+		     nifs < LNET_MAX_INTERFACES &&
+		     ni->ni_interfaces[nifs] != NULL;
+		     nifs++) {
+			ksock_interface_t *iface = &net->ksnn_interfaces[nifs];
+			int		   up;
+
+			rc = libcfs_ipif_query(ni->ni_interfaces[nifs], &up,
+					       &iface->ksni_ipaddr,
+					       &iface->ksni_netmask);
+			if (rc != 0) {
+				CERROR("Can't get interface %s info: %d\n",
+				       ni->ni_interfaces[nifs], rc);
+				goto out_net;
+			}
+
+			if (!up) {
+				CERROR("Interface %s is down\n",
+				       ni->ni_interfaces[nifs]);
+				goto out_net;
+			}
+
+			strncpy(&iface->ksni_name[0],
+				ni->ni_interfaces[nifs], IFNAMSIZ);
+		}
+		net->ksnn_ninterfaces = nifs;
+	} else {
+		rc = ksocknal_enumerate_interfaces(net);
+		if (rc <= 0)
+			goto out_net;
+
+		/* only the first enumerated interface is put to use */
+		net->ksnn_ninterfaces = 1;
+	}
+
+	/* call it before add it to ksocknal_data.ksnd_nets */
+	rc = ksocknal_net_start_threads(net, ni->ni_cpts, ni->ni_ncpts);
+	if (rc != 0)
+		goto out_net;
+
+	ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid),
+				net->ksnn_interfaces[0].ksni_ipaddr);
+	list_add(&net->ksnn_list, &ksocknal_data.ksnd_nets);
+
+	ksocknal_data.ksnd_nnets++;
+
+	return 0;
+
+ out_net:
+	LIBCFS_FREE(net, sizeof(*net));
+ out_base:
+	/* undo the shared setup unless another net still relies on it */
+	if (ksocknal_data.ksnd_nnets == 0)
+		ksocknal_base_shutdown();
+
+	return -ENETDOWN;
+}
+
+
+/*
+ * Module exit routine (passed to cfs_module() below): unregisters
+ * the_ksocklnd from LNet, then calls ksocknal_tunables_fini().
+ */
+void __exit
+ksocknal_module_fini (void)
+{
+	lnet_unregister_lnd(&the_ksocklnd);
+	ksocknal_tunables_fini();
+}
+
+/*
+ * Module init routine (passed to cfs_module() below): fills in the
+ * method table of the_ksocklnd, initialises the tunables and, if that
+ * succeeds, registers the LND with LNet.
+ *
+ * Returns 0 on success or the ksocknal_tunables_init() error.
+ */
+int __init
+ksocknal_module_init (void)
+{
+	int    rc;
+
+	/* check ksnr_connected/connecting field large enough */
+	CLASSERT (SOCKLND_CONN_NTYPES <= 4);
+	CLASSERT (SOCKLND_CONN_ACK == SOCKLND_CONN_BULK_IN);
+
+	/* initialize the_ksocklnd */
+	the_ksocklnd.lnd_type     = SOCKLND;
+
+	/* lifecycle and control */
+	the_ksocklnd.lnd_startup  = ksocknal_startup;
+	the_ksocklnd.lnd_shutdown = ksocknal_shutdown;
+	the_ksocklnd.lnd_ctl      = ksocknal_ctl;
+
+	/* data path */
+	the_ksocklnd.lnd_send     = ksocknal_send;
+	the_ksocklnd.lnd_recv     = ksocknal_recv;
+
+	/* peer health and incoming connections */
+	the_ksocklnd.lnd_notify   = ksocknal_notify;
+	the_ksocklnd.lnd_query    = ksocknal_query;
+	the_ksocklnd.lnd_accept   = ksocknal_accept;
+
+	rc = ksocknal_tunables_init();
+	if (rc == 0)
+		lnet_register_lnd(&the_ksocklnd);
+
+	return rc;
+}
+
+/* Module metadata. */
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Kernel TCP Socket LND v3.0.0");
+MODULE_LICENSE("GPL");
+
+/* NOTE(review): cfs_module() presumably declares the init/exit entry points
+ * for the module under the given name and version -- confirm in libcfs. */
+cfs_module(ksocknal, "3.0.0", ksocknal_module_init, ksocknal_module_fini);
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h
new file mode 100644
index 000000000000..b483e0c3a69a
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h
@@ -0,0 +1,602 @@
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_PORTAL_ALLOC
+#define DEBUG_SUBSYSTEM S_LND
+
+#include "socklnd_lib-linux.h"
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/socklnd.h>
+#include <linux/lnet/lnet-sysctl.h>
+
+#define SOCKNAL_PEER_HASH_SIZE  101	     /* # peer lists */
+#define SOCKNAL_RESCHED	 100	     /* # scheduler loops before reschedule */
+#define SOCKNAL_INSANITY_RECONN 5000	    /* reconnect count suggesting connd is retrying endlessly (TODO confirm) */
+#define SOCKNAL_ENOMEM_RETRY    CFS_TICK	/* jiffies between retries */
+
+#define SOCKNAL_SINGLE_FRAG_TX      0	   /* non-zero: disable multi-fragment sends */
+#define SOCKNAL_SINGLE_FRAG_RX      0	   /* non-zero: disable multi-fragment receives */
+
+#define SOCKNAL_VERSION_DEBUG       0	   /* non-zero: enable protocol version debugging */
+
+/* Non-zero: risk kmap deadlock on multi-frag I/O (backs off to single-frag if
+ * zero).  There is no risk if we're not running on a CONFIG_HIGHMEM platform. */
+#ifdef CONFIG_HIGHMEM
+# define SOCKNAL_RISK_KMAP_DEADLOCK  0
+#else
+# define SOCKNAL_RISK_KMAP_DEADLOCK  1
+#endif
+
+struct ksock_sched_info;
+
+typedef struct				  /* per scheduler state */
+{
+	spinlock_t		kss_lock;	/* serialise */
+	struct list_head		kss_rx_conns;	/* conns waiting to be read */
+	/* conns waiting to be written */
+	struct list_head		kss_tx_conns;
+	/* zombie noop tx list */
+	struct list_head		kss_zombie_noop_txs;
+	wait_queue_head_t		kss_waitq;	/* where scheduler sleeps */
+	/* # connections assigned to this scheduler */
+	int			kss_nconns;
+	struct ksock_sched_info	*kss_info;	/* sched info that owns this scheduler */
+	struct page		*kss_rx_scratch_pgs[LNET_MAX_IOV];
+	struct iovec		kss_scratch_iov[LNET_MAX_IOV];
+} ksock_sched_t;
+
+struct ksock_sched_info {		/* a group of schedulers and its CPT id */
+	int			ksi_nthreads_max; /* max allowed threads */
+	int			ksi_nthreads;	/* number of threads */
+	int			ksi_cpt;	/* CPT id */
+	ksock_sched_t		*ksi_scheds;	/* array of schedulers */
+};
+
+#define KSOCK_CPT_SHIFT			16	/* thread id = (cpt << 16) | sid */
+#define KSOCK_THREAD_ID(cpt, sid)	(((cpt) << KSOCK_CPT_SHIFT) | (sid))
+#define KSOCK_THREAD_CPT(id)		((id) >> KSOCK_CPT_SHIFT)	/* bits above the shift */
+#define KSOCK_THREAD_SID(id)		((id) & ((1UL << KSOCK_CPT_SHIFT) - 1)) /* low 16 bits */
+
+typedef struct				  /* in-use interface */
+{
+	__u32		ksni_ipaddr;		/* interface's IP address */
+	__u32		ksni_netmask;		/* interface's network mask */
+	int		ksni_nroutes;		/* # routes using this interface (active) */
+	int		ksni_npeers;		/* # peers using this interface (passive) */
+	char		ksni_name[IFNAMSIZ];	/* interface name */
+} ksock_interface_t;
+
+typedef struct				/* tunables; each field is a pointer to the value */
+{
+	/* "stuck" socket timeout (seconds) */
+	int	      *ksnd_timeout;
+	/* # scheduler threads in each pool while starting */
+	int		 *ksnd_nscheds;
+	int	      *ksnd_nconnds;	 /* # connection daemons */
+	int	      *ksnd_nconnds_max;     /* max # connection daemons */
+	int	      *ksnd_min_reconnectms; /* first connection retry after (ms)... */
+	int	      *ksnd_max_reconnectms; /* ...exponentially increasing to this */
+	int	      *ksnd_eager_ack;       /* make TCP ack eagerly? */
+	int	      *ksnd_typed_conns;     /* drive sockets by type? */
+	int	      *ksnd_min_bulk;	/* smallest "large" message */
+	int	      *ksnd_tx_buffer_size;  /* socket tx buffer size */
+	int	      *ksnd_rx_buffer_size;  /* socket rx buffer size */
+	int	      *ksnd_nagle;	   /* enable NAGLE? */
+	int	      *ksnd_round_robin;     /* round robin for multiple interfaces */
+	int	      *ksnd_keepalive;       /* # secs for sending keepalive NOOP */
+	int	      *ksnd_keepalive_idle;  /* # idle secs before 1st probe */
+	int	      *ksnd_keepalive_count; /* # probes */
+	int	      *ksnd_keepalive_intvl; /* time between probes */
+	int	      *ksnd_credits;	 /* # concurrent sends */
+	int	      *ksnd_peertxcredits;   /* # concurrent sends to 1 peer */
+	int	      *ksnd_peerrtrcredits;  /* # per-peer router buffer credits */
+	int	      *ksnd_peertimeout;     /* seconds to consider peer dead */
+	int	      *ksnd_enable_csum;     /* enable checksum */
+	int	      *ksnd_inject_csum_error; /* set non-zero to inject checksum error */
+	int	      *ksnd_nonblk_zcack;    /* always send zc-ack on non-blocking connection */
+	unsigned int     *ksnd_zc_min_payload;  /* minimum zero copy payload size */
+	int	      *ksnd_zc_recv;	 /* enable ZC receive (for Chelsio TOE) */
+	int	      *ksnd_zc_recv_min_nfrags; /* minimum # of fragments to enable ZC receive */
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+	ctl_table_header_t *ksnd_sysctl;   /* sysctl interface */
+#endif
+} ksock_tunables_t;
+
+typedef struct				/* per-network state, chained on ksnd_nets */
+{
+	__u64		  ksnn_incarnation;	/* my epoch */
+	spinlock_t	  ksnn_lock;		/* serialise */
+	struct list_head	  ksnn_list;		/* chain on global list */
+	int		  ksnn_npeers;		/* # peers */
+	int		  ksnn_shutdown;	/* shutting down? */
+	int		  ksnn_ninterfaces;	/* # entries used in ksnn_interfaces */
+	ksock_interface_t ksnn_interfaces[LNET_MAX_INTERFACES];
+} ksock_net_t;
+
+/** connd timeout (units not visible here -- presumably seconds; confirm) */
+#define SOCKNAL_CONND_TIMEOUT  120
+/** # threads reserved for accepting & creating new connd */
+#define SOCKNAL_CONND_RESV     1
+
+typedef struct				/* module-wide state; see ksocknal_data */
+{
+	int			ksnd_init;	/* initialisation state */
+	int			ksnd_nnets;	/* # networks set up */
+	struct list_head		ksnd_nets;	/* list of nets */
+	/* stabilize peer/conn ops */
+	rwlock_t		ksnd_global_lock;
+	/* hash table of all my known peers */
+	struct list_head		*ksnd_peers;
+	int			ksnd_peer_hash_size; /* size of ksnd_peers */
+
+	int			ksnd_nthreads;	/* # live threads */
+	int			ksnd_shuttingdown; /* tell threads to exit */
+	/* schedulers information */
+	struct ksock_sched_info	**ksnd_sched_info;
+
+	atomic_t      ksnd_nactive_txs;    /* # active txs */
+
+	struct list_head	ksnd_deathrow_conns; /* conns to close: reaper_lock */
+	struct list_head	ksnd_zombie_conns;   /* conns to free: reaper_lock */
+	struct list_head	ksnd_enomem_conns;   /* conns to retry: reaper_lock */
+	wait_queue_head_t       ksnd_reaper_waitq;   /* reaper sleeps here */
+	cfs_time_t	ksnd_reaper_waketime;/* when reaper will wake */
+	spinlock_t	  ksnd_reaper_lock;	/* serialise */
+
+	int	       ksnd_enomem_tx;      /* test ENOMEM sender */
+	int	       ksnd_stall_tx;       /* test sluggish sender */
+	int	       ksnd_stall_rx;       /* test sluggish receiver */
+
+	struct list_head	ksnd_connd_connreqs; /* incoming connection requests */
+	struct list_head	ksnd_connd_routes;   /* routes waiting to be connected */
+	wait_queue_head_t       ksnd_connd_waitq;    /* connds sleep here */
+	int	       ksnd_connd_connecting;/* # connds connecting */
+	/** time stamp of the last failed connecting attempt */
+	long	      ksnd_connd_failed_stamp;
+	/** # starting connd */
+	unsigned	  ksnd_connd_starting;
+	/** time stamp of the last starting connd */
+	long	      ksnd_connd_starting_stamp;
+	/** # running connd */
+	unsigned	  ksnd_connd_running;
+	spinlock_t	  ksnd_connd_lock;	/* serialise */
+
+	struct list_head	  ksnd_idle_noop_txs;	/* freed noop txs kept for reuse */
+	spinlock_t	  ksnd_tx_lock;		/* serialise, g_lock unsafe */
+
+} ksock_nal_data_t;
+
+#define SOCKNAL_INIT_NOTHING    0	/* presumably ksnd_init values -- confirm */
+#define SOCKNAL_INIT_DATA       1
+#define SOCKNAL_INIT_ALL	2
+
+/* A packet just assembled for transmission is represented by 1 or more
+ * struct iovec fragments (the first frag contains the portals header),
+ * followed by 0 or more lnet_kiov_t fragments.
+ *
+ * On the receive side, initially 1 struct iovec fragment is posted for
+ * receive (the header).  Once the header has been received, the payload is
+ * received into either struct iovec or lnet_kiov_t fragments, depending on
+ * what the header matched or whether the message needs forwarding. */
+
+struct ksock_conn;			      /* forward ref */
+struct ksock_peer;			      /* forward ref */
+struct ksock_route;			     /* forward ref */
+struct ksock_proto;			     /* forward ref */
+
+typedef struct				  /* transmit packet */
+{
+	struct list_head     tx_list;	/* queue on conn for transmission etc */
+	struct list_head     tx_zc_list;     /* queue on peer for ZC request */
+	atomic_t   tx_refcount;    /* tx reference count */
+	int	    tx_nob;	 /* # packet bytes */
+	int	    tx_resid;       /* residual bytes (still to be sent) */
+	int	    tx_niov;	/* # packet iovec frags */
+	struct iovec  *tx_iov;	 /* packet iovec frags */
+	int	    tx_nkiov;       /* # packet page frags */
+	unsigned short tx_zc_aborted;  /* aborted ZC request */
+	unsigned short tx_zc_capable:1; /* payload is large enough for ZC */
+	unsigned short tx_zc_checked:1; /* Have I checked if I should ZC? */
+	unsigned short tx_nonblk:1;    /* it's a non-blocking ACK */
+	lnet_kiov_t   *tx_kiov;	/* packet page frags */
+	struct ksock_conn  *tx_conn;	/* owning conn */
+	lnet_msg_t    *tx_lnetmsg;     /* lnet message for lnet_finalize() */
+	cfs_time_t     tx_deadline;    /* when (in jiffies) tx times out */
+	ksock_msg_t    tx_msg;	 /* socklnd message buffer */
+	int	    tx_desc_size;   /* allocated size of this descriptor */
+	union {
+		struct {
+			struct iovec iov;       /* virt hdr */
+			lnet_kiov_t  kiov[0];   /* paged payload */
+		}		  paged;
+		struct {
+			struct iovec iov[1];    /* virt hdr + payload */
+		}		  virt;
+	}		       tx_frags;
+} ksock_tx_t;
+
+#define KSOCK_NOOP_TX_SIZE  ((int)offsetof(ksock_tx_t, tx_frags.paged.kiov[0])) /* tx size, no kiov frags */
+
+/* network zero copy callback descriptor embedded in ksock_tx_t */
+
+/* space for the rx frag descriptors; we either read a single contiguous
+ * header, or up to LNET_MAX_IOV frags of payload of either type. */
+typedef union {
+	struct iovec     iov[LNET_MAX_IOV];	/* iovec frags */
+	lnet_kiov_t      kiov[LNET_MAX_IOV];	/* page frags */
+} ksock_rxiovspace_t;
+
+#define SOCKNAL_RX_KSM_HEADER   1	       /* reading ksock message header */
+#define SOCKNAL_RX_LNET_HEADER  2	       /* reading lnet message header */
+#define SOCKNAL_RX_PARSE	3	       /* calling lnet_parse() */
+#define SOCKNAL_RX_PARSE_WAIT   4	       /* waiting to be told to read the body */
+#define SOCKNAL_RX_LNET_PAYLOAD 5	       /* reading lnet payload (to deliver here) */
+#define SOCKNAL_RX_SLOP	 6	       /* skipping the body */
+
+typedef struct ksock_conn
+{
+	struct ksock_peer  *ksnc_peer;	 /* owning peer */
+	struct ksock_route *ksnc_route;	/* owning route */
+	struct list_head	  ksnc_list;	 /* stash on peer's conn list */
+	socket_t       *ksnc_sock;	 /* actual socket */
+	void	       *ksnc_saved_data_ready; /* socket's original data_ready() callback */
+	void	       *ksnc_saved_write_space; /* socket's original write_space() callback */
+	atomic_t	ksnc_conn_refcount; /* conn refcount */
+	atomic_t	ksnc_sock_refcount; /* sock refcount */
+	ksock_sched_t      *ksnc_scheduler;  /* who schedules this connection */
+	__u32	       ksnc_myipaddr;   /* my IP */
+	__u32	       ksnc_ipaddr;     /* peer's IP */
+	int		 ksnc_port;       /* peer's port */
+	signed int	  ksnc_type:3;     /* type of connection,
+					      * should be signed value */
+	unsigned int	    ksnc_closing:1;  /* being shut down */
+	unsigned int	    ksnc_flip:1;     /* flip or not, only for V2.x */
+	unsigned int	    ksnc_zc_capable:1; /* zero-copy capable */
+	struct ksock_proto *ksnc_proto;      /* protocol for the connection */
+
+	/* READER */
+	struct list_head  ksnc_rx_list;     /* where I enq waiting input or a forwarding descriptor */
+	cfs_time_t	    ksnc_rx_deadline; /* when (in jiffies) receive times out */
+	__u8		  ksnc_rx_started;  /* started receiving a message */
+	__u8		  ksnc_rx_ready;    /* data ready to read */
+	__u8		  ksnc_rx_scheduled;/* being progressed */
+	__u8		  ksnc_rx_state;    /* what is being read */
+	int		   ksnc_rx_nob_left; /* # bytes to next hdr/body */
+	int		   ksnc_rx_nob_wanted; /* bytes actually wanted */
+	int		   ksnc_rx_niov;     /* # iovec frags */
+	struct iovec	 *ksnc_rx_iov;      /* the iovec frags */
+	int		   ksnc_rx_nkiov;    /* # page frags */
+	lnet_kiov_t	  *ksnc_rx_kiov;     /* the page frags */
+	ksock_rxiovspace_t    ksnc_rx_iov_space;/* space for frag descriptors */
+	__u32		 ksnc_rx_csum;     /* partial checksum for incoming data */
+	void		 *ksnc_cookie;      /* rx lnet_finalize passthru arg */
+	ksock_msg_t	   ksnc_msg;	 /* incoming message buffer:
+						 * V2.x message takes the
+						 * whole struct
+						 * V1.x message is a bare
+						 * lnet_hdr_t, it's stored in
+						 * ksnc_msg.ksm_u.lnetmsg */
+
+	/* WRITER */
+	struct list_head	    ksnc_tx_list;     /* where I enq waiting for output space */
+	struct list_head	    ksnc_tx_queue;    /* packets waiting to be sent */
+	ksock_tx_t	   *ksnc_tx_carrier;  /* next TX that can carry a LNet message or ZC-ACK */
+	cfs_time_t	    ksnc_tx_deadline; /* when (in jiffies) tx times out */
+	int		   ksnc_tx_bufnob;     /* send buffer marker */
+	atomic_t	  ksnc_tx_nob;	/* # bytes queued */
+	int		   ksnc_tx_ready;      /* write space */
+	int		   ksnc_tx_scheduled;  /* being progressed */
+	cfs_time_t	    ksnc_tx_last_post;  /* time stamp of the last posted TX */
+} ksock_conn_t;
+
+typedef struct ksock_route
+{
+	struct list_head	    ksnr_list;	/* chain on peer route list */
+	struct list_head	    ksnr_connd_list;  /* chain on ksnd_connd_routes */
+	struct ksock_peer    *ksnr_peer;	/* owning peer */
+	atomic_t	  ksnr_refcount;    /* # users */
+	cfs_time_t	    ksnr_timeout;     /* when (in jiffies) reconnection can happen next */
+	cfs_duration_t	ksnr_retry_interval; /* how long between retries */
+	__u32		 ksnr_myipaddr;    /* my IP */
+	__u32		 ksnr_ipaddr;      /* IP address to connect to */
+	int		   ksnr_port;	/* port to connect to */
+	unsigned int	  ksnr_scheduled:1; /* scheduled for attention */
+	unsigned int	  ksnr_connecting:1;/* connection establishment in progress */
+	unsigned int	  ksnr_connected:4; /* connections established by type */
+	unsigned int	  ksnr_deleted:1;   /* been removed from peer? */
+	unsigned int	  ksnr_share_count; /* created explicitly? */
+	int		   ksnr_conn_count;  /* # conns established by this route */
+} ksock_route_t;
+
+#define SOCKNAL_KEEPALIVE_PING	  1       /* cookie for keepalive ping */
+
+typedef struct ksock_peer
+{
+	struct list_head	    ksnp_list;	/* stash on global peer list */
+	cfs_time_t	    ksnp_last_alive;  /* when (in jiffies) I was last alive */
+	lnet_process_id_t     ksnp_id;       /* who's on the other end(s) */
+	atomic_t	  ksnp_refcount; /* # users */
+	int		   ksnp_sharecount;  /* lconf usage counter */
+	int		   ksnp_closing;  /* being closed */
+	int		   ksnp_accepting;/* # passive connections pending */
+	int		   ksnp_error;    /* errno on closing last conn */
+	__u64		 ksnp_zc_next_cookie;/* ZC completion cookie */
+	__u64		 ksnp_incarnation;   /* latest known peer incarnation */
+	struct ksock_proto   *ksnp_proto;    /* latest known peer protocol */
+	struct list_head	    ksnp_conns;    /* all active connections */
+	struct list_head	    ksnp_routes;   /* routes */
+	struct list_head	    ksnp_tx_queue; /* waiting packets */
+	spinlock_t	      ksnp_lock;	/* serialize, g_lock unsafe */
+	struct list_head	    ksnp_zc_req_list;   /* zero copy requests waiting for ACK */
+	cfs_time_t	    ksnp_send_keepalive; /* time to send keepalive */
+	lnet_ni_t	    *ksnp_ni;       /* which network */
+	int		   ksnp_n_passive_ips; /* presumably # entries used in ksnp_passive_ips -- confirm */
+	__u32		 ksnp_passive_ips[LNET_MAX_INTERFACES]; /* preferred local interfaces */
+} ksock_peer_t;
+
+typedef struct ksock_connreq		/* incoming connection request */
+{
+	struct list_head	    ksncr_list;     /* stash on ksnd_connd_connreqs */
+	lnet_ni_t	    *ksncr_ni;       /* chosen NI */
+	socket_t	 *ksncr_sock;     /* accepted socket */
+} ksock_connreq_t;
+
+extern ksock_nal_data_t ksocknal_data;		/* defined elsewhere */
+extern ksock_tunables_t ksocknal_tunables;	/* defined elsewhere */
+
+#define SOCKNAL_MATCH_NO	0	/* TX can't match type of connection */
+#define SOCKNAL_MATCH_YES       1	/* TX matches type of connection */
+#define SOCKNAL_MATCH_MAY       2	/* TX can be sent on the connection, but it is not preferred */
+
+typedef struct ksock_proto
+{
+	int	   pro_version;					      /* version number of protocol */
+	int	 (*pro_send_hello)(ksock_conn_t *, ksock_hello_msg_t *);     /* handshake: send hello */
+	int	 (*pro_recv_hello)(ksock_conn_t *, ksock_hello_msg_t *, int);/* handshake: receive hello */
+	void	(*pro_pack)(ksock_tx_t *);				  /* message pack */
+	void	(*pro_unpack)(ksock_msg_t *);			       /* message unpack */
+	ksock_tx_t *(*pro_queue_tx_msg)(ksock_conn_t *, ksock_tx_t *);	  /* queue tx on the connection */
+	int	 (*pro_queue_tx_zcack)(ksock_conn_t *, ksock_tx_t *, __u64); /* queue ZC ack on the connection */
+	int	 (*pro_handle_zcreq)(ksock_conn_t *, __u64, int);	    /* handle ZC request */
+	int	 (*pro_handle_zcack)(ksock_conn_t *, __u64, __u64);	  /* handle ZC ACK */
+	int	 (*pro_match_tx)(ksock_conn_t *, ksock_tx_t *, int);	 /* does msg type match the connection type?
+										 * return value:
+										 *   SOCKNAL_MATCH_NO  : no
+										 *   SOCKNAL_MATCH_YES : matching type
+										 *   SOCKNAL_MATCH_MAY : can be backup */
+} ksock_proto_t;
+
+extern ksock_proto_t ksocknal_protocol_v1x;
+extern ksock_proto_t ksocknal_protocol_v2x;
+extern ksock_proto_t ksocknal_protocol_v3x;
+
+#define KSOCK_PROTO_V1_MAJOR    LNET_PROTO_TCP_VERSION_MAJOR
+#define KSOCK_PROTO_V1_MINOR    LNET_PROTO_TCP_VERSION_MINOR
+#define KSOCK_PROTO_V1	  KSOCK_PROTO_V1_MAJOR	/* V1 is identified by its major number */
+
+#ifndef CPU_MASK_NONE
+#define CPU_MASK_NONE   0UL	/* fallback, only if not already defined */
+#endif
+
+/* Bitmask of connection types: only ANY when the typed_conns tunable is
+ * zero, otherwise CONTROL, BULK_IN and BULK_OUT. */
+static inline int ksocknal_route_mask(void)
+{
+	if (*ksocknal_tunables.ksnd_typed_conns != 0)
+		return (1 << SOCKLND_CONN_CONTROL) |
+		       (1 << SOCKLND_CONN_BULK_IN) |
+		       (1 << SOCKLND_CONN_BULK_OUT);
+	return 1 << SOCKLND_CONN_ANY;
+}
+
+/* ksnd_peers bucket for @nid: (unsigned int)nid modulo the table size. */
+static inline struct list_head *ksocknal_nid2peerlist(lnet_nid_t nid)
+{
+	unsigned int size = ksocknal_data.ksnd_peer_hash_size;
+
+	return &ksocknal_data.ksnd_peers[(unsigned int)nid % size];
+}
+
+/* Take a conn reference; the refcount must already be positive (asserted). */
+static inline void ksocknal_conn_addref(ksock_conn_t *conn)
+{
+	LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0);
+	atomic_inc(&conn->ksnc_conn_refcount);
+}
+
+extern void ksocknal_queue_zombie_conn (ksock_conn_t *conn);
+extern void ksocknal_finalize_zcreq(ksock_conn_t *conn);
+
+/* Drop a conn reference; the last put calls ksocknal_queue_zombie_conn(). */
+static inline void ksocknal_conn_decref(ksock_conn_t *conn)
+{
+	LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0);
+	if (atomic_dec_and_test(&conn->ksnc_conn_refcount))
+		ksocknal_queue_zombie_conn(conn);
+}
+
+/* Take a socket reference under ksnd_global_lock unless the conn is
+ * closing.  Returns 0 on success, -ESHUTDOWN if the conn is closing. */
+static inline int ksocknal_connsock_addref(ksock_conn_t *conn)
+{
+	int closing;
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+	closing = conn->ksnc_closing;
+	if (!closing) {
+		LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0);
+		atomic_inc(&conn->ksnc_sock_refcount);
+	}
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+	return closing ? -ESHUTDOWN : 0;
+}
+
+/* Drop a sock ref; the last put (conn must be closing) releases the socket. */
+static inline void ksocknal_connsock_decref(ksock_conn_t *conn)
+{
+	LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0);
+	if (!atomic_dec_and_test(&conn->ksnc_sock_refcount))
+		return;
+	LASSERT(conn->ksnc_closing);
+	libcfs_sock_release(conn->ksnc_sock);
+	conn->ksnc_sock = NULL;
+	ksocknal_finalize_zcreq(conn);
+}
+
+/* Take a tx reference; the refcount must already be positive (asserted). */
+static inline void ksocknal_tx_addref(ksock_tx_t *tx)
+{
+	LASSERT(atomic_read(&tx->tx_refcount) > 0);
+	atomic_inc(&tx->tx_refcount);
+}
+
+extern void ksocknal_tx_prep (ksock_conn_t *, ksock_tx_t *tx);
+extern void ksocknal_tx_done (lnet_ni_t *ni, ksock_tx_t *tx);
+
+/* Drop a tx reference; the last put calls ksocknal_tx_done(NULL, tx). */
+static inline void ksocknal_tx_decref(ksock_tx_t *tx)
+{
+	LASSERT(atomic_read(&tx->tx_refcount) > 0);
+	if (atomic_dec_and_test(&tx->tx_refcount))
+		ksocknal_tx_done(NULL, tx);
+}
+
+/* Take a route reference; the refcount must already be positive (asserted). */
+static inline void ksocknal_route_addref(ksock_route_t *route)
+{
+	LASSERT(atomic_read(&route->ksnr_refcount) > 0);
+	atomic_inc(&route->ksnr_refcount);
+}
+
+extern void ksocknal_destroy_route (ksock_route_t *route);
+
+/* Drop a route reference; the last put calls ksocknal_destroy_route(). */
+static inline void ksocknal_route_decref(ksock_route_t *route)
+{
+	LASSERT(atomic_read(&route->ksnr_refcount) > 0);
+	if (atomic_dec_and_test(&route->ksnr_refcount))
+		ksocknal_destroy_route(route);
+}
+
+/* Take a peer reference; the refcount must already be positive (asserted). */
+static inline void ksocknal_peer_addref(ksock_peer_t *peer)
+{
+	LASSERT(atomic_read(&peer->ksnp_refcount) > 0);
+	atomic_inc(&peer->ksnp_refcount);
+}
+
+extern void ksocknal_destroy_peer (ksock_peer_t *peer);
+
+/* Drop a peer reference; the last put calls ksocknal_destroy_peer(). */
+static inline void ksocknal_peer_decref(ksock_peer_t *peer)
+{
+	LASSERT(atomic_read(&peer->ksnp_refcount) > 0);
+	if (atomic_dec_and_test(&peer->ksnp_refcount))
+		ksocknal_destroy_peer(peer);
+}
+
+int ksocknal_startup (lnet_ni_t *ni);
+void ksocknal_shutdown (lnet_ni_t *ni);
+int ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
+int ksocknal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int ksocknal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+		  int delayed, unsigned int niov,
+		  struct iovec *iov, lnet_kiov_t *kiov,
+		  unsigned int offset, unsigned int mlen, unsigned int rlen);
+int ksocknal_accept(lnet_ni_t *ni, socket_t *sock);
+
+extern int ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ip, int port);
+extern ksock_peer_t *ksocknal_find_peer_locked (lnet_ni_t *ni, lnet_process_id_t id);
+extern ksock_peer_t *ksocknal_find_peer (lnet_ni_t *ni, lnet_process_id_t id);
+extern void ksocknal_peer_failed (ksock_peer_t *peer);
+extern int ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
+				 socket_t *sock, int type);
+extern void ksocknal_close_conn_locked (ksock_conn_t *conn, int why);
+extern void ksocknal_terminate_conn (ksock_conn_t *conn);
+extern void ksocknal_destroy_conn (ksock_conn_t *conn);
+extern int  ksocknal_close_peer_conns_locked (ksock_peer_t *peer,
+					      __u32 ipaddr, int why);
+extern int ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why);
+extern int ksocknal_close_matching_conns (lnet_process_id_t id, __u32 ipaddr);
+extern ksock_conn_t *ksocknal_find_conn_locked(ksock_peer_t *peer,
+					       ksock_tx_t *tx, int nonblk);
+
+extern int  ksocknal_launch_packet(lnet_ni_t *ni, ksock_tx_t *tx,
+				   lnet_process_id_t id);
+extern ksock_tx_t *ksocknal_alloc_tx(int type, int size);
+extern void ksocknal_free_tx (ksock_tx_t *tx);
+extern ksock_tx_t *ksocknal_alloc_tx_noop(__u64 cookie, int nonblk);
+extern void ksocknal_next_tx_carrier(ksock_conn_t *conn);
+extern void ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn);
+extern void ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist,
+				  int error);
+extern void ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive);
+extern void ksocknal_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when);
+extern int ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name);
+extern void ksocknal_thread_fini (void);
+extern void ksocknal_launch_all_connections_locked (ksock_peer_t *peer);
+extern ksock_route_t *ksocknal_find_connectable_route_locked (ksock_peer_t *peer);
+extern ksock_route_t *ksocknal_find_connecting_route_locked (ksock_peer_t *peer);
+extern int ksocknal_new_packet (ksock_conn_t *conn, int skip);
+extern int ksocknal_scheduler (void *arg);
+extern int ksocknal_connd (void *arg);
+extern int ksocknal_reaper (void *arg);
+extern int ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+				lnet_nid_t peer_nid, ksock_hello_msg_t *hello);
+extern int ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+				ksock_hello_msg_t *hello, lnet_process_id_t *id,
+				__u64 *incarnation);
+extern void ksocknal_read_callback(ksock_conn_t *conn);
+extern void ksocknal_write_callback(ksock_conn_t *conn);
+
+extern int ksocknal_lib_zc_capable(ksock_conn_t *conn);
+extern void ksocknal_lib_save_callback(socket_t *sock, ksock_conn_t *conn);
+extern void ksocknal_lib_set_callback(socket_t *sock,  ksock_conn_t *conn);
+extern void ksocknal_lib_reset_callback(socket_t *sock, ksock_conn_t *conn);
+extern void ksocknal_lib_push_conn (ksock_conn_t *conn);
+extern int ksocknal_lib_get_conn_addrs (ksock_conn_t *conn);
+extern int ksocknal_lib_setup_sock (socket_t *so);
+extern int ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx);
+extern int ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx);
+extern void ksocknal_lib_eager_ack (ksock_conn_t *conn);
+extern int ksocknal_lib_recv_iov (ksock_conn_t *conn);
+extern int ksocknal_lib_recv_kiov (ksock_conn_t *conn);
+extern int ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem,
+					   int *rxmem, int *nagle);
+
+extern int ksocknal_tunables_init(void);
+extern void ksocknal_tunables_fini(void);
+extern int ksocknal_lib_tunables_init(void);
+extern void ksocknal_lib_tunables_fini(void);
+
+extern void ksocknal_lib_csum_tx(ksock_tx_t *tx);
+
+extern int ksocknal_lib_memory_pressure(ksock_conn_t *conn);
+extern int ksocknal_lib_bind_thread_to_cpu(int id);
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
new file mode 100644
index 000000000000..ad5e24104238
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
@@ -0,0 +1,2664 @@
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
+/* Allocate a tx descriptor of @size bytes.  A NOOP tx is taken from
+ * ksnd_idle_noop_txs when that list is non-empty; otherwise the descriptor
+ * comes from LIBCFS_ALLOC().  Returns NULL if that allocation fails. */
+ksock_tx_t *
+ksocknal_alloc_tx(int type, int size)
+{
+	struct list_head *idle = &ksocknal_data.ksnd_idle_noop_txs;
+	ksock_tx_t *tx = NULL;
+
+	if (type == KSOCK_MSG_NOOP) {
+		LASSERT(size == KSOCK_NOOP_TX_SIZE);
+
+		/* try to reuse an idle noop descriptor first */
+		spin_lock(&ksocknal_data.ksnd_tx_lock);
+		if (!list_empty(idle)) {
+			tx = list_entry(idle->next, ksock_tx_t, tx_list);
+			LASSERT(tx->tx_desc_size == size);
+			list_del(&tx->tx_list);
+		}
+		spin_unlock(&ksocknal_data.ksnd_tx_lock);
+	}
+
+	if (tx == NULL) {
+		LIBCFS_ALLOC(tx, size);
+		if (tx == NULL)
+			return NULL;
+	}
+
+	atomic_set(&tx->tx_refcount, 1);
+	tx->tx_zc_aborted = 0;
+	tx->tx_zc_capable = 0;
+	tx->tx_zc_checked = 0;
+	tx->tx_desc_size = size;
+
+	atomic_inc(&ksocknal_data.ksnd_nactive_txs);
+	return tx;
+}
+
+/* Allocate a NOOP tx carrying ZC @cookie; NULL if none can be allocated. */
+ksock_tx_t *
+ksocknal_alloc_tx_noop(__u64 cookie, int nonblk)
+{
+	ksock_tx_t *noop;
+
+	noop = ksocknal_alloc_tx(KSOCK_MSG_NOOP, KSOCK_NOOP_TX_SIZE);
+	if (noop == NULL) {
+		CERROR("Can't allocate noop tx desc\n");
+		return NULL;
+	}
+
+	noop->tx_conn = NULL;
+	noop->tx_lnetmsg = NULL;
+	noop->tx_iov = noop->tx_frags.virt.iov;
+	noop->tx_niov = 1;
+	noop->tx_kiov = NULL;
+	noop->tx_nkiov = 0;
+	noop->tx_nonblk = nonblk;
+
+	socklnd_init_msg(&noop->tx_msg, KSOCK_MSG_NOOP);
+	noop->tx_msg.ksm_zc_cookies[1] = cookie;
+	return noop;
+}
+
+
+/* Retire a tx descriptor: one with no LNet message and the NOOP size goes
+ * back on ksnd_idle_noop_txs for reuse; anything else is freed. */
+void
+ksocknal_free_tx(ksock_tx_t *tx)
+{
+	atomic_dec(&ksocknal_data.ksnd_nactive_txs);
+
+	if (tx->tx_lnetmsg != NULL || tx->tx_desc_size != KSOCK_NOOP_TX_SIZE) {
+		LIBCFS_FREE(tx, tx->tx_desc_size);
+		return;
+	}
+
+	spin_lock(&ksocknal_data.ksnd_tx_lock);
+	list_add(&tx->tx_list, &ksocknal_data.ksnd_idle_noop_txs);
+	spin_unlock(&ksocknal_data.ksnd_tx_lock);
+}
+
+/* Send from tx's iovec frags and advance tx past whatever was written.
+ * Returns ksocknal_lib_send_iov()'s result unchanged (<= 0: nothing sent). */
+int
+ksocknal_send_iov(ksock_conn_t *conn, ksock_tx_t *tx)
+{
+	struct iovec *iov = tx->tx_iov;
+	int sent;
+	int left;
+
+	LASSERT(tx->tx_niov > 0);
+
+	/* Never touch tx->tx_iov inside ksocknal_lib_send_iov() */
+	sent = ksocknal_lib_send_iov(conn, tx);
+	if (sent <= 0)		/* sent nothing? */
+		return sent;
+
+	LASSERT(sent <= tx->tx_resid);
+	tx->tx_resid -= sent;
+
+	/* step over fully-sent frags; trim a partially-sent one in place */
+	left = sent;
+	do {
+		LASSERT(tx->tx_niov > 0);
+
+		if (left < (int)iov->iov_len) {
+			iov->iov_base = (void *)((char *)iov->iov_base + left);
+			iov->iov_len -= left;
+			break;
+		}
+		left -= iov->iov_len;
+		tx->tx_iov = ++iov;
+		tx->tx_niov--;
+	} while (left != 0);
+
+	return sent;
+}
+
+/* Send from tx's page frags (asserts no iovec frags remain) and advance tx
+ * past what was written.  Returns ksocknal_lib_send_kiov()'s result. */
+int
+ksocknal_send_kiov(ksock_conn_t *conn, ksock_tx_t *tx)
+{
+	lnet_kiov_t *kiov = tx->tx_kiov;
+	int sent;
+	int left;
+
+	LASSERT(tx->tx_niov == 0);
+	LASSERT(tx->tx_nkiov > 0);
+
+	/* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */
+	sent = ksocknal_lib_send_kiov(conn, tx);
+	if (sent <= 0)		/* sent nothing? */
+		return sent;
+
+	LASSERT(sent <= tx->tx_resid);
+	tx->tx_resid -= sent;
+
+	/* step over fully-sent frags; trim a partially-sent one in place */
+	left = sent;
+	do {
+		LASSERT(tx->tx_nkiov > 0);
+
+		if (left < (int)kiov->kiov_len) {
+			kiov->kiov_offset += left;
+			kiov->kiov_len -= left;
+			break;
+		}
+		left -= (int)kiov->kiov_len;
+		tx->tx_kiov = ++kiov;
+		tx->tx_nkiov--;
+	} while (left != 0);
+
+	return sent;
+}
+
+/* Send as much of @tx on @conn as the socket accepts.  Returns 0 when
+ * tx_resid reaches zero, -ESHUTDOWN if @conn is closing, -EAGAIN/-ENOMEM
+ * when nothing more could be written, or another negative send error. */
+int
+ksocknal_transmit(ksock_conn_t *conn, ksock_tx_t *tx)
+{
+	int rc;
+	int queued;
+
+	if (ksocknal_data.ksnd_stall_tx != 0)
+		cfs_pause(cfs_time_seconds(ksocknal_data.ksnd_stall_tx));
+
+	LASSERT(tx->tx_resid != 0);
+
+	if (ksocknal_connsock_addref(conn) != 0) {
+		LASSERT(conn->ksnc_closing);
+		return -ESHUTDOWN;
+	}
+
+	do {
+		if (ksocknal_data.ksnd_enomem_tx > 0) {
+			/* testing... */
+			ksocknal_data.ksnd_enomem_tx--;
+			rc = -EAGAIN;
+		} else if (tx->tx_niov != 0) {
+			rc = ksocknal_send_iov(conn, tx);
+		} else {
+			rc = ksocknal_send_kiov(conn, tx);
+		}
+
+		queued = cfs_sock_wmem_queued(conn->ksnc_sock);
+		if (rc > 0)	/* sent something? account it */
+			conn->ksnc_tx_bufnob += rc;
+
+		if (queued < conn->ksnc_tx_bufnob) {
+			/* allocated send buffer bytes < computed; infer
+			 * something got ACKed */
+			conn->ksnc_tx_deadline =
+				cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+			conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+			conn->ksnc_tx_bufnob = queued;
+			mb();
+		}
+
+		if (rc <= 0) {	/* didn't write anything? */
+			/* some stacks return 0 instead of -EAGAIN */
+			if (rc == 0)
+				rc = -EAGAIN;
+
+			/* Check if EAGAIN is due to memory pressure */
+			if (rc == -EAGAIN && ksocknal_lib_memory_pressure(conn))
+				rc = -ENOMEM;
+
+			break;
+		}
+
+		/* socket's wmem_queued now includes 'rc' bytes */
+		atomic_sub(rc, &conn->ksnc_tx_nob);
+		rc = 0;
+	} while (tx->tx_resid != 0);
+
+	ksocknal_connsock_decref(conn);
+	return rc;
+}
+
+/* Receive into conn's iovec frags and advance past what arrived.  Returns
+ * <= 0 straight from ksocknal_lib_recv_iov(), -EAGAIN if the last frag
+ * touched is only partly filled, else the positive byte count. */
+int
+ksocknal_recv_iov(ksock_conn_t *conn)
+{
+	struct iovec *iov = conn->ksnc_rx_iov;
+	int left;
+	int rc;
+
+	LASSERT(conn->ksnc_rx_niov > 0);
+
+	/* Never touch conn->ksnc_rx_iov or change connection
+	 * status inside ksocknal_lib_recv_iov */
+	rc = ksocknal_lib_recv_iov(conn);
+	if (rc <= 0)
+		return rc;
+
+	/* received something... */
+	conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+	conn->ksnc_rx_deadline =
+		cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+	mb();		       /* order with setting rx_started */
+	conn->ksnc_rx_started = 1;
+
+	conn->ksnc_rx_nob_wanted -= rc;
+	conn->ksnc_rx_nob_left -= rc;
+
+	left = rc;
+	do {
+		LASSERT(conn->ksnc_rx_niov > 0);
+
+		if (left < (int)iov->iov_len) {
+			iov->iov_len -= left;
+			iov->iov_base = (void *)((char *)iov->iov_base + left);
+			return -EAGAIN;
+		}
+		left -= iov->iov_len;
+		conn->ksnc_rx_iov = ++iov;
+		conn->ksnc_rx_niov--;
+	} while (left != 0);
+
+	return rc;
+}
+
+/*
+ * Receive into the page (kiov) fragments of 'conn' and advance the
+ * fragment cursor past whatever was filled.
+ *
+ * Returns the helper's result unchanged when it is <= 0, -EAGAIN when the
+ * data ended inside a fragment, otherwise 1.
+ */
+int
+ksocknal_recv_kiov(ksock_conn_t *conn)
+{
+	lnet_kiov_t *frag = conn->ksnc_rx_kiov;
+	int nread;
+	int remaining;
+
+	LASSERT(conn->ksnc_rx_nkiov > 0);
+
+	/* ksocknal_lib_recv_kiov must not modify conn->ksnc_rx_kiov or the
+	 * connection status */
+	nread = ksocknal_lib_recv_kiov(conn);
+	if (nread <= 0)
+		return nread;
+
+	/* got some data: refresh liveness and the rx deadline */
+	conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+	conn->ksnc_rx_deadline =
+		cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+	mb();			/* order with setting rx_started */
+	conn->ksnc_rx_started = 1;
+
+	conn->ksnc_rx_nob_wanted -= nread;
+	conn->ksnc_rx_nob_left -= nread;
+
+	remaining = nread;
+	for (;;) {
+		LASSERT(conn->ksnc_rx_nkiov > 0);
+
+		if (remaining < (int)frag->kiov_len) {
+			/* fragment only partly filled: trim its front */
+			frag->kiov_offset += remaining;
+			frag->kiov_len -= remaining;
+			return -EAGAIN;
+		}
+
+		/* fragment complete: step to the next one */
+		remaining -= frag->kiov_len;
+		frag++;
+		conn->ksnc_rx_kiov = frag;
+		conn->ksnc_rx_nkiov--;
+
+		if (remaining == 0)
+			break;
+	}
+
+	return 1;
+}
+
+/*
+ * Read from 'conn' until the wanted bytes have arrived or the socket has
+ * nothing more to give right now.
+ *
+ * Returns 1 on success, 0 on EOF and < 0 on error.  The caller inspects
+ * ksnc_rx_nob_wanted to tell progress from completion.
+ */
+int
+ksocknal_receive(ksock_conn_t *conn)
+{
+	int rc;
+	ENTRY;
+
+	/* optional artificial stall */
+	if (ksocknal_data.ksnd_stall_rx != 0)
+		cfs_pause(cfs_time_seconds(ksocknal_data.ksnd_stall_rx));
+
+	rc = ksocknal_connsock_addref(conn);
+	if (rc != 0) {
+		LASSERT(conn->ksnc_closing);
+		return -ESHUTDOWN;
+	}
+
+	for (;;) {
+		rc = (conn->ksnc_rx_niov != 0) ? ksocknal_recv_iov(conn) :
+						 ksocknal_recv_kiov(conn);
+
+		if (rc > 0) {
+			/* a fragment was completed */
+			if (conn->ksnc_rx_nob_wanted != 0)
+				continue;
+
+			rc = 1;
+			break;
+		}
+
+		/* error/EOF or partial receive */
+		if (rc == -EAGAIN)
+			rc = 1;
+		else if (rc == 0 && conn->ksnc_rx_started)
+			rc = -EPROTO;	/* EOF in the middle of a message */
+		break;
+	}
+
+	ksocknal_connsock_decref(conn);
+	RETURN(rc);
+}
+
+/*
+ * Complete a transmit descriptor: drop its connection reference, free it
+ * and finalize the attached LNet message (if any).
+ *
+ * The message is finalized with 0 when everything was sent (tx_resid == 0)
+ * and no zero-copy abort was recorded, and with -EIO otherwise.
+ *
+ * 'ni' may be NULL provided tx->tx_conn is set; it is then taken from the
+ * connection's peer.
+ */
+void
+ksocknal_tx_done(lnet_ni_t *ni, ksock_tx_t *tx)
+{
+	lnet_msg_t *lnetmsg = tx->tx_lnetmsg;
+	int rc = (tx->tx_resid == 0 && !tx->tx_zc_aborted) ? 0 : -EIO;
+	ENTRY;
+
+	LASSERT(ni != NULL || tx->tx_conn != NULL);
+
+	if (tx->tx_conn != NULL) {
+		/* Look up the NI while we still hold our reference on the
+		 * conn: dereferencing tx_conn after the decref below would
+		 * touch freed memory if that was the last reference. */
+		if (ni == NULL)
+			ni = tx->tx_conn->ksnc_peer->ksnp_ni;
+
+		ksocknal_conn_decref(tx->tx_conn);
+	}
+
+	ksocknal_free_tx(tx);
+	if (lnetmsg != NULL) /* KSOCK_MSG_NOOP go without lnetmsg */
+		lnet_finalize(ni, lnetmsg, rc);
+
+	EXIT;
+}
+
+/*
+ * Drain 'txlist', completing every descriptor on it via
+ * ksocknal_tx_done().  When 'error' is non-zero each dropped packet is
+ * logged first.
+ */
+void
+ksocknal_txlist_done(lnet_ni_t *ni, struct list_head *txlist, int error)
+{
+	while (!list_empty(txlist)) {
+		ksock_tx_t *tx = list_entry(txlist->next, ksock_tx_t, tx_list);
+
+		if (error) {
+			if (tx->tx_lnetmsg != NULL)
+				CNETERR("Deleting packet type %d len %d %s->%s\n",
+					le32_to_cpu(tx->tx_lnetmsg->msg_hdr.type),
+					le32_to_cpu(tx->tx_lnetmsg->msg_hdr.payload_length),
+					libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.src_nid)),
+					libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.dest_nid)));
+			else
+				CNETERR("Deleting noop packet\n");
+		}
+
+		list_del(&tx->tx_list);
+
+		/* the list must hold the only remaining reference */
+		LASSERT(atomic_read(&tx->tx_refcount) == 1);
+		ksocknal_tx_done(ni, tx);
+	}
+}
+
+/*
+ * Decide whether 'tx' will wait for a zero-copy ACK and, if so, register
+ * it with its peer.
+ *
+ * When the connection supports it, tx_msg.ksm_zc_cookies[0] is set to a
+ * unique non-zero cookie and tx is appended to the peer's
+ * ksnp_zc_req_list with an extra reference.  The peer is expected to send
+ * an ACK carrying this cookie once it has received the message; the cookie
+ * stays non-zero for as long as tx sits on ksnp_zc_req_list.
+ */
+static void
+ksocknal_check_zc_req(ksock_tx_t *tx)
+{
+	ksock_conn_t *conn = tx->tx_conn;
+	ksock_peer_t *peer = conn->ksnc_peer;
+
+	LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+	LASSERT(tx->tx_zc_capable);
+
+	tx->tx_zc_checked = 1;
+
+	/* V1.x connections and conns without ZC support never get a cookie */
+	if (conn->ksnc_proto == &ksocknal_protocol_v1x)
+		return;
+	if (!conn->ksnc_zc_capable)
+		return;
+
+	/* Assign a cookie and park tx on the pending list; it is released
+	 * when the matching ack arrives.  See ksocknal_handle_zcack() */
+	ksocknal_tx_addref(tx);
+
+	spin_lock(&peer->ksnp_lock);
+
+	/* ZC_REQ is going to be pinned to the peer */
+	tx->tx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+
+	LASSERT(tx->tx_msg.ksm_zc_cookies[0] == 0);
+
+	tx->tx_msg.ksm_zc_cookies[0] = peer->ksnp_zc_next_cookie++;
+
+	/* on wrap-around restart just above SOCKNAL_KEEPALIVE_PING */
+	if (peer->ksnp_zc_next_cookie == 0)
+		peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1;
+
+	list_add_tail(&tx->tx_zc_list, &peer->ksnp_zc_req_list);
+
+	spin_unlock(&peer->ksnp_lock);
+}
+
+/*
+ * Undo ksocknal_check_zc_req(): clear the checked flag and, if tx is
+ * still waiting for a zero-copy ACK, take it off the peer's pending list
+ * and drop the reference that list held.
+ */
+static void
+ksocknal_uncheck_zc_req(ksock_tx_t *tx)
+{
+	ksock_peer_t *peer = tx->tx_conn->ksnc_peer;
+	int was_pending;
+
+	LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+	LASSERT(tx->tx_zc_capable);
+
+	tx->tx_zc_checked = 0;
+
+	spin_lock(&peer->ksnp_lock);
+
+	/* a zero cookie means tx is not waiting for an ACK */
+	was_pending = (tx->tx_msg.ksm_zc_cookies[0] != 0);
+	if (was_pending) {
+		tx->tx_msg.ksm_zc_cookies[0] = 0;
+		list_del(&tx->tx_zc_list);
+	}
+
+	spin_unlock(&peer->ksnp_lock);
+
+	if (was_pending)
+		ksocknal_tx_decref(tx);
+}
+
+/*
+ * Try to send 'tx' on 'conn' and deal with the outcome.
+ *
+ * Returns 0 when the whole tx went out, -EAGAIN when the socket is full,
+ * -ENOMEM after parking the conn on ksnd_enomem_conns for a later retry,
+ * or another negative error after closing the conn and its siblings.
+ */
+int
+ksocknal_process_transmit(ksock_conn_t *conn, ksock_tx_t *tx)
+{
+	int rc;
+
+	if (tx->tx_zc_capable && !tx->tx_zc_checked)
+		ksocknal_check_zc_req(tx);
+
+	rc = ksocknal_transmit(conn, tx);
+
+	CDEBUG(D_NET, "send(%d) %d\n", tx->tx_resid, rc);
+
+	if (tx->tx_resid == 0) {
+		/* everything was sent */
+		LASSERT(rc == 0);
+		return 0;
+	}
+
+	if (rc == -EAGAIN)
+		return rc;
+
+	if (rc == -ENOMEM) {
+		static int enomem_count;
+
+		/* warn with exponential backoff: only when the running
+		 * count is a power of two */
+		enomem_count++;
+		if ((enomem_count & (-enomem_count)) == enomem_count)
+			CWARN("%u ENOMEM tx %p (%u allocated)\n",
+			      enomem_count, conn, atomic_read(&libcfs_kmemory));
+
+		/* park the conn on ksnd_enomem_conns; the reaper retries
+		 * it after a timeout */
+		spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+		/* the enomem list inherits the scheduler's reference */
+		LASSERT(conn->ksnc_tx_scheduled);
+		list_add_tail(&conn->ksnc_tx_list,
+			      &ksocknal_data.ksnd_enomem_conns);
+
+		/* wake the reaper if it would otherwise sleep past the
+		 * retry time */
+		if (!cfs_time_aftereq(cfs_time_add(cfs_time_current(),
+						   SOCKNAL_ENOMEM_RETRY),
+				      ksocknal_data.ksnd_reaper_waketime))
+			wake_up(&ksocknal_data.ksnd_reaper_waitq);
+
+		spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+		return rc;
+	}
+
+	/* a genuine error from here on */
+	LASSERT(rc < 0);
+
+	if (!conn->ksnc_closing) {
+		if (rc == -ECONNRESET)
+			LCONSOLE_WARN("Host %u.%u.%u.%u reset our connection "
+				      "while we were sending data; it may have "
+				      "rebooted.\n",
+				      HIPQUAD(conn->ksnc_ipaddr));
+		else
+			LCONSOLE_WARN("There was an unexpected network error "
+				      "while writing to %u.%u.%u.%u: %d.\n",
+				      HIPQUAD(conn->ksnc_ipaddr), rc);
+
+		CDEBUG(D_NET, "[%p] Error %d on write to %s"
+		       " ip %d.%d.%d.%d:%d\n", conn, rc,
+		       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+		       HIPQUAD(conn->ksnc_ipaddr),
+		       conn->ksnc_port);
+	}
+
+	if (tx->tx_zc_checked)
+		ksocknal_uncheck_zc_req(tx);
+
+	/* a conn that is already closing is not reported as failed */
+	ksocknal_close_conn_and_siblings(conn, conn->ksnc_closing ? 0 : rc);
+
+	return rc;
+}
+
+/*
+ * Hand 'route' to the connection daemon: mark it scheduled, take a
+ * reference on its behalf, append it to ksnd_connd_routes and wake the
+ * daemon.
+ *
+ * Called holding the write lock on ksnd_global_lock.
+ */
+void
+ksocknal_launch_connection_locked(ksock_route_t *route)
+{
+	LASSERT(!route->ksnr_scheduled);
+	LASSERT(!route->ksnr_connecting);
+	LASSERT((ksocknal_route_mask() & ~route->ksnr_connected) != 0);
+
+	/* scheduling conn for connd */
+	route->ksnr_scheduled = 1;
+	/* extra ref for connd */
+	ksocknal_route_addref(route);
+
+	spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+	list_add_tail(&route->ksnr_connd_list,
+		      &ksocknal_data.ksnd_connd_routes);
+	wake_up(&ksocknal_data.ksnd_connd_waitq);
+	spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+}
+
+/*
+ * Launch a connection attempt on every route of 'peer' that
+ * ksocknal_find_connectable_route_locked() reports as connectable.
+ *
+ * Called holding the write lock on ksnd_global_lock.
+ */
+void
+ksocknal_launch_all_connections_locked(ksock_peer_t *peer)
+{
+	ksock_route_t *route;
+
+	/* each launch marks the route scheduled, so the finder stops
+	 * returning it and the loop ends */
+	while ((route = ksocknal_find_connectable_route_locked(peer)) != NULL)
+		ksocknal_launch_connection_locked(route);
+}
+
+/*
+ * Choose the connection of 'peer' on which 'tx' should be queued.
+ *
+ * Each connection's protocol classifies the tx as no match, a typed match
+ * or a fallback match.  Within each class the connection with the fewest
+ * outstanding bytes wins; on a tie, and with round-robin enabled, the one
+ * posted to least recently wins.  A typed match beats any fallback.
+ *
+ * Returns the chosen connection (its ksnc_tx_last_post is refreshed) or
+ * NULL when nothing matched.
+ */
+ksock_conn_t *
+ksocknal_find_conn_locked(ksock_peer_t *peer, ksock_tx_t *tx, int nonblk)
+{
+	struct list_head *pos;
+	ksock_conn_t *best_typed = NULL;
+	ksock_conn_t *best_fallback = NULL;
+	ksock_conn_t *chosen;
+	int typed_nob = 0;
+	int fallback_nob = 0;
+
+	list_for_each(pos, &peer->ksnp_conns) {
+		ksock_conn_t *c = list_entry(pos, ksock_conn_t, ksnc_list);
+		/* bytes queued here plus bytes still held by the socket */
+		int nob = atomic_read(&c->ksnc_tx_nob) +
+			  cfs_sock_wmem_queued(c->ksnc_sock);
+		int match;
+
+		LASSERT(!c->ksnc_closing);
+		LASSERT(c->ksnc_proto != NULL &&
+			c->ksnc_proto->pro_match_tx != NULL);
+
+		match = c->ksnc_proto->pro_match_tx(c, tx, nonblk);
+
+		if (match == SOCKNAL_MATCH_YES) {
+			/* typed connection */
+			if (best_typed == NULL || typed_nob > nob ||
+			    (typed_nob == nob &&
+			     *ksocknal_tunables.ksnd_round_robin &&
+			     cfs_time_after(best_typed->ksnc_tx_last_post,
+					    c->ksnc_tx_last_post))) {
+				best_typed = c;
+				typed_nob = nob;
+			}
+		} else if (match == SOCKNAL_MATCH_MAY) {
+			/* fallback connection */
+			if (best_fallback == NULL || fallback_nob > nob ||
+			    (fallback_nob == nob &&
+			     *ksocknal_tunables.ksnd_round_robin &&
+			     cfs_time_after(best_fallback->ksnc_tx_last_post,
+					    c->ksnc_tx_last_post))) {
+				best_fallback = c;
+				fallback_nob = nob;
+			}
+		} else if (match != SOCKNAL_MATCH_NO) {
+			/* unknown verdict from the protocol */
+			LBUG();
+		}
+		/* SOCKNAL_MATCH_NO: protocol rejected the tx, try the next */
+	}
+
+	/* prefer the typed selection */
+	chosen = (best_typed != NULL) ? best_typed : best_fallback;
+
+	if (chosen != NULL)
+		chosen->ksnc_tx_last_post = cfs_time_current();
+
+	return chosen;
+}
+
+/*
+ * Bind 'tx' to 'conn': let the connection's protocol pack the message,
+ * add the tx bytes to the connection's outstanding count and record the
+ * connection (with a reference) in the tx.
+ */
+void
+ksocknal_tx_prep(ksock_conn_t *conn, ksock_tx_t *tx)
+{
+	conn->ksnc_proto->pro_pack(tx);
+
+	atomic_add(tx->tx_nob, &conn->ksnc_tx_nob);
+
+	/* the tx holds its own reference on the conn */
+	ksocknal_conn_addref(conn);
+	tx->tx_conn = conn;
+}
+
+/*
+ * Queue 'tx' for transmission on 'conn' and make sure the connection's
+ * scheduler will service it.
+ *
+ * Called holding the global lock (read or irq-write).  The caller must
+ * not have dropped that lock between finding conn and calling here, which
+ * is why ksnc_sock is dereferenced without the {get,put}connsock dance.
+ */
+void
+ksocknal_queue_tx_locked(ksock_tx_t *tx, ksock_conn_t *conn)
+{
+	ksock_sched_t *sched = conn->ksnc_scheduler;
+	ksock_msg_t *msg = &tx->tx_msg;
+	ksock_tx_t *zombie = NULL;
+	int queued;
+
+	LASSERT(!conn->ksnc_closing);
+
+	CDEBUG(D_NET, "Sending to %s ip %d.%d.%d.%d:%d\n",
+	       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+	       HIPQUAD(conn->ksnc_ipaddr),
+	       conn->ksnc_port);
+
+	ksocknal_tx_prep(conn, tx);
+
+	/* The fragments must add up to EXACTLY the number of bytes we want
+	 * to send: many TCP/IP stacks ignore any total size passed to them
+	 * and look only at the frags.
+	 *
+	 * There is always at least one mapped fragment, holding the
+	 * complete ksocknal message header. */
+	LASSERT(lnet_iov_nob(tx->tx_niov, tx->tx_iov) +
+		lnet_kiov_nob(tx->tx_nkiov, tx->tx_kiov) ==
+		(unsigned int)tx->tx_nob);
+	LASSERT(tx->tx_niov >= 1);
+	LASSERT(tx->tx_resid == tx->tx_nob);
+
+	CDEBUG(D_NET, "Packet %p type %d, nob %d niov %d nkiov %d\n",
+	       tx,
+	       (tx->tx_lnetmsg != NULL) ? tx->tx_lnetmsg->msg_hdr.type :
+					  KSOCK_MSG_NOOP,
+	       tx->tx_nob, tx->tx_niov, tx->tx_nkiov);
+
+	/*
+	 * FIXME: SOCK_WMEM_QUEUED and SOCK_ERROR could block in __DARWIN8__
+	 * but they're used inside spinlocks a lot.
+	 */
+	queued = cfs_sock_wmem_queued(conn->ksnc_sock);
+	spin_lock_bh(&sched->kss_lock);
+
+	if (queued == 0 && list_empty(&conn->ksnc_tx_queue)) {
+		/* the first packet starts the timeout */
+		conn->ksnc_tx_deadline =
+			cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+		if (conn->ksnc_tx_bufnob > 0)	/* something got ACKed */
+			conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+		conn->ksnc_tx_bufnob = 0;
+		mb();	/* order with adding to tx_queue */
+	}
+
+	if (msg->ksm_type != KSOCK_MSG_NOOP) {
+		/* A normal packet: can it piggyback a noop zc-ack that
+		 * has already been queued? */
+		LASSERT(msg->ksm_zc_cookies[1] == 0);
+		LASSERT(conn->ksnc_proto->pro_queue_tx_msg != NULL);
+
+		/* a displaced noop tx, if any, is released later */
+		zombie = conn->ksnc_proto->pro_queue_tx_msg(conn, tx);
+	} else {
+		/* A noop ZC ACK: try to piggyback its ack cookie on a
+		 * normal packet so it need not be sent on its own */
+		LASSERT(msg->ksm_zc_cookies[1] != 0);
+		LASSERT(conn->ksnc_proto->pro_queue_tx_zcack != NULL);
+
+		if (conn->ksnc_proto->pro_queue_tx_zcack(conn, tx, 0))
+			zombie = tx;	/* piggybacked; release tx later */
+	}
+
+	if (zombie != NULL) {
+		/* its bytes will never be sent: un-account and retire it */
+		atomic_sub(zombie->tx_nob, &conn->ksnc_tx_nob);
+		list_add_tail(&zombie->tx_list, &sched->kss_zombie_noop_txs);
+	}
+
+	/* able to send, but not yet scheduled to send? */
+	if (conn->ksnc_tx_ready && !conn->ksnc_tx_scheduled) {
+		/* +1 ref for scheduler */
+		ksocknal_conn_addref(conn);
+		list_add_tail(&conn->ksnc_tx_list, &sched->kss_tx_conns);
+		conn->ksnc_tx_scheduled = 1;
+		wake_up(&sched->kss_waitq);
+	}
+
+	spin_unlock_bh(&sched->kss_lock);
+}
+
+
+/*
+ * Return the first route of 'peer' on which a new connection attempt may
+ * be started now, or NULL if there is none.
+ *
+ * A route is skipped when it is already scheduled, when every route type
+ * is already connected, or when its retry timeout has not yet expired.
+ */
+ksock_route_t *
+ksocknal_find_connectable_route_locked(ksock_peer_t *peer)
+{
+	cfs_time_t now = cfs_time_current();
+	struct list_head *pos;
+
+	list_for_each(pos, &peer->ksnp_routes) {
+		ksock_route_t *route = list_entry(pos, ksock_route_t,
+						  ksnr_list);
+
+		LASSERT(!route->ksnr_connecting || route->ksnr_scheduled);
+
+		/* connections being established */
+		if (route->ksnr_scheduled)
+			continue;
+
+		/* all route types connected ? */
+		if ((ksocknal_route_mask() & ~route->ksnr_connected) == 0)
+			continue;
+
+		/* not the first attempt, and the retry time is not due */
+		if (route->ksnr_retry_interval != 0 &&
+		    !cfs_time_aftereq(now, route->ksnr_timeout)) {
+			CDEBUG(D_NET,
+			       "Too soon to retry route %u.%u.%u.%u "
+			       "(cnted %d, interval %ld, %ld secs later)\n",
+			       HIPQUAD(route->ksnr_ipaddr),
+			       route->ksnr_connected,
+			       route->ksnr_retry_interval,
+			       cfs_duration_sec(route->ksnr_timeout - now));
+			continue;
+		}
+
+		return route;
+	}
+
+	return NULL;
+}
+
+/*
+ * Return the first route of 'peer' that is currently scheduled for a
+ * connection attempt, or NULL if no route is.
+ */
+ksock_route_t *
+ksocknal_find_connecting_route_locked(ksock_peer_t *peer)
+{
+	struct list_head *pos;
+
+	list_for_each(pos, &peer->ksnp_routes) {
+		ksock_route_t *candidate = list_entry(pos, ksock_route_t,
+						      ksnr_list);
+
+		LASSERT(!candidate->ksnr_connecting ||
+			candidate->ksnr_scheduled);
+
+		if (candidate->ksnr_scheduled)
+			return candidate;
+	}
+
+	return NULL;
+}
+
+/*
+ * Find (or create) the peer for 'id' and either queue 'tx' on one of its
+ * connections or park it on the peer until a connection is established.
+ *
+ * Returns 0 when tx was queued on a connection or on the peer's
+ * ksnp_tx_queue, -EHOSTUNREACH when 'id' is a userspace process, the peer
+ * cannot be found after one add attempt or there is no usable route, or
+ * the error returned by ksocknal_add_peer().
+ */
+int
+ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id)
+{
+	ksock_peer_t     *peer;
+	ksock_conn_t     *conn;
+	rwlock_t     *g_lock;
+	int	       retry;
+	int	       rc;
+
+	LASSERT (tx->tx_conn == NULL);
+
+	g_lock = &ksocknal_data.ksnd_global_lock;
+
+	/* At most two passes: the first (retry == 0) may create the peer,
+	 * the second (retry == 1) gives up if it still cannot be found.
+	 * The loop is left via 'break' with the write lock held and
+	 * peer != NULL. */
+	for (retry = 0;; retry = 1) {
+		/* Fast path under the read lock: the peer exists, none of
+		 * its routes needs connecting and a connection matches. */
+		read_lock(g_lock);
+		peer = ksocknal_find_peer_locked(ni, id);
+		if (peer != NULL) {
+			if (ksocknal_find_connectable_route_locked(peer) == NULL) {
+				conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk);
+				if (conn != NULL) {
+					/* I've got no routes that need to be
+					 * connecting and I do have an actual
+					 * connection... */
+					ksocknal_queue_tx_locked (tx, conn);
+					read_unlock(g_lock);
+					return (0);
+				}
+			}
+		}
+
+		/* I'll need a write lock... */
+		read_unlock(g_lock);
+
+		write_lock_bh(g_lock);
+
+		/* look the peer up again: the lock was dropped in between */
+		peer = ksocknal_find_peer_locked(ni, id);
+		if (peer != NULL)
+			break;
+
+		write_unlock_bh(g_lock);
+
+		if ((id.pid & LNET_PID_USERFLAG) != 0) {
+			CERROR("Refusing to create a connection to "
+			       "userspace process %s\n", libcfs_id2str(id));
+			return -EHOSTUNREACH;
+		}
+
+		if (retry) {
+			CERROR("Can't find peer %s\n", libcfs_id2str(id));
+			return -EHOSTUNREACH;
+		}
+
+		/* no such peer yet: add it, then go round once more */
+		rc = ksocknal_add_peer(ni, id,
+				       LNET_NIDADDR(id.nid),
+				       lnet_acceptor_port());
+		if (rc != 0) {
+			CERROR("Can't add peer %s: %d\n",
+			       libcfs_id2str(id), rc);
+			return rc;
+		}
+	}
+
+	/* write lock held from here until one of the unlocks below */
+	ksocknal_launch_all_connections_locked(peer);
+
+	conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk);
+	if (conn != NULL) {
+		/* Connection exists; queue message on it */
+		ksocknal_queue_tx_locked (tx, conn);
+		write_unlock_bh(g_lock);
+		return (0);
+	}
+
+	/* no connection yet, but one is being accepted or attempted */
+	if (peer->ksnp_accepting > 0 ||
+	    ksocknal_find_connecting_route_locked (peer) != NULL) {
+		/* the message is going to be pinned to the peer */
+		tx->tx_deadline =
+			cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+
+		/* Queue the message until a connection is established */
+		list_add_tail (&tx->tx_list, &peer->ksnp_tx_queue);
+		write_unlock_bh(g_lock);
+		return 0;
+	}
+
+	write_unlock_bh(g_lock);
+
+	/* NB Routes may be ignored if connections to them failed recently */
+	CNETERR("No usable routes to %s\n", libcfs_id2str(id));
+	return (-EHOSTUNREACH);
+}
+
+/*
+ * Build a transmit descriptor for 'lntmsg' and launch it towards the
+ * message's target.
+ *
+ * 'private' differs depending on what is being sent and is ignored here.
+ *
+ * Returns 0 when the packet was launched, -ENOMEM when no descriptor
+ * could be allocated and -EIO when launching failed.
+ */
+int
+ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+	int mpflag = 0;
+	int type = lntmsg->msg_type;
+	lnet_process_id_t target = lntmsg->msg_target;
+	unsigned int payload_niov = lntmsg->msg_niov;
+	struct iovec *payload_iov = lntmsg->msg_iov;
+	lnet_kiov_t *payload_kiov = lntmsg->msg_kiov;
+	unsigned int payload_offset = lntmsg->msg_offset;
+	unsigned int payload_nob = lntmsg->msg_len;
+	ksock_tx_t *tx;
+	int desc_size;
+	int launch_rc;
+
+	CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n",
+	       payload_nob, payload_niov, libcfs_id2str(target));
+
+	LASSERT(payload_nob == 0 || payload_niov > 0);
+	LASSERT(payload_niov <= LNET_MAX_IOV);
+	/* payload is either all vaddrs or all pages */
+	LASSERT(!(payload_kiov != NULL && payload_iov != NULL));
+	LASSERT(!in_interrupt());
+
+	/* size the descriptor for its trailing fragment array; the virt
+	 * layout needs one extra slot for the header fragment */
+	if (payload_iov == NULL)
+		desc_size = offsetof(ksock_tx_t,
+				     tx_frags.paged.kiov[payload_niov]);
+	else
+		desc_size = offsetof(ksock_tx_t,
+				     tx_frags.virt.iov[1 + payload_niov]);
+
+	if (lntmsg->msg_vmflush)
+		mpflag = cfs_memory_pressure_get_and_set();
+
+	tx = ksocknal_alloc_tx(KSOCK_MSG_LNET, desc_size);
+	if (tx == NULL) {
+		CERROR("Can't allocate tx desc type %d size %d\n",
+		       type, desc_size);
+		if (lntmsg->msg_vmflush)
+			cfs_memory_pressure_restore(mpflag);
+		return -ENOMEM;
+	}
+
+	tx->tx_conn = NULL;	/* set when assigned a conn */
+	tx->tx_lnetmsg = lntmsg;
+
+	if (payload_iov == NULL) {
+		/* paged payload: one header iov plus the kiov fragments */
+		tx->tx_niov = 1;
+		tx->tx_iov = &tx->tx_frags.paged.iov;
+		tx->tx_kiov = tx->tx_frags.paged.kiov;
+		tx->tx_nkiov = lnet_extract_kiov(payload_niov, tx->tx_kiov,
+						 payload_niov, payload_kiov,
+						 payload_offset, payload_nob);
+
+		if (payload_nob >= *ksocknal_tunables.ksnd_zc_min_payload)
+			tx->tx_zc_capable = 1;
+	} else {
+		/* virtual-address payload: header iov followed by data */
+		tx->tx_kiov = NULL;
+		tx->tx_nkiov = 0;
+		tx->tx_iov = tx->tx_frags.virt.iov;
+		tx->tx_niov = 1 +
+			      lnet_extract_iov(payload_niov, &tx->tx_iov[1],
+					       payload_niov, payload_iov,
+					       payload_offset, payload_nob);
+	}
+
+	socklnd_init_msg(&tx->tx_msg, KSOCK_MSG_LNET);
+
+	/* The first fragment will be set later in pro_pack */
+	launch_rc = ksocknal_launch_packet(ni, tx, target);
+	if (lntmsg->msg_vmflush)
+		cfs_memory_pressure_restore(mpflag);
+
+	if (launch_rc != 0) {
+		ksocknal_free_tx(tx);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/*
+ * Start a kernel thread running fn(arg) under the given name and count it
+ * in ksnd_nthreads.
+ *
+ * Returns 0 on success or the error encoded in kthread_run()'s result.
+ */
+int
+ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name)
+{
+	/* kthread_run() treats its third argument as a printf-style format,
+	 * so pass the name through "%s" rather than as the format itself;
+	 * otherwise any '%' in the name would be interpreted. */
+	task_t *task = kthread_run(fn, arg, "%s", name);
+
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+	ksocknal_data.ksnd_nthreads++;
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+	return 0;
+}
+
+/*
+ * Account for a socklnd thread exiting: decrement ksnd_nthreads under the
+ * global write lock.
+ */
+void
+ksocknal_thread_fini(void)
+{
+	rwlock_t *glock = &ksocknal_data.ksnd_global_lock;
+
+	write_lock_bh(glock);
+	ksocknal_data.ksnd_nthreads--;
+	write_unlock_bh(glock);
+}
+
+/*
+ * Prepare 'conn' to receive what comes next on the wire.
+ *
+ * With nob_to_skip == 0 the connection is at a packet boundary: the rx
+ * state and the single rx iov are set up for the header expected by the
+ * connection's protocol version, and 1 is returned.
+ *
+ * Otherwise the rx iovs are pointed at a shared scratch buffer so that as
+ * many of the unwanted bytes as fit are read and discarded, and 0 is
+ * returned.  If the iov array runs out first, this gets called again for
+ * the remainder.
+ */
+int
+ksocknal_new_packet(ksock_conn_t *conn, int nob_to_skip)
+{
+	static char ksocknal_slop_buffer[4096];
+
+	unsigned int niov;
+	int skipped;
+	int chunk;
+
+	LASSERT(conn->ksnc_proto != NULL);
+
+	/* Remind the socket to ack eagerly... */
+	if ((*ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) != 0)
+		ksocknal_lib_eager_ack(conn);
+
+	if (nob_to_skip != 0) {
+		/* Set up to skip as much as possible now */
+		conn->ksnc_rx_state = SOCKNAL_RX_SLOP;
+		conn->ksnc_rx_nob_left = nob_to_skip;
+		conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+
+		skipped = 0;
+		niov = 0;
+		for (;;) {
+			struct iovec *slot = &conn->ksnc_rx_iov[niov];
+
+			chunk = MIN (nob_to_skip, sizeof (ksocknal_slop_buffer));
+
+			slot->iov_base = ksocknal_slop_buffer;
+			slot->iov_len = chunk;
+			niov++;
+			skipped += chunk;
+			nob_to_skip -= chunk;
+
+			if (nob_to_skip == 0)
+				break;
+			/* mustn't overflow conn's rx iov */
+			if (!(niov < sizeof(conn->ksnc_rx_iov_space) /
+				     sizeof(struct iovec)))
+				break;
+		}
+
+		conn->ksnc_rx_niov = niov;
+		conn->ksnc_rx_kiov = NULL;
+		conn->ksnc_rx_nkiov = 0;
+		conn->ksnc_rx_nob_wanted = skipped;
+		return 0;
+	}
+
+	/* right at the next packet boundary now */
+	conn->ksnc_rx_started = 0;
+	mb();			/* racing with timeout thread */
+
+	switch (conn->ksnc_proto->pro_version) {
+	case KSOCK_PROTO_V2:
+	case KSOCK_PROTO_V3:
+		/* next comes the ksock_msg_t header, up to ksm_u */
+		conn->ksnc_rx_state = SOCKNAL_RX_KSM_HEADER;
+		conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+		conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg;
+
+		conn->ksnc_rx_nob_wanted = offsetof(ksock_msg_t, ksm_u);
+		conn->ksnc_rx_nob_left = offsetof(ksock_msg_t, ksm_u);
+		conn->ksnc_rx_iov[0].iov_len = offsetof(ksock_msg_t, ksm_u);
+		break;
+
+	case KSOCK_PROTO_V1:
+		/* Receiving bare lnet_hdr_t */
+		conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER;
+		conn->ksnc_rx_nob_wanted = sizeof(lnet_hdr_t);
+		conn->ksnc_rx_nob_left = sizeof(lnet_hdr_t);
+
+		conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+		conn->ksnc_rx_iov[0].iov_base =
+			(char *)&conn->ksnc_msg.ksm_u.lnetmsg;
+		conn->ksnc_rx_iov[0].iov_len = sizeof(lnet_hdr_t);
+		break;
+
+	default:
+		LBUG();
+	}
+
+	conn->ksnc_rx_niov = 1;
+	conn->ksnc_rx_kiov = NULL;
+	conn->ksnc_rx_nkiov = 0;
+	conn->ksnc_rx_csum = ~0;
+	return 1;
+}
+
+/*
+ * Advance the receive state machine of 'conn'.
+ *
+ * While bytes are still wanted (ksnc_rx_nob_wanted != 0) they are read
+ * with ksocknal_receive(); once the current fragment list is complete the
+ * code dispatches on ksnc_rx_state and either finishes or sets up the next
+ * read and loops via 'again'.
+ *
+ * Returns 0 when a packet boundary was reached or the header was handed
+ * to lnet_parse() and the payload has not been set up yet, -EAGAIN on a
+ * short read, -ESHUTDOWN on EOF, and another negative value on error.  On
+ * the EOF and error paths the connection and its siblings are closed
+ * before returning.
+ */
+int
+ksocknal_process_receive (ksock_conn_t *conn)
+{
+	lnet_hdr_t	*lhdr;
+	lnet_process_id_t *id;
+	int		rc;
+
+	LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0);
+
+	/* NB: sched lock NOT held */
+	/* SOCKNAL_RX_LNET_HEADER is here for backward compatibility */
+	LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER ||
+		 conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD ||
+		 conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER ||
+		 conn->ksnc_rx_state == SOCKNAL_RX_SLOP);
+ again:
+	if (conn->ksnc_rx_nob_wanted != 0) {
+		rc = ksocknal_receive(conn);
+
+		if (rc <= 0) {
+			/* ksocknal_receive() maps -EAGAIN to 1 */
+			LASSERT (rc != -EAGAIN);
+
+			if (rc == 0)
+				CDEBUG (D_NET, "[%p] EOF from %s"
+					" ip %d.%d.%d.%d:%d\n", conn,
+					libcfs_id2str(conn->ksnc_peer->ksnp_id),
+					HIPQUAD(conn->ksnc_ipaddr),
+					conn->ksnc_port);
+			else if (!conn->ksnc_closing)
+				CERROR ("[%p] Error %d on read from %s"
+					" ip %d.%d.%d.%d:%d\n",
+					conn, rc,
+					libcfs_id2str(conn->ksnc_peer->ksnp_id),
+					HIPQUAD(conn->ksnc_ipaddr),
+					conn->ksnc_port);
+
+			/* it's not an error if conn is being closed */
+			ksocknal_close_conn_and_siblings (conn,
+							  (conn->ksnc_closing) ? 0 : rc);
+			return (rc == 0 ? -ESHUTDOWN : rc);
+		}
+
+		if (conn->ksnc_rx_nob_wanted != 0) {
+			/* short read */
+			return (-EAGAIN);
+		}
+	}
+	/* everything wanted for the current state has arrived */
+	switch (conn->ksnc_rx_state) {
+	case SOCKNAL_RX_KSM_HEADER:
+		/* byte-swap the header fields if this conn is flagged
+		 * as needing it */
+		if (conn->ksnc_flip) {
+			__swab32s(&conn->ksnc_msg.ksm_type);
+			__swab32s(&conn->ksnc_msg.ksm_csum);
+			__swab64s(&conn->ksnc_msg.ksm_zc_cookies[0]);
+			__swab64s(&conn->ksnc_msg.ksm_zc_cookies[1]);
+		}
+
+		if (conn->ksnc_msg.ksm_type != KSOCK_MSG_NOOP &&
+		    conn->ksnc_msg.ksm_type != KSOCK_MSG_LNET) {
+			CERROR("%s: Unknown message type: %x\n",
+			       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+			       conn->ksnc_msg.ksm_type);
+			ksocknal_new_packet(conn, 0);
+			ksocknal_close_conn_and_siblings(conn, -EPROTO);
+			return (-EPROTO);
+		}
+
+		if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP &&
+		    conn->ksnc_msg.ksm_csum != 0 &&     /* has checksum */
+		    conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) {
+			/* NOOP Checksum error */
+			CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n",
+			       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+			       conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum);
+			ksocknal_new_packet(conn, 0);
+			/* NB the conn is closed with -EPROTO although
+			 * -EIO is what gets returned */
+			ksocknal_close_conn_and_siblings(conn, -EPROTO);
+			return (-EIO);
+		}
+
+		/* a non-zero second cookie means this header carries a
+		 * ZC-ACK */
+		if (conn->ksnc_msg.ksm_zc_cookies[1] != 0) {
+			__u64 cookie = 0;
+
+			LASSERT (conn->ksnc_proto != &ksocknal_protocol_v1x);
+
+			/* only a NOOP passes its first cookie along */
+			if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP)
+				cookie = conn->ksnc_msg.ksm_zc_cookies[0];
+
+			rc = conn->ksnc_proto->pro_handle_zcack(conn, cookie,
+					       conn->ksnc_msg.ksm_zc_cookies[1]);
+
+			if (rc != 0) {
+				CERROR("%s: Unknown ZC-ACK cookie: "LPU64", "LPU64"\n",
+				       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+				       cookie, conn->ksnc_msg.ksm_zc_cookies[1]);
+				ksocknal_new_packet(conn, 0);
+				ksocknal_close_conn_and_siblings(conn, -EPROTO);
+				return (rc);
+			}
+		}
+
+		if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) {
+			ksocknal_new_packet (conn, 0);
+			return 0;       /* NOOP is done and just return */
+		}
+
+		/* KSOCK_MSG_LNET: set up a single iov to read the
+		 * ksock_lnet_msg_t that follows the header */
+		conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER;
+		conn->ksnc_rx_nob_wanted = sizeof(ksock_lnet_msg_t);
+		conn->ksnc_rx_nob_left = sizeof(ksock_lnet_msg_t);
+
+		conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+		conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg.ksm_u.lnetmsg;
+		conn->ksnc_rx_iov[0].iov_len  = sizeof(ksock_lnet_msg_t);
+
+		conn->ksnc_rx_niov = 1;
+		conn->ksnc_rx_kiov = NULL;
+		conn->ksnc_rx_nkiov = 0;
+
+		goto again;     /* read lnet header now */
+
+	case SOCKNAL_RX_LNET_HEADER:
+		/* unpack message header */
+		conn->ksnc_proto->pro_unpack(&conn->ksnc_msg);
+
+		if ((conn->ksnc_peer->ksnp_id.pid & LNET_PID_USERFLAG) != 0) {
+			/* Userspace peer */
+			lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr;
+			id   = &conn->ksnc_peer->ksnp_id;
+
+			/* Substitute process ID assigned at connection time */
+			lhdr->src_pid = cpu_to_le32(id->pid);
+			lhdr->src_nid = cpu_to_le64(id->nid);
+		}
+
+		conn->ksnc_rx_state = SOCKNAL_RX_PARSE;
+		ksocknal_conn_addref(conn);     /* ++ref while parsing */
+
+		rc = lnet_parse(conn->ksnc_peer->ksnp_ni,
+				&conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr,
+				conn->ksnc_peer->ksnp_id.nid, conn, 0);
+		if (rc < 0) {
+			/* I just received garbage: give up on this conn */
+			ksocknal_new_packet(conn, 0);
+			ksocknal_close_conn_and_siblings (conn, rc);
+			/* drop the parsing ref taken above */
+			ksocknal_conn_decref(conn);
+			return (-EPROTO);
+		}
+
+		/* I'm racing with ksocknal_recv() */
+		LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_PARSE ||
+			 conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD);
+
+		/* still SOCKNAL_RX_PARSE: ksocknal_recv() has not set up
+		 * the payload yet, so nothing more can be read for now */
+		if (conn->ksnc_rx_state != SOCKNAL_RX_LNET_PAYLOAD)
+			return 0;
+
+		/* ksocknal_recv() got called */
+		goto again;
+
+	case SOCKNAL_RX_LNET_PAYLOAD:
+		/* payload all received */
+		rc = 0;
+
+		if (conn->ksnc_rx_nob_left == 0 &&   /* not truncating */
+		    conn->ksnc_msg.ksm_csum != 0 &&  /* has checksum */
+		    conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) {
+			CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n",
+			       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+			       conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum);
+			rc = -EIO;
+		}
+
+		/* a non-zero first cookie is a zero-copy request */
+		if (rc == 0 && conn->ksnc_msg.ksm_zc_cookies[0] != 0) {
+			LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x);
+
+			lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr;
+			id   = &conn->ksnc_peer->ksnp_id;
+
+			/* last argument is non-zero when the nonblk_zcack
+			 * tunable is set or the header's source NID is not
+			 * this peer's NID */
+			rc = conn->ksnc_proto->pro_handle_zcreq(conn,
+					conn->ksnc_msg.ksm_zc_cookies[0],
+					*ksocknal_tunables.ksnd_nonblk_zcack ||
+					le64_to_cpu(lhdr->src_nid) != id->nid);
+		}
+
+		/* complete the message recorded in ksnc_cookie */
+		lnet_finalize(conn->ksnc_peer->ksnp_ni, conn->ksnc_cookie, rc);
+
+		if (rc != 0) {
+			ksocknal_new_packet(conn, 0);
+			ksocknal_close_conn_and_siblings (conn, rc);
+			return (-EPROTO);
+		}
+		/* Fall through */
+
+	case SOCKNAL_RX_SLOP:
+		/* starting new packet? */
+		if (ksocknal_new_packet (conn, conn->ksnc_rx_nob_left))
+			return 0;       /* come back later */
+		goto again;	     /* try to finish reading slop now */
+
+	default:
+		break;
+	}
+
+	/* Not Reached */
+	LBUG ();
+	return (-EINVAL);		       /* keep gcc happy */
+}
+
+/*
+ * Set up 'conn' (passed as 'private') to receive the payload of 'msg':
+ * 'mlen' bytes are wanted out of 'rlen' on the wire, delivered into the
+ * supplied iov or kiov fragments starting at 'offset'.
+ *
+ * Moves the connection to SOCKNAL_RX_LNET_PAYLOAD, putting it back on the
+ * scheduler's rx list if it was in SOCKNAL_RX_PARSE_WAIT, and drops one
+ * connection reference.  Always returns 0.
+ */
+int
+ksocknal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
+	      unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+	      unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+	ksock_conn_t *conn = (ksock_conn_t *)private;
+	ksock_sched_t *sched = conn->ksnc_scheduler;
+
+	LASSERT(mlen <= rlen);
+	LASSERT(niov <= LNET_MAX_IOV);
+
+	conn->ksnc_cookie = msg;
+	conn->ksnc_rx_nob_wanted = mlen;
+	conn->ksnc_rx_nob_left = rlen;
+
+	if (mlen != 0 && iov == NULL) {
+		/* page fragments */
+		conn->ksnc_rx_niov = 0;
+		conn->ksnc_rx_iov = NULL;
+		conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov;
+		conn->ksnc_rx_nkiov =
+			lnet_extract_kiov(LNET_MAX_IOV, conn->ksnc_rx_kiov,
+					  niov, kiov, offset, mlen);
+	} else {
+		/* virtual-address fragments (or nothing wanted at all) */
+		conn->ksnc_rx_nkiov = 0;
+		conn->ksnc_rx_kiov = NULL;
+		conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov;
+		conn->ksnc_rx_niov =
+			lnet_extract_iov(LNET_MAX_IOV, conn->ksnc_rx_iov,
+					 niov, iov, offset, mlen);
+	}
+
+	/* the extracted fragments must add up to exactly mlen */
+	LASSERT(mlen ==
+		lnet_iov_nob(conn->ksnc_rx_niov, conn->ksnc_rx_iov) +
+		lnet_kiov_nob(conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov));
+
+	LASSERT(conn->ksnc_rx_scheduled);
+
+	spin_lock_bh(&sched->kss_lock);
+
+	if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE_WAIT) {
+		/* put the conn back on the scheduler's rx list */
+		list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns);
+		wake_up(&sched->kss_waitq);
+		LASSERT(conn->ksnc_rx_ready);
+	}
+	/* in SOCKNAL_RX_PARSE the scheduler hasn't noticed I'm parsing
+	 * yet, so there is nothing to requeue */
+
+	conn->ksnc_rx_state = SOCKNAL_RX_LNET_PAYLOAD;
+
+	spin_unlock_bh(&sched->kss_lock);
+	ksocknal_conn_decref(conn);
+	return 0;
+}
+
+/*
+ * Report whether scheduler 'sched' has nothing to do: returns 1 when no
+ * shutdown is in progress and both its rx and tx connection lists are
+ * empty, 0 otherwise.  The check is made under kss_lock.
+ */
+static inline int
+ksocknal_sched_cansleep(ksock_sched_t *sched)
+{
+	int idle = 0;
+
+	spin_lock_bh(&sched->kss_lock);
+
+	if (!ksocknal_data.ksnd_shuttingdown &&
+	    list_empty(&sched->kss_rx_conns) &&
+	    list_empty(&sched->kss_tx_conns))
+		idle = 1;
+
+	spin_unlock_bh(&sched->kss_lock);
+	return idle;
+}
+
+/*
+ * Main loop of a socklnd scheduler thread.
+ *
+ * 'arg' carries the thread id as a long; KSOCK_THREAD_CPT() and
+ * KSOCK_THREAD_SID() map it to the per-CPT scheduler info and to the
+ * scheduler within it.  Until ksnd_shuttingdown is set, each iteration
+ * services at most one connection from kss_rx_conns and at most one from
+ * kss_tx_conns.  kss_lock is held while the queues and the per-connection
+ * ready/scheduled flags are manipulated and is dropped around the actual
+ * receive/transmit processing.  When an iteration did no work the thread
+ * sleeps on kss_waitq; after SOCKNAL_RESCHED iterations that did work it
+ * calls cond_resched().  Always returns 0.
+ */
+int ksocknal_scheduler(void *arg)
+{
+	struct ksock_sched_info	*info;
+	ksock_sched_t		*sched;
+	ksock_conn_t		*conn;
+	ksock_tx_t		*tx;
+	int			rc;
+	int			nloops = 0;
+	long			id = (long)arg;
+
+	info = ksocknal_data.ksnd_sched_info[KSOCK_THREAD_CPT(id)];
+	sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)];
+
+	cfs_block_allsigs();
+
+	/* a failure to bind is only logged; the thread keeps running */
+	rc = cfs_cpt_bind(lnet_cpt_table(), info->ksi_cpt);
+	if (rc != 0) {
+		CERROR("Can't set CPT affinity to %d: %d\n",
+		       info->ksi_cpt, rc);
+	}
+
+	spin_lock_bh(&sched->kss_lock);
+
+	while (!ksocknal_data.ksnd_shuttingdown) {
+		int did_something = 0;
+
+		/* Ensure I progress everything semi-fairly */
+
+		if (!list_empty (&sched->kss_rx_conns)) {
+			conn = list_entry(sched->kss_rx_conns.next,
+					      ksock_conn_t, ksnc_rx_list);
+			list_del(&conn->ksnc_rx_list);
+
+			LASSERT(conn->ksnc_rx_scheduled);
+			LASSERT(conn->ksnc_rx_ready);
+
+			/* clear rx_ready in case receive isn't complete.
+			 * Do it BEFORE we call process_recv, since
+			 * data_ready can set it any time after we release
+			 * kss_lock. */
+			conn->ksnc_rx_ready = 0;
+			spin_unlock_bh(&sched->kss_lock);
+
+			rc = ksocknal_process_receive(conn);
+
+			spin_lock_bh(&sched->kss_lock);
+
+			/* I'm the only one that can clear this flag */
+			LASSERT(conn->ksnc_rx_scheduled);
+
+			/* Did process_receive get everything it wanted? */
+			if (rc == 0)
+				conn->ksnc_rx_ready = 1;
+
+			if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) {
+				/* Conn blocked waiting for ksocknal_recv()
+				 * I change its state (under lock) to signal
+				 * it can be rescheduled */
+				conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT;
+			} else if (conn->ksnc_rx_ready) {
+				/* reschedule for rx */
+				list_add_tail (&conn->ksnc_rx_list,
+						   &sched->kss_rx_conns);
+			} else {
+				conn->ksnc_rx_scheduled = 0;
+				/* drop my ref */
+				ksocknal_conn_decref(conn);
+			}
+
+			did_something = 1;
+		}
+
+		if (!list_empty (&sched->kss_tx_conns)) {
+			LIST_HEAD    (zlist);
+
+			/* move every entry of kss_zombie_noop_txs onto the
+			 * local zlist: link zlist in as an extra node, then
+			 * unlink and re-initialise the original head */
+			if (!list_empty(&sched->kss_zombie_noop_txs)) {
+				list_add(&zlist,
+					     &sched->kss_zombie_noop_txs);
+				list_del_init(&sched->kss_zombie_noop_txs);
+			}
+
+			conn = list_entry(sched->kss_tx_conns.next,
+					      ksock_conn_t, ksnc_tx_list);
+			list_del (&conn->ksnc_tx_list);
+
+			LASSERT(conn->ksnc_tx_scheduled);
+			LASSERT(conn->ksnc_tx_ready);
+			LASSERT(!list_empty(&conn->ksnc_tx_queue));
+
+			tx = list_entry(conn->ksnc_tx_queue.next,
+					    ksock_tx_t, tx_list);
+
+			if (conn->ksnc_tx_carrier == tx)
+				ksocknal_next_tx_carrier(conn);
+
+			/* dequeue now so empty list => more to send */
+			list_del(&tx->tx_list);
+
+			/* Clear tx_ready in case send isn't complete.  Do
+			 * it BEFORE we call process_transmit, since
+			 * write_space can set it any time after we release
+			 * kss_lock. */
+			conn->ksnc_tx_ready = 0;
+			spin_unlock_bh(&sched->kss_lock);
+
+			if (!list_empty(&zlist)) {
+				/* free zombie noop txs, it's fast because
+				 * noop txs are just put in freelist */
+				ksocknal_txlist_done(NULL, &zlist, 0);
+			}
+
+			rc = ksocknal_process_transmit(conn, tx);
+
+			if (rc == -ENOMEM || rc == -EAGAIN) {
+				/* Incomplete send: replace tx on HEAD of tx_queue */
+				spin_lock_bh(&sched->kss_lock);
+				list_add(&tx->tx_list,
+					     &conn->ksnc_tx_queue);
+			} else {
+				/* Complete send; tx -ref */
+				ksocknal_tx_decref(tx);
+
+				spin_lock_bh(&sched->kss_lock);
+				/* assume space for more */
+				conn->ksnc_tx_ready = 1;
+			}
+
+			if (rc == -ENOMEM) {
+				/* Do nothing; after a short timeout, this
+				 * conn will be reposted on kss_tx_conns. */
+			} else if (conn->ksnc_tx_ready &&
+				   !list_empty (&conn->ksnc_tx_queue)) {
+				/* reschedule for tx */
+				list_add_tail (&conn->ksnc_tx_list,
+						   &sched->kss_tx_conns);
+			} else {
+				conn->ksnc_tx_scheduled = 0;
+				/* drop my ref */
+				ksocknal_conn_decref(conn);
+			}
+
+			did_something = 1;
+		}
+		/* NB nloops is only incremented on iterations that did work,
+		 * because of the short-circuit evaluation below */
+		if (!did_something ||	   /* nothing to do */
+		    ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */
+			spin_unlock_bh(&sched->kss_lock);
+
+			nloops = 0;
+
+			if (!did_something) {   /* wait for something to do */
+				cfs_wait_event_interruptible_exclusive(
+					sched->kss_waitq,
+					!ksocknal_sched_cansleep(sched), rc);
+				LASSERT (rc == 0);
+			} else {
+				cond_resched();
+			}
+
+			spin_lock_bh(&sched->kss_lock);
+		}
+	}
+
+	spin_unlock_bh(&sched->kss_lock);
+	ksocknal_thread_fini();
+	return 0;
+}
+
+/*
+ * Mark \a conn as ready to receive.  If its scheduler is not already
+ * progressing it, queue it on the scheduler's kss_rx_conns list (taking a
+ * connection reference on the scheduler's behalf) and wake the scheduler.
+ */
+void ksocknal_read_callback (ksock_conn_t *conn)
+{
+	ksock_sched_t *sched = conn->ksnc_scheduler;
+	ENTRY;
+
+	spin_lock_bh(&sched->kss_lock);
+
+	conn->ksnc_rx_ready = 1;
+
+	if (conn->ksnc_rx_scheduled)	/* already being progressed */
+		goto out_unlock;
+
+	/* extra ref for scheduler */
+	ksocknal_conn_addref(conn);
+	conn->ksnc_rx_scheduled = 1;
+	list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns);
+
+	wake_up(&sched->kss_waitq);
+
+out_unlock:
+	spin_unlock_bh(&sched->kss_lock);
+
+	EXIT;
+}
+
+/*
+ * Mark \a conn as ready to transmit.  If it has packets queued and its
+ * scheduler is not already progressing it, queue it on the scheduler's
+ * kss_tx_conns list (taking a connection reference on the scheduler's
+ * behalf) and wake the scheduler.
+ */
+void ksocknal_write_callback (ksock_conn_t *conn)
+{
+	ksock_sched_t *sched = conn->ksnc_scheduler;
+	ENTRY;
+
+	spin_lock_bh(&sched->kss_lock);
+
+	conn->ksnc_tx_ready = 1;
+
+	/* nothing to do if already being progressed or nothing to send */
+	if (conn->ksnc_tx_scheduled ||
+	    list_empty(&conn->ksnc_tx_queue))
+		goto out_unlock;
+
+	/* extra ref for scheduler */
+	ksocknal_conn_addref(conn);
+	conn->ksnc_tx_scheduled = 1;
+	list_add_tail(&conn->ksnc_tx_list, &sched->kss_tx_conns);
+
+	wake_up(&sched->kss_waitq);
+
+out_unlock:
+	spin_unlock_bh(&sched->kss_lock);
+
+	EXIT;
+}
+
+/*
+ * Select the protocol implementation matching the magic/version fields at
+ * the start of \a hello.
+ *
+ * A hello carrying LNET_PROTO_MAGIC (in either byte order) and a non-zero
+ * version selects the V2 or V3 protocol; a hello carrying
+ * LNET_PROTO_TCP_MAGIC is matched against the V1 major/minor numbers.
+ * Returns the protocol table, or NULL if nothing matches.
+ */
+ksock_proto_t *
+ksocknal_parse_proto_version (ksock_hello_msg_t *hello)
+{
+	lnet_magicversion_t *hmv;
+	__u32		     version;
+
+	if (hello->kshm_magic == LNET_PROTO_MAGIC)
+		version = hello->kshm_version;
+	else if (hello->kshm_magic == __swab32(LNET_PROTO_MAGIC))
+		version = __swab32(hello->kshm_version);
+	else
+		version = 0;
+
+	if (version == 0) {
+		/* no V2/V3 version available: only a V1 hello can match */
+		if (hello->kshm_magic != le32_to_cpu(LNET_PROTO_TCP_MAGIC))
+			return NULL;
+
+		CLASSERT (sizeof (lnet_magicversion_t) ==
+			  offsetof (ksock_hello_msg_t, kshm_src_nid));
+
+		hmv = (lnet_magicversion_t *)hello;
+
+		if (hmv->version_major != cpu_to_le16 (KSOCK_PROTO_V1_MAJOR) ||
+		    hmv->version_minor != cpu_to_le16 (KSOCK_PROTO_V1_MINOR))
+			return NULL;
+
+		return &ksocknal_protocol_v1x;
+	}
+
+#if SOCKNAL_VERSION_DEBUG
+	/* debug tunable: refuse versions newer than the forced protocol */
+	if (*ksocknal_tunables.ksnd_protocol == 1)
+		return NULL;
+
+	if (*ksocknal_tunables.ksnd_protocol == 2 &&
+	    version == KSOCK_PROTO_V3)
+		return NULL;
+#endif
+	if (version == KSOCK_PROTO_V2)
+		return &ksocknal_protocol_v2x;
+
+	if (version == KSOCK_PROTO_V3)
+		return &ksocknal_protocol_v3x;
+
+	return NULL;
+}
+
+/*
+ * Fill in the source/destination identity fields of \a hello and send it
+ * through the connection's protocol-specific pro_send_hello() method,
+ * whose return value is passed back to the caller.
+ *
+ * CAVEAT EMPTOR: this byte flips 'ipaddrs'
+ */
+int
+ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+		     lnet_nid_t peer_nid, ksock_hello_msg_t *hello)
+{
+	ksock_net_t *net = (ksock_net_t *)ni->ni_data;
+
+	LASSERT (hello->kshm_nips <= LNET_MAX_INTERFACES);
+
+	/* rely on caller to hold a ref on socket so it wouldn't disappear */
+	LASSERT (conn->ksnc_proto != NULL);
+
+	/* who I am ... */
+	hello->kshm_src_nid = ni->ni_nid;
+	hello->kshm_src_pid = the_lnet.ln_pid;
+	hello->kshm_src_incarnation = net->ksnn_incarnation;
+
+	/* ... who I'm talking to, and over which connection type */
+	hello->kshm_dst_nid = peer_nid;
+	hello->kshm_ctype = conn->ksnc_type;
+
+	return conn->ksnc_proto->pro_send_hello(conn, hello);
+}
+
+/*
+ * Map a connection type to the type seen from the other end of the
+ * connection: ANY and CONTROL map to themselves, BULK_IN and BULK_OUT swap,
+ * and anything else yields SOCKLND_CONN_NONE.
+ */
+int
+ksocknal_invert_type(int type)
+{
+	if (type == SOCKLND_CONN_ANY || type == SOCKLND_CONN_CONTROL)
+		return type;
+
+	if (type == SOCKLND_CONN_BULK_IN)
+		return SOCKLND_CONN_BULK_OUT;
+
+	if (type == SOCKLND_CONN_BULK_OUT)
+		return SOCKLND_CONN_BULK_IN;
+
+	return SOCKLND_CONN_NONE;
+}
+
+/*
+ * Receive and validate the peer's HELLO on \a conn.
+ *
+ * The magic and version words are read first to pick the protocol; the
+ * rest of the message is then read by that protocol's pro_recv_hello().
+ * A connection is "active" (initiated by this node) when conn->ksnc_proto
+ * is already set on entry, and "passive" otherwise.
+ *
+ * \param ni		network interface the connection belongs to
+ * \param conn		connection to read from; ksnc_proto is updated, and
+ *			on passive connections ksnc_type is set as well
+ * \param hello		buffer that receives the HELLO message
+ * \param peerid	active: the identity the peer is expected to claim;
+ *			passive: filled in with the peer's identity
+ * \param incarnation	set to the peer's incarnation from the HELLO
+ *
+ * Return < 0	fatal error
+ *	0	  success
+ *	EALREADY   lost connection race
+ *	EPROTO     protocol version mismatch
+ * (NB the last two are positive values.)
+ */
+int
+ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+		     ksock_hello_msg_t *hello, lnet_process_id_t *peerid,
+		     __u64 *incarnation)
+{
+	socket_t	*sock = conn->ksnc_sock;
+	int		  active = (conn->ksnc_proto != NULL);
+	int		  timeout;
+	int		  proto_match;
+	int		  rc;
+	ksock_proto_t       *proto;
+	lnet_process_id_t    recv_id;
+
+	/* socket type set on active connections - not set on passive */
+	LASSERT (!active == !(conn->ksnc_type != SOCKLND_CONN_NONE));
+
+	timeout = active ? *ksocknal_tunables.ksnd_timeout :
+			    lnet_acceptor_timeout();
+
+	rc = libcfs_sock_read(sock, &hello->kshm_magic, sizeof (hello->kshm_magic), timeout);
+	if (rc != 0) {
+		CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+			rc, HIPQUAD(conn->ksnc_ipaddr));
+		LASSERT (rc < 0);
+		return rc;
+	}
+
+	if (hello->kshm_magic != LNET_PROTO_MAGIC &&
+	    hello->kshm_magic != __swab32(LNET_PROTO_MAGIC) &&
+	    hello->kshm_magic != le32_to_cpu (LNET_PROTO_TCP_MAGIC)) {
+		/* Unexpected magic! */
+		CERROR ("Bad magic(1) %#08x (%#08x expected) from "
+			"%u.%u.%u.%u\n", __cpu_to_le32 (hello->kshm_magic),
+			LNET_PROTO_TCP_MAGIC,
+			HIPQUAD(conn->ksnc_ipaddr));
+		return -EPROTO;
+	}
+
+	rc = libcfs_sock_read(sock, &hello->kshm_version,
+			      sizeof(hello->kshm_version), timeout);
+	if (rc != 0) {
+		CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+			rc, HIPQUAD(conn->ksnc_ipaddr));
+		LASSERT (rc < 0);
+		return rc;
+	}
+
+	proto = ksocknal_parse_proto_version(hello);
+	if (proto == NULL) {
+		if (!active) {
+			/* unknown protocol from peer, tell peer my protocol */
+			conn->ksnc_proto = &ksocknal_protocol_v3x;
+#if SOCKNAL_VERSION_DEBUG
+			if (*ksocknal_tunables.ksnd_protocol == 2)
+				conn->ksnc_proto = &ksocknal_protocol_v2x;
+			else if (*ksocknal_tunables.ksnd_protocol == 1)
+				conn->ksnc_proto = &ksocknal_protocol_v1x;
+#endif
+			hello->kshm_nips = 0;
+			ksocknal_send_hello(ni, conn, ni->ni_nid, hello);
+		}
+
+		/* ksnc_proto is non-NULL here: it was either set on entry
+		 * (active) or assigned just above (passive) */
+		CERROR ("Unknown protocol version (%d.x expected)"
+			" from %u.%u.%u.%u\n",
+			conn->ksnc_proto->pro_version,
+			HIPQUAD(conn->ksnc_ipaddr));
+
+		return -EPROTO;
+	}
+
+	/* remember whether the peer speaks the protocol I started with
+	 * before switching to the one it announced */
+	proto_match = (conn->ksnc_proto == proto);
+	conn->ksnc_proto = proto;
+
+	/* receive the rest of hello message anyway */
+	rc = conn->ksnc_proto->pro_recv_hello(conn, hello, timeout);
+	if (rc != 0) {
+		CERROR("Error %d reading or checking hello from %u.%u.%u.%u\n",
+		       rc, HIPQUAD(conn->ksnc_ipaddr));
+		LASSERT (rc < 0);
+		return rc;
+	}
+
+	*incarnation = hello->kshm_src_incarnation;
+
+	if (hello->kshm_src_nid == LNET_NID_ANY) {
+		CERROR("Expecting a HELLO hdr with a NID, but got LNET_NID_ANY "
+		       "from %u.%u.%u.%u\n", HIPQUAD(conn->ksnc_ipaddr));
+		return -EPROTO;
+	}
+
+	if (!active &&
+	    conn->ksnc_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {
+		/* Userspace NAL assigns peer process ID from socket */
+		recv_id.pid = conn->ksnc_port | LNET_PID_USERFLAG;
+		recv_id.nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), conn->ksnc_ipaddr);
+	} else {
+		recv_id.nid = hello->kshm_src_nid;
+		recv_id.pid = hello->kshm_src_pid;
+	}
+
+	if (!active) {
+		*peerid = recv_id;
+
+		/* peer determines type */
+		conn->ksnc_type = ksocknal_invert_type(hello->kshm_ctype);
+		if (conn->ksnc_type == SOCKLND_CONN_NONE) {
+			CERROR ("Unexpected type %d from %s ip %u.%u.%u.%u\n",
+				hello->kshm_ctype, libcfs_id2str(*peerid),
+				HIPQUAD(conn->ksnc_ipaddr));
+			return -EPROTO;
+		}
+
+		return 0;
+	}
+
+	/* active connection: the peer must be who I meant to connect to */
+	if (peerid->pid != recv_id.pid ||
+	    peerid->nid != recv_id.nid) {
+		LCONSOLE_ERROR_MSG(0x130, "Connected successfully to %s on host"
+				   " %u.%u.%u.%u, but they claimed they were "
+				   "%s; please check your Lustre "
+				   "configuration.\n",
+				   libcfs_id2str(*peerid),
+				   HIPQUAD(conn->ksnc_ipaddr),
+				   libcfs_id2str(recv_id));
+		return -EPROTO;
+	}
+
+	if (hello->kshm_ctype == SOCKLND_CONN_NONE) {
+		/* Possible protocol mismatch or I lost the connection race */
+		return proto_match ? EALREADY : EPROTO;
+	}
+
+	if (ksocknal_invert_type(hello->kshm_ctype) != conn->ksnc_type) {
+		CERROR ("Mismatched types: me %d, %s ip %u.%u.%u.%u %d\n",
+			conn->ksnc_type, libcfs_id2str(*peerid),
+			HIPQUAD(conn->ksnc_ipaddr),
+			hello->kshm_ctype);
+		return -EPROTO;
+	}
+
+	return 0;
+}
+
+/*
+ * Establish the connections still wanted on \a route.
+ *
+ * Loops creating one connection per wanted type (ANY, CONTROL, BULK_IN,
+ * BULK_OUT, in that order of preference) until nothing more is wanted, the
+ * peer or route goes away, or a retry is required.  ksnd_global_lock is
+ * held for writing while route/peer state is examined and dropped around
+ * lnet_connect() and ksocknal_create_conn().
+ *
+ * Returns non-zero if the route was re-queued for a later attempt via
+ * ksocknal_launch_connection_locked(), 0 otherwise.  On the 'failed' path
+ * the retry interval is doubled and clamped between the min/max reconnect
+ * tunables, and 0 is returned.
+ */
+int
+ksocknal_connect (ksock_route_t *route)
+{
+	LIST_HEAD    (zombies);
+	ksock_peer_t     *peer = route->ksnr_peer;
+	int	       type;
+	int	       wanted;
+	socket_t     *sock;
+	cfs_time_t	deadline;
+	int	       retry_later = 0;
+	int	       rc = 0;
+
+	/* the whole connect sequence must finish within ksnd_timeout secs */
+	deadline = cfs_time_add(cfs_time_current(),
+				cfs_time_seconds(*ksocknal_tunables.ksnd_timeout));
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	LASSERT (route->ksnr_scheduled);
+	LASSERT (!route->ksnr_connecting);
+
+	route->ksnr_connecting = 1;
+
+	for (;;) {
+		/* connection types the route should have but doesn't yet */
+		wanted = ksocknal_route_mask() & ~route->ksnr_connected;
+
+		/* stop connecting if peer/route got closed under me, or
+		 * route got connected while queued */
+		if (peer->ksnp_closing || route->ksnr_deleted ||
+		    wanted == 0) {
+			retry_later = 0;
+			break;
+		}
+
+		/* reschedule if peer is connecting to me */
+		if (peer->ksnp_accepting > 0) {
+			CDEBUG(D_NET,
+			       "peer %s(%d) already connecting to me, retry later.\n",
+			       libcfs_nid2str(peer->ksnp_id.nid), peer->ksnp_accepting);
+			retry_later = 1;
+		}
+
+		if (retry_later) /* needs reschedule */
+			break;
+
+		if ((wanted & (1 << SOCKLND_CONN_ANY)) != 0) {
+			type = SOCKLND_CONN_ANY;
+		} else if ((wanted & (1 << SOCKLND_CONN_CONTROL)) != 0) {
+			type = SOCKLND_CONN_CONTROL;
+		} else if ((wanted & (1 << SOCKLND_CONN_BULK_IN)) != 0) {
+			type = SOCKLND_CONN_BULK_IN;
+		} else {
+			LASSERT ((wanted & (1 << SOCKLND_CONN_BULK_OUT)) != 0);
+			type = SOCKLND_CONN_BULK_OUT;
+		}
+
+		write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+		if (cfs_time_aftereq(cfs_time_current(), deadline)) {
+			rc = -ETIMEDOUT;
+			lnet_connect_console_error(rc, peer->ksnp_id.nid,
+						   route->ksnr_ipaddr,
+						   route->ksnr_port);
+			goto failed;
+		}
+
+		rc = lnet_connect(&sock, peer->ksnp_id.nid,
+				  route->ksnr_myipaddr,
+				  route->ksnr_ipaddr, route->ksnr_port);
+		if (rc != 0)
+			goto failed;
+
+		rc = ksocknal_create_conn(peer->ksnp_ni, route, sock, type);
+		if (rc < 0) {
+			lnet_connect_console_error(rc, peer->ksnp_id.nid,
+						   route->ksnr_ipaddr,
+						   route->ksnr_port);
+			goto failed;
+		}
+
+		/* A +ve RC means I have to retry because I lost the connection
+		 * race or I have to renegotiate protocol version */
+		retry_later = (rc != 0);
+		if (retry_later)
+			CDEBUG(D_NET, "peer %s: conn race, retry later.\n",
+			       libcfs_nid2str(peer->ksnp_id.nid));
+
+		write_lock_bh(&ksocknal_data.ksnd_global_lock);
+	}
+
+	/* NB the loop is only left via 'break' with the lock held */
+	route->ksnr_scheduled = 0;
+	route->ksnr_connecting = 0;
+
+	if (retry_later) {
+		/* re-queue for attention; this frees me up to handle
+		 * the peer's incoming connection request */
+
+		if (rc == EALREADY ||
+		    (rc == 0 && peer->ksnp_accepting > 0)) {
+			/* We want to introduce a delay before next
+			 * attempt to connect if we lost conn race,
+			 * but the race is resolved quickly usually,
+			 * so min_reconnectms should be good heuristic */
+			route->ksnr_retry_interval =
+				cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000;
+			route->ksnr_timeout = cfs_time_add(cfs_time_current(),
+							   route->ksnr_retry_interval);
+		}
+
+		ksocknal_launch_connection_locked(route);
+	}
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+	return retry_later;
+
+ failed:
+	/* every 'goto failed' is taken with the global lock dropped */
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	route->ksnr_scheduled = 0;
+	route->ksnr_connecting = 0;
+
+	/* This is a retry rather than a new connection */
+	route->ksnr_retry_interval *= 2;
+	route->ksnr_retry_interval =
+		MAX(route->ksnr_retry_interval,
+		    cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000);
+	route->ksnr_retry_interval =
+		MIN(route->ksnr_retry_interval,
+		    cfs_time_seconds(*ksocknal_tunables.ksnd_max_reconnectms)/1000);
+
+	LASSERT (route->ksnr_retry_interval != 0);
+	route->ksnr_timeout = cfs_time_add(cfs_time_current(),
+					   route->ksnr_retry_interval);
+
+	if (!list_empty(&peer->ksnp_tx_queue) &&
+	    peer->ksnp_accepting == 0 &&
+	    ksocknal_find_connecting_route_locked(peer) == NULL) {
+		ksock_conn_t *conn;
+
+		/* ksnp_tx_queue is queued on a conn on successful
+		 * connection for V1.x and V2.x */
+		if (!list_empty (&peer->ksnp_conns)) {
+			conn = list_entry(peer->ksnp_conns.next,
+					      ksock_conn_t, ksnc_list);
+			LASSERT (conn->ksnc_proto == &ksocknal_protocol_v3x);
+		}
+
+		/* take all the blocked packets while I've got the lock and
+		 * complete below... */
+		list_splice_init(&peer->ksnp_tx_queue, &zombies);
+	}
+
+#if 0	   /* irrelevant with only eager routes */
+	if (!route->ksnr_deleted) {
+		/* make this route least-favourite for re-selection */
+		list_del(&route->ksnr_list);
+		list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
+	}
+#endif
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	ksocknal_peer_failed(peer);
+	/* 'zombies' is empty unless blocked packets were spliced above */
+	ksocknal_txlist_done(peer->ksnp_ni, &zombies, 1);
+	return 0;
+}
+
+/*
+ * Check whether another connd thread is needed and, if so, try to start
+ * one.  ksnd_connd_lock is released around the thread creation and
+ * re-acquired before returning.
+ *
+ * \param sec		current time in seconds
+ * \param timeout	set to one second when a thread start failed within
+ *			the last second, so the caller doesn't keep trying
+ *			while resources are short
+ *
+ * Returns 1 if a thread start was attempted (successfully or not), in
+ * which case ksnd_connd_lock was dropped in the meantime; 0 if nothing
+ * was done.
+ */
+static int
+ksocknal_connd_check_start(long sec, long *timeout)
+{
+	char name[16];
+	int rc;
+	int total = ksocknal_data.ksnd_connd_starting +
+		    ksocknal_data.ksnd_connd_running;
+
+	/* still in initializing */
+	if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL))
+		return 0;
+
+	/* can't create more connd, or still have enough
+	 * threads to handle more connecting */
+	if (total >= *ksocknal_tunables.ksnd_nconnds_max ||
+	    total > ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV)
+		return 0;
+
+	/* no pending connecting request */
+	if (list_empty(&ksocknal_data.ksnd_connd_routes))
+		return 0;
+
+	if (sec - ksocknal_data.ksnd_connd_failed_stamp <= 1) {
+		/* may run out of resource, retry later */
+		*timeout = cfs_time_seconds(1);
+		return 0;
+	}
+
+	/* serialize starting to avoid flood */
+	if (ksocknal_data.ksnd_connd_starting > 0)
+		return 0;
+
+	ksocknal_data.ksnd_connd_starting++;
+	ksocknal_data.ksnd_connd_starting_stamp = sec;
+	spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+
+	/* NB: total is the next id */
+	snprintf(name, sizeof(name), "socknal_cd%02d", total);
+	rc = ksocknal_thread_start(ksocknal_connd, NULL, name);
+
+	spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+	if (rc != 0) {
+		/* we tried ... undo the accounting and note when it failed */
+		LASSERT(ksocknal_data.ksnd_connd_starting > 0);
+		ksocknal_data.ksnd_connd_starting--;
+		ksocknal_data.ksnd_connd_failed_stamp = cfs_time_current_sec();
+	}
+
+	/* the lock was dropped either way */
+	return 1;
+}
+
+/*
+ * Check whether the calling connd thread may exit.
+ *
+ * Returns 1 only if more than ksnd_nconnds threads are running, no thread
+ * is being started, none was started within the last SOCKNAL_CONND_TIMEOUT
+ * seconds, and more threads are running than are needed for the current
+ * connecting load plus the reserve.  Otherwise returns 0.
+ *
+ * \a timeout is updated, once the first three conditions above are being
+ * evaluated, so that the caller comes back to recheck.
+ */
+static int
+ksocknal_connd_check_stop(long sec, long *timeout)
+{
+	int remaining;
+
+	/* still initializing, a new thread is being started, or already
+	 * down to the minimum number of threads: can't shrink */
+	if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL) ||
+	    ksocknal_data.ksnd_connd_starting > 0 ||
+	    ksocknal_data.ksnd_connd_running <=
+	    *ksocknal_tunables.ksnd_nconnds)
+		return 0;
+
+	/* seconds left until the last thread creation is old enough */
+	remaining = (int)(ksocknal_data.ksnd_connd_starting_stamp +
+			  SOCKNAL_CONND_TIMEOUT - sec);
+	if (remaining > 0) {
+		/* created a thread recently; recheck when that expires */
+		*timeout = cfs_time_seconds(remaining);
+		return 0;
+	}
+
+	/* no creating in the recent past */
+	*timeout = cfs_time_seconds(SOCKNAL_CONND_TIMEOUT);
+
+	return ksocknal_data.ksnd_connd_running >
+	       ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV;
+}
+
+/*
+ * Scan ksnd_connd_routes for the first route that can be processed now,
+ * i.e. one with no retry interval or whose ksnr_timeout has been reached.
+ * Returns that route, or NULL if every queued route must wait; for each
+ * route skipped, \a timeout_p is lowered to the time remaining until that
+ * route becomes due if that is sooner (or if it was still
+ * MAX_SCHEDULE_TIMEOUT).
+ */
+static ksock_route_t *
+ksocknal_connd_get_route_locked(signed long *timeout_p)
+{
+	cfs_time_t     now = cfs_time_current();
+	ksock_route_t *route;
+	int	       wait;
+
+	/* connd_routes can contain both pending and ordinary routes */
+	list_for_each_entry (route, &ksocknal_data.ksnd_connd_routes,
+				 ksnr_connd_list) {
+
+		if (route->ksnr_retry_interval == 0 ||
+		    cfs_time_aftereq(now, route->ksnr_timeout))
+			return route;
+
+		/* not due yet: remember the earliest wake-up needed */
+		wait = (int)(route->ksnr_timeout - now);
+
+		if (*timeout_p == MAX_SCHEDULE_TIMEOUT ||
+		    (int)*timeout_p > wait)
+			*timeout_p = wait;
+	}
+
+	return NULL;
+}
+
+/*
+ * Main loop of a connection daemon (connd) thread.
+ *
+ * On entry the thread moves itself from the 'starting' to the 'running'
+ * count.  Until ksnd_shuttingdown is set, each iteration:
+ *  - exits the loop if ksocknal_connd_check_stop() says this thread may go;
+ *  - lets ksocknal_connd_check_start() try to start another connd;
+ *  - handles one accepted connection request from ksnd_connd_connreqs;
+ *  - handles one outgoing route, provided enough threads remain in reserve;
+ *  - otherwise sleeps on ksnd_connd_waitq for up to 'timeout'.
+ * ksnd_connd_lock is held except around the blocking work.  'arg' is
+ * unused.  Always returns 0.
+ */
+int
+ksocknal_connd (void *arg)
+{
+	spinlock_t    *connd_lock = &ksocknal_data.ksnd_connd_lock;
+	ksock_connreq_t   *cr;
+	wait_queue_t     wait;
+	int		nloops = 0;
+	int		cons_retry = 0;
+
+	cfs_block_allsigs ();
+
+	init_waitqueue_entry_current (&wait);
+
+	spin_lock_bh(connd_lock);
+
+	/* I'm no longer starting, I'm running */
+	LASSERT(ksocknal_data.ksnd_connd_starting > 0);
+	ksocknal_data.ksnd_connd_starting--;
+	ksocknal_data.ksnd_connd_running++;
+
+	while (!ksocknal_data.ksnd_shuttingdown) {
+		ksock_route_t *route = NULL;
+		long sec = cfs_time_current_sec();
+		long timeout = MAX_SCHEDULE_TIMEOUT;
+		int  dropped_lock = 0;
+
+		if (ksocknal_connd_check_stop(sec, &timeout)) {
+			/* wakeup another one to check stop */
+			wake_up(&ksocknal_data.ksnd_connd_waitq);
+			break;
+		}
+
+		if (ksocknal_connd_check_start(sec, &timeout)) {
+			/* created new thread */
+			dropped_lock = 1;
+		}
+
+		if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) {
+			/* Connection accepted by the listener */
+			cr = list_entry(ksocknal_data.ksnd_connd_connreqs. \
+					    next, ksock_connreq_t, ksncr_list);
+
+			list_del(&cr->ksncr_list);
+			spin_unlock_bh(connd_lock);
+			dropped_lock = 1;
+
+			/* NB the return value of ksocknal_create_conn() is
+			 * not checked here */
+			ksocknal_create_conn(cr->ksncr_ni, NULL,
+					     cr->ksncr_sock, SOCKLND_CONN_NONE);
+			lnet_ni_decref(cr->ksncr_ni);
+			LIBCFS_FREE(cr, sizeof(*cr));
+
+			spin_lock_bh(connd_lock);
+		}
+
+		/* Only handle an outgoing connection request if there
+		 * is a thread left to handle incoming connections and
+		 * create new connd */
+		if (ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV <
+		    ksocknal_data.ksnd_connd_running) {
+			route = ksocknal_connd_get_route_locked(&timeout);
+		}
+		if (route != NULL) {
+			list_del (&route->ksnr_connd_list);
+			ksocknal_data.ksnd_connd_connecting++;
+			spin_unlock_bh(connd_lock);
+			dropped_lock = 1;
+
+			/* non-zero means the route was queued for retry */
+			if (ksocknal_connect(route)) {
+				/* consecutive retry */
+				if (cons_retry++ > SOCKNAL_INSANITY_RECONN) {
+					CWARN("massive consecutive "
+					      "re-connecting to %u.%u.%u.%u\n",
+					      HIPQUAD(route->ksnr_ipaddr));
+					cons_retry = 0;
+				}
+			} else {
+				cons_retry = 0;
+			}
+
+			ksocknal_route_decref(route);
+
+			spin_lock_bh(connd_lock);
+			ksocknal_data.ksnd_connd_connecting--;
+		}
+
+		if (dropped_lock) {
+			/* state may have changed while the lock was dropped:
+			 * go round again rather than sleeping, yielding the
+			 * CPU every SOCKNAL_RESCHED iterations */
+			if (++nloops < SOCKNAL_RESCHED)
+				continue;
+			spin_unlock_bh(connd_lock);
+			nloops = 0;
+			cond_resched();
+			spin_lock_bh(connd_lock);
+			continue;
+		}
+
+		/* Nothing to do for 'timeout'  */
+		/* NB I queue myself on the waitq before dropping the lock */
+		set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue_exclusive(&ksocknal_data.ksnd_connd_waitq, &wait);
+		spin_unlock_bh(connd_lock);
+
+		nloops = 0;
+		waitq_timedwait(&wait, TASK_INTERRUPTIBLE, timeout);
+
+		set_current_state(TASK_RUNNING);
+		remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait);
+		spin_lock_bh(connd_lock);
+	}
+	ksocknal_data.ksnd_connd_running--;
+	spin_unlock_bh(connd_lock);
+
+	ksocknal_thread_fini();
+	return 0;
+}
+
+/*
+ * Look for a connection of \a peer that should be closed: one whose socket
+ * reports an error, one with a partially received message past its rx
+ * deadline, or one with data queued (on ksnc_tx_queue or in the socket's
+ * send buffer) past its tx deadline.
+ *
+ * Returns the first such connection with an extra reference taken (the
+ * caller must drop it), or NULL if none is found.
+ *
+ * We're called with a shared lock on ksnd_global_lock.
+ */
+ksock_conn_t *
+ksocknal_find_timed_out_conn (ksock_peer_t *peer)
+{
+	ksock_conn_t      *conn;
+
+	list_for_each_entry (conn, &peer->ksnp_conns, ksnc_list) {
+		int     error;
+
+		/* Don't need the {get,put}connsock dance to deref ksnc_sock */
+		LASSERT (!conn->ksnc_closing);
+
+		/* SOCK_ERROR will reset error code of socket in
+		 * some platform (like Darwin8.x) */
+		error = cfs_sock_error(conn->ksnc_sock);
+		if (error != 0) {
+			ksocknal_conn_addref(conn);
+
+			switch (error) {
+			case ECONNRESET:
+				CNETERR("A connection with %s "
+					"(%u.%u.%u.%u:%d) was reset; "
+					"it may have rebooted.\n",
+					libcfs_id2str(peer->ksnp_id),
+					HIPQUAD(conn->ksnc_ipaddr),
+					conn->ksnc_port);
+				break;
+			case ETIMEDOUT:
+				CNETERR("A connection with %s "
+					"(%u.%u.%u.%u:%d) timed out; the "
+					"network or node may be down.\n",
+					libcfs_id2str(peer->ksnp_id),
+					HIPQUAD(conn->ksnc_ipaddr),
+					conn->ksnc_port);
+				break;
+			default:
+				CNETERR("An unexpected network error %d "
+					"occurred with %s "
+					"(%u.%u.%u.%u:%d)\n", error,
+					libcfs_id2str(peer->ksnp_id),
+					HIPQUAD(conn->ksnc_ipaddr),
+					conn->ksnc_port);
+				break;
+			}
+
+			return (conn);
+		}
+
+		if (conn->ksnc_rx_started &&
+		    cfs_time_aftereq(cfs_time_current(),
+				     conn->ksnc_rx_deadline)) {
+			/* Timed out incomplete incoming message */
+			ksocknal_conn_addref(conn);
+			CNETERR("Timeout receiving from %s (%u.%u.%u.%u:%d), "
+				"state %d wanted %d left %d\n",
+				libcfs_id2str(peer->ksnp_id),
+				HIPQUAD(conn->ksnc_ipaddr),
+				conn->ksnc_port,
+				conn->ksnc_rx_state,
+				conn->ksnc_rx_nob_wanted,
+				conn->ksnc_rx_nob_left);
+			return (conn);
+		}
+
+		if ((!list_empty(&conn->ksnc_tx_queue) ||
+		     cfs_sock_wmem_queued(conn->ksnc_sock) != 0) &&
+		    cfs_time_aftereq(cfs_time_current(),
+				     conn->ksnc_tx_deadline)) {
+			/* Timed out messages queued for sending or
+			 * buffered in the socket's send buffer */
+			ksocknal_conn_addref(conn);
+			CNETERR("Timeout sending data to %s (%u.%u.%u.%u:%d) "
+				"the network or that node may be down.\n",
+				libcfs_id2str(peer->ksnp_id),
+				HIPQUAD(conn->ksnc_ipaddr),
+				conn->ksnc_port);
+			return (conn);
+		}
+	}
+
+	return (NULL);
+}
+
+/*
+ * Remove from the head of peer->ksnp_tx_queue every tx whose deadline has
+ * passed, stopping at the first one that has not expired, and hand the
+ * removed txs to ksocknal_txlist_done() once ksnd_global_lock has been
+ * released.
+ */
+static inline void
+ksocknal_flush_stale_txs(ksock_peer_t *peer)
+{
+	LIST_HEAD      (stale_txs);
+	ksock_tx_t	*tx;
+	ksock_tx_t	*next;
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	list_for_each_entry_safe (tx, next, &peer->ksnp_tx_queue, tx_list) {
+		/* the first tx still within its deadline ends the scan */
+		if (!cfs_time_aftereq(cfs_time_current(),
+				      tx->tx_deadline))
+			break;
+
+		list_del (&tx->tx_list);
+		list_add_tail (&tx->tx_list, &stale_txs);
+	}
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	ksocknal_txlist_done(peer->ksnp_ni, &stale_txs, 1);
+}
+
+/*
+ * Send a keepalive PING (a noop tx with cookie 1) to \a peer if one is due.
+ *
+ * Called with ksnd_global_lock held for reading.  Returns 0 without
+ * touching the lock when no keepalive is needed.  Otherwise the read lock
+ * is dropped while the tx is allocated and launched, and re-taken before
+ * returning 1 (launched), -ENOMEM (no tx) or -EIO (launch failed); i.e. a
+ * non-zero return means the lock was released in the meantime.
+ */
+int
+ksocknal_send_keepalive_locked(ksock_peer_t *peer)
+{
+	ksock_sched_t  *sched;
+	ksock_conn_t   *conn;
+	ksock_tx_t     *tx;
+	int		rc;
+
+	/* last_alive will be updated by create_conn */
+	if (list_empty(&peer->ksnp_conns))
+		return 0;
+
+	/* only sent to peers using the V3 protocol */
+	if (peer->ksnp_proto != &ksocknal_protocol_v3x)
+		return 0;
+
+	/* keepalive disabled, or the peer was seen alive recently enough */
+	if (*ksocknal_tunables.ksnd_keepalive <= 0 ||
+	    cfs_time_before(cfs_time_current(),
+			    cfs_time_add(peer->ksnp_last_alive,
+					 cfs_time_seconds(*ksocknal_tunables.ksnd_keepalive))))
+		return 0;
+
+	/* too soon since the previous attempt */
+	if (cfs_time_before(cfs_time_current(),
+			    peer->ksnp_send_keepalive))
+		return 0;
+
+	/* retry 10 secs later, so we wouldn't put pressure
+	 * on this peer if we failed to send keepalive this time */
+	peer->ksnp_send_keepalive = cfs_time_shift(10);
+
+	conn = ksocknal_find_conn_locked(peer, NULL, 1);
+	if (conn != NULL) {
+		int queued;
+
+		sched = conn->ksnc_scheduler;
+
+		spin_lock_bh(&sched->kss_lock);
+		queued = !list_empty(&conn->ksnc_tx_queue);
+		spin_unlock_bh(&sched->kss_lock);
+
+		/* there is an queued ACK, don't need keepalive */
+		if (queued)
+			return 0;
+	}
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+
+	/* cookie = 1 is reserved for keepalive PING */
+	tx = ksocknal_alloc_tx_noop(1, 1);
+	if (tx == NULL) {
+		rc = -ENOMEM;
+	} else if (ksocknal_launch_packet(peer->ksnp_ni, tx,
+					  peer->ksnp_id) == 0) {
+		rc = 1;
+	} else {
+		ksocknal_free_tx(tx);
+		rc = -EIO;
+	}
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	return rc;
+}
+
+
+/*
+ * Check every peer in hash bucket \a idx of ksnd_peers for work the reaper
+ * must do: sending a keepalive, closing a timed-out or failed connection,
+ * flushing expired txs from the peer's blocked tx queue, and closing a
+ * connection whose zero-copy requests have gone stale.
+ *
+ * The bucket is walked under a shared ksnd_global_lock.  Whenever that
+ * lock has to be dropped to act on a peer, the walk restarts from the
+ * head of the bucket ('again'), because the list may have changed.
+ */
+void
+ksocknal_check_peer_timeouts (int idx)
+{
+	struct list_head       *peers = &ksocknal_data.ksnd_peers[idx];
+	ksock_peer_t     *peer;
+	ksock_conn_t     *conn;
+	ksock_tx_t       *tx;
+
+ again:
+	/* NB. We expect to have a look at all the peers and not find any
+	 * connections to time out, so we just use a shared lock while we
+	 * take a look... */
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	list_for_each_entry(peer, peers, ksnp_list) {
+		cfs_time_t  deadline = 0;
+		int	 resid = 0;
+		int	 n     = 0;
+
+		/* non-zero means the shared lock was dropped and re-taken
+		 * inside, so the walk can't simply continue */
+		if (ksocknal_send_keepalive_locked(peer) != 0) {
+			read_unlock(&ksocknal_data.ksnd_global_lock);
+			goto again;
+		}
+
+		conn = ksocknal_find_timed_out_conn (peer);
+
+		if (conn != NULL) {
+			read_unlock(&ksocknal_data.ksnd_global_lock);
+
+			ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT);
+
+			/* NB we won't find this one again, but we can't
+			 * just proceed with the next peer, since we dropped
+			 * ksnd_global_lock and it might be dead already! */
+			ksocknal_conn_decref(conn);
+			goto again;
+		}
+
+		/* we can't process stale txs right here because we're
+		 * holding only shared lock */
+		if (!list_empty (&peer->ksnp_tx_queue)) {
+			/* named distinctly so it doesn't shadow the
+			 * function-scope 'tx' used further down */
+			ksock_tx_t *oldest =
+				list_entry (peer->ksnp_tx_queue.next,
+						ksock_tx_t, tx_list);
+
+			if (cfs_time_aftereq(cfs_time_current(),
+					     oldest->tx_deadline)) {
+
+				/* hold the peer across the unlocked flush */
+				ksocknal_peer_addref(peer);
+				read_unlock(&ksocknal_data.ksnd_global_lock);
+
+				ksocknal_flush_stale_txs(peer);
+
+				ksocknal_peer_decref(peer);
+				goto again;
+			}
+		}
+
+		if (list_empty(&peer->ksnp_zc_req_list))
+			continue;
+
+		/* count expired zero-copy requests, stopping at the first
+		 * one that hasn't reached its deadline */
+		spin_lock(&peer->ksnp_lock);
+		list_for_each_entry(tx, &peer->ksnp_zc_req_list, tx_zc_list) {
+			if (!cfs_time_aftereq(cfs_time_current(),
+					      tx->tx_deadline))
+				break;
+			/* ignore the TX if connection is being closed */
+			if (tx->tx_conn->ksnc_closing)
+				continue;
+			n++;
+		}
+
+		if (n == 0) {
+			spin_unlock(&peer->ksnp_lock);
+			continue;
+		}
+
+		/* sample the first request on the list while still locked;
+		 * 'tx' itself is only printed as a pointer below */
+		tx = list_entry(peer->ksnp_zc_req_list.next,
+				    ksock_tx_t, tx_zc_list);
+		deadline = tx->tx_deadline;
+		resid    = tx->tx_resid;
+		conn     = tx->tx_conn;
+		ksocknal_conn_addref(conn);
+
+		spin_unlock(&peer->ksnp_lock);
+		read_unlock(&ksocknal_data.ksnd_global_lock);
+
+		CERROR("Total %d stale ZC_REQs for peer %s detected; the "
+		       "oldest(%p) timed out %ld secs ago, "
+		       "resid: %d, wmem: %d\n",
+		       n, libcfs_nid2str(peer->ksnp_id.nid), tx,
+		       cfs_duration_sec(cfs_time_current() - deadline),
+		       resid, cfs_sock_wmem_queued(conn->ksnc_sock));
+
+		ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT);
+		ksocknal_conn_decref(conn);
+		goto again;
+	}
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+}
+
+/*
+ * Main loop of the reaper thread.
+ *
+ * Until ksnd_shuttingdown is set it:
+ *  - terminates connections queued on ksnd_deathrow_conns;
+ *  - destroys connections queued on ksnd_zombie_conns;
+ *  - puts connections from ksnd_enomem_conns back on their scheduler's
+ *    tx queue;
+ *  - whenever 'deadline' has passed, calls ksocknal_check_peer_timeouts()
+ *    on the next slice of the peer hash table;
+ *  - then sleeps on ksnd_reaper_waitq until the next deadline (or for
+ *    SOCKNAL_ENOMEM_RETRY if ENOMEM conns were just rescheduled).
+ * 'arg' is unused.  Always returns 0.
+ */
+int
+ksocknal_reaper (void *arg)
+{
+	wait_queue_t     wait;
+	ksock_conn_t      *conn;
+	ksock_sched_t     *sched;
+	struct list_head	 enomem_conns;
+	int		nenomem_conns;
+	cfs_duration_t     timeout;
+	int		i;
+	int		peer_index = 0;
+	cfs_time_t	 deadline = cfs_time_current();
+
+	cfs_block_allsigs ();
+
+	INIT_LIST_HEAD(&enomem_conns);
+	init_waitqueue_entry_current (&wait);
+
+	spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+	while (!ksocknal_data.ksnd_shuttingdown) {
+
+		if (!list_empty (&ksocknal_data.ksnd_deathrow_conns)) {
+			conn = list_entry (ksocknal_data. \
+					       ksnd_deathrow_conns.next,
+					       ksock_conn_t, ksnc_list);
+			list_del (&conn->ksnc_list);
+
+			spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+			ksocknal_terminate_conn(conn);
+			ksocknal_conn_decref(conn);
+
+			spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+			continue;
+		}
+
+		if (!list_empty (&ksocknal_data.ksnd_zombie_conns)) {
+			conn = list_entry (ksocknal_data.ksnd_zombie_conns.\
+					       next, ksock_conn_t, ksnc_list);
+			list_del (&conn->ksnc_list);
+
+			spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+			ksocknal_destroy_conn(conn);
+
+			spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+			continue;
+		}
+
+		/* move every entry of ksnd_enomem_conns onto my private
+		 * list: link my head in as an extra node, then unlink and
+		 * re-initialise the global head */
+		if (!list_empty (&ksocknal_data.ksnd_enomem_conns)) {
+			list_add(&enomem_conns,
+				     &ksocknal_data.ksnd_enomem_conns);
+			list_del_init(&ksocknal_data.ksnd_enomem_conns);
+		}
+
+		spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+		/* reschedule all the connections that stalled with ENOMEM... */
+		nenomem_conns = 0;
+		while (!list_empty (&enomem_conns)) {
+			conn = list_entry (enomem_conns.next,
+					       ksock_conn_t, ksnc_tx_list);
+			list_del (&conn->ksnc_tx_list);
+
+			sched = conn->ksnc_scheduler;
+
+			spin_lock_bh(&sched->kss_lock);
+
+			LASSERT(conn->ksnc_tx_scheduled);
+			conn->ksnc_tx_ready = 1;
+			list_add_tail(&conn->ksnc_tx_list,
+					  &sched->kss_tx_conns);
+			wake_up(&sched->kss_waitq);
+
+			spin_unlock_bh(&sched->kss_lock);
+			nenomem_conns++;
+		}
+
+		/* careful with the jiffy wrap... */
+		while ((timeout = cfs_time_sub(deadline,
+					       cfs_time_current())) <= 0) {
+			const int n = 4;
+			const int p = 1;
+			int       chunk = ksocknal_data.ksnd_peer_hash_size;
+
+			/* Time to check for timeouts on a few more peers: I do
+			 * checks every 'p' seconds on a proportion of the peer
+			 * table and I need to check every connection 'n' times
+			 * within a timeout interval, to ensure I detect a
+			 * timeout on any connection within (n+1)/n times the
+			 * timeout interval. */
+
+			if (*ksocknal_tunables.ksnd_timeout > n * p)
+				chunk = (chunk * n * p) /
+					*ksocknal_tunables.ksnd_timeout;
+			if (chunk == 0)
+				chunk = 1;
+
+			for (i = 0; i < chunk; i++) {
+				ksocknal_check_peer_timeouts (peer_index);
+				peer_index = (peer_index + 1) %
+					     ksocknal_data.ksnd_peer_hash_size;
+			}
+
+			deadline = cfs_time_add(deadline, cfs_time_seconds(p));
+		}
+
+		if (nenomem_conns != 0) {
+			/* Reduce my timeout if I rescheduled ENOMEM conns.
+			 * This also prevents me getting woken immediately
+			 * if any go back on my enomem list. */
+			timeout = SOCKNAL_ENOMEM_RETRY;
+		}
+		ksocknal_data.ksnd_reaper_waketime =
+			cfs_time_add(cfs_time_current(), timeout);
+
+		set_current_state (TASK_INTERRUPTIBLE);
+		add_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait);
+
+		/* only sleep if nothing turned up in the meantime; NB these
+		 * checks are made without ksnd_reaper_lock held */
+		if (!ksocknal_data.ksnd_shuttingdown &&
+		    list_empty (&ksocknal_data.ksnd_deathrow_conns) &&
+		    list_empty (&ksocknal_data.ksnd_zombie_conns))
+			waitq_timedwait (&wait, TASK_INTERRUPTIBLE,
+					     timeout);
+
+		set_current_state (TASK_RUNNING);
+		remove_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait);
+
+		spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+	}
+
+	spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+	ksocknal_thread_fini();
+	return 0;
+}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c
new file mode 100644
index 000000000000..3e08fe2d1489
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c
@@ -0,0 +1,1088 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include "socklnd.h"
+
+# if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+
+/* Ids used as .ctl_name by the sysctl entries in this file, starting at 1. */
+enum {
+	SOCKLND_TIMEOUT = 1,
+	SOCKLND_CREDITS,
+	SOCKLND_PEER_TXCREDITS,
+	SOCKLND_PEER_RTRCREDITS,
+	SOCKLND_PEER_TIMEOUT,
+	SOCKLND_NCONNDS,
+	SOCKLND_RECONNECTS_MIN,
+	SOCKLND_RECONNECTS_MAX,
+	SOCKLND_EAGER_ACK,
+	SOCKLND_ZERO_COPY,
+	SOCKLND_TYPED,
+	SOCKLND_BULK_MIN,
+	SOCKLND_RX_BUFFER_SIZE,
+	SOCKLND_TX_BUFFER_SIZE,
+	SOCKLND_NAGLE,
+	SOCKLND_IRQ_AFFINITY,
+	SOCKLND_ROUND_ROBIN,
+	SOCKLND_KEEPALIVE,
+	SOCKLND_KEEPALIVE_IDLE,
+	SOCKLND_KEEPALIVE_COUNT,
+	SOCKLND_KEEPALIVE_INTVL,
+	SOCKLND_BACKOFF_INIT,
+	SOCKLND_BACKOFF_MAX,
+	SOCKLND_PROTOCOL,
+	SOCKLND_ZERO_COPY_RECV,
+	SOCKLND_ZERO_COPY_RECV_MIN_NFRAGS
+};
+/* Sysctl entries exposing the socklnd tunables, one entry per tunable.
+ * NOTE(review): the tunables are dereferenced as pointers elsewhere in this
+ * file, so '&ksocknal_tunables.x' may not address the int -- confirm .data. */
+static ctl_table_t ksocknal_ctl_table[] = {
+	{
+		.ctl_name = SOCKLND_TIMEOUT,
+		.procname = "timeout",
+		.data     = &ksocknal_tunables.ksnd_timeout,
+		.maxlen   = sizeof (int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_CREDITS,
+		.procname = "credits",
+		.data     = &ksocknal_tunables.ksnd_credits,
+		.maxlen   = sizeof (int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_PEER_TXCREDITS,
+		.procname = "peer_credits",
+		.data     = &ksocknal_tunables.ksnd_peertxcredits,
+		.maxlen   = sizeof (int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_PEER_RTRCREDITS,
+		.procname = "peer_buffer_credits",
+		.data     = &ksocknal_tunables.ksnd_peerrtrcredits,
+		.maxlen   = sizeof (int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_PEER_TIMEOUT,
+		.procname = "peer_timeout",
+		.data     = &ksocknal_tunables.ksnd_peertimeout,
+		.maxlen   = sizeof (int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_NCONNDS,
+		.procname = "nconnds",
+		.data     = &ksocknal_tunables.ksnd_nconnds,
+		.maxlen   = sizeof (int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_RECONNECTS_MIN,
+		.procname = "min_reconnectms",
+		.data     = &ksocknal_tunables.ksnd_min_reconnectms,
+		.maxlen   = sizeof (int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_RECONNECTS_MAX,
+		.procname = "max_reconnectms",
+		.data     = &ksocknal_tunables.ksnd_max_reconnectms,
+		.maxlen   = sizeof (int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_EAGER_ACK,
+		.procname = "eager_ack",
+		.data     = &ksocknal_tunables.ksnd_eager_ack,
+		.maxlen   = sizeof (int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_ZERO_COPY,
+		.procname = "zero_copy",
+		.data     = &ksocknal_tunables.ksnd_zc_min_payload,
+		.maxlen   = sizeof (int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_ZERO_COPY_RECV,
+		.procname = "zero_copy_recv",
+		.data     = &ksocknal_tunables.ksnd_zc_recv,
+		.maxlen   = sizeof (int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_ZERO_COPY_RECV_MIN_NFRAGS,
+		.procname = "zero_copy_recv_min_nfrags",
+		.data     = &ksocknal_tunables.ksnd_zc_recv_min_nfrags,
+		.maxlen   = sizeof (int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_TYPED,
+		.procname = "typed",
+		.data     = &ksocknal_tunables.ksnd_typed_conns,
+		.maxlen   = sizeof (int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_BULK_MIN,
+		.procname = "min_bulk",
+		.data     = &ksocknal_tunables.ksnd_min_bulk,
+		.maxlen   = sizeof (int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_RX_BUFFER_SIZE,
+		.procname = "rx_buffer_size",
+		.data     = &ksocknal_tunables.ksnd_rx_buffer_size,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_TX_BUFFER_SIZE,
+		.procname = "tx_buffer_size",
+		.data     = &ksocknal_tunables.ksnd_tx_buffer_size,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_NAGLE,
+		.procname = "nagle",
+		.data     = &ksocknal_tunables.ksnd_nagle,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_ROUND_ROBIN,
+		.procname = "round_robin",
+		.data     = &ksocknal_tunables.ksnd_round_robin,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_KEEPALIVE,
+		.procname = "keepalive",
+		.data     = &ksocknal_tunables.ksnd_keepalive,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_KEEPALIVE_IDLE,
+		.procname = "keepalive_idle",
+		.data     = &ksocknal_tunables.ksnd_keepalive_idle,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_KEEPALIVE_COUNT,
+		.procname = "keepalive_count",
+		.data     = &ksocknal_tunables.ksnd_keepalive_count,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_KEEPALIVE_INTVL,
+		.procname = "keepalive_intvl",
+		.data     = &ksocknal_tunables.ksnd_keepalive_intvl,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+#if SOCKNAL_VERSION_DEBUG
+	{
+		.ctl_name = SOCKLND_PROTOCOL,
+		.procname = "protocol",
+		.data     = &ksocknal_tunables.ksnd_protocol,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+#endif
+	{0}
+};
+
+/* Root of the socklnd sysctl tree: a single directory entry, "socknal"
+ * (mode 0555), whose children are the entries of ksocknal_ctl_table. */
+ctl_table_t ksocknal_top_ctl_table[] = {
+	{
+		.ctl_name = CTL_SOCKLND,
+		.procname = "socknal",
+		.mode     = 0555,
+		.child    = ksocknal_ctl_table,
+	},
+	{ 0 }
+};
+
+/* Check and clamp the tunables, then register the sysctl tree; 0 or -EINVAL. */
+int
+ksocknal_lib_tunables_init ()
+{
+	int rc = 0;
+
+	if (!*ksocknal_tunables.ksnd_typed_conns) {
+		rc = -EINVAL;
+#if SOCKNAL_VERSION_DEBUG
+		if (*ksocknal_tunables.ksnd_protocol < 3)
+			rc = 0;
+#endif
+	}
+	if (rc != 0) {
+		CERROR("Protocol V3.x MUST have typed connections\n");
+		return rc;
+	}
+
+	if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags < 2)
+		*ksocknal_tunables.ksnd_zc_recv_min_nfrags = 2;
+	if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags > LNET_MAX_IOV)
+		*ksocknal_tunables.ksnd_zc_recv_min_nfrags = LNET_MAX_IOV;
+	ksocknal_tunables.ksnd_sysctl =
+		cfs_register_sysctl_table(ksocknal_top_ctl_table, 0);
+	if (ksocknal_tunables.ksnd_sysctl == NULL)	/* not fatal */
+		CWARN("Can't setup /proc tunables\n");
+	return 0;
+}
+
+void
+ksocknal_lib_tunables_fini ()
+{
+	if (ksocknal_tunables.ksnd_sysctl)	/* registered at init time? */
+		unregister_sysctl_table(ksocknal_tunables.ksnd_sysctl);
+}
+#else
+int
+ksocknal_lib_tunables_init ()
+{
+	return 0;	/* no sysctl table in this configuration: nothing to do */
+}
+
+void
+ksocknal_lib_tunables_fini ()
+{	/* the matching init stub registered nothing, so nothing to undo */
+}
+#endif /* # if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM */
+
+/* Fill in conn's peer IP/port and local IP from its socket; returns 0 or the
+ * error reported by libcfs_sock_getaddr(). */
+int
+ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
+{
+	int err;
+
+	err = libcfs_sock_getaddr(conn->ksnc_sock, 1,
+				  &conn->ksnc_ipaddr, &conn->ksnc_port);
+
+	/* No {get,put}connsock dance was needed to deref ksnc_sock above:
+	 * the conn must not be closing yet */
+	LASSERT (!conn->ksnc_closing);
+	if (err != 0) {
+		CERROR ("Error %d getting sock peer IP\n", err);
+		return err;
+	}
+
+	err = libcfs_sock_getaddr(conn->ksnc_sock, 0,
+				  &conn->ksnc_myipaddr, NULL);
+	if (err != 0)
+		CERROR ("Error %d getting sock local IP\n", err);
+	return err;
+}
+
+int
+ksocknal_lib_zc_capable(ksock_conn_t *conn)
+{
+	int  route_caps = conn->ksnc_sock->sk->sk_route_caps;
+
+	/* No zero-copy on V1.x connections, nor without scatter/gather */
+	if (conn->ksnc_proto == &ksocknal_protocol_v1x ||
+	    (route_caps & NETIF_F_SG) == 0)
+		return 0;
+	/* ...and a NETIF_F_ALL_CSUM bit is needed: no software checksums */
+	return (route_caps & NETIF_F_ALL_CSUM) != 0;
+}
+
+/* Non-blocking (MSG_DONTWAIT) send of tx's iovec fragments on conn's socket,
+ * checksumming the tx first when required.  Returns sock_sendmsg()'s result. */
+int
+ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+	struct socket *sock = conn->ksnc_sock;
+	int	    nob;
+	int	    rc;
+
+	if (*ksocknal_tunables.ksnd_enable_csum	&& /* checksum enabled */
+	    conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection  */
+	    tx->tx_nob == tx->tx_resid		 && /* first sending    */
+	    tx->tx_msg.ksm_csum == 0)		     /* not checksummed  */
+		ksocknal_lib_csum_tx(tx);
+	/* NB we can't trust socket ops to either consume our iovs
+	 * or leave them alone. */
+	{
+#if SOCKNAL_SINGLE_FRAG_TX
+		struct iovec    scratch;
+		struct iovec   *scratchiov = &scratch;
+		unsigned int    niov = 1;
+#else
+		struct iovec   *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+		unsigned int    niov = tx->tx_niov;
+#endif
+		struct msghdr msg = {
+			.msg_name       = NULL,
+			.msg_namelen    = 0,
+			.msg_iov	= scratchiov,
+			.msg_iovlen     = niov,
+			.msg_control    = NULL,
+			.msg_controllen = 0,
+			.msg_flags      = MSG_DONTWAIT
+		};
+		mm_segment_t oldmm = get_fs();
+		int  i;
+
+		for (nob = i = 0; i < niov; i++) {
+			scratchiov[i] = tx->tx_iov[i];
+			nob += scratchiov[i].iov_len;
+		}
+		/* more follows: other txs are queued or this tx isn't done */
+		if (!list_empty(&conn->ksnc_tx_queue) ||
+		    nob < tx->tx_resid)
+			msg.msg_flags |= MSG_MORE;
+		/* the socket is handed the scratch copy, not tx->tx_iov */
+		set_fs (KERNEL_DS);
+		rc = sock_sendmsg(sock, &msg, nob);
+		set_fs (oldmm);
+	}
+	return rc;
+}
+
+/* Non-blocking send of tx's page (kiov) fragments on conn's socket.  A tx with
+ * a zero-copy cookie sends its first fragment by sendpage; otherwise the
+ * fragments are kmap()ed and sent with sock_sendmsg().  Returns the send rc. */
+int
+ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+	struct socket *sock = conn->ksnc_sock;
+	lnet_kiov_t   *kiov = tx->tx_kiov;
+	int	    rc;
+	int	    nob;
+
+	/* Not NOOP message */
+	LASSERT (tx->tx_lnetmsg != NULL);
+	/* NB we can't trust socket ops to either consume our iovs
+	 * or leave them alone. */
+	if (tx->tx_msg.ksm_zc_cookies[0] != 0) {
+		/* Zero copy is enabled */
+		struct sock   *sk = sock->sk;
+		struct page   *page = kiov->kiov_page;
+		int	    offset = kiov->kiov_offset;
+		int	    fragsize = kiov->kiov_len;
+		int	    msgflg = MSG_DONTWAIT;
+
+		CDEBUG(D_NET, "page %p + offset %x for %d\n",
+			       page, offset, kiov->kiov_len);
+		if (!list_empty(&conn->ksnc_tx_queue) ||
+		    fragsize < tx->tx_resid)
+			msgflg |= MSG_MORE;
+
+		if (sk->sk_prot->sendpage != NULL) {
+			rc = sk->sk_prot->sendpage(sk, page,
+						   offset, fragsize, msgflg);
+		} else {
+			rc = cfs_tcp_sendpage(sk, page, offset, fragsize,
+					      msgflg);
+		}
+	} else {
+#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK
+		struct iovec  scratch;
+		struct iovec *scratchiov = &scratch;
+		unsigned int  niov = 1;
+#else
+#ifdef CONFIG_HIGHMEM
+#warning "XXX risk of kmap deadlock on multiple frags..."
+#endif
+		struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+		unsigned int  niov = tx->tx_nkiov;
+#endif
+		struct msghdr msg = {
+			.msg_name       = NULL,
+			.msg_namelen    = 0,
+			.msg_iov	= scratchiov,
+			.msg_iovlen     = niov,
+			.msg_control    = NULL,
+			.msg_controllen = 0,
+			.msg_flags      = MSG_DONTWAIT
+		};
+		mm_segment_t  oldmm = get_fs();
+		int	   i;
+
+		for (nob = i = 0; i < niov; i++) {
+			scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
+						 kiov[i].kiov_offset;
+			nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+		}
+
+		if (!list_empty(&conn->ksnc_tx_queue) ||
+		    nob < tx->tx_resid)
+			msg.msg_flags |= MSG_MORE;
+		set_fs (KERNEL_DS);
+		rc = sock_sendmsg(sock, &msg, nob);
+		set_fs (oldmm);
+		/* drop the kmap()s taken when filling scratchiov */
+		for (i = 0; i < niov; i++)
+			kunmap(kiov[i].kiov_page);
+	}
+	return rc;
+}
+
+void
+ksocknal_lib_eager_ack (ksock_conn_t *conn)
+{
+	struct socket *sock = conn->ksnc_sock;
+	mm_segment_t   saved_fs = get_fs();
+	int	    quickack = 1;
+
+	/* Ask the socket to ACK eagerly (TCP_QUICKACK).  Left alone it may
+	 * hold the ACK back hoping to piggy-back it on data I'm about to
+	 * send, and that delays completion of zero-copy sends in my
+	 * peer. */
+
+	set_fs(KERNEL_DS);
+	sock->ops->setsockopt (sock, SOL_TCP, TCP_QUICKACK,
+			       (char *)&quickack, sizeof (quickack));
+	set_fs(saved_fs);
+}
+
+/* Non-blocking receive into conn's pending rx iovecs.  Returns sock_recvmsg()'s
+ * result; on V2.x connections with a non-zero ksm_csum the bytes received are
+ * also folded into ksnc_rx_csum. */
+int
+ksocknal_lib_recv_iov (ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX
+	struct iovec  scratch;
+	struct iovec *scratchiov = &scratch;
+	unsigned int  niov = 1;
+#else
+	struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+	unsigned int  niov = conn->ksnc_rx_niov;
+#endif
+	struct iovec *iov = conn->ksnc_rx_iov;
+	struct msghdr msg = {
+		.msg_name       = NULL,
+		.msg_namelen    = 0,
+		.msg_iov	= scratchiov,
+		.msg_iovlen     = niov,
+		.msg_control    = NULL,
+		.msg_controllen = 0,
+		.msg_flags      = 0
+	};
+	mm_segment_t oldmm = get_fs();
+	int	  nob;
+	int	  i;
+	int	  rc;
+	int	  fragnob;
+	int	  sum;
+	__u32	saved_csum;
+
+	/* NB we can't trust socket ops to either consume our iovs
+	 * or leave them alone. */
+	LASSERT (niov > 0);
+	for (nob = i = 0; i < niov; i++) {
+		scratchiov[i] = iov[i];
+		nob += scratchiov[i].iov_len;
+	}
+	LASSERT (nob <= conn->ksnc_rx_nob_wanted);
+
+	set_fs (KERNEL_DS);
+	rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT);
+	/* NB this is just a boolean..........................^ */
+	set_fs (oldmm);
+	/* V2.x: zero ksm_csum while checksumming; it is restored below */
+	saved_csum = 0;
+	if (conn->ksnc_proto == &ksocknal_protocol_v2x) {
+		saved_csum = conn->ksnc_msg.ksm_csum;
+		conn->ksnc_msg.ksm_csum = 0;
+	}
+
+	if (saved_csum != 0) {
+		/* accumulate checksum */
+		for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
+			LASSERT (i < niov);
+			fragnob = iov[i].iov_len;
+			if (fragnob > sum)
+				fragnob = sum;
+			conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
+							   iov[i].iov_base, fragnob);
+		}
+		conn->ksnc_msg.ksm_csum = saved_csum;
+	}
+
+	return rc;
+}
+
+/* Undo a mapping made by ksocknal_lib_kiov_vmap(); a NULL addr (nothing was
+ * mapped) is silently ignored. */
+static void
+ksocknal_lib_kiov_vunmap(void *addr)
+{
+	if (addr != NULL)
+		vunmap(addr);
+}
+
+/* Try to vmap() the kiov pages into one contiguous range described by *iov.
+ * Returns the mapping address, or NULL if zero-copy receive is off, pages is
+ * NULL, there are too few fragments, they don't tile pages, or vmap() fails. */
+static void *
+ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov,
+		       struct iovec *iov, struct page **pages)
+{
+	void	     *addr;
+	int	       nob;
+	int	       i;
+
+	if (!*ksocknal_tunables.ksnd_zc_recv || pages == NULL)
+		return NULL;
+	LASSERT (niov <= LNET_MAX_IOV);
+	if (niov < 2 ||
+	    niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags)
+		return NULL;
+	/* only frag 0 may start mid-page, only the last may end mid-page */
+	for (nob = i = 0; i < niov; i++) {
+		if ((kiov[i].kiov_offset != 0 && i > 0) ||
+		    (kiov[i].kiov_offset + kiov[i].kiov_len != PAGE_CACHE_SIZE && i < niov - 1))
+			return NULL;
+
+		pages[i] = kiov[i].kiov_page;
+		nob += kiov[i].kiov_len;
+	}
+
+	addr = vmap(pages, niov, VM_MAP, PAGE_KERNEL);
+	if (addr == NULL)
+		return NULL;
+	iov->iov_base = addr + kiov[0].kiov_offset;
+	iov->iov_len = nob;
+
+	return addr;
+}
+
+/* Non-blocking receive into conn's pending rx pages, through one vmap()ed
+ * range when ksocknal_lib_kiov_vmap() succeeds and per-page kmap()s otherwise.
+ * Returns sock_recvmsg()'s result; checksums the data if ksm_csum != 0. */
+int
+ksocknal_lib_recv_kiov (ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK
+	struct iovec   scratch;
+	struct iovec  *scratchiov = &scratch;
+	struct page  **pages      = NULL;
+	unsigned int   niov       = 1;
+#else
+#ifdef CONFIG_HIGHMEM
+#warning "XXX risk of kmap deadlock on multiple frags..."
+#endif
+	struct iovec  *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+	struct page  **pages      = conn->ksnc_scheduler->kss_rx_scratch_pgs;
+	unsigned int   niov       = conn->ksnc_rx_nkiov;
+#endif
+	lnet_kiov_t   *kiov = conn->ksnc_rx_kiov;
+	struct msghdr msg = {
+		.msg_name       = NULL,
+		.msg_namelen    = 0,
+		.msg_iov	= scratchiov,
+		.msg_control    = NULL,
+		.msg_controllen = 0,
+		.msg_flags      = 0
+	};
+	mm_segment_t oldmm = get_fs();
+	int	  nob;
+	int	  i;
+	int	  rc;
+	void	*base;
+	void	*addr;
+	int	  sum;
+	int	  fragnob;
+
+	/* NB we can't trust socket ops to either consume our iovs
+	 * or leave them alone. */
+	if ((addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages)) != NULL) {
+		nob = scratchiov[0].iov_len;
+		msg.msg_iovlen = 1;
+	} else {
+		for (nob = i = 0; i < niov; i++) {
+			nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+			scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
+						 kiov[i].kiov_offset;
+		}
+		msg.msg_iovlen = niov;
+	}
+
+	LASSERT (nob <= conn->ksnc_rx_nob_wanted);
+
+	set_fs (KERNEL_DS);
+	rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT);
+	/* NB this is just a boolean.......................^ */
+	set_fs (oldmm);
+
+	if (conn->ksnc_msg.ksm_csum != 0) {
+		for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
+			LASSERT (i < niov);
+			/* Dang! have to kmap again because I have nowhere to stash the
+			 * mapped address.  But by doing it while the page is still
+			 * mapped, the kernel just bumps the map count and returns me
+			 * the address it stashed. */
+			base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset;
+			fragnob = kiov[i].kiov_len;
+			if (fragnob > sum)
+				fragnob = sum;
+			conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
+							   base, fragnob);
+
+			kunmap(kiov[i].kiov_page);
+		}
+	}
+
+	if (addr != NULL) {
+		ksocknal_lib_kiov_vunmap(addr);
+	} else {
+		for (i = 0; i < niov; i++)
+			kunmap(kiov[i].kiov_page);
+	}
+
+	return (rc);
+}
+
+/* Compute the checksum of tx (message header in tx_iov[0], then either its
+ * kiov pages or its remaining iovs) with ksm_csum zeroed, and store the result
+ * in tx->tx_msg.ksm_csum. */
+void
+ksocknal_lib_csum_tx(ksock_tx_t *tx)
+{
+	int	  i;
+	__u32	csum;
+	void	*base;
+
+	LASSERT(tx->tx_iov[0].iov_base == (void *)&tx->tx_msg);
+	LASSERT(tx->tx_conn != NULL);
+	LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x);
+	tx->tx_msg.ksm_csum = 0;
+
+	csum = ksocknal_csum(~0, (void *)tx->tx_iov[0].iov_base,
+			     tx->tx_iov[0].iov_len);
+
+	if (tx->tx_kiov != NULL) {
+		for (i = 0; i < tx->tx_nkiov; i++) {
+			base = kmap(tx->tx_kiov[i].kiov_page) +
+			       tx->tx_kiov[i].kiov_offset;
+			csum = ksocknal_csum(csum, base, tx->tx_kiov[i].kiov_len);
+			kunmap(tx->tx_kiov[i].kiov_page);
+		}
+	} else {
+		for (i = 1; i < tx->tx_niov; i++)
+			csum = ksocknal_csum(csum, tx->tx_iov[i].iov_base,
+					     tx->tx_iov[i].iov_len);
+	}
+	/* fault injection: corrupt this one checksum, then disarm */
+	if (*ksocknal_tunables.ksnd_inject_csum_error) {
+		csum++;
+		*ksocknal_tunables.ksnd_inject_csum_error = 0;
+	}
+
+	tx->tx_msg.ksm_csum = csum;
+}
+
+/* Read conn's socket buffer sizes into *txmem/*rxmem and set *nagle to 1 when
+ * TCP_NODELAY is off.  Returns 0, -ESHUTDOWN if no socket ref could be taken,
+ * or the failing call's error; all three outputs are zeroed on failure. */
+int
+ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
+{
+	mm_segment_t   oldmm = get_fs ();
+	struct socket *sock = conn->ksnc_sock;
+	int	    len;
+	int	    rc;
+
+	rc = ksocknal_connsock_addref(conn);
+	if (rc != 0) {
+		LASSERT (conn->ksnc_closing);
+		*txmem = *rxmem = *nagle = 0;
+		return (-ESHUTDOWN);
+	}
+
+	rc = libcfs_sock_getbuf(sock, txmem, rxmem);
+	if (rc == 0) {
+		len = sizeof(*nagle);
+		set_fs(KERNEL_DS);
+		rc = sock->ops->getsockopt(sock, SOL_TCP, TCP_NODELAY,
+					   (char *)nagle, &len);
+		set_fs(oldmm);
+	}
+	ksocknal_connsock_decref(conn);
+	if (rc == 0)
+		*nagle = !*nagle;
+	else
+		*txmem = *rxmem = *nagle = 0;
+	return (rc);
+}
+
+/* Apply socklnd's socket options to sock: GFP_NOFS allocation, SO_LINGER and
+ * TCP_LINGER2, optional TCP_NODELAY, buffer sizes and keepalive settings.
+ * Returns 0 or the error of the first failing call (which is also logged). */
+int
+ksocknal_lib_setup_sock (struct socket *sock)
+{
+	mm_segment_t    oldmm = get_fs ();
+	int	     rc;
+	int	     option;
+	int	     keep_idle;
+	int	     keep_intvl;
+	int	     keep_count;
+	int	     do_keepalive;
+	struct linger   linger;
+
+	sock->sk->sk_allocation = GFP_NOFS;
+
+	/* Ensure this socket aborts active sends immediately when we close
+	 * it. */
+	linger.l_onoff = 0;
+	linger.l_linger = 0;
+
+	set_fs (KERNEL_DS);
+	rc = sock_setsockopt (sock, SOL_SOCKET, SO_LINGER,
+			      (char *)&linger, sizeof (linger));
+	set_fs (oldmm);
+	if (rc != 0) {
+		CERROR ("Can't set SO_LINGER: %d\n", rc);
+		return (rc);
+	}
+
+	option = -1;
+	set_fs (KERNEL_DS);
+	rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_LINGER2,
+				    (char *)&option, sizeof (option));
+	set_fs (oldmm);
+	if (rc != 0) {
+		/* the option set above is TCP_LINGER2; name it correctly */
+		CERROR ("Can't set TCP_LINGER2: %d\n", rc);
+		return (rc);
+	}
+
+	if (!*ksocknal_tunables.ksnd_nagle) {
+		option = 1;
+		set_fs (KERNEL_DS);
+		rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_NODELAY,
+					    (char *)&option, sizeof (option));
+		set_fs (oldmm);
+		if (rc != 0) {
+			CERROR ("Can't disable nagle: %d\n", rc);
+			return (rc);
+		}
+	}
+
+	rc = libcfs_sock_setbuf(sock,
+				*ksocknal_tunables.ksnd_tx_buffer_size,
+				*ksocknal_tunables.ksnd_rx_buffer_size);
+	if (rc != 0) {
+		CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n",
+			*ksocknal_tunables.ksnd_tx_buffer_size,
+			*ksocknal_tunables.ksnd_rx_buffer_size, rc);
+		return (rc);
+	}
+
+/* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */
+	/* snapshot tunables */
+	keep_idle  = *ksocknal_tunables.ksnd_keepalive_idle;
+	keep_count = *ksocknal_tunables.ksnd_keepalive_count;
+	keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;
+
+	/* keepalive is only used when all three tunables are positive */
+	do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0);
+	option = (do_keepalive ? 1 : 0);
+	set_fs (KERNEL_DS);
+	rc = sock_setsockopt (sock, SOL_SOCKET, SO_KEEPALIVE,
+			      (char *)&option, sizeof (option));
+	set_fs (oldmm);
+	if (rc != 0) {
+		CERROR ("Can't set SO_KEEPALIVE: %d\n", rc);
+		return (rc);
+	}
+
+	if (!do_keepalive)
+		return (0);
+
+	set_fs (KERNEL_DS);
+	rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPIDLE,
+				    (char *)&keep_idle, sizeof (keep_idle));
+	set_fs (oldmm);
+	if (rc != 0) {
+		CERROR ("Can't set TCP_KEEPIDLE: %d\n", rc);
+		return (rc);
+	}
+
+	set_fs (KERNEL_DS);
+	rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPINTVL,
+				    (char *)&keep_intvl, sizeof (keep_intvl));
+	set_fs (oldmm);
+	if (rc != 0) {
+		CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc);
+		return (rc);
+	}
+
+	set_fs (KERNEL_DS);
+	rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPCNT,
+				    (char *)&keep_count, sizeof (keep_count));
+	set_fs (oldmm);
+	if (rc != 0) {
+		CERROR ("Can't set TCP_KEEPCNT: %d\n", rc);
+		return (rc);
+	}
+
+	return (0);
+}
+
+/* With tp->nonagle temporarily forced to 1, issue a TCP_NODELAY setsockopt on
+ * conn's socket (presumably to push out queued data -- confirm), then restore
+ * the saved nonagle value.  Does nothing if the conn is being shut down. */
+void
+ksocknal_lib_push_conn (ksock_conn_t *conn)
+{
+	struct sock    *sk;
+	struct tcp_sock *tp;
+	int	     nonagle;
+	int	     val = 1;
+	int	     rc;
+	mm_segment_t    oldmm;
+
+	rc = ksocknal_connsock_addref(conn);
+	if (rc != 0)			    /* being shut down */
+		return;
+
+	sk = conn->ksnc_sock->sk;
+	tp = tcp_sk(sk);
+
+	lock_sock (sk);
+	nonagle = tp->nonagle;
+	tp->nonagle = 1;
+	release_sock (sk);
+
+	oldmm = get_fs ();
+	set_fs (KERNEL_DS);
+	rc = sk->sk_prot->setsockopt (sk, SOL_TCP, TCP_NODELAY,
+				      (char *)&val, sizeof (val));
+	LASSERT (rc == 0);
+	set_fs (oldmm);
+	lock_sock (sk);
+	tp->nonagle = nonagle;
+	release_sock (sk);
+
+	ksocknal_connsock_decref(conn);
+}
+
+extern void ksocknal_read_callback (ksock_conn_t *conn);
+extern void ksocknal_write_callback (ksock_conn_t *conn);
+/* Socket callbacks (Linux).  sk_data_ready hook: notify the conn attached to
+ * sk via ksocknal_read_callback(), or chain to the socket's current hook when
+ * sk_user_data has already been cleared. */
+static void
+ksocknal_data_ready (struct sock *sk, int n)
+{
+	ksock_conn_t  *conn;
+	ENTRY;
+
+	/* interleave correctly with closing sockets... */
+	LASSERT(!in_irq());
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	conn = sk->sk_user_data;
+	if (conn == NULL) {	     /* raced with ksocknal_terminate_conn */
+		LASSERT (sk->sk_data_ready != &ksocknal_data_ready);
+		sk->sk_data_ready (sk, n);
+	} else
+		ksocknal_read_callback(conn);
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+
+	EXIT;
+}
+
+/* sk_write_space hook: once the socket has at least its minimum write space,
+ * call ksocknal_write_callback() and clear SOCK_NOSPACE; chain to the socket's
+ * current hook instead when sk_user_data has already been cleared. */
+static void
+ksocknal_write_space (struct sock *sk)
+{
+	ksock_conn_t  *conn;
+	int	    wspace;
+	int	    min_wpace;
+
+	/* interleave correctly with closing sockets... */
+	LASSERT(!in_irq());
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	conn = sk->sk_user_data;
+	wspace = SOCKNAL_WSPACE(sk);
+	min_wpace = SOCKNAL_MIN_WSPACE(sk);
+
+	CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n",
+	       sk, wspace, min_wpace, conn,
+	       (conn == NULL) ? "" : (conn->ksnc_tx_ready ?
+				      " ready" : " blocked"),
+	       (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ?
+				      " scheduled" : " idle"),
+	       (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ?
+				      " empty" : " queued"));
+
+	if (conn == NULL) {	     /* raced with ksocknal_terminate_conn */
+		LASSERT (sk->sk_write_space != &ksocknal_write_space);
+		sk->sk_write_space (sk);
+		read_unlock(&ksocknal_data.ksnd_global_lock);
+		return;
+	}
+
+	if (wspace >= min_wpace) {	      /* got enough space */
+		ksocknal_write_callback(conn);
+		/* Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the
+		 * ENOMEM check in ksocknal_transmit is race-free (think about
+		 * it). */
+		clear_bit (SOCK_NOSPACE, &sk->sk_socket->flags);
+	}
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+}
+
+void
+ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn)
+{	/* stash the socket's current hooks in conn so they can be put back */
+	conn->ksnc_saved_data_ready = sock->sk->sk_data_ready;
+	conn->ksnc_saved_write_space = sock->sk->sk_write_space;
+}
+
+void
+ksocknal_lib_set_callback(struct socket *sock,  ksock_conn_t *conn)
+{
+	struct sock *sk = sock->sk;	/* attach conn, then install our hooks */
+	sk->sk_user_data = conn;
+	sk->sk_data_ready = ksocknal_data_ready;
+	sk->sk_write_space = ksocknal_write_space;
+}
+
+void
+ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn)
+{
+	struct sock *sk = sock->sk;
+
+	/* Detach conn's network callbacks.  NB the saved callbacks _must_
+	 * be put back (a noop will not do), because the socket could
+	 * survive past this module being unloaded!! */
+	sk->sk_data_ready = conn->ksnc_saved_data_ready;
+	sk->sk_write_space = conn->ksnc_saved_write_space;
+
+	/* A callback may already be running; callbacks hold a read lock
+	 * on ksnd_global_lock (to serialise with me) and NOOP when they
+	 * find sk_user_data NULL. */
+	sk->sk_user_data = NULL;
+}
+
+/* Decide whether conn needs a timed retry: returns -ENOMEM when neither
+ * SOCK_NOSPACE nor ksnc_tx_ready is set (sampled under the scheduler's
+ * kss_lock), and 0 otherwise. */
+int
+ksocknal_lib_memory_pressure(ksock_conn_t *conn)
+{
+	ksock_sched_t *sched = conn->ksnc_scheduler;
+	int	    stalled;
+
+	spin_lock_bh(&sched->kss_lock);
+
+	/* SOCK_NOSPACE is set when the socket fills
+	 * and cleared in the write_space callback
+	 * (which also sets ksnc_tx_ready).  If
+	 * SOCK_NOSPACE and ksnc_tx_ready are BOTH
+	 * zero, I didn't fill the socket and
+	 * write_space won't reschedule me, so my
+	 * caller gets -ENOMEM and must retry after
+	 * a timeout */
+	stalled = !SOCK_TEST_NOSPACE(conn->ksnc_sock) &&
+		  !conn->ksnc_tx_ready;
+
+	spin_unlock_bh(&sched->kss_lock);
+
+	return stalled ? -ENOMEM : 0;
+}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h
new file mode 100644
index 000000000000..3c135786dc11
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h
@@ -0,0 +1,91 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_PORTAL_ALLOC
+
+#ifndef __LINUX_SOCKNAL_LIB_H__
+#define __LINUX_SOCKNAL_LIB_H__
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/version.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/uio.h>
+#include <linux/if.h>
+
+#include <asm/uaccess.h>
+#include <asm/irq.h>
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+#include <asm/uaccess.h>
+#include <asm/div64.h>
+#include <linux/syscalls.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/libcfs/linux/portals_compat25.h>
+
+#include <linux/crc32.h>
+static inline __u32 ksocknal_csum(__u32 crc, unsigned char const *p, size_t len)
+{	/* fold len bytes at p into the running checksum crc; returns the new value */
+#if 1	/* the variant in use: crc32_le() */
+	return crc32_le(crc, p, len);
+#else	/* compiled-out alternative that does not use crc32_le() */
+	while (len-- > 0)
+		crc = ((crc + 0x100) & ~0xff) | ((crc + *p++) & 0xff) ;
+	return crc;
+#endif
+}
+
+#define SOCKNAL_WSPACE(sk)       sk_stream_wspace(sk)
+#define SOCKNAL_MIN_WSPACE(sk)   sk_stream_min_wspace(sk)
+
+/* assume one thread for each connection type */
+#define SOCKNAL_NSCHEDS		3
+#define SOCKNAL_NSCHEDS_HIGH	(SOCKNAL_NSCHEDS << 1)
+
+#endif
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c
new file mode 100644
index 000000000000..8a474f64abbe
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
+/* Module parameters, published through ksocknal_tunables by ksocknal_tunables_init(). */
+static int sock_timeout = 50;
+CFS_MODULE_PARM(sock_timeout, "i", int, 0644,
+		"dead socket timeout (seconds)");
+
+static int credits = 256;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+		"# concurrent sends");
+
+static int peer_credits = 8;
+CFS_MODULE_PARM(peer_credits, "i", int, 0444,
+		"# concurrent sends to 1 peer");
+
+static int peer_buffer_credits = 0;
+CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444,
+		"# per-peer router buffer credits");
+
+static int peer_timeout = 180;
+CFS_MODULE_PARM(peer_timeout, "i", int, 0444,
+		"Seconds without aliveness news to declare peer dead (<=0 to disable)");
+
+/* Number of daemons in each per-CPT thread pool; a reasonable value is
+ * estimated from the CPUs if it's not set.  Unsigned, hence "uint". */
+static unsigned int nscheds;
+CFS_MODULE_PARM(nscheds, "i", uint, 0444,
+		"# scheduler daemons in each pool while starting");
+
+static int nconnds = 4;
+CFS_MODULE_PARM(nconnds, "i", int, 0444,
+		"# connection daemons while starting");
+
+static int nconnds_max = 64;
+CFS_MODULE_PARM(nconnds_max, "i", int, 0444,
+		"max # connection daemons");
+
+static int min_reconnectms = 1000;
+CFS_MODULE_PARM(min_reconnectms, "i", int, 0644,
+		"min connection retry interval (mS)");
+
+static int max_reconnectms = 60000;
+CFS_MODULE_PARM(max_reconnectms, "i", int, 0644,
+		"max connection retry interval (mS)");
+
+# define DEFAULT_EAGER_ACK 0
+static int eager_ack = DEFAULT_EAGER_ACK;
+CFS_MODULE_PARM(eager_ack, "i", int, 0644,
+		"send tcp ack packets eagerly");
+
+static int typed_conns = 1;
+CFS_MODULE_PARM(typed_conns, "i", int, 0444,
+		"use different sockets for bulk");
+
+static int min_bulk = (1<<10);
+CFS_MODULE_PARM(min_bulk, "i", int, 0644,
+		"smallest 'large' message");
+
+# define DEFAULT_BUFFER_SIZE 0
+static int tx_buffer_size = DEFAULT_BUFFER_SIZE;
+CFS_MODULE_PARM(tx_buffer_size, "i", int, 0644,
+		"socket tx buffer size (0 for system default)");
+
+static int rx_buffer_size = DEFAULT_BUFFER_SIZE;
+CFS_MODULE_PARM(rx_buffer_size, "i", int, 0644,
+		"socket rx buffer size (0 for system default)");
+
+static int nagle = 0;
+CFS_MODULE_PARM(nagle, "i", int, 0644,
+		"enable NAGLE?");
+
+static int round_robin = 1;
+CFS_MODULE_PARM(round_robin, "i", int, 0644,
+		"Round robin for multiple interfaces");
+
+static int keepalive = 30;
+CFS_MODULE_PARM(keepalive, "i", int, 0644,
+		"# seconds before send keepalive");
+
+static int keepalive_idle = 30;
+CFS_MODULE_PARM(keepalive_idle, "i", int, 0644,
+		"# idle seconds before probe");
+
+#define DEFAULT_KEEPALIVE_COUNT  5
+static int keepalive_count = DEFAULT_KEEPALIVE_COUNT;
+CFS_MODULE_PARM(keepalive_count, "i", int, 0644,
+		"# missed probes == dead");
+
+static int keepalive_intvl = 5;
+CFS_MODULE_PARM(keepalive_intvl, "i", int, 0644,
+		"seconds between probes");
+
+static int enable_csum = 0;
+CFS_MODULE_PARM(enable_csum, "i", int, 0644,
+		"enable check sum");
+
+static int inject_csum_error = 0;
+CFS_MODULE_PARM(inject_csum_error, "i", int, 0644,
+		"set non-zero to inject a checksum error");
+
+static int nonblk_zcack = 1;
+CFS_MODULE_PARM(nonblk_zcack, "i", int, 0644,
+		"always send ZC-ACK on non-blocking connection");
+
+static unsigned int zc_min_payload = (16 << 10);
+CFS_MODULE_PARM(zc_min_payload, "i", uint, 0644,
+		"minimum payload size to zero copy");
+
+static unsigned int zc_recv = 0;
+CFS_MODULE_PARM(zc_recv, "i", uint, 0644,
+		"enable ZC recv for Chelsio driver");
+
+static unsigned int zc_recv_min_nfrags = 16;
+CFS_MODULE_PARM(zc_recv_min_nfrags, "i", uint, 0644,
+		"minimum # of fragments to enable ZC recv");
+
+#if SOCKNAL_VERSION_DEBUG
+static int protocol = 3;
+CFS_MODULE_PARM(protocol, "i", int, 0644,
+		"protocol version");
+#endif
+
+ksock_tunables_t ksocknal_tunables;
+
+/* Point each ksocknal_tunables field at its module parameter, raise
+ * zc_min_payload to at least 2KiB, then run the platform-specific setup.
+ * Returns the result of ksocknal_lib_tunables_init(). */
+int ksocknal_tunables_init(void)
+{
+	/* initialize ksocknal_tunables structure */
+	ksocknal_tunables.ksnd_timeout            = &sock_timeout;
+	ksocknal_tunables.ksnd_nscheds            = &nscheds;
+	ksocknal_tunables.ksnd_nconnds            = &nconnds;
+	ksocknal_tunables.ksnd_nconnds_max        = &nconnds_max;
+	ksocknal_tunables.ksnd_min_reconnectms    = &min_reconnectms;
+	ksocknal_tunables.ksnd_max_reconnectms    = &max_reconnectms;
+	ksocknal_tunables.ksnd_eager_ack          = &eager_ack;
+	ksocknal_tunables.ksnd_typed_conns        = &typed_conns;
+	ksocknal_tunables.ksnd_min_bulk           = &min_bulk;
+	ksocknal_tunables.ksnd_tx_buffer_size     = &tx_buffer_size;
+	ksocknal_tunables.ksnd_rx_buffer_size     = &rx_buffer_size;
+	ksocknal_tunables.ksnd_nagle              = &nagle;
+	ksocknal_tunables.ksnd_round_robin        = &round_robin;
+	ksocknal_tunables.ksnd_keepalive          = &keepalive;
+	ksocknal_tunables.ksnd_keepalive_idle     = &keepalive_idle;
+	ksocknal_tunables.ksnd_keepalive_count    = &keepalive_count;
+	ksocknal_tunables.ksnd_keepalive_intvl    = &keepalive_intvl;
+	ksocknal_tunables.ksnd_credits            = &credits;
+	ksocknal_tunables.ksnd_peertxcredits      = &peer_credits;
+	ksocknal_tunables.ksnd_peerrtrcredits     = &peer_buffer_credits;
+	ksocknal_tunables.ksnd_peertimeout        = &peer_timeout;
+	ksocknal_tunables.ksnd_enable_csum        = &enable_csum;
+	ksocknal_tunables.ksnd_inject_csum_error  = &inject_csum_error;
+	ksocknal_tunables.ksnd_nonblk_zcack       = &nonblk_zcack;
+	ksocknal_tunables.ksnd_zc_min_payload     = &zc_min_payload;
+	ksocknal_tunables.ksnd_zc_recv            = &zc_recv;
+	ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
+
+#if SOCKNAL_VERSION_DEBUG
+	ksocknal_tunables.ksnd_protocol           = &protocol;
+#endif
+
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+	ksocknal_tunables.ksnd_sysctl             = NULL;
+#endif
+
+	if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10))
+		*ksocknal_tunables.ksnd_zc_min_payload = (2 << 10);
+
+	/* initialize platform-specific tunables */
+	return ksocknal_lib_tunables_init();
+}
+
+void ksocknal_tunables_fini(void)
+{
+	ksocknal_lib_tunables_fini();	/* whole job is delegated to the ksocknal_lib_* layer */
+}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c
new file mode 100644
index 000000000000..ec57179f8d2b
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c
@@ -0,0 +1,797 @@
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
+/*
+ * Protocol entries :
+ *   pro_send_hello       : send hello message
+ *   pro_recv_hello       : receive hello message
+ *   pro_pack             : pack message header
+ *   pro_unpack           : unpack message header
+ *   pro_queue_tx_zcack() : Called holding BH lock: kss_lock
+ *                          return 1 if ACK is piggybacked, otherwise return 0
+ *   pro_queue_tx_msg()   : Called holding BH lock: kss_lock
+ *                          return the ACK that was piggybacked by my message, or NULL
+ *   pro_handle_zcreq()   : handler of incoming ZC-REQ
+ *   pro_handle_zcack()   : handler of incoming ZC-ACK
+ *   pro_match_tx()       : Called holding glock
+ */
+
+static ksock_tx_t *
+ksocknal_queue_tx_msg_v1(ksock_conn_t *conn, ksock_tx_t *tx_msg)
+{
+	/* V1.x, just enqueue it at the tail of the tx queue; no ACK is ever returned */
+	list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
+	return NULL;
+}
+
+void
+ksocknal_next_tx_carrier(ksock_conn_t *conn)
+{
+	ksock_tx_t	 *tx = conn->ksnc_tx_carrier;
+	struct list_head *following;
+
+	/* Caller must hold the BH lock conn->ksnc_scheduler->kss_lock */
+	LASSERT (!list_empty(&conn->ksnc_tx_queue));
+	LASSERT (tx != NULL);
+
+	/* Step to the next TX that can carry a ZC-ACK or LNet message */
+	following = tx->tx_list.next;
+	if (following == &conn->ksnc_tx_queue) {
+		conn->ksnc_tx_carrier = NULL;	/* no more packets queued */
+		return;
+	}
+	conn->ksnc_tx_carrier = list_entry(following, ksock_tx_t, tx_list);
+	LASSERT (conn->ksnc_tx_carrier->tx_msg.ksm_type == tx->tx_msg.ksm_type);
+}
+
+static int
+ksocknal_queue_tx_zcack_v2(ksock_conn_t *conn,
+			   ksock_tx_t *tx_ack, __u64 cookie)
+{
+	ksock_tx_t *tx = conn->ksnc_tx_carrier;
+
+	LASSERT (tx_ack == NULL ||
+		 tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+	/*
+	 * Enqueue or piggyback tx_ack / cookie
+	 * . If no tx can piggyback the cookie of tx_ack (or cookie), just
+	 *   enqueue the tx_ack (if tx_ack != NULL) and return 0.
+	 * . If there is a tx that can piggyback the cookie of tx_ack (or
+	 *   cookie), piggyback the cookie on it and return 1.
+	 */
+	if (tx == NULL) {
+		if (tx_ack != NULL) {
+			list_add_tail(&tx_ack->tx_list,
+					  &conn->ksnc_tx_queue);
+			conn->ksnc_tx_carrier = tx_ack;
+		}
+		return 0;
+	}
+
+	if (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP) {
+		/* tx is noop zc-ack, can't piggyback zc-ack cookie */
+		if (tx_ack != NULL)
+			list_add_tail(&tx_ack->tx_list,
+					  &conn->ksnc_tx_queue);
+		return 0;
+	}
+
+	LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_LNET);
+	LASSERT(tx->tx_msg.ksm_zc_cookies[1] == 0);
+
+	if (tx_ack != NULL)
+		cookie = tx_ack->tx_msg.ksm_zc_cookies[1];
+
+	/* piggyback the zc-ack cookie */
+	tx->tx_msg.ksm_zc_cookies[1] = cookie;
+	/* move on to the next TX which can carry cookie */
+	ksocknal_next_tx_carrier(conn);
+
+	return 1;
+}
+
+static ksock_tx_t *
+ksocknal_queue_tx_msg_v2(ksock_conn_t *conn, ksock_tx_t *tx_msg)
+{
+	ksock_tx_t  *tx  = conn->ksnc_tx_carrier;
+
+	/*
+	 * Enqueue tx_msg:
+	 * . If there is no NOOP on the connection, just enqueue
+	 *   tx_msg and return NULL
+	 * . If there is a NOOP on the connection, piggyback its cookie
+	 *   on tx_msg, replace the NOOP tx, and return the NOOP tx.
+	 */
+	if (tx == NULL) { /* nothing on queue */
+		list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
+		conn->ksnc_tx_carrier = tx_msg;
+		return NULL;
+	}
+
+	if (tx->tx_msg.ksm_type == KSOCK_MSG_LNET) { /* nothing to carry */
+		list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
+		return NULL;
+	}
+
+	LASSERT (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+	/* There is a noop zc-ack that can be piggybacked */
+	tx_msg->tx_msg.ksm_zc_cookies[1] = tx->tx_msg.ksm_zc_cookies[1];
+	ksocknal_next_tx_carrier(conn);
+
+	/* use tx_msg to replace the noop zc-ack packet in the queue */
+	list_add(&tx_msg->tx_list, &tx->tx_list);
+	list_del(&tx->tx_list);
+
+	return tx;
+}
+
+static int
+ksocknal_queue_tx_zcack_v3(ksock_conn_t *conn,
+			   ksock_tx_t *tx_ack, __u64 cookie)
+{
+	ksock_tx_t *tx;	/* current carrier on this connection, if any */
+
+	if (conn->ksnc_type != SOCKLND_CONN_ACK)
+		return ksocknal_queue_tx_zcack_v2(conn, tx_ack, cookie);
+
+	/* non-blocking ZC-ACK (to router) */
+	LASSERT (tx_ack == NULL ||
+		 tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+	if ((tx = conn->ksnc_tx_carrier) == NULL) {
+		if (tx_ack != NULL) {
+			list_add_tail(&tx_ack->tx_list,
+					  &conn->ksnc_tx_queue);
+			conn->ksnc_tx_carrier = tx_ack;
+		}
+		return 0;
+	}
+
+	/* conn->ksnc_tx_carrier != NULL */
+
+	if (tx_ack != NULL)
+		cookie = tx_ack->tx_msg.ksm_zc_cookies[1];
+
+	if (cookie == SOCKNAL_KEEPALIVE_PING) /* ignore keepalive PING */
+		return 1;
+
+	if (tx->tx_msg.ksm_zc_cookies[1] == SOCKNAL_KEEPALIVE_PING) {
+		/* replace the keepalive PING with a real ACK */
+		LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0);
+		tx->tx_msg.ksm_zc_cookies[1] = cookie;
+		return 1;
+	}
+
+	if (cookie == tx->tx_msg.ksm_zc_cookies[0] ||
+	    cookie == tx->tx_msg.ksm_zc_cookies[1]) {
+		CWARN("%s: duplicated ZC cookie: "LPU64"\n",
+		      libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie);
+		return 1; /* XXX return error in the future */
+	}
+
+	if (tx->tx_msg.ksm_zc_cookies[0] == 0) {
+		/* NOOP tx has only one ZC-ACK cookie, can carry at least one more */
+		if (tx->tx_msg.ksm_zc_cookies[1] > cookie) {
+			tx->tx_msg.ksm_zc_cookies[0] = tx->tx_msg.ksm_zc_cookies[1];
+			tx->tx_msg.ksm_zc_cookies[1] = cookie;
+		} else {
+			tx->tx_msg.ksm_zc_cookies[0] = cookie;
+		}
+
+		if (tx->tx_msg.ksm_zc_cookies[0] - tx->tx_msg.ksm_zc_cookies[1] > 2) {
+			/* not likely to carry more ACKs, skip it to simplify logic */
+			ksocknal_next_tx_carrier(conn);
+		}
+
+		return 1;
+	}
+
+	/* tx carries two or more cookies already */
+
+	if (tx->tx_msg.ksm_zc_cookies[0] > tx->tx_msg.ksm_zc_cookies[1]) {
+		__u64   tmp = 0;
+
+		/* two separated cookies: (a+2, a) or (a+1, a) */
+		LASSERT (tx->tx_msg.ksm_zc_cookies[0] -
+			 tx->tx_msg.ksm_zc_cookies[1] <= 2);
+
+		if (tx->tx_msg.ksm_zc_cookies[0] -
+		    tx->tx_msg.ksm_zc_cookies[1] == 2) {
+			if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1)
+				tmp = cookie;
+		} else if (cookie == tx->tx_msg.ksm_zc_cookies[1] - 1) {
+			tmp = tx->tx_msg.ksm_zc_cookies[1];
+		} else if (cookie == tx->tx_msg.ksm_zc_cookies[0] + 1) {
+			tmp = tx->tx_msg.ksm_zc_cookies[0];
+		}
+
+		if (tmp != 0) {
+			/* three consecutive cookies: store them as the range [tmp - 1, tmp + 1] */
+			tx->tx_msg.ksm_zc_cookies[0] = tmp - 1;
+			tx->tx_msg.ksm_zc_cookies[1] = tmp + 1;
+			return 1;
+		}
+
+	} else {
+		/* ksm_zc_cookies[0] < ksm_zc_cookies[1], it is range of cookies */
+		if (cookie >= tx->tx_msg.ksm_zc_cookies[0] &&
+		    cookie <= tx->tx_msg.ksm_zc_cookies[1]) {
+			CWARN("%s: duplicated ZC cookie: "LPU64"\n",
+			      libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie);
+			return 1; /* XXX: return error in the future */
+		}
+
+		if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) {
+			tx->tx_msg.ksm_zc_cookies[1] = cookie;
+			return 1;
+		}
+
+		if (cookie == tx->tx_msg.ksm_zc_cookies[0] - 1) {
+			tx->tx_msg.ksm_zc_cookies[0] = cookie;
+			return 1;
+		}
+	}
+
+	/* failed to piggyback ZC-ACK */
+	if (tx_ack != NULL) {
+		list_add_tail(&tx_ack->tx_list, &conn->ksnc_tx_queue);
+		/* the next tx can piggyback at least 1 ACK */
+		ksocknal_next_tx_carrier(conn);
+	}
+
+	return 0;
+}
+
+static int
+ksocknal_match_tx(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk)
+{
+	int nob;
+
+#if SOCKNAL_VERSION_DEBUG
+	if (!*ksocknal_tunables.ksnd_typed_conns)
+		return SOCKNAL_MATCH_YES;
+#endif
+
+	if (tx != NULL && tx->tx_lnetmsg != NULL) {
+		/* LNet message: payload plus the header of this protocol */
+		nob = tx->tx_lnetmsg->msg_len +
+		      ((conn->ksnc_proto == &ksocknal_protocol_v1x) ?
+		       sizeof(lnet_hdr_t) : sizeof(ksock_msg_t));
+	} else {
+		/* noop packet */
+		nob = offsetof(ksock_msg_t, ksm_u);
+	}
+
+	/* default checking for typed connection; nonblk is not consulted
+	 * by this matcher */
+	switch (conn->ksnc_type) {
+	default:
+		CERROR("ksnc_type bad: %u\n", conn->ksnc_type);
+		LBUG();
+	case SOCKLND_CONN_ANY:
+		return SOCKNAL_MATCH_YES;
+
+	case SOCKLND_CONN_BULK_IN:
+		return SOCKNAL_MATCH_MAY;
+
+	case SOCKLND_CONN_BULK_OUT:
+		/* messages below min_bulk are only a "may" match here */
+		return (nob < *ksocknal_tunables.ksnd_min_bulk) ?
+		       SOCKNAL_MATCH_MAY : SOCKNAL_MATCH_YES;
+
+	case SOCKLND_CONN_CONTROL:
+		/* messages of min_bulk or more are only a "may" match here */
+		return (nob >= *ksocknal_tunables.ksnd_min_bulk) ?
+		       SOCKNAL_MATCH_MAY : SOCKNAL_MATCH_YES;
+	}
+}
+
+static int
+ksocknal_match_tx_v3(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk)
+{
+	/* a tx without an LNet message is a noop packet */
+	int noop = (tx == NULL || tx->tx_lnetmsg == NULL);
+	int nob;
+
+	if (noop)
+		nob = offsetof(ksock_msg_t, ksm_u);
+	else
+		nob = tx->tx_lnetmsg->msg_len + sizeof(ksock_msg_t);
+
+	/* nonblk senders match the ACK connection only */
+	switch (conn->ksnc_type) {
+	default:
+		CERROR("ksnc_type bad: %u\n", conn->ksnc_type);
+		LBUG();
+	case SOCKLND_CONN_ANY:
+		return SOCKNAL_MATCH_NO;
+
+	case SOCKLND_CONN_ACK:
+		if (nonblk)
+			return SOCKNAL_MATCH_YES;
+		return noop ? SOCKNAL_MATCH_MAY : SOCKNAL_MATCH_NO;
+
+	case SOCKLND_CONN_BULK_OUT:
+		/* messages below min_bulk are only a "may" match here */
+		if (nonblk)
+			return SOCKNAL_MATCH_NO;
+		if (nob < *ksocknal_tunables.ksnd_min_bulk)
+			return SOCKNAL_MATCH_MAY;
+		return SOCKNAL_MATCH_YES;
+
+	case SOCKLND_CONN_CONTROL:
+		/* messages of min_bulk or more are only a "may" match here */
+		if (nonblk)
+			return SOCKNAL_MATCH_NO;
+		if (nob >= *ksocknal_tunables.ksnd_min_bulk)
+			return SOCKNAL_MATCH_MAY;
+		return SOCKNAL_MATCH_YES;
+	}
+}
+
+/* (Sink) handle incoming ZC request from sender: piggyback the ZC-ACK
+ * on an already queued tx if possible, else launch a dedicated noop tx */
+static int
+ksocknal_handle_zcreq(ksock_conn_t *c, __u64 cookie, int remote)
+{
+	ksock_peer_t   *peer = c->ksnc_peer;
+	ksock_conn_t   *conn;
+	ksock_tx_t     *tx;
+	int	     piggybacked = 0;
+	int	     rc;
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	conn = ksocknal_find_conn_locked(peer, NULL, !!remote);
+	if (conn != NULL) {
+		ksock_sched_t *sched = conn->ksnc_scheduler;
+
+		LASSERT(conn->ksnc_proto->pro_queue_tx_zcack != NULL);
+
+		spin_lock_bh(&sched->kss_lock);
+		piggybacked = conn->ksnc_proto->pro_queue_tx_zcack(conn, NULL,
+								   cookie);
+		spin_unlock_bh(&sched->kss_lock);
+	}
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+
+	if (piggybacked)
+		return 0;
+
+	/* ACK connection is not ready, or can't piggyback the ACK */
+	tx = ksocknal_alloc_tx_noop(cookie, !!remote);
+	if (tx == NULL)
+		return -ENOMEM;
+
+	/* if the launch fails the noop tx is freed here */
+	rc = ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id);
+	if (rc != 0)
+		ksocknal_free_tx(tx);
+
+	return rc;
+}
+
+/* (Sender) handle ZC_ACK from sink; -EPROTO unless every expected cookie matched a pending tx */
+static int
+ksocknal_handle_zcack(ksock_conn_t *conn, __u64 cookie1, __u64 cookie2)
+{
+	ksock_peer_t      *peer = conn->ksnc_peer;
+	ksock_tx_t	*tx;
+	ksock_tx_t	*tmp;
+	LIST_HEAD     (zlist);	/* matched txs, dereferenced after ksnp_lock is dropped */
+	int		count;	/* number of cookies still to be matched */
+
+	if (cookie1 == 0)	/* single cookie: only cookie2 is set */
+		cookie1 = cookie2;
+
+	/* cookie1 > cookie2: two separate cookies; otherwise the inclusive range [cookie1, cookie2] */
+	count = (cookie1 > cookie2) ? 2 : (cookie2 - cookie1 + 1);
+
+	if (cookie2 == SOCKNAL_KEEPALIVE_PING &&
+	    conn->ksnc_proto == &ksocknal_protocol_v3x) {
+		/* keepalive PING for V3.x, just ignore it */
+		return count == 1 ? 0 : -EPROTO;
+	}
+
+	spin_lock(&peer->ksnp_lock);
+
+	list_for_each_entry_safe(tx, tmp,
+				     &peer->ksnp_zc_req_list, tx_zc_list) {
+		__u64 c = tx->tx_msg.ksm_zc_cookies[0];
+
+		if (c == cookie1 || c == cookie2 || (cookie1 < c && c < cookie2)) {
+			tx->tx_msg.ksm_zc_cookies[0] = 0;
+			list_del(&tx->tx_zc_list);
+			list_add(&tx->tx_zc_list, &zlist);
+
+			if (--count == 0)	/* everything acked has been found */
+				break;
+		}
+	}
+
+	spin_unlock(&peer->ksnp_lock);
+
+	while (!list_empty(&zlist)) {
+		tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list);
+		list_del(&tx->tx_zc_list);
+		ksocknal_tx_decref(tx);
+	}
+
+	return count == 0 ? 0 : -EPROTO;
+}
+
+static int
+ksocknal_send_hello_v1 (ksock_conn_t *conn, ksock_hello_msg_t *hello)
+{
+	socket_t	*sock = conn->ksnc_sock;
+	lnet_hdr_t	  *hdr;
+	lnet_magicversion_t *hmv;
+	int		  rc;
+	int		  i;
+
+	CLASSERT(sizeof(lnet_magicversion_t) == offsetof(lnet_hdr_t, src_nid));
+
+	LIBCFS_ALLOC(hdr, sizeof(*hdr));
+	if (hdr == NULL) {
+		CERROR("Can't allocate lnet_hdr_t\n");
+		return -ENOMEM;
+	}
+
+	hmv = (lnet_magicversion_t *)&hdr->dest_nid;	/* magic/version are written over hdr->dest_nid */
+
+	/* Re-organize the V2.x hello message into a V1.x (lnet_hdr_t)
+	 * header and send it out */
+	hmv->magic	 = cpu_to_le32 (LNET_PROTO_TCP_MAGIC);
+	hmv->version_major = cpu_to_le16 (KSOCK_PROTO_V1_MAJOR);
+	hmv->version_minor = cpu_to_le16 (KSOCK_PROTO_V1_MINOR);
+
+	if (the_lnet.ln_testprotocompat != 0) {
+		/* single-shot proto check: each test bit is cleared once used */
+		LNET_LOCK();
+		if ((the_lnet.ln_testprotocompat & 1) != 0) {
+			hmv->version_major++;   /* just different! */
+			the_lnet.ln_testprotocompat &= ~1;
+		}
+		if ((the_lnet.ln_testprotocompat & 2) != 0) {
+			hmv->magic = LNET_PROTO_MAGIC;
+			the_lnet.ln_testprotocompat &= ~2;
+		}
+		LNET_UNLOCK();
+	}
+
+	hdr->src_nid	= cpu_to_le64 (hello->kshm_src_nid);
+	hdr->src_pid	= cpu_to_le32 (hello->kshm_src_pid);
+	hdr->type	   = cpu_to_le32 (LNET_MSG_HELLO);
+	hdr->payload_length = cpu_to_le32 (hello->kshm_nips * sizeof(__u32));
+	hdr->msg.hello.type = cpu_to_le32 (hello->kshm_ctype);
+	hdr->msg.hello.incarnation = cpu_to_le64 (hello->kshm_src_incarnation);
+
+	rc = libcfs_sock_write(sock, hdr, sizeof(*hdr),lnet_acceptor_timeout());
+
+	if (rc != 0) {
+		CNETERR("Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n",
+			rc, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+		goto out;
+	}
+
+	if (hello->kshm_nips == 0)	/* no interface list to send as payload */
+		goto out;
+
+	for (i = 0; i < (int) hello->kshm_nips; i++) {	/* NB converts the caller's kshm_ips[] in place */
+		hello->kshm_ips[i] = __cpu_to_le32 (hello->kshm_ips[i]);
+	}
+
+	rc = libcfs_sock_write(sock, hello->kshm_ips,
+			       hello->kshm_nips * sizeof(__u32),
+			       lnet_acceptor_timeout());
+	if (rc != 0) {
+		CNETERR("Error %d sending HELLO payload (%d)"
+			" to %u.%u.%u.%u/%d\n", rc, hello->kshm_nips,
+			HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+	}
+out:
+	LIBCFS_FREE(hdr, sizeof(*hdr));	/* every path after the allocation leaves through here */
+
+	return rc;
+}
+
+static int
+ksocknal_send_hello_v2 (ksock_conn_t *conn, ksock_hello_msg_t *hello)
+{
+	socket_t   *sock = conn->ksnc_sock;
+	int	     rc;
+
+	hello->kshm_magic   = LNET_PROTO_MAGIC;
+	hello->kshm_version = conn->ksnc_proto->pro_version;	/* shared by the V2.x and V3.x tables */
+
+	if (the_lnet.ln_testprotocompat != 0) {
+		/* single-shot proto check: the test bit is cleared once used */
+		LNET_LOCK();
+		if ((the_lnet.ln_testprotocompat & 1) != 0) {
+			hello->kshm_version++;   /* just different! */
+			the_lnet.ln_testprotocompat &= ~1;
+		}
+		LNET_UNLOCK();
+	}
+
+	/* fixed part first: everything in front of kshm_ips[], sent without byte-order conversion */
+	rc = libcfs_sock_write(sock, hello, offsetof(ksock_hello_msg_t, kshm_ips),
+			       lnet_acceptor_timeout());
+
+	if (rc != 0) {
+		CNETERR("Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n",
+			rc, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+		return rc;
+	}
+
+	if (hello->kshm_nips == 0)	/* no interface list to send */
+		return 0;
+
+	rc = libcfs_sock_write(sock, hello->kshm_ips,
+			       hello->kshm_nips * sizeof(__u32),
+			       lnet_acceptor_timeout());
+	if (rc != 0) {
+		CNETERR("Error %d sending HELLO payload (%d)"
+			" to %u.%u.%u.%u/%d\n", rc, hello->kshm_nips,
+			HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+	}
+
+	return rc;
+}
+
+static int
+ksocknal_recv_hello_v1(ksock_conn_t *conn, ksock_hello_msg_t *hello,int timeout)
+{
+	socket_t	*sock = conn->ksnc_sock;
+	lnet_hdr_t	  *hdr;
+	int		  rc;
+	int		  i;
+
+	LIBCFS_ALLOC(hdr, sizeof(*hdr));
+	if (hdr == NULL) {
+		CERROR("Can't allocate lnet_hdr_t\n");
+		return -ENOMEM;
+	}
+
+	/* only the part of the header from src_nid onwards is read here */
+	rc = libcfs_sock_read(sock, &hdr->src_nid,
+			      sizeof (*hdr) - offsetof (lnet_hdr_t, src_nid),
+			      timeout);
+	if (rc != 0) {
+		CERROR ("Error %d reading rest of HELLO hdr from %u.%u.%u.%u\n",
+			rc, HIPQUAD(conn->ksnc_ipaddr));
+		LASSERT (rc < 0 && rc != -EALREADY);
+		goto out;
+	}
+
+	/* ...and check we got what we expected */
+	if (hdr->type != cpu_to_le32 (LNET_MSG_HELLO)) {
+		CERROR ("Expecting a HELLO hdr,"
+			" but got type %d from %u.%u.%u.%u\n",
+			le32_to_cpu (hdr->type),
+			HIPQUAD(conn->ksnc_ipaddr));
+		rc = -EPROTO;
+		goto out;
+	}
+
+	/* V1.x header fields are little-endian on the wire */
+	hello->kshm_src_nid	 = le64_to_cpu (hdr->src_nid);
+	hello->kshm_src_pid	 = le32_to_cpu (hdr->src_pid);
+	hello->kshm_src_incarnation = le64_to_cpu (hdr->msg.hello.incarnation);
+	hello->kshm_ctype	   = le32_to_cpu (hdr->msg.hello.type);
+	hello->kshm_nips	    = le32_to_cpu (hdr->payload_length) /
+					 sizeof (__u32);
+
+	if (hello->kshm_nips > LNET_MAX_INTERFACES) {
+		CERROR("Bad nips %d from ip %u.%u.%u.%u\n",
+		       hello->kshm_nips, HIPQUAD(conn->ksnc_ipaddr));
+		rc = -EPROTO;
+		goto out;
+	}
+
+	if (hello->kshm_nips == 0)	/* no interface list follows; rc is still 0 */
+		goto out;
+
+	rc = libcfs_sock_read(sock, hello->kshm_ips,
+			      hello->kshm_nips * sizeof(__u32), timeout);
+	if (rc != 0) {
+		CERROR ("Error %d reading IPs from ip %u.%u.%u.%u\n",
+			rc, HIPQUAD(conn->ksnc_ipaddr));
+		LASSERT (rc < 0 && rc != -EALREADY);
+		goto out;
+	}
+
+	for (i = 0; i < (int) hello->kshm_nips; i++) {
+		hello->kshm_ips[i] = __le32_to_cpu(hello->kshm_ips[i]);
+
+		if (hello->kshm_ips[i] == 0) {	/* a zero address is rejected */
+			CERROR("Zero IP[%d] from ip %u.%u.%u.%u\n",
+			       i, HIPQUAD(conn->ksnc_ipaddr));
+			rc = -EPROTO;
+			break;
+		}
+	}
+out:
+	LIBCFS_FREE(hdr, sizeof(*hdr));	/* every path after the allocation leaves through here */
+
+	return rc;
+}
+
+static int
+ksocknal_recv_hello_v2 (ksock_conn_t *conn, ksock_hello_msg_t *hello, int timeout)
+{
+	socket_t      *sock = conn->ksnc_sock;
+	int		rc;
+	int		i;
+
+	/* hello->kshm_magic is expected to be filled in already; it is not read here */
+	if (hello->kshm_magic == LNET_PROTO_MAGIC)
+		conn->ksnc_flip = 0;
+	else
+		conn->ksnc_flip = 1;	/* fields read below get byte-swapped */
+
+	rc = libcfs_sock_read(sock, &hello->kshm_src_nid,
+			      offsetof(ksock_hello_msg_t, kshm_ips) -
+				       offsetof(ksock_hello_msg_t, kshm_src_nid),
+			      timeout);
+	if (rc != 0) {
+		CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+			rc, HIPQUAD(conn->ksnc_ipaddr));
+		LASSERT (rc < 0 && rc != -EALREADY);
+		return rc;
+	}
+
+	if (conn->ksnc_flip) {
+		__swab32s(&hello->kshm_src_pid);
+		__swab64s(&hello->kshm_src_nid);
+		__swab32s(&hello->kshm_dst_pid);
+		__swab64s(&hello->kshm_dst_nid);
+		__swab64s(&hello->kshm_src_incarnation);
+		__swab64s(&hello->kshm_dst_incarnation);
+		__swab32s(&hello->kshm_ctype);
+		__swab32s(&hello->kshm_nips);
+	}
+
+	if (hello->kshm_nips > LNET_MAX_INTERFACES) {
+		CERROR("Bad nips %d from ip %u.%u.%u.%u\n",
+		       hello->kshm_nips, HIPQUAD(conn->ksnc_ipaddr));
+		return -EPROTO;
+	}
+
+	if (hello->kshm_nips == 0)	/* no interface list follows */
+		return 0;
+
+	rc = libcfs_sock_read(sock, hello->kshm_ips,
+			      hello->kshm_nips * sizeof(__u32), timeout);
+	if (rc != 0) {
+		CERROR ("Error %d reading IPs from ip %u.%u.%u.%u\n",
+			rc, HIPQUAD(conn->ksnc_ipaddr));
+		LASSERT (rc < 0 && rc != -EALREADY);
+		return rc;
+	}
+
+	for (i = 0; i < (int) hello->kshm_nips; i++) {
+		if (conn->ksnc_flip)
+			__swab32s(&hello->kshm_ips[i]);
+
+		if (hello->kshm_ips[i] == 0) {	/* a zero address is rejected */
+			CERROR("Zero IP[%d] from ip %u.%u.%u.%u\n",
+			       i, HIPQUAD(conn->ksnc_ipaddr));
+			return -EPROTO;
+		}
+	}
+
+	return 0;
+}
+
+static void
+ksocknal_pack_msg_v1(ksock_tx_t *tx)
+{
+	/* V1.x has no KSOCK_MSG_NOOP, so an LNet message must be attached */
+	LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+	LASSERT(tx->tx_lnetmsg != NULL);
+
+	tx->tx_iov[0].iov_len  = sizeof(lnet_hdr_t);
+	tx->tx_iov[0].iov_base = (void *)&tx->tx_lnetmsg->msg_hdr;
+	tx->tx_nob   = tx->tx_lnetmsg->msg_len + sizeof(lnet_hdr_t);
+	tx->tx_resid = tx->tx_nob;	/* nothing sent yet */
+}
+
+static void
+ksocknal_pack_msg_v2(ksock_tx_t *tx)
+{
+	tx->tx_iov[0].iov_base = (void *)&tx->tx_msg;
+
+	if (tx->tx_lnetmsg == NULL) {
+		/* noop: only the part of ksock_msg_t before the LNet header */
+		LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+		tx->tx_iov[0].iov_len = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_hdr);
+		tx->tx_nob = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_hdr);
+	} else {
+		LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+		tx->tx_msg.ksm_u.lnetmsg.ksnm_hdr = tx->tx_lnetmsg->msg_hdr;
+		tx->tx_iov[0].iov_len = sizeof(ksock_msg_t);
+		tx->tx_nob = sizeof(ksock_msg_t) + tx->tx_lnetmsg->msg_len;
+	}
+	tx->tx_resid = tx->tx_nob;
+	/* Don't checksum before start sending, because packet can be piggybacked with ACK */
+}
+
+static void ksocknal_unpack_msg_v1(ksock_msg_t *msg)
+{
+	msg->ksm_type = KSOCK_MSG_LNET;
+	msg->ksm_csum = 0;
+	msg->ksm_zc_cookies[1] = 0;
+	msg->ksm_zc_cookies[0] = 0;
+}
+
+static void
+ksocknal_unpack_msg_v2(ksock_msg_t *msg)
+{
+	/* Do nothing: the message is left exactly as received */
+}
+
+ksock_proto_t ksocknal_protocol_v1x =
+{
+	.pro_version		= KSOCK_PROTO_V1,
+	.pro_send_hello		= ksocknal_send_hello_v1,
+	.pro_recv_hello		= ksocknal_recv_hello_v1,
+	.pro_pack		= ksocknal_pack_msg_v1,
+	.pro_unpack		= ksocknal_unpack_msg_v1,
+	.pro_queue_tx_msg	= ksocknal_queue_tx_msg_v1,
+	.pro_queue_tx_zcack	= NULL,	/* no ZC-ACK hooks in the V1.x table */
+	.pro_handle_zcreq	= NULL,
+	.pro_handle_zcack	= NULL,
+	.pro_match_tx		= ksocknal_match_tx
+};
+
+ksock_proto_t ksocknal_protocol_v2x =
+{
+	.pro_version		= KSOCK_PROTO_V2,
+	.pro_send_hello		= ksocknal_send_hello_v2,
+	.pro_recv_hello		= ksocknal_recv_hello_v2,
+	.pro_pack		= ksocknal_pack_msg_v2,
+	.pro_unpack		= ksocknal_unpack_msg_v2,
+	.pro_queue_tx_msg	= ksocknal_queue_tx_msg_v2,
+	.pro_queue_tx_zcack	= ksocknal_queue_tx_zcack_v2,
+	.pro_handle_zcreq	= ksocknal_handle_zcreq,
+	.pro_handle_zcack	= ksocknal_handle_zcack,
+	.pro_match_tx		= ksocknal_match_tx
+};
+
+ksock_proto_t ksocknal_protocol_v3x =
+{
+	.pro_version		= KSOCK_PROTO_V3,
+	.pro_send_hello		= ksocknal_send_hello_v2,
+	.pro_recv_hello		= ksocknal_recv_hello_v2,
+	.pro_pack		= ksocknal_pack_msg_v2,
+	.pro_unpack		= ksocknal_unpack_msg_v2,
+	.pro_queue_tx_msg	= ksocknal_queue_tx_msg_v2,
+	.pro_queue_tx_zcack	= ksocknal_queue_tx_zcack_v3,	/* V3-specific */
+	.pro_handle_zcreq	= ksocknal_handle_zcreq,
+	.pro_handle_zcack	= ksocknal_handle_zcack,
+	.pro_match_tx		= ksocknal_match_tx_v3	/* V3-specific */
+};
diff --git a/drivers/staging/lustre/lnet/lnet/Makefile b/drivers/staging/lustre/lnet/lnet/Makefile
new file mode 100644
index 000000000000..1bd9ef774208
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/Makefile
@@ -0,0 +1,8 @@
+obj-$(CONFIG_LNET) += lnet.o
+
+lnet-y := api-errno.o api-ni.o config.o lib-me.o lib-msg.o lib-eq.o	\
+	  lib-md.o lib-ptl.o lib-move.o module.o lo.o router.o		\
+	  router_proc.o acceptor.o peer.o
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lnet/lnet/acceptor.c b/drivers/staging/lustre/lnet/lnet/acceptor.c
new file mode 100644
index 000000000000..81ef28bbcba0
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/acceptor.c
@@ -0,0 +1,527 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+
+/* TCP port the acceptor listens on (module parameter, default 988) */
+static int   accept_port    = 988;
+/* backlog handed to libcfs_sock_listen() (module parameter) */
+static int   accept_backlog = 127;
+/* timeout in seconds used for the acceptor's socket reads and writes */
+static int   accept_timeout = 5;
+
+/* State shared between the acceptor thread and lnet_acceptor_start/stop() */
+struct {
+	/* startup status stored by the thread; set to 1 to request its exit */
+	int			pta_shutdown;
+	/* listening socket; NULL while the acceptor is not running */
+	socket_t		*pta_sock;
+	/* completed by the thread once after startup and once on exit */
+	struct completion	pta_signal;
+} lnet_acceptor_state;
+
+/* Return the configured acceptor port (the 'accept_port' tunable). */
+int
+lnet_acceptor_port(void)
+{
+	const int port = accept_port;
+
+	return port;
+}
+
+/*
+ * Return non-zero if 'magic' equals 'constant' in either byte order,
+ * i.e. as-is or 32-bit byte-swapped.
+ */
+static inline int
+lnet_accept_magic(__u32 magic, __u32 constant)
+{
+	if (magic == constant)
+		return 1;
+
+	return magic == __swab32(constant);
+}
+
+
+EXPORT_SYMBOL(lnet_acceptor_port);
+
+/* Acceptance policy string; interpreted by accept2secure() */
+static char *accept = "secure";
+
+/* Acceptor tunables: accept_timeout is registered with mode 0644, the
+ * others with mode 0444 */
+CFS_MODULE_PARM(accept, "s", charp, 0444,
+		"Accept connections (secure|all|none)");
+CFS_MODULE_PARM(accept_port, "i", int, 0444,
+		"Acceptor's port (same on all nodes)");
+CFS_MODULE_PARM(accept_backlog, "i", int, 0444,
+		"Acceptor's listen backlog");
+CFS_MODULE_PARM(accept_timeout, "i", int, 0644,
+		"Acceptor's timeout (seconds)");
+
+/* Alias of 'accept'; assigned in lnet_acceptor_get_tunables() */
+static char *accept_type = NULL;
+
+/*
+ * Publish the 'accept' module parameter under the name 'accept_type'.
+ *
+ * The userland acceptor calls this tunable 'accept_type' because 'accept'
+ * would clash with accept(2); the kernel acceptor keeps 'accept' for
+ * compatibility, so the two are simply aliased here.  Always returns 0.
+ */
+int
+lnet_acceptor_get_tunables(void)
+{
+	accept_type = accept;
+
+	return 0;
+}
+
+/* Return the acceptor timeout in seconds (the 'accept_timeout' tunable). */
+int
+lnet_acceptor_timeout(void)
+{
+	const int seconds = accept_timeout;
+
+	return seconds;
+}
+EXPORT_SYMBOL(lnet_acceptor_timeout);
+
+/*
+ * Report a failed outgoing connection attempt.
+ *
+ * rc is the negative errno of the failure; peer_nid, peer_ip and peer_port
+ * identify the peer and are only used to build the message.  Refused,
+ * unreachable and timed-out connections are reported through CNETERR();
+ * every other error goes through LCONSOLE_ERROR_MSG().
+ */
+void
+lnet_connect_console_error (int rc, lnet_nid_t peer_nid,
+			   __u32 peer_ip, int peer_port)
+{
+	switch (rc) {
+	/* "normal" errors */
+	case -ECONNREFUSED:
+		CNETERR("Connection to %s at host %u.%u.%u.%u on port %d was "
+			"refused: check that Lustre is running on that node.\n",
+			libcfs_nid2str(peer_nid),
+			HIPQUAD(peer_ip), peer_port);
+		break;
+	case -EHOSTUNREACH:
+	case -ENETUNREACH:
+		CNETERR("Connection to %s at host %u.%u.%u.%u "
+			"was unreachable: the network or that node may "
+			"be down, or Lustre may be misconfigured.\n",
+			libcfs_nid2str(peer_nid), HIPQUAD(peer_ip));
+		break;
+	case -ETIMEDOUT:
+		CNETERR("Connection to %s at host %u.%u.%u.%u on "
+			"port %d took too long: that node may be hung "
+			"or experiencing high load.\n",
+			libcfs_nid2str(peer_nid),
+			HIPQUAD(peer_ip), peer_port);
+		break;
+	/* errors below are reported on the console with a message id */
+	case -ECONNRESET:
+		LCONSOLE_ERROR_MSG(0x11b, "Connection to %s at host %u.%u.%u.%u"
+				   " on port %d was reset: "
+				   "is it running a compatible version of "
+				   "Lustre and is %s one of its NIDs?\n",
+				   libcfs_nid2str(peer_nid),
+				   HIPQUAD(peer_ip), peer_port,
+				   libcfs_nid2str(peer_nid));
+		break;
+	case -EPROTO:
+		LCONSOLE_ERROR_MSG(0x11c, "Protocol error connecting to %s at "
+				   "host %u.%u.%u.%u on port %d: is it running "
+				   "a compatible version of Lustre?\n",
+				   libcfs_nid2str(peer_nid),
+				   HIPQUAD(peer_ip), peer_port);
+		break;
+	case -EADDRINUSE:
+		LCONSOLE_ERROR_MSG(0x11d, "No privileged ports available to "
+				   "connect to %s at host %u.%u.%u.%u on port "
+				   "%d\n", libcfs_nid2str(peer_nid),
+				   HIPQUAD(peer_ip), peer_port);
+		break;
+	default:
+		LCONSOLE_ERROR_MSG(0x11e, "Unexpected error %d connecting to %s"
+				   " at host %u.%u.%u.%u on port %d\n", rc,
+				   libcfs_nid2str(peer_nid),
+				   HIPQUAD(peer_ip), peer_port);
+		break;
+	}
+}
+EXPORT_SYMBOL(lnet_connect_console_error);
+
+/*
+ * Connect to a peer's acceptor from a reserved local port and send the
+ * acceptor connection request.
+ *
+ * Local ports are tried from LNET_ACCEPTOR_MAX_RESERVED_PORT down to
+ * LNET_ACCEPTOR_MIN_RESERVED_PORT.  On success the connected socket is
+ * stored in *sockp and 0 is returned.  On failure a negative errno is
+ * returned after it has been reported via lnet_connect_console_error();
+ * -EADDRINUSE means every reserved port was tried without success.
+ */
+int
+lnet_connect(socket_t **sockp, lnet_nid_t peer_nid,
+	    __u32 local_ip, __u32 peer_ip, int peer_port)
+{
+	lnet_acceptor_connreq_t cr;
+	socket_t	   *sock;
+	int		     rc;
+	int		     port;
+	int		     fatal;
+
+	CLASSERT (sizeof(cr) <= 16);	    /* not too big to be on the stack */
+
+	for (port = LNET_ACCEPTOR_MAX_RESERVED_PORT;
+	     port >= LNET_ACCEPTOR_MIN_RESERVED_PORT;
+	     --port) {
+		/* Iterate through reserved ports. */
+
+		rc = libcfs_sock_connect(&sock, &fatal,
+					 local_ip, port,
+					 peer_ip, peer_port);
+		if (rc != 0) {
+			/* give up on a fatal error, otherwise try the
+			 * next local port */
+			if (fatal)
+				goto failed;
+			continue;
+		}
+
+		CLASSERT (LNET_PROTO_ACCEPTOR_VERSION == 1);
+
+		cr.acr_magic   = LNET_PROTO_ACCEPTOR_MAGIC;
+		cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+		cr.acr_nid     = peer_nid;
+
+		if (the_lnet.ln_testprotocompat != 0) {
+			/* single-shot proto check */
+			/* bit 4 sends a bumped version, bit 8 sends
+			 * LNET_PROTO_MAGIC instead of the acceptor magic;
+			 * each bit is cleared once it has been applied */
+			lnet_net_lock(LNET_LOCK_EX);
+			if ((the_lnet.ln_testprotocompat & 4) != 0) {
+				cr.acr_version++;
+				the_lnet.ln_testprotocompat &= ~4;
+			}
+			if ((the_lnet.ln_testprotocompat & 8) != 0) {
+				cr.acr_magic = LNET_PROTO_MAGIC;
+				the_lnet.ln_testprotocompat &= ~8;
+			}
+			lnet_net_unlock(LNET_LOCK_EX);
+		}
+
+		rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+				       accept_timeout);
+		if (rc != 0)
+			goto failed_sock;
+
+		*sockp = sock;
+		return 0;
+	}
+
+	/* ran out of reserved ports without a fatal error */
+	rc = -EADDRINUSE;
+	goto failed;
+
+ failed_sock:
+	libcfs_sock_release(sock);
+ failed:
+	lnet_connect_console_error(rc, peer_nid, peer_ip, peer_port);
+	return rc;
+}
+EXPORT_SYMBOL(lnet_connect);
+
+
+/* Below is the code common for both kernel and MT user-space */
+
+/*
+ * Validate an incoming acceptor connection request and pass the socket to
+ * the LND of the NI it is addressed to.
+ *
+ * sock is the newly accepted socket; magic is the first 32-bit word that
+ * the caller has already read from it.  The version and the target NID
+ * are read here and byte-swapped if the magic arrived byte-swapped.
+ *
+ * Returns the result of the LND's lnd_accept() for a well-formed request,
+ * -EPROTO for an unsupported magic or version (a magic+version reply is
+ * sent back to peers that look like newer protocol revisions), -EIO if
+ * the rest of the request can't be read, and -EPERM if no local NI has
+ * the requested NID or its LND has no lnd_accept() method.  The socket is
+ * never released here.
+ */
+int
+lnet_accept(socket_t *sock, __u32 magic)
+{
+	lnet_acceptor_connreq_t cr;
+	__u32		   peer_ip;
+	int		     peer_port;
+	int		     rc;
+	int		     flip;
+	lnet_ni_t	      *ni;
+	char		   *str;
+
+	/* same compile-time check as lnet_connect() */
+	CLASSERT (sizeof(cr) <= 16);	     /* not too big for the stack */
+
+	rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
+	LASSERT (rc == 0);		      /* we succeeded before */
+
+	if (!lnet_accept_magic(magic, LNET_PROTO_ACCEPTOR_MAGIC)) {
+
+		if (lnet_accept_magic(magic, LNET_PROTO_MAGIC)) {
+			/* future version compatibility!
+			 * When LNET unifies protocols over all LNDs, the first
+			 * thing sent will be a version query.  I send back
+			 * LNET_PROTO_ACCEPTOR_MAGIC to tell her I'm "old" */
+
+			memset (&cr, 0, sizeof(cr));
+			cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC;
+			cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+			rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+					       accept_timeout);
+
+			if (rc != 0)
+				CERROR("Error sending magic+version in response "
+				       "to LNET magic from %u.%u.%u.%u: %d\n",
+				       HIPQUAD(peer_ip), rc);
+			return -EPROTO;
+		}
+
+		/* not a protocol we can answer: name it for the log */
+		if (magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC))
+			str = "'old' socknal/tcpnal";
+		else if (lnet_accept_magic(magic, LNET_PROTO_RA_MAGIC))
+			str = "'old' ranal";
+		else
+			str = "unrecognised";
+
+		LCONSOLE_ERROR_MSG(0x11f, "Refusing connection from %u.%u.%u.%u"
+				   " magic %08x: %s acceptor protocol\n",
+				   HIPQUAD(peer_ip), magic, str);
+		return -EPROTO;
+	}
+
+	/* the peer has the opposite byte order if its magic came swapped */
+	flip = (magic != LNET_PROTO_ACCEPTOR_MAGIC);
+
+	rc = libcfs_sock_read(sock, &cr.acr_version,
+			      sizeof(cr.acr_version),
+			      accept_timeout);
+	if (rc != 0) {
+		CERROR("Error %d reading connection request version from "
+		       "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+		return -EIO;
+	}
+
+	if (flip)
+		__swab32s(&cr.acr_version);
+
+	if (cr.acr_version != LNET_PROTO_ACCEPTOR_VERSION) {
+		/* future version compatibility!
+		 * An acceptor-specific protocol rev will first send a version
+		 * query.  I send back my current version to tell her I'm
+		 * "old". */
+		int peer_version = cr.acr_version;
+
+		memset (&cr, 0, sizeof(cr));
+		cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC;
+		cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+
+		rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+				       accept_timeout);
+
+		if (rc != 0)
+			CERROR("Error sending magic+version in response "
+			       "to version %d from %u.%u.%u.%u: %d\n",
+			       peer_version, HIPQUAD(peer_ip), rc);
+		return -EPROTO;
+	}
+
+	/* read everything from acr_nid to the end of the request */
+	rc = libcfs_sock_read(sock, &cr.acr_nid,
+			      sizeof(cr) -
+			      offsetof(lnet_acceptor_connreq_t, acr_nid),
+			      accept_timeout);
+	if (rc != 0) {
+		CERROR("Error %d reading connection request from "
+		       "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+		return -EIO;
+	}
+
+	if (flip)
+		__swab64s(&cr.acr_nid);
+
+	ni = lnet_net2ni(LNET_NIDNET(cr.acr_nid));
+	if (ni == NULL ||	       /* no matching net */
+	    ni->ni_nid != cr.acr_nid) { /* right NET, wrong NID! */
+		if (ni != NULL)
+			lnet_ni_decref(ni);
+		LCONSOLE_ERROR_MSG(0x120, "Refusing connection from %u.%u.%u.%u"
+				   " for %s: No matching NI\n",
+				   HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid));
+		return -EPERM;
+	}
+
+	if (ni->ni_lnd->lnd_accept == NULL) {
+		/* This catches a request for the loopback LND */
+		lnet_ni_decref(ni);
+		LCONSOLE_ERROR_MSG(0x121, "Refusing connection from %u.%u.%u.%u"
+				  " for %s: NI does not accept IP connections\n",
+				  HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid));
+		return -EPERM;
+	}
+
+	CDEBUG(D_NET, "Accept %s from %u.%u.%u.%u\n",
+	       libcfs_nid2str(cr.acr_nid), HIPQUAD(peer_ip));
+
+	rc = ni->ni_lnd->lnd_accept(ni, sock);
+
+	/* drop the NI reference now that the LND has been called */
+	lnet_ni_decref(ni);
+	return rc;
+}
+
+/*
+ * Acceptor thread body.
+ *
+ * arg carries the "secure" flag as an integer cast to a pointer.  The
+ * thread opens the listening socket on accept_port, stores the result in
+ * pta_shutdown and completes pta_signal so that the starter can read it.
+ * It then accepts connections until pta_shutdown becomes non-zero, passing
+ * each accepted socket and its leading magic to lnet_accept().  On exit it
+ * releases the listening socket and completes pta_signal again.
+ *
+ * Returns the listen error if startup failed, otherwise 0.
+ */
+int
+lnet_acceptor(void *arg)
+{
+	socket_t  *newsock;
+	int	    rc;
+	__u32	  magic;
+	__u32	  peer_ip;
+	int	    peer_port;
+	int	    secure = (int)((long_ptr_t)arg);
+
+	LASSERT (lnet_acceptor_state.pta_sock == NULL);
+
+	cfs_block_allsigs();
+
+	rc = libcfs_sock_listen(&lnet_acceptor_state.pta_sock,
+				0, accept_port, accept_backlog);
+	if (rc != 0) {
+		if (rc == -EADDRINUSE)
+			LCONSOLE_ERROR_MSG(0x122, "Can't start acceptor on port"
+					   " %d: port already in use\n",
+					   accept_port);
+		else
+			LCONSOLE_ERROR_MSG(0x123, "Can't start acceptor on port "
+					   "%d: unexpected error %d\n",
+					   accept_port, rc);
+
+		lnet_acceptor_state.pta_sock = NULL;
+	} else {
+		LCONSOLE(0, "Accept %s, port %d\n", accept_type, accept_port);
+	}
+
+	/* set init status and unblock parent */
+	lnet_acceptor_state.pta_shutdown = rc;
+	complete(&lnet_acceptor_state.pta_signal);
+
+	if (rc != 0)
+		return rc;
+
+	while (!lnet_acceptor_state.pta_shutdown) {
+
+		rc = libcfs_sock_accept(&newsock, lnet_acceptor_state.pta_sock);
+		if (rc != 0) {
+			/* -EAGAIN is retried immediately; any other error
+			 * is logged and retried after a one second pause */
+			if (rc != -EAGAIN) {
+				CWARN("Accept error %d: pausing...\n", rc);
+				cfs_pause(cfs_time_seconds(1));
+			}
+			continue;
+		}
+
+		/* maybe we're waken up with libcfs_sock_abort_accept() */
+		if (lnet_acceptor_state.pta_shutdown) {
+			libcfs_sock_release(newsock);
+			break;
+		}
+
+		rc = libcfs_sock_getaddr(newsock, 1, &peer_ip, &peer_port);
+		if (rc != 0) {
+			CERROR("Can't determine new connection's address\n");
+			goto failed;
+		}
+
+		/* in secure mode only peers connecting from a reserved
+		 * source port are accepted */
+		if (secure && peer_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {
+			CERROR("Refusing connection from %u.%u.%u.%u: "
+			       "insecure port %d\n",
+			       HIPQUAD(peer_ip), peer_port);
+			goto failed;
+		}
+
+		rc = libcfs_sock_read(newsock, &magic, sizeof(magic),
+				      accept_timeout);
+		if (rc != 0) {
+			CERROR("Error %d reading connection request from "
+			       "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+			goto failed;
+		}
+
+		rc = lnet_accept(newsock, magic);
+		if (rc != 0)
+			goto failed;
+
+		/* accepted: the socket is not released here (presumably it
+		 * now belongs to the LND -- confirm against lnd_accept) */
+		continue;
+
+	failed:
+		libcfs_sock_release(newsock);
+	}
+
+	libcfs_sock_release(lnet_acceptor_state.pta_sock);
+	lnet_acceptor_state.pta_sock = NULL;
+
+	CDEBUG(D_NET, "Acceptor stopping\n");
+
+	/* unblock lnet_acceptor_stop() */
+	complete(&lnet_acceptor_state.pta_signal);
+	return 0;
+}
+
+/*
+ * Translate the acceptance policy string 'acc'.
+ *
+ * Returns 1 when the acceptor should run, with *sec set to 1 for "secure"
+ * and to 0 for "all"; returns 0 for "none" (*sec is left untouched); any
+ * other string is reported on the console and yields -EINVAL.
+ */
+static inline int
+accept2secure(const char *acc, long *sec)
+{
+	if (strcmp(acc, "secure") == 0) {
+		*sec = 1;
+		return 1;
+	}
+
+	if (strcmp(acc, "all") == 0) {
+		*sec = 0;
+		return 1;
+	}
+
+	if (strcmp(acc, "none") == 0)
+		return 0;
+
+	LCONSOLE_ERROR_MSG(0x124, "Can't parse 'accept=\"%s\"'\n",
+			   acc);
+	return -EINVAL;
+}
+
+/*
+ * Start the acceptor thread if the configuration calls for one.
+ *
+ * Returns 0 when the acceptor is running or is not needed (policy "none",
+ * or lnet_count_acceptor_nis() reports 0), -EINVAL for an unparsable
+ * policy string, -ESRCH if the thread could not be created, and -ENETDOWN
+ * if the thread started but failed to set up its listening socket.
+ */
+int
+lnet_acceptor_start(void)
+{
+	int  rc;
+	long rc2;
+	long secure;
+
+	LASSERT (lnet_acceptor_state.pta_sock == NULL);
+
+	rc = lnet_acceptor_get_tunables();
+	if (rc != 0)
+		return rc;
+
+
+	init_completion(&lnet_acceptor_state.pta_signal);
+	rc = accept2secure(accept_type, &secure);
+	if (rc <= 0) {
+		/* 0: acceptor disabled, < 0: bad policy string */
+		fini_completion(&lnet_acceptor_state.pta_signal);
+		return rc;
+	}
+
+	/* NOTE(review): this early return does not call fini_completion()
+	 * like the other exit paths do -- confirm that is intentional */
+	if (lnet_count_acceptor_nis() == 0)  /* not required */
+		return 0;
+
+	/* the secure flag travels to the thread through its arg pointer */
+	rc2 = PTR_ERR(kthread_run(lnet_acceptor,
+				  (void *)(ulong_ptr_t)secure,
+				  "acceptor_%03ld", secure));
+	if (IS_ERR_VALUE(rc2)) {
+		CERROR("Can't start acceptor thread: %ld\n", rc2);
+		fini_completion(&lnet_acceptor_state.pta_signal);
+
+		return -ESRCH;
+	}
+
+	/* wait for acceptor to startup */
+	wait_for_completion(&lnet_acceptor_state.pta_signal);
+
+	/* the thread left its startup status in pta_shutdown */
+	if (!lnet_acceptor_state.pta_shutdown) {
+		/* started OK */
+		LASSERT(lnet_acceptor_state.pta_sock != NULL);
+		return 0;
+	}
+
+	LASSERT(lnet_acceptor_state.pta_sock == NULL);
+	fini_completion(&lnet_acceptor_state.pta_signal);
+
+	return -ENETDOWN;
+}
+
+/*
+ * Stop the acceptor thread, if it is running, and wait for it to exit.
+ * A NULL listening socket means there is nothing to stop.
+ */
+void
+lnet_acceptor_stop(void)
+{
+	if (lnet_acceptor_state.pta_sock != NULL) {
+		/* ask the thread to exit and kick it out of accept */
+		lnet_acceptor_state.pta_shutdown = 1;
+		libcfs_sock_abort_accept(lnet_acceptor_state.pta_sock);
+
+		/* the thread completes pta_signal just before returning */
+		wait_for_completion(&lnet_acceptor_state.pta_signal);
+
+		fini_completion(&lnet_acceptor_state.pta_signal);
+	}
+}
diff --git a/drivers/staging/lustre/lnet/lnet/api-errno.c b/drivers/staging/lustre/lnet/lnet/api-errno.c
new file mode 100644
index 000000000000..695b27265e23
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/api-errno.c
@@ -0,0 +1,39 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/api-errno.c
+ *
+ * Instantiate the string table of errors
+ */
+
+/* If you change these, you must update the number table in portals/errno.h */
diff --git a/drivers/staging/lustre/lnet/lnet/api-ni.c b/drivers/staging/lustre/lnet/lnet/api-ni.c
new file mode 100644
index 000000000000..e88bee362497
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/api-ni.c
@@ -0,0 +1,1941 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+#include <linux/log2.h>
+
+#define D_LNI D_CONSOLE
+
+lnet_t      the_lnet;			   /* THE state of the network */
+EXPORT_SYMBOL(the_lnet);
+
+
+/* Module parameters, all registered with mode 0444.  For 'ip2nets' and
+ * 'networks' an empty string means "not set" (see lnet_get_networks()). */
+static char *ip2nets = "";
+CFS_MODULE_PARM(ip2nets, "s", charp, 0444,
+		"LNET network <- IP table");
+
+static char *networks = "";
+CFS_MODULE_PARM(networks, "s", charp, 0444,
+		"local networks");
+
+/* returned unmodified by lnet_get_routes() */
+static char *routes = "";
+CFS_MODULE_PARM(routes, "s", charp, 0444,
+		"routes to non-local networks");
+
+static int rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT;
+CFS_MODULE_PARM(rnet_htable_size, "i", int, 0444,
+		"size of remote network hash table");
+
+/* Return the 'routes' module parameter string as-is. */
+char *
+lnet_get_routes(void)
+{
+	char *cfg = routes;
+
+	return cfg;
+}
+
+/*
+ * Work out the local networks string from the module parameters.
+ *
+ * 'networks' and 'ip2nets' are mutually exclusive: setting both is
+ * reported on the console and yields NULL.  With only 'ip2nets' set the
+ * result of lnet_parse_ip2nets() is returned (NULL if parsing fails);
+ * with only 'networks' set that string is returned; with neither set the
+ * default "tcp" is returned.
+ */
+char *
+lnet_get_networks(void)
+{
+	const int	have_networks = (*networks != 0);
+	const int	have_ip2nets = (*ip2nets != 0);
+	char		*parsed;
+
+	if (have_networks && have_ip2nets) {
+		LCONSOLE_ERROR_MSG(0x101, "Please specify EITHER 'networks' or "
+				   "'ip2nets' but not both at once\n");
+		return NULL;
+	}
+
+	if (have_ip2nets) {
+		if (lnet_parse_ip2nets(&parsed, ip2nets) != 0)
+			return NULL;
+		return parsed;
+	}
+
+	return have_networks ? networks : "tcp";
+}
+
+/*
+ * Initialise the statically embedded LNet synchronisation objects: the
+ * LND and API mutexes, and the EQ wait lock with its wait queue.
+ */
+void
+lnet_init_locks(void)
+{
+	mutex_init(&the_lnet.ln_lnd_mutex);
+	mutex_init(&the_lnet.ln_api_mutex);
+
+	spin_lock_init(&the_lnet.ln_eq_wait_lock);
+	init_waitqueue_head(&the_lnet.ln_eq_waitq);
+}
+
+/* Counterpart of lnet_init_locks(); there is nothing to tear down here. */
+void
+lnet_fini_locks(void)
+{
+}
+
+
+/*
+ * Allocate the remote-nets hash table (LNET_REMOTE_NETS_HASH_SIZE list
+ * heads), initialise every bucket to an empty list and publish it in
+ * the_lnet.ln_remote_nets_hash.
+ *
+ * Returns 0 on success or -ENOMEM if the allocation fails.
+ */
+static int
+lnet_create_remote_nets_table(void)
+{
+	struct list_head	*table;
+	int			idx;
+
+	LASSERT(the_lnet.ln_remote_nets_hash == NULL);
+	LASSERT(the_lnet.ln_remote_nets_hbits > 0);
+
+	LIBCFS_ALLOC(table, LNET_REMOTE_NETS_HASH_SIZE * sizeof(table[0]));
+	if (table == NULL) {
+		CERROR("Failed to create remote nets hash table\n");
+		return -ENOMEM;
+	}
+
+	idx = 0;
+	while (idx < LNET_REMOTE_NETS_HASH_SIZE) {
+		INIT_LIST_HEAD(&table[idx]);
+		idx++;
+	}
+
+	the_lnet.ln_remote_nets_hash = table;
+	return 0;
+}
+
+/*
+ * Free the remote-nets hash table, if one was created.  Every bucket is
+ * asserted to be empty before the table is released.
+ */
+static void
+lnet_destroy_remote_nets_table(void)
+{
+	struct list_head	*table = the_lnet.ln_remote_nets_hash;
+	int			idx;
+
+	if (table == NULL)
+		return;
+
+	for (idx = 0; idx < LNET_REMOTE_NETS_HASH_SIZE; idx++)
+		LASSERT(list_empty(&table[idx]));
+
+	LIBCFS_FREE(the_lnet.ln_remote_nets_hash,
+		    LNET_REMOTE_NETS_HASH_SIZE * sizeof(*table));
+	the_lnet.ln_remote_nets_hash = NULL;
+}
+
+/*
+ * Free the per-CPT resource and net locks if they were allocated, reset
+ * their pointers, then call lnet_fini_locks().  Safe to call when only
+ * some (or none) of the locks exist.
+ */
+static void
+lnet_destroy_locks(void)
+{
+	if (the_lnet.ln_res_lock) {
+		cfs_percpt_lock_free(the_lnet.ln_res_lock);
+		the_lnet.ln_res_lock = NULL;
+	}
+
+	if (the_lnet.ln_net_lock) {
+		cfs_percpt_lock_free(the_lnet.ln_net_lock);
+		the_lnet.ln_net_lock = NULL;
+	}
+
+	lnet_fini_locks();
+}
+
+/*
+ * Initialise the static locks and allocate the per-CPT resource and net
+ * locks.
+ *
+ * Returns 0 on success.  If either allocation fails, whatever was set up
+ * is undone through lnet_destroy_locks() and -ENOMEM is returned.
+ */
+static int
+lnet_create_locks(void)
+{
+	lnet_init_locks();
+
+	the_lnet.ln_res_lock = cfs_percpt_lock_alloc(lnet_cpt_table());
+	if (the_lnet.ln_res_lock != NULL) {
+		the_lnet.ln_net_lock =
+			cfs_percpt_lock_alloc(lnet_cpt_table());
+		if (the_lnet.ln_net_lock != NULL)
+			return 0;
+	}
+
+	lnet_destroy_locks();
+	return -ENOMEM;
+}
+
+/*
+ * Assert, via CLASSERT, that the LNet wire constants have their expected
+ * values and that the on-wire structures (lnet_handle_wire_t,
+ * lnet_magicversion_t, lnet_hdr_t and its per-message unions) have the
+ * expected sizes and field offsets.  The function contains nothing but
+ * these assertions.
+ */
+void lnet_assert_wire_constants (void)
+{
+	/* Wire protocol assertions generated by 'wirecheck'
+	 * running on Linux robert.bartonsoftware.com 2.6.8-1.521
+	 * #1 Mon Aug 16 09:01:18 EDT 2004 i686 athlon i386 GNU/Linux
+	 * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */
+
+	/* Constants... */
+	CLASSERT (LNET_PROTO_TCP_MAGIC == 0xeebc0ded);
+	CLASSERT (LNET_PROTO_TCP_VERSION_MAJOR == 1);
+	CLASSERT (LNET_PROTO_TCP_VERSION_MINOR == 0);
+	CLASSERT (LNET_MSG_ACK == 0);
+	CLASSERT (LNET_MSG_PUT == 1);
+	CLASSERT (LNET_MSG_GET == 2);
+	CLASSERT (LNET_MSG_REPLY == 3);
+	CLASSERT (LNET_MSG_HELLO == 4);
+
+	/* Checks for lnet_handle_wire_t */
+	CLASSERT ((int)sizeof(lnet_handle_wire_t) == 16);
+	CLASSERT ((int)offsetof(lnet_handle_wire_t, wh_interface_cookie) == 0);
+	CLASSERT ((int)sizeof(((lnet_handle_wire_t *)0)->wh_interface_cookie) == 8);
+	CLASSERT ((int)offsetof(lnet_handle_wire_t, wh_object_cookie) == 8);
+	CLASSERT ((int)sizeof(((lnet_handle_wire_t *)0)->wh_object_cookie) == 8);
+
+	/* Checks for struct lnet_magicversion_t */
+	CLASSERT ((int)sizeof(lnet_magicversion_t) == 8);
+	CLASSERT ((int)offsetof(lnet_magicversion_t, magic) == 0);
+	CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->magic) == 4);
+	CLASSERT ((int)offsetof(lnet_magicversion_t, version_major) == 4);
+	CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->version_major) == 2);
+	CLASSERT ((int)offsetof(lnet_magicversion_t, version_minor) == 6);
+	CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->version_minor) == 2);
+
+	/* Checks for struct lnet_hdr_t */
+	CLASSERT ((int)sizeof(lnet_hdr_t) == 72);
+	CLASSERT ((int)offsetof(lnet_hdr_t, dest_nid) == 0);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->dest_nid) == 8);
+	CLASSERT ((int)offsetof(lnet_hdr_t, src_nid) == 8);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->src_nid) == 8);
+	CLASSERT ((int)offsetof(lnet_hdr_t, dest_pid) == 16);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->dest_pid) == 4);
+	CLASSERT ((int)offsetof(lnet_hdr_t, src_pid) == 20);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->src_pid) == 4);
+	CLASSERT ((int)offsetof(lnet_hdr_t, type) == 24);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->type) == 4);
+	CLASSERT ((int)offsetof(lnet_hdr_t, payload_length) == 28);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->payload_length) == 4);
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg) == 32);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg) == 40);
+
+	/* Ack */
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.dst_wmd) == 32);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.dst_wmd) == 16);
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.match_bits) == 48);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.match_bits) == 8);
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.mlength) == 56);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.mlength) == 4);
+
+	/* Put */
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.ack_wmd) == 32);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.ack_wmd) == 16);
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.match_bits) == 48);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.match_bits) == 8);
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.hdr_data) == 56);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.hdr_data) == 8);
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.ptl_index) == 64);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.ptl_index) == 4);
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.offset) == 68);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.offset) == 4);
+
+	/* Get */
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.return_wmd) == 32);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.return_wmd) == 16);
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.match_bits) == 48);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.match_bits) == 8);
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.ptl_index) == 56);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.ptl_index) == 4);
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.src_offset) == 60);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.src_offset) == 4);
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.sink_length) == 64);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.sink_length) == 4);
+
+	/* Reply */
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.reply.dst_wmd) == 32);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.reply.dst_wmd) == 16);
+
+	/* Hello */
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.hello.incarnation) == 32);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.hello.incarnation) == 8);
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.hello.type) == 40);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.hello.type) == 4);
+}
+
+/*
+ * Look up a registered LND by its type.
+ *
+ * Walks the_lnet.ln_lnds and returns the first entry whose lnd_type
+ * equals 'type', or NULL if there is none.  The caller holds the LND
+ * mutex.
+ */
+lnd_t *
+lnet_find_lnd_by_type(int type)
+{
+	lnd_t	*cur;
+
+	list_for_each_entry(cur, &the_lnet.ln_lnds, lnd_list) {
+		if ((int)cur->lnd_type == type)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Register an LND: append it to the_lnet.ln_lnds with a zero reference
+ * count, under the LND mutex.  LNet must be initialised, the LND type
+ * must be a known one, and no LND of that type may be registered yet
+ * (all asserted).
+ */
+void
+lnet_register_lnd(lnd_t *lnd)
+{
+	LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+
+	LASSERT(the_lnet.ln_init);
+	LASSERT(libcfs_isknown_lnd(lnd->lnd_type));
+	LASSERT(lnet_find_lnd_by_type(lnd->lnd_type) == NULL);
+
+	list_add_tail(&lnd->lnd_list, &the_lnet.ln_lnds);
+	lnd->lnd_refcount = 0;
+
+	CDEBUG(D_NET, "%s LND registered\n",
+	       libcfs_lnd2str(lnd->lnd_type));
+
+	LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+}
+EXPORT_SYMBOL(lnet_register_lnd);
+
+/*
+ * Unregister an LND: remove it from the_lnet.ln_lnds under the LND mutex.
+ * LNet must be initialised, 'lnd' must be the LND registered for its type
+ * and its reference count must be zero (all asserted).
+ */
+void
+lnet_unregister_lnd(lnd_t *lnd)
+{
+	LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+
+	LASSERT(the_lnet.ln_init);
+	LASSERT(lnet_find_lnd_by_type(lnd->lnd_type) == lnd);
+	LASSERT(lnd->lnd_refcount == 0);
+
+	list_del(&lnd->lnd_list);
+
+	CDEBUG(D_NET, "%s LND unregistered\n",
+	       libcfs_lnd2str(lnd->lnd_type));
+
+	LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+}
+EXPORT_SYMBOL(lnet_unregister_lnd);
+
+/*
+ * Sum the per-CPT LNet counters into *counters.
+ *
+ * *counters is zeroed first, then every field of each per-CPT counter
+ * block in the_lnet.ln_counters is added to it while holding the net
+ * lock exclusively (LNET_LOCK_EX).
+ */
+void
+lnet_counters_get(lnet_counters_t *counters)
+{
+	lnet_counters_t *ctr;
+	int		i;
+
+	memset(counters, 0, sizeof(*counters));
+
+	lnet_net_lock(LNET_LOCK_EX);
+
+	cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) {
+		counters->msgs_max     += ctr->msgs_max;
+		counters->msgs_alloc   += ctr->msgs_alloc;
+		counters->errors       += ctr->errors;
+		counters->send_count   += ctr->send_count;
+		counters->recv_count   += ctr->recv_count;
+		counters->route_count  += ctr->route_count;
+		/* drop_count, not drop_length: the length is summed once
+		 * below with the other *_length fields */
+		counters->drop_count   += ctr->drop_count;
+		counters->send_length  += ctr->send_length;
+		counters->recv_length  += ctr->recv_length;
+		counters->route_length += ctr->route_length;
+		counters->drop_length  += ctr->drop_length;
+	}
+
+	lnet_net_unlock(LNET_LOCK_EX);
+}
+EXPORT_SYMBOL(lnet_counters_get);
+
+/*
+ * Zero every per-CPT counter block in the_lnet.ln_counters while holding
+ * the net lock exclusively (LNET_LOCK_EX).
+ */
+void
+lnet_counters_reset(void)
+{
+	lnet_counters_t	*ctr;
+	int		cpt;
+
+	lnet_net_lock(LNET_LOCK_EX);
+
+	cfs_percpt_for_each(ctr, cpt, the_lnet.ln_counters) {
+		memset(ctr, 0, sizeof(*ctr));
+	}
+
+	lnet_net_unlock(LNET_LOCK_EX);
+}
+EXPORT_SYMBOL(lnet_counters_reset);
+
+#ifdef LNET_USE_LIB_FREELIST
+
+/*
+ * Set up a freelist of 'n' objects with 'size' bytes of payload each.
+ *
+ * One contiguous buffer is allocated for all objects; every object is
+ * zeroed and linked onto fl->fl_list.  'n' must be positive (asserted).
+ * Returns 0 on success or -ENOMEM if the buffer can't be allocated.
+ */
+int
+lnet_freelist_init(lnet_freelist_t *fl, int n, int size)
+{
+	char	*obj;
+
+	LASSERT(n > 0);
+
+	/* account for the lnet_freeobj_t fields preceding fo_contents */
+	size += offsetof(lnet_freeobj_t, fo_contents);
+
+	LIBCFS_ALLOC(obj, n * size);
+	if (obj == NULL)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&fl->fl_list);
+	fl->fl_objs = obj;
+	fl->fl_nobjs = n;
+	fl->fl_objsize = size;
+
+	do {
+		memset(obj, 0, size);
+		list_add((struct list_head *)obj, &fl->fl_list);
+		obj += size;
+	} while (--n != 0);
+
+	return 0;
+}
+
+/*
+ * Release a freelist set up by lnet_freelist_init().
+ *
+ * A freelist with fl_nobjs == 0 is left alone.  Otherwise all objects
+ * must be back on fl_list (asserted by counting the list), the backing
+ * buffer is freed and *fl is zeroed.
+ */
+void
+lnet_freelist_fini(lnet_freelist_t *fl)
+{
+	struct list_head	*pos;
+	int			nfree = 0;
+
+	if (fl->fl_nobjs == 0)
+		return;
+
+	list_for_each(pos, &fl->fl_list)
+		nfree++;
+
+	LASSERT(nfree == fl->fl_nobjs);
+
+	LIBCFS_FREE(fl->fl_objs, fl->fl_nobjs * fl->fl_objsize);
+	memset(fl, 0, sizeof(*fl));
+}
+
+#endif /* LNET_USE_LIB_FREELIST */
+
+/*
+ * Build the interface cookie from the current time of day, expressed as
+ * seconds * 1000000 + microseconds.
+ *
+ * NB the interface cookie in wire handles guards against delayed replies
+ * and ACKs appearing valid after reboot, so initialisation time is
+ * probably easily good enough.
+ */
+__u64
+lnet_create_interface_cookie(void)
+{
+	struct timeval	now;
+
+	do_gettimeofday(&now);
+
+	return (__u64)now.tv_sec * 1000000 + now.tv_usec;
+}
+
+/*
+ * Return a short name ("MD", "ME" or "EQ") for a resource cookie type.
+ * Any other type triggers LBUG(); should that return, "MD" is used.
+ */
+static char *
+lnet_res_type2str(int type)
+{
+	switch (type) {
+	case LNET_COOKIE_TYPE_ME:
+		return "ME";
+	case LNET_COOKIE_TYPE_EQ:
+		return "EQ";
+	case LNET_COOKIE_TYPE_MD:
+		return "MD";
+	default:
+		LBUG();
+		return "MD";
+	}
+}
+
+/*
+ * Tear down a resource container.
+ *
+ * A container whose rec_type is 0 was never set up (or has already been
+ * cleaned up) and is left alone.  Otherwise any EQs or MDs still on the
+ * active list are freed and reported, the freelist (when compiled in)
+ * and the handle hash table are released, and rec_type is reset to 0.
+ */
+void
+lnet_res_container_cleanup(struct lnet_res_container *rec)
+{
+	int	nleft = 0;
+
+	if (rec->rec_type == 0) /* not set yet, it's uninitialized */
+		return;
+
+	for (; !list_empty(&rec->rec_active); nleft++) {
+		struct list_head *node = rec->rec_active.next;
+
+		list_del_init(node);
+
+		switch (rec->rec_type) {
+		case LNET_COOKIE_TYPE_EQ:
+			lnet_eq_free(list_entry(node, lnet_eq_t, eq_list));
+			break;
+		case LNET_COOKIE_TYPE_MD:
+			lnet_md_free(list_entry(node, lnet_libmd_t, md_list));
+			break;
+		default:
+			/* NB: Active MEs should be attached on portals */
+			LBUG();
+		}
+	}
+
+	if (nleft > 0) {
+		/* Found alive MD/ME/EQ, user really should unlink/free
+		 * all of them before finalize LNet, but if someone didn't,
+		 * we have to recycle garbage for him */
+		CERROR("%d active elements on exit of %s container\n",
+		       nleft, lnet_res_type2str(rec->rec_type));
+	}
+
+#ifdef LNET_USE_LIB_FREELIST
+	lnet_freelist_fini(&rec->rec_freelist);
+#endif
+	if (rec->rec_lh_hash != NULL) {
+		LIBCFS_FREE(rec->rec_lh_hash,
+			    LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0]));
+		rec->rec_lh_hash = NULL;
+	}
+
+	/* mark it as finalized */
+	rec->rec_type = 0;
+}
+
+/*
+ * Set up a resource container for resources of cookie type 'type' on CPT
+ * 'cpt'.
+ *
+ * The container must not be set up already (rec_type == 0, asserted).
+ * 'objnum' and 'objsz' are only used when LNET_USE_LIB_FREELIST is
+ * defined, to size the freelist.
+ *
+ * Returns 0 on success.  On failure the error is logged, the container is
+ * cleaned up again and a negative errno is returned (-ENOMEM if the
+ * handle hash table can't be allocated).
+ */
+int
+lnet_res_container_setup(struct lnet_res_container *rec,
+			 int cpt, int type, int objnum, int objsz)
+{
+	int	rc = 0;
+	int	i;
+
+	LASSERT(rec->rec_type == 0);
+
+	rec->rec_type = type;
+	INIT_LIST_HEAD(&rec->rec_active);
+
+#ifdef LNET_USE_LIB_FREELIST
+	memset(&rec->rec_freelist, 0, sizeof(rec->rec_freelist));
+	rc = lnet_freelist_init(&rec->rec_freelist, objnum, objsz);
+	if (rc != 0)
+		goto out;
+#endif
+	/* first cookie: the type in the low bits, the CPT right above them */
+	rec->rec_lh_cookie = (cpt << LNET_COOKIE_TYPE_BITS) | type;
+
+	/* Arbitrary choice of hash table size */
+	LIBCFS_CPT_ALLOC(rec->rec_lh_hash, lnet_cpt_table(), cpt,
+			 LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0]));
+	if (rec->rec_lh_hash == NULL) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < LNET_LH_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&rec->rec_lh_hash[i]);
+
+	return 0;
+
+out:
+	CERROR("Failed to setup %s resource container\n",
+	       lnet_res_type2str(type));
+	/* undo the partial setup; this also resets rec_type to 0 */
+	lnet_res_container_cleanup(rec);
+	return rc;
+}
+
+/*
+ * Clean up every per-CPT resource container in 'recs', then free the
+ * per-CPT array itself.
+ */
+static void
+lnet_res_containers_destroy(struct lnet_res_container **recs)
+{
+	struct lnet_res_container	*cur;
+	int				cpt;
+
+	cfs_percpt_for_each(cur, cpt, recs) {
+		lnet_res_container_cleanup(cur);
+	}
+
+	cfs_percpt_free(recs);
+}
+
+/*
+ * Allocate one resource container per CPT and set each of them up for
+ * cookie type 'type' ('objnum' and 'objsz' are passed through to
+ * lnet_res_container_setup()).
+ *
+ * Returns the per-CPT container array, or NULL if the allocation or any
+ * container setup fails; in the latter case everything is destroyed again.
+ */
+static struct lnet_res_container **
+lnet_res_containers_create(int type, int objnum, int objsz)
+{
+	struct lnet_res_container	**tbl;
+	struct lnet_res_container	*cur;
+	int				cpt;
+
+	tbl = cfs_percpt_alloc(lnet_cpt_table(),
+			       sizeof(struct lnet_res_container));
+	if (tbl == NULL) {
+		CERROR("Failed to allocate %s resource containers\n",
+		       lnet_res_type2str(type));
+		return NULL;
+	}
+
+	cfs_percpt_for_each(cur, cpt, tbl) {
+		if (lnet_res_container_setup(cur, cpt, type,
+					     objnum, objsz) == 0)
+			continue;
+
+		lnet_res_containers_destroy(tbl);
+		return NULL;
+	}
+
+	return tbl;
+}
+
+/*
+ * Find the handle with the given cookie in a resource container.
+ *
+ * Returns NULL straight away if the type bits of 'cookie' don't match the
+ * container's type; otherwise the hash chain selected by the cookie bits
+ * above the type and CPT bits is searched for an exact cookie match.
+ * Returns the handle, or NULL if there is none.
+ *
+ * ALWAYS called with lnet_res_lock held.
+ */
+lnet_libhandle_t *
+lnet_res_lh_lookup(struct lnet_res_container *rec, __u64 cookie)
+{
+	struct list_head	*bucket;
+	lnet_libhandle_t	*cur;
+	unsigned int		idx;
+
+	if ((cookie & LNET_COOKIE_MASK) != rec->rec_type)
+		return NULL;
+
+	idx = cookie >> (LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS);
+	bucket = &rec->rec_lh_hash[idx & LNET_LH_HASH_MASK];
+
+	list_for_each_entry(cur, bucket, lh_hash_chain) {
+		if (cur->lh_cookie != cookie)
+			continue;
+		return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Assign the container's next cookie to handle \a lh and insert the handle
+ * into the container's cookie hash table.
+ */
+void
+lnet_res_lh_initialize(struct lnet_res_container *rec, lnet_libhandle_t *lh)
+{
+	/* ALWAYS called with lnet_res_lock held */
+	unsigned int	shift = LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS;
+	unsigned int	bucket;
+
+	lh->lh_cookie = rec->rec_lh_cookie;
+	/* advance the counter that sits above the type and CPT bits */
+	rec->rec_lh_cookie += 1 << shift;
+
+	bucket = (lh->lh_cookie >> shift) & LNET_LH_HASH_MASK;
+	list_add(&lh->lh_hash_chain, &rec->rec_lh_hash[bucket]);
+}
+
+
+/* lnet_prepare() unwinds through lnet_unprepare(), which is defined below */
+int lnet_unprepare(void);
+
+/*
+ * Prepare the global LNet state for bringing up the network: record the PID,
+ * initialize the NI/peer/router lists and create the remote nets table,
+ * counters, peer tables, message containers, the EQ/ME/MD resource
+ * containers and the portals.
+ *
+ * \param requested_pid PID to store in the_lnet.ln_pid; it must not have
+ * LNET_PID_USERFLAG set.
+ *
+ * \return 0 on success.  On failure everything created so far is torn down
+ * by lnet_unprepare() and a non-zero error code is returned.
+ */
+int
+lnet_prepare(lnet_pid_t requested_pid)
+{
+	/* Prepare to bring up the network */
+	struct lnet_res_container **recs;
+	int			  rc = 0;
+
+	LASSERT (the_lnet.ln_refcount == 0);
+
+	the_lnet.ln_routing = 0;
+
+	LASSERT ((requested_pid & LNET_PID_USERFLAG) == 0);
+	the_lnet.ln_pid = requested_pid;
+
+	INIT_LIST_HEAD(&the_lnet.ln_test_peers);
+	INIT_LIST_HEAD(&the_lnet.ln_nis);
+	INIT_LIST_HEAD(&the_lnet.ln_nis_cpt);
+	INIT_LIST_HEAD(&the_lnet.ln_nis_zombie);
+	INIT_LIST_HEAD(&the_lnet.ln_routers);
+
+	rc = lnet_create_remote_nets_table();
+	if (rc != 0)
+		goto failed;
+
+	the_lnet.ln_interface_cookie = lnet_create_interface_cookie();
+
+	the_lnet.ln_counters = cfs_percpt_alloc(lnet_cpt_table(),
+						sizeof(lnet_counters_t));
+	if (the_lnet.ln_counters == NULL) {
+		CERROR("Failed to allocate counters for LNet\n");
+		rc = -ENOMEM;
+		goto failed;
+	}
+
+	rc = lnet_peer_tables_create();
+	if (rc != 0)
+		goto failed;
+
+	rc = lnet_msg_containers_create();
+	if (rc != 0)
+		goto failed;
+
+	rc = lnet_res_container_setup(&the_lnet.ln_eq_container, 0,
+				      LNET_COOKIE_TYPE_EQ, LNET_FL_MAX_EQS,
+				      sizeof(lnet_eq_t));
+	if (rc != 0)
+		goto failed;
+
+	/* NB lnet_res_containers_create() only reports failure as NULL, and
+	 * rc is still 0 here, so an error code must be set explicitly or the
+	 * failure would be returned as success */
+	recs = lnet_res_containers_create(LNET_COOKIE_TYPE_ME, LNET_FL_MAX_MES,
+					  sizeof(lnet_me_t));
+	if (recs == NULL) {
+		rc = -ENOMEM;
+		goto failed;
+	}
+
+	the_lnet.ln_me_containers = recs;
+
+	recs = lnet_res_containers_create(LNET_COOKIE_TYPE_MD, LNET_FL_MAX_MDS,
+					  sizeof(lnet_libmd_t));
+	if (recs == NULL) {
+		rc = -ENOMEM;
+		goto failed;
+	}
+
+	the_lnet.ln_md_containers = recs;
+
+	rc = lnet_portals_create();
+	if (rc != 0) {
+		CERROR("Failed to create portals for LNet: %d\n", rc);
+		goto failed;
+	}
+
+	return 0;
+
+ failed:
+	lnet_unprepare();
+	return rc;
+}
+
+/*
+ * Undo lnet_prepare(): destroy the portals, the MD/ME/EQ resource
+ * containers, the message containers, the peer tables, the router pools,
+ * the counters and the remote nets table.
+ *
+ * It is also the failure path of lnet_prepare(), so the MD/ME containers and
+ * the counters are only released if they were actually allocated.
+ *
+ * \return always 0.
+ */
+int
+lnet_unprepare (void)
+{
+	/* NB no LNET_LOCK since this is the last reference.  All LND instances
+	 * have shut down already, so it is safe to unlink and free all
+	 * descriptors, even those that appear committed to a network op (eg MD
+	 * with non-zero pending count) */
+
+	lnet_fail_nid(LNET_NID_ANY, 0);
+
+	/* nothing may still be using LNet or be linked on the NI lists */
+	LASSERT(the_lnet.ln_refcount == 0);
+	LASSERT(list_empty(&the_lnet.ln_test_peers));
+	LASSERT(list_empty(&the_lnet.ln_nis));
+	LASSERT(list_empty(&the_lnet.ln_nis_cpt));
+	LASSERT(list_empty(&the_lnet.ln_nis_zombie));
+
+	lnet_portals_destroy();
+
+	if (the_lnet.ln_md_containers != NULL) {
+		lnet_res_containers_destroy(the_lnet.ln_md_containers);
+		the_lnet.ln_md_containers = NULL;
+	}
+
+	if (the_lnet.ln_me_containers != NULL) {
+		lnet_res_containers_destroy(the_lnet.ln_me_containers);
+		the_lnet.ln_me_containers = NULL;
+	}
+
+	lnet_res_container_cleanup(&the_lnet.ln_eq_container);
+
+	lnet_msg_containers_destroy();
+	lnet_peer_tables_destroy();
+	lnet_rtrpools_free();
+
+	if (the_lnet.ln_counters != NULL) {
+		cfs_percpt_free(the_lnet.ln_counters);
+		the_lnet.ln_counters = NULL;
+	}
+	lnet_destroy_remote_nets_table();
+
+	return 0;
+}
+
+/*
+ * Find the first NI on the_lnet.ln_nis that belongs to network \a net.
+ *
+ * \return the NI with a reference taken on \a cpt, or NULL if there is none.
+ * \a cpt must not be LNET_LOCK_EX.
+ */
+lnet_ni_t *
+lnet_net2ni_locked(__u32 net, int cpt)
+{
+	lnet_ni_t	*ni;
+
+	LASSERT(cpt != LNET_LOCK_EX);
+
+	list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+		if (LNET_NIDNET(ni->ni_nid) != net)
+			continue;
+
+		lnet_ni_addref_locked(ni, cpt);
+		return ni;
+	}
+
+	return NULL;
+}
+
+/*
+ * Locking wrapper around lnet_net2ni_locked(): performs the lookup under
+ * lnet_net_lock(0).
+ *
+ * \return the NI for \a net with a reference held, or NULL.
+ */
+lnet_ni_t *
+lnet_net2ni(__u32 net)
+{
+	lnet_ni_t *found;
+
+	lnet_net_lock(0);
+	found = lnet_net2ni_locked(net, 0);
+	lnet_net_unlock(0);
+
+	return found;
+}
+EXPORT_SYMBOL(lnet_net2ni);
+
+/*
+ * Hash \a nid to an index in the range [0, \a number).
+ * \a number must be between 1 and LNET_CPT_NUMBER inclusive.
+ */
+static unsigned int
+lnet_nid_cpt_hash(lnet_nid_t nid, unsigned int number)
+{
+	__u64		key = nid;
+	unsigned int	hashed;
+
+	LASSERT(number >= 1 && number <= LNET_CPT_NUMBER);
+
+	if (number == 1)
+		return 0;
+
+	hashed = cfs_hash_long(key, LNET_CPT_BITS);
+	/* NB: LNET_CPT_NUMBER doesn't have to be PO2, so the hash value can
+	 * be out of range and has to be folded back into it */
+	if (hashed >= number)
+		hashed = (unsigned int)(key + hashed + (hashed >> 1)) % number;
+
+	return hashed;
+}
+
+/*
+ * Return the CPT that \a nid maps to.
+ *
+ * If an NI on the_lnet.ln_nis_cpt serves the network of \a nid, the NID is
+ * hashed over that NI's own CPT array; otherwise it is hashed over all CPTs.
+ */
+int
+lnet_cpt_of_nid_locked(lnet_nid_t nid)
+{
+	struct lnet_ni *ni;
+
+	/* must be called with lnet_net_lock held; holding it on any CPT
+	 * would be OK */
+	if (LNET_CPT_NUMBER == 1)
+		return 0; /* the only one */
+
+	list_for_each_entry(ni, &the_lnet.ln_nis_cpt, ni_cptlist) {
+		if (LNET_NIDNET(ni->ni_nid) == LNET_NIDNET(nid)) {
+			LASSERT(ni->ni_cpts != NULL);
+			return ni->ni_cpts[lnet_nid_cpt_hash(nid,
+							     ni->ni_ncpts)];
+		}
+	}
+
+	return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER);
+}
+
+/*
+ * Return the CPT that \a nid maps to, taking the net lock only when the
+ * list of CPT-bound NIs has to be searched.
+ */
+int
+lnet_cpt_of_nid(lnet_nid_t nid)
+{
+	int	locked_cpt;
+	int	nid_cpt;
+
+	if (LNET_CPT_NUMBER == 1)
+		return 0; /* the only one */
+
+	/* no NI is bound to specific CPTs: a plain hash is enough */
+	if (list_empty(&the_lnet.ln_nis_cpt))
+		return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER);
+
+	locked_cpt = lnet_net_lock_current();
+	nid_cpt = lnet_cpt_of_nid_locked(nid);
+	lnet_net_unlock(locked_cpt);
+
+	return nid_cpt;
+}
+EXPORT_SYMBOL(lnet_cpt_of_nid);
+
+/*
+ * Tell whether one of the local NIs is on network \a net.
+ *
+ * \return 1 if such an NI exists, 0 otherwise.
+ */
+int
+lnet_islocalnet(__u32 net)
+{
+	struct lnet_ni	*ni;
+	int		cpt = lnet_net_lock_current();
+	int		found;
+
+	ni = lnet_net2ni_locked(net, cpt);
+	found = (ni != NULL);
+	if (found) {
+		/* only existence matters: drop the lookup's reference */
+		lnet_ni_decref_locked(ni, cpt);
+	}
+
+	lnet_net_unlock(cpt);
+
+	return found;
+}
+
+/*
+ * Find the NI on the_lnet.ln_nis whose NID is exactly \a nid.
+ *
+ * \return the NI with a reference taken on \a cpt, or NULL if there is none.
+ * \a cpt must not be LNET_LOCK_EX.
+ */
+lnet_ni_t *
+lnet_nid2ni_locked(lnet_nid_t nid, int cpt)
+{
+	struct lnet_ni	*ni;
+
+	LASSERT(cpt != LNET_LOCK_EX);
+
+	list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+		if (ni->ni_nid != nid)
+			continue;
+
+		lnet_ni_addref_locked(ni, cpt);
+		return ni;
+	}
+
+	return NULL;
+}
+
+/*
+ * Tell whether \a nid is the NID of one of the local NIs.
+ *
+ * \return 1 if it is, 0 otherwise.
+ */
+int
+lnet_islocalnid(lnet_nid_t nid)
+{
+	struct lnet_ni	*ni;
+	int		cpt = lnet_net_lock_current();
+	int		found;
+
+	ni = lnet_nid2ni_locked(nid, cpt);
+	found = (ni != NULL);
+	if (found) {
+		/* only existence matters: drop the lookup's reference */
+		lnet_ni_decref_locked(ni, cpt);
+	}
+	lnet_net_unlock(cpt);
+
+	return found;
+}
+
+/*
+ * Return the # of NIs that need the acceptor, i.e. those whose LND provides
+ * an lnd_accept method.
+ */
+int
+lnet_count_acceptor_nis(void)
+{
+	struct lnet_ni	*ni;
+	int		cpt;
+	int		count = 0;
+
+	cpt = lnet_net_lock_current();
+
+	list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+		if (ni->ni_lnd->lnd_accept != NULL)
+			count++;
+	}
+
+	lnet_net_unlock(cpt);
+
+	return count;
+}
+
+/*
+ * Compute the number of tx credits to give each tx queue of \a ni.
+ *
+ * With a single CPT the queue gets all of ni_maxtxcredits.  Otherwise it
+ * gets an equal share, raised to at least 8 * ni_peertxcredits and capped at
+ * ni_maxtxcredits.
+ */
+static int
+lnet_ni_tq_credits(lnet_ni_t *ni)
+{
+	int	share;
+
+	LASSERT(ni->ni_ncpts >= 1);
+
+	if (ni->ni_ncpts == 1)
+		return ni->ni_maxtxcredits;
+
+	share = ni->ni_maxtxcredits / ni->ni_ncpts;
+	share = max(share, 8 * ni->ni_peertxcredits);
+
+	return min(share, ni->ni_maxtxcredits);
+}
+
+/*
+ * Shut down all LND network interfaces (NIs).
+ *
+ * Every NI is first moved from the global lists onto the zombie list so
+ * nobody can find it anymore.  Lazy portals and the peer tables are then
+ * cleaned up so that the references they hold on NIs go away.  Finally each
+ * zombie NI is shut down through its LND and freed, once all of its
+ * reference counters have dropped to zero.
+ */
+void
+lnet_shutdown_lndnis(void)
+{
+	int		i;
+	int		islo;
+	lnet_ni_t	 *ni;
+
+	/* NB called holding the global mutex */
+
+	/* All quiet on the API front */
+	LASSERT(!the_lnet.ln_shutdown);
+	LASSERT(the_lnet.ln_refcount == 0);
+	LASSERT(list_empty(&the_lnet.ln_nis_zombie));
+
+	lnet_net_lock(LNET_LOCK_EX);
+	the_lnet.ln_shutdown = 1;	/* flag shutdown */
+
+	/* Unlink NIs from the global table */
+	while (!list_empty(&the_lnet.ln_nis)) {
+		ni = list_entry(the_lnet.ln_nis.next,
+				    lnet_ni_t, ni_list);
+		/* move it to zombie list and nobody can find it anymore */
+		list_move(&ni->ni_list, &the_lnet.ln_nis_zombie);
+		lnet_ni_decref_locked(ni, 0);	/* drop ln_nis' ref */
+
+		if (!list_empty(&ni->ni_cptlist)) {
+			list_del_init(&ni->ni_cptlist);
+			lnet_ni_decref_locked(ni, 0);
+		}
+	}
+
+	/* Drop the cached eqwait NI. */
+	if (the_lnet.ln_eq_waitni != NULL) {
+		lnet_ni_decref_locked(the_lnet.ln_eq_waitni, 0);
+		the_lnet.ln_eq_waitni = NULL;
+	}
+
+	/* Drop the cached loopback NI. */
+	if (the_lnet.ln_loni != NULL) {
+		lnet_ni_decref_locked(the_lnet.ln_loni, 0);
+		the_lnet.ln_loni = NULL;
+	}
+
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	/* Clear lazy portals and drop delayed messages which hold refs
+	 * on their lnet_msg_t::msg_rxpeer */
+	for (i = 0; i < the_lnet.ln_nportals; i++)
+		LNetClearLazyPortal(i);
+
+	/* Clear the peer table and wait for all peers to go (they hold refs on
+	 * their NIs) */
+	lnet_peer_tables_cleanup();
+
+	lnet_net_lock(LNET_LOCK_EX);
+	/* Now wait for the NIs I just moved to ln_nis_zombie to become idle
+	 * and shut them down in guaranteed thread context */
+	i = 2;
+	while (!list_empty(&the_lnet.ln_nis_zombie)) {
+		int	*ref;
+		int	j;
+
+		ni = list_entry(the_lnet.ln_nis_zombie.next,
+				    lnet_ni_t, ni_list);
+		list_del_init(&ni->ni_list);
+		cfs_percpt_for_each(ref, j, ni->ni_refs) {
+			if (*ref == 0)
+				continue;
+			/* still busy, add it back to zombie list */
+			list_add(&ni->ni_list, &the_lnet.ln_nis_zombie);
+			break;
+		}
+
+		/* NB this has to be an 'if': the NI was put back on the zombie
+		 * list, so after pausing we must restart the outer loop to
+		 * unlink it and check its refcounts again.  Looping here
+		 * instead would spin forever, because nothing in this block
+		 * ever empties ni_list. */
+		if (!list_empty(&ni->ni_list)) {
+			lnet_net_unlock(LNET_LOCK_EX);
+			++i;
+			/* only warn when i is a power of 2, so the message
+			 * gets rarer the longer we wait */
+			if ((i & (-i)) == i) {
+				CDEBUG(D_WARNING,
+				       "Waiting for zombie LNI %s\n",
+				       libcfs_nid2str(ni->ni_nid));
+			}
+			cfs_pause(cfs_time_seconds(1));
+			lnet_net_lock(LNET_LOCK_EX);
+			continue;
+		}
+
+		ni->ni_lnd->lnd_refcount--;
+		lnet_net_unlock(LNET_LOCK_EX);
+
+		/* remember this now; the LND may be gone after lnd_shutdown */
+		islo = ni->ni_lnd->lnd_type == LOLND;
+
+		LASSERT (!in_interrupt ());
+		(ni->ni_lnd->lnd_shutdown)(ni);
+
+		/* can't deref lnd anymore now; it might have unregistered
+		 * itself...  */
+
+		if (!islo)
+			CDEBUG(D_LNI, "Removed LNI %s\n",
+			       libcfs_nid2str(ni->ni_nid));
+
+		lnet_ni_free(ni);
+		lnet_net_lock(LNET_LOCK_EX);
+	}
+
+	the_lnet.ln_shutdown = 0;
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	if (the_lnet.ln_network_tokens != NULL) {
+		LIBCFS_FREE(the_lnet.ln_network_tokens,
+			    the_lnet.ln_network_tokens_nob);
+		the_lnet.ln_network_tokens = NULL;
+	}
+}
+
+/*
+ * Parse the configured networks and start up an NI for each of them.
+ *
+ * For every parsed NI the matching LND is looked up (loading its module on
+ * demand), its lnd_startup method is called, and the NI is added to
+ * the_lnet.ln_nis (and to the_lnet.ln_nis_cpt if it is bound to specific
+ * CPTs).  The tx queues of every non-loopback NI get their initial credits.
+ *
+ * \return 0 on success.  On any failure all NIs started so far are shut down
+ * again, the ones not yet started are freed, and -ENETDOWN is returned.
+ */
+int
+lnet_startup_lndnis(void)
+{
+	lnd_t			*lnd;
+	struct lnet_ni		*ni;
+	struct lnet_tx_queue	*tq;
+	struct list_head		nilist;
+	int			i;
+	int		rc = 0;
+	int		lnd_type;
+	int		nicount = 0;
+	char	      *nets = lnet_get_networks();
+
+	INIT_LIST_HEAD(&nilist);
+
+	if (nets == NULL)
+		goto failed;
+
+	rc = lnet_parse_networks(&nilist, nets);
+	if (rc != 0)
+		goto failed;
+
+	while (!list_empty(&nilist)) {
+		ni = list_entry(nilist.next, lnet_ni_t, ni_list);
+		lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid));
+
+		LASSERT (libcfs_isknown_lnd(lnd_type));
+
+		if (lnd_type == CIBLND    ||
+		    lnd_type == OPENIBLND ||
+		    lnd_type == IIBLND    ||
+		    lnd_type == VIBLND) {
+			CERROR("LND %s obsoleted\n",
+			       libcfs_lnd2str(lnd_type));
+			goto failed;
+		}
+
+		LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+		lnd = lnet_find_lnd_by_type(lnd_type);
+
+		if (lnd == NULL) {
+			/* not registered yet: try to load its module, without
+			 * holding the LND mutex, and look again */
+			LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+			rc = request_module("%s",
+						libcfs_lnd2modname(lnd_type));
+			LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+
+			lnd = lnet_find_lnd_by_type(lnd_type);
+			if (lnd == NULL) {
+				LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+				CERROR("Can't load LND %s, module %s, rc=%d\n",
+				       libcfs_lnd2str(lnd_type),
+				       libcfs_lnd2modname(lnd_type), rc);
+				goto failed;
+			}
+		}
+
+		lnet_net_lock(LNET_LOCK_EX);
+		lnd->lnd_refcount++;
+		lnet_net_unlock(LNET_LOCK_EX);
+
+		ni->ni_lnd = lnd;
+
+		rc = (lnd->lnd_startup)(ni);
+
+		LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+
+		if (rc != 0) {
+			LCONSOLE_ERROR_MSG(0x105, "Error %d starting up LNI %s"
+					   "\n",
+					   rc, libcfs_lnd2str(lnd->lnd_type));
+			lnet_net_lock(LNET_LOCK_EX);
+			lnd->lnd_refcount--;
+			lnet_net_unlock(LNET_LOCK_EX);
+			goto failed;
+		}
+
+		LASSERT (ni->ni_peertimeout <= 0 || lnd->lnd_query != NULL);
+
+		/* the NI is up: move it from the local list to the global
+		 * ones, so lnet_shutdown_lndnis() handles it from here on */
+		list_del(&ni->ni_list);
+
+		lnet_net_lock(LNET_LOCK_EX);
+		/* refcount for ln_nis */
+		lnet_ni_addref_locked(ni, 0);
+		list_add_tail(&ni->ni_list, &the_lnet.ln_nis);
+		if (ni->ni_cpts != NULL) {
+			list_add_tail(&ni->ni_cptlist,
+					  &the_lnet.ln_nis_cpt);
+			lnet_ni_addref_locked(ni, 0);
+		}
+
+		lnet_net_unlock(LNET_LOCK_EX);
+
+		if (lnd->lnd_type == LOLND) {
+			/* cache the loopback NI; it is not counted and gets
+			 * no credits */
+			lnet_ni_addref(ni);
+			LASSERT (the_lnet.ln_loni == NULL);
+			the_lnet.ln_loni = ni;
+			continue;
+		}
+
+		if (ni->ni_peertxcredits == 0 ||
+		    ni->ni_maxtxcredits == 0) {
+			/* say "per-peer" when it is the per-peer credits
+			 * that are missing */
+			LCONSOLE_ERROR_MSG(0x107, "LNI %s has no %scredits\n",
+					   libcfs_lnd2str(lnd->lnd_type),
+					   ni->ni_peertxcredits == 0 ?
+					   "per-peer " : "");
+			goto failed;
+		}
+
+		cfs_percpt_for_each(tq, i, ni->ni_tx_queues) {
+			tq->tq_credits_min =
+			tq->tq_credits_max =
+			tq->tq_credits = lnet_ni_tq_credits(ni);
+		}
+
+		CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n",
+		       libcfs_nid2str(ni->ni_nid), ni->ni_peertxcredits,
+		       lnet_ni_tq_credits(ni) * LNET_CPT_NUMBER,
+		       ni->ni_peerrtrcredits, ni->ni_peertimeout);
+
+		nicount++;
+	}
+
+	if (the_lnet.ln_eq_waitni != NULL && nicount > 1) {
+		lnd_type = the_lnet.ln_eq_waitni->ni_lnd->lnd_type;
+		LCONSOLE_ERROR_MSG(0x109, "LND %s can only run single-network"
+				   "\n",
+				   libcfs_lnd2str(lnd_type));
+		goto failed;
+	}
+
+	return 0;
+
+ failed:
+	/* shut down whatever made it onto the global lists... */
+	lnet_shutdown_lndnis();
+
+	/* ...and free the NIs that were parsed but never started */
+	while (!list_empty(&nilist)) {
+		ni = list_entry(nilist.next, lnet_ni_t, ni_list);
+		list_del(&ni->ni_list);
+		lnet_ni_free(ni);
+	}
+
+	return -ENETDOWN;
+}
+
+/**
+ * Initialize LNet library.
+ *
+ * Only userspace program needs to call this function - it's automatically
+ * called in the kernel at module loading time. Caller has to call LNetFini()
+ * after a call to LNetInit(), if and only if the latter returned 0. It must
+ * be called exactly once.
+ *
+ * This resets the_lnet, records the CPT table and the number of bits needed
+ * to number its CPTs, creates the global locks, sizes the remote nets hash
+ * table and registers the loopback LND.
+ *
+ * \return 0 on success, and -ve on failures (currently always -1).
+ */
+int
+LNetInit(void)
+{
+	int	rc;
+
+	lnet_assert_wire_constants();
+	LASSERT(!the_lnet.ln_init);
+
+	memset(&the_lnet, 0, sizeof(the_lnet));
+
+	/* refer to global cfs_cpt_table for now */
+	the_lnet.ln_cpt_table	= cfs_cpt_table;
+	the_lnet.ln_cpt_number	= cfs_cpt_number(cfs_cpt_table);
+
+	LASSERT(the_lnet.ln_cpt_number > 0);
+	if (the_lnet.ln_cpt_number > LNET_CPT_MAX) {
+		/* we are under risk of consuming all lh_cookie */
+		CERROR("Can't have %d CPTs for LNet (max allowed is %d), "
+		       "please change setting of CPT-table and retry\n",
+		       the_lnet.ln_cpt_number, LNET_CPT_MAX);
+		return -1;
+	}
+
+	/* ln_cpt_bits (zeroed by the memset above) becomes the smallest
+	 * number of bits that can hold ln_cpt_number distinct values */
+	while ((1 << the_lnet.ln_cpt_bits) < the_lnet.ln_cpt_number)
+		the_lnet.ln_cpt_bits++;
+
+	rc = lnet_create_locks();
+	if (rc != 0) {
+		CERROR("Can't create LNet global locks: %d\n", rc);
+		return -1;
+	}
+
+	the_lnet.ln_refcount = 0;
+	the_lnet.ln_init = 1;
+	LNetInvalidateHandle(&the_lnet.ln_rc_eqh);
+	INIT_LIST_HEAD(&the_lnet.ln_lnds);
+	INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie);
+	INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow);
+
+	/* The hash table size is the number of bits it takes to express the set
+	 * ln_num_routes, minus 1 (better to under estimate than over so we
+	 * don't waste memory). */
+	/* rnet_htable_size is first clamped: non-positive values select the
+	 * default, oversized values are cut down to the maximum */
+	if (rnet_htable_size <= 0)
+		rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT;
+	else if (rnet_htable_size > LNET_REMOTE_NETS_HASH_MAX)
+		rnet_htable_size = LNET_REMOTE_NETS_HASH_MAX;
+	the_lnet.ln_remote_nets_hbits = max_t(int, 1,
+					   order_base_2(rnet_htable_size) - 1);
+
+	/* All LNDs apart from the LOLND are in separate modules.  They
+	 * register themselves when their module loads, and unregister
+	 * themselves when their module is unloaded. */
+	lnet_register_lnd(&the_lolnd);
+	return 0;
+}
+EXPORT_SYMBOL(LNetInit);
+
+/**
+ * Finalize LNet library.
+ *
+ * Only userspace program needs to call this function. It can be called
+ * at most once.
+ *
+ * Every LND that is still registered gets unregistered, and the global
+ * locks are destroyed.
+ *
+ * \pre LNetInit() called with success.
+ * \pre All LNet users called LNetNIFini() for matching LNetNIInit() calls.
+ */
+void
+LNetFini(void)
+{
+	lnd_t	*lnd;
+
+	LASSERT(the_lnet.ln_init);
+	LASSERT(the_lnet.ln_refcount == 0);
+
+	while (!list_empty(&the_lnet.ln_lnds)) {
+		lnd = list_entry(the_lnet.ln_lnds.next, lnd_t, lnd_list);
+		lnet_unregister_lnd(lnd);
+	}
+
+	lnet_destroy_locks();
+
+	the_lnet.ln_init = 0;
+}
+EXPORT_SYMBOL(LNetFini);
+
+/**
+ * Set LNet PID and start LNet interfaces, routing, and forwarding.
+ *
+ * Userspace program should call this after a successful call to LNetInit().
+ * Users must call this function at least once before any other functions.
+ * For each successful call there must be a corresponding call to
+ * LNetNIFini(). For subsequent calls to LNetNIInit(), \a requested_pid is
+ * ignored.
+ *
+ * The PID used by LNet may be different from the one requested.
+ * See LNetGetId().
+ *
+ * \param requested_pid PID requested by the caller. LNET_PID_ANY is rejected
+ * with -ENETDOWN when LNet is not already up.
+ *
+ * \return >= 0 on success (0 for the call that brings LNet up, the previous
+ * reference count for later calls), and < 0 error code on failures.
+ */
+int
+LNetNIInit(lnet_pid_t requested_pid)
+{
+	int	 im_a_router = 0;
+	int	 rc;
+
+	LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex);
+
+	LASSERT (the_lnet.ln_init);
+	CDEBUG(D_OTHER, "refs %d\n", the_lnet.ln_refcount);
+
+	/* already up: just take another reference */
+	if (the_lnet.ln_refcount > 0) {
+		rc = the_lnet.ln_refcount++;
+		goto out;
+	}
+
+	lnet_get_tunables();
+
+	if (requested_pid == LNET_PID_ANY) {
+		/* Don't instantiate LNET just for me */
+		rc = -ENETDOWN;
+		goto failed0;
+	}
+
+	rc = lnet_prepare(requested_pid);
+	if (rc != 0)
+		goto failed0;
+
+	rc = lnet_startup_lndnis();
+	if (rc != 0)
+		goto failed1;
+
+	rc = lnet_parse_routes(lnet_get_routes(), &im_a_router);
+	if (rc != 0)
+		goto failed2;
+
+	rc = lnet_check_routes();
+	if (rc != 0)
+		goto failed2;
+
+	rc = lnet_rtrpools_alloc(im_a_router);
+	if (rc != 0)
+		goto failed2;
+
+	rc = lnet_acceptor_start();
+	if (rc != 0)
+		goto failed2;
+
+	the_lnet.ln_refcount = 1;
+	/* Now I may use my own API functions... */
+
+	/* NB router checker needs the_lnet.ln_ping_info in
+	 * lnet_router_checker -> lnet_update_ni_status_locked */
+	rc = lnet_ping_target_init();
+	if (rc != 0)
+		goto failed3;
+
+	rc = lnet_router_checker_start();
+	if (rc != 0)
+		goto failed4;
+
+	lnet_proc_init();
+	goto out;
+
+	/* the labels below undo the setup steps in reverse order; each one
+	 * falls through into the next */
+ failed4:
+	lnet_ping_target_fini();
+ failed3:
+	the_lnet.ln_refcount = 0;
+	lnet_acceptor_stop();
+ failed2:
+	lnet_destroy_routes();
+	lnet_shutdown_lndnis();
+ failed1:
+	lnet_unprepare();
+ failed0:
+	LASSERT (rc < 0);
+ out:
+	LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex);
+	return rc;
+}
+EXPORT_SYMBOL(LNetNIInit);
+
+/**
+ * Stop LNet interfaces, routing, and forwarding.
+ *
+ * Users must call this function once for each successful call to LNetNIInit().
+ * Once the LNetNIFini() operation has been started, the results of pending
+ * API operations are undefined.
+ *
+ * Only the call that drops the last reference tears LNet down; earlier calls
+ * just decrement the reference count.
+ *
+ * \return always 0 for current implementation.
+ */
+int
+LNetNIFini(void)
+{
+	LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex);
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	if (the_lnet.ln_refcount != 1) {
+		the_lnet.ln_refcount--;
+	} else {
+		LASSERT (!the_lnet.ln_niinit_self);
+
+		lnet_proc_fini();
+		lnet_router_checker_stop();
+		lnet_ping_target_fini();
+
+		/* Teardown fns that use my own API functions BEFORE here */
+		the_lnet.ln_refcount = 0;
+
+		lnet_acceptor_stop();
+		lnet_destroy_routes();
+		lnet_shutdown_lndnis();
+		lnet_unprepare();
+	}
+
+	LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex);
+	return 0;
+}
+EXPORT_SYMBOL(LNetNIFini);
+
+/**
+ * This is an ugly hack to export IOC_LIBCFS_DEBUG_PEER and
+ * IOC_LIBCFS_PORTALS_COMPATIBILITY commands to users, by tweaking the LNet
+ * internal ioctl handler.
+ *
+ * IOC_LIBCFS_PORTALS_COMPATIBILITY is now deprecated, don't use it.
+ *
+ * Commands that are not handled here are passed on to the lnd_ctl method of
+ * the LND serving the network given in the ioctl data.
+ *
+ * \param cmd IOC_LIBCFS_DEBUG_PEER to print debugging data about a peer.
+ * The data will be printed to system console. Don't use it excessively.
+ * \param arg A pointer to lnet_process_id_t, process ID of the peer. For all
+ * other commands it is used as a pointer to struct libcfs_ioctl_data.
+ *
+ * \return Always return 0 when called by users directly (i.e., not via ioctl).
+ */
+int
+LNetCtl(unsigned int cmd, void *arg)
+{
+	struct libcfs_ioctl_data *data = arg;
+	lnet_process_id_t	 id = {0};
+	lnet_ni_t		*ni;
+	int		       rc;
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	switch (cmd) {
+	case IOC_LIBCFS_GET_NI:
+		/* report the NID of the interface at index ioc_count */
+		rc = LNetGetId(data->ioc_count, &id);
+		data->ioc_nid = id.nid;
+		return rc;
+
+	case IOC_LIBCFS_FAIL_NID:
+		return lnet_fail_nid(data->ioc_nid, data->ioc_count);
+
+	case IOC_LIBCFS_ADD_ROUTE:
+		rc = lnet_add_route(data->ioc_net, data->ioc_count,
+				    data->ioc_nid);
+		/* a route that was added still has to pass the route check */
+		return (rc != 0) ? rc : lnet_check_routes();
+
+	case IOC_LIBCFS_DEL_ROUTE:
+		return lnet_del_route(data->ioc_net, data->ioc_nid);
+
+	case IOC_LIBCFS_GET_ROUTE:
+		return lnet_get_route(data->ioc_count,
+				      &data->ioc_net, &data->ioc_count,
+				      &data->ioc_nid, &data->ioc_flags);
+	case IOC_LIBCFS_NOTIFY_ROUTER:
+		/* ioc_u64[0] is a time in seconds; its distance from the
+		 * current time in seconds is subtracted from
+		 * cfs_time_current() to form the last argument */
+		return lnet_notify(NULL, data->ioc_nid, data->ioc_flags,
+				   cfs_time_current() -
+				   cfs_time_seconds(cfs_time_current_sec() -
+						    (time_t)data->ioc_u64[0]));
+
+	case IOC_LIBCFS_PORTALS_COMPATIBILITY:
+		/* This can be removed once lustre stops calling it */
+		return 0;
+
+	case IOC_LIBCFS_LNET_DIST:
+		rc = LNetDist(data->ioc_nid, &data->ioc_nid, &data->ioc_u32[1]);
+		if (rc < 0 && rc != -EHOSTUNREACH)
+			return rc;
+
+		/* -EHOSTUNREACH is handed back in ioc_u32[0] rather than
+		 * being returned as an error */
+		data->ioc_u32[0] = rc;
+		return 0;
+
+	case IOC_LIBCFS_TESTPROTOCOMPAT:
+		lnet_net_lock(LNET_LOCK_EX);
+		the_lnet.ln_testprotocompat = data->ioc_flags;
+		lnet_net_unlock(LNET_LOCK_EX);
+		return 0;
+
+	case IOC_LIBCFS_PING:
+		id.nid = data->ioc_nid;
+		id.pid = data->ioc_u32[0];
+		/* ioc_pbuf1/ioc_plen1 give the buffer that receives the IDs */
+		rc = lnet_ping(id, data->ioc_u32[1], /* timeout */
+			       (lnet_process_id_t *)data->ioc_pbuf1,
+			       data->ioc_plen1/sizeof(lnet_process_id_t));
+		if (rc < 0)
+			return rc;
+		data->ioc_count = rc;
+		return 0;
+
+	case IOC_LIBCFS_DEBUG_PEER: {
+		/* CAVEAT EMPTOR: this one designed for calling directly; not
+		 * via an ioctl */
+		id = *((lnet_process_id_t *) arg);
+
+		lnet_debug_peer(id.nid);
+
+		ni = lnet_net2ni(LNET_NIDNET(id.nid));
+		if (ni == NULL) {
+			CDEBUG(D_WARNING, "No NI for %s\n", libcfs_id2str(id));
+		} else {
+			if (ni->ni_lnd->lnd_ctl == NULL) {
+				CDEBUG(D_WARNING, "No ctl for %s\n",
+				       libcfs_id2str(id));
+			} else {
+				(void)ni->ni_lnd->lnd_ctl(ni, cmd, arg);
+			}
+
+			/* drop the reference taken by lnet_net2ni() */
+			lnet_ni_decref(ni);
+		}
+		return 0;
+	}
+
+	default:
+		/* anything else goes to the LND of network ioc_net */
+		ni = lnet_net2ni(data->ioc_net);
+		if (ni == NULL)
+			return -EINVAL;
+
+		if (ni->ni_lnd->lnd_ctl == NULL)
+			rc = -EINVAL;
+		else
+			rc = ni->ni_lnd->lnd_ctl(ni, cmd, arg);
+
+		/* drop the reference taken by lnet_net2ni() */
+		lnet_ni_decref(ni);
+		return rc;
+	}
+	/* not reached */
+}
+EXPORT_SYMBOL(LNetCtl);
+
+/**
+ * Retrieve the lnet_process_id_t ID of LNet interface at \a index. Note that
+ * all interfaces share a same PID, as requested by LNetNIInit().
+ *
+ * \param index Index of the interface to look up, counted from 0 in the
+ * order of the_lnet.ln_nis.
+ * \param id On successful return, this location will hold the
+ * lnet_process_id_t ID of the interface.
+ *
+ * \retval 0 If an interface exists at \a index.
+ * \retval -ENOENT If no interface has been found.
+ */
+int
+LNetGetId(unsigned int index, lnet_process_id_t *id)
+{
+	struct lnet_ni	*ni;
+	int		cpt;
+	int		rc = -ENOENT;
+
+	LASSERT(the_lnet.ln_init);
+	LASSERT(the_lnet.ln_refcount > 0);
+
+	cpt = lnet_net_lock_current();
+
+	list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+		if (index == 0) {
+			id->nid = ni->ni_nid;
+			id->pid = the_lnet.ln_pid;
+			rc = 0;
+			break;
+		}
+
+		/* not there yet: one interface fewer to skip */
+		index--;
+	}
+
+	lnet_net_unlock(cpt);
+	return rc;
+}
+EXPORT_SYMBOL(LNetGetId);
+
+/**
+ * Print a string representation of handle \a h into buffer \a str of
+ * \a len bytes.
+ *
+ * The representation is the handle's cookie printed with the LPX64 format.
+ */
+void
+LNetSnprintHandle(char *str, int len, lnet_handle_any_t h)
+{
+	snprintf(str, len, LPX64, h.cookie);
+}
+EXPORT_SYMBOL(LNetSnprintHandle);
+
+/*
+ * Build the ping info for this node and store it in the_lnet.ln_ping_info.
+ *
+ * The buffer holds one status entry per interface found through LNetGetId().
+ * Every entry is filled in with the interface's NID and LNET_NI_STATUS_UP,
+ * and the ni_status pointer of the interface is set to point at its entry.
+ *
+ * \return 0 on success, -ENOMEM if the buffer cannot be allocated.
+ */
+static int
+lnet_create_ping_info(void)
+{
+	int	       i;
+	int	       n;
+	int	       rc;
+	unsigned int      infosz;
+	lnet_ni_t	*ni;
+	lnet_process_id_t id;
+	lnet_ping_info_t *pinfo;
+
+	/* count the interfaces: n ends up as the first index without one */
+	for (n = 0; ; n++) {
+		rc = LNetGetId(n, &id);
+		if (rc == -ENOENT)
+			break;
+
+		LASSERT (rc == 0);
+	}
+
+	/* header plus n status entries */
+	infosz = offsetof(lnet_ping_info_t, pi_ni[n]);
+	LIBCFS_ALLOC(pinfo, infosz);
+	if (pinfo == NULL) {
+		CERROR("Can't allocate ping info[%d]\n", n);
+		return -ENOMEM;
+	}
+
+	pinfo->pi_nnis    = n;
+	pinfo->pi_pid     = the_lnet.ln_pid;
+	pinfo->pi_magic   = LNET_PROTO_PING_MAGIC;
+	pinfo->pi_features = LNET_PING_FEAT_NI_STATUS;
+
+	for (i = 0; i < n; i++) {
+		lnet_ni_status_t *ns = &pinfo->pi_ni[i];
+
+		rc = LNetGetId(i, &id);
+		LASSERT (rc == 0);
+
+		ns->ns_nid    = id.nid;
+		ns->ns_status = LNET_NI_STATUS_UP;
+
+		lnet_net_lock(0);
+
+		ni = lnet_nid2ni_locked(id.nid, 0);
+		LASSERT(ni != NULL);
+
+		/* hook the status entry up to its NI */
+		lnet_ni_lock(ni);
+		LASSERT(ni->ni_status == NULL);
+		ni->ni_status = ns;
+		lnet_ni_unlock(ni);
+
+		/* drop the reference taken by lnet_nid2ni_locked() */
+		lnet_ni_decref_locked(ni, 0);
+		lnet_net_unlock(0);
+	}
+
+	the_lnet.ln_ping_info = pinfo;
+	return 0;
+}
+
+/*
+ * Release the_lnet.ln_ping_info.
+ *
+ * The ni_status pointer of every NI on the_lnet.ln_nis is cleared first, so
+ * that none of them keeps pointing into the buffer that gets freed.
+ */
+static void
+lnet_destroy_ping_info(void)
+{
+	struct lnet_ni	*ni;
+
+	lnet_net_lock(0);
+
+	list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+		lnet_ni_lock(ni);
+		ni->ni_status = NULL;
+		lnet_ni_unlock(ni);
+	}
+
+	lnet_net_unlock(0);
+
+	/* the size is recomputed from pi_nnis, the same way it was computed
+	 * when the buffer was allocated */
+	LIBCFS_FREE(the_lnet.ln_ping_info,
+		    offsetof(lnet_ping_info_t,
+			     pi_ni[the_lnet.ln_ping_info->pi_nnis]));
+	the_lnet.ln_ping_info = NULL;
+	return;
+}
+
+/*
+ * Set this node up as a ping target.
+ *
+ * Creates the ping info, allocates an EQ, attaches an ME matching
+ * LNET_PROTO_PING_MATCHBITS from any NID/PID on LNET_RESERVED_PORTAL, and
+ * attaches to it an MD that exposes the ping info to GET operations.
+ *
+ * \return 0 on success, or the error of the step that failed; everything set
+ * up before that step is undone again.
+ */
+int
+lnet_ping_target_init(void)
+{
+	lnet_md_t	 md = {0};
+	lnet_handle_me_t  meh;
+	lnet_process_id_t id;
+	int	       rc;
+	int	       rc2;
+	int	       infosz;
+
+	rc = lnet_create_ping_info();
+	if (rc != 0)
+		return rc;
+
+	/* We can have a tiny EQ since we only need to see the unlink event on
+	 * teardown, which by definition is the last one! */
+	rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &the_lnet.ln_ping_target_eq);
+	if (rc != 0) {
+		CERROR("Can't allocate ping EQ: %d\n", rc);
+		goto failed_0;
+	}
+
+	/* accept pings from anybody */
+	memset(&id, 0, sizeof(lnet_process_id_t));
+	id.nid = LNET_NID_ANY;
+	id.pid = LNET_PID_ANY;
+
+	rc = LNetMEAttach(LNET_RESERVED_PORTAL, id,
+			  LNET_PROTO_PING_MATCHBITS, 0,
+			  LNET_UNLINK, LNET_INS_AFTER,
+			  &meh);
+	if (rc != 0) {
+		CERROR("Can't create ping ME: %d\n", rc);
+		goto failed_1;
+	}
+
+	/* initialize md content */
+	infosz = offsetof(lnet_ping_info_t,
+			  pi_ni[the_lnet.ln_ping_info->pi_nnis]);
+	md.start     = the_lnet.ln_ping_info;
+	md.length    = infosz;
+	md.threshold = LNET_MD_THRESH_INF;
+	md.max_size  = 0;
+	md.options   = LNET_MD_OP_GET | LNET_MD_TRUNCATE |
+		       LNET_MD_MANAGE_REMOTE;
+	md.user_ptr  = NULL;
+	md.eq_handle = the_lnet.ln_ping_target_eq;
+
+	rc = LNetMDAttach(meh, md,
+			  LNET_RETAIN,
+			  &the_lnet.ln_ping_target_md);
+	if (rc != 0) {
+		CERROR("Can't attach ping MD: %d\n", rc);
+		goto failed_2;
+	}
+
+	return 0;
+
+	/* unwind in reverse order of setup; rc keeps the original error */
+ failed_2:
+	rc2 = LNetMEUnlink(meh);
+	LASSERT (rc2 == 0);
+ failed_1:
+	rc2 = LNetEQFree(the_lnet.ln_ping_target_eq);
+	LASSERT (rc2 == 0);
+ failed_0:
+	lnet_destroy_ping_info();
+	return rc;
+}
+
+/*
+ * Tear down the ping target set up by lnet_ping_target_init().
+ *
+ * The ping MD is unlinked and the ping EQ is polled until an event with the
+ * unlinked flag arrives; only then are the EQ and the ping info freed.
+ * All signals are blocked for the duration of the call.
+ */
+void
+lnet_ping_target_fini(void)
+{
+	lnet_event_t    event;
+	int	     rc;
+	int	     which;
+	int	     timeout_ms = 1000;
+	sigset_t    blocked = cfs_block_allsigs();
+
+	LNetMDUnlink(the_lnet.ln_ping_target_md);
+	/* NB md could be busy; this just starts the unlink */
+
+	for (;;) {
+		rc = LNetEQPoll(&the_lnet.ln_ping_target_eq, 1,
+				timeout_ms, &event, &which);
+
+		/* I expect overflow... */
+		LASSERT (rc >= 0 || rc == -EOVERFLOW);
+
+		if (rc == 0) {
+			/* timed out: provide a diagnostic */
+			CWARN("Still waiting for ping MD to unlink\n");
+			/* wait twice as long before complaining again */
+			timeout_ms *= 2;
+			continue;
+		}
+
+		/* Got a valid event */
+		if (event.unlinked)
+			break;
+	}
+
+	rc = LNetEQFree(the_lnet.ln_ping_target_eq);
+	LASSERT (rc == 0);
+	lnet_destroy_ping_info();
+	cfs_restore_sigs(blocked);
+}
+
+/*
+ * Ping the peer \a id and report the NIDs it advertises.
+ *
+ * A GET for LNET_PROTO_PING_MATCHBITS is sent to LNET_RESERVED_PORTAL of the
+ * peer, and the reply is received into a temporary lnet_ping_info_t.  The
+ * reply is validated (length, magic, NI status feature) and up to \a n_ids
+ * process IDs are copied out to \a ids with copy_to_user().
+ *
+ * \param id	 peer to ping; LNET_PID_ANY is replaced by
+ *		   LUSTRE_SRV_LNET_PID, LNET_NID_ANY is not allowed
+ * \param timeout_ms how long to wait for the reply (at most 500000)
+ * \param ids	buffer receiving the IDs (written with copy_to_user())
+ * \param n_ids      number of entries in \a ids (1 to 20)
+ *
+ * \return the number of interfaces reported by the peer (pi_nnis) on
+ * success, and a negative error code on failure.
+ */
+int
+lnet_ping (lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_ids)
+{
+	lnet_handle_eq_t     eqh;
+	lnet_handle_md_t     mdh;
+	lnet_event_t	 event;
+	lnet_md_t	    md = {0};
+	int		  which;
+	int		  unlinked = 0;
+	int		  replied = 0;
+	const int	    a_long_time = 60000; /* mS */
+	int		  infosz = offsetof(lnet_ping_info_t, pi_ni[n_ids]);
+	lnet_ping_info_t    *info;
+	lnet_process_id_t    tmpid;
+	int		  i;
+	int		  nob;
+	int		  rc;
+	int		  rc2;
+	sigset_t	 blocked;
+
+	if (n_ids <= 0 ||
+	    id.nid == LNET_NID_ANY ||
+	    timeout_ms > 500000 ||	      /* arbitrary limit! */
+	    n_ids > 20)			 /* arbitrary limit! */
+		return -EINVAL;
+
+	if (id.pid == LNET_PID_ANY)
+		id.pid = LUSTRE_SRV_LNET_PID;
+
+	LIBCFS_ALLOC(info, infosz);
+	if (info == NULL)
+		return -ENOMEM;
+
+	/* NB 2 events max (including any unlink event) */
+	rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &eqh);
+	if (rc != 0) {
+		CERROR("Can't allocate EQ: %d\n", rc);
+		goto out_0;
+	}
+
+	/* initialize md content */
+	md.start     = info;
+	md.length    = infosz;
+	md.threshold = 2; /*GET/REPLY*/
+	md.max_size  = 0;
+	md.options   = LNET_MD_TRUNCATE;
+	md.user_ptr  = NULL;
+	md.eq_handle = eqh;
+
+	rc = LNetMDBind(md, LNET_UNLINK, &mdh);
+	if (rc != 0) {
+		CERROR("Can't bind MD: %d\n", rc);
+		goto out_1;
+	}
+
+	rc = LNetGet(LNET_NID_ANY, mdh, id,
+		     LNET_RESERVED_PORTAL,
+		     LNET_PROTO_PING_MATCHBITS, 0);
+
+	if (rc != 0) {
+		/* Don't CERROR; this could be deliberate! */
+
+		rc2 = LNetMDUnlink(mdh);
+		LASSERT (rc2 == 0);
+
+		/* NB must wait for the UNLINK event below... */
+		unlinked = 1;
+		timeout_ms = a_long_time;
+	}
+
+	/* collect events until the MD's unlink event has been seen; rc keeps
+	 * the first error, or the reply length once a reply has arrived */
+	do {
+		/* MUST block for unlink to complete */
+		if (unlinked)
+			blocked = cfs_block_allsigs();
+
+		rc2 = LNetEQPoll(&eqh, 1, timeout_ms, &event, &which);
+
+		if (unlinked)
+			cfs_restore_sigs(blocked);
+
+		CDEBUG(D_NET, "poll %d(%d %d)%s\n", rc2,
+		       (rc2 <= 0) ? -1 : event.type,
+		       (rc2 <= 0) ? -1 : event.status,
+		       (rc2 > 0 && event.unlinked) ? " unlinked" : "");
+
+		LASSERT (rc2 != -EOVERFLOW);     /* can't miss anything */
+
+		if (rc2 <= 0 || event.status != 0) {
+			/* timeout or error */
+			if (!replied && rc == 0)
+				rc = (rc2 < 0) ? rc2 :
+				     (rc2 == 0) ? -ETIMEDOUT :
+				     event.status;
+
+			if (!unlinked) {
+				/* Ensure completion in finite time... */
+				LNetMDUnlink(mdh);
+				/* No assertion (racing with network) */
+				unlinked = 1;
+				timeout_ms = a_long_time;
+			} else if (rc2 == 0) {
+				/* timed out waiting for unlink */
+				CWARN("ping %s: late network completion\n",
+				      libcfs_id2str(id));
+			}
+		} else if (event.type == LNET_EVENT_REPLY) {
+			replied = 1;
+			rc = event.mlength;
+		}
+
+	} while (rc2 <= 0 || !event.unlinked);
+
+	if (!replied) {
+		if (rc >= 0)
+			CWARN("%s: Unexpected rc >= 0 but no reply!\n",
+			      libcfs_id2str(id));
+		rc = -EIO;
+		goto out_1;
+	}
+
+	/* rc is the length of the reply at this point */
+	nob = rc;
+	LASSERT (nob >= 0 && nob <= infosz);
+
+	rc = -EPROTO;			   /* if I can't parse... */
+
+	if (nob < 8) {
+		/* can't check magic/version */
+		CERROR("%s: ping info too short %d\n",
+		       libcfs_id2str(id), nob);
+		goto out_1;
+	}
+
+	/* a byte-swapped magic means the reply has to be swabbed first */
+	if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) {
+		lnet_swap_pinginfo(info);
+	} else if (info->pi_magic != LNET_PROTO_PING_MAGIC) {
+		CERROR("%s: Unexpected magic %08x\n",
+		       libcfs_id2str(id), info->pi_magic);
+		goto out_1;
+	}
+
+	if ((info->pi_features & LNET_PING_FEAT_NI_STATUS) == 0) {
+		CERROR("%s: ping w/o NI status: 0x%x\n",
+		       libcfs_id2str(id), info->pi_features);
+		goto out_1;
+	}
+
+	if (nob < offsetof(lnet_ping_info_t, pi_ni[0])) {
+		CERROR("%s: Short reply %d(%d min)\n", libcfs_id2str(id),
+		       nob, (int)offsetof(lnet_ping_info_t, pi_ni[0]));
+		goto out_1;
+	}
+
+	/* never copy out more entries than the peer reported */
+	if (info->pi_nnis < n_ids)
+		n_ids = info->pi_nnis;
+
+	if (nob < offsetof(lnet_ping_info_t, pi_ni[n_ids])) {
+		CERROR("%s: Short reply %d(%d expected)\n", libcfs_id2str(id),
+		       nob, (int)offsetof(lnet_ping_info_t, pi_ni[n_ids]));
+		goto out_1;
+	}
+
+	rc = -EFAULT;			   /* If I SEGV... */
+
+	for (i = 0; i < n_ids; i++) {
+		tmpid.pid = info->pi_pid;
+		tmpid.nid = info->pi_ni[i].ns_nid;
+		if (copy_to_user(&ids[i], &tmpid, sizeof(tmpid)))
+			goto out_1;
+	}
+	rc = info->pi_nnis;
+
+ out_1:
+	rc2 = LNetEQFree(eqh);
+	if (rc2 != 0)
+		CERROR("rc2 %d\n", rc2);
+	LASSERT (rc2 == 0);
+
+ out_0:
+	LIBCFS_FREE(info, infosz);
+	return rc;
+}
diff --git a/drivers/staging/lustre/lnet/lnet/config.c b/drivers/staging/lustre/lnet/lnet/config.c
new file mode 100644
index 000000000000..28711e6e8b03
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/config.c
@@ -0,0 +1,1264 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+typedef struct {			/* tmp text buf: routes/ip2nets parsing */
+	struct list_head	 ltb_list;	/* stash on lists */
+	int		ltb_size;	/* allocated size (bytes, incl. header) */
+	char	       ltb_text[0];     /* NUL-terminated text */
+} lnet_text_buf_t;
+
+static int lnet_tbnob = 0;			/* bytes held in text bufs */
+#define LNET_MAX_TEXTBUF_NOB     (64<<10)	/* cap on lnet_tbnob */
+#define LNET_SINGLE_TEXTBUF_NOB  (4<<10)	/* cap on one text buf */
+
+void
+lnet_syntax(char *name, char *str, int offset, int width)
+{
+	/* filler for the marker line that is printed under the quoted text */
+	static char dashes[LNET_SINGLE_TEXTBUF_NOB];
+	static char dots[LNET_SINGLE_TEXTBUF_NOB];
+	int	    ndashes = (width < 1) ? 0 : width - 1;
+
+	memset(dashes, '-', sizeof(dashes) - 1);
+	dashes[sizeof(dashes) - 1] = 0;
+	memset(dots, '.', sizeof(dots) - 1);
+	dots[sizeof(dots) - 1] = 0;
+	LCONSOLE_ERROR_MSG(0x10f, "Error parsing '%s=\"%s\"'\n", name, str);
+	LCONSOLE_ERROR_MSG(0x110, "here...........%.*s..%.*s|%.*s|\n",
+			   (int)strlen(name), dots, offset, dots, ndashes, dashes);
+}
+
+int
+lnet_issep (char c)
+{
+	/*
+	 * Entry separators are end-of-line characters and ';'.
+	 * Returns 1 for a separator, 0 for anything else.
+	 */
+	if (c == '\n' || c == '\r' || c == ';')
+		return 1;
+
+	return 0;
+}
+
+int
+lnet_net_unique(__u32 net, struct list_head *nilist)
+{
+	lnet_ni_t	*ni;
+
+	/*
+	 * 'net' is unique (1) unless some NI already queued on 'nilist'
+	 * has a NID on that network, which is reported as a clash (0).
+	 */
+	list_for_each_entry(ni, nilist, ni_list)
+		if (LNET_NIDNET(ni->ni_nid) == net)
+			return 0;
+
+	return 1;
+}
+
+void
+lnet_ni_free(struct lnet_ni *ni)
+{
+	/* drop each optional part that was allocated, then the NI itself */
+	if (ni->ni_cpts != NULL)
+		cfs_expr_list_values_free(ni->ni_cpts, ni->ni_ncpts);
+
+	if (ni->ni_tx_queues != NULL)
+		cfs_percpt_free(ni->ni_tx_queues);
+
+	if (ni->ni_refs != NULL)
+		cfs_percpt_free(ni->ni_refs);
+	LIBCFS_FREE(ni, sizeof(*ni));
+}
+
+lnet_ni_t *
+lnet_ni_alloc(__u32 net, struct cfs_expr_list *el, struct list_head *nilist)
+{
+	struct lnet_tx_queue	*tq;
+	struct lnet_ni		*ni;
+	int			rc;
+	int			i;
+	/* add a new NI for 'net' to 'nilist'; NULL if duplicate or on error */
+	if (!lnet_net_unique(net, nilist)) {
+		LCONSOLE_ERROR_MSG(0x111, "Duplicate network specified: %s\n",
+				   libcfs_net2str(net));
+		return NULL;
+	}
+
+	LIBCFS_ALLOC(ni, sizeof(*ni));
+	if (ni == NULL) {
+		CERROR("Out of memory creating network %s\n",
+		       libcfs_net2str(net));
+		return NULL;
+	}
+
+	spin_lock_init(&ni->ni_lock);
+	INIT_LIST_HEAD(&ni->ni_cptlist);
+	ni->ni_refs = cfs_percpt_alloc(lnet_cpt_table(),
+				       sizeof(*ni->ni_refs[0]));
+	if (ni->ni_refs == NULL)
+		goto failed;
+
+	ni->ni_tx_queues = cfs_percpt_alloc(lnet_cpt_table(),
+					    sizeof(*ni->ni_tx_queues[0]));
+	if (ni->ni_tx_queues == NULL)
+		goto failed;
+
+	cfs_percpt_for_each(tq, i, ni->ni_tx_queues)
+		INIT_LIST_HEAD(&tq->tq_delayed);
+	/* no list (or one naming every CPT) is stored as ni_cpts == NULL */
+	if (el == NULL) {
+		ni->ni_cpts  = NULL;
+		ni->ni_ncpts = LNET_CPT_NUMBER;
+	} else {
+		rc = cfs_expr_list_values(el, LNET_CPT_NUMBER, &ni->ni_cpts);
+		if (rc <= 0) {
+			CERROR("Failed to set CPTs for NI %s: %d\n",
+			       libcfs_net2str(net), rc);
+			goto failed;
+		}
+
+		LASSERT(rc <= LNET_CPT_NUMBER);
+		if (rc == LNET_CPT_NUMBER) {
+			LIBCFS_FREE(ni->ni_cpts, rc * sizeof(ni->ni_cpts[0]));
+			ni->ni_cpts = NULL;
+		}
+
+		ni->ni_ncpts = rc;
+	}
+
+	/* LND will fill in the address part of the NID */
+	ni->ni_nid = LNET_MKNID(net, 0);
+	ni->ni_last_alive = cfs_time_current_sec();
+	list_add_tail(&ni->ni_list, nilist);
+	return ni;
+ failed:
+	lnet_ni_free(ni);
+	return NULL;
+}
+
+int
+lnet_parse_networks(struct list_head *nilist, char *networks)
+{
+	struct cfs_expr_list *el = NULL;
+	int		tokensize = strlen(networks) + 1;
+	char		*tokens;
+	char		*str;
+	char		*tmp;
+	struct lnet_ni	*ni;
+	__u32		net;
+
+	/* turn the "networks" string into NIs on 'nilist' (always adds lo) */
+	if (strlen(networks) > LNET_SINGLE_TEXTBUF_NOB) {
+		/* _WAY_ conservative */
+		LCONSOLE_ERROR_MSG(0x112, "Can't parse networks: string too "
+				   "long\n");
+		return -EINVAL;
+	}
+	/* returns 0, -ENOMEM (no token buffer) or -EINVAL (anything else) */
+	LIBCFS_ALLOC(tokens, tokensize);
+	if (tokens == NULL) {
+		CERROR("Can't allocate net tokens\n");
+		return -ENOMEM;
+	}
+	/* ni_interfaces[] will point into 'tokens', so the_lnet keeps it */
+	the_lnet.ln_network_tokens = tokens;
+	the_lnet.ln_network_tokens_nob = tokensize;
+	memcpy (tokens, networks, tokensize);
+	str = tmp = tokens;
+
+	/* Add in the loopback network */
+	ni = lnet_ni_alloc(LNET_MKNET(LOLND, 0), NULL, nilist);
+	if (ni == NULL)
+		goto failed;
+
+	while (str != NULL && *str != 0) {
+		char	*comma = strchr(str, ',');
+		char	*bracket = strchr(str, '(');
+		char	*square = strchr(str, '[');
+		char	*iface;
+		int	niface;
+		int	rc;
+
+		/* NB we don't check interface conflicts here; it's the LNDs
+		 * responsibility (if it cares at all) */
+		/* NOTE(review): "net(a,b)[n]" never gets here (',' < '[') */
+		if (square != NULL && (comma == NULL || square < comma)) {
+			/* e.g. o2ib0(ib0)[1,2]: the numbers in square brackets
+			 * are the CPTs this NI is bound to */
+			if (bracket != NULL && bracket > square) {
+				tmp = square;
+				goto failed_syntax;
+			}
+
+			tmp = strchr(square, ']');
+			if (tmp == NULL) {
+				tmp = square;
+				goto failed_syntax;
+			}
+
+			rc = cfs_expr_list_parse(square, tmp - square + 1,
+						 0, LNET_CPT_NUMBER - 1, &el);
+			if (rc != 0) {
+				tmp = square;
+				goto failed_syntax;
+			}
+
+			while (square <= tmp)
+				*square++ = ' ';
+		}
+
+		if (bracket == NULL ||
+		    (comma != NULL && comma < bracket)) {
+
+			/* no interface list specified */
+
+			if (comma != NULL)
+				*comma++ = 0;
+			net = libcfs_str2net(cfs_trimwhite(str));
+
+			if (net == LNET_NIDNET(LNET_NID_ANY)) {
+				LCONSOLE_ERROR_MSG(0x113, "Unrecognised network"
+						   " type\n");
+				tmp = str;
+				goto failed_syntax;
+			}
+
+			if (LNET_NETTYP(net) != LOLND && /* LO is implicit */
+			    lnet_ni_alloc(net, el, nilist) == NULL)
+				goto failed;
+
+			if (el != NULL) {
+				cfs_expr_list_free(el);
+				el = NULL;
+			}
+
+			str = comma;
+			continue;
+		}
+
+		*bracket = 0;
+		net = libcfs_str2net(cfs_trimwhite(str));
+		if (net == LNET_NIDNET(LNET_NID_ANY)) {
+			tmp = str;
+			goto failed_syntax;
+		}
+
+		/* net with an explicit interface list: "net(if1,if2,...)" */
+		ni = lnet_ni_alloc(net, el, nilist);
+		if (ni == NULL)
+			goto failed;
+
+		if (el != NULL) {
+			cfs_expr_list_free(el);
+			el = NULL;
+		}
+
+		niface = 0;
+		iface = bracket + 1;
+
+		bracket = strchr(iface, ')');
+		if (bracket == NULL) {
+			tmp = iface;
+			goto failed_syntax;
+		}
+
+		*bracket = 0;
+		do {
+			comma = strchr(iface, ',');
+			if (comma != NULL)
+				*comma++ = 0;
+
+			iface = cfs_trimwhite(iface);
+			if (*iface == 0) {
+				tmp = iface;
+				goto failed_syntax;
+			}
+
+			if (niface == LNET_MAX_INTERFACES) {
+				LCONSOLE_ERROR_MSG(0x115, "Too many interfaces "
+						   "for net %s\n",
+						   libcfs_net2str(net));
+				goto failed;
+			}
+
+			ni->ni_interfaces[niface++] = iface;
+			iface = comma;
+		} while (iface != NULL);
+
+		str = bracket + 1;
+		comma = strchr(bracket + 1, ',');
+		if (comma != NULL) {
+			*comma = 0;
+			str = cfs_trimwhite(str);
+			if (*str != 0) {
+				tmp = str;
+				goto failed_syntax;
+			}
+			str = comma + 1;
+			continue;
+		}
+
+		str = cfs_trimwhite(str);
+		if (*str != 0) {
+			tmp = str;
+			goto failed_syntax;
+		}
+	}
+
+	LASSERT(!list_empty(nilist));
+	return 0;
+
+ failed_syntax:
+	lnet_syntax("networks", networks, (int)(tmp - tokens), strlen(tmp));
+ failed:
+	while (!list_empty(nilist)) {
+		ni = list_entry(nilist->next, lnet_ni_t, ni_list);
+
+		list_del(&ni->ni_list);
+		lnet_ni_free(ni);
+	}
+
+	if (el != NULL)
+		cfs_expr_list_free(el);
+
+	LIBCFS_FREE(tokens, tokensize);
+	the_lnet.ln_network_tokens = NULL;
+	the_lnet.ln_network_tokens_nob = 0;
+	return -EINVAL;
+}
+
+lnet_text_buf_t *
+lnet_new_text_buf (int str_len)
+{
+	/* header plus 'str_len' characters plus the terminating 0 */
+	int size = offsetof(lnet_text_buf_t, ltb_text[str_len + 1]);
+	lnet_text_buf_t *buf;
+
+	/* a single buffer is bounded (generous for "route net gateway...") */
+	if (size > LNET_SINGLE_TEXTBUF_NOB) {
+		CERROR("text buffer too big\n");
+		return NULL;
+	}
+
+	/* ...and so is the total held by all live buffers */
+	if (lnet_tbnob + size > LNET_MAX_TEXTBUF_NOB) {
+		CERROR("Too many text buffers\n");
+		return NULL;
+	}
+
+	LIBCFS_ALLOC(buf, size);
+	if (buf != NULL) {
+		lnet_tbnob += size;	/* charge it to the global budget */
+		buf->ltb_text[0] = 0;
+		buf->ltb_size = size;
+	}
+
+	return buf;
+}
+
+void
+lnet_free_text_buf (lnet_text_buf_t *ltb)
+{
+	lnet_tbnob -= ltb->ltb_size;	/* give the bytes back to the budget */
+	LIBCFS_FREE(ltb, ltb->ltb_size);	/* size recorded at allocation */
+}
+
+void
+lnet_free_text_bufs(struct list_head *tbs)
+{
+	lnet_text_buf_t  *ltb;
+	lnet_text_buf_t  *next;
+
+	/* unlink and free every text buffer, leaving 'tbs' empty */
+	list_for_each_entry_safe(ltb, next, tbs, ltb_list) {
+		list_del(&ltb->ltb_list);
+		lnet_free_text_buf(ltb);
+	}
+}
+
+void
+lnet_print_text_bufs(struct list_head *tbs)
+{
+	lnet_text_buf_t   *ltb;
+
+	/* Debugging aid: log the text of every buffer queued on 'tbs',
+	 * then the number of bytes currently charged to text buffers
+	 * (lnet_tbnob) */
+	list_for_each_entry(ltb, tbs, ltb_list) {
+		CDEBUG(D_WARNING, "%s\n", ltb->ltb_text);
+	}
+
+	CDEBUG(D_WARNING, "%d allocated\n", lnet_tbnob);
+}
+
+int
+lnet_str2tbs_sep (struct list_head *tbs, char *str)
+{
+	struct list_head	pending;
+	char	     *sep;
+	int	       nob;
+	int	       i;
+	lnet_text_buf_t  *ltb;
+	/* split 'str' into entries appended to 'tbs'; 0 if OK, -1 on failure */
+	INIT_LIST_HEAD(&pending);
+
+	/* Split 'str' into separate commands */
+	for (;;) {
+		/* skip leading whitespace */
+		while (cfs_iswhite(*str))
+			str++;
+
+		/* scan for separator or comment */
+		for (sep = str; *sep != 0; sep++)
+			if (lnet_issep(*sep) || *sep == '#')
+				break;
+
+		nob = (int)(sep - str);
+		if (nob > 0) {
+			ltb = lnet_new_text_buf(nob);
+			if (ltb == NULL) {
+				lnet_free_text_bufs(&pending);
+				return -1;
+			}
+			/* copy the entry, turning each whitespace char into ' ' */
+			for (i = 0; i < nob; i++)
+				if (cfs_iswhite(str[i]))
+					ltb->ltb_text[i] = ' ';
+				else
+					ltb->ltb_text[i] = str[i];
+
+			ltb->ltb_text[nob] = 0;
+
+			list_add_tail(&ltb->ltb_list, &pending);
+		}
+
+		if (*sep == '#') {
+			/* scan for separator */
+			do {
+				sep++;
+			} while (*sep != 0 && !lnet_issep(*sep));
+		}
+
+		if (*sep == 0)
+			break;
+
+		str = sep + 1;
+	}
+	/* nothing reaches 'tbs' unless every entry could be built */
+	list_splice(&pending, tbs->prev);
+	return 0;
+}
+
+int
+lnet_expand1tb (struct list_head *list,
+	       char *str, char *sep1, char *sep2,
+	       char *item, int itemlen)
+{
+	char		*tail = sep2 + 1;		/* text after ']' */
+	int		 headlen = (int)(sep1 - str);	/* text before '[' */
+	int		 taillen = strlen(tail);
+	lnet_text_buf_t	*ltb;
+
+	LASSERT (*sep1 == '[');
+	LASSERT (*sep2 == ']');
+	/* new entry = head + item + tail, i.e. "[...]" replaced by 'item' */
+	ltb = lnet_new_text_buf(headlen + itemlen + taillen);
+	if (ltb == NULL)
+		return -ENOMEM;
+
+	memcpy(ltb->ltb_text, str, headlen);
+	memcpy(ltb->ltb_text + headlen, item, itemlen);
+	memcpy(ltb->ltb_text + headlen + itemlen, tail, taillen + 1); /* + NUL */
+
+	list_add_tail(&ltb->ltb_list, list);
+	return 0;
+}
+
+int
+lnet_str2tbs_expand (struct list_head *tbs, char *str)
+{
+	char	      num[16];
+	struct list_head	pending;
+	char	     *sep;
+	char	     *sep2;
+	char	     *parsed;
+	char	     *enditem;
+	int	       lo;
+	int	       hi;
+	int	       stride;
+	int	       i;
+	int	       nob;
+	int	       scanned;
+	/* expand the first "[a,b,lo-hi,lo-hi/stride]" list found in 'str' */
+	INIT_LIST_HEAD(&pending);
+	/* returns 1 if expanded, 0 if 'str' has no '[', -1 on any error */
+	sep = strchr(str, '[');
+	if (sep == NULL)			/* nothing to expand */
+		return 0;
+
+	sep2 = strchr(sep, ']');
+	if (sep2 == NULL)
+		goto failed;
+
+	for (parsed = sep; parsed < sep2; parsed = enditem) {
+		/* 'parsed' is on '[' or ','; step past it to the next item */
+		enditem = ++parsed;
+		while (enditem < sep2 && *enditem != ',')
+			enditem++;
+
+		if (enditem == parsed)		/* no empty items */
+			goto failed;
+
+		if (sscanf(parsed, "%d-%d/%d%n", &lo, &hi, &stride, &scanned) < 3) {
+
+			if (sscanf(parsed, "%d-%d%n", &lo, &hi, &scanned) < 2) {
+
+				/* simple string enumeration */
+				if (lnet_expand1tb(&pending, str, sep, sep2,
+						   parsed, (int)(enditem - parsed)) != 0)
+					goto failed;
+
+				continue;
+			}
+
+			stride = 1;
+		}
+
+		/* range expansion */
+
+		if (enditem != parsed + scanned) /* no trailing junk */
+			goto failed;
+		/* stride 0 must be refused before it is used as a divisor */
+		if (hi < 0 || lo < 0 || stride <= 0 || hi < lo ||
+		    (hi - lo) % stride != 0)
+			goto failed;
+
+		for (i = lo; i <= hi; i += stride) {
+
+			snprintf(num, sizeof(num), "%d", i);
+			nob = strlen(num);
+			if (nob + 1 == sizeof(num))
+				goto failed;
+
+			if (lnet_expand1tb(&pending, str, sep, sep2,
+					   num, nob) != 0)
+				goto failed;
+		}
+	}
+
+	list_splice(&pending, tbs->prev);
+	return 1;
+
+ failed:
+	lnet_free_text_bufs(&pending);
+	return -1;
+}
+
+int
+lnet_parse_hops (char *str, unsigned int *hops)
+{
+	int	consumed = strlen(str);
+	int	total = consumed;
+	/* the whole token must be one unsigned number in the range 1..255 */
+	if (sscanf(str, "%u%n", hops, &consumed) < 1 || consumed != total)
+		return 0;
+	return *hops > 0 && *hops < 256;
+}
+
+
+int
+lnet_parse_route (char *str, int *im_a_router)
+{
+	/* static scratch buffer OK (single threaded) */
+	static char       cmd[LNET_SINGLE_TEXTBUF_NOB];
+
+	struct list_head	nets;
+	struct list_head	gateways;
+	struct list_head       *tmp1;
+	struct list_head       *tmp2;
+	__u32	     net;
+	lnet_nid_t	nid;
+	lnet_text_buf_t  *ltb;
+	int	       rc;
+	char	     *sep;
+	char	     *token = str;
+	int	       ntokens = 0;
+	int	       myrc = -1;
+	unsigned int      hops;
+	int	       got_hops = 0;
+	/* parse one entry "<nets> [<hops>] <gateways>..." and add its routes */
+	INIT_LIST_HEAD(&gateways);
+	INIT_LIST_HEAD(&nets);
+	/* returns 0 or -1; sets *im_a_router if a gateway is a local NID */
+	/* save a copy of the string for error messages */
+	strncpy(cmd, str, sizeof(cmd) - 1);
+	cmd[sizeof(cmd) - 1] = 0;
+	/* NB 'str' itself is chopped into 0-terminated tokens in place */
+	sep = str;
+	for (;;) {
+		/* scan for token start */
+		while (cfs_iswhite(*sep))
+			sep++;
+		if (*sep == 0) {
+			if (ntokens < (got_hops ? 3 : 2))
+				goto token_error;
+			break;
+		}
+
+		ntokens++;
+		token = sep++;
+
+		/* scan for token end */
+		while (*sep != 0 && !cfs_iswhite(*sep))
+			sep++;
+		if (*sep != 0)
+			*sep++ = 0;
+
+		if (ntokens == 1) {
+			tmp2 = &nets;		/* expanding nets */
+		} else if (ntokens == 2 &&
+			   lnet_parse_hops(token, &hops)) {
+			got_hops = 1;	   /* got a hop count */
+			continue;
+		} else {
+			tmp2 = &gateways;	/* expanding gateways */
+		}
+
+		ltb = lnet_new_text_buf(strlen(token));
+		if (ltb == NULL)
+			goto out;
+
+		strcpy(ltb->ltb_text, token);
+		tmp1 = &ltb->ltb_list;
+		list_add_tail(tmp1, tmp2);
+		/* expand "[...]" lists until only plain, checked entries remain */
+		while (tmp1 != tmp2) {
+			ltb = list_entry(tmp1, lnet_text_buf_t, ltb_list);
+
+			rc = lnet_str2tbs_expand(tmp1->next, ltb->ltb_text);
+			if (rc < 0)
+				goto token_error;
+
+			tmp1 = tmp1->next;
+
+			if (rc > 0) {		/* expanded! */
+				list_del(&ltb->ltb_list);
+				lnet_free_text_buf(ltb);
+				continue;
+			}
+
+			if (ntokens == 1) {
+				net = libcfs_str2net(ltb->ltb_text);
+				if (net == LNET_NIDNET(LNET_NID_ANY) ||
+				    LNET_NETTYP(net) == LOLND)
+					goto token_error;
+			} else {
+				nid = libcfs_str2nid(ltb->ltb_text);
+				if (nid == LNET_NID_ANY ||
+				    LNET_NETTYP(LNET_NIDNET(nid)) == LOLND)
+					goto token_error;
+			}
+		}
+	}
+
+	if (!got_hops)
+		hops = 1;
+
+	LASSERT (!list_empty(&nets));
+	LASSERT (!list_empty(&gateways));
+	/* add a route to every net via every gateway that is not local */
+	list_for_each (tmp1, &nets) {
+		ltb = list_entry(tmp1, lnet_text_buf_t, ltb_list);
+		net = libcfs_str2net(ltb->ltb_text);
+		LASSERT (net != LNET_NIDNET(LNET_NID_ANY));
+
+		list_for_each (tmp2, &gateways) {
+			ltb = list_entry(tmp2, lnet_text_buf_t, ltb_list);
+			nid = libcfs_str2nid(ltb->ltb_text);
+			LASSERT (nid != LNET_NID_ANY);
+
+			if (lnet_islocalnid(nid)) {
+				*im_a_router = 1;
+				continue;
+			}
+
+			rc = lnet_add_route (net, hops, nid);
+			if (rc != 0) {
+				CERROR("Can't create route "
+				       "to %s via %s\n",
+				       libcfs_net2str(net),
+				       libcfs_nid2str(nid));
+				goto out;
+			}
+		}
+	}
+
+	myrc = 0;
+	goto out;
+
+ token_error:
+	lnet_syntax("routes", cmd, (int)(token - str), strlen(token));
+ out:
+	lnet_free_text_bufs(&nets);
+	lnet_free_text_bufs(&gateways);
+	return myrc;
+}
+
+int
+lnet_parse_route_tbs(struct list_head *tbs, int *im_a_router)
+{
+	lnet_text_buf_t   *ltb;
+	int		   rc = 0;
+	/* parse and release the queued route entries one by one; the first
+	 * bad entry ends the walk and whatever is still queued is dropped */
+	while (rc == 0 && !list_empty(tbs)) {
+		ltb = list_entry(tbs->next, lnet_text_buf_t, ltb_list);
+		if (lnet_parse_route(ltb->ltb_text, im_a_router) < 0) {
+			lnet_free_text_bufs(tbs);
+			rc = -EINVAL;
+		} else {
+			list_del(&ltb->ltb_list);
+			lnet_free_text_buf(ltb);
+		}
+	}
+	return rc;
+}
+
+int
+lnet_parse_routes (char *routes, int *im_a_router)
+{
+	LIST_HEAD	(entries);
+	int		rc;
+
+	*im_a_router = 0;
+	/* split the setting into separate route entries, then parse each */
+	rc = lnet_str2tbs_sep(&entries, routes);
+	if (rc >= 0) {
+		rc = lnet_parse_route_tbs(&entries, im_a_router);
+	} else {
+		CERROR("Error parsing routes\n");
+		rc = -EINVAL;
+	}
+
+	/* every temporary text buffer must have been released by now */
+	LASSERT (lnet_tbnob == 0);
+	return rc;
+}
+
+int
+lnet_match_network_token(char *token, int len, __u32 *ipaddrs, int nip)
+{
+	LIST_HEAD	(exprs);
+	int		hit = 0;
+	int		idx;
+	int		rc;
+
+	rc = cfs_ip_addr_parse(token, len, &exprs);
+	if (rc != 0)
+		return rc;
+	/* stop at the first address cfs_ip_addr_match() reports non-zero for */
+	for (idx = 0; idx < nip && hit == 0; idx++)
+		hit = cfs_ip_addr_match(ipaddrs[idx], &exprs);
+
+	cfs_ip_addr_free(&exprs);
+	return hit;
+}
+
+int
+lnet_match_network_tokens(char *net_entry, __u32 *ipaddrs, int nip)
+{
+	static char tokens[LNET_SINGLE_TEXTBUF_NOB];
+
+	int   matched = 0;
+	int   ntokens = 0;
+	int   len;
+	char *net = NULL;
+	char *sep;
+	char *token;
+	int   rc;
+	/* "<net> <pattern>...": reduce to "<net>" if a pattern matches */
+	LASSERT (strlen(net_entry) < sizeof(tokens));
+	/* returns 1 on a match, 0 on none, or the negative rc of a bad token */
+	/* work on a copy of the string */
+	strcpy(tokens, net_entry);
+	sep = tokens;
+	for (;;) {
+		/* scan for token start */
+		while (cfs_iswhite(*sep))
+			sep++;
+		if (*sep == 0)
+			break;
+
+		token = sep++;
+
+		/* scan for token end */
+		while (*sep != 0 && !cfs_iswhite(*sep))
+			sep++;
+		if (*sep != 0)
+			*sep++ = 0;
+		/* the first token is the net spec; the rest are IP patterns */
+		if (ntokens++ == 0) {
+			net = token;
+			continue;
+		}
+
+		len = strlen(token);
+
+		rc = lnet_match_network_token(token, len, ipaddrs, nip);
+		if (rc < 0) {
+			lnet_syntax("ip2nets", net_entry,
+				    (int)(token - tokens), len);
+			return rc;
+		}
+
+		matched |= (rc != 0);
+	}
+
+	if (!matched)
+		return 0;
+
+	strcpy(net_entry, net);		 /* replace with matched net */
+	return 1;
+}
+
+__u32
+lnet_netspec2net(char *netspec)
+{
+	char   *paren = strchr(netspec, '(');
+	__u32   net;
+
+	/* plain "net": nothing to hide from libcfs_str2net() */
+	if (paren == NULL)
+		return libcfs_str2net(netspec);
+
+	/* "net(ifaces)": cut the string at '(' just for the conversion */
+	*paren = 0;
+	net = libcfs_str2net(netspec);
+	*paren = '(';
+	return net;
+}
+
+int
+lnet_splitnets(char *source, struct list_head *nets)
+{
+	int	       offset = 0;
+	int	       offset2;
+	int	       len;
+	lnet_text_buf_t  *tb;
+	lnet_text_buf_t  *tb2;
+	struct list_head       *t;
+	char	     *sep;
+	char	     *bracket;
+	__u32	     net;
+	/* split the lone "net[(ifs)],..." entry on 'nets' into one per net */
+	LASSERT (!list_empty(nets));
+	LASSERT (nets->next == nets->prev);     /* single entry */
+	/* returns 0, -EINVAL (bad or duplicate net) or -ENOMEM */
+	tb = list_entry(nets->next, lnet_text_buf_t, ltb_list);
+
+	for (;;) {
+		sep = strchr(tb->ltb_text, ',');
+		bracket = strchr(tb->ltb_text, '(');
+		/* a ',' inside "(...)" belongs to the interface list */
+		if (sep != NULL &&
+		    bracket != NULL &&
+		    bracket < sep) {
+			/* netspec lists interfaces... */
+
+			offset2 = offset + (int)(bracket - tb->ltb_text);
+			len = strlen(bracket);
+
+			bracket = strchr(bracket + 1, ')');
+
+			if (bracket == NULL ||
+			    !(bracket[1] == ',' || bracket[1] == 0)) {
+				lnet_syntax("ip2nets", source, offset2, len);
+				return -EINVAL;
+			}
+
+			sep = (bracket[1] == 0) ? NULL : bracket + 1;
+		}
+
+		if (sep != NULL)
+			*sep++ = 0;
+
+		net = lnet_netspec2net(tb->ltb_text);
+		if (net == LNET_NIDNET(LNET_NID_ANY)) {
+			lnet_syntax("ip2nets", source, offset,
+				    strlen(tb->ltb_text));
+			return -EINVAL;
+		}
+
+		list_for_each(t, nets) {
+			tb2 = list_entry(t, lnet_text_buf_t, ltb_list);
+
+			if (tb2 == tb)
+				continue;
+
+			if (net == lnet_netspec2net(tb2->ltb_text)) {
+				/* duplicate network */
+				lnet_syntax("ip2nets", source, offset,
+					    strlen(tb->ltb_text));
+				return -EINVAL;
+			}
+		}
+
+		if (sep == NULL)
+			return 0;
+		/* the remainder moves to a new entry and is examined next */
+		offset += (int)(sep - tb->ltb_text);
+		tb2 = lnet_new_text_buf(strlen(sep));
+		if (tb2 == NULL)
+			return -ENOMEM;
+
+		strcpy(tb2->ltb_text, sep);
+		list_add_tail(&tb2->ltb_list, nets);
+
+		tb = tb2;
+	}
+}
+
+int
+lnet_match_networks (char **networksp, char *ip2nets, __u32 *ipaddrs, int nip)
+{
+	static char	networks[LNET_SINGLE_TEXTBUF_NOB];
+	static char	source[LNET_SINGLE_TEXTBUF_NOB];
+
+	struct list_head	  raw_entries;
+	struct list_head	  matched_nets;
+	struct list_head	  current_nets;
+	struct list_head	 *t;
+	struct list_head	 *t2;
+	lnet_text_buf_t    *tb;
+	lnet_text_buf_t    *tb2;
+	__u32	       net1;
+	__u32	       net2;
+	int		 len;
+	int		 count;
+	int		 dup;
+	int		 rc;
+	/* build a "networks" string from the ip2nets entries matching 'ipaddrs' */
+	INIT_LIST_HEAD(&raw_entries);
+	if (lnet_str2tbs_sep(&raw_entries, ip2nets) < 0) {
+		CERROR("Error parsing ip2nets\n");
+		LASSERT (lnet_tbnob == 0);
+		return -EINVAL;
+	}
+	/* returns the number of entries used (0 if none) or a negative rc */
+	INIT_LIST_HEAD(&matched_nets);
+	INIT_LIST_HEAD(&current_nets);
+	networks[0] = 0;
+	count = 0;
+	len = 0;
+	rc = 0;
+
+	while (!list_empty(&raw_entries)) {
+		tb = list_entry(raw_entries.next, lnet_text_buf_t,
+				    ltb_list);
+		/* keep the untouched entry text for syntax error reports */
+		strncpy(source, tb->ltb_text, sizeof(source)-1);
+		source[sizeof(source)-1] = 0;
+
+		/* replace ltb_text with the network(s) add on match */
+		rc = lnet_match_network_tokens(tb->ltb_text, ipaddrs, nip);
+		if (rc < 0)
+			break;
+
+		list_del(&tb->ltb_list);
+
+		if (rc == 0) {		  /* no match */
+			lnet_free_text_buf(tb);
+			continue;
+		}
+
+		/* split into separate networks */
+		INIT_LIST_HEAD(&current_nets);
+		list_add(&tb->ltb_list, &current_nets);
+		rc = lnet_splitnets(source, &current_nets);
+		if (rc < 0)
+			break;
+
+		dup = 0;
+		list_for_each (t, &current_nets) {
+			tb = list_entry(t, lnet_text_buf_t, ltb_list);
+			net1 = lnet_netspec2net(tb->ltb_text);
+			LASSERT (net1 != LNET_NIDNET(LNET_NID_ANY));
+
+			list_for_each(t2, &matched_nets) {
+				tb2 = list_entry(t2, lnet_text_buf_t,
+						     ltb_list);
+				net2 = lnet_netspec2net(tb2->ltb_text);
+				LASSERT (net2 != LNET_NIDNET(LNET_NID_ANY));
+
+				if (net1 == net2) {
+					dup = 1;
+					break;
+				}
+			}
+
+			if (dup)
+				break;
+		}
+		/* an entry naming an already matched net is dropped whole */
+		if (dup) {
+			lnet_free_text_bufs(&current_nets);
+			continue;
+		}
+
+		list_for_each_safe(t, t2, &current_nets) {
+			tb = list_entry(t, lnet_text_buf_t, ltb_list);
+
+			list_del(&tb->ltb_list);
+			list_add_tail(&tb->ltb_list, &matched_nets);
+
+			len += snprintf(networks + len, sizeof(networks) - len,
+					"%s%s", (len == 0) ? "" : ",",
+					tb->ltb_text);
+
+			if (len >= sizeof(networks)) {
+				CERROR("Too many matched networks\n");
+				rc = -E2BIG;
+				goto out;
+			}
+		}
+
+		count++;
+	}
+
+ out:
+	lnet_free_text_bufs(&raw_entries);
+	lnet_free_text_bufs(&matched_nets);
+	lnet_free_text_bufs(&current_nets);
+	LASSERT (lnet_tbnob == 0);
+
+	if (rc < 0)
+		return rc;
+	/* NB the result lives in a static buffer reused by the next call */
+	*networksp = networks;
+	return count;
+}
+
+void
+lnet_ipaddr_free_enumeration(__u32 *ipaddrs, int nip)
+{	/* frees nip * sizeof(__u32) bytes: pass the allocated entry count */
+	LIBCFS_FREE(ipaddrs, nip * sizeof(*ipaddrs));
+}
+
+int
+lnet_ipaddr_enumerate (__u32 **ipaddrsp)
+{
+	int	up;
+	__u32      netmask;
+	__u32     *ipaddrs;
+	__u32     *ipaddrs2;
+	int	nip;
+	char     **ifnames;
+	int	nif = libcfs_ipif_enumerate(&ifnames);
+	int	i;
+	int	rc;
+	/* collect the IP address of every interface that is up, except "lo" */
+	if (nif <= 0)
+		return nif;
+
+	LIBCFS_ALLOC(ipaddrs, nif * sizeof(*ipaddrs));
+	if (ipaddrs == NULL) {
+		CERROR("Can't allocate ipaddrs[%d]\n", nif);
+		libcfs_ipif_free_enumeration(ifnames, nif);
+		return -ENOMEM;
+	}
+
+	for (i = nip = 0; i < nif; i++) {
+		if (!strcmp(ifnames[i], "lo"))
+			continue;
+
+		rc = libcfs_ipif_query(ifnames[i], &up,
+				       &ipaddrs[nip], &netmask);
+		if (rc != 0) {
+			CWARN("Can't query interface %s: %d\n",
+			      ifnames[i], rc);
+			continue;
+		}
+
+		if (!up) {
+			CWARN("Ignoring interface %s: it's down\n",
+			      ifnames[i]);
+			continue;
+		}
+
+		nip++;
+	}
+
+	libcfs_ipif_free_enumeration(ifnames, nif);
+	/* returns the address count; *ipaddrsp is only set when it is > 0 */
+	if (nip == nif) {
+		*ipaddrsp = ipaddrs;
+	} else {
+		if (nip > 0) {
+			LIBCFS_ALLOC(ipaddrs2, nip * sizeof(*ipaddrs2));
+			if (ipaddrs2 == NULL) {
+				CERROR("Can't allocate ipaddrs[%d]\n", nip);
+				nip = -ENOMEM;
+			} else {
+				/* hand back an array of exactly 'nip' entries */
+				memcpy(ipaddrs2, ipaddrs,
+				       nip * sizeof(*ipaddrs));
+				*ipaddrsp = ipaddrs2;
+			}
+		}
+		lnet_ipaddr_free_enumeration(ipaddrs, nif);
+	}
+	return nip;
+}
+
+int
+lnet_parse_ip2nets (char **networksp, char *ip2nets)
+{
+	__u32	*addrs;
+	int	 naddrs;
+	int	 nmatched;
+
+	/* gather the local IP addresses the ip2nets patterns are tested on */
+	naddrs = lnet_ipaddr_enumerate(&addrs);
+	if (naddrs == 0) {
+		LCONSOLE_ERROR_MSG(0x118, "No local IP interfaces "
+				   "for ip2nets to match\n");
+		return -ENOENT;
+	} else if (naddrs < 0) {
+		LCONSOLE_ERROR_MSG(0x117, "Error %d enumerating local IP "
+				   "interfaces for ip2nets to match\n", naddrs);
+		return naddrs;
+	}
+
+	/* the address array is only needed while matching */
+	nmatched = lnet_match_networks(networksp, ip2nets, addrs, naddrs);
+	lnet_ipaddr_free_enumeration(addrs, naddrs);
+
+	if (nmatched > 0)
+		return 0;
+
+	if (nmatched == 0) {
+		LCONSOLE_ERROR_MSG(0x11a, "ip2nets does not match "
+				   "any local IP interfaces\n");
+		return -ENOENT;
+	}
+
+	LCONSOLE_ERROR_MSG(0x119, "Error %d parsing ip2nets\n", nmatched);
+	return nmatched;
+}
+
+int
+lnet_set_ip_niaddr (lnet_ni_t *ni)
+{
+	__u32  net = LNET_NIDNET(ni->ni_nid);
+	char **names;
+	int    n;
+	__u32  ip;
+	__u32  netmask;
+	int    up;
+	int    i;
+	int    rc;
+
+	/* Convenience for LNDs that use the IP address of a local interface as
+	 * the local address part of their NID */
+	/* a configured interface must be the only one and must be up */
+	if (ni->ni_interfaces[0] != NULL) {
+
+		CLASSERT (LNET_MAX_INTERFACES > 1);
+
+		if (ni->ni_interfaces[1] != NULL) {
+			CERROR("Net %s doesn't support multiple interfaces\n",
+			       libcfs_net2str(net));
+			return -EPERM;
+		}
+
+		rc = libcfs_ipif_query(ni->ni_interfaces[0],
+				       &up, &ip, &netmask);
+		if (rc != 0) {
+			CERROR("Net %s can't query interface %s: %d\n",
+			       libcfs_net2str(net), ni->ni_interfaces[0], rc);
+			return -EPERM;
+		}
+
+		if (!up) {
+			CERROR("Net %s can't use interface %s: it's down\n",
+			       libcfs_net2str(net), ni->ni_interfaces[0]);
+			return -ENETDOWN;
+		}
+
+		ni->ni_nid = LNET_MKNID(net, ip);
+		return 0;
+	}
+	/* nothing configured: use the first "up" interface other than "lo" */
+	n = libcfs_ipif_enumerate(&names);
+	if (n <= 0) {
+		CERROR("Net %s can't enumerate interfaces: %d\n",
+		       libcfs_net2str(net), n);
+		return 0;	/* NOTE(review): success with no address set? */
+	}
+
+	for (i = 0; i < n; i++) {
+		if (!strcmp(names[i], "lo")) /* skip the loopback IF */
+			continue;
+
+		rc = libcfs_ipif_query(names[i], &up, &ip, &netmask);
+
+		if (rc != 0) {
+			CWARN("Net %s can't query interface %s: %d\n",
+			      libcfs_net2str(net), names[i], rc);
+			continue;
+		}
+
+		if (!up) {
+			CWARN("Net %s ignoring interface %s (down)\n",
+			      libcfs_net2str(net), names[i]);
+			continue;
+		}
+
+		libcfs_ipif_free_enumeration(names, n);
+		ni->ni_nid = LNET_MKNID(net, ip);
+		return 0;
+	}
+
+	CERROR("Net %s can't find any interfaces\n", libcfs_net2str(net));
+	libcfs_ipif_free_enumeration(names, n);
+	return -ENOENT;
+}
+EXPORT_SYMBOL(lnet_set_ip_niaddr);
diff --git a/drivers/staging/lustre/lnet/lnet/lib-eq.c b/drivers/staging/lustre/lnet/lnet/lib-eq.c
new file mode 100644
index 000000000000..78297a7d94e8
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-eq.c
@@ -0,0 +1,447 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-eq.c
+ *
+ * Library level Event queue management routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+/**
+ * Create an event queue that has room for \a count number of events.
+ *
+ * The event queue is circular and older events will be overwritten by new
+ * ones if they are not removed in time by the user using the functions
+ * LNetEQGet(), LNetEQWait(), or LNetEQPoll(). It is up to the user to
+ * determine the appropriate size of the event queue to prevent this loss
+ * of events. Note that when EQ handler is specified in \a callback, no
+ * event loss can happen, since the handler is run for each event deposited
+ * into the EQ.
+ *
+ * \param count The number of events to be stored in the event queue. It
+ * will be rounded up to the next power of two.
+ * \param callback A handler function that runs when an event is deposited
+ * into the EQ. The constant value LNET_EQ_HANDLER_NONE can be used to
+ * indicate that no event handler is desired.
+ * \param handle On successful return, this location will hold a handle for
+ * the newly created EQ.
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If a parameter is not valid.
+ * \retval -ENOMEM If memory for the EQ can't be allocated.
+ *
+ * \see lnet_eq_handler_t for the discussion on EQ handler semantics.
+ */
+int
+LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback,
+	    lnet_handle_eq_t *handle)
+{
+	lnet_eq_t     *eq;
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	/* We need count to be a power of 2 so that when eq_{enq,deq}_seq
+	 * overflow, they don't skip entries, so the queue has the same
+	 * apparent capacity at all times */
+
+	count = cfs_power2_roundup(count);
+
+	if (callback != LNET_EQ_HANDLER_NONE && count != 0) {
+		CWARN("EQ callback is guaranteed to get every event, "
+		      "do you still want to set eqcount %u for polling "
+		      "event which will have locking overhead? "
+		      "Please contact with developer to confirm\n", count);
+	}
+
+	/* count may be 0 when only the callback is wanted, which avoids the
+	 * overhead of enqueueing events; an EQ with neither is rejected */
+	if (count == 0 && callback == LNET_EQ_HANDLER_NONE)
+		return -EINVAL;
+
+	eq = lnet_eq_alloc();
+	if (eq == NULL)
+		return -ENOMEM;
+
+	if (count != 0) {
+		LIBCFS_ALLOC(eq->eq_events, count * sizeof(lnet_event_t));
+		if (eq->eq_events == NULL)
+			goto failed;
+		/* NB allocator has set all event sequence numbers to 0, so
+		 * all of them are earlier than eq_deq_seq (it starts at 1) */
+	}
+
+	eq->eq_deq_seq = 1;
+	eq->eq_enq_seq = 1;
+	eq->eq_size = count;
+	eq->eq_callback = callback;
+
+	eq->eq_refs = cfs_percpt_alloc(lnet_cpt_table(),
+				       sizeof(*eq->eq_refs[0]));
+	if (eq->eq_refs == NULL)
+		goto failed;
+
+	/* MUST hold both the exclusive lnet_res_lock and lnet_eq_wait_lock */
+	lnet_res_lock(LNET_LOCK_EX);
+	/* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do
+	 * both EQ lookup and poll event with only lnet_eq_wait_lock */
+	lnet_eq_wait_lock();
+
+	lnet_res_lh_initialize(&the_lnet.ln_eq_container, &eq->eq_lh);
+	list_add(&eq->eq_list, &the_lnet.ln_eq_container.rec_active);
+
+	lnet_eq_wait_unlock();
+	lnet_res_unlock(LNET_LOCK_EX);
+
+	lnet_eq2handle(handle, eq);
+	return 0;
+
+failed:
+	if (eq->eq_events != NULL)
+		LIBCFS_FREE(eq->eq_events, count * sizeof(lnet_event_t));
+
+	if (eq->eq_refs != NULL)
+		cfs_percpt_free(eq->eq_refs);
+
+	lnet_eq_free(eq);
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(LNetEQAlloc);
+
+/**
+ * Release the resources associated with an event queue if it's idle;
+ * otherwise do nothing and it's up to the user to try again.
+ *
+ * \param eqh A handle for the event queue to be released.
+ *
+ * \retval 0 If the EQ is not in use and freed.
+ * \retval -ENOENT If \a eqh does not point to a valid EQ.
+ * \retval -EBUSY  If the EQ is still in use by some MDs.
+ */
+int
+LNetEQFree(lnet_handle_eq_t eqh)
+{
+	struct lnet_eq	*eq;
+	lnet_event_t	*events = NULL;
+	int		**ref_tbl = NULL;
+	int		*ref;
+	int		size = 0;
+	int		status = 0;
+	int		slot;
+
+	LASSERT(the_lnet.ln_init);
+	LASSERT(the_lnet.ln_refcount > 0);
+
+	/* NB: EQ link/unlink also takes lnet_eq_wait_lock, so that EQ
+	 * lookup and event polling need only lnet_eq_wait_lock */
+	lnet_res_lock(LNET_LOCK_EX);
+	lnet_eq_wait_lock();
+
+	eq = lnet_handle2eq(&eqh);
+	if (eq == NULL) {
+		status = -ENOENT;
+		goto unlock;
+	}
+
+	/* a non-zero count in any slot means the EQ is still in use */
+	cfs_percpt_for_each(ref, slot, eq->eq_refs) {
+		LASSERT(*ref >= 0);
+		if (*ref != 0) {
+			CDEBUG(D_NET, "Event equeue (%d: %d) busy on destroy.\n",
+			       slot, *ref);
+			status = -EBUSY;
+			goto unlock;
+		}
+	}
+
+	/* keep the buffers so they can be freed after the locks are dropped */
+	ref_tbl	= eq->eq_refs;
+	size	= eq->eq_size;
+	events	= eq->eq_events;
+
+	lnet_res_lh_invalidate(&eq->eq_lh);
+	list_del(&eq->eq_list);
+	lnet_eq_free_locked(eq);
+unlock:
+	lnet_eq_wait_unlock();
+	lnet_res_unlock(LNET_LOCK_EX);
+
+	if (events != NULL)
+		LIBCFS_FREE(events, size * sizeof(lnet_event_t));
+	if (ref_tbl != NULL)
+		cfs_percpt_free(ref_tbl);
+
+	return status;
+}
+EXPORT_SYMBOL(LNetEQFree);
+
+void
+lnet_eq_enqueue_event(lnet_eq_t *eq, lnet_event_t *ev)
+{
+	/* MUST be called with resource lock held but w/o lnet_eq_wait_lock */
+	int index;
+
+	if (eq->eq_size == 0) { /* no event array: only run the handler */
+		LASSERT(eq->eq_callback != LNET_EQ_HANDLER_NONE);
+		eq->eq_callback(ev);
+		return;
+	}
+
+	lnet_eq_wait_lock();
+	ev->sequence = eq->eq_enq_seq++;
+	/* eq_size is a power of 2, so masking the sequence picks the slot */
+	LASSERT(eq->eq_size == LOWEST_BIT_SET(eq->eq_size));
+	index = ev->sequence & (eq->eq_size - 1);
+
+	eq->eq_events[index] = *ev; /* replaces whatever the slot held */
+
+	if (eq->eq_callback != LNET_EQ_HANDLER_NONE)
+		eq->eq_callback(ev); /* runs with lnet_eq_wait_lock held */
+
+	/* Wake anyone waiting in LNetEQPoll() */
+	if (waitqueue_active(&the_lnet.ln_eq_waitq))
+		wake_up_all(&the_lnet.ln_eq_waitq);
+	lnet_eq_wait_unlock();
+}
+
+int
+lnet_eq_dequeue_event(lnet_eq_t *eq, lnet_event_t *ev)
+{
+	int		new_index = eq->eq_deq_seq & (eq->eq_size - 1);
+	lnet_event_t	*new_event = &eq->eq_events[new_index];
+	int		rc;
+	ENTRY;
+
+	/* Must be called with lnet_eq_wait_lock held.  A callback-only EQ
+	 * (eq_size == 0) has no event array: test it before new_event */
+	if (eq->eq_size == 0 ||
+	    LNET_SEQ_GT(eq->eq_deq_seq, new_event->sequence))
+		RETURN(0);
+
+	/* We've got a new event: copy it out for the caller... */
+	*ev = *new_event;
+	CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n",
+	       new_event, eq->eq_deq_seq, eq->eq_size);
+
+	/* ...but did it overwrite an event we've not seen yet? */
+	if (eq->eq_deq_seq == new_event->sequence) {
+		rc = 1;
+	} else {
+		/* don't complain with CERROR: some EQs are sized small
+		 * anyway; if it's important, the caller should complain */
+		CDEBUG(D_NET, "Event Queue Overflow: eq seq %lu ev seq %lu\n",
+		       eq->eq_deq_seq, new_event->sequence);
+		rc = -EOVERFLOW;
+	}
+	eq->eq_deq_seq = new_event->sequence + 1; /* resume after this one */
+	RETURN(rc);
+}
+
+/**
+ * A nonblocking function that can be used to get the next event in an EQ.
+ * If an event handler is associated with the EQ, the handler will run before
+ * this function returns successfully. The event is removed from the queue.
+ *
+ * \param eventq A handle for the event queue.
+ * \param event On successful return (1 or -EOVERFLOW), this location will
+ * hold the next event in the EQ.
+ *
+ * \retval 0	  No pending event in the EQ.
+ * \retval 1	  Indicates success.
+ * \retval -ENOENT    If \a eventq does not point to a valid EQ.
+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that
+ * at least one event between this event and the last event obtained from the
+ * EQ has been dropped due to limited space in the EQ.
+ */
+int
+LNetEQGet (lnet_handle_eq_t eventq, lnet_event_t *event)
+{
+	int eq_index;	/* required by LNetEQPoll(); unused with one EQ */
+
+	/* a zero timeout makes LNetEQPoll() return without sleeping */
+	return LNetEQPoll(&eventq, 1, 0, event, &eq_index);
+}
+EXPORT_SYMBOL(LNetEQGet);
+
+/**
+ * Block the calling process until there is an event in the EQ.
+ * If an event handler is associated with the EQ, the handler will run before
+ * this function returns successfully. This function returns the next event
+ * in the EQ and removes it from the EQ.
+ *
+ * \param eventq A handle for the event queue.
+ * \param event On successful return (1 or -EOVERFLOW), this location will
+ * hold the next event in the EQ.
+ *
+ * \retval 1	  Indicates success.
+ * \retval -ENOENT    If \a eventq does not point to a valid EQ.
+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that
+ * at least one event between this event and the last event obtained from the
+ * EQ has been dropped due to limited space in the EQ.
+ */
+int
+LNetEQWait (lnet_handle_eq_t eventq, lnet_event_t *event)
+{
+	int eq_index;	/* required by LNetEQPoll(); unused with one EQ */
+
+	/* LNET_TIME_FOREVER asks LNetEQPoll() for an unlimited wait */
+	return LNetEQPoll(&eventq, 1, LNET_TIME_FOREVER, event, &eq_index);
+}
+EXPORT_SYMBOL(LNetEQWait);
+
+/* Sleep for an EQ wake-up; return -1/1/0 as described in LNetEQPoll() */
+static int
+lnet_eq_wait_locked(int *timeout_ms)
+{
+	int		tms = *timeout_ms;
+	int		wait;
+	wait_queue_t  wl;
+	cfs_time_t      now;
+
+	if (tms == 0)
+		return -1; /* non-blocking poll: don't wait, no new event */
+
+	init_waitqueue_entry_current(&wl);
+	set_current_state(TASK_INTERRUPTIBLE);
+	add_wait_queue(&the_lnet.ln_eq_waitq, &wl);
+
+	lnet_eq_wait_unlock(); /* entered locked; drop it while sleeping */
+
+	if (tms < 0) { /* negative timeout: wait with no time limit */
+		waitq_wait(&wl, TASK_INTERRUPTIBLE);
+
+	} else {
+		struct timeval tv;
+
+		now = cfs_time_current();
+		waitq_timedwait(&wl, TASK_INTERRUPTIBLE,
+				    cfs_time_seconds(tms) / 1000);
+		cfs_duration_usec(cfs_time_sub(cfs_time_current(), now), &tv);
+		tms -= (int)(tv.tv_sec * 1000 + tv.tv_usec / 1000);
+		if (tms < 0) /* no more wait but may have new event */
+			tms = 0;
+	}
+
+	wait = tms != 0; /* time left (or no limit): may be called again */
+	*timeout_ms = tms; /* report the remaining time to the caller */
+
+	lnet_eq_wait_lock(); /* return with the lock held again */
+	remove_wait_queue(&the_lnet.ln_eq_waitq, &wl);
+
+	return wait;
+}
+
+
+
+/**
+ * Block the calling process until there's an event from a set of EQs or
+ * timeout happens.
+ *
+ * If an event handler is associated with the EQ, the handler will run before
+ * this function returns successfully, in which case the corresponding event
+ * is consumed.
+ *
+ * LNetEQPoll() provides a timeout to allow applications to poll, block for a
+ * fixed period, or block indefinitely.
+ *
+ * \param eventqs,neq An array of EQ handles, and size of the array.
+ * \param timeout_ms Time in milliseconds to wait for an event to occur on
+ * one of the EQs. The constant LNET_TIME_FOREVER can be used to indicate an
+ * infinite timeout.
+ * \param event,which On successful return (1 or -EOVERFLOW), \a event will
+ * hold the next event in the EQs, and \a which will contain the index of the
+ * EQ from which the event was taken.
+ *
+ * \retval 0	  No pending event in the EQs after timeout.
+ * \retval 1	  Indicates success.
+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that
+ * at least one event between this event and the last event obtained from the
+ * EQ indicated by \a which has been dropped due to limited space in the EQ.
+ * \retval -ENOENT    If there's an invalid handle in \a eventqs.
+ */
+int
+LNetEQPoll(lnet_handle_eq_t *eventqs, int neq, int timeout_ms,
+	   lnet_event_t *event, int *which)
+{
+	int	wait = 1; /* last lnet_eq_wait_locked() result, see below */
+	int	rc;
+	int	i;
+	ENTRY;
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	if (neq < 1) /* an empty handle array is reported as -ENOENT */
+		RETURN(-ENOENT);
+
+	lnet_eq_wait_lock();
+
+	for (;;) {
+		/* scan the EQs in array order; the first pending event wins */
+		for (i = 0; i < neq; i++) {
+			lnet_eq_t *eq = lnet_handle2eq(&eventqs[i]);
+
+			if (eq == NULL) {
+				lnet_eq_wait_unlock();
+				RETURN(-ENOENT);
+			}
+
+			rc = lnet_eq_dequeue_event(eq, event);
+			if (rc != 0) { /* 1 or -EOVERFLOW: event copied out */
+				lnet_eq_wait_unlock();
+				*which = i;
+				RETURN(rc);
+			}
+		}
+
+		if (wait == 0) /* time ran out and the EQs were re-scanned */
+			break;
+
+		/*
+		 * return value of lnet_eq_wait_locked:
+		 * -1 : did nothing, and it is certain there is no new event
+		 *  1 : slept; time remains (or no limit), may sleep again
+		 *  0 : no more time to wait, but there might be a new event
+		 *      so the dequeue scan has to be run once more
+		 */
+		wait = lnet_eq_wait_locked(&timeout_ms);
+		if (wait < 0) /* no new event */
+			break;
+	}
+
+	lnet_eq_wait_unlock();
+	RETURN(0);
+}
diff --git a/drivers/staging/lustre/lnet/lnet/lib-md.c b/drivers/staging/lustre/lnet/lnet/lib-md.c
new file mode 100644
index 000000000000..ae643f26933b
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-md.c
@@ -0,0 +1,451 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-md.c
+ *
+ * Memory Descriptor management routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/* must be called with lnet_res_lock held */
+void
+lnet_md_unlink(lnet_libmd_t *md)
+{
+	if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) == 0) {
+		/* first unlink attempt: the flag keeps this from repeating */
+		lnet_me_t *me = md->md_me;
+
+		md->md_flags |= LNET_MD_FLAG_ZOMBIE;
+
+		/* Disassociate from ME (if any), and unlink it if it was created
+		 * with LNET_UNLINK */
+		if (me != NULL) {
+			/* detach MD from portal */
+			lnet_ptl_detach_md(me, md);
+			if (me->me_unlink == LNET_UNLINK)
+				lnet_me_unlink(me);
+		}
+
+		/* ensure all future handle lookups fail */
+		lnet_res_lh_invalidate(&md->md_lh);
+	}
+	/* an MD that is still referenced is only marked, not freed */
+	if (md->md_refcount != 0) {
+		CDEBUG(D_NET, "Queueing unlink of md %p\n", md);
+		return;
+	}
+
+	CDEBUG(D_NET, "Unlinking md %p\n", md);
+	/* give back the reference this MD holds on its EQ, if any */
+	if (md->md_eq != NULL) {
+		int	cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie);
+
+		LASSERT(*md->md_eq->eq_refs[cpt] > 0);
+		(*md->md_eq->eq_refs[cpt])--;
+	}
+
+	LASSERT(!list_empty(&md->md_list));
+	list_del_init(&md->md_list);
+	lnet_md_free_locked(md);
+}
+
+static int
+lnet_md_build(lnet_libmd_t *lmd, lnet_md_t *umd, int unlink)
+{
+	int	  i;
+	unsigned int niov;
+	int	  total_length = 0;
+	/* Fill in \a lmd from \a umd; returns 0 or -EINVAL if \a umd is bad */
+	lmd->md_me = NULL; /* no ME is attached here */
+	lmd->md_start = umd->start;
+	lmd->md_offset = 0;
+	lmd->md_max_size = umd->max_size;
+	lmd->md_options = umd->options;
+	lmd->md_user_ptr = umd->user_ptr;
+	lmd->md_eq = NULL; /* set by lnet_md_link() if an EQ is given */
+	lmd->md_threshold = umd->threshold;
+	lmd->md_refcount = 0;
+	lmd->md_flags = (unlink == LNET_UNLINK) ? LNET_MD_FLAG_AUTO_UNLINK : 0;
+
+	if ((umd->options & LNET_MD_IOVEC) != 0) {
+		/* here umd->start is an array of umd->length iov entries */
+		if ((umd->options & LNET_MD_KIOV) != 0) /* Can't specify both */
+			return -EINVAL;
+
+		lmd->md_niov = niov = umd->length;
+		memcpy(lmd->md_iov.iov, umd->start,
+		       niov * sizeof (lmd->md_iov.iov[0]));
+
+		for (i = 0; i < (int)niov; i++) {
+			/* We take the base address on trust */
+			if (lmd->md_iov.iov[i].iov_len <= 0) /* invalid length */
+				return -EINVAL;
+
+			total_length += lmd->md_iov.iov[i].iov_len;
+		}
+
+		lmd->md_length = total_length;
+
+		if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
+		    (umd->max_size < 0 ||
+		     umd->max_size > total_length)) /* illegal max_size */
+			return -EINVAL;
+
+	} else if ((umd->options & LNET_MD_KIOV) != 0) {
+		lmd->md_niov = niov = umd->length;
+		memcpy(lmd->md_iov.kiov, umd->start,
+		       niov * sizeof (lmd->md_iov.kiov[0]));
+
+		for (i = 0; i < (int)niov; i++) {
+			/* We take the page pointer on trust */
+			if (lmd->md_iov.kiov[i].kiov_offset +
+			    lmd->md_iov.kiov[i].kiov_len > PAGE_CACHE_SIZE )
+				return -EINVAL; /* fragment runs past its page */
+
+			total_length += lmd->md_iov.kiov[i].kiov_len;
+		}
+
+		lmd->md_length = total_length;
+
+		if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
+		    (umd->max_size < 0 ||
+		     umd->max_size > total_length)) /* illegal max_size */
+			return -EINVAL;
+	} else {   /* contiguous: a single iov covering the whole buffer */
+		lmd->md_length = umd->length;
+		lmd->md_niov = niov = 1;
+		lmd->md_iov.iov[0].iov_base = umd->start;
+		lmd->md_iov.iov[0].iov_len = umd->length;
+
+		if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
+		    (umd->max_size < 0 ||
+		     umd->max_size > (int)umd->length)) /* illegal max_size */
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* must be called with resource lock held */
+static int
+lnet_md_link(lnet_libmd_t *md, lnet_handle_eq_t eq_handle, int cpt)
+{
+	struct lnet_res_container *container;
+
+	/* NB \a md arrives allocated but inactive.  Once this returns
+	 * success the caller may lnet_md_unlink() it; after a failure the
+	 * caller may only lnet_md_free() it.
+	 *
+	 * This implementation doesn't know how to create START events or
+	 * disable END events.
+	 * TODO - reevaluate what should be here in light of the removal
+	 * of the start and end events (maybe we shouldn't even allow
+	 * LNET_EQ_NONE!): LASSERT (eq == NULL);
+	 */
+	if (!LNetHandleIsInvalid(eq_handle)) {
+		md->md_eq = lnet_handle2eq(&eq_handle);
+		if (md->md_eq == NULL)
+			return -ENOENT;
+
+		/* count this MD as a user of the EQ, in slot \a cpt */
+		(*md->md_eq->eq_refs[cpt])++;
+	}
+
+	/* set up md_lh in this partition's container, then list the MD */
+	container = the_lnet.ln_md_containers[cpt];
+	lnet_res_lh_initialize(container, &md->md_lh);
+
+	LASSERT(list_empty(&md->md_list));
+	list_add(&md->md_list, &container->rec_active);
+
+	return 0;
+}
+
+/* must be called with lnet_res_lock held */
+void
+lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd)
+{
+	const int fragmented =
+		(lmd->md_options & (LNET_MD_IOVEC | LNET_MD_KIOV)) != 0;
+
+	/* NB the iov entries themselves are not copied out: for a
+	 * discontiguous MD the target only learns the original iov
+	 * pointer (in start) and how many entries it had (in length). */
+	umd->user_ptr = lmd->md_user_ptr;
+	umd->options = lmd->md_options;
+	umd->max_size = lmd->md_max_size;
+	umd->threshold = lmd->md_threshold;
+	umd->length = fragmented ? lmd->md_niov : lmd->md_length;
+	umd->start = lmd->md_start;
+	lnet_eq2handle(&umd->eq_handle, lmd->md_eq);
+}
+
+int
+lnet_md_validate(lnet_md_t *umd)
+{
+	int	rc = 0;	/* stays 0 when both checks pass */
+
+	if (umd->length != 0 && umd->start == NULL) {
+		CERROR("MD start pointer can not be NULL with length %u\n",
+		       umd->length);
+		rc = -EINVAL;
+	} else if (umd->length > LNET_MAX_IOV && /* length counts fragments */
+		   (umd->options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0) {
+		CERROR("Invalid option: too many fragments %u, %d max\n",
+		       umd->length, LNET_MAX_IOV);
+		rc = -EINVAL;
+	}
+
+	return rc;
+}
+
+/**
+ * Create a memory descriptor and attach it to a ME
+ *
+ * \param meh A handle for a ME to associate the new MD with.
+ * \param umd Provides initial values for the user-visible parts of a MD.
+ * Other than its use for initialization, there is no linkage between this
+ * structure and the MD maintained by the LNet.
+ * \param unlink A flag to indicate whether the MD is automatically unlinked
+ * when it becomes inactive, either because the operation threshold drops to
+ * zero or because the available memory becomes less than \a umd.max_size.
+ * (Note that the check for unlinking a MD only occurs after the completion
+ * of a successful operation on the MD.) The value LNET_UNLINK enables auto
+ * unlinking; the value LNET_RETAIN disables it.
+ * \param handle On successful returns, a handle to the newly created MD is
+ * saved here. This handle can be used later in LNetMDUnlink().
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If \a umd is not valid.
+ * \retval -ENOMEM If new MD cannot be allocated.
+ * \retval -ENOENT Either \a meh or \a umd.eq_handle does not point to a
+ * valid object. Note that it's OK to supply a NULL \a umd.eq_handle by
+ * calling LNetInvalidateHandle() on it.
+ * \retval -EBUSY  If the ME pointed to by \a meh is already associated with
+ * a MD.
+ */
+int
+LNetMDAttach(lnet_handle_me_t meh, lnet_md_t umd,
+	     lnet_unlink_t unlink, lnet_handle_md_t *handle)
+{
+	LIST_HEAD		(matches); /* for lnet_recv_delayed_msg_list() */
+	LIST_HEAD		(drops); /* for lnet_drop_delayed_msg_list() */
+	struct lnet_me		*me;
+	struct lnet_libmd	*md;
+	int			cpt;
+	int			rc;
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	if (lnet_md_validate(&umd) != 0)
+		return -EINVAL;
+
+	if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) == 0) {
+		CERROR("Invalid option: no MD_OP set\n");
+		return -EINVAL;
+	}
+
+	md = lnet_md_alloc(&umd);
+	if (md == NULL)
+		return -ENOMEM;
+
+	rc = lnet_md_build(md, &umd, unlink);
+	cpt = lnet_cpt_of_cookie(meh.cookie);
+	/* lock before testing rc: the failure path frees md under the lock */
+	lnet_res_lock(cpt);
+	if (rc != 0) /* lnet_md_build() rejected umd */
+		goto failed;
+
+	me = lnet_handle2me(&meh);
+	if (me == NULL)
+		rc = -ENOENT;
+	else if (me->me_md != NULL) /* the ME already has an MD */
+		rc = -EBUSY;
+	else
+		rc = lnet_md_link(md, umd.eq_handle, cpt);
+
+	if (rc != 0)
+		goto failed;
+
+	/* attach this MD to portal of ME and check if it matches any
+	 * blocked msgs on this portal */
+	lnet_ptl_attach_md(me, md, &matches, &drops);
+
+	lnet_md2handle(handle, md);
+
+	lnet_res_unlock(cpt);
+	/* the two collected lists are processed with the lock released */
+	lnet_drop_delayed_msg_list(&drops, "Bad match");
+	lnet_recv_delayed_msg_list(&matches);
+
+	return 0;
+
+ failed:
+	lnet_md_free_locked(md);
+	/* md was not linked on any path that gets here, so it is just freed */
+	lnet_res_unlock(cpt);
+	return rc;
+}
+EXPORT_SYMBOL(LNetMDAttach);
+
+/**
+ * Create a "free floating" memory descriptor - a MD that is not associated
+ * with a ME. Such MDs are usually used in LNetPut() and LNetGet() operations.
+ *
+ * \param umd,unlink See the discussion for LNetMDAttach().
+ * \param handle On successful returns, a handle to the newly created MD is
+ * saved here. This handle can be used later in LNetMDUnlink(), LNetPut(),
+ * and LNetGet() operations.
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If \a umd is not valid.
+ * \retval -ENOMEM If new MD cannot be allocated.
+ * \retval -ENOENT \a umd.eq_handle does not point to a valid EQ. Note that
+ * it's OK to supply a NULL \a umd.eq_handle by calling
+ * LNetInvalidateHandle() on it.
+ */
+int
+LNetMDBind(lnet_md_t umd, lnet_unlink_t unlink, lnet_handle_md_t *handle)
+{
+	lnet_libmd_t	*md;
+	int		cpt;
+	int		rc;
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	if (lnet_md_validate(&umd) != 0)
+		return -EINVAL;
+
+	/* the OP_GET/OP_PUT option bits are rejected for a bound MD */
+	if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) != 0) {
+		CERROR("Invalid option: GET|PUT illegal on active MDs\n");
+		return -EINVAL;
+	}
+
+	md = lnet_md_alloc(&umd);
+	if (md == NULL)
+		return -ENOMEM;
+
+	/* Build before taking the lock, but only act on the result once
+	 * the lock is held: both outcomes below finish under the lock */
+	rc = lnet_md_build(md, &umd, unlink);
+	cpt = lnet_res_lock_current();
+
+	if (rc == 0)
+		rc = lnet_md_link(md, umd.eq_handle, cpt);
+
+	if (rc == 0) {
+		/* linked: hand the new MD's handle back to the caller */
+		lnet_md2handle(handle, md);
+	} else {
+		/* build or link failed: the MD was never linked, so it is
+		 * freed directly instead of being unlinked */
+		lnet_md_free_locked(md);
+	}
+
+	lnet_res_unlock(cpt);
+	return rc;
+}
+EXPORT_SYMBOL(LNetMDBind);
+
+/**
+ * Unlink the memory descriptor from any ME it may be linked to and release
+ * the internal resources associated with it.
+ *
+ * This function does not free the memory region associated with the MD;
+ * i.e., the memory the user allocated for this MD. If the ME associated with
+ * this MD is not NULL and was created with auto unlink enabled, the ME is
+ * unlinked as well (see LNetMEAttach()).
+ *
+ * Explicitly unlinking a MD via this function call has the same behavior as
+ * a MD that has been automatically unlinked, except that no LNET_EVENT_UNLINK
+ * is generated in the latter case.
+ *
+ * An unlinked event can be reported in two ways:
+ * - If there's no pending operations on the MD, it's unlinked immediately
+ *   and an LNET_EVENT_UNLINK event is logged before this function returns.
+ * - Otherwise, the MD is only marked for deletion when this function
+ *   returns, and the unlinked event will be piggybacked on the event of
+ *   the completion of the last operation by setting the unlinked field of
+ *   the event. No dedicated LNET_EVENT_UNLINK event is generated.
+ *
+ * Note that in both cases the unlinked field of the event is always set; no
+ * more event will happen on the MD after such an event is logged.
+ *
+ * \param mdh A handle for the MD to be unlinked.
+ *
+ * \retval 0       On success.
+ * \retval -ENOENT If \a mdh does not point to a valid MD object.
+ */
+int
+LNetMDUnlink (lnet_handle_md_t mdh)
+{
+	lnet_event_t	ev;
+	lnet_libmd_t	*md;
+	int		cpt;
+
+	LASSERT(the_lnet.ln_init);
+	LASSERT(the_lnet.ln_refcount > 0);
+
+	cpt = lnet_cpt_of_cookie(mdh.cookie); /* lock index from the handle */
+	lnet_res_lock(cpt);
+
+	md = lnet_handle2md(&mdh);
+	if (md == NULL) {
+		lnet_res_unlock(cpt);
+		return -ENOENT;
+	}
+
+	/* If the MD is busy (md_refcount != 0), lnet_md_unlink just marks
+	 * it for deletion, and when the NAL is done, the completion event
+	 * flags that the MD was unlinked.  Otherwise, we enqueue an event
+	 * now, provided the MD has an EQ to deliver it to... */
+	if (md->md_eq != NULL &&
+	    md->md_refcount == 0) {
+		lnet_build_unlink_event(md, &ev);
+		lnet_eq_enqueue_event(md->md_eq, &ev);
+	}
+
+	lnet_md_unlink(md); /* called after the event above was queued */
+
+	lnet_res_unlock(cpt);
+	return 0;
+}
+EXPORT_SYMBOL(LNetMDUnlink);
diff --git a/drivers/staging/lustre/lnet/lnet/lib-me.c b/drivers/staging/lustre/lnet/lnet/lib-me.c
new file mode 100644
index 000000000000..0081075cabee
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-me.c
@@ -0,0 +1,297 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-me.c
+ *
+ * Match Entry management routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/**
+ * Create and attach a match entry to the match list of \a portal. The new
+ * ME is empty, i.e. not associated with a memory descriptor. LNetMDAttach()
+ * can be used to attach a MD to an empty ME.
+ *
+ * \param portal The portal table index where the ME should be attached.
+ * \param match_id Specifies the match criteria for the process ID of
+ * the requester. The constants LNET_PID_ANY and LNET_NID_ANY can be
+ * used to wildcard either of the identifiers in the lnet_process_id_t
+ * structure.
+ * \param match_bits,ignore_bits Specify the match criteria to apply
+ * to the match bits in the incoming request. The ignore bits are used
+ * to mask out insignificant bits in the incoming match bits. The resulting
+ * bits are then compared to the ME's match bits to determine if the
+ * incoming request meets the match criteria.
+ * \param unlink Indicates whether the ME should be unlinked when the memory
+ * descriptor associated with it is unlinked (Note that the check for
+ * unlinking a ME only occurs when the memory descriptor is unlinked.).
+ * Valid values are LNET_RETAIN and LNET_UNLINK.
+ * \param pos Indicates whether the new ME should be prepended or
+ * appended to the match list. Allowed constants: LNET_INS_BEFORE,
+ * LNET_INS_AFTER. LNET_INS_LOCAL is also handled here and appends the ME
+ * in the same way as LNET_INS_AFTER.
+ * \param handle On successful returns, a handle to the newly created ME
+ * object is saved here. This handle can be used later in LNetMEInsert(),
+ * LNetMEUnlink(), or LNetMDAttach() functions.
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If \a portal is invalid.
+ * \retval -EPERM  If lnet_mt_of_attach() returns no match table for the
+ *		   request (it can't be matched to the portal type).
+ * \retval -ENOMEM If new ME object cannot be allocated.
+ */
+int
+LNetMEAttach(unsigned int portal,
+	     lnet_process_id_t match_id,
+	     __u64 match_bits, __u64 ignore_bits,
+	     lnet_unlink_t unlink, lnet_ins_pos_t pos,
+	     lnet_handle_me_t *handle)
+{
+	struct lnet_match_table *mtable;
+	struct lnet_me		*me;
+	struct list_head		*head;
+
+	LASSERT(the_lnet.ln_init);
+	LASSERT(the_lnet.ln_refcount > 0);
+
+	if ((int)portal >= the_lnet.ln_nportals)
+		return -EINVAL;
+
+	mtable = lnet_mt_of_attach(portal, match_id,
+				   match_bits, ignore_bits, pos);
+	if (mtable == NULL) /* can't match portal type */
+		return -EPERM;
+
+	/* allocated before the lock is taken, so the early returns above
+	 * and below have nothing to release */
+	me = lnet_me_alloc();
+	if (me == NULL)
+		return -ENOMEM;
+
+	/* the resource lock of the match table's partition covers the
+	 * handle setup and the list insertion below */
+	lnet_res_lock(mtable->mt_cpt);
+
+	me->me_portal = portal;
+	me->me_match_id = match_id;
+	me->me_match_bits = match_bits;
+	me->me_ignore_bits = ignore_bits;
+	me->me_unlink = unlink;
+	me->me_md = NULL;
+
+	lnet_res_lh_initialize(the_lnet.ln_me_containers[mtable->mt_cpt],
+			       &me->me_lh);
+	/* MEs with ignore bits all go on the LNET_MT_HASH_IGNORE list; the
+	 * others use whichever list head lnet_mt_match_head() picks */
+	if (ignore_bits != 0)
+		head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE];
+	else
+		head = lnet_mt_match_head(mtable, match_id, match_bits);
+
+	/* remember the chosen list head as an offset from mt_mhash[0] */
+	me->me_pos = head - &mtable->mt_mhash[0];
+	/* LNET_INS_LOCAL is appended just like LNET_INS_AFTER; anything
+	 * else is prepended */
+	if (pos == LNET_INS_AFTER || pos == LNET_INS_LOCAL)
+		list_add_tail(&me->me_list, head);
+	else
+		list_add(&me->me_list, head);
+
+	lnet_me2handle(handle, me);
+
+	lnet_res_unlock(mtable->mt_cpt);
+	return 0;
+}
+EXPORT_SYMBOL(LNetMEAttach);
+
+/**
+ * Create a match entry and insert it before or after the ME pointed to by
+ * \a current_meh. The new ME is empty, i.e. not associated with a memory
+ * descriptor. LNetMDAttach() can be used to attach a MD to an empty ME.
+ *
+ * This function is identical to LNetMEAttach() except for the position
+ * where the new ME is inserted.
+ *
+ * \param current_meh A handle for a ME. The new ME will be inserted
+ * immediately before or immediately after this ME.
+ * \param match_id,match_bits,ignore_bits,unlink,pos,handle See the discussion
+ * for LNetMEAttach().
+ *
+ * \retval 0       On success.
+ * \retval -ENOMEM If new ME object cannot be allocated.
+ * \retval -ENOENT If \a current_meh does not point to a valid match entry.
+ * \retval -EPERM  If \a pos is LNET_INS_LOCAL, or if the portal that
+ *		   \a current_meh belongs to is a unique portal.
+ */
+int
+LNetMEInsert(lnet_handle_me_t current_meh,
+	     lnet_process_id_t match_id,
+	     __u64 match_bits, __u64 ignore_bits,
+	     lnet_unlink_t unlink, lnet_ins_pos_t pos,
+	     lnet_handle_me_t *handle)
+{
+	struct lnet_me		*current_me;
+	struct lnet_me		*new_me;
+	struct lnet_portal	*ptl;
+	int			cpt;
+
+	LASSERT(the_lnet.ln_init);
+	LASSERT(the_lnet.ln_refcount > 0);
+
+	if (pos == LNET_INS_LOCAL)
+		return -EPERM;
+
+	/* allocated before the lock is taken; every failure path below
+	 * gives it back with lnet_me_free_locked() while holding the lock */
+	new_me = lnet_me_alloc();
+	if (new_me == NULL)
+		return -ENOMEM;
+
+	cpt = lnet_cpt_of_cookie(current_meh.cookie);
+
+	lnet_res_lock(cpt);
+
+	current_me = lnet_handle2me(&current_meh);
+	if (current_me == NULL) {
+		lnet_me_free_locked(new_me);
+
+		lnet_res_unlock(cpt);
+		return -ENOENT;
+	}
+
+	LASSERT(current_me->me_portal < the_lnet.ln_nportals);
+
+	ptl = the_lnet.ln_portals[current_me->me_portal];
+	if (lnet_ptl_is_unique(ptl)) {
+		/* it makes no sense to insert into a unique portal */
+		lnet_me_free_locked(new_me);
+		lnet_res_unlock(cpt);
+		return -EPERM;
+	}
+
+	/* the new ME inherits list position and portal from current_me */
+	new_me->me_pos = current_me->me_pos;
+	new_me->me_portal = current_me->me_portal;
+	new_me->me_match_id = match_id;
+	new_me->me_match_bits = match_bits;
+	new_me->me_ignore_bits = ignore_bits;
+	new_me->me_unlink = unlink;
+	new_me->me_md = NULL;
+
+	lnet_res_lh_initialize(the_lnet.ln_me_containers[cpt], &new_me->me_lh);
+
+	/* current_me's own list node is used as the insertion point:
+	 * list_add() puts new_me right after it, list_add_tail() right
+	 * before it */
+	if (pos == LNET_INS_AFTER)
+		list_add(&new_me->me_list, &current_me->me_list);
+	else
+		list_add_tail(&new_me->me_list, &current_me->me_list);
+
+	lnet_me2handle(handle, new_me);
+
+	lnet_res_unlock(cpt);
+
+	return 0;
+}
+EXPORT_SYMBOL(LNetMEInsert);
+
+/**
+ * Unlink a match entry from its match list.
+ *
+ * This operation also releases any resources associated with the ME. If a
+ * memory descriptor is attached to the ME, then it will be unlinked as well
+ * and an unlink event will be generated. It is an error to use the ME handle
+ * after calling LNetMEUnlink().
+ *
+ * \param meh A handle for the ME to be unlinked.
+ *
+ * \retval 0       On success.
+ * \retval -ENOENT If \a meh does not point to a valid ME.
+ * \see LNetMDUnlink() for the discussion on delivering unlink event.
+ */
+int
+LNetMEUnlink(lnet_handle_me_t meh)
+{
+	lnet_me_t	*me;
+	lnet_libmd_t	*attached;
+	lnet_event_t	unlink_ev;
+	int		rc = -ENOENT;
+	int		cpt;
+
+	LASSERT(the_lnet.ln_init);
+	LASSERT(the_lnet.ln_refcount > 0);
+
+	/* the resource lock to take is derived from the handle's cookie */
+	cpt = lnet_cpt_of_cookie(meh.cookie);
+	lnet_res_lock(cpt);
+
+	me = lnet_handle2me(&meh);
+	if (me == NULL)
+		goto out_unlock;
+
+	/* an attached MD that has an event queue and no outstanding
+	 * references gets its unlink event queued before the ME goes away */
+	attached = me->me_md;
+	if (attached != NULL &&
+	    attached->md_eq != NULL &&
+	    attached->md_refcount == 0) {
+		lnet_build_unlink_event(attached, &unlink_ev);
+		lnet_eq_enqueue_event(attached->md_eq, &unlink_ev);
+	}
+
+	lnet_me_unlink(me);
+	rc = 0;
+
+out_unlock:
+	lnet_res_unlock(cpt);
+	return rc;
+}
+EXPORT_SYMBOL(LNetMEUnlink);
+
+/*
+ * Take \a me off its match list, unlink the MD attached to it (if any),
+ * then invalidate the ME's handle and free the ME.
+ *
+ * Call with lnet_res_lock held.
+ */
+void
+lnet_me_unlink(lnet_me_t *me)
+{
+	lnet_libmd_t *attached;
+
+	list_del(&me->me_list);
+
+	attached = me->me_md;
+	if (attached != NULL) {
+		/* detach MD from portal of this ME */
+		lnet_ptl_detach_md(me, attached);
+		lnet_md_unlink(attached);
+	}
+
+	lnet_res_lh_invalidate(&me->me_lh);
+	lnet_me_free_locked(me);
+}
+
+#if 0
+/*
+ * Debugging aid, currently compiled out: print one match entry, its
+ * match/ignore bits, its attached MD and its list neighbours via CWARN.
+ */
+static void
+lib_me_dump(lnet_me_t *me)
+{
+	CWARN("Match Entry %p ("LPX64")\n", me,
+	      me->me_lh.lh_cookie);
+
+	/* the match/ignore bits are 64-bit values; print them as
+	 * unsigned long long so the format matches on every arch */
+	CWARN("\tMatch/Ignore\t= %016llx / %016llx\n",
+	      (unsigned long long)me->me_match_bits,
+	      (unsigned long long)me->me_ignore_bits);
+
+	/* the attached MD is kept in me_md */
+	CWARN("\tMD\t= %p\n", me->me_md);
+	CWARN("\tprev\t= %p\n",
+	      list_entry(me->me_list.prev, lnet_me_t, me_list));
+	CWARN("\tnext\t= %p\n",
+	      list_entry(me->me_list.next, lnet_me_t, me_list));
+}
+#endif
diff --git a/drivers/staging/lustre/lnet/lnet/lib-move.c b/drivers/staging/lustre/lnet/lnet/lib-move.c
new file mode 100644
index 000000000000..49b0f1287a69
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-move.c
@@ -0,0 +1,2441 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-move.c
+ *
+ * Data movement routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/* Module parameter registered through CFS_MODULE_PARM with mode 0444; its
+ * description string is just "Reserved".
+ * NOTE(review): no reader of this variable is visible nearby - confirm
+ * whether it is still used. */
+static int local_nid_dist_zero = 1;
+CFS_MODULE_PARM(local_nid_dist_zero, "i", int, 0444,
+		"Reserved");
+
+/*
+ * Maintain the list of test peers (the_lnet.ln_test_peers).
+ *
+ * With a non-zero \a threshold a new entry for \a nid is appended to the
+ * list.  With \a threshold == 0 entries are removed instead: every entry
+ * whose own threshold is already zero, every entry for \a nid, or all
+ * entries when \a nid is LNET_NID_ANY.
+ *
+ * \retval 0	   On success.
+ * \retval -ENOMEM If a new entry cannot be allocated.
+ */
+int
+lnet_fail_nid (lnet_nid_t nid, unsigned int threshold)
+{
+	lnet_test_peer_t	*peer;
+	struct list_head	*pos;
+	struct list_head	*tmp;
+	struct list_head	 doomed;
+
+	LASSERT (the_lnet.ln_init);
+
+	/* NB: use lnet_net_lock(0) to serialize operations on test peers */
+	if (threshold == 0) {
+		/* removing entries: collect them under the lock ... */
+		INIT_LIST_HEAD(&doomed);
+
+		lnet_net_lock(0);
+
+		list_for_each_safe (pos, tmp, &the_lnet.ln_test_peers) {
+			peer = list_entry (pos, lnet_test_peer_t, tp_list);
+
+			/* keep an entry only if it is still live, we are
+			 * not removing everything, and it is for some
+			 * other NID */
+			if (peer->tp_threshold != 0 &&
+			    nid != LNET_NID_ANY &&
+			    peer->tp_nid != nid)
+				continue;
+
+			list_del (&peer->tp_list);
+			list_add (&peer->tp_list, &doomed);
+		}
+
+		lnet_net_unlock(0);
+
+		/* ... and free them once the lock has been dropped */
+		while (!list_empty (&doomed)) {
+			peer = list_entry (doomed.next, lnet_test_peer_t,
+					   tp_list);
+
+			list_del (&peer->tp_list);
+			LIBCFS_FREE(peer, sizeof (*peer));
+		}
+		return 0;
+	}
+
+	/* Adding a new entry */
+	LIBCFS_ALLOC(peer, sizeof(*peer));
+	if (peer == NULL)
+		return -ENOMEM;
+
+	peer->tp_nid = nid;
+	peer->tp_threshold = threshold;
+
+	lnet_net_lock(0);
+	list_add_tail(&peer->tp_list, &the_lnet.ln_test_peers);
+	lnet_net_unlock(0);
+
+	return 0;
+}
+
+/*
+ * Look \a nid up in the list of test peers and report whether it should
+ * be failed.
+ *
+ * The first live entry that is for \a nid (or for LNET_NID_ANY) makes the
+ * result 1; unless its threshold is LNET_MD_THRESH_INF, that threshold is
+ * decremented.  Entries whose threshold is zero are removed and freed,
+ * but only when \a outgoing is non-zero.
+ *
+ * \retval 1 if a matching entry was found, 0 otherwise.
+ */
+static int
+fail_peer (lnet_nid_t nid, int outgoing)
+{
+	lnet_test_peer_t	*peer;
+	struct list_head	*pos;
+	struct list_head	*tmp;
+	struct list_head	 doomed;
+	int			 fail = 0;
+
+	INIT_LIST_HEAD (&doomed);
+
+	/* NB: use lnet_net_lock(0) to serialize operations on test peers */
+	lnet_net_lock(0);
+
+	list_for_each_safe (pos, tmp, &the_lnet.ln_test_peers) {
+		peer = list_entry (pos, lnet_test_peer_t, tp_list);
+
+		if (peer->tp_threshold == 0) {
+			/* zombie entry: only cull zombies on outgoing
+			 * tests, since we may be at interrupt priority on
+			 * incoming messages. */
+			if (outgoing) {
+				list_del (&peer->tp_list);
+				list_add (&peer->tp_list, &doomed);
+			}
+			continue;
+		}
+
+		/* neither a "fail every peer" entry nor one for this peer */
+		if (peer->tp_nid != LNET_NID_ANY && peer->tp_nid != nid)
+			continue;
+
+		fail = 1;
+
+		if (peer->tp_threshold != LNET_MD_THRESH_INF) {
+			peer->tp_threshold--;
+			/* just expired: cull it now, same rule as above */
+			if (peer->tp_threshold == 0 && outgoing) {
+				list_del (&peer->tp_list);
+				list_add (&peer->tp_list, &doomed);
+			}
+		}
+		break;
+	}
+
+	lnet_net_unlock(0);
+
+	/* free the culled entries now that the lock has been dropped */
+	while (!list_empty (&doomed)) {
+		peer = list_entry (doomed.next, lnet_test_peer_t, tp_list);
+		list_del (&peer->tp_list);
+
+		LIBCFS_FREE(peer, sizeof (*peer));
+	}
+
+	return fail;
+}
+
+/*
+ * Return the number of bytes described by the first \a niov entries of
+ * \a iov, i.e. the sum of their iov_len fields (accumulated in an
+ * unsigned int).
+ */
+unsigned int
+lnet_iov_nob (unsigned int niov, struct iovec *iov)
+{
+	unsigned int total = 0;
+	unsigned int i;
+
+	for (i = 0; i < niov; i++)
+		total += iov[i].iov_len;
+
+	return total;
+}
+EXPORT_SYMBOL(lnet_iov_nob);
+
+/*
+ * Copy \a nob bytes from the source vector \a siov, starting \a soffset
+ * bytes into it, to the destination vector \a diov, starting \a doffset
+ * bytes into it.  Both vectors must be long enough; running out of
+ * fragments on either side trips an LASSERT.
+ */
+void
+lnet_copy_iov2iov (unsigned int ndiov, struct iovec *diov, unsigned int doffset,
+		   unsigned int nsiov, struct iovec *siov, unsigned int soffset,
+		   unsigned int nob)
+{
+	/* NB diov, siov are READ-ONLY */
+	unsigned int  chunk;
+
+	if (nob == 0)
+		return;
+
+	/* skip complete frags before 'doffset' */
+	for (;;) {
+		LASSERT (ndiov > 0);
+		if (doffset < diov->iov_len)
+			break;
+		doffset -= diov->iov_len;
+		diov++;
+		ndiov--;
+	}
+
+	/* skip complete frags before 'soffset' */
+	for (;;) {
+		LASSERT (nsiov > 0);
+		if (soffset < siov->iov_len)
+			break;
+		soffset -= siov->iov_len;
+		siov++;
+		nsiov--;
+	}
+
+	/* nob is non-zero here, so at least one pass is made */
+	while (nob > 0) {
+		LASSERT (ndiov > 0);
+		LASSERT (nsiov > 0);
+
+		/* copy as much as both current frags and 'nob' allow */
+		chunk = MIN(diov->iov_len - doffset,
+			    siov->iov_len - soffset);
+		chunk = MIN(chunk, nob);
+
+		memcpy ((char *)diov->iov_base + doffset,
+			(char *)siov->iov_base + soffset, chunk);
+		nob -= chunk;
+
+		/* destination frag used up => move to the next one */
+		if (diov->iov_len <= doffset + chunk) {
+			diov++;
+			ndiov--;
+			doffset = 0;
+		} else {
+			doffset += chunk;
+		}
+
+		/* likewise for the source frag */
+		if (siov->iov_len <= soffset + chunk) {
+			siov++;
+			nsiov--;
+			soffset = 0;
+		} else {
+			soffset += chunk;
+		}
+	}
+}
+EXPORT_SYMBOL(lnet_copy_iov2iov);
+
+int
+lnet_extract_iov (int dst_niov, struct iovec *dst,
+		  int src_niov, struct iovec *src,
+		  unsigned int offset, unsigned int len)
+{
+	/* Initialise 'dst' to the subset of 'src' starting at 'offset',
+	 * for exactly 'len' bytes, and return the number of entries.
+	 * NB not destructive to 'src' */
+	unsigned int    avail;
+	unsigned int    used;
+
+	/* no data => no frags */
+	if (len == 0)
+		return 0;
+
+	/* skip initial frags */
+	for (;;) {
+		LASSERT (src_niov > 0);
+		if (offset < src->iov_len)
+			break;
+		offset -= src->iov_len;
+		src_niov--;
+		src++;
+	}
+
+	for (used = 1; ; used++) {
+		LASSERT (src_niov > 0);
+		LASSERT ((int)used <= dst_niov);
+
+		avail = src->iov_len - offset;
+		dst->iov_base = ((char *)src->iov_base) + offset;
+
+		/* the rest of the request fits into this frag */
+		if (len <= avail)
+			break;
+
+		/* take all that is left of this frag, go on to the next */
+		dst->iov_len = avail;
+		len -= avail;
+		offset = 0;
+		dst++;
+		src++;
+		src_niov--;
+	}
+
+	dst->iov_len = len;
+	return used;
+}
+EXPORT_SYMBOL(lnet_extract_iov);
+
+
+/*
+ * Return the number of bytes described by the first \a niov entries of
+ * \a kiov, i.e. the sum of their kiov_len fields.
+ */
+unsigned int
+lnet_kiov_nob (unsigned int niov, lnet_kiov_t *kiov)
+{
+	unsigned int  total = 0;
+	unsigned int  i;
+
+	for (i = 0; i < niov; i++)
+		total += kiov[i].kiov_len;
+
+	return total;
+}
+EXPORT_SYMBOL(lnet_kiov_nob);
+
+/*
+ * Copy \a nob bytes from the page-based source vector \a siov, starting
+ * \a soffset bytes into it, to the page-based destination vector \a diov,
+ * starting \a doffset bytes into it.
+ *
+ * Pages are mapped with kmap() while they are being copied; the function
+ * asserts that it is not called in interrupt context.  Running out of
+ * fragments on either side trips an LASSERT.
+ */
+void
+lnet_copy_kiov2kiov (unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset,
+		     unsigned int nsiov, lnet_kiov_t *siov, unsigned int soffset,
+		     unsigned int nob)
+{
+	/* NB diov, siov are READ-ONLY */
+	unsigned int    this_nob;
+	char	   *daddr = NULL;
+	char	   *saddr = NULL;
+
+	if (nob == 0)
+		return;
+
+	LASSERT (!in_interrupt ());
+
+	/* skip complete destination frags before 'doffset' */
+	LASSERT (ndiov > 0);
+	while (doffset >= diov->kiov_len) {
+		doffset -= diov->kiov_len;
+		diov++;
+		ndiov--;
+		LASSERT (ndiov > 0);
+	}
+
+	/* skip complete source frags before 'soffset' */
+	LASSERT (nsiov > 0);
+	while (soffset >= siov->kiov_len) {
+		soffset -= siov->kiov_len;
+		siov++;
+		nsiov--;
+		LASSERT (nsiov > 0);
+	}
+
+	do {
+		LASSERT (ndiov > 0);
+		LASSERT (nsiov > 0);
+		/* copy as much as both current frags and 'nob' allow */
+		this_nob = MIN(diov->kiov_len - doffset,
+			       siov->kiov_len - soffset);
+		this_nob = MIN(this_nob, nob);
+
+		/* a page is mapped only when no mapping was carried over
+		 * from the previous pass (address pointer is NULL) */
+		if (daddr == NULL)
+			daddr = ((char *)kmap(diov->kiov_page)) +
+				diov->kiov_offset + doffset;
+		if (saddr == NULL)
+			saddr = ((char *)kmap(siov->kiov_page)) +
+				siov->kiov_offset + soffset;
+
+		/* Vanishing risk of kmap deadlock when mapping 2 pages.
+		 * However in practice at least one of the kiovs will be mapped
+		 * kernel pages and the map/unmap will be NOOPs */
+
+		memcpy (daddr, saddr, this_nob);
+		nob -= this_nob;
+
+		/* destination frag not exhausted: keep its mapping and
+		 * advance inside it; otherwise unmap it and step on */
+		if (diov->kiov_len > doffset + this_nob) {
+			daddr += this_nob;
+			doffset += this_nob;
+		} else {
+			kunmap(diov->kiov_page);
+			daddr = NULL;
+			diov++;
+			ndiov--;
+			doffset = 0;
+		}
+
+		/* same bookkeeping for the source frag */
+		if (siov->kiov_len > soffset + this_nob) {
+			saddr += this_nob;
+			soffset += this_nob;
+		} else {
+			kunmap(siov->kiov_page);
+			saddr = NULL;
+			siov++;
+			nsiov--;
+			soffset = 0;
+		}
+	} while (nob > 0);
+
+	/* drop mappings still held for partially used frags */
+	if (daddr != NULL)
+		kunmap(diov->kiov_page);
+	if (saddr != NULL)
+		kunmap(siov->kiov_page);
+}
+EXPORT_SYMBOL(lnet_copy_kiov2kiov);
+
+/*
+ * Copy \a nob bytes from the page-based source vector \a kiov, starting
+ * \a kiovoffset bytes into it, to the destination vector \a iov, starting
+ * \a iovoffset bytes into it.
+ *
+ * Source pages are mapped with kmap() while they are being copied; the
+ * function asserts that it is not called in interrupt context.  Running
+ * out of fragments on either side trips an LASSERT.
+ */
+void
+lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov, unsigned int iovoffset,
+		    unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset,
+		    unsigned int nob)
+{
+	/* NB iov, kiov are READ-ONLY */
+	unsigned int    this_nob;
+	char	   *addr = NULL;
+
+	if (nob == 0)
+		return;
+
+	LASSERT (!in_interrupt ());
+
+	/* skip complete destination frags before 'iovoffset' */
+	LASSERT (niov > 0);
+	while (iovoffset >= iov->iov_len) {
+		iovoffset -= iov->iov_len;
+		iov++;
+		niov--;
+		LASSERT (niov > 0);
+	}
+
+	/* skip complete source frags before 'kiovoffset' */
+	LASSERT (nkiov > 0);
+	while (kiovoffset >= kiov->kiov_len) {
+		kiovoffset -= kiov->kiov_len;
+		kiov++;
+		nkiov--;
+		LASSERT (nkiov > 0);
+	}
+
+	do {
+		LASSERT (niov > 0);
+		LASSERT (nkiov > 0);
+		/* copy as much as both current frags and 'nob' allow */
+		this_nob = MIN(iov->iov_len - iovoffset,
+			       kiov->kiov_len - kiovoffset);
+		this_nob = MIN(this_nob, nob);
+
+		/* map the source page unless a mapping was carried over
+		 * from the previous pass */
+		if (addr == NULL)
+			addr = ((char *)kmap(kiov->kiov_page)) +
+				kiov->kiov_offset + kiovoffset;
+
+		memcpy ((char *)iov->iov_base + iovoffset, addr, this_nob);
+		nob -= this_nob;
+
+		/* destination frag used up => move to the next one */
+		if (iov->iov_len > iovoffset + this_nob) {
+			iovoffset += this_nob;
+		} else {
+			iov++;
+			niov--;
+			iovoffset = 0;
+		}
+
+		/* source frag not exhausted: keep its mapping and advance
+		 * inside it; otherwise unmap it and step on */
+		if (kiov->kiov_len > kiovoffset + this_nob) {
+			addr += this_nob;
+			kiovoffset += this_nob;
+		} else {
+			kunmap(kiov->kiov_page);
+			addr = NULL;
+			kiov++;
+			nkiov--;
+			kiovoffset = 0;
+		}
+
+	} while (nob > 0);
+
+	/* drop a mapping still held for a partially used frag */
+	if (addr != NULL)
+		kunmap(kiov->kiov_page);
+}
+EXPORT_SYMBOL(lnet_copy_kiov2iov);
+
+/*
+ * Copy \a nob bytes from the source vector \a iov, starting \a iovoffset
+ * bytes into it, to the page-based destination vector \a kiov, starting
+ * \a kiovoffset bytes into it.
+ *
+ * Destination pages are mapped with kmap() while they are being written;
+ * the function asserts that it is not called in interrupt context.
+ * Running out of fragments on either side trips an LASSERT.
+ */
+void
+lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset,
+		    unsigned int niov, struct iovec *iov, unsigned int iovoffset,
+		    unsigned int nob)
+{
+	/* NB kiov, iov are READ-ONLY */
+	unsigned int    this_nob;
+	char	   *addr = NULL;
+
+	if (nob == 0)
+		return;
+
+	LASSERT (!in_interrupt ());
+
+	/* skip complete destination frags before 'kiovoffset' */
+	LASSERT (nkiov > 0);
+	while (kiovoffset >= kiov->kiov_len) {
+		kiovoffset -= kiov->kiov_len;
+		kiov++;
+		nkiov--;
+		LASSERT (nkiov > 0);
+	}
+
+	/* skip complete source frags before 'iovoffset' */
+	LASSERT (niov > 0);
+	while (iovoffset >= iov->iov_len) {
+		iovoffset -= iov->iov_len;
+		iov++;
+		niov--;
+		LASSERT (niov > 0);
+	}
+
+	do {
+		LASSERT (nkiov > 0);
+		LASSERT (niov > 0);
+		/* copy as much as both current frags and 'nob' allow */
+		this_nob = MIN(kiov->kiov_len - kiovoffset,
+			       iov->iov_len - iovoffset);
+		this_nob = MIN(this_nob, nob);
+
+		/* map the destination page unless a mapping was carried
+		 * over from the previous pass */
+		if (addr == NULL)
+			addr = ((char *)kmap(kiov->kiov_page)) +
+				kiov->kiov_offset + kiovoffset;
+
+		memcpy (addr, (char *)iov->iov_base + iovoffset, this_nob);
+		nob -= this_nob;
+
+		/* destination frag not exhausted: keep its mapping and
+		 * advance inside it; otherwise unmap it and step on */
+		if (kiov->kiov_len > kiovoffset + this_nob) {
+			addr += this_nob;
+			kiovoffset += this_nob;
+		} else {
+			kunmap(kiov->kiov_page);
+			addr = NULL;
+			kiov++;
+			nkiov--;
+			kiovoffset = 0;
+		}
+
+		/* source frag used up => move to the next one */
+		if (iov->iov_len > iovoffset + this_nob) {
+			iovoffset += this_nob;
+		} else {
+			iov++;
+			niov--;
+			iovoffset = 0;
+		}
+	} while (nob > 0);
+
+	/* drop a mapping still held for a partially used frag */
+	if (addr != NULL)
+		kunmap(kiov->kiov_page);
+}
+EXPORT_SYMBOL(lnet_copy_iov2kiov);
+
+int
+lnet_extract_kiov (int dst_niov, lnet_kiov_t *dst,
+		   int src_niov, lnet_kiov_t *src,
+		   unsigned int offset, unsigned int len)
+{
+	/* Initialise 'dst' to the subset of 'src' starting at 'offset',
+	 * for exactly 'len' bytes, and return the number of entries.
+	 * NB not destructive to 'src' */
+	unsigned int    avail;
+	unsigned int    used;
+
+	/* no data => no frags */
+	if (len == 0)
+		return 0;
+
+	/* skip initial frags */
+	for (;;) {
+		LASSERT (src_niov > 0);
+		if (offset < src->kiov_len)
+			break;
+		offset -= src->kiov_len;
+		src_niov--;
+		src++;
+	}
+
+	for (used = 1; ; used++) {
+		LASSERT (src_niov > 0);
+		LASSERT ((int)used <= dst_niov);
+
+		avail = src->kiov_len - offset;
+		dst->kiov_page = src->kiov_page;
+		dst->kiov_offset = src->kiov_offset + offset;
+
+		/* the rest of the request fits into this frag */
+		if (len <= avail)
+			break;
+
+		/* take all that is left of this frag, go on to the next */
+		dst->kiov_len = avail;
+		LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE);
+
+		len -= avail;
+		offset = 0;
+		dst++;
+		src++;
+		src_niov--;
+	}
+
+	dst->kiov_len = len;
+	LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE);
+	return used;
+}
+EXPORT_SYMBOL(lnet_extract_kiov);
+
+/*
+ * Hand a receive to the LND of \a ni through its lnd_recv method.
+ *
+ * \a msg may be NULL only when \a mlen is 0.  When a message is given it
+ * must be in the receiving state, which is cleared here, and for
+ * mlen != 0 its fragment list (exactly one of msg_iov / msg_kiov) is
+ * passed on to the LND.  A negative return from the LND finalizes the
+ * message with that error.
+ */
+void
+lnet_ni_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
+	     unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+	struct iovec	*iov = NULL;
+	lnet_kiov_t	*kiov = NULL;
+	unsigned int	 niov = 0;
+	int		 rc;
+
+	LASSERT (!in_interrupt ());
+	LASSERT (mlen == 0 || msg != NULL);
+
+	if (msg != NULL) {
+		LASSERT(msg->msg_receiving);
+		LASSERT(!msg->msg_sending);
+		LASSERT(rlen == msg->msg_len);
+		LASSERT(mlen <= msg->msg_len);
+		LASSERT(msg->msg_offset == offset);
+		LASSERT(msg->msg_wanted == mlen);
+
+		msg->msg_receiving = 0;
+	}
+
+	/* only a non-empty delivery needs the message's fragment list */
+	if (msg != NULL && mlen != 0) {
+		niov = msg->msg_niov;
+		iov  = msg->msg_iov;
+		kiov = msg->msg_kiov;
+
+		LASSERT (niov > 0);
+		LASSERT ((iov == NULL) != (kiov == NULL));
+	}
+
+	rc = ni->ni_lnd->lnd_recv(ni, private, msg, delayed,
+				  niov, iov, kiov, offset, mlen, rlen);
+	if (rc >= 0)
+		return;
+
+	lnet_finalize(ni, msg, rc);
+}
+
+/*
+ * Point \a msg at the fragment list of its MD: msg_niov is copied from
+ * the MD, and either msg_kiov or msg_iov is set depending on whether the
+ * MD has LNET_MD_KIOV in its options.
+ *
+ * The message must carry a payload, must not be a routed one, must have
+ * an MD and must not have a fragment list yet (all asserted).
+ */
+void
+lnet_setpayloadbuffer(lnet_msg_t *msg)
+{
+	lnet_libmd_t *md = msg->msg_md;
+
+	LASSERT (msg->msg_len > 0);
+	LASSERT (!msg->msg_routing);
+	LASSERT (md != NULL);
+	LASSERT (msg->msg_niov == 0);
+	LASSERT (msg->msg_iov == NULL);
+	LASSERT (msg->msg_kiov == NULL);
+
+	if ((md->md_options & LNET_MD_KIOV) == 0)
+		msg->msg_iov = md->md_iov.iov;
+	else
+		msg->msg_kiov = md->md_iov.kiov;
+
+	msg->msg_niov = md->md_niov;
+}
+
+/*
+ * Prepare \a msg for sending \a len payload bytes, taken at \a offset, to
+ * \a target as a message of type \a type.
+ *
+ * The message fields are filled in, the payload buffer is attached when
+ * there is a payload, and the wire header is zeroed and then set up in
+ * little-endian form.  The header's src_nid is left for later.
+ */
+void
+lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target,
+	       unsigned int offset, unsigned int len)
+{
+	msg->msg_type   = type;
+	msg->msg_target = target;
+	msg->msg_offset = offset;
+	msg->msg_len    = len;
+
+	/* a payload needs the MD's fragment list attached to the message */
+	if (len > 0)
+		lnet_setpayloadbuffer(msg);
+
+	/* start from an all-zero header; src_nid will be set later */
+	memset(&msg->msg_hdr, 0, sizeof(msg->msg_hdr));
+
+	msg->msg_hdr.payload_length = cpu_to_le32(len);
+	msg->msg_hdr.src_pid	    = cpu_to_le32(the_lnet.ln_pid);
+	msg->msg_hdr.dest_pid	    = cpu_to_le32(target.pid);
+	msg->msg_hdr.dest_nid	    = cpu_to_le64(target.nid);
+	msg->msg_hdr.type	    = cpu_to_le32(type);
+}
+
+/*
+ * Pass \a msg to the LND of \a ni through its lnd_send method, together
+ * with the message's private data.  A negative return from the LND
+ * finalizes the message with that error.
+ *
+ * Unless the network type of \a ni is LOLND, the message must hold both
+ * an NI tx credit and a peer tx credit (asserted).
+ */
+void
+lnet_ni_send(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+	int	rc;
+
+	LASSERT (!in_interrupt ());
+	LASSERT (LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND ||
+		 (msg->msg_txcredit && msg->msg_peertxcredit));
+
+	rc = ni->ni_lnd->lnd_send(ni, msg->msg_private, msg);
+	if (rc >= 0)
+		return;
+
+	lnet_finalize(ni, msg, rc);
+}
+
+/*
+ * Call the lnd_eager_recv method of \a ni's LND for \a msg.
+ *
+ * The message is flagged msg_rx_ready_delay before the call, and the LND
+ * may replace msg_private through the pointer it is given.
+ *
+ * \retval 0   On success.
+ * \retval <0  Whatever error the LND reported (also logged).
+ */
+int
+lnet_ni_eager_recv(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+	int	rc;
+
+	LASSERT(!msg->msg_sending);
+	LASSERT(msg->msg_receiving);
+	LASSERT(!msg->msg_rx_ready_delay);
+	LASSERT(ni->ni_lnd->lnd_eager_recv != NULL);
+
+	msg->msg_rx_ready_delay = 1;
+	rc = ni->ni_lnd->lnd_eager_recv(ni, msg->msg_private, msg,
+					&msg->msg_private);
+	if (rc == 0)
+		return 0;
+
+	CERROR("recv from %s / send to %s aborted: "
+	       "eager_recv failed %d\n",
+	       libcfs_nid2str(msg->msg_rxpeer->lp_nid),
+	       libcfs_id2str(msg->msg_target), rc);
+	LASSERT(rc < 0); /* required by my callers */
+
+	return rc;
+}
+
+/*
+ * Ask the LND of \a ni (through lnd_query) when peer \a lp was last alive
+ * and record the answer in lp_last_alive; lp_last_query is set to the
+ * current time in any case.
+ *
+ * Called with lnet_net_lock(lp->lp_cpt) held; the lock is dropped around
+ * the LND call and re-taken afterwards.
+ *
+ * NB: caller shall hold a ref on 'lp' as I'd drop lnet_net_lock
+ */
+void
+lnet_ni_query_locked(lnet_ni_t *ni, lnet_peer_t *lp)
+{
+	cfs_time_t last_alive = 0;
+
+	LASSERT(lnet_peer_aliveness_enabled(lp));
+	LASSERT(ni->ni_lnd->lnd_query != NULL);
+
+	/* the LND query runs without the net lock */
+	lnet_net_unlock(lp->lp_cpt);
+	(ni->ni_lnd->lnd_query)(ni, lp->lp_nid, &last_alive);
+	lnet_net_lock(lp->lp_cpt);
+
+	lp->lp_last_query = cfs_time_current();
+
+	/* last_alive stays 0 when the LND did not fill it in */
+	if (last_alive != 0) /* NI has updated timestamp */
+		lp->lp_last_alive = last_alive;
+}
+
+/*
+ * Decide from the recorded timestamps whether peer \a lp counts as alive
+ * at time \a now.  Returns non-zero when alive, 0 when not.
+ *
+ * NB: always called with lnet_net_lock held
+ */
+static inline int
+lnet_peer_is_alive (lnet_peer_t *lp, cfs_time_t now)
+{
+	cfs_time_t	expiry;
+	int		alive;
+
+	LASSERT (lnet_peer_aliveness_enabled(lp));
+
+	/* Trust lnet_notify() if it has more recent aliveness news, but
+	 * ignore the initial assumed death (see lnet_peers_start_down()).
+	 */
+	if (!lp->lp_alive && lp->lp_alive_count > 0 &&
+	    cfs_time_aftereq(lp->lp_timestamp, lp->lp_last_alive))
+		return 0;
+
+	/* alive as long as the peer timeout has not run out since the
+	 * last sign of life */
+	expiry = cfs_time_add(lp->lp_last_alive,
+			      cfs_time_seconds(lp->lp_ni->ni_peertimeout));
+	alive = cfs_time_after(expiry, now);
+
+	/* nothing to bring up to date: dead, or already marked alive */
+	if (!alive || lp->lp_alive)
+		return alive;
+
+	/* Update obsolete lp_alive except for routers assumed to be dead
+	 * initially, because router checker would update aliveness in this
+	 * case, and moreover lp_last_alive at peer creation is assumed.
+	 */
+	if (lnet_isrouter(lp) && lp->lp_alive_count == 0)
+		return alive;
+
+	lnet_notify_locked(lp, 0, 1, lp->lp_last_alive);
+	return alive;
+}
+
+
+/*
+ * Check whether peer \a lp is alive, querying its NI when the recorded
+ * timestamps say it is dead and the last query is old enough.
+ *
+ * \retval 1	   The peer is alive.
+ * \retval 0	   The peer is dead.
+ * \retval -ENODEV Aliveness checking is not enabled for this peer.
+ *
+ * NB: returns 1 when alive, 0 when dead, negative when error;
+ *     may drop the lnet_net_lock */
+int
+lnet_peer_alive_locked (lnet_peer_t *lp)
+{
+	cfs_time_t now = cfs_time_current();
+
+	if (!lnet_peer_aliveness_enabled(lp))
+		return -ENODEV;
+
+	if (lnet_peer_is_alive(lp, now))
+		return 1;
+
+	/* Peer appears dead, but we should avoid frequent NI queries (at
+	 * most once per lnet_queryinterval seconds). */
+	if (lp->lp_last_query != 0) {
+		static const int lnet_queryinterval = 1;
+
+		cfs_time_t next_query =
+			   cfs_time_add(lp->lp_last_query,
+					cfs_time_seconds(lnet_queryinterval));
+
+		/* too early for another query: report the peer as dead,
+		 * warning if it is nevertheless still marked alive */
+		if (cfs_time_before(now, next_query)) {
+			if (lp->lp_alive)
+				CWARN("Unexpected aliveness of peer %s: "
+				      "%d < %d (%d/%d)\n",
+				      libcfs_nid2str(lp->lp_nid),
+				      (int)now, (int)next_query,
+				      lnet_queryinterval,
+				      lp->lp_ni->ni_peertimeout);
+			return 0;
+		}
+	}
+
+	/* query NI for latest aliveness news */
+	lnet_ni_query_locked(lp->lp_ni, lp);
+
+	/* re-evaluate with whatever the query recorded; 'now' is still the
+	 * time sampled on entry */
+	if (lnet_peer_is_alive(lp, now))
+		return 1;
+
+	/* still dead after the query: record that through the notify path */
+	lnet_notify_locked(lp, 0, 0, lp->lp_last_alive);
+	return 0;
+}
+
+/*
+ * Try to send \a msg to its next hop (msg_txpeer), taking one peer tx
+ * credit and one NI tx credit on the way.  If either credit count goes
+ * negative the message is queued on the matching list and marked delayed.
+ *
+ * Note that the non-zero return values are positive errno numbers:
+ * \retval 0		Sent (do_send != 0) or OK to send (do_send == 0).
+ * \retval EAGAIN	The message was queued waiting for a credit.
+ * \retval EHOSTUNREACH	The next hop appears dead; the message is
+ *			finalized here only when do_send != 0.
+ *
+ * Called with lnet_net_lock(msg->msg_tx_cpt) held; the lock is dropped
+ * and re-taken around the drop report and around the actual send.
+ */
+int
+lnet_post_send_locked(lnet_msg_t *msg, int do_send)
+{
+	/* lnet_send is going to lnet_net_unlock immediately after this,
+	 * so it sets do_send FALSE and I don't do the unlock/send/lock bit.
+	 * I return EAGAIN if msg blocked, EHOSTUNREACH if msg_txpeer
+	 * appears dead, and 0 if sent or OK to send */
+	struct lnet_peer	*lp = msg->msg_txpeer;
+	struct lnet_ni		*ni = lp->lp_ni;
+	struct lnet_tx_queue	*tq;
+	int			cpt;
+
+	/* non-lnet_send() callers have checked before */
+	LASSERT(!do_send || msg->msg_tx_delayed);
+	LASSERT(!msg->msg_receiving);
+	LASSERT(msg->msg_tx_committed);
+
+	cpt = msg->msg_tx_cpt;
+	tq = ni->ni_tx_queues[cpt];
+
+	/* NB 'lp' is always the next hop */
+	/* the aliveness check is skipped for targets whose pid has
+	 * LNET_PID_USERFLAG set */
+	if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 &&
+	    lnet_peer_alive_locked(lp) == 0) {
+		the_lnet.ln_counters[cpt]->drop_count++;
+		the_lnet.ln_counters[cpt]->drop_length += msg->msg_len;
+		/* logging and finalizing happen without the net lock */
+		lnet_net_unlock(cpt);
+
+		CNETERR("Dropping message for %s: peer not alive\n",
+			libcfs_id2str(msg->msg_target));
+		if (do_send)
+			lnet_finalize(ni, msg, -EHOSTUNREACH);
+
+		lnet_net_lock(cpt);
+		return EHOSTUNREACH;
+	}
+
+	/* take a peer tx credit unless the message already holds one */
+	if (!msg->msg_peertxcredit) {
+		LASSERT ((lp->lp_txcredits < 0) ==
+			 !list_empty(&lp->lp_txq));
+
+		msg->msg_peertxcredit = 1;
+		lp->lp_txqnob += msg->msg_len + sizeof(lnet_hdr_t);
+		lp->lp_txcredits--;
+
+		/* keep track of the lowest credit count seen */
+		if (lp->lp_txcredits < lp->lp_mintxcredits)
+			lp->lp_mintxcredits = lp->lp_txcredits;
+
+		/* out of peer credits: wait on the peer's tx queue */
+		if (lp->lp_txcredits < 0) {
+			msg->msg_tx_delayed = 1;
+			list_add_tail(&msg->msg_list, &lp->lp_txq);
+			return EAGAIN;
+		}
+	}
+
+	/* take an NI tx credit unless the message already holds one */
+	if (!msg->msg_txcredit) {
+		LASSERT((tq->tq_credits < 0) ==
+			!list_empty(&tq->tq_delayed));
+
+		msg->msg_txcredit = 1;
+		tq->tq_credits--;
+
+		/* keep track of the lowest credit count seen */
+		if (tq->tq_credits < tq->tq_credits_min)
+			tq->tq_credits_min = tq->tq_credits;
+
+		/* out of NI credits: wait on the tx queue's delayed list */
+		if (tq->tq_credits < 0) {
+			msg->msg_tx_delayed = 1;
+			list_add_tail(&msg->msg_list, &tq->tq_delayed);
+			return EAGAIN;
+		}
+	}
+
+	/* both credits held: send now if asked to, without the net lock */
+	if (do_send) {
+		lnet_net_unlock(cpt);
+		lnet_ni_send(ni, msg);
+		lnet_net_lock(cpt);
+	}
+	return 0;
+}
+
+
+/*
+ * Return the first router buffer pool of the message's rx CPT whose
+ * buffers (rbp_npages pages each) are large enough for msg_len.
+ *
+ * The message must be rx-committed and no longer than LNET_MTU (both
+ * asserted); running past the last pool also trips an LASSERT.
+ */
+lnet_rtrbufpool_t *
+lnet_msg2bufpool(lnet_msg_t *msg)
+{
+	lnet_rtrbufpool_t	*pools;
+	int			idx = 0;
+
+	LASSERT(msg->msg_rx_committed);
+
+	pools = the_lnet.ln_rtrpools[msg->msg_rx_cpt];
+
+	LASSERT(msg->msg_len <= LNET_MTU);
+	while (msg->msg_len >
+	       (unsigned int)pools[idx].rbp_npages * PAGE_CACHE_SIZE) {
+		idx++;
+		LASSERT(idx < LNET_NRBPOOLS);
+	}
+
+	return &pools[idx];
+}
+
+/*
+ * Try to start receiving the routed message \a msg: take one router
+ * credit from the sending peer and one from the matching router buffer
+ * pool, then attach a router buffer to the message.  If either credit
+ * count goes negative the message is queued on the matching list and
+ * marked delayed.
+ *
+ * Note that the non-zero return value is a positive errno number:
+ * \retval 0	  Received (do_recv != 0) or OK to receive (do_recv == 0).
+ * \retval EAGAIN The message was queued waiting for a credit.
+ *
+ * Called with lnet_net_lock(msg->msg_rx_cpt) held; the lock is dropped
+ * and re-taken around the actual receive.
+ */
+int
+lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv)
+{
+	/* lnet_parse is going to lnet_net_unlock immediately after this, so it
+	 * sets do_recv FALSE and I don't do the unlock/recv/lock bit.  I
+	 * return EAGAIN if msg blocked and 0 if received or OK to receive */
+	lnet_peer_t	 *lp = msg->msg_rxpeer;
+	lnet_rtrbufpool_t   *rbp;
+	lnet_rtrbuf_t       *rb;
+
+	LASSERT (msg->msg_iov == NULL);
+	LASSERT (msg->msg_kiov == NULL);
+	LASSERT (msg->msg_niov == 0);
+	LASSERT (msg->msg_routing);
+	LASSERT (msg->msg_receiving);
+	LASSERT (!msg->msg_sending);
+
+	/* non-lnet_parse callers only receive delayed messages */
+	LASSERT(!do_recv || msg->msg_rx_delayed);
+
+	/* take a peer router credit unless the message already holds one */
+	if (!msg->msg_peerrtrcredit) {
+		LASSERT ((lp->lp_rtrcredits < 0) ==
+			 !list_empty(&lp->lp_rtrq));
+
+		msg->msg_peerrtrcredit = 1;
+		lp->lp_rtrcredits--;
+		/* keep track of the lowest credit count seen */
+		if (lp->lp_rtrcredits < lp->lp_minrtrcredits)
+			lp->lp_minrtrcredits = lp->lp_rtrcredits;
+
+		/* out of peer credits: wait on the peer's router queue */
+		if (lp->lp_rtrcredits < 0) {
+			/* must have checked eager_recv before here */
+			LASSERT(msg->msg_rx_ready_delay);
+			msg->msg_rx_delayed = 1;
+			list_add_tail(&msg->msg_list, &lp->lp_rtrq);
+			return EAGAIN;
+		}
+	}
+
+	rbp = lnet_msg2bufpool(msg);
+
+	/* take a buffer pool credit unless the message already holds one */
+	if (!msg->msg_rtrcredit) {
+		LASSERT ((rbp->rbp_credits < 0) ==
+			 !list_empty(&rbp->rbp_msgs));
+
+		msg->msg_rtrcredit = 1;
+		rbp->rbp_credits--;
+		/* keep track of the lowest credit count seen */
+		if (rbp->rbp_credits < rbp->rbp_mincredits)
+			rbp->rbp_mincredits = rbp->rbp_credits;
+
+		/* out of buffers: wait on the pool's message list */
+		if (rbp->rbp_credits < 0) {
+			/* must have checked eager_recv before here */
+			LASSERT(msg->msg_rx_ready_delay);
+			msg->msg_rx_delayed = 1;
+			list_add_tail(&msg->msg_list, &rbp->rbp_msgs);
+			return EAGAIN;
+		}
+	}
+
+	/* both credits held: take the first free buffer of the pool and
+	 * use its pages as the message's fragment list */
+	LASSERT (!list_empty(&rbp->rbp_bufs));
+	rb = list_entry(rbp->rbp_bufs.next, lnet_rtrbuf_t, rb_list);
+	list_del(&rb->rb_list);
+
+	msg->msg_niov = rbp->rbp_npages;
+	msg->msg_kiov = &rb->rb_kiov[0];
+
+	/* receive now if asked to, without the net lock */
+	if (do_recv) {
+		int cpt = msg->msg_rx_cpt;
+
+		lnet_net_unlock(cpt);
+		lnet_ni_recv(lp->lp_ni, msg->msg_private, msg, 1,
+			     0, msg->msg_len, msg->msg_len);
+		lnet_net_lock(cpt);
+	}
+	return 0;
+}
+
+/*
+ * Give back the tx credits held by \a msg (the NI credit and the peer
+ * credit, whichever are held) and drop the message's reference on its
+ * tx peer.
+ *
+ * Whenever a returned credit leaves the count at or below zero, the first
+ * message waiting for that credit is taken off its queue and posted with
+ * lnet_post_send_locked(msg2, 1), which may drop and re-take the net lock.
+ *
+ * NOTE(review): msg_txpeer is dereferenced whenever one of the credit
+ * flags is set, so those flags presumably imply a non-NULL msg_txpeer -
+ * confirm against the callers.
+ */
+void
+lnet_return_tx_credits_locked(lnet_msg_t *msg)
+{
+	lnet_peer_t	*txpeer = msg->msg_txpeer;
+	lnet_msg_t	*msg2;
+
+	if (msg->msg_txcredit) {
+		struct lnet_ni	     *ni = txpeer->lp_ni;
+		struct lnet_tx_queue *tq = ni->ni_tx_queues[msg->msg_tx_cpt];
+
+		/* give back NI txcredits */
+		msg->msg_txcredit = 0;
+
+		LASSERT((tq->tq_credits < 0) ==
+			!list_empty(&tq->tq_delayed));
+
+		tq->tq_credits++;
+		/* still not positive => somebody is waiting for this
+		 * credit: hand it to the first delayed message */
+		if (tq->tq_credits <= 0) {
+			msg2 = list_entry(tq->tq_delayed.next,
+					      lnet_msg_t, msg_list);
+			list_del(&msg2->msg_list);
+
+			LASSERT(msg2->msg_txpeer->lp_ni == ni);
+			LASSERT(msg2->msg_tx_delayed);
+
+			(void) lnet_post_send_locked(msg2, 1);
+		}
+	}
+
+	if (msg->msg_peertxcredit) {
+		/* give back peer txcredits */
+		msg->msg_peertxcredit = 0;
+
+		LASSERT((txpeer->lp_txcredits < 0) ==
+			!list_empty(&txpeer->lp_txq));
+
+		/* undo the byte accounting done when the credit was taken */
+		txpeer->lp_txqnob -= msg->msg_len + sizeof(lnet_hdr_t);
+		LASSERT (txpeer->lp_txqnob >= 0);
+
+		txpeer->lp_txcredits++;
+		/* still not positive => hand the credit to the first
+		 * message waiting on this peer */
+		if (txpeer->lp_txcredits <= 0) {
+			msg2 = list_entry(txpeer->lp_txq.next,
+					      lnet_msg_t, msg_list);
+			list_del(&msg2->msg_list);
+
+			LASSERT(msg2->msg_txpeer == txpeer);
+			LASSERT(msg2->msg_tx_delayed);
+
+			(void) lnet_post_send_locked(msg2, 1);
+		}
+	}
+
+	/* finally detach the peer from the message and drop the ref */
+	if (txpeer != NULL) {
+		msg->msg_txpeer = NULL;
+		lnet_peer_decref_locked(txpeer);
+	}
+}
+
+/*
+ * Give back the receive-side (routing) credits held by @msg and release
+ * its reference on the receive peer.
+ *
+ * If @msg holds a router buffer credit (msg_rtrcredit) its buffer is put
+ * back on the pool's free list and the pool credit is returned; if @msg
+ * holds a peer router credit (msg_peerrtrcredit) that is returned too.
+ * In both cases, when the counter is still <= 0 after the increment, the
+ * first waiting message is unlinked and re-posted with
+ * lnet_post_routed_recv_locked().  Finally, if @msg had a receive peer,
+ * msg_rxpeer is cleared and the peer reference is dropped.
+ *
+ * NOTE(review): the "_locked" suffix suggests the caller holds the LNet
+ * net lock -- confirm against callers.
+ */
+void
+lnet_return_rx_credits_locked(lnet_msg_t *msg)
+{
+	lnet_peer_t	*rxpeer = msg->msg_rxpeer;
+	lnet_msg_t	*msg2;
+
+	if (msg->msg_rtrcredit) {
+		/* give back global router credits */
+		lnet_rtrbuf_t     *rb;
+		lnet_rtrbufpool_t *rbp;
+
+		/* NB If a msg ever blocks for a buffer in rbp_msgs, it stays
+		 * there until it gets one allocated, or aborts the wait
+		 * itself */
+		LASSERT (msg->msg_kiov != NULL);
+
+		/* msg_kiov points at rb_kiov[0] of a router buffer; recover
+		 * the enclosing buffer and, through it, the owning pool */
+		rb = list_entry(msg->msg_kiov, lnet_rtrbuf_t, rb_kiov[0]);
+		rbp = rb->rb_pool;
+		LASSERT (rbp == lnet_msg2bufpool(msg));
+
+		msg->msg_kiov = NULL;
+		msg->msg_rtrcredit = 0;
+
+		/* invariants: negative credits iff messages are waiting,
+		 * positive credits iff free buffers are available */
+		LASSERT((rbp->rbp_credits < 0) ==
+			!list_empty(&rbp->rbp_msgs));
+		LASSERT((rbp->rbp_credits > 0) ==
+			!list_empty(&rbp->rbp_bufs));
+
+		/* return the buffer to the pool before waking a waiter so
+		 * the re-posted message finds it on rbp_bufs */
+		list_add(&rb->rb_list, &rbp->rbp_bufs);
+		rbp->rbp_credits++;
+		if (rbp->rbp_credits <= 0) {
+			msg2 = list_entry(rbp->rbp_msgs.next,
+					      lnet_msg_t, msg_list);
+			list_del(&msg2->msg_list);
+
+			/* return value deliberately ignored */
+			(void) lnet_post_routed_recv_locked(msg2, 1);
+		}
+	}
+
+	if (msg->msg_peerrtrcredit) {
+		/* give back peer router credits */
+		msg->msg_peerrtrcredit = 0;
+
+		/* invariant: credits are negative iff messages are queued */
+		LASSERT((rxpeer->lp_rtrcredits < 0) ==
+			!list_empty(&rxpeer->lp_rtrq));
+
+		rxpeer->lp_rtrcredits++;
+		if (rxpeer->lp_rtrcredits <= 0) {
+			/* hand the credit to the head of the peer's queue */
+			msg2 = list_entry(rxpeer->lp_rtrq.next,
+					      lnet_msg_t, msg_list);
+			list_del(&msg2->msg_list);
+
+			/* return value deliberately ignored */
+			(void) lnet_post_routed_recv_locked(msg2, 1);
+		}
+	}
+	if (rxpeer != NULL) {
+		/* detach the peer from the message and drop the reference */
+		msg->msg_rxpeer = NULL;
+		lnet_peer_decref_locked(rxpeer);
+	}
+}
+
+/*
+ * Rank route @r1 against route @r2.
+ *
+ * Criteria are applied in order and the first one that differs decides:
+ * fewer hops, then fewer bytes queued on the gateway (lp_txqnob), then
+ * more gateway tx credits, and finally the route sequence numbers.
+ *
+ * Returns 1 when @r1 is at least as good as @r2, -1 otherwise.
+ */
+static int
+lnet_compare_routes(lnet_route_t *r1, lnet_route_t *r2)
+{
+	lnet_peer_t *gw1 = r1->lr_gateway;
+	lnet_peer_t *gw2 = r2->lr_gateway;
+
+	/* shorter path wins */
+	if (r1->lr_hops != r2->lr_hops)
+		return (r1->lr_hops < r2->lr_hops) ? 1 : -1;
+
+	/* less data already queued on the gateway wins */
+	if (gw1->lp_txqnob != gw2->lp_txqnob)
+		return (gw1->lp_txqnob < gw2->lp_txqnob) ? 1 : -1;
+
+	/* more spare tx credits on the gateway wins */
+	if (gw1->lp_txcredits != gw2->lp_txcredits)
+		return (gw1->lp_txcredits > gw2->lp_txcredits) ? 1 : -1;
+
+	/* all else equal: the route with the not-later sequence wins */
+	return (r1->lr_seq - r2->lr_seq <= 0) ? 1 : -1;
+}
+
+/*
+ * Pick the gateway peer to use for reaching @target.
+ *
+ * Walks the routes of the remote net that @target belongs to, skipping
+ * gateways that are not alive, routes whose gateway reports NI status and
+ * has down NIs, and (when @ni is not NULL) gateways not on @ni.  A usable
+ * gateway whose NID equals @rtr_nid is returned immediately; otherwise the
+ * best candidate according to lnet_compare_routes() is returned.
+ *
+ * Returns NULL when the remote net is unknown or no gateway is usable.
+ */
+static lnet_peer_t *
+lnet_find_route_locked(lnet_ni_t *ni, lnet_nid_t target, lnet_nid_t rtr_nid)
+{
+	lnet_remotenet_t	*rnet;
+	lnet_route_t		*route;
+	lnet_route_t		*best = NULL;
+	lnet_route_t		*newest = NULL;
+	struct lnet_peer	*gw_best = NULL;
+
+	rnet = lnet_find_net_locked(LNET_NIDNET(target));
+	if (rnet == NULL)
+		return NULL;
+
+	list_for_each_entry(route, &rnet->lrn_routes, lr_list) {
+		struct lnet_peer *gw = route->lr_gateway;
+
+		/* gateway is down */
+		if (!gw->lp_alive)
+			continue;
+
+		/* NI to target is down */
+		if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0 &&
+		    route->lr_downis != 0)
+			continue;
+
+		/* caller insists on a particular local interface */
+		if (ni != NULL && gw->lp_ni != ni)
+			continue;
+
+		/* the pre-determined router short-circuits the search */
+		if (gw->lp_nid == rtr_nid)
+			return gw;
+
+		/* first usable candidate seeds both trackers */
+		if (gw_best == NULL) {
+			best = route;
+			newest = route;
+			gw_best = gw;
+			continue;
+		}
+
+		/* these fields are read unprotected, which is harmless;
+		 * remember the route carrying the latest sequence number */
+		if (newest->lr_seq - route->lr_seq < 0)
+			newest = route;
+
+		if (lnet_compare_routes(route, best) >= 0) {
+			best = route;
+			gw_best = gw;
+		}
+	}
+
+	/* Stamp the winner with (latest sequence + 1) so successive calls
+	 * round-robin over equally good routers.  This is racy and
+	 * approximate, but harmless and functional. */
+	if (best != NULL)
+		best->lr_seq = newest->lr_seq + 1;
+
+	return gw_best;
+}
+
+/*
+ * Choose the peer to transmit @msg to and start the send.
+ *
+ * @src_nid: NID of the local interface to send from, or LNET_NID_ANY to
+ *	     let this function choose one.
+ * @msg:     message to send; msg_target.nid is the final destination.
+ * @rtr_nid: NID of a pre-determined router, or LNET_NID_ANY.
+ *
+ * If the destination's network has a local NI, the message is sent
+ * directly to the destination peer (the loopback NI the_lnet.ln_loni
+ * bypasses credit handling altogether).  Otherwise a gateway is chosen
+ * with lnet_find_route_locked() and the message target is rewritten to
+ * that gateway.  The chosen peer becomes msg_txpeer, and the message is
+ * posted with lnet_post_send_locked(); lnet_ni_send() is called only if
+ * that returned 0.
+ *
+ * Returns:
+ *   0              the send was started, or lnet_post_send_locked()
+ *                  returned a non-zero value other than EHOSTUNREACH
+ *                  (presumably the message was queued -- confirm)
+ *   -ESHUTDOWN     the_lnet.ln_shutdown is set
+ *   -EINVAL        @src_nid is not a local NID, or it names an NI other
+ *                  than the local NI on the destination's network
+ *   -EHOSTUNREACH  no usable route, or lnet_post_send_locked() returned
+ *                  EHOSTUNREACH
+ *   other          non-zero value from lnet_nid2peer_locked()
+ */
+int
+lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid)
+{
+	lnet_nid_t		dst_nid = msg->msg_target.nid;
+	struct lnet_ni		*src_ni;
+	struct lnet_ni		*local_ni;
+	struct lnet_peer	*lp;
+	int			cpt;
+	int			cpt2;
+	int			rc;
+
+	/* NB: rtr_nid is set to LNET_NID_ANY for all current use-cases,
+	 * but we might want to use pre-determined router for ACK/REPLY
+	 * in the future */
+	/* NB: ni != NULL == interface pre-determined (ACK/REPLY) */
+	LASSERT (msg->msg_txpeer == NULL);
+	LASSERT (!msg->msg_sending);
+	LASSERT (!msg->msg_target_is_router);
+	LASSERT (!msg->msg_receiving);
+
+	msg->msg_sending = 1;
+
+	LASSERT(!msg->msg_tx_committed);
+	/* lock the partition of the router if one was given, else that of
+	 * the destination */
+	cpt = lnet_cpt_of_nid(rtr_nid == LNET_NID_ANY ? dst_nid : rtr_nid);
+ again:
+	lnet_net_lock(cpt);
+
+	if (the_lnet.ln_shutdown) {
+		lnet_net_unlock(cpt);
+		return -ESHUTDOWN;
+	}
+
+	if (src_nid == LNET_NID_ANY) {
+		src_ni = NULL;
+	} else {
+		/* the lookup's reference on src_ni is dropped on every path
+		 * below */
+		src_ni = lnet_nid2ni_locked(src_nid, cpt);
+		if (src_ni == NULL) {
+			lnet_net_unlock(cpt);
+			LCONSOLE_WARN("Can't send to %s: src %s is not a "
+				      "local nid\n", libcfs_nid2str(dst_nid),
+				      libcfs_nid2str(src_nid));
+			return -EINVAL;
+		}
+		LASSERT (!msg->msg_routing);
+	}
+
+	/* Is this for someone on a local network? */
+	local_ni = lnet_net2ni_locked(LNET_NIDNET(dst_nid), cpt);
+
+	if (local_ni != NULL) {
+		if (src_ni == NULL) {
+			/* no source requested: adopt the local NI together
+			 * with the reference the lookup took */
+			src_ni = local_ni;
+			src_nid = src_ni->ni_nid;
+		} else if (src_ni == local_ni) {
+			/* same NI looked up twice: keep one reference */
+			lnet_ni_decref_locked(local_ni, cpt);
+		} else {
+			/* requested source NI is not the NI on the
+			 * destination's network */
+			lnet_ni_decref_locked(local_ni, cpt);
+			lnet_ni_decref_locked(src_ni, cpt);
+			lnet_net_unlock(cpt);
+			/* NOTE(review): "via from" in the message below
+			 * looks like a wording slip -- confirm the intended
+			 * text before changing it */
+			LCONSOLE_WARN("No route to %s via from %s\n",
+				      libcfs_nid2str(dst_nid),
+				      libcfs_nid2str(src_nid));
+			return -EINVAL;
+		}
+
+		LASSERT(src_nid != LNET_NID_ANY);
+		lnet_msg_commit(msg, cpt);
+
+		if (!msg->msg_routing)
+			msg->msg_hdr.src_nid = cpu_to_le64(src_nid);
+
+		if (src_ni == the_lnet.ln_loni) {
+			/* No send credit hassles with LOLND */
+			lnet_net_unlock(cpt);
+			lnet_ni_send(src_ni, msg);
+
+			/* retake the lock only to drop the NI reference */
+			lnet_net_lock(cpt);
+			lnet_ni_decref_locked(src_ni, cpt);
+			lnet_net_unlock(cpt);
+			return 0;
+		}
+
+		rc = lnet_nid2peer_locked(&lp, dst_nid, cpt);
+		/* lp has ref on src_ni; lose mine */
+		lnet_ni_decref_locked(src_ni, cpt);
+		if (rc != 0) {
+			lnet_net_unlock(cpt);
+			LCONSOLE_WARN("Error %d finding peer %s\n", rc,
+				      libcfs_nid2str(dst_nid));
+			/* ENOMEM or shutting down */
+			return rc;
+		}
+		LASSERT (lp->lp_ni == src_ni);
+	} else {
+		/* sending to a remote network */
+		lp = lnet_find_route_locked(src_ni, dst_nid, rtr_nid);
+		if (lp == NULL) {
+			if (src_ni != NULL)
+				lnet_ni_decref_locked(src_ni, cpt);
+			lnet_net_unlock(cpt);
+
+			LCONSOLE_WARN("No route to %s via %s "
+				      "(all routers down)\n",
+				      libcfs_id2str(msg->msg_target),
+				      libcfs_nid2str(src_nid));
+			return -EHOSTUNREACH;
+		}
+
+		/* rtr_nid is LNET_NID_ANY or NID of pre-determined router,
+		 * it's possible that rtr_nid isn't LNET_NID_ANY and lp isn't
+		 * pre-determined router, this can happen if router table
+		 * was changed when we release the lock */
+		if (rtr_nid != lp->lp_nid) {
+			cpt2 = lnet_cpt_of_nid_locked(lp->lp_nid);
+			if (cpt2 != cpt) {
+				/* the chosen gateway lives in another
+				 * partition: drop everything and retry under
+				 * that partition's lock with the gateway as
+				 * the pre-determined router */
+				if (src_ni != NULL)
+					lnet_ni_decref_locked(src_ni, cpt);
+				lnet_net_unlock(cpt);
+
+				rtr_nid = lp->lp_nid;
+				cpt = cpt2;
+				goto again;
+			}
+		}
+
+		CDEBUG(D_NET, "Best route to %s via %s for %s %d\n",
+		       libcfs_nid2str(dst_nid), libcfs_nid2str(lp->lp_nid),
+		       lnet_msgtyp2str(msg->msg_type), msg->msg_len);
+
+		if (src_ni == NULL) {
+			/* no source requested: send from the gateway's NI */
+			src_ni = lp->lp_ni;
+			src_nid = src_ni->ni_nid;
+		} else {
+			LASSERT (src_ni == lp->lp_ni);
+			lnet_ni_decref_locked(src_ni, cpt);
+		}
+
+		/* this reference is handed over to msg->msg_txpeer below */
+		lnet_peer_addref_locked(lp);
+
+		LASSERT(src_nid != LNET_NID_ANY);
+		lnet_msg_commit(msg, cpt);
+
+		if (!msg->msg_routing) {
+			/* I'm the source and now I know which NI to send on */
+			msg->msg_hdr.src_nid = cpu_to_le64(src_nid);
+		}
+
+		/* the next hop is the gateway, not the final destination */
+		msg->msg_target_is_router = 1;
+		msg->msg_target.nid = lp->lp_nid;
+		msg->msg_target.pid = LUSTRE_SRV_LNET_PID;
+	}
+
+	/* 'lp' is our best choice of peer */
+
+	LASSERT (!msg->msg_peertxcredit);
+	LASSERT (!msg->msg_txcredit);
+	LASSERT (msg->msg_txpeer == NULL);
+
+	msg->msg_txpeer = lp;		   /* msg takes my ref on lp */
+
+	rc = lnet_post_send_locked(msg, 0);
+	lnet_net_unlock(cpt);
+
+	/* lnet_post_send_locked() reports with positive codes */
+	if (rc == EHOSTUNREACH)
+		return -EHOSTUNREACH;
+
+	/* only start the send now if the post succeeded outright */
+	if (rc == 0)
+		lnet_ni_send(src_ni, msg);
+
+	return 0;
+}
+
+/*
+ * Account for a dropped incoming message of @nob bytes in the counters of
+ * partition @cpt, then call lnet_ni_recv() with a NULL message and zero
+ * wanted length so the LND can dispose of the data.
+ */
+static void
+lnet_drop_message(lnet_ni_t *ni, int cpt, void *private, unsigned int nob)
+{
+	lnet_counters_t *counters;
+
+	/* the drop statistics are updated under the net lock */
+	lnet_net_lock(cpt);
+	counters = the_lnet.ln_counters[cpt];
+	counters->drop_count++;
+	counters->drop_length += nob;
+	lnet_net_unlock(cpt);
+
+	/* no message, nothing wanted: all @nob bytes are discarded */
+	lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob);
+}
+
+/*
+ * Start receiving the payload of a matched PUT.
+ *
+ * Sets up the payload buffer when any data is wanted, builds the PUT
+ * event, records whether an ACK must be sent, and then hands the message
+ * to lnet_ni_recv().
+ */
+static void
+lnet_recv_put(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+	lnet_hdr_t	*put_hdr = &msg->msg_hdr;
+	int		want_ack;
+
+	if (msg->msg_wanted)
+		lnet_setpayloadbuffer(msg);
+
+	lnet_build_msg_event(msg, LNET_EVENT_PUT);
+
+	/* An ACK is owed only when the sender supplied an ack handle and
+	 * the MD has not disabled ACKs.  The handle stays in the header and
+	 * is put back into the ACK during lnet_finalize(). */
+	if (lnet_is_wire_handle_none(&put_hdr->msg.put.ack_wmd))
+		want_ack = 0;
+	else if ((msg->msg_md->md_options & LNET_MD_ACK_DISABLE) != 0)
+		want_ack = 0;
+	else
+		want_ack = 1;
+	msg->msg_ack = want_ack;
+
+	lnet_ni_recv(ni, msg->msg_private, msg, msg->msg_rx_delayed,
+		     msg->msg_offset, msg->msg_wanted,
+		     put_hdr->payload_length);
+}
+
+/*
+ * Handle an incoming PUT addressed to this node.
+ *
+ * Converts the PUT-specific header fields to host byte order in place and
+ * tries to match the message against an MD.  On a match the payload
+ * receive is started.  With no match, the message is either left on a
+ * delayed list (when msg_rx_delayed is set) or an eager receive is
+ * attempted and the match retried.
+ *
+ * Returns 0 when the message was consumed or delayed, and the positive
+ * value ENOENT when it has to be dropped.
+ */
+static int
+lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+	struct lnet_match_info	info;
+	lnet_hdr_t		*hdr = &msg->msg_hdr;
+	int			rc;
+
+	/* Convert put fields to host byte order */
+	hdr->msg.put.match_bits	= le64_to_cpu(hdr->msg.put.match_bits);
+	hdr->msg.put.ptl_index	= le32_to_cpu(hdr->msg.put.ptl_index);
+	hdr->msg.put.offset	= le32_to_cpu(hdr->msg.put.offset);
+
+	/* describe the request for the matching code */
+	info.mi_opc	= LNET_MD_OP_PUT;
+	info.mi_id.nid	= hdr->src_nid;
+	info.mi_id.pid	= hdr->src_pid;
+	info.mi_portal	= hdr->msg.put.ptl_index;
+	info.mi_mbits	= hdr->msg.put.match_bits;
+	info.mi_roffset	= hdr->msg.put.offset;
+	info.mi_rlength	= hdr->payload_length;
+
+	/* an LND without an eager-receive hook can always be delayed */
+	msg->msg_rx_ready_delay = (ni->ni_lnd->lnd_eager_recv == NULL);
+
+	for (;;) {
+		rc = lnet_ptl_match_md(&info, msg);
+
+		if (rc != LNET_MATCHMD_NONE && rc != LNET_MATCHMD_DROP) {
+			/* anything but a clean match is a bug */
+			if (rc != LNET_MATCHMD_OK)
+				LBUG();
+
+			lnet_recv_put(ni, msg);
+			return 0;
+		}
+
+		if (rc == LNET_MATCHMD_DROP)
+			break;
+
+		/* LNET_MATCHMD_NONE from here on */
+		if (msg->msg_rx_delayed) /* attached on delayed list */
+			return 0;
+
+		/* retry the match only if the eager receive succeeded */
+		rc = lnet_ni_eager_recv(ni, msg);
+		if (rc != 0)
+			break;
+	}
+
+	CNETERR("Dropping PUT from %s portal %d match "LPU64
+		" offset %d length %d: %d\n",
+		libcfs_id2str(info.mi_id), info.mi_portal,
+		info.mi_mbits, info.mi_roffset, info.mi_rlength, rc);
+
+	return ENOENT;	/* +ve: OK but no match */
+}
+
+/*
+ * Handle an incoming GET addressed to this node.
+ *
+ * Converts the GET-specific header fields to host byte order in place and
+ * matches the request against an MD.  On a match the same message is
+ * turned into the REPLY.  When @rdma_get is set the message is handed to
+ * lnet_ni_recv() and the LND completes the REPLY itself; otherwise the
+ * receive is ended with a NULL message and the REPLY is sent with
+ * lnet_send().
+ *
+ * Returns 0 when the GET was handled (including the case where sending
+ * the REPLY failed and the message was finalized here), and the positive
+ * value ENOENT when nothing matched and the message must be dropped.
+ */
+static int
+lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get)
+{
+	struct lnet_match_info	info;
+	lnet_hdr_t		*hdr = &msg->msg_hdr;
+	lnet_handle_wire_t	reply_wmd;
+	int			rc;
+
+	/* Convert get fields to host byte order */
+	hdr->msg.get.match_bits	  = le64_to_cpu(hdr->msg.get.match_bits);
+	hdr->msg.get.ptl_index	  = le32_to_cpu(hdr->msg.get.ptl_index);
+	hdr->msg.get.sink_length  = le32_to_cpu(hdr->msg.get.sink_length);
+	hdr->msg.get.src_offset	  = le32_to_cpu(hdr->msg.get.src_offset);
+
+	/* describe the request for the matching code */
+	info.mi_id.nid	= hdr->src_nid;
+	info.mi_id.pid	= hdr->src_pid;
+	info.mi_opc	= LNET_MD_OP_GET;
+	info.mi_portal	= hdr->msg.get.ptl_index;
+	info.mi_rlength	= hdr->msg.get.sink_length;
+	info.mi_roffset	= hdr->msg.get.src_offset;
+	info.mi_mbits	= hdr->msg.get.match_bits;
+
+	rc = lnet_ptl_match_md(&info, msg);
+	if (rc == LNET_MATCHMD_DROP) {
+		CNETERR("Dropping GET from %s portal %d match "LPU64
+			" offset %d length %d\n",
+			libcfs_id2str(info.mi_id), info.mi_portal,
+			info.mi_mbits, info.mi_roffset, info.mi_rlength);
+		return ENOENT;	/* +ve: OK but no match */
+	}
+
+	/* unlike PUT, a GET is never left unmatched */
+	LASSERT(rc == LNET_MATCHMD_OK);
+
+	lnet_build_msg_event(msg, LNET_EVENT_GET);
+
+	/* save the return handle before the header is rewritten */
+	reply_wmd = hdr->msg.get.return_wmd;
+
+	/* reuse the message as the REPLY addressed back to the requester */
+	lnet_prep_send(msg, LNET_MSG_REPLY, info.mi_id,
+		       msg->msg_offset, msg->msg_wanted);
+
+	msg->msg_hdr.msg.reply.dst_wmd = reply_wmd;
+
+	if (rdma_get) {
+		/* The LND completes the REPLY from her recv procedure */
+		lnet_ni_recv(ni, msg->msg_private, msg, 0,
+			     msg->msg_offset, msg->msg_len, msg->msg_len);
+		return 0;
+	}
+
+	/* a GET carries no payload: end the receive with a NULL message,
+	 * then switch the message over to the send side */
+	lnet_ni_recv(ni, msg->msg_private, NULL, 0, 0, 0, 0);
+	msg->msg_receiving = 0;
+
+	rc = lnet_send(ni->ni_nid, msg, LNET_NID_ANY);
+	if (rc < 0) {
+		/* didn't get as far as lnet_ni_send() */
+		CERROR("%s: Unable to send REPLY for GET from %s: %d\n",
+		       libcfs_nid2str(ni->ni_nid),
+		       libcfs_id2str(info.mi_id), rc);
+
+		lnet_finalize(ni, msg, rc);
+	}
+
+	return 0;
+}
+
+/*
+ * Handle an incoming REPLY addressed to this node.
+ *
+ * Looks up the MD named by the wire handle in the header under the
+ * resource lock of the handle's partition.  The REPLY is rejected when the
+ * MD is unknown, has a zero threshold, is attached to a match entry, or is
+ * too small for the payload without LNET_MD_TRUNCATE.  Otherwise the MD is
+ * attached to the message, the REPLY event is built and the payload
+ * receive is started.
+ *
+ * Returns 0 when the receive was started and the positive value ENOENT
+ * when the message must be dropped.
+ */
+static int
+lnet_parse_reply(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+	void	     *private = msg->msg_private;
+	lnet_hdr_t       *hdr = &msg->msg_hdr;
+	lnet_process_id_t src = {0};
+	lnet_libmd_t     *md;
+	int	       rlength;
+	int	       mlength;
+	int			cpt;
+
+	/* the partition is derived from the object cookie in the handle */
+	cpt = lnet_cpt_of_cookie(hdr->msg.reply.dst_wmd.wh_object_cookie);
+	lnet_res_lock(cpt);
+
+	src.nid = hdr->src_nid;
+	src.pid = hdr->src_pid;
+
+	/* NB handles only looked up by creator (no flips) */
+	md = lnet_wire_handle2md(&hdr->msg.reply.dst_wmd);
+	if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+		CNETERR("%s: Dropping REPLY from %s for %s "
+			"MD "LPX64"."LPX64"\n",
+			libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+			(md == NULL) ? "invalid" : "inactive",
+			hdr->msg.reply.dst_wmd.wh_interface_cookie,
+			hdr->msg.reply.dst_wmd.wh_object_cookie);
+		if (md != NULL && md->md_me != NULL)
+			CERROR("REPLY MD also attached to portal %d\n",
+			       md->md_me->me_portal);
+
+		lnet_res_unlock(cpt);
+		return ENOENT;		  /* +ve: OK but no match */
+	}
+
+	LASSERT (md->md_offset == 0);
+
+	/* rlength: bytes sent; mlength: bytes that fit into the MD */
+	rlength = hdr->payload_length;
+	mlength = MIN(rlength, (int)md->md_length);
+
+	if (mlength < rlength &&
+	    (md->md_options & LNET_MD_TRUNCATE) == 0) {
+		CNETERR("%s: Dropping REPLY from %s length %d "
+			"for MD "LPX64" would overflow (%d)\n",
+			libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+			rlength, hdr->msg.reply.dst_wmd.wh_object_cookie,
+			mlength);
+		lnet_res_unlock(cpt);
+		return ENOENT;	  /* +ve: OK but no match */
+	}
+
+	CDEBUG(D_NET, "%s: Reply from %s of length %d/%d into md "LPX64"\n",
+	       libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+	       mlength, rlength, hdr->msg.reply.dst_wmd.wh_object_cookie);
+
+	lnet_msg_attach_md(msg, md, 0, mlength);
+
+	/* a payload buffer is only needed when data will be kept */
+	if (mlength != 0)
+		lnet_setpayloadbuffer(msg);
+
+	lnet_res_unlock(cpt);
+
+	lnet_build_msg_event(msg, LNET_EVENT_REPLY);
+
+	/* keep mlength bytes out of the rlength bytes on the wire */
+	lnet_ni_recv(ni, private, msg, 0, 0, mlength, rlength);
+	return 0;
+}
+
+/*
+ * Handle an incoming ACK addressed to this node.
+ *
+ * Converts the ACK-specific header fields to host byte order in place and
+ * looks up the MD named by the wire handle under the resource lock of the
+ * handle's partition.  The ACK is rejected when the MD is unknown, has a
+ * zero threshold, or is attached to a match entry.  Otherwise the MD is
+ * attached to the message, the ACK event is built and the message is
+ * handed to lnet_ni_recv() with nothing wanted.
+ *
+ * Returns 0 when the ACK was accepted and the positive value ENOENT when
+ * the message must be dropped.
+ */
+static int
+lnet_parse_ack(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+	lnet_hdr_t       *hdr = &msg->msg_hdr;
+	lnet_process_id_t src = {0};
+	lnet_libmd_t     *md;
+	int			cpt;
+
+	src.nid = hdr->src_nid;
+	src.pid = hdr->src_pid;
+
+	/* Convert ack fields to host byte order */
+	hdr->msg.ack.match_bits = le64_to_cpu(hdr->msg.ack.match_bits);
+	hdr->msg.ack.mlength = le32_to_cpu(hdr->msg.ack.mlength);
+
+	/* the partition is derived from the object cookie in the handle */
+	cpt = lnet_cpt_of_cookie(hdr->msg.ack.dst_wmd.wh_object_cookie);
+	lnet_res_lock(cpt);
+
+	/* NB handles only looked up by creator (no flips) */
+	md = lnet_wire_handle2md(&hdr->msg.ack.dst_wmd);
+	if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+		/* Don't moan; this is expected */
+		CDEBUG(D_NET,
+		       "%s: Dropping ACK from %s to %s MD "LPX64"."LPX64"\n",
+		       libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+		       (md == NULL) ? "invalid" : "inactive",
+		       hdr->msg.ack.dst_wmd.wh_interface_cookie,
+		       hdr->msg.ack.dst_wmd.wh_object_cookie);
+		if (md != NULL && md->md_me != NULL)
+			CERROR("Source MD also attached to portal %d\n",
+			       md->md_me->me_portal);
+
+		lnet_res_unlock(cpt);
+		return ENOENT;		  /* +ve! */
+	}
+
+	CDEBUG(D_NET, "%s: ACK from %s into md "LPX64"\n",
+	       libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+	       hdr->msg.ack.dst_wmd.wh_object_cookie);
+
+	/* an ACK carries no data for the MD: offset 0, length 0 */
+	lnet_msg_attach_md(msg, md, 0, 0);
+
+	lnet_res_unlock(cpt);
+
+	lnet_build_msg_event(msg, LNET_EVENT_ACK);
+
+	/* nothing is wanted from the msg_len bytes on the wire */
+	lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, 0, msg->msg_len);
+	return 0;
+}
+
+/*
+ * Post the receive for a message that is being routed through this node.
+ *
+ * When either the sending peer's router credits or the buffer pool's
+ * credits are exhausted, the message is first prepared for a delay:
+ * marked ready-to-delay if the LND has no eager-receive hook, otherwise
+ * received eagerly with the net lock dropped around the call.
+ *
+ * Returns the non-zero result of lnet_ni_eager_recv() if that failed,
+ * otherwise the result of lnet_post_routed_recv_locked().
+ */
+static int
+lnet_parse_forward_locked(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+	int	eager_rc;
+	int	out_of_credits;
+
+	out_of_credits = msg->msg_rxpeer->lp_rtrcredits <= 0 ||
+			 lnet_msg2bufpool(msg)->rbp_credits <= 0;
+
+	if (out_of_credits) {
+		if (ni->ni_lnd->lnd_eager_recv == NULL) {
+			/* no eager hook: the LND can simply wait */
+			msg->msg_rx_ready_delay = 1;
+		} else {
+			/* the eager receive runs without the net lock */
+			lnet_net_unlock(msg->msg_rx_cpt);
+			eager_rc = lnet_ni_eager_recv(ni, msg);
+			lnet_net_lock(msg->msg_rx_cpt);
+
+			if (eager_rc != 0)
+				return eager_rc;
+		}
+	}
+
+	return lnet_post_routed_recv_locked(msg, 0);
+}
+
+/*
+ * Map an LNet message type to a printable name.
+ *
+ * Returns a pointer to a string literal; unrecognised types yield
+ * "<UNKNOWN>".
+ */
+char *
+lnet_msgtyp2str (int type)
+{
+	char *name = "<UNKNOWN>";
+
+	switch (type) {
+	case LNET_MSG_ACK:
+		name = "ACK";
+		break;
+	case LNET_MSG_PUT:
+		name = "PUT";
+		break;
+	case LNET_MSG_GET:
+		name = "GET";
+		break;
+	case LNET_MSG_REPLY:
+		name = "REPLY";
+		break;
+	case LNET_MSG_HELLO:
+		name = "HELLO";
+		break;
+	default:
+		break;
+	}
+
+	return name;
+}
+EXPORT_SYMBOL(lnet_msgtyp2str);
+
+/*
+ * Dump an LNet message header to the log with CWARN: its address and
+ * type, the source and destination process ids, and the fields specific
+ * to the message type.  Header fields are printed as stored, without any
+ * byte-order conversion.
+ */
+void
+lnet_print_hdr(lnet_hdr_t * hdr)
+{
+	lnet_process_id_t from = { .nid = hdr->src_nid,
+				   .pid = hdr->src_pid };
+	lnet_process_id_t to = { .nid = hdr->dest_nid,
+				 .pid = hdr->dest_pid };
+	char *name = lnet_msgtyp2str (hdr->type);
+
+	CWARN("P3 Header at %p of type %s\n", hdr, name);
+	CWARN("    From %s\n", libcfs_id2str(from));
+	CWARN("    To   %s\n", libcfs_id2str(to));
+
+	/* type-specific part; other types have nothing more to show */
+	switch (hdr->type) {
+	case LNET_MSG_ACK:
+		CWARN("    dst md "LPX64"."LPX64", "
+		      "manipulated length %d\n",
+		      hdr->msg.ack.dst_wmd.wh_interface_cookie,
+		      hdr->msg.ack.dst_wmd.wh_object_cookie,
+		      hdr->msg.ack.mlength);
+		break;
+
+	case LNET_MSG_PUT:
+		CWARN("    Ptl index %d, ack md "LPX64"."LPX64", "
+		      "match bits "LPU64"\n",
+		      hdr->msg.put.ptl_index,
+		      hdr->msg.put.ack_wmd.wh_interface_cookie,
+		      hdr->msg.put.ack_wmd.wh_object_cookie,
+		      hdr->msg.put.match_bits);
+		CWARN("    Length %d, offset %d, hdr data "LPX64"\n",
+		      hdr->payload_length, hdr->msg.put.offset,
+		      hdr->msg.put.hdr_data);
+		break;
+
+	case LNET_MSG_GET:
+		CWARN("    Ptl index %d, return md "LPX64"."LPX64", "
+		      "match bits "LPU64"\n", hdr->msg.get.ptl_index,
+		      hdr->msg.get.return_wmd.wh_interface_cookie,
+		      hdr->msg.get.return_wmd.wh_object_cookie,
+		      hdr->msg.get.match_bits);
+		CWARN("    Length %d, src offset %d\n",
+		      hdr->msg.get.sink_length,
+		      hdr->msg.get.src_offset);
+		break;
+
+	case LNET_MSG_REPLY:
+		CWARN("    dst md "LPX64"."LPX64", "
+		      "length %d\n",
+		      hdr->msg.reply.dst_wmd.wh_interface_cookie,
+		      hdr->msg.reply.dst_wmd.wh_object_cookie,
+		      hdr->payload_length);
+		break;
+
+	default:
+		break;
+	}
+}
+
+/*
+ * Entry point for an LND that has received a message header.
+ *
+ * @ni:       interface the message arrived on
+ * @hdr:      message header in wire (little-endian) byte order
+ * @from_nid: NID of the immediate sender (the previous hop)
+ * @private:  opaque LND cookie handed back through lnet_ni_recv()
+ * @rdma_req: non-zero when the LND offers an optimized (RDMA) GET
+ *
+ * Validates the header, allocates and commits a message, and then either
+ * forwards it (destination is not @ni) or dispatches it to the handler
+ * for its type.
+ *
+ * Returns -EPROTO for a header that fails validation (bad type, bad
+ * payload length, bad destination); in that case lnd_recv() is NOT called
+ * back.  Returns 0 in every other case, including when the message is
+ * dropped here: the LND is then always called back through
+ * lnet_ni_recv().
+ */
+int
+lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
+	   void *private, int rdma_req)
+{
+	int		rc = 0;
+	int		cpt;
+	int		for_me;
+	struct lnet_msg	*msg;
+	lnet_pid_t     dest_pid;
+	lnet_nid_t     dest_nid;
+	lnet_nid_t     src_nid;
+	__u32	  payload_length;
+	__u32	  type;
+
+	LASSERT (!in_interrupt ());
+
+	/* pull the common fields out of the wire header */
+	type = le32_to_cpu(hdr->type);
+	src_nid = le64_to_cpu(hdr->src_nid);
+	dest_nid = le64_to_cpu(hdr->dest_nid);
+	dest_pid = le32_to_cpu(hdr->dest_pid);
+	payload_length = le32_to_cpu(hdr->payload_length);
+
+	for_me = (ni->ni_nid == dest_nid);
+	cpt = lnet_cpt_of_nid(from_nid);
+
+	/* NB payload_length is unsigned and is printed with %u below, so
+	 * an oversized length is not logged as a negative number */
+	switch (type) {
+	case LNET_MSG_ACK:
+	case LNET_MSG_GET:
+		if (payload_length > 0) {
+			CERROR("%s, src %s: bad %s payload %u (0 expected)\n",
+			       libcfs_nid2str(from_nid),
+			       libcfs_nid2str(src_nid),
+			       lnet_msgtyp2str(type), payload_length);
+			return -EPROTO;
+		}
+		break;
+
+	case LNET_MSG_PUT:
+	case LNET_MSG_REPLY:
+		/* messages to be forwarded are limited to LNET_MTU */
+		if (payload_length > (__u32)(for_me ? LNET_MAX_PAYLOAD : LNET_MTU)) {
+			CERROR("%s, src %s: bad %s payload %u "
+			       "(%d max expected)\n",
+			       libcfs_nid2str(from_nid),
+			       libcfs_nid2str(src_nid),
+			       lnet_msgtyp2str(type),
+			       payload_length,
+			       for_me ? LNET_MAX_PAYLOAD : LNET_MTU);
+			return -EPROTO;
+		}
+		break;
+
+	default:
+		CERROR("%s, src %s: Bad message type 0x%x\n",
+		       libcfs_nid2str(from_nid),
+		       libcfs_nid2str(src_nid), type);
+		return -EPROTO;
+	}
+
+	/* when routing, refresh the NI's aliveness at most once a second */
+	if (the_lnet.ln_routing &&
+	    ni->ni_last_alive != cfs_time_current_sec()) {
+		lnet_ni_lock(ni);
+
+		/* NB: so far here is the only place to set NI status to "up" */
+		ni->ni_last_alive = cfs_time_current_sec();
+		if (ni->ni_status != NULL &&
+		    ni->ni_status->ns_status == LNET_NI_STATUS_DOWN)
+			ni->ni_status->ns_status = LNET_NI_STATUS_UP;
+		lnet_ni_unlock(ni);
+	}
+
+	/* Regard a bad destination NID as a protocol error.  Senders should
+	 * know what they're doing; if they don't they're misconfigured, buggy
+	 * or malicious so we chop them off at the knees :) */
+
+	if (!for_me) {
+		if (LNET_NIDNET(dest_nid) == LNET_NIDNET(ni->ni_nid)) {
+			/* should have gone direct */
+			CERROR ("%s, src %s: Bad dest nid %s "
+				"(should have been sent direct)\n",
+				libcfs_nid2str(from_nid),
+				libcfs_nid2str(src_nid),
+				libcfs_nid2str(dest_nid));
+			return -EPROTO;
+		}
+
+		if (lnet_islocalnid(dest_nid)) {
+			/* dest is another local NI; sender should have used
+			 * this node's NID on its own network */
+			CERROR ("%s, src %s: Bad dest nid %s "
+				"(it's my nid but on a different network)\n",
+				libcfs_nid2str(from_nid),
+				libcfs_nid2str(src_nid),
+				libcfs_nid2str(dest_nid));
+			return -EPROTO;
+		}
+
+		if (rdma_req && type == LNET_MSG_GET) {
+			CERROR ("%s, src %s: Bad optimized GET for %s "
+				"(final destination must be me)\n",
+				libcfs_nid2str(from_nid),
+				libcfs_nid2str(src_nid),
+				libcfs_nid2str(dest_nid));
+			return -EPROTO;
+		}
+
+		if (!the_lnet.ln_routing) {
+			CERROR ("%s, src %s: Dropping message for %s "
+				"(routing not enabled)\n",
+				libcfs_nid2str(from_nid),
+				libcfs_nid2str(src_nid),
+				libcfs_nid2str(dest_nid));
+			goto drop;
+		}
+	}
+
+	/* Message looks OK; we're not going to return an error, so we MUST
+	 * call back lnd_recv() come what may... */
+
+	if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */
+	    fail_peer (src_nid, 0))	     /* shall we now? */
+	{
+		CERROR("%s, src %s: Dropping %s to simulate failure\n",
+		       libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+		       lnet_msgtyp2str(type));
+		goto drop;
+	}
+
+	msg = lnet_msg_alloc();
+	if (msg == NULL) {
+		CERROR("%s, src %s: Dropping %s (out of memory)\n",
+		       libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+		       lnet_msgtyp2str(type));
+		goto drop;
+	}
+
+	/* msg zeroed in lnet_msg_alloc; i.e. flags all clear, pointers NULL etc */
+
+	msg->msg_type = type;
+	msg->msg_private = private;
+	msg->msg_receiving = 1;
+	msg->msg_len = msg->msg_wanted = payload_length;
+	msg->msg_offset = 0;
+	msg->msg_hdr = *hdr;
+	/* for building message event */
+	msg->msg_from = from_nid;
+	if (!for_me) {
+		/* a routed message keeps its header in wire byte order;
+		 * only the target needed for forwarding is recorded */
+		msg->msg_target.pid	= dest_pid;
+		msg->msg_target.nid	= dest_nid;
+		msg->msg_routing	= 1;
+
+	} else {
+		/* convert common msg->hdr fields to host byteorder */
+		msg->msg_hdr.type	= type;
+		msg->msg_hdr.src_nid	= src_nid;
+		msg->msg_hdr.src_pid	= le32_to_cpu(msg->msg_hdr.src_pid);
+		msg->msg_hdr.dest_nid	= dest_nid;
+		msg->msg_hdr.dest_pid	= dest_pid;
+		msg->msg_hdr.payload_length = payload_length;
+	}
+
+	lnet_net_lock(cpt);
+	rc = lnet_nid2peer_locked(&msg->msg_rxpeer, from_nid, cpt);
+	if (rc != 0) {
+		lnet_net_unlock(cpt);
+		CERROR("%s, src %s: Dropping %s "
+		       "(error %d looking up sender)\n",
+		       libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+		       lnet_msgtyp2str(type), rc);
+		/* not committed yet, so a plain free is enough */
+		lnet_msg_free(msg);
+		goto drop;
+	}
+
+	lnet_msg_commit(msg, cpt);
+
+	if (!for_me) {
+		rc = lnet_parse_forward_locked(ni, msg);
+		lnet_net_unlock(cpt);
+
+		if (rc < 0)
+			goto free_drop;
+		/* a positive rc means the receive was not posted yet, so
+		 * there is nothing more to do here */
+		if (rc == 0) {
+			lnet_ni_recv(ni, msg->msg_private, msg, 0,
+				     0, payload_length, payload_length);
+		}
+		return 0;
+	}
+
+	lnet_net_unlock(cpt);
+
+	switch (type) {
+	case LNET_MSG_ACK:
+		rc = lnet_parse_ack(ni, msg);
+		break;
+	case LNET_MSG_PUT:
+		rc = lnet_parse_put(ni, msg);
+		break;
+	case LNET_MSG_GET:
+		rc = lnet_parse_get(ni, msg, rdma_req);
+		break;
+	case LNET_MSG_REPLY:
+		rc = lnet_parse_reply(ni, msg);
+		break;
+	default:
+		/* unreachable: the type was validated above */
+		LASSERT(0);
+		rc = -EPROTO;
+		goto free_drop;  /* prevent an unused label if !kernel */
+	}
+
+	if (rc == 0)
+		return 0;
+
+	/* the handlers report "no match" with positive ENOENT */
+	LASSERT (rc == ENOENT);
+
+ free_drop:
+	LASSERT(msg->msg_md == NULL);
+	lnet_finalize(ni, msg, rc);
+
+ drop:
+	lnet_drop_message(ni, cpt, private, payload_length);
+	return 0;
+}
+EXPORT_SYMBOL(lnet_parse);
+
+/*
+ * Drop every delayed PUT queued on @head.
+ *
+ * Each message is unlinked, logged together with @reason, accounted as
+ * dropped through lnet_drop_message() and then finalized with -ENOENT.
+ * The list is empty on return.
+ */
+void
+lnet_drop_delayed_msg_list(struct list_head *head, char *reason)
+{
+	while (!list_empty(head)) {
+		lnet_msg_t		*msg = list_entry(head->next,
+							  lnet_msg_t, msg_list);
+		lnet_process_id_t	id = { .nid = msg->msg_hdr.src_nid,
+					       .pid = msg->msg_hdr.src_pid };
+		struct lnet_peer	*rxpeer;
+
+		list_del(&msg->msg_list);
+
+		/* only unmatched, delayed PUTs with a sender belong here */
+		LASSERT(msg->msg_md == NULL);
+		LASSERT(msg->msg_rx_delayed);
+		LASSERT(msg->msg_rxpeer != NULL);
+		LASSERT(msg->msg_hdr.type == LNET_MSG_PUT);
+
+		CWARN("Dropping delayed PUT from %s portal %d match "LPU64
+		      " offset %d length %d: %s\n",
+		      libcfs_id2str(id),
+		      msg->msg_hdr.msg.put.ptl_index,
+		      msg->msg_hdr.msg.put.match_bits,
+		      msg->msg_hdr.msg.put.offset,
+		      msg->msg_hdr.payload_length, reason);
+
+		/* NB the reference on msg_rxpeer cannot be dropped until
+		 * lnet_drop_message() has been called, so msg is kept
+		 * around until then as well */
+		rxpeer = msg->msg_rxpeer;
+		lnet_drop_message(rxpeer->lp_ni, rxpeer->lp_cpt,
+				  msg->msg_private, msg->msg_len);
+
+		/* NB: no event is generated because no MD is attached, but
+		 * an error code is still passed so lnet_msg_decommit() can
+		 * skip counter updates and other checks */
+		lnet_finalize(msg->msg_rxpeer->lp_ni, msg, -ENOENT);
+	}
+}
+
+/*
+ * Resume every delayed PUT queued on @head.
+ *
+ * Each message is unlinked and, since it now has an MD attached, handed
+ * to lnet_recv_put() on the interface of its receive peer.  The list is
+ * empty on return.
+ */
+void
+lnet_recv_delayed_msg_list(struct list_head *head)
+{
+	while (!list_empty(head)) {
+		lnet_msg_t	  *msg;
+		/* zero-initialised like every other lnet_process_id_t that
+		 * is passed by value to libcfs_id2str() in this file */
+		lnet_process_id_t  id = {0};
+
+		msg = list_entry(head->next, lnet_msg_t, msg_list);
+		list_del(&msg->msg_list);
+
+		/* md won't disappear under me, since each msg
+		 * holds a ref on it */
+
+		id.nid = msg->msg_hdr.src_nid;
+		id.pid = msg->msg_hdr.src_pid;
+
+		/* only matched, delayed PUTs with a sender belong here */
+		LASSERT(msg->msg_rx_delayed);
+		LASSERT(msg->msg_md != NULL);
+		LASSERT(msg->msg_rxpeer != NULL);
+		LASSERT(msg->msg_hdr.type == LNET_MSG_PUT);
+
+		CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d "
+		       "match "LPU64" offset %d length %d.\n",
+			libcfs_id2str(id), msg->msg_hdr.msg.put.ptl_index,
+			msg->msg_hdr.msg.put.match_bits,
+			msg->msg_hdr.msg.put.offset,
+			msg->msg_hdr.payload_length);
+
+		lnet_recv_put(msg->msg_rxpeer->lp_ni, msg);
+	}
+}
+
+/**
+ * Initiate an asynchronous PUT operation.
+ *
+ * There are several events associated with a PUT: completion of the send on
+ * the initiator node (LNET_EVENT_SEND), and when the send completes
+ * successfully, the receipt of an acknowledgment (LNET_EVENT_ACK) indicating
+ * that the operation was accepted by the target. The event LNET_EVENT_PUT is
+ * used at the target node to indicate the completion of incoming data
+ * delivery.
+ *
+ * The local events will be logged in the EQ associated with the MD pointed to
+ * by \a mdh handle. Using a MD without an associated EQ results in these
+ * events being discarded. In this case, the caller must have another
+ * mechanism (e.g., a higher level protocol) for determining when it is safe
+ * to modify the memory region associated with the MD.
+ *
+ * Note that LNet does not guarantee the order of LNET_EVENT_SEND and
+ * LNET_EVENT_ACK, though intuitively ACK should happen after SEND.
+ *
+ * Note also that a failure of lnet_send() is not reported through the return
+ * value: the message is finalized with the error code and 0 is returned.
+ *
+ * \param self Indicates the NID of a local interface through which to send
+ * the PUT request. Use LNET_NID_ANY to let LNet choose one by itself.
+ * \param mdh A handle for the MD that describes the memory to be sent. The MD
+ * must be "free floating" (See LNetMDBind()).
+ * \param ack Controls whether an acknowledgment is requested.
+ * Acknowledgments are only sent when they are requested by the initiating
+ * process and the target MD enables them.
+ * \param target A process identifier for the target process.
+ * \param portal The index in the \a target's portal table.
+ * \param match_bits The match bits to use for MD selection at the target
+ * process.
+ * \param offset The offset into the target MD (only used when the target
+ * MD has the LNET_MD_MANAGE_REMOTE option set).
+ * \param hdr_data 64 bits of user data that can be included in the message
+ * header. This data is written to an event queue entry at the target if an
+ * EQ is present on the matching MD.
+ *
+ * \retval  0      Success, and only in this case events will be generated
+ * and logged to EQ (if it exists).
+ * \retval -EIO    Simulated failure.
+ * \retval -ENOMEM Memory allocation failure.
+ * \retval -ENOENT Invalid MD object.
+ *
+ * \see lnet_event_t::hdr_data and lnet_event_kind_t.
+ */
+int
+LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack,
+	lnet_process_id_t target, unsigned int portal,
+	__u64 match_bits, unsigned int offset,
+	__u64 hdr_data)
+{
+	struct lnet_msg		*msg;
+	struct lnet_libmd	*md;
+	int			cpt;
+	int			rc;
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	/* fault injection: pretend the send to a test peer failed */
+	if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */
+	    fail_peer (target.nid, 1))	  /* shall we now? */
+	{
+		CERROR("Dropping PUT to %s: simulated failure\n",
+		       libcfs_id2str(target));
+		return -EIO;
+	}
+
+	msg = lnet_msg_alloc();
+	if (msg == NULL) {
+		CERROR("Dropping PUT to %s: ENOMEM on lnet_msg_t\n",
+		       libcfs_id2str(target));
+		return -ENOMEM;
+	}
+	/* record whether this send is issued under memory pressure */
+	msg->msg_vmflush = !!memory_pressure_get();
+
+	/* the MD is looked up and attached under the resource lock of the
+	 * partition encoded in the handle's cookie */
+	cpt = lnet_cpt_of_cookie(mdh.cookie);
+	lnet_res_lock(cpt);
+
+	md = lnet_handle2md(&mdh);
+	/* reject an unknown MD, one with a zero threshold, and one that is
+	 * attached to a match entry */
+	if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+		CERROR("Dropping PUT ("LPU64":%d:%s): MD (%d) invalid\n",
+		       match_bits, portal, libcfs_id2str(target),
+		       md == NULL ? -1 : md->md_threshold);
+		if (md != NULL && md->md_me != NULL)
+			CERROR("Source MD also attached to portal %d\n",
+			       md->md_me->me_portal);
+		lnet_res_unlock(cpt);
+
+		lnet_msg_free(msg);
+		return -ENOENT;
+	}
+
+	CDEBUG(D_NET, "LNetPut -> %s\n", libcfs_id2str(target));
+
+	lnet_msg_attach_md(msg, md, 0, 0);
+
+	/* the whole MD is sent, starting at offset 0 */
+	lnet_prep_send(msg, LNET_MSG_PUT, target, 0, md->md_length);
+
+	/* PUT-specific header fields go out in little-endian order;
+	 * hdr_data is stored as given, without conversion */
+	msg->msg_hdr.msg.put.match_bits = cpu_to_le64(match_bits);
+	msg->msg_hdr.msg.put.ptl_index = cpu_to_le32(portal);
+	msg->msg_hdr.msg.put.offset = cpu_to_le32(offset);
+	msg->msg_hdr.msg.put.hdr_data = hdr_data;
+
+	/* NB handles only looked up by creator (no flips) */
+	if (ack == LNET_ACK_REQ) {
+		/* name this MD in the header so the target can ACK to it */
+		msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie =
+			the_lnet.ln_interface_cookie;
+		msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie =
+			md->md_lh.lh_cookie;
+	} else {
+		/* no ACK wanted: send the "none" handle */
+		msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie =
+			LNET_WIRE_HANDLE_COOKIE_NONE;
+		msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie =
+			LNET_WIRE_HANDLE_COOKIE_NONE;
+	}
+
+	lnet_res_unlock(cpt);
+
+	lnet_build_msg_event(msg, LNET_EVENT_SEND);
+
+	rc = lnet_send(self, msg, LNET_NID_ANY);
+	if (rc != 0) {
+		/* the failure is reported by finalizing the message with
+		 * rc, not through the return value */
+		CNETERR( "Error sending PUT to %s: %d\n",
+		       libcfs_id2str(target), rc);
+		lnet_finalize (NULL, msg, rc);
+	}
+
+	/* completion will be signalled by an event */
+	return 0;
+}
+EXPORT_SYMBOL(LNetPut);
+
+lnet_msg_t *
+lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg)
+{
+	/* The LND can DMA direct to the GET md (i.e. no REPLY msg).  This
+	 * returns a msg for the LND to pass to lnet_finalize() when the sink
+	 * data has been received, or NULL (counted as a drop) when no msg
+	 * could be allocated or the MD is inactive (threshold == 0).
+	 * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when
+	 * lnet_finalize() is called on it, so the LND must call this first */
+
+	struct lnet_msg		*msg = lnet_msg_alloc();
+	struct lnet_libmd	*getmd = getmsg->msg_md;
+	lnet_process_id_t	peer_id = getmsg->msg_target;
+	int			cpt;
+
+	LASSERT(!getmsg->msg_target_is_router);
+	LASSERT(!getmsg->msg_routing);
+
+	cpt = lnet_cpt_of_cookie(getmd->md_lh.lh_cookie);
+	lnet_res_lock(cpt);
+
+	LASSERT (getmd->md_refcount > 0);
+
+	if (msg == NULL) {
+		CERROR ("%s: Dropping REPLY from %s: can't allocate msg\n",
+			libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id));
+		goto drop;
+	}
+
+	if (getmd->md_threshold == 0) {
+		CERROR ("%s: Dropping REPLY from %s for inactive MD %p\n",
+			libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id),
+			getmd);
+		goto drop;
+	}
+
+	LASSERT(getmd->md_offset == 0);
+
+	CDEBUG(D_NET, "%s: Reply from %s md %p\n",
+	       libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd);
+
+	/* setup information for lnet_build_msg_event */
+	msg->msg_from = peer_id.nid;
+	msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */
+	msg->msg_hdr.src_nid = peer_id.nid;
+	msg->msg_hdr.payload_length = getmd->md_length;
+	msg->msg_receiving = 1; /* required by lnet_msg_attach_md */
+
+	lnet_msg_attach_md(msg, getmd, getmd->md_offset, getmd->md_length);
+	lnet_res_unlock(cpt);
+
+	cpt = lnet_cpt_of_nid(peer_id.nid);
+
+	lnet_net_lock(cpt);
+	lnet_msg_commit(msg, cpt);
+	lnet_net_unlock(cpt);
+
+	lnet_build_msg_event(msg, LNET_EVENT_REPLY);
+
+	return msg;
+
+ drop:
+	/* both 'goto drop' sites above still hold the res lock: release it */
+	lnet_res_unlock(cpt);
+	cpt = lnet_cpt_of_nid(peer_id.nid);
+	lnet_net_lock(cpt);
+	the_lnet.ln_counters[cpt]->drop_count++;
+	the_lnet.ln_counters[cpt]->drop_length += getmd->md_length;
+	lnet_net_unlock(cpt);
+
+	if (msg != NULL)
+		lnet_msg_free(msg);
+
+	return NULL;
+}
+EXPORT_SYMBOL(lnet_create_reply_msg);
+
+void
+lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *reply, unsigned int len)
+{
+	lnet_event_t	*ev;
+
+	/* The RDMA that elided the REPLY message has completed, so the real
+	 * length is now known.  @ni is not used here. */
+	LASSERT(reply != NULL);
+	LASSERT(reply->msg_type == LNET_MSG_GET);
+	ev = &reply->msg_ev;
+	LASSERT(ev->type == LNET_EVENT_REPLY);
+	/* a trusted peer claiming to have overrun my buffer is fatal */
+	LASSERT(len <= ev->mlength);
+	ev->mlength = len;
+}
+EXPORT_SYMBOL(lnet_set_reply_msg_len);
+
+/**
+ * Initiate an asynchronous GET operation.
+ *
+ * On the initiator node, an LNET_EVENT_SEND is logged when the GET request
+ * is sent, and an LNET_EVENT_REPLY is logged when the data returned from
+ * the target node in the REPLY has been written to local MD.
+ *
+ * On the target node, an LNET_EVENT_GET is logged when the GET request
+ * arrives and is accepted into a MD.
+ *
+ * \param self,target,portal,match_bits,offset See the discussion in LNetPut().
+ * \param mdh A handle for the MD that describes the memory into which the
+ * requested data will be received. The MD must be "free floating" (See LNetMDBind()).
+ *
+ * \retval  0      The GET was queued; events, including the failure of a
+ * send that could not be started, go to the EQ (if any) of the MD.
+ * \retval -EIO    Simulated failure.
+ * \retval -ENOMEM Memory allocation failure.
+ * \retval -ENOENT Invalid MD object.
+ */
+int
+LNetGet(lnet_nid_t self, lnet_handle_md_t mdh,
+	lnet_process_id_t target, unsigned int portal,
+	__u64 match_bits, unsigned int offset)
+{
+	struct lnet_msg		*msg;
+	struct lnet_libmd	*md;
+	int			cpt;
+	int			rc;
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */
+	    fail_peer (target.nid, 1))	  /* shall we now? */
+	{
+		CERROR("Dropping GET to %s: simulated failure\n",
+		       libcfs_id2str(target));
+		return -EIO;
+	}
+
+	msg = lnet_msg_alloc();
+	if (msg == NULL) {
+		CERROR("Dropping GET to %s: ENOMEM on lnet_msg_t\n",
+		       libcfs_id2str(target));
+		return -ENOMEM;
+	}
+
+	cpt = lnet_cpt_of_cookie(mdh.cookie);
+	lnet_res_lock(cpt);
+
+	md = lnet_handle2md(&mdh);
+	if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+		CERROR("Dropping GET ("LPU64":%d:%s): MD (%d) invalid\n",
+		       match_bits, portal, libcfs_id2str(target),
+		       md == NULL ? -1 : md->md_threshold);
+		if (md != NULL && md->md_me != NULL)
+			CERROR("REPLY MD also attached to portal %d\n",
+			       md->md_me->me_portal);
+
+		lnet_res_unlock(cpt);
+
+		lnet_msg_free(msg);
+
+		return -ENOENT;
+	}
+
+	CDEBUG(D_NET, "LNetGet -> %s\n", libcfs_id2str(target));
+
+	lnet_msg_attach_md(msg, md, 0, 0); /* refs the MD, uses one threshold */
+
+	lnet_prep_send(msg, LNET_MSG_GET, target, 0, 0);
+
+	msg->msg_hdr.msg.get.match_bits = cpu_to_le64(match_bits);
+	msg->msg_hdr.msg.get.ptl_index = cpu_to_le32(portal);
+	msg->msg_hdr.msg.get.src_offset = cpu_to_le32(offset);
+	msg->msg_hdr.msg.get.sink_length = cpu_to_le32(md->md_length);
+
+	/* NB handles only looked up by creator (no flips) */
+	msg->msg_hdr.msg.get.return_wmd.wh_interface_cookie =
+		the_lnet.ln_interface_cookie;
+	msg->msg_hdr.msg.get.return_wmd.wh_object_cookie =
+		md->md_lh.lh_cookie;
+
+	lnet_res_unlock(cpt);
+
+	lnet_build_msg_event(msg, LNET_EVENT_SEND);
+
+	rc = lnet_send(self, msg, LNET_NID_ANY);
+	if (rc < 0) {
+		CNETERR( "Error sending GET to %s: %d\n",
+		       libcfs_id2str(target), rc);
+		lnet_finalize (NULL, msg, rc);
+	}
+
+	/* completion (or the failure above) will be signalled by an event */
+	return 0;
+}
+EXPORT_SYMBOL(LNetGet);
+
+/**
+ * Work out how far away the node at \a dstnid is.
+ *
+ * \param dstnid NID to measure the distance to.
+ * \param srcnidp If not NULL, receives the NID of the local interface used
+ * to reach \a dstnid.
+ * \param orderp If not NULL, receives the order of the route used to reach
+ * \a dstnid.
+ *
+ * \retval 0 \a dstnid is the NID of a local interface and the reserved
+ * option local_nid_dist_zero is set, which is the default.
+ * \retval positives Distance to \a dstnid, i.e. number of hops plus one.
+ * \retval -EHOSTUNREACH \a dstnid is not reachable.
+ */
+int
+LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp)
+{
+	__u32			dstnet = LNET_NIDNET(dstnid);
+	__u32			order = 2;
+	struct list_head	*rn_list;
+	struct lnet_ni		*ni;
+	lnet_remotenet_t	*rnet;
+	lnet_route_t		*route;
+	lnet_route_t		*shortest;
+	int			dist;
+	int			cpt;
+
+	/* Orders 0 and 1 are kept free: 0 for 0@lo and 1 for a local NID
+	 * match.  With !local_nid_dist_zero a distance of 0 is never
+	 * returned (when lustre sees a distance of 0, it substitutes
+	 * 0@lo). */
+
+	LASSERT(the_lnet.ln_init);
+	LASSERT(the_lnet.ln_refcount > 0);
+
+	cpt = lnet_net_lock_current();
+
+	/* first pass: my own interfaces */
+	list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+		if (ni->ni_nid == dstnid) {
+			/* dstnid is one of my own NIDs */
+			if (srcnidp != NULL)
+				*srcnidp = dstnid;
+			if (orderp != NULL)
+				*orderp = LNET_NETTYP(dstnet) == LOLND ? 0 : 1;
+			dist = local_nid_dist_zero ? 0 : 1;
+			goto out;
+		}
+
+		if (LNET_NIDNET(ni->ni_nid) == dstnet) {
+			/* dstnid lives on a network I am attached to */
+			if (srcnidp != NULL)
+				*srcnidp = ni->ni_nid;
+			if (orderp != NULL)
+				*orderp = order;
+			dist = 1;
+			goto out;
+		}
+
+		order++;
+	}
+
+	/* second pass: the list lnet_net2rnethash() returns for dstnet */
+	rn_list = lnet_net2rnethash(dstnet);
+	list_for_each_entry(rnet, rn_list, lrn_list) {
+		if (rnet->lrn_net != dstnet) {
+			order++;
+			continue;
+		}
+
+		LASSERT(!list_empty(&rnet->lrn_routes));
+
+		/* pick the route with the fewest hops, first one on a tie */
+		shortest = NULL;
+		list_for_each_entry(route, &rnet->lrn_routes, lr_list) {
+			if (shortest == NULL ||
+			    route->lr_hops < shortest->lr_hops)
+				shortest = route;
+		}
+		LASSERT(shortest != NULL);
+
+		if (srcnidp != NULL)
+			*srcnidp = shortest->lr_gateway->lp_ni->ni_nid;
+		if (orderp != NULL)
+			*orderp = order;
+		dist = shortest->lr_hops + 1;
+		goto out;
+	}
+
+	/* neither a local interface nor a remote net matched */
+	dist = -EHOSTUNREACH;
+
+ out:
+	lnet_net_unlock(cpt);
+	return dist;
+}
+EXPORT_SYMBOL(LNetDist);
+
+/**
+ * Set the number of asynchronous messages expected from a target process.
+ *
+ * This function is only meaningful for userspace callers. It's a no-op when
+ * called from kernel.
+ *
+ * Asynchronous messages are those that can come from a target when the
+ * userspace process is not waiting for IO to complete; e.g., AST callbacks
+ * from Lustre servers. Specifying the expected number of such messages
+ * allows them to be eagerly received when the user process is not running
+ * in LNet; otherwise network errors may occur.
+ *
+ * \param id Process ID of the target process; ignored here.
+ * \param nasync Number of asynchronous messages expected; ignored here.
+ *
+ * \return Always 0: this implementation does nothing and cannot fail.
+ */
+int
+LNetSetAsync(lnet_process_id_t id, int nasync)
+{
+	return 0; /* nothing to do in the kernel */
+}
+EXPORT_SYMBOL(LNetSetAsync);
diff --git a/drivers/staging/lustre/lnet/lnet/lib-msg.c b/drivers/staging/lustre/lnet/lnet/lib-msg.c
new file mode 100644
index 000000000000..8f3a50bd5f69
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-msg.c
@@ -0,0 +1,650 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-msg.c
+ *
+ * Message decoding, parsing and finalizing routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/* Fill @ev as a successful LNET_EVENT_UNLINK event describing @md */
+void
+lnet_build_unlink_event(lnet_libmd_t *md, lnet_event_t *ev)
+{
+	ENTRY;
+	memset(ev, 0, sizeof(*ev));	/* every field not set below stays 0 */
+
+	ev->type     = LNET_EVENT_UNLINK;
+	ev->unlinked = 1;
+	ev->status   = 0;
+	lnet_md_deconstruct(md, &ev->md);
+	lnet_md2handle(&ev->md_handle, md);
+	EXIT;
+}
+
+/* Fill msg->msg_ev for @ev_type from msg->msg_hdr; don't need any lock.
+ * NOTE(review): "call after lnet_commit_md" - no such name in lib-msg.c */
+void
+lnet_build_msg_event(lnet_msg_t *msg, lnet_event_kind_t ev_type)
+{
+	lnet_hdr_t	*hdr = &msg->msg_hdr;
+	lnet_event_t	*ev  = &msg->msg_ev;
+
+	LASSERT(!msg->msg_routing);
+
+	ev->type = ev_type;
+
+	if (ev_type == LNET_EVENT_SEND) {
+		/* event for active message: header is in wire (LE) order */
+		ev->target.nid    = le64_to_cpu(hdr->dest_nid);
+		ev->target.pid    = le32_to_cpu(hdr->dest_pid);
+		ev->initiator.nid = LNET_NID_ANY;
+		ev->initiator.pid = the_lnet.ln_pid;
+		ev->sender	  = LNET_NID_ANY;
+
+	} else {
+		/* event for passive message: header fields used as-is */
+		ev->target.pid    = hdr->dest_pid;
+		ev->target.nid    = hdr->dest_nid;
+		ev->initiator.pid = hdr->src_pid;
+		ev->initiator.nid = hdr->src_nid;
+		ev->rlength       = hdr->payload_length;
+		ev->sender	  = msg->msg_from;
+		ev->mlength	  = msg->msg_wanted;
+		ev->offset	  = msg->msg_offset;
+	}
+
+	switch (ev_type) {
+	default:
+		LBUG();
+
+	case LNET_EVENT_PUT: /* passive PUT */
+		ev->pt_index   = hdr->msg.put.ptl_index;
+		ev->match_bits = hdr->msg.put.match_bits;
+		ev->hdr_data   = hdr->msg.put.hdr_data;
+		return;
+
+	case LNET_EVENT_GET: /* passive GET */
+		ev->pt_index   = hdr->msg.get.ptl_index;
+		ev->match_bits = hdr->msg.get.match_bits;
+		ev->hdr_data   = 0;
+		return;
+
+	case LNET_EVENT_ACK: /* ACK */
+		ev->match_bits = hdr->msg.ack.match_bits;
+		ev->mlength    = hdr->msg.ack.mlength;
+		return;
+
+	case LNET_EVENT_REPLY: /* REPLY */
+		return;
+
+	case LNET_EVENT_SEND: /* active message */
+		if (msg->msg_type == LNET_MSG_PUT) {
+			ev->pt_index   = le32_to_cpu(hdr->msg.put.ptl_index);
+			ev->match_bits = le64_to_cpu(hdr->msg.put.match_bits);
+			ev->offset     = le32_to_cpu(hdr->msg.put.offset);
+			ev->mlength    =
+			ev->rlength    = le32_to_cpu(hdr->payload_length);
+			/* NOTE(review): hdr_data is set unswabbed by LNetPut() */
+			ev->hdr_data   = le64_to_cpu(hdr->msg.put.hdr_data);
+
+		} else {
+			LASSERT(msg->msg_type == LNET_MSG_GET);
+			ev->pt_index   = le32_to_cpu(hdr->msg.get.ptl_index);
+			ev->match_bits = le64_to_cpu(hdr->msg.get.match_bits);
+			ev->mlength    =
+			ev->rlength    = le32_to_cpu(hdr->msg.get.sink_length);
+			ev->offset     = le32_to_cpu(hdr->msg.get.src_offset);
+			ev->hdr_data   = 0;
+		}
+		return;
+	}
+}
+
+/* Commit @msg on partition @cpt, for sending if msg_sending is set, else for
+ * receiving (a routed message may be committed for both, rx first) */
+void
+lnet_msg_commit(lnet_msg_t *msg, int cpt)
+{
+	struct lnet_msg_container *msc;
+	lnet_counters_t		  *stats;
+
+	LASSERT(!msg->msg_tx_committed);
+
+	if (msg->msg_sending) {
+		LASSERT(!msg->msg_receiving);
+		msg->msg_tx_committed = 1;
+		msg->msg_tx_cpt = cpt;
+		if (msg->msg_rx_committed) {
+			/* routed message: the rx commit listed it already */
+			LASSERT(msg->msg_onactivelist);
+			return;
+		}
+	} else {
+		LASSERT(!msg->msg_sending);
+		msg->msg_rx_committed = 1;
+		msg->msg_rx_cpt = cpt;
+	}
+
+	LASSERT(!msg->msg_onactivelist);
+	msc = the_lnet.ln_msg_containers[cpt];
+	list_add(&msg->msg_activelist, &msc->msc_active);
+	msg->msg_onactivelist = 1;
+	stats = the_lnet.ln_counters[cpt];
+	if (++stats->msgs_alloc > stats->msgs_max)
+		stats->msgs_max = stats->msgs_alloc;
+}
+
+/* Undo the tx commit of @msg: bump the send or route counters of the tx
+ * partition when @status is 0, then hand back the tx credits and clear
+ * msg_tx_committed whatever the status */
+static void
+lnet_msg_decommit_tx(lnet_msg_t *msg, int status)
+{
+	lnet_event_t	*ev = &msg->msg_ev;
+	lnet_counters_t	*counters;
+
+	LASSERT(msg->msg_tx_committed);
+
+	if (status == 0) {
+		counters = the_lnet.ln_counters[msg->msg_tx_cpt];
+		switch (ev->type) {
+		case LNET_EVENT_SEND:
+			LASSERT(!msg->msg_rx_committed);
+			if (msg->msg_type == LNET_MSG_PUT)
+				counters->send_length += msg->msg_len;
+			counters->send_count++;
+			break;
+		case LNET_EVENT_PUT:
+			/* should have been decommitted */
+			LASSERT(!msg->msg_rx_committed);
+			/* overwritten while sending ACK */
+			LASSERT(msg->msg_type == LNET_MSG_ACK);
+			msg->msg_type = LNET_MSG_PUT; /* fix type */
+			counters->send_count++;
+			break;
+		case LNET_EVENT_GET:
+			LASSERT(msg->msg_rx_committed);
+			/* overwritten while sending reply, we should never
+			 * be here for optimized GET */
+			LASSERT(msg->msg_type == LNET_MSG_REPLY);
+			msg->msg_type = LNET_MSG_GET; /* fix type */
+			counters->send_count++;
+			break;
+		default: /* routed message: counted as routed, not sent */
+			LASSERT(msg->msg_routing);
+			LASSERT(msg->msg_rx_committed);
+			LASSERT(ev->type == 0);
+			counters->route_length += msg->msg_len;
+			counters->route_count++;
+			break;
+		}
+	}
+
+	lnet_return_tx_credits_locked(msg);
+	msg->msg_tx_committed = 0;
+}
+
+/* Undo the rx commit of @msg: update the counters of the rx partition
+ * when @status is 0, then hand back the rx credits and clear
+ * msg_rx_committed whatever the status */
+static void
+lnet_msg_decommit_rx(lnet_msg_t *msg, int status)
+{
+	lnet_event_t	*ev = &msg->msg_ev;
+	lnet_counters_t	*counters;
+
+	LASSERT(!msg->msg_tx_committed); /* decommitted or never committed */
+	LASSERT(msg->msg_rx_committed);
+
+	if (status == 0) {
+		counters = the_lnet.ln_counters[msg->msg_rx_cpt];
+		switch (ev->type) {
+		case LNET_EVENT_PUT:
+			LASSERT(msg->msg_type == LNET_MSG_PUT);
+			counters->recv_count++;
+			counters->recv_length += msg->msg_wanted;
+			break;
+		case LNET_EVENT_REPLY:
+			/* type is "GET" if it's an optimized GET on active
+			 * side, see details in lnet_create_reply_msg() */
+			LASSERT(msg->msg_type == LNET_MSG_GET ||
+				msg->msg_type == LNET_MSG_REPLY);
+			counters->recv_count++;
+			counters->recv_length += msg->msg_wanted;
+			break;
+		case LNET_EVENT_GET:
+			/* type is "REPLY" if it's an optimized GET on
+			 * passive side, because optimized GET will never be
+			 * committed for sending, so message type wouldn't
+			 * be changed back to "GET" by
+			 * lnet_msg_decommit_tx(), see lnet_parse_get() */
+			LASSERT(msg->msg_type == LNET_MSG_REPLY ||
+				msg->msg_type == LNET_MSG_GET);
+			/* NB: send_length, not recv_length, for a GET */
+			counters->send_length += msg->msg_wanted;
+			counters->recv_count++;
+			break;
+		case LNET_EVENT_ACK:
+			LASSERT(msg->msg_type == LNET_MSG_ACK);
+			counters->recv_count++;
+			break;
+		default: /* routed message: nothing is counted here */
+			LASSERT(ev->type == 0);
+			LASSERT(msg->msg_routing);
+			break;
+		}
+	}
+
+	lnet_return_rx_credits_locked(msg);
+	msg->msg_rx_committed = 0;
+}
+
+/* Decommit @msg for sending first, then for receiving, and unlist it.  The
+ * net lock of @cpt is held on entry and again on return. */
+void
+lnet_msg_decommit(lnet_msg_t *msg, int cpt, int status)
+{
+	int	held = cpt;	/* partition whose net lock I hold right now */
+
+	LASSERT(msg->msg_tx_committed || msg->msg_rx_committed);
+	LASSERT(msg->msg_onactivelist);
+
+	if (msg->msg_tx_committed) { /* always decommit for sending first */
+		LASSERT(cpt == msg->msg_tx_cpt);
+		lnet_msg_decommit_tx(msg, status);
+	}
+
+	if (msg->msg_rx_committed) {
+		/* forwarding msg committed for both receiving and sending */
+		if (msg->msg_rx_cpt != cpt) {
+			lnet_net_unlock(cpt);
+			held = msg->msg_rx_cpt;
+			lnet_net_lock(held);
+		}
+		lnet_msg_decommit_rx(msg, status);
+	}
+
+	msg->msg_onactivelist = 0;
+	list_del(&msg->msg_activelist);
+	the_lnet.ln_counters[held]->msgs_alloc--;
+	if (held != cpt) {
+		lnet_net_unlock(held);
+		lnet_net_lock(cpt);
+	}
+}
+
+/* Attach @md to @msg, marking the MD busy (one more reference) and using up
+ * one unit of its threshold unless that is infinite.  Come what may, the
+ * lnet_msg "owns" the MD until lnet_msg_detach_md() or lnet_finalize()
+ * signals completion.  NB: @offset and @mlen only matter for receiving. */
+void
+lnet_msg_attach_md(lnet_msg_t *msg, lnet_libmd_t *md,
+		   unsigned int offset, unsigned int mlen)
+{
+	lnet_event_t	*ev = &msg->msg_ev;
+
+	LASSERT(!msg->msg_routing);
+	if (msg->msg_receiving) { /* committed for receiving */
+		msg->msg_wanted = mlen;
+		msg->msg_offset = offset;
+	}
+	msg->msg_md = md;
+
+	md->md_refcount++;
+	if (md->md_threshold != LNET_MD_THRESH_INF) {
+		LASSERT(md->md_threshold > 0);
+		md->md_threshold--;
+	}
+
+	/* build umd in event */
+	lnet_md2handle(&ev->md_handle, md);
+	lnet_md_deconstruct(md, &ev->md);
+}
+
+/* Drop @msg's hold on its MD, post the event to the MD's EQ (if any) and
+ * unlink the MD when lnet_md_unlinkable() says so */
+void
+lnet_msg_detach_md(lnet_msg_t *msg, int status)
+{
+	lnet_libmd_t	*md = msg->msg_md;
+	lnet_event_t	*ev = &msg->msg_ev;
+	int		 unlink;
+
+	md->md_refcount--;	/* now it's safe to drop my caller's ref */
+	LASSERT(md->md_refcount >= 0);
+	unlink = lnet_md_unlinkable(md);
+
+	if (md->md_eq != NULL) {
+		ev->unlinked = unlink;
+		ev->status   = status;
+		lnet_eq_enqueue_event(md->md_eq, ev);
+	}
+	msg->msg_md = NULL;
+	if (unlink)
+		lnet_md_unlink(md);
+}
+
+/*
+ * Complete @msg with the net lock of @cpt held (see lnet_finalize()).
+ * A successful PUT that asked for an ACK is re-used to send the ACK, and a
+ * routed message not yet forwarded is sent on; in both cases the lock is
+ * dropped around lnet_send(), whose result is returned.  Any other
+ * message is decommitted and freed, and 0 is returned.
+ */
+static int
+lnet_complete_msg_locked(lnet_msg_t *msg, int cpt)
+{
+	lnet_handle_wire_t ack_wmd;
+	int		rc;
+	int		status = msg->msg_ev.status;
+
+	LASSERT (msg->msg_onactivelist);
+
+	if (status == 0 && msg->msg_ack) {
+		/* Only send an ACK if the PUT completed successfully */
+
+		lnet_msg_decommit(msg, cpt, 0);
+
+		msg->msg_ack = 0;
+		lnet_net_unlock(cpt);
+
+		LASSERT(msg->msg_ev.type == LNET_EVENT_PUT);
+		LASSERT(!msg->msg_routing);
+
+		ack_wmd = msg->msg_hdr.msg.put.ack_wmd;
+
+		lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.initiator, 0, 0);
+
+		msg->msg_hdr.msg.ack.dst_wmd = ack_wmd;
+		msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits;
+		msg->msg_hdr.msg.ack.mlength = cpu_to_le32(msg->msg_ev.mlength);
+
+		/* NB: we probably want to use NID of msg::msg_from as 3rd
+		 * parameter (router NID) if it's a routed message */
+		rc = lnet_send(msg->msg_ev.target.nid, msg, LNET_NID_ANY);
+
+		lnet_net_lock(cpt);
+		/*
+		 * NB: message is committed for sending, we should return
+		 * on success because LND will finalize this message later.
+		 *
+		 * It is also possible that the message was committed for
+		 * sending but failed before delivery to the LND (e.g.
+		 * ENOMEM).  We can't fall through then either: the CPT for
+		 * sending can differ from the CPT for receiving, so return
+		 * to lnet_finalize() to lock the correct partition.
+		 */
+		return rc;
+
+	} else if (status == 0 &&	/* OK so far */
+		   (msg->msg_routing && !msg->msg_sending)) {
+		/* not forwarded yet */
+		LASSERT(!msg->msg_receiving);	/* called back recv already */
+		lnet_net_unlock(cpt);
+
+		rc = lnet_send(LNET_NID_ANY, msg, LNET_NID_ANY);
+
+		lnet_net_lock(cpt);
+		/*
+		 * NB: as for the ACK above, return whether or not the send
+		 * succeeded.  On failure before delivery to the LND (e.g.
+		 * ENOMEM) the message may be committed for both sending and
+		 * receiving; it must decommit for sending first, on a CPT
+		 * that can differ, so let lnet_finalize() relock correctly.
+		 */
+		return rc;
+	}
+
+	lnet_msg_decommit(msg, cpt, status);
+	lnet_msg_free_locked(msg);
+	return 0;
+}
+
+/* Finalize @msg with @status: detach its MD (queuing the event), then free
+ * it or queue it for lnet_complete_msg_locked(); @ni is not used here */
+void
+lnet_finalize (lnet_ni_t *ni, lnet_msg_t *msg, int status)
+{
+	struct lnet_msg_container	*container;
+	int				my_slot;
+	int				cpt;
+	int				rc;
+	int				i;
+
+	LASSERT (!in_interrupt ());
+
+	if (msg == NULL)
+		return;
+#if 0
+	CDEBUG(D_WARNING, "%s msg->%s Flags:%s%s%s%s%s%s%s%s%s%s%s txp %s rxp %s\n",
+	       lnet_msgtyp2str(msg->msg_type), libcfs_id2str(msg->msg_target),
+	       msg->msg_target_is_router ? "t" : "",
+	       msg->msg_routing ? "X" : "",
+	       msg->msg_ack ? "A" : "",
+	       msg->msg_sending ? "S" : "",
+	       msg->msg_receiving ? "R" : "",
+	       msg->msg_delayed ? "d" : "",
+	       msg->msg_txcredit ? "C" : "",
+	       msg->msg_peertxcredit ? "c" : "",
+	       msg->msg_rtrcredit ? "F" : "",
+	       msg->msg_peerrtrcredit ? "f" : "",
+	       msg->msg_onactivelist ? "!" : "",
+	       msg->msg_txpeer == NULL ? "<none>" : libcfs_nid2str(msg->msg_txpeer->lp_nid),
+	       msg->msg_rxpeer == NULL ? "<none>" : libcfs_nid2str(msg->msg_rxpeer->lp_nid));
+#endif
+	msg->msg_ev.status = status;
+
+	if (msg->msg_md != NULL) {
+		cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie);
+
+		lnet_res_lock(cpt);
+		lnet_msg_detach_md(msg, status);
+		lnet_res_unlock(cpt);
+	}
+
+ again:
+	rc = 0;
+	if (!msg->msg_tx_committed && !msg->msg_rx_committed) {
+		/* not committed to network yet */
+		LASSERT(!msg->msg_onactivelist);
+		lnet_msg_free(msg);
+		return;
+	}
+
+	/* NB: routed message can be committed for both receiving and sending,
+	 * we should finalize in LIFO order and keep counters correct.
+	 * (finalize sending first then finalize receiving) */
+	cpt = msg->msg_tx_committed ? msg->msg_tx_cpt : msg->msg_rx_cpt;
+	lnet_net_lock(cpt);
+
+	container = the_lnet.ln_msg_containers[cpt];
+	list_add_tail(&msg->msg_list, &container->msc_finalizing);
+
+	/* Recursion breaker.  Don't complete the message here if I am (or
+	 * enough other threads are) already completing messages */
+
+	my_slot = -1;
+	for (i = 0; i < container->msc_nfinalizers; i++) {
+		if (container->msc_finalizers[i] == current)
+			break;
+
+		if (my_slot < 0 && container->msc_finalizers[i] == NULL)
+			my_slot = i;
+	}
+
+	if (i < container->msc_nfinalizers || my_slot < 0) {
+		lnet_net_unlock(cpt);
+		return;
+	}
+
+	container->msc_finalizers[my_slot] = current;
+
+	while (!list_empty(&container->msc_finalizing)) {
+		msg = list_entry(container->msc_finalizing.next,
+				     lnet_msg_t, msg_list);
+
+		list_del(&msg->msg_list);
+
+		/* NB drops and regains the lnet lock if it actually does
+		 * anything, so my finalizing friends can chomp along too */
+		rc = lnet_complete_msg_locked(msg, cpt);
+		if (rc != 0)
+			break;
+	}
+
+	container->msc_finalizers[my_slot] = NULL;
+	lnet_net_unlock(cpt);
+
+	if (rc != 0) /* a send of @msg failed in there: finalize it again */
+		goto again;
+}
+EXPORT_SYMBOL(lnet_finalize);
+
+/* Tear down one message container: free (and complain about) messages
+ * still on its active list, release the finalizer array, mark it unused */
+void
+lnet_msg_container_cleanup(struct lnet_msg_container *container)
+{
+	lnet_msg_t	*msg;
+	int		 leaked;
+
+	if (container->msc_init == 0)
+		return;
+
+	for (leaked = 0; !list_empty(&container->msc_active); leaked++) {
+		msg = list_entry(container->msc_active.next,
+				 lnet_msg_t, msg_activelist);
+		LASSERT(msg->msg_onactivelist);
+		list_del(&msg->msg_activelist);
+		msg->msg_onactivelist = 0;
+		lnet_msg_free(msg);
+	}
+	if (leaked > 0)
+		CERROR("%d active msg on exit\n", leaked);
+
+	if (container->msc_finalizers != NULL) {
+		LIBCFS_FREE(container->msc_finalizers,
+			    container->msc_nfinalizers *
+			    sizeof(*container->msc_finalizers));
+		container->msc_finalizers = NULL;
+	}
+#ifdef LNET_USE_LIB_FREELIST
+	lnet_freelist_fini(&container->msc_freelist);
+#endif
+	container->msc_init = 0;
+}
+
+/* Prepare the message container of partition @cpt: empty active and
+ * finalizing lists, the freelist when LNET_USE_LIB_FREELIST is defined,
+ * and a finalizer array sized by cfs_cpt_weight().  On failure the
+ * container is cleaned up again and the error code returned. */
+int
+lnet_msg_container_setup(struct lnet_msg_container *container, int cpt)
+{
+	int	rc = 0;
+
+	INIT_LIST_HEAD(&container->msc_active);
+	INIT_LIST_HEAD(&container->msc_finalizing);
+	container->msc_init = 1; /* from here on cleanup will do its work */
+
+#ifdef LNET_USE_LIB_FREELIST
+	memset(&container->msc_freelist, 0, sizeof(lnet_freelist_t));
+
+	rc = lnet_freelist_init(&container->msc_freelist,
+				LNET_FL_MAX_MSGS, sizeof(lnet_msg_t));
+	if (rc != 0) {
+		CERROR("Failed to init freelist for message container\n");
+		lnet_msg_container_cleanup(container);
+		return rc;
+	}
+#endif
+	/* number of CPUs */
+	container->msc_nfinalizers = cfs_cpt_weight(lnet_cpt_table(), cpt);
+
+	LIBCFS_CPT_ALLOC(container->msc_finalizers, lnet_cpt_table(), cpt,
+			 container->msc_nfinalizers *
+			 sizeof(*container->msc_finalizers));
+
+	if (container->msc_finalizers != NULL)
+		return rc;
+
+	CERROR("Failed to allocate message finalizers\n");
+	lnet_msg_container_cleanup(container);
+	return -ENOMEM;
+}
+
+/* Clean up every per-partition message container, then free the array */
+void
+lnet_msg_containers_destroy(void)
+{
+	struct lnet_msg_container *msc;
+	int			   cpt;
+
+	if (the_lnet.ln_msg_containers != NULL) {
+		cfs_percpt_for_each(msc, cpt, the_lnet.ln_msg_containers)
+			lnet_msg_container_cleanup(msc);
+
+		cfs_percpt_free(the_lnet.ln_msg_containers);
+		the_lnet.ln_msg_containers = NULL;
+	}
+}
+
+/* Allocate and set up the per-partition message containers; a failed setup
+ * destroys them all again.  Returns 0, or the error that stopped it. */
+int
+lnet_msg_containers_create(void)
+{
+	struct lnet_msg_container *msc;
+	int			   cpt;
+	int			   rc;
+
+	the_lnet.ln_msg_containers = cfs_percpt_alloc(lnet_cpt_table(),
+						      sizeof(*msc));
+	if (the_lnet.ln_msg_containers == NULL) {
+		CERROR("Failed to allocate cpu-partition data for network\n");
+		return -ENOMEM;
+	}
+
+	cfs_percpt_for_each(msc, cpt, the_lnet.ln_msg_containers) {
+		rc = lnet_msg_container_setup(msc, cpt);
+		if (rc != 0) {
+			lnet_msg_containers_destroy();
+			return rc;
+		}
+	}
+	return 0;
+}
diff --git a/drivers/staging/lustre/lnet/lnet/lib-ptl.c b/drivers/staging/lustre/lnet/lnet/lib-ptl.c
new file mode 100644
index 000000000000..9b9e7d3139b0
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-ptl.c
@@ -0,0 +1,938 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-ptl.c
+ *
+ * portal & match routines
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/* wildcard-portal PUT dispatch policy; NB: /proc interface comes later */
+int	portal_rotor	= LNET_PTL_ROTOR_HASH_RT;
+CFS_MODULE_PARM(portal_rotor, "i", int, 0644,
+		"redirect PUTs to different cpu-partitions");
+
+static int
+lnet_ptl_match_type(unsigned int index, lnet_process_id_t match_id,
+		    __u64 mbits, __u64 ignore_bits)
+{	/* returns 1 if this match style suits portal @index, 0 if it clashes */
+	struct lnet_portal	*ptl = the_lnet.ln_portals[index];
+	int			unique;
+
+	unique = ignore_bits == 0 &&	/* no ignore bits, no ANY nid/pid */
+		 match_id.nid != LNET_NID_ANY &&
+		 match_id.pid != LNET_PID_ANY;
+
+	LASSERT(!lnet_ptl_is_unique(ptl) || !lnet_ptl_is_wildcard(ptl));
+
+	/* prefer to check w/o any lock */
+	if (likely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl)))
+		goto match;
+
+	/* portal type not set yet: take the lock to decide it */
+	lnet_ptl_lock(ptl);
+	/* check again with lock, another thread may have set it meanwhile */
+	if (unlikely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl))) {
+		lnet_ptl_unlock(ptl);
+		goto match;
+	}
+
+	/* still not set: this caller fixes the portal type */
+	if (unique)
+		lnet_ptl_setopt(ptl, LNET_PTL_MATCH_UNIQUE);
+	else
+		lnet_ptl_setopt(ptl, LNET_PTL_MATCH_WILDCARD);
+
+	lnet_ptl_unlock(ptl);
+
+	return 1;
+
+ match:	/* type already fixed: the requested style must agree with it */
+	if ((lnet_ptl_is_unique(ptl) && !unique) ||
+	    (lnet_ptl_is_wildcard(ptl) && unique))
+		return 0;
+	return 1;
+}
+
+static void
+lnet_ptl_enable_mt(struct lnet_portal *ptl, int cpt)
+{	/* mark match-table @cpt enabled and add @cpt to ptl_mt_maps[] */
+	struct lnet_match_table	*mtable = ptl->ptl_mtables[cpt];
+	int			i;
+
+	/* caller holds both lnet_res_lock(cpt) and lnet_ptl_lock */
+	LASSERT(lnet_ptl_is_wildcard(ptl));
+
+	mtable->mt_enabled = 1;
+
+	ptl->ptl_mt_maps[ptl->ptl_mt_nmaps] = cpt;	/* append at the end */
+	for (i = ptl->ptl_mt_nmaps - 1; i >= 0; i--) {
+		LASSERT(ptl->ptl_mt_maps[i] != cpt);
+		if (ptl->ptl_mt_maps[i] < cpt)
+			break;
+
+		/* swap downwards so ptl_mt_maps[] stays in ascending order */
+		ptl->ptl_mt_maps[i + 1] = ptl->ptl_mt_maps[i];
+		ptl->ptl_mt_maps[i] = cpt;
+	}
+
+	ptl->ptl_mt_nmaps++;
+}
+
+static void
+lnet_ptl_disable_mt(struct lnet_portal *ptl, int cpt)
+{	/* mark match-table @cpt disabled and drop @cpt from ptl_mt_maps[] */
+	struct lnet_match_table	*mtable = ptl->ptl_mtables[cpt];
+	int			i;
+
+	/* caller holds both lnet_res_lock(cpt) and lnet_ptl_lock */
+	LASSERT(lnet_ptl_is_wildcard(ptl));
+
+	if (LNET_CPT_NUMBER == 1)
+		return; /* never disable the only match-table */
+
+	mtable->mt_enabled = 0;
+
+	LASSERT(ptl->ptl_mt_nmaps > 0 &&
+		ptl->ptl_mt_nmaps <= LNET_CPT_NUMBER);
+
+	/* remove @cpt from the ascending ptl_mt_maps[]: shift the tail down */
+	ptl->ptl_mt_nmaps--;
+	for (i = 0; i < ptl->ptl_mt_nmaps; i++) {
+		if (ptl->ptl_mt_maps[i] >= cpt) /* overwrite it */
+			ptl->ptl_mt_maps[i] = ptl->ptl_mt_maps[i + 1];
+	}
+}
+
+static int
+lnet_try_match_md(lnet_libmd_t *md,
+		  struct lnet_match_info *info, struct lnet_msg *msg)
+{
+	/* ALWAYS called holding the lnet_res_lock, and can't lnet_res_unlock;
+	 * lnet_match_blocked_msg() relies on this to avoid races */
+	unsigned int	offset;		/* offset in the MD the data goes to */
+	unsigned int	mlength;	/* number of bytes that get accepted */
+	lnet_me_t	*me = md->md_me;
+
+	/* MD exhausted */
+	if (lnet_md_exhausted(md))
+		return LNET_MATCHMD_NONE | LNET_MATCHMD_EXHAUSTED;
+
+	/* mismatched MD op */
+	if ((md->md_options & info->mi_opc) == 0)
+		return LNET_MATCHMD_NONE;
+
+	/* mismatched ME nid/pid? */
+	if (me->me_match_id.nid != LNET_NID_ANY &&
+	    me->me_match_id.nid != info->mi_id.nid)
+		return LNET_MATCHMD_NONE;
+
+	if (me->me_match_id.pid != LNET_PID_ANY &&
+	    me->me_match_id.pid != info->mi_id.pid)
+		return LNET_MATCHMD_NONE;
+
+	/* mismatched ME matchbits? (bits set in me_ignore_bits don't count) */
+	if (((me->me_match_bits ^ info->mi_mbits) & ~me->me_ignore_bits) != 0)
+		return LNET_MATCHMD_NONE;
+
+	/* Hurrah! This _is_ a match; check it out... */
+
+	if ((md->md_options & LNET_MD_MANAGE_REMOTE) == 0)
+		offset = md->md_offset;		/* locally tracked offset */
+	else
+		offset = info->mi_roffset;	/* offset given by the request */
+
+	if ((md->md_options & LNET_MD_MAX_SIZE) != 0) {
+		mlength = md->md_max_size;
+		LASSERT(md->md_offset + mlength <= md->md_length);
+	} else {
+		mlength = md->md_length - offset;
+	}
+
+	if (info->mi_rlength <= mlength) {	/* fits in allowed space */
+		mlength = info->mi_rlength;
+	} else if ((md->md_options & LNET_MD_TRUNCATE) == 0) {
+		/* too big and the MD does not allow truncation: drop it */
+		CERROR("Matching packet from %s, match "LPU64
+		       " length %d too big: %d left, %d allowed\n",
+		       libcfs_id2str(info->mi_id), info->mi_mbits,
+		       info->mi_rlength, md->md_length - offset, mlength);
+
+		return LNET_MATCHMD_DROP;
+	}
+
+	/* Commit to this ME/MD */
+	CDEBUG(D_NET, "Incoming %s index %x from %s of "
+	       "length %d/%d into md "LPX64" [%d] + %d\n",
+	       (info->mi_opc == LNET_MD_OP_PUT) ? "put" : "get",
+	       info->mi_portal, libcfs_id2str(info->mi_id), mlength,
+	       info->mi_rlength, md->md_lh.lh_cookie, md->md_niov, offset);
+
+	lnet_msg_attach_md(msg, md, offset, mlength);
+	md->md_offset = offset + mlength;	/* next data lands after this */
+
+	if (!lnet_md_exhausted(md))
+		return LNET_MATCHMD_OK;
+
+	/* Auto-unlink NOW, so the ME gets unlinked if required.
+	 * We bumped md->md_refcount above so the MD just gets flagged
+	 * for unlink when it is finalized. */
+	if ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0)
+		lnet_md_unlink(md);
+
+	return LNET_MATCHMD_OK | LNET_MATCHMD_EXHAUSTED;
+}
+
+static struct lnet_match_table *
+lnet_match2mt(struct lnet_portal *ptl, lnet_process_id_t id, __u64 mbits)
+{
+	if (LNET_CPT_NUMBER == 1)	/* single partition: nothing to pick */
+		return ptl->ptl_mtables[0];
+	if (!lnet_ptl_is_unique(ptl))	/* not unique: the caller must choose */
+		return NULL;
+	/* unique portal: the match-table is selected by hashing the NID */
+	return ptl->ptl_mtables[lnet_cpt_of_nid(id.nid)];
+}
+
+struct lnet_match_table *
+lnet_mt_of_attach(unsigned int index, lnet_process_id_t id,
+		  __u64 mbits, __u64 ignore_bits, lnet_ins_pos_t pos)
+{
+	struct lnet_match_table	*mt;
+	struct lnet_portal	*ptl;
+
+	/* NB: called w/o lock */
+	LASSERT(index < the_lnet.ln_nportals);
+
+	/* refuse an ME whose match style conflicts with the portal type */
+	if (lnet_ptl_match_type(index, id, mbits, ignore_bits) == 0)
+		return NULL;
+
+	ptl = the_lnet.ln_portals[index];
+	mt = lnet_match2mt(ptl, id, mbits);
+	if (mt != NULL) /* unique portal or only one match-table */
+		return mt;
+
+	/* wildcard portal: the insert position selects the match-table */
+	if (pos == LNET_INS_LOCAL) {
+		/* posted by cpu-affinity thread: use the current partition */
+		return ptl->ptl_mtables[lnet_cpt_current()];
+	}
+
+	if (pos == LNET_INS_BEFORE || pos == LNET_INS_AFTER) {
+		/* posted by no affinity thread, always hash to specific
+		 * match-table to avoid buffer stealing which is heavy */
+		return ptl->ptl_mtables[ptl->ptl_index % LNET_CPT_NUMBER];
+	}
+
+	return NULL;	/* unknown insert position */
+}
+
+static struct lnet_match_table *
+lnet_mt_of_match(struct lnet_match_info *info, struct lnet_msg *msg)
+{	/* choose the match-table an incoming request is matched against */
+	struct lnet_match_table	*mtable;
+	struct lnet_portal	*ptl;
+	unsigned int		nmaps;	/* unsigned: used as a modulus below */
+	unsigned int		rotor;	/* unsigned: '%' must not go negative */
+	int			routed;
+	int			cpt;
+
+	/* NB: called w/o lock */
+	LASSERT(info->mi_portal < the_lnet.ln_nportals);
+	ptl = the_lnet.ln_portals[info->mi_portal];
+
+	LASSERT(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl));
+
+	mtable = lnet_match2mt(ptl, info->mi_id, info->mi_mbits);
+	if (mtable != NULL)
+		return mtable;
+
+	/* it's a wildcard portal; "routed" means src and dest nets differ */
+	routed = LNET_NIDNET(msg->msg_hdr.src_nid) !=
+		 LNET_NIDNET(msg->msg_hdr.dest_nid);
+
+	if (portal_rotor == LNET_PTL_ROTOR_OFF ||
+	    (portal_rotor != LNET_PTL_ROTOR_ON && !routed)) {
+		cpt = lnet_cpt_current();	/* prefer the local partition */
+		if (ptl->ptl_mtables[cpt]->mt_enabled)
+			return ptl->ptl_mtables[cpt];
+	}
+
+	/* round-robin factor; a free-running counter, so it may wrap */
+	rotor = ptl->ptl_rotor++;
+	if (portal_rotor == LNET_PTL_ROTOR_HASH_RT && routed)
+		cpt = lnet_cpt_of_nid(msg->msg_hdr.src_nid);
+	else
+		cpt = rotor % LNET_CPT_NUMBER;
+
+	if (!ptl->ptl_mtables[cpt]->mt_enabled) {
+		/* is there any active entry for this portal? */
+		nmaps = ptl->ptl_mt_nmaps;
+		/* map to an active mtable to avoid heavy "stealing" */
+		if (nmaps != 0) {
+			/* NB: there is possibility that ptl_mt_maps is being
+			 * changed because we are not under protection of
+			 * lnet_ptl_lock, but it shouldn't hurt anything */
+			cpt = ptl->ptl_mt_maps[rotor % nmaps];
+		}
+	}
+
+	return ptl->ptl_mtables[cpt];
+}
+
+static int
+lnet_mt_test_exhausted(struct lnet_match_table *mtable, int pos)
+{
+	__u64	word;
+	int	bit;
+	int	n;
+
+	/* a non-wildcard portal is never reported as exhausted */
+	if (!lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]))
+		return 0;
+
+	if (pos >= 0) { /* test the single bit of mtable::mt_mhash[pos] */
+		LASSERT(pos <= LNET_MT_HASH_IGNORE);
+		word = mtable->mt_exhausted[pos >> LNET_MT_BITS_U64];
+		bit = pos & ((1 << LNET_MT_BITS_U64) - 1);
+		return (word & (1ULL << bit)) != 0;
+	}
+
+	/* pos < 0: exhausted only if every bit of the bitmap is set */
+	for (n = 0; n < LNET_MT_EXHAUSTED_BMAP; n++)
+		if (mtable->mt_exhausted[n] != (__u64)(-1))
+			return 0;
+	return 1;
+}
+
+static void
+lnet_mt_set_exhausted(struct lnet_match_table *mtable, int pos, int exhausted)
+{
+	__u64	mask;
+	int	idx;
+
+	LASSERT(lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]));
+	LASSERT(pos <= LNET_MT_HASH_IGNORE);
+
+	/* locate the bit that flags mtable::mt_mhash[pos] in the bitmap */
+	idx = pos >> LNET_MT_BITS_U64;
+	mask = 1ULL << (pos & ((1 << LNET_MT_BITS_U64) - 1));
+	if (exhausted)
+		mtable->mt_exhausted[idx] |= mask;
+	else
+		mtable->mt_exhausted[idx] &= ~mask;
+}
+
+struct list_head *
+lnet_mt_match_head(struct lnet_match_table *mtable,
+		   lnet_process_id_t id, __u64 mbits)
+{
+	struct lnet_portal	*ptl = the_lnet.ln_portals[mtable->mt_portal];
+	unsigned long		key;
+
+	/* wildcard portal: the bucket is picked by masking the match bits */
+	if (lnet_ptl_is_wildcard(ptl))
+		return &mtable->mt_mhash[mbits & LNET_MT_HASH_MASK];
+	/* unique portal: hash the match bits together with the peer id */
+	LASSERT(lnet_ptl_is_unique(ptl));
+	key = mbits + id.nid + id.pid;
+	key = cfs_hash_long(key, LNET_MT_HASH_BITS);
+	return &mtable->mt_mhash[key];
+}
+
+int
+lnet_mt_match_md(struct lnet_match_table *mtable,
+		 struct lnet_match_info *info, struct lnet_msg *msg)
+{	/* try every ME/MD of @mtable that could match @msg; returns MATCHMD bits */
+	struct list_head		*head;
+	lnet_me_t		*me;
+	lnet_me_t		*tmp;
+	int			exhausted = 0;
+	int			rc;
+
+	/* any ME with ignore bits? if so that list is searched first */
+	if (!list_empty(&mtable->mt_mhash[LNET_MT_HASH_IGNORE]))
+		head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE];
+	else
+		head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits);
+ again:
+	/* NB: only wildcard portal needs to return LNET_MATCHMD_EXHAUSTED */
+	if (lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]))
+		exhausted = LNET_MATCHMD_EXHAUSTED;
+
+	list_for_each_entry_safe(me, tmp, head, me_list) {
+		/* ME attached but MD not attached yet */
+		if (me->me_md == NULL)
+			continue;
+
+		LASSERT(me == me->me_md->md_me);
+
+		rc = lnet_try_match_md(me->me_md, info, msg);
+		if ((rc & LNET_MATCHMD_EXHAUSTED) == 0)
+			exhausted = 0; /* mlist is not empty */
+
+		if ((rc & LNET_MATCHMD_FINISH) != 0) {
+			/* don't return EXHAUSTED bit because we don't know
+			 * whether the mlist is empty or not */
+			return rc & ~LNET_MATCHMD_EXHAUSTED;
+		}
+	}
+
+	if (exhausted == LNET_MATCHMD_EXHAUSTED) { /* @head is exhausted */
+		lnet_mt_set_exhausted(mtable, head - mtable->mt_mhash, 1);
+		if (!lnet_mt_test_exhausted(mtable, -1))
+			exhausted = 0;	/* other buckets still have buffers */
+	}
+
+	if (exhausted == 0 && head == &mtable->mt_mhash[LNET_MT_HASH_IGNORE]) {
+		head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits);
+		goto again; /* re-check MEs w/o ignore-bits */
+	}
+
+	if (info->mi_opc == LNET_MD_OP_GET ||	/* unmatched GET: always drop */
+	    !lnet_ptl_is_lazy(the_lnet.ln_portals[info->mi_portal]))
+		return LNET_MATCHMD_DROP | exhausted;
+
+	return LNET_MATCHMD_NONE | exhausted;	/* PUT on a lazy portal */
+}
+
+static int
+lnet_ptl_match_early(struct lnet_portal *ptl, struct lnet_msg *msg)
+{	/* returns 0 once the portal type is set, else NONE or DROP for @msg */
+	int	rc;
+
+	/* fast path w/o lock: the portal type is already set, so this is
+	 * not an early message and regular matching can go on */
+	if (likely(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)))
+		return 0;
+
+	lnet_ptl_lock(ptl);
+	/* check it again with hold of lock */
+	if (lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)) {
+		lnet_ptl_unlock(ptl);
+		return 0;
+	}
+
+	if (lnet_ptl_is_lazy(ptl)) {	/* type still unset: delay or drop */
+		if (msg->msg_rx_ready_delay) {	/* queue only when flagged */
+			msg->msg_rx_delayed = 1;
+			list_add_tail(&msg->msg_list,
+					  &ptl->ptl_msg_delayed);
+		}
+		rc = LNET_MATCHMD_NONE;
+	} else {
+		rc = LNET_MATCHMD_DROP;
+	}
+
+	lnet_ptl_unlock(ptl);
+	return rc;
+}
+
+static int
+lnet_ptl_match_delay(struct lnet_portal *ptl,
+		     struct lnet_match_info *info, struct lnet_msg *msg)
+{
+	int	first = ptl->ptl_mt_maps[0]; /* read w/o lock */
+	int	rc = 0;
+	int	i;
+
+	/* steal buffer from other CPTs, and delay it if nothing to steal,
+	 * this function is more expensive than a regular match, but we
+	 * don't expect it can happen a lot */
+	LASSERT(lnet_ptl_is_wildcard(ptl));
+
+	for (i = 0; i < LNET_CPT_NUMBER; i++) {
+		struct lnet_match_table *mtable;
+		int			cpt;
+
+		cpt = (first + i) % LNET_CPT_NUMBER;
+		mtable = ptl->ptl_mtables[cpt];
+		if (i != 0 && i != LNET_CPT_NUMBER - 1 && !mtable->mt_enabled)
+			continue; /* skip disabled tables, but not first/last */
+
+		lnet_res_lock(cpt);
+		lnet_ptl_lock(ptl);
+
+		if (i == 0) { /* the first try, attach on stealing list */
+			list_add_tail(&msg->msg_list,
+					  &ptl->ptl_msg_stealing);
+		}
+
+		if (!list_empty(&msg->msg_list)) { /* on stealing list */
+			rc = lnet_mt_match_md(mtable, info, msg);
+
+			if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 &&
+			    mtable->mt_enabled)
+				lnet_ptl_disable_mt(ptl, cpt);
+
+			if ((rc & LNET_MATCHMD_FINISH) != 0)
+				list_del_init(&msg->msg_list);
+
+		} else {
+			/* could be matched by lnet_ptl_attach_md()
+			 * which is called by another thread */
+			rc = msg->msg_md == NULL ?
+			     LNET_MATCHMD_DROP : LNET_MATCHMD_OK;
+		}
+
+		if (!list_empty(&msg->msg_list) && /* not matched yet */
+		    (i == LNET_CPT_NUMBER - 1 || /* the last CPT */
+		     ptl->ptl_mt_nmaps == 0 ||   /* no active CPT */
+		     (ptl->ptl_mt_nmaps == 1 &&  /* the only active CPT */
+		      ptl->ptl_mt_maps[0] == cpt))) {
+			/* nothing to steal, delay or drop */
+			list_del_init(&msg->msg_list);
+
+			if (lnet_ptl_is_lazy(ptl)) {
+				msg->msg_rx_delayed = 1;
+				list_add_tail(&msg->msg_list,
+						  &ptl->ptl_msg_delayed);
+				rc = LNET_MATCHMD_NONE;
+			} else {
+				rc = LNET_MATCHMD_DROP;
+			}
+		}
+
+		lnet_ptl_unlock(ptl);
+		lnet_res_unlock(cpt);
+
+		if ((rc & LNET_MATCHMD_FINISH) != 0 || msg->msg_rx_delayed)
+			break;	/* matched, dropped or parked: stop stealing */
+	}
+
+	return rc;
+}
+
+int
+lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg)
+{	/* match @msg on its portal; EXHAUSTED is stripped from the result */
+	struct lnet_match_table	*mtable;
+	struct lnet_portal	*ptl;
+	int			rc;
+
+	CDEBUG(D_NET, "Request from %s of length %d into portal %d "
+	       "MB="LPX64"\n", libcfs_id2str(info->mi_id),
+	       info->mi_rlength, info->mi_portal, info->mi_mbits);
+
+	if (info->mi_portal >= the_lnet.ln_nportals) {
+		CERROR("Invalid portal %d not in [0-%d]\n",
+		       info->mi_portal, the_lnet.ln_nportals - 1);
+		return LNET_MATCHMD_DROP;
+	}
+
+	ptl = the_lnet.ln_portals[info->mi_portal];
+	rc = lnet_ptl_match_early(ptl, msg);
+	if (rc != 0) /* portal type not set yet: msg is NONE or DROP */
+		return rc;
+
+	mtable = lnet_mt_of_match(info, msg);
+	lnet_res_lock(mtable->mt_cpt);
+
+	if (the_lnet.ln_shutdown) {
+		rc = LNET_MATCHMD_DROP;
+		goto out1;
+	}
+
+	rc = lnet_mt_match_md(mtable, info, msg);
+	if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 && mtable->mt_enabled) {
+		lnet_ptl_lock(ptl);
+		lnet_ptl_disable_mt(ptl, mtable->mt_cpt);
+		lnet_ptl_unlock(ptl);
+	}
+
+	if ((rc & LNET_MATCHMD_FINISH) != 0)	/* matched or dropping */
+		goto out1;
+
+	if (!msg->msg_rx_ready_delay)	/* not flagged for delaying: NONE */
+		goto out1;
+
+	LASSERT(lnet_ptl_is_lazy(ptl));
+	LASSERT(!msg->msg_rx_delayed);
+
+	/* NB: we don't expect "delay" can happen a lot */
+	if (lnet_ptl_is_unique(ptl) || LNET_CPT_NUMBER == 1) {
+		lnet_ptl_lock(ptl);	/* no other table to steal from */
+
+		msg->msg_rx_delayed = 1;
+		list_add_tail(&msg->msg_list, &ptl->ptl_msg_delayed);
+
+		lnet_ptl_unlock(ptl);
+		lnet_res_unlock(mtable->mt_cpt);
+
+	} else  {
+		lnet_res_unlock(mtable->mt_cpt);
+		rc = lnet_ptl_match_delay(ptl, info, msg);
+	}
+
+	if (msg->msg_rx_delayed) {
+		CDEBUG(D_NET,
+		       "Delaying %s from %s ptl %d MB "LPX64" off %d len %d\n",
+		       info->mi_opc == LNET_MD_OP_PUT ? "PUT" : "GET",
+		       libcfs_id2str(info->mi_id), info->mi_portal,
+		       info->mi_mbits, info->mi_roffset, info->mi_rlength);
+	}
+	goto out0;	/* lnet_res_lock was already released above */
+ out1:
+	lnet_res_unlock(mtable->mt_cpt);
+ out0:
+	/* EXHAUSTED bit is only meaningful for internal functions */
+	return rc & ~LNET_MATCHMD_EXHAUSTED;
+}
+
+void
+lnet_ptl_detach_md(lnet_me_t *me, lnet_libmd_t *md)
+{
+	/* break the ME <-> MD link; they must point at each other */
+	LASSERT(me->me_md == md && md->md_me == me);
+	md->md_me = NULL;
+	me->me_md = NULL;
+}
+
+/* link @md to @me, then offer it to queued messages; lnet_res_lock held */
+void
+lnet_ptl_attach_md(lnet_me_t *me, lnet_libmd_t *md,
+		   struct list_head *matches, struct list_head *drops)
+{
+	struct lnet_portal	*ptl = the_lnet.ln_portals[me->me_portal];
+	struct lnet_match_table	*mtable;
+	struct list_head		*head;
+	lnet_msg_t		*tmp;
+	lnet_msg_t		*msg;
+	int			exhausted = 0;
+	int			cpt;
+
+	LASSERT(md->md_refcount == 0); /* a brand new MD */
+
+	me->me_md = md;
+	md->md_me = me;
+
+	cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie);
+	mtable = ptl->ptl_mtables[cpt];
+
+	if (list_empty(&ptl->ptl_msg_stealing) &&	/* checked w/o ptl lock */
+	    list_empty(&ptl->ptl_msg_delayed) &&
+	    !lnet_mt_test_exhausted(mtable, me->me_pos))
+		return;	/* nobody is waiting and no bit needs clearing */
+
+	lnet_ptl_lock(ptl);
+	head = &ptl->ptl_msg_stealing;	/* stealing list first, then delayed */
+ again:
+	list_for_each_entry_safe(msg, tmp, head, msg_list) {
+		struct lnet_match_info	info;
+		lnet_hdr_t		*hdr;
+		int			rc;
+
+		LASSERT(msg->msg_rx_delayed || head == &ptl->ptl_msg_stealing);
+
+		hdr   = &msg->msg_hdr;
+		info.mi_id.nid	= hdr->src_nid;
+		info.mi_id.pid	= hdr->src_pid;
+		info.mi_opc	= LNET_MD_OP_PUT; /* queued msgs are taken as PUTs */
+		info.mi_portal	= hdr->msg.put.ptl_index;
+		info.mi_rlength	= hdr->payload_length;
+		info.mi_roffset	= hdr->msg.put.offset;
+		info.mi_mbits	= hdr->msg.put.match_bits;
+
+		rc = lnet_try_match_md(md, &info, msg);
+
+		exhausted = (rc & LNET_MATCHMD_EXHAUSTED) != 0;
+		if ((rc & LNET_MATCHMD_NONE) != 0) {
+			if (exhausted)
+				break;
+			continue;
+		}
+
+		/* Hurrah! This _is_ a match */
+		LASSERT((rc & LNET_MATCHMD_FINISH) != 0);
+		list_del_init(&msg->msg_list);
+
+		if (head == &ptl->ptl_msg_stealing) {
+			if (exhausted)
+				break;
+			/* stealing thread will handle the message */
+			continue;
+		}
+
+		if ((rc & LNET_MATCHMD_OK) != 0) {
+			list_add_tail(&msg->msg_list, matches);
+
+			CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d "
+			       "match "LPU64" offset %d length %d.\n",
+			       libcfs_id2str(info.mi_id),
+			       info.mi_portal, info.mi_mbits,
+			       info.mi_roffset, info.mi_rlength);
+		} else {
+			list_add_tail(&msg->msg_list, drops);
+		}
+
+		if (exhausted)
+			break;
+	}
+
+	if (!exhausted && head == &ptl->ptl_msg_stealing) {
+		head = &ptl->ptl_msg_delayed;
+		goto again;
+	}
+
+	if (lnet_ptl_is_wildcard(ptl) && !exhausted) {
+		lnet_mt_set_exhausted(mtable, me->me_pos, 0); /* MD has room */
+		if (!mtable->mt_enabled)
+			lnet_ptl_enable_mt(ptl, cpt);
+	}
+
+	lnet_ptl_unlock(ptl);
+}
+
+void
+lnet_ptl_cleanup(struct lnet_portal *ptl)
+{	/* free all match-tables of @ptl, including MEs still linked on them */
+	struct lnet_match_table	*mtable;
+	int			i;
+
+	if (ptl->ptl_mtables == NULL) /* uninitialized portal */
+		return;
+
+	LASSERT(list_empty(&ptl->ptl_msg_delayed));
+	LASSERT(list_empty(&ptl->ptl_msg_stealing));
+	cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) {
+		struct list_head	*mhash;
+		lnet_me_t	*me;
+		int		j;
+
+		if (mtable->mt_mhash == NULL) /* uninitialized match-table */
+			continue;
+
+		mhash = mtable->mt_mhash;
+		/* cleanup ME: any ME still linked here is reported and freed */
+		for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) {
+			while (!list_empty(&mhash[j])) {
+				me = list_entry(mhash[j].next,
+						    lnet_me_t, me_list);
+				CERROR("Active ME %p on exit\n", me);
+				list_del(&me->me_list);
+				lnet_me_free(me);
+			}
+		}
+		/* the extra entry is for MEs with ignore bits */
+		LIBCFS_FREE(mhash, sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1));
+	}
+
+	cfs_percpt_free(ptl->ptl_mtables);
+	ptl->ptl_mtables = NULL;	/* marks the portal as uninitialized */
+}
+
+int
+lnet_ptl_setup(struct lnet_portal *ptl, int index)
+{	/* set up portal @index: lists, lock and one match-table per partition */
+	struct lnet_match_table	*mtable;
+	struct list_head		*mhash;
+	int			i;
+	int			j;
+
+	ptl->ptl_mtables = cfs_percpt_alloc(lnet_cpt_table(),
+					    sizeof(struct lnet_match_table));
+	if (ptl->ptl_mtables == NULL) {
+		CERROR("Failed to create match table for portal %d\n", index);
+		return -ENOMEM;
+	}
+
+	ptl->ptl_index = index;
+	INIT_LIST_HEAD(&ptl->ptl_msg_delayed);
+	INIT_LIST_HEAD(&ptl->ptl_msg_stealing);
+	spin_lock_init(&ptl->ptl_lock);
+	cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) {
+		/* the extra entry is for MEs with ignore bits */
+		LIBCFS_CPT_ALLOC(mhash, lnet_cpt_table(), i,
+				 sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1));
+		if (mhash == NULL) {
+			CERROR("Failed to create match hash for portal %d\n",
+			       index);
+			goto failed;
+		}
+
+		memset(&mtable->mt_exhausted[0], -1,	/* all bits set */
+		       sizeof(mtable->mt_exhausted[0]) *
+		       LNET_MT_EXHAUSTED_BMAP);
+		mtable->mt_mhash = mhash;
+		for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++)
+			INIT_LIST_HEAD(&mhash[j]);
+
+		mtable->mt_portal = index;
+		mtable->mt_cpt = i;
+	}
+
+	return 0;
+ failed:	/* lnet_ptl_cleanup() skips tables whose mt_mhash is NULL */
+	lnet_ptl_cleanup(ptl);
+	return -ENOMEM;
+}
+
+void
+lnet_portals_destroy(void)
+{
+	int	idx;
+
+	/* nothing to do if the portal table was never allocated */
+	if (the_lnet.ln_portals != NULL) {
+		for (idx = 0; idx < the_lnet.ln_nportals; idx++)
+			lnet_ptl_cleanup(the_lnet.ln_portals[idx]);
+
+		cfs_array_free(the_lnet.ln_portals);
+		the_lnet.ln_portals = NULL;
+	}
+}
+
+int
+lnet_portals_create(void)
+{
+	int	ptl_size;
+	int	idx;
+
+	/* each portal ends with one ptl_mt_maps[] slot per partition */
+	ptl_size = offsetof(struct lnet_portal, ptl_mt_maps[LNET_CPT_NUMBER]);
+	the_lnet.ln_nportals = MAX_PORTALS;
+	the_lnet.ln_portals = cfs_array_alloc(the_lnet.ln_nportals, ptl_size);
+	if (the_lnet.ln_portals == NULL) {
+		CERROR("Failed to allocate portals table\n");
+		return -ENOMEM;
+	}
+
+	for (idx = 0; idx < the_lnet.ln_nportals; idx++) {
+		if (lnet_ptl_setup(the_lnet.ln_portals[idx], idx) == 0)
+			continue;
+		lnet_portals_destroy();	/* undo the partial setup */
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/**
+ * Turn on the lazy portal attribute. Use with caution!
+ *
+ * This portal attribute only affects incoming PUT requests to the portal,
+ * and is off by default. By default, if there's no matching MD for an
+ * incoming PUT request, it is simply dropped. With the lazy attribute on,
+ * such requests are queued indefinitely until either a matching MD is
+ * posted to the portal or the lazy attribute is turned off.
+ *
+ * It would prevent dropped requests, however it should be regarded as the
+ * last line of defense - i.e. users must keep a close watch on active
+ * buffers on a lazy portal and once it becomes too low post more buffers as
+ * soon as possible. This is because delayed requests usually have detrimental
+ * effects on underlying network connections. A few delayed requests often
+ * suffice to bring an underlying connection to a complete halt, due to flow
+ * control mechanisms.
+ *
+ * There's also a DOS attack risk. If users don't post match-all MDs on a
+ * lazy portal, a malicious peer can easily stop a service by sending some
+ * PUT requests with match bits that won't match any MD. A routed server is
+ * especially vulnerable since the connections to its neighbor routers are
+ * shared among all clients.
+ *
+ * \param portal Index of the portal to enable the lazy attribute on.
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If \a portal is not a valid index.
+ */
+int
+LNetSetLazyPortal(int portal)
+{
+	struct lnet_portal	*lazy_ptl;
+
+	if (!(portal >= 0 && portal < the_lnet.ln_nportals))
+		return -EINVAL;
+
+	lazy_ptl = the_lnet.ln_portals[portal];
+	CDEBUG(D_NET, "Setting portal %d lazy\n", portal);
+
+	/* set the flag under both lnet_res_lock(LNET_LOCK_EX) and the
+	 * portal lock, taken in that order */
+	lnet_res_lock(LNET_LOCK_EX);
+	lnet_ptl_lock(lazy_ptl);
+	lnet_ptl_setopt(lazy_ptl, LNET_PTL_LAZY);
+	lnet_ptl_unlock(lazy_ptl);
+	lnet_res_unlock(LNET_LOCK_EX);
+
+	return 0;
+}
+EXPORT_SYMBOL(LNetSetLazyPortal);
+
+/**
+ * Turn off the lazy portal attribute. Delayed requests on the portal,
+ * if any, will be all dropped when this function returns.
+ *
+ * \param portal Index of the portal to disable the lazy attribute on.
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If \a portal is not a valid index.
+ */
+int
+LNetClearLazyPortal(int portal)
+{
+	struct lnet_portal	*ptl;
+	LIST_HEAD		(zombies);	/* delayed msgs to be dropped */
+
+	if (portal < 0 || portal >= the_lnet.ln_nportals)
+		return -EINVAL;
+
+	ptl = the_lnet.ln_portals[portal];
+
+	lnet_res_lock(LNET_LOCK_EX);
+	lnet_ptl_lock(ptl);
+
+	if (!lnet_ptl_is_lazy(ptl)) {	/* not lazy: nothing to undo */
+		lnet_ptl_unlock(ptl);
+		lnet_res_unlock(LNET_LOCK_EX);
+		return 0;
+	}
+
+	if (the_lnet.ln_shutdown)
+		CWARN("Active lazy portal %d on exit\n", portal);
+	else
+		CDEBUG(D_NET, "clearing portal %d lazy\n", portal);
+
+	/* grab all the blocked messages atomically (both locks are held) */
+	list_splice_init(&ptl->ptl_msg_delayed, &zombies);
+
+	lnet_ptl_unsetopt(ptl, LNET_PTL_LAZY);
+
+	lnet_ptl_unlock(ptl);
+	lnet_res_unlock(LNET_LOCK_EX);
+	/* the detached messages are dropped after the locks are released */
+	lnet_drop_delayed_msg_list(&zombies, "Clearing lazy portal attr");
+
+	return 0;
+}
+EXPORT_SYMBOL(LNetClearLazyPortal);
diff --git a/drivers/staging/lustre/lnet/lnet/lo.c b/drivers/staging/lustre/lnet/lnet/lo.c
new file mode 100644
index 000000000000..670dae34107c
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lo.c
@@ -0,0 +1,120 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+int
+lolnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+	/* loopback send: hand the message straight back to lnet_parse() */
+	LASSERT(!lntmsg->msg_routing);
+	LASSERT(!lntmsg->msg_target_is_router);
+	return lnet_parse(ni, &lntmsg->msg_hdr, ni->ni_nid, lntmsg, 0);
+}
+
+int
+lolnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+	   int delayed, unsigned int niov,
+	   struct iovec *iov, lnet_kiov_t *kiov,
+	   unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+	lnet_msg_t	*txmsg = private; /* sender's message, via @private */
+
+	if (lntmsg == NULL)		/* receiver is discarding the payload */
+		goto out;
+
+	/* copy from the sender's buffer (iovec or kiov) into the receiver's
+	 * buffer (iovec or kiov) with the helper for that combination */
+	if (txmsg->msg_iov != NULL && iov != NULL)
+		lnet_copy_iov2iov(niov, iov, offset,
+				  txmsg->msg_niov,
+				  txmsg->msg_iov,
+				  txmsg->msg_offset, mlen);
+	else if (txmsg->msg_iov != NULL)
+		lnet_copy_iov2kiov(niov, kiov, offset,
+				   txmsg->msg_niov,
+				   txmsg->msg_iov,
+				   txmsg->msg_offset, mlen);
+	else if (iov != NULL)
+		lnet_copy_kiov2iov(niov, iov, offset,
+				   txmsg->msg_niov,
+				   txmsg->msg_kiov,
+				   txmsg->msg_offset, mlen);
+	else
+		lnet_copy_kiov2kiov(niov, kiov, offset,
+				    txmsg->msg_niov,
+				    txmsg->msg_kiov,
+				    txmsg->msg_offset, mlen);
+
+	lnet_finalize(ni, lntmsg, 0);
+ out:
+	lnet_finalize(ni, txmsg, 0);
+	return 0;
+}
+
+static int lolnd_instanced;	/* 1 between lolnd_startup() and shutdown */
+
+void
+lolnd_shutdown(lnet_ni_t *ni)
+{
+	CDEBUG(D_NET, "shutdown\n");
+	/* the single loopback instance must be up; mark it as gone */
+	LASSERT(lolnd_instanced);
+	lolnd_instanced = 0;
+}
+
+int
+lolnd_startup(lnet_ni_t *ni)
+{
+	/* only the_lolnd may start here, and only one instance at a time */
+	LASSERT(ni->ni_lnd == &the_lolnd);
+	LASSERT(!lolnd_instanced);
+	lolnd_instanced = 1;
+	return 0;
+}
+
+lnd_t the_lolnd = {	/* NOTE(review): positional; verify order against lnd_t */
+	/* .lnd_list       = */ {&the_lolnd.lnd_list, &the_lolnd.lnd_list},
+	/* .lnd_refcount   = */ 0,
+	/* .lnd_type       = */ LOLND,
+	/* .lnd_startup    = */ lolnd_startup,
+	/* .lnd_shutdown   = */ lolnd_shutdown,
+	/* .lnd_ctl        = */ NULL,
+	/* .lnd_send       = */ lolnd_send,
+	/* .lnd_recv       = */ lolnd_recv,
+	/* .lnd_eager_recv = */ NULL,
+	/* .lnd_notify     = */ NULL,
+	/* .lnd_accept     = */ NULL
+};
diff --git a/drivers/staging/lustre/lnet/lnet/module.c b/drivers/staging/lustre/lnet/lnet/module.c
new file mode 100644
index 000000000000..c8323854580a
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/module.c
@@ -0,0 +1,154 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+static int config_on_load = 0;	/* checked once, in init_lnet() */
+CFS_MODULE_PARM(config_on_load, "i", int, 0444,
+		"configure network at module load");
+
+static struct mutex lnet_config_mutex;	/* serializes (un)configure */
+
+int
+lnet_configure(void *arg)
+{
+	/* @arg is unused; it only makes this usable as a kthread function */
+	int	rc = 0;
+
+	LNET_MUTEX_LOCK(&lnet_config_mutex);
+	if (the_lnet.ln_niinit_self)	/* already brought up from here */
+		goto out;
+
+	rc = LNetNIInit(LUSTRE_SRV_LNET_PID);
+	if (rc >= 0) {	/* success: lnet_unconfigure() must LNetNIFini() */
+		the_lnet.ln_niinit_self = 1;
+		rc = 0;
+	}
+ out:
+	LNET_MUTEX_UNLOCK(&lnet_config_mutex);
+	return rc;
+}
+
+int
+lnet_unconfigure(void)
+{
+	int	busy;
+
+	LNET_MUTEX_LOCK(&lnet_config_mutex);
+	/* undo the LNetNIInit() done by lnet_configure(), if there was one */
+	if (the_lnet.ln_niinit_self) {
+		the_lnet.ln_niinit_self = 0;
+		LNetNIFini();
+	}
+
+	/* a non-zero refcount after that is reported as -EBUSY */
+	LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex);
+	busy = the_lnet.ln_refcount != 0;
+	LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex);
+	LNET_MUTEX_UNLOCK(&lnet_config_mutex);
+	return busy ? -EBUSY : 0;
+}
+
+int
+lnet_ioctl(unsigned int cmd, struct libcfs_ioctl_data *data)
+{
+	int	rc;
+
+	/* bringing the network up or down is handled right here */
+	if (cmd == IOC_LIBCFS_CONFIGURE)
+		return lnet_configure(NULL);
+
+	if (cmd == IOC_LIBCFS_UNCONFIGURE)
+		return lnet_unconfigure();
+
+	/* Everything else goes to LNetCtl().  Passing LNET_PID_ANY only
+	 * gives me a ref if the net is up already; I'll need it to ensure
+	 * the net can't go down while I'm called into it */
+	rc = LNetNIInit(LNET_PID_ANY);
+	if (rc < 0)
+		return rc;
+
+	rc = LNetCtl(cmd, data);
+	LNetNIFini();
+
+	return rc;
+}
+
+DECLARE_IOCTL_HANDLER(lnet_ioctl_handler, lnet_ioctl);
+
+int
+init_lnet(void)
+{	/* module init: LNetInit(), ioctl handler, optional auto-configure */
+	int		  rc;
+	ENTRY;
+
+	mutex_init(&lnet_config_mutex);	/* before lnet_configure() can run */
+
+	rc = LNetInit();
+	if (rc != 0) {
+		CERROR("LNetInit: error %d\n", rc);
+		RETURN(rc);
+	}
+
+	rc = libcfs_register_ioctl(&lnet_ioctl_handler);
+	LASSERT (rc == 0);
+
+	if (config_on_load) {
+		/* Have to schedule a separate thread to avoid deadlocking in
+		 * modload; NOTE(review): a kthread_run() error is ignored */
+		(void) kthread_run(lnet_configure, NULL, "lnet_initd");
+	}
+
+	RETURN(0);
+}
+
+void
+fini_lnet(void)
+{
+	int	rc = libcfs_deregister_ioctl(&lnet_ioctl_handler);
+
+	/* the ioctl handler is dropped first, then LNet is shut down */
+	LASSERT(rc == 0);
+
+	LNetFini();
+}
+
+MODULE_AUTHOR("Peter J. Braam <braam@clusterfs.com>");
+MODULE_DESCRIPTION("Portals v3.1");
+MODULE_LICENSE("GPL");
+
+cfs_module(lnet, "1.0.0", init_lnet, fini_lnet);
diff --git a/drivers/staging/lustre/lnet/lnet/peer.c b/drivers/staging/lustre/lnet/lnet/peer.c
new file mode 100644
index 000000000000..286977691393
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/peer.c
@@ -0,0 +1,337 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/peer.c
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+int
+lnet_peer_tables_create(void)
+{
+	struct lnet_peer_table	*ptable;
+	struct list_head		*hash;
+	int			cpt;
+	int			slot;
+
+	the_lnet.ln_peer_tables = cfs_percpt_alloc(lnet_cpt_table(),
+						   sizeof(*ptable));
+	if (the_lnet.ln_peer_tables == NULL) {
+		CERROR("Failed to allocate cpu-partition peer tables\n");
+		return -ENOMEM;
+	}
+
+	cfs_percpt_for_each(ptable, cpt, the_lnet.ln_peer_tables) {
+		INIT_LIST_HEAD(&ptable->pt_deathrow);
+
+		LIBCFS_CPT_ALLOC(hash, lnet_cpt_table(), cpt,
+				 LNET_PEER_HASH_SIZE * sizeof(*hash));
+		if (hash == NULL) {
+			CERROR("Failed to create peer hash table\n");
+			lnet_peer_tables_destroy(); /* undoes earlier tables */
+			return -ENOMEM;
+		}
+		for (slot = 0; slot < LNET_PEER_HASH_SIZE; slot++)
+			INIT_LIST_HEAD(&hash[slot]);
+		/* a non-NULL pt_hash marks the table as initialized */
+		ptable->pt_hash = hash;
+	}
+
+	return 0;
+}
+
+void
+lnet_peer_tables_destroy(void)
+{
+	struct lnet_peer_table	*ptable;
+	struct list_head		*hash;
+	int			i;
+	int			j;
+
+	if (the_lnet.ln_peer_tables == NULL)	/* nothing was allocated */
+		return;
+
+	cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+		hash = ptable->pt_hash;
+		if (hash == NULL) /* not initialized: create bailed out here */
+			break;
+
+		LASSERT(list_empty(&ptable->pt_deathrow));
+
+		ptable->pt_hash = NULL;
+		for (j = 0; j < LNET_PEER_HASH_SIZE; j++)
+			LASSERT(list_empty(&hash[j])); /* no peer may remain */
+
+		LIBCFS_FREE(hash, LNET_PEER_HASH_SIZE * sizeof(*hash));
+	}
+
+	cfs_percpt_free(the_lnet.ln_peer_tables);
+	the_lnet.ln_peer_tables = NULL;	/* makes a repeated call a no-op */
+}
+
+void
+lnet_peer_tables_cleanup(void)
+{
+	struct lnet_peer_table	*ptable;
+	int			i;
+	int			j;
+
+	LASSERT(the_lnet.ln_shutdown);	/* i.e. no new peers */
+
+	cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+		lnet_net_lock(i);	/* pass 1: unhash every peer */
+
+		for (j = 0; j < LNET_PEER_HASH_SIZE; j++) {
+			struct list_head *peers = &ptable->pt_hash[j];
+
+			while (!list_empty(peers)) {
+				lnet_peer_t *lp = list_entry(peers->next,
+								 lnet_peer_t,
+								 lp_hashlist);
+				list_del_init(&lp->lp_hashlist);
+				/* drop the hash table's ref */
+				lnet_peer_decref_locked(lp);
+			}
+		}
+
+		lnet_net_unlock(i);
+	}
+
+	cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+		LIST_HEAD	(deathrow);
+		lnet_peer_t	*lp;
+
+		lnet_net_lock(i);	/* pass 2: wait, then free */
+
+		for (j = 3; ptable->pt_number != 0; j++) {
+			lnet_net_unlock(i);
+
+			if ((j & (j - 1)) == 0) { /* warn at powers of 2 only */
+				CDEBUG(D_WARNING,
+				       "Waiting for %d peers on peer table\n",
+				       ptable->pt_number);
+			}
+			cfs_pause(cfs_time_seconds(1) / 2);
+			lnet_net_lock(i);
+		}
+		list_splice_init(&ptable->pt_deathrow, &deathrow);
+
+		lnet_net_unlock(i);
+
+		while (!list_empty(&deathrow)) { /* free without the lock */
+			lp = list_entry(deathrow.next,
+					    lnet_peer_t, lp_hashlist);
+			list_del(&lp->lp_hashlist);
+			LIBCFS_FREE(lp, sizeof(*lp));
+		}
+	}
+}
+
+void
+lnet_destroy_peer_locked(lnet_peer_t *lp)
+{
+	struct lnet_peer_table *ptable;
+
+	LASSERT(lp->lp_refcount == 0);	/* unreferenced, unhashed and idle */
+	LASSERT(lp->lp_rtr_refcount == 0);
+	LASSERT(list_empty(&lp->lp_txq));
+	LASSERT(list_empty(&lp->lp_hashlist));
+	LASSERT(lp->lp_txqnob == 0);
+
+	ptable = the_lnet.ln_peer_tables[lp->lp_cpt];
+	LASSERT(ptable->pt_number > 0);
+	ptable->pt_number--;	/* table cleanup waits for this to reach 0 */
+
+	lnet_ni_decref_locked(lp->lp_ni, lp->lp_cpt);	/* peer's NI ref */
+	lp->lp_ni = NULL;	/* memory is parked on pt_deathrow below */
+
+	list_add(&lp->lp_hashlist, &ptable->pt_deathrow);
+}
+
+lnet_peer_t *
+lnet_find_peer_locked(struct lnet_peer_table *ptable, lnet_nid_t nid)
+{
+	struct list_head	*bucket;
+	lnet_peer_t	*peer;
+
+	LASSERT(!the_lnet.ln_shutdown);
+
+	/* walk the hash chain of @nid; a hit is returned with an extra ref */
+	bucket = &ptable->pt_hash[lnet_nid2peerhash(nid)];
+	list_for_each_entry(peer, bucket, lp_hashlist) {
+		if (peer->lp_nid != nid)
+			continue;
+		lnet_peer_addref_locked(peer);
+		return peer;
+	}
+	return NULL;
+}
+
+int
+lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt)
+{
+	struct lnet_peer_table	*ptable;
+	lnet_peer_t		*lp = NULL;
+	lnet_peer_t		*lp2;
+	int			cpt2;
+	int			rc = 0;
+
+	*lpp = NULL;
+	if (the_lnet.ln_shutdown) /* it's shutting down */
+		return -ESHUTDOWN;
+
+	/* cpt can be LNET_LOCK_EX if it's called from router functions */
+	cpt2 = cpt != LNET_LOCK_EX ? cpt : lnet_cpt_of_nid_locked(nid);
+
+	ptable = the_lnet.ln_peer_tables[cpt2];
+	lp = lnet_find_peer_locked(ptable, nid);
+	if (lp != NULL) {	/* already known: pass on the lookup's ref */
+		*lpp = lp;
+		return 0;
+	}
+
+	if (!list_empty(&ptable->pt_deathrow)) { /* recycle a dead peer */
+		lp = list_entry(ptable->pt_deathrow.next,
+				    lnet_peer_t, lp_hashlist);
+		list_del(&lp->lp_hashlist);
+	}
+
+	/*
+	 * Bump pt_number before unlocking so that a concurrent shutdown
+	 * can't destroy locks and peer-table before I finish the allocation
+	 */
+	ptable->pt_number++;
+	lnet_net_unlock(cpt);	/* set the peer up without the lock */
+
+	if (lp != NULL)
+		memset(lp, 0, sizeof(*lp));
+	else
+		LIBCFS_CPT_ALLOC(lp, lnet_cpt_table(), cpt2, sizeof(*lp));
+
+	if (lp == NULL) {
+		rc = -ENOMEM;
+		lnet_net_lock(cpt);	/* callers expect the lock held */
+		goto out;
+	}
+
+	INIT_LIST_HEAD(&lp->lp_txq);
+	INIT_LIST_HEAD(&lp->lp_rtrq);
+	INIT_LIST_HEAD(&lp->lp_routes);
+
+	lp->lp_notify = 0;
+	lp->lp_notifylnd = 0;
+	lp->lp_notifying = 0;
+	lp->lp_alive_count = 0;
+	lp->lp_timestamp = 0;
+	lp->lp_alive = !lnet_peers_start_down(); /* 1 bit!! */
+	lp->lp_last_alive = cfs_time_current(); /* assumes alive */
+	lp->lp_last_query = 0; /* haven't asked NI yet */
+	lp->lp_ping_timestamp = 0;
+	lp->lp_ping_feats = LNET_PING_FEAT_INVAL;
+	lp->lp_nid = nid;
+	lp->lp_cpt = cpt2;
+	lp->lp_refcount = 2;	/* 1 for caller; 1 for hash */
+	lp->lp_rtr_refcount = 0;
+
+	lnet_net_lock(cpt);	/* re-check everything under the lock */
+
+	if (the_lnet.ln_shutdown) {
+		rc = -ESHUTDOWN;
+		goto out;
+	}
+
+	lp2 = lnet_find_peer_locked(ptable, nid);
+	if (lp2 != NULL) {	/* somebody else added it meanwhile */
+		*lpp = lp2;
+		goto out;
+	}
+
+	lp->lp_ni = lnet_net2ni_locked(LNET_NIDNET(nid), cpt2);
+	if (lp->lp_ni == NULL) {	/* no local NI on the peer's net */
+		rc = -EHOSTUNREACH;
+		goto out;
+	}
+
+	lp->lp_txcredits    =
+	lp->lp_mintxcredits = lp->lp_ni->ni_peertxcredits;
+	lp->lp_rtrcredits    =
+	lp->lp_minrtrcredits = lnet_peer_buffer_credits(lp->lp_ni);
+
+	list_add_tail(&lp->lp_hashlist,
+			  &ptable->pt_hash[lnet_nid2peerhash(nid)]);
+	ptable->pt_version++;
+	*lpp = lp;
+
+	return 0;
+out:
+	if (lp != NULL)		/* unused peer goes (back) to deathrow */
+		list_add(&lp->lp_hashlist, &ptable->pt_deathrow);
+	ptable->pt_number--;	/* undo the increment made above */
+	return rc;
+}
+
+void
+lnet_debug_peer(lnet_nid_t nid)
+{
+	lnet_peer_t	*peer;
+	char		*state;
+	int		cpt = lnet_cpt_of_nid(nid);
+
+	lnet_net_lock(cpt);
+
+	/* NB this creates the peer if it is not in the table yet */
+	if (lnet_nid2peer_locked(&peer, nid, cpt) != 0) {
+		lnet_net_unlock(cpt);
+		CDEBUG(D_WARNING, "No peer %s\n", libcfs_nid2str(nid));
+		return;
+	}
+
+	/* aliveness is reported for routers or when it is being tracked */
+	if (!lnet_isrouter(peer) && !lnet_peer_aliveness_enabled(peer))
+		state = "NA";
+	else
+		state = peer->lp_alive ? "up" : "down";
+
+	CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n",
+	       libcfs_nid2str(peer->lp_nid), peer->lp_refcount,
+	       state, peer->lp_ni->ni_peertxcredits,
+	       peer->lp_rtrcredits, peer->lp_minrtrcredits,
+	       peer->lp_txcredits, peer->lp_mintxcredits, peer->lp_txqnob);
+
+	lnet_peer_decref_locked(peer);	/* ref from lnet_nid2peer_locked() */
+	lnet_net_unlock(cpt);
+}
diff --git a/drivers/staging/lustre/lnet/lnet/router.c b/drivers/staging/lustre/lnet/lnet/router.c
new file mode 100644
index 000000000000..c5ff97aaacc3
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/router.c
@@ -0,0 +1,1693 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ *   This file is part of Portals
+ *   http://sourceforge.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+#if  defined(LNET_ROUTER)
+
+#define LNET_NRB_TINY_MIN	512	/* min value for each CPT */
+#define LNET_NRB_TINY		(LNET_NRB_TINY_MIN * 4)
+#define LNET_NRB_SMALL_MIN	4096	/* min value for each CPT */
+#define LNET_NRB_SMALL		(LNET_NRB_SMALL_MIN * 4)
+#define LNET_NRB_LARGE_MIN	256	/* min value for each CPT */
+#define LNET_NRB_LARGE		(LNET_NRB_LARGE_MIN * 4)
+
+static char *forwarding = "";
+CFS_MODULE_PARM(forwarding, "s", charp, 0444,
+		"Explicitly enable/disable forwarding between networks");
+
+static int tiny_router_buffers;
+CFS_MODULE_PARM(tiny_router_buffers, "i", int, 0444,
+		"# of 0 payload messages to buffer in the router");
+static int small_router_buffers;
+CFS_MODULE_PARM(small_router_buffers, "i", int, 0444,
+		"# of small (1 page) messages to buffer in the router");
+static int large_router_buffers;
+CFS_MODULE_PARM(large_router_buffers, "i", int, 0444,
+		"# of large messages to buffer in the router");
+static int peer_buffer_credits = 0;
+CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444,
+		"# router buffer credits per peer");
+
+static int auto_down = 1;
+CFS_MODULE_PARM(auto_down, "i", int, 0444,
+		"Automatically mark peers down on comms error");
+
+int
+lnet_peer_buffer_credits(lnet_ni_t *ni)
+{
+	int	credits = ni->ni_peertxcredits;
+
+	/* NI option first, then the module parameter; failing both, allow
+	 * as many router buffers as the peer has outstanding sends */
+	if (ni->ni_peerrtrcredits > 0)
+		credits = ni->ni_peerrtrcredits;
+	else if (peer_buffer_credits > 0)
+		credits = peer_buffer_credits;
+	return credits;
+}
+
+/* forward ref's */
+static int lnet_router_checker(void *);
+#else
+
+int
+lnet_peer_buffer_credits(lnet_ni_t *ni)
+{
+	return 0;	/* variant used when LNET_ROUTER is not defined */
+}
+
+#endif
+
+static int check_routers_before_use = 0;
+CFS_MODULE_PARM(check_routers_before_use, "i", int, 0444,
+		"Assume routers are down and ping them before use");
+
+static int avoid_asym_router_failure = 1;
+CFS_MODULE_PARM(avoid_asym_router_failure, "i", int, 0644,
+		"Avoid asymmetrical router failures (0 to disable)");
+
+static int dead_router_check_interval = 60;
+CFS_MODULE_PARM(dead_router_check_interval, "i", int, 0644,
+		"Seconds between dead router health checks (<= 0 to disable)");
+
+static int live_router_check_interval = 60;
+CFS_MODULE_PARM(live_router_check_interval, "i", int, 0644,
+		"Seconds between live router health checks (<= 0 to disable)");
+
+static int router_ping_timeout = 50;
+CFS_MODULE_PARM(router_ping_timeout, "i", int, 0644,
+		"Seconds to wait for the reply to a router health query");
+
+int
+lnet_peers_start_down(void)
+{
+	return check_routers_before_use; /* non-0: new peers start "down" */
+}
+
+void
+lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, cfs_time_t when)
+{
+	if (cfs_time_before(when, lp->lp_timestamp)) { /* out of date information */
+		CDEBUG(D_NET, "Out of date\n");
+		return;
+	}
+
+	lp->lp_timestamp = when;		/* remember the newest report */
+	lp->lp_ping_deadline = 0;	       /* disable ping timeout */
+
+	if (lp->lp_alive_count != 0 &&	  /* state already known... */
+	    (!lp->lp_alive) == (!alive)) {      /* ...and unchanged */
+		CDEBUG(D_NET, "Old news\n");
+		return;
+	}
+
+	/* First report or a state change: flag a pending notification */
+
+	lp->lp_alive_count++;
+	lp->lp_alive = !(!alive);	       /* 1 bit! */
+	lp->lp_notify = 1;	/* consumed by lnet_ni_notify_locked() */
+	lp->lp_notifylnd |= notifylnd;
+	if (lp->lp_alive)
+		lp->lp_ping_feats = LNET_PING_FEAT_INVAL; /* reset */
+
+	CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lp_nid), alive);
+}
+
+void
+lnet_ni_notify_locked(lnet_ni_t *ni, lnet_peer_t *lp)
+{
+	int	alive;
+	int	notifylnd;
+
+	/* Notify only in 1 thread at any time to ensure ordered notification.
+	 * NB individual events can be missed; the only guarantee is that you
+	 * always get the most recent news */
+
+	if (lp->lp_notifying)	/* another caller is already notifying */
+		return;
+
+	lp->lp_notifying = 1;
+
+	while (lp->lp_notify) {
+		alive     = lp->lp_alive;	/* snapshot before unlocking */
+		notifylnd = lp->lp_notifylnd;
+
+		lp->lp_notifylnd = 0;
+		lp->lp_notify    = 0;	/* consumed; may be set again below */
+
+		if (notifylnd && ni->ni_lnd->lnd_notify != NULL) {
+			lnet_net_unlock(lp->lp_cpt);
+
+			/* A new notification could happen now; I'll handle it
+			 * when control returns to me */
+
+			(ni->ni_lnd->lnd_notify)(ni, lp->lp_nid, alive);
+
+			lnet_net_lock(lp->lp_cpt);
+		}
+	}
+
+	lp->lp_notifying = 0;
+}
+
+
+static void
+lnet_rtr_addref_locked(lnet_peer_t *lp)
+{
+	struct list_head *slot;
+
+	LASSERT(lp->lp_refcount > 0);
+	LASSERT(lp->lp_rtr_refcount >= 0);
+
+	/* NB the caller must hold lnet_net_lock exclusively */
+	if (++lp->lp_rtr_refcount != 1)
+		return;
+
+	/* First router reference on this peer: link it into ln_routers,
+	 * which is kept sorted by NID.  Search backwards for the last
+	 * router with a smaller NID and insert right behind it. */
+	list_for_each_prev(slot, &the_lnet.ln_routers) {
+		if (list_entry(slot, lnet_peer_t,
+			       lp_rtr_list)->lp_nid < lp->lp_nid)
+			break;
+	}
+	list_add(&lp->lp_rtr_list, slot);
+
+	/* the_lnet.ln_routers holds its own reference on the peer */
+	lnet_peer_addref_locked(lp);
+	the_lnet.ln_routers_version++;
+}
+
+static void
+lnet_rtr_decref_locked(lnet_peer_t *lp)
+{
+	LASSERT(lp->lp_refcount > 0);
+	LASSERT(lp->lp_rtr_refcount > 0);
+
+	/* NB the caller must hold lnet_net_lock exclusively */
+	if (--lp->lp_rtr_refcount != 0)
+		return;
+
+	/* the last router reference is gone: unlink from ln_routers */
+	LASSERT(list_empty(&lp->lp_routes));
+
+	if (lp->lp_rcd != NULL) {
+		/* park the router-checker data on ln_rcd_deathrow */
+		list_add(&lp->lp_rcd->rcd_list, &the_lnet.ln_rcd_deathrow);
+		lp->lp_rcd = NULL;
+	}
+	list_del(&lp->lp_rtr_list);
+	/* give back the reference held by the_lnet.ln_routers */
+	lnet_peer_decref_locked(lp);
+	the_lnet.ln_routers_version++;
+}
+
+lnet_remotenet_t *
+lnet_find_net_locked(__u32 net)
+{
+	lnet_remotenet_t	*found;
+	struct list_head		*chain;
+
+	LASSERT(!the_lnet.ln_shutdown);
+
+	/* only the hash chain that @net maps to needs searching */
+	chain = lnet_net2rnethash(net);
+	list_for_each_entry(found, chain, lrn_list) {
+		if (found->lrn_net == net)
+			return found;
+	}
+
+	/* no remote net with this number is known */
+	return NULL;
+}
+
+static void lnet_shuffle_seed(void)
+{
+	static int seeded = 0;
+	struct timeval now;
+	lnet_ni_t *ni;
+	int entropy[2];
+	int type;
+
+	if (seeded)	/* the generator is seeded only once */
+		return;
+
+	cfs_get_random_bytes(entropy, sizeof(entropy));
+
+	/* Nodes with small feet have little entropy, so mix in the NIDs of
+	 * this node as well: they give the most entropy in the low bits.
+	 * NIs of type LOLND are left out. */
+	list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+		type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid));
+		if (type == LOLND)
+			continue;
+
+		entropy[0] ^= (LNET_NIDADDR(ni->ni_nid) | type);
+	}
+
+	do_gettimeofday(&now);
+	cfs_srand(now.tv_sec ^ entropy[0], now.tv_usec ^ entropy[1]);
+	seeded = 1;
+}
+
+/* NB expects LNET_LOCK held */
+void
+lnet_add_route_to_rnet(lnet_remotenet_t *rnet, lnet_route_t *route)
+{
+	struct list_head	*pos;
+	unsigned int		nroutes = 0;
+	unsigned int		skip;
+
+	lnet_shuffle_seed();
+
+	list_for_each(pos, &rnet->lrn_routes)
+		nroutes++;
+
+	/* Insert at a random place: there are nroutes + 1 candidate
+	 * positions, which also keeps the modulus from ever being 0.
+	 * list_add() links the new entry right after wherever pos ends up. */
+	skip = cfs_rand() % (nroutes + 1);
+	list_for_each(pos, &rnet->lrn_routes) {
+		if (skip-- == 0)
+			break;
+	}
+	list_add(&route->lr_list, pos);
+	list_add(&route->lr_gwlist, &route->lr_gateway->lp_routes);
+
+	the_lnet.ln_remote_nets_version++;
+	lnet_rtr_addref_locked(route->lr_gateway);
+}
+
+int
+lnet_add_route (__u32 net, unsigned int hops, lnet_nid_t gateway)
+{
+	struct list_head	  *e;
+	lnet_remotenet_t    *rnet;	/* preallocated; used if net is new */
+	lnet_remotenet_t    *rnet2;	/* the remote net actually used */
+	lnet_route_t	*route;
+	lnet_ni_t	   *ni;
+	int		  add_route;
+	int		  rc;
+
+	CDEBUG(D_NET, "Add route: net %s hops %u gw %s\n",
+	       libcfs_net2str(net), hops, libcfs_nid2str(gateway));
+
+	if (gateway == LNET_NID_ANY ||
+	    LNET_NETTYP(LNET_NIDNET(gateway)) == LOLND ||
+	    net == LNET_NIDNET(LNET_NID_ANY) ||
+	    LNET_NETTYP(net) == LOLND ||
+	    LNET_NIDNET(gateway) == net ||	/* gateway on the target net */
+	    hops < 1 || hops > 255)
+		return (-EINVAL);
+
+	if (lnet_islocalnet(net))	       /* it's a local network */
+		return 0;		       /* ignore the route entry */
+
+	/* Allocate up front, assuming both the net and the route are new */
+	LIBCFS_ALLOC(route, sizeof(*route));
+	LIBCFS_ALLOC(rnet, sizeof(*rnet));
+	if (route == NULL || rnet == NULL) {
+		CERROR("Out of memory creating route %s %d %s\n",
+		       libcfs_net2str(net), hops, libcfs_nid2str(gateway));
+		if (route != NULL)
+			LIBCFS_FREE(route, sizeof(*route));
+		if (rnet != NULL)
+			LIBCFS_FREE(rnet, sizeof(*rnet));
+		return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&rnet->lrn_routes);
+	rnet->lrn_net = net;
+	route->lr_hops = hops;
+	route->lr_net = net;
+
+	lnet_net_lock(LNET_LOCK_EX);
+
+	rc = lnet_nid2peer_locked(&route->lr_gateway, gateway, LNET_LOCK_EX);
+	if (rc != 0) {
+		lnet_net_unlock(LNET_LOCK_EX);
+
+		LIBCFS_FREE(route, sizeof(*route));
+		LIBCFS_FREE(rnet, sizeof(*rnet));
+
+		if (rc == -EHOSTUNREACH) { /* gateway is not on a local net */
+			return 0;	/* ignore the route entry */
+		} else {
+			CERROR("Error %d creating route %s %d %s\n", rc,
+			       libcfs_net2str(net), hops,
+			       libcfs_nid2str(gateway));
+		}
+		return rc;
+	}
+
+	LASSERT (!the_lnet.ln_shutdown);
+
+	rnet2 = lnet_find_net_locked(net);
+	if (rnet2 == NULL) {
+		/* new network */
+		list_add_tail(&rnet->lrn_list, lnet_net2rnethash(net));
+		rnet2 = rnet;
+	}
+
+	/* Search for a duplicate route (it's a NOOP if it is) */
+	add_route = 1;
+	list_for_each (e, &rnet2->lrn_routes) {
+		lnet_route_t *route2 = list_entry(e, lnet_route_t, lr_list);
+
+		if (route2->lr_gateway == route->lr_gateway) {
+			add_route = 0;
+			break;
+		}
+
+		/* our lookups must be true */
+		LASSERT (route2->lr_gateway->lp_nid != gateway);
+	}
+
+	if (add_route) {
+		lnet_peer_addref_locked(route->lr_gateway); /* +1 for notify */
+		lnet_add_route_to_rnet(rnet2, route);
+
+		ni = route->lr_gateway->lp_ni;
+		lnet_net_unlock(LNET_LOCK_EX);	/* call the LND unlocked */
+
+		/* XXX Assume alive */
+		if (ni->ni_lnd->lnd_notify != NULL)
+			(ni->ni_lnd->lnd_notify)(ni, gateway, 1);
+
+		lnet_net_lock(LNET_LOCK_EX);
+	}
+
+	/* drop the notify ref, or (if !add_route) the lookup's ref */
+	lnet_peer_decref_locked(route->lr_gateway);
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	if (!add_route)		/* duplicate: discard the new route */
+		LIBCFS_FREE(route, sizeof(*route));
+
+	if (rnet != rnet2)	/* net existed: discard the new rnet */
+		LIBCFS_FREE(rnet, sizeof(*rnet));
+
+	return 0;
+}
+
+int
+lnet_check_routes(void)
+{
+	lnet_remotenet_t	*rnet;
+	lnet_route_t		*route;
+	lnet_route_t		*route2;	/* first route of the net */
+	struct list_head		*e1;
+	struct list_head		*e2;
+	int			cpt;
+	struct list_head		*rn_list;
+	int			i;
+
+	cpt = lnet_net_lock_current();
+
+	for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) {
+		rn_list = &the_lnet.ln_remote_nets_hash[i];
+		list_for_each(e1, rn_list) {
+			rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
+
+			route2 = NULL;
+			list_for_each(e2, &rnet->lrn_routes) {
+				lnet_nid_t	nid1;
+				lnet_nid_t	nid2;
+				int		net;
+
+				route = list_entry(e2, lnet_route_t,
+						       lr_list);
+
+				if (route2 == NULL) {
+					route2 = route;
+					continue;
+				}
+
+				if (route->lr_gateway->lp_ni ==
+				    route2->lr_gateway->lp_ni)
+					continue;	/* same local NI: OK */
+
+				nid1 = route->lr_gateway->lp_nid;
+				nid2 = route2->lr_gateway->lp_nid;
+				net = rnet->lrn_net;
+
+				lnet_net_unlock(cpt); /* log from the copies */
+
+				CERROR("Routes to %s via %s and %s not "
+				       "supported\n",
+				       libcfs_net2str(net),
+				       libcfs_nid2str(nid1),
+				       libcfs_nid2str(nid2));
+				return -EINVAL;
+			}
+		}
+	}
+
+	lnet_net_unlock(cpt);
+	return 0;
+}
+
+int
+lnet_del_route(__u32 net, lnet_nid_t gw_nid)
+{
+	struct lnet_peer	*gateway;
+	lnet_remotenet_t	*rnet;
+	lnet_route_t		*route;
+	struct list_head		*e1;
+	struct list_head		*e2;
+	int			rc = -ENOENT;	/* until a route is deleted */
+	struct list_head		*rn_list;
+	int			idx = 0;	/* bucket, when scanning all */
+
+	CDEBUG(D_NET, "Del route: net %s : gw %s\n",
+	       libcfs_net2str(net), libcfs_nid2str(gw_nid));
+
+	/* NB Caller may specify either all routes via the given gateway
+	 * or a specific route entry (actual NIDs) */
+
+	lnet_net_lock(LNET_LOCK_EX);
+	if (net == LNET_NIDNET(LNET_NID_ANY))
+		rn_list = &the_lnet.ln_remote_nets_hash[0];
+	else
+		rn_list = lnet_net2rnethash(net);
+
+ again:
+	list_for_each(e1, rn_list) {
+		rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
+
+		if (!(net == LNET_NIDNET(LNET_NID_ANY) ||
+			net == rnet->lrn_net))
+			continue;
+
+		list_for_each(e2, &rnet->lrn_routes) {
+			route = list_entry(e2, lnet_route_t, lr_list);
+
+			gateway = route->lr_gateway;
+			if (!(gw_nid == LNET_NID_ANY ||
+			      gw_nid == gateway->lp_nid))
+				continue;
+
+			list_del(&route->lr_list);
+			list_del(&route->lr_gwlist);
+			the_lnet.ln_remote_nets_version++;
+
+			if (list_empty(&rnet->lrn_routes))
+				list_del(&rnet->lrn_list);
+			else
+				rnet = NULL;	/* net has routes left: keep */
+
+			lnet_rtr_decref_locked(gateway);
+			lnet_peer_decref_locked(gateway);
+
+			lnet_net_unlock(LNET_LOCK_EX);	/* free unlocked */
+
+			LIBCFS_FREE(route, sizeof(*route));
+
+			if (rnet != NULL)
+				LIBCFS_FREE(rnet, sizeof(*rnet));
+
+			rc = 0;
+			lnet_net_lock(LNET_LOCK_EX);
+			goto again;	/* lists changed: rescan this bucket */
+		}
+	}
+
+	if (net == LNET_NIDNET(LNET_NID_ANY) &&
+	    ++idx < LNET_REMOTE_NETS_HASH_SIZE) {
+		rn_list = &the_lnet.ln_remote_nets_hash[idx];
+		goto again;
+	}
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	return rc;
+}
+
+void
+lnet_destroy_routes (void)
+{	/* wildcards for both net and gateway match every route */
+	lnet_del_route(LNET_NIDNET(LNET_NID_ANY), LNET_NID_ANY);
+}
+
+int
+lnet_get_route(int idx, __u32 *net, __u32 *hops,
+	       lnet_nid_t *gateway, __u32 *alive)
+{
+	lnet_remotenet_t	*rnet;
+	lnet_route_t		*route;
+	struct list_head		*rn_list;
+	int			cpt;
+	int			bucket;
+
+	cpt = lnet_net_lock_current();
+
+	/* Routes are numbered in walk order: hash bucket by hash bucket,
+	 * remote net by remote net, route by route.  @idx is counted down
+	 * along the way; the route reached when it hits zero is reported
+	 * through the output parameters. */
+	for (bucket = 0; bucket < LNET_REMOTE_NETS_HASH_SIZE; bucket++) {
+		rn_list = &the_lnet.ln_remote_nets_hash[bucket];
+		list_for_each_entry(rnet, rn_list, lrn_list) {
+			list_for_each_entry(route, &rnet->lrn_routes,
+					    lr_list) {
+				if (idx-- != 0)
+					continue;
+
+				*net     = rnet->lrn_net;
+				*hops    = route->lr_hops;
+				*gateway = route->lr_gateway->lp_nid;
+				*alive   = route->lr_gateway->lp_alive;
+				lnet_net_unlock(cpt);
+				return 0;
+			}
+		}
+	}
+
+	/* ran out of routes before @idx reached zero */
+	lnet_net_unlock(cpt);
+	return -ENOENT;
+}
+
+void
+lnet_swap_pinginfo(lnet_ping_info_t *info)
+{
+	int	n;
+
+	/* fixed header first; pi_nnis must be swabbed before it is used */
+	__swab32s(&info->pi_magic);
+	__swab32s(&info->pi_features);
+	__swab32s(&info->pi_pid);
+	__swab32s(&info->pi_nnis);
+
+	/* then each NI status entry, never going past LNET_MAX_RTR_NIS */
+	for (n = 0; n < info->pi_nnis && n < LNET_MAX_RTR_NIS; n++) {
+		__swab64s(&info->pi_ni[n].ns_nid);
+		__swab32s(&info->pi_ni[n].ns_status);
+	}
+}
+
+/**
+ * Parse the ping info received from the gateway of \a rcd and record, in
+ * each route through that gateway, how many of its NIs are reported down.
+ */
+static void
+lnet_parse_rc_info(lnet_rc_data_t *rcd)
+{
+	lnet_ping_info_t	*info = rcd->rcd_pinginfo;
+	struct lnet_peer	*gw   = rcd->rcd_gateway;
+	lnet_route_t		*rtr;
+
+	if (!gw->lp_alive)	/* nothing is recorded for a dead gateway */
+		return;
+
+	if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC))
+		lnet_swap_pinginfo(info);	/* opposite byte order */
+
+	/* NB always racing with network! */
+	if (info->pi_magic != LNET_PROTO_PING_MAGIC) {
+		CDEBUG(D_NET, "%s: Unexpected magic %08x\n",
+		       libcfs_nid2str(gw->lp_nid), info->pi_magic);
+		gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+		return;
+	}
+
+	gw->lp_ping_feats = info->pi_features;
+	if ((gw->lp_ping_feats & LNET_PING_FEAT_MASK) == 0) {
+		CDEBUG(D_NET, "%s: Unexpected features 0x%x\n",
+		       libcfs_nid2str(gw->lp_nid), gw->lp_ping_feats);
+		return; /* nothing I can understand */
+	}
+
+	if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0)
+		return; /* can't carry NI status info */
+
+	list_for_each_entry(rtr, &gw->lp_routes, lr_gwlist) {
+		int	ptl_status = LNET_NI_STATUS_INVALID;
+		int	down = 0;	/* # non-ptl NIs reported down */
+		int	up = 0;		/* NI on the route's net is up */
+		int	i;
+
+		for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) {
+			lnet_ni_status_t *stat = &info->pi_ni[i];
+			lnet_nid_t	 nid = stat->ns_nid;
+
+			if (nid == LNET_NID_ANY) {
+				CDEBUG(D_NET, "%s: unexpected LNET_NID_ANY\n",
+				       libcfs_nid2str(gw->lp_nid));
+				gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+				return;
+			}
+
+			if (LNET_NETTYP(LNET_NIDNET(nid)) == LOLND)
+				continue;	/* LOLND NIs don't count */
+
+			if (stat->ns_status == LNET_NI_STATUS_DOWN) {
+				if (LNET_NETTYP(LNET_NIDNET(nid)) != PTLLND)
+					down++;
+				else if (ptl_status != LNET_NI_STATUS_UP)
+					ptl_status = LNET_NI_STATUS_DOWN;
+				continue;
+			}
+
+			if (stat->ns_status == LNET_NI_STATUS_UP) {
+				if (LNET_NIDNET(nid) == rtr->lr_net) {
+					up = 1;
+					break;
+				}
+				/* ptl NIs are considered down only when
+				 * they're all down */
+				if (LNET_NETTYP(LNET_NIDNET(nid)) == PTLLND)
+					ptl_status = LNET_NI_STATUS_UP;
+				continue;
+			}
+
+			CDEBUG(D_NET, "%s: Unexpected status 0x%x\n",
+			       libcfs_nid2str(gw->lp_nid), stat->ns_status);
+			gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+			return;
+		}
+
+		if (up) { /* ignore downed NIs if NI for dest network is up */
+			rtr->lr_downis = 0;
+			continue;
+		}
+		rtr->lr_downis = down + (ptl_status == LNET_NI_STATUS_DOWN);
+	}
+}
+
+static void
+lnet_router_checker_event(lnet_event_t *event)
+{
+	lnet_rc_data_t		*rcd = event->md.user_ptr;
+	struct lnet_peer	*lp;
+
+	LASSERT(rcd != NULL);
+
+	if (event->unlinked) {	/* MD is gone: just forget the handle */
+		LNetInvalidateHandle(&rcd->rcd_mdh);
+		return;
+	}
+
+	LASSERT(event->type == LNET_EVENT_SEND ||
+		event->type == LNET_EVENT_REPLY);
+
+	lp = rcd->rcd_gateway;
+	LASSERT(lp != NULL);
+
+	 /* NB: called with lnet_res_lock held.  A few places need to hold
+	  * both that lock and lnet_net_lock at the same time, so please
+	  * take care of lock ordering */
+	lnet_net_lock(lp->lp_cpt);
+	if (!lnet_isrouter(lp) || lp->lp_rcd != rcd) {
+		/* ignore if no longer a router or rcd is replaced */
+		goto out;
+	}
+
+	if (event->type == LNET_EVENT_SEND) {
+		lp->lp_ping_notsent = 0;
+		if (event->status == 0)	/* sent OK: now wait for the REPLY */
+			goto out;
+	}
+
+	/* LNET_EVENT_REPLY, or a SEND that failed */
+	/* A successful REPLY means the router is up.  If _any_ comms
+	 * to the router fail I assume it's down (this will happen if
+	 * we ping alive routers to try to detect router death before
+	 * apps get burned). */
+
+	lnet_notify_locked(lp, 1, (event->status == 0), cfs_time_current());
+	/* The router checker will wake up very shortly and do the
+	 * actual notification.
+	 * XXX If 'lp' stops being a router before then, it will still
+	 * have the notification pending!!! */
+
+	if (avoid_asym_router_failure && event->status == 0)
+		lnet_parse_rc_info(rcd);
+
+ out:
+	lnet_net_unlock(lp->lp_cpt);
+}
+
+void
+lnet_wait_known_routerstate(void)
+{
+	lnet_peer_t	*rtr;
+	int		unknown;
+	int		cpt;
+
+	LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+
+	/* Poll once a second until every router on ln_routers has had at
+	 * least one aliveness notification (lp_alive_count != 0); until
+	 * then its state counts as unknown. */
+	do {
+		unknown = 0;
+
+		cpt = lnet_net_lock_current();
+		list_for_each_entry(rtr, &the_lnet.ln_routers, lp_rtr_list) {
+			if (rtr->lp_alive_count != 0)
+				continue;
+
+			unknown = 1;
+			break;
+		}
+		lnet_net_unlock(cpt);
+
+		/* sleep with the lock dropped */
+		if (unknown)
+			cfs_pause(cfs_time_seconds(1));
+	} while (unknown);
+}
+
+void
+lnet_update_ni_status_locked(void)
+{
+	lnet_ni_t	*ni;
+	long		now;
+	int		timeout;	/* seconds, like the module params */
+
+	LASSERT(the_lnet.ln_routing);
+
+	timeout = router_ping_timeout +
+		  MAX(live_router_check_interval, dead_router_check_interval);
+
+	now = cfs_time_current_sec();
+	list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+		if (ni->ni_lnd->lnd_type == LOLND)
+			continue;
+
+		if (now < ni->ni_last_alive + timeout) /* quick check first */
+			continue;
+
+		lnet_ni_lock(ni);
+		/* re-check with lock */
+		if (now < ni->ni_last_alive + timeout) {
+			lnet_ni_unlock(ni);
+			continue;
+		}
+
+		LASSERT(ni->ni_status != NULL);
+
+		if (ni->ni_status->ns_status != LNET_NI_STATUS_DOWN) {
+			CDEBUG(D_NET, "NI(%s:%d) status changed to down\n",
+			       libcfs_nid2str(ni->ni_nid), timeout);
+			/* NB: so far, this is the only place to set
+			 * NI status to "down" */
+			ni->ni_status->ns_status = LNET_NI_STATUS_DOWN;
+		}
+		lnet_ni_unlock(ni);
+	}
+}
+
+void
+lnet_destroy_rc_data(lnet_rc_data_t *rcd)
+{
+	LASSERT(list_empty(&rcd->rcd_list));
+	/* detached from network */
+	LASSERT(LNetHandleIsInvalid(rcd->rcd_mdh));
+
+	if (rcd->rcd_gateway != NULL) {	/* drop the ref held via rcd */
+		int cpt = rcd->rcd_gateway->lp_cpt;
+
+		lnet_net_lock(cpt);
+		lnet_peer_decref_locked(rcd->rcd_gateway);
+		lnet_net_unlock(cpt);
+	}
+
+	if (rcd->rcd_pinginfo != NULL)	/* may be unset on a failed create */
+		LIBCFS_FREE(rcd->rcd_pinginfo, LNET_PINGINFO_SIZE);
+
+	LIBCFS_FREE(rcd, sizeof(*rcd));
+}
+
+lnet_rc_data_t *
+lnet_create_rc_data_locked(lnet_peer_t *gateway)
+{
+	lnet_rc_data_t		*rcd = NULL;
+	lnet_ping_info_t	*pi;
+	int			rc;
+	int			i;
+
+	lnet_net_unlock(gateway->lp_cpt);	/* allocate and bind unlocked */
+
+	LIBCFS_ALLOC(rcd, sizeof(*rcd));
+	if (rcd == NULL)
+		goto out;
+
+	LNetInvalidateHandle(&rcd->rcd_mdh);
+	INIT_LIST_HEAD(&rcd->rcd_list);
+
+	LIBCFS_ALLOC(pi, LNET_PINGINFO_SIZE);
+	if (pi == NULL)
+		goto out;
+
+	memset(pi, 0, LNET_PINGINFO_SIZE);
+	for (i = 0; i < LNET_MAX_RTR_NIS; i++) {
+		pi->pi_ni[i].ns_nid = LNET_NID_ANY;
+		pi->pi_ni[i].ns_status = LNET_NI_STATUS_INVALID;
+	}
+	rcd->rcd_pinginfo = pi;
+
+	LASSERT (!LNetHandleIsInvalid(the_lnet.ln_rc_eqh));
+	rc = LNetMDBind((lnet_md_t){.start     = pi,
+				    .user_ptr  = rcd,
+				    .length    = LNET_PINGINFO_SIZE,
+				    .threshold = LNET_MD_THRESH_INF,
+				    .options   = LNET_MD_TRUNCATE,
+				    .eq_handle = the_lnet.ln_rc_eqh},
+			LNET_UNLINK,
+			&rcd->rcd_mdh);
+	if (rc < 0) {
+		CERROR("Can't bind MD: %d\n", rc);
+		goto out;
+	}
+	LASSERT(rc == 0);
+
+	lnet_net_lock(gateway->lp_cpt);
+	/* router table changed or someone has created rcd for this gateway */
+	if (!lnet_isrouter(gateway) || gateway->lp_rcd != NULL) {
+		lnet_net_unlock(gateway->lp_cpt);
+		goto out;
+	}
+
+	lnet_peer_addref_locked(gateway);	/* ref held via rcd_gateway */
+	rcd->rcd_gateway = gateway;
+	gateway->lp_rcd = rcd;
+	gateway->lp_ping_notsent = 0;
+
+	return rcd;	/* NB returns with the lock held again */
+
+ out:
+	if (rcd != NULL) {
+		if (!LNetHandleIsInvalid(rcd->rcd_mdh)) {
+			rc = LNetMDUnlink(rcd->rcd_mdh);
+			LASSERT(rc == 0);
+		}
+		lnet_destroy_rc_data(rcd);
+	}
+
+	lnet_net_lock(gateway->lp_cpt);
+	return gateway->lp_rcd;	/* NULL, or an rcd somebody else installed */
+}
+
+static int
+lnet_router_check_interval(lnet_peer_t *rtr)
+{
+	int interval;
+
+	if (rtr->lp_alive)
+		interval = live_router_check_interval;
+	else
+		interval = dead_router_check_interval;
+
+	return interval < 0 ? 0 : interval;	/* never negative */
+}
+
+/* Run pending notifications for @rtr and ping it when a check is due */
+static void
+lnet_ping_router_locked (lnet_peer_t *rtr)
+{
+	lnet_rc_data_t *rcd = NULL;
+	cfs_time_t      now = cfs_time_current();
+	int	     secs;
+
+	lnet_peer_addref_locked(rtr);	/* dropped again at "out" */
+
+	if (rtr->lp_ping_deadline != 0 && /* ping timed out? */
+	    cfs_time_after(now, rtr->lp_ping_deadline))
+		lnet_notify_locked(rtr, 1, 0, now);
+
+	/* Run any outstanding notifications */
+	lnet_ni_notify_locked(rtr->lp_ni, rtr);
+
+	if (!lnet_isrouter(rtr) ||
+	    the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) {
+		/* router table changed or router checker is shutting down */
+		goto out;
+	}
+
+	rcd = rtr->lp_rcd != NULL ?
+	      rtr->lp_rcd : lnet_create_rc_data_locked(rtr);
+
+	if (rcd == NULL)	/* creation failed; don't leak the peer ref */
+		goto out;
+
+	secs = lnet_router_check_interval(rtr);
+
+	CDEBUG(D_NET,
+	       "rtr %s %d: deadline %lu ping_notsent %d alive %d "
+	       "alive_count %d lp_ping_timestamp %lu\n",
+	       libcfs_nid2str(rtr->lp_nid), secs,
+	       rtr->lp_ping_deadline, rtr->lp_ping_notsent,
+	       rtr->lp_alive, rtr->lp_alive_count, rtr->lp_ping_timestamp);
+
+	if (secs != 0 && !rtr->lp_ping_notsent &&
+	    cfs_time_after(now, cfs_time_add(rtr->lp_ping_timestamp,
+					     cfs_time_seconds(secs)))) {
+		int	       rc;
+		lnet_process_id_t id;
+		lnet_handle_md_t  mdh;
+
+		id.nid = rtr->lp_nid;
+		id.pid = LUSTRE_SRV_LNET_PID;
+		CDEBUG(D_NET, "Check: %s\n", libcfs_id2str(id));
+
+		rtr->lp_ping_notsent   = 1;
+		rtr->lp_ping_timestamp = now;
+
+		mdh = rcd->rcd_mdh;	/* copy the handle while still locked */
+
+		if (rtr->lp_ping_deadline == 0) {
+			rtr->lp_ping_deadline =
+				cfs_time_shift(router_ping_timeout);
+		}
+
+		lnet_net_unlock(rtr->lp_cpt);	/* LNetGet() runs unlocked */
+
+		rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL,
+			     LNET_PROTO_PING_MATCHBITS, 0);
+
+		lnet_net_lock(rtr->lp_cpt);
+		if (rc != 0)
+			rtr->lp_ping_notsent = 0; /* no event pending */
+	}
+
+ out:
+	lnet_peer_decref_locked(rtr);
+}
+
+/* Start the router checker: allocate its EQ and spawn the checker thread.
+ * Returns 0 if started or nothing to check, -EINVAL or -ENOMEM on error. */
+int
+lnet_router_checker_start(void)
+{
+	struct task_struct *task;
+	int		    rc, eqsz;
+
+	LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN);
+
+	if (check_routers_before_use && dead_router_check_interval <= 0) {
+		LCONSOLE_ERROR_MSG(0x10a, "'dead_router_check_interval' must be"
+				   " set if 'check_routers_before_use' is set"
+				   "\n");
+		return -EINVAL;
+	}
+
+	if (!the_lnet.ln_routing && live_router_check_interval <= 0 &&
+	    dead_router_check_interval <= 0)
+		return 0;
+
+	sema_init(&the_lnet.ln_rc_signal, 0);
+	/* EQ size doesn't matter; the callback is guaranteed every event */
+	eqsz = 0;
+	rc = LNetEQAlloc(eqsz, lnet_router_checker_event, &the_lnet.ln_rc_eqh);
+	if (rc != 0) {
+		CERROR("Can't allocate EQ(%d): %d\n", eqsz, rc);
+		return -ENOMEM;
+	}
+
+	the_lnet.ln_rc_state = LNET_RC_STATE_RUNNING;
+	/* check the pointer itself: narrowed to int, it could alias an errno */
+	task = kthread_run(lnet_router_checker, NULL, "router_checker");
+	if (IS_ERR(task)) {
+		rc = PTR_ERR(task);
+		CERROR("Can't start router checker thread: %d\n", rc);
+		/* block until event callback signals exit.  NOTE(review): no
+		 * checker thread exists here to up() it - confirm no hang */
+		down(&the_lnet.ln_rc_signal);
+		rc = LNetEQFree(the_lnet.ln_rc_eqh);
+		LASSERT(rc == 0);
+		the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+		return -ENOMEM;
+	}
+
+	if (check_routers_before_use) {
+		/* Note that a helpful side-effect of pinging all known routers
+		 * at startup is that it makes them drop stale connections they
+		 * may have to a previous instance of me. */
+		lnet_wait_known_routerstate();
+	}
+
+	return 0;
+}
+
+/* Move a RUNNING router checker to STOPPING, sleep until it has reached
+ * SHUTDOWN, then free its EQ.  Does nothing if already SHUTDOWN. */
+void
+lnet_router_checker_stop (void)
+{
+	int rc;
+
+	if (the_lnet.ln_rc_state != LNET_RC_STATE_SHUTDOWN) {
+		LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+		the_lnet.ln_rc_state = LNET_RC_STATE_STOPPING;
+
+		/* block until event callback signals exit */
+		down(&the_lnet.ln_rc_signal);
+		LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN);
+
+		rc = LNetEQFree(the_lnet.ln_rc_eqh);
+		LASSERT (rc == 0);
+	}
+}
+
+static void
+lnet_prune_rc_data(int wait_unlink)
+{
+	lnet_rc_data_t		*rcd;
+	lnet_rc_data_t		*tmp;
+	lnet_peer_t		*lp;
+	struct list_head		head;
+	int			i = 2;
+	/* Unlink deathrow rc data and free zombies; poll if \a wait_unlink. */
+	if (likely(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING &&
+		   list_empty(&the_lnet.ln_rcd_deathrow) &&
+		   list_empty(&the_lnet.ln_rcd_zombie)))
+		return;
+
+	INIT_LIST_HEAD(&head);
+
+	lnet_net_lock(LNET_LOCK_EX);
+
+	if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) {
+		/* router checker is stopping, prune all */
+		list_for_each_entry(lp, &the_lnet.ln_routers,
+					lp_rtr_list) {
+			if (lp->lp_rcd == NULL)
+				continue;
+
+			LASSERT(list_empty(&lp->lp_rcd->rcd_list));
+			list_add(&lp->lp_rcd->rcd_list,
+				     &the_lnet.ln_rcd_deathrow);
+			lp->lp_rcd = NULL;
+		}
+	}
+
+	/* unlink all RCDs on deathrow list (net lock dropped meanwhile) */
+	list_splice_init(&the_lnet.ln_rcd_deathrow, &head);
+
+	if (!list_empty(&head)) {
+		lnet_net_unlock(LNET_LOCK_EX);
+
+		list_for_each_entry(rcd, &head, rcd_list)
+			LNetMDUnlink(rcd->rcd_mdh);
+
+		lnet_net_lock(LNET_LOCK_EX);
+	}
+
+	list_splice_init(&head, &the_lnet.ln_rcd_zombie);
+
+	/* release all zombie RCDs whose MD handle is already invalid */
+	while (!list_empty(&the_lnet.ln_rcd_zombie)) {
+		list_for_each_entry_safe(rcd, tmp, &the_lnet.ln_rcd_zombie,
+					     rcd_list) {
+			if (LNetHandleIsInvalid(rcd->rcd_mdh))
+				list_move(&rcd->rcd_list, &head);
+		}
+
+		wait_unlink = wait_unlink &&
+			      !list_empty(&the_lnet.ln_rcd_zombie);
+
+		lnet_net_unlock(LNET_LOCK_EX);
+
+		while (!list_empty(&head)) {
+			rcd = list_entry(head.next,
+					     lnet_rc_data_t, rcd_list);
+			list_del_init(&rcd->rcd_list);
+			lnet_destroy_rc_data(rcd);
+		}
+
+		if (!wait_unlink)
+			return;
+
+		i++; /* D_WARNING only when i is a power of 2 */
+		CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
+		       "Waiting for rc buffers to unlink\n");
+		cfs_pause(cfs_time_seconds(1) / 4);
+
+		lnet_net_lock(LNET_LOCK_EX);
+	}
+
+	lnet_net_unlock(LNET_LOCK_EX);
+}
+
+
+#if  defined(LNET_ROUTER)
+
+static int
+lnet_router_checker(void *arg)
+{
+	lnet_peer_t       *rtr;
+	struct list_head	*entry;
+	/* Checker thread: rescan the router list about once a second. */
+	cfs_block_allsigs();
+
+	LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+
+	while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) {
+		__u64	version;
+		int	cpt;
+		int	cpt2;
+
+		cpt = lnet_net_lock_current();
+rescan:
+		version = the_lnet.ln_routers_version;
+
+		list_for_each(entry, &the_lnet.ln_routers) {
+			rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
+
+			cpt2 = lnet_cpt_of_nid_locked(rtr->lp_nid);
+			if (cpt != cpt2) {
+				lnet_net_unlock(cpt);
+				cpt = cpt2;
+				lnet_net_lock(cpt);
+				/* the routers list has changed */
+				if (version != the_lnet.ln_routers_version)
+					goto rescan;
+			}
+
+			lnet_ping_router_locked(rtr);
+
+			/* NB lnet_ping_router_locked() may have dropped lock */
+			if (version != the_lnet.ln_routers_version) {
+				/* the routers list has changed */
+				goto rescan;
+			}
+		}
+
+		if (the_lnet.ln_routing)
+			lnet_update_ni_status_locked();
+
+		lnet_net_unlock(cpt);
+
+		lnet_prune_rc_data(0); /* don't wait for UNLINK */
+
+		/* Call cfs_pause() here always adds 1 to load average
+		 * because kernel counts # active tasks as nr_running
+		 * + nr_uninterruptible. */
+		schedule_timeout_and_set_state(TASK_INTERRUPTIBLE,
+						   cfs_time_seconds(1));
+	}
+
+	LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING);
+
+	lnet_prune_rc_data(1); /* wait for UNLINK */
+
+	the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+	up(&the_lnet.ln_rc_signal);
+	/* The unlink event callback will signal final completion */
+	return 0;
+}
+
+/* Free the \a npages pages of \a rb, last one first, then \a rb itself. */
+void
+lnet_destroy_rtrbuf(lnet_rtrbuf_t *rb, int npages)
+{
+	int i;
+
+	for (i = npages - 1; i >= 0; i--)
+		__free_page(rb->rb_kiov[i].kiov_page);
+	LIBCFS_FREE(rb, offsetof(lnet_rtrbuf_t, rb_kiov[npages]));
+}
+
+/* New router buffer with rbp->rbp_npages zeroed pages, or NULL if OOM. */
+lnet_rtrbuf_t *
+lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp, int cpt)
+{
+	int	       npages = rbp->rbp_npages;
+	int	       sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]);
+	struct page   *pg;
+	lnet_rtrbuf_t *rb;
+	int	       idx;
+
+	LIBCFS_CPT_ALLOC(rb, lnet_cpt_table(), cpt, sz);
+	if (rb == NULL)
+		return NULL;
+
+	rb->rb_pool = rbp;
+
+	for (idx = 0; idx < npages; idx++) {
+		pg = cfs_page_cpt_alloc(lnet_cpt_table(), cpt,
+					__GFP_ZERO | GFP_IOFS);
+		if (pg == NULL)
+			goto failed;
+		rb->rb_kiov[idx].kiov_page = pg;
+		rb->rb_kiov[idx].kiov_offset = 0;
+		rb->rb_kiov[idx].kiov_len = PAGE_CACHE_SIZE;
+	}
+	return rb;
+
+ failed:
+	while (--idx >= 0)
+		__free_page(rb->rb_kiov[idx].kiov_page);
+	LIBCFS_FREE(rb, sz);
+	return NULL;
+}
+
+/* Destroy every buffer on rbp->rbp_bufs and zero the pool's buffer and
+ * credit counts.  Asserts no msgs are queued and all credits are back. */
+void
+lnet_rtrpool_free_bufs(lnet_rtrbufpool_t *rbp)
+{
+	lnet_rtrbuf_t	*buf;
+	lnet_rtrbuf_t	*next;
+	int		 nbuffers = 0;
+
+	/* not initialized or already freed */
+	if (rbp->rbp_nbuffers == 0)
+		return;
+
+	LASSERT (list_empty(&rbp->rbp_msgs));
+	LASSERT (rbp->rbp_credits == rbp->rbp_nbuffers);
+
+	list_for_each_entry_safe(buf, next, &rbp->rbp_bufs, rb_list) {
+		LASSERT (rbp->rbp_credits > 0);
+		list_del(&buf->rb_list);
+		lnet_destroy_rtrbuf(buf, rbp->rbp_npages);
+		nbuffers++;
+	}
+
+	LASSERT (rbp->rbp_nbuffers == nbuffers);
+	LASSERT (rbp->rbp_credits == nbuffers);
+
+	rbp->rbp_nbuffers = rbp->rbp_credits = 0;
+}
+
+/* Fill \a rbp with \a nbufs buffers allocated on CPT \a cpt.  A pool that
+ * already has buffers is left untouched.  Returns 0 or -ENOMEM; buffers
+ * added before a failure stay on the pool. */
+int
+lnet_rtrpool_alloc_bufs(lnet_rtrbufpool_t *rbp, int nbufs, int cpt)
+{
+	int n;
+
+	if (rbp->rbp_nbuffers != 0) {
+		LASSERT (rbp->rbp_nbuffers == nbufs);
+		return 0;
+	}
+
+	for (n = 0; n < nbufs; n++) {
+		lnet_rtrbuf_t *buf = lnet_new_rtrbuf(rbp, cpt);
+
+		if (buf == NULL) {
+			CERROR("Failed to allocate %d router bufs of %d pages\n",
+			       nbufs, rbp->rbp_npages);
+			return -ENOMEM;
+		}
+
+		list_add(&buf->rb_list, &rbp->rbp_bufs);
+		rbp->rbp_nbuffers++;
+		rbp->rbp_credits++;
+		rbp->rbp_mincredits++;
+		/* No allocation "under fire": otherwise we'd need code to
+		 * schedule blocked msgs etc */
+		LASSERT (!the_lnet.ln_routing);
+	}
+	LASSERT (rbp->rbp_credits == nbufs);
+	return 0;
+}
+
+/* Set up \a rbp's lists, page count and credits (not rbp_nbuffers). */
+void
+lnet_rtrpool_init(lnet_rtrbufpool_t *rbp, int npages)
+{
+	rbp->rbp_npages = npages;
+	rbp->rbp_mincredits = 0;
+	rbp->rbp_credits = 0;
+	INIT_LIST_HEAD(&rbp->rbp_bufs);
+	INIT_LIST_HEAD(&rbp->rbp_msgs);
+}
+
+/* Free pools 0..2 on every CPT, then the per-CPT pool array itself.
+ * Returns at once if the array was never allocated or is already gone. */
+void
+lnet_rtrpools_free(void)
+{
+	lnet_rtrbufpool_t *pools;
+	int		   cpt, idx;
+
+	if (the_lnet.ln_rtrpools == NULL) /* uninitialized or freed */
+		return;
+
+	cfs_percpt_for_each(pools, cpt, the_lnet.ln_rtrpools) {
+		for (idx = 0; idx < 3; idx++)
+			lnet_rtrpool_free_bufs(&pools[idx]);
+	}
+	cfs_percpt_free(the_lnet.ln_rtrpools);
+	the_lnet.ln_rtrpools = NULL;
+}
+
+/* Tiny router buffers per CPT, at least LNET_NRB_TINY_MIN; -1 (after a
+ * console error) if tiny_router_buffers is negative.  \a npages is unused. */
+static int
+lnet_nrb_tiny_calculate(int npages)
+{
+	int	nrb;
+
+	if (tiny_router_buffers < 0) {
+		LCONSOLE_ERROR_MSG(0x10c,
+				   "tiny_router_buffers=%d invalid when "
+				   "routing enabled\n", tiny_router_buffers);
+		return -1;
+	}
+
+	/* a tunable of 0 selects the built-in total */
+	nrb = tiny_router_buffers > 0 ? tiny_router_buffers : LNET_NRB_TINY;
+	return max(nrb / LNET_CPT_NUMBER, LNET_NRB_TINY_MIN);
+}
+
+/* Small router buffers per CPT, at least LNET_NRB_SMALL_MIN; -1 (after a
+ * console error) if small_router_buffers is negative.  \a npages is unused. */
+static int
+lnet_nrb_small_calculate(int npages)
+{
+	int	nrb;
+
+	if (small_router_buffers < 0) {
+		LCONSOLE_ERROR_MSG(0x10c,
+				   "small_router_buffers=%d invalid when "
+				   "routing enabled\n", small_router_buffers);
+		return -1;
+	}
+
+	/* a tunable of 0 selects the built-in total */
+	nrb = small_router_buffers > 0 ? small_router_buffers : LNET_NRB_SMALL;
+	return max(nrb / LNET_CPT_NUMBER, LNET_NRB_SMALL_MIN);
+}
+
+/* Large router buffers per CPT, at least LNET_NRB_LARGE_MIN; -1 (after a
+ * console error) if large_router_buffers is negative.  \a npages is unused. */
+static int
+lnet_nrb_large_calculate(int npages)
+{
+	int	nrb;
+
+	if (large_router_buffers < 0) {
+		LCONSOLE_ERROR_MSG(0x10c,
+				   "large_router_buffers=%d invalid when "
+				   "routing enabled\n", large_router_buffers);
+		return -1;
+	}
+
+	/* a tunable of 0 selects the built-in total */
+	nrb = large_router_buffers > 0 ? large_router_buffers : LNET_NRB_LARGE;
+	return max(nrb / LNET_CPT_NUMBER, LNET_NRB_LARGE_MIN);
+}
+
+/* Allocate the tiny/small/large router buffer pools on every CPT and turn
+ * routing on.  Returns 0 when done or when forwarding isn't wanted, -EINVAL
+ * for a bad 'forwarding' or buffer-count tunable, else an allocation error. */
+int
+lnet_rtrpools_alloc(int im_a_router)
+{
+	lnet_rtrbufpool_t *rtrp;
+	int	large_pages = (LNET_MTU + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	int	small_pages = 1;
+	int	nrb_tiny, nrb_small, nrb_large;
+	int	rc, i;
+
+	if (!strcmp(forwarding, "")) {
+		/* not set either way */
+		if (!im_a_router)
+			return 0;
+	} else if (!strcmp(forwarding, "disabled")) {
+		/* explicitly disabled */
+		return 0;
+	} else if (!strcmp(forwarding, "enabled")) {
+		/* explicitly enabled */
+	} else {
+		LCONSOLE_ERROR_MSG(0x10b, "'forwarding' not set to either "
+				   "'enabled' or 'disabled'\n");
+		return -EINVAL;
+	}
+
+	nrb_tiny = lnet_nrb_tiny_calculate(0);
+	if (nrb_tiny < 0)
+		return -EINVAL;
+
+	nrb_small = lnet_nrb_small_calculate(small_pages);
+	if (nrb_small < 0)
+		return -EINVAL;
+
+	nrb_large = lnet_nrb_large_calculate(large_pages);
+	if (nrb_large < 0)
+		return -EINVAL;
+
+	the_lnet.ln_rtrpools = cfs_percpt_alloc(lnet_cpt_table(),
+						LNET_NRBPOOLS *
+						sizeof(lnet_rtrbufpool_t));
+	if (the_lnet.ln_rtrpools == NULL) {
+		LCONSOLE_ERROR_MSG(0x10c,
+				   "Failed to initialize router buffer pool\n");
+		return -ENOMEM;
+	}
+
+	cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) {
+		lnet_rtrpool_init(&rtrp[0], 0);
+		rc = lnet_rtrpool_alloc_bufs(&rtrp[0], nrb_tiny, i);
+		if (rc != 0)
+			goto failed;
+
+		lnet_rtrpool_init(&rtrp[1], small_pages);
+		rc = lnet_rtrpool_alloc_bufs(&rtrp[1], nrb_small, i);
+		if (rc != 0)
+			goto failed;
+
+		lnet_rtrpool_init(&rtrp[2], large_pages);
+		rc = lnet_rtrpool_alloc_bufs(&rtrp[2], nrb_large, i);
+		if (rc != 0)
+			goto failed;
+	}
+
+	lnet_net_lock(LNET_LOCK_EX);
+	the_lnet.ln_routing = 1;
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	return 0;
+
+ failed:
+	lnet_rtrpools_free();
+	return rc;
+}
+
+/* Record that peer \a nid is up/down as of \a when, as reported by \a ni
+ * (NULL: the report comes from userspace).  Returns 0, -EINVAL for a
+ * different-net or future-dated report, -ESHUTDOWN during shutdown. */
+int
+lnet_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when)
+{
+	struct lnet_peer	*lp = NULL;
+	cfs_time_t		now = cfs_time_current();
+	int			cpt = lnet_cpt_of_nid(nid);
+
+	LASSERT (!in_interrupt ());
+
+	CDEBUG (D_NET, "%s notifying %s: %s\n",
+		(ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
+		libcfs_nid2str(nid),
+		alive ? "up" : "down");
+
+	if (ni != NULL && LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) {
+		CWARN ("Ignoring notification of %s %s by %s (different net)\n",
+			libcfs_nid2str(nid), alive ? "birth" : "death",
+			libcfs_nid2str(ni->ni_nid));
+		return -EINVAL;
+	}
+
+	/* can't do predictions... */
+	if (cfs_time_after(when, now)) {
+		CWARN ("Ignoring prediction from %s of %s %s "
+		       "%ld seconds in the future\n",
+		       (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
+		       libcfs_nid2str(nid), alive ? "up" : "down",
+		       cfs_duration_sec(cfs_time_sub(when, now)));
+		return -EINVAL;
+	}
+
+	if (ni != NULL && !alive &&	     /* LND telling me she's down */
+	    !auto_down) {		       /* auto-down disabled */
+		CDEBUG(D_NET, "Auto-down disabled\n");
+		return 0;
+	}
+
+	lnet_net_lock(cpt);
+
+	if (the_lnet.ln_shutdown) {
+		lnet_net_unlock(cpt);
+		return -ESHUTDOWN;
+	}
+
+	lp = lnet_find_peer_locked(the_lnet.ln_peer_tables[cpt], nid);
+	if (lp == NULL) {
+		/* nid not found */
+		lnet_net_unlock(cpt);
+		CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid));
+		return 0;
+	}
+
+	/* We can't fully trust LND on reporting exact peer last_alive
+	 * if he notifies us about dead peer. For example ksocklnd can
+	 * call us with when == _time_when_the_node_was_booted_ if
+	 * no connections were successfully established */
+	if (ni != NULL && !alive && cfs_time_after(lp->lp_last_alive, when))
+		when = lp->lp_last_alive;
+
+	lnet_notify_locked(lp, ni == NULL, alive, when);
+	lnet_ni_notify_locked(ni, lp);
+	lnet_peer_decref_locked(lp);
+
+	lnet_net_unlock(cpt);
+	return 0;
+}
+EXPORT_SYMBOL(lnet_notify);
+
+/* Nothing to read here when LNET_ROUTER is defined. */
+void
+lnet_get_tunables (void)
+{
+}
+
+#else
+
+int
+lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when)
+{
+	return -EOPNOTSUPP; /* stub used when LNET_ROUTER is not defined */
+}
+
+void
+lnet_router_checker (void)
+{
+	static time_t last = 0;
+	static int    running = 0;
+	/* Non-threaded checker: each call drains events and pings routers. */
+	time_t	    now = cfs_time_current_sec();
+	int	       interval = now - last;
+	int	       rc;
+	__u64	     version;
+	lnet_peer_t      *rtr;
+
+	/* It's no use to call me again within a sec - all intervals and
+	 * timeouts are measured in seconds */
+	if (last != 0 && interval < 2)
+		return;
+
+	if (last != 0 &&
+	    interval > MAX(live_router_check_interval,
+			   dead_router_check_interval))
+		CNETERR("Checker(%d/%d) not called for %d seconds\n",
+			live_router_check_interval, dead_router_check_interval,
+			interval);
+
+	LASSERT(LNET_CPT_NUMBER == 1);
+
+	lnet_net_lock(0);
+	LASSERT(!running); /* recursion check */
+	running = 1;
+	lnet_net_unlock(0);
+
+	last = now;
+
+	if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING)
+		lnet_prune_rc_data(0); /* unlink all rcd and nowait */
+
+	/* consume all pending events */
+	while (1) {
+		int	  i;
+		lnet_event_t ev;
+
+		/* NB ln_rc_eqh must be the 1st in 'eventqs' otherwise the
+		 * recursion breaker in LNetEQPoll would fail */
+		rc = LNetEQPoll(&the_lnet.ln_rc_eqh, 1, 0, &ev, &i);
+		if (rc == 0)   /* no event pending */
+			break;
+
+		/* NB a lost SENT prevents me from pinging a router again */
+		if (rc == -EOVERFLOW) {
+			CERROR("Dropped an event!!!\n");
+			abort();
+		}
+
+		LASSERT (rc == 1);
+
+		lnet_router_checker_event(&ev);
+	}
+
+	if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING) {
+		lnet_prune_rc_data(1); /* release rcd */
+		the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+		running = 0;
+		return;
+	}
+
+	LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+
+	lnet_net_lock(0);
+
+	version = the_lnet.ln_routers_version;
+	list_for_each_entry (rtr, &the_lnet.ln_routers, lp_rtr_list) {
+		lnet_ping_router_locked(rtr);
+		LASSERT (version == the_lnet.ln_routers_version);
+	}
+
+	lnet_net_unlock(0);
+
+	running = 0; /* lock only needed for the recursion check */
+	return;
+}
+
+/* Load router-checker tunables from the environment.  NB
+ * lnet_peers_start_down depends on me, so call before any peer creation */
+void
+lnet_get_tunables (void)
+{
+	const char *val;
+
+	if ((val = getenv("LNET_ROUTER_PING_TIMEOUT")) != NULL)
+		router_ping_timeout = atoi(val);
+
+	if ((val = getenv("LNET_LIVE_ROUTER_CHECK_INTERVAL")) != NULL)
+		live_router_check_interval = atoi(val);
+
+	if ((val = getenv("LNET_DEAD_ROUTER_CHECK_INTERVAL")) != NULL)
+		dead_router_check_interval = atoi(val);
+
+	/* This replaces old lnd_notify mechanism */
+	check_routers_before_use = 1;
+	if (dead_router_check_interval <= 0)
+		dead_router_check_interval = 30;
+}
+
+void
+lnet_rtrpools_free(void)
+{
+}
+
+int
+lnet_rtrpools_alloc(int im_a_arouter)
+{
+	return 0;
+}
+
+#endif
diff --git a/drivers/staging/lustre/lnet/lnet/router_proc.c b/drivers/staging/lustre/lnet/lnet/router_proc.c
new file mode 100644
index 000000000000..3084b0c75983
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/router_proc.c
@@ -0,0 +1,950 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ *   This file is part of Portals
+ *   http://sourceforge.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lib-lnet.h>
+
+#if  defined(LNET_ROUTER)
+
+/* This is really lnet_proc.c. You might need to update sanity test 215
+ * if any file format is changed. */
+
+static ctl_table_header_t *lnet_table_header = NULL;
+
+#define CTL_LNET	 (0x100)
+enum {
+	PSDEV_LNET_STATS = 100,
+	PSDEV_LNET_ROUTES,
+	PSDEV_LNET_ROUTERS,
+	PSDEV_LNET_PEERS,
+	PSDEV_LNET_BUFFERS,
+	PSDEV_LNET_NIS,
+	PSDEV_LNET_PTL_ROTOR,
+};
+
+#define LNET_LOFFT_BITS		(sizeof(loff_t) * 8)
+/*
+ * NB: max allowed LNET_CPT_BITS is 8 on 64-bit system and 2 on 32-bit system
+ */
+#define LNET_PROC_CPT_BITS	(LNET_CPT_BITS + 1)
+/* bits for the change version: 16 with a 64-bit loff_t, 8 with 32-bit */
+#define LNET_PROC_VER_BITS	MAX(((MIN(LNET_LOFFT_BITS, 64)) / 4), 8)
+
+#define LNET_PROC_HASH_BITS	LNET_PEER_HASH_BITS
+/*
+ * bits for peer hash offset
+ * NB: we don't use the highest bit of *ppos because it's signed
+ */
+#define LNET_PROC_HOFF_BITS	(LNET_LOFFT_BITS -       \
+				 LNET_PROC_CPT_BITS -    \
+				 LNET_PROC_VER_BITS -    \
+				 LNET_PROC_HASH_BITS - 1)
+/* bits for hash index + position */
+#define LNET_PROC_HPOS_BITS	(LNET_PROC_HASH_BITS + LNET_PROC_HOFF_BITS)
+/* bits for peer hash table + hash version */
+#define LNET_PROC_VPOS_BITS	(LNET_PROC_HPOS_BITS + LNET_PROC_VER_BITS)
+
+#define LNET_PROC_CPT_MASK	((1ULL << LNET_PROC_CPT_BITS) - 1)
+#define LNET_PROC_VER_MASK	((1ULL << LNET_PROC_VER_BITS) - 1)
+#define LNET_PROC_HASH_MASK	((1ULL << LNET_PROC_HASH_BITS) - 1)
+#define LNET_PROC_HOFF_MASK	((1ULL << LNET_PROC_HOFF_BITS) - 1)
+/* *ppos layout, high to low: unused top bit | cpt | ver | hash | hoff */
+#define LNET_PROC_CPT_GET(pos)				\
+	(int)(((pos) >> LNET_PROC_VPOS_BITS) & LNET_PROC_CPT_MASK)
+
+#define LNET_PROC_VER_GET(pos)				\
+	(int)(((pos) >> LNET_PROC_HPOS_BITS) & LNET_PROC_VER_MASK)
+
+#define LNET_PROC_HASH_GET(pos)				\
+	(int)(((pos) >> LNET_PROC_HOFF_BITS) & LNET_PROC_HASH_MASK)
+
+#define LNET_PROC_HOFF_GET(pos)				\
+	(int)((pos) & LNET_PROC_HOFF_MASK)
+
+#define LNET_PROC_POS_MAKE(cpt, ver, hash, off)		\
+	(((((loff_t)(cpt)) & LNET_PROC_CPT_MASK) << LNET_PROC_VPOS_BITS) |   \
+	((((loff_t)(ver)) & LNET_PROC_VER_MASK) << LNET_PROC_HPOS_BITS) |   \
+	((((loff_t)(hash)) & LNET_PROC_HASH_MASK) << LNET_PROC_HOFF_BITS) | \
+	((off) & LNET_PROC_HOFF_MASK))
+
+#define LNET_PROC_VERSION(v)	((unsigned int)((v) & LNET_PROC_VER_MASK))
+
+/* Stats handler body: any write resets the LNet counters; a read formats
+ * them into one line and passes the text from offset \a pos on to
+ * cfs_trace_copyout_string().  Returns its result, 0, or -ENOMEM. */
+static int __proc_lnet_stats(void *data, int write,
+			     loff_t pos, void *buffer, int nob)
+{
+	const int	 tmpsiz = 256; /* 7 %u and 4 LPU64 */
+	lnet_counters_t	*ctrs;
+	char		*tmpstr;
+	int		 nchars;
+	int		 rc = -ENOMEM;
+
+	if (write) {
+		lnet_counters_reset();
+		return 0;
+	}
+
+	/* read */
+	LIBCFS_ALLOC(ctrs, sizeof(*ctrs));
+	if (ctrs == NULL)
+		return rc;
+
+	LIBCFS_ALLOC(tmpstr, tmpsiz);
+	if (tmpstr == NULL)
+		goto out_ctrs;
+
+	lnet_counters_get(ctrs);
+
+	nchars = snprintf(tmpstr, tmpsiz,
+			  "%u %u %u %u %u %u %u "LPU64" "LPU64" "
+			  LPU64" "LPU64,
+			  ctrs->msgs_alloc, ctrs->msgs_max,
+			  ctrs->errors,
+			  ctrs->send_count, ctrs->recv_count,
+			  ctrs->route_count, ctrs->drop_count,
+			  ctrs->send_length, ctrs->recv_length,
+			  ctrs->route_length, ctrs->drop_length);
+
+	rc = 0;
+	if (pos < min_t(int, nchars, strlen(tmpstr)))
+		rc = cfs_trace_copyout_string(buffer, nob,
+					      tmpstr + pos, "\n");
+
+	LIBCFS_FREE(tmpstr, tmpsiz);
+ out_ctrs:
+	LIBCFS_FREE(ctrs, sizeof(*ctrs));
+	return rc;
+}
+
+DECLARE_PROC_HANDLER(proc_lnet_stats);
+
+int LL_PROC_PROTO(proc_lnet_routes)
+{
+	const int	tmpsiz = 256;
+	char		*tmpstr;
+	char		*s;
+	int		rc = 0;
+	int		len;
+	int		ver;
+	int		off;
+
+	DECLARE_LL_PROC_PPOS_DECL;
+	/* One record per read(): the header first, then the off'th route. */
+	CLASSERT(sizeof(loff_t) >= 4);
+
+	off = LNET_PROC_HOFF_GET(*ppos);
+	ver = LNET_PROC_VER_GET(*ppos);
+
+	LASSERT (!write);
+
+	if (*lenp == 0)
+		return 0;
+
+	LIBCFS_ALLOC(tmpstr, tmpsiz);
+	if (tmpstr == NULL)
+		return -ENOMEM;
+
+	s = tmpstr; /* points to current position in tmpstr[] */
+
+	if (*ppos == 0) {
+		s += snprintf(s, tmpstr + tmpsiz - s, "Routing %s\n",
+			      the_lnet.ln_routing ? "enabled" : "disabled");
+		LASSERT (tmpstr + tmpsiz - s > 0);
+
+		s += snprintf(s, tmpstr + tmpsiz - s, "%-8s %4s %7s %s\n",
+			      "net", "hops", "state", "router");
+		LASSERT (tmpstr + tmpsiz - s > 0);
+
+		lnet_net_lock(0);
+		ver = (unsigned int)the_lnet.ln_remote_nets_version;
+		lnet_net_unlock(0);
+		*ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+	} else {
+		struct list_head		*n;
+		struct list_head		*r;
+		lnet_route_t		*route = NULL;
+		lnet_remotenet_t	*rnet  = NULL;
+		int			skip  = off - 1;
+		struct list_head		*rn_list;
+		int			i;
+
+		lnet_net_lock(0);
+
+		if (ver != LNET_PROC_VERSION(the_lnet.ln_remote_nets_version)) {
+			lnet_net_unlock(0);
+			LIBCFS_FREE(tmpstr, tmpsiz);
+			return -ESTALE;
+		}
+
+		for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE && route == NULL;
+		     i++) {
+			rn_list = &the_lnet.ln_remote_nets_hash[i];
+
+			n = rn_list->next;
+
+			while (n != rn_list && route == NULL) {
+				rnet = list_entry(n, lnet_remotenet_t,
+						      lrn_list);
+
+				r = rnet->lrn_routes.next;
+
+				while (r != &rnet->lrn_routes) {
+					lnet_route_t *re =
+						list_entry(r, lnet_route_t,
+							       lr_list);
+					if (skip == 0) {
+						route = re;
+						break;
+					}
+
+					skip--;
+					r = r->next;
+				}
+
+				n = n->next;
+			}
+		}
+
+		if (route != NULL) {
+			__u32	net   = rnet->lrn_net;
+			unsigned int hops  = route->lr_hops;
+			lnet_nid_t   nid   = route->lr_gateway->lp_nid;
+			int	  alive = route->lr_gateway->lp_alive;
+
+			s += snprintf(s, tmpstr + tmpsiz - s,
+				      "%-8s %4u %7s %s\n",
+				      libcfs_net2str(net), hops,
+				      alive ? "up" : "down",
+				      libcfs_nid2str(nid));
+			LASSERT(tmpstr + tmpsiz - s > 0);
+		}
+
+		lnet_net_unlock(0);
+	}
+
+	len = s - tmpstr;     /* how many bytes was written */
+
+	if (len > *lenp) {    /* linux-supplied buffer is too small */
+		rc = -EINVAL;
+	} else if (len > 0) { /* wrote something */
+		if (copy_to_user(buffer, tmpstr, len))
+			rc = -EFAULT;
+		else {
+			off += 1;
+			*ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+		}
+	}
+
+	LIBCFS_FREE(tmpstr, tmpsiz);
+
+	if (rc == 0)
+		*lenp = len;
+
+	return rc;
+}
+
+int LL_PROC_PROTO(proc_lnet_routers)
+{
+	int	rc = 0;
+	char      *tmpstr;
+	char      *s;
+	const int  tmpsiz = 256;
+	int	len;
+	int	ver;
+	int	off;
+
+	DECLARE_LL_PROC_PPOS_DECL;
+	/* One record per read(): the header first, then the off'th router. */
+	off = LNET_PROC_HOFF_GET(*ppos);
+	ver = LNET_PROC_VER_GET(*ppos);
+
+	LASSERT (!write);
+
+	if (*lenp == 0)
+		return 0;
+
+	LIBCFS_ALLOC(tmpstr, tmpsiz);
+	if (tmpstr == NULL)
+		return -ENOMEM;
+
+	s = tmpstr; /* points to current position in tmpstr[] */
+
+	if (*ppos == 0) {
+		s += snprintf(s, tmpstr + tmpsiz - s,
+			      "%-4s %7s %9s %6s %12s %9s %8s %7s %s\n",
+			      "ref", "rtr_ref", "alive_cnt", "state",
+			      "last_ping", "ping_sent", "deadline",
+			      "down_ni", "router");
+		LASSERT(tmpstr + tmpsiz - s > 0);
+
+		lnet_net_lock(0);
+		ver = (unsigned int)the_lnet.ln_routers_version;
+		lnet_net_unlock(0);
+		*ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+	} else {
+		struct list_head		*r;
+		struct lnet_peer	*peer = NULL;
+		int			skip = off - 1;
+
+		lnet_net_lock(0);
+
+		if (ver != LNET_PROC_VERSION(the_lnet.ln_routers_version)) {
+			lnet_net_unlock(0);
+
+			LIBCFS_FREE(tmpstr, tmpsiz);
+			return -ESTALE;
+		}
+
+		r = the_lnet.ln_routers.next;
+
+		while (r != &the_lnet.ln_routers) {
+			lnet_peer_t *lp = list_entry(r, lnet_peer_t,
+							 lp_rtr_list);
+
+			if (skip == 0) {
+				peer = lp;
+				break;
+			}
+
+			skip--;
+			r = r->next;
+		}
+
+		if (peer != NULL) {
+			lnet_nid_t nid = peer->lp_nid;
+			cfs_time_t now = cfs_time_current();
+			cfs_time_t deadline = peer->lp_ping_deadline;
+			int nrefs     = peer->lp_refcount;
+			int nrtrrefs  = peer->lp_rtr_refcount;
+			int alive_cnt = peer->lp_alive_count;
+			int alive     = peer->lp_alive;
+			int pingsent  = !peer->lp_ping_notsent;
+			int last_ping = cfs_duration_sec(cfs_time_sub(now,
+						     peer->lp_ping_timestamp));
+			int down_ni   = 0;
+			lnet_route_t *rtr;
+
+			if ((peer->lp_ping_feats &
+			     LNET_PING_FEAT_NI_STATUS) != 0) {
+				list_for_each_entry(rtr, &peer->lp_routes,
+							lr_gwlist) {
+					/* downis on any route should be the
+					 * number of downis on the gateway */
+					if (rtr->lr_downis != 0) {
+						down_ni = rtr->lr_downis;
+						break;
+					}
+				}
+			}
+
+			if (deadline == 0)
+				s += snprintf(s, tmpstr + tmpsiz - s,
+					      "%-4d %7d %9d %6s %12d %9d %8s %7d %s\n",
+					      nrefs, nrtrrefs, alive_cnt,
+					      alive ? "up" : "down", last_ping,
+					      pingsent, "NA", down_ni,
+					      libcfs_nid2str(nid));
+			else
+				s += snprintf(s, tmpstr + tmpsiz - s,
+					      "%-4d %7d %9d %6s %12d %9d %8lu %7d %s\n",
+					      nrefs, nrtrrefs, alive_cnt,
+					      alive ? "up" : "down", last_ping,
+					      pingsent,
+					      cfs_duration_sec(cfs_time_sub(deadline, now)),
+					      down_ni, libcfs_nid2str(nid));
+			LASSERT (tmpstr + tmpsiz - s > 0);
+		}
+
+		lnet_net_unlock(0);
+	}
+
+	len = s - tmpstr;     /* how many bytes was written */
+
+	if (len > *lenp) {    /* linux-supplied buffer is too small */
+		rc = -EINVAL;
+	} else if (len > 0) { /* wrote something */
+		if (copy_to_user(buffer, tmpstr, len))
+			rc = -EFAULT;
+		else {
+			off += 1;
+			*ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+		}
+	}
+
+	LIBCFS_FREE(tmpstr, tmpsiz);
+
+	if (rc == 0)
+		*lenp = len;
+
+	return rc;
+}
+
+int LL_PROC_PROTO(proc_lnet_peers)
+{
+	const int		tmpsiz  = 256;
+	struct lnet_peer_table	*ptable;
+	char			*tmpstr;
+	char			*s;
+	int			cpt  = LNET_PROC_CPT_GET(*ppos);
+	int			ver  = LNET_PROC_VER_GET(*ppos);
+	int			hash = LNET_PROC_HASH_GET(*ppos);
+	int			hoff = LNET_PROC_HOFF_GET(*ppos);
+	int			rc = 0;
+	int			len;
+	/* One record per read(); *ppos packs (cpt, version, hash, offset). */
+	CLASSERT(LNET_PROC_HASH_BITS >= LNET_PEER_HASH_BITS);
+	LASSERT(!write);
+
+	if (*lenp == 0)
+		return 0;
+
+	if (cpt >= LNET_CPT_NUMBER) {
+		*lenp = 0;
+		return 0;
+	}
+
+	LIBCFS_ALLOC(tmpstr, tmpsiz);
+	if (tmpstr == NULL)
+		return -ENOMEM;
+
+	s = tmpstr; /* points to current position in tmpstr[] */
+
+	if (*ppos == 0) {
+		s += snprintf(s, tmpstr + tmpsiz - s,
+			      "%-24s %4s %5s %5s %5s %5s %5s %5s %5s %s\n",
+			      "nid", "refs", "state", "last", "max",
+			      "rtr", "min", "tx", "min", "queue");
+		LASSERT (tmpstr + tmpsiz - s > 0);
+
+		hoff++;
+	} else {
+		struct lnet_peer	*peer;
+		struct list_head		*p;
+		int			skip;
+ again:
+		p = NULL;
+		peer = NULL;
+		skip = hoff - 1;
+
+		lnet_net_lock(cpt);
+		ptable = the_lnet.ln_peer_tables[cpt];
+		if (hoff == 1)
+			ver = LNET_PROC_VERSION(ptable->pt_version);
+
+		if (ver != LNET_PROC_VERSION(ptable->pt_version)) {
+			lnet_net_unlock(cpt);
+			LIBCFS_FREE(tmpstr, tmpsiz);
+			return -ESTALE;
+		}
+
+		while (hash < LNET_PEER_HASH_SIZE) {
+			if (p == NULL)
+				p = ptable->pt_hash[hash].next;
+
+			while (p != &ptable->pt_hash[hash]) {
+				lnet_peer_t *lp = list_entry(p, lnet_peer_t,
+								 lp_hashlist);
+				if (skip == 0) {
+					peer = lp;
+
+					/* minor optimization: start from idx+1
+					 * on next iteration if we've just
+					 * drained lp_hashlist */
+					if (lp->lp_hashlist.next ==
+					    &ptable->pt_hash[hash]) {
+						hoff = 1;
+						hash++;
+					} else {
+						hoff++;
+					}
+
+					break;
+				}
+
+				skip--;
+				p = lp->lp_hashlist.next;
+			}
+
+			if (peer != NULL)
+				break;
+
+			p = NULL;
+			hoff = 1;
+			hash++;
+		}
+
+		if (peer != NULL) {
+			lnet_nid_t nid       = peer->lp_nid;
+			int	nrefs     = peer->lp_refcount;
+			int	lastalive = -1;
+			char      *aliveness = "NA";
+			int	maxcr     = peer->lp_ni->ni_peertxcredits;
+			int	txcr      = peer->lp_txcredits;
+			int	mintxcr   = peer->lp_mintxcredits;
+			int	rtrcr     = peer->lp_rtrcredits;
+			int	minrtrcr  = peer->lp_minrtrcredits;
+			int	txqnob    = peer->lp_txqnob;
+
+			if (lnet_isrouter(peer) ||
+			    lnet_peer_aliveness_enabled(peer))
+				aliveness = peer->lp_alive ? "up" : "down";
+
+			if (lnet_peer_aliveness_enabled(peer)) {
+				cfs_time_t     now = cfs_time_current();
+				cfs_duration_t delta;
+
+				delta = cfs_time_sub(now, peer->lp_last_alive);
+				lastalive = cfs_duration_sec(delta);
+
+				/* No need to mess up peers contents with
+				 * arbitrarily long integers - it suffices to
+				 * know that lastalive is more than 10000s old
+				 */
+				if (lastalive >= 10000)
+					lastalive = 9999;
+			}
+
+			lnet_net_unlock(cpt);
+
+			s += snprintf(s, tmpstr + tmpsiz - s,
+				      "%-24s %4d %5s %5d %5d %5d %5d %5d %5d %d\n",
+				      libcfs_nid2str(nid), nrefs, aliveness,
+				      lastalive, maxcr, rtrcr, minrtrcr, txcr,
+				      mintxcr, txqnob);
+			LASSERT (tmpstr + tmpsiz - s > 0);
+
+		} else { /* peer is NULL */
+			lnet_net_unlock(cpt);
+		}
+
+		if (hash == LNET_PEER_HASH_SIZE) {
+			cpt++;
+			hash = 0;
+			hoff = 1;
+			if (peer == NULL && cpt < LNET_CPT_NUMBER)
+				goto again;
+		}
+	}
+
+	len = s - tmpstr;     /* how many bytes was written */
+
+	if (len > *lenp) {    /* linux-supplied buffer is too small */
+		rc = -EINVAL;
+	} else if (len > 0) { /* wrote something */
+		if (copy_to_user(buffer, tmpstr, len))
+			rc = -EFAULT;
+		else
+			*ppos = LNET_PROC_POS_MAKE(cpt, ver, hash, hoff);
+	}
+
+	LIBCFS_FREE(tmpstr, tmpsiz);
+
+	if (rc == 0)
+		*lenp = len;
+
+	return rc;
+}
+
+/*
+ * Read handler behind the "buffers" sysctl entry.
+ *
+ * Formats one header line followed, when the_lnet.ln_rtrpools is set, by one
+ * "pages count credits min" line per router buffer pool for every entry of
+ * the_lnet.ln_rtrpools, then copies the text starting at offset @pos into
+ * @buffer (at most @nob bytes are handed to cfs_trace_copyout_string()).
+ *
+ * @data is not used and @write must be zero (asserted).
+ *
+ * Returns -ENOMEM if the scratch buffer cannot be allocated, 0 when @pos is
+ * at or beyond the end of the generated text, otherwise whatever
+ * cfs_trace_copyout_string() returns.
+ */
+static int __proc_lnet_buffers(void *data, int write,
+			       loff_t pos, void *buffer, int nob)
+{
+	char	    *s;
+	char	    *tmpstr;
+	int		tmpsiz;
+	int		idx;
+	int		len;
+	int		rc;
+	int		i;
+
+	LASSERT(!write);
+
+	/* (4 %d) * 4 * LNET_CPT_NUMBER */
+	/* scratch size: 64 bytes * (LNET_NRBPOOLS + 1) * LNET_CPT_NUMBER */
+	tmpsiz = 64 * (LNET_NRBPOOLS + 1) * LNET_CPT_NUMBER;
+	LIBCFS_ALLOC(tmpstr, tmpsiz);
+	if (tmpstr == NULL)
+		return -ENOMEM;
+
+	s = tmpstr; /* points to current position in tmpstr[] */
+
+	s += snprintf(s, tmpstr + tmpsiz - s,
+		      "%5s %5s %7s %7s\n",
+		      "pages", "count", "credits", "min");
+	LASSERT (tmpstr + tmpsiz - s > 0);
+
+	if (the_lnet.ln_rtrpools == NULL)
+		goto out; /* I'm not a router */
+
+	for (idx = 0; idx < LNET_NRBPOOLS; idx++) {
+		lnet_rtrbufpool_t *rbp;
+
+		/* hold the exclusive net lock while pool idx is read from
+		 * every entry of the_lnet.ln_rtrpools */
+		lnet_net_lock(LNET_LOCK_EX);
+		cfs_percpt_for_each(rbp, i, the_lnet.ln_rtrpools) {
+			s += snprintf(s, tmpstr + tmpsiz - s,
+				      "%5d %5d %7d %7d\n",
+				      rbp[idx].rbp_npages,
+				      rbp[idx].rbp_nbuffers,
+				      rbp[idx].rbp_credits,
+				      rbp[idx].rbp_mincredits);
+			LASSERT(tmpstr + tmpsiz - s > 0);
+		}
+		lnet_net_unlock(LNET_LOCK_EX);
+	}
+
+ out:
+	len = s - tmpstr;
+
+	/* nothing more to hand out once pos has reached the end of the text */
+	if (pos >= min_t(int, len, strlen(tmpstr)))
+		rc = 0;
+	else
+		rc = cfs_trace_copyout_string(buffer, nob,
+					      tmpstr + pos, NULL);
+
+	LIBCFS_FREE(tmpstr, tmpsiz);
+	return rc;
+}
+
+/* NOTE(review): presumably generates the proc_lnet_buffers() sysctl wrapper
+ * around __proc_lnet_buffers(); the macro is defined outside this view */
+DECLARE_PROC_HANDLER(proc_lnet_buffers);
+
+/*
+ * Read handler behind the "nis" sysctl entry.
+ *
+ * Output is produced one record per call, selected by *ppos: position 0
+ * yields the column header, position N (N > 0) yields the lines for the
+ * entry at index N - 1 on the_lnet.ln_nis.  After a successful copy to user
+ * space *ppos is advanced by one.  On success *lenp is set to the number of
+ * bytes produced, which is 0 once the list has been exhausted.
+ *
+ * Returns 0 on success, -ENOMEM if the scratch buffer cannot be allocated,
+ * -EINVAL if the formatted text is longer than *lenp, and -EFAULT if
+ * copy_to_user() fails.
+ *
+ * NOTE(review): write, buffer, lenp and ppos are presumably supplied by the
+ * LL_PROC_PROTO() / DECLARE_LL_PROC_PPOS_DECL macros, which are defined
+ * outside this view.
+ */
+int LL_PROC_PROTO(proc_lnet_nis)
+{
+	int	tmpsiz = 128 * LNET_CPT_NUMBER;
+	int	rc = 0;
+	char      *tmpstr;
+	char      *s;
+	int	len;
+
+	DECLARE_LL_PROC_PPOS_DECL;
+
+	LASSERT (!write);
+
+	if (*lenp == 0)
+		return 0;
+
+	LIBCFS_ALLOC(tmpstr, tmpsiz);
+	if (tmpstr == NULL)
+		return -ENOMEM;
+
+	s = tmpstr; /* points to current position in tmpstr[] */
+
+	if (*ppos == 0) {
+		/* first call: emit only the column header */
+		s += snprintf(s, tmpstr + tmpsiz - s,
+			      "%-24s %6s %5s %4s %4s %4s %5s %5s %5s\n",
+			      "nid", "status", "alive", "refs", "peer",
+			      "rtr", "max", "tx", "min");
+		LASSERT (tmpstr + tmpsiz - s > 0);
+	} else {
+		struct list_head	*n;
+		lnet_ni_t	 *ni   = NULL;
+		int		skip = *ppos - 1;
+
+		lnet_net_lock(0);
+
+		n = the_lnet.ln_nis.next;
+
+		/* walk to the list entry at index (*ppos - 1), if any */
+		while (n != &the_lnet.ln_nis) {
+			lnet_ni_t *a_ni = list_entry(n, lnet_ni_t, ni_list);
+
+			if (skip == 0) {
+				ni = a_ni;
+				break;
+			}
+
+			skip--;
+			n = n->next;
+		}
+
+		if (ni != NULL) {
+			struct lnet_tx_queue	*tq;
+			char	*stat;
+			long	now = cfs_time_current_sec();
+			int	last_alive = -1;
+			int	i;
+			int	j;
+
+			/* last_alive stays -1 unless routing is enabled */
+			if (the_lnet.ln_routing)
+				last_alive = now - ni->ni_last_alive;
+
+			/* @lo forever alive */
+			if (ni->ni_lnd->lnd_type == LOLND)
+				last_alive = 0;
+
+			lnet_ni_lock(ni);
+			LASSERT(ni->ni_status != NULL);
+			stat = (ni->ni_status->ns_status ==
+				LNET_NI_STATUS_UP) ? "up" : "down";
+			lnet_ni_unlock(ni);
+
+			/* we actually output credits information for
+			 * TX queue of each partition */
+			cfs_percpt_for_each(tq, i, ni->ni_tx_queues) {
+				/* look for partition i in ni->ni_cpts; j ends
+				 * at ni->ni_ncpts when it is not found */
+				for (j = 0; ni->ni_cpts != NULL &&
+				     j < ni->ni_ncpts; j++) {
+					if (i == ni->ni_cpts[j])
+						break;
+				}
+
+				if (j == ni->ni_ncpts)
+					continue;
+
+				/* net lock 0 is already held by this function */
+				if (i != 0)
+					lnet_net_lock(i);
+
+				s += snprintf(s, tmpstr + tmpsiz - s,
+				      "%-24s %6s %5d %4d %4d %4d %5d %5d %5d\n",
+				      libcfs_nid2str(ni->ni_nid), stat,
+				      last_alive, *ni->ni_refs[i],
+				      ni->ni_peertxcredits,
+				      ni->ni_peerrtrcredits,
+				      tq->tq_credits_max,
+				      tq->tq_credits, tq->tq_credits_min);
+				if (i != 0)
+					lnet_net_unlock(i);
+			}
+			LASSERT(tmpstr + tmpsiz - s > 0);
+		}
+
+		lnet_net_unlock(0);
+	}
+
+	len = s - tmpstr;     /* how many bytes was written */
+
+	if (len > *lenp) {    /* linux-supplied buffer is too small */
+		rc = -EINVAL;
+	} else if (len > 0) { /* wrote something */
+		if (copy_to_user(buffer, tmpstr, len))
+			rc = -EFAULT;
+		else
+			*ppos += 1;
+	}
+
+	LIBCFS_FREE(tmpstr, tmpsiz);
+
+	if (rc == 0)
+		*lenp = len;
+
+	return rc;
+}
+
+/* One selectable portal rotor mode: its value, its name and a description. */
+struct lnet_portal_rotors {
+	int		 pr_value;
+	const char	*pr_name;
+	const char	*pr_desc;
+};
+
+/*
+ * Table of rotor modes, in { pr_value, pr_name, pr_desc } order.  The last
+ * entry (negative value, NULL name) terminates the table.
+ */
+static struct lnet_portal_rotors	portal_rotors[] = {
+	{ LNET_PTL_ROTOR_OFF, "OFF",
+	  "Turn off message rotor for wildcard portals" },
+	{ LNET_PTL_ROTOR_ON, "ON",
+	  "round-robin dispatch all PUT messages for wildcard portals" },
+	{ LNET_PTL_ROTOR_RR_RT, "RR_RT",
+	  "round-robin dispatch routed PUT message for wildcard portals" },
+	{ LNET_PTL_ROTOR_HASH_RT, "HASH_RT",
+	  "dispatch routed PUT message by hashing source NID "
+	  "for wildcard portals" },
+	{ -1, NULL, NULL },
+};
+
+extern int portal_rotor;
+
+/*
+ * Read/write handler behind the "portal_rotor" sysctl entry.
+ *
+ * Read (@write == 0): looks up the portal_rotors[] entry whose value equals
+ * the global portal_rotor, formats its name and description, and copies the
+ * text starting at offset @pos to @buffer.  Returns 0 when @pos is at or
+ * beyond the end of the text, otherwise the result of
+ * cfs_trace_copyout_string().
+ *
+ * Write: copies up to 128 bytes from @buffer, trims surrounding white space
+ * and compares the input with each table name over the length of that name
+ * using cfs_strncasecmp().  The first match is stored in portal_rotor and 0
+ * is returned; -EINVAL is returned if nothing matches, or the negative
+ * result of cfs_trace_copyin_string() if the copy fails.
+ *
+ * Returns -ENOMEM if the scratch buffer cannot be allocated.  @data is not
+ * used.
+ */
+static int __proc_lnet_portal_rotor(void *data, int write,
+				    loff_t pos, void *buffer, int nob)
+{
+	const int	buf_len	= 128;
+	char		*buf;
+	char		*tmp;
+	int		rc;
+	int		i;
+
+	LIBCFS_ALLOC(buf, buf_len);
+	if (buf == NULL)
+		return -ENOMEM;
+
+	if (!write) {
+		lnet_res_lock(0);
+
+		/* find the table entry describing the current setting */
+		for (i = 0; portal_rotors[i].pr_value >= 0; i++) {
+			if (portal_rotors[i].pr_value == portal_rotor)
+				break;
+		}
+
+		/* the current setting must be one of the table entries */
+		LASSERT(portal_rotors[i].pr_value == portal_rotor);
+		lnet_res_unlock(0);
+
+		rc = snprintf(buf, buf_len,
+			      "{\n\tportals: all\n"
+			      "\trotor: %s\n\tdescription: %s\n}",
+			      portal_rotors[i].pr_name,
+			      portal_rotors[i].pr_desc);
+
+		if (pos >= min_t(int, rc, buf_len)) {
+			rc = 0;
+		} else {
+			rc = cfs_trace_copyout_string(buffer, nob,
+					buf + pos, "\n");
+		}
+		goto out;
+	}
+
+	rc = cfs_trace_copyin_string(buf, buf_len, buffer, nob);
+	if (rc < 0)
+		goto out;
+
+	tmp = cfs_trimwhite(buf);
+
+	rc = -EINVAL;
+	lnet_res_lock(0);
+	for (i = 0; portal_rotors[i].pr_name != NULL; i++) {
+		/* only strlen(pr_name) characters of the input are compared */
+		if (cfs_strncasecmp(portal_rotors[i].pr_name, tmp,
+				    strlen(portal_rotors[i].pr_name)) == 0) {
+			portal_rotor = portal_rotors[i].pr_value;
+			rc = 0;
+			break;
+		}
+	}
+	lnet_res_unlock(0);
+out:
+	LIBCFS_FREE(buf, buf_len);
+	return rc;
+}
+/* NOTE(review): presumably generates the proc_lnet_portal_rotor() sysctl
+ * wrapper; the macro is defined outside this view */
+DECLARE_PROC_HANDLER(proc_lnet_portal_rotor);
+
+/*
+ * sysctl entries registered below the "lnet" directory (see top_table).
+ * "stats" and "portal_rotor" are mode 0644, all other entries are read-only
+ * (0444).  The table ends with an entry that has no procname.
+ */
+static ctl_table_t lnet_table[] = {
+	/*
+	 * NB No .strategy entries have been provided since sysctl(8) prefers
+	 * to go via /proc for portability.
+	 */
+	{
+		INIT_CTL_NAME(PSDEV_LNET_STATS)
+		.procname = "stats",
+		.mode     = 0644,
+		.proc_handler = &proc_lnet_stats,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_ROUTES)
+		.procname = "routes",
+		.mode     = 0444,
+		.proc_handler = &proc_lnet_routes,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_ROUTERS)
+		.procname = "routers",
+		.mode     = 0444,
+		.proc_handler = &proc_lnet_routers,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_PEERS)
+		.procname = "peers",
+		.mode     = 0444,
+		.proc_handler = &proc_lnet_peers,
+	},
+	{
+		/* NOTE(review): this entry reuses PSDEV_LNET_PEERS from the
+		 * "peers" entry above; it looks like a copy/paste slip and
+		 * presumably wants its own id - confirm against the
+		 * PSDEV_LNET_* definitions and INIT_CTL_NAME() */
+		INIT_CTL_NAME(PSDEV_LNET_PEERS)
+		.procname = "buffers",
+		.mode     = 0444,
+		.proc_handler = &proc_lnet_buffers,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_NIS)
+		.procname = "nis",
+		.mode     = 0444,
+		.proc_handler = &proc_lnet_nis,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_PTL_ROTOR)
+		.procname = "portal_rotor",
+		.mode     = 0644,
+		.proc_handler = &proc_lnet_portal_rotor,
+	},
+	{
+		INIT_CTL_NAME(0)
+	}
+};
+
+/*
+ * Root of the LNet sysctl tree: a single "lnet" directory (mode 0555) whose
+ * children are the entries of lnet_table.  Members that are not named here
+ * are zero-initialised, as for any object with static storage.
+ */
+static ctl_table_t top_table[] = {
+	{
+		INIT_CTL_NAME(CTL_LNET)
+		.procname = "lnet",
+		.child    = lnet_table,
+		.mode     = 0555,
+	},
+	{
+		INIT_CTL_NAME(0)
+	}
+};
+
+/*
+ * Register the LNet sysctl tree (top_table) unless a table header has
+ * already been recorded.  Does nothing when CONFIG_SYSCTL is not set.
+ */
+void
+lnet_proc_init(void)
+{
+#ifdef CONFIG_SYSCTL
+	if (lnet_table_header != NULL)
+		return; /* already registered */
+
+	lnet_table_header = cfs_register_sysctl_table(top_table, 0);
+#endif
+}
+
+/*
+ * Unregister the LNet sysctl tree if it was registered and forget the table
+ * header.  Does nothing when CONFIG_SYSCTL is not set.
+ */
+void
+lnet_proc_fini(void)
+{
+#ifdef CONFIG_SYSCTL
+	if (lnet_table_header == NULL)
+		return; /* nothing was registered */
+
+	unregister_sysctl_table(lnet_table_header);
+	lnet_table_header = NULL;
+#endif
+}
+
+#else
+
+/*
+ * Empty stub compiled on the #else side of the enclosing preprocessor
+ * conditional (its #if lies outside this view): nothing to register.
+ */
+void
+lnet_proc_init(void)
+{
+}
+
+/*
+ * Empty stub compiled on the #else side of the enclosing preprocessor
+ * conditional (its #if lies outside this view): nothing to unregister.
+ */
+void
+lnet_proc_fini(void)
+{
+}
+
+#endif
diff --git a/drivers/staging/lustre/lnet/selftest/Makefile b/drivers/staging/lustre/lnet/selftest/Makefile
new file mode 100644
index 000000000000..1e40aeea2962
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/Makefile
@@ -0,0 +1,6 @@
+# lnet_selftest.o is built according to CONFIG_LNET_SELFTEST and is linked
+# from the objects listed in lnet_selftest-y.
+obj-$(CONFIG_LNET_SELFTEST) := lnet_selftest.o
+
+lnet_selftest-y := console.o conrpc.o conctl.o framework.o timer.o rpc.o \
+		   module.o ping_test.o brw_test.o
+
+# add ../include (relative to this directory) to the header search path
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lnet/selftest/brw_test.c b/drivers/staging/lustre/lnet/selftest/brw_test.c
new file mode 100644
index 000000000000..3bb6fbe23f78
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/brw_test.c
@@ -0,0 +1,499 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/brw_test.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+
+#include "selftest.h"
+
+static int brw_srv_workitems = SFW_TEST_WI_MAX;
+CFS_MODULE_PARM(brw_srv_workitems, "i", int, 0644, "# BRW server workitems");
+
+static int brw_inject_errors;
+CFS_MODULE_PARM(brw_inject_errors, "i", int, 0644,
+		"# data errors to inject randomly, zero by default");
+
+/*
+ * Client-side teardown of a BRW test instance: free the bulk descriptor
+ * hanging off every test unit that has one and clear the unit's private
+ * pointer.  @tsi must be a client instance (asserted).
+ */
+static void
+brw_client_fini(sfw_test_instance_t *tsi)
+{
+	sfw_test_unit_t *unit;
+
+	LASSERT(tsi->tsi_is_client);
+
+	list_for_each_entry(unit, &tsi->tsi_units, tsu_list) {
+		srpc_bulk_t *bulk = unit->tsu_private;
+
+		if (bulk != NULL) {
+			srpc_free_bulk(bulk);
+			unit->tsu_private = NULL;
+		}
+	}
+}
+
+/*
+ * Client-side setup of a BRW test instance: read the bulk parameters from
+ * the test request (v0 or v1 layout, chosen by LST_FEAT_BULK_LEN in the
+ * session features), validate them and allocate one bulk descriptor per
+ * test unit.
+ *
+ * Returns 0 on success, -EINVAL for a bad page count, opcode or check flag,
+ * and -ENOMEM if a bulk allocation fails, in which case brw_client_fini()
+ * is called to release what was allocated.
+ */
+int
+brw_client_init(sfw_test_instance_t *tsi)
+{
+	sfw_session_t	*sn = tsi->tsi_batch->bat_session;
+	sfw_test_unit_t	*unit;
+	int		 opc;
+	int		 flags;
+	int		 len;
+	int		 npg;
+
+	LASSERT(sn != NULL);
+	LASSERT(tsi->tsi_is_client);
+
+	if ((sn->sn_features & LST_FEAT_BULK_LEN) != 0) {
+		test_bulk_req_v1_t *req_v1 = &tsi->tsi_u.bulk_v1;
+
+		/* I should never get this step if it's unknown feature
+		 * because make_session will reject unknown feature */
+		LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+		opc   = req_v1->blk_opc;
+		flags = req_v1->blk_flags;
+		len   = req_v1->blk_len;
+		/* round the byte length up to whole pages */
+		npg   = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	} else {
+		test_bulk_req_t *req_v0 = &tsi->tsi_u.bulk_v0;
+
+		opc   = req_v0->blk_opc;
+		flags = req_v0->blk_flags;
+		npg   = req_v0->blk_npg;
+		/* NB: this is not going to work for variable page size,
+		 * but we have to keep it for compatibility */
+		len   = npg * PAGE_CACHE_SIZE;
+	}
+
+	if (npg <= 0 || npg > LNET_MAX_IOV)
+		return -EINVAL;
+
+	if (opc != LST_BRW_READ && opc != LST_BRW_WRITE)
+		return -EINVAL;
+
+	if (flags != LST_BRW_CHECK_NONE &&
+	    flags != LST_BRW_CHECK_SIMPLE &&
+	    flags != LST_BRW_CHECK_FULL)
+		return -EINVAL;
+
+	list_for_each_entry(unit, &tsi->tsi_units, tsu_list) {
+		srpc_bulk_t *bulk;
+
+		bulk = srpc_alloc_bulk(lnet_cpt_of_nid(unit->tsu_dest.nid),
+				       npg, len, opc == LST_BRW_READ);
+		if (bulk == NULL) {
+			brw_client_fini(tsi);
+			return -ENOMEM;
+		}
+
+		unit->tsu_private = bulk;
+	}
+
+	return 0;
+}
+
+#define BRW_POISON      0xbeefbeefbeefbeefULL
+#define BRW_MAGIC       0xeeb0eeb1eeb2eeb3ULL
+#define BRW_MSIZE       sizeof(__u64)
+
+/*
+ * Decide whether to inject a data error.
+ *
+ * Returns 0 when brw_inject_errors is not positive or when the microsecond
+ * field of the current time is even.  Otherwise returns the current value of
+ * brw_inject_errors and then decrements it.
+ */
+int
+brw_inject_one_error(void)
+{
+	struct timeval now;
+
+	if (brw_inject_errors > 0) {
+		do_gettimeofday(&now);
+
+		if ((now.tv_usec & 1) != 0)
+			return brw_inject_errors--;
+	}
+
+	return 0;
+}
+
+/*
+ * Stamp @magic into page @pg according to the check mode @pattern:
+ *   LST_BRW_CHECK_NONE   - the page is left untouched;
+ *   LST_BRW_CHECK_SIMPLE - the first and the last BRW_MSIZE bytes are set;
+ *   LST_BRW_CHECK_FULL   - every BRW_MSIZE-sized word of the page is set.
+ * When @magic equals BRW_MAGIC, the value returned by brw_inject_one_error()
+ * is added to it before it is written.  Any other @pattern hits LBUG().
+ */
+void
+brw_fill_page(struct page *pg, int pattern, __u64 magic)
+{
+	char *base = page_address(pg);
+	char *cur;
+
+	LASSERT(base != NULL);
+
+	if (pattern == LST_BRW_CHECK_NONE)
+		return;
+
+	if (magic == BRW_MAGIC)
+		magic += brw_inject_one_error();
+
+	if (pattern == LST_BRW_CHECK_SIMPLE) {
+		memcpy(base, &magic, BRW_MSIZE);
+		memcpy(base + PAGE_CACHE_SIZE - BRW_MSIZE, &magic, BRW_MSIZE);
+	} else if (pattern == LST_BRW_CHECK_FULL) {
+		for (cur = base; cur < base + PAGE_CACHE_SIZE;
+		     cur += BRW_MSIZE)
+			memcpy(cur, &magic, BRW_MSIZE);
+	} else {
+		LBUG();
+	}
+}
+
+/*
+ * Verify that page @pg carries @magic in the places dictated by @pattern
+ * (see brw_fill_page() for the layouts).
+ *
+ * Returns 0 when the page checks out or @pattern is LST_BRW_CHECK_NONE.
+ * Returns 1 after logging the first mismatching word.  An unknown @pattern
+ * hits LBUG() and then falls into the same error path.
+ */
+int
+brw_check_page(struct page *pg, int pattern, __u64 magic)
+{
+	char  *base = page_address(pg);
+	__u64  data = 0; /* make compiler happy */
+	int    i;
+
+	LASSERT(base != NULL);
+
+	if (pattern == LST_BRW_CHECK_NONE)
+		return 0;
+
+	if (pattern == LST_BRW_CHECK_SIMPLE) {
+		/* first word of the page */
+		data = *((__u64 *)base);
+		if (data == magic) {
+			/* last word of the page */
+			data = *((__u64 *)(base + PAGE_CACHE_SIZE - BRW_MSIZE));
+			if (data == magic)
+				return 0;
+		}
+	} else if (pattern == LST_BRW_CHECK_FULL) {
+		__u64 *word = (__u64 *)base;
+
+		for (i = 0; i < PAGE_CACHE_SIZE / BRW_MSIZE; i++) {
+			data = word[i];
+			if (data != magic)
+				break;
+		}
+
+		/* loop ran to completion: every word matched */
+		if (!(i < PAGE_CACHE_SIZE / BRW_MSIZE))
+			return 0;
+	} else {
+		LBUG();
+	}
+
+	CERROR("Bad data in page %p: "LPX64", "LPX64" expected\n",
+	       pg, data, magic);
+	return 1;
+}
+
+/*
+ * Apply brw_fill_page() with @pattern and @magic to every page referenced
+ * by the first bk_niov entries of @bk.
+ */
+void
+brw_fill_bulk(srpc_bulk_t *bk, int pattern, __u64 magic)
+{
+	int idx;
+
+	for (idx = 0; idx < bk->bk_niov; idx++)
+		brw_fill_page(bk->bk_iovs[idx].kiov_page, pattern, magic);
+}
+
+/*
+ * Run brw_check_page() with @pattern and @magic over the pages of @bk, in
+ * order.  Stops at the first page that fails, logs it and returns 1;
+ * returns 0 when every page passes.
+ */
+int
+brw_check_bulk(srpc_bulk_t *bk, int pattern, __u64 magic)
+{
+	int idx = 0;
+
+	while (idx < bk->bk_niov) {
+		struct page *page = bk->bk_iovs[idx].kiov_page;
+
+		if (brw_check_page(page, pattern, magic) != 0) {
+			CERROR("Bulk page %p (%d/%d) is corrupted!\n",
+			       page, idx, bk->bk_niov);
+			return 1;
+		}
+
+		idx++;
+	}
+
+	return 0;
+}
+
+/*
+ * Build the BRW request RPC for test unit @tsu addressed to @dest.
+ *
+ * The bulk parameters are read from the test instance (v0 or v1 layout,
+ * chosen by LST_FEAT_BULK_LEN in the session features).  The unit's
+ * preallocated bulk descriptor is copied into the new RPC, its pages are
+ * filled with BRW_MAGIC for a write or BRW_POISON otherwise, and the request
+ * body is populated.
+ *
+ * Returns 0 and stores the RPC in *@rpcpp on success, otherwise the non-zero
+ * result of sfw_create_test_rpc().
+ */
+static int
+brw_client_prep_rpc(sfw_test_unit_t *tsu,
+		    lnet_process_id_t dest, srpc_client_rpc_t **rpcpp)
+{
+	sfw_test_instance_t *tsi = tsu->tsu_instance;
+	sfw_session_t	    *sn = tsi->tsi_batch->bat_session;
+	srpc_bulk_t	    *bulk = tsu->tsu_private;
+	srpc_client_rpc_t   *rpc;
+	srpc_brw_reqst_t    *body;
+	int		     opc;
+	int		     flags;
+	int		     len;
+	int		     npg;
+	int		     rc;
+
+	LASSERT(sn != NULL);
+	LASSERT(bulk != NULL);
+
+	if ((sn->sn_features & LST_FEAT_BULK_LEN) != 0) {
+		test_bulk_req_v1_t *req_v1 = &tsi->tsi_u.bulk_v1;
+
+		/* I should never get this step if it's unknown feature
+		 * because make_session will reject unknown feature */
+		LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+		opc   = req_v1->blk_opc;
+		flags = req_v1->blk_flags;
+		len   = req_v1->blk_len;
+		npg   = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	} else {
+		test_bulk_req_t *req_v0 = &tsi->tsi_u.bulk_v0;
+
+		opc   = req_v0->blk_opc;
+		flags = req_v0->blk_flags;
+		npg   = req_v0->blk_npg;
+		len   = npg * PAGE_CACHE_SIZE;
+	}
+
+	rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, npg, len, &rpc);
+	if (rc != 0)
+		return rc;
+
+	/* copy the descriptor header plus its first npg iov entries */
+	memcpy(&rpc->crpc_bulk, bulk, offsetof(srpc_bulk_t, bk_iovs[npg]));
+
+	brw_fill_bulk(&rpc->crpc_bulk, flags,
+		      opc == LST_BRW_WRITE ? BRW_MAGIC : BRW_POISON);
+
+	body = &rpc->crpc_reqstmsg.msg_body.brw_reqst;
+	body->brw_rw    = opc;
+	body->brw_len   = len;
+	body->brw_flags = flags;
+
+	*rpcpp = rpc;
+	return 0;
+}
+
+/*
+ * Completion handler for a client BRW RPC.
+ *
+ * A failed RPC is logged and, unless the test instance is stopping, counted
+ * in the session's sn_brw_errors.  Otherwise the reply status is byte-swapped
+ * when the reply magic differs from SRPC_MSG_MAGIC; a non-zero status is
+ * counted and stored negated in rpc->crpc_status.  For anything but a write
+ * request the received bulk data is then checked against the (possibly
+ * swapped) BRW_MAGIC, and corruption is counted and reported as -EBADMSG.
+ */
+static void
+brw_client_done_rpc(sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc)
+{
+	sfw_test_instance_t *tsi = tsu->tsu_instance;
+	sfw_session_t	    *sn = tsi->tsi_batch->bat_session;
+	srpc_msg_t	    *rpy_msg = &rpc->crpc_replymsg;
+	srpc_brw_reply_t    *rpy = &rpy_msg->msg_body.brw_reply;
+	srpc_brw_reqst_t    *req = &rpc->crpc_reqstmsg.msg_body.brw_reqst;
+	__u64		     expect = BRW_MAGIC;
+
+	LASSERT(sn != NULL);
+
+	if (rpc->crpc_status != 0) {
+		CERROR("BRW RPC to %s failed with %d\n",
+		       libcfs_id2str(rpc->crpc_dest), rpc->crpc_status);
+		if (!tsi->tsi_stopping) /* rpc could have been aborted */
+			atomic_inc(&sn->sn_brw_errors);
+		return;
+	}
+
+	/* the reply magic differs: swap the values we compare against */
+	if (rpy_msg->msg_magic != SRPC_MSG_MAGIC) {
+		__swab64s(&expect);
+		__swab32s(&rpy->brw_status);
+	}
+
+	CDEBUG(rpy->brw_status ? D_WARNING : D_NET,
+	       "BRW RPC to %s finished with brw_status: %d\n",
+	       libcfs_id2str(rpc->crpc_dest), rpy->brw_status);
+
+	if (rpy->brw_status != 0) {
+		atomic_inc(&sn->sn_brw_errors);
+		rpc->crpc_status = -(int)rpy->brw_status;
+		return;
+	}
+
+	/* nothing came back to verify for a write */
+	if (req->brw_rw == LST_BRW_WRITE)
+		return;
+
+	if (brw_check_bulk(&rpc->crpc_bulk, req->brw_flags, expect) == 0)
+		return;
+
+	CERROR("Bulk data from %s is corrupted!\n",
+	       libcfs_id2str(rpc->crpc_dest));
+	atomic_inc(&sn->sn_brw_errors);
+	rpc->crpc_status = -EBADMSG;
+}
+
+/*
+ * Server-side completion of a BRW RPC: when the RPC has a bulk attached,
+ * log the outcome of the transfer and release its pages through
+ * sfw_free_pages().  Does nothing when no bulk is attached.
+ */
+void
+brw_server_rpc_done(srpc_server_rpc_t *rpc)
+{
+	srpc_bulk_t *bulk = rpc->srpc_bulk;
+	const char  *dir;
+
+	if (bulk == NULL)
+		return;
+
+	/* direction word used by both messages below */
+	dir = bulk->bk_sink ? "from" : "to";
+
+	if (rpc->srpc_status == 0) {
+		CDEBUG(D_NET, "Transfered %d pages bulk data %s %s\n",
+		       bulk->bk_niov, dir, libcfs_id2str(rpc->srpc_peer));
+	} else {
+		CERROR("Bulk transfer %s %s has failed: %d\n",
+		       dir, libcfs_id2str(rpc->srpc_peer), rpc->srpc_status);
+	}
+
+	sfw_free_pages(rpc);
+}
+
+/*
+ * Server-side callback run once the bulk transfer of a BRW RPC has finished
+ * with @status.
+ *
+ * Returns -EIO when @status is non-zero.  Otherwise returns 0; for a request
+ * that is not a read, the received data is checked against BRW_MAGIC
+ * (byte-swapped when the request magic differs from SRPC_MSG_MAGIC) and
+ * corruption is reported to the peer by setting brw_status to EBADMSG in the
+ * reply.
+ */
+int
+brw_bulk_ready(srpc_server_rpc_t *rpc, int status)
+{
+	srpc_msg_t	 *req_msg;
+	srpc_brw_reqst_t *req;
+	__u64		  expect = BRW_MAGIC;
+
+	LASSERT(rpc->srpc_bulk != NULL);
+	LASSERT(rpc->srpc_reqstbuf != NULL);
+
+	req_msg = &rpc->srpc_reqstbuf->buf_msg;
+	req = &req_msg->msg_body.brw_reqst;
+
+	if (status != 0) {
+		CERROR("BRW bulk %s failed for RPC from %s: %d\n",
+		       req->brw_rw == LST_BRW_READ ? "READ" : "WRITE",
+		       libcfs_id2str(rpc->srpc_peer), status);
+		return -EIO;
+	}
+
+	/* a read sent data out; there is nothing to verify here */
+	if (req->brw_rw == LST_BRW_READ)
+		return 0;
+
+	if (req_msg->msg_magic != SRPC_MSG_MAGIC)
+		__swab64s(&expect);
+
+	if (brw_check_bulk(rpc->srpc_bulk, req->brw_flags, expect) != 0) {
+		CERROR("Bulk data from %s is corrupted!\n",
+		       libcfs_id2str(rpc->srpc_peer));
+		rpc->srpc_replymsg.msg_body.brw_reply.brw_status = EBADMSG;
+	}
+
+	return 0;
+}
+
+/*
+ * Server-side handler for an incoming BRW request.
+ *
+ * Byte-swaps the request fields when the message magic is the swapped form
+ * of SRPC_MSG_MAGIC, validates opcode, check flags, session features and
+ * length, then allocates the bulk pages and pre-fills them (BRW_MAGIC for a
+ * read, BRW_POISON otherwise).
+ *
+ * Validation failures are reported to the peer through reply->brw_status
+ * (positive EINVAL / EPROTO) while the function itself returns 0.  A
+ * non-zero return value is the result of a failed sfw_alloc_pages().
+ */
+int
+brw_server_handle(struct srpc_server_rpc *rpc)
+{
+	struct srpc_service	*sv = rpc->srpc_scd->scd_svc;
+	srpc_msg_t       *replymsg = &rpc->srpc_replymsg;
+	srpc_msg_t       *reqstmsg = &rpc->srpc_reqstbuf->buf_msg;
+	srpc_brw_reply_t *reply = &replymsg->msg_body.brw_reply;
+	srpc_brw_reqst_t *reqst = &reqstmsg->msg_body.brw_reqst;
+	int		  npg;
+	int	       rc;
+
+	LASSERT (sv->sv_id == SRPC_SERVICE_BRW);
+
+	/* request magic is byte-swapped: swap the request fields in place */
+	if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) {
+		LASSERT (reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+		__swab32s(&reqst->brw_rw);
+		__swab32s(&reqst->brw_len);
+		__swab32s(&reqst->brw_flags);
+		__swab64s(&reqst->brw_rpyid);
+		__swab64s(&reqst->brw_bulkid);
+	}
+	LASSERT (reqstmsg->msg_type == (__u32)srpc_service2request(sv->sv_id));
+
+	reply->brw_status = 0;
+	rpc->srpc_done = brw_server_rpc_done;
+
+	/* reject unknown opcodes and unknown check flags */
+	if ((reqst->brw_rw != LST_BRW_READ && reqst->brw_rw != LST_BRW_WRITE) ||
+	    (reqst->brw_flags != LST_BRW_CHECK_NONE &&
+	     reqst->brw_flags != LST_BRW_CHECK_FULL &&
+	     reqst->brw_flags != LST_BRW_CHECK_SIMPLE)) {
+		reply->brw_status = EINVAL;
+		return 0;
+	}
+
+	/* unknown feature bits: answer with the mask we support and EPROTO */
+	if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+		replymsg->msg_ses_feats = LST_FEATS_MASK;
+		reply->brw_status = EPROTO;
+		return 0;
+	}
+
+	if ((reqstmsg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) {
+		/* compat with old version */
+		if ((reqst->brw_len & ~CFS_PAGE_MASK) != 0) {
+			reply->brw_status = EINVAL;
+			return 0;
+		}
+		npg = reqst->brw_len >> PAGE_CACHE_SHIFT;
+
+	} else {
+		/* round the byte length up to whole pages */
+		npg = (reqst->brw_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	}
+
+	replymsg->msg_ses_feats = reqstmsg->msg_ses_feats;
+
+	if (reqst->brw_len == 0 || npg > LNET_MAX_IOV) {
+		reply->brw_status = EINVAL;
+		return 0;
+	}
+
+	rc = sfw_alloc_pages(rpc, rpc->srpc_scd->scd_cpt, npg,
+			     reqst->brw_len,
+			     reqst->brw_rw == LST_BRW_WRITE);
+	if (rc != 0)
+		return rc;
+
+	if (reqst->brw_rw == LST_BRW_READ)
+		brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_MAGIC);
+	else
+		brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_POISON);
+
+	return 0;
+}
+
+/* Client operations of the BRW test; filled in by brw_init_test_client(). */
+sfw_test_client_ops_t brw_test_client;
+
+/*
+ * Hook the BRW client callbacks (init, fini, prep_rpc, done_rpc) into
+ * brw_test_client.
+ */
+void brw_init_test_client(void)
+{
+	brw_test_client.tso_init       = brw_client_init;
+	brw_test_client.tso_fini       = brw_client_fini;
+	brw_test_client.tso_prep_rpc   = brw_client_prep_rpc;
+	brw_test_client.tso_done_rpc   = brw_client_done_rpc;
+}
+
+/* Service descriptor of the BRW test; filled in by brw_init_test_service(). */
+srpc_service_t brw_test_service;
+
+/*
+ * Populate brw_test_service: service id and name, the request and
+ * bulk-ready handlers, and the workitem total taken from the
+ * brw_srv_workitems module parameter.
+ */
+void brw_init_test_service(void)
+{
+	brw_test_service.sv_name       = "brw_test";
+	brw_test_service.sv_id         = SRPC_SERVICE_BRW;
+	brw_test_service.sv_wi_total   = brw_srv_workitems;
+	brw_test_service.sv_handler    = brw_server_handle;
+	brw_test_service.sv_bulk_ready = brw_bulk_ready;
+}
diff --git a/drivers/staging/lustre/lnet/selftest/conctl.c b/drivers/staging/lustre/lnet/selftest/conctl.c
new file mode 100644
index 000000000000..bce3d3bde6b2
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/conctl.c
@@ -0,0 +1,931 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/conctl.c
+ *
+ * IOC handle in kernel
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/lnetst.h>
+#include "console.h"
+
+/*
+ * Console ioctl: create a new session.
+ *
+ * Validates the arguments, copies the session name into a NUL-terminated
+ * kernel buffer and passes it to lstcon_session_new().
+ *
+ * Returns -EINVAL for missing or out-of-range arguments, -ENOMEM if the name
+ * buffer cannot be allocated, -EFAULT if the name cannot be copied in,
+ * otherwise the result of lstcon_session_new().
+ */
+int
+lst_session_new_ioctl(lstio_session_new_args_t *args)
+{
+	char	*sname;
+	int	 rc;
+
+	/* need an address for the output sid and a non-zero key */
+	if (args->lstio_ses_idp == NULL || args->lstio_ses_key == 0)
+		return -EINVAL;
+
+	/* need a session name of 1..LST_NAME_SIZE bytes */
+	if (args->lstio_ses_namep == NULL ||
+	    args->lstio_ses_nmlen <= 0 ||
+	    args->lstio_ses_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	LIBCFS_ALLOC(sname, args->lstio_ses_nmlen + 1);
+	if (sname == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(sname, args->lstio_ses_namep,
+			   args->lstio_ses_nmlen)) {
+		rc = -EFAULT;
+		goto out_free;
+	}
+
+	sname[args->lstio_ses_nmlen] = 0;
+
+	rc = lstcon_session_new(sname,
+				args->lstio_ses_key,
+				args->lstio_ses_feats,
+				args->lstio_ses_force,
+				args->lstio_ses_timeout,
+				args->lstio_ses_idp);
+out_free:
+	LIBCFS_FREE(sname, args->lstio_ses_nmlen + 1);
+	return rc;
+}
+
+/*
+ * Console ioctl: end the current session.  Returns -EACCES when the supplied
+ * key differs from the console session key, otherwise the result of
+ * lstcon_session_end().
+ */
+int
+lst_session_end_ioctl(lstio_session_end_args_t *args)
+{
+	if (args->lstio_ses_key == console_session.ses_key)
+		return lstcon_session_end();
+
+	return -EACCES;
+}
+
+/*
+ * Console ioctl: query information about the current session.  The session
+ * key is deliberately not checked.
+ *
+ * Returns -EINVAL when any output address is NULL or the name length is
+ * outside 1..LST_NAME_SIZE, otherwise the result of lstcon_session_info().
+ */
+int
+lst_session_info_ioctl(lstio_session_info_args_t *args)
+{
+	/* every output address must be supplied */
+	if (args->lstio_ses_idp == NULL ||	/* output sid */
+	    args->lstio_ses_keyp == NULL ||	/* output key */
+	    args->lstio_ses_featp == NULL ||	/* output features */
+	    args->lstio_ses_ndinfo == NULL ||	/* output ndinfo */
+	    args->lstio_ses_namep == NULL)	/* output name */
+		return -EINVAL;
+
+	if (args->lstio_ses_nmlen <= 0 ||
+	    args->lstio_ses_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	return lstcon_session_info(args->lstio_ses_idp,
+				   args->lstio_ses_keyp,
+				   args->lstio_ses_featp,
+				   args->lstio_ses_ndinfo,
+				   args->lstio_ses_namep,
+				   args->lstio_ses_nmlen);
+}
+
+/*
+ * Console ioctl: debug a session, a batch (client or server side), a group
+ * or an explicit list of nodes, selected by lstio_dbg_type.
+ *
+ * Returns -EACCES on a key mismatch; -EINVAL for a NULL result address, a
+ * bad name length, an unknown type, or a type whose required name / id list
+ * is missing; -ENOMEM / -EFAULT if the optional name cannot be allocated /
+ * copied in; otherwise the result of the selected lstcon_*_debug() call.
+ */
+int
+lst_debug_ioctl(lstio_debug_args_t *args)
+{
+	char	*name = NULL;
+	int	 rc = -EINVAL;
+
+	if (args->lstio_dbg_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_dbg_resultp == NULL)
+		return -EINVAL;
+
+	/* the batch/group name is optional at this point */
+	if (args->lstio_dbg_namep != NULL) {
+		if (args->lstio_dbg_nmlen <= 0 ||
+		    args->lstio_dbg_nmlen > LST_NAME_SIZE)
+			return -EINVAL;
+
+		LIBCFS_ALLOC(name, args->lstio_dbg_nmlen + 1);
+		if (name == NULL)
+			return -ENOMEM;
+
+		if (copy_from_user(name, args->lstio_dbg_namep,
+				   args->lstio_dbg_nmlen)) {
+			LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1);
+			return -EFAULT;
+		}
+
+		name[args->lstio_dbg_nmlen] = 0;
+	}
+
+	switch (args->lstio_dbg_type) {
+	case LST_OPC_SESSION:
+		rc = lstcon_session_debug(args->lstio_dbg_timeout,
+					  args->lstio_dbg_resultp);
+		break;
+
+	case LST_OPC_BATCHSRV:
+	case LST_OPC_BATCHCLI:
+		/* the third argument is 1 for the client side, 0 for server */
+		if (name != NULL)
+			rc = lstcon_batch_debug(args->lstio_dbg_timeout, name,
+						args->lstio_dbg_type ==
+						LST_OPC_BATCHCLI,
+						args->lstio_dbg_resultp);
+		break;
+
+	case LST_OPC_GROUP:
+		if (name != NULL)
+			rc = lstcon_group_debug(args->lstio_dbg_timeout,
+						name, args->lstio_dbg_resultp);
+		break;
+
+	case LST_OPC_NODES:
+		if (args->lstio_dbg_count > 0 &&
+		    args->lstio_dbg_idsp != NULL)
+			rc = lstcon_nodes_debug(args->lstio_dbg_timeout,
+						args->lstio_dbg_count,
+						args->lstio_dbg_idsp,
+						args->lstio_dbg_resultp);
+		break;
+
+	default:
+		break;
+	}
+
+	if (name != NULL)
+		LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1);
+
+	return rc;
+}
+
+/*
+ * Console ioctl: add a group.
+ *
+ * Returns -EACCES on a key mismatch, -EINVAL for a missing name or a name
+ * length outside 1..LST_NAME_SIZE, -ENOMEM if the name buffer cannot be
+ * allocated, -EFAULT if the name cannot be copied in, otherwise the result
+ * of lstcon_group_add().
+ */
+int
+lst_group_add_ioctl(lstio_group_add_args_t *args)
+{
+	char	*name;
+	int	 rc;
+
+	if (args->lstio_grp_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_grp_namep == NULL ||
+	    args->lstio_grp_nmlen <= 0 ||
+	    args->lstio_grp_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+	if (name == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(name,
+			   args->lstio_grp_namep,
+			   args->lstio_grp_nmlen)) {
+		rc = -EFAULT;
+		goto out_free;
+	}
+
+	name[args->lstio_grp_nmlen] = 0;
+
+	rc = lstcon_group_add(name);
+
+out_free:
+	/* single release point: the size must match the allocation above */
+	LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+	return rc;
+}
+
+/*
+ * Console ioctl: delete a group.
+ *
+ * Returns -EACCES on a key mismatch, -EINVAL for a missing name or a name
+ * length outside 1..LST_NAME_SIZE, -ENOMEM if the name buffer cannot be
+ * allocated, -EFAULT if the name cannot be copied in, otherwise the result
+ * of lstcon_group_del().
+ */
+int
+lst_group_del_ioctl(lstio_group_del_args_t *args)
+{
+	char	*grp_name;
+	int	 rc;
+
+	if (args->lstio_grp_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_grp_namep == NULL)
+		return -EINVAL;
+
+	if (args->lstio_grp_nmlen <= 0 ||
+	    args->lstio_grp_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	LIBCFS_ALLOC(grp_name, args->lstio_grp_nmlen + 1);
+	if (grp_name == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(grp_name, args->lstio_grp_namep,
+			   args->lstio_grp_nmlen)) {
+		rc = -EFAULT;
+		goto out_free;
+	}
+
+	grp_name[args->lstio_grp_nmlen] = 0;
+
+	rc = lstcon_group_del(grp_name);
+
+out_free:
+	LIBCFS_FREE(grp_name, args->lstio_grp_nmlen + 1);
+
+	return rc;
+}
+
+/*
+ * Console ioctl: update a group.  lstio_grp_opc selects between cleaning the
+ * group, refreshing it, or removing the listed nodes from it.
+ *
+ * Returns -EACCES on a key mismatch; -EINVAL for a NULL result address, a
+ * missing or badly sized name, an unknown opcode, or a node removal without
+ * a node list; -ENOMEM / -EFAULT if the name cannot be allocated / copied
+ * in; otherwise the result of the selected lstcon_*() call.
+ */
+int
+lst_group_update_ioctl(lstio_group_update_args_t *args)
+{
+	char	*grp_name;
+	int	 rc = -EINVAL;
+
+	if (args->lstio_grp_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_grp_resultp == NULL ||
+	    args->lstio_grp_namep == NULL)
+		return -EINVAL;
+
+	if (args->lstio_grp_nmlen <= 0 ||
+	    args->lstio_grp_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	LIBCFS_ALLOC(grp_name, args->lstio_grp_nmlen + 1);
+	if (grp_name == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(grp_name, args->lstio_grp_namep,
+			   args->lstio_grp_nmlen)) {
+		rc = -EFAULT;
+		goto out_free;
+	}
+
+	grp_name[args->lstio_grp_nmlen] = 0;
+
+	/* rc stays -EINVAL unless one of the operations below runs */
+	switch (args->lstio_grp_opc) {
+	case LST_GROUP_CLEAN:
+		rc = lstcon_group_clean(grp_name, args->lstio_grp_args);
+		break;
+
+	case LST_GROUP_REFRESH:
+		rc = lstcon_group_refresh(grp_name, args->lstio_grp_resultp);
+		break;
+
+	case LST_GROUP_RMND:
+		if (args->lstio_grp_count > 0 &&
+		    args->lstio_grp_idsp != NULL)
+			rc = lstcon_nodes_remove(grp_name,
+						 args->lstio_grp_count,
+						 args->lstio_grp_idsp,
+						 args->lstio_grp_resultp);
+		break;
+
+	default:
+		break;
+	}
+
+out_free:
+	LIBCFS_FREE(grp_name, args->lstio_grp_nmlen + 1);
+
+	return rc;
+}
+
+/*
+ * Console ioctl: add nodes to a group and report the resulting feature bits
+ * back to user space through lstio_grp_featp.
+ *
+ * Returns -EACCES on a key mismatch; -EINVAL for missing arguments, a
+ * non-positive node count or a badly sized name; -ENOMEM if the name buffer
+ * cannot be allocated; -EFAULT if the name cannot be copied in or the
+ * feature bits cannot be copied out; otherwise the result of
+ * lstcon_nodes_add().
+ */
+int
+lst_nodes_add_ioctl(lstio_group_nodes_args_t *args)
+{
+	unsigned feats;
+	int     rc;
+	char   *name;
+
+	if (args->lstio_grp_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_grp_idsp == NULL || /* array of ids */
+	    args->lstio_grp_count <= 0 ||
+	    args->lstio_grp_resultp == NULL ||
+	    args->lstio_grp_featp == NULL ||
+	    args->lstio_grp_namep == NULL ||
+	    args->lstio_grp_nmlen <= 0 ||
+	    args->lstio_grp_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+	if (name == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(name, args->lstio_grp_namep,
+			   args->lstio_grp_nmlen)) {
+		LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+		return -EFAULT;
+	}
+
+	name[args->lstio_grp_nmlen] = 0;
+
+	rc = lstcon_nodes_add(name, args->lstio_grp_count,
+			      args->lstio_grp_idsp, &feats,
+			      args->lstio_grp_resultp);
+
+	LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+	/* feats is only copied out when lstcon_nodes_add() succeeded; a
+	 * failed copy to user space is a fault, as elsewhere in this file */
+	if (rc == 0 &&
+	    copy_to_user(args->lstio_grp_featp, &feats, sizeof(feats)))
+		return -EFAULT;
+
+	return rc;
+}
+
+/*
+ * Console ioctl: look up a group by index and return its name.
+ *
+ * Returns -EACCES on a key mismatch, -EINVAL for a negative index, a NULL
+ * name address or a name length outside 1..LST_NAME_SIZE, otherwise the
+ * result of lstcon_group_list().
+ */
+int
+lst_group_list_ioctl(lstio_group_list_args_t *args)
+{
+	if (args->lstio_grp_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_grp_idx < 0 || args->lstio_grp_namep == NULL)
+		return -EINVAL;
+
+	if (args->lstio_grp_nmlen <= 0 ||
+	    args->lstio_grp_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	return lstcon_group_list(args->lstio_grp_idx,
+				 args->lstio_grp_nmlen,
+				 args->lstio_grp_namep);
+}
+
+/*
+ * Console ioctl: query a group and/or its node entries.
+ *
+ * At least one of the two output addresses (group entry, node entries) must
+ * be supplied.  When node entries are requested, the starting index and the
+ * entry count are read from user space before the call and written back
+ * afterwards.
+ *
+ * Returns -EACCES on a key mismatch; -EINVAL for missing or out-of-range
+ * arguments; -ENOMEM if the name buffer cannot be allocated; -EFAULT if any
+ * copy from or to user space fails; the non-zero result of
+ * lstcon_group_info() if it fails; 0 on success.
+ */
+int
+lst_group_info_ioctl(lstio_group_info_args_t *args)
+{
+	char	   *name;
+	int	     ndent;
+	int	     index;
+	int	     rc;
+
+	if (args->lstio_grp_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_grp_namep == NULL ||
+	    args->lstio_grp_nmlen <= 0 ||
+	    args->lstio_grp_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	if (args->lstio_grp_entp  == NULL && /* output: group entry */
+	    args->lstio_grp_dentsp == NULL)  /* output: node entry */
+		return -EINVAL;
+
+	if (args->lstio_grp_dentsp != NULL) { /* have node entry */
+		if (args->lstio_grp_idxp == NULL || /* node index */
+		    args->lstio_grp_ndentp == NULL) /* # of node entry */
+			return -EINVAL;
+
+		if (copy_from_user(&ndent, args->lstio_grp_ndentp,
+				   sizeof(ndent)) ||
+		    copy_from_user(&index, args->lstio_grp_idxp,
+				   sizeof(index)))
+			return -EFAULT;
+
+		if (ndent <= 0 || index < 0)
+			return -EINVAL;
+	}
+
+	LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+	if (name == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(name,
+			   args->lstio_grp_namep,
+			   args->lstio_grp_nmlen)) {
+		LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+		return -EFAULT;
+	}
+
+	name[args->lstio_grp_nmlen] = 0;
+
+	rc = lstcon_group_info(name, args->lstio_grp_entp,
+			       &index, &ndent, args->lstio_grp_dentsp);
+
+	LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+	if (rc != 0)
+		return rc;
+
+	/* report a failed write-back instead of silently returning success */
+	if (args->lstio_grp_dentsp != NULL &&
+	    (copy_to_user(args->lstio_grp_idxp, &index, sizeof(index)) ||
+	     copy_to_user(args->lstio_grp_ndentp, &ndent, sizeof(ndent))))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * LSTIO_BATCH_ADD handler: copy the batch name in from the caller and
+ * create the batch through lstcon_batch_add().
+ *
+ * Returns -EACCES on a session key mismatch, -EINVAL for a NULL name or a
+ * name length outside 1..LST_NAME_SIZE, -ENOMEM / -EFAULT when the name
+ * cannot be allocated / copied, otherwise the result of lstcon_batch_add().
+ */
+int
+lst_batch_add_ioctl(lstio_batch_add_args_t *args)
+{
+	char	*batname;
+	int	 rc;
+
+	if (args->lstio_bat_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_bat_namep == NULL)
+		return -EINVAL;
+
+	if (args->lstio_bat_nmlen <= 0 ||
+	    args->lstio_bat_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	/* name plus room for the terminating NUL */
+	LIBCFS_ALLOC(batname, args->lstio_bat_nmlen + 1);
+	if (batname == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(batname, args->lstio_bat_namep,
+			   args->lstio_bat_nmlen)) {
+		rc = -EFAULT;
+		goto out;
+	}
+
+	batname[args->lstio_bat_nmlen] = 0;
+	rc = lstcon_batch_add(batname);
+out:
+	LIBCFS_FREE(batname, args->lstio_bat_nmlen + 1);
+	return rc;
+}
+
+/*
+ * LSTIO_BATCH_START handler: copy the batch name in from the caller and
+ * start the batch through lstcon_batch_run(), forwarding the timeout and
+ * the result pointer unchanged.
+ *
+ * Returns -EACCES on a session key mismatch, -EINVAL for a NULL name or a
+ * name length outside 1..LST_NAME_SIZE, -ENOMEM / -EFAULT when the name
+ * cannot be allocated / copied, otherwise the result of lstcon_batch_run().
+ */
+int
+lst_batch_run_ioctl(lstio_batch_run_args_t *args)
+{
+	char	*batname;
+	int	 rc;
+
+	if (args->lstio_bat_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_bat_namep == NULL)
+		return -EINVAL;
+
+	if (args->lstio_bat_nmlen <= 0 ||
+	    args->lstio_bat_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	/* name plus room for the terminating NUL */
+	LIBCFS_ALLOC(batname, args->lstio_bat_nmlen + 1);
+	if (batname == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(batname, args->lstio_bat_namep,
+			   args->lstio_bat_nmlen)) {
+		rc = -EFAULT;
+		goto out;
+	}
+
+	batname[args->lstio_bat_nmlen] = 0;
+	rc = lstcon_batch_run(batname, args->lstio_bat_timeout,
+			      args->lstio_bat_resultp);
+out:
+	LIBCFS_FREE(batname, args->lstio_bat_nmlen + 1);
+	return rc;
+}
+
+/*
+ * LSTIO_BATCH_STOP handler: copy the batch name in from the caller and
+ * stop the batch through lstcon_batch_stop(), forwarding the force flag
+ * and the result pointer unchanged.
+ *
+ * Returns -EACCES on a session key mismatch, -EINVAL for a NULL result or
+ * name pointer or a name length outside 1..LST_NAME_SIZE, -ENOMEM /
+ * -EFAULT when the name cannot be allocated / copied, otherwise the result
+ * of lstcon_batch_stop().
+ */
+int
+lst_batch_stop_ioctl(lstio_batch_stop_args_t *args)
+{
+	char	*batname;
+	int	 rc;
+
+	if (args->lstio_bat_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_bat_resultp == NULL || args->lstio_bat_namep == NULL)
+		return -EINVAL;
+
+	if (args->lstio_bat_nmlen <= 0 ||
+	    args->lstio_bat_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	/* name plus room for the terminating NUL */
+	LIBCFS_ALLOC(batname, args->lstio_bat_nmlen + 1);
+	if (batname == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(batname, args->lstio_bat_namep,
+			   args->lstio_bat_nmlen)) {
+		rc = -EFAULT;
+		goto out;
+	}
+
+	batname[args->lstio_bat_nmlen] = 0;
+	rc = lstcon_batch_stop(batname, args->lstio_bat_force,
+			       args->lstio_bat_resultp);
+out:
+	LIBCFS_FREE(batname, args->lstio_bat_nmlen + 1);
+	return rc;
+}
+
+/*
+ * LSTIO_BATCH_QUERY handler: copy the batch name in from the caller and
+ * query it through lstcon_test_batch_query(), forwarding the test index,
+ * client flag, timeout and result pointer unchanged.
+ *
+ * Returns -EACCES on a session key mismatch, -EINVAL for a NULL result or
+ * name pointer, a name length outside 1..LST_NAME_SIZE or a negative test
+ * index, -ENOMEM / -EFAULT when the name cannot be allocated / copied,
+ * otherwise the result of lstcon_test_batch_query().
+ */
+int
+lst_batch_query_ioctl(lstio_batch_query_args_t *args)
+{
+	char	*batname;
+	int	 rc;
+
+	if (args->lstio_bat_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_bat_resultp == NULL || args->lstio_bat_namep == NULL)
+		return -EINVAL;
+
+	if (args->lstio_bat_nmlen <= 0 ||
+	    args->lstio_bat_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	if (args->lstio_bat_testidx < 0)
+		return -EINVAL;
+
+	/* name plus room for the terminating NUL */
+	LIBCFS_ALLOC(batname, args->lstio_bat_nmlen + 1);
+	if (batname == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(batname, args->lstio_bat_namep,
+			   args->lstio_bat_nmlen)) {
+		rc = -EFAULT;
+		goto out;
+	}
+
+	batname[args->lstio_bat_nmlen] = 0;
+	rc = lstcon_test_batch_query(batname,
+				     args->lstio_bat_testidx,
+				     args->lstio_bat_client,
+				     args->lstio_bat_timeout,
+				     args->lstio_bat_resultp);
+out:
+	LIBCFS_FREE(batname, args->lstio_bat_nmlen + 1);
+	return rc;
+}
+
+/*
+ * LSTIO_BATCH_LIST handler: validate the request and pass the batch index,
+ * name length and name buffer pointer on to lstcon_batch_list().
+ *
+ * Returns -EACCES when the caller's key differs from the session key,
+ * -EINVAL for a negative index, a NULL name pointer or a name length
+ * outside 1..LST_NAME_SIZE, otherwise the result of lstcon_batch_list().
+ */
+int
+lst_batch_list_ioctl(lstio_batch_list_args_t *args)
+{
+	if (args->lstio_bat_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_bat_idx < 0)
+		return -EINVAL;
+
+	if (args->lstio_bat_namep == NULL)
+		return -EINVAL;
+
+	if (args->lstio_bat_nmlen <= 0 ||
+	    args->lstio_bat_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	return lstcon_batch_list(args->lstio_bat_idx,
+				 args->lstio_bat_nmlen,
+				 args->lstio_bat_namep);
+}
+
+/*
+ * LSTIO_BATCH_INFO handler: look a batch up by name and report its summary
+ * entry, a window of its node entries, or both.
+ *
+ * When lstio_bat_dentsp is set, the starting node index and the number of
+ * entries are read from lstio_bat_idxp / lstio_bat_ndentp before the query
+ * and the values left in them by lstcon_batch_info() are written back
+ * afterwards.
+ *
+ * Returns 0 on success, -EACCES on a session key mismatch, -EINVAL on bad
+ * arguments, -ENOMEM when the name buffer cannot be allocated, -EFAULT
+ * when a copy from/to the caller fails, or the error returned by
+ * lstcon_batch_info().
+ */
+int
+lst_batch_info_ioctl(lstio_batch_info_args_t *args)
+{
+	char	*batname;
+	int	 first;		/* node index, in and out */
+	int	 count;		/* number of node entries, in and out */
+	int	 rc;
+
+	if (args->lstio_bat_key != console_session.ses_key)
+		return -EACCES;
+
+	/* batch name */
+	if (args->lstio_bat_namep == NULL)
+		return -EINVAL;
+
+	if (args->lstio_bat_nmlen <= 0 ||
+	    args->lstio_bat_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	/* the caller must ask for a batch entry, node entries, or both */
+	if (args->lstio_bat_entp == NULL && args->lstio_bat_dentsp == NULL)
+		return -EINVAL;
+
+	if (args->lstio_bat_dentsp != NULL) {
+		/* node entries need both an index and an entry count */
+		if (args->lstio_bat_idxp == NULL ||
+		    args->lstio_bat_ndentp == NULL)
+			return -EINVAL;
+
+		if (copy_from_user(&first, args->lstio_bat_idxp,
+				   sizeof(first)) ||
+		    copy_from_user(&count, args->lstio_bat_ndentp,
+				   sizeof(count)))
+			return -EFAULT;
+
+		if (count <= 0 || first < 0)
+			return -EINVAL;
+	}
+
+	/* name plus room for the terminating NUL */
+	LIBCFS_ALLOC(batname, args->lstio_bat_nmlen + 1);
+	if (batname == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(batname, args->lstio_bat_namep,
+			   args->lstio_bat_nmlen)) {
+		LIBCFS_FREE(batname, args->lstio_bat_nmlen + 1);
+		return -EFAULT;
+	}
+
+	batname[args->lstio_bat_nmlen] = 0;
+
+	rc = lstcon_batch_info(batname,
+			       args->lstio_bat_entp, args->lstio_bat_server,
+			       args->lstio_bat_testidx, &first, &count,
+			       args->lstio_bat_dentsp);
+
+	LIBCFS_FREE(batname, args->lstio_bat_nmlen + 1);
+
+	/* hand the updated index/count back only after a successful query */
+	if (rc == 0 && args->lstio_bat_dentsp != NULL) {
+		if (copy_to_user(args->lstio_bat_idxp, &first,
+				 sizeof(first)) ||
+		    copy_to_user(args->lstio_bat_ndentp, &count,
+				 sizeof(count)))
+			rc = -EFAULT;
+	}
+
+	return rc;
+}
+
+/*
+ * LSTIO_STAT_QUERY handler: collect statistics either for an explicit list
+ * of node ids (lstio_sta_idsp / lstio_sta_count) or, when no id list is
+ * given, for the group named by lstio_sta_namep.
+ *
+ * The group name is only validated, copied in and NUL-terminated on the
+ * by-name path; the id path does not use it, so a caller may leave the
+ * name NULL there.
+ *
+ * Returns -EACCES on a session key mismatch, -EINVAL on bad arguments,
+ * -ENOMEM / -EFAULT when the name cannot be allocated / copied, otherwise
+ * the result of lstcon_nodes_stat() or lstcon_group_stat().
+ */
+int
+lst_stat_query_ioctl(lstio_stat_args_t *args)
+{
+	int	     rc;
+	char	   *name;
+
+	/* TODO: not finished */
+	if (args->lstio_sta_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_sta_resultp == NULL)
+		return -EINVAL;
+
+	if (args->lstio_sta_idsp != NULL) {
+		/* query by node ids: the group name plays no part here */
+		if (args->lstio_sta_count <= 0)
+			return -EINVAL;
+
+		return lstcon_nodes_stat(args->lstio_sta_count,
+					 args->lstio_sta_idsp,
+					 args->lstio_sta_timeout,
+					 args->lstio_sta_resultp);
+	}
+
+	/* query by group name */
+	if (args->lstio_sta_namep == NULL ||
+	    args->lstio_sta_nmlen <= 0 ||
+	    args->lstio_sta_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	/* one extra byte for the terminating NUL added below */
+	LIBCFS_ALLOC(name, args->lstio_sta_nmlen + 1);
+	if (name == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(name, args->lstio_sta_namep,
+			       args->lstio_sta_nmlen)) {
+		LIBCFS_FREE(name, args->lstio_sta_nmlen + 1);
+		return -EFAULT;
+	}
+
+	name[args->lstio_sta_nmlen] = 0;
+
+	rc = lstcon_group_stat(name, args->lstio_sta_timeout,
+			       args->lstio_sta_resultp);
+
+	LIBCFS_FREE(name, args->lstio_sta_nmlen + 1);
+
+	return rc;
+}
+
+/*
+ * LSTIO_TEST_ADD handler: copy the batch name, source and destination
+ * group names and the optional test parameter blob in from the caller,
+ * then add the test through lstcon_test_add().
+ *
+ * If lstcon_test_add() leaves a non-zero value in its "ret" output, that
+ * value is copied to lstio_tes_retp and the return code becomes the
+ * outcome of that copy (0 or -EFAULT).
+ *
+ * Returns -EINVAL on bad arguments, -ENOMEM when a buffer cannot be
+ * allocated, -EFAULT when a copy from/to the caller fails, otherwise the
+ * result described above.
+ */
+int lst_test_add_ioctl(lstio_test_args_t *args)
+{
+	char	   *name;
+	char	   *srcgrp = NULL;
+	char	   *dstgrp = NULL;
+	void	   *param = NULL;
+	int	     ret = 0;
+	int	     rc = -ENOMEM;
+
+	if (args->lstio_tes_resultp == NULL ||
+	    args->lstio_tes_retp == NULL ||
+	    args->lstio_tes_bat_name == NULL || /* no specified batch */
+	    args->lstio_tes_bat_nmlen <= 0 ||
+	    args->lstio_tes_bat_nmlen > LST_NAME_SIZE ||
+	    args->lstio_tes_sgrp_name == NULL || /* no source group */
+	    args->lstio_tes_sgrp_nmlen <= 0 ||
+	    args->lstio_tes_sgrp_nmlen > LST_NAME_SIZE ||
+	    args->lstio_tes_dgrp_name == NULL || /* no target group */
+	    args->lstio_tes_dgrp_nmlen <= 0 ||
+	    args->lstio_tes_dgrp_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	if (args->lstio_tes_loop == 0 || /* negative is infinite */
+	    args->lstio_tes_concur <= 0 ||
+	    args->lstio_tes_dist <= 0 ||
+	    args->lstio_tes_span <= 0)
+		return -EINVAL;
+
+	/* have parameter, check if parameter length is valid */
+	if (args->lstio_tes_param != NULL &&
+	    (args->lstio_tes_param_len <= 0 ||
+	     args->lstio_tes_param_len > PAGE_CACHE_SIZE - sizeof(lstcon_test_t)))
+		return -EINVAL;
+
+	/* each name buffer has one extra byte for the terminating NUL */
+	LIBCFS_ALLOC(name, args->lstio_tes_bat_nmlen + 1);
+	if (name == NULL)
+		return rc;
+
+	LIBCFS_ALLOC(srcgrp, args->lstio_tes_sgrp_nmlen + 1);
+	if (srcgrp == NULL)
+		goto out;
+
+	LIBCFS_ALLOC(dstgrp, args->lstio_tes_dgrp_nmlen + 1);
+	if (dstgrp == NULL)
+		goto out;
+
+	if (args->lstio_tes_param != NULL) {
+		LIBCFS_ALLOC(param, args->lstio_tes_param_len);
+		if (param == NULL)
+			goto out;
+	}
+
+	rc = -EFAULT;
+	if (copy_from_user(name,
+			      args->lstio_tes_bat_name,
+			      args->lstio_tes_bat_nmlen) ||
+	    copy_from_user(srcgrp,
+			      args->lstio_tes_sgrp_name,
+			      args->lstio_tes_sgrp_nmlen) ||
+	    copy_from_user(dstgrp,
+			      args->lstio_tes_dgrp_name,
+			      args->lstio_tes_dgrp_nmlen))
+		goto out;
+
+	/* Only copy the parameter blob when one was supplied: without it
+	 * "param" is NULL and lstio_tes_param_len was never validated, so
+	 * an unconditional copy would target a NULL kernel buffer. */
+	if (param != NULL &&
+	    copy_from_user(param, args->lstio_tes_param,
+			      args->lstio_tes_param_len))
+		goto out;
+
+	/* terminate explicitly, as the other name-taking handlers do */
+	name[args->lstio_tes_bat_nmlen] = 0;
+	srcgrp[args->lstio_tes_sgrp_nmlen] = 0;
+	dstgrp[args->lstio_tes_dgrp_nmlen] = 0;
+
+	/* NOTE(review): lstio_tes_param_len is still forwarded as-is when no
+	 * parameter was supplied - confirm lstcon_test_add() ignores it then */
+	rc = lstcon_test_add(name,
+			    args->lstio_tes_type,
+			    args->lstio_tes_loop,
+			    args->lstio_tes_concur,
+			    args->lstio_tes_dist, args->lstio_tes_span,
+			    srcgrp, dstgrp, param, args->lstio_tes_param_len,
+			    &ret, args->lstio_tes_resultp);
+
+	if (ret != 0)
+		rc = (copy_to_user(args->lstio_tes_retp, &ret,
+				       sizeof(ret))) ? -EFAULT : 0;
+out:
+	if (name != NULL)
+		LIBCFS_FREE(name, args->lstio_tes_bat_nmlen + 1);
+
+	if (srcgrp != NULL)
+		LIBCFS_FREE(srcgrp, args->lstio_tes_sgrp_nmlen + 1);
+
+	if (dstgrp != NULL)
+		LIBCFS_FREE(dstgrp, args->lstio_tes_dgrp_nmlen + 1);
+
+	if (param != NULL)
+		LIBCFS_FREE(param, args->lstio_tes_param_len);
+
+	return rc;
+}
+
+/*
+ * Entry point for IOC_LIBCFS_LNETST ioctls.
+ *
+ * Copies the opcode-specific argument block (ioc_pbuf1 / ioc_plen1) into a
+ * kernel buffer, takes the console session mutex, refreshes the session
+ * time stamp, ends an expired session, and dispatches on the opcode held
+ * in ioc_u32[0]. After dispatch the session's transaction statistics are
+ * copied out to ioc_pbuf2; a failure of that copy replaces the handler's
+ * return code with -EFAULT.
+ *
+ * Returns -EINVAL for a foreign ioctl command, an oversized argument block
+ * or an unknown opcode, -ENOMEM / -EFAULT when the arguments cannot be
+ * allocated / copied, -ESHUTDOWN while the session is shutting down,
+ * -ESRCH when there is no active session and the opcode is not
+ * LSTIO_SESSION_NEW, otherwise the handler's result.
+ *
+ * NOTE(review): ioc_plen1 is only bounded from above; nothing visible here
+ * checks it against the size of the structure each handler casts the
+ * buffer to - confirm callers always pass a full structure.
+ */
+int
+lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_data *data)
+{
+	char   *argbuf;
+	int     opcode;
+	int     rc;
+
+	if (cmd != IOC_LIBCFS_LNETST)
+		return -EINVAL;
+
+	if (data->ioc_plen1 > PAGE_CACHE_SIZE)
+		return -EINVAL;
+
+	opcode = data->ioc_u32[0];
+
+	LIBCFS_ALLOC(argbuf, data->ioc_plen1);
+	if (argbuf == NULL)
+		return -ENOMEM;
+
+	/* copy in parameter */
+	if (copy_from_user(argbuf, data->ioc_pbuf1, data->ioc_plen1)) {
+		LIBCFS_FREE(argbuf, data->ioc_plen1);
+		return -EFAULT;
+	}
+
+	mutex_lock(&console_session.ses_mutex);
+
+	console_session.ses_laststamp = cfs_time_current_sec();
+
+	if (console_session.ses_shutdown) {
+		rc = -ESHUTDOWN;
+		goto out;
+	}
+
+	if (console_session.ses_expired)
+		lstcon_session_end();
+
+	/* everything except "new session" needs a live session */
+	if (opcode != LSTIO_SESSION_NEW &&
+	    console_session.ses_state == LST_SESSION_NONE) {
+		CDEBUG(D_NET, "LST no active session\n");
+		rc = -ESRCH;
+		goto out;
+	}
+
+	/* start every request with clean transaction statistics */
+	memset(&console_session.ses_trans_stat, 0, sizeof(lstcon_trans_stat_t));
+
+	switch (opcode) {
+	case LSTIO_SESSION_NEW:
+		rc = lst_session_new_ioctl((lstio_session_new_args_t *)argbuf);
+		break;
+	case LSTIO_SESSION_END:
+		rc = lst_session_end_ioctl((lstio_session_end_args_t *)argbuf);
+		break;
+	case LSTIO_SESSION_INFO:
+		rc = lst_session_info_ioctl((lstio_session_info_args_t *)argbuf);
+		break;
+	case LSTIO_DEBUG:
+		rc = lst_debug_ioctl((lstio_debug_args_t *)argbuf);
+		break;
+	case LSTIO_GROUP_ADD:
+		rc = lst_group_add_ioctl((lstio_group_add_args_t *)argbuf);
+		break;
+	case LSTIO_GROUP_DEL:
+		rc = lst_group_del_ioctl((lstio_group_del_args_t *)argbuf);
+		break;
+	case LSTIO_GROUP_UPDATE:
+		rc = lst_group_update_ioctl((lstio_group_update_args_t *)argbuf);
+		break;
+	case LSTIO_NODES_ADD:
+		rc = lst_nodes_add_ioctl((lstio_group_nodes_args_t *)argbuf);
+		break;
+	case LSTIO_GROUP_LIST:
+		rc = lst_group_list_ioctl((lstio_group_list_args_t *)argbuf);
+		break;
+	case LSTIO_GROUP_INFO:
+		rc = lst_group_info_ioctl((lstio_group_info_args_t *)argbuf);
+		break;
+	case LSTIO_BATCH_ADD:
+		rc = lst_batch_add_ioctl((lstio_batch_add_args_t *)argbuf);
+		break;
+	case LSTIO_BATCH_START:
+		rc = lst_batch_run_ioctl((lstio_batch_run_args_t *)argbuf);
+		break;
+	case LSTIO_BATCH_STOP:
+		rc = lst_batch_stop_ioctl((lstio_batch_stop_args_t *)argbuf);
+		break;
+	case LSTIO_BATCH_QUERY:
+		rc = lst_batch_query_ioctl((lstio_batch_query_args_t *)argbuf);
+		break;
+	case LSTIO_BATCH_LIST:
+		rc = lst_batch_list_ioctl((lstio_batch_list_args_t *)argbuf);
+		break;
+	case LSTIO_BATCH_INFO:
+		rc = lst_batch_info_ioctl((lstio_batch_info_args_t *)argbuf);
+		break;
+	case LSTIO_TEST_ADD:
+		rc = lst_test_add_ioctl((lstio_test_args_t *)argbuf);
+		break;
+	case LSTIO_STAT_QUERY:
+		rc = lst_stat_query_ioctl((lstio_stat_args_t *)argbuf);
+		break;
+	default:
+		rc = -EINVAL;
+		break;
+	}
+
+	/* statistics go out whatever the handler returned */
+	if (copy_to_user(data->ioc_pbuf2, &console_session.ses_trans_stat,
+			 sizeof(lstcon_trans_stat_t)))
+		rc = -EFAULT;
+out:
+	mutex_unlock(&console_session.ses_mutex);
+
+	LIBCFS_FREE(argbuf, data->ioc_plen1);
+
+	return rc;
+}
+
+EXPORT_SYMBOL(lstcon_ioctl_entry);
diff --git a/drivers/staging/lustre/lnet/selftest/conrpc.c b/drivers/staging/lustre/lnet/selftest/conrpc.c
new file mode 100644
index 000000000000..446de0e4672f
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/conrpc.c
@@ -0,0 +1,1397 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/conrpc.c
+ *
+ * Console framework rpcs
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ */
+
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lib-lnet.h>
+#include "timer.h"
+#include "conrpc.h"
+#include "console.h"
+
+void lstcon_rpc_stat_reply(lstcon_rpc_trans_t *, srpc_msg_t *,
+			   lstcon_node_t *, lstcon_trans_stat_t *);
+
+/*
+ * Callback handed to sfw_create_rpc() by lstcon_rpc_init(); presumably
+ * invoked by the selftest framework once the client RPC is finished.
+ *
+ * An RPC that no longer belongs to a transaction (crp_trans == NULL) is
+ * simply released. Otherwise it is marked finished, its completion stamp
+ * and status are recorded unless it was already stamped (aborted), and the
+ * waiter on the transaction is woken when this was the last outstanding
+ * RPC.
+ */
+static void
+lstcon_rpc_done(srpc_client_rpc_t *rpc)
+{
+	lstcon_rpc_t *crpc = (lstcon_rpc_t *)rpc->crpc_priv;
+
+	LASSERT(crpc != NULL && rpc == crpc->crp_rpc);
+	LASSERT(crpc->crp_posted && !crpc->crp_finished);
+
+	spin_lock(&rpc->crpc_lock);
+
+	if (crpc->crp_trans == NULL) {
+		/* Orphan RPC: it is not attached to any transaction, so
+		 * nobody is waiting for its result */
+		spin_unlock(&rpc->crpc_lock);
+
+		/* release it */
+		lstcon_rpc_put(crpc);
+		return;
+	}
+
+	/* not an orphan RPC */
+	crpc->crp_finished = 1;
+
+	if (crpc->crp_stamp == 0) {
+		/* not aborted: record when it completed and how */
+		LASSERT (crpc->crp_status == 0);
+
+		crpc->crp_stamp  = cfs_time_current();
+		crpc->crp_status = rpc->crpc_status;
+	}
+
+	/* wakeup (transaction)thread if I'm the last RPC in the transaction */
+	if (atomic_dec_and_test(&crpc->crp_trans->tas_remaining))
+		wake_up(&crpc->crp_trans->tas_waitq);
+
+	spin_unlock(&rpc->crpc_lock);
+}
+
+/*
+ * Create the underlying selftest client RPC for @crpc, aimed at node @nd,
+ * and reset the console-side bookkeeping of @crpc.
+ *
+ * @embedded is stored in the descriptor as given. On success the session's
+ * live RPC counter is incremented.
+ *
+ * Returns 0 on success or -ENOMEM when sfw_create_rpc() returns NULL.
+ */
+int
+lstcon_rpc_init(lstcon_node_t *nd, int service, unsigned feats,
+		int bulk_npg, int bulk_len, int embedded, lstcon_rpc_t *crpc)
+{
+	/* lstcon_rpc_done() is the callback, @crpc its private data */
+	crpc->crp_rpc = sfw_create_rpc(nd->nd_id, service, feats,
+				       bulk_npg, bulk_len,
+				       lstcon_rpc_done, (void *)crpc);
+	if (crpc->crp_rpc == NULL)
+		return -ENOMEM;
+
+	/* ownership */
+	crpc->crp_node     = nd;
+	crpc->crp_trans    = NULL;
+	crpc->crp_embedded = embedded;
+	INIT_LIST_HEAD(&crpc->crp_link);
+
+	/* progress flags and result */
+	crpc->crp_posted   = 0;
+	crpc->crp_finished = 0;
+	crpc->crp_unpacked = 0;
+	crpc->crp_stamp    = 0;
+	crpc->crp_status   = 0;
+
+	atomic_inc(&console_session.ses_rpc_counter);
+
+	return 0;
+}
+
+/*
+ * Obtain a console RPC descriptor - recycled from the session free list
+ * when one is available, freshly allocated otherwise - and initialise it
+ * as a non-embedded RPC for node @nd.
+ *
+ * On success the descriptor is stored in *crpcpp and 0 is returned. On
+ * failure the descriptor is freed and -ENOMEM or the lstcon_rpc_init()
+ * error is returned.
+ */
+int
+lstcon_rpc_prep(lstcon_node_t *nd, int service, unsigned feats,
+		int bulk_npg, int bulk_len, lstcon_rpc_t **crpcpp)
+{
+	lstcon_rpc_t  *desc = NULL;
+	int	       rc;
+
+	/* try the free list first */
+	spin_lock(&console_session.ses_rpc_lock);
+	if (!list_empty(&console_session.ses_rpc_freelist)) {
+		desc = list_entry(console_session.ses_rpc_freelist.next,
+				  lstcon_rpc_t, crp_link);
+		list_del_init(&desc->crp_link);
+	}
+	spin_unlock(&console_session.ses_rpc_lock);
+
+	if (desc == NULL) {
+		LIBCFS_ALLOC(desc, sizeof(*desc));
+		if (desc == NULL)
+			return -ENOMEM;
+	}
+
+	rc = lstcon_rpc_init(nd, service, feats, bulk_npg, bulk_len, 0, desc);
+	if (rc != 0) {
+		LIBCFS_FREE(desc, sizeof(*desc));
+		return rc;
+	}
+
+	*crpcpp = desc;
+	return 0;
+}
+
+/*
+ * Release a console RPC: free the pages attached to its bulk descriptor,
+ * drop the reference on the underlying client RPC, then either wipe the
+ * descriptor (embedded RPCs) or park it on the session free list, and
+ * finally decrement the session's live RPC counter.
+ *
+ * The descriptor must not be linked on any list when this is called.
+ */
+void
+lstcon_rpc_put(lstcon_rpc_t *crpc)
+{
+	srpc_bulk_t *bulk = &crpc->crp_rpc->crpc_bulk;
+	int	     i;
+
+	LASSERT (list_empty(&crpc->crp_link));
+
+	/* slots without a page are skipped */
+	for (i = 0; i < bulk->bk_niov; i++) {
+		if (bulk->bk_iovs[i].kiov_page != NULL)
+			__free_page(bulk->bk_iovs[i].kiov_page);
+	}
+
+	srpc_client_rpc_decref(crpc->crp_rpc);
+
+	if (!crpc->crp_embedded) {
+		/* stand-alone descriptor: keep it around for reuse */
+		spin_lock(&console_session.ses_rpc_lock);
+		list_add(&crpc->crp_link,
+			 &console_session.ses_rpc_freelist);
+		spin_unlock(&console_session.ses_rpc_lock);
+	} else {
+		/* embedded RPC, don't recycle it */
+		memset(crpc, 0, sizeof(*crpc));
+		crpc->crp_embedded = 1;
+	}
+
+	/* RPC is not alive now */
+	atomic_dec(&console_session.ses_rpc_counter);
+}
+
+/*
+ * Post one RPC of a transaction: count it as outstanding on the owning
+ * transaction, flag it as posted and hand it to sfw_post_rpc().
+ *
+ * The RPC must already belong to a transaction.
+ */
+void
+lstcon_rpc_post(lstcon_rpc_t *crpc)
+{
+	lstcon_rpc_trans_t *trans;
+
+	trans = crpc->crp_trans;
+	LASSERT (trans != NULL);
+
+	/* account for the RPC before it is handed over */
+	atomic_inc(&trans->tas_remaining);
+	crpc->crp_posted = 1;
+
+	sfw_post_rpc(crpc->crp_rpc);
+}
+
+/*
+ * Map a transaction opcode to a short constant name for debug messages.
+ * Opcodes not listed here yield "Unknown".
+ */
+static char *
+lstcon_rpc_trans_name(int transop)
+{
+	switch (transop) {
+	case LST_TRANS_SESNEW:
+		return "SESNEW";
+	case LST_TRANS_SESEND:
+		return "SESEND";
+	case LST_TRANS_SESQRY:
+		return "SESQRY";
+	case LST_TRANS_SESPING:
+		return "SESPING";
+	case LST_TRANS_TSBCLIADD:
+		return "TSBCLIADD";
+	case LST_TRANS_TSBSRVADD:
+		return "TSBSRVADD";
+	case LST_TRANS_TSBRUN:
+		return "TSBRUN";
+	case LST_TRANS_TSBSTOP:
+		return "TSBSTOP";
+	case LST_TRANS_TSBCLIQRY:
+		return "TSBCLIQRY";
+	case LST_TRANS_TSBSRVQRY:
+		return "TSBSRVQRY";
+	case LST_TRANS_STATQRY:
+		return "STATQRY";
+	default:
+		return "Unknown";
+	}
+}
+
+/*
+ * Allocate and register a new RPC transaction.
+ *
+ * @translist is the optional per-object transaction list. When given, the
+ * new transaction is linked onto it through tas_olink, and the request is
+ * refused with -EPERM if that list already holds a transaction whose
+ * opcode, masked with @transop, equals LST_TRANS_PRIVATE. Every transaction
+ * is also linked onto the session-wide list through tas_link.
+ *
+ * On success the transaction is stored in *transpp and 0 is returned;
+ * -ENOMEM is returned when the allocation fails.
+ */
+int
+lstcon_rpc_trans_prep(struct list_head *translist,
+		      int transop, lstcon_rpc_trans_t **transpp)
+{
+	lstcon_rpc_trans_t *trans;
+
+	if (translist != NULL) {
+		/* Entries sit on @translist via tas_olink (see the
+		 * list_add_tail() below), so the walk has to use that
+		 * member too; walking it via tas_link would resolve each
+		 * node to a misaligned transaction pointer. */
+		list_for_each_entry(trans, translist, tas_olink) {
+			/* Can't enqueue two private transaction on
+			 * the same object */
+			if ((trans->tas_opc & transop) == LST_TRANS_PRIVATE)
+				return -EPERM;
+		}
+	}
+
+	/* create a trans group */
+	LIBCFS_ALLOC(trans, sizeof(*trans));
+	if (trans == NULL)
+		return -ENOMEM;
+
+	trans->tas_opc = transop;
+
+	/* an unattached transaction keeps an empty, self-linked tas_olink */
+	if (translist == NULL)
+		INIT_LIST_HEAD(&trans->tas_olink);
+	else
+		list_add_tail(&trans->tas_olink, translist);
+
+	list_add_tail(&trans->tas_link, &console_session.ses_trans_list);
+
+	INIT_LIST_HEAD(&trans->tas_rpcs_list);
+	atomic_set(&trans->tas_remaining, 0);
+	init_waitqueue_head(&trans->tas_waitq);
+
+	/* snapshot the session features under the RPC lock */
+	spin_lock(&console_session.ses_rpc_lock);
+	trans->tas_features = console_session.ses_features;
+	spin_unlock(&console_session.ses_rpc_lock);
+
+	*transpp = trans;
+	return 0;
+}
+
+/*
+ * Attach RPC @crpc to transaction @trans: record the owner in the RPC and
+ * append it to the tail of the transaction's RPC list.
+ */
+void
+lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *crpc)
+{
+	crpc->crp_trans = trans;
+	list_add_tail(&crpc->crp_link, &trans->tas_rpcs_list);
+}
+
+/*
+ * Abort every RPC of @trans that is still in flight.
+ *
+ * @error is the negative errno that caused the abort (the caller in this
+ * file passes the -EINTR / -ETIMEDOUT / -ESHUTDOWN result of its wait).
+ * RPCs that were never posted are stamped with -EINTR; RPCs that already
+ * carry a stamp are left alone. Every other RPC is stamped with @error and
+ * passed to sfw_abort_rpc(). When the abort is due to a timeout, the
+ * target node is additionally marked LST_NODE_DOWN unless the node already
+ * carries a later stamp.
+ */
+void
+lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error)
+{
+	srpc_client_rpc_t *rpc;
+	lstcon_rpc_t      *crpc;
+	lstcon_node_t     *nd;
+
+	list_for_each_entry (crpc, &trans->tas_rpcs_list, crp_link) {
+		rpc = crpc->crp_rpc;
+
+		spin_lock(&rpc->crpc_lock);
+
+		if (!crpc->crp_posted || /* not posted */
+		    crpc->crp_stamp != 0) { /* rpc done or aborted already */
+			if (crpc->crp_stamp == 0) {
+				crpc->crp_stamp = cfs_time_current();
+				crpc->crp_status = -EINTR;
+			}
+			spin_unlock(&rpc->crpc_lock);
+			continue;
+		}
+
+		crpc->crp_stamp  = cfs_time_current();
+		crpc->crp_status = error;
+
+		spin_unlock(&rpc->crpc_lock);
+
+		sfw_abort_rpc(rpc);
+
+		/* @error is a negative errno, so a timeout is -ETIMEDOUT;
+		 * comparing against the positive value never matched and
+		 * left the node-down marking below unreachable */
+		if (error != -ETIMEDOUT)
+			continue;
+
+		nd = crpc->crp_node;
+		if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp))
+			continue;
+
+		nd->nd_stamp = crpc->crp_stamp;
+		nd->nd_state = LST_NODE_DOWN;
+	}
+}
+
+/*
+ * Wait condition for a transaction: returns 1 when the waiter should stop
+ * waiting, 0 otherwise.
+ *
+ * Waiting ends when no RPC of the transaction is outstanding any more, or
+ * when the session is shutting down and the transaction is linked on an
+ * object list (tas_olink not empty).
+ */
+static int
+lstcon_rpc_trans_check(lstcon_rpc_trans_t *trans)
+{
+	/* Not an end session RPC */
+	if (console_session.ses_shutdown && !list_empty(&trans->tas_olink))
+		return 1;
+
+	return atomic_read(&trans->tas_remaining) == 0;
+}
+
+/*
+ * Post every RPC of @trans and wait, for at most @timeout seconds (raised
+ * to LST_TRANS_MIN_TIMEOUT if smaller), until lstcon_rpc_trans_check()
+ * reports that the wait is over.
+ *
+ * The console session mutex is released for the duration of the wait and
+ * taken again afterwards. If the wait failed, the session is shutting
+ * down, or RPCs are still outstanding, the transaction is aborted. The
+ * session's transaction statistics are refreshed before returning.
+ *
+ * Returns 0 when the transaction has no RPCs or the wait completed,
+ * -EINTR when the wait was interrupted (or a short timeout expired),
+ * -ETIMEDOUT on timeout, or -ESHUTDOWN when the session is shutting down.
+ */
+int
+lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout)
+{
+	lstcon_rpc_t  *crpc;
+	int	    rc;
+
+	if (list_empty(&trans->tas_rpcs_list))
+		return 0;
+
+	if (timeout < LST_TRANS_MIN_TIMEOUT)
+		timeout = LST_TRANS_MIN_TIMEOUT;
+
+	CDEBUG(D_NET, "Transaction %s started\n",
+	       lstcon_rpc_trans_name(trans->tas_opc));
+
+	/* post all requests */
+	list_for_each_entry (crpc, &trans->tas_rpcs_list, crp_link) {
+		LASSERT (!crpc->crp_posted);
+
+		lstcon_rpc_post(crpc);
+	}
+
+	/* do not hold the session mutex while sleeping */
+	mutex_unlock(&console_session.ses_mutex);
+
+	rc = wait_event_interruptible_timeout(trans->tas_waitq,
+					      lstcon_rpc_trans_check(trans),
+					      cfs_time_seconds(timeout));
+	/* map the wait result: positive -> 0, negative -> -EINTR,
+	 * zero -> -ETIMEDOUT */
+	rc = (rc > 0) ? 0 : ((rc < 0) ? -EINTR : -ETIMEDOUT);
+
+	mutex_lock(&console_session.ses_mutex);
+
+	if (console_session.ses_shutdown)
+		rc = -ESHUTDOWN;
+
+	if (rc != 0 || atomic_read(&trans->tas_remaining) != 0) {
+		/* treat short timeout as canceled */
+		if (rc == -ETIMEDOUT && timeout < LST_TRANS_MIN_TIMEOUT * 2)
+			rc = -EINTR;
+
+		lstcon_rpc_trans_abort(trans, rc);
+	}
+
+	CDEBUG(D_NET, "Transaction %s stopped: %d\n",
+	       lstcon_rpc_trans_name(trans->tas_opc), rc);
+
+	lstcon_rpc_trans_stat(trans, lstcon_trans_stat());
+
+	return rc;
+}
+
+/*
+ * Fetch the reply of a stamped (completed or aborted) RPC.
+ *
+ * If the RPC carries an error status, *msgpp is set to NULL and that
+ * status is returned. Otherwise *msgpp points at the reply message (which
+ * is unpacked on first access) and 0 is returned. Unless the node already
+ * carries a later stamp, the node's stamp is updated and its state is
+ * derived from the session id found in the reply.
+ */
+int
+lstcon_rpc_get_reply(lstcon_rpc_t *crpc, srpc_msg_t **msgpp)
+{
+	lstcon_node_t	     *nd  = crpc->crp_node;
+	srpc_client_rpc_t    *rpc = crpc->crp_rpc;
+	srpc_generic_reply_t *rep;
+
+	LASSERT (nd != NULL && rpc != NULL);
+	LASSERT (crpc->crp_stamp != 0);
+
+	if (crpc->crp_status != 0) {
+		*msgpp = NULL;
+		return crpc->crp_status;
+	}
+
+	*msgpp = &rpc->crpc_replymsg;
+
+	/* unpack only once per RPC */
+	if (!crpc->crp_unpacked) {
+		sfw_unpack_message(*msgpp);
+		crpc->crp_unpacked = 1;
+	}
+
+	/* the node state is refreshed only by the most recent information */
+	if (!cfs_time_after(nd->nd_stamp, crpc->crp_stamp)) {
+		nd->nd_stamp = crpc->crp_stamp;
+		rep = &(*msgpp)->msg_body.reply;
+
+		if (rep->sid.ses_nid == LNET_NID_ANY) {
+			nd->nd_state = LST_NODE_UNKNOWN;
+		} else if (lstcon_session_match(rep->sid)) {
+			nd->nd_state = LST_NODE_ACTIVE;
+		} else {
+			nd->nd_state = LST_NODE_BUSY;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Rebuild the statistics of transaction @trans in @stat.
+ *
+ * @stat is cleared, then every RPC of the transaction is counted as a
+ * success or a failure depending on lstcon_rpc_get_reply(); the first RPC
+ * error seen is kept (as a positive value) in trs_rpc_errno, and
+ * successful replies are passed on to lstcon_rpc_stat_reply(). For a
+ * LST_TRANS_SESNEW transaction without a framework error, the framework
+ * errno is taken from lstcon_session_feats_check().
+ */
+void
+lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans, lstcon_trans_stat_t *stat)
+{
+	lstcon_rpc_t	*crpc;
+	srpc_msg_t	*rep;
+	int		 error;
+
+	LASSERT (stat != NULL);
+
+	memset(stat, 0, sizeof(*stat));
+
+	list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) {
+		lstcon_rpc_stat_total(stat, 1);
+
+		LASSERT (crpc->crp_stamp != 0);
+
+		error = lstcon_rpc_get_reply(crpc, &rep);
+		if (error == 0) {
+			lstcon_rpc_stat_success(stat, 1);
+
+			lstcon_rpc_stat_reply(trans, rep, crpc->crp_node, stat);
+		} else {
+			lstcon_rpc_stat_failure(stat, 1);
+
+			/* remember only the first RPC error */
+			if (stat->trs_rpc_errno == 0)
+				stat->trs_rpc_errno = -error;
+		}
+	}
+
+	if (trans->tas_opc == LST_TRANS_SESNEW && stat->trs_fwk_errno == 0) {
+		stat->trs_fwk_errno =
+		      lstcon_session_feats_check(trans->tas_features);
+	}
+
+	CDEBUG(D_NET, "transaction %s : success %d, failure %d, total %d, "
+		      "RPC error(%d), Framework error(%d)\n",
+	       lstcon_rpc_trans_name(trans->tas_opc),
+	       lstcon_rpc_stat_success(stat, 0),
+	       lstcon_rpc_stat_failure(stat, 0),
+	       lstcon_rpc_stat_total(stat, 0),
+	       stat->trs_rpc_errno, stat->trs_fwk_errno);
+}
+
+/*
+ * Copy the per-RPC results of transaction @trans out to a caller-supplied
+ * linked list of lstcon_rpc_ent_t entries.
+ *
+ * @head_up is the head of that list. The list links are read with
+ * copy_from_user() and the entries are written with copy_to_user(), so
+ * both live in user space. One entry is consumed per RPC, in transaction
+ * order; the walk stops early, returning 0, when the list wraps back to
+ * @head_up.
+ *
+ * Each entry receives the peer id, the RPC stamp relative to the session
+ * stamp (as a timeval), the node state and the RPC errno. For RPCs that
+ * succeeded it also receives the session id and framework status from the
+ * reply, and @readent - if not NULL - is called to copy out
+ * opcode-specific data.
+ *
+ * Returns 0 on success, -EFAULT when a user-space copy fails, or the
+ * non-zero value returned by @readent.
+ */
+int
+lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans,
+			     struct list_head *head_up,
+			     lstcon_rpc_readent_func_t readent)
+{
+	struct list_head	    tmp;
+	struct list_head	   *next;
+	lstcon_rpc_ent_t     *ent;
+	srpc_generic_reply_t *rep;
+	lstcon_rpc_t	 *crpc;
+	srpc_msg_t	   *msg;
+	lstcon_node_t	*nd;
+	cfs_duration_t	dur;
+	struct timeval	tv;
+	int		   error;
+
+	LASSERT (head_up != NULL);
+
+	next = head_up;
+
+	list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) {
+		/* fetch the current list node to learn where the next
+		 * entry is */
+		if (copy_from_user(&tmp, next,
+				       sizeof(struct list_head)))
+			return -EFAULT;
+
+		/* back at the head: the caller's list is exhausted */
+		if (tmp.next == head_up)
+			return 0;
+
+		next = tmp.next;
+
+		/* address arithmetic only; "ent" is never dereferenced
+		 * directly, only passed to copy_to_user()/readent */
+		ent = list_entry(next, lstcon_rpc_ent_t, rpe_link);
+
+		LASSERT (crpc->crp_stamp != 0);
+
+		error = lstcon_rpc_get_reply(crpc, &msg);
+
+		nd = crpc->crp_node;
+
+		/* RPC stamp relative to the session stamp */
+		dur = (cfs_duration_t)cfs_time_sub(crpc->crp_stamp,
+		      (cfs_time_t)console_session.ses_id.ses_stamp);
+		cfs_duration_usec(dur, &tv);
+
+		if (copy_to_user(&ent->rpe_peer,
+				     &nd->nd_id, sizeof(lnet_process_id_t)) ||
+		    copy_to_user(&ent->rpe_stamp, &tv, sizeof(tv)) ||
+		    copy_to_user(&ent->rpe_state,
+				     &nd->nd_state, sizeof(nd->nd_state)) ||
+		    copy_to_user(&ent->rpe_rpc_errno, &error,
+				     sizeof(error)))
+			return -EFAULT;
+
+		/* a failed RPC has no reply to report */
+		if (error != 0)
+			continue;
+
+		/* RPC is done */
+		rep = (srpc_generic_reply_t *)&msg->msg_body.reply;
+
+		if (copy_to_user(&ent->rpe_sid,
+				     &rep->sid, sizeof(lst_sid_t)) ||
+		    copy_to_user(&ent->rpe_fwk_errno,
+				     &rep->status, sizeof(rep->status)))
+			return -EFAULT;
+
+		if (readent == NULL)
+			continue;
+
+		if ((error = readent(trans->tas_opc, msg, ent)) != 0)
+			return error;
+	}
+
+	return 0;
+}
+
+/*
+ * Tear down transaction @trans.
+ *
+ * RPCs that were never posted or have already finished are unlinked and
+ * released right away. RPCs still in flight are detached from the
+ * transaction and from their node (crp_trans / crp_node set to NULL) and
+ * left for lstcon_rpc_done() to release. The transaction is then removed
+ * from the session list and, if linked, from its object list, and freed.
+ */
+void
+lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans)
+{
+	srpc_client_rpc_t *rpc;
+	lstcon_rpc_t      *crpc;
+	lstcon_rpc_t      *tmp;
+	int		count = 0;
+
+	list_for_each_entry_safe(crpc, tmp, &trans->tas_rpcs_list,
+				 crp_link) {
+		rpc = crpc->crp_rpc;
+
+		spin_lock(&rpc->crpc_lock);
+
+		/* free it if not posted or finished already */
+		if (!crpc->crp_posted || crpc->crp_finished) {
+			spin_unlock(&rpc->crpc_lock);
+
+			list_del_init(&crpc->crp_link);
+			lstcon_rpc_put(crpc);
+
+			continue;
+		}
+
+		/* The callback of an RPC may still be pending (even after
+		 * LNetMDUnlink is called) because of the huge timeout for
+		 * an inaccessible network. Don't make the user wait for
+		 * such RPCs: abandon them here and let the callback
+		 * recycle them. */
+
+		LASSERT (crpc->crp_status != 0);
+
+		crpc->crp_node  = NULL;
+		crpc->crp_trans = NULL;
+		list_del_init(&crpc->crp_link);
+		count ++;
+
+		spin_unlock(&rpc->crpc_lock);
+
+		/* the abandoned RPC no longer counts as outstanding */
+		atomic_dec(&trans->tas_remaining);
+	}
+
+	LASSERT (atomic_read(&trans->tas_remaining) == 0);
+
+	list_del(&trans->tas_link);
+	if (!list_empty(&trans->tas_olink))
+		list_del(&trans->tas_olink);
+
+	CDEBUG(D_NET, "Transaction %s destroyed with %d pending RPCs\n",
+	       lstcon_rpc_trans_name(trans->tas_opc), count);
+
+	LIBCFS_FREE(trans, sizeof(*trans));
+
+	return;
+}
+
+/*
+ * Prepare a session RPC for node @nd.
+ *
+ * LST_TRANS_SESNEW builds a make-session request carrying the console
+ * session id, force flag and name; LST_TRANS_SESEND builds a
+ * remove-session request carrying the session id. Any other @transop is a
+ * programming error (LBUG).
+ *
+ * Returns 0 on success or the error from lstcon_rpc_prep().
+ */
+int
+lstcon_sesrpc_prep(lstcon_node_t *nd, int transop,
+		   unsigned feats, lstcon_rpc_t **crpc)
+{
+	srpc_mksn_reqst_t *msrq;
+	srpc_rmsn_reqst_t *rsrq;
+	int		rc;
+
+	switch (transop) {
+	case LST_TRANS_SESNEW:
+		rc = lstcon_rpc_prep(nd, SRPC_SERVICE_MAKE_SESSION,
+				     feats, 0, 0, crpc);
+		if (rc != 0)
+			return rc;
+
+		msrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.mksn_reqst;
+		msrq->mksn_sid     = console_session.ses_id;
+		msrq->mksn_force   = console_session.ses_force;
+		/* Bound the copy by the destination, not by the source
+		 * length, and always terminate it; strncpy() also zero-pads
+		 * the remainder of the field.
+		 * NOTE(review): relies on mksn_name being a fixed-size char
+		 * array inside srpc_mksn_reqst_t - confirm. */
+		strncpy(msrq->mksn_name, console_session.ses_name,
+			sizeof(msrq->mksn_name) - 1);
+		msrq->mksn_name[sizeof(msrq->mksn_name) - 1] = '\0';
+		break;
+
+	case LST_TRANS_SESEND:
+		rc = lstcon_rpc_prep(nd, SRPC_SERVICE_REMOVE_SESSION,
+				     feats, 0, 0, crpc);
+		if (rc != 0)
+			return rc;
+
+		rsrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.rmsn_reqst;
+		rsrq->rmsn_sid = console_session.ses_id;
+		break;
+
+	default:
+		LBUG();
+	}
+
+	return 0;
+}
+
+/*
+ * Prepare a debug RPC for node @nd: the request carries the console
+ * session id and a zero flags word.
+ *
+ * Returns 0 on success or the error from lstcon_rpc_prep().
+ */
+int
+lstcon_dbgrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc)
+{
+	srpc_debug_reqst_t *req;
+	int		    rc;
+
+	rc = lstcon_rpc_prep(nd, SRPC_SERVICE_DEBUG, feats, 0, 0, crpc);
+	if (rc != 0)
+		return rc;
+
+	req = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst;
+	req->dbg_flags = 0;
+	req->dbg_sid   = console_session.ses_id;
+
+	return 0;
+}
+
+/*
+ * Prepare a batch RPC for node @nd.
+ *
+ * The request carries the console session id plus the id and index taken
+ * from @tsb. The batch opcode is RUN for LST_TRANS_TSBRUN, STOP for
+ * LST_TRANS_TSBSTOP and QUERY for everything else. For RUN and STOP, @tsb
+ * is treated as the header of a lstcon_batch_t and the batch argument is
+ * copied into the request as well.
+ *
+ * Returns 0 on success or the error from lstcon_rpc_prep().
+ */
+int
+lstcon_batrpc_prep(lstcon_node_t *nd, int transop, unsigned feats,
+		   lstcon_tsb_hdr_t *tsb, lstcon_rpc_t **crpc)
+{
+	srpc_batch_reqst_t *req;
+	int		    rc;
+
+	rc = lstcon_rpc_prep(nd, SRPC_SERVICE_BATCH, feats, 0, 0, crpc);
+	if (rc != 0)
+		return rc;
+
+	req = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.bat_reqst;
+
+	req->bar_sid     = console_session.ses_id;
+	req->bar_bid     = tsb->tsb_id;
+	req->bar_testidx = tsb->tsb_index;
+
+	if (transop == LST_TRANS_TSBRUN)
+		req->bar_opc = SRPC_BATCH_OPC_RUN;
+	else if (transop == LST_TRANS_TSBSTOP)
+		req->bar_opc = SRPC_BATCH_OPC_STOP;
+	else
+		req->bar_opc = SRPC_BATCH_OPC_QUERY;
+
+	/* only run/stop requests carry the batch argument */
+	if (transop == LST_TRANS_TSBRUN || transop == LST_TRANS_TSBSTOP) {
+		LASSERT (tsb->tsb_index == 0);
+
+		req->bar_arg = ((lstcon_batch_t *)tsb)->bat_arg;
+	}
+
+	return 0;
+}
+
+/*
+ * Prepare a statistics-query RPC for node @nd: the request carries the
+ * console session id and a zero type.
+ *
+ * Returns 0 on success or the error from lstcon_rpc_prep().
+ */
+int
+lstcon_statrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc)
+{
+	srpc_stat_reqst_t *req;
+	int		   rc;
+
+	rc = lstcon_rpc_prep(nd, SRPC_SERVICE_QUERY_STAT, feats, 0, 0, crpc);
+	if (rc != 0)
+		return rc;
+
+	req = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.stat_reqst;
+	req->str_type = 0; /* XXX remove it */
+	req->str_sid  = console_session.ses_id;
+
+	return 0;
+}
+
+/*
+ * Return the address of packed process id number @idx inside the pages of
+ * @kiov, where every page holds SFW_ID_PER_PAGE ids.
+ *
+ * Asserts that the page the index falls in is below @nkiov.
+ */
+lnet_process_id_packed_t *
+lstcon_next_id(int idx, int nkiov, lnet_kiov_t *kiov)
+{
+	lnet_process_id_packed_t *ids;
+	int			  i = idx / SFW_ID_PER_PAGE;
+
+	LASSERT (i < nkiov);
+
+	/* first id stored in the selected page */
+	ids = (lnet_process_id_packed_t *)page_address(kiov[i].kiov_page);
+
+	return ids + (idx % SFW_ID_PER_PAGE);
+}
+
+/*
+ * Fill the pages of @kiov with the packed ids of a window of @span nodes
+ * taken from group @grp.
+ *
+ * The window depends on @idx: it starts at position
+ * ((idx / dist) * span) % grp_nnode in the group's node list and ends
+ * @span - 1 positions later, wrapping around to the head of the list when
+ * it runs past the end. The ids are written to consecutive slots starting
+ * at slot 0.
+ *
+ * Returns 0 on success or -EINVAL when @span exceeds the number of nodes
+ * in the group.
+ */
+int
+lstcon_dstnodes_prep(lstcon_group_t *grp, int idx,
+		     int dist, int span, int nkiov, lnet_kiov_t *kiov)
+{
+	lnet_process_id_packed_t *pid;
+	lstcon_ndlink_t	  *ndl;
+	lstcon_node_t	    *nd;
+	int		       start;
+	int		       end;
+	int		       i = 0;
+
+	LASSERT (dist >= 1);
+	LASSERT (span >= 1);
+	LASSERT (grp->grp_nnode >= 1);
+
+	if (span > grp->grp_nnode)
+		return -EINVAL;
+
+	/* first and last list position of the window; end < start means
+	 * the window wraps around the end of the list */
+	start = ((idx / dist) * span) % grp->grp_nnode;
+	end   = ((idx / dist) * span + span - 1) % grp->grp_nnode;
+
+	/* first pass: skip to "start", then copy up to "end" (or, for a
+	 * wrapped window, for the rest of the list) */
+	list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) {
+		nd = ndl->ndl_node;
+		if (i < start) {
+			i ++;
+			continue;
+		}
+
+		if (i > (end >= start ? end: grp->grp_nnode))
+			break;
+
+		/* slot number is the position relative to "start" */
+		pid = lstcon_next_id((i - start), nkiov, kiov);
+		pid->nid = nd->nd_id.nid;
+		pid->pid = nd->nd_id.pid;
+		i++;
+	}
+
+	if (start <= end) /* done */
+		return 0;
+
+	/* second pass, wrapped window only: continue from the head of the
+	 * list; "i" keeps counting, so slots continue where the first pass
+	 * stopped */
+	list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) {
+		if (i > grp->grp_nnode + end)
+			break;
+
+		nd = ndl->ndl_node;
+		pid = lstcon_next_id((i - start), nkiov, kiov);
+		pid->nid = nd->nd_id.nid;
+		pid->pid = nd->nd_id.pid;
+		i++;
+	}
+
+	return 0;
+}
+
+/*
+ * Fill the ping section of test request @req from the user-supplied ping
+ * parameters: size and flags are copied verbatim. Always returns 0.
+ */
+int
+lstcon_pingrpc_prep(lst_test_ping_param_t *param, srpc_test_reqst_t *req)
+{
+	test_ping_req_t *ping = &req->tsr_u.ping;
+
+	ping->png_flags = param->png_flags;
+	ping->png_size  = param->png_size;
+	/* TODO dest */
+
+	return 0;
+}
+
+/*
+ * Fill the v0 bulk part of the test request @req from @param.
+ *
+ * The v0 layout carries the transfer size as a page count, so blk_size is
+ * rounded up to whole PAGE_CACHE_SIZE pages.  Always returns 0.
+ */
+int
+lstcon_bulkrpc_v0_prep(lst_test_bulk_param_t *param, srpc_test_reqst_t *req)
+{
+	test_bulk_req_t *bulk = &req->tsr_u.bulk_v0;
+
+	bulk->blk_flags = param->blk_flags;
+	bulk->blk_opc   = param->blk_opc;
+	bulk->blk_npg   = (param->blk_size + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE;
+
+	return 0;
+}
+
+/*
+ * Fill the v1 bulk part of the test request @req from @param.
+ *
+ * Unlike the v0 layout, v1 carries the transfer size in bytes (blk_len).
+ * Always returns 0.
+ */
+int
+lstcon_bulkrpc_v1_prep(lst_test_bulk_param_t *param, srpc_test_reqst_t *req)
+{
+	test_bulk_req_v1_t *bulk = &req->tsr_u.bulk_v1;
+
+	bulk->blk_offset = 0; /* reserved */
+	bulk->blk_len    = param->blk_size;
+	bulk->blk_flags  = param->blk_flags;
+	bulk->blk_opc    = param->blk_opc;
+
+	return 0;
+}
+
+/*
+ * Build the SRPC_SERVICE_TEST request that adds @test to node @nd.
+ *
+ * For LST_TRANS_TSBCLIADD the request gets a bulk buffer holding the packed
+ * ids of the destinations of this client (see lstcon_dstnodes_prep()); each
+ * call consumes the next client index (test->tes_cliidx).  For
+ * LST_TRANS_TSBSRVADD no bulk pages are requested and only the loop count
+ * is derived from the sizes of the source and destination groups.  The
+ * test-type specific part of the request is filled by the ping/bulk
+ * helpers, selected by test->tes_type and @feats.
+ *
+ * Returns 0 on success or an error code.  If a failure happens after the
+ * RPC was prepared, the RPC is dropped with lstcon_rpc_put().
+ */
+int
+lstcon_testrpc_prep(lstcon_node_t *nd, int transop, unsigned feats,
+		    lstcon_test_t *test, lstcon_rpc_t **crpc)
+{
+	lstcon_group_t    *sgrp = test->tes_src_grp;
+	lstcon_group_t    *dgrp = test->tes_dst_grp;
+	srpc_test_reqst_t *trq;
+	srpc_bulk_t       *bulk;
+	int		i;
+	int		   npg = 0;
+	int		   nob = 0;
+	int		   rc  = 0;
+
+	if (transop == LST_TRANS_TSBCLIADD) {
+		/* pages needed for tes_span ids; without LST_FEAT_BULK_LEN
+		 * the bulk length is whole pages, otherwise exactly the
+		 * size of the packed id array */
+		npg = sfw_id_pages(test->tes_span);
+		nob = (feats & LST_FEAT_BULK_LEN) == 0 ?
+		      npg * PAGE_CACHE_SIZE :
+		      sizeof(lnet_process_id_packed_t) * test->tes_span;
+	}
+
+	rc = lstcon_rpc_prep(nd, SRPC_SERVICE_TEST, feats, npg, nob, crpc);
+	if (rc != 0)
+		return rc;
+
+	trq  = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.tes_reqst;
+
+	if (transop == LST_TRANS_TSBSRVADD) {
+		/* all three values are rounded-up divisions */
+		int ndist = (sgrp->grp_nnode + test->tes_dist - 1) / test->tes_dist;
+		int nspan = (dgrp->grp_nnode + test->tes_span - 1) / test->tes_span;
+		int nmax = (ndist + nspan - 1) / nspan;
+
+		trq->tsr_ndest = 0;
+		trq->tsr_loop  = nmax * test->tes_dist * test->tes_concur;
+
+	} else {
+		bulk = &(*crpc)->crp_rpc->crpc_bulk;
+
+		/* allocate the pages that will carry the destination ids */
+		for (i = 0; i < npg; i++) {
+			int	len;
+
+			LASSERT(nob > 0);
+
+			/* with LST_FEAT_BULK_LEN the last page may be
+			 * only partially used */
+			len = (feats & LST_FEAT_BULK_LEN) == 0 ?
+			      PAGE_CACHE_SIZE : min_t(int, nob, PAGE_CACHE_SIZE);
+			nob -= len;
+
+			bulk->bk_iovs[i].kiov_offset = 0;
+			bulk->bk_iovs[i].kiov_len    = len;
+			bulk->bk_iovs[i].kiov_page   =
+				alloc_page(GFP_IOFS);
+
+			if (bulk->bk_iovs[i].kiov_page == NULL) {
+				/* NOTE(review): pages allocated so far are
+				 * presumably released by lstcon_rpc_put() --
+				 * confirm */
+				lstcon_rpc_put(*crpc);
+				return -ENOMEM;
+			}
+		}
+
+		bulk->bk_sink = 0;
+
+		/* only client-add requests may reach this branch */
+		LASSERT (transop == LST_TRANS_TSBCLIADD);
+
+		rc = lstcon_dstnodes_prep(test->tes_dst_grp,
+					  test->tes_cliidx++,
+					  test->tes_dist,
+					  test->tes_span,
+					  npg, &bulk->bk_iovs[0]);
+		if (rc != 0) {
+			lstcon_rpc_put(*crpc);
+			return rc;
+		}
+
+		trq->tsr_ndest = test->tes_span;
+		trq->tsr_loop  = test->tes_loop;
+	}
+
+	/* fields common to client and server requests */
+	trq->tsr_sid	= console_session.ses_id;
+	trq->tsr_bid	= test->tes_hdr.tsb_id;
+	trq->tsr_concur     = test->tes_concur;
+	trq->tsr_is_client  = (transop == LST_TRANS_TSBCLIADD) ? 1 : 0;
+	trq->tsr_stop_onerr = !!test->tes_stop_onerr;
+
+	switch (test->tes_type) {
+	case LST_TEST_PING:
+		trq->tsr_service = SRPC_SERVICE_PING;
+		rc = lstcon_pingrpc_prep((lst_test_ping_param_t *)
+					 &test->tes_param[0], trq);
+		break;
+
+	case LST_TEST_BULK:
+		trq->tsr_service = SRPC_SERVICE_BRW;
+		/* the bulk request layout depends on LST_FEAT_BULK_LEN */
+		if ((feats & LST_FEAT_BULK_LEN) == 0) {
+			rc = lstcon_bulkrpc_v0_prep((lst_test_bulk_param_t *)
+						    &test->tes_param[0], trq);
+		} else {
+			rc = lstcon_bulkrpc_v1_prep((lst_test_bulk_param_t *)
+						    &test->tes_param[0], trq);
+		}
+
+		break;
+	default:
+		LBUG();
+		break;
+	}
+
+	return rc;
+}
+
+/*
+ * Check the reply of node @nd to a "new session" request.
+ *
+ * A reply is turned into a protocol error (EPROTO, also written back into
+ * the reply) when it advertises feature bits outside LST_FEATS_MASK or when
+ * its feature mask differs from the one recorded on @trans.  The first
+ * successful reply seen by the transaction defines that recorded mask.
+ * On success the session timeout reported by the node is stored in @nd.
+ *
+ * Returns 0 on success, otherwise the (positive) status of the reply.
+ */
+int
+lstcon_sesnew_stat_reply(lstcon_rpc_trans_t *trans,
+			 lstcon_node_t *nd, srpc_msg_t *reply)
+{
+	srpc_mksn_reply_t *mksn_rep = &reply->msg_body.mksn_reply;
+	int		   status   = mksn_rep->mksn_status;
+
+	/* unknown feature bits override an otherwise successful reply */
+	if (status == 0 && (reply->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+		status = EPROTO;
+		mksn_rep->mksn_status = EPROTO;
+	}
+
+	if (status == EPROTO) {
+		CNETERR("session protocol error from %s: %u\n",
+			libcfs_nid2str(nd->nd_id.nid),
+			reply->msg_ses_feats);
+	}
+
+	if (status != 0)
+		return status;
+
+	/* first good reply: remember its features for the transaction */
+	if (!trans->tas_feats_updated) {
+		trans->tas_features = reply->msg_ses_feats;
+		trans->tas_feats_updated = 1;
+	}
+
+	if (reply->msg_ses_feats != trans->tas_features) {
+		CNETERR("Framework features %x from %s is different with "
+			"features on this transaction: %x\n",
+			 reply->msg_ses_feats, libcfs_nid2str(nd->nd_id.nid),
+			 trans->tas_features);
+		mksn_rep->mksn_status = EPROTO;
+		return EPROTO;
+	}
+
+	/* session timeout on remote node */
+	nd->nd_timeout = mksn_rep->mksn_timeout;
+
+	return 0;
+}
+
+/*
+ * Account the reply @msg of node @nd in the transaction statistics @stat.
+ *
+ * The reply body is interpreted according to the opcode of @trans.  Replies
+ * counted as success (or as a pure query result) return straight away;
+ * failed replies bump the matching failure counter and fall through to the
+ * end of the function, where the first non-zero status seen is latched in
+ * stat->trs_fwk_errno.
+ */
+void
+lstcon_rpc_stat_reply(lstcon_rpc_trans_t *trans, srpc_msg_t *msg,
+		      lstcon_node_t *nd, lstcon_trans_stat_t *stat)
+{
+	srpc_rmsn_reply_t  *rmsn_rep;
+	srpc_debug_reply_t *dbg_rep;
+	srpc_batch_reply_t *bat_rep;
+	srpc_test_reply_t  *test_rep;
+	srpc_stat_reply_t  *stat_rep;
+	int		 rc = 0;
+
+	switch (trans->tas_opc) {
+	case LST_TRANS_SESNEW:
+		/* also validates features and records the node timeout */
+		rc = lstcon_sesnew_stat_reply(trans, nd, msg);
+		if (rc == 0) {
+			lstcon_sesop_stat_success(stat, 1);
+			return;
+		}
+
+		lstcon_sesop_stat_failure(stat, 1);
+		break;
+
+	case LST_TRANS_SESEND:
+		rmsn_rep = &msg->msg_body.rmsn_reply;
+		/* ESRCH is not an error for end session */
+		if (rmsn_rep->rmsn_status == 0 ||
+		    rmsn_rep->rmsn_status == ESRCH) {
+			lstcon_sesop_stat_success(stat, 1);
+			return;
+		}
+
+		lstcon_sesop_stat_failure(stat, 1);
+		rc = rmsn_rep->rmsn_status;
+		break;
+
+	case LST_TRANS_SESQRY:
+	case LST_TRANS_SESPING:
+		dbg_rep = &msg->msg_body.dbg_reply;
+
+		/* queries never record an error: the node is classified as
+		 * unknown, active (its session matches ours) or busy */
+		if (dbg_rep->dbg_status == ESRCH) {
+			lstcon_sesqry_stat_unknown(stat, 1);
+			return;
+		}
+
+		if (lstcon_session_match(dbg_rep->dbg_sid))
+			lstcon_sesqry_stat_active(stat, 1);
+		else
+			lstcon_sesqry_stat_busy(stat, 1);
+		return;
+
+	case LST_TRANS_TSBRUN:
+	case LST_TRANS_TSBSTOP:
+		bat_rep = &msg->msg_body.bat_reply;
+
+		if (bat_rep->bar_status == 0) {
+			lstcon_tsbop_stat_success(stat, 1);
+			return;
+		}
+
+		/* EPERM in reply to a stop request is counted as success */
+		if (bat_rep->bar_status == EPERM &&
+		    trans->tas_opc == LST_TRANS_TSBSTOP) {
+			lstcon_tsbop_stat_success(stat, 1);
+			return;
+		}
+
+		lstcon_tsbop_stat_failure(stat, 1);
+		rc = bat_rep->bar_status;
+		break;
+
+	case LST_TRANS_TSBCLIQRY:
+	case LST_TRANS_TSBSRVQRY:
+		bat_rep = &msg->msg_body.bat_reply;
+
+		/* run/idle is counted first, whatever the reply status is */
+		if (bat_rep->bar_active != 0)
+			lstcon_tsbqry_stat_run(stat, 1);
+		else
+			lstcon_tsbqry_stat_idle(stat, 1);
+
+		if (bat_rep->bar_status == 0)
+			return;
+
+		lstcon_tsbqry_stat_failure(stat, 1);
+		rc = bat_rep->bar_status;
+		break;
+
+	case LST_TRANS_TSBCLIADD:
+	case LST_TRANS_TSBSRVADD:
+		test_rep = &msg->msg_body.tes_reply;
+
+		if (test_rep->tsr_status == 0) {
+			lstcon_tsbop_stat_success(stat, 1);
+			return;
+		}
+
+		lstcon_tsbop_stat_failure(stat, 1);
+		rc = test_rep->tsr_status;
+		break;
+
+	case LST_TRANS_STATQRY:
+		stat_rep = &msg->msg_body.stat_reply;
+
+		if (stat_rep->str_status == 0) {
+			lstcon_statqry_stat_success(stat, 1);
+			return;
+		}
+
+		lstcon_statqry_stat_failure(stat, 1);
+		rc = stat_rep->str_status;
+		break;
+
+	default:
+		LBUG();
+	}
+
+	/* only the first failure status is kept */
+	if (stat->trs_fwk_errno == 0)
+		stat->trs_fwk_errno = rc;
+
+	return;
+}
+
+/*
+ * Create a transaction of type @transop and queue one RPC on it for every
+ * node of @ndlist that passes @condition.
+ *
+ * @condition may be NULL, in which case every node is accepted.  Otherwise
+ * it is called as condition(transop, node, arg) and its result decides:
+ * 0 skips the node, a negative value aborts the whole operation, any other
+ * value accepts the node.  @arg is also handed to the RPC constructors that
+ * need it (as lstcon_test_t for test-add, as lstcon_tsb_hdr_t for batch
+ * operations).
+ *
+ * Returns 0 and stores the transaction in *transpp on success.  On failure
+ * the transaction is destroyed and the error code is returned.
+ */
+int
+lstcon_rpc_trans_ndlist(struct list_head *ndlist,
+			struct list_head *translist, int transop,
+			void *arg, lstcon_rpc_cond_func_t condition,
+			lstcon_rpc_trans_t **transpp)
+{
+	lstcon_rpc_trans_t *trans;
+	lstcon_ndlink_t    *ndl;
+	lstcon_node_t      *nd;
+	lstcon_rpc_t       *rpc;
+	unsigned	    feats;
+	int		 rc;
+
+	/* Creating session RPC for list of nodes */
+
+	rc = lstcon_rpc_trans_prep(translist, transop, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction %d: %d\n", transop, rc);
+		return rc;
+	}
+
+	/* every RPC of the transaction is built with the same features */
+	feats = trans->tas_features;
+	list_for_each_entry(ndl, ndlist, ndl_link) {
+		rc = condition == NULL ? 1 :
+		     condition(transop, ndl->ndl_node, arg);
+
+		if (rc == 0)
+			continue;
+
+		if (rc < 0) {
+			CDEBUG(D_NET, "Condition error while creating RPC "
+				      " for transaction %d: %d\n", transop, rc);
+			break;
+		}
+
+		nd = ndl->ndl_node;
+
+		/* pick the RPC constructor matching the operation */
+		switch (transop) {
+		case LST_TRANS_SESNEW:
+		case LST_TRANS_SESEND:
+			rc = lstcon_sesrpc_prep(nd, transop, feats, &rpc);
+			break;
+		case LST_TRANS_SESQRY:
+		case LST_TRANS_SESPING:
+			rc = lstcon_dbgrpc_prep(nd, feats, &rpc);
+			break;
+		case LST_TRANS_TSBCLIADD:
+		case LST_TRANS_TSBSRVADD:
+			rc = lstcon_testrpc_prep(nd, transop, feats,
+						 (lstcon_test_t *)arg, &rpc);
+			break;
+		case LST_TRANS_TSBRUN:
+		case LST_TRANS_TSBSTOP:
+		case LST_TRANS_TSBCLIQRY:
+		case LST_TRANS_TSBSRVQRY:
+			rc = lstcon_batrpc_prep(nd, transop, feats,
+						(lstcon_tsb_hdr_t *)arg, &rpc);
+			break;
+		case LST_TRANS_STATQRY:
+			rc = lstcon_statrpc_prep(nd, feats, &rpc);
+			break;
+		default:
+			rc = -EINVAL;
+			break;
+		}
+
+		if (rc != 0) {
+			CERROR("Failed to create RPC for transaction %s: %d\n",
+			       lstcon_rpc_trans_name(transop), rc);
+			break;
+		}
+
+		lstcon_rpc_trans_addreq(trans, rpc);
+	}
+
+	/* rc is 0 here if the list was empty, the last node was skipped or
+	 * the last RPC was created successfully */
+	if (rc == 0) {
+		*transpp = trans;
+		return 0;
+	}
+
+	lstcon_rpc_trans_destroy(trans);
+
+	return rc;
+}
+
+/*
+ * Timer callback of the console pinger; @arg is the ping timer itself.
+ *
+ * Runs under console_session.ses_mutex and does nothing once the session
+ * is shut down or already expired.  Otherwise:
+ *  - if the console has been idle for longer than ses_timeout, the session
+ *    is marked expired, an end-session RPC is posted to every active node
+ *    and the timer is not re-armed;
+ *  - else, for every node, a finished ping RPC is reaped and a new debug
+ *    RPC is posted to active nodes for which at least half of nd_timeout
+ *    has elapsed since nd_stamp; the timer is then re-armed
+ *    LST_PING_INTERVAL seconds ahead.
+ */
+void
+lstcon_rpc_pinger(void *arg)
+{
+	stt_timer_t	*ptimer = (stt_timer_t *)arg;
+	lstcon_rpc_trans_t *trans;
+	lstcon_rpc_t       *crpc;
+	srpc_msg_t	 *rep;
+	srpc_debug_reqst_t *drq;
+	lstcon_ndlink_t    *ndl;
+	lstcon_node_t      *nd;
+	time_t	      intv;
+	int		 count = 0;
+	int		 rc;
+
+	/* RPC pinger is a special case of transaction,
+	 * it's called by timer at 8 seconds interval.
+	 */
+	mutex_lock(&console_session.ses_mutex);
+
+	if (console_session.ses_shutdown || console_session.ses_expired) {
+		mutex_unlock(&console_session.ses_mutex);
+		return;
+	}
+
+	/* NOTE(review): ses_expired is known to be clear here because of the
+	 * early return above, so the first half of this test is redundant */
+	if (!console_session.ses_expired &&
+	    cfs_time_current_sec() - console_session.ses_laststamp >
+	    (time_t)console_session.ses_timeout)
+		console_session.ses_expired = 1;
+
+	trans = console_session.ses_ping;
+
+	LASSERT (trans != NULL);
+
+	list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link) {
+		nd = ndl->ndl_node;
+
+		if (console_session.ses_expired) {
+			/* idle console, end session on all nodes */
+			if (nd->nd_state != LST_NODE_ACTIVE)
+				continue;
+
+			rc = lstcon_sesrpc_prep(nd, LST_TRANS_SESEND,
+						trans->tas_features, &crpc);
+			if (rc != 0) {
+				CERROR("Out of memory\n");
+				break;
+			}
+
+			lstcon_rpc_trans_addreq(trans, crpc);
+			lstcon_rpc_post(crpc);
+
+			continue;
+		}
+
+		/* each node embeds the descriptor of its own ping RPC */
+		crpc = &nd->nd_ping;
+
+		if (crpc->crp_rpc != NULL) {
+			/* a ping was posted earlier: reap it if it is done */
+			LASSERT (crpc->crp_trans == trans);
+			LASSERT (!list_empty(&crpc->crp_link));
+
+			spin_lock(&crpc->crp_rpc->crpc_lock);
+
+			LASSERT(crpc->crp_posted);
+
+			if (!crpc->crp_finished) {
+				/* in flight */
+				spin_unlock(&crpc->crp_rpc->crpc_lock);
+				continue;
+			}
+
+			spin_unlock(&crpc->crp_rpc->crpc_lock);
+
+			lstcon_rpc_get_reply(crpc, &rep);
+
+			list_del_init(&crpc->crp_link);
+
+			lstcon_rpc_put(crpc);
+		}
+
+		if (nd->nd_state != LST_NODE_ACTIVE)
+			continue;
+
+		/* seconds elapsed since nd_stamp; ping only once at least
+		 * half of the node's timeout has passed */
+		intv = cfs_duration_sec(cfs_time_sub(cfs_time_current(),
+						     nd->nd_stamp));
+		if (intv < (time_t)nd->nd_timeout / 2)
+			continue;
+
+		rc = lstcon_rpc_init(nd, SRPC_SERVICE_DEBUG,
+				     trans->tas_features, 0, 0, 1, crpc);
+		if (rc != 0) {
+			CERROR("Out of memory\n");
+			break;
+		}
+
+		drq = &crpc->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst;
+
+		drq->dbg_sid   = console_session.ses_id;
+		drq->dbg_flags = 0;
+
+		lstcon_rpc_trans_addreq(trans, crpc);
+		lstcon_rpc_post(crpc);
+
+		count ++;
+	}
+
+	/* an expired session does not re-arm the ping timer */
+	if (console_session.ses_expired) {
+		mutex_unlock(&console_session.ses_mutex);
+		return;
+	}
+
+	CDEBUG(D_NET, "Ping %d nodes in session\n", count);
+
+	ptimer->stt_expires = (cfs_time_t)(cfs_time_current_sec() + LST_PING_INTERVAL);
+	stt_add_timer(ptimer);
+
+	mutex_unlock(&console_session.ses_mutex);
+}
+
+/*
+ * Start the console pinger.
+ *
+ * Creates the ping transaction (stored in console_session.ses_ping) and
+ * arms the ping timer to fire LST_PING_INTERVAL seconds from now.  The RPC
+ * free list must be empty and no console RPC may be outstanding (asserted).
+ *
+ * Returns 0 on success or the error of lstcon_rpc_trans_prep().
+ */
+int
+lstcon_rpc_pinger_start(void)
+{
+	stt_timer_t    *ptimer = &console_session.ses_ping_timer;
+	int	     rc;
+
+	LASSERT (list_empty(&console_session.ses_rpc_freelist));
+	LASSERT (atomic_read(&console_session.ses_rpc_counter) == 0);
+
+	rc = lstcon_rpc_trans_prep(NULL, LST_TRANS_SESPING,
+				   &console_session.ses_ping);
+	if (rc == 0) {
+		ptimer->stt_expires = (cfs_time_t)(cfs_time_current_sec() + LST_PING_INTERVAL);
+		stt_add_timer(ptimer);
+		return 0;
+	}
+
+	CERROR("Failed to create console pinger\n");
+	return rc;
+}
+
+/*
+ * Stop the console pinger.  The session must already be marked as shutting
+ * down (asserted).
+ *
+ * The ping timer is removed, the ping transaction is aborted with
+ * -ESHUTDOWN, accounted into the global transaction statistics and
+ * destroyed; the statistics are then wiped and the session forgets the
+ * transaction.
+ */
+void
+lstcon_rpc_pinger_stop(void)
+{
+	lstcon_rpc_trans_t *ping_trans;
+
+	LASSERT (console_session.ses_shutdown);
+
+	stt_del_timer(&console_session.ses_ping_timer);
+
+	ping_trans = console_session.ses_ping;
+
+	lstcon_rpc_trans_abort(ping_trans, -ESHUTDOWN);
+	lstcon_rpc_trans_stat(ping_trans, lstcon_trans_stat());
+	lstcon_rpc_trans_destroy(ping_trans);
+
+	memset(lstcon_trans_stat(), 0, sizeof(lstcon_trans_stat_t));
+
+	console_session.ses_ping = NULL;
+}
+
+/*
+ * Wait, at session shutdown, until every transaction and every console RPC
+ * is gone, then free the RPC descriptors cached on the session free list.
+ *
+ * The caller holds console_session.ses_mutex; the mutex is dropped around
+ * each one-second pause and re-taken afterwards, so it is held again on
+ * return.
+ */
+void
+lstcon_rpc_cleanup_wait(void)
+{
+	lstcon_rpc_trans_t *trans;
+	lstcon_rpc_t       *crpc;
+	struct list_head	 *pacer;
+	struct list_head	  zlist;
+
+	/* Called with hold of global mutex */
+
+	LASSERT (console_session.ses_shutdown);
+
+	/* phase 1: keep waking the waiters of every pending transaction
+	 * until the global transaction list has drained */
+	while (!list_empty(&console_session.ses_trans_list)) {
+		list_for_each(pacer, &console_session.ses_trans_list) {
+			trans = list_entry(pacer, lstcon_rpc_trans_t,
+					       tas_link);
+
+			CDEBUG(D_NET, "Session closed, wakeup transaction %s\n",
+			       lstcon_rpc_trans_name(trans->tas_opc));
+
+			wake_up(&trans->tas_waitq);
+		}
+
+		mutex_unlock(&console_session.ses_mutex);
+
+		CWARN("Session is shutting down, "
+		      "waiting for termination of transactions\n");
+		cfs_pause(cfs_time_seconds(1));
+
+		mutex_lock(&console_session.ses_mutex);
+	}
+
+	spin_lock(&console_session.ses_rpc_lock);
+
+	/* phase 2: wait until no console RPC is outstanding.
+	 * NOTE(review): lst_wait_until() presumably drops ses_rpc_lock while
+	 * it sleeps -- confirm against the macro definition */
+	lst_wait_until((atomic_read(&console_session.ses_rpc_counter) == 0),
+		       console_session.ses_rpc_lock,
+		       "Network is not accessable or target is down, "
+		       "waiting for %d console RPCs to being recycled\n",
+		       atomic_read(&console_session.ses_rpc_counter));
+
+	/* move the whole free list onto the local head: insert zlist right
+	 * after the list head, then unlink (and re-initialise) the head */
+	list_add(&zlist, &console_session.ses_rpc_freelist);
+	list_del_init(&console_session.ses_rpc_freelist);
+
+	spin_unlock(&console_session.ses_rpc_lock);
+
+	/* phase 3: free the detached descriptors outside the spinlock */
+	while (!list_empty(&zlist)) {
+		crpc = list_entry(zlist.next, lstcon_rpc_t, crp_link);
+
+		list_del(&crpc->crp_link);
+		LIBCFS_FREE(crpc, sizeof(lstcon_rpc_t));
+	}
+}
+
+/*
+ * Initialise the RPC related state of the console session: the ping timer
+ * (callback lstcon_rpc_pinger(), with the timer itself as its argument),
+ * the ping transaction pointer, and the lock, counter and free list used
+ * for console RPC descriptors.  Always returns 0.
+ */
+int
+lstcon_rpc_module_init(void)
+{
+	stt_timer_t *ptimer = &console_session.ses_ping_timer;
+
+	/* ping timer */
+	INIT_LIST_HEAD(&ptimer->stt_list);
+	ptimer->stt_func = lstcon_rpc_pinger;
+	ptimer->stt_data = ptimer;
+
+	console_session.ses_ping = NULL;
+
+	/* console RPC bookkeeping */
+	spin_lock_init(&console_session.ses_rpc_lock);
+	atomic_set(&console_session.ses_rpc_counter, 0);
+	INIT_LIST_HEAD(&console_session.ses_rpc_freelist);
+
+	return 0;
+}
+
+/*
+ * Module teardown check: by now the RPC free list must be empty and no
+ * console RPC may still be counted.  Nothing is released here.
+ */
+void
+lstcon_rpc_module_fini(void)
+{
+	LASSERT (list_empty(&console_session.ses_rpc_freelist));
+	LASSERT (atomic_read(&console_session.ses_rpc_counter) == 0);
+}
diff --git a/drivers/staging/lustre/lnet/selftest/conrpc.h b/drivers/staging/lustre/lnet/selftest/conrpc.h
new file mode 100644
index 000000000000..9aba24a2eab9
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/conrpc.h
@@ -0,0 +1,146 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * /lnet/selftest/conrpc.h
+ *
+ * Console rpc
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ */
+
+#ifndef __LST_CONRPC_H__
+#define __LST_CONRPC_H__
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-types.h>
+#include <linux/lnet/lnetst.h>
+#include "rpc.h"
+#include "selftest.h"
+
+/* Console rpc and rpc transaction */
+/* upper and lower bound for a transaction timeout
+ * NOTE(review): presumably in seconds -- confirm against the callers */
+#define LST_TRANS_TIMEOUT       30
+#define LST_TRANS_MIN_TIMEOUT   3
+
+/* clamp t into [LST_TRANS_MIN_TIMEOUT, LST_TRANS_TIMEOUT] */
+#define LST_VALIDATE_TIMEOUT(t) MIN(MAX(t, LST_TRANS_MIN_TIMEOUT), LST_TRANS_TIMEOUT)
+
+/* seconds between two runs of the console pinger */
+#define LST_PING_INTERVAL       8
+
+/* forward declarations: only pointers to these types are used below */
+struct lstcon_rpc_trans;
+struct lstcon_tsb_hdr;
+struct lstcon_test;
+struct lstcon_node;
+
+/*
+ * Console-side descriptor of one RPC sent to a node.  It wraps the
+ * underlying client RPC and ties it to its destination node and to the
+ * transaction it belongs to.
+ */
+typedef struct lstcon_rpc {
+	struct list_head	       crp_link;       /* chain on rpc transaction */
+	srpc_client_rpc_t       *crp_rpc;	/* client rpc */
+	struct lstcon_node      *crp_node;       /* destination node */
+	struct lstcon_rpc_trans *crp_trans;     /* conrpc transaction */
+
+	unsigned int		 crp_posted:1;   /* rpc is posted */
+	unsigned int		 crp_finished:1; /* rpc is finished */
+	unsigned int		 crp_unpacked:1; /* reply is unpacked */
+	/** RPC is embedded in other structure and can't free it */
+	unsigned int		 crp_embedded:1;
+	int		      crp_status;     /* console rpc errors */
+	cfs_time_t	       crp_stamp;      /* replied time stamp */
+} lstcon_rpc_t;
+
+/*
+ * A console transaction: the set of RPCs issued for one console operation
+ * (tas_opc is one of the LST_TRANS_* codes), together with the wait queue
+ * used to wait for them.
+ */
+typedef struct lstcon_rpc_trans {
+	struct list_head	    tas_olink;     /* link chain on owner list */
+	struct list_head	    tas_link;      /* link chain on global list */
+	int		   tas_opc;       /* operation code of transaction */
+	/* features mask is uptodate */
+	unsigned	      tas_feats_updated;
+	/* test features mask */
+	unsigned	      tas_features;
+	wait_queue_head_t	   tas_waitq;     /* wait queue head */
+	atomic_t	  tas_remaining; /* # of un-scheduled rpcs */
+	struct list_head	    tas_rpcs_list; /* queued requests */
+} lstcon_rpc_trans_t;
+
+/*
+ * Transaction operation codes (lstcon_rpc_trans_t::tas_opc).
+ * NOTE(review): the exact meaning of the LST_TRANS_PRIVATE bit is not
+ * visible in this header; it is set on the session new/end and on the test
+ * batch add/run/stop codes, but not on the query, ping and stat codes.
+ */
+#define LST_TRANS_PRIVATE       0x1000
+
+/* session operations */
+#define LST_TRANS_SESNEW	(LST_TRANS_PRIVATE | 0x01)
+#define LST_TRANS_SESEND	(LST_TRANS_PRIVATE | 0x02)
+#define LST_TRANS_SESQRY	0x03
+#define LST_TRANS_SESPING       0x04
+
+/* test batch operations */
+#define LST_TRANS_TSBCLIADD     (LST_TRANS_PRIVATE | 0x11)
+#define LST_TRANS_TSBSRVADD     (LST_TRANS_PRIVATE | 0x12)
+#define LST_TRANS_TSBRUN	(LST_TRANS_PRIVATE | 0x13)
+#define LST_TRANS_TSBSTOP       (LST_TRANS_PRIVATE | 0x14)
+#define LST_TRANS_TSBCLIQRY     0x15
+#define LST_TRANS_TSBSRVQRY     0x16
+
+/* statistics query */
+#define LST_TRANS_STATQRY       0x21
+
+/* per-node filter used when building a transaction; called with the
+ * transaction opcode, the node and a caller supplied argument */
+typedef int (* lstcon_rpc_cond_func_t)(int, struct lstcon_node *, void *);
+/* copies operation specific reply data into a result entry; called with
+ * the transaction opcode, the reply message and the entry to fill */
+typedef int (* lstcon_rpc_readent_func_t)(int, srpc_msg_t *, lstcon_rpc_ent_t *);
+
+/*
+ * RPC constructors: each prepares one request for node @nd and hands it
+ * back through @crpc.
+ * NOTE(review): the parameter called "version" here is named "feats" in
+ * the definitions of lstcon_testrpc_prep() and lstcon_statrpc_prep().
+ */
+int  lstcon_sesrpc_prep(struct lstcon_node *nd, int transop,
+			unsigned version, lstcon_rpc_t **crpc);
+int  lstcon_dbgrpc_prep(struct lstcon_node *nd,
+			unsigned version, lstcon_rpc_t **crpc);
+int  lstcon_batrpc_prep(struct lstcon_node *nd, int transop, unsigned version,
+			struct lstcon_tsb_hdr *tsb, lstcon_rpc_t **crpc);
+int  lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned version,
+			 struct lstcon_test *test, lstcon_rpc_t **crpc);
+int  lstcon_statrpc_prep(struct lstcon_node *nd, unsigned version,
+			 lstcon_rpc_t **crpc);
+void lstcon_rpc_put(lstcon_rpc_t *crpc);
+
+/* transaction handling */
+int  lstcon_rpc_trans_prep(struct list_head *translist,
+			   int transop, lstcon_rpc_trans_t **transpp);
+int  lstcon_rpc_trans_ndlist(struct list_head *ndlist,
+			     struct list_head *translist, int transop,
+			     void *arg, lstcon_rpc_cond_func_t condition,
+			     lstcon_rpc_trans_t **transpp);
+void lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans,
+			   lstcon_trans_stat_t *stat);
+int  lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans,
+				  struct list_head *head_up,
+				  lstcon_rpc_readent_func_t readent);
+void lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error);
+void lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans);
+void lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *req);
+int  lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout);
+
+/* pinger control, shutdown wait and module init/fini */
+int  lstcon_rpc_pinger_start(void);
+void lstcon_rpc_pinger_stop(void);
+void lstcon_rpc_cleanup_wait(void);
+int  lstcon_rpc_module_init(void);
+void lstcon_rpc_module_fini(void);
+
+
+#endif
diff --git a/drivers/staging/lustre/lnet/selftest/console.c b/drivers/staging/lustre/lnet/selftest/console.c
new file mode 100644
index 000000000000..78e8d0467267
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/console.c
@@ -0,0 +1,2071 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/console.c
+ *
+ * Infrastructure of LST console
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lib-lnet.h>
+#include "console.h"
+#include "conrpc.h"
+
+/*
+ * Account node @nd in the counters of @p: bump the counter that matches
+ * the node state (active, busy, down, or unknown for anything else) and
+ * the total node count.
+ */
+#define LST_NODE_STATE_COUNTER(nd, p)		   \
+do {						    \
+	if ((nd)->nd_state == LST_NODE_ACTIVE)	  \
+		(p)->nle_nactive ++;		    \
+	else if ((nd)->nd_state == LST_NODE_BUSY)       \
+		(p)->nle_nbusy ++;		      \
+	else if ((nd)->nd_state == LST_NODE_DOWN)       \
+		(p)->nle_ndown ++;		      \
+	else					    \
+		(p)->nle_nunknown ++;		   \
+	(p)->nle_nnode ++;			      \
+} while (0)
+
+/* the console session state shared by the console functions */
+lstcon_session_t	console_session;
+
+/*
+ * Take one more reference on @nd.  The caller must already own a
+ * reference (asserted).
+ */
+void
+lstcon_node_get(lstcon_node_t *nd)
+{
+	LASSERT (nd->nd_ref >= 1);
+
+	nd->nd_ref += 1;
+}
+
+/*
+ * Look up the node with process id @id in the session-wide node hash.
+ *
+ * On a hit a reference is taken for the caller.  On a miss, and only if
+ * @create is non-zero, a new node is allocated with reference count 1 and
+ * state LST_NODE_UNKNOWN and is linked into the session hash and list.
+ * The node and the ndlink that chains it on the session are allocated as
+ * one chunk, with the ndlink placed directly behind the node.
+ *
+ * Returns 0 with the node in *ndpp, -ENOENT if it does not exist and
+ * @create is zero, or -ENOMEM.
+ */
+static int
+lstcon_node_find(lnet_process_id_t id, lstcon_node_t **ndpp, int create)
+{
+	lstcon_ndlink_t *ndl;
+	unsigned int     idx = LNET_NIDADDR(id.nid) % LST_GLOBAL_HASHSIZE;
+
+	LASSERT (id.nid != LNET_NID_ANY);
+
+	list_for_each_entry(ndl, &console_session.ses_ndl_hash[idx], ndl_hlink) {
+		if (ndl->ndl_node->nd_id.nid != id.nid ||
+		    ndl->ndl_node->nd_id.pid != id.pid)
+			continue;
+
+		lstcon_node_get(ndl->ndl_node);
+		*ndpp = ndl->ndl_node;
+		return 0;
+	}
+
+	if (!create)
+		return -ENOENT;
+
+	LIBCFS_ALLOC(*ndpp, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t));
+	if (*ndpp == NULL)
+		return -ENOMEM;
+
+	/* the session ndlink lives right behind the node */
+	ndl = (lstcon_ndlink_t *)(*ndpp + 1);
+
+	ndl->ndl_node = *ndpp;
+
+	ndl->ndl_node->nd_ref   = 1;
+	ndl->ndl_node->nd_id    = id;
+	ndl->ndl_node->nd_stamp = cfs_time_current();
+	ndl->ndl_node->nd_state = LST_NODE_UNKNOWN;
+	ndl->ndl_node->nd_timeout = 0;
+	memset(&ndl->ndl_node->nd_ping, 0, sizeof(lstcon_rpc_t));
+
+	/* queued in global hash & list, no refcount is taken by
+	 * global hash & list, if caller release his refcount,
+	 * node will be released */
+	list_add_tail(&ndl->ndl_hlink, &console_session.ses_ndl_hash[idx]);
+	list_add_tail(&ndl->ndl_link, &console_session.ses_ndl_list);
+
+	return 0;
+}
+
+/*
+ * Drop one reference on @nd.  When the last reference goes away the node
+ * is unlinked from the session hash and list and freed, together with the
+ * ndlink that is stored directly behind it.
+ */
+void
+lstcon_node_put(lstcon_node_t *nd)
+{
+	lstcon_ndlink_t  *ndl;
+
+	LASSERT (nd->nd_ref > 0);
+
+	nd->nd_ref--;
+	if (nd->nd_ref > 0)
+		return;
+
+	/* last reference: the session ndlink follows the node in memory */
+	ndl = (lstcon_ndlink_t *)(nd + 1);
+
+	LASSERT (!list_empty(&ndl->ndl_link));
+	LASSERT (!list_empty(&ndl->ndl_hlink));
+
+	/* remove from session */
+	list_del(&ndl->ndl_hlink);
+	list_del(&ndl->ndl_link);
+
+	LIBCFS_FREE(nd, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t));
+}
+
+/*
+ * Find the ndlink for process id @id in the hash table @hash (an array of
+ * LST_NODE_HASHSIZE list heads), optionally creating it.
+ *
+ * @create selects what happens on a miss:
+ *   0      - fail with -ENOENT;
+ *   1      - create the ndlink, creating the node in the session as well
+ *            if it does not exist yet;
+ *   other  - create the ndlink only if the node already exists in the
+ *            session (lstcon_node_find() is called without create).
+ * A newly created ndlink owns the node reference obtained from
+ * lstcon_node_find() and is hashed, but is not put on any ndl_link list.
+ *
+ * Returns 0 with the ndlink in *ndlpp, -EINVAL for LNET_NID_ANY, -ENOENT
+ * or -ENOMEM.
+ */
+static int
+lstcon_ndlink_find(struct list_head *hash,
+		   lnet_process_id_t id, lstcon_ndlink_t **ndlpp, int create)
+{
+	unsigned int     idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE;
+	lstcon_ndlink_t *ndl;
+	lstcon_node_t   *nd;
+	int	      rc;
+
+	if (id.nid == LNET_NID_ANY)
+		return -EINVAL;
+
+	/* search in hash */
+	list_for_each_entry(ndl, &hash[idx], ndl_hlink) {
+		if (ndl->ndl_node->nd_id.nid != id.nid ||
+		    ndl->ndl_node->nd_id.pid != id.pid)
+			continue;
+
+		*ndlpp = ndl;
+		return 0;
+	}
+
+	if (create == 0)
+		return -ENOENT;
+
+	/* find or create in session hash */
+	rc = lstcon_node_find(id, &nd, (create == 1) ? 1 : 0);
+	if (rc != 0)
+		return rc;
+
+	LIBCFS_ALLOC(ndl, sizeof(lstcon_ndlink_t));
+	if (ndl == NULL) {
+		/* give back the reference taken by lstcon_node_find() */
+		lstcon_node_put(nd);
+		return -ENOMEM;
+	}
+
+	*ndlpp = ndl;
+
+	ndl->ndl_node = nd;
+	INIT_LIST_HEAD(&ndl->ndl_link);
+	list_add_tail(&ndl->ndl_hlink, &hash[idx]);
+
+	return  0;
+}
+
+/*
+ * Destroy @ndl: unhash it, drop its reference on the node and free it.
+ * The ndlink must already be off its ndl_link list but still be hashed
+ * (both asserted).
+ */
+static void
+lstcon_ndlink_release(lstcon_ndlink_t *ndl)
+{
+	lstcon_node_t *node;
+
+	LASSERT (list_empty(&ndl->ndl_link));
+	LASSERT (!list_empty(&ndl->ndl_hlink));
+
+	node = ndl->ndl_node;
+
+	list_del(&ndl->ndl_hlink); /* delete from hash */
+	lstcon_node_put(node);
+
+	LIBCFS_FREE(ndl, sizeof(*ndl));
+}
+
+/*
+ * Allocate and initialise an empty node group.
+ *
+ * @name may be NULL for an anonymous (temporary) group; otherwise it is
+ * copied into the fixed-size grp_name buffer.  The new group starts with
+ * one reference, owned by the caller, and is not linked on any list.
+ *
+ * Returns 0 with the group in *grpp, -E2BIG if @name does not fit into
+ * grp_name together with its terminating NUL, or -ENOMEM.
+ */
+static int
+lstcon_group_alloc(char *name, lstcon_group_t **grpp)
+{
+	lstcon_group_t *grp;
+	int	     i;
+
+	/* grp_name is a fixed-size array: refuse a name that would overflow
+	 * it instead of letting strcpy() write past the end */
+	if (name != NULL && strlen(name) >= sizeof(grp->grp_name))
+		return -E2BIG;
+
+	LIBCFS_ALLOC(grp, offsetof(lstcon_group_t,
+				   grp_ndl_hash[LST_NODE_HASHSIZE]));
+	if (grp == NULL)
+		return -ENOMEM;
+
+	memset(grp, 0, offsetof(lstcon_group_t,
+				grp_ndl_hash[LST_NODE_HASHSIZE]));
+
+	grp->grp_ref = 1;
+	if (name != NULL)
+		strcpy(grp->grp_name, name); /* length checked above */
+
+	INIT_LIST_HEAD(&grp->grp_link);
+	INIT_LIST_HEAD(&grp->grp_ndl_list);
+	INIT_LIST_HEAD(&grp->grp_trans_list);
+
+	for (i = 0; i < LST_NODE_HASHSIZE; i++)
+		INIT_LIST_HEAD(&grp->grp_ndl_hash[i]);
+
+	*grpp = grp;
+
+	return 0;
+}
+
+/* Take one more reference on @grp. */
+static void
+lstcon_group_addref(lstcon_group_t *grp)
+{
+	grp->grp_ref += 1;
+}
+
+static void lstcon_group_ndlink_release(lstcon_group_t *, lstcon_ndlink_t *);
+
+/*
+ * Release every node link of @grp whose node state has none of the bits
+ * of @keep set.  With @keep == 0 the group is emptied completely.
+ */
+static void
+lstcon_group_drain(lstcon_group_t *grp, int keep)
+{
+	lstcon_ndlink_t *cur;
+	lstcon_ndlink_t *next;
+
+	list_for_each_entry_safe(cur, next, &grp->grp_ndl_list, ndl_link) {
+		if ((cur->ndl_node->nd_state & keep) != 0)
+			continue;
+
+		lstcon_group_ndlink_release(grp, cur);
+	}
+}
+
+/*
+ * Drop one reference on @grp.  On the last reference the group is taken
+ * off the list it is linked on (if any), all of its node links are
+ * released and the group itself is freed.
+ */
+static void
+lstcon_group_decref(lstcon_group_t *grp)
+{
+	int     i;
+
+	grp->grp_ref--;
+	if (grp->grp_ref > 0)
+		return;
+
+	if (!list_empty(&grp->grp_link))
+		list_del(&grp->grp_link);
+
+	lstcon_group_drain(grp, 0);
+
+	/* draining must have emptied every hash chain */
+	for (i = 0; i < LST_NODE_HASHSIZE; i++) {
+		LASSERT (list_empty(&grp->grp_ndl_hash[i]));
+	}
+
+	LIBCFS_FREE(grp, offsetof(lstcon_group_t,
+				  grp_ndl_hash[LST_NODE_HASHSIZE]));
+}
+
+/*
+ * Look up the group called @name on the session group list (names are
+ * compared over at most LST_NAME_SIZE characters).
+ *
+ * Returns 0 with the group in *grpp and one reference taken for the
+ * caller, or -ENOENT.
+ */
+static int
+lstcon_group_find(char *name, lstcon_group_t **grpp)
+{
+	lstcon_group_t   *cur;
+
+	list_for_each_entry(cur, &console_session.ses_grp_list, grp_link) {
+		if (strncmp(cur->grp_name, name, LST_NAME_SIZE) == 0) {
+			lstcon_group_addref(cur);  /* +1 ref for caller */
+			*grpp = cur;
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+/* Release a group reference; thin wrapper around lstcon_group_decref(). */
+static void
+lstcon_group_put(lstcon_group_t *grp)
+{
+	lstcon_group_decref(grp);
+}
+
+/*
+ * Find (or, depending on @create, create) the node link for @id in @grp.
+ *
+ * A link that is not yet chained on the group's node list -- i.e. one that
+ * lstcon_ndlink_find() has just created -- is appended to the list and
+ * counted in grp_nnode.
+ *
+ * Returns 0 with the link in *ndlpp, or the error of lstcon_ndlink_find().
+ */
+static int
+lstcon_group_ndlink_find(lstcon_group_t *grp, lnet_process_id_t id,
+			 lstcon_ndlink_t **ndlpp, int create)
+{
+	lstcon_ndlink_t *link;
+	int		 rc;
+
+	rc = lstcon_ndlink_find(&grp->grp_ndl_hash[0], id, ndlpp, create);
+	if (rc != 0)
+		return rc;
+
+	link = *ndlpp;
+	if (list_empty(&link->ndl_link)) {
+		/* fresh link: make it a member of the group */
+		list_add_tail(&link->ndl_link, &grp->grp_ndl_list);
+		grp->grp_nnode++;
+	}
+
+	return 0;
+}
+
+/*
+ * Remove the node link @ndl from @grp: unchain it from the group's node
+ * list, adjust the node count and destroy the link.
+ */
+static void
+lstcon_group_ndlink_release(lstcon_group_t *grp, lstcon_ndlink_t *ndl)
+{
+	list_del_init(&ndl->ndl_link);
+	grp->grp_nnode--;
+
+	lstcon_ndlink_release(ndl);
+}
+
+/*
+ * Move the node link @ndl from group @old to group @new: it is taken off
+ * the hash and node list of @old and appended to the matching hash chain
+ * and to the node list of @new; both node counts are adjusted.
+ */
+static void
+lstcon_group_ndlink_move(lstcon_group_t *old,
+			 lstcon_group_t *new, lstcon_ndlink_t *ndl)
+{
+	unsigned int bucket;
+
+	bucket = LNET_NIDADDR(ndl->ndl_node->nd_id.nid) % LST_NODE_HASHSIZE;
+
+	/* detach from the old group */
+	list_del(&ndl->ndl_hlink);
+	list_del(&ndl->ndl_link);
+	old->grp_nnode--;
+
+	/* attach to the new group */
+	list_add_tail(&ndl->ndl_hlink, &new->grp_ndl_hash[bucket]);
+	list_add_tail(&ndl->ndl_link, &new->grp_ndl_list);
+	new->grp_nnode++;
+}
+
+/*
+ * Move every node link of group @old over to group @new, preserving the
+ * list order; @old is empty afterwards.
+ */
+static void
+lstcon_group_move(lstcon_group_t *old, lstcon_group_t *new)
+{
+	lstcon_ndlink_t *cur;
+	lstcon_ndlink_t *next;
+
+	/* the _safe variant is needed: each move unlinks cur from @old */
+	list_for_each_entry_safe(cur, next, &old->grp_ndl_list, ndl_link)
+		lstcon_group_ndlink_move(old, new, cur);
+}
+
+/*
+ * Node filter for session transactions (see lstcon_rpc_cond_func_t).
+ *
+ * Returns 1 if an RPC should be sent to @nd, 0 if the node is skipped:
+ *  - LST_TRANS_SESNEW: skip nodes that are already active;
+ *  - LST_TRANS_SESEND: skip nodes that are not active and, when a group
+ *    is passed in @arg, nodes that are referenced more than once;
+ *  - LST_TRANS_SESQRY: never skip.
+ * Any other opcode is a bug.
+ */
+int
+lstcon_sesrpc_condition(int transop, lstcon_node_t *nd, void *arg)
+{
+	lstcon_group_t *grp = (lstcon_group_t *)arg;
+
+	switch (transop) {
+	case LST_TRANS_SESNEW:
+		return (nd->nd_state == LST_NODE_ACTIVE) ? 0 : 1;
+
+	case LST_TRANS_SESEND:
+		if (nd->nd_state != LST_NODE_ACTIVE)
+			return 0;
+
+		return (grp != NULL && nd->nd_ref > 1) ? 0 : 1;
+
+	case LST_TRANS_SESQRY:
+		return 1;
+
+	default:
+		LBUG();
+	}
+
+	return 1;
+}
+
+/*
+ * Copy the operation specific part of a session reply to user space (see
+ * lstcon_rpc_readent_func_t).
+ *
+ * Only LST_TRANS_SESQRY has such data: the timeout goes to rpe_priv[0]
+ * and the name (LST_NAME_SIZE bytes) to rpe_payload of @ent_up.  Session
+ * new/end replies carry nothing extra.  Any other opcode is a bug.
+ *
+ * Returns 0, or -EFAULT if a copy to user space fails.
+ */
+int
+lstcon_sesrpc_readent(int transop, srpc_msg_t *msg,
+		      lstcon_rpc_ent_t *ent_up)
+{
+	srpc_debug_reply_t *dbg;
+
+	if (transop == LST_TRANS_SESNEW || transop == LST_TRANS_SESEND)
+		return 0;
+
+	if (transop != LST_TRANS_SESQRY) {
+		LBUG();
+		return 0;
+	}
+
+	dbg = &msg->msg_body.dbg_reply;
+
+	if (copy_to_user(&ent_up->rpe_priv[0],
+			 &dbg->dbg_timeout, sizeof(int)))
+		return -EFAULT;
+
+	if (copy_to_user(&ent_up->rpe_payload[0],
+			 &dbg->dbg_name, LST_NAME_SIZE))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Add the @count nodes whose ids are stored in the user-space array
+ * @ids_up to group @grp.
+ *
+ * Ids already present in @grp are skipped; the others are collected in a
+ * temporary group.  A LST_TRANS_SESNEW transaction is then run over the
+ * temporary group (filtered by lstcon_sesrpc_condition()), its results are
+ * reported through @result_up and its feature mask through @featp.  Once
+ * the transaction has been run the collected nodes are moved into @grp,
+ * whatever the result of the interpreter was.
+ *
+ * Returns the result of lstcon_rpc_trans_interpreter(), or an error code
+ * if the nodes could not be collected or the transaction not be created
+ * (in which case @grp is left unchanged).
+ */
+static int
+lstcon_group_nodes_add(lstcon_group_t *grp,
+		       int count, lnet_process_id_t *ids_up,
+		       unsigned *featp, struct list_head *result_up)
+{
+	lstcon_rpc_trans_t      *trans;
+	lstcon_ndlink_t	 *ndl;
+	lstcon_group_t	  *tmp;
+	lnet_process_id_t	id;
+	int		      i;
+	int		      rc;
+
+	rc = lstcon_group_alloc(NULL, &tmp);
+	if (rc != 0) {
+		CERROR("Out of memory\n");
+		return -ENOMEM;
+	}
+
+	for (i = 0 ; i < count; i++) {
+		if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+			rc = -EFAULT;
+			break;
+		}
+
+		/* skip if it's in this group already */
+		rc = lstcon_group_ndlink_find(grp, id, &ndl, 0);
+		if (rc == 0)
+			continue;
+
+		/* add to tmp group */
+		rc = lstcon_group_ndlink_find(tmp, id, &ndl, 1);
+		if (rc != 0) {
+			CERROR("Can't create ndlink, out of memory\n");
+			break;
+		}
+	}
+
+	if (rc != 0) {
+		/* dropping the last reference releases the collected nodes */
+		lstcon_group_put(tmp);
+		return rc;
+	}
+
+	rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list,
+				     &tmp->grp_trans_list, LST_TRANS_SESNEW,
+				     tmp, lstcon_sesrpc_condition, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		lstcon_group_put(tmp);
+		return rc;
+	}
+
+	/* post all RPCs */
+	lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+	rc = lstcon_rpc_trans_interpreter(trans, result_up,
+					  lstcon_sesrpc_readent);
+	*featp = trans->tas_features;
+
+	/* destroy all RPCs */
+	lstcon_rpc_trans_destroy(trans);
+
+	/* hand the new nodes over to the real group, then drop tmp */
+	lstcon_group_move(tmp, grp);
+	lstcon_group_put(tmp);
+
+	return rc;
+}
+
+/*
+ * Remove the @count nodes whose ids are stored in the user-space array
+ * @ids_up from group @grp, ending the session on them.
+ *
+ * Matching nodes are first moved to a temporary group; ids that are not
+ * in @grp are ignored.  A LST_TRANS_SESEND transaction is run over the
+ * temporary group (filtered by lstcon_sesrpc_condition()) and its results
+ * are reported through @result_up.  Once the transaction has been run the
+ * nodes are released regardless of the outcome.  If the ids cannot be read
+ * or the transaction cannot be created, the nodes are moved back to @grp.
+ *
+ * Returns the result of lstcon_rpc_trans_interpreter() or an error code.
+ */
+static int
+lstcon_group_nodes_remove(lstcon_group_t *grp,
+			  int count, lnet_process_id_t *ids_up,
+			  struct list_head *result_up)
+{
+	lstcon_rpc_trans_t     *trans;
+	lstcon_ndlink_t	*ndl;
+	lstcon_group_t	 *tmp;
+	lnet_process_id_t       id;
+	int		     rc;
+	int		     i;
+
+	/* End session and remove node from the group */
+
+	rc = lstcon_group_alloc(NULL, &tmp);
+	if (rc != 0) {
+		CERROR("Out of memory\n");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < count; i++) {
+		if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+			rc = -EFAULT;
+			goto error;
+		}
+
+		/* move node to tmp group */
+		if (lstcon_group_ndlink_find(grp, id, &ndl, 0) == 0)
+			lstcon_group_ndlink_move(grp, tmp, ndl);
+	}
+
+	rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list,
+				     &tmp->grp_trans_list, LST_TRANS_SESEND,
+				     tmp, lstcon_sesrpc_condition, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		goto error;
+	}
+
+	lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+	rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+	lstcon_rpc_trans_destroy(trans);
+	/* release nodes anyway, because we can't rollback status */
+	lstcon_group_put(tmp);
+
+	return rc;
+error:
+	/* undo: give the nodes collected so far back to the group */
+	lstcon_group_move(tmp, grp);
+	lstcon_group_put(tmp);
+
+	return rc;
+}
+
+int
+lstcon_group_add(char *name)
+{
+	lstcon_group_t *grp;
+	int	     rc;
+
+	rc = (lstcon_group_find(name, &grp) == 0)? -EEXIST: 0;
+	if (rc != 0) {
+		/* find a group with same name */
+		lstcon_group_put(grp);
+		return rc;
+	}
+
+	rc = lstcon_group_alloc(name, &grp);
+	if (rc != 0) {
+		CERROR("Can't allocate descriptor for group %s\n", name);
+		return -ENOMEM;
+	}
+
+	list_add_tail(&grp->grp_link, &console_session.ses_grp_list);
+
+	return rc;
+}
+
+int
+lstcon_nodes_add(char *name, int count, lnet_process_id_t *ids_up,
+		 unsigned *featp, struct list_head *result_up)
+{
+	lstcon_group_t	 *grp;
+	int		     rc;
+
+	LASSERT (count > 0);
+	LASSERT (ids_up != NULL);
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group %s\n", name);
+		return rc;
+	}
+
+	if (grp->grp_ref > 2) {
+		/* referred by other threads or test */
+		CDEBUG(D_NET, "Group %s is busy\n", name);
+		lstcon_group_put(grp);
+
+		return -EBUSY;
+	}
+
+	rc = lstcon_group_nodes_add(grp, count, ids_up, featp, result_up);
+
+	lstcon_group_put(grp);
+
+	return rc;
+}
+
+int
+lstcon_group_del(char *name)
+{
+	lstcon_rpc_trans_t *trans;
+	lstcon_group_t     *grp;
+	int		 rc;
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group: %s\n", name);
+		return rc;
+	}
+
+	if (grp->grp_ref > 2) {
+		/* referred by other threads or test */
+		CDEBUG(D_NET, "Group %s is busy\n", name);
+		lstcon_group_put(grp);
+		return -EBUSY;
+	}
+
+	rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list,
+				     &grp->grp_trans_list, LST_TRANS_SESEND,
+				     grp, lstcon_sesrpc_condition, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		lstcon_group_put(grp);
+		return rc;
+	}
+	/* rc is 0 from here on; the outcome of the RPCs is not examined */
+	lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+	lstcon_rpc_trans_destroy(trans);
+	/* -ref held by this function (presumably taken by lstcon_group_find()) */
+	lstcon_group_put(grp);
+	/* -ref for session, it's destroyed,
+	 * status can't be rolled back, destroy group anyway */
+	lstcon_group_put(grp);
+
+	return rc;
+}
+
+int
+lstcon_group_clean(char *name, int args)
+{
+	lstcon_group_t *grp = NULL;
+	int	     rc;
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group %s\n", name);
+		return rc;
+	}
+
+	if (grp->grp_ref > 2) {
+		/* referred by test */
+		CDEBUG(D_NET, "Group %s is busy\n", name);
+		lstcon_group_put(grp);
+		return -EBUSY;
+	}
+
+	args = (LST_NODE_ACTIVE | LST_NODE_BUSY |
+		LST_NODE_DOWN | LST_NODE_UNKNOWN) & ~args;
+
+	lstcon_group_drain(grp, args);
+
+	lstcon_group_put(grp);
+	/* release empty group */
+	if (list_empty(&grp->grp_ndl_list))
+		lstcon_group_put(grp);
+
+	return 0;
+}
+
+int
+lstcon_nodes_remove(char *name, int count,
+		    lnet_process_id_t *ids_up, struct list_head *result_up)
+{
+	lstcon_group_t *grp = NULL;
+	int	     rc;
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group: %s\n", name);
+		return rc;
+	}
+
+	if (grp->grp_ref > 2) {
+		/* referred by test */
+		CDEBUG(D_NET, "Group %s is busy\n", name);
+		lstcon_group_put(grp);
+		return -EBUSY;
+	}
+
+	rc = lstcon_group_nodes_remove(grp, count, ids_up, result_up);
+
+	lstcon_group_put(grp);
+	/* release empty group */
+	if (list_empty(&grp->grp_ndl_list))
+		lstcon_group_put(grp);
+
+	return rc;
+}
+
+int
+lstcon_group_refresh(char *name, struct list_head *result_up)
+{
+	lstcon_rpc_trans_t      *trans;
+	lstcon_group_t	  *grp;
+	int		      rc;
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group: %s\n", name);
+		return rc;
+	}
+
+	if (grp->grp_ref > 2) {
+		/* referred by test */
+		CDEBUG(D_NET, "Group %s is busy\n", name);
+		lstcon_group_put(grp);
+		return -EBUSY;
+	}
+
+	/* re-invite all inactive nodes in the group */
+	rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list,
+				     &grp->grp_trans_list, LST_TRANS_SESNEW,
+				     grp, lstcon_sesrpc_condition, &trans);
+	if (rc != 0) {
+		/* local error, return */
+		CDEBUG(D_NET, "Can't create transaction: %d\n", rc);
+		lstcon_group_put(grp);
+		return rc;
+	}
+
+	lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+	/* the interpreter's result (written through result_up) becomes rc */
+	rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+	lstcon_rpc_trans_destroy(trans);
+	/* -ref for me */
+	lstcon_group_put(grp);
+
+	return rc;
+}
+
+int
+lstcon_group_list(int index, int len, char *name_up)
+{
+	lstcon_group_t *grp;
+
+	LASSERT (index >= 0);
+	LASSERT (name_up != NULL);
+
+	list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) {
+		if (index-- == 0) {
+			return copy_to_user(name_up, grp->grp_name, len) ?
+			       -EFAULT : 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+static int
+lstcon_nodes_getent(struct list_head *head, int *index_p,
+		    int *count_p, lstcon_node_ent_t *dents_up)
+{
+	lstcon_ndlink_t  *ndl;
+	lstcon_node_t    *nd;
+	int	       count = 0;
+	int	       index = 0;
+	/* copy up to *count_p (id, state) pairs to dents_up, from entry *index_p */
+	LASSERT (index_p != NULL && count_p != NULL);
+	LASSERT (dents_up != NULL);
+	LASSERT (*index_p >= 0);
+	LASSERT (*count_p > 0);
+
+	list_for_each_entry(ndl, head, ndl_link) {
+		if (index++ < *index_p)
+			continue;
+		/* NOTE(review): index was already bumped for this uncopied entry */
+		if (count >= *count_p)
+			break;
+
+		nd = ndl->ndl_node;
+		if (copy_to_user(&dents_up[count].nde_id,
+				     &nd->nd_id, sizeof(nd->nd_id)) ||
+		    copy_to_user(&dents_up[count].nde_state,
+				     &nd->nd_state, sizeof(nd->nd_state)))
+			return -EFAULT;
+
+		count ++;
+	}
+	/* no more than *index_p entries were walked: nothing at that position */
+	if (index <= *index_p)
+		return -ENOENT;
+	/* out: number of entries copied and the walk position that was reached */
+	*count_p = count;
+	*index_p = index;
+
+	return 0;
+}
+
+/* Report on group "name": node entries go to dents_up when it is set (verbose
+ * query), otherwise the LST_NODE_STATE_COUNTER() totals go to gents_p.
+ * Returns 0, the lookup error, -ENOMEM, or -EFAULT on a failed user copy. */
+int
+lstcon_group_info(char *name, lstcon_ndlist_ent_t *gents_p,
+		  int *index_p, int *count_p, lstcon_node_ent_t *dents_up)
+{
+	lstcon_ndlist_ent_t *gentp;
+	lstcon_group_t      *grp;
+	lstcon_ndlink_t     *ndl;
+	int		  rc;
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group %s\n", name);
+		return rc;
+	}
+
+	if (dents_up != NULL) {
+		/* verbose query */
+		rc = lstcon_nodes_getent(&grp->grp_ndl_list,
+					 index_p, count_p, dents_up);
+		goto out;
+	}
+
+	/* non-verbose query */
+	LIBCFS_ALLOC(gentp, sizeof(lstcon_ndlist_ent_t));
+	if (gentp == NULL) {
+		CERROR("Can't allocate ndlist_ent\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	memset(gentp, 0, sizeof(lstcon_ndlist_ent_t));
+
+	list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link)
+		LST_NODE_STATE_COUNTER(ndl->ndl_node, gentp);
+
+	/* a failed copy must reach the caller instead of being reported as 0 */
+	rc = copy_to_user(gents_p, gentp,
+			  sizeof(lstcon_ndlist_ent_t)) ? -EFAULT : 0;
+	LIBCFS_FREE(gentp, sizeof(lstcon_ndlist_ent_t));
+
+out:
+	lstcon_group_put(grp);
+	return rc;
+}
+
+int
+lstcon_batch_find(char *name, lstcon_batch_t **batpp)
+{
+	lstcon_batch_t   *bat;
+
+	list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) {
+		if (strncmp(bat->bat_name, name, LST_NAME_SIZE) == 0) {
+			*batpp = bat;
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+/* Allocate an idle, empty batch called "name" and append it to the console
+ * session's batch list; 0 on success, -EEXIST if the name is already in use,
+ * -ENOMEM if the descriptor or one of its hash tables cannot be allocated. */
+int
+lstcon_batch_add(char *name)
+{
+	const size_t	  hsize = sizeof(struct list_head) * LST_NODE_HASHSIZE;
+	lstcon_batch_t   *bat;
+	int	       i;
+
+	if (lstcon_batch_find(name, &bat) == 0) {
+		CDEBUG(D_NET, "Batch %s already exists\n", name);
+		return -EEXIST;
+	}
+
+	LIBCFS_ALLOC(bat, sizeof(lstcon_batch_t));
+	if (bat == NULL) {
+		CERROR("Can't allocate descriptor for batch %s\n", name);
+		return -ENOMEM;
+	}
+
+	LIBCFS_ALLOC(bat->bat_cli_hash, hsize);
+	if (bat->bat_cli_hash == NULL) {
+		CERROR("Can't allocate hash for batch %s\n", name);
+		LIBCFS_FREE(bat, sizeof(lstcon_batch_t));
+		return -ENOMEM;
+	}
+
+	LIBCFS_ALLOC(bat->bat_srv_hash, hsize);
+	if (bat->bat_srv_hash == NULL) {
+		CERROR("Can't allocate hash for batch %s\n", name);
+		/* free with the allocated byte size, not just the bucket count */
+		LIBCFS_FREE(bat->bat_cli_hash, hsize);
+		LIBCFS_FREE(bat, sizeof(lstcon_batch_t));
+		return -ENOMEM;
+	}
+
+	/* NOTE(review): unbounded copy - assumes callers bound strlen(name) */
+	strcpy(bat->bat_name, name);
+	bat->bat_hdr.tsb_index = 0;
+	bat->bat_hdr.tsb_id.bat_id = ++console_session.ses_id_cookie;
+
+	bat->bat_ntest = 0;
+	bat->bat_state = LST_BATCH_IDLE;
+
+	INIT_LIST_HEAD(&bat->bat_cli_list);
+	INIT_LIST_HEAD(&bat->bat_srv_list);
+	INIT_LIST_HEAD(&bat->bat_test_list);
+	INIT_LIST_HEAD(&bat->bat_trans_list);
+
+	for (i = 0; i < LST_NODE_HASHSIZE; i++) {
+		INIT_LIST_HEAD(&bat->bat_cli_hash[i]);
+		INIT_LIST_HEAD(&bat->bat_srv_hash[i]);
+	}
+
+	list_add_tail(&bat->bat_link, &console_session.ses_bat_list);
+
+	return 0;
+}
+
+int
+lstcon_batch_list(int index, int len, char *name_up)
+{
+	lstcon_batch_t    *bat;
+
+	LASSERT (name_up != NULL);
+	LASSERT (index >= 0);
+
+	list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) {
+		if (index-- == 0) {
+			return copy_to_user(name_up,bat->bat_name, len) ?
+			       -EFAULT: 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+int
+lstcon_batch_info(char *name, lstcon_test_batch_ent_t *ent_up, int server,
+		  int testidx, int *index_p, int *ndent_p,
+		  lstcon_node_ent_t *dents_up)
+{
+	lstcon_test_batch_ent_t *entp;
+	struct list_head	      *clilst;
+	struct list_head	      *srvlst;
+	lstcon_test_t	   *test = NULL;
+	lstcon_batch_t	  *bat;
+	lstcon_ndlink_t	 *ndl;
+	int		      rc;
+
+	rc = lstcon_batch_find(name, &bat);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find batch %s\n", name);
+		return -ENOENT;
+	}
+
+	if (testidx > 0) {
+		/* query test, test index start from 1 */
+		list_for_each_entry(test, &bat->bat_test_list, tes_link) {
+			if (testidx-- == 1)
+				break;
+		}
+
+		if (testidx > 0) {
+			CDEBUG(D_NET, "Can't find specified test in batch\n");
+			return -ENOENT;
+		}
+	}
+
+	clilst = (test == NULL) ? &bat->bat_cli_list :
+				  &test->tes_src_grp->grp_ndl_list;
+	srvlst = (test == NULL) ? &bat->bat_srv_list :
+				  &test->tes_dst_grp->grp_ndl_list;
+
+	if (dents_up != NULL) {
+		rc = lstcon_nodes_getent((server ? srvlst: clilst),
+					 index_p, ndent_p, dents_up);
+		return rc;
+	}
+
+	/* non-verbose query */
+	LIBCFS_ALLOC(entp, sizeof(lstcon_test_batch_ent_t));
+	if (entp == NULL)
+		return -ENOMEM;
+
+	memset(entp, 0, sizeof(lstcon_test_batch_ent_t));
+
+	if (test == NULL) {
+		entp->u.tbe_batch.bae_ntest = bat->bat_ntest;
+		entp->u.tbe_batch.bae_state = bat->bat_state;
+
+	} else {
+
+		entp->u.tbe_test.tse_type   = test->tes_type;
+		entp->u.tbe_test.tse_loop   = test->tes_loop;
+		entp->u.tbe_test.tse_concur = test->tes_concur;
+	}
+
+	list_for_each_entry(ndl, clilst, ndl_link)
+		LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_cli_nle);
+
+	list_for_each_entry(ndl, srvlst, ndl_link)
+		LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_srv_nle);
+
+	rc = copy_to_user(ent_up, entp,
+			      sizeof(lstcon_test_batch_ent_t)) ? -EFAULT : 0;
+
+	LIBCFS_FREE(entp, sizeof(lstcon_test_batch_ent_t));
+
+	return rc;
+}
+
+int
+lstcon_batrpc_condition(int transop, lstcon_node_t *nd, void *arg)
+{
+	switch (transop) {
+	case LST_TRANS_TSBRUN:
+		if (nd->nd_state != LST_NODE_ACTIVE)
+			return -ENETDOWN;
+		break;
+
+	case LST_TRANS_TSBSTOP:
+		if (nd->nd_state != LST_NODE_ACTIVE)
+			return 0;
+		break;
+
+	case LST_TRANS_TSBCLIQRY:
+	case LST_TRANS_TSBSRVQRY:
+		break;
+	}
+
+	return 1;
+}
+
+static int
+lstcon_batch_op(lstcon_batch_t *bat, int transop,
+		struct list_head *result_up)
+{
+	lstcon_rpc_trans_t *trans;
+	int		 rc;
+
+	rc = lstcon_rpc_trans_ndlist(&bat->bat_cli_list,
+				     &bat->bat_trans_list, transop,
+				     bat, lstcon_batrpc_condition, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		return rc;
+	}
+
+	lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+	rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+	lstcon_rpc_trans_destroy(trans);
+
+	return rc;
+}
+
+int
+lstcon_batch_run(char *name, int timeout, struct list_head *result_up)
+{
+	lstcon_batch_t *bat;
+	int	     rc;
+
+	if (lstcon_batch_find(name, &bat) != 0) {
+		CDEBUG(D_NET, "Can't find batch %s\n", name);
+		return -ENOENT;
+	}
+
+	bat->bat_arg = timeout;
+
+	rc = lstcon_batch_op(bat, LST_TRANS_TSBRUN, result_up);
+
+	/* mark batch as running if it's started in any node */
+	if (lstcon_tsbop_stat_success(lstcon_trans_stat(), 0) != 0)
+		bat->bat_state = LST_BATCH_RUNNING;
+
+	return rc;
+}
+
+int
+lstcon_batch_stop(char *name, int force, struct list_head *result_up)
+{
+	lstcon_batch_t *bat;
+	int	     rc;
+
+	if (lstcon_batch_find(name, &bat) != 0) {
+		CDEBUG(D_NET, "Can't find batch %s\n", name);
+		return -ENOENT;
+	}
+
+	bat->bat_arg = force;
+
+	rc = lstcon_batch_op(bat, LST_TRANS_TSBSTOP, result_up);
+
+	/* mark batch as stopped if all RPCs finished */
+	if (lstcon_tsbop_stat_failure(lstcon_trans_stat(), 0) == 0)
+		bat->bat_state = LST_BATCH_IDLE;
+
+	return rc;
+}
+
+static void
+lstcon_batch_destroy(lstcon_batch_t *bat)
+{
+	lstcon_ndlink_t    *ndl;
+	lstcon_test_t      *test;
+	int		 i;
+
+	list_del(&bat->bat_link);
+
+	while (!list_empty(&bat->bat_test_list)) {
+		test = list_entry(bat->bat_test_list.next,
+				      lstcon_test_t, tes_link);
+		LASSERT (list_empty(&test->tes_trans_list));
+
+		list_del(&test->tes_link);
+
+		lstcon_group_put(test->tes_src_grp);
+		lstcon_group_put(test->tes_dst_grp);
+
+		LIBCFS_FREE(test, offsetof(lstcon_test_t,
+					   tes_param[test->tes_paramlen]));
+	}
+
+	LASSERT (list_empty(&bat->bat_trans_list));
+
+	while (!list_empty(&bat->bat_cli_list)) {
+		ndl = list_entry(bat->bat_cli_list.next,
+				     lstcon_ndlink_t, ndl_link);
+		list_del_init(&ndl->ndl_link);
+
+		lstcon_ndlink_release(ndl);
+	}
+
+	while (!list_empty(&bat->bat_srv_list)) {
+		ndl = list_entry(bat->bat_srv_list.next,
+				     lstcon_ndlink_t, ndl_link);
+		list_del_init(&ndl->ndl_link);
+
+		lstcon_ndlink_release(ndl);
+	}
+
+	for (i = 0; i < LST_NODE_HASHSIZE; i++) {
+		LASSERT (list_empty(&bat->bat_cli_hash[i]));
+		LASSERT (list_empty(&bat->bat_srv_hash[i]));
+	}
+
+	LIBCFS_FREE(bat->bat_cli_hash,
+		    sizeof(struct list_head) * LST_NODE_HASHSIZE);
+	LIBCFS_FREE(bat->bat_srv_hash,
+		    sizeof(struct list_head) * LST_NODE_HASHSIZE);
+	LIBCFS_FREE(bat, sizeof(lstcon_batch_t));
+}
+
+int
+lstcon_testrpc_condition(int transop, lstcon_node_t *nd, void *arg)
+{
+	lstcon_test_t    *test;
+	lstcon_batch_t   *batch;
+	lstcon_ndlink_t  *ndl;
+	struct list_head       *hash;
+	struct list_head       *head;
+
+	test = (lstcon_test_t *)arg;
+	LASSERT (test != NULL);
+
+	batch = test->tes_batch;
+	LASSERT (batch != NULL);
+
+	if (test->tes_oneside &&
+	    transop == LST_TRANS_TSBSRVADD)
+		return 0;
+
+	if (nd->nd_state != LST_NODE_ACTIVE)
+		return -ENETDOWN;
+
+	if (transop == LST_TRANS_TSBCLIADD) {
+		hash = batch->bat_cli_hash;
+		head = &batch->bat_cli_list;
+
+	} else {
+		LASSERT (transop == LST_TRANS_TSBSRVADD);
+
+		hash = batch->bat_srv_hash;
+		head = &batch->bat_srv_list;
+	}
+
+	LASSERT (nd->nd_id.nid != LNET_NID_ANY);
+
+	if (lstcon_ndlink_find(hash, nd->nd_id, &ndl, 1) != 0)
+		return -ENOMEM;
+
+	if (list_empty(&ndl->ndl_link))
+		list_add_tail(&ndl->ndl_link, head);
+
+	return 1;
+}
+
+static int
+lstcon_test_nodes_add(lstcon_test_t *test, struct list_head *result_up)
+{
+	lstcon_rpc_trans_t     *trans;
+	lstcon_group_t	 *grp;
+	int		     transop;
+	int		     rc;
+
+	LASSERT (test->tes_src_grp != NULL);
+	LASSERT (test->tes_dst_grp != NULL);
+	/* first pass: servers (destination group); second pass: clients (source) */
+	transop = LST_TRANS_TSBSRVADD;
+	grp  = test->tes_dst_grp;
+again:
+	rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list,
+				     &test->tes_trans_list, transop,
+				     test, lstcon_testrpc_condition, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		return rc;
+	}
+
+	lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+	if (lstcon_trans_stat()->trs_rpc_errno != 0 ||
+	    lstcon_trans_stat()->trs_fwk_errno != 0) {
+		lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+		lstcon_rpc_trans_destroy(trans);
+		/* return if any error */
+		CDEBUG(D_NET, "Failed to add test %s, "
+			      "RPC error %d, framework error %d\n",
+		       transop == LST_TRANS_TSBCLIADD ? "client" : "server",
+		       lstcon_trans_stat()->trs_rpc_errno,
+		       lstcon_trans_stat()->trs_fwk_errno);
+		/* rc is still 0: the failure is only visible via lstcon_trans_stat() */
+		return rc;
+	}
+
+	lstcon_rpc_trans_destroy(trans);
+	/* the client pass is the last one */
+	if (transop == LST_TRANS_TSBCLIADD)
+		return rc;
+
+	transop = LST_TRANS_TSBCLIADD;
+	grp = test->tes_src_grp;
+	test->tes_cliidx = 0;
+
+	/* requests to test clients */
+	goto again;
+}
+
+/* Create a test of "type" between groups src_name and dst_name, announce it to
+ * their nodes and append it to idle batch "name".  *retp is set to 1 when the
+ * destination group has grp_userland set.  Returns 0 or a negative errno. */
+int
+lstcon_test_add(char *name, int type, int loop, int concur,
+		int dist, int span, char *src_name, char * dst_name,
+		void *param, int paramlen, int *retp,
+		struct list_head *result_up)
+{
+	lstcon_group_t  *src_grp = NULL;
+	lstcon_group_t  *dst_grp = NULL;
+	lstcon_test_t   *test    = NULL;
+	lstcon_batch_t  *batch;
+	int	      rc;
+
+	rc = lstcon_batch_find(name, &batch);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find batch %s\n", name);
+		return rc;
+	}
+
+	if (batch->bat_state != LST_BATCH_IDLE) {
+		CDEBUG(D_NET, "Can't change running batch %s\n", name);
+		/* rc is 0 here; report the refusal instead of success */
+		return -EINVAL;
+	}
+
+	rc = lstcon_group_find(src_name, &src_grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group %s\n", src_name);
+		goto out;
+	}
+
+	rc = lstcon_group_find(dst_name, &dst_grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group %s\n", dst_name);
+		goto out;
+	}
+
+	if (dst_grp->grp_userland)
+		*retp = 1;
+
+	LIBCFS_ALLOC(test, offsetof(lstcon_test_t, tes_param[paramlen]));
+	if (!test) {
+		CERROR("Can't allocate test descriptor\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	memset(test, 0, offsetof(lstcon_test_t, tes_param[paramlen]));
+	test->tes_hdr.tsb_id    = batch->bat_hdr.tsb_id;
+	test->tes_batch	 = batch;
+	test->tes_type	  = type;
+	test->tes_oneside       = 0; /* TODO */
+	test->tes_loop	  = loop;
+	test->tes_concur	= concur;
+	test->tes_stop_onerr    = 1; /* TODO */
+	test->tes_span	  = span;
+	test->tes_dist	  = dist;
+	test->tes_cliidx	= 0; /* just used for creating RPC */
+	test->tes_src_grp       = src_grp;
+	test->tes_dst_grp       = dst_grp;
+	INIT_LIST_HEAD(&test->tes_trans_list);
+
+	if (param != NULL) {
+		test->tes_paramlen = paramlen;
+		memcpy(&test->tes_param[0], param, paramlen);
+	}
+
+	rc = lstcon_test_nodes_add(test, result_up);
+	if (rc != 0)
+		goto out;
+
+	if (lstcon_trans_stat()->trs_rpc_errno != 0 ||
+	    lstcon_trans_stat()->trs_fwk_errno != 0)
+		CDEBUG(D_NET, "Failed to add test %d to batch %s\n", type, name);
+
+	/* add to test list anyway, so user can check what's going on */
+	list_add_tail(&test->tes_link, &batch->bat_test_list);
+
+	batch->bat_ntest ++;
+	test->tes_hdr.tsb_index = batch->bat_ntest;
+
+	/* keep the references on both groups so nobody can change them */
+	return rc;
+out:
+	if (test != NULL)
+		LIBCFS_FREE(test, offsetof(lstcon_test_t, tes_param[paramlen]));
+	if (dst_grp != NULL)
+		lstcon_group_put(dst_grp);
+	if (src_grp != NULL)
+		lstcon_group_put(src_grp);
+
+	return rc;
+}
+
+int
+lstcon_test_find(lstcon_batch_t *batch, int idx, lstcon_test_t **testpp)
+{
+	lstcon_test_t *test;
+
+	list_for_each_entry(test, &batch->bat_test_list, tes_link) {
+		if (idx == test->tes_hdr.tsb_index) {
+			*testpp = test;
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+int
+lstcon_tsbrpc_readent(int transop, srpc_msg_t *msg,
+		      lstcon_rpc_ent_t *ent_up)
+{
+	srpc_batch_reply_t *rep = &msg->msg_body.bat_reply;
+
+	LASSERT (transop == LST_TRANS_TSBCLIQRY ||
+		 transop == LST_TRANS_TSBSRVQRY);
+
+	/* positive errno, framework error code */
+	if (copy_to_user(&ent_up->rpe_priv[0],
+			     &rep->bar_active, sizeof(rep->bar_active)))
+		return -EFAULT;
+
+	return 0;
+}
+
+int
+lstcon_test_batch_query(char *name, int testidx, int client,
+			int timeout, struct list_head *result_up)
+{
+	lstcon_rpc_trans_t *trans;
+	struct list_head	 *translist;
+	struct list_head	 *ndlist;
+	lstcon_tsb_hdr_t   *hdr;
+	lstcon_batch_t     *batch;
+	lstcon_test_t      *test = NULL;
+	int		 transop;
+	int		 rc;
+
+	rc = lstcon_batch_find(name, &batch);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find batch: %s\n", name);
+		return rc;
+	}
+
+	if (testidx == 0) {
+		translist = &batch->bat_trans_list;
+		ndlist    = &batch->bat_cli_list;
+		hdr       = &batch->bat_hdr;
+
+	} else {
+		/* query specified test only */
+		rc = lstcon_test_find(batch, testidx, &test);
+		if (rc != 0) {
+			CDEBUG(D_NET, "Can't find test: %d\n", testidx);
+			return rc;
+		}
+
+		translist = &test->tes_trans_list;
+		ndlist    = &test->tes_src_grp->grp_ndl_list;
+		hdr       = &test->tes_hdr;
+	}
+
+	transop = client ? LST_TRANS_TSBCLIQRY : LST_TRANS_TSBSRVQRY;
+
+	rc = lstcon_rpc_trans_ndlist(ndlist, translist, transop, hdr,
+				     lstcon_batrpc_condition, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		return rc;
+	}
+
+	lstcon_rpc_trans_postwait(trans, timeout);
+
+	if (testidx == 0 && /* query a batch, not a test */
+	    lstcon_rpc_stat_failure(lstcon_trans_stat(), 0) == 0 &&
+	    lstcon_tsbqry_stat_run(lstcon_trans_stat(), 0) == 0) {
+		/* all RPCs finished, and no active test */
+		batch->bat_state = LST_BATCH_IDLE;
+	}
+
+	rc = lstcon_rpc_trans_interpreter(trans, result_up,
+					  lstcon_tsbrpc_readent);
+	lstcon_rpc_trans_destroy(trans);
+
+	return rc;
+}
+
+int
+lstcon_statrpc_readent(int transop, srpc_msg_t *msg,
+		       lstcon_rpc_ent_t *ent_up)
+{
+	srpc_stat_reply_t *rep = &msg->msg_body.stat_reply;
+	sfw_counters_t    *sfwk_stat;
+	srpc_counters_t   *srpc_stat;
+	lnet_counters_t   *lnet_stat;
+
+	if (rep->str_status != 0)
+		return 0;
+
+	sfwk_stat = (sfw_counters_t *)&ent_up->rpe_payload[0];
+	srpc_stat = (srpc_counters_t *)((char *)sfwk_stat + sizeof(*sfwk_stat));
+	lnet_stat = (lnet_counters_t *)((char *)srpc_stat + sizeof(*srpc_stat));
+
+	if (copy_to_user(sfwk_stat, &rep->str_fw, sizeof(*sfwk_stat)) ||
+	    copy_to_user(srpc_stat, &rep->str_rpc, sizeof(*srpc_stat)) ||
+	    copy_to_user(lnet_stat, &rep->str_lnet, sizeof(*lnet_stat)))
+		return -EFAULT;
+
+	return 0;
+}
+
+int
+lstcon_ndlist_stat(struct list_head *ndlist,
+		   int timeout, struct list_head *result_up)
+{
+	struct list_head	  head;
+	lstcon_rpc_trans_t *trans;
+	int		 rc;
+
+	INIT_LIST_HEAD(&head);
+
+	rc = lstcon_rpc_trans_ndlist(ndlist, &head,
+				     LST_TRANS_STATQRY, NULL, NULL, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		return rc;
+	}
+
+	lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout));
+
+	rc = lstcon_rpc_trans_interpreter(trans, result_up,
+					  lstcon_statrpc_readent);
+	lstcon_rpc_trans_destroy(trans);
+
+	return rc;
+}
+
+int
+lstcon_group_stat(char *grp_name, int timeout, struct list_head *result_up)
+{
+	lstcon_group_t     *grp;
+	int		 rc;
+
+	rc = lstcon_group_find(grp_name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group %s\n", grp_name);
+		return rc;
+	}
+
+	rc = lstcon_ndlist_stat(&grp->grp_ndl_list, timeout, result_up);
+
+	lstcon_group_put(grp);
+
+	return rc;
+}
+
+int
+lstcon_nodes_stat(int count, lnet_process_id_t *ids_up,
+		  int timeout, struct list_head *result_up)
+{
+	lstcon_ndlink_t	 *ndl;
+	lstcon_group_t	  *tmp;
+	lnet_process_id_t	id;
+	int		      i;
+	int		      rc;
+
+	rc = lstcon_group_alloc(NULL, &tmp);
+	if (rc != 0) {
+		CERROR("Out of memory\n");
+		return -ENOMEM;
+	}
+
+	for (i = 0 ; i < count; i++) {
+		if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+			rc = -EFAULT;
+			break;
+		}
+
+		/* add to tmp group */
+		rc = lstcon_group_ndlink_find(tmp, id, &ndl, 2);
+		if (rc != 0) {
+			CDEBUG((rc == -ENOMEM) ? D_ERROR : D_NET,
+			       "Failed to find or create %s: %d\n",
+			       libcfs_id2str(id), rc);
+			break;
+		}
+	}
+
+	if (rc != 0) {
+		lstcon_group_put(tmp);
+		return rc;
+	}
+
+	rc = lstcon_ndlist_stat(&tmp->grp_ndl_list, timeout, result_up);
+
+	lstcon_group_put(tmp);
+
+	return rc;
+}
+
+int
+lstcon_debug_ndlist(struct list_head *ndlist,
+		    struct list_head *translist,
+		    int timeout, struct list_head *result_up)
+{
+	lstcon_rpc_trans_t *trans;
+	int		 rc;
+
+	rc = lstcon_rpc_trans_ndlist(ndlist, translist, LST_TRANS_SESQRY,
+				     NULL, lstcon_sesrpc_condition, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		return rc;
+	}
+
+	lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout));
+
+	rc = lstcon_rpc_trans_interpreter(trans, result_up,
+					  lstcon_sesrpc_readent);
+	lstcon_rpc_trans_destroy(trans);
+
+	return rc;
+}
+
+int
+lstcon_session_debug(int timeout, struct list_head *result_up)
+{
+	return lstcon_debug_ndlist(&console_session.ses_ndl_list,
+				   NULL, timeout, result_up);
+}
+
+int
+lstcon_batch_debug(int timeout, char *name,
+		   int client, struct list_head *result_up)
+{
+	lstcon_batch_t *bat;
+	int	     rc;
+
+	rc = lstcon_batch_find(name, &bat);
+	if (rc != 0)
+		return -ENOENT;
+
+	rc = lstcon_debug_ndlist(client ? &bat->bat_cli_list :
+					  &bat->bat_srv_list,
+				 NULL, timeout, result_up);
+
+	return rc;
+}
+
+int
+lstcon_group_debug(int timeout, char *name,
+		   struct list_head *result_up)
+{
+	lstcon_group_t *grp;
+	int	     rc;
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0)
+		return -ENOENT;
+
+	rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL,
+				 timeout, result_up);
+	lstcon_group_put(grp);
+
+	return rc;
+}
+
+int
+lstcon_nodes_debug(int timeout,
+		   int count, lnet_process_id_t *ids_up,
+		   struct list_head *result_up)
+{
+	lnet_process_id_t  id;
+	lstcon_ndlink_t   *ndl;
+	lstcon_group_t    *grp;
+	int		i;
+	int		rc;
+
+	rc = lstcon_group_alloc(NULL, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Out of memory\n");
+		return rc;
+	}
+
+	for (i = 0; i < count; i++) {
+		if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+			rc = -EFAULT;
+			break;
+		}
+
+		/* node is added to tmp group */
+		rc = lstcon_group_ndlink_find(grp, id, &ndl, 1);
+		if (rc != 0) {
+			CERROR("Can't create node link\n");
+			break;
+		}
+	}
+
+	if (rc != 0) {
+		lstcon_group_put(grp);
+		return rc;
+	}
+
+	rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL,
+				 timeout, result_up);
+
+	lstcon_group_put(grp);
+
+	return rc;
+}
+
+int
+lstcon_session_match(lst_sid_t sid)
+{
+	return (console_session.ses_id.ses_nid   == sid.ses_nid &&
+		console_session.ses_id.ses_stamp == sid.ses_stamp) ?  1: 0;
+}
+
+static void
+lstcon_new_session_id(lst_sid_t *sid)
+{
+	lnet_process_id_t      id;
+
+	LASSERT (console_session.ses_state == LST_SESSION_NONE);
+
+	LNetGetId(1, &id);
+	sid->ses_nid   = id.nid;
+	sid->ses_stamp = cfs_time_current();
+}
+
+extern srpc_service_t lstcon_acceptor_service;
+
+int
+lstcon_session_new(char *name, int key, unsigned feats,
+		   int timeout, int force, lst_sid_t *sid_up)
+{
+	int     rc = 0;
+	int     i;
+
+	if (console_session.ses_state != LST_SESSION_NONE) {
+		/* session exists */
+		if (!force) {
+			CNETERR("Session %s already exists\n",
+				console_session.ses_name);
+			return -EEXIST;
+		}
+
+		rc = lstcon_session_end();
+
+		/* lstcon_session_end() only return local error */
+		if  (rc != 0)
+			return rc;
+	}
+
+	if ((feats & ~LST_FEATS_MASK) != 0) {
+		CNETERR("Unknown session features %x\n",
+			(feats & ~LST_FEATS_MASK));
+		return -EINVAL;
+	}
+
+	for (i = 0; i < LST_GLOBAL_HASHSIZE; i++)
+		LASSERT(list_empty(&console_session.ses_ndl_hash[i]));
+
+	lstcon_new_session_id(&console_session.ses_id);
+
+	console_session.ses_key	    = key;
+	console_session.ses_state   = LST_SESSION_ACTIVE;
+	console_session.ses_force   = !!force;
+	console_session.ses_features = feats;
+	console_session.ses_feats_updated = 0;
+	console_session.ses_timeout = (timeout <= 0) ?
+				      LST_CONSOLE_TIMEOUT : timeout;
+	strcpy(console_session.ses_name, name);
+	/* NOTE(review): the two error returns below leave ses_state ACTIVE - confirm */
+	rc = lstcon_batch_add(LST_DEFAULT_BATCH);
+	if (rc != 0)
+		return rc;
+
+	rc = lstcon_rpc_pinger_start();
+	if (rc != 0) {
+		lstcon_batch_t *bat = NULL;
+
+		lstcon_batch_find(LST_DEFAULT_BATCH, &bat);
+		lstcon_batch_destroy(bat);
+
+		return rc;
+	}
+
+	if (copy_to_user(sid_up, &console_session.ses_id,
+			     sizeof(lst_sid_t)) == 0)
+		return rc;
+	/* the session id could not be delivered to userspace: end the session */
+	lstcon_session_end();
+
+	return -EFAULT;
+}
+
+int
+lstcon_session_info(lst_sid_t *sid_up, int *key_up, unsigned *featp,
+		    lstcon_ndlist_ent_t *ndinfo_up, char *name_up, int len)
+{
+	lstcon_ndlist_ent_t *entp;
+	lstcon_ndlink_t     *ndl;
+	int		  rc = 0;
+
+	if (console_session.ses_state != LST_SESSION_ACTIVE)
+		return -ESRCH;
+
+	LIBCFS_ALLOC(entp, sizeof(*entp));
+	if (entp == NULL)
+		return -ENOMEM;
+
+	memset(entp, 0, sizeof(*entp));
+
+	list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link)
+		LST_NODE_STATE_COUNTER(ndl->ndl_node, entp);
+
+	if (copy_to_user(sid_up, &console_session.ses_id,
+			     sizeof(lst_sid_t)) ||
+	    copy_to_user(key_up, &console_session.ses_key,
+			     sizeof(*key_up)) ||
+	    copy_to_user(featp, &console_session.ses_features,
+			     sizeof(*featp)) ||
+	    copy_to_user(ndinfo_up, entp, sizeof(*entp)) ||
+	    copy_to_user(name_up, console_session.ses_name, len))
+		rc = -EFAULT;
+
+	LIBCFS_FREE(entp, sizeof(*entp));
+
+	return rc;
+}
+
+/* Tear down the active console session: run an LST_TRANS_SESEND transaction on
+ * the session's node list, stop the pinger, destroy all batches and groups. */
+int
+lstcon_session_end(void)
+{
+	lstcon_rpc_trans_t *trans;
+	lstcon_group_t     *grp;
+	lstcon_batch_t     *bat;
+	int		 rc;
+
+	LASSERT (console_session.ses_state == LST_SESSION_ACTIVE);
+
+	rc = lstcon_rpc_trans_ndlist(&console_session.ses_ndl_list,
+				     NULL, LST_TRANS_SESEND, NULL,
+				     lstcon_sesrpc_condition, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		return rc;
+	}
+
+	console_session.ses_shutdown = 1;
+	lstcon_rpc_pinger_stop();
+	lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+	lstcon_rpc_trans_destroy(trans);
+	/* User can do nothing even rpc failed, so go on */
+
+	/* waiting for orphan rpcs to die */
+	lstcon_rpc_cleanup_wait();
+
+	console_session.ses_id    = LST_INVALID_SID;
+	console_session.ses_state = LST_SESSION_NONE;
+	console_session.ses_key   = 0;
+	console_session.ses_force = 0;
+	console_session.ses_feats_updated = 0;
+
+	/* destroy all batches */
+	while (!list_empty(&console_session.ses_bat_list)) {
+		bat = list_entry(console_session.ses_bat_list.next,
+				     lstcon_batch_t, bat_link);
+
+		lstcon_batch_destroy(bat);
+	}
+
+	/* destroy all groups */
+	while (!list_empty(&console_session.ses_grp_list)) {
+		grp = list_entry(console_session.ses_grp_list.next,
+				     lstcon_group_t, grp_link);
+		LASSERT (grp->grp_ref == 1);
+
+		lstcon_group_put(grp);
+	}
+
+	/* all nodes should be released */
+	LASSERT (list_empty(&console_session.ses_ndl_list));
+
+	console_session.ses_shutdown = 0;
+	console_session.ses_expired  = 0;
+
+	return rc;
+}
+
+int
+lstcon_session_feats_check(unsigned feats)
+{
+	int rc = 0;
+
+	if ((feats & ~LST_FEATS_MASK) != 0) {
+		CERROR("Can't support these features: %x\n",
+		       (feats & ~LST_FEATS_MASK));
+		return -EPROTO;
+	}
+
+	spin_lock(&console_session.ses_rpc_lock);
+
+	if (!console_session.ses_feats_updated) {
+		console_session.ses_feats_updated = 1;
+		console_session.ses_features = feats;
+	}
+
+	if (console_session.ses_features != feats)
+		rc = -EPROTO;
+
+	spin_unlock(&console_session.ses_rpc_lock);
+
+	if (rc != 0) {
+		CERROR("remote features %x do not match with "
+		       "session features %x of console\n",
+		       feats, console_session.ses_features);
+	}
+
+	return rc;
+}
+
+/**
+ * Handler of the console acceptor ("join session") service: a remote node
+ * asks to be added to a group of the current console session.
+ *
+ * Protocol-level outcomes are reported to the peer through
+ * jrep->join_status as positive errno values (ESRCH, EPROTO, EBUSY,
+ * EEXIST, or 0 on success).  The return value is non-zero only when
+ * lstcon_group_alloc() or the creating lstcon_group_ndlink_find() call
+ * failed; join_status is not written on those two paths.
+ *
+ * Runs with console_session.ses_mutex held from after the request has
+ * been unpacked until just before returning.
+ *
+ * \param rpc  incoming server RPC carrying the join request
+ */
+static int
+lstcon_acceptor_handle (srpc_server_rpc_t *rpc)
+{
+	srpc_msg_t	*rep  = &rpc->srpc_replymsg;
+	srpc_msg_t	*req  = &rpc->srpc_reqstbuf->buf_msg;
+	srpc_join_reqst_t *jreq = &req->msg_body.join_reqst;
+	srpc_join_reply_t *jrep = &rep->msg_body.join_reply;
+	lstcon_group_t    *grp  = NULL;
+	lstcon_ndlink_t   *ndl;
+	int		rc   = 0;
+
+	sfw_unpack_message(req);
+
+	mutex_lock(&console_session.ses_mutex);
+
+	jrep->join_sid = console_session.ses_id;
+
+	/* no session on this console */
+	if (console_session.ses_id.ses_nid == LNET_NID_ANY) {
+		jrep->join_status = ESRCH;
+		goto out;
+	}
+
+	if (lstcon_session_feats_check(req->msg_ses_feats) != 0) {
+		jrep->join_status = EPROTO;
+		goto out;
+	}
+
+	/* the peer named a specific session and it is not ours */
+	if (jreq->join_sid.ses_nid != LNET_NID_ANY &&
+	     !lstcon_session_match(jreq->join_sid)) {
+		jrep->join_status = EBUSY;
+		goto out;
+	}
+
+	/* create the group on demand and link it into the session */
+	if (lstcon_group_find(jreq->join_group, &grp) != 0) {
+		rc = lstcon_group_alloc(jreq->join_group, &grp);
+		if (rc != 0) {
+			CERROR("Out of memory\n");
+			goto out;
+		}
+
+		list_add_tail(&grp->grp_link,
+				  &console_session.ses_grp_list);
+		lstcon_group_addref(grp);
+	}
+
+	if (grp->grp_ref > 2) {
+		/* Group is in use */
+		jrep->join_status = EBUSY;
+		goto out;
+	}
+
+	/* lookup only (last argument 0): the peer must not be a member yet */
+	rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 0);
+	if (rc == 0) {
+		jrep->join_status = EEXIST;
+		goto out;
+	}
+
+	/* same lookup with creation enabled (last argument 1) */
+	rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 1);
+	if (rc != 0) {
+		CERROR("Out of memory\n");
+		goto out;
+	}
+
+	ndl->ndl_node->nd_state   = LST_NODE_ACTIVE;
+	ndl->ndl_node->nd_timeout = console_session.ses_timeout;
+
+	if (grp->grp_userland == 0)
+		grp->grp_userland = 1;
+
+	/* bounded copy: never writes past the reply's name buffer and
+	 * always leaves it NUL-terminated */
+	strlcpy(jrep->join_session, console_session.ses_name,
+		sizeof(jrep->join_session));
+	jrep->join_timeout = console_session.ses_timeout;
+	jrep->join_status  = 0;
+
+out:
+	rep->msg_ses_feats = console_session.ses_features;
+	if (grp != NULL)
+		lstcon_group_put(grp);
+
+	mutex_unlock(&console_session.ses_mutex);
+
+	return rc;
+}
+
+srpc_service_t lstcon_acceptor_service;
+
+/* Fill in the descriptor of the console acceptor ("join session") service */
+void lstcon_init_acceptor_service(void)
+{
+	srpc_service_t *svc = &lstcon_acceptor_service;
+
+	svc->sv_id       = SRPC_SERVICE_JOIN;
+	svc->sv_name     = "join session";
+	svc->sv_handler  = lstcon_acceptor_handle;
+	svc->sv_wi_total = SFW_FRWK_WI_MAX;
+}
+
+/* ioctl entry point that the handler declared below is bound to */
+extern int lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_data *data);
+
+/* handler object given to libcfs_register_ioctl()/libcfs_deregister_ioctl() */
+DECLARE_IOCTL_HANDLER(lstcon_ioctl_handler, lstcon_ioctl_entry);
+
+/*
+ * initialize console
+ *
+ * Resets console_session, allocates the global node hash table, registers
+ * the acceptor service with its request buffers and finally registers the
+ * console ioctl handler.  lstcon_rpc_module_init() is only called once all
+ * of that has succeeded.
+ *
+ * Returns 0 on success.  On failure returns -ENOMEM (hash table or service
+ * buffers could not be allocated) or the error code of srpc_add_service()
+ * / libcfs_register_ioctl(); everything set up so far is undone.
+ */
+int
+lstcon_console_init(void)
+{
+	int     i;
+	int     rc;
+
+	memset(&console_session, 0, sizeof(lstcon_session_t));
+
+	console_session.ses_id		    = LST_INVALID_SID;
+	console_session.ses_state	    = LST_SESSION_NONE;
+	console_session.ses_timeout	    = 0;
+	console_session.ses_force	    = 0;
+	console_session.ses_expired	    = 0;
+	console_session.ses_feats_updated   = 0;
+	console_session.ses_features	    = LST_FEATS_MASK;
+	console_session.ses_laststamp	    = cfs_time_current_sec();
+
+	mutex_init(&console_session.ses_mutex);
+
+	INIT_LIST_HEAD(&console_session.ses_ndl_list);
+	INIT_LIST_HEAD(&console_session.ses_grp_list);
+	INIT_LIST_HEAD(&console_session.ses_bat_list);
+	INIT_LIST_HEAD(&console_session.ses_trans_list);
+
+	/* global node hash table: LST_GLOBAL_HASHSIZE list heads */
+	LIBCFS_ALLOC(console_session.ses_ndl_hash,
+		     sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+	if (console_session.ses_ndl_hash == NULL)
+		return -ENOMEM;
+
+	for (i = 0; i < LST_GLOBAL_HASHSIZE; i++)
+		INIT_LIST_HEAD(&console_session.ses_ndl_hash[i]);
+
+
+	/* initialize acceptor service table */
+	lstcon_init_acceptor_service();
+
+	rc = srpc_add_service(&lstcon_acceptor_service);
+	LASSERT (rc != -EBUSY);
+	if (rc != 0) {
+		/* service was never added: only the hash table to undo */
+		LIBCFS_FREE(console_session.ses_ndl_hash,
+			    sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+		return rc;
+	}
+
+	rc = srpc_service_add_buffers(&lstcon_acceptor_service,
+				      lstcon_acceptor_service.sv_wi_total);
+	if (rc != 0) {
+		/* any buffer failure is reported to the caller as -ENOMEM */
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	rc = libcfs_register_ioctl(&lstcon_ioctl_handler);
+
+	if (rc == 0) {
+		lstcon_rpc_module_init();
+		return 0;
+	}
+
+	/* ioctl registration failed: fall through to the common teardown */
+out:
+	srpc_shutdown_service(&lstcon_acceptor_service);
+	srpc_remove_service(&lstcon_acceptor_service);
+
+	LIBCFS_FREE(console_session.ses_ndl_hash,
+		    sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+
+	srpc_wait_service_shutdown(&lstcon_acceptor_service);
+
+	return rc;
+}
+
+/*
+ * finalize console
+ *
+ * Deregisters the ioctl handler, shuts down and removes the acceptor
+ * service, ends the session if one is still active, finalizes the console
+ * RPC module, and then checks that nothing is left on any session list or
+ * hash bucket before the global node hash table is freed.
+ *
+ * Always returns 0.
+ */
+int
+lstcon_console_fini(void)
+{
+	int     i;
+
+	libcfs_deregister_ioctl(&lstcon_ioctl_handler);
+
+	mutex_lock(&console_session.ses_mutex);
+
+	srpc_shutdown_service(&lstcon_acceptor_service);
+	srpc_remove_service(&lstcon_acceptor_service);
+
+	if (console_session.ses_state != LST_SESSION_NONE)
+		lstcon_session_end();
+
+	lstcon_rpc_module_fini();
+
+	mutex_unlock(&console_session.ses_mutex);
+
+	LASSERT (list_empty(&console_session.ses_ndl_list));
+	LASSERT (list_empty(&console_session.ses_grp_list));
+	LASSERT (list_empty(&console_session.ses_bat_list));
+	LASSERT (list_empty(&console_session.ses_trans_list));
+
+	/* ses_ndl_hash is allocated, initialised and freed with
+	 * LST_GLOBAL_HASHSIZE buckets, so every one of them is checked */
+	for (i = 0; i < LST_GLOBAL_HASHSIZE; i++) {
+		LASSERT (list_empty(&console_session.ses_ndl_hash[i]));
+	}
+
+	LIBCFS_FREE(console_session.ses_ndl_hash,
+		    sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+
+	srpc_wait_service_shutdown(&lstcon_acceptor_service);
+
+	return 0;
+}
diff --git a/drivers/staging/lustre/lnet/selftest/console.h b/drivers/staging/lustre/lnet/selftest/console.h
new file mode 100644
index 000000000000..e61b26687dbb
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/console.h
@@ -0,0 +1,232 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/console.h
+ *
+ * kernel structure for LST console
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#ifndef __LST_CONSOLE_H__
+#define __LST_CONSOLE_H__
+
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-types.h>
+#include <linux/lnet/lnetst.h>
+#include "selftest.h"
+#include "conrpc.h"
+
+/* Reference-counted descriptor of a single test node known to the console */
+typedef struct lstcon_node {
+	lnet_process_id_t    nd_id;	  /* id of the node */
+	int		  nd_ref;	 /* reference count */
+	int		  nd_state;       /* state of the node */
+	int		  nd_timeout;     /* session timeout */
+	cfs_time_t	   nd_stamp;       /* timestamp of last replied RPC */
+	struct lstcon_rpc    nd_ping;	/* ping rpc (embedded, not a pointer) */
+} lstcon_node_t;				/*** node descriptor */
+
+/* Link object that places one lstcon_node_t on a list and in a hash table */
+typedef struct {
+	struct list_head	   ndl_link;       /* chain on list */
+	struct list_head	   ndl_hlink;      /* chain on hash */
+	lstcon_node_t       *ndl_node;       /* pointer to node */
+} lstcon_ndlink_t;			      /*** node link descriptor */
+
+/* Named, reference-counted group of nodes of the console session */
+typedef struct {
+	struct list_head	   grp_link;       /* chain on global group list */
+	int		  grp_ref;	/* reference count */
+	int		  grp_userland;   /* has userland nodes */
+	int		  grp_nnode;      /* # of nodes */
+	char		 grp_name[LST_NAME_SIZE]; /* group name */
+
+	struct list_head	   grp_trans_list; /* transaction list */
+	struct list_head	   grp_ndl_list;   /* nodes list */
+	/* trailing variable-size array; must remain the last member */
+	struct list_head	   grp_ndl_hash[0];/* hash table for nodes */
+} lstcon_group_t;		    /*** (alias of nodes) group descriptor */
+
+/* batch states -- presumably the values kept in lstcon_batch_t::bat_state */
+#define LST_BATCH_IDLE	  0xB0	    /* idle batch */
+#define LST_BATCH_RUNNING       0xB1	    /* running batch */
+
+/* Common header, embedded first in both lstcon_batch_t and lstcon_test_t */
+typedef struct lstcon_tsb_hdr {
+	lst_bid_t	       tsb_id;	 /* batch ID */
+	int		     tsb_index;      /* test index */
+} lstcon_tsb_hdr_t;
+
+/* Named batch of tests together with its client and server node sets */
+typedef struct {
+	lstcon_tsb_hdr_t	bat_hdr;	/* test_batch header */
+	struct list_head	      bat_link;       /* chain on session's batches list */
+	int		     bat_ntest;      /* # of test */
+	int		     bat_state;      /* state of the batch */
+	int		     bat_arg;	/* parameter for run|stop, timeout for run, force for stop */
+	char		    bat_name[LST_NAME_SIZE]; /* name of batch */
+
+	struct list_head	      bat_test_list;  /* list head of tests (lstcon_test_t) */
+	struct list_head	      bat_trans_list; /* list head of transaction */
+	struct list_head	      bat_cli_list;   /* list head of client nodes (lstcon_node_t) */
+	struct list_head	     *bat_cli_hash;   /* hash table of client nodes */
+	struct list_head	      bat_srv_list;   /* list head of server nodes */
+	struct list_head	     *bat_srv_hash;   /* hash table of server nodes */
+} lstcon_batch_t;			     /*** (tests ) batch descriptor */
+
+/* One test of a batch: its settings, source/target groups and parameters */
+typedef struct lstcon_test {
+	lstcon_tsb_hdr_t      tes_hdr;	/* test batch header */
+	struct list_head	    tes_link;       /* chain on batch's tests list */
+	lstcon_batch_t       *tes_batch;      /* pointer to batch */
+
+	int		   tes_type;       /* type of the test, i.e: bulk, ping */
+	int		   tes_stop_onerr; /* stop on error */
+	int		   tes_oneside;    /* one-sided test */
+	int		   tes_concur;     /* concurrency */
+	int		   tes_loop;       /* loop count */
+	int		   tes_dist;       /* nodes distribution of target group */
+	int		   tes_span;       /* nodes span of target group */
+	int		   tes_cliidx;     /* client index, used for RPC creating */
+
+	struct list_head  tes_trans_list; /* transaction list */
+	lstcon_group_t       *tes_src_grp;    /* group run the test */
+	lstcon_group_t       *tes_dst_grp;    /* target group */
+
+	int		   tes_paramlen;   /* test parameter length */
+	/* trailing variable-size array; must remain the last member */
+	char		  tes_param[0];   /* test parameter */
+} lstcon_test_t;				/*** a single test descriptor */
+
+/* LST_GLOBAL_HASHSIZE sizes console_session.ses_ndl_hash;
+ * LST_NODE_HASHSIZE is the modulus used by lstcon_id2hash() */
+#define LST_GLOBAL_HASHSIZE     503	     /* global nodes hash table size */
+#define LST_NODE_HASHSIZE       239	     /* node hash table (for batch or group) */
+
+/* values of lstcon_session_t::ses_state */
+#define LST_SESSION_NONE	0x0	     /* no session */
+#define LST_SESSION_ACTIVE      0x1	     /* working session */
+
+#define LST_CONSOLE_TIMEOUT     300	     /* default console timeout */
+
+/* State of the console session; the single instance is console_session */
+typedef struct {
+	struct mutex		ses_mutex;      /* only 1 thread in session */
+	lst_sid_t	       ses_id;	 /* global session id */
+	int		     ses_key;	/* local session key */
+	int		     ses_state;      /* state of session */
+	int		     ses_timeout;    /* timeout in seconds */
+	time_t		  ses_laststamp;  /* last operation stamp (seconds) */
+	/** tests features of the session */
+	unsigned		ses_features;
+	/** features are synced with remote test nodes */
+	unsigned		ses_feats_updated:1;
+	/** force creating */
+	unsigned		ses_force:1;
+	/** session is shutting down */
+	unsigned		ses_shutdown:1;
+	/** console is timedout */
+	unsigned		ses_expired:1;
+	__u64		   ses_id_cookie;  /* batch id cookie */
+	char		    ses_name[LST_NAME_SIZE];  /* session name */
+	lstcon_rpc_trans_t     *ses_ping;       /* session pinger */
+	stt_timer_t	     ses_ping_timer; /* timer for pinger */
+	lstcon_trans_stat_t     ses_trans_stat; /* transaction stats */
+
+	struct list_head	      ses_trans_list; /* global list of transaction */
+	struct list_head	      ses_grp_list;   /* global list of groups */
+	struct list_head	      ses_bat_list;   /* global list of batches */
+	struct list_head	      ses_ndl_list;   /* global list of nodes */
+	struct list_head	     *ses_ndl_hash;   /* hash table of nodes */
+
+	/* also taken by lstcon_session_feats_check() around ses_features */
+	spinlock_t	  ses_rpc_lock;   /* serialize */
+	atomic_t	    ses_rpc_counter;/* # of initialized RPCs */
+	struct list_head	      ses_rpc_freelist; /* idle console rpc */
+} lstcon_session_t;			     /*** session descriptor */
+
+extern lstcon_session_t	 console_session;
+
+/* Accessor for the transaction statistics of the console session */
+static inline lstcon_trans_stat_t *
+lstcon_trans_stat(void)
+{
+	lstcon_session_t *ses = &console_session;
+
+	return &ses->ses_trans_stat;
+}
+
+/*
+ * Pick the bucket of \a hash that process id \a id falls into: the address
+ * part of the NID modulo LST_NODE_HASHSIZE.
+ */
+static inline struct list_head *
+lstcon_id2hash (lnet_process_id_t id, struct list_head *hash)
+{
+	unsigned int bucket = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE;
+
+	return hash + bucket;
+}
+
+extern int lstcon_session_match(lst_sid_t sid);
+extern int lstcon_session_new(char *name, int key, unsigned version,
+			      int timeout, int flags, lst_sid_t *sid_up);
+extern int lstcon_session_info(lst_sid_t *sid_up, int *key, unsigned *verp,
+			       lstcon_ndlist_ent_t *entp, char *name_up, int len);
+extern int lstcon_session_end(void);
+extern int lstcon_session_debug(int timeout, struct list_head *result_up);
+extern int lstcon_session_feats_check(unsigned feats);
+extern int lstcon_batch_debug(int timeout, char *name,
+			      int client, struct list_head *result_up);
+extern int lstcon_group_debug(int timeout, char *name,
+			      struct list_head *result_up);
+extern int lstcon_nodes_debug(int timeout, int nnd, lnet_process_id_t *nds_up,
+			      struct list_head *result_up);
+extern int lstcon_group_add(char *name);
+extern int lstcon_group_del(char *name);
+extern int lstcon_group_clean(char *name, int args);
+extern int lstcon_group_refresh(char *name, struct list_head *result_up);
+extern int lstcon_nodes_add(char *name, int nnd, lnet_process_id_t *nds_up,
+			    unsigned *featp, struct list_head *result_up);
+extern int lstcon_nodes_remove(char *name, int nnd, lnet_process_id_t *nds_up,
+			       struct list_head *result_up);
+extern int lstcon_group_info(char *name, lstcon_ndlist_ent_t *gent_up,
+			     int *index_p, int *ndent_p, lstcon_node_ent_t *ndents_up);
+extern int lstcon_group_list(int idx, int len, char *name_up);
+extern int lstcon_batch_add(char *name);
+extern int lstcon_batch_run(char *name, int timeout,
+			    struct list_head *result_up);
+extern int lstcon_batch_stop(char *name, int force,
+			     struct list_head *result_up);
+extern int lstcon_test_batch_query(char *name, int testidx,
+				   int client, int timeout,
+				   struct list_head *result_up);
+extern int lstcon_batch_del(char *name);
+extern int lstcon_batch_list(int idx, int namelen, char *name_up);
+extern int lstcon_batch_info(char *name, lstcon_test_batch_ent_t *ent_up,
+			     int server, int testidx, int *index_p,
+			     int *ndent_p, lstcon_node_ent_t *dents_up);
+extern int lstcon_group_stat(char *grp_name, int timeout,
+			     struct list_head *result_up);
+extern int lstcon_nodes_stat(int count, lnet_process_id_t *ids_up,
+			     int timeout, struct list_head *result_up);
+extern int lstcon_test_add(char *name, int type, int loop, int concur,
+			   int dist, int span, char *src_name, char * dst_name,
+			   void *param, int paramlen, int *retp,
+			   struct list_head *result_up);
+
+#endif
diff --git a/drivers/staging/lustre/lnet/selftest/framework.c b/drivers/staging/lustre/lnet/selftest/framework.c
new file mode 100644
index 000000000000..483c78564dae
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/framework.c
@@ -0,0 +1,1814 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/framework.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ * Author: Liang Zhen  <liangzhen@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+/* session id reported to peers whenever there is no session */
+lst_sid_t LST_INVALID_SID = {LNET_NID_ANY, -1};
+
+/* copied into sn_timeout of each session set up by sfw_init_session() */
+static int session_timeout = 100;
+CFS_MODULE_PARM(session_timeout, "i", int, 0444,
+		"test session timeout in seconds (100 by default, 0 == never)");
+
+static int rpc_timeout = 64;
+CFS_MODULE_PARM(rpc_timeout, "i", int, 0644,
+		"rpc timeout in seconds (64 by default, 0 == never)");
+
+/*
+ * In-place byte-swap helpers, one per wire structure.  Each macro swaps
+ * the listed fields of its argument with __swab32s()/__swab64s(); the
+ * argument is an lvalue (not a pointer) and is evaluated once per field.
+ * NOTE(review): presumably applied to messages received from a peer of
+ * the opposite byte order -- confirm against the callers.
+ */
+#define sfw_unpack_id(id)	       \
+do {				    \
+	__swab64s(&(id).nid);	   \
+	__swab32s(&(id).pid);	   \
+} while (0)
+
+#define sfw_unpack_sid(sid)	     \
+do {				    \
+	__swab64s(&(sid).ses_nid);      \
+	__swab64s(&(sid).ses_stamp);    \
+} while (0)
+
+#define sfw_unpack_fw_counters(fc)	\
+do {				      \
+	__swab32s(&(fc).running_ms);      \
+	__swab32s(&(fc).active_batches);  \
+	__swab32s(&(fc).zombie_sessions); \
+	__swab32s(&(fc).brw_errors);      \
+	__swab32s(&(fc).ping_errors);     \
+} while (0)
+
+#define sfw_unpack_rpc_counters(rc)     \
+do {				    \
+	__swab32s(&(rc).errors);	\
+	__swab32s(&(rc).rpcs_sent);     \
+	__swab32s(&(rc).rpcs_rcvd);     \
+	__swab32s(&(rc).rpcs_dropped);  \
+	__swab32s(&(rc).rpcs_expired);  \
+	__swab64s(&(rc).bulk_get);      \
+	__swab64s(&(rc).bulk_put);      \
+} while (0)
+
+#define sfw_unpack_lnet_counters(lc)    \
+do {				    \
+	__swab32s(&(lc).errors);	\
+	__swab32s(&(lc).msgs_max);      \
+	__swab32s(&(lc).msgs_alloc);    \
+	__swab32s(&(lc).send_count);    \
+	__swab32s(&(lc).recv_count);    \
+	__swab32s(&(lc).drop_count);    \
+	__swab32s(&(lc).route_count);   \
+	__swab64s(&(lc).send_length);   \
+	__swab64s(&(lc).recv_length);   \
+	__swab64s(&(lc).drop_length);   \
+	__swab64s(&(lc).route_length);  \
+} while (0)
+
+/* true while the test instance / batch has a non-zero active counter */
+#define sfw_test_active(t)      (atomic_read(&(t)->tsi_nactive) != 0)
+#define sfw_batch_active(b)     (atomic_read(&(b)->bat_nactive) != 0)
+
+/* Global state of the selftest framework; the single instance is sfw_data */
+struct smoketest_framework {
+	struct list_head	 fw_zombie_rpcs;     /* RPCs to be recycled */
+	struct list_head	 fw_zombie_sessions; /* stopping sessions */
+	struct list_head	 fw_tests;	   /* registered test cases */
+	atomic_t       fw_nzombies;	/* # zombie sessions */
+	spinlock_t	   fw_lock;		/* serialise */
+	sfw_session_t	  *fw_session;		/* _the_ session */
+	int		   fw_shuttingdown;	/* shutdown in progress */
+	srpc_server_rpc_t *fw_active_srpc;	/* running RPC */
+} sfw_data;
+
+/* forward ref's */
+int sfw_stop_batch (sfw_batch_t *tsb, int force);
+void sfw_destroy_session (sfw_session_t *sn);
+
+/*
+ * Look up the registered test case whose server-side service has id \a id.
+ * \a id must lie in (SRPC_FRAMEWORK_SERVICE_MAX_ID, SRPC_SERVICE_MAX_ID].
+ * Returns NULL when no such test case is registered.
+ */
+static inline sfw_test_case_t *
+sfw_find_test_case(int id)
+{
+	sfw_test_case_t *found = NULL;
+	sfw_test_case_t *cur;
+
+	LASSERT (id <= SRPC_SERVICE_MAX_ID);
+	LASSERT (id > SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+	list_for_each_entry (cur, &sfw_data.fw_tests, tsc_list) {
+		if (cur->tsc_srv_service->sv_id == id) {
+			found = cur;
+			break;
+		}
+	}
+
+	return found;
+}
+
+/*
+ * Register a test case made of the server-side \a service and the
+ * client-side operations \a cliops.
+ *
+ * Returns 0 on success, -EEXIST when a test with the same service id is
+ * already registered, -ENOMEM when the descriptor cannot be allocated.
+ */
+static int
+sfw_register_test (srpc_service_t *service, sfw_test_client_ops_t *cliops)
+{
+	sfw_test_case_t *tsc;
+
+	if (sfw_find_test_case(service->sv_id) != NULL) {
+		CERROR ("Failed to register test %s (%d)\n",
+			service->sv_name, service->sv_id);
+		return -EEXIST;
+	}
+
+	LIBCFS_ALLOC(tsc, sizeof(*tsc));
+	if (tsc == NULL)
+		return -ENOMEM;
+
+	memset(tsc, 0, sizeof(*tsc));
+	tsc->tsc_srv_service = service;
+	tsc->tsc_cli_ops     = cliops;
+	list_add_tail(&tsc->tsc_list, &sfw_data.fw_tests);
+
+	return 0;
+}
+
+/*
+ * Arm the expiry timer of the current session.
+ *
+ * Does nothing when there is no session or its timeout is 0.  The timer
+ * must not already be active and the framework must not be shutting
+ * down (both asserted).
+ */
+void
+sfw_add_session_timer (void)
+{
+	sfw_session_t *sn = sfw_data.fw_session;
+	stt_timer_t   *timer;
+
+	LASSERT (!sfw_data.fw_shuttingdown);
+
+	if (sn == NULL || sn->sn_timeout == 0)
+		return;
+
+	LASSERT (!sn->sn_timer_active);
+
+	/* form the member address only once sn is known to be non-NULL */
+	timer = &sn->sn_timer;
+
+	sn->sn_timer_active = 1;
+	timer->stt_expires = cfs_time_add(sn->sn_timeout,
+					  cfs_time_current_sec());
+	stt_add_timer(timer);
+}
+
+/*
+ * Disarm the expiry timer of the current session.
+ *
+ * Returns 0 when there was nothing to disarm or the timer was defused,
+ * and (positive) EBUSY when stt_del_timer() could not remove it.
+ */
+int
+sfw_del_session_timer (void)
+{
+	sfw_session_t *sn = sfw_data.fw_session;
+
+	if (sn == NULL || !sn->sn_timer_active)
+		return 0;
+
+	LASSERT (sn->sn_timeout != 0);
+
+	if (!stt_del_timer(&sn->sn_timer))
+		return EBUSY; /* racing with sfw_session_expired() */
+
+	/* timer defused */
+	sn->sn_timer_active = 0;
+	return 0;
+}
+
+/*
+ * Retire the current session.
+ *
+ * called with sfw_data.fw_lock held; the lock is dropped and retaken
+ * inside (around srpc_abort_service() and sfw_destroy_session()) and is
+ * held again on return.
+ *
+ * The session is unhooked from sfw_data.fw_session and put on the zombie
+ * list.  If none of its batches is active it is destroyed right away;
+ * otherwise the active batches are told to stop and the session is left
+ * on fw_zombie_sessions.
+ */
+static void
+sfw_deactivate_session (void)
+{
+	sfw_session_t *sn = sfw_data.fw_session;
+	int	    nactive = 0;
+	sfw_batch_t   *tsb;
+	sfw_test_case_t *tsc;
+
+	if (sn == NULL) return;
+
+	LASSERT (!sn->sn_timer_active);
+
+	sfw_data.fw_session = NULL;
+	atomic_inc(&sfw_data.fw_nzombies);
+	list_add(&sn->sn_list, &sfw_data.fw_zombie_sessions);
+
+	/* services are aborted without fw_lock held */
+	spin_unlock(&sfw_data.fw_lock);
+
+	list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) {
+		srpc_abort_service(tsc->tsc_srv_service);
+	}
+
+	spin_lock(&sfw_data.fw_lock);
+
+	/* force-stop (second argument 1) every batch that is still active */
+	list_for_each_entry (tsb, &sn->sn_batches, bat_list) {
+		if (sfw_batch_active(tsb)) {
+			nactive++;
+			sfw_stop_batch(tsb, 1);
+		}
+	}
+
+	if (nactive != 0)
+		return;   /* wait for active batches to stop */
+
+	/* nothing active: off the zombie list and destroy, lock dropped */
+	list_del_init(&sn->sn_list);
+	spin_unlock(&sfw_data.fw_lock);
+
+	sfw_destroy_session(sn);
+
+	spin_lock(&sfw_data.fw_lock);
+}
+
+
+/*
+ * Expiry callback of the session timer (installed as stt_func by
+ * sfw_init_session()); \a data is the session the timer belongs to.
+ * Logs the expiry and deactivates the session under sfw_data.fw_lock.
+ */
+void
+sfw_session_expired (void *data)
+{
+	sfw_session_t *sn = data;
+
+	spin_lock(&sfw_data.fw_lock);
+
+	LASSERT (sn->sn_timer_active);
+	LASSERT (sn == sfw_data.fw_session);
+
+	CWARN ("Session expired! sid: %s-"LPU64", name: %s\n",
+	       libcfs_nid2str(sn->sn_id.ses_nid),
+	       sn->sn_id.ses_stamp, &sn->sn_name[0]);
+
+	/* sfw_deactivate_session() asserts that the timer is inactive */
+	sn->sn_timer_active = 0;
+	sfw_deactivate_session();
+
+	spin_unlock(&sfw_data.fw_lock);
+}
+
+/*
+ * Initialise the freshly allocated session \a sn with id \a sid, feature
+ * mask \a features and (possibly truncated) name \a name.  The session
+ * starts with one reference, owned by the caller, and an unarmed timer
+ * whose callback is sfw_session_expired().
+ */
+static inline void
+sfw_init_session(sfw_session_t *sn, lst_sid_t sid,
+		 unsigned features, const char *name)
+{
+	memset(sn, 0, sizeof(*sn));
+
+	sn->sn_id	    = sid;
+	sn->sn_features	    = features;
+	sn->sn_timeout	    = session_timeout;
+	sn->sn_started	    = cfs_time_current();
+	sn->sn_timer_active = 0;
+	strlcpy(sn->sn_name, name, sizeof(sn->sn_name));
+
+	atomic_set(&sn->sn_refcount, 1);	/* +1 for caller */
+	atomic_set(&sn->sn_brw_errors, 0);
+	atomic_set(&sn->sn_ping_errors, 0);
+
+	INIT_LIST_HEAD(&sn->sn_list);
+	INIT_LIST_HEAD(&sn->sn_batches);
+
+	sn->sn_timer.stt_data = sn;
+	sn->sn_timer.stt_func = sfw_session_expired;
+	INIT_LIST_HEAD(&sn->sn_timer.stt_list);
+}
+
+/*
+ * completion handler for incoming framework RPCs: logs the outcome and
+ * releases the bulk pages if the RPC carried any
+ */
+void
+sfw_server_rpc_done(struct srpc_server_rpc *rpc)
+{
+	struct srpc_service	*sv;
+
+	sv = rpc->srpc_scd->scd_svc;
+
+	CDEBUG (D_NET,
+		"Incoming framework RPC done: "
+		"service %s, peer %s, status %s:%d\n",
+		sv->sv_name, libcfs_id2str(rpc->srpc_peer),
+		swi_state2str(rpc->srpc_wi.swi_state),
+		rpc->srpc_status);
+
+	if (rpc->srpc_bulk == NULL)
+		return;
+
+	sfw_free_pages(rpc);
+}
+
+/*
+ * Final step for an outgoing framework RPC: the RPC must carry no bulk,
+ * be on no list and have no references left; it is then parked on
+ * sfw_data.fw_zombie_rpcs under fw_lock.
+ */
+void
+sfw_client_rpc_fini (srpc_client_rpc_t *rpc)
+{
+	LASSERT (rpc->crpc_bulk.bk_niov == 0);
+	LASSERT (list_empty(&rpc->crpc_list));
+	LASSERT (atomic_read(&rpc->crpc_refcount) == 0);
+
+	CDEBUG (D_NET,
+		"Outgoing framework RPC done: "
+		"service %d, peer %s, status %s:%d:%d\n",
+		rpc->crpc_service, libcfs_id2str(rpc->crpc_dest),
+		swi_state2str(rpc->crpc_wi.swi_state),
+		rpc->crpc_aborted, rpc->crpc_status);
+
+	spin_lock(&sfw_data.fw_lock);
+
+	/* my callers must finish all RPCs before shutting me down */
+	LASSERT(!sfw_data.fw_shuttingdown);
+	list_add(&rpc->crpc_list, &sfw_data.fw_zombie_rpcs);
+
+	spin_unlock(&sfw_data.fw_lock);
+}
+
+/*
+ * Find the batch with id \a bid in the current session (which must
+ * exist).  Returns NULL when the session has no such batch.
+ */
+sfw_batch_t *
+sfw_find_batch (lst_bid_t bid)
+{
+	sfw_session_t *sn = sfw_data.fw_session;
+	sfw_batch_t   *found = NULL;
+	sfw_batch_t   *cur;
+
+	LASSERT (sn != NULL);
+
+	list_for_each_entry (cur, &sn->sn_batches, bat_list) {
+		if (cur->bat_id.bat_id == bid.bat_id) {
+			found = cur;
+			break;
+		}
+	}
+
+	return found;
+}
+
+/*
+ * Return the batch with id \a bid of the current session (which must
+ * exist), creating an empty one and appending it to the session's batch
+ * list when it does not exist yet.  Returns NULL if allocation fails.
+ */
+sfw_batch_t *
+sfw_bid2batch (lst_bid_t bid)
+{
+	sfw_session_t *sn = sfw_data.fw_session;
+	sfw_batch_t   *bat;
+
+	LASSERT (sn != NULL);
+
+	bat = sfw_find_batch(bid);
+	if (bat == NULL) {
+		LIBCFS_ALLOC(bat, sizeof(*bat));
+		if (bat != NULL) {
+			bat->bat_id       = bid;
+			bat->bat_session  = sn;
+			bat->bat_error    = 0;
+			atomic_set(&bat->bat_nactive, 0);
+			INIT_LIST_HEAD(&bat->bat_tests);
+
+			list_add_tail(&bat->bat_list, &sn->sn_batches);
+		}
+	}
+
+	return bat;
+}
+
+/*
+ * Fill \a reply with the LNet, RPC and framework counters of the current
+ * session.
+ *
+ * reply->str_status is EINVAL when the request carries no session id and
+ * ESRCH when there is no session or the id does not match it; in both
+ * cases only str_sid and str_status are written.  Always returns 0.
+ */
+int
+sfw_get_stats (srpc_stat_reqst_t *request, srpc_stat_reply_t *reply)
+{
+	sfw_session_t  *sn = sfw_data.fw_session;
+	sfw_batch_t    *bat;
+	struct timeval  tv;
+
+	if (sn != NULL)
+		reply->str_sid = sn->sn_id;
+	else
+		reply->str_sid = LST_INVALID_SID;
+
+	if (request->str_sid.ses_nid == LNET_NID_ANY) {
+		reply->str_status = EINVAL;
+		return 0;
+	}
+
+	if (sn == NULL || !sfw_sid_equal(request->str_sid, sn->sn_id)) {
+		reply->str_status = ESRCH;
+		return 0;
+	}
+
+	lnet_counters_get(&reply->str_lnet);
+	srpc_get_counters(&reply->str_rpc);
+
+	/* send over the msecs since the session was started
+	 * - with 32 bits to send, this is ~49 days */
+	cfs_duration_usec(cfs_time_sub(cfs_time_current(),
+				       sn->sn_started), &tv);
+
+	reply->str_fw.running_ms      =
+		(__u32)(tv.tv_sec * 1000 + tv.tv_usec / 1000);
+	reply->str_fw.brw_errors      = atomic_read(&sn->sn_brw_errors);
+	reply->str_fw.ping_errors     = atomic_read(&sn->sn_ping_errors);
+	reply->str_fw.zombie_sessions = atomic_read(&sfw_data.fw_nzombies);
+
+	reply->str_fw.active_batches = 0;
+	list_for_each_entry (bat, &sn->sn_batches, bat_list) {
+		if (atomic_read(&bat->bat_nactive) > 0)
+			reply->str_fw.active_batches++;
+	}
+
+	reply->str_status = 0;
+	return 0;
+}
+
+/*
+ * Handle a "make session" request.
+ *
+ * reply->mksn_status is set to:
+ *   EINVAL  the request carries no session id;
+ *   0       the request names the current session (its refcount is
+ *           bumped), or a new session was created;
+ *   EBUSY   another session exists and mksn_force is not set (the name
+ *           of the existing session is copied into the reply);
+ *   EPROTO  the request asks for features outside LST_FEATS_MASK.
+ *
+ * Returns 0 when a reply was filled in, -E2BIG when the existing
+ * session's name does not fit into the reply, -ENOMEM when the new
+ * session cannot be allocated.
+ */
+int
+sfw_make_session(srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply)
+{
+	sfw_session_t *sn = sfw_data.fw_session;
+	srpc_msg_t    *msg = container_of(request, srpc_msg_t,
+					  msg_body.mksn_reqst);
+	int	       cplen = 0;
+
+	if (request->mksn_sid.ses_nid == LNET_NID_ANY) {
+		reply->mksn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+		reply->mksn_status = EINVAL;
+		return 0;
+	}
+
+	if (sn != NULL) {
+		reply->mksn_status  = 0;
+		reply->mksn_sid     = sn->sn_id;
+		reply->mksn_timeout = sn->sn_timeout;
+
+		/* same session again: just take another reference */
+		if (sfw_sid_equal(request->mksn_sid, sn->sn_id)) {
+			atomic_inc(&sn->sn_refcount);
+			return 0;
+		}
+
+		if (!request->mksn_force) {
+			reply->mksn_status = EBUSY;
+			cplen = strlcpy(&reply->mksn_name[0], &sn->sn_name[0],
+					sizeof(reply->mksn_name));
+			if (cplen >= sizeof(reply->mksn_name))
+				return -E2BIG;
+			return 0;
+		}
+	}
+
+	/* reject the request if it requires unknown features
+	 * NB: old version will always accept all features because it's not
+	 * aware of srpc_msg_t::msg_ses_feats, it's a defect but it's also
+	 * harmless because it will return zero feature to console, and it's
+	 * console's responsibility to make sure all nodes in a session have
+	 * same feature mask. */
+	if ((msg->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+		reply->mksn_status = EPROTO;
+		return 0;
+	}
+
+	/* brand new or create by force */
+	LIBCFS_ALLOC(sn, sizeof(sfw_session_t));
+	if (sn == NULL) {
+		CERROR ("Dropping RPC (mksn) under memory pressure.\n");
+		return -ENOMEM;
+	}
+
+	sfw_init_session(sn, request->mksn_sid,
+			 msg->msg_ses_feats, &request->mksn_name[0]);
+
+	spin_lock(&sfw_data.fw_lock);
+
+	/* retire the old session (if any) before installing the new one */
+	sfw_deactivate_session();
+	LASSERT(sfw_data.fw_session == NULL);
+	sfw_data.fw_session = sn;
+
+	spin_unlock(&sfw_data.fw_lock);
+
+	reply->mksn_status  = 0;
+	reply->mksn_sid     = sn->sn_id;
+	reply->mksn_timeout = sn->sn_timeout;
+	return 0;
+}
+
+/*
+ * Handle a "remove session" request.
+ *
+ * reply->rmsn_status is EINVAL when the request carries no session id,
+ * ESRCH when there is no session, EBUSY when the id names a different
+ * session, and 0 otherwise.  The session is only deactivated when the
+ * last reference is dropped.  Always returns 0.
+ */
+int
+sfw_remove_session (srpc_rmsn_reqst_t *request, srpc_rmsn_reply_t *reply)
+{
+	sfw_session_t *sn = sfw_data.fw_session;
+
+	if (sn != NULL)
+		reply->rmsn_sid = sn->sn_id;
+	else
+		reply->rmsn_sid = LST_INVALID_SID;
+
+	if (request->rmsn_sid.ses_nid == LNET_NID_ANY) {
+		reply->rmsn_status = EINVAL;
+		return 0;
+	}
+
+	if (sn == NULL) {
+		reply->rmsn_status = ESRCH;
+		return 0;
+	}
+
+	if (!sfw_sid_equal(request->rmsn_sid, sn->sn_id)) {
+		reply->rmsn_status = EBUSY;
+		return 0;
+	}
+
+	reply->rmsn_status = 0;
+
+	if (atomic_dec_and_test(&sn->sn_refcount)) {
+		/* last reference gone: retire the session */
+		spin_lock(&sfw_data.fw_lock);
+		sfw_deactivate_session();
+		spin_unlock(&sfw_data.fw_lock);
+
+		reply->rmsn_sid = LST_INVALID_SID;
+		LASSERT(sfw_data.fw_session == NULL);
+	}
+
+	return 0;
+}
+
+/*
+ * Handle a "debug session" request: report id, timeout and name of the
+ * current session, or ESRCH in reply->dbg_status when there is none.
+ *
+ * Returns 0, or -E2BIG when the session name does not fit into the reply.
+ */
+int
+sfw_debug_session (srpc_debug_reqst_t *request, srpc_debug_reply_t *reply)
+{
+	sfw_session_t *sn = sfw_data.fw_session;
+	size_t	       cplen;
+
+	if (sn == NULL) {
+		reply->dbg_status = ESRCH;
+		reply->dbg_sid    = LST_INVALID_SID;
+		return 0;
+	}
+
+	reply->dbg_status  = 0;
+	reply->dbg_sid     = sn->sn_id;
+	reply->dbg_timeout = sn->sn_timeout;
+
+	cplen = strlcpy(reply->dbg_name, sn->sn_name, sizeof(reply->dbg_name));
+
+	return (cplen >= sizeof(reply->dbg_name)) ? -E2BIG : 0;
+}
+
+/*
+ * Put a finished test RPC back on the free list of the test instance
+ * that its test unit (rpc->crpc_priv) belongs to.
+ */
+void
+sfw_test_rpc_fini (srpc_client_rpc_t *rpc)
+{
+	sfw_test_instance_t *tsi;
+
+	tsi = ((sfw_test_unit_t *)rpc->crpc_priv)->tsu_instance;
+
+	/* Called with hold of tsi->tsi_lock */
+	LASSERT (list_empty(&rpc->crpc_list));
+	list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs);
+}
+
+/*
+ * Number of request buffers to reserve (sfw_load_test) or release
+ * (sfw_unload_test) for test instance \a tsi:
+ * min(sv_wi_total, tsi_loop) / sv_ncpts plus SFW_TEST_WI_EXTRA, but never
+ * less than SFW_TEST_WI_MIN.
+ *
+ * The test case of tsi->tsi_service must be registered.
+ */
+static inline int
+sfw_test_buffers(sfw_test_instance_t *tsi)
+{
+	struct sfw_test_case	*tsc = sfw_find_test_case(tsi->tsi_service);
+	struct srpc_service	*svc;
+	int			nbuf;
+
+	/* sfw_find_test_case() returns NULL for an unregistered service;
+	 * catch that here rather than dereferencing NULL below */
+	LASSERT(tsc != NULL);
+	svc = tsc->tsc_srv_service;
+
+	nbuf = min(svc->sv_wi_total, tsi->tsi_loop) / svc->sv_ncpts;
+	return max(SFW_TEST_WI_MIN, nbuf + SFW_TEST_WI_EXTRA);
+}
+
+/*
+ * Prepare test instance \a tsi for running.
+ *
+ * A client instance only gets its client operations hooked up.  For a
+ * server instance, request buffers are reserved on the test's service.
+ *
+ * Returns 0 on success, -ENOMEM when the buffers cannot be reserved.
+ */
+int
+sfw_load_test(struct sfw_test_instance *tsi)
+{
+	struct sfw_test_case	*tsc;
+	struct srpc_service	*svc;
+	int			nbuf;
+	int			rc;
+
+	LASSERT(tsi != NULL);
+	tsc = sfw_find_test_case(tsi->tsi_service);
+	/* check before anything dereferences the test case */
+	LASSERT(tsc != NULL);
+	svc = tsc->tsc_srv_service;
+
+	if (tsi->tsi_is_client) {
+		tsi->tsi_ops = tsc->tsc_cli_ops;
+		return 0;
+	}
+
+	/* the buffer count is only needed on the server side */
+	nbuf = sfw_test_buffers(tsi);
+
+	rc = srpc_service_add_buffers(svc, nbuf);
+	if (rc != 0) {
+		CWARN("Failed to reserve enough buffers: "
+		      "service %s, %d needed: %d\n", svc->sv_name, nbuf, rc);
+		/* NB: this error handler is not strictly correct, because
+		 * it may release more buffers than already allocated,
+		 * but it doesn't matter because request portal should
+		 * be lazy portal and will grow buffers if necessary. */
+		srpc_service_remove_buffers(svc, nbuf);
+		return -ENOMEM;
+	}
+
+	CDEBUG(D_NET, "Reserved %d buffers for test %s\n",
+	       nbuf * (srpc_serv_is_framework(svc) ?
+		       1 : cfs_cpt_number(cfs_cpt_table)), svc->sv_name);
+	return 0;
+}
+
+/*
+ * Undo sfw_load_test() for \a tsi: a server instance gives its request
+ * buffers back to the service, a client instance has nothing to undo.
+ */
+void
+sfw_unload_test(struct sfw_test_instance *tsi)
+{
+	struct sfw_test_case *tsc = sfw_find_test_case(tsi->tsi_service);
+
+	LASSERT(tsc != NULL);
+
+	if (!tsi->tsi_is_client) {
+		/* shrink buffers, because request portal is lazy portal
+		 * which can grow buffers at runtime so we may leave
+		 * some buffers behind, but never mind... */
+		srpc_service_remove_buffers(tsc->tsc_srv_service,
+					    sfw_test_buffers(tsi));
+	}
+}
+
+/*
+ * Free test instance \a tsi.  A client instance is first finalised via
+ * its tso_fini() operation and must be idle (asserted); its test units
+ * and cached free RPCs are released.  Both kinds are then unloaded with
+ * sfw_unload_test() before the instance itself is freed.
+ */
+void
+sfw_destroy_test_instance (sfw_test_instance_t *tsi)
+{
+	srpc_client_rpc_t *rpc;
+	sfw_test_unit_t   *tsu;
+
+	if (tsi->tsi_is_client) {
+		tsi->tsi_ops->tso_fini(tsi);
+
+		LASSERT (!tsi->tsi_stopping);
+		LASSERT (list_empty(&tsi->tsi_active_rpcs));
+		LASSERT (!sfw_test_active(tsi));
+
+		/* release all test units */
+		while (!list_empty(&tsi->tsi_units)) {
+			tsu = list_entry(tsi->tsi_units.next,
+					 sfw_test_unit_t, tsu_list);
+			list_del(&tsu->tsu_list);
+			LIBCFS_FREE(tsu, sizeof(*tsu));
+		}
+
+		/* release all cached RPCs */
+		while (!list_empty(&tsi->tsi_free_rpcs)) {
+			rpc = list_entry(tsi->tsi_free_rpcs.next,
+					 srpc_client_rpc_t, crpc_list);
+			list_del(&rpc->crpc_list);
+			LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc));
+		}
+	}
+
+	sfw_unload_test(tsi);
+	LIBCFS_FREE(tsi, sizeof(*tsi));
+}
+
+/* Free an inactive, unlinked batch together with all of its test instances. */
+void sfw_destroy_batch (sfw_batch_t *tsb)
+{
+	struct list_head *tests = &tsb->bat_tests;
+
+	LASSERT (!sfw_batch_active(tsb));
+	LASSERT (list_empty(&tsb->bat_list));
+
+	while (!list_empty(tests)) {
+		sfw_test_instance_t *first;
+
+		first = list_entry(tests->next, sfw_test_instance_t, tsi_list);
+		list_del_init(&first->tsi_list);
+		sfw_destroy_test_instance(first);
+	}
+
+	LIBCFS_FREE(tsb, sizeof(sfw_batch_t));
+}
+
+/* Free an unlinked, non-current session and its batches; one zombie fewer. */
+void sfw_destroy_session (sfw_session_t *sn)
+{
+	struct list_head *batches = &sn->sn_batches;
+
+	LASSERT (list_empty(&sn->sn_list));
+	LASSERT (sn != sfw_data.fw_session);
+
+	while (!list_empty(batches)) {
+		sfw_batch_t *head;
+
+		head = list_entry(batches->next, sfw_batch_t, bat_list);
+		list_del_init(&head->bat_list);
+		sfw_destroy_batch(head);
+	}
+
+	LIBCFS_FREE(sn, sizeof(*sn));
+	atomic_dec(&sfw_data.fw_nzombies);
+}
+
+/* Byte-swap the test-specific payload of an add-test request that came from
+ * a peer with the opposite byte order.  The request must be a client
+ * request.  The payload layout depends on the target service and, for the
+ * bulk test, on whether LST_FEAT_BULK_LEN is set in the session features
+ * carried by the message. */
+void sfw_unpack_addtest_req(srpc_msg_t *msg)
+{
+	srpc_test_reqst_t *req = &msg->msg_body.tes_reqst;
+
+	LASSERT (msg->msg_type == SRPC_MSG_TEST_REQST);
+	LASSERT (req->tsr_is_client);
+
+	if (msg->msg_magic == SRPC_MSG_MAGIC)
+		return; /* sender has our byte order: nothing to flip */
+
+	LASSERT (msg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+	if (req->tsr_service == SRPC_SERVICE_BRW) {
+		if ((msg->msg_ses_feats & LST_FEAT_BULK_LEN) != 0) {
+			/* v1 layout: 16-bit opc/flags, 32-bit offset/len */
+			test_bulk_req_v1_t *bulk = &req->tsr_u.bulk_v1;
+
+			__swab16s(&bulk->blk_opc);
+			__swab16s(&bulk->blk_flags);
+			__swab32s(&bulk->blk_offset);
+			__swab32s(&bulk->blk_len);
+		} else {
+			/* v0 layout: 32-bit opc, npg and flags */
+			test_bulk_req_t *bulk = &req->tsr_u.bulk_v0;
+
+			__swab32s(&bulk->blk_opc);
+			__swab32s(&bulk->blk_npg);
+			__swab32s(&bulk->blk_flags);
+		}
+	} else if (req->tsr_service == SRPC_SERVICE_PING) {
+		test_ping_req_t *ping = &req->tsr_u.ping;
+
+		__swab32s(&ping->png_size);
+		__swab32s(&ping->png_flags);
+	} else {
+		/* no other service has a payload this code can unpack */
+		LBUG ();
+	}
+}
+
+/* Build a test instance from an add-test request and append it to @tsb.  A
+ * client additionally gets tsi_concur test units per destination listed in
+ * the bulk buffer of @rpc.  Returns 0, or an error after cleaning up. */
+int sfw_add_test_instance (sfw_batch_t *tsb, srpc_server_rpc_t *rpc)
+{
+	srpc_msg_t	    *msg = &rpc->srpc_reqstbuf->buf_msg;
+	srpc_test_reqst_t   *req = &msg->msg_body.tes_reqst;
+	srpc_bulk_t	    *bk = rpc->srpc_bulk;
+	int		     ndest = req->tsr_ndest;
+	sfw_test_instance_t *tsi;
+	sfw_test_unit_t     *tsu;
+	int		     rc;
+	int		     i;
+
+	LIBCFS_ALLOC(tsi, sizeof(*tsi));
+	if (tsi == NULL) {
+		CERROR ("Can't allocate test instance for batch: "LPU64"\n",
+			tsb->bat_id.bat_id);
+		return -ENOMEM;
+	}
+
+	memset(tsi, 0, sizeof(*tsi));
+	spin_lock_init(&tsi->tsi_lock);
+	atomic_set(&tsi->tsi_nactive, 0);
+	INIT_LIST_HEAD(&tsi->tsi_units);
+	INIT_LIST_HEAD(&tsi->tsi_free_rpcs);
+	INIT_LIST_HEAD(&tsi->tsi_active_rpcs);
+
+	tsi->tsi_batch	       = tsb;
+	tsi->tsi_service       = req->tsr_service;
+	tsi->tsi_loop	       = req->tsr_loop;
+	tsi->tsi_concur	       = req->tsr_concur;
+	tsi->tsi_is_client     = !!(req->tsr_is_client);
+	tsi->tsi_stoptsu_onerr = !!(req->tsr_stop_onerr);
+	tsi->tsi_stopping      = 0;
+
+	rc = sfw_load_test(tsi);
+	if (rc != 0) {
+		LIBCFS_FREE(tsi, sizeof(*tsi));
+		return rc;
+	}
+
+	LASSERT (!sfw_batch_active(tsb));
+
+	/* a test server needs nothing beyond what sfw_load_test() set up */
+	if (!tsi->tsi_is_client)
+		goto added;
+
+	LASSERT (bk != NULL);
+	LASSERT (bk->bk_niov * SFW_ID_PER_PAGE >= (unsigned int)ndest);
+	LASSERT((unsigned int)bk->bk_len >=
+		sizeof(lnet_process_id_packed_t) * ndest);
+
+	sfw_unpack_addtest_req(msg);
+	memcpy(&tsi->tsi_u, &req->tsr_u, sizeof(tsi->tsi_u));
+
+	for (i = 0; i < ndest; i++) {
+		lnet_process_id_packed_t *dests;
+		lnet_process_id_packed_t  id;
+		int			  j;
+
+		dests = page_address(bk->bk_iovs[i / SFW_ID_PER_PAGE].kiov_page);
+		LASSERT (dests != NULL);  /* these pages are always mapped */
+		id = dests[i % SFW_ID_PER_PAGE];
+		if (msg->msg_magic != SRPC_MSG_MAGIC)
+			sfw_unpack_id(id);
+
+		for (j = 0; j < tsi->tsi_concur; j++) {
+			LIBCFS_ALLOC(tsu, sizeof(sfw_test_unit_t));
+			if (tsu == NULL) {
+				rc = -ENOMEM;
+				CERROR ("Can't allocate tsu for %d\n",
+					tsi->tsi_service);
+				goto error;
+			}
+			tsu->tsu_instance = tsi;
+			tsu->tsu_private  = NULL;
+			tsu->tsu_dest.nid = id.nid;
+			tsu->tsu_dest.pid = id.pid;
+			list_add_tail(&tsu->tsu_list, &tsi->tsi_units);
+		}
+	}
+
+	rc = tsi->tsi_ops->tso_init(tsi);
+	if (rc != 0)
+		goto error;
+added:
+	list_add_tail(&tsi->tsi_list, &tsb->bat_tests);
+	return 0;
+
+error:
+	LASSERT (rc != 0);
+	sfw_destroy_test_instance(tsi);
+	return rc;
+}
+
+/* One test unit has finished.  The last unit of a test deactivates the test
+ * and its batch; a zombie session left with no active batch is destroyed. */
+static void sfw_test_unit_done (sfw_test_unit_t *tsu)
+{
+	sfw_test_instance_t *tsi = tsu->tsu_instance;
+	sfw_batch_t	    *mine = tsi->tsi_batch;
+	sfw_session_t	    *sn = mine->bat_session;
+	sfw_batch_t	    *tsb;
+
+	LASSERT (sfw_test_active(tsi));
+
+	if (!atomic_dec_and_test(&tsi->tsi_nactive))
+		return;
+
+	spin_lock(&tsi->tsi_lock);
+	tsi->tsi_stopping = 0;
+	spin_unlock(&tsi->tsi_lock);
+
+	spin_lock(&sfw_data.fw_lock);
+
+	/* nothing more to do while the batch or its session is still live */
+	if (!atomic_dec_and_test(&mine->bat_nactive))
+		goto unlock;
+	if (sn == sfw_data.fw_session)
+		goto unlock;
+
+	LASSERT (!list_empty(&sn->sn_list)); /* I'm a zombie! */
+
+	list_for_each_entry (tsb, &sn->sn_batches, bat_list) {
+		if (sfw_batch_active(tsb))
+			goto unlock;
+	}
+
+	list_del_init(&sn->sn_list);
+	spin_unlock(&sfw_data.fw_lock);
+
+	sfw_destroy_session(sn);
+	return;
+
+unlock:
+	spin_unlock(&sfw_data.fw_lock);
+}
+
+/* Completion callback for a test RPC.
+ * The test's tso_done_rpc() hook sees the result first; then the RPC is
+ * taken off the list it is on and the unit is either scheduled for another
+ * round or retired. */
+void sfw_test_rpc_done (srpc_client_rpc_t *rpc)
+{
+	sfw_test_unit_t     *tsu = rpc->crpc_priv;
+	sfw_test_instance_t *tsi = tsu->tsu_instance;
+	int		     finished;
+
+	tsi->tsi_ops->tso_done_rpc(tsu, rpc);
+
+	spin_lock(&tsi->tsi_lock);
+
+	LASSERT (sfw_test_active(tsi));
+	LASSERT (!list_empty(&rpc->crpc_list));
+
+	list_del_init(&rpc->crpc_list);
+
+	/* Retire the unit if the batch is stopping, the loop count is used
+	 * up, or the RPC failed and the test stops on error.  Decided here
+	 * because rpc is not touched again once its reference is dropped. */
+	finished = tsi->tsi_stopping || tsu->tsu_loop == 0 ||
+		   (rpc->crpc_status != 0 && tsi->tsi_stoptsu_onerr);
+
+	/* drop the reference held on behalf of the poster */
+	srpc_client_rpc_decref(rpc);
+
+	spin_unlock(&tsi->tsi_lock);
+
+	if (finished)
+		sfw_test_unit_done(tsu);
+	else
+		swi_schedule_workitem(&tsu->tsu_worker);
+}
+
+/* Hand out a client RPC for a test unit, recycling one from the instance's
+ * free list when available and allocating a new one otherwise.
+ * Returns 0 with *rpcpp set, or -ENOMEM if a new RPC cannot be created. */
+int sfw_create_test_rpc(sfw_test_unit_t *tsu, lnet_process_id_t peer,
+			unsigned features, int nblk, int blklen,
+			srpc_client_rpc_t **rpcpp)
+{
+	sfw_test_instance_t *tsi = tsu->tsu_instance;
+	srpc_client_rpc_t   *rpc = NULL;
+
+	spin_lock(&tsi->tsi_lock);
+
+	LASSERT (sfw_test_active(tsi));
+
+	if (!list_empty(&tsi->tsi_free_rpcs)) {
+		rpc = list_entry(tsi->tsi_free_rpcs.next,
+				 srpc_client_rpc_t, crpc_list);
+		LASSERT (nblk == rpc->crpc_bulk.bk_niov);
+		list_del_init(&rpc->crpc_list);
+	}
+
+	spin_unlock(&tsi->tsi_lock);
+
+	if (rpc != NULL) {
+		/* recycled: only needs to be re-initialised */
+		srpc_init_client_rpc(rpc, peer, tsi->tsi_service, nblk, blklen,
+				     sfw_test_rpc_done, sfw_test_rpc_fini, tsu);
+	} else {
+		rpc = srpc_create_client_rpc(peer, tsi->tsi_service, nblk,
+					     blklen, sfw_test_rpc_done,
+					     sfw_test_rpc_fini, tsu);
+		if (rpc == NULL) {
+			CERROR("Can't create rpc for test %d\n",
+			       tsi->tsi_service);
+			return -ENOMEM;
+		}
+	}
+
+	rpc->crpc_reqstmsg.msg_ses_feats = features;
+	*rpcpp = rpc;
+	return 0;
+}
+
+/* Work-item handler that drives one test unit: prepare an RPC and post it.
+ * Returns 0 when an RPC was posted, or 1 after the unit has been retired
+ * because no RPC could be prepared or the instance is stopping. */
+int sfw_run_test (swi_workitem_t *wi)
+{
+	sfw_test_unit_t     *tsu = wi->swi_workitem.wi_data;
+	sfw_test_instance_t *tsi = tsu->tsu_instance;
+	srpc_client_rpc_t   *rpc = NULL;
+	int		     stopping;
+
+	LASSERT (wi == &tsu->tsu_worker);
+
+	if (tsi->tsi_ops->tso_prep_rpc(tsu, tsu->tsu_dest, &rpc) != 0) {
+		LASSERT (rpc == NULL);
+		goto test_done;
+	}
+
+	LASSERT (rpc != NULL);
+
+	spin_lock(&tsi->tsi_lock);
+	stopping = tsi->tsi_stopping;
+	if (stopping) {
+		/* keep the prepared RPC for reuse instead of posting it */
+		list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs);
+	} else {
+		if (tsu->tsu_loop > 0)
+			tsu->tsu_loop--;
+		list_add_tail(&rpc->crpc_list, &tsi->tsi_active_rpcs);
+	}
+	spin_unlock(&tsi->tsi_lock);
+
+	if (stopping)
+		goto test_done;
+
+	rpc->crpc_timeout = rpc_timeout;
+
+	spin_lock(&rpc->crpc_lock);
+	srpc_post_rpc(rpc);
+	spin_unlock(&rpc->crpc_lock);
+	return 0;
+
+test_done:
+	/* Nothing can schedule this unit now: any previous RPC has finished,
+	 * no new one was started, and the still-active batch cannot be rerun.
+	 * Cancel pending schedules and prevent future schedule attempts. */
+	swi_exit_workitem(wi);
+	sfw_test_unit_done(tsu);
+	return 1;
+}
+
+/* Start every client test in the batch by scheduling one work item per test
+ * unit.  An already active batch is left alone.  Always returns 0. */
+int sfw_run_batch (sfw_batch_t *tsb)
+{
+	sfw_test_instance_t *tsi;
+	sfw_test_unit_t     *tsu;
+	int		     cpt;
+
+	if (sfw_batch_active(tsb)) {
+		CDEBUG(D_NET, "Batch already active: "LPU64" (%d)\n",
+		       tsb->bat_id.bat_id, atomic_read(&tsb->bat_nactive));
+		return 0;
+	}
+
+	list_for_each_entry (tsi, &tsb->bat_tests, tsi_list) {
+		if (!tsi->tsi_is_client) /* servers have nothing to run */
+			continue;
+
+		LASSERT (!tsi->tsi_stopping);
+		LASSERT (!sfw_test_active(tsi));
+
+		atomic_inc(&tsb->bat_nactive);
+
+		list_for_each_entry (tsu, &tsi->tsi_units, tsu_list) {
+			atomic_inc(&tsi->tsi_nactive);
+			tsu->tsu_loop = tsi->tsi_loop;
+			cpt = lnet_cpt_of_nid(tsu->tsu_dest.nid);
+			swi_init_workitem(&tsu->tsu_worker, tsu, sfw_run_test,
+					  lst_sched_test[cpt]);
+			swi_schedule_workitem(&tsu->tsu_worker);
+		}
+	}
+
+	return 0;
+}
+
+/* Ask every running client test of the batch to stop.  Tests that are
+ * idle, already stopping, or servers are skipped.  With @force set, the
+ * RPCs a test has in flight are aborted with -EINTR as well.
+ * Always returns 0. */
+int sfw_stop_batch (sfw_batch_t *tsb, int force)
+{
+	sfw_test_instance_t *tsi;
+	srpc_client_rpc_t   *rpc;
+
+	if (!sfw_batch_active(tsb)) {
+		CDEBUG(D_NET, "Batch "LPU64" inactive\n", tsb->bat_id.bat_id);
+		return 0;
+	}
+
+	list_for_each_entry (tsi, &tsb->bat_tests, tsi_list) {
+		spin_lock(&tsi->tsi_lock);
+
+		if (!tsi->tsi_is_client || !sfw_test_active(tsi) ||
+		    tsi->tsi_stopping)
+			goto next;
+
+		tsi->tsi_stopping = 1;
+
+		/* without @force the RPCs in flight are not aborted */
+		if (!force)
+			goto next;
+
+		list_for_each_entry(rpc, &tsi->tsi_active_rpcs, crpc_list) {
+			spin_lock(&rpc->crpc_lock);
+
+			srpc_abort_rpc(rpc, -EINTR);
+
+			spin_unlock(&rpc->crpc_lock);
+		}
+
+next:
+		spin_unlock(&tsi->tsi_lock);
+	}
+
+	return 0;
+}
+
+/* Report the active count of the batch (@testidx 0) or of its @testidx-th
+ * test (1-based) in reply->bar_active; -EINVAL/-ENOENT on a bad index. */
+int sfw_query_batch (sfw_batch_t *tsb, int testidx, srpc_batch_reply_t *reply)
+{
+	sfw_test_instance_t *tsi;
+
+	if (testidx < 0)
+		return -EINVAL;
+
+	if (testidx == 0) {
+		reply->bar_active = atomic_read(&tsb->bat_nactive);
+		return 0;
+	}
+
+	list_for_each_entry (tsi, &tsb->bat_tests, tsi_list) {
+		if (--testidx == 0) {
+			reply->bar_active = atomic_read(&tsi->tsi_nactive);
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+/* Free the bulk buffer attached to @rpc and clear the stale pointer. */
+void sfw_free_pages (srpc_server_rpc_t *rpc)
+{
+	srpc_free_bulk(rpc->srpc_bulk);
+	rpc->srpc_bulk = NULL;
+}
+
+/* Attach a newly allocated bulk buffer of @npages pages to @rpc, which must
+ * not have one yet.  The other arguments go to srpc_alloc_bulk() unchanged.
+ * Returns 0 on success or -ENOMEM if the allocation fails. */
+int sfw_alloc_pages(struct srpc_server_rpc *rpc, int cpt, int npages, int len,
+		    int sink)
+{
+	LASSERT(rpc->srpc_bulk == NULL);
+	LASSERT(npages > 0 && npages <= LNET_MAX_IOV);
+
+	rpc->srpc_bulk = srpc_alloc_bulk(cpt, npages, len, sink);
+
+	return (rpc->srpc_bulk != NULL) ? 0 : -ENOMEM;
+}
+
+/* Handle an add-test request.  The outcome is normally reported through
+ * reply->tsr_status with a return value of 0.  A client request that has no
+ * bulk buffer yet only gets one allocated here (that result is returned);
+ * the request is resumed later from sfw_bulk_ready(). */
+int sfw_add_test (srpc_server_rpc_t *rpc)
+{
+	sfw_session_t     *sn = sfw_data.fw_session;
+	srpc_test_reply_t *reply = &rpc->srpc_replymsg.msg_body.tes_reply;
+	srpc_test_reqst_t *request;
+	sfw_batch_t       *bat;
+	int		   npg;
+	int		   len;
+	int		   rc;
+
+	request = &rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst;
+	reply->tsr_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+
+	if (request->tsr_loop == 0 ||
+	    request->tsr_concur == 0 ||
+	    request->tsr_sid.ses_nid == LNET_NID_ANY ||
+	    request->tsr_ndest > SFW_MAX_NDESTS ||
+	    (request->tsr_is_client && request->tsr_ndest == 0) ||
+	    request->tsr_concur > SFW_MAX_CONCUR ||
+	    request->tsr_service > SRPC_SERVICE_MAX_ID ||
+	    request->tsr_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID) {
+		reply->tsr_status = EINVAL;
+		return 0;
+	}
+
+	if (sn == NULL || !sfw_sid_equal(request->tsr_sid, sn->sn_id) ||
+	    sfw_find_test_case(request->tsr_service) == NULL) {
+		reply->tsr_status = ENOENT;
+		return 0;
+	}
+
+	bat = sfw_bid2batch(request->tsr_bid);
+	if (bat == NULL) {
+		CERROR ("Dropping RPC (%s) from %s under memory pressure.\n",
+			rpc->srpc_scd->scd_svc->sv_name,
+			libcfs_id2str(rpc->srpc_peer));
+		return -ENOMEM;
+	}
+
+	if (sfw_batch_active(bat)) {
+		reply->tsr_status = EBUSY;
+		return 0;
+	}
+
+	if (request->tsr_is_client && rpc->srpc_bulk == NULL) {
+		npg = sfw_id_pages(request->tsr_ndest);
+		if ((sn->sn_features & LST_FEAT_BULK_LEN) != 0) {
+			len = sizeof(lnet_process_id_packed_t) *
+			      request->tsr_ndest;
+		} else {
+			len = npg * PAGE_CACHE_SIZE;
+		}
+		return sfw_alloc_pages(rpc, CFS_CPT_ANY, npg, len, 1);
+	}
+
+	rc = sfw_add_test_instance(bat, rpc);
+	CDEBUG (rc == 0 ? D_NET : D_WARNING,
+		"%s test: sv %d %s, loop %d, concur %d, ndest %d\n",
+		rc == 0 ? "Added" : "Failed to add", request->tsr_service,
+		request->tsr_is_client ? "client" : "server",
+		request->tsr_loop, request->tsr_concur, request->tsr_ndest);
+
+	reply->tsr_status = (rc < 0) ? -rc : rc;
+	return 0;
+}
+
+/* Run, stop or query a batch of the current session.  The outcome is
+ * stored in reply->bar_status as a positive errno (0 for success) and the
+ * function returns 0.  An unknown opcode makes it return -EINVAL instead,
+ * which drops the RPC. */
+int sfw_control_batch (srpc_batch_reqst_t *request, srpc_batch_reply_t *reply)
+{
+	sfw_session_t *sn = sfw_data.fw_session;
+	sfw_batch_t   *bat;
+	int	       rc;
+
+	reply->bar_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+
+	/* the request must name the session that is current right now */
+	if (sn == NULL || !sfw_sid_equal(request->bar_sid, sn->sn_id)) {
+		reply->bar_status = ESRCH;
+		return 0;
+	}
+
+	bat = sfw_find_batch(request->bar_bid);
+	if (bat == NULL) {
+		reply->bar_status = ENOENT;
+		return 0;
+	}
+
+	/* dispatch on the requested operation */
+	if (request->bar_opc == SRPC_BATCH_OPC_RUN) {
+		rc = sfw_run_batch(bat);
+	} else if (request->bar_opc == SRPC_BATCH_OPC_STOP) {
+		rc = sfw_stop_batch(bat, request->bar_arg);
+	} else if (request->bar_opc == SRPC_BATCH_OPC_QUERY) {
+		rc = sfw_query_batch(bat, request->bar_testidx, reply);
+	} else {
+		return -EINVAL; /* drop it */
+	}
+
+	/* helpers report failure as a negative errno; the reply carries it
+	 * as a positive value */
+	reply->bar_status = (rc < 0) ? -rc : rc;
+	return 0;
+}
+
+/* Handler shared by all framework services.  After checking that the
+ * framework is not shutting down and stopping the session expiry timer, it
+ * marks @rpc active, converts the request to host byte order, verifies its
+ * session features and dispatches on the service ID.  Returns the handler's
+ * result: 0 normally, or a negative errno when the RPC is to be dropped. */
+int sfw_handle_server_rpc(struct srpc_server_rpc *rpc)
+{
+	struct srpc_service	*sv = rpc->srpc_scd->scd_svc;
+	srpc_msg_t     *reply	= &rpc->srpc_replymsg;
+	srpc_msg_t     *request	= &rpc->srpc_reqstbuf->buf_msg;
+	unsigned	features = LST_FEATS_MASK;
+	int		rc = 0;
+
+	LASSERT(sfw_data.fw_active_srpc == NULL);
+	LASSERT(sv->sv_id <= SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+	spin_lock(&sfw_data.fw_lock);
+
+	if (sfw_data.fw_shuttingdown) {
+		spin_unlock(&sfw_data.fw_lock);
+		return -ESHUTDOWN;
+	}
+
+	/* Remove timer to avoid racing with it or expiring active session */
+	if (sfw_del_session_timer() != 0) {
+		CERROR("Dropping RPC (%s) from %s: racing with expiry timer.\n",
+		       sv->sv_name, libcfs_id2str(rpc->srpc_peer));
+		spin_unlock(&sfw_data.fw_lock);
+		return -EAGAIN;
+	}
+
+	sfw_data.fw_active_srpc = rpc;
+	spin_unlock(&sfw_data.fw_lock);
+
+	sfw_unpack_message(request);
+	LASSERT(request->msg_type == srpc_service2request(sv->sv_id));
+
+	/* rpc module should have checked this */
+	LASSERT(request->msg_version == SRPC_MSG_VERSION);
+
+	if (sv->sv_id != SRPC_SERVICE_MAKE_SESSION &&
+	    sv->sv_id != SRPC_SERVICE_DEBUG) {
+		sfw_session_t *sn = sfw_data.fw_session;
+
+		if (sn != NULL &&
+		    sn->sn_features != request->msg_ses_feats) {
+			CNETERR("Features of framework RPC don't match "
+				"features of current session: %x/%x\n",
+				request->msg_ses_feats, sn->sn_features);
+			reply->msg_body.reply.status = EPROTO;
+			reply->msg_body.reply.sid    = sn->sn_id;
+			goto out;
+		}
+
+	} else if ((request->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+		/* NB: at this point, old version will ignore features and
+		 * create new session anyway, so console should be able
+		 * to handle this */
+		reply->msg_body.reply.status = EPROTO;
+		goto out;
+	}
+
+	switch (sv->sv_id) {
+	default:
+		LBUG ();	/* NOTE(review): relies on LBUG() not returning */
+	case SRPC_SERVICE_TEST:
+		rc = sfw_add_test(rpc);
+		break;
+	case SRPC_SERVICE_BATCH:
+		rc = sfw_control_batch(&request->msg_body.bat_reqst,
+				       &reply->msg_body.bat_reply);
+		break;
+	case SRPC_SERVICE_QUERY_STAT:
+		rc = sfw_get_stats(&request->msg_body.stat_reqst,
+				   &reply->msg_body.stat_reply);
+		break;
+	case SRPC_SERVICE_DEBUG:
+		rc = sfw_debug_session(&request->msg_body.dbg_reqst,
+				       &reply->msg_body.dbg_reply);
+		break;
+	case SRPC_SERVICE_MAKE_SESSION:
+		rc = sfw_make_session(&request->msg_body.mksn_reqst,
+				      &reply->msg_body.mksn_reply);
+		break;
+	case SRPC_SERVICE_REMOVE_SESSION:
+		rc = sfw_remove_session(&request->msg_body.rmsn_reqst,
+					&reply->msg_body.rmsn_reply);
+		break;
+	}
+
+	/* reply with the features of whichever session is current now */
+	if (sfw_data.fw_session != NULL)
+		features = sfw_data.fw_session->sn_features;
+ out:
+	reply->msg_ses_feats = features;
+	rpc->srpc_done = sfw_server_rpc_done;
+	spin_lock(&sfw_data.fw_lock);
+
+	if (!sfw_data.fw_shuttingdown)
+		sfw_add_session_timer();
+
+	sfw_data.fw_active_srpc = NULL;
+	spin_unlock(&sfw_data.fw_lock);
+	return rc;
+}
+
+/* Bulk-ready callback of the test service, called with the status of the
+ * bulk transfer for a client add-test request.  On success the request is
+ * resumed through sfw_add_test(); otherwise a negative errno is returned. */
+int sfw_bulk_ready(struct srpc_server_rpc *rpc, int status)
+{
+	struct srpc_service	*sv = rpc->srpc_scd->scd_svc;
+	int			rc;
+
+	LASSERT(rpc->srpc_bulk != NULL);
+	LASSERT(sv->sv_id == SRPC_SERVICE_TEST);
+	LASSERT(sfw_data.fw_active_srpc == NULL);
+	LASSERT(rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst.tsr_is_client);
+
+	spin_lock(&sfw_data.fw_lock);
+	if (status != 0) {
+		CERROR("Bulk transfer failed for RPC: "
+		       "service %s, peer %s, status %d\n",
+		       sv->sv_name, libcfs_id2str(rpc->srpc_peer), status);
+		spin_unlock(&sfw_data.fw_lock);
+		return -EIO;
+	}
+
+	if (sfw_data.fw_shuttingdown) {
+		spin_unlock(&sfw_data.fw_lock);
+		return -ESHUTDOWN;
+	}
+
+	if (sfw_del_session_timer() != 0) {
+		CERROR("Dropping RPC (%s) from %s: racing with expiry timer\n",
+		       sv->sv_name, libcfs_id2str(rpc->srpc_peer));
+		spin_unlock(&sfw_data.fw_lock);
+		return -EAGAIN;
+	}
+
+	sfw_data.fw_active_srpc = rpc;
+	spin_unlock(&sfw_data.fw_lock);
+
+	rc = sfw_add_test(rpc);
+
+	spin_lock(&sfw_data.fw_lock);
+	if (!sfw_data.fw_shuttingdown)
+		sfw_add_session_timer();
+
+	sfw_data.fw_active_srpc = NULL;
+	spin_unlock(&sfw_data.fw_lock);
+	return rc;
+}
+
+/* Get a client RPC for a framework service.  Requests without bulk data
+ * reuse an RPC from the zombie list when one is available; anything else
+ * is newly allocated.  Returns NULL if a new RPC cannot be created. */
+srpc_client_rpc_t *
+sfw_create_rpc(lnet_process_id_t peer, int service,
+	       unsigned features, int nbulkiov, int bulklen,
+	       void (*done)(srpc_client_rpc_t *), void *priv)
+{
+	srpc_client_rpc_t *rpc = NULL;
+
+	spin_lock(&sfw_data.fw_lock);
+
+	LASSERT (!sfw_data.fw_shuttingdown);
+	LASSERT (service <= SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+	if (nbulkiov == 0 && !list_empty(&sfw_data.fw_zombie_rpcs)) {
+		rpc = list_entry(sfw_data.fw_zombie_rpcs.next,
+				 srpc_client_rpc_t, crpc_list);
+		list_del(&rpc->crpc_list);
+		srpc_init_client_rpc(rpc, peer, service, 0, 0,
+				     done, sfw_client_rpc_fini, priv);
+	}
+
+	spin_unlock(&sfw_data.fw_lock);
+
+	if (rpc == NULL)
+		rpc = srpc_create_client_rpc(peer, service, nbulkiov, bulklen,
+					     done, (nbulkiov == 0) ?
+					     sfw_client_rpc_fini : NULL, priv);
+
+	/* "session" is a framework concept, so the features are set here */
+	if (rpc != NULL)
+		rpc->crpc_reqstmsg.msg_ses_feats = features;
+
+	return rpc;
+}
+
+void
+sfw_unpack_message (srpc_msg_t *msg)	/* byte-swap a framework message body in place */
+{
+	if (msg->msg_magic == SRPC_MSG_MAGIC)
+		return; /* sender has our byte order: no flipping needed */
+
+	/* srpc module should guarantee the magic is either native or swabbed */
+	LASSERT (msg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+	if (msg->msg_type == SRPC_MSG_STAT_REQST) { /* NOTE(review): msg_type is tested unswabbed; presumably converted by the caller — confirm */
+		srpc_stat_reqst_t *req = &msg->msg_body.stat_reqst;
+
+		__swab32s(&req->str_type);
+		__swab64s(&req->str_rpyid);
+		sfw_unpack_sid(req->str_sid);	/* field passed by name; presumably a macro swabbing in place — confirm */
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_STAT_REPLY) {
+		srpc_stat_reply_t *rep = &msg->msg_body.stat_reply;
+
+		__swab32s(&rep->str_status);
+		sfw_unpack_sid(rep->str_sid);
+		sfw_unpack_fw_counters(rep->str_fw);
+		sfw_unpack_rpc_counters(rep->str_rpc);
+		sfw_unpack_lnet_counters(rep->str_lnet);
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_MKSN_REQST) {
+		srpc_mksn_reqst_t *req = &msg->msg_body.mksn_reqst;
+
+		__swab64s(&req->mksn_rpyid);
+		__swab32s(&req->mksn_force);
+		sfw_unpack_sid(req->mksn_sid);
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_MKSN_REPLY) {
+		srpc_mksn_reply_t *rep = &msg->msg_body.mksn_reply;
+
+		__swab32s(&rep->mksn_status);
+		__swab32s(&rep->mksn_timeout);
+		sfw_unpack_sid(rep->mksn_sid);
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_RMSN_REQST) {
+		srpc_rmsn_reqst_t *req = &msg->msg_body.rmsn_reqst;
+
+		__swab64s(&req->rmsn_rpyid);
+		sfw_unpack_sid(req->rmsn_sid);
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_RMSN_REPLY) {
+		srpc_rmsn_reply_t *rep = &msg->msg_body.rmsn_reply;
+
+		__swab32s(&rep->rmsn_status);
+		sfw_unpack_sid(rep->rmsn_sid);
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_DEBUG_REQST) {
+		srpc_debug_reqst_t *req = &msg->msg_body.dbg_reqst;
+
+		__swab64s(&req->dbg_rpyid);
+		__swab32s(&req->dbg_flags);
+		sfw_unpack_sid(req->dbg_sid);
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_DEBUG_REPLY) {
+		srpc_debug_reply_t *rep = &msg->msg_body.dbg_reply;
+
+		__swab32s(&rep->dbg_nbatch);
+		__swab32s(&rep->dbg_timeout);
+		sfw_unpack_sid(rep->dbg_sid);
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_BATCH_REQST) {
+		srpc_batch_reqst_t *req = &msg->msg_body.bat_reqst;
+
+		__swab32s(&req->bar_opc);
+		__swab64s(&req->bar_rpyid);
+		__swab32s(&req->bar_testidx);
+		__swab32s(&req->bar_arg);
+		sfw_unpack_sid(req->bar_sid);
+		__swab64s(&req->bar_bid.bat_id);
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_BATCH_REPLY) {
+		srpc_batch_reply_t *rep = &msg->msg_body.bat_reply;
+
+		__swab32s(&rep->bar_status);
+		sfw_unpack_sid(rep->bar_sid);
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_TEST_REQST) { /* the test-specific tsr_u payload is not swabbed here */
+		srpc_test_reqst_t *req = &msg->msg_body.tes_reqst;
+
+		__swab64s(&req->tsr_rpyid);
+		__swab64s(&req->tsr_bulkid);
+		__swab32s(&req->tsr_loop);
+		__swab32s(&req->tsr_ndest);
+		__swab32s(&req->tsr_concur);
+		__swab32s(&req->tsr_service);
+		sfw_unpack_sid(req->tsr_sid);
+		__swab64s(&req->tsr_bid.bat_id);
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_TEST_REPLY) {
+		srpc_test_reply_t *rep = &msg->msg_body.tes_reply;
+
+		__swab32s(&rep->tsr_status);
+		sfw_unpack_sid(rep->tsr_sid);
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_JOIN_REQST) {
+		srpc_join_reqst_t *req = &msg->msg_body.join_reqst;
+
+		__swab64s(&req->join_rpyid);
+		sfw_unpack_sid(req->join_sid);
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_JOIN_REPLY) {
+		srpc_join_reply_t *rep = &msg->msg_body.join_reply;
+
+		__swab32s(&rep->join_status);
+		__swab32s(&rep->join_timeout);
+		sfw_unpack_sid(rep->join_sid);
+		return;
+	}
+
+	LBUG ();	/* reached only for a message type not handled above */
+	return;
+}
+
+/* Abort a framework RPC with -EINTR.  The RPC must still be referenced and
+ * must belong to a framework service. */
+void sfw_abort_rpc (srpc_client_rpc_t *rpc)
+{
+	LASSERT(atomic_read(&rpc->crpc_refcount) > 0);
+	LASSERT(rpc->crpc_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+	spin_lock(&rpc->crpc_lock);
+	srpc_abort_rpc(rpc, -EINTR);
+	spin_unlock(&rpc->crpc_lock);
+}
+
+/* Post a framework RPC using the global rpc_timeout.  The RPC must not be
+ * closed, aborted or on any list, and the framework must not be stopping. */
+void sfw_post_rpc (srpc_client_rpc_t *rpc)
+{
+	spin_lock(&rpc->crpc_lock);
+
+	LASSERT (!rpc->crpc_closed);
+	LASSERT (!rpc->crpc_aborted);
+	LASSERT (list_empty(&rpc->crpc_list));
+	LASSERT (!sfw_data.fw_shuttingdown);
+
+	rpc->crpc_timeout = rpc_timeout;
+	srpc_post_rpc(rpc);
+
+	spin_unlock(&rpc->crpc_lock);
+}
+
+static srpc_service_t sfw_services[] =	/* framework services; sfw_startup() fills in handlers and adds them */
+{
+	{
+		/* sv_id */    SRPC_SERVICE_DEBUG,
+		/* sv_name */  "debug",
+		0	/* NOTE(review): third member is not named here — confirm what it initialises */
+	},
+	{
+		/* sv_id */    SRPC_SERVICE_QUERY_STAT,
+		/* sv_name */  "query stats",
+		0
+	},
+	{
+		/* sv_id */    SRPC_SERVICE_MAKE_SESSION,
+		/* sv_name */  "make session",
+		0
+	},
+	{
+		/* sv_id */    SRPC_SERVICE_REMOVE_SESSION,
+		/* sv_name */  "remove session",
+		0
+	},
+	{
+		/* sv_id */    SRPC_SERVICE_BATCH,
+		/* sv_name */  "batch service",
+		0
+	},
+	{
+		/* sv_id */    SRPC_SERVICE_TEST,
+		/* sv_name */  "test service",
+		0
+	},
+	{	/* sentinel: loops over this table stop at the NULL sv_name */
+		/* sv_id */    0,
+		/* sv_name */  NULL,
+		0
+	}
+};
+
+extern sfw_test_client_ops_t ping_test_client;	/* ping test objects, defined outside this file */
+extern srpc_service_t	ping_test_service;
+extern void ping_init_test_client(void);	/* called by sfw_startup() before registering the test */
+extern void ping_init_test_service(void);
+
+extern sfw_test_client_ops_t brw_test_client;	/* brw test objects, defined outside this file */
+extern srpc_service_t	brw_test_service;
+extern void brw_init_test_client(void);	/* called by sfw_startup() before registering the test */
+extern void brw_init_test_service(void);
+
+
+/* Bring up the selftest framework: validate the timeout parameters, reset
+ * the global state, register the brw and ping tests, and add the services of
+ * every test and of the framework.  Returns -EINVAL for a negative timeout;
+ * otherwise 0, or a service setup error after shutting down again. */
+int sfw_startup (void)
+{
+	srpc_service_t  *sv;
+	sfw_test_case_t *tsc;
+	int		 error = 0;
+	int		 rc;
+
+	if (session_timeout < 0) {
+		CERROR ("Session timeout must be non-negative: %d\n",
+			session_timeout);
+		return -EINVAL;
+	}
+
+	if (rpc_timeout < 0) {
+		CERROR ("RPC timeout must be non-negative: %d\n",
+			rpc_timeout);
+		return -EINVAL;
+	}
+
+	if (session_timeout == 0)
+		CWARN ("Zero session_timeout specified "
+		       "- test sessions never expire.\n");
+
+	if (rpc_timeout == 0)
+		CWARN ("Zero rpc_timeout specified "
+		       "- test RPC never expire.\n");
+
+	memset(&sfw_data, 0, sizeof(struct smoketest_framework));
+
+	sfw_data.fw_session     = NULL;
+	sfw_data.fw_active_srpc = NULL;
+	spin_lock_init(&sfw_data.fw_lock);
+	atomic_set(&sfw_data.fw_nzombies, 0);
+	INIT_LIST_HEAD(&sfw_data.fw_tests);
+	INIT_LIST_HEAD(&sfw_data.fw_zombie_rpcs);
+	INIT_LIST_HEAD(&sfw_data.fw_zombie_sessions);
+
+	/* register the built-in tests */
+	brw_init_test_client();
+	brw_init_test_service();
+	rc = sfw_register_test(&brw_test_service, &brw_test_client);
+	LASSERT (rc == 0);
+
+	ping_init_test_client();
+	ping_init_test_service();
+	rc = sfw_register_test(&ping_test_service, &ping_test_client);
+	LASSERT (rc == 0);
+
+	/* add the service of every registered test ... */
+	list_for_each_entry (tsc, &sfw_data.fw_tests, tsc_list) {
+		sv = tsc->tsc_srv_service;
+
+		rc = srpc_add_service(sv);
+		LASSERT (rc != -EBUSY);
+		if (rc != 0) {
+			CWARN ("Failed to add %s service: %d\n",
+			       sv->sv_name, rc);
+			error = rc;
+		}
+	}
+
+	/* ... then the framework services, up to the NULL-named sentinel */
+	for (sv = sfw_services; sv->sv_name != NULL; sv++) {
+		sv->sv_handler    = sfw_handle_server_rpc;
+		sv->sv_wi_total   = SFW_FRWK_WI_MAX;
+		sv->sv_bulk_ready = (sv->sv_id == SRPC_SERVICE_TEST) ?
+				    sfw_bulk_ready : NULL;
+
+		rc = srpc_add_service(sv);
+		LASSERT (rc != -EBUSY);
+		if (rc != 0) {
+			CWARN ("Failed to add %s service: %d\n",
+			       sv->sv_name, rc);
+			error = rc;
+		}
+
+		/* about to sfw_shutdown, no need to add buffer */
+		if (error)
+			continue;
+
+		rc = srpc_service_add_buffers(sv, sv->sv_wi_total);
+		if (rc != 0) {
+			CWARN("Failed to reserve enough buffers: "
+			      "service %s, %d needed: %d\n",
+			      sv->sv_name, sv->sv_wi_total, rc);
+			error = -ENOMEM;
+		}
+	}
+
+	if (error != 0)
+		sfw_shutdown();
+	return error;
+}
+
+void
+sfw_shutdown (void)	/* quiesce RPCs and sessions, then remove every service and free framework state */
+{
+	srpc_service_t	*sv;
+	sfw_test_case_t	*tsc;
+	int		 i;
+
+	spin_lock(&sfw_data.fw_lock);
+
+	sfw_data.fw_shuttingdown = 1;	/* from now on the RPC handlers return -ESHUTDOWN */
+	lst_wait_until(sfw_data.fw_active_srpc == NULL, sfw_data.fw_lock,	/* NOTE(review): presumably drops fw_lock while waiting — confirm */
+		       "waiting for active RPC to finish.\n");
+
+	if (sfw_del_session_timer() != 0)	/* non-zero: the timer could not be removed, so wait for it to run */
+		lst_wait_until(sfw_data.fw_session == NULL, sfw_data.fw_lock,
+			       "waiting for session timer to explode.\n");
+
+	sfw_deactivate_session();
+	lst_wait_until(atomic_read(&sfw_data.fw_nzombies) == 0,
+		       sfw_data.fw_lock,
+		       "waiting for %d zombie sessions to die.\n",
+		       atomic_read(&sfw_data.fw_nzombies));
+
+	spin_unlock(&sfw_data.fw_lock);
+
+	for (i = 0; ; i++) {	/* framework services, up to the NULL-named sentinel */
+		sv = &sfw_services[i];
+		if (sv->sv_name == NULL)
+			break;
+
+		srpc_shutdown_service(sv);
+		srpc_remove_service(sv);
+	}
+
+	list_for_each_entry (tsc, &sfw_data.fw_tests, tsc_list) {	/* services of the registered tests */
+		sv = tsc->tsc_srv_service;
+		srpc_shutdown_service(sv);
+		srpc_remove_service(sv);
+	}
+
+	while (!list_empty(&sfw_data.fw_zombie_rpcs)) {	/* free the RPCs kept for reuse by sfw_create_rpc() */
+		srpc_client_rpc_t *rpc;
+
+		rpc = list_entry(sfw_data.fw_zombie_rpcs.next,
+				     srpc_client_rpc_t, crpc_list);
+		list_del(&rpc->crpc_list);
+
+		LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc));
+	}
+
+	for (i = 0; ; i++) {	/* second pass: wait until each framework service has shut down */
+		sv = &sfw_services[i];
+		if (sv->sv_name == NULL)
+			break;
+
+		srpc_wait_service_shutdown(sv);
+	}
+
+	while (!list_empty(&sfw_data.fw_tests)) {	/* wait for each test service, then free its test case */
+		tsc = list_entry(sfw_data.fw_tests.next,
+				     sfw_test_case_t, tsc_list);
+
+		srpc_wait_service_shutdown(tsc->tsc_srv_service);
+
+		list_del(&tsc->tsc_list);
+		LIBCFS_FREE(tsc, sizeof(*tsc));
+	}
+
+	return;
+}
diff --git a/drivers/staging/lustre/lnet/selftest/module.c b/drivers/staging/lustre/lnet/selftest/module.c
new file mode 100644
index 000000000000..5257e5630a0e
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/module.c
@@ -0,0 +1,169 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+enum {	/* init progress; lnet_selftest_fini() unwinds from the recorded step */
+	LST_INIT_NONE		= 0,	/* nothing set up */
+	LST_INIT_WI_SERIAL,		/* lst_sched_serial created */
+	LST_INIT_WI_TEST,		/* lst_sched_test array allocated */
+	LST_INIT_RPC,			/* srpc_startup() succeeded */
+	LST_INIT_FW,			/* sfw_startup() succeeded */
+	LST_INIT_CONSOLE		/* lstcon_console_init() succeeded */
+};
+
+extern int lstcon_console_init(void);
+extern int lstcon_console_fini(void);
+
+static int lst_init_step = LST_INIT_NONE;	/* last step completed by lnet_selftest_init() */
+
+struct cfs_wi_sched *lst_sched_serial;	/* the "lst_s" scheduler, created with CFS_CPT_ANY */
+struct cfs_wi_sched **lst_sched_test;	/* "lst_t" schedulers, one slot per CPT of lnet_cpt_table() */
+
+/* Undo lnet_selftest_init(): starting at the step recorded in lst_init_step,
+ * each case falls through so that all earlier steps are torn down as well. */
+void lnet_selftest_fini(void)
+{
+	int	ncpts;
+	int	i;
+
+	switch (lst_init_step) {
+	case LST_INIT_CONSOLE:
+		lstcon_console_fini();		/* fall through */
+	case LST_INIT_FW:
+		sfw_shutdown();			/* fall through */
+	case LST_INIT_RPC:
+		srpc_shutdown();		/* fall through */
+	case LST_INIT_WI_TEST:
+		ncpts = cfs_cpt_number(lnet_cpt_table());
+		for (i = 0; i < ncpts; i++) {
+			/* skip slots whose scheduler was never created */
+			if (lst_sched_test[i] != NULL)
+				cfs_wi_sched_destroy(lst_sched_test[i]);
+		}
+		LIBCFS_FREE(lst_sched_test,
+			    sizeof(lst_sched_test[0]) * ncpts);
+		lst_sched_test = NULL;
+		/* fall through */
+	case LST_INIT_WI_SERIAL:
+		cfs_wi_sched_destroy(lst_sched_serial);
+		lst_sched_serial = NULL;	/* fall through */
+	case LST_INIT_NONE:
+		break;
+	default:
+		LBUG();
+	}
+}
+
+/* Compile-time checks pinning the size and layout of selftest messages. */
+void lnet_selftest_structure_assertion(void)
+{
+	CLASSERT(sizeof(srpc_msg_t) == 160);
+	CLASSERT(sizeof(srpc_test_reqst_t) == 70);
+	CLASSERT(offsetof(srpc_msg_t, msg_body.tes_reqst.tsr_concur) == 72);
+	CLASSERT(offsetof(srpc_msg_t, msg_body.tes_reqst.tsr_ndest) == 78);
+	CLASSERT(sizeof(srpc_stat_reply_t) == 136);
+	CLASSERT(sizeof(srpc_stat_reqst_t) == 28);
+}
+
+/* Module init: create the serial and per-CPT test schedulers, then start the
+ * RPC layer, the framework and the console.  Any failure undoes prior steps. */
+int lnet_selftest_init(void)
+{
+	int	nscheds, rc, i;
+
+	rc = cfs_wi_sched_create("lst_s", lnet_cpt_table(), CFS_CPT_ANY,
+				 1, &lst_sched_serial);
+	if (rc != 0) {
+		CERROR("Failed to create serial WI scheduler for LST\n");
+		return rc;
+	}
+	lst_init_step = LST_INIT_WI_SERIAL;
+
+	nscheds = cfs_cpt_number(lnet_cpt_table());
+	LIBCFS_ALLOC(lst_sched_test, sizeof(lst_sched_test[0]) * nscheds);
+	if (lst_sched_test == NULL) {
+		rc = -ENOMEM;	/* rc is still 0 here; don't report success */
+		goto error;
+	}
+	lst_init_step = LST_INIT_WI_TEST;
+	for (i = 0; i < nscheds; i++) {
+		int nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
+
+		/* reserve at least one CPU for LND */
+		nthrs = max(nthrs - 1, 1);
+		rc = cfs_wi_sched_create("lst_t", lnet_cpt_table(), i,
+					 nthrs, &lst_sched_test[i]);
+		if (rc != 0) {
+			CERROR("Failed to create CPT affinity WI scheduler "
+			       "%d for LST\n", i);
+			goto error;
+		}
+	}
+
+	rc = srpc_startup();
+	if (rc != 0) {
+		CERROR("LST can't startup rpc\n");
+		goto error;
+	}
+	lst_init_step = LST_INIT_RPC;
+
+	rc = sfw_startup();
+	if (rc != 0) {
+		CERROR("LST can't startup framework\n");
+		goto error;
+	}
+	lst_init_step = LST_INIT_FW;
+
+	rc = lstcon_console_init();
+	if (rc != 0) {
+		CERROR("LST can't startup console\n");
+		goto error;
+	}
+	lst_init_step = LST_INIT_CONSOLE;
+	return 0;
+error:
+	lnet_selftest_fini();
+	return rc;
+}
+
+
+MODULE_DESCRIPTION("LNet Selftest");
+MODULE_LICENSE("GPL");
+
+cfs_module(lnet, "0.9.0", lnet_selftest_init, lnet_selftest_fini);	/* presumably registers these as the module's init/exit hooks — confirm */
diff --git a/drivers/staging/lustre/lnet/selftest/ping_test.c b/drivers/staging/lustre/lnet/selftest/ping_test.c
new file mode 100644
index 000000000000..f0f919482b56
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/ping_test.c
@@ -0,0 +1,229 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/ping_test.c
+ *
+ * Test client & Server
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#include "selftest.h"
+
+#define LST_PING_TEST_MAGIC     0xbabeface
+
+int ping_srv_workitems = SFW_TEST_WI_MAX;
+CFS_MODULE_PARM(ping_srv_workitems, "i", int, 0644, "# PING server workitems");
+
+typedef struct {
+	spinlock_t	pnd_lock;	/* protects pnd_counter */
+	int		pnd_counter;	/* next ping sequence number to hand out */
+} lst_ping_data_t;
+
+static lst_ping_data_t  lst_ping_data;
+
+/* tso_init hook: (re)initialise the shared ping lock and counter; returns 0. */
+static int ping_client_init(sfw_test_instance_t *tsi)
+{
+	sfw_session_t	*sn = tsi->tsi_batch->bat_session;
+
+	LASSERT(tsi->tsi_is_client);
+	LASSERT(sn != NULL && (sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+	lst_ping_data.pnd_counter = 0;
+	spin_lock_init(&lst_ping_data.pnd_lock);
+
+	return 0;
+}
+
+static void
+ping_client_fini (sfw_test_instance_t *tsi)
+{
+	sfw_session_t *sn = tsi->tsi_batch->bat_session;
+	int	    errors;
+
+	LASSERT (sn != NULL);
+	LASSERT (tsi->tsi_is_client);
+
+	errors = atomic_read(&sn->sn_ping_errors);
+	if (errors)
+		CWARN ("%d pings have failed.\n", errors);
+	else
+		CDEBUG (D_NET, "Ping test finished OK.\n");
+}
+
+/* tso_prep_rpc hook: create a test RPC to @dest and fill in the ping request
+ * (magic, next sequence number, current time); returns 0 or the create error. */
+static int ping_client_prep_rpc(sfw_test_unit_t *tsu, lnet_process_id_t dest,
+				srpc_client_rpc_t **rpc)
+{
+	sfw_test_instance_t *tsi = tsu->tsu_instance;
+	sfw_session_t       *sn  = tsi->tsi_batch->bat_session;
+	srpc_ping_reqst_t   *ping;
+	struct timeval       now;
+	int		     rc;
+
+	LASSERT(sn != NULL);
+	LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+	rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, 0, 0, rpc);
+	if (rc)
+		return rc;
+
+	ping = &(*rpc)->crpc_reqstmsg.msg_body.ping_reqst;
+	ping->pnr_magic = LST_PING_TEST_MAGIC;
+
+	spin_lock(&lst_ping_data.pnd_lock);
+	ping->pnr_seq = lst_ping_data.pnd_counter++;
+	spin_unlock(&lst_ping_data.pnd_lock);
+
+	cfs_fs_timeval(&now);
+	ping->pnr_time_sec  = now.tv_sec;
+	ping->pnr_time_usec = now.tv_usec;
+
+	return 0;
+}
+
+/* tso_done_rpc hook: check a finished ping RPC (status, reply magic, sequence),
+ * count failures in sn_ping_errors and log the elapsed time on success. */
+static void ping_client_done_rpc(sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc)
+{
+	sfw_test_instance_t *tsi = tsu->tsu_instance;
+	sfw_session_t       *sn = tsi->tsi_batch->bat_session;
+	srpc_ping_reqst_t   *reqst = &rpc->crpc_reqstmsg.msg_body.ping_reqst;
+	srpc_ping_reply_t   *reply = &rpc->crpc_replymsg.msg_body.ping_reply;
+	struct timeval       now;
+
+	LASSERT(sn != NULL);
+
+	if (rpc->crpc_status != 0) {
+		if (!tsi->tsi_stopping) /* rpc could have been aborted */
+			atomic_inc(&sn->sn_ping_errors);
+		CERROR("Unable to ping %s (%d): %d\n",
+		       libcfs_id2str(rpc->crpc_dest),
+		       reqst->pnr_seq, rpc->crpc_status);
+		return;
+	}
+
+	if (rpc->crpc_replymsg.msg_magic != SRPC_MSG_MAGIC) {
+		__swab32s(&reply->pnr_seq);
+		__swab32s(&reply->pnr_magic);
+		__swab32s(&reply->pnr_status);
+	}
+
+	if (reply->pnr_magic != LST_PING_TEST_MAGIC) {
+		rpc->crpc_status = -EBADMSG;
+		atomic_inc(&sn->sn_ping_errors);
+		CERROR("Bad magic %u from %s, %u expected.\n",
+		       reply->pnr_magic, libcfs_id2str(rpc->crpc_dest),
+		       LST_PING_TEST_MAGIC);
+		return;
+	}
+
+	if (reply->pnr_seq != reqst->pnr_seq) {
+		rpc->crpc_status = -EBADMSG;
+		atomic_inc(&sn->sn_ping_errors);
+		CERROR("Bad seq %u from %s, %u expected.\n",
+		       reply->pnr_seq, libcfs_id2str(rpc->crpc_dest),
+		       reqst->pnr_seq);
+		return;
+	}
+
+	cfs_fs_timeval(&now);
+	CDEBUG(D_NET, "%d reply in %u usec\n", reply->pnr_seq,
+	       (unsigned)((now.tv_sec - (unsigned)reqst->pnr_time_sec) * 1000000
+			  + (now.tv_usec - reqst->pnr_time_usec)));
+}
+
+/* PING service handler: byte-swap the request if the sender's magic is
+ * swabbed, verify the ping magic (-EINVAL if wrong) and echo the sequence. */
+static int ping_server_handle(struct srpc_server_rpc *rpc)
+{
+	struct srpc_service	*sv = rpc->srpc_scd->scd_svc;
+	srpc_msg_t		*reqstmsg = &rpc->srpc_reqstbuf->buf_msg;
+	srpc_msg_t		*replymsg = &rpc->srpc_replymsg;
+	srpc_ping_reqst_t	*req = &reqstmsg->msg_body.ping_reqst;
+	srpc_ping_reply_t	*rep = &replymsg->msg_body.ping_reply;
+
+	LASSERT(sv->sv_id == SRPC_SERVICE_PING);
+
+	if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) {
+		LASSERT(reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+		__swab32s(&req->pnr_seq);
+		__swab32s(&req->pnr_magic);
+		__swab64s(&req->pnr_time_sec);
+		__swab64s(&req->pnr_time_usec);
+	}
+	LASSERT(reqstmsg->msg_type == srpc_service2request(sv->sv_id));
+
+	if (req->pnr_magic != LST_PING_TEST_MAGIC) {
+		CERROR("Unexpect magic %08x from %s\n",
+		       req->pnr_magic, libcfs_id2str(rpc->srpc_peer));
+		return -EINVAL;
+	}
+
+	rep->pnr_magic = LST_PING_TEST_MAGIC;
+	rep->pnr_seq   = req->pnr_seq;
+
+	if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+		replymsg->msg_ses_feats = LST_FEATS_MASK;
+		rep->pnr_status = EPROTO;
+		return 0;
+	}
+
+	replymsg->msg_ses_feats = reqstmsg->msg_ses_feats;
+
+	CDEBUG(D_NET, "Get ping %d from %s\n",
+	       req->pnr_seq, libcfs_id2str(rpc->srpc_peer));
+	return 0;
+}
+
+sfw_test_client_ops_t ping_test_client;	/* filled in by ping_init_test_client() */
+void ping_init_test_client(void)
+{
+	ping_test_client.tso_init     = ping_client_init;
+	ping_test_client.tso_fini     = ping_client_fini;
+	ping_test_client.tso_prep_rpc = ping_client_prep_rpc;
+	ping_test_client.tso_done_rpc = ping_client_done_rpc;
+}
+
+srpc_service_t ping_test_service;	/* filled in by ping_init_test_service() */
+void ping_init_test_service(void)
+{
+	ping_test_service.sv_id       = SRPC_SERVICE_PING;
+	ping_test_service.sv_name     = "ping_test";
+	ping_test_service.sv_handler  = ping_server_handle;
+	ping_test_service.sv_wi_total = ping_srv_workitems;
+}
diff --git a/drivers/staging/lustre/lnet/selftest/rpc.c b/drivers/staging/lustre/lnet/selftest/rpc.c
new file mode 100644
index 000000000000..91d83f4b746e
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/rpc.c
@@ -0,0 +1,1665 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/rpc.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ *
+ * 2012-05-13: Liang Zhen <liang@whamcloud.com>
+ * - percpt data for service to improve smp performance
+ * - code cleanup
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+typedef enum {
+	SRPC_STATE_NONE,
+	SRPC_STATE_NI_INIT,
+	SRPC_STATE_EQ_INIT,
+	SRPC_STATE_RUNNING,	/* srpc_add_service() asserts this state */
+	SRPC_STATE_STOPPING,
+} srpc_state_t;
+
+struct smoketest_rpc {
+	spinlock_t	 rpc_glock;	/* global lock: services, counters, matchbits */
+	srpc_service_t	*rpc_services[SRPC_SERVICE_MAX_ID + 1]; /* by sv_id */
+	lnet_handle_eq_t rpc_lnet_eq;	/* _the_ LNet event queue */
+	srpc_state_t	 rpc_state;
+	srpc_counters_t	 rpc_counters;	/* copied in/out under rpc_glock */
+	__u64		 rpc_matchbits;	/* next value srpc_next_id() returns */
+} srpc_data;
+
+/* Map a service id to its request portal (framework vs. other services). */
+static inline int srpc_serv_portal(int svc_id)
+{
+	return svc_id >= SRPC_FRAMEWORK_SERVICE_MAX_ID ?
+	       SRPC_REQUEST_PORTAL : SRPC_FRAMEWORK_REQUEST_PORTAL;
+}
+
+/* forward ref's */
+int srpc_handle_rpc (swi_workitem_t *wi);
+
+void srpc_get_counters (srpc_counters_t *cnt)
+{
+	spin_lock(&srpc_data.rpc_glock);
+	*cnt = srpc_data.rpc_counters;	/* copy out under rpc_glock */
+	spin_unlock(&srpc_data.rpc_glock);
+}
+
+void srpc_set_counters (const srpc_counters_t *cnt)
+{
+	spin_lock(&srpc_data.rpc_glock);
+	srpc_data.rpc_counters = *cnt;	/* overwrite all counters under rpc_glock */
+	spin_unlock(&srpc_data.rpc_glock);
+}
+
+/* Store @pg as iov @i of @bk; returns the byte count used (at most one page). */
+int srpc_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i, int nob)
+{
+	nob = min(nob, (int)PAGE_CACHE_SIZE);
+
+	LASSERT(nob > 0);
+	LASSERT(i >= 0 && i < bk->bk_niov);
+
+	bk->bk_iovs[i].kiov_page   = pg;
+	bk->bk_iovs[i].kiov_offset = 0;
+	bk->bk_iovs[i].kiov_len    = nob;
+	return nob;
+}
+
+/* Free the pages held by @bk (stopping at the first empty slot), then @bk. */
+void srpc_free_bulk(srpc_bulk_t *bk)
+{
+	struct page	*pg;
+	int		 i;
+
+	LASSERT(bk != NULL);
+
+	for (i = 0; i < bk->bk_niov; i++) {
+		pg = bk->bk_iovs[i].kiov_page;
+		if (pg == NULL)
+			break;
+
+		__free_page(pg);
+	}
+
+	LIBCFS_FREE(bk, offsetof(srpc_bulk_t, bk_iovs[bk->bk_niov]));
+}
+
+/* Allocate a bulk descriptor plus @bulk_npg pages (covering @bulk_len bytes)
+ * on partition @cpt; returns NULL, with nothing leaked, if allocation fails. */
+srpc_bulk_t *
+srpc_alloc_bulk(int cpt, unsigned bulk_npg, unsigned bulk_len, int sink)
+{
+	srpc_bulk_t  *bk;
+	int	      i;
+
+	LASSERT(bulk_npg > 0 && bulk_npg <= LNET_MAX_IOV);
+
+	LIBCFS_CPT_ALLOC(bk, lnet_cpt_table(), cpt,
+			 offsetof(srpc_bulk_t, bk_iovs[bulk_npg]));
+	if (bk == NULL) {
+		CERROR("Can't allocate descriptor for %d pages\n", bulk_npg);
+		return NULL;
+	}
+
+	memset(bk, 0, offsetof(srpc_bulk_t, bk_iovs[bulk_npg]));
+	bk->bk_sink   = sink;
+	bk->bk_len    = bulk_len;
+	bk->bk_niov   = bulk_npg;
+
+	for (i = 0; i < bulk_npg; i++) {
+		struct page *pg;
+		int	    nob;
+
+		pg = cfs_page_cpt_alloc(lnet_cpt_table(), cpt, GFP_IOFS);
+		if (pg == NULL) {
+			CERROR("Can't allocate page %d of %d\n", i, bulk_npg);
+			srpc_free_bulk(bk);	/* frees the pages added so far */
+			return NULL;
+		}
+
+		nob = srpc_add_bulk_page(bk, pg, i, bulk_len);
+		bulk_len -= nob;
+	}
+
+	return bk;
+}
+
+/* Hand out the next matchbits value; the counter is bumped under rpc_glock. */
+static inline __u64 srpc_next_id(void)
+{
+	__u64 next;
+
+	spin_lock(&srpc_data.rpc_glock);
+	next = srpc_data.rpc_matchbits++;
+	spin_unlock(&srpc_data.rpc_glock);
+	return next;
+}
+
+/* Zero @rpc and bind it to partition @scd and request buffer @buffer. */
+void srpc_init_server_rpc(struct srpc_server_rpc *rpc,
+			  struct srpc_service_cd *scd,
+			  struct srpc_buffer *buffer)
+{
+	memset(rpc, 0, sizeof(*rpc));
+	swi_init_workitem(&rpc->srpc_wi, rpc, srpc_handle_rpc,
+			  srpc_serv_is_framework(scd->scd_svc) ?
+			  lst_sched_serial : lst_sched_test[scd->scd_cpt]);
+
+	rpc->srpc_ev.ev_fired = 1; /* no event expected now */
+
+	rpc->srpc_scd      = scd;
+	rpc->srpc_reqstbuf = buffer;
+	rpc->srpc_self     = buffer->buf_self;
+	rpc->srpc_peer     = buffer->buf_peer;
+	LNetInvalidateHandle(&rpc->srpc_replymdh);
+}
+
+/* Free every posted/blocked buffer and every free-listed RPC descriptor of
+ * each partition, then the per-CPT data; asserts that no RPC is active. */
+static void srpc_service_fini(struct srpc_service *svc)
+{
+	struct srpc_service_cd	*scd;
+	struct srpc_server_rpc	*rpc;
+	struct srpc_buffer	*buf;
+	struct list_head	*q;
+	int			i;
+
+	if (svc->sv_cpt_data == NULL)
+		return;
+
+	cfs_percpt_for_each(scd, i, svc->sv_cpt_data) {
+		/* drain the posted queue first, then the blocked one */
+		while (1) {
+			if (!list_empty(&scd->scd_buf_posted))
+				q = &scd->scd_buf_posted;
+			else if (!list_empty(&scd->scd_buf_blocked))
+				q = &scd->scd_buf_blocked;
+			else
+				break;
+
+			while (!list_empty(q)) {
+				buf = list_entry(q->next, struct srpc_buffer,
+						 buf_list);
+				list_del(&buf->buf_list);
+				LIBCFS_FREE(buf, sizeof(*buf));
+			}
+		}
+
+		LASSERT(list_empty(&scd->scd_rpc_active));
+
+		while (!list_empty(&scd->scd_rpc_free)) {
+			rpc = list_entry(scd->scd_rpc_free.next,
+					 struct srpc_server_rpc, srpc_list);
+			list_del(&rpc->srpc_list);
+			LIBCFS_FREE(rpc, sizeof(*rpc));
+		}
+	}
+
+	cfs_percpt_free(svc->sv_cpt_data);
+	svc->sv_cpt_data = NULL;
+}
+
+/* RPCs per partition: sv_wi_total / sv_ncpts, raised to the service minimum. */
+static int srpc_service_nrpcs(struct srpc_service *svc)
+{
+	int share = svc->sv_wi_total / svc->sv_ncpts;
+
+	return srpc_serv_is_framework(svc) ?
+	       max(share, SFW_FRWK_WI_MIN) : max(share, SFW_TEST_WI_MIN);
+}
+
+int srpc_add_buffer(struct swi_workitem *wi);
+
+/* Allocate and initialise the per-CPT state of @svc plus its pool of free RPC
+ * descriptors.  Returns 0, or -ENOMEM after undoing any partial setup. */
+static int srpc_service_init(struct srpc_service *svc)
+{
+	struct srpc_service_cd	*scd;
+	struct srpc_server_rpc	*rpc;
+	int			nrpcs;
+	int			i;
+	int			j;
+
+	svc->sv_shuttingdown = 0;
+
+	svc->sv_cpt_data = cfs_percpt_alloc(lnet_cpt_table(),
+					    sizeof(struct srpc_service_cd));
+	if (svc->sv_cpt_data == NULL)
+		return -ENOMEM;
+
+	svc->sv_ncpts = srpc_serv_is_framework(svc) ?
+			1 : cfs_cpt_number(lnet_cpt_table());
+	nrpcs = srpc_service_nrpcs(svc);
+
+	/* Initialise every partition before allocating any RPC, so that the
+	 * srpc_service_fini() error path only ever walks valid list heads */
+	cfs_percpt_for_each(scd, i, svc->sv_cpt_data) {
+		scd->scd_cpt = i;
+		scd->scd_svc = svc;
+		spin_lock_init(&scd->scd_lock);
+		INIT_LIST_HEAD(&scd->scd_rpc_free);
+		INIT_LIST_HEAD(&scd->scd_rpc_active);
+		INIT_LIST_HEAD(&scd->scd_buf_posted);
+		INIT_LIST_HEAD(&scd->scd_buf_blocked);
+
+		scd->scd_ev.ev_data = scd;
+		scd->scd_ev.ev_type = SRPC_REQUEST_RCVD;
+		/* NB: don't use lst_sched_serial for adding buffer,
+		 * see details in srpc_service_add_buffers() */
+		swi_init_workitem(&scd->scd_buf_wi, scd,
+				  srpc_add_buffer, lst_sched_test[i]);
+	}
+
+	cfs_percpt_for_each(scd, i, svc->sv_cpt_data) {
+		/* NB: a framework service only uses the first partition */
+		if (i != 0 && srpc_serv_is_framework(svc))
+			continue;
+
+		for (j = 0; j < nrpcs; j++) {
+			LIBCFS_CPT_ALLOC(rpc, lnet_cpt_table(),
+					 i, sizeof(*rpc));
+			if (rpc == NULL) {
+				srpc_service_fini(svc);
+				return -ENOMEM;
+			}
+			list_add(&rpc->srpc_list, &scd->scd_rpc_free);
+		}
+	}
+
+	return 0;
+}
+
+/* Set up per-CPT state for @sv and register it in the global service
+ * table, keyed by sv_id.  The RPC layer must already be running.
+ * Returns 0 on success, -ENOMEM if srpc_service_init() fails, or -EBUSY
+ * (after undoing the init) when another service already owns that id. */
+int srpc_add_service(struct srpc_service *sv)
+{
+	int id = sv->sv_id;
+
+	LASSERT(0 <= id && id <= SRPC_SERVICE_MAX_ID);
+
+	if (srpc_service_init(sv) != 0)
+		return -ENOMEM;
+
+	spin_lock(&srpc_data.rpc_glock);
+
+	LASSERT(srpc_data.rpc_state == SRPC_STATE_RUNNING);
+
+	if (srpc_data.rpc_services[id] != NULL) {
+		spin_unlock(&srpc_data.rpc_glock);
+		srpc_service_fini(sv);
+		return -EBUSY;
+	}
+
+	srpc_data.rpc_services[id] = sv;
+	spin_unlock(&srpc_data.rpc_glock);
+
+	CDEBUG(D_NET, "Adding service: id %d, name %s\n", id, sv->sv_name);
+	return 0;
+}
+
+/* Unregister @sv; returns -ENOENT if it is not the service stored at its id. */
+int srpc_remove_service(srpc_service_t *sv)
+{
+	int id = sv->sv_id;
+	int rc = -ENOENT;
+
+	spin_lock(&srpc_data.rpc_glock);
+
+	if (srpc_data.rpc_services[id] == sv) {
+		srpc_data.rpc_services[id] = NULL;
+		rc = 0;
+	}
+
+	spin_unlock(&srpc_data.rpc_glock);
+	return rc;
+}
+
+/* Attach an ME and a threshold-1 MD for @buf on @portal, matching @matchbits
+ * from @peer.  Returns 0, or -ENOMEM if either attach fails. */
+int srpc_post_passive_rdma(int portal, int local, __u64 matchbits, void *buf,
+			   int len, int options, lnet_process_id_t peer,
+			   lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+	lnet_handle_me_t meh;
+	lnet_md_t	 md;
+	int		 rc;
+
+	rc = LNetMEAttach(portal, peer, matchbits, 0, LNET_UNLINK,
+			  local ? LNET_INS_LOCAL : LNET_INS_AFTER, &meh);
+	if (rc) {
+		CERROR("LNetMEAttach failed: %d\n", rc);
+		LASSERT(rc == -ENOMEM);
+		return -ENOMEM;
+	}
+
+	md.start     = buf;
+	md.length    = len;
+	md.options   = options;
+	md.threshold = 1;
+	md.user_ptr  = ev;
+	md.eq_handle = srpc_data.rpc_lnet_eq;
+
+	rc = LNetMDAttach(meh, md, LNET_UNLINK, mdh);
+	if (rc) {
+		CERROR("LNetMDAttach failed: %d\n", rc);
+		LASSERT(rc == -ENOMEM);
+		rc = LNetMEUnlink(meh);
+		LASSERT(rc == 0);
+		return -ENOMEM;
+	}
+
+	CDEBUG(D_NET,
+	       "Posted passive RDMA: peer %s, portal %d, matchbits "LPX64"\n",
+	       libcfs_id2str(peer), portal, matchbits);
+	return 0;
+}
+
+/* Bind an MD for @buf and start a PUT or GET (per @options) towards @peer.
+ * Returns -ENOMEM if the bind fails; otherwise 0, even if the PUT/GET fails. */
+int srpc_post_active_rdma(int portal, __u64 matchbits, void *buf, int len,
+			  int options, lnet_process_id_t peer, lnet_nid_t self,
+			  lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+	lnet_md_t md;
+	int       rc;
+
+	md.start     = buf;
+	md.length    = len;
+	md.user_ptr  = ev;
+	md.eq_handle = srpc_data.rpc_lnet_eq;
+	md.options   = options & ~(LNET_MD_OP_PUT | LNET_MD_OP_GET);
+	md.threshold = ((options & LNET_MD_OP_GET) != 0) ? 2 : 1;
+
+	rc = LNetMDBind(md, LNET_UNLINK, mdh);
+	if (rc) {
+		CERROR("LNetMDBind failed: %d\n", rc);
+		LASSERT(rc == -ENOMEM);
+		return -ENOMEM;
+	}
+
+	/* this is kind of an abuse of the LNET_MD_OP_{PUT,GET} options.
+	 * they're only meaningful for MDs attached to an ME (i.e. passive
+	 * buffers... */
+	if ((options & LNET_MD_OP_PUT) != 0) {
+		rc = LNetPut(self, *mdh, LNET_NOACK_REQ, peer,
+			     portal, matchbits, 0, 0);
+	} else {
+		LASSERT((options & LNET_MD_OP_GET) != 0);
+
+		rc = LNetGet(self, *mdh, peer, portal, matchbits, 0);
+	}
+
+	if (rc) {
+		CERROR("LNet%s(%s, %d, "LPD64") failed: %d\n",
+		       ((options & LNET_MD_OP_PUT) != 0) ? "Put" : "Get",
+		       libcfs_id2str(peer), portal, matchbits, rc);
+
+		/* The forthcoming unlink event will complete this operation
+		 * with failure, so fall through and return success here. */
+		rc = LNetMDUnlink(*mdh);
+		LASSERT(rc == 0);
+	} else {
+		CDEBUG(D_NET,
+		       "Posted active RDMA: peer %s, portal %u, matchbits "LPX64"\n",
+		       libcfs_id2str(peer), portal, matchbits);
+	}
+	return 0;
+}
+
+/* PUT @buf to @service on @peer; the service id doubles as the matchbits. */
+int srpc_post_active_rqtbuf(lnet_process_id_t peer, int service, void *buf,
+			    int len, lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+	return srpc_post_active_rdma(srpc_serv_portal(service), service,
+				     buf, len, LNET_MD_OP_PUT, peer,
+				     LNET_NID_ANY, mdh, ev);
+}
+
+/* Post @buf to receive a request for @service from any nid/pid. */
+int srpc_post_passive_rqtbuf(int service, int local, void *buf, int len,
+			     lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+	lnet_process_id_t anyone = {0};
+
+	anyone.nid = LNET_NID_ANY;
+	anyone.pid = LNET_PID_ANY;
+
+	return srpc_post_passive_rdma(srpc_serv_portal(service),
+				      local, service, buf, len,
+				      LNET_MD_OP_PUT, anyone, mdh, ev);
+}
+
+/* Queue @buf on scd_buf_posted and post it to LNet.  Called with scd_lock
+ * held; the lock is dropped around the LNet calls and re-taken before return.
+ * Returns 0 or the posting error (then @buf is freed unless shutting down). */
+int
+srpc_service_post_buffer(struct srpc_service_cd *scd, struct srpc_buffer *buf)
+{
+	struct srpc_service	*sv = scd->scd_svc;
+	struct srpc_msg		*msg = &buf->buf_msg;
+	int			rc;
+
+	LNetInvalidateHandle(&buf->buf_mdh);
+	list_add(&buf->buf_list, &scd->scd_buf_posted);
+	scd->scd_buf_nposted++;
+	spin_unlock(&scd->scd_lock);
+
+	rc = srpc_post_passive_rqtbuf(sv->sv_id,
+				      !srpc_serv_is_framework(sv),
+				      msg, sizeof(*msg), &buf->buf_mdh,
+				      &scd->scd_ev);
+
+	/* At this point, a RPC (new or delayed) may have arrived in
+	 * msg and its event handler has been called. So we must add
+	 * buf to scd_buf_posted _before_ dropping scd_lock */
+	spin_lock(&scd->scd_lock);
+
+	if (rc == 0) {
+		if (!sv->sv_shuttingdown)
+			return 0;
+
+		spin_unlock(&scd->scd_lock);
+		/* srpc_shutdown_service might have tried to unlink me
+		 * when my buf_mdh was still invalid */
+		LNetMDUnlink(buf->buf_mdh);
+		spin_lock(&scd->scd_lock);
+		return 0;
+	}
+
+	scd->scd_buf_nposted--;
+	if (sv->sv_shuttingdown)
+		return rc; /* don't allow to change scd_buf_posted */
+
+	list_del(&buf->buf_list);
+	spin_unlock(&scd->scd_lock);
+	LIBCFS_FREE(buf, sizeof(*buf));
+	spin_lock(&scd->scd_lock);
+	return rc;
+}
+
+/* Work item handler: post new request buffers until scd_buf_adjust is used up
+ * or the service shuts down.  Failures are recorded in scd_buf_err; returns 0. */
+int
+srpc_add_buffer(struct swi_workitem *wi)
+{
+	struct srpc_service_cd	*scd = wi->swi_workitem.wi_data;
+	struct srpc_buffer	*buf;
+	int			rc = 0;
+
+	/* it's called by workitem scheduler threads, these threads
+	 * should have been set CPT affinity, so buffers will be posted
+	 * on CPT local list of Portal */
+	spin_lock(&scd->scd_lock);
+
+	while (scd->scd_buf_adjust > 0 &&
+	       !scd->scd_svc->sv_shuttingdown) {
+		scd->scd_buf_adjust--; /* consume it */
+		scd->scd_buf_posting++;
+		spin_unlock(&scd->scd_lock);
+
+		LIBCFS_ALLOC(buf, sizeof(*buf));
+		if (buf == NULL) {
+			CERROR("Failed to add new buf to service: %s\n",
+			       scd->scd_svc->sv_name);
+			spin_lock(&scd->scd_lock);
+			rc = -ENOMEM;
+			break;
+		}
+
+		spin_lock(&scd->scd_lock);
+		if (scd->scd_svc->sv_shuttingdown) {
+			spin_unlock(&scd->scd_lock);
+			LIBCFS_FREE(buf, sizeof(*buf));
+			spin_lock(&scd->scd_lock);
+			rc = -ESHUTDOWN;
+			break;
+		}
+
+		rc = srpc_service_post_buffer(scd, buf);
+		if (rc != 0)
+			break; /* buf is freed or left on scd_buf_posted */
+
+		LASSERT(scd->scd_buf_posting > 0);
+		scd->scd_buf_posting--;
+		scd->scd_buf_total++;
+		scd->scd_buf_low = MAX(2, scd->scd_buf_total / 4);
+	}
+
+	if (rc != 0) {
+		scd->scd_buf_err_stamp = cfs_time_current_sec();
+		scd->scd_buf_err = rc;
+
+		LASSERT(scd->scd_buf_posting > 0);
+		scd->scd_buf_posting--;
+	}
+
+	spin_unlock(&scd->scd_lock);
+	return 0;
+}
+
+/* Schedule the buffer-posting work item on each partition of @sv (only the
+ * first one for a framework service), wait, and return 0 or the first error. */
+int
+srpc_service_add_buffers(struct srpc_service *sv, int nbuffer)
+{
+	struct srpc_service_cd	*scd;
+	int			rc = 0;
+	int			i;
+
+	LASSERTF(nbuffer > 0, "nbuffer must be positive: %d\n", nbuffer);
+
+	cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+		spin_lock(&scd->scd_lock);
+
+		scd->scd_buf_err = 0;
+		scd->scd_buf_err_stamp = 0;
+		scd->scd_buf_posting = 0;
+		scd->scd_buf_adjust = nbuffer;
+		/* start to post buffers */
+		swi_schedule_workitem(&scd->scd_buf_wi);
+		spin_unlock(&scd->scd_lock);
+
+		/* framework service only post buffer for one partition  */
+		if (srpc_serv_is_framework(sv))
+			break;
+	}
+
+	cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+		spin_lock(&scd->scd_lock);
+		/* NB: srpc_service_add_buffers() can be called inside
+		 * thread context of lst_sched_serial, and we don't normally
+		 * allow to sleep inside thread context of WI scheduler
+		 * because it will block current scheduler thread from doing
+		 * anything else, even worse, it could deadlock if it's
+		 * waiting on result from another WI of the same scheduler.
+		 * However, it's safe at here because scd_buf_wi is scheduled
+		 * by thread in a different WI scheduler (lst_sched_test),
+		 * so we don't have any risk of deadlock, though this could
+		 * block all WIs pending on lst_sched_serial for a moment
+		 * which is not good but not fatal. */
+		lst_wait_until(scd->scd_buf_err != 0 ||
+			       (scd->scd_buf_adjust == 0 &&
+				scd->scd_buf_posting == 0),
+			       scd->scd_lock, "waiting for adding buffer\n");
+
+		if (scd->scd_buf_err != 0 && rc == 0)
+			rc = scd->scd_buf_err;
+
+		spin_unlock(&scd->scd_lock);
+	}
+
+	return rc;
+}
+
+/* Lower scd_buf_adjust on every partition by min(@nbuffer, buffers it owns). */
+void srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer)
+{
+	struct srpc_service_cd	*scd;
+	int			owned;
+	int			i;
+
+	LASSERT(!sv->sv_shuttingdown);
+
+	cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+		spin_lock(&scd->scd_lock);
+
+		owned = scd->scd_buf_total + scd->scd_buf_posting;
+		scd->scd_buf_adjust -= min(nbuffer, owned);
+
+		spin_unlock(&scd->scd_lock);
+	}
+}
+
+/* returns 1 if sv has finished (its per-CPT state is then freed), otherwise 0 */
+int srpc_finish_service(struct srpc_service *sv)
+{
+	struct srpc_service_cd	*scd;
+	struct srpc_server_rpc	*rpc;
+	int			i;
+
+	LASSERT(sv->sv_shuttingdown); /* srpc_shutdown_service called */
+
+	cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+		spin_lock(&scd->scd_lock);
+		if (!swi_deschedule_workitem(&scd->scd_buf_wi)) {
+			spin_unlock(&scd->scd_lock); /* don't leak the lock */
+			return 0;
+		}
+
+		if (scd->scd_buf_nposted > 0) {
+			CDEBUG(D_NET, "waiting for %d posted buffers to unlink\n",
+			       scd->scd_buf_nposted);
+			spin_unlock(&scd->scd_lock);
+			return 0;
+		}
+
+		if (list_empty(&scd->scd_rpc_active)) {
+			spin_unlock(&scd->scd_lock);
+			continue;
+		}
+
+		rpc = list_entry(scd->scd_rpc_active.next,
+				 struct srpc_server_rpc, srpc_list);
+		CNETERR("Active RPC %p on shutdown: sv %s, peer %s, wi %s scheduled %d "
+			"running %d, ev fired %d type %d status %d lnet %d\n",
+			rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer),
+			swi_state2str(rpc->srpc_wi.swi_state),
+			rpc->srpc_wi.swi_workitem.wi_scheduled,
+			rpc->srpc_wi.swi_workitem.wi_running,
+			rpc->srpc_ev.ev_fired, rpc->srpc_ev.ev_type,
+			rpc->srpc_ev.ev_status, rpc->srpc_ev.ev_lnet);
+		spin_unlock(&scd->scd_lock);
+		return 0;
+	}
+
+	/* no lock needed from now on */
+	srpc_service_fini(sv);
+	return 1;
+}
+
+/* called with scd->scd_lock held; the lock may be dropped and re-taken */
+void
+srpc_service_recycle_buffer(struct srpc_service_cd *scd, srpc_buffer_t *buf)
+{
+	if (!scd->scd_svc->sv_shuttingdown && scd->scd_buf_adjust >= 0) {
+		if (srpc_service_post_buffer(scd, buf) != 0) { /* re-post it */
+			CWARN("Failed to post %s buffer\n",
+			      scd->scd_svc->sv_name);
+		}
+		return;
+	}
+
+	/* shutting down, or buffers are being removed: free this one */
+	scd->scd_buf_total--;
+
+	if (scd->scd_buf_adjust < 0) {
+		scd->scd_buf_adjust++;	/* one pending removal satisfied */
+		if (scd->scd_buf_adjust < 0 &&
+		    scd->scd_buf_total == 0 && scd->scd_buf_posting == 0) {
+			CDEBUG(D_INFO,
+			       "Try to recyle %d buffers but nothing left\n",
+			       scd->scd_buf_adjust);
+			scd->scd_buf_adjust = 0;
+		}
+	}
+
+	spin_unlock(&scd->scd_lock);
+	LIBCFS_FREE(buf, sizeof(*buf));
+	spin_lock(&scd->scd_lock);
+}
+
+/* Mark every active RPC of @sv as aborted and schedule it so it notices. */
+void srpc_abort_service(struct srpc_service *sv)
+{
+	struct srpc_server_rpc	*rpc;
+	struct srpc_service_cd	*scd;
+	int			i;
+
+	CDEBUG(D_NET, "Aborting service: id %d, name %s\n",
+	       sv->sv_id, sv->sv_name);
+
+	cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+		spin_lock(&scd->scd_lock);
+
+		/* schedule in-flight RPCs to notice the abort, NB:
+		 * racing with incoming RPCs; complete fix should make test
+		 * RPCs carry session ID in its headers */
+		list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) {
+			rpc->srpc_aborted = 1;
+			swi_schedule_workitem(&rpc->srpc_wi);
+		}
+
+		spin_unlock(&scd->scd_lock);
+	}
+}
+
+/* Flag @sv as shutting down, reschedule its active RPCs, unlink posted buffers. */
+void srpc_shutdown_service(srpc_service_t *sv)
+{
+	struct srpc_server_rpc	*rpc;
+	struct srpc_service_cd	*scd;
+	srpc_buffer_t		*buf;
+	int			i;
+
+	CDEBUG(D_NET, "Shutting down service: id %d, name %s\n",
+	       sv->sv_id, sv->sv_name);
+
+	cfs_percpt_for_each(scd, i, sv->sv_cpt_data)
+		spin_lock(&scd->scd_lock);
+
+	sv->sv_shuttingdown = 1; /* i.e. no new active RPC */
+
+	cfs_percpt_for_each(scd, i, sv->sv_cpt_data)
+		spin_unlock(&scd->scd_lock);
+
+	cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+		spin_lock(&scd->scd_lock);
+
+		/* schedule in-flight RPCs to notice the shutdown */
+		list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list)
+			swi_schedule_workitem(&rpc->srpc_wi);
+
+		spin_unlock(&scd->scd_lock);
+
+		/* OK to traverse scd_buf_posted without lock, since no one
+		 * touches scd_buf_posted now */
+		list_for_each_entry(buf, &scd->scd_buf_posted, buf_list)
+			LNetMDUnlink(buf->buf_mdh);
+	}
+}
+
+/* Post the request message of @rpc to its destination service; 0 or -ENOMEM. */
+int srpc_send_request(srpc_client_rpc_t *rpc)
+{
+	srpc_event_t	*ev = &rpc->crpc_reqstev;
+	int		 rc;
+
+	ev->ev_type  = SRPC_REQUEST_SENT;
+	ev->ev_data  = rpc;
+	ev->ev_fired = 0;
+
+	rc = srpc_post_active_rqtbuf(rpc->crpc_dest, rpc->crpc_service,
+				     &rpc->crpc_reqstmsg, sizeof(srpc_msg_t),
+				     &rpc->crpc_reqstmdh, ev);
+	if (rc) {
+		LASSERT(rc == -ENOMEM);
+		ev->ev_fired = 1;  /* no more event expected */
+	}
+	return rc;
+}
+
+/* Post the reply buffer of @rpc under a fresh rpyid; returns 0 or -ENOMEM. */
+int srpc_prepare_reply(srpc_client_rpc_t *rpc)
+{
+	srpc_event_t	*ev = &rpc->crpc_replyev;
+	__u64		*id = &rpc->crpc_reqstmsg.msg_body.reqst.rpyid;
+	int		 rc;
+
+	ev->ev_type  = SRPC_REPLY_RCVD;
+	ev->ev_data  = rpc;
+	ev->ev_fired = 0;
+
+	*id = srpc_next_id();
+
+	rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id,
+				    &rpc->crpc_replymsg, sizeof(srpc_msg_t),
+				    LNET_MD_OP_PUT, rpc->crpc_dest,
+				    &rpc->crpc_replymdh, ev);
+	if (rc) {
+		LASSERT(rc == -ENOMEM);
+		ev->ev_fired = 1;  /* no more event expected */
+	}
+	return rc;
+}
+
+/* Post the client's bulk pages (if any) under a fresh bulkid; 0 or -ENOMEM. */
+int srpc_prepare_bulk(srpc_client_rpc_t *rpc)
+{
+	srpc_bulk_t  *bk = &rpc->crpc_bulk;
+	srpc_event_t *ev = &rpc->crpc_bulkev;
+	__u64	     *id = &rpc->crpc_reqstmsg.msg_body.reqst.bulkid;
+	int	      rc;
+	int	      opt;
+
+	LASSERT(bk->bk_niov <= LNET_MAX_IOV);
+
+	if (bk->bk_niov == 0)
+		return 0; /* nothing to do */
+
+	opt = LNET_MD_KIOV | (bk->bk_sink ? LNET_MD_OP_PUT : LNET_MD_OP_GET);
+
+	ev->ev_type  = SRPC_BULK_REQ_RCVD;
+	ev->ev_data  = rpc;
+	ev->ev_fired = 0;
+
+	*id = srpc_next_id();
+
+	rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id,
+				    &bk->bk_iovs[0], bk->bk_niov, opt,
+				    rpc->crpc_dest, &bk->bk_mdh, ev);
+	if (rc) {
+		LASSERT(rc == -ENOMEM);
+		ev->ev_fired = 1;  /* no more event expected */
+	}
+	return rc;
+}
+
+/* Server side: start the bulk transfer for @rpc (GET if sink, else PUT). */
+int srpc_do_bulk(srpc_server_rpc_t *rpc)
+{
+	srpc_event_t  *ev = &rpc->srpc_ev;
+	srpc_bulk_t   *bk = rpc->srpc_bulk;
+	__u64	       id = rpc->srpc_reqstbuf->buf_msg.msg_body.reqst.bulkid;
+	int	       rc;
+	int	       opt;
+
+	LASSERT(bk != NULL);
+
+	opt = bk->bk_sink ? LNET_MD_OP_GET : LNET_MD_OP_PUT;
+	opt |= LNET_MD_KIOV;
+
+	ev->ev_type  = bk->bk_sink ? SRPC_BULK_GET_RPLD : SRPC_BULK_PUT_SENT;
+	ev->ev_data  = rpc;
+	ev->ev_fired = 0;
+
+	rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, id,
+				   &bk->bk_iovs[0], bk->bk_niov, opt,
+				   rpc->srpc_peer, rpc->srpc_self,
+				   &bk->bk_mdh, ev);
+	if (rc)
+		ev->ev_fired = 1;  /* no more event expected */
+	return rc;
+}
+
+/* only called from srpc_handle_rpc; completes @rpc with @status and reuses it */
+void
+srpc_server_rpc_done(srpc_server_rpc_t *rpc, int status)
+{
+	struct srpc_service_cd	*scd = rpc->srpc_scd;
+	struct srpc_service	*sv  = scd->scd_svc;
+	srpc_buffer_t		*buffer;
+
+	LASSERT (status != 0 || rpc->srpc_wi.swi_state == SWI_STATE_DONE);
+
+	rpc->srpc_status = status;
+
+	CDEBUG_LIMIT (status == 0 ? D_NET : D_NETERROR,
+		"Server RPC %p done: service %s, peer %s, status %s:%d\n",
+		rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer),
+		swi_state2str(rpc->srpc_wi.swi_state), status);
+
+	if (status != 0) {
+		spin_lock(&srpc_data.rpc_glock);
+		srpc_data.rpc_counters.rpcs_dropped++;
+		spin_unlock(&srpc_data.rpc_glock);
+	}
+
+	if (rpc->srpc_done != NULL)
+		(*rpc->srpc_done) (rpc);
+	LASSERT(rpc->srpc_bulk == NULL);
+
+	spin_lock(&scd->scd_lock);
+
+	if (rpc->srpc_reqstbuf != NULL) {
+		/* NB might drop scd_lock in srpc_service_recycle_buffer, but
+		 * sv won't go away for scd_rpc_active must not be empty */
+		srpc_service_recycle_buffer(scd, rpc->srpc_reqstbuf);
+		rpc->srpc_reqstbuf = NULL;
+	}
+
+	list_del(&rpc->srpc_list); /* from scd->scd_rpc_active */
+
+	/*
+	 * No one can schedule me now since:
+	 * - I'm not on scd_rpc_active.
+	 * - all LNet events have been fired.
+	 * Cancel pending schedules and prevent future schedule attempts:
+	 */
+	LASSERT(rpc->srpc_ev.ev_fired);
+	swi_exit_workitem(&rpc->srpc_wi);
+
+	if (!sv->sv_shuttingdown && !list_empty(&scd->scd_buf_blocked)) {
+		buffer = list_entry(scd->scd_buf_blocked.next,
+					srpc_buffer_t, buf_list);
+		list_del(&buffer->buf_list);
+
+		srpc_init_server_rpc(rpc, scd, buffer); /* serve a blocked buffer */
+		list_add_tail(&rpc->srpc_list, &scd->scd_rpc_active);
+		swi_schedule_workitem(&rpc->srpc_wi);
+	} else {
+		list_add(&rpc->srpc_list, &scd->scd_rpc_free); /* back to the pool */
+	}
+
+	spin_unlock(&scd->scd_lock);
+	return;
+}
+
+/* handles an incoming RPC; returns 1 once the RPC is done, 0 if waiting */
+int
+srpc_handle_rpc(swi_workitem_t *wi)
+{
+	struct srpc_server_rpc	*rpc = wi->swi_workitem.wi_data;
+	struct srpc_service_cd	*scd = rpc->srpc_scd;
+	struct srpc_service	*sv = scd->scd_svc;
+	srpc_event_t		*ev = &rpc->srpc_ev;
+	int			rc = 0;
+
+	LASSERT(wi == &rpc->srpc_wi);
+
+	spin_lock(&scd->scd_lock);
+
+	if (sv->sv_shuttingdown || rpc->srpc_aborted) {
+		spin_unlock(&scd->scd_lock);
+		/* shutdown or abort: unlink the bulk and reply MDs */
+		if (rpc->srpc_bulk != NULL)
+			LNetMDUnlink(rpc->srpc_bulk->bk_mdh);
+		LNetMDUnlink(rpc->srpc_replymdh);
+
+		if (ev->ev_fired) { /* no more event, OK to finish */
+			srpc_server_rpc_done(rpc, -ESHUTDOWN);
+			return 1;
+		}
+		return 0; /* event still outstanding */
+	}
+
+	spin_unlock(&scd->scd_lock);
+
+	switch (wi->swi_state) {
+	default:
+		LBUG ();
+	case SWI_STATE_NEWBORN: {
+		srpc_msg_t	   *msg;
+		srpc_generic_reply_t *reply;
+
+		msg = &rpc->srpc_reqstbuf->buf_msg;
+		reply = &rpc->srpc_replymsg.msg_body.reply;
+
+		if (msg->msg_magic == 0) {
+			/* moaned already in srpc_lnet_ev_handler (which zeroed msg_magic) */
+			srpc_server_rpc_done(rpc, EBADMSG);
+			return 1;
+		}
+
+		srpc_unpack_msg_hdr(msg);
+		if (msg->msg_version != SRPC_MSG_VERSION) {
+			CWARN("Version mismatch: %u, %u expected, from %s\n",
+			      msg->msg_version, SRPC_MSG_VERSION,
+			      libcfs_id2str(rpc->srpc_peer));
+			reply->status = EPROTO;
+			/* drop through and send reply */
+		} else {
+			reply->status = 0;
+			rc = (*sv->sv_handler)(rpc);
+			LASSERT(reply->status == 0 || !rpc->srpc_bulk);
+			if (rc != 0) {
+				srpc_server_rpc_done(rpc, rc);
+				return 1;
+			}
+		}
+
+		wi->swi_state = SWI_STATE_BULK_STARTED;
+
+		if (rpc->srpc_bulk != NULL) {
+			rc = srpc_do_bulk(rpc);
+			if (rc == 0)
+				return 0; /* wait for bulk */
+			/* failed: pass rc on via ev_status */
+			LASSERT (ev->ev_fired);
+			ev->ev_status = rc;
+		}
+	} /* fall through */
+	case SWI_STATE_BULK_STARTED:
+		LASSERT (rpc->srpc_bulk == NULL || ev->ev_fired);
+
+		if (rpc->srpc_bulk != NULL) {
+			rc = ev->ev_status;
+
+			if (sv->sv_bulk_ready != NULL)
+				rc = (*sv->sv_bulk_ready) (rpc, rc);
+
+			if (rc != 0) {
+				srpc_server_rpc_done(rpc, rc);
+				return 1;
+			}
+		}
+
+		wi->swi_state = SWI_STATE_REPLY_SUBMITTED;
+		rc = srpc_send_reply(rpc);
+		if (rc == 0)
+			return 0; /* wait for reply */
+		srpc_server_rpc_done(rpc, rc);
+		return 1;
+
+	case SWI_STATE_REPLY_SUBMITTED:
+		if (!ev->ev_fired) { /* dump state, then assert */
+			CERROR("RPC %p: bulk %p, service %d\n",
+			       rpc, rpc->srpc_bulk, sv->sv_id);
+			CERROR("Event: status %d, type %d, lnet %d\n",
+			       ev->ev_status, ev->ev_type, ev->ev_lnet);
+			LASSERT (ev->ev_fired);
+		}
+
+		wi->swi_state = SWI_STATE_DONE;
+		srpc_server_rpc_done(rpc, ev->ev_status);
+		return 1;
+	}
+
+	return 0;
+}
+
+/* RPC timer callback: zeroes crpc_timeout (which srpc_del_client_rpc_timer
+ * polls), aborts the RPC with -ETIMEDOUT and counts the expiry. */
+void
+srpc_client_rpc_expired (void *data)
+{
+	srpc_client_rpc_t *crpc = data;
+
+	CWARN ("Client RPC expired: service %d, peer %s, timeout %d.\n",
+	       crpc->crpc_service, libcfs_id2str(crpc->crpc_dest),
+	       crpc->crpc_timeout);
+
+	spin_lock(&crpc->crpc_lock);
+	crpc->crpc_timeout = 0;
+	srpc_abort_rpc(crpc, -ETIMEDOUT);
+	spin_unlock(&crpc->crpc_lock);
+
+	spin_lock(&srpc_data.rpc_glock);
+	srpc_data.rpc_counters.rpcs_expired++;
+	spin_unlock(&srpc_data.rpc_glock);
+}
+
+/* Arm the expiry timer of @rpc; a zero crpc_timeout means no timer at all. */
+inline void
+srpc_add_client_rpc_timer (srpc_client_rpc_t *rpc)
+{
+	stt_timer_t *tmr = &rpc->crpc_timer;
+
+	if (rpc->crpc_timeout == 0)
+		return;
+	INIT_LIST_HEAD(&tmr->stt_list);
+	tmr->stt_data = rpc;
+	tmr->stt_func = srpc_client_rpc_expired;
+	tmr->stt_expires = cfs_time_add(rpc->crpc_timeout,
+					cfs_time_current_sec());
+	stt_add_timer(tmr);
+}
+
+/*
+ * Cancel the expiry timer of a client RPC.
+ *
+ * Must be called with rpc->crpc_lock held; the lock is dropped and retaken
+ * while waiting for a handler that has already been detonated.
+ *
+ * Upon exit the timer is not queued and the handler is not running.
+ */
+void
+srpc_del_client_rpc_timer (srpc_client_rpc_t *rpc)
+{
+	/*
+	 * Never planted or already exploded (crpc_timeout == 0), or
+	 * successfully defused right now: nothing to wait for.
+	 */
+	if (rpc->crpc_timeout == 0 || stt_del_timer(&rpc->crpc_timer))
+		return;
+
+	/* detonated: yield until srpc_client_rpc_expired zeroes crpc_timeout */
+	while (rpc->crpc_timeout != 0) {
+		spin_unlock(&rpc->crpc_lock);
+		schedule();
+		spin_lock(&rpc->crpc_lock);
+	}
+}
+
+/*
+ * Finish a client RPC: mark it closed, record the first non-zero status,
+ * stop its timer, retire its workitem and invoke the crpc_done callback.
+ */
+void
+srpc_client_rpc_done (srpc_client_rpc_t *rpc, int status)
+{
+	swi_workitem_t *wi = &rpc->crpc_wi;
+
+	/* a successful completion must have reached the final state */
+	LASSERT(status != 0 || wi->swi_state == SWI_STATE_DONE);
+
+	spin_lock(&rpc->crpc_lock);
+	rpc->crpc_closed = 1;
+	if (!rpc->crpc_status)
+		rpc->crpc_status = status;
+	srpc_del_client_rpc_timer(rpc);
+
+	CDEBUG_LIMIT ((status == 0) ? D_NET : D_NETERROR,
+		"Client RPC done: service %d, peer %s, status %s:%d:%d\n",
+		rpc->crpc_service, libcfs_id2str(rpc->crpc_dest),
+		swi_state2str(wi->swi_state), rpc->crpc_aborted, status);
+
+	/*
+	 * Nothing can schedule this RPC any more: its timer is defused, all
+	 * of its LNet events have fired, and crpc_closed stops srpc_abort_rpc
+	 * from scheduling it.  Cancel pending schedules and prevent future
+	 * schedule attempts:
+	 */
+	LASSERT (!srpc_event_pending(rpc));
+	swi_exit_workitem(wi);
+	spin_unlock(&rpc->crpc_lock);
+
+	/* the completion callback runs without crpc_lock held */
+	rpc->crpc_done(rpc);
+}
+
+/* sends an outgoing RPC; returns 1 once the RPC is done, 0 if waiting */
+int
+srpc_send_rpc (swi_workitem_t *wi)
+{
+	int		rc = 0;
+	srpc_client_rpc_t *rpc;
+	srpc_msg_t	*reply;
+	int		do_bulk;
+
+	LASSERT(wi != NULL);
+
+	rpc = wi->swi_workitem.wi_data;
+
+	LASSERT (rpc != NULL);
+	LASSERT (wi == &rpc->crpc_wi);
+
+	reply = &rpc->crpc_replymsg;
+	do_bulk = rpc->crpc_bulk.bk_niov > 0;
+
+	spin_lock(&rpc->crpc_lock);
+
+	if (rpc->crpc_aborted) {
+		spin_unlock(&rpc->crpc_lock);
+		goto abort; /* skip the state machine, just clean up */
+	}
+
+	spin_unlock(&rpc->crpc_lock);
+
+	switch (wi->swi_state) {
+	default:
+		LBUG ();
+	case SWI_STATE_NEWBORN:
+		LASSERT (!srpc_event_pending(rpc));
+
+		rc = srpc_prepare_reply(rpc);
+		if (rc != 0) {
+			srpc_client_rpc_done(rpc, rc);
+			return 1;
+		}
+
+		rc = srpc_prepare_bulk(rpc);
+		if (rc != 0) break;
+
+		wi->swi_state = SWI_STATE_REQUEST_SUBMITTED;
+		rc = srpc_send_request(rpc);
+		break;
+
+	case SWI_STATE_REQUEST_SUBMITTED:
+		/* CAVEAT EMPTOR: rqtev, rpyev, and bulkev may come in any
+		 * order; however, they're processed in a strict order:
+		 * rqt, rpy, and bulk. */
+		if (!rpc->crpc_reqstev.ev_fired) break;
+
+		rc = rpc->crpc_reqstev.ev_status;
+		if (rc != 0) break;
+
+		wi->swi_state = SWI_STATE_REQUEST_SENT;
+		/* perhaps more events, fall thru */
+	case SWI_STATE_REQUEST_SENT: {
+		srpc_msg_type_t type = srpc_service2reply(rpc->crpc_service);
+
+		if (!rpc->crpc_replyev.ev_fired) break;
+
+		rc = rpc->crpc_replyev.ev_status;
+		if (rc != 0) break;
+
+		srpc_unpack_msg_hdr(reply);
+		if (reply->msg_type != type ||
+		    (reply->msg_magic != SRPC_MSG_MAGIC &&
+		     reply->msg_magic != __swab32(SRPC_MSG_MAGIC))) {
+			CWARN ("Bad message from %s: type %u (%d expected),"
+			       " magic %u (%u expected).\n",
+			       libcfs_id2str(rpc->crpc_dest),
+			       reply->msg_type, type,
+			       reply->msg_magic, SRPC_MSG_MAGIC); /* both unsigned */
+			rc = -EBADMSG;
+			break;
+		}
+
+		if (do_bulk && reply->msg_body.reply.status != 0) {
+			CWARN ("Remote error %d at %s, unlink bulk buffer in "
+			       "case peer didn't initiate bulk transfer\n",
+			       reply->msg_body.reply.status,
+			       libcfs_id2str(rpc->crpc_dest));
+			LNetMDUnlink(rpc->crpc_bulk.bk_mdh);
+		}
+
+		wi->swi_state = SWI_STATE_REPLY_RECEIVED;
+	} /* fall through */
+	case SWI_STATE_REPLY_RECEIVED:
+		if (do_bulk && !rpc->crpc_bulkev.ev_fired) break;
+
+		rc = do_bulk ? rpc->crpc_bulkev.ev_status : 0;
+
+		/* Bulk buffer was unlinked due to remote error. Clear error
+		 * since reply buffer still contains valid data.
+		 * NB rpc->crpc_done shouldn't look into bulk data in case of
+		 * remote error. */
+		if (do_bulk && rpc->crpc_bulkev.ev_lnet == LNET_EVENT_UNLINK &&
+		    rpc->crpc_status == 0 && reply->msg_body.reply.status != 0)
+			rc = 0;
+
+		wi->swi_state = SWI_STATE_DONE;
+		srpc_client_rpc_done(rpc, rc);
+		return 1;
+	}
+	/* here: waiting for more events, or a step failed with rc != 0 */
+	if (rc != 0) {
+		spin_lock(&rpc->crpc_lock);
+		srpc_abort_rpc(rpc, rc);
+		spin_unlock(&rpc->crpc_lock);
+	}
+
+abort:
+	if (rpc->crpc_aborted) { /* unlink all MDs; finish once no event is pending */
+		LNetMDUnlink(rpc->crpc_reqstmdh);
+		LNetMDUnlink(rpc->crpc_replymdh);
+		LNetMDUnlink(rpc->crpc_bulk.bk_mdh);
+
+		if (!srpc_event_pending(rpc)) {
+			srpc_client_rpc_done(rpc, -EINTR);
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/* Allocate a client RPC with room for nbulkiov bulk iovs and initialise it;
+ * returns NULL when the allocation fails. */
+srpc_client_rpc_t *
+srpc_create_client_rpc (lnet_process_id_t peer, int service,
+			int nbulkiov, int bulklen,
+			void (*rpc_done)(srpc_client_rpc_t *),
+			void (*rpc_fini)(srpc_client_rpc_t *), void *priv)
+{
+	srpc_client_rpc_t *rpc;
+
+	LIBCFS_ALLOC(rpc, offsetof(srpc_client_rpc_t,
+				   crpc_bulk.bk_iovs[nbulkiov]));
+	if (rpc != NULL)
+		srpc_init_client_rpc(rpc, peer, service, nbulkiov,
+				     bulklen, rpc_done, rpc_fini, priv);
+	return rpc;
+}
+
+/* Abort @rpc for reason @why (non-zero) and schedule its workitem.
+ * Called with rpc->crpc_lock held. */
+void
+srpc_abort_rpc (srpc_client_rpc_t *rpc, int why)
+{
+	LASSERT (why != 0);
+
+	/* no-op when already aborted, or closed (callback imminent) */
+	if (rpc->crpc_aborted || rpc->crpc_closed)
+		return;
+
+	CDEBUG (D_NET,
+		"Aborting RPC: service %d, peer %s, state %s, why %d\n",
+		rpc->crpc_service, libcfs_id2str(rpc->crpc_dest),
+		swi_state2str(rpc->crpc_wi.swi_state), why);
+
+	rpc->crpc_aborted = 1;
+	rpc->crpc_status = why;
+	swi_schedule_workitem(&rpc->crpc_wi);
+}
+
+/* Start a client RPC: arm its expiry timer and schedule its workitem.
+ * Called with rpc->crpc_lock held. */
+void
+srpc_post_rpc (srpc_client_rpc_t *rpc)
+{
+	LASSERT (!rpc->crpc_aborted);
+	LASSERT (srpc_data.rpc_state == SRPC_STATE_RUNNING);
+
+	CDEBUG (D_NET, "Posting RPC: peer %s, service %d, timeout %d\n",
+		libcfs_id2str(rpc->crpc_dest),
+		rpc->crpc_service, rpc->crpc_timeout);
+
+	srpc_add_client_rpc_timer(rpc);
+	swi_schedule_workitem(&rpc->crpc_wi);
+}
+
+/* Post the reply of a server RPC; returns the srpc_post_active_rdma() rc. */
+int
+srpc_send_reply(struct srpc_server_rpc *rpc)
+{
+	srpc_event_t		*ev = &rpc->srpc_ev;
+	struct srpc_msg		*msg = &rpc->srpc_replymsg;
+	struct srpc_buffer	*buffer = rpc->srpc_reqstbuf;
+	struct srpc_service_cd	*scd = rpc->srpc_scd;
+	struct srpc_service	*sv = scd->scd_svc;
+	__u64			rpyid;
+	int			rc;
+
+	LASSERT(buffer != NULL);
+	rpyid = buffer->buf_msg.msg_body.reqst.rpyid;
+	/* rpyid was saved above because the buffer may be reposted here */
+	spin_lock(&scd->scd_lock);
+
+	if (!sv->sv_shuttingdown && !srpc_serv_is_framework(sv)) {
+		/* Repost buffer before replying since test client
+		 * might send me another RPC once it gets the reply */
+		if (srpc_service_post_buffer(scd, buffer) != 0)
+			CWARN("Failed to repost %s buffer\n", sv->sv_name);
+		rpc->srpc_reqstbuf = NULL;
+	}
+
+	spin_unlock(&scd->scd_lock);
+
+	ev->ev_fired = 0;
+	ev->ev_data  = rpc;
+	ev->ev_type  = SRPC_REPLY_SENT;
+
+	msg->msg_magic   = SRPC_MSG_MAGIC;
+	msg->msg_version = SRPC_MSG_VERSION;
+	msg->msg_type    = srpc_service2reply(sv->sv_id);
+
+	rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, rpyid, msg,
+				   sizeof(*msg), LNET_MD_OP_PUT,
+				   rpc->srpc_peer, rpc->srpc_self,
+				   &rpc->srpc_replymdh, ev);
+	if (rc != 0)
+		ev->ev_fired = 1;  /* no more event expected */
+	return rc;
+}
+
+/* when in kernel always called with LNET_LOCK() held, and in thread context */
+void
+srpc_lnet_ev_handler(lnet_event_t *ev)
+{
+	struct srpc_service_cd	*scd;
+	srpc_event_t      *rpcev = ev->md.user_ptr;
+	srpc_client_rpc_t *crpc;
+	srpc_server_rpc_t *srpc;
+	srpc_buffer_t     *buffer;
+	srpc_service_t    *sv;
+	srpc_msg_t	*msg;
+	srpc_msg_type_t    type;
+
+	LASSERT (!in_interrupt());
+
+	if (ev->status != 0) {
+		spin_lock(&srpc_data.rpc_glock);
+		srpc_data.rpc_counters.errors++;
+		spin_unlock(&srpc_data.rpc_glock);
+	}
+
+	rpcev->ev_lnet = ev->type;
+
+	switch (rpcev->ev_type) {
+	default:
+		CERROR("Unknown event: status %d, type %d, lnet %d\n",
+		       rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet);
+		LBUG ();
+	case SRPC_REQUEST_SENT:
+		if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) {
+			spin_lock(&srpc_data.rpc_glock);
+			srpc_data.rpc_counters.rpcs_sent++;
+			spin_unlock(&srpc_data.rpc_glock);
+		} /* fall through */
+	case SRPC_REPLY_RCVD:
+	case SRPC_BULK_REQ_RCVD:
+		crpc = rpcev->ev_data;
+
+		if (rpcev != &crpc->crpc_reqstev &&
+		    rpcev != &crpc->crpc_replyev &&
+		    rpcev != &crpc->crpc_bulkev) {
+			CERROR("rpcev %p, crpc %p, reqstev %p, replyev %p, bulkev %p\n",
+			       rpcev, crpc, &crpc->crpc_reqstev,
+			       &crpc->crpc_replyev, &crpc->crpc_bulkev);
+			CERROR("Bad event: status %d, type %d, lnet %d\n",
+			       rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet);
+			LBUG ();
+		}
+
+		spin_lock(&crpc->crpc_lock);
+
+		LASSERT(rpcev->ev_fired == 0);
+		rpcev->ev_fired  = 1;
+		rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ?
+						-EINTR : ev->status;
+		swi_schedule_workitem(&crpc->crpc_wi);
+
+		spin_unlock(&crpc->crpc_lock);
+		break;
+
+	case SRPC_REQUEST_RCVD:
+		scd = rpcev->ev_data;
+		sv = scd->scd_svc;
+
+		LASSERT(rpcev == &scd->scd_ev);
+
+		spin_lock(&scd->scd_lock);
+
+		LASSERT (ev->unlinked);
+		LASSERT (ev->type == LNET_EVENT_PUT ||
+			 ev->type == LNET_EVENT_UNLINK);
+		LASSERT (ev->type != LNET_EVENT_UNLINK ||
+			 sv->sv_shuttingdown);
+
+		buffer = container_of(ev->md.start, srpc_buffer_t, buf_msg);
+		buffer->buf_peer = ev->initiator;
+		buffer->buf_self = ev->target.nid;
+
+		LASSERT(scd->scd_buf_nposted > 0);
+		scd->scd_buf_nposted--;
+
+		if (sv->sv_shuttingdown) {
+			/* Leave buffer on scd->scd_buf_posted since
+			 * srpc_finish_service needs to traverse it. */
+			spin_unlock(&scd->scd_lock);
+			break;
+		}
+
+		if (scd->scd_buf_err_stamp != 0 &&
+		    scd->scd_buf_err_stamp < cfs_time_current_sec()) {
+			/* re-enable adding buffer */
+			scd->scd_buf_err_stamp = 0;
+			scd->scd_buf_err = 0;
+		}
+
+		if (scd->scd_buf_err == 0 && /* adding buffer is enabled */
+		    scd->scd_buf_adjust == 0 &&
+		    scd->scd_buf_nposted < scd->scd_buf_low) {
+			scd->scd_buf_adjust = MAX(scd->scd_buf_total / 2,
+						  SFW_TEST_WI_MIN);
+			swi_schedule_workitem(&scd->scd_buf_wi);
+		}
+
+		list_del(&buffer->buf_list); /* from scd->scd_buf_posted */
+		msg = &buffer->buf_msg;
+		type = srpc_service2request(sv->sv_id);
+
+		if (ev->status != 0 || ev->mlength != sizeof(*msg) ||
+		    (msg->msg_type != type &&
+		     msg->msg_type != __swab32(type)) ||
+		    (msg->msg_magic != SRPC_MSG_MAGIC &&
+		     msg->msg_magic != __swab32(SRPC_MSG_MAGIC))) {
+			CERROR ("Dropping RPC (%s) from %s: "
+				"status %d mlength %d type %u magic %u.\n",
+				sv->sv_name, libcfs_id2str(ev->initiator),
+				ev->status, ev->mlength,
+				msg->msg_type, msg->msg_magic);
+
+			/* NB can't call srpc_service_recycle_buffer here since
+			 * it may call LNetM[DE]Attach. The invalid magic tells
+			 * srpc_handle_rpc to drop this RPC */
+			msg->msg_magic = 0;
+		}
+
+		if (!list_empty(&scd->scd_rpc_free)) {
+			srpc = list_entry(scd->scd_rpc_free.next,
+					      struct srpc_server_rpc,
+					      srpc_list);
+			list_del(&srpc->srpc_list);
+
+			srpc_init_server_rpc(srpc, scd, buffer);
+			list_add_tail(&srpc->srpc_list,
+					  &scd->scd_rpc_active);
+			swi_schedule_workitem(&srpc->srpc_wi);
+		} else {
+			list_add_tail(&buffer->buf_list,
+					  &scd->scd_buf_blocked);
+		}
+
+		spin_unlock(&scd->scd_lock);
+
+		spin_lock(&srpc_data.rpc_glock);
+		srpc_data.rpc_counters.rpcs_rcvd++;
+		spin_unlock(&srpc_data.rpc_glock);
+		break;
+
+	case SRPC_BULK_GET_RPLD:
+		LASSERT (ev->type == LNET_EVENT_SEND ||
+			 ev->type == LNET_EVENT_REPLY ||
+			 ev->type == LNET_EVENT_UNLINK);
+
+		if (!ev->unlinked)
+			break; /* wait for final event */
+		/* final event: fall through */
+	case SRPC_BULK_PUT_SENT:
+		if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) {
+			spin_lock(&srpc_data.rpc_glock);
+
+			if (rpcev->ev_type == SRPC_BULK_GET_RPLD)
+				srpc_data.rpc_counters.bulk_get += ev->mlength;
+			else
+				srpc_data.rpc_counters.bulk_put += ev->mlength;
+
+			spin_unlock(&srpc_data.rpc_glock);
+		} /* fall through */
+	case SRPC_REPLY_SENT:
+		srpc = rpcev->ev_data;
+		scd  = srpc->srpc_scd;
+
+		LASSERT(rpcev == &srpc->srpc_ev);
+
+		spin_lock(&scd->scd_lock);
+
+		rpcev->ev_fired  = 1;
+		rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ?
+				   -EINTR : ev->status;
+		swi_schedule_workitem(&srpc->srpc_wi);
+
+		spin_unlock(&scd->scd_lock);
+		break;
+	}
+}
+
+
+/* Bring up the selftest RPC layer: LNet NI, event queue, lazy request
+ * portals and the stt timers.  Returns 0 on success; on failure whatever
+ * was set up is torn down by srpc_shutdown() and the error is returned. */
+int
+srpc_startup (void)
+{
+	int rc;
+
+	memset(&srpc_data, 0, sizeof(struct smoketest_rpc));
+	spin_lock_init(&srpc_data.rpc_glock);
+
+	/* 1 second pause to avoid timestamp reuse */
+	cfs_pause(cfs_time_seconds(1));
+	srpc_data.rpc_matchbits = ((__u64) cfs_time_current_sec()) << 48;
+
+	srpc_data.rpc_state = SRPC_STATE_NONE;
+	rc = LNetNIInit(LUSTRE_SRV_LNET_PID);
+	if (rc < 0) {
+		CERROR ("LNetNIInit() has failed: %d\n", rc);
+		return rc;
+	}
+
+	srpc_data.rpc_state = SRPC_STATE_NI_INIT;
+	LNetInvalidateHandle(&srpc_data.rpc_lnet_eq);
+	rc = LNetEQAlloc(0, srpc_lnet_ev_handler, &srpc_data.rpc_lnet_eq);
+	if (rc != 0) {
+		CERROR("LNetEQAlloc() has failed: %d\n", rc);
+		srpc_shutdown();
+		return rc;
+	}
+
+	rc = LNetSetLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL);
+	LASSERT(rc == 0);
+	rc = LNetSetLazyPortal(SRPC_REQUEST_PORTAL);
+	LASSERT(rc == 0);
+
+	srpc_data.rpc_state = SRPC_STATE_EQ_INIT;
+	rc = stt_startup();
+	if (rc != 0) {
+		srpc_shutdown();
+		return rc;
+	}
+
+	srpc_data.rpc_state = SRPC_STATE_RUNNING;
+	return 0;
+}
+
+/* Tear down whatever srpc_startup() managed to set up, based on rpc_state;
+ * each case falls through to undo the earlier initialisation stages too. */
+void
+srpc_shutdown (void)
+{
+	int i;
+	int rc;
+	int state;
+
+	state = srpc_data.rpc_state;
+	srpc_data.rpc_state = SRPC_STATE_STOPPING;
+
+	switch (state) {
+	default:
+		LBUG ();
+	case SRPC_STATE_RUNNING:
+		spin_lock(&srpc_data.rpc_glock);
+
+		for (i = 0; i <= SRPC_SERVICE_MAX_ID; i++) {
+			srpc_service_t *sv = srpc_data.rpc_services[i];
+
+			LASSERTF (sv == NULL,
+				  "service not empty: id %d, name %s\n",
+				  i, sv->sv_name);
+		}
+
+		spin_unlock(&srpc_data.rpc_glock);
+		stt_shutdown();
+		/* fall through */
+	case SRPC_STATE_EQ_INIT:
+		rc = LNetClearLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL);
+		LASSERT (rc == 0); /* was silently overwritten before */
+		rc = LNetClearLazyPortal(SRPC_REQUEST_PORTAL);
+		LASSERT (rc == 0);
+		rc = LNetEQFree(srpc_data.rpc_lnet_eq);
+		LASSERT (rc == 0); /* the EQ should have no user by now */
+		/* fall through */
+	case SRPC_STATE_NI_INIT:
+		LNetNIFini();
+	}
+}
diff --git a/drivers/staging/lustre/lnet/selftest/rpc.h b/drivers/staging/lustre/lnet/selftest/rpc.h
new file mode 100644
index 000000000000..b905d49a351f
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/rpc.h
@@ -0,0 +1,302 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __SELFTEST_RPC_H__
+#define __SELFTEST_RPC_H__
+
+#include <linux/lnet/lnetst.h>
+
+/*
+ * LST wire structures
+ *
+ * XXX: each *_REPLY value must equal the matching *_REQST value + 1
+ */
+typedef enum {
+	SRPC_MSG_MKSN_REQST     = 0,
+	SRPC_MSG_MKSN_REPLY     = 1,
+	SRPC_MSG_RMSN_REQST     = 2,
+	SRPC_MSG_RMSN_REPLY     = 3,
+	SRPC_MSG_BATCH_REQST    = 4,
+	SRPC_MSG_BATCH_REPLY    = 5,
+	SRPC_MSG_STAT_REQST     = 6,
+	SRPC_MSG_STAT_REPLY     = 7,
+	SRPC_MSG_TEST_REQST     = 8,
+	SRPC_MSG_TEST_REPLY     = 9,
+	SRPC_MSG_DEBUG_REQST    = 10,
+	SRPC_MSG_DEBUG_REPLY    = 11,
+	SRPC_MSG_BRW_REQST      = 12,
+	SRPC_MSG_BRW_REPLY      = 13,
+	SRPC_MSG_PING_REQST     = 14,
+	SRPC_MSG_PING_REPLY     = 15,
+	SRPC_MSG_JOIN_REQST     = 16,
+	SRPC_MSG_JOIN_REPLY     = 17,
+} srpc_msg_type_t;
+
+
+/* CAVEAT EMPTOR:
+ * All srpc_*_reqst_t's 1st field must be matchbits of reply buffer,
+ * and 2nd field matchbits of bulk buffer if any.
+ *
+ * All srpc_*_reply_t's 1st field must be a __u32 status, and 2nd field
+ * session id if needed.
+ */
+typedef struct {
+	__u64			rpyid;		/* reply buffer matchbits */
+	__u64			bulkid;		/* bulk buffer matchbits */
+} WIRE_ATTR srpc_generic_reqst_t;
+
+typedef struct {
+	__u32		   status;	/* reply status; 1st field of every reply */
+	lst_sid_t	       sid;	/* session id, if needed */
+} WIRE_ATTR srpc_generic_reply_t;
+
+/* FRAMEWORK RPCs */
+typedef struct {
+	__u64			mksn_rpyid;      /* reply buffer matchbits */
+	lst_sid_t	       mksn_sid;	/* session id */
+	__u32			mksn_force;      /* use brute force */
+	char			mksn_name[LST_NAME_SIZE];
+} WIRE_ATTR srpc_mksn_reqst_t;			/* make session request */
+
+typedef struct {
+	__u32		   mksn_status;      /* session status */
+	lst_sid_t	       mksn_sid;	 /* session id */
+	__u32		   mksn_timeout;     /* session timeout */
+	char			mksn_name[LST_NAME_SIZE];
+} WIRE_ATTR srpc_mksn_reply_t; /* make session reply */
+
+typedef struct {
+	__u64			rmsn_rpyid;      /* reply buffer matchbits */
+	lst_sid_t		rmsn_sid;	/* session id */
+} WIRE_ATTR srpc_rmsn_reqst_t; /* remove session request */
+
+typedef struct {
+	__u32			rmsn_status;
+	lst_sid_t		rmsn_sid;	/* session id */
+} WIRE_ATTR srpc_rmsn_reply_t; /* remove session reply */
+
+typedef struct {
+	__u64			join_rpyid;     /* reply buffer matchbits */
+	lst_sid_t	       join_sid;       /* session id to join */
+	char		    join_group[LST_NAME_SIZE]; /* group name */
+} WIRE_ATTR srpc_join_reqst_t;
+
+typedef struct {
+	__u32		   join_status;    /* returned status */
+	lst_sid_t	       join_sid;       /* session id */
+	__u32			join_timeout;   /* # seconds' inactivity to expire */
+	char		    join_session[LST_NAME_SIZE]; /* session name */
+} WIRE_ATTR srpc_join_reply_t;
+
+typedef struct {
+	__u64		   dbg_rpyid;      /* reply buffer matchbits */
+	lst_sid_t	       dbg_sid;	/* session id */
+	__u32		   dbg_flags;      /* bitmap of debug */
+} WIRE_ATTR srpc_debug_reqst_t;
+
+typedef struct {
+	__u32		   dbg_status;     /* returned code */
+	lst_sid_t	       dbg_sid;	/* session id */
+	__u32		   dbg_timeout;    /* session timeout */
+	__u32		   dbg_nbatch;     /* # of batches in the node */
+	char		    dbg_name[LST_NAME_SIZE]; /* session name */
+} WIRE_ATTR srpc_debug_reply_t;
+
+#define SRPC_BATCH_OPC_RUN      1
+#define SRPC_BATCH_OPC_STOP     2
+#define SRPC_BATCH_OPC_QUERY    3
+
+typedef struct {
+	__u64		   bar_rpyid;      /* reply buffer matchbits */
+	lst_sid_t	       bar_sid;	/* session id */
+	lst_bid_t	       bar_bid;	/* batch id */
+	__u32		   bar_opc;	/* create/start/stop batch */
+	__u32		   bar_testidx;    /* index of test */
+	__u32		   bar_arg;	/* parameters */
+} WIRE_ATTR srpc_batch_reqst_t;
+
+typedef struct {
+	__u32		   bar_status;     /* status of request */
+	lst_sid_t	       bar_sid;	/* session id */
+	__u32		   bar_active;     /* # of active tests in batch/test */
+	__u32		   bar_time;       /* remained time */
+} WIRE_ATTR srpc_batch_reply_t;
+
+typedef struct {
+	__u64		   str_rpyid;      /* reply buffer matchbits */
+	lst_sid_t	       str_sid;	/* session id */
+	__u32		   str_type;       /* type of stat */
+} WIRE_ATTR srpc_stat_reqst_t;
+
+typedef struct {
+	__u32		   str_status;
+	lst_sid_t	       str_sid;
+	sfw_counters_t	  str_fw;
+	srpc_counters_t	 str_rpc;
+	lnet_counters_t	 str_lnet;
+} WIRE_ATTR srpc_stat_reply_t;
+
+typedef struct {
+	__u32		   blk_opc;	/* bulk operation code */
+	__u32		   blk_npg;	/* # of pages */
+	__u32		   blk_flags;      /* reserved flags */
+} WIRE_ATTR test_bulk_req_t;
+
+typedef struct {
+	/** bulk operation code */
+	__u16			blk_opc;
+	/** data check flags */
+	__u16			blk_flags;
+	/** data length */
+	__u32			blk_len;
+	/** reserved: offset */
+	__u32		   blk_offset;
+} WIRE_ATTR test_bulk_req_v1_t;
+
+typedef struct {
+	__u32			png_size;       /* size of ping message */
+	__u32			png_flags;      /* reserved flags */
+} WIRE_ATTR test_ping_req_t;
+
+typedef struct {
+	__u64			tsr_rpyid;      /* reply buffer matchbits */
+	__u64			tsr_bulkid;     /* bulk buffer matchbits */
+	lst_sid_t		tsr_sid;	/* session id */
+	lst_bid_t		tsr_bid;	/* batch id */
+	__u32			tsr_service;    /* test type: bulk|ping|... */
+	/* test client loop count or # server buffers needed */
+	__u32			tsr_loop;
+	__u32			tsr_concur;     /* concurrency of test */
+	__u8			tsr_is_client;  /* is test client or not */
+	__u8			tsr_stop_onerr; /* stop on error */
+	__u32			tsr_ndest;      /* # of dest nodes */
+
+	union {
+		test_ping_req_t		ping;
+		test_bulk_req_t		bulk_v0;
+		test_bulk_req_v1_t	bulk_v1;
+	}		tsr_u;
+} WIRE_ATTR srpc_test_reqst_t;
+
+typedef struct {
+	__u32			tsr_status;     /* returned code */
+	lst_sid_t		tsr_sid;
+} WIRE_ATTR srpc_test_reply_t;
+
+/* TEST RPCs */
+typedef struct {
+	__u64		   pnr_rpyid;
+	__u32		   pnr_magic;
+	__u32		   pnr_seq;
+	__u64		   pnr_time_sec;
+	__u64		   pnr_time_usec;
+} WIRE_ATTR srpc_ping_reqst_t;
+
+typedef struct {
+	__u32		   pnr_status;
+	__u32		   pnr_magic;
+	__u32		   pnr_seq;
+} WIRE_ATTR srpc_ping_reply_t;
+
+typedef struct {
+	__u64		   brw_rpyid;      /* reply buffer matchbits */
+	__u64		   brw_bulkid;     /* bulk buffer matchbits */
+	__u32		   brw_rw;	 /* read or write */
+	__u32		   brw_len;	/* bulk data len */
+	__u32		   brw_flags;      /* bulk data patterns */
+} WIRE_ATTR srpc_brw_reqst_t; /* bulk r/w request */
+
+typedef struct {
+	__u32		   brw_status;
+} WIRE_ATTR srpc_brw_reply_t; /* bulk r/w reply */
+
+#define SRPC_MSG_MAGIC		  0xeeb0f00d
+#define SRPC_MSG_VERSION		1
+
+typedef struct srpc_msg {
+	/** magic number */
+	__u32	msg_magic;
+	/** message version number */
+	__u32	msg_version;
+	/** type of message body: srpc_msg_type_t */
+	__u32	msg_type;
+	__u32	msg_reserved0;
+	__u32	msg_reserved1;
+	/** test session features */
+	__u32	msg_ses_feats;
+	union {
+		srpc_generic_reqst_t reqst;
+		srpc_generic_reply_t reply;
+
+		srpc_mksn_reqst_t    mksn_reqst;
+		srpc_mksn_reply_t    mksn_reply;
+		srpc_rmsn_reqst_t    rmsn_reqst;
+		srpc_rmsn_reply_t    rmsn_reply;
+		srpc_debug_reqst_t   dbg_reqst;
+		srpc_debug_reply_t   dbg_reply;
+		srpc_batch_reqst_t   bat_reqst;
+		srpc_batch_reply_t   bat_reply;
+		srpc_stat_reqst_t    stat_reqst;
+		srpc_stat_reply_t    stat_reply;
+		srpc_test_reqst_t    tes_reqst;
+		srpc_test_reply_t    tes_reply;
+		srpc_join_reqst_t    join_reqst;
+		srpc_join_reply_t    join_reply;
+
+		srpc_ping_reqst_t    ping_reqst;
+		srpc_ping_reply_t    ping_reply;
+		srpc_brw_reqst_t     brw_reqst;
+		srpc_brw_reply_t     brw_reply;
+	}     msg_body;
+} WIRE_ATTR srpc_msg_t;
+
+/* Byte-swap the header fields of @msg in place unless msg_magic already
+ * equals SRPC_MSG_MAGIC.  The magic itself is deliberately left unswapped:
+ * it is still needed to determine whether the body needs to be swapped. */
+static inline void
+srpc_unpack_msg_hdr(srpc_msg_t *msg)
+{
+	if (msg->msg_magic != SRPC_MSG_MAGIC) {
+		/* flip every header word except msg_magic */
+		__swab32s(&msg->msg_version);
+		__swab32s(&msg->msg_type);
+		__swab32s(&msg->msg_reserved0);
+		__swab32s(&msg->msg_reserved1);
+		__swab32s(&msg->msg_ses_feats);
+	}
+}
+
+#endif /* __SELFTEST_RPC_H__ */
diff --git a/drivers/staging/lustre/lnet/selftest/selftest.h b/drivers/staging/lustre/lnet/selftest/selftest.h
new file mode 100644
index 000000000000..8053b0563ff3
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/selftest.h
@@ -0,0 +1,611 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ * copy of GPLv2].
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/selftest.h
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+#ifndef __SELFTEST_SELFTEST_H__
+#define __SELFTEST_SELFTEST_H__
+
+#define LNET_ONLY
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/lib-types.h>
+#include <linux/lnet/lnetst.h>
+
+#include "rpc.h"
+#include "timer.h"
+
+#ifndef MADE_WITHOUT_COMPROMISE
+#define MADE_WITHOUT_COMPROMISE
+#endif
+
+
+#define SWI_STATE_NEWBORN		  0 /* swi_workitem_t::swi_state as set by swi_init_workitem() */
+#define SWI_STATE_REPLY_SUBMITTED	  1
+#define SWI_STATE_REPLY_SENT	       2
+#define SWI_STATE_REQUEST_SUBMITTED	3
+#define SWI_STATE_REQUEST_SENT	     4
+#define SWI_STATE_REPLY_RECEIVED	   5
+#define SWI_STATE_BULK_STARTED	     6
+#define SWI_STATE_DONE		     10 /* all of these are named by swi_state2str() */
+
+/* forward refs */
+struct srpc_service;
+struct srpc_service_cd;
+struct sfw_test_unit;
+struct sfw_test_instance;
+
+/* services below SRPC_FRAMEWORK_SERVICE_MAX_ID are framework
+ * services, e.g. create/modify session.
+ */
+#define SRPC_SERVICE_DEBUG	      0
+#define SRPC_SERVICE_MAKE_SESSION       1
+#define SRPC_SERVICE_REMOVE_SESSION     2
+#define SRPC_SERVICE_BATCH	      3
+#define SRPC_SERVICE_TEST	       4
+#define SRPC_SERVICE_QUERY_STAT	 5
+#define SRPC_SERVICE_JOIN	       6
+#define SRPC_FRAMEWORK_SERVICE_MAX_ID   10
+/* other services start from SRPC_FRAMEWORK_SERVICE_MAX_ID+1 */
+#define SRPC_SERVICE_BRW		11
+#define SRPC_SERVICE_PING	       12
+#define SRPC_SERVICE_MAX_ID	     12
+
+#define SRPC_REQUEST_PORTAL	     50
+/* a lazy portal for framework RPC requests */
+#define SRPC_FRAMEWORK_REQUEST_PORTAL   51
+/* all reply/bulk RDMAs go to this portal */
+#define SRPC_RDMA_PORTAL		52
+
+static inline srpc_msg_type_t
+srpc_service2request (int service)
+{
+	/*
+	 * Translate a SRPC_SERVICE_* id into the message type of the request
+	 * sent to that service.  An id without a case of its own trips
+	 * LBUG(); the default label sits directly ahead of the
+	 * SRPC_SERVICE_DEBUG case, so it shares that case's return value
+	 * should LBUG() ever return.
+	 */
+	switch (service) {
+	case SRPC_SERVICE_MAKE_SESSION:
+		return SRPC_MSG_MKSN_REQST;
+	case SRPC_SERVICE_REMOVE_SESSION:
+		return SRPC_MSG_RMSN_REQST;
+	case SRPC_SERVICE_BATCH:
+		return SRPC_MSG_BATCH_REQST;
+	case SRPC_SERVICE_TEST:
+		return SRPC_MSG_TEST_REQST;
+	case SRPC_SERVICE_QUERY_STAT:
+		return SRPC_MSG_STAT_REQST;
+	case SRPC_SERVICE_JOIN:
+		return SRPC_MSG_JOIN_REQST;
+	case SRPC_SERVICE_BRW:
+		return SRPC_MSG_BRW_REQST;
+	case SRPC_SERVICE_PING:
+		return SRPC_MSG_PING_REQST;
+	default:
+		LBUG ();
+		/* fall through */
+	case SRPC_SERVICE_DEBUG:
+		return SRPC_MSG_DEBUG_REQST;
+	}
+}
+
+static inline srpc_msg_type_t
+srpc_service2reply (int service)
+{
+	return 1 + srpc_service2request(service); /* reply type value == request type value + 1 */
+}
+
+typedef enum { /* kind of RPC event; stored in srpc_event_t::ev_type */
+	SRPC_BULK_REQ_RCVD   = 1, /* passive bulk request(PUT sink/GET source) received */
+	SRPC_BULK_PUT_SENT   = 2, /* active bulk PUT sent (source) */
+	SRPC_BULK_GET_RPLD   = 3, /* active bulk GET replied (sink) */
+	SRPC_REPLY_RCVD      = 4, /* incoming reply received */
+	SRPC_REPLY_SENT      = 5, /* outgoing reply sent */
+	SRPC_REQUEST_RCVD    = 6, /* incoming request received */
+	SRPC_REQUEST_SENT    = 7, /* outgoing request sent */
+} srpc_event_type_t;
+
+/* RPC event: bookkeeping for one LNet event an RPC is waiting for */
+typedef struct {
+	srpc_event_type_t ev_type;   /* what's up */
+	lnet_event_kind_t ev_lnet;   /* LNet event type */
+	int	       ev_fired;  /* LNet event fired? 0 counts as still pending (srpc_event_pending()) */
+	int	       ev_status; /* LNet event status */
+	void	     *ev_data;   /* owning server/client RPC */
+} srpc_event_t;
+
+typedef struct {
+	int	      bk_len;  /* len of bulk data */
+	lnet_handle_md_t bk_mdh; /* invalidated by srpc_init_client_rpc() for client RPCs */
+	int	      bk_sink; /* sink/source */
+	int	      bk_niov; /* # iov in bk_iovs */
+	lnet_kiov_t      bk_iovs[0]; /* variable-length tail of bk_niov entries; must stay last */
+} srpc_bulk_t; /* bulk descriptor */
+
+/* message buffer descriptor */
+typedef struct srpc_buffer {
+	struct list_head	   buf_list; /* chain on srpc_service::*_msgq -- NOTE(review): no *_msgq exists in this header; presumably srpc_service_cd::scd_buf_posted/scd_buf_blocked, confirm */
+	srpc_msg_t	   buf_msg;  /* the message carried by this buffer */
+	lnet_handle_md_t     buf_mdh;  /* presumably the MD the buffer is posted with -- TODO confirm */
+	lnet_nid_t	   buf_self; /* presumably the local NID the message arrived on -- TODO confirm */
+	lnet_process_id_t    buf_peer; /* presumably the sender of the message -- TODO confirm */
+} srpc_buffer_t;
+
+struct swi_workitem; /* forward declaration for the action signature below */
+typedef int (*swi_action_t) (struct swi_workitem *); /* handler run for a workitem */
+
+typedef struct swi_workitem {
+	struct cfs_wi_sched	*swi_sched;   /* scheduler used by swi_{schedule,exit,deschedule}_workitem() */
+	cfs_workitem_t       swi_workitem; /* embedded libcfs workitem; swi_wi_action() maps it back via container_of() */
+	swi_action_t	 swi_action;   /* invoked by swi_wi_action() with this swi_workitem_t */
+	int		  swi_state;    /* SWI_STATE_*; starts as SWI_STATE_NEWBORN */
+} swi_workitem_t;
+
+/* server-side state of a RPC */
+typedef struct srpc_server_rpc {
+	/* chain on srpc_service::*_rpcq -- NOTE(review): no *_rpcq exists in this header; presumably srpc_service_cd::scd_rpc_free/scd_rpc_active, confirm */
+	struct list_head		srpc_list;
+	struct srpc_service_cd *srpc_scd; /* presumably the per-CPT service data handling this RPC -- TODO confirm */
+	swi_workitem_t       srpc_wi;
+	srpc_event_t	 srpc_ev;      /* bulk/reply event */
+	lnet_nid_t	   srpc_self;
+	lnet_process_id_t    srpc_peer;
+	srpc_msg_t	   srpc_replymsg; /* reply message */
+	lnet_handle_md_t     srpc_replymdh; /* MD handle for the reply */
+	srpc_buffer_t       *srpc_reqstbuf; /* presumably the buffer holding the request -- TODO confirm */
+	srpc_bulk_t	 *srpc_bulk;     /* bulk descriptor */
+
+	unsigned int	 srpc_aborted; /* being given up */
+	int		  srpc_status;
+	void	       (*srpc_done)(struct srpc_server_rpc *); /* callback taking the RPC itself */
+} srpc_server_rpc_t;
+
+/* client-side state of a RPC; set up by srpc_init_client_rpc() */
+typedef struct srpc_client_rpc {
+	struct list_head		crpc_list;	/* chain on user's lists */
+	spinlock_t		crpc_lock;	/* serialize */
+	int		  crpc_service; /* SRPC_SERVICE_* id; selects the request message type */
+	atomic_t	 crpc_refcount; /* starts at 1 (the caller's ref); see srpc_client_rpc_addref/decref */
+	int		  crpc_timeout; /* # seconds to wait for reply */
+	stt_timer_t	  crpc_timer;
+	swi_workitem_t       crpc_wi; /* initialised with srpc_send_rpc as its action */
+	lnet_process_id_t    crpc_dest; /* peer the RPC is addressed to */
+
+	void	       (*crpc_done)(struct srpc_client_rpc *);
+	void	       (*crpc_fini)(struct srpc_client_rpc *); /* if NULL, srpc_destroy_client_rpc() frees the rpc itself */
+	int		  crpc_status;    /* completion status */
+	void		*crpc_priv;      /* caller data */
+
+	/* state flags */
+	unsigned int	 crpc_aborted:1; /* being given up */
+	unsigned int	 crpc_closed:1;  /* completed */
+
+	/* RPC events; each ev_fired is preset to 1 until an event is expected */
+	srpc_event_t	 crpc_bulkev;    /* bulk event */
+	srpc_event_t	 crpc_reqstev;   /* request event */
+	srpc_event_t	 crpc_replyev;   /* reply event */
+
+	/* bulk, request(reqst), and reply exchanged on wire */
+	srpc_msg_t	   crpc_reqstmsg;
+	srpc_msg_t	   crpc_replymsg;
+	lnet_handle_md_t     crpc_reqstmdh;
+	lnet_handle_md_t     crpc_replymdh;
+	srpc_bulk_t	  crpc_bulk; /* must stay last: its bk_iovs[] tail extends the allocation (srpc_client_rpc_size()) */
+} srpc_client_rpc_t;
+
+#define srpc_client_rpc_size(rpc)				       \
+offsetof(srpc_client_rpc_t, crpc_bulk.bk_iovs[(rpc)->crpc_bulk.bk_niov]) /* bytes up to the end of the rpc's bk_niov bulk iovs */
+
+#define srpc_client_rpc_addref(rpc)				     \
+do { /* log the current count, assert it is positive, then take a reference */ \
+	CDEBUG(D_NET, "RPC[%p] -> %s (%d)++\n",			 \
+	       (rpc), libcfs_id2str((rpc)->crpc_dest),		  \
+	       atomic_read(&(rpc)->crpc_refcount));		 \
+	LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0);	    \
+	atomic_inc(&(rpc)->crpc_refcount);			  \
+} while (0)
+
+#define srpc_client_rpc_decref(rpc)				     \
+do { /* log and check the count, then drop a reference */	    \
+	CDEBUG(D_NET, "RPC[%p] -> %s (%d)--\n",			 \
+	       (rpc), libcfs_id2str((rpc)->crpc_dest),		  \
+	       atomic_read(&(rpc)->crpc_refcount));		 \
+	LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0);	    \
+	if (atomic_dec_and_test(&(rpc)->crpc_refcount))	/* last ref gone */ \
+		srpc_destroy_client_rpc(rpc);			   \
+} while (0)
+
+#define srpc_event_pending(rpc)   ((rpc)->crpc_bulkev.ev_fired == 0 ||  \
+				   (rpc)->crpc_reqstev.ev_fired == 0 || \
+				   (rpc)->crpc_replyev.ev_fired == 0) /* true while any of the three client RPC events has not fired */
+
+/* CPU partition data of srpc service (srpc_service::sv_cpt_data points at these) */
+struct srpc_service_cd {
+	/** serialize */
+	spinlock_t		scd_lock;
+	/** backref to service */
+	struct srpc_service	*scd_svc;
+	/** event buffer */
+	srpc_event_t		scd_ev;
+	/** free RPC descriptors */
+	struct list_head		scd_rpc_free;
+	/** in-flight RPCs */
+	struct list_head		scd_rpc_active;
+	/** workitem for posting buffer */
+	swi_workitem_t		scd_buf_wi;
+	/** CPT id */
+	int			scd_cpt;
+	/** error code for scd_buf_wi */
+	int			scd_buf_err;
+	/** timestamp for scd_buf_err */
+	unsigned long	   scd_buf_err_stamp;
+	/** total # request buffers */
+	int			scd_buf_total;
+	/** # posted request buffers */
+	int			scd_buf_nposted;
+	/** buffer posting is in progress */
+	int			scd_buf_posting;
+	/** allocate more buffers if scd_buf_nposted < scd_buf_low */
+	int			scd_buf_low;
+	/** increase/decrease some buffers */
+	int			scd_buf_adjust;
+	/** posted message buffers */
+	struct list_head		scd_buf_posted;
+	/** blocked for RPC descriptor */
+	struct list_head		scd_buf_blocked;
+};
+
+/* number of server workitems (mini-thread) for testing service */
+#define SFW_TEST_WI_MIN		256
+#define SFW_TEST_WI_MAX		2048
+/* extra buffers for tolerating buggy peers, or unbalanced number
+ * of peers between partitions  */
+#define SFW_TEST_WI_EXTRA	64
+
+/* number of server workitems (mini-thread) for framework service */
+#define SFW_FRWK_WI_MIN		16
+#define SFW_FRWK_WI_MAX		256
+
+typedef struct srpc_service {
+	int			sv_id;		/* service id */
+	const char		*sv_name;	/* human readable name */
+	int			sv_wi_total;	/* total server workitems */
+	int			sv_shuttingdown; /* must be set before srpc_wait_service_shutdown() is called */
+	int			sv_ncpts;	/* presumably # entries in sv_cpt_data -- TODO confirm */
+	/* percpt data for srpc_service */
+	struct srpc_service_cd	**sv_cpt_data;
+	/* Service callbacks:
+	 * - sv_handler: process incoming RPC request
+	 * - sv_bulk_ready: notify bulk data
+	 */
+	int	      (*sv_handler) (srpc_server_rpc_t *);
+	int	      (*sv_bulk_ready) (srpc_server_rpc_t *, int); /* NOTE(review): meaning of the int argument is not visible in this header */
+} srpc_service_t;
+
+typedef struct { /* a test session */
+	struct list_head	sn_list;    /* chain on fw_zombie_sessions */
+	lst_sid_t	 sn_id;      /* unique identifier */
+	unsigned int      sn_timeout; /* # seconds' inactivity to expire */
+	int	       sn_timer_active; /* presumably non-zero while sn_timer is queued -- TODO confirm */
+	unsigned int	  sn_features; /* presumably the session feature bits (cf. srpc_msg::msg_ses_feats) -- TODO confirm */
+	stt_timer_t       sn_timer;
+	struct list_head	sn_batches; /* list of batches (sfw_batch_t::bat_list) */
+	char	      sn_name[LST_NAME_SIZE];
+	atomic_t      sn_refcount;
+	atomic_t      sn_brw_errors;
+	atomic_t      sn_ping_errors;
+	cfs_time_t	sn_started;
+} sfw_session_t;
+
+#define sfw_sid_equal(sid0, sid1)     ((sid0).ses_nid == (sid1).ses_nid && \
+				       (sid0).ses_stamp == (sid1).ses_stamp) /* ids are equal when both nid and stamp match */
+
+typedef struct { /* a batch of tests within a session */
+	struct list_head	bat_list;      /* chain on sn_batches */
+	lst_bid_t	 bat_id;	/* batch id */
+	int	       bat_error;     /* error code of batch */
+	sfw_session_t    *bat_session;   /* batch's session */
+	atomic_t      bat_nactive;   /* # of active tests */
+	struct list_head	bat_tests;     /* test instances (presumably linked via sfw_test_instance_t::tsi_list -- confirm) */
+} sfw_batch_t;
+
+typedef struct { /* callbacks supplied by a test client */
+	int  (*tso_init)(struct sfw_test_instance *tsi); /* initialize test client */
+	void (*tso_fini)(struct sfw_test_instance *tsi); /* finalize test client */
+	int  (*tso_prep_rpc)(struct sfw_test_unit *tsu,
+			     lnet_process_id_t dest,
+			     srpc_client_rpc_t **rpc);   /* prep a test rpc */
+	void (*tso_done_rpc)(struct sfw_test_unit *tsu,
+			     srpc_client_rpc_t *rpc);    /* done a test rpc */
+} sfw_test_client_ops_t;
+
+typedef struct sfw_test_instance {
+	struct list_head	      tsi_list;	 /* chain on batch (presumably sfw_batch_t::bat_tests -- confirm) */
+	int		     tsi_service;      /* test type */
+	sfw_batch_t	    *tsi_batch;	/* batch */
+	sfw_test_client_ops_t  *tsi_ops;	  /* test client operations */
+
+	/* public parameter for all test units */
+	unsigned int		tsi_is_client:1;     /* is test client */
+	unsigned int		tsi_stoptsu_onerr:1; /* stop tsu on error */
+	int		     tsi_concur;	  /* concurrency */
+	int		     tsi_loop;	    /* loop count */
+
+	/* status of test instance */
+	spinlock_t		tsi_lock;	  /* serialize */
+	unsigned int		tsi_stopping:1;   /* test is stopping */
+	atomic_t	    tsi_nactive;      /* # of active test unit */
+	struct list_head	      tsi_units;	/* test units (presumably linked via sfw_test_unit_t::tsu_list -- confirm) */
+	struct list_head	      tsi_free_rpcs;    /* free rpcs */
+	struct list_head	      tsi_active_rpcs;  /* active rpcs */
+
+	union { /* test parameters; presumably selected by the test type/version -- confirm */
+		test_ping_req_t		ping;	  /* ping parameter */
+		test_bulk_req_t		bulk_v0;  /* bulk parameter */
+		test_bulk_req_v1_t	bulk_v1;  /* bulk v1 parameter */
+	} tsi_u;
+} sfw_test_instance_t;
+
+/* XXX: trailing (PAGE_CACHE_SIZE % sizeof(lnet_process_id_packed_t)) bytes
+ * at the end of pages are not used */
+#define SFW_MAX_CONCUR     LST_MAX_CONCUR
+#define SFW_ID_PER_PAGE    (PAGE_CACHE_SIZE / sizeof(lnet_process_id_packed_t))
+#define SFW_MAX_NDESTS     (LNET_MAX_IOV * SFW_ID_PER_PAGE)
+#define sfw_id_pages(n)    (((n) + SFW_ID_PER_PAGE - 1) / SFW_ID_PER_PAGE)
+
+typedef struct sfw_test_unit {
+	struct list_head	    tsu_list;	 /* chain on lst_test_instance -- NOTE(review): presumably means sfw_test_instance_t::tsi_units, confirm */
+	lnet_process_id_t     tsu_dest;	 /* id of dest node */
+	int		   tsu_loop;	 /* loop count of the test */
+	sfw_test_instance_t  *tsu_instance;     /* pointer to test instance */
+	void		 *tsu_private;      /* private data */
+	swi_workitem_t	tsu_worker;       /* workitem of the test unit */
+} sfw_test_unit_t;
+
+typedef struct sfw_test_case { /* pairs a test's server-side service with its client-side ops */
+	struct list_head	      tsc_list;	 /* chain on fw_tests */
+	srpc_service_t	 *tsc_srv_service;  /* test service */
+	sfw_test_client_ops_t  *tsc_cli_ops;      /* ops of test client */
+} sfw_test_case_t;
+
+srpc_client_rpc_t *
+sfw_create_rpc(lnet_process_id_t peer, int service,
+	       unsigned features, int nbulkiov, int bulklen,
+	       void (*done) (srpc_client_rpc_t *), void *priv);
+int sfw_create_test_rpc(sfw_test_unit_t *tsu,
+			lnet_process_id_t peer, unsigned features,
+			int nblk, int blklen, srpc_client_rpc_t **rpc);
+void sfw_abort_rpc(srpc_client_rpc_t *rpc);
+void sfw_post_rpc(srpc_client_rpc_t *rpc);
+void sfw_client_rpc_done(srpc_client_rpc_t *rpc);
+void sfw_unpack_message(srpc_msg_t *msg);
+void sfw_free_pages(srpc_server_rpc_t *rpc);
+void sfw_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i);
+int sfw_alloc_pages(srpc_server_rpc_t *rpc, int cpt, int npages, int len,
+		    int sink);
+int sfw_make_session (srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply);
+
+srpc_client_rpc_t *
+srpc_create_client_rpc(lnet_process_id_t peer, int service,
+		       int nbulkiov, int bulklen,
+		       void (*rpc_done)(srpc_client_rpc_t *),
+		       void (*rpc_fini)(srpc_client_rpc_t *), void *priv);
+void srpc_post_rpc(srpc_client_rpc_t *rpc);
+void srpc_abort_rpc(srpc_client_rpc_t *rpc, int why);
+void srpc_free_bulk(srpc_bulk_t *bk);
+srpc_bulk_t *srpc_alloc_bulk(int cpt, unsigned bulk_npg, unsigned bulk_len,
+			     int sink);
+int srpc_send_rpc(swi_workitem_t *wi);
+int srpc_send_reply(srpc_server_rpc_t *rpc);
+int srpc_add_service(srpc_service_t *sv);
+int srpc_remove_service(srpc_service_t *sv);
+void srpc_shutdown_service(srpc_service_t *sv);
+void srpc_abort_service(srpc_service_t *sv);
+int srpc_finish_service(srpc_service_t *sv);
+int srpc_service_add_buffers(srpc_service_t *sv, int nbuffer);
+void srpc_service_remove_buffers(srpc_service_t *sv, int nbuffer);
+void srpc_get_counters(srpc_counters_t *cnt);
+void srpc_set_counters(const srpc_counters_t *cnt);
+
+extern struct cfs_wi_sched *lst_sched_serial;
+extern struct cfs_wi_sched **lst_sched_test;
+
+static inline int
+srpc_serv_is_framework(struct srpc_service *svc)
+{
+	return SRPC_FRAMEWORK_SERVICE_MAX_ID > svc->sv_id; /* framework service ids lie below the max id */
+}
+
+static inline int
+swi_wi_action(cfs_workitem_t *wi)
+{
+	swi_workitem_t *item = container_of(wi, swi_workitem_t, swi_workitem);
+
+	return (*item->swi_action)(item); /* wi is embedded in item; run item's own action */
+}
+
+static inline void
+swi_init_workitem(swi_workitem_t *swi, void *data,
+		  swi_action_t action, struct cfs_wi_sched *sched)
+{
+	swi->swi_state  = SWI_STATE_NEWBORN; /* every workitem starts out newborn */
+	swi->swi_action = action;
+	swi->swi_sched  = sched;
+	cfs_wi_init(&swi->swi_workitem, data, swi_wi_action); /* swi_wi_action forwards to @action */
+}
+
+static inline void
+swi_schedule_workitem(swi_workitem_t *wi)
+{
+	cfs_wi_schedule(wi->swi_sched, &wi->swi_workitem); /* hand the embedded workitem to its own scheduler */
+}
+
+static inline void
+swi_exit_workitem(swi_workitem_t *swi)
+{
+	cfs_wi_exit(swi->swi_sched, &swi->swi_workitem); /* forwards to cfs_wi_exit() on the workitem's scheduler */
+}
+
+static inline int
+swi_deschedule_workitem(swi_workitem_t *swi)
+{
+	return cfs_wi_deschedule(swi->swi_sched, &swi->swi_workitem); /* passes cfs_wi_deschedule()'s result through */
+}
+
+
+int sfw_startup(void);
+int srpc_startup(void);
+void sfw_shutdown(void);
+void srpc_shutdown(void);
+
+static inline void
+srpc_destroy_client_rpc (srpc_client_rpc_t *rpc)
+{
+	/* only a quiesced RPC may be destroyed: no event may still be
+	 * pending and the reference count must already have reached zero */
+	LASSERT (rpc != NULL);
+	LASSERT (!srpc_event_pending(rpc));
+	LASSERT (atomic_read(&rpc->crpc_refcount) == 0);
+
+	if (rpc->crpc_fini != NULL) {
+		rpc->crpc_fini(rpc); /* finalizer given at init time takes over */
+		return;
+	}
+	LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); /* no finalizer: free it here */
+}
+
+static inline void
+srpc_init_client_rpc (srpc_client_rpc_t *rpc, lnet_process_id_t peer,
+		      int service, int nbulkiov, int bulklen,
+		      void (*rpc_done)(srpc_client_rpc_t *),
+		      void (*rpc_fini)(srpc_client_rpc_t *), void *priv)
+{
+	LASSERT (nbulkiov <= LNET_MAX_IOV);
+
+	memset(rpc, 0, offsetof(srpc_client_rpc_t,
+				crpc_bulk.bk_iovs[nbulkiov])); /* fixed part plus nbulkiov trailing iovs */
+
+	rpc->crpc_service      = service;
+	rpc->crpc_dest	 = peer;
+	rpc->crpc_priv	 = priv;
+	rpc->crpc_done	 = rpc_done;
+	rpc->crpc_fini	 = rpc_fini;
+	rpc->crpc_bulk.bk_niov = nbulkiov;
+	rpc->crpc_bulk.bk_len  = bulklen;
+
+	spin_lock_init(&rpc->crpc_lock);
+	INIT_LIST_HEAD(&rpc->crpc_list);
+	atomic_set(&rpc->crpc_refcount, 1); /* 1 ref for caller */
+	swi_init_workitem(&rpc->crpc_wi, rpc, srpc_send_rpc,
+			  lst_sched_test[lnet_cpt_of_nid(peer.nid)]);
+
+	/* start out with all three MD handles invalidated */
+	LNetInvalidateHandle(&rpc->crpc_bulk.bk_mdh);
+	LNetInvalidateHandle(&rpc->crpc_reqstmdh);
+	LNetInvalidateHandle(&rpc->crpc_replymdh);
+
+	/* no event is expected at this point */
+	rpc->crpc_replyev.ev_fired = 1;
+	rpc->crpc_reqstev.ev_fired = 1;
+	rpc->crpc_bulkev.ev_fired  = 1;
+
+	rpc->crpc_reqstmsg.msg_magic   = SRPC_MSG_MAGIC;
+	rpc->crpc_reqstmsg.msg_version = SRPC_MSG_VERSION;
+	rpc->crpc_reqstmsg.msg_type    = srpc_service2request(service);
+}
+
+static inline const char *
+swi_state2str (int state)
+{
+	/* Spell out the SWI_STATE_* name of @state.  Any other value trips
+	 * LBUG(), which is placed ahead of the SWI_STATE_NEWBORN case. */
+	switch (state) {
+	default:
+		LBUG();
+	case SWI_STATE_NEWBORN:			return "SWI_STATE_NEWBORN";
+	case SWI_STATE_REPLY_SUBMITTED:		return "SWI_STATE_REPLY_SUBMITTED";
+	case SWI_STATE_REPLY_SENT:		return "SWI_STATE_REPLY_SENT";
+	case SWI_STATE_REQUEST_SUBMITTED:	return "SWI_STATE_REQUEST_SUBMITTED";
+	case SWI_STATE_REQUEST_SENT:		return "SWI_STATE_REQUEST_SENT";
+	case SWI_STATE_REPLY_RECEIVED:		return "SWI_STATE_REPLY_RECEIVED";
+	case SWI_STATE_BULK_STARTED:		return "SWI_STATE_BULK_STARTED";
+	case SWI_STATE_DONE:			return "SWI_STATE_DONE";
+	}
+}
+
+#define UNUSED(x)       ( (void)(x) )
+
+
+#define selftest_wait_events()	cfs_pause(cfs_time_seconds(1) / 10) /* pause for a tenth of cfs_time_seconds(1) */
+
+
+#define lst_wait_until(cond, lock, fmt, ...)				\
+do { /* poll @cond; @lock is unlocked around each pause and locked again after */ \
+	int __I = 2; /* poll counter, only used to pick the debug mask */ \
+	while (!(cond)) {						\
+		CDEBUG(IS_PO2(++__I) ? D_WARNING : D_NET,		\
+		       fmt, ## __VA_ARGS__); /* NOTE(review): ++__I is a macro argument; confirm CDEBUG evaluates its mask once */ \
+		spin_unlock(&(lock));					\
+									\
+		selftest_wait_events();					\
+									\
+		spin_lock(&(lock)); /* @cond is re-tested with the lock held */ \
+	}								\
+} while (0)
+
+static inline void
+srpc_wait_service_shutdown(srpc_service_t *sv)
+{
+	int polls;
+
+	LASSERT(sv->sv_shuttingdown);
+
+	/* poll; the message is a D_WARNING only when polls is a power of two */
+	for (polls = 3; srpc_finish_service(sv) == 0; polls++) {
+		CDEBUG (((polls & -polls) == polls) ? D_WARNING : D_NET,
+			"Waiting for %s service to shutdown...\n",
+			sv->sv_name);
+		selftest_wait_events();
+	}
+}
+
+#endif /* __SELFTEST_SELFTEST_H__ */
diff --git a/drivers/staging/lustre/lnet/selftest/timer.c b/drivers/staging/lustre/lnet/selftest/timer.c
new file mode 100644
index 000000000000..2c078550277b
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/timer.c
@@ -0,0 +1,253 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/timer.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+
+/*
+ * Timers are implemented as a sorted queue of expiry times. The queue
+ * is slotted, with each slot holding timers which expire in a
+ * 2**STTIMER_MINPOLL (8) second period. The timers in each slot are
+ * sorted by increasing expiry time. The number of slots is 2**7 (128),
+ * to cover a time period of 1024 seconds into the future before wrapping.
+ */
+#define STTIMER_MINPOLL	3   /* log2 min poll interval (8 s) */
+#define STTIMER_SLOTTIME       (1 << STTIMER_MINPOLL)
+#define STTIMER_SLOTTIMEMASK   (~(STTIMER_SLOTTIME - 1))
+#define STTIMER_NSLOTS	       (1 << 7)
+#define STTIMER_SLOT(t)	       (&stt_data.stt_hash[(((t) >> STTIMER_MINPOLL) & \
+						    (STTIMER_NSLOTS - 1))])
+
+struct st_timer_data { /* global state of the selftest timer facility */
+	spinlock_t	 stt_lock; /* taken around slot list changes and stt_nthreads updates */
+	/* start time of the slot processed previously */
+	cfs_time_t       stt_prev_slot;
+	struct list_head       stt_hash[STTIMER_NSLOTS]; /* timer slots, indexed through STTIMER_SLOT() */
+	int	      stt_shuttingdown; /* set by stt_shutdown(); ends the loop in stt_timer_main() */
+	wait_queue_head_t      stt_waitq; /* the timer thread sleeps here between checks */
+	int	      stt_nthreads; /* # timer threads started and not yet exited */
+} stt_data;
+
+void
+stt_add_timer(stt_timer_t *timer)
+{
+	struct list_head *slot, *cur;
+
+	spin_lock(&stt_data.stt_lock);
+
+	LASSERT (stt_data.stt_nthreads > 0);
+	LASSERT (!stt_data.stt_shuttingdown);
+	LASSERT (timer->stt_func != NULL);
+	LASSERT (list_empty(&timer->stt_list));
+	LASSERT (cfs_time_after(timer->stt_expires, cfs_time_current_sec()));
+
+	/* scan the slot from its tail for the last timer not expiring later */
+	slot = STTIMER_SLOT(timer->stt_expires);
+	cur = slot->prev;
+	while (cur != slot &&
+	       !cfs_time_aftereq(timer->stt_expires,
+				 list_entry(cur, stt_timer_t, stt_list)->stt_expires))
+		cur = cur->prev;
+	list_add(&timer->stt_list, cur); /* keeps the slot sorted by expiry */
+
+	spin_unlock(&stt_data.stt_lock);
+}
+
+/*
+ * The function returns whether it has deactivated a pending timer or not.
+ * (ie. del_timer() of an inactive timer returns 0, del_timer() of an
+ * active timer returns 1.)
+ *
+ * CAVEAT EMPTOR:
+ * When 0 is returned, it is possible that timer->stt_func _is_ running on
+ * another CPU.
+ */
+int
+stt_del_timer (stt_timer_t *timer)
+{
+	int was_pending;
+
+	spin_lock(&stt_data.stt_lock);
+
+	LASSERT (stt_data.stt_nthreads > 0);
+	LASSERT (!stt_data.stt_shuttingdown);
+
+	/* a timer still linked on a slot list has not been expired yet */
+	was_pending = !list_empty(&timer->stt_list);
+	if (was_pending)
+		list_del_init(&timer->stt_list);
+
+	spin_unlock(&stt_data.stt_lock);
+	return was_pending;
+}
+
+/* Run every timer on @slot that is due by @now and return how many ran; called with stt_data.stt_lock held (dropped around each callback, held again on return) */
+int
+stt_expire_list (struct list_head *slot, cfs_time_t now)
+{
+	int	  expired = 0;
+	stt_timer_t *timer;
+
+	while (!list_empty(slot)) {
+		timer = list_entry(slot->next, stt_timer_t, stt_list); /* head of the slot */
+
+		if (cfs_time_after(timer->stt_expires, now))
+			break; /* stt_add_timer() keeps the slot sorted, so nothing later is due */
+
+		list_del_init(&timer->stt_list); /* unlink first: stt_del_timer() now returns 0 for it */
+		spin_unlock(&stt_data.stt_lock); /* the callback runs without the lock */
+
+		expired++;
+		(*timer->stt_func) (timer->stt_data);
+
+		spin_lock(&stt_data.stt_lock); /* retake before looking at the slot again */
+	}
+
+	return expired;
+}
+
+int
+stt_check_timers (cfs_time_t *last)
+{
+	cfs_time_t now;
+	cfs_time_t slot_start;
+	int	   nexpired = 0;
+
+	now = cfs_time_current_sec();
+	spin_lock(&stt_data.stt_lock);
+
+	/* sweep backwards from the current slot down to the one recorded in
+	 * *last, expiring every timer that is due by now */
+	for (slot_start = now & STTIMER_SLOTTIMEMASK;
+	     cfs_time_aftereq(slot_start, *last);
+	     slot_start = cfs_time_sub(slot_start, STTIMER_SLOTTIME))
+		nexpired += stt_expire_list(STTIMER_SLOT(slot_start), now);
+
+	*last = now & STTIMER_SLOTTIMEMASK;
+	spin_unlock(&stt_data.stt_lock);
+	return nexpired;
+}
+
+
+int
+stt_timer_main (void *arg)
+{
+	int rc = 0; /* receives wait_event_timeout()'s result; never read afterwards */
+	UNUSED(arg); /* the thread argument is not used */
+
+	SET_BUT_UNUSED(rc);
+	/* thread body started by stt_start_timer_thread() under the name "st_timer" */
+	cfs_block_allsigs();
+
+	while (!stt_data.stt_shuttingdown) { /* runs until stt_shutdown() raises the flag */
+		stt_check_timers(&stt_data.stt_prev_slot); /* expire whatever is due */
+
+		rc = wait_event_timeout(stt_data.stt_waitq,
+					stt_data.stt_shuttingdown,
+					cfs_time_seconds(STTIMER_SLOTTIME)); /* sleep up to one slot period, or until shutdown is flagged */
+	}
+
+	spin_lock(&stt_data.stt_lock);
+	stt_data.stt_nthreads--; /* stt_shutdown() waits for this to reach 0 */
+	spin_unlock(&stt_data.stt_lock);
+	return 0;
+}
+
+int
+stt_start_timer_thread (void)
+{
+	task_t *thread;
+
+	LASSERT(!stt_data.stt_shuttingdown);
+
+	thread = kthread_run(stt_timer_main, NULL, "st_timer");
+	if (!IS_ERR(thread)) {
+		spin_lock(&stt_data.stt_lock);
+		stt_data.stt_nthreads++; /* undone by stt_timer_main() when it exits */
+		spin_unlock(&stt_data.stt_lock);
+		return 0;
+	}
+	return PTR_ERR(thread); /* thread creation failed */
+}
+
+
+int
+stt_startup (void)
+{
+	int slot;
+	int rc;
+
+	/* set up every piece of stt_data before the timer thread is started */
+	spin_lock_init(&stt_data.stt_lock);
+	init_waitqueue_head(&stt_data.stt_waitq);
+	for (slot = 0; slot < STTIMER_NSLOTS; slot++)
+		INIT_LIST_HEAD(&stt_data.stt_hash[slot]);
+
+	stt_data.stt_nthreads = 0;
+	stt_data.stt_shuttingdown = 0;
+	stt_data.stt_prev_slot = cfs_time_current_sec() & STTIMER_SLOTTIMEMASK;
+
+	rc = stt_start_timer_thread();
+	if (rc != 0)
+		CERROR ("Can't spawn timer thread: %d\n", rc);
+	return rc;
+}
+
+void
+stt_shutdown (void)
+{
+	int i;
+
+	spin_lock(&stt_data.stt_lock);
+	/* every timer must already have been removed or expired */
+	for (i = 0; i < STTIMER_NSLOTS; i++)
+		LASSERT (list_empty(&stt_data.stt_hash[i]));
+
+	stt_data.stt_shuttingdown = 1; /* ends the loop in stt_timer_main() */
+
+	wake_up(&stt_data.stt_waitq); /* cut the timer thread's sleep short */
+	lst_wait_until(stt_data.stt_nthreads == 0, stt_data.stt_lock,
+		       "waiting for %d threads to terminate\n",
+		       stt_data.stt_nthreads); /* drops stt_lock around each pause, so the thread can decrement the count */
+
+	spin_unlock(&stt_data.stt_lock);
+}
diff --git a/drivers/staging/lustre/lnet/selftest/timer.h b/drivers/staging/lustre/lnet/selftest/timer.h
new file mode 100644
index 000000000000..56dbfe5ea1e5
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/timer.h
@@ -0,0 +1,53 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/timer.h
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+#ifndef __SELFTEST_TIMER_H__
+#define __SELFTEST_TIMER_H__
+
+typedef struct { /* one selftest timer, queued with stt_add_timer() */
+	struct list_head	stt_list;    /* link on a timer slot; must be empty when passed to stt_add_timer() */
+	cfs_time_t	stt_expires; /* expiry time, compared against cfs_time_current_sec() */
+	void	    (*stt_func) (void *); /* called on expiry; must not be NULL */
+	void	     *stt_data;   /* argument handed to stt_func */
+} stt_timer_t;
+
+void stt_add_timer (stt_timer_t *timer);
+int stt_del_timer (stt_timer_t *timer);
+int stt_startup (void);
+void stt_shutdown (void);
+
+#endif /* __SELFTEST_TIMER_H__ */
diff --git a/drivers/staging/lustre/lustre/Kconfig b/drivers/staging/lustre/lustre/Kconfig
new file mode 100644
index 000000000000..d0a0e08afbc7
--- /dev/null
+++ b/drivers/staging/lustre/lustre/Kconfig
@@ -0,0 +1,33 @@
+config LUSTRE_FS
+	tristate "Lustre file system client support"
+	depends on STAGING && INET && BROKEN
+	select LNET
+	help
+	  This option enables Lustre file system client support. Choose Y
+	  here if you want to access a Lustre file system cluster. To compile
+	  this file system support as a module, choose M here: the module will
+	  be called lustre.
+
+	  To mount Lustre file systems, you also need to install the user space
+	  mount.lustre and other user space commands which can be found in the
+	  lustre-client package, available from
+	  http://downloads.whamcloud.com/public/lustre/
+
+	  Lustre file system is the most popular cluster file system in high
+	  performance computing. Source code of both kernel space and user space
+	  Lustre components can also be found at
+	  http://git.whamcloud.com/?p=fs/lustre-release.git;a=summary
+
+	  If unsure, say N.
+
+	  See also http://wiki.lustre.org/
+
+config LUSTRE_OBD_MAX_IOCTL_BUFFER
+	int "Lustre obd max ioctl buffer bytes (default 8KB)"
+	depends on LUSTRE_FS
+	default 8192
+	help
+	  This option defines the maximum size of buffer in bytes that user space
+	  applications can pass to Lustre kernel module through ioctl interface.
+
+	  If unsure, use default.
diff --git a/drivers/staging/lustre/lustre/Makefile b/drivers/staging/lustre/lustre/Makefile
new file mode 100644
index 000000000000..3fb94fc12068
--- /dev/null
+++ b/drivers/staging/lustre/lustre/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_LUSTRE_FS) := fid/ lvfs/ obdclass/ ptlrpc/ obdecho/ mgc/ lov/ \
+			   osc/ mdc/ lmv/ llite/ fld/ libcfs/
diff --git a/drivers/staging/lustre/lustre/fid/Makefile b/drivers/staging/lustre/lustre/fid/Makefile
new file mode 100644
index 000000000000..b8d6d21b39ff
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fid/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LUSTRE_FS) += fid.o
+fid-y := fid_handler.o fid_store.o fid_request.o lproc_fid.o fid_lib.o
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/fid/fid_handler.c b/drivers/staging/lustre/lustre/fid/fid_handler.c
new file mode 100644
index 000000000000..bbbb3cfe57b3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fid/fid_handler.c
@@ -0,0 +1,661 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/fid_handler.c
+ *
+ * Lustre Sequence Manager
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FID
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <dt_object.h>
+#include <md_object.h>
+#include <obd_support.h>
+#include <lustre_req_layout.h>
+#include <lustre_fid.h>
+#include "fid_internal.h"
+/* Allocate the obd's cl_seq and init it with the prefix "cli-<obd_name>". */
+int client_fid_init(struct obd_device *obd,
+		    struct obd_export *exp, enum lu_cli_type type)
+{
+	struct client_obd *cli = &obd->u.cli;
+	char *prefix;
+	int rc;
+	ENTRY;
+
+	OBD_ALLOC_PTR(cli->cl_seq);
+	if (cli->cl_seq == NULL)
+		RETURN(-ENOMEM);
+
+	OBD_ALLOC(prefix, MAX_OBD_NAME + 5);	/* scratch name buffer */
+	if (prefix == NULL)
+		GOTO(out_free_seq, rc = -ENOMEM);
+
+	snprintf(prefix, MAX_OBD_NAME + 5, "cli-%s", obd->obd_name);
+
+	/* Init client side sequence-manager */
+	rc = seq_client_init(cli->cl_seq, exp, type, prefix, NULL);
+	OBD_FREE(prefix, MAX_OBD_NAME + 5);	/* freed on success and failure */
+	if (rc)
+		GOTO(out_free_seq, rc);
+
+	RETURN(rc);
+out_free_seq:	/* failure: release cl_seq again and clear the pointer */
+	OBD_FREE_PTR(cli->cl_seq);
+	cli->cl_seq = NULL;
+	return rc;
+}
+EXPORT_SYMBOL(client_fid_init);
+/* Finalize and free the obd's cl_seq if it is set; always returns 0. */
+int client_fid_fini(struct obd_device *obd)
+{
+	struct client_obd *cli = &obd->u.cli;
+	ENTRY;
+
+	if (cli->cl_seq != NULL) {
+		seq_client_fini(cli->cl_seq);
+		OBD_FREE_PTR(cli->cl_seq);
+		cli->cl_seq = NULL;	/* a repeated call then skips this branch */
+	}
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(client_fid_fini);
+
+static void seq_server_proc_fini(struct lu_server_seq *seq);
+
+/*
+ * Attach sequence client \a cli to server \a seq, or detach the current one
+ * when \a cli is NULL.  Returns 0, or -EEXIST if a client is already attached.
+ */
+int seq_server_set_cli(struct lu_server_seq *seq,
+		       struct lu_client_seq *cli,
+		       const struct lu_env *env)
+{
+	int rc = 0;
+	ENTRY;
+
+	/* lss_mutex is the same lock the server allocators take. */
+	mutex_lock(&seq->lss_mutex);
+
+	if (cli == NULL) {
+		/* Detach: cli is NULL here and must not be dereferenced. */
+		CDEBUG(D_INFO, "%s: Detached sequence client\n", seq->lss_name);
+		seq->lss_cli = NULL;
+		GOTO(out_up, rc = 0);
+	}
+
+	if (seq->lss_cli != NULL) {
+		CDEBUG(D_HA, "%s: Sequence controller is already "
+		       "assigned\n", seq->lss_name);
+		GOTO(out_up, rc = -EEXIST);
+	}
+
+	CDEBUG(D_INFO, "%s: Attached sequence controller %s\n",
+	       seq->lss_name, cli->lcs_name);
+
+	seq->lss_cli = cli;
+	cli->lcs_space.lsr_index = seq->lss_site->ss_node_id;
+	EXIT;
+out_up:
+	mutex_unlock(&seq->lss_mutex);
+	return rc;
+}
+EXPORT_SYMBOL(seq_server_set_cli);
+/*
+ * Carve up to \a width sequences off the front of \a from into \a to.
+ */
+static inline void range_alloc(struct lu_seq_range *to,
+			       struct lu_seq_range *from, __u64 width)
+{
+	__u64 grant = min(range_space(from), width);
+
+	to->lsr_start = from->lsr_start;
+	to->lsr_end = to->lsr_start + grant;
+	from->lsr_start += grant;
+}
+
+/**
+ * On the controller node, allocate a new super sequence for a regular
+ * sequence server.  As the super sequence controller, this node is supposed
+ * to maintain fld and update index.
+ * \a out range always has the correct mds node number of the requester.
+ */
+
+static int __seq_server_alloc_super(struct lu_server_seq *seq,
+				    struct lu_seq_range *out,
+				    const struct lu_env *env)
+{
+	struct lu_seq_range *space = &seq->lss_space;
+	int rc;
+	ENTRY;
+
+	LASSERT(range_is_sane(space));
+
+	if (range_is_exhausted(space)) {
+		CERROR("%s: Sequences space is exhausted\n",
+		       seq->lss_name);
+		RETURN(-ENOSPC);
+	}
+
+	/* Space is left: carve the grant, then update the store (sync). */
+	range_alloc(out, space, seq->lss_width);
+	rc = seq_store_update(env, seq, out, 1 /* sync */);
+
+	LCONSOLE_INFO("%s: super-sequence allocation rc = %d " DRANGE"\n",
+		      seq->lss_name, rc, PRANGE(out));
+
+	RETURN(rc);
+}
+/* Locked wrapper: runs __seq_server_alloc_super() with lss_mutex held. */
+int seq_server_alloc_super(struct lu_server_seq *seq,
+			   struct lu_seq_range *out,
+			   const struct lu_env *env)
+{
+	int rc;
+	ENTRY;
+
+	mutex_lock(&seq->lss_mutex);
+	rc = __seq_server_alloc_super(seq, out, env);
+	mutex_unlock(&seq->lss_mutex);
+
+	RETURN(rc);
+}
+
+/* Seed both water-mark sets from lss_space, then update the store (sync). */
+static int __seq_set_init(const struct lu_env *env,
+			    struct lu_server_seq *seq)
+{
+	struct lu_seq_range *free_space = &seq->lss_space;
+	const __u64 batch = seq->lss_set_width;
+
+	/* Low set is carved first, so it gets the lower sequence numbers. */
+	range_alloc(&seq->lss_lowater_set, free_space, batch);
+	range_alloc(&seq->lss_hiwater_set, free_space, batch);
+
+	return seq_store_update(env, seq, NULL, 1);
+}
+
+/*
+ * This function implements new seq allocation algorithm using async
+ * updates to seq file on disk. ref bug 18857 for details.
+ * there are three variables to keep track of this process
+ *
+ * lss_space; - available lss_space
+ * lss_lowater_set; - lu_seq_range for all seqs before barrier, i.e. safe to use
+ * lss_hiwater_set; - lu_seq_range after barrier, i.e. allocated but may be
+ *		    not yet committed
+ *
+ * when lss_lowater_set reaches the end it is replaced with hiwater one and
+ * a write operation is initiated to allocate new hiwater range.
+ * if last seq write operation is still not committed, current operation is
+ * flagged as sync write op.
+ */
+static int range_alloc_set(const struct lu_env *env,
+			    struct lu_seq_range *out,
+			    struct lu_server_seq *seq)
+{
+	struct lu_seq_range *space = &seq->lss_space;
+	struct lu_seq_range *loset = &seq->lss_lowater_set;
+	struct lu_seq_range *hiset = &seq->lss_hiwater_set;
+	int rc = 0;
+
+	if (range_is_zero(loset))
+		__seq_set_init(env, seq); /* NOTE(review): result is ignored */
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_SEQ_ALLOC)) /* exhaust set */
+		loset->lsr_start = loset->lsr_end;
+
+	if (range_is_exhausted(loset)) {
+		/* reached high water mark. */
+		struct lu_device *dev = seq->lss_site->ss_lu->ls_top_dev;
+		int obd_num_clients = dev->ld_obd->obd_num_exports;
+		__u64 set_sz;
+
+		/* calculate new seq width based on number of clients */
+		set_sz = max(seq->lss_set_width,
+			     obd_num_clients * seq->lss_width);
+		set_sz = min(range_space(space), set_sz);
+
+		/* Switch to hiwater range now */
+		*loset = *hiset;
+		/* allocate new hiwater range */
+		range_alloc(hiset, space, set_sz);
+
+		/* update ondisk seq with new *space */
+		rc = seq_store_update(env, seq, NULL, seq->lss_need_sync);
+	}
+
+	LASSERTF(!range_is_exhausted(loset) || range_is_sane(loset),
+		 DRANGE"\n", PRANGE(loset));
+
+	if (rc == 0)
+		range_alloc(out, loset, seq->lss_width);
+
+	RETURN(rc);	/* NOTE(review): this function has no matching ENTRY */
+}
+/* Allocate a meta-sequence range into \a out, refilling lss_space if empty. */
+static int __seq_server_alloc_meta(struct lu_server_seq *seq,
+				   struct lu_seq_range *out,
+				   const struct lu_env *env)
+{
+	struct lu_seq_range *space = &seq->lss_space;
+	int rc = 0;
+
+	ENTRY;
+
+	LASSERT(range_is_sane(space));
+
+	/* Check if available space ends and allocate new super seq */
+	if (range_is_exhausted(space)) {
+		if (!seq->lss_cli) {
+			CERROR("%s: No sequence controller is attached.\n",
+			       seq->lss_name);
+			RETURN(-ENODEV);
+		}
+
+		rc = seq_client_alloc_super(seq->lss_cli, env);
+		if (rc) {
+			CERROR("%s: Can't allocate super-sequence, rc %d\n",
+			       seq->lss_name, rc);
+			RETURN(rc);
+		}
+
+		/* Saving new range to allocation space. */
+		*space = seq->lss_cli->lcs_space;
+		LASSERT(range_is_sane(space));
+	}
+
+	rc = range_alloc_set(env, out, seq);	/* carve \a out from the sets */
+	if (rc != 0) {
+		CERROR("%s: Allocated meta-sequence failed: rc = %d\n",
+			seq->lss_name, rc);
+		RETURN(rc);
+	}
+
+	CDEBUG(D_INFO, "%s: Allocated meta-sequence " DRANGE"\n",
+		seq->lss_name, PRANGE(out));
+
+	RETURN(rc);
+}
+/* Locked wrapper: runs __seq_server_alloc_meta() with lss_mutex held. */
+int seq_server_alloc_meta(struct lu_server_seq *seq,
+			  struct lu_seq_range *out,
+			  const struct lu_env *env)
+{
+	int rc;
+	ENTRY;
+
+	mutex_lock(&seq->lss_mutex);
+	rc = __seq_server_alloc_meta(seq, out, env);
+	mutex_unlock(&seq->lss_mutex);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(seq_server_alloc_meta);
+
+/* Dispatch \a opc to the matching allocator; unknown opcodes get -EINVAL. */
+static int seq_server_handle(struct lu_site *site,
+			     const struct lu_env *env,
+			     __u32 opc, struct lu_seq_range *out)
+{
+	struct seq_server_site *ss_site;
+	int rc;
+	ENTRY;
+
+	ss_site = lu_site2seq(site);
+
+	switch (opc) {
+	case SEQ_ALLOC_META:
+		if (ss_site->ss_server_seq == NULL) {
+			CERROR("Sequence server is not initialized\n");
+			RETURN(-EINVAL);
+		}
+		rc = seq_server_alloc_meta(ss_site->ss_server_seq, out, env);
+		break;
+	case SEQ_ALLOC_SUPER:
+		if (ss_site->ss_control_seq == NULL) {
+			CERROR("Sequence controller is not initialized\n");
+			RETURN(-EINVAL);
+		}
+		rc = seq_server_alloc_super(ss_site->ss_control_seq, out, env);
+		break;
+	default:
+		/* Neither a meta nor a super allocation request. */
+		rc = -EINVAL;
+		break;
+	}
+
+	RETURN(rc);
+}
+/* Check the SEQ_QUERY buffers of \a req, then run the requested allocation. */
+static int seq_req_handle(struct ptlrpc_request *req,
+			  const struct lu_env *env,
+			  struct seq_thread_info *info)
+{
+	struct lu_seq_range *out, *tmp;
+	struct lu_site *site;
+	int rc = -EPROTO;
+	__u32 *opc;
+	ENTRY;
+
+	LASSERT(!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY));
+	site = req->rq_export->exp_obd->obd_lu_dev->ld_site;
+	LASSERT(site != NULL);
+
+	rc = req_capsule_server_pack(info->sti_pill);
+	if (rc)
+		RETURN(err_serious(rc));
+
+	opc = req_capsule_client_get(info->sti_pill, &RMF_SEQ_OPC);
+	if (opc != NULL) {
+		out = req_capsule_server_get(info->sti_pill, &RMF_SEQ_RANGE);
+		if (out == NULL)
+			RETURN(err_serious(-EPROTO));
+
+		tmp = req_capsule_client_get(info->sti_pill, &RMF_SEQ_RANGE);
+		if (tmp == NULL)	/* request carried no range buffer */
+			RETURN(err_serious(-EPROTO));
+		/* seq client passed mdt id, we need to pass that using out
+		 * range parameter */
+		out->lsr_index = tmp->lsr_index;
+		out->lsr_flags = tmp->lsr_flags;
+		rc = seq_server_handle(site, env, *opc, out);
+	} else
+		rc = err_serious(-EPROTO);
+
+	RETURN(rc);
+}
+
+/* context key constructor/destructor: seq_key_init, seq_key_fini */
+LU_KEY_INIT_FINI(seq, struct seq_thread_info);
+
+/* context key: seq_thread_key (registered by fid_mod_init()) */
+LU_CONTEXT_KEY_DEFINE(seq, LCT_MD_THREAD | LCT_DT_THREAD);
+/* Point \a info at \a req's capsule and set it up for an RQF_SEQ_QUERY. */
+static void seq_thread_info_init(struct ptlrpc_request *req,
+				 struct seq_thread_info *info)
+{
+	info->sti_pill = &req->rq_pill;
+	/* Init request capsule */
+	req_capsule_init(info->sti_pill, req, RCL_SERVER);
+	req_capsule_set(info->sti_pill, &RQF_SEQ_QUERY);
+}
+/* Counterpart of seq_thread_info_init(): finalizes the request capsule. */
+static void seq_thread_info_fini(struct seq_thread_info *info)
+{
+	req_capsule_fini(info->sti_pill);
+}
+/* Serve one sequence request using the service thread's env and seq key. */
+int seq_handle(struct ptlrpc_request *req)
+{
+	const struct lu_env *env;
+	struct seq_thread_info *info;
+	int rc;
+
+	env = req->rq_svc_thread->t_env;
+	LASSERT(env != NULL);
+
+	info = lu_context_key_get(&env->le_ctx, &seq_thread_key);
+	LASSERT(info != NULL);
+
+	seq_thread_info_init(req, info);
+	rc = seq_req_handle(req, env, info);
+	/* XXX: we don't need replay but MDT assign transno in any case,
+	 * remove it manually before reply */
+	lustre_msg_set_transno(req->rq_repmsg, 0);
+	seq_thread_info_fini(info);
+
+	return rc;
+}
+EXPORT_SYMBOL(seq_handle);
+
+/*
+ * Entry point for handling sequence (SEQ) RPCs called from MDT.
+ */
+int seq_query(struct com_thread_info *info)
+{
+	return seq_handle(info->cti_pill->rc_req);
+}
+EXPORT_SYMBOL(seq_query);
+
+/* Register the server's proc dir under seq_type_proc_dir plus its vars. */
+#ifdef LPROCFS
+static int seq_server_proc_init(struct lu_server_seq *seq)
+{
+	int rc;
+	ENTRY;
+
+	seq->lss_proc_dir = lprocfs_register(seq->lss_name,
+					     seq_type_proc_dir,
+					     NULL, NULL);
+	if (IS_ERR(seq->lss_proc_dir)) {
+		rc = PTR_ERR(seq->lss_proc_dir);
+		RETURN(rc);	/* lss_proc_dir keeps the ERR_PTR value here */
+	}
+
+	rc = lprocfs_add_vars(seq->lss_proc_dir,
+			      seq_server_proc_list, seq);
+	if (rc) {
+		CERROR("%s: Can't init sequence manager "
+		       "proc, rc %d\n", seq->lss_name, rc);
+		GOTO(out_cleanup, rc);
+	}
+
+	RETURN(0);
+
+out_cleanup:
+	seq_server_proc_fini(seq);
+	return rc;
+}
+/* Remove the proc dir if one was registered; tolerates NULL and ERR_PTR. */
+static void seq_server_proc_fini(struct lu_server_seq *seq)
+{
+	ENTRY;
+	if (seq->lss_proc_dir != NULL) {
+		if (!IS_ERR(seq->lss_proc_dir))
+			lprocfs_remove(&seq->lss_proc_dir);
+		seq->lss_proc_dir = NULL;
+	}
+	EXIT;
+}
+#else /* !LPROCFS */
+static int seq_server_proc_init(struct lu_server_seq *seq)
+{
+	return 0;	/* built without LPROCFS: nothing to register */
+}
+
+static void seq_server_proc_fini(struct lu_server_seq *seq)
+{
+	return;
+}
+#endif /* LPROCFS */
+
+/* Set up \a seq as a "srv-"/"ctl-" manager and load (or seed) its space. */
+int seq_server_init(struct lu_server_seq *seq,
+		    struct dt_device *dev,
+		    const char *prefix,
+		    enum lu_mgr_type type,
+		    struct seq_server_site *ss,
+		    const struct lu_env *env)
+{
+	int rc, is_srv = (type == LUSTRE_SEQ_SERVER);
+	ENTRY;
+
+	LASSERT(dev != NULL);
+	LASSERT(prefix != NULL);
+	LASSERT(ss != NULL);
+	LASSERT(ss->ss_lu != NULL);
+
+	seq->lss_cli = NULL;
+	seq->lss_type = type;
+	seq->lss_site = ss;
+	range_init(&seq->lss_space);
+
+	range_init(&seq->lss_lowater_set);
+	range_init(&seq->lss_hiwater_set);
+	seq->lss_set_width = LUSTRE_SEQ_BATCH_WIDTH;
+
+	mutex_init(&seq->lss_mutex);
+
+	seq->lss_width = is_srv ?
+		LUSTRE_SEQ_META_WIDTH : LUSTRE_SEQ_SUPER_WIDTH;
+
+	snprintf(seq->lss_name, sizeof(seq->lss_name),
+		 "%s-%s", (is_srv ? "srv" : "ctl"), prefix);
+
+	rc = seq_store_init(seq, env, dev);
+	if (rc)
+		GOTO(out, rc);
+	/* Request backing store for saved sequence info. */
+	rc = seq_store_read(seq, env);
+	if (rc == -ENODATA) {
+		/* NOTE(review): update failure below is only logged. */
+		/* Nothing is read, init by default value. */
+		seq->lss_space = is_srv ?
+			LUSTRE_SEQ_ZERO_RANGE:
+			LUSTRE_SEQ_SPACE_RANGE;
+
+		LASSERT(ss != NULL);	/* already asserted at function entry */
+		seq->lss_space.lsr_index = ss->ss_node_id;
+		LCONSOLE_INFO("%s: No data found "
+			      "on store. Initialize space\n",
+			      seq->lss_name);
+
+		rc = seq_store_update(env, seq, NULL, 0);
+		if (rc) {
+			CERROR("%s: Can't write space data, "
+			       "rc %d\n", seq->lss_name, rc);
+		}
+	} else if (rc) {
+		CERROR("%s: Can't read space data, rc %d\n",
+		       seq->lss_name, rc);
+		GOTO(out, rc);
+	}
+
+	if (is_srv) {
+		LASSERT(range_is_sane(&seq->lss_space));
+	} else {
+		LASSERT(!range_is_zero(&seq->lss_space) &&
+			range_is_sane(&seq->lss_space));
+	}
+
+	rc  = seq_server_proc_init(seq);	/* overwrites any earlier rc */
+	if (rc)
+		GOTO(out, rc);
+
+	EXIT;
+out:
+	if (rc)
+		seq_server_fini(seq, env);	/* undo partial setup on error */
+	return rc;
+}
+EXPORT_SYMBOL(seq_server_init);
+/* Finalize \a seq: first its proc entries, then its backing store. */
+void seq_server_fini(struct lu_server_seq *seq,
+		     const struct lu_env *env)
+{
+	ENTRY;
+
+	seq_server_proc_fini(seq);
+	seq_store_fini(seq, env);
+
+	EXIT;
+}
+EXPORT_SYMBOL(seq_server_fini);
+/* Finalize and free every sequence manager hanging off \a ss; returns 0. */
+int seq_site_fini(const struct lu_env *env, struct seq_server_site *ss)
+{
+	if (ss == NULL)
+		RETURN(0);	/* NOTE(review): no matching ENTRY here */
+
+	if (ss->ss_server_seq) {
+		seq_server_fini(ss->ss_server_seq, env);
+		OBD_FREE_PTR(ss->ss_server_seq);
+		ss->ss_server_seq = NULL;
+	}
+
+	if (ss->ss_control_seq) {
+		seq_server_fini(ss->ss_control_seq, env);
+		OBD_FREE_PTR(ss->ss_control_seq);
+		ss->ss_control_seq = NULL;
+	}
+
+	if (ss->ss_client_seq) {
+		seq_client_fini(ss->ss_client_seq);
+		OBD_FREE_PTR(ss->ss_client_seq);
+		ss->ss_client_seq = NULL;
+	}
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(seq_site_fini);
+
+proc_dir_entry_t *seq_type_proc_dir = NULL;
+/* Module init: create the seq proc root, then register the seq context key. */
+static int __init fid_mod_init(void)
+{
+	seq_type_proc_dir = lprocfs_register(LUSTRE_SEQ_NAME,
+					     proc_lustre_root,
+					     NULL, NULL);
+	if (IS_ERR(seq_type_proc_dir))
+		return PTR_ERR(seq_type_proc_dir);
+
+	LU_CONTEXT_KEY_INIT(&seq_thread_key);
+	lu_context_key_register(&seq_thread_key); /* NOTE(review): unchecked */
+	return 0;
+}
+/* Module exit: undo fid_mod_init() in reverse order. */
+static void __exit fid_mod_exit(void)
+{
+	lu_context_key_degister(&seq_thread_key);
+	if (seq_type_proc_dir != NULL && !IS_ERR(seq_type_proc_dir)) {
+		lprocfs_remove(&seq_type_proc_dir);
+		seq_type_proc_dir = NULL;
+	}
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre FID Module");
+MODULE_LICENSE("GPL");
+
+cfs_module(fid, "0.1.0", fid_mod_init, fid_mod_exit);
diff --git a/drivers/staging/lustre/lustre/fid/fid_internal.h b/drivers/staging/lustre/lustre/fid/fid_internal.h
new file mode 100644
index 000000000000..c3a94f4c9fce
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fid/fid_internal.h
@@ -0,0 +1,86 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/fid_internal.h
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+#ifndef __FID_INTERNAL_H
+#define __FID_INTERNAL_H
+
+#include <lustre/lustre_idl.h>
+#include <dt_object.h>
+
+#include <linux/libcfs/libcfs.h>
+/* Per-context state used while handling sequence requests. */
+struct seq_thread_info {
+	struct req_capsule     *sti_pill;	/* capsule of the current request */
+	struct lu_seq_range     sti_space;
+	struct lu_buf	   sti_buf;
+};
+/* NOTE(review): likely txn credits reserved per store update - confirm. */
+enum {
+	SEQ_TXN_STORE_CREDITS = 20
+};
+
+extern struct lu_context_key seq_thread_key;
+
+/* Functions used internally in module. */
+int seq_client_alloc_super(struct lu_client_seq *seq,
+			   const struct lu_env *env);
+
+/* Store API functions. */
+int seq_store_init(struct lu_server_seq *seq,
+		   const struct lu_env *env,
+		   struct dt_device *dt);
+
+void seq_store_fini(struct lu_server_seq *seq,
+		    const struct lu_env *env);
+
+int seq_store_read(struct lu_server_seq *seq,
+		   const struct lu_env *env);
+
+int seq_store_update(const struct lu_env *env, struct lu_server_seq *seq,
+		     struct lu_seq_range *out, int sync);
+
+#ifdef LPROCFS
+extern struct lprocfs_vars seq_server_proc_list[];
+extern struct lprocfs_vars seq_client_proc_list[];
+#endif
+
+
+extern proc_dir_entry_t *seq_type_proc_dir;
+
+#endif /* __FID_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/fid/fid_lib.c b/drivers/staging/lustre/lustre/fid/fid_lib.c
new file mode 100644
index 000000000000..eaff51a555fb
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fid/fid_lib.c
@@ -0,0 +1,97 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/fid_lib.c
+ *
+ * Miscellaneous fid functions.
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FID
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+
+#include <obd.h>
+#include <lu_object.h>
+#include <lustre_fid.h>
+
+/**
+ * A cluster-wide range from which fid-sequences are granted to servers and
+ * then clients.
+ *
+ * Fid namespace:
+ * <pre>
+ * Normal FID:	seq:64 [2^33,2^64-1]      oid:32	  ver:32
+ * IGIF      :	0:32, ino:32	      gen:32	  0:32
+ * IDIF      :	0:31, 1:1, ost-index:16,  objd:48	 0:32
+ * </pre>
+ *
+ * The first 0x400 sequences of normal FID are reserved for special purpose.
+ * FID_SEQ_START + 1 is for local file id generation.
+ * FID_SEQ_START + 2 is for .lustre directory and its objects
+ */
+const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE = {
+	FID_SEQ_NORMAL,	/* 1st member; presumably lsr_start - confirm */
+	(__u64)~0ULL	/* 2nd member; presumably lsr_end - confirm */
+};
+EXPORT_SYMBOL(LUSTRE_SEQ_SPACE_RANGE);
+
+/* Zero range, used for init and other purposes. */
+const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE = {
+	0,
+	0
+};
+EXPORT_SYMBOL(LUSTRE_SEQ_ZERO_RANGE);
+
+/* Lustre Big Fs Lock fid. */
+const struct lu_fid LUSTRE_BFL_FID = { .f_seq = FID_SEQ_SPECIAL,
+				       .f_oid = FID_OID_SPECIAL_BFL,
+				       .f_ver = 0x0000000000000000 };
+EXPORT_SYMBOL(LUSTRE_BFL_FID);
+
+/** Special fid for ".lustre" directory */
+const struct lu_fid LU_DOT_LUSTRE_FID = { .f_seq = FID_SEQ_DOT_LUSTRE,
+					  .f_oid = FID_OID_DOT_LUSTRE,
+					  .f_ver = 0x0000000000000000 };
+EXPORT_SYMBOL(LU_DOT_LUSTRE_FID);
+
+/** Special fid for "fid" special object in .lustre */
+const struct lu_fid LU_OBF_FID = { .f_seq = FID_SEQ_DOT_LUSTRE,
+				   .f_oid = FID_OID_DOT_LUSTRE_OBF,
+				   .f_ver = 0x0000000000000000 };
+EXPORT_SYMBOL(LU_OBF_FID);
diff --git a/drivers/staging/lustre/lustre/fid/fid_request.c b/drivers/staging/lustre/lustre/fid/fid_request.c
new file mode 100644
index 000000000000..fcaaca7e2e01
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fid/fid_request.c
@@ -0,0 +1,522 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/fid_request.c
+ *
+ * Lustre Sequence Manager
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FID
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <dt_object.h>
+#include <md_object.h>
+#include <obd_support.h>
+#include <lustre_req_layout.h>
+#include <lustre_fid.h>
+/* mdc RPC locks */
+#include <lustre_mdc.h>
+#include "fid_internal.h"
+/* Send a SEQ_QUERY RPC for \a opc and copy the granted range to \a output. */
+static int seq_client_rpc(struct lu_client_seq *seq,
+			  struct lu_seq_range *output, __u32 opc,
+			  const char *opcname)
+{
+	struct obd_export     *exp = seq->lcs_exp;
+	struct ptlrpc_request *req;
+	struct lu_seq_range   *out, *in;
+	__u32		 *op;
+	unsigned int	   debug_mask;
+	int		    rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_SEQ_QUERY,
+					LUSTRE_MDS_VERSION, SEQ_QUERY);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	/* Init operation code */
+	op = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_OPC);
+	*op = opc;
+
+	/* Zero out input range, this is not recovery yet. */
+	in = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_RANGE);
+	range_init(in);
+	ptlrpc_request_set_replen(req);
+
+	in->lsr_index = seq->lcs_space.lsr_index;
+	if (seq->lcs_type == LUSTRE_SEQ_METADATA)
+		fld_range_set_mdt(in);
+	else
+		fld_range_set_ost(in);
+
+	if (opc == SEQ_ALLOC_SUPER) {
+		req->rq_request_portal = SEQ_CONTROLLER_PORTAL;
+		req->rq_reply_portal = MDC_REPLY_PORTAL;
+		/* During allocating super sequence for data object,
+		 * the current thread might hold the export of MDT0(MDT0
+		 * precreating objects on this OST), and it will send the
+		 * request to MDT0 here, so we can not keep resending the
+		 * request here, otherwise if MDT0 is failed(umounted),
+		 * it can not release the export of MDT0 */
+		if (seq->lcs_type == LUSTRE_SEQ_DATA)
+			req->rq_no_delay = req->rq_no_resend = 1;
+		debug_mask = D_CONSOLE;
+	} else {
+		if (seq->lcs_type == LUSTRE_SEQ_METADATA)
+			req->rq_request_portal = SEQ_METADATA_PORTAL;
+		else
+			req->rq_request_portal = SEQ_DATA_PORTAL;
+		debug_mask = D_INFO;
+	}
+
+	ptlrpc_at_set_req_timeout(req);
+
+	if (seq->lcs_type == LUSTRE_SEQ_METADATA)
+		mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+	rc = ptlrpc_queue_wait(req);
+	if (seq->lcs_type == LUSTRE_SEQ_METADATA)
+		mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+	if (rc)
+		GOTO(out_req, rc);
+
+	out = req_capsule_server_get(&req->rq_pill, &RMF_SEQ_RANGE);
+	if (out == NULL)	/* reply carried no range buffer */
+		GOTO(out_req, rc = -EPROTO);
+	*output = *out;
+
+	if (!range_is_sane(output)) {
+		CERROR("%s: Invalid range received from server: "
+		       DRANGE"\n", seq->lcs_name, PRANGE(output));
+		GOTO(out_req, rc = -EINVAL);
+	}
+
+	if (range_is_exhausted(output)) {
+		CERROR("%s: Range received from server is exhausted: "
+		       DRANGE"\n", seq->lcs_name, PRANGE(output));
+		GOTO(out_req, rc = -EINVAL);
+	}
+
+	CDEBUG_LIMIT(debug_mask, "%s: Allocated %s-sequence "DRANGE"\n",
+		     seq->lcs_name, opcname, PRANGE(output));
+	EXIT;
+out_req:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/* Refill lcs_space with a super-sequence: via lcs_srv if set, else by RPC. */
+int seq_client_alloc_super(struct lu_client_seq *seq,
+			   const struct lu_env *env)
+{
+	int rc;
+	ENTRY;
+
+	mutex_lock(&seq->lcs_mutex);
+
+	if (seq->lcs_srv) {	/* lcs_srv set: call the server directly */
+		LASSERT(env != NULL);
+		rc = seq_server_alloc_super(seq->lcs_srv, &seq->lcs_space,
+					    env);
+	} else {
+		/* Check whether the connection to seq controller has been
+		 * setup (lcs_exp != NULL) */
+		if (seq->lcs_exp == NULL) {
+			mutex_unlock(&seq->lcs_mutex);
+			RETURN(-EINPROGRESS);
+		}
+
+		rc = seq_client_rpc(seq, &seq->lcs_space,
+				    SEQ_ALLOC_SUPER, "super");
+	}
+	mutex_unlock(&seq->lcs_mutex);
+	RETURN(rc);
+}
+
+/* Refill lcs_space with a meta-sequence: via lcs_srv if set, else by RPC. */
+static int seq_client_alloc_meta(const struct lu_env *env,
+				 struct lu_client_seq *seq)
+{
+	int rc;
+	ENTRY;
+
+	if (seq->lcs_srv) {
+		LASSERT(env != NULL);
+		rc = seq_server_alloc_meta(seq->lcs_srv, &seq->lcs_space, env);
+	} else {
+		do {
+			/* -EINPROGRESS/-EAGAIN: the meta server may not yet be
+			 * able to get a super sequence from the controller
+			 * (MDT0), so resend.  NOTE(review): retries are
+			 * immediate, with no delay or retry limit. */
+			rc = seq_client_rpc(seq, &seq->lcs_space,
+					    SEQ_ALLOC_META, "meta");
+		} while (rc == -EINPROGRESS || rc == -EAGAIN);
+	}
+	RETURN(rc);
+}
+
+/* Take the next sequence number from lcs_space, refilling it when empty. */
+static int seq_client_alloc_seq(const struct lu_env *env,
+				struct lu_client_seq *seq, seqno_t *seqnr)
+{
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(range_is_sane(&seq->lcs_space));
+
+	/* Local space used up: fetch a fresh meta-sequence range first. */
+	if (range_is_exhausted(&seq->lcs_space)) {
+		rc = seq_client_alloc_meta(env, seq);
+		if (rc != 0) {
+			CERROR("%s: Can't allocate new meta-sequence,"
+			       "rc %d\n", seq->lcs_name, rc);
+			RETURN(rc);
+		}
+
+		CDEBUG(D_INFO, "%s: New range - "DRANGE"\n",
+		       seq->lcs_name, PRANGE(&seq->lcs_space));
+	}
+
+	/* Hand out the first sequence of the range and step past it. */
+	LASSERT(!range_is_exhausted(&seq->lcs_space));
+	*seqnr = seq->lcs_space.lsr_start;
+	seq->lcs_space.lsr_start++;
+
+	CDEBUG(D_INFO, "%s: Allocated sequence ["LPX64"]\n", seq->lcs_name,
+	       *seqnr);
+
+	RETURN(rc);
+}
+/* Try to become the single updater; callers hold lcs_mutex on entry. */
+static int seq_fid_alloc_prep(struct lu_client_seq *seq,
+			      wait_queue_t *link)
+{
+	if (seq->lcs_update) {	/* another updater is active: wait for it */
+		add_wait_queue(&seq->lcs_waitq, link);
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&seq->lcs_mutex);
+
+		waitq_wait(link, TASK_UNINTERRUPTIBLE);
+
+		mutex_lock(&seq->lcs_mutex);
+		remove_wait_queue(&seq->lcs_waitq, link);
+		set_current_state(TASK_RUNNING);
+		return -EAGAIN;	/* lcs_mutex is held again; caller retries */
+	}
+	++seq->lcs_update;
+	mutex_unlock(&seq->lcs_mutex);	/* returns 0 with the mutex released */
+	return 0;
+}
+/* Give up the updater role: retakes lcs_mutex (left held) and wakes waiters. */
+static void seq_fid_alloc_fini(struct lu_client_seq *seq)
+{
+	LASSERT(seq->lcs_update == 1);
+	mutex_lock(&seq->lcs_mutex);
+	--seq->lcs_update;
+	wake_up(&seq->lcs_waitq);
+}
+
+/**
+ * Reserve one entire sequence for the caller.
+ *
+ * The sequence number is stored in \a seqnr and lcs_fid is set to that
+ * sequence with f_oid at the type's *_SEQ_MAX_WIDTH value.
+ *
+ * \retval 0 on success
+ * \retval error returned by seq_client_alloc_seq() otherwise
+ **/
+int seq_client_get_seq(const struct lu_env *env,
+		       struct lu_client_seq *seq, seqno_t *seqnr)
+{
+	wait_queue_t link;
+	int rc;
+
+	LASSERT(seqnr != NULL);
+	mutex_lock(&seq->lcs_mutex);
+	init_waitqueue_entry_current(&link);
+
+	/* Loop until this thread is the one allowed to update the sequence. */
+	do {
+		rc = seq_fid_alloc_prep(seq, &link);
+	} while (rc != 0);
+
+	rc = seq_client_alloc_seq(env, seq, seqnr);
+	if (rc != 0) {
+		CERROR("%s: Can't allocate new sequence, "
+		       "rc %d\n", seq->lcs_name, rc);
+		goto out_fini;
+	}
+
+	CDEBUG(D_INFO, "%s: allocate sequence "
+	       "[0x%16.16"LPF64"x]\n", seq->lcs_name, *seqnr);
+
+	/* The caller takes the whole sequence, so mark it as used. */
+	seq->lcs_fid.f_oid = (seq->lcs_type == LUSTRE_SEQ_METADATA) ?
+			     LUSTRE_METADATA_SEQ_MAX_WIDTH :
+			     LUSTRE_DATA_SEQ_MAX_WIDTH;
+
+	seq->lcs_fid.f_seq = *seqnr;
+	seq->lcs_fid.f_ver = 0;
+
+out_fini:
+	/* Shared exit: leave the updater role, then drop lcs_mutex. */
+	seq_fid_alloc_fini(seq);
+	mutex_unlock(&seq->lcs_mutex);
+
+	return rc;
+}
+EXPORT_SYMBOL(seq_client_get_seq);
+
+/* Allocate a fid from @seq into @fid; returns 0, 1 on sequence switch, or <0. */
+int seq_client_alloc_fid(const struct lu_env *env,
+			 struct lu_client_seq *seq, struct lu_fid *fid)
+{
+	wait_queue_t link;
+	int rc;
+	ENTRY;
+
+	LASSERT(seq != NULL);
+	LASSERT(fid != NULL);
+
+	init_waitqueue_entry_current(&link);
+	mutex_lock(&seq->lcs_mutex);
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_SEQ_EXHAUST))
+		seq->lcs_fid.f_oid = seq->lcs_width; /* forces a switch */
+
+	while (1) {
+		seqno_t seqnr;
+
+		if (!fid_is_zero(&seq->lcs_fid) &&
+		    fid_oid(&seq->lcs_fid) < seq->lcs_width) {
+			/* Just bump last allocated fid and return to caller. */
+			seq->lcs_fid.f_oid += 1;
+			rc = 0;
+			break;
+		}
+
+		rc = seq_fid_alloc_prep(seq, &link);
+		if (rc)
+			continue;	/* waited for another updater: re-check */
+
+		rc = seq_client_alloc_seq(env, seq, &seqnr);
+		if (rc) {
+			CERROR("%s: Can't allocate new sequence, "
+			       "rc %d\n", seq->lcs_name, rc);
+			seq_fid_alloc_fini(seq);
+			mutex_unlock(&seq->lcs_mutex);
+			RETURN(rc);
+		}
+
+		CDEBUG(D_INFO, "%s: Switch to sequence "
+		       "[0x%16.16"LPF64"x]\n", seq->lcs_name, seqnr);
+
+		seq->lcs_fid.f_oid = LUSTRE_FID_INIT_OID;
+		seq->lcs_fid.f_seq = seqnr;
+		seq->lcs_fid.f_ver = 0;
+
+		/*
+		 * Inform caller that sequence switch is performed to allow it
+		 * to setup FLD for it.
+		 */
+		rc = 1;
+
+		seq_fid_alloc_fini(seq);	/* retakes lcs_mutex */
+		break;
+	}
+
+	*fid = seq->lcs_fid;
+	mutex_unlock(&seq->lcs_mutex);
+
+	CDEBUG(D_INFO, "%s: Allocated FID "DFID"\n", seq->lcs_name,  PFID(fid));
+	RETURN(rc);
+}
+EXPORT_SYMBOL(seq_client_alloc_fid);
+
+/*
+ * Finish the current sequence due to disconnect.
+ * See mdc_import_event()
+ *
+ * Waits (uninterruptibly) until no sequence update is in flight, then zeroes
+ * the current FID and re-initialises the client's sequence range.
+ */
+void seq_client_flush(struct lu_client_seq *seq)
+{
+	wait_queue_t link;
+
+	LASSERT(seq != NULL);
+	init_waitqueue_entry_current(&link);
+	mutex_lock(&seq->lcs_mutex);
+
+	/* Sleep with the mutex dropped until lcs_update goes back to zero. */
+	while (seq->lcs_update) {
+		add_wait_queue(&seq->lcs_waitq, &link);
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&seq->lcs_mutex);
+
+		waitq_wait(&link, TASK_UNINTERRUPTIBLE);
+
+		mutex_lock(&seq->lcs_mutex);
+		remove_wait_queue(&seq->lcs_waitq, &link);
+		set_current_state(TASK_RUNNING);
+	}
+
+	fid_zero(&seq->lcs_fid);
+	/**
+	 * This index should not be used for seq range allocation.
+	 * Set to -1 for debug check.
+	 *
+	 * NOTE(review): range_init() runs right after this assignment; if it
+	 * also resets lsr_index the -1 marker is lost -- confirm.
+	 */
+
+	seq->lcs_space.lsr_index = -1;
+
+	range_init(&seq->lcs_space);
+	mutex_unlock(&seq->lcs_mutex);
+}
+EXPORT_SYMBOL(seq_client_flush);
+
+static void seq_client_proc_fini(struct lu_client_seq *seq);
+
+#ifdef LPROCFS
+/*
+ * Create the procfs directory of client sequence manager \a seq under
+ * seq_type_proc_dir and populate it from seq_client_proc_list.
+ *
+ * Returns 0 on success. If the directory cannot be registered the error
+ * encoded in the returned pointer is passed back (lcs_proc_dir then still
+ * holds that error pointer; seq_client_proc_fini() copes with it). If adding
+ * the variables fails the directory is torn down again and the error is
+ * returned.
+ */
+static int seq_client_proc_init(struct lu_client_seq *seq)
+{
+	int rc;
+	ENTRY;
+
+	seq->lcs_proc_dir = lprocfs_register(seq->lcs_name,
+					     seq_type_proc_dir,
+					     NULL, NULL);
+
+	if (IS_ERR(seq->lcs_proc_dir)) {
+		CERROR("%s: LProcFS failed in seq-init\n",
+		       seq->lcs_name);
+		rc = PTR_ERR(seq->lcs_proc_dir);
+		RETURN(rc);
+	}
+
+	rc = lprocfs_add_vars(seq->lcs_proc_dir,
+			      seq_client_proc_list, seq);
+	if (rc) {
+		CERROR("%s: Can't init sequence manager "
+		       "proc, rc %d\n", seq->lcs_name, rc);
+		GOTO(out_cleanup, rc);
+	}
+
+	RETURN(0);
+
+out_cleanup:
+	seq_client_proc_fini(seq);
+	/* Leave through RETURN like every other exit that follows ENTRY. */
+	RETURN(rc);
+}
+
+/*
+ * Remove the procfs directory of \a seq, if one was really registered, and
+ * forget the pointer. An error pointer left behind by a failed registration
+ * is only cleared, not removed.
+ */
+static void seq_client_proc_fini(struct lu_client_seq *seq)
+{
+	ENTRY;
+	if (seq->lcs_proc_dir != NULL && !IS_ERR(seq->lcs_proc_dir))
+		lprocfs_remove(&seq->lcs_proc_dir);
+	seq->lcs_proc_dir = NULL;
+	EXIT;
+}
+#else
+/* Built without LPROCFS: nothing to register, always succeeds. */
+static int seq_client_proc_init(struct lu_client_seq *seq)
+{
+	return 0;
+}
+
+/* Built without LPROCFS: nothing to tear down. */
+static void seq_client_proc_fini(struct lu_client_seq *seq)
+{
+	return;
+}
+#endif
+
+/*
+ * Initialise client sequence manager \a seq.
+ *
+ * \a exp, when given, is referenced with class_export_get() and stored in
+ * lcs_exp; \a srv is stored in lcs_srv. \a type selects the default sequence
+ * width and \a prefix is used to build the name "cli-<prefix>".
+ *
+ * Returns 0 on success or the error of seq_client_proc_init(), in which case
+ * seq_client_fini() has already been called on \a seq.
+ *
+ * NOTE(review): lcs_exp is only assigned when \a exp is non-NULL and
+ * lcs_update is read by seq_client_flush() before anything here sets it, so
+ * this appears to rely on the caller passing in a zeroed structure -- confirm.
+ */
+int seq_client_init(struct lu_client_seq *seq,
+		    struct obd_export *exp,
+		    enum lu_cli_type type,
+		    const char *prefix,
+		    struct lu_server_seq *srv)
+{
+	int rc;
+	ENTRY;
+
+	LASSERT(seq != NULL);
+	LASSERT(prefix != NULL);
+
+	seq->lcs_srv = srv;
+	seq->lcs_type = type;
+
+	mutex_init(&seq->lcs_mutex);
+	if (type == LUSTRE_SEQ_METADATA)
+		seq->lcs_width = LUSTRE_METADATA_SEQ_MAX_WIDTH;
+	else
+		seq->lcs_width = LUSTRE_DATA_SEQ_MAX_WIDTH;
+
+	init_waitqueue_head(&seq->lcs_waitq);
+	/* Make sure that things are clear before work is started. */
+	seq_client_flush(seq);
+
+	/* Without an export a metadata client must have a local server. */
+	if (exp != NULL)
+		seq->lcs_exp = class_export_get(exp);
+	else if (type == LUSTRE_SEQ_METADATA)
+		LASSERT(seq->lcs_srv != NULL);
+
+	snprintf(seq->lcs_name, sizeof(seq->lcs_name),
+		 "cli-%s", prefix);
+
+	rc = seq_client_proc_init(seq);
+	if (rc)
+		seq_client_fini(seq);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(seq_client_init);
+
+/*
+ * Tear down client sequence manager \a seq: remove its procfs entries, drop
+ * the export reference held in lcs_exp (if any) and clear the server pointer.
+ */
+void seq_client_fini(struct lu_client_seq *seq)
+{
+	ENTRY;
+
+	seq_client_proc_fini(seq);
+
+	if (seq->lcs_exp != NULL) {
+		class_export_put(seq->lcs_exp);
+		seq->lcs_exp = NULL;
+	}
+
+	seq->lcs_srv = NULL;
+	EXIT;
+}
+EXPORT_SYMBOL(seq_client_fini);
diff --git a/drivers/staging/lustre/lustre/fid/fid_store.c b/drivers/staging/lustre/lustre/fid/fid_store.c
new file mode 100644
index 000000000000..a90e6e37d689
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fid/fid_store.c
@@ -0,0 +1,259 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/fid_store.c
+ *
+ * Lustre Sequence Manager
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FID
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <dt_object.h>
+#include <md_object.h>
+#include <obd_support.h>
+#include <lustre_req_layout.h>
+#include <lustre_fid.h>
+#include "fid_internal.h"
+
+
+/*
+ * Point the per-thread lu_buf at the per-thread range scratch area and
+ * return it.
+ */
+static struct lu_buf *seq_store_buf(struct seq_thread_info *info)
+{
+	info->sti_buf.lb_buf = &info->sti_space;
+	info->sti_buf.lb_len = sizeof(info->sti_space);
+	return &info->sti_buf;
+}
+
+/*
+ * Commit callback context: the generic callback descriptor plus the server
+ * sequence whose lss_need_sync flag the callback clears.
+ */
+struct seq_update_callback {
+	struct dt_txn_commit_cb suc_cb;
+	struct lu_server_seq   *suc_seq;
+};
+
+/*
+ * Transaction callback installed by seq_update_cb_add(): clears the
+ * lss_need_sync flag of the associated server sequence and frees the
+ * callback context. \a env, \a th and \a err are not used.
+ */
+void seq_update_cb(struct lu_env *env, struct thandle *th,
+		   struct dt_txn_commit_cb *cb, int err)
+{
+	struct seq_update_callback *ccb;
+
+	/* Recover the enclosing context from the embedded descriptor. */
+	ccb = container_of0(cb, struct seq_update_callback, suc_cb);
+
+	LASSERT(ccb->suc_seq != NULL);
+
+	ccb->suc_seq->lss_need_sync = 0;
+	OBD_FREE_PTR(ccb);
+}
+
+/*
+ * Allocate a seq_update_callback for \a seq, mark the sequence as needing
+ * sync and register the callback on transaction \a th.
+ *
+ * Returns 0 on success, -ENOMEM if the context cannot be allocated, or the
+ * error of dt_trans_cb_add() (the context is freed again in that case).
+ *
+ * NOTE(review): lss_need_sync is set before dt_trans_cb_add() and is not
+ * reset when that call fails, so nothing in this file clears it afterwards
+ * -- confirm this is intended.
+ */
+int seq_update_cb_add(struct thandle *th, struct lu_server_seq *seq)
+{
+	struct seq_update_callback *ccb;
+	struct dt_txn_commit_cb	   *dcb;
+	int			   rc;
+
+	OBD_ALLOC_PTR(ccb);
+	if (ccb == NULL)
+		return -ENOMEM;
+
+	ccb->suc_seq	   = seq;
+	seq->lss_need_sync = 1;
+
+	dcb	       = &ccb->suc_cb;
+	dcb->dcb_func  = seq_update_cb;
+	INIT_LIST_HEAD(&dcb->dcb_linkage);
+	/* strncpy() may leave the name unterminated; terminate explicitly. */
+	strncpy(dcb->dcb_name, "seq_update_cb", MAX_COMMIT_CB_STR_LEN);
+	dcb->dcb_name[MAX_COMMIT_CB_STR_LEN - 1] = '\0';
+
+	rc = dt_trans_cb_add(th, dcb);
+	if (rc)
+		OBD_FREE_PTR(ccb);
+	return rc;
+}
+
+/*
+ * Write the server sequence space (lss_space) of \a seq to its backing
+ * object inside one local transaction and, when \a out is non-NULL, also
+ * create an FLD entry for \a out in the same transaction.
+ *
+ * \a sync requests a synchronous transaction. For an asynchronous update a
+ * commit callback is added instead; if that fails the transaction is made
+ * synchronous after all.
+ *
+ * Returns 0 on success or the first error met while creating, declaring,
+ * starting or executing the transaction.
+ *
+ * This function implies that caller takes care about locking.
+ */
+int seq_store_update(const struct lu_env *env, struct lu_server_seq *seq,
+		     struct lu_seq_range *out, int sync)
+{
+	struct dt_device *dt_dev = lu2dt_dev(seq->lss_obj->do_lu.lo_dev);
+	struct seq_thread_info *info;
+	struct thandle *th;
+	loff_t pos = 0;
+	int rc;
+
+	info = lu_context_key_get(&env->le_ctx, &seq_thread_key);
+	LASSERT(info != NULL);
+
+	th = dt_trans_create(env, dt_dev);
+	if (IS_ERR(th))
+		RETURN(PTR_ERR(th));
+
+	/* Declare everything the transaction is going to do. */
+	rc = dt_declare_record_write(env, seq->lss_obj,
+				     sizeof(struct lu_seq_range), 0, th);
+	if (rc)
+		GOTO(exit, rc);
+
+	if (out != NULL) {
+		rc = fld_declare_server_create(env,
+					       seq->lss_site->ss_server_fld,
+					       out, th);
+		if (rc)
+			GOTO(exit, rc);
+	}
+
+	rc = dt_trans_start_local(env, dt_dev, th);
+	if (rc)
+		GOTO(exit, rc);
+
+	/* Store ranges in le format. */
+	range_cpu_to_le(&info->sti_space, &seq->lss_space);
+
+	rc = dt_record_write(env, seq->lss_obj, seq_store_buf(info), &pos, th);
+	if (rc) {
+		CERROR("%s: Can't write space data, rc %d\n",
+		       seq->lss_name, rc);
+		GOTO(exit, rc);
+	} else if (out != NULL) {
+		rc = fld_server_create(env, seq->lss_site->ss_server_fld, out,
+				       th);
+		if (rc) {
+			CERROR("%s: Can't Update fld database, rc %d\n",
+				seq->lss_name, rc);
+			GOTO(exit, rc);
+		}
+	}
+	/* next sequence update will need sync until this update is committed
+	 * in case of sync operation this is not needed obviously */
+	if (!sync)
+		/* if callback can't be added then sync always */
+		sync = !!seq_update_cb_add(th, seq);
+
+	th->th_sync |= sync;
+exit:
+	/* The transaction is stopped on success and on every error path. */
+	dt_trans_stop(env, dt_dev, th);
+	return rc;
+}
+
+/*
+ * Load the stored sequence space of \a seq from its backing object into
+ * lss_space (converted from little endian).
+ *
+ * Returns 0 on success, -ENODATA when nothing was read, -EIO on a short
+ * read, or the negative value returned by the read operation.
+ *
+ * The caller is responsible for locking, unless none is needed (init time).
+ */
+int seq_store_read(struct lu_server_seq *seq,
+		   const struct lu_env *env)
+{
+	struct seq_thread_info *info;
+	loff_t offset = 0;
+	int nread;
+	ENTRY;
+
+	info = lu_context_key_get(&env->le_ctx, &seq_thread_key);
+	LASSERT(info != NULL);
+
+	nread = seq->lss_obj->do_body_ops->dbo_read(env, seq->lss_obj,
+						    seq_store_buf(info),
+						    &offset, BYPASS_CAPA);
+
+	/* Errors from the read operation are passed through untouched. */
+	if (nread < 0)
+		RETURN(nread);
+
+	if (nread == 0)
+		RETURN(-ENODATA);
+
+	if (nread != sizeof(info->sti_space)) {
+		CERROR("%s: Read only %d bytes of %d\n", seq->lss_name,
+		       nread, (int)sizeof(info->sti_space));
+		RETURN(-EIO);
+	}
+
+	range_le_to_cpu(&seq->lss_space, &info->sti_space);
+	CDEBUG(D_INFO, "%s: Space - "DRANGE"\n",
+	       seq->lss_name, PRANGE(&seq->lss_space));
+
+	RETURN(0);
+}
+
+/*
+ * Find or create the regular-file object that backs \a seq on device \a dt
+ * and remember it in lss_obj. The object FID and the name used in error
+ * messages depend on whether \a seq is a server or a controller sequence.
+ *
+ * Returns 0 on success or the error encoded in the pointer returned by
+ * dt_find_or_create().
+ */
+int seq_store_init(struct lu_server_seq *seq,
+		   const struct lu_env *env,
+		   struct dt_device *dt)
+{
+	struct dt_object_format dof;
+	struct dt_object *obj;
+	struct lu_attr attr;
+	struct lu_fid fid;
+	const char *name;
+	int rc;
+	ENTRY;
+
+	if (seq->lss_type == LUSTRE_SEQ_SERVER) {
+		name = LUSTRE_SEQ_SRV_NAME;
+		lu_local_obj_fid(&fid, FID_SEQ_SRV_OID);
+	} else {
+		name = LUSTRE_SEQ_CTL_NAME;
+		lu_local_obj_fid(&fid, FID_SEQ_CTL_OID);
+	}
+
+	memset(&attr, 0, sizeof(attr));
+	attr.la_valid = LA_MODE;
+	attr.la_mode = S_IFREG | 0666;
+	dof.dof_type = DFT_REGULAR;
+
+	obj = dt_find_or_create(env, dt, &fid, &dof, &attr);
+	if (IS_ERR(obj)) {
+		CERROR("%s: Can't find \"%s\" obj %d\n",
+		       seq->lss_name, name, (int)PTR_ERR(obj));
+		rc = PTR_ERR(obj);
+		RETURN(rc);
+	}
+
+	seq->lss_obj = obj;
+	rc = 0;
+	RETURN(rc);
+}
+
+/*
+ * Drop the reference on the backing object of \a seq, if it holds a real
+ * one, and clear lss_obj.
+ */
+void seq_store_fini(struct lu_server_seq *seq,
+		    const struct lu_env *env)
+{
+	ENTRY;
+
+	if (seq->lss_obj != NULL && !IS_ERR(seq->lss_obj))
+		lu_object_put(env, &seq->lss_obj->do_lu);
+	seq->lss_obj = NULL;
+
+	EXIT;
+}
diff --git a/drivers/staging/lustre/lustre/fid/lproc_fid.c b/drivers/staging/lustre/lustre/fid/lproc_fid.c
new file mode 100644
index 000000000000..49ea357be686
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fid/lproc_fid.c
@@ -0,0 +1,360 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/lproc_fid.c
+ *
+ * Lustre Sequence Manager
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FID
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <dt_object.h>
+#include <md_object.h>
+#include <obd_support.h>
+#include <lustre_req_layout.h>
+#include <lustre_fid.h>
+#include "fid_internal.h"
+
+#ifdef LPROCFS
+/*
+ * Parse a "[start - end]" sequence range from \a buffer.
+ *
+ * Only the two bounds come from the input. On success they replace the
+ * bounds of \a range; every other field of \a range keeps its value.
+ * On failure \a range is left untouched.
+ *
+ * Returns 0 on success, -EINVAL when the text does not match or the parsed
+ * range is not sane or is zero. \a file, \a count and \a data are unused.
+ *
+ * Note: this function is only used for testing, it is not safe for production
+ * use.
+ * NOTE(review): \a buffer is handed straight to sscanf(); if the procfs
+ * layer passes a user-space pointer it has to be copied in first -- confirm.
+ */
+static int
+seq_proc_write_common(struct file *file, const char *buffer,
+		      unsigned long count, void *data,
+		      struct lu_seq_range *range)
+{
+	struct lu_seq_range tmp;
+	int rc;
+	ENTRY;
+
+	LASSERT(range != NULL);
+
+	/*
+	 * sscanf() fills in the bounds only. Start from the current range so
+	 * that the remaining fields are defined and survive the copy back,
+	 * instead of being overwritten with uninitialised stack contents.
+	 */
+	tmp = *range;
+
+	rc = sscanf(buffer, "[%llx - %llx]\n",
+		    (long long unsigned *)&tmp.lsr_start,
+		    (long long unsigned *)&tmp.lsr_end);
+	if (rc != 2 || !range_is_sane(&tmp) || range_is_zero(&tmp))
+		RETURN(-EINVAL);
+	*range = tmp;
+	RETURN(0);
+}
+
+/*
+ * Format \a range into \a page (at most \a count bytes) and flag end of
+ * file. Returns whatever snprintf() returns. \a start, \a off and \a data
+ * are unused.
+ */
+static int
+seq_proc_read_common(char *page, char **start, off_t off,
+		     int count, int *eof, void *data,
+		     struct lu_seq_range *range)
+{
+	int rc;
+	ENTRY;
+
+	*eof = 1;
+	rc = snprintf(page, count, "["LPX64" - "LPX64"]:%x:%s\n",
+			PRANGE(range));
+	RETURN(rc);
+}
+
+/*
+ * Server side procfs stuff.
+ */
+
+/*
+ * procfs write handler for the server "space" entry: parse a range from
+ * \a buffer into lss_space under lss_mutex.
+ *
+ * Returns \a count when the range was accepted, otherwise the error of
+ * seq_proc_write_common() so the writer learns the input was rejected.
+ */
+static int
+seq_server_proc_write_space(struct file *file, const char *buffer,
+			    unsigned long count, void *data)
+{
+	struct lu_server_seq *seq = (struct lu_server_seq *)data;
+	int rc;
+	ENTRY;
+
+	LASSERT(seq != NULL);
+
+	mutex_lock(&seq->lss_mutex);
+	rc = seq_proc_write_common(file, buffer, count,
+				   data, &seq->lss_space);
+	if (rc == 0) {
+		CDEBUG(D_INFO, "%s: Space: "DRANGE"\n",
+		       seq->lss_name, PRANGE(&seq->lss_space));
+	}
+
+	mutex_unlock(&seq->lss_mutex);
+
+	/* Do not report success for input that was not applied. */
+	if (rc != 0)
+		RETURN(rc);
+
+	RETURN(count);
+}
+
+/*
+ * procfs read handler for the server "space" entry: print lss_space under
+ * lss_mutex. Returns the result of seq_proc_read_common().
+ */
+static int
+seq_server_proc_read_space(char *page, char **start, off_t off,
+			   int count, int *eof, void *data)
+{
+	struct lu_server_seq *seq = (struct lu_server_seq *)data;
+	int rc;
+	ENTRY;
+
+	LASSERT(seq != NULL);
+
+	mutex_lock(&seq->lss_mutex);
+	rc = seq_proc_read_common(page, start, off, count, eof,
+				  data, &seq->lss_space);
+	mutex_unlock(&seq->lss_mutex);
+
+	RETURN(rc);
+}
+
+/*
+ * procfs read handler for the server "server" entry.
+ *
+ * Prints the target uuid of the export used by the attached client sequence,
+ * or, when that client has no export, the name of the client's local server
+ * sequence, or "<none>" when no client sequence is attached.
+ *
+ * NOTE(review): the no-export branch dereferences lcs_srv without a NULL
+ * check, and lss_mutex is not taken here -- confirm both are safe.
+ */
+static int
+seq_server_proc_read_server(char *page, char **start, off_t off,
+			    int count, int *eof, void *data)
+{
+	struct lu_server_seq *seq = (struct lu_server_seq *)data;
+	struct client_obd *cli;
+	int rc;
+	ENTRY;
+
+	LASSERT(seq != NULL);
+
+	*eof = 1;
+	if (seq->lss_cli) {
+		if (seq->lss_cli->lcs_exp != NULL) {
+			cli = &seq->lss_cli->lcs_exp->exp_obd->u.cli;
+			rc = snprintf(page, count, "%s\n",
+				      cli->cl_target_uuid.uuid);
+		} else {
+			rc = snprintf(page, count, "%s\n",
+				      seq->lss_cli->lcs_srv->lss_name);
+		}
+	} else {
+		rc = snprintf(page, count, "<none>\n");
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * procfs write handler for the server "width" entry: parse an integer from
+ * \a buffer and store it in lss_width under lss_mutex.
+ *
+ * Returns \a count on success, otherwise the error of lprocfs_write_helper()
+ * so the writer learns the input was rejected.
+ *
+ * NOTE(review): the parsed value is not range checked before it is stored
+ * in lss_width (the client handler rejects values <= 0) -- confirm.
+ */
+static int
+seq_server_proc_write_width(struct file *file, const char *buffer,
+			    unsigned long count, void *data)
+{
+	struct lu_server_seq *seq = (struct lu_server_seq *)data;
+	int rc, val;
+	ENTRY;
+
+	LASSERT(seq != NULL);
+
+	mutex_lock(&seq->lss_mutex);
+
+	rc = lprocfs_write_helper(buffer, count, &val);
+	if (rc != 0) {
+		CERROR("%s: invalid width.\n", seq->lss_name);
+		GOTO(out_unlock, rc);
+	}
+
+	seq->lss_width = val;
+
+	CDEBUG(D_INFO, "%s: Width: "LPU64"\n",
+	       seq->lss_name, seq->lss_width);
+out_unlock:
+	mutex_unlock(&seq->lss_mutex);
+
+	/* The error carried to out_unlock must reach the caller. */
+	if (rc != 0)
+		RETURN(rc);
+
+	RETURN(count);
+}
+
+/*
+ * procfs read handler for the server "width" entry: print lss_width under
+ * lss_mutex. Returns the result of snprintf().
+ */
+static int
+seq_server_proc_read_width(char *page, char **start, off_t off,
+			   int count, int *eof, void *data)
+{
+	struct lu_server_seq *seq = (struct lu_server_seq *)data;
+	int rc;
+	ENTRY;
+
+	LASSERT(seq != NULL);
+
+	mutex_lock(&seq->lss_mutex);
+	rc = snprintf(page, count, LPU64"\n", seq->lss_width);
+	mutex_unlock(&seq->lss_mutex);
+
+	RETURN(rc);
+}
+
+/* Client side procfs stuff */
+
+/*
+ * procfs write handler for the client "space" entry: parse a range from
+ * \a buffer into lcs_space under lcs_mutex.
+ *
+ * Returns \a count when the range was accepted, otherwise the error of
+ * seq_proc_write_common() so the writer learns the input was rejected.
+ */
+static int
+seq_client_proc_write_space(struct file *file, const char *buffer,
+			    unsigned long count, void *data)
+{
+	struct lu_client_seq *seq = (struct lu_client_seq *)data;
+	int rc;
+	ENTRY;
+
+	LASSERT(seq != NULL);
+
+	mutex_lock(&seq->lcs_mutex);
+	rc = seq_proc_write_common(file, buffer, count,
+				   data, &seq->lcs_space);
+
+	if (rc == 0) {
+		CDEBUG(D_INFO, "%s: Space: "DRANGE"\n",
+		       seq->lcs_name, PRANGE(&seq->lcs_space));
+	}
+
+	mutex_unlock(&seq->lcs_mutex);
+
+	/* Do not report success for input that was not applied. */
+	if (rc != 0)
+		RETURN(rc);
+
+	RETURN(count);
+}
+
+/*
+ * procfs read handler for the client "space" entry: print lcs_space under
+ * lcs_mutex. Returns the result of seq_proc_read_common().
+ */
+static int
+seq_client_proc_read_space(char *page, char **start, off_t off,
+			   int count, int *eof, void *data)
+{
+	struct lu_client_seq *seq = (struct lu_client_seq *)data;
+	int rc;
+	ENTRY;
+
+	LASSERT(seq != NULL);
+
+	mutex_lock(&seq->lcs_mutex);
+	rc = seq_proc_read_common(page, start, off, count, eof,
+				  data, &seq->lcs_space);
+	mutex_unlock(&seq->lcs_mutex);
+
+	RETURN(rc);
+}
+
+/*
+ * procfs write handler for the client "width" entry.
+ *
+ * Parses an integer from \a buffer and, when it is positive and does not
+ * exceed the maximum width for this client type, stores it in lcs_width.
+ * Out-of-range values are ignored.
+ *
+ * Returns the error of lprocfs_write_helper() when parsing fails, \a count
+ * otherwise.
+ */
+static int
+seq_client_proc_write_width(struct file *file, const char *buffer,
+			    unsigned long count, void *data)
+{
+	struct lu_client_seq *seq = (struct lu_client_seq *)data;
+	__u64  limit;
+	int rc, width;
+	ENTRY;
+
+	LASSERT(seq != NULL);
+
+	mutex_lock(&seq->lcs_mutex);
+
+	rc = lprocfs_write_helper(buffer, count, &width);
+	if (rc == 0) {
+		if (seq->lcs_type == LUSTRE_SEQ_DATA)
+			limit = LUSTRE_DATA_SEQ_MAX_WIDTH;
+		else
+			limit = LUSTRE_METADATA_SEQ_MAX_WIDTH;
+
+		if (width > 0 && width <= limit) {
+			seq->lcs_width = width;
+
+			CDEBUG(D_INFO, "%s: Sequence size: "LPU64"\n",
+			       seq->lcs_name, seq->lcs_width);
+		}
+	}
+
+	mutex_unlock(&seq->lcs_mutex);
+
+	if (rc)
+		RETURN(rc);
+
+	RETURN(count);
+}
+
+/*
+ * procfs read handler for the client "width" entry: print lcs_width under
+ * lcs_mutex. Returns the result of snprintf().
+ */
+static int
+seq_client_proc_read_width(char *page, char **start, off_t off,
+			   int count, int *eof, void *data)
+{
+	struct lu_client_seq *seq = (struct lu_client_seq *)data;
+	int rc;
+	ENTRY;
+
+	LASSERT(seq != NULL);
+
+	mutex_lock(&seq->lcs_mutex);
+	rc = snprintf(page, count, LPU64"\n", seq->lcs_width);
+	mutex_unlock(&seq->lcs_mutex);
+
+	RETURN(rc);
+}
+
+/*
+ * procfs read handler for the client "fid" entry: print the current FID
+ * (lcs_fid) under lcs_mutex. Returns the result of snprintf().
+ */
+static int
+seq_client_proc_read_fid(char *page, char **start, off_t off,
+			 int count, int *eof, void *data)
+{
+	struct lu_client_seq *seq = (struct lu_client_seq *)data;
+	int rc;
+	ENTRY;
+
+	LASSERT(seq != NULL);
+
+	mutex_lock(&seq->lcs_mutex);
+	rc = snprintf(page, count, DFID"\n", PFID(&seq->lcs_fid));
+	mutex_unlock(&seq->lcs_mutex);
+
+	RETURN(rc);
+}
+
+/*
+ * procfs read handler for the client "server" entry: print the target uuid
+ * of the client's export, or the name of its local server sequence when it
+ * has no export.
+ *
+ * NOTE(review): the no-export branch dereferences lcs_srv without a NULL
+ * check, and lcs_mutex is not taken here -- confirm both are safe.
+ */
+static int
+seq_client_proc_read_server(char *page, char **start, off_t off,
+			    int count, int *eof, void *data)
+{
+	struct lu_client_seq *seq = (struct lu_client_seq *)data;
+	struct client_obd *cli;
+	int rc;
+	ENTRY;
+
+	LASSERT(seq != NULL);
+
+	if (seq->lcs_exp != NULL) {
+		cli = &seq->lcs_exp->exp_obd->u.cli;
+		rc = snprintf(page, count, "%s\n", cli->cl_target_uuid.uuid);
+	} else {
+		rc = snprintf(page, count, "%s\n", seq->lcs_srv->lss_name);
+	}
+	RETURN(rc);
+}
+
+/*
+ * procfs entries of a server sequence manager: entry name, read handler and
+ * write handler (NULL for read-only entries); the last field is left NULL.
+ * The table ends with an all-NULL entry.
+ */
+struct lprocfs_vars seq_server_proc_list[] = {
+	{ "space",    seq_server_proc_read_space, seq_server_proc_write_space, NULL },
+	{ "width",    seq_server_proc_read_width, seq_server_proc_write_width, NULL },
+	{ "server",   seq_server_proc_read_server, NULL, NULL },
+	{ NULL }};
+
+/*
+ * procfs entries of a client sequence manager: entry name, read handler and
+ * write handler (NULL for read-only entries); the last field is left NULL.
+ * The table ends with an all-NULL entry.
+ */
+struct lprocfs_vars seq_client_proc_list[] = {
+	{ "space",    seq_client_proc_read_space, seq_client_proc_write_space, NULL },
+	{ "width",    seq_client_proc_read_width, seq_client_proc_write_width, NULL },
+	{ "server",   seq_client_proc_read_server, NULL, NULL },
+	{ "fid",      seq_client_proc_read_fid, NULL, NULL },
+	{ NULL }};
+#endif
diff --git a/drivers/staging/lustre/lustre/fld/Makefile b/drivers/staging/lustre/lustre/fld/Makefile
new file mode 100644
index 000000000000..e7f2881a1d9e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fld/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LUSTRE_FS) += fld.o
+fld-y := fld_handler.o fld_request.o fld_cache.o fld_index.o lproc_fld.o
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/fld/fld_cache.c b/drivers/staging/lustre/lustre/fld/fld_cache.c
new file mode 100644
index 000000000000..347f2ae83bc8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fld/fld_cache.c
@@ -0,0 +1,566 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/fld_cache.c
+ *
+ * FLD (Fids Location Database)
+ *
+ * Author: Pravin Shelar <pravin.shelar@sun.com>
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FLD
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+# include <linux/jbd.h>
+# include <asm/div64.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_ver.h>
+#include <obd_support.h>
+#include <lprocfs_status.h>
+
+#include <dt_object.h>
+#include <md_object.h>
+#include <lustre_req_layout.h>
+#include <lustre_fld.h>
+#include "fld_internal.h"
+
+/**
+ * Allocate and set up an empty fld cache called \a name.
+ *
+ * \a cache_size and \a cache_threshold are stored as the shrink limits of
+ * the cache; the threshold has to be smaller than the size.
+ *
+ * \retval the new cache, or ERR_PTR(-ENOMEM) if it cannot be allocated.
+ */
+struct fld_cache *fld_cache_init(const char *name,
+				 int cache_size, int cache_threshold)
+{
+	struct fld_cache *fc;
+	ENTRY;
+
+	LASSERT(name != NULL);
+	LASSERT(cache_threshold < cache_size);
+
+	OBD_ALLOC_PTR(fc);
+	if (fc == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	/* Identity and limits. */
+	strlcpy(fc->fci_name, name,
+		sizeof(fc->fci_name));
+	fc->fci_cache_size = cache_size;
+	fc->fci_threshold = cache_threshold;
+
+	/* Empty entry list and LRU, protected by fci_lock. */
+	rwlock_init(&fc->fci_lock);
+	INIT_LIST_HEAD(&fc->fci_entries_head);
+	INIT_LIST_HEAD(&fc->fci_lru);
+	fc->fci_cache_count = 0;
+
+	/* Statistics start from zero. */
+	memset(&fc->fci_stat, 0, sizeof(fc->fci_stat));
+
+	CDEBUG(D_INFO, "%s: FLD cache - Size: %d, Threshold: %d\n",
+	       fc->fci_name, cache_size, cache_threshold);
+
+	RETURN(fc);
+}
+
+/**
+ * Flush \a cache, log its lookup statistics and free it.
+ */
+void fld_cache_fini(struct fld_cache *cache)
+{
+	__u64 hit_pct = 0;
+	ENTRY;
+
+	LASSERT(cache != NULL);
+	fld_cache_flush(cache);
+
+	/* Hit rate in percent; stays 0 when there was no lookup at all. */
+	if (cache->fci_stat.fst_count > 0) {
+		hit_pct = cache->fci_stat.fst_cache * 100;
+		do_div(hit_pct, cache->fci_stat.fst_count);
+	}
+
+	CDEBUG(D_INFO, "FLD cache statistics (%s):\n", cache->fci_name);
+	CDEBUG(D_INFO, "  Total reqs: "LPU64"\n", cache->fci_stat.fst_count);
+	CDEBUG(D_INFO, "  Cache reqs: "LPU64"\n", cache->fci_stat.fst_cache);
+	CDEBUG(D_INFO, "  Cache hits: "LPU64"%%\n", hit_pct);
+
+	OBD_FREE_PTR(cache);
+
+	EXIT;
+}
+
+/**
+ * Unlink \a node from both the entry list and the LRU list of \a cache,
+ * decrement the entry count and free the node.
+ *
+ * No locking is done here.
+ */
+void fld_cache_entry_delete(struct fld_cache *cache,
+			    struct fld_cache_entry *node)
+{
+	list_del(&node->fce_list);
+	list_del(&node->fce_lru);
+	cache->fci_cache_count--;
+	OBD_FREE_PTR(node);
+}
+
+/**
+ * Fix list by checking each entry with the NEXT entry in order.
+ *
+ * Walks the entry list and, for neighbours with equal flags, merges adjacent
+ * ranges of the same index, resolves overlaps (restarting the walk after
+ * each one) and removes exact duplicates.
+ */
+static void fld_fix_new_list(struct fld_cache *cache)
+{
+	struct fld_cache_entry *f_curr;
+	struct fld_cache_entry *f_next;
+	struct lu_seq_range *c_range;
+	struct lu_seq_range *n_range;
+	struct list_head *head = &cache->fci_entries_head;
+	ENTRY;
+
+restart_fixup:
+
+	list_for_each_entry_safe(f_curr, f_next, head, fce_list) {
+		c_range = &f_curr->fce_range;
+		/* Only an address is formed here; it is not used when
+		 * f_next turns out to be the list head (checked below). */
+		n_range = &f_next->fce_range;
+
+		LASSERT(range_is_sane(c_range));
+		if (&f_next->fce_list == head)
+			break;
+
+		/* Ranges with different flags are never combined. */
+		if (c_range->lsr_flags != n_range->lsr_flags)
+			continue;
+
+		LASSERTF(c_range->lsr_start <= n_range->lsr_start,
+			 "cur lsr_start "DRANGE" next lsr_start "DRANGE"\n",
+			 PRANGE(c_range), PRANGE(n_range));
+
+		/* check merge possibility with next range */
+		if (c_range->lsr_end == n_range->lsr_start) {
+			if (c_range->lsr_index != n_range->lsr_index)
+				continue;
+			/* Same index: grow next backwards, drop current. */
+			n_range->lsr_start = c_range->lsr_start;
+			fld_cache_entry_delete(cache, f_curr);
+			continue;
+		}
+
+		/* check if current range overlaps with next range. */
+		if (n_range->lsr_start < c_range->lsr_end) {
+			if (c_range->lsr_index == n_range->lsr_index) {
+				/* Same index: next becomes the union. */
+				n_range->lsr_start = c_range->lsr_start;
+				n_range->lsr_end = max(c_range->lsr_end,
+						       n_range->lsr_end);
+				fld_cache_entry_delete(cache, f_curr);
+			} else {
+				if (n_range->lsr_end <= c_range->lsr_end) {
+					/* Next ends inside current: current
+					 * replaces it. */
+					*n_range = *c_range;
+					fld_cache_entry_delete(cache, f_curr);
+				} else
+					/* Trim next to begin where current
+					 * ends. */
+					n_range->lsr_start = c_range->lsr_end;
+			}
+
+			/* we could have overlap over next
+			 * range too. better restart. */
+			goto restart_fixup;
+		}
+
+		/* kill duplicates */
+		if (c_range->lsr_start == n_range->lsr_start &&
+		    c_range->lsr_end == n_range->lsr_end)
+			fld_cache_entry_delete(cache, f_curr);
+	}
+
+	EXIT;
+}
+
+/**
+ * Add node \a f_new to the fld cache: link it into the entry list after
+ * \a pos and at the front of the LRU list, bump the entry count and run the
+ * list fixup.
+ */
+static inline void fld_cache_entry_add(struct fld_cache *cache,
+				       struct fld_cache_entry *f_new,
+				       struct list_head *pos)
+{
+	list_add(&f_new->fce_list, pos);
+	list_add(&f_new->fce_lru, &cache->fci_lru);
+
+	cache->fci_cache_count++;
+	fld_fix_new_list(cache);
+}
+
+/**
+ * Check if cache needs to be shrunk. If so - do it.
+ * Remove one entry in list and so on until cache is shrunk enough.
+ *
+ * Nothing happens while the entry count is below fci_cache_size. Otherwise
+ * entries are deleted from the tail of the LRU list until the count plus
+ * fci_threshold no longer exceeds fci_cache_size or the list is empty.
+ *
+ * \retval 0 always.
+ */
+static int fld_cache_shrink(struct fld_cache *cache)
+{
+	struct fld_cache_entry *flde;
+	struct list_head *curr;
+	int num = 0;
+	ENTRY;
+
+	LASSERT(cache != NULL);
+
+	if (cache->fci_cache_count < cache->fci_cache_size)
+		RETURN(0);
+
+	curr = cache->fci_lru.prev;
+
+	while (cache->fci_cache_count + cache->fci_threshold >
+	       cache->fci_cache_size && curr != &cache->fci_lru) {
+
+		flde = list_entry(curr, struct fld_cache_entry, fce_lru);
+		/* Step back before the entry under the cursor is freed. */
+		curr = curr->prev;
+		fld_cache_entry_delete(cache, flde);
+		num++;
+	}
+
+	CDEBUG(D_INFO, "%s: FLD cache - Shrunk by "
+	       "%d entries\n", cache->fci_name, num);
+
+	RETURN(0);
+}
+
+/**
+ * Kill all fld cache entries.
+ *
+ * The configured cache size is forced to zero only while the shrinker runs
+ * and is restored afterwards, so the cache keeps working after a flush
+ * instead of being emptied again on every later insertion.
+ */
+void fld_cache_flush(struct fld_cache *cache)
+{
+	int saved_size;
+	ENTRY;
+
+	write_lock(&cache->fci_lock);
+	saved_size = cache->fci_cache_size;
+	/* With a zero limit the shrinker evicts as much as it can. */
+	cache->fci_cache_size = 0;
+	fld_cache_shrink(cache);
+	cache->fci_cache_size = saved_size;
+	write_unlock(&cache->fci_lock);
+
+	EXIT;
+}
+
+/**
+ * Punch hole in existing range. Divide this range and add new
+ * entry accordingly.
+ *
+ * \a f_curr is cut back to end where \a f_new starts, \a f_new is linked in
+ * after it, and a newly allocated entry covering the rest of the old range
+ * (from the end of \a f_new to the old end of \a f_curr) follows.
+ *
+ * If the extra entry cannot be allocated, \a f_new is freed and the list is
+ * left unchanged.
+ */
+
+void fld_cache_punch_hole(struct fld_cache *cache,
+			  struct fld_cache_entry *f_curr,
+			  struct fld_cache_entry *f_new)
+{
+	const struct lu_seq_range *range = &f_new->fce_range;
+	const seqno_t new_start  = range->lsr_start;
+	const seqno_t new_end  = range->lsr_end;
+	struct fld_cache_entry *fldt;
+
+	ENTRY;
+	OBD_ALLOC_GFP(fldt, sizeof *fldt, GFP_ATOMIC);
+	if (!fldt) {
+		OBD_FREE_PTR(f_new);
+		EXIT;
+		/* overlap is not allowed, so dont mess up list. */
+		return;
+	}
+	/*  break f_curr RANGE into three RANGES:
+	 *	f_curr, f_new , fldt
+	 */
+
+	/* f_new = *range */
+
+	/*
+	 * fldt: the tail of the old f_curr range. Copy the whole range first
+	 * so the tail keeps the end, index and flags of the range it was
+	 * split from, then move its start behind the hole.
+	 */
+	fldt->fce_range = f_curr->fce_range;
+	fldt->fce_range.lsr_start = new_end;
+
+	/* f_curr */
+	f_curr->fce_range.lsr_end = new_start;
+
+	/* add these two entries to list */
+	fld_cache_entry_add(cache, f_new, &f_curr->fce_list);
+	fld_cache_entry_add(cache, fldt, &f_new->fce_list);
+
+	/* no need to fixup */
+	EXIT;
+}
+
+/**
+ * Handle range overlap in fld cache.
+ *
+ * \a f_new overlaps the existing entry \a f_curr. Every branch takes over
+ * \a f_new: it is either linked into the cache or freed here, so the caller
+ * must not touch it afterwards.
+ */
+static void fld_cache_overlap_handle(struct fld_cache *cache,
+				struct fld_cache_entry *f_curr,
+				struct fld_cache_entry *f_new)
+{
+	const struct lu_seq_range *range = &f_new->fce_range;
+	const seqno_t new_start  = range->lsr_start;
+	const seqno_t new_end  = range->lsr_end;
+	const mdsno_t mdt = range->lsr_index;
+
+	/* this is overlap case, these case are checking overlapping with
+	 * prev range only. fixup will handle overlaping with next range. */
+
+	if (f_curr->fce_range.lsr_index == mdt) {
+		/* Same index: widen the existing entry to the union. */
+		f_curr->fce_range.lsr_start = min(f_curr->fce_range.lsr_start,
+						  new_start);
+
+		f_curr->fce_range.lsr_end = max(f_curr->fce_range.lsr_end,
+						new_end);
+
+		OBD_FREE_PTR(f_new);
+		fld_fix_new_list(cache);
+
+	} else if (new_start <= f_curr->fce_range.lsr_start &&
+			f_curr->fce_range.lsr_end <= new_end) {
+		/* case 1: new range completely overshadowed existing range.
+		 *	 e.g. whole range migrated. update fld cache entry */
+
+		f_curr->fce_range = *range;
+		OBD_FREE_PTR(f_new);
+		fld_fix_new_list(cache);
+
+	} else if (f_curr->fce_range.lsr_start < new_start &&
+			new_end < f_curr->fce_range.lsr_end) {
+		/* case 2: new range fit within existing range. */
+
+		fld_cache_punch_hole(cache, f_curr, f_new);
+
+	} else  if (new_end <= f_curr->fce_range.lsr_end) {
+		/* case 3: overlap:
+		 *	 [new_start [c_start  new_end)  c_end)
+		 */
+
+		LASSERT(new_start <= f_curr->fce_range.lsr_start);
+
+		f_curr->fce_range.lsr_start = new_end;
+		fld_cache_entry_add(cache, f_new, f_curr->fce_list.prev);
+
+	} else if (f_curr->fce_range.lsr_start <= new_start) {
+		/* case 4: overlap:
+		 *	 [c_start [new_start c_end) new_end)
+		 */
+
+		LASSERT(f_curr->fce_range.lsr_end <= new_end);
+
+		f_curr->fce_range.lsr_end = new_start;
+		fld_cache_entry_add(cache, f_new, &f_curr->fce_list);
+	} else {
+		CERROR("NEW range ="DRANGE" curr = "DRANGE"\n",
+		       PRANGE(range),PRANGE(&f_curr->fce_range));
+		/* Not inserted anywhere: free it so it does not leak. */
+		OBD_FREE_PTR(f_new);
+	}
+}
+
+/*
+ * Allocate a cache entry holding a copy of \a range, which must be sane.
+ * The entry is not linked into any cache.
+ *
+ * Returns the new entry or ERR_PTR(-ENOMEM).
+ */
+struct fld_cache_entry
+*fld_cache_entry_create(const struct lu_seq_range *range)
+{
+	struct fld_cache_entry *f_new;
+
+	LASSERT(range_is_sane(range));
+
+	OBD_ALLOC_PTR(f_new);
+	if (!f_new)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	f_new->fce_range = *range;
+	RETURN(f_new);
+}
+
+/**
+ * Insert FLD entry in FLD cache.
+ *
+ * This function handles all cases of merging and breaking up of
+ * ranges.
+ *
+ * No locking is done here. The cache is shrunk first unless fci_no_shrink is
+ * set. \a f_new is either linked into the list or handed to
+ * fld_cache_overlap_handle().
+ *
+ * \retval 0 always.
+ */
+int fld_cache_insert_nolock(struct fld_cache *cache,
+			    struct fld_cache_entry *f_new)
+{
+	struct fld_cache_entry *f_curr;
+	struct fld_cache_entry *n;
+	struct list_head *head;
+	struct list_head *prev = NULL;
+	const seqno_t new_start  = f_new->fce_range.lsr_start;
+	const seqno_t new_end  = f_new->fce_range.lsr_end;
+	__u32 new_flags  = f_new->fce_range.lsr_flags;
+	ENTRY;
+
+	/*
+	 * Duplicate entries are eliminated in insert op.
+	 * So we don't need to search new entry before starting
+	 * insertion loop.
+	 */
+
+	if (!cache->fci_no_shrink)
+		fld_cache_shrink(cache);
+
+	head = &cache->fci_entries_head;
+
+	list_for_each_entry_safe(f_curr, n, head, fce_list) {
+		/* Stop at the first entry lying entirely after the new
+		 * range; the new entry goes in front of it. */
+		if (new_end < f_curr->fce_range.lsr_start ||
+		   (new_end == f_curr->fce_range.lsr_start &&
+		    new_flags != f_curr->fce_range.lsr_flags))
+			break;
+
+		prev = &f_curr->fce_list;
+		/* check if this range is to left of new range. */
+		if (new_start < f_curr->fce_range.lsr_end &&
+		    new_flags == f_curr->fce_range.lsr_flags) {
+			fld_cache_overlap_handle(cache, f_curr, f_new);
+			goto out;
+		}
+	}
+
+	/* No predecessor found: insert at the head of the list. */
+	if (prev == NULL)
+		prev = head;
+
+	CDEBUG(D_INFO, "insert range "DRANGE"\n", PRANGE(&f_new->fce_range));
+	/* Add new entry to cache and lru list. */
+	fld_cache_entry_add(cache, f_new, prev);
+out:
+	RETURN(0);
+}
+
+/*
+ * Create a cache entry for \a range and insert it into \a cache under the
+ * write lock.
+ *
+ * Returns 0 on success, the allocation error of fld_cache_entry_create(), or
+ * a non-zero result of fld_cache_insert_nolock() (the entry is freed then).
+ */
+int fld_cache_insert(struct fld_cache *cache,
+		     const struct lu_seq_range *range)
+{
+	struct fld_cache_entry	*flde;
+	int rc;
+
+	flde = fld_cache_entry_create(range);
+	if (IS_ERR(flde))
+		RETURN(PTR_ERR(flde));
+
+	write_lock(&cache->fci_lock);
+	rc = fld_cache_insert_nolock(cache, flde);
+	write_unlock(&cache->fci_lock);
+	if (rc)
+		OBD_FREE_PTR(flde);
+
+	RETURN(rc);
+}
+
+/*
+ * Delete the first cache entry that matches \a range; no locking is done
+ * here.
+ *
+ * An entry matches when its start equals the start of \a range, or when both
+ * its end and its flags equal those of \a range.
+ *
+ * NOTE(review): the "start ||  (end && flags)" grouping lets a start-only
+ * match ignore the flags -- confirm this is the intended condition.
+ */
+void fld_cache_delete_nolock(struct fld_cache *cache,
+		      const struct lu_seq_range *range)
+{
+	struct fld_cache_entry *flde;
+	struct fld_cache_entry *tmp;
+	struct list_head *head;
+
+	head = &cache->fci_entries_head;
+	list_for_each_entry_safe(flde, tmp, head, fce_list) {
+		/* delete the first matching entry and stop */
+		if (range->lsr_start == flde->fce_range.lsr_start ||
+		   (range->lsr_end == flde->fce_range.lsr_end &&
+		    range->lsr_flags == flde->fce_range.lsr_flags)) {
+			fld_cache_entry_delete(cache, flde);
+			break;
+		}
+	}
+}
+
+/**
+ * Delete FLD entry in FLD cache.
+ *
+ * Takes the cache write lock around fld_cache_delete_nolock().
+ */
+void fld_cache_delete(struct fld_cache *cache,
+		      const struct lu_seq_range *range)
+{
+	write_lock(&cache->fci_lock);
+	fld_cache_delete_nolock(cache, range);
+	write_unlock(&cache->fci_lock);
+}
+
+/*
+ * Find the first cache entry matching \a range; no locking is done here.
+ *
+ * An entry matches when its start equals the start of \a range, or when both
+ * its end and its flags equal those of \a range (the same test as in
+ * fld_cache_delete_nolock()).
+ *
+ * Returns the entry or NULL when nothing matches.
+ */
+struct fld_cache_entry
+*fld_cache_entry_lookup_nolock(struct fld_cache *cache,
+			      struct lu_seq_range *range)
+{
+	struct fld_cache_entry *flde;
+	struct fld_cache_entry *got = NULL;
+	struct list_head *head;
+
+	head = &cache->fci_entries_head;
+	list_for_each_entry(flde, head, fce_list) {
+		if (range->lsr_start == flde->fce_range.lsr_start ||
+		   (range->lsr_end == flde->fce_range.lsr_end &&
+		    range->lsr_flags == flde->fce_range.lsr_flags)) {
+			got = flde;
+			break;
+		}
+	}
+
+	RETURN(got);
+}
+
+/**
+ * Look up the cache entry matching \a range under the cache read lock.
+ *
+ * \retval the matching entry, or NULL when there is none.
+ *
+ * NOTE(review): the entry pointer is returned after the read lock has been
+ * dropped; nothing visible here keeps it from being deleted meanwhile --
+ * confirm how callers protect it.
+ */
+struct fld_cache_entry
+*fld_cache_entry_lookup(struct fld_cache *cache, struct lu_seq_range *range)
+{
+	struct fld_cache_entry *got = NULL;
+	ENTRY;
+
+	read_lock(&cache->fci_lock);
+	got = fld_cache_entry_lookup_nolock(cache, range);
+	read_unlock(&cache->fci_lock);
+	RETURN(got);
+}
+
+/**
+ * lookup \a seq sequence for range in fld cache.
+ *
+ * Walks the entry list under the read lock. When an entry containing
+ * \a seq is found it is copied to \a range and 0 is returned.
+ *
+ * On a miss -ENOENT is returned; if the walk stopped at an entry starting
+ * beyond \a seq, \a range has been overwritten with the entry before it
+ * (when there is one).
+ *
+ * NOTE(review): the statistics counters are incremented while only the read
+ * lock is held -- confirm concurrent updates are acceptable.
+ */
+int fld_cache_lookup(struct fld_cache *cache,
+		     const seqno_t seq, struct lu_seq_range *range)
+{
+	struct fld_cache_entry *flde;
+	struct fld_cache_entry *prev = NULL;
+	struct list_head *head;
+	ENTRY;
+
+	read_lock(&cache->fci_lock);
+	head = &cache->fci_entries_head;
+
+	cache->fci_stat.fst_count++;
+	list_for_each_entry(flde, head, fce_list) {
+		/* Walked past \a seq without a hit: give up. */
+		if (flde->fce_range.lsr_start > seq) {
+			if (prev != NULL)
+				*range = prev->fce_range;
+			break;
+		}
+
+		prev = flde;
+		if (range_within(&flde->fce_range, seq)) {
+			*range = flde->fce_range;
+
+			cache->fci_stat.fst_cache++;
+			read_unlock(&cache->fci_lock);
+			RETURN(0);
+		}
+	}
+	read_unlock(&cache->fci_lock);
+	RETURN(-ENOENT);
+}
diff --git a/drivers/staging/lustre/lustre/fld/fld_handler.c b/drivers/staging/lustre/lustre/fld/fld_handler.c
new file mode 100644
index 000000000000..d2707ae4ad57
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fld/fld_handler.c
@@ -0,0 +1,447 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/fld_handler.c
+ *
+ * FLD (Fids Location Database)
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ * Author: WangDi <wangdi@clusterfs.com>
+ * Author: Pravin Shelar <pravin.shelar@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FLD
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+# include <linux/jbd.h>
+# include <asm/div64.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_ver.h>
+#include <obd_support.h>
+#include <lprocfs_status.h>
+
+#include <md_object.h>
+#include <lustre_fid.h>
+#include <lustre_req_layout.h>
+#include "fld_internal.h"
+#include <lustre_fid.h>
+
+
+/* context key constructor/destructor: fld_key_init, fld_key_fini */
+LU_KEY_INIT_FINI(fld, struct fld_thread_info);
+
+/* context key: fld_thread_key */
+LU_CONTEXT_KEY_DEFINE(fld, LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD);
+
+proc_dir_entry_t *fld_type_proc_dir = NULL;
+
+/* Module init: create the FLD proc directory, then register the key. */
+static int __init fld_mod_init(void)
+{
+	fld_type_proc_dir = lprocfs_register(LUSTRE_FLD_NAME,
+					     proc_lustre_root, NULL, NULL);
+	if (IS_ERR(fld_type_proc_dir))
+		return PTR_ERR(fld_type_proc_dir);
+
+	LU_CONTEXT_KEY_INIT(&fld_thread_key);
+	lu_context_key_register(&fld_thread_key); /* result, if any, ignored */
+	return 0;
+}
+
+static void __exit fld_mod_exit(void)
+{
+	lu_context_key_degister(&fld_thread_key);
+	if (fld_type_proc_dir == NULL || IS_ERR(fld_type_proc_dir))
+		return;	/* the proc directory was never created */
+	lprocfs_remove(&fld_type_proc_dir);
+	fld_type_proc_dir = NULL;
+}
+
+/* Declare the FLD index update for \a range within transaction \a th. */
+int fld_declare_server_create(const struct lu_env *env,
+			      struct lu_server_fld *fld,
+			      struct lu_seq_range *range,
+			      struct thandle *th)
+{
+	int result = fld_declare_index_create(env, fld, range, th);
+
+	RETURN(result);
+}
+EXPORT_SYMBOL(fld_declare_server_create);
+
+/**
+ * Add \a range to the FLD index and cache while holding fld->lsf_lock.
+ *
+ * Used by the sequence allocator once a super-sequence has been granted
+ * to a server.
+ */
+int fld_server_create(const struct lu_env *env, struct lu_server_fld *fld,
+		      struct lu_seq_range *range, struct thandle *th)
+{
+	int result;
+
+	mutex_lock(&fld->lsf_lock);
+	result = fld_index_create(env, fld, range, th);
+	mutex_unlock(&fld->lsf_lock);
+
+	RETURN(result);
+}
+EXPORT_SYMBOL(fld_server_create);
+
+/**
+ *  Look up the server owning \a seq and return its range in \a range.
+ *
+ *  The local cache is tried first.  On a miss, a node holding the FLD
+ *  index (lsf_obj set) fails with -EIO; any other node asks the super
+ *  sequence controller (MDT0) and caches the reply (not persistently).
+ */
+int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld,
+		      seqno_t seq, struct lu_seq_range *range)
+{
+	struct lu_seq_range *erange;
+	struct fld_thread_info *info;
+	int rc;
+	ENTRY;
+
+	info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+	LASSERT(info != NULL);
+	erange = &info->fti_lrange;
+
+	/* Lookup it in the cache. */
+	rc = fld_cache_lookup(fld->lsf_cache, seq, erange);
+	if (rc == 0) {
+		if (unlikely(fld_range_type(erange) != fld_range_type(range) &&
+			     !fld_range_is_any(range))) {
+			CERROR("%s: FLD cache range "DRANGE" does not match "
+			       "requested flag %x: rc = %d\n", fld->lsf_name,
+			       PRANGE(erange), range->lsr_flags, -EIO);
+			RETURN(-EIO);
+		}
+		*range = *erange;
+		RETURN(0);
+	}
+
+	if (fld->lsf_obj) {
+		/* On server side, all entries should be in cache.
+		 * If we can not find it in cache, just return error */
+		CERROR("%s: Cannot find sequence "LPX64": rc = %d\n",
+			fld->lsf_name, seq, -EIO);
+		RETURN(-EIO);
+	} else {
+		LASSERT(fld->lsf_control_exp);
+		/* send request to mdt0 i.e. super seq. controller.
+		 * This is temporary solution, long term solution is fld
+		 * replication on all mdt servers.
+		 */
+		range->lsr_start = seq;
+		rc = fld_client_rpc(fld->lsf_control_exp,
+				    range, FLD_LOOKUP);
+		if (rc == 0)
+			fld_cache_insert(fld->lsf_cache, range); /* best effort */
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(fld_server_lookup);
+
+/**
+ * Dispatch one FLD request.  Every MDT serves lookups but only MDT0 holds
+ * the FLD index, so a cache miss elsewhere is forwarded to MDT0 (see
+ * fld_server_lookup()).  Opcodes other than FLD_LOOKUP yield -EINVAL.
+ */
+static int fld_server_handle(struct lu_server_fld *fld,
+			     const struct lu_env *env,
+			     __u32 opc, struct lu_seq_range *range,
+			     struct fld_thread_info *info)
+{
+	int rc;
+	ENTRY;
+
+	switch (opc) {
+	case FLD_LOOKUP:
+		rc = fld_server_lookup(env, fld, range->lsr_start, range);
+		break;
+	default:
+		rc = -EINVAL;
+		break;
+	}
+
+	CDEBUG(D_INFO, "%s: FLD req handle: error %d (opc: %u, range: "
+	       DRANGE")\n", fld->lsf_name, rc, opc, PRANGE(range));
+
+	RETURN(rc);
+
+}
+
+/* Unpack an FLD query from \a req, run it and fill the reply buffer. */
+static int fld_req_handle(struct ptlrpc_request *req,
+			  struct fld_thread_info *info)
+{
+	struct obd_export *exp = req->rq_export;
+	struct lu_site *site = exp->exp_obd->obd_lu_dev->ld_site;
+	struct req_capsule *pill = info->fti_pill;
+	struct lu_seq_range *in;
+	struct lu_seq_range *out;
+	__u32 *opc;
+	int rc;
+	ENTRY;
+
+	rc = req_capsule_server_pack(pill);
+	if (rc)
+		RETURN(err_serious(rc));
+
+	/* A request missing any of its fields is a protocol error. */
+	opc = req_capsule_client_get(pill, &RMF_FLD_OPC);
+	if (opc == NULL)
+		RETURN(err_serious(-EPROTO));
+	in = req_capsule_client_get(pill, &RMF_FLD_MDFLD);
+	if (in == NULL)
+		RETURN(err_serious(-EPROTO));
+	out = req_capsule_server_get(pill, &RMF_FLD_MDFLD);
+	if (out == NULL)
+		RETURN(err_serious(-EPROTO));
+	*out = *in;
+
+	/* Old 2.0 clients leave 'lsr_flags' uninitialized, so default
+	 * the reply range to 'LU_SEQ_RANGE_MDT' for them. */
+	if (!(exp_connect_flags(exp) & OBD_CONNECT_64BITHASH) &&
+	    !(exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) &&
+	    !(exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) &&
+	    !exp->exp_libclient)
+		fld_range_set_mdt(out);
+
+	rc = fld_server_handle(lu_site2seq(site)->ss_server_fld,
+			       req->rq_svc_thread->t_env, *opc, out, info);
+
+	RETURN(rc);
+}
+
+/* Bind the request's capsule to \a info and set it up as an FLD query. */
+static void fld_thread_info_init(struct ptlrpc_request *req,
+				 struct fld_thread_info *info)
+{
+	info->fti_pill = &req->rq_pill;
+	req_capsule_init(&req->rq_pill, req, RCL_SERVER);
+	req_capsule_set(&req->rq_pill, &RQF_FLD_QUERY);
+}
+
+static void fld_thread_info_fini(struct fld_thread_info *info)
+{
+	req_capsule_fini(info->fti_pill);	/* undo fld_thread_info_init() */
+}
+
+/* Serve one FLD request using the service thread's per-thread FLD info. */
+static int fld_handle(struct ptlrpc_request *req)
+{
+	const struct lu_env *env = req->rq_svc_thread->t_env;
+	struct fld_thread_info *fti;
+	int result;
+
+	LASSERT(env != NULL);
+	fti = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+	LASSERT(fti != NULL);
+
+	/* set up the capsule, handle the request, tear the capsule down */
+	fld_thread_info_init(req, fti);
+	result = fld_req_handle(req, fti);
+	fld_thread_info_fini(fti);
+
+	return result;
+}
+
+/*
+ * Exported entry point for FLD RPCs called from MDT: serves info's request.
+ */
+int fld_query(struct com_thread_info *info)
+{
+	return fld_handle(info->cti_pill->rc_req);
+}
+EXPORT_SYMBOL(fld_query);
+
+/*
+ * Returns true, if fid is local to this server node.
+ *
+ * WARNING: only the client FLD cache is consulted.  If the sequence is not
+ * cached (or there is no client FLD) the result is a conservative "local",
+ * so the function may return true for a remote fid.
+ * It is supposed to be used in assertion checks only.
+ */
+int fid_is_local(const struct lu_env *env,
+		 struct lu_site *site, const struct lu_fid *fid)
+{
+	int result;
+	struct seq_server_site *ss_site;
+	struct lu_seq_range *range;
+	struct fld_thread_info *info;
+	ENTRY;
+
+	info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+	range = &info->fti_lrange;
+
+	result = 1; /* conservatively assume fid is local */
+	ss_site = lu_site2seq(site);
+	if (ss_site->ss_client_fld != NULL) {
+		int rc;
+
+		rc = fld_cache_lookup(ss_site->ss_client_fld->lcf_cache,
+				      fid_seq(fid), range);
+		if (rc == 0)
+			result = (range->lsr_index == ss_site->ss_node_id);
+	}
+	RETURN(result);	/* pairs with ENTRY above */
+}
+EXPORT_SYMBOL(fid_is_local);
+
+static void fld_server_proc_fini(struct lu_server_fld *fld);
+
+#ifdef LPROCFS
+/* Create the per-server proc directory and its read-only "fldb" file. */
+static int fld_server_proc_init(struct lu_server_fld *fld)
+{
+	int rc;
+	ENTRY;
+
+	fld->lsf_proc_dir = lprocfs_register(fld->lsf_name, fld_type_proc_dir,
+					     fld_server_proc_list, fld);
+	if (IS_ERR(fld->lsf_proc_dir))
+		RETURN(PTR_ERR(fld->lsf_proc_dir));
+
+	rc = lprocfs_seq_create(fld->lsf_proc_dir, "fldb", 0444,
+				&fld_proc_seq_fops, fld);
+	if (rc == 0)
+		RETURN(0);
+
+	/* could not add the file: remove the directory again */
+	lprocfs_remove(&fld->lsf_proc_dir);
+	fld->lsf_proc_dir = NULL;
+
+	RETURN(rc);
+}
+
+/* Remove the proc directory if one was created; always clear the pointer. */
+static void fld_server_proc_fini(struct lu_server_fld *fld)
+{
+	ENTRY;
+
+	if (fld->lsf_proc_dir != NULL && !IS_ERR(fld->lsf_proc_dir))
+		lprocfs_remove(&fld->lsf_proc_dir);
+	fld->lsf_proc_dir = NULL;
+	EXIT;
+}
+#else
+static int fld_server_proc_init(struct lu_server_fld *fld)
+{
+	return 0;	/* built without LPROCFS: nothing to create */
+}
+
+static void fld_server_proc_fini(struct lu_server_fld *fld)
+{
+	return;		/* built without LPROCFS: nothing to remove */
+}
+#endif
+
+/*
+ * Set up a server FLD: name, range cache, the FLD index (only when
+ * mds_node_id is 0 and type is LU_SEQ_RANGE_MDT) and the proc entries.
+ * Returns 0 or a negative errno; on failure fld_server_fini() is run.
+ */
+int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld,
+		    struct dt_device *dt, const char *prefix, int mds_node_id,
+		    int type)
+{
+	int cache_size, cache_threshold;
+	int rc;
+	ENTRY;
+
+	/* Reset pointers first: the error path runs fld_server_fini(), which
+	 * inspects lsf_obj and lsf_proc_dir even if init stopped early. */
+	fld->lsf_obj = NULL;
+	fld->lsf_proc_dir = NULL;
+	fld->lsf_control_exp = NULL;
+
+	snprintf(fld->lsf_name, sizeof(fld->lsf_name), "srv-%s", prefix);
+	cache_size = FLD_SERVER_CACHE_SIZE / sizeof(struct fld_cache_entry);
+	cache_threshold = cache_size * FLD_SERVER_CACHE_THRESHOLD / 100;
+
+	mutex_init(&fld->lsf_lock);
+	fld->lsf_cache = fld_cache_init(fld->lsf_name,
+					cache_size, cache_threshold);
+	if (IS_ERR(fld->lsf_cache)) {
+		rc = PTR_ERR(fld->lsf_cache);
+		fld->lsf_cache = NULL;
+		GOTO(out, rc);
+	}
+
+	/* only the node that owns the FLD index opens it */
+	if (!mds_node_id && type == LU_SEQ_RANGE_MDT) {
+		rc = fld_index_init(env, fld, dt);
+		if (rc)
+			GOTO(out, rc);
+	}
+
+	rc = fld_server_proc_init(fld);
+	GOTO(out, rc);
+
+out:
+	if (rc)
+		fld_server_fini(env, fld);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(fld_server_init);
+
+/* Tear down a server FLD: proc entries, index object, then the cache. */
+void fld_server_fini(const struct lu_env *env, struct lu_server_fld *fld)
+{
+	ENTRY;
+
+	fld_server_proc_fini(fld);
+	fld_index_fini(env, fld);
+
+	/* an ERR_PTR value is not a cache to free, but is still cleared */
+	if (fld->lsf_cache != NULL && !IS_ERR(fld->lsf_cache))
+		fld_cache_fini(fld->lsf_cache);
+	fld->lsf_cache = NULL;
+
+	EXIT;
+}
+EXPORT_SYMBOL(fld_server_fini);
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre FLD");
+MODULE_LICENSE("GPL");
+
+cfs_module(mdd, "0.1.0", fld_mod_init, fld_mod_exit);
diff --git a/drivers/staging/lustre/lustre/fld/fld_index.c b/drivers/staging/lustre/lustre/fld/fld_index.c
new file mode 100644
index 000000000000..ec68a54c23bd
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fld/fld_index.c
@@ -0,0 +1,426 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/fld_index.c
+ *
+ * Author: WangDi <wangdi@clusterfs.com>
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FLD
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+# include <linux/jbd.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_ver.h>
+#include <obd_support.h>
+#include <lprocfs_status.h>
+
+#include <dt_object.h>
+#include <md_object.h>
+#include <lustre_mdc.h>
+#include <lustre_fid.h>
+#include <lustre_fld.h>
+#include "fld_internal.h"
+
+const char fld_index_name[] = "fld";
+
+static const struct lu_seq_range IGIF_FLD_RANGE = {
+	.lsr_start = FID_SEQ_IGIF,
+	.lsr_end   = FID_SEQ_IGIF_MAX + 1,
+	.lsr_index = 0,
+	.lsr_flags = LU_SEQ_RANGE_MDT
+};
+
+static const struct lu_seq_range DOT_LUSTRE_FLD_RANGE = {
+	.lsr_start = FID_SEQ_DOT_LUSTRE,
+	.lsr_end   = FID_SEQ_DOT_LUSTRE + 1,
+	.lsr_index = 0,
+	.lsr_flags = LU_SEQ_RANGE_MDT
+};
+
+static const struct lu_seq_range ROOT_FLD_RANGE = {
+	.lsr_start = FID_SEQ_ROOT,
+	.lsr_end   = FID_SEQ_ROOT + 1,
+	.lsr_index = 0,
+	.lsr_flags = LU_SEQ_RANGE_MDT
+};
+
+const struct dt_index_features fld_index_features = {
+	.dif_flags       = DT_IND_UPDATE,
+	.dif_keysize_min = sizeof(seqno_t),
+	.dif_keysize_max = sizeof(seqno_t),
+	.dif_recsize_min = sizeof(struct lu_seq_range),
+	.dif_recsize_max = sizeof(struct lu_seq_range),
+	.dif_ptrsize     = 4
+};
+
+extern struct lu_context_key fld_thread_key;
+
+int fld_declare_index_create(const struct lu_env *env,
+			     struct lu_server_fld *fld,
+			     const struct lu_seq_range *new_range,
+			     struct thandle *th)
+{
+	struct lu_seq_range	*tmp;
+	struct lu_seq_range	*range;
+	struct fld_thread_info	*info;
+	int			rc = 0;
+
+	ENTRY;
+	/* Declare, in \a th, the index updates fld_index_create() performs. */
+	info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+	range = &info->fti_lrange;
+	tmp = &info->fti_irange;
+	memset(range, 0, sizeof(*range));
+	/* 0: the start is already inside a known range; -ENOENT: it is free */
+	rc = fld_index_lookup(env, fld, new_range->lsr_start, range);
+	if (rc == 0) {
+		/* In case of duplicate entry, the location must be same */
+		LASSERT((range_compare_loc(new_range, range) == 0));
+		GOTO(out, rc = -EEXIST);
+	}
+
+	if (rc != -ENOENT) {
+		CERROR("%s: lookup range "DRANGE" error: rc = %d\n",
+			fld->lsf_name, PRANGE(range), rc);
+		GOTO(out, rc);
+	}
+
+	/* Check for merge case, since the fld entry can only be incremental,
+	 * so we will only check whether it can be merged from the left. */
+	if (new_range->lsr_start == range->lsr_end && range->lsr_end != 0 &&
+	    range_compare_loc(new_range, range) == 0) {
+		range_cpu_to_be(tmp, range);
+		rc = dt_declare_delete(env, fld->lsf_obj,
+				       (struct dt_key *)&tmp->lsr_start, th);
+		if (rc) {
+			CERROR("%s: declare record "DRANGE" failed: rc = %d\n",
+			       fld->lsf_name, PRANGE(range), rc);
+			GOTO(out, rc);
+		}
+		memcpy(tmp, new_range, sizeof(*new_range));
+		tmp->lsr_start = range->lsr_start; /* merged record start */
+	} else {
+		memcpy(tmp, new_range, sizeof(*new_range));
+	}
+	/* the record is converted to big-endian and keyed by its lsr_start */
+	range_cpu_to_be(tmp, tmp);
+	rc = dt_declare_insert(env, fld->lsf_obj, (struct dt_rec *)tmp,
+			       (struct dt_key *)&tmp->lsr_start, th);
+out:
+	RETURN(rc);
+}
+
+/**
+ * Insert a range into the fld store and the fld cache.
+ *
+ *      \param  new_range  range to be inserted
+ *      \param  th         transaction for this operation, as it could be
+ *			 part of a compound transaction.
+ *
+ *      \retval  0  success
+ *      \retval  -ve error (-EEXIST if the start is already in a range)
+ *
+ * The whole fld index insertion is protected by seq->lss_mutex (see
+ * seq_server_alloc_super), i.e. only one thread accesses the fldb at a
+ * time, so we do not need to worry about the fld file and cache being
+ * changed between declare and create.
+ * Because fld entries can only be added incrementally, we only check
+ * whether the new range can be merged from the left.
+ **/
+int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld,
+		     const struct lu_seq_range *new_range, struct thandle *th)
+{
+	struct lu_seq_range	*range;
+	struct lu_seq_range	*tmp;
+	struct fld_thread_info	*info;
+	int			rc = 0;
+	int			deleted = 0;
+	struct fld_cache_entry	*flde;
+	ENTRY;
+
+	info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+	/* the caller must already hold lsf_lock */
+	LASSERT(mutex_is_locked(&fld->lsf_lock));
+
+	range = &info->fti_lrange;
+	memset(range, 0, sizeof(*range));
+	tmp = &info->fti_irange;
+	rc = fld_index_lookup(env, fld, new_range->lsr_start, range);
+	if (rc != -ENOENT) {
+		rc = rc == 0 ? -EEXIST : rc;
+		GOTO(out, rc);
+	}
+	/* adjacent left neighbour, same location: merge into one record */
+	if (new_range->lsr_start == range->lsr_end && range->lsr_end != 0 &&
+	    range_compare_loc(new_range, range) == 0) {
+		range_cpu_to_be(tmp, range);
+		rc = dt_delete(env, fld->lsf_obj,
+			       (struct dt_key *)&tmp->lsr_start, th,
+				BYPASS_CAPA);
+		if (rc != 0)
+			GOTO(out, rc);
+		memcpy(tmp, new_range, sizeof(*new_range));
+		tmp->lsr_start = range->lsr_start;
+		deleted = 1;
+	} else {
+		memcpy(tmp, new_range, sizeof(*new_range));
+	}
+
+	range_cpu_to_be(tmp, tmp);
+	rc = dt_insert(env, fld->lsf_obj, (struct dt_rec *)tmp,
+		       (struct dt_key *)&tmp->lsr_start, th, BYPASS_CAPA, 1);
+	if (rc != 0) {
+		CERROR("%s: insert range "DRANGE" failed: rc = %d\n",
+		       fld->lsf_name, PRANGE(new_range), rc);
+		GOTO(out, rc);
+	}
+	/* add the new range to the in-memory cache as well */
+	flde = fld_cache_entry_create(new_range);
+	if (IS_ERR(flde))
+		GOTO(out, rc = PTR_ERR(flde));
+
+	write_lock(&fld->lsf_cache->fci_lock);
+	if (deleted)
+		fld_cache_delete_nolock(fld->lsf_cache, new_range);
+	rc = fld_cache_insert_nolock(fld->lsf_cache, flde);
+	write_unlock(&fld->lsf_cache->fci_lock);
+	if (rc)
+		OBD_FREE_PTR(flde);	/* insert failed: free the entry */
+out:
+	RETURN(rc);
+}
+
+/**
+ * Look up the range holding \a seq in the fld cache; the caller should
+ * handle the attached location data (flags, index).
+ *
+ * \param  seq     seq for lookup.
+ * \param  range   result of lookup.
+ *
+ * \retval  0	   found, \a range is the matched range;
+ * \retval -ENOENT      not found; \a range is not written on a cache miss
+ * \retval  -ve	 other error;
+ */
+int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld,
+		     seqno_t seq, struct lu_seq_range *range)
+{
+	struct lu_seq_range     *fld_rec;
+	struct fld_thread_info  *info;
+	int rc;
+
+	ENTRY;
+
+	info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+	fld_rec = &info->fti_rec;
+	/* NOTE(review): callers test range->lsr_end after -ENOENT -- confirm */
+	rc = fld_cache_lookup(fld->lsf_cache, seq, fld_rec);
+	if (rc == 0) {
+		*range = *fld_rec;
+		if (range_within(range, seq))
+			rc = 0;
+		else
+			rc = -ENOENT;
+	}
+
+	CDEBUG(D_INFO, "%s: lookup seq = "LPX64" range : "DRANGE" rc = %d\n",
+	       fld->lsf_name, seq, PRANGE(range), rc);
+
+	RETURN(rc);
+}
+
+/* Insert \a range in its own local transaction; -EEXIST counts as success. */
+int fld_insert_entry(const struct lu_env *env,
+		     struct lu_server_fld *fld,
+		     const struct lu_seq_range *range)
+{
+	struct dt_device *dev = lu2dt_dev(fld->lsf_obj->do_lu.lo_dev);
+	struct thandle *handle;
+	int rc;
+	ENTRY;
+
+	handle = dt_trans_create(env, dev);
+	if (IS_ERR(handle))
+		RETURN(PTR_ERR(handle));
+
+	rc = fld_declare_index_create(env, fld, range, handle);
+	if (rc == -EEXIST)
+		GOTO(out, rc = 0);
+	if (rc != 0)
+		GOTO(out, rc);
+
+	rc = dt_trans_start_local(env, dev, handle);
+	if (rc)
+		GOTO(out, rc);
+
+	rc = fld_index_create(env, fld, range, handle);
+	if (rc == -EEXIST)
+		rc = 0;
+out:
+	dt_trans_stop(env, dev, handle);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(fld_insert_entry);
+
+/*
+ * Make sure the IGIF, .lustre and ROOT ranges exist in the FLD index.
+ * Stops at, and returns, the first failure.
+ */
+static int fld_insert_special_entries(const struct lu_env *env,
+				      struct lu_server_fld *fld)
+{
+	int rc;
+
+	rc = fld_insert_entry(env, fld, &IGIF_FLD_RANGE);
+	if (rc == 0)
+		rc = fld_insert_entry(env, fld, &DOT_LUSTRE_FLD_RANGE);
+	if (rc == 0)
+		rc = fld_insert_entry(env, fld, &ROOT_FLD_RANGE);
+
+	RETURN(rc);
+}
+
+/* Open (or create) the FLD index on \a dt and load it into the cache. */
+int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld,
+		   struct dt_device *dt)
+{
+	struct dt_object	*dt_obj = NULL;
+	struct lu_fid		fid;
+	struct lu_attr		*attr = NULL;
+	struct lu_seq_range	*range = NULL;
+	struct fld_thread_info	*info;
+	struct dt_object_format	dof;
+	struct dt_it		*it;
+	const struct dt_it_ops	*iops;
+	int			rc;
+	ENTRY;
+
+	info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+	LASSERT(info != NULL);
+
+	lu_local_obj_fid(&fid, FLD_INDEX_OID);
+	OBD_ALLOC_PTR(attr);
+	if (attr == NULL)
+		RETURN(-ENOMEM);
+
+	memset(attr, 0, sizeof(*attr));
+	attr->la_valid = LA_MODE;
+	attr->la_mode = S_IFREG | 0666;
+	dof.dof_type = DFT_INDEX;
+	dof.u.dof_idx.di_feat = &fld_index_features;
+
+	dt_obj = dt_find_or_create(env, dt, &fid, &dof, attr);
+	if (IS_ERR(dt_obj)) {
+		rc = PTR_ERR(dt_obj);
+		CERROR("%s: Can't find \"%s\" obj %d\n", fld->lsf_name,
+			fld_index_name, rc);
+		dt_obj = NULL;
+		GOTO(out, rc);
+	}
+
+	fld->lsf_obj = dt_obj;
+	rc = dt_obj->do_ops->do_index_try(env, dt_obj, &fld_index_features);
+	if (rc != 0) {
+		CERROR("%s: File \"%s\" is not an index: rc = %d!\n",
+		       fld->lsf_name, fld_index_name, rc);
+		GOTO(out, rc);
+	}
+
+	range = &info->fti_rec;
+	/* Load fld entry to cache */
+	iops = &dt_obj->do_index_ops->dio_it;
+	it = iops->init(env, dt_obj, 0, NULL);
+	if (IS_ERR(it))
+		GOTO(out, rc = PTR_ERR(it));
+
+	rc = iops->load(env, it, 0);
+	if (rc < 0)
+		GOTO(out_it_fini, rc);
+	if (rc > 0) {
+		do {
+			rc = iops->rec(env, it, (struct dt_rec *)range, 0);
+			if (rc != 0)
+				GOTO(out_it_put, rc);
+			range_be_to_cpu(range, range);
+			rc = fld_cache_insert(fld->lsf_cache, range);
+			if (rc != 0)
+				GOTO(out_it_put, rc);
+			rc = iops->next(env, it);
+		} while (rc == 0);
+		/* next() < 0 is an iteration error, not the end of the index */
+		if (rc < 0)
+			GOTO(out_it_put, rc);
+	}
+
+	/* Note: fld_insert_entry will detect whether these
+	 * special entries already exist inside FLDB */
+	mutex_lock(&fld->lsf_lock);
+	rc = fld_insert_special_entries(env, fld);
+	mutex_unlock(&fld->lsf_lock);
+	if (rc != 0) {
+		CERROR("%s: insert special entries failed!: rc = %d\n",
+		       fld->lsf_name, rc);
+		GOTO(out_it_put, rc);
+	}
+
+out_it_put:
+	iops->put(env, it);
+out_it_fini:
+	iops->fini(env, it);
+out:
+	if (attr != NULL)
+		OBD_FREE_PTR(attr);
+	if (rc != 0) {
+		if (dt_obj != NULL)
+			lu_object_put(env, &dt_obj->do_lu);
+		fld->lsf_obj = NULL;
+	}
+	RETURN(rc);
+}
+
+/* Drop the reference on the index object, if any, and forget it. */
+void fld_index_fini(const struct lu_env *env, struct lu_server_fld *fld)
+{
+	ENTRY;
+
+	if (fld->lsf_obj != NULL && !IS_ERR(fld->lsf_obj))
+		lu_object_put(env, &fld->lsf_obj->do_lu);
+	fld->lsf_obj = NULL;
+	EXIT;
+}
diff --git a/drivers/staging/lustre/lustre/fld/fld_internal.h b/drivers/staging/lustre/lustre/fld/fld_internal.h
new file mode 100644
index 000000000000..9fa9e01cdb67
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fld/fld_internal.h
@@ -0,0 +1,223 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/fld_internal.h
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ * Author: Tom WangDi <wangdi@clusterfs.com>
+ */
+#ifndef __FLD_INTERNAL_H
+#define __FLD_INTERNAL_H
+
+#include <lustre/lustre_idl.h>
+#include <dt_object.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <lustre_req_layout.h>
+#include <lustre_fld.h>
+
+enum {
+	LUSTRE_FLD_INIT = 1 << 0,
+	LUSTRE_FLD_RUN  = 1 << 1
+};
+
+struct fld_stats {
+	__u64   fst_count;
+	__u64   fst_cache;
+	__u64   fst_inflight;
+};
+
+typedef int (*fld_hash_func_t) (struct lu_client_fld *, __u64);
+
+typedef struct lu_fld_target *
+(*fld_scan_func_t) (struct lu_client_fld *, __u64);
+
+struct lu_fld_hash {
+	const char	      *fh_name;
+	fld_hash_func_t	  fh_hash_func;
+	fld_scan_func_t	  fh_scan_func;
+};
+
+struct fld_cache_entry {
+	struct list_head	       fce_lru;
+	struct list_head	       fce_list;
+	/**
+	 * fld cache entries are sorted on range->lsr_start field. */
+	struct lu_seq_range      fce_range;
+};
+
+struct fld_cache {
+	/**
+	 * Cache guard: held while the entry lists below are walked (read
+	 * side, lookups) or modified (write side, insert/delete).
+	 */
+	rwlock_t		 fci_lock;
+
+	/**
+	 * Cache shrink threshold */
+	int		      fci_threshold;
+
+	/**
+	 * Preferred number of cached entries */
+	int		      fci_cache_size;
+
+	/**
+	 * Current number of cached entries. Protected by \a fci_lock */
+	int		      fci_cache_count;
+
+	/**
+	 * LRU list of fld entries. */
+	struct list_head	       fci_lru;
+
+	/**
+	 * fld entries sorted by range start (fce_range.lsr_start). */
+	struct list_head	       fci_entries_head;
+
+	/**
+	 * Cache statistics. */
+	struct fld_stats	 fci_stat;
+
+	/**
+	 * Cache name used for debug and messages. */
+	char		     fci_name[80];
+	unsigned int		 fci_no_shrink:1;
+};
+
+enum fld_op {
+	FLD_CREATE = 0,
+	FLD_DELETE = 1,
+	FLD_LOOKUP = 2
+};
+
+enum {
+	/* 4M of FLD cache will not hurt server a lot. */
+	FLD_SERVER_CACHE_SIZE      = (4 * 0x100000),
+
+	/* 1M of FLD cache will not hurt client a lot. */
+	FLD_CLIENT_CACHE_SIZE      = (1 * 0x100000)
+};
+
+enum {
+	/* Cache threshold is 10 percent of size. */
+	FLD_SERVER_CACHE_THRESHOLD = 10,
+
+	/* Cache threshold is 10 percent of size. */
+	FLD_CLIENT_CACHE_THRESHOLD = 10
+};
+
+extern struct lu_fld_hash fld_hash[];
+
+
+struct fld_thread_info {
+	struct req_capsule *fti_pill;
+	__u64	       fti_key;
+	struct lu_seq_range fti_rec;
+	struct lu_seq_range fti_lrange;
+	struct lu_seq_range fti_irange;
+};
+
+extern struct lu_context_key fld_thread_key;
+
+int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld,
+		   struct dt_device *dt);
+
+void fld_index_fini(const struct lu_env *env, struct lu_server_fld *fld);
+
+int fld_declare_index_create(const struct lu_env *env,
+			     struct lu_server_fld *fld,
+			     const struct lu_seq_range *new,
+			     struct thandle *th);
+
+int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld,
+		     const struct lu_seq_range *new, struct thandle *th);
+
+int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld,
+		     seqno_t seq, struct lu_seq_range *range);
+
+int fld_client_rpc(struct obd_export *exp,
+		   struct lu_seq_range *range, __u32 fld_op);
+
+#ifdef LPROCFS
+extern struct lprocfs_vars fld_server_proc_list[];
+extern struct lprocfs_vars fld_client_proc_list[];
+#endif
+
+
+struct fld_cache *fld_cache_init(const char *name,
+				 int cache_size, int cache_threshold);
+
+void fld_cache_fini(struct fld_cache *cache);
+
+void fld_cache_flush(struct fld_cache *cache);
+
+int fld_cache_insert(struct fld_cache *cache,
+		     const struct lu_seq_range *range);
+
+struct fld_cache_entry
+*fld_cache_entry_create(const struct lu_seq_range *range);
+
+int fld_cache_insert_nolock(struct fld_cache *cache,
+			    struct fld_cache_entry *f_new);
+void fld_cache_delete(struct fld_cache *cache,
+		      const struct lu_seq_range *range);
+void fld_cache_delete_nolock(struct fld_cache *cache,
+			     const struct lu_seq_range *range);
+int fld_cache_lookup(struct fld_cache *cache,
+		     const seqno_t seq, struct lu_seq_range *range);
+
+struct fld_cache_entry*
+fld_cache_entry_lookup(struct fld_cache *cache, struct lu_seq_range *range);
+void fld_cache_entry_delete(struct fld_cache *cache,
+			    struct fld_cache_entry *node);
+void fld_dump_cache_entries(struct fld_cache *cache);
+
+struct fld_cache_entry
+*fld_cache_entry_lookup_nolock(struct fld_cache *cache,
+			      struct lu_seq_range *range);
+int fld_write_range(const struct lu_env *env, struct dt_object *dt,
+		    const struct lu_seq_range *range, struct thandle *th);
+
+/* Name of a target: its server FLD's name when ft_srv is set, otherwise
+ * the name of the obd device behind its export. */
+static inline const char *
+fld_target_name(struct lu_fld_target *tar)
+{
+	return tar->ft_srv != NULL ? tar->ft_srv->lsf_name :
+		(const char *)tar->ft_exp->exp_obd->obd_name;
+}
+
+extern proc_dir_entry_t *fld_type_proc_dir;
+extern struct file_operations fld_proc_seq_fops;
+#endif /* __FLD_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/fld/fld_request.c b/drivers/staging/lustre/lustre/fld/fld_request.c
new file mode 100644
index 000000000000..e9f07398b68a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fld/fld_request.c
@@ -0,0 +1,519 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/fld_request.c
+ *
+ * FLD (Fids Location Database)
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FLD
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+# include <linux/jbd.h>
+# include <asm/div64.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_ver.h>
+#include <obd_support.h>
+#include <lprocfs_status.h>
+
+#include <dt_object.h>
+#include <md_object.h>
+#include <lustre_req_layout.h>
+#include <lustre_fld.h>
+#include <lustre_mdc.h>
+#include "fld_internal.h"
+
+/* TODO: share these 3 flow-control helpers (and the RPC lock) with mdc_lib.c.
+ * Wait condition: true once fld_exit_request() has unlinked @mcw. */
+static int fld_req_avail(struct client_obd *cli, struct mdc_cache_waiter *mcw)
+{
+	int rc;
+	ENTRY;
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	rc = list_empty(&mcw->mcw_entry);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+	RETURN(rc);
+}
+
+static void fld_enter_request(struct client_obd *cli)
+{
+	struct l_wait_info lwi = { 0 };
+	struct mdc_cache_waiter mcw;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	if (cli->cl_r_in_flight < cli->cl_max_rpcs_in_flight) {
+		cli->cl_r_in_flight++;		/* free slot: take it */
+		client_obd_list_unlock(&cli->cl_loi_list_lock);
+	} else {	/* full: queue up; the waker takes the slot for us */
+		init_waitqueue_head(&mcw.mcw_waitq);
+		list_add_tail(&mcw.mcw_entry, &cli->cl_cache_waiters);
+		client_obd_list_unlock(&cli->cl_loi_list_lock);
+		l_wait_event(mcw.mcw_waitq, fld_req_avail(cli, &mcw), &lwi);
+	}
+}
+
+/* Release an RPC slot and hand free slots to queued waiters, oldest first. */
+static void fld_exit_request(struct client_obd *cli)
+{
+	struct mdc_cache_waiter *waiter;
+	struct list_head *pos, *next;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	cli->cl_r_in_flight--;
+	list_for_each_safe(pos, next, &cli->cl_cache_waiters) {
+		/* No free request slots anymore */
+		if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight)
+			break;
+		/* Unlinking makes fld_req_avail() true for this waiter; its
+		 * slot is accounted here, before the waiter itself runs. */
+		waiter = list_entry(pos, struct mdc_cache_waiter, mcw_entry);
+		list_del_init(&waiter->mcw_entry);
+		cli->cl_r_in_flight++;
+		wake_up(&waiter->mcw_waitq);
+	}
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+}
+
+/* Round-robin hash: remainder of @seq divided by the number of targets. */
+static int fld_rrb_hash(struct lu_client_fld *fld, seqno_t seq)
+{
+	LASSERT(fld->lcf_count > 0);
+	return do_div(seq, fld->lcf_count);
+}
+
+/* Return the target whose ft_idx equals the round-robin hash of @seq.
+ * fld_client_get_target() invokes this through fh_scan_func with lcf_lock
+ * held, so the target list is walked here without further locking. */
+static struct lu_fld_target *
+fld_rrb_scan(struct lu_client_fld *fld, seqno_t seq)
+{
+	struct lu_fld_target *tgt;
+	int idx = 0;
+	ENTRY;
+
+	/* Only normal sequences are hashed.  Almost all special sequences
+	 * are located in MDT0, so they go to index 0 directly; this also
+	 * keeps fld lookups for seqs on MDT0 from being blocked because
+	 * other MDTs are not connected. */
+	if (fid_seq_is_norm(seq))
+		idx = fld_rrb_hash(fld, seq);
+
+	list_for_each_entry(tgt, &fld->lcf_targets, ft_chain) {
+		if (tgt->ft_idx == idx)
+			RETURN(tgt);
+	}
+
+	CERROR("%s: Can't find target by hash %d (seq "LPX64"). "
+	       "Targets (%d):\n", fld->lcf_name, idx, seq,
+	       fld->lcf_count);
+
+	/* Dump every known target to help diagnose the miss. */
+	list_for_each_entry(tgt, &fld->lcf_targets, ft_chain) {
+		const char *exp_name = "<null>";
+		const char *srv_name = "<null>";
+
+		if (tgt->ft_exp != NULL)
+			exp_name = (char *)tgt->ft_exp->exp_obd->obd_uuid.uuid;
+		if (tgt->ft_srv != NULL)
+			srv_name = tgt->ft_srv->lsf_name;
+
+		CERROR("  exp: 0x%p (%s), srv: 0x%p (%s), idx: "LPU64"\n",
+		       tgt->ft_exp, exp_name, tgt->ft_srv,
+		       srv_name, tgt->ft_idx);
+	}
+
+	/* A missing target is a logic error, so LBUG() to catch it. */
+	LBUG();
+	RETURN(NULL);
+}
+
+/* Hash implementations selectable by index in fld_client_init(); the final
+ * zeroed entry ends the table (see the scan loop in fld_proc_write_hash()). */
+struct lu_fld_hash fld_hash[] = {
+	{
+		.fh_name = "RRB",
+		.fh_hash_func = fld_rrb_hash,
+		.fh_scan_func = fld_rrb_scan
+	},
+	{ 0, }
+};
+
+/* Find the target serving @seq through the hash's scan hook, under lcf_lock. */
+static struct lu_fld_target *
+fld_client_get_target(struct lu_client_fld *fld, seqno_t seq)
+{
+	struct lu_fld_target *found;
+	ENTRY;
+
+	LASSERT(fld->lcf_hash != NULL);
+
+	spin_lock(&fld->lcf_lock);
+	found = fld->lcf_hash->fh_scan_func(fld, seq);
+	spin_unlock(&fld->lcf_lock);
+
+	if (found == NULL)
+		RETURN(NULL);
+
+	CDEBUG(D_INFO, "%s: Found target (idx "LPU64") by seq "LPX64"\n",
+	       fld->lcf_name, found->ft_idx, seq);
+	RETURN(found);
+}
+
+/*
+ * Add export to FLD. This is usually done by CMM and LMV as they are main users
+ * of FLD module.  Targets can only be added while the client is still in
+ * LUSTRE_FLD_INIT state; later attempts are logged and skipped (returning 0).
+ * Returns -ENOMEM on allocation failure, -EEXIST if ft_idx is already present.
+ */
+int fld_client_add_target(struct lu_client_fld *fld, struct lu_fld_target *tar)
+{
+	const char *name;
+	struct lu_fld_target *target, *tmp;
+	ENTRY;
+
+	LASSERT(tar != NULL);
+	/* Must precede fld_target_name(), which dereferences ft_srv/ft_exp. */
+	LASSERT(tar->ft_srv != NULL || tar->ft_exp != NULL);
+	name = fld_target_name(tar);
+	LASSERT(name != NULL);
+
+	if (fld->lcf_flags != LUSTRE_FLD_INIT) {
+		CERROR("%s: Attempt to add target %s (idx "LPU64") "
+		       "on fly - skip it\n", fld->lcf_name, name,
+		       tar->ft_idx);
+		RETURN(0);
+	}
+	CDEBUG(D_INFO, "%s: Adding target %s (idx "
+	       LPU64")\n", fld->lcf_name, name, tar->ft_idx);
+
+	OBD_ALLOC_PTR(target);
+	if (target == NULL)
+		RETURN(-ENOMEM);
+
+	spin_lock(&fld->lcf_lock);
+	list_for_each_entry(tmp, &fld->lcf_targets, ft_chain) {
+		if (tmp->ft_idx == tar->ft_idx) {
+			spin_unlock(&fld->lcf_lock);
+			OBD_FREE_PTR(target);
+			CERROR("Target %s exists in FLD and known as %s:#"LPU64"\n",
+			       name, fld_target_name(tmp), tmp->ft_idx);
+			RETURN(-EEXIST);
+		}
+	}
+
+	target->ft_exp = tar->ft_exp;
+	if (target->ft_exp != NULL)
+		class_export_get(target->ft_exp);
+	target->ft_srv = tar->ft_srv;
+	target->ft_idx = tar->ft_idx;
+
+	list_add_tail(&target->ft_chain, &fld->lcf_targets);
+
+	fld->lcf_count++;
+	spin_unlock(&fld->lcf_lock);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(fld_client_add_target);
+
+/* Remove the target with index @idx from FLD; -ENOENT if there is none. */
+int fld_client_del_target(struct lu_client_fld *fld, __u64 idx)
+{
+	struct lu_fld_target *cur, *victim = NULL;
+	ENTRY;
+
+	spin_lock(&fld->lcf_lock);
+	list_for_each_entry(cur, &fld->lcf_targets, ft_chain) {
+		if (cur->ft_idx != idx)
+			continue;
+		victim = cur;
+		fld->lcf_count--;
+		list_del(&victim->ft_chain);
+		break;
+	}
+	spin_unlock(&fld->lcf_lock);
+
+	if (victim == NULL)
+		RETURN(-ENOENT);
+	if (victim->ft_exp != NULL)
+		class_export_put(victim->ft_exp);
+	OBD_FREE_PTR(victim);
+	RETURN(0);
+}
+EXPORT_SYMBOL(fld_client_del_target);
+
+#ifdef LPROCFS
+/*
+ * Register the client's proc entry under fld_type_proc_dir and populate it from
+ * fld_client_proc_list.  Returns 0 or a negative errno; when adding the
+ * variables fails, the entry is removed again via fld_client_proc_fini().
+ */
+static int fld_client_proc_init(struct lu_client_fld *fld)
+{
+	int rc;
+	ENTRY;
+
+	fld->lcf_proc_dir = lprocfs_register(fld->lcf_name, fld_type_proc_dir,
+					     NULL, NULL);
+	if (IS_ERR(fld->lcf_proc_dir)) {
+		CERROR("%s: LProcFS failed in fld-init\n", fld->lcf_name);
+		rc = PTR_ERR(fld->lcf_proc_dir);
+		RETURN(rc);
+	}
+
+	rc = lprocfs_add_vars(fld->lcf_proc_dir, fld_client_proc_list, fld);
+	if (rc != 0) {
+		CERROR("%s: Can't init FLD proc, rc %d\n", fld->lcf_name, rc);
+		GOTO(out_cleanup, rc);
+	}
+
+	RETURN(0);
+
+out_cleanup:
+	fld_client_proc_fini(fld);
+	return rc;
+}
+
+/* Remove the client's proc entry if one was registered; safe to call twice. */
+void fld_client_proc_fini(struct lu_client_fld *fld)
+{
+	ENTRY;
+	if (fld->lcf_proc_dir != NULL && !IS_ERR(fld->lcf_proc_dir))
+		lprocfs_remove(&fld->lcf_proc_dir);
+	/* Also clears an ERR_PTR left behind by a failed registration. */
+	fld->lcf_proc_dir = NULL;
+	EXIT;
+}
+#else
+/* Built without LPROCFS: there is no proc support, both hooks are no-ops. */
+static int fld_client_proc_init(struct lu_client_fld *fld)
+{
+	return 0;
+}
+
+void fld_client_proc_fini(struct lu_client_fld *fld)
+{
+}
+#endif
+
+EXPORT_SYMBOL(fld_client_proc_fini);
+
+/* True if @hash indexes a real fld_hash[] entry, i.e. not the zeroed terminator
+ * whose fh_scan_func is NULL and must never be installed as lcf_hash. */
+static inline int hash_is_sane(int hash)
+{
+	return hash >= 0 && hash < ARRAY_SIZE(fld_hash) - 1;
+}
+
+/* Initialize client FLD state: name it "cli-<prefix>", select the hash by index
+ * and set up the lookup cache and proc entries.  Returns 0, -EINVAL for a bad
+ * hash index, or the cache/proc setup error (after calling fld_client_fini()). */
+int fld_client_init(struct lu_client_fld *fld,
+		    const char *prefix, int hash)
+{
+	int cache_size, cache_threshold;
+	int rc;
+	ENTRY;
+
+	LASSERT(fld != NULL);
+
+	snprintf(fld->lcf_name, sizeof(fld->lcf_name), "cli-%s", prefix);
+
+	if (!hash_is_sane(hash)) {
+		CERROR("%s: Wrong hash function %#x\n", fld->lcf_name, hash);
+		RETURN(-EINVAL);
+	}
+
+	fld->lcf_count = 0;
+	spin_lock_init(&fld->lcf_lock);
+	fld->lcf_hash = &fld_hash[hash];
+	fld->lcf_flags = LUSTRE_FLD_INIT;
+	INIT_LIST_HEAD(&fld->lcf_targets);
+
+	cache_size = FLD_CLIENT_CACHE_SIZE / sizeof(struct fld_cache_entry);
+	cache_threshold = cache_size * FLD_CLIENT_CACHE_THRESHOLD / 100;
+
+	fld->lcf_cache = fld_cache_init(fld->lcf_name,
+					cache_size, cache_threshold);
+	if (IS_ERR(fld->lcf_cache)) {
+		rc = PTR_ERR(fld->lcf_cache);
+		fld->lcf_cache = NULL;
+		GOTO(out, rc);
+	}
+
+	rc = fld_client_proc_init(fld);
+	if (rc)
+		GOTO(out, rc);
+	EXIT;
+out:
+	if (rc)
+		fld_client_fini(fld);
+	else
+		CDEBUG(D_INFO, "%s: Using \"%s\" hash\n",
+		       fld->lcf_name, fld->lcf_hash->fh_name);
+	return rc;
+}
+EXPORT_SYMBOL(fld_client_init);
+
+/* Drop every target together with its export reference, then release the
+ * lookup cache.  Also used by fld_client_init() to undo a partial init. */
+void fld_client_fini(struct lu_client_fld *fld)
+{
+	struct lu_fld_target *cur, *next;
+	ENTRY;
+
+	spin_lock(&fld->lcf_lock);
+	list_for_each_entry_safe(cur, next, &fld->lcf_targets, ft_chain) {
+		fld->lcf_count--;
+		list_del(&cur->ft_chain);
+		if (cur->ft_exp != NULL)
+			class_export_put(cur->ft_exp);
+		OBD_FREE_PTR(cur);
+	}
+	spin_unlock(&fld->lcf_lock);
+
+	/* lcf_cache is NULL when fld_cache_init() failed in fld_client_init(). */
+	if (fld->lcf_cache != NULL && !IS_ERR(fld->lcf_cache))
+		fld_cache_fini(fld->lcf_cache);
+	fld->lcf_cache = NULL;
+
+	EXIT;
+}
+EXPORT_SYMBOL(fld_client_fini);
+
+int fld_client_rpc(struct obd_export *exp,
+		   struct lu_seq_range *range, __u32 fld_op)
+{
+	struct ptlrpc_request *req;
+	struct lu_seq_range   *prange;
+	__u32		 *op;
+	int		    rc;
+	struct obd_import     *imp;
+	ENTRY;
+	/* Sends @fld_op with *@range over @exp; the reply overwrites *@range. */
+	LASSERT(exp != NULL);
+
+	imp = class_exp2cliimp(exp);
+	req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_QUERY, LUSTRE_MDS_VERSION,
+					FLD_QUERY);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	op = req_capsule_client_get(&req->rq_pill, &RMF_FLD_OPC);
+	*op = fld_op;
+
+	prange = req_capsule_client_get(&req->rq_pill, &RMF_FLD_MDFLD);
+	*prange = *range;
+
+	ptlrpc_request_set_replen(req);
+	req->rq_request_portal = FLD_REQUEST_PORTAL;
+	ptlrpc_at_set_req_timeout(req);
+
+	if (fld_op == FLD_LOOKUP &&
+	    imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS)
+		req->rq_allow_replay = 1;
+	/* Only non-lookup operations take the rpc lock; all are flow-controlled. */
+	if (fld_op != FLD_LOOKUP)
+		mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+	fld_enter_request(&exp->exp_obd->u.cli);
+	rc = ptlrpc_queue_wait(req);
+	fld_exit_request(&exp->exp_obd->u.cli);
+	if (fld_op != FLD_LOOKUP)
+		mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+	if (rc)
+		GOTO(out_req, rc);
+	/* Success: copy the server's answer back into the caller's range. */
+	prange = req_capsule_server_get(&req->rq_pill, &RMF_FLD_MDFLD);
+	if (prange == NULL)
+		GOTO(out_req, rc = -EFAULT);
+	*range = *prange;
+	EXIT;
+out_req:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/* Resolve @seq to its MDS index (*@mds): try the client cache, else ask the
+ * owning target (directly if it has a local server, otherwise by RPC) and cache
+ * the answer.  Marks the client LUSTRE_FLD_RUN.  Returns 0 or negative errno. */
+int fld_client_lookup(struct lu_client_fld *fld, seqno_t seq, mdsno_t *mds,
+		      __u32 flags, const struct lu_env *env)
+{
+	struct lu_seq_range res = { 0 };
+	struct lu_fld_target *target;
+	int rc;
+	ENTRY;
+
+	fld->lcf_flags |= LUSTRE_FLD_RUN;
+	if (fld_cache_lookup(fld->lcf_cache, seq, &res) == 0) {
+		*mds = res.lsr_index;
+		RETURN(0);
+	}
+
+	/* Can not find it in the cache */
+	target = fld_client_get_target(fld, seq);
+	LASSERT(target != NULL);
+
+	CDEBUG(D_INFO, "%s: Lookup fld entry (seq: "LPX64") on "
+	       "target %s (idx "LPU64")\n", fld->lcf_name, seq,
+	       fld_target_name(target), target->ft_idx);
+
+	res.lsr_start = seq;
+	fld_range_set_type(&res, flags);
+	if (target->ft_srv == NULL) {
+		rc = fld_client_rpc(target->ft_exp, &res, FLD_LOOKUP);
+	} else {
+		LASSERT(env != NULL);
+		rc = fld_server_lookup(env, target->ft_srv, seq, &res);
+	}
+
+	if (rc != 0)
+		RETURN(rc);
+	*mds = res.lsr_index;
+	fld_cache_insert(fld->lcf_cache, &res);
+	RETURN(0);
+}
+EXPORT_SYMBOL(fld_client_lookup);
+
+void fld_client_flush(struct lu_client_fld *fld)
+{
+	fld_cache_flush(fld->lcf_cache); /* exported wrapper for the cache flush */
+}
+EXPORT_SYMBOL(fld_client_flush);
diff --git a/drivers/staging/lustre/lustre/fld/lproc_fld.c b/drivers/staging/lustre/lustre/fld/lproc_fld.c
new file mode 100644
index 000000000000..00fe31e3861c
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fld/lproc_fld.c
@@ -0,0 +1,365 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/lproc_fld.c
+ *
+ * FLD (FIDs Location Database)
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ *	Di Wang <di.wang@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FLD
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+# include <linux/uaccess.h>
+#include <obd.h>
+#include <obd_class.h>
+#include <dt_object.h>
+#include <md_object.h>
+#include <obd_support.h>
+#include <lustre_req_layout.h>
+#include <lustre_fld.h>
+#include <lustre_fid.h>
+#include "fld_internal.h"
+
+#ifdef LPROCFS
+/* proc read handler: write each FLD target name, one per line, into @page
+ * (@count bytes available).  Returns the number of bytes stored. */
+static int
+fld_proc_read_targets(char *page, char **start, off_t off,
+		      int count, int *eof, void *data)
+{
+	struct lu_client_fld *fld = (struct lu_client_fld *)data;
+	struct lu_fld_target *target;
+	int total = 0, rc;
+	ENTRY;
+
+	LASSERT(fld != NULL);
+
+	spin_lock(&fld->lcf_lock);
+	list_for_each_entry(target, &fld->lcf_targets, ft_chain) {
+		/* Unlike snprintf(), scnprintf() reports only what it stored. */
+		rc = scnprintf(page, count, "%s\n", fld_target_name(target));
+		page += rc;
+		count -= rc;
+		total += rc;
+		if (count <= 1)	/* room for the NUL only: buffer is full */
+			break;
+	}
+	spin_unlock(&fld->lcf_lock);
+	RETURN(total);
+}
+
+/* proc read handler: print the name of the hash currently in use. */
+static int
+fld_proc_read_hash(char *page, char **start, off_t off,
+		   int count, int *eof, void *data)
+{
+	struct lu_client_fld *fld = (struct lu_client_fld *)data;
+	int len;
+	ENTRY;
+
+	LASSERT(fld != NULL);
+
+	spin_lock(&fld->lcf_lock);
+	len = snprintf(page, count, "%s\n", fld->lcf_hash->fh_name);
+	spin_unlock(&fld->lcf_lock);
+	RETURN(len);
+}
+
+/* proc write handler: switch to the hash named in @buffer, if it is known. */
+static int fld_proc_write_hash(struct file *file, const char *buffer,
+			       unsigned long count, void *data)
+{
+	struct lu_client_fld *fld = (struct lu_client_fld *)data;
+	struct lu_fld_hash *hash = NULL;
+	char name[16];	/* longer input cannot match the names used today */
+	int i;
+	ENTRY;
+
+	LASSERT(fld != NULL);
+	if (count > sizeof(name))
+		RETURN(count);
+	/* @buffer is user memory and must not be dereferenced directly. */
+	if (copy_from_user(name, buffer, count) != 0)
+		RETURN(-EFAULT);
+
+	for (i = 0; hash == NULL && fld_hash[i].fh_name != NULL; i++) {
+		if (count == strlen(fld_hash[i].fh_name) &&
+		    strncmp(fld_hash[i].fh_name, name, count) == 0)
+			hash = &fld_hash[i];
+	}
+
+	if (hash != NULL) {
+		spin_lock(&fld->lcf_lock);
+		fld->lcf_hash = hash;
+		spin_unlock(&fld->lcf_lock);
+		CDEBUG(D_INFO, "%s: Changed hash to \"%s\"\n",
+		       fld->lcf_name, hash->fh_name);
+	}
+	RETURN(count);
+}
+
+/* proc write handler: any write flushes the lookup cache; @buffer is ignored. */
+static int fld_proc_write_cache_flush(struct file *file, const char *buffer,
+				      unsigned long count, void *data)
+{
+	struct lu_client_fld *fld = (struct lu_client_fld *)data;
+	ENTRY;
+
+	LASSERT(fld != NULL);
+
+	fld_cache_flush(fld->lcf_cache);
+
+	CDEBUG(D_INFO, "%s: Lookup cache is flushed\n", fld->lcf_name);
+
+	RETURN(count);
+}
+
+struct fld_seq_param {
+	struct lu_env		fsp_env;	/* env from fldb_seq_open() */
+	struct dt_it		*fsp_it;	/* index iterator */
+	struct lu_server_fld	*fsp_fld;	/* FLD being dumped */
+	unsigned int		fsp_stop:1;	/* iterator hit the end */
+};
+
+/* seq_file start hook: load the iterator at *@pos and report its key. */
+static void *fldb_seq_start(struct seq_file *p, loff_t *pos)
+{
+	struct fld_seq_param *param = p->private;
+	const struct dt_it_ops *iops;
+	struct dt_object *obj;
+
+	/* No parameter block was attached at open, or the end was reached. */
+	if (param == NULL || param->fsp_stop)
+		return NULL;
+
+	obj = param->fsp_fld->lsf_obj;
+	LASSERT(obj != NULL);
+	iops = &obj->do_index_ops->dio_it;
+
+	iops->load(&param->fsp_env, param->fsp_it, *pos);
+	/* The key is read as a big-endian 64-bit value and becomes *pos. */
+	*pos = be64_to_cpu(*(__u64 *)iops->key(&param->fsp_env, param->fsp_it));
+	return param;
+}
+
+/* seq_file stop hook: call the iterator's ->put().  Does nothing when open
+ * attached no parameter block (param == NULL). */
+static void fldb_seq_stop(struct seq_file *p, void *v)
+{
+	struct fld_seq_param *param = p->private;
+	const struct dt_it_ops *iops;
+	struct dt_object *obj;
+
+	if (param == NULL)
+		return;
+
+	obj = param->fsp_fld->lsf_obj;
+	LASSERT(obj != NULL);
+	iops = &obj->do_index_ops->dio_it;
+
+	iops->put(&param->fsp_env, param->fsp_it);
+}
+
+/* seq_file next hook: advance the iterator; a positive return from ->next()
+ * marks the end and latches fsp_stop so later calls stop immediately. */
+static void *fldb_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+	struct fld_seq_param *param = p->private;
+	const struct dt_it_ops *iops;
+	struct dt_object *obj;
+
+	if (param == NULL || param->fsp_stop)
+		return NULL;
+
+	obj = param->fsp_fld->lsf_obj;
+	LASSERT(obj != NULL);
+	iops = &obj->do_index_ops->dio_it;
+
+	if (iops->next(&param->fsp_env, param->fsp_it) > 0) {
+		param->fsp_stop = 1;
+		return NULL;
+	}
+
+	/* NOTE(review): a negative ->next() result is not treated as an error
+	 * here and falls through to the key read - confirm this is intended. */
+	*pos = be64_to_cpu(*(__u64 *)iops->key(&param->fsp_env, param->fsp_it));
+	return param;
+}
+
+/* seq_file show hook: read the record under the iterator and print its range.
+ * Records with lsr_start == 0 are not printed. */
+static int fldb_seq_show(struct seq_file *p, void *v)
+{
+	struct fld_seq_param *param = p->private;
+	const struct dt_it_ops *iops;
+	struct fld_thread_info *info;
+	struct lu_seq_range *rec;
+	struct lu_server_fld *fld;
+	struct dt_object *obj;
+	int rc;
+
+	if (param == NULL || param->fsp_stop)
+		return 0;
+
+	fld = param->fsp_fld;
+	obj = fld->lsf_obj;
+	LASSERT(obj != NULL);
+	iops = &obj->do_index_ops->dio_it;
+
+	/* The record buffer comes from the env's context (fld_thread_key). */
+	info = lu_context_key_get(&param->fsp_env.le_ctx, &fld_thread_key);
+	rec = &info->fti_rec;
+	rc = iops->rec(&param->fsp_env, param->fsp_it, (struct dt_rec *)rec, 0);
+	if (rc != 0) {
+		CERROR("%s:read record error: rc %d\n", fld->lsf_name, rc);
+	} else if (rec->lsr_start != 0) {
+		range_be_to_cpu(rec, rec);
+		rc = seq_printf(p, DRANGE"\n", PRANGE(rec));
+	}
+
+	return rc;
+}
+
+struct seq_operations fldb_sops = {	/* installed by fldb_seq_open() */
+	.start = fldb_seq_start,
+	.stop = fldb_seq_stop,
+	.next = fldb_seq_next,
+	.show = fldb_seq_show,
+};
+
+/* Open hook for the FLD dump file: set up the seq_file and, if the server FLD
+ * has an index object, an env + iterator in seq->private (NULL otherwise). */
+static int fldb_seq_open(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry	*dp = PDE(inode);
+	struct seq_file		*seq = NULL;
+	struct lu_server_fld    *fld = (struct lu_server_fld *)dp->data;
+	struct dt_object	*obj;
+	const struct dt_it_ops  *iops;
+	struct fld_seq_param    *param = NULL;
+	int			env_init = 0, rc;
+
+	LPROCFS_ENTRY_AND_CHECK(dp);
+	rc = seq_open(file, &fldb_sops);
+	if (rc)
+		GOTO(out, rc);
+	seq = file->private_data;
+
+	obj = fld->lsf_obj;
+	if (obj == NULL) {
+		seq->private = NULL;
+		return 0;
+	}
+
+	OBD_ALLOC_PTR(param);
+	if (param == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	rc = lu_env_init(&param->fsp_env, LCT_MD_THREAD);
+	if (rc != 0)
+		GOTO(out, rc);
+	env_init = 1;
+
+	iops = &obj->do_index_ops->dio_it;
+	param->fsp_it = iops->init(&param->fsp_env, obj, 0, NULL);
+	if (IS_ERR(param->fsp_it))
+		GOTO(out, rc = PTR_ERR(param->fsp_it));
+	param->fsp_fld = fld;
+	param->fsp_stop = 0;
+	seq->private = param;
+out:
+	if (rc != 0) {
+		if (env_init == 1)
+			lu_env_fini(&param->fsp_env);
+		if (param != NULL)
+			OBD_FREE_PTR(param);
+		if (seq != NULL)	/* no ->release() after a failed open */
+			seq_release(inode, file);
+		LPROCFS_EXIT();
+	}
+	return rc;
+}
+
+/* Release hook: tear down the iterator, env and parameter block attached by
+ * fldb_seq_open() (none if the FLD had no index object at open time), then
+ * release the seq_file through lprocfs_seq_release(). */
+static int fldb_seq_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+	struct fld_seq_param *param = seq->private;
+	struct lu_server_fld *fld;
+	const struct dt_it_ops *iops;
+	struct dt_object *obj;
+
+	if (param != NULL) {
+		fld = param->fsp_fld;
+		obj = fld->lsf_obj;
+		LASSERT(obj != NULL);
+		iops = &obj->do_index_ops->dio_it;
+		LASSERT(iops != NULL);
+		LASSERT(param->fsp_it != NULL);
+
+		/* Undo fldb_seq_open() in reverse: iterator, env, memory. */
+		iops->fini(&param->fsp_env, param->fsp_it);
+		lu_env_fini(&param->fsp_env);
+		OBD_FREE_PTR(param);
+	}
+	lprocfs_seq_release(inode, file);
+
+	return 0;
+}
+
+struct lprocfs_vars fld_server_proc_list[] = {
+	{ NULL }};
+/* Client entries: name, read handler, write handler (4th field unused here). */
+struct lprocfs_vars fld_client_proc_list[] = {
+	{ "targets",     fld_proc_read_targets, NULL, NULL },
+	{ "hash",	fld_proc_read_hash, fld_proc_write_hash, NULL },
+	{ "cache_flush", NULL, fld_proc_write_cache_flush, NULL },
+	{ NULL }};
+
+struct file_operations fld_proc_seq_fops = {	/* seq_file-backed FLD dump */
+	.owner   = THIS_MODULE,
+	.open    = fldb_seq_open,
+	.read    = seq_read,
+	.release = fldb_seq_release,
+};
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/cl_object.h b/drivers/staging/lustre/lustre/include/cl_object.h
new file mode 100644
index 000000000000..61c463553200
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/cl_object.h
@@ -0,0 +1,3281 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#ifndef _LUSTRE_CL_OBJECT_H
+#define _LUSTRE_CL_OBJECT_H
+
+/** \defgroup clio clio
+ *
+ * Client objects implement io operations and cache pages.
+ *
+ * Examples: lov and osc are implementations of cl interface.
+ *
+ * Big Theory Statement.
+ *
+ * Layered objects.
+ *
+ * Client implementation is based on the following data-types:
+ *
+ *   - cl_object
+ *
+ *   - cl_page
+ *
+ *   - cl_lock     represents an extent lock on an object.
+ *
+ *   - cl_io       represents high-level i/o activity such as whole read/write
+ *		 system call, or write-out of pages from under the lock being
+ *		 canceled. cl_io has sub-ios that can be stopped and resumed
+ *		 independently, thus achieving high degree of transfer
+ *		 parallelism. Single cl_io can be advanced forward by
+ *		 the multiple threads (although in the most usual case of
+ *		 read/write system call it is associated with the single user
+ *		 thread, that issued the system call).
+ *
+ *   - cl_req      represents a collection of pages for a transfer. cl_req is
+ *		 constructed by req-forming engine that tries to saturate
+ *		 transport with large and continuous transfers.
+ *
+ * Terminology
+ *
+ *     - to avoid confusion high-level I/O operation like read or write system
+ *     call is referred to as "an io", whereas low-level I/O operation, like
+ *     RPC, is referred to as "a transfer"
+ *
+ *     - "generic code" means generic (not file system specific) code in the
+ *     hosting environment. "cl-code" means code (mostly in cl_*.c files) that
+ *     is not layer specific.
+ *
+ * Locking.
+ *
+ *  - i_mutex
+ *      - PG_locked
+ *	  - cl_object_header::coh_page_guard
+ *	  - cl_object_header::coh_lock_guard
+ *	  - lu_site::ls_guard
+ *
+ * See the top comment in cl_object.c for the description of overall locking and
+ * reference-counting design.
+ *
+ * See comments below for the description of i/o, page, and dlm-locking
+ * design.
+ *
+ * @{
+ */
+
+/*
+ * super-class definitions.
+ */
+#include <lu_object.h>
+#include <lvfs.h>
+#	include <linux/mutex.h>
+#	include <linux/radix-tree.h>
+
+struct inode;
+
+struct cl_device;
+struct cl_device_operations;
+
+struct cl_object;
+struct cl_object_page_operations;
+struct cl_object_lock_operations;
+
+struct cl_page;
+struct cl_page_slice;
+struct cl_lock;
+struct cl_lock_slice;
+
+struct cl_lock_operations;
+struct cl_page_operations;
+
+struct cl_io;
+struct cl_io_slice;
+
+struct cl_req;
+struct cl_req_slice;
+
+/**
+ * Operations for each data device in the client stack.
+ *
+ * \see vvp_cl_ops, lov_cl_ops, lovsub_cl_ops, osc_cl_ops
+ */
+struct cl_device_operations {
+	/**
+	 * Initialize cl_req. This method is called top-to-bottom on all
+	 * devices in the stack to give them a chance to allocate
+	 * layer-private data, and to attach it to the cl_req by calling
+	 * cl_req_slice_add().
+	 *
+	 * \see osc_req_init(), lov_req_init(), lovsub_req_init()
+	 * \see ccc_req_init()
+	 */
+	int (*cdo_req_init)(const struct lu_env *env, struct cl_device *dev,
+			    struct cl_req *req);
+};
+
+/**
+ * Device in the client stack: an lu_device plus a cl operation vector.
+ *
+ * \see ccc_device, lov_device, lovsub_device, osc_device
+ */
+struct cl_device {
+	/** Super-class. */
+	struct lu_device		   cd_lu_dev;
+	/** Per-layer operation vector (see cl_device_operations). */
+	const struct cl_device_operations *cd_ops;
+};
+
+/** \addtogroup cl_object cl_object
+ * @{ */
+/**
+ * "Data attributes" of cl_object. Data attributes can be updated
+ * independently for a sub-object, and top-object's attributes are calculated
+ * from sub-objects' ones.
+ */
+struct cl_attr {
+	/** Object size, in bytes */
+	loff_t cat_size;
+	/**
+	 * Known minimal size, in bytes.
+	 *
+	 * This is only valid when at least one DLM lock is held.
+	 */
+	loff_t cat_kms;
+	/** Modification time. Measured in seconds since epoch. */
+	time_t cat_mtime;
+	/** Access time. Measured in seconds since epoch. */
+	time_t cat_atime;
+	/** Change time. Measured in seconds since epoch. */
+	time_t cat_ctime;
+	/**
+	 * Blocks allocated to this cl_object on the server file system.
+	 *
+	 * \todo XXX An interface for block size is needed.
+	 */
+	__u64  cat_blocks;
+	/**
+	 * User identifier for quota purposes.
+	 */
+	uid_t  cat_uid;
+	/**
+	 * Group identifier for quota purposes.
+	 */
+	gid_t  cat_gid;
+};
+
+/**
+ * Fields in cl_attr that are being set.
+ */
+enum cl_attr_valid {
+	CAT_SIZE   = 1 << 0,
+	CAT_KMS    = 1 << 1,	/* note: bit 2 is not assigned */
+	CAT_MTIME  = 1 << 3,
+	CAT_ATIME  = 1 << 4,
+	CAT_CTIME  = 1 << 5,
+	CAT_BLOCKS = 1 << 6,
+	CAT_UID    = 1 << 7,
+	CAT_GID    = 1 << 8
+};
+
+/**
+ * Sub-class of lu_object with methods common for objects on the client
+ * stacks.
+ *
+ * cl_object: represents a regular file system object, both a file and a
+ *    stripe. cl_object is based on lu_object: it is identified by a fid,
+ *    layered, cached, hashed, and lrued. Important distinction with the server
+ *    side, where md_object and dt_object are used, is that cl_object "fans out"
+ *    at the lov/sns level: depending on the file layout, single file is
+ *    represented as a set of "sub-objects" (stripes). At the implementation
+ *    level, struct lov_object contains an array of cl_objects. Each sub-object
+ *    is a full-fledged cl_object, having its fid, living in the lru and hash
+ *    table.
+ *
+ *    This leads to the next important difference with the server side: on the
+ *    client, it's quite usual to have objects with the different sequence of
+ *    layers. For example, typical top-object is composed of the following
+ *    layers:
+ *
+ *	- vvp
+ *	- lov
+ *
+ *    whereas its sub-objects are composed of
+ *
+ *	- lovsub
+ *	- osc
+ *
+ *    layers. Here "lovsub" is a mostly dummy layer, whose purpose is to keep
+ *    track of the object-subobject relationship.
+ *
+ *    Sub-objects are not cached independently: when top-object is about to
+ *    be discarded from the memory, all its sub-objects are torn-down and
+ *    destroyed too.
+ *
+ * \see ccc_object, lov_object, lovsub_object, osc_object
+ */
+struct cl_object {
+	/** super class */
+	struct lu_object		   co_lu;
+	/** per-object-layer operations */
+	const struct cl_object_operations *co_ops;
+	/** offset of page slice in cl_page buffer */
+	int				   co_slice_off;
+};
+
+/**
+ * Description of the client object configuration. This is used for the
+ * creation of a new client object that is identified by more state than
+ * just its fid.
+ */
+struct cl_object_conf {
+	/** Super-class. */
+	struct lu_object_conf     coc_lu;
+	union {
+		/**
+		 * Object layout. This is consumed by lov.
+		 */
+		struct lustre_md *coc_md;
+		/**
+		 * Description of particular stripe location in the
+		 * cluster. This is consumed by osc.
+		 */
+		struct lov_oinfo *coc_oinfo;
+	} u;
+	/**
+	 * VFS inode. This is consumed by vvp.
+	 */
+	struct inode	     *coc_inode;
+	/**
+	 * Layout lock handle.
+	 */
+	struct ldlm_lock	 *coc_lock;
+	/**
+	 * Operation to handle layout, OBJECT_CONF_XYZ.
+	 */
+	int			  coc_opc;
+};
+
+enum {	/* operation codes for cl_object_conf::coc_opc */
+	/** configure layout, set up a new stripe, must be called while
+	 * holding layout lock. */
+	OBJECT_CONF_SET = 0,
+	/** invalidate the current stripe configuration due to losing
+	 * layout lock. */
+	OBJECT_CONF_INVALIDATE = 1,
+	/** wait for old layout to go away so that new layout can be
+	 * set up. */
+	OBJECT_CONF_WAIT = 2
+};
+
+/**
+ * Operations implemented for each cl object layer.
+ *
+ * \see vvp_ops, lov_ops, lovsub_ops, osc_ops
+ */
+struct cl_object_operations {
+	/**
+	 * Initialize page slice for this layer. Called top-to-bottom through
+	 * every object layer when a new cl_page is instantiated. Layer
+	 * keeping private per-page data, or requiring its own page operations
+	 * vector should allocate these data here, and attach them to the page
+	 * by calling cl_page_slice_add(). \a vmpage is locked (in the VM
+	 * sense). Optional.
+	 *
+	 * NOTE(review): this method returns int, so the pointer-style return
+	 * values previously listed here (NULL, ERR_PTR(), existing page)
+	 * cannot apply as written. Presumably:
+	 *
+	 * \retval 0   success.
+	 * \retval -ve failure code. (TODO: confirm against implementations.)
+	 */
+	int  (*coo_page_init)(const struct lu_env *env, struct cl_object *obj,
+				struct cl_page *page, struct page *vmpage);
+	/**
+	 * Initialize lock slice for this layer. Called top-to-bottom through
+	 * every object layer when a new cl_lock is instantiated. Layer
+	 * keeping private per-lock data, or requiring its own lock operations
+	 * vector should allocate these data here, and attach them to the lock
+	 * by calling cl_lock_slice_add(). Mandatory.
+	 */
+	int  (*coo_lock_init)(const struct lu_env *env,
+			      struct cl_object *obj, struct cl_lock *lock,
+			      const struct cl_io *io);
+	/**
+	 * Initialize io state for a given layer.
+	 *
+	 * Called top-to-bottom once per io existence to initialize io
+	 * state. If layer wants to keep some state for this type of io, it
+	 * has to embed struct cl_io_slice in lu_env::le_ses, and register
+	 * slice with cl_io_slice_add(). It is guaranteed that all threads
+	 * participating in this io share the same session.
+	 */
+	int  (*coo_io_init)(const struct lu_env *env,
+			    struct cl_object *obj, struct cl_io *io);
+	/**
+	 * Fill portion of \a attr that this layer controls. This method is
+	 * called top-to-bottom through all object layers.
+	 *
+	 * \pre cl_object_header::coh_attr_guard of the top-object is locked.
+	 *
+	 * \return   0: to continue
+	 * \return +ve: to stop iterating through layers (but 0 is returned
+	 * from enclosing cl_object_attr_get())
+	 * \return -ve: to signal error
+	 */
+	int (*coo_attr_get)(const struct lu_env *env, struct cl_object *obj,
+			    struct cl_attr *attr);
+	/**
+	 * Update attributes.
+	 *
+	 * \a valid is a bitmask composed from enum #cl_attr_valid, and
+	 * indicating what attributes are to be set.
+	 *
+	 * \pre cl_object_header::coh_attr_guard of the top-object is locked.
+	 *
+	 * \return the same convention as for
+	 * cl_object_operations::coo_attr_get() is used.
+	 */
+	int (*coo_attr_set)(const struct lu_env *env, struct cl_object *obj,
+			    const struct cl_attr *attr, unsigned valid);
+	/**
+	 * Update object configuration. Called top-to-bottom to modify object
+	 * configuration.
+	 *
+	 * XXX error conditions and handling.
+	 */
+	int (*coo_conf_set)(const struct lu_env *env, struct cl_object *obj,
+			    const struct cl_object_conf *conf);
+	/**
+	 * Glimpse ast. Executed when glimpse ast arrives for a lock on this
+	 * object. Layers are supposed to fill parts of \a lvb that will be
+	 * shipped to the glimpse originator as a glimpse result.
+	 *
+	 * \see ccc_object_glimpse(), lovsub_object_glimpse(),
+	 * \see osc_object_glimpse()
+	 */
+	int (*coo_glimpse)(const struct lu_env *env,
+			   const struct cl_object *obj, struct ost_lvb *lvb);
+};
+
+/**
+ * Extended header for client object.
+ */
+struct cl_object_header {
+	/** Standard lu_object_header. cl_object::co_lu::lo_header points
+	 * here. */
+	struct lu_object_header  coh_lu;
+	/** \name locks
+	 * \todo XXX move locks below to the separate cache-lines, they are
+	 * mostly useless otherwise.
+	 */
+	/** @{ */
+	/** Lock protecting page tree. */
+	spinlock_t		 coh_page_guard;
+	/** Lock protecting lock list, cl_object_header::coh_locks. */
+	spinlock_t		 coh_lock_guard;
+	/** @} locks */
+	/** Radix tree of cl_page's, cached for this object. */
+	struct radix_tree_root   coh_tree;
+	/** Number of pages in the radix tree. */
+	unsigned long	    coh_pages;
+	/** List of cl_lock's granted for this object. */
+	struct list_head	       coh_locks;
+
+	/**
+	 * Parent object. It is assumed that an object has a well-defined
+	 * parent, but not a well-defined child (there may be multiple
+	 * sub-objects, for the same top-object). cl_object_header::coh_parent
+	 * field allows certain code to be written generically, without
+	 * limiting possible cl_object layouts unduly.
+	 */
+	struct cl_object_header *coh_parent;
+	/**
+	 * Protects consistency between cl_attr of parent object and
+	 * attributes of sub-objects, that the former is calculated ("merged")
+	 * from.
+	 *
+	 * \todo XXX this can be a read/write lock if needed.
+	 */
+	spinlock_t		 coh_attr_guard;
+	/**
+	 * Size of cl_page + page slices
+	 */
+	unsigned short		 coh_page_bufsize;
+	/**
+	 * Number of objects above this one: 0 for a top-object, 1 for its
+	 * sub-object, etc.
+	 */
+	unsigned char		 coh_nesting;
+};
+
+/**
+ * Helper macro: iterate over all layers of the object \a obj top-to-bottom,
+ * assigning each layer to \a slice (walks loh_layers via co_lu.lo_linkage).
+ */
+#define cl_object_for_each(slice, obj)				      \
+	list_for_each_entry((slice),				    \
+				&(obj)->co_lu.lo_header->loh_layers,	\
+				co_lu.lo_linkage)
+/**
+ * Helper macro: iterate over all layers of the object \a obj bottom-to-top,
+ * assigning each layer to \a slice (walks loh_layers via co_lu.lo_linkage).
+ */
+#define cl_object_for_each_reverse(slice, obj)			       \
+	list_for_each_entry_reverse((slice),			     \
+					&(obj)->co_lu.lo_header->loh_layers, \
+					co_lu.lo_linkage)
+/** @} cl_object */
+
+#ifndef pgoff_t
+#define pgoff_t unsigned long
+#endif
+/* NOTE(review): the #ifndef above sees macros only, not a pgoff_t typedef. */
+#define CL_PAGE_EOF ((pgoff_t)~0ull)
+
+/** \addtogroup cl_page cl_page
+ * @{ */
+
+/** \struct cl_page
+ * Layered client page.
+ *
+ * cl_page: represents a portion of a file, cached in the memory. All pages
+ *    of the given file are of the same size, and are kept in the radix tree
+ *    hanging off the cl_object. cl_page doesn't fan out, but as sub-objects
+ *    of the top-level file object are first class cl_objects, they have their
+ *    own radix trees of pages and hence page is implemented as a sequence of
+ *    struct cl_pages's, linked into double-linked list through
+ *    cl_page::cp_parent and cl_page::cp_child pointers, each residing in the
+ *    corresponding radix tree at the corresponding logical offset.
+ *
+ * cl_page is associated with a VM page of the hosting environment (struct
+ *    page in the Linux kernel, for example). It is assumed that this
+ *    association is implemented by one of the cl_page layers (top layer in
+ *    the current design) that
+ *
+ *	- intercepts per-VM-page call-backs made by the environment (e.g.,
+ *	  memory pressure),
+ *
+ *	- translates state (page flag bits) and locking between lustre and
+ *	  environment.
+ *
+ *    The association between cl_page and struct page is immutable and
+ *    established when cl_page is created.
+ *
+ * cl_page can be "owned" by a particular cl_io (see below), guaranteeing
+ *    this io an exclusive access to this page w.r.t. other io attempts and
+ *    various events changing page state (such as transfer completion, or
+ *    eviction of the page from the memory). Note, that in general cl_io
+ *    cannot be identified with a particular thread, and page ownership is not
+ *    exactly equal to the current thread holding a lock on the page. Layer
+ *    implementing association between cl_page and struct page has to implement
+ *    ownership on top of available synchronization mechanisms.
+ *
+ *    While lustre client maintains the notion of page ownership by io,
+ *    hosting MM/VM usually has its own page concurrency control
+ *    mechanisms. For example, in Linux, page access is synchronized by the
+ *    per-page PG_locked bit-lock, and generic kernel code (generic_file_*())
+ *    takes care to acquire and release such locks as necessary around the
+ *    calls to the file system methods (->readpage(), ->prepare_write(),
+ *    ->commit_write(), etc.). This leads to the situation when there are two
+ *    different ways to own a page in the client:
+ *
+ *	- client code explicitly and voluntarily owns the page (cl_page_own());
+ *
+ *	- VM locks a page and then calls the client, that has "to assume"
+ *	  the ownership from the VM (cl_page_assume()).
+ *
+ *    Dual methods to release ownership are cl_page_disown() and
+ *    cl_page_unassume().
+ *
+ * cl_page is reference counted (cl_page::cp_ref). When reference counter
+ *    drops to 0, the page is returned to the cache, unless it is in
+ *    cl_page_state::CPS_FREEING state, in which case it is immediately
+ *    destroyed.
+ *
+ *    The general logic guaranteeing the absence of "existential races" for
+ *    pages is the following:
+ *
+ *	- there are fixed known ways for a thread to obtain a new reference
+ *	  to a page:
+ *
+ *	    - by doing a lookup in the cl_object radix tree, protected by the
+ *	      spin-lock;
+ *
+ *	    - by starting from VM-locked struct page and following some
+ *	      hosting environment method (e.g., following ->private pointer in
+ *	      the case of Linux kernel), see cl_vmpage_page();
+ *
+ *	- when the page enters cl_page_state::CPS_FREEING state, all these
+ *	  ways are severed with the proper synchronization
+ *	  (cl_page_delete());
+ *
+ *	- entry into cl_page_state::CPS_FREEING is serialized by the VM page
+ *	  lock;
+ *
+ *	- no new references to the page in cl_page_state::CPS_FREEING state
+ *	  are allowed (checked in cl_page_get()).
+ *
+ *    Together this guarantees that when last reference to a
+ *    cl_page_state::CPS_FREEING page is released, it is safe to destroy the
+ *    page, as neither references to it can be acquired at that point, nor
+ *    ones exist.
+ *
+ * cl_page is a state machine. States are enumerated in enum
+ *    cl_page_state. Possible state transitions are enumerated in
+ *    cl_page_state_set(). State transition process (i.e., actual changing of
+ *    cl_page::cp_state field) is protected by the lock on the underlying VM
+ *    page.
+ *
+ * Linux Kernel implementation.
+ *
+ *    Binding between cl_page and struct page is implemented in the vvp
+ *    layer. cl_page is attached to the
+ *    ->private pointer of the struct page, together with the setting of
+ *    PG_private bit in page->flags, and acquiring additional reference on the
+ *    struct page (much like struct buffer_head, or any similar file system
+ *    private data structures).
+ *
+ *    PG_locked lock is used to implement both ownership and transfer
+ *    synchronization, that is, page is VM-locked in CPS_{OWNED,PAGE{IN,OUT}}
+ *    states. No additional references are acquired for the duration of the
+ *    transfer.
+ *
+ * \warning *THIS IS NOT* the behavior expected by the Linux kernel, where
+ *	  write-out is "protected" by the special PG_writeback bit.
+ */
+
+/**
+ * States of cl_page. cl_page.c assumes particular order here.
+ *
+ * The page state machine is rather crude, as it doesn't recognize finer page
+ * states like "dirty" or "up to date". This is because such states are not
+ * always well defined for the whole stack (see, for example, the
+ * implementation of the read-ahead, that hides page up-to-dateness to track
+ * cache hits accurately). Such sub-states are maintained by the layers that
+ * are interested in them.
+ */
+enum cl_page_state {
+	/**
+	 * Page is in the cache, un-owned. Page leaves cached state in the
+	 * following cases:
+	 *
+	 *     - [cl_page_state::CPS_OWNED] io comes across the page and
+	 *     owns it;
+	 *
+	 *     - [cl_page_state::CPS_PAGEOUT] page is dirty, the
+	 *     req-formation engine decides that it wants to include this page
+	 *     into a cl_req being constructed, and yanks it from the cache;
+	 *
+	 *     - [cl_page_state::CPS_FREEING] VM callback is executed to
+	 *     evict the page from the memory;
+	 *
+	 * \invariant cl_page::cp_owner == NULL && cl_page::cp_req == NULL
+	 */
+	CPS_CACHED,
+	/**
+	 * Page is exclusively owned by some cl_io. Page may end up in this
+	 * state as a result of
+	 *
+	 *     - io creating new page and immediately owning it;
+	 *
+	 *     - [cl_page_state::CPS_CACHED] io finding existing cached page
+	 *     and owning it;
+	 *
+	 *     - [cl_page_state::CPS_OWNED] io finding existing owned page
+	 *     and waiting for owner to release the page;
+	 *
+	 * Page leaves owned state in the following cases:
+	 *
+	 *     - [cl_page_state::CPS_CACHED] io decides to leave the page in
+	 *     the cache, doing nothing;
+	 *
+	 *     - [cl_page_state::CPS_PAGEIN] io starts read transfer for
+	 *     this page;
+	 *
+	 *     - [cl_page_state::CPS_PAGEOUT] io starts immediate write
+	 *     transfer for this page;
+	 *
+	 *     - [cl_page_state::CPS_FREEING] io decides to destroy this
+	 *     page (e.g., as part of truncate or extent lock cancellation).
+	 *
+	 * \invariant cl_page::cp_owner != NULL && cl_page::cp_req == NULL
+	 */
+	CPS_OWNED,
+	/**
+	 * Page is being written out, as a part of a transfer. This state is
+	 * entered when req-formation logic decided that it wants this page to
+	 * be sent through the wire _now_. Specifically, it means that once
+	 * this state is achieved, transfer completion handler (with either
+	 * success or failure indication) is guaranteed to be executed against
+	 * this page independently of any locks and any scheduling decisions
+	 * made by the hosting environment (that effectively means that the
+	 * page is never put into cl_page_state::CPS_PAGEOUT state "in
+	 * advance". This property is mentioned, because it is important when
+	 * reasoning about possible dead-locks in the system). The page can
+	 * enter this state as a result of
+	 *
+	 *     - [cl_page_state::CPS_OWNED] an io requesting an immediate
+	 *     write-out of this page, or
+	 *
+	 *     - [cl_page_state::CPS_CACHED] req-forming engine deciding
+	 *     that it has enough dirty pages cached to issue a "good"
+	 *     transfer.
+	 *
+	 * The page leaves cl_page_state::CPS_PAGEOUT state when the transfer
+	 * is completed---it is moved into cl_page_state::CPS_CACHED state.
+	 *
+	 * Underlying VM page is locked for the duration of transfer.
+	 *
+	 * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
+	 */
+	CPS_PAGEOUT,
+	/**
+	 * Page is being read in, as a part of a transfer. This is quite
+	 * similar to the cl_page_state::CPS_PAGEOUT state, except that
+	 * read-in is always "immediate"---there is no such thing as a sudden
+	 * construction of read cl_req from cached, presumably not up to date,
+	 * pages.
+	 *
+	 * Underlying VM page is locked for the duration of transfer.
+	 *
+	 * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
+	 */
+	CPS_PAGEIN,
+	/**
+	 * Page is being destroyed. This state is entered when client decides
+	 * that page has to be deleted from its host object, as, e.g., a part
+	 * of truncate.
+	 *
+	 * Once this state is reached, there is no way to escape it.
+	 *
+	 * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req == NULL
+	 */
+	CPS_FREEING,
+	CPS_NR
+};
+
+enum cl_page_type {
+	/** Host page: the page is from the host inode which the cl_page
+	 * belongs to. */
+	CPT_CACHEABLE = 1,
+
+	/** Transient page: a transient cl_page is used to bind a cl_page
+	 *  to a vmpage that does not belong to the same object as the
+	 *  cl_page. It is used in DirectIO, lockless IO and liblustre. */
+	CPT_TRANSIENT,
+};
+
+/**
+ * Flags maintained for every cl_page, stored in cl_page::cp_flags.
+ */
+enum cl_page_flags {
+	/**
+	 * Set when pagein completes. Used for debugging (read completes at
+	 * most once for a page).
+	 */
+	CPF_READ_COMPLETED = 1 << 0
+};
+
+/**
+ * Fields are protected by the lock on struct page, except for atomics and
+ * immutables.
+ *
+ * \invariant Data type invariants are in cl_page_invariant(). Basically:
+ * cl_page::cp_parent and cl_page::cp_child are a well-formed double-linked
+ * list, consistent with the parent/child pointers in the cl_page::cp_obj and
+ * cl_page::cp_owner (when set).
+ */
+struct cl_page {
+	/** Reference counter. */
+	atomic_t	     cp_ref;
+	/** An object this page is a part of. Immutable after creation. */
+	struct cl_object	*cp_obj;
+	/** Logical page index within the object. Immutable after creation. */
+	pgoff_t		  cp_index;
+	/** List of slices. Immutable after creation. */
+	struct list_head	       cp_layers;
+	/** Parent page, NULL for top-level page. Immutable after creation. */
+	struct cl_page	  *cp_parent;
+	/** Lower-layer page. NULL for bottommost page. Immutable after
+	 * creation. */
+	struct cl_page	  *cp_child;
+	/**
+	 * Page state. This field is const to avoid accidental update, it is
+	 * modified only internally within cl_page.c. Protected by a VM lock.
+	 */
+	const enum cl_page_state cp_state;
+	/** Linkage of pages within group. Protected by cl_page::cp_mutex. */
+	struct list_head		cp_batch;
+	/** Mutex serializing membership of a page in a batch. */
+	struct mutex		cp_mutex;
+	/** Linkage of pages within cl_req. */
+	struct list_head	       cp_flight;
+	/** Transfer error. */
+	int		      cp_error;
+
+	/**
+	 * Page type (enum cl_page_type). Immutable after creation.
+	 * NOTE(review): "only CPT_TRANSIENT is used" looks stale -- confirm.
+	 */
+	enum cl_page_type	cp_type;
+
+	/**
+	 * Owning IO in cl_page_state::CPS_OWNED state. Sub-page can be owned
+	 * by sub-io. Protected by a VM lock.
+	 */
+	struct cl_io	    *cp_owner;
+	/**
+	 * Debug information: the task that owns the page.
+	 */
+	task_t	      *cp_task;
+	/**
+	 * Owning IO request in cl_page_state::CPS_PAGEOUT and
+	 * cl_page_state::CPS_PAGEIN states. This field is maintained only in
+	 * the top-level pages. Protected by a VM lock.
+	 */
+	struct cl_req	   *cp_req;
+	/** List of references to this page, for debugging. */
+	struct lu_ref	    cp_reference;
+	/** Link to an object, for debugging. */
+	struct lu_ref_link      *cp_obj_ref;
+	/** Link to a queue, for debugging. */
+	struct lu_ref_link      *cp_queue_ref;
+	/** Per-page flags from enum cl_page_flags. Protected by a VM lock. */
+	unsigned		 cp_flags;
+	/** Assigned if doing a sync_io */
+	struct cl_sync_io       *cp_sync_io;
+};
+
+/**
+ * Per-layer part of cl_page, linked into cl_page::cp_layers.
+ *
+ * \see ccc_page, lov_page, osc_page
+ */
+struct cl_page_slice {
+	struct cl_page		  *cpl_page;
+	/**
+	 * Object slice corresponding to this page slice. Immutable after
+	 * creation.
+	 */
+	struct cl_object		*cpl_obj;
+	const struct cl_page_operations *cpl_ops;
+	/** Linkage into cl_page::cp_layers. Immutable after creation. */
+	struct list_head		       cpl_linkage;
+};
+
+/**
+ * Lock mode, for the client extent locks.
+ *
+ * \warning: cl_lock_mode_match() assumes particular ordering here.
+ * \ingroup cl_lock
+ */
+enum cl_lock_mode {
+	/**
+	 * Mode of a lock that protects no data, and exists only as a
+	 * placeholder. This is used for "glimpse" requests. A phantom lock
+	 * might get promoted to a real lock at some point.
+	 */
+	CLM_PHANTOM,
+	CLM_READ,
+	CLM_WRITE,
+	CLM_GROUP
+};
+
+/**
+ * Requested transfer type. CRT_NR sizes cl_page_operations::io[].
+ * \ingroup cl_req
+ */
+enum cl_req_type {
+	CRT_READ,
+	CRT_WRITE,
+	CRT_NR
+};
+
+/**
+ * Per-layer page operations.
+ *
+ * Methods taking an \a io argument are for the activity happening in the
+ * context of given \a io. Page is assumed to be owned by that io, except for
+ * the obvious cases (like cl_page_operations::cpo_own()).
+ *
+ * \see vvp_page_ops, lov_page_ops, osc_page_ops
+ */
+struct cl_page_operations {
+	/**
+	 * cl_page<->struct page methods. Only one layer in the stack has to
+	 * implement these. Current code assumes that this functionality is
+	 * provided by the topmost layer, see cl_page_disown0() as an example.
+	 */
+
+	/**
+	 * \return the underlying VM page. Optional.
+	 */
+	struct page *(*cpo_vmpage)(const struct lu_env *env,
+				  const struct cl_page_slice *slice);
+	/**
+	 * Called when \a io acquires this page into the exclusive
+	 * ownership. When this method returns, it is guaranteed that the
+	 * page is not owned by other io, and no transfer is going on against
+	 * it. Optional.
+	 *
+	 * \see cl_page_own()
+	 * \see vvp_page_own(), lov_page_own()
+	 */
+	int  (*cpo_own)(const struct lu_env *env,
+			const struct cl_page_slice *slice,
+			struct cl_io *io, int nonblock);
+	/** Called when ownership is yielded. Optional.
+	 *
+	 * \see cl_page_disown()
+	 * \see vvp_page_disown()
+	 */
+	void (*cpo_disown)(const struct lu_env *env,
+			   const struct cl_page_slice *slice, struct cl_io *io);
+	/**
+	 * Called for a page that is already "owned" by \a io from VM point of
+	 * view. Optional.
+	 *
+	 * \see cl_page_assume()
+	 * \see vvp_page_assume(), lov_page_assume()
+	 */
+	void (*cpo_assume)(const struct lu_env *env,
+			   const struct cl_page_slice *slice, struct cl_io *io);
+	/** Dual to cl_page_operations::cpo_assume(). Optional. Called
+	 * bottom-to-top when IO releases a page without actually unlocking
+	 * it.
+	 *
+	 * \see cl_page_unassume()
+	 * \see vvp_page_unassume()
+	 */
+	void (*cpo_unassume)(const struct lu_env *env,
+			     const struct cl_page_slice *slice,
+			     struct cl_io *io);
+	/**
+	 * Announces whether the page contains valid data or not by \a uptodate.
+	 *
+	 * \see cl_page_export()
+	 * \see vvp_page_export()
+	 */
+	void  (*cpo_export)(const struct lu_env *env,
+			    const struct cl_page_slice *slice, int uptodate);
+	/**
+	 * Unmaps page from the user space (if it is mapped).
+	 *
+	 * \see cl_page_unmap()
+	 * \see vvp_page_unmap()
+	 */
+	int (*cpo_unmap)(const struct lu_env *env,
+			 const struct cl_page_slice *slice, struct cl_io *io);
+	/**
+	 * Checks whether underlying VM page is locked (in the suitable
+	 * sense). Used for assertions.
+	 *
+	 * \retval    -EBUSY: page is protected by a lock of a given mode;
+	 * \retval  -ENODATA: page is not protected by a lock;
+	 * \retval	 0: this layer cannot decide. (Should never happen.)
+	 */
+	int (*cpo_is_vmlocked)(const struct lu_env *env,
+			       const struct cl_page_slice *slice);
+	/**
+	 * Page destruction.
+	 */
+
+	/**
+	 * Called when page is truncated from the object. Optional.
+	 *
+	 * \see cl_page_discard()
+	 * \see vvp_page_discard(), osc_page_discard()
+	 */
+	void (*cpo_discard)(const struct lu_env *env,
+			    const struct cl_page_slice *slice,
+			    struct cl_io *io);
+	/**
+	 * Called when page is removed from the cache, and is about to be
+	 * destroyed. Optional.
+	 *
+	 * \see cl_page_delete()
+	 * \see vvp_page_delete(), osc_page_delete()
+	 */
+	void (*cpo_delete)(const struct lu_env *env,
+			   const struct cl_page_slice *slice);
+	/** Destructor. Frees resources and slice itself. */
+	void (*cpo_fini)(const struct lu_env *env,
+			 struct cl_page_slice *slice);
+
+	/**
+	 * Checks whether the page is protected by a cl_lock. This is a
+	 * per-layer method, because certain layers have ways to check for the
+	 * lock much more efficiently than through the generic locks scan, or
+	 * implement locking mechanisms separate from cl_lock, e.g.,
+	 * LL_FILE_GROUP_LOCKED in vvp. If \a pending is true, check for locks
+	 * being canceled, or scheduled for cancellation as soon as the last
+	 * user goes away, too.
+	 *
+	 * \retval    -EBUSY: page is protected by a lock of a given mode;
+	 * \retval  -ENODATA: page is not protected by a lock;
+	 * \retval	 0: this layer cannot decide.
+	 *
+	 * \see cl_page_is_under_lock()
+	 */
+	int (*cpo_is_under_lock)(const struct lu_env *env,
+				 const struct cl_page_slice *slice,
+				 struct cl_io *io);
+
+	/**
+	 * Optional debugging helper. Prints given page slice.
+	 *
+	 * \see cl_page_print()
+	 */
+	int (*cpo_print)(const struct lu_env *env,
+			 const struct cl_page_slice *slice,
+			 void *cookie, lu_printer_t p);
+	/**
+	 * \name transfer
+	 *
+	 * Transfer methods. See comment on cl_req for a description of
+	 * transfer formation and life-cycle.
+	 *
+	 * @{
+	 */
+	/**
+	 * Request type dependent vector of operations.
+	 *
+	 * Transfer operations depend on transfer mode (cl_req_type). To avoid
+	 * passing transfer mode to each and every of these methods, and to
+	 * avoid branching on request type inside of the methods, separate
+	 * methods for cl_req_type::CRT_READ and cl_req_type::CRT_WRITE are
+	 * provided. That is, method invocation usually looks like
+	 *
+	 *	 slice->cpl_ops->io[req->crq_type].cpo_method(env, slice, ...);
+	 */
+	struct {
+		/**
+		 * Called when a page is submitted for a transfer as a part of
+		 * cl_page_list.
+		 *
+		 * \return    0	 : page is eligible for submission;
+		 * \return    -EALREADY : skip this page;
+		 * \return    -ve       : error.
+		 *
+		 * \see cl_page_prep()
+		 */
+		int  (*cpo_prep)(const struct lu_env *env,
+				 const struct cl_page_slice *slice,
+				 struct cl_io *io);
+		/**
+		 * Completion handler. This is guaranteed to be eventually
+		 * fired after cl_page_operations::cpo_prep() or
+		 * cl_page_operations::cpo_make_ready() call.
+		 *
+		 * This method can be called in a non-blocking context. It is
+		 * guaranteed however, that the page involved and its object
+		 * are pinned in memory (and, hence, calling cl_page_put() is
+		 * safe).
+		 *
+		 * \see cl_page_completion()
+		 */
+		void (*cpo_completion)(const struct lu_env *env,
+				       const struct cl_page_slice *slice,
+				       int ioret);
+		/**
+		 * Called when cached page is about to be added to the
+		 * cl_req as a part of req formation.
+		 *
+		 * \return    0       : proceed with this page;
+		 * \return    -EAGAIN : skip this page;
+		 * \return    -ve     : error.
+		 *
+		 * \see cl_page_make_ready()
+		 */
+		int  (*cpo_make_ready)(const struct lu_env *env,
+				       const struct cl_page_slice *slice);
+		/**
+		 * Announce that this page is to be written out
+		 * opportunistically, that is, page is dirty, it is not
+		 * necessary to start write-out transfer right now, but
+		 * eventually page has to be written out.
+		 *
+		 * Main caller of this is the write path (see
+		 * vvp_io_commit_write()), using this method to build a
+		 * "transfer cache" from which large transfers are then
+		 * constructed by the req-formation engine.
+		 *
+		 * \todo XXX it would make sense to add page-age tracking
+		 * semantics here, and to oblige the req-formation engine to
+		 * send the page out not later than it is too old.
+		 *
+		 * \see cl_page_cache_add()
+		 */
+		int  (*cpo_cache_add)(const struct lu_env *env,
+				      const struct cl_page_slice *slice,
+				      struct cl_io *io);
+	} io[CRT_NR];
+	/**
+	 * Tell transfer engine that only [from, to] part of a page should be
+	 * transmitted.
+	 *
+	 * This is used for immediate transfers.
+	 *
+	 * \todo XXX this is not very good interface. It would be much better
+	 * if all transfer parameters were supplied as arguments to
+	 * cl_io_operations::cio_submit() call, but it is not clear how to do
+	 * this for page queues.
+	 *
+	 * \see cl_page_clip()
+	 */
+	void (*cpo_clip)(const struct lu_env *env,
+			 const struct cl_page_slice *slice,
+			 int from, int to);
+	/**
+	 * \pre  the page was queued for transferring.
+	 * \post page is removed from client's pending list, or -EBUSY
+	 *       is returned if it is already being transferred.
+	 *
+	 * This is one of the few page operations which:
+	 * 0. is called from top level;
+	 * 1. does not have vmpage locked;
+	 * 2. requires every layer to synchronize execution of its
+	 *    ->cpo_cancel() with completion handlers. Osc uses client obd
+	 *    lock for this purpose. Since there is no vvp_page_cancel() or
+	 *    lov_page_cancel(), cpo_cancel is de facto protected by client lock.
+	 *
+	 * \see osc_page_cancel().
+	 */
+	int (*cpo_cancel)(const struct lu_env *env,
+			  const struct cl_page_slice *slice);
+	/**
+	 * Write out a page by kernel. This is only called by ll_writepage
+	 * right now.
+	 *
+	 * \see cl_page_flush()
+	 */
+	int (*cpo_flush)(const struct lu_env *env,
+			 const struct cl_page_slice *slice,
+			 struct cl_io *io);
+	/** @} transfer */
+};
+
+/**
+ * Helper macro: if \a mask is enabled, log cl_page_print() output + message.
+ */
+#define CL_PAGE_DEBUG(mask, env, page, format, ...)		     \
+do {								    \
+	LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);		\
+									\
+	if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {		   \
+		cl_page_print(env, &msgdata, lu_cdebug_printer, page);  \
+		CDEBUG(mask, format , ## __VA_ARGS__);		  \
+	}							       \
+} while (0)
+
+/**
+ * Helper macro: if \a mask is enabled, log cl_page_header_print() + message.
+ */
+#define CL_PAGE_HEADER(mask, env, page, format, ...)			  \
+do {									  \
+	LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);		      \
+									      \
+	if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {			 \
+		cl_page_header_print(env, &msgdata, lu_cdebug_printer, page); \
+		CDEBUG(mask, format , ## __VA_ARGS__);			\
+	}								     \
+} while (0)
+
+static inline int __page_in_use(const struct cl_page *page, int refc)
+{
+	if (page->cp_type == CPT_CACHEABLE)
+		++refc; /* cacheable pages get a threshold one higher */
+	LASSERT(atomic_read(&page->cp_ref) > 0);
+	return (atomic_read(&page->cp_ref) > refc); /* refs above threshold? */
+}
+#define cl_page_in_use(pg)       __page_in_use(pg, 1)
+#define cl_page_in_use_noref(pg) __page_in_use(pg, 0)
+
+/** @} cl_page */
+
+/** \addtogroup cl_lock cl_lock
+ * @{ */
+/** \struct cl_lock
+ *
+ * Extent locking on the client.
+ *
+ * LAYERING
+ *
+ * The locking model of the new client code is built around
+ *
+ *	struct cl_lock
+ *
+ * data-type representing an extent lock on a regular file. cl_lock is a
+ * layered object (much like cl_object and cl_page), it consists of a header
+ * (struct cl_lock) and a list of layers (struct cl_lock_slice), linked to
+ * cl_lock::cll_layers list through cl_lock_slice::cls_linkage.
+ *
+ * All locks for a given object are linked into cl_object_header::coh_locks
+ * list (protected by cl_object_header::coh_lock_guard spin-lock) through
+ * cl_lock::cll_linkage. Currently this list is not sorted in any way. We can
+ * sort it in starting lock offset, or use altogether different data structure
+ * like a tree.
+ *
+ * Typical cl_lock consists of the two layers:
+ *
+ *     - vvp_lock (vvp specific data), and
+ *     - lov_lock (lov specific data).
+ *
+ * lov_lock contains an array of sub-locks. Each of these sub-locks is a
+ * normal cl_lock: it has a header (struct cl_lock) and a list of layers:
+ *
+ *     - lovsub_lock, and
+ *     - osc_lock
+ *
+ * Each sub-lock is associated with a cl_object (representing stripe
+ * sub-object or the file with which top-level cl_lock is associated), and is
+ * linked into that cl_object::coh_locks. In this respect cl_lock is similar to
+ * cl_object (that at lov layer also fans out into multiple sub-objects), and
+ * is different from cl_page, that doesn't fan out (there is usually exactly
+ * one osc_page for every vvp_page). We shall call vvp-lov portion of the lock
+ * a "top-lock" and its lovsub-osc portion a "sub-lock".
+ *
+ * LIFE CYCLE
+ *
+ * cl_lock is reference counted. When reference counter drops to 0, lock is
+ * placed in the cache, except when lock is in CLS_FREEING state. CLS_FREEING
+ * lock is destroyed when last reference is released. Referencing between
+ * top-lock and its sub-locks is described in the lov documentation module.
+ *
+ * STATE MACHINE
+ *
+ * Also, cl_lock is a state machine. This requires some clarification. One of
+ * the goals of client IO re-write was to make IO path non-blocking, or at
+ * least to make it easier to make it non-blocking in the future. Here
+ * `non-blocking' means that when a system call (read, write, truncate)
+ * reaches a situation where it has to wait for a communication with the
+ * server, it should --instead of waiting-- remember its current state and
+ * switch to some other work.  E.g., instead of waiting for a lock enqueue,
+ * client should proceed doing IO on the next stripe, etc. Obviously this is
+ * rather radical redesign, and it is not planned to be fully implemented at
+ * this time, instead we are putting some infrastructure in place, that would
+ * make it easier to do asynchronous non-blocking IO in the
+ * future. Specifically, where old locking code goes to sleep (waiting for
+ * enqueue, for example), new code returns cl_lock_transition::CLO_WAIT. When
+ * enqueue reply comes, its completion handler signals that lock state-machine
+ * is ready to transit to the next state. There is some generic code in
+ * cl_lock.c that sleeps, waiting for these signals. As a result, for users of
+ * this cl_lock.c code, it looks like locking is done in normal blocking
+ * fashion, and at the same time it is possible to switch to the non-blocking
+ * locking (simply by returning cl_lock_transition::CLO_WAIT from cl_lock.c
+ * functions).
+ *
+ * For a description of state machine states and transitions see enum
+ * cl_lock_state.
+ *
+ * There are two ways to restrict a set of states which lock might move to:
+ *
+ *     - placing a "hold" on a lock guarantees that lock will not be moved
+ *       into cl_lock_state::CLS_FREEING state until hold is released. Hold
+ *       can be only acquired on a lock that is not in
+ *       cl_lock_state::CLS_FREEING. All holds on a lock are counted in
+ *       cl_lock::cll_holds. Hold protects lock from cancellation and
+ *       destruction. Requests to cancel and destroy a lock on hold will be
+ *       recorded, but only honored when last hold on a lock is released;
+ *
+ *     - placing a "user" on a lock guarantees that lock will not leave
+ *       cl_lock_state::CLS_NEW, cl_lock_state::CLS_QUEUING,
+ *       cl_lock_state::CLS_ENQUEUED and cl_lock_state::CLS_HELD set of
+ *       states, once it enters this set. That is, if a user is added onto a
+ *       lock in a state not from this set, it doesn't immediately enforce
+ *       lock to move to this set, but once lock enters this set it will
+ *       remain there until all users are removed. Lock users are counted in
+ *       cl_lock::cll_users.
+ *
+ *       User is used to assure that lock is not canceled or destroyed while
+ *       it is being enqueued, or actively used by some IO.
+ *
+ *       Currently, a user always comes with a hold (cl_lock_invariant()
+ *       checks that a number of holds is not less than a number of users).
+ *
+ * CONCURRENCY
+ *
+ * This is how lock state-machine operates. struct cl_lock contains a mutex
+ * cl_lock::cll_guard that protects struct fields.
+ *
+ *     - mutex is taken, and cl_lock::cll_state is examined.
+ *
+ *     - for every state there are possible target states where lock can move
+ *       into. They are tried in order. Attempts to move into next state are
+ *       done by _try() functions in cl_lock.c:cl_{enqueue,unuse,wait}_try().
+ *
+ *     - if the transition can be performed immediately, state is changed,
+ *       and mutex is released.
+ *
+ *     - if the transition requires blocking, _try() function returns
+ *       cl_lock_transition::CLO_WAIT. Caller unlocks mutex and goes to
+ *       sleep, waiting for possibility of lock state change. It is woken
+ *       up when some event occurs, that makes lock state change possible
+ *       (e.g., the reception of the reply from the server), and repeats
+ *       the loop.
+ *
+ * Top-lock and sub-lock have separate mutexes and the latter has to be taken
+ * first to avoid dead-lock.
+ *
+ * To see an example of interaction of all these issues, take a look at the
+ * lov_cl.c:lov_lock_enqueue() function. It is called as a part of
+ * cl_enqueue_try(), and tries to advance top-lock to ENQUEUED state, by
+ * advancing state-machines of its sub-locks (lov_lock_enqueue_one()). Note
+ * also, that it uses trylock to grab sub-lock mutex to avoid dead-lock. It
+ * also has to handle CEF_ASYNC enqueue, when sub-locks enqueues have to be
+ * done in parallel, rather than one after another (this is used for glimpse
+ * locks, that cannot dead-lock).
+ *
+ * INTERFACE AND USAGE
+ *
+ * struct cl_lock_operations provide a number of call-backs that are invoked
+ * when events of interest occur. Layers can intercept and handle glimpse,
+ * blocking, cancel ASTs and a reception of the reply from the server.
+ *
+ * One important difference with the old client locking model is that new
+ * client has a representation for the top-lock, whereas in the old code only
+ * sub-locks existed as real data structures and file-level locks were
+ * represented by "request sets" that are created and destroyed on each and
+ * every lock creation.
+ *
+ * Top-locks are cached, and can be found in the cache by the system calls. It
+ * is possible that top-lock is in cache, but some of its sub-locks were
+ * canceled and destroyed. In that case top-lock has to be enqueued again
+ * before it can be used.
+ *
+ * Overall process of the locking during IO operation is as follows:
+ *
+ *     - once parameters for IO are setup in cl_io, cl_io_operations::cio_lock()
+ *       is called on each layer. Responsibility of this method is to add locks,
+ *       needed by a given layer into cl_io.ci_lockset.
+ *
+ *     - once locks for all layers were collected, they are sorted to avoid
+ *       dead-locks (cl_io_locks_sort()), and enqueued.
+ *
+ *     - when all locks are acquired, IO is performed;
+ *
+ *     - locks are released into cache.
+ *
+ * Striping introduces major additional complexity into locking. The
+ * fundamental problem is that it is generally unsafe to actively use (hold)
+ * two locks on the different OST servers at the same time, as this introduces
+ * inter-server dependency and can lead to cascading evictions.
+ *
+ * Basic solution is to sub-divide large read/write IOs into smaller pieces so
+ * that no multi-stripe locks are taken (note that this design abandons POSIX
+ * read/write semantics). Such pieces ideally can be executed concurrently. At
+ * the same time, certain types of IO cannot be sub-divided, without
+ * sacrificing correctness. This includes:
+ *
+ *  - O_APPEND write, where [0, EOF] lock has to be taken, to guarantee
+ *  atomicity;
+ *
+ *  - ftruncate(fd, offset), where [offset, EOF] lock has to be taken.
+ *
+ * Also, in the case of read(fd, buf, count) or write(fd, buf, count), where
+ * buf is a part of memory mapped Lustre file, a lock or locks protecting buf
+ * has to be held together with the usual lock on [offset, offset + count].
+ *
+ * As multi-stripe locks have to be allowed, it makes sense to cache them, so
+ * that, for example, a sequence of O_APPEND writes can proceed quickly
+ * without going down to the individual stripes to do lock matching. On the
+ * other hand, multi-stripe locks shouldn't be used by normal read/write
+ * calls. To achieve this, every layer can implement ->clo_fits_into() method,
+ * that is called by lock matching code (cl_lock_lookup()), and that can be
+ * used to selectively disable matching of certain locks for certain IOs. For
+ * example, lov layer implements lov_lock_fits_into() that allows multi-stripe
+ * locks to be matched only for truncates and O_APPEND writes.
+ *
+ * Interaction with DLM
+ *
+ * In the expected setup, cl_lock is ultimately backed up by a collection of
+ * DLM locks (struct ldlm_lock). Association between cl_lock and DLM lock is
+ * implemented in osc layer, that also maps DLM events (ASTs, cancellation,
+ * etc.) into cl_lock_operations calls. See struct osc_lock for a more detailed
+ * description of interaction with DLM.
+ */
+
+/**
+ * Lock description: object, page extent, group ID, mode and enqueue flags.
+ */
+struct cl_lock_descr {
+	/** Object this lock is granted for. */
+	struct cl_object *cld_obj;
+	/** Index of the first page protected by this lock. */
+	pgoff_t	   cld_start;
+	/** Index of the last page (inclusive) protected by this lock. */
+	pgoff_t	   cld_end;
+	/** Group ID, for group locks. */
+	__u64	     cld_gid;
+	/** Lock mode. */
+	enum cl_lock_mode cld_mode;
+	/**
+	 * Flags used to enqueue the lock. A combination of bit-flags from
+	 * enum cl_enq_flags.
+	 */
+	__u32	     cld_enq_flags;
+};
+
+#define DDESCR "%s(%d):[%lu, %lu]" /* printf format for PDESCR() arguments */
+#define PDESCR(descr)						   \
+	cl_lock_mode_name((descr)->cld_mode), (descr)->cld_mode,	\
+	(descr)->cld_start, (descr)->cld_end
+
+const char *cl_lock_mode_name(const enum cl_lock_mode mode);
+
+/**
+ * Lock state-machine states.
+ *
+ * \htmlonly
+ * <pre>
+ *
+ * Possible state transitions:
+ *
+ *	      +------------------>NEW
+ *	      |		    |
+ *	      |		    | cl_enqueue_try()
+ *	      |		    |
+ *	      |    cl_unuse_try()  V
+ *	      |  +--------------QUEUING (*)
+ *	      |  |		 |
+ *	      |  |		 | cl_enqueue_try()
+ *	      |  |		 |
+ *	      |  | cl_unuse_try()  V
+ *    sub-lock  |  +-------------ENQUEUED (*)
+ *    canceled  |  |		 |
+ *	      |  |		 | cl_wait_try()
+ *	      |  |		 |
+ *	      |  |		(R)
+ *	      |  |		 |
+ *	      |  |		 V
+ *	      |  |		HELD<---------+
+ *	      |  |		 |	    |
+ *	      |  |		 |	    | cl_use_try()
+ *	      |  |  cl_unuse_try() |	    |
+ *	      |  |		 |	    |
+ *	      |  |		 V	 ---+
+ *	      |  +------------>INTRANSIT (D) <--+
+ *	      |		    |	    |
+ *	      |     cl_unuse_try() |	    | cached lock found
+ *	      |		    |	    | cl_use_try()
+ *	      |		    |	    |
+ *	      |		    V	    |
+ *	      +------------------CACHED---------+
+ *				   |
+ *				  (C)
+ *				   |
+ *				   V
+ *				FREEING
+ *
+ * Legend:
+ *
+ *	 In states marked with (*) transition to the same state (i.e., a loop
+ *	 in the diagram) is possible.
+ *
+ *	 (R) is the point where Receive call-back is invoked: it allows layers
+ *	 to handle arrival of lock reply.
+ *
+ *	 (C) is the point where Cancellation call-back is invoked.
+ *
+ *	 (D) is the transit state which means the lock is changing.
+ *
+ *	 Transition to FREEING state is possible from any other state in the
+ *	 diagram in case of unrecoverable error.
+ * </pre>
+ * \endhtmlonly
+ *
+ * These states are for individual cl_lock object. Top-lock and its sub-locks
+ * can be in the different states. Another way to say this is that we have
+ * nested state-machines.
+ *
+ * Separate QUEUING and ENQUEUED states are needed to support non-blocking
+ * operation for locks with multiple sub-locks. Imagine lock on a file F, that
+ * intersects 3 stripes S0, S1, and S2. To enqueue F client has to send
+ * enqueue to S0, wait for its completion, then send enqueue for S1, wait for
+ * its completion and at last enqueue lock for S2, and wait for its
+ * completion. In that case, top-lock is in QUEUING state while S0, S1 are
+ * handled, and is in ENQUEUED state after enqueue to S2 has been sent (note
+ * that in this case, sub-locks move from state to state, and top-lock remains
+ * in the same state).
+ */
+enum cl_lock_state {
+	/**
+	 * Lock that wasn't yet enqueued.
+	 */
+	CLS_NEW,
+	/**
+	 * Enqueue is in progress, blocking for some intermediate interaction
+	 * with the other side.
+	 */
+	CLS_QUEUING,
+	/**
+	 * Lock is fully enqueued, waiting for server to reply when it is
+	 * granted.
+	 */
+	CLS_ENQUEUED,
+	/**
+	 * Lock granted, actively used by some IO.
+	 */
+	CLS_HELD,
+	/**
+	 * This state marks that the lock is in the middle of being used or
+	 * unused. We need this state because the lock may have several
+	 * sublocks, so there is no atomic way to bring all sublocks into
+	 * CLS_HELD state in the use case, or all sublocks to CLS_CACHED in
+	 * the unuse case.
+	 * If a thread is referring to a lock, and it sees the lock is in this
+	 * state, it must wait for the lock.
+	 * See state diagram for details.
+	 */
+	CLS_INTRANSIT,
+	/**
+	 * Lock granted, not used.
+	 */
+	CLS_CACHED,
+	/**
+	 * Lock is being destroyed.
+	 */
+	CLS_FREEING,
+	CLS_NR	/* number of states above; has to stay last */
+};
+
+enum cl_lock_flags {	/* bit values stored in cl_lock::cll_flags */
+	/**
+	 * lock has been cancelled. This flag is never cleared once set (by
+	 * cl_lock_cancel0()).
+	 */
+	CLF_CANCELLED  = 1 << 0,
+	/** cancellation is pending for this lock. */
+	CLF_CANCELPEND = 1 << 1,
+	/** destruction is pending for this lock. */
+	CLF_DOOMED     = 1 << 2,
+	/** from enqueue RPC reply upcall. */
+	CLF_FROM_UPCALL= 1 << 3,
+};
+
+/**
+ * Lock closure.
+ *
+ * Lock closure is a collection of locks (both top-locks and sub-locks) that
+ * might be updated as a result of an operation on a certain lock (which lock
+ * this is a closure of).
+ *
+ * Closures are needed to guarantee dead-lock freedom in the presence of
+ *
+ *     - nested state-machines (top-lock state-machine composed of sub-lock
+ *       state-machines), and
+ *
+ *     - shared sub-locks.
+ *
+ * Specifically, many operations, such as lock enqueue, wait, unlock,
+ * etc. start from a top-lock, and then operate on sub-locks of this
+ * top-lock, holding a top-lock mutex. When sub-lock state changes as a result
+ * of such operation, this change has to be propagated to all top-locks that
+ * share this sub-lock. Obviously, no natural lock ordering (e.g.,
+ * top-to-bottom or bottom-to-top) captures this scenario, so try-locking has
+ * to be used. Lock closure systematizes this try-and-repeat logic.
+ */
+struct cl_lock_closure {
+	/**
+	 * Lock that is mutexed when closure construction is started. When
+	 * closure is in `wait' mode (cl_lock_closure::clc_wait), mutex on
+	 * origin is released before waiting.
+	 */
+	struct cl_lock   *clc_origin;
+	/**
+	 * List of enclosed locks, so far. Locks are linked here through
+	 * cl_lock::cll_inclosure.
+	 */
+	struct list_head	clc_list;
+	/**
+	 * True iff closure is in a `wait' mode. This determines what
+	 * cl_lock_enclosure() does when a lock L to be added to the closure
+	 * is currently mutexed by some other thread.
+	 *
+	 * If cl_lock_closure::clc_wait is not set, then closure construction
+	 * fails with CLO_REPEAT immediately.
+	 *
+	 * In wait mode, cl_lock_enclosure() waits until next attempt to build
+	 * a closure might succeed. To this end it releases an origin mutex
+	 * (cl_lock_closure::clc_origin), that has to be the only lock mutex
+	 * owned by the current thread, and then waits on L mutex (by grabbing
+	 * it and immediately releasing), before returning CLO_REPEAT to the
+	 * caller.
+	 */
+	int	       clc_wait;
+	/** Number of locks in the closure. */
+	int	       clc_nr;
+};
+
+/**
+ * Layered client lock.
+ */
+struct cl_lock {
+	/** Reference counter. */
+	atomic_t	  cll_ref;
+	/** List of slices. Immutable after creation. */
+	struct list_head	    cll_layers;
+	/**
+	 * Linkage into cl_lock::cll_descr::cld_obj::coh_locks list. Protected
+	 * by cl_lock::cll_descr::cld_obj::coh_lock_guard.
+	 */
+	struct list_head	    cll_linkage;
+	/**
+	 * Parameters of this lock. Protected by
+	 * cl_lock::cll_descr::cld_obj::coh_lock_guard nested within
+	 * cl_lock::cll_guard. Modified only on lock creation and in
+	 * cl_lock_modify().
+	 */
+	struct cl_lock_descr  cll_descr;
+	/** Protected by cl_lock::cll_guard. */
+	enum cl_lock_state    cll_state;
+	/** signals state changes. */
+	wait_queue_head_t	   cll_wq;
+	/**
+	 * Recursive lock, most fields in cl_lock{} are protected by this.
+	 *
+	 * Locking rules: this mutex is never held across network
+	 * communication, except when lock is being canceled.
+	 *
+	 * Lock ordering: a mutex of a sub-lock is taken first, then a mutex
+	 * on a top-lock. Other direction is implemented through a
+	 * try-lock-repeat loop. Mutexes of unrelated locks can be taken only
+	 * by try-locking.
+	 *
+	 * \see osc_lock_enqueue_wait(), lov_lock_cancel(), lov_sublock_wait().
+	 */
+	struct mutex		cll_guard;
+	task_t	   *cll_guarder;	/* presumably owner of cll_guard; confirm */
+	int		   cll_depth;	/* presumably cll_guard nesting; confirm */
+
+	/**
+	 * the owner for INTRANSIT state
+	 */
+	task_t	   *cll_intransit_owner;
+	int		   cll_error;	/* set when a state transition fails */
+	/**
+	 * Number of holds on a lock. A hold prevents a lock from being
+	 * canceled and destroyed. Protected by cl_lock::cll_guard.
+	 *
+	 * \see cl_lock_hold(), cl_lock_unhold(), cl_lock_release()
+	 */
+	int		   cll_holds;
+	 /**
+	  * Number of lock users. Valid in cl_lock_state::CLS_HELD state
+	  * only. Lock user pins lock in CLS_HELD state. Protected by
+	  * cl_lock::cll_guard.
+	  *
+	  * \see cl_wait(), cl_unuse().
+	  */
+	int		   cll_users;
+	/**
+	 * Flag bit-mask. Values from enum cl_lock_flags. Updates are
+	 * protected by cl_lock::cll_guard.
+	 */
+	unsigned long	 cll_flags;
+	/**
+	 * A linkage into a list of locks in a closure.
+	 *
+	 * \see cl_lock_closure
+	 */
+	struct list_head	    cll_inclosure;
+	/**
+	 * Conflicting lock at queuing time.
+	 */
+	struct cl_lock       *cll_conflict;
+	/**
+	 * A list of references to this lock, for debugging.
+	 */
+	struct lu_ref	 cll_reference;
+	/**
+	 * A list of holds on this lock, for debugging.
+	 */
+	struct lu_ref	 cll_holders;
+	/**
+	 * A reference for cl_lock::cll_descr::cld_obj. For debugging.
+	 */
+	struct lu_ref_link   *cll_obj_ref;
+#ifdef CONFIG_LOCKDEP
+	/* "dep_map" name is assumed by lockdep.h macros. */
+	struct lockdep_map    dep_map;
+#endif
+};
+
+/**
+ * Per-layer part of cl_lock, linked into cl_lock::cll_layers.
+ *
+ * \see ccc_lock, lov_lock, lovsub_lock, osc_lock
+ */
+struct cl_lock_slice {
+	struct cl_lock		  *cls_lock; /* presumably owning lock; confirm */
+	/** Object slice corresponding to this lock slice. Immutable after
+	 * creation. */
+	struct cl_object		*cls_obj;
+	const struct cl_lock_operations *cls_ops;
+	/** Linkage into cl_lock::cll_layers. Immutable after creation. */
+	struct list_head		       cls_linkage;
+};
+
+/**
+ * Possible (non-error) return values of ->clo_{enqueue,wait,unuse}().
+ *
+ * NOTE: lov_subresult() depends on ordering here.
+ */
+enum cl_lock_transition {
+	/** operation cannot be completed immediately. Wait for state change. */
+	CLO_WAIT	= 1,
+	/** operation had to release lock mutex, restart. */
+	CLO_REPEAT      = 2,
+	/** lower layer re-enqueued. */
+	CLO_REENQUEUED  = 3,
+};
+
+/**
+ * Per-layer lock operations.
+ * \see vvp_lock_ops, lov_lock_ops, lovsub_lock_ops, osc_lock_ops
+ */
+struct cl_lock_operations {
+	/**
+	 * \name statemachine
+	 *
+	 * State machine transitions. These methods are called to transfer
+	 * lock from one state to another, as described in the commentary
+	 * above enum #cl_lock_state.
+	 *
+	 * \retval 0	  this layer has nothing more to do before
+	 *		       transition to the target state happens;
+	 *
+	 * \retval CLO_REPEAT method had to release and re-acquire cl_lock
+	 *		    mutex, repeat invocation of transition method
+	 *		    across all layers;
+	 *
+	 * \retval CLO_WAIT   this layer cannot move to the target state
+	 *		    immediately, as it has to wait for certain event
+	 *		    (e.g., the communication with the server). It
+	 *		    is guaranteed, that when the state transfer
+	 *		    becomes possible, cl_lock::cll_wq wait-queue
+	 *		    is signaled. Caller can wait for this event by
+	 *		    calling cl_lock_state_wait();
+	 *
+	 * \retval -ve	failure, abort state transition, move the lock
+	 *		    into cl_lock_state::CLS_FREEING state, and set
+	 *		    cl_lock::cll_error.
+	 *
+	 * Once all layers voted to agree to transition (by returning 0), lock
+	 * is moved into corresponding target state. All state transition
+	 * methods are optional.
+	 */
+	/** @{ */
+	/**
+	 * Attempts to enqueue the lock. Called top-to-bottom.
+	 *
+	 * \see ccc_lock_enqueue(), lov_lock_enqueue(), lovsub_lock_enqueue(),
+	 * \see osc_lock_enqueue()
+	 */
+	int  (*clo_enqueue)(const struct lu_env *env,
+			    const struct cl_lock_slice *slice,
+			    struct cl_io *io, __u32 enqflags);
+	/**
+	 * Attempts to wait for enqueue result. Called top-to-bottom.
+	 *
+	 * \see ccc_lock_wait(), lov_lock_wait(), osc_lock_wait()
+	 */
+	int  (*clo_wait)(const struct lu_env *env,
+			 const struct cl_lock_slice *slice);
+	/**
+	 * Attempts to unlock the lock. Called bottom-to-top. In addition to
+	 * usual return values of lock state-machine methods, this can return
+	 * -ESTALE to indicate that lock cannot be returned to the cache, and
+	 * has to be re-initialized.
+	 * unuse is a one-shot operation, so it must NOT return CLO_WAIT.
+	 *
+	 * \see ccc_lock_unuse(), lov_lock_unuse(), osc_lock_unuse()
+	 */
+	int  (*clo_unuse)(const struct lu_env *env,
+			  const struct cl_lock_slice *slice);
+	/**
+	 * Notifies layer that a cached lock is starting to be used.
+	 *
+	 * \pre lock->cll_state == CLS_CACHED
+	 *
+	 * \see lov_lock_use(), osc_lock_use()
+	 */
+	int  (*clo_use)(const struct lu_env *env,
+			const struct cl_lock_slice *slice);
+	/** @} statemachine */
+	/**
+	 * A method invoked when lock state is changed (as a result of state
+	 * transition). This is used, for example, to track when the state of
+	 * a sub-lock changes, to propagate this change to the corresponding
+	 * top-lock. Optional.
+	 *
+	 * \see lovsub_lock_state()
+	 */
+	void (*clo_state)(const struct lu_env *env,
+			  const struct cl_lock_slice *slice,
+			  enum cl_lock_state st);
+	/**
+	 * Returns true, iff given lock is suitable for the given io, idea
+	 * being, that there are certain "unsafe" locks, e.g., ones acquired
+	 * for O_APPEND writes, that we don't want to re-use for a normal
+	 * write, to avoid the danger of cascading evictions. Optional. Runs
+	 * under cl_object_header::coh_lock_guard.
+	 *
+	 * XXX this should take more information about lock needed by
+	 * io. Probably lock description or something similar.
+	 *
+	 * \see lov_lock_fits_into()
+	 */
+	int (*clo_fits_into)(const struct lu_env *env,
+			     const struct cl_lock_slice *slice,
+			     const struct cl_lock_descr *need,
+			     const struct cl_io *io);
+	/**
+	 * \name ast
+	 * Asynchronous System Traps. All of them are optional, all are
+	 * executed bottom-to-top.
+	 */
+	/** @{ */
+
+	/**
+	 * Cancellation callback. Cancel a lock voluntarily, or under
+	 * the request of server.
+	 */
+	void (*clo_cancel)(const struct lu_env *env,
+			   const struct cl_lock_slice *slice);
+	/**
+	 * Lock weighting ast. Executed to estimate how precious this lock
+	 * is. The sum of results across all layers is used to determine
+	 * whether lock is worth keeping in cache given present memory usage.
+	 *
+	 * \see osc_lock_weigh(), vvp_lock_weigh(), lovsub_lock_weigh().
+	 */
+	unsigned long (*clo_weigh)(const struct lu_env *env,
+				   const struct cl_lock_slice *slice);
+	/** @} ast */
+
+	/**
+	 * \see lovsub_lock_closure()
+	 */
+	int (*clo_closure)(const struct lu_env *env,
+			   const struct cl_lock_slice *slice,
+			   struct cl_lock_closure *closure);
+	/**
+	 * Executed bottom-to-top when lock description changes (e.g., as a
+	 * result of server granting more generous lock than was requested).
+	 *
+	 * \see lovsub_lock_modify()
+	 */
+	int (*clo_modify)(const struct lu_env *env,
+			  const struct cl_lock_slice *slice,
+			  const struct cl_lock_descr *updated);
+	/**
+	 * Notifies layers (bottom-to-top) that lock is going to be
+	 * destroyed. Responsibility of layers is to prevent new references on
+	 * this lock from being acquired once this method returns.
+	 *
+	 * This can be called multiple times due to the races.
+	 *
+	 * \see cl_lock_delete()
+	 * \see osc_lock_delete(), lovsub_lock_delete()
+	 */
+	void (*clo_delete)(const struct lu_env *env,
+			   const struct cl_lock_slice *slice);
+	/**
+	 * Destructor. Frees resources and the slice.
+	 *
+	 * \see ccc_lock_fini(), lov_lock_fini(), lovsub_lock_fini(),
+	 * \see osc_lock_fini()
+	 */
+	void (*clo_fini)(const struct lu_env *env, struct cl_lock_slice *slice);
+	/**
+	 * Optional debugging helper. Prints given lock slice.
+	 */
+	int (*clo_print)(const struct lu_env *env,
+			 void *cookie, lu_printer_t p,
+			 const struct cl_lock_slice *slice);
+};
+
+#define CL_LOCK_DEBUG(mask, env, lock, format, ...)		     \
+do {								    \
+	LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);		\
+									\
+	if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {		   \
+		cl_lock_print(env, &msgdata, lu_cdebug_printer, lock);  \
+		CDEBUG(mask, format , ## __VA_ARGS__);		  \
+	}							       \
+} while (0) /* prints lock, then the message, if mask is shown */
+
+#define CL_LOCK_ASSERT(expr, env, lock) do {			    \
+	if (likely(expr))					       \
+		break;						  \
+									\
+	CL_LOCK_DEBUG(D_ERROR, env, lock, "failed at %s.\n", #expr);    \
+	LBUG();							 \
+} while (0) /* when expr is false: dumps lock at D_ERROR, then LBUG() */
+
+/** @} cl_lock */
+
+/** \addtogroup cl_page_list cl_page_list
+ * Page list used to perform collective operations on a group of pages.
+ *
+ * Pages are added to the list one by one. cl_page_list acquires a reference
+ * for every page in it. Page list is used to perform collective operations on
+ * pages:
+ *
+ *     - submit pages for an immediate transfer,
+ *
+ *     - own pages on behalf of certain io (waiting for each page in turn),
+ *
+ *     - discard pages.
+ *
+ * When list is finalized, it releases references on all pages it still has.
+ *
+ * \todo XXX concurrency control.
+ *
+ * @{
+ */
+struct cl_page_list {
+	unsigned	     pl_nr;	/* presumably length of pl_pages; confirm */
+	struct list_head	   pl_pages;	/* pages held by this list */
+	task_t	  *pl_owner;	/* NOTE(review): owning task? confirm */
+};
+
+/**
+ * A 2-queue of pages. A convenience data-type for a common use case: a
+ * 2-queue contains an incoming page list and an outgoing page list.
+ */
+struct cl_2queue {
+	struct cl_page_list c2_qin;	/* incoming page list */
+	struct cl_page_list c2_qout;	/* outgoing page list */
+};
+
+/** @} cl_page_list */
+
+/** \addtogroup cl_io cl_io
+ * @{ */
+/** \struct cl_io
+ * I/O
+ *
+ * cl_io represents a high level I/O activity like
+ * read(2)/write(2)/truncate(2) system call, or cancellation of an extent
+ * lock.
+ *
+ * cl_io is a layered object, much like cl_{object,page,lock} but with one
+ * important distinction. We want to minimize number of calls to the allocator
+ * in the fast path, e.g., in the case of read(2) when everything is cached:
+ * client already owns the lock over region being read, and data are cached
+ * due to read-ahead. To avoid allocation of cl_io layers in such situations,
+ * per-layer io state is stored in the session, associated with the io, see
+ * struct {vvp,lov,osc}_io for example. Sessions allocation is amortized
+ * by using free-lists, see cl_env_get().
+ *
+ * There is a small predefined number of possible io types, enumerated in enum
+ * cl_io_type.
+ *
+ * cl_io is a state machine that can be advanced concurrently by multiple
+ * threads. It is up to these threads to control the concurrency and,
+ * specifically, to detect when io is done, and its state can be safely
+ * released.
+ *
+ * For read/write io overall execution plan is as follows:
+ *
+ *     (0) initialize io state through all layers;
+ *
+ *     (1) loop: prepare chunk of work to do
+ *
+ *     (2) call all layers to collect locks they need to process current chunk
+ *
+ *     (3) sort all locks to avoid dead-locks, and acquire them
+ *
+ *     (4) process the chunk: call per-page methods
+ *	 (cl_io_operations::cio_read_page() for read,
+ *	 cl_io_operations::cio_prepare_write(),
+ *	 cl_io_operations::cio_commit_write() for write)
+ *
+ *     (5) release locks
+ *
+ *     (6) repeat loop.
+ *
+ * To implement the "parallel IO mode", lov layer creates sub-io's (lazily to
+ * address allocation efficiency issues mentioned above), and returns with the
+ * special error condition from per-page method when current sub-io has to
+ * block. This causes io loop to be repeated, and lov switches to the next
+ * sub-io in its cl_io_operations::cio_iter_init() implementation.
+ */
+
+/** IO types */
+enum cl_io_type {
+	/** read system call */
+	CIT_READ,
+	/** write system call */
+	CIT_WRITE,
+	/** truncate, utime system calls */
+	CIT_SETATTR,
+	/**
+	 * page fault handling
+	 */
+	CIT_FAULT,
+	/**
+	 * fsync system call handling
+	 * To write out a range of file
+	 */
+	CIT_FSYNC,
+	/**
+	 * Miscellaneous io. This is used for occasional io activity that
+	 * doesn't fit into other types. Currently this is used for:
+	 *
+	 *     - cancellation of an extent lock. This io exists as a context
+	 *     to write dirty pages from under the lock being canceled back
+	 *     to the server;
+	 *
+	 *     - VM induced page write-out. An io context for writing page out
+	 *     for memory cleansing;
+	 *
+	 *     - glimpse. An io context to acquire glimpse lock.
+	 *
+	 *     - grouplock. An io context to acquire group lock.
+	 *
+	 * CIT_MISC io is used simply as a context in which locks and pages
+	 * are manipulated. Such io has no internal "process", that is,
+	 * cl_io_loop() is never called for it.
+	 */
+	CIT_MISC,
+	CIT_OP_NR	/* number of io types; sizes cl_io_operations::op[] */
+};
+
+/**
+ * States of cl_io state machine.
+ */
+enum cl_io_state {
+	/** Not initialized (first enumerator, so its value is 0). */
+	CIS_ZERO,
+	/** Initialized. */
+	CIS_INIT,
+	/** IO iteration started. */
+	CIS_IT_STARTED,
+	/** Locks taken. */
+	CIS_LOCKED,
+	/** Actual IO is in progress. */
+	CIS_IO_GOING,
+	/** IO for the current iteration finished. */
+	CIS_IO_FINISHED,
+	/** Locks released. */
+	CIS_UNLOCKED,
+	/** Iteration completed. */
+	CIS_IT_ENDED,
+	/** cl_io finalized. */
+	CIS_FINI
+};
+
+/**
+ * IO state private for a layer.
+ *
+ * This is usually embedded into layer session data, rather than allocated
+ * dynamically.
+ *
+ * \see vvp_io, lov_io, osc_io, ccc_io
+ */
+struct cl_io_slice {
+	struct cl_io		  *cis_io; /* presumably owning io; confirm */
+	/** corresponding object slice. Immutable after creation. */
+	struct cl_object	      *cis_obj;
+	/** io operations. Immutable after creation. */
+	const struct cl_io_operations *cis_iop;
+	/**
+	 * linkage into a list of all slices for a given cl_io, hanging off
+	 * cl_io::ci_layers. Immutable after creation.
+	 */
+	struct list_head		     cis_linkage;
+};
+
+
+/**
+ * Per-layer io operations.
+ * \see vvp_io_ops, lov_io_ops, lovsub_io_ops, osc_io_ops
+ */
+struct cl_io_operations {
+	/**
+	 * Vector of io state transition methods for every io type.
+	 *
+	 * \see cl_page_operations::io
+	 */
+	struct {
+		/**
+		 * Prepare io iteration at a given layer.
+		 *
+		 * Called top-to-bottom at the beginning of each iteration of
+		 * "io loop" (if it makes sense for this type of io). Here
+		 * layer selects what work it will do during this iteration.
+		 *
+		 * \see cl_io_operations::cio_iter_fini()
+		 */
+		int (*cio_iter_init) (const struct lu_env *env,
+				      const struct cl_io_slice *slice);
+		/**
+		 * Finalize io iteration.
+		 *
+		 * Called bottom-to-top at the end of each iteration of "io
+		 * loop". Here layers can decide whether IO has to be
+		 * continued.
+		 *
+		 * \see cl_io_operations::cio_iter_init()
+		 */
+		void (*cio_iter_fini) (const struct lu_env *env,
+				       const struct cl_io_slice *slice);
+		/**
+		 * Collect locks for the current iteration of io.
+		 *
+		 * Called top-to-bottom to collect all locks necessary for
+		 * this iteration. This method shouldn't actually enqueue
+		 * anything, instead it should post a lock through
+		 * cl_io_lock_add(). Once all locks are collected, they are
+		 * sorted and enqueued in the proper order.
+		 */
+		int  (*cio_lock) (const struct lu_env *env,
+				  const struct cl_io_slice *slice);
+		/**
+		 * Finalize unlocking.
+		 *
+		 * Called bottom-to-top to finish layer specific unlocking
+		 * functionality, after generic code released all locks
+		 * acquired by cl_io_operations::cio_lock().
+		 */
+		void  (*cio_unlock)(const struct lu_env *env,
+				    const struct cl_io_slice *slice);
+		/**
+		 * Start io iteration.
+		 *
+		 * Once all locks are acquired, called top-to-bottom to
+		 * commence actual IO. In the current implementation,
+		 * top-level vvp_io_{read,write}_start() does all the work
+		 * synchronously by calling generic_file_*(), so other layers
+		 * are called when everything is done.
+		 */
+		int  (*cio_start)(const struct lu_env *env,
+				  const struct cl_io_slice *slice);
+		/**
+		 * Called top-to-bottom at the end of io loop. Here layer
+		 * might wait for an unfinished asynchronous io.
+		 */
+		void (*cio_end)  (const struct lu_env *env,
+				  const struct cl_io_slice *slice);
+		/**
+		 * Called bottom-to-top to notify layers that read/write IO
+		 * iteration finished, with \a nob bytes transferred.
+		 */
+		void (*cio_advance)(const struct lu_env *env,
+				    const struct cl_io_slice *slice,
+				    size_t nob);
+		/**
+		 * Called once per io, bottom-to-top to release io resources.
+		 */
+		void (*cio_fini) (const struct lu_env *env,
+				  const struct cl_io_slice *slice);
+	} op[CIT_OP_NR];
+	struct {
+		/**
+		 * Submit pages from \a queue->c2_qin for IO, and move
+		 * successfully submitted pages into \a queue->c2_qout. Return
+		 * non-zero if failed to submit even a single page. If
+		 * submission failed after some pages were moved into \a
+		 * queue->c2_qout, completion callback with non-zero ioret is
+		 * executed on them.
+		 */
+		int  (*cio_submit)(const struct lu_env *env,
+				   const struct cl_io_slice *slice,
+				   enum cl_req_type crt,
+				   struct cl_2queue *queue);
+	} req_op[CRT_NR];
+	/**
+	 * Read missing page.
+	 *
+	 * Called by a top-level cl_io_operations::op[CIT_READ]::cio_start()
+	 * method, when it hits not-up-to-date page in the range. Optional.
+	 *
+	 * \pre io->ci_type == CIT_READ
+	 */
+	int (*cio_read_page)(const struct lu_env *env,
+			     const struct cl_io_slice *slice,
+			     const struct cl_page_slice *page);
+	/**
+	 * Prepare write of a \a page. Called bottom-to-top by a top-level
+	 * cl_io_operations::op[CIT_WRITE]::cio_start() to prepare page for
+	 * receiving data from user-level buffer.
+	 *
+	 * \pre io->ci_type == CIT_WRITE
+	 *
+	 * \see vvp_io_prepare_write(), lov_io_prepare_write(),
+	 * osc_io_prepare_write().
+	 */
+	int (*cio_prepare_write)(const struct lu_env *env,
+				 const struct cl_io_slice *slice,
+				 const struct cl_page_slice *page,
+				 unsigned from, unsigned to);
+	/**
+	 * Per-page write method, companion of cio_prepare_write().
+	 * \pre io->ci_type == CIT_WRITE
+	 *
+	 * \see vvp_io_commit_write(), lov_io_commit_write(),
+	 * osc_io_commit_write().
+	 */
+	int (*cio_commit_write)(const struct lu_env *env,
+				const struct cl_io_slice *slice,
+				const struct cl_page_slice *page,
+				unsigned from, unsigned to);
+	/**
+	 * Optional debugging helper. Print given io slice.
+	 */
+	int (*cio_print)(const struct lu_env *env, void *cookie,
+			 lu_printer_t p, const struct cl_io_slice *slice);
+};
+
+/**
+ * Flags to lock enqueue procedure.
+ * \ingroup cl_lock
+ */
+enum cl_enq_flags {
+	/**
+	 * instruct server to not block, if conflicting lock is found. Instead
+	 * -EWOULDBLOCK is returned immediately.
+	 */
+	CEF_NONBLOCK     = 0x00000001,
+	/**
+	 * take lock asynchronously (out of order), as it cannot
+	 * deadlock. This is for LDLM_FL_HAS_INTENT locks used for glimpsing.
+	 */
+	CEF_ASYNC	= 0x00000002,
+	/**
+	 * tell the server to instruct (through a flag in the blocking ast) an
+	 * owner of the conflicting lock, that it can drop dirty pages
+	 * protected by this lock, without sending them to the server.
+	 */
+	CEF_DISCARD_DATA = 0x00000004,
+	/**
+	 * tell the sub layers that it must be a `real' lock. This is used for
+	 * mmapped-buffer locks and glimpse locks that must never be converted
+	 * into lockless mode.
+	 *
+	 * \see vvp_mmap_locks(), cl_glimpse_lock().
+	 */
+	CEF_MUST	 = 0x00000008,
+	/**
+	 * tell the sub layers to never request a `real' lock. This flag is
+	 * not used currently.
+	 *
+	 * cl_io::ci_lockreq and CEF_{MUST,NEVER} flags specify lockless
+	 * conversion policy: ci_lockreq describes generic information of lock
+	 * requirement for this IO, especially for locks which belong to the
+	 * object doing IO; however, lock itself may have precise requirements
+	 * that are described by the enqueue flags.
+	 */
+	CEF_NEVER	= 0x00000010,
+	/**
+	 * for async glimpse lock.
+	 */
+	CEF_AGL	  = 0x00000020,
+	/**
+	 * mask covering all of the enq_flags bits defined above.
+	 */
+	CEF_MASK	 = 0x0000003f,
+};
+
+/**
+ * Link between lock and io. Intermediate structure is needed, because the
+ * same lock can be part of multiple io's simultaneously.
+ */
+struct cl_io_lock_link {
+	/** Linkage into one of the cl_lockset lists. */
+	struct list_head cill_linkage;
+	struct cl_lock_descr cill_descr;
+	struct cl_lock *cill_lock;
+	/** Optional destructor. */
+	void (*cill_fini)(const struct lu_env *env,
+			  struct cl_io_lock_link *link);
+};
+
+/**
+ * Lock-set represents a collection of locks, that io needs at a
+ * time. Generally speaking, client tries to avoid holding multiple locks when
+ * possible, because
+ *
+ *      - holding extent locks over multiple ost's introduces the danger of
+ *	"cascading timeouts";
+ *
+ *      - holding multiple locks over the same ost is still dead-lock prone,
+ *	see comment in osc_lock_enqueue(),
+ *
+ * but there are certain situations where this is unavoidable:
+ *
+ *      - O_APPEND writes have to take [0, EOF] lock for correctness;
+ *
+ *      - truncate has to take [new-size, EOF] lock for correctness;
+ *
+ *      - SNS has to take locks across full stripe for correctness;
+ *
+ *      - in the case when user level buffer, supplied to {read,write}(file0),
+ *	is a part of a memory mapped lustre file, client has to take a dlm
+ *	locks on file0, and all files that back up the buffer (or a part of
+ *	the buffer, that is being processed in the current chunk, in any
+ *	case, there are situations where at least 2 locks are necessary).
+ *
+ * In such cases we at least try to take locks in the same consistent
+ * order. To this end, all locks are first collected, then sorted, and then
+ * enqueued.
+ */
+struct cl_lockset {
+	/** Locks that are still to be acquired. */
+	struct list_head cls_todo;
+	/** Locks that are currently being processed. */
+	struct list_head cls_curr;
+	/** Locks that have been acquired. */
+	struct list_head cls_done;
+};
+
+/**
+ * Lock requirements (demand) for IO. It would be named cl_io_lock_req,
+ * but 'req' is always read as 'request' :-)
+ */
+enum cl_io_lock_dmd {
+	/** Always lock data (e.g., O_APPEND). */
+	CILR_MANDATORY = 0,
+	/** Layers are free to decide between local and global locking. */
+	CILR_MAYBE,
+	/** Never lock: there is no cache (e.g., liblustre). */
+	CILR_NEVER
+};
+
+enum cl_fsync_mode {
+	/** start writeback, do not wait for it to finish */
+	CL_FSYNC_NONE  = 0,
+	/** start writeback and wait for it to finish */
+	CL_FSYNC_LOCAL = 1,
+	/** discard all dirty pages in a specific file range */
+	CL_FSYNC_DISCARD = 2,
+	/** start writeback and make sure the data has reached storage before
+	 * returning. OST_SYNC RPC must be issued and finished */
+	CL_FSYNC_ALL   = 3
+};
+
+struct cl_io_rw_common {
+	loff_t crw_pos;
+	size_t crw_count;
+	int crw_nonblock;
+};
+
+
+/**
+ * State for io.
+ *
+ * cl_io is shared by all threads participating in this IO (in the current
+ * implementation only one thread advances IO, but parallel IO design and
+ * concurrent copy_*_user() require multiple threads acting on the same
+ * IO). It is up to these threads to serialize their activities, including
+ * updates to mutable cl_io fields.
+ */
+struct cl_io {
+	/** type of this IO. Immutable after creation. */
+	enum cl_io_type		ci_type;
+	/** current state of cl_io state machine. */
+	enum cl_io_state	       ci_state;
+	/** main object this io is against. Immutable after creation. */
+	struct cl_object	      *ci_obj;
+	/**
+	 * Upper layer io, of which this io is a part. Immutable after
+	 * creation.
+	 */
+	struct cl_io		  *ci_parent;
+	/** List of slices. Immutable after creation. */
+	struct list_head		     ci_layers;
+	/** list of locks (to be) acquired by this io. */
+	struct cl_lockset	      ci_lockset;
+	/** lock requirements; this is just a hint for sublayers. */
+	enum cl_io_lock_dmd	    ci_lockreq;
+	union {
+		struct cl_rd_io {
+			struct cl_io_rw_common rd;
+		} ci_rd;
+		struct cl_wr_io {
+			struct cl_io_rw_common wr;
+			int		    wr_append;
+			int		    wr_sync;
+		} ci_wr;
+		struct cl_io_rw_common ci_rw;
+		struct cl_setattr_io {
+			struct ost_lvb   sa_attr;
+			unsigned int     sa_valid;
+			struct obd_capa *sa_capa;
+		} ci_setattr;
+		struct cl_fault_io {
+			/** page index within file. */
+			pgoff_t	 ft_index;
+			/** number of valid bytes on a faulted page. */
+			int	     ft_nob;
+			/** writable page? for nopage() only */
+			int	     ft_writable;
+			/** page of an executable? */
+			int	     ft_executable;
+			/** page_mkwrite() */
+			int	     ft_mkwrite;
+			/** resulting page */
+			struct cl_page *ft_page;
+		} ci_fault;
+		struct cl_fsync_io {
+			loff_t	     fi_start;
+			loff_t	     fi_end;
+			struct obd_capa   *fi_capa;
+			/** file system level fid */
+			struct lu_fid     *fi_fid;
+			enum cl_fsync_mode fi_mode;
+			/* how many pages were written/discarded */
+			unsigned int       fi_nr_written;
+		} ci_fsync;
+	} u;
+	struct cl_2queue     ci_queue;
+	size_t	       ci_nob;
+	int		  ci_result;
+	unsigned int	 ci_continue:1,
+	/**
+	 * This io holds a group lock; tells sublayers not to do
+	 * lockless i/o.
+	 */
+			     ci_no_srvlock:1,
+	/**
+	 * The whole IO needs to be restarted because layout has been changed
+	 */
+			     ci_need_restart:1,
+	/**
+	 * Do not refresh layout - the IO issuer knows that the layout won't
+	 * change (page operations; a layout change causes all pages to be
+	 * discarded), or it doesn't matter if it changes (sync).
+	 */
+			     ci_ignore_layout:1,
+	/**
+	 * Check if layout changed after the IO finishes. Mainly for HSM
+	 * requirement. If IO occurs on open files, it doesn't need to
+	 * verify layout because HSM won't release open files.
+	 * Right now, only two operations need to verify layout: glimpse
+	 * and setattr.
+	 */
+			     ci_verify_layout:1;
+	/**
+	 * Number of pages owned by this IO. For invariant checking.
+	 */
+	unsigned	     ci_owned_nr;
+};
+
+/** @} cl_io */
+
+/** \addtogroup cl_req cl_req
+ * @{ */
+/** \struct cl_req
+ * Transfer.
+ *
+ * There are two possible modes of transfer initiation on the client:
+ *
+ *     - immediate transfer: this is started when a high level io wants a page
+ *       or a collection of pages to be transferred right away. Examples:
+ *       read-ahead, synchronous read in the case of non-page aligned write,
+ *       page write-out as a part of extent lock cancellation, page write-out
+ *       as a part of memory cleansing. Immediate transfer can be both
+ *       cl_req_type::CRT_READ and cl_req_type::CRT_WRITE;
+ *
+ *     - opportunistic transfer (cl_req_type::CRT_WRITE only), that happens
+ *       when io wants to transfer a page to the server some time later, when
+ *       it can be done efficiently. Example: pages dirtied by the write(2)
+ *       path.
+ *
+ * In any case, transfer takes place in the form of a cl_req, which is a
+ * representation for a network RPC.
+ *
+ * Pages queued for an opportunistic transfer are cached until it is decided
+ * that efficient RPC can be composed of them. This decision is made by "a
+ * req-formation engine", currently implemented as a part of osc
+ * layer. Req-formation depends on many factors: the size of the resulting
+ * RPC, whether or not multi-object RPCs are supported by the server,
+ * max-rpc-in-flight limitations, size of the dirty cache, etc.
+ *
+ * For the immediate transfer io submits a cl_page_list, that req-formation
+ * engine slices into cl_req's, possibly adding cached pages to some of
+ * the resulting req's.
+ *
+ * Whenever a page from cl_page_list is added to a newly constructed req, its
+ * cl_page_operations::cpo_prep() layer methods are called. At that moment,
+ * page state is atomically changed from cl_page_state::CPS_OWNED to
+ * cl_page_state::CPS_PAGEOUT or cl_page_state::CPS_PAGEIN, cl_page::cp_owner
+ * is zeroed, and cl_page::cp_req is set to the
+ * req. cl_page_operations::cpo_prep() method at the particular layer might
+ * return -EALREADY to indicate that it does not need to submit this page
+ * at all. This is possible, for example, if page, submitted for read,
+ * became up-to-date in the meantime; and for write, the page doesn't have
+ * the dirty bit set. \see cl_io_submit_rw()
+ *
+ * Whenever a cached page is added to a newly constructed req, its
+ * cl_page_operations::cpo_make_ready() layer methods are called. At that
+ * moment, page state is atomically changed from cl_page_state::CPS_CACHED to
+ * cl_page_state::CPS_PAGEOUT, and cl_page::cp_req is set to
+ * req. cl_page_operations::cpo_make_ready() method at the particular layer
+ * might return -EAGAIN to indicate that this page is not eligible for the
+ * transfer right now.
+ *
+ * FUTURE
+ *
+ * Plan is to divide transfers into "priority bands" (indicated when
+ * submitting cl_page_list, and queuing a page for the opportunistic transfer)
+ * and allow gluing of cached pages to immediate transfers only within single
+ * band. This would make high priority transfers (like lock cancellation or
+ * memory pressure induced write-out) really high priority.
+ *
+ */
+
+/**
+ * Per-transfer attributes.
+ */
+struct cl_req_attr {
+	/** Generic attributes for the server consumption. */
+	struct obdo	*cra_oa;
+	/** Capability. */
+	struct obd_capa	*cra_capa;
+	/** Job ID, stored in a buffer of JOBSTATS_JOBID_SIZE bytes. */
+	char		 cra_jobid[JOBSTATS_JOBID_SIZE];
+};
+
+/**
+ * Transfer request operations definable at every layer.
+ *
+ * Concurrency: transfer formation engine synchronizes calls to all transfer
+ * methods.
+ */
+struct cl_req_operations {
+	/**
+	 * Invoked top-to-bottom by cl_req_prep() when transfer formation is
+	 * complete (all pages are added).
+	 *
+	 * \see osc_req_prep()
+	 */
+	int  (*cro_prep)(const struct lu_env *env,
+			 const struct cl_req_slice *slice);
+	/**
+	 * Called top-to-bottom to fill in \a attr fields. This is called twice
+	 * with different flags, see bug 10150 and osc_build_req().
+	 *
+	 * \param obj an object from cl_req whose attributes are to be set in
+	 *	    \a attr.
+	 *
+	 * \param attr per-transfer attributes (struct obdo is in cra_oa)
+	 *
+	 * \param flags obdo fields to be filled.
+	 */
+	void (*cro_attr_set)(const struct lu_env *env,
+			     const struct cl_req_slice *slice,
+			     const struct cl_object *obj,
+			     struct cl_req_attr *attr, obd_valid flags);
+	/**
+	 * Called top-to-bottom from cl_req_completion() to notify layers that
+	 * transfer completed. Has to free all state allocated by
+	 * cl_device_operations::cdo_req_init().
+	 */
+	void (*cro_completion)(const struct lu_env *env,
+			       const struct cl_req_slice *slice, int ioret);
+};
+
+/**
+ * A per-object state that (potentially multi-object) transfer request keeps.
+ */
+struct cl_req_obj {
+	/** The object itself. */
+	struct cl_object *ro_obj;
+	/** Reference to cl_req_obj::ro_obj, kept for debugging. */
+	struct lu_ref_link *ro_obj_ref;
+	/* Something else? Number of pages for a given object? */
+};
+
+/**
+ * Transfer request.
+ *
+ * Transfer requests are not reference counted, because IO sub-system owns
+ * them exclusively and knows when to free them.
+ *
+ * Life cycle.
+ *
+ * cl_req is created by cl_req_alloc() that calls
+ * cl_device_operations::cdo_req_init() device methods to allocate per-req
+ * state in every layer.
+ *
+ * Then pages are added (cl_req_page_add()), req keeps track of all objects it
+ * contains pages for.
+ *
+ * Once all pages were collected, cl_page_operations::cpo_prep() method is
+ * called top-to-bottom. At that point layers can modify req, let it pass, or
+ * deny it completely. This is to support things like SNS that have transfer
+ * ordering requirements invisible to the individual req-formation engine.
+ *
+ * On transfer completion (or transfer timeout, or failure to initiate the
+ * transfer of an allocated req), cl_req_operations::cro_completion() method
+ * is called, after execution of cl_page_operations::cpo_completion() of all
+ * req's pages.
+ */
+struct cl_req {
+	enum cl_req_type      crq_type;
+	/** A list of pages being transferred */
+	struct list_head	    crq_pages;
+	/** Number of pages in cl_req::crq_pages */
+	unsigned	      crq_nrpages;
+	/** An array of objects whose pages are in ->crq_pages */
+	struct cl_req_obj    *crq_o;
+	/** Number of elements in cl_req::crq_o[] */
+	unsigned	      crq_nrobjs;
+	struct list_head	    crq_layers;
+};
+
+/**
+ * Per-layer state for request.
+ */
+struct cl_req_slice {
+	struct cl_req *crs_req;
+	struct cl_device *crs_dev;
+	struct list_head crs_linkage;
+	const struct cl_req_operations *crs_ops;
+};
+
+/** @} cl_req */
+
+enum cache_stats_item {
+	/** how many cache lookups were performed */
+	CS_lookup = 0,
+	/** how many times a cache lookup resulted in a hit */
+	CS_hit,
+	/** how many entities are in the cache right now */
+	CS_total,
+	/** how many entities in the cache are actively used (and cannot be
+	 * evicted) right now */
+	CS_busy,
+	/** how many entities were created in total */
+	CS_create,
+	CS_NR
+};
+
+#define CS_NAMES { "lookup", "hit", "total", "busy", "create" }
+
+/**
+ * Stats for a generic cache (similar to inode, lu_object, etc. caches).
+ */
+struct cache_stats {
+	const char *cs_name;
+	atomic_t cs_stats[CS_NR];
+};
+
+/** These are not exported so far */
+void cache_stats_init (struct cache_stats *cs, const char *name);
+int  cache_stats_print(const struct cache_stats *cs,
+		       char *page, int count, int header);
+
+/**
+ * Client-side site. This represents a particular client stack. "Global"
+ * variables should (directly or indirectly) be added here to allow multiple
+ * clients to co-exist in a single address space.
+ */
+struct cl_site {
+	struct lu_site	cs_lu;
+	/**
+	 * Statistical counters. Atomics do not scale, something better like
+	 * per-cpu counters is needed.
+	 *
+	 * These are exported as /proc/fs/lustre/llite/.../site
+	 *
+	 * When interpreting, keep in mind that both sub-locks (and sub-pages)
+	 * and top-locks (and top-pages) are accounted here.
+	 */
+	struct cache_stats    cs_pages;
+	struct cache_stats    cs_locks;
+	atomic_t	  cs_pages_state[CPS_NR];
+	atomic_t	  cs_locks_state[CLS_NR];
+};
+
+int  cl_site_init (struct cl_site *s, struct cl_device *top);
+void cl_site_fini (struct cl_site *s);
+void cl_stack_fini(const struct lu_env *env, struct cl_device *cl);
+
+/**
+ * Output client site statistical counters into a buffer. Suitable for
+ * ll_rd_*()-style functions.
+ */
+int cl_site_stats_print(const struct cl_site *s, char *page, int count);
+
+/**
+ * \name helpers
+ *
+ * Type conversion and accessor functions.
+ */
+/** @{ */
+
+static inline struct cl_site *lu2cl_site(const struct lu_site *site)
+{
+	return container_of(site, struct cl_site, cs_lu);
+}
+
+static inline int lu_device_is_cl(const struct lu_device *d)
+{
+	return d->ld_type->ldt_tags & LU_DEVICE_CL;
+}
+
+static inline struct cl_device *lu2cl_dev(const struct lu_device *d)
+{
+	LASSERT(d == NULL || IS_ERR(d) || lu_device_is_cl(d));
+	return container_of0(d, struct cl_device, cd_lu_dev);
+}
+
+static inline struct lu_device *cl2lu_dev(struct cl_device *d)
+{
+	return &d->cd_lu_dev;
+}
+
+static inline struct cl_object *lu2cl(const struct lu_object *o)
+{
+	LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->lo_dev));
+	return container_of0(o, struct cl_object, co_lu);
+}
+
+static inline const struct cl_object_conf *
+lu2cl_conf(const struct lu_object_conf *conf)
+{
+	return container_of0(conf, struct cl_object_conf, coc_lu);
+}
+
+static inline struct cl_object *cl_object_next(const struct cl_object *obj)
+{
+	return obj == NULL ? NULL : lu2cl(lu_object_next(&obj->co_lu));
+}
+
+static inline struct cl_device *cl_object_device(const struct cl_object *o)
+{
+	LASSERT(o != NULL && !IS_ERR(o) && lu_device_is_cl(o->co_lu.lo_dev));
+	return container_of0(o->co_lu.lo_dev, struct cl_device, cd_lu_dev);
+}
+
+static inline struct cl_object_header *luh2coh(const struct lu_object_header *h)
+{
+	return container_of0(h, struct cl_object_header, coh_lu);
+}
+
+static inline struct cl_site *cl_object_site(const struct cl_object *obj)
+{
+	return lu2cl_site(obj->co_lu.lo_dev->ld_site);
+}
+
+static inline
+struct cl_object_header *cl_object_header(const struct cl_object *obj)
+{
+	return luh2coh(obj->co_lu.lo_header);
+}
+
+static inline int cl_device_init(struct cl_device *d, struct lu_device_type *t)
+{
+	return lu_device_init(&d->cd_lu_dev, t);
+}
+
+static inline void cl_device_fini(struct cl_device *d)
+{
+	lu_device_fini(&d->cd_lu_dev);
+}
+
+void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice,
+		       struct cl_object *obj,
+		       const struct cl_page_operations *ops);
+void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice,
+		       struct cl_object *obj,
+		       const struct cl_lock_operations *ops);
+void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice,
+		     struct cl_object *obj, const struct cl_io_operations *ops);
+void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice,
+		      struct cl_device *dev,
+		      const struct cl_req_operations *ops);
+/** @} helpers */
+
+/** \defgroup cl_object cl_object
+ * @{ */
+struct cl_object *cl_object_top (struct cl_object *o);
+struct cl_object *cl_object_find(const struct lu_env *env, struct cl_device *cd,
+				 const struct lu_fid *fid,
+				 const struct cl_object_conf *c);
+
+int  cl_object_header_init(struct cl_object_header *h);
+void cl_object_header_fini(struct cl_object_header *h);
+void cl_object_put	(const struct lu_env *env, struct cl_object *o);
+void cl_object_get	(struct cl_object *o);
+void cl_object_attr_lock  (struct cl_object *o);
+void cl_object_attr_unlock(struct cl_object *o);
+int  cl_object_attr_get   (const struct lu_env *env, struct cl_object *obj,
+			   struct cl_attr *attr);
+int  cl_object_attr_set   (const struct lu_env *env, struct cl_object *obj,
+			   const struct cl_attr *attr, unsigned valid);
+int  cl_object_glimpse    (const struct lu_env *env, struct cl_object *obj,
+			   struct ost_lvb *lvb);
+int  cl_conf_set	  (const struct lu_env *env, struct cl_object *obj,
+			   const struct cl_object_conf *conf);
+void cl_object_prune      (const struct lu_env *env, struct cl_object *obj);
+void cl_object_kill       (const struct lu_env *env, struct cl_object *obj);
+int  cl_object_has_locks  (struct cl_object *obj);
+
+/**
+ * True iff \a o0 and \a o1 share a header, i.e. are slices of one object.
+ */
+static inline int cl_object_same(struct cl_object *o0, struct cl_object *o1)
+{
+	return luh2coh(o0->co_lu.lo_header) == luh2coh(o1->co_lu.lo_header);
+}
+
+static inline void cl_object_page_init(struct cl_object *clob, int size)
+{
+	clob->co_slice_off = cl_object_header(clob)->coh_page_bufsize;
+	cl_object_header(clob)->coh_page_bufsize += ALIGN(size, 8);
+}
+
+static inline void *cl_object_page_slice(struct cl_object *clob,
+					 struct cl_page *page)
+{
+	return &((char *)page)[clob->co_slice_off];
+}
+
+/** @} cl_object */
+
+/** \defgroup cl_page cl_page
+ * @{ */
+enum {
+	CLP_GANG_OKAY = 0,
+	CLP_GANG_RESCHED,
+	CLP_GANG_AGAIN,
+	CLP_GANG_ABORT
+};
+
+/* callback of cl_page_gang_lookup() */
+typedef int   (*cl_page_gang_cb_t)  (const struct lu_env *, struct cl_io *,
+				     struct cl_page *, void *);
+int	     cl_page_gang_lookup (const struct lu_env *env,
+				     struct cl_object *obj,
+				     struct cl_io *io,
+				     pgoff_t start, pgoff_t end,
+				     cl_page_gang_cb_t cb, void *cbdata);
+struct cl_page *cl_page_lookup      (struct cl_object_header *hdr,
+				     pgoff_t index);
+struct cl_page *cl_page_find	(const struct lu_env *env,
+				     struct cl_object *obj,
+				     pgoff_t idx, struct page *vmpage,
+				     enum cl_page_type type);
+struct cl_page *cl_page_find_sub    (const struct lu_env *env,
+				     struct cl_object *obj,
+				     pgoff_t idx, struct page *vmpage,
+				     struct cl_page *parent);
+void	    cl_page_get	 (struct cl_page *page);
+void	    cl_page_put	 (const struct lu_env *env,
+				     struct cl_page *page);
+void	    cl_page_print       (const struct lu_env *env, void *cookie,
+				     lu_printer_t printer,
+				     const struct cl_page *pg);
+void	    cl_page_header_print(const struct lu_env *env, void *cookie,
+				     lu_printer_t printer,
+				     const struct cl_page *pg);
+struct page     *cl_page_vmpage      (const struct lu_env *env,
+				     struct cl_page *page);
+struct cl_page *cl_vmpage_page      (struct page *vmpage, struct cl_object *obj);
+struct cl_page *cl_page_top	 (struct cl_page *page);
+
+const struct cl_page_slice *cl_page_at(const struct cl_page *page,
+				       const struct lu_device_type *dtype);
+
+/**
+ * \name ownership
+ *
+ * Functions dealing with the ownership of page by io.
+ */
+/** @{ */
+
+int  cl_page_own	(const struct lu_env *env,
+			 struct cl_io *io, struct cl_page *page);
+int  cl_page_own_try    (const struct lu_env *env,
+			 struct cl_io *io, struct cl_page *page);
+void cl_page_assume     (const struct lu_env *env,
+			 struct cl_io *io, struct cl_page *page);
+void cl_page_unassume   (const struct lu_env *env,
+			 struct cl_io *io, struct cl_page *pg);
+void cl_page_disown     (const struct lu_env *env,
+			 struct cl_io *io, struct cl_page *page);
+int  cl_page_is_owned   (const struct cl_page *pg, const struct cl_io *io);
+
+/** @} ownership */
+
+/**
+ * \name transfer
+ *
+ * Functions dealing with the preparation of a page for a transfer, and
+ * tracking transfer state.
+ */
+/** @{ */
+int  cl_page_prep       (const struct lu_env *env, struct cl_io *io,
+			 struct cl_page *pg, enum cl_req_type crt);
+void cl_page_completion (const struct lu_env *env,
+			 struct cl_page *pg, enum cl_req_type crt, int ioret);
+int  cl_page_make_ready (const struct lu_env *env, struct cl_page *pg,
+			 enum cl_req_type crt);
+int  cl_page_cache_add  (const struct lu_env *env, struct cl_io *io,
+			 struct cl_page *pg, enum cl_req_type crt);
+void cl_page_clip       (const struct lu_env *env, struct cl_page *pg,
+			 int from, int to);
+int  cl_page_cancel     (const struct lu_env *env, struct cl_page *page);
+int  cl_page_flush      (const struct lu_env *env, struct cl_io *io,
+			 struct cl_page *pg);
+
+/** @} transfer */
+
+
+/**
+ * \name helper routines
+ * Functions to discard, delete and export a cl_page.
+ */
+/** @{ */
+void    cl_page_discard      (const struct lu_env *env, struct cl_io *io,
+			      struct cl_page *pg);
+void    cl_page_delete       (const struct lu_env *env, struct cl_page *pg);
+int     cl_page_unmap	(const struct lu_env *env, struct cl_io *io,
+			      struct cl_page *pg);
+int     cl_page_is_vmlocked  (const struct lu_env *env,
+			      const struct cl_page *pg);
+void    cl_page_export       (const struct lu_env *env,
+			      struct cl_page *pg, int uptodate);
+int     cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io,
+			      struct cl_page *page);
+loff_t  cl_offset	    (const struct cl_object *obj, pgoff_t idx);
+pgoff_t cl_index	     (const struct cl_object *obj, loff_t offset);
+int     cl_page_size	 (const struct cl_object *obj);
+int     cl_pages_prune       (const struct lu_env *env, struct cl_object *obj);
+
+void cl_lock_print      (const struct lu_env *env, void *cookie,
+			 lu_printer_t printer, const struct cl_lock *lock);
+void cl_lock_descr_print(const struct lu_env *env, void *cookie,
+			 lu_printer_t printer,
+			 const struct cl_lock_descr *descr);
+/** @} helper */
+
+/** @} cl_page */
+
+/** \defgroup cl_lock cl_lock
+ * @{ */
+
+struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io,
+			     const struct cl_lock_descr *need,
+			     const char *scope, const void *source);
+struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io,
+			     const struct cl_lock_descr *need,
+			     const char *scope, const void *source);
+struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io,
+				const struct cl_lock_descr *need,
+				const char *scope, const void *source);
+struct cl_lock *cl_lock_at_pgoff(const struct lu_env *env,
+				 struct cl_object *obj, pgoff_t index,
+				 struct cl_lock *except, int pending,
+				 int canceld);
+static inline struct cl_lock *cl_lock_at_page(const struct lu_env *env,
+					      struct cl_object *obj,
+					      struct cl_page *page,
+					      struct cl_lock *except,
+					      int pending, int canceld)
+{
+	LASSERT(cl_object_header(obj) == cl_object_header(page->cp_obj));
+	return cl_lock_at_pgoff(env, obj, page->cp_index, except,
+				pending, canceld);
+}
+
+const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock,
+				       const struct lu_device_type *dtype);
+
+void  cl_lock_get       (struct cl_lock *lock);
+void  cl_lock_get_trust (struct cl_lock *lock);
+void  cl_lock_put       (const struct lu_env *env, struct cl_lock *lock);
+void  cl_lock_hold_add  (const struct lu_env *env, struct cl_lock *lock,
+			 const char *scope, const void *source);
+void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock,
+			  const char *scope, const void *source);
+void  cl_lock_unhold    (const struct lu_env *env, struct cl_lock *lock,
+			 const char *scope, const void *source);
+void  cl_lock_release   (const struct lu_env *env, struct cl_lock *lock,
+			 const char *scope, const void *source);
+void  cl_lock_user_add  (const struct lu_env *env, struct cl_lock *lock);
+void  cl_lock_user_del  (const struct lu_env *env, struct cl_lock *lock);
+
+enum cl_lock_state cl_lock_intransit(const struct lu_env *env,
+				     struct cl_lock *lock);
+void cl_lock_extransit(const struct lu_env *env, struct cl_lock *lock,
+		       enum cl_lock_state state);
+int cl_lock_is_intransit(struct cl_lock *lock);
+
+int cl_lock_enqueue_wait(const struct lu_env *env, struct cl_lock *lock,
+			 int keep_mutex);
+
+/** \name statemachine statemachine
+ * Interface to lock state machine consists of 3 parts:
+ *
+ *     - "try" functions that attempt to effect a state transition. If state
+ *     transition is not possible right now (e.g., if it has to wait for some
+ *     asynchronous event to occur), these functions return
+ *     cl_lock_transition::CLO_WAIT.
+ *
+ *     - "non-try" functions that implement synchronous blocking interface on
+ *     top of non-blocking "try" functions. These functions repeatedly call
+ *     corresponding "try" versions, and if state transition is not possible
+ *     immediately, wait for lock state change.
+ *
+ *     - methods from cl_lock_operations, called by "try" functions. Lock can
+ *     be advanced to the target state only when all layers voted that they
+ *     are ready for this transition. "Try" functions call methods under lock
+ *     mutex. If a layer had to release a mutex, it re-acquires it and returns
+ *     cl_lock_transition::CLO_REPEAT, causing "try" function to call all
+ *     layers again.
+ *
+ * TRY	      NON-TRY      METHOD			    FINAL STATE
+ *
+ * cl_enqueue_try() cl_enqueue() cl_lock_operations::clo_enqueue() CLS_ENQUEUED
+ *
+ * cl_wait_try()    cl_wait()    cl_lock_operations::clo_wait()    CLS_HELD
+ *
+ * cl_unuse_try()   cl_unuse()   cl_lock_operations::clo_unuse()   CLS_CACHED
+ *
+ * cl_use_try()     NONE	 cl_lock_operations::clo_use()     CLS_HELD
+ *
+ * @{ */
+
+int   cl_enqueue    (const struct lu_env *env, struct cl_lock *lock,
+		     struct cl_io *io, __u32 flags);
+int   cl_wait       (const struct lu_env *env, struct cl_lock *lock);
+void  cl_unuse      (const struct lu_env *env, struct cl_lock *lock);
+int   cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock,
+		     struct cl_io *io, __u32 flags);
+int   cl_unuse_try  (const struct lu_env *env, struct cl_lock *lock);
+int   cl_wait_try   (const struct lu_env *env, struct cl_lock *lock);
+int   cl_use_try    (const struct lu_env *env, struct cl_lock *lock, int atomic);
+
+/** @} statemachine */
+
+void cl_lock_signal      (const struct lu_env *env, struct cl_lock *lock);
+int  cl_lock_state_wait  (const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_state_set   (const struct lu_env *env, struct cl_lock *lock,
+			  enum cl_lock_state state);
+int  cl_queue_match      (const struct list_head *queue,
+			  const struct cl_lock_descr *need);
+
+void cl_lock_mutex_get  (const struct lu_env *env, struct cl_lock *lock);
+int  cl_lock_mutex_try  (const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_mutex_put  (const struct lu_env *env, struct cl_lock *lock);
+int  cl_lock_is_mutexed (struct cl_lock *lock);
+int  cl_lock_nr_mutexed (const struct lu_env *env);
+int  cl_lock_discard_pages(const struct lu_env *env, struct cl_lock *lock);
+int  cl_lock_ext_match  (const struct cl_lock_descr *has,
+			 const struct cl_lock_descr *need);
+int  cl_lock_descr_match(const struct cl_lock_descr *has,
+			 const struct cl_lock_descr *need);
+int  cl_lock_mode_match (enum cl_lock_mode has, enum cl_lock_mode need);
+int  cl_lock_modify     (const struct lu_env *env, struct cl_lock *lock,
+			 const struct cl_lock_descr *desc);
+
+void cl_lock_closure_init (const struct lu_env *env,
+			   struct cl_lock_closure *closure,
+			   struct cl_lock *origin, int wait);
+void cl_lock_closure_fini (struct cl_lock_closure *closure);
+int  cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock,
+			   struct cl_lock_closure *closure);
+void cl_lock_disclosure   (const struct lu_env *env,
+			   struct cl_lock_closure *closure);
+int  cl_lock_enclosure    (const struct lu_env *env, struct cl_lock *lock,
+			   struct cl_lock_closure *closure);
+
+void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_error (const struct lu_env *env, struct cl_lock *lock, int error);
+void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int wait);
+
+unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock);
+
+/** @} cl_lock */
+
+/** \defgroup cl_io cl_io
+ * @{ */
+
+int   cl_io_init	 (const struct lu_env *env, struct cl_io *io,
+			  enum cl_io_type iot, struct cl_object *obj);
+int   cl_io_sub_init     (const struct lu_env *env, struct cl_io *io,
+			  enum cl_io_type iot, struct cl_object *obj);
+int   cl_io_rw_init      (const struct lu_env *env, struct cl_io *io,
+			  enum cl_io_type iot, loff_t pos, size_t count);
+int   cl_io_loop	 (const struct lu_env *env, struct cl_io *io);
+
+void  cl_io_fini	 (const struct lu_env *env, struct cl_io *io);
+int   cl_io_iter_init    (const struct lu_env *env, struct cl_io *io);
+void  cl_io_iter_fini    (const struct lu_env *env, struct cl_io *io);
+int   cl_io_lock	 (const struct lu_env *env, struct cl_io *io);
+void  cl_io_unlock       (const struct lu_env *env, struct cl_io *io);
+int   cl_io_start	(const struct lu_env *env, struct cl_io *io);
+void  cl_io_end	  (const struct lu_env *env, struct cl_io *io);
+int   cl_io_lock_add     (const struct lu_env *env, struct cl_io *io,
+			  struct cl_io_lock_link *link);
+int   cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io,
+			   struct cl_lock_descr *descr);
+int   cl_io_read_page    (const struct lu_env *env, struct cl_io *io,
+			  struct cl_page *page);
+int   cl_io_prepare_write(const struct lu_env *env, struct cl_io *io,
+			  struct cl_page *page, unsigned from, unsigned to);
+int   cl_io_commit_write (const struct lu_env *env, struct cl_io *io,
+			  struct cl_page *page, unsigned from, unsigned to);
+int   cl_io_submit_rw    (const struct lu_env *env, struct cl_io *io,
+			  enum cl_req_type iot, struct cl_2queue *queue);
+int   cl_io_submit_sync  (const struct lu_env *env, struct cl_io *io,
+			  enum cl_req_type iot, struct cl_2queue *queue,
+			  long timeout);
+void  cl_io_rw_advance   (const struct lu_env *env, struct cl_io *io,
+			  size_t nob);
+int   cl_io_cancel       (const struct lu_env *env, struct cl_io *io,
+			  struct cl_page_list *queue);
+int   cl_io_is_going     (const struct lu_env *env);
+
+/** Non-zero iff \a io is a CIT_WRITE io flagged as an O_APPEND write(2). */
+static inline int cl_io_is_append(const struct cl_io *io)
+{
+	if (io->ci_type != CIT_WRITE)
+		return 0;
+	return io->u.ci_wr.wr_append ? 1 : 0;
+}
+
+static inline int cl_io_is_sync_write(const struct cl_io *io)
+{
+	return io->ci_type == CIT_WRITE ? !!io->u.ci_wr.wr_sync : 0;
+}
+
+static inline int cl_io_is_mkwrite(const struct cl_io *io)
+{
+	return io->ci_type == CIT_FAULT ? !!io->u.ci_fault.ft_mkwrite : 0;
+}
+
+/** True, iff \a io is a truncate(2): a CIT_SETATTR io with ATTR_SIZE set. */
+static inline int cl_io_is_trunc(const struct cl_io *io)
+{
+	if (io->ci_type != CIT_SETATTR)
+		return 0;
+
+	return (io->u.ci_setattr.sa_valid & ATTR_SIZE) ? 1 : 0;
+}
+
+struct cl_io *cl_io_top(struct cl_io *io);
+
+void cl_io_print(const struct lu_env *env, void *cookie,
+		 lu_printer_t printer, const struct cl_io *io);
+
+#define CL_IO_SLICE_CLEAN(foo_io, base)				 \
+do {								    \
+	typeof(foo_io) __foo_io = (foo_io);			     \
+									\
+	CLASSERT(offsetof(typeof(*__foo_io), base) == 0);	       \
+	memset(&__foo_io->base + 1, 0,				  \
+	       (sizeof *__foo_io) - sizeof __foo_io->base);	     \
+} while (0)
+
+/** @} cl_io */
+
+/** \defgroup cl_page_list cl_page_list
+ * @{ */
+
+/** Tail page of \a plist; asserts that the list holds at least one page. */
+static inline struct cl_page *cl_page_list_last(struct cl_page_list *plist)
+{
+	struct list_head *tail = plist->pl_pages.prev;
+
+	LASSERT(plist->pl_nr > 0);
+	return list_entry(tail, struct cl_page, cp_batch);
+}
+
+/**
+ * Iterate over pages in a page list.
+ */
+#define cl_page_list_for_each(page, list)			       \
+	list_for_each_entry((page), &(list)->pl_pages, cp_batch)
+
+/**
+ * Iterate over pages in a page list, taking possible removals into account.
+ */
+#define cl_page_list_for_each_safe(page, temp, list)		    \
+	list_for_each_entry_safe((page), (temp), &(list)->pl_pages, cp_batch)
+
+void cl_page_list_init   (struct cl_page_list *plist);
+void cl_page_list_add    (struct cl_page_list *plist, struct cl_page *page);
+void cl_page_list_move   (struct cl_page_list *dst, struct cl_page_list *src,
+			  struct cl_page *page);
+void cl_page_list_splice (struct cl_page_list *list,
+			  struct cl_page_list *head);
+void cl_page_list_del    (const struct lu_env *env,
+			  struct cl_page_list *plist, struct cl_page *page);
+void cl_page_list_disown (const struct lu_env *env,
+			  struct cl_io *io, struct cl_page_list *plist);
+int  cl_page_list_own    (const struct lu_env *env,
+			  struct cl_io *io, struct cl_page_list *plist);
+void cl_page_list_assume (const struct lu_env *env,
+			  struct cl_io *io, struct cl_page_list *plist);
+void cl_page_list_discard(const struct lu_env *env,
+			  struct cl_io *io, struct cl_page_list *plist);
+int  cl_page_list_unmap  (const struct lu_env *env,
+			  struct cl_io *io, struct cl_page_list *plist);
+void cl_page_list_fini   (const struct lu_env *env, struct cl_page_list *plist);
+
+void cl_2queue_init     (struct cl_2queue *queue);
+void cl_2queue_add      (struct cl_2queue *queue, struct cl_page *page);
+void cl_2queue_disown   (const struct lu_env *env,
+			 struct cl_io *io, struct cl_2queue *queue);
+void cl_2queue_assume   (const struct lu_env *env,
+			 struct cl_io *io, struct cl_2queue *queue);
+void cl_2queue_discard  (const struct lu_env *env,
+			 struct cl_io *io, struct cl_2queue *queue);
+void cl_2queue_fini     (const struct lu_env *env, struct cl_2queue *queue);
+void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page);
+
+/** @} cl_page_list */
+
+/** \defgroup cl_req cl_req
+ * @{ */
+struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page,
+			    enum cl_req_type crt, int nr_objects);
+
+void cl_req_page_add  (const struct lu_env *env, struct cl_req *req,
+		       struct cl_page *page);
+void cl_req_page_done (const struct lu_env *env, struct cl_page *page);
+int  cl_req_prep      (const struct lu_env *env, struct cl_req *req);
+void cl_req_attr_set  (const struct lu_env *env, struct cl_req *req,
+		       struct cl_req_attr *attr, obd_valid flags);
+void cl_req_completion(const struct lu_env *env, struct cl_req *req, int ioret);
+
+/** \defgroup cl_sync_io cl_sync_io
+ * @{ */
+
+/**
+ * Anchor for synchronous transfer. This is allocated on a stack by thread
+ * doing synchronous transfer, and a pointer to this structure is set up in
+ * every page submitted for transfer. Transfer completion routine updates
+ * anchor and wakes up waiting thread when transfer is complete.
+ */
+struct cl_sync_io {
+	/** number of pages yet to be transferred. */
+	atomic_t		csi_sync_nr;
+	/** error code. */
+	int			csi_sync_rc;
+	/** barrier guarding destruction of this structure. */
+	atomic_t		csi_barrier;
+	/** wait queue to be signaled when transfer is complete. */
+	wait_queue_head_t		csi_waitq;
+};
+
+void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages);
+int  cl_sync_io_wait(const struct lu_env *env, struct cl_io *io,
+		     struct cl_page_list *queue, struct cl_sync_io *anchor,
+		     long timeout);
+void cl_sync_io_note(struct cl_sync_io *anchor, int ioret);
+
+/** @} cl_sync_io */
+
+/** @} cl_req */
+
+/** \defgroup cl_env cl_env
+ *
+ * lu_env handling for a client.
+ *
+ * lu_env is an environment within which lustre code executes. Its major part
+ * is lu_context---a fast memory allocation mechanism that is used to conserve
+ * precious kernel stack space. Originally lu_env was designed for a server,
+ * where
+ *
+ *     - there is a (mostly) fixed number of threads, and
+ *
+ *     - call chains have no non-lustre portions inserted between lustre code.
+ *
+ * On a client both these assumptions fail, because every user thread can
+ * potentially execute lustre code as part of a system call, and lustre calls
+ * into VFS or MM that call back into lustre.
+ *
+ * To deal with that, cl_env wrapper functions implement the following
+ * optimizations:
+ *
+ *     - allocation and destruction of environment is amortized by caching no
+ *     longer used environments instead of destroying them;
+ *
+ *     - there is a notion of "current" environment, attached to the kernel
+ *     data structure representing current thread. Top-level lustre code
+ *     allocates an environment and makes it current, then calls into
+ *     non-lustre code, that in turn calls lustre back. Low-level lustre
+ *     code thus called can fetch environment created by the top-level code
+ *     and reuse it, avoiding additional environment allocation.
+ *       Right now, three interfaces can attach the cl_env to running thread:
+ *       - cl_env_get
+ *       - cl_env_implant
+ *       - cl_env_reexit (cl_env_reenter must have been called beforehand)
+ *
+ * \see lu_env, lu_context, lu_context_key
+ * @{ */
+
+struct cl_env_nest {
+	int   cen_refcheck;
+	void *cen_cookie;
+};
+
+struct lu_env *cl_env_peek       (int *refcheck);
+struct lu_env *cl_env_get	(int *refcheck);
+struct lu_env *cl_env_alloc      (int *refcheck, __u32 tags);
+struct lu_env *cl_env_nested_get (struct cl_env_nest *nest);
+void	   cl_env_put	(struct lu_env *env, int *refcheck);
+void	   cl_env_nested_put (struct cl_env_nest *nest, struct lu_env *env);
+void	  *cl_env_reenter    (void);
+void	   cl_env_reexit     (void *cookie);
+void	   cl_env_implant    (struct lu_env *env, int *refcheck);
+void	   cl_env_unplant    (struct lu_env *env, int *refcheck);
+
+/** @} cl_env */
+
+/*
+ * Misc
+ */
+void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr);
+void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb);
+
+struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site,
+				struct lu_device_type *ldt,
+				struct lu_device *next);
+/** @} clio */
+
+int cl_global_init(void);
+void cl_global_fini(void);
+
+#endif /* _LINUX_CL_OBJECT_H */
diff --git a/drivers/staging/lustre/lustre/include/dt_object.h b/drivers/staging/lustre/lustre/include/dt_object.h
new file mode 100644
index 000000000000..e116bb21b529
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/dt_object.h
@@ -0,0 +1,1498 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LUSTRE_DT_OBJECT_H
+#define __LUSTRE_DT_OBJECT_H
+
+/** \defgroup dt dt
+ * Sub-class of lu_object with methods common for "data" objects in OST stack.
+ *
+ * Data objects behave like regular files: you can read/write them, get and
+ * set their attributes. Implementation of dt interface is supposed to
+ * implement some form of garbage collection, normally reference counting
+ * (nlink) based one.
+ *
+ * Examples: osd (lustre/osd) is an implementation of dt interface.
+ * @{
+ */
+
+
+/*
+ * super-class definitions.
+ */
+#include <lu_object.h>
+
+#include <linux/libcfs/libcfs.h>
+
+struct seq_file;
+struct proc_dir_entry;
+struct lustre_cfg;
+
+struct thandle;
+struct dt_device;
+struct dt_object;
+struct dt_index_features;
+struct niobuf_local;
+struct niobuf_remote;
+struct ldlm_enqueue_info;
+
+typedef enum {
+	MNTOPT_USERXATTR	= 0x00000001,
+	MNTOPT_ACL	      = 0x00000002,
+} mntopt_t;
+
+struct dt_device_param {
+	unsigned	   ddp_max_name_len;
+	unsigned	   ddp_max_nlink;
+	unsigned	   ddp_block_shift;
+	mntopt_t	   ddp_mntopts;
+	unsigned	   ddp_max_ea_size;
+	void	      *ddp_mnt; /* XXX: old code can retrieve mnt -bzzz */
+	int		ddp_mount_type;
+	unsigned long long ddp_maxbytes;
+	/* percentage of available space to reserve for grant error margin */
+	int		ddp_grant_reserved;
+	/* per-inode space consumption */
+	short	      ddp_inodespace;
+	/* per-fragment grant overhead to be used by client for grant
+	 * calculation */
+	int		ddp_grant_frag;
+};
+
+/**
+ * Per-transaction commit callback function
+ */
+struct dt_txn_commit_cb;
+typedef void (*dt_cb_t)(struct lu_env *env, struct thandle *th,
+			struct dt_txn_commit_cb *cb, int err);
+/**
+ * Special per-transaction callback for cases when just commit callback
+ * is needed and per-device callback are not convenient to use
+ */
+#define TRANS_COMMIT_CB_MAGIC	0xa0a00a0a
+#define MAX_COMMIT_CB_STR_LEN	32
+
+struct dt_txn_commit_cb {
+	struct list_head	dcb_linkage;
+	dt_cb_t		dcb_func;
+	__u32		dcb_magic;
+	char		dcb_name[MAX_COMMIT_CB_STR_LEN];
+};
+
+/**
+ * Operations on dt device.
+ */
+struct dt_device_operations {
+	/**
+	 * Return device-wide statistics.
+	 */
+	int   (*dt_statfs)(const struct lu_env *env,
+			   struct dt_device *dev, struct obd_statfs *osfs);
+	/**
+	 * Create a transaction handle for device \a dev.
+	 */
+	struct thandle *(*dt_trans_create)(const struct lu_env *env,
+					   struct dt_device *dev);
+	/**
+	 * Start transaction \a th on device \a dev.
+	 */
+	int   (*dt_trans_start)(const struct lu_env *env,
+				struct dt_device *dev, struct thandle *th);
+	/**
+	 * Finish previously started transaction.
+	 */
+	int   (*dt_trans_stop)(const struct lu_env *env,
+			       struct thandle *th);
+	/**
+	 * Add commit callback to the transaction.
+	 */
+	int   (*dt_trans_cb_add)(struct thandle *th,
+				 struct dt_txn_commit_cb *dcb);
+	/**
+	 * Return fid of root index object.
+	 */
+	int   (*dt_root_get)(const struct lu_env *env,
+			     struct dt_device *dev, struct lu_fid *f);
+	/**
+	 * Return device configuration data.
+	 */
+	void  (*dt_conf_get)(const struct lu_env *env,
+			     const struct dt_device *dev,
+			     struct dt_device_param *param);
+	/**
+	 * Handling device state, mostly for tests.
+	 */
+	int   (*dt_sync)(const struct lu_env *env, struct dt_device *dev);
+	int   (*dt_ro)(const struct lu_env *env, struct dt_device *dev);
+	/**
+	 * Start a transaction commit asynchronously.
+	 *
+	 * \param env environment
+	 * \param dev dt_device to start commit on
+	 *
+	 * \return 0 on success, negative value on error
+	 */
+	 int   (*dt_commit_async)(const struct lu_env *env,
+				  struct dt_device *dev);
+	/**
+	 * Initialize capability context.
+	 */
+	int   (*dt_init_capa_ctxt)(const struct lu_env *env,
+				   struct dt_device *dev,
+				   int mode, unsigned long timeout,
+				   __u32 alg, struct lustre_capa_key *keys);
+};
+
+struct dt_index_features {
+	/** required feature flags from enum dt_index_flags */
+	__u32 dif_flags;
+	/** minimal required key size */
+	size_t dif_keysize_min;
+	/** maximal required key size, 0 if no limit */
+	size_t dif_keysize_max;
+	/** minimal required record size */
+	size_t dif_recsize_min;
+	/** maximal required record size, 0 if no limit */
+	size_t dif_recsize_max;
+	/** pointer size for record */
+	size_t dif_ptrsize;
+};
+
+enum dt_index_flags {
+	/** index supports variable sized keys */
+	DT_IND_VARKEY = 1 << 0,
+	/** index supports variable sized records */
+	DT_IND_VARREC = 1 << 1,
+	/** index can be modified */
+	DT_IND_UPDATE = 1 << 2,
+	/** index supports records with non-unique (duplicate) keys */
+	DT_IND_NONUNQ = 1 << 3,
+	/**
+	 * index supports fixed-size keys sorted in natural numerical order
+	 * and is able to return the left-side value if no exact value is found
+	 */
+	DT_IND_RANGE = 1 << 4,
+};
+
+/**
+ * Features, required from index to support file system directories (mapping
+ * names to fids).
+ */
+extern const struct dt_index_features dt_directory_features;
+extern const struct dt_index_features dt_otable_features;
+extern const struct dt_index_features dt_lfsck_features;
+
+/* index features supported by the accounting objects */
+extern const struct dt_index_features dt_acct_features;
+
+/* index features supported by the quota global indexes */
+extern const struct dt_index_features dt_quota_glb_features;
+
+/* index features supported by the quota slave indexes */
+extern const struct dt_index_features dt_quota_slv_features;
+
+/**
+ * This is a general purpose dt allocation hint.
+ * It now contains the parent object.
+ * It can contain any allocation hint in the future.
+ */
+struct dt_allocation_hint {
+	struct dt_object	   *dah_parent;
+	__u32		       dah_mode;
+};
+
+/**
+ * object type specifier.
+ */
+
+enum dt_format_type {
+	DFT_REGULAR,
+	DFT_DIR,
+	/** for mknod */
+	DFT_NODE,
+	/** for special index */
+	DFT_INDEX,
+	/** for symbolic link */
+	DFT_SYM,
+};
+
+/**
+ * object format specifier.
+ */
+struct dt_object_format {
+	/** type for dt object */
+	enum dt_format_type dof_type;
+	union {
+		struct dof_regular {
+			int striped;
+		} dof_reg;
+		struct dof_dir {
+		} dof_dir;
+		struct dof_node {
+		} dof_node;
+		/**
+		 * a special index needs its index features as a parameter
+		 * in order to be created
+		 */
+		struct dof_index {
+			const struct dt_index_features *di_feat;
+		} dof_idx;
+	} u;
+};
+
+enum dt_format_type dt_mode_to_dft(__u32 mode);
+
+typedef __u64 dt_obj_version_t;
+
+/**
+ * Per-dt-object operations.
+ */
+struct dt_object_operations {
+	void  (*do_read_lock)(const struct lu_env *env,
+			      struct dt_object *dt, unsigned role);
+	void  (*do_write_lock)(const struct lu_env *env,
+			       struct dt_object *dt, unsigned role);
+	void  (*do_read_unlock)(const struct lu_env *env,
+				struct dt_object *dt);
+	void  (*do_write_unlock)(const struct lu_env *env,
+				 struct dt_object *dt);
+	int  (*do_write_locked)(const struct lu_env *env,
+				struct dt_object *dt);
+	/**
+	 * Note: following ->do_{x,}attr_{set,get}() operations are very
+	 * similar to ->moo_{x,}attr_{set,get}() operations in struct
+	 * md_object_operations (see md_object.h). These operations are not in
+	 * lu_object_operations, because ->do_{x,}attr_set() versions take
+	 * transaction handle as an argument (this transaction is started by
+	 * caller). We might factor ->do_{x,}attr_get() into
+	 * lu_object_operations, but that would break existing symmetry.
+	 */
+
+	/**
+	 * Return standard attributes.
+	 *
+	 * precondition: lu_object_exists(&dt->do_lu);
+	 */
+	int   (*do_attr_get)(const struct lu_env *env,
+			     struct dt_object *dt, struct lu_attr *attr,
+			     struct lustre_capa *capa);
+	/**
+	 * Set standard attributes.
+	 *
+	 * precondition: dt_object_exists(dt);
+	 */
+	int   (*do_declare_attr_set)(const struct lu_env *env,
+				     struct dt_object *dt,
+				     const struct lu_attr *attr,
+				     struct thandle *handle);
+	int   (*do_attr_set)(const struct lu_env *env,
+			     struct dt_object *dt,
+			     const struct lu_attr *attr,
+			     struct thandle *handle,
+			     struct lustre_capa *capa);
+	/**
+	 * Return a value of an extended attribute.
+	 *
+	 * precondition: dt_object_exists(dt);
+	 */
+	int   (*do_xattr_get)(const struct lu_env *env, struct dt_object *dt,
+			      struct lu_buf *buf, const char *name,
+			      struct lustre_capa *capa);
+	/**
+	 * Set value of an extended attribute.
+	 *
+	 * \a fl - flags from enum lu_xattr_flags
+	 *
+	 * precondition: dt_object_exists(dt);
+	 */
+	int   (*do_declare_xattr_set)(const struct lu_env *env,
+				      struct dt_object *dt,
+				      const struct lu_buf *buf,
+				      const char *name, int fl,
+				      struct thandle *handle);
+	int   (*do_xattr_set)(const struct lu_env *env,
+			      struct dt_object *dt, const struct lu_buf *buf,
+			      const char *name, int fl, struct thandle *handle,
+			      struct lustre_capa *capa);
+	/**
+	 * Delete existing extended attribute.
+	 *
+	 * precondition: dt_object_exists(dt);
+	 */
+	int   (*do_declare_xattr_del)(const struct lu_env *env,
+				      struct dt_object *dt,
+				      const char *name, struct thandle *handle);
+	int   (*do_xattr_del)(const struct lu_env *env,
+			      struct dt_object *dt,
+			      const char *name, struct thandle *handle,
+			      struct lustre_capa *capa);
+	/**
+	 * Place list of existing extended attributes into \a buf (which has
+	 * length len).
+	 *
+	 * precondition: dt_object_exists(dt);
+	 */
+	int   (*do_xattr_list)(const struct lu_env *env,
+			       struct dt_object *dt, struct lu_buf *buf,
+			       struct lustre_capa *capa);
+	/**
+	 * Init allocation hint using parent object and child mode.
+	 * (1) The \a parent might be NULL if this is a partial creation for
+	 *     remote object.
+	 * (2) The type of child is in \a child_mode.
+	 * (3) The result hint is stored in \a ah;
+	 */
+	void  (*do_ah_init)(const struct lu_env *env,
+			    struct dt_allocation_hint *ah,
+			    struct dt_object *parent,
+			    struct dt_object *child,
+			    umode_t child_mode);
+	/**
+	 * Create new object on this device.
+	 *
+	 * precondition: !dt_object_exists(dt);
+	 * postcondition: ergo(result == 0, dt_object_exists(dt));
+	 */
+	int   (*do_declare_create)(const struct lu_env *env,
+				   struct dt_object *dt,
+				   struct lu_attr *attr,
+				   struct dt_allocation_hint *hint,
+				   struct dt_object_format *dof,
+				   struct thandle *th);
+	int   (*do_create)(const struct lu_env *env, struct dt_object *dt,
+			   struct lu_attr *attr,
+			   struct dt_allocation_hint *hint,
+			   struct dt_object_format *dof,
+			   struct thandle *th);
+
+	/**
+	 * Destroy object on this device. NOTE(review): pre/postconditions were
+	 * copied from create; presumably precondition: dt_object_exists(dt);
+	 * postcondition: ergo(result == 0, !dt_object_exists(dt)); -- confirm
+	 */
+	int   (*do_declare_destroy)(const struct lu_env *env,
+				    struct dt_object *dt,
+				    struct thandle *th);
+	int   (*do_destroy)(const struct lu_env *env, struct dt_object *dt,
+			    struct thandle *th);
+
+	/**
+	 * Announce that this object is going to be used as an index. This
+	 * operation checks that object supports indexing operations and
+	 * installs appropriate dt_index_operations vector on success.
+	 *
+	 * Also probes for features. Operation is successful if all required
+	 * features are supported.
+	 */
+	int   (*do_index_try)(const struct lu_env *env,
+			      struct dt_object *dt,
+			      const struct dt_index_features *feat);
+	/**
+	 * Add nlink of the object
+	 * precondition: dt_object_exists(dt);
+	 */
+	int   (*do_declare_ref_add)(const struct lu_env *env,
+				    struct dt_object *dt, struct thandle *th);
+	int   (*do_ref_add)(const struct lu_env *env,
+			    struct dt_object *dt, struct thandle *th);
+	/**
+	 * Del nlink of the object
+	 * precondition: dt_object_exists(dt);
+	 */
+	int   (*do_declare_ref_del)(const struct lu_env *env,
+				    struct dt_object *dt, struct thandle *th);
+	int   (*do_ref_del)(const struct lu_env *env,
+			    struct dt_object *dt, struct thandle *th);
+
+	struct obd_capa *(*do_capa_get)(const struct lu_env *env,
+					struct dt_object *dt,
+					struct lustre_capa *old,
+					__u64 opc);
+	int (*do_object_sync)(const struct lu_env *, struct dt_object *);
+	/**
+	 * Get object info of next level. Currently, only get inode from osd.
+	 * This is only used by quota b=16542
+	 * precondition: dt_object_exists(dt);
+	 */
+	int (*do_data_get)(const struct lu_env *env, struct dt_object *dt,
+			   void **data);
+
+	/**
+	 * Lock object.
+	 */
+	int (*do_object_lock)(const struct lu_env *env, struct dt_object *dt,
+			      struct lustre_handle *lh,
+			      struct ldlm_enqueue_info *einfo,
+			      void *policy);
+};
+
+/**
+ * Per-dt-object operations on "file body".
+ */
+struct dt_body_operations {
+	/**
+	 * precondition: dt_object_exists(dt);
+	 */
+	ssize_t (*dbo_read)(const struct lu_env *env, struct dt_object *dt,
+			    struct lu_buf *buf, loff_t *pos,
+			    struct lustre_capa *capa);
+	/**
+	 * precondition: dt_object_exists(dt);
+	 */
+	ssize_t (*dbo_declare_write)(const struct lu_env *env,
+				     struct dt_object *dt,
+				     const loff_t size, loff_t pos,
+				     struct thandle *handle);
+	ssize_t (*dbo_write)(const struct lu_env *env, struct dt_object *dt,
+			     const struct lu_buf *buf, loff_t *pos,
+			     struct thandle *handle, struct lustre_capa *capa,
+			     int ignore_quota);
+	/*
+	 * methods for zero-copy IO
+	 */
+
+	/*
+	 * precondition: dt_object_exists(dt);
+	 * returns:
+	 * < 0 - error code
+	 * = 0 - illegal
+	 * > 0 - number of local buffers prepared
+	 */
+	int (*dbo_bufs_get)(const struct lu_env *env, struct dt_object *dt,
+			    loff_t pos, ssize_t len, struct niobuf_local *lb,
+			    int rw, struct lustre_capa *capa);
+	/*
+	 * precondition: dt_object_exists(dt);
+	 */
+	int (*dbo_bufs_put)(const struct lu_env *env, struct dt_object *dt,
+			    struct niobuf_local *lb, int nr);
+	/*
+	 * precondition: dt_object_exists(dt);
+	 */
+	int (*dbo_write_prep)(const struct lu_env *env, struct dt_object *dt,
+			      struct niobuf_local *lb, int nr);
+	/*
+	 * precondition: dt_object_exists(dt);
+	 */
+	int (*dbo_declare_write_commit)(const struct lu_env *env,
+					struct dt_object *dt,
+					struct niobuf_local *,
+					int, struct thandle *);
+	/*
+	 * precondition: dt_object_exists(dt);
+	 */
+	int (*dbo_write_commit)(const struct lu_env *env, struct dt_object *dt,
+				struct niobuf_local *, int, struct thandle *);
+	/*
+	 * precondition: dt_object_exists(dt);
+	 */
+	int (*dbo_read_prep)(const struct lu_env *env, struct dt_object *dt,
+			     struct niobuf_local *lnb, int nr);
+	int (*dbo_fiemap_get)(const struct lu_env *env, struct dt_object *dt,
+			      struct ll_user_fiemap *fm);
+	/**
+	 * Punch object's content
+	 * precondition: regular object, not index
+	 */
+	int   (*dbo_declare_punch)(const struct lu_env *, struct dt_object *,
+				  __u64, __u64, struct thandle *th);
+	int   (*dbo_punch)(const struct lu_env *env, struct dt_object *dt,
+			  __u64 start, __u64 end, struct thandle *th,
+			  struct lustre_capa *capa);
+};
+
+/**
+ * Incomplete type of index record.
+ */
+struct dt_rec;
+
+/**
+ * Incomplete type of index key.
+ */
+struct dt_key;
+
+/**
+ * Incomplete type of dt iterator.
+ */
+struct dt_it;
+
+/**
+ * Per-dt-object operations on object as index.
+ */
+struct dt_index_operations {
+	/**
+	 * precondition: dt_object_exists(dt);
+	 */
+	int (*dio_lookup)(const struct lu_env *env, struct dt_object *dt,
+			  struct dt_rec *rec, const struct dt_key *key,
+			  struct lustre_capa *capa);
+	/**
+	 * precondition: dt_object_exists(dt);
+	 */
+	int (*dio_declare_insert)(const struct lu_env *env,
+				  struct dt_object *dt,
+				  const struct dt_rec *rec,
+				  const struct dt_key *key,
+				  struct thandle *handle);
+	int (*dio_insert)(const struct lu_env *env, struct dt_object *dt,
+			  const struct dt_rec *rec, const struct dt_key *key,
+			  struct thandle *handle, struct lustre_capa *capa,
+			  int ignore_quota);
+	/**
+	 * precondition: dt_object_exists(dt);
+	 */
+	int (*dio_declare_delete)(const struct lu_env *env,
+				  struct dt_object *dt,
+				  const struct dt_key *key,
+				  struct thandle *handle);
+	int (*dio_delete)(const struct lu_env *env, struct dt_object *dt,
+			  const struct dt_key *key, struct thandle *handle,
+			  struct lustre_capa *capa);
+	/**
+	 * Iterator interface
+	 */
+	struct dt_it_ops {
+		/**
+		 * Allocate and initialize new iterator.
+		 *
+		 * precondition: dt_object_exists(dt);
+		 */
+		struct dt_it *(*init)(const struct lu_env *env,
+				      struct dt_object *dt,
+				      __u32 attr,
+				      struct lustre_capa *capa);
+		void	  (*fini)(const struct lu_env *env,
+				      struct dt_it *di);
+		int	    (*get)(const struct lu_env *env,
+				      struct dt_it *di,
+				      const struct dt_key *key);
+		void	   (*put)(const struct lu_env *env,
+				      struct dt_it *di);
+		int	   (*next)(const struct lu_env *env,
+				      struct dt_it *di);
+		struct dt_key *(*key)(const struct lu_env *env,
+				      const struct dt_it *di);
+		int       (*key_size)(const struct lu_env *env,
+				      const struct dt_it *di);
+		int	    (*rec)(const struct lu_env *env,
+				      const struct dt_it *di,
+				      struct dt_rec *rec,
+				      __u32 attr);
+		__u64	(*store)(const struct lu_env *env,
+				      const struct dt_it *di);
+		int	   (*load)(const struct lu_env *env,
+				      const struct dt_it *di, __u64 hash);
+		int	(*key_rec)(const struct lu_env *env,
+				      const struct dt_it *di, void* key_rec);
+	} dio_it;
+};
+
+enum dt_otable_it_valid {
+	DOIV_ERROR_HANDLE	= 0x0001,
+};
+
+enum dt_otable_it_flags {
+	/* Exit on failure. */
+	DOIF_FAILOUT	= 0x0001,
+
+	/* Reset iteration position to the device beginning. */
+	DOIF_RESET	= 0x0002,
+
+	/* An upper layer component uses the iteration. */
+	DOIF_OUTUSED	= 0x0004,
+};
+
+/* otable based iteration needs to use the common DT iteration APIs.
+ * To initialize the iteration, it needs to call dio_it::init() first.
+ * Here is how the otable based iteration should prepare arguments to
+ * call dt_it_ops::init().
+ *
+ * For otable based iteration, the 32-bits 'attr' for dt_it_ops::init()
+ * is composed of two parts:
+ * low 16-bits is for valid bits, high 16-bits is for flags bits. */
+#define DT_OTABLE_IT_FLAGS_SHIFT	16
+#define DT_OTABLE_IT_FLAGS_MASK 	0xffff0000
+
+struct dt_device {
+	struct lu_device		   dd_lu_dev;
+	const struct dt_device_operations *dd_ops;
+
+	/**
+	 * List of dt_txn_callback (see below). This is not protected in any
+	 * way, because callbacks are supposed to be added/deleted only during
+	 * single-threaded start-up shut-down procedures.
+	 */
+	struct list_head			 dd_txn_callbacks;
+};
+
+int  dt_device_init(struct dt_device *dev, struct lu_device_type *t);
+void dt_device_fini(struct dt_device *dev);
+
+static inline int lu_device_is_dt(const struct lu_device *d)
+{
+	return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_DT);
+}
+
+static inline struct dt_device *lu2dt_dev(struct lu_device *l)
+{
+	LASSERT(lu_device_is_dt(l));
+	return container_of0(l, struct dt_device, dd_lu_dev);
+}
+
+struct dt_object {
+	struct lu_object		   do_lu;
+	const struct dt_object_operations *do_ops;
+	const struct dt_body_operations   *do_body_ops;
+	const struct dt_index_operations  *do_index_ops;
+};
+
+/*
+ * In-core representation of per-device local object OID storage
+ */
+struct local_oid_storage {
+	/* all initialized llog systems on this node linked by this */
+	struct list_head	  los_list;
+
+	/* how many handles reference this los */
+	atomic_t	  los_refcount;
+	struct dt_device *los_dev;
+	struct dt_object *los_obj;
+
+	/* data used to generate new fids */
+	struct mutex	 los_id_lock;
+	__u64		  los_seq;
+	__u32		  los_last_oid;
+};
+
+static inline struct dt_object *lu2dt(struct lu_object *l)
+{
+	LASSERT(l == NULL || IS_ERR(l) || lu_device_is_dt(l->lo_dev));
+	return container_of0(l, struct dt_object, do_lu);
+}
+
+int  dt_object_init(struct dt_object *obj,
+		    struct lu_object_header *h, struct lu_device *d);
+
+void dt_object_fini(struct dt_object *obj);
+
+static inline int dt_object_exists(const struct dt_object *dt)
+{
+	return lu_object_exists(&dt->do_lu);
+}
+
+static inline int dt_object_remote(const struct dt_object *dt)
+{
+	return lu_object_remote(&dt->do_lu);
+}
+
+static inline struct dt_object *lu2dt_obj(struct lu_object *o)
+{
+	LASSERT(ergo(o != NULL, lu_device_is_dt(o->lo_dev)));
+	return container_of0(o, struct dt_object, do_lu);
+}
+
+/**
+ * This is the general purpose transaction handle.
+ * 1. Transaction Life Cycle
+ *      This transaction handle is allocated upon starting a new transaction,
+ *      and deallocated after this transaction is committed.
+ * 2. Transaction Nesting
+ *      We do _NOT_ support nested transactions. So, every thread should only
+ *      have one active transaction, and a transaction only belongs to one
+ *      thread. Due to this, the transaction handle needs no reference count.
+ * 3. Transaction & dt_object locking
+ *      dt_object locks should be taken inside transaction.
+ * 4. Transaction & RPC
+ *      No RPC request should be issued inside transaction.
+ */
+struct thandle {
+	/** the dt device on which the transactions are executed */
+	struct dt_device *th_dev;
+
+	/** context for this transaction, tag is LCT_TX_HANDLE */
+	struct lu_context th_ctx;
+
+	/** additional tags (layers can add in declare) */
+	__u32	     th_tags;
+
+	/** the last operation result in this transaction.
+	 * this value is used in recovery */
+	__s32	     th_result;
+
+	/** whether we need sync commit */
+	unsigned int		th_sync:1;
+
+	/* local transaction, no need to inform other layers */
+	unsigned int		th_local:1;
+
+	/* In DNE, one transaction can be disassembled into
+	 * updates on several different MDTs, and these updates
+	 * will be attached to th_remote_update_list per target.
+	 * Only a single thread accesses the list, so no lock is needed.
+	 */
+	struct list_head		th_remote_update_list;
+	struct update_request	*th_current_request;
+};
+
+/**
+ * Transaction call-backs.
+ *
+ * These are invoked by osd (or underlying transaction engine) when
+ * transaction changes state.
+ *
+ * Call-backs are used by upper layers to modify transaction parameters and to
+ * perform some actions on each transaction state transition. Typical
+ * example is mdt registering call-back to write into last-received file
+ * before each transaction commit.
+ */
+struct dt_txn_callback {
+	int (*dtc_txn_start)(const struct lu_env *env,
+			     struct thandle *txn, void *cookie);
+	int (*dtc_txn_stop)(const struct lu_env *env,
+			    struct thandle *txn, void *cookie);
+	void (*dtc_txn_commit)(struct thandle *txn, void *cookie);
+	void		*dtc_cookie;
+	__u32		dtc_tag;
+	struct list_head	   dtc_linkage;
+};
+
+void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb);
+void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb);
+
+int dt_txn_hook_start(const struct lu_env *env,
+		      struct dt_device *dev, struct thandle *txn);
+int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn);
+void dt_txn_hook_commit(struct thandle *txn);
+
+int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj);
+
+/**
+ * Callback function used for parsing path.
+ * \see llo_store_resolve
+ */
+typedef int (*dt_entry_func_t)(const struct lu_env *env,
+			    const char *name,
+			    void *pvt);
+
+#define DT_MAX_PATH 1024
+
+int dt_path_parser(const struct lu_env *env,
+		   char *local, dt_entry_func_t entry_func,
+		   void *data);
+
+struct dt_object *
+dt_store_resolve(const struct lu_env *env, struct dt_device *dt,
+		 const char *path, struct lu_fid *fid);
+
+struct dt_object *dt_store_open(const struct lu_env *env,
+				struct dt_device *dt,
+				const char *dirname,
+				const char *filename,
+				struct lu_fid *fid);
+
+struct dt_object *dt_find_or_create(const struct lu_env *env,
+				    struct dt_device *dt,
+				    const struct lu_fid *fid,
+				    struct dt_object_format *dof,
+				    struct lu_attr *attr);
+
+struct dt_object *dt_locate_at(const struct lu_env *env,
+			       struct dt_device *dev,
+			       const struct lu_fid *fid,
+			       struct lu_device *top_dev);
+static inline struct dt_object *
+dt_locate(const struct lu_env *env, struct dt_device *dev,
+	  const struct lu_fid *fid)
+{
+	return dt_locate_at(env, dev, fid, dev->dd_lu_dev.ld_site->ls_top_dev);
+}
+
+
+int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev,
+			   const struct lu_fid *first_fid,
+			   struct local_oid_storage **los);
+void local_oid_storage_fini(const struct lu_env *env,
+			    struct local_oid_storage *los);
+int local_object_fid_generate(const struct lu_env *env,
+			      struct local_oid_storage *los,
+			      struct lu_fid *fid);
+int local_object_declare_create(const struct lu_env *env,
+				struct local_oid_storage *los,
+				struct dt_object *o,
+				struct lu_attr *attr,
+				struct dt_object_format *dof,
+				struct thandle *th);
+int local_object_create(const struct lu_env *env,
+			struct local_oid_storage *los,
+			struct dt_object *o,
+			struct lu_attr *attr, struct dt_object_format *dof,
+			struct thandle *th);
+struct dt_object *local_file_find_or_create(const struct lu_env *env,
+					    struct local_oid_storage *los,
+					    struct dt_object *parent,
+					    const char *name, __u32 mode);
+struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env,
+						     struct dt_device *dt,
+						     const struct lu_fid *fid,
+						     struct dt_object *parent,
+						     const char *name,
+						     __u32 mode);
+struct dt_object *
+local_index_find_or_create(const struct lu_env *env,
+			   struct local_oid_storage *los,
+			   struct dt_object *parent,
+			   const char *name, __u32 mode,
+			   const struct dt_index_features *ft);
+struct dt_object *
+local_index_find_or_create_with_fid(const struct lu_env *env,
+				    struct dt_device *dt,
+				    const struct lu_fid *fid,
+				    struct dt_object *parent,
+				    const char *name, __u32 mode,
+				    const struct dt_index_features *ft);
+int local_object_unlink(const struct lu_env *env, struct dt_device *dt,
+			struct dt_object *parent, const char *name);
+
+static inline int dt_object_lock(const struct lu_env *env,
+				 struct dt_object *o, struct lustre_handle *lh,
+				 struct ldlm_enqueue_info *einfo,
+				 void *policy)
+{
+	LASSERT(o);
+	LASSERT(o->do_ops);
+	LASSERT(o->do_ops->do_object_lock);
+	return o->do_ops->do_object_lock(env, o, lh, einfo, policy);
+}
+
+int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir,
+		  const char *name, struct lu_fid *fid);
+
+static inline int dt_object_sync(const struct lu_env *env,
+				 struct dt_object *o)
+{
+	LASSERT(o);
+	LASSERT(o->do_ops);
+	LASSERT(o->do_ops->do_object_sync);
+	return o->do_ops->do_object_sync(env, o);
+}
+
+int dt_declare_version_set(const struct lu_env *env, struct dt_object *o,
+			   struct thandle *th);
+void dt_version_set(const struct lu_env *env, struct dt_object *o,
+		    dt_obj_version_t version, struct thandle *th);
+dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o);
+
+
+int dt_read(const struct lu_env *env, struct dt_object *dt,
+	    struct lu_buf *buf, loff_t *pos);
+int dt_record_read(const struct lu_env *env, struct dt_object *dt,
+		   struct lu_buf *buf, loff_t *pos);
+int dt_record_write(const struct lu_env *env, struct dt_object *dt,
+		    const struct lu_buf *buf, loff_t *pos, struct thandle *th);
+typedef int (*dt_index_page_build_t)(const struct lu_env *env,
+				     union lu_page *lp, int nob,
+				     const struct dt_it_ops *iops,
+				     struct dt_it *it, __u32 attr, void *arg);
+int dt_index_walk(const struct lu_env *env, struct dt_object *obj,
+		  const struct lu_rdpg *rdpg, dt_index_page_build_t filler,
+		  void *arg);
+int dt_index_read(const struct lu_env *env, struct dt_device *dev,
+		  struct idx_info *ii, const struct lu_rdpg *rdpg);
+
+static inline struct thandle *dt_trans_create(const struct lu_env *env,
+					      struct dt_device *d)
+{
+	LASSERT(d->dd_ops->dt_trans_create);
+	return d->dd_ops->dt_trans_create(env, d);
+}
+
+static inline int dt_trans_start(const struct lu_env *env,
+				 struct dt_device *d, struct thandle *th)
+{
+	LASSERT(d->dd_ops->dt_trans_start);
+	return d->dd_ops->dt_trans_start(env, d, th);
+}
+
+/* for this transaction hooks shouldn't be called */
+static inline int dt_trans_start_local(const struct lu_env *env,
+				       struct dt_device *d, struct thandle *th)
+{
+	LASSERT(d->dd_ops->dt_trans_start);
+	th->th_local = 1;
+	return d->dd_ops->dt_trans_start(env, d, th);
+}
+
+static inline int dt_trans_stop(const struct lu_env *env,
+				struct dt_device *d, struct thandle *th)
+{
+	LASSERT(d->dd_ops->dt_trans_stop);
+	return d->dd_ops->dt_trans_stop(env, th);
+}
+
+static inline int dt_trans_cb_add(struct thandle *th,
+				  struct dt_txn_commit_cb *dcb)
+{
+	LASSERT(th->th_dev->dd_ops->dt_trans_cb_add);
+	dcb->dcb_magic = TRANS_COMMIT_CB_MAGIC;
+	return th->th_dev->dd_ops->dt_trans_cb_add(th, dcb);
+}
+/** @} dt */
+
+
+static inline int dt_declare_record_write(const struct lu_env *env,
+					  struct dt_object *dt,
+					  int size, loff_t pos,
+					  struct thandle *th)
+{
+	int rc;
+
+	LASSERTF(dt != NULL, "dt is NULL when we want to write record\n");
+	LASSERT(th != NULL);
+	LASSERT(dt->do_body_ops);
+	LASSERT(dt->do_body_ops->dbo_declare_write);
+	rc = dt->do_body_ops->dbo_declare_write(env, dt, size, pos, th);
+	return rc;
+}
+
+static inline int dt_declare_create(const struct lu_env *env,
+				    struct dt_object *dt,
+				    struct lu_attr *attr,
+				    struct dt_allocation_hint *hint,
+				    struct dt_object_format *dof,
+				    struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_declare_create);
+	return dt->do_ops->do_declare_create(env, dt, attr, hint, dof, th);
+}
+
+static inline int dt_create(const struct lu_env *env,
+				    struct dt_object *dt,
+				    struct lu_attr *attr,
+				    struct dt_allocation_hint *hint,
+				    struct dt_object_format *dof,
+				    struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_create);
+	return dt->do_ops->do_create(env, dt, attr, hint, dof, th);
+}
+
+static inline int dt_declare_destroy(const struct lu_env *env,
+				     struct dt_object *dt,
+				     struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_declare_destroy);
+	return dt->do_ops->do_declare_destroy(env, dt, th);
+}
+
+static inline int dt_destroy(const struct lu_env *env,
+			     struct dt_object *dt,
+			     struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_destroy);
+	return dt->do_ops->do_destroy(env, dt, th);
+}
+
+static inline void dt_read_lock(const struct lu_env *env,
+				struct dt_object *dt,
+				unsigned role)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_read_lock);
+	dt->do_ops->do_read_lock(env, dt, role);
+}
+
+static inline void dt_write_lock(const struct lu_env *env,
+				struct dt_object *dt,
+				unsigned role)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_write_lock);
+	dt->do_ops->do_write_lock(env, dt, role);
+}
+
+static inline void dt_read_unlock(const struct lu_env *env,
+				struct dt_object *dt)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_read_unlock);
+	dt->do_ops->do_read_unlock(env, dt);
+}
+
+static inline void dt_write_unlock(const struct lu_env *env,
+				struct dt_object *dt)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_write_unlock);
+	dt->do_ops->do_write_unlock(env, dt);
+}
+
+static inline int dt_write_locked(const struct lu_env *env,
+				  struct dt_object *dt)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_write_locked);
+	return dt->do_ops->do_write_locked(env, dt);
+}
+
+static inline int dt_attr_get(const struct lu_env *env, struct dt_object *dt,
+			      struct lu_attr *la, void *arg)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_attr_get);
+	return dt->do_ops->do_attr_get(env, dt, la, arg);
+}
+
+static inline int dt_declare_attr_set(const struct lu_env *env,
+				      struct dt_object *dt,
+				      const struct lu_attr *la,
+				      struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_declare_attr_set);
+	return dt->do_ops->do_declare_attr_set(env, dt, la, th);
+}
+
+static inline int dt_attr_set(const struct lu_env *env, struct dt_object *dt,
+			      const struct lu_attr *la, struct thandle *th,
+			      struct lustre_capa *capa)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_attr_set);
+	return dt->do_ops->do_attr_set(env, dt, la, th, capa);
+}
+
+static inline int dt_declare_ref_add(const struct lu_env *env,
+				     struct dt_object *dt, struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_declare_ref_add);
+	return dt->do_ops->do_declare_ref_add(env, dt, th);
+}
+
+static inline int dt_ref_add(const struct lu_env *env,
+			     struct dt_object *dt, struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_ref_add);
+	return dt->do_ops->do_ref_add(env, dt, th);
+}
+
+static inline int dt_declare_ref_del(const struct lu_env *env,
+				     struct dt_object *dt, struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_declare_ref_del);
+	return dt->do_ops->do_declare_ref_del(env, dt, th);
+}
+
+static inline int dt_ref_del(const struct lu_env *env,
+			     struct dt_object *dt, struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_ref_del);
+	return dt->do_ops->do_ref_del(env, dt, th);
+}
+
+static inline struct obd_capa *dt_capa_get(const struct lu_env *env,
+					   struct dt_object *dt,
+					   struct lustre_capa *old, __u64 opc)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_capa_get);
+	return dt->do_ops->do_capa_get(env, dt, old, opc);
+}
+
+static inline int dt_bufs_get(const struct lu_env *env, struct dt_object *d,
+			      struct niobuf_remote *rnb,
+			      struct niobuf_local *lnb, int rw,
+			      struct lustre_capa *capa)
+{
+	LASSERT(d);
+	LASSERT(d->do_body_ops);
+	LASSERT(d->do_body_ops->dbo_bufs_get);
+	return d->do_body_ops->dbo_bufs_get(env, d, rnb->offset,
+					    rnb->len, lnb, rw, capa);
+}
+
+static inline int dt_bufs_put(const struct lu_env *env, struct dt_object *d,
+			      struct niobuf_local *lnb, int n)
+{
+	LASSERT(d);
+	LASSERT(d->do_body_ops);
+	LASSERT(d->do_body_ops->dbo_bufs_put);
+	return d->do_body_ops->dbo_bufs_put(env, d, lnb, n);
+}
+
+static inline int dt_write_prep(const struct lu_env *env, struct dt_object *d,
+				struct niobuf_local *lnb, int n)
+{
+	LASSERT(d);
+	LASSERT(d->do_body_ops);
+	LASSERT(d->do_body_ops->dbo_write_prep);
+	return d->do_body_ops->dbo_write_prep(env, d, lnb, n);
+}
+
+static inline int dt_declare_write_commit(const struct lu_env *env,
+					  struct dt_object *d,
+					  struct niobuf_local *lnb,
+					  int n, struct thandle *th)
+{
+	LASSERTF(d != NULL, "dt is NULL when we want to declare write\n");
+	LASSERT(th != NULL);
+	return d->do_body_ops->dbo_declare_write_commit(env, d, lnb, n, th);
+}
+
+
+static inline int dt_write_commit(const struct lu_env *env,
+				  struct dt_object *d, struct niobuf_local *lnb,
+				  int n, struct thandle *th)
+{
+	LASSERT(d);
+	LASSERT(d->do_body_ops);
+	LASSERT(d->do_body_ops->dbo_write_commit);
+	return d->do_body_ops->dbo_write_commit(env, d, lnb, n, th);
+}
+
+static inline int dt_read_prep(const struct lu_env *env, struct dt_object *d,
+			       struct niobuf_local *lnb, int n)
+{
+	LASSERT(d);
+	LASSERT(d->do_body_ops);
+	LASSERT(d->do_body_ops->dbo_read_prep);
+	return d->do_body_ops->dbo_read_prep(env, d, lnb, n);
+}
+
+static inline int dt_declare_punch(const struct lu_env *env,
+				   struct dt_object *dt, __u64 start,
+				   __u64 end, struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_body_ops);
+	LASSERT(dt->do_body_ops->dbo_declare_punch);
+	return dt->do_body_ops->dbo_declare_punch(env, dt, start, end, th);
+}
+
+static inline int dt_punch(const struct lu_env *env, struct dt_object *dt,
+			   __u64 start, __u64 end, struct thandle *th,
+			   struct lustre_capa *capa)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_body_ops);
+	LASSERT(dt->do_body_ops->dbo_punch);
+	return dt->do_body_ops->dbo_punch(env, dt, start, end, th, capa);
+}
+
+static inline int dt_fiemap_get(const struct lu_env *env, struct dt_object *d,
+				struct ll_user_fiemap *fm)
+{
+	LASSERT(d);
+	if (d->do_body_ops == NULL)
+		return -EPROTO;
+	if (d->do_body_ops->dbo_fiemap_get == NULL)
+		return -EOPNOTSUPP;
+	return d->do_body_ops->dbo_fiemap_get(env, d, fm);
+}
+
+static inline int dt_statfs(const struct lu_env *env, struct dt_device *dev,
+			    struct obd_statfs *osfs)
+{
+	LASSERT(dev);
+	LASSERT(dev->dd_ops);
+	LASSERT(dev->dd_ops->dt_statfs);
+	return dev->dd_ops->dt_statfs(env, dev, osfs);
+}
+
+static inline int dt_root_get(const struct lu_env *env, struct dt_device *dev,
+			      struct lu_fid *f)
+{
+	LASSERT(dev);
+	LASSERT(dev->dd_ops);
+	LASSERT(dev->dd_ops->dt_root_get);
+	return dev->dd_ops->dt_root_get(env, dev, f);
+}
+
+static inline void dt_conf_get(const struct lu_env *env,
+			       const struct dt_device *dev,
+			       struct dt_device_param *param)
+{
+	LASSERT(dev);
+	LASSERT(dev->dd_ops);
+	LASSERT(dev->dd_ops->dt_conf_get);
+	dev->dd_ops->dt_conf_get(env, dev, param);
+}
+
+static inline int dt_sync(const struct lu_env *env, struct dt_device *dev)
+{
+	LASSERT(dev);
+	LASSERT(dev->dd_ops);
+	LASSERT(dev->dd_ops->dt_sync);
+	return dev->dd_ops->dt_sync(env, dev);
+}
+
+static inline int dt_ro(const struct lu_env *env, struct dt_device *dev)
+{
+	LASSERT(dev);
+	LASSERT(dev->dd_ops);
+	LASSERT(dev->dd_ops->dt_ro);
+	return dev->dd_ops->dt_ro(env, dev);
+}
+
+static inline int dt_declare_insert(const struct lu_env *env,
+				    struct dt_object *dt,
+				    const struct dt_rec *rec,
+				    const struct dt_key *key,
+				    struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_index_ops);
+	LASSERT(dt->do_index_ops->dio_declare_insert);
+	return dt->do_index_ops->dio_declare_insert(env, dt, rec, key, th);
+}
+
+static inline int dt_insert(const struct lu_env *env,
+				    struct dt_object *dt,
+				    const struct dt_rec *rec,
+				    const struct dt_key *key,
+				    struct thandle *th,
+				    struct lustre_capa *capa,
+				    int noquota)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_index_ops);
+	LASSERT(dt->do_index_ops->dio_insert);
+	return dt->do_index_ops->dio_insert(env, dt, rec, key, th,
+					    capa, noquota);
+}
+
+static inline int dt_declare_xattr_del(const struct lu_env *env,
+				       struct dt_object *dt,
+				       const char *name,
+				       struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_declare_xattr_del);
+	return dt->do_ops->do_declare_xattr_del(env, dt, name, th);
+}
+
+static inline int dt_xattr_del(const struct lu_env *env,
+			       struct dt_object *dt, const char *name,
+			       struct thandle *th,
+			       struct lustre_capa *capa)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_xattr_del);
+	return dt->do_ops->do_xattr_del(env, dt, name, th, capa);
+}
+
+static inline int dt_declare_xattr_set(const struct lu_env *env,
+				      struct dt_object *dt,
+				      const struct lu_buf *buf,
+				      const char *name, int fl,
+				      struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_declare_xattr_set);
+	return dt->do_ops->do_declare_xattr_set(env, dt, buf, name, fl, th);
+}
+
+static inline int dt_xattr_set(const struct lu_env *env,
+			      struct dt_object *dt, const struct lu_buf *buf,
+			      const char *name, int fl, struct thandle *th,
+			      struct lustre_capa *capa)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_xattr_set);
+	return dt->do_ops->do_xattr_set(env, dt, buf, name, fl, th, capa);
+}
+
+static inline int dt_xattr_get(const struct lu_env *env,
+			      struct dt_object *dt, struct lu_buf *buf,
+			      const char *name, struct lustre_capa *capa)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_xattr_get);
+	return dt->do_ops->do_xattr_get(env, dt, buf, name, capa);
+}
+
+static inline int dt_xattr_list(const struct lu_env *env,
+			       struct dt_object *dt, struct lu_buf *buf,
+			       struct lustre_capa *capa)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_xattr_list);
+	return dt->do_ops->do_xattr_list(env, dt, buf, capa);
+}
+
+static inline int dt_declare_delete(const struct lu_env *env,
+				    struct dt_object *dt,
+				    const struct dt_key *key,
+				    struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_index_ops);
+	LASSERT(dt->do_index_ops->dio_declare_delete);
+	return dt->do_index_ops->dio_declare_delete(env, dt, key, th);
+}
+
+static inline int dt_delete(const struct lu_env *env,
+			    struct dt_object *dt,
+			    const struct dt_key *key,
+			    struct thandle *th,
+			    struct lustre_capa *capa)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_index_ops);
+	LASSERT(dt->do_index_ops->dio_delete);
+	return dt->do_index_ops->dio_delete(env, dt, key, th, capa);
+}
+
+static inline int dt_commit_async(const struct lu_env *env,
+				  struct dt_device *dev)
+{
+	LASSERT(dev);
+	LASSERT(dev->dd_ops);
+	LASSERT(dev->dd_ops->dt_commit_async);
+	return dev->dd_ops->dt_commit_async(env, dev);
+}
+
+static inline int dt_init_capa_ctxt(const struct lu_env *env,
+				    struct dt_device *dev,
+				    int mode, unsigned long timeout,
+				    __u32 alg, struct lustre_capa_key *keys)
+{
+	LASSERT(dev);
+	LASSERT(dev->dd_ops);
+	LASSERT(dev->dd_ops->dt_init_capa_ctxt);
+	return dev->dd_ops->dt_init_capa_ctxt(env, dev, mode,
+					      timeout, alg, keys);
+}
+
+static inline int dt_lookup(const struct lu_env *env,
+			    struct dt_object *dt,
+			    struct dt_rec *rec,
+			    const struct dt_key *key,
+			    struct lustre_capa *capa)
+{
+	int ret;
+
+	LASSERT(dt);
+	LASSERT(dt->do_index_ops);
+	LASSERT(dt->do_index_ops->dio_lookup);
+
+	ret = dt->do_index_ops->dio_lookup(env, dt, rec, key, capa);
+	if (ret > 0)
+		ret = 0;
+	else if (ret == 0)
+		ret = -ENOENT;
+	return ret;
+}
+
+#define LU221_BAD_TIME (0x80000000U + 24 * 3600)
+
+struct dt_find_hint {
+	struct lu_fid	*dfh_fid;
+	struct dt_device     *dfh_dt;
+	struct dt_object     *dfh_o;
+};
+
+struct dt_thread_info {
+	char		     dti_buf[DT_MAX_PATH];
+	struct dt_find_hint      dti_dfh;
+	struct lu_attr	   dti_attr;
+	struct lu_fid	    dti_fid;
+	struct dt_object_format  dti_dof;
+	struct lustre_mdt_attrs  dti_lma;
+	struct lu_buf	    dti_lb;
+	loff_t		   dti_off;
+};
+
+extern struct lu_context_key dt_key;
+
+static inline struct dt_thread_info *dt_info(const struct lu_env *env)
+{
+	struct dt_thread_info *dti;
+
+	dti = lu_context_key_get(&env->le_ctx, &dt_key);
+	LASSERT(dti);
+	return dti;
+}
+
+int dt_global_init(void);
+void dt_global_fini(void);
+
+# ifdef LPROCFS
+int lprocfs_dt_rd_blksize(char *page, char **start, off_t off,
+			  int count, int *eof, void *data);
+int lprocfs_dt_rd_kbytestotal(char *page, char **start, off_t off,
+			      int count, int *eof, void *data);
+int lprocfs_dt_rd_kbytesfree(char *page, char **start, off_t off,
+			     int count, int *eof, void *data);
+int lprocfs_dt_rd_kbytesavail(char *page, char **start, off_t off,
+			      int count, int *eof, void *data);
+int lprocfs_dt_rd_filestotal(char *page, char **start, off_t off,
+			     int count, int *eof, void *data);
+int lprocfs_dt_rd_filesfree(char *page, char **start, off_t off,
+			    int count, int *eof, void *data);
+# endif /* LPROCFS */
+
+#endif /* __LUSTRE_DT_OBJECT_H */
diff --git a/drivers/staging/lustre/lustre/include/interval_tree.h b/drivers/staging/lustre/lustre/include/interval_tree.h
new file mode 100644
index 000000000000..dfdb8aa4e035
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/interval_tree.h
@@ -0,0 +1,124 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/interval_tree.h
+ *
+ * Author: Huang Wei <huangwei@clusterfs.com>
+ * Author: Jay Xiong <jinshan.xiong@sun.com>
+ */
+
+#ifndef _INTERVAL_H__
+#define _INTERVAL_H__
+
+#include <linux/libcfs/libcfs.h>   /* LASSERT. */
+
+struct interval_node {
+	struct interval_node   *in_left;
+	struct interval_node   *in_right;
+	struct interval_node   *in_parent;
+	unsigned		in_color:1,
+				in_intree:1, /** set if the node is in tree */
+				in_res1:30;
+	__u8		    in_res2[4];  /** tags, 8-bytes aligned */
+	__u64		   in_max_high;
+	struct interval_node_extent {
+		__u64 start;
+		__u64 end;
+	} in_extent;
+};
+
+enum interval_iter {
+	INTERVAL_ITER_CONT = 1,
+	INTERVAL_ITER_STOP = 2
+};
+
+static inline int interval_is_intree(struct interval_node *node)
+{
+	return node->in_intree == 1;
+}
+
+static inline __u64 interval_low(struct interval_node *node)
+{
+	return node->in_extent.start;
+}
+
+static inline __u64 interval_high(struct interval_node *node)
+{
+	return node->in_extent.end;
+}
+
+static inline void interval_set(struct interval_node *node,
+				__u64 start, __u64 end)
+{
+	LASSERT(start <= end);
+	node->in_extent.start = start;
+	node->in_extent.end = end;
+	node->in_max_high = end;
+}
+
+/* Rules to write an interval callback.
+ *  - the callback returns INTERVAL_ITER_STOP when it thinks the iteration
+ *    should be stopped. It will then cause the iteration function to return
+ *    immediately with return value INTERVAL_ITER_STOP.
+ *  - callbacks for interval_iterate and interval_iterate_reverse: every
+ *    node in the tree will be set to @node before the callback is called
+ *  - callback for interval_search: only overlapping nodes will be set to
+ *    @node before the callback is called.
+ */
+typedef enum interval_iter (*interval_callback_t)(struct interval_node *node,
+						  void *args);
+
+struct interval_node *interval_insert(struct interval_node *node,
+				      struct interval_node **root);
+void interval_erase(struct interval_node *node, struct interval_node **root);
+
+/* Search the extents in the tree and call @func for each overlapped
+ * extents. */
+enum interval_iter interval_search(struct interval_node *root,
+				   struct interval_node_extent *ex,
+				   interval_callback_t func, void *data);
+
+/* Iterate every node in the tree - by reverse order or regular order. */
+enum interval_iter interval_iterate(struct interval_node *root,
+				    interval_callback_t func, void *data);
+enum interval_iter interval_iterate_reverse(struct interval_node *root,
+				    interval_callback_t func,void *data);
+
+void interval_expand(struct interval_node *root,
+		     struct interval_node_extent *ext,
+		     struct interval_node_extent *limiter);
+int interval_is_overlapped(struct interval_node *root,
+			   struct interval_node_extent *ex);
+struct interval_node *interval_find(struct interval_node *root,
+				    struct interval_node_extent *ex);
+#endif
diff --git a/drivers/staging/lustre/lustre/include/ioctl.h b/drivers/staging/lustre/lustre/include/ioctl.h
new file mode 100644
index 000000000000..227c261b2ae9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/ioctl.h
@@ -0,0 +1,106 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _IOWR
+
+/* On i386 and x86_64, _ASM_I386_IOCTL_H is defined by the kernel's ioctl.h,
+ * and on newer kernels this header is shared as _ASM_GENERIC_IOCTL_H.
+ *
+ * We can avoid any problems with the kernel header being included again by
+ * defining _ASM_I386_IOCTL_H here so that a later occurrence of <asm/ioctl.h>
+ * does not include the kernel's ioctl.h after this one. b=14746 */
+#define _ASM_I386_IOCTL_H
+#define _ASM_GENERIC_IOCTL_H
+
+/* ioctl command encoding: 32 bits total, command in lower 16 bits,
+ * size of the parameter structure in the lower 14 bits of the
+ * upper 16 bits.
+ * Encoding the size of the parameter structure in the ioctl request
+ * The highest 2 bits are reserved for indicating the ``access mode''.
+ * NOTE: This limits the max parameter size to 16kB -1 !
+ */
+
+/*
+ * The following is for compatibility across the various Linux
+ * platforms.  The i386 ioctl numbering scheme doesn't really enforce
+ * a type field.  De facto, however, the top 8 bits of the lower 16
+ * bits are indeed used as a type field, so we might just as well make
+ * this explicit here.  Please be sure to use the decoding macros
+ * below from now on.
+ */
+#define _IOC_NRBITS     8
+#define _IOC_TYPEBITS   8
+#define _IOC_SIZEBITS   14
+#define _IOC_DIRBITS    2
+
+#define _IOC_NRMASK     ((1 << _IOC_NRBITS)-1)
+#define _IOC_TYPEMASK   ((1 << _IOC_TYPEBITS)-1)
+#define _IOC_SIZEMASK   ((1 << _IOC_SIZEBITS)-1)
+#define _IOC_DIRMASK    ((1 << _IOC_DIRBITS)-1)
+
+#define _IOC_NRSHIFT    0
+#define _IOC_TYPESHIFT  (_IOC_NRSHIFT+_IOC_NRBITS)
+#define _IOC_SIZESHIFT  (_IOC_TYPESHIFT+_IOC_TYPEBITS)
+#define _IOC_DIRSHIFT   (_IOC_SIZESHIFT+_IOC_SIZEBITS)
+
+/*
+ * Direction bits.
+ */
+#define _IOC_NONE       0U
+#define _IOC_WRITE      1U
+#define _IOC_READ       2U
+
+#define _IOC(dir,type,nr,size) (((dir)  << _IOC_DIRSHIFT) | ((type) << _IOC_TYPESHIFT) | ((nr)   << _IOC_NRSHIFT) | ((size) << _IOC_SIZESHIFT))
+
+/* used to create numbers */
+#define _IO(type,nr)	    _IOC(_IOC_NONE,(type),(nr),0)
+#define _IOR(type,nr,size)      _IOC(_IOC_READ,(type),(nr),sizeof(size))
+#define _IOW(type,nr,size)      _IOC(_IOC_WRITE,(type),(nr),sizeof(size))
+#define _IOWR(type,nr,size)     _IOC(_IOC_READ|_IOC_WRITE,(type),(nr),sizeof(size))
+
+/* used to decode ioctl numbers.. */
+#define _IOC_DIR(nr)	    (((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK)
+#define _IOC_TYPE(nr)	   (((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK)
+#define _IOC_NR(nr)	     (((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK)
+#define _IOC_SIZE(nr)	   (((nr) >> _IOC_SIZESHIFT) & _IOC_SIZEMASK)
+
+/* ...and for the drivers/sound files... */
+
+#define IOC_IN	  (_IOC_WRITE << _IOC_DIRSHIFT)
+#define IOC_OUT	 (_IOC_READ << _IOC_DIRSHIFT)
+#define IOC_INOUT       ((_IOC_WRITE|_IOC_READ) << _IOC_DIRSHIFT)
+#define IOCSIZE_MASK    (_IOC_SIZEMASK << _IOC_SIZESHIFT)
+#define IOCSIZE_SHIFT   (_IOC_SIZESHIFT)
+
+#endif /* _IOWR */
diff --git a/drivers/staging/lustre/lustre/include/lclient.h b/drivers/staging/lustre/lustre/include/lclient.h
new file mode 100644
index 000000000000..d00600ce208f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lclient.h
@@ -0,0 +1,441 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Definitions shared between vvp and liblustre, and other clients in the
+ * future.
+ *
+ *   Author: Oleg Drokin <oleg.drokin@sun.com>
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#ifndef LCLIENT_H
+#define LCLIENT_H
+
+blkcnt_t dirty_cnt(struct inode *inode);
+
+int cl_glimpse_size0(struct inode *inode, int agl);
+int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io,
+		    struct inode *inode, struct cl_object *clob, int agl);
+
+static inline int cl_glimpse_size(struct inode *inode)
+{
+	return cl_glimpse_size0(inode, 0);	/* forwards with agl == 0 */
+}
+
+static inline int cl_agl(struct inode *inode)
+{
+	return cl_glimpse_size0(inode, 1);	/* forwards with agl == 1 */
+}
+
+/**
+ * Locking policy for setattr.
+ */
+enum ccc_setattr_lock_type {
+	/** Locking is done by server */
+	SETATTR_NOLOCK,
+	/** Extent lock is enqueued */
+	SETATTR_EXTENT_LOCK,
+	/** Existing local extent lock is used */
+	SETATTR_MATCH_LOCK
+};
+
+
+/**
+ * IO state private to vvp or slp layers.
+ */
+struct ccc_io {
+	/** super class */
+	struct cl_io_slice     cui_cl;
+	struct cl_io_lock_link cui_link;
+	/**
+	 * I/O vector information to or from which read/write is going.
+	 */
+	struct iovec *cui_iov;
+	unsigned long cui_nrsegs;
+	/**
+	 * Total iov count for left IO.
+	 */
+	unsigned long cui_tot_nrsegs;
+	/**
+	 * Old length for iov that was truncated partially.
+	 */
+	size_t cui_iov_olen;
+	/**
+	 * Total size for the left IO.
+	 */
+	size_t cui_tot_count;
+
+	union {
+		struct {
+			enum ccc_setattr_lock_type cui_local_lock;
+		} setattr;
+	} u;
+	/**
+	 * True iff io is processing glimpse right now.
+	 */
+	int		  cui_glimpse;
+	/**
+	 * Layout version when this IO is initialized
+	 */
+	__u32		cui_layout_gen;
+	/**
+	 * File descriptor against which IO is done.
+	 */
+	struct ll_file_data *cui_fd;
+	struct kiocb *cui_iocb;
+};
+
+/**
+ * True if \a io is a normal io, false for others (sendfile, splice*).
+ * Must be implemented in arch-specific code.
+ */
+int cl_is_normalio(const struct lu_env *env, const struct cl_io *io);
+
+extern struct lu_context_key ccc_key;
+extern struct lu_context_key ccc_session_key;
+
+struct ccc_thread_info {
+	struct cl_lock_descr cti_descr;
+	struct cl_io	 cti_io;
+	struct cl_attr       cti_attr;
+};
+
+static inline struct ccc_thread_info *ccc_env_info(const struct lu_env *env)
+{
+	struct ccc_thread_info *cti = lu_context_key_get(&env->le_ctx,
+							 &ccc_key);
+
+	LASSERT(cti != NULL);	/* ccc_key must have a value in le_ctx */
+	return cti;
+}
+
+static inline struct cl_attr *ccc_env_thread_attr(const struct lu_env *env)
+{
+	struct cl_attr *zeroed = &ccc_env_info(env)->cti_attr;
+
+	return memset(zeroed, 0, sizeof(*zeroed));	/* cleared on every call */
+}
+
+static inline struct cl_io *ccc_env_thread_io(const struct lu_env *env)
+{
+	struct cl_io *zeroed = &ccc_env_info(env)->cti_io;
+
+	return memset(zeroed, 0, sizeof(*zeroed));	/* cleared on every call */
+}
+
+struct ccc_session {
+	struct ccc_io cs_ios;
+};
+
+static inline struct ccc_session *ccc_env_session(const struct lu_env *env)
+{
+	struct ccc_session *cs = lu_context_key_get(env->le_ses,
+						    &ccc_session_key);
+
+	LASSERT(cs != NULL);	/* ccc_session_key must have a value in le_ses */
+	return cs;
+}
+
+static inline struct ccc_io *ccc_env_io(const struct lu_env *env)
+{
+	return &ccc_env_session(env)->cs_ios;	/* ccc_io embedded in the session */
+}
+
+/**
+ * ccc-private object state.
+ */
+struct ccc_object {
+	struct cl_object_header cob_header;
+	struct cl_object	cob_cl;
+	struct inode	   *cob_inode;
+
+	/**
+	 * A list of dirty pages pending IO in the cache. Used by
+	 * SOM. Protected by ll_inode_info::lli_lock.
+	 *
+	 * \see ccc_page::cpg_pending_linkage
+	 */
+	struct list_head	     cob_pending_list;
+
+	/**
+	 * Access to this counter is protected by inode->i_sem. Now that
+	 * the lifetime of transient pages must be covered by inode sem,
+	 * we don't need to hold any lock.
+	 */
+	int		     cob_transient_pages;
+	/**
+	 * Number of outstanding mmaps on this file.
+	 *
+	 * \see ll_vm_open(), ll_vm_close().
+	 */
+	atomic_t	    cob_mmap_cnt;
+
+	/**
+	 * various flags
+	 * cob_discard_page_warned
+	 *     if pages belonging to this object are discarded when a client
+	 * is evicted, some debug info will be printed, this flag will be set
+	 * during processing the first discarded page, then avoid flooding
+	 * debug message for lots of discarded pages.
+	 *
+	 * \see ll_dirty_page_discard_warn.
+	 */
+	unsigned int		cob_discard_page_warned:1;
+};
+
+/**
+ * ccc-private page state.
+ */
+struct ccc_page {
+	struct cl_page_slice cpg_cl;
+	int		  cpg_defer_uptodate;
+	int		  cpg_ra_used;
+	int		  cpg_write_queued;
+	/**
+	 * Non-empty iff this page is already counted in
+	 * ccc_object::cob_pending_list. Protected by
+	 * ccc_object::cob_pending_guard. This list is only used as a flag,
+	 * that is, never iterated through, only checked for list_empty(), but
+	 * having a list is useful for debugging.
+	 */
+	struct list_head	   cpg_pending_linkage;
+	/** VM page */
+	struct page	  *cpg_page;
+};
+
+static inline struct ccc_page *cl2ccc_page(const struct cl_page_slice *slice)
+{	/* \a slice must be the cpg_cl member embedded in a ccc_page */
+	return container_of(slice, struct ccc_page, cpg_cl);
+}
+
+struct cl_page    *ccc_vmpage_page_transient(struct page *vmpage);
+
+struct ccc_device {
+	struct cl_device    cdv_cl;
+	struct super_block *cdv_sb;
+	struct cl_device   *cdv_next;
+};
+
+struct ccc_lock {
+	struct cl_lock_slice clk_cl;
+};
+
+struct ccc_req {
+	struct cl_req_slice  crq_cl;
+};
+
+void *ccc_key_init	(const struct lu_context *ctx,
+			   struct lu_context_key *key);
+void  ccc_key_fini	(const struct lu_context *ctx,
+			   struct lu_context_key *key, void *data);
+void *ccc_session_key_init(const struct lu_context *ctx,
+			   struct lu_context_key *key);
+void  ccc_session_key_fini(const struct lu_context *ctx,
+			   struct lu_context_key *key, void *data);
+
+int	      ccc_device_init  (const struct lu_env *env,
+				   struct lu_device *d,
+				   const char *name, struct lu_device *next);
+struct lu_device *ccc_device_fini (const struct lu_env *env,
+				   struct lu_device *d);
+struct lu_device *ccc_device_alloc(const struct lu_env *env,
+				   struct lu_device_type *t,
+				   struct lustre_cfg *cfg,
+				   const struct lu_device_operations *luops,
+				   const struct cl_device_operations *clops);
+struct lu_device *ccc_device_free (const struct lu_env *env,
+				   struct lu_device *d);
+struct lu_object *ccc_object_alloc(const struct lu_env *env,
+				   const struct lu_object_header *hdr,
+				   struct lu_device *dev,
+				   const struct cl_object_operations *clops,
+				   const struct lu_object_operations *luops);
+
+int ccc_req_init(const struct lu_env *env, struct cl_device *dev,
+		 struct cl_req *req);
+void ccc_umount(const struct lu_env *env, struct cl_device *dev);
+int ccc_global_init(struct lu_device_type *device_type);
+void ccc_global_fini(struct lu_device_type *device_type);
+int ccc_object_init0(const struct lu_env *env,struct ccc_object *vob,
+		     const struct cl_object_conf *conf);
+int ccc_object_init(const struct lu_env *env, struct lu_object *obj,
+		    const struct lu_object_conf *conf);
+void ccc_object_free(const struct lu_env *env, struct lu_object *obj);
+int ccc_lock_init(const struct lu_env *env, struct cl_object *obj,
+		  struct cl_lock *lock, const struct cl_io *io,
+		  const struct cl_lock_operations *lkops);
+int ccc_attr_set(const struct lu_env *env, struct cl_object *obj,
+		 const struct cl_attr *attr, unsigned valid);
+int ccc_object_glimpse(const struct lu_env *env,
+		       const struct cl_object *obj, struct ost_lvb *lvb);
+int ccc_conf_set(const struct lu_env *env, struct cl_object *obj,
+		 const struct cl_object_conf *conf);
+struct page *ccc_page_vmpage(const struct lu_env *env,
+			    const struct cl_page_slice *slice);
+int ccc_page_is_under_lock(const struct lu_env *env,
+			   const struct cl_page_slice *slice, struct cl_io *io);
+int ccc_fail(const struct lu_env *env, const struct cl_page_slice *slice);
+void ccc_transient_page_verify(const struct cl_page *page);
+int  ccc_transient_page_own(const struct lu_env *env,
+			    const struct cl_page_slice *slice,
+			    struct cl_io *io, int nonblock);
+void ccc_transient_page_assume(const struct lu_env *env,
+			       const struct cl_page_slice *slice,
+			       struct cl_io *io);
+void ccc_transient_page_unassume(const struct lu_env *env,
+				 const struct cl_page_slice *slice,
+				 struct cl_io *io);
+void ccc_transient_page_disown(const struct lu_env *env,
+			       const struct cl_page_slice *slice,
+			       struct cl_io *io);
+void ccc_transient_page_discard(const struct lu_env *env,
+				const struct cl_page_slice *slice,
+				struct cl_io *io);
+int ccc_transient_page_prep(const struct lu_env *env,
+			    const struct cl_page_slice *slice,
+			    struct cl_io *io);
+void ccc_lock_delete(const struct lu_env *env,
+		     const struct cl_lock_slice *slice);
+void ccc_lock_fini(const struct lu_env *env,struct cl_lock_slice *slice);
+int ccc_lock_enqueue(const struct lu_env *env,const struct cl_lock_slice *slice,
+		     struct cl_io *io, __u32 enqflags);
+int ccc_lock_unuse(const struct lu_env *env,const struct cl_lock_slice *slice);
+int ccc_lock_wait(const struct lu_env *env,const struct cl_lock_slice *slice);
+int ccc_lock_fits_into(const struct lu_env *env,
+		       const struct cl_lock_slice *slice,
+		       const struct cl_lock_descr *need,
+		       const struct cl_io *io);
+void ccc_lock_state(const struct lu_env *env,
+		    const struct cl_lock_slice *slice,
+		    enum cl_lock_state state);
+
+void ccc_io_fini(const struct lu_env *env, const struct cl_io_slice *ios);
+int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io,
+			  __u32 enqflags, enum cl_lock_mode mode,
+			  pgoff_t start, pgoff_t end);
+int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io,
+		    __u32 enqflags, enum cl_lock_mode mode,
+		    loff_t start, loff_t end);
+void ccc_io_end(const struct lu_env *env, const struct cl_io_slice *ios);
+void ccc_io_advance(const struct lu_env *env, const struct cl_io_slice *ios,
+		    size_t nob);
+void ccc_io_update_iov(const struct lu_env *env, struct ccc_io *cio,
+		       struct cl_io *io);
+int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
+		  struct cl_io *io, loff_t start, size_t count, int *exceed);
+void ccc_req_completion(const struct lu_env *env,
+			const struct cl_req_slice *slice, int ioret);
+void ccc_req_attr_set(const struct lu_env *env,const struct cl_req_slice *slice,
+		      const struct cl_object *obj,
+		      struct cl_req_attr *oa, obd_valid flags);
+
+struct lu_device   *ccc2lu_dev      (struct ccc_device *vdv);
+struct lu_object   *ccc2lu	  (struct ccc_object *vob);
+struct ccc_device  *lu2ccc_dev      (const struct lu_device *d);
+struct ccc_device  *cl2ccc_dev      (const struct cl_device *d);
+struct ccc_object  *lu2ccc	  (const struct lu_object *obj);
+struct ccc_object  *cl2ccc	  (const struct cl_object *obj);
+struct ccc_lock    *cl2ccc_lock     (const struct cl_lock_slice *slice);
+struct ccc_io      *cl2ccc_io       (const struct lu_env *env,
+				     const struct cl_io_slice *slice);
+struct ccc_req     *cl2ccc_req      (const struct cl_req_slice *slice);
+struct page	 *cl2vm_page      (const struct cl_page_slice *slice);
+struct inode       *ccc_object_inode(const struct cl_object *obj);
+struct ccc_object  *cl_inode2ccc    (struct inode *inode);
+
+int cl_setattr_ost(struct inode *inode, const struct iattr *attr,
+		   struct obd_capa *capa);
+
+struct cl_page *ccc_vmpage_page_transient(struct page *vmpage);
+int ccc_object_invariant(const struct cl_object *obj);
+int cl_file_inode_init(struct inode *inode, struct lustre_md *md);
+void cl_inode_fini(struct inode *inode);
+int cl_local_size(struct inode *inode);
+
+__u16 ll_dirent_type_get(struct lu_dirent *ent);
+__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32);
+__u32 cl_fid_build_gen(const struct lu_fid *fid);
+
+# define CLOBINVRNT(env, clob, expr)				    \
+	((void)sizeof(env), (void)sizeof(clob), (void)sizeof !!(expr))
+
+int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp);
+int cl_ocd_update(struct obd_device *host,
+		  struct obd_device *watched,
+		  enum obd_notify_event ev, void *owner, void *data);
+
+struct ccc_grouplock {
+	struct lu_env   *cg_env;
+	struct cl_io    *cg_io;
+	struct cl_lock  *cg_lock;
+	unsigned long    cg_gid;
+};
+
+int  cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock,
+		      struct ccc_grouplock *cg);
+void cl_put_grouplock(struct ccc_grouplock *cg);
+
+/**
+ * New interfaces to get and put lov_stripe_md from lov layer. This violates
+ * layering because lov_stripe_md is supposed to be a private data in lov.
+ *
+ * NB: If you find you have to use these interfaces for your new code, please
+ * think about it again. These interfaces may be removed in the future for
+ * better layering. */
+struct lov_stripe_md *lov_lsm_get(struct cl_object *clobj);
+void lov_lsm_put(struct cl_object *clobj, struct lov_stripe_md *lsm);
+int lov_read_and_clear_async_rc(struct cl_object *clob);
+
+struct lov_stripe_md *ccc_inode_lsm_get(struct inode *inode);
+void ccc_inode_lsm_put(struct inode *inode, struct lov_stripe_md *lsm);
+
+/**
+ * Data structure managing a client's cached pages. A count of
+ * "unstable" pages is maintained, and an LRU of clean pages is
+ * maintained. "unstable" pages are pages pinned by the ptlrpc
+ * layer for recovery purposes.
+ */
+struct cl_client_cache {
+	atomic_t	ccc_users;    /* # of users (OSCs) of this data */
+	struct list_head	ccc_lru;      /* LRU list of cached clean pages */
+	spinlock_t	ccc_lru_lock; /* lock for list */
+	atomic_t	ccc_lru_left; /* # of LRU entries available */
+	unsigned long	ccc_lru_max;  /* Max # of LRU entries possible */
+	unsigned int	ccc_lru_shrinkers; /* # of threads reclaiming */
+	atomic_t	ccc_unstable_nr;    /* # of unstable pages pinned */
+	wait_queue_head_t	ccc_unstable_waitq; /* Signaled on BRW commit */
+};
+
+#endif /*LCLIENT_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lprocfs_status.h b/drivers/staging/lustre/lustre/include/linux/lprocfs_status.h
new file mode 100644
index 000000000000..586692272d78
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lprocfs_status.h
@@ -0,0 +1,58 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lprocfs_status.h
+ *
+ * Top level header file for LProc SNMP
+ *
+ * Author: Hariharan Thantry thantry@users.sourceforge.net
+ */
+#ifndef _LINUX_LPROCFS_SNMP_H
+#define _LINUX_LPROCFS_SNMP_H
+
+#ifndef _LPROCFS_SNMP_H
+#error Do not #include this file directly. #include <lprocfs_status.h> instead
+#endif
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/version.h>
+#include <linux/smp.h>
+#include <linux/rwsem.h>
+#include <linux/libcfs/libcfs.h>
+#include <linux/statfs.h>
+
+
+#endif /* _LINUX_LPROCFS_SNMP_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_acl.h b/drivers/staging/lustre/lustre/include/linux/lustre_acl.h
new file mode 100644
index 000000000000..ff4fc4ff2894
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_acl.h
@@ -0,0 +1,66 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_acl.h
+ *
+ * MDS data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_LINUX_ACL_H
+#define _LUSTRE_LINUX_ACL_H
+
+#ifndef	_LUSTRE_ACL_H
+#error	Should not include directly. use #include <lustre_acl.h> instead
+#endif /* _LUSTRE_ACL_H: only the generic lustre_acl.h may pull this in */
+
+# include <linux/fs.h>
+# include <linux/dcache.h>
+# ifdef CONFIG_FS_POSIX_ACL
+#  include <linux/posix_acl_xattr.h>
+#  define LUSTRE_POSIX_ACL_MAX_ENTRIES	32
+#  define LUSTRE_POSIX_ACL_MAX_SIZE					\
+	(sizeof(posix_acl_xattr_header) +				\
+	 LUSTRE_POSIX_ACL_MAX_ENTRIES * sizeof(posix_acl_xattr_entry))
+# endif /* CONFIG_FS_POSIX_ACL */
+# include <linux/lustre_intent.h>
+# include <linux/xattr.h> /* XATTR_{REPLACE,CREATE} */
+
+#ifndef LUSTRE_POSIX_ACL_MAX_SIZE
+# define LUSTRE_POSIX_ACL_MAX_SIZE   0
+#endif
+
+#endif /* _LUSTRE_LINUX_ACL_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_common.h b/drivers/staging/lustre/lustre/include/linux/lustre_common.h
new file mode 100644
index 000000000000..d1783a33d8ca
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_common.h
@@ -0,0 +1,22 @@
+#ifndef LUSTRE_COMMON_H
+#define LUSTRE_COMMON_H
+
+#include <linux/sched.h>
+
+/*
+ * Install an empty (zero-entry) group_info as the current task's groups.
+ * Returns 0 on success, -ENOMEM if the allocation fails, or the error
+ * reported by set_current_groups().
+ */
+static inline int cfs_cleanup_group_info(void)
+{
+	struct group_info *ginfo = groups_alloc(0);
+	int rc;
+	if (!ginfo)
+		return -ENOMEM;
+	rc = set_current_groups(ginfo);	/* may fail; do not report success */
+	put_group_info(ginfo);		/* drop our groups_alloc() reference */
+	return rc;
+}
+
+#define ll_inode_blksize(a)		(1<<(a)->i_blkbits)
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h b/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h
new file mode 100644
index 000000000000..dff04688945b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h
@@ -0,0 +1,349 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_COMPAT25_H
+#define _LINUX_COMPAT25_H
+
+#include <linux/fs_struct.h>
+#include <linux/namei.h>
+#include <linux/libcfs/linux/portals_compat25.h>
+
+#include <linux/lustre_patchless_compat.h>
+
+# define LOCK_FS_STRUCT(fs)	spin_lock(&(fs)->lock)
+# define UNLOCK_FS_STRUCT(fs)	spin_unlock(&(fs)->lock)
+
+/* Make (\a mnt, \a dentry) the pwd of \a fs and drop the old pwd. */
+static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt,
+				 struct dentry *dentry)
+{
+	struct path new_pwd = { .mnt = mnt, .dentry = dentry };
+	struct path prev;
+
+	LOCK_FS_STRUCT(fs);
+	prev = fs->pwd;
+	path_get(&new_pwd);	/* reference that fs->pwd will hold */
+	fs->pwd = new_pwd;
+	UNLOCK_FS_STRUCT(fs);
+
+	/* the previous pwd is released only after fs->lock is dropped */
+	if (prev.dentry)
+		path_put(&prev);
+}
+
+
+/*
+ * set ATTR_BLOCKS to a high value to avoid any risk of collision with other
+ * ATTR_* attributes (see bug 13828)
+ */
+#define ATTR_BLOCKS    (1 << 27)
+
+#define current_ngroups current_cred()->group_info->ngroups
+#define current_groups current_cred()->group_info->small_block
+
+/*
+ * OBD needs a working random driver, thus all our
+ * initialization routines must be called after device
+ * driver initialization.
+ */
+#ifndef MODULE
+#undef module_init
+#define module_init(a)     late_initcall(a)
+#endif
+
+
+#define LTIME_S(time)		   ((time).tv_sec) /* lvalue; arg may be any expr */
+
+#define ll_permission(inode,mask,nd)    inode_permission(inode,mask)
+
+# define ll_generic_permission(inode, mask, flags, check_acl) \
+	 generic_permission(inode, mask)
+
+#define ll_blkdev_put(a, b) blkdev_put(a, b)
+
+#define ll_dentry_open(a,b,c)	dentry_open(a,b,c)
+
+#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \
+		       vfs_symlink(dir, dentry, path)
+
+
+#define ll_generic_file_llseek_size(file, offset, origin, maxbytes, eof) \
+		generic_file_llseek_size(file, offset, origin, maxbytes, eof)
+
+/* inode_dio_wait(i) use as-is for write lock */
+# define inode_dio_write_done(i)	do {} while (0) /* for write unlock */
+# define inode_dio_read(i)		atomic_inc(&(i)->i_dio_count)
+/* inode_dio_done(i) use as-is for read unlock */
+
+#define TREE_READ_LOCK_IRQ(mapping)	spin_lock_irq(&(mapping)->tree_lock)
+#define TREE_READ_UNLOCK_IRQ(mapping)	spin_unlock_irq(&(mapping)->tree_lock)
+
+static inline
+int ll_unregister_blkdev(unsigned int dev, const char *name)
+{
+	unregister_blkdev(dev, name);	/* any result is not propagated */
+	return 0;			/* always reports success */
+}
+
+#define ll_invalidate_bdev(a,b)	 invalidate_bdev((a))
+
+#ifndef FS_HAS_FIEMAP
+#define FS_HAS_FIEMAP			(0)
+#endif
+
+
+
+/* add a lustre compatible layer for crypto API */
+#include <linux/crypto.h>
+#define ll_crypto_hash	  crypto_hash
+#define ll_crypto_cipher	crypto_blkcipher
+#define ll_crypto_alloc_hash(name, type, mask)  crypto_alloc_hash(name, type, mask)
+#define ll_crypto_hash_setkey(tfm, key, keylen) crypto_hash_setkey(tfm, key, keylen)
+#define ll_crypto_hash_init(desc)	       crypto_hash_init(desc)
+#define ll_crypto_hash_update(desc, sl, bytes)  crypto_hash_update(desc, sl, bytes)
+#define ll_crypto_hash_final(desc, out)	 crypto_hash_final(desc, out)
+#define ll_crypto_blkcipher_setkey(tfm, key, keylen) \
+		crypto_blkcipher_setkey(tfm, key, keylen)
+#define ll_crypto_blkcipher_set_iv(tfm, src, len) \
+		crypto_blkcipher_set_iv(tfm, src, len)
+#define ll_crypto_blkcipher_get_iv(tfm, dst, len) \
+		crypto_blkcipher_get_iv(tfm, dst, len)
+#define ll_crypto_blkcipher_encrypt(desc, dst, src, bytes) \
+		crypto_blkcipher_encrypt(desc, dst, src, bytes)
+#define ll_crypto_blkcipher_decrypt(desc, dst, src, bytes) \
+		crypto_blkcipher_decrypt(desc, dst, src, bytes)
+#define ll_crypto_blkcipher_encrypt_iv(desc, dst, src, bytes) \
+		crypto_blkcipher_encrypt_iv(desc, dst, src, bytes)
+#define ll_crypto_blkcipher_decrypt_iv(desc, dst, src, bytes) \
+		crypto_blkcipher_decrypt_iv(desc, dst, src, bytes)
+
+static inline
+struct ll_crypto_cipher *ll_crypto_alloc_blkcipher(const char *name,
+						   u32 type, u32 mask)
+{
+	struct ll_crypto_cipher *tfm = crypto_alloc_blkcipher(name, type, mask);
+
+	return tfm ? tfm : ERR_PTR(-ENOMEM);	/* NULL becomes -ENOMEM */
+}
+
+static inline int ll_crypto_hmac(struct ll_crypto_hash *tfm,
+				 u8 *key, unsigned int *keylen,
+				 struct scatterlist *sg,
+				 unsigned int size, u8 *result)
+{
+	struct hash_desc desc = { .tfm = tfm, .flags = 0 };
+	int err;
+
+	/* set the key (*keylen bytes) first; digest only if that succeeded */
+	err = crypto_hash_setkey(tfm, key, *keylen);
+	if (err == 0)
+		return crypto_hash_digest(&desc, sg, size, result);
+
+	CERROR("failed to hash setkey: %d\n", err);
+	return err;
+}
+static inline
+unsigned int ll_crypto_tfm_alg_max_keysize(struct crypto_blkcipher *tfm)
+{	/* reads cra_blkcipher.max_keysize of the algorithm behind \a tfm */
+	return crypto_blkcipher_tfm(tfm)->__crt_alg->cra_blkcipher.max_keysize;
+}
+static inline
+unsigned int ll_crypto_tfm_alg_min_keysize(struct crypto_blkcipher *tfm)
+{	/* reads cra_blkcipher.min_keysize of the algorithm behind \a tfm */
+	return crypto_blkcipher_tfm(tfm)->__crt_alg->cra_blkcipher.min_keysize;
+}
+
+#define ll_crypto_hash_blocksize(tfm)       crypto_hash_blocksize(tfm)
+#define ll_crypto_hash_digestsize(tfm)      crypto_hash_digestsize(tfm)
+#define ll_crypto_blkcipher_ivsize(tfm)     crypto_blkcipher_ivsize(tfm)
+#define ll_crypto_blkcipher_blocksize(tfm)  crypto_blkcipher_blocksize(tfm)
+#define ll_crypto_free_hash(tfm)	    crypto_free_hash(tfm)
+#define ll_crypto_free_blkcipher(tfm)       crypto_free_blkcipher(tfm)
+
+#define ll_vfs_rmdir(dir,entry,mnt)	     vfs_rmdir(dir,entry)
+#define ll_vfs_mkdir(inode,dir,mnt,mode)	vfs_mkdir(inode,dir,mode)
+#define ll_vfs_link(old,mnt,dir,new,mnt1)       vfs_link(old,dir,new)
+#define ll_vfs_unlink(inode,entry,mnt)	  vfs_unlink(inode,entry)
+#define ll_vfs_mknod(dir,entry,mnt,mode,dev)    vfs_mknod(dir,entry,mode,dev)
+#define ll_security_inode_unlink(dir,entry,mnt) security_inode_unlink(dir,entry)
+#define ll_vfs_rename(old,old_dir,mnt,new,new_dir,mnt1) \
+		vfs_rename(old,old_dir,new,new_dir)
+
+#ifdef for_each_possible_cpu
+#define cfs_for_each_possible_cpu(cpu) for_each_possible_cpu(cpu)
+#elif defined(for_each_cpu)
+#define cfs_for_each_possible_cpu(cpu) for_each_cpu(cpu)
+#endif
+
+#define cfs_bio_io_error(a,b)   bio_io_error((a))
+#define cfs_bio_endio(a,b,c)    bio_endio((a),(c))
+
+#define cfs_fs_pwd(fs)       ((fs)->pwd.dentry)
+#define cfs_fs_mnt(fs)       ((fs)->pwd.mnt)
+#define cfs_path_put(nd)     path_put(&(nd)->path)
+
+
+#ifndef SLAB_DESTROY_BY_RCU
+#define SLAB_DESTROY_BY_RCU 0
+#endif
+
+
+
+/*
+ * Resolve \a name and pass the path to \a sb's quota_on hook; \a remount is unused.
+ */
+static inline int
+ll_quota_on(struct super_block *sb, int off, int ver, char *name, int remount)
+{
+	struct path path;
+	int rc;
+
+	if (!sb->s_qcop->quota_on)
+		return -ENOSYS;
+
+	rc = kern_path(name, LOOKUP_FOLLOW, &path);
+	if (rc)		/* lookup failed: path is not valid, nothing to put */
+		return rc;
+
+	rc = sb->s_qcop->quota_on(sb, off, ver, &path);
+	path_put(&path);
+	return rc;
+}
+
+/*
+ * Invoke \a sb's quota_off hook (-ENOSYS if absent); \a remount is unused.
+ */
+static inline int ll_quota_off(struct super_block *sb, int off, int remount)
+{
+	if (!sb->s_qcop->quota_off)
+		return -ENOSYS;
+	return sb->s_qcop->quota_off(sb, off);
+}
+
+
+# define ll_vfs_dq_init	     dquot_initialize
+# define ll_vfs_dq_drop	     dquot_drop
+# define ll_vfs_dq_transfer	 dquot_transfer
+# define ll_vfs_dq_off(sb, remount) dquot_suspend(sb, -1)
+
+
+
+
+
+#define queue_max_phys_segments(rq)       queue_max_segments(rq)
+#define queue_max_hw_segments(rq)	 queue_max_segments(rq)
+
+#define ll_kmap_atomic(a, b)	kmap_atomic(a)
+#define ll_kunmap_atomic(a, b)	kunmap_atomic(a)
+
+
+#define ll_d_hlist_node hlist_node
+#define ll_d_hlist_empty(list) hlist_empty(list)
+#define ll_d_hlist_entry(ptr, type, name) hlist_entry(ptr.first, type, name)
+#define ll_d_hlist_for_each(tmp, i_dentry) hlist_for_each(tmp, i_dentry)
+#define ll_d_hlist_for_each_entry(dentry, p, i_dentry, alias) \
+	p = NULL; hlist_for_each_entry(dentry, i_dentry, alias)
+
+
+#define bio_hw_segments(q, bio) 0
+
+
+#define ll_pagevec_init(pv, cold)       do {} while (0)
+#define ll_pagevec_add(pv, pg)	  (0)
+#define ll_pagevec_lru_add_file(pv)     do {} while (0)
+
+
+#ifndef QUOTA_OK
+# define QUOTA_OK 0
+#endif
+#ifndef NO_QUOTA
+# define NO_QUOTA (-EDQUOT)
+#endif
+
+#ifndef SEEK_DATA
+#define SEEK_DATA      3       /* seek to the next data */
+#endif
+#ifndef SEEK_HOLE
+#define SEEK_HOLE      4       /* seek to the next hole */
+#endif
+
+#ifndef FMODE_UNSIGNED_OFFSET
+#define FMODE_UNSIGNED_OFFSET	((__force fmode_t)0x2000)
+#endif
+
+#if !defined(_ASM_GENERIC_BITOPS_EXT2_NON_ATOMIC_H_) && !defined(ext2_set_bit)
+# define ext2_set_bit	     __test_and_set_bit_le
+# define ext2_clear_bit	   __test_and_clear_bit_le
+# define ext2_test_bit	    test_bit_le
+# define ext2_find_first_zero_bit find_first_zero_bit_le
+# define ext2_find_next_zero_bit  find_next_zero_bit_le
+#endif
+
+#ifdef ATTR_TIMES_SET
+# define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
+#else
+# define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET)
+#endif
+
+
+
+/*
+ * Since 3.1 the kernel no longer translates the low (access-mode) bits of
+ * nameidata.intent.open.flags into FMODE_xxx, whereas Lustre's
+ * lookup_intent.it_flags expects FMODE_xxx there; convert them here.
+ */
+#include <linux/version.h>
+static inline int ll_namei_to_lookup_intent_flag(int flag)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 1, 0)
+	return flag;
+#else
+	return (flag & ~O_ACCMODE) | OPEN_FMODE(flag);
+#endif
+}
+
+# define ll_mrf_ret void
+# define LL_MRF_RETURN(rc)
+
+#include <linux/fs.h>
+
+# define ll_umode_t	umode_t
+
+#include <linux/dcache.h>
+
+# define ll_dirty_inode(inode, flag)	(inode)->i_sb->s_op->dirty_inode((inode), flag)
+
+#endif /* _LINUX_COMPAT25_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_debug.h b/drivers/staging/lustre/lustre/include/linux/lustre_debug.h
new file mode 100644
index 000000000000..11deac7248ae
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_debug.h
@@ -0,0 +1,47 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LUSTRE_DEBUG_H
+#define _LINUX_LUSTRE_DEBUG_H
+
+#ifndef _LUSTRE_DEBUG_H
+#error Do not #include this file directly. #include <lustre_debug.h> instead
+#endif
+/* Log @page's mapping, index, flags, refcount and private word at @mask. */
+#define LL_CDEBUG_PAGE(mask, page, fmt, arg...)			       \
+	CDEBUG(mask, "page %p map %p index %lu flags %lx count %u priv %0lx: "\
+	       fmt, (page), (page)->mapping, (page)->index,		       \
+	       (long)(page)->flags, page_count(page), page_private(page), ## arg)
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_dlm.h b/drivers/staging/lustre/lustre/include/linux/lustre_dlm.h
new file mode 100644
index 000000000000..207df03f6149
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_dlm.h
@@ -0,0 +1,46 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LUSTRE_DLM_H__
+#define _LINUX_LUSTRE_DLM_H__
+
+#ifndef _LUSTRE_DLM_H__
+#error Do not #include this file directly. #include <lustre_dlm.h> instead
+#endif
+
+# include <linux/proc_fs.h>
+#  include <asm/processor.h>
+#  include <linux/bit_spinlock.h>
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_fsfilt.h b/drivers/staging/lustre/lustre/include/linux/lustre_fsfilt.h
new file mode 100644
index 000000000000..6c7260957383
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_fsfilt.h
@@ -0,0 +1,181 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_fsfilt.h
+ *
+ * Filesystem interface helper.
+ */
+
+#ifndef _LINUX_LUSTRE_FSFILT_H
+#define _LINUX_LUSTRE_FSFILT_H
+
+#ifndef _LUSTRE_FSFILT_H
+#error Do not #include this file directly. #include <lustre_fsfilt.h> instead
+#endif
+
+
+#include <obd.h>
+#include <obd_class.h>
+
+typedef void (*fsfilt_cb_t)(struct obd_device *obd, __u64 last_rcvd,
+			    void *data, int error);
+
+struct fsfilt_operations {	/* filesystem backend operation table */
+	struct list_head fs_list;
+	module_t *fs_owner;
+	char   *fs_type;
+	char   *(* fs_getlabel)(struct super_block *sb); /* may be NULL */
+	void   *(* fs_start)(struct inode *inode, int op, void *desc_private,
+			     int logs);
+	int     (* fs_commit)(struct inode *inode, void *handle,int force_sync);
+	int     (* fs_map_inode_pages)(struct inode *inode, struct page **page,
+				       int pages, unsigned long *blocks,
+				       int create, struct mutex *sem);
+	int     (* fs_write_record)(struct file *, void *, int size, loff_t *,
+				    int force_sync);
+	int     (* fs_read_record)(struct file *, void *, int size, loff_t *);
+	int     (* fs_setup)(struct super_block *sb); /* may be NULL */
+};
+
+extern int fsfilt_register_ops(struct fsfilt_operations *fs_ops);
+extern void fsfilt_unregister_ops(struct fsfilt_operations *fs_ops);
+extern struct fsfilt_operations *fsfilt_get_ops(const char *type);
+extern void fsfilt_put_ops(struct fsfilt_operations *fs_ops);
+
+/* Return the backend's filesystem label, or NULL if unsupported or empty. */
+static inline char *fsfilt_get_label(struct obd_device *obd,
+				     struct super_block *sb)
+{
+	char *label = NULL;
+
+	if (obd->obd_fsops->fs_getlabel != NULL)
+		label = obd->obd_fsops->fs_getlabel(sb);	/* call once */
+	return (label != NULL && label[0] != '\0') ? label : NULL;
+}
+
+#define FSFILT_OP_UNLINK		1
+#define FSFILT_OP_CANCEL_UNLINK	 10
+/* Log delays of 15s or more since @start: CDEBUG, then CWARN, then CERROR. */
+#define __fsfilt_check_slow(obd, start, msg)			      \
+do {								      \
+	if (cfs_time_before(jiffies, (start) + 15 * HZ))	      \
+		break;						      \
+	else if (cfs_time_before(jiffies, (start) + 30 * HZ))	      \
+		CDEBUG(D_VFSTRACE, "%s: slow %s %lus\n",	      \
+		       (obd)->obd_name, msg, (jiffies - (start)) / HZ); \
+	else if (cfs_time_before(jiffies, (start) + DISK_TIMEOUT * HZ)) \
+		CWARN("%s: slow %s %lus\n", (obd)->obd_name, msg,     \
+		      (jiffies - (start)) / HZ);		      \
+	else							      \
+		CERROR("%s: slow %s %lus\n", (obd)->obd_name, msg,    \
+		       (jiffies - (start)) / HZ);		      \
+} while (0)
+/* As __fsfilt_check_slow, then restart the clock: @start is overwritten. */
+#define fsfilt_check_slow(obd, start, msg)	      \
+do {						    \
+	__fsfilt_check_slow(obd, start, msg);	   \
+	start = jiffies;				\
+} while (0)
+
+/* Start a journal handle via fs_start; a first handle is recorded in @oti. */
+static inline void *fsfilt_start_log(struct obd_device *obd,
+				     struct inode *inode, int op,
+				     struct obd_trans_info *oti, int logs)
+{
+	unsigned long begin = jiffies;
+	void *outer = (oti != NULL) ? oti->oti_handle : NULL;
+	void *handle = obd->obd_fsops->fs_start(inode, op, outer, logs);
+
+	CDEBUG(D_INFO, "started handle %p (%p)\n", handle, outer);
+	if (oti != NULL && outer == NULL) {
+		/* @oti had no handle yet: remember the one just started */
+		oti->oti_handle = handle;
+	} else if (oti != NULL && handle != outer) {
+		/* a handle other than the one already in @oti is fatal */
+		CERROR("mismatch: parent %p, handle %p, oti %p\n",
+		       outer, handle, oti);
+		LBUG();
+	}
+
+	fsfilt_check_slow(obd, begin, "journal start");
+	return handle;
+}
+
+/* Commit @handle through fs_commit; a slow commit is logged as such. */
+static inline int fsfilt_commit(struct obd_device *obd, struct inode *inode,
+				void *handle, int force_sync)
+{
+	unsigned long now = jiffies;
+	int rc = obd->obd_fsops->fs_commit(inode, handle, force_sync);
+
+	CDEBUG(D_INFO, "committing handle %p\n", handle);
+	fsfilt_check_slow(obd, now, "journal commit");
+	return rc;
+}
+/* Thin wrapper: pass all arguments through to fs_map_inode_pages. */
+static inline int fsfilt_map_inode_pages(struct obd_device *obd,
+					 struct inode *inode,
+					 struct page **page, int pages,
+					 unsigned long *blocks,
+					 int create, struct mutex *mutex)
+{
+	return obd->obd_fsops->fs_map_inode_pages(inode, page, pages, blocks,
+						  create, mutex);
+}
+/* Forward to fs_read_record; NOTE: @size (loff_t) narrows to the op's int. */
+static inline int fsfilt_read_record(struct obd_device *obd, struct file *file,
+				     void *buf, loff_t size, loff_t *offs)
+{
+	return obd->obd_fsops->fs_read_record(file, buf, size, offs);
+}
+/* Forward to fs_write_record; NOTE: @size (loff_t) narrows to the op's int. */
+static inline int fsfilt_write_record(struct obd_device *obd, struct file *file,
+				      void *buf, loff_t size, loff_t *offs,
+				      int force_sync)
+{
+	return obd->obd_fsops->fs_write_record(file, buf, size,offs,force_sync);
+}
+
+/*
+ * Run the backend's optional fs_setup hook on @fs; 0 when there is no hook.
+ */
+static inline int fsfilt_setup(struct obd_device *obd, struct super_block *fs)
+{
+	if (obd->obd_fsops->fs_setup == NULL)
+		return 0;
+	return obd->obd_fsops->fs_setup(fs);
+}
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_handles.h b/drivers/staging/lustre/lustre/include/linux/lustre_handles.h
new file mode 100644
index 000000000000..ecf184051252
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_handles.h
@@ -0,0 +1,53 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_LUSTRE_HANDLES_H_
+#define __LINUX_LUSTRE_HANDLES_H_
+
+#ifndef __LUSTRE_HANDLES_H_
+#error Do not #include this file directly. #include <lustre_handles.h> instead
+#endif
+
+#include <asm/types.h>
+#include <asm/atomic.h>
+#include <linux/list.h>
+#include <linux/version.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include <linux/rcupdate.h> /* for rcu_head{} */
+typedef struct rcu_head cfs_rcu_head_t;
+
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_intent.h b/drivers/staging/lustre/lustre/include/linux/lustre_intent.h
new file mode 100644
index 000000000000..b10ddfa7df29
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_intent.h
@@ -0,0 +1,62 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef LUSTRE_INTENT_H
+#define LUSTRE_INTENT_H
+
+/* The IT_XXX intent values are defined in lustre/include/obd.h. */
+struct lustre_intent_data {
+	int		it_disposition;
+	int		it_status;
+	__u64		it_lock_handle;
+	__u64		it_lock_bits;
+	int		it_lock_mode;
+	int		it_remote_lock_mode;
+	__u64	   it_remote_lock_handle;
+	void	   *it_data;
+	unsigned int    it_lock_set:1;
+};
+
+struct lookup_intent {
+	int     it_op;		/* presumably an IT_XXX value -- confirm */
+	int     it_flags;	/* low bits hold FMODE_xxx values */
+	int     it_create_mode;
+	union {
+		struct lustre_intent_data lustre;
+	} d;
+};
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_lib.h b/drivers/staging/lustre/lustre/include/linux/lustre_lib.h
new file mode 100644
index 000000000000..b2f755acadf6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_lib.h
@@ -0,0 +1,87 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_lib.h
+ *
+ * Basic Lustre library routines.
+ */
+
+#ifndef _LINUX_LUSTRE_LIB_H
+#define _LINUX_LUSTRE_LIB_H
+
+#ifndef _LUSTRE_LIB_H
+#error Do not #include this file directly. #include <lustre_lib.h> instead
+#endif
+
+# include <linux/rwsem.h>
+# include <linux/sched.h>
+# include <linux/signal.h>
+# include <linux/types.h>
+# include <linux/lustre_compat25.h>
+# include <linux/lustre_common.h>
+
+#ifndef LP_POISON
+#if BITS_PER_LONG > 32
+# define LI_POISON ((int)0x5a5a5a5a5a5a5a5a)
+# define LL_POISON ((long)0x5a5a5a5a5a5a5a5a)
+# define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a)
+#else
+# define LI_POISON ((int)0x5a5a5a5a)
+# define LL_POISON ((long)0x5a5a5a5a)
+# define LP_POISON ((void *)(long)0x5a5a5a5a)
+#endif
+#endif
+
+/* This macro is only for compatibility reasons with older Linux Lustre user
+ * tools. New ioctls should NOT use this macro as the ioctl "size". Instead
+ * the ioctl should get a "size" argument which is the actual data type used
+ * by the ioctl, to ensure the ioctl interface is versioned correctly. */
+#define OBD_IOC_DATA_TYPE	       long
+
+#define LUSTRE_FATAL_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) |		\
+			   sigmask(SIGTERM) | sigmask(SIGQUIT) |	       \
+			   sigmask(SIGALRM))
+
+/* Fill @lvb with @inode's size, block count and modify/access/change times. */
+static inline void inode_init_lvb(struct inode *inode, struct ost_lvb *lvb)
+{
+	lvb->lvb_size = i_size_read(inode);
+	lvb->lvb_blocks = inode->i_blocks;
+	lvb->lvb_mtime = LTIME_S(inode->i_mtime);
+	lvb->lvb_atime = LTIME_S(inode->i_atime);
+	lvb->lvb_ctime = LTIME_S(inode->i_ctime);
+}
+
+#endif /* _LINUX_LUSTRE_LIB_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_lite.h b/drivers/staging/lustre/lustre/include/linux/lustre_lite.h
new file mode 100644
index 000000000000..c95dff900b58
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_lite.h
@@ -0,0 +1,100 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LL_H
+#define _LINUX_LL_H
+
+#ifndef _LL_H
+#error Do not #include this file directly. #include <lustre_lite.h> instead
+#endif
+
+
+#include <linux/version.h>
+
+#include <asm/statfs.h>
+
+#include <linux/fs.h>
+#include <linux/dcache.h>
+#include <linux/proc_fs.h>
+
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_ha.h>
+
+#include <linux/rbtree.h>
+#include <linux/lustre_compat25.h>
+#include <linux/lustre_common.h>
+#include <linux/pagemap.h>
+
+/* lprocfs.c */
+enum {
+	 LPROC_LL_DIRTY_HITS = 0,
+	 LPROC_LL_DIRTY_MISSES,
+	 LPROC_LL_READ_BYTES,
+	 LPROC_LL_WRITE_BYTES,
+	 LPROC_LL_BRW_READ,
+	 LPROC_LL_BRW_WRITE,
+	 LPROC_LL_OSC_READ,
+	 LPROC_LL_OSC_WRITE,
+	 LPROC_LL_IOCTL,
+	 LPROC_LL_OPEN,
+	 LPROC_LL_RELEASE,
+	 LPROC_LL_MAP,
+	 LPROC_LL_LLSEEK,
+	 LPROC_LL_FSYNC,
+	 LPROC_LL_READDIR,
+	 LPROC_LL_SETATTR,
+	 LPROC_LL_TRUNC,
+	 LPROC_LL_FLOCK,
+	 LPROC_LL_GETATTR,
+	 LPROC_LL_CREATE,
+	 LPROC_LL_LINK,
+	 LPROC_LL_UNLINK,
+	 LPROC_LL_SYMLINK,
+	 LPROC_LL_MKDIR,
+	 LPROC_LL_RMDIR,
+	 LPROC_LL_MKNOD,
+	 LPROC_LL_RENAME,
+	 LPROC_LL_STAFS,	/* NOTE(review): likely meant STATFS -- confirm */
+	 LPROC_LL_ALLOC_INODE,
+	 LPROC_LL_SETXATTR,
+	 LPROC_LL_GETXATTR,
+	 LPROC_LL_LISTXATTR,
+	 LPROC_LL_REMOVEXATTR,
+	 LPROC_LL_INODE_PERM,
+	 LPROC_LL_FILE_OPCODES	/* last entry: equals the count of those above */
+};
+
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_log.h b/drivers/staging/lustre/lustre/include/linux/lustre_log.h
new file mode 100644
index 000000000000..e9c8e56737d2
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_log.h
@@ -0,0 +1,57 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_log.h
+ *
+ * Generic infrastructure for managing a collection of logs.
+ * These logs are used for:
+ *  - orphan recovery: OST adds record on create
+ *  - mtime/size consistency: the OST adds a record on first write
+ *  - open/unlinked objects: OST adds a record on destroy
+ *
+ *  - mds unlink log: the MDS adds an entry upon delete
+ *
+ *  - raid1 replication log between OST's
+ *  - MDS replication logs
+ */
+
+#ifndef _LINUX_LUSTRE_LOG_H
+#define _LINUX_LUSTRE_LOG_H
+
+#ifndef _LUSTRE_LOG_H
+#error Do not #include this file directly. #include <lustre_log.h> instead
+#endif
+
+#define LUSTRE_LOG_SERVER
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_net.h b/drivers/staging/lustre/lustre/include/linux/lustre_net.h
new file mode 100644
index 000000000000..2d7c425d7012
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_net.h
@@ -0,0 +1,50 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LUSTRE_NET_H
+#define _LINUX_LUSTRE_NET_H
+
+#ifndef _LUSTRE_NET_H
+#error Do not #include this file directly. #include <lustre_net.h> instead
+#endif
+
+#include <linux/version.h>
+#include <linux/workqueue.h>
+
+/* XXX Liang: should be moved to other header instead of here */
+#ifndef WITH_GROUP_INFO
+#define WITH_GROUP_INFO
+#endif
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
new file mode 100644
index 000000000000..f0508084e8c5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
@@ -0,0 +1,85 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef LUSTRE_PATCHLESS_COMPAT_H
+#define LUSTRE_PATCHLESS_COMPAT_H
+
+#include <linux/fs.h>
+
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/hash.h>
+
+
+#define ll_delete_from_page_cache(page) delete_from_page_cache(page)
+
+/* Drop @page from the page cache, provided it still belongs to @mapping. */
+static inline void
+truncate_complete_page(struct address_space *mapping, struct page *page)
+{
+	if (page->mapping == mapping) {
+		/* pages with private state go through invalidatepage first */
+		if (PagePrivate(page))
+			page->mapping->a_ops->invalidatepage(page, 0);
+		cancel_dirty_page(page, PAGE_SIZE);
+		ClearPageMappedToDisk(page);
+		ll_delete_from_page_cache(page);
+	}
+}
+
+#  define d_refcount(d)		 ((d)->d_count)
+
+#ifdef ATTR_OPEN
+# define ATTR_FROM_OPEN ATTR_OPEN
+#else
+# ifndef ATTR_FROM_OPEN
+#  define ATTR_FROM_OPEN 0
+# endif
+#endif /* ATTR_OPEN */
+
+#ifndef ATTR_RAW
+#define ATTR_RAW 0
+#endif
+
+#ifndef ATTR_CTIME_SET
+/*
+ * set ATTR_CTIME_SET to a high value to avoid any risk of collision with other
+ * ATTR_* attributes (see bug 13828)
+ */
+#define ATTR_CTIME_SET (1 << 28)
+#endif
+
+#endif /* LUSTRE_PATCHLESS_COMPAT_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_quota.h b/drivers/staging/lustre/lustre/include/linux/lustre_quota.h
new file mode 100644
index 000000000000..421866b004cf
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_quota.h
@@ -0,0 +1,47 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LUSTRE_QUOTA_H
+#define _LINUX_LUSTRE_QUOTA_H
+
+#ifndef _LUSTRE_QUOTA_H
+#error Do not #include this file directly. #include <lustre_quota.h> instead
+#endif
+
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
+
+#endif /* _LINUX_LUSTRE_QUOTA_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_user.h b/drivers/staging/lustre/lustre/include/linux/lustre_user.h
new file mode 100644
index 000000000000..ebaf92977f7f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_user.h
@@ -0,0 +1,67 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_user.h
+ *
+ * Lustre public user-space interface definitions.
+ */
+
+#ifndef _LINUX_LUSTRE_USER_H
+#define _LINUX_LUSTRE_USER_H
+
+# include <linux/version.h>
+# include <linux/quota.h>
+
+/*
+ * asm-x86_64/processor.h on some SLES 9 distros seems to use
+ * kernel-only typedefs.  fortunately skipping it altogether is ok
+ * (for now).
+ */
+#define __ASM_X86_64_PROCESSOR_H
+
+#include <linux/string.h>
+
+/* The listed targets use plain stat/lstat; all others use the stat64 forms. */
+#if defined(__x86_64__) || defined(__ia64__) || defined(__ppc64__) || \
+    defined(__craynv) || defined(__mips64__) || defined(__powerpc64__)
+typedef struct stat     lstat_t;
+#define lstat_f	 lstat
+#else
+typedef struct stat64   lstat_t;
+#define lstat_f	 lstat64
+#endif
+#define HAVE_LOV_USER_MDS_DATA	/* set on every target */
+
+#endif /* _LINUX_LUSTRE_USER_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lvfs.h b/drivers/staging/lustre/lustre/include/linux/lvfs.h
new file mode 100644
index 000000000000..b4db6cb581bd
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lvfs.h
@@ -0,0 +1,134 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lvfs.h
+ *
+ * lustre VFS/process permission interface
+ */
+
+#ifndef __LINUX_LVFS_H__
+#define __LINUX_LVFS_H__
+
+#ifndef __LVFS_H__
+#error Do not #include this file directly. #include <lvfs.h> instead
+#endif
+
+#include <linux/lustre_compat25.h>
+#include <linux/lustre_common.h>
+#include <linux/lvfs_linux.h>
+
+#define LLOG_LVFS
+
+/* simple.c */
+
+struct lvfs_ucred {
+	__u32		   luc_uid;
+	__u32		   luc_gid;
+	__u32		   luc_fsuid;
+	__u32		   luc_fsgid;
+	kernel_cap_t	luc_cap;
+	__u32		   luc_umask;
+	struct group_info      *luc_ginfo;
+	struct md_identity     *luc_identity;
+};
+
+struct lvfs_callback_ops {
+	struct dentry *(*l_fid2dentry)(__u64 id_ino, __u32 gen, __u64 gr, void *data);
+};
+
+#define OBD_RUN_CTXT_MAGIC      0xC0FFEEAA
+#define OBD_CTXT_DEBUG	  /* development-only debugging */
+struct lvfs_run_ctxt {
+	struct vfsmount	 *pwdmnt;
+	struct dentry	   *pwd;
+	mm_segment_t	     fs;
+	struct lvfs_ucred	luc;
+	int		      ngroups;
+	struct lvfs_callback_ops cb_ops;
+	struct group_info       *group_info;
+	struct dt_device	*dt;
+#ifdef OBD_CTXT_DEBUG
+	__u32		    magic;
+#endif
+};
+
+#ifdef OBD_CTXT_DEBUG	/* ctxt->magic exists: stamp it with OBD_RUN_CTXT_MAGIC */
+#define OBD_SET_CTXT_MAGIC(ctxt) ((ctxt)->magic = OBD_RUN_CTXT_MAGIC)
+#else			/* no magic field compiled in: expand to a no-op */
+#define OBD_SET_CTXT_MAGIC(ctxt) do {} while (0)
+#endif
+
+
+int lustre_rename(struct dentry *dir, struct vfsmount *mnt, char *oldname,
+		  char *newname);
+
+static inline void l_dput(struct dentry *de)
+{
+	/* NULL and IS_ERR() dentries are silently ignored */
+	if (de && !IS_ERR(de)) {
+		LASSERT(d_refcount(de) > 0);
+		dput(de);
+	}
+}
+
+/* We need to hold the inode mutex over the dcache lookup itself, or we
+ * run the risk of entering the filesystem lookup path concurrently on SMP
+ * systems, and instantiating two inodes for the same entry.  We still
+ * protect against concurrent addition/removal races with the DLM locking.
+ */
+static inline struct dentry *ll_lookup_one_len(const char *fid_name,
+					       struct dentry *dparent,
+					       int fid_namelen)
+{
+	struct dentry *found;
+	struct inode *inode;
+
+	mutex_lock(&dparent->d_inode->i_mutex);
+	found = lookup_one_len(fid_name, dparent, fid_namelen);
+	mutex_unlock(&dparent->d_inode->i_mutex);
+
+	/* lookup errors and negative dentries are handed back unchanged */
+	inode = IS_ERR(found) ? NULL : found->d_inode;
+	if (inode == NULL || !is_bad_inode(inode))
+		return found;
+
+	CERROR("bad inode returned %lu/%u\n", inode->i_ino,
+	       inode->i_generation);
+	dput(found);
+	return ERR_PTR(-ENOENT);
+}
+
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lvfs_linux.h b/drivers/staging/lustre/lustre/include/linux/lvfs_linux.h
new file mode 100644
index 000000000000..140a60f1f0c9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lvfs_linux.h
@@ -0,0 +1,66 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LVFS_LINUX_H__
+#define __LVFS_LINUX_H__
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/sched.h>
+
+#include <lvfs.h>
+
+#define l_file file
+#define l_dentry dentry
+
+#define l_filp_open filp_open
+
+struct lvfs_run_ctxt;
+struct l_file *l_dentry_open(struct lvfs_run_ctxt *, struct l_dentry *,
+			     int flags);
+
+struct l_linux_dirent {
+	struct list_head      lld_list;
+	ino_t	   lld_ino;
+	unsigned long   lld_off;
+	char	    lld_name[LL_FID_NAMELEN];
+};
+struct l_readdir_callback {
+	struct l_linux_dirent *lrc_dirent;
+	struct list_head	    *lrc_list;
+};
+
+#endif /*  __LVFS_LINUX_H__ */
diff --git a/drivers/staging/lustre/lustre/include/linux/obd.h b/drivers/staging/lustre/lustre/include/linux/obd.h
new file mode 100644
index 000000000000..2c36c0d19d06
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/obd.h
@@ -0,0 +1,128 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_OBD_H
+#define __LINUX_OBD_H
+
+#ifndef __OBD_H
+#error Do not #include this file directly. #include <obd.h> instead
+#endif
+
+#include <obd_support.h>
+
+# include <linux/fs.h>
+# include <linux/list.h>
+# include <linux/sched.h>  /* for struct task_struct, for current.h */
+# include <linux/proc_fs.h>
+# include <linux/mount.h>
+# include <linux/lustre_intent.h>
+
+struct ll_iattr {
+	struct iattr	iattr;
+	unsigned int	ia_attr_flags;
+};
+
+#define CLIENT_OBD_LIST_LOCK_DEBUG 1
+
+typedef struct {
+	spinlock_t		lock;
+
+	unsigned long       time;
+	struct task_struct *task;
+	const char	 *func;
+	int		 line;
+} client_obd_lock_t;
+
+static inline void __client_obd_list_lock(client_obd_lock_t *lock,
+					  const char *func, int line)
+{
+	unsigned long cur = jiffies;	/* when this caller started waiting */
+	while (1) {
+		if (spin_trylock(&lock->lock)) {	/* got it: record the new owner */
+			LASSERT(lock->task == NULL);
+			lock->task = current;
+			lock->func = func;
+			lock->line = line;
+			lock->time = jiffies;
+			break;
+		}
+		/* report once we waited > 5 * HZ and lock->time is > 5 * HZ old */
+		if ((jiffies - cur > 5 * HZ) &&
+		    (jiffies - lock->time > 5 * HZ)) {
+			struct task_struct *task = lock->task;
+
+			if (task == NULL)	/* no owner recorded right now: retry */
+				continue;
+
+			LCONSOLE_WARN("%s:%d: lock %p was acquired"
+				      " by <%s:%d:%s:%d> for %lu seconds.\n",
+				      current->comm, current->pid,
+				      lock, task->comm, task->pid,
+				      lock->func, lock->line,
+				      (jiffies - lock->time) / HZ);
+			LCONSOLE_WARN("====== for process holding the "
+				      "lock =====\n");
+			libcfs_debug_dumpstack(task);
+			LCONSOLE_WARN("====== for current process =====\n");
+			libcfs_debug_dumpstack(NULL);
+			LCONSOLE_WARN("====== end =======\n");
+			cfs_pause(1000 * HZ);	/* NOTE(review): presumably a long sleep so the report is not repeated - confirm */
+		}
+		cpu_relax();
+	}
+}
+
+#define client_obd_list_lock(lock) \
+	__client_obd_list_lock(lock, __FUNCTION__, __LINE__)
+
+static inline void client_obd_list_unlock(client_obd_lock_t *lock)
+{
+	LASSERT(lock->task != NULL);	/* an owner must have been recorded */
+	lock->task = NULL;		/* clear the owner before releasing */
+	lock->time = jiffies;		/* timestamp of this release */
+	spin_unlock(&lock->lock);
+}
+
+/* Prepare @lock for first use: spinlock initialised, no owner recorded. */
+static inline void client_obd_list_lock_init(client_obd_lock_t *lock)
+{
+	spin_lock_init(&lock->lock);
+	lock->task = NULL;	/* __client_obd_list_lock() asserts this on acquire */
+}
+
+static inline void client_obd_list_lock_done(client_obd_lock_t *lock)
+{}	/* empty body: performs no teardown */
+
+#endif /* __LINUX_OBD_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/obd_class.h b/drivers/staging/lustre/lustre/include/linux/obd_class.h
new file mode 100644
index 000000000000..021ead6639fc
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/obd_class.h
@@ -0,0 +1,58 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_CLASS_OBD_H
+#define __LINUX_CLASS_OBD_H
+
+#ifndef __CLASS_OBD_H
+#error Do not #include this file directly. #include <obd_class.h> instead
+#endif
+
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/timer.h>
+
+/* obdo.c */
+void obdo_from_la(struct obdo *dst, struct lu_attr *la, __u64 valid);
+void la_from_obdo(struct lu_attr *la, struct obdo *dst, obd_flag valid);
+void obdo_refresh_inode(struct inode *dst, struct obdo *src, obd_flag valid);
+void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid);
+#define ll_inode_flags(inode)	 ((inode)->i_flags)	/* i_flags of @inode */
+
+
+#endif /* __LINUX_CLASS_OBD_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/obd_support.h b/drivers/staging/lustre/lustre/include/linux/obd_support.h
new file mode 100644
index 000000000000..9166503408aa
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/obd_support.h
@@ -0,0 +1,63 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_OBD_SUPPORT
+#define _LINUX_OBD_SUPPORT
+
+#ifndef _OBD_SUPPORT
+#error Do not #include this file directly. #include <obd_support.h> instead
+#endif
+
+#ifdef CONFIG_X86
+#include <asm/cpufeature.h>
+#endif
+#include <asm/processor.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/swap.h>
+#include <linux/lustre_compat25.h>
+#include <linux/lustre_common.h>
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+
+
+# include <linux/types.h>
+# include <linux/blkdev.h>
+# include <lvfs.h>
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lprocfs_status.h b/drivers/staging/lustre/lustre/include/lprocfs_status.h
new file mode 100644
index 000000000000..e4e8f72b92b1
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lprocfs_status.h
@@ -0,0 +1,1100 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lprocfs_status.h
+ *
+ * Top level header file for LProc SNMP
+ *
+ * Author: Hariharan Thantry thantry@users.sourceforge.net
+ */
+#ifndef _LPROCFS_SNMP_H
+#define _LPROCFS_SNMP_H
+
+#include <linux/lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+#include <linux/libcfs/params_tree.h>
+
+struct lprocfs_vars {
+	const char	     *name;
+	read_proc_t	*read_fptr;
+	write_proc_t       *write_fptr;
+	void		   *data;
+	struct file_operations *fops;
+	/**
+	 * /proc file mode.
+	 */
+	mode_t		  proc_mode;
+};
+
+struct lprocfs_static_vars {
+	struct lprocfs_vars *module_vars;
+	struct lprocfs_vars *obd_vars;
+};
+
+/* if we find more consumers this could be generalized */
+#define OBD_HIST_MAX 32
+struct obd_histogram {
+	spinlock_t	oh_lock;
+	unsigned long	oh_buckets[OBD_HIST_MAX];
+};
+
+enum {
+	BRW_R_PAGES = 0,
+	BRW_W_PAGES,
+	BRW_R_RPC_HIST,
+	BRW_W_RPC_HIST,
+	BRW_R_IO_TIME,
+	BRW_W_IO_TIME,
+	BRW_R_DISCONT_PAGES,
+	BRW_W_DISCONT_PAGES,
+	BRW_R_DISCONT_BLOCKS,
+	BRW_W_DISCONT_BLOCKS,
+	BRW_R_DISK_IOSIZE,
+	BRW_W_DISK_IOSIZE,
+	BRW_R_DIO_FRAGS,
+	BRW_W_DIO_FRAGS,
+	BRW_LAST,
+};
+
+struct brw_stats {
+	struct obd_histogram hist[BRW_LAST];
+};
+
+enum {
+	RENAME_SAMEDIR_SIZE = 0,
+	RENAME_CROSSDIR_SRC_SIZE,
+	RENAME_CROSSDIR_TGT_SIZE,
+	RENAME_LAST,
+};
+
+struct rename_stats {
+	struct obd_histogram hist[RENAME_LAST];
+};
+
+/* An lprocfs counter can be configured using the enum bit masks below.
+ *
+ * LPROCFS_CNTR_EXTERNALLOCK indicates that an external lock already
+ * protects this counter from concurrent updates. If not specified,
+ * lprocfs uses an internal per-counter lock variable. External locks are
+ * not used to protect counter increments, but are used to protect
+ * counter readout and resets.
+ *
+ * LPROCFS_CNTR_AVGMINMAX indicates multi-valued counter samples
+ * (i.e. counter can be incremented by more than "1"). When specified,
+ * the counter maintains min, max and sum in addition to a simple
+ * invocation count. This allows averages to be computed.
+ * If not specified, the counter is an increment-by-1 counter.
+ * min, max, sum, etc. are not maintained.
+ *
+ * LPROCFS_CNTR_STDDEV indicates that the counter should track sum of
+ * squares (for multi-valued counter samples only). This allows
+ * external computation of standard deviation, but involves a 64-bit
+ * multiply per counter increment.
+ */
+
+enum {
+	LPROCFS_CNTR_EXTERNALLOCK = 0x0001,
+	LPROCFS_CNTR_AVGMINMAX    = 0x0002,
+	LPROCFS_CNTR_STDDEV       = 0x0004,
+
+	/* counter data type */
+	LPROCFS_TYPE_REGS	 = 0x0100,
+	LPROCFS_TYPE_BYTES	= 0x0200,
+	LPROCFS_TYPE_PAGES	= 0x0400,
+	LPROCFS_TYPE_CYCLE	= 0x0800,
+};
+
+#define LC_MIN_INIT ((~(__u64)0) >> 1)
+
+struct lprocfs_counter_header {
+	unsigned int		lc_config;
+	const char		*lc_name;   /* must be static */
+	const char		*lc_units;  /* must be static */
+};
+
+struct lprocfs_counter {
+	__s64	lc_count;
+	__s64	lc_min;
+	__s64	lc_max;
+	__s64	lc_sumsquare;
+	/*
+	 * Every counter has lc_array_sum[0], while lc_array_sum[1] is only
+	 * for irq context counter, i.e. stats with
+	 * LPROCFS_STATS_FLAG_IRQ_SAFE flag, its counter need
+	 * lc_array_sum[1]
+	 */
+	__s64	lc_array_sum[1];
+};
+#define lc_sum		lc_array_sum[0]
+#define lc_sum_irq	lc_array_sum[1]
+
+struct lprocfs_percpu {
+#ifndef __GNUC__
+	__s64			pad;
+#endif
+	struct lprocfs_counter lp_cntr[0];
+};
+
+#define LPROCFS_GET_NUM_CPU 0x0001
+#define LPROCFS_GET_SMP_ID  0x0002
+
+enum lprocfs_stats_flags {
+	LPROCFS_STATS_FLAG_NONE     = 0x0000, /* per cpu counter */
+	LPROCFS_STATS_FLAG_NOPERCPU = 0x0001, /* stats have no percpu
+					       * area and need locking */
+	LPROCFS_STATS_FLAG_IRQ_SAFE = 0x0002, /* alloc need irq safe */
+};
+
+enum lprocfs_fields_flags {
+	LPROCFS_FIELDS_FLAGS_CONFIG     = 0x0001,
+	LPROCFS_FIELDS_FLAGS_SUM	= 0x0002,
+	LPROCFS_FIELDS_FLAGS_MIN	= 0x0003,
+	LPROCFS_FIELDS_FLAGS_MAX	= 0x0004,
+	LPROCFS_FIELDS_FLAGS_AVG	= 0x0005,
+	LPROCFS_FIELDS_FLAGS_SUMSQUARE  = 0x0006,
+	LPROCFS_FIELDS_FLAGS_COUNT      = 0x0007,
+};
+
+struct lprocfs_stats {
+	/* # of counters */
+	unsigned short			ls_num;
+	/* 1 + the biggest cpu # whose ls_percpu slot has been allocated */
+	unsigned short			ls_biggest_alloc_num;
+	enum lprocfs_stats_flags	ls_flags;
+	/* Lock used when there are no percpu stats areas; For percpu stats,
+	 * it is used to protect ls_biggest_alloc_num change */
+	spinlock_t			ls_lock;
+
+	/* has ls_num of counter headers */
+	struct lprocfs_counter_header	*ls_cnt_header;
+	struct lprocfs_percpu		*ls_percpu[0];
+};
+
+#define OPC_RANGE(seg) (seg ## _LAST_OPC - seg ## _FIRST_OPC)
+
+/* Pack all opcodes into one monotonically increasing index (-1 if unknown). */
+static inline int opcode_offset(__u32 opc) {
+	if (opc >= OST_FIRST_OPC && opc < OST_LAST_OPC) {
+		/* OST opcode */
+		return (opc - OST_FIRST_OPC);
+	} else if (opc >= MDS_FIRST_OPC && opc < MDS_LAST_OPC) {
+		/* MDS opcode */
+		return (opc - MDS_FIRST_OPC +
+			OPC_RANGE(OST));
+	} else if (opc >= LDLM_FIRST_OPC && opc < LDLM_LAST_OPC) {
+		/* LDLM opcode */
+		return (opc - LDLM_FIRST_OPC +
+			OPC_RANGE(MDS) +
+			OPC_RANGE(OST));
+	} else if (opc >= MGS_FIRST_OPC && opc < MGS_LAST_OPC) {
+		/* MGS opcode */
+		return (opc - MGS_FIRST_OPC +
+			OPC_RANGE(LDLM) +
+			OPC_RANGE(MDS) +
+			OPC_RANGE(OST));
+	} else if (opc >= OBD_FIRST_OPC && opc < OBD_LAST_OPC) {
+		/* OBD opcode */
+		return (opc - OBD_FIRST_OPC +
+			OPC_RANGE(MGS) +
+			OPC_RANGE(LDLM) +
+			OPC_RANGE(MDS) +
+			OPC_RANGE(OST));
+	} else if (opc >= LLOG_FIRST_OPC && opc < LLOG_LAST_OPC) {
+		/* LLOG opcode */
+		return (opc - LLOG_FIRST_OPC +
+			OPC_RANGE(OBD) +
+			OPC_RANGE(MGS) +
+			OPC_RANGE(LDLM) +
+			OPC_RANGE(MDS) +
+			OPC_RANGE(OST));
+	} else if (opc >= QUOTA_FIRST_OPC && opc < QUOTA_LAST_OPC) {
+		/* QUOTA opcode */
+		return (opc - QUOTA_FIRST_OPC +
+			OPC_RANGE(LLOG) +
+			OPC_RANGE(OBD) +
+			OPC_RANGE(MGS) +
+			OPC_RANGE(LDLM) +
+			OPC_RANGE(MDS) +
+			OPC_RANGE(OST));
+	} else if (opc >= SEQ_FIRST_OPC && opc < SEQ_LAST_OPC) {
+		/* SEQ opcode */
+		return (opc - SEQ_FIRST_OPC +
+			OPC_RANGE(QUOTA) +
+			OPC_RANGE(LLOG) +
+			OPC_RANGE(OBD) +
+			OPC_RANGE(MGS) +
+			OPC_RANGE(LDLM) +
+			OPC_RANGE(MDS) +
+			OPC_RANGE(OST));
+	} else if (opc >= SEC_FIRST_OPC && opc < SEC_LAST_OPC) {
+		/* SEC opcode */
+		return (opc - SEC_FIRST_OPC +
+			OPC_RANGE(SEQ) +
+			OPC_RANGE(QUOTA) +
+			OPC_RANGE(LLOG) +
+			OPC_RANGE(OBD) +
+			OPC_RANGE(MGS) +
+			OPC_RANGE(LDLM) +
+			OPC_RANGE(MDS) +
+			OPC_RANGE(OST));
+	} else if (opc >= FLD_FIRST_OPC && opc < FLD_LAST_OPC) {
+		/* FLD opcode */
+		return (opc - FLD_FIRST_OPC +
+			OPC_RANGE(SEC) +
+			OPC_RANGE(SEQ) +
+			OPC_RANGE(QUOTA) +
+			OPC_RANGE(LLOG) +
+			OPC_RANGE(OBD) +
+			OPC_RANGE(MGS) +
+			OPC_RANGE(LDLM) +
+			OPC_RANGE(MDS) +
+			OPC_RANGE(OST));
+	} else if (opc >= UPDATE_FIRST_OPC && opc < UPDATE_LAST_OPC) {
+		/* UPDATE opcode */
+		return (opc - UPDATE_FIRST_OPC +
+			OPC_RANGE(FLD) +
+			OPC_RANGE(SEC) +
+			OPC_RANGE(SEQ) +
+			OPC_RANGE(QUOTA) +
+			OPC_RANGE(LLOG) +
+			OPC_RANGE(OBD) +
+			OPC_RANGE(MGS) +
+			OPC_RANGE(LDLM) +
+			OPC_RANGE(MDS) +
+			OPC_RANGE(OST));
+	} else {
+		/* Unknown opcode: inside none of the [FIRST, LAST) ranges */
+		return -1;
+	}
+}
+
+/* Size of the index space of opcode_offset(): one OPC_RANGE per segment. */
+#define LUSTRE_MAX_OPCODES (OPC_RANGE(OST)  + \
+			    OPC_RANGE(MDS)  + \
+			    OPC_RANGE(LDLM) + \
+			    OPC_RANGE(MGS)  + \
+			    OPC_RANGE(OBD)  + \
+			    OPC_RANGE(LLOG) + \
+			    OPC_RANGE(QUOTA) + \
+			    OPC_RANGE(SEQ)  + \
+			    OPC_RANGE(SEC)  + \
+			    OPC_RANGE(FLD)  + \
+			    OPC_RANGE(UPDATE))
+
+#define EXTRA_MAX_OPCODES ((PTLRPC_LAST_CNTR - PTLRPC_FIRST_CNTR)  + \
+			    OPC_RANGE(EXTRA))
+
+enum {
+	PTLRPC_REQWAIT_CNTR = 0,
+	PTLRPC_REQQDEPTH_CNTR,
+	PTLRPC_REQACTIVE_CNTR,
+	PTLRPC_TIMEOUT,
+	PTLRPC_REQBUF_AVAIL_CNTR,
+	PTLRPC_LAST_CNTR
+};
+
+#define PTLRPC_FIRST_CNTR PTLRPC_REQWAIT_CNTR
+
+enum {
+	LDLM_GLIMPSE_ENQUEUE = 0,
+	LDLM_PLAIN_ENQUEUE,
+	LDLM_EXTENT_ENQUEUE,
+	LDLM_FLOCK_ENQUEUE,
+	LDLM_IBITS_ENQUEUE,
+	MDS_REINT_SETATTR,
+	MDS_REINT_CREATE,
+	MDS_REINT_LINK,
+	MDS_REINT_UNLINK,
+	MDS_REINT_RENAME,
+	MDS_REINT_OPEN,
+	MDS_REINT_SETXATTR,
+	BRW_READ_BYTES,
+	BRW_WRITE_BYTES,
+	EXTRA_LAST_OPC
+};
+
+#define EXTRA_FIRST_OPC LDLM_GLIMPSE_ENQUEUE
+/* class_obd.c */
+extern proc_dir_entry_t *proc_lustre_root;
+
+struct obd_device;
+struct obd_histogram;
+
+/* Days / hours / mins / seconds format */
+struct dhms {
+	int d,h,m,s;
+};
+static inline void s2dhms(struct dhms *ts, time_t secs)
+{
+	const time_t rem_day = secs % 86400, rem_hour = rem_day % 3600;
+
+	ts->d = secs / 86400;		/* whole days */
+	ts->h = rem_day / 3600;		/* whole hours left in the day */
+	ts->m = rem_hour / 60;		/* whole minutes left in the hour */
+	ts->s = rem_hour % 60;		/* remaining seconds */
+}
+#define DHMS_FMT "%dd%dh%02dm%02ds"
+#define DHMS_VARS(x) (x)->d, (x)->h, (x)->m, (x)->s
+
+#define JOBSTATS_JOBID_VAR_MAX_LEN	20
+#define JOBSTATS_DISABLE		"disable"
+#define JOBSTATS_PROCNAME_UID		"procname_uid"
+
+typedef void (*cntr_init_callback)(struct lprocfs_stats *stats);
+
+struct obd_job_stats {
+	cfs_hash_t	*ojs_hash;
+	struct list_head	 ojs_list;
+	rwlock_t       ojs_lock; /* protect the ojs_list */
+	cntr_init_callback ojs_cntr_init_fn;
+	int		ojs_cntr_num;
+	int		ojs_cleanup_interval;
+	time_t		   ojs_last_cleanup;
+};
+
+#ifdef LPROCFS
+
+extern int lprocfs_stats_alloc_one(struct lprocfs_stats *stats,
+				   unsigned int cpuid);
+/*
+ * \return value
+ *      < 0     : on error (only possible for opc as LPROCFS_GET_SMP_ID)
+ */
+static inline int lprocfs_stats_lock(struct lprocfs_stats *stats, int opc,
+				     unsigned long *flags)
+{
+	unsigned int	cpuid;
+	int		err;
+
+	switch (opc) {
+	default:
+		LBUG();
+		/* no break: falls into LPROCFS_GET_SMP_ID if LBUG() returns */
+	case LPROCFS_GET_SMP_ID:
+		if (!(stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU)) {
+			/* per-cpu area: allocate this CPU's slot on first use */
+			cpuid = get_cpu();
+			if (unlikely(stats->ls_percpu[cpuid] == NULL)) {
+				err = lprocfs_stats_alloc_one(stats, cpuid);
+				if (err < 0) {
+					put_cpu();
+					return err;
+				}
+			}
+			return cpuid;
+		}
+
+		/* single shared area: serialise on ls_lock, report index 0 */
+		if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+			spin_lock_irqsave(&stats->ls_lock, *flags);
+		else
+			spin_lock(&stats->ls_lock);
+		return 0;
+
+	case LPROCFS_GET_NUM_CPU:
+		if (!(stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU))
+			return stats->ls_biggest_alloc_num;
+		if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+			spin_lock_irqsave(&stats->ls_lock, *flags);
+		else
+			spin_lock(&stats->ls_lock);
+		return 1;
+	}
+}
+
+static inline void lprocfs_stats_unlock(struct lprocfs_stats *stats, int opc,
+					unsigned long *flags)
+{
+	/*
+	 * Undo lprocfs_stats_lock() for the same opc: per-cpu stats only
+	 * need put_cpu() (LPROCFS_GET_SMP_ID) or nothing at all
+	 * (LPROCFS_GET_NUM_CPU); shared stats release ls_lock.
+	 */
+	switch (opc) {
+	default:
+		LBUG();
+		/* no break: falls into LPROCFS_GET_SMP_ID if LBUG() returns */
+	case LPROCFS_GET_SMP_ID:
+		if (!(stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU)) {
+			/* pairs with the get_cpu() done when locking */
+			put_cpu();
+			return;
+		}
+		break;
+
+	case LPROCFS_GET_NUM_CPU:
+		if (!(stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU))
+			return;
+		break;
+	}
+
+	/* shared (non-percpu) stats: release ls_lock, irq-restoring if asked */
+	if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+		spin_unlock_irqrestore(&stats->ls_lock, *flags);
+	else
+		spin_unlock(&stats->ls_lock);
+}
+
+static inline unsigned int
+lprocfs_stats_counter_size(struct lprocfs_stats *stats)
+{
+	unsigned int bytes;
+
+	/* offset just past ls_num counters in a struct lprocfs_percpu */
+	bytes = offsetof(struct lprocfs_percpu, lp_cntr[stats->ls_num]);
+	/* irq safe stats need lc_array_sum[1]: one more __s64 per counter */
+	if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+		bytes += stats->ls_num * sizeof(__s64);
+
+	if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU)
+		return bytes;
+
+	return L1_CACHE_ALIGN(bytes);
+}
+
+static inline struct lprocfs_counter *
+lprocfs_stats_counter_get(struct lprocfs_stats *stats, unsigned int cpuid,
+			  int index)
+{
+	struct lprocfs_counter *slot;
+
+	slot = &stats->ls_percpu[cpuid]->lp_cntr[index];
+	/* irq safe stats: shift by one extra __s64 per preceding counter */
+	if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+		return (void *)slot + index * sizeof(__s64);
+
+	return slot;
+}
+
+/* Two optimized LPROCFS counter increment functions are provided:
+ *     lprocfs_counter_incr(stats, idx) - optimized for by-one counters
+ *     lprocfs_counter_add(stats, idx, amount) - use for multi-valued counters
+ * Counter data layout allows config flag, counter lock and the
+ * count itself to reside within a single cache line.
+ */
+
+extern void lprocfs_counter_add(struct lprocfs_stats *stats, int idx,
+				long amount);
+extern void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx,
+				long amount);
+
+#define lprocfs_counter_incr(stats, idx) \
+	lprocfs_counter_add(stats, idx, 1)
+#define lprocfs_counter_decr(stats, idx) \
+	lprocfs_counter_sub(stats, idx, 1)
+
+extern __s64 lprocfs_read_helper(struct lprocfs_counter *lc,
+				 struct lprocfs_counter_header *header,
+				 enum lprocfs_stats_flags flags,
+				 enum lprocfs_fields_flags field);
+static inline __u64 lprocfs_stats_collector(struct lprocfs_stats *stats,
+					    int idx,
+					    enum lprocfs_fields_flags field)
+{
+	unsigned long  irqflags = 0;
+	unsigned int   cpu, ncpus;
+	__u64	       total = 0;
+
+	LASSERT(stats != NULL);
+
+	ncpus = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &irqflags);
+	for (cpu = 0; cpu < ncpus; cpu++) {
+		/* slots that were never allocated contribute nothing */
+		if (stats->ls_percpu[cpu] != NULL)
+			total += lprocfs_read_helper(
+				lprocfs_stats_counter_get(stats, cpu, idx),
+				&stats->ls_cnt_header[idx], stats->ls_flags,
+				field);
+	}
+	lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &irqflags);
+
+	return total;
+}
+
+extern struct lprocfs_stats *
+lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags);
+extern void lprocfs_clear_stats(struct lprocfs_stats *stats);
+extern void lprocfs_free_stats(struct lprocfs_stats **stats);
+extern void lprocfs_init_ops_stats(int num_private_stats,
+				   struct lprocfs_stats *stats);
+extern void lprocfs_init_mps_stats(int num_private_stats,
+				   struct lprocfs_stats *stats);
+extern void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats);
+extern int lprocfs_alloc_obd_stats(struct obd_device *obddev,
+				   unsigned int num_private_stats);
+extern int lprocfs_alloc_md_stats(struct obd_device *obddev,
+				  unsigned int num_private_stats);
+extern void lprocfs_counter_init(struct lprocfs_stats *stats, int index,
+				 unsigned conf, const char *name,
+				 const char *units);
+extern void lprocfs_free_obd_stats(struct obd_device *obddev);
+extern void lprocfs_free_md_stats(struct obd_device *obddev);
+struct obd_export;
+struct nid_stat;
+extern int lprocfs_add_clear_entry(struct obd_device * obd,
+				   proc_dir_entry_t *entry);
+extern int lprocfs_exp_setup(struct obd_export *exp,
+			     lnet_nid_t *peer_nid, int *newnid);
+extern int lprocfs_exp_cleanup(struct obd_export *exp);
+extern proc_dir_entry_t *lprocfs_add_simple(struct proc_dir_entry *root,
+						char *name,
+						read_proc_t *read_proc,
+						write_proc_t *write_proc,
+						void *data,
+						struct file_operations *fops);
+extern struct proc_dir_entry *
+lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent,
+		    const char *format, ...);
+extern void lprocfs_free_per_client_stats(struct obd_device *obd);
+extern int
+lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
+			      unsigned long count, void *data);
+extern int lprocfs_nid_stats_clear_read(char *page, char **start, off_t off,
+					int count, int *eof,  void *data);
+
+extern int lprocfs_register_stats(proc_dir_entry_t *root, const char *name,
+				  struct lprocfs_stats *stats);
+
+/* lprocfs_status.c */
+extern int lprocfs_add_vars(proc_dir_entry_t *root,
+			    struct lprocfs_vars *var,
+			    void *data);
+
+extern proc_dir_entry_t *lprocfs_register(const char *name,
+					      proc_dir_entry_t *parent,
+					      struct lprocfs_vars *list,
+					      void *data);
+
+extern void lprocfs_remove(proc_dir_entry_t **root);
+extern void lprocfs_remove_proc_entry(const char *name,
+				      struct proc_dir_entry *parent);
+extern void lprocfs_try_remove_proc_entry(const char *name,
+					  struct proc_dir_entry *parent);
+
+extern proc_dir_entry_t *lprocfs_srch(proc_dir_entry_t *root,
+					  const char *name);
+
+extern int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list);
+extern int lprocfs_obd_cleanup(struct obd_device *obd);
+extern struct file_operations lprocfs_evict_client_fops;
+
+extern int lprocfs_seq_create(proc_dir_entry_t *parent, const char *name,
+			      mode_t mode,
+			      const struct file_operations *seq_fops,
+			      void *data);
+extern int lprocfs_obd_seq_create(struct obd_device *dev, const char *name,
+				  mode_t mode,
+				  const struct file_operations *seq_fops,
+				  void *data);
+
+/* Generic callbacks */
+
+extern int lprocfs_rd_u64(char *page, char **start, off_t off,
+			  int count, int *eof, void *data);
+extern int lprocfs_rd_atomic(char *page, char **start, off_t off,
+			     int count, int *eof, void *data);
+extern int lprocfs_wr_atomic(struct file *file, const char *buffer,
+			     unsigned long count, void *data);
+extern int lprocfs_rd_uint(char *page, char **start, off_t off,
+			   int count, int *eof, void *data);
+extern int lprocfs_wr_uint(struct file *file, const char *buffer,
+			   unsigned long count, void *data);
+extern int lprocfs_rd_uuid(char *page, char **start, off_t off,
+			   int count, int *eof, void *data);
+extern int lprocfs_rd_name(char *page, char **start, off_t off,
+			   int count, int *eof, void *data);
+extern int lprocfs_rd_server_uuid(char *page, char **start, off_t off,
+				  int count, int *eof, void *data);
+extern int lprocfs_rd_conn_uuid(char *page, char **start, off_t off,
+				int count, int *eof, void *data);
+extern int lprocfs_rd_import(char *page, char **start, off_t off, int count,
+			     int *eof, void *data);
+extern int lprocfs_rd_state(char *page, char **start, off_t off, int count,
+			    int *eof, void *data);
+extern int lprocfs_rd_connect_flags(char *page, char **start, off_t off,
+				    int count, int *eof, void *data);
+extern int lprocfs_rd_num_exports(char *page, char **start, off_t off,
+				  int count, int *eof, void *data);
+extern int lprocfs_rd_numrefs(char *page, char **start, off_t off,
+			      int count, int *eof, void *data);
+struct adaptive_timeout;
+extern int lprocfs_at_hist_helper(char *page, int count, int rc,
+				  struct adaptive_timeout *at);
+extern int lprocfs_rd_timeouts(char *page, char **start, off_t off,
+			       int count, int *eof, void *data);
+extern int lprocfs_wr_timeouts(struct file *file, const char *buffer,
+			       unsigned long count, void *data);
+extern int lprocfs_wr_evict_client(struct file *file, const char *buffer,
+				   unsigned long count, void *data);
+extern int lprocfs_wr_ping(struct file *file, const char *buffer,
+			   unsigned long count, void *data);
+extern int lprocfs_wr_import(struct file *file, const char *buffer,
+			     unsigned long count, void *data);
+extern int lprocfs_rd_pinger_recov(char *page, char **start, off_t off,
+				   int count, int *eof, void *data);
+extern int lprocfs_wr_pinger_recov(struct file *file, const char *buffer,
+				   unsigned long count, void *data);
+
+/* Statfs helpers */
+extern int lprocfs_rd_blksize(char *page, char **start, off_t off,
+			      int count, int *eof, void *data);
+extern int lprocfs_rd_kbytestotal(char *page, char **start, off_t off,
+				  int count, int *eof, void *data);
+extern int lprocfs_rd_kbytesfree(char *page, char **start, off_t off,
+				 int count, int *eof, void *data);
+extern int lprocfs_rd_kbytesavail(char *page, char **start, off_t off,
+				 int count, int *eof, void *data);
+extern int lprocfs_rd_filestotal(char *page, char **start, off_t off,
+				 int count, int *eof, void *data);
+extern int lprocfs_rd_filesfree(char *page, char **start, off_t off,
+				int count, int *eof, void *data);
+extern int lprocfs_rd_filegroups(char *page, char **start, off_t off,
+				 int count, int *eof, void *data);
+
+extern int lprocfs_write_helper(const char *buffer, unsigned long count,
+				int *val);
+extern int lprocfs_write_frac_helper(const char *buffer, unsigned long count,
+				     int *val, int mult);
+extern int lprocfs_read_frac_helper(char *buffer, unsigned long count,
+				    long val, int mult);
+extern int lprocfs_write_u64_helper(const char *buffer, unsigned long count,
+				    __u64 *val);
+extern int lprocfs_write_frac_u64_helper(const char *buffer,
+					 unsigned long count,
+					 __u64 *val, int mult);
+char *lprocfs_find_named_value(const char *buffer, const char *name,
+				unsigned long *count);
+void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value);
+void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value);
+void lprocfs_oh_clear(struct obd_histogram *oh);
+unsigned long lprocfs_oh_sum(struct obd_histogram *oh);
+
+void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx,
+			   struct lprocfs_counter *cnt);
+
+/* lprocfs_status.c: recovery status */
+int lprocfs_obd_rd_recovery_status(char *page, char **start, off_t off,
+				   int count, int *eof, void *data);
+
+/* lprocfs_status.c: hash statistics */
+int lprocfs_obd_rd_hash(char *page, char **start, off_t off,
+			int count, int *eof, void *data);
+
+/* lprocfs_status.c: IR factor */
+int lprocfs_obd_rd_ir_factor(char *page, char **start, off_t off,
+			     int count, int *eof, void *data);
+int lprocfs_obd_wr_ir_factor(struct file *file, const char *buffer,
+			     unsigned long count, void *data);
+
+extern int lprocfs_single_release(cfs_inode_t *, struct file *);
+extern int lprocfs_seq_release(cfs_inode_t *, struct file *);
+
+/* You must bracket any lprocfs access to the import of a client obd_device
+ * with these macros: CHECK read-locks cl_sem, EXIT releases it. */
+#define LPROCFS_CLIMP_CHECK(obd) do {	   \
+	typecheck(struct obd_device *, obd);    \
+	down_read(&(obd)->u.cli.cl_sem);    \
+	if ((obd)->u.cli.cl_import == NULL) {   \
+	     up_read(&(obd)->u.cli.cl_sem); \
+	     return -ENODEV; /* NB: returns from the calling function */ \
+	}				       \
+} while(0)
+#define LPROCFS_CLIMP_EXIT(obd)	/* pairs with CHECK; caller adds the ';' */ \
+	up_read(&(obd)->u.cli.cl_sem)
+
+
+/* Write the name##_seq_show function and call LPROC_SEQ_FOPS_RO for read-only
+  proc entries; for a read-write proc entry, also define name##_seq_write
+  and call LPROC_SEQ_FOPS instead. Finally, call
+  lprocfs_obd_seq_create(obd, filename, 0444, &name##_fops, data); */
+#define __LPROC_SEQ_FOPS(name, custom_seq_write) /* emits open() + fops */ \
+static int name##_single_open(cfs_inode_t *inode, struct file *file) {     \
+	struct proc_dir_entry *dp = PDE(inode);			    \
+	int rc;							    \
+	LPROCFS_ENTRY_AND_CHECK(dp);				       \
+	rc = single_open(file, name##_seq_show, dp->data);		 \
+	if (rc) {							  \
+		LPROCFS_EXIT(); /* presumably undoes ENTRY_AND_CHECK */ \
+		return rc;						 \
+	}								  \
+	return 0;							  \
+}									  \
+struct file_operations name##_fops = {				     \
+	.owner   = THIS_MODULE,					    \
+	.open    = name##_single_open,				     \
+	.read    = seq_read,					       \
+	.write   = custom_seq_write, /* NULL when built via _FOPS_RO */ \
+	.llseek  = seq_lseek,					      \
+	.release = lprocfs_single_release,				 \
+}
+
+#define LPROC_SEQ_FOPS_RO(name)	 __LPROC_SEQ_FOPS(name, NULL)
+#define LPROC_SEQ_FOPS(name)	    __LPROC_SEQ_FOPS(name, name##_seq_write)
+
+/* lprocfs_jobstats.c */
+int lprocfs_job_stats_log(struct obd_device *obd, char *jobid,
+			  int event, long amount);
+void lprocfs_job_stats_fini(struct obd_device *obd);
+int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num,
+			   cntr_init_callback fn);
+int lprocfs_rd_job_interval(char *page, char **start, off_t off,
+			    int count, int *eof, void *data);
+int lprocfs_wr_job_interval(struct file *file, const char *buffer,
+			    unsigned long count, void *data);
+
+/* lproc_ptlrpc.c */
+struct ptlrpc_request;
+extern void target_print_req(void *seq_file, struct ptlrpc_request *req);
+
+/* lprocfs_status.c */
+int lprocfs_obd_rd_recovery_time_soft(char *page, char **start, off_t off,
+				      int count, int *eof, void *data);
+int lprocfs_obd_wr_recovery_time_soft(struct file *file,
+				      const char *buffer,
+				      unsigned long count, void *data);
+int lprocfs_obd_rd_recovery_time_hard(char *page, char **start, off_t off,
+				      int count, int *eof, void *data);
+int lprocfs_obd_wr_recovery_time_hard(struct file *file,
+				      const char *buffer,
+				      unsigned long count, void *data);
+int lprocfs_obd_rd_max_pages_per_rpc(char *page, char **start, off_t off,
+				     int count, int *eof, void *data);
+int lprocfs_obd_wr_max_pages_per_rpc(struct file *file, const char *buffer,
+				     unsigned long count, void *data);
+int lprocfs_target_rd_instance(char *page, char **start, off_t off,
+			       int count, int *eof, void *data);
+
+/* all quota proc functions */
+extern int lprocfs_quota_rd_bunit(char *page, char **start,
+				  off_t off, int count,
+				  int *eof, void *data);
+extern int lprocfs_quota_wr_bunit(struct file *file, const char *buffer,
+				  unsigned long count, void *data);
+extern int lprocfs_quota_rd_btune(char *page, char **start,
+				  off_t off, int count,
+				  int *eof, void *data);
+extern int lprocfs_quota_wr_btune(struct file *file, const char *buffer,
+				  unsigned long count, void *data);
+extern int lprocfs_quota_rd_iunit(char *page, char **start,
+				  off_t off, int count,
+				  int *eof, void *data);
+extern int lprocfs_quota_wr_iunit(struct file *file, const char *buffer,
+				  unsigned long count, void *data);
+extern int lprocfs_quota_rd_itune(char *page, char **start,
+				  off_t off, int count,
+				  int *eof, void *data);
+extern int lprocfs_quota_wr_itune(struct file *file, const char *buffer,
+				  unsigned long count, void *data);
+extern int lprocfs_quota_rd_type(char *page, char **start, off_t off, int count,
+				 int *eof, void *data);
+extern int lprocfs_quota_wr_type(struct file *file, const char *buffer,
+				 unsigned long count, void *data);
+extern int lprocfs_quota_rd_switch_seconds(char *page, char **start, off_t off,
+					   int count, int *eof, void *data);
+extern int lprocfs_quota_wr_switch_seconds(struct file *file,
+					   const char *buffer,
+					   unsigned long count, void *data);
+extern int lprocfs_quota_rd_sync_blk(char *page, char **start, off_t off,
+				     int count, int *eof, void *data);
+extern int lprocfs_quota_wr_sync_blk(struct file *file, const char *buffer,
+				     unsigned long count, void *data);
+extern int lprocfs_quota_rd_switch_qs(char *page, char **start, off_t off,
+				      int count, int *eof, void *data);
+extern int lprocfs_quota_wr_switch_qs(struct file *file,
+				      const char *buffer,
+				      unsigned long count, void *data);
+extern int lprocfs_quota_rd_boundary_factor(char *page, char **start, off_t off,
+					    int count, int *eof, void *data);
+extern int lprocfs_quota_wr_boundary_factor(struct file *file,
+					    const char *buffer,
+					    unsigned long count, void *data);
+extern int lprocfs_quota_rd_least_bunit(char *page, char **start, off_t off,
+					int count, int *eof, void *data);
+extern int lprocfs_quota_wr_least_bunit(struct file *file,
+					const char *buffer,
+					unsigned long count, void *data);
+extern int lprocfs_quota_rd_least_iunit(char *page, char **start, off_t off,
+					int count, int *eof, void *data);
+extern int lprocfs_quota_wr_least_iunit(struct file *file,
+					const char *buffer,
+					unsigned long count, void *data);
+extern int lprocfs_quota_rd_qs_factor(char *page, char **start, off_t off,
+				      int count, int *eof, void *data);
+extern int lprocfs_quota_wr_qs_factor(struct file *file,
+				      const char *buffer,
+				      unsigned long count, void *data);
+
+
+
+#else
+/* LPROCFS is not defined */
+
+#define proc_lustre_root NULL
+
+static inline void lprocfs_counter_add(struct lprocfs_stats *stats, int index,
+				       long amount)
+{ }	/* counter updates are no-ops in this branch */
+static inline void lprocfs_counter_incr(struct lprocfs_stats *stats,
+					int index)
+{ }
+static inline void lprocfs_counter_sub(struct lprocfs_stats *stats, int index,
+				       long amount)
+{ }
+static inline void lprocfs_counter_decr(struct lprocfs_stats *stats,
+					int index)
+{ }
+static inline void lprocfs_counter_init(struct lprocfs_stats *stats, int index,
+					unsigned conf, const char *name,
+					const char *units)
+{ }
+/* Reading any field of any counter yields zero in this branch. */
+static inline __u64 lc_read_helper(struct lprocfs_counter *lc,
+				   enum lprocfs_fields_flags field)
+{ return 0; }
+
+/* NB: hand back a dummy non-NULL pointer to satisfy the error checker */
+static inline struct lprocfs_stats *
+lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags)
+{ return (struct lprocfs_stats *)1; }
+static inline void lprocfs_clear_stats(struct lprocfs_stats *stats)
+{ }
+static inline void lprocfs_free_stats(struct lprocfs_stats **stats)
+{ }
+static inline int
+lprocfs_register_stats(proc_dir_entry_t *root, const char *name,
+		       struct lprocfs_stats *stats)
+{ return 0; }
+static inline void
+lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats)
+{ }
+static inline void
+lprocfs_init_mps_stats(int num_private_stats, struct lprocfs_stats *stats)
+{ }
+static inline void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats)
+{ }
+static inline int lprocfs_alloc_obd_stats(struct obd_device *obddev,
+					  unsigned int num_private_stats)
+{ return 0; }
+static inline int lprocfs_alloc_md_stats(struct obd_device *obddev,
+					 unsigned int num_private_stats)
+{ return 0; }
+static inline void lprocfs_free_obd_stats(struct obd_device *obddev)
+{ }
+static inline void lprocfs_free_md_stats(struct obd_device *obddev)
+{ }
+struct obd_export;
+static inline int lprocfs_add_clear_entry(struct obd_device *obd,
+					  proc_dir_entry_t *entry)
+{ return 0; }	/* NB: (obd, entry) to match the LPROCFS prototype */
+static inline int lprocfs_exp_setup(struct obd_export *exp,
+				    lnet_nid_t *peer_nid, int *newnid)
+{ return 0; }
+static inline int lprocfs_exp_cleanup(struct obd_export *exp)
+{ return 0; }
+static inline proc_dir_entry_t *
+lprocfs_add_simple(struct proc_dir_entry *root, char *name,
+		   read_proc_t *read_proc, write_proc_t *write_proc,
+		   void *data, struct file_operations *fops)
+{ return NULL; }
+static inline struct proc_dir_entry *
+lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent,
+		    const char *format, ...)
+{ return NULL; }
+static inline void lprocfs_free_per_client_stats(struct obd_device *obd)
+{ }
+static inline int
+lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
+			      unsigned long count, void *data)
+{ return count; }	/* echoes count back to the caller */
+static inline
+int lprocfs_nid_stats_clear_read(char *page, char **start, off_t off,
+				 int count, int *eof, void *data)
+{ return 0; }	/* nothing is written to page, so report 0, not count */
+
+static inline proc_dir_entry_t *
+lprocfs_register(const char *name, proc_dir_entry_t *parent,
+		 struct lprocfs_vars *list, void *data)
+{ return NULL; }
+static inline int
+lprocfs_add_vars(proc_dir_entry_t *root, struct lprocfs_vars *var,
+		 void *data)
+{ return 0; }
+static inline void lprocfs_remove(proc_dir_entry_t **root)
+{ }
+static inline void
+lprocfs_remove_proc_entry(const char *name, struct proc_dir_entry *parent)
+{ }
+static inline void
+lprocfs_try_remove_proc_entry(const char *name, struct proc_dir_entry *parent)
+{ }
+static inline proc_dir_entry_t *
+lprocfs_srch(proc_dir_entry_t *head, const char *name)
+{ return NULL; }	/* the search never finds an entry in this branch */
+static inline int
+lprocfs_obd_setup(struct obd_device *dev, struct lprocfs_vars *list)
+{ return 0; }
+static inline int lprocfs_obd_cleanup(struct obd_device *dev)
+{ return 0; }
+/* Generic callbacks: every read-side stub below simply returns 0. */
+static inline int lprocfs_rd_u64(char *page, char **start, off_t off,
+				 int count, int *eof, void *data)
+{ return 0; }
+static inline int lprocfs_rd_uuid(char *page, char **start, off_t off,
+				  int count, int *eof, void *data)
+{ return 0; }
+static inline int lprocfs_rd_name(char *page, char **start, off_t off,
+				  int count, int *eof, void *data)
+{ return 0; }
+static inline int lprocfs_rd_server_uuid(char *page, char **start, off_t off,
+					 int count, int *eof, void *data)
+{ return 0; }
+static inline int lprocfs_rd_conn_uuid(char *page, char **start, off_t off,
+				       int count, int *eof, void *data)
+{ return 0; }
+static inline int lprocfs_rd_import(char *page, char **start, off_t off,
+				    int count, int *eof, void *data)
+{ return 0; }
+static inline int lprocfs_rd_pinger_recov(char *page, char **start, off_t off,
+					  int count, int *eof, void *data)
+{ return 0; }
+static inline int lprocfs_rd_state(char *page, char **start, off_t off,
+				   int count, int *eof, void *data)
+{ return 0; }
+static inline int lprocfs_rd_connect_flags(char *page, char **start, off_t off,
+					   int count, int *eof, void *data)
+{ return 0; }
+static inline int lprocfs_rd_num_exports(char *page, char **start, off_t off,
+					 int count, int *eof, void *data)
+{ return 0; }
+static inline int lprocfs_rd_numrefs(char *page, char **start, off_t off,
+				     int count, int *eof, void *data)
+{ return 0; }
+struct adaptive_timeout;
+static inline int lprocfs_at_hist_helper(char *page, int count, int rc,
+					 struct adaptive_timeout *at)
+{ return 0; }
+static inline int lprocfs_rd_timeouts(char *page, char **start, off_t off,
+				      int count, int *eof, void *data)
+{ return 0; }
+/* Write-side stubs: likewise return 0 without touching their arguments. */
+static inline int lprocfs_wr_timeouts(struct file *file, const char *buffer,
+				      unsigned long count, void *data)
+{ return 0; }
+static inline int lprocfs_wr_evict_client(struct file *file, const char *buffer,
+					  unsigned long count, void *data)
+{ return 0; }
+static inline int lprocfs_wr_ping(struct file *file, const char *buffer,
+				  unsigned long count, void *data)
+{ return 0; }
+static inline int lprocfs_wr_import(struct file *file, const char *buffer,
+				    unsigned long count, void *data)
+{ return 0; }
+static inline int lprocfs_wr_pinger_recov(struct file *file, const char *buffer,
+					  unsigned long count, void *data)
+{ return 0; }
+
+/* Statfs helpers */
+static inline int
+lprocfs_rd_blksize(char *page, char **start, off_t off,
+		   int count, int *eof, void *data)
+{ return 0; }
+static inline int
+lprocfs_rd_kbytestotal(char *page, char **start, off_t off,
+		       int count, int *eof, void *data)
+{ return 0; }
+static inline int
+lprocfs_rd_kbytesfree(char *page, char **start, off_t off,
+		      int count, int *eof, void *data)
+{ return 0; }
+static inline int
+lprocfs_rd_kbytesavail(char *page, char **start, off_t off,
+		       int count, int *eof, void *data)
+{ return 0; }
+static inline int
+lprocfs_rd_filestotal(char *page, char **start, off_t off,
+		      int count, int *eof, void *data)
+{ return 0; }
+static inline int
+lprocfs_rd_filesfree(char *page, char **start, off_t off,
+		     int count, int *eof, void *data)
+{ return 0; }
+static inline int
+lprocfs_rd_filegroups(char *page, char **start, off_t off,
+		      int count, int *eof, void *data)
+{ return 0; }
+static inline void
+lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value)
+{ }
+static inline void
+lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value)
+{ }
+static inline void
+lprocfs_oh_clear(struct obd_histogram *oh)
+{ }
+static inline unsigned long
+lprocfs_oh_sum(struct obd_histogram *oh)
+{ return 0; }
+static inline void
+lprocfs_stats_collect(struct lprocfs_stats *stats, int idx,
+		      struct lprocfs_counter *cnt)
+{ }
+static inline __u64
+lprocfs_stats_collector(struct lprocfs_stats *stats, int idx,
+			enum lprocfs_fields_flags field)
+{ return 0; }
+
+#define LPROC_SEQ_FOPS_RO(name)	/* expands to nothing in this branch */
+#define LPROC_SEQ_FOPS(name)	/* likewise: no open() or fops is emitted */
+
+/* lprocfs_jobstats.c: no-op stand-ins */
+static inline int
+lprocfs_job_stats_log(struct obd_device *obd, char *jobid, int event,
+		      long amount)
+{ return 0; }
+static inline void
+lprocfs_job_stats_fini(struct obd_device *obd)
+{ }
+static inline int
+lprocfs_job_stats_init(struct obd_device *obd, int cntr_num,
+		       cntr_init_callback fn)
+{ return 0; }
+
+
+/* lproc_ptlrpc.c */
+#define target_print_req NULL
+
+#endif /* LPROCFS */
+
+#endif /* LPROCFS_SNMP_H */
diff --git a/drivers/staging/lustre/lustre/include/lu_object.h b/drivers/staging/lustre/lustre/include/lu_object.h
new file mode 100644
index 000000000000..4bd11bbe2783
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lu_object.h
@@ -0,0 +1,1346 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LUSTRE_LU_OBJECT_H
+#define __LUSTRE_LU_OBJECT_H
+
+#include <stdarg.h>
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+#include <lu_ref.h>
+
+struct seq_file;
+struct proc_dir_entry;
+struct lustre_cfg;
+struct lprocfs_stats;
+
+/** \defgroup lu lu
+ * lu_* data-types represent server-side entities shared by data and meta-data
+ * stacks.
+ *
+ * Design goals:
+ *
+ * -# support for layering.
+ *
+ *     Server side object is split into layers, one per device in the
+ *     corresponding device stack. Individual layer is represented by struct
+ *     lu_object. Compound layered object --- by struct lu_object_header. Most
+ *     interface functions take lu_object as an argument and operate on the
+ *     whole compound object. This decision was made due to the following
+ *     reasons:
+ *
+ *	- it's envisaged that lu_object will be used much more often than
+ *	lu_object_header;
+ *
+ *	- we want lower (non-top) layers to be able to initiate operations
+ *	on the whole object.
+ *
+ *     Generic code supports layering more complex than simple stacking, e.g.,
+ *     it is possible that at some layer object "spawns" multiple sub-objects
+ *     on the lower layer.
+ *
+ * -# fid-based identification.
+ *
+ *     Compound object is uniquely identified by its fid. Objects are indexed
+ *     by their fids (hash table is used for index).
+ *
+ * -# caching and life-cycle management.
+ *
+ *     Object's life-time is controlled by reference counting. When reference
+ *     count drops to 0, object is returned to cache. Cached objects still
+ *     retain their identity (i.e., fid), and can be recovered from cache.
+ *
+ *     Objects are kept in the global LRU list, and lu_site_purge() function
+ *     can be used to reclaim given number of unused objects from the tail of
+ *     the LRU.
+ *
+ * -# avoiding recursion.
+ *
+ *     Generic code tries to replace recursion through layers by iterations
+ *     where possible. Additionally, to reduce stack consumption, data are,
+ *     when practically possible, allocated through the lu_context_key
+ *     interface rather than on the stack.
+ * @{
+ */
+
+struct lu_site;
+struct lu_object;
+struct lu_device;
+struct lu_object_header;
+struct lu_context;
+struct lu_env;
+
+/**
+ * Operations common for data and meta-data devices.
+ */
+struct lu_device_operations {
+	/**
+	 * Allocate object for the given device (without lower-layer
+	 * parts). This is called by lu_object_operations::loo_object_init()
+	 * from the parent layer, and should set up at least lu_object::lo_dev
+	 * and lu_object::lo_ops fields of the resulting lu_object.
+	 *
+	 * Object creation protocol.
+	 *
+	 * Due to the design goal of avoiding recursion, object creation (see
+	 * lu_object_alloc()) is somewhat involved:
+	 *
+	 *  - first, lu_device_operations::ldo_object_alloc() method of the
+	 *  top-level device in the stack is called. It should allocate the
+	 *  top-level object (including lu_object_header), but without any
+	 *  lower-layer sub-object(s).
+	 *
+	 *  - then lu_object_alloc() sets fid in the header of the newly
+	 *  created object.
+	 *
+	 *  - then lu_object_operations::loo_object_init() is called. It has
+	 *  to allocate lower-layer object(s). To do this,
+	 *  lu_object_operations::loo_object_init() calls ldo_object_alloc()
+	 *  of the lower-layer device(s).
+	 *
+	 *  - for all new objects allocated by
+	 *  lu_object_operations::loo_object_init() (and inserted into object
+	 *  stack), lu_object_operations::loo_object_init() is called again
+	 *  repeatedly, until no new objects are created.
+	 *
+	 * \post ergo(!IS_ERR(result), result->lo_dev == d &&
+	 *			     result->lo_ops != NULL);
+	 */
+	struct lu_object *(*ldo_object_alloc)(const struct lu_env *env,
+					      const struct lu_object_header *h,
+					      struct lu_device *d);
+	/**
+	 * Process configuration specific to this device.
+	 */
+	int (*ldo_process_config)(const struct lu_env *env,
+				  struct lu_device *, struct lustre_cfg *);
+	/** NOTE(review): undocumented; presumably run when recovery ends. */
+	int (*ldo_recovery_complete)(const struct lu_env *,
+				     struct lu_device *);
+
+	/**
+	 * Initialize local objects for the device. This method is called
+	 * after the layer has been initialized (after the LCFG_SETUP stage)
+	 * and before it starts serving user requests.
+	 */
+	int (*ldo_prepare)(const struct lu_env *,
+			   struct lu_device *parent,
+			   struct lu_device *dev);
+
+};
+
+/**
+ * Flags for lu_object_conf::loc_flags.
+ */
+typedef enum {
+	/* This is a new object to be allocated, or the file
+	 * corresponding to the object does not exist. */
+	LOC_F_NEW	= 0x00000001,
+} loc_flags_t;
+
+/**
+ * Object configuration, describing particulars of the object being created.
+ * On server this is not used, as server objects are fully identified by fid.
+ * On client configuration contains struct lustre_md.
+ */
+struct lu_object_conf {
+	/**
+	 * Hints for object lookup and allocation, see loc_flags_t.
+	 */
+	loc_flags_t     loc_flags;
+};
+
+/**
+ * Type of "printer" function used by lu_object_operations::loo_object_print()
+ * method. \a format and the arguments after it are checked as printf-style.
+ *
+ * A printer function provides some flexibility in (semi-)debugging output;
+ * possible implementations: printk, CDEBUG, sysfs/seq_file.
+ */
+typedef int (*lu_printer_t)(const struct lu_env *env,
+			    void *cookie, const char *format, ...)
+	__attribute__ ((format (printf, 3, 4)));
+
+/**
+ * Operations specific for particular lu_object.
+ */
+struct lu_object_operations {
+	/**
+	 * Allocate lower-layer parts of the object by calling
+	 * lu_device_operations::ldo_object_alloc() of the corresponding
+	 * underlying device.
+	 *
+	 * This method is called once for each object inserted into object
+	 * stack. It is the responsibility of this method to insert the
+	 * lower-layer object(s) it creates into appropriate places of the
+	 * object stack.
+	 */
+	int (*loo_object_init)(const struct lu_env *env,
+			       struct lu_object *o,
+			       const struct lu_object_conf *conf);
+	/**
+	 * Called (in top-to-bottom order) during object allocation after all
+	 * layers were allocated and initialized. Can be used to perform
+	 * initialization depending on lower layers.
+	 */
+	int (*loo_object_start)(const struct lu_env *env,
+				struct lu_object *o);
+	/**
+	 * Called before lu_object_operations::loo_object_free() to signal
+	 * that object is being destroyed. Dual to
+	 * lu_object_operations::loo_object_init().
+	 */
+	void (*loo_object_delete)(const struct lu_env *env,
+				  struct lu_object *o);
+	/**
+	 * Dual to lu_device_operations::ldo_object_alloc(). Called when
+	 * object is removed from memory.
+	 */
+	void (*loo_object_free)(const struct lu_env *env,
+				struct lu_object *o);
+	/**
+	 * Called when last active reference to the object is released (and
+	 * object returns to the cache). This method is optional.
+	 */
+	void (*loo_object_release)(const struct lu_env *env,
+				   struct lu_object *o);
+	/**
+	 * Optional debugging helper. Print given object.
+	 */
+	int (*loo_object_print)(const struct lu_env *env, void *cookie,
+				lu_printer_t p, const struct lu_object *o);
+	/**
+	 * Optional debugging method. Returns true iff the object is
+	 * internally consistent.
+	 */
+	int (*loo_object_invariant)(const struct lu_object *o);
+};
+
+/**
+ * Type of lu_device.
+ */
+struct lu_device_type;
+
+/**
+ * Device: a layer in the server side abstraction stacking.
+ */
+struct lu_device {
+	/**
+	 * Reference count. This is incremented, in particular, on each object
+	 * created at this layer.
+	 *
+	 * \todo XXX which means that atomic_t is probably too small.
+	 */
+	atomic_t		       ld_ref;
+	/**
+	 * Pointer to device type. Never modified once set.
+	 */
+	struct lu_device_type       *ld_type;
+	/**
+	 * Operation vector for this device.
+	 */
+	const struct lu_device_operations *ld_ops;
+	/**
+	 * Stack this device belongs to.
+	 */
+	struct lu_site		    *ld_site;
+	/** NOTE(review): undocumented; presumably this device's procfs entry. */
+	struct proc_dir_entry	     *ld_proc_entry;
+	/** \todo XXX: temporary back pointer into obd. */
+	struct obd_device		 *ld_obd;
+	/**
+	 * A list of references to this object, for debugging.
+	 */
+	struct lu_ref		      ld_reference;
+	/**
+	 * Link the device to the site.
+	 */
+	struct list_head			 ld_linkage;
+};
+
+struct lu_device_type_operations;
+
+/**
+ * Tag bits for device type (see lu_device_type::ldt_tags). They are used to
+ * distinguish certain groups of device types.
+ */
+enum lu_device_tag {
+	/** this is meta-data device */
+	LU_DEVICE_MD = (1 << 0),
+	/** this is data device */
+	LU_DEVICE_DT = (1 << 1),
+	/** data device in the client stack */
+	LU_DEVICE_CL = (1 << 2)
+};
+
+/**
+ * Type of device; a lu_device points to its type through lu_device::ld_type.
+ */
+struct lu_device_type {
+	/**
+	 * Tag bits. Taken from enum lu_device_tag. Never modified once set.
+	 */
+	__u32				   ldt_tags;
+	/**
+	 * Name of this class. Unique system-wide. Never modified once set.
+	 */
+	char				   *ldt_name;
+	/**
+	 * Operations for this type.
+	 */
+	const struct lu_device_type_operations *ldt_ops;
+	/**
+	 * \todo XXX: temporary pointer to associated obd_type.
+	 */
+	struct obd_type			*ldt_obd_type;
+	/**
+	 * \todo XXX: temporary: context tags used by obd_*() calls.
+	 */
+	__u32				   ldt_ctx_tags;
+	/**
+	 * Number of existing instances (devices) of this type.
+	 */
+	unsigned				ldt_device_nr;
+	/**
+	 * Linkage into a global list of all device types.
+	 *
+	 * \see lu_device_types.
+	 */
+	struct list_head			      ldt_linkage;
+};
+
+/**
+ * Operations on a device type.
+ */
+struct lu_device_type_operations {
+	/**
+	 * Allocate new device.
+	 */
+	struct lu_device *(*ldto_device_alloc)(const struct lu_env *env,
+					       struct lu_device_type *t,
+					       struct lustre_cfg *lcfg);
+	/**
+	 * Free device. Dual to
+	 * lu_device_type_operations::ldto_device_alloc(). Returns pointer to
+	 * the next device in the stack.
+	 */
+	struct lu_device *(*ldto_device_free)(const struct lu_env *,
+					      struct lu_device *);
+	/**
+	 * Initialize the device after allocation. NOTE(review): the meaning of
+	 * the unnamed string and second device arguments is not documented.
+	 */
+	int  (*ldto_device_init)(const struct lu_env *env,
+				 struct lu_device *, const char *,
+				 struct lu_device *);
+	/**
+	 * Finalize device. Dual to
+	 * lu_device_type_operations::ldto_device_init(). Returns pointer to
+	 * the next device in the stack.
+	 */
+	struct lu_device *(*ldto_device_fini)(const struct lu_env *env,
+					      struct lu_device *);
+	/**
+	 * Initialize device type. This is called on module load.
+	 */
+	int  (*ldto_init)(struct lu_device_type *t);
+	/**
+	 * Finalize device type. Dual to
+	 * lu_device_type_operations::ldto_init(). Called on module unload.
+	 */
+	void (*ldto_fini)(struct lu_device_type *t);
+	/**
+	 * Called when the first device is created.
+	 */
+	void (*ldto_start)(struct lu_device_type *t);
+	/**
+	 * Called when number of devices drops to 0.
+	 */
+	void (*ldto_stop)(struct lu_device_type *t);
+};
+
+static inline int lu_device_is_md(const struct lu_device *d)
+{
+	return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_MD);
+}
+
+/**
+ * Flags for the object layers.
+ */
+enum lu_object_flags {
+	/**
+	 * This flag is set if lu_object_operations::loo_object_init() has
+	 * been called for this layer. Used by lu_object_alloc().
+	 */
+	LU_OBJECT_ALLOCATED = (1 << 0)
+};
+
+/**
+ * Common object attributes.
+ */
+struct lu_attr {
+	/** size in bytes */
+	__u64	  la_size;
+	/** modification time in seconds since Epoch */
+	obd_time       la_mtime;
+	/** access time in seconds since Epoch */
+	obd_time       la_atime;
+	/** change time in seconds since Epoch */
+	obd_time       la_ctime;
+	/** 512-byte blocks allocated to object */
+	__u64	  la_blocks;
+	/** permission bits and file type */
+	__u32	  la_mode;
+	/** owner id */
+	__u32	  la_uid;
+	/** group id */
+	__u32	  la_gid;
+	/** object flags */
+	__u32	  la_flags;
+	/** number of persistent references to this object */
+	__u32	  la_nlink;
+	/** block bits of the object */
+	__u32	  la_blkbits;
+	/** block size of the object */
+	__u32	  la_blksize;
+	/** real device */
+	__u32	  la_rdev;
+	/**
+	 * valid bits
+	 *
+	 * \see enum la_valid
+	 */
+	__u64	  la_valid;
+};
+
+/** Bit-mask of valid attributes */
+enum la_valid {
+	LA_ATIME = 1 << 0,
+	LA_MTIME = 1 << 1,
+	LA_CTIME = 1 << 2,
+	LA_SIZE  = 1 << 3,
+	LA_MODE  = 1 << 4,
+	LA_UID   = 1 << 5,
+	LA_GID   = 1 << 6,
+	LA_BLOCKS = 1 << 7,
+	LA_TYPE   = 1 << 8,
+	LA_FLAGS  = 1 << 9,
+	LA_NLINK  = 1 << 10,
+	LA_RDEV   = 1 << 11,
+	LA_BLKSIZE = 1 << 12,
+	LA_KILL_SUID = 1 << 13,
+	LA_KILL_SGID = 1 << 14,
+};
+
+/**
+ * Layer in the layered object.
+ */
+struct lu_object {
+	/**
+	 * Header for this object.
+	 */
+	struct lu_object_header	   *lo_header;
+	/**
+	 * Device for this layer.
+	 */
+	struct lu_device		  *lo_dev;
+	/**
+	 * Operations for this object.
+	 */
+	const struct lu_object_operations *lo_ops;
+	/**
+	 * Linkage into list of all layers.
+	 */
+	struct list_head			 lo_linkage;
+	/**
+	 * Depth. Top level layer depth is 0.
+	 */
+	int				lo_depth;
+	/**
+	 * Flags from enum lu_object_flags.
+	 */
+	__u32					lo_flags;
+	/**
+	 * Link to the device, for debugging.
+	 */
+	struct lu_ref_link		*lo_dev_ref;
+};
+
+enum lu_object_header_flags {
+	/**
+	 * Don't keep this object in cache. Object will be destroyed as soon
+	 * as last reference to it is released. This flag cannot be cleared
+	 * once set.
+	 */
+	LU_OBJECT_HEARD_BANSHEE = 0,
+	/**
+	 * Marks that this object has already been taken out of cache.
+	 */
+	LU_OBJECT_UNHASHED = 1
+};
+
+enum lu_object_header_attr {
+	LOHA_EXISTS   = 1 << 0,
+	LOHA_REMOTE   = 1 << 1,
+	/**
+	 * UNIX file type is stored in S_IFMT bits.
+	 */
+	LOHA_FT_START = 001 << 12, /**< S_IFIFO */
+	LOHA_FT_END   = 017 << 12, /**< S_IFMT */
+};
+
+/**
+ * "Compound" object, consisting of multiple layers.
+ *
+ * Compound object with given fid is unique within a given lu_site.
+ *
+ * Note that object does *not* necessarily correspond to the real object in the
+ * persistent storage: object is an anchor for locking and method calling, so
+ * it is created for things like not-yet-existing child created by mkdir or
+ * create calls. lu_object_operations::loo_exists() can be used to check
+ * whether object is backed by persistent storage entity.
+ */
+struct lu_object_header {
+	/**
+	 * Object flags from enum lu_object_header_flags. Set and checked
+	 * atomically.
+	 */
+	unsigned long	  loh_flags;
+	/**
+	 * Object reference count. Protected by lu_site::ls_guard.
+	 */
+	atomic_t	   loh_ref;
+	/**
+	 * Fid, uniquely identifying this object.
+	 */
+	struct lu_fid	  loh_fid;
+	/**
+	 * Common object attributes, cached for efficiency. From enum
+	 * lu_object_header_attr.
+	 */
+	__u32		  loh_attr;
+	/**
+	 * Linkage into per-site hash table. Protected by lu_site::ls_guard.
+	 */
+	struct hlist_node       loh_hash;
+	/**
+	 * Linkage into per-site LRU list. Protected by lu_site::ls_guard.
+	 */
+	struct list_head	     loh_lru;
+	/**
+	 * Linkage into list of layers. Never modified once set (except lately
+	 * during object destruction). No locking is necessary.
+	 */
+	struct list_head	     loh_layers;
+	/**
+	 * A list of references to this object, for debugging.
+	 */
+	struct lu_ref	  loh_reference;
+};
+
+struct fld;
+
+struct lu_site_bkt_data {
+	/**
+	 * number of busy objects in this bucket
+	 */
+	long		      lsb_busy;
+	/**
+	 * LRU list, updated on each access to object. Protected by
+	 * bucket lock of lu_site::ls_obj_hash.
+	 *
+	 * "Cold" end of LRU is lsb_lru.next. Accessed objects are
+	 * moved to lsb_lru.prev (this is due to the non-existence
+	 * of list_for_each_entry_safe_reverse()).
+	 */
+	struct list_head		lsb_lru;
+	/**
+	 * Wait-queue signaled when an object in this site is ultimately
+	 * destroyed (lu_object_free()). It is used by lu_object_find() to
+	 * wait before re-trying when object in the process of destruction is
+	 * found in the hash table.
+	 *
+	 * \see htable_lookup().
+	 */
+	wait_queue_head_t	       lsb_marche_funebre;
+};
+
+enum {
+	LU_SS_CREATED	 = 0,
+	LU_SS_CACHE_HIT,
+	LU_SS_CACHE_MISS,
+	LU_SS_CACHE_RACE,
+	LU_SS_CACHE_DEATH_RACE,
+	LU_SS_LRU_PURGED,
+	LU_SS_LAST_STAT
+};
+
+/**
+ * lu_site is a "compartment" within which objects are unique, and LRU
+ * discipline is maintained.
+ *
+ * lu_site exists so that multiple layered stacks can co-exist in the same
+ * address space.
+ *
+ * lu_site has the same relation to lu_device as lu_object_header to
+ * lu_object.
+ */
+struct lu_site {
+	/**
+	 * objects hash table
+	 */
+	cfs_hash_t	       *ls_obj_hash;
+	/**
+	 * index of bucket on hash table while purging
+	 */
+	int		       ls_purge_start;
+	/**
+	 * Top-level device for this stack.
+	 */
+	struct lu_device	 *ls_top_dev;
+	/**
+	 * Bottom-level device for this stack
+	 */
+	struct lu_device	*ls_bottom_dev;
+	/**
+	 * Linkage into global list of sites.
+	 */
+	struct list_head		ls_linkage;
+	/**
+	 * List of lu devices for this site, protected
+	 * by ls_ld_lock.
+	 */
+	struct list_head		ls_ld_linkage;
+	spinlock_t		ls_ld_lock;
+
+	/**
+	 * lu_site stats
+	 */
+	struct lprocfs_stats	*ls_stats;
+	/**
+	 * XXX: a hack! fld has to find md_site via site, remove when possible
+	 */
+	struct seq_server_site	*ld_seq_site;
+};
+
+static inline struct lu_site_bkt_data *
+lu_site_bkt_from_fid(struct lu_site *site, struct lu_fid *fid)
+{
+	cfs_hash_bd_t bd;
+
+	cfs_hash_bd_get(site->ls_obj_hash, fid, &bd);
+	return cfs_hash_bd_extra_get(site->ls_obj_hash, &bd);
+}
+
+/** \name ctors
+ * Constructors/destructors.
+ * @{
+ */
+
+int  lu_site_init	 (struct lu_site *s, struct lu_device *d);
+void lu_site_fini	 (struct lu_site *s);
+int  lu_site_init_finish  (struct lu_site *s);
+void lu_stack_fini	(const struct lu_env *env, struct lu_device *top);
+void lu_device_get	(struct lu_device *d);
+void lu_device_put	(struct lu_device *d);
+int  lu_device_init       (struct lu_device *d, struct lu_device_type *t);
+void lu_device_fini       (struct lu_device *d);
+int  lu_object_header_init(struct lu_object_header *h);
+void lu_object_header_fini(struct lu_object_header *h);
+int  lu_object_init       (struct lu_object *o,
+			   struct lu_object_header *h, struct lu_device *d);
+void lu_object_fini       (struct lu_object *o);
+void lu_object_add_top    (struct lu_object_header *h, struct lu_object *o);
+void lu_object_add	(struct lu_object *before, struct lu_object *o);
+
+void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d);
+void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d);
+
+/**
+ * Helpers to initialize and finalize device types.
+ */
+
+int  lu_device_type_init(struct lu_device_type *ldt);
+void lu_device_type_fini(struct lu_device_type *ldt);
+void lu_types_stop(void);
+
+/** @} ctors */
+
+/** \name caching
+ * Caching and reference counting.
+ * @{
+ */
+
+/**
+ * Acquire additional reference to the given object. This function is used to
+ * attain additional reference. To acquire initial reference use
+ * lu_object_find().
+ */
+static inline void lu_object_get(struct lu_object *o)
+{
+	LASSERT(atomic_read(&o->lo_header->loh_ref) > 0);
+	atomic_inc(&o->lo_header->loh_ref);
+}
+
+/**
+ * Return true if object will not be cached after last reference to it is
+ * released, i.e. if LU_OBJECT_HEARD_BANSHEE is set in lu_object_header::loh_flags.
+ */
+static inline int lu_object_is_dying(const struct lu_object_header *h)
+{
+	return test_bit(LU_OBJECT_HEARD_BANSHEE, &h->loh_flags);
+}
+
+void lu_object_put(const struct lu_env *env, struct lu_object *o);
+void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o);
+void lu_object_unhash(const struct lu_env *env, struct lu_object *o);
+
+int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr);
+
+void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie,
+		   lu_printer_t printer);
+struct lu_object *lu_object_find(const struct lu_env *env,
+				 struct lu_device *dev, const struct lu_fid *f,
+				 const struct lu_object_conf *conf);
+struct lu_object *lu_object_find_at(const struct lu_env *env,
+				    struct lu_device *dev,
+				    const struct lu_fid *f,
+				    const struct lu_object_conf *conf);
+struct lu_object *lu_object_find_slice(const struct lu_env *env,
+				       struct lu_device *dev,
+				       const struct lu_fid *f,
+				       const struct lu_object_conf *conf);
+/** @} caching */
+
+/** \name helpers
+ * Helpers.
+ * @{
+ */
+
+/**
+ * First (topmost) sub-object of given compound object
+ */
+static inline struct lu_object *lu_object_top(struct lu_object_header *h)
+{
+	LASSERT(!list_empty(&h->loh_layers));
+	return container_of0(h->loh_layers.next, struct lu_object, lo_linkage);
+}
+
+/** Return the layer linked after \a o in the list of layers. */
+static inline struct lu_object *lu_object_next(const struct lu_object *o)
+{
+	struct list_head *next = o->lo_linkage.next;
+
+	return container_of0(next, struct lu_object, lo_linkage);
+}
+
+/** Return the fid stored in the header that \a o belongs to. */
+static inline const struct lu_fid *lu_object_fid(const struct lu_object *o)
+{
+	const struct lu_object_header *hdr = o->lo_header;
+
+	return &hdr->loh_fid;
+}
+
+/**
+ * Return the operations vector (ld_ops) of the device \a o belongs to.
+ */
+static inline const struct lu_device_operations *
+lu_object_ops(const struct lu_object *o)
+{
+	return o->lo_dev->ld_ops;
+}
+
+/**
+ * Given a compound object, find its slice, corresponding to the device type
+ * \a dtype.
+ */
+struct lu_object *lu_object_locate(struct lu_object_header *h,
+				   const struct lu_device_type *dtype);
+
+/**
+ * Printer function emitting messages through libcfs_debug_msg().
+ */
+int lu_cdebug_printer(const struct lu_env *env,
+		      void *cookie, const char *format, ...);
+
+/**
+ * Print object description followed by a user-supplied message.
+ */
+#define LU_OBJECT_DEBUG(mask, env, object, format, ...)		   \
+do {								      \
+	LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);		  \
+									  \
+	if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {		     \
+		lu_object_print(env, &msgdata, lu_cdebug_printer, object);\
+		CDEBUG(mask, format , ## __VA_ARGS__);		    \
+	}								 \
+} while (0)
+
+/**
+ * Print short object description followed by a user-supplied message.
+ */
+#define LU_OBJECT_HEADER(mask, env, object, format, ...)		\
+do {								    \
+	LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);		\
+									\
+	if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {		   \
+		lu_object_header_print(env, &msgdata, lu_cdebug_printer,\
+				       (object)->lo_header);	    \
+		lu_cdebug_printer(env, &msgdata, "\n");		 \
+		CDEBUG(mask, format , ## __VA_ARGS__);		  \
+	}							       \
+} while (0)
+
+void lu_object_print       (const struct lu_env *env, void *cookie,
+			    lu_printer_t printer, const struct lu_object *o);
+void lu_object_header_print(const struct lu_env *env, void *cookie,
+			    lu_printer_t printer,
+			    const struct lu_object_header *hdr);
+
+/**
+ * Check object consistency.
+ */
+int lu_object_invariant(const struct lu_object *o);
+
+
+/**
+ * Check whether object exists, no matter on local or remote storage.
+ * Note: LOHA_EXISTS will be set once someone has created the object,
+ * and it does not need to be committed to storage.
+ */
+#define lu_object_exists(o) ((o)->lo_header->loh_attr & LOHA_EXISTS)
+
+/**
+ * Check whether object is on the remote storage (LOHA_REMOTE is set).
+ */
+#define lu_object_remote(o) unlikely((o)->lo_header->loh_attr & LOHA_REMOTE)
+
+static inline int lu_object_assert_exists(const struct lu_object *o)
+{
+	return lu_object_exists(o);
+}
+
+static inline int lu_object_assert_not_exists(const struct lu_object *o)
+{
+	return !lu_object_exists(o);
+}
+
+/**
+ * Attr of this object.
+ */
+static inline __u32 lu_object_attr(const struct lu_object *o)
+{
+	LASSERT(lu_object_exists(o) != 0);
+	return o->lo_header->loh_attr;
+}
+
+static inline struct lu_ref_link *lu_object_ref_add(struct lu_object *o,
+						    const char *scope,
+						    const void *source)
+{
+	return lu_ref_add(&o->lo_header->loh_reference, scope, source);
+}
+
+static inline void lu_object_ref_del(struct lu_object *o,
+				     const char *scope, const void *source)
+{
+	lu_ref_del(&o->lo_header->loh_reference, scope, source);
+}
+
+static inline void lu_object_ref_del_at(struct lu_object *o,
+					struct lu_ref_link *link,
+					const char *scope, const void *source)
+{
+	lu_ref_del_at(&o->lo_header->loh_reference, link, scope, source);
+}
+
+/** input params, should be filled out by mdt */
+struct lu_rdpg {
+	/** hash */
+	__u64		   rp_hash;
+	/** count in bytes */
+	unsigned int	    rp_count;
+	/** number of pages */
+	unsigned int	    rp_npages;
+	/** requested attr */
+	__u32		   rp_attrs;
+	/** pointers to pages */
+	struct page	   **rp_pages;
+};
+
+enum lu_xattr_flags {
+	LU_XATTR_REPLACE = (1 << 0),
+	LU_XATTR_CREATE  = (1 << 1)
+};
+
+/** @} helpers */
+
+/** \name lu_context
+ * @{ */
+
+/** For lu_context health-checks */
+enum lu_context_state {
+	LCS_INITIALIZED = 1,
+	LCS_ENTERED,
+	LCS_LEFT,
+	LCS_FINALIZED
+};
+
+/**
+ * lu_context. Execution context for lu_object methods. Currently associated
+ * with thread.
+ *
+ * All lu_object methods, except device and device type methods (called during
+ * system initialization and shutdown) are executed "within" some
+ * lu_context. This means, that pointer to some "current" lu_context is passed
+ * as an argument to all methods.
+ *
+ * All service ptlrpc threads create lu_context as part of their
+ * initialization. It is possible to create "stand-alone" context for other
+ * execution environments (like system calls).
+ *
+ * lu_object methods mainly use lu_context through lu_context_key interface
+ * that allows each layer to associate arbitrary pieces of data with each
+ * context (see pthread_key_create(3) for similar interface).
+ *
+ * On a client, lu_context is bound to a thread, see cl_env_get().
+ *
+ * \see lu_context_key
+ */
+struct lu_context {
+	/**
+	 * lu_context is used on the client side too. Yet we don't want to
+	 * allocate values of server-side keys for the client contexts and
+	 * vice versa.
+	 *
+	 * To achieve this, a set of tags is introduced. Contexts and keys are
+	 * marked with tags. Key values are created only for contexts whose set
+	 * of tags has non-empty intersection with one for key. Tags are taken
+	 * from enum lu_context_tag.
+	 */
+	__u32		  lc_tags;
+	enum lu_context_state  lc_state;
+	/**
+	 * Pointer to the home service thread. NULL for other execution
+	 * contexts.
+	 */
+	struct ptlrpc_thread  *lc_thread;
+	/**
+	 * Pointer to an array with key values. Internal implementation
+	 * detail.
+	 */
+	void		 **lc_value;
+	/**
+	 * Linkage into a list of all remembered contexts. Only
+	 * `non-transient' contexts, i.e., ones created for service threads
+	 * are placed here.
+	 */
+	struct list_head	     lc_remember;
+	/**
+	 * Version counter used to skip calls to lu_context_refill() when no
+	 * keys were registered.
+	 */
+	unsigned	       lc_version;
+	/**
+	 * Debugging cookie.
+	 */
+	unsigned	       lc_cookie;
+};
+
+/**
+ * lu_context_key interface. Similar to pthread_key.
+ */
+
+enum lu_context_tag {
+	/**
+	 * Thread on md server
+	 */
+	LCT_MD_THREAD = 1 << 0,
+	/**
+	 * Thread on dt server
+	 */
+	LCT_DT_THREAD = 1 << 1,
+	/**
+	 * Context for transaction handle
+	 */
+	LCT_TX_HANDLE = 1 << 2,
+	/**
+	 * Thread on client
+	 */
+	LCT_CL_THREAD = 1 << 3,
+	/**
+	 * A per-request session on a server, and a per-system-call session on
+	 * a client.
+	 */
+	LCT_SESSION   = 1 << 4,
+	/**
+	 * A per-request data on OSP device
+	 */
+	LCT_OSP_THREAD = 1 << 5,
+	/**
+	 * MGS device thread
+	 */
+	LCT_MG_THREAD = 1 << 6,
+	/**
+	 * Context for local operations
+	 */
+	LCT_LOCAL = 1 << 7,
+	/**
+	 * Set when at least one of keys, having values in this context has
+	 * non-NULL lu_context_key::lct_exit() method. This is used to
+	 * optimize lu_context_exit() call.
+	 */
+	LCT_HAS_EXIT  = 1 << 28,
+	/**
+	 * Don't add references for modules creating key values in that context.
+	 * This is only for contexts used internally by lu_object framework.
+	 */
+	LCT_NOREF     = 1 << 29,
+	/**
+	 * Key is being prepared for retiring, don't create new values for it.
+	 */
+	LCT_QUIESCENT = 1 << 30,
+	/**
+	 * Context should be remembered. NOTE(review): 1 << 31 overflows int.
+	 */
+	LCT_REMEMBER  = 1 << 31,
+	/**
+	 * Contexts usable in cache shrinker thread.
+	 */
+	LCT_SHRINKER  = LCT_MD_THREAD|LCT_DT_THREAD|LCT_CL_THREAD|LCT_NOREF
+};
+
+/**
+ * Key. Represents per-context value slot.
+ *
+ * Keys are usually registered when module owning the key is initialized, and
+ * de-registered when module is unloaded. Once key is registered, all new
+ * contexts with matching tags, will get key value. "Old" contexts, already
+ * initialized at the time of key registration, can be forced to get key value
+ * by calling lu_context_refill().
+ *
+ * Every key value is counted in lu_context_key::lct_used and acquires a
+ * reference on an owning module. This means, that all key values have to be
+ * destroyed before module can be unloaded. This is usually achieved by
+ * stopping threads started by the module, that created contexts in their
+ * entry functions. Situation is complicated by the threads shared by multiple
+ * modules, like ptlrpcd daemon on a client. To work around this problem,
+ * contexts, created in such threads, are `remembered' (see
+ * LCT_REMEMBER)---i.e., added into a global list. When module is preparing
+ * for unloading it does the following:
+ *
+ *     - marks its keys as `quiescent' (lu_context_tag::LCT_QUIESCENT)
+ *       preventing new key values from being allocated in the new contexts,
+ *       and
+ *
+ *     - scans a list of remembered contexts, destroying values of module
+ *       keys, thus releasing references to the module.
+ *
+ * This is done by lu_context_key_quiesce(). If module is re-activated
+ * before key has been de-registered, lu_context_key_revive() call clears
+ * `quiescent' marker.
+ *
+ * lu_context code doesn't provide any internal synchronization for these
+ * activities---it's assumed that startup (including threads start-up) and
+ * shutdown are serialized by some external means.
+ *
+ * \see lu_context
+ */
+struct lu_context_key {
+	/**
+	 * Set of tags for which values of this key are to be instantiated.
+	 */
+	__u32 lct_tags;
+	/**
+	 * Value constructor. This is called when new value is created for a
+	 * context. Returns pointer to new value or error pointer.
+	 */
+	void  *(*lct_init)(const struct lu_context *ctx,
+			   struct lu_context_key *key);
+	/**
+	 * Value destructor. Called when context with previously allocated
+	 * value of this slot is destroyed. \a data is a value that was returned
+	 * by a matching call to lu_context_key::lct_init().
+	 */
+	void   (*lct_fini)(const struct lu_context *ctx,
+			   struct lu_context_key *key, void *data);
+	/**
+	 * Optional method called on lu_context_exit() for all allocated
+	 * keys. Can be used by debugging code checking that locks are
+	 * released, etc.
+	 */
+	void   (*lct_exit)(const struct lu_context *ctx,
+			   struct lu_context_key *key, void *data);
+	/**
+	 * Internal implementation detail: index within lu_context::lc_value[]
+	 * reserved for this key.
+	 */
+	int      lct_index;
+	/**
+	 * Internal implementation detail: number of values created for this
+	 * key.
+	 */
+	atomic_t lct_used;
+	/**
+	 * Internal implementation detail: module for this key.
+	 */
+	module_t *lct_owner;
+	/**
+	 * References to this key. For debugging.
+	 */
+	struct lu_ref  lct_reference;
+};
+
+#define LU_KEY_INIT(mod, type)				    \
+	static void* mod##_key_init(const struct lu_context *ctx, \
+				    struct lu_context_key *key)   \
+	{							 \
+		type *value;				      \
+								  \
+		CLASSERT(PAGE_CACHE_SIZE >= sizeof (*value));       \
+								  \
+		OBD_ALLOC_PTR(value);			     \
+		if (value == NULL)				\
+			value = ERR_PTR(-ENOMEM);		 \
+								  \
+		return value;				     \
+	}							 \
+	struct __##mod##__dummy_init {;} /* semicolon catcher */
+
+#define LU_KEY_FINI(mod, type)					      \
+	static void mod##_key_fini(const struct lu_context *ctx,	    \
+				    struct lu_context_key *key, void* data) \
+	{								   \
+		type *info = data;					  \
+									    \
+		OBD_FREE_PTR(info);					 \
+	}								   \
+	struct __##mod##__dummy_fini {;} /* semicolon catcher */
+
+#define LU_KEY_INIT_FINI(mod, type)   \
+	LU_KEY_INIT(mod,type);	\
+	LU_KEY_FINI(mod,type)
+
+#define LU_CONTEXT_KEY_DEFINE(mod, tags)		\
+	struct lu_context_key mod##_thread_key = {      \
+		.lct_tags = tags,		       \
+		.lct_init = mod##_key_init,	     \
+		.lct_fini = mod##_key_fini	      \
+	}
+
+#define LU_CONTEXT_KEY_INIT(key)			\
+do {						    \
+	(key)->lct_owner = THIS_MODULE;		 \
+} while (0)
+
+int   lu_context_key_register(struct lu_context_key *key);
+void  lu_context_key_degister(struct lu_context_key *key);
+void *lu_context_key_get     (const struct lu_context *ctx,
+			       const struct lu_context_key *key);
+void  lu_context_key_quiesce (struct lu_context_key *key);
+void  lu_context_key_revive  (struct lu_context_key *key);
+
+
+/*
+ * LU_KEY_INIT_GENERIC() has to be a macro to correctly determine an
+ * owning module.
+ */
+
+#define LU_KEY_INIT_GENERIC(mod)					\
+	static void mod##_key_init_generic(struct lu_context_key *k, ...) \
+	{							       \
+		struct lu_context_key *key = k;			 \
+		va_list args;					   \
+									\
+		va_start(args, k);				      \
+		do {						    \
+			LU_CONTEXT_KEY_INIT(key);		       \
+			key = va_arg(args, struct lu_context_key *);    \
+		} while (key != NULL);				  \
+		va_end(args);					   \
+	}
+
+#define LU_TYPE_INIT(mod, ...)					  \
+	LU_KEY_INIT_GENERIC(mod)					\
+	static int mod##_type_init(struct lu_device_type *t)	    \
+	{							       \
+		mod##_key_init_generic(__VA_ARGS__, NULL);	      \
+		return lu_context_key_register_many(__VA_ARGS__, NULL); \
+	}							       \
+	struct __##mod##_dummy_type_init {;}
+
+#define LU_TYPE_FINI(mod, ...)					  \
+	static void mod##_type_fini(struct lu_device_type *t)	   \
+	{							       \
+		lu_context_key_degister_many(__VA_ARGS__, NULL);	\
+	}							       \
+	struct __##mod##_dummy_type_fini {;}
+
+#define LU_TYPE_START(mod, ...)				 \
+	static void mod##_type_start(struct lu_device_type *t)  \
+	{						       \
+		lu_context_key_revive_many(__VA_ARGS__, NULL);  \
+	}						       \
+	struct __##mod##_dummy_type_start {;}
+
+#define LU_TYPE_STOP(mod, ...)				  \
+	static void mod##_type_stop(struct lu_device_type *t)   \
+	{						       \
+		lu_context_key_quiesce_many(__VA_ARGS__, NULL); \
+	}						       \
+	struct __##mod##_dummy_type_stop {;}
+
+
+
+#define LU_TYPE_INIT_FINI(mod, ...)	     \
+	LU_TYPE_INIT(mod, __VA_ARGS__);	 \
+	LU_TYPE_FINI(mod, __VA_ARGS__);	 \
+	LU_TYPE_START(mod, __VA_ARGS__);	\
+	LU_TYPE_STOP(mod, __VA_ARGS__)
+
+int   lu_context_init  (struct lu_context *ctx, __u32 tags);
+void  lu_context_fini  (struct lu_context *ctx);
+void  lu_context_enter (struct lu_context *ctx);
+void  lu_context_exit  (struct lu_context *ctx);
+int   lu_context_refill(struct lu_context *ctx);
+
+/*
+ * Helper functions to operate on multiple keys. These are used by the default
+ * device type operations, defined by LU_TYPE_INIT_FINI().
+ */
+
+int  lu_context_key_register_many(struct lu_context_key *k, ...);
+void lu_context_key_degister_many(struct lu_context_key *k, ...);
+void lu_context_key_revive_many  (struct lu_context_key *k, ...);
+void lu_context_key_quiesce_many (struct lu_context_key *k, ...);
+
+/*
+ * update/clear ctx/ses tags.
+ */
+void lu_context_tags_update(__u32 tags);
+void lu_context_tags_clear(__u32 tags);
+void lu_session_tags_update(__u32 tags);
+void lu_session_tags_clear(__u32 tags);
+
+/**
+ * Environment.
+ */
+struct lu_env {
+	/**
+	 * "Local" context, used to store data instead of stack.
+	 */
+	struct lu_context  le_ctx;
+	/**
+	 * "Session" context for per-request data.
+	 */
+	struct lu_context *le_ses;
+};
+
+int  lu_env_init  (struct lu_env *env, __u32 tags);
+void lu_env_fini  (struct lu_env *env);
+int  lu_env_refill(struct lu_env *env);
+int  lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, __u32 stags);
+
+/** @} lu_context */
+
+/**
+ * Output site statistical counters into a buffer. Suitable for
+ * ll_rd_*()-style functions.
+ */
+int lu_site_stats_print(const struct lu_site *s, char *page, int count);
+
+/**
+ * Common name structure to be passed around for various name related methods.
+ */
+struct lu_name {
+	const char    *ln_name;
+	int	    ln_namelen;
+};
+
+/**
+ * Common buffer passed around for various xattr_{s,g}et() methods.
+ */
+struct lu_buf {
+	void   *lb_buf;
+	ssize_t lb_len;
+};
+
+/** printf() format/arguments for a lu_buf; %zd matches ssize_t lb_len. */
+#define DLUBUF "(%p %zd)"
+#define PLUBUF(buf) (buf)->lb_buf, (buf)->lb_len
+/**
+ * One-time initializers, called at obdclass module initialization, not
+ * exported.
+ */
+
+/**
+ * Initialization of global lu_* data.
+ */
+int lu_global_init(void);
+
+/**
+ * Dual to lu_global_init().
+ */
+void lu_global_fini(void);
+
+struct lu_kmem_descr {
+	struct kmem_cache **ckd_cache;
+	const char       *ckd_name;
+	const size_t      ckd_size;
+};
+
+int  lu_kmem_init(struct lu_kmem_descr *caches);
+void lu_kmem_fini(struct lu_kmem_descr *caches);
+
+void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o,
+			  const struct lu_fid *fid);
+struct lu_object *lu_object_anon(const struct lu_env *env,
+				 struct lu_device *dev,
+				 const struct lu_object_conf *conf);
+
+/** null buffer */
+extern struct lu_buf LU_BUF_NULL;
+
+void lu_buf_free(struct lu_buf *buf);
+void lu_buf_alloc(struct lu_buf *buf, int size);
+void lu_buf_realloc(struct lu_buf *buf, int size);
+
+int lu_buf_check_and_grow(struct lu_buf *buf, int len);
+struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, int len);
+
+/** @} lu */
+#endif /* __LUSTRE_LU_OBJECT_H */
diff --git a/drivers/staging/lustre/lustre/include/lu_ref.h b/drivers/staging/lustre/lustre/include/lu_ref.h
new file mode 100644
index 000000000000..624c19be1524
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lu_ref.h
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef __LUSTRE_LU_REF_H
+#define __LUSTRE_LU_REF_H
+
+#include <linux/list.h>
+
+/** \defgroup lu_ref lu_ref
+ *
+ * An interface to track references between objects. Mostly for debugging.
+ *
+ * Suppose there is a reference counted data-structure struct foo. To track
+ * who acquired references to instance of struct foo, add lu_ref field to it:
+ *
+ * \code
+ *	 struct foo {
+ *		 atomic_t      foo_refcount;
+ *		 struct lu_ref foo_reference;
+ *		 ...
+ *	 };
+ * \endcode
+ *
+ * foo::foo_reference has to be initialized by calling
+ * lu_ref_init(). Typically there will be functions or macros to increment and
+ * decrement foo::foo_refcount, let's say they are foo_get(struct foo *foo)
+ * and foo_put(struct foo *foo), respectively.
+ *
+ * Whenever foo_get() is called to acquire a reference on a foo, lu_ref_add()
+ * has to be called to insert into foo::foo_reference a record, describing
+ * acquired reference. Dually, lu_ref_del() removes matching record. Typical
+ * usages are:
+ *
+ * \code
+ *	struct bar *bar;
+ *
+ *	// bar owns a reference to foo.
+ *	bar->bar_foo = foo_get(foo);
+ *	lu_ref_add(&foo->foo_reference, "bar", bar);
+ *
+ *	...
+ *
+ *	// reference from bar to foo is released.
+ *	lu_ref_del(&foo->foo_reference, "bar", bar);
+ *	foo_put(bar->bar_foo);
+ *
+ *
+ *	// current thread acquired a temporary reference to foo.
+ *	foo_get(foo);
+ *	lu_ref_add(&foo->foo_reference, __func__, current);
+ *
+ *	...
+ *
+ *	// temporary reference is released.
+ *	lu_ref_del(&foo->foo_reference, __func__, current);
+ *	foo_put(foo);
+ * \endcode
+ *
+ * \e Et \e cetera. Often it makes sense to include lu_ref_add() and
+ * lu_ref_del() calls into foo_get() and foo_put(). When an instance of struct
+ * foo is destroyed, lu_ref_fini() has to be called that checks that no
+ * pending references remain. lu_ref_print() can be used to dump a list of
+ * pending references, while hunting down a leak.
+ *
+ * For objects to which a large number of references can be acquired,
+ * lu_ref_del() can become cpu consuming, as it has to scan the list of
+ * references. To work around this, remember result of lu_ref_add() (usually
+ * in the same place where pointer to struct foo is stored), and use
+ * lu_ref_del_at():
+ *
+ * \code
+ *	// There is a large number of bar's for a single foo.
+ *	bar->bar_foo     = foo_get(foo);
+ *	bar->bar_foo_ref = lu_ref_add(&foo->foo_reference, "bar", bar);
+ *
+ *	...
+ *
+ *	// reference from bar to foo is released.
+ *	lu_ref_del_at(&foo->foo_reference, bar->bar_foo_ref, "bar", bar);
+ *	foo_put(bar->bar_foo);
+ * \endcode
+ *
+ * lu_ref interface degrades gracefully in case of memory shortages.
+ *
+ * @{
+ */
+
+
+struct lu_ref  {};
+
+static inline void lu_ref_init(struct lu_ref *ref)
+{
+}
+
+static inline void lu_ref_fini(struct lu_ref *ref)
+{
+}
+
+static inline struct lu_ref_link *lu_ref_add(struct lu_ref *ref,
+					     const char *scope,
+					     const void *source)
+{
+	return NULL;
+}
+
+static inline struct lu_ref_link *lu_ref_add_atomic(struct lu_ref *ref,
+						    const char *scope,
+						    const void *source)
+{
+	return NULL;
+}
+
+static inline void lu_ref_del(struct lu_ref *ref, const char *scope,
+			      const void *source)
+{
+}
+
+static inline void lu_ref_set_at(struct lu_ref *ref, struct lu_ref_link *link,
+				 const char *scope, const void *source0,
+				 const void *source1)
+{
+}
+
+static inline void lu_ref_del_at(struct lu_ref *ref, struct lu_ref_link *link,
+				 const char *scope, const void *source)
+{
+}
+
+static inline int lu_ref_global_init(void)
+{
+	return 0;
+}
+
+static inline void lu_ref_global_fini(void)
+{
+}
+
+static inline void lu_ref_print(const struct lu_ref *ref)
+{
+}
+
+static inline void lu_ref_print_all(void)
+{
+}
+
+/** @} lu_ref */
+
+#endif /* __LUSTRE_LU_REF_H */
diff --git a/drivers/staging/lustre/lustre/include/lu_target.h b/drivers/staging/lustre/lustre/include/lu_target.h
new file mode 100644
index 000000000000..8d48cf4e27ee
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lu_target.h
@@ -0,0 +1,91 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_LU_TARGET_H
+#define _LUSTRE_LU_TARGET_H
+
+#include <dt_object.h>
+#include <lustre_disk.h>
+
+struct lu_target {
+	struct obd_device       *lut_obd;
+	struct dt_device	*lut_bottom;
+	/** last_rcvd file */
+	struct dt_object	*lut_last_rcvd;
+	/* transaction callbacks */
+	struct dt_txn_callback   lut_txn_cb;
+	/** server data in last_rcvd file */
+	struct lr_server_data    lut_lsd;
+	/** Server last transaction number */
+	__u64		    lut_last_transno;
+	/** Lock protecting last transaction number */
+	spinlock_t		 lut_translock;
+	/** Lock protecting client bitmap */
+	spinlock_t		 lut_client_bitmap_lock;
+	/** Bitmap of known clients */
+	unsigned long	   *lut_client_bitmap;
+};
+
+typedef void (*tgt_cb_t)(struct lu_target *lut, __u64 transno,
+			 void *data, int err);
+struct tgt_commit_cb {
+	tgt_cb_t  tgt_cb_func;
+	void     *tgt_cb_data;
+};
+
+void tgt_boot_epoch_update(struct lu_target *lut);
+int tgt_last_commit_cb_add(struct thandle *th, struct lu_target *lut,
+			   struct obd_export *exp, __u64 transno);
+int tgt_new_client_cb_add(struct thandle *th, struct obd_export *exp);
+int tgt_init(const struct lu_env *env, struct lu_target *lut,
+	     struct obd_device *obd, struct dt_device *dt);
+void tgt_fini(const struct lu_env *env, struct lu_target *lut);
+int tgt_client_alloc(struct obd_export *exp);
+void tgt_client_free(struct obd_export *exp);
+int tgt_client_del(const struct lu_env *env, struct obd_export *exp);
+int tgt_client_add(const struct lu_env *env, struct obd_export *exp, int);
+int tgt_client_new(const struct lu_env *env, struct obd_export *exp);
+int tgt_client_data_read(const struct lu_env *env, struct lu_target *tg,
+			 struct lsd_client_data *lcd, loff_t *off, int index);
+int tgt_client_data_write(const struct lu_env *env, struct lu_target *tg,
+			  struct lsd_client_data *lcd, loff_t *off, struct thandle *th);
+int tgt_server_data_read(const struct lu_env *env, struct lu_target *tg);
+int tgt_server_data_write(const struct lu_env *env, struct lu_target *tg,
+			  struct thandle *th);
+int tgt_server_data_update(const struct lu_env *env, struct lu_target *tg, int sync);
+int tgt_truncate_last_rcvd(const struct lu_env *env, struct lu_target *tg, loff_t off);
+
+#endif /* _LUSTRE_LU_TARGET_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre/libiam.h b/drivers/staging/lustre/lustre/include/lustre/libiam.h
new file mode 100644
index 000000000000..e8e0b084a6bc
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/libiam.h
@@ -0,0 +1,145 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/libiam.h
+ *
+ * iam user level library
+ *
+ * Author: Wang Di <wangdi@clusterfs.com>
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+/*
+ *  lustre/libiam.h
+ */
+
+#ifndef __IAM_ULIB_H__
+#define __IAM_ULIB_H__
+
+/** \defgroup libiam libiam
+ *
+ * @{
+ */
+
+
+#define DX_FMT_NAME_LEN 16
+
+enum iam_fmt_t {
+	FMT_LFIX,
+	FMT_LVAR
+};
+
+struct iam_uapi_info {
+	__u16 iui_keysize;
+	__u16 iui_recsize;
+	__u16 iui_ptrsize;
+	__u16 iui_height;
+	char  iui_fmt_name[DX_FMT_NAME_LEN];
+};
+
+/*
+ * Create an iam file, but do NOT open it.
+ * Return 0 on success, else -1.
+ */
+int iam_creat(char *filename, enum iam_fmt_t fmt,
+	      int blocksize, int keysize, int recsize, int ptrsize);
+
+/*
+ * Open an iam file, but do NOT create it if the file doesn't exist.
+ * Please use iam_creat to create the file before using iam_open.
+ * Return file id (fd) on success, else -1.
+ */
+int iam_open(char *filename, struct iam_uapi_info *ua);
+
+/*
+ * Close file opened by iam_open.
+ */
+int iam_close(int fd);
+
+/*
+ * Please use iam_open before using this function.
+ */
+int iam_insert(int fd, struct iam_uapi_info *ua,
+	       int key_need_convert, char *keybuf,
+	       int rec_need_convert, char *recbuf);
+
+/*
+ * Please use iam_open before using this function.
+ */
+int iam_lookup(int fd, struct iam_uapi_info *ua,
+	       int key_need_convert, char *key_buf,
+	       int *keysize, char *save_key,
+	       int rec_need_convert, char *rec_buf,
+	       int *recsize, char *save_rec);
+
+/*
+ * Please use iam_open before using this function.
+ */
+int iam_delete(int fd, struct iam_uapi_info *ua,
+	       int key_need_convert, char *keybuf,
+	       int rec_need_convert, char *recbuf);
+
+/*
+ * Please use iam_open before using this function.
+ */
+int iam_it_start(int fd, struct iam_uapi_info *ua,
+		 int key_need_convert, char *key_buf,
+		 int *keysize, char *save_key,
+		 int rec_need_convert, char *rec_buf,
+		 int *recsize, char *save_rec);
+
+/*
+ * Please use iam_open before using this function.
+ */
+int iam_it_next(int fd, struct iam_uapi_info *ua,
+		int key_need_convert, char *key_buf,
+		int *keysize, char *save_key,
+		int rec_need_convert, char *rec_buf,
+		int *recsize, char *save_rec);
+
+/*
+ * Please use iam_open before using this function.
+ */
+int iam_it_stop(int fd, struct iam_uapi_info *ua,
+		int key_need_convert, char *keybuf,
+		int rec_need_convert, char *recbuf);
+
+/*
+ * Change iam file mode.
+ */
+int iam_polymorph(char *filename, unsigned long mode);
+
+/** @} libiam */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre/liblustreapi.h b/drivers/staging/lustre/lustre/include/lustre/liblustreapi.h
new file mode 100644
index 000000000000..707eb74fdf68
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/liblustreapi.h
@@ -0,0 +1,43 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/*
+ * NOTE: This file is DEPRECATED!  Please include lustreapi.h directly
+ * instead of this file.  This file will be removed from a future version
+ * of lustre!
+ */
+
+#ifndef _LIBLUSTREAPI_H_
+#define _LIBLUSTREAPI_H_
+
+#include <lustre/lustreapi.h>
+#warning "Including liblustreapi.h is deprecated. Include lustreapi.h directly."
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h b/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h
new file mode 100644
index 000000000000..ad253c6deadd
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h
@@ -0,0 +1,121 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/ll_fiemap.h
+ *
+ * FIEMAP data structures and flags. This header file will be used until
+ * fiemap.h is available in the upstream kernel.
+ *
+ * Author: Kalpak Shah <kalpak.shah@sun.com>
+ * Author: Andreas Dilger <adilger@sun.com>
+ */
+
+#ifndef _LUSTRE_FIEMAP_H
+#define _LUSTRE_FIEMAP_H
+
+
+
+struct ll_fiemap_extent {
+	__u64 fe_logical;  /* logical offset in bytes for the start of
+			    * the extent from the beginning of the file */
+	__u64 fe_physical; /* physical offset in bytes for the start
+			    * of the extent from the beginning of the disk */
+	__u64 fe_length;   /* length in bytes for this extent */
+	__u64 fe_reserved64[2];
+	__u32 fe_flags;    /* FIEMAP_EXTENT_* flags for this extent */
+	__u32 fe_device;   /* device number for this extent */
+	__u32 fe_reserved[2];
+};
+
+struct ll_user_fiemap {
+	__u64 fm_start;  /* logical offset (inclusive) at
+			  * which to start mapping (in) */
+	__u64 fm_length; /* logical length of mapping which
+			  * userspace wants (in) */
+	__u32 fm_flags;  /* FIEMAP_FLAG_* flags for request (in/out) */
+	__u32 fm_mapped_extents;/* number of extents that were mapped (out) */
+	__u32 fm_extent_count;  /* size of fm_extents array (in) */
+	__u32 fm_reserved;
+	struct ll_fiemap_extent fm_extents[0]; /* array of mapped extents (out) */
+};
+
+#define FIEMAP_MAX_OFFSET      (~0ULL)
+
+#define FIEMAP_FLAG_SYNC	 0x00000001 /* sync file data before map */
+#define FIEMAP_FLAG_XATTR	0x00000002 /* map extended attribute tree */
+
+#define FIEMAP_EXTENT_LAST	      0x00000001 /* Last extent in file. */
+#define FIEMAP_EXTENT_UNKNOWN	   0x00000002 /* Data location unknown. */
+#define FIEMAP_EXTENT_DELALLOC	  0x00000004 /* Location still pending.
+						    * Sets EXTENT_UNKNOWN. */
+#define FIEMAP_EXTENT_ENCODED	   0x00000008 /* Data can not be read
+						    * while fs is unmounted */
+#define FIEMAP_EXTENT_DATA_ENCRYPTED    0x00000080 /* Data is encrypted by fs.
+						    * Sets EXTENT_NO_DIRECT. */
+#define FIEMAP_EXTENT_NOT_ALIGNED       0x00000100 /* Extent offsets may not be
+						    * block aligned. */
+#define FIEMAP_EXTENT_DATA_INLINE       0x00000200 /* Data mixed with metadata.
+						    * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_DATA_TAIL	 0x00000400 /* Multiple files in block.
+						    * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_UNWRITTEN	 0x00000800 /* Space allocated, but
+						    * no data (i.e. zero). */
+#define FIEMAP_EXTENT_MERGED	    0x00001000 /* File does not natively
+						    * support extents. Result
+						    * merged for efficiency. */
+
+static inline size_t fiemap_count_to_size(size_t extent_count)
+{
+	return (sizeof(struct ll_user_fiemap) + extent_count *
+					       sizeof(struct ll_fiemap_extent));
+}
+
+static inline unsigned fiemap_size_to_count(size_t array_size)
+{
+	return array_size < sizeof(struct ll_user_fiemap) ? 0 : /* no wrap */
+	       (array_size - sizeof(struct ll_user_fiemap)) /
+	       sizeof(struct ll_fiemap_extent); /* header is not counted */
+}
+
+#define FIEMAP_FLAG_DEVICE_ORDER 0x40000000 /* return device ordered mapping */
+
+#ifdef FIEMAP_FLAGS_COMPAT
+#undef FIEMAP_FLAGS_COMPAT
+#endif
+
+/* Lustre specific flags - use a high bit, don't conflict with upstream flag */
+#define FIEMAP_EXTENT_NO_DIRECT	 0x40000000 /* Data mapping undefined */
+#define FIEMAP_EXTENT_NET	       0x80000000 /* Data stored remotely.
+						    * Sets NO_DIRECT flag */
+
+#endif /* _LUSTRE_FIEMAP_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h b/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h
new file mode 100644
index 000000000000..93a3d7db3010
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h
@@ -0,0 +1,2 @@
+#define BUILD_VERSION "v2_3_64_0-g6e62c21-CHANGED-3.9.0"
+#define LUSTRE_RELEASE 3.9.0_g6e62c21
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h b/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h
new file mode 100644
index 000000000000..029aa2fcbe07
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h
@@ -0,0 +1,3629 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/lustre_idl.h
+ *
+ * Lustre wire protocol definitions.
+ */
+
+/** \defgroup lustreidl lustreidl
+ *
+ * Lustre wire protocol definitions.
+ *
+ * ALL structs passing over the wire should be declared here.  Structs
+ * that are used in interfaces with userspace should go in lustre_user.h.
+ *
+ * All structs being declared here should be built from simple fixed-size
+ * types (__u8, __u16, __u32, __u64) or be built from other types or
+ * structs also declared in this file.  Similarly, all flags and magic
+ * values in those structs should also be declared here.  This ensures
+ * that the Lustre wire protocol is not influenced by external dependencies.
+ *
+ * The only other acceptable items in this file are VERY SIMPLE accessor
+ * functions to avoid callers grubbing inside the structures, and the
+ * prototypes of the swabber functions for each struct.  Nothing that
+ * depends on external functions or definitions should be in here.
+ *
+ * Structs must be properly aligned to put 64-bit values on an 8-byte
+ * boundary.  Any structs being added here must also be added to
+ * utils/wirecheck.c and "make newwiretest" run to regenerate the
+ * utils/wiretest.c sources.  This allows us to verify that wire structs
+ * have the proper alignment/size on all architectures.
+ *
+ * DO NOT CHANGE any of the structs, flags, values declared here and used
+ * in released Lustre versions.  Some structs may have padding fields that
+ * can be used.  Some structs might allow addition at the end (verify this
+ * in the code to ensure that new/old clients that see this larger struct
+ * do not fail, otherwise you need to implement protocol compatibility).
+ *
+ * We assume all nodes are either little-endian or big-endian, and we
+ * always send messages in the sender's native format.  The receiver
+ * detects the message format by checking the 'magic' field of the message
+ * (see lustre_msg_swabbed() below).
+ *
+ * Each wire type has corresponding 'lustre_swab_xxxtypexxx()' routines,
+ * implemented either here, inline (trivial implementations) or in
+ * ptlrpc/pack_generic.c.  These 'swabbers' convert the type from "other"
+ * endian, in-place in the message buffer.
+ *
+ * A swabber takes a single pointer argument.  The caller must already have
+ * verified that the length of the message buffer >= sizeof (type).
+ *
+ * For variable length types, a second 'lustre_swab_v_xxxtypexxx()' routine
+ * may be defined that swabs just the variable part, after the caller has
+ * verified that the message buffer is large enough.
+ *
+ * @{
+ */
+
+#ifndef _LUSTRE_IDL_H_
+#define _LUSTRE_IDL_H_
+
+#if !defined(LASSERT) && !defined(LPU64)
+#include <linux/libcfs/libcfs.h> /* for LASSERT, LPUX64, etc */
+#endif
+
+/* Defn's shared with user-space. */
+#include <lustre/lustre_user.h>
+
+/*
+ *  GENERAL STUFF
+ */
+/* FOO_REQUEST_PORTAL is for incoming requests on the FOO
+ * FOO_REPLY_PORTAL   is for incoming replies on the FOO
+ * FOO_BULK_PORTAL    is for incoming bulk on the FOO
+ */
+
+#define CONNMGR_REQUEST_PORTAL	  1
+#define CONNMGR_REPLY_PORTAL	    2
+//#define OSC_REQUEST_PORTAL	    3
+#define OSC_REPLY_PORTAL		4
+//#define OSC_BULK_PORTAL	       5
+#define OST_IO_PORTAL		   6
+#define OST_CREATE_PORTAL	       7
+#define OST_BULK_PORTAL		 8
+//#define MDC_REQUEST_PORTAL	    9
+#define MDC_REPLY_PORTAL	       10
+//#define MDC_BULK_PORTAL	      11
+#define MDS_REQUEST_PORTAL	     12
+//#define MDS_REPLY_PORTAL	     13
+#define MDS_BULK_PORTAL		14
+#define LDLM_CB_REQUEST_PORTAL	 15
+#define LDLM_CB_REPLY_PORTAL	   16
+#define LDLM_CANCEL_REQUEST_PORTAL     17
+#define LDLM_CANCEL_REPLY_PORTAL       18
+//#define PTLBD_REQUEST_PORTAL	   19
+//#define PTLBD_REPLY_PORTAL	     20
+//#define PTLBD_BULK_PORTAL	      21
+#define MDS_SETATTR_PORTAL	     22
+#define MDS_READPAGE_PORTAL	    23
+#define MDS_MDS_PORTAL		 24
+
+#define MGC_REPLY_PORTAL	       25
+#define MGS_REQUEST_PORTAL	     26
+#define MGS_REPLY_PORTAL	       27
+#define OST_REQUEST_PORTAL	     28
+#define FLD_REQUEST_PORTAL	     29
+#define SEQ_METADATA_PORTAL	    30
+#define SEQ_DATA_PORTAL		31
+#define SEQ_CONTROLLER_PORTAL	  32
+#define MGS_BULK_PORTAL		33
+
+/* Portal 63 is reserved for the Cray Inc DVS - nic@cray.com, roe@cray.com, n8851@cray.com */
+
+/* packet types */
+#define PTL_RPC_MSG_REQUEST 4711
+#define PTL_RPC_MSG_ERR     4712
+#define PTL_RPC_MSG_REPLY   4713
+
+/* DON'T use swabbed values of MAGIC as magic! */
+#define LUSTRE_MSG_MAGIC_V1 0x0BD00BD0
+#define LUSTRE_MSG_MAGIC_V2 0x0BD00BD3
+
+#define LUSTRE_MSG_MAGIC_V1_SWABBED 0xD00BD00B
+#define LUSTRE_MSG_MAGIC_V2_SWABBED 0xD30BD00B
+
+#define LUSTRE_MSG_MAGIC LUSTRE_MSG_MAGIC_V2
+
+#define PTLRPC_MSG_VERSION  0x00000003
+#define LUSTRE_VERSION_MASK 0xffff0000
+#define LUSTRE_OBD_VERSION  0x00010000
+#define LUSTRE_MDS_VERSION  0x00020000
+#define LUSTRE_OST_VERSION  0x00030000
+#define LUSTRE_DLM_VERSION  0x00040000
+#define LUSTRE_LOG_VERSION  0x00050000
+#define LUSTRE_MGS_VERSION  0x00060000
+
+typedef __u32 mdsno_t;
+typedef __u64 seqno_t;
+typedef __u64 obd_id;
+typedef __u64 obd_seq;
+typedef __s64 obd_time;
+typedef __u64 obd_size;
+typedef __u64 obd_off;
+typedef __u64 obd_blocks;
+typedef __u64 obd_valid;
+typedef __u32 obd_blksize;
+typedef __u32 obd_mode;
+typedef __u32 obd_uid;
+typedef __u32 obd_gid;
+typedef __u32 obd_flag;
+typedef __u32 obd_count;
+
+/**
+ * Describes a range of sequence, lsr_start is included but lsr_end is
+ * not in the range.
+ * Same structure is used in fld module where lsr_index field holds mdt id
+ * of the home mdt.
+ */
+struct lu_seq_range {
+	__u64 lsr_start;
+	__u64 lsr_end;
+	__u32 lsr_index;
+	__u32 lsr_flags;
+};
+
+#define LU_SEQ_RANGE_MDT	0x0
+#define LU_SEQ_RANGE_OST	0x1
+#define LU_SEQ_RANGE_ANY	0x3
+
+#define LU_SEQ_RANGE_MASK	0x3
+
+static inline unsigned fld_range_type(const struct lu_seq_range *range)
+{
+	return LU_SEQ_RANGE_MASK & range->lsr_flags; /* keep the type bits only */
+}
+
+static inline int fld_range_is_ost(const struct lu_seq_range *range)
+{
+	return (range->lsr_flags & LU_SEQ_RANGE_MASK) == LU_SEQ_RANGE_OST;
+}
+
+static inline int fld_range_is_mdt(const struct lu_seq_range *range)
+{
+	return (range->lsr_flags & LU_SEQ_RANGE_MASK) == LU_SEQ_RANGE_MDT;
+}
+
+/**
+ * The ANY range type is only used when the fld client sends a fld query
+ * request without knowing whether the seq belongs to an MDT or an OST; it
+ * sends the request with the ANY type, which means either seq type returned
+ * by the lookup can be expected.
+ */
+static inline unsigned fld_range_is_any(const struct lu_seq_range *range)
+{
+	return (range->lsr_flags & LU_SEQ_RANGE_MASK) == LU_SEQ_RANGE_ANY;
+}
+
+static inline void fld_range_set_type(struct lu_seq_range *range,
+				      unsigned flags)
+{
+	LASSERT(!(flags & ~LU_SEQ_RANGE_MASK));
+	range->lsr_flags = range->lsr_flags | flags; /* ORs in, never clears */
+}
+
+static inline void fld_range_set_mdt(struct lu_seq_range *range)
+{
+	fld_range_set_type(range, LU_SEQ_RANGE_MDT); /* ORs in 0: no-op */
+}
+
+static inline void fld_range_set_ost(struct lu_seq_range *range)
+{
+	fld_range_set_type(range, LU_SEQ_RANGE_OST);
+}
+
+static inline void fld_range_set_any(struct lu_seq_range *range)
+{
+	fld_range_set_type(range, LU_SEQ_RANGE_ANY);
+}
+
+/**
+ * returns width of given range \a range
+ */
+
+static inline __u64 range_space(const struct lu_seq_range *range)
+{
+	return range->lsr_end - range->lsr_start;
+}
+
+/**
+ * initialize range to zero
+ */
+
+static inline void range_init(struct lu_seq_range *range)
+{
+	memset(range, 0, sizeof(*range)); /* also resets lsr_flags */
+}
+
+/**
+ * check if given seq id \a s is within given range \a range
+ */
+
+static inline int range_within(const struct lu_seq_range *range,
+			       __u64 s)
+{
+	return !(s < range->lsr_start || s >= range->lsr_end); /* [start, end) */
+}
+
+static inline int range_is_sane(const struct lu_seq_range *range)
+{
+	return !(range->lsr_end < range->lsr_start); /* end must not precede start */
+}
+
+static inline int range_is_zero(const struct lu_seq_range *range)
+{
+	return !(range->lsr_start != 0 || range->lsr_end != 0);
+}
+
+/* exhausted means the range holds no sequences: start has reached end */
+static inline int range_is_exhausted(const struct lu_seq_range *range)
+{
+	return range->lsr_end == range->lsr_start;
+}
+
+/* return 0 if both ranges share index and flags, non-zero otherwise */
+static inline int range_compare_loc(const struct lu_seq_range *r1,
+				    const struct lu_seq_range *r2)
+{
+	return !(r1->lsr_index == r2->lsr_index &&
+		 r1->lsr_flags == r2->lsr_flags);
+}
+
+#define DRANGE "[%#16.16"LPF64"x-%#16.16"LPF64"x):%x:%s"
+
+#define PRANGE(range)		\
+	(range)->lsr_start,	\
+	(range)->lsr_end,	\
+	(range)->lsr_index,	\
+	fld_range_is_mdt(range) ? "mdt" : "ost"
+
+
+/** \defgroup lu_fid lu_fid
+ * @{ */
+
+/**
+ * Flags for lustre_mdt_attrs::lma_compat and lustre_mdt_attrs::lma_incompat.
+ * Deprecated since HSM and SOM attributes are now stored in separate on-disk
+ * xattr.
+ */
+enum lma_compat {
+	LMAC_HSM = 0x00000001,
+	LMAC_SOM = 0x00000002,
+};
+
+/**
+ * Masks for all features that should be supported by a Lustre version to
+ * access a specific file.
+ * This information is stored in lustre_mdt_attrs::lma_incompat.
+ */
+enum lma_incompat {
+	LMAI_RELEASED = 0x0000001, /* file is released */
+	LMAI_AGENT = 0x00000002, /* agent inode */
+	LMAI_REMOTE_PARENT = 0x00000004, /* the parent of the object
+					    is on the remote MDT */
+};
+#define LMA_INCOMPAT_SUPP	(LMAI_AGENT | LMAI_REMOTE_PARENT)
+
+extern void lustre_lma_swab(struct lustre_mdt_attrs *lma);
+extern void lustre_lma_init(struct lustre_mdt_attrs *lma,
+			    const struct lu_fid *fid, __u32 incompat);
+/**
+ * SOM on-disk attributes stored in a separate xattr.
+ */
+struct som_attrs {
+	/** Bitfield for supported data in this structure. For future use. */
+	__u32	som_compat;
+
+	/** Incompat feature list. The supported feature mask is available in
+	 * SOM_INCOMPAT_SUPP */
+	__u32	som_incompat;
+
+	/** IO Epoch SOM attributes belongs to */
+	__u64	som_ioepoch;
+	/** total file size in objects */
+	__u64	som_size;
+	/** total fs blocks in objects */
+	__u64	som_blocks;
+	/** mds mount id the size is valid for */
+	__u64	som_mountid;
+};
+extern void lustre_som_swab(struct som_attrs *attrs);
+
+#define SOM_INCOMPAT_SUPP 0x0
+
+/**
+ * HSM on-disk attributes stored in a separate xattr.
+ */
+struct hsm_attrs {
+	/** Bitfield for supported data in this structure. For future use. */
+	__u32	hsm_compat;
+
+	/** HSM flags, see hsm_flags enum below */
+	__u32	hsm_flags;
+	/** backend archive id associated with the file */
+	__u64	hsm_arch_id;
+	/** version associated with the last archiving, if any */
+	__u64	hsm_arch_ver;
+};
+extern void lustre_hsm_swab(struct hsm_attrs *attrs);
+
+/**
+ * fid constants
+ */
+enum {
+	/** initial fid id value */
+	LUSTRE_FID_INIT_OID  = 1UL
+};
+
+/** returns fid object sequence */
+static inline __u64 fid_seq(const struct lu_fid *fid)
+{
+	return fid->f_seq;
+}
+
+/** returns fid object id */
+static inline __u32 fid_oid(const struct lu_fid *fid)
+{
+	return fid->f_oid;
+}
+
+/** returns fid object version */
+static inline __u32 fid_ver(const struct lu_fid *fid)
+{
+	return fid->f_ver;
+}
+
+static inline void fid_zero(struct lu_fid *fid)
+{
+	memset(fid, 0, sizeof(*fid));
+}
+
+static inline obd_id fid_ver_oid(const struct lu_fid *fid)
+{
+	return ((__u64)fid->f_ver << 32) | fid->f_oid; /* ver high, oid low */
+}
+
+/**
+ * Note that reserved SEQ numbers below 12 will not conflict with ldiskfs
+ * inodes in the IGIF namespace, so these reserved SEQ numbers can be
+ * used for other purposes and not risk collisions with existing inodes.
+ *
+ * Different FID Format
+ * http://arch.lustre.org/index.php?title=Interoperability_fids_zfs#NEW.0
+ */
+enum fid_seq {
+	FID_SEQ_OST_MDT0	= 0,
+	FID_SEQ_LLOG		= 1, /* unnamed llogs */
+	FID_SEQ_ECHO		= 2,
+	FID_SEQ_OST_MDT1	= 3,
+	FID_SEQ_OST_MAX		= 9, /* Max MDT count before OST_on_FID */
+	FID_SEQ_LLOG_NAME	= 10, /* named llogs */
+	FID_SEQ_RSVD		= 11,
+	FID_SEQ_IGIF		= 12,
+	FID_SEQ_IGIF_MAX	= 0x0ffffffffULL,
+	FID_SEQ_IDIF		= 0x100000000ULL,
+	FID_SEQ_IDIF_MAX	= 0x1ffffffffULL,
+	/* Normal FID sequence starts from this value, i.e. 1<<33 */
+	FID_SEQ_START		= 0x200000000ULL,
+	/* sequence for local pre-defined FIDs listed in local_oid */
+	FID_SEQ_LOCAL_FILE	= 0x200000001ULL,
+	FID_SEQ_DOT_LUSTRE	= 0x200000002ULL,
+	/* sequence is used for local named objects FIDs generated
+	 * by local_object_storage library */
+	FID_SEQ_LOCAL_NAME	= 0x200000003ULL,
+	/* Because current FLD will only cache the fid sequence, instead
+	 * of oid on the client side, if the FID needs to be exposed to
+	 * clients sides, it needs to make sure all of fids under one
+	 * sequence will be located in one MDT. */
+	FID_SEQ_SPECIAL		= 0x200000004ULL,
+	FID_SEQ_QUOTA		= 0x200000005ULL,
+	FID_SEQ_QUOTA_GLB	= 0x200000006ULL,
+	FID_SEQ_ROOT		= 0x200000007ULL,  /* Located on MDT0 */
+	FID_SEQ_NORMAL		= 0x200000400ULL,
+	FID_SEQ_LOV_DEFAULT	= 0xffffffffffffffffULL
+};
+
+#define OBIF_OID_MAX_BITS	   32
+#define OBIF_MAX_OID		(1ULL << OBIF_OID_MAX_BITS)
+#define OBIF_OID_MASK	       ((1ULL << OBIF_OID_MAX_BITS) - 1)
+#define IDIF_OID_MAX_BITS	   48
+#define IDIF_MAX_OID		(1ULL << IDIF_OID_MAX_BITS)
+#define IDIF_OID_MASK	       ((1ULL << IDIF_OID_MAX_BITS) - 1)
+
+/** OID for FID_SEQ_SPECIAL */
+enum special_oid {
+	/* Big Filesystem Lock to serialize rename operations */
+	FID_OID_SPECIAL_BFL     = 1UL,
+};
+
+/** OID for FID_SEQ_DOT_LUSTRE */
+enum dot_lustre_oid {
+	FID_OID_DOT_LUSTRE  = 1UL,
+	FID_OID_DOT_LUSTRE_OBF = 2UL,
+};
+
+static inline int fid_seq_is_mdt0(obd_seq seq)
+{
+	return (seq == FID_SEQ_OST_MDT0);
+}
+
+static inline int fid_seq_is_mdt(const __u64 seq)
+{
+	return seq == FID_SEQ_OST_MDT0 || seq >= FID_SEQ_NORMAL;
+}
+
+static inline int fid_seq_is_echo(obd_seq seq)
+{
+	return (seq == FID_SEQ_ECHO);
+}
+
+static inline int fid_is_echo(const struct lu_fid *fid)
+{
+	return fid_seq_is_echo(fid_seq(fid));
+}
+
+static inline int fid_seq_is_llog(obd_seq seq)
+{
+	return (seq == FID_SEQ_LLOG);
+}
+
+static inline int fid_is_llog(const struct lu_fid *fid)
+{
+	/* OID 1 in the llog sequence holds the last oid; it is not a llog */
+	return fid_oid(fid) > 1 && fid_seq_is_llog(fid_seq(fid));
+}
+
+static inline int fid_seq_is_rsvd(const __u64 seq)
+{
+	return (seq > FID_SEQ_OST_MDT0 && seq <= FID_SEQ_RSVD);
+}
+
+static inline int fid_seq_is_special(const __u64 seq)
+{
+	return seq == FID_SEQ_SPECIAL;
+}
+
+static inline int fid_seq_is_local_file(const __u64 seq)
+{
+	return seq == FID_SEQ_LOCAL_FILE ||
+	       seq == FID_SEQ_LOCAL_NAME;
+}
+
+static inline int fid_seq_is_root(const __u64 seq)
+{
+	return seq == FID_SEQ_ROOT;
+}
+
+static inline int fid_seq_is_dot(const __u64 seq)
+{
+	return seq == FID_SEQ_DOT_LUSTRE;
+}
+
+static inline int fid_seq_is_default(const __u64 seq)
+{
+	return seq == FID_SEQ_LOV_DEFAULT;
+}
+
+static inline int fid_is_mdt0(const struct lu_fid *fid)
+{
+	return fid_seq_is_mdt0(fid_seq(fid));
+}
+
+static inline void lu_root_fid(struct lu_fid *fid)
+{
+	fid->f_ver = 0;
+	fid->f_oid = 1;
+	fid->f_seq = FID_SEQ_ROOT;	/* root FID is [FID_SEQ_ROOT:1:0] */
+}
+
+/**
+ * Check if a sequence number lies in the IGIF range or not.
+ * \param seq the sequence number to be tested.
+ * \return true if \a seq is an IGIF sequence; otherwise false.
+ */
+static inline int fid_seq_is_igif(const __u64 seq)
+{
+	return !(seq < FID_SEQ_IGIF || seq > FID_SEQ_IGIF_MAX);
+}
+
+static inline int fid_is_igif(const struct lu_fid *fid)
+{
+	return fid_seq_is_igif(fid_seq(fid));
+}
+
+/**
+ * Check if a sequence number lies in the IDIF range or not.
+ * \param seq the sequence number to be tested.
+ * \return true if \a seq is an IDIF sequence; otherwise false.
+ */
+static inline int fid_seq_is_idif(const __u64 seq)
+{
+	return !(seq < FID_SEQ_IDIF || seq > FID_SEQ_IDIF_MAX);
+}
+
+static inline int fid_is_idif(const struct lu_fid *fid)
+{
+	return fid_seq_is_idif(fid_seq(fid));	/* decided by sequence alone */
+}
+
+static inline int fid_is_local_file(const struct lu_fid *fid)
+{
+	return fid_seq_is_local_file(fid_seq(fid));	/* seq-only test */
+}
+
+static inline int fid_seq_is_norm(const __u64 seq)
+{
+	return seq >= FID_SEQ_NORMAL;	/* open-ended: no upper bound */
+}
+
+static inline int fid_is_norm(const struct lu_fid *fid)
+{
+	return fid_seq_is_norm(fid_seq(fid));	/* decided by sequence alone */
+}
+
+/* IDIF seq: FID_SEQ_IDIF | OST index << 16 | bits 32-47 of the objid */
+static inline obd_seq fid_idif_seq(obd_id id, __u32 ost_idx)
+{
+	return FID_SEQ_IDIF | ((id >> 32) & 0xffff) | (ost_idx << 16);
+}
+
+/* rebuild an OST objid from a packed IDIF: ver | low 16 seq bits | oid */
+static inline obd_id fid_idif_id(obd_seq seq, __u32 oid, __u32 ver)
+{
+	return oid | ((seq & 0xffff) << 32) | ((__u64)ver << 48);
+}
+
+/* extract ost index from IDIF FID: bits 16-31 of the sequence */
+static inline __u32 fid_idif_ost_idx(const struct lu_fid *fid)
+{
+	LASSERT(fid_is_idif(fid));
+	return (fid_seq(fid) >> 16) & 0xffff;
+}
+
+/* extract OST sequence (group) from a wire ost_id (id/seq) pair */
+static inline obd_seq ostid_seq(const struct ost_id *ostid)
+{
+	const __u64 wire_seq = ostid->oi.oi_seq;
+
+	if (fid_seq_is_mdt0(wire_seq))
+		return FID_SEQ_OST_MDT0;
+	if (fid_seq_is_default(wire_seq))
+		return FID_SEQ_LOV_DEFAULT;
+
+	/* FID form: an IDIF reports FID_SEQ_OST_MDT0, others their own seq */
+	return fid_is_idif(&ostid->oi_fid) ? FID_SEQ_OST_MDT0 :
+					     fid_seq(&ostid->oi_fid);
+}
+
+/* extract OST objid from a wire ost_id (id/seq) pair */
+static inline obd_id ostid_id(const struct ost_id *ostid)
+{
+	/* Test the raw oi_seq: ostid_seq() also reports IDIF FIDs as MDT0,
+	 * which would make the IDIF branch below unreachable. */
+	if (fid_seq_is_mdt0(ostid->oi.oi_seq))
+		return ostid->oi.oi_id & IDIF_OID_MASK;
+	if (fid_is_idif(&ostid->oi_fid))
+		return fid_idif_id(fid_seq(&ostid->oi_fid),
+				   fid_oid(&ostid->oi_fid), 0);
+	return fid_oid(&ostid->oi_fid);
+}
+
+static inline void ostid_set_seq(struct ost_id *oi, __u64 seq)
+{
+	/* MDT0 and default sequences are stored in the id/seq pair form */
+	if (fid_seq_is_mdt0(seq) || fid_seq_is_default(seq)) {
+		oi->oi.oi_seq = seq;
+		return;
+	}
+	oi->oi_fid.f_seq = seq;
+	/* f_oid and f_ver must not both stay zero: ostid_seq() would then
+	 * read the entry as an old-style ostid (oi_seq == 0), so seed f_oid */
+	if (oi->oi_fid.f_oid == 0 && oi->oi_fid.f_ver == 0)
+		oi->oi_fid.f_oid = LUSTRE_FID_INIT_OID;
+}
+
+static inline void ostid_set_seq_mdt0(struct ost_id *oi)
+{
+	ostid_set_seq(oi, FID_SEQ_OST_MDT0);	/* fixed-seq shorthand */
+}
+
+static inline void ostid_set_seq_echo(struct ost_id *oi)
+{
+	ostid_set_seq(oi, FID_SEQ_ECHO);	/* fixed-seq shorthand */
+}
+
+static inline void ostid_set_seq_llog(struct ost_id *oi)
+{
+	ostid_set_seq(oi, FID_SEQ_LLOG);	/* fixed-seq shorthand */
+}
+
+/**
+ * Store \a oid in \a oi.  The sequence already held by \a oi selects the
+ * field, so set it first.  An out-of-range \a oid is logged and ignored.
+ */
+static inline void ostid_set_id(struct ost_id *oi, __u64 oid)
+{
+	if (!fid_seq_is_mdt0(ostid_seq(oi))) {
+		if (oid > OBIF_MAX_OID) {
+			CERROR("Bad "LPU64" to set "DOSTID"\n",
+				oid, POSTID(oi));
+			return;
+		}
+		oi->oi_fid.f_oid = oid;
+		return;
+	}
+	if (oid >= IDIF_MAX_OID) {
+		CERROR("Bad "LPU64" to set "DOSTID"\n",
+			oid, POSTID(oi));
+		return;
+	}
+	oi->oi.oi_id = oid;
+}
+
+static inline void ostid_inc_id(struct ost_id *oi)
+{
+	if (!fid_seq_is_mdt0(ostid_seq(oi))) {
+		oi->oi_fid.f_oid++;	/* FID form: no range check here */
+		return;
+	}
+	if (unlikely(ostid_id(oi) + 1 > IDIF_MAX_OID)) {
+		CERROR("Bad inc "DOSTID"\n", POSTID(oi));
+		return;
+	}
+	oi->oi.oi_id++;	/* id/seq pair form */
+}
+
+static inline void ostid_dec_id(struct ost_id *oi)
+{
+	if (fid_seq_is_mdt0(ostid_seq(oi)))
+		oi->oi.oi_id--;		/* no underflow check */
+	else
+		oi->oi_fid.f_oid--;	/* no underflow check */
+}
+
+/**
+ * Unpack an OST object id/seq (group) into a FID.  This is needed for
+ * converting all obdo, lmm, lsm, etc. 64-bit id/seq pairs into proper
+ * FIDs.  An id/seq already in FID format is copied unchanged; legacy OST
+ * objects in "group 0" are mapped into the IDIF namespace so that they
+ * fit into the struct lu_fid fields without loss.  For reference see:
+ * http://arch.lustre.org/index.php?title=Interoperability_fids_zfs
+ * \return 0, or -EBADF for ost_idx > 0xffff or an out-of-range id/version.
+ */
+static inline int ostid_to_fid(struct lu_fid *fid, struct ost_id *ostid,
+			       __u32 ost_idx)
+{
+	if (ost_idx > 0xffff) {
+		CERROR("bad ost_idx, "DOSTID" ost_idx:%u\n", POSTID(ostid),
+		       ost_idx);
+		return -EBADF;
+	}
+
+	if (fid_seq_is_mdt0(ostid_seq(ostid))) {
+		/* A "legacy" (old 1.x/2.early) OST object in "group 0" that
+		 * we map into the IDIF namespace: up to 2^48 objects per OST,
+		 * enough for 1M creates/s/OST for 9 years. */
+		obd_id oid = ostid_id(ostid);
+
+		if (oid >= IDIF_MAX_OID) {
+			CERROR("bad MDT0 id, "DOSTID" ost_idx:%u\n",
+			       POSTID(ostid), ost_idx);
+			return -EBADF;
+		}
+		fid->f_seq = fid_idif_seq(oid, ost_idx);
+		/* truncate to 32 bits by assignment */
+		fid->f_oid = oid;
+		/* in theory, not currently used */
+		fid->f_ver = oid >> 48;
+	} else /* if (fid_seq_is_idif(seq) || fid_seq_is_norm(seq)) */ {
+		/* A FID is passed through without conversion.
+		 * NOTE(review): ostid_seq() reports IDIF FIDs as MDT0, so
+		 * despite the condition sketched above they appear to take
+		 * the first branch -- confirm. */
+		if (ostid->oi_fid.f_ver != 0) {
+			CERROR("bad FID version, "DOSTID" ost_idx:%u\n",
+				POSTID(ostid), ost_idx);
+			return -EBADF;
+		}
+		*fid = ostid->oi_fid;
+	}
+
+	return 0;
+}
+
+/* pack any OST FID into an ostid (id/seq) for the wire/disk */
+static inline int fid_to_ostid(const struct lu_fid *fid, struct ost_id *ostid)
+{
+	if (unlikely(fid_seq_is_igif(fid->f_seq))) {
+		CERROR("bad IGIF, "DFID"\n", PFID(fid));
+		return -EBADF;
+	}
+
+	/* anything that is not an IDIF is stored verbatim in FID form */
+	if (!fid_is_idif(fid)) {
+		ostid->oi_fid = *fid;
+		return 0;
+	}
+	ostid_set_seq_mdt0(ostid);	/* ostid_set_id() needs seq set */
+	ostid_set_id(ostid, fid_idif_id(fid_seq(fid), fid_oid(fid),
+					fid_ver(fid)));
+	return 0;
+}
+
+/* A LAST_ID fid has oid 0 within an IDIF, normal or echo sequence */
+static inline int fid_is_last_id(const struct lu_fid *fid)
+{
+	return fid_oid(fid) == 0 &&
+	       (fid_is_idif(fid) || fid_is_norm(fid) || fid_is_echo(fid));
+}
+
+/**
+ * Get inode number from a igif.
+ * \param fid a igif to get inode number from.
+ * \return the fid's sequence, converted to ino_t, as the inode number.
+ */
+static inline ino_t lu_igif_ino(const struct lu_fid *fid)
+{
+	return fid_seq(fid);
+}
+
+extern void lustre_swab_ost_id(struct ost_id *oid);
+
+/**
+ * Get inode generation from a igif.
+ * \param fid a igif to get inode generation from.
+ * \return the fid's object id (fid_oid()) as the inode generation.
+ */
+static inline __u32 lu_igif_gen(const struct lu_fid *fid)
+{
+	return fid_oid(fid);
+}
+
+/**
+ * Build igif from the inode number/generation: seq = ino, oid = gen, ver = 0.
+ */
+static inline void lu_igif_build(struct lu_fid *fid, __u32 ino, __u32 gen)
+{
+	fid->f_seq = ino;
+	fid->f_oid = gen;
+	fid->f_ver = 0;
+}
+
+/*
+ * Fids are transmitted across network (in the sender byte-ordering),
+ * and stored on disk in big-endian order.
+ */
+static inline void fid_cpu_to_le(struct lu_fid *dst, const struct lu_fid *src)
+{
+	/* compile-time guard: the three fields below cover the whole struct */
+	CLASSERT(sizeof(*src) ==
+		 sizeof(fid_seq(src)) +
+		 sizeof(fid_oid(src)) + sizeof(fid_ver(src)));
+	dst->f_seq = cpu_to_le64(fid_seq(src));
+	dst->f_oid = cpu_to_le32(fid_oid(src));
+	dst->f_ver = cpu_to_le32(fid_ver(src));
+}
+
+static inline void fid_le_to_cpu(struct lu_fid *dst, const struct lu_fid *src)
+{
+	/* compile-time guard: the three fields below cover the whole struct */
+	CLASSERT(sizeof(*src) ==
+		 sizeof(fid_seq(src)) +
+		 sizeof(fid_oid(src)) + sizeof(fid_ver(src)));
+	dst->f_seq = le64_to_cpu(fid_seq(src));
+	dst->f_oid = le32_to_cpu(fid_oid(src));
+	dst->f_ver = le32_to_cpu(fid_ver(src));
+}
+
+static inline void fid_cpu_to_be(struct lu_fid *dst, const struct lu_fid *src)
+{
+	/* compile-time guard: the three fields below cover the whole struct */
+	CLASSERT(sizeof(*src) ==
+		 sizeof(fid_seq(src)) +
+		 sizeof(fid_oid(src)) + sizeof(fid_ver(src)));
+	dst->f_seq = cpu_to_be64(fid_seq(src));
+	dst->f_oid = cpu_to_be32(fid_oid(src));
+	dst->f_ver = cpu_to_be32(fid_ver(src));
+}
+
+static inline void fid_be_to_cpu(struct lu_fid *dst, const struct lu_fid *src)
+{
+	/* compile-time guard: the three fields below cover the whole struct */
+	CLASSERT(sizeof(*src) ==
+		 sizeof(fid_seq(src)) +
+		 sizeof(fid_oid(src)) + sizeof(fid_ver(src)));
+	dst->f_seq = be64_to_cpu(fid_seq(src));
+	dst->f_oid = be32_to_cpu(fid_oid(src));
+	dst->f_ver = be32_to_cpu(fid_ver(src));
+}
+
+static inline int fid_is_sane(const struct lu_fid *fid)
+{
+	return fid != NULL &&	/* NULL is never sane */
+	       ((fid_seq(fid) >= FID_SEQ_START && fid_ver(fid) == 0) ||
+		fid_is_igif(fid) || fid_is_idif(fid) ||
+		fid_seq_is_rsvd(fid_seq(fid)));
+}
+
+static inline int fid_is_zero(const struct lu_fid *fid)
+{
+	return fid_seq(fid) == 0 && fid_oid(fid) == 0;	/* version ignored */
+}
+
+extern void lustre_swab_lu_fid(struct lu_fid *fid);
+extern void lustre_swab_lu_seq_range(struct lu_seq_range *range);
+
+static inline int lu_fid_eq(const struct lu_fid *f0, const struct lu_fid *f1)
+{
+	/* memcmp is only valid because the struct has no alignment padding */
+	CLASSERT(sizeof(*f0) == sizeof(f0->f_seq) + sizeof(f0->f_oid) +
+				sizeof(f0->f_ver));
+	return !memcmp(f0, f1, sizeof(*f0));
+}
+
+#define __diff_normalize(val0, val1)					\
+({									\
+	typeof(val0) __lhs = (val0);					\
+	typeof(val1) __rhs = (val1);					\
+	/* each argument is evaluated once; yields 0, +1 or -1 */	\
+	(__lhs == __rhs ? 0 : __lhs > __rhs ? +1 : -1);		\
+})
+
+static inline int lu_fid_cmp(const struct lu_fid *f0,
+			     const struct lu_fid *f1)
+{
+	return	/* first non-zero of the seq, then oid, then ver comparison */
+		__diff_normalize(fid_seq(f0), fid_seq(f1)) ?:
+		__diff_normalize(fid_oid(f0), fid_oid(f1)) ?:
+		__diff_normalize(fid_ver(f0), fid_ver(f1));
+}
+
+static inline void ostid_cpu_to_le(const struct ost_id *src_oi,
+				   struct ost_id *dst_oi)
+{
+	/* NB: source comes first here, unlike fid_cpu_to_le(dst, src) */
+	if (fid_seq_is_mdt0(ostid_seq(src_oi))) {
+		dst_oi->oi.oi_id = cpu_to_le64(src_oi->oi.oi_id);
+		dst_oi->oi.oi_seq = cpu_to_le64(src_oi->oi.oi_seq);
+	} else
+		fid_cpu_to_le(&dst_oi->oi_fid, &src_oi->oi_fid);
+}
+
+static inline void ostid_le_to_cpu(const struct ost_id *src_oi,
+				   struct ost_id *dst_oi)
+{
+	/* NOTE(review): the form is chosen from src before it is swabbed */
+	if (fid_seq_is_mdt0(ostid_seq(src_oi))) {
+		dst_oi->oi.oi_id = le64_to_cpu(src_oi->oi.oi_id);
+		dst_oi->oi.oi_seq = le64_to_cpu(src_oi->oi.oi_seq);
+	} else
+		fid_le_to_cpu(&dst_oi->oi_fid, &src_oi->oi_fid);
+}
+
+/** @} lu_fid */
+
+/** \defgroup lu_dir lu_dir
+ * @{ */
+
+/**
+ * Enumeration of possible directory entry attributes.
+ *
+ * Attributes follow directory entry header in the order they appear in this
+ * enumeration.
+ */
+enum lu_dirent_attrs {
+	LUDA_FID		= 0x0001,
+	LUDA_TYPE		= 0x0002,
+	LUDA_64BITHASH		= 0x0004,
+
+	/* The following attrs are used for MDT internal only,
+	 * not visible to client */
+
+	/* Verify the dirent consistency */
+	LUDA_VERIFY		= 0x8000,
+	/* Only check but not repair the dirent inconsistency */
+	LUDA_VERIFY_DRYRUN	= 0x4000,
+	/* The dirent has been repaired, or to be repaired (dryrun). */
+	LUDA_REPAIR		= 0x2000,
+	/* The system is upgraded, has been or to be repaired (dryrun). */
+	LUDA_UPGRADE		= 0x1000,
+	/* Ignore this record, go to next directly. */
+	LUDA_IGNORE		= 0x0800,
+};
+
+#define LU_DIRENT_ATTRS_MASK	0xf800	/* OR of the five MDT-internal bits */
+
+/**
+ * Layout of readdir pages, as transmitted on wire.
+ */
+struct lu_dirent {
+	/** valid if LUDA_FID is set. */
+	struct lu_fid lde_fid;
+	/** a unique entry identifier: a hash or an offset. */
+	__u64	 lde_hash;
+	/** total record length, including all attributes. */
+	__u16	 lde_reclen;
+	/** name length */
+	__u16	 lde_namelen;
+	/** optional variable size attributes following this entry.
+	 *  taken from enum lu_dirent_attrs.
+	 */
+	__u32	 lde_attrs;
+	/** name is followed by the attributes indicated in ->lde_attrs, in
+	 *  their natural order. After the last attribute, padding bytes are
+	 *  added to make ->lde_reclen a multiple of 8.
+	 */
+	char	  lde_name[0];
+};
+
+/*
+ * Definitions of optional directory entry attributes formats.
+ *
+ * Individual attributes do not have their length encoded in a generic way. It
+ * is assumed that consumer of an attribute knows its format. This means that
+ * it is impossible to skip over an unknown attribute, except by skipping over all
+ * remaining attributes (by using ->lde_reclen), which is not too
+ * constraining, because new server versions will append new attributes at
+ * the end of an entry.
+ */
+
+/**
+ * Fid directory attribute: a fid of an object referenced by the entry. This
+ * will be almost always requested by the client and supplied by the server.
+ *
+ * Aligned to 8 bytes.
+ */
+/* To have compatibility with 1.8, lets have fid in lu_dirent struct. */
+
+/**
+ * File type attribute, sized into the entry when LUDA_TYPE is set.
+ *
+ * Aligned to 2 bytes (see lu_dirent_calc_size()).
+ */
+struct luda_type {
+	__u16 lt_type;
+};
+
+struct lu_dirpage {
+	__u64	    ldp_hash_start;
+	__u64	    ldp_hash_end;
+	__u32	    ldp_flags;	/* enum lu_dirpage_flags bits */
+	__u32	    ldp_pad0;
+	struct lu_dirent ldp_entries[0];	/* see lu_dirent_start() */
+};
+
+enum lu_dirpage_flags {
+	/**
+	 * dirpage contains no entry.
+	 */
+	LDF_EMPTY   = 1 << 0,
+	/**
+	 * last entry's lde_hash equals ldp_hash_end.
+	 */
+	LDF_COLLIDE = 1 << 1
+};
+
+static inline struct lu_dirent *lu_dirent_start(struct lu_dirpage *dp)
+{
+	/* ldp_flags is stored little-endian; an empty page has no entry */
+	if (le32_to_cpu(dp->ldp_flags) & LDF_EMPTY)
+		return NULL;
+	return dp->ldp_entries;
+}
+
+static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent)
+{
+	const __u16 reclen = le16_to_cpu(ent->lde_reclen);
+
+	/* a zero record length means there is no next entry */
+	if (reclen == 0)
+		return NULL;
+
+	/* otherwise the next entry starts reclen bytes after this one */
+	return (void *)ent + reclen;
+}
+
+static inline int lu_dirent_calc_size(int namelen, __u16 attr)
+{
+	/* fixed header plus the name */
+	int size = sizeof(struct lu_dirent) + namelen;
+
+	if (attr & LUDA_TYPE) {
+		/* round up to luda_type alignment, then append the attribute */
+		const unsigned align = sizeof(struct luda_type) - 1;
+
+		size = ((size + align) & ~align) + sizeof(struct luda_type);
+	}
+	return (size + 7) & ~7;	/* pad the record to a multiple of 8 */
+}
+
+static inline int lu_dirent_size(struct lu_dirent *ent)
+{
+	const __u16 reclen = le16_to_cpu(ent->lde_reclen);
+	/* a stored length wins; otherwise derive it from name and attrs */
+	return reclen ?:
+	       lu_dirent_calc_size(le16_to_cpu(ent->lde_namelen),
+				   le32_to_cpu(ent->lde_attrs));
+}
+
+#define MDS_DIR_END_OFF 0xfffffffffffffffeULL
+
+/**
+ * MDS_READPAGE page size
+ *
+ * This is the directory page size packed in MDS_READPAGE RPC.
+ * It's different than PAGE_CACHE_SIZE because the client needs to
+ * access the struct lu_dirpage header packed at the beginning of
+ * the "page", and without a fixed size there is no way to find where
+ * the lu_dirpage header is if client and server PAGE_CACHE_SIZE differ.
+ */
+#define LU_PAGE_SHIFT 12
+#define LU_PAGE_SIZE  (1UL << LU_PAGE_SHIFT)
+#define LU_PAGE_MASK  (~(LU_PAGE_SIZE - 1))
+
+#define LU_PAGE_COUNT (1 << (PAGE_CACHE_SHIFT - LU_PAGE_SHIFT))
+
+/** @} lu_dir */
+
+struct lustre_handle {
+	__u64 cookie;
+};
+#define DEAD_HANDLE_MAGIC 0xdeadbeefcafebabeULL
+
+static inline int lustre_handle_is_used(const struct lustre_handle *lh)
+{
+	return lh->cookie != 0ull;	/* a zero cookie means "not used" */
+}
+
+static inline int lustre_handle_equal(const struct lustre_handle *lh1,
+				      const struct lustre_handle *lh2)
+{
+	return lh1->cookie == lh2->cookie;	/* cookie is the sole field */
+}
+
+static inline void lustre_handle_copy(struct lustre_handle *tgt,
+				      const struct lustre_handle *src)
+{
+	tgt->cookie = src->cookie;	/* the cookie is the whole handle */
+}
+
+/* flags for lm_flags */
+#define MSGHDR_AT_SUPPORT	       0x1
+#define MSGHDR_CKSUM_INCOMPAT18	 0x2
+
+#define lustre_msg lustre_msg_v2
+/* we depend on this structure to be 8-byte aligned */
+/* this type is only endian-adjusted in lustre_unpack_msg() */
+struct lustre_msg_v2 {
+	__u32 lm_bufcount;
+	__u32 lm_secflvr;
+	__u32 lm_magic;
+	__u32 lm_repsize;
+	__u32 lm_cksum;
+	__u32 lm_flags;
+	__u32 lm_padding_2;
+	__u32 lm_padding_3;
+	__u32 lm_buflens[0];
+};
+
+/* without gss, ptlrpc_body is put at the first buffer. */
+#define PTLRPC_NUM_VERSIONS     4
+#define JOBSTATS_JOBID_SIZE     32  /* 32 bytes string */
+struct ptlrpc_body_v3 {
+	struct lustre_handle pb_handle;
+	__u32 pb_type;
+	__u32 pb_version;
+	__u32 pb_opc;
+	__u32 pb_status;
+	__u64 pb_last_xid;
+	__u64 pb_last_seen;
+	__u64 pb_last_committed;
+	__u64 pb_transno;
+	__u32 pb_flags;
+	__u32 pb_op_flags;
+	__u32 pb_conn_cnt;
+	__u32 pb_timeout;  /* for req, the deadline, for rep, the service est */
+	__u32 pb_service_time; /* for rep, actual service time */
+	__u32 pb_limit;
+	__u64 pb_slv;
+	/* VBR: pre-versions */
+	__u64 pb_pre_versions[PTLRPC_NUM_VERSIONS];
+	/* padding for future needs */
+	__u64 pb_padding[4];
+	char  pb_jobid[JOBSTATS_JOBID_SIZE];
+};
+#define ptlrpc_body     ptlrpc_body_v3
+
+struct ptlrpc_body_v2 {
+	struct lustre_handle pb_handle;
+	__u32 pb_type;
+	__u32 pb_version;
+	__u32 pb_opc;
+	__u32 pb_status;
+	__u64 pb_last_xid;
+	__u64 pb_last_seen;
+	__u64 pb_last_committed;
+	__u64 pb_transno;
+	__u32 pb_flags;
+	__u32 pb_op_flags;
+	__u32 pb_conn_cnt;
+	__u32 pb_timeout;  /* for req, the deadline, for rep, the service est */
+	__u32 pb_service_time; /* for rep, actual service time, also used for
+				  net_latency of req */
+	__u32 pb_limit;
+	__u64 pb_slv;
+	/* VBR: pre-versions */
+	__u64 pb_pre_versions[PTLRPC_NUM_VERSIONS];
+	/* padding for future needs */
+	__u64 pb_padding[4];
+};
+
+extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
+
+/* message body offset for lustre_msg_v2 */
+/* ptlrpc body offset in all request/reply messages */
+#define MSG_PTLRPC_BODY_OFF	     0
+
+/* normal request/reply message record offset */
+#define REQ_REC_OFF		     1
+#define REPLY_REC_OFF		   1
+
+/* ldlm request message body offset */
+#define DLM_LOCKREQ_OFF		 1 /* lockreq offset */
+#define DLM_REQ_REC_OFF		 2 /* normal dlm request record offset */
+
+/* ldlm intent lock message body offset */
+#define DLM_INTENT_IT_OFF	       2 /* intent lock it offset */
+#define DLM_INTENT_REC_OFF	      3 /* intent lock record offset */
+
+/* ldlm reply message body offset */
+#define DLM_LOCKREPLY_OFF	       1 /* lockrep offset */
+#define DLM_REPLY_REC_OFF	       2 /* reply record offset */
+
+/** only use in req->rq_{req,rep}_swab_mask */
+#define MSG_PTLRPC_HEADER_OFF	   31
+
+/* Flags that are operation-specific go in the top 16 bits. */
+#define MSG_OP_FLAG_MASK   0xffff0000
+#define MSG_OP_FLAG_SHIFT  16
+
+/* Flags that apply to all requests are in the bottom 16 bits */
+#define MSG_GEN_FLAG_MASK     0x0000ffff
+#define MSG_LAST_REPLAY	   0x0001
+#define MSG_RESENT		0x0002
+#define MSG_REPLAY		0x0004
+/* #define MSG_AT_SUPPORT	 0x0008
+ * This was used in early prototypes of adaptive timeouts, and while there
+ * shouldn't be any users of that code there also isn't a need for using this
+ * bit. Defer usage until at least 1.10 to avoid potential conflict. */
+#define MSG_DELAY_REPLAY	  0x0010
+#define MSG_VERSION_REPLAY	0x0020
+#define MSG_REQ_REPLAY_DONE       0x0040
+#define MSG_LOCK_REPLAY_DONE      0x0080
+
+/*
+ * Flags for all connect opcodes (MDS_CONNECT, OST_CONNECT)
+ */
+
+#define MSG_CONNECT_RECOVERING  0x00000001
+#define MSG_CONNECT_RECONNECT   0x00000002
+#define MSG_CONNECT_REPLAYABLE  0x00000004
+//#define MSG_CONNECT_PEER	0x8
+#define MSG_CONNECT_LIBCLIENT   0x00000010
+#define MSG_CONNECT_INITIAL     0x00000020
+#define MSG_CONNECT_ASYNC       0x00000040
+#define MSG_CONNECT_NEXT_VER    0x00000080 /* use next version of lustre_msg */
+#define MSG_CONNECT_TRANSNO     0x00000100 /* report transno */
+
+/* Connect flags */
+#define OBD_CONNECT_RDONLY		0x1ULL /*client has read-only access*/
+#define OBD_CONNECT_INDEX		 0x2ULL /*connect specific LOV idx */
+#define OBD_CONNECT_MDS		   0x4ULL /*connect from MDT to OST */
+#define OBD_CONNECT_GRANT		 0x8ULL /*OSC gets grant at connect */
+#define OBD_CONNECT_SRVLOCK	      0x10ULL /*server takes locks for cli */
+#define OBD_CONNECT_VERSION	      0x20ULL /*Lustre versions in ocd */
+#define OBD_CONNECT_REQPORTAL	    0x40ULL /*Separate non-IO req portal */
+#define OBD_CONNECT_ACL		  0x80ULL /*access control lists */
+#define OBD_CONNECT_XATTR	       0x100ULL /*client use extended attr */
+#define OBD_CONNECT_CROW		0x200ULL /*MDS+OST create obj on write*/
+#define OBD_CONNECT_TRUNCLOCK	   0x400ULL /*locks on server for punch */
+#define OBD_CONNECT_TRANSNO	     0x800ULL /*replay sends init transno */
+#define OBD_CONNECT_IBITS	      0x1000ULL /*support for inodebits locks*/
+#define OBD_CONNECT_JOIN	       0x2000ULL /*files can be concatenated.
+						  *JOIN FILE is no longer
+						  *supported; the flag stays
+						  *reserved so that the bit
+						  *is never reused.*/
+#define OBD_CONNECT_ATTRFID	    0x4000ULL /*Server can GetAttr By Fid*/
+#define OBD_CONNECT_NODEVOH	    0x8000ULL /*No open hndl on specl nodes*/
+#define OBD_CONNECT_RMT_CLIENT	0x10000ULL /*Remote client */
+#define OBD_CONNECT_RMT_CLIENT_FORCE  0x20000ULL /*Remote client by force */
+#define OBD_CONNECT_BRW_SIZE	  0x40000ULL /*Max bytes per rpc */
+#define OBD_CONNECT_QUOTA64	   0x80000ULL /*Not used since 2.4 */
+#define OBD_CONNECT_MDS_CAPA	 0x100000ULL /*MDS capability */
+#define OBD_CONNECT_OSS_CAPA	 0x200000ULL /*OSS capability */
+#define OBD_CONNECT_CANCELSET	0x400000ULL /*Early batched cancels. */
+#define OBD_CONNECT_SOM	      0x800000ULL /*Size on MDS */
+#define OBD_CONNECT_AT	      0x1000000ULL /*client uses AT */
+#define OBD_CONNECT_LRU_RESIZE      0x2000000ULL /*LRU resize feature. */
+#define OBD_CONNECT_MDS_MDS	 0x4000000ULL /*MDS-MDS connection */
+#define OBD_CONNECT_REAL	    0x8000000ULL /*real connection */
+#define OBD_CONNECT_CHANGE_QS      0x10000000ULL /*Not used since 2.4 */
+#define OBD_CONNECT_CKSUM	  0x20000000ULL /*support several cksum algos*/
+#define OBD_CONNECT_FID	    0x40000000ULL /*FID is supported by server */
+#define OBD_CONNECT_VBR	    0x80000000ULL /*version based recovery */
+#define OBD_CONNECT_LOV_V3	0x100000000ULL /*client supports LOV v3 EA */
+#define OBD_CONNECT_GRANT_SHRINK  0x200000000ULL /* support grant shrink */
+#define OBD_CONNECT_SKIP_ORPHAN   0x400000000ULL /* don't reuse orphan objids */
+#define OBD_CONNECT_MAX_EASIZE    0x800000000ULL /* preserved for large EA */
+#define OBD_CONNECT_FULL20       0x1000000000ULL /* it is 2.0 client */
+#define OBD_CONNECT_LAYOUTLOCK   0x2000000000ULL /* client uses layout lock */
+#define OBD_CONNECT_64BITHASH    0x4000000000ULL /* client supports 64-bits
+						  * directory hash */
+#define OBD_CONNECT_MAXBYTES     0x8000000000ULL /* max stripe size */
+#define OBD_CONNECT_IMP_RECOV   0x10000000000ULL /* imp recovery support */
+#define OBD_CONNECT_JOBSTATS    0x20000000000ULL /* jobid in ptlrpc_body */
+#define OBD_CONNECT_UMASK       0x40000000000ULL /* create uses client umask */
+#define OBD_CONNECT_EINPROGRESS 0x80000000000ULL /* client handles -EINPROGRESS
+						  * RPC error properly */
+#define OBD_CONNECT_GRANT_PARAM 0x100000000000ULL/* extra grant params used for
+						  * finer space reservation */
+#define OBD_CONNECT_FLOCK_OWNER 0x200000000000ULL /* for the fixed 1.8
+						   * policy and 2.x server */
+#define OBD_CONNECT_LVB_TYPE	0x400000000000ULL /* variable type of LVB */
+#define OBD_CONNECT_NANOSEC_TIME 0x800000000000ULL /* nanosecond timestamps */
+#define OBD_CONNECT_LIGHTWEIGHT 0x1000000000000ULL/* lightweight connection */
+#define OBD_CONNECT_SHORTIO     0x2000000000000ULL/* short io */
+#define OBD_CONNECT_PINGLESS	0x4000000000000ULL/* pings not required */
+/* XXX README XXX:
+ * Please DO NOT add flag values here before first ensuring that this same
+ * flag value is not in use on some other branch.  Please clear any such
+ * changes with senior engineers before starting to use a new flag.  Then,
+ * submit a small patch against EVERY branch that ONLY adds the new flag,
+ * updates obd_connect_names[] for lprocfs_rd_connect_flags(), adds the
+ * flag to check_obd_connect_data(), and updates wiretests accordingly, so it
+ * can be approved and landed easily to reserve the flag for future use. */
+
+/* The MNE_SWAB flag is overloading the MDS_MDS bit only for the MGS
+ * connection.  It is a temporary bug fix for Imperative Recovery interop
+ * between 2.2 and 2.3 x86/ppc nodes, and can be removed when interop for
+ * 2.2 clients/servers is no longer needed.  LU-1252/LU-1644. */
+#define OBD_CONNECT_MNE_SWAB		 OBD_CONNECT_MDS_MDS
+
+#define OCD_HAS_FLAG(ocd, flg)  \
+	(!!((ocd)->ocd_connect_flags & OBD_CONNECT_##flg))
+
+
+#define LRU_RESIZE_CONNECT_FLAG OBD_CONNECT_LRU_RESIZE
+
+#define MDT_CONNECT_SUPPORTED  (OBD_CONNECT_RDONLY | OBD_CONNECT_VERSION | \
+				OBD_CONNECT_ACL | OBD_CONNECT_XATTR | \
+				OBD_CONNECT_IBITS | \
+				OBD_CONNECT_NODEVOH | OBD_CONNECT_ATTRFID | \
+				OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \
+				OBD_CONNECT_RMT_CLIENT | \
+				OBD_CONNECT_RMT_CLIENT_FORCE | \
+				OBD_CONNECT_BRW_SIZE | OBD_CONNECT_MDS_CAPA | \
+				OBD_CONNECT_OSS_CAPA | OBD_CONNECT_MDS_MDS | \
+				OBD_CONNECT_FID | LRU_RESIZE_CONNECT_FLAG | \
+				OBD_CONNECT_VBR | OBD_CONNECT_LOV_V3 | \
+				OBD_CONNECT_SOM | OBD_CONNECT_FULL20 | \
+				OBD_CONNECT_64BITHASH | OBD_CONNECT_JOBSTATS | \
+				OBD_CONNECT_EINPROGRESS | \
+				OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_UMASK | \
+				OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK |\
+				OBD_CONNECT_PINGLESS)
+#define OST_CONNECT_SUPPORTED  (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \
+				OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \
+				OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \
+				OBD_CONNECT_BRW_SIZE | OBD_CONNECT_OSS_CAPA | \
+				OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \
+				LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_CKSUM | \
+				OBD_CONNECT_RMT_CLIENT | \
+				OBD_CONNECT_RMT_CLIENT_FORCE | OBD_CONNECT_VBR | \
+				OBD_CONNECT_MDS | OBD_CONNECT_SKIP_ORPHAN | \
+				OBD_CONNECT_GRANT_SHRINK | OBD_CONNECT_FULL20 | \
+				OBD_CONNECT_64BITHASH | OBD_CONNECT_MAXBYTES | \
+				OBD_CONNECT_MAX_EASIZE | \
+				OBD_CONNECT_EINPROGRESS | \
+				OBD_CONNECT_JOBSTATS | \
+				OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_LVB_TYPE|\
+				OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_FID | \
+				OBD_CONNECT_PINGLESS)
+#define ECHO_CONNECT_SUPPORTED (0)
+#define MGS_CONNECT_SUPPORTED  (OBD_CONNECT_VERSION | OBD_CONNECT_AT | \
+				OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | \
+				OBD_CONNECT_MNE_SWAB | OBD_CONNECT_PINGLESS)
+
+/* Features required for this version of the client to work with server */
+#define CLIENT_CONNECT_MDT_REQD (OBD_CONNECT_IBITS | OBD_CONNECT_FID | \
+				 OBD_CONNECT_FULL20)
+
+#define OBD_OCD_VERSION(major, minor, patch, fix)			\
+	(((major) << 24) + ((minor) << 16) + ((patch) << 8) + (fix))
+#define OBD_OCD_VERSION_MAJOR(version)	((int)((version) >> 24) & 255)
+#define OBD_OCD_VERSION_MINOR(version)	((int)((version) >> 16) & 255)
+#define OBD_OCD_VERSION_PATCH(version)	((int)((version) >> 8) & 255)
+#define OBD_OCD_VERSION_FIX(version)	((int)(version) & 255)
+
+/* This structure is used for both request and reply.
+ *
+ * If we eventually have separate connect data for different types, which we
+ * almost certainly will, then perhaps we stick a union in here. */
+struct obd_connect_data_v1 {
+	__u64 ocd_connect_flags; /* OBD_CONNECT_* per above */
+	__u32 ocd_version;	 /* lustre release version number */
+	__u32 ocd_grant;	 /* initial cache grant amount (bytes) */
+	__u32 ocd_index;	 /* LOV index to connect to */
+	__u32 ocd_brw_size;	 /* Maximum BRW size in bytes, must be 2^n */
+	__u64 ocd_ibits_known;   /* inode bits this client understands */
+	__u8  ocd_blocksize;     /* log2 of the backend filesystem blocksize */
+	__u8  ocd_inodespace;    /* log2 of the per-inode space consumption */
+	__u16 ocd_grant_extent;  /* per-extent grant overhead, in 1K blocks */
+	__u32 ocd_unused;	/* also fix lustre_swab_connect */
+	__u64 ocd_transno;       /* first transno from client to be replayed */
+	__u32 ocd_group;	 /* MDS group on OST */
+	__u32 ocd_cksum_types;   /* supported checksum algorithms */
+	__u32 ocd_max_easize;    /* How big LOV EA can be on MDS */
+	__u32 ocd_instance;      /* also fix lustre_swab_connect */
+	__u64 ocd_maxbytes;      /* Maximum stripe size in bytes */
+};
+
+struct obd_connect_data {
+	__u64 ocd_connect_flags; /* OBD_CONNECT_* per above */
+	__u32 ocd_version;	 /* lustre release version number */
+	__u32 ocd_grant;	 /* initial cache grant amount (bytes) */
+	__u32 ocd_index;	 /* LOV index to connect to */
+	__u32 ocd_brw_size;	 /* Maximum BRW size in bytes */
+	__u64 ocd_ibits_known;   /* inode bits this client understands */
+	__u8  ocd_blocksize;     /* log2 of the backend filesystem blocksize */
+	__u8  ocd_inodespace;    /* log2 of the per-inode space consumption */
+	__u16 ocd_grant_extent;  /* per-extent grant overhead, in 1K blocks */
+	__u32 ocd_unused;	/* also fix lustre_swab_connect */
+	__u64 ocd_transno;       /* first transno from client to be replayed */
+	__u32 ocd_group;	 /* MDS group on OST */
+	__u32 ocd_cksum_types;   /* supported checksum algorithms */
+	__u32 ocd_max_easize;    /* How big LOV EA can be on MDS */
+	__u32 ocd_instance;      /* instance # of this target */
+	__u64 ocd_maxbytes;      /* Maximum stripe size in bytes */
+	/* Fields after ocd_maxbytes are only accessible by the receiver
+	 * if the corresponding flag in ocd_connect_flags is set. Accessing
+	 * any field after ocd_maxbytes on the receiver without a valid flag
+	 * may result in out-of-bound memory access and kernel oops. */
+	__u64 padding1;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 padding2;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 padding3;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 padding4;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 padding5;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 padding6;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 padding7;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 padding8;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 padding9;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 paddingA;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 paddingB;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 paddingC;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 paddingD;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 paddingE;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 paddingF;	  /* added 2.1.0. also fix lustre_swab_connect */
+};
+/* XXX README XXX:
+ * Please DO NOT use any fields here before first ensuring that this same
+ * field is not in use on some other branch.  Please clear any such changes
+ * with senior engineers before starting to use a new field.  Then, submit
+ * a small patch against EVERY branch that ONLY adds the new field along with
+ * the matching OBD_CONNECT flag, so that can be approved and landed easily to
+ * reserve the flag for future use. */
+
+
+extern void lustre_swab_connect(struct obd_connect_data *ocd);
+
+/*
+ * Supported checksum algorithms. Up to 32 checksum types are supported.
+ * (32-bit mask stored in obd_connect_data::ocd_cksum_types)
+ * Please update DECLARE_CKSUM_NAME/OBD_CKSUM_ALL in obd.h when adding a new
+ * algorithm and also the OBD_FL_CKSUM* flags.
+ */
+typedef enum {
+	OBD_CKSUM_CRC32  = 0x00000001,
+	OBD_CKSUM_ADLER  = 0x00000002,
+	OBD_CKSUM_CRC32C = 0x00000004,
+} cksum_type_t;
+
+/*
+ *   OST requests: OBDO & OBD request records
+ */
+
+/* opcodes */
+typedef enum {
+	OST_REPLY      =  0,       /* reply ? */
+	OST_GETATTR    =  1,
+	OST_SETATTR    =  2,
+	OST_READ       =  3,
+	OST_WRITE      =  4,
+	OST_CREATE     =  5,
+	OST_DESTROY    =  6,
+	OST_GET_INFO   =  7,
+	OST_CONNECT    =  8,
+	OST_DISCONNECT =  9,
+	OST_PUNCH      = 10,
+	OST_OPEN       = 11,
+	OST_CLOSE      = 12,
+	OST_STATFS     = 13,
+	OST_SYNC       = 16,
+	OST_SET_INFO   = 17,
+	OST_QUOTACHECK = 18,
+	OST_QUOTACTL   = 19,
+	OST_QUOTA_ADJUST_QUNIT = 20, /* not used since 2.4 */
+	OST_LAST_OPC
+} ost_cmd_t;
+#define OST_FIRST_OPC  OST_REPLY
+
+enum obdo_flags {
+	OBD_FL_INLINEDATA   = 0x00000001,
+	OBD_FL_OBDMDEXISTS  = 0x00000002,
+	OBD_FL_DELORPHAN    = 0x00000004, /* if set in o_flags delete orphans */
+	OBD_FL_NORPC	= 0x00000008, /* set in o_flags do in OSC not OST */
+	OBD_FL_IDONLY       = 0x00000010, /* set in o_flags only adjust obj id*/
+	OBD_FL_RECREATE_OBJS= 0x00000020, /* recreate missing obj */
+	OBD_FL_DEBUG_CHECK  = 0x00000040, /* echo client/server debug check */
+	OBD_FL_NO_USRQUOTA  = 0x00000100, /* the object's owner is over quota */
+	OBD_FL_NO_GRPQUOTA  = 0x00000200, /* the object's group is over quota */
+	OBD_FL_CREATE_CROW  = 0x00000400, /* object should be created on write*/
+	OBD_FL_SRVLOCK      = 0x00000800, /* delegate DLM locking to server */
+	OBD_FL_CKSUM_CRC32  = 0x00001000, /* CRC32 checksum type */
+	OBD_FL_CKSUM_ADLER  = 0x00002000, /* ADLER checksum type */
+	OBD_FL_CKSUM_CRC32C = 0x00004000, /* CRC32C checksum type */
+	OBD_FL_CKSUM_RSVD2  = 0x00008000, /* for future cksum types */
+	OBD_FL_CKSUM_RSVD3  = 0x00010000, /* for future cksum types */
+	OBD_FL_SHRINK_GRANT = 0x00020000, /* object shrink the grant */
+	OBD_FL_MMAP	 = 0x00040000, /* object is mmapped on the client.
+					   * XXX: obsoleted - reserved for old
+					   * clients prior to 2.2 */
+	OBD_FL_RECOV_RESEND = 0x00080000, /* recoverable resent */
+	OBD_FL_NOSPC_BLK    = 0x00100000, /* no more block space on OST */
+
+	/* Note that while these checksum values are currently separate bits,
+	 * in 2.x we can actually allow all values from 1-31 if we wanted. */
+	OBD_FL_CKSUM_ALL    = OBD_FL_CKSUM_CRC32 | OBD_FL_CKSUM_ADLER |
+			      OBD_FL_CKSUM_CRC32C,
+
+	/* mask for local-only flag, which won't be sent over network */
+	OBD_FL_LOCAL_MASK   = 0xF0000000,
+};
+
+#define LOV_MAGIC_V1      0x0BD10BD0
+#define LOV_MAGIC	 LOV_MAGIC_V1
+#define LOV_MAGIC_JOIN_V1 0x0BD20BD0
+#define LOV_MAGIC_V3      0x0BD30BD0
+
+/*
+ * magic for fully defined striping
+ * the idea is that we should have different magics for striping "hints"
+ * (struct lov_user_md_v[13]) and defined ready-to-use striping (struct
+ * lov_mds_md_v[13]). at the moment the magics are used in wire protocol,
+ * we can't just change it w/o long way preparation, but we still need a
+ * mechanism to allow LOD to differentiate hint versus ready striping.
+ * so, at the moment we do a trick: MDT knows what to expect from request
+ * depending on the case (replay uses ready striping, non-replay req uses
+ * hints), so MDT replaces magic with appropriate one and now LOD can
+ * easily understand what's inside -bzzz
+ */
+#define LOV_MAGIC_V1_DEF  0x0CD10BD0
+#define LOV_MAGIC_V3_DEF  0x0CD30BD0
+
+#define LOV_PATTERN_RAID0 0x001   /* stripes are used round-robin */
+#define LOV_PATTERN_RAID1 0x002   /* stripes are mirrors of each other */
+#define LOV_PATTERN_FIRST 0x100   /* first stripe is not in round-robin */
+#define LOV_PATTERN_CMOBD 0x200
+
+#define lov_ost_data lov_ost_data_v1
+struct lov_ost_data_v1 {	  /* per-stripe data structure (little-endian)*/
+	struct ost_id l_ost_oi;	  /* OST object ID */
+	__u32 l_ost_gen;	  /* generation of this l_ost_idx */
+	__u32 l_ost_idx;	  /* OST index in LOV (lov_tgt_desc->tgts) */
+};
+
+#define lov_mds_md lov_mds_md_v1
+struct lov_mds_md_v1 {	    /* LOV EA mds/wire data (little-endian) */
+	__u32 lmm_magic;	  /* magic number = LOV_MAGIC_V1 */
+	__u32 lmm_pattern;	/* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+	struct ost_id	lmm_oi;	  /* LOV object ID */
+	__u32 lmm_stripe_size;    /* size of stripe in bytes */
+	/* lmm_stripe_count used to be __u32 */
+	__u16 lmm_stripe_count;   /* num stripes in use for this object */
+	__u16 lmm_layout_gen;     /* layout generation number */
+	struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+};
+
+/**
+ * Sigh, because pre-2.4 uses
+ * struct lov_mds_md_v1 {
+ *	........
+ *	__u64 lmm_object_id;
+ *	__u64 lmm_object_seq;
+ *      ......
+ *      }
+ * to identify the LOV(MDT) object, and lmm_object_seq will
+ * be normal_fid, which makes it hard to combine these conversions
+ * with ostid-to-FID. So we will do lmm_oi/fid conversion separately.
+ *
+ * We can tell the lmm_oi by this way,
+ * 1.8: lmm_object_id = {inode}, lmm_object_gr = 0
+ * 2.1: lmm_object_id = {oid < 128k}, lmm_object_seq = FID_SEQ_NORMAL
+ * 2.4: lmm_oi.f_seq = FID_SEQ_NORMAL, lmm_oi.f_oid = {oid < 128k},
+ *      lmm_oi.f_ver = 0
+ *
+ * But currently lmm_oi/lsm_oi does not have any "real" usage
+ * except for printing some information, and the user can always
+ * get the real FID from LMA.  Besides, this multiple-case check might
+ * make swab more complicated, so we will keep using id/seq for lmm_oi.
+ */
+
+/* Store the OID and sequence of @fid as the id/seq pair of @oi, in that order. */
+static inline void fid_to_lmm_oi(const struct lu_fid *fid, struct ost_id *oi)
+{
+	oi->oi.oi_id = fid_oid(fid);
+	oi->oi.oi_seq = fid_seq(fid);
+}
+
+static inline void lmm_oi_set_seq(struct ost_id *oi, __u64 seq)
+{
+	oi->oi.oi_seq = seq;	/* only the sequence half; oi_id is not written */
+}
+
+static inline __u64 lmm_oi_id(const struct ost_id *oi)
+{
+	return oi->oi.oi_id;	/* read-only access, hence the const pointer */
+}
+
+static inline __u64 lmm_oi_seq(const struct ost_id *oi)
+{
+	return oi->oi.oi_seq;	/* read-only access, hence the const pointer */
+}
+
+static inline void lmm_oi_le_to_cpu(struct ost_id *dst_oi,
+				    const struct ost_id *src_oi)
+{
+	dst_oi->oi.oi_id = le64_to_cpu(src_oi->oi.oi_id);
+	dst_oi->oi.oi_seq = le64_to_cpu(src_oi->oi.oi_seq);
+}
+
+static inline void lmm_oi_cpu_to_le(struct ost_id *dst_oi,
+				    const struct ost_id *src_oi)
+{
+	dst_oi->oi.oi_id = cpu_to_le64(src_oi->oi.oi_id);
+	dst_oi->oi.oi_seq = cpu_to_le64(src_oi->oi.oi_seq);
+}
+
+/* extern void lustre_swab_lov_mds_md(struct lov_mds_md *llm); */
+
+#define MAX_MD_SIZE (sizeof(struct lov_mds_md) + 4 * sizeof(struct lov_ost_data))
+#define MIN_MD_SIZE (sizeof(struct lov_mds_md) + 1 * sizeof(struct lov_ost_data))
+
+#define XATTR_NAME_ACL_ACCESS   "system.posix_acl_access"
+#define XATTR_NAME_ACL_DEFAULT  "system.posix_acl_default"
+#define XATTR_USER_PREFIX       "user."
+#define XATTR_TRUSTED_PREFIX    "trusted."
+#define XATTR_SECURITY_PREFIX   "security."
+#define XATTR_LUSTRE_PREFIX     "lustre."
+
+#define XATTR_NAME_LOV	  "trusted.lov"
+#define XATTR_NAME_LMA	  "trusted.lma"
+#define XATTR_NAME_LMV	  "trusted.lmv"
+#define XATTR_NAME_LINK	 "trusted.link"
+#define XATTR_NAME_FID	  "trusted.fid"
+#define XATTR_NAME_VERSION      "trusted.version"
+#define XATTR_NAME_SOM		"trusted.som"
+#define XATTR_NAME_HSM		"trusted.hsm"
+#define XATTR_NAME_LFSCK_NAMESPACE "trusted.lfsck_namespace"
+
+struct lov_mds_md_v3 {	    /* LOV EA mds/wire data (little-endian) */
+	__u32 lmm_magic;	  /* magic number = LOV_MAGIC_V3 */
+	__u32 lmm_pattern;	/* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+	struct ost_id	lmm_oi;	  /* LOV object ID */
+	__u32 lmm_stripe_size;    /* size of stripe in bytes */
+	/* lmm_stripe_count used to be __u32 */
+	__u16 lmm_stripe_count;   /* num stripes in use for this object */
+	__u16 lmm_layout_gen;     /* layout generation number */
+	char  lmm_pool_name[LOV_MAXPOOLNAME]; /* must be 32bit aligned */
+	struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+};
+
+#define OBD_MD_FLID	(0x00000001ULL) /* object ID */
+#define OBD_MD_FLATIME     (0x00000002ULL) /* access time */
+#define OBD_MD_FLMTIME     (0x00000004ULL) /* data modification time */
+#define OBD_MD_FLCTIME     (0x00000008ULL) /* change time */
+#define OBD_MD_FLSIZE      (0x00000010ULL) /* size */
+#define OBD_MD_FLBLOCKS    (0x00000020ULL) /* allocated blocks count */
+#define OBD_MD_FLBLKSZ     (0x00000040ULL) /* block size */
+#define OBD_MD_FLMODE      (0x00000080ULL) /* access bits (mode & ~S_IFMT) */
+#define OBD_MD_FLTYPE      (0x00000100ULL) /* object type (mode & S_IFMT) */
+#define OBD_MD_FLUID       (0x00000200ULL) /* user ID */
+#define OBD_MD_FLGID       (0x00000400ULL) /* group ID */
+#define OBD_MD_FLFLAGS     (0x00000800ULL) /* flags word */
+#define OBD_MD_FLNLINK     (0x00002000ULL) /* link count */
+#define OBD_MD_FLGENER     (0x00004000ULL) /* generation number */
+/*#define OBD_MD_FLINLINE    (0x00008000ULL)  inline data. used until 1.6.5 */
+#define OBD_MD_FLRDEV      (0x00010000ULL) /* device number */
+#define OBD_MD_FLEASIZE    (0x00020000ULL) /* extended attribute data */
+#define OBD_MD_LINKNAME    (0x00040000ULL) /* symbolic link target */
+#define OBD_MD_FLHANDLE    (0x00080000ULL) /* file/lock handle */
+#define OBD_MD_FLCKSUM     (0x00100000ULL) /* bulk data checksum */
+#define OBD_MD_FLQOS       (0x00200000ULL) /* quality of service stats */
+/*#define OBD_MD_FLOSCOPQ    (0x00400000ULL) osc opaque data, never used */
+#define OBD_MD_FLCOOKIE    (0x00800000ULL) /* log cancellation cookie */
+#define OBD_MD_FLGROUP     (0x01000000ULL) /* group */
+#define OBD_MD_FLFID       (0x02000000ULL) /* ->ost write inline fid */
+#define OBD_MD_FLEPOCH     (0x04000000ULL) /* ->ost write with ioepoch */
+					   /* ->mds if epoch opens or closes */
+#define OBD_MD_FLGRANT     (0x08000000ULL) /* ost preallocation space grant */
+#define OBD_MD_FLDIREA     (0x10000000ULL) /* dir's extended attribute data */
+#define OBD_MD_FLUSRQUOTA  (0x20000000ULL) /* over quota flags sent from ost */
+#define OBD_MD_FLGRPQUOTA  (0x40000000ULL) /* over quota flags sent from ost */
+#define OBD_MD_FLMODEASIZE (0x80000000ULL) /* EA size will be changed */
+
+#define OBD_MD_MDS	 (0x0000000100000000ULL) /* where an inode lives on */
+#define OBD_MD_REINT       (0x0000000200000000ULL) /* reintegrate oa */
+#define OBD_MD_MEA	 (0x0000000400000000ULL) /* CMD split EA  */
+
+/* OBD_MD_MDTIDX is used to get MDT index, but it has never been used over the
+ * wire, and it has been obsolete since 2.3 */
+/* #define OBD_MD_MDTIDX      (0x0000000800000000ULL) */
+
+#define OBD_MD_FLXATTR       (0x0000001000000000ULL) /* xattr */
+#define OBD_MD_FLXATTRLS     (0x0000002000000000ULL) /* xattr list */
+#define OBD_MD_FLXATTRRM     (0x0000004000000000ULL) /* xattr remove */
+#define OBD_MD_FLACL	 (0x0000008000000000ULL) /* ACL */
+#define OBD_MD_FLRMTPERM     (0x0000010000000000ULL) /* remote permission */
+#define OBD_MD_FLMDSCAPA     (0x0000020000000000ULL) /* MDS capability */
+#define OBD_MD_FLOSSCAPA     (0x0000040000000000ULL) /* OSS capability */
+#define OBD_MD_FLCKSPLIT     (0x0000080000000000ULL) /* Check split on server */
+#define OBD_MD_FLCROSSREF    (0x0000100000000000ULL) /* Cross-ref case */
+#define OBD_MD_FLGETATTRLOCK (0x0000200000000000ULL) /* Get IOEpoch attributes
+						      * under lock */
+#define OBD_MD_FLOBJCOUNT    (0x0000400000000000ULL) /* for multiple destroy */
+
+#define OBD_MD_FLRMTLSETFACL (0x0001000000000000ULL) /* lfs lsetfacl case */
+#define OBD_MD_FLRMTLGETFACL (0x0002000000000000ULL) /* lfs lgetfacl case */
+#define OBD_MD_FLRMTRSETFACL (0x0004000000000000ULL) /* lfs rsetfacl case */
+#define OBD_MD_FLRMTRGETFACL (0x0008000000000000ULL) /* lfs rgetfacl case */
+
+#define OBD_MD_FLDATAVERSION (0x0010000000000000ULL) /* iversion sum */
+
+#define OBD_MD_FLGETATTR (OBD_MD_FLID    | OBD_MD_FLATIME | OBD_MD_FLMTIME | \
+			  OBD_MD_FLCTIME | OBD_MD_FLSIZE  | OBD_MD_FLBLKSZ | \
+			  OBD_MD_FLMODE  | OBD_MD_FLTYPE  | OBD_MD_FLUID   | \
+			  OBD_MD_FLGID   | OBD_MD_FLFLAGS | OBD_MD_FLNLINK | \
+			  OBD_MD_FLGENER | OBD_MD_FLRDEV  | OBD_MD_FLGROUP)
+
+/* don't forget obdo_fid which is way down at the bottom so it can
+ * come after the definition of llog_cookie */
+
+enum hss_valid {
+	HSS_SETMASK	= 0x01,
+	HSS_CLEARMASK	= 0x02,
+	HSS_ARCHIVE_ID	= 0x04,
+};
+
+struct hsm_state_set {
+	__u32	hss_valid;
+	__u32	hss_archive_id;
+	__u64	hss_setmask;
+	__u64	hss_clearmask;
+};
+
+extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus);
+extern void lustre_swab_hsm_state_set(struct hsm_state_set *hss);
+
+extern void lustre_swab_obd_statfs (struct obd_statfs *os);
+
+/* ost_body.data values for OST_BRW */
+
+#define OBD_BRW_READ	    0x01
+#define OBD_BRW_WRITE	   0x02
+#define OBD_BRW_RWMASK	  (OBD_BRW_READ | OBD_BRW_WRITE)
+#define OBD_BRW_SYNC	    0x08 /* this page is a part of synchronous
+				      * transfer and is not accounted in
+				      * the grant. */
+#define OBD_BRW_CHECK	   0x10
+#define OBD_BRW_FROM_GRANT      0x20 /* the osc manages this under llite */
+#define OBD_BRW_GRANTED	 0x40 /* the ost manages this */
+#define OBD_BRW_NOCACHE	 0x80 /* this page is a part of non-cached IO */
+#define OBD_BRW_NOQUOTA	0x100
+#define OBD_BRW_SRVLOCK	0x200 /* Client holds no lock over this page */
+#define OBD_BRW_ASYNC	  0x400 /* Server may delay commit to disk */
+#define OBD_BRW_MEMALLOC       0x800 /* Client runs in the "kswapd" context */
+#define OBD_BRW_OVER_USRQUOTA 0x1000 /* Running out of user quota */
+#define OBD_BRW_OVER_GRPQUOTA 0x2000 /* Running out of group quota */
+
+#define OBD_OBJECT_EOF 0xffffffffffffffffULL
+
+#define OST_MIN_PRECREATE 32
+#define OST_MAX_PRECREATE 20000
+
+struct obd_ioobj {
+	struct ost_id	ioo_oid;	/* object ID, if multi-obj BRW */
+	__u32		ioo_max_brw;	/* low 16 bits were o_mode before 2.4,
+					 * now (PTLRPC_BULK_OPS_COUNT - 1) in
+					 * high 16 bits in 2.4 and later */
+	__u32		ioo_bufcnt;	/* number of niobufs for this object */
+};
+
+#define IOOBJ_MAX_BRW_BITS	16
+#define IOOBJ_TYPE_MASK		((1U << IOOBJ_MAX_BRW_BITS) - 1)
+#define ioobj_max_brw_get(ioo)	(((ioo)->ioo_max_brw >> IOOBJ_MAX_BRW_BITS) + 1)
+#define ioobj_max_brw_set(ioo, num)					\
+do { (ioo)->ioo_max_brw = ((num) - 1) << IOOBJ_MAX_BRW_BITS; } while (0)
+
+extern void lustre_swab_obd_ioobj (struct obd_ioobj *ioo);
+
+/* size is a multiple of 8 bytes => can be used in an array */
+struct niobuf_remote {
+	__u64 offset;
+	__u32 len;
+	__u32 flags;
+};
+
+extern void lustre_swab_niobuf_remote (struct niobuf_remote *nbr);
+
+/* lock value block communicated between the filter and llite */
+
+/* An error rc (negative) is encoded as OST_LVB_ERR_INIT + rc: the extra bit 31
+ * keeps the sum inside the MASK range, since ((MASK + rc) & MASK) != MASK. */
+#define OST_LVB_ERR_INIT 0xffbadbad80000000ULL
+#define OST_LVB_ERR_MASK 0xffbadbad00000000ULL
+#define OST_LVB_IS_ERR(blocks)					  \
+	(((blocks) & OST_LVB_ERR_MASK) == OST_LVB_ERR_MASK)
+#define OST_LVB_SET_ERR(blocks, rc)				     \
+	do { (blocks) = OST_LVB_ERR_INIT + (rc); } while (0)
+#define OST_LVB_GET_ERR(blocks)    ((int)((blocks) - OST_LVB_ERR_INIT))
+
+struct ost_lvb_v1 {
+	__u64		lvb_size;
+	obd_time	lvb_mtime;
+	obd_time	lvb_atime;
+	obd_time	lvb_ctime;
+	__u64		lvb_blocks;
+};
+
+extern void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb);
+
+struct ost_lvb {
+	__u64		lvb_size;
+	obd_time	lvb_mtime;
+	obd_time	lvb_atime;
+	obd_time	lvb_ctime;
+	__u64		lvb_blocks;
+	__u32		lvb_mtime_ns;
+	__u32		lvb_atime_ns;
+	__u32		lvb_ctime_ns;
+	__u32		lvb_padding;
+};
+
+extern void lustre_swab_ost_lvb(struct ost_lvb *lvb);
+
+/*
+ *   lquota data structures
+ */
+
+#ifndef QUOTABLOCK_BITS
+#define QUOTABLOCK_BITS 10
+#endif
+
+#ifndef QUOTABLOCK_SIZE
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
+#endif
+
+#ifndef toqb
+#define toqb(x) (((x) + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS)
+#endif
+
+/* The lquota_id structure is a union of all the possible identifier types that
+ * can be used with quota. This includes:
+ * - 64-bit user ID
+ * - 64-bit group ID
+ * - a FID which can be used for per-directory quota in the future */
+union lquota_id {
+	struct lu_fid	qid_fid; /* FID for per-directory quota */
+	__u64		qid_uid; /* user identifier */
+	__u64		qid_gid; /* group identifier */
+};
+
+/* quotactl management */
+struct obd_quotactl {
+	__u32			qc_cmd;
+	__u32			qc_type; /* see Q_* flag below */
+	__u32			qc_id;
+	__u32			qc_stat;
+	struct obd_dqinfo	qc_dqinfo;
+	struct obd_dqblk	qc_dqblk;
+};
+
+extern void lustre_swab_obd_quotactl(struct obd_quotactl *q);
+
+#define Q_QUOTACHECK	0x800100 /* deprecated as of 2.4 */
+#define Q_INITQUOTA	0x800101 /* deprecated as of 2.4  */
+#define Q_GETOINFO	0x800102 /* get obd quota info */
+#define Q_GETOQUOTA	0x800103 /* get obd quotas */
+#define Q_FINVALIDATE	0x800104 /* deprecated as of 2.4 */
+
+#define Q_COPY(out, in, member) ((out)->member = (in)->member)
+/* QCTL_COPY: field-by-field copy of the qc_* members from (in) to (out) */
+#define QCTL_COPY(out, in)		\
+do {					\
+	Q_COPY(out, in, qc_cmd);	\
+	Q_COPY(out, in, qc_type);	\
+	Q_COPY(out, in, qc_id);		\
+	Q_COPY(out, in, qc_stat);	\
+	Q_COPY(out, in, qc_dqinfo);	\
+	Q_COPY(out, in, qc_dqblk);	\
+} while (0)
+
+/* Body of quota request used for quota acquire/release RPCs between quota
+ * master (aka QMT) and slaves (aka QSD). */
+struct quota_body {
+	struct lu_fid	qb_fid;     /* FID of global index packing the pool ID
+				      * and type (data or metadata) as well as
+				      * the quota type (user or group). */
+	union lquota_id	qb_id;      /* uid or gid or directory FID */
+	__u32		qb_flags;   /* see below */
+	__u32		qb_padding;
+	__u64		qb_count;   /* acquire/release count (kbytes/inodes) */
+	__u64		qb_usage;   /* current slave usage (kbytes/inodes) */
+	__u64		qb_slv_ver; /* slave index file version */
+	struct lustre_handle	qb_lockh;     /* per-ID lock handle */
+	struct lustre_handle	qb_glb_lockh; /* global lock handle */
+	__u64		qb_padding1[4];
+};
+
+/* When the quota_body is used in the reply of quota global intent
+ * lock (IT_QUOTA_CONN) reply, qb_fid contains slave index file FID. */
+#define qb_slv_fid	qb_fid
+/* qb_usage is the current qunit (in kbytes/inodes) when quota_body is used in
+ * quota reply */
+#define qb_qunit	qb_usage
+
+#define QUOTA_DQACQ_FL_ACQ	0x1  /* acquire quota */
+#define QUOTA_DQACQ_FL_PREACQ	0x2  /* pre-acquire */
+#define QUOTA_DQACQ_FL_REL	0x4  /* release quota */
+#define QUOTA_DQACQ_FL_REPORT	0x8  /* report usage */
+
+extern void lustre_swab_quota_body(struct quota_body *b);
+
+/* Quota types currently supported */
+enum {
+	LQUOTA_TYPE_USR	= 0x00, /* maps to USRQUOTA */
+	LQUOTA_TYPE_GRP	= 0x01, /* maps to GRPQUOTA */
+	LQUOTA_TYPE_MAX
+};
+
+/* There are 2 different resource types on which a quota limit can be enforced:
+ * - inodes on the MDTs
+ * - blocks on the OSTs */
+enum {
+	LQUOTA_RES_MD		= 0x01, /* skip 0 to avoid null oid in FID */
+	LQUOTA_RES_DT		= 0x02,
+	LQUOTA_LAST_RES,
+	LQUOTA_FIRST_RES	= LQUOTA_RES_MD
+};
+#define LQUOTA_NR_RES (LQUOTA_LAST_RES - LQUOTA_FIRST_RES + 1)
+
+/*
+ * Space accounting support
+ * Format of an accounting record, providing disk usage information for a given
+ * user or group
+ */
+struct lquota_acct_rec { /* 16 bytes */
+	__u64 bspace;  /* current space in use */
+	__u64 ispace;  /* current # inodes in use */
+};
+
+/*
+ * Global quota index support
+ * Format of a global record, providing global quota settings for a given quota
+ * identifier
+ */
+struct lquota_glb_rec { /* 32 bytes */
+	__u64 qbr_hardlimit; /* quota hard limit, in #inodes or kbytes */
+	__u64 qbr_softlimit; /* quota soft limit, in #inodes or kbytes */
+	__u64 qbr_time;      /* grace time, in seconds */
+	__u64 qbr_granted;   /* how much is granted to slaves, in #inodes or
+			      * kbytes */
+};
+
+/*
+ * Slave index support
+ * Format of a slave record, recording how much space is granted to a given
+ * slave
+ */
+struct lquota_slv_rec { /* 8 bytes */
+	__u64 qsr_granted; /* space granted to the slave for the key=ID,
+			    * in #inodes or kbytes */
+};
+
+/* Data structures associated with the quota locks */
+
+/* Glimpse descriptor used for the index & per-ID quota locks */
+struct ldlm_gl_lquota_desc {
+	union lquota_id	gl_id;    /* quota ID subject to the glimpse */
+	__u64		gl_flags; /* see LQUOTA_FL* below */
+	__u64		gl_ver;   /* new index version */
+	__u64		gl_hardlimit; /* new hardlimit or qunit value */
+	__u64		gl_softlimit; /* new softlimit */
+	__u64		gl_time;
+	__u64		gl_pad2;
+};
+#define gl_qunit	gl_hardlimit /* current qunit value used when
+				      * glimpsing per-ID quota locks */
+
+/* quota glimpse flags */
+#define LQUOTA_FL_EDQUOT 0x1 /* user/group out of quota space on QMT */
+
+/* LVB used with quota (global and per-ID) locks */
+struct lquota_lvb {
+	__u64	lvb_flags;	/* see LQUOTA_FL* above */
+	__u64	lvb_id_may_rel; /* space that might be released later */
+	__u64	lvb_id_rel;     /* space released by the slave for this ID */
+	__u64	lvb_id_qunit;   /* current qunit value */
+	__u64	lvb_pad1;
+};
+
+extern void lustre_swab_lquota_lvb(struct lquota_lvb *lvb);
+
+/* LVB used with global quota lock */
+#define lvb_glb_ver  lvb_id_may_rel /* current version of the global index */
+
+/* op codes */
+typedef enum {
+	QUOTA_DQACQ	= 601,
+	QUOTA_DQREL	= 602,
+	QUOTA_LAST_OPC
+} quota_cmd_t;
+#define QUOTA_FIRST_OPC	QUOTA_DQACQ
+
+/*
+ *   MDS REQ RECORDS
+ */
+
+/* opcodes */
+typedef enum {
+	MDS_GETATTR		= 33,
+	MDS_GETATTR_NAME	= 34,
+	MDS_CLOSE		= 35,
+	MDS_REINT		= 36,
+	MDS_READPAGE		= 37,
+	MDS_CONNECT		= 38,
+	MDS_DISCONNECT		= 39,
+	MDS_GETSTATUS		= 40,
+	MDS_STATFS		= 41,
+	MDS_PIN			= 42,
+	MDS_UNPIN		= 43,
+	MDS_SYNC		= 44,
+	MDS_DONE_WRITING	= 45,
+	MDS_SET_INFO		= 46,
+	MDS_QUOTACHECK		= 47,
+	MDS_QUOTACTL		= 48,
+	MDS_GETXATTR		= 49,
+	MDS_SETXATTR		= 50, /* obsolete, now it's MDS_REINT op */
+	MDS_WRITEPAGE		= 51,
+	MDS_IS_SUBDIR		= 52,
+	MDS_GET_INFO		= 53,
+	MDS_HSM_STATE_GET	= 54,
+	MDS_HSM_STATE_SET	= 55,
+	MDS_HSM_ACTION		= 56,
+	MDS_HSM_PROGRESS	= 57,
+	MDS_HSM_REQUEST		= 58,
+	MDS_HSM_CT_REGISTER	= 59,
+	MDS_HSM_CT_UNREGISTER	= 60,
+	MDS_SWAP_LAYOUTS	= 61,
+	MDS_LAST_OPC
+} mds_cmd_t;
+
+#define MDS_FIRST_OPC    MDS_GETATTR
+
+
+/* opcodes for object update */
+typedef enum {
+	UPDATE_OBJ	= 1000,
+	UPDATE_LAST_OPC
+} update_cmd_t;
+
+#define UPDATE_FIRST_OPC    UPDATE_OBJ
+
+/*
+ * Do not exceed 63
+ */
+
+typedef enum {
+	REINT_SETATTR  = 1,
+	REINT_CREATE   = 2,
+	REINT_LINK     = 3,
+	REINT_UNLINK   = 4,
+	REINT_RENAME   = 5,
+	REINT_OPEN     = 6,
+	REINT_SETXATTR = 7,
+	REINT_RMENTRY  = 8,
+/*	REINT_WRITE    = 9, */
+	REINT_MAX
+} mds_reint_t, mdt_reint_t;
+
+extern void lustre_swab_generic_32s (__u32 *val);
+
+/* the disposition of the intent outlines what was executed */
+#define DISP_IT_EXECD	0x00000001
+#define DISP_LOOKUP_EXECD    0x00000002
+#define DISP_LOOKUP_NEG      0x00000004
+#define DISP_LOOKUP_POS      0x00000008
+#define DISP_OPEN_CREATE     0x00000010
+#define DISP_OPEN_OPEN       0x00000020
+#define DISP_ENQ_COMPLETE    0x00400000
+#define DISP_ENQ_OPEN_REF    0x00800000
+#define DISP_ENQ_CREATE_REF  0x01000000
+#define DISP_OPEN_LOCK       0x02000000
+
+/* INODE LOCK PARTS */
+#define MDS_INODELOCK_LOOKUP 0x000001       /* dentry, mode, owner, group */
+#define MDS_INODELOCK_UPDATE 0x000002       /* size, links, timestamps */
+#define MDS_INODELOCK_OPEN   0x000004       /* For opened files */
+#define MDS_INODELOCK_LAYOUT 0x000008       /* for layout */
+#define MDS_INODELOCK_PERM   0x000010       /* for permission */
+
+#define MDS_INODELOCK_MAXSHIFT 4
+/* This FULL lock is useful to take on unlink sort of operations */
+#define MDS_INODELOCK_FULL ((1<<(MDS_INODELOCK_MAXSHIFT+1))-1)
+
+extern void lustre_swab_ll_fid (struct ll_fid *fid);
+
+/* NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2],
+ * but was moved into name[1] along with the OID to avoid consuming the
+ * name[2,3] fields that need to be used for the quota id (also a FID). */
+enum {
+	LUSTRE_RES_ID_SEQ_OFF = 0,
+	LUSTRE_RES_ID_VER_OID_OFF = 1,
+	LUSTRE_RES_ID_WAS_VER_OFF = 2, /* see note above */
+	LUSTRE_RES_ID_QUOTA_SEQ_OFF = 2,
+	LUSTRE_RES_ID_QUOTA_VER_OID_OFF = 3,
+	LUSTRE_RES_ID_HSH_OFF = 3
+};
+
+#define MDS_STATUS_CONN 1
+#define MDS_STATUS_LOV 2
+
+/* mdt_thread_info.mti_flags. */
+enum md_op_flags {
+	/* This flag indicates that Size-on-MDS attributes are changed. */
+	MF_SOM_CHANGE	   = (1 << 0),
+	/* These flags indicate that an epoch opens or closes. */
+	MF_EPOCH_OPEN	   = (1 << 1),
+	MF_EPOCH_CLOSE	  = (1 << 2),
+	MF_MDC_CANCEL_FID1      = (1 << 3),
+	MF_MDC_CANCEL_FID2      = (1 << 4),
+	MF_MDC_CANCEL_FID3      = (1 << 5),
+	MF_MDC_CANCEL_FID4      = (1 << 6),
+	/* There is a pending attribute update. */
+	MF_SOM_AU	       = (1 << 7),
+	/* Cancel OST locks while getting OST attributes (getattr). */
+	MF_GETATTR_LOCK	 = (1 << 8),
+	MF_GET_MDT_IDX	  = (1 << 9),
+};
+
+#define MF_SOM_LOCAL_FLAGS (MF_SOM_CHANGE | MF_EPOCH_OPEN | MF_EPOCH_CLOSE)
+
+#define LUSTRE_BFLAG_UNCOMMITTED_WRITES   0x1
+
+/* these should be identical to their EXT4_*_FL counterparts, they are
+ * redefined here only to avoid dragging in fs/ext4/ext4.h */
+#define LUSTRE_SYNC_FL	 0x00000008 /* Synchronous updates */
+#define LUSTRE_IMMUTABLE_FL    0x00000010 /* Immutable file */
+#define LUSTRE_APPEND_FL       0x00000020 /* writes to file may only append */
+#define LUSTRE_NOATIME_FL      0x00000080 /* do not update atime */
+#define LUSTRE_DIRSYNC_FL      0x00010000 /* dirsync behaviour (dir only) */
+
+/* Translate wire LUSTRE_*_FL bits into the client-local VFS S_* bits kept in
+ * the inode i_flags.  The LUSTRE_*_FL are the Lustre wire protocol
+ * equivalents of LDISKFS_*_FL values stored on disk, while the S_* flags
+ * are kernel-internal values that change between kernel versions.  These
+ * flags are set/cleared via FSFILT_IOC_{GET,SET}_FLAGS.
+ * See b=16526 for a full history. */
+static inline int ll_ext_to_inode_flags(int flags)
+{
+	int iflags = (flags & LUSTRE_SYNC_FL) ? S_SYNC : 0;
+	iflags |= (flags & LUSTRE_NOATIME_FL) ? S_NOATIME : 0;
+	iflags |= (flags & LUSTRE_APPEND_FL) ? S_APPEND : 0;
+#if defined(S_DIRSYNC)
+	iflags |= (flags & LUSTRE_DIRSYNC_FL) ? S_DIRSYNC : 0;
+#endif
+	return iflags | ((flags & LUSTRE_IMMUTABLE_FL) ? S_IMMUTABLE : 0);
+}
+
+static inline int ll_inode_to_ext_flags(int iflags)
+{
+	int flags = (iflags & S_SYNC) ? LUSTRE_SYNC_FL : 0;
+	flags |= (iflags & S_NOATIME) ? LUSTRE_NOATIME_FL : 0;
+	flags |= (iflags & S_APPEND) ? LUSTRE_APPEND_FL : 0;
+#if defined(S_DIRSYNC)
+	flags |= (iflags & S_DIRSYNC) ? LUSTRE_DIRSYNC_FL : 0;
+#endif
+	return flags | ((iflags & S_IMMUTABLE) ? LUSTRE_IMMUTABLE_FL : 0);
+}
+
+struct mdt_body {
+	struct lu_fid	fid1;
+	struct lu_fid	fid2;
+	struct lustre_handle handle;
+	__u64		valid;
+	__u64		size;	/* Offset, in the case of MDS_READPAGE */
+	obd_time	mtime;
+	obd_time	atime;
+	obd_time	ctime;
+	__u64		blocks;	/* XID, in the case of MDS_READPAGE */
+	__u64		ioepoch;
+	__u64		unused1; /* was "ino" until 2.4.0 */
+	__u32		fsuid;
+	__u32		fsgid;
+	__u32		capability;
+	__u32		mode;
+	__u32		uid;
+	__u32		gid;
+	__u32		flags; /* from vfs for pin/unpin, LUSTRE_BFLAG close */
+	__u32		rdev;
+	__u32		nlink; /* #bytes to read in the case of MDS_READPAGE */
+	__u32		unused2; /* was "generation" until 2.4.0 */
+	__u32		suppgid;
+	__u32		eadatasize;
+	__u32		aclsize;
+	__u32		max_mdsize;
+	__u32		max_cookiesize;
+	__u32		uid_h; /* high 32-bits of uid, for FUID */
+	__u32		gid_h; /* high 32-bits of gid, for FUID */
+	__u32		padding_5; /* also fix lustre_swab_mdt_body */
+	__u64		padding_6;
+	__u64		padding_7;
+	__u64		padding_8;
+	__u64		padding_9;
+	__u64		padding_10;
+}; /* 216 */
+
+extern void lustre_swab_mdt_body (struct mdt_body *b);
+
+struct mdt_ioepoch {
+	struct lustre_handle handle;
+	__u64  ioepoch;
+	__u32  flags;
+	__u32  padding;
+};
+
+extern void lustre_swab_mdt_ioepoch (struct mdt_ioepoch *b);
+
+/* permissions for md_perm.mp_perm */
+enum {
+	CFS_SETUID_PERM = 0x01,
+	CFS_SETGID_PERM = 0x02,
+	CFS_SETGRP_PERM = 0x04,
+	CFS_RMTACL_PERM = 0x08,
+	CFS_RMTOWN_PERM = 0x10
+};
+
+/* inode access permission for remote user, the inode info are omitted,
+ * for client knows them. */
+struct mdt_remote_perm {
+	__u32	   rp_uid;
+	__u32	   rp_gid;
+	__u32	   rp_fsuid;
+	__u32	   rp_fsuid_h;
+	__u32	   rp_fsgid;
+	__u32	   rp_fsgid_h;
+	__u32	   rp_access_perm; /* MAY_READ/WRITE/EXEC */
+	__u32	   rp_padding;
+};
+
+extern void lustre_swab_mdt_remote_perm(struct mdt_remote_perm *p);
+
+struct mdt_rec_setattr {
+	__u32	   sa_opcode;
+	__u32	   sa_cap;
+	__u32	   sa_fsuid;
+	__u32	   sa_fsuid_h;
+	__u32	   sa_fsgid;
+	__u32	   sa_fsgid_h;
+	__u32	   sa_suppgid;
+	__u32	   sa_suppgid_h;
+	__u32	   sa_padding_1;
+	__u32	   sa_padding_1_h;
+	struct lu_fid   sa_fid;
+	__u64	   sa_valid;
+	__u32	   sa_uid;
+	__u32	   sa_gid;
+	__u64	   sa_size;
+	__u64	   sa_blocks;
+	obd_time	sa_mtime;
+	obd_time	sa_atime;
+	obd_time	sa_ctime;
+	__u32	   sa_attr_flags;
+	__u32	   sa_mode;
+	__u32	   sa_bias;      /* some operation flags */
+	__u32	   sa_padding_3;
+	__u32	   sa_padding_4;
+	__u32	   sa_padding_5;
+};
+
+extern void lustre_swab_mdt_rec_setattr (struct mdt_rec_setattr *sa);
+
+/*
+ * Attribute flags used in mdt_rec_setattr::sa_valid.
+ * The kernel's #defines for ATTR_* should not be used over the network
+ * since the client and MDS may run different kernels (see bug 13828)
+ * Therefore, we should only use MDS_ATTR_* attributes for sa_valid.
+ */
+#define MDS_ATTR_MODE	  0x1ULL /* = 1 */
+#define MDS_ATTR_UID	   0x2ULL /* = 2 */
+#define MDS_ATTR_GID	   0x4ULL /* = 4 */
+#define MDS_ATTR_SIZE	  0x8ULL /* = 8 */
+#define MDS_ATTR_ATIME	0x10ULL /* = 16 */
+#define MDS_ATTR_MTIME	0x20ULL /* = 32 */
+#define MDS_ATTR_CTIME	0x40ULL /* = 64 */
+#define MDS_ATTR_ATIME_SET    0x80ULL /* = 128 */
+#define MDS_ATTR_MTIME_SET   0x100ULL /* = 256 */
+#define MDS_ATTR_FORCE       0x200ULL /* = 512, Not a change, but a change it */
+#define MDS_ATTR_ATTR_FLAG   0x400ULL /* = 1024 */
+#define MDS_ATTR_KILL_SUID   0x800ULL /* = 2048 */
+#define MDS_ATTR_KILL_SGID  0x1000ULL /* = 4096 */
+#define MDS_ATTR_CTIME_SET  0x2000ULL /* = 8192 */
+#define MDS_ATTR_FROM_OPEN  0x4000ULL /* = 16384, called from open path, ie O_TRUNC */
+#define MDS_ATTR_BLOCKS     0x8000ULL /* = 32768 */
+
+#ifndef FMODE_READ
+#define FMODE_READ	       00000001
+#define FMODE_WRITE	      00000002
+#endif
+
+#define MDS_FMODE_CLOSED	 00000000
+#define MDS_FMODE_EXEC	   00000004
+/* IO Epoch is opened on a closed file. */
+#define MDS_FMODE_EPOCH	  01000000
+/* IO Epoch is opened on a file truncate. */
+#define MDS_FMODE_TRUNC	  02000000
+/* Size-on-MDS Attribute Update is pending. */
+#define MDS_FMODE_SOM	    04000000
+
+#define MDS_OPEN_CREATED	 00000010
+#define MDS_OPEN_CROSS	   00000020
+
+#define MDS_OPEN_CREAT	   00000100
+#define MDS_OPEN_EXCL	    00000200
+#define MDS_OPEN_TRUNC	   00001000
+#define MDS_OPEN_APPEND	  00002000
+#define MDS_OPEN_SYNC	    00010000
+#define MDS_OPEN_DIRECTORY       00200000
+
+#define MDS_OPEN_BY_FID		040000000 /* open_by_fid for known object */
+#define MDS_OPEN_DELAY_CREATE  0100000000 /* delay initial object create */
+#define MDS_OPEN_OWNEROVERRIDE 0200000000 /* NFSD rw-reopen ro file for owner */
+#define MDS_OPEN_JOIN_FILE     0400000000 /* open for join file.
+					   * We do not support JOIN FILE
+					   * anymore, reserve this flags
+					   * just for preventing such bit
+					   * to be reused. */
+
+#define MDS_OPEN_LOCK	 04000000000 /* This open requires open lock */
+#define MDS_OPEN_HAS_EA      010000000000 /* specify object create pattern */
+#define MDS_OPEN_HAS_OBJS    020000000000 /* Just set the EA the obj exist */
+#define MDS_OPEN_NORESTORE  0100000000000ULL /* Do not restore file at open */
+#define MDS_OPEN_NEWSTRIPE  0200000000000ULL /* New stripe needed (restripe or
+					      * hsm restore) */
+#define MDS_OPEN_VOLATILE   0400000000000ULL /* File is volatile = created
+						unlinked */
+
+/* permission for create non-directory file */
+#define MAY_CREATE      (1 << 7)
+/* permission for create directory file */
+#define MAY_LINK	(1 << 8)
+/* permission for delete from the directory */
+#define MAY_UNLINK      (1 << 9)
+/* source's permission for rename */
+#define MAY_RENAME_SRC  (1 << 10)
+/* target's permission for rename */
+#define MAY_RENAME_TAR  (1 << 11)
+/* part (parent's) VTX permission check */
+#define MAY_VTX_PART    (1 << 12)
+/* full VTX permission check */
+#define MAY_VTX_FULL    (1 << 13)
+/* lfs rgetfacl permission check */
+#define MAY_RGETFACL    (1 << 14)
+
+enum {
+	MDS_CHECK_SPLIT		= 1 << 0,
+	MDS_CROSS_REF		= 1 << 1,
+	MDS_VTX_BYPASS		= 1 << 2,
+	MDS_PERM_BYPASS		= 1 << 3,
+	MDS_SOM			= 1 << 4,
+	MDS_QUOTA_IGNORE	= 1 << 5,
+	MDS_CLOSE_CLEANUP	= 1 << 6,
+	MDS_KEEP_ORPHAN		= 1 << 7,
+	MDS_RECOV_OPEN		= 1 << 8,
+	MDS_DATA_MODIFIED	= 1 << 9,
+	MDS_CREATE_VOLATILE	= 1 << 10,
+	MDS_OWNEROVERRIDE	= 1 << 11,
+};
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_create {
+	__u32	   cr_opcode;
+	__u32	   cr_cap;
+	__u32	   cr_fsuid;
+	__u32	   cr_fsuid_h;
+	__u32	   cr_fsgid;
+	__u32	   cr_fsgid_h;
+	__u32	   cr_suppgid1;
+	__u32	   cr_suppgid1_h;
+	__u32	   cr_suppgid2;
+	__u32	   cr_suppgid2_h;
+	struct lu_fid   cr_fid1;
+	struct lu_fid   cr_fid2;
+	struct lustre_handle cr_old_handle; /* handle in case of open replay */
+	obd_time	cr_time;
+	__u64	   cr_rdev;
+	__u64	   cr_ioepoch;
+	__u64	   cr_padding_1;   /* rr_blocks */
+	__u32	   cr_mode;
+	__u32	   cr_bias;
+	/* use of helpers set/get_mrc_cr_flags() is needed to access
+	 * 64 bits cr_flags [cr_flags_l, cr_flags_h], this is done to
+	 * extend cr_flags size without breaking 1.8 compat */
+	__u32	   cr_flags_l;     /* for use with open, low  32 bits  */
+	__u32	   cr_flags_h;     /* for use with open, high 32 bits */
+	__u32	   cr_umask;       /* umask for create */
+	__u32	   cr_padding_4;   /* rr_padding_4 */
+};
+
+static inline void set_mrc_cr_flags(struct mdt_rec_create *mrc, __u64 flags)
+{
+	mrc->cr_flags_h = (__u32)(flags >> 32);	/* upper 32 bits of @flags */
+	mrc->cr_flags_l = (__u32)flags;		/* cast keeps the lower 32 */
+}
+
+static inline __u64 get_mrc_cr_flags(struct mdt_rec_create *mrc)
+{
+	return ((__u64)mrc->cr_flags_h << 32) | mrc->cr_flags_l; /* h:l -> 64 */
+}
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_link {	/* padding comments name the mdt_rec_reint field */
+	__u32	   lk_opcode;
+	__u32	   lk_cap;
+	__u32	   lk_fsuid;
+	__u32	   lk_fsuid_h;
+	__u32	   lk_fsgid;
+	__u32	   lk_fsgid_h;
+	__u32	   lk_suppgid1;
+	__u32	   lk_suppgid1_h;
+	__u32	   lk_suppgid2;
+	__u32	   lk_suppgid2_h;
+	struct lu_fid   lk_fid1;
+	struct lu_fid   lk_fid2;
+	obd_time	lk_time;	/* in rr_mtime slot */
+	__u64	   lk_padding_1;   /* rr_atime */
+	__u64	   lk_padding_2;   /* rr_ctime */
+	__u64	   lk_padding_3;   /* rr_size */
+	__u64	   lk_padding_4;   /* rr_blocks */
+	__u32	   lk_bias;
+	__u32	   lk_padding_5;   /* rr_mode */
+	__u32	   lk_padding_6;   /* rr_flags */
+	__u32	   lk_padding_7;   /* rr_flags_h */
+	__u32	   lk_padding_8;   /* rr_umask */
+	__u32	   lk_padding_9;   /* rr_padding_4 */
+};
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_unlink {	/* padding comments name the mdt_rec_reint field */
+	__u32	   ul_opcode;
+	__u32	   ul_cap;
+	__u32	   ul_fsuid;
+	__u32	   ul_fsuid_h;
+	__u32	   ul_fsgid;
+	__u32	   ul_fsgid_h;
+	__u32	   ul_suppgid1;
+	__u32	   ul_suppgid1_h;
+	__u32	   ul_suppgid2;
+	__u32	   ul_suppgid2_h;
+	struct lu_fid   ul_fid1;
+	struct lu_fid   ul_fid2;
+	obd_time	ul_time;	/* in rr_mtime slot */
+	__u64	   ul_padding_2;   /* rr_atime */
+	__u64	   ul_padding_3;   /* rr_ctime */
+	__u64	   ul_padding_4;   /* rr_size */
+	__u64	   ul_padding_5;   /* rr_blocks */
+	__u32	   ul_bias;
+	__u32	   ul_mode;
+	__u32	   ul_padding_6;   /* rr_flags */
+	__u32	   ul_padding_7;   /* rr_flags_h */
+	__u32	   ul_padding_8;   /* rr_umask */
+	__u32	   ul_padding_9;   /* rr_padding_4 */
+};
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_rename {	/* padding comments name the mdt_rec_reint field */
+	__u32	   rn_opcode;
+	__u32	   rn_cap;
+	__u32	   rn_fsuid;
+	__u32	   rn_fsuid_h;
+	__u32	   rn_fsgid;
+	__u32	   rn_fsgid_h;
+	__u32	   rn_suppgid1;
+	__u32	   rn_suppgid1_h;
+	__u32	   rn_suppgid2;
+	__u32	   rn_suppgid2_h;
+	struct lu_fid   rn_fid1;
+	struct lu_fid   rn_fid2;
+	obd_time	rn_time;	/* in rr_mtime slot */
+	__u64	   rn_padding_1;   /* rr_atime */
+	__u64	   rn_padding_2;   /* rr_ctime */
+	__u64	   rn_padding_3;   /* rr_size */
+	__u64	   rn_padding_4;   /* rr_blocks */
+	__u32	   rn_bias;	/* some operation flags */
+	__u32	   rn_mode;	/* cross-ref rename has mode */
+	__u32	   rn_padding_5;   /* rr_flags */
+	__u32	   rn_padding_6;   /* rr_flags_h */
+	__u32	   rn_padding_7;   /* rr_umask */
+	__u32	   rn_padding_8;   /* rr_padding_4 */
+};
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_setxattr { /* padding comments name the mdt_rec_reint field */
+	__u32	   sx_opcode;
+	__u32	   sx_cap;
+	__u32	   sx_fsuid;
+	__u32	   sx_fsuid_h;
+	__u32	   sx_fsgid;
+	__u32	   sx_fsgid_h;
+	__u32	   sx_suppgid1;
+	__u32	   sx_suppgid1_h;
+	__u32	   sx_suppgid2;
+	__u32	   sx_suppgid2_h;
+	struct lu_fid   sx_fid;
+	__u64	   sx_padding_1;   /* These three are rr_fid2 */
+	__u32	   sx_padding_2;
+	__u32	   sx_padding_3;
+	__u64	   sx_valid;	/* in rr_mtime slot */
+	obd_time	sx_time;	/* in rr_atime slot */
+	__u64	   sx_padding_5;   /* rr_ctime */
+	__u64	   sx_padding_6;   /* rr_size */
+	__u64	   sx_padding_7;   /* rr_blocks */
+	__u32	   sx_size;	/* in rr_bias slot */
+	__u32	   sx_flags;	/* in rr_mode slot */
+	__u32	   sx_padding_8;   /* rr_flags */
+	__u32	   sx_padding_9;   /* rr_flags_h */
+	__u32	   sx_padding_10;  /* rr_umask */
+	__u32	   sx_padding_11;  /* rr_padding_4 */
+};
+
+/*
+ * mdt_rec_reint is the template for all mdt_reint_xxx structures.
+ * Do NOT change the size of various members, otherwise the value
+ * will be broken in lustre_swab_mdt_rec_reint().
+ *
+ * If you add new members in other mdt_reint_xxx structures and need to use the
+ * rr_padding_x fields, then update lustre_swab_mdt_rec_reint() also.
+ */
+struct mdt_rec_reint {
+	__u32	   rr_opcode;
+	__u32	   rr_cap;
+	__u32	   rr_fsuid;
+	__u32	   rr_fsuid_h;
+	__u32	   rr_fsgid;
+	__u32	   rr_fsgid_h;
+	__u32	   rr_suppgid1;
+	__u32	   rr_suppgid1_h;
+	__u32	   rr_suppgid2;
+	__u32	   rr_suppgid2_h;
+	struct lu_fid   rr_fid1;
+	struct lu_fid   rr_fid2;
+	obd_time	rr_mtime;
+	obd_time	rr_atime;
+	obd_time	rr_ctime;
+	__u64	   rr_size;
+	__u64	   rr_blocks;
+	__u32	   rr_bias;
+	__u32	   rr_mode;
+	__u32	   rr_flags;
+	__u32	   rr_flags_h;
+	__u32	   rr_umask;
+	__u32	   rr_padding_4; /* also fix lustre_swab_mdt_rec_reint */
+};
+
+extern void lustre_swab_mdt_rec_reint(struct mdt_rec_reint *rr);
+
+struct lmv_desc {
+	__u32 ld_tgt_count;		/* how many MDS's */
+	__u32 ld_active_tgt_count;	 /* how many active */
+	__u32 ld_default_stripe_count;     /* how many objects are used */
+	__u32 ld_pattern;		  /* default MEA_MAGIC_* */
+	__u64 ld_default_hash_size;
+	__u64 ld_padding_1;		/* also fix lustre_swab_lmv_desc */
+	__u32 ld_padding_2;		/* also fix lustre_swab_lmv_desc */
+	__u32 ld_qos_maxage;	       /* in second */
+	__u32 ld_padding_3;		/* also fix lustre_swab_lmv_desc */
+	__u32 ld_padding_4;		/* also fix lustre_swab_lmv_desc */
+	struct obd_uuid ld_uuid;
+};
+
+extern void lustre_swab_lmv_desc (struct lmv_desc *ld);
+
+/* TODO: lmv_stripe_md should contain mds capabilities for all slave fids */
+struct lmv_stripe_md {
+	__u32	 mea_magic;
+	__u32	 mea_count;
+	__u32	 mea_master;
+	__u32	 mea_padding;
+	char	  mea_pool_name[LOV_MAXPOOLNAME];
+	struct lu_fid mea_ids[0];
+};
+
+extern void lustre_swab_lmv_stripe_md(struct lmv_stripe_md *mea);
+
+/* lmv structures */
+#define MEA_MAGIC_LAST_CHAR      0xb2221ca1
+#define MEA_MAGIC_ALL_CHARS      0xb222a11c
+#define MEA_MAGIC_HASH_SEGMENT   0xb222a11b
+
+#define MAX_HASH_SIZE_32	 0x7fffffffUL
+#define MAX_HASH_SIZE	    0x7fffffffffffffffULL
+#define MAX_HASH_HIGHEST_BIT     0x1000000000000000ULL
+
+enum fld_rpc_opc {
+	FLD_QUERY		       = 900,
+	FLD_LAST_OPC,
+	FLD_FIRST_OPC		   = FLD_QUERY
+};
+
+enum seq_rpc_opc {
+	SEQ_QUERY		       = 700,
+	SEQ_LAST_OPC,
+	SEQ_FIRST_OPC		   = SEQ_QUERY
+};
+
+enum seq_op {
+	SEQ_ALLOC_SUPER = 0,
+	SEQ_ALLOC_META = 1
+};
+
+/*
+ *  LOV data structures
+ */
+
+#define LOV_MAX_UUID_BUFFER_SIZE  8192
+/* The size of the buffer the lov/mdc reserves for the
+ * array of UUIDs returned by the MDS.  With the current
+ * protocol, this will limit the max number of OSTs per LOV */
+
+#define LOV_DESC_MAGIC 0xB0CCDE5C
+
+/* LOV settings descriptor (should only contain static info) */
+struct lov_desc {
+	__u32 ld_tgt_count;		/* how many OBD's */
+	__u32 ld_active_tgt_count;	 /* how many active */
+	__u32 ld_default_stripe_count;     /* how many objects are used */
+	__u32 ld_pattern;		  /* default PATTERN_RAID0 */
+	__u64 ld_default_stripe_size;      /* in bytes */
+	__u64 ld_default_stripe_offset;    /* in bytes */
+	__u32 ld_padding_0;		/* unused */
+	__u32 ld_qos_maxage;	       /* in second */
+	__u32 ld_padding_1;		/* also fix lustre_swab_lov_desc */
+	__u32 ld_padding_2;		/* also fix lustre_swab_lov_desc */
+	struct obd_uuid ld_uuid;
+};
+
+#define ld_magic ld_active_tgt_count       /* for swabbing from llogs */
+
+extern void lustre_swab_lov_desc (struct lov_desc *ld);
+
+/*
+ *   LDLM requests:
+ */
+/* opcodes -- MUST be distinct from OST/MDS opcodes */
+typedef enum {
+	LDLM_ENQUEUE     = 101,
+	LDLM_CONVERT     = 102,
+	LDLM_CANCEL      = 103,
+	LDLM_BL_CALLBACK = 104,
+	LDLM_CP_CALLBACK = 105,
+	LDLM_GL_CALLBACK = 106,
+	LDLM_SET_INFO    = 107,
+	LDLM_LAST_OPC
+} ldlm_cmd_t;
+#define LDLM_FIRST_OPC LDLM_ENQUEUE
+
+#define RES_NAME_SIZE 4
+struct ldlm_res_id {
+	__u64 name[RES_NAME_SIZE];
+};
+
+extern void lustre_swab_ldlm_res_id (struct ldlm_res_id *id);
+
+static inline int ldlm_res_eq(const struct ldlm_res_id *res0,
+			      const struct ldlm_res_id *res1)
+{	/* byte-wise comparison of the two ids; non-zero when identical */
+	return memcmp(res0, res1, sizeof(struct ldlm_res_id)) == 0;
+}
+
+/* lock types */
+typedef enum {
+	LCK_MINMODE = 0,
+	LCK_EX      = 1,
+	LCK_PW      = 2,
+	LCK_PR      = 4,
+	LCK_CW      = 8,
+	LCK_CR      = 16,
+	LCK_NL      = 32,
+	LCK_GROUP   = 64,
+	LCK_COS     = 128,
+	LCK_MAXMODE
+} ldlm_mode_t;
+
+#define LCK_MODE_NUM    8
+
+typedef enum {
+	LDLM_PLAIN     = 10,
+	LDLM_EXTENT    = 11,
+	LDLM_FLOCK     = 12,
+	LDLM_IBITS     = 13,
+	LDLM_MAX_TYPE
+} ldlm_type_t;
+
+#define LDLM_MIN_TYPE LDLM_PLAIN
+
+struct ldlm_extent {
+	__u64 start;
+	__u64 end;
+	__u64 gid;
+};
+
+static inline int ldlm_extent_overlap(struct ldlm_extent *ex1,
+				      struct ldlm_extent *ex2)
+{	/* inclusive ranges: disjoint only if one starts past the other's end */
+	return !(ex1->start > ex2->end || ex2->start > ex1->end);
+}
+
+/* return non-zero when @ex2 lies entirely inside @ex1 (bounds inclusive) */
+static inline int ldlm_extent_contain(struct ldlm_extent *ex1,
+				      struct ldlm_extent *ex2)
+{
+	return ex2->start >= ex1->start && ex2->end <= ex1->end;
+}
+
+struct ldlm_inodebits {
+	__u64 bits;
+};
+
+struct ldlm_flock_wire {
+	__u64 lfw_start;
+	__u64 lfw_end;
+	__u64 lfw_owner;
+	__u32 lfw_padding;
+	__u32 lfw_pid;
+};
+
+/* it's important that the fields of the ldlm_extent structure match
+ * the first fields of the ldlm_flock_wire structure because there is only
+ * one ldlm_swab routine to process the ldlm_wire_policy_data_t union. if
+ * this ever changes we will need to swab the union differently based
+ * on the resource type. */
+
+typedef union {
+	struct ldlm_extent l_extent;
+	struct ldlm_flock_wire l_flock;
+	struct ldlm_inodebits l_inodebits;
+} ldlm_wire_policy_data_t;
+
+extern void lustre_swab_ldlm_policy_data (ldlm_wire_policy_data_t *d);
+
+union ldlm_gl_desc {
+	struct ldlm_gl_lquota_desc	lquota_desc;
+};
+
+extern void lustre_swab_gl_desc(union ldlm_gl_desc *);
+
+struct ldlm_intent {
+	__u64 opc;
+};
+
+extern void lustre_swab_ldlm_intent (struct ldlm_intent *i);
+
+struct ldlm_resource_desc {
+	ldlm_type_t lr_type;
+	__u32 lr_padding;       /* also fix lustre_swab_ldlm_resource_desc */
+	struct ldlm_res_id lr_name;
+};
+
+extern void lustre_swab_ldlm_resource_desc (struct ldlm_resource_desc *r);
+
+struct ldlm_lock_desc {
+	struct ldlm_resource_desc l_resource;
+	ldlm_mode_t l_req_mode;
+	ldlm_mode_t l_granted_mode;
+	ldlm_wire_policy_data_t l_policy_data;
+};
+
+extern void lustre_swab_ldlm_lock_desc (struct ldlm_lock_desc *l);
+
+#define LDLM_LOCKREQ_HANDLES 2
+#define LDLM_ENQUEUE_CANCEL_OFF 1
+
+struct ldlm_request {
+	__u32 lock_flags;
+	__u32 lock_count;
+	struct ldlm_lock_desc lock_desc;
+	struct lustre_handle lock_handle[LDLM_LOCKREQ_HANDLES];
+};
+
+extern void lustre_swab_ldlm_request (struct ldlm_request *rq);
+
+/* Size of an ldlm_request able to carry @count handles: for LDLM_ENQUEUE one
+ * inline slot is already occupied (1 free), otherwise both (2) are free. */
+#define ldlm_request_bufsize(count, type)				\
+({									\
+	int _avail = LDLM_LOCKREQ_HANDLES;				\
+	_avail -= ((type) == LDLM_ENQUEUE ? LDLM_ENQUEUE_CANCEL_OFF : 0); \
+	sizeof(struct ldlm_request) +					\
+	((count) > _avail ? (count) - _avail : 0) *			\
+	sizeof(struct lustre_handle);					\
+})
+
+struct ldlm_reply {
+	__u32 lock_flags;
+	__u32 lock_padding;     /* also fix lustre_swab_ldlm_reply */
+	struct ldlm_lock_desc lock_desc;
+	struct lustre_handle lock_handle;
+	__u64  lock_policy_res1;
+	__u64  lock_policy_res2;
+};
+
+extern void lustre_swab_ldlm_reply (struct ldlm_reply *r);
+
+#define ldlm_flags_to_wire(flags)    ((__u32)(flags))
+#define ldlm_flags_from_wire(flags)  ((__u64)(flags))
+
+/*
+ * Opcodes for mountconf (mgs and mgc)
+ */
+typedef enum {
+	MGS_CONNECT = 250,
+	MGS_DISCONNECT,
+	MGS_EXCEPTION,	 /* node died, etc. */
+	MGS_TARGET_REG,	/* whenever target starts up */
+	MGS_TARGET_DEL,
+	MGS_SET_INFO,
+	MGS_CONFIG_READ,
+	MGS_LAST_OPC
+} mgs_cmd_t;
+#define MGS_FIRST_OPC MGS_CONNECT
+
+#define MGS_PARAM_MAXLEN 1024
+#define KEY_SET_INFO "set_info"
+
+struct mgs_send_param {
+	char	     mgs_param[MGS_PARAM_MAXLEN];
+};
+
+/* We pass this info to the MGS so it can write config logs */
+#define MTI_NAME_MAXLEN  64
+#define MTI_PARAM_MAXLEN 4096
+#define MTI_NIDS_MAX     32
+struct mgs_target_info {
+	__u32	    mti_lustre_ver;
+	__u32	    mti_stripe_index;
+	__u32	    mti_config_ver;
+	__u32	    mti_flags;
+	__u32	    mti_nid_count;
+	__u32	    mti_instance; /* Running instance of target */
+	char	     mti_fsname[MTI_NAME_MAXLEN];
+	char	     mti_svname[MTI_NAME_MAXLEN];
+	char	     mti_uuid[sizeof(struct obd_uuid)];
+	__u64	    mti_nids[MTI_NIDS_MAX];     /* host nids (lnet_nid_t)*/
+	char	     mti_params[MTI_PARAM_MAXLEN];
+};
+extern void lustre_swab_mgs_target_info(struct mgs_target_info *oinfo);
+
+struct mgs_nidtbl_entry {
+	__u64	   mne_version;    /* table version of this entry */
+	__u32	   mne_instance;   /* target instance # */
+	__u32	   mne_index;      /* target index */
+	__u32	   mne_length;     /* length of this entry - by bytes */
+	__u8	    mne_type;       /* target type LDD_F_SV_TYPE_OST/MDT */
+	__u8	    mne_nid_type;   /* type of nid(mbz). for ipv6. */
+	__u8	    mne_nid_size;   /* size of each NID, by bytes */
+	__u8	    mne_nid_count;  /* # of NIDs in buffer */
+	union {
+		lnet_nid_t nids[0];     /* variable size buffer for NIDs. */
+	} u;
+};
+extern void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *oinfo);
+
+struct mgs_config_body {
+	char     mcb_name[MTI_NAME_MAXLEN]; /* logname */
+	__u64    mcb_offset;    /* next index of config log to request */
+	__u16    mcb_type;      /* type of log: CONFIG_T_[CONFIG|RECOVER] */
+	__u8     mcb_reserved;
+	__u8     mcb_bits;      /* bits unit size of config log */
+	__u32    mcb_units;     /* # of units for bulk transfer */
+};
+extern void lustre_swab_mgs_config_body(struct mgs_config_body *body);
+
+struct mgs_config_res {
+	__u64    mcr_offset;    /* index of last config log */
+	__u64    mcr_size;      /* size of the log */
+};
+extern void lustre_swab_mgs_config_res(struct mgs_config_res *body);
+
+/* Config marker flags (in config log) */
+#define CM_START       0x01
+#define CM_END	 0x02
+#define CM_SKIP	0x04
+#define CM_UPGRADE146  0x08
+#define CM_EXCLUDE     0x10
+#define CM_START_SKIP (CM_START | CM_SKIP)
+
+struct cfg_marker {
+	__u32	     cm_step;       /* aka config version */
+	__u32	     cm_flags;
+	__u32	     cm_vers;       /* lustre release version number */
+	__u32	     cm_padding;    /* 64 bit align */
+	obd_time	  cm_createtime; /*when this record was first created */
+	obd_time	  cm_canceltime; /*when this record is no longer valid*/
+	char	      cm_tgtname[MTI_NAME_MAXLEN];
+	char	      cm_comment[MTI_NAME_MAXLEN];
+};
+
+extern void lustre_swab_cfg_marker(struct cfg_marker *marker,
+				   int swab, int size);
+
+/*
+ * Opcodes for multiple servers.
+ */
+
+typedef enum {
+	OBD_PING = 400,
+	OBD_LOG_CANCEL,
+	OBD_QC_CALLBACK,
+	OBD_IDX_READ,
+	OBD_LAST_OPC
+} obd_cmd_t;
+#define OBD_FIRST_OPC OBD_PING
+
+/* catalog of log objects */
+
+/** Identifier for a single log object */
+struct llog_logid {
+	struct ost_id		lgl_oi;
+	__u32		   lgl_ogen;
+} __attribute__((packed));
+
+/** Records written to the CATALOGS list */
+#define CATLIST "CATALOGS"
+struct llog_catid {
+	struct llog_logid       lci_logid;
+	__u32		   lci_padding1;
+	__u32		   lci_padding2;
+	__u32		   lci_padding3;
+} __attribute__((packed));
+
+/* Log data record types - there is no specific reason that these need to
+ * be related to the RPC opcodes, but no reason not to (may be handy later?)
+ */
+#define LLOG_OP_MAGIC 0x10600000
+#define LLOG_OP_MASK  0xfff00000
+
+typedef enum {
+	LLOG_PAD_MAGIC		= LLOG_OP_MAGIC | 0x00000,
+	OST_SZ_REC		= LLOG_OP_MAGIC | 0x00f00,
+	/* OST_RAID1_REC	= LLOG_OP_MAGIC | 0x01000, never used */
+	MDS_UNLINK_REC		= LLOG_OP_MAGIC | 0x10000 | (MDS_REINT << 8) |
+				  REINT_UNLINK, /* obsolete after 2.5.0 */
+	MDS_UNLINK64_REC	= LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) |
+				  REINT_UNLINK,
+	/* MDS_SETATTR_REC	= LLOG_OP_MAGIC | 0x12401, obsolete 1.8.0 */
+	MDS_SETATTR64_REC	= LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) |
+				  REINT_SETATTR,
+	OBD_CFG_REC		= LLOG_OP_MAGIC | 0x20000,
+	/* PTL_CFG_REC		= LLOG_OP_MAGIC | 0x30000, obsolete 1.4.0 */
+	LLOG_GEN_REC		= LLOG_OP_MAGIC | 0x40000,
+	/* LLOG_JOIN_REC	= LLOG_OP_MAGIC | 0x50000, obsolete  1.8.0 */
+	CHANGELOG_REC		= LLOG_OP_MAGIC | 0x60000,
+	CHANGELOG_USER_REC	= LLOG_OP_MAGIC | 0x70000,
+	LLOG_HDR_MAGIC		= LLOG_OP_MAGIC | 0x45539,
+	LLOG_LOGID_MAGIC	= LLOG_OP_MAGIC | 0x4553b,
+} llog_op_type;
+
+#define LLOG_REC_HDR_NEEDS_SWABBING(r) \
+	(((r)->lrh_type & __swab32(LLOG_OP_MASK)) == __swab32(LLOG_OP_MAGIC))
+
+/** Log record header - stored in little endian order.
+ * Each record must start with this struct, end with a llog_rec_tail,
+ * and be a multiple of 256 bits in size.
+ */
+struct llog_rec_hdr {
+	__u32	lrh_len;
+	__u32	lrh_index;
+	__u32	lrh_type;
+	__u32	lrh_id;
+};
+
+struct llog_rec_tail {
+	__u32	lrt_len;
+	__u32	lrt_index;
+};
+
+/* Payload location/length of a record; arguments are parenthesised for safety */
+#define REC_DATA(ptr)						\
+	((void *)((char *)(ptr) + sizeof(struct llog_rec_hdr)))
+
+#define REC_DATA_LEN(rec)					\
+	((rec)->lrh_len - sizeof(struct llog_rec_hdr) -		\
+	 sizeof(struct llog_rec_tail))
+
+struct llog_logid_rec {
+	struct llog_rec_hdr	lid_hdr;
+	struct llog_logid	lid_id;
+	__u32			lid_padding1;
+	__u64			lid_padding2;
+	__u64			lid_padding3;
+	struct llog_rec_tail	lid_tail;
+} __attribute__((packed));
+
+struct llog_unlink_rec {
+	struct llog_rec_hdr	lur_hdr;
+	obd_id			lur_oid;
+	obd_count		lur_oseq;
+	obd_count		lur_count;
+	struct llog_rec_tail	lur_tail;
+} __attribute__((packed));
+
+struct llog_unlink64_rec {
+	struct llog_rec_hdr	lur_hdr;
+	struct lu_fid		lur_fid;
+	obd_count		lur_count; /* to destroy the lost precreated */
+	__u32			lur_padding1;
+	__u64			lur_padding2;
+	__u64			lur_padding3;
+	struct llog_rec_tail    lur_tail;
+} __attribute__((packed));
+
+struct llog_setattr64_rec {
+	struct llog_rec_hdr	lsr_hdr;
+	struct ost_id		lsr_oi;
+	__u32			lsr_uid;
+	__u32			lsr_uid_h;
+	__u32			lsr_gid;
+	__u32			lsr_gid_h;
+	__u64			lsr_padding;
+	struct llog_rec_tail    lsr_tail;
+} __attribute__((packed));
+
+struct llog_size_change_rec {
+	struct llog_rec_hdr	lsc_hdr;
+	struct ll_fid		lsc_fid;
+	__u32			lsc_ioepoch;
+	__u32			lsc_padding1;
+	__u64			lsc_padding2;
+	__u64			lsc_padding3;
+	struct llog_rec_tail	lsc_tail;
+} __attribute__((packed));
+
+#define CHANGELOG_MAGIC 0xca103000
+
+/** \a changelog_rec_type's that can't be masked */
+#define CHANGELOG_MINMASK (1 << CL_MARK)
+/** bits covering all \a changelog_rec_type's */
+#define CHANGELOG_ALLMASK 0XFFFFFFFF
+/** default \a changelog_rec_type mask: everything but CL_ATIME and CL_CLOSE */
+#define CHANGELOG_DEFMASK (CHANGELOG_ALLMASK & ~(1 << CL_ATIME | 1 << CL_CLOSE))
+
+/* changelog llog name, needed by client replicators */
+#define CHANGELOG_CATALOG "changelog_catalog"
+
+struct changelog_setinfo {
+	__u64 cs_recno;
+	__u32 cs_id;
+} __attribute__((packed));
+
+/** changelog record */
+struct llog_changelog_rec {
+	struct llog_rec_hdr  cr_hdr;
+	struct changelog_rec cr;
+	struct llog_rec_tail cr_tail; /**< for_sizeof_only */
+} __attribute__((packed));
+
+struct llog_changelog_ext_rec {
+	struct llog_rec_hdr      cr_hdr;
+	struct changelog_ext_rec cr;
+	struct llog_rec_tail     cr_tail; /**< for_sizeof_only */
+} __attribute__((packed));
+
+#define CHANGELOG_USER_PREFIX "cl"
+
+struct llog_changelog_user_rec {
+	struct llog_rec_hdr   cur_hdr;
+	__u32		 cur_id;
+	__u32		 cur_padding;
+	__u64		 cur_endrec;
+	struct llog_rec_tail  cur_tail;
+} __attribute__((packed));
+
+/* Old llog gen for compatibility */
+struct llog_gen {
+	__u64 mnt_cnt;
+	__u64 conn_cnt;
+} __attribute__((packed));
+
+struct llog_gen_rec {
+	struct llog_rec_hdr	lgr_hdr;
+	struct llog_gen		lgr_gen;
+	__u64			padding1;
+	__u64			padding2;
+	__u64			padding3;
+	struct llog_rec_tail	lgr_tail;
+};
+
+/* On-disk header structure of each log object, stored in little endian order */
+#define LLOG_CHUNK_SIZE	 8192
+#define LLOG_HEADER_SIZE	(96)
+#define LLOG_BITMAP_BYTES       (LLOG_CHUNK_SIZE - LLOG_HEADER_SIZE)
+
+#define LLOG_MIN_REC_SIZE       (24) /* round(llog_rec_hdr + llog_rec_tail) */
+
+/* flags for the logs */
+enum llog_flag {
+	LLOG_F_ZAP_WHEN_EMPTY	= 0x1,
+	LLOG_F_IS_CAT		= 0x2,
+	LLOG_F_IS_PLAIN		= 0x4,
+};
+
+struct llog_log_hdr {
+	struct llog_rec_hdr     llh_hdr;
+	obd_time		llh_timestamp;
+	__u32		   llh_count;
+	__u32		   llh_bitmap_offset;
+	__u32		   llh_size;
+	__u32		   llh_flags;
+	__u32		   llh_cat_idx;
+	/* for a catalog the first plain slot is next to it */
+	struct obd_uuid	 llh_tgtuuid;
+	__u32		   llh_reserved[LLOG_HEADER_SIZE/sizeof(__u32) - 23];
+	__u32		   llh_bitmap[LLOG_BITMAP_BYTES/sizeof(__u32)];
+	struct llog_rec_tail    llh_tail;
+} __attribute__((packed));
+
+#define LLOG_BITMAP_SIZE(llh)  ((__u32)(((llh)->llh_hdr.lrh_len -	\
+					 (llh)->llh_bitmap_offset -	\
+					 sizeof((llh)->llh_tail)) * 8)) /* bits */
+
+/** log cookies are used to reference a specific log file and a record therein */
+struct llog_cookie {
+	struct llog_logid       lgc_lgl;
+	__u32		   lgc_subsys;
+	__u32		   lgc_index;
+	__u32		   lgc_padding;
+} __attribute__((packed));
+
+/** llog protocol */
+enum llogd_rpc_ops {
+	LLOG_ORIGIN_HANDLE_CREATE       = 501,
+	LLOG_ORIGIN_HANDLE_NEXT_BLOCK   = 502,
+	LLOG_ORIGIN_HANDLE_READ_HEADER  = 503,
+	LLOG_ORIGIN_HANDLE_WRITE_REC    = 504,
+	LLOG_ORIGIN_HANDLE_CLOSE	= 505,
+	LLOG_ORIGIN_CONNECT	     = 506,
+	LLOG_CATINFO			= 507,  /* deprecated */
+	LLOG_ORIGIN_HANDLE_PREV_BLOCK   = 508,
+	LLOG_ORIGIN_HANDLE_DESTROY      = 509,  /* for destroy llog object*/
+	LLOG_LAST_OPC,
+	LLOG_FIRST_OPC		  = LLOG_ORIGIN_HANDLE_CREATE
+};
+
+struct llogd_body {
+	struct llog_logid  lgd_logid;
+	__u32 lgd_ctxt_idx;
+	__u32 lgd_llh_flags;
+	__u32 lgd_index;
+	__u32 lgd_saved_index;
+	__u32 lgd_len;
+	__u64 lgd_cur_offset;
+} __attribute__((packed));
+
+struct llogd_conn_body {
+	struct llog_gen	 lgdc_gen;
+	struct llog_logid       lgdc_logid;
+	__u32		   lgdc_ctxt_idx;
+} __attribute__((packed));
+
+/* Note: 64-bit types are 64-bit aligned in structure */
+struct obdo {
+	obd_valid	       o_valid;	/* hot fields in this obdo */
+	struct ost_id	   o_oi;
+	obd_id		  o_parent_seq;
+	obd_size		o_size;	 /* o_size-o_blocks == ost_lvb */
+	obd_time		o_mtime;
+	obd_time		o_atime;
+	obd_time		o_ctime;
+	obd_blocks	      o_blocks;       /* brw: cli sent cached bytes */
+	obd_size		o_grant;
+
+	/* 32-bit fields start here: keep an even number of them via padding */
+	obd_blksize	     o_blksize;      /* optimal IO blocksize */
+	obd_mode		o_mode;	 /* brw: cli sent cache remain */
+	obd_uid		 o_uid;
+	obd_gid		 o_gid;
+	obd_flag		o_flags;
+	obd_count	       o_nlink;	/* brw: checksum */
+	obd_count	       o_parent_oid;
+	obd_count		o_misc;		/* brw: o_dropped */
+
+	__u64		   o_ioepoch;      /* epoch in ost writes */
+	__u32		   o_stripe_idx;   /* holds stripe idx */
+	__u32		   o_parent_ver;
+	struct lustre_handle    o_handle;       /* brw: lock handle to prolong
+						 * locks */
+	struct llog_cookie      o_lcookie;      /* destroy: unlink cookie from
+						 * MDS */
+	__u32			o_uid_h;
+	__u32			o_gid_h;
+
+	__u64			o_data_version; /* getattr: sum of iversion for
+						 * each stripe.
+						 * brw: grant space consumed on
+						 * the client for the write */
+	__u64			o_padding_4;
+	__u64			o_padding_5;
+	__u64			o_padding_6;
+};
+
+#define o_dirty   o_blocks
+#define o_undirty o_mode
+#define o_dropped o_misc
+#define o_cksum   o_nlink
+#define o_grant_used o_data_version
+
+static inline void lustre_set_wire_obdo(struct obdo *wobdo, struct obdo *lobdo)
+{	/* copy @lobdo into @wobdo, then clear the OBD_FL_LOCAL_MASK bits */
+	memcpy(wobdo, lobdo, sizeof(struct obdo));
+	wobdo->o_flags = wobdo->o_flags & ~OBD_FL_LOCAL_MASK;
+}
+
+static inline void lustre_get_wire_obdo(struct obdo *lobdo, struct obdo *wobdo)
+{
+	/* local-only flag bits @lobdo held before being overwritten */
+	obd_flag saved = 0;
+
+	if (lobdo->o_valid & OBD_MD_FLFLAGS)
+		saved = lobdo->o_flags & OBD_FL_LOCAL_MASK;
+	LASSERT(!(wobdo->o_flags & OBD_FL_LOCAL_MASK));
+
+	memcpy(lobdo, wobdo, sizeof(struct obdo));
+	if (saved) {
+		/* the copy clobbered them: mark flags valid and restore */
+		lobdo->o_valid |= OBD_MD_FLFLAGS;
+		lobdo->o_flags = (lobdo->o_flags & ~OBD_FL_LOCAL_MASK) | saved;
+	}
+}
+
+extern void lustre_swab_obdo (struct obdo *o);
+
+/* request structure for OST's */
+struct ost_body {
+	struct  obdo oa;
+};
+
+/* Key for FIEMAP to be used in get_info calls */
+struct ll_fiemap_info_key {
+	char    name[8];
+	struct  obdo oa;
+	struct  ll_user_fiemap fiemap;
+};
+
+extern void lustre_swab_ost_body (struct ost_body *b);
+extern void lustre_swab_ost_last_id(obd_id *id);
+extern void lustre_swab_fiemap(struct ll_user_fiemap *fiemap);
+
+extern void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum);
+extern void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum);
+extern void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod,
+					    int stripe_count);
+extern void lustre_swab_lov_mds_md(struct lov_mds_md *lmm);
+
+/* llog_swab.c */
+extern void lustre_swab_llogd_body (struct llogd_body *d);
+extern void lustre_swab_llog_hdr (struct llog_log_hdr *h);
+extern void lustre_swab_llogd_conn_body (struct llogd_conn_body *d);
+extern void lustre_swab_llog_rec(struct llog_rec_hdr *rec);
+
+struct lustre_cfg;
+extern void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg);
+
+/* Functions for dumping PTLRPC fields */
+void dump_rniobuf(struct niobuf_remote *rnb);
+void dump_ioo(struct obd_ioobj *nb);
+void dump_obdo(struct obdo *oa);
+void dump_ost_body(struct ost_body *ob);
+void dump_rcs(__u32 *rc);
+
+#define IDX_INFO_MAGIC 0x3D37CC37
+
+/* Index file transfer through the network. The server serializes the index into
+ * a byte stream which is sent to the client via a bulk transfer */
+struct idx_info {
+	__u32		ii_magic;
+
+	/* reply: see idx_info_flags below */
+	__u32		ii_flags;
+
+	/* request & reply: number of lu_idxpage (to be) transferred */
+	__u16		ii_count;
+	__u16		ii_pad0;
+
+	/* request: requested attributes passed down to the iterator API */
+	__u32		ii_attrs;
+
+	/* request & reply: index file identifier (FID) */
+	struct lu_fid	ii_fid;
+
+	/* reply: version of the index file before starting to walk the index.
+	 * Please note that the version can be modified at any time during the
+	 * transfer */
+	__u64		ii_version;
+
+	/* request: hash to start with:
+	 * reply: hash of the first entry of the first lu_idxpage and hash
+	 *	of the entry to read next if any */
+	__u64		ii_hash_start;
+	__u64		ii_hash_end;
+
+	/* reply: size of keys in lu_idxpages, minimal one if II_FL_VARKEY is
+	 * set */
+	__u16		ii_keysize;
+
+	/* reply: size of records in lu_idxpages, minimal one if II_FL_VARREC
+	 * is set */
+	__u16		ii_recsize;
+
+	__u32		ii_pad1;
+	__u64		ii_pad2;
+	__u64		ii_pad3;
+};
+extern void lustre_swab_idx_info(struct idx_info *ii);
+
+#define II_END_OFF	MDS_DIR_END_OFF /* all entries have been read */
+
+/* List of flags used in idx_info::ii_flags */
+enum idx_info_flags {
+	II_FL_NOHASH	= 1 << 0, /* client doesn't care about hash value */
+	II_FL_VARKEY	= 1 << 1, /* keys can be of variable size */
+	II_FL_VARREC	= 1 << 2, /* records can be of variable size */
+	II_FL_NONUNQ	= 1 << 3, /* index supports non-unique keys */
+};
+
+#define LIP_MAGIC 0x8A6D6B6C
+
+/* 4KB (= LU_PAGE_SIZE) container gathering key/record pairs */
+struct lu_idxpage {
+	/* 16-byte header */
+	__u32	lip_magic;
+	__u16	lip_flags;
+	__u16	lip_nr;   /* number of entries in the container */
+	__u64	lip_pad0; /* additional padding for future use */
+
+	/* key/record pairs are stored in the remaining 4080 bytes.
+	 * depending upon the flags in idx_info::ii_flags, each key/record
+	 * pair might be preceded by:
+	 * - a hash value
+	 * - the key size (II_FL_VARKEY is set)
+	 * - the record size (II_FL_VARREC is set)
+	 *
+	 * For the time being, we only support fixed-size key & record. */
+	char	lip_entries[0];
+};
+extern void lustre_swab_lip_header(struct lu_idxpage *lip);
+
+#define LIP_HDR_SIZE (offsetof(struct lu_idxpage, lip_entries))
+
+/* Gather all possible type associated with a 4KB container */
+union lu_page {
+	struct lu_dirpage	lp_dir; /* for MDS_READPAGE */
+	struct lu_idxpage	lp_idx; /* for OBD_IDX_READ */
+	char			lp_array[LU_PAGE_SIZE];
+};
+
+/* security opcodes */
+typedef enum {
+	SEC_CTX_INIT	    = 801,
+	SEC_CTX_INIT_CONT       = 802,
+	SEC_CTX_FINI	    = 803,
+	SEC_LAST_OPC,
+	SEC_FIRST_OPC	   = SEC_CTX_INIT
+} sec_cmd_t;
+
+/*
+ * capa related definitions
+ */
+#define CAPA_HMAC_MAX_LEN       64
+#define CAPA_HMAC_KEY_MAX_LEN   56
+
+/* NB take care when changing the sequence of elements this struct,
+ * because the offset info is used in find_capa() */
+struct lustre_capa {
+	struct lu_fid   lc_fid;	 /** fid */
+	__u64	   lc_opc;	 /** operations allowed */
+	__u64	   lc_uid;	 /** file owner */
+	__u64	   lc_gid;	 /** file group */
+	__u32	   lc_flags;       /** HMAC algorithm & flags */
+	__u32	   lc_keyid;       /** key# used for the capability */
+	__u32	   lc_timeout;     /** capa timeout value (sec) */
+	__u32	   lc_expiry;      /** expiry time (sec) */
+	__u8	    lc_hmac[CAPA_HMAC_MAX_LEN];   /** HMAC */
+} __attribute__((packed));
+
+extern void lustre_swab_lustre_capa(struct lustre_capa *c);
+
+/** lustre_capa::lc_opc */
+enum {
+	CAPA_OPC_BODY_WRITE   = 1<<0,  /**< write object data */
+	CAPA_OPC_BODY_READ    = 1<<1,  /**< read object data */
+	CAPA_OPC_INDEX_LOOKUP = 1<<2,  /**< lookup object fid */
+	CAPA_OPC_INDEX_INSERT = 1<<3,  /**< insert object fid */
+	CAPA_OPC_INDEX_DELETE = 1<<4,  /**< delete object fid */
+	CAPA_OPC_OSS_WRITE    = 1<<5,  /**< write oss object data */
+	CAPA_OPC_OSS_READ     = 1<<6,  /**< read oss object data */
+	CAPA_OPC_OSS_TRUNC    = 1<<7,  /**< truncate oss object */
+	CAPA_OPC_OSS_DESTROY  = 1<<8,  /**< destroy oss object */
+	CAPA_OPC_META_WRITE   = 1<<9,  /**< write object meta data */
+	CAPA_OPC_META_READ    = 1<<10, /**< read object meta data */
+};
+
+#define CAPA_OPC_OSS_RW (CAPA_OPC_OSS_READ | CAPA_OPC_OSS_WRITE)
+#define CAPA_OPC_MDS_ONLY						   \
+	(CAPA_OPC_BODY_WRITE | CAPA_OPC_BODY_READ | CAPA_OPC_INDEX_LOOKUP | \
+	 CAPA_OPC_INDEX_INSERT | CAPA_OPC_INDEX_DELETE)
+#define CAPA_OPC_OSS_ONLY						   \
+	(CAPA_OPC_OSS_WRITE | CAPA_OPC_OSS_READ | CAPA_OPC_OSS_TRUNC |      \
+	 CAPA_OPC_OSS_DESTROY)
+#define CAPA_OPC_MDS_DEFAULT (~CAPA_OPC_OSS_ONLY) /* all but OSS-only opcs */
+#define CAPA_OPC_OSS_DEFAULT (~(CAPA_OPC_MDS_ONLY | CAPA_OPC_OSS_ONLY))
+
+/* MDS capability covers object capability for operations of body r/w
+ * (dir readpage/sendpage), index lookup/insert/delete and meta data r/w,
+ * while OSS capability only covers object capability for operations of
+ * oss data(file content) r/w/truncate.
+ */
+static inline int capa_for_mds(struct lustre_capa *c)
+{
+	return !!(c->lc_opc & CAPA_OPC_INDEX_LOOKUP); /* 1 if LOOKUP bit is set */
+}
+
+static inline int capa_for_oss(struct lustre_capa *c)
+{
+	return !(c->lc_opc & CAPA_OPC_INDEX_LOOKUP); /* 1 if LOOKUP bit is clear */
+}
+
+/* lustre_capa::lc_hmac_alg */
+enum {
+	CAPA_HMAC_ALG_SHA1 = 1, /**< sha1 algorithm */
+	CAPA_HMAC_ALG_MAX,
+};
+
+#define CAPA_FL_MASK	    0x00ffffff
+#define CAPA_HMAC_ALG_MASK      0xff000000
+
+struct lustre_capa_key {
+	__u64   lk_seq;       /**< mds# */
+	__u32   lk_keyid;     /**< key# */
+	__u32   lk_padding;
+	__u8    lk_key[CAPA_HMAC_KEY_MAX_LEN];    /**< key */
+} __attribute__((packed));
+
+extern void lustre_swab_lustre_capa_key(struct lustre_capa_key *k);
+
+/** The link ea holds 1 \a link_ea_entry for each hardlink */
+#define LINK_EA_MAGIC 0x11EAF1DFUL
+struct link_ea_header {
+	__u32 leh_magic;
+	__u32 leh_reccount;
+	__u64 leh_len;      /* total size */
+	/* future use */
+	__u32 padding1;
+	__u32 padding2;
+};
+
+/** Hardlink data is name and parent fid.
+ * Stored in this crazy struct for maximum packing and endian-neutrality
+ */
+struct link_ea_entry {
+	/** __u16 stored big-endian, unaligned */
+	unsigned char      lee_reclen[2];
+	unsigned char      lee_parent_fid[sizeof(struct lu_fid)];
+	char	       lee_name[0];
+}__attribute__((packed));
+
+/** fid2path request/reply structure */
+struct getinfo_fid2path {
+	struct lu_fid   gf_fid;
+	__u64	   gf_recno;
+	__u32	   gf_linkno;
+	__u32	   gf_pathlen;
+	char	    gf_path[0];
+} __attribute__((packed));
+
+void lustre_swab_fid2path (struct getinfo_fid2path *gf);
+
+enum {
+	LAYOUT_INTENT_ACCESS    = 0,
+	LAYOUT_INTENT_READ      = 1,
+	LAYOUT_INTENT_WRITE     = 2,
+	LAYOUT_INTENT_GLIMPSE   = 3,
+	LAYOUT_INTENT_TRUNC     = 4,
+	LAYOUT_INTENT_RELEASE   = 5,
+	LAYOUT_INTENT_RESTORE   = 6
+};
+
+/* enqueue layout lock with intent */
+struct layout_intent {
+	__u32 li_opc; /* intent operation for enqueue, read, write etc */
+	__u32 li_flags;
+	__u64 li_start;
+	__u64 li_end;
+};
+
+void lustre_swab_layout_intent(struct layout_intent *li);
+
+/**
+ * On the wire version of hsm_progress structure.
+ *
+ * Contains the userspace hsm_progress and some internal fields.
+ */
+struct hsm_progress_kernel {
+	/* Field taken from struct hsm_progress */
+	lustre_fid		hpk_fid;
+	__u64			hpk_cookie;
+	struct hsm_extent	hpk_extent;
+	__u16			hpk_flags;
+	__u16			hpk_errval; /* positive val */
+	__u32			hpk_padding1;
+	/* Additional fields */
+	__u64			hpk_data_version;
+	__u64			hpk_padding2;
+} __attribute__((packed));
+
+/* lustre_swab_*() prototypes for the HSM structures */
+extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus);
+extern void lustre_swab_hsm_current_action(struct hsm_current_action *action);
+extern void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk);
+extern void lustre_swab_hsm_user_item(struct hsm_user_item *hui);
+extern void lustre_swab_hsm_request(struct hsm_request *hr);
+
+/**
+ * These are object update opcode under UPDATE_OBJ, which is currently
+ * being used by cross-ref operations between MDT.
+ *
+ * During the cross-ref operation, the Master MDT, which the client sends the
+ * request to, will disassemble the operation into object updates, then OSP
+ * will send these updates to the remote MDT to be executed.
+ *
+ *   Update request format
+ *   magic:  UPDATE_BUFFER_MAGIC_V1
+ *   Count:  How many updates in the req.
+ *   bufs[0] : following are packets of object.
+ *   update[0]:
+ *		type: object_update_op, the op code of update
+ *		fid: The object fid of the update.
+ *		lens/bufs: other parameters of the update.
+ *   update[1]:
+ *		type: object_update_op, the op code of update
+ *		fid: The object fid of the update.
+ *		lens/bufs: other parameters of the update.
+ *   ..........
+ *   update[7]:	type: object_update_op, the op code of update
+ *		fid: The object fid of the update.
+ *		lens/bufs: other parameters of the update.
+ *   Currently at most 8 updates per object update request.
+ *
+ *******************************************************************
+ *   update reply format:
+ *
+ *   ur_version: UPDATE_REPLY_V1
+ *   ur_count:   The count of the reply, which is usually equal
+ *		 to the number of updates in the request.
+ *   ur_lens:    The reply lengths of each object update.
+ *
+ *   replies:    1st update reply  [4bytes_ret: other body]
+ *		 2nd update reply  [4bytes_ret: other body]
+ *		 .....
+ *		 nth update reply  [4bytes_ret: other body]
+ *
+ *   For each reply of the update, the format would be
+ *	 result(4 bytes):Other stuff
+ */
+
+#define UPDATE_MAX_OPS		10
+#define UPDATE_BUFFER_MAGIC_V1	0xBDDE0001
+#define UPDATE_BUFFER_MAGIC	UPDATE_BUFFER_MAGIC_V1
+#define UPDATE_BUF_COUNT	8
+enum object_update_op {
+	OBJ_CREATE		= 1,
+	OBJ_DESTROY		= 2,
+	OBJ_REF_ADD		= 3,
+	OBJ_REF_DEL		= 4,
+	OBJ_ATTR_SET		= 5,
+	OBJ_ATTR_GET		= 6,
+	OBJ_XATTR_SET		= 7,
+	OBJ_XATTR_GET		= 8,
+	OBJ_INDEX_LOOKUP	= 9,
+	OBJ_INDEX_INSERT	= 10,
+	OBJ_INDEX_DELETE	= 11,
+	OBJ_LAST
+};
+
+struct update {
+	__u32		u_type;		/* op code, enum object_update_op */
+	__u32		u_batchid;
+	struct lu_fid	u_fid;		/* object fid of the update */
+	__u32		u_lens[UPDATE_BUF_COUNT]; /* parameter lengths */
+	__u32		u_bufs[0];	/* other parameters of the update */
+};
+
+struct update_buf {
+	__u32	ub_magic;	/* UPDATE_BUFFER_MAGIC_V1 */
+	__u32	ub_count;	/* how many updates in the request */
+	__u32	ub_bufs[0];	/* the packed object updates */
+};
+
+#define UPDATE_REPLY_V1		0x00BD0001
+struct update_reply {
+	__u32	ur_version;	/* UPDATE_REPLY_V1 */
+	__u32	ur_count;	/* count of the replies */
+	__u32	ur_lens[0];	/* reply length of each object update */
+};
+
+void lustre_swab_update_buf(struct update_buf *ub);
+void lustre_swab_update_reply_buf(struct update_reply *ur);
+
+/** layout swap request structure
+ * fid1 and fid2 are in mdt_body
+ */
+struct mdc_swap_layouts {
+	__u64	   msl_flags;
+} __packed;
+
+void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl);
+
+#endif
+/** @} lustreidl */
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h b/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h
new file mode 100644
index 000000000000..1c87a61a7fc1
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h
@@ -0,0 +1,95 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * lustre/include/lustre/lustre_lfsck_user.h
+ *
+ * Lustre LFSCK userspace interfaces.
+ *
+ * Author: Fan Yong <yong.fan@whamcloud.com>
+ */
+
+#ifndef _LUSTRE_LFSCK_USER_H
+# define _LUSTRE_LFSCK_USER_H
+
+enum lfsck_param_flags {
+	/* Reset LFSCK iterator position to the device beginning. */
+	LPF_RESET       = 0x0001,
+
+	/* Exit when fail. */
+	LPF_FAILOUT     = 0x0002,
+
+	/* Dryrun mode, only check without modification */
+	LPF_DRYRUN      = 0x0004,
+};
+
+enum lfsck_type {
+	/* For MDT-OST consistency check/repair. */
+	LT_LAYOUT	= 0x0001,
+
+	/* For MDT-MDT consistency check/repair. */
+	LT_DNE		= 0x0002,
+
+	/* For FID-in-dirent and linkEA consistency check/repair. */
+	LT_NAMESPACE	= 0x0004,
+};
+
+#define LFSCK_VERSION_V1	1
+#define LFSCK_VERSION_V2	2
+
+#define LFSCK_TYPES_ALL		((__u16)(~0))
+#define LFSCK_TYPES_DEF		((__u16)0)
+#define LFSCK_TYPES_SUPPORTED	LT_NAMESPACE
+
+#define LFSCK_SPEED_NO_LIMIT	0
+#define LFSCK_SPEED_LIMIT_DEF	LFSCK_SPEED_NO_LIMIT
+
+enum lfsck_start_valid {
+	LSV_SPEED_LIMIT		= 0x00000001,
+	LSV_ERROR_HANDLE	= 0x00000002,
+	LSV_DRYRUN		= 0x00000004,
+};
+
+/* Arguments for starting lfsck. */
+struct lfsck_start {
+	/* Which arguments are valid, see 'enum lfsck_start_valid'. */
+	__u32   ls_valid;
+
+	/* How many items can be scanned at most per second. */
+	__u32   ls_speed_limit;
+
+	/* For compatibility between user space tools and kernel service. */
+	__u16   ls_version;
+
+	/* Which LFSCK components to be (have been) started. */
+	__u16   ls_active;
+
+	/* Flags for the LFSCK, see 'enum lfsck_param_flags'. */
+	__u16   ls_flags;
+
+	/* For 64-bits aligned. */
+	__u16   ls_padding;
+};
+
+#endif /* _LUSTRE_LFSCK_USER_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_user.h b/drivers/staging/lustre/lustre/include/lustre/lustre_user.h
new file mode 100644
index 000000000000..eaa94f5cab96
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/lustre_user.h
@@ -0,0 +1,1146 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/lustre_user.h
+ *
+ * Lustre public user-space interface definitions.
+ */
+
+#ifndef _LUSTRE_USER_H
+#define _LUSTRE_USER_H
+
+/** \defgroup lustreuser lustreuser
+ *
+ * @{
+ */
+
+#include <lustre/ll_fiemap.h>
+#include <linux/lustre_user.h>
+
+/* for statfs() */
+#define LL_SUPER_MAGIC 0x0BD00BD0
+
+#ifndef FSFILT_IOC_GETFLAGS
+#define FSFILT_IOC_GETFLAGS	       _IOR('f', 1, long)
+#define FSFILT_IOC_SETFLAGS	       _IOW('f', 2, long)
+#define FSFILT_IOC_GETVERSION	     _IOR('f', 3, long)
+#define FSFILT_IOC_SETVERSION	     _IOW('f', 4, long)
+#define FSFILT_IOC_GETVERSION_OLD	 _IOR('v', 1, long)
+#define FSFILT_IOC_SETVERSION_OLD	 _IOW('v', 2, long)
+#define FSFILT_IOC_FIEMAP		 _IOWR('f', 11, struct ll_user_fiemap)
+#endif
+
+/* FIEMAP flags supported by Lustre */
+#define LUSTRE_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_DEVICE_ORDER)
+
+enum obd_statfs_state {
+	OS_STATE_DEGRADED       = 0x00000001, /**< RAID degraded/rebuilding */
+	OS_STATE_READONLY       = 0x00000002, /**< filesystem is read-only */
+	OS_STATE_RDONLY_1       = 0x00000004, /**< obsolete 1.6, was EROFS=30 */
+	OS_STATE_RDONLY_2       = 0x00000008, /**< obsolete 1.6, was EROFS=30 */
+	OS_STATE_RDONLY_3       = 0x00000010, /**< obsolete 1.6, was EROFS=30 */
+};
+
+struct obd_statfs {
+	__u64	   os_type;
+	__u64	   os_blocks;
+	__u64	   os_bfree;
+	__u64	   os_bavail;
+	__u64	   os_files;
+	__u64	   os_ffree;
+	__u8	    os_fsid[40];
+	__u32	   os_bsize;
+	__u32	   os_namelen;
+	__u64	   os_maxbytes;
+	__u32	   os_state;       /**< obd_statfs_state OS_STATE_* flag */
+	__u32	   os_fprecreated;	/* objs available now to the caller */
+					/* used in QoS code to find preferred
+					 * OSTs */
+	__u32	   os_spare2;
+	__u32	   os_spare3;
+	__u32	   os_spare4;
+	__u32	   os_spare5;
+	__u32	   os_spare6;
+	__u32	   os_spare7;
+	__u32	   os_spare8;
+	__u32	   os_spare9;
+};
+
+/**
+ * File IDentifier.
+ *
+ * FID is a cluster-wide unique identifier of a file or an object (stripe).
+ * FIDs are never reused.
+ **/
+struct lu_fid {
+       /**
+	* FID sequence. Sequence is a unit of migration: all files (objects)
+	* with FIDs from a given sequence are stored on the same server.
+	* Lustre should support 2^64 objects, so even if each sequence
+	* has only a single object we can still enumerate 2^64 objects.
+	**/
+	__u64 f_seq;
+	/* FID number within sequence. */
+	__u32 f_oid;
+	/**
+	 * FID version, used to distinguish different versions (in the sense
+	 * of snapshots, etc.) of the same file system object. Not currently
+	 * used.
+	 **/
+	__u32 f_ver;
+};
+
+struct filter_fid {
+	struct lu_fid	ff_parent;  /* ff_parent.f_ver == file stripe number */
+};
+
+/* keep this one for compatibility */
+struct filter_fid_old {
+	struct lu_fid	ff_parent;
+	__u64		ff_objid;
+	__u64		ff_seq;
+};
+
+/* Userspace should treat lu_fid as opaque, and only use the following methods
+ * to print or parse them.  Other functions (e.g. compare, swab) could be moved
+ * here from lustre_idl.h if needed. */
+typedef struct lu_fid lustre_fid;
+
+/**
+ * Following struct for object attributes, that will be kept inode's EA.
+ * Introduced in 2.0 release (please see b15993, for details)
+ * Added to all objects since Lustre 2.4 as contains self FID
+ */
+struct lustre_mdt_attrs {
+	/**
+	 * Bitfield for supported data in this structure. From enum lma_compat.
+	 * lma_self_fid and lma_flags are always available.
+	 */
+	__u32   lma_compat;
+	/**
+	 * Per-file incompat feature list. Lustre version should support all
+	 * flags set in this field. The supported feature mask is available in
+	 * LMA_INCOMPAT_SUPP.
+	 */
+	__u32   lma_incompat;
+	/** FID of this inode */
+	struct lu_fid  lma_self_fid;
+};
+
+/**
+ * Prior to 2.4, the LMA structure also included SOM attributes which has since
+ * been moved to a dedicated xattr
+ * lma_flags was also removed because of lma_compat/incompat fields.
+ */
+#define LMA_OLD_SIZE (sizeof(struct lustre_mdt_attrs) + 5 * sizeof(__u64))
+
+/**
+ * OST object IDentifier.
+ */
+struct ost_id {
+	union {
+		struct ostid {
+			__u64	oi_id;
+			__u64	oi_seq;
+		} oi;
+		struct lu_fid oi_fid;
+	};
+};
+
+#define DOSTID LPX64":"LPU64
+#define POSTID(oi) ostid_seq(oi), ostid_id(oi)
+
+/*
+ * The ioctl naming rules:
+ * LL_*     - works on the currently opened filehandle instead of parent dir
+ * *_OBD_*  - gets data for both OSC or MDC (LOV, LMV indirectly)
+ * *_MDC_*  - gets/sets data related to MDC
+ * *_LOV_*  - gets/sets data related to OSC/LOV
+ * *FILE*   - called on parent dir and passes in a filename
+ * *STRIPE* - set/get lov_user_md
+ * *INFO    - set/get lov_user_mds_data
+ */
+/* see <lustre_lib.h> for ioctl numbers 101-150 */
+#define LL_IOC_GETFLAGS		 _IOR ('f', 151, long)
+#define LL_IOC_SETFLAGS		 _IOW ('f', 152, long)
+#define LL_IOC_CLRFLAGS		 _IOW ('f', 153, long)
+/* LL_IOC_LOV_SETSTRIPE: See also OBD_IOC_LOV_SETSTRIPE */
+#define LL_IOC_LOV_SETSTRIPE	    _IOW ('f', 154, long)
+/* LL_IOC_LOV_GETSTRIPE: See also OBD_IOC_LOV_GETSTRIPE */
+#define LL_IOC_LOV_GETSTRIPE	    _IOW ('f', 155, long)
+/* LL_IOC_LOV_SETEA: See also OBD_IOC_LOV_SETEA */
+#define LL_IOC_LOV_SETEA		_IOW ('f', 156, long)
+#define LL_IOC_RECREATE_OBJ	     _IOW ('f', 157, long)
+#define LL_IOC_RECREATE_FID	     _IOW ('f', 157, struct lu_fid)
+#define LL_IOC_GROUP_LOCK	       _IOW ('f', 158, long)
+#define LL_IOC_GROUP_UNLOCK	     _IOW ('f', 159, long)
+/* LL_IOC_QUOTACHECK: See also OBD_IOC_QUOTACHECK */
+#define LL_IOC_QUOTACHECK	       _IOW ('f', 160, int)
+/* LL_IOC_POLL_QUOTACHECK: See also OBD_IOC_POLL_QUOTACHECK */
+#define LL_IOC_POLL_QUOTACHECK	  _IOR ('f', 161, struct if_quotacheck *)
+/* LL_IOC_QUOTACTL: See also OBD_IOC_QUOTACTL */
+#define LL_IOC_QUOTACTL		 _IOWR('f', 162, struct if_quotactl)
+#define IOC_OBD_STATFS		  _IOWR('f', 164, struct obd_statfs *)
+#define IOC_LOV_GETINFO		 _IOWR('f', 165, struct lov_user_mds_data *)
+#define LL_IOC_FLUSHCTX		 _IOW ('f', 166, long)
+#define LL_IOC_RMTACL		   _IOW ('f', 167, long)
+#define LL_IOC_GETOBDCOUNT	      _IOR ('f', 168, long)
+#define LL_IOC_LLOOP_ATTACH	     _IOWR('f', 169, long)
+#define LL_IOC_LLOOP_DETACH	     _IOWR('f', 170, long)
+#define LL_IOC_LLOOP_INFO	       _IOWR('f', 171, struct lu_fid)
+#define LL_IOC_LLOOP_DETACH_BYDEV       _IOWR('f', 172, long)
+#define LL_IOC_PATH2FID		 _IOR ('f', 173, long)
+#define LL_IOC_GET_CONNECT_FLAGS	_IOWR('f', 174, __u64 *)
+#define LL_IOC_GET_MDTIDX	       _IOR ('f', 175, int)
+
+/* see <lustre_lib.h> for ioctl numbers 177-210 */
+
+#define LL_IOC_HSM_STATE_GET		_IOR('f', 211, struct hsm_user_state)
+#define LL_IOC_HSM_STATE_SET		_IOW('f', 212, struct hsm_state_set)
+#define LL_IOC_HSM_CT_START		_IOW('f', 213, struct lustre_kernelcomm)
+#define LL_IOC_HSM_COPY_START		_IOW('f', 214, struct hsm_copy *)
+#define LL_IOC_HSM_COPY_END		_IOW('f', 215, struct hsm_copy *)
+#define LL_IOC_HSM_PROGRESS		_IOW('f', 216, struct hsm_user_request)
+#define LL_IOC_HSM_REQUEST		_IOW('f', 217, struct hsm_user_request)
+#define LL_IOC_DATA_VERSION		_IOR('f', 218, struct ioc_data_version)
+#define LL_IOC_LOV_SWAP_LAYOUTS		_IOW('f', 219, \
+						struct lustre_swap_layouts)
+#define LL_IOC_HSM_ACTION		_IOR('f', 220, \
+						struct hsm_current_action)
+/* see <lustre_lib.h> for ioctl numbers 221-232 */
+
+#define LL_IOC_LMV_SETSTRIPE	    _IOWR('f', 240, struct lmv_user_md)
+#define LL_IOC_LMV_GETSTRIPE	    _IOWR('f', 241, struct lmv_user_md)
+#define LL_IOC_REMOVE_ENTRY	    _IOWR('f', 242, __u64)
+
+#define LL_STATFS_LMV	   1
+#define LL_STATFS_LOV	   2
+#define LL_STATFS_NODELAY	4
+
+#define IOC_MDC_TYPE	    'i'
+#define IOC_MDC_LOOKUP	  _IOWR(IOC_MDC_TYPE, 20, struct obd_device *)
+#define IOC_MDC_GETFILESTRIPE   _IOWR(IOC_MDC_TYPE, 21, struct lov_user_md *)
+#define IOC_MDC_GETFILEINFO     _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data *)
+#define LL_IOC_MDC_GETINFO      _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data *)
+
+/* Keep these for backward compatibility. */
+#define LL_IOC_OBD_STATFS       IOC_OBD_STATFS
+#define IOC_MDC_GETSTRIPE       IOC_MDC_GETFILESTRIPE
+
+
+#define MAX_OBD_NAME 128 /* If this changes, a NEW ioctl must be added */
+
+/* Hopefully O_LOV_DELAY_CREATE does not conflict with standard O_xxx flags.
+ * Previously it was defined as 0100000000 and conflicts with FMODE_NONOTIFY
+ * which was added since kernel 2.6.36, so we redefine it as 020000000.
+ * To be compatible with old version's statically linked binary, finally we
+ * define it as (020000000 | 0100000000).
+ * */
+#define O_LOV_DELAY_CREATE      0120000000
+
+#define LL_FILE_IGNORE_LOCK     0x00000001
+#define LL_FILE_GROUP_LOCKED    0x00000002
+#define LL_FILE_READAHEA	0x00000004
+#define LL_FILE_LOCKED_DIRECTIO 0x00000008 /* client-side locks with dio */
+#define LL_FILE_LOCKLESS_IO     0x00000010 /* server-side locks with cio */
+#define LL_FILE_RMTACL	  0x00000020
+
+#define LOV_USER_MAGIC_V1 0x0BD10BD0
+#define LOV_USER_MAGIC    LOV_USER_MAGIC_V1
+#define LOV_USER_MAGIC_JOIN_V1 0x0BD20BD0
+#define LOV_USER_MAGIC_V3 0x0BD30BD0
+
+#define LMV_MAGIC_V1      0x0CD10CD0    /*normal stripe lmv magic */
+#define LMV_USER_MAGIC    0x0CD20CD0    /*default lmv magic*/
+
+#define LOV_PATTERN_RAID0 0x001
+#define LOV_PATTERN_RAID1 0x002
+#define LOV_PATTERN_FIRST 0x100
+
+#define LOV_MAXPOOLNAME 16
+#define LOV_POOLNAMEF "%.16s"
+
+#define LOV_MIN_STRIPE_BITS 16   /* maximum PAGE_SIZE (ia64), power of 2 */
+#define LOV_MIN_STRIPE_SIZE (1 << LOV_MIN_STRIPE_BITS)
+#define LOV_MAX_STRIPE_COUNT_OLD 160
+/* This calculation is crafted so that input of 4096 will result in 160
+ * which in turn is equal to old maximal stripe count.
+ * XXX: In fact this is too simplified for now, what it also needs is to get
+ * ea_type argument to clearly know how much space each stripe consumes.
+ *
+ * The limit of 12 pages is somewhat arbitrary, but is a reasonably large
+ * allocation that is sufficient for the current generation of systems.
+ *
+ * (max buffer size - lov+rpc header) / sizeof(struct lov_ost_data_v1) */
+#define LOV_MAX_STRIPE_COUNT 2000  /* ((12 * 4096 - 256) / 24) */
+#define LOV_ALL_STRIPES       0xffff /* only valid for directories */
+#define LOV_V1_INSANE_STRIPE_COUNT 65532 /* maximum stripe count bz13933 */
+
+#define lov_user_ost_data lov_user_ost_data_v1
+struct lov_user_ost_data_v1 {     /* per-stripe data structure */
+	struct ost_id l_ost_oi;	  /* OST object ID */
+	__u32 l_ost_gen;	  /* generation of this OST index */
+	__u32 l_ost_idx;	  /* OST index in LOV */
+} __attribute__((packed));
+
+#define lov_user_md lov_user_md_v1
+struct lov_user_md_v1 {	   /* LOV EA user data (host-endian) */
+	__u32 lmm_magic;	  /* magic number = LOV_USER_MAGIC_V1 */
+	__u32 lmm_pattern;	/* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+	struct ost_id lmm_oi;	  /* LOV object ID */
+	__u32 lmm_stripe_size;    /* size of stripe in bytes */
+	__u16 lmm_stripe_count;   /* num stripes in use for this object */
+	union {
+		__u16 lmm_stripe_offset;  /* starting stripe offset in
+					   * lmm_objects, use when writing */
+		__u16 lmm_layout_gen;     /* layout generation number
+					   * used when reading */
+	};
+	struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+} __attribute__((packed,  __may_alias__));
+
+struct lov_user_md_v3 {	   /* LOV EA user data (host-endian) */
+	__u32 lmm_magic;	  /* magic number = LOV_USER_MAGIC_V3 */
+	__u32 lmm_pattern;	/* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+	struct ost_id lmm_oi;	  /* LOV object ID */
+	__u32 lmm_stripe_size;    /* size of stripe in bytes */
+	__u16 lmm_stripe_count;   /* num stripes in use for this object */
+	union {
+		__u16 lmm_stripe_offset;  /* starting stripe offset in
+					   * lmm_objects, use when writing */
+		__u16 lmm_layout_gen;     /* layout generation number
+					   * used when reading */
+	};
+	char  lmm_pool_name[LOV_MAXPOOLNAME]; /* pool name */
+	struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+} __attribute__((packed));
+
+/* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to
+ * use this.  It is unsafe to #define those values in this header as it
+ * is possible the application has already #included <sys/stat.h>. */
+#ifdef HAVE_LOV_USER_MDS_DATA
+#define lov_user_mds_data lov_user_mds_data_v1
+struct lov_user_mds_data_v1 {
+	lstat_t lmd_st;		 /* MDS stat struct */
+	struct lov_user_md_v1 lmd_lmm;  /* LOV EA V1 user data */
+} __attribute__((packed));
+
+struct lov_user_mds_data_v3 {
+	lstat_t lmd_st;		 /* MDS stat struct */
+	struct lov_user_md_v3 lmd_lmm;  /* LOV EA V3 user data */
+} __attribute__((packed));
+#endif
+
+/* keep this to be the same size as lov_user_ost_data_v1 */
+struct lmv_user_mds_data {
+	struct lu_fid	lum_fid;
+	__u32		lum_padding;
+	__u32		lum_mds;
+};
+
+/* lum_type */
+enum {
+	LMV_STRIPE_TYPE = 0,
+	LMV_DEFAULT_TYPE = 1,
+};
+
+#define lmv_user_md lmv_user_md_v1
+struct lmv_user_md_v1 {
+	__u32	lum_magic;	 /* must be the first field */
+	__u32	lum_stripe_count;  /* dirstripe count */
+	__u32	lum_stripe_offset; /* MDT idx for default dirstripe */
+	__u32	lum_hash_type;     /* Dir stripe policy */
+	__u32	lum_type;	  /* LMV type: default or normal */
+	__u32	lum_padding1;
+	__u32	lum_padding2;
+	__u32	lum_padding3;
+	char	lum_pool_name[LOV_MAXPOOLNAME];
+	struct	lmv_user_mds_data  lum_objects[0];
+};
+
+static inline int lmv_user_md_size(int stripes, int lmm_magic)
+{
+	return stripes * sizeof(struct lmv_user_mds_data) +
+	       sizeof(struct lmv_user_md);	/* lmm_magic is not consulted */
+}
+
+extern void lustre_swab_lmv_user_md(struct lmv_user_md *lum);
+
+struct ll_recreate_obj {
+	__u64 lrc_id;
+	__u32 lrc_ost_idx;
+};
+
+struct ll_fid {
+	__u64 id;	 /* holds object id */
+	__u32 generation; /* holds object generation */
+	__u32 f_type;     /* holds object type or stripe idx when passing it to
+			   * OST for saving into EA. */
+};
+
+#define UUID_MAX	40
+struct obd_uuid {
+	char uuid[UUID_MAX];
+};
+
+static inline int obd_uuid_equals(const struct obd_uuid *u1,
+				  const struct obd_uuid *u2)
+{
+	return strcmp(u1->uuid, u2->uuid) == 0; /* both must be NUL-terminated */
+}
+
+static inline int obd_uuid_empty(struct obd_uuid *uuid)
+{
+	return !uuid->uuid[0];	/* empty means a leading NUL byte */
+}
+
+static inline void obd_str2uuid(struct obd_uuid *uuid, const char *tmp)
+{
+	strncpy(uuid->uuid, tmp, sizeof(uuid->uuid)); /* copies and zero-pads */
+	uuid->uuid[sizeof(uuid->uuid) - 1] = '\0';   /* force termination */
+}
+
+/* For printf's only: always hands back a NUL-terminated string */
+static inline char *obd_uuid2str(struct obd_uuid *uuid)
+{
+	static char temp[sizeof(*uuid)];
+
+	if (uuid->uuid[sizeof(*uuid) - 1] == '\0')
+		return uuid->uuid;
+	/* Unterminated: return a truncated copy held in a shared static
+	 * buffer.  Not race-free, but its last byte is always NUL. */
+	memcpy(temp, uuid->uuid, sizeof(*uuid) - 1);
+	temp[sizeof(*uuid) - 1] = '\0';
+	return temp;
+}
+
+/* uuid/target name -> fsname (myfs-OST0007_UUID -> myfs); see also deuuidify */
+static inline void obd_uuid2fsname(char *buf, char *uuid, int buflen)
+{
+	char *p;
+
+	if (buflen <= 0)	/* no room even for the terminator */
+		return;
+	strncpy(buf, uuid, buflen - 1);
+	buf[buflen - 1] = '\0';	/* strncpy does not terminate on truncation */
+	p = strrchr(buf, '-');	/* cut at the last '-', if there is one */
+	if (p)
+		*p = '\0';
+}
+
+/* printf display format
+   e.g. printf("file FID is "DFID"\n", PFID(fid)); */
+#define DFID_NOBRACE LPX64":0x%x:0x%x"
+#define DFID "["DFID_NOBRACE"]"
+#define PFID(fid)     \
+	(fid)->f_seq, \
+	(fid)->f_oid, \
+	(fid)->f_ver
+
+/* scanf input parse format -- strip '[' first.
+   e.g. sscanf(fidstr, SFID, RFID(&fid)); */
+/* #define SFID "0x"LPX64i":0x"LPSZX":0x"LPSZX""
+liblustreapi.c:2893: warning: format '%lx' expects type 'long unsigned int *', but argument 4 has type 'unsigned int *'
+liblustreapi.c:2893: warning: format '%lx' expects type 'long unsigned int *', but argument 5 has type 'unsigned int *'
+*/
+#define SFID "0x"LPX64i":0x%x:0x%x"
+#define RFID(fid)     \
+	&((fid)->f_seq), \
+	&((fid)->f_oid), \
+	&((fid)->f_ver)
+
+
+/********* Quotas **********/
+
+/* these must be explicitly translated into linux Q_* in ll_dir_ioctl */
+#define LUSTRE_Q_QUOTAON    0x800002     /* turn quotas on */
+#define LUSTRE_Q_QUOTAOFF   0x800003     /* turn quotas off */
+#define LUSTRE_Q_GETINFO    0x800005     /* get information about quota files */
+#define LUSTRE_Q_SETINFO    0x800006     /* set information about quota files */
+#define LUSTRE_Q_GETQUOTA   0x800007     /* get user quota structure */
+#define LUSTRE_Q_SETQUOTA   0x800008     /* set user quota structure */
+/* lustre-specific control commands */
+#define LUSTRE_Q_INVALIDATE  0x80000b     /* invalidate quota data */
+#define LUSTRE_Q_FINVALIDATE 0x80000c     /* invalidate filter quota data */
+
+#define UGQUOTA 2       /* set both USRQUOTA and GRPQUOTA */
+
+struct if_quotacheck {
+	char		    obd_type[16];
+	struct obd_uuid	 obd_uuid;
+};
+
+#define IDENTITY_DOWNCALL_MAGIC 0x6d6dd629
+
+/* permission */
+#define N_PERMS_MAX      64
+
+struct perm_downcall_data {
+	__u64 pdd_nid;
+	__u32 pdd_perm;
+	__u32 pdd_padding;
+};
+
+struct identity_downcall_data {
+	__u32			    idd_magic;
+	__u32			    idd_err;
+	__u32			    idd_uid;
+	__u32			    idd_gid;
+	__u32			    idd_nperms;
+	__u32			    idd_ngroups;
+	struct perm_downcall_data idd_perms[N_PERMS_MAX];
+	__u32			    idd_groups[0];
+};
+
+/* for non-mapped uid/gid */
+#define NOBODY_UID      99
+#define NOBODY_GID      99
+
+#define INVALID_ID      (-1)
+
+enum {
+	RMT_LSETFACL    = 1,
+	RMT_LGETFACL    = 2,
+	RMT_RSETFACL    = 3,
+	RMT_RGETFACL    = 4
+};
+
+#ifdef NEED_QUOTA_DEFS
+#ifndef QIF_BLIMITS
+#define QIF_BLIMITS     1
+#define QIF_SPACE       2
+#define QIF_ILIMITS     4
+#define QIF_INODES      8
+#define QIF_BTIME       16
+#define QIF_ITIME       32
+#define QIF_LIMITS      (QIF_BLIMITS | QIF_ILIMITS)
+#define QIF_USAGE       (QIF_SPACE | QIF_INODES)
+#define QIF_TIMES       (QIF_BTIME | QIF_ITIME)
+#define QIF_ALL	 (QIF_LIMITS | QIF_USAGE | QIF_TIMES)
+#endif /* QIF_BLIMITS */
+
+#endif /* NEED_QUOTA_DEFS */
+
+/* lustre volatile file support
+ * file name header: ".^L^S^T^R:VOLATILE"
+ */
+#define LUSTRE_VOLATILE_HDR	".\x0c\x13\x14\x12:VOLATILE"
+#define LUSTRE_VOLATILE_HDR_LEN	14
+/* hdr + MDT index */
+#define LUSTRE_VOLATILE_IDX	LUSTRE_VOLATILE_HDR":%.4X:"
+
+typedef enum lustre_quota_version {
+	LUSTRE_QUOTA_V2 = 1
+} lustre_quota_version_t;
+
+/* XXX: same as if_dqinfo struct in kernel */
+struct obd_dqinfo {
+	__u64 dqi_bgrace;
+	__u64 dqi_igrace;
+	__u32 dqi_flags;
+	__u32 dqi_valid;
+};
+
+/* XXX: same as if_dqblk struct in kernel, plus one padding */
+struct obd_dqblk {
+	__u64 dqb_bhardlimit;
+	__u64 dqb_bsoftlimit;
+	__u64 dqb_curspace;
+	__u64 dqb_ihardlimit;
+	__u64 dqb_isoftlimit;
+	__u64 dqb_curinodes;
+	__u64 dqb_btime;
+	__u64 dqb_itime;
+	__u32 dqb_valid;
+	__u32 dqb_padding;
+};
+
+enum {
+	QC_GENERAL      = 0,
+	QC_MDTIDX       = 1,
+	QC_OSTIDX       = 2,
+	QC_UUID	 = 3
+};
+
+struct if_quotactl {
+	__u32		   qc_cmd;
+	__u32		   qc_type;
+	__u32		   qc_id;
+	__u32		   qc_stat;
+	__u32		   qc_valid;
+	__u32		   qc_idx;
+	struct obd_dqinfo       qc_dqinfo;
+	struct obd_dqblk	qc_dqblk;
+	char		    obd_type[16];
+	struct obd_uuid	 obd_uuid;
+};
+
+/* swap layout flags */
+#define	SWAP_LAYOUTS_CHECK_DV1		(1 << 0)
+#define	SWAP_LAYOUTS_CHECK_DV2		(1 << 1)
+#define	SWAP_LAYOUTS_KEEP_MTIME		(1 << 2)
+#define	SWAP_LAYOUTS_KEEP_ATIME		(1 << 3)
+struct lustre_swap_layouts {
+	__u64	sl_flags;
+	__u32	sl_fd;
+	__u32	sl_gid;
+	__u64	sl_dv1;
+	__u64	sl_dv2;
+};
+
+
+/********* Changelogs **********/
+/** Changelog record types */
+enum changelog_rec_type {
+	CL_MARK     = 0,
+	CL_CREATE   = 1,  /* namespace */
+	CL_MKDIR    = 2,  /* namespace */
+	CL_HARDLINK = 3,  /* namespace */
+	CL_SOFTLINK = 4,  /* namespace */
+	CL_MKNOD    = 5,  /* namespace */
+	CL_UNLINK   = 6,  /* namespace */
+	CL_RMDIR    = 7,  /* namespace */
+	CL_RENAME   = 8,  /* namespace */
+	CL_EXT      = 9,  /* namespace extended record (2nd half of rename) */
+	CL_OPEN     = 10, /* not currently used */
+	CL_CLOSE    = 11, /* may be written to log only with mtime change */
+	CL_IOCTL    = 12,
+	CL_TRUNC    = 13,
+	CL_SETATTR  = 14,
+	CL_XATTR    = 15,
+	CL_HSM      = 16, /* HSM specific events, see flags */
+	CL_MTIME    = 17, /* Precedence: setattr > mtime > ctime > atime */
+	CL_CTIME    = 18,
+	CL_ATIME    = 19,
+	CL_LAYOUT   = 20,
+	CL_LAST
+};
+
+/* Short name for a changelog_rec_type value, or NULL when out of range */
+static inline const char *changelog_type2str(int type)
+{
+	static const char *changelog_str[] = {
+		"MARK",  "CREAT", "MKDIR", "HLINK", "SLINK", "MKNOD", "UNLNK",
+		"RMDIR", "RENME", "RNMTO", "OPEN",  "CLOSE", "IOCTL", "TRUNC",
+		"SATTR", "XATTR", "HSM",   "MTIME", "CTIME", "ATIME", "LAYOUT"
+	};
+
+	return (type < 0 || type >= CL_LAST) ? NULL : changelog_str[type];
+}
+
+/* per-record flags */
+#define CLF_VERSION     0x1000
+#define CLF_EXT_VERSION 0x2000
+#define CLF_FLAGSHIFT   12
+#define CLF_FLAGMASK    ((1U << CLF_FLAGSHIFT) - 1)
+#define CLF_VERMASK     (~CLF_FLAGMASK)
+/* Anything under the flagmask may be per-type (if desired) */
+/* Flags for unlink */
+#define CLF_UNLINK_LAST       0x0001 /* Unlink of last hardlink */
+#define CLF_UNLINK_HSM_EXISTS 0x0002 /* File has something in HSM */
+				     /* HSM cleaning needed */
+/* Flags for rename */
+#define CLF_RENAME_LAST       0x0001 /* rename unlink last hardlink of target */
+
+/* Flags for HSM */
+/* 12b used (from high weight to low weight):
+ * 2b for flags
+ * 3b for event
+ * 7b for error code
+ */
+#define CLF_HSM_ERR_L	0 /* HSM return code, 7 bits */
+#define CLF_HSM_ERR_H	6
+#define CLF_HSM_EVENT_L      7 /* HSM event, 3 bits, see enum hsm_event */
+#define CLF_HSM_EVENT_H      9
+#define CLF_HSM_FLAG_L      10 /* HSM flags, 2 bits, 1 used, 1 spare */
+#define CLF_HSM_FLAG_H      11
+#define CLF_HSM_SPARE_L     12 /* 4 spare bits */
+#define CLF_HSM_SPARE_H     15
+#define CLF_HSM_LAST	15
+
+/* Remove bits higher than _h, then extract the value between _h and _l by
+ * shifting the lower weight to bit 0.  _b is handled as a 16-bit quantity. */
+#define CLF_GET_BITS(_b, _h, _l) ((((_b) << (CLF_HSM_LAST - (_h))) & 0xFFFF) \
+				   >> (CLF_HSM_LAST - (_h) + (_l)))
+
+#define CLF_HSM_SUCCESS      0x00
+#define CLF_HSM_MAXERROR     0x7E
+#define CLF_HSM_ERROVERFLOW  0x7F
+
+#define CLF_HSM_DIRTY	1 /* file is dirty after HSM request end */
+
+/* 3 bits field => 8 values allowed */
+enum hsm_event {
+	HE_ARCHIVE      = 0,
+	HE_RESTORE      = 1,
+	HE_CANCEL       = 2,
+	HE_RELEASE      = 3,
+	HE_REMOVE       = 4,
+	HE_STATE	= 5,
+	HE_SPARE1       = 6,
+	HE_SPARE2       = 7,
+};
+
+static inline enum hsm_event hsm_get_cl_event(__u16 flags)
+{
+	return CLF_GET_BITS(flags, CLF_HSM_EVENT_H, CLF_HSM_EVENT_L);
+}
+
+static inline void hsm_set_cl_event(int *flags, enum hsm_event he)
+{
+	*flags |= (he << CLF_HSM_EVENT_L);
+}
+
+static inline __u16 hsm_get_cl_flags(int flags)
+{
+	return CLF_GET_BITS(flags, CLF_HSM_FLAG_H, CLF_HSM_FLAG_L);
+}
+/* OR \a bits, shifted to CLF_HSM_FLAG_L, into *flags; no masking is done */
+static inline void hsm_set_cl_flags(int *flags, int bits)
+{
+	*flags |= (bits << CLF_HSM_FLAG_L);
+}
+/* Get the 7-bit HSM error; truncate first so the shift cannot overflow int */
+static inline int hsm_get_cl_error(int flags)
+{
+	return CLF_GET_BITS((__u16)flags, CLF_HSM_ERR_H, CLF_HSM_ERR_L);
+}
+/* OR \a error into *flags at bit CLF_HSM_ERR_L; no range check is done */
+static inline void hsm_set_cl_error(int *flags, int error)
+{
+	*flags |= (error << CLF_HSM_ERR_L);
+}
+
+#define CR_MAXSIZE cfs_size_round(2*NAME_MAX + 1 + sizeof(struct changelog_rec))
+
+struct changelog_rec {
+	__u16		 cr_namelen;
+	__u16		 cr_flags; /**< (flags&CLF_FLAGMASK)|CLF_VERSION */
+	__u32		 cr_type;  /**< \a changelog_rec_type */
+	__u64		 cr_index; /**< changelog record number */
+	__u64		 cr_prev;  /**< last index for this target fid */
+	__u64		 cr_time;
+	union {
+		lustre_fid    cr_tfid;	/**< target fid */
+		__u32	 cr_markerflags; /**< CL_MARK flags */
+	};
+	lustre_fid	    cr_pfid;	/**< parent fid */
+	char		  cr_name[0];     /**< name bytes; must stay last */
+} __attribute__((packed));
+
+/* changelog_ext_rec is 2*sizeof(lu_fid) bigger than changelog_rec (it adds
+ * cr_sfid and cr_spfid); to save space, only rename uses changelog_ext_rec,
+ * while other operations store their records as changelog_rec.
+ */
+struct changelog_ext_rec {
+	__u16			cr_namelen; /**< see changelog_rec_snamelen() */
+	__u16			cr_flags; /**< (flags & CLF_FLAGMASK) |
+						CLF_EXT_VERSION */
+	__u32			cr_type;  /**< \a changelog_rec_type */
+	__u64			cr_index; /**< changelog record number */
+	__u64			cr_prev;  /**< last index for this target fid */
+	__u64			cr_time;
+	union {
+		lustre_fid	cr_tfid;	/**< target fid */
+		__u32		cr_markerflags; /**< CL_MARK flags */
+	};
+	lustre_fid		cr_pfid;	/**< target parent fid */
+	lustre_fid		cr_sfid;	/**< source fid, or zero */
+	lustre_fid		cr_spfid;       /**< source parent fid, or zero */
+	char			cr_name[0];     /**< name, NUL, sname; last */
+} __attribute__((packed));
+
+#define CHANGELOG_REC_EXTENDED(rec) \
+	(((rec)->cr_flags & CLF_VERMASK) == CLF_EXT_VERSION)
+/* Size of the fixed part of \a rec, which depends on its record version */
+static inline int changelog_rec_size(struct changelog_rec *rec)
+{
+	return CHANGELOG_REC_EXTENDED(rec) ?
+		sizeof(struct changelog_ext_rec) : sizeof(*rec);
+}
+/* Name area of \a rec; it follows the (version-dependent) fixed fields */
+static inline char *changelog_rec_name(struct changelog_rec *rec)
+{
+	return CHANGELOG_REC_EXTENDED(rec) ?
+		((struct changelog_ext_rec *)rec)->cr_name : rec->cr_name;
+}
+/* Length of the second name, stored after cr_name's first NUL terminator */
+static inline int changelog_rec_snamelen(struct changelog_ext_rec *rec)
+{
+	return rec->cr_namelen - (strlen(rec->cr_name) + 1);
+}
+/* Second name of an extended record: starts right after cr_name's NUL */
+static inline char *changelog_rec_sname(struct changelog_ext_rec *rec)
+{
+	return &rec->cr_name[strlen(rec->cr_name) + 1];
+}
+
+struct ioc_changelog {
+	__u64 icc_recno;
+	__u32 icc_mdtindex;
+	__u32 icc_id;
+	__u32 icc_flags;
+};
+
+enum changelog_message_type {
+	CL_RECORD = 10, /* message is a changelog_rec */
+	CL_EOF    = 11, /* at end of current changelog */
+};
+
+/********* Misc **********/
+
+struct ioc_data_version {
+	__u64 idv_version;
+	__u64 idv_flags;     /* See LL_DV_xxx */
+};
+#define LL_DV_NOFLUSH 0x01   /* Do not take READ EXTENT LOCK before sampling
+				version. Dirty caches are left unchanged. */
+
+#ifndef offsetof
+# define offsetof(typ,memb)     ((unsigned long)((char *)&(((typ *)0)->memb)))
+#endif
+
+#define dot_lustre_name ".lustre"
+
+
+/********* HSM **********/
+
+/** HSM per-file state
+ * See HSM_FLAGS_MASK below.
+ */
+enum hsm_states {
+	HS_EXISTS	= 0x00000001,
+	HS_DIRTY	= 0x00000002,
+	HS_RELEASED	= 0x00000004,
+	HS_ARCHIVED	= 0x00000008,
+	HS_NORELEASE	= 0x00000010,
+	HS_NOARCHIVE	= 0x00000020,
+	HS_LOST		= 0x00000040,
+};
+
+/* HSM user-settable flags. */
+#define HSM_USER_MASK   (HS_NORELEASE | HS_NOARCHIVE | HS_DIRTY)
+
+/* Other HSM flags. */
+#define HSM_STATUS_MASK (HS_EXISTS | HS_LOST | HS_RELEASED | HS_ARCHIVED)
+
+/*
+ * All HSM-related possible flags that could be applied to a file.
+ * This should be kept in sync with hsm_states.
+ */
+#define HSM_FLAGS_MASK  (HSM_USER_MASK | HSM_STATUS_MASK)
+
+/**
+ * HSM request progress state
+ */
+enum hsm_progress_states {
+	HPS_WAITING	= 1,
+	HPS_RUNNING	= 2,
+	HPS_DONE	= 3,
+};
+#define HPS_NONE	0
+
+/* Printable name of progress state \a s ("unknown" for any other value) */
+static inline char *hsm_progress_state2name(enum hsm_progress_states s)
+{
+	if (s == HPS_WAITING)
+		return "waiting";
+	if (s == HPS_RUNNING)
+		return "running";
+	return s == HPS_DONE ? "done" : "unknown";
+}
+
+struct hsm_extent {
+	__u64 offset;
+	__u64 length;
+} __attribute__((packed));
+
+/**
+ * Current HSM states of a Lustre file.
+ *
+ * The main purpose of this structure is to be sent to user space. It describes
+ * the current HSM flags and the in-progress action.
+ */
+struct hsm_user_state {
+	/** Current HSM states, from enum hsm_states. */
+	__u32			hus_states;
+	__u32			hus_archive_id;
+	/**  The current undergoing action, if there is one */
+	__u32			hus_in_progress_state;
+	__u32			hus_in_progress_action;
+	struct hsm_extent	hus_in_progress_location;
+	char			hus_extended_info[];
+};
+
+struct hsm_state_set_ioc {
+	struct lu_fid	hssi_fid;
+	__u64		hssi_setmask;
+	__u64		hssi_clearmask;
+};
+
+/*
+ * This structure describes the current in-progress action for a file.
+ * It is returned to user space and sent over the wire.
+ */
+struct hsm_current_action {
+	/**  The current undergoing action, if there is one */
+	/* state is one of hsm_progress_states */
+	__u32			hca_state;
+	/* action is one of hsm_user_action */
+	__u32			hca_action;
+	struct hsm_extent	hca_location;
+};
+
+/***** HSM user requests ******/
+/* User-generated (lfs/ioctl) request types */
+enum hsm_user_action {
+	HUA_NONE    =  1, /* no action (noop) */
+	HUA_ARCHIVE = 10, /* copy to hsm */
+	HUA_RESTORE = 11, /* prestage */
+	HUA_RELEASE = 12, /* drop ost objects */
+	HUA_REMOVE  = 13, /* remove from archive */
+	HUA_CANCEL  = 14  /* cancel a request */
+};
+/* Printable name of user action \a a ("UNKNOWN" for any other value) */
+static inline char *hsm_user_action2name(enum hsm_user_action a)
+{
+	switch (a) {
+	case HUA_NONE:		return "NOOP";
+	case HUA_ARCHIVE:	return "ARCHIVE";
+	case HUA_RESTORE:	return "RESTORE";
+	case HUA_RELEASE:	return "RELEASE";
+	case HUA_REMOVE:	return "REMOVE";
+	case HUA_CANCEL:	return "CANCEL";
+	}
+	return "UNKNOWN";
+}
+
+/*
+ * List of hr_flags (bit field)
+ */
+#define HSM_FORCE_ACTION 0x0001
+/* used by CT, cannot be set by user */
+#define HSM_GHOST_COPY   0x0002
+
+/**
+ * Contains all the fixed part of struct hsm_user_request.
+ *
+ */
+struct hsm_request {
+	__u32 hr_action;	/* enum hsm_user_action */
+	__u32 hr_archive_id;	/* archive id, used only with HUA_ARCHIVE */
+	__u64 hr_flags;		/* request flags */
+	__u32 hr_itemcount;	/* item count in hur_user_item vector */
+	__u32 hr_data_len;
+};
+
+struct hsm_user_item {
+	lustre_fid		hui_fid;
+	struct hsm_extent	hui_extent;
+} __attribute__((packed));
+
+struct hsm_user_request {
+	struct hsm_request	hur_request;
+	struct hsm_user_item	hur_user_item[0];
+	/* extra data blob at end of struct (after all
+	 * hur_user_items), only use helpers to access it
+	 */
+} __attribute__((packed));
+
+/** Return pointer to the data blob that follows the hr_itemcount items */
+static inline void *hur_data(struct hsm_user_request *hur)
+{
+	return hur->hur_user_item + hur->hur_request.hr_itemcount;
+}
+
+/** Length in bytes of \a hur: header, hr_itemcount items, hr_data_len data */
+static inline int hur_len(struct hsm_user_request *hur)
+{
+	return offsetof(struct hsm_user_request, hur_user_item) +
+		hur->hur_request.hr_itemcount * sizeof(struct hsm_user_item) +
+		hur->hur_request.hr_data_len;
+}
+
+/****** HSM RPCs to copytool *****/
+/* Message types the copytool may receive */
+enum hsm_message_type {
+	HMT_ACTION_LIST = 100, /* message is a hsm_action_list */
+};
+
+/* Actions the copytool may be instructed to take for a given action_item */
+enum hsm_copytool_action {
+	HSMA_NONE    = 10, /* no action */
+	HSMA_ARCHIVE = 20, /* arbitrary offset */
+	HSMA_RESTORE = 21,
+	HSMA_REMOVE  = 22,
+	HSMA_CANCEL  = 23
+};
+/* Printable name of copytool action \a a ("UNKNOWN" for any other value) */
+static inline char *hsm_copytool_action2name(enum hsm_copytool_action a)
+{
+	switch (a) {
+	case HSMA_NONE:		return "NOOP";
+	case HSMA_ARCHIVE:	return "ARCHIVE";
+	case HSMA_RESTORE:	return "RESTORE";
+	case HSMA_REMOVE:	return "REMOVE";
+	case HSMA_CANCEL:	return "CANCEL";
+	}
+	return "UNKNOWN";
+}
+
+/* Copytool item action description */
+struct hsm_action_item {
+	__u32      hai_len;     /* valid size, hai_data included */
+	__u32      hai_action;  /* hsm_copytool_action, but use known size */
+	lustre_fid hai_fid;     /* Lustre FID to operate on */
+	lustre_fid hai_dfid;    /* fid used for data access */
+	struct hsm_extent hai_extent;  /* byte range to operate on */
+	__u64      hai_cookie;  /* action cookie from coordinator */
+	__u64      hai_gid;     /* grouplock id */
+	char       hai_data[0]; /* variable length */
+} __attribute__((packed));
+
+/*
+ * Helper that prints, in hexadecimal, the leading bytes of the opaque
+ * hai_data field; only whole bytes that fit (plus the final NUL) are written.
+ * \param hai [IN] record to print
+ * \param buffer [OUT] output buffer
+ * \param len [IN] max buffer len
+ * \retval buffer
+ */
+static inline char *hai_dump_data_field(struct hsm_action_item *hai,
+					char *buffer, int len)
+{
+	int i, sz, data_len;
+	char *ptr;
+
+	ptr = buffer;
+	sz = len;
+	data_len = hai->hai_len - sizeof(*hai);
+	/* stop while "XX" + NUL still fits, so snprintf() never truncates */
+	for (i = 0; (i < data_len) && (sz > 2); i++) {
+		int cnt = snprintf(ptr, sz, "%.2X",
+				   (unsigned char)hai->hai_data[i]);
+
+		ptr += cnt;
+		sz -= cnt;
+	}
+	if (sz > 0)
+		*ptr = '\0';
+	return buffer;
+}
+
+/* Copytool action list */
+#define HAL_VERSION 1
+#define HAL_MAXSIZE LNET_MTU /* bytes, used in userspace only */
+struct hsm_action_list {
+	__u32 hal_version;
+	__u32 hal_count;       /* number of hai's to follow */
+	__u64 hal_compound_id; /* returned by coordinator */
+	__u64 hal_flags;
+	__u32 hal_archive_id; /* which archive backend */
+	__u32 padding1;
+	char  hal_fsname[0];   /* null-terminated */
+	/* struct hsm_action_item[hal_count] follows, aligned on 8-byte
+	   boundaries. See hai_zero */
+} __attribute__((packed));
+
+#ifndef HAVE_CFS_SIZE_ROUND
+static inline int cfs_size_round(int val)
+{
+	return (val + 7) & ~7;	/* round up to the next multiple of 8 */
+}
+#define HAVE_CFS_SIZE_ROUND
+#endif
+
+/* Return pointer to first hai in action list.  NOTE(review): hal_fsname's
+ * NUL is not counted before rounding up -- confirm the producer agrees. */
+static inline struct hsm_action_item *hai_zero(struct hsm_action_list *hal)
+{
+	return (struct hsm_action_item *)
+		(hal->hal_fsname + cfs_size_round(strlen(hal->hal_fsname)));
+}
+/* Return pointer to next hai: hai_len rounded up to 8 bytes past \a hai */
+static inline struct hsm_action_item *hai_next(struct hsm_action_item *hai)
+{
+	return (struct hsm_action_item *)
+		((char *)hai + cfs_size_round(hai->hai_len));
+}
+
+/* Return size of an hsm_action_list: the fixed header, the fsname padded
+ * as in hai_zero(), and every action item padded as in hai_next().
+ */
+static inline int hal_size(struct hsm_action_list *hal)
+{
+	struct hsm_action_item *item = hai_zero(hal);
+	int total = sizeof(*hal) + cfs_size_round(strlen(hal->hal_fsname));
+	int idx;
+
+	for (idx = 0; idx < hal->hal_count; idx++, item = hai_next(item))
+		total += cfs_size_round(item->hai_len);
+
+	return total;
+}
+
+/* Copytool progress reporting */
+#define HP_FLAG_COMPLETED 0x01
+#define HP_FLAG_RETRY     0x02
+
+struct hsm_progress {
+	lustre_fid		hp_fid;
+	__u64			hp_cookie;
+	struct hsm_extent	hp_extent;
+	__u16			hp_flags;
+	__u16			hp_errval; /* positive val */
+	__u32			padding;
+};
+
+/**
+ * Used by the copytool during any HSM request it handles.
+ * This structure is initialized by llapi_hsm_copy_start(),
+ * which is a helper over the ioctl() interface.
+ * Stores Lustre data for internal use only.
+ */
+struct hsm_copy {
+	__u64			hc_data_version;
+	__u16			hc_flags;
+	__u16			hc_errval; /* positive val */
+	__u32			padding;
+	struct hsm_action_item	hc_hai;
+};
+
+/** @} lustreuser */
+
+#endif /* _LUSTRE_USER_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustreapi.h b/drivers/staging/lustre/lustre/include/lustre/lustreapi.h
new file mode 100644
index 000000000000..63da66506639
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/lustreapi.h
@@ -0,0 +1,310 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Whamcloud, Inc.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTREAPI_H_
+#define _LUSTREAPI_H_
+
+/** \defgroup llapi llapi
+ *
+ * @{
+ */
+
+#include <lustre/lustre_user.h>
+
+typedef void (*llapi_cb_t)(char *obd_type_name, char *obd_name, char *obd_uuid, void *args);
+
+/* lustreapi message severity level */
+enum llapi_message_level {
+	LLAPI_MSG_OFF    = 0,
+	LLAPI_MSG_FATAL  = 1,
+	LLAPI_MSG_ERROR  = 2,
+	LLAPI_MSG_WARN   = 3,
+	LLAPI_MSG_NORMAL = 4,
+	LLAPI_MSG_INFO   = 5,
+	LLAPI_MSG_DEBUG  = 6,
+	LLAPI_MSG_MAX
+};
+
+/* the bottom three bits are reserved for llapi_message_level */
+#define LLAPI_MSG_MASK	  0x00000007
+#define LLAPI_MSG_NO_ERRNO      0x00000010 /* ORed in by llapi_err_noerrno() */
+
+extern void llapi_msg_set_level(int level);
+extern void llapi_error(int level, int rc, char *fmt, ...);
+#define llapi_err_noerrno(level, fmt, a...)			     \
+	llapi_error((level) | LLAPI_MSG_NO_ERRNO, 0, fmt, ## a)
+extern void llapi_printf(int level, char *fmt, ...);
+extern int llapi_file_create(const char *name, unsigned long long stripe_size,
+			     int stripe_offset, int stripe_count,
+			     int stripe_pattern);
+extern int llapi_file_open(const char *name, int flags, int mode,
+			   unsigned long long stripe_size, int stripe_offset,
+			   int stripe_count, int stripe_pattern);
+extern int llapi_file_create_pool(const char *name,
+				  unsigned long long stripe_size,
+				  int stripe_offset, int stripe_count,
+				  int stripe_pattern, char *pool_name);
+extern int llapi_file_open_pool(const char *name, int flags, int mode,
+				unsigned long long stripe_size,
+				int stripe_offset, int stripe_count,
+				int stripe_pattern, char *pool_name);
+extern int llapi_poollist(const char *name);
+extern int llapi_get_poollist(const char *name, char **poollist, int list_size,
+			      char *buffer, int buffer_size);
+extern int llapi_get_poolmembers(const char *poolname, char **members,
+				 int list_size, char *buffer, int buffer_size);
+extern int llapi_file_get_stripe(const char *path, struct lov_user_md *lum);
+#define HAVE_LLAPI_FILE_LOOKUP
+extern int llapi_file_lookup(int dirfd, const char *name);
+
+#define VERBOSE_COUNT      0x1
+#define VERBOSE_SIZE       0x2
+#define VERBOSE_OFFSET     0x4
+#define VERBOSE_POOL       0x8
+#define VERBOSE_DETAIL     0x10
+#define VERBOSE_OBJID      0x20
+#define VERBOSE_GENERATION 0x40
+#define VERBOSE_MDTINDEX   0x80
+#define VERBOSE_ALL	(VERBOSE_COUNT | VERBOSE_SIZE | VERBOSE_OFFSET | \
+			    VERBOSE_POOL | VERBOSE_OBJID | VERBOSE_GENERATION)
+
+struct find_param {
+	unsigned int maxdepth;
+	time_t  atime;
+	time_t  mtime;
+	time_t  ctime;
+	int     asign;  /* cannot be bitfields due to using pointers to */
+	int     csign;  /* access them during argument parsing. */
+	int     msign;
+	int     type;
+	int	     size_sign:2,	/* these need to be signed values */
+			stripesize_sign:2,
+			stripecount_sign:2;
+	unsigned long long size;
+	unsigned long long size_units;
+	uid_t uid;
+	gid_t gid;
+
+	unsigned long   zeroend:1,
+			recursive:1,
+			exclude_pattern:1,
+			exclude_type:1,
+			exclude_obd:1,
+			exclude_mdt:1,
+			exclude_gid:1,
+			exclude_uid:1,
+			check_gid:1,	    /* group ID */
+			check_uid:1,	    /* user ID */
+			check_pool:1,	   /* LOV pool name */
+			check_size:1,	   /* file size */
+			exclude_pool:1,
+			exclude_size:1,
+			exclude_atime:1,
+			exclude_mtime:1,
+			exclude_ctime:1,
+			get_lmv:1,	      /* get MDT list from LMV */
+			raw:1,		  /* do not fill in defaults */
+			check_stripesize:1,     /* LOV stripe size */
+			exclude_stripesize:1,
+			check_stripecount:1,    /* LOV stripe count */
+			exclude_stripecount:1;
+
+	int     verbose;
+	int     quiet;
+
+	/* regular expression */
+	char   *pattern;
+
+	char   *print_fmt;
+
+	struct  obd_uuid       *obduuid;
+	int		     num_obds;
+	int		     num_alloc_obds;
+	int		     obdindex;
+	int		    *obdindexes;
+
+	struct  obd_uuid       *mdtuuid;
+	int		     num_mdts;
+	int		     num_alloc_mdts;
+	int		     mdtindex;
+	int		    *mdtindexes;
+	int		     file_mdtindex;
+
+	int	lumlen;
+	struct  lov_user_mds_data *lmd;
+
+	char poolname[LOV_MAXPOOLNAME + 1];
+
+	int			fp_lmv_count;
+	struct lmv_user_md	*fp_lmv_md;
+
+	unsigned long long stripesize;
+	unsigned long long stripesize_units;
+	unsigned long long stripecount;
+
+	/* In-process parameters. */
+	unsigned long   got_uuids:1,
+			obds_printed:1,
+			have_fileinfo:1;	/* file attrs and LOV xattr */
+	unsigned int    depth;
+	dev_t	   st_dev;
+};
+
+extern int llapi_ostlist(char *path, struct find_param *param);
+extern int llapi_uuid_match(char *real_uuid, char *search_uuid);
+extern int llapi_getstripe(char *path, struct find_param *param);
+extern int llapi_find(char *path, struct find_param *param);
+
+extern int llapi_file_fget_mdtidx(int fd, int *mdtidx);
+extern int llapi_dir_create_pool(const char *name, int flags, int stripe_offset,
+				 int stripe_count, int stripe_pattern,
+				 char *poolname);
+int llapi_direntry_remove(char *dname);
+extern int llapi_obd_statfs(char *path, __u32 type, __u32 index,
+		     struct obd_statfs *stat_buf,
+		     struct obd_uuid *uuid_buf);
+extern int llapi_ping(char *obd_type, char *obd_name);
+extern int llapi_target_check(int num_types, char **obd_types, char *dir);
+extern int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid);
+extern int llapi_file_get_lmv_uuid(const char *path, struct obd_uuid *lmv_uuid);
+extern int llapi_file_fget_lov_uuid(int fd, struct obd_uuid *lov_uuid);
+extern int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count);
+extern int llapi_lmv_get_uuids(int fd, struct obd_uuid *uuidp, int *mdt_count);
+extern int llapi_is_lustre_mnttype(const char *type);
+extern int llapi_search_ost(char *fsname, char *poolname, char *ostname);
+extern int llapi_get_obd_count(char *mnt, int *count, int is_mdt);
+extern int parse_size(char *optarg, unsigned long long *size,
+		      unsigned long long *size_units, int bytes_spec);
+extern int llapi_search_mounts(const char *pathname, int index,
+			       char *mntdir, char *fsname);
+extern int llapi_search_fsname(const char *pathname, char *fsname);
+extern int llapi_getname(const char *path, char *buf, size_t size);
+
+extern void llapi_ping_target(char *obd_type, char *obd_name,
+			      char *obd_uuid, void *args);
+
+extern int llapi_search_rootpath(char *pathname, const char *fsname);
+
+struct mntent;
+#define HAVE_LLAPI_IS_LUSTRE_MNT
+extern int llapi_is_lustre_mnt(struct mntent *mnt);
+extern int llapi_quotachown(char *path, int flag);
+extern int llapi_quotacheck(char *mnt, int check_type);
+extern int llapi_poll_quotacheck(char *mnt, struct if_quotacheck *qchk);
+extern int llapi_quotactl(char *mnt, struct if_quotactl *qctl);
+extern int llapi_target_iterate(int type_num, char **obd_type, void *args,
+				llapi_cb_t cb);
+extern int llapi_get_connect_flags(const char *mnt, __u64 *flags);
+extern int llapi_lsetfacl(int argc, char *argv[]);
+extern int llapi_lgetfacl(int argc, char *argv[]);
+extern int llapi_rsetfacl(int argc, char *argv[]);
+extern int llapi_rgetfacl(int argc, char *argv[]);
+extern int llapi_cp(int argc, char *argv[]);
+extern int llapi_ls(int argc, char *argv[]);
+extern int llapi_fid2path(const char *device, const char *fidstr, char *path,
+			  int pathlen, long long *recno, int *linkno);
+extern int llapi_path2fid(const char *path, lustre_fid *fid);
+extern int llapi_fd2fid(const int fd, lustre_fid *fid);
+
+extern int llapi_get_version(char *buffer, int buffer_size, char **version);
+extern int llapi_get_data_version(int fd, __u64 *data_version, __u64 flags);
+extern int llapi_hsm_state_get(const char *path, struct hsm_user_state *hus);
+extern int llapi_hsm_state_set(const char *path, __u64 setmask, __u64 clearmask,
+			       __u32 archive_id);
+
+extern int llapi_create_volatile_idx(char *directory, int idx, int mode);
+/* Shorthand for llapi_create_volatile_idx() with \a idx set to -1 */
+static inline int llapi_create_volatile(char *directory, int mode)
+{
+	return llapi_create_volatile_idx(directory, -1, mode);
+}
+
+extern int llapi_fswap_layouts(const int fd1, const int fd2,
+			       __u64 dv1, __u64 dv2, __u64 flags);
+extern int llapi_swap_layouts(const char *path1, const char *path2,
+			      __u64 dv1, __u64 dv2, __u64 flags);
+
+/* Changelog interface.  priv is private state, managed internally
+   by these functions */
+#define CHANGELOG_FLAG_FOLLOW 0x01   /* Not yet implemented */
+#define CHANGELOG_FLAG_BLOCK  0x02   /* Blocking IO makes sense in case of
+   slow user parsing of the records, but it also prevents us from cleaning
+   up if the records are not consumed. */
+
+/* Records received are in extended format now: most of them are still written
+ * to disk in changelog_rec format (to save space and time), but the lustre api
+ * converts them to extended format to ease changelog analysis. */
+#define HAVE_CHANGELOG_EXTEND_REC 1
+
+extern int llapi_changelog_start(void **priv, int flags, const char *mdtname,
+				 long long startrec);
+extern int llapi_changelog_fini(void **priv);
+extern int llapi_changelog_recv(void *priv, struct changelog_ext_rec **rech);
+extern int llapi_changelog_free(struct changelog_ext_rec **rech);
+/* Allow records up to endrec to be destroyed; requires registered id. */
+extern int llapi_changelog_clear(const char *mdtname, const char *idstr,
+				 long long endrec);
+
+/* HSM copytool interface.
+ * priv is private state, managed internally by these functions
+ */
+struct hsm_copytool_private;
+extern int llapi_hsm_copytool_start(struct hsm_copytool_private **priv,
+				    char *fsname, int flags,
+				    int archive_count, int *archives);
+extern int llapi_hsm_copytool_fini(struct hsm_copytool_private **priv);
+extern int llapi_hsm_copytool_recv(struct hsm_copytool_private *priv,
+				   struct hsm_action_list **hal, int *msgsize);
+extern int llapi_hsm_copytool_free(struct hsm_action_list **hal);
+extern int llapi_hsm_copy_start(char *mnt, struct hsm_copy *copy,
+				const struct hsm_action_item *hai);
+extern int llapi_hsm_copy_end(char *mnt, struct hsm_copy *copy,
+			      const struct hsm_progress *hp);
+extern int llapi_hsm_progress(char *mnt, struct hsm_progress *hp);
+extern int llapi_hsm_import(const char *dst, int archive, struct stat *st,
+			    unsigned long long stripe_size, int stripe_offset,
+			    int stripe_count, int stripe_pattern,
+			    char *pool_name, lustre_fid *newfid);
+
+/* HSM user interface */
+extern struct hsm_user_request *llapi_hsm_user_request_alloc(int itemcount,
+							     int data_len);
+extern int llapi_hsm_request(char *mnt, struct hsm_user_request *request);
+extern int llapi_hsm_current_action(const char *path,
+				    struct hsm_current_action *hca);
+/** @} llapi */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_acl.h b/drivers/staging/lustre/lustre/include/lustre_acl.h
new file mode 100644
index 000000000000..5cfb87b180c3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_acl.h
@@ -0,0 +1,42 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_acl.h
+ */
+
+#ifndef _LUSTRE_ACL_H
+#define _LUSTRE_ACL_H
+
+#include <linux/lustre_acl.h>
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_capa.h b/drivers/staging/lustre/lustre/include/lustre_capa.h
new file mode 100644
index 000000000000..d77bffc0b59d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_capa.h
@@ -0,0 +1,305 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_capa.h
+ *
+ * Author: Lai Siyao <lsy@clusterfs.com>
+ */
+
+#ifndef __LINUX_CAPA_H_
+#define __LINUX_CAPA_H_
+
+/** \defgroup capa capa
+ *
+ * @{
+ */
+
+/*
+ * capability
+ */
+#include <linux/crypto.h>
+#include <lustre/lustre_idl.h>
+
+#define CAPA_TIMEOUT 1800		/* sec, == 30 min */
+#define CAPA_KEY_TIMEOUT (24 * 60 * 60)  /* sec, == 1 day */
+
+struct capa_hmac_alg {
+	const char	*ha_name;
+	int		ha_len;
+	int		ha_keylen;
+};
+
+#define DEF_CAPA_HMAC_ALG(name, type, len, keylen)	\
+[CAPA_HMAC_ALG_ ## type] = {				\
+	.ha_name	= name,				\
+	.ha_len		= len,				\
+	.ha_keylen	= keylen,			\
+}
+
+struct client_capa {
+	struct inode	     *inode;
+	struct list_head		lli_list;     /* link to lli_oss_capas */
+};
+
+struct target_capa {
+	struct hlist_node	  c_hash;       /* link to capa hash */
+};
+
+struct obd_capa {
+	struct list_head		c_list;       /* link to capa_list */
+
+	struct lustre_capa	c_capa;       /* capa */
+	atomic_t	      c_refc;       /* ref count */
+	cfs_time_t		c_expiry;     /* jiffies */
+	spinlock_t		c_lock;	/* protect capa content */
+	int			c_site;	/* CAPA_SITE_*, selects member of u */
+
+	union {
+		struct client_capa	cli;
+		struct target_capa	tgt;
+	} u;
+};
+
+enum {
+	CAPA_SITE_CLIENT = 0,
+	CAPA_SITE_SERVER,
+	CAPA_SITE_MAX
+};
+
+static inline struct lu_fid *capa_fid(struct lustre_capa *capa)
+{
+	return &capa->lc_fid;
+}
+
+static inline __u64 capa_opc(struct lustre_capa *capa)
+{
+	return capa->lc_opc;
+}
+
+static inline __u64 capa_uid(struct lustre_capa *capa)
+{
+	return capa->lc_uid;
+}
+
+static inline __u64 capa_gid(struct lustre_capa *capa)
+{
+	return capa->lc_gid;
+}
+/* Flag bits of the capability: the low 24 bits of lc_flags */
+static inline __u32 capa_flags(struct lustre_capa *capa)
+{
+	return capa->lc_flags & 0xffffff;
+}
+/* Algorithm id (presumably a CAPA_HMAC_ALG_* value): lc_flags above bit 23 */
+static inline __u32 capa_alg(struct lustre_capa *capa)
+{
+	return (capa->lc_flags >> 24);
+}
+
+static inline __u32 capa_keyid(struct lustre_capa *capa)
+{
+	return capa->lc_keyid;
+}
+
+static inline __u64 capa_key_seq(struct lustre_capa_key *key)
+{
+	return key->lk_seq;
+}
+
+static inline __u32 capa_key_keyid(struct lustre_capa_key *key)
+{
+	return key->lk_keyid;
+}
+
+static inline __u32 capa_timeout(struct lustre_capa *capa)
+{
+	return capa->lc_timeout;
+}
+
+static inline __u32 capa_expiry(struct lustre_capa *capa)
+{
+	return capa->lc_expiry;
+}
+
+void _debug_capa(struct lustre_capa *, struct libcfs_debug_msg_data *,
+		 const char *fmt, ... );
+#define DEBUG_CAPA(level, capa, fmt, args...)				  \
+do {									   \
+	if (((level) & D_CANTMASK) != 0 ||				     \
+	    ((libcfs_debug & (level)) != 0 &&				  \
+	     (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) {	       \
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL);	      \
+		_debug_capa((capa), &msgdata, fmt, ##args);		    \
+	}								      \
+} while (0)
+
+#define DEBUG_CAPA_KEY(level, k, fmt, args...)				 \
+do {									   \
+CDEBUG(level, fmt " capability key@%p seq "LPU64" keyid %u\n",		 \
+       ##args, k, capa_key_seq(k), capa_key_keyid(k));			 \
+} while (0)
+
+typedef int (* renew_capa_cb_t)(struct obd_capa *, struct lustre_capa *);
+
+/* obdclass/capa.c */
+extern struct list_head capa_list[];
+extern spinlock_t capa_lock;
+extern int capa_count[];
+extern struct kmem_cache *capa_cachep;
+
+struct hlist_head *init_capa_hash(void);
+void cleanup_capa_hash(struct hlist_head *hash);
+
+struct obd_capa *capa_add(struct hlist_head *hash,
+			  struct lustre_capa *capa);
+struct obd_capa *capa_lookup(struct hlist_head *hash,
+			     struct lustre_capa *capa, int alive);
+
+int capa_hmac(__u8 *hmac, struct lustre_capa *capa, __u8 *key);
+int capa_encrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen);
+int capa_decrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen);
+void capa_cpy(void *dst, struct obd_capa *ocapa);
+static inline struct obd_capa *alloc_capa(int site)
+{
+	struct obd_capa *ocapa;
+
+	if (unlikely(site != CAPA_SITE_CLIENT && site != CAPA_SITE_SERVER))
+		return ERR_PTR(-EINVAL);
+
+	OBD_SLAB_ALLOC_PTR(ocapa, capa_cachep);
+	if (unlikely(!ocapa))
+		return ERR_PTR(-ENOMEM);	/* errors are never NULL */
+
+	INIT_LIST_HEAD(&ocapa->c_list);
+	atomic_set(&ocapa->c_refc, 1);	/* initial ref, dropped by capa_put() */
+	spin_lock_init(&ocapa->c_lock);
+	ocapa->c_site = site;		/* tells which member of u is in use */
+	if (ocapa->c_site == CAPA_SITE_CLIENT)
+		INIT_LIST_HEAD(&ocapa->u.cli.lli_list);
+	else
+		INIT_HLIST_NODE(&ocapa->u.tgt.c_hash);
+
+	return ocapa;
+}
+
+/* Take an extra reference on \a ocapa (NULL is passed through untouched);
+ * returns \a ocapa so that the call can be chained */
+static inline struct obd_capa *capa_get(struct obd_capa *ocapa)
+{
+	if (ocapa)
+		atomic_inc(&ocapa->c_refc);
+	return ocapa;
+}
+
+static inline void capa_put(struct obd_capa *ocapa)
+{
+	if (!ocapa)
+		return;
+
+	if (atomic_read(&ocapa->c_refc) == 0) {
+		DEBUG_CAPA(D_ERROR, &ocapa->c_capa, "refc is 0 for");
+		LBUG();
+	}
+
+	if (!atomic_dec_and_test(&ocapa->c_refc))
+		return;	/* other references remain */
+
+	LASSERT(list_empty(&ocapa->c_list));
+	if (ocapa->c_site == CAPA_SITE_CLIENT) {
+		LASSERT(list_empty(&ocapa->u.cli.lli_list));
+	} else {
+		struct hlist_node *hnode = &ocapa->u.tgt.c_hash;
+
+		LASSERT(!hnode->next && !hnode->pprev);
+	}
+	OBD_SLAB_FREE(ocapa, capa_cachep, sizeof(*ocapa));
+}
+
+static inline int open_flags_to_accmode(int flags)
+{
+	/* +1 turns access modes 0/1/2 into bits 1/2/3; mode 3 is left alone */
+	int accmode = ((flags + 1) & O_ACCMODE) ? flags + 1 : flags;
+
+	/* O_TRUNC also asks for write access: 2 is presumably FMODE_WRITE */
+	if (accmode & O_TRUNC)
+		accmode |= 2;
+
+	return accmode;
+}
+/* OSS opcode for an access mode: WRITE if FMODE_WRITE is set, else READ */
+static inline __u64 capa_open_opc(int mode)
+{
+	return (mode & FMODE_WRITE) ? CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_READ;
+}
+/* Derive the c_expiry deadline from the time left until lc_expiry */
+static inline void set_capa_expiry(struct obd_capa *ocapa)
+{
+	cfs_time_t left = cfs_time_sub((cfs_time_t)ocapa->c_capa.lc_expiry,
+				       cfs_time_current_sec());
+	ocapa->c_expiry = cfs_time_add(cfs_time_current(),
+				       cfs_time_seconds(left));
+}
+/* NOTE(review): "<= 0" needs a signed difference; confirm the operand types */
+static inline int capa_is_expired_sec(struct lustre_capa *capa)
+{
+	return (capa->lc_expiry - cfs_time_current_sec() <= 0);
+}
+/* Compares the c_expiry deadline with now via cfs_time_beforeq() */
+static inline int capa_is_expired(struct obd_capa *ocapa)
+{
+	return cfs_time_beforeq(ocapa->c_expiry, cfs_time_current());
+}
+/* True if every bit of \a opc is also set in the capa's lc_opc */
+static inline int capa_opc_supported(struct lustre_capa *capa, __u64 opc)
+{
+	return (capa_opc(capa) & opc) == opc;
+}
+
+struct filter_capa_key {
+	struct list_head	      k_list;
+	struct lustre_capa_key  k_key;
+};
+
+enum {
+	LC_ID_NONE      = 0,
+	LC_ID_PLAIN     = 1,
+	LC_ID_CONVERT   = 2
+};
+/* ERR_PTR(-ENOENT) typed as a capa pointer; parenthesized for safe expansion */
+#define BYPASS_CAPA ((struct lustre_capa *)ERR_PTR(-ENOENT))
+
+/** @} capa */
+
+#endif /* __LINUX_CAPA_H_ */
diff --git a/drivers/staging/lustre/lustre/include/lustre_cfg.h b/drivers/staging/lustre/lustre/include/lustre_cfg.h
new file mode 100644
index 000000000000..f12429f38215
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_cfg.h
@@ -0,0 +1,299 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_CFG_H
+#define _LUSTRE_CFG_H
+
+/** \defgroup cfg cfg
+ *
+ * @{
+ */
+
+/*
+ * 1cf6
+ * lcfG
+ */
+#define LUSTRE_CFG_VERSION 0x1cf60001
+#define LUSTRE_CFG_MAX_BUFCOUNT 8
+
+#define LCFG_HDR_SIZE(count) \
+    cfs_size_round(offsetof (struct lustre_cfg, lcfg_buflens[(count)]))
+
+/** If the LCFG_REQUIRED bit is set in a configuration command,
+ * then the client is required to understand this parameter
+ * in order to mount the filesystem. If it does not understand
+ * a REQUIRED command the client mount will fail. */
+#define LCFG_REQUIRED	 0x0001000
+
+enum lcfg_command_type {
+	LCFG_ATTACH	     = 0x00cf001, /**< create a new obd instance */
+	LCFG_DETACH	     = 0x00cf002, /**< destroy obd instance */
+	LCFG_SETUP	      = 0x00cf003, /**< call type-specific setup */
+	LCFG_CLEANUP	    = 0x00cf004, /**< call type-specific cleanup */
+	LCFG_ADD_UUID	   = 0x00cf005, /**< add a nid to a niduuid */
+	LCFG_DEL_UUID	   = 0x00cf006, /**< remove a nid from a niduuid */
+	LCFG_MOUNTOPT	   = 0x00cf007, /**< create a profile (mdc, osc) */
+	LCFG_DEL_MOUNTOPT       = 0x00cf008, /**< destroy a profile */
+	LCFG_SET_TIMEOUT	= 0x00cf009, /**< set obd_timeout */
+	LCFG_SET_UPCALL	 = 0x00cf00a, /**< deprecated */
+	LCFG_ADD_CONN	   = 0x00cf00b, /**< add a failover niduuid to an obd */
+	LCFG_DEL_CONN	   = 0x00cf00c, /**< remove a failover niduuid */
+	LCFG_LOV_ADD_OBD	= 0x00cf00d, /**< add an osc to a lov */
+	LCFG_LOV_DEL_OBD	= 0x00cf00e, /**< remove an osc from a lov */
+	LCFG_PARAM	      = 0x00cf00f, /**< set a proc parameter */
+	LCFG_MARKER	     = 0x00cf010, /**< metadata about next cfg rec */
+	LCFG_LOG_START	  = 0x00ce011, /**< mgc only, process a cfg log */
+	LCFG_LOG_END	    = 0x00ce012, /**< stop processing updates */
+	LCFG_LOV_ADD_INA	= 0x00ce013, /**< like LOV_ADD_OBD, inactive */
+	LCFG_ADD_MDC	    = 0x00cf014, /**< add an mdc to a lmv */
+	LCFG_DEL_MDC	    = 0x00cf015, /**< remove an mdc from a lmv */
+	LCFG_SPTLRPC_CONF       = 0x00ce016, /**< security */
+	LCFG_POOL_NEW	   = 0x00ce020, /**< create an ost pool name */
+	LCFG_POOL_ADD	   = 0x00ce021, /**< add an ost to a pool */
+	LCFG_POOL_REM	   = 0x00ce022, /**< remove an ost from a pool */
+	LCFG_POOL_DEL	   = 0x00ce023, /**< destroy an ost pool name */
+	LCFG_SET_LDLM_TIMEOUT   = 0x00ce030, /**< set ldlm_timeout */
+	LCFG_PRE_CLEANUP	= 0x00cf031, /**< call type-specific
+					      * pre-cleanup */
+};
+
+struct lustre_cfg_bufs {
+	void    *lcfg_buf[LUSTRE_CFG_MAX_BUFCOUNT];
+	__u32    lcfg_buflen[LUSTRE_CFG_MAX_BUFCOUNT];
+	__u32    lcfg_bufcount;
+};
+
+struct lustre_cfg {
+	__u32 lcfg_version;	/* LUSTRE_CFG_VERSION, set by lustre_cfg_new() */
+	__u32 lcfg_command;	/* command passed to lustre_cfg_new() */
+
+	__u32 lcfg_num;
+	__u32 lcfg_flags;
+	__u64 lcfg_nid;
+	__u32 lcfg_nal;		/* not used any more */
+
+	__u32 lcfg_bufcount;	/* number of entries in lcfg_buflens[] */
+	__u32 lcfg_buflens[0];	/* buffer lengths; rounded buffers follow */
+};
+
+enum cfg_record_type {
+	PORTALS_CFG_TYPE = 1,
+	LUSTRE_CFG_TYPE = 123,
+};
+
+#define LUSTRE_CFG_BUFLEN(lcfg, idx)	    \
+	((lcfg)->lcfg_bufcount <= (idx)	 \
+	 ? 0				    \
+	 : (lcfg)->lcfg_buflens[(idx)])
+
+/* Store @buf/@buflen in slot @index, growing lcfg_bufcount to cover it. */
+static inline void lustre_cfg_bufs_set(struct lustre_cfg_bufs *bufs,
+				       __u32		   index,
+				       void		   *buf,
+				       __u32		   buflen)
+{
+	/* silently ignore a missing descriptor or an out-of-range slot */
+	if (bufs == NULL || index >= LUSTRE_CFG_MAX_BUFCOUNT)
+		return;
+
+	bufs->lcfg_buf[index]    = buf;
+	bufs->lcfg_buflen[index] = buflen;
+
+	if (index + 1 > bufs->lcfg_bufcount)
+		bufs->lcfg_bufcount = index + 1;
+}
+
+static inline void lustre_cfg_bufs_set_string(struct lustre_cfg_bufs *bufs,
+					      __u32 index,
+					      char *str)
+{	/* stored length counts the trailing NUL; a NULL @str stores length 0 */
+	lustre_cfg_bufs_set(bufs, index, str, str ? strlen(str) + 1 : 0);
+}
+
+static inline void lustre_cfg_bufs_reset(struct lustre_cfg_bufs *bufs, char *name)
+{	/* start from an all-zero descriptor; @name, if given, becomes buffer 0 */
+	memset(bufs, 0, sizeof(*bufs));
+	if (name != NULL)
+		lustre_cfg_bufs_set_string(bufs, 0, name);
+}
+
+/* Return the start of buffer @index inside @lcfg, or NULL if out of range. */
+static inline void *lustre_cfg_buf(struct lustre_cfg *lcfg, int index)
+{
+	int slot, skip, bufcount;
+
+	LASSERT(lcfg != NULL);
+	LASSERT(index >= 0);
+	bufcount = lcfg->lcfg_bufcount;
+	if (index >= bufcount)
+		return NULL;
+
+	/* buffers follow the rounded header, each padded by cfs_size_round() */
+	skip = LCFG_HDR_SIZE(lcfg->lcfg_bufcount);
+	for (slot = 0; slot < index; slot++)
+		skip += cfs_size_round(lcfg->lcfg_buflens[slot]);
+	return (char *)lcfg + skip;
+}
+
+static inline void lustre_cfg_bufs_init(struct lustre_cfg_bufs *bufs,
+					struct lustre_cfg *lcfg)
+{	/* NOTE(review): bufcount is not bounded by LUSTRE_CFG_MAX_BUFCOUNT here */
+	int i;
+	bufs->lcfg_bufcount = lcfg->lcfg_bufcount;
+	for (i = 0; i < bufs->lcfg_bufcount; i++) {
+		bufs->lcfg_buflen[i] = lcfg->lcfg_buflens[i];
+		bufs->lcfg_buf[i] = lustre_cfg_buf(lcfg, i); /* points into @lcfg */
+	}
+}
+
+/* Return buffer @index of @lcfg as a NUL-terminated string, or NULL. */
+static inline char *lustre_cfg_string(struct lustre_cfg *lcfg, int index)
+{
+	char *s;
+
+	/* lustre_cfg_buf() range-checks @index; call it before reading buflens */
+	s = lustre_cfg_buf(lcfg, index);
+	if (s == NULL || lcfg->lcfg_buflens[index] == 0)
+		return NULL;
+
+	/*
+	 * make sure it's NUL terminated, even if this kills a char
+	 * of data.  Try to use the padding first though.
+	 */
+	if (s[lcfg->lcfg_buflens[index] - 1] != '\0') {
+		int last = min((int)lcfg->lcfg_buflens[index],
+			       cfs_size_round(lcfg->lcfg_buflens[index]) - 1);
+		char lost = s[last];
+
+		s[last] = '\0';
+		if (lost != '\0') {
+			CWARN("Truncated buf %d to '%s' (lost '%c'...)\n",
+			      index, s, lost);
+		}
+	}
+	return s;
+}
+
+/* Rounded total size of a lustre_cfg holding @bufcount buffers of @buflens. */
+static inline int lustre_cfg_len(__u32 bufcount, __u32 *buflens)
+{
+	int idx;
+	int total;
+	ENTRY;
+
+	total = LCFG_HDR_SIZE(bufcount);
+	for (idx = 0; idx < bufcount; idx++)
+		total += cfs_size_round(buflens[idx]);
+	RETURN(cfs_size_round(total));
+}
+
+
+#include <obd_support.h>
+
+/* Allocate a lustre_cfg for @cmd and pack the buffers described by @bufs. */
+static inline struct lustre_cfg *lustre_cfg_new(int cmd,
+						struct lustre_cfg_bufs *bufs)
+{
+	struct lustre_cfg *lcfg;
+	char *dst;
+	int i;
+
+	ENTRY;
+	OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount,
+				       bufs->lcfg_buflen));
+	if (lcfg == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	lcfg->lcfg_version = LUSTRE_CFG_VERSION;
+	lcfg->lcfg_command = cmd;
+	lcfg->lcfg_bufcount = bufs->lcfg_bufcount;
+	/* the buffer payload starts right after the rounded header */
+	dst = (char *)lcfg + LCFG_HDR_SIZE(lcfg->lcfg_bufcount);
+	for (i = 0; i < lcfg->lcfg_bufcount; i++) {
+		lcfg->lcfg_buflens[i] = bufs->lcfg_buflen[i];
+		LOGL((char *)bufs->lcfg_buf[i], bufs->lcfg_buflen[i], dst);
+	}
+	RETURN(lcfg);
+}
+
+/* Free @lcfg; its size is recomputed from its own buffer-length table. */
+static inline void lustre_cfg_free(struct lustre_cfg *lcfg)
+{
+	/* the length has to be read out of @lcfg before it is released */
+	int len = lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens);
+
+	OBD_FREE(lcfg, len);
+	EXIT;
+	return;
+}
+
+static inline int lustre_cfg_sanity_check(void *buf, int len)
+{
+	struct lustre_cfg *lcfg = (struct lustre_cfg *)buf;
+	ENTRY;
+	if (!lcfg)
+		RETURN(-EINVAL);
+
+	/* check that the first bits of the struct are valid */
+	if (len < LCFG_HDR_SIZE(0))
+		RETURN(-EINVAL);
+
+	if (lcfg->lcfg_version != LUSTRE_CFG_VERSION)
+		RETURN(-EINVAL);
+	/* NOTE(review): ">=" also rejects exactly LUSTRE_CFG_MAX_BUFCOUNT - confirm */
+	if (lcfg->lcfg_bufcount >= LUSTRE_CFG_MAX_BUFCOUNT)
+		RETURN(-EINVAL);
+
+	/* check that the buflens are valid */
+	if (len < LCFG_HDR_SIZE(lcfg->lcfg_bufcount))
+		RETURN(-EINVAL);
+
+	/* make sure all the pointers point inside the data */
+	if (len < lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens))
+		RETURN(-EINVAL);
+
+	RETURN(0);
+}
+
+#include <lustre/lustre_user.h>
+
+#ifndef INVALID_UID
+#define INVALID_UID     (-1)
+#endif
+
+/** @} cfg */
+
+#endif // _LUSTRE_CFG_H
diff --git a/drivers/staging/lustre/lustre/include/lustre_debug.h b/drivers/staging/lustre/lustre/include/lustre_debug.h
new file mode 100644
index 000000000000..3d9e4462af43
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_debug.h
@@ -0,0 +1,76 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_DEBUG_H
+#define _LUSTRE_DEBUG_H
+
+/** \defgroup debug debug
+ *
+ * @{
+ */
+
+#include <lustre_net.h>
+#include <obd.h>
+
+#include <linux/lustre_debug.h>
+
+#define ASSERT_MAX_SIZE_MB 60000ULL
+#define ASSERT_PAGE_INDEX(index, OP)				    \
+do { if ((index) > ASSERT_MAX_SIZE_MB << (20 - PAGE_CACHE_SHIFT)) {	 \
+	CERROR("bad page index %lu > %llu\n", (index),		    \
+	       ASSERT_MAX_SIZE_MB << (20 - PAGE_CACHE_SHIFT));	    \
+	libcfs_debug = ~0UL;	/* set every libcfs_debug bit */    \
+	OP;							     \
+}} while (0)	/* @index is evaluated more than once */
+
+#define ASSERT_FILE_OFFSET(offset, OP)				  \
+do { if ((offset) > ASSERT_MAX_SIZE_MB << 20) {			   \
+	CERROR("bad file offset %llu > %llu\n", (offset),		 \
+	       ASSERT_MAX_SIZE_MB << 20);			       \
+	libcfs_debug = ~0UL;	/* set every libcfs_debug bit */    \
+	OP;							     \
+}} while (0)	/* @offset is evaluated more than once */
+
+/* lib/debug.c */
+void dump_lniobuf(struct niobuf_local *lnb);
+int dump_req(struct ptlrpc_request *req);
+void dump_lsm(int level, struct lov_stripe_md *lsm);
+int block_debug_setup(void *addr, int len, __u64 off, __u64 id);
+int block_debug_check(char *who, void *addr, int len, __u64 off, __u64 id);
+
+/** @} debug */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_disk.h b/drivers/staging/lustre/lustre/include/lustre_disk.h
new file mode 100644
index 000000000000..c2504c5546aa
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_disk.h
@@ -0,0 +1,553 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_disk.h
+ *
+ * Lustre disk format definitions.
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+#ifndef _LUSTRE_DISK_H
+#define _LUSTRE_DISK_H
+
+/** \defgroup disk disk
+ *
+ * @{
+ */
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/types.h>
+
+/****************** on-disk files *********************/
+
+#define MDT_LOGS_DIR      "LOGS"  /* COMPAT_146 */
+#define MOUNT_CONFIGS_DIR "CONFIGS"
+#define CONFIGS_FILE      "mountdata"
+/** Persistent mount data are stored on the disk in this file. */
+#define MOUNT_DATA_FILE    MOUNT_CONFIGS_DIR"/"CONFIGS_FILE
+#define LAST_RCVD	 "last_rcvd"
+#define LOV_OBJID	 "lov_objid"
+#define LOV_OBJSEQ		"lov_objseq"
+#define HEALTH_CHECK      "health_check"
+#define CAPA_KEYS	 "capa_keys"
+#define CHANGELOG_USERS   "changelog_users"
+#define MGS_NIDTBL_DIR    "NIDTBL_VERSIONS"
+#define QMT_DIR	   "quota_master"
+#define QSD_DIR	   "quota_slave"
+
+/****************** persistent mount data *********************/
+
+#define LDD_F_SV_TYPE_MDT   0x0001
+#define LDD_F_SV_TYPE_OST   0x0002
+#define LDD_F_SV_TYPE_MGS   0x0004
+#define LDD_F_SV_TYPE_MASK (LDD_F_SV_TYPE_MDT  | \
+			    LDD_F_SV_TYPE_OST  | \
+			    LDD_F_SV_TYPE_MGS)
+#define LDD_F_SV_ALL	0x0008
+/** need an index assignment */
+#define LDD_F_NEED_INDEX    0x0010
+/** never registered */
+#define LDD_F_VIRGIN	0x0020
+/** update the config logs for this server */
+#define LDD_F_UPDATE	0x0040
+/** rewrite the LDD */
+#define LDD_F_REWRITE_LDD   0x0080
+/** regenerate config logs for this fs or server */
+#define LDD_F_WRITECONF     0x0100
+/** COMPAT_14 */
+#define LDD_F_UPGRADE14     0x0200
+/** process as lctl conf_param */
+#define LDD_F_PARAM	 0x0400
+/** all nodes are specified as service nodes */
+#define LDD_F_NO_PRIMNODE   0x1000
+/** IR enable flag */
+#define LDD_F_IR_CAPABLE    0x2000
+/** the MGS refused to register the target. */
+#define LDD_F_ERROR	 0x4000
+
+/* opc for target register */
+#define LDD_F_OPC_REG   0x10000000
+#define LDD_F_OPC_UNREG 0x20000000
+#define LDD_F_OPC_READY 0x40000000
+#define LDD_F_OPC_MASK  0xf0000000
+
+#define LDD_F_ONDISK_MASK  (LDD_F_SV_TYPE_MASK)
+
+#define LDD_F_MASK	  0xFFFF
+
+enum ldd_mount_type {
+	LDD_MT_EXT3 = 0,
+	LDD_MT_LDISKFS,
+	LDD_MT_SMFS,
+	LDD_MT_REISERFS,
+	LDD_MT_LDISKFS2,
+	LDD_MT_ZFS,
+	LDD_MT_LAST
+};
+
+static inline char *mt_str(enum ldd_mount_type mt)
+{	/* an @mt outside the table yields "unknown" instead of an OOB read */
+	static char *mount_type_string[] = {
+		"ext3",
+		"ldiskfs",
+		"smfs",
+		"reiserfs",
+		"ldiskfs2",
+		"zfs",
+	};
+	return (unsigned)mt < LDD_MT_LAST ? mount_type_string[mt] : "unknown";
+}
+
+static inline char *mt_type(enum ldd_mount_type mt)
+{	/* an @mt outside the table yields "unknown" instead of an OOB read */
+	static char *mount_type_string[] = {
+		"osd-ldiskfs",
+		"osd-ldiskfs",
+		"osd-smfs",
+		"osd-reiserfs",
+		"osd-ldiskfs",
+		"osd-zfs",
+	};
+	return (unsigned)mt < LDD_MT_LAST ? mount_type_string[mt] : "unknown";
+}
+
+#define LDD_INCOMPAT_SUPP 0
+#define LDD_ROCOMPAT_SUPP 0
+
+#define LDD_MAGIC 0x1dd00001
+
+/* On-disk configuration file. In host-endian order. */
+struct lustre_disk_data {
+	__u32      ldd_magic;
+	__u32      ldd_feature_compat;  /* compatible feature flags */
+	__u32      ldd_feature_rocompat;/* read-only compatible feature flags */
+	__u32      ldd_feature_incompat;/* incompatible feature flags */
+
+	__u32      ldd_config_ver;      /* config rewrite count - not used */
+	__u32      ldd_flags;	   /* LDD_SV_TYPE */
+	__u32      ldd_svindex;	 /* server index (0001), must match
+					   svname */
+	__u32      ldd_mount_type;      /* target fs type LDD_MT_* */
+	char       ldd_fsname[64];      /* filesystem this server is part of,
+					   MTI_NAME_MAXLEN */
+	char       ldd_svname[64];      /* this server's name (lustre-mdt0001)*/
+	__u8       ldd_uuid[40];	/* server UUID (COMPAT_146) */
+
+/*200*/ char       ldd_userdata[1024 - 200]; /* arbitrary user string */
+/*1024*/__u8       ldd_padding[4096 - 1024];
+/*4096*/char       ldd_mount_opts[4096]; /* target fs mount opts */
+/*8192*/char       ldd_params[4096];     /* key=value pairs */
+};
+
+
+#define IS_MDT(data)    ((data)->lsi_flags & LDD_F_SV_TYPE_MDT)
+#define IS_OST(data)    ((data)->lsi_flags & LDD_F_SV_TYPE_OST)
+#define IS_MGS(data)    ((data)->lsi_flags & LDD_F_SV_TYPE_MGS)
+#define IS_SERVER(data) ((data)->lsi_flags & (LDD_F_SV_TYPE_MGS | \
+			 LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_OST))
+#define MT_STR(data)    mt_str((data)->ldd_mount_type)
+
+/* Make the mdt/ost server obd name based on the filesystem name */
+static inline int server_make_name(__u32 flags, __u16 index, char *fs,
+				   char *name)
+{
+	char sep = (flags & LDD_F_VIRGIN) ? ':' :
+		   (flags & LDD_F_WRITECONF) ? '=' : '-';
+	const char *kind = (flags & LDD_F_SV_TYPE_MDT) ? "MDT" : "OST";
+
+	if (flags & (LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_OST)) {
+		if (!(flags & LDD_F_SV_ALL))	/* else @name stays unwritten */
+			sprintf(name, "%.8s%c%s%04x", fs, sep, kind, index);
+	} else if (flags & LDD_F_SV_TYPE_MGS) {
+		sprintf(name, "MGS");
+	} else {
+		CERROR("unknown server type %#x\n", flags);
+		return 1;
+	}
+	return 0;
+}
+
+/****************** mount command *********************/
+
+/* The lmd is only used internally by Lustre; mount simply passes
+   everything as string options */
+
+#define LMD_MAGIC    0xbdacbd03
+
+/* gleaned from the mount command - no persistent info here */
+struct lustre_mount_data {
+	__u32      lmd_magic;
+	__u32      lmd_flags;	 /* lustre mount flags */
+	int	lmd_mgs_failnodes; /* mgs failover node count */
+	int	lmd_exclude_count;
+	int	lmd_recovery_time_soft;
+	int	lmd_recovery_time_hard;
+	char      *lmd_dev;	   /* device name */
+	char      *lmd_profile;       /* client only */
+	char      *lmd_mgssec;	/* sptlrpc flavor to mgs */
+	char      *lmd_opts;	  /* lustre mount options (as opposed to
+					 _device_ mount options) */
+	char      *lmd_params;	/* lustre params */
+	__u32     *lmd_exclude;       /* array of OSTs to ignore */
+	char	*lmd_mgs;	   /* MGS nid */
+	char	*lmd_osd_type;      /* OSD type */
+};
+
+#define LMD_FLG_SERVER       0x0001  /* Mounting a server */
+#define LMD_FLG_CLIENT       0x0002  /* Mounting a client */
+#define LMD_FLG_ABORT_RECOV  0x0008  /* Abort recovery */
+#define LMD_FLG_NOSVC	0x0010  /* Only start MGS/MGC for servers,
+					no other services */
+#define LMD_FLG_NOMGS	0x0020  /* Only start target for servers, reusing
+					existing MGS services */
+#define LMD_FLG_WRITECONF    0x0040  /* Rewrite config log */
+#define LMD_FLG_NOIR	 0x0080  /* NO imperative recovery */
+#define LMD_FLG_NOSCRUB	     0x0100  /* Do not trigger scrub automatically */
+#define LMD_FLG_MGS	     0x0200  /* Also start MGS along with server */
+#define LMD_FLG_IAM	     0x0400  /* IAM dir */
+#define LMD_FLG_NO_PRIMNODE  0x0800  /* all nodes are service nodes */
+#define LMD_FLG_VIRGIN	     0x1000  /* the service registers first time */
+#define LMD_FLG_UPDATE	     0x2000  /* update parameters */
+
+#define lmd_is_client(x) ((x)->lmd_flags & LMD_FLG_CLIENT)
+
+
+/****************** last_rcvd file *********************/
+
+/** version recovery epoch */
+#define LR_EPOCH_BITS   32
+#define lr_epoch(a) ((a) >> LR_EPOCH_BITS)
+#define LR_EXPIRE_INTERVALS 16 /**< number of intervals to track transno */
+#define ENOENT_VERSION 1 /** 'virtual' version of non-existent object */
+
+#define LR_SERVER_SIZE   512
+#define LR_CLIENT_START 8192
+#define LR_CLIENT_SIZE   128
+#if LR_CLIENT_START < LR_SERVER_SIZE
+#error "Can't have LR_CLIENT_START < LR_SERVER_SIZE"
+#endif
+
+/*
+ * This limit is arbitrary (131072 clients on x86), but it is convenient to use
+ * 2^n * PAGE_CACHE_SIZE * 8 for the number of bits that fit an order-n allocation.
+ * If we need more than 131072 clients (order-2 allocation on x86) then this
+ * should become an array of single-page pointers that are allocated on demand.
+ */
+#if (128 * 1024UL) > (PAGE_CACHE_SIZE * 8)
+#define LR_MAX_CLIENTS (128 * 1024UL)
+#else
+#define LR_MAX_CLIENTS (PAGE_CACHE_SIZE * 8)
+#endif
+
+/** COMPAT_146: this is an OST (temporary) */
+#define OBD_COMPAT_OST	  0x00000002
+/** COMPAT_146: this is an MDT (temporary) */
+#define OBD_COMPAT_MDT	  0x00000004
+/** 2.0 server, interop flag to show server version is changed */
+#define OBD_COMPAT_20	   0x00000008
+
+/** MDS handles LOV_OBJID file */
+#define OBD_ROCOMPAT_LOVOBJID   0x00000001
+
+/** OST handles group subdirs */
+#define OBD_INCOMPAT_GROUPS     0x00000001
+/** this is an OST */
+#define OBD_INCOMPAT_OST	0x00000002
+/** this is an MDT */
+#define OBD_INCOMPAT_MDT	0x00000004
+/** common last_rcvd format */
+#define OBD_INCOMPAT_COMMON_LR  0x00000008
+/** FID is enabled */
+#define OBD_INCOMPAT_FID	0x00000010
+/** Size-on-MDS is enabled */
+#define OBD_INCOMPAT_SOM	0x00000020
+/** filesystem using iam format to store directory entries */
+#define OBD_INCOMPAT_IAM_DIR    0x00000040
+/** LMA attribute contains per-inode incompatible flags */
+#define OBD_INCOMPAT_LMA	0x00000080
+/** lmm_stripe_count has been shrunk from __u32 to __u16 and the remaining 16
+ * bits are now used to store a generation. Once we start changing the layout
+ * and bumping the generation, old versions expecting a 32-bit lmm_stripe_count
+ * will be confused by interpreting stripe_count | gen << 16 as the actual
+ * stripe count */
+#define OBD_INCOMPAT_LMM_VER    0x00000100
+/** multiple OI files for MDT */
+#define OBD_INCOMPAT_MULTI_OI   0x00000200
+
+/* Data stored per server at the head of the last_rcvd file.  In le32 order.
+   This should be common to filter_internal.h, lustre_mds.h */
+struct lr_server_data {
+	__u8  lsd_uuid[40];	/* server UUID */
+	__u64 lsd_last_transno;    /* last completed transaction ID */
+	__u64 lsd_compat14;	/* reserved - compat with old last_rcvd */
+	__u64 lsd_mount_count;     /* incarnation number */
+	__u32 lsd_feature_compat;  /* compatible feature flags */
+	__u32 lsd_feature_rocompat;/* read-only compatible feature flags */
+	__u32 lsd_feature_incompat;/* incompatible feature flags */
+	__u32 lsd_server_size;     /* size of server data area */
+	__u32 lsd_client_start;    /* start of per-client data area */
+	__u16 lsd_client_size;     /* size of per-client data area */
+	__u16 lsd_subdir_count;    /* number of subdirectories for objects */
+	__u64 lsd_catalog_oid;     /* recovery catalog object id */
+	__u32 lsd_catalog_ogen;    /* recovery catalog inode generation */
+	__u8  lsd_peeruuid[40];    /* UUID of MDS associated with this OST */
+	__u32 lsd_osd_index;       /* index number of OST in LOV */
+	__u32 lsd_padding1;	/* was lsd_mdt_index, unused in 2.4.0 */
+	__u32 lsd_start_epoch;     /* VBR: start epoch from last boot */
+	/** transaction values since lsd_trans_table_time */
+	__u64 lsd_trans_table[LR_EXPIRE_INTERVALS];
+	/** start point of transno table below */
+	__u32 lsd_trans_table_time; /* time of first slot in table above */
+	__u32 lsd_expire_intervals; /* LR_EXPIRE_INTERVALS */
+	__u8  lsd_padding[LR_SERVER_SIZE - 288];
+};
+
+/* Data stored per client in the last_rcvd file.  In le32 order. */
+struct lsd_client_data {
+	__u8  lcd_uuid[40];      /* client UUID */
+	__u64 lcd_last_transno; /* last completed transaction ID */
+	__u64 lcd_last_xid;     /* xid for the last transaction */
+	__u32 lcd_last_result;  /* result from last RPC */
+	__u32 lcd_last_data;    /* per-op data (disposition for open &c.) */
+	/* for MDS_CLOSE requests */
+	__u64 lcd_last_close_transno; /* last completed transaction ID */
+	__u64 lcd_last_close_xid;     /* xid for the last transaction */
+	__u32 lcd_last_close_result;  /* result from last RPC */
+	__u32 lcd_last_close_data;    /* per-op data */
+	/* VBR: last versions */
+	__u64 lcd_pre_versions[4];
+	__u32 lcd_last_epoch;
+	/** orphans handling for delayed export rely on that */
+	__u32 lcd_first_epoch;
+	__u8  lcd_padding[LR_CLIENT_SIZE - 128];
+};
+
+/* bug20354: the lcd_uuid for export of clients may be wrong */
+static inline void check_lcd(char *obd_name, int index,
+			     struct lsd_client_data *lcd)
+{
+	int length = sizeof(lcd->lcd_uuid);
+	/* an lcd_uuid with no NUL inside its array is bad: terminate, report */
+	if (strnlen((char *)lcd->lcd_uuid, length) == length) {
+		lcd->lcd_uuid[length - 1] = '\0';
+		LCONSOLE_ERROR("the client UUID (%s) on %s for exports "
+			       "stored in last_rcvd(index = %d) is bad!\n",
+			       lcd->lcd_uuid, obd_name, index);
+	}
+}
+
+/* last_rcvd handling */
+static inline void lsd_le_to_cpu(struct lr_server_data *buf,
+				 struct lr_server_data *lsd)
+{	/* convert @buf field by field into @lsd; lsd_padding is not copied */
+	int i;
+	memcpy(lsd->lsd_uuid, buf->lsd_uuid, sizeof(lsd->lsd_uuid));
+	lsd->lsd_last_transno     = le64_to_cpu(buf->lsd_last_transno);
+	lsd->lsd_compat14	 = le64_to_cpu(buf->lsd_compat14);
+	lsd->lsd_mount_count      = le64_to_cpu(buf->lsd_mount_count);
+	lsd->lsd_feature_compat   = le32_to_cpu(buf->lsd_feature_compat);
+	lsd->lsd_feature_rocompat = le32_to_cpu(buf->lsd_feature_rocompat);
+	lsd->lsd_feature_incompat = le32_to_cpu(buf->lsd_feature_incompat);
+	lsd->lsd_server_size      = le32_to_cpu(buf->lsd_server_size);
+	lsd->lsd_client_start     = le32_to_cpu(buf->lsd_client_start);
+	lsd->lsd_client_size      = le16_to_cpu(buf->lsd_client_size);
+	lsd->lsd_subdir_count     = le16_to_cpu(buf->lsd_subdir_count);
+	lsd->lsd_catalog_oid      = le64_to_cpu(buf->lsd_catalog_oid);
+	lsd->lsd_catalog_ogen     = le32_to_cpu(buf->lsd_catalog_ogen);
+	memcpy(lsd->lsd_peeruuid, buf->lsd_peeruuid, sizeof(lsd->lsd_peeruuid));
+	lsd->lsd_osd_index	= le32_to_cpu(buf->lsd_osd_index);
+	lsd->lsd_padding1	= le32_to_cpu(buf->lsd_padding1);
+	lsd->lsd_start_epoch      = le32_to_cpu(buf->lsd_start_epoch);
+	for (i = 0; i < LR_EXPIRE_INTERVALS; i++)
+		lsd->lsd_trans_table[i] = le64_to_cpu(buf->lsd_trans_table[i]);
+	lsd->lsd_trans_table_time = le32_to_cpu(buf->lsd_trans_table_time);
+	lsd->lsd_expire_intervals = le32_to_cpu(buf->lsd_expire_intervals);
+}
+
+static inline void lsd_cpu_to_le(struct lr_server_data *lsd,
+				 struct lr_server_data *buf)
+{	/* convert @lsd field by field into @buf; lsd_padding is not copied */
+	int i;
+	memcpy(buf->lsd_uuid, lsd->lsd_uuid, sizeof(buf->lsd_uuid));
+	buf->lsd_last_transno     = cpu_to_le64(lsd->lsd_last_transno);
+	buf->lsd_compat14	 = cpu_to_le64(lsd->lsd_compat14);
+	buf->lsd_mount_count      = cpu_to_le64(lsd->lsd_mount_count);
+	buf->lsd_feature_compat   = cpu_to_le32(lsd->lsd_feature_compat);
+	buf->lsd_feature_rocompat = cpu_to_le32(lsd->lsd_feature_rocompat);
+	buf->lsd_feature_incompat = cpu_to_le32(lsd->lsd_feature_incompat);
+	buf->lsd_server_size      = cpu_to_le32(lsd->lsd_server_size);
+	buf->lsd_client_start     = cpu_to_le32(lsd->lsd_client_start);
+	buf->lsd_client_size      = cpu_to_le16(lsd->lsd_client_size);
+	buf->lsd_subdir_count     = cpu_to_le16(lsd->lsd_subdir_count);
+	buf->lsd_catalog_oid      = cpu_to_le64(lsd->lsd_catalog_oid);
+	buf->lsd_catalog_ogen     = cpu_to_le32(lsd->lsd_catalog_ogen);
+	memcpy(buf->lsd_peeruuid, lsd->lsd_peeruuid, sizeof(buf->lsd_peeruuid));
+	buf->lsd_osd_index	  = cpu_to_le32(lsd->lsd_osd_index);
+	buf->lsd_padding1	  = cpu_to_le32(lsd->lsd_padding1);
+	buf->lsd_start_epoch      = cpu_to_le32(lsd->lsd_start_epoch);
+	for (i = 0; i < LR_EXPIRE_INTERVALS; i++)
+		buf->lsd_trans_table[i] = cpu_to_le64(lsd->lsd_trans_table[i]);
+	buf->lsd_trans_table_time = cpu_to_le32(lsd->lsd_trans_table_time);
+	buf->lsd_expire_intervals = cpu_to_le32(lsd->lsd_expire_intervals);
+}
+
+static inline void lcd_le_to_cpu(struct lsd_client_data *buf,
+				 struct lsd_client_data *lcd)
+{
+	int i;	/* index into the four lcd_pre_versions[] slots */
+	memcpy(lcd->lcd_uuid, buf->lcd_uuid, sizeof(lcd->lcd_uuid));
+	lcd->lcd_last_transno = le64_to_cpu(buf->lcd_last_transno);
+	lcd->lcd_last_xid = le64_to_cpu(buf->lcd_last_xid);
+	lcd->lcd_last_result = le32_to_cpu(buf->lcd_last_result);
+	lcd->lcd_last_data = le32_to_cpu(buf->lcd_last_data);
+	lcd->lcd_last_close_transno = le64_to_cpu(buf->lcd_last_close_transno);
+	lcd->lcd_last_close_xid = le64_to_cpu(buf->lcd_last_close_xid);
+	lcd->lcd_last_close_result = le32_to_cpu(buf->lcd_last_close_result);
+	lcd->lcd_last_close_data = le32_to_cpu(buf->lcd_last_close_data);
+	for (i = 0; i < 4; i++)
+		lcd->lcd_pre_versions[i] =
+			le64_to_cpu(buf->lcd_pre_versions[i]);
+	lcd->lcd_last_epoch = le32_to_cpu(buf->lcd_last_epoch);
+	lcd->lcd_first_epoch = le32_to_cpu(buf->lcd_first_epoch);
+}
+
+static inline void lcd_cpu_to_le(struct lsd_client_data *lcd,
+				 struct lsd_client_data *buf)
+{
+	int i;	/* index into the four lcd_pre_versions[] slots */
+	memcpy(buf->lcd_uuid, lcd->lcd_uuid, sizeof(lcd->lcd_uuid));
+	buf->lcd_last_transno = cpu_to_le64(lcd->lcd_last_transno);
+	buf->lcd_last_xid = cpu_to_le64(lcd->lcd_last_xid);
+	buf->lcd_last_result = cpu_to_le32(lcd->lcd_last_result);
+	buf->lcd_last_data = cpu_to_le32(lcd->lcd_last_data);
+	buf->lcd_last_close_transno = cpu_to_le64(lcd->lcd_last_close_transno);
+	buf->lcd_last_close_xid = cpu_to_le64(lcd->lcd_last_close_xid);
+	buf->lcd_last_close_result = cpu_to_le32(lcd->lcd_last_close_result);
+	buf->lcd_last_close_data = cpu_to_le32(lcd->lcd_last_close_data);
+	for (i = 0; i < 4; i++)
+		buf->lcd_pre_versions[i] =
+			cpu_to_le64(lcd->lcd_pre_versions[i]);
+	buf->lcd_last_epoch = cpu_to_le32(lcd->lcd_last_epoch);
+	buf->lcd_first_epoch = cpu_to_le32(lcd->lcd_first_epoch);
+}
+
+static inline __u64 lcd_last_transno(struct lsd_client_data *lcd)
+{	/* the larger of lcd_last_transno and lcd_last_close_transno */
+	return lcd->lcd_last_close_transno < lcd->lcd_last_transno ?
+	       lcd->lcd_last_transno : lcd->lcd_last_close_transno;
+}
+
+static inline __u64 lcd_last_xid(struct lsd_client_data *lcd)
+{	/* the larger of lcd_last_xid and lcd_last_close_xid */
+	return lcd->lcd_last_close_xid < lcd->lcd_last_xid ?
+	       lcd->lcd_last_xid : lcd->lcd_last_close_xid;
+}
+
+/****************** superblock additional info *********************/
+
+struct ll_sb_info;
+
+struct lustre_sb_info {
+	int		       lsi_flags;
+	struct obd_device	*lsi_mgc;     /* mgc obd */
+	struct lustre_mount_data *lsi_lmd;     /* mount command info */
+	struct ll_sb_info	*lsi_llsbi;   /* add'l client sbi info */
+	struct dt_device	 *lsi_dt_dev;  /* dt device to access disk fs*/
+	struct vfsmount	  *lsi_srv_mnt; /* the one server mount */
+	atomic_t	      lsi_mounts;  /* references to the srv_mnt */
+	char			  lsi_svname[MTI_NAME_MAXLEN];
+	char			  lsi_osd_obdname[64];
+	char			  lsi_osd_uuid[64];
+	struct obd_export	 *lsi_osd_exp;
+	char			  lsi_osd_type[16];
+	char			  lsi_fstype[16];
+	struct backing_dev_info   lsi_bdi;     /* each client mountpoint needs
+						  own backing_dev_info */
+};
+
+#define LSI_UMOUNT_FAILOVER	      0x00200000
+#define LSI_BDI_INITIALIZED	      0x00400000
+
+#define     s2lsi(sb)	((struct lustre_sb_info *)((sb)->s_fs_info))
+#define     s2lsi_nocast(sb) ((sb)->s_fs_info)
+
+#define     get_profile_name(sb)   (s2lsi(sb)->lsi_lmd->lmd_profile)
+#define	    get_mount_flags(sb)	   (s2lsi(sb)->lsi_lmd->lmd_flags)
+#define	    get_mntdev_name(sb)	   (s2lsi(sb)->lsi_lmd->lmd_dev)
+
+
+/****************** mount lookup info *********************/
+
+struct lustre_mount_info {
+	char		 *lmi_name;
+	struct super_block   *lmi_sb;
+	struct vfsmount      *lmi_mnt;
+	struct list_head	    lmi_list_chain;
+};
+
+/* on-disk structure describing local object OIDs storage
+ * the structure to be used with any sequence managed by
+ * local object library */
+struct los_ondisk {
+	__u32 lso_magic;
+	__u32 lso_next_oid;
+};
+
+#define LOS_MAGIC	0xdecafbee
+
+/****************** prototypes *********************/
+
+/* obd_mount.c */
+int server_name2fsname(const char *svname, char *fsname, const char **endptr);
+int server_name2index(const char *svname, __u32 *idx, const char **endptr);
+int server_name2svname(const char *label, char *svname, const char **endptr,
+		       size_t svsize);
+
+int lustre_put_lsi(struct super_block *sb);
+int lustre_start_simple(char *obdname, char *type, char *uuid,
+			char *s1, char *s2, char *s3, char *s4);
+int lustre_start_mgc(struct super_block *sb);
+void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb,
+						  struct vfsmount *mnt));
+void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb));
+int lustre_common_put_super(struct super_block *sb);
+
+
+int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type);
+
+/** @} disk */
+
+#endif // _LUSTRE_DISK_H
diff --git a/drivers/staging/lustre/lustre/include/lustre_dlm.h b/drivers/staging/lustre/lustre/include/lustre_dlm.h
new file mode 100644
index 000000000000..f6eaed810621
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_dlm.h
@@ -0,0 +1,1668 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/** \defgroup LDLM Lustre Distributed Lock Manager
+ *
+ * Lustre DLM is based on VAX DLM.
+ * Its two main roles are:
+ *   - To provide locking assuring consistency of data on all Lustre nodes.
+ *   - To allow clients to cache state protected by a lock by holding the
+ *     lock until a conflicting lock is requested or it is expired by the LRU.
+ *
+ * @{
+ */
+
+#ifndef _LUSTRE_DLM_H__
+#define _LUSTRE_DLM_H__
+
+#include <linux/lustre_dlm.h>
+
+#include <lustre_lib.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_handles.h>
+#include <interval_tree.h> /* for interval_node{}, ldlm_extent */
+#include <lu_ref.h>
+
+struct obd_ops;
+struct obd_device;
+
+#define OBD_LDLM_DEVICENAME  "ldlm"
+
+#define LDLM_DEFAULT_LRU_SIZE (100 * num_online_cpus())
+#define LDLM_DEFAULT_MAX_ALIVE (cfs_time_seconds(36000))
+#define LDLM_CTIME_AGE_LIMIT (10)
+#define LDLM_DEFAULT_PARALLEL_AST_LIMIT 1024
+
+/**
+ * LDLM non-error return states
+ */
+typedef enum {
+	ELDLM_OK = 0,
+
+	ELDLM_LOCK_CHANGED = 300,
+	ELDLM_LOCK_ABORTED = 301,
+	ELDLM_LOCK_REPLACED = 302,
+	ELDLM_NO_LOCK_DATA = 303,
+	ELDLM_LOCK_WOULDBLOCK = 304,
+
+	ELDLM_NAMESPACE_EXISTS = 400,
+	ELDLM_BAD_NAMESPACE    = 401
+} ldlm_error_t;
+
+/**
+ * LDLM namespace type.
+ * The "client" type is actually an indication that this is a narrow local view
+ * into complete namespace on the server. Such namespaces cannot make any
+ * decisions about lack of conflicts or do any autonomous lock granting without
+ * first speaking to a server.
+ */
+typedef enum {
+	LDLM_NAMESPACE_SERVER = 1 << 0,
+	LDLM_NAMESPACE_CLIENT = 1 << 1
+} ldlm_side_t;
+
+/**
+ * Declaration of flags sent through the wire.
+ **/
+#define LDLM_FL_LOCK_CHANGED   0x000001 /* extent, mode, or resource changed */
+
+/**
+ * If the server returns one of these flags, then the lock was put on that list.
+ * If the client sends one of these flags (during recovery ONLY!), it wants the
+ * lock added to the specified list, no questions asked.
+ */
+#define LDLM_FL_BLOCK_GRANTED  0x000002
+#define LDLM_FL_BLOCK_CONV     0x000004
+#define LDLM_FL_BLOCK_WAIT     0x000008
+
+/* Used to be LDLM_FL_CBPENDING 0x000010 moved to non-wire flags */
+
+#define LDLM_FL_AST_SENT       0x000020 /* blocking or cancel packet was
+					 * queued for sending. */
+/* Used to be LDLM_FL_WAIT_NOREPROC 0x000040   moved to non-wire flags */
+/* Used to be LDLM_FL_CANCEL	0x000080   moved to non-wire flags */
+
+/**
+ * Lock is being replayed.  This could probably be implied by the fact that one
+ * of BLOCK_{GRANTED,CONV,WAIT} is set, but that is pretty dangerous.
+ */
+#define LDLM_FL_REPLAY	 0x000100
+
+#define LDLM_FL_INTENT_ONLY    0x000200 /* Don't grant lock, just do intent. */
+
+/* Used to be LDLM_FL_LOCAL_ONLY 0x000400  moved to non-wire flags */
+/* Used to be LDLM_FL_FAILED     0x000800  moved to non-wire flags */
+
+#define LDLM_FL_HAS_INTENT     0x001000 /* lock request has intent */
+
+/* Used to be LDLM_FL_CANCELING  0x002000  moved to non-wire flags */
+/* Used to be LDLM_FL_LOCAL      0x004000  moved to non-wire flags */
+
+#define LDLM_FL_DISCARD_DATA   0x010000 /* discard (no writeback) on cancel */
+
+#define LDLM_FL_NO_TIMEOUT     0x020000 /* Blocked by group lock - wait
+					 * indefinitely */
+
+/** file & record locking */
+#define LDLM_FL_BLOCK_NOWAIT   0x040000 /* Server told not to wait if blocked.
+					 * For AGL, OST will not send glimpse
+					 * callback. */
+#define LDLM_FL_TEST_LOCK      0x080000 // return blocking lock
+
+/* Used to be LDLM_FL_LVB_READY  0x100000 moved to non-wire flags */
+/* Used to be LDLM_FL_KMS_IGNORE 0x200000 moved to non-wire flags */
+/* Used to be LDLM_FL_NO_LRU     0x400000 moved to non-wire flags */
+
+/* Immediately cancel such locks when they block some other locks. Send
+ * cancel notification to original lock holder, but expect no reply. This is
+ * for clients (like liblustre) that cannot be expected to reliably respond
+ * to blocking AST. */
+#define LDLM_FL_CANCEL_ON_BLOCK 0x800000
+
+/* Flags inherited from parent lock when doing intents. */
+#define LDLM_INHERIT_FLAGS     (LDLM_FL_CANCEL_ON_BLOCK)
+
+/* Used to be LDLM_FL_CP_REQD	0x1000000 moved to non-wire flags */
+/* Used to be LDLM_FL_CLEANED	0x2000000 moved to non-wire flags */
+/* Used to be LDLM_FL_ATOMIC_CB      0x4000000 moved to non-wire flags */
+/* Used to be LDLM_FL_BL_AST	 0x10000000 moved to non-wire flags */
+/* Used to be LDLM_FL_BL_DONE	0x20000000 moved to non-wire flags */
+
+/* measure lock contention and return -EUSERS if locking contention is high */
+#define LDLM_FL_DENY_ON_CONTENTION 0x40000000
+
+/* These are flags that are mapped into the flags and ASTs of blocking locks */
+#define LDLM_AST_DISCARD_DATA  0x80000000 /* Add FL_DISCARD to blocking ASTs */
+
+/* Flags sent in AST lock_flags to be mapped into the receiving lock. */
+#define LDLM_AST_FLAGS	 (LDLM_FL_DISCARD_DATA)
+
+/*
+ * --------------------------------------------------------------------------
+ * NOTE! Starting from this point, that is, LDLM_FL_* flags with values above
+ * 0x80000000 will not be sent over the wire.
+ * --------------------------------------------------------------------------
+ */
+
+/**
+ * Declaration of flags not sent through the wire.
+ **/
+
+/**
+ * Used for marking lock as a target for -EINTR while cp_ast sleep
+ * emulation + race with upcoming bl_ast.
+ */
+#define LDLM_FL_FAIL_LOC       0x100000000ULL
+
+/**
+ * Used while processing the unused list to know that we have already
+ * handled this lock and decided to skip it.
+ */
+#define LDLM_FL_SKIPPED	0x200000000ULL
+/* this lock is being destroyed */
+#define LDLM_FL_CBPENDING      0x400000000ULL
+/* not a real flag, not saved in lock */
+#define LDLM_FL_WAIT_NOREPROC  0x800000000ULL
+/* cancellation callback already run */
+#define LDLM_FL_CANCEL	 0x1000000000ULL
+#define LDLM_FL_LOCAL_ONLY     0x2000000000ULL
+/* don't run the cancel callback under ldlm_cli_cancel_unused */
+#define LDLM_FL_FAILED	 0x4000000000ULL
+/* lock cancel has already been sent */
+#define LDLM_FL_CANCELING      0x8000000000ULL
+/* local lock (ie, no srv/cli split) */
+#define LDLM_FL_LOCAL	  0x10000000000ULL
+/* XXX FIXME: This is being added to b_size as a low-risk fix to the fact that
+ * the LVB filling happens _after_ the lock has been granted, so another thread
+ * can match it before the LVB has been updated.  As a dirty hack, we set
+ * LDLM_FL_LVB_READY only after we've finished filling in the LVB.
+ * this is only needed on LOV/OSC now, where LVB is actually used and callers
+ * must set it in input flags.
+ *
+ * The proper fix is to do the granting inside of the completion AST, which can
+ * be replaced with a LVB-aware wrapping function for OSC locks.  That change is
+ * pretty high-risk, though, and would need a lot more testing. */
+#define LDLM_FL_LVB_READY      0x20000000000ULL
+/* A lock contributes to the known minimum size (KMS) calculation until it has
+ * finished the part of its cancelation that performs write back on its dirty
+ * pages.  It can remain on the granted list during this whole time.  Threads
+ * racing to update the KMS after performing their writeback need to know to
+ * exclude each other's locks from the calculation as they walk the granted
+ * list. */
+#define LDLM_FL_KMS_IGNORE     0x40000000000ULL
+/* completion AST to be executed */
+#define LDLM_FL_CP_REQD	0x80000000000ULL
+/* cleanup_resource has already handled the lock */
+#define LDLM_FL_CLEANED	0x100000000000ULL
+/* optimization hint: LDLM can run blocking callback from current context
+ * w/o involving separate thread. in order to decrease cs rate */
+#define LDLM_FL_ATOMIC_CB      0x200000000000ULL
+
+/* It may happen that a client initiates two operations, e.g. unlink and
+ * mkdir, such that the server sends a blocking AST for conflicting
+ * locks to this client for the first operation, whereas the second
+ * operation has canceled this lock and is waiting for rpc_lock which is
+ * taken by the first operation. LDLM_FL_BL_AST is set by
+ * ldlm_callback_handler() in the lock to prevent the Early Lock Cancel
+ * (ELC) code from cancelling it.
+ *
+ * LDLM_FL_BL_DONE is to be set by ldlm_cancel_callback() when lock
+ * cache is dropped to let ldlm_callback_handler() return EINVAL to the
+ * server. It is used when ELC RPC is already prepared and is waiting
+ * for rpc_lock, too late to send a separate CANCEL RPC. */
+#define LDLM_FL_BL_AST	  0x400000000000ULL
+#define LDLM_FL_BL_DONE	 0x800000000000ULL
+/* Don't put lock into the LRU list, so that it is not canceled due to aging.
+ * Used by MGC locks, they are cancelled only at unmount or by callback. */
+#define LDLM_FL_NO_LRU		0x1000000000000ULL
+
+/**
+ * The blocking callback is overloaded to perform two functions.  These flags
+ * indicate which operation should be performed.
+ */
+#define LDLM_CB_BLOCKING    1
+#define LDLM_CB_CANCELING   2
+
+/**
+ * \name Lock Compatibility Matrix.
+ *
+ * A lock has both a type (extent, flock, inode bits, or plain) and a mode.
+ * Lock types are described in their respective implementation files:
+ * ldlm_{extent,flock,inodebits,plain}.c.
+ *
+ * There are six lock modes along with a compatibility matrix to indicate if
+ * two locks are compatible.
+ *
+ * - EX: Exclusive mode. Before a new file is created, MDS requests EX lock
+ *   on the parent.
+ * - PW: Protective Write (normal write) mode. When a client requests a write
+ *   lock from an OST, a lock with PW mode will be issued.
+ * - PR: Protective Read (normal read) mode. When a client requests a read from
+ *   an OST, a lock with PR mode will be issued. Also, if the client opens a
+ *   file for execution, it is granted a lock with PR mode.
+ * - CW: Concurrent Write mode. The type of lock that the MDS grants if a client
+ *   requests a write lock during a file open operation.
+ * - CR Concurrent Read mode. When a client performs a path lookup, MDS grants
+ *   an inodebit lock with the CR mode on the intermediate path component.
+ * - NL Null mode.
+ *
+ * <PRE>
+ *       NL  CR  CW  PR  PW  EX
+ *  NL    1   1   1   1   1   1
+ *  CR    1   1   1   1   1   0
+ *  CW    1   1   1   0   0   0
+ *  PR    1   1   0   1   0   0
+ *  PW    1   1   0   0   0   0
+ *  EX    1   0   0   0   0   0
+ * </PRE>
+ */
+/** @{ */
+#define LCK_COMPAT_EX  LCK_NL
+#define LCK_COMPAT_PW  (LCK_COMPAT_EX | LCK_CR)
+#define LCK_COMPAT_PR  (LCK_COMPAT_PW | LCK_PR)
+#define LCK_COMPAT_CW  (LCK_COMPAT_PW | LCK_CW)
+#define LCK_COMPAT_CR  (LCK_COMPAT_CW | LCK_PR | LCK_PW)
+#define LCK_COMPAT_NL  (LCK_COMPAT_CR | LCK_EX | LCK_GROUP)
+#define LCK_COMPAT_GROUP  (LCK_GROUP | LCK_NL)
+#define LCK_COMPAT_COS (LCK_COS)
+/** @} Lock Compatibility Matrix */
+
+extern ldlm_mode_t lck_compat_array[];
+
+static inline void lockmode_verify(ldlm_mode_t mode)
+{
+	LASSERT(mode > LCK_MINMODE && mode < LCK_MAXMODE);
+}
+
+static inline int lockmode_compat(ldlm_mode_t exist_mode, ldlm_mode_t new_mode)
+{
+	return lck_compat_array[exist_mode] & new_mode;
+}
+
+/*
+ *
+ * cluster name spaces
+ *
+ */
+
+#define DLM_OST_NAMESPACE 1
+#define DLM_MDS_NAMESPACE 2
+
+/* XXX
+   - do we just separate this by security domains and use a prefix for
+     multiple namespaces in the same domain?
+   -
+*/
+
+/**
+ * Locking rules for LDLM:
+ *
+ * lr_lock
+ *
+ * lr_lock
+ *     waiting_locks_spinlock
+ *
+ * lr_lock
+ *     led_lock
+ *
+ * lr_lock
+ *     ns_lock
+ *
+ * lr_lvb_mutex
+ *     lr_lock
+ *
+ */
+
+struct ldlm_pool;
+struct ldlm_lock;
+struct ldlm_resource;
+struct ldlm_namespace;
+
+/**
+ * Operations on LDLM pools.
+ * LDLM pool is a pool of locks in the namespace without any implicitly
+ * specified limits.
+ * Locks in the pool are organized in LRU.
+ * Local memory pressure or server instructions (e.g. mempressure on server)
+ * can trigger freeing of locks from the pool
+ */
+struct ldlm_pool_ops {
+	/** Recalculate pool \a pl usage */
+	int (*po_recalc)(struct ldlm_pool *pl);
+	/** Cancel at least \a nr locks from pool \a pl */
+	int (*po_shrink)(struct ldlm_pool *pl, int nr,
+			 unsigned int gfp_mask);
+	int (*po_setup)(struct ldlm_pool *pl, int limit);
+};
+
+/** One second for pools thread check interval. Each pool has own period. */
+#define LDLM_POOLS_THREAD_PERIOD (1)
+
+/** ~6% margin for modest pools. See ldlm_pool.c for details. */
+#define LDLM_POOLS_MODEST_MARGIN_SHIFT (4)
+
+/** Default recalc period for server side pools in sec. */
+#define LDLM_POOL_SRV_DEF_RECALC_PERIOD (1)
+
+/** Default recalc period for client side pools in sec. */
+#define LDLM_POOL_CLI_DEF_RECALC_PERIOD (10)
+
+/**
+ * LDLM pool structure to track granted locks.
+ * For purposes of determining when to release locks on e.g. memory pressure.
+ * This feature is commonly referred to as lru_resize.
+ */
+struct ldlm_pool {
+	/** Pool proc directory. */
+	proc_dir_entry_t	*pl_proc_dir;
+	/** Pool name, must be long enough to hold compound proc entry name. */
+	char			pl_name[100];
+	/** Lock for protecting SLV/CLV updates. */
+	spinlock_t		pl_lock;
+	/** Number of allowed locks in pool, both client and server side. */
+	atomic_t		pl_limit;
+	/** Number of granted locks in pool. */
+	atomic_t		pl_granted;
+	/** Grant rate per T. */
+	atomic_t		pl_grant_rate;
+	/** Cancel rate per T. */
+	atomic_t		pl_cancel_rate;
+	/** Server lock volume (SLV). Protected by pl_lock. */
+	__u64			pl_server_lock_volume;
+	/** Current biggest client lock volume. Protected by pl_lock. */
+	__u64			pl_client_lock_volume;
+	/** Lock volume factor. SLV on client is calculated as follows:
+	 *  server_slv * lock_volume_factor. */
+	atomic_t		pl_lock_volume_factor;
+	/** Time when last SLV from server was obtained. */
+	time_t			pl_recalc_time;
+	/** Recalculation period for pool. */
+	time_t			pl_recalc_period;
+	/** Recalculation and shrink operations. */
+	struct ldlm_pool_ops	*pl_ops;
+	/** Number of planned locks for next period. */
+	int			pl_grant_plan;
+	/** Pool statistics. */
+	struct lprocfs_stats	*pl_stats;
+};
+
+typedef int (*ldlm_res_policy)(struct ldlm_namespace *, struct ldlm_lock **,
+			       void *req_cookie, ldlm_mode_t mode, __u64 flags,
+			       void *data);
+
+typedef int (*ldlm_cancel_for_recovery)(struct ldlm_lock *lock);
+
+/**
+ * LVB operations.
+ * LVB is Lock Value Block. This is a special opaque (to LDLM) value that could
+ * be associated with an LDLM lock and transferred from client to server and
+ * back.
+ *
+ * Currently LVBs are used by:
+ *  - OSC-OST code to maintain current object size/times
+ *  - layout lock code to return the layout when the layout lock is granted
+ */
+struct ldlm_valblock_ops {
+	int (*lvbo_init)(struct ldlm_resource *res);
+	int (*lvbo_update)(struct ldlm_resource *res,
+			   struct ptlrpc_request *r,
+			   int increase);
+	int (*lvbo_free)(struct ldlm_resource *res);
+	/* Return size of lvb data appropriate RPC size can be reserved */
+	int (*lvbo_size)(struct ldlm_lock *lock);
+	/* Called to fill in lvb data to RPC buffer @buf */
+	int (*lvbo_fill)(struct ldlm_lock *lock, void *buf, int buflen);
+};
+
+/**
+ * LDLM pools related, type of lock pool in the namespace.
+ * Greedy means release cached locks aggressively
+ */
+typedef enum {
+	LDLM_NAMESPACE_GREEDY = 1 << 0,
+	LDLM_NAMESPACE_MODEST = 1 << 1
+} ldlm_appetite_t;
+
+/**
+ * Default values for the "max_nolock_size", "contention_time" and
+ * "contended_locks" namespace tunables.
+ */
+#define NS_DEFAULT_MAX_NOLOCK_BYTES 0
+#define NS_DEFAULT_CONTENTION_SECONDS 2
+#define NS_DEFAULT_CONTENDED_LOCKS 32
+
+struct ldlm_ns_bucket {
+	/** back pointer to namespace */
+	struct ldlm_namespace      *nsb_namespace;
+	/**
+	 * Estimated lock callback time.  Used by adaptive timeout code to
+	 * avoid spurious client evictions due to unresponsiveness when in
+	 * fact the network or overall system load is at fault
+	 */
+	struct adaptive_timeout     nsb_at_estimate;
+};
+
+enum {
+	/** LDLM namespace lock stats */
+	LDLM_NSS_LOCKS	  = 0,
+	LDLM_NSS_LAST
+};
+
+typedef enum {
+	/** invalid type */
+	LDLM_NS_TYPE_UNKNOWN    = 0,
+	/** mdc namespace */
+	LDLM_NS_TYPE_MDC,
+	/** mds namespace */
+	LDLM_NS_TYPE_MDT,
+	/** osc namespace */
+	LDLM_NS_TYPE_OSC,
+	/** ost namespace */
+	LDLM_NS_TYPE_OST,
+	/** mgc namespace */
+	LDLM_NS_TYPE_MGC,
+	/** mgs namespace */
+	LDLM_NS_TYPE_MGT,
+} ldlm_ns_type_t;
+
+/**
+ * LDLM Namespace.
+ *
+ * Namespace serves to contain locks related to a particular service.
+ * There are two kinds of namespaces:
+ * - Server namespace has knowledge of all locks and is therefore authoritative
+ *   to make decisions like what locks could be granted and what conflicts
+ *   exist during new lock enqueue.
+ * - Client namespace only has limited knowledge about locks in the namespace,
+ *   only seeing locks held by the client.
+ *
+ * Every Lustre service has one server namespace present on the server serving
+ * that service. Every client connected to the service has a client namespace
+ * for it.
+ * Every lock obtained by client in that namespace is actually represented by
+ * two in-memory locks. One on the server and one on the client. The locks are
+ * linked by a special cookie by which one node can tell to the other which lock
+ * it actually means during communications. Such locks are called remote locks.
+ * The locks held by server only without any reference to a client are called
+ * local locks.
+ */
+struct ldlm_namespace {
+	/** Backward link to OBD, required for LDLM pool to store new SLV. */
+	struct obd_device	*ns_obd;
+
+	/** Flag indicating if namespace is on client instead of server */
+	ldlm_side_t		ns_client;
+
+	/** Resource hash table for namespace. */
+	cfs_hash_t		*ns_rs_hash;
+
+	/** serialize */
+	spinlock_t		ns_lock;
+
+	/** big refcount (by bucket) */
+	atomic_t		ns_bref;
+
+	/**
+	 * Namespace connect flags supported by server (may be changed via
+	 * /proc, LRU resize may be disabled/enabled).
+	 */
+	__u64			ns_connect_flags;
+
+	/** Client side original connect flags supported by server. */
+	__u64			ns_orig_connect_flags;
+
+	/**
+	 * Position in global namespace list linking all namespaces on
+	 * the node.
+	 */
+	struct list_head		ns_list_chain;
+
+	/**
+	 * List of unused locks for this namespace. This list is also called
+	 * LRU lock list.
+	 * Unused locks are locks with zero reader/writer reference counts.
+	 * This list is only used on clients for lock caching purposes.
+	 * When we want to release some locks voluntarily or if server wants
+	 * us to release some locks due to e.g. memory pressure, we take locks
+	 * to release from the head of this list.
+	 * Locks are linked via l_lru field in \see struct ldlm_lock.
+	 */
+	struct list_head		ns_unused_list;
+	/** Number of locks in the LRU list above */
+	int			ns_nr_unused;
+
+	/**
+	 * Maximum number of locks permitted in the LRU. If 0, means locks
+	 * are managed by pools and there is no preset limit, rather it is all
+	 * controlled by available memory on this client and on server.
+	 */
+	unsigned int		ns_max_unused;
+	/** Maximum allowed age (last used time) for locks in the LRU */
+	unsigned int		ns_max_age;
+	/**
+	 * Server only: number of times we evicted clients due to lack of reply
+	 * to ASTs.
+	 */
+	unsigned int		ns_timeouts;
+	/**
+	 * Number of seconds since the file change time after which the
+	 * MDT will return an UPDATE lock along with a LOOKUP lock.
+	 * This allows the client to start caching negative dentries
+	 * for a directory and may save an RPC for a later stat.
+	 */
+	unsigned int		ns_ctime_age_limit;
+
+	/**
+	 * Used to rate-limit ldlm_namespace_dump calls.
+	 * \see ldlm_namespace_dump. Increased by 10 seconds every time
+	 * it is called.
+	 */
+	cfs_time_t		ns_next_dump;
+
+	/** "policy" function that does actual lock conflict determination */
+	ldlm_res_policy		ns_policy;
+
+	/**
+	 * LVB operations for this namespace.
+	 * \see struct ldlm_valblock_ops
+	 */
+	struct ldlm_valblock_ops *ns_lvbo;
+
+	/**
+	 * Used by filter code to store pointer to OBD of the service.
+	 * Should be dropped in favor of \a ns_obd
+	 */
+	void			*ns_lvbp;
+
+	/**
+	 * Wait queue used by __ldlm_namespace_free. Gets woken up every time
+	 * a resource is removed.
+	 */
+	wait_queue_head_t		ns_waitq;
+	/** LDLM pool structure for this namespace */
+	struct ldlm_pool	ns_pool;
+	/** Definition of how eagerly unused locks will be released from LRU */
+	ldlm_appetite_t		ns_appetite;
+
+	/**
+	 * If more than \a ns_contended_locks are found, the resource is
+	 * considered to be contended. Lock enqueues might specify that no
+	 * contended locks should be granted
+	 */
+	unsigned		ns_contended_locks;
+
+	/**
+	 * The resources in this namespace remember contended state during
+	 * \a ns_contention_time, in seconds.
+	 */
+	unsigned		ns_contention_time;
+
+	/**
+	 * Limit size of contended extent locks, in bytes.
+	 * If an extent lock is requested for more than this many bytes and
+	 * the caller instructs us not to grant contended locks, we disregard
+	 * such a request.
+	 */
+	unsigned		ns_max_nolock_size;
+
+	/** Limit of parallel AST RPC count. */
+	unsigned		ns_max_parallel_ast;
+
+	/** Callback to cancel locks before replaying it during recovery. */
+	ldlm_cancel_for_recovery ns_cancel_for_recovery;
+
+	/** LDLM lock stats */
+	struct lprocfs_stats	*ns_stats;
+
+	/**
+	 * Flag to indicate namespace is being freed. Used to determine if
+	 * recalculation of LDLM pool statistics should be skipped.
+	 */
+	unsigned		ns_stopping:1;
+};
+
+/**
+ * Returns 1 if namespace \a ns is a client namespace, 0 otherwise.
+ * LASSERTs that \a ns is non-NULL and that ns_client is CLIENT or SERVER. */
+static inline int ns_is_client(struct ldlm_namespace *ns)
+{
+	LASSERT(ns != NULL);
+	LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT |
+				    LDLM_NAMESPACE_SERVER)));
+	LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT ||
+		ns->ns_client == LDLM_NAMESPACE_SERVER);
+	return ns->ns_client == LDLM_NAMESPACE_CLIENT;
+}
+
+/**
+ * Returns 1 if namespace \a ns is a server namespace, 0 otherwise.
+ * LASSERTs that \a ns is non-NULL and that ns_client is CLIENT or SERVER. */
+static inline int ns_is_server(struct ldlm_namespace *ns)
+{
+	LASSERT(ns != NULL);
+	LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT |
+				    LDLM_NAMESPACE_SERVER)));
+	LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT ||
+		ns->ns_client == LDLM_NAMESPACE_SERVER);
+	return ns->ns_client == LDLM_NAMESPACE_SERVER;
+}
+
+/**
+ * Returns 1 if OBD_CONNECT_CANCELSET is set in the connect flags of \a ns,
+ * i.e. the namespace supports early lock cancel (ELC), and 0 otherwise. */
+static inline int ns_connect_cancelset(struct ldlm_namespace *ns)
+{
+	LASSERT(ns != NULL);
+	return (ns->ns_connect_flags & OBD_CONNECT_CANCELSET) != 0;
+}
+
+/**
+ * Returns 1 if OBD_CONNECT_LRU_RESIZE is set in the connect flags of \a ns,
+ * i.e. the namespace supports lru_resize, and 0 otherwise. */
+static inline int ns_connect_lru_resize(struct ldlm_namespace *ns)
+{
+	LASSERT(ns != NULL);
+	return (ns->ns_connect_flags & OBD_CONNECT_LRU_RESIZE) != 0;
+}
+
+static inline void ns_register_cancel(struct ldlm_namespace *ns,
+				      ldlm_cancel_for_recovery arg)
+{
+	LASSERT(ns != NULL);
+	ns->ns_cancel_for_recovery = arg;
+}
+
+struct ldlm_lock;
+
+/** Type for blocking callback function of a lock. */
+typedef int (*ldlm_blocking_callback)(struct ldlm_lock *lock,
+				      struct ldlm_lock_desc *new, void *data,
+				      int flag);
+/** Type for completion callback function of a lock. */
+typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, __u64 flags,
+					void *data);
+/** Type for glimpse callback function of a lock. */
+typedef int (*ldlm_glimpse_callback)(struct ldlm_lock *lock, void *data);
+/** Type for weight callback function of a lock. */
+typedef unsigned long (*ldlm_weigh_callback)(struct ldlm_lock *lock);
+
+/** Work list for sending GL ASTs to multiple locks. */
+struct ldlm_glimpse_work {
+	struct ldlm_lock	*gl_lock; /* lock to glimpse */
+	struct list_head		 gl_list; /* linkage to other gl work structs */
+	__u32			 gl_flags;/* see LDLM_GL_WORK_* below */
+	union ldlm_gl_desc	*gl_desc; /* glimpse descriptor to be packed in
+					   * glimpse callback request */
+};
+
+/** The ldlm_glimpse_work is allocated on the stack and should not be freed. */
+#define LDLM_GL_WORK_NOFREE 0x1
+
+/** Interval node data for each LDLM_EXTENT lock. */
+struct ldlm_interval {
+	struct interval_node	li_node;  /* node for tree management */
+	struct list_head		li_group; /* the locks which have the same
+					   * policy - group of the policy */
+};
+#define to_ldlm_interval(n) container_of(n, struct ldlm_interval, li_node)
+
+/**
+ * Interval tree for extent locks.
+ * The interval tree must be accessed under the resource lock.
+ * Interval trees are used for granted extent locks to speed up conflicts
+ * lookup. See ldlm/interval_tree.c for more details.
+ */
+struct ldlm_interval_tree {
+	/** Tree size. */
+	int			lit_size;
+	ldlm_mode_t		lit_mode;  /* lock mode */
+	struct interval_node	*lit_root; /* actual ldlm_interval */
+};
+
+/** Whether to track references to exports by LDLM locks. */
+#define LUSTRE_TRACKS_LOCK_EXP_REFS (0)
+
+/** Cancel flags. */
+typedef enum {
+	LCF_ASYNC      = 0x1, /* Cancel locks asynchronously. */
+	LCF_LOCAL      = 0x2, /* Cancel locks locally, not notifying server */
+	LCF_BL_AST     = 0x4, /* Cancel locks marked as LDLM_FL_BL_AST
+			       * in the same RPC */
+} ldlm_cancel_flags_t;
+
+struct ldlm_flock {
+	__u64 start;
+	__u64 end;
+	__u64 owner;
+	__u64 blocking_owner;
+	struct obd_export *blocking_export;
+	/* Protected by the hash lock */
+	__u32 blocking_refs;
+	__u32 pid;
+};
+
+typedef union {
+	struct ldlm_extent l_extent;
+	struct ldlm_flock l_flock;
+	struct ldlm_inodebits l_inodebits;
+} ldlm_policy_data_t;
+
+void ldlm_convert_policy_to_wire(ldlm_type_t type,
+				 const ldlm_policy_data_t *lpolicy,
+				 ldlm_wire_policy_data_t *wpolicy);
+void ldlm_convert_policy_to_local(struct obd_export *exp, ldlm_type_t type,
+				  const ldlm_wire_policy_data_t *wpolicy,
+				  ldlm_policy_data_t *lpolicy);
+
+enum lvb_type {
+	LVB_T_NONE	= 0,
+	LVB_T_OST	= 1,
+	LVB_T_LQUOTA	= 2,
+	LVB_T_LAYOUT	= 3,
+};
+
+/**
+ * LDLM lock structure
+ *
+ * Represents a single LDLM lock and its state in memory. Each lock is
+ * associated with a single ldlm_resource, the object which is being
+ * locked. There may be multiple ldlm_locks on a single resource,
+ * depending on the lock type and whether the locks are conflicting or
+ * not.
+ */
+struct ldlm_lock {
+	/**
+	 * Local lock handle.
+	 * When remote side wants to tell us about a lock, they address
+	 * it by this opaque handle.  The handle does not hold a
+	 * reference on the ldlm_lock, so it can be safely passed to
+	 * other threads or nodes. When the lock needs to be accessed
+	 * from the handle, it is looked up again in the lock table, and
+	 * may no longer exist.
+	 *
+	 * Must be first in the structure.
+	 */
+	struct portals_handle	l_handle;
+	/**
+	 * Lock reference count.
+	 * This is how many users have pointers to actual structure, so that
+	 * we do not accidentally free lock structure that is in use.
+	 */
+	atomic_t		l_refc;
+	/**
+	 * Internal spinlock protects l_resource.  We should hold this lock
+	 * first before taking res_lock.
+	 */
+	spinlock_t		l_lock;
+	/**
+	 * Pointer to actual resource this lock is in.
+	 * ldlm_lock_change_resource() can change this.
+	 */
+	struct ldlm_resource	*l_resource;
+	/**
+	 * List item for client side LRU list.
+	 * Protected by ns_lock in struct ldlm_namespace.
+	 */
+	struct list_head		l_lru;
+	/**
+	 * Linkage to resource's lock queues according to current lock state.
+	 * (could be granted, waiting or converting)
+	 * Protected by lr_lock in struct ldlm_resource.
+	 */
+	struct list_head		l_res_link;
+	/**
+	 * Tree node for ldlm_extent.
+	 */
+	struct ldlm_interval	*l_tree_node;
+	/**
+	 * Per export hash of locks.
+	 * Protected by per-bucket exp->exp_lock_hash locks.
+	 */
+	struct hlist_node	l_exp_hash;
+	/**
+	 * Per export hash of flock locks.
+	 * Protected by per-bucket exp->exp_flock_hash locks.
+	 */
+	struct hlist_node	l_exp_flock_hash;
+	/**
+	 * Requested mode.
+	 * Protected by lr_lock.
+	 */
+	ldlm_mode_t		l_req_mode;
+	/**
+	 * Granted mode, also protected by lr_lock.
+	 */
+	ldlm_mode_t		l_granted_mode;
+	/** Lock completion handler pointer. Called when lock is granted. */
+	ldlm_completion_callback l_completion_ast;
+	/**
+	 * Lock blocking AST handler pointer.
+	 * It plays two roles:
+	 * - as a notification of an attempt to queue a conflicting lock (once)
+	 * - as a notification when the lock is being cancelled.
+	 *
+	 * As such it's typically called twice: once for the initial conflict
+	 * and then once more when the last user went away and the lock is
+	 * cancelled (could happen recursively).
+	 */
+	ldlm_blocking_callback	l_blocking_ast;
+	/**
+	 * Lock glimpse handler.
+	 * Glimpse handler is used to obtain LVB updates from a client by
+	 * server
+	 */
+	ldlm_glimpse_callback	l_glimpse_ast;
+
+	/** XXX apparently unused "weight" handler. To be removed? */
+	ldlm_weigh_callback	l_weigh_ast;
+
+	/**
+	 * Lock export.
+	 * This is a pointer to actual client export for locks that were granted
+	 * to clients. Used server-side.
+	 */
+	struct obd_export	*l_export;
+	/**
+	 * Lock connection export.
+	 * Pointer to server export on a client.
+	 */
+	struct obd_export	*l_conn_export;
+
+	/**
+	 * Remote lock handle.
+	 * If the lock is remote, this is the handle of the other side lock
+	 * (l_handle)
+	 */
+	struct lustre_handle	l_remote_handle;
+
+	/**
+	 * Representation of private data specific for a lock type.
+	 * Examples are: extent range for extent lock or bitmask for ibits locks
+	 */
+	ldlm_policy_data_t	l_policy_data;
+
+	/**
+	 * Lock state flags.
+	 * Like whenever we receive any blocking requests for this lock, etc.
+	 * Protected by lr_lock.
+	 */
+	__u64			l_flags;
+	/**
+	 * Lock r/w usage counters.
+	 * Protected by lr_lock.
+	 */
+	__u32			l_readers;
+	__u32			l_writers;
+	/**
+	 * If the lock is granted, a process sleeps on this waitq to learn when
+	 * it's no longer in use.  If the lock is not granted, a process sleeps
+	 * on this waitq to learn when it becomes granted.
+	 */
+	wait_queue_head_t		l_waitq;
+
+	/**
+	 * Seconds. It will be updated if there is any activity related to
+	 * the lock, e.g. enqueue the lock or send blocking AST.
+	 */
+	cfs_time_t		l_last_activity;
+
+	/**
+	 * Time last used by e.g. being matched by lock match.
+	 * Jiffies. Should be converted to time if needed.
+	 */
+	cfs_time_t		l_last_used;
+
+	/** Originally requested extent for the extent lock. */
+	struct ldlm_extent	l_req_extent;
+
+	unsigned int		l_failed:1,
+	/**
+	 * Set for locks that were removed from class hash table and will be
+	 * destroyed when last reference to them is released. Set by
+	 * ldlm_lock_destroy_internal().
+	 *
+	 * Protected by lock and resource locks.
+	 */
+				l_destroyed:1,
+	/*
+	 * it's set in lock_res_and_lock() and unset in unlock_res_and_lock().
+	 *
+	 * NB: compared with check_res_locked(), checking this bit is cheaper.
+	 * Also, spin_is_locked() is deprecated for kernel code; one reason is
+	 * because it works only for SMP so user needs to add extra macros like
+	 * LASSERT_SPIN_LOCKED for uniprocessor kernels.
+	 */
+				l_res_locked:1,
+	/*
+	 * It's set once we call ldlm_add_waiting_lock_res_locked()
+	 * to start the lock-timeout timer and it will never be reset.
+	 *
+	 * Protected by lock_res_and_lock().
+	 */
+				l_waited:1,
+	/** Flag whether this is a server namespace lock. */
+				l_ns_srv:1;
+
+	/*
+	 * Client-side-only members.
+	 */
+
+	enum lvb_type	      l_lvb_type;
+
+	/**
+	 * Temporary storage for a LVB received during an enqueue operation.
+	 */
+	__u32			l_lvb_len;
+	void			*l_lvb_data;
+
+	/** Private storage for lock user. Opaque to LDLM. */
+	void			*l_ast_data;
+
+	/*
+	 * Server-side-only members.
+	 */
+
+	/**
+	 * Connection cookie for the client originating the operation.
+	 * Used by Commit on Share (COS) code. Currently only used for
+	 * inodebits locks on MDS.
+	 */
+	__u64			l_client_cookie;
+
+	/**
+	 * List item for locks waiting for cancellation from clients.
+	 * The lists this could be linked into are:
+	 * waiting_locks_list (protected by waiting_locks_spinlock),
+	 * then if the lock timed out, it is moved to
+	 * expired_lock_thread.elt_expired_locks for further processing.
+	 * Protected by elt_lock.
+	 */
+	struct list_head		l_pending_chain;
+
+	/**
+	 * Set when lock is sent a blocking AST. Time in seconds when timeout
+	 * is reached and client holding this lock could be evicted.
+	 * This timeout could be further extended by e.g. certain IO activity
+	 * under this lock.
+	 * \see ost_rw_prolong_locks
+	 */
+	cfs_time_t		l_callback_timeout;
+
+	/** Local PID of process which created this lock. */
+	__u32			l_pid;
+
+	/**
+	 * Number of times blocking AST was sent for this lock.
+	 * This is for debugging. Valid values are 0 and 1, if there is an
+	 * attempt to send blocking AST more than once, an assertion would be
+	 * hit. \see ldlm_work_bl_ast_lock
+	 */
+	int			l_bl_ast_run;
+	/** List item ldlm_add_ast_work_item() for case of blocking ASTs. */
+	struct list_head		l_bl_ast;
+	/** List item ldlm_add_ast_work_item() for case of completion ASTs. */
+	struct list_head		l_cp_ast;
+	/** For ldlm_add_ast_work_item() for "revoke" AST used in COS. */
+	struct list_head		l_rk_ast;
+
+	/**
+	 * Pointer to a conflicting lock that caused blocking AST to be sent
+	 * for this lock
+	 */
+	struct ldlm_lock	*l_blocking_lock;
+
+	/**
+	 * Protected by lr_lock, linkages to "skip lists".
+	 * For more explanations of skip lists see ldlm/ldlm_inodebits.c
+	 */
+	struct list_head		l_sl_mode;
+	struct list_head		l_sl_policy;
+
+	/** Reference tracking structure to debug leaked locks. */
+	struct lu_ref		l_reference;
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+	/* Debugging stuff for bug 20498, for tracking export references. */
+	/** number of export references taken */
+	int			l_exp_refs_nr;
+	/** link all locks referencing one export */
+	struct list_head		l_exp_refs_link;
+	/** referenced export object */
+	struct obd_export	*l_exp_refs_target;
+#endif
+	/**
+	 * export blocking dlm lock list, protected by
+	 * l_export->exp_bl_list_lock.
+	 * Lock order of waiting_locks_spinlock, exp_bl_list_lock and res lock
+	 * is: res lock -> exp_bl_list_lock -> waiting_locks_spinlock.
+	 */
+	struct list_head		l_exp_list;
+};
+
+/**
+ * LDLM resource description.
+ * Basically, a resource is the representation of a single object.
+ * An object has a name which is currently 4 64-bit integers. The LDLM user is
+ * responsible for creating a mapping between the objects it wants to be
+ * protected and resource names.
+ *
+ * A resource can only hold locks of a single lock type, though there may be
+ * multiple ldlm_locks on a single resource, depending on the lock type and
+ * whether the locks are conflicting or not.
+ */
+struct ldlm_resource {
+	struct ldlm_ns_bucket	*lr_ns_bucket; /* see ldlm_res_to_ns() */
+
+	/**
+	 * List item for list in namespace hash.
+	 * protected by ns_lock
+	 */
+	struct hlist_node	lr_hash;
+
+	/** Spinlock to protect locks under this resource. */
+	spinlock_t		lr_lock;
+
+	/**
+	 * protected by lr_lock
+	 * @{ */
+	/** List of locks in granted state */
+	struct list_head		lr_granted;
+	/** List of locks waiting to change their granted mode (converted) */
+	struct list_head		lr_converting;
+	/**
+	 * List of locks that could not be granted due to conflicts and
+	 * that are waiting for conflicts to go away */
+	struct list_head		lr_waiting;
+	/** @} */
+
+	/* XXX No longer needed? Remove ASAP */
+	ldlm_mode_t		lr_most_restr;
+
+	/** Type of locks this resource can hold. Only one type per resource. */
+	ldlm_type_t		lr_type; /* LDLM_{PLAIN,EXTENT,FLOCK,IBITS} */
+
+	/** Resource name */
+	struct ldlm_res_id	lr_name;
+	/** Reference count for this resource */
+	atomic_t		lr_refcount;
+
+	/**
+	 * Interval trees (only for extent locks) for all modes of this resource
+	 */
+	struct ldlm_interval_tree lr_itree[LCK_MODE_NUM];
+
+	/**
+	 * Server-side-only lock value block elements.
+	 * To serialize lvbo_init.
+	 */
+	struct mutex		lr_lvb_mutex;
+	int			lr_lvb_len; /* presumably size of lr_lvb_data */
+	/** protected by lr_lock */
+	void			*lr_lvb_data;
+
+	/** When the resource was considered as contended. */
+	cfs_time_t		lr_contention_time;
+	/** List of references to this resource. For debugging. */
+	struct lu_ref		lr_reference;
+
+	struct inode		*lr_lvb_inode; /* NOTE(review): undocumented */
+};
+
+static inline bool ldlm_has_layout(struct ldlm_lock *lock)
+{
+	return lock->l_resource->lr_type == LDLM_IBITS &&
+		lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_LAYOUT;
+}
+
+static inline char *
+ldlm_ns_name(struct ldlm_namespace *ns)
+{
+	return ns->ns_rs_hash->hs_name;
+}
+
+static inline struct ldlm_namespace *
+ldlm_res_to_ns(struct ldlm_resource *res)
+{
+	return res->lr_ns_bucket->nsb_namespace;
+}
+
+static inline struct ldlm_namespace *
+ldlm_lock_to_ns(struct ldlm_lock *lock)
+{
+	return ldlm_res_to_ns(lock->l_resource);
+}
+
+static inline char *
+ldlm_lock_to_ns_name(struct ldlm_lock *lock)
+{
+	return ldlm_ns_name(ldlm_lock_to_ns(lock));
+}
+
+static inline struct adaptive_timeout *
+ldlm_lock_to_ns_at(struct ldlm_lock *lock)
+{
+	return &lock->l_resource->lr_ns_bucket->nsb_at_estimate;
+}
+
+static inline int ldlm_lvbo_init(struct ldlm_resource *res)
+{
+	struct ldlm_namespace *ns = ldlm_res_to_ns(res);
+
+	/* No LVB operations, or no lvbo_init hook: nothing to do. */
+	if (ns->ns_lvbo == NULL || ns->ns_lvbo->lvbo_init == NULL)
+		return 0;
+	return ns->ns_lvbo->lvbo_init(res);
+}
+
+static inline int ldlm_lvbo_size(struct ldlm_lock *lock)
+{
+	struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+	/* No LVB operations, or no lvbo_size hook: report a size of 0. */
+	if (ns->ns_lvbo == NULL || ns->ns_lvbo->lvbo_size == NULL)
+		return 0;
+	return ns->ns_lvbo->lvbo_size(lock);
+}
+
+static inline int ldlm_lvbo_fill(struct ldlm_lock *lock, void *buf, int len)
+{
+	struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+	if (ns->ns_lvbo == NULL)
+		return 0;
+	/* Unlike init/size, a namespace with LVB ops must provide lvbo_fill. */
+	LASSERT(ns->ns_lvbo->lvbo_fill != NULL);
+	return ns->ns_lvbo->lvbo_fill(lock, buf, len);
+}
+
+struct ldlm_ast_work {
+	struct ldlm_lock      *w_lock;
+	int		    w_blocking;
+	struct ldlm_lock_desc  w_desc;
+	struct list_head	     w_list;
+	int		    w_flags;
+	void		  *w_data;
+	int		    w_datalen;
+};
+
+/**
+ * Common ldlm_enqueue parameters (passed to e.g. ldlm_cli_enqueue()).
+ */
+struct ldlm_enqueue_info {
+	__u32 ei_type;   /**< Type of the lock being enqueued. */
+	__u32 ei_mode;   /**< Mode of the lock being enqueued. */
+	void *ei_cb_bl;  /**< blocking lock callback */
+	void *ei_cb_cp;  /**< lock completion callback */
+	void *ei_cb_gl;  /**< lock glimpse callback */
+	void *ei_cb_wg;  /**< lock weigh callback */
+	void *ei_cbdata; /**< Data to be passed into callbacks. */
+};
+
+extern struct obd_ops ldlm_obd_ops;
+
+extern char *ldlm_lockname[];
+extern char *ldlm_typename[];
+extern char *ldlm_it2str(int it);
+
+/**
+ * Just a fancy CDEBUG call with log level preset to LDLM_DEBUG.
+ * For the cases where we do not have actual lock to print along
+ * with a debugging message that is ldlm-related
+ */
+#define LDLM_DEBUG_NOLOCK(format, a...)			\
+	CDEBUG(D_DLMTRACE, "### " format "\n" , ##a)
+
+/**
+ * Support function for lock information printing into debug logs.
+ * \see LDLM_DEBUG
+ */
+#define ldlm_lock_debug(msgdata, mask, cdls, lock, fmt, a...) do {      \
+	CFS_CHECK_STACK(msgdata, mask, cdls);			   \
+									\
+	if (((mask) & D_CANTMASK) != 0 ||			       \
+	    ((libcfs_debug & (mask)) != 0 &&			    \
+	     (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0))	  \
+		_ldlm_lock_debug(lock, msgdata, fmt, ##a);	      \
+} while(0)
+
+void _ldlm_lock_debug(struct ldlm_lock *lock,
+		      struct libcfs_debug_msg_data *data,
+		      const char *fmt, ...)
+	__attribute__ ((format (printf, 3, 4)));
+
+/**
+ * Rate-limited version of lock printing function.
+ */
+#define LDLM_DEBUG_LIMIT(mask, lock, fmt, a...) do {			 \
+	static cfs_debug_limit_state_t _ldlm_cdls;			   \
+	LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, &_ldlm_cdls);	      \
+	ldlm_lock_debug(&msgdata, mask, &_ldlm_cdls, lock, "### " fmt , ##a);\
+} while (0)
+
+#define LDLM_ERROR(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_ERROR, lock, fmt, ## a)
+#define LDLM_WARN(lock, fmt, a...)  LDLM_DEBUG_LIMIT(D_WARNING, lock, fmt, ## a)
+
+/** Non-rate-limited lock debug print; \a lock may be evaluated twice. */
+#define LDLM_DEBUG(lock, fmt, a...)   do {				  \
+	if (likely((lock) != NULL)) {					    \
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_DLMTRACE, NULL);      \
+		ldlm_lock_debug(&msgdata, D_DLMTRACE, NULL, (lock),	    \
+				"### " fmt , ##a);			    \
+	} else {							    \
+		LDLM_DEBUG_NOLOCK("no dlm lock: " fmt, ##a);		    \
+	}								    \
+} while (0)
+
+typedef int (*ldlm_processing_policy)(struct ldlm_lock *lock, __u64 *flags,
+				      int first_enq, ldlm_error_t *err,
+				      struct list_head *work_list);
+
+/**
+ * Return values for lock iterators.
+ * Also used during deciding of lock grants and cancellations.
+ */
+#define LDLM_ITER_CONTINUE 1 /* keep iterating */
+#define LDLM_ITER_STOP     2 /* stop iterating */
+
+typedef int (*ldlm_iterator_t)(struct ldlm_lock *, void *);
+typedef int (*ldlm_res_iterator_t)(struct ldlm_resource *, void *);
+
+/** \defgroup ldlm_iterator Lock iterators
+ *
+ * LDLM provides for a way to iterate through every lock on a resource or
+ * namespace or every resource in a namespace.
+ * @{ */
+int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter,
+			  void *closure);
+void ldlm_namespace_foreach(struct ldlm_namespace *ns, ldlm_iterator_t iter,
+			    void *closure);
+int ldlm_resource_iterate(struct ldlm_namespace *, const struct ldlm_res_id *,
+			  ldlm_iterator_t iter, void *data);
+/** @} ldlm_iterator */
+
+int ldlm_replay_locks(struct obd_import *imp);
+
+/* ldlm_flock.c */
+int ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data);
+
+/* ldlm_extent.c */
+__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms);
+
+struct ldlm_callback_suite {
+	ldlm_completion_callback lcs_completion;
+	ldlm_blocking_callback   lcs_blocking;
+	ldlm_glimpse_callback    lcs_glimpse;
+	ldlm_weigh_callback      lcs_weigh;
+};
+
+/* ldlm_lockd.c */
+int ldlm_del_waiting_lock(struct ldlm_lock *lock);
+int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout);
+int ldlm_get_ref(void);
+void ldlm_put_ref(void);
+int ldlm_init_export(struct obd_export *exp);
+void ldlm_destroy_export(struct obd_export *exp);
+struct ldlm_lock *ldlm_request_lock(struct ptlrpc_request *req);
+
+/* ldlm_lock.c */
+void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg);
+void ldlm_lock2handle(const struct ldlm_lock *lock,
+		      struct lustre_handle *lockh);
+struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *, __u64 flags);
+void ldlm_cancel_callback(struct ldlm_lock *);
+int ldlm_lock_remove_from_lru(struct ldlm_lock *);
+int ldlm_lock_set_data(struct lustre_handle *, void *);
+
+/**
+ * Obtain a lock reference by its handle.
+ */
+static inline struct ldlm_lock *ldlm_handle2lock(const struct lustre_handle *h)
+{
+	return __ldlm_handle2lock(h, 0);
+}
+
+#define LDLM_LOCK_REF_DEL(lock) /* drop the "handle" lu_ref of \a lock */ \
+	lu_ref_del(&(lock)->l_reference, "handle", current)
+
+static inline struct ldlm_lock *
+ldlm_handle2lock_long(const struct lustre_handle *h, __u64 flags)
+{
+	struct ldlm_lock *lock;
+
+	lock = __ldlm_handle2lock(h, flags);
+	if (lock != NULL)
+		LDLM_LOCK_REF_DEL(lock);
+	return lock;
+}
+
+/**
+ * Invoke the namespace's LVBO lvbo_update hook for resource \a res, passing
+ * request \a r and \a increase through; returns 0 when no hook is registered.
+ */
+static inline int ldlm_res_lvbo_update(struct ldlm_resource *res,
+				       struct ptlrpc_request *r, int increase)
+{
+	struct ldlm_namespace *ns = ldlm_res_to_ns(res);
+
+	if (ns->ns_lvbo && ns->ns_lvbo->lvbo_update)
+		return ns->ns_lvbo->lvbo_update(res, r, increase);
+
+	return 0;
+}
+
+int ldlm_error2errno(ldlm_error_t error);
+ldlm_error_t ldlm_errno2error(int err_no); /* don't call it `errno': this
+					    * confuses user-space. */
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+void ldlm_dump_export_locks(struct obd_export *exp);
+#endif
+
+/**
+ * Release a temporary lock reference obtained by ldlm_handle2lock() or
+ * __ldlm_handle2lock().
+ */
+#define LDLM_LOCK_PUT(lock)		     \
+do {					    \
+	LDLM_LOCK_REF_DEL(lock);		\
+	/*LDLM_DEBUG((lock), "put");*/	  \
+	ldlm_lock_put(lock);		    \
+} while (0)
+
+/**
+ * Release a lock reference obtained by some other means (see
+ * LDLM_LOCK_PUT()).
+ */
+#define LDLM_LOCK_RELEASE(lock)		 \
+do {					    \
+	/*LDLM_DEBUG((lock), "put");*/	  \
+	ldlm_lock_put(lock);		    \
+} while (0)
+
+#define LDLM_LOCK_GET(lock)		     \
+({					      \
+	ldlm_lock_get(lock);		    \
+	/*LDLM_DEBUG((lock), "get");*/	  \
+	lock;				   \
+})
+
+#define ldlm_lock_list_put(head, member, count)		     \
+({	/* unlink and release the first \a count locks on \a head */	  \
+	struct ldlm_lock *_lock, *_next;			    \
+	int _c = (count); /* "_c": must not shadow a caller's "c" */  \
+	list_for_each_entry_safe(_lock, _next, head, member) {  \
+		if (_c-- == 0)				       \
+			break;				      \
+		list_del_init(&_lock->member);		  \
+		LDLM_LOCK_RELEASE(_lock);			   \
+	}							   \
+	LASSERT(_c <= 0); /* list held at least \a count locks */   \
+})
+
+struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock);
+void ldlm_lock_put(struct ldlm_lock *lock);
+void ldlm_lock_destroy(struct ldlm_lock *lock);
+void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc);
+void ldlm_lock_addref(struct lustre_handle *lockh, __u32 mode);
+int  ldlm_lock_addref_try(struct lustre_handle *lockh, __u32 mode);
+void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode);
+void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode);
+void ldlm_lock_fail_match_locked(struct ldlm_lock *lock);
+void ldlm_lock_fail_match(struct ldlm_lock *lock);
+void ldlm_lock_allow_match(struct ldlm_lock *lock);
+void ldlm_lock_allow_match_locked(struct ldlm_lock *lock);
+ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags,
+			    const struct ldlm_res_id *, ldlm_type_t type,
+			    ldlm_policy_data_t *, ldlm_mode_t mode,
+			    struct lustre_handle *, int unref);
+ldlm_mode_t ldlm_revalidate_lock_handle(struct lustre_handle *lockh,
+					__u64 *bits);
+struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
+					__u32 *flags);
+void ldlm_lock_downgrade(struct ldlm_lock *lock, int new_mode);
+void ldlm_lock_cancel(struct ldlm_lock *lock);
+void ldlm_reprocess_all(struct ldlm_resource *res);
+void ldlm_reprocess_all_ns(struct ldlm_namespace *ns);
+void ldlm_lock_dump_handle(int level, struct lustre_handle *);
+void ldlm_unlink_lock_skiplist(struct ldlm_lock *req);
+
+/* resource.c */
+struct ldlm_namespace *
+ldlm_namespace_new(struct obd_device *obd, char *name,
+		   ldlm_side_t client, ldlm_appetite_t apt,
+		   ldlm_ns_type_t ns_type);
+int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags);
+void ldlm_namespace_free(struct ldlm_namespace *ns,
+			 struct obd_import *imp, int force);
+void ldlm_namespace_register(struct ldlm_namespace *ns, ldlm_side_t client);
+void ldlm_namespace_unregister(struct ldlm_namespace *ns, ldlm_side_t client);
+void ldlm_namespace_move_locked(struct ldlm_namespace *ns, ldlm_side_t client);
+struct ldlm_namespace *ldlm_namespace_first_locked(ldlm_side_t client);
+void ldlm_namespace_get(struct ldlm_namespace *ns);
+void ldlm_namespace_put(struct ldlm_namespace *ns);
+int ldlm_proc_setup(void);
+#ifdef LPROCFS
+void ldlm_proc_cleanup(void);
+#else
+static inline void ldlm_proc_cleanup(void) {}
+#endif
+
+/* resource.c - internal */
+struct ldlm_resource *ldlm_resource_get(struct ldlm_namespace *ns,
+					struct ldlm_resource *parent,
+					const struct ldlm_res_id *,
+					ldlm_type_t type, int create);
+struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res);
+int ldlm_resource_putref(struct ldlm_resource *res);
+void ldlm_resource_add_lock(struct ldlm_resource *res,
+			    struct list_head *head,
+			    struct ldlm_lock *lock);
+void ldlm_resource_unlink_lock(struct ldlm_lock *lock);
+void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc);
+void ldlm_dump_all_namespaces(ldlm_side_t client, int level);
+void ldlm_namespace_dump(int level, struct ldlm_namespace *);
+void ldlm_resource_dump(int level, struct ldlm_resource *);
+int ldlm_lock_change_resource(struct ldlm_namespace *, struct ldlm_lock *,
+			      const struct ldlm_res_id *);
+
+#define LDLM_RESOURCE_ADDREF(res) do { /* tag ref with calling function */ \
+	lu_ref_add_atomic(&(res)->lr_reference, __func__, current);	  \
+} while (0)
+
+#define LDLM_RESOURCE_DELREF(res) do { /* drop ref tagged by this function */ \
+	lu_ref_del(&(res)->lr_reference, __func__, current);		  \
+} while (0)
+
+/* ldlm_request.c */
+int ldlm_expired_completion_wait(void *data);
+/** \defgroup ldlm_local_ast Default AST handlers for local locks
+ * These AST handlers are typically used for server-side local locks and are
+ * also used by client-side lock handlers to perform minimum level base
+ * processing.
+ * @{ */
+int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock);
+int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+		      void *data, int flag);
+int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp);
+int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data);
+int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data);
+/** @} ldlm_local_ast */
+
+/** \defgroup ldlm_cli_api API to operate on locks from actual LDLM users.
+ * These are typically used by client and server (*_local versions)
+ * to obtain and release locks.
+ * @{ */
+int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp,
+		     struct ldlm_enqueue_info *einfo,
+		     const struct ldlm_res_id *res_id,
+		     ldlm_policy_data_t const *policy, __u64 *flags,
+		     void *lvb, __u32 lvb_len, enum lvb_type lvb_type,
+		     struct lustre_handle *lockh, int async);
+int ldlm_prep_enqueue_req(struct obd_export *exp,
+			  struct ptlrpc_request *req,
+			  struct list_head *cancels,
+			  int count);
+int ldlm_prep_elc_req(struct obd_export *exp,
+		      struct ptlrpc_request *req,
+		      int version, int opc, int canceloff,
+		      struct list_head *cancels, int count);
+
+struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len);
+int ldlm_handle_enqueue0(struct ldlm_namespace *ns, struct ptlrpc_request *req,
+			 const struct ldlm_request *dlm_req,
+			 const struct ldlm_callback_suite *cbs);
+int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
+			  ldlm_type_t type, __u8 with_policy, ldlm_mode_t mode,
+			  __u64 *flags, void *lvb, __u32 lvb_len,
+			  struct lustre_handle *lockh, int rc);
+int ldlm_cli_enqueue_local(struct ldlm_namespace *ns,
+			   const struct ldlm_res_id *res_id,
+			   ldlm_type_t type, ldlm_policy_data_t *policy,
+			   ldlm_mode_t mode, __u64 *flags,
+			   ldlm_blocking_callback blocking,
+			   ldlm_completion_callback completion,
+			   ldlm_glimpse_callback glimpse,
+			   void *data, __u32 lvb_len, enum lvb_type lvb_type,
+			   const __u64 *client_cookie,
+			   struct lustre_handle *lockh);
+int ldlm_server_ast(struct lustre_handle *lockh, struct ldlm_lock_desc *new,
+		    void *data, __u32 data_len);
+int ldlm_cli_convert(struct lustre_handle *, int new_mode, __u32 *flags);
+int ldlm_cli_update_pool(struct ptlrpc_request *req);
+int ldlm_cli_cancel(struct lustre_handle *lockh,
+		    ldlm_cancel_flags_t cancel_flags);
+int ldlm_cli_cancel_unused(struct ldlm_namespace *, const struct ldlm_res_id *,
+			   ldlm_cancel_flags_t flags, void *opaque);
+int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
+				    const struct ldlm_res_id *res_id,
+				    ldlm_policy_data_t *policy,
+				    ldlm_mode_t mode,
+				    ldlm_cancel_flags_t flags,
+				    void *opaque);
+int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *head,
+			int count, ldlm_cancel_flags_t flags);
+int ldlm_cancel_resource_local(struct ldlm_resource *res,
+			       struct list_head *cancels,
+			       ldlm_policy_data_t *policy,
+			       ldlm_mode_t mode, int lock_flags,
+			       ldlm_cancel_flags_t cancel_flags, void *opaque);
+int ldlm_cli_cancel_list_local(struct list_head *cancels, int count,
+			       ldlm_cancel_flags_t flags);
+int ldlm_cli_cancel_list(struct list_head *head, int count,
+			 struct ptlrpc_request *req, ldlm_cancel_flags_t flags);
+/** @} ldlm_cli_api */
+
+/* mds/handler.c */
+/* This has to be here because recursive inclusion sucks. */
+int intent_disposition(struct ldlm_reply *rep, int flag);
+void intent_set_disposition(struct ldlm_reply *rep, int flag);
+
+
+/* ioctls for trying requests */
+#define IOC_LDLM_TYPE		   'f'
+#define IOC_LDLM_MIN_NR		 40
+
+#define IOC_LDLM_TEST		   _IOWR('f', 40, long)
+#define IOC_LDLM_DUMP		   _IOWR('f', 41, long)
+#define IOC_LDLM_REGRESS_START	  _IOWR('f', 42, long)
+#define IOC_LDLM_REGRESS_STOP	   _IOWR('f', 43, long)
+#define IOC_LDLM_MAX_NR		 43
+
+/**
+ * "Modes" of acquiring lock_res, necessary to tell lockdep that taking more
+ * than one lock_res is dead-lock safe.
+ */
+enum lock_res_type {
+	LRT_NORMAL,
+	LRT_NEW
+};
+
+/** Lock resource. */
+static inline void lock_res(struct ldlm_resource *res)
+{
+	spin_lock(&res->lr_lock);
+}
+
+/** Lock resource with a way to instruct lockdep code about nestedness-safe. */
+static inline void lock_res_nested(struct ldlm_resource *res,
+				   enum lock_res_type mode)
+{
+	spin_lock_nested(&res->lr_lock, mode);
+}
+
+/** Unlock resource. */
+static inline void unlock_res(struct ldlm_resource *res)
+{
+	spin_unlock(&res->lr_lock);
+}
+
+/** Assert that res is locked; unlike spin_is_locked() this is UP-safe. */
+static inline void check_res_locked(struct ldlm_resource *res)
+{
+	assert_spin_locked(&res->lr_lock);
+}
+
+struct ldlm_resource * lock_res_and_lock(struct ldlm_lock *lock);
+void unlock_res_and_lock(struct ldlm_lock *lock);
+
+/* ldlm_pool.c */
+/** \defgroup ldlm_pools Various LDLM pool related functions
+ * These are not used outside of ldlm.
+ * @{
+ */
+void ldlm_pools_recalc(ldlm_side_t client);
+int ldlm_pools_init(void);
+void ldlm_pools_fini(void);
+
+int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns,
+		   int idx, ldlm_side_t client);
+int ldlm_pool_shrink(struct ldlm_pool *pl, int nr,
+		     unsigned int gfp_mask);
+void ldlm_pool_fini(struct ldlm_pool *pl);
+int ldlm_pool_setup(struct ldlm_pool *pl, int limit);
+int ldlm_pool_recalc(struct ldlm_pool *pl);
+__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl);
+__u64 ldlm_pool_get_slv(struct ldlm_pool *pl);
+__u64 ldlm_pool_get_clv(struct ldlm_pool *pl);
+__u32 ldlm_pool_get_limit(struct ldlm_pool *pl);
+void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv);
+void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv);
+void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit);
+void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock);
+void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock);
+/** @} */
+
+#endif
+/** @} LDLM */
diff --git a/drivers/staging/lustre/lustre/include/lustre_eacl.h b/drivers/staging/lustre/lustre/include/lustre_eacl.h
new file mode 100644
index 000000000000..b94f76a3301b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_eacl.h
@@ -0,0 +1,95 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lustre/include/lustre_eacl.h
+ *
+ * MDS data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_EACL_H
+#define _LUSTRE_EACL_H
+
+/** \defgroup eacl eacl
+ *
+ * @{
+ */
+
+#ifdef CONFIG_FS_POSIX_ACL
+
+#include <linux/posix_acl_xattr.h>
+
+typedef struct {
+	__u16		   e_tag;
+	__u16		   e_perm;
+	__u32		   e_id;
+	__u32		   e_stat;
+} ext_acl_xattr_entry;
+
+typedef struct {
+	__u32		   a_count;
+	ext_acl_xattr_entry     a_entries[0];
+} ext_acl_xattr_header;
+
+#define CFS_ACL_XATTR_SIZE(count, prefix) \
+	(sizeof(prefix ## _header) + (count) * sizeof(prefix ## _entry))
+
+#define CFS_ACL_XATTR_COUNT(size, prefix) \
+	(((size) - sizeof(prefix ## _header)) / sizeof(prefix ## _entry))
+
+
+extern ext_acl_xattr_header *
+lustre_posix_acl_xattr_2ext(posix_acl_xattr_header *header, int size);
+extern int
+lustre_posix_acl_xattr_filter(posix_acl_xattr_header *header, int size,
+			      posix_acl_xattr_header **out);
+extern void
+lustre_posix_acl_xattr_free(posix_acl_xattr_header *header, int size);
+extern void
+lustre_ext_acl_xattr_free(ext_acl_xattr_header *header);
+extern int
+lustre_acl_xattr_merge2posix(posix_acl_xattr_header *posix_header, int size,
+			     ext_acl_xattr_header *ext_header,
+			     posix_acl_xattr_header **out);
+extern ext_acl_xattr_header *
+lustre_acl_xattr_merge2ext(posix_acl_xattr_header *posix_header, int size,
+			   ext_acl_xattr_header *ext_header);
+
+#endif /* CONFIG_FS_POSIX_ACL */
+
+/** @} eacl */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_export.h b/drivers/staging/lustre/lustre/include/lustre_export.h
new file mode 100644
index 000000000000..d61c020a4643
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_export.h
@@ -0,0 +1,389 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/** \defgroup obd_export PortalRPC export definitions
+ *
+ * @{
+ */
+
+#ifndef __EXPORT_H
+#define __EXPORT_H
+
+/** \defgroup export export
+ *
+ * @{
+ */
+
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_dlm.h>
+
+struct mds_client_data;
+struct mdt_client_data;
+struct mds_idmap_table;
+struct mdt_idmap_table;
+
+/**
+ * Target-specific export data
+ */
+struct tg_export_data {
+	/** Protects ted_lcd below */
+	struct mutex		ted_lcd_lock;
+	/** Per-client data for each export */
+	struct lsd_client_data	*ted_lcd;
+	/** Offset of record in last_rcvd file */
+	loff_t			ted_lr_off;
+	/** Client index in last_rcvd file */
+	int			ted_lr_idx;
+};
+
+/**
+ * MDT-specific export data
+ */
+struct mdt_export_data {
+	struct tg_export_data	med_ted;
+	/** List of all files opened by client on this MDT */
+	struct list_head		med_open_head;
+	spinlock_t		med_open_lock; /* med_open_head, mfd_list */
+	/** Bitmask of all ibit locks this MDT understands */
+	__u64			med_ibits_known;
+	struct mutex		med_idmap_mutex;
+	struct lustre_idmap_table *med_idmap;
+};
+
+struct ec_export_data { /* echo client */
+	struct list_head eced_locks;
+};
+
+/* In-memory access to client data from OST struct */
+/** Filter (oss-side) specific export data */
+struct filter_export_data {
+	struct tg_export_data	fed_ted;
+	spinlock_t		fed_lock;	/**< protects fed_mod_list */
+	long		       fed_dirty;    /* in bytes */
+	long		       fed_grant;    /* in bytes */
+	struct list_head		 fed_mod_list; /* files being modified */
+	int			fed_mod_count;/* items in fed_mod_list */
+	long		       fed_pending;  /* bytes just being written */
+	__u32		      fed_group;
+	__u8		       fed_pagesize; /* log2 of client page size */
+};
+
+struct mgs_export_data {
+	struct list_head		med_clients;	/* mgc fs client via this exp */
+	spinlock_t		med_lock;	/* protect med_clients */
+};
+
+/**
+ * per-NID statistics structure.
+ * It tracks access patterns to this export on a per-client-NID basis
+ */
+struct nid_stat {
+	lnet_nid_t	       nid;
+	struct hlist_node	 nid_hash;
+	struct list_head	       nid_list;
+	struct obd_device       *nid_obd;
+	struct proc_dir_entry   *nid_proc;
+	struct lprocfs_stats    *nid_stats;
+	struct lprocfs_stats    *nid_ldlm_stats;
+	atomic_t	     nid_exp_ref_count; /* for obd_nid_stats_hash
+							   exp_nid_stats */
+};
+
+#define nidstat_getref(nidstat)						\
+do {									   \
+	atomic_inc(&(nidstat)->nid_exp_ref_count);			 \
+} while(0)
+
+#define nidstat_putref(nidstat)						\
+do {	/* drop one ref; check the value this very decrement produced */ \
+	int __nid_refs = atomic_dec_return(&(nidstat)->nid_exp_ref_count); \
+	LASSERTF(__nid_refs >= 0,					\
+		 "stat %p nid_exp_ref_count < 0\n", (nidstat));		\
+} while (0)
+
+enum obd_option {
+	OBD_OPT_FORCE =	 0x0001,
+	OBD_OPT_FAILOVER =      0x0002,
+	OBD_OPT_ABORT_RECOV =   0x0004,
+};
+
+/**
+ * Export structure. Represents target-side of connection in portals.
+ * Also used in Lustre to connect between layers on the same node when
+ * there is no network-connection in-between.
+ * For every connected client there is an export structure on the server
+ * attached to the same obd device.
+ */
+struct obd_export {
+	/**
+	 * Export handle, its id is provided to client on connect
+	 * Subsequent client RPCs contain this handle id to identify
+	 * what export they are talking to.
+	 */
+	struct portals_handle     exp_handle;
+	atomic_t	      exp_refcount;
+	/**
+	 * Set of counters below is to track where export references are
+	 * kept. The exp_rpc_count is used for reconnect handling also,
+	 * the cb_count and locks_count are for debug purposes only for now.
+	 * The sum of them should be less than exp_refcount by 3
+	 */
+	atomic_t	      exp_rpc_count; /* RPC references */
+	atomic_t	      exp_cb_count; /* Commit callback references */
+	/** Number of queued replay requests to be processed */
+	atomic_t		  exp_replay_count;
+	atomic_t	      exp_locks_count; /** Lock references */
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+	struct list_head		exp_locks_list;
+	spinlock_t		  exp_locks_list_guard;
+#endif
+	/** UUID of client connected to this export */
+	struct obd_uuid	   exp_client_uuid;
+	/** To link all exports on an obd device */
+	struct list_head		exp_obd_chain;
+	struct hlist_node	  exp_uuid_hash; /** uuid-export hash*/
+	struct hlist_node	  exp_nid_hash; /** nid-export hash */
+	/**
+	 * All exports eligible for ping evictor are linked into a list
+	 * through this field in "most time since last request on this export"
+	 * order
+	 * protected by obd_dev_lock
+	 */
+	struct list_head		exp_obd_chain_timed;
+	/** Obd device of this export */
+	struct obd_device	*exp_obd;
+	/**
+	 * "reverse" import to send requests (e.g. from ldlm) back to client
+	 * exp_lock protect its change
+	 */
+	struct obd_import	*exp_imp_reverse;
+	struct nid_stat	  *exp_nid_stats;
+	struct lprocfs_stats     *exp_md_stats;
+	/** Active connection */
+	struct ptlrpc_connection *exp_connection;
+	/** Connection count value from last successful reconnect rpc */
+	__u32		     exp_conn_cnt;
+	/** Hash list of all ldlm locks granted on this export */
+	cfs_hash_t	       *exp_lock_hash;
+	/**
+	 * Hash list for Posix lock deadlock detection, added with
+	 * ldlm_lock::l_exp_flock_hash.
+	 */
+	cfs_hash_t	       *exp_flock_hash;
+	struct list_head		exp_outstanding_replies;
+	struct list_head		exp_uncommitted_replies;
+	spinlock_t		  exp_uncommitted_replies_lock;
+	/** Last committed transno for this export */
+	__u64		     exp_last_committed;
+	/** When was last request received */
+	cfs_time_t		exp_last_request_time;
+	/** On replay all requests waiting for replay are linked here */
+	struct list_head		exp_req_replay_queue;
+	/**
+	 * protects exp_flags, exp_outstanding_replies and the change
+	 * of exp_imp_reverse
+	 */
+	spinlock_t		  exp_lock;
+	/** Compatibility flags for this export are embedded into
+	 *  exp_connect_data */
+	struct obd_connect_data   exp_connect_data;
+	enum obd_option	   exp_flags;
+	unsigned long	     exp_failed:1,
+				  exp_in_recovery:1,
+				  exp_disconnected:1,
+				  exp_connecting:1,
+				  /** VBR: export missed recovery */
+				  exp_delayed:1,
+				  /** VBR: failed version checking */
+				  exp_vbr_failed:1,
+				  exp_req_replay_needed:1,
+				  exp_lock_replay_needed:1,
+				  exp_need_sync:1,
+				  exp_flvr_changed:1,
+				  exp_flvr_adapt:1,
+				  exp_libclient:1, /* liblustre client? */
+				  /* client timed out and tried to reconnect,
+				   * but couldn't because of active rpcs */
+				  exp_abort_active_req:1,
+				  /* if to swap nidtbl entries for 2.2 clients.
+				   * Only used by the MGS to fix LU-1644. */
+				  exp_need_mne_swab:1;
+	/* also protected by exp_lock */
+	enum lustre_sec_part      exp_sp_peer;
+	struct sptlrpc_flavor     exp_flvr;	     /* current */
+	struct sptlrpc_flavor     exp_flvr_old[2];      /* about-to-expire */
+	cfs_time_t		exp_flvr_expire[2];   /* seconds */
+
+	/** protects exp_hp_rpcs */
+	spinlock_t		  exp_rpc_lock;
+	struct list_head		  exp_hp_rpcs;	/* (potential) HP RPCs */
+
+	/** blocking dlm lock list, protected by exp_bl_list_lock */
+	struct list_head		exp_bl_list;
+	spinlock_t		  exp_bl_list_lock;
+
+	/** Target specific data */
+	union {
+		struct tg_export_data     eu_target_data;
+		struct mdt_export_data    eu_mdt_data;
+		struct filter_export_data eu_filter_data;
+		struct ec_export_data     eu_ec_data;
+		struct mgs_export_data    eu_mgs_data;
+	} u;
+};
+
+#define exp_target_data u.eu_target_data
+#define exp_mdt_data    u.eu_mdt_data
+#define exp_filter_data u.eu_filter_data
+#define exp_ec_data     u.eu_ec_data
+
+static inline __u64 *exp_connect_flags_ptr(struct obd_export *exp)
+{
+	return &exp->exp_connect_data.ocd_connect_flags;
+}
+
+static inline __u64 exp_connect_flags(struct obd_export *exp)
+{
+	return *exp_connect_flags_ptr(exp);
+}
+
+static inline int exp_max_brw_size(struct obd_export *exp)
+{
+	LASSERT(exp != NULL);
+	if (!(exp_connect_flags(exp) & OBD_CONNECT_BRW_SIZE))
+		return ONE_MB_BRW_SIZE;	/* flag not set: fixed fallback */
+
+	return exp->exp_connect_data.ocd_brw_size;
+}
+
+static inline int exp_connect_multibulk(struct obd_export *exp)
+{
+	return exp_max_brw_size(exp) > ONE_MB_BRW_SIZE;
+}
+
+static inline int exp_expired(struct obd_export *exp, cfs_duration_t age)
+{
+	LASSERT(exp->exp_delayed);
+	return cfs_time_before(cfs_time_add(exp->exp_last_request_time, age),
+			       cfs_time_current_sec());
+}
+
+static inline int exp_connect_cancelset(struct obd_export *exp)
+{
+	LASSERT(exp != NULL);
+	return (exp_connect_flags(exp) & OBD_CONNECT_CANCELSET) != 0;
+}
+
+static inline int exp_connect_lru_resize(struct obd_export *exp)
+{
+	LASSERT(exp != NULL);
+	return (exp_connect_flags(exp) & OBD_CONNECT_LRU_RESIZE) != 0;
+}
+
+static inline int exp_connect_rmtclient(struct obd_export *exp)
+{
+	LASSERT(exp != NULL);
+	return (exp_connect_flags(exp) & OBD_CONNECT_RMT_CLIENT) != 0;
+}
+
+static inline int client_is_remote(struct obd_export *exp)
+{
+	struct obd_import *cli_imp = class_exp2cliimp(exp);
+	__u64 flags = cli_imp->imp_connect_data.ocd_connect_flags;
+
+	return (flags & OBD_CONNECT_RMT_CLIENT) != 0;
+}
+
+static inline int exp_connect_vbr(struct obd_export *exp)
+{
+	LASSERT(exp != NULL);
+	LASSERT(exp->exp_connection);
+	return (exp_connect_flags(exp) & OBD_CONNECT_VBR) != 0;
+}
+
+static inline int exp_connect_som(struct obd_export *exp)
+{
+	LASSERT(exp != NULL);
+	return (exp_connect_flags(exp) & OBD_CONNECT_SOM) != 0;
+}
+
+static inline int exp_connect_umask(struct obd_export *exp)
+{
+	return (exp_connect_flags(exp) & OBD_CONNECT_UMASK) != 0;
+}
+
+static inline int imp_connect_lru_resize(struct obd_import *imp)
+{
+	__u64 flags;
+
+	LASSERT(imp != NULL);
+	flags = imp->imp_connect_data.ocd_connect_flags;
+	return (flags & OBD_CONNECT_LRU_RESIZE) != 0;
+}
+
+static inline int exp_connect_layout(struct obd_export *exp)
+{
+	return (exp_connect_flags(exp) & OBD_CONNECT_LAYOUTLOCK) != 0;
+}
+
+/* True when OBD_CONNECT_LVB_TYPE is set in the export's connect flags. */
+static inline bool exp_connect_lvb_type(struct obd_export *exp)
+{
+	LASSERT(exp != NULL);
+	if (!(exp_connect_flags(exp) & OBD_CONNECT_LVB_TYPE))
+		return false;
+	return true;
+}
+
+/* True when OBD_CONNECT_LVB_TYPE is set in the import's connect flags. */
+static inline bool imp_connect_lvb_type(struct obd_import *imp)
+{
+	__u64 flags;
+
+	LASSERT(imp != NULL);
+	flags = imp->imp_connect_data.ocd_connect_flags;
+	if (flags & OBD_CONNECT_LVB_TYPE)
+		return true;
+	return false;
+}
+
+extern struct obd_export *class_conn2export(struct lustre_handle *conn);
+extern struct obd_device *class_conn2obd(struct lustre_handle *conn);
+
+/** @} export */
+
+#endif /* __EXPORT_H */
+/** @} obd_export */
diff --git a/drivers/staging/lustre/lustre/include/lustre_fid.h b/drivers/staging/lustre/lustre/include/lustre_fid.h
new file mode 100644
index 000000000000..acaa1c478bba
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_fid.h
@@ -0,0 +1,761 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_fid.h
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#ifndef __LINUX_FID_H
+#define __LINUX_FID_H
+
+/** \defgroup fid fid
+ *
+ * @{
+ *
+ * http://wiki.lustre.org/index.php/Architecture_-_Interoperability_fids_zfs
+ * describes the FID namespace and interoperability requirements for FIDs.
+ * The important parts of that document are included here for reference.
+ *
+ * FID
+ *   File IDentifier generated by client from range allocated by the SEQuence
+ *   service and stored in struct lu_fid. The FID is composed of three parts:
+ *   SEQuence, ObjectID, and VERsion.  The SEQ component is a filesystem
+ *   unique 64-bit integer, and only one client is ever assigned any SEQ value.
+ *   The first 0x400 FID_SEQ_NORMAL [2^33, 2^33 + 0x400] values are reserved
+ *   for system use.  The OID component is a 32-bit value generated by the
+ *   client on a per-SEQ basis to allow creating many unique FIDs without
+ *   communication with the server.  The VER component is a 32-bit value that
+ *   distinguishes between different FID instantiations, such as snapshots or
+ *   separate subtrees within the filesystem.  FIDs with the same VER field
+ *   are considered part of the same namespace.
+ *
+ * OLD filesystems are those upgraded from Lustre 1.x that predate FIDs, and
+ *   MDTs use 32-bit ldiskfs internal inode/generation numbers (IGIFs), while
+ *   OSTs use 64-bit Lustre object IDs and generation numbers.
+ *
+ * NEW filesystems are those formatted since the introduction of FIDs.
+ *
+ * IGIF
+ *   Inode and Generation In FID, a surrogate FID used to globally identify
+ *   an existing object on OLD formatted MDT file system. This would only be
+ *   used on MDT0 in a DNE filesystem, because there cannot be more than one
+ *   MDT in an OLD formatted filesystem. Belongs to sequence in [12, 2^32 - 1]
+ *   range, where inode number is stored in SEQ, and inode generation is in OID.
+ *   NOTE: This assumes no more than 2^32-1 inodes exist in the MDT filesystem,
+ *   which is the maximum possible for an ldiskfs backend.  It also assumes
+ *   that the reserved ext3/ext4/ldiskfs inode numbers [0-11] are never visible
+ *   to clients, which has always been true.
+ *
+ * IDIF
+ *   object ID In FID, a surrogate FID used to globally identify an existing
+ *   OST object on OLD formatted OST file system. Belongs to a sequence in
+ *   [2^32, 2^33 - 1]. Sequence number is calculated as:
+ *
+ *      1 << 32 | (ost_index << 16) | ((objid >> 32) & 0xffff)
+ *
+ *   that is, SEQ consists of 16-bit OST index, and higher 16 bits of object
+ *   ID. The generation of unique SEQ values per OST allows the IDIF FIDs to
+ *   be identified in the FLD correctly. The OID field is calculated as:
+ *
+ *      objid & 0xffffffff
+ *
+ *   that is, it consists of lower 32 bits of object ID.  For objects within
+ *   the IDIF range, object ID extraction will be:
+ *
+ *      o_id = (fid->f_seq & 0x7fff) << 16 | fid->f_oid;
+ *      o_seq = 0;  // formerly group number
+ *
+ *   NOTE: This assumes that no more than 2^48-1 objects have ever been created
+ *   on any OST, and that no more than 65535 OSTs are in use.  Both are very
+ *   reasonable assumptions, i.e. an IDIF can uniquely map all objects assuming
+ *   a maximum creation rate of 1M objects per second for a maximum of 9 years,
+ *   or combinations thereof.
+ *
+ * OST_MDT0
+ *   Surrogate FID used to identify an existing object on OLD formatted OST
+ *   filesystem. Belongs to the reserved SEQuence 0, and is used prior to
+ *   the introduction of FID-on-OST, at which point IDIF will be used to
+ *   identify objects as residing on a specific OST.
+ *
+ * LLOG
+ *   For Lustre Log objects the object sequence 1 is used. This is compatible
+ *   with both OLD and NEW namespaces, as this SEQ number is in the
+ *   ext3/ldiskfs reserved inode range and does not conflict with IGIF
+ *   sequence numbers.
+ *
+ * ECHO
+ *   For testing OST IO performance the object sequence 2 is used. This is
+ *   compatible with both OLD and NEW namespaces, as this SEQ number is in
+ *   the ext3/ldiskfs reserved inode range and does not conflict with IGIF
+ *   sequence numbers.
+ *
+ * OST_MDT1 .. OST_MAX
+ *   For testing with multiple MDTs the object sequence 3 through 9 is used,
+ *   allowing direct mapping of MDTs 1 through 7 respectively, for a total
+ *   of 8 MDTs including OST_MDT0. This matches the legacy CMD project "group"
+ *   mappings. However, this SEQ range is only for testing prior to any
+ *   production DNE release, as the objects in this range conflict across all
+ *   OSTs, as the OST index is not part of the FID.  For production DNE usage,
+ *   OST objects created by MDT1+ will use FID_SEQ_NORMAL FIDs.
+ *
+ * DLM OST objid to IDIF mapping
+ *   For compatibility with existing OLD OST network protocol structures, the
+ *   FID must map onto the o_id and o_seq in a manner that ensures existing
+ *   objects are identified consistently for IO, as well as onto the LDLM
+ *   namespace to ensure that for IDIFs there is only a single resource name
+ *   for any object in the DLM.  The OLD OST object DLM resource mapping is:
+ *
+ *      resource[] = {o_id, o_seq, 0, 0}; // o_seq == 0 for production releases
+ *
+ *   The NEW OST object DLM resource mapping is the same for both MDT and OST:
+ *
+ *      resource[] = {SEQ, OID, VER, HASH};
+ *
+ *  NOTE: for mapping IDIF values to DLM resource names the o_id may be
+ *  larger than the 2^33 reserved sequence numbers for IDIF, so it is possible
+ *  for the o_id numbers to overlap FID SEQ numbers in the resource. However,
+ *  in all production releases the OLD o_seq field is always zero, and all
+ *  valid FID OID values are non-zero, so the lock resources will not collide.
+ *  Even so, the MDT and OST resources are also in different LDLM namespaces.
+ */
+
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_req_layout.h>
+#include <lustre_mdt.h>
+#include <obd.h>
+
+
+struct lu_site;
+struct lu_context;
+
+/* Whole sequences space range and zero range definitions */
+extern const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE;
+extern const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE;
+extern const struct lu_fid LUSTRE_BFL_FID;
+extern const struct lu_fid LU_OBF_FID;
+extern const struct lu_fid LU_DOT_LUSTRE_FID;
+
+enum {
+	/*
+	 * This is how many metadata FIDs may be allocated in one sequence(128k)
+	 */
+	LUSTRE_METADATA_SEQ_MAX_WIDTH = 0x0000000000020000ULL,
+
+	/*
+	 * This is how many data FIDs could be allocated in one sequence(4B - 1)
+	 */
+	LUSTRE_DATA_SEQ_MAX_WIDTH = 0x00000000FFFFFFFFULL,
+
+	/*
+	 * How many sequences to allocate to a client at once.
+	 */
+	LUSTRE_SEQ_META_WIDTH = 0x0000000000000001ULL,
+
+	/*
+	 * seq allocation pool size.
+	 */
+	LUSTRE_SEQ_BATCH_WIDTH = LUSTRE_SEQ_META_WIDTH * 1000,
+
+	/*
+	 * This is how many sequences may be in one super-sequence allocated to
+	 * MDTs.
+	 */
+	LUSTRE_SEQ_SUPER_WIDTH = ((1ULL << 30ULL) * LUSTRE_SEQ_META_WIDTH)
+};
+
+enum {
+	/** 2^6 FIDs for OI containers */
+	OSD_OI_FID_OID_BITS     = 6,
+	/** reserve enough FIDs in case we want more in the future */
+	OSD_OI_FID_OID_BITS_MAX = 10,
+};
+
+/** special OID for local objects */
+enum local_oid {
+	/** \see fld_mod_init */
+	FLD_INDEX_OID		= 3UL,
+	/** \see fid_mod_init */
+	FID_SEQ_CTL_OID		= 4UL,
+	FID_SEQ_SRV_OID		= 5UL,
+	/** \see mdd_mod_init */
+	MDD_ROOT_INDEX_OID	= 6UL, /* deprecated in 2.4 */
+	MDD_ORPHAN_OID		= 7UL, /* deprecated in 2.4 */
+	MDD_LOV_OBJ_OID		= 8UL,
+	MDD_CAPA_KEYS_OID	= 9UL,
+	/** \see mdt_mod_init */
+	LAST_RECV_OID		= 11UL,
+	OSD_FS_ROOT_OID		= 13UL,
+	ACCT_USER_OID		= 15UL,
+	ACCT_GROUP_OID		= 16UL,
+	LFSCK_BOOKMARK_OID	= 17UL,
+	OTABLE_IT_OID		= 18UL,
+	/* These two definitions are obsolete
+	 * OFD_GROUP0_LAST_OID     = 20UL,
+	 * OFD_GROUP4K_LAST_OID    = 20UL+4096,
+	 */
+	OFD_LAST_GROUP_OID	= 4117UL,
+	LLOG_CATALOGS_OID	= 4118UL,
+	MGS_CONFIGS_OID		= 4119UL,
+	OFD_HEALTH_CHECK_OID	= 4120UL,
+	MDD_LOV_OBJ_OSEQ	= 4121UL,
+	LFSCK_NAMESPACE_OID     = 4122UL,
+	REMOTE_PARENT_DIR_OID	= 4123UL,
+};
+
+static inline void lu_local_obj_fid(struct lu_fid *fid, __u32 oid)
+{
+	fid->f_seq = FID_SEQ_LOCAL_FILE;
+	fid->f_oid = oid;
+	fid->f_ver = 0;
+}
+
+static inline void lu_local_name_obj_fid(struct lu_fid *fid, __u32 oid)
+{
+	fid->f_seq = FID_SEQ_LOCAL_NAME;
+	fid->f_oid = oid;
+	fid->f_ver = 0;
+}
+
+/* For new FS (>= 2.4), the root FID will be changed to
+ * [FID_SEQ_ROOT:1:0], for existing FS, (upgraded to 2.4),
+ * the root FID will still be IGIF */
+static inline int fid_is_root(const struct lu_fid *fid)
+{
+	return unlikely((fid_seq(fid) == FID_SEQ_ROOT &&
+			 fid_oid(fid) == 1));
+}
+
+static inline int fid_is_dot_lustre(const struct lu_fid *fid)
+{
+	return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE &&
+			fid_oid(fid) == FID_OID_DOT_LUSTRE);
+}
+
+static inline int fid_is_obf(const struct lu_fid *fid)
+{
+	return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE &&
+			fid_oid(fid) == FID_OID_DOT_LUSTRE_OBF);
+}
+
+static inline int fid_is_otable_it(const struct lu_fid *fid)
+{
+	return unlikely(fid_seq(fid) == FID_SEQ_LOCAL_FILE &&
+			fid_oid(fid) == OTABLE_IT_OID);
+}
+
+static inline int fid_is_acct(const struct lu_fid *fid)
+{
+	return fid_seq(fid) == FID_SEQ_LOCAL_FILE &&
+	       (fid_oid(fid) == ACCT_USER_OID ||
+		fid_oid(fid) == ACCT_GROUP_OID);
+}
+
+static inline int fid_is_quota(const struct lu_fid *fid)
+{
+	return fid_seq(fid) == FID_SEQ_QUOTA ||
+	       fid_seq(fid) == FID_SEQ_QUOTA_GLB;
+}
+
+static inline int fid_is_namespace_visible(const struct lu_fid *fid)
+{
+	const __u64 seq = fid_seq(fid);
+
+	/* Here, we cannot distinguish whether the normal FID is for OST
+	 * object or not. It is caller's duty to check more if needed. */
+	return (!fid_is_last_id(fid) &&
+		(fid_seq_is_norm(seq) || fid_seq_is_igif(seq))) ||
+	       fid_is_root(fid) || fid_is_dot_lustre(fid);
+}
+
+static inline int fid_seq_in_fldb(__u64 seq)
+{
+	return fid_seq_is_igif(seq) || fid_seq_is_norm(seq) ||
+	       fid_seq_is_root(seq) || fid_seq_is_dot(seq);
+}
+
+static inline void lu_last_id_fid(struct lu_fid *fid, __u64 seq)
+{
+	if (fid_seq_is_mdt0(seq)) {
+		fid->f_seq = fid_idif_seq(0, 0);
+	} else {
+		LASSERTF(fid_seq_is_norm(seq) || fid_seq_is_echo(seq) ||
+			 fid_seq_is_idif(seq), LPX64"\n", seq);
+		fid->f_seq = seq;
+	}
+	fid->f_oid = 0;
+	fid->f_ver = 0;
+}
+
+enum lu_mgr_type {
+	LUSTRE_SEQ_SERVER,
+	LUSTRE_SEQ_CONTROLLER
+};
+
+struct lu_server_seq;
+
+/* Client sequence manager interface. */
+struct lu_client_seq {
+	/* Sequence-controller export. */
+	struct obd_export      *lcs_exp;
+	struct mutex		lcs_mutex;
+
+	/*
+	 * Range of sequences allowed for allocation. When using lu_client_seq on
+	 * clients, this contains meta-sequence range. And for servers this
+	 * contains super-sequence range.
+	 */
+	struct lu_seq_range	 lcs_space;
+
+	/* Seq related proc */
+	proc_dir_entry_t   *lcs_proc_dir;
+
+	/* This holds last allocated fid in last obtained seq */
+	struct lu_fid	   lcs_fid;
+
+	/* LUSTRE_SEQ_METADATA or LUSTRE_SEQ_DATA */
+	enum lu_cli_type	lcs_type;
+
+	/*
+	 * Service uuid, passed from MDT + seq name to form unique seq name to
+	 * use it with procfs.
+	 */
+	char		    lcs_name[80];
+
+	/*
+	 * Sequence width, that is how many objects may be allocated in one
+	 * sequence. Default value for it is LUSTRE_SEQ_MAX_WIDTH.
+	 */
+	__u64		   lcs_width;
+
+	/* Seq-server for direct talking */
+	struct lu_server_seq   *lcs_srv;
+
+	/* wait queue for fid allocation and update indicator */
+	wait_queue_head_t	     lcs_waitq;
+	int		     lcs_update;
+};
+
+/* server sequence manager interface */
+struct lu_server_seq {
+	/* Available sequences space */
+	struct lu_seq_range	 lss_space;
+
+	/* keeps highwater in lsr_end for seq allocation algorithm */
+	struct lu_seq_range	 lss_lowater_set;
+	struct lu_seq_range	 lss_hiwater_set;
+
+	/*
+	 * Device for server side seq manager needs (saving sequences to backing
+	 * store).
+	 */
+	struct dt_device       *lss_dev;
+
+	/* /seq file object device */
+	struct dt_object       *lss_obj;
+
+	/* Seq related proc */
+	proc_dir_entry_t   *lss_proc_dir;
+
+	/* LUSTRE_SEQ_SERVER or LUSTRE_SEQ_CONTROLLER */
+	enum lu_mgr_type       lss_type;
+
+	/* Client interface to request controller */
+	struct lu_client_seq   *lss_cli;
+
+	/* Mutex for protecting allocation */
+	struct mutex		lss_mutex;
+
+	/*
+	 * Service uuid, passed from MDT + seq name to form unique seq name to
+	 * use it with procfs.
+	 */
+	char		    lss_name[80];
+
+	/*
+	 * Allocation chunks for super and meta sequences. Default values are
+	 * LUSTRE_SEQ_SUPER_WIDTH and LUSTRE_SEQ_META_WIDTH.
+	 */
+	__u64		   lss_width;
+
+	/*
+	 * minimum lss_alloc_set size that should be allocated from
+	 * lss_space
+	 */
+	__u64		   lss_set_width;
+
+	/* sync is needed for update operation */
+	__u32		   lss_need_sync;
+
+	/**
+	 * Pointer to site object, required to access site fld.
+	 */
+	struct seq_server_site  *lss_site;
+};
+
+int seq_query(struct com_thread_info *info);
+int seq_handle(struct ptlrpc_request *req);
+
+/* Server methods */
+int seq_server_init(struct lu_server_seq *seq,
+		    struct dt_device *dev,
+		    const char *prefix,
+		    enum lu_mgr_type type,
+		    struct seq_server_site *ss,
+		    const struct lu_env *env);
+
+void seq_server_fini(struct lu_server_seq *seq,
+		     const struct lu_env *env);
+
+int seq_server_alloc_super(struct lu_server_seq *seq,
+			   struct lu_seq_range *out,
+			   const struct lu_env *env);
+
+int seq_server_alloc_meta(struct lu_server_seq *seq,
+			  struct lu_seq_range *out,
+			  const struct lu_env *env);
+
+int seq_server_set_cli(struct lu_server_seq *seq,
+		       struct lu_client_seq *cli,
+		       const struct lu_env *env);
+
+/* Client methods */
+int seq_client_init(struct lu_client_seq *seq,
+		    struct obd_export *exp,
+		    enum lu_cli_type type,
+		    const char *prefix,
+		    struct lu_server_seq *srv);
+
+void seq_client_fini(struct lu_client_seq *seq);
+
+void seq_client_flush(struct lu_client_seq *seq);
+
+int seq_client_alloc_fid(const struct lu_env *env, struct lu_client_seq *seq,
+			 struct lu_fid *fid);
+int seq_client_get_seq(const struct lu_env *env, struct lu_client_seq *seq,
+		       seqno_t *seqnr);
+int seq_site_fini(const struct lu_env *env, struct seq_server_site *ss);
+/* Fids common stuff */
+int fid_is_local(const struct lu_env *env,
+		 struct lu_site *site, const struct lu_fid *fid);
+
+int client_fid_init(struct obd_device *obd, struct obd_export *exp,
+		    enum lu_cli_type type);
+int client_fid_fini(struct obd_device *obd);
+
+/* fid locking */
+
+struct ldlm_namespace;
+
+/*
+ * Build (DLM) resource name from FID.
+ *
+ * NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2],
+ * but was moved into name[1] along with the OID to avoid consuming the
+ * remaining name[2,3] fields that need to be used for the quota identifier.
+ */
+static inline struct ldlm_res_id *
+fid_build_reg_res_name(const struct lu_fid *f,
+		       struct ldlm_res_id *name)
+{
+	memset(name, 0, sizeof *name);
+	name->name[LUSTRE_RES_ID_SEQ_OFF] = fid_seq(f);
+	name->name[LUSTRE_RES_ID_VER_OID_OFF] = fid_ver_oid(f);
+	return name;
+}
+
+/*
+ * Build (DLM) resource identifier from global quota FID and quota ID.
+ */
+static inline struct ldlm_res_id *
+fid_build_quota_resid(const struct lu_fid *glb_fid, union lquota_id *qid,
+		      struct ldlm_res_id *res)
+{
+	fid_build_reg_res_name(glb_fid, res);
+	res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF] = fid_seq(&qid->qid_fid);
+	res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] = fid_ver_oid(&qid->qid_fid);
+	return res;
+}
+
+/* Split a quota resource name back into the global FID and the quota ID. */
+static inline void fid_extract_quota_resid(struct ldlm_res_id *res,
+					   struct lu_fid *glb_fid,
+					   union lquota_id *qid)
+{
+	const __u64 glb_vo = res->name[LUSTRE_RES_ID_VER_OID_OFF];
+	const __u64 qid_vo = res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF];
+	struct lu_fid *qfid = &qid->qid_fid;
+
+	glb_fid->f_seq = res->name[LUSTRE_RES_ID_SEQ_OFF];
+	glb_fid->f_oid = (__u32)glb_vo;		/* low 32 bits */
+	glb_fid->f_ver = (__u32)(glb_vo >> 32);	/* high 32 bits */
+	qfid->f_seq = res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF];
+	qfid->f_oid = (__u32)qid_vo;
+	qfid->f_ver = (__u32)(qid_vo >> 32);
+}
+
+/*
+ * Return true if resource is for object identified by fid.
+ */
+static inline int fid_res_name_eq(const struct lu_fid *f,
+				  const struct ldlm_res_id *name)
+{
+	return name->name[LUSTRE_RES_ID_SEQ_OFF] == fid_seq(f) &&
+	       name->name[LUSTRE_RES_ID_VER_OID_OFF] == fid_ver_oid(f);
+}
+
+/* reverse function of fid_build_reg_res_name() */
+static inline void fid_build_from_res_name(struct lu_fid *f,
+					   const struct ldlm_res_id *name)
+{
+	fid_zero(f);
+	f->f_seq = name->name[LUSTRE_RES_ID_SEQ_OFF];
+	f->f_oid = name->name[LUSTRE_RES_ID_VER_OID_OFF] & 0xffffffff;
+	f->f_ver = name->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32;
+	LASSERT(fid_res_name_eq(f, name));
+}
+
+static inline struct ldlm_res_id *
+fid_build_pdo_res_name(const struct lu_fid *f,
+		       unsigned int hash,
+		       struct ldlm_res_id *name)
+{
+	fid_build_reg_res_name(f, name);
+	name->name[LUSTRE_RES_ID_HSH_OFF] = hash;
+	return name;
+}
+
+/**
+ * Build DLM resource name from object id & seq, which will be removed
+ * finally, when we replace ost_id with FID in data stack.
+ *
+ * Currently, resid from the old client, whose res[0] = object_id,
+ * res[1] = object_seq, is just the opposite of the Metadata
+ * resid, where, res[0] = fid->f_seq, res[1] = fid->f_oid.
+ * To unify the resid identification, we will reverse the data
+ * resid to keep it same with Metadata resid, i.e.
+ *
+ * For resid from the old client,
+ *    res[0] = objid,  res[1] = 0, still keep the original order,
+ *    for compatibility.
+ *
+ * For new resid
+ *    res will be built from normal FID directly, i.e. res[0] = f_seq,
+ *    res[1] = f_oid + f_ver.
+ */
+static inline void ostid_build_res_name(struct ost_id *oi,
+					struct ldlm_res_id *name)
+{
+	memset(name, 0, sizeof *name);
+	if (fid_seq_is_mdt0(ostid_seq(oi))) {
+		name->name[LUSTRE_RES_ID_SEQ_OFF] = ostid_id(oi);
+		name->name[LUSTRE_RES_ID_VER_OID_OFF] = ostid_seq(oi);
+	} else {
+		fid_build_reg_res_name((struct lu_fid *)oi, name);
+	}
+}
+
+static inline void ostid_res_name_to_id(struct ost_id *oi,
+					struct ldlm_res_id *name)
+{
+	if (fid_seq_is_mdt0(name->name[LUSTRE_RES_ID_SEQ_OFF])) {
+		/* old resid (NOTE: ost_fid_from_resid() tests VER_OID_OFF) */
+		ostid_set_seq(oi, name->name[LUSTRE_RES_ID_VER_OID_OFF]);
+		ostid_set_id(oi, name->name[LUSTRE_RES_ID_SEQ_OFF]);
+	} else {
+		/* new resid */
+		fid_build_from_res_name((struct lu_fid *)oi, name);
+	}
+}
+
+/**
+ * Return true if the resource is for the object identified by this id & group.
+ */
+static inline int ostid_res_name_eq(struct ost_id *oi,
+				    struct ldlm_res_id *name)
+{
+	/* Shortcut: compare the raw slots rather than converting both sides
+	 * to FIDs; MDT0-sequence names store id and seq the other way round */
+	if (fid_seq_is_mdt0(ostid_seq(oi)))
+		return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_id(oi) &&
+		       name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_seq(oi);
+
+	/* FID-style name: sequence in the SEQ slot, id in the VER_OID slot */
+	return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_seq(oi) &&
+	       name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_id(oi);
+}
+
+/* The same as osc_build_res_name() */
+static inline void ost_fid_build_resid(const struct lu_fid *fid,
+				       struct ldlm_res_id *resname)
+{
+	struct ost_id oi;
+
+	if (!fid_is_mdt0(fid) && !fid_is_idif(fid)) {
+		/* ordinary FID: the name is built straight from the FID */
+		fid_build_reg_res_name(fid, resname);
+	} else if (fid_to_ostid(fid, &oi) == 0) {
+		ostid_build_res_name(&oi, resname);
+	}
+}
+
+static inline void ost_fid_from_resid(struct lu_fid *fid,
+				      const struct ldlm_res_id *name)
+{
+	struct ost_id oi;
+
+	if (!fid_seq_is_mdt0(name->name[LUSTRE_RES_ID_VER_OID_OFF])) {
+		fid_build_from_res_name(fid, name);	/* new resid */
+		return;
+	}
+	/* old resid: SEQ slot carries the id, VER_OID slot the sequence */
+	ostid_set_seq(&oi, name->name[LUSTRE_RES_ID_VER_OID_OFF]);
+	ostid_set_id(&oi, name->name[LUSTRE_RES_ID_SEQ_OFF]);
+	ostid_to_fid(fid, &oi, 0);
+}
+
+/**
+ * Flatten 128-bit FID values into a 64-bit value for use as an inode number.
+ * For non-IGIF FIDs this starts just over 2^32, and continues without
+ * conflict until 2^64, at which point we wrap the high 24 bits of the SEQ
+ * into the range where there may not be many OID values in use, to minimize
+ * the risk of conflict.
+ *
+ * Suppose LUSTRE_SEQ_MAX_WIDTH less than (1 << 24) which is currently true,
+ * the time between re-used inode numbers is very long - 2^40 SEQ numbers,
+ * or about 2^40 client mounts, if clients create less than 2^24 files/mount.
+ */
+static inline __u64 fid_flatten(const struct lu_fid *fid)
+{
+	__u64 ino;
+	__u64 seq;
+
+	if (fid_is_igif(fid)) {
+		ino = lu_igif_ino(fid);
+		RETURN(ino);
+	}
+	/* non-IGIF: build the number from SEQ and OID */
+	seq = fid_seq(fid);
+	/* low 40 SEQ bits sit above bit 24; SEQ bits 40..63 fold to 16..39 */
+	ino = (seq << 24) + ((seq >> 24) & 0xffffff0000ULL) + fid_oid(fid);
+	/* a zero result is replaced by the bare OID */
+	RETURN(ino ? ino : fid_oid(fid));
+}
+
+static inline __u32 fid_hash(const struct lu_fid *f, int bits)
+{
+	/* Hash the flattened FID; all objects with the same id and different
+	 * versions will belong to the same collision list. */
+	return cfs_hash_long(fid_flatten(f), bits);
+}
+
+/**
+ * map fid to 32 bit value for ino on 32bit systems. */
+static inline __u32 fid_flatten32(const struct lu_fid *fid)
+{
+	__u32 ino;
+	__u64 seq;
+
+	if (fid_is_igif(fid)) {
+		ino = lu_igif_ino(fid);
+		RETURN(ino);
+	}
+
+	seq = fid_seq(fid) - FID_SEQ_START;
+	/* NOTE(review): the "2^12 = 1024" below is off (2^12 is 4096) -- TODO */
+	/* Map the high bits of the OID into higher bits of the inode number so
+	 * that inodes generated at about the same time have a reduced chance
+	 * of collisions. This will give a period of 2^12 = 1024 unique clients
+	 * (from SEQ) and up to min(LUSTRE_SEQ_MAX_WIDTH, 2^20) = 128k objects
+	 * (from OID), or up to 128M inodes without collisions for new files. */
+	ino = ((seq & 0x000fffffULL) << 12) + ((seq >> 8) & 0xfffff000) +
+	       (seq >> (64 - (40-8)) & 0xffffff00) +
+	       (fid_oid(fid) & 0xff000fff) + ((fid_oid(fid) & 0x00fff000) << 8);
+	/* a zero result is replaced by the bare OID */
+	RETURN(ino ? ino : fid_oid(fid));
+}
+
+static inline int lu_fid_diff(const struct lu_fid *fid1,
+			      const struct lu_fid *fid2)
+{
+	LASSERTF(fid_seq(fid1) == fid_seq(fid2), "fid1:"DFID", fid2:"DFID"\n",
+		 PFID(fid1), PFID(fid2));
+	/* same SEQ asserted above; NOTE(review): difference narrows to int */
+	if (fid_is_idif(fid1) && fid_is_idif(fid2))
+		return fid_idif_id(fid1->f_seq, fid1->f_oid, fid1->f_ver) -
+		       fid_idif_id(fid2->f_seq, fid2->f_oid, fid2->f_ver);
+	return fid_oid(fid1) - fid_oid(fid2);	/* non-IDIF: OID delta */
+}
+
+#define LUSTRE_SEQ_SRV_NAME "seq_srv"
+#define LUSTRE_SEQ_CTL_NAME "seq_ctl"
+
+/* lu_seq_range byte-order conversion helpers */
+static inline void range_cpu_to_le(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+	dst->lsr_start = cpu_to_le64(src->lsr_start);
+	dst->lsr_end = cpu_to_le64(src->lsr_end);
+	dst->lsr_index = cpu_to_le32(src->lsr_index);
+	dst->lsr_flags = cpu_to_le32(src->lsr_flags);
+}
+
+static inline void range_le_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+	dst->lsr_start = le64_to_cpu(src->lsr_start);
+	dst->lsr_end = le64_to_cpu(src->lsr_end);
+	dst->lsr_index = le32_to_cpu(src->lsr_index);
+	dst->lsr_flags = le32_to_cpu(src->lsr_flags);
+}
+
+static inline void range_cpu_to_be(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+	dst->lsr_start = cpu_to_be64(src->lsr_start);
+	dst->lsr_end = cpu_to_be64(src->lsr_end);
+	dst->lsr_index = cpu_to_be32(src->lsr_index);
+	dst->lsr_flags = cpu_to_be32(src->lsr_flags);
+}
+
+static inline void range_be_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+	dst->lsr_start = be64_to_cpu(src->lsr_start);
+	dst->lsr_end = be64_to_cpu(src->lsr_end);
+	dst->lsr_index = be32_to_cpu(src->lsr_index);
+	dst->lsr_flags = be32_to_cpu(src->lsr_flags);
+}
+
+/** @} fid */
+
+#endif /* __LINUX_FID_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_fld.h b/drivers/staging/lustre/lustre/include/lustre_fld.h
new file mode 100644
index 000000000000..11e034a65b17
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_fld.h
@@ -0,0 +1,202 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_FLD_H
+#define __LINUX_FLD_H
+
+/** \defgroup fld fld
+ *
+ * @{
+ */
+
+#include <lustre/lustre_idl.h>
+#include <lustre_mdt.h>
+#include <dt_object.h>
+
+#include <linux/libcfs/libcfs.h>
+
+struct lu_client_fld;
+struct lu_server_fld;
+struct lu_fld_hash;
+struct fld_cache;
+
+extern const struct dt_index_features fld_index_features;
+extern const char fld_index_name[];
+
+/*
+ * FLD (Fid Location Database) interface.
+ */
+enum {
+	LUSTRE_CLI_FLD_HASH_DHT = 0,
+	LUSTRE_CLI_FLD_HASH_RRB
+};
+
+
+struct lu_fld_target {
+	struct list_head	       ft_chain;
+	struct obd_export       *ft_exp;
+	struct lu_server_fld    *ft_srv;
+	__u64		    ft_idx;
+};
+
+struct lu_server_fld {
+	/**
+	 * Fld dir proc entry. */
+	proc_dir_entry_t    *lsf_proc_dir;
+
+	/**
+	 * /fld file object device */
+	struct dt_object	*lsf_obj;
+
+	/**
+	 * super sequence controller export, needed to forward fld
+	 * lookup  request. */
+	struct obd_export       *lsf_control_exp;
+
+	/**
+	 * Client FLD cache. */
+	struct fld_cache	*lsf_cache;
+
+	/**
+	 * Protect index modifications */
+	struct mutex		lsf_lock;
+
+	/**
+	 * Fld service name in form "fld-srv-lustre-MDTXXX" */
+	char		     lsf_name[80];
+
+};
+
+struct lu_client_fld {
+	/**
+	 * Client side proc entry. */
+	proc_dir_entry_t    *lcf_proc_dir;
+
+	/**
+	 * List of exports client FLD knows about. */
+	struct list_head	       lcf_targets;
+
+	/**
+	 * Current hash to be used to choose an export. */
+	struct lu_fld_hash      *lcf_hash;
+
+	/**
+	 * Exports count. */
+	int		      lcf_count;
+
+	/**
+	 * Lock protecting exports list and fld_hash. */
+	spinlock_t		 lcf_lock;
+
+	/**
+	 * Client FLD cache. */
+	struct fld_cache	*lcf_cache;
+
+	/**
+	 * Client fld proc entry name. */
+	char		     lcf_name[80];
+
+	const struct lu_context *lcf_ctx;
+
+	int		      lcf_flags;
+};
+
+/**
+ * number of blocks to reserve for particular operations. Should be function of
+ * ... something. Stub for now.
+ */
+enum {
+	/* one insert operation can involve two delete and one insert */
+	FLD_TXN_INDEX_INSERT_CREDITS  = 60,
+	FLD_TXN_INDEX_DELETE_CREDITS  = 20,
+};
+
+int fld_query(struct com_thread_info *info);
+
+/* Server methods */
+int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld,
+		    struct dt_device *dt, const char *prefix, int mds_node_id,
+		    int type);
+
+void fld_server_fini(const struct lu_env *env, struct lu_server_fld *fld);
+
+int fld_declare_server_create(const struct lu_env *env,
+			      struct lu_server_fld *fld,
+			      struct lu_seq_range *new,
+			      struct thandle *th);
+
+int fld_server_create(const struct lu_env *env,
+		      struct lu_server_fld *fld,
+		      struct lu_seq_range *add_range,
+		      struct thandle *th);
+
+int fld_insert_entry(const struct lu_env *env,
+		     struct lu_server_fld *fld,
+		     const struct lu_seq_range *range);
+
+int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld,
+		      seqno_t seq, struct lu_seq_range *range);
+
+/* Client methods */
+int fld_client_init(struct lu_client_fld *fld,
+		    const char *prefix, int hash);
+
+void fld_client_fini(struct lu_client_fld *fld);
+
+void fld_client_flush(struct lu_client_fld *fld);
+
+int fld_client_lookup(struct lu_client_fld *fld, seqno_t seq, mdsno_t *mds,
+		      __u32 flags, const struct lu_env *env);
+
+int fld_client_create(struct lu_client_fld *fld,
+		      struct lu_seq_range *range,
+		      const struct lu_env *env);
+
+int fld_client_delete(struct lu_client_fld *fld,
+		      seqno_t seq,
+		      const struct lu_env *env);
+
+int fld_client_add_target(struct lu_client_fld *fld,
+			  struct lu_fld_target *tar);
+
+int fld_client_del_target(struct lu_client_fld *fld,
+			  __u64 idx);
+
+void fld_client_proc_fini(struct lu_client_fld *fld);
+
+/** @} fld */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_fsfilt.h b/drivers/staging/lustre/lustre/include/lustre_fsfilt.h
new file mode 100644
index 000000000000..9dcc332cb2f3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_fsfilt.h
@@ -0,0 +1,48 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_fsfilt.h
+ *
+ * Filesystem interface helper.
+ */
+
+#ifndef _LUSTRE_FSFILT_H
+#define _LUSTRE_FSFILT_H
+
+#include <linux/lustre_fsfilt.h>
+
+#define LU221_BAD_TIME (0x80000000U + 24 * 3600)
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_ha.h b/drivers/staging/lustre/lustre/include/lustre_ha.h
new file mode 100644
index 000000000000..105f6d61eef0
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_ha.h
@@ -0,0 +1,67 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_HA_H
+#define _LUSTRE_HA_H
+
+/** \defgroup ha ha
+ *
+ * @{
+ */
+
+struct obd_import;
+struct obd_export;
+struct obd_device;
+struct ptlrpc_request;
+
+
+int ptlrpc_replay(struct obd_import *imp);
+int ptlrpc_resend(struct obd_import *imp);
+void ptlrpc_free_committed(struct obd_import *imp);
+void ptlrpc_wake_delayed(struct obd_import *imp);
+int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async);
+int ptlrpc_set_import_active(struct obd_import *imp, int active);
+void ptlrpc_activate_import(struct obd_import *imp);
+void ptlrpc_deactivate_import(struct obd_import *imp);
+void ptlrpc_invalidate_import(struct obd_import *imp);
+void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt);
+int ptlrpc_check_suspend(void);
+void ptlrpc_activate_timeouts(struct obd_import *imp);
+void ptlrpc_deactivate_timeouts(struct obd_import *imp);
+
+/** @} ha */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_handles.h b/drivers/staging/lustre/lustre/include/lustre_handles.h
new file mode 100644
index 000000000000..fcd40f33426a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_handles.h
@@ -0,0 +1,93 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LUSTRE_HANDLES_H_
+#define __LUSTRE_HANDLES_H_
+
+/** \defgroup handles handles
+ *
+ * @{
+ */
+
+#include <linux/lustre_handles.h>
+
+#include <linux/libcfs/libcfs.h>
+
+
+struct portals_handle_ops {
+	void (*hop_addref)(void *object);
+	void (*hop_free)(void *object, int size);
+};
+
+/* These handles are most easily used by having them appear at the very top of
+ * whatever object that you want to make handles for.  ie:
+ *
+ * struct ldlm_lock {
+ *	 struct portals_handle handle;
+ *	 ...
+ * };
+ *
+ * Now you're able to assign the results of cookie2handle directly to an
+ * ldlm_lock.  If it's not at the top, you'll want to use container_of()
+ * to compute the start of the structure based on the handle field. */
+struct portals_handle {
+	struct list_head			h_link;
+	__u64				h_cookie;
+	struct portals_handle_ops	*h_ops;
+
+	/* newly added fields to handle the RCU issue. -jxiong */
+	cfs_rcu_head_t			h_rcu;
+	spinlock_t			h_lock;
+	unsigned int			h_size:31;
+	unsigned int			h_in:1;
+};
+#define RCU2HANDLE(rcu)    container_of(rcu, struct portals_handle, h_rcu)
+
+/* handles.c */
+
+/* Add a handle to the hash table */
+void class_handle_hash(struct portals_handle *,
+		       struct portals_handle_ops *ops);
+void class_handle_unhash(struct portals_handle *);
+void class_handle_hash_back(struct portals_handle *);
+void *class_handle2object(__u64 cookie);
+void class_handle_free_cb(cfs_rcu_head_t *);
+int class_handle_init(void);
+void class_handle_cleanup(void);
+
+/** @} handles */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_idmap.h b/drivers/staging/lustre/lustre/include/lustre_idmap.h
new file mode 100644
index 000000000000..084bdd6ab4db
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_idmap.h
@@ -0,0 +1,104 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lustre/include/lustre_idmap.h
+ *
+ * MDS data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_IDMAP_H
+#define _LUSTRE_IDMAP_H
+
+/** \defgroup idmap idmap
+ *
+ * @{
+ */
+
+#include <linux/libcfs/libcfs.h>
+
+#define CFS_NGROUPS_PER_BLOCK   ((int)(PAGE_CACHE_SIZE / sizeof(gid_t)))
+
+#define CFS_GROUP_AT(gi, i) \
+	((gi)->blocks[(i) / CFS_NGROUPS_PER_BLOCK][(i) % CFS_NGROUPS_PER_BLOCK])
+
+enum {
+	CFS_IC_NOTHING     = 0,    /* convert nothing */
+	CFS_IC_ALL	 = 1,    /* convert all items */
+	CFS_IC_MAPPED      = 2,    /* convert mapped uid/gid */
+	CFS_IC_UNMAPPED    = 3     /* convert unmapped uid/gid */
+};
+
+#define  CFS_IDMAP_NOTFOUND     (-1)
+
+#define CFS_IDMAP_HASHSIZE      32
+
+enum lustre_idmap_idx {
+	RMT_UIDMAP_IDX,
+	LCL_UIDMAP_IDX,
+	RMT_GIDMAP_IDX,
+	LCL_GIDMAP_IDX,
+	CFS_IDMAP_N_HASHES
+};
+
+struct lustre_idmap_table {
+	spinlock_t	lit_lock;
+	struct list_head	lit_idmaps[CFS_IDMAP_N_HASHES][CFS_IDMAP_HASHSIZE];
+};
+
+struct lu_ucred;
+
+extern void lustre_groups_from_list(group_info_t *ginfo, gid_t *glist);
+extern void lustre_groups_sort(group_info_t *group_info);
+extern int lustre_in_group_p(struct lu_ucred *mu, gid_t grp);
+
+extern int lustre_idmap_add(struct lustre_idmap_table *t,
+			    uid_t ruid, uid_t luid,
+			    gid_t rgid, gid_t lgid);
+extern int lustre_idmap_del(struct lustre_idmap_table *t,
+			    uid_t ruid, uid_t luid,
+			    gid_t rgid, gid_t lgid);
+extern int lustre_idmap_lookup_uid(struct lu_ucred *mu,
+				   struct lustre_idmap_table *t,
+				   int reverse, uid_t uid);
+extern int lustre_idmap_lookup_gid(struct lu_ucred *mu,
+				   struct lustre_idmap_table *t,
+				   int reverse, gid_t gid);
+extern struct lustre_idmap_table *lustre_idmap_init(void);
+extern void lustre_idmap_fini(struct lustre_idmap_table *t);
+
+/** @} idmap */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_import.h b/drivers/staging/lustre/lustre/include/lustre_import.h
new file mode 100644
index 000000000000..3a5dd6a94c08
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_import.h
@@ -0,0 +1,367 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/** \defgroup obd_import PtlRPC import definitions
+ * Imports are client-side representation of remote obd target.
+ *
+ * @{
+ */
+
+#ifndef __IMPORT_H
+#define __IMPORT_H
+
+/** \defgroup export export
+ *
+ * @{
+ */
+
+#include <lustre_handles.h>
+#include <lustre/lustre_idl.h>
+
+
+/**
+ * Adaptive Timeout stuff
+ *
+ * @{
+ */
+#define D_ADAPTTO D_OTHER
+#define AT_BINS 4		  /* "bin" means "N seconds of history" */
+#define AT_FLG_NOHIST 0x1	  /* use last reported value only */
+
+struct adaptive_timeout {
+	time_t		at_binstart;	 /* bin start time */
+	unsigned int	at_hist[AT_BINS];    /* timeout history bins */
+	unsigned int	at_flags;
+	unsigned int	at_current;	  /* current timeout value */
+	unsigned int	at_worst_ever;       /* worst-ever timeout value */
+	time_t		at_worst_time;       /* worst-ever timeout timestamp */
+	spinlock_t	at_lock;
+};
+
+struct ptlrpc_at_array {
+	struct list_head       *paa_reqs_array; /** array to hold requests */
+	__u32	     paa_size;       /** the size of array */
+	__u32	     paa_count;      /** the total count of reqs */
+	time_t	    paa_deadline;   /** the earliest deadline of reqs */
+	__u32	    *paa_reqs_count; /** the count of reqs in each entry */
+};
+
+#define IMP_AT_MAX_PORTALS 8
+struct imp_at {
+	int		     iat_portal[IMP_AT_MAX_PORTALS];
+	struct adaptive_timeout iat_net_latency;
+	struct adaptive_timeout iat_service_estimate[IMP_AT_MAX_PORTALS];
+};
+
+
+/** @} */
+
+/** Possible import states */
+enum lustre_imp_state {
+	LUSTRE_IMP_CLOSED     = 1,
+	LUSTRE_IMP_NEW	= 2,
+	LUSTRE_IMP_DISCON     = 3,
+	LUSTRE_IMP_CONNECTING = 4,
+	LUSTRE_IMP_REPLAY     = 5,
+	LUSTRE_IMP_REPLAY_LOCKS = 6,
+	LUSTRE_IMP_REPLAY_WAIT  = 7,
+	LUSTRE_IMP_RECOVER    = 8,
+	LUSTRE_IMP_FULL       = 9,
+	LUSTRE_IMP_EVICTED    = 10,
+};
+
+/** Returns text string representation of numeric import state \a state */
+static inline char *ptlrpc_import_state_name(enum lustre_imp_state state)
+{
+	/* indexed by enum lustre_imp_state; slot 0 is unused by the enum */
+	static char *import_state_names[] = {
+		"<UNKNOWN>", "CLOSED", "NEW", "DISCONN", "CONNECTING", "REPLAY",
+		"REPLAY_LOCKS", "REPLAY_WAIT", "RECOVER", "FULL", "EVICTED",
+	};
+
+	LASSERT(state <= LUSTRE_IMP_EVICTED);
+	return import_state_names[state];
+}
+
+/**
+ * List of import event types
+ */
+enum obd_import_event {
+	IMP_EVENT_DISCON     = 0x808001,
+	IMP_EVENT_INACTIVE   = 0x808002,
+	IMP_EVENT_INVALIDATE = 0x808003,
+	IMP_EVENT_ACTIVE     = 0x808004,
+	IMP_EVENT_OCD	= 0x808005,
+	IMP_EVENT_DEACTIVATE = 0x808006,
+	IMP_EVENT_ACTIVATE   = 0x808007,
+};
+
+/**
+ * Definition of import connection structure
+ */
+struct obd_import_conn {
+	/** Item for linking connections together */
+	struct list_head		oic_item;
+	/** Pointer to actual PortalRPC connection */
+	struct ptlrpc_connection *oic_conn;
+	/** uuid of remote side */
+	struct obd_uuid	   oic_uuid;
+	/**
+	 * Time (64 bit jiffies) of last connection attempt on this connection
+	 */
+	__u64		     oic_last_attempt;
+};
+
+/* state history */
+#define IMP_STATE_HIST_LEN 16
+struct import_state_hist {
+	enum lustre_imp_state ish_state;
+	time_t		ish_time;
+};
+
+/**
+ * Definition of PortalRPC import structure.
+ * Imports represent the client-side view of a remote target.
+ */
+struct obd_import {
+	/** Local handle (== id) for this import. */
+	struct portals_handle     imp_handle;
+	/** Reference counter */
+	atomic_t	      imp_refcount;
+	struct lustre_handle      imp_dlm_handle; /* client's ldlm export */
+	/** Currently active connection */
+	struct ptlrpc_connection *imp_connection;
+	/** PortalRPC client structure for this import */
+	struct ptlrpc_client     *imp_client;
+	/** List element for linking into pinger chain */
+	struct list_head		imp_pinger_chain;
+	/** List element for linking into chain for destruction */
+	struct list_head		imp_zombie_chain;
+
+	/**
+	 * Lists of requests that are retained for replay, waiting for a reply,
+	 * or waiting for recovery to complete, respectively.
+	 * @{
+	 */
+	struct list_head		imp_replay_list;
+	struct list_head		imp_sending_list;
+	struct list_head		imp_delayed_list;
+	/** @} */
+
+	/** obd device for this import */
+	struct obd_device	*imp_obd;
+
+	/**
+	 * some security-related fields
+	 * @{
+	 */
+	struct ptlrpc_sec	*imp_sec;
+	struct mutex		  imp_sec_mutex;
+	cfs_time_t		imp_sec_expire;
+	/** @} */
+
+	/** Wait queue for those who need to wait for recovery completion */
+	wait_queue_head_t	       imp_recovery_waitq;
+
+	/** Number of requests currently in-flight */
+	atomic_t	      imp_inflight;
+	/** Number of requests currently unregistering */
+	atomic_t	      imp_unregistering;
+	/** Number of replay requests inflight */
+	atomic_t	      imp_replay_inflight;
+	/** Number of currently happening import invalidations */
+	atomic_t	      imp_inval_count;
+	/** Number of request timeouts */
+	atomic_t	      imp_timeouts;
+	/** Current import state */
+	enum lustre_imp_state     imp_state;
+	/** History of import states */
+	struct import_state_hist  imp_state_hist[IMP_STATE_HIST_LEN];
+	int		       imp_state_hist_idx;
+	/** Current import generation. Incremented on every reconnect */
+	int		       imp_generation;
+	/** Incremented every time we send reconnection request */
+	__u32		     imp_conn_cnt;
+       /**
+	* \see ptlrpc_free_committed remembers imp_generation value here
+	* after a check to save on unnecessary replay list iterations
+	*/
+	int		       imp_last_generation_checked;
+	/** Last transno we replayed */
+	__u64		     imp_last_replay_transno;
+	/** Last transno committed on remote side */
+	__u64		     imp_peer_committed_transno;
+	/**
+	 * \see ptlrpc_free_committed remembers last_transno since its last
+	 * check here and if last_transno did not change since last run of
+	 * ptlrpc_free_committed and import generation is the same, we can
+	 * skip looking for requests to remove from replay list as optimisation
+	 */
+	__u64		     imp_last_transno_checked;
+	/**
+	 * Remote export handle. This is how remote side knows what export
+	 * we are talking to. Filled from response to connect request
+	 */
+	struct lustre_handle      imp_remote_handle;
+	/** When to perform next ping. time in jiffies. */
+	cfs_time_t		imp_next_ping;
+	/** When we last successfully connected. time in 64bit jiffies */
+	__u64		     imp_last_success_conn;
+
+	/** List of all possible connection for import. */
+	struct list_head		imp_conn_list;
+	/**
+	 * Current connection. \a imp_connection is imp_conn_current->oic_conn
+	 */
+	struct obd_import_conn   *imp_conn_current;
+
+	/** Protects flags, level, generation, conn_cnt, *_list */
+	spinlock_t		  imp_lock;
+
+	/* flags */
+	unsigned long	     imp_no_timeout:1, /* timeouts are disabled */
+				  imp_invalid:1,    /* evicted */
+				  /* administratively disabled */
+				  imp_deactive:1,
+				  /* try to recover the import */
+				  imp_replayable:1,
+				  /* don't run recovery (timeout instead) */
+				  imp_dlm_fake:1,
+				  /* use 1/2 timeout on MDS' OSCs */
+				  imp_server_timeout:1,
+				  /* VBR: imp in delayed recovery */
+				  imp_delayed_recovery:1,
+				  /* VBR: if gap was found then no lock replays
+				   */
+				  imp_no_lock_replay:1,
+				  /* recovery by versions was failed */
+				  imp_vbr_failed:1,
+				  /* force an immediate ping */
+				  imp_force_verify:1,
+				  /* force a scheduled ping */
+				  imp_force_next_verify:1,
+				  /* pingable */
+				  imp_pingable:1,
+				  /* resend for replay */
+				  imp_resend_replay:1,
+				  /* disable normal recovery, for test only. */
+				  imp_no_pinger_recover:1,
+				  /* need IR MNE swab */
+				  imp_need_mne_swab:1,
+				  /* import must be reconnected instead of
+				   * choosing a new connection */
+				  imp_force_reconnect:1,
+				  /* import has tried to connect with server */
+				  imp_connect_tried:1;
+	__u32		     imp_connect_op;
+	struct obd_connect_data   imp_connect_data;
+	__u64		     imp_connect_flags_orig;
+	int		       imp_connect_error;
+
+	__u32		     imp_msg_magic;
+	__u32		     imp_msghdr_flags;       /* adjusted based on server capability */
+
+	struct ptlrpc_request_pool *imp_rq_pool;	  /* emergency request pool */
+
+	struct imp_at	     imp_at;		 /* adaptive timeout data */
+	time_t		    imp_last_reply_time;    /* for health check */
+};
+
+typedef void (*obd_import_callback)(struct obd_import *imp, void *closure,
+				    int event, void *event_arg, void *cb_data);
+
+/**
+ * Structure for import observer.
+ * It is possible to register "observer" on an import and every time
+ * something happens to an import (like connect/evict/disconnect)
+ * observer will get its callback called with event type
+ */
+struct obd_import_observer {
+	struct list_head	   oio_chain;
+	obd_import_callback  oio_cb;
+	void		*oio_cb_data;
+};
+
+void class_observe_import(struct obd_import *imp, obd_import_callback cb,
+			  void *cb_data);
+void class_unobserve_import(struct obd_import *imp, obd_import_callback cb,
+			    void *cb_data);
+void class_notify_import_observers(struct obd_import *imp, int event,
+				   void *event_arg);
+
+/* import.c */
+static inline unsigned int at_est2timeout(unsigned int val)
+{
+	/* timeout = estimate + 25% of it + a flat 5 sec */
+	return val + val / 4 + 5;
+}
+
+static inline unsigned int at_timeout2est(unsigned int val)
+{
+	/* invert at_est2timeout(): e = 4/5 * (t - 5), never less than 1 */
+	LASSERT(val);
+	return max(val * 4 / 5, 5U) - 4;
+}
+
+static inline void at_reset(struct adaptive_timeout *at, int val) {
+	at->at_current = val;
+	at->at_worst_ever = val;	/* worst-ever restarts from val too */
+	at->at_worst_time = cfs_time_current_sec();
+}
+static inline void at_init(struct adaptive_timeout *at, int val, int flags) {
+	memset(at, 0, sizeof(*at));	/* also clears the at_hist[] bins */
+	spin_lock_init(&at->at_lock);
+	at->at_flags = flags;
+	at_reset(at, val);	/* seeds at_current and at_worst_ever */
+}
+extern unsigned int at_min;
+static inline int at_get(struct adaptive_timeout *at) {
+	return max(at->at_current, at_min);	/* never below at_min */
+}
+int at_measured(struct adaptive_timeout *at, unsigned int val);
+int import_at_get_index(struct obd_import *imp, int portal);
+extern unsigned int at_max;
+#define AT_OFF (at_max == 0)
+
+/* genops.c */
+struct obd_export;
+extern struct obd_import *class_exp2cliimp(struct obd_export *);
+extern struct obd_import *class_conn2cliimp(struct lustre_handle *);
+
+/** @} import */
+
+#endif /* __IMPORT_H */
+
+/** @} obd_import */
diff --git a/drivers/staging/lustre/lustre/include/lustre_lib.h b/drivers/staging/lustre/lustre/include/lustre_lib.h
new file mode 100644
index 000000000000..bdfc5391c6d2
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_lib.h
@@ -0,0 +1,667 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_lib.h
+ *
+ * Basic Lustre library routines.
+ */
+
+#ifndef _LUSTRE_LIB_H
+#define _LUSTRE_LIB_H
+
+/** \defgroup lib lib
+ *
+ * @{
+ */
+
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_ver.h>
+#include <lustre_cfg.h>
+#include <linux/lustre_lib.h>
+
+/* target.c */
+struct ptlrpc_request;
+struct obd_export;
+struct lu_target;
+struct l_wait_info;
+#include <lustre_ha.h>
+#include <lustre_net.h>
+#include <lvfs.h>
+
+
+int target_pack_pool_reply(struct ptlrpc_request *req);
+int do_set_info_async(struct obd_import *imp,
+		      int opcode, int version,
+		      obd_count keylen, void *key,
+		      obd_count vallen, void *val,
+		      struct ptlrpc_request_set *set);
+
+#define OBD_RECOVERY_MAX_TIME (obd_timeout * 18) /* b13079 */
+#define OBD_MAX_IOCTL_BUFFER CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER
+
+void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id);
+
+/* client.c */
+
+int client_sanobd_setup(struct obd_device *obddev, struct lustre_cfg* lcfg);
+struct client_obd *client_conn2cli(struct lustre_handle *conn);
+
+struct md_open_data;
+struct obd_client_handle {
+	struct lustre_handle  och_fh;
+	struct lu_fid	 och_fid;
+	struct md_open_data  *och_mod;
+	__u32 och_magic;
+	int och_flags;
+};
+#define OBD_CLIENT_HANDLE_MAGIC 0xd15ea5ed
+
+/* statfs_pack.c */
+void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs);
+void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs);
+
+/* l_lock.c */
+struct lustre_lock {
+	int			l_depth;
+	task_t		*l_owner;
+	struct semaphore	l_sem;
+	spinlock_t		l_spin;
+};
+
+void l_lock_init(struct lustre_lock *);
+void l_lock(struct lustre_lock *);
+void l_unlock(struct lustre_lock *);
+int l_has_lock(struct lustre_lock *);
+
+/*
+ * For md echo client
+ */
+enum md_echo_cmd {
+	ECHO_MD_CREATE       = 1, /* Open/Create file on MDT */
+	ECHO_MD_MKDIR	= 2, /* Mkdir on MDT */
+	ECHO_MD_DESTROY      = 3, /* Unlink file on MDT */
+	ECHO_MD_RMDIR	= 4, /* Rmdir on MDT */
+	ECHO_MD_LOOKUP       = 5, /* Lookup on MDT */
+	ECHO_MD_GETATTR      = 6, /* Getattr on MDT */
+	ECHO_MD_SETATTR      = 7, /* Setattr on MDT */
+	ECHO_MD_ALLOC_FID    = 8, /* Get FIDs from MDT */
+};
+
+/*
+ *   OBD IOCTLS
+ */
+#define OBD_IOCTL_VERSION 0x00010004
+
+struct obd_ioctl_data {
+	__u32 ioc_len;
+	__u32 ioc_version;
+
+	union {
+		__u64 ioc_cookie;
+		__u64 ioc_u64_1;
+	};
+	union {
+		__u32 ioc_conn1;
+		__u32 ioc_u32_1;
+	};
+	union {
+		__u32 ioc_conn2;
+		__u32 ioc_u32_2;
+	};
+
+	struct obdo ioc_obdo1;
+	struct obdo ioc_obdo2;
+
+	obd_size ioc_count;
+	obd_off  ioc_offset;
+	__u32    ioc_dev;
+	__u32    ioc_command;
+
+	__u64 ioc_nid;
+	__u32 ioc_nal;
+	__u32 ioc_type;
+
+	/* buffers the kernel will treat as user pointers */
+	__u32  ioc_plen1;
+	char  *ioc_pbuf1;
+	__u32  ioc_plen2;
+	char  *ioc_pbuf2;
+
+	/* inline buffers for various arguments */
+	__u32  ioc_inllen1;
+	char  *ioc_inlbuf1;
+	__u32  ioc_inllen2;
+	char  *ioc_inlbuf2;
+	__u32  ioc_inllen3;
+	char  *ioc_inlbuf3;
+	__u32  ioc_inllen4;
+	char  *ioc_inlbuf4;
+
+	char    ioc_bulk[0];
+};
+
+struct obd_ioctl_hdr {
+	__u32 ioc_len;		/* same leading fields as struct obd_ioctl_data */
+	__u32 ioc_version;	/* presumably matched to OBD_IOCTL_VERSION - confirm */
+};
+
+static inline int obd_ioctl_packlen(struct obd_ioctl_data *data)
+{
+	/* 64-bit sum: four inline lengths of up to 1<<30 would wrap an int */
+	__s64 len = cfs_size_round(sizeof(struct obd_ioctl_data));
+	len += cfs_size_round(data->ioc_inllen1);
+	len += cfs_size_round(data->ioc_inllen2);
+	len += cfs_size_round(data->ioc_inllen3);
+	len += cfs_size_round(data->ioc_inllen4);
+	return len > 0x7fffffff ? 0x7fffffff : (int)len; /* clamp to INT_MAX */
+}
+
+static inline int obd_ioctl_is_invalid(struct obd_ioctl_data *data)
+{
+	/* Returns 1, after logging the first failed check, or 0 if all pass. */
+	if (data->ioc_len > (1 << 30)) {
+		CERROR("OBD ioctl: ioc_len larger than 1<<30\n");
+		return 1;
+	}
+	if (data->ioc_inllen1 > (1 << 30)) {
+		CERROR("OBD ioctl: ioc_inllen1 larger than 1<<30\n");
+		return 1;
+	}
+	if (data->ioc_inllen2 > (1 << 30)) {
+		CERROR("OBD ioctl: ioc_inllen2 larger than 1<<30\n");
+		return 1;
+	}
+	if (data->ioc_inllen3 > (1 << 30)) {
+		CERROR("OBD ioctl: ioc_inllen3 larger than 1<<30\n");
+		return 1;
+	}
+	if (data->ioc_inllen4 > (1 << 30)) {
+		CERROR("OBD ioctl: ioc_inllen4 larger than 1<<30\n");
+		return 1;
+	}
+	if (data->ioc_inllen1 == 0 && data->ioc_inlbuf1 != NULL) {
+		CERROR("OBD ioctl: inlbuf1 pointer but 0 length\n");
+		return 1;
+	}
+	if (data->ioc_inllen2 == 0 && data->ioc_inlbuf2 != NULL) {
+		CERROR("OBD ioctl: inlbuf2 pointer but 0 length\n");
+		return 1;
+	}
+	if (data->ioc_inllen3 == 0 && data->ioc_inlbuf3 != NULL) {
+		CERROR("OBD ioctl: inlbuf3 pointer but 0 length\n");
+		return 1;
+	}
+	if (data->ioc_inllen4 == 0 && data->ioc_inlbuf4 != NULL) {
+		CERROR("OBD ioctl: inlbuf4 pointer but 0 length\n");
+		return 1;
+	}
+	if (data->ioc_plen1 == 0 && data->ioc_pbuf1 != NULL) {
+		CERROR("OBD ioctl: pbuf1 pointer but 0 length\n");
+		return 1;
+	}
+	if (data->ioc_plen2 == 0 && data->ioc_pbuf2 != NULL) {
+		CERROR("OBD ioctl: pbuf2 pointer but 0 length\n");
+		return 1;
+	}
+	if (data->ioc_pbuf1 == NULL && data->ioc_plen1 != 0) {
+		CERROR("OBD ioctl: plen1 set but NULL pointer\n");
+		return 1;
+	}
+	if (data->ioc_pbuf2 == NULL && data->ioc_plen2 != 0) {
+		CERROR("OBD ioctl: plen2 set but NULL pointer\n");
+		return 1;
+	}
+	if (data->ioc_len < obd_ioctl_packlen(data)) {
+		CERROR("OBD ioctl: packlen exceeds ioc_len (%d > %d)\n",
+		       obd_ioctl_packlen(data), data->ioc_len);
+		return 1;
+	}
+	return 0;
+}
+
+#include <obd_support.h>
+
+/* function defined in lustre/obdclass/<platform>/<platform>-module.c */
+int obd_ioctl_getdata(char **buf, int *len, void *arg);
+int obd_ioctl_popdata(void *arg, void *data, int len);
+
+static inline void obd_ioctl_freedata(char *buf, int len)
+{
+	/* hand the @len-byte buffer @buf back through OBD_FREE_LARGE */
+	ENTRY;
+
+	OBD_FREE_LARGE(buf, len);
+	EXIT;
+}
+
+/*
+ * BSD ioctl description:
+ * #define IOC_V1       _IOR(g, n1, long)
+ * #define IOC_V2       _IOW(g, n2, long)
+ *
+ * ioctl(f, IOC_V1, arg);
+ * arg will be treated as a long value,
+ *
+ * ioctl(f, IOC_V2, arg)
+ * arg will be treated as a pointer, bsd will call
+ * copyin(buf, arg, sizeof(long))
+ *
+ * To make BSD ioctl handle arguments correctly and simply,
+ * we change _IOR to _IOWR so BSD will copyin obd_ioctl_data
+ * for us. Does this change affect Linux?  (XXX Liang)
+ */
+#define OBD_IOC_CREATE		 _IOWR('f', 101, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_DESTROY		_IOW ('f', 104, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PREALLOCATE	    _IOWR('f', 105, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_SETATTR		_IOW ('f', 107, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_GETATTR		_IOWR ('f', 108, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_READ		   _IOWR('f', 109, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_WRITE		  _IOWR('f', 110, OBD_IOC_DATA_TYPE)
+
+
+#define OBD_IOC_STATFS		 _IOWR('f', 113, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_SYNC		   _IOW ('f', 114, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_READ2		  _IOWR('f', 115, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_FORMAT		 _IOWR('f', 116, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PARTITION	      _IOWR('f', 117, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_COPY		   _IOWR('f', 120, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_MIGR		   _IOWR('f', 121, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PUNCH		  _IOWR('f', 122, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_MODULE_DEBUG	   _IOWR('f', 124, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_BRW_READ	       _IOWR('f', 125, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_BRW_WRITE	      _IOWR('f', 126, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_NAME2DEV	       _IOWR('f', 127, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_UUID2DEV	       _IOWR('f', 130, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_GETNAME		_IOWR('f', 131, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_GETMDNAME	      _IOR('f', 131, char[MAX_OBD_NAME])
+#define OBD_IOC_GETDTNAME	       OBD_IOC_GETNAME
+
+#define OBD_IOC_LOV_GET_CONFIG	 _IOWR('f', 132, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_CLIENT_RECOVER	 _IOW ('f', 133, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PING_TARGET	    _IOW ('f', 136, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_DEC_FS_USE_COUNT       _IO  ('f', 139      )
+#define OBD_IOC_NO_TRANSNO	     _IOW ('f', 140, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_SET_READONLY	   _IOW ('f', 141, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_ABORT_RECOVERY	 _IOR ('f', 142, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_ROOT_SQUASH	    _IOWR('f', 143, OBD_IOC_DATA_TYPE)
+
+#define OBD_GET_VERSION		_IOWR ('f', 144, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_GSS_SUPPORT	    _IOWR('f', 145, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_CLOSE_UUID	     _IOWR ('f', 147, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_CHANGELOG_SEND	 _IOW ('f', 148, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_GETDEVICE	      _IOWR ('f', 149, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_FID2PATH	       _IOWR ('f', 150, OBD_IOC_DATA_TYPE)
+/* see also <lustre/lustre_user.h> for ioctls 151-153 */
+/* OBD_IOC_LOV_SETSTRIPE: See also LL_IOC_LOV_SETSTRIPE */
+#define OBD_IOC_LOV_SETSTRIPE	  _IOW ('f', 154, OBD_IOC_DATA_TYPE)
+/* OBD_IOC_LOV_GETSTRIPE: See also LL_IOC_LOV_GETSTRIPE */
+#define OBD_IOC_LOV_GETSTRIPE	  _IOW ('f', 155, OBD_IOC_DATA_TYPE)
+/* OBD_IOC_LOV_SETEA: See also LL_IOC_LOV_SETEA */
+#define OBD_IOC_LOV_SETEA	      _IOW ('f', 156, OBD_IOC_DATA_TYPE)
+/* see <lustre/lustre_user.h> for ioctls 157-159 */
+/* OBD_IOC_QUOTACHECK: See also LL_IOC_QUOTACHECK */
+#define OBD_IOC_QUOTACHECK	     _IOW ('f', 160, int)
+/* OBD_IOC_POLL_QUOTACHECK: See also LL_IOC_POLL_QUOTACHECK */
+#define OBD_IOC_POLL_QUOTACHECK	_IOR ('f', 161, struct if_quotacheck *)
+/* OBD_IOC_QUOTACTL: See also LL_IOC_QUOTACTL */
+#define OBD_IOC_QUOTACTL	       _IOWR('f', 162, struct if_quotactl)
+/* see  also <lustre/lustre_user.h> for ioctls 163-176 */
+#define OBD_IOC_CHANGELOG_REG	  _IOW ('f', 177, struct obd_ioctl_data)
+#define OBD_IOC_CHANGELOG_DEREG	_IOW ('f', 178, struct obd_ioctl_data)
+#define OBD_IOC_CHANGELOG_CLEAR	_IOW ('f', 179, struct obd_ioctl_data)
+#define OBD_IOC_RECORD		 _IOWR('f', 180, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_ENDRECORD	      _IOWR('f', 181, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PARSE		  _IOWR('f', 182, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_DORECORD	       _IOWR('f', 183, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PROCESS_CFG	    _IOWR('f', 184, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_DUMP_LOG	       _IOWR('f', 185, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_CLEAR_LOG	      _IOWR('f', 186, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PARAM		  _IOW ('f', 187, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_POOL		   _IOWR('f', 188, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_REPLACE_NIDS	   _IOWR('f', 189, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_CATLOGLIST	     _IOWR('f', 190, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_INFO	      _IOWR('f', 191, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_PRINT	     _IOWR('f', 192, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_CANCEL	    _IOWR('f', 193, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_REMOVE	    _IOWR('f', 194, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_CHECK	     _IOWR('f', 195, OBD_IOC_DATA_TYPE)
+/* OBD_IOC_LLOG_CATINFO is deprecated */
+#define OBD_IOC_LLOG_CATINFO	   _IOWR('f', 196, OBD_IOC_DATA_TYPE)
+
+#define ECHO_IOC_GET_STRIPE	    _IOWR('f', 200, OBD_IOC_DATA_TYPE)
+#define ECHO_IOC_SET_STRIPE	    _IOWR('f', 201, OBD_IOC_DATA_TYPE)
+#define ECHO_IOC_ENQUEUE	       _IOWR('f', 202, OBD_IOC_DATA_TYPE)
+#define ECHO_IOC_CANCEL		_IOWR('f', 203, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_GET_OBJ_VERSION	_IOR('f', 210, OBD_IOC_DATA_TYPE)
+
+/* <lustre/lustre_user.h> defines ioctl number 218-219 */
+#define OBD_IOC_GET_MNTOPT	     _IOW('f', 220, mntopt_t)
+
+#define OBD_IOC_ECHO_MD		_IOR('f', 221, struct obd_ioctl_data)
+#define OBD_IOC_ECHO_ALLOC_SEQ	 _IOWR('f', 222, struct obd_ioctl_data)
+
+#define OBD_IOC_START_LFSCK	       _IOWR('f', 230, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_STOP_LFSCK	       _IOW('f', 231, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PAUSE_LFSCK	       _IOW('f', 232, OBD_IOC_DATA_TYPE)
+
+/* XXX _IOWR('f', 250, long) has been defined in
+ * libcfs/include/libcfs/libcfs_private.h for debug, don't use it
+ */
+
+/* Until such time as we get_info the per-stripe maximum from the OST,
+ * we define this to be 2T - 4k, which is the ext3 maxbytes. */
+#define LUSTRE_STRIPE_MAXBYTES 0x1fffffff000ULL
+
+/* True for the special values (0, 0, -1) that remove the LOV EA from disk */
+#define LOVEA_DELETE_VALUES(size, count, offset) \
+	((size) == 0 && (count) == 0 && (offset) == (typeof(offset))(-1))
+
+/* #define POISON_BULK 0 */
+
+/*
+ * l_wait_event is a flexible sleeping function, permitting simple caller
+ * configuration of interrupt and timeout sensitivity along with actions to
+ * be performed in the event of either exception.
+ *
+ * The first form of usage looks like this:
+ *
+ * struct l_wait_info lwi = LWI_TIMEOUT_INTR(timeout, timeout_handler,
+ *					   intr_handler, callback_data);
+ * rc = l_wait_event(waitq, condition, &lwi);
+ *
+ * l_wait_event() makes the current process wait on 'waitq' until 'condition'
+ * is TRUE or a "killable" signal (SIGTERM, SIGKILL, SIGINT) is pending.  It
+ * returns 0 to signify 'condition' is TRUE, but if a signal wakes it before
+ * 'condition' becomes true, it optionally calls the specified 'intr_handler'
+ * if not NULL, and returns -EINTR.
+ *
+ * If a non-zero timeout is specified, signals are ignored until the timeout
+ * has expired.  At this time, if 'timeout_handler' is not NULL it is called.
+ * If it returns FALSE l_wait_event() continues to wait as described above with
+ * signals enabled.  Otherwise it returns -ETIMEDOUT.
+ *
+ * LWI_INTR(intr_handler, callback_data) is shorthand for
+ * LWI_TIMEOUT_INTR(0, NULL, intr_handler, callback_data)
+ *
+ * The second form of usage looks like this:
+ *
+ * struct l_wait_info lwi = LWI_TIMEOUT(timeout, timeout_handler);
+ * rc = l_wait_event(waitq, condition, &lwi);
+ *
+ * This form is the same as the first except that it COMPLETELY IGNORES
+ * SIGNALS.  The caller must therefore beware that if 'timeout' is zero, or if
+ * 'timeout_handler' is not NULL and returns FALSE, then the ONLY thing that
+ * can unblock the current process is 'condition' becoming TRUE.
+ *
+ * Another form of usage is:
+ * struct l_wait_info lwi = LWI_TIMEOUT_INTERVAL(timeout, interval,
+ *					       timeout_handler);
+ * rc = l_wait_event(waitq, condition, &lwi);
+ * This is the same as previous case, but condition is checked once every
+ * 'interval' jiffies (if non-zero).
+ *
+ * Subtle synchronization point: this macro does *not* necessarily take the
+ * wait-queue spin-lock before returning, and, hence, following idiom is safe
+ * ONLY when caller provides some external locking:
+ *
+ *	     Thread1			    Thread2
+ *
+ *   l_wait_event(&obj->wq, ....);				       (1)
+ *
+ *				    wake_up(&obj->wq):		 (2)
+ *					 spin_lock(&q->lock);	  (2.1)
+ *					 __wake_up_common(q, ...);     (2.2)
+ *					 spin_unlock(&q->lock, flags); (2.3)
+ *
+ *   OBD_FREE_PTR(obj);						  (3)
+ *
+ * As l_wait_event() may "short-cut" execution and return without taking
+ * wait-queue spin-lock, some additional synchronization is necessary to
+ * guarantee that step (3) can begin only after (2.3) finishes.
+ *
+ * XXX nikita: some ptlrpc daemon threads have races of that sort.
+ *
+ */
+static inline int back_to_sleep(void *arg)
+{
+	return 0;	/* 0: as lwi_on_timeout, l_wait_event keeps waiting */
+}
+
+#define LWI_ON_SIGNAL_NOOP ((void (*)(void *))(-1))
+
+struct l_wait_info {
+	cfs_duration_t lwi_timeout; /* total time to wait; 0 = no timeout */
+	cfs_duration_t lwi_interval; /* if != 0, recheck condition this often */
+	int	    lwi_allow_intr; /* != 0: signals may interrupt before timeout */
+	int  (*lwi_on_timeout)(void *); /* NULL or non-zero return: -ETIMEDOUT */
+	void (*lwi_on_signal)(void *); /* NULL: the wait ignores signals */
+	void  *lwi_cb_data; /* argument passed to both callbacks */
+};
+
+/* NB: LWI_TIMEOUT ignores signals completely */
+#define LWI_TIMEOUT(time, cb, data)	     \
+((struct l_wait_info) {			 \
+	.lwi_timeout    = time,		 \
+	.lwi_on_timeout = cb,		   \
+	.lwi_cb_data    = data,		 \
+	.lwi_interval   = 0,		    \
+	.lwi_allow_intr = 0		     \
+})
+
+#define LWI_TIMEOUT_INTERVAL(time, interval, cb, data)  \
+((struct l_wait_info) {				 \
+	.lwi_timeout    = time,			 \
+	.lwi_on_timeout = cb,			   \
+	.lwi_cb_data    = data,			 \
+	.lwi_interval   = interval,		     \
+	.lwi_allow_intr = 0			     \
+})
+
+#define LWI_TIMEOUT_INTR(time, time_cb, sig_cb, data)   \
+((struct l_wait_info) {				 \
+	.lwi_timeout    = time,			 \
+	.lwi_on_timeout = time_cb,		      \
+	.lwi_on_signal  = sig_cb,		       \
+	.lwi_cb_data    = data,			 \
+	.lwi_interval   = 0,			    \
+	.lwi_allow_intr = 0			     \
+})
+
+#define LWI_TIMEOUT_INTR_ALL(time, time_cb, sig_cb, data)       \
+((struct l_wait_info) {					 \
+	.lwi_timeout    = time,				 \
+	.lwi_on_timeout = time_cb,			      \
+	.lwi_on_signal  = sig_cb,			       \
+	.lwi_cb_data    = data,				 \
+	.lwi_interval   = 0,				    \
+	.lwi_allow_intr = 1				     \
+})
+
+#define LWI_INTR(cb, data)  LWI_TIMEOUT_INTR(0, NULL, cb, data)
+
+
+/*
+ * Wait on @wq until @condition is true (@ret = 0), the timeout in @info
+ * expires (@ret = -ETIMEDOUT) or a permitted signal arrives (@ret = -EINTR).
+ */
+#define __l_wait_event(wq, condition, info, ret, l_add_wait)		   \
+do {									   \
+	wait_queue_t __wait;						 \
+	cfs_duration_t __timeout = info->lwi_timeout;			  \
+	sigset_t   __blocked;					      \
+	int   __allow_intr = info->lwi_allow_intr;			     \
+									       \
+	ret = 0;							       \
+	if (condition)							 \
+		break;							 \
+									       \
+	init_waitqueue_entry_current(&__wait);					    \
+	l_add_wait(&wq, &__wait);					      \
+									       \
+	/* Block all signals (just the non-fatal ones if no timeout). */       \
+	if (info->lwi_on_signal != NULL && (__timeout == 0 || __allow_intr))   \
+		__blocked = cfs_block_sigsinv(LUSTRE_FATAL_SIGS);	      \
+	else								   \
+		__blocked = cfs_block_sigsinv(0);			      \
+									       \
+	for (;;) {							     \
+		unsigned       __wstate;				       \
+									       \
+		__wstate = info->lwi_on_signal != NULL &&		      \
+			   (__timeout == 0 || __allow_intr) ?		  \
+			TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;	       \
+									       \
+		set_current_state(TASK_INTERRUPTIBLE);		 \
+									       \
+		if (condition)						 \
+			break;						 \
+									       \
+		if (__timeout == 0) {					  \
+			waitq_wait(&__wait, __wstate);		     \
+		} else {						       \
+			cfs_duration_t interval = info->lwi_interval?	  \
+					     min_t(cfs_duration_t,	     \
+						 info->lwi_interval,__timeout):\
+					     __timeout;			\
+			cfs_duration_t remaining = waitq_timedwait(&__wait,\
+						   __wstate,		   \
+						   interval);		  \
+			__timeout = cfs_time_sub(__timeout,		    \
+					    cfs_time_sub(interval, remaining));\
+			if (__timeout == 0) {				  \
+				if (info->lwi_on_timeout == NULL ||	    \
+				    info->lwi_on_timeout(info->lwi_cb_data)) { \
+					ret = -ETIMEDOUT;		      \
+					break;				 \
+				}					      \
+				/* Take signals after the timeout expires. */  \
+				if (info->lwi_on_signal != NULL)	       \
+				    (void)cfs_block_sigsinv(LUSTRE_FATAL_SIGS);\
+			}						      \
+		}							      \
+									       \
+		if (condition)						 \
+			break;						 \
+		if (cfs_signal_pending()) {				    \
+			if (info->lwi_on_signal != NULL &&		     \
+			    (__timeout == 0 || __allow_intr)) {		\
+				if (info->lwi_on_signal != LWI_ON_SIGNAL_NOOP) \
+					info->lwi_on_signal(info->lwi_cb_data);\
+				ret = -EINTR;				  \
+				break;					 \
+			}						      \
+			/* We have to do this here because some signals */     \
+			/* are not blockable - ie from strace(1).       */     \
+			/* In these cases we want to schedule_timeout() */     \
+			/* again, because we don't want that to return  */     \
+			/* -EINTR when the RPC actually succeeded.      */     \
+			/* the recalc_sigpending() below will deliver the */     \
+			/* signal properly.			     */     \
+			cfs_clear_sigpending();				\
+		}							      \
+	}								      \
+									       \
+	cfs_restore_sigs(__blocked);					   \
+									       \
+	set_current_state(TASK_RUNNING);			       \
+	remove_wait_queue(&wq, &__wait);					   \
+} while (0)
+
+
+
+#define l_wait_event(wq, condition, info)		       \
+({							      \
+	int		 __ret;			      \
+	struct l_wait_info *__info = (info);		    \
+								\
+	__l_wait_event(wq, condition, __info,		   \
+		       __ret, add_wait_queue);		   \
+	__ret;						  \
+})
+
+#define l_wait_event_exclusive(wq, condition, info)	     \
+({							      \
+	int		 __ret;			      \
+	struct l_wait_info *__info = (info);		    \
+								\
+	__l_wait_event(wq, condition, __info,		   \
+		       __ret, add_wait_queue_exclusive);	 \
+	__ret;						  \
+})
+
+#define l_wait_event_exclusive_head(wq, condition, info)	\
+({							      \
+	int		 __ret;			      \
+	struct l_wait_info *__info = (info);		    \
+								\
+	__l_wait_event(wq, condition, __info,		   \
+		       __ret, add_wait_queue_exclusive_head);    \
+	__ret;						  \
+})
+
+#define l_wait_condition(wq, condition)			 \
+({							      \
+	struct l_wait_info lwi = { 0 };			 \
+	l_wait_event(wq, condition, &lwi);		      \
+})
+
+#define l_wait_condition_exclusive(wq, condition)	       \
+({							      \
+	struct l_wait_info lwi = { 0 };			 \
+	l_wait_event_exclusive(wq, condition, &lwi);	    \
+})
+
+#define l_wait_condition_exclusive_head(wq, condition)	  \
+({							      \
+	struct l_wait_info lwi = { 0 };			 \
+	l_wait_event_exclusive_head(wq, condition, &lwi);       \
+})
+
+#define LIBLUSTRE_CLIENT (0)
+
+/** @} lib */
+
+#endif /* _LUSTRE_LIB_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_linkea.h b/drivers/staging/lustre/lustre/include/lustre_linkea.h
new file mode 100644
index 000000000000..5790be913bf6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_linkea.h
@@ -0,0 +1,57 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2013, Intel Corporation.
+ * Use is subject to license terms.
+ *
+ * Author: di wang <di.wang@intel.com>
+ */
+
+struct linkea_data {
+	/**
+	 * Buffer to keep link EA body.
+	 */
+	struct lu_buf		*ld_buf;
+	/**
+	 * The matched header, the matched entry and that entry's length in the EA
+	 */
+	struct link_ea_header	*ld_leh;
+	struct link_ea_entry	*ld_lee;
+	int			ld_reclen;
+};
+
+int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf);
+int linkea_init(struct linkea_data *ldata);
+void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen,
+			 struct lu_name *lname, struct lu_fid *pfid);
+int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname,
+		   const struct lu_fid *pfid);
+void linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname);
+int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname,
+		      const struct lu_fid  *pfid);
+
+#define LINKEA_NEXT_ENTRY(ldata)	/* entry ld_reclen bytes past ld_lee */ \
+	((struct link_ea_entry *)((char *)(ldata).ld_lee + (ldata).ld_reclen))
+
+#define LINKEA_FIRST_ENTRY(ldata)	/* entry right after the header ld_leh */ \
+	((struct link_ea_entry *)((ldata).ld_leh + 1))
diff --git a/drivers/staging/lustre/lustre/include/lustre_lite.h b/drivers/staging/lustre/lustre/include/lustre_lite.h
new file mode 100644
index 000000000000..25f8bfaccef3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_lite.h
@@ -0,0 +1,147 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LL_H
+#define _LL_H
+
+/** \defgroup lite lite
+ *
+ * @{
+ */
+
+#include <linux/lustre_lite.h>
+
+#include <obd_class.h>
+#include <obd_ost.h>
+#include <lustre_net.h>
+#include <lustre_mds.h>
+#include <lustre_ha.h>
+
+/* 4UL * 1024 * 1024 */
+#define LL_MAX_BLKSIZE_BITS     (22)
+#define LL_MAX_BLKSIZE	  (1UL<<LL_MAX_BLKSIZE_BITS)
+
+#include <lustre/lustre_user.h>
+
+
+struct lustre_rw_params {
+	int		lrp_lock_mode; /* LCK_* mode chosen for the I/O */
+	ldlm_policy_data_t lrp_policy; /* l_extent holds the byte range to lock */
+	obd_flag	   lrp_brw_flags; /* OBD_BRW_* flags, e.g. OBD_BRW_SRVLOCK */
+	int		lrp_ast_flags; /* LDLM_FL_* flags, e.g. LDLM_FL_BLOCK_NOWAIT */
+};
+
+/*
+ * XXX nikita: this function lives in the header because it is used by both
+ * llite kernel module and liblustre library, and there is no (?) better place
+ * to put it in.
+ */
+static inline void lustre_build_lock_params(int cmd, unsigned long open_flags,
+					    __u64 connect_flags,
+					    loff_t pos, ssize_t len,
+					    struct lustre_rw_params *params)
+{
+	/* defaults: PR for reads, PW otherwise, over [pos, pos + len - 1] */
+	if (cmd == OBD_BRW_READ)
+		params->lrp_lock_mode = LCK_PR;
+	else
+		params->lrp_lock_mode = LCK_PW;
+	params->lrp_brw_flags = 0;
+	params->lrp_policy.l_extent.start = pos;
+	params->lrp_policy.l_extent.end = pos + len - 1;
+
+	if (cmd == OBD_BRW_WRITE && (open_flags & O_APPEND)) {
+		/* for now O_APPEND always takes local locks */
+		params->lrp_policy.l_extent.start = 0;
+		params->lrp_policy.l_extent.end   = OBD_OBJECT_EOF;
+	} else if (LIBLUSTRE_CLIENT && (connect_flags & OBD_CONNECT_SRVLOCK)) {
+		/* liblustre: OST-side locking for non-O_APPEND reads/writes */
+		params->lrp_lock_mode = LCK_NL;
+		params->lrp_brw_flags = OBD_BRW_SRVLOCK;
+	}
+	/*
+	 * Otherwise nothing special for the kernel. In the future llite may
+	 * use OST-side locks for small writes into highly contended files.
+	 */
+
+	if (open_flags & O_NONBLOCK)
+		params->lrp_ast_flags = LDLM_FL_BLOCK_NOWAIT;
+	else
+		params->lrp_ast_flags = 0;
+}
+
+/*
+ * This is embedded into liblustre and llite super-blocks to keep track of
+ * connect flags (capabilities) supported by all imports given mount is
+ * connected to.
+ */
+struct lustre_client_ocd {
+	/*
+	 * This is conjunction of connect_flags across all imports (LOVs) this
+	 * mount is connected to. This field is updated by cl_ocd_update()
+	 * under ->lco_lock.
+	 */
+	__u64	      lco_flags;
+	struct mutex	   lco_lock;
+	struct obd_export *lco_md_exp;
+	struct obd_export *lco_dt_exp;
+};
+
+/*
+ * Chain of hash overflow pages.
+ */
+struct ll_dir_chain {
+	/* XXX something. Later */
+};
+
+static inline void ll_dir_chain_init(struct ll_dir_chain *chain)
+{
+}
+
+static inline void ll_dir_chain_fini(struct ll_dir_chain *chain)
+{
+}
+
+static inline unsigned long hash_x_index(__u64 hash, int hash64)
+{
+	/* with a 32-bit long, a 64-bit hash contributes only its upper half */
+	__u64 key = (BITS_PER_LONG == 32 && hash64) ? hash >> 32 : hash;
+	return ~0UL - key;
+}
+
+/** @} lite */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_log.h b/drivers/staging/lustre/lustre/include/lustre_log.h
new file mode 100644
index 000000000000..714ab378e431
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_log.h
@@ -0,0 +1,576 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_log.h
+ *
+ * Generic infrastructure for managing a collection of logs.
+ * These logs are used for:
+ *
+ * - orphan recovery: OST adds record on create
+ * - mtime/size consistency: the OST adds a record on first write
+ * - open/unlinked objects: OST adds a record on destroy
+ *
+ * - mds unlink log: the MDS adds an entry upon delete
+ *
+ * - raid1 replication log between OST's
+ * - MDS replication logs
+ */
+
+#ifndef _LUSTRE_LOG_H
+#define _LUSTRE_LOG_H
+
+/** \defgroup log log
+ *
+ * @{
+ */
+
+#include <linux/lustre_log.h>
+
+#include <obd_class.h>
+#include <obd_ost.h>
+#include <lustre/lustre_idl.h>
+#include <dt_object.h>
+
+#define LOG_NAME_LIMIT(logname, name)	/* logname: array, not pointer */ \
+	snprintf(logname, sizeof(logname), "LOGS/%s", name)
+#define LLOG_EEMPTY 4711
+
+enum llog_open_param {	/* type of llog_open()'s open_param argument */
+	LLOG_OPEN_EXISTS	= 0x0000,	/* presumably: llog must exist */
+	LLOG_OPEN_NEW		= 0x0001,	/* presumably: llog may be new */
+};
+
+struct plain_handle_data {	/* stored in llog_handle.u.phd */
+	struct list_head	  phd_entry;	/* presumably on chd_head - confirm */
+	struct llog_handle *phd_cat_handle;	/* presumably the owning catalog */
+	struct llog_cookie  phd_cookie; /* cookie of this log in its cat */
+};
+
+struct cat_handle_data {	/* stored in llog_handle.u.chd */
+	struct list_head	      chd_head;	/* presumably lists plain logs */
+	struct llog_handle     *chd_current_log; /* currently open log */
+	struct llog_handle	*chd_next_log; /* llog to be used next */
+};
+
+static inline void logid_to_fid(struct llog_logid *id, struct lu_fid *fid)
+{
+	/* A non-zero ogen (inode generation) marks a pre-OSD (~< 2.3.51 MDS)
+	 * logid; for compatibility such ids are converted into an IGIF. */
+	if (id->lgl_ogen != 0) {
+		lu_igif_build(fid, id->lgl_oi.oi.oi_id, id->lgl_ogen);
+		return;
+	}
+	/* Otherwise build the FID directly from the object id's seq/id. */
+	fid->f_seq = id->lgl_oi.oi.oi_seq;
+	fid->f_oid = id->lgl_oi.oi.oi_id;
+	fid->f_ver = 0;
+}
+
+static inline void fid_to_logid(struct lu_fid *fid, struct llog_logid *id)
+{
+	id->lgl_oi.oi.oi_seq = fid->f_seq;
+	id->lgl_oi.oi.oi_id = fid->f_oid;	/* f_ver is not carried over */
+	id->lgl_ogen = 0;	/* 0 => logid_to_fid() skips IGIF */
+}
+
+static inline void logid_set_id(struct llog_logid *log_id, __u64 id)
+{
+	log_id->lgl_oi.oi.oi_id = id;	/* only oi_id changes; seq/ogen kept */
+}
+
+static inline __u64 logid_id(struct llog_logid *log_id)
+{
+	return log_id->lgl_oi.oi.oi_id;	/* the field logid_set_id() writes */
+}
+
+struct llog_handle;
+
+/* llog.c - general llog handle API */
+int llog_init_handle(const struct lu_env *env, struct llog_handle *handle,
+		     int flags, struct obd_uuid *uuid);
+int llog_copy_handler(const struct lu_env *env, struct llog_handle *llh,
+		      struct llog_rec_hdr *rec, void *data);
+int llog_process(const struct lu_env *env, struct llog_handle *loghandle,
+		 llog_cb_t cb, void *data, void *catdata);
+int llog_process_or_fork(const struct lu_env *env,
+			 struct llog_handle *loghandle,
+			 llog_cb_t cb, void *data, void *catdata, bool fork);
+int llog_reverse_process(const struct lu_env *env,
+			 struct llog_handle *loghandle, llog_cb_t cb,
+			 void *data, void *catdata);
+int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle,
+		    int index);
+int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt,
+	      struct llog_handle **lgh, struct llog_logid *logid,
+	      char *name, enum llog_open_param open_param);
+int llog_close(const struct lu_env *env, struct llog_handle *cathandle);
+int llog_get_size(struct llog_handle *loghandle);
+
+/* llog_process flags */
+#define LLOG_FLAG_NODEAMON 0x0001	/* (sic) name is part of the interface */
+
+/* llog_cat.c - catalog api */
+struct llog_process_data {
+	/**
+	 * Opaque caller data needed while processing the catalog. It is
+	 * passed on later to the process callback.
+	 */
+	void		*lpd_data;
+	/**
+	 * Catalog process callback function, called for each record
+	 * in the catalog.
+	 */
+	llog_cb_t	    lpd_cb;
+	/**
+	 * Start processing the catalog from startcat/startidx.
+	 */
+	int		  lpd_startcat;
+	int		  lpd_startidx;
+};
+
+struct llog_process_cat_data {
+	/**
+	 * Temporarily stored first_idx while scanning the log.
+	 */
+	int		  lpcd_first_idx;
+	/**
+	 * Temporarily stored last_idx while scanning the log.
+	 */
+	int		  lpcd_last_idx;
+};
+
+int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle);
+int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle,
+		     struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+		     void *buf, struct thandle *th);
+int llog_cat_declare_add_rec(const struct lu_env *env,
+			     struct llog_handle *cathandle,
+			     struct llog_rec_hdr *rec, struct thandle *th);
+int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle,
+		 struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+		 void *buf);	/* no thandle, unlike llog_cat_add_rec() */
+int llog_cat_cancel_records(const struct lu_env *env,
+			    struct llog_handle *cathandle, int count,
+			    struct llog_cookie *cookies);
+int llog_cat_process_or_fork(const struct lu_env *env,
+			     struct llog_handle *cat_llh, llog_cb_t cb,
+			     void *data, int startcat, int startidx, bool fork);
+int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh,
+		     llog_cb_t cb, void *data, int startcat, int startidx);
+int llog_cat_reverse_process(const struct lu_env *env,
+			     struct llog_handle *cat_llh, llog_cb_t cb,
+			     void *data);	/* no startcat/startidx here */
+int llog_cat_init_and_process(const struct lu_env *env,
+			      struct llog_handle *llh);
+
+/* llog_obd.c - API working on a struct llog_ctxt / obd_llog_group */
+int llog_setup(const struct lu_env *env, struct obd_device *obd,
+	       struct obd_llog_group *olg, int index,
+	       struct obd_device *disk_obd, struct llog_operations *op);
+int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt);
+int llog_cleanup(const struct lu_env *env, struct llog_ctxt *);
+int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags);
+int llog_obd_add(const struct lu_env *env, struct llog_ctxt *ctxt,
+		 struct llog_rec_hdr *rec, struct lov_stripe_md *lsm,
+		 struct llog_cookie *logcookies, int numcookies);
+int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt,
+		struct lov_stripe_md *lsm, int count,
+		struct llog_cookie *cookies, int flags);
+
+int obd_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+		  struct obd_device *disk_obd, int *idx);
+
+int obd_llog_finish(struct obd_device *obd, int count);
+
+/* llog_ioctl.c - ioctl entry point for a llog context */
+int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd,
+	       struct obd_ioctl_data *data);
+
+/* llog_net.c */
+int llog_initiator_connect(struct llog_ctxt *ctxt);
+
+struct llog_operations {
+	int (*lop_destroy)(const struct lu_env *env,
+			   struct llog_handle *handle);
+	int (*lop_next_block)(const struct lu_env *env, struct llog_handle *h,
+			      int *curr_idx, int next_idx, __u64 *offset,
+			      void *buf, int len);
+	int (*lop_prev_block)(const struct lu_env *env, struct llog_handle *h,
+			      int prev_idx, void *buf, int len);
+	int (*lop_read_header)(const struct lu_env *env,
+			       struct llog_handle *handle);
+	int (*lop_setup)(const struct lu_env *env, struct obd_device *obd,
+			 struct obd_llog_group *olg, int ctxt_idx,
+			 struct obd_device *disk_obd);
+	int (*lop_sync)(struct llog_ctxt *ctxt, struct obd_export *exp,
+			int flags);
+	int (*lop_cleanup)(const struct lu_env *env, struct llog_ctxt *ctxt);
+	int (*lop_cancel)(const struct lu_env *env, struct llog_ctxt *ctxt,
+			  struct lov_stripe_md *lsm, int count,
+			  struct llog_cookie *cookies, int flags);
+	int (*lop_connect)(struct llog_ctxt *ctxt, struct llog_logid *logid,
+			   struct llog_gen *gen, struct obd_uuid *uuid);
+	/**
+	 * Any llog file must be opened first using llog_open().  An llog can
+	 * be opened by name, by logid or without either; in the last case a
+	 * new logid will be generated.
+	 */
+	int (*lop_open)(const struct lu_env *env, struct llog_handle *lgh,
+			struct llog_logid *logid, char *name,
+			enum llog_open_param);
+	/**
+	 * An opened llog may not exist; this must be checked where needed
+	 * using the llog_exist() call.
+	 */
+	int (*lop_exist)(struct llog_handle *lgh);
+	/**
+	 * Closes the llog file and calls llog_free_handle() implicitly.
+	 * Any opened llog must be closed by a llog_close() call.
+	 */
+	int (*lop_close)(const struct lu_env *env, struct llog_handle *handle);
+	/**
+	 * Create a new llog file. The llog must be opened.
+	 * Must be used only for local llog operations.
+	 */
+	int (*lop_declare_create)(const struct lu_env *env,
+				  struct llog_handle *handle,
+				  struct thandle *th);
+	int (*lop_create)(const struct lu_env *env, struct llog_handle *handle,
+			  struct thandle *th);
+	/**
+	 * Write a new record in the llog. It usually appends records but can
+	 * edit existing records too.
+	 */
+	int (*lop_declare_write_rec)(const struct lu_env *env,
+				     struct llog_handle *lgh,
+				     struct llog_rec_hdr *rec,
+				     int idx, struct thandle *th);
+	int (*lop_write_rec)(const struct lu_env *env,
+			     struct llog_handle *loghandle,
+			     struct llog_rec_hdr *rec,
+			     struct llog_cookie *cookie, int cookiecount,
+			     void *buf, int idx, struct thandle *th);
+	/**
+	 * Add a new record in the llog catalog. Does the same as
+	 * llog_write_rec() but using the llog catalog.
+	 */
+	int (*lop_declare_add)(const struct lu_env *env,
+			       struct llog_handle *lgh,
+			       struct llog_rec_hdr *rec, struct thandle *th);
+	int (*lop_add)(const struct lu_env *env, struct llog_handle *lgh,
+		       struct llog_rec_hdr *rec, struct llog_cookie *cookie,
+		       void *buf, struct thandle *th);
+	/* Old llog_add version, used in MDS-LOV-OSC now; it will be gone with
+	 * the LOD/OSP replacement */
+	int (*lop_obd_add)(const struct lu_env *env, struct llog_ctxt *ctxt,
+			   struct llog_rec_hdr *rec, struct lov_stripe_md *lsm,
+			   struct llog_cookie *logcookies, int numcookies);
+};
+
+/* In-memory descriptor for a log object or log catalog */
+struct llog_handle {
+	struct rw_semaphore	 lgh_lock;
+	spinlock_t		 lgh_hdr_lock; /* protect lgh_hdr data */
+	struct llog_logid	 lgh_id; /* id of this log */
+	struct llog_log_hdr	*lgh_hdr; /* data guarded by lgh_hdr_lock */
+	struct file		*lgh_file;
+	struct dt_object	*lgh_obj;
+	int			 lgh_last_idx;
+	int			 lgh_cur_idx; /* used during llog_process */
+	__u64			 lgh_cur_offset; /* used during llog_process */
+	struct llog_ctxt	*lgh_ctxt;
+	union {	/* presumably phd for a plain log, chd for a catalog */
+		struct plain_handle_data	 phd;
+		struct cat_handle_data		 chd;
+	} u;
+	char			*lgh_name;
+	void			*private_data;
+	struct llog_operations	*lgh_logops; /* see llog_handle2ops() */
+	atomic_t		 lgh_refcount;
+};
+
+/* llog_lvfs.c - its struct llog_operations instance */
+extern struct llog_operations llog_lvfs_ops;
+
+/* llog_osd.c - its struct llog_operations instance and catalog-list I/O */
+extern struct llog_operations llog_osd_ops;
+int llog_osd_get_cat_list(const struct lu_env *env, struct dt_device *d,
+			  int idx, int count,
+			  struct llog_catid *idarray);
+int llog_osd_put_cat_list(const struct lu_env *env, struct dt_device *d,
+			  int idx, int count,
+			  struct llog_catid *idarray);
+
+#define LLOG_CTXT_FLAG_UNINITIALIZED     0x00000001 /* for loc_flags */
+#define LLOG_CTXT_FLAG_STOP		 0x00000002 /* for loc_flags */
+
+struct llog_ctxt {
+	int		      loc_idx; /* my index in the obd array of ctxts */
+	struct obd_device       *loc_obd; /* points back to the containing obd*/
+	struct obd_llog_group   *loc_olg; /* group containing that ctxt */
+	struct obd_export       *loc_exp; /* parent "disk" export (e.g. MDS) */
+	struct obd_import       *loc_imp; /* used for RPCs; can be a
+					     backward-pointing import */
+	struct llog_operations  *loc_logops; /* see llog_obd2ops() */
+	struct llog_handle      *loc_handle;
+	struct mutex		 loc_mutex; /* protect loc_imp */
+	atomic_t	     loc_refcount; /* see llog_ctxt_get()/_put() */
+	long		     loc_flags; /* flags, see above defines */
+	struct dt_object	*loc_dir;
+};
+
+#define LLOG_PROC_BREAK 0x0001
+#define LLOG_DEL_RECORD 0x0002
+
+static inline int llog_obd2ops(struct llog_ctxt *ctxt,
+			       struct llog_operations **lop)
+{
+	/* Without a context there is nothing to look the operations up in. */
+	if (!ctxt)
+		return -ENOTCONN;
+
+	/* *lop is stored even when the table is missing (it is NULL then). */
+	*lop = ctxt->loc_logops;
+
+	return *lop ? 0 : -EOPNOTSUPP;
+}
+
+static inline int llog_handle2ops(struct llog_handle *loghandle,
+				  struct llog_operations **lop)
+{
+	/* *lop is left untouched unless a usable operations table exists. */
+	if (!loghandle || !loghandle->lgh_logops)
+		return -EINVAL;
+	*lop = loghandle->lgh_logops;
+	return 0;
+}
+
+static inline int llog_data_len(int len)
+{
+	return cfs_size_round(len);	/* rounding is cfs_size_round()'s */
+}
+
+static inline struct llog_ctxt *llog_ctxt_get(struct llog_ctxt *ctxt)
+{
+	atomic_inc(&ctxt->loc_refcount);	/* ctxt is not NULL-checked here */
+	CDEBUG(D_INFO, "GETting ctxt %p : new refcount %d\n", ctxt,
+	       atomic_read(&ctxt->loc_refcount));
+	return ctxt;	/* same pointer, after the refcount increment */
+}
+
+static inline void llog_ctxt_put(struct llog_ctxt *ctxt)
+{
+	if (ctxt == NULL)
+		return;	/* putting a NULL context is a no-op */
+	LASSERT_ATOMIC_GT_LT(&ctxt->loc_refcount, 0, LI_POISON);
+	CDEBUG(D_INFO, "PUTting ctxt %p : new refcount %d\n", ctxt,
+	       atomic_read(&ctxt->loc_refcount) - 1);	/* current count - 1 */
+	__llog_ctxt_put(NULL, ctxt);	/* called with a NULL env */
+}
+
+static inline void llog_group_init(struct obd_llog_group *olg, int group)
+{
+	init_waitqueue_head(&olg->olg_waitq);
+	spin_lock_init(&olg->olg_lock);
+	mutex_init(&olg->olg_cat_processing);
+	olg->olg_seq = group;	/* olg_ctxts[] is not touched here */
+}
+
+static inline int llog_group_set_ctxt(struct obd_llog_group *olg,
+				      struct llog_ctxt *ctxt, int index)
+{
+	int rc = 0;
+
+	LASSERT(index >= 0 && index < LLOG_MAX_CTXTS);
+	spin_lock(&olg->olg_lock);
+	if (olg->olg_ctxts[index] == NULL)
+		olg->olg_ctxts[index] = ctxt;	/* slot free: install ctxt */
+	else
+		rc = -EEXIST;	/* an installed context is never replaced */
+	spin_unlock(&olg->olg_lock);
+	return rc;
+}
+
+static inline struct llog_ctxt *llog_group_get_ctxt(struct obd_llog_group *olg,
+						    int index)
+{
+	struct llog_ctxt *found = NULL;
+
+	LASSERT(index >= 0 && index < LLOG_MAX_CTXTS);
+
+	/* The slot is read and the reference taken while olg_lock is held;
+	 * an empty slot yields NULL and no reference. */
+	spin_lock(&olg->olg_lock);
+	if (olg->olg_ctxts[index] != NULL)
+		found = llog_ctxt_get(olg->olg_ctxts[index]);
+	spin_unlock(&olg->olg_lock);
+	return found;
+}
+
+static inline void llog_group_clear_ctxt(struct obd_llog_group *olg, int index)
+{
+	LASSERT(index >= 0 && index < LLOG_MAX_CTXTS);
+	spin_lock(&olg->olg_lock);
+	olg->olg_ctxts[index] = NULL;	/* slot emptied; no reference is put */
+	spin_unlock(&olg->olg_lock);
+}
+
+static inline struct llog_ctxt *llog_get_context(struct obd_device *obd,
+						 int index)
+{
+	return llog_group_get_ctxt(&obd->obd_olg, index); /* ref'd or NULL */
+}
+
+static inline int llog_group_ctxt_null(struct obd_llog_group *olg, int index)
+{
+	return (olg->olg_ctxts[index] == NULL);	/* unlocked, index unchecked */
+}
+
+static inline int llog_ctxt_null(struct obd_device *obd, int index)
+{
+	return (llog_group_ctxt_null(&obd->obd_olg, index)); /* obd's group */
+}
+
+static inline int llog_destroy(const struct lu_env *env,
+			       struct llog_handle *handle)
+{
+	struct llog_operations *ops;
+	int err;
+
+	ENTRY;
+	/* Resolve the operations table attached to the handle. */
+	err = llog_handle2ops(handle, &ops);
+	if (err != 0)
+		RETURN(err);
+	/* A NULL method means the operation is not supported. */
+	if (!ops->lop_destroy)
+		RETURN(-EOPNOTSUPP);
+	err = ops->lop_destroy(env, handle);
+	RETURN(err);
+}
+
+static inline int llog_next_block(const struct lu_env *env,
+				  struct llog_handle *loghandle, int *cur_idx,
+				  int next_idx, __u64 *cur_offset, void *buf,
+				  int len)
+{
+	struct llog_operations *ops;
+	int err;
+
+	ENTRY;
+	/* Resolve the operations table attached to the handle. */
+	err = llog_handle2ops(loghandle, &ops);
+	if (err != 0)
+		RETURN(err);
+	/* A NULL method means the operation is not supported. */
+	if (!ops->lop_next_block)
+		RETURN(-EOPNOTSUPP);
+	err = ops->lop_next_block(env, loghandle, cur_idx, next_idx,
+				  cur_offset, buf, len);
+	RETURN(err);
+}
+
+static inline int llog_prev_block(const struct lu_env *env,
+				  struct llog_handle *loghandle,
+				  int prev_idx, void *buf, int len)
+{
+	struct llog_operations *ops;
+	int err;
+
+	ENTRY;
+	/* Resolve the operations table attached to the handle. */
+	err = llog_handle2ops(loghandle, &ops);
+	if (err != 0)
+		RETURN(err);
+	/* A NULL method means the operation is not supported. */
+	if (!ops->lop_prev_block)
+		RETURN(-EOPNOTSUPP);
+	err = ops->lop_prev_block(env, loghandle, prev_idx, buf, len);
+	RETURN(err);
+}
+
+static inline int llog_connect(struct llog_ctxt *ctxt,
+			       struct llog_logid *logid, struct llog_gen *gen,
+			       struct obd_uuid *uuid)
+{
+	struct llog_operations *ops;
+	int err;
+
+	ENTRY;
+	/* Resolve the operations table attached to the context. */
+	err = llog_obd2ops(ctxt, &ops);
+	if (err != 0)
+		RETURN(err);
+	/* A NULL method means the operation is not supported. */
+	if (!ops->lop_connect)
+		RETURN(-EOPNOTSUPP);
+	err = ops->lop_connect(ctxt, logid, gen, uuid);
+	RETURN(err);
+}
+
+/* llog.c - existence, create, write/add, open and erase entry points */
+int llog_exist(struct llog_handle *loghandle);	/* cf. lop_exist */
+int llog_declare_create(const struct lu_env *env,
+			struct llog_handle *loghandle, struct thandle *th);
+int llog_create(const struct lu_env *env, struct llog_handle *handle,
+		struct thandle *th);
+int llog_declare_write_rec(const struct lu_env *env,
+			   struct llog_handle *handle,
+			   struct llog_rec_hdr *rec, int idx,
+			   struct thandle *th);
+int llog_write_rec(const struct lu_env *env, struct llog_handle *handle,
+		   struct llog_rec_hdr *rec, struct llog_cookie *logcookies,
+		   int numcookies, void *buf, int idx, struct thandle *th);
+int llog_add(const struct lu_env *env, struct llog_handle *lgh,
+	     struct llog_rec_hdr *rec, struct llog_cookie *logcookies,
+	     void *buf, struct thandle *th);
+int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh,
+		     struct llog_rec_hdr *rec, struct thandle *th);
+int lustre_process_log(struct super_block *sb, char *logname,
+		       struct config_llog_instance *cfg);
+int lustre_end_log(struct super_block *sb, char *logname,
+		   struct config_llog_instance *cfg);
+int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt,
+		     struct llog_handle **res, struct llog_logid *logid,
+		     char *name);
+int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt,
+	       struct llog_logid *logid, char *name);
+int llog_write(const struct lu_env *env, struct llog_handle *loghandle,
+	       struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+	       int cookiecount, void *buf, int idx);	/* no thandle argument */
+
+/** @} log */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_mdc.h b/drivers/staging/lustre/lustre/include/lustre_mdc.h
new file mode 100644
index 000000000000..fb1561a809b9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_mdc.h
@@ -0,0 +1,176 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_mdc.h
+ *
+ * MDC (metadata client) data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_MDC_H
+#define _LUSTRE_MDC_H
+
+/** \defgroup mdc mdc
+ *
+ * @{
+ */
+
+# include <linux/fs.h>
+# include <linux/dcache.h>
+# ifdef CONFIG_FS_POSIX_ACL
+#  include <linux/posix_acl_xattr.h>
+# endif /* CONFIG_FS_POSIX_ACL */
+# include <linux/lustre_intent.h>
+#include <lustre_handles.h>
+#include <linux/libcfs/libcfs.h>
+#include <obd_class.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_lib.h>
+#include <lustre_dlm.h>
+#include <lustre_export.h>
+
+struct ptlrpc_client;
+struct obd_export;
+struct ptlrpc_request;
+struct obd_device;
+
+struct mdc_rpc_lock {
+	struct mutex		rpcl_mutex;
+	struct lookup_intent	*rpcl_it;	/* holder, NULL or FAKE */
+	int			rpcl_fakes;	/* fake requests in flight */
+};
+
+#define MDC_FAKE_RPCL_IT ((void *)0x2c0012bfUL)	/* rpcl_it marker value */
+
+static inline void mdc_init_rpc_lock(struct mdc_rpc_lock *lck)
+{
+	mutex_init(&lck->rpcl_mutex);
+	lck->rpcl_it = NULL;	/* NOTE(review): rpcl_fakes is not reset here */
+}
+
+static inline void mdc_get_rpc_lock(struct mdc_rpc_lock *lck,
+				    struct lookup_intent *it)
+{
+	ENTRY;
+
+	if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP))
+		return;
+
+	/* GETATTR and LOOKUP intents (above) never take the lock.  All other
+	 * callers normally block here until the current request finishes.
+	 * If fail_loc is set, rpcl_it becomes MDC_FAKE_RPCL_IT once the
+	 * regular request is done and stays so until every fake request
+	 * has finished; only then can normal requests be sent again,
+	 * which ensures they are recoverable. */
+ again:
+	mutex_lock(&lck->rpcl_mutex);
+
+	if (CFS_FAIL_CHECK_QUIET(OBD_FAIL_MDC_RPCS_SEM)) {
+		lck->rpcl_it = MDC_FAKE_RPCL_IT;
+		lck->rpcl_fakes++;
+		mutex_unlock(&lck->rpcl_mutex);
+		return;
+	}
+
+	/* Only reachable when the CFS_FAIL_CHECK() was just turned off but
+	 * fake requests are still in progress: drop the mutex, sleep for
+	 * cfs_time_seconds(1) / 4 and retry.  The task state must be set
+	 * for the sleep to happen at all; a bare schedule_timeout() returns
+	 * immediately and would turn this rare path into a busy loop. */
+	while (unlikely(lck->rpcl_it == MDC_FAKE_RPCL_IT)) {
+		mutex_unlock(&lck->rpcl_mutex);
+		schedule_timeout_uninterruptible(cfs_time_seconds(1) / 4);
+		goto again;
+	}
+
+	LASSERT(lck->rpcl_it == NULL);
+	lck->rpcl_it = it;	/* returns with rpcl_mutex still held */
+}
+
+static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck,
+				    struct lookup_intent *it)
+{
+	if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP))
+		goto out;	/* mdc_get_rpc_lock() took nothing for these */
+
+	if (lck->rpcl_it == MDC_FAKE_RPCL_IT) { /* OBD_FAIL_MDC_RPCS_SEM */
+		mutex_lock(&lck->rpcl_mutex);	/* fake getters dropped it */
+
+		LASSERTF(lck->rpcl_fakes > 0, "%d\n", lck->rpcl_fakes);
+		lck->rpcl_fakes--;
+
+		if (lck->rpcl_fakes == 0)
+			lck->rpcl_it = NULL;	/* last fake request is done */
+
+	} else {
+		LASSERTF(it == lck->rpcl_it, "%p != %p\n", it, lck->rpcl_it);
+		lck->rpcl_it = NULL;	/* mutex still held since the get */
+	}
+
+	mutex_unlock(&lck->rpcl_mutex);
+ out:
+	EXIT;
+}
+
+static inline void mdc_update_max_ea_from_body(struct obd_export *exp,
+					       struct mdt_body *body)
+{
+	/* Only bodies flagged with OBD_MD_FLMODEASIZE are consulted. */
+	if (!(body->valid & OBD_MD_FLMODEASIZE))
+		return;
+	/* The cached maxima are only ever raised, never lowered. */
+	if (body->max_mdsize > exp->exp_obd->u.cli.cl_max_mds_easize)
+		exp->exp_obd->u.cli.cl_max_mds_easize = body->max_mdsize;
+	if (body->max_cookiesize > exp->exp_obd->u.cli.cl_max_mds_cookiesize)
+		exp->exp_obd->u.cli.cl_max_mds_cookiesize =
+						body->max_cookiesize;
+}
+
+
+struct mdc_cache_waiter {
+	struct list_head	      mcw_entry;	/* list linkage */
+	wait_queue_head_t	     mcw_waitq;	/* waiter's wait queue */
+};
+
+/* mdc/mdc_locks.c - lookup_intent disposition helpers */
+int it_disposition(struct lookup_intent *it, int flag);
+void it_clear_disposition(struct lookup_intent *it, int flag);
+void it_set_disposition(struct lookup_intent *it, int flag);
+int it_open_error(int phase, struct lookup_intent *it);
+
+/** @} mdc */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_mds.h b/drivers/staging/lustre/lustre/include/lustre_mds.h
new file mode 100644
index 000000000000..b386f87471e3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_mds.h
@@ -0,0 +1,81 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_mds.h
+ *
+ * MDS data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_MDS_H
+#define _LUSTRE_MDS_H
+
+/** \defgroup mds mds
+ *
+ * @{
+ */
+
+#include <lustre_handles.h>
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_lib.h>
+#include <lustre_dlm.h>
+#include <lustre_export.h>
+
+struct mds_group_info {	/* NOTE(review): no users in this header */
+	struct obd_uuid *uuid;
+	int group;
+};
+
+struct mds_capa_info {	/* NOTE(review): no users in this header */
+	struct obd_uuid	*uuid;
+	struct lustre_capa_key *capa;
+};
+
+#define MDD_OBD_NAME     "mdd_obd"
+#define MDD_OBD_UUID     "mdd_obd_uuid"
+
+static inline int md_should_create(__u64 flags)
+{
+	/* Needs FMODE_WRITE set and MDS_OPEN_DELAY_CREATE clear. */
+	return !(flags & MDS_OPEN_DELAY_CREATE) && (flags & FMODE_WRITE);
+}
+
+/* these are local flags, used only on the client, private */
+#define M_CHECK_STALE	   0200000000
+
+/** @} mds */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_mdt.h b/drivers/staging/lustre/lustre/include/lustre_mdt.h
new file mode 100644
index 000000000000..dba26a6cfa38
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_mdt.h
@@ -0,0 +1,84 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_MDT_H
+#define __LINUX_MDT_H
+
+/** \defgroup mdt mdt
+ *
+ * @{
+ */
+
+#include <lustre/lustre_idl.h>
+#include <lustre_req_layout.h>
+#include <md_object.h>
+#include <dt_object.h>
+#include <linux/libcfs/libcfs.h>
+
+/*
+ * Common per-thread info shared by mdt, seq and fld.
+ */
+struct com_thread_info {
+	/*
+	 * Request capsule, for the req-layout interface.
+	 */
+	struct req_capsule *cti_pill;
+};
+
+enum {
+	ESERIOUS = 0x0001000	/* bit err_serious() ORs into -rc */
+};
+
+static inline int err_serious(int rc)
+{
+	LASSERT(rc < 0);		/* only negative error codes qualify */
+	LASSERT(-rc < ESERIOUS);	/* magnitude must stay below the flag */
+	return -(-rc | ESERIOUS);	/* flag the magnitude, keep it negative */
+}
+
+static inline int clear_serious(int rc)
+{
+	/* Non-negative values pass through unchanged. */
+	if (rc >= 0)
+		return rc;
+	return -(-rc & ~ESERIOUS);
+}
+
+static inline int is_serious(int rc)
+{
+	return rc < 0 && ((-rc & ESERIOUS) != 0);	/* ESERIOUS bit set */
+}
+
+/** @} mdt */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_net.h b/drivers/staging/lustre/lustre/include/lustre_net.h
new file mode 100644
index 000000000000..874412ee58fc
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_net.h
@@ -0,0 +1,3453 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/** \defgroup PtlRPC Portal RPC and networking module.
+ *
+ * PortalRPC is the layer used by rest of lustre code to achieve network
+ * communications: establish connections with corresponding export and import
+ * states, listen for a service, send and receive RPCs.
+ * PortalRPC also includes base recovery framework: packet resending and
+ * replaying, reconnections, pinger.
+ *
+ * PortalRPC utilizes LNet as its transport layer.
+ *
+ * @{
+ */
+
+
+#ifndef _LUSTRE_NET_H
+#define _LUSTRE_NET_H
+
+/** \defgroup net net
+ *
+ * @{
+ */
+
+#include <linux/lustre_net.h>
+
+#include <linux/libcfs/libcfs.h>
+// #include <obd.h>
+#include <linux/lnet/lnet.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_ha.h>
+#include <lustre_sec.h>
+#include <lustre_import.h>
+#include <lprocfs_status.h>
+#include <lu_object.h>
+#include <lustre_req_layout.h>
+
+#include <obd_support.h>
+#include <lustre_ver.h>
+
+/* MD flags we _always_ use */
+#define PTLRPC_MD_OPTIONS  0
+
+/**
+ * Max # of bulk operations in one request.
+ * In order for the client and server to properly negotiate the maximum
+ * possible transfer size, PTLRPC_BULK_OPS_COUNT must be a power-of-two
+ * value.  The client is free to limit the actual RPC size for any bulk
+ * transfer via cl_max_pages_per_rpc to some non-power-of-two value. */
+#define PTLRPC_BULK_OPS_BITS	2
+#define PTLRPC_BULK_OPS_COUNT	(1U << PTLRPC_BULK_OPS_BITS)
+/**
+ * PTLRPC_BULK_OPS_MASK is for the convenience of the client only, and
+ * should not be used on the server at all.  Otherwise, it imposes a
+ * protocol limitation on the maximum RPC size that can be used by any
+ * RPC sent to that server in the future.  Instead, the server should
+ * use the negotiated per-client ocd_brw_size to determine the bulk
+ * RPC count. */
+#define PTLRPC_BULK_OPS_MASK	(~((__u64)PTLRPC_BULK_OPS_COUNT - 1))
+
+/**
+ * Define maxima for bulk I/O.
+ *
+ * A single PTLRPC BRW request is sent via up to PTLRPC_BULK_OPS_COUNT
+ * of LNET_MTU sized RDMA transfers.  Clients and servers negotiate the
+ * currently supported maximum between peers at connect via ocd_brw_size.
+ */
+#define PTLRPC_MAX_BRW_BITS	(LNET_MTU_BITS + PTLRPC_BULK_OPS_BITS)
+#define PTLRPC_MAX_BRW_SIZE	(1 << PTLRPC_MAX_BRW_BITS)
+#define PTLRPC_MAX_BRW_PAGES	(PTLRPC_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT)
+
+#define ONE_MB_BRW_SIZE		(1 << LNET_MTU_BITS)
+#define MD_MAX_BRW_SIZE		(1 << LNET_MTU_BITS)
+#define MD_MAX_BRW_PAGES	(MD_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT)
+#define DT_MAX_BRW_SIZE		PTLRPC_MAX_BRW_SIZE
+#define DT_MAX_BRW_PAGES	(DT_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT)
+#define OFD_MAX_BRW_SIZE	(1 << LNET_MTU_BITS)
+
+/* When PAGE_SIZE is a constant, we can check our arithmetic here with cpp! */
+# if ((PTLRPC_MAX_BRW_PAGES & (PTLRPC_MAX_BRW_PAGES - 1)) != 0)
+#  error "PTLRPC_MAX_BRW_PAGES isn't a power of two"
+# endif
+# if (PTLRPC_MAX_BRW_SIZE != (PTLRPC_MAX_BRW_PAGES * PAGE_CACHE_SIZE))
+#  error "PTLRPC_MAX_BRW_SIZE isn't PTLRPC_MAX_BRW_PAGES * PAGE_CACHE_SIZE"
+# endif
+# if (PTLRPC_MAX_BRW_SIZE > LNET_MTU * PTLRPC_BULK_OPS_COUNT)
+#  error "PTLRPC_MAX_BRW_SIZE too big"
+# endif
+# if (PTLRPC_MAX_BRW_PAGES > LNET_MAX_IOV * PTLRPC_BULK_OPS_COUNT)
+#  error "PTLRPC_MAX_BRW_PAGES too big"
+# endif
+
+#define PTLRPC_NTHRS_INIT	2
+
+/**
+ * Buffer Constants
+ *
+ * Constants determine how memory is used to buffer incoming service requests.
+ *
+ * ?_NBUFS	      # buffers to allocate when growing the pool
+ * ?_BUFSIZE	    # bytes in a single request buffer
+ * ?_MAXREQSIZE	 # maximum request service will receive
+ *
+ * When fewer than ?_NBUFS/2 buffers are posted for receive, another chunk
+ * of ?_NBUFS is added to the pool.
+ *
+ * Messages larger than ?_MAXREQSIZE are dropped.  Request buffers are
+ * considered full when less than ?_MAXREQSIZE is left in them.
+ */
+/**
+ * Thread Constants
+ *
+ * Constants determine how threads are created for ptlrpc service.
+ *
+ * ?_NTHRS_INIT		# threads to create for each service partition on
+ *			  initializing. If it's non-affinity service and
+ *			  there is only one partition, it's the overall #
+ *			  threads for the service while initializing.
+ * ?_NTHRS_BASE		# threads should be created at least for each
+ *			  ptlrpc partition to keep the service healthy.
+ *			  It's the low-water mark of threads upper-limit
+ *			  for each partition.
+ * ?_THR_FACTOR	 # threads can be added on threads upper-limit for
+ *			  each CPU core. This factor is only for reference,
+ *			  we might decrease value of factor if number of cores
+ *			  per CPT is above a limit.
+ * ?_NTHRS_MAX		# overall threads can be created for a service,
+ *			  it's a soft limit because if service is running
+ *			  on machine with hundreds of cores and tens of
+ *			  CPU partitions, we need to guarantee each partition
+ *			  has ?_NTHRS_BASE threads, which means total threads
+ *			  will be ?_NTHRS_BASE * number_of_cpts which can
+ *			  exceed ?_NTHRS_MAX.
+ *
+ * Examples
+ *
+ * #define MDS_NTHRS_INIT	2
+ * #define MDS_NTHRS_BASE	64
+ * #define MDS_NTHRS_FACTOR	8
+ * #define MDS_NTHRS_MAX	1024
+ *
+ * Example 1):
+ * ---------------------------------------------------------------------
+ * Server(A) has 16 cores, user configured it to 4 partitions so each
+ * partition has 4 cores, then actual number of service threads on each
+ * partition is:
+ *     MDS_NTHRS_BASE(64) + cores(4) * MDS_NTHRS_FACTOR(8) = 96
+ *
+ * Total number of threads for the service is:
+ *     96 * partitions(4) = 384
+ *
+ * Example 2):
+ * ---------------------------------------------------------------------
+ * Server(B) has 32 cores, user configured it to 4 partitions so each
+ * partition has 8 cores, then actual number of service threads on each
+ * partition is:
+ *     MDS_NTHRS_BASE(64) + cores(8) * MDS_NTHRS_FACTOR(8) = 128
+ *
+ * Total number of threads for the service is:
+ *     128 * partitions(4) = 512
+ *
+ * Example 3):
+ * ---------------------------------------------------------------------
+ * Server(B) has 96 cores, user configured it to 8 partitions so each
+ * partition has 12 cores, then actual number of service threads on each
+ * partition is:
+ *     MDS_NTHRS_BASE(64) + cores(12) * MDS_NTHRS_FACTOR(8) = 160
+ *
+ * Total number of threads for the service is:
+ *     160 * partitions(8) = 1280
+ *
+ * However, it's above the soft limit MDS_NTHRS_MAX, so we choose this number
+ * as upper limit of threads number for each partition:
+ *     MDS_NTHRS_MAX(1024) / partitions(8) = 128
+ *
+ * Example 4):
+ * ---------------------------------------------------------------------
+ * Server(C) has a thousand cores and the user configured it to 32 partitions
+ *     MDS_NTHRS_BASE(64) * 32 = 2048
+ *
+ * which is already above soft limit MDS_NTHRS_MAX(1024), but we still need
+ * to guarantee that each partition has at least MDS_NTHRS_BASE(64) threads
+ * to keep service healthy, so total number of threads will just be 2048.
+ *
+ * NB: we don't suggest to choose server with that many cores because backend
+ *     filesystem itself, buffer cache, or underlying network stack might
+ *     have some SMP scalability issues at that large scale.
+ *
+ *     If user already has a fat machine with hundreds or thousands of cores,
+ *     there are two choices for configuration:
+ *     a) create CPU table from subset of all CPUs and run Lustre on
+ *	top of this subset
+ *     b) bind service threads on a few partitions, see modparameters of
+ *	MDS and OSS for details
+ *
+ * NB: these calculations (and examples below) are simplified to help
+ *     understanding, the real implementation is a little more complex,
+ *     please see ptlrpc_server_nthreads_check() for details.
+ *
+ */
+
+ /*
+  * LDLM threads constants:
+  *
+  * Given 8 as factor and 24 as base threads number
+  *
+  * example 1)
+  * On 4-core machine we will have 24 + 8 * 4 = 56 threads.
+  *
+  * example 2)
+  * On 8-core machine with 2 partitions we will have 24 + 4 * 8 = 56
+  * threads for each partition and total threads number will be 112.
+  *
+  * example 3)
+  * On 64-core machine with 8 partitions we will need LDLM_NTHRS_BASE(24)
+  * threads for each partition to keep service healthy, so total threads
+  * number should be 24 * 8 = 192.
+  *
+  * So with these constants, threads number will be at the similar level
+  * of old versions, unless target machine has over a hundred cores
+  */
+#define LDLM_THR_FACTOR		8
+#define LDLM_NTHRS_INIT		PTLRPC_NTHRS_INIT
+#define LDLM_NTHRS_BASE		24
+#define LDLM_NTHRS_MAX		(num_online_cpus() == 1 ? 64 : 128)
+/* NOTE(review): no LDLM_NTHRS_AUTO_INIT definition is visible here; confirm */
+#define LDLM_BL_THREADS   LDLM_NTHRS_AUTO_INIT
+#define LDLM_CLIENT_NBUFS 1
+#define LDLM_SERVER_NBUFS 64
+#define LDLM_BUFSIZE      (8 * 1024)
+#define LDLM_MAXREQSIZE   (5 * 1024)
+#define LDLM_MAXREPSIZE   (1024)
+
+ /*
+  * MDS threads constants:
+  *
+  * Please see examples in "Thread Constants", MDS threads number will be at
+  * the comparable level of old versions, unless the server has many cores.
+  */
+#ifndef MDS_MAX_THREADS
+#define MDS_MAX_THREADS		1024
+#define MDS_MAX_OTHR_THREADS	256
+
+#else /* MDS_MAX_THREADS */
+#if MDS_MAX_THREADS < PTLRPC_NTHRS_INIT
+#undef MDS_MAX_THREADS
+#define MDS_MAX_THREADS	PTLRPC_NTHRS_INIT
+#endif
+#define MDS_MAX_OTHR_THREADS	max(PTLRPC_NTHRS_INIT, MDS_MAX_THREADS / 2)
+#endif
+
+/* default service */
+#define MDS_THR_FACTOR		8
+#define MDS_NTHRS_INIT		PTLRPC_NTHRS_INIT
+#define MDS_NTHRS_MAX		MDS_MAX_THREADS
+#define MDS_NTHRS_BASE		min(64, MDS_NTHRS_MAX)
+
+/* read-page service */
+#define MDS_RDPG_THR_FACTOR	4
+#define MDS_RDPG_NTHRS_INIT	PTLRPC_NTHRS_INIT
+#define MDS_RDPG_NTHRS_MAX	MDS_MAX_OTHR_THREADS
+#define MDS_RDPG_NTHRS_BASE	min(48, MDS_RDPG_NTHRS_MAX)
+
+/* these should be removed when we remove setattr service in the future */
+#define MDS_SETA_THR_FACTOR	4
+#define MDS_SETA_NTHRS_INIT	PTLRPC_NTHRS_INIT
+#define MDS_SETA_NTHRS_MAX	MDS_MAX_OTHR_THREADS
+#define MDS_SETA_NTHRS_BASE	min(48, MDS_SETA_NTHRS_MAX)
+
+/* non-affinity threads */
+#define MDS_OTHR_NTHRS_INIT	PTLRPC_NTHRS_INIT
+#define MDS_OTHR_NTHRS_MAX	MDS_MAX_OTHR_THREADS
+
+#define MDS_NBUFS		64
+
+/**
+ * Assume file name length = FNAME_MAX = 256 (true for ext3).
+ *	  path name length = PATH_MAX = 4096
+ *	  LOV MD size max  = EA_MAX = 24 * 2000
+ *		(NB: 24 is size of lov_ost_data)
+ *	  LOV LOGCOOKIE size max = 32 * 2000
+ *		(NB: 32 is size of llog_cookie)
+ * symlink:  FNAME_MAX + PATH_MAX  <- largest
+ * link:     FNAME_MAX + PATH_MAX  (mds_rec_link < mds_rec_create)
+ * rename:   FNAME_MAX + FNAME_MAX
+ * open:     FNAME_MAX + EA_MAX
+ *
+ * MDS_MAXREQSIZE ~= 4736 bytes =
+ * lustre_msg + ldlm_request + mdt_body + mds_rec_create + FNAME_MAX + PATH_MAX
+ * MDS_MAXREPSIZE ~= 8300 bytes = lustre_msg + llog_header
+ *
+ * Realistic size is about 512 bytes (20 character name + 128 char symlink),
+ * except in the open case where there are a large number of OSTs in a LOV.
+ */
+#define MDS_MAXREQSIZE		(5 * 1024)	/* >= 4736 */
+#define MDS_MAXREPSIZE		(9 * 1024)	/* >= 8300 */
+
+/**
+ * MDS incoming request with LOV EA
+ * 24 = sizeof(struct lov_ost_data), i.e: replay of opencreate
+ */
+#define MDS_LOV_MAXREQSIZE	max(MDS_MAXREQSIZE, \
+				    362 + LOV_MAX_STRIPE_COUNT * 24)
+/**
+ * MDS outgoing reply with LOV EA
+ *
+ * NB: max reply size Lustre 2.4+ client can get from old MDS is:
+ * LOV_MAX_STRIPE_COUNT * (llog_cookie + lov_ost_data) + extra bytes
+ *
+ * but 2.4 or later MDS will never send reply with llog_cookie to any
+ * version client. This macro is defined for server side reply buffer size.
+ */
+#define MDS_LOV_MAXREPSIZE	MDS_LOV_MAXREQSIZE
+
+/**
+ * This is the size of a maximum REINT_SETXATTR request:
+ *
+ *   lustre_msg		 56 (32 + 4 x 5 + 4)
+ *   ptlrpc_body	184
+ *   mdt_rec_setxattr	136
+ *   lustre_capa	120
+ *   name		256 (XATTR_NAME_MAX)
+ *   value	      65536 (XATTR_SIZE_MAX)
+ */
+#define MDS_EA_MAXREQSIZE	66288
+
+/**
+ * These are the maximum request and reply sizes (rounded up to 1 KB
+ * boundaries) for the "regular" MDS_REQUEST_PORTAL and MDS_REPLY_PORTAL.
+ */
+#define MDS_REG_MAXREQSIZE	(((max(MDS_EA_MAXREQSIZE, \
+				       MDS_LOV_MAXREQSIZE) + 1023) >> 10) << 10)
+#define MDS_REG_MAXREPSIZE	MDS_REG_MAXREQSIZE
+
+/**
+ * An update request carries all the updates of a create, which may include
+ * a linkea (4K maximum) along with other updates, so the limit is set to 9K:
+ * lustre_msg + ptlrpc_body + UPDATE_BUF_SIZE (8K)
+ */
+#define MDS_OUT_MAXREQSIZE	(9 * 1024)
+#define MDS_OUT_MAXREPSIZE	MDS_MAXREPSIZE
+
+/** MDS_BUFSIZE = max_reqsize (w/o LOV EA) + max sptlrpc payload size */
+#define MDS_BUFSIZE		max(MDS_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \
+				    8 * 1024)
+
+/**
+ * MDS_REG_BUFSIZE must be at least MDS_REG_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD.
+ * However, a much larger buffer is allocated because LNet requires each
+ * MD (rqbd) to have at least MDS_REG_MAXREQSIZE bytes left, to avoid
+ * dropping a maximum-sized incoming request.  If MDS_REG_BUFSIZE were only a
+ * little larger than MDS_REG_MAXREQSIZE, an rqbd could hold just one request
+ * even with about MDS_REG_MAXREQSIZE bytes still free in it, and memory
+ * utilization would be very low.
+ *
+ * On the other hand, an rqbd must not be too large, because it cannot be
+ * reused until every request placed in it has been processed and released,
+ * so a single long-blocked request can keep the whole rqbd from being reused.
+ * The request buffer size is therefore set to 160 KB: even if each rqbd is
+ * unlinked from LNet with 65 KB unused, buffer utilization is about 59%.
+ * Please check LU-2432 for details.
+ */
+#define MDS_REG_BUFSIZE		max(MDS_REG_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \
+				    160 * 1024)
+
+/**
+ * MDS_OUT_BUFSIZE = max_out_reqsize + max sptlrpc payload (~1K), which is
+ * about 10K; for the same reason as MDS_REG_BUFSIZE, each request buffer is
+ * given some extra bytes to improve the buffer utilization rate.
+ */
+#define MDS_OUT_BUFSIZE		max(MDS_OUT_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \
+				    24 * 1024)
+
+/** FLD_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc */
+#define FLD_MAXREQSIZE  (160)
+
+/** FLD_MAXREPSIZE == lustre_msg + ptlrpc_body */
+#define FLD_MAXREPSIZE  (152)
+#define FLD_BUFSIZE	(1 << 12)
+
+/**
+ * SEQ_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc + lu_range +
+ * __u32 padding */
+#define SEQ_MAXREQSIZE  (160)
+
+/** SEQ_MAXREPSIZE == lustre_msg + ptlrpc_body + lu_range */
+#define SEQ_MAXREPSIZE  (152)
+#define SEQ_BUFSIZE	(1 << 12)
+
+/** MGS threads must be >= 3, see bug 22458 comment #28 */
+#define MGS_NTHRS_INIT	(PTLRPC_NTHRS_INIT + 1)
+#define MGS_NTHRS_MAX	32
+
+#define MGS_NBUFS       64
+#define MGS_BUFSIZE     (8 * 1024)
+#define MGS_MAXREQSIZE  (7 * 1024)
+#define MGS_MAXREPSIZE  (9 * 1024)
+
+ /*
+  * OSS threads constants:
+  *
+  * Given 8 as factor and 64 as base threads number
+  *
+  * example 1):
+  * On 8-core server configured to 2 partitions, we will have
+  * 64 + 8 * 4 = 96 threads for each partition, 192 total threads.
+  *
+  * example 2):
+  * On 32-core machine configured to 4 partitions, we will have
+  * 64 + 8 * 8 = 128 threads for each partition, so total threads number
+  * will be 128 * 4 = 512.
+  *
+  * example 3):
+  * On 64-core machine configured to 4 partitions, we will have
+  * 64 + 16 * 8 = 192 threads for each partition, so total threads number
+  * will be 192 * 4 = 768 which is above limit OSS_NTHRS_MAX(512), so we
+  * cut off the value to OSS_NTHRS_MAX(512) / 4 which is 128 threads
+  * for each partition.
+  *
+  * So we can see that with these constants, the threads number will be at a
+  * similar level to old versions, unless the server has many cores.
+  */
+ /* reduce the thread factor on hosts (e.g. VMs) with little memory */
+#define OSS_THR_FACTOR		min_t(int, 8, \
+				NUM_CACHEPAGES >> (28 - PAGE_CACHE_SHIFT))
+#define OSS_NTHRS_INIT		(PTLRPC_NTHRS_INIT + 1)
+#define OSS_NTHRS_BASE		64
+#define OSS_NTHRS_MAX		512
+
+/* threads for handling "create" request */
+#define OSS_CR_THR_FACTOR	1
+#define OSS_CR_NTHRS_INIT	PTLRPC_NTHRS_INIT
+#define OSS_CR_NTHRS_BASE	8
+#define OSS_CR_NTHRS_MAX	64
+
+/**
+ * OST_IO_MAXREQSIZE ~=
+ *	lustre_msg + ptlrpc_body + obdo + obd_ioobj +
+ *	DT_MAX_BRW_PAGES * niobuf_remote
+ *
+ * - single object with 16 pages is 512 bytes
+ * - OST_IO_MAXREQSIZE must be at least 1 page of cookies plus some spillover
+ * - Must be a multiple of 1024
+ * - actual size is about 18K
+ */
+#define _OST_MAXREQSIZE_SUM (sizeof(struct lustre_msg) + \
+			     sizeof(struct ptlrpc_body) + \
+			     sizeof(struct obdo) + \
+			     sizeof(struct obd_ioobj) + \
+			     sizeof(struct niobuf_remote) * DT_MAX_BRW_PAGES)
+/**
+ * FIEMAP request can be 4K+ for now
+ */
+#define OST_MAXREQSIZE		(5 * 1024)
+#define OST_IO_MAXREQSIZE	max_t(int, OST_MAXREQSIZE, \
+				(((_OST_MAXREQSIZE_SUM - 1) | (1024 - 1)) + 1))
+
+#define OST_MAXREPSIZE		(9 * 1024)
+#define OST_IO_MAXREPSIZE	OST_MAXREPSIZE
+
+#define OST_NBUFS		64
+/** OST_BUFSIZE = max_reqsize + max sptlrpc payload size */
+#define OST_BUFSIZE		max_t(int, OST_MAXREQSIZE + 1024, 16 * 1024)
+/**
+ * OST_IO_MAXREQSIZE is about 18K; the extra 46K improves the utilization
+ * rate of the request buffer, see the comment on MDS_REG_BUFSIZE for details.
+ */
+#define OST_IO_BUFSIZE		max_t(int, OST_IO_MAXREQSIZE + 1024, 64 * 1024)
+
+/* Hides a typecast: &req->rq_async_args as void *; \a req is parenthesized. */
+#define ptlrpc_req_async_args(req) ((void *)&(req)->rq_async_args)
+
+/**
+ * Structure describing a single portal connection.
+ */
+struct ptlrpc_connection {
+	/** Linkage for the connections hash table */
+	struct hlist_node	c_hash;
+	/** Our own LNet nid for this connection */
+	lnet_nid_t	      c_self;
+	/** Remote side LNet process id for this connection */
+	lnet_process_id_t       c_peer;
+	/** UUID of the other side */
+	struct obd_uuid	 c_remote_uuid;
+	/** Reference counter for this connection */
+	atomic_t	    c_refcount;
+};
+
+/** Client definition for PortalRPC */
+struct ptlrpc_client {
+	/** LNet portal this client sends messages to by default */
+	__u32		   cli_request_portal;
+	/** Portal on which replies are expected */
+	__u32		   cli_reply_portal;
+	/** Name of the client */
+	char		   *cli_name;
+};
+
+/** state flags of requests */
+/* XXX only ones left are those used by the bulk descs as well! */
+#define PTL_RPC_FL_INTR      (1 << 0)  /* reply wait was interrupted by user */
+#define PTL_RPC_FL_TIMEOUT   (1 << 7)  /* request timed out waiting for reply */
+
+#define REQ_MAX_ACK_LOCKS 8
+
+union ptlrpc_async_args {
+	/**
+	 * Scratchpad for passing args to the completion interpreter. Users
+	 * cast it to a struct of their choosing, and CLASSERT that this union
+	 * is big enough.  For _tons_ of context, OBD_ALLOC a struct and store
+	 * a pointer to it here.  The pointer_arg member ensures the union is
+	 * at least big enough to hold such a pointer.
+	 */
+	void      *pointer_arg[11];
+	__u64      space[7];
+};
+
+struct ptlrpc_request_set;
+typedef int (*set_interpreter_func)(struct ptlrpc_request_set *, void *, int);
+typedef int (*set_producer_func)(struct ptlrpc_request_set *, void *);
+
+/**
+ * Definition of request set structure.
+ * A request set is a list of requests (not necessarily to the same target)
+ * that, once populated with RPCs, can be sent in parallel.
+ * There are two kinds of request sets: general purpose ones and ones with a
+ * dedicated serving thread; an example of the latter is the ptlrpcd set.
+ * Once a general purpose set has started sending, no new requests can be
+ * added to it.
+ * A set provides a way to call "completion callbacks" when all requests in
+ * the set have returned.
+ */
+struct ptlrpc_request_set {
+	atomic_t	  set_refcount;
+	/** number of requests in the queue */
+	atomic_t	  set_new_count;
+	/** number of uncompleted requests */
+	atomic_t	  set_remaining;
+	/** wait queue to wait on for request events */
+	wait_queue_head_t	   set_waitq;
+	wait_queue_head_t	  *set_wakeup_ptr;
+	/** List of requests in the set */
+	struct list_head	    set_requests;
+	/**
+	 * List of completion callbacks to be called when the set is completed
+	 * This is only used if \a set_interpret is NULL.
+	 * Links struct ptlrpc_set_cbdata.
+	 */
+	struct list_head	    set_cblist;
+	/** Completion callback, if only one. */
+	set_interpreter_func  set_interpret;
+	/** opaque argument passed to completion \a set_interpret callback. */
+	void		 *set_arg;
+	/**
+	 * Lock for \a set_new_requests manipulations;
+	 * it is taken so that any old caller can communicate requests to
+	 * the set holder who can then fold them into the lock-free set
+	 */
+	spinlock_t		set_new_req_lock;
+	/** List of new, not yet sent requests. Only used with ptlrpcd now. */
+	struct list_head	    set_new_requests;
+
+	/** rq_status of requests that have been freed already */
+	int		   set_rc;
+	/** Additional fields used by the flow control extension */
+	/** Maximum number of RPCs in flight */
+	int		   set_max_inflight;
+	/** Callback function used to generate RPCs */
+	set_producer_func     set_producer;
+	/** opaque argument passed to the producer callback */
+	void		 *set_producer_arg;
+};
+
+/**
+ * Description of a single ptlrpc_set callback
+ */
+struct ptlrpc_set_cbdata {
+	/** List linkage item */
+	struct list_head	      psc_item;
+	/** Pointer to the interpreting function */
+	set_interpreter_func    psc_interpret;
+	/** Opaque argument to pass to the callback */
+	void		   *psc_data;
+};
+
+struct ptlrpc_bulk_desc;
+struct ptlrpc_service_part;
+struct ptlrpc_service;
+
+/**
+ * ptlrpc callback & work item description
+ */
+struct ptlrpc_cb_id {
+	void   (*cbid_fn)(lnet_event_t *ev);     /* callback taking the event */
+	void    *cbid_arg;		      /* additional arg */
+};
+
+/** Maximum number of locks to fit into reply state */
+#define RS_MAX_LOCKS 8
+#define RS_DEBUG     0	/* non-zero adds rs_debug_list to the reply state */
+
+/**
+ * Structure to define reply state on the server.
+ * Reply state holds various reply message information. Also, for "difficult"
+ * replies (rep-ack case) we store the state after sending the reply and wait
+ * for the client to acknowledge the reception. In these cases locks could be
+ * added to the state for replay/failover consistency guarantees.
+ */
+struct ptlrpc_reply_state {
+	/** Callback description */
+	struct ptlrpc_cb_id    rs_cb_id;
+	/** Linkage for list of all reply states in a system */
+	struct list_head	     rs_list;
+	/** Linkage for list of all reply states on same export */
+	struct list_head	     rs_exp_list;
+	/** Linkage for list of all reply states for same obd */
+	struct list_head	     rs_obd_list;
+#if RS_DEBUG
+	struct list_head	     rs_debug_list;
+#endif
+	/** A spinlock to protect the reply state flags */
+	spinlock_t		rs_lock;
+	/** Reply state flags */
+	unsigned long	  rs_difficult:1;     /* ACK/commit stuff */
+	unsigned long	  rs_no_ack:1;    /* no ACK, even for
+						  difficult requests */
+	unsigned long	  rs_scheduled:1;     /* being handled? */
+	unsigned long	  rs_scheduled_ever:1;/* any schedule attempts? */
+	unsigned long	  rs_handled:1;  /* been handled yet? */
+	unsigned long	  rs_on_net:1;   /* reply_out_callback pending? */
+	unsigned long	  rs_prealloc:1; /* rs from prealloc list */
+	unsigned long	  rs_committed:1;/* the transaction was committed
+						 and the rs was dispatched
+						 by ptlrpc_commit_replies */
+	/** Size of the state */
+	int		    rs_size;
+	/** opcode */
+	__u32		  rs_opc;
+	/** Transaction number */
+	__u64		  rs_transno;
+	/** xid */
+	__u64		  rs_xid;
+	struct obd_export     *rs_export;
+	struct ptlrpc_service_part *rs_svcpt;
+	/** LNet metadata handle for the reply */
+	lnet_handle_md_t       rs_md_h;
+	atomic_t	   rs_refcount;
+
+	/** Context for the service thread */
+	struct ptlrpc_svc_ctx *rs_svc_ctx;
+	/** Reply buffer (actually sent to the client), encoded if needed */
+	struct lustre_msg     *rs_repbuf;       /* wrapper */
+	/** Size of the reply buffer */
+	int		    rs_repbuf_len;   /* wrapper buf length */
+	/** Size of the reply message */
+	int		    rs_repdata_len;  /* wrapper msg length */
+	/**
+	 * Actual reply message. Its content is encrypted (if needed) to
+	 * produce the reply buffer for actual sending. In the simple case
+	 * of no network encryption we just set \a rs_repbuf to \a rs_msg
+	 */
+	struct lustre_msg     *rs_msg;	  /* reply message */
+
+	/** Number of locks awaiting client ACK */
+	int		    rs_nlocks;
+	/** Handles of locks awaiting client reply ACK (up to RS_MAX_LOCKS) */
+	struct lustre_handle   rs_locks[RS_MAX_LOCKS];
+	/** Lock modes of locks in \a rs_locks */
+	ldlm_mode_t	    rs_modes[RS_MAX_LOCKS];
+};
+
+struct ptlrpc_thread;
+
+/** RPC stages; all values share the 0xebc0de00 prefix, low byte differs */
+enum rq_phase {
+	RQ_PHASE_NEW	    = 0xebc0de00,
+	RQ_PHASE_RPC	    = 0xebc0de01,
+	RQ_PHASE_BULK	   = 0xebc0de02,
+	RQ_PHASE_INTERPRET      = 0xebc0de03,
+	RQ_PHASE_COMPLETE       = 0xebc0de04,
+	RQ_PHASE_UNREGISTERING  = 0xebc0de05,
+	RQ_PHASE_UNDEFINED      = 0xebc0de06
+};
+
+/** Request interpreter call-back type (name is spelled "interpterer") */
+typedef int (*ptlrpc_interpterer_t)(const struct lu_env *env,
+				    struct ptlrpc_request *req,
+				    void *arg, int rc);
+
+/**
+ * Definition of request pool structure.
+ * The pool stores empty preallocated requests for the case where
+ * something has to be sent without performing any allocations
+ * (to avoid e.g. OOM).
+ */
+struct ptlrpc_request_pool {
+	/** Locks the list */
+	spinlock_t prp_lock;
+	/** list of ptlrpc_request structs */
+	struct list_head prp_req_list;
+	/** Maximum message size that would fit into a request from this pool */
+	int prp_rq_size;
+	/** Function to allocate more requests for this pool */
+	void (*prp_populate)(struct ptlrpc_request_pool *, int);
+};
+
+struct lu_context;
+struct lu_env;
+
+struct ldlm_lock;
+
+/**
+ * \defgroup nrs Network Request Scheduler
+ * @{
+ */
+struct ptlrpc_nrs_policy;
+struct ptlrpc_nrs_resource;
+struct ptlrpc_nrs_request;
+
+/**
+ * NRS control operations.
+ *
+ * These are common to all policies.
+ */
+enum ptlrpc_nrs_ctl {
+	/**
+	 * Not a valid opcode.
+	 */
+	PTLRPC_NRS_CTL_INVALID,
+	/**
+	 * Activate the policy.
+	 */
+	PTLRPC_NRS_CTL_START,
+	/**
+	 * Reserved for multiple primary policies, which may be a possibility
+	 * in the future.
+	 */
+	PTLRPC_NRS_CTL_STOP,
+	/**
+	 * First opcode available to policy-specific controls (e.g. the ORR
+	 * opcodes in enum nrs_ctl_orr start here); the value is arbitrary.
+	 */
+	PTLRPC_NRS_CTL_1ST_POL_SPEC = 0x20,
+};
+
+/**
+ * ORR policy opcodes, numbered from PTLRPC_NRS_CTL_1ST_POL_SPEC onwards
+ */
+enum nrs_ctl_orr {
+	NRS_CTL_ORR_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC,
+	NRS_CTL_ORR_WR_QUANTUM,
+	NRS_CTL_ORR_RD_OFF_TYPE,
+	NRS_CTL_ORR_WR_OFF_TYPE,
+	NRS_CTL_ORR_RD_SUPP_REQ,
+	NRS_CTL_ORR_WR_SUPP_REQ,
+};
+
+/**
+ * NRS policy operations.
+ *
+ * These determine the behaviour of a policy, and are called in response to
+ * NRS core events.
+ */
+struct ptlrpc_nrs_pol_ops {
+	/**
+	 * Called during policy registration; this operation is optional.
+	 *
+	 * \param[in,out] policy The policy being initialized
+	 */
+	int	(*op_policy_init) (struct ptlrpc_nrs_policy *policy);
+	/**
+	 * Called during policy unregistration; this operation is optional.
+	 *
+	 * \param[in,out] policy The policy being unregistered/finalized
+	 */
+	void	(*op_policy_fini) (struct ptlrpc_nrs_policy *policy);
+	/**
+	 * Called when activating a policy via lprocfs; policies allocate and
+	 * initialize their resources here; this operation is optional.
+	 *
+	 * \param[in,out] policy The policy being started
+	 *
+	 * \see nrs_policy_start_locked()
+	 */
+	int	(*op_policy_start) (struct ptlrpc_nrs_policy *policy);
+	/**
+	 * Called when deactivating a policy via lprocfs; policies deallocate
+	 * their resources here; this operation is optional
+	 *
+	 * \param[in,out] policy The policy being stopped
+	 *
+	 * \see nrs_policy_stop0()
+	 */
+	void	(*op_policy_stop) (struct ptlrpc_nrs_policy *policy);
+	/**
+	 * Used for policy-specific operations; i.e. not generic ones like
+	 * \e PTLRPC_NRS_CTL_START and \e PTLRPC_NRS_CTL_GET_INFO; analogous
+	 * to an ioctl; this operation is optional.
+	 *
+	 * \param[in,out]	 policy The policy carrying out operation \a opc
+	 * \param[in]	  opc	 The command operation being carried out
+	 * \param[in,out] arg	 An generic buffer for communication between the
+	 *			 user and the control operation
+	 *
+	 * \retval -ve error
+	 * \retval   0 success
+	 *
+	 * \see ptlrpc_nrs_policy_control()
+	 */
+	int	(*op_policy_ctl) (struct ptlrpc_nrs_policy *policy,
+				  enum ptlrpc_nrs_ctl opc, void *arg);
+
+	/**
+	 * Called when obtaining references to the resources of the resource
+	 * hierarchy for a request that has arrived for handling at the PTLRPC
+	 * service. Policies should return -ve for requests they do not wish
+	 * to handle. This operation is mandatory.
+	 *
+	 * \param[in,out] policy  The policy we're getting resources for.
+	 * \param[in,out] nrq	  The request we are getting resources for.
+	 * \param[in]	  parent  The parent resource of the resource being
+	 *			  requested; set to NULL if none.
+	 * \param[out]	  resp	  The resource is to be returned here; the
+	 *			  fallback policy in an NRS head should
+	 *			  \e always return a non-NULL pointer value.
+	 * \param[in]  moving_req When set, signifies that this is an attempt
+	 *			  to obtain resources for a request being moved
+	 *			  to the high-priority NRS head by
+	 *			  ldlm_lock_reorder_req().
+	 *			  This implies two things:
+	 *			  1. We are under obd_export::exp_rpc_lock and
+	 *			  so should not sleep.
+	 *			  2. We should not perform non-idempotent or can
+	 *			  skip performing idempotent operations that
+	 *			  were carried out when resources were first
+	 *			  taken for the request when it was initialized
+	 *			  in ptlrpc_nrs_req_initialize().
+	 *
+	 * \retval 0, +ve The level of the returned resource in the resource
+	 *		  hierarchy; currently only 0 (for a non-leaf resource)
+	 *		  and 1 (for a leaf resource) are supported by the
+	 *		  framework.
+	 * \retval -ve	  error
+	 *
+	 * \see ptlrpc_nrs_req_initialize()
+	 * \see ptlrpc_nrs_hpreq_add_nolock()
+	 * \see ptlrpc_nrs_req_hp_move()
+	 */
+	int	(*op_res_get) (struct ptlrpc_nrs_policy *policy,
+			       struct ptlrpc_nrs_request *nrq,
+			       const struct ptlrpc_nrs_resource *parent,
+			       struct ptlrpc_nrs_resource **resp,
+			       bool moving_req);
+	/**
+	 * Called when releasing references taken for resources in the resource
+	 * hierarchy for the request; this operation is optional.
+	 *
+	 * \param[in,out] policy The policy the resource belongs to
+	 * \param[in] res	 The resource to be freed
+	 *
+	 * \see ptlrpc_nrs_req_finalize()
+	 * \see ptlrpc_nrs_hpreq_add_nolock()
+	 * \see ptlrpc_nrs_req_hp_move()
+	 */
+	void	(*op_res_put) (struct ptlrpc_nrs_policy *policy,
+			       const struct ptlrpc_nrs_resource *res);
+
+	/**
+	 * Obtains a request for handling from the policy, and optionally
+	 * removes the request from the policy; this operation is mandatory.
+	 *
+	 * \param[in,out] policy The policy to poll
+	 * \param[in]	  peek	 When set, signifies that we just want to
+	 *			 examine the request, and not handle it, so the
+	 *			 request is not removed from the policy.
+	 * \param[in]	  force	 When set, it will force a policy to return a
+	 *			 request if it has one queued.
+	 *
+	 * \retval NULL No request available for handling
+	 * \retval valid-pointer The request polled for handling
+	 *
+	 * \see ptlrpc_nrs_req_get_nolock()
+	 */
+	struct ptlrpc_nrs_request *
+		(*op_req_get) (struct ptlrpc_nrs_policy *policy, bool peek,
+			       bool force);
+	/**
+	 * Called when attempting to add a request to a policy for later
+	 * handling; this operation is mandatory.
+	 *
+	 * \param[in,out] policy  The policy on which to enqueue \a nrq
+	 * \param[in,out] nrq The request to enqueue
+	 *
+	 * \retval 0	success
+	 * \retval != 0	error
+	 *
+	 * \see ptlrpc_nrs_req_add_nolock()
+	 */
+	int	(*op_req_enqueue) (struct ptlrpc_nrs_policy *policy,
+				   struct ptlrpc_nrs_request *nrq);
+	/**
+	 * Removes a request from the policy's set of pending requests. Normally
+	 * called after a request has been polled successfully from the policy
+	 * for handling; this operation is mandatory.
+	 *
+	 * \param[in,out] policy The policy the request \a nrq belongs to
+	 * \param[in,out] nrq    The request to dequeue
+	 *
+	 * \see ptlrpc_nrs_req_del_nolock()
+	 */
+	void	(*op_req_dequeue) (struct ptlrpc_nrs_policy *policy,
+				   struct ptlrpc_nrs_request *nrq);
+	/**
+	 * Called after the request being carried out. Could be used for
+	 * job/resource control; this operation is optional.
+	 *
+	 * \param[in,out] policy The policy which is stopping to handle request
+	 *			 \a nrq
+	 * \param[in,out] nrq	 The request
+	 *
+	 * \pre spin_is_locked(&svcpt->scp_req_lock)
+	 *
+	 * \see ptlrpc_nrs_req_stop_nolock()
+	 */
+	void	(*op_req_stop) (struct ptlrpc_nrs_policy *policy,
+				struct ptlrpc_nrs_request *nrq);
+	/**
+	 * Registers the policy's lprocfs interface with a PTLRPC service.
+	 *
+	 * \param[in] svc The service
+	 *
+	 * \retval 0	success
+	 * \retval != 0	error
+	 */
+	int	(*op_lprocfs_init) (struct ptlrpc_service *svc);
+	/**
+	 * Unregisters the policy's lprocfs interface with a PTLRPC service.
+	 *
+	 * In cases of failed policy registration in
+	 * \e ptlrpc_nrs_policy_register(), this function may be called for a
+	 * service which has not registered the policy successfully, so
+	 * implementations of this method should make sure their operations are
+	 * safe in such cases.
+	 *
+	 * \param[in] svc The service
+	 */
+	void	(*op_lprocfs_fini) (struct ptlrpc_service *svc);
+};
+
+/**
+ * Policy flags; used as a bitmask, see ptlrpc_nrs_pol_conf::nc_flags
+ */
+enum nrs_policy_flags {
+	/**
+	 * Fallback policy, use this flag only on a single supported policy per
+	 * service. The flag cannot be used on policies that use
+	 * \e PTLRPC_NRS_FL_REG_EXTERN
+	 */
+	PTLRPC_NRS_FL_FALLBACK		= (1 << 0),
+	/**
+	 * Start policy immediately after registering.
+	 */
+	PTLRPC_NRS_FL_REG_START		= (1 << 1),
+	/**
+	 * This is a policy registering from a module different to the one NRS
+	 * core ships in (currently ptlrpc).
+	 */
+	PTLRPC_NRS_FL_REG_EXTERN	= (1 << 2),
+};
+
+/**
+ * NRS queue type.
+ *
+ * Denotes whether an NRS instance is for handling normal or high-priority
+ * RPCs, or whether an operation pertains to one or both of the NRS instances
+ * in a service.
+ */
+enum ptlrpc_nrs_queue_type {
+	PTLRPC_NRS_QUEUE_REG	= (1 << 0),	/**< regular (normal) RPCs */
+	PTLRPC_NRS_QUEUE_HP	= (1 << 1),	/**< high-priority RPCs */
+	PTLRPC_NRS_QUEUE_BOTH	= (PTLRPC_NRS_QUEUE_REG | PTLRPC_NRS_QUEUE_HP)
+};
+
+/**
+ * NRS head
+ *
+ * A PTLRPC service has at least one NRS head instance for handling normal
+ * priority RPCs, and may optionally have a second NRS head instance for
+ * handling high-priority RPCs. Each NRS head maintains a list of available
+ * policies, of which one and only one policy is acting as the fallback policy,
+ * and optionally a different policy may be acting as the primary policy. For
+ * all RPCs handled by this NRS head instance, NRS core will first attempt to
+ * enqueue the RPC using the primary policy (if any). The fallback policy is
+ * used in the following cases:
+ * - when there was no primary policy in the
+ *   ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the request
+ *   was initialized.
+ * - when the primary policy that was at the
+ *   ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the
+ *   RPC was initialized, denoted it did not wish, or for some other reason was
+ *   not able to handle the request, by returning a non-valid NRS resource
+ *   reference.
+ * - when the primary policy that was at the
+ *   ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the
+ *   RPC was initialized, fails later during the request enqueueing stage.
+ *
+ * \see nrs_resource_get_safe()
+ * \see nrs_request_enqueue()
+ */
+struct ptlrpc_nrs {
+	spinlock_t			nrs_lock;
+	/** XXX Possibly replace svcpt->scp_req_lock with another lock here. */
+	/**
+	 * List of registered policies
+	 */
+	struct list_head			nrs_policy_list;
+	/**
+	 * List of policies with queued requests. Policies that have any
+	 * outstanding requests are queued here, and this list is queried
+	 * in a round-robin manner from NRS core when obtaining a request
+	 * for handling. This ensures that requests from policies that at some
+	 * point transition away from the
+	 * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state are drained.
+	 */
+	struct list_head			nrs_policy_queued;
+	/**
+	 * Service partition for this NRS head
+	 */
+	struct ptlrpc_service_part     *nrs_svcpt;
+	/**
+	 * Primary policy, which is the preferred policy for handling RPCs
+	 */
+	struct ptlrpc_nrs_policy       *nrs_policy_primary;
+	/**
+	 * Fallback policy, which is the backup policy for handling RPCs
+	 */
+	struct ptlrpc_nrs_policy       *nrs_policy_fallback;
+	/**
+	 * This NRS head handles either HP or regular requests
+	 */
+	enum ptlrpc_nrs_queue_type	nrs_queue_type;
+	/**
+	 * # queued requests from all policies in this NRS head
+	 */
+	unsigned long			nrs_req_queued;
+	/**
+	 * # scheduled requests from all policies in this NRS head
+	 */
+	unsigned long			nrs_req_started;
+	/**
+	 * # policies on this NRS
+	 */
+	unsigned			nrs_num_pols;
+	/**
+	 * This NRS head is in progress of starting a policy
+	 */
+	unsigned			nrs_policy_starting:1;
+	/**
+	 * In progress of shutting down the whole NRS head; used during
+	 * unregistration
+	 */
+	unsigned			nrs_stopping:1;
+};
+
+#define NRS_POL_NAME_MAX		16	/* size of policy name buffers */
+
+struct ptlrpc_nrs_pol_desc;
+
+/**
+ * Service compatibility predicate; this determines whether a policy is adequate
+ * for handling RPCs of a particular PTLRPC service.
+ *
+ * XXX: This should give the same result during policy registration and
+ * unregistration, and for all partitions of a service; so the result should not
+ * depend on temporal service or other properties, that may influence the
+ * result.
+ */
+typedef bool (*nrs_pol_desc_compat_t) (const struct ptlrpc_service *svc,
+				       const struct ptlrpc_nrs_pol_desc *desc);
+
+struct ptlrpc_nrs_pol_conf {
+	/**
+	 * Human-readable policy name
+	 */
+	char				   nc_name[NRS_POL_NAME_MAX];
+	/**
+	 * NRS operations for this policy
+	 */
+	const struct ptlrpc_nrs_pol_ops	  *nc_ops;
+	/**
+	 * Service compatibility predicate
+	 */
+	nrs_pol_desc_compat_t		   nc_compat;
+	/**
+	 * Set for policies that support a single ptlrpc service, i.e. ones that
+	 * have \a nc_compat set to nrs_policy_compat_one(). The variable value
+	 * depicts the name of the single service that such policies are
+	 * compatible with.
+	 */
+	const char			  *nc_compat_svc_name;
+	/**
+	 * Owner module for this policy descriptor; policies registering from a
+	 * different module to the one the NRS framework is held within
+	 * (currently ptlrpc), should set this field to THIS_MODULE.
+	 */
+	module_t			  *nc_owner;
+	/**
+	 * Policy registration flags; a bitmask of \e nrs_policy_flags
+	 */
+	unsigned			   nc_flags;
+};
+
+/**
+ * NRS policy registering descriptor
+ *
+ * Is used to hold a description of a policy that can be passed to NRS core in
+ * order to register the policy with NRS heads in different PTLRPC services.
+ */
+struct ptlrpc_nrs_pol_desc {
+	/**
+	 * Human-readable policy name
+	 */
+	char					pd_name[NRS_POL_NAME_MAX];
+	/**
+	 * Link into nrs_core::nrs_policies
+	 */
+	struct list_head				pd_list;
+	/**
+	 * NRS operations for this policy
+	 */
+	const struct ptlrpc_nrs_pol_ops	       *pd_ops;
+	/**
+	 * Service compatibility predicate
+	 */
+	nrs_pol_desc_compat_t			pd_compat;
+	/**
+	 * Set for policies that are compatible with only one PTLRPC service.
+	 *
+	 * \see ptlrpc_nrs_pol_conf::nc_compat_svc_name
+	 */
+	const char			       *pd_compat_svc_name;
+	/**
+	 * Owner module for this policy descriptor.
+	 *
+	 * We need to hold a reference to the module whenever we might make use
+	 * of any of the module's contents, i.e.
+	 * - If one or more instances of the policy are at a state where they
+	 *   might be handling a request, i.e.
+	 *   ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or
+	 *   ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING as we will have to
+	 *   call into the policy's ptlrpc_nrs_pol_ops handlers. A reference
+	 *   is taken on the module when
+	 *   \e ptlrpc_nrs_pol_desc::pd_refs becomes 1, and released when it
+	 *   becomes 0, so that we hold only one reference to the module maximum
+	 *   at any time.
+	 *
+	 *   We do not need to hold a reference to the module, even though we
+	 *   might use code and data from the module, in the following cases:
+	 * - During external policy registration, because this should happen in
+	 *   the module's init() function, in which case the module is safe from
+	 *   removal because a reference is being held on the module by the
+	 *   kernel, and iirc kmod (and I guess module-init-tools also) will
+	 *   serialize any racing processes properly anyway.
+	 * - During external policy unregistration, because this should happen
+	 *   in a module's exit() function, and any attempts to start a policy
+	 *   instance would need to take a reference on the module, and this is
+	 *   not possible once we have reached the point where the exit()
+	 *   handler is called.
+	 * - During service registration and unregistration, as service setup
+	 *   and cleanup, and policy registration, unregistration and policy
+	 *   instance starting, are serialized by \e nrs_core::nrs_mutex, so
+	 *   as long as users adhere to the convention of registering policies
+	 *   in init() and unregistering them in module exit() functions, there
+	 *   should not be a race between these operations.
+	 * - During any policy-specific lprocfs operations, because a reference
+	 *   is held by the kernel on a proc entry that has been entered by a
+	 *   syscall, so as long as proc entries are removed during
+	 *   unregistration time, then unregistration and lprocfs operations
+	 *   will be properly serialized.
+	 */
+	module_t			       *pd_owner;
+	/**
+	 * Bitmask of \e nrs_policy_flags
+	 */
+	unsigned				pd_flags;
+	/**
+	 * # of references on this descriptor
+	 */
+	atomic_t				pd_refs;
+};
+
+/**
+ * NRS policy state
+ *
+ * Policies transition from one state to the other during their lifetime
+ */
+enum ptlrpc_nrs_pol_state {
+	/**
+	 * Not a valid policy state.
+	 */
+	NRS_POL_STATE_INVALID,
+	/**
+	 * Policies are at this state either at the start of their life, or
+	 * transition here when the user selects a different policy to act
+	 * as the primary one.
+	 */
+	NRS_POL_STATE_STOPPED,
+	/**
+	 * Policy is in progress of stopping
+	 */
+	NRS_POL_STATE_STOPPING,
+	/**
+	 * Policy is in progress of starting
+	 */
+	NRS_POL_STATE_STARTING,
+	/**
+	 * A policy is in this state in two cases:
+	 * - it is the fallback policy, which is always in this state.
+	 * - it has been activated by the user; i.e. it is the primary policy.
+	 */
+	NRS_POL_STATE_STARTED,
+};
+
+/**
+ * NRS policy information
+ *
+ * Used for obtaining information about the status of a policy via lprocfs
+ */
+struct ptlrpc_nrs_pol_info {
+	/**
+	 * Policy name
+	 */
+	char				pi_name[NRS_POL_NAME_MAX];
+	/**
+	 * Current policy state
+	 */
+	enum ptlrpc_nrs_pol_state	pi_state;
+	/**
+	 * # RPCs enqueued for later dispatching by the policy
+	 */
+	long				pi_req_queued;
+	/**
+	 * # RPCs started for dispatch by the policy
+	 */
+	long				pi_req_started;
+	/**
+	 * Is this a fallback policy?
+	 */
+	unsigned			pi_fallback:1;
+};
+
+/**
+ * NRS policy
+ *
+ * There is one instance of this for each policy in each NRS head of each
+ * PTLRPC service partition.
+ */
+struct ptlrpc_nrs_policy {
+	/**
+	 * Linkage into the NRS head's list of policies,
+	 * ptlrpc_nrs::nrs_policy_list
+	 */
+	struct list_head			pol_list;
+	/**
+	 * Linkage into the NRS head's list of policies with enqueued
+	 * requests, ptlrpc_nrs::nrs_policy_queued
+	 */
+	struct list_head			pol_list_queued;
+	/**
+	 * Current state of this policy
+	 */
+	enum ptlrpc_nrs_pol_state	pol_state;
+	/**
+	 * Bitmask of nrs_policy_flags
+	 */
+	unsigned			pol_flags;
+	/**
+	 * # RPCs enqueued for later dispatching by the policy
+	 */
+	long				pol_req_queued;
+	/**
+	 * # RPCs started for dispatch by the policy
+	 */
+	long				pol_req_started;
+	/**
+	 * Usage reference count taken on the policy instance
+	 */
+	long				pol_ref;
+	/**
+	 * The NRS head this policy has been created at
+	 */
+	struct ptlrpc_nrs	       *pol_nrs;
+	/**
+	 * Private policy data; varies by policy type
+	 */
+	void			       *pol_private;
+	/**
+	 * Policy descriptor for this policy instance.
+	 */
+	struct ptlrpc_nrs_pol_desc     *pol_desc;
+};
+
+/**
+ * NRS resource
+ *
+ * Resources are embedded into two types of NRS entities:
+ * - Inside NRS policies, in the policy's private data in
+ *   ptlrpc_nrs_policy::pol_private
+ * - In objects that act as prime-level scheduling entities in different NRS
+ *   policies; e.g. on a policy that performs round robin or similar order
+ *   scheduling across client NIDs, there would be one NRS resource per unique
+ *   client NID. On a policy which performs round robin scheduling across
+ *   backend filesystem objects, there would be one resource associated with
+ *   each of the backend filesystem objects partaking in the scheduling
+ *   performed by the policy.
+ *
+ * NRS resources share a parent-child relationship, in which resources embedded
+ * in policy instances are the parent entities, with all scheduling entities
+ * a policy schedules across being the children, thus forming a simple resource
+ * hierarchy. This hierarchy may be extended with one or more levels in the
+ * future if the ability to have more than one primary policy is added.
+ *
+ * Upon request initialization, references to the then active NRS policies are
+ * taken and used to later handle the dispatching of the request with one of
+ * these policies.
+ *
+ * \see nrs_resource_get_safe()
+ * \see ptlrpc_nrs_req_add()
+ */
+struct ptlrpc_nrs_resource {
+	/**
+	 * This NRS resource's parent; NULL for resources embedded in NRS
+	 * policy instances, i.e. the top-level ones.
+	 */
+	struct ptlrpc_nrs_resource     *res_parent;
+	/**
+	 * The policy associated with this resource.
+	 */
+	struct ptlrpc_nrs_policy       *res_policy;
+};
+
+enum {
+	NRS_RES_FALLBACK,	/* presumably the fallback policy's slot */
+	NRS_RES_PRIMARY,	/* presumably the primary policy's slot */
+	NRS_RES_MAX		/* size of ptlrpc_nrs_request::nr_res_ptrs */
+};
+
+/** \name fifo
+ *
+ * FIFO policy
+ *
+ * This policy is a logical wrapper around previous, non-NRS functionality.
+ * It dispatches RPCs in the same order as they arrive from the network. This
+ * policy is currently used as the fallback policy, and the only enabled policy
+ * on all NRS heads of all PTLRPC service partitions.
+ * @{
+ */
+
+/**
+ * Private data structure for the FIFO policy
+ */
+struct nrs_fifo_head {
+	/**
+	 * Resource object for this policy instance.
+	 */
+	struct ptlrpc_nrs_resource	fh_res;
+	/**
+	 * List of queued requests.
+	 */
+	struct list_head			fh_list;
+	/**
+	 * For debugging purposes.
+	 */
+	__u64				fh_sequence;
+};
+
+struct nrs_fifo_req {
+	struct list_head		fr_list; /* presumably links into fh_list */
+	__u64			fr_sequence; /* presumably from fh_sequence */
+};
+
+/** @} fifo */
+
+/**
+ * \name CRR-N
+ *
+ * CRR-N, Client Round Robin over NIDs
+ * @{
+ */
+
+/**
+ * Private data structure for the CRR-N policy
+ */
+struct nrs_crrn_net {
+	struct ptlrpc_nrs_resource	cn_res;
+	cfs_binheap_t		       *cn_binheap;
+	cfs_hash_t		       *cn_cli_hash;
+	/**
+	 * Used when a new scheduling round commences, in order to synchronize
+	 * all clients with the new round number.
+	 */
+	__u64				cn_round;
+	/**
+	 * Determines the relevant ordering amongst request batches within a
+	 * scheduling round.
+	 */
+	__u64				cn_sequence;
+	/**
+	 * Round Robin quantum; the maximum number of RPCs that each request
+	 * batch for each client can have in a scheduling round.
+	 */
+	__u16				cn_quantum;
+};
+
+/**
+ * Object representing a client in CRR-N, as identified by its NID
+ */
+struct nrs_crrn_client {
+	struct ptlrpc_nrs_resource	cc_res;
+	struct hlist_node		cc_hnode;
+	lnet_nid_t			cc_nid;	/**< NID identifying the client */
+	/**
+	 * The round number against which this client is currently scheduling
+	 * requests.
+	 */
+	__u64				cc_round;
+	/**
+	 * The sequence number used for requests scheduled by this client during
+	 * the current round number.
+	 */
+	__u64				cc_sequence;
+	atomic_t			cc_ref;
+	/**
+	 * Round Robin quantum; the maximum number of RPCs the client is allowed
+	 * to schedule in a single batch of each round.
+	 */
+	__u16				cc_quantum;
+	/**
+	 * # of pending requests for this client, on all existing rounds
+	 */
+	__u16				cc_active;
+};
+
+/**
+ * CRR-N NRS request definition; embedded in ptlrpc_nrs_request::nr_u
+ */
+struct nrs_crrn_req {
+	/**
+	 * Round number for this request; shared with all other requests in the
+	 * same batch.
+	 */
+	__u64			cr_round;
+	/**
+	 * Sequence number for this request; shared with all other requests in
+	 * the same batch.
+	 */
+	__u64			cr_sequence;
+};
+
+/**
+ * CRR-N policy operations; numbered from PTLRPC_NRS_CTL_1ST_POL_SPEC.
+ */
+enum nrs_ctl_crr {
+	/**
+	 * Read the RR quantum size of a CRR-N policy.
+	 */
+	NRS_CTL_CRRN_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC,
+	/**
+	 * Write the RR quantum size of a CRR-N policy.
+	 */
+	NRS_CTL_CRRN_WR_QUANTUM,
+};
+
+/** @} CRR-N */
+
+/**
+ * \name ORR/TRR
+ *
+ * ORR/TRR (Object-based Round Robin/Target-based Round Robin) NRS policies
+ * @{
+ */
+
+/**
+ * Lower and upper byte offsets of a brw RPC
+ */
+struct nrs_orr_req_range {
+	__u64		or_start;	/**< lower byte offset */
+	__u64		or_end;		/**< upper byte offset */
+};
+
+/**
+ * RPC types supported by the ORR/TRR policies
+ */
+enum nrs_orr_supp {
+	NOS_OST_READ  = (1 << 0),
+	NOS_OST_WRITE = (1 << 1),
+	NOS_OST_RW    = (NOS_OST_READ | NOS_OST_WRITE),
+	/**
+	 * Default value for policies; same as NOS_OST_READ.
+	 */
+	NOS_DFLT      = NOS_OST_READ
+};
+
+/**
+ * As unique keys for grouping RPCs together, we use the object's OST FID for
+ * the ORR policy, and the OST index for the TRR policy.
+ *
+ * XXX: We waste some space for TRR policy instances by using a union, but it
+ *	allows consolidating some of the code between ORR and TRR, and these
+ *	policies will probably eventually merge into one anyway.
+ */
+struct nrs_orr_key {
+	union {
+		/** object FID for ORR */
+		struct lu_fid	ok_fid;
+		/** OST index for TRR */
+		__u32		ok_idx;
+	};
+};
+
+/**
+ * The largest base string for unique hash/slab object names is
+ * "nrs_orr_reg_", so 13 bytes with the NUL terminator. We add 3 to this for
+ * the CPT id number, so this _should_ be more than enough for the maximum
+ * number of CPTs on any system. If it does happen that this statement is
+ * incorrect, nrs_orr_genobjname() will inevitably yield a non-unique name and
+ * cause kmem_cache_create() to complain (on Linux), so the erroneous situation
+ * will hopefully not go unnoticed.
+ */
+#define NRS_ORR_OBJ_NAME_MAX	(sizeof("nrs_orr_reg_") + 3)
+
+/**
+ * Private data structure for the ORR and TRR policies
+ */
+struct nrs_orr_data {
+	struct ptlrpc_nrs_resource	od_res;
+	cfs_binheap_t		       *od_binheap;
+	cfs_hash_t		       *od_obj_hash;
+	struct kmem_cache		       *od_cache;
+	/**
+	 * Used when a new scheduling round commences, in order to synchronize
+	 * all object or OST batches with the new round number.
+	 */
+	__u64				od_round;
+	/**
+	 * Determines the relevant ordering amongst request batches within a
+	 * scheduling round.
+	 */
+	__u64				od_sequence;
+	/**
+	 * RPC types that are currently supported.
+	 */
+	enum nrs_orr_supp		od_supp;
+	/**
+	 * Round Robin quantum; the maximum number of RPCs that each request
+	 * batch for each object or OST can have in a scheduling round.
+	 */
+	__u16				od_quantum;
+	/**
+	 * Whether to use physical disk offsets or logical file offsets.
+	 */
+	bool				od_physical;
+	/**
+	 * XXX: We need to provide a persistently allocated string to hold
+	 * unique object names for this policy, since in currently supported
+	 * versions of Linux by Lustre, kmem_cache_create() just sets a pointer
+	 * to the name string provided. kstrdup() is used in the version of
+	 * kmem_cache_create() in current Linux mainline, so we may be able to
+	 * remove this in the future.
+	 */
+	char				od_objname[NRS_ORR_OBJ_NAME_MAX];
+};
+
+/**
+ * Represents a backend-fs object or OST in the ORR and TRR policies
+ * respectively.
+ */
+struct nrs_orr_object {
+	struct ptlrpc_nrs_resource	oo_res;
+	struct hlist_node		oo_hnode;
+	/**
+	 * The round number against which requests are being scheduled for this
+	 * object or OST.
+	 */
+	__u64				oo_round;
+	/**
+	 * The sequence number used for requests scheduled for this object or
+	 * OST during the current round number.
+	 */
+	__u64				oo_sequence;
+	/**
+	 * The key of the object or OST for which this structure instance is
+	 * scheduling RPCs.
+	 */
+	struct nrs_orr_key		oo_key;
+	atomic_t			oo_ref;
+	/**
+	 * Round Robin quantum; the maximum number of RPCs that are allowed to
+	 * be scheduled for the object or OST in a single batch of each round.
+	 */
+	__u16				oo_quantum;
+	/**
+	 * # of pending requests for this object or OST, on all existing rounds
+	 */
+	__u16				oo_active;
+};
+
+/**
+ * ORR/TRR NRS request definition; embedded in ptlrpc_nrs_request::nr_u
+ */
+struct nrs_orr_req {
+	/**
+	 * The offset range this request covers
+	 */
+	struct nrs_orr_req_range	or_range;
+	/**
+	 * Round number for this request; shared with all other requests in the
+	 * same batch.
+	 */
+	__u64				or_round;
+	/**
+	 * Sequence number for this request; shared with all other requests in
+	 * the same batch.
+	 */
+	__u64				or_sequence;
+	/**
+	 * For debugging purposes.
+	 */
+	struct nrs_orr_key		or_key;
+	/**
+	 * An ORR policy instance has filled in request information while
+	 * enqueueing the request on the service partition's regular NRS head.
+	 */
+	unsigned int			or_orr_set:1;
+	/**
+	 * A TRR policy instance has filled in request information while
+	 * enqueueing the request on the service partition's regular NRS head.
+	 */
+	unsigned int			or_trr_set:1;
+	/**
+	 * Request offset ranges have been filled in with logical offset
+	 * values.
+	 */
+	unsigned int			or_logical_set:1;
+	/**
+	 * Request offset ranges have been filled in with physical offset
+	 * values.
+	 */
+	unsigned int			or_physical_set:1;
+};
+
+/** @} ORR/TRR */
+
+/**
+ * NRS request
+ *
+ * Instances of this object exist embedded within ptlrpc_request; the main
+ * purpose of this object is to hold references to the request's resources
+ * for the lifetime of the request, and to hold properties that policies use
+ * for determining the request's scheduling priority.
+ */
+struct ptlrpc_nrs_request {
+	/**
+	 * The request's resource hierarchy.
+	 */
+	struct ptlrpc_nrs_resource     *nr_res_ptrs[NRS_RES_MAX];
+	/**
+	 * Index into ptlrpc_nrs_request::nr_res_ptrs of the resource of the
+	 * policy that was used to enqueue the request.
+	 *
+	 * \see nrs_request_enqueue()
+	 */
+	unsigned			nr_res_idx;
+	unsigned			nr_initialized:1;
+	unsigned			nr_enqueued:1;
+	unsigned			nr_started:1;
+	unsigned			nr_finalized:1;
+	cfs_binheap_node_t		nr_node;
+
+	/**
+	 * Policy-specific fields, used for determining a request's scheduling
+	 * priority, and other supporting functionality.
+	 */
+	union {
+		/**
+		 * Fields for the FIFO policy
+		 */
+		struct nrs_fifo_req	fifo;
+		/**
+		 * CRR-N request definition
+		 */
+		struct nrs_crrn_req	crr;
+		/** ORR and TRR share the same request definition */
+		struct nrs_orr_req	orr;
+	} nr_u;
+	/**
+	 * Externally-registering policies may want to use this to allocate
+	 * their own request properties.
+	 */
+	void			       *ext;
+};
+
+/** @} nrs */
+
+/**
+ * Basic request prioritization operations structure.
+ * The whole idea is centered around locks and RPCs that might affect locks.
+ * When a lock is contended we try to give priority to RPCs that might lead
+ * to fastest release of that lock.
+ * Currently implemented for OSTs only, in a way that gives all IO and
+ * truncate RPCs that are coming from a locked region where a lock is
+ * contended priority over other requests.
+ */
+struct ptlrpc_hpreq_ops {
+	/**
+	 * Check if the lock handle of the given lock is the same as
+	 * taken from the request.
+	 */
+	int  (*hpreq_lock_match)(struct ptlrpc_request *, struct ldlm_lock *);
+	/**
+	 * Check if the request is a high priority one.
+	 */
+	int  (*hpreq_check)(struct ptlrpc_request *);
+	/**
+	 * Called after the request has been handled.
+	 */
+	void (*hpreq_fini)(struct ptlrpc_request *);
+};
+
+/**
+ * Represents remote procedure call.
+ *
+ * This is a staple structure used by everybody wanting to send a request
+ * in Lustre.
+ */
+struct ptlrpc_request {
+	/* Request type: one of PTL_RPC_MSG_* */
+	int rq_type;
+	/** Result of request processing */
+	int rq_status;
+	/**
+	 * Linkage item through which this request is included into
+	 * sending/delayed lists on client and into rqbd list on server
+	 */
+	struct list_head rq_list;
+	/**
+	 * Server side list of incoming unserved requests sorted by arrival
+	 * time.  Traversed from time to time to notice about to expire
+	 * requests and sent back "early replies" to clients to let them
+	 * know server is alive and well, just very busy to service their
+	 * requests in time
+	 */
+	struct list_head rq_timed_list;
+	/** server-side history, used for debugging purposes. */
+	struct list_head rq_history_list;
+	/** server-side per-export list */
+	struct list_head rq_exp_list;
+	/** server-side hp handlers */
+	struct ptlrpc_hpreq_ops *rq_ops;
+
+	/** initial thread servicing this request */
+	struct ptlrpc_thread *rq_svc_thread;
+
+	/** history sequence # */
+	__u64 rq_history_seq;
+	/** \addtogroup  nrs
+	 * @{
+	 */
+	/** stub for NRS request */
+	struct ptlrpc_nrs_request rq_nrq;
+	/** @} nrs */
+	/** the index of service's srv_at_array into which request is linked */
+	time_t rq_at_index;
+	/** Lock to protect request flags and some other important bits, like
+	 * rq_list
+	 */
+	spinlock_t rq_lock;
+	/** client-side flags are serialized by rq_lock */
+	unsigned int rq_intr:1, rq_replied:1, rq_err:1,
+		rq_timedout:1, rq_resend:1, rq_restart:1,
+		/**
+		 * when ->rq_replay is set, request is kept by the client even
+		 * after server commits corresponding transaction. This is
+		 * used for operations that require sequence of multiple
+		 * requests to be replayed. The only example currently is file
+		 * open/close. When last request in such a sequence is
+		 * committed, ->rq_replay is cleared on all requests in the
+		 * sequence.
+		 */
+		rq_replay:1,
+		rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1,
+		rq_no_delay:1, rq_net_err:1, rq_wait_ctx:1,
+		rq_early:1, rq_must_unlink:1,
+		rq_memalloc:1,      /* req originated from "kswapd" */
+		/* server-side flags */
+		rq_packed_final:1,  /* packed final reply */
+		rq_hp:1,	    /* high priority RPC */
+		rq_at_linked:1,     /* link into service's srv_at_array */
+		rq_reply_truncate:1,
+		rq_committed:1,
+		/* whether the "rq_set" is a valid one */
+		rq_invalid_rqset:1,
+		rq_generation_set:1,
+		/* do not resend request on -EINPROGRESS */
+		rq_no_retry_einprogress:1,
+		/* allow the req to be sent if the import is in recovery
+		 * status */
+		rq_allow_replay:1,
+		/* bulk request, sent to server, but uncommitted */
+		rq_unstable:1;
+
+	unsigned int rq_nr_resend;
+
+	enum rq_phase rq_phase; /* one of RQ_PHASE_* */
+	enum rq_phase rq_next_phase; /* one of RQ_PHASE_* to be used next */
+	atomic_t rq_refcount;/* client-side refcount for SENT race,
+				    server-side refcount for multiple replies */
+
+	/** Portal to which this request would be sent */
+	short rq_request_portal;  /* XXX FIXME bug 249 */
+	/** Portal where to wait for reply and where reply would be sent */
+	short rq_reply_portal;    /* XXX FIXME bug 249 */
+
+	/**
+	 * client-side:
+	 * !rq_truncate : # reply bytes actually received,
+	 *  rq_truncate : required repbuf_len for resend
+	 */
+	int rq_nob_received;
+	/** Request length */
+	int rq_reqlen;
+	/** Reply length */
+	int rq_replen;
+	/** Request message - what client sent */
+	struct lustre_msg *rq_reqmsg;
+	/** Reply message - server response */
+	struct lustre_msg *rq_repmsg;
+	/** Transaction number */
+	__u64 rq_transno;
+	/** xid */
+	__u64 rq_xid;
+	/**
+	 * List item for the replay list. Not yet committed requests get linked
+	 * there.
+	 * Also see \a rq_replay comment above.
+	 */
+	struct list_head rq_replay_list;
+
+	/**
+	 * security and encryption data
+	 * @{ */
+	struct ptlrpc_cli_ctx   *rq_cli_ctx;     /**< client's half ctx */
+	struct ptlrpc_svc_ctx   *rq_svc_ctx;     /**< server's half ctx */
+	struct list_head	       rq_ctx_chain;   /**< link to waited ctx */
+
+	struct sptlrpc_flavor    rq_flvr;	/**< for client & server */
+	enum lustre_sec_part     rq_sp_from;
+
+	/* client/server security flags */
+	unsigned int
+				 rq_ctx_init:1,      /* context initiation */
+				 rq_ctx_fini:1,      /* context destroy */
+				 rq_bulk_read:1,     /* request bulk read */
+				 rq_bulk_write:1,    /* request bulk write */
+				 /* server authentication flags */
+				 rq_auth_gss:1,      /* authenticated by gss */
+				 rq_auth_remote:1,   /* authed as remote user */
+				 rq_auth_usr_root:1, /* authed as root */
+				 rq_auth_usr_mdt:1,  /* authed as mdt */
+				 rq_auth_usr_ost:1,  /* authed as ost */
+				 /* security tfm flags */
+				 rq_pack_udesc:1,
+				 rq_pack_bulk:1,
+				 /* doesn't expect reply FIXME */
+				 rq_no_reply:1,
+				 rq_pill_init:1;     /* pill initialized */
+
+	uid_t		    rq_auth_uid;	/* authed uid */
+	uid_t		    rq_auth_mapped_uid; /* authed uid mapped to */
+
+	/* (server side), pointed directly into req buffer */
+	struct ptlrpc_user_desc *rq_user_desc;
+
+	/* various buffer pointers */
+	struct lustre_msg       *rq_reqbuf;      /* req wrapper */
+	char		    *rq_repbuf;      /* rep buffer */
+	struct lustre_msg       *rq_repdata;     /* rep wrapper msg */
+	struct lustre_msg       *rq_clrbuf;      /* only in priv mode */
+	int		      rq_reqbuf_len;  /* req wrapper buf len */
+	int		      rq_reqdata_len; /* req wrapper msg len */
+	int		      rq_repbuf_len;  /* rep buffer len */
+	int		      rq_repdata_len; /* rep wrapper msg len */
+	int		      rq_clrbuf_len;  /* only in priv mode */
+	int		      rq_clrdata_len; /* only in priv mode */
+
+	/** early replies go to offset 0, regular replies go after that */
+	unsigned int	     rq_reply_off;
+
+	/** @} */
+
+	/** Fields that help to see if request and reply were swabbed or not */
+	__u32 rq_req_swab_mask;
+	__u32 rq_rep_swab_mask;
+
+	/** What was import generation when this request was sent */
+	int rq_import_generation;
+	enum lustre_imp_state rq_send_state;
+
+	/** how many early replies (for stats) */
+	int rq_early_count;
+
+	/** client+server request */
+	lnet_handle_md_t     rq_req_md_h;
+	struct ptlrpc_cb_id  rq_req_cbid;
+	/** optional time limit for send attempts */
+	cfs_duration_t       rq_delay_limit;
+	/** time request was first queued */
+	cfs_time_t	   rq_queued_time;
+
+	/* server-side... */
+	/** request arrival time */
+	struct timeval       rq_arrival_time;
+	/** separated reply state */
+	struct ptlrpc_reply_state *rq_reply_state;
+	/** incoming request buffer */
+	struct ptlrpc_request_buffer_desc *rq_rqbd;
+
+	/** client-only incoming reply */
+	lnet_handle_md_t     rq_reply_md_h;
+	wait_queue_head_t	  rq_reply_waitq;
+	struct ptlrpc_cb_id  rq_reply_cbid;
+
+	/** our LNet NID */
+	lnet_nid_t	   rq_self;
+	/** Peer description (the other side) */
+	lnet_process_id_t    rq_peer;
+	/** Server-side, export on which request was received */
+	struct obd_export   *rq_export;
+	/** Client side, import where request is being sent */
+	struct obd_import   *rq_import;
+
+	/** Replay callback, called after request is replayed at recovery */
+	void (*rq_replay_cb)(struct ptlrpc_request *);
+	/**
+	 * Commit callback, called when request is committed and about to be
+	 * freed.
+	 */
+	void (*rq_commit_cb)(struct ptlrpc_request *);
+	/** Opaque data for replay and commit callbacks. */
+	void  *rq_cb_data;
+
+	/** For bulk requests on client only: bulk descriptor */
+	struct ptlrpc_bulk_desc *rq_bulk;
+
+	/** client outgoing req */
+	/**
+	 * when request/reply sent (secs), or time when request should be sent
+	 */
+	time_t rq_sent;
+	/** time for request really sent out */
+	time_t rq_real_sent;
+
+	/** when request must finish. volatile
+	 * so that servers' early reply updates to the deadline aren't
+	 * kept in per-cpu cache */
+	volatile time_t rq_deadline;
+	/** when req reply unlink must finish. */
+	time_t rq_reply_deadline;
+	/** when req bulk unlink must finish. */
+	time_t rq_bulk_deadline;
+	/**
+	 * service time estimate (secs)
+	 * If the request is not served by this time, it is marked as timed out.
+	 */
+	int    rq_timeout;
+
+	/** Multi-rpc bits */
+	/** Per-request waitq introduced by bug 21938 for recovery waiting */
+	wait_queue_head_t rq_set_waitq;
+	/** Link item for request set lists */
+	struct list_head  rq_set_chain;
+	/** Link back to the request set */
+	struct ptlrpc_request_set *rq_set;
+	/** Async completion handler, called when reply is received */
+	ptlrpc_interpterer_t rq_interpret_reply;
+	/** Async completion context */
+	union ptlrpc_async_args rq_async_args;
+
+	/** Pool if request is from preallocated list */
+	struct ptlrpc_request_pool *rq_pool;
+
+	struct lu_context	   rq_session;
+	struct lu_context	   rq_recov_session;
+
+	/** request format description */
+	struct req_capsule	  rq_pill;
+};
+
+/**
+ * Run the async completion handler of \a req, if one is set, and return its
+ * status (also kept in rq_status); with no handler \a rc is returned as is.
+ */
+static inline int ptlrpc_req_interpret(const struct lu_env *env,
+				       struct ptlrpc_request *req, int rc)
+{
+	/* No handler registered: hand the caller's rc straight back. */
+	if (req->rq_interpret_reply == NULL)
+		return rc;
+
+	req->rq_status = req->rq_interpret_reply(env, req,
+						 &req->rq_async_args, rc);
+	return req->rq_status;
+}
+
+/** \addtogroup  nrs
+ * @{
+ */
+int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_conf *conf);
+int ptlrpc_nrs_policy_unregister(struct ptlrpc_nrs_pol_conf *conf);
+void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req);
+void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy,
+				struct ptlrpc_nrs_pol_info *info);
+
+/*
+ * Can the request be moved from the regular NRS head to the high-priority NRS
+ * head (of the same PTLRPC service partition), if any?
+ *
+ * For a reliable result, check this while holding svcpt->scp_req_lock.
+ */
+static inline bool ptlrpc_nrs_req_can_move(struct ptlrpc_request *req)
+{
+	struct ptlrpc_nrs_request *nrq = &req->rq_nrq;
+
+	/**
+	 * LU-898: must already be enqueued (nr_enqueued) and not yet scheduled
+	 * (nr_started); replaces the old !list_empty(&rq_list) check.
+	 */
+	if (!nrq->nr_enqueued || nrq->nr_started)
+		return false;
+	return !req->rq_hp;
+}
+/** @} nrs */
+
+/** Returns 1 if request buffer at offset \a index was already swabbed,
+ *  0 otherwise. \a index must be a valid bit of rq_req_swab_mask (asserted). */
+static inline int lustre_req_swabbed(struct ptlrpc_request *req, int index)
+{
+	LASSERT(index < sizeof(req->rq_req_swab_mask) * 8);
+	/* 1U, not 1: bit 31 is allowed and a signed shift into it is undefined */
+	return !!(req->rq_req_swab_mask & (1U << index));
+}
+
+/** Returns 1 if request reply buffer at offset \a index was already swabbed,
+ *  0 otherwise. \a index must be a valid bit of rq_rep_swab_mask (asserted). */
+static inline int lustre_rep_swabbed(struct ptlrpc_request *req, int index)
+{
+	LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8);
+	/* 1U, not 1: bit 31 is allowed and a signed shift into it is undefined */
+	return !!(req->rq_rep_swab_mask & (1U << index));
+}
+
+/**
+ * Returns non-zero if request needs to be swabbed into local cpu byteorder
+ */
+static inline int ptlrpc_req_need_swab(struct ptlrpc_request *req)
+{
+	return lustre_req_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+}
+
+/**
+ * Returns non-zero if request reply needs to be swabbed into local cpu byteorder
+ */
+static inline int ptlrpc_rep_need_swab(struct ptlrpc_request *req)
+{
+	return lustre_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+}
+
+/** Mark the request buffer at offset \a index as swabbed; asserts that
+ *  \a index fits rq_req_swab_mask and that the bit was not set before. */
+static inline void lustre_set_req_swabbed(struct ptlrpc_request *req, int index)
+{
+	LASSERT(index < sizeof(req->rq_req_swab_mask) * 8);
+	/* 1U, not 1: bit 31 is allowed and a signed shift into it is undefined */
+	LASSERT((req->rq_req_swab_mask & (1U << index)) == 0);
+	req->rq_req_swab_mask |= 1U << index;
+}
+
+/** Mark the request reply buffer at offset \a index as swabbed; asserts that
+ *  \a index fits rq_rep_swab_mask and that the bit was not set before. */
+static inline void lustre_set_rep_swabbed(struct ptlrpc_request *req, int index)
+{
+	LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8);
+	/* 1U, not 1: bit 31 is allowed and a signed shift into it is undefined */
+	LASSERT((req->rq_rep_swab_mask & (1U << index)) == 0);
+	req->rq_rep_swab_mask |= 1U << index;
+}
+
+/**
+ * Convert numerical request phase value \a phase into text string description;
+ * a value matching none of the known phases yields "?Phase?".
+ */
+static inline const char *
+ptlrpc_phase2str(enum rq_phase phase)
+{
+	if (phase == RQ_PHASE_NEW)
+		return "New";
+	if (phase == RQ_PHASE_RPC)
+		return "Rpc";
+	if (phase == RQ_PHASE_BULK)
+		return "Bulk";
+	if (phase == RQ_PHASE_INTERPRET)
+		return "Interpret";
+	if (phase == RQ_PHASE_COMPLETE)
+		return "Complete";
+	if (phase == RQ_PHASE_UNREGISTERING)
+		return "Unregistering";
+
+	/* Not one of the phases named above. */
+	return "?Phase?";
+}
+
+/**
+ * Convert numerical request phase of the request \a req into text string
+ * description (see ptlrpc_phase2str())
+ */
+static inline const char *
+ptlrpc_rqphase2str(struct ptlrpc_request *req)
+{
+	return ptlrpc_phase2str(req->rq_phase);
+}
+
+/**
+ * Debugging functions and helpers to print request structure into debug log
+ * @{
+ */
+/* Yields \a str when \a field is non-zero, "" otherwise. */
+#define FLAG(field, str) ((field) ? (str) : "")
+
+/** Convert bit flags into strings: the phase name, then twelve flag letters */
+#define DEBUG_REQ_FLAGS(req)						    \
+	ptlrpc_rqphase2str(req),					    \
+	FLAG((req)->rq_intr, "I"), FLAG((req)->rq_replied, "R"),	    \
+	FLAG((req)->rq_err, "E"),					    \
+	FLAG((req)->rq_timedout, "X") /* eXpired */,			    \
+	FLAG((req)->rq_resend, "S"), FLAG((req)->rq_restart, "T"),	    \
+	FLAG((req)->rq_replay, "P"), FLAG((req)->rq_no_resend, "N"),	    \
+	FLAG((req)->rq_waiting, "W"),					    \
+	FLAG((req)->rq_wait_ctx, "C"), FLAG((req)->rq_hp, "H"),	    \
+	FLAG((req)->rq_committed, "M")
+
+#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s%s%s%s"
+
+void _debug_req(struct ptlrpc_request *req,
+		struct libcfs_debug_msg_data *data, const char *fmt, ...)
+	__attribute__ ((format (printf, 3, 4)));
+
+/**
+ * Helper that decides if we need to print request according to current debug
+ * level settings
+ */
+#define debug_req(msgdata, mask, cdls, req, fmt, a...)			\
+do {									  \
+	CFS_CHECK_STACK(msgdata, mask, cdls);				 \
+									      \
+	if (((mask) & D_CANTMASK) != 0 ||				     \
+	    ((libcfs_debug & (mask)) != 0 &&				  \
+	     (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0))		\
+		_debug_req((req), msgdata, fmt, ##a);			 \
+} while(0)
+
+/**
+ * This is the debug print function you need to use to print request structure
+ * content into lustre debug log.
+ * For most callers (level is a constant) this is resolved at compile time. */
+#define DEBUG_REQ(level, req, fmt, args...)				   \
+do {									  \
+	if ((level) & (D_ERROR | D_WARNING)) {				\
+		static cfs_debug_limit_state_t cdls;			  \
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls);	    \
+		debug_req(&msgdata, level, &cdls, req, "@@@ "fmt" ", ## args);\
+	} else {							      \
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL);	     \
+		debug_req(&msgdata, level, NULL, req, "@@@ "fmt" ", ## args); \
+	}								     \
+} while (0)
+/** @} */
+
+/**
+ * Structure that defines a single page of a bulk transfer
+ */
+struct ptlrpc_bulk_page {
+	/** Linkage to list of pages in a bulk */
+	struct list_head       bp_link;
+	/**
+	 * Number of bytes in a page to transfer starting from \a bp_pageoffset
+	 */
+	int	      bp_buflen;
+	/** offset within a page */
+	int	      bp_pageoffset;
+	/** The page itself */
+	struct page     *bp_page;
+};
+
+#define BULK_GET_SOURCE   0
+#define BULK_PUT_SINK     1
+#define BULK_GET_SINK     2
+#define BULK_PUT_SOURCE   3
+
+/**
+ * Definition of bulk descriptor.
+ * Bulks are special "Two phase" RPCs where initial request message
+ * is sent first and it is followed by a transfer (or receiving) of a large
+ * amount of data to be settled into pages referenced from the bulk descriptors.
+ * Bulks transfers (the actual data following the small requests) are done
+ * on separate LNet portals.
+ * In lustre we use bulk transfers for READ and WRITE transfers from/to OSTs.
+ *  Another user is readpage for MDT.
+ */
+struct ptlrpc_bulk_desc {
+	/** completed with failure */
+	unsigned long bd_failure:1;
+	/** {put,get}{source,sink} */
+	unsigned long bd_type:2;
+	/** client side */
+	unsigned long bd_registered:1;
+	/** For serialization with callback */
+	spinlock_t bd_lock;
+	/** Import generation when request for this bulk was sent */
+	int bd_import_generation;
+	/** LNet portal for this bulk */
+	__u32 bd_portal;
+	/** Server side - export this bulk created for */
+	struct obd_export *bd_export;
+	/** Client side - import this bulk was sent on */
+	struct obd_import *bd_import;
+	/** Back pointer to the request */
+	struct ptlrpc_request *bd_req;
+	wait_queue_head_t	    bd_waitq;	/* server side only WQ */
+	int		    bd_iov_count;    /* # entries in bd_iov */
+	int		    bd_max_iov;      /* allocated size of bd_iov */
+	int		    bd_nob;	  /* # bytes covered */
+	int		    bd_nob_transferred; /* # bytes GOT/PUT */
+
+	__u64		  bd_last_xid;
+
+	struct ptlrpc_cb_id    bd_cbid;	 /* network callback info */
+	lnet_nid_t	     bd_sender;       /* stash event::sender */
+	int			bd_md_count;	/* # valid entries in bd_mds */
+	int			bd_md_max_brw;	/* max entries in bd_mds */
+	/** array of associated MDs */
+	lnet_handle_md_t	bd_mds[PTLRPC_BULK_OPS_COUNT];
+
+	/*
+	 * encrypt iov, size is either 0 or bd_iov_count.
+	 */
+	lnet_kiov_t	   *bd_enc_iov;
+
+	lnet_kiov_t	    bd_iov[0];
+};
+
+enum {
+	SVC_STOPPED     = 1 << 0,
+	SVC_STOPPING    = 1 << 1,
+	SVC_STARTING    = 1 << 2,
+	SVC_RUNNING     = 1 << 3,
+	SVC_EVENT       = 1 << 4,
+	SVC_SIGNAL      = 1 << 5,
+};
+
+#define PTLRPC_THR_NAME_LEN		32
+/**
+ * Definition of server service thread structure
+ */
+struct ptlrpc_thread {
+	/**
+	 * List of active threads in svc->srv_threads
+	 */
+	struct list_head t_link;
+	/**
+	 * thread-private data (preallocated memory)
+	 */
+	void *t_data;
+	__u32 t_flags;
+	/**
+	 * service thread index, from ptlrpc_start_threads
+	 */
+	unsigned int t_id;
+	/**
+	 * service thread pid
+	 */
+	pid_t t_pid;
+	/**
+	 * put watchdog in the structure per thread b=14840
+	 */
+	struct lc_watchdog *t_watchdog;
+	/**
+	 * the svc this thread belonged to b=18582
+	 */
+	struct ptlrpc_service_part	*t_svcpt;
+	wait_queue_head_t			t_ctl_waitq;
+	struct lu_env			*t_env;
+	char				t_name[PTLRPC_THR_NAME_LEN];
+};
+
+static inline int thread_is_init(struct ptlrpc_thread *thread)
+{
+	return !thread->t_flags;	/* true only while no flag bit is set */
+}
+
+static inline int thread_is_stopped(struct ptlrpc_thread *thread)
+{
+	return (thread->t_flags & SVC_STOPPED) != 0;	/* 1 or 0 */
+}
+
+static inline int thread_is_stopping(struct ptlrpc_thread *thread)
+{
+	return (thread->t_flags & SVC_STOPPING) != 0;	/* 1 or 0 */
+}
+
+static inline int thread_is_starting(struct ptlrpc_thread *thread)
+{
+	return (thread->t_flags & SVC_STARTING) != 0;	/* 1 or 0 */
+}
+
+static inline int thread_is_running(struct ptlrpc_thread *thread)
+{
+	return (thread->t_flags & SVC_RUNNING) != 0;	/* 1 or 0 */
+}
+
+static inline int thread_is_event(struct ptlrpc_thread *thread)
+{
+	return (thread->t_flags & SVC_EVENT) != 0;	/* 1 or 0 */
+}
+
+static inline int thread_is_signal(struct ptlrpc_thread *thread)
+{
+	return (thread->t_flags & SVC_SIGNAL) != 0;	/* 1 or 0 */
+}
+
+static inline void thread_clear_flags(struct ptlrpc_thread *thread, __u32 flags)
+{
+	thread->t_flags = thread->t_flags & ~flags;	/* drop only these bits */
+}
+
+static inline void thread_set_flags(struct ptlrpc_thread *thread, __u32 flags)
+{
+	thread->t_flags = flags;	/* overwrites all bits, does not OR them in */
+}
+
+static inline void thread_add_flags(struct ptlrpc_thread *thread, __u32 flags)
+{
+	thread->t_flags = thread->t_flags | flags;	/* keep bits already set */
+}
+
+static inline int thread_test_and_clear_flags(struct ptlrpc_thread *thread,
+					      __u32 flags)
+{
+	/* None of the requested bits is set: nothing to clear. */
+	if (!(thread->t_flags & flags))
+		return 0;
+	thread->t_flags &= ~flags;
+	return 1;
+}
+
+/**
+ * Request buffer descriptor structure.
+ * This is a structure that contains one posted request buffer for service.
+ * Once data land into a buffer, event callback creates actual request and
+ * wakes one of the service threads to process the new incoming request.
+ * More than one request can fit into the buffer.
+ */
+struct ptlrpc_request_buffer_desc {
+	/** Link item for rqbds on a service */
+	struct list_head	     rqbd_list;
+	/** History of requests for this buffer */
+	struct list_head	     rqbd_reqs;
+	/** Back pointer to service for which this buffer is registered */
+	struct ptlrpc_service_part *rqbd_svcpt;
+	/** LNet descriptor */
+	lnet_handle_md_t       rqbd_md_h;
+	int		    rqbd_refcount;
+	/** The buffer itself */
+	char		  *rqbd_buffer;
+	struct ptlrpc_cb_id    rqbd_cbid;
+	/**
+	 * This "embedded" request structure is only used for the
+	 * last request to fit into the buffer
+	 */
+	struct ptlrpc_request  rqbd_req;
+};
+
+typedef int  (*svc_handler_t)(struct ptlrpc_request *req);
+
+struct ptlrpc_service_ops {
+	/**
+	 * if non-NULL called during thread creation (ptlrpc_start_thread())
+	 * to initialize service specific per-thread state.
+	 */
+	int		(*so_thr_init)(struct ptlrpc_thread *thr);
+	/**
+	 * if non-NULL called during thread shutdown (ptlrpc_main()) to
+	 * destruct state created by ->so_thr_init().
+	 */
+	void		(*so_thr_done)(struct ptlrpc_thread *thr);
+	/**
+	 * Handler function for incoming requests for this service
+	 */
+	int		(*so_req_handler)(struct ptlrpc_request *req);
+	/**
+	 * function to determine priority of the request; it is called
+	 * on every new request
+	 */
+	int		(*so_hpreq_handler)(struct ptlrpc_request *);
+	/**
+	 * service-specific print function
+	 */
+	void		(*so_req_printer)(void *, struct ptlrpc_request *);
+};
+
+#ifndef __cfs_cacheline_aligned
+/* NB: defined here to reduce patch dependencies */
+# define __cfs_cacheline_aligned
+#endif
+
+/**
+ * How many high priority requests to serve before serving one normal
+ * priority request
+ */
+#define PTLRPC_SVC_HP_RATIO 10
+
+/**
+ * Definition of PortalRPC service.
+ * The service is listening on a particular portal (like tcp port)
+ * and perform actions for a specific server like IO service for OST
+ * or general metadata service for MDS.
+ */
+struct ptlrpc_service {
+	/** serialize /proc operations */
+	spinlock_t			srv_lock;
+	/** most often accessed fields */
+	/** chain thru all services */
+	struct list_head		      srv_list;
+	/** service operations table */
+	struct ptlrpc_service_ops	srv_ops;
+	/** only statically allocated strings here; we don't clean them */
+	char			   *srv_name;
+	/** only statically allocated strings here; we don't clean them */
+	char			   *srv_thread_name;
+	/** service thread list */
+	struct list_head		      srv_threads;
+	/** threads # should be created for each partition on initializing */
+	int				srv_nthrs_cpt_init;
+	/** limit of threads number for each partition */
+	int				srv_nthrs_cpt_limit;
+	/** Root of /proc dir tree for this service */
+	proc_dir_entry_t	   *srv_procroot;
+	/** Pointer to statistic data for this service */
+	struct lprocfs_stats	   *srv_stats;
+	/** # hp per lp reqs to handle */
+	int			     srv_hpreq_ratio;
+	/** biggest request to receive */
+	int			     srv_max_req_size;
+	/** biggest reply to send */
+	int			     srv_max_reply_size;
+	/** size of individual buffers */
+	int			     srv_buf_size;
+	/** # buffers to allocate in 1 group */
+	int			     srv_nbuf_per_group;
+	/** Local portal on which to receive requests */
+	__u32			   srv_req_portal;
+	/** Portal on the client to send replies to */
+	__u32			   srv_rep_portal;
+	/**
+	 * Tags for lu_context associated with this thread, see struct
+	 * lu_context.
+	 */
+	__u32			   srv_ctx_tags;
+	/** soft watchdog timeout multiplier */
+	int			     srv_watchdog_factor;
+	/** under unregister_service */
+	unsigned			srv_is_stopping:1;
+
+	/** max # request buffers in history per partition */
+	int				srv_hist_nrqbds_cpt_max;
+	/** number of CPTs this service bound on */
+	int				srv_ncpts;
+	/** CPTs array this service bound on */
+	__u32				*srv_cpts;
+	/** 2^srv_cpt_bits >= cfs_cpt_number(srv_cptable) */
+	int				srv_cpt_bits;
+	/** CPT table this service is running over */
+	struct cfs_cpt_table		*srv_cptable;
+	/**
+	 * partition data for ptlrpc service
+	 */
+	struct ptlrpc_service_part	*srv_parts[0];
+};
+
+/**
+ * Definition of PortalRPC service partition data.
+ * Although a service only has one instance of it right now, we
+ * will have multiple instances very soon (instance per CPT).
+ *
+ * it has four locks:
+ * \a scp_lock
+ *    serialize operations on rqbd and requests waiting for preprocess
+ * \a scp_req_lock
+ *    serialize operations on active requests sent to this portal
+ * \a scp_at_lock
+ *    serialize adaptive timeout stuff
+ * \a scp_rep_lock
+ *    serialize operations on RS list (reply states)
+ *
+ * We don't have any use-case to take two or more locks at the same time
+ * for now, so there is no lock order issue.
+ */
+struct ptlrpc_service_part {
+	/** back reference to owner */
+	struct ptlrpc_service		*scp_service __cfs_cacheline_aligned;
+	/* CPT id, reserved */
+	int				scp_cpt;
+	/** always increasing number */
+	int				scp_thr_nextid;
+	/** # of starting threads */
+	int				scp_nthrs_starting;
+	/** # of stopping threads, reserved for shrinking threads */
+	int				scp_nthrs_stopping;
+	/** # running threads */
+	int				scp_nthrs_running;
+	/** service threads list */
+	struct list_head			scp_threads;
+
+	/**
+	 * serialize the following fields, used for protecting
+	 * rqbd list and incoming requests waiting for preprocess,
+	 * threads starting & stopping are also protected by this lock.
+	 */
+	spinlock_t			scp_lock  __cfs_cacheline_aligned;
+	/** total # req buffer descs allocated */
+	int				scp_nrqbds_total;
+	/** # posted request buffers for receiving */
+	int				scp_nrqbds_posted;
+	/** in progress of allocating rqbd */
+	int				scp_rqbd_allocating;
+	/** # incoming reqs */
+	int				scp_nreqs_incoming;
+	/** request buffers to be reposted */
+	struct list_head			scp_rqbd_idle;
+	/** req buffers receiving */
+	struct list_head			scp_rqbd_posted;
+	/** incoming reqs */
+	struct list_head			scp_req_incoming;
+	/** timeout before re-posting reqs, in tick */
+	cfs_duration_t			scp_rqbd_timeout;
+	/**
+	 * all threads sleep on this. This wait-queue is signalled when new
+	 * incoming request arrives and when difficult reply has to be handled.
+	 */
+	wait_queue_head_t			scp_waitq;
+
+	/** request history */
+	struct list_head			scp_hist_reqs;
+	/** request buffer history */
+	struct list_head			scp_hist_rqbds;
+	/** # request buffers in history */
+	int				scp_hist_nrqbds;
+	/** sequence number for request */
+	__u64				scp_hist_seq;
+	/** highest seq culled from history */
+	__u64				scp_hist_seq_culled;
+
+	/**
+	 * serialize the following fields, used for processing requests
+	 * sent to this portal
+	 */
+	spinlock_t			scp_req_lock __cfs_cacheline_aligned;
+	/** # reqs in either of the NRS heads below */
+	/** # reqs being served */
+	int				scp_nreqs_active;
+	/** # HPreqs being served */
+	int				scp_nhreqs_active;
+	/** # hp requests handled */
+	int				scp_hreq_count;
+
+	/** NRS head for regular requests */
+	struct ptlrpc_nrs		scp_nrs_reg;
+	/** NRS head for HP requests; this is only valid for services that can
+	 *  handle HP requests */
+	struct ptlrpc_nrs	       *scp_nrs_hp;
+
+	/** AT stuff */
+	/** @{ */
+	/**
+	 * serialize the following fields, used for changes on
+	 * adaptive timeout
+	 */
+	spinlock_t			scp_at_lock __cfs_cacheline_aligned;
+	/** estimated rpc service time */
+	struct adaptive_timeout		scp_at_estimate;
+	/** reqs waiting for replies */
+	struct ptlrpc_at_array		scp_at_array;
+	/** early reply timer */
+	timer_list_t			scp_at_timer;
+	/** debug */
+	cfs_time_t			scp_at_checktime;
+	/** check early replies */
+	unsigned			scp_at_check;
+	/** @} */
+
+	/**
+	 * serialize the following fields, used for processing
+	 * replies for this portal
+	 */
+	spinlock_t			scp_rep_lock __cfs_cacheline_aligned;
+	/** all the active replies */
+	struct list_head			scp_rep_active;
+	/** List of free reply_states */
+	struct list_head			scp_rep_idle;
+	/** waitq to run, when adding stuff to srv_free_rs_list */
+	wait_queue_head_t			scp_rep_waitq;
+	/** # 'difficult' replies */
+	atomic_t			scp_nreps_difficult;
+};
+
+/* Visit parts 0..srv_ncpts-1 of \a svc in order, stopping at a NULL slot. */
+#define ptlrpc_service_for_each_part(part, i, svc)			\
+	for (i = 0; i < (svc)->srv_ncpts &&				\
+	     (svc)->srv_parts != NULL &&				\
+	     ((part) = (svc)->srv_parts[i]) != NULL; i++)
+
+/**
+ * Declaration of ptlrpcd control structure
+ */
+struct ptlrpcd_ctl {
+	/**
+	 * Ptlrpc thread control flags (LIOD_START, LIOD_STOP, LIOD_FORCE)
+	 */
+	unsigned long			pc_flags;
+	/**
+	 * Thread lock protecting structure fields.
+	 */
+	spinlock_t			pc_lock;
+	/**
+	 * Start completion.
+	 */
+	struct completion		pc_starting;
+	/**
+	 * Stop completion.
+	 */
+	struct completion		pc_finishing;
+	/**
+	 * Thread requests set.
+	 */
+	struct ptlrpc_request_set  *pc_set;
+	/**
+	 * Thread name used in cfs_daemonize()
+	 */
+	char			pc_name[16];
+	/**
+	 * Environment for request interpreters to run in.
+	 */
+	struct lu_env	       pc_env;
+	/**
+	 * Index of ptlrpcd thread in the array.
+	 */
+	int			 pc_index;
+	/**
+	 * Number of the ptlrpcd's partners.
+	 */
+	int			 pc_npartners;
+	/**
+	 * Pointer to the array of partners' ptlrpcd_ctl structure.
+	 */
+	struct ptlrpcd_ctl	**pc_partners;
+	/**
+	 * Record the partner index to be processed next.
+	 */
+	int			 pc_cursor;
+};
+
+/* Bits for pc_flags */
+enum ptlrpcd_ctl_flags {
+	/**
+	 * Ptlrpc thread start flag.
+	 */
+	LIOD_START       = 1 << 0,
+	/**
+	 * Ptlrpc thread stop flag.
+	 */
+	LIOD_STOP	= 1 << 1,
+	/**
+	 * Ptlrpc thread force flag (only stop force so far).
+	 * This will cause aborting any inflight rpcs handled
+	 * by thread if LIOD_STOP is specified.
+	 */
+	LIOD_FORCE       = 1 << 2,
+	/**
+	 * This is a recovery ptlrpc thread.
+	 */
+	LIOD_RECOVERY    = 1 << 3,
+	/**
+	 * The ptlrpcd is bound to some CPU core.
+	 */
+	LIOD_BIND	= 1 << 4,
+};
+
+/**
+ * \addtogroup nrs
+ * @{
+ *
+ * Service compatibility function; the policy is compatible with all services.
+ *
+ * \param[in] svc  The service the policy is attempting to register with.
+ * \param[in] desc The policy descriptor
+ *
+ * \retval true The policy is compatible with the service
+ *
+ * \see ptlrpc_nrs_pol_desc::pd_compat()
+ */
+static inline bool nrs_policy_compat_all(const struct ptlrpc_service *svc,
+					 const struct ptlrpc_nrs_pol_desc *desc)
+{
+	return true;	/* \a svc and \a desc are not consulted */
+}
+
+/**
+ * Service compatibility function for a policy tied to one service: the match
+ * is an exact string comparison of ptlrpc_service::srv_name against
+ * ptlrpc_nrs_pol_desc::pd_compat_svc_name, which must not be NULL (asserted).
+ *
+ * \param[in] svc  The service the policy is attempting to register with.
+ * \param[in] desc The policy descriptor
+ *
+ * \retval false The names differ; the policy is not compatible
+ * \retval true	 The names are equal; the policy is compatible
+ *
+ * \see ptlrpc_nrs_pol_desc::pd_compat()
+ */
+static inline bool nrs_policy_compat_one(const struct ptlrpc_service *svc,
+					 const struct ptlrpc_nrs_pol_desc *desc)
+{
+	LASSERT(desc->pd_compat_svc_name != NULL);
+	return !strcmp(svc->srv_name, desc->pd_compat_svc_name);
+}
+
+/** @} nrs */
+
+/* ptlrpc/events.c */
+extern lnet_handle_eq_t ptlrpc_eq_h;
+extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid,
+			       lnet_process_id_t *peer, lnet_nid_t *self);
+/**
+ * These callbacks are invoked by LNet when something happened to
+ * underlying buffer
+ * @{
+ */
+extern void request_out_callback(lnet_event_t *ev);
+extern void reply_in_callback(lnet_event_t *ev);
+extern void client_bulk_callback(lnet_event_t *ev);
+extern void request_in_callback(lnet_event_t *ev);
+extern void reply_out_callback(lnet_event_t *ev);
+/** @} */
+
+/* ptlrpc/connection.c */
+struct ptlrpc_connection *ptlrpc_connection_get(lnet_process_id_t peer,
+						lnet_nid_t self,
+						struct obd_uuid *uuid);
+int ptlrpc_connection_put(struct ptlrpc_connection *c);
+struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *);
+int ptlrpc_connection_init(void);
+void ptlrpc_connection_fini(void);
+extern lnet_pid_t ptl_get_pid(void);
+
+/* ptlrpc/niobuf.c */
+/**
+ * Actual interfacing with LNet to put/get/register/unregister stuff
+ * @{
+ */
+
+int ptlrpc_register_bulk(struct ptlrpc_request *req);
+int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async);
+
+static inline int ptlrpc_client_bulk_active(struct ptlrpc_request *req)
+{
+	struct ptlrpc_bulk_desc *bulk;
+	int			 nr_mds;
+
+	LASSERT(req != NULL);
+	bulk = req->rq_bulk;
+	/* LONG_BULK_UNLINK fail check: report active until the deadline. */
+	if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) &&
+	    req->rq_bulk_deadline > cfs_time_current_sec())
+		return 1;
+
+	if (bulk == NULL)
+		return 0;
+	/* Otherwise the result is bd_md_count, sampled under bd_lock. */
+	spin_lock(&bulk->bd_lock);
+	nr_mds = bulk->bd_md_count;
+	spin_unlock(&bulk->bd_lock);
+	return nr_mds;
+}
+
+#define PTLRPC_REPLY_MAYBE_DIFFICULT 0x01
+#define PTLRPC_REPLY_EARLY	   0x02
+int ptlrpc_send_reply(struct ptlrpc_request *req, int flags);
+int ptlrpc_reply(struct ptlrpc_request *req);
+int ptlrpc_send_error(struct ptlrpc_request *req, int difficult);
+int ptlrpc_error(struct ptlrpc_request *req);
+void ptlrpc_resend_req(struct ptlrpc_request *request);
+int ptlrpc_at_get_net_latency(struct ptlrpc_request *req);
+int ptl_send_rpc(struct ptlrpc_request *request, int noreply);
+int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd);
+/** @} */
+
+/* ptlrpc/client.c */
+/**
+ * Client-side portals API. Everything to send requests, receive replies,
+ * request queues, request management, etc.
+ * @{
+ */
+void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
+			struct ptlrpc_client *);
+void ptlrpc_cleanup_client(struct obd_import *imp);
+struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid);
+
+int ptlrpc_queue_wait(struct ptlrpc_request *req);
+int ptlrpc_replay_req(struct ptlrpc_request *req);
+int ptlrpc_unregister_reply(struct ptlrpc_request *req, int async);
+void ptlrpc_restart_req(struct ptlrpc_request *req);
+void ptlrpc_abort_inflight(struct obd_import *imp);
+void ptlrpc_cleanup_imp(struct obd_import *imp);
+void ptlrpc_abort_set(struct ptlrpc_request_set *set);
+
+struct ptlrpc_request_set *ptlrpc_prep_set(void);
+struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func,
+					     void *arg);
+int ptlrpc_set_add_cb(struct ptlrpc_request_set *set,
+		      set_interpreter_func fn, void *data);
+int ptlrpc_set_next_timeout(struct ptlrpc_request_set *);
+int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set);
+int ptlrpc_set_wait(struct ptlrpc_request_set *);
+int ptlrpc_expired_set(void *data);
+void ptlrpc_interrupted_set(void *data);
+void ptlrpc_mark_interrupted(struct ptlrpc_request *req);
+void ptlrpc_set_destroy(struct ptlrpc_request_set *);
+void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *);
+void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc,
+			    struct ptlrpc_request *req);
+
+void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool);
+void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq);
+
+struct ptlrpc_request_pool *
+ptlrpc_init_rq_pool(int, int,
+		    void (*populate_pool)(struct ptlrpc_request_pool *, int));
+
+void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req);
+struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp,
+					    const struct req_format *format);
+struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp,
+					    struct ptlrpc_request_pool *,
+					    const struct req_format *format);
+void ptlrpc_request_free(struct ptlrpc_request *request);
+int ptlrpc_request_pack(struct ptlrpc_request *request,
+			__u32 version, int opcode);
+struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp,
+						const struct req_format *format,
+						__u32 version, int opcode);
+int ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
+			     __u32 version, int opcode, char **bufs,
+			     struct ptlrpc_cli_ctx *ctx);
+struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version,
+				       int opcode, int count, __u32 *lengths,
+				       char **bufs);
+struct ptlrpc_request *ptlrpc_prep_req_pool(struct obd_import *imp,
+					     __u32 version, int opcode,
+					    int count, __u32 *lengths, char **bufs,
+					    struct ptlrpc_request_pool *pool);
+void ptlrpc_req_finished(struct ptlrpc_request *request);
+void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request);
+struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req);
+struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req,
+					      unsigned npages, unsigned max_brw,
+					      unsigned type, unsigned portal);
+void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk, int pin);
+static inline void ptlrpc_free_bulk_pin(struct ptlrpc_bulk_desc *bulk)
+{
+	__ptlrpc_free_bulk(bulk, 1);	/* pin = 1 */
+}
+static inline void ptlrpc_free_bulk_nopin(struct ptlrpc_bulk_desc *bulk)
+{
+	__ptlrpc_free_bulk(bulk, 0);	/* pin = 0 */
+}
+void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
+			     struct page *page, int pageoffset, int len, int);
+static inline void ptlrpc_prep_bulk_page_pin(struct ptlrpc_bulk_desc *desc,
+					     struct page *page, int pageoffset,
+					     int len)
+{
+	__ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 1); /* pin? */
+}
+
+static inline void ptlrpc_prep_bulk_page_nopin(struct ptlrpc_bulk_desc *desc,
+					       struct page *page, int pageoffset,
+					       int len)
+{
+	__ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 0); /* no pin? */
+}
+
+void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
+				      struct obd_import *imp);
+__u64 ptlrpc_next_xid(void);
+__u64 ptlrpc_sample_next_xid(void);
+__u64 ptlrpc_req_xid(struct ptlrpc_request *request);
+
+/* Set of routines to run a function in ptlrpcd context */
+void *ptlrpcd_alloc_work(struct obd_import *imp,
+			 int (*cb)(const struct lu_env *, void *), void *data);
+void ptlrpcd_destroy_work(void *handler);
+int ptlrpcd_queue_work(void *handler);
+
+/** @} */
+struct ptlrpc_service_buf_conf {
+	/* number of buffers to allocate when growing the pool */
+	unsigned int			bc_nbufs;
+	/* buffer size to post */
+	unsigned int			bc_buf_size;
+	/* portal to listen for requests on */
+	unsigned int			bc_req_portal;
+	/* portal to send replies to */
+	unsigned int			bc_rep_portal;
+	/* maximum request size to be accepted for this service */
+	unsigned int			bc_req_max_size;
+	/* maximum reply size this service can ever send */
+	unsigned int			bc_rep_max_size;
+};
+
+struct ptlrpc_service_thr_conf {
+	/* thread name should be 8 characters or less - 6 will be added on */
+	char				*tc_thr_name;
+	/* thread-count increasing factor for each CPU */
+	unsigned int			tc_thr_factor;
+	/* number of service threads to start on each partition at init */
+	unsigned int			tc_nthrs_init;
+	/*
+	 * low-water mark of the per-partition thread upper limit while
+	 * running; service availability may be impacted if the thread count
+	 * is lower than this value. It can be ZERO if the service doesn't
+	 * require CPU affinity or there is only one partition.
+	 */
+	unsigned int			tc_nthrs_base;
+	/* "soft" limit for the total number of threads */
+	unsigned int			tc_nthrs_max;
+	/* user-specified number of threads; it will be validated against
+	 * the other members of this structure. */
+	unsigned int			tc_nthrs_user;
+	/* set NUMA node affinity for service threads */
+	unsigned int			tc_cpu_affinity;
+	/* Tags for lu_context associated with service thread */
+	__u32				tc_ctx_tags;
+};
+
+struct ptlrpc_service_cpt_conf {
+	struct cfs_cpt_table		*cc_cptable;
+	/* string pattern to describe CPTs for a service */
+	char				*cc_pattern;
+};
+
+struct ptlrpc_service_conf {
+	/* service name */
+	char				*psc_name;
+	/* soft watchdog timeout multiplier to print stuck service traces */
+	unsigned int			psc_watchdog_factor;
+	/* buffer information */
+	struct ptlrpc_service_buf_conf	psc_buf;
+	/* thread information */
+	struct ptlrpc_service_thr_conf	psc_thr;
+	/* CPU partition information */
+	struct ptlrpc_service_cpt_conf	psc_cpt;
+	/* function table */
+	struct ptlrpc_service_ops	psc_ops;
+};
+
+/* ptlrpc/service.c */
+/**
+ * Server-side services API. Register/unregister service, request state
+ * management, service thread management
+ *
+ * @{
+ */
+void ptlrpc_save_lock(struct ptlrpc_request *req,
+		      struct lustre_handle *lock, int mode, int no_ack);
+void ptlrpc_commit_replies(struct obd_export *exp);
+void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs);
+void ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs);
+int ptlrpc_hpreq_handler(struct ptlrpc_request *req);
+struct ptlrpc_service *ptlrpc_register_service(
+				struct ptlrpc_service_conf *conf,
+				struct proc_dir_entry *proc_entry);
+void ptlrpc_stop_all_threads(struct ptlrpc_service *svc);
+
+int ptlrpc_start_threads(struct ptlrpc_service *svc);
+int ptlrpc_unregister_service(struct ptlrpc_service *service);
+int liblustre_check_services(void *arg);
+void ptlrpc_daemonize(char *name);
+int ptlrpc_service_health_check(struct ptlrpc_service *);
+void ptlrpc_server_drop_request(struct ptlrpc_request *req);
+void ptlrpc_request_change_export(struct ptlrpc_request *req,
+				  struct obd_export *export);
+
+int ptlrpc_hr_init(void);
+void ptlrpc_hr_fini(void);
+
+/** @} */
+
+/* ptlrpc/import.c */
+/**
+ * Import API
+ * @{
+ */
+int ptlrpc_connect_import(struct obd_import *imp);
+int ptlrpc_init_import(struct obd_import *imp);
+int ptlrpc_disconnect_import(struct obd_import *imp, int noclose);
+int ptlrpc_import_recovery_state_machine(struct obd_import *imp);
+void deuuidify(char *uuid, const char *prefix, char **uuid_start,
+	       int *uuid_len);
+
+/* ptlrpc/pack_generic.c */
+int ptlrpc_reconnect_import(struct obd_import *imp);
+/** @} */
+
+/**
+ * ptlrpc msg buffer and swab interface
+ *
+ * @{
+ */
+int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout,
+			 int index);
+void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout,
+				int index);
+int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len);
+int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len);
+
+int lustre_msg_check_version(struct lustre_msg *msg, __u32 version);
+void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens,
+			char **bufs);
+int lustre_pack_request(struct ptlrpc_request *, __u32 magic, int count,
+			__u32 *lens, char **bufs);
+int lustre_pack_reply(struct ptlrpc_request *, int count, __u32 *lens,
+		      char **bufs);
+int lustre_pack_reply_v2(struct ptlrpc_request *req, int count,
+			 __u32 *lens, char **bufs, int flags);
+#define LPRFL_EARLY_REPLY 1
+int lustre_pack_reply_flags(struct ptlrpc_request *, int count, __u32 *lens,
+			    char **bufs, int flags);
+int lustre_shrink_msg(struct lustre_msg *msg, int segment,
+		      unsigned int newlen, int move_data);
+void lustre_free_reply_state(struct ptlrpc_reply_state *rs);
+int __lustre_unpack_msg(struct lustre_msg *m, int len);
+int lustre_msg_hdr_size(__u32 magic, int count);
+int lustre_msg_size(__u32 magic, int count, __u32 *lengths);
+int lustre_msg_size_v2(int count, __u32 *lengths);
+int lustre_packed_msg_size(struct lustre_msg *msg);
+int lustre_msg_early_size(void);
+void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size);
+void *lustre_msg_buf(struct lustre_msg *m, int n, int minlen);
+int lustre_msg_buflen(struct lustre_msg *m, int n);
+void lustre_msg_set_buflen(struct lustre_msg *m, int n, int len);
+int lustre_msg_bufcount(struct lustre_msg *m);
+char *lustre_msg_string(struct lustre_msg *m, int n, int max_len);
+__u32 lustre_msghdr_get_flags(struct lustre_msg *msg);
+void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags);
+__u32 lustre_msg_get_flags(struct lustre_msg *msg);
+void lustre_msg_add_flags(struct lustre_msg *msg, int flags);
+void lustre_msg_set_flags(struct lustre_msg *msg, int flags);
+void lustre_msg_clear_flags(struct lustre_msg *msg, int flags);
+__u32 lustre_msg_get_op_flags(struct lustre_msg *msg);
+void lustre_msg_add_op_flags(struct lustre_msg *msg, int flags);
+void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags);
+struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg);
+__u32 lustre_msg_get_type(struct lustre_msg *msg);
+__u32 lustre_msg_get_version(struct lustre_msg *msg);
+void lustre_msg_add_version(struct lustre_msg *msg, int version);
+__u32 lustre_msg_get_opc(struct lustre_msg *msg);
+__u64 lustre_msg_get_last_xid(struct lustre_msg *msg);
+__u64 lustre_msg_get_last_committed(struct lustre_msg *msg);
+__u64 *lustre_msg_get_versions(struct lustre_msg *msg);
+__u64 lustre_msg_get_transno(struct lustre_msg *msg);
+__u64 lustre_msg_get_slv(struct lustre_msg *msg);
+__u32 lustre_msg_get_limit(struct lustre_msg *msg);
+void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv);
+void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit);
+int lustre_msg_get_status(struct lustre_msg *msg);
+__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg);
+int lustre_msg_is_v1(struct lustre_msg *msg);
+__u32 lustre_msg_get_magic(struct lustre_msg *msg);
+__u32 lustre_msg_get_timeout(struct lustre_msg *msg);
+__u32 lustre_msg_get_service_time(struct lustre_msg *msg);
+char *lustre_msg_get_jobid(struct lustre_msg *msg);
+__u32 lustre_msg_get_cksum(struct lustre_msg *msg);
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg, int compat18);
+#else
+# warning "remove checksum compatibility support for b1_8"
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg);
+#endif
+void lustre_msg_set_handle(struct lustre_msg *msg,struct lustre_handle *handle);
+void lustre_msg_set_type(struct lustre_msg *msg, __u32 type);
+void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc);
+void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid);
+void lustre_msg_set_last_committed(struct lustre_msg *msg,__u64 last_committed);
+void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions);
+void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno);
+void lustre_msg_set_status(struct lustre_msg *msg, __u32 status);
+void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt);
+void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *sizes);
+void ptlrpc_request_set_replen(struct ptlrpc_request *req);
+void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout);
+void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time);
+void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid);
+void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum);
+
+/* Shrink a segment of req's reply message; the result becomes rq_replen. */
+static inline void lustre_shrink_reply(struct ptlrpc_request *req, int segment,
+				       unsigned int newlen, int move_data)
+{
+	LASSERT(req->rq_reply_state);
+	LASSERT(req->rq_repmsg);
+	req->rq_replen =
+		lustre_shrink_msg(req->rq_repmsg, segment, newlen, move_data);
+}
+/** @} */
+
+/** Change request phase of \a req to \a new_phase; no-op if already there. */
+static inline void
+ptlrpc_rqphase_move(struct ptlrpc_request *req, enum rq_phase new_phase)
+{
+	if (req->rq_phase == new_phase)
+		return;
+
+	if (new_phase == RQ_PHASE_UNREGISTERING) {
+		/* the current phase is saved in rq_next_phase */
+		req->rq_next_phase = req->rq_phase;
+		if (req->rq_import)
+			atomic_inc(&req->rq_import->imp_unregistering);
+	} else if (req->rq_phase == RQ_PHASE_UNREGISTERING &&
+		   req->rq_import) {
+		/* leaving RQ_PHASE_UNREGISTERING: undo the import's count */
+		atomic_dec(&req->rq_import->imp_unregistering);
+	}
+
+	DEBUG_REQ(D_INFO, req, "move req \"%s\" -> \"%s\"",
+		  ptlrpc_rqphase2str(req), ptlrpc_phase2str(new_phase));
+
+	req->rq_phase = new_phase;
+}
+
+/**
+ * Returns true if request \a req got early reply and hard deadline is not met
+ */
+static inline int ptlrpc_client_early(struct ptlrpc_request *req)
+{
+	/* while the OBD_FAIL check fires before rq_reply_deadline, say "no" */
+	int masked = OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+		     req->rq_reply_deadline > cfs_time_current_sec();
+
+	return masked ? 0 : req->rq_early;
+}
+
+/**
+ * Returns true if we got real reply from server for this request
+ */
+static inline int ptlrpc_client_replied(struct ptlrpc_request *req)
+{
+	/* while the OBD_FAIL check fires before rq_reply_deadline, say "no" */
+	int masked = OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+		     req->rq_reply_deadline > cfs_time_current_sec();
+
+	return masked ? 0 : req->rq_replied;
+}
+
+/** Returns true if request \a req is in process of receiving server reply */
+static inline int ptlrpc_client_recv(struct ptlrpc_request *req)
+{
+	/* while the OBD_FAIL check fires before rq_reply_deadline, say "yes" */
+	int forced = OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+		     req->rq_reply_deadline > cfs_time_current_sec();
+
+	return forced ? 1 : req->rq_receiving_reply;
+}
+
+/* Nonzero if \a req is receiving a reply or must unlink; takes rq_lock. */
+static inline int ptlrpc_client_recv_or_unlink(struct ptlrpc_request *req)
+{
+	int busy;
+
+	spin_lock(&req->rq_lock);
+	if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+	    req->rq_reply_deadline > cfs_time_current_sec())
+		busy = 1;
+	else
+		busy = req->rq_receiving_reply || req->rq_must_unlink;
+	spin_unlock(&req->rq_lock);
+
+	return busy;
+}
+
+/* Wake waiters for \a req: on its set's waitq if in a set, else its own. */
+static inline void ptlrpc_client_wake_req(struct ptlrpc_request *req)
+{
+	if (req->rq_set != NULL)
+		wake_up(&req->rq_set->set_waitq);
+	else
+		wake_up(&req->rq_reply_waitq);
+}
+
+/* Take a reference on \a rs; its refcount must already be positive. */
+static inline void ptlrpc_rs_addref(struct ptlrpc_reply_state *rs)
+{
+	LASSERT(atomic_read(&rs->rs_refcount) > 0);
+	atomic_inc(&rs->rs_refcount);
+}
+
+/* Drop a reference on \a rs; free the reply state when it reaches zero. */
+static inline void ptlrpc_rs_decref(struct ptlrpc_reply_state *rs)
+{
+	LASSERT(atomic_read(&rs->rs_refcount) > 0);
+	if (!atomic_dec_and_test(&rs->rs_refcount))
+		return;
+	lustre_free_reply_state(rs);
+}
+
+/* Should only be called once per req */
+static inline void ptlrpc_req_drop_rs(struct ptlrpc_request *req)
+{
+	if (req->rq_reply_state != NULL) {	/* NULL shouldn't occur */
+		ptlrpc_rs_decref(req->rq_reply_state);
+		req->rq_reply_state = NULL;
+		req->rq_repmsg = NULL;
+	}
+}
+
+static inline __u32 lustre_request_magic(struct ptlrpc_request *req)
+{
+	return lustre_msg_get_magic(req->rq_reqmsg);	/* reqmsg magic */
+}
+
+/* Return lm_repsize of \a req's request message (V2 magic only). */
+static inline int ptlrpc_req_get_repsize(struct ptlrpc_request *req)
+{
+	if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2)
+		return req->rq_reqmsg->lm_repsize;
+
+	/* any other magic trips the assertion; -EFAULT if it returns */
+	LASSERTF(0, "incorrect message magic: %08x\n",
+		 req->rq_reqmsg->lm_magic);
+	return -EFAULT;
+}
+
+/* Returns 1 once rq_delay_limit seconds have passed since rq_queued_time. */
+static inline int ptlrpc_send_limit_expired(struct ptlrpc_request *req)
+{
+	if (req->rq_delay_limit == 0)
+		return 0;	/* a zero limit never expires */
+	return !!cfs_time_before(
+			cfs_time_add(req->rq_queued_time,
+				     cfs_time_seconds(req->rq_delay_limit)),
+			cfs_time_current());
+}
+
+static inline int ptlrpc_no_resend(struct ptlrpc_request *req)
+{
+	if (req->rq_no_resend || !ptlrpc_send_limit_expired(req))
+		return req->rq_no_resend;	/* nothing to latch */
+	spin_lock(&req->rq_lock);
+	req->rq_no_resend = 1;		/* send limit expired */
+	spin_unlock(&req->rq_lock);
+	return req->rq_no_resend;
+}
+
+/* srv_watchdog_factor * max(AT estimate (0 if AT_OFF), obd_timeout) */
+static inline int ptlrpc_server_get_timeout(struct ptlrpc_service_part *svcpt)
+{
+	int estimate = AT_OFF ? 0 : at_get(&svcpt->scp_at_estimate);
+
+	return svcpt->scp_service->srv_watchdog_factor *
+	       max_t(int, estimate, obd_timeout);
+}
+
+/* Map \a req to its service through rq_rqbd (asserted non-NULL). */
+static inline struct ptlrpc_service *ptlrpc_req2svc(struct ptlrpc_request *req)
+{
+	LASSERT(req->rq_rqbd != NULL);
+	return req->rq_rqbd->rqbd_svcpt->scp_service;
+}
+
+/* ldlm/ldlm_lib.c */
+/**
+ * Target client logic
+ * @{
+ */
+int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg);
+int client_obd_cleanup(struct obd_device *obddev);
+int client_connect_import(const struct lu_env *env,
+			  struct obd_export **exp, struct obd_device *obd,
+			  struct obd_uuid *cluuid, struct obd_connect_data *,
+			  void *localdata);
+int client_disconnect_export(struct obd_export *exp);
+int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid,
+			   int priority);
+int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid);
+int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer,
+			    struct obd_uuid *uuid);
+int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid);
+void client_destroy_import(struct obd_import *imp);
+/** @} */
+
+
+/* ptlrpc/pinger.c */
+/**
+ * Pinger API (client side only)
+ * @{
+ */
+enum timeout_event {
+	TIMEOUT_GRANT = 1
+};
+struct timeout_item;
+typedef int (*timeout_cb_t)(struct timeout_item *, void *);
+int ptlrpc_pinger_add_import(struct obd_import *imp);
+int ptlrpc_pinger_del_import(struct obd_import *imp);
+int ptlrpc_add_timeout_client(int time, enum timeout_event event,
+			      timeout_cb_t cb, void *data,
+			      struct list_head *obd_list);
+int ptlrpc_del_timeout_client(struct list_head *obd_list,
+			      enum timeout_event event);
+struct ptlrpc_request * ptlrpc_prep_ping(struct obd_import *imp);
+int ptlrpc_obd_ping(struct obd_device *obd);
+cfs_time_t ptlrpc_suspend_wakeup_time(void);
+void ping_evictor_start(void);
+void ping_evictor_stop(void);
+int ptlrpc_check_and_wait_suspend(struct ptlrpc_request *req);
+void ptlrpc_pinger_ir_up(void);
+void ptlrpc_pinger_ir_down(void);
+/** @} */
+int ptlrpc_pinger_suppress_pings(void);
+
+/* ptlrpc daemon bind policy */
+typedef enum {
+	/* all ptlrpcd threads are free mode */
+	PDB_POLICY_NONE	  = 1,
+	/* all ptlrpcd threads are bound mode */
+	PDB_POLICY_FULL	  = 2,
+	/* <free1 bound1> <free2 bound2> ... <freeN boundN> */
+	PDB_POLICY_PAIR	  = 3,
+	/* <free1 bound1> <bound1 free2> ... <freeN boundN> <boundN free1>,
+	 * means each ptlrpcd[X] has two partners: thread[X-1] and thread[X+1].
+	 * If kernel supports NUMA, ptlrpcd threads are bound and
+	 * grouped by NUMA node */
+	PDB_POLICY_NEIGHBOR      = 4,
+} pdb_policy_t;
+
+/* ptlrpc daemon load policy
+ * It is the caller's duty to specify how to push the async RPC into some
+ * ptlrpcd queue, but this is not enforced: it is affected by
+ * "ptlrpcd_bind_policy". With "PDB_POLICY_FULL" the RPC is processed by the
+ * selected ptlrpcd; otherwise it may be processed by the selected ptlrpcd or
+ * its partner, whichever is scheduled first, to accelerate RPC processing. */
+typedef enum {
+	/* on the same CPU core as the caller */
+	PDL_POLICY_SAME	 = 1,
+	/* within the same CPU partition, but not the same core as the caller */
+	PDL_POLICY_LOCAL	= 2,
+	/* round-robin on all CPU cores, but not the same core as the caller */
+	PDL_POLICY_ROUND	= 3,
+	/* the specified CPU core is preferred, but not enforced */
+	PDL_POLICY_PREFERRED    = 4,
+} pdl_policy_t;
+
+/* ptlrpc/ptlrpcd.c */
+void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force);
+void ptlrpcd_free(struct ptlrpcd_ctl *pc);
+void ptlrpcd_wake(struct ptlrpc_request *req);
+void ptlrpcd_add_req(struct ptlrpc_request *req, pdl_policy_t policy, int idx);
+void ptlrpcd_add_rqset(struct ptlrpc_request_set *set);
+int ptlrpcd_addref(void);
+void ptlrpcd_decref(void);
+
+/* ptlrpc/lproc_ptlrpc.c */
+/**
+ * procfs output related functions
+ * @{
+ */
+const char* ll_opcode2str(__u32 opcode);
+#ifdef LPROCFS
+void ptlrpc_lprocfs_register_obd(struct obd_device *obd);
+void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd);
+void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes);
+#else
+static inline void ptlrpc_lprocfs_register_obd(struct obd_device *obd) {}
+static inline void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) {}
+static inline void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) {}
+#endif
+/** @} */
+
+/* ptlrpc/llog_server.c */
+int llog_origin_handle_open(struct ptlrpc_request *req);
+int llog_origin_handle_destroy(struct ptlrpc_request *req);
+int llog_origin_handle_prev_block(struct ptlrpc_request *req);
+int llog_origin_handle_next_block(struct ptlrpc_request *req);
+int llog_origin_handle_read_header(struct ptlrpc_request *req);
+int llog_origin_handle_close(struct ptlrpc_request *req);
+int llog_origin_handle_cancel(struct ptlrpc_request *req);
+
+/* ptlrpc/llog_client.c */
+extern struct llog_operations llog_client_ops;
+
+/** @} net */
+
+#endif
+/** @} PtlRPC */
diff --git a/drivers/staging/lustre/lustre/include/lustre_param.h b/drivers/staging/lustre/lustre/include/lustre_param.h
new file mode 100644
index 000000000000..ed654684cb64
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_param.h
@@ -0,0 +1,121 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_param.h
+ *
+ * User-settable parameter keys
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+#ifndef _LUSTRE_PARAM_H
+#define _LUSTRE_PARAM_H
+
+/** \defgroup param param
+ *
+ * @{
+ */
+
+/* For interoperability */
+struct cfg_interop_param {
+	char *old_param;
+	char *new_param;
+};
+
+/* obd_config.c */
+int class_find_param(char *buf, char *key, char **valp);
+struct cfg_interop_param *class_find_old_param(const char *param,
+					       struct cfg_interop_param *ptr);
+int class_get_next_param(char **params, char *copy);
+int class_match_param(char *buf, char *key, char **valp);
+int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh);
+int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh);
+int class_parse_net(char *buf, __u32 *net, char **endh);
+int class_match_nid(char *buf, char *key, lnet_nid_t nid);
+int class_match_net(char *buf, char *key, __u32 net);
+/* obd_mount.c */
+int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
+	    char *s1, char *s2, char *s3, char *s4);
+
+
+
+/****************** User-settable parameter keys *********************/
+/* e.g.
+	tunefs.lustre --param="failover.node=192.168.0.13@tcp0" /dev/sda
+	lctl conf_param testfs-OST0000 failover.node=3@elan,192.168.0.3@tcp0
+		    ... testfs-MDT0000.lov.stripesize=4M
+		    ... testfs-OST0000.ost.client_cache_seconds=15
+		    ... testfs.sys.timeout=<secs>
+		    ... testfs.llite.max_read_ahead_mb=16
+*/
+
+/* System global or special params not handled in obd's proc
+ * See mgs_write_log_sys()
+ */
+#define PARAM_TIMEOUT	      "timeout="	  /* global */
+#define PARAM_LDLM_TIMEOUT	 "ldlm_timeout="     /* global */
+#define PARAM_AT_MIN	       "at_min="	   /* global */
+#define PARAM_AT_MAX	       "at_max="	   /* global */
+#define PARAM_AT_EXTRA	     "at_extra="	 /* global */
+#define PARAM_AT_EARLY_MARGIN      "at_early_margin="  /* global */
+#define PARAM_AT_HISTORY	   "at_history="       /* global */
+#define PARAM_JOBID_VAR		   "jobid_var="	       /* global */
+#define PARAM_MGSNODE	      "mgsnode="	  /* only at mounttime */
+#define PARAM_FAILNODE	     "failover.node="    /* add failover nid */
+#define PARAM_FAILMODE	     "failover.mode="    /* initial mount only */
+#define PARAM_ACTIVE	       "active="	   /* activate/deactivate */
+#define PARAM_NETWORK	      "network="	  /* bind on nid */
+#define PARAM_ID_UPCALL		"identity_upcall="  /* identity upcall */
+
+/* Prefixes for parameters handled by obd's proc methods (XXX_process_config) */
+#define PARAM_OST		  "ost."
+#define PARAM_OSC		  "osc."
+#define PARAM_MDT		  "mdt."
+#define PARAM_MDD		  "mdd."
+#define PARAM_MDC		  "mdc."
+#define PARAM_LLITE		"llite."
+#define PARAM_LOV		  "lov."
+#define PARAM_LOD		"lod."
+#define PARAM_OSP		"osp."
+#define PARAM_SYS		  "sys."	      /* global */
+#define PARAM_SRPC		 "srpc."
+#define PARAM_SRPC_FLVR	    "srpc.flavor."
+#define PARAM_SRPC_UDESC	   "srpc.udesc.cli2mdt"
+#define PARAM_SEC		  "security."
+#define PARAM_QUOTA		"quota."	    /* global */
+
+/** @} param */
+
+#endif /* _LUSTRE_PARAM_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_quota.h b/drivers/staging/lustre/lustre/include/lustre_quota.h
new file mode 100644
index 000000000000..1c3041f50049
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_quota.h
@@ -0,0 +1,239 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LUSTRE_QUOTA_H
+#define _LUSTRE_QUOTA_H
+
+/** \defgroup quota quota
+ * @{
+ */
+
+#include <linux/lustre_quota.h>
+
+#include <dt_object.h>
+#include <lustre_fid.h>
+#include <lustre_dlm.h>
+
+#ifndef MAX_IQ_TIME
+#define MAX_IQ_TIME  604800     /* (7*24*60*60) 1 week */
+#endif
+
+#ifndef MAX_DQ_TIME
+#define MAX_DQ_TIME  604800     /* (7*24*60*60) 1 week */
+#endif
+
+struct lquota_id_info;
+struct lquota_trans;
+
+/* Gather all quota record types in a union that can be used to read any record
+ * from disk. All fields of these records must be 64-bit aligned, otherwise the
+ * OSD layer may swab them incorrectly. */
+union lquota_rec {
+	struct lquota_glb_rec	lqr_glb_rec;
+	struct lquota_slv_rec	lqr_slv_rec;
+	struct lquota_acct_rec	lqr_acct_rec;
+};
+
+/* Index features supported by the global index objects
+ * Only used for migration purpose and should be removed once on-disk migration
+ * is no longer needed */
+extern struct dt_index_features dt_quota_iusr_features;
+extern struct dt_index_features dt_quota_busr_features;
+extern struct dt_index_features dt_quota_igrp_features;
+extern struct dt_index_features dt_quota_bgrp_features;
+
+/* Name used in the configuration logs to identify the default metadata pool
+ * (composed of all the MDTs, with pool ID 0) and the default data pool (all
+ * the OSTs, with pool ID 0 too). */
+#define QUOTA_METAPOOL_NAME   "mdt="
+#define QUOTA_DATAPOOL_NAME   "ost="
+
+/*
+ * Quota Master Target support
+ */
+
+/* Request handlers for quota master operations.
+ * This is used by the MDT to pass quota/lock requests to the quota master
+ * target. This won't be needed any more once the QMT is a real target and
+ * does not rely any more on the MDT service threads and namespace. */
+struct qmt_handlers {
+	/* Handle quotactl request from client. */
+	int (*qmth_quotactl)(const struct lu_env *, struct lu_device *,
+			     struct obd_quotactl *);
+
+	/* Handle dqacq/dqrel request from slave. */
+	int (*qmth_dqacq)(const struct lu_env *, struct lu_device *,
+			  struct ptlrpc_request *);
+
+	/* LDLM intent policy associated with quota locks */
+	int (*qmth_intent_policy)(const struct lu_env *, struct lu_device *,
+				  struct ptlrpc_request *, struct ldlm_lock **,
+				  int);
+
+	/* Initialize LVB of ldlm resource associated with quota objects */
+	int (*qmth_lvbo_init)(struct lu_device *, struct ldlm_resource *);
+
+	/* Update LVB of ldlm resource associated with quota objects */
+	int (*qmth_lvbo_update)(struct lu_device *, struct ldlm_resource *,
+				struct ptlrpc_request *, int);
+
+	/* Return size of LVB to be packed in ldlm message */
+	int (*qmth_lvbo_size)(struct lu_device *, struct ldlm_lock *);
+
+	/* Fill request buffer with lvb */
+	int (*qmth_lvbo_fill)(struct lu_device *, struct ldlm_lock *, void *,
+			      int);
+
+	/* Free lvb associated with ldlm resource */
+	int (*qmth_lvbo_free)(struct lu_device *, struct ldlm_resource *);
+};
+
+/* actual handlers are defined in lustre/quota/qmt_handler.c */
+extern struct qmt_handlers qmt_hdls;
+
+/*
+ * Quota enforcement support on slaves
+ */
+
+struct qsd_instance;
+
+/* The quota slave feature is implemented under the form of a library.
+ * The API is the following:
+ *
+ * - qsd_init(): the user (mostly the OSD layer) should first allocate a qsd
+ *	       instance via qsd_init(). This creates all required structures
+ *	       to manage quota enforcement for this target and performs all
+ *	       low-level initialization which does not involve any lustre
+ *	       object. qsd_init() should typically be called when the OSD
+ *	       is being set up.
+ *
+ * - qsd_prepare(): This sets up on-disk objects associated with the quota slave
+ *		  feature and initiates the quota reintegration procedure if
+ *		  needed. qsd_prepare() should typically be called when
+ *		  ->ldo_prepare is invoked.
+ *
+ * - qsd_start(): a qsd instance should be started once recovery is completed
+ *		(i.e. when ->ldo_recovery_complete is called). This is used
+ *		to notify the qsd layer that quota should now be enforced
+ *		again via the qsd_op_begin/end functions. The last step of the
+ *		reintegration procedure (namely usage reconciliation) will be
+ *		completed during start.
+ *
+ * - qsd_fini(): is used to release a qsd_instance structure allocated with
+ *	       qsd_init(). This releases all quota slave objects and frees the
+ *	       structures associated with the qsd_instance.
+ *
+ * - qsd_op_begin(): is used to enforce quota, it must be called in the
+ *		   declaration of each operation. qsd_op_end() should then be
+ *		   invoked later once all operations have been completed in
+ *		   order to release/adjust the quota space.
+ *		   Running qsd_op_begin() before qsd_start() isn't fatal and
+ *		   will return success.
+ *		   Once qsd_start() has been run, qsd_op_begin() will block
+ *		   until the reintegration procedure is completed.
+ *
+ * - qsd_op_end(): performs the post operation quota processing. This must be
+ *		 called after the operation transaction stopped.
+ *		 While qsd_op_begin() must be invoked each time a new
+ *		 operation is declared, qsd_op_end() should be called only
+ *		 once for the whole transaction.
+ *
+ * - qsd_op_adjust(): triggers pre-acquire/release if necessary.
+ *
+ * Below are the function prototypes to be used by OSD layer to manage quota
+ * enforcement. Arguments are documented where each function is defined.  */
+
+struct qsd_instance *qsd_init(const struct lu_env *, char *, struct dt_device *,
+			      proc_dir_entry_t *);
+int qsd_prepare(const struct lu_env *, struct qsd_instance *);
+int qsd_start(const struct lu_env *, struct qsd_instance *);
+void qsd_fini(const struct lu_env *, struct qsd_instance *);
+int qsd_op_begin(const struct lu_env *, struct qsd_instance *,
+		 struct lquota_trans *, struct lquota_id_info *, int *);
+void qsd_op_end(const struct lu_env *, struct qsd_instance *,
+		struct lquota_trans *);
+void qsd_op_adjust(const struct lu_env *, struct qsd_instance *,
+		   union lquota_id *, int);
+/* This is exported for the ldiskfs quota migration only,
+ * see convert_quota_file() */
+int lquota_disk_write_glb(const struct lu_env *, struct dt_object *,
+			  __u64, struct lquota_glb_rec *);
+
+/*
+ * Quota information attached to a transaction
+ */
+
+struct lquota_entry;
+
+struct lquota_id_info {
+	/* quota identifier */
+	union lquota_id		 lqi_id;
+
+	/* USRQUOTA or GRPQUOTA for now, could be expanded for
+	 * directory quota or other types later.  */
+	int			 lqi_type;
+
+	/* inodes or kbytes to be consumed or released, it could
+	 * be negative when releasing space.  */
+	long long		 lqi_space;
+
+	/* quota slave entry structure associated with this ID */
+	struct lquota_entry	*lqi_qentry;
+
+	/* whether we are reporting blocks or inodes */
+	bool			 lqi_is_blk;
+};
+
+/* Since we enforce only inode quota in meta pool (MDTs), and block quota in
+ * data pool (OSTs), there are at most 4 quota ids being enforced in a single
+ * transaction, which is chown transaction:
+ * original uid and gid, new uid and gid.
+ *
+ * This value might need to be revised when directory quota is added.  */
+#define QUOTA_MAX_TRANSIDS    4
+
+/* all qids involved in a single transaction */
+struct lquota_trans {
+	unsigned short		lqt_id_cnt;
+	struct lquota_id_info	lqt_ids[QUOTA_MAX_TRANSIDS];
+};
+
+/* flags for quota local enforcement */
+#define QUOTA_FL_OVER_USRQUOTA  0x01
+#define QUOTA_FL_OVER_GRPQUOTA  0x02
+#define QUOTA_FL_SYNC	   0x04
+
+#define IS_LQUOTA_RES(res)						\
+	((res)->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA ||	\
+	 (res)->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA_GLB)
+
+/* helper function used by MDT & OFD to retrieve quota accounting information
+ * on slave */
+int lquotactl_slv(const struct lu_env *, struct dt_device *,
+		  struct obd_quotactl *);
+/** @} quota */
+#endif /* _LUSTRE_QUOTA_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_req_layout.h b/drivers/staging/lustre/lustre/include/lustre_req_layout.h
new file mode 100644
index 000000000000..f4d3820865f1
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_req_layout.h
@@ -0,0 +1,334 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_req_layout.h
+ *
+ * Lustre Metadata Target (mdt) request handler
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#ifndef _LUSTRE_REQ_LAYOUT_H__
+#define _LUSTRE_REQ_LAYOUT_H__
+
+/** \defgroup req_layout req_layout
+ *
+ * @{
+ */
+
+struct req_msg_field;
+struct req_format;
+struct req_capsule;
+
+struct ptlrpc_request;
+
+enum req_location {	/* first index of struct req_capsule.rc_area */
+	RCL_CLIENT,
+	RCL_SERVER,
+	RCL_NR		/* number of locations, not a location itself */
+};
+
+/* Maximal number of fields (buffers) in a request message. */
+#define REQ_MAX_FIELD_NR  9
+
+struct req_capsule {	/* rc_area is indexed [location][field] */
+	struct ptlrpc_request   *rc_req;	/* cf. req_capsule_init() */
+	const struct req_format *rc_fmt;	/* cf. req_capsule_set() */
+	enum req_location	rc_loc;		/* cf. req_capsule_init() */
+	__u32		    rc_area[RCL_NR][REQ_MAX_FIELD_NR];
+};
+
+#if !defined(__REQ_LAYOUT_USER__)
+
+/* struct ptlrpc_request, lustre_msg* */
+#include <lustre_net.h>
+
+void req_capsule_init(struct req_capsule *pill, struct ptlrpc_request *req,
+		      enum req_location location);
+void req_capsule_fini(struct req_capsule *pill);
+
+void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt);
+void req_capsule_client_dump(struct req_capsule *pill);
+void req_capsule_server_dump(struct req_capsule *pill);
+void req_capsule_init_area(struct req_capsule *pill);
+int req_capsule_filled_sizes(struct req_capsule *pill, enum req_location loc);
+int  req_capsule_server_pack(struct req_capsule *pill);
+
+void *req_capsule_client_get(struct req_capsule *pill,
+			     const struct req_msg_field *field);
+void *req_capsule_client_swab_get(struct req_capsule *pill,
+				  const struct req_msg_field *field,
+				  void *swabber);
+void *req_capsule_client_sized_get(struct req_capsule *pill,
+				   const struct req_msg_field *field,
+				   int len);
+void *req_capsule_server_get(struct req_capsule *pill,
+			     const struct req_msg_field *field);
+void *req_capsule_server_sized_get(struct req_capsule *pill,
+				   const struct req_msg_field *field,
+				   int len);
+void *req_capsule_server_swab_get(struct req_capsule *pill,
+				  const struct req_msg_field *field,
+				  void *swabber);
+void *req_capsule_server_sized_swab_get(struct req_capsule *pill,
+					const struct req_msg_field *field,
+					int len, void *swabber);
+const void *req_capsule_other_get(struct req_capsule *pill,
+				  const struct req_msg_field *field);
+
+void req_capsule_set_size(struct req_capsule *pill,
+			  const struct req_msg_field *field,
+			  enum req_location loc, int size);
+int req_capsule_get_size(const struct req_capsule *pill,
+			  const struct req_msg_field *field,
+			  enum req_location loc);
+int req_capsule_msg_size(struct req_capsule *pill, enum req_location loc);
+int req_capsule_fmt_size(__u32 magic, const struct req_format *fmt,
+			 enum req_location loc);
+void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt);
+
+int req_capsule_has_field(const struct req_capsule *pill,
+			  const struct req_msg_field *field,
+			  enum req_location loc);
+int req_capsule_field_present(const struct req_capsule *pill,
+			      const struct req_msg_field *field,
+			      enum req_location loc);
+void req_capsule_shrink(struct req_capsule *pill,
+			const struct req_msg_field *field,
+			unsigned int newlen,
+			enum req_location loc);
+int req_capsule_server_grow(struct req_capsule *pill,
+			    const struct req_msg_field *field,
+			    unsigned int newlen);
+int  req_layout_init(void);
+void req_layout_fini(void);
+
+/* __REQ_LAYOUT_USER__ */
+#endif
+
+extern struct req_format RQF_OBD_PING;
+extern struct req_format RQF_OBD_SET_INFO;
+extern struct req_format RQF_SEC_CTX;
+extern struct req_format RQF_OBD_IDX_READ;
+/* MGS req_format */
+extern struct req_format RQF_MGS_TARGET_REG;
+extern struct req_format RQF_MGS_SET_INFO;
+extern struct req_format RQF_MGS_CONFIG_READ;
+/* fid/fld req_format */
+extern struct req_format RQF_SEQ_QUERY;
+extern struct req_format RQF_FLD_QUERY;
+/* MDS req_format */
+extern struct req_format RQF_MDS_CONNECT;
+extern struct req_format RQF_MDS_DISCONNECT;
+extern struct req_format RQF_MDS_STATFS;
+extern struct req_format RQF_MDS_GETSTATUS;
+extern struct req_format RQF_MDS_SYNC;
+extern struct req_format RQF_MDS_GETXATTR;
+extern struct req_format RQF_MDS_GETATTR;
+extern struct req_format RQF_UPDATE_OBJ;
+
+/*
+ * This is format of direct (non-intent) MDS_GETATTR_NAME request.
+ */
+extern struct req_format RQF_MDS_GETATTR_NAME;
+extern struct req_format RQF_MDS_CLOSE;
+extern struct req_format RQF_MDS_PIN;
+extern struct req_format RQF_MDS_UNPIN;
+extern struct req_format RQF_MDS_CONNECT;
+extern struct req_format RQF_MDS_DISCONNECT;
+extern struct req_format RQF_MDS_GET_INFO;
+extern struct req_format RQF_MDS_READPAGE;
+extern struct req_format RQF_MDS_WRITEPAGE;
+extern struct req_format RQF_MDS_IS_SUBDIR;
+extern struct req_format RQF_MDS_DONE_WRITING;
+extern struct req_format RQF_MDS_REINT;
+extern struct req_format RQF_MDS_REINT_CREATE;
+extern struct req_format RQF_MDS_REINT_CREATE_RMT_ACL;
+extern struct req_format RQF_MDS_REINT_CREATE_SLAVE;
+extern struct req_format RQF_MDS_REINT_CREATE_SYM;
+extern struct req_format RQF_MDS_REINT_OPEN;
+extern struct req_format RQF_MDS_REINT_UNLINK;
+extern struct req_format RQF_MDS_REINT_LINK;
+extern struct req_format RQF_MDS_REINT_RENAME;
+extern struct req_format RQF_MDS_REINT_SETATTR;
+extern struct req_format RQF_MDS_REINT_SETXATTR;
+extern struct req_format RQF_MDS_QUOTACHECK;
+extern struct req_format RQF_MDS_QUOTACTL;
+extern struct req_format RQF_QC_CALLBACK;
+extern struct req_format RQF_QUOTA_DQACQ;
+extern struct req_format RQF_MDS_SWAP_LAYOUTS;
+/* MDS hsm formats */
+extern struct req_format RQF_MDS_HSM_STATE_GET;
+extern struct req_format RQF_MDS_HSM_STATE_SET;
+extern struct req_format RQF_MDS_HSM_ACTION;
+extern struct req_format RQF_MDS_HSM_PROGRESS;
+extern struct req_format RQF_MDS_HSM_CT_REGISTER;
+extern struct req_format RQF_MDS_HSM_CT_UNREGISTER;
+extern struct req_format RQF_MDS_HSM_REQUEST;
+/* OST req_format */
+extern struct req_format RQF_OST_CONNECT;
+extern struct req_format RQF_OST_DISCONNECT;
+extern struct req_format RQF_OST_QUOTACHECK;
+extern struct req_format RQF_OST_QUOTACTL;
+extern struct req_format RQF_OST_GETATTR;
+extern struct req_format RQF_OST_SETATTR;
+extern struct req_format RQF_OST_CREATE;
+extern struct req_format RQF_OST_PUNCH;
+extern struct req_format RQF_OST_SYNC;
+extern struct req_format RQF_OST_DESTROY;
+extern struct req_format RQF_OST_BRW_READ;
+extern struct req_format RQF_OST_BRW_WRITE;
+extern struct req_format RQF_OST_STATFS;
+extern struct req_format RQF_OST_SET_GRANT_INFO;
+extern struct req_format RQF_OST_GET_INFO_GENERIC;
+extern struct req_format RQF_OST_GET_INFO_LAST_ID;
+extern struct req_format RQF_OST_GET_INFO_LAST_FID;
+extern struct req_format RQF_OST_SET_INFO_LAST_FID;
+extern struct req_format RQF_OST_GET_INFO_FIEMAP;
+
+/* LDLM req_format */
+extern struct req_format RQF_LDLM_ENQUEUE;
+extern struct req_format RQF_LDLM_ENQUEUE_LVB;
+extern struct req_format RQF_LDLM_CONVERT;
+extern struct req_format RQF_LDLM_INTENT;
+extern struct req_format RQF_LDLM_INTENT_BASIC;
+extern struct req_format RQF_LDLM_INTENT_LAYOUT;
+extern struct req_format RQF_LDLM_INTENT_GETATTR;
+extern struct req_format RQF_LDLM_INTENT_OPEN;
+extern struct req_format RQF_LDLM_INTENT_CREATE;
+extern struct req_format RQF_LDLM_INTENT_UNLINK;
+extern struct req_format RQF_LDLM_INTENT_QUOTA;
+extern struct req_format RQF_LDLM_CANCEL;
+extern struct req_format RQF_LDLM_CALLBACK;
+extern struct req_format RQF_LDLM_CP_CALLBACK;
+extern struct req_format RQF_LDLM_BL_CALLBACK;
+extern struct req_format RQF_LDLM_GL_CALLBACK;
+extern struct req_format RQF_LDLM_GL_DESC_CALLBACK;
+/* LOG req_format */
+extern struct req_format RQF_LOG_CANCEL;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER;
+extern struct req_format RQF_LLOG_ORIGIN_CONNECT;
+
+extern struct req_msg_field RMF_GENERIC_DATA;
+extern struct req_msg_field RMF_PTLRPC_BODY;
+extern struct req_msg_field RMF_MDT_BODY;
+extern struct req_msg_field RMF_MDT_EPOCH;
+extern struct req_msg_field RMF_OBD_STATFS;
+extern struct req_msg_field RMF_NAME;
+extern struct req_msg_field RMF_SYMTGT;
+extern struct req_msg_field RMF_TGTUUID;
+extern struct req_msg_field RMF_CLUUID;
+extern struct req_msg_field RMF_SETINFO_VAL;
+extern struct req_msg_field RMF_SETINFO_KEY;
+extern struct req_msg_field RMF_GETINFO_VAL;
+extern struct req_msg_field RMF_GETINFO_VALLEN;
+extern struct req_msg_field RMF_GETINFO_KEY;
+extern struct req_msg_field RMF_IDX_INFO;
+
+/*
+ * connection handle received in MDS_CONNECT request.
+ */
+extern struct req_msg_field RMF_CONN;
+extern struct req_msg_field RMF_CONNECT_DATA;
+extern struct req_msg_field RMF_DLM_REQ;
+extern struct req_msg_field RMF_DLM_REP;
+extern struct req_msg_field RMF_DLM_LVB;
+extern struct req_msg_field RMF_DLM_GL_DESC;
+extern struct req_msg_field RMF_LDLM_INTENT;
+extern struct req_msg_field RMF_LAYOUT_INTENT;
+extern struct req_msg_field RMF_MDT_MD;
+extern struct req_msg_field RMF_REC_REINT;
+extern struct req_msg_field RMF_EADATA;
+extern struct req_msg_field RMF_ACL;
+extern struct req_msg_field RMF_LOGCOOKIES;
+extern struct req_msg_field RMF_CAPA1;
+extern struct req_msg_field RMF_CAPA2;
+extern struct req_msg_field RMF_OBD_QUOTACHECK;
+extern struct req_msg_field RMF_OBD_QUOTACTL;
+extern struct req_msg_field RMF_QUOTA_BODY;
+extern struct req_msg_field RMF_STRING;
+extern struct req_msg_field RMF_SWAP_LAYOUTS;
+extern struct req_msg_field RMF_MDS_HSM_PROGRESS;
+extern struct req_msg_field RMF_MDS_HSM_REQUEST;
+extern struct req_msg_field RMF_MDS_HSM_USER_ITEM;
+extern struct req_msg_field RMF_MDS_HSM_ARCHIVE;
+extern struct req_msg_field RMF_HSM_USER_STATE;
+extern struct req_msg_field RMF_HSM_STATE_SET;
+extern struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION;
+extern struct req_msg_field RMF_MDS_HSM_REQUEST;
+
+/* seq-mgr fields */
+extern struct req_msg_field RMF_SEQ_OPC;
+extern struct req_msg_field RMF_SEQ_RANGE;
+extern struct req_msg_field RMF_FID_SPACE;
+
+/* FLD fields */
+extern struct req_msg_field RMF_FLD_OPC;
+extern struct req_msg_field RMF_FLD_MDFLD;
+
+extern struct req_msg_field RMF_LLOGD_BODY;
+extern struct req_msg_field RMF_LLOG_LOG_HDR;
+extern struct req_msg_field RMF_LLOGD_CONN_BODY;
+
+extern struct req_msg_field RMF_MGS_TARGET_INFO;
+extern struct req_msg_field RMF_MGS_SEND_PARAM;
+
+extern struct req_msg_field RMF_OST_BODY;
+extern struct req_msg_field RMF_OBD_IOOBJ;
+extern struct req_msg_field RMF_OBD_ID;
+extern struct req_msg_field RMF_FID;
+extern struct req_msg_field RMF_NIOBUF_REMOTE;
+extern struct req_msg_field RMF_RCS;
+extern struct req_msg_field RMF_FIEMAP_KEY;
+extern struct req_msg_field RMF_FIEMAP_VAL;
+extern struct req_msg_field RMF_OST_ID;
+
+/* MGS config read message format */
+extern struct req_msg_field RMF_MGS_CONFIG_BODY;
+extern struct req_msg_field RMF_MGS_CONFIG_RES;
+
+/* generic uint32 */
+extern struct req_msg_field RMF_U32;
+
+/* OBJ update format */
+extern struct req_msg_field RMF_UPDATE;
+extern struct req_msg_field RMF_UPDATE_REPLY;
+/** @} req_layout */
+
+#endif /* _LUSTRE_REQ_LAYOUT_H__ */
diff --git a/drivers/staging/lustre/lustre/include/lustre_sec.h b/drivers/staging/lustre/lustre/include/lustre_sec.h
new file mode 100644
index 000000000000..9e0908e1c4d6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_sec.h
@@ -0,0 +1,1145 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_SEC_H_
+#define _LUSTRE_SEC_H_
+
+/** \defgroup sptlrpc sptlrpc
+ *
+ * @{
+ */
+
+/* Forward declarations, so this header need not include the definitions. */
+struct obd_device;
+struct lustre_cfg;
+struct obd_import;
+struct obd_export;
+struct ptlrpc_request;
+struct ptlrpc_reply_state;
+struct ptlrpc_bulk_desc;
+struct brw_page;
+/* Linux specific */
+struct key;
+struct seq_file;
+
+/*
+ * forward declaration
+ */
+struct ptlrpc_sec_policy;
+struct ptlrpc_sec_cops;
+struct ptlrpc_sec_sops;
+struct ptlrpc_sec;
+struct ptlrpc_svc_ctx;
+struct ptlrpc_cli_ctx;
+struct ptlrpc_ctx_ops;
+
+/**
+ * \addtogroup flavor flavor
+ *
+ * An RPC flavor is represented by a 32-bit integer. Currently the high 12
+ * bits are unused and must be set to 0 for future expansion.
+ * <pre>
+ * ------------------------------------------------------------------------
+ * | 4b (bulk svc) | 4b (bulk type) | 4b (svc) | 4b (mech)  | 4b (policy) |
+ * ------------------------------------------------------------------------
+ * </pre>
+ *
+ * @{
+ */
+
+/*
+ * flavor constants
+ */
+enum sptlrpc_policy {
+	SPTLRPC_POLICY_NULL	     = 0,
+	SPTLRPC_POLICY_PLAIN	    = 1,
+	SPTLRPC_POLICY_GSS	      = 2,
+	SPTLRPC_POLICY_MAX,
+};
+
+enum sptlrpc_mech_null {
+	SPTLRPC_MECH_NULL	       = 0,
+	SPTLRPC_MECH_NULL_MAX,
+};
+
+enum sptlrpc_mech_plain {
+	SPTLRPC_MECH_PLAIN	      = 0,
+	SPTLRPC_MECH_PLAIN_MAX,
+};
+
+enum sptlrpc_mech_gss {
+	SPTLRPC_MECH_GSS_NULL	   = 0,
+	SPTLRPC_MECH_GSS_KRB5	   = 1,
+	SPTLRPC_MECH_GSS_MAX,
+};
+
+enum sptlrpc_service_type {
+	SPTLRPC_SVC_NULL		= 0,    /**< no security */
+	SPTLRPC_SVC_AUTH		= 1,    /**< authentication only */
+	SPTLRPC_SVC_INTG		= 2,    /**< integrity */
+	SPTLRPC_SVC_PRIV		= 3,    /**< privacy */
+	SPTLRPC_SVC_MAX,
+};
+
+enum sptlrpc_bulk_type {
+	SPTLRPC_BULK_DEFAULT	    = 0,    /**< follow rpc flavor */
+	SPTLRPC_BULK_HASH	       = 1,    /**< hash integrity */
+	SPTLRPC_BULK_MAX,
+};
+
+enum sptlrpc_bulk_service {
+	SPTLRPC_BULK_SVC_NULL	   = 0,    /**< no security */
+	SPTLRPC_BULK_SVC_AUTH	   = 1,    /**< authentication only */
+	SPTLRPC_BULK_SVC_INTG	   = 2,    /**< integrity */
+	SPTLRPC_BULK_SVC_PRIV	   = 3,    /**< privacy */
+	SPTLRPC_BULK_SVC_MAX,
+};
+
+/*
+ * compose/extract macros
+ */
+#define FLVR_POLICY_OFFSET	      (0)
+#define FLVR_MECH_OFFSET		(4)
+#define FLVR_SVC_OFFSET		 (8)
+#define FLVR_BULK_TYPE_OFFSET	   (12)
+#define FLVR_BULK_SVC_OFFSET	    (16)
+
+#define MAKE_FLVR(policy, mech, svc, btype, bsvc)		       \
+	(((__u32)(policy) << FLVR_POLICY_OFFSET) |		      \
+	 ((__u32)(mech) << FLVR_MECH_OFFSET) |			  \
+	 ((__u32)(svc) << FLVR_SVC_OFFSET) |			    \
+	 ((__u32)(btype) << FLVR_BULK_TYPE_OFFSET) |		    \
+	 ((__u32)(bsvc) << FLVR_BULK_SVC_OFFSET))
+
+/*
+ * extraction
+ */
+#define SPTLRPC_FLVR_POLICY(flavor)				     \
+	((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_MECH(flavor)				       \
+	((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_SVC(flavor)					\
+	((((__u32)(flavor)) >> FLVR_SVC_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_BULK_TYPE(flavor)				  \
+	((((__u32)(flavor)) >> FLVR_BULK_TYPE_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_BULK_SVC(flavor)				   \
+	((((__u32)(flavor)) >> FLVR_BULK_SVC_OFFSET) & 0xF)
+
+#define SPTLRPC_FLVR_BASE(flavor)				       \
+	((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xFFF)
+#define SPTLRPC_FLVR_BASE_SUB(flavor)				   \
+	((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xFF)
+
+/*
+ * gss subflavors
+ */
+#define MAKE_BASE_SUBFLVR(mech, svc)				    \
+	((__u32)(mech) |						\
+	 ((__u32)(svc) << (FLVR_SVC_OFFSET - FLVR_MECH_OFFSET)))
+
+#define SPTLRPC_SUBFLVR_KRB5N					   \
+	MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_NULL)
+#define SPTLRPC_SUBFLVR_KRB5A					   \
+	MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_AUTH)
+#define SPTLRPC_SUBFLVR_KRB5I					   \
+	MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_INTG)
+#define SPTLRPC_SUBFLVR_KRB5P					   \
+	MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_PRIV)
+
+/*
+ * "end user" flavors
+ */
+#define SPTLRPC_FLVR_NULL			       \
+	MAKE_FLVR(SPTLRPC_POLICY_NULL,		  \
+		  SPTLRPC_MECH_NULL,		    \
+		  SPTLRPC_SVC_NULL,		     \
+		  SPTLRPC_BULK_DEFAULT,		 \
+		  SPTLRPC_BULK_SVC_NULL)
+#define SPTLRPC_FLVR_PLAIN			      \
+	MAKE_FLVR(SPTLRPC_POLICY_PLAIN,		 \
+		  SPTLRPC_MECH_PLAIN,		   \
+		  SPTLRPC_SVC_NULL,		     \
+		  SPTLRPC_BULK_HASH,		    \
+		  SPTLRPC_BULK_SVC_INTG)
+#define SPTLRPC_FLVR_KRB5N			      \
+	MAKE_FLVR(SPTLRPC_POLICY_GSS,		   \
+		  SPTLRPC_MECH_GSS_KRB5,		\
+		  SPTLRPC_SVC_NULL,		     \
+		  SPTLRPC_BULK_DEFAULT,		 \
+		  SPTLRPC_BULK_SVC_NULL)
+#define SPTLRPC_FLVR_KRB5A			      \
+	MAKE_FLVR(SPTLRPC_POLICY_GSS,		   \
+		  SPTLRPC_MECH_GSS_KRB5,		\
+		  SPTLRPC_SVC_AUTH,		     \
+		  SPTLRPC_BULK_DEFAULT,		 \
+		  SPTLRPC_BULK_SVC_NULL)
+#define SPTLRPC_FLVR_KRB5I			      \
+	MAKE_FLVR(SPTLRPC_POLICY_GSS,		   \
+		  SPTLRPC_MECH_GSS_KRB5,		\
+		  SPTLRPC_SVC_INTG,		     \
+		  SPTLRPC_BULK_DEFAULT,		 \
+		  SPTLRPC_BULK_SVC_INTG)
+#define SPTLRPC_FLVR_KRB5P			      \
+	MAKE_FLVR(SPTLRPC_POLICY_GSS,		   \
+		  SPTLRPC_MECH_GSS_KRB5,		\
+		  SPTLRPC_SVC_PRIV,		     \
+		  SPTLRPC_BULK_DEFAULT,		 \
+		  SPTLRPC_BULK_SVC_PRIV)
+
+#define SPTLRPC_FLVR_DEFAULT	    SPTLRPC_FLVR_NULL
+
+#define SPTLRPC_FLVR_INVALID	    ((__u32) 0xFFFFFFFF)
+#define SPTLRPC_FLVR_ANY		((__u32) 0xFFF00000)
+
+/**
+ * extract the useful part from wire flavor
+ */
+#define WIRE_FLVR(wflvr)		(((__u32) (wflvr)) & 0x000FFFFF)
+
+/** @} flavor */
+
+/* Set the svc field of \a *flvr to \a svc; the unused high bits become 0. */
+static inline void flvr_set_svc(__u32 *flvr, __u32 svc)
+{
+	LASSERT(svc < SPTLRPC_SVC_MAX);
+
+	*flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr), SPTLRPC_FLVR_MECH(*flvr),
+			  svc, SPTLRPC_FLVR_BULK_TYPE(*flvr),
+			  SPTLRPC_FLVR_BULK_SVC(*flvr));
+}
+
+/* Set the bulk svc field of \a *flvr to \a svc; unused high bits become 0. */
+static inline void flvr_set_bulk_svc(__u32 *flvr, __u32 svc)
+{
+	LASSERT(svc < SPTLRPC_BULK_SVC_MAX);
+
+	*flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr), SPTLRPC_FLVR_MECH(*flvr),
+			  SPTLRPC_FLVR_SVC(*flvr),
+			  SPTLRPC_FLVR_BULK_TYPE(*flvr), svc);
+}
+
+struct bulk_spec_hash {
+	__u8    hash_alg;
+};
+
+/**
+ * Full description of the flavors in use on a ptlrpc connection, including
+ * both the regular RPC and the bulk transfer parts.
+ */
+struct sptlrpc_flavor {
+	/**
+	 * wire flavor, should be renamed to sf_wire.
+	 */
+	__u32   sf_rpc;
+	/**
+	 * general flags of PTLRPC_SEC_FL_*
+	 */
+	__u32   sf_flags;
+	/**
+	 * rpc flavor specification
+	 */
+	union {
+		/* nothing for now */
+	} u_rpc;
+	/**
+	 * bulk flavor specification
+	 */
+	union {
+		struct bulk_spec_hash hash;
+	} u_bulk;
+};
+
+/**
+ * Identifies which part of Lustre generated an RPC. It is encoded into
+ * RPC requests and is to be checked by the ptlrpc service.
+ */
+enum lustre_sec_part {
+	LUSTRE_SP_CLI	   = 0,
+	LUSTRE_SP_MDT,
+	LUSTRE_SP_OST,
+	LUSTRE_SP_MGC,
+	LUSTRE_SP_MGS,
+	LUSTRE_SP_ANY	   = 0xFF
+};
+
+const char *sptlrpc_part2name(enum lustre_sec_part sp);
+enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd);
+
+/**
+ * A rule specifies a flavor to be used by a ptlrpc connection between
+ * two Lustre parts.
+ */
+struct sptlrpc_rule {
+	__u32		   sr_netid;   /* LNET network ID */
+	__u8		    sr_from;    /* sec_part */
+	__u8		    sr_to;      /* sec_part */
+	__u16		   sr_padding;
+	struct sptlrpc_flavor   sr_flvr;
+};
+
+/**
+ * A set of rules in memory.
+ *
+ * Rules are generated and stored on MGS, and propagated to MDT, OST,
+ * and client when needed.
+ */
+struct sptlrpc_rule_set {
+	int		     srs_nslot;	/* presumably allocated slots */
+	int		     srs_nrule;	/* presumably rules in use */
+	struct sptlrpc_rule    *srs_rules;	/* rule array */
+};
+
+int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr);
+int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr);
+
+static inline void sptlrpc_rule_set_init(struct sptlrpc_rule_set *set)
+{
+	memset(set, 0, sizeof(*set));	/* zero every field of the set */
+}
+
+void sptlrpc_rule_set_free(struct sptlrpc_rule_set *set);
+int  sptlrpc_rule_set_expand(struct sptlrpc_rule_set *set);
+int  sptlrpc_rule_set_merge(struct sptlrpc_rule_set *set,
+			    struct sptlrpc_rule *rule);
+int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset,
+			    enum lustre_sec_part from,
+			    enum lustre_sec_part to,
+			    lnet_nid_t nid,
+			    struct sptlrpc_flavor *sf);
+void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *set);
+
+int  sptlrpc_process_config(struct lustre_cfg *lcfg);
+void sptlrpc_conf_log_start(const char *logname);
+void sptlrpc_conf_log_stop(const char *logname);
+void sptlrpc_conf_log_update_begin(const char *logname);
+void sptlrpc_conf_log_update_end(const char *logname);
+void sptlrpc_conf_client_adapt(struct obd_device *obd);
+int  sptlrpc_conf_target_get_rules(struct obd_device *obd,
+				   struct sptlrpc_rule_set *rset,
+				   int initial);
+void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset,
+				  enum lustre_sec_part from,
+				  lnet_nid_t nid,
+				  struct sptlrpc_flavor *flavor);
+
+/* The maximum length of security payload. 1024 is enough for Kerberos 5,
+ * and should be enough for other future mechanisms but not sure.
+ * Only used by pre-allocated request/reply pool.
+ */
+#define SPTLRPC_MAX_PAYLOAD     (1024)
+
+
+struct vfs_cred {
+	uint32_t	vc_uid;
+	uint32_t	vc_gid;
+};
+
+struct ptlrpc_ctx_ops {
+	/**
+	 * To determine whether it's suitable to use the \a ctx for \a vcred.
+	 */
+	int     (*match)       (struct ptlrpc_cli_ctx *ctx,
+				struct vfs_cred *vcred);
+
+	/**
+	 * To bring the \a ctx uptodate.
+	 */
+	int     (*refresh)     (struct ptlrpc_cli_ctx *ctx);
+
+	/**
+	 * Validate the \a ctx.
+	 */
+	int     (*validate)    (struct ptlrpc_cli_ctx *ctx);
+
+	/**
+	 * Force the \a ctx to die.
+	 */
+	void    (*die)	 (struct ptlrpc_cli_ctx *ctx,
+				int grace);
+	int     (*display)     (struct ptlrpc_cli_ctx *ctx,
+				char *buf, int bufsize);
+
+	/**
+	 * Sign the request message using \a ctx.
+	 *
+	 * \pre req->rq_reqmsg points to the request message.
+	 * \pre req->rq_reqlen is the request message length.
+	 * \post req->rq_reqbuf points to the request message with signature.
+	 * \post req->rq_reqdata_len is set to the final request message size.
+	 *
+	 * \see null_ctx_sign(), plain_ctx_sign(), gss_cli_ctx_sign().
+	 */
+	int     (*sign)	(struct ptlrpc_cli_ctx *ctx,
+				struct ptlrpc_request *req);
+
+	/**
+	 * Verify the reply message using \a ctx.
+	 *
+	 * \pre req->rq_repdata points to the reply message with signature.
+	 * \pre req->rq_repdata_len is the total reply message length.
+	 * \post req->rq_repmsg points to the reply message without signature.
+	 * \post req->rq_replen is the reply message length.
+	 *
+	 * \see null_ctx_verify(), plain_ctx_verify(), gss_cli_ctx_verify().
+	 */
+	int     (*verify)      (struct ptlrpc_cli_ctx *ctx,
+				struct ptlrpc_request *req);
+
+	/**
+	 * Encrypt the request message using \a ctx.
+	 *
+	 * \pre req->rq_reqmsg points to the request message in clear text.
+	 * \pre req->rq_reqlen is the request message length.
+	 * \post req->rq_reqbuf points to the request message.
+	 * \post req->rq_reqdata_len is set to the final request message size.
+	 *
+	 * \see gss_cli_ctx_seal().
+	 */
+	int     (*seal)	(struct ptlrpc_cli_ctx *ctx,
+				struct ptlrpc_request *req);
+
+	/**
+	 * Decrypt the reply message using \a ctx.
+	 *
+	 * \pre req->rq_repdata points to the encrypted reply message.
+	 * \pre req->rq_repdata_len is the total cipher text length.
+	 * \post req->rq_repmsg points to the reply message in clear text.
+	 * \post req->rq_replen is the reply message length in clear text.
+	 *
+	 * \see gss_cli_ctx_unseal().
+	 */
+	int     (*unseal)      (struct ptlrpc_cli_ctx *ctx,
+				struct ptlrpc_request *req);
+
+	/**
+	 * Wrap bulk request data. This is called before wrapping RPC
+	 * request message.
+	 *
+	 * \pre bulk buffer is described by desc->bd_iov and
+	 * desc->bd_iov_count. Note that for read it's just a buffer, no data
+	 * needs to be sent; for write it contains data in clear text.
+	 * \post when necessary, ptlrpc_bulk_sec_desc was properly prepared
+	 * (usually inside of RPC request message).
+	 * - encryption: cipher text bulk buffer is described by
+	 *   desc->bd_enc_iov and desc->bd_iov_count (currently assume iov
+	 *   count remains the same).
+	 * - otherwise: bulk buffer is still desc->bd_iov and
+	 *   desc->bd_iov_count.
+	 *
+	 * \return 0: success.
+	 * \return -ve: error code.
+	 *
+	 * \see plain_cli_wrap_bulk(), gss_cli_ctx_wrap_bulk().
+	 */
+	int     (*wrap_bulk)   (struct ptlrpc_cli_ctx *ctx,
+				struct ptlrpc_request *req,
+				struct ptlrpc_bulk_desc *desc);
+
+	/**
+	 * Unwrap bulk reply data. This is called after wrapping RPC
+	 * reply message.
+	 *
+	 * \pre bulk buffer is described by desc->bd_iov/desc->bd_enc_iov and
+	 * desc->bd_iov_count, according to wrap_bulk().
+	 * \post final bulk data in clear text is placed in buffer described
+	 * by desc->bd_iov and desc->bd_iov_count.
+	 * \return +ve nob of actual bulk data in clear text.
+	 * \return -ve error code.
+	 *
+	 * \see plain_cli_unwrap_bulk(), gss_cli_ctx_unwrap_bulk().
+	 */
+	int     (*unwrap_bulk) (struct ptlrpc_cli_ctx *ctx,
+				struct ptlrpc_request *req,
+				struct ptlrpc_bulk_desc *desc);
+};
+
+#define PTLRPC_CTX_NEW_BIT	     (0)  /* newly created */
+#define PTLRPC_CTX_UPTODATE_BIT	(1)  /* uptodate */
+#define PTLRPC_CTX_DEAD_BIT	    (2)  /* mark expired gracefully */
+#define PTLRPC_CTX_ERROR_BIT	   (3)  /* fatal error (refresh, etc.) */
+#define PTLRPC_CTX_CACHED_BIT	  (8)  /* in ctx cache (hash etc.) */
+#define PTLRPC_CTX_ETERNAL_BIT	 (9)  /* always valid */
+
+#define PTLRPC_CTX_NEW		 (1 << PTLRPC_CTX_NEW_BIT)
+#define PTLRPC_CTX_UPTODATE	    (1 << PTLRPC_CTX_UPTODATE_BIT)
+#define PTLRPC_CTX_DEAD		(1 << PTLRPC_CTX_DEAD_BIT)
+#define PTLRPC_CTX_ERROR	       (1 << PTLRPC_CTX_ERROR_BIT)
+#define PTLRPC_CTX_CACHED	      (1 << PTLRPC_CTX_CACHED_BIT)
+#define PTLRPC_CTX_ETERNAL	     (1 << PTLRPC_CTX_ETERNAL_BIT)
+
+/* Mask of the ctx status flags; built from flag values, not bit indices. */
+#define PTLRPC_CTX_STATUS_MASK	 (PTLRPC_CTX_NEW	|       \
+					PTLRPC_CTX_UPTODATE   |       \
+					PTLRPC_CTX_DEAD | PTLRPC_CTX_ERROR)
+
+struct ptlrpc_cli_ctx {
+	struct hlist_node	cc_cache;      /* linked into ctx cache */
+	atomic_t	    cc_refcount;   /* reference count */
+	struct ptlrpc_sec      *cc_sec;
+	struct ptlrpc_ctx_ops  *cc_ops;	/* per-context operation vector */
+	cfs_time_t	      cc_expire;     /* in seconds */
+	unsigned int	    cc_early_expire:1;
+	unsigned long	   cc_flags;	/* presumably PTLRPC_CTX_* bits */
+	struct vfs_cred	 cc_vcred;	/* uid/gid pair, see struct vfs_cred */
+	spinlock_t		cc_lock;
+	struct list_head	      cc_req_list;   /* waiting reqs linked here */
+	struct list_head	      cc_gc_chain;   /* linked to gc chain */
+};
+
+/**
+ * client side policy operation vector.
+ */
+struct ptlrpc_sec_cops {
+	/**
+	 * Given an \a imp, create and initialize a ptlrpc_sec structure.
+	 * \param ctx service context:
+	 * - regular import: \a ctx should be NULL;
+	 * - reverse import: \a ctx is obtained from incoming request.
+	 * \param flavor specify what flavor to use.
+	 *
+	 * When necessary, policy module is responsible for taking reference
+	 * on the import.
+	 *
+	 * \see null_create_sec(), plain_create_sec(), gss_sec_create_kr().
+	 */
+	struct ptlrpc_sec *     (*create_sec)  (struct obd_import *imp,
+						struct ptlrpc_svc_ctx *ctx,
+						struct sptlrpc_flavor *flavor);
+
+	/**
+	 * Destructor of ptlrpc_sec. When called, refcount has been dropped
+	 * to 0 and all contexts has been destroyed.
+	 *
+	 * \see null_destroy_sec(), plain_destroy_sec(), gss_sec_destroy_kr().
+	 */
+	void		    (*destroy_sec) (struct ptlrpc_sec *sec);
+
+	/**
+	 * Notify that this ptlrpc_sec is going to die. Optionally, policy
+	 * module is supposed to set sec->ps_dying and whatever necessary
+	 * actions.
+	 *
+	 * \see plain_kill_sec(), gss_sec_kill().
+	 */
+	void		    (*kill_sec)    (struct ptlrpc_sec *sec);
+
+	/**
+	 * Given \a vcred, lookup and/or create its context. The policy module
+	 * is supposed to maintain its own context cache.
+	 * XXX currently \a create and \a remove_dead is always 1, perhaps
+	 * should be removed completely.
+	 *
+	 * \see null_lookup_ctx(), plain_lookup_ctx(), gss_sec_lookup_ctx_kr().
+	 */
+	struct ptlrpc_cli_ctx * (*lookup_ctx)  (struct ptlrpc_sec *sec,
+						struct vfs_cred *vcred,
+						int create,
+						int remove_dead);
+
+	/**
+	 * Called then the reference of \a ctx dropped to 0. The policy module
+	 * is supposed to destroy this context or whatever else according to
+	 * its cache maintainance mechamism.
+	 *
+	 * \param sync if zero, we shouldn't wait for the context being
+	 * destroyed completely.
+	 *
+	 * \see plain_release_ctx(), gss_sec_release_ctx_kr().
+	 */
+	void		    (*release_ctx) (struct ptlrpc_sec *sec,
+						struct ptlrpc_cli_ctx *ctx,
+						int sync);
+
+	/**
+	 * Flush the context cache.
+	 *
+	 * \param uid context of which user, -1 means all contexts.
+	 * \param grace if zero, the PTLRPC_CTX_UPTODATE_BIT of affected
+	 * contexts should be cleared immediately.
+	 * \param force if zero, only idle contexts will be flushed.
+	 *
+	 * \see plain_flush_ctx_cache(), gss_sec_flush_ctx_cache_kr().
+	 */
+	int		     (*flush_ctx_cache)
+					       (struct ptlrpc_sec *sec,
+						uid_t uid,
+						int grace,
+						int force);
+
+	/**
+	 * Called periodically by garbage collector to remove dead contexts
+	 * from cache.
+	 *
+	 * \see gss_sec_gc_ctx_kr().
+	 */
+	void		    (*gc_ctx)      (struct ptlrpc_sec *sec);
+
+	/**
+	 * Given an context \a ctx, install a corresponding reverse service
+	 * context on client side.
+	 * XXX currently it's only used by GSS module, maybe we should remove
+	 * this from general API.
+	 */
+	int		     (*install_rctx)(struct obd_import *imp,
+						struct ptlrpc_sec *sec,
+						struct ptlrpc_cli_ctx *ctx);
+
+	/**
+	 * To allocate request buffer for \a req.
+	 *
+	 * \pre req->rq_reqmsg == NULL.
+	 * \pre req->rq_reqbuf == NULL, otherwise it must be pre-allocated,
+	 * we are not supposed to free it.
+	 * \post if success, req->rq_reqmsg point to a buffer with size
+	 * at least \a lustre_msg_size.
+	 *
+	 * \see null_alloc_reqbuf(), plain_alloc_reqbuf(), gss_alloc_reqbuf().
+	 */
+	int		     (*alloc_reqbuf)(struct ptlrpc_sec *sec,
+						struct ptlrpc_request *req,
+						int lustre_msg_size);
+
+	/**
+	 * To free request buffer for \a req.
+	 *
+	 * \pre req->rq_reqbuf != NULL.
+	 *
+	 * \see null_free_reqbuf(), plain_free_reqbuf(), gss_free_reqbuf().
+	 */
+	void		    (*free_reqbuf) (struct ptlrpc_sec *sec,
+						struct ptlrpc_request *req);
+
+	/**
+	 * To allocate reply buffer for \a req.
+	 *
+	 * \pre req->rq_repbuf == NULL.
+	 * \post on success, req->rq_repbuf points to a buffer of size
+	 * req->rq_repbuf_len, which should be large enough to receive the
+	 * reply transformed from \a lustre_msg_size bytes of clear text.
+	 *
+	 * \see null_alloc_repbuf(), plain_alloc_repbuf(), gss_alloc_repbuf().
+	 */
+	int		     (*alloc_repbuf)(struct ptlrpc_sec *sec,
+						struct ptlrpc_request *req,
+						int lustre_msg_size);
+
+	/**
+	 * To free reply buffer for \a req.
+	 *
+	 * \pre req->rq_repbuf != NULL.
+	 * \post req->rq_repbuf == NULL.
+	 * \post req->rq_repbuf_len == 0.
+	 *
+	 * \see null_free_repbuf(), plain_free_repbuf(), gss_free_repbuf().
+	 */
+	void		    (*free_repbuf) (struct ptlrpc_sec *sec,
+						struct ptlrpc_request *req);
+
+	/**
+	 * To expand the request buffer of \a req, thus the \a segment in
+	 * the request message pointed by req->rq_reqmsg can accommodate
+	 * at least \a newsize of data.
+	 *
+	 * \pre req->rq_reqmsg->lm_buflens[segment] < newsize.
+	 *
+	 * \see null_enlarge_reqbuf(), plain_enlarge_reqbuf(),
+	 * gss_enlarge_reqbuf().
+	 */
+	int		     (*enlarge_reqbuf)
+					       (struct ptlrpc_sec *sec,
+						struct ptlrpc_request *req,
+						int segment, int newsize);
+	/*
+	 * misc
+	 */
+	int		     (*display)     (struct ptlrpc_sec *sec,
+						struct seq_file *seq);
+};
+
+/**
+ * Server-side policy operation vector (a policy's sp_sops points to one).
+ */
+struct ptlrpc_sec_sops {
+	/**
+	 * Verify an incoming request.
+	 *
+	 * \pre request message is pointed to by req->rq_reqbuf, size is
+	 * req->rq_reqdata_len; and the message has been unpacked to
+	 * host byte order.
+	 *
+	 * \retval SECSVC_OK success, req->rq_reqmsg points to request message
+	 * in clear text, size is req->rq_reqlen; req->rq_svc_ctx is set;
+	 * req->rq_sp_from is decoded from request.
+	 * \retval SECSVC_COMPLETE success, the request has been fully
+	 * processed, and reply message has been prepared; req->rq_sp_from is
+	 * decoded from request.
+	 * \retval SECSVC_DROP failed, this request should be dropped.
+	 *
+	 * \see null_accept(), plain_accept(), gss_svc_accept_kr().
+	 */
+	int		     (*accept)      (struct ptlrpc_request *req);
+
+	/**
+	 * Perform security transformation upon reply message.
+	 *
+	 * \pre reply message is pointed to by req->rq_reply_state->rs_msg,
+	 * size is req->rq_replen.
+	 * \post req->rs_repdata_len is the final message size.
+	 * \post req->rq_reply_off is set.
+	 *
+	 * \see null_authorize(), plain_authorize(), gss_svc_authorize().
+	 */
+	int		     (*authorize)   (struct ptlrpc_request *req);
+
+	/**
+	 * Invalidate server context \a ctx.
+	 *
+	 * \see gss_svc_invalidate_ctx().
+	 */
+	void		    (*invalidate_ctx)
+					       (struct ptlrpc_svc_ctx *ctx);
+
+	/**
+	 * Allocate a ptlrpc_reply_state.
+	 *
+	 * \param msgsize size of the reply message in clear text.
+	 * \pre if req->rq_reply_state != NULL, then it's pre-allocated, we
+	 * should simply use it; otherwise we are responsible for allocating
+	 * a new one.
+	 * \post req->rq_reply_state != NULL;
+	 * \post req->rq_reply_state->rs_msg != NULL;
+	 *
+	 * \see null_alloc_rs(), plain_alloc_rs(), gss_svc_alloc_rs().
+	 */
+	int		     (*alloc_rs)    (struct ptlrpc_request *req,
+						int msgsize);
+
+	/**
+	 * Free the ptlrpc_reply_state \a rs.
+	 */
+	void		    (*free_rs)     (struct ptlrpc_reply_state *rs);
+
+	/**
+	 * Release the server context \a ctx.
+	 *
+	 * \see gss_svc_free_ctx().
+	 */
+	void		    (*free_ctx)    (struct ptlrpc_svc_ctx *ctx);
+
+	/**
+	 * Install a reverse context based on the server context \a ctx.
+	 *
+	 * \see gss_svc_install_rctx_kr().
+	 */
+	int		     (*install_rctx)(struct obd_import *imp,
+						struct ptlrpc_svc_ctx *ctx);
+
+	/**
+	 * Prepare buffer for incoming bulk write.
+	 *
+	 * \pre desc->bd_iov and desc->bd_iov_count describe the buffer
+	 * intended to receive the write.
+	 *
+	 * \see gss_svc_prep_bulk().
+	 */
+	int		     (*prep_bulk)   (struct ptlrpc_request *req,
+						struct ptlrpc_bulk_desc *desc);
+
+	/**
+	 * Unwrap the bulk write data.
+	 *
+	 * \see plain_svc_unwrap_bulk(), gss_svc_unwrap_bulk().
+	 */
+	int		     (*unwrap_bulk) (struct ptlrpc_request *req,
+						struct ptlrpc_bulk_desc *desc);
+
+	/**
+	 * Wrap the bulk read data.
+	 *
+	 * \see plain_svc_wrap_bulk(), gss_svc_wrap_bulk().
+	 */
+	int		     (*wrap_bulk)   (struct ptlrpc_request *req,
+						struct ptlrpc_bulk_desc *desc);
+};
+
+struct ptlrpc_sec_policy {
+	module_t		   *sp_owner; /* ref'd by sptlrpc_policy_get()/_put() */
+	char			   *sp_name; /* presumably the policy's name - confirm */
+	__u16			   sp_policy; /* policy number */
+	struct ptlrpc_sec_cops	 *sp_cops;   /* client ops */
+	struct ptlrpc_sec_sops	 *sp_sops;   /* server ops */
+};
+
+#define PTLRPC_SEC_FL_REVERSE	   0x0001 /* reverse sec */
+#define PTLRPC_SEC_FL_ROOTONLY	  0x0002 /* treat everyone as root */
+#define PTLRPC_SEC_FL_UDESC	     0x0004 /* ship udesc */
+#define PTLRPC_SEC_FL_BULK	      0x0008 /* intensive bulk i/o expected */
+#define PTLRPC_SEC_FL_PAG	       0x0010 /* PAG mode */
+
+/**
+ * The ptlrpc_sec represents the client side ptlrpc security facilities;
+ * each obd_import (both regular and reverse import) must be associated
+ * with a ptlrpc_sec.
+ *
+ * \see sptlrpc_import_sec_adapt().
+ */
+struct ptlrpc_sec {
+	struct ptlrpc_sec_policy       *ps_policy;
+	atomic_t		    ps_refcount;
+	/** statistic only */
+	atomic_t		    ps_nctx;
+	/** unique identifier */
+	int			     ps_id;
+	struct sptlrpc_flavor	   ps_flvr; /* sf_flags tested by sec_is_*() */
+	enum lustre_sec_part	    ps_part;
+	/** once set, no new contexts will be created */
+	unsigned int		    ps_dying:1;
+	/** owning import */
+	struct obd_import	      *ps_import;
+	spinlock_t			ps_lock;
+
+	/*
+	 * garbage collection
+	 */
+	struct list_head		      ps_gc_list;
+	cfs_time_t		      ps_gc_interval; /* in seconds */
+	cfs_time_t		      ps_gc_next;     /* in seconds */
+};
+
+static inline int sec_is_reverse(struct ptlrpc_sec *sec)
+{
+	return sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE; /* raw flag bit */
+}
+
+static inline int sec_is_rootonly(struct ptlrpc_sec *sec)
+{
+	return sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_ROOTONLY; /* raw flag bit */
+}
+
+
+struct ptlrpc_svc_ctx {
+	atomic_t		    sc_refcount;
+	struct ptlrpc_sec_policy       *sc_policy;
+};
+
+/*
+ * user identity descriptor: fixed header followed by a variable group list
+ */
+#define LUSTRE_MAX_GROUPS	       (128)
+
+struct ptlrpc_user_desc {
+	__u32	   pud_uid;
+	__u32	   pud_gid;
+	__u32	   pud_fsuid;
+	__u32	   pud_fsgid;
+	__u32	   pud_cap;
+	__u32	   pud_ngroups;	/* presumably entries in pud_groups[] - confirm */
+	__u32	   pud_groups[0]; /* tail sized by sptlrpc_user_desc_size() */
+};
+
+/*
+ * bulk flavors
+ */
+enum sptlrpc_bulk_hash_alg {
+	BULK_HASH_ALG_NULL      = 0,
+	BULK_HASH_ALG_ADLER32,
+	BULK_HASH_ALG_CRC32,
+	BULK_HASH_ALG_MD5,
+	BULK_HASH_ALG_SHA1,
+	BULK_HASH_ALG_SHA256,
+	BULK_HASH_ALG_SHA384,
+	BULK_HASH_ALG_SHA512,
+	BULK_HASH_ALG_MAX	/* equals the number of values listed above */
+};
+
+const char * sptlrpc_get_hash_name(__u8 hash_alg);
+__u8 sptlrpc_get_hash_alg(const char *algname);
+
+enum {
+	BSD_FL_ERR      = 1,
+};
+
+struct ptlrpc_bulk_sec_desc {
+	__u8	    bsd_version;    /* 0 */
+	__u8	    bsd_type;       /* SPTLRPC_BULK_XXX */
+	__u8	    bsd_svc;	/* SPTLRPC_BULK_SVC_XXXX */
+	__u8	    bsd_flags;      /* flags */
+	__u32	   bsd_nob;	/* nob of bulk data */
+	__u8	    bsd_data[0];    /* policy-specific token */
+};
+
+
+/*
+ * lprocfs
+ */
+struct proc_dir_entry;
+extern struct proc_dir_entry *sptlrpc_proc_root;
+
+/*
+ * Round @size up to the nearest power of two (unchanged if it already is
+ * one), e.g. for slab allocation. @size must not overflow when rounded up.
+ */
+static inline int size_roundup_power2(int size)
+{
+	int shift;
+
+	size--;
+	for (shift = 1; shift <= 16; shift <<= 1)
+		size |= size >> shift;
+	size++;
+
+	return size;
+}
+
+/*
+ * internal support libraries
+ */
+void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg,
+				  int segment, int newsize);
+
+/*
+ * security policies
+ */
+int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy);
+int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy);
+
+__u32 sptlrpc_name2flavor_base(const char *name);
+const char *sptlrpc_flavor2name_base(__u32 flvr);
+char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf,
+			       char *buf, int bufsize);
+char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize);
+char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize);
+
+static inline
+struct ptlrpc_sec_policy *sptlrpc_policy_get(struct ptlrpc_sec_policy *policy)
+{
+	__module_get(policy->sp_owner);	/* take a ref on the owning module */
+	return policy;			/* hand the same pointer back */
+}
+
+static inline
+void sptlrpc_policy_put(struct ptlrpc_sec_policy *policy)
+{
+	module_put(policy->sp_owner);	/* pairs with sptlrpc_policy_get() */
+}
+
+/*
+ * client credential
+ */
+static inline
+unsigned long cli_ctx_status(struct ptlrpc_cli_ctx *ctx)
+{
+	return ctx->cc_flags & PTLRPC_CTX_STATUS_MASK; /* status bits only */
+}
+
+static inline
+int cli_ctx_is_ready(struct ptlrpc_cli_ctx *ctx)
+{
+	return cli_ctx_status(ctx) == PTLRPC_CTX_UPTODATE; /* exact match */
+}
+
+static inline
+int cli_ctx_is_refreshed(struct ptlrpc_cli_ctx *ctx)
+{
+	return !!cli_ctx_status(ctx);	/* any status bit set */
+}
+
+static inline
+int cli_ctx_is_uptodate(struct ptlrpc_cli_ctx *ctx)
+{
+	return !!(ctx->cc_flags & PTLRPC_CTX_UPTODATE); /* bit test only */
+}
+
+static inline
+int cli_ctx_is_error(struct ptlrpc_cli_ctx *ctx)
+{
+	return !!(ctx->cc_flags & PTLRPC_CTX_ERROR);
+}
+
+static inline
+int cli_ctx_is_dead(struct ptlrpc_cli_ctx *ctx)
+{
+	return !!(ctx->cc_flags & (PTLRPC_CTX_ERROR | PTLRPC_CTX_DEAD));
+}
+
+static inline
+int cli_ctx_is_eternal(struct ptlrpc_cli_ctx *ctx)
+{
+	return !!(ctx->cc_flags & PTLRPC_CTX_ETERNAL);
+}
+
+/*
+ * sec get/put
+ */
+struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec);
+void sptlrpc_sec_put(struct ptlrpc_sec *sec);
+
+/*
+ * internal APIs which are only used by policy implementations
+ */
+int  sptlrpc_get_next_secid(void);
+void sptlrpc_sec_destroy(struct ptlrpc_sec *sec);
+
+/*
+ * exported client context api
+ */
+struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx);
+void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync);
+void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx);
+void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx);
+int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize);
+
+/*
+ * exported client context wrap/buffers
+ */
+int sptlrpc_cli_wrap_request(struct ptlrpc_request *req);
+int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req);
+int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize);
+void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req);
+int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize);
+void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req);
+int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req,
+			       int segment, int newsize);
+int  sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req,
+				    struct ptlrpc_request **req_ret);
+void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req);
+
+void sptlrpc_request_out_callback(struct ptlrpc_request *req);
+
+/*
+ * exported higher interface of import & request
+ */
+int sptlrpc_import_sec_adapt(struct obd_import *imp,
+			     struct ptlrpc_svc_ctx *ctx,
+			     struct sptlrpc_flavor *flvr);
+struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp);
+void sptlrpc_import_sec_put(struct obd_import *imp);
+
+int  sptlrpc_import_check_ctx(struct obd_import *imp);
+void sptlrpc_import_flush_root_ctx(struct obd_import *imp);
+void sptlrpc_import_flush_my_ctx(struct obd_import *imp);
+void sptlrpc_import_flush_all_ctx(struct obd_import *imp);
+int  sptlrpc_req_get_ctx(struct ptlrpc_request *req);
+void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync);
+int  sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout);
+int  sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req);
+void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode);
+
+int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule);
+
+/* gc */
+void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec);
+void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec);
+void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx);
+
+/* misc */
+const char * sec2target_str(struct ptlrpc_sec *sec);
+int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev);
+
+/*
+ * server side
+ */
+enum secsvc_accept_res {
+	SECSVC_OK       = 0,	/* accepted; rq_reqmsg is the clear text */
+	SECSVC_COMPLETE,	/* accepted, fully processed, reply prepared */
+	SECSVC_DROP,		/* failed; the request should be dropped */
+};
+
+int  sptlrpc_svc_unwrap_request(struct ptlrpc_request *req);
+int  sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen);
+int  sptlrpc_svc_wrap_reply(struct ptlrpc_request *req);
+void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs);
+void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req);
+void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req);
+void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req);
+
+int  sptlrpc_target_export_check(struct obd_export *exp,
+				 struct ptlrpc_request *req);
+void sptlrpc_target_update_exp_flavor(struct obd_device *obd,
+				      struct sptlrpc_rule_set *rset);
+
+/*
+ * reverse context
+ */
+int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp,
+				struct ptlrpc_svc_ctx *ctx);
+int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp,
+				struct ptlrpc_cli_ctx *ctx);
+
+/* bulk security api */
+int sptlrpc_enc_pool_add_user(void);
+int sptlrpc_enc_pool_del_user(void);
+int  sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc);
+void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc);
+
+int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req,
+			  struct ptlrpc_bulk_desc *desc);
+int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req,
+				 struct ptlrpc_bulk_desc *desc,
+				 int nob);
+int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req,
+				  struct ptlrpc_bulk_desc *desc);
+
+/* bulk helpers (internal use only by policies) */
+int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg,
+			      void *buf, int buflen);
+
+int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed);
+
+/* user descriptor helpers */
+static inline int sptlrpc_user_desc_size(int ngroups)
+{
+	return ngroups * sizeof(__u32) + sizeof(struct ptlrpc_user_desc);
+}
+
+int sptlrpc_current_user_desc_size(void);
+int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset);
+int sptlrpc_unpack_user_desc(struct lustre_msg *req, int offset, int swabbed);
+
+
+#define CFS_CAP_CHOWN_MASK (1 << CFS_CAP_CHOWN)
+#define CFS_CAP_SYS_RESOURCE_MASK (1 << CFS_CAP_SYS_RESOURCE)
+
+enum {
+	LUSTRE_SEC_NONE	 = 0,
+	LUSTRE_SEC_REMOTE       = 1,
+	LUSTRE_SEC_SPECIFY      = 2,
+	LUSTRE_SEC_ALL	  = 3
+};
+
+/** @} sptlrpc */
+
+#endif /* _LUSTRE_SEC_H_ */
diff --git a/drivers/staging/lustre/lustre/include/lustre_update.h b/drivers/staging/lustre/lustre/include/lustre_update.h
new file mode 100644
index 000000000000..84defce0f623
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_update.h
@@ -0,0 +1,189 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.htm
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2013, Intel Corporation.
+ */
+/*
+ * lustre/include/lustre_update.h
+ *
+ * Author: Di Wang <di.wang@intel.com>
+ */
+
+#ifndef _LUSTRE_UPDATE_H
+#define _LUSTRE_UPDATE_H
+
+#define UPDATE_BUFFER_SIZE	8192	/* limit asserted by update_buf_size() */
+struct update_request {
+	struct dt_device	*ur_dt;
+	struct list_head		ur_list;    /* links this request to a thandle */
+	int			ur_flags;
+	int			ur_rc;	    /* request result */
+	int			ur_batchid; /* current batch (transaction) id */
+	struct update_buf	*ur_buf;   /* buffer holding the update request */
+};
+
+/* Bytes used by \a update: rounded fixed header plus each rounded length. */
+static inline unsigned long update_size(struct update *update)
+{
+	unsigned long len = cfs_size_round(offsetof(struct update, u_bufs[0]));
+	int idx;
+
+	for (idx = 0; idx < UPDATE_BUF_COUNT; idx++)
+		len += cfs_size_round(update->u_lens[idx]);
+
+	return len;
+}
+
+/* Return parameter \a index of \a update (its length via \a size) or NULL. */
+static inline void *update_param_buf(struct update *update, int index,
+				     int *size)
+{
+	int	i;
+	void	*ptr;
+
+	/* a negative index would read u_lens[] out of bounds below */
+	if (index < 0 || index >= UPDATE_BUF_COUNT)
+		return NULL;
+
+	ptr = (char *)update + cfs_size_round(offsetof(struct update,
+						       u_bufs[0]));
+	for (i = 0; i < index; i++) {
+		LASSERT(update->u_lens[i] > 0);
+		ptr += cfs_size_round(update->u_lens[i]);
+	}
+	if (size != NULL)
+		*size = update->u_lens[index];
+	return ptr;
+}
+
+/*
+ * Total bytes used by \a buf: its rounded fixed header followed by ub_count
+ * packed updates; asserts that the sum fits in UPDATE_BUFFER_SIZE.
+ */
+static inline unsigned long update_buf_size(struct update_buf *buf)
+{
+	unsigned long off;
+	int n;
+
+	off = cfs_size_round(offsetof(struct update_buf, ub_bufs[0]));
+	for (n = 0; n < buf->ub_count; n++)
+		off += update_size((struct update *)((char *)buf + off));
+	LASSERT(off <= UPDATE_BUFFER_SIZE);
+	return off;
+}
+
+/* Return update \a index packed in \a buf (its size via \a size) or NULL. */
+static inline void *update_buf_get(struct update_buf *buf, int index, int *size)
+{
+	int	count = buf->ub_count;
+	void	*ptr;
+	int	i;
+
+	/* a negative index used to fall through and return the first update */
+	if (index < 0 || index >= count)
+		return NULL;
+
+	ptr = (char *)buf + cfs_size_round(offsetof(struct update_buf,
+						    ub_bufs[0]));
+	for (i = 0; i < index; i++)
+		ptr += update_size((struct update *)ptr);
+	if (size != NULL)
+		*size = update_size((struct update *)ptr);
+	return ptr;
+}
+
+static inline void update_init_reply_buf(struct update_reply *reply, int count)
+{
+	reply->ur_count = count;	/* bound used when indexing replies */
+	reply->ur_version = UPDATE_REPLY_V1;
+}
+
+/* Return reply \a index inside \a reply (length via \a size), or NULL. */
+static inline void *update_get_buf_internal(struct update_reply *reply,
+					    int index, int *size)
+{
+	char *ptr;
+	int count = reply->ur_count;
+	int i;
+
+	/* a negative index would read ur_lens[] out of bounds below */
+	if (index < 0 || index >= count)
+		return NULL;
+
+	ptr = (char *)reply + cfs_size_round(offsetof(struct update_reply,
+					     ur_lens[count]));
+	for (i = 0; i < index; i++) {
+		LASSERT(reply->ur_lens[i] > 0);
+		ptr += cfs_size_round(reply->ur_lens[i]);
+	}
+	if (size != NULL)
+		*size = reply->ur_lens[index];
+	return ptr;
+}
+
+/*
+ * Store result \a rc (via cpu_to_le32()) and an optional payload as reply
+ * \a index, then record the combined length in ur_lens[index].
+ */
+static inline void update_insert_reply(struct update_reply *reply, void *data,
+				       int data_len, int index, int rc)
+{
+	char *slot = update_get_buf_internal(reply, index, NULL);
+
+	LASSERT(slot != NULL);
+	*(int *)slot = cpu_to_le32(rc);
+	if (data_len > 0) {
+		LASSERT(data != NULL);
+		memcpy(slot + sizeof(int), data, data_len);
+	}
+	reply->ur_lens[index] = data_len + sizeof(int);
+}
+
+/* Return reply \a index's negative result, or its payload (\a buf) length. */
+static inline int update_get_reply_buf(struct update_reply *reply, void **buf,
+				       int index)
+{
+	char *ptr;
+	int  size = 0;
+	int  result;
+
+	ptr = update_get_buf_internal(reply, index, &size);
+	LASSERT(ptr != NULL);	/* must precede the dereference below */
+	result = *(int *)ptr;	/* NOTE(review): written via cpu_to_le32() */
+	if (result < 0)
+		return result;
+	LASSERT(size >= sizeof(int));
+	*buf = ptr + sizeof(int);
+	return size - sizeof(int);
+}
+
+static inline int update_get_reply_result(struct update_reply *reply,
+					  void **buf, int index)
+{
+	int  size = 0;
+	void *ptr = update_get_buf_internal(reply, index, &size);
+
+	/* >=, not >: update_insert_reply() stores result-only replies too */
+	LASSERT(ptr != NULL && size >= sizeof(int));
+	return *(int *)ptr;	/* leading int is the result; \a buf unused */
+}
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_ver.h b/drivers/staging/lustre/lustre/include/lustre_ver.h
new file mode 100644
index 000000000000..dc187b8f741f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_ver.h
@@ -0,0 +1,24 @@
+#ifndef _LUSTRE_VER_H_
+#define _LUSTRE_VER_H_
+/* This file automatically generated from lustre/include/lustre_ver.h.in,
+ * based on parameters in lustre/autoconf/lustre-version.ac.
+ * Changes made directly to this file will be lost. */
+
+#define LUSTRE_MAJOR 2
+#define LUSTRE_MINOR 3
+#define LUSTRE_PATCH 64
+#define LUSTRE_FIX 0
+#define LUSTRE_VERSION_STRING "2.3.64"
+
+#define LUSTRE_VERSION_CODE OBD_OCD_VERSION(LUSTRE_MAJOR,LUSTRE_MINOR,LUSTRE_PATCH,LUSTRE_FIX)
+
+/* liblustre clients are only allowed to connect if their LUSTRE_FIX mismatches
+ * by this amount (set in lustre/autoconf/lustre-version.ac). */
+#define LUSTRE_VERSION_ALLOWED_OFFSET OBD_OCD_VERSION(0, 0, 1, 32)
+
+/* If lustre version of client and servers it connects to differs by more
+ * than this amount, client would issue a warning.
+ * (set in lustre/autoconf/lustre-version.ac) */
+#define LUSTRE_VERSION_OFFSET_WARN OBD_OCD_VERSION(0, 4, 0, 0)
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lvfs.h b/drivers/staging/lustre/lustre/include/lvfs.h
new file mode 100644
index 000000000000..28f1a6b76f73
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lvfs.h
@@ -0,0 +1,57 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lvfs.h
+ *
+ * lustre VFS/process permission interface
+ */
+
+#ifndef __LVFS_H__
+#define __LVFS_H__
+
+#define LL_FID_NAMELEN (16 + 1 + 8 + 1)
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lvfs.h>
+
+#include <linux/libcfs/lucache.h>
+
+
+/* lvfs_common.c */
+struct dentry *lvfs_fid2dentry(struct lvfs_run_ctxt *, __u64, __u32, __u64 ,void *data);
+
+void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx,
+	       struct lvfs_ucred *cred);
+void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx,
+	      struct lvfs_ucred *cred);
+#endif
diff --git a/drivers/staging/lustre/lustre/include/md_object.h b/drivers/staging/lustre/lustre/include/md_object.h
new file mode 100644
index 000000000000..eefa0f11bd13
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/md_object.h
@@ -0,0 +1,946 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/md_object.h
+ *
+ * Extension of lu_object.h for metadata objects
+ */
+
+#ifndef _LUSTRE_MD_OBJECT_H
+#define _LUSTRE_MD_OBJECT_H
+
+/** \defgroup md md
+ * Sub-class of lu_object with methods common for "meta-data" objects in MDT
+ * stack.
+ *
+ * Meta-data objects implement namespace operations: you can link, unlink
+ * them, and treat them as directories.
+ *
+ * Examples: mdt, cmm, and mdd are implementations of md interface.
+ * @{
+ */
+
+
+/*
+ * super-class definitions.
+ */
+#include <dt_object.h>
+
+struct md_device;
+struct md_device_operations;
+struct md_object;
+struct obd_export;
+
+enum {
+	UCRED_INVALID   = -1,
+	UCRED_INIT      = 0,
+	UCRED_OLD       = 1,
+	UCRED_NEW       = 2
+};
+
+enum {
+	MD_CAPAINFO_MAX = 5
+};
+
+/** There are at most MD_CAPAINFO_MAX (5) fids in one operation, see rename.
+ * NOTE: the last one is a temporary one used for is_subdir(). */
+struct md_capainfo {
+	__u32		   mc_auth;
+	__u32		   mc_padding;
+	struct lu_fid	   mc_fid[MD_CAPAINFO_MAX];
+	struct lustre_capa     *mc_capa[MD_CAPAINFO_MAX];
+};
+
+struct md_quota {
+	struct obd_export       *mq_exp;
+};
+
+/**
+ * Implemented in mdd/mdd_handler.c.
+ *
+ * XXX should be moved into separate .h/.c together with all md security
+ * related definitions.
+ */
+struct md_capainfo *md_capainfo(const struct lu_env *env);
+struct md_quota *md_quota(const struct lu_env *env);
+
+/** metadata attributes */
+enum ma_valid {
+	MA_INODE     = (1 << 0),
+	MA_LOV       = (1 << 1),
+	MA_COOKIE    = (1 << 2),
+	MA_FLAGS     = (1 << 3),
+	MA_LMV       = (1 << 4),
+	MA_ACL_DEF   = (1 << 5),
+	MA_LOV_DEF   = (1 << 6),
+	MA_LAY_GEN   = (1 << 7),
+	MA_HSM       = (1 << 8),
+	MA_SOM       = (1 << 9),
+	MA_PFID      = (1 << 10)
+};
+
+typedef enum {
+	MDL_MINMODE  = 0,
+	MDL_EX       = 1,
+	MDL_PW       = 2,
+	MDL_PR       = 4,
+	MDL_CW       = 8,
+	MDL_CR       = 16,
+	MDL_NL       = 32,
+	MDL_GROUP    = 64,
+	MDL_MAXMODE
+} mdl_mode_t;
+
+typedef enum {
+	MDT_NUL_LOCK = 0,
+	MDT_REG_LOCK = (1 << 0),
+	MDT_PDO_LOCK = (1 << 1)
+} mdl_type_t;
+
+/* memory structure for hsm attributes
+ * for fields description see the on disk structure hsm_attrs
+ * which is defined in lustre_idl.h
+ */
+struct md_hsm {
+	__u32	mh_compat;
+	__u32	mh_flags;
+	__u64	mh_arch_id;
+	__u64	mh_arch_ver;
+};
+
+#define IOEPOCH_INVAL 0
+
+/* memory structure for som attributes
+ * for fields description see the on disk structure som_attrs
+ * which is defined in lustre_idl.h
+ */
+struct md_som_data {
+	__u32	msd_compat;
+	__u32	msd_incompat;
+	__u64	msd_ioepoch;
+	__u64	msd_size;
+	__u64	msd_blocks;
+	__u64	msd_mountid;
+};
+
+struct md_attr {
+	__u64		   ma_valid;
+	__u64		   ma_need;
+	__u64		   ma_attr_flags;
+	struct lu_attr	  ma_attr;
+	struct lu_fid	   ma_pfid;
+	struct md_hsm	   ma_hsm;
+	struct lov_mds_md      *ma_lmm;
+	struct lmv_stripe_md   *ma_lmv;
+	void		   *ma_acl;
+	struct llog_cookie     *ma_cookie;
+	struct lustre_capa     *ma_capa;
+	struct md_som_data     *ma_som;
+	int		     ma_lmm_size;
+	int		     ma_lmv_size;
+	int		     ma_acl_size;
+	int		     ma_cookie_size;
+	__u16		   ma_layout_gen;
+};
+
+/** Additional parameters for create */
+struct md_op_spec {
+	union {
+		/** symlink target */
+		const char	       *sp_symname;
+		/** parent FID for cross-ref mkdir */
+		const struct lu_fid      *sp_pfid;
+		/** eadata for regular files */
+		struct md_spec_reg {
+			/** lov objs exist already */
+			const struct lu_fid   *fid;
+			const void *eadata;
+			int  eadatalen;
+		} sp_ea;
+	} u;
+
+	/** Create flags from client, such as MDS_OPEN_CREAT and others. */
+	__u64      sp_cr_flags;
+
+	/** don't create lov objects or llog cookie - this is a replay */
+	unsigned int no_create:1,
+		     sp_cr_lookup:1, /* do lookup sanity check or not. */
+		     sp_rm_entry:1;  /* only remove name entry */
+
+	/** Current lock mode of the parent dir where the create is performed. */
+	mdl_mode_t sp_cr_mode;
+
+	/** to create directory */
+	const struct dt_index_features *sp_feat;
+};
+
+/**
+ * Operations implemented for each md object (both directory and leaf).
+ */
+struct md_object_operations {
+	int (*moo_permission)(const struct lu_env *env,
+			      struct md_object *pobj, struct md_object *cobj,
+			      struct md_attr *attr, int mask);
+
+	int (*moo_attr_get)(const struct lu_env *env, struct md_object *obj,
+			    struct md_attr *attr);
+
+	int (*moo_attr_set)(const struct lu_env *env, struct md_object *obj,
+			    const struct md_attr *attr);
+
+	int (*moo_xattr_get)(const struct lu_env *env, struct md_object *obj,
+			     struct lu_buf *buf, const char *name);
+
+	int (*moo_xattr_list)(const struct lu_env *env, struct md_object *obj,
+			      struct lu_buf *buf);
+
+	int (*moo_xattr_set)(const struct lu_env *env, struct md_object *obj,
+			     const struct lu_buf *buf, const char *name,
+			     int fl);
+
+	int (*moo_xattr_del)(const struct lu_env *env, struct md_object *obj,
+			     const char *name);
+
+	/** This method is used to swap the layouts between 2 objects */
+	int (*moo_swap_layouts)(const struct lu_env *env,
+			       struct md_object *obj1, struct md_object *obj2,
+			       __u64 flags);
+
+	/** \retval number of bytes actually read upon success */
+	int (*moo_readpage)(const struct lu_env *env, struct md_object *obj,
+			    const struct lu_rdpg *rdpg);
+
+	int (*moo_readlink)(const struct lu_env *env, struct md_object *obj,
+			    struct lu_buf *buf);
+	int (*moo_changelog)(const struct lu_env *env,
+			     enum changelog_rec_type type, int flags,
+			     struct md_object *obj);
+	/** part of cross-ref operation */
+	int (*moo_object_create)(const struct lu_env *env,
+				 struct md_object *obj,
+				 const struct md_op_spec *spec,
+				 struct md_attr *ma);
+
+	int (*moo_ref_add)(const struct lu_env *env,
+			   struct md_object *obj,
+			   const struct md_attr *ma);
+
+	int (*moo_ref_del)(const struct lu_env *env,
+			   struct md_object *obj,
+			   struct md_attr *ma);
+
+	int (*moo_open)(const struct lu_env *env,
+			struct md_object *obj, int flag);
+
+	int (*moo_close)(const struct lu_env *env, struct md_object *obj,
+			 struct md_attr *ma, int mode);
+
+	int (*moo_capa_get)(const struct lu_env *, struct md_object *,
+			    struct lustre_capa *, int renewal);
+
+	int (*moo_object_sync)(const struct lu_env *, struct md_object *);
+
+	int (*moo_file_lock)(const struct lu_env *env, struct md_object *obj,
+			     struct lov_mds_md *lmm, struct ldlm_extent *extent,
+			     struct lustre_handle *lockh);
+	int (*moo_file_unlock)(const struct lu_env *env, struct md_object *obj,
+			       struct lov_mds_md *lmm,
+			       struct lustre_handle *lockh);
+	int (*moo_object_lock)(const struct lu_env *env, struct md_object *obj,
+			       struct lustre_handle *lh,
+			       struct ldlm_enqueue_info *einfo,
+			       void *policy);
+};
+
+/**
+ * Directory methods of an md_object, invoked via the mdo_*() inline wrappers.
+ */
+struct md_dir_operations {
+	int (*mdo_is_subdir) (const struct lu_env *env, struct md_object *obj,
+			      const struct lu_fid *fid, struct lu_fid *sfid);
+
+	int (*mdo_lookup)(const struct lu_env *env, struct md_object *obj,
+			  const struct lu_name *lname, struct lu_fid *fid,
+			  struct md_op_spec *spec);
+
+	mdl_mode_t (*mdo_lock_mode)(const struct lu_env *env,
+				    struct md_object *obj,
+				    mdl_mode_t mode);	/* may be left NULL */
+
+	int (*mdo_create)(const struct lu_env *env, struct md_object *pobj,
+			  const struct lu_name *lname, struct md_object *child,
+			  struct md_op_spec *spec,
+			  struct md_attr *ma);
+
+	/** Creates the data object for this meta-data object. */
+	int (*mdo_create_data)(const struct lu_env *env, struct md_object *p,
+			       struct md_object *o,
+			       const struct md_op_spec *spec,
+			       struct md_attr *ma);
+
+	int (*mdo_rename)(const struct lu_env *env, struct md_object *spobj,
+			  struct md_object *tpobj, const struct lu_fid *lf,
+			  const struct lu_name *lsname, struct md_object *tobj,
+			  const struct lu_name *ltname, struct md_attr *ma);
+
+	int (*mdo_link)(const struct lu_env *env, struct md_object *tgt_obj,
+			struct md_object *src_obj, const struct lu_name *lname,
+			struct md_attr *ma);
+
+	int (*mdo_unlink)(const struct lu_env *env, struct md_object *pobj,
+			  struct md_object *cobj, const struct lu_name *lname,
+			  struct md_attr *ma, int no_name);
+
+	/** Compares a requested layout with an existing layout
+	 * (struct lov_mds_md_v1/3 vs struct lov_mds_md_v1/3). */
+	int (*mdo_lum_lmm_cmp)(const struct lu_env *env,
+			       struct md_object *cobj,
+			       const struct md_op_spec *spec,
+			       struct md_attr *ma);
+
+	/** Partial operations for the cross-ref case. */
+	int (*mdo_name_insert)(const struct lu_env *env,
+			       struct md_object *obj,
+			       const struct lu_name *lname,
+			       const struct lu_fid *fid,
+			       const struct md_attr *ma);
+
+	int (*mdo_name_remove)(const struct lu_env *env,
+			       struct md_object *obj,
+			       const struct lu_name *lname,
+			       const struct md_attr *ma);
+
+	int (*mdo_rename_tgt)(const struct lu_env *env, struct md_object *pobj,
+			      struct md_object *tobj, const struct lu_fid *fid,
+			      const struct lu_name *lname, struct md_attr *ma);
+};
+
+struct md_device_operations {
+	/** Meta-data device handlers; referenced through md_device::md_ops. */
+	int (*mdo_root_get)(const struct lu_env *env, struct md_device *m,
+			    struct lu_fid *f);
+
+	int (*mdo_maxsize_get)(const struct lu_env *env, struct md_device *m,
+			       int *md_size, int *cookie_size);
+
+	int (*mdo_statfs)(const struct lu_env *env, struct md_device *m,
+			  struct obd_statfs *sfs);
+
+	int (*mdo_init_capa_ctxt)(const struct lu_env *env, struct md_device *m,
+				  int mode, unsigned long timeout, __u32 alg,
+				  struct lustre_capa_key *keys);
+
+	int (*mdo_update_capa_key)(const struct lu_env *env,
+				   struct md_device *m,
+				   struct lustre_capa_key *key);
+
+	int (*mdo_llog_ctxt_get)(const struct lu_env *env,
+				 struct md_device *m, int idx, void **h);
+
+	int (*mdo_iocontrol)(const struct lu_env *env, struct md_device *m,
+			     unsigned int cmd, int len, void *data);
+};
+
+enum md_upcall_event {	/* 'ev' argument of md_upcall::mu_upcall */
+	/** Sync the md layer. */
+	MD_LOV_SYNC = (1 << 0),
+	/** Just for split: no transaction needed, for replay. */
+	MD_NO_TRANS = (1 << 1),
+	MD_LOV_CONFIG = (1 << 2),
+	/** Trigger quota recovery. */
+	MD_LOV_QUOTA = (1 << 3)
+};
+
+struct md_upcall {
+	/** Protects the upcall against removal while it is in use: taken for
+	 * read to invoke the upcall, for write to set or clear it. */
+	struct rw_semaphore	mu_upcall_sem;
+	/** device to call, normally the upper layer */
+	struct md_device       *mu_upcall_dev;
+	/** upcall function */
+	int (*mu_upcall)(const struct lu_env *env, struct md_device *md,
+			 enum md_upcall_event ev, void *data);
+};
+
+struct md_device {
+	struct lu_device		   md_lu_dev;	/* see lu2md_dev() */
+	const struct md_device_operations *md_ops;	/* device methods */
+	struct md_upcall		   md_upcall;	/* upcall state */
+};
+
+static inline void md_upcall_init(struct md_device *m, void *upcl)
+{
+	init_rwsem(&m->md_upcall.mu_upcall_sem);
+	m->md_upcall.mu_upcall_dev = NULL;	/* see md_upcall_dev_set() */
+	m->md_upcall.mu_upcall = upcl;		/* untyped: stored as is */
+}
+
+static inline void md_upcall_dev_set(struct md_device *m, struct md_device *up)
+{
+	down_write(&m->md_upcall.mu_upcall_sem); /* blocks md_do_upcall(m) */
+	m->md_upcall.mu_upcall_dev = up;
+	up_write(&m->md_upcall.mu_upcall_sem);
+}
+
+static inline void md_upcall_fini(struct md_device *m)
+{
+	down_write(&m->md_upcall.mu_upcall_sem); /* blocks md_do_upcall(m) */
+	m->md_upcall.mu_upcall_dev = NULL;
+	m->md_upcall.mu_upcall = NULL;
+	up_write(&m->md_upcall.mu_upcall_sem);
+}
+
+/** Call the upcall of \a m's upcall device, if set; otherwise return 0. */
+static inline int md_do_upcall(const struct lu_env *env, struct md_device *m,
+				enum md_upcall_event ev, void *data)
+{
+	struct md_device *dev;
+	int rc = 0;
+
+	down_read(&m->md_upcall.mu_upcall_sem);
+	dev = m->md_upcall.mu_upcall_dev;
+	if (dev != NULL && dev->md_upcall.mu_upcall != NULL)
+		rc = dev->md_upcall.mu_upcall(env, dev, ev, data);
+	up_read(&m->md_upcall.mu_upcall_sem);
+	return rc;
+}
+
+struct md_object {
+	struct lu_object		   mo_lu;	/* see lu2md() */
+	const struct md_object_operations *mo_ops;	/* used by mo_*() */
+	const struct md_dir_operations    *mo_dir_ops;	/* used by mdo_*() */
+};
+
+/**
+ * seq-server site; reached from a struct lu_site through lu_site2seq().
+ */
+struct seq_server_site {
+	struct lu_site	     *ss_lu;
+	/**
+	 * MDS number of this site.
+	 */
+	mdsno_t	       ss_node_id;
+	/**
+	 * FID location database
+	 */
+	struct lu_server_fld *ss_server_fld;
+	struct lu_client_fld *ss_client_fld;
+
+	/**
+	 * Server Seq Manager
+	 */
+	struct lu_server_seq *ss_server_seq;
+
+	/**
+	 * Controller Seq Manager
+	 */
+	struct lu_server_seq *ss_control_seq;
+	struct obd_export    *ss_control_exp;
+
+	/**
+	 * Client Seq Manager
+	 */
+	struct lu_client_seq *ss_client_seq;
+};
+
+static inline struct md_device *lu2md_dev(const struct lu_device *d)
+{
+	LASSERT(IS_ERR(d) || lu_device_is_md(d)); /* ERR_PTR is let through */
+	return container_of0(d, struct md_device, md_lu_dev);
+}
+
+static inline struct lu_device *md2lu_dev(struct md_device *d)
+{
+	return &d->md_lu_dev;	/* the embedded lu_device */
+}
+
+static inline struct md_object *lu2md(const struct lu_object *o)
+{	/* NULL and ERR_PTR values of \a o are accepted by the assertion */
+	LASSERT(o == NULL || IS_ERR(o) || lu_device_is_md(o->lo_dev));
+	return container_of0(o, struct md_object, mo_lu);
+}
+
+static inline struct md_object *md_object_next(const struct md_object *obj)
+{
+	return obj != NULL ? lu2md(lu_object_next(&obj->mo_lu)) : NULL;
+}
+
+static inline struct md_device *md_obj2dev(const struct md_object *o)
+{	/* \a o is dereferenced below, so it must be a valid object pointer */
+	LASSERT(o != NULL && !IS_ERR(o) && lu_device_is_md(o->mo_lu.lo_dev));
+	return container_of0(o->mo_lu.lo_dev, struct md_device, md_lu_dev);
+}
+
+static inline struct seq_server_site *lu_site2seq(const struct lu_site *s)
+{
+	return s->ld_seq_site;	/* no NULL check on \a s */
+}
+
+static inline int md_device_init(struct md_device *md, struct lu_device_type *t)
+{
+	return lu_device_init(&md->md_lu_dev, t); /* embedded lu_device */
+}
+
+static inline void md_device_fini(struct md_device *md)
+{
+	lu_device_fini(&md->md_lu_dev);	/* embedded lu_device */
+}
+
+/** Look up \a f on \a md's lu_device and convert the result with lu2md(). */
+static inline struct md_object *md_object_find_slice(
+	const struct lu_env *env, struct md_device *md, const struct lu_fid *f)
+{
+	return lu2md(lu_object_find_slice(env, md2lu_dev(md), f, NULL));
+}
+
+
+/** md operations: wrappers dispatching through md_object::mo_ops */
+
+/** Call the moo_permission method of \a c (not of \a p); it must be set. */
+static inline int mo_permission(const struct lu_env *env, struct md_object *p,
+				struct md_object *c, struct md_attr *at,
+				int mask)
+{
+	LASSERT(c->mo_ops->moo_permission);
+	return c->mo_ops->moo_permission(env, p, c, at, mask);
+}
+
+/** Call \a m's moo_attr_get method with \a at; the method must be set. */
+static inline int mo_attr_get(const struct lu_env *env, struct md_object *m,
+			      struct md_attr *at)
+{
+	LASSERT(m->mo_ops->moo_attr_get);
+	return m->mo_ops->moo_attr_get(env, m, at);
+}
+
+/** Call \a m's moo_readlink method with \a buf; the method must be set. */
+static inline int mo_readlink(const struct lu_env *env, struct md_object *m,
+			      struct lu_buf *buf)
+{
+	LASSERT(m->mo_ops->moo_readlink);
+	return m->mo_ops->moo_readlink(env, m, buf);
+}
+
+static inline int mo_changelog(const struct lu_env *env,
+			       enum changelog_rec_type type,
+			       int flags, struct md_object *m)
+{
+	LASSERT(m->mo_ops->moo_changelog);	/* method is mandatory */
+	return m->mo_ops->moo_changelog(env, type, flags, m);
+}
+
+/** Call \a m's moo_attr_set method with \a at; the method must be set. */
+static inline int mo_attr_set(const struct lu_env *env, struct md_object *m,
+			      const struct md_attr *at)
+{
+	LASSERT(m->mo_ops->moo_attr_set);
+	return m->mo_ops->moo_attr_set(env, m, at);
+}
+
+/** Call \a m's moo_xattr_get method with \a buf and \a name; the method
+ *  must be set. */
+static inline int mo_xattr_get(const struct lu_env *env, struct md_object *m,
+			       struct lu_buf *buf, const char *name)
+{
+	LASSERT(m->mo_ops->moo_xattr_get);
+	return m->mo_ops->moo_xattr_get(env, m, buf, name);
+}
+
+/** Call \a m's moo_xattr_del method with \a name; the method must be set. */
+static inline int mo_xattr_del(const struct lu_env *env, struct md_object *m,
+			       const char *name)
+{
+	LASSERT(m->mo_ops->moo_xattr_del);
+	return m->mo_ops->moo_xattr_del(env, m, name);
+}
+
+/** Call \a m's moo_xattr_set method with \a buf, \a name and \a flags;
+ *  the method must be set. */
+static inline int mo_xattr_set(const struct lu_env *env, struct md_object *m,
+			       const struct lu_buf *buf, const char *name,
+			       int flags)
+{
+	LASSERT(m->mo_ops->moo_xattr_set);
+	return m->mo_ops->moo_xattr_set(env, m, buf, name, flags);
+}
+
+/** Call \a m's moo_xattr_list method with \a buf; the method must be set. */
+static inline int mo_xattr_list(const struct lu_env *env, struct md_object *m,
+				struct lu_buf *buf)
+{
+	LASSERT(m->mo_ops->moo_xattr_list);
+	return m->mo_ops->moo_xattr_list(env, m, buf);
+}
+
+static inline int mo_swap_layouts(const struct lu_env *env,
+				  struct md_object *o1,
+				  struct md_object *o2, __u64 flags)
+{
+	LASSERT(o1->mo_ops->moo_swap_layouts);
+	LASSERT(o2->mo_ops->moo_swap_layouts);
+	if (o1->mo_ops->moo_swap_layouts != o2->mo_ops->moo_swap_layouts)
+		return -EPERM;	/* the two objects use different methods */
+	return o1->mo_ops->moo_swap_layouts(env, o1, o2, flags);
+}
+
+/** Call \a m's moo_open method with \a flags; the method must be set. */
+static inline int mo_open(const struct lu_env *env, struct md_object *m,
+			  int flags)
+{
+	LASSERT(m->mo_ops->moo_open);
+	return m->mo_ops->moo_open(env, m, flags);
+}
+
+/** Call \a m's moo_close method with \a ma and \a mode; the method must
+ *  be set. */
+static inline int mo_close(const struct lu_env *env, struct md_object *m,
+			   struct md_attr *ma, int mode)
+{
+	LASSERT(m->mo_ops->moo_close);
+	return m->mo_ops->moo_close(env, m, ma, mode);
+}
+
+/** Call \a m's moo_readpage method with \a rdpg; the method must be set. */
+static inline int mo_readpage(const struct lu_env *env, struct md_object *m,
+			      const struct lu_rdpg *rdpg)
+{
+	LASSERT(m->mo_ops->moo_readpage);
+	return m->mo_ops->moo_readpage(env, m, rdpg);
+}
+
+static inline int mo_object_create(const struct lu_env *env,
+				   struct md_object *m,
+				   const struct md_op_spec *spc,
+				   struct md_attr *at)
+{
+	LASSERT(m->mo_ops->moo_object_create);	/* method is mandatory */
+	return m->mo_ops->moo_object_create(env, m, spc, at);
+}
+
+/** Call \a m's moo_ref_add method with \a ma; the method must be set. */
+static inline int mo_ref_add(const struct lu_env *env, struct md_object *m,
+			     const struct md_attr *ma)
+{
+	LASSERT(m->mo_ops->moo_ref_add);
+	return m->mo_ops->moo_ref_add(env, m, ma);
+}
+
+/** Call \a m's moo_ref_del method with \a ma; the method must be set. */
+static inline int mo_ref_del(const struct lu_env *env, struct md_object *m,
+			     struct md_attr *ma)
+{
+	LASSERT(m->mo_ops->moo_ref_del);
+	return m->mo_ops->moo_ref_del(env, m, ma);
+}
+
+/** Call \a m's moo_capa_get method with \a c and \a renewal; the method
+ *  must be set. */
+static inline int mo_capa_get(const struct lu_env *env, struct md_object *m,
+			      struct lustre_capa *c, int renewal)
+{
+	LASSERT(m->mo_ops->moo_capa_get);
+	return m->mo_ops->moo_capa_get(env, m, c, renewal);
+}
+
+static inline int mo_object_sync(const struct lu_env *env, struct md_object *m)
+{
+	LASSERT(m->mo_ops->moo_object_sync);	/* method is mandatory */
+	return m->mo_ops->moo_object_sync(env, m);
+}
+
+static inline int mo_file_lock(const struct lu_env *env, struct md_object *m,
+			       struct lov_mds_md *lmm,
+			       struct ldlm_extent *extent,
+			       struct lustre_handle *lockh)
+{
+	LASSERT(m->mo_ops->moo_file_lock);	/* method is mandatory */
+	return m->mo_ops->moo_file_lock(env, m, lmm, extent, lockh);
+}
+
+static inline int mo_file_unlock(const struct lu_env *env, struct md_object *m,
+				 struct lov_mds_md *lmm,
+				 struct lustre_handle *lockh)
+{
+	LASSERT(m->mo_ops->moo_file_unlock);	/* method is mandatory */
+	return m->mo_ops->moo_file_unlock(env, m, lmm, lockh);
+}
+
+/** Call \a m's moo_object_lock method with \a lh, \a einfo and \a policy;
+ *  the method must be set. */
+static inline int mo_object_lock(const struct lu_env *env, struct md_object *m,
+				 struct lustre_handle *lh,
+				 struct ldlm_enqueue_info *einfo, void *policy)
+{
+	LASSERT(m->mo_ops->moo_object_lock);
+	return m->mo_ops->moo_object_lock(env, m, lh, einfo, policy);
+}
+
+/** Call the mdo_lookup directory method of \a p with \a lname, \a f and
+ *  \a spec; the method must be set. */
+static inline int mdo_lookup(const struct lu_env *env, struct md_object *p,
+			     const struct lu_name *lname, struct lu_fid *f,
+			     struct md_op_spec *spec)
+{
+	LASSERT(p->mo_dir_ops->mdo_lookup);
+	return p->mo_dir_ops->mdo_lookup(env, p, lname, f, spec);
+}
+
+/** Ask \a mo for a lock mode; MDL_MINMODE when the method is not set. */
+static inline mdl_mode_t mdo_lock_mode(const struct lu_env *env,
+				       struct md_object *mo,
+				       mdl_mode_t lm)
+{
+	return mo->mo_dir_ops->mdo_lock_mode == NULL ? MDL_MINMODE :
+	       mo->mo_dir_ops->mdo_lock_mode(env, mo, lm);
+}
+
+/** Call the mdo_create directory method of \a p (not of \a c); the method
+ *  must be set. */
+static inline int mdo_create(const struct lu_env *env, struct md_object *p,
+			     const struct lu_name *lchild_name,
+			     struct md_object *c, struct md_op_spec *spc,
+			     struct md_attr *at)
+{
+	LASSERT(p->mo_dir_ops->mdo_create);
+	return p->mo_dir_ops->mdo_create(env, p, lchild_name, c, spc, at);
+}
+
+/** Call the mdo_create_data method of \a c (not of \a p); it must be set. */
+static inline int mdo_create_data(const struct lu_env *env,
+				  struct md_object *p, struct md_object *c,
+				  const struct md_op_spec *spec,
+				  struct md_attr *ma)
+{
+	LASSERT(c->mo_dir_ops->mdo_create_data);
+	return c->mo_dir_ops->mdo_create_data(env, p, c, spec, ma);
+}
+
+/**
+ * Call the mdo_rename directory method of \a tp (not of \a sp) with all
+ * arguments passed through unchanged; the method must be set.
+ */
+static inline int mdo_rename(const struct lu_env *env, struct md_object *sp,
+			     struct md_object *tp, const struct lu_fid *lf,
+			     const struct lu_name *lsname, struct md_object *t,
+			     const struct lu_name *ltname, struct md_attr *ma)
+{
+	LASSERT(tp->mo_dir_ops->mdo_rename);
+	return tp->mo_dir_ops->mdo_rename(env, sp, tp, lf, lsname, t, ltname,
+					  ma);
+}
+
+/** Call the mdo_is_subdir directory method of \a mo with \a fid and
+ *  \a sfid; the method must be set. */
+static inline int mdo_is_subdir(const struct lu_env *env, struct md_object *mo,
+				const struct lu_fid *fid, struct lu_fid *sfid)
+{
+	LASSERT(mo->mo_dir_ops->mdo_is_subdir);
+	return mo->mo_dir_ops->mdo_is_subdir(env, mo, fid, sfid);
+}
+
+/** Call the mdo_link directory method of \a s (not of \a p); the method
+ *  must be set. */
+static inline int mdo_link(const struct lu_env *env, struct md_object *p,
+			   struct md_object *s, const struct lu_name *lname,
+			   struct md_attr *ma)
+{
+	LASSERT(s->mo_dir_ops->mdo_link);
+	return s->mo_dir_ops->mdo_link(env, p, s, lname, ma);
+}
+
+/** Call the mdo_unlink directory method of \a p (not of \a c); the method
+ *  must be set. */
+static inline int mdo_unlink(const struct lu_env *env, struct md_object *p,
+			     struct md_object *c, const struct lu_name *lname,
+			     struct md_attr *ma, int no_name)
+{
+	LASSERT(p->mo_dir_ops->mdo_unlink);
+	return p->mo_dir_ops->mdo_unlink(env, p, c, lname, ma, no_name);
+}
+
+/** Call the mdo_lum_lmm_cmp directory method of \a c; it must be set. */
+static inline int mdo_lum_lmm_cmp(const struct lu_env *env, struct md_object *c,
+				  const struct md_op_spec *spec,
+				  struct md_attr *ma)
+{
+	LASSERT(c->mo_dir_ops->mdo_lum_lmm_cmp);
+	return c->mo_dir_ops->mdo_lum_lmm_cmp(env, c, spec, ma);
+}
+
+/** Call the mdo_name_insert directory method of \a p; it must be set. */
+static inline int mdo_name_insert(const struct lu_env *env, struct md_object *p,
+				  const struct lu_name *lname,
+				  const struct lu_fid *f,
+				  const struct md_attr *ma)
+{
+	LASSERT(p->mo_dir_ops->mdo_name_insert);
+	return p->mo_dir_ops->mdo_name_insert(env, p, lname, f, ma);
+}
+
+/** Call the mdo_name_remove directory method of \a p; it must be set. */
+static inline int mdo_name_remove(const struct lu_env *env, struct md_object *p,
+				  const struct lu_name *lname,
+				  const struct md_attr *ma)
+{
+	LASSERT(p->mo_dir_ops->mdo_name_remove);
+	return p->mo_dir_ops->mdo_name_remove(env, p, lname, ma);
+}
+
+/**
+ * Use the mdo_rename_tgt method of \a t, or of \a p when \a t is NULL.
+ */
+static inline int mdo_rename_tgt(const struct lu_env *env, struct md_object *p,
+				 struct md_object *t, const struct lu_fid *lf,
+				 const struct lu_name *lname,
+				 struct md_attr *ma)
+{
+	if (t) {
+		LASSERT(t->mo_dir_ops->mdo_rename_tgt);
+		return t->mo_dir_ops->mdo_rename_tgt(env, p, t, lf, lname, ma);
+	}
+	LASSERT(p->mo_dir_ops->mdo_rename_tgt);
+	return p->mo_dir_ops->mdo_rename_tgt(env, p, t, lf, lname, ma);
+}
+
+/**
+ * Role of an object; used in the MDD/OUT layer for the object lock rule.
+ */
+enum mdd_object_role {
+	MOR_SRC_PARENT,
+	MOR_SRC_CHILD,
+	MOR_TGT_PARENT,
+	MOR_TGT_CHILD,
+	MOR_TGT_ORPHAN
+};
+
+struct dt_device;
+/**
+ * Holds the description of an object; it is used to create the object.
+ * \pre llod_dir exists
+ */
+struct lu_local_obj_desc {
+	const char		      *llod_dir;
+	const char		      *llod_name;
+	__u32			    llod_oid;
+	int			      llod_is_index;
+	const struct dt_index_features  *llod_feat;
+	struct list_head		       llod_linkage;
+};
+
+struct md_object *llo_store_resolve(const struct lu_env *env,
+				    struct md_device *md,
+				    struct dt_device *dt,
+				    const char *path,
+				    struct lu_fid *fid);
+
+struct md_object *llo_store_open(const struct lu_env *env,
+				 struct md_device *md,
+				 struct dt_device *dt,
+				 const char *dirname,
+				 const char *objname,
+				 struct lu_fid *fid);
+
+struct md_object *llo_store_create_index(const struct lu_env *env,
+					 struct md_device *md,
+					 struct dt_device *dt,
+					 const char *dirname,
+					 const char *objname,
+					 const struct lu_fid *fid,
+					 const struct dt_index_features *feat);
+
+struct md_object *llo_store_create(const struct lu_env *env,
+				   struct md_device *md,
+				   struct dt_device *dt,
+				   const char *dirname,
+				   const char *objname,
+				   const struct lu_fid *fid);
+
+void llo_local_obj_register(struct lu_local_obj_desc *llod);
+void llo_local_obj_unregister(struct lu_local_obj_desc *llod);
+
+int llo_local_objects_setup(const struct lu_env *env,
+			    struct md_device *md,
+			    struct dt_device *dt);
+
+int llo_global_init(void);
+void llo_global_fini(void);
+
+int lustre_buf2som(void *buf, int rc, struct md_som_data *msd);
+int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh);
+void lustre_hsm2buf(void *buf, struct md_hsm *mh);
+
+struct lu_ucred {		/* returned by lu_ucred() and friends below */
+	__u32	       uc_valid;
+	__u32	       uc_o_uid;
+	__u32	       uc_o_gid;
+	__u32	       uc_o_fsuid;
+	__u32	       uc_o_fsgid;
+	__u32	       uc_uid;
+	__u32	       uc_gid;
+	__u32	       uc_fsuid;
+	__u32	       uc_fsgid;
+	__u32	       uc_suppgids[2];
+	cfs_cap_t	   uc_cap;	/* bit mask tested by md_capable() */
+	__u32	       uc_umask;
+	group_info_t   *uc_ginfo;
+	struct md_identity *uc_identity;
+};
+
+struct lu_ucred *lu_ucred(const struct lu_env *env);
+
+struct lu_ucred *lu_ucred_check(const struct lu_env *env);
+
+struct lu_ucred *lu_ucred_assert(const struct lu_env *env);
+
+int lu_ucred_global_init(void);
+
+void lu_ucred_global_fini(void);
+
+#define md_cap_t(x) (x)			/* identity */
+
+#define MD_CAP_TO_MASK(x) (1 << (x)) /* int mask; requires 0 <= x < 31 */
+
+#define md_cap_raised(c, flag) (md_cap_t(c) & MD_CAP_TO_MASK(flag))
+
+/** Return 1 if \a cap is raised in \a uc->uc_cap, else 0 (cf. capable()). */
+static inline int md_capable(struct lu_ucred *uc, cfs_cap_t cap)
+{
+	int raised = md_cap_raised(uc->uc_cap, cap) != 0;
+
+	return raised;
+}
+
+/** @} md */
+#endif /* _LINUX_MD_OBJECT_H */
diff --git a/drivers/staging/lustre/lustre/include/obd.h b/drivers/staging/lustre/lustre/include/obd.h
new file mode 100644
index 000000000000..dade2fd2eb7d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd.h
@@ -0,0 +1,1683 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __OBD_H
+#define __OBD_H
+
+#include <linux/obd.h>
+
+#define IOC_OSC_TYPE	 'h'
+#define IOC_OSC_MIN_NR       20
+#define IOC_OSC_SET_ACTIVE   _IOWR(IOC_OSC_TYPE, 21, struct obd_device *)
+#define IOC_OSC_MAX_NR       50
+
+#define IOC_MDC_TYPE	 'i'
+#define IOC_MDC_MIN_NR       20
+#define IOC_MDC_MAX_NR       50
+
+#include <lustre/lustre_idl.h>
+#include <lu_ref.h>
+#include <lustre_lib.h>
+#include <lustre_export.h>
+#include <lustre_fld.h>
+#include <lustre_capa.h>
+
+#include <linux/libcfs/bitmap.h>
+
+
+#define MAX_OBD_DEVICES 8192
+
+struct osc_async_rc {		/* embedded in struct lov_oinfo as loi_ar */
+	int     ar_rc;
+	int     ar_force_sync;
+	__u64   ar_min_xid;
+};
+
+struct lov_oinfo {		 /* per-stripe data structure */
+	struct ost_id   loi_oi;    /* object ID/Sequence on the target OST */
+	int loi_ost_idx;	   /* OST stripe index in lov_tgt_desc->tgts */
+	int loi_ost_gen;	   /* generation of this loi_ost_idx */
+
+	unsigned long loi_kms_valid:1; /* set to 1 by loi_kms_set() */
+	__u64 loi_kms;	     /* known minimum size */
+	struct ost_lvb loi_lvb;
+	struct osc_async_rc     loi_ar;
+};
+
+static inline void loi_kms_set(struct lov_oinfo *oinfo, __u64 kms)
+{
+	oinfo->loi_kms = kms;
+	oinfo->loi_kms_valid = 1;	/* flag that loi_kms has been set */
+}
+
+static inline void loi_init(struct lov_oinfo *loi)
+{	/* empty body: no field of \a loi is touched */
+}
+
+struct lov_stripe_md {
+	atomic_t     lsm_refc;
+	spinlock_t	lsm_lock;
+	pid_t	    lsm_lock_owner; /* debugging */
+
+	/* maximum possible file size; might change as the status of OSTs
+	 * changes, e.g. disconnected, deactivated */
+	__u64	    lsm_maxbytes;
+	struct {
+		/* Public members. */
+		struct ost_id lw_object_oi; /* lov object id/seq */
+
+		/* LOV-private members start here -- only for use in lov/. */
+		__u32 lw_magic;
+		__u32 lw_stripe_size;      /* size of the stripe */
+		__u32 lw_pattern;	  /* striping pattern (RAID0, RAID1) */
+		__u16 lw_stripe_count;  /* number of objects being striped over */
+		__u16 lw_layout_gen;       /* generation of the layout */
+		char  lw_pool_name[LOV_MAXPOOLNAME]; /* pool name */
+	} lsm_wire;	/* compared as a whole by lov_stripe_md_cmp() */
+
+	struct lov_oinfo *lsm_oinfo[0];	/* trailing array of lov_oinfo ptrs */
+};
+
+#define lsm_oi		 lsm_wire.lw_object_oi
+#define lsm_magic	lsm_wire.lw_magic
+#define lsm_layout_gen   lsm_wire.lw_layout_gen
+#define lsm_stripe_size  lsm_wire.lw_stripe_size
+#define lsm_pattern      lsm_wire.lw_pattern
+#define lsm_stripe_count lsm_wire.lw_stripe_count
+#define lsm_pool_name    lsm_wire.lw_pool_name
+
+struct obd_info;
+
+typedef int (*obd_enqueue_update_f)(void *cookie, int rc);
+
+/* obd info for a particular level (lov, osc). */
+struct obd_info {
+	/* Lock policy. It keeps an extent which is specific for a particular
+	 * OSC. (e.g. lov_prep_enqueue_set initialises extent of the policy,
+	 * and osc_enqueue passes it into ldlm_lock_match & ldlm_cli_enqueue. */
+	ldlm_policy_data_t      oi_policy;
+	/* Request-specific flags:
+	   - during lock handling, the flags obtained on the enqueue
+	   request are set here.
+	   - during stats, the flags control delay/resend.
+	   - during setattr, the flags distinguish the punch operation.
+	 */
+	__u64		   oi_flags;
+	/* Lock handle specific for every OSC lock. */
+	struct lustre_handle   *oi_lockh;
+	/* lsm data specific for every OSC. */
+	struct lov_stripe_md   *oi_md;
+	/* obdo data specific for every OSC, if needed at all. */
+	struct obdo	    *oi_oa;
+	/* statfs data specific for every OSC, if needed at all. */
+	struct obd_statfs      *oi_osfs;
+	/* An update callback which is called to update some data on the upper
+	 * level. E.g. it is used to update lsm->lsm_oinfo at every received
+	 * request in osc level for enqueue requests. It is also possible to
+	 * update some caller data from LOV layer if needed. */
+	obd_enqueue_update_f    oi_cb_up;
+	/* oss capability; its type is obd_capa in client to avoid a copy,
+	 * whereas its type is lustre_capa in OSS. */
+	void		   *oi_capa;
+	/* transfer jobid from ost_sync() to filter_sync()... */
+	char		   *oi_jobid;
+};
+
+/**
+ * Compare the lsm_wire parts of \a m1 and \a m2 byte by byte.
+ * Returns memcmp()'s result, i.e. 0 when they are identical.
+ */
+static inline int lov_stripe_md_cmp(struct lov_stripe_md *m1,
+				    struct lov_stripe_md *m2)
+{
+	/* lsm_wire has padding; it should be zeroed out during allocation */
+	return memcmp(&m1->lsm_wire, &m2->lsm_wire, sizeof(m1->lsm_wire));
+}
+
+static inline int lov_lum_lsm_cmp(struct lov_user_md *lum,
+				  struct lov_stripe_md *lsm)
+{
+	if (lsm->lsm_magic != lum->lmm_magic)
+		return 1;	/* magic differs */
+	if (lsm->lsm_stripe_count != 0 && lum->lmm_stripe_count != 0 &&
+	    lsm->lsm_stripe_count != lum->lmm_stripe_count)
+		return 2;	/* both set a stripe count, and they differ */
+	if (lsm->lsm_stripe_size != 0 && lum->lmm_stripe_size != 0 &&
+	    lsm->lsm_stripe_size != lum->lmm_stripe_size)
+		return 3;	/* both set a stripe size, and they differ */
+	if (lsm->lsm_pattern != 0 && lum->lmm_pattern != 0 &&
+	    lsm->lsm_pattern != lum->lmm_pattern)
+		return 4;	/* both set a pattern, and they differ */
+	if (lsm->lsm_magic == LOV_MAGIC_V3 &&
+	    strncmp(lsm->lsm_pool_name,
+		    ((struct lov_user_md_v3 *)lum)->lmm_pool_name,
+		    LOV_MAXPOOLNAME) != 0)
+		return 5;	/* v3 only: pool name differs */
+	return 0;
+}
+
+static inline int lov_lum_swab_if_needed(struct lov_user_md_v3 *lumv3,
+					 int *lmm_magic,
+					 struct lov_user_md *lum)
+{
+	/* Copy the v1-sized head of \a lum (if given); the magic is read
+	 * from \a lumv3 afterwards. */
+	if (lum && copy_from_user(lumv3, lum, sizeof(struct lov_user_md_v1)))
+		return -EFAULT;
+	*lmm_magic = lumv3->lmm_magic;
+	if (*lmm_magic == __swab32(LOV_USER_MAGIC_V1)) {
+		lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lumv3);
+		*lmm_magic = LOV_USER_MAGIC_V1;
+	} else if (*lmm_magic == LOV_USER_MAGIC_V3) { /* re-copy, v3 size */
+		if (lum && copy_from_user(lumv3, lum, sizeof(*lumv3)))
+			return -EFAULT;
+	} else if (*lmm_magic == __swab32(LOV_USER_MAGIC_V3)) {
+		if (lum && copy_from_user(lumv3, lum, sizeof(*lumv3)))
+			return -EFAULT;
+		lustre_swab_lov_user_md_v3(lumv3);
+		*lmm_magic = LOV_USER_MAGIC_V3;
+	} else if (*lmm_magic != LOV_USER_MAGIC_V1) {
+		CDEBUG(D_IOCTL,
+		       "bad userland LOV MAGIC: %#08x != %#08x nor %#08x\n",
+		       *lmm_magic, LOV_USER_MAGIC_V1, LOV_USER_MAGIC_V3);
+		return -EINVAL;
+	}
+	return 0;	/* on success *lmm_magic holds a native-order magic */
+}
+
+void lov_stripe_lock(struct lov_stripe_md *md);
+void lov_stripe_unlock(struct lov_stripe_md *md);
+
+struct obd_type {
+	struct list_head typ_chain;	/* list linkage */
+	struct obd_ops *typ_dt_ops;
+	struct md_ops *typ_md_ops;
+	proc_dir_entry_t *typ_procroot;
+	char *typ_name;
+	int  typ_refcnt;	/* NOTE(review): under obd_type_lock? */
+	struct lu_device_type *typ_lu;
+	spinlock_t obd_type_lock;
+};
+
+struct brw_page {	/* NOTE(review): one page of a bulk I/O? confirm */
+	obd_off  off;
+	struct page *pg;
+	int count;
+	obd_flag flag;
+};
+
+/* Individual type definitions */
+
+struct ost_server_data;
+
+struct osd_properties {	/* embedded in struct obd_device_target */
+	size_t osd_max_ea_size;	/* NOTE(review): presumably bytes - confirm */
+};
+
+#define OBT_MAGIC       0xBDDECEAE
+/* common fields of a "target" device; first member of struct filter_obd */
+struct obd_device_target {
+	__u32		     obt_magic;
+	__u32		     obt_instance;
+	struct super_block       *obt_sb;
+	/** last_rcvd file */
+	struct file	      *obt_rcvd_filp;
+	__u64		     obt_mount_count;
+	struct rw_semaphore	  obt_rwsem;
+	struct vfsmount	  *obt_vfsmnt;
+	struct file	      *obt_health_check_filp;
+	struct osd_properties     obt_osd_properties;
+	struct obd_job_stats      obt_jobstats;
+};
+
+/* llog context ids, numbered from 0; LLOG_MAX_CTXTS is their count */
+enum llog_ctxt_id {
+	LLOG_CONFIG_ORIG_CTXT  =  0,
+	LLOG_CONFIG_REPL_CTXT,
+	LLOG_MDS_OST_ORIG_CTXT,
+	LLOG_MDS_OST_REPL_CTXT,
+	LLOG_SIZE_ORIG_CTXT,
+	LLOG_SIZE_REPL_CTXT,
+	LLOG_RD1_ORIG_CTXT,
+	LLOG_RD1_REPL_CTXT,
+	LLOG_TEST_ORIG_CTXT,
+	LLOG_TEST_REPL_CTXT,
+	LLOG_LOVEA_ORIG_CTXT,
+	LLOG_LOVEA_REPL_CTXT,
+	LLOG_CHANGELOG_ORIG_CTXT,      /**< changelog generation on mdd */
+	LLOG_CHANGELOG_REPL_CTXT,      /**< changelog access on clients */
+	LLOG_CHANGELOG_USER_ORIG_CTXT, /**< for multiple changelog consumers */
+	LLOG_MAX_CTXTS
+};
+
+#define FILTER_SUBDIR_COUNT      32	    /* set to zero for no subdirs */
+
+struct filter_subdirs {	/* one dentry slot per subdir */
+	struct dentry *dentry[FILTER_SUBDIR_COUNT];
+};
+
+
+struct filter_ext {	/* NOTE(review): looks like a [start, end] extent */
+	__u64		fe_start;
+	__u64		fe_end;
+};
+
+struct filter_obd {
+	/* NB this field MUST be first */
+	struct obd_device_target fo_obt;
+	const char		*fo_fstype;
+
+	int			fo_group_count;
+	struct dentry		*fo_dentry_O;
+	struct dentry		**fo_dentry_O_groups;
+	struct filter_subdirs	*fo_dentry_O_sub;
+	struct mutex		fo_init_lock;	/* group initialization lock */
+	int			fo_committed_group;
+
+	spinlock_t		fo_objidlock;	/* protects fo_last_objids */
+
+	unsigned long		fo_destroys_in_progress;
+	struct mutex		fo_create_locks[FILTER_SUBDIR_COUNT];
+
+	struct list_head fo_export_list;
+	int		  fo_subdir_count;
+
+	obd_size	     fo_tot_dirty;      /* protected by obd_osfs_lock */
+	obd_size	     fo_tot_granted;    /* all values in bytes */
+	obd_size	     fo_tot_pending;
+	int		  fo_tot_granted_clients;
+
+	obd_size	     fo_readcache_max_filesize;
+	spinlock_t		fo_flags_lock;
+	unsigned int	 fo_read_cache:1,   /**< enable read-only cache */
+			     fo_writethrough_cache:1,/**< read cache writes */
+			     fo_mds_ost_sync:1, /**< MDS-OST orphan recovery */
+			     fo_raid_degraded:1;/**< RAID device degraded */
+
+	struct obd_import   *fo_mdc_imp;
+	struct obd_uuid      fo_mdc_uuid;
+	struct lustre_handle fo_mdc_conn;
+	struct file	**fo_last_objid_files;
+	__u64	       *fo_last_objids; /* last created objid for groups,
+					      * protected by fo_objidlock */
+
+	struct mutex		fo_alloc_lock;
+
+	atomic_t	 fo_r_in_flight;
+	atomic_t	 fo_w_in_flight;
+
+	/*
+	 * per-filter pool of kiobufs allocated by filter_common_setup() and
+	 * torn down by filter_cleanup().
+	 *
+	 * This pool contains the kiobufs used by
+	 * filter_{prep,commit}rw_{read,write}() and is shared by all OST
+	 * threads.
+	 *
+	 * Locking: protected by internal lock of cfs_hash, pool can be
+	 * found from this hash table by t_id of ptlrpc_thread.
+	 */
+	struct cfs_hash		*fo_iobuf_hash;
+
+	struct brw_stats	 fo_filter_stats;
+
+	int		      fo_fmd_max_num; /* per exp filter_mod_data */
+	int		      fo_fmd_max_age; /* jiffies to fmd expiry */
+	unsigned long	    fo_syncjournal:1, /* sync journal on writes */
+				 fo_sync_lock_cancel:2;/* *_SYNC_ON_CANCEL */
+
+
+	/* sptlrpc stuff */
+	rwlock_t		fo_sptlrpc_lock;
+	struct sptlrpc_rule_set  fo_sptlrpc_rset;
+
+	/* capability related */
+	unsigned int	     fo_fl_oss_capa;
+	struct list_head	       fo_capa_keys;
+	struct hlist_head	*fo_capa_hash;
+	int		      fo_sec_level;
+};
+
+struct timeout_item {	/* NOTE(review): timed callback entry? confirm */
+	enum timeout_event ti_event;
+	cfs_time_t	 ti_timeout;
+	timeout_cb_t       ti_cb;
+	void	      *ti_cb_data;	/* NOTE(review): arg for ti_cb? */
+	struct list_head	 ti_obd_list;
+	struct list_head	 ti_chain;
+};
+
+#define OSC_MAX_RIF_DEFAULT       8
+#define MDS_OSC_MAX_RIF_DEFAULT   50
+#define OSC_MAX_RIF_MAX	 256
+#define OSC_MAX_DIRTY_DEFAULT  (OSC_MAX_RIF_DEFAULT * 4)
+#define OSC_MAX_DIRTY_MB_MAX   2048     /* arbitrary, but < MAX_LONG bytes */
+#define OSC_DEFAULT_RESENDS      10
+
+/* possible values for fo_sync_lock_cancel (a 2-bit field in filter_obd) */
+enum {
+	NEVER_SYNC_ON_CANCEL = 0,
+	BLOCKING_SYNC_ON_CANCEL = 1,
+	ALWAYS_SYNC_ON_CANCEL = 2,
+	NUM_SYNC_ON_CANCEL_STATES	/* count of the states above */
+};
+
+#define MDC_MAX_RIF_DEFAULT       8
+#define MDC_MAX_RIF_MAX	 512
+
+struct mdc_rpc_lock;
+struct obd_import;
+/*
+ * Client-side per-device state, embedded in struct obd_device as u.cli
+ * (see obd2cli_tgt() below).  One structure carries fields for several
+ * client flavours: grant/dirty accounting and page LRU (osc), RPC locks of
+ * type struct mdc_rpc_lock, and the "mgc datastruct" group.
+ */
+struct client_obd {
+	struct rw_semaphore  cl_sem;
+	struct obd_uuid	  cl_target_uuid;
+	struct obd_import       *cl_import; /* ptlrpc connection state */
+	int		      cl_conn_count;
+	/* max_mds_easize is purely a performance thing so we don't have to
+	 * call obd_size_diskmd() all the time. */
+	int		      cl_default_mds_easize;
+	int		      cl_max_mds_easize;
+	int		      cl_max_mds_cookiesize;
+
+	enum lustre_sec_part     cl_sp_me;
+	enum lustre_sec_part     cl_sp_to;
+	struct sptlrpc_flavor    cl_flvr_mgc;   /* fixed flavor of mgc->mgs */
+
+	/* the grant values are protected by loi_list_lock below */
+	long		     cl_dirty;	 /* all _dirty_ in bytes */
+	long		     cl_dirty_max;     /* allowed w/o rpc */
+	long		     cl_dirty_transit; /* dirty synchronous */
+	long		     cl_avail_grant;   /* bytes of credit for ost */
+	long		     cl_lost_grant;    /* lost credits (trunc) */
+
+	/* since we allocate grant by blocks, we don't know how many grant will
+	 * be used to add a page into cache. As a solution, we reserve maximum
+	 * grant before trying to dirty a page and unreserve the rest.
+	 * See osc_{reserve|unreserve}_grant for details. */
+	long		 cl_reserved_grant;
+	struct list_head	   cl_cache_waiters; /* waiting for cache/grant */
+	cfs_time_t	   cl_next_shrink_grant;   /* jiffies */
+	struct list_head	   cl_grant_shrink_list;  /* Timeout event list */
+	int		  cl_grant_shrink_interval; /* seconds */
+
+	/* A chunk is an optimal size used by osc_extent to determine
+	 * the extent size. A chunk is max(PAGE_CACHE_SIZE, OST block size) */
+	int		  cl_chunkbits;
+	int		  cl_chunk;
+	int		  cl_extent_tax; /* extent overhead, by bytes */
+
+	/* keep track of objects that have lois that contain pages which
+	 * have been queued for async brw.  this lock also protects the
+	 * lists of osc_client_pages that hang off of the loi */
+	/*
+	 * ->cl_loi_list_lock protects consistency of
+	 * ->cl_loi_{ready,read,write}_list. ->ap_make_ready() and
+	 * ->ap_completion() call-backs are executed under this lock. As we
+	 * cannot guarantee that these call-backs never block on all platforms
+	 * (as a matter of fact they do block on Mac OS X), type of
+	 * ->cl_loi_list_lock is platform dependent: it's a spin-lock on Linux
+	 * and blocking mutex on Mac OS X. (Alternative is to make this lock
+	 * blocking everywhere, but we don't want to slow down fast-path of
+	 * our main platform.)
+	 *
+	 * Exact type of ->cl_loi_list_lock is defined in arch/obd.h together
+	 * with client_obd_list_{un,}lock() and
+	 * client_obd_list_lock_{init,done}() functions.
+	 *
+	 * NB by Jinshan: though field names are still _loi_, but actually
+	 * osc_object{}s are in the list.
+	 */
+	client_obd_lock_t	cl_loi_list_lock;
+	struct list_head	       cl_loi_ready_list;
+	struct list_head	       cl_loi_hp_ready_list;
+	struct list_head	       cl_loi_write_list;
+	struct list_head	       cl_loi_read_list;
+	int		      cl_r_in_flight;
+	int		      cl_w_in_flight;
+	/* just a sum of the loi/lop pending numbers to be exported by /proc */
+	atomic_t	     cl_pending_w_pages;
+	atomic_t	     cl_pending_r_pages;
+	__u32			 cl_max_pages_per_rpc;
+	int		      cl_max_rpcs_in_flight;
+	struct obd_histogram     cl_read_rpc_hist;
+	struct obd_histogram     cl_write_rpc_hist;
+	struct obd_histogram     cl_read_page_hist;
+	struct obd_histogram     cl_write_page_hist;
+	struct obd_histogram     cl_read_offset_hist;
+	struct obd_histogram     cl_write_offset_hist;
+
+	/* lru for osc caching pages */
+	struct cl_client_cache	*cl_cache;
+	struct list_head		 cl_lru_osc; /* member of cl_cache->ccc_lru */
+	atomic_t		*cl_lru_left;
+	atomic_t		 cl_lru_busy;
+	atomic_t		 cl_lru_shrinkers;
+	atomic_t		 cl_lru_in_list;
+	struct list_head		 cl_lru_list; /* lru page list */
+	client_obd_lock_t	 cl_lru_list_lock; /* page list protector */
+
+	/* number of in flight destroy rpcs is limited to max_rpcs_in_flight */
+	atomic_t	     cl_destroy_in_flight;
+	wait_queue_head_t	      cl_destroy_waitq;
+
+	struct mdc_rpc_lock     *cl_rpc_lock;
+	struct mdc_rpc_lock     *cl_close_lock;
+
+	/* mgc datastruct */
+	struct semaphore	 cl_mgc_sem;
+	struct vfsmount	 *cl_mgc_vfsmnt;
+	struct dentry	   *cl_mgc_configs_dir;
+	atomic_t	     cl_mgc_refcount;
+	struct obd_export       *cl_mgc_mgsexp;
+
+	/* checksumming for data sent over the network */
+	unsigned int	     cl_checksum:1; /* 0 = disabled, 1 = enabled */
+	/* supported checksum types that are worked out at connect time */
+	__u32		    cl_supp_cksum_types;
+	/* checksum algorithm to be used */
+	cksum_type_t	     cl_cksum_type;
+
+	/* also protected by the poorly named _loi_list_lock lock above */
+	struct osc_async_rc      cl_ar;
+
+	/* used by quotacheck when the servers are older than 2.4 */
+	int		      cl_qchk_stat; /* quotacheck stat of the peer */
+#define CL_NOT_QUOTACHECKED 1   /* client->cl_qchk_stat init value */
+	/* build-time reminder once the Lustre version reaches 2.7.50.0 */
+#if LUSTRE_VERSION_CODE >= OBD_OCD_VERSION(2, 7, 50, 0)
+#warning "please consider removing quotacheck compatibility code"
+#endif
+
+	/* sequence manager */
+	struct lu_client_seq    *cl_seq;
+
+	atomic_t	     cl_resends; /* resend count */
+
+	/* ptlrpc work for writeback in ptlrpcd context */
+	void		    *cl_writeback_work;
+	/* hash tables for osc_quota_info */
+	cfs_hash_t	      *cl_quota_hash[MAXQUOTAS];
+};
+/* target UUID string of a client obd device (reads obd->u.cli) */
+#define obd2cli_tgt(obd) ((char *)(obd)->u.cli.cl_target_uuid.uuid)
+
+/*
+ * Pairs a 32-bit index with a pointer to an object id.
+ * NOTE(review): what idx indexes is not visible here -- confirm at callers.
+ */
+struct obd_id_info {
+	__u32   idx;
+	obd_id  *data;
+};
+
+/* */
+
+/* Echo device private state; embedded in struct obd_device as u.echo. */
+struct echo_obd {
+	struct obd_device_target eo_obt;
+	struct obdo		eo_oa;
+	spinlock_t		 eo_lock;
+	__u64			 eo_lastino;
+	struct lustre_handle	eo_nl_lock;
+	atomic_t		eo_prep;
+};
+
+/*
+ * OST device private state; embedded in struct obd_device as u.ost.
+ * Holds one ptlrpc service pointer per request class plus a mutex.
+ */
+struct ost_obd {
+	struct ptlrpc_service	*ost_service;
+	struct ptlrpc_service	*ost_create_service;
+	struct ptlrpc_service	*ost_io_service;
+	struct ptlrpc_service	*ost_seq_service;
+	struct mutex		ost_health_mutex;
+};
+
+/*
+ * Echo client private state; embedded in struct obd_device as
+ * u.echo_client.
+ */
+struct echo_client_obd {
+	struct obd_export	*ec_exp;   /* the local connection to osc/lov */
+	spinlock_t		ec_lock;
+	struct list_head	   ec_objects;
+	struct list_head	   ec_locks;
+	int		  ec_nstripes;
+	__u64		ec_unique;
+};
+
+/*
+ * Per-OSS QoS bookkeeping.  Instances are linked through lqo_oss_list and
+ * referenced from struct ltd_qos via ltq_oss.
+ */
+struct lov_qos_oss {
+	struct obd_uuid     lqo_uuid;       /* ptlrpc's c_remote_uuid */
+	struct list_head	  lqo_oss_list;   /* link to lov_qos */
+	__u64	       lqo_bavail;     /* total bytes avail on OSS */
+	__u64	       lqo_penalty;    /* current penalty */
+	__u64	       lqo_penalty_per_obj;/* penalty decrease every obj*/
+	time_t	      lqo_used;       /* last used time, seconds */
+	__u32	       lqo_ost_count;  /* number of osts on this oss */
+};
+
+/* Per-target QoS bookkeeping; embedded in struct lov_tgt_desc as ltd_qos. */
+struct ltd_qos {
+	struct lov_qos_oss *ltq_oss;	 /* oss info */
+	__u64	       ltq_penalty;     /* current penalty */
+	__u64	       ltq_penalty_per_obj; /* penalty decrease every obj*/
+	__u64	       ltq_weight;      /* net weighting */
+	time_t	      ltq_used;	/* last used time, seconds */
+	unsigned int	ltq_usable:1;    /* usable for striping */
+};
+
+/*
+ * Generic subset of OSTs: an array of target indices with its used count
+ * and allocated capacity, guarded by op_rw_sem.
+ */
+struct ost_pool {
+	__u32	      *op_array;      /* array of index of
+						   lov_obd->lov_tgts */
+	unsigned int	op_count;      /* number of OSTs in the array */
+	unsigned int	op_size;       /* allocated size of op_array */
+	struct rw_semaphore op_rw_sem;     /* to protect ost_pool use */
+};
+
+/*
+ * Round-robin allocator data.  Used both globally (struct lov_qos, lq_rr)
+ * and per pool (struct pool_desc, pool_rr).
+ */
+struct lov_qos_rr {
+	__u32	       lqr_start_idx;   /* start index of new inode */
+	__u32	       lqr_offset_idx;  /* aliasing for start_idx  */
+	int		 lqr_start_count; /* reseed counter */
+	struct ost_pool     lqr_pool;	/* round-robin optimized list */
+	unsigned long       lqr_dirty:1;     /* recalc round-robin list */
+};
+
+/* allow statfs data caching for 1 second */
+#define OBD_STATFS_CACHE_SECONDS 1
+
+/*
+ * Bundle of an obd_info and the obd_statfs buffer for one statfs
+ * operation; struct lov_qos points at one through lq_statfs_data.
+ */
+struct lov_statfs_data {
+	struct obd_info   lsd_oi;
+	struct obd_statfs lsd_statfs;
+};
+/*
+ * Stripe placement optimization: the list of OSSs, free-space/round-robin
+ * tunables, state flags, and the statfs data used to refresh them.
+ */
+struct lov_qos {
+	struct list_head	  lq_oss_list; /* list of OSSs that targets use */
+	struct rw_semaphore lq_rw_sem;
+	__u32	       lq_active_oss_count;
+	unsigned int	lq_prio_free;   /* priority for free space */
+	unsigned int	lq_threshold_rr;/* priority for rr */
+	struct lov_qos_rr   lq_rr;	  /* round robin qos data */
+	unsigned long       lq_dirty:1,     /* recalc qos data */
+			    lq_same_space:1,/* the ost's all have approx.
+					       the same space avail */
+			    lq_reset:1,     /* zero current penalties */
+			    lq_statfs_in_progress:1; /* statfs op in
+							progress */
+	/* qos statfs data */
+	struct lov_statfs_data *lq_statfs_data;
+	wait_queue_head_t	 lq_statfs_waitq; /* waitqueue to notify statfs
+					      * requests completion */
+};
+
+/*
+ * Descriptor of one LOV target; struct lov_obd keeps a sparse array of
+ * pointers to these in lov_tgts.
+ */
+struct lov_tgt_desc {
+	struct list_head	  ltd_kill;
+	struct obd_uuid     ltd_uuid;
+	struct obd_device  *ltd_obd;
+	struct obd_export  *ltd_exp;
+	struct ltd_qos      ltd_qos;     /* qos info per target */
+	__u32	       ltd_gen;
+	__u32	       ltd_index;   /* index in lov_obd->tgts */
+	unsigned long       ltd_active:1,/* is this target up for requests */
+			    ltd_activate:1,/* should  target be activated */
+			    ltd_reap:1;  /* should this target be deleted */
+};
+
+/*
+ * Pool metadata accessors.  _p is a pointer to a structure with a
+ * pool_obds member (struct pool_desc).  The argument and the whole
+ * expansion are parenthesized so that compound arguments expand correctly;
+ * each macro still yields an lvalue, so assignment, indexing and taking the
+ * address of the result keep working.
+ */
+#define pool_tgt_size(_p)   ((_p)->pool_obds.op_size)
+#define pool_tgt_count(_p)  ((_p)->pool_obds.op_count)
+#define pool_tgt_array(_p)  ((_p)->pool_obds.op_array)
+#define pool_tgt_rw_sem(_p) ((_p)->pool_obds.op_rw_sem)
+
+/*
+ * One named OST pool.  The pool_tgt_*() macros above access the pool_obds
+ * member of this structure.
+ */
+struct pool_desc {
+	char		  pool_name[LOV_MAXPOOLNAME + 1]; /* name of pool */
+	struct ost_pool       pool_obds;	      /* pool members */
+	atomic_t	  pool_refcount;	  /* pool ref. counter */
+	struct lov_qos_rr     pool_rr;		/* round robin qos */
+	struct hlist_node      pool_hash;	      /* access by poolname */
+	struct list_head	    pool_list;	      /* serial access */
+	proc_dir_entry_t *pool_proc_entry;	/* file in /proc */
+	struct obd_device    *pool_lobd;	      /* obd of the lov/lod to which
+						       * this pool belongs */
+};
+
+/*
+ * LOV device private state; embedded in struct obd_device as u.lov.
+ * Tracks the target array, its counters, and the OST pools.
+ */
+struct lov_obd {
+	struct lov_desc	 desc;
+	struct lov_tgt_desc   **lov_tgts;	      /* sparse array */
+	struct ost_pool	 lov_packed;	    /* all OSTs in a packed
+							  array */
+	struct mutex		lov_lock;
+	struct obd_connect_data lov_ocd;
+	atomic_t	    lov_refcount;
+	__u32		   lov_tgt_count;	 /* how many OBD's */
+	__u32		   lov_active_tgt_count;  /* how many active */
+	__u32		   lov_death_row;/* tgts scheduled to be deleted */
+	__u32		   lov_tgt_size;   /* size of tgts array */
+	int		     lov_connects;
+	int		     lov_pool_count;
+	cfs_hash_t	     *lov_pools_hash_body; /* used for key access */
+	struct list_head	      lov_pool_list; /* used for sequential access */
+	proc_dir_entry_t   *lov_pool_proc_entry;
+	enum lustre_sec_part    lov_sp_me;
+
+	/* Cached LRU and unstable data from upper layer */
+	void		       *lov_cache;
+
+	struct rw_semaphore     lov_notify_lock;
+};
+
+/*
+ * Descriptor of one LMV target; struct lmv_obd keeps an array of pointers
+ * to these in tgts.
+ */
+struct lmv_tgt_desc {
+	struct obd_uuid		ltd_uuid;
+	struct obd_export	*ltd_exp;
+	int			ltd_idx;
+	struct mutex		ltd_fid_mutex;
+	unsigned long		ltd_active:1; /* target up for requests */
+};
+
+/*
+ * Placement policy selector stored in struct lmv_obd (lmv_placement).
+ * PLACEMENT_MAX_POLICY takes the next implicit value (3).
+ */
+enum placement_policy {
+	PLACEMENT_CHAR_POLICY   = 0,
+	PLACEMENT_NID_POLICY    = 1,
+	PLACEMENT_INVAL_POLICY  = 2,
+	PLACEMENT_MAX_POLICY
+};
+
+typedef enum placement_policy placement_policy_t;
+
+/* LMV device private state; embedded in struct obd_device as u.lmv. */
+struct lmv_obd {
+	int			refcount;
+	struct lu_client_fld	lmv_fld;
+	spinlock_t		lmv_lock;
+	placement_policy_t	lmv_placement;
+	struct lmv_desc		desc;
+	struct obd_uuid		cluuid;
+	struct obd_export	*exp;
+
+	struct mutex		init_mutex;
+	int			connected;
+	int			max_easize;
+	int			max_def_easize;
+	int			max_cookiesize;
+	int			server_timeout;
+
+	int			tgts_size; /* size of tgts array */
+	struct lmv_tgt_desc	**tgts;
+
+	struct obd_connect_data	conn_data;
+};
+
+/*
+ * Local (in-memory) I/O buffer descriptor; arrays of these are passed to
+ * the o_preprw/o_commitrw methods of struct obd_ops.
+ */
+struct niobuf_local {
+	__u64		lnb_file_offset;
+	__u32		lnb_page_offset;
+	__u32		len;
+	__u32		flags;
+	struct page	*page;
+	struct dentry	*dentry;
+	int		lnb_grant_used;
+	int		rc;
+};
+
+/*
+ * Name strings for Lustre components and obd device types.  Note that two
+ * macro names do not match their string values: LUSTRE_OSS_NAME is "ost"
+ * and LUSTRE_OST_NAME is "obdfilter" (see the FIXMEs below).
+ */
+#define LUSTRE_FLD_NAME	 "fld"
+#define LUSTRE_SEQ_NAME	 "seq"
+
+#define LUSTRE_MDD_NAME	 "mdd"
+#define LUSTRE_OSD_LDISKFS_NAME	"osd-ldiskfs"
+#define LUSTRE_OSD_ZFS_NAME     "osd-zfs"
+#define LUSTRE_VVP_NAME	 "vvp"
+#define LUSTRE_LMV_NAME	 "lmv"
+#define LUSTRE_SLP_NAME	 "slp"
+#define LUSTRE_LOD_NAME		"lod"
+#define LUSTRE_OSP_NAME		"osp"
+#define LUSTRE_LWP_NAME		"lwp"
+
+/* obd device type names */
+ /* FIXME all the references to LUSTRE_MDS_NAME should be swapped with LUSTRE_MDT_NAME */
+#define LUSTRE_MDS_NAME	 "mds"
+#define LUSTRE_MDT_NAME	 "mdt"
+#define LUSTRE_MDC_NAME	 "mdc"
+#define LUSTRE_OSS_NAME	 "ost"       /* FIXME change name to oss */
+#define LUSTRE_OST_NAME	 "obdfilter" /* FIXME change name to ost */
+#define LUSTRE_OSC_NAME	 "osc"
+#define LUSTRE_LOV_NAME	 "lov"
+#define LUSTRE_MGS_NAME	 "mgs"
+#define LUSTRE_MGC_NAME	 "mgc"
+
+#define LUSTRE_ECHO_NAME	"obdecho"
+#define LUSTRE_ECHO_CLIENT_NAME "echo_client"
+#define LUSTRE_QMT_NAME	 "qmt"
+
+/* Constant obd names (post-rename) */
+#define LUSTRE_MDS_OBDNAME "MDS"
+#define LUSTRE_OSS_OBDNAME "OSS"
+#define LUSTRE_MGS_OBDNAME "MGS"
+#define LUSTRE_MGC_OBDNAME "MGC"
+
+/*
+ * Decide from an obd device name whether it names an OSP/OSC set up on an
+ * MDT.
+ *
+ * Returns 1 when the text after the last '-' starts with "osc", or when it
+ * starts with "MDT" and the text after the preceding '-' starts with
+ * LUSTRE_OSP_NAME or LUSTRE_OSC_NAME.  Returns 0 otherwise; a name with no
+ * '-' at all is additionally reported through CERROR().
+ */
+static inline int is_osp_on_mdt(char *name)
+{
+	char   *ptr;
+
+	ptr = strrchr(name, '-');
+	if (ptr == NULL) {
+		CERROR("%s is not a obdname\n", name);
+		return 0;
+	}
+
+	/* 1.8 OSC/OSP name on MDT is fsname-OSTxxxx-osc */
+	if (strncmp(ptr + 1, "osc", 3) == 0)
+		return 1;
+
+	if (strncmp(ptr + 1, "MDT", 3) != 0)
+		return 0;
+
+	/*
+	 * Walk back to the previous '-'.  ptr is compared against the start
+	 * of the string before it is decremented, so a name whose only '-'
+	 * is its first character never causes a read of name[-1].
+	 */
+	while (ptr != name && *(--ptr) != '-')
+		;
+
+	/* no second '-' before the start of the string */
+	if (ptr == name)
+		return 0;
+
+	if (strncmp(ptr + 1, LUSTRE_OSP_NAME, strlen(LUSTRE_OSP_NAME)) != 0 &&
+	    strncmp(ptr + 1, LUSTRE_OSC_NAME, strlen(LUSTRE_OSC_NAME)) != 0)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Don't conflict with on-wire flags OBD_BRW_WRITE, etc
+ * NOTE(review): presumably a value for niobuf_local.flags -- confirm.
+ */
+#define N_LOCAL_TEMP_PAGE 0x10000000
+
+/*
+ * Per-operation transaction info.  Initialised by oti_init(); the llog
+ * cookie array is managed by oti_alloc_cookies()/oti_free_cookies().
+ */
+struct obd_trans_info {
+	__u64		    oti_transno;
+	__u64		    oti_xid;
+	/* Only used on the server side for tracking acks. */
+	struct oti_req_ack_lock {
+		struct lustre_handle lock;
+		__u32		mode;
+	}			oti_ack_locks[4];
+	void		    *oti_handle;
+	/* inline storage used when exactly one cookie is requested */
+	struct llog_cookie       oti_onecookie;
+	/* either &oti_onecookie or a separately allocated array */
+	struct llog_cookie      *oti_logcookies;
+	int		      oti_numcookies;
+	/** synchronous write is needed */
+	unsigned long		 oti_sync_write:1;
+
+	/* initial thread handling transaction */
+	struct ptlrpc_thread *   oti_thread;
+	__u32		    oti_conn_cnt;
+	/** VBR: versions */
+	__u64		    oti_pre_version;
+	/** JobID */
+	char		    *oti_jobid;
+
+	struct obd_uuid	 *oti_ost_uuid;
+};
+
+/*
+ * Zero \a oti and, if \a req is given, seed it from that request: xid,
+ * service thread, connection count, and -- for replayed requests -- the
+ * pre-operation version and transno.  A transno found in the reply message
+ * overrides the one taken from the request.  A NULL \a oti is a no-op.
+ */
+static inline void oti_init(struct obd_trans_info *oti,
+			    struct ptlrpc_request *req)
+{
+	if (oti == NULL)
+		return;
+
+	memset(oti, 0, sizeof(*oti));
+	if (req == NULL)
+		return;
+
+	oti->oti_xid = req->rq_xid;
+	oti->oti_thread = req->rq_svc_thread;
+
+	/** VBR: take versions from request */
+	if (req->rq_reqmsg != NULL) {
+		if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
+			__u64 *versions;
+
+			versions = lustre_msg_get_versions(req->rq_reqmsg);
+			if (versions)
+				oti->oti_pre_version = versions[0];
+			else
+				oti->oti_pre_version = 0;
+			oti->oti_transno =
+				lustre_msg_get_transno(req->rq_reqmsg);
+		}
+	}
+
+	/** called from mds_create_objects */
+	if (req->rq_repmsg != NULL)
+		oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg);
+
+	if (req->rq_reqmsg != NULL)
+		oti->oti_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg);
+}
+
+/*
+ * Set up oti->oti_logcookies to hold \a num_cookies llog cookies.  A single
+ * cookie uses the inline oti_onecookie; any other count is allocated with
+ * OBD_ALLOC_LARGE().  A NULL \a oti is a no-op.
+ *
+ * The function cannot return an error, so callers detect allocation
+ * failure by checking oti_logcookies for NULL.  In that case
+ * oti_numcookies is left at 0 rather than advertising a count for an array
+ * that does not exist.
+ * NOTE(review): relies on OBD_ALLOC_LARGE() leaving the pointer NULL on
+ * failure -- confirm against its definition.
+ */
+static inline void oti_alloc_cookies(struct obd_trans_info *oti,
+				     int num_cookies)
+{
+	if (!oti)
+		return;
+
+	/* braces keep the multi-statement macro safely inside the else arm */
+	if (num_cookies == 1) {
+		oti->oti_logcookies = &oti->oti_onecookie;
+	} else {
+		OBD_ALLOC_LARGE(oti->oti_logcookies,
+				num_cookies * sizeof(oti->oti_onecookie));
+	}
+
+	oti->oti_numcookies = oti->oti_logcookies ? num_cookies : 0;
+}
+
+/*
+ * Release the cookie array set up by oti_alloc_cookies() and reset the
+ * bookkeeping.  The inline single-cookie slot is never freed, only checked;
+ * a NULL \a oti or an unset array is a no-op.
+ */
+static inline void oti_free_cookies(struct obd_trans_info *oti)
+{
+	if (oti == NULL)
+		return;
+	if (oti->oti_logcookies == NULL)
+		return;
+
+	if (oti->oti_logcookies != &oti->oti_onecookie) {
+		OBD_FREE_LARGE(oti->oti_logcookies,
+			       oti->oti_numcookies*sizeof(oti->oti_onecookie));
+	} else {
+		LASSERT(oti->oti_numcookies == 1);
+	}
+
+	oti->oti_numcookies = 0;
+	oti->oti_logcookies = NULL;
+}
+
+/*
+ * Events signalled through obd_notify() upcall-chain.
+ * Values are implicit, starting at 0 in declaration order; the type is the
+ * "ev" argument of obd_notify_upcall.onu_upcall and obd_ops.o_notify.
+ */
+enum obd_notify_event {
+	/* target added */
+	OBD_NOTIFY_CREATE,
+	/* Device connect start */
+	OBD_NOTIFY_CONNECT,
+	/* Device activated */
+	OBD_NOTIFY_ACTIVE,
+	/* Device deactivated */
+	OBD_NOTIFY_INACTIVE,
+	/* Device disconnected */
+	OBD_NOTIFY_DISCON,
+	/* Connect data for import were changed */
+	OBD_NOTIFY_OCD,
+	/* Sync request */
+	OBD_NOTIFY_SYNC_NONBLOCK,
+	OBD_NOTIFY_SYNC,
+	/* Configuration event */
+	OBD_NOTIFY_CONFIG,
+	/* Administratively deactivate/activate event */
+	OBD_NOTIFY_DEACTIVATE,
+	OBD_NOTIFY_ACTIVATE
+};
+
+/* bit-mask flags for config events; each value is a distinct single bit */
+enum config_flags {
+	CONFIG_LOG      = 0x1,  /* finished processing config log */
+	CONFIG_SYNC     = 0x2,  /* mdt synced 1 ost */
+	CONFIG_TARGET   = 0x4   /* one target is added */
+};
+
+/*
+ * Data structure used to pass obd_notify()-event to non-obd listeners (llite
+ * and liblustre being main examples).
+ */
+struct obd_notify_upcall {
+	/*
+	 * Listener callback.  Receives the host and watched devices, the
+	 * event, an owner pointer and event data; presumably onu_owner is
+	 * what gets passed as "owner" -- confirm at the call site.
+	 */
+	int (*onu_upcall)(struct obd_device *host, struct obd_device *watched,
+			  enum obd_notify_event ev, void *owner, void *data);
+	/* Opaque datum supplied by upper layer listener */
+	void *onu_owner;
+};
+
+/*
+ * Recovery bookkeeping embedded in struct obd_device as obd_recovery_data:
+ * a handler, the pid of a processing task, and two completions.
+ * NOTE(review): presumably the completions signal start and finish of the
+ * recovery task -- confirm in the recovery code.
+ */
+struct target_recovery_data {
+	svc_handler_t		trd_recovery_handler;
+	pid_t			trd_processing_task;
+	struct completion	trd_starting;
+	struct completion	trd_finishing;
+};
+
+/*
+ * A group of llog contexts (up to LLOG_MAX_CTXTS) with its wait queue and
+ * locks; struct obd_device embeds the default group as obd_olg.
+ */
+struct obd_llog_group {
+	int		olg_seq;
+	struct llog_ctxt  *olg_ctxts[LLOG_MAX_CTXTS];
+	wait_queue_head_t	olg_waitq;
+	spinlock_t	   olg_lock;
+	struct mutex	   olg_cat_processing;
+};
+
+/* corresponds to one of the obd's */
+/* NOTE(review): presumably the value expected in obd_magic -- confirm */
+#define OBD_DEVICE_MAGIC	0XAB5CD6EF
+#define OBD_DEV_BY_DEVNAME      0xffffd0de
+
+/*
+ * Generic obd device.  Holds identity (type, name, uuid), state flags,
+ * export lists and hashes, recovery state, a union of per-device-type
+ * private data (u), and lprocfs bookkeeping.
+ */
+struct obd_device {
+	struct obd_type	*obd_type;
+	__u32		   obd_magic;
+
+	/* common and UUID name of this device */
+	char		    obd_name[MAX_OBD_NAME];
+	struct obd_uuid	 obd_uuid;
+
+	struct lu_device       *obd_lu_dev;
+
+	int		     obd_minor;
+	/* bitfield modification is protected by obd_dev_lock */
+	unsigned long obd_attached:1,      /* finished attach */
+		      obd_set_up:1,	/* finished setup */
+		      obd_recovering:1,    /* there are recoverable clients */
+		      obd_abort_recovery:1,/* recovery expired */
+		      obd_version_recov:1, /* obd uses version checking */
+		      obd_replayable:1,    /* recovery is enabled; inform clients */
+		      obd_no_transno:1,    /* no committed-transno notification */
+		      obd_no_recov:1,      /* fail instead of retry messages */
+		      obd_stopping:1,      /* started cleanup */
+		      obd_starting:1,      /* started setup */
+		      obd_force:1,	 /* cleanup with > 0 obd refcount */
+		      obd_fail:1,	  /* cleanup with failover */
+		      obd_async_recov:1,   /* allow asynchronous orphan cleanup */
+		      obd_no_conn:1,       /* deny new connections */
+		      obd_inactive:1,      /* device active/inactive
+					   * (for /proc/status only!!) */
+		      obd_no_ir:1,	 /* no imperative recovery. */
+		      obd_process_conf:1;  /* device is processing mgs config */
+	/* kept in a separate field because it is set in interrupt context, so
+	 * that it does not interfere with the _bh lock protecting the other
+	 * bits */
+	unsigned long obd_recovery_expired:1;
+	/* uuid-export hash body */
+	cfs_hash_t	     *obd_uuid_hash;
+	/* nid-export hash body */
+	cfs_hash_t	     *obd_nid_hash;
+	/* nid stats body */
+	cfs_hash_t	     *obd_nid_stats_hash;
+	struct list_head	      obd_nid_stats;
+	atomic_t	    obd_refcount;
+	wait_queue_head_t	     obd_refcount_waitq;
+	struct list_head	      obd_exports;
+	struct list_head	      obd_unlinked_exports;
+	struct list_head	      obd_delayed_exports;
+	int		     obd_num_exports;
+	spinlock_t		obd_nid_lock;
+	struct ldlm_namespace  *obd_namespace;
+	struct ptlrpc_client	obd_ldlm_client; /* XXX OST/MDS only */
+	/* a spinlock is OK for what we do now, may need a semaphore later */
+	spinlock_t		obd_dev_lock; /* protect OBD bitfield above */
+	struct mutex		obd_dev_mutex;
+	__u64			obd_last_committed;
+	struct fsfilt_operations *obd_fsops;
+	spinlock_t		obd_osfs_lock;
+	struct obd_statfs	obd_osfs;       /* locked by obd_osfs_lock */
+	__u64			obd_osfs_age;
+	struct lvfs_run_ctxt	obd_lvfs_ctxt;
+	struct obd_llog_group	obd_olg;	/* default llog group */
+	struct obd_device	*obd_observer;
+	struct rw_semaphore	obd_observer_link_sem;
+	struct obd_notify_upcall obd_upcall;
+	struct obd_export       *obd_self_export;
+	/* list of exports in LRU order, for ping evictor, with obd_dev_lock */
+	struct list_head	      obd_exports_timed;
+	time_t		  obd_eviction_timer; /* for ping evictor */
+
+	int			      obd_max_recoverable_clients;
+	atomic_t		     obd_connected_clients;
+	int			      obd_stale_clients;
+	int			      obd_delayed_clients;
+	/* this lock protects all recovery list_heads, timer and
+	 * obd_next_recovery_transno value */
+	spinlock_t			 obd_recovery_task_lock;
+	__u64			    obd_next_recovery_transno;
+	int			      obd_replayed_requests;
+	int			      obd_requests_queued_for_recovery;
+	wait_queue_head_t		      obd_next_transno_waitq;
+	/* protected by obd_recovery_task_lock */
+	timer_list_t		      obd_recovery_timer;
+	time_t			   obd_recovery_start; /* seconds */
+	time_t			   obd_recovery_end; /* seconds, for lprocfs_status */
+	int			      obd_recovery_time_hard;
+	int			      obd_recovery_timeout;
+	int			      obd_recovery_ir_factor;
+
+	/* new recovery stuff from CMD2 */
+	struct target_recovery_data      obd_recovery_data;
+	int			      obd_replayed_locks;
+	atomic_t		     obd_req_replay_clients;
+	atomic_t		     obd_lock_replay_clients;
+	/* all lists are protected by obd_recovery_task_lock */
+	struct list_head		       obd_req_replay_queue;
+	struct list_head		       obd_lock_replay_queue;
+	struct list_head		       obd_final_req_queue;
+	int			      obd_recovery_stage;
+
+	/* per-device-type private data; presumably the valid member is
+	 * determined by obd_type -- confirm */
+	union {
+		struct obd_device_target obt;
+		struct filter_obd filter;
+		struct client_obd cli;
+		struct ost_obd ost;
+		struct echo_client_obd echo_client;
+		struct echo_obd echo;
+		struct lov_obd lov;
+		struct lmv_obd lmv;
+	} u;
+	/* Fields used by LProcFS */
+	unsigned int	   obd_cntr_base;
+	struct lprocfs_stats  *obd_stats;
+
+	unsigned int	   md_cntr_base;
+	struct lprocfs_stats  *md_stats;
+
+	proc_dir_entry_t  *obd_proc_entry;
+	proc_dir_entry_t  *obd_proc_exports_entry;
+	proc_dir_entry_t  *obd_svc_procroot;
+	struct lprocfs_stats  *obd_svc_stats;
+	atomic_t	   obd_evict_inprogress;
+	wait_queue_head_t	    obd_evict_inprogress_waitq;
+	struct list_head	     obd_evict_list; /* protected with pet_lock */
+
+	/**
+	 * Ldlm pool part. Save last calculated SLV and Limit.
+	 */
+	rwlock_t		obd_pool_lock;
+	int		    obd_pool_limit;
+	__u64		  obd_pool_slv;
+
+	/**
+	 * A list of outstanding class_incref()'s against this obd. For
+	 * debugging.
+	 */
+	struct lu_ref	  obd_reference;
+
+	int		       obd_conn_inprogress;
+};
+
+/* llog flag bits (distinct single-bit values) */
+#define OBD_LLOG_FL_SENDNOW     0x0001
+#define OBD_LLOG_FL_EXIT	0x0002
+
+/* Stage argument of the o_precleanup method in struct obd_ops. */
+enum obd_cleanup_stage {
+/* Special case hack for MDS LOVs */
+	OBD_CLEANUP_EARLY,
+/* can be directly mapped to .ldto_device_fini() */
+	OBD_CLEANUP_EXPORTS,
+};
+
+/*
+ * get/set_info keys
+ * String names; presumably passed as the "key" argument of the o_get_info
+ * and o_set_info_async methods of struct obd_ops -- confirm at callers.
+ */
+#define KEY_ASYNC	       "async"
+#define KEY_BLOCKSIZE_BITS      "blocksize_bits"
+#define KEY_BLOCKSIZE	   "blocksize"
+#define KEY_CAPA_KEY	    "capa_key"
+#define KEY_CHANGELOG_CLEAR     "changelog_clear"
+#define KEY_FID2PATH	    "fid2path"
+#define KEY_CHECKSUM	    "checksum"
+#define KEY_CLEAR_FS	    "clear_fs"
+#define KEY_CONN_DATA	   "conn_data"
+#define KEY_EVICT_BY_NID	"evict_by_nid"
+#define KEY_FIEMAP	      "fiemap"
+#define KEY_FLUSH_CTX	   "flush_ctx"
+#define KEY_GRANT_SHRINK	"grant_shrink"
+#define KEY_HSM_COPYTOOL_SEND   "hsm_send"
+#define KEY_INIT_RECOV_BACKUP   "init_recov_bk"
+#define KEY_INIT_RECOV	  "initial_recov"
+#define KEY_INTERMDS	    "inter_mds"
+#define KEY_LAST_ID	     "last_id"
+#define KEY_LAST_FID		"last_fid"
+#define KEY_LOCK_TO_STRIPE      "lock_to_stripe"
+#define KEY_LOVDESC	     "lovdesc"
+#define KEY_LOV_IDX	     "lov_idx"
+#define KEY_MAX_EASIZE	  "max_easize"
+#define KEY_MDS_CONN	    "mds_conn"
+#define KEY_MGSSEC	      "mgssec"
+#define KEY_NEXT_ID	     "next_id"
+#define KEY_READ_ONLY	   "read-only"
+#define KEY_REGISTER_TARGET     "register_target"
+#define KEY_SET_FS	      "set_fs"
+#define KEY_TGT_COUNT	   "tgt_count"
+/*      KEY_SET_INFO in lustre_idl.h */
+#define KEY_SPTLRPC_CONF	"sptlrpc_conf"
+#define KEY_CONNECT_FLAG	"connect_flags"
+#define KEY_SYNC_LOCK_CANCEL    "sync_lock_cancel"
+
+#define KEY_CACHE_SET		"cache_set"
+#define KEY_CACHE_LRU_SHRINK	"cache_lru_shrink"
+#define KEY_CHANGELOG_INDEX	"changelog_index"
+
+struct lu_context;
+
+/* /!\ must be coherent with include/linux/namei.h on patched kernel */
+/* Intent operation bits, tested against lookup_intent.it_op (see
+ * it_to_lock_mode()); each is a distinct single bit so they can be OR-ed. */
+#define IT_OPEN     (1 << 0)
+#define IT_CREAT    (1 << 1)
+#define IT_READDIR  (1 << 2)
+#define IT_GETATTR  (1 << 3)
+#define IT_LOOKUP   (1 << 4)
+#define IT_UNLINK   (1 << 5)
+#define IT_TRUNC    (1 << 6)
+#define IT_GETXATTR (1 << 7)
+#define IT_EXEC     (1 << 8)
+#define IT_PIN      (1 << 9)
+#define IT_LAYOUT   (1 << 10)
+#define IT_QUOTA_DQACQ (1 << 11)
+#define IT_QUOTA_CONN  (1 << 12)
+
+/*
+ * Map the operation bits of a lookup intent to a lock mode: LCK_CW when
+ * IT_CREAT is set, LCK_CR for readdir/getattr/open/lookup/layout intents.
+ * Any other it_op trips LASSERTF(); -EINVAL is returned after it.
+ */
+static inline int it_to_lock_mode(struct lookup_intent *it)
+{
+	/* CREAT needs to be tested before open (both could be set) */
+	if (it->it_op & IT_CREAT)
+		return LCK_CW;
+
+	if (it->it_op & (IT_OPEN | IT_LOOKUP | IT_GETATTR | IT_READDIR |
+			 IT_LAYOUT))
+		return LCK_CR;
+
+	LASSERTF(0, "Invalid it_op: %d\n", it->it_op);
+	return -EINVAL;
+}
+
+/*
+ * Arguments for one metadata operation: the fids involved, name, mode,
+ * credentials, attributes and assorted flags.  Embedded in struct
+ * md_enqueue_info as mi_data and passed to obd_ops.o_fid_alloc.
+ */
+struct md_op_data {
+	struct lu_fid	   op_fid1; /* operation fid1 (usually parent) */
+	struct lu_fid	   op_fid2; /* operation fid2 (usually child) */
+	struct lu_fid	   op_fid3; /* 2 extra fids to find conflicting */
+	struct lu_fid	   op_fid4; /* to the operation locks. */
+	mdsno_t		 op_mds;  /* what mds server open will go to */
+	struct lustre_handle    op_handle;
+	obd_time		op_mod_time;
+	const char	     *op_name;
+	int		     op_namelen;
+	__u32		   op_mode;
+	struct lmv_stripe_md   *op_mea1;
+	struct lmv_stripe_md   *op_mea2;
+	__u32		   op_suppgids[2];
+	__u32		   op_fsuid;
+	__u32		   op_fsgid;
+	cfs_cap_t	       op_cap;
+	void		   *op_data;
+
+	/* iattr fields and blocks. */
+	struct iattr	    op_attr;
+	unsigned int	    op_attr_flags;
+	__u64		   op_valid;
+	loff_t		  op_attr_blocks;
+
+	/* Size-on-MDS epoch and flags. */
+	__u64		   op_ioepoch;
+	__u32		   op_flags;
+
+	/* Capa fields */
+	struct obd_capa	*op_capa1;
+	struct obd_capa	*op_capa2;
+
+	/* Various operation flags. */
+	__u32		   op_bias;
+
+	/* Operation type */
+	__u32		   op_opc;
+
+	/* Used by readdir */
+	__u64		   op_offset;
+
+	/* Used by readdir */
+	__u32		   op_npages;
+
+	/* used to transfer info between the stacks of MD client
+	 * see enum op_cli_flags */
+	__u32			op_cli_flags;
+};
+
+/* Bit flags stored in md_op_data.op_cli_flags. */
+enum op_cli_flags {
+	CLI_SET_MEA	= 1 << 0,
+	CLI_RM_ENTRY	= 1 << 1,
+};
+
+struct md_enqueue_info;
+/* metadata stat-ahead */
+/*
+ * Callback type stored in md_enqueue_info.mi_cb.  It receives the request,
+ * the enqueue info and a result code.
+ */
+typedef int (* md_enqueue_cb_t)(struct ptlrpc_request *req,
+				struct md_enqueue_info *minfo,
+				int rc);
+
+/* seq client type; the "type" argument of obd_ops.o_fid_init */
+enum lu_cli_type {
+	LUSTRE_SEQ_METADATA = 1,
+	LUSTRE_SEQ_DATA		/* implicit value 2 */
+};
+
+/*
+ * State for one metadata enqueue: operation data, intent, lock handle,
+ * directory inode, and the md_enqueue_cb_t callback with its data.
+ */
+struct md_enqueue_info {
+	struct md_op_data       mi_data;
+	struct lookup_intent    mi_it;
+	struct lustre_handle    mi_lockh;
+	struct inode	   *mi_dir;
+	md_enqueue_cb_t	 mi_cb;
+	__u64		   mi_cbdata;
+	unsigned int	    mi_generation;
+};
+
+struct obd_ops {
+	module_t *o_owner;
+	int (*o_iocontrol)(unsigned int cmd, struct obd_export *exp, int len,
+			   void *karg, void *uarg);
+	int (*o_get_info)(const struct lu_env *env, struct obd_export *,
+			  __u32 keylen, void *key, __u32 *vallen, void *val,
+			  struct lov_stripe_md *lsm);
+	int (*o_set_info_async)(const struct lu_env *, struct obd_export *,
+				__u32 keylen, void *key,
+				__u32 vallen, void *val,
+				struct ptlrpc_request_set *set);
+	int (*o_attach)(struct obd_device *dev, obd_count len, void *data);
+	int (*o_detach)(struct obd_device *dev);
+	int (*o_setup) (struct obd_device *dev, struct lustre_cfg *cfg);
+	int (*o_precleanup)(struct obd_device *dev,
+			    enum obd_cleanup_stage cleanup_stage);
+	int (*o_cleanup)(struct obd_device *dev);
+	int (*o_process_config)(struct obd_device *dev, obd_count len,
+				void *data);
+	int (*o_postrecov)(struct obd_device *dev);
+	int (*o_add_conn)(struct obd_import *imp, struct obd_uuid *uuid,
+			  int priority);
+	int (*o_del_conn)(struct obd_import *imp, struct obd_uuid *uuid);
+	/* connect to the target device with given connection
+	 * data. @ocd->ocd_connect_flags is modified to reflect flags actually
+	 * granted by the target, which are guaranteed to be a subset of flags
+	 * asked for. If @ocd == NULL, use default parameters. */
+	int (*o_connect)(const struct lu_env *env,
+			 struct obd_export **exp, struct obd_device *src,
+			 struct obd_uuid *cluuid, struct obd_connect_data *ocd,
+			 void *localdata);
+	int (*o_reconnect)(const struct lu_env *env,
+			   struct obd_export *exp, struct obd_device *src,
+			   struct obd_uuid *cluuid,
+			   struct obd_connect_data *ocd,
+			   void *localdata);
+	int (*o_disconnect)(struct obd_export *exp);
+
+	/* Initialize/finalize fids infrastructure. */
+	int (*o_fid_init)(struct obd_device *obd,
+			  struct obd_export *exp, enum lu_cli_type type);
+	int (*o_fid_fini)(struct obd_device *obd);
+
+	/* Allocate new fid according to passed @hint. */
+	int (*o_fid_alloc)(struct obd_export *exp, struct lu_fid *fid,
+			   struct md_op_data *op_data);
+
+	/*
+	 * Object with @fid is getting deleted, we may want to do something
+	 * about this.
+	 */
+	int (*o_statfs)(const struct lu_env *, struct obd_export *exp,
+			struct obd_statfs *osfs, __u64 max_age, __u32 flags);
+	int (*o_statfs_async)(struct obd_export *exp, struct obd_info *oinfo,
+			      __u64 max_age, struct ptlrpc_request_set *set);
+	int (*o_packmd)(struct obd_export *exp, struct lov_mds_md **disk_tgt,
+			struct lov_stripe_md *mem_src);
+	int (*o_unpackmd)(struct obd_export *exp,struct lov_stripe_md **mem_tgt,
+			  struct lov_mds_md *disk_src, int disk_len);
+	int (*o_preallocate)(struct lustre_handle *, obd_count *req,
+			     obd_id *ids);
+	/* FIXME: add fid capability support for create & destroy! */
+	int (*o_precreate)(struct obd_export *exp);
+	int (*o_create)(const struct lu_env *env, struct obd_export *exp,
+			struct obdo *oa, struct lov_stripe_md **ea,
+			struct obd_trans_info *oti);
+	int (*o_create_async)(struct obd_export *exp,  struct obd_info *oinfo,
+			      struct lov_stripe_md **ea,
+			      struct obd_trans_info *oti);
+	int (*o_destroy)(const struct lu_env *env, struct obd_export *exp,
+			 struct obdo *oa, struct lov_stripe_md *ea,
+			 struct obd_trans_info *oti, struct obd_export *md_exp,
+			 void *capa);
+	int (*o_setattr)(const struct lu_env *, struct obd_export *exp,
+			 struct obd_info *oinfo, struct obd_trans_info *oti);
+	int (*o_setattr_async)(struct obd_export *exp, struct obd_info *oinfo,
+			       struct obd_trans_info *oti,
+			       struct ptlrpc_request_set *rqset);
+	int (*o_getattr)(const struct lu_env *env, struct obd_export *exp,
+			 struct obd_info *oinfo);
+	int (*o_getattr_async)(struct obd_export *exp, struct obd_info *oinfo,
+			       struct ptlrpc_request_set *set);
+	int (*o_brw)(int rw, struct obd_export *exp, struct obd_info *oinfo,
+		     obd_count oa_bufs, struct brw_page *pgarr,
+		     struct obd_trans_info *oti);
+	int (*o_merge_lvb)(struct obd_export *exp, struct lov_stripe_md *lsm,
+			   struct ost_lvb *lvb, int kms_only);
+	int (*o_adjust_kms)(struct obd_export *exp, struct lov_stripe_md *lsm,
+			    obd_off size, int shrink);
+	int (*o_punch)(const struct lu_env *, struct obd_export *exp,
+		       struct obd_info *oinfo, struct obd_trans_info *oti,
+		       struct ptlrpc_request_set *rqset);
+	int (*o_sync)(const struct lu_env *env, struct obd_export *exp,
+		      struct obd_info *oinfo, obd_size start, obd_size end,
+		      struct ptlrpc_request_set *set);
+	int (*o_migrate)(struct lustre_handle *conn, struct lov_stripe_md *dst,
+			 struct lov_stripe_md *src, obd_size start,
+			 obd_size end, struct obd_trans_info *oti);
+	int (*o_copy)(struct lustre_handle *dstconn, struct lov_stripe_md *dst,
+		      struct lustre_handle *srconn, struct lov_stripe_md *src,
+		      obd_size start, obd_size end, struct obd_trans_info *);
+	int (*o_iterate)(struct lustre_handle *conn,
+			 int (*)(obd_id, obd_seq, void *),
+			 obd_id *startid, obd_seq seq, void *data);
+	int (*o_preprw)(const struct lu_env *env, int cmd,
+			struct obd_export *exp, struct obdo *oa, int objcount,
+			struct obd_ioobj *obj, struct niobuf_remote *remote,
+			int *nr_pages, struct niobuf_local *local,
+			struct obd_trans_info *oti, struct lustre_capa *capa);
+	int (*o_commitrw)(const struct lu_env *env, int cmd,
+			  struct obd_export *exp, struct obdo *oa,
+			  int objcount, struct obd_ioobj *obj,
+			  struct niobuf_remote *remote, int pages,
+			  struct niobuf_local *local,
+			  struct obd_trans_info *oti, int rc);
+	int (*o_enqueue)(struct obd_export *, struct obd_info *oinfo,
+			 struct ldlm_enqueue_info *einfo,
+			 struct ptlrpc_request_set *rqset);
+	int (*o_change_cbdata)(struct obd_export *, struct lov_stripe_md *,
+			       ldlm_iterator_t it, void *data);
+	int (*o_find_cbdata)(struct obd_export *, struct lov_stripe_md *,
+			     ldlm_iterator_t it, void *data);
+	int (*o_cancel)(struct obd_export *, struct lov_stripe_md *md,
+			__u32 mode, struct lustre_handle *);
+	int (*o_cancel_unused)(struct obd_export *, struct lov_stripe_md *,
+			       ldlm_cancel_flags_t flags, void *opaque);
+	int (*o_init_export)(struct obd_export *exp);
+	int (*o_destroy_export)(struct obd_export *exp);
+	int (*o_extent_calc)(struct obd_export *, struct lov_stripe_md *,
+			     int cmd, obd_off *);
+
+	/* llog related obd_methods */
+	int (*o_llog_init)(struct obd_device *obd, struct obd_llog_group *grp,
+			   struct obd_device *disk_obd, int *idx);
+	int (*o_llog_finish)(struct obd_device *obd, int count);
+	int (*o_llog_connect)(struct obd_export *, struct llogd_conn_body *);
+
+	/* metadata-only methods */
+	int (*o_pin)(struct obd_export *, const struct lu_fid *fid,
+		     struct obd_capa *, struct obd_client_handle *, int flag);
+	int (*o_unpin)(struct obd_export *, struct obd_client_handle *, int);
+
+	int (*o_import_event)(struct obd_device *, struct obd_import *,
+			      enum obd_import_event);
+
+	int (*o_notify)(struct obd_device *obd, struct obd_device *watched,
+			enum obd_notify_event ev, void *data);
+
+	int (*o_health_check)(const struct lu_env *env, struct obd_device *);
+	struct obd_uuid *(*o_get_uuid) (struct obd_export *exp);
+
+	/* quota methods */
+	int (*o_quotacheck)(struct obd_device *, struct obd_export *,
+			    struct obd_quotactl *);
+	int (*o_quotactl)(struct obd_device *, struct obd_export *,
+			  struct obd_quotactl *);
+
+	int (*o_ping)(const struct lu_env *, struct obd_export *exp);
+
+	/* pools methods */
+	int (*o_pool_new)(struct obd_device *obd, char *poolname);
+	int (*o_pool_del)(struct obd_device *obd, char *poolname);
+	int (*o_pool_add)(struct obd_device *obd, char *poolname,
+			  char *ostname);
+	int (*o_pool_rem)(struct obd_device *obd, char *poolname,
+			  char *ostname);
+	void (*o_getref)(struct obd_device *obd);
+	void (*o_putref)(struct obd_device *obd);
+	/*
+	 * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line
+	 * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c.
+	 * Also, add a wrapper function in include/linux/obd_class.h. */
+};
+
+enum {
+	LUSTRE_OPC_MKDIR    = (1 << 0),
+	LUSTRE_OPC_SYMLINK  = (1 << 1),
+	LUSTRE_OPC_MKNOD    = (1 << 2),
+	LUSTRE_OPC_CREATE   = (1 << 3),
+	LUSTRE_OPC_ANY      = (1 << 4)
+};
+
+/* lmv structures */
+#define MEA_MAGIC_LAST_CHAR      0xb2221ca1
+#define MEA_MAGIC_ALL_CHARS      0xb222a11c
+#define MEA_MAGIC_HASH_SEGMENT   0xb222a11b
+
+#define MAX_HASH_SIZE_32	 0x7fffffffUL
+#define MAX_HASH_SIZE	    0x7fffffffffffffffULL
+#define MAX_HASH_HIGHEST_BIT     0x1000000000000000ULL
+
+struct lustre_md {
+	struct mdt_body	 *body;
+	struct lov_stripe_md    *lsm;
+	struct lmv_stripe_md    *mea;
+#ifdef CONFIG_FS_POSIX_ACL
+	struct posix_acl	*posix_acl;
+#endif
+	struct mdt_remote_perm  *remote_perm;
+	struct obd_capa	 *mds_capa;
+	struct obd_capa	 *oss_capa;
+};
+
+struct md_open_data {
+	struct obd_client_handle *mod_och;
+	struct ptlrpc_request    *mod_open_req;
+	struct ptlrpc_request    *mod_close_req;
+	atomic_t	      mod_refcount;
+};
+
+struct lookup_intent;
+
+struct md_ops {
+	int (*m_getstatus)(struct obd_export *, struct lu_fid *,
+			   struct obd_capa **);
+	int (*m_null_inode)(struct obd_export *, const struct lu_fid *);
+	int (*m_find_cbdata)(struct obd_export *, const struct lu_fid *,
+			     ldlm_iterator_t, void *);
+	int (*m_close)(struct obd_export *, struct md_op_data *,
+		       struct md_open_data *, struct ptlrpc_request **);
+	int (*m_create)(struct obd_export *, struct md_op_data *,
+			const void *, int, int, __u32, __u32, cfs_cap_t,
+			__u64, struct ptlrpc_request **);
+	int (*m_done_writing)(struct obd_export *, struct md_op_data  *,
+			      struct md_open_data *);
+	int (*m_enqueue)(struct obd_export *, struct ldlm_enqueue_info *,
+			 struct lookup_intent *, struct md_op_data *,
+			 struct lustre_handle *, void *, int,
+			 struct ptlrpc_request **, __u64);
+	int (*m_getattr)(struct obd_export *, struct md_op_data *,
+			 struct ptlrpc_request **);
+	int (*m_getattr_name)(struct obd_export *, struct md_op_data *,
+			      struct ptlrpc_request **);
+	int (*m_intent_lock)(struct obd_export *, struct md_op_data *,
+			     void *, int, struct lookup_intent *, int,
+			     struct ptlrpc_request **,
+			     ldlm_blocking_callback, __u64);
+	int (*m_link)(struct obd_export *, struct md_op_data *,
+		      struct ptlrpc_request **);
+	int (*m_rename)(struct obd_export *, struct md_op_data *,
+			const char *, int, const char *, int,
+			struct ptlrpc_request **);
+	int (*m_is_subdir)(struct obd_export *, const struct lu_fid *,
+			   const struct lu_fid *,
+			   struct ptlrpc_request **);
+	int (*m_setattr)(struct obd_export *, struct md_op_data *, void *,
+			 int , void *, int, struct ptlrpc_request **,
+			 struct md_open_data **mod);
+	int (*m_sync)(struct obd_export *, const struct lu_fid *,
+		      struct obd_capa *, struct ptlrpc_request **);
+	int (*m_readpage)(struct obd_export *, struct md_op_data *,
+			  struct page **, struct ptlrpc_request **);
+
+	int (*m_unlink)(struct obd_export *, struct md_op_data *,
+			struct ptlrpc_request **);
+
+	int (*m_setxattr)(struct obd_export *, const struct lu_fid *,
+			  struct obd_capa *, obd_valid, const char *,
+			  const char *, int, int, int, __u32,
+			  struct ptlrpc_request **);
+
+	int (*m_getxattr)(struct obd_export *, const struct lu_fid *,
+			  struct obd_capa *, obd_valid, const char *,
+			  const char *, int, int, int,
+			  struct ptlrpc_request **);
+
+	int (*m_init_ea_size)(struct obd_export *, int, int, int);
+
+	int (*m_get_lustre_md)(struct obd_export *, struct ptlrpc_request *,
+			       struct obd_export *, struct obd_export *,
+			       struct lustre_md *);
+
+	int (*m_free_lustre_md)(struct obd_export *, struct lustre_md *);
+
+	int (*m_set_open_replay_data)(struct obd_export *,
+				      struct obd_client_handle *,
+				      struct ptlrpc_request *);
+	int (*m_clear_open_replay_data)(struct obd_export *,
+					struct obd_client_handle *);
+	int (*m_set_lock_data)(struct obd_export *, __u64 *, void *, __u64 *);
+
+	ldlm_mode_t (*m_lock_match)(struct obd_export *, __u64,
+				    const struct lu_fid *, ldlm_type_t,
+				    ldlm_policy_data_t *, ldlm_mode_t,
+				    struct lustre_handle *);
+
+	int (*m_cancel_unused)(struct obd_export *, const struct lu_fid *,
+			       ldlm_policy_data_t *, ldlm_mode_t,
+			       ldlm_cancel_flags_t flags, void *opaque);
+	int (*m_renew_capa)(struct obd_export *, struct obd_capa *oc,
+			    renew_capa_cb_t cb);
+	int (*m_unpack_capa)(struct obd_export *, struct ptlrpc_request *,
+			     const struct req_msg_field *, struct obd_capa **);
+
+	int (*m_get_remote_perm)(struct obd_export *, const struct lu_fid *,
+				 struct obd_capa *, __u32,
+				 struct ptlrpc_request **);
+
+	int (*m_intent_getattr_async)(struct obd_export *,
+				      struct md_enqueue_info *,
+				      struct ldlm_enqueue_info *);
+
+	int (*m_revalidate_lock)(struct obd_export *, struct lookup_intent *,
+				 struct lu_fid *, __u64 *bits);
+
+	/*
+	 * NOTE: If adding ops, add another LPROCFS_MD_OP_INIT() line to
+	 * lprocfs_alloc_md_stats() in obdclass/lprocfs_status.c. Also, add a
+	 * wrapper function in include/linux/obd_class.h.
+	 */
+};
+
+struct lsm_operations {
+	void (*lsm_free)(struct lov_stripe_md *);
+	int (*lsm_destroy)(struct lov_stripe_md *, struct obdo *oa,
+			   struct obd_export *md_exp);
+	void (*lsm_stripe_by_index)(struct lov_stripe_md *, int *, obd_off *,
+				    obd_off *);
+	void (*lsm_stripe_by_offset)(struct lov_stripe_md *, int *, obd_off *,
+				     obd_off *);
+	int (*lsm_lmm_verify) (struct lov_mds_md *lmm, int lmm_bytes,
+			       __u16 *stripe_count);
+	int (*lsm_unpackmd) (struct lov_obd *lov, struct lov_stripe_md *lsm,
+			     struct lov_mds_md *lmm);
+};
+
+extern const struct lsm_operations lsm_v1_ops;
+extern const struct lsm_operations lsm_v3_ops;
+static inline const struct lsm_operations *lsm_op_find(int magic)
+{
+	switch(magic) {
+	case LOV_MAGIC_V1:
+	       return &lsm_v1_ops;
+	case LOV_MAGIC_V3:
+	       return &lsm_v3_ops;
+	default:
+	       CERROR("Cannot recognize lsm_magic %08x\n", magic);
+	       return NULL;
+	}
+}
+
+/* Requests for obd_extent_calc() */
+#define OBD_CALC_STRIPE_START   1
+#define OBD_CALC_STRIPE_END     2
+
+static inline struct lustre_capa *oinfo_capa(struct obd_info *oinfo)
+{
+	return oinfo->oi_capa;
+}
+
+static inline struct md_open_data *obd_mod_alloc(void)
+{
+	struct md_open_data *mod;
+	OBD_ALLOC_PTR(mod);
+	if (mod == NULL)
+		return NULL;
+	atomic_set(&mod->mod_refcount, 1);
+	return mod;
+}
+
+#define obd_mod_get(mod) atomic_inc(&(mod)->mod_refcount)
+#define obd_mod_put(mod)					\
+({							      \
+	if (atomic_dec_and_test(&(mod)->mod_refcount)) {	  \
+		if ((mod)->mod_open_req)			  \
+			ptlrpc_req_finished((mod)->mod_open_req);   \
+		OBD_FREE_PTR(mod);			      \
+	}						       \
+})
+
+void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid);
+void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent);
+
+/* return 1 if client should resend the request; cl_resends == 0: always */
+static inline int client_should_resend(int resend, struct client_obd *cli)
+{
+	int limit = atomic_read(&cli->cl_resends); /* read once: one snapshot */
+	return limit ? limit > resend : 1;
+}
+
+/**
+ * Return device name for this device
+ *
+ * XXX: lu_device is declared before obd_device, yet lu_device holds a
+ * pointer back to obd_device, so this helper function is defined here
+ * instead of in lu_object.h
+ */
+static inline const char *lu_dev_name(const struct lu_device *lu_dev)
+{
+	return lu_dev->ld_obd->obd_name;
+}
+
+static inline bool filename_is_volatile(const char *name, int namelen, int *idx)
+{
+	const char	*start;
+	char		*end;
+
+	if (strncmp(name, LUSTRE_VOLATILE_HDR, LUSTRE_VOLATILE_HDR_LEN) != 0)
+		return false;
+
+	/* caller does not care about idx */
+	if (idx == NULL)
+		return true;
+
+	/* volatile file, the MDT can be set from name */
+	/* name format is LUSTRE_VOLATILE_HDR:[idx]: */
+	/* if no MDT is specified, use std way */
+	if (namelen < LUSTRE_VOLATILE_HDR_LEN + 2)
+		goto bad_format;
+	/* test for no MDT idx case */
+	if ((*(name + LUSTRE_VOLATILE_HDR_LEN) == ':') &&
+	    (*(name + LUSTRE_VOLATILE_HDR_LEN + 1) == ':')) {
+		*idx = -1;
+		return true;
+	}
+	/* we have an idx, read it */
+	start = name + LUSTRE_VOLATILE_HDR_LEN + 1;
+	*idx = strtoul(start, &end, 0);
+	/* error cases:
+	 * no digit, no trailing :, negative value
+	 */
+	if (((*idx == 0) && (end == start)) ||
+	    (*end != ':') || (*idx < 0))
+		goto bad_format;
+
+	return true;
+bad_format:
+	/* bad format of mdt idx, we cannot return an error
+	 * to caller so we use hash algo */
+	CERROR("Bad volatile file name format: %s\n",
+	       name + LUSTRE_VOLATILE_HDR_LEN);
+	return false;
+}
+
+static inline int cli_brw_size(struct obd_device *obd)
+{
+	LASSERT(obd != NULL);
+	return obd->u.cli.cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;
+}
+
+#endif /* __OBD_H */
diff --git a/drivers/staging/lustre/lustre/include/obd_cache.h b/drivers/staging/lustre/lustre/include/obd_cache.h
new file mode 100644
index 000000000000..c8249fbb0d72
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd_cache.h
@@ -0,0 +1,39 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _OBD_CACHE_H__
+#define _OBD_CACHE_H__
+
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/obd_cksum.h b/drivers/staging/lustre/lustre/include/obd_cksum.h
new file mode 100644
index 000000000000..5f740f1743ca
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd_cksum.h
@@ -0,0 +1,176 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __OBD_CKSUM
+#define __OBD_CKSUM
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+
+static inline unsigned char cksum_obd2cfs(cksum_type_t cksum_type)
+{
+	switch (cksum_type) {
+	case OBD_CKSUM_CRC32:
+		return CFS_HASH_ALG_CRC32;
+	case OBD_CKSUM_ADLER:
+		return CFS_HASH_ALG_ADLER32;
+	case OBD_CKSUM_CRC32C:
+		return CFS_HASH_ALG_CRC32C;
+	default:
+		CERROR("Unknown checksum type (%x)!!!\n", cksum_type);
+		LBUG();
+	}
+	return 0;
+}
+
+/* The OBD_FL_CKSUM_* flags are packed into 5 bits of o_flags, since there can
+ * only be a single checksum type per RPC.
+ *
+ * The OBD_CHECKSUM_* type bits passed in ocd_cksum_types are a 32-bit bitmask
+ * since they need to represent the full range of checksum algorithms that
+ * both the client and server can understand.
+ *
+ * In case of an unsupported types/flags we fall back to ADLER
+ * because that is supported by all clients since 1.8
+ *
+ * In case multiple algorithms are supported the best one is used. */
+static inline obd_flag cksum_type_pack(cksum_type_t cksum_type)
+{
+	unsigned int    performance = 0, tmp;
+	obd_flag	flag = OBD_FL_CKSUM_ADLER;
+
+	if (cksum_type & OBD_CKSUM_CRC32) {
+		tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32));
+		if (tmp > performance) {
+			performance = tmp;
+			flag = OBD_FL_CKSUM_CRC32;
+		}
+	}
+	if (cksum_type & OBD_CKSUM_CRC32C) {
+		tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C));
+		if (tmp > performance) {
+			performance = tmp;
+			flag = OBD_FL_CKSUM_CRC32C;
+		}
+	}
+	if (cksum_type & OBD_CKSUM_ADLER) {
+		tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER));
+		if (tmp > performance) {
+			performance = tmp;
+			flag = OBD_FL_CKSUM_ADLER;
+		}
+	}
+	if (unlikely(cksum_type && !(cksum_type & (OBD_CKSUM_CRC32C |
+						   OBD_CKSUM_CRC32 |
+						   OBD_CKSUM_ADLER))))
+		CWARN("unknown cksum type %x\n", cksum_type);
+
+	return flag;
+}
+
+static inline cksum_type_t cksum_type_unpack(obd_flag o_flags)
+{
+	switch (o_flags & OBD_FL_CKSUM_ALL) {
+	case OBD_FL_CKSUM_CRC32C:
+		return OBD_CKSUM_CRC32C;
+	case OBD_FL_CKSUM_CRC32:
+		return OBD_CKSUM_CRC32;
+	default:
+		break;
+	}
+
+	return OBD_CKSUM_ADLER;
+}
+
+/* Return a bitmask of the checksum types supported on this system.
+ * ADLER is the baseline: supported since 1.8 and not dependent on hw
+ * Client uses all available local algos
+ */
+static inline cksum_type_t cksum_types_supported_client(void)
+{
+	cksum_type_t ret = OBD_CKSUM_ADLER;
+
+	CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n",
+	       cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)),
+	       cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)),
+	       cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)));
+
+	if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) > 0)
+		ret |= OBD_CKSUM_CRC32C;
+	if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) > 0)
+		ret |= OBD_CKSUM_CRC32;
+
+	return ret;
+}
+
+/* Server uses algos that perform at 50% or better of the Adler */
+static inline cksum_type_t cksum_types_supported_server(void)
+{
+	int	     base_speed;
+	cksum_type_t    ret = OBD_CKSUM_ADLER;
+
+	CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n",
+	       cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)),
+	       cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)),
+	       cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)));
+
+	base_speed = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)) / 2;
+
+	if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) >=
+	    base_speed)
+		ret |= OBD_CKSUM_CRC32C;
+	if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) >=
+	    base_speed)
+		ret |= OBD_CKSUM_CRC32;
+
+	return ret;
+}
+
+
+/* Select the best checksum algorithm among those supplied in the cksum_types
+ * input.
+ *
+ * Currently, calling cksum_type_pack() with a mask will return the fastest
+ * checksum type due to its benchmarking at libcfs module load.
+ * Caution is advised, however, since what is fastest on a single client may
+ * not be the fastest or most efficient algorithm on the server.  */
+static inline cksum_type_t cksum_type_select(cksum_type_t cksum_types)
+{
+	return cksum_type_unpack(cksum_type_pack(cksum_types));
+}
+
+/* Checksum algorithm names. Must be defined in the same order as the
+ * OBD_CKSUM_* flags. */
+#define DECLARE_CKSUM_NAME char *cksum_name[] = {"crc32", "adler", "crc32c"}
+
+#endif /* __OBD_CKSUM */
diff --git a/drivers/staging/lustre/lustre/include/obd_class.h b/drivers/staging/lustre/lustre/include/obd_class.h
new file mode 100644
index 000000000000..de5c5853647f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd_class.h
@@ -0,0 +1,2281 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#ifndef __CLASS_OBD_H
+#define __CLASS_OBD_H
+
+
+#include <obd_support.h>
+#include <lustre_import.h>
+#include <lustre_net.h>
+#include <obd.h>
+#include <lustre_lib.h>
+#include <lustre/lustre_idl.h>
+#include <lprocfs_status.h>
+
+#include <linux/obd_class.h>
+
+#define OBD_STATFS_NODELAY      0x0001  /* requests should be sent without delay
+					 * and resends, to avoid deadlocks */
+#define OBD_STATFS_FROM_CACHE   0x0002  /* the statfs callback should not update
+					 * obd_osfs_age */
+#define OBD_STATFS_PTLRPCD      0x0004  /* requests will be sent via ptlrpcd
+					 * instead of a specific set. This
+					 * means that we cannot rely on the set
+					 * interpret routine to be called.
+					 * lov_statfs_fini() must thus be called
+					 * by the request interpret routine */
+#define OBD_STATFS_FOR_MDT0	0x0008	/* The statfs is only for retrieving
+					 * information from MDT0. */
+#define OBD_FL_PUNCH    0x00000001      /* To indicate it is punch operation */
+
+/* OBD Device Declarations */
+extern struct obd_device *obd_devs[MAX_OBD_DEVICES];
+extern rwlock_t obd_dev_lock;
+
+/* OBD Operations Declarations */
+extern struct obd_device *class_conn2obd(struct lustre_handle *);
+extern struct obd_device *class_exp2obd(struct obd_export *);
+extern int class_handle_ioctl(unsigned int cmd, unsigned long arg);
+extern int lustre_get_jobid(char *jobid);
+
+struct lu_device_type;
+
+/* genops.c */
+struct obd_export *class_conn2export(struct lustre_handle *);
+int class_register_type(struct obd_ops *, struct md_ops *,
+			struct lprocfs_vars *, const char *nm,
+			struct lu_device_type *ldt);
+int class_unregister_type(const char *nm);
+
+struct obd_device *class_newdev(const char *type_name, const char *name);
+void class_release_dev(struct obd_device *obd);
+
+int class_name2dev(const char *name);
+struct obd_device *class_name2obd(const char *name);
+int class_uuid2dev(struct obd_uuid *uuid);
+struct obd_device *class_uuid2obd(struct obd_uuid *uuid);
+void class_obd_list(void);
+struct obd_device * class_find_client_obd(struct obd_uuid *tgt_uuid,
+					  const char * typ_name,
+					  struct obd_uuid *grp_uuid);
+struct obd_device * class_devices_in_group(struct obd_uuid *grp_uuid,
+					   int *next);
+struct obd_device * class_num2obd(int num);
+int get_devices_count(void);
+
+int class_notify_sptlrpc_conf(const char *fsname, int namelen);
+
+char *obd_export_nid2str(struct obd_export *exp);
+
+int obd_export_evict_by_nid(struct obd_device *obd, const char *nid);
+int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid);
+int obd_connect_flags2str(char *page, int count, __u64 flags, char *sep);
+
+int obd_zombie_impexp_init(void);
+void obd_zombie_impexp_stop(void);
+void obd_zombie_impexp_cull(void);
+void obd_zombie_barrier(void);
+void obd_exports_barrier(struct obd_device *obd);
+int kuc_len(int payload_len);
+struct kuc_hdr * kuc_ptr(void *p);
+int kuc_ispayload(void *p);
+void *kuc_alloc(int payload_len, int transport, int type);
+void kuc_free(void *p, int payload_len);
+
+struct llog_handle;
+struct llog_rec_hdr;
+typedef int (*llog_cb_t)(const struct lu_env *, struct llog_handle *,
+			 struct llog_rec_hdr *, void *);
+/* obd_config.c */
+struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg,
+				     const char *new_name);
+int class_process_config(struct lustre_cfg *lcfg);
+int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars,
+			     struct lustre_cfg *lcfg, void *data);
+int class_attach(struct lustre_cfg *lcfg);
+int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
+int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg);
+int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg);
+struct obd_device *class_incref(struct obd_device *obd,
+				const char *scope, const void *source);
+void class_decref(struct obd_device *obd,
+		  const char *scope, const void *source);
+void dump_exports(struct obd_device *obd, int locks);
+int class_config_llog_handler(const struct lu_env *env,
+			      struct llog_handle *handle,
+			      struct llog_rec_hdr *rec, void *data);
+int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg);
+int class_add_uuid(const char *uuid, __u64 nid);
+
+/*obdecho*/
+#ifdef LPROCFS
+extern void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars);
+#else
+static inline void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars)
+{
+	memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+
+#define CFG_F_START     0x01   /* Set when we start updating from a log */
+#define CFG_F_MARKER    0x02   /* We are within a marker */
+#define CFG_F_SKIP      0x04   /* We should ignore this cfg command */
+#define CFG_F_COMPAT146 0x08   /* Allow old-style logs */
+#define CFG_F_EXCLUDE   0x10   /* OST exclusion list */
+
+/* Passed as data param to class_config_parse_llog */
+struct config_llog_instance {
+	char	       *cfg_obdname;
+	void	       *cfg_instance;
+	struct super_block *cfg_sb;
+	struct obd_uuid     cfg_uuid;
+	llog_cb_t	    cfg_callback;
+	int		 cfg_last_idx; /* for partial llog processing */
+	int		 cfg_flags;
+};
+int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
+			    char *name, struct config_llog_instance *cfg);
+int class_config_dump_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
+			   char *name, struct config_llog_instance *cfg);
+
+enum {
+	CONFIG_T_CONFIG  = 0,
+	CONFIG_T_SPTLRPC = 1,
+	CONFIG_T_RECOVER = 2,
+	CONFIG_T_MAX     = 3
+};
+
+/* list of active configuration logs  */
+struct config_llog_data {
+	struct ldlm_res_id	  cld_resid;
+	struct config_llog_instance cld_cfg;
+	struct list_head		  cld_list_chain;
+	atomic_t		cld_refcount;
+	struct config_llog_data    *cld_sptlrpc;/* depended sptlrpc log */
+	struct config_llog_data    *cld_recover;    /* imperative recover log */
+	struct obd_export	  *cld_mgcexp;
+	struct mutex		    cld_lock;
+	int			 cld_type;
+	unsigned int		cld_stopping:1, /* we were told to stop
+						     * watching */
+				    cld_lostlock:1; /* lock not requeued */
+	char			cld_logname[0];
+};
+
+struct lustre_profile {
+	struct list_head       lp_list;
+	char	    *lp_profile;
+	char	    *lp_dt;
+	char	    *lp_md;
+};
+
+struct lustre_profile *class_get_profile(const char * prof);
+void class_del_profile(const char *prof);
+void class_del_profiles(void);
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+
+void __class_export_add_lock_ref(struct obd_export *, struct ldlm_lock *);
+void __class_export_del_lock_ref(struct obd_export *, struct ldlm_lock *);
+extern void (*class_export_dump_hook)(struct obd_export *);
+
+#else
+
+#define __class_export_add_lock_ref(exp, lock)	     do {} while(0)
+#define __class_export_del_lock_ref(exp, lock)	     do {} while(0)
+
+#endif
+
+#define class_export_rpc_inc(exp)				       \
+({								      \
+	atomic_inc(&(exp)->exp_rpc_count);			  \
+	CDEBUG(D_INFO, "RPC GETting export %p : new rpc_count %d\n",    \
+	       (exp), atomic_read(&(exp)->exp_rpc_count));	  \
+})
+
+#define class_export_rpc_dec(exp) /* drop one exp_rpc_count, with trace */ \
+({								      \
+	LASSERT_ATOMIC_POS(&(exp)->exp_rpc_count);			\
+	atomic_dec(&(exp)->exp_rpc_count);			  \
+	CDEBUG(D_INFO, "RPC PUTting export %p : new rpc_count %d\n",    \
+	       (exp), atomic_read(&(exp)->exp_rpc_count));	  \
+})
+
+#define class_export_lock_get(exp, lock)				\
+({								      \
+	atomic_inc(&(exp)->exp_locks_count);			\
+	__class_export_add_lock_ref(exp, lock);			 \
+	CDEBUG(D_INFO, "lock GETting export %p : new locks_count %d\n", \
+	       (exp), atomic_read(&(exp)->exp_locks_count));	\
+	class_export_get(exp);					  \
+})
+
+#define class_export_lock_put(exp, lock) /* undo class_export_lock_get */ \
+({								      \
+	LASSERT_ATOMIC_POS(&(exp)->exp_locks_count);		      \
+	atomic_dec(&(exp)->exp_locks_count);			\
+	__class_export_del_lock_ref(exp, lock);			 \
+	CDEBUG(D_INFO, "lock PUTting export %p : new locks_count %d\n", \
+	       (exp), atomic_read(&(exp)->exp_locks_count));	\
+	class_export_put(exp);					  \
+})
+
+#define class_export_cb_get(exp)					\
+({								      \
+	atomic_inc(&(exp)->exp_cb_count);			   \
+	CDEBUG(D_INFO, "callback GETting export %p : new cb_count %d\n",\
+	       (exp), atomic_read(&(exp)->exp_cb_count));	   \
+	class_export_get(exp);					  \
+})
+
+#define class_export_cb_put(exp) /* undo class_export_cb_get */	\
+({								      \
+	LASSERT_ATOMIC_POS(&(exp)->exp_cb_count);			 \
+	atomic_dec(&(exp)->exp_cb_count);			   \
+	CDEBUG(D_INFO, "callback PUTting export %p : new cb_count %d\n",\
+	       (exp), atomic_read(&(exp)->exp_cb_count));	   \
+	class_export_put(exp);					  \
+})
+
+/* genops.c */
+struct obd_export *class_export_get(struct obd_export *exp);
+void class_export_put(struct obd_export *exp);
+struct obd_export *class_new_export(struct obd_device *obddev,
+				    struct obd_uuid *cluuid);
+void class_unlink_export(struct obd_export *exp);
+
+struct obd_import *class_import_get(struct obd_import *);
+void class_import_put(struct obd_import *);
+struct obd_import *class_new_import(struct obd_device *obd);
+void class_destroy_import(struct obd_import *exp);
+
+struct obd_type *class_search_type(const char *name);
+struct obd_type *class_get_type(const char *name);
+void class_put_type(struct obd_type *type);
+int class_connect(struct lustre_handle *conn, struct obd_device *obd,
+		  struct obd_uuid *cluuid);
+int class_disconnect(struct obd_export *exp);
+void class_fail_export(struct obd_export *exp);
+int class_connected_export(struct obd_export *exp);
+void class_disconnect_exports(struct obd_device *obddev);
+int class_manual_cleanup(struct obd_device *obd);
+void class_disconnect_stale_exports(struct obd_device *,
+				    int (*test_export)(struct obd_export *));
+static inline enum obd_option exp_flags_from_obd(struct obd_device *obd)
+{
+	return ((obd->obd_fail ? OBD_OPT_FAILOVER : 0) |
+		(obd->obd_force ? OBD_OPT_FORCE : 0) |
+		(obd->obd_abort_recovery ? OBD_OPT_ABORT_RECOV : 0) |
+		0);
+}
+
+
+void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid);
+void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj);
+void obdo_from_iattr(struct obdo *oa, struct iattr *attr,
+		     unsigned int ia_valid);
+void iattr_from_obdo(struct iattr *attr, struct obdo *oa, obd_flag valid);
+void md_from_obdo(struct md_op_data *op_data, struct obdo *oa, obd_flag valid);
+void obdo_from_md(struct obdo *oa, struct md_op_data *op_data,
+		  unsigned int valid);
+
+void obdo_cpu_to_le(struct obdo *dobdo, struct obdo *sobdo);
+void obdo_le_to_cpu(struct obdo *dobdo, struct obdo *sobdo);
+
+#define OBT(dev)	(dev)->obd_type
+#define OBP(dev, op)    (dev)->obd_type->typ_dt_ops->o_ ## op
+#define MDP(dev, op)    (dev)->obd_type->typ_md_ops->m_ ## op
+#define CTXTP(ctxt, op) (ctxt)->loc_logops->lop_##op
+
+/* Ensure obd_setup: used for cleanup which must be called
+   while obd is stopping */
+#define OBD_CHECK_DEV(obd)				      \
+do {							    \
+	if (!(obd)) {					   \
+		CERROR("NULL device\n");			\
+		RETURN(-ENODEV);				\
+	}						       \
+} while (0)
+
+/* ensure obd_setup and !obd_stopping */
+#define OBD_CHECK_DEV_ACTIVE(obd)			       \
+do {							    \
+	OBD_CHECK_DEV(obd);				     \
+	if (!(obd)->obd_set_up || (obd)->obd_stopping) {	\
+		CERROR("Device %d not setup\n",		 \
+		       (obd)->obd_minor);		       \
+		RETURN(-ENODEV);				\
+	}						       \
+} while (0)
+
+
+#ifdef LPROCFS
+/*
+ * Index of method o_<op> relative to o_iocontrol inside struct obd_ops:
+ * the byte distance between the two members divided by the size of one
+ * method slot.
+ */
+#define OBD_COUNTER_OFFSET(op)				  \
+	((offsetof(struct obd_ops, o_ ## op) -		  \
+	  offsetof(struct obd_ops, o_iocontrol))		\
+	 / sizeof(((struct obd_ops *)(0))->o_iocontrol))
+
+/*
+ * Increment the per-device counter for method <op>, if @obdx has an
+ * obd_stats table.  The counter index is obd_cntr_base plus the method's
+ * OBD_COUNTER_OFFSET() and is asserted to be below ls_num.
+ * NOTE(review): expands to a bare "if" statement rather than
+ * do { } while (0), so an "else" following the invocation would bind to it.
+ */
+#define OBD_COUNTER_INCREMENT(obdx, op)			   \
+	if ((obdx)->obd_stats != NULL) {			  \
+		unsigned int coffset;			     \
+		coffset = (unsigned int)((obdx)->obd_cntr_base) + \
+			OBD_COUNTER_OFFSET(op);		   \
+		LASSERT(coffset < (obdx)->obd_stats->ls_num);     \
+		lprocfs_counter_incr((obdx)->obd_stats, coffset); \
+	}
+
+/*
+ * Export flavour of OBD_COUNTER_INCREMENT(): bumps the <op> counter in the
+ * obd_stats table of the export's device and, when the export has
+ * exp_nid_stats with a non-NULL nid_stats table, the same index there too.
+ * Nothing is counted when the device has no obd_stats table.
+ * NOTE(review): expands to a bare "if" statement rather than
+ * do { } while (0).
+ */
+#define EXP_COUNTER_INCREMENT(export, op)				    \
+	if ((export)->exp_obd->obd_stats != NULL) {			  \
+		unsigned int coffset;					\
+		coffset = (unsigned int)((export)->exp_obd->obd_cntr_base) + \
+			OBD_COUNTER_OFFSET(op);			      \
+		LASSERT(coffset < (export)->exp_obd->obd_stats->ls_num);     \
+		lprocfs_counter_incr((export)->exp_obd->obd_stats, coffset); \
+		if ((export)->exp_nid_stats != NULL &&		       \
+		    (export)->exp_nid_stats->nid_stats != NULL)	      \
+			lprocfs_counter_incr(				\
+				(export)->exp_nid_stats->nid_stats, coffset);\
+	}
+
+/*
+ * Index of method m_<op> relative to m_getstatus inside struct md_ops:
+ * the byte distance between the two members divided by the size of one
+ * method slot.
+ */
+#define MD_COUNTER_OFFSET(op)				   \
+	((offsetof(struct md_ops, m_ ## op) -		   \
+	  offsetof(struct md_ops, m_getstatus))		 \
+	 / sizeof(((struct md_ops *)(0))->m_getstatus))
+
+/*
+ * Increment the per-device metadata counter for method <op>, if @obdx has
+ * an md_stats table.  The counter index is md_cntr_base plus the method's
+ * MD_COUNTER_OFFSET() and is asserted to be below ls_num.
+ *
+ * The guard tests the macro argument (obdx); it previously named a
+ * variable "obd", which only compiled when the caller happened to have a
+ * local of that name and could test a different device than the one
+ * being counted.
+ * NOTE(review): expands to a bare "if" statement rather than
+ * do { } while (0), matching the other counter macros.
+ */
+#define MD_COUNTER_INCREMENT(obdx, op)				\
+	if ((obdx)->md_stats != NULL) {				\
+		unsigned int coffset;				\
+		coffset = (unsigned int)((obdx)->md_cntr_base) +	\
+			MD_COUNTER_OFFSET(op);			\
+		LASSERT(coffset < (obdx)->md_stats->ls_num);	\
+		lprocfs_counter_incr((obdx)->md_stats, coffset);	\
+	}
+
+/*
+ * Export flavour of MD_COUNTER_INCREMENT(): bumps the <op> counter in the
+ * md_stats table of the export's device and, when present, in the
+ * export's own exp_md_stats table.
+ *
+ * The guard tests md_stats, the table that is actually dereferenced
+ * below; it previously tested obd_stats, which would dereference a NULL
+ * md_stats on a device that has only obd_stats.
+ * NOTE(review): expands to a bare "if" statement rather than
+ * do { } while (0), matching the other counter macros.
+ */
+#define EXP_MD_COUNTER_INCREMENT(export, op)				     \
+	if ((export)->exp_obd->md_stats != NULL) {			     \
+		unsigned int coffset;					     \
+		coffset = (unsigned int)((export)->exp_obd->md_cntr_base) +  \
+			MD_COUNTER_OFFSET(op);				     \
+		LASSERT(coffset < (export)->exp_obd->md_stats->ls_num);     \
+		lprocfs_counter_incr((export)->exp_obd->md_stats, coffset);  \
+		if ((export)->exp_md_stats != NULL)			     \
+			lprocfs_counter_incr(				     \
+				(export)->exp_md_stats, coffset);	     \
+	}
+
+#else
+/*
+ * Without LPROCFS the counter macros expand to nothing, so invocations
+ * followed by ';' become empty statements.
+ * NOTE(review): MD_COUNTER_OFFSET has no stub in this branch.
+ */
+#define OBD_COUNTER_OFFSET(op)
+#define OBD_COUNTER_INCREMENT(obd, op)
+#define EXP_COUNTER_INCREMENT(exp, op)
+#define MD_COUNTER_INCREMENT(obd, op)
+#define EXP_MD_COUNTER_INCREMENT(exp, op)
+#endif
+
+/*
+ * Allocate, initialise and register the "ldlm_stats" table of @tmp.
+ * The table has LDLM_LAST_OPC - LDLM_FIRST_OPC entries and is allocated
+ * with LPROCFS_STATS_FLAG_NOPERCPU.
+ * Returns -ENOMEM if the allocation fails, otherwise the result of
+ * lprocfs_register_stats().
+ */
+static inline int lprocfs_nid_ldlm_stats_init(struct nid_stat *tmp)
+{
+	int rc;
+
+	/* ldlm_stats is added unconditionally */
+	tmp->nid_ldlm_stats =
+		lprocfs_alloc_stats(LDLM_LAST_OPC - LDLM_FIRST_OPC,
+				    LPROCFS_STATS_FLAG_NOPERCPU);
+	if (!tmp->nid_ldlm_stats)
+		return -ENOMEM;
+
+	lprocfs_init_ldlm_stats(tmp->nid_ldlm_stats);
+
+	rc = lprocfs_register_stats(tmp->nid_proc, "ldlm_stats",
+				    tmp->nid_ldlm_stats);
+	return rc;
+}
+
+/*
+ * Leave the calling function via RETURN(err) when @obd has no type or its
+ * metadata method m_<op> is NULL.  The error is logged only when @err is
+ * non-zero, so callers may pass 0 to treat a missing method as success.
+ * NOTE(review): @obd is not parenthesised in the CERROR() arguments.
+ */
+#define OBD_CHECK_MD_OP(obd, op, err)			   \
+do {							    \
+	if (!OBT(obd) || !MDP((obd), op)) {		     \
+		if (err)					\
+			CERROR("md_" #op ": dev %s/%d no operation\n", \
+			       obd->obd_name, obd->obd_minor);  \
+		RETURN(err);				    \
+	}						       \
+} while (0)
+
+/*
+ * Validate @exp before dispatching metadata method m_<op>:
+ *   - NULL export                      -> RETURN(-ENODEV)
+ *   - export without device or type    -> RETURN(-EOPNOTSUPP)
+ *   - method slot m_<op> is NULL       -> RETURN(-EOPNOTSUPP)
+ * Each failure is logged with CERROR().
+ * NOTE(review): the messages are prefixed "obd_" although this checks an
+ * md_ops method (OBD_CHECK_MD_OP uses "md_"); the !OBT() test in the
+ * third condition repeats the one in the second.
+ */
+#define EXP_CHECK_MD_OP(exp, op)				\
+do {							    \
+	if ((exp) == NULL) {				    \
+		CERROR("obd_" #op ": NULL export\n");	   \
+		RETURN(-ENODEV);				\
+	}						       \
+	if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) {   \
+		CERROR("obd_" #op ": cleaned up obd\n");	\
+		RETURN(-EOPNOTSUPP);			    \
+	}						       \
+	if (!OBT((exp)->exp_obd) || !MDP((exp)->exp_obd, op)) { \
+		CERROR("obd_" #op ": dev %s/%d no operation\n", \
+		       (exp)->exp_obd->obd_name,		\
+		       (exp)->exp_obd->obd_minor);	      \
+		RETURN(-EOPNOTSUPP);			    \
+	}						       \
+} while (0)
+
+
+/*
+ * Leave the calling function via RETURN(err) when @obd has no type or its
+ * data method o_<op> is NULL.  The error is logged only when @err is
+ * non-zero, so callers may pass 0 (or NULL) to treat a missing method as
+ * a silent no-op.  @obd itself is dereferenced without a NULL check.
+ */
+#define OBD_CHECK_DT_OP(obd, op, err)			   \
+do {							    \
+	if (!OBT(obd) || !OBP((obd), op)) {		     \
+		if (err)					\
+			CERROR("obd_" #op ": dev %d no operation\n",    \
+			       obd->obd_minor);		 \
+		RETURN(err);				    \
+	}						       \
+} while (0)
+
+/*
+ * Validate @exp before dispatching data method o_<op>:
+ *   - NULL export                      -> RETURN(-ENODEV)
+ *   - export without device or type    -> RETURN(-EOPNOTSUPP)
+ *   - method slot o_<op> is NULL       -> RETURN(-EOPNOTSUPP)
+ * Each failure is logged with CERROR().
+ */
+#define EXP_CHECK_DT_OP(exp, op)				\
+do {							    \
+	if ((exp) == NULL) {				    \
+		CERROR("obd_" #op ": NULL export\n");	   \
+		RETURN(-ENODEV);				\
+	}						       \
+	if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) {   \
+		CERROR("obd_" #op ": cleaned up obd\n");	\
+		RETURN(-EOPNOTSUPP);			    \
+	}						       \
+	if (!OBT((exp)->exp_obd) || !OBP((exp)->exp_obd, op)) { \
+		CERROR("obd_" #op ": dev %d no operation\n",    \
+		       (exp)->exp_obd->obd_minor);	      \
+		RETURN(-EOPNOTSUPP);			    \
+	}						       \
+} while (0)
+
+/*
+ * Leave the calling function via RETURN(err) when the device behind
+ * @ctxt (loc_obd) has no type or the context's log method lop_<op> is
+ * NULL.  The error is logged only when @err is non-zero.
+ * @ctxt and ctxt->loc_obd are dereferenced without NULL checks.
+ */
+#define CTXT_CHECK_OP(ctxt, op, err)				 \
+do {								 \
+	if (!OBT(ctxt->loc_obd) || !CTXTP((ctxt), op)) {	     \
+		if (err)					     \
+			CERROR("lop_" #op ": dev %d no operation\n", \
+			       ctxt->loc_obd->obd_minor);	    \
+		RETURN(err);					 \
+	}							    \
+} while (0)
+
+/* Return the MAX_OBD_DEVICES constant. */
+static inline int class_devno_max(void)
+{
+	return MAX_OBD_DEVICES;
+}
+
+/*
+ * Dispatch to the o_get_info method of the device behind @exp.
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing; otherwise the get_info counter is
+ * bumped and the method's return value is passed back.
+ */
+static inline int obd_get_info(const struct lu_env *env,
+			       struct obd_export *exp, __u32 keylen,
+			       void *key, __u32 *vallen, void *val,
+			       struct lov_stripe_md *lsm)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, get_info);
+	EXP_COUNTER_INCREMENT(exp, get_info);
+
+	rc = OBP(exp->exp_obd, get_info)(env, exp, keylen, key, vallen, val,
+					 lsm);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_set_info_async method of the device behind @exp,
+ * forwarding @set unchanged.
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.
+ */
+static inline int obd_set_info_async(const struct lu_env *env,
+				     struct obd_export *exp, obd_count keylen,
+				     void *key, obd_count vallen, void *val,
+				     struct ptlrpc_request_set *set)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, set_info_async);
+	EXP_COUNTER_INCREMENT(exp, set_info_async);
+
+	rc = OBP(exp->exp_obd, set_info_async)(env, exp, keylen, key, vallen,
+					       val, set);
+	RETURN(rc);
+}
+
+/*
+ * obd-lu integration.
+ *
+ * Functionality is being moved into new lu_device-based layering, but some
+ * pieces of configuration process are still based on obd devices.
+ *
+ * Specifically, lu_device_type_operations::ldto_device_alloc() methods fully
+ * subsume ->o_setup() methods of obd devices they replace. The same for
+ * lu_device_operations::ldo_process_config() and ->o_process_config(). As a
+ * result, obd_setup() and obd_process_config() branch and call one XOR
+ * another.
+ *
+ * Yet neither lu_device_type_operations::ldto_device_fini() nor
+ * lu_device_type_operations::ldto_device_free() fully implement the
+ * functionality of ->o_precleanup() and ->o_cleanup() they override. Hence,
+ * obd_precleanup() and obd_cleanup() call both lu_device and obd operations.
+ */
+
+/*
+ * Declare the two locals used by the obd-lu glue functions below:
+ * a struct lu_device_type pointer named @ldt and a struct lu_device
+ * pointer named @d.  The invocation supplies the trailing ';'.
+ */
+#define DECLARE_LU_VARS(ldt, d)		 \
+	struct lu_device_type *ldt;       \
+	struct lu_device *d
+
+/*
+ * Set up @obd from @cfg.
+ *
+ * If the device type has an lu_device_type (typ_lu), the lu path is
+ * taken: ldto_device_alloc() is called inside a temporary session context
+ * and environment, and on success the new lu_device is linked to @obd in
+ * both directions.  Otherwise the legacy o_setup method is called, or
+ * -EOPNOTSUPP is returned when the type does not provide one.
+ */
+static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg)
+{
+	int rc;
+	DECLARE_LU_VARS(ldt, d);
+	ENTRY;
+
+	ldt = obd->obd_type->typ_lu;
+	if (ldt != NULL) {
+		struct lu_context  session_ctx;
+		struct lu_env env;
+		/* NOTE(review): the result of lu_context_init() is not checked */
+		lu_context_init(&session_ctx, LCT_SESSION);
+		session_ctx.lc_thread = NULL;
+		lu_context_enter(&session_ctx);
+
+		rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+		if (rc == 0) {
+			env.le_ses = &session_ctx;
+			d = ldt->ldt_ops->ldto_device_alloc(&env, ldt, cfg);
+			/* the environment is only needed for the alloc call */
+			lu_env_fini(&env);
+			if (!IS_ERR(d)) {
+				obd->obd_lu_dev = d;
+				d->ld_obd = obd;
+				rc = 0;
+			} else
+				rc = PTR_ERR(d);
+		}
+		/* the session context is released on success and failure alike */
+		lu_context_exit(&session_ctx);
+		lu_context_fini(&session_ctx);
+
+	} else {
+		OBD_CHECK_DT_OP(obd, setup, -EOPNOTSUPP);
+		OBD_COUNTER_INCREMENT(obd, setup);
+		rc = OBP(obd, setup)(obd, cfg);
+	}
+	RETURN(rc);
+}
+
+/*
+ * Run the pre-cleanup step @cleanup_stage on @obd.
+ *
+ * For a device with an lu_device, ldto_device_fini() is called first, but
+ * only in the OBD_CLEANUP_EXPORTS stage.  The legacy o_precleanup method
+ * is then called as well; if the type has none, 0 is returned.
+ * Returns -ENODEV for a NULL @obd.
+ */
+static inline int obd_precleanup(struct obd_device *obd,
+				 enum obd_cleanup_stage cleanup_stage)
+{
+	int rc;
+	DECLARE_LU_VARS(ldt, d);
+	ENTRY;
+
+	OBD_CHECK_DEV(obd);
+	ldt = obd->obd_type->typ_lu;
+	d = obd->obd_lu_dev;
+	if (ldt != NULL && d != NULL) {
+		if (cleanup_stage == OBD_CLEANUP_EXPORTS) {
+			struct lu_env env;
+
+			/* a failure here is not propagated: rc is
+			 * overwritten (or 0 returned) below */
+			rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+			if (rc == 0) {
+				ldt->ldt_ops->ldto_device_fini(&env, d);
+				lu_env_fini(&env);
+			}
+		}
+	}
+	OBD_CHECK_DT_OP(obd, precleanup, 0);
+	OBD_COUNTER_INCREMENT(obd, precleanup);
+
+	rc = OBP(obd, precleanup)(obd, cleanup_stage);
+	RETURN(rc);
+}
+
+/*
+ * Clean up @obd.
+ *
+ * For a device with an lu_device, ldto_device_free() is called in a
+ * temporary environment and obd_lu_dev is cleared.  The legacy o_cleanup
+ * method is then called as well; if the type has none, 0 is returned.
+ * Returns -ENODEV for a NULL @obd.
+ */
+static inline int obd_cleanup(struct obd_device *obd)
+{
+	int rc;
+	DECLARE_LU_VARS(ldt, d);
+	ENTRY;
+
+	OBD_CHECK_DEV(obd);
+
+	ldt = obd->obd_type->typ_lu;
+	d = obd->obd_lu_dev;
+	if (ldt != NULL && d != NULL) {
+		struct lu_env env;
+
+		/* if the environment cannot be set up the lu_device is left
+		 * in place and the error is not propagated */
+		rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+		if (rc == 0) {
+			ldt->ldt_ops->ldto_device_free(&env, d);
+			lu_env_fini(&env);
+			obd->obd_lu_dev = NULL;
+		}
+	}
+	OBD_CHECK_DT_OP(obd, cleanup, 0);
+	OBD_COUNTER_INCREMENT(obd, cleanup);
+
+	rc = OBP(obd, cleanup)(obd);
+	RETURN(rc);
+}
+
+/*
+ * Dispose of a client import that is still attached to @obd, which
+ * happens when the device was set up but never connected.
+ * With cl_sem held for writing, the import is invalidated, its request
+ * pool (if any) is freed, the import is destroyed and cl_import cleared.
+ */
+static inline void obd_cleanup_client_import(struct obd_device *obd)
+{
+	struct obd_import *imp;
+	ENTRY;
+
+	down_write(&obd->u.cli.cl_sem);
+	imp = obd->u.cli.cl_import;
+	if (imp != NULL) {
+		CDEBUG(D_CONFIG, "%s: client import never connected\n",
+		       obd->obd_name);
+		ptlrpc_invalidate_import(imp);
+		if (imp->imp_rq_pool != NULL) {
+			ptlrpc_free_rq_pool(imp->imp_rq_pool);
+			imp->imp_rq_pool = NULL;
+		}
+		client_destroy_import(imp);
+		obd->u.cli.cl_import = NULL;
+	}
+	up_write(&obd->u.cli.cl_sem);
+
+	EXIT;
+}
+
+/*
+ * Hand a configuration record (@data, @datalen bytes) to @obd.
+ *
+ * obd_process_conf is set to 1 for the duration of the call.  A device
+ * with an lu_device goes through ldo_process_config() in a temporary
+ * environment; any other device goes through the legacy o_process_config
+ * method.
+ *
+ * Returns -ENODEV for a NULL @obd, -EOPNOTSUPP when the legacy method is
+ * needed but missing, otherwise the result of lu_env_init() or of the
+ * method that was called.
+ */
+static inline int
+obd_process_config(struct obd_device *obd, int datalen, void *data)
+{
+	int rc;
+	DECLARE_LU_VARS(ldt, d);
+	ENTRY;
+
+	OBD_CHECK_DEV(obd);
+
+	obd->obd_process_conf = 1;
+	ldt = obd->obd_type->typ_lu;
+	d = obd->obd_lu_dev;
+	if (ldt != NULL && d != NULL) {
+		struct lu_env env;
+
+		rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+		if (rc == 0) {
+			rc = d->ld_ops->ldo_process_config(&env, d, data);
+			lu_env_fini(&env);
+		}
+	} else {
+		/*
+		 * Open-coded OBD_CHECK_DT_OP(): the macro would RETURN()
+		 * straight away and leave obd_process_conf set, so the flag
+		 * is reset here before bailing out.
+		 */
+		if (!OBT(obd) || !OBP(obd, process_config)) {
+			CERROR("obd_process_config: dev %d no operation\n",
+			       obd->obd_minor);
+			obd->obd_process_conf = 0;
+			RETURN(-EOPNOTSUPP);
+		}
+		rc = OBP(obd, process_config)(obd, datalen, data);
+	}
+	OBD_COUNTER_INCREMENT(obd, process_config);
+	obd->obd_process_conf = 0;
+
+	RETURN(rc);
+}
+
+/* Pack an in-memory MD struct for storage on disk.
+ * Returns +ve size of packed MD (0 for free), or -ve error.
+ *
+ * If @disk_tgt == NULL, MD size is returned (max size if @mem_src == NULL).
+ * If @*disk_tgt != NULL and @mem_src == NULL, @*disk_tgt will be freed.
+ * If @*disk_tgt == NULL, it will be allocated
+ *
+ * The work is done by the o_packmd method of the device behind @exp;
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.
+ */
+static inline int obd_packmd(struct obd_export *exp,
+			     struct lov_mds_md **disk_tgt,
+			     struct lov_stripe_md *mem_src)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, packmd);
+	EXP_COUNTER_INCREMENT(exp, packmd);
+
+	rc = OBP(exp->exp_obd, packmd)(exp, disk_tgt, mem_src);
+	RETURN(rc);
+}
+
+/*
+ * Size query: calls obd_packmd() with a NULL @disk_tgt, which per that
+ * function's contract returns the packed MD size for @mem_src.
+ */
+static inline int obd_size_diskmd(struct obd_export *exp,
+				  struct lov_stripe_md *mem_src)
+{
+	return obd_packmd(exp, NULL, mem_src);
+}
+
+/*
+ * Helper: allocate an on-disk MD.  Asserts that @disk_tgt is valid and
+ * that *@disk_tgt is still NULL, then calls obd_packmd() with no source.
+ */
+static inline int obd_alloc_diskmd(struct obd_export *exp,
+				   struct lov_mds_md **disk_tgt)
+{
+	LASSERT(disk_tgt);
+	LASSERT(*disk_tgt == NULL);
+	return obd_packmd(exp, disk_tgt, NULL);
+}
+
+/*
+ * Helper: free an on-disk MD.  Asserts that both @disk_tgt and *@disk_tgt
+ * are non-NULL, swabs the MD when needed (see below) and calls
+ * obd_packmd() with no source.
+ * NOTE(review): unlike obd_free_memmd(), *@disk_tgt is not reset here.
+ */
+static inline int obd_free_diskmd(struct obd_export *exp,
+				  struct lov_mds_md **disk_tgt)
+{
+	LASSERT(disk_tgt);
+	LASSERT(*disk_tgt);
+	/*
+	 * LU-2590, for caller's convenience, *disk_tgt could be host
+	 * endianness, it needs swab to LE if necessary, while just
+	 * lov_mds_md header needs it for figuring out how much memory
+	 * needs to be freed.
+	 */
+	/* the first test is true only where cpu_to_le32() changes the value */
+	if ((cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) &&
+	    (((*disk_tgt)->lmm_magic == LOV_MAGIC_V1) ||
+	     ((*disk_tgt)->lmm_magic == LOV_MAGIC_V3)))
+		lustre_swab_lov_mds_md(*disk_tgt);
+	return obd_packmd(exp, disk_tgt, NULL);
+}
+
+/* Unpack an MD struct from disk to in-memory format.
+ * Returns +ve size of unpacked MD (0 for free), or -ve error.
+ *
+ * If @mem_tgt == NULL, MD size is returned (max size if @disk_src == NULL).
+ * If @*mem_tgt != NULL and @disk_src == NULL, @*mem_tgt will be freed.
+ * If @*mem_tgt == NULL, it will be allocated
+ *
+ * The work is done by the o_unpackmd method of the device behind @exp;
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.
+ */
+static inline int obd_unpackmd(struct obd_export *exp,
+			       struct lov_stripe_md **mem_tgt,
+			       struct lov_mds_md *disk_src,
+			       int disk_len)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, unpackmd);
+	EXP_COUNTER_INCREMENT(exp, unpackmd);
+
+	rc = OBP(exp->exp_obd, unpackmd)(exp, mem_tgt, disk_src, disk_len);
+	RETURN(rc);
+}
+
+/*
+ * Helper: allocate an in-memory MD.  Asserts that @mem_tgt is valid and
+ * that *@mem_tgt is still NULL, then calls obd_unpackmd() with no source.
+ */
+static inline int obd_alloc_memmd(struct obd_export *exp,
+				  struct lov_stripe_md **mem_tgt)
+{
+	LASSERT(mem_tgt);
+	LASSERT(*mem_tgt == NULL);
+	return obd_unpackmd(exp, mem_tgt, NULL, 0);
+}
+
+/*
+ * Helper: free an in-memory MD.  Asserts that both @mem_tgt and *@mem_tgt
+ * are non-NULL, calls obd_unpackmd() with no source and then clears
+ * *@mem_tgt regardless of the result, which is returned.
+ */
+static inline int obd_free_memmd(struct obd_export *exp,
+				 struct lov_stripe_md **mem_tgt)
+{
+	int rc;
+
+	LASSERT(mem_tgt);
+	LASSERT(*mem_tgt);
+	rc = obd_unpackmd(exp, mem_tgt, NULL, 0);
+	*mem_tgt = NULL;
+	return rc;
+}
+
+/*
+ * Dispatch to the o_precreate method of the device behind @exp.
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.
+ * NOTE(review): uses the device-level OBD_COUNTER_INCREMENT() where the
+ * neighbouring export wrappers use EXP_COUNTER_INCREMENT().
+ */
+static inline int obd_precreate(struct obd_export *exp)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, precreate);
+	OBD_COUNTER_INCREMENT(exp->exp_obd, precreate);
+
+	rc = OBP(exp->exp_obd, precreate)(exp);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_create_async method of the device behind @exp.
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.
+ */
+static inline int obd_create_async(struct obd_export *exp,
+				   struct obd_info *oinfo,
+				   struct lov_stripe_md **ea,
+				   struct obd_trans_info *oti)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, create_async);
+	EXP_COUNTER_INCREMENT(exp, create_async);
+
+	rc = OBP(exp->exp_obd, create_async)(exp, oinfo, ea, oti);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_create method of the device behind @exp.
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.
+ */
+static inline int obd_create(const struct lu_env *env, struct obd_export *exp,
+			     struct obdo *obdo, struct lov_stripe_md **ea,
+			     struct obd_trans_info *oti)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, create);
+	EXP_COUNTER_INCREMENT(exp, create);
+
+	rc = OBP(exp->exp_obd, create)(env, exp, obdo, ea, oti);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_destroy method of the device behind @exp.
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.
+ */
+static inline int obd_destroy(const struct lu_env *env, struct obd_export *exp,
+			      struct obdo *obdo, struct lov_stripe_md *ea,
+			      struct obd_trans_info *oti,
+			      struct obd_export *md_exp, void *capa)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, destroy);
+	EXP_COUNTER_INCREMENT(exp, destroy);
+
+	rc = OBP(exp->exp_obd, destroy)(env, exp, obdo, ea, oti, md_exp, capa);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_getattr method of the device behind @exp.
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.
+ */
+static inline int obd_getattr(const struct lu_env *env, struct obd_export *exp,
+			      struct obd_info *oinfo)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, getattr);
+	EXP_COUNTER_INCREMENT(exp, getattr);
+
+	rc = OBP(exp->exp_obd, getattr)(env, exp, oinfo);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_getattr_async method of the device behind @exp,
+ * forwarding @set unchanged.
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.
+ */
+static inline int obd_getattr_async(struct obd_export *exp,
+				    struct obd_info *oinfo,
+				    struct ptlrpc_request_set *set)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, getattr_async);
+	EXP_COUNTER_INCREMENT(exp, getattr_async);
+
+	rc = OBP(exp->exp_obd, getattr_async)(exp, oinfo, set);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_setattr method of the device behind @exp.
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.
+ */
+static inline int obd_setattr(const struct lu_env *env, struct obd_export *exp,
+			      struct obd_info *oinfo,
+			      struct obd_trans_info *oti)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, setattr);
+	EXP_COUNTER_INCREMENT(exp, setattr);
+
+	rc = OBP(exp->exp_obd, setattr)(env, exp, oinfo, oti);
+	RETURN(rc);
+}
+
+/* This performs all the requests set init/wait/destroy actions.
+ *
+ * A private request set is created, handed to the device's
+ * o_setattr_async method, waited on only if that method returned 0, and
+ * destroyed in every case.  Returns -ENOMEM if the set cannot be created,
+ * otherwise the method's error or the result of the wait. */
+static inline int obd_setattr_rqset(struct obd_export *exp,
+				    struct obd_info *oinfo,
+				    struct obd_trans_info *oti)
+{
+	struct ptlrpc_request_set *set = NULL;
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, setattr_async);
+	EXP_COUNTER_INCREMENT(exp, setattr_async);
+
+	set =  ptlrpc_prep_set();
+	if (set == NULL)
+		RETURN(-ENOMEM);
+
+	rc = OBP(exp->exp_obd, setattr_async)(exp, oinfo, oti, set);
+	if (rc == 0)
+		rc = ptlrpc_set_wait(set);
+	ptlrpc_set_destroy(set);
+	RETURN(rc);
+}
+
+/* This adds all the requests into @set if @set != NULL, otherwise
+   all requests are sent asynchronously without waiting for response.
+   @set is forwarded unchanged to the device's o_setattr_async method;
+   EXP_CHECK_DT_OP() returns early with an error when the export, its
+   device or the method is missing. */
+static inline int obd_setattr_async(struct obd_export *exp,
+				    struct obd_info *oinfo,
+				    struct obd_trans_info *oti,
+				    struct ptlrpc_request_set *set)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, setattr_async);
+	EXP_COUNTER_INCREMENT(exp, setattr_async);
+
+	rc = OBP(exp->exp_obd, setattr_async)(exp, oinfo, oti, set);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_add_conn method of the device owning @imp (imp_obd).
+ * Returns -ENODEV if that device is NULL, not set up or stopping, and
+ * -EOPNOTSUPP if it has no such method.  @imp itself is dereferenced
+ * without a NULL check.
+ */
+static inline int obd_add_conn(struct obd_import *imp, struct obd_uuid *uuid,
+			       int priority)
+{
+	struct obd_device *obd = imp->imp_obd;
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DEV_ACTIVE(obd);
+	OBD_CHECK_DT_OP(obd, add_conn, -EOPNOTSUPP);
+	OBD_COUNTER_INCREMENT(obd, add_conn);
+
+	rc = OBP(obd, add_conn)(imp, uuid, priority);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_del_conn method of the device owning @imp (imp_obd).
+ * Returns -ENODEV if that device is NULL, not set up or stopping, and
+ * -EOPNOTSUPP if it has no such method.  @imp itself is dereferenced
+ * without a NULL check.
+ */
+static inline int obd_del_conn(struct obd_import *imp, struct obd_uuid *uuid)
+{
+	struct obd_device *obd = imp->imp_obd;
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DEV_ACTIVE(obd);
+	OBD_CHECK_DT_OP(obd, del_conn, -EOPNOTSUPP);
+	OBD_COUNTER_INCREMENT(obd, del_conn);
+
+	rc = OBP(obd, del_conn)(imp, uuid);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_get_uuid method of the device behind @exp.
+ * Returns NULL, without logging, when the device has no type or no such
+ * method.  @exp and exp->exp_obd are dereferenced without NULL checks.
+ */
+static inline struct obd_uuid *obd_get_uuid(struct obd_export *exp)
+{
+	struct obd_uuid *uuid;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(exp->exp_obd, get_uuid, NULL);
+	EXP_COUNTER_INCREMENT(exp, get_uuid);
+
+	uuid = OBP(exp->exp_obd, get_uuid)(exp);
+	RETURN(uuid);
+}
+
+/** Create a new \a exp on device \a obd for the uuid \a cluuid
+ * @param exp New export handle
+ * @param data Connect data, supported flags are set, flags also understood
+ *    by obd are returned.
+ *
+ * Returns -ENODEV if \a obd is NULL, not set up or stopping, -EOPNOTSUPP
+ * if it has no o_connect method, otherwise that method's result.
+ * After the call it is asserted that the flags left in \a data are a
+ * subset of the flags that were requested on entry.
+ */
+static inline int obd_connect(const struct lu_env *env,
+			      struct obd_export **exp,struct obd_device *obd,
+			      struct obd_uuid *cluuid,
+			      struct obd_connect_data *data,
+			      void *localdata)
+{
+	int rc;
+	__u64 ocf = data ? data->ocd_connect_flags : 0; /* for post-condition
+						   * check */
+	ENTRY;
+
+	OBD_CHECK_DEV_ACTIVE(obd);
+	OBD_CHECK_DT_OP(obd, connect, -EOPNOTSUPP);
+	OBD_COUNTER_INCREMENT(obd, connect);
+
+	rc = OBP(obd, connect)(env, exp, obd, cluuid, data, localdata);
+	/* check that only subset is granted */
+	LASSERT(ergo(data != NULL, (data->ocd_connect_flags & ocf) ==
+				    data->ocd_connect_flags));
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_reconnect method of @obd for the existing @exp.
+ * Returns -ENODEV if @obd is NULL, not set up or stopping, and 0 (silently)
+ * if it has no o_reconnect method; otherwise that method's result.
+ * After the call it is asserted that the flags left in @d are a subset of
+ * the flags that were requested on entry.
+ */
+static inline int obd_reconnect(const struct lu_env *env,
+				struct obd_export *exp,
+				struct obd_device *obd,
+				struct obd_uuid *cluuid,
+				struct obd_connect_data *d,
+				void *localdata)
+{
+	int rc;
+	__u64 ocf = d ? d->ocd_connect_flags : 0; /* for post-condition
+						   * check */
+
+	ENTRY;
+
+	OBD_CHECK_DEV_ACTIVE(obd);
+	OBD_CHECK_DT_OP(obd, reconnect, 0);
+	OBD_COUNTER_INCREMENT(obd, reconnect);
+
+	rc = OBP(obd, reconnect)(env, exp, obd, cluuid, d, localdata);
+	/* check that only subset is granted */
+	LASSERT(ergo(d != NULL,
+		     (d->ocd_connect_flags & ocf) == d->ocd_connect_flags));
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_disconnect method of the device behind @exp.
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.
+ */
+static inline int obd_disconnect(struct obd_export *exp)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, disconnect);
+	EXP_COUNTER_INCREMENT(exp, disconnect);
+
+	rc = OBP(exp->exp_obd, disconnect)(exp);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_fid_init method of @obd.
+ * Returns 0 (silently) when the device has no type or no such method.
+ * @obd is dereferenced without a NULL check.
+ */
+static inline int obd_fid_init(struct obd_device *obd, struct obd_export *exp,
+			       enum lu_cli_type type)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(obd, fid_init, 0);
+	OBD_COUNTER_INCREMENT(obd, fid_init);
+
+	rc = OBP(obd, fid_init)(obd, exp, type);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_fid_fini method of @obd.
+ * Returns 0 (silently) when the device has no type or no such method.
+ * @obd is dereferenced without a NULL check.
+ */
+static inline int obd_fid_fini(struct obd_device *obd)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(obd, fid_fini, 0);
+	OBD_COUNTER_INCREMENT(obd, fid_fini);
+
+	rc = OBP(obd, fid_fini)(obd);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_fid_alloc method of the device behind @exp.
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.
+ */
+static inline int obd_fid_alloc(struct obd_export *exp,
+				struct lu_fid *fid,
+				struct md_op_data *op_data)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, fid_alloc);
+	EXP_COUNTER_INCREMENT(exp, fid_alloc);
+
+	rc = OBP(exp->exp_obd, fid_alloc)(exp, fid, op_data);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_ping method of the device behind @exp.
+ * Returns 0 (silently) when the device has no type or no such method.
+ * @exp and exp->exp_obd are dereferenced without NULL checks.
+ */
+static inline int obd_ping(const struct lu_env *env, struct obd_export *exp)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(exp->exp_obd, ping, 0);
+	EXP_COUNTER_INCREMENT(exp, ping);
+
+	rc = OBP(exp->exp_obd, ping)(env, exp);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_pool_new method of @obd with @poolname.
+ * Returns -EOPNOTSUPP (logged) when the device has no type or no such
+ * method.  @obd is dereferenced without a NULL check.
+ */
+static inline int obd_pool_new(struct obd_device *obd, char *poolname)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(obd, pool_new, -EOPNOTSUPP);
+	OBD_COUNTER_INCREMENT(obd, pool_new);
+
+	rc = OBP(obd, pool_new)(obd, poolname);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_pool_del method of @obd with @poolname.
+ * Returns -EOPNOTSUPP (logged) when the device has no type or no such
+ * method.  @obd is dereferenced without a NULL check.
+ */
+static inline int obd_pool_del(struct obd_device *obd, char *poolname)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(obd, pool_del, -EOPNOTSUPP);
+	OBD_COUNTER_INCREMENT(obd, pool_del);
+
+	rc = OBP(obd, pool_del)(obd, poolname);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_pool_add method of @obd with @poolname and @ostname.
+ * Returns -EOPNOTSUPP (logged) when the device has no type or no such
+ * method.  @obd is dereferenced without a NULL check.
+ */
+static inline int obd_pool_add(struct obd_device *obd, char *poolname, char *ostname)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(obd, pool_add, -EOPNOTSUPP);
+	OBD_COUNTER_INCREMENT(obd, pool_add);
+
+	rc = OBP(obd, pool_add)(obd, poolname, ostname);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_pool_rem method of @obd with @poolname and @ostname.
+ * Returns -EOPNOTSUPP (logged) when the device has no type or no such
+ * method.  @obd is dereferenced without a NULL check.
+ */
+static inline int obd_pool_rem(struct obd_device *obd, char *poolname, char *ostname)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(obd, pool_rem, -EOPNOTSUPP);
+	OBD_COUNTER_INCREMENT(obd, pool_rem);
+
+	rc = OBP(obd, pool_rem)(obd, poolname, ostname);
+	RETURN(rc);
+}
+
+/*
+ * Invoke the optional o_getref method of @obd.
+ * Does nothing when the device has no type or no such method.
+ */
+static inline void obd_getref(struct obd_device *obd)
+{
+	ENTRY;
+	if (!OBT(obd) || !OBP(obd, getref)) {
+		EXIT;
+		return;
+	}
+
+	OBD_COUNTER_INCREMENT(obd, getref);
+	OBP(obd, getref)(obd);
+	EXIT;
+}
+
+/*
+ * Invoke the optional o_putref method of @obd.
+ * Does nothing when the device has no type or no such method.
+ */
+static inline void obd_putref(struct obd_device *obd)
+{
+	ENTRY;
+	if (!OBT(obd) || !OBP(obd, putref)) {
+		EXIT;
+		return;
+	}
+
+	OBD_COUNTER_INCREMENT(obd, putref);
+	OBP(obd, putref)(obd);
+	EXIT;
+}
+
+/*
+ * Invoke the optional o_init_export method of the device behind @exp and
+ * return its result.  Returns 0 when the export has no device, the device
+ * has no type, or the type has no such method.
+ */
+static inline int obd_init_export(struct obd_export *exp)
+{
+	int rc;
+
+	ENTRY;
+	if (exp->exp_obd == NULL || !OBT(exp->exp_obd) ||
+	    !OBP(exp->exp_obd, init_export))
+		RETURN(0);
+
+	rc = OBP(exp->exp_obd, init_export)(exp);
+	RETURN(rc);
+}
+
+/*
+ * Invoke the optional o_destroy_export method of the device behind @exp.
+ * The method's result is discarded; this function always returns 0.
+ */
+static inline int obd_destroy_export(struct obd_export *exp)
+{
+	ENTRY;
+	if (exp->exp_obd == NULL || !OBT(exp->exp_obd) ||
+	    !OBP(exp->exp_obd, destroy_export))
+		RETURN(0);
+
+	OBP(exp->exp_obd, destroy_export)(exp);
+	RETURN(0);
+}
+
+/*
+ * Dispatch to the o_extent_calc method of the device behind @exp.
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.  No counter is incremented here.
+ */
+static inline int obd_extent_calc(struct obd_export *exp,
+				  struct lov_stripe_md *md,
+				  int cmd, obd_off *offset)
+{
+	int rc;
+	ENTRY;
+	EXP_CHECK_DT_OP(exp, extent_calc);
+	rc = OBP(exp->exp_obd, extent_calc)(exp, md, cmd, offset);
+	RETURN(rc);
+}
+
+/*
+ * Look up a dentry through the l_fid2dentry callback of the lvfs run
+ * context embedded in the device behind @exp, passing the id and sequence
+ * extracted from @oi together with @gen.
+ *
+ * The device pointer is asserted before it is used to locate the context;
+ * previously the context address was derived from exp->exp_obd ahead of
+ * the LASSERT that is meant to guard it.
+ */
+static inline struct dentry *
+obd_lvfs_fid2dentry(struct obd_export *exp, struct ost_id *oi, __u32 gen)
+{
+	struct lvfs_run_ctxt *ctxt;
+
+	LASSERT(exp->exp_obd);
+	ctxt = &exp->exp_obd->obd_lvfs_ctxt;
+
+	return ctxt->cb_ops.l_fid2dentry(ostid_id(oi), gen, ostid_seq(oi),
+					 exp->exp_obd);
+}
+
+/* @max_age is the oldest time in jiffies that we accept using a cached data.
+ * If the cache is older than @max_age we will get a new value from the
+ * target.  Use a value of "cfs_time_current() + HZ" to guarantee freshness.
+ *
+ * When the cached obd_osfs is too old the request is handed to the
+ * device's o_statfs_async method with @rqset.  Otherwise the cached copy
+ * is copied to oinfo->oi_osfs under obd_osfs_lock, OBD_STATFS_FROM_CACHE
+ * is set in oinfo->oi_flags and oinfo->oi_cb_up, if set, is called with 0.
+ *
+ * Returns -EINVAL when @exp or its device is NULL, -EOPNOTSUPP when the
+ * device lacks o_statfs or (on the refresh path) o_statfs_async, 0 when
+ * the cache was used, otherwise the result of o_statfs_async. */
+static inline int obd_statfs_async(struct obd_export *exp,
+				   struct obd_info *oinfo,
+				   __u64 max_age,
+				   struct ptlrpc_request_set *rqset)
+{
+	int rc = 0;
+	struct obd_device *obd;
+	ENTRY;
+
+	if (exp == NULL || exp->exp_obd == NULL)
+		RETURN(-EINVAL);
+
+	obd = exp->exp_obd;
+	OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP);
+	OBD_COUNTER_INCREMENT(obd, statfs);
+
+	CDEBUG(D_SUPER, "%s: osfs %p age "LPU64", max_age "LPU64"\n",
+	       obd->obd_name, &obd->obd_osfs, obd->obd_osfs_age, max_age);
+	if (cfs_time_before_64(obd->obd_osfs_age, max_age)) {
+		/*
+		 * The method invoked here is o_statfs_async, not o_statfs,
+		 * so it has to be checked separately to avoid calling
+		 * through a NULL slot.
+		 */
+		OBD_CHECK_DT_OP(obd, statfs_async, -EOPNOTSUPP);
+		rc = OBP(obd, statfs_async)(exp, oinfo, max_age, rqset);
+	} else {
+		CDEBUG(D_SUPER,"%s: use %p cache blocks "LPU64"/"LPU64
+		       " objects "LPU64"/"LPU64"\n",
+		       obd->obd_name, &obd->obd_osfs,
+		       obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks,
+		       obd->obd_osfs.os_ffree, obd->obd_osfs.os_files);
+		spin_lock(&obd->obd_osfs_lock);
+		memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs));
+		spin_unlock(&obd->obd_osfs_lock);
+		oinfo->oi_flags |= OBD_STATFS_FROM_CACHE;
+		if (oinfo->oi_cb_up)
+			oinfo->oi_cb_up(oinfo, 0);
+	}
+	RETURN(rc);
+}
+
+/*
+ * Synchronous wrapper around obd_statfs_async(): creates a private
+ * request set, issues the async statfs with @osfs and @flags packed into
+ * a local obd_info, waits on the set only if the call returned 0, and
+ * destroys the set in every case.
+ * Returns -ENOMEM if the set cannot be created.
+ */
+static inline int obd_statfs_rqset(struct obd_export *exp,
+				   struct obd_statfs *osfs, __u64 max_age,
+				   __u32 flags)
+{
+	struct ptlrpc_request_set *set = NULL;
+	struct obd_info oinfo = { { { 0 } } };
+	int rc = 0;
+	ENTRY;
+
+	set =  ptlrpc_prep_set();
+	if (set == NULL)
+		RETURN(-ENOMEM);
+
+	oinfo.oi_osfs = osfs;
+	oinfo.oi_flags = flags;
+	rc = obd_statfs_async(exp, &oinfo, max_age, set);
+	if (rc == 0)
+		rc = ptlrpc_set_wait(set);
+	ptlrpc_set_destroy(set);
+	RETURN(rc);
+}
+
+/* @max_age is the oldest time in jiffies that we accept using a cached data.
+ * If the cache is older than @max_age we will get a new value from the
+ * target.  Use a value of "cfs_time_current() + HZ" to guarantee freshness.
+ *
+ * On a refresh the device's o_statfs method fills @osfs and, if it
+ * succeeded, the result is stored in obd_osfs with a new obd_osfs_age.
+ * Otherwise @osfs is filled from the cached obd_osfs.  Both copies are
+ * made under obd_osfs_lock.
+ * Returns -EINVAL when the export has no device and -EOPNOTSUPP when the
+ * device lacks o_statfs.
+ * NOTE(review): @exp is dereferenced in the initialiser without the NULL
+ * check that obd_statfs_async() performs. */
+static inline int obd_statfs(const struct lu_env *env, struct obd_export *exp,
+			     struct obd_statfs *osfs, __u64 max_age,
+			     __u32 flags)
+{
+	int rc = 0;
+	struct obd_device *obd = exp->exp_obd;
+	ENTRY;
+
+	if (obd == NULL)
+		RETURN(-EINVAL);
+
+	OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP);
+	OBD_COUNTER_INCREMENT(obd, statfs);
+
+	CDEBUG(D_SUPER, "osfs "LPU64", max_age "LPU64"\n",
+	       obd->obd_osfs_age, max_age);
+	if (cfs_time_before_64(obd->obd_osfs_age, max_age)) {
+		rc = OBP(obd, statfs)(env, exp, osfs, max_age, flags);
+		if (rc == 0) {
+			/* refresh the cache only after a successful call */
+			spin_lock(&obd->obd_osfs_lock);
+			memcpy(&obd->obd_osfs, osfs, sizeof(obd->obd_osfs));
+			obd->obd_osfs_age = cfs_time_current_64();
+			spin_unlock(&obd->obd_osfs_lock);
+		}
+	} else {
+		CDEBUG(D_SUPER, "%s: use %p cache blocks "LPU64"/"LPU64
+		       " objects "LPU64"/"LPU64"\n",
+		       obd->obd_name, &obd->obd_osfs,
+		       obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks,
+		       obd->obd_osfs.os_ffree, obd->obd_osfs.os_files);
+		spin_lock(&obd->obd_osfs_lock);
+		memcpy(osfs, &obd->obd_osfs, sizeof(*osfs));
+		spin_unlock(&obd->obd_osfs_lock);
+	}
+	RETURN(rc);
+}
+
+/*
+ * Synchronous form of obd_sync(): creates a private request set, calls
+ * the device's o_sync method with a NULL environment, waits on the set
+ * only if the method returned 0, and destroys the set in every case.
+ * Returns -EOPNOTSUPP when the device lacks o_sync and -ENOMEM if the set
+ * cannot be created.  @exp and exp->exp_obd are dereferenced without NULL
+ * checks.
+ */
+static inline int obd_sync_rqset(struct obd_export *exp, struct obd_info *oinfo,
+				 obd_size start, obd_size end)
+{
+	struct ptlrpc_request_set *set = NULL;
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(exp->exp_obd, sync, -EOPNOTSUPP);
+	EXP_COUNTER_INCREMENT(exp, sync);
+
+	set =  ptlrpc_prep_set();
+	if (set == NULL)
+		RETURN(-ENOMEM);
+
+	rc = OBP(exp->exp_obd, sync)(NULL, exp, oinfo, start, end, set);
+	if (rc == 0)
+		rc = ptlrpc_set_wait(set);
+	ptlrpc_set_destroy(set);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_sync method of the device behind @exp, forwarding
+ * @set unchanged.
+ * Returns -EOPNOTSUPP (logged) when the device has no type or no such
+ * method.  @exp and exp->exp_obd are dereferenced without NULL checks.
+ */
+static inline int obd_sync(const struct lu_env *env, struct obd_export *exp,
+			   struct obd_info *oinfo, obd_size start, obd_size end,
+			   struct ptlrpc_request_set *set)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(exp->exp_obd, sync, -EOPNOTSUPP);
+	EXP_COUNTER_INCREMENT(exp, sync);
+
+	rc = OBP(exp->exp_obd, sync)(env, exp, oinfo, start, end, set);
+	RETURN(rc);
+}
+
+/*
+ * Synchronous form of obd_punch(): creates a private request set, calls
+ * the device's o_punch method with a NULL environment, waits on the set
+ * only if the method returned 0, and destroys the set in every case.
+ * Returns -ENOMEM if the set cannot be created.
+ */
+static inline int obd_punch_rqset(struct obd_export *exp,
+				  struct obd_info *oinfo,
+				  struct obd_trans_info *oti)
+{
+	struct ptlrpc_request_set *set = NULL;
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, punch);
+	EXP_COUNTER_INCREMENT(exp, punch);
+
+	set =  ptlrpc_prep_set();
+	if (set == NULL)
+		RETURN(-ENOMEM);
+
+	rc = OBP(exp->exp_obd, punch)(NULL, exp, oinfo, oti, set);
+	if (rc == 0)
+		rc = ptlrpc_set_wait(set);
+	ptlrpc_set_destroy(set);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_punch method of the device behind @exp, forwarding
+ * @rqset unchanged.
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.
+ */
+static inline int obd_punch(const struct lu_env *env, struct obd_export *exp,
+			    struct obd_info *oinfo, struct obd_trans_info *oti,
+			    struct ptlrpc_request_set *rqset)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, punch);
+	EXP_COUNTER_INCREMENT(exp, punch);
+
+	rc = OBP(exp->exp_obd, punch)(env, exp, oinfo, oti, rqset);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_brw method of the device behind @exp.
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.  A @cmd with none of the
+ * OBD_BRW_RWMASK or OBD_BRW_CHECK bits set is treated as a fatal caller
+ * bug (LBUG).
+ */
+static inline int obd_brw(int cmd, struct obd_export *exp,
+			  struct obd_info *oinfo, obd_count oa_bufs,
+			  struct brw_page *pg, struct obd_trans_info *oti)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, brw);
+	EXP_COUNTER_INCREMENT(exp, brw);
+
+	if (!(cmd & (OBD_BRW_RWMASK | OBD_BRW_CHECK))) {
+		CERROR("obd_brw: cmd must be OBD_BRW_READ, OBD_BRW_WRITE, "
+		       "or OBD_BRW_CHECK\n");
+		LBUG();
+	}
+
+	rc = OBP(exp->exp_obd, brw)(cmd, exp, oinfo, oa_bufs, pg, oti);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_preprw method of the device behind @exp.
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.
+ */
+static inline int obd_preprw(const struct lu_env *env, int cmd,
+			     struct obd_export *exp, struct obdo *oa,
+			     int objcount, struct obd_ioobj *obj,
+			     struct niobuf_remote *remote, int *pages,
+			     struct niobuf_local *local,
+			     struct obd_trans_info *oti,
+			     struct lustre_capa *capa)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, preprw);
+	EXP_COUNTER_INCREMENT(exp, preprw);
+
+	rc = OBP(exp->exp_obd, preprw)(env, cmd, exp, oa, objcount, obj, remote,
+				       pages, local, oti, capa);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_commitrw method of the device behind @exp.
+ * The incoming @rc is forwarded to the method as its last argument and
+ * then replaced by the method's return value.
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.
+ */
+static inline int obd_commitrw(const struct lu_env *env, int cmd,
+			       struct obd_export *exp, struct obdo *oa,
+			       int objcount, struct obd_ioobj *obj,
+			       struct niobuf_remote *rnb, int pages,
+			       struct niobuf_local *local,
+			       struct obd_trans_info *oti, int rc)
+{
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, commitrw);
+	EXP_COUNTER_INCREMENT(exp, commitrw);
+
+	rc = OBP(exp->exp_obd, commitrw)(env, cmd, exp, oa, objcount, obj,
+					 rnb, pages, local, oti, rc);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_merge_lvb method of the device behind @exp.
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.
+ */
+static inline int obd_merge_lvb(struct obd_export *exp,
+				struct lov_stripe_md *lsm,
+				struct ost_lvb *lvb, int kms_only)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, merge_lvb);
+	EXP_COUNTER_INCREMENT(exp, merge_lvb);
+
+	rc = OBP(exp->exp_obd, merge_lvb)(exp, lsm, lvb, kms_only);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_adjust_kms method of the device behind @exp.
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.
+ */
+static inline int obd_adjust_kms(struct obd_export *exp,
+				 struct lov_stripe_md *lsm, obd_off size,
+				 int shrink)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, adjust_kms);
+	EXP_COUNTER_INCREMENT(exp, adjust_kms);
+
+	rc = OBP(exp->exp_obd, adjust_kms)(exp, lsm, size, shrink);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_iocontrol method of the device behind @exp.
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.
+ */
+static inline int obd_iocontrol(unsigned int cmd, struct obd_export *exp,
+				int len, void *karg, void *uarg)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, iocontrol);
+	EXP_COUNTER_INCREMENT(exp, iocontrol);
+
+	rc = OBP(exp->exp_obd, iocontrol)(cmd, exp, len, karg, uarg);
+	RETURN(rc);
+}
+
+/*
+ * Synchronous form of obd_enqueue(): creates a private request set, calls
+ * the device's o_enqueue method, waits on the set only if the method
+ * returned 0, and destroys the set in every case.
+ * Returns -ENOMEM if the set cannot be created.
+ */
+static inline int obd_enqueue_rqset(struct obd_export *exp,
+				    struct obd_info *oinfo,
+				    struct ldlm_enqueue_info *einfo)
+{
+	struct ptlrpc_request_set *set = NULL;
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, enqueue);
+	EXP_COUNTER_INCREMENT(exp, enqueue);
+
+	set =  ptlrpc_prep_set();
+	if (set == NULL)
+		RETURN(-ENOMEM);
+
+	rc = OBP(exp->exp_obd, enqueue)(exp, oinfo, einfo, set);
+	if (rc == 0)
+		rc = ptlrpc_set_wait(set);
+	ptlrpc_set_destroy(set);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_enqueue method of the device behind @exp, forwarding
+ * @set unchanged.
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.
+ */
+static inline int obd_enqueue(struct obd_export *exp,
+			      struct obd_info *oinfo,
+			      struct ldlm_enqueue_info *einfo,
+			      struct ptlrpc_request_set *set)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, enqueue);
+	EXP_COUNTER_INCREMENT(exp, enqueue);
+
+	rc = OBP(exp->exp_obd, enqueue)(exp, oinfo, einfo, set);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_change_cbdata method of the device behind @exp,
+ * passing the iterator @it and its @data through.
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.
+ */
+static inline int obd_change_cbdata(struct obd_export *exp,
+				    struct lov_stripe_md *lsm,
+				    ldlm_iterator_t it, void *data)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, change_cbdata);
+	EXP_COUNTER_INCREMENT(exp, change_cbdata);
+
+	rc = OBP(exp->exp_obd, change_cbdata)(exp, lsm, it, data);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_find_cbdata method of the device behind @exp,
+ * passing the iterator @it and its @data through.
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.
+ */
+static inline int obd_find_cbdata(struct obd_export *exp,
+				  struct lov_stripe_md *lsm,
+				  ldlm_iterator_t it, void *data)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, find_cbdata);
+	EXP_COUNTER_INCREMENT(exp, find_cbdata);
+
+	rc = OBP(exp->exp_obd, find_cbdata)(exp, lsm, it, data);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_cancel method of the device behind @exp.
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.
+ */
+static inline int obd_cancel(struct obd_export *exp,
+			     struct lov_stripe_md *ea, __u32 mode,
+			     struct lustre_handle *lockh)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, cancel);
+	EXP_COUNTER_INCREMENT(exp, cancel);
+
+	rc = OBP(exp->exp_obd, cancel)(exp, ea, mode, lockh);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_cancel_unused method of the device behind @exp.
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.
+ */
+static inline int obd_cancel_unused(struct obd_export *exp,
+				    struct lov_stripe_md *ea,
+				    ldlm_cancel_flags_t flags,
+				    void *opaque)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, cancel_unused);
+	EXP_COUNTER_INCREMENT(exp, cancel_unused);
+
+	rc = OBP(exp->exp_obd, cancel_unused)(exp, ea, flags, opaque);
+	RETURN(rc);
+}
+
+/*
+ * Dispatch to the o_pin method of the device behind @exp.
+ * EXP_CHECK_DT_OP() returns early with an error when the export, its
+ * device or the method is missing.
+ */
+static inline int obd_pin(struct obd_export *exp, const struct lu_fid *fid,
+			  struct obd_capa *oc, struct obd_client_handle *handle,
+			  int flag)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, pin);
+	EXP_COUNTER_INCREMENT(exp, pin);
+
+	rc = OBP(exp->exp_obd, pin)(exp, fid, oc, handle, flag);
+	RETURN(rc);
+}
+
+/* Dispatch to the unpin method of the export's device (OBP lookup) with the
+ * given client handle and flag; returns the method's result. */
+static inline int obd_unpin(struct obd_export *exp,
+			    struct obd_client_handle *handle, int flag)
+{
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, unpin);
+	EXP_COUNTER_INCREMENT(exp, unpin);
+
+	RETURN(OBP(exp->exp_obd, unpin)(exp, handle, flag));
+}
+
+
+/* Hand an import event to obd's import_event method.  Devices that are
+ * not set up, or that lack the method, silently ignore the event. */
+static inline void obd_import_event(struct obd_device *obd,
+				    struct obd_import *imp,
+				    enum obd_import_event event)
+{
+	ENTRY;
+	if (obd == NULL) {
+		CERROR("NULL device\n");
+	} else if (obd->obd_set_up && OBP(obd, import_event)) {
+		/* counter macro first, then the actual dispatch */
+		OBD_COUNTER_INCREMENT(obd, import_event);
+		OBP(obd, import_event)(obd, imp, event);
+	}
+	EXIT;
+}
+
+/* Dispatch to the llog_connect method of the export's device.  Note that
+ * this uses OBD_CHECK_DT_OP(..., 0) on the device, not EXP_CHECK_DT_OP. */
+static inline int obd_llog_connect(struct obd_export *exp,
+				   struct llogd_conn_body *body)
+{
+	ENTRY;
+
+	OBD_CHECK_DT_OP(exp->exp_obd, llog_connect, 0);
+	EXP_COUNTER_INCREMENT(exp, llog_connect);
+
+	RETURN(OBP(exp->exp_obd, llog_connect)(exp, body));
+}
+
+
+/* Deliver event 'ev' about 'watched' to obd's notify method.  Returns
+ * -EINVAL if obd is not ready, -ENOSYS without a handler, else its result. */
+static inline int obd_notify(struct obd_device *obd,
+			     struct obd_device *watched,
+			     enum obd_notify_event ev,
+			     void *data)
+{
+	ENTRY;
+	OBD_CHECK_DEV(obd);
+
+	/* the check for async_recov is a complete hack - it is overloaded
+	 * to also mean "this was called from mds_postsetup": by then the
+	 * mds can already handle notifies and needs them in order to
+	 * execute mds_postrecov. */
+	if (!(obd->obd_set_up || obd->obd_async_recov)) {
+		CDEBUG(D_HA, "obd %s not set up\n", obd->obd_name);
+		RETURN(-EINVAL);
+	}
+
+	if (OBP(obd, notify) == NULL) {
+		CDEBUG(D_HA, "obd %s has no notify handler\n", obd->obd_name);
+		RETURN(-ENOSYS);
+	}
+
+	OBD_COUNTER_INCREMENT(obd, notify);
+	RETURN(OBP(obd, notify)(obd, watched, ev, data));
+}
+
+/* Report event 'ev' on 'observed' first to observer->obd_observer (through
+ * obd_notify) and then to the non-obd upcall, when each is present.  The
+ * obd_notify status is returned if non-zero, otherwise the upcall's. */
+static inline int obd_notify_observer(struct obd_device *observer,
+				      struct obd_device *observed,
+				      enum obd_notify_event ev,
+				      void *data)
+{
+	struct obd_notify_upcall *upcall = &observer->obd_upcall;
+	int obs_rc = 0;
+	int upcall_rc = 0;
+
+	if (observer->obd_observer)
+		obs_rc = obd_notify(observer->obd_observer, observed, ev,
+				    data);
+
+	/* Also, call non-obd listener, if any */
+	if (upcall->onu_upcall != NULL)
+		upcall_rc = upcall->onu_upcall(observer, observed, ev,
+					       upcall->onu_owner, NULL);
+
+	/* both listeners always run; only the reported status is picked */
+	if (obs_rc)
+		return obs_rc;
+	return upcall_rc;
+}
+
+/* Dispatch to the quotacheck method of the export's device; the method is
+ * given the device, the export and oqctl.  Returns the method's result. */
+static inline int obd_quotacheck(struct obd_export *exp,
+				 struct obd_quotactl *oqctl)
+{
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, quotacheck);
+	EXP_COUNTER_INCREMENT(exp, quotacheck);
+
+	RETURN(OBP(exp->exp_obd, quotacheck)(exp->exp_obd, exp, oqctl));
+}
+
+/* Dispatch to the quotactl method of the export's device; the method is
+ * given the device, the export and oqctl.  Returns the method's result. */
+static inline int obd_quotactl(struct obd_export *exp,
+			       struct obd_quotactl *oqctl)
+{
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, quotactl);
+	EXP_COUNTER_INCREMENT(exp, quotactl);
+
+	RETURN(OBP(exp->exp_obd, quotactl)(exp->exp_obd, exp, oqctl));
+}
+
+static inline int obd_health_check(const struct lu_env *env,
+				   struct obd_device *obd)
+{
+	/* returns: 0 on healthy
+	 *	 >0 on unhealthy + reason code/flag
+	 *	    however the only supported reason == 1 right now
+	 *	    We'll need to define some better reasons
+	 *	    or flags in the future.
+	 *	 <0 on error
+	 */
+	ENTRY;
+
+	/* don't use EXP_CHECK_DT_OP, because NULL method is normal here */
+	if (obd == NULL || !OBT(obd)) {
+		CERROR("cleaned up obd\n");
+		RETURN(-EOPNOTSUPP);
+	}
+
+	/* A device that is not set up, is stopping, or has no
+	 * health_check method is reported as healthy. */
+	if (!obd->obd_set_up || obd->obd_stopping ||
+	    !OBP(obd, health_check))
+		RETURN(0);
+
+	RETURN(OBP(obd, health_check)(env, obd));
+}
+
+/* Install observer on obd under the write lock; replacing one is -EALREADY. */
+static inline int obd_register_observer(struct obd_device *obd,
+					struct obd_device *observer)
+{
+	int rc;
+	ENTRY;
+	OBD_CHECK_DEV(obd);
+	down_write(&obd->obd_observer_link_sem);
+	rc = (obd->obd_observer && observer) ? -EALREADY : 0;
+	if (rc == 0)
+		obd->obd_observer = observer;
+	up_write(&obd->obd_observer_link_sem);
+	RETURN(rc);
+}
+
+/* Fetch obd's observer; on success obd_observer_link_sem stays read-held. */
+static inline int obd_pin_observer(struct obd_device *obd,
+				   struct obd_device **observer)
+{
+	ENTRY;
+	down_read(&obd->obd_observer_link_sem);
+	*observer = obd->obd_observer;
+	if (*observer == NULL) {
+		up_read(&obd->obd_observer_link_sem);
+		RETURN(-ENOENT);
+	}
+	RETURN(0);
+}
+
+static inline int obd_unpin_observer(struct obd_device *obd)
+{
+	ENTRY;
+	up_read(&obd->obd_observer_link_sem);	/* undo obd_pin_observer() */
+	RETURN(0);
+}
+
+#if 0 /* compiled out: wrappers for page-removal / lock-cancel callback ops */
+static inline int obd_register_page_removal_cb(struct obd_export *exp,
+					       obd_page_removal_cb_t cb,
+					       obd_pin_extent_cb pin_cb)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(exp->exp_obd, register_page_removal_cb, 0);
+	OBD_COUNTER_INCREMENT(exp->exp_obd, register_page_removal_cb);
+
+	rc = OBP(exp->exp_obd, register_page_removal_cb)(exp, cb, pin_cb);
+	RETURN(rc);
+}
+
+static inline int obd_unregister_page_removal_cb(struct obd_export *exp,
+						 obd_page_removal_cb_t cb)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(exp->exp_obd, unregister_page_removal_cb, 0);
+	OBD_COUNTER_INCREMENT(exp->exp_obd, unregister_page_removal_cb);
+
+	rc = OBP(exp->exp_obd, unregister_page_removal_cb)(exp, cb);
+	RETURN(rc);
+}
+
+static inline int obd_register_lock_cancel_cb(struct obd_export *exp,
+					      obd_lock_cancel_cb cb)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(exp->exp_obd, register_lock_cancel_cb, 0);
+	OBD_COUNTER_INCREMENT(exp->exp_obd, register_lock_cancel_cb);
+
+	rc = OBP(exp->exp_obd, register_lock_cancel_cb)(exp, cb);
+	RETURN(rc);
+}
+
+static inline int obd_unregister_lock_cancel_cb(struct obd_export *exp,
+						 obd_lock_cancel_cb cb)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(exp->exp_obd, unregister_lock_cancel_cb, 0);
+	OBD_COUNTER_INCREMENT(exp->exp_obd, unregister_lock_cancel_cb);
+
+	rc = OBP(exp->exp_obd, unregister_lock_cancel_cb)(exp, cb);
+	RETURN(rc);
+}
+#endif /* 0 */
+
+/* metadata helpers */
+
+/* Dispatch to the getstatus MD method of exp's device; returns its result. */
+static inline int md_getstatus(struct obd_export *exp,
+			       struct lu_fid *fid, struct obd_capa **pc)
+{
+	ENTRY;
+
+	EXP_CHECK_MD_OP(exp, getstatus);
+	EXP_MD_COUNTER_INCREMENT(exp, getstatus);
+	RETURN(MDP(exp->exp_obd, getstatus)(exp, fid, pc));
+}
+
+/* Dispatch to the getattr MD method of exp's device (MDP lookup) with
+ * op_data and the request pointer; returns the method's result. */
+static inline int md_getattr(struct obd_export *exp, struct md_op_data *op_data,
+			     struct ptlrpc_request **request)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, getattr);
+	EXP_MD_COUNTER_INCREMENT(exp, getattr);
+	RETURN(MDP(exp->exp_obd, getattr)(exp, op_data, request));
+}
+
+/* Dispatch to the null_inode MD method of exp's device (MDP lookup) for
+ * the given fid; returns the method's result. */
+static inline int md_null_inode(struct obd_export *exp,
+				const struct lu_fid *fid)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, null_inode);
+	EXP_MD_COUNTER_INCREMENT(exp, null_inode);
+	RETURN(MDP(exp->exp_obd, null_inode)(exp, fid));
+}
+
+/* Dispatch to the find_cbdata MD method of exp's device (MDP lookup) with
+ * fid, iterator and data; returns the method's result. */
+static inline int md_find_cbdata(struct obd_export *exp,
+				 const struct lu_fid *fid,
+				 ldlm_iterator_t it, void *data)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, find_cbdata);
+	EXP_MD_COUNTER_INCREMENT(exp, find_cbdata);
+	RETURN(MDP(exp->exp_obd, find_cbdata)(exp, fid, it, data));
+}
+
+/* Dispatch to the close MD method of exp's device (MDP lookup) with
+ * op_data, mod and the request pointer; returns the method's result. */
+static inline int md_close(struct obd_export *exp, struct md_op_data *op_data,
+			   struct md_open_data *mod,
+			   struct ptlrpc_request **request)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, close);
+	EXP_MD_COUNTER_INCREMENT(exp, close);
+	RETURN(MDP(exp->exp_obd, close)(exp, op_data, mod, request));
+}
+
+/* Dispatch to the create MD method of exp's device (MDP lookup); every
+ * argument is forwarded unchanged.  Returns the method's result. */
+static inline int md_create(struct obd_export *exp, struct md_op_data *op_data,
+			    const void *data, int datalen, int mode, __u32 uid,
+			    __u32 gid, cfs_cap_t cap_effective, __u64 rdev,
+			    struct ptlrpc_request **request)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, create);
+	EXP_MD_COUNTER_INCREMENT(exp, create);
+	RETURN(MDP(exp->exp_obd, create)(exp, op_data, data, datalen, mode, uid,
+					 gid, cap_effective, rdev, request));
+}
+
+/* Dispatch to the done_writing MD method of exp's device (MDP lookup)
+ * with op_data and mod; returns the method's result. */
+static inline int md_done_writing(struct obd_export *exp,
+				  struct md_op_data *op_data,
+				  struct md_open_data *mod)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, done_writing);
+	EXP_MD_COUNTER_INCREMENT(exp, done_writing);
+	RETURN(MDP(exp->exp_obd, done_writing)(exp, op_data, mod));
+}
+
+/* Dispatch to the enqueue MD method of exp's device (MDP lookup); every
+ * argument is forwarded unchanged.  Returns the method's result. */
+static inline int md_enqueue(struct obd_export *exp,
+			     struct ldlm_enqueue_info *einfo,
+			     struct lookup_intent *it,
+			     struct md_op_data *op_data,
+			     struct lustre_handle *lockh,
+			     void *lmm, int lmmsize,
+			     struct ptlrpc_request **req,
+			     int extra_lock_flags)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, enqueue);
+	EXP_MD_COUNTER_INCREMENT(exp, enqueue);
+	RETURN(MDP(exp->exp_obd, enqueue)(exp, einfo, it, op_data, lockh,
+					  lmm, lmmsize, req, extra_lock_flags));
+}
+
+/* Dispatch to the getattr_name MD method of exp's device (MDP lookup)
+ * with op_data and the request pointer; returns the method's result. */
+static inline int md_getattr_name(struct obd_export *exp,
+				  struct md_op_data *op_data,
+				  struct ptlrpc_request **request)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, getattr_name);
+	EXP_MD_COUNTER_INCREMENT(exp, getattr_name);
+	RETURN(MDP(exp->exp_obd, getattr_name)(exp, op_data, request));
+}
+
+/* Dispatch to the intent_lock MD method of exp's device (MDP lookup);
+ * every argument is forwarded unchanged.  Returns the method's result. */
+static inline int md_intent_lock(struct obd_export *exp,
+				 struct md_op_data *op_data, void *lmm,
+				 int lmmsize, struct lookup_intent *it,
+				 int lookup_flags, struct ptlrpc_request **reqp,
+				 ldlm_blocking_callback cb_blocking,
+				 __u64 extra_lock_flags)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, intent_lock);
+	EXP_MD_COUNTER_INCREMENT(exp, intent_lock);
+	RETURN(MDP(exp->exp_obd, intent_lock)(exp, op_data, lmm, lmmsize,
+					      it, lookup_flags, reqp,
+					      cb_blocking, extra_lock_flags));
+}
+
+/* Dispatch to the link MD method of exp's device (MDP lookup) with
+ * op_data and the request pointer; returns the method's result. */
+static inline int md_link(struct obd_export *exp, struct md_op_data *op_data,
+			  struct ptlrpc_request **request)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, link);
+	EXP_MD_COUNTER_INCREMENT(exp, link);
+	RETURN(MDP(exp->exp_obd, link)(exp, op_data, request));
+}
+
+/* Dispatch to the rename MD method of exp's device (MDP lookup); both
+ * names and their lengths are forwarded as given.  Returns its result. */
+static inline int md_rename(struct obd_export *exp, struct md_op_data *op_data,
+			    const char *old, int oldlen, const char *new,
+			    int newlen, struct ptlrpc_request **request)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, rename);
+	EXP_MD_COUNTER_INCREMENT(exp, rename);
+	RETURN(MDP(exp->exp_obd, rename)(exp, op_data, old, oldlen, new,
+					 newlen, request));
+}
+
+/* Dispatch to the is_subdir MD method of exp's device (MDP lookup) with
+ * pfid, cfid and the request pointer; returns the method's result. */
+static inline int md_is_subdir(struct obd_export *exp,
+			       const struct lu_fid *pfid,
+			       const struct lu_fid *cfid,
+			       struct ptlrpc_request **request)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, is_subdir);
+	EXP_MD_COUNTER_INCREMENT(exp, is_subdir);
+	RETURN(MDP(exp->exp_obd, is_subdir)(exp, pfid, cfid, request));
+}
+
+/* Dispatch to the setattr MD method of exp's device (MDP lookup); both
+ * ea buffers, request and mod are forwarded as given.  Returns its result. */
+static inline int md_setattr(struct obd_export *exp, struct md_op_data *op_data,
+			     void *ea, int ealen, void *ea2, int ea2len,
+			     struct ptlrpc_request **request,
+			     struct md_open_data **mod)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, setattr);
+	EXP_MD_COUNTER_INCREMENT(exp, setattr);
+	RETURN(MDP(exp->exp_obd, setattr)(exp, op_data, ea, ealen,
+					  ea2, ea2len, request, mod));
+}
+
+/* Dispatch to the sync MD method of exp's device (MDP lookup) with fid,
+ * oc and the request pointer; returns the method's result. */
+static inline int md_sync(struct obd_export *exp, const struct lu_fid *fid,
+			  struct obd_capa *oc, struct ptlrpc_request **request)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, sync);
+	EXP_MD_COUNTER_INCREMENT(exp, sync);
+	RETURN(MDP(exp->exp_obd, sync)(exp, fid, oc, request));
+}
+
+/* Dispatch to the readpage MD method of exp's device (MDP lookup) with
+ * opdata, the page array and request pointer; returns its result. */
+static inline int md_readpage(struct obd_export *exp, struct md_op_data *opdata,
+			      struct page **pages,
+			      struct ptlrpc_request **request)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, readpage);
+	EXP_MD_COUNTER_INCREMENT(exp, readpage);
+	RETURN(MDP(exp->exp_obd, readpage)(exp, opdata, pages, request));
+}
+
+/* Dispatch to the unlink MD method of exp's device (MDP lookup) with
+ * op_data and the request pointer; returns the method's result. */
+static inline int md_unlink(struct obd_export *exp, struct md_op_data *op_data,
+			    struct ptlrpc_request **request)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, unlink);
+	EXP_MD_COUNTER_INCREMENT(exp, unlink);
+	RETURN(MDP(exp->exp_obd, unlink)(exp, op_data, request));
+}
+
+static inline int md_get_lustre_md(struct obd_export *exp,
+				   struct ptlrpc_request *req,
+				   struct obd_export *dt_exp,
+				   struct obd_export *md_exp,
+				   struct lustre_md *md)
+{	/* forwards every argument to the device's get_lustre_md MD method */
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, get_lustre_md);
+	EXP_MD_COUNTER_INCREMENT(exp, get_lustre_md);
+	RETURN(MDP(exp->exp_obd, get_lustre_md)(exp, req, dt_exp, md_exp, md));
+}
+
+static inline int md_free_lustre_md(struct obd_export *exp,
+				    struct lustre_md *md)
+{	/* forwards md to the device's free_lustre_md MD method */
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, free_lustre_md);
+	EXP_MD_COUNTER_INCREMENT(exp, free_lustre_md);
+	RETURN(MDP(exp->exp_obd, free_lustre_md)(exp, md));
+}
+
+static inline int md_setxattr(struct obd_export *exp,
+			      const struct lu_fid *fid, struct obd_capa *oc,
+			      obd_valid valid, const char *name,
+			      const char *input, int input_size,
+			      int output_size, int flags, __u32 suppgid,
+			      struct ptlrpc_request **request)
+{	/* forwards every argument to the device's setxattr MD method */
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, setxattr);
+	EXP_MD_COUNTER_INCREMENT(exp, setxattr);
+	RETURN(MDP(exp->exp_obd, setxattr)(exp, fid, oc, valid, name, input,
+					   input_size, output_size, flags,
+					   suppgid, request));
+}
+
+static inline int md_getxattr(struct obd_export *exp,
+			      const struct lu_fid *fid, struct obd_capa *oc,
+			      obd_valid valid, const char *name,
+			      const char *input, int input_size,
+			      int output_size, int flags,
+			      struct ptlrpc_request **request)
+{	/* forwards every argument to the device's getxattr MD method */
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, getxattr);
+	EXP_MD_COUNTER_INCREMENT(exp, getxattr);
+	RETURN(MDP(exp->exp_obd, getxattr)(exp, fid, oc, valid, name, input,
+					   input_size, output_size, flags,
+					   request));
+}
+
+static inline int md_set_open_replay_data(struct obd_export *exp,
+					  struct obd_client_handle *och,
+					  struct ptlrpc_request *open_req)
+{	/* forwards och and open_req to the set_open_replay_data MD method */
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, set_open_replay_data);
+	EXP_MD_COUNTER_INCREMENT(exp, set_open_replay_data);
+	RETURN(MDP(exp->exp_obd, set_open_replay_data)(exp, och, open_req));
+}
+
+static inline int md_clear_open_replay_data(struct obd_export *exp,
+					    struct obd_client_handle *och)
+{	/* forwards och to the device's clear_open_replay_data MD method */
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, clear_open_replay_data);
+	EXP_MD_COUNTER_INCREMENT(exp, clear_open_replay_data);
+	RETURN(MDP(exp->exp_obd, clear_open_replay_data)(exp, och));
+}
+
+static inline int md_set_lock_data(struct obd_export *exp,
+				   __u64 *lockh, void *data, __u64 *bits)
+{	/* forwards lockh, data and bits to the set_lock_data MD method */
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, set_lock_data);
+	EXP_MD_COUNTER_INCREMENT(exp, set_lock_data);
+	RETURN(MDP(exp->exp_obd, set_lock_data)(exp, lockh, data, bits));
+}
+
+/* Dispatch to the cancel_unused MD method of exp's device (MDP lookup);
+ * every argument is forwarded unchanged.  Returns the method's result. */
+static inline int md_cancel_unused(struct obd_export *exp,
+				   const struct lu_fid *fid,
+				   ldlm_policy_data_t *policy,
+				   ldlm_mode_t mode,
+				   ldlm_cancel_flags_t flags,
+				   void *opaque)
+{
+	ENTRY;
+
+	EXP_CHECK_MD_OP(exp, cancel_unused);
+	EXP_MD_COUNTER_INCREMENT(exp, cancel_unused);
+
+	RETURN(MDP(exp->exp_obd, cancel_unused)(exp, fid, policy, mode,
+						flags, opaque));
+}
+
+static inline ldlm_mode_t md_lock_match(struct obd_export *exp, __u64 flags,
+					const struct lu_fid *fid,
+					ldlm_type_t type,
+					ldlm_policy_data_t *policy,
+					ldlm_mode_t mode,
+					struct lustre_handle *lockh)
+{	/* returns whatever the device's lock_match MD method returns */
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, lock_match);
+	EXP_MD_COUNTER_INCREMENT(exp, lock_match);
+	RETURN(MDP(exp->exp_obd, lock_match)(exp, flags, fid, type,
+					     policy, mode, lockh));
+}
+
+static inline int md_init_ea_size(struct obd_export *exp, int easize,
+				  int def_asize, int cookiesize)
+{	/* forwards the three sizes to the device's init_ea_size MD method */
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, init_ea_size);
+	EXP_MD_COUNTER_INCREMENT(exp, init_ea_size);
+	RETURN(MDP(exp->exp_obd, init_ea_size)(exp, easize, def_asize,
+					       cookiesize));
+}
+
+static inline int md_get_remote_perm(struct obd_export *exp,
+				     const struct lu_fid *fid,
+				     struct obd_capa *oc, __u32 suppgid,
+				     struct ptlrpc_request **request)
+{	/* forwards every argument to the get_remote_perm MD method */
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, get_remote_perm);
+	EXP_MD_COUNTER_INCREMENT(exp, get_remote_perm);
+	RETURN(MDP(exp->exp_obd, get_remote_perm)(exp, fid, oc, suppgid,
+						  request));
+}
+
+/* Dispatch to the renew_capa MD method of exp's device (MDP lookup) with
+ * ocapa and the callback cb; returns the method's result. */
+static inline int md_renew_capa(struct obd_export *exp, struct obd_capa *ocapa,
+				renew_capa_cb_t cb)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, renew_capa);
+	EXP_MD_COUNTER_INCREMENT(exp, renew_capa);
+	RETURN(MDP(exp->exp_obd, renew_capa)(exp, ocapa, cb));
+}
+
+/* Dispatch to the unpack_capa MD method of exp's device (MDP lookup) with
+ * req, field and the oc out-pointer; returns the method's result. */
+static inline int md_unpack_capa(struct obd_export *exp,
+				 struct ptlrpc_request *req,
+				 const struct req_msg_field *field,
+				 struct obd_capa **oc)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, unpack_capa);
+	EXP_MD_COUNTER_INCREMENT(exp, unpack_capa);
+	RETURN(MDP(exp->exp_obd, unpack_capa)(exp, req, field, oc));
+}
+
+/* Dispatch to the intent_getattr_async MD method of exp's device (MDP
+ * lookup) with minfo and einfo; returns the method's result. */
+static inline int md_intent_getattr_async(struct obd_export *exp,
+					  struct md_enqueue_info *minfo,
+					  struct ldlm_enqueue_info *einfo)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, intent_getattr_async);
+	EXP_MD_COUNTER_INCREMENT(exp, intent_getattr_async);
+	RETURN(MDP(exp->exp_obd, intent_getattr_async)(exp, minfo, einfo));
+}
+
+/* Dispatch to the revalidate_lock MD method of exp's device (MDP lookup)
+ * with it, fid and bits; returns the method's result. */
+static inline int md_revalidate_lock(struct obd_export *exp,
+				     struct lookup_intent *it,
+				     struct lu_fid *fid, __u64 *bits)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, revalidate_lock);
+	EXP_MD_COUNTER_INCREMENT(exp, revalidate_lock);
+	RETURN(MDP(exp->exp_obd, revalidate_lock)(exp, it, fid, bits));
+}
+
+
+/* OBD Metadata Support */
+
+extern int obd_init_caches(void);
+extern void obd_cleanup_caches(void);
+
+/* support routines */
+extern struct kmem_cache *obdo_cachep;
+
+#define OBDO_ALLOC(ptr)	/* OBD_SLAB_ALLOC_PTR_GFP from obdo_cachep */	\
+do {									\
+	OBD_SLAB_ALLOC_PTR_GFP((ptr), obdo_cachep, __GFP_IO);		\
+} while (0)	/* NOTE(review): bare __GFP_IO mask - confirm it is intended */
+
+#define OBDO_FREE(ptr)	/* OBD_SLAB_FREE_PTR back into obdo_cachep */	\
+do {									\
+	OBD_SLAB_FREE_PTR((ptr), obdo_cachep);				\
+} while (0)
+
+
+static inline void obdo2fid(struct obdo *oa, struct lu_fid *fid)
+{
+	/* placeholder: currently a no-op, neither oa nor fid is accessed */
+}
+
+static inline void fid2obdo(struct lu_fid *fid, struct obdo *oa)
+{
+	/* placeholder: currently a no-op, neither fid nor oa is accessed */
+}
+
+typedef int (*register_lwp_cb)(void *data);	/* callback: int fn(void *) */
+
+struct lwp_register_item {
+	struct obd_export	**lri_exp;
+	register_lwp_cb		  lri_cb_func;
+	void			 *lri_cb_data;	/* presumably lri_cb_func's arg */
+	struct list_head	  lri_list;
+	char			  lri_name[MTI_NAME_MAXLEN];
+};
+
+/* I'm as embarrassed about this as you are.
+ *
+ * <shaver> // XXX do not look into _superhack with remaining eye
+ * <shaver> // XXX if this were any uglier, I'd get my own show on MTV */
+extern int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c);
+
+/* obd_mount.c */
+
+/* sysctl.c */
+extern void obd_sysctl_init(void);
+extern void obd_sysctl_clean(void);
+
+/* uuid.c  */
+typedef __u8 class_uuid_t[16];
+void class_uuid_unparse(class_uuid_t in, struct obd_uuid *out);
+
+/* lustre_peer.c    */
+int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index);
+int class_add_uuid(const char *uuid, __u64 nid);
+int class_del_uuid(const char *uuid);
+int class_check_uuid(struct obd_uuid *uuid, __u64 nid);
+void class_init_uuidlist(void);
+void class_exit_uuidlist(void);
+
+/* mea.c */
+int mea_name2idx(struct lmv_stripe_md *mea, const char *name, int namelen);
+int raw_name2idx(int hashtype, int count, const char *name, int namelen);
+
+/* prng.c: passes uuid_out and sizeof(class_uuid_t) to cfs_get_random_bytes */
+#define ll_generate_random_uuid(uuid_out) cfs_get_random_bytes(uuid_out, sizeof(class_uuid_t))
+
+#endif /* __LINUX_OBD_CLASS_H */
diff --git a/drivers/staging/lustre/lustre/include/obd_lov.h b/drivers/staging/lustre/lustre/include/obd_lov.h
new file mode 100644
index 000000000000..d82f3341d0a8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd_lov.h
@@ -0,0 +1,126 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _OBD_LOV_H__
+#define _OBD_LOV_H__
+
+#define LOV_DEFAULT_STRIPE_SIZE (1 << LNET_MTU_BITS) /* 2^LNET_MTU_BITS */
+
+static inline int lov_stripe_md_size(__u16 stripes)
+{	/* struct lov_stripe_md plus one lov_oinfo pointer per stripe */
+	return sizeof(struct lov_stripe_md) + stripes*sizeof(struct lov_oinfo*);
+}
+
+static inline __u32 lov_mds_md_size(__u16 stripes, __u32 lmm_magic)
+{
+	size_t hdr_size;
+
+	/* any magic other than LOV_MAGIC_V3 is sized with the v1 header */
+	hdr_size = lmm_magic == LOV_MAGIC_V3 ? sizeof(struct lov_mds_md_v3) :
+					       sizeof(struct lov_mds_md_v1);
+	return hdr_size + stripes * sizeof(struct lov_ost_data_v1);
+}
+
+struct lov_version_size {
+	__u32   lvs_magic;	/* LOV magic this entry describes */
+	size_t  lvs_lmm_size;	/* sizeof the lov_mds_md header for that magic */
+	size_t  lvs_lod_size;	/* sizeof one per-stripe lov_ost_data entry */
+};
+
+static inline __u32 lov_mds_md_stripecnt(int ea_size, __u32 lmm_magic)
+{
+	static const struct lov_version_size lmm_ver_size[] = {
+			{ .lvs_magic = LOV_MAGIC_V3,
+			  .lvs_lmm_size = sizeof(struct lov_mds_md_v3),
+			  .lvs_lod_size = sizeof(struct lov_ost_data_v1) },
+			{ .lvs_magic = LOV_MAGIC_V1,
+			  .lvs_lmm_size = sizeof(struct lov_mds_md_v1),
+			  .lvs_lod_size = sizeof(struct lov_ost_data_v1)} };
+	int i;
+
+	/* Count the per-stripe entries that fit after the header in ea_size. */
+	for (i = 0; i < ARRAY_SIZE(lmm_ver_size); i++) {
+		if (lmm_magic != lmm_ver_size[i].lvs_magic)
+			continue;
+		/* compare as signed int so a negative ea_size yields 0 */
+		if (ea_size <= (int)lmm_ver_size[i].lvs_lmm_size)
+			return 0;
+		return (ea_size - lmm_ver_size[i].lvs_lmm_size) /
+			lmm_ver_size[i].lvs_lod_size;
+	}
+	return 0;	/* Invalid LOV magic, so no stripes could fit */
+}
+
+/* lov_do_div64(n, base) evaluates to n % base and stores n / base in n.
+ * The 32-bit code is LOV-specific: a divisor with bits set above bit 31
+ * must be a multiple of LOV_MIN_STRIPE_SIZE (asserted), so n and base are
+ * first shifted right by LOV_MIN_STRIPE_BITS; otherwise do_div() is used. */
+#if BITS_PER_LONG > 32 /* plain 64-bit % and / */
+# define lov_do_div64(n,base) ({					\
+	uint64_t __base = (base);					\
+	uint64_t __rem;							\
+	__rem = ((uint64_t)(n)) % __base;				\
+	(n) = ((uint64_t)(n)) / __base;					\
+	__rem;								\
+  })
+#else /* note: this variant expands n and base more than once */
+# define lov_do_div64(n,base) ({					\
+	uint64_t __rem;							\
+	if ((sizeof(base) > 4) && (((base) & 0xffffffff00000000ULL) != 0)) {  \
+		int __remainder;					      \
+		LASSERTF(!((base) & (LOV_MIN_STRIPE_SIZE - 1)), "64 bit lov " \
+			 "division %llu / %llu\n", (n), (uint64_t)(base));    \
+		__remainder = (n) & (LOV_MIN_STRIPE_SIZE - 1);		\
+		(n) >>= LOV_MIN_STRIPE_BITS;				\
+		__rem = do_div(n, (base) >> LOV_MIN_STRIPE_BITS);	\
+		__rem <<= LOV_MIN_STRIPE_BITS;				\
+		__rem += __remainder;					\
+	} else {							\
+		__rem = do_div(n, base);				\
+	}								\
+	__rem;								\
+  })
+#endif /* BITS_PER_LONG > 32 */
+
+#define IOC_LOV_TYPE		'g'
+#define IOC_LOV_MIN_NR		50
+#define IOC_LOV_SET_OSC_ACTIVE	_IOWR(IOC_LOV_TYPE, 50, long)
+#define IOC_LOV_MAX_NR		50
+
+#define QOS_DEFAULT_THRESHOLD	10	/* MB */
+#define QOS_DEFAULT_MAXAGE	5	/* Seconds */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/obd_ost.h b/drivers/staging/lustre/lustre/include/obd_ost.h
new file mode 100644
index 000000000000..af89843c312b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd_ost.h
@@ -0,0 +1,96 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/obd_ost.h
+ *
+ * Data structures for object storage targets and client: OST & OSC's
+ *
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_OST_H
+#define _LUSTRE_OST_H
+
+#include <obd_class.h>
+
+struct osc_brw_async_args {	/* NOTE(review): presumably async brw state */
+	struct obdo		 *aa_oa;
+	int			  aa_requested_nob;
+	int			  aa_nio_count;
+	obd_count		  aa_page_count;
+	int			  aa_resends;
+	struct brw_page		**aa_ppga;
+	struct client_obd	 *aa_cli;
+	struct list_head	  aa_oaps;
+	struct list_head	  aa_exts;
+	struct obd_capa		 *aa_ocapa;
+	struct cl_req		 *aa_clerq;
+};
+
+#define osc_grant_args osc_brw_async_args	/* alias for the same struct tag */
+struct osc_async_args {
+	struct obd_info	*aa_oi;
+};
+
+struct osc_setattr_args {
+	struct obdo		*sa_oa;
+	obd_enqueue_update_f	 sa_upcall;
+	void			*sa_cookie;	/* presumably sa_upcall's arg */
+};
+
+struct osc_fsync_args {
+	struct obd_info		*fa_oi;
+	obd_enqueue_update_f	 fa_upcall;
+	void			*fa_cookie;	/* presumably fa_upcall's arg */
+};
+
+struct osc_enqueue_args {
+	struct obd_export		*oa_exp;
+	__u64				*oa_flags;
+	obd_enqueue_update_f		 oa_upcall;
+	void				*oa_cookie;
+	struct ost_lvb			*oa_lvb;
+	struct lustre_handle		*oa_lockh;
+	struct ldlm_enqueue_info	*oa_ei;
+	unsigned int			 oa_agl:1;	/* single-bit flag */
+};
+
+#if 0
+int osc_extent_blocking_cb(struct ldlm_lock *lock,
+			   struct ldlm_lock_desc *new, void *data,
+			   int flag);
+#endif
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/obd_support.h b/drivers/staging/lustre/lustre/include/obd_support.h
new file mode 100644
index 000000000000..5f2b4e88f78f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd_support.h
@@ -0,0 +1,853 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _OBD_SUPPORT
+#define _OBD_SUPPORT
+
+#include <linux/libcfs/libcfs.h>
+#include <lvfs.h>
+#include <lprocfs_status.h>
+
+#include <linux/obd_support.h>
+
+/* global variables */
+extern struct lprocfs_stats *obd_memory; /* counters indexed by the enum below */
+enum {
+	OBD_MEMORY_STAT = 0,	/* index used by obd_memory_add()/_sub() under LPROCFS */
+	OBD_MEMORY_PAGES_STAT = 1, /* index used by obd_pages_add()/_sub() under LPROCFS */
+	OBD_STATS_NUM,		/* value 2: follows the last index above */
+};
+
+extern unsigned int obd_debug_peer_on_timeout;
+extern unsigned int obd_dump_on_timeout;
+extern unsigned int obd_dump_on_eviction;
+/* obd_timeout should only be used for recovery, not for networking, disk or
+ * other timings affected by load (use Adaptive Timeouts for those) */
+extern unsigned int obd_timeout;	  /* seconds */
+extern unsigned int ldlm_timeout;	 /* seconds */
+extern unsigned int obd_timeout_set;
+extern unsigned int ldlm_timeout_set;
+extern unsigned int at_min;
+extern unsigned int at_max;
+extern unsigned int at_history;
+extern int at_early_margin;
+extern int at_extra;
+extern unsigned int obd_sync_filter;
+extern unsigned int obd_max_dirty_pages;
+extern atomic_t obd_unstable_pages;
+extern atomic_t obd_dirty_pages;
+extern atomic_t obd_dirty_transit_pages;
+extern unsigned int obd_alloc_fail_rate; /* 0: alloc fault injection is skipped */
+extern char obd_jobid_var[];
+
+/* lvfs.c: allocation fault injection; nonzero return => the alloc macros discard ptr */
+int obd_alloc_fail(const void *ptr, const char *name, const char *type,
+		   size_t size, const char *file, int line);
+
+/* Some hash init argument constants */
+#define HASH_POOLS_BKT_BITS 3
+#define HASH_POOLS_CUR_BITS 3
+#define HASH_POOLS_MAX_BITS 7
+#define HASH_UUID_BKT_BITS 5
+#define HASH_UUID_CUR_BITS 7
+#define HASH_UUID_MAX_BITS 12
+#define HASH_NID_BKT_BITS 5
+#define HASH_NID_CUR_BITS 7
+#define HASH_NID_MAX_BITS 12
+#define HASH_NID_STATS_BKT_BITS 5
+#define HASH_NID_STATS_CUR_BITS 7
+#define HASH_NID_STATS_MAX_BITS 12
+#define HASH_LQE_BKT_BITS 5
+#define HASH_LQE_CUR_BITS 7
+#define HASH_LQE_MAX_BITS 12
+#define HASH_CONN_BKT_BITS 5
+#define HASH_CONN_CUR_BITS 5
+#define HASH_CONN_MAX_BITS 15
+#define HASH_EXP_LOCK_BKT_BITS  5
+#define HASH_EXP_LOCK_CUR_BITS  7
+#define HASH_EXP_LOCK_MAX_BITS  16
+#define HASH_CL_ENV_BKT_BITS    5
+#define HASH_CL_ENV_BITS	10
+#define HASH_JOB_STATS_BKT_BITS 5
+#define HASH_JOB_STATS_CUR_BITS 7
+#define HASH_JOB_STATS_MAX_BITS 12
+
+/* Timeout definitions */
+#define OBD_TIMEOUT_DEFAULT	     100
+#define LDLM_TIMEOUT_DEFAULT	    20
+#define MDS_LDLM_TIMEOUT_DEFAULT	6
+/* Time to wait for all clients to reconnect during recovery (hard limit) */
+#define OBD_RECOVERY_TIME_HARD	  (obd_timeout * 9)
+/* Time to wait for all clients to reconnect during recovery (soft limit) */
+/* Should be very conservative; must catch the first reconnect after reboot */
+#define OBD_RECOVERY_TIME_SOFT	  (obd_timeout * 3)
+/* Change recovery-small 26b time if you change this */
+#define PING_INTERVAL max(obd_timeout / 4, 1U)
+/* a bit more than maximal journal commit time in seconds */
+#define PING_INTERVAL_SHORT min(PING_INTERVAL, 7U)
+/* Client may skip 1 ping; we must wait at least 2.5. But for multiple
+ * failover targets the client only pings one server at a time, and pings
+ * can be lost on a loaded network. Since eviction has serious consequences,
+ * and there's no urgent need to evict a client just because it's idle, we
+ * should be very conservative here. */
+#define PING_EVICT_TIMEOUT (PING_INTERVAL * 6)
+#define DISK_TIMEOUT 50	  /* Beyond this we warn about disk speed */
+#define CONNECTION_SWITCH_MIN 5U /* Connection switching rate limiter */
+/* Max connect interval for nonresponsive servers; ~50s to avoid building up
+ * connect requests in the LND queues, but within obd_timeout so we don't
+ * miss the recovery window */
+#define CONNECTION_SWITCH_MAX min(50U, max(CONNECTION_SWITCH_MIN,obd_timeout))
+#define CONNECTION_SWITCH_INC 5  /* Connection timeout backoff */
+/* In general this should be low to have quick detection of a system
+   running on a backup server. (If it's too low, import_select_connection
+   will increase the timeout anyhow.)  */
+#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN,obd_timeout/20)
+/* The max delay between connects is SWITCH_MAX + SWITCH_INC + INITIAL */
+#define RECONNECT_DELAY_MAX (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC + \
+			     INITIAL_CONNECT_TIMEOUT)
+/* The min time a target should wait for clients to reconnect in recovery */
+#define OBD_RECOVERY_TIME_MIN    (2*RECONNECT_DELAY_MAX)
+#define OBD_IR_FACTOR_MIN	 1
+#define OBD_IR_FACTOR_MAX	 10
+#define OBD_IR_FACTOR_DEFAULT    (OBD_IR_FACTOR_MAX/2)
+/* default timeout for the MGS to become IR_FULL */
+#define OBD_IR_MGS_TIMEOUT       (4*obd_timeout)
+#define LONG_UNLINK 300	  /* Unlink should happen before now */
+
+/**
+ * Grant shrink interval: if the client has been "idle" for longer than this,
+ * the ll_grant thread returns the requested grant space to the filter.
+ */
+#define GRANT_SHRINK_INTERVAL	    1200/*20 minutes*/
+
+#define OBD_FAIL_MDS		     0x100
+#define OBD_FAIL_MDS_HANDLE_UNPACK       0x101
+#define OBD_FAIL_MDS_GETATTR_NET	 0x102
+#define OBD_FAIL_MDS_GETATTR_PACK	0x103
+#define OBD_FAIL_MDS_READPAGE_NET	0x104
+#define OBD_FAIL_MDS_READPAGE_PACK       0x105
+#define OBD_FAIL_MDS_SENDPAGE	    0x106
+#define OBD_FAIL_MDS_REINT_NET	   0x107
+#define OBD_FAIL_MDS_REINT_UNPACK	0x108
+#define OBD_FAIL_MDS_REINT_SETATTR       0x109
+#define OBD_FAIL_MDS_REINT_SETATTR_WRITE 0x10a
+#define OBD_FAIL_MDS_REINT_CREATE	0x10b
+#define OBD_FAIL_MDS_REINT_CREATE_WRITE  0x10c
+#define OBD_FAIL_MDS_REINT_UNLINK	0x10d
+#define OBD_FAIL_MDS_REINT_UNLINK_WRITE  0x10e
+#define OBD_FAIL_MDS_REINT_LINK	  0x10f
+#define OBD_FAIL_MDS_REINT_LINK_WRITE    0x110
+#define OBD_FAIL_MDS_REINT_RENAME	0x111
+#define OBD_FAIL_MDS_REINT_RENAME_WRITE  0x112
+#define OBD_FAIL_MDS_OPEN_NET	    0x113
+#define OBD_FAIL_MDS_OPEN_PACK	   0x114
+#define OBD_FAIL_MDS_CLOSE_NET	   0x115
+#define OBD_FAIL_MDS_CLOSE_PACK	  0x116
+#define OBD_FAIL_MDS_CONNECT_NET	 0x117
+#define OBD_FAIL_MDS_CONNECT_PACK	0x118
+#define OBD_FAIL_MDS_REINT_NET_REP       0x119
+#define OBD_FAIL_MDS_DISCONNECT_NET      0x11a
+#define OBD_FAIL_MDS_GETSTATUS_NET       0x11b
+#define OBD_FAIL_MDS_GETSTATUS_PACK      0x11c
+#define OBD_FAIL_MDS_STATFS_PACK	 0x11d
+#define OBD_FAIL_MDS_STATFS_NET	  0x11e
+#define OBD_FAIL_MDS_GETATTR_NAME_NET    0x11f
+#define OBD_FAIL_MDS_PIN_NET	     0x120
+#define OBD_FAIL_MDS_UNPIN_NET	   0x121
+#define OBD_FAIL_MDS_ALL_REPLY_NET       0x122
+#define OBD_FAIL_MDS_ALL_REQUEST_NET     0x123
+#define OBD_FAIL_MDS_SYNC_NET	    0x124
+#define OBD_FAIL_MDS_SYNC_PACK	   0x125
+#define OBD_FAIL_MDS_DONE_WRITING_NET    0x126
+#define OBD_FAIL_MDS_DONE_WRITING_PACK   0x127
+#define OBD_FAIL_MDS_ALLOC_OBDO	  0x128
+#define OBD_FAIL_MDS_PAUSE_OPEN	  0x129
+#define OBD_FAIL_MDS_STATFS_LCW_SLEEP    0x12a
+#define OBD_FAIL_MDS_OPEN_CREATE	 0x12b
+#define OBD_FAIL_MDS_OST_SETATTR	 0x12c
+#define OBD_FAIL_MDS_QUOTACHECK_NET      0x12d
+#define OBD_FAIL_MDS_QUOTACTL_NET	0x12e
+#define OBD_FAIL_MDS_CLIENT_ADD	  0x12f
+#define OBD_FAIL_MDS_GETXATTR_NET	0x130
+#define OBD_FAIL_MDS_GETXATTR_PACK       0x131
+#define OBD_FAIL_MDS_SETXATTR_NET	0x132
+#define OBD_FAIL_MDS_SETXATTR	    0x133
+#define OBD_FAIL_MDS_SETXATTR_WRITE      0x134
+#define OBD_FAIL_MDS_FS_SETUP	    0x135
+#define OBD_FAIL_MDS_RESEND	      0x136
+#define OBD_FAIL_MDS_LLOG_CREATE_FAILED  0x137
+#define OBD_FAIL_MDS_LOV_SYNC_RACE       0x138
+#define OBD_FAIL_MDS_OSC_PRECREATE       0x139
+#define OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT   0x13a
+#define OBD_FAIL_MDS_CLOSE_NET_REP       0x13b
+#define OBD_FAIL_MDS_BLOCK_QUOTA_REQ     0x13c
+#define OBD_FAIL_MDS_DROP_QUOTA_REQ      0x13d
+#define OBD_FAIL_MDS_REMOVE_COMMON_EA    0x13e
+#define OBD_FAIL_MDS_ALLOW_COMMON_EA_SETTING   0x13f
+#define OBD_FAIL_MDS_FAIL_LOV_LOG_ADD    0x140
+#define OBD_FAIL_MDS_LOV_PREP_CREATE     0x141
+#define OBD_FAIL_MDS_REINT_DELAY	 0x142
+#define OBD_FAIL_MDS_READLINK_EPROTO     0x143
+#define OBD_FAIL_MDS_OPEN_WAIT_CREATE    0x144
+#define OBD_FAIL_MDS_PDO_LOCK	    0x145
+#define OBD_FAIL_MDS_PDO_LOCK2	   0x146
+#define OBD_FAIL_MDS_OSC_CREATE_FAIL     0x147
+#define OBD_FAIL_MDS_NEGATIVE_POSITIVE	 0x148
+#define OBD_FAIL_MDS_HSM_STATE_GET_NET		0x149
+#define OBD_FAIL_MDS_HSM_STATE_SET_NET		0x14a
+#define OBD_FAIL_MDS_HSM_PROGRESS_NET		0x14b
+#define OBD_FAIL_MDS_HSM_REQUEST_NET		0x14c
+#define OBD_FAIL_MDS_HSM_CT_REGISTER_NET	0x14d
+#define OBD_FAIL_MDS_HSM_CT_UNREGISTER_NET	0x14e
+#define OBD_FAIL_MDS_SWAP_LAYOUTS_NET		0x14f
+#define OBD_FAIL_MDS_HSM_ACTION_NET		0x150
+#define OBD_FAIL_MDS_CHANGELOG_INIT		0x151
+
+/* layout lock */
+#define OBD_FAIL_MDS_NO_LL_GETATTR	 0x170
+#define OBD_FAIL_MDS_NO_LL_OPEN		 0x171
+#define OBD_FAIL_MDS_LL_BLOCK		 0x172
+
+/* CMD */
+#define OBD_FAIL_MDS_IS_SUBDIR_NET       0x180
+#define OBD_FAIL_MDS_IS_SUBDIR_PACK      0x181
+#define OBD_FAIL_MDS_SET_INFO_NET	0x182
+#define OBD_FAIL_MDS_WRITEPAGE_NET       0x183
+#define OBD_FAIL_MDS_WRITEPAGE_PACK      0x184
+#define OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS 0x185
+#define OBD_FAIL_MDS_GET_INFO_NET	0x186
+#define OBD_FAIL_MDS_DQACQ_NET	   0x187
+
+/* OI scrub */
+#define OBD_FAIL_OSD_SCRUB_DELAY			0x190
+#define OBD_FAIL_OSD_SCRUB_CRASH			0x191
+#define OBD_FAIL_OSD_SCRUB_FATAL			0x192
+#define OBD_FAIL_OSD_FID_MAPPING			0x193
+#define OBD_FAIL_OSD_LMA_INCOMPAT			0x194
+
+#define OBD_FAIL_OST		     0x200
+#define OBD_FAIL_OST_CONNECT_NET	 0x201
+#define OBD_FAIL_OST_DISCONNECT_NET      0x202
+#define OBD_FAIL_OST_GET_INFO_NET	0x203
+#define OBD_FAIL_OST_CREATE_NET	  0x204
+#define OBD_FAIL_OST_DESTROY_NET	 0x205
+#define OBD_FAIL_OST_GETATTR_NET	 0x206
+#define OBD_FAIL_OST_SETATTR_NET	 0x207
+#define OBD_FAIL_OST_OPEN_NET	    0x208
+#define OBD_FAIL_OST_CLOSE_NET	   0x209
+#define OBD_FAIL_OST_BRW_NET	     0x20a
+#define OBD_FAIL_OST_PUNCH_NET	   0x20b
+#define OBD_FAIL_OST_STATFS_NET	  0x20c
+#define OBD_FAIL_OST_HANDLE_UNPACK       0x20d
+#define OBD_FAIL_OST_BRW_WRITE_BULK      0x20e
+#define OBD_FAIL_OST_BRW_READ_BULK       0x20f
+#define OBD_FAIL_OST_SYNC_NET	    0x210
+#define OBD_FAIL_OST_ALL_REPLY_NET       0x211
+#define OBD_FAIL_OST_ALL_REQUEST_NET     0x212
+#define OBD_FAIL_OST_LDLM_REPLY_NET      0x213
+#define OBD_FAIL_OST_BRW_PAUSE_BULK      0x214
+#define OBD_FAIL_OST_ENOSPC	      0x215
+#define OBD_FAIL_OST_EROFS	       0x216
+#define OBD_FAIL_OST_ENOENT	      0x217
+#define OBD_FAIL_OST_QUOTACHECK_NET      0x218
+#define OBD_FAIL_OST_QUOTACTL_NET	0x219
+#define OBD_FAIL_OST_CHECKSUM_RECEIVE    0x21a
+#define OBD_FAIL_OST_CHECKSUM_SEND       0x21b
+#define OBD_FAIL_OST_BRW_SIZE	    0x21c
+#define OBD_FAIL_OST_DROP_REQ	    0x21d
+#define OBD_FAIL_OST_SETATTR_CREDITS     0x21e
+#define OBD_FAIL_OST_HOLD_WRITE_RPC      0x21f
+#define OBD_FAIL_OST_BRW_WRITE_BULK2     0x220
+#define OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221
+#define OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222
+#define OBD_FAIL_OST_PAUSE_CREATE	0x223
+#define OBD_FAIL_OST_BRW_PAUSE_PACK      0x224
+#define OBD_FAIL_OST_CONNECT_NET2	0x225
+#define OBD_FAIL_OST_NOMEM	       0x226
+#define OBD_FAIL_OST_BRW_PAUSE_BULK2     0x227
+#define OBD_FAIL_OST_MAPBLK_ENOSPC       0x228
+#define OBD_FAIL_OST_ENOINO	      0x229
+#define OBD_FAIL_OST_DQACQ_NET	   0x230
+#define OBD_FAIL_OST_STATFS_EINPROGRESS  0x231
+
+#define OBD_FAIL_LDLM		    0x300
+#define OBD_FAIL_LDLM_NAMESPACE_NEW      0x301
+#define OBD_FAIL_LDLM_ENQUEUE_NET			0x302
+#define OBD_FAIL_LDLM_CONVERT_NET			0x303
+#define OBD_FAIL_LDLM_CANCEL_NET			0x304
+#define OBD_FAIL_LDLM_BL_CALLBACK_NET			0x305
+#define OBD_FAIL_LDLM_CP_CALLBACK_NET			0x306
+#define OBD_FAIL_LDLM_GL_CALLBACK_NET			0x307
+#define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308
+#define OBD_FAIL_LDLM_ENQUEUE_INTENT_ERR 0x309
+#define OBD_FAIL_LDLM_CREATE_RESOURCE    0x30a
+#define OBD_FAIL_LDLM_ENQUEUE_BLOCKED    0x30b
+#define OBD_FAIL_LDLM_REPLY	      0x30c
+#define OBD_FAIL_LDLM_RECOV_CLIENTS      0x30d
+#define OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT 0x30e
+#define OBD_FAIL_LDLM_GLIMPSE	    0x30f
+#define OBD_FAIL_LDLM_CANCEL_RACE	0x310
+#define OBD_FAIL_LDLM_CANCEL_EVICT_RACE  0x311
+#define OBD_FAIL_LDLM_PAUSE_CANCEL       0x312
+#define OBD_FAIL_LDLM_CLOSE_THREAD       0x313
+#define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE  0x314
+#define OBD_FAIL_LDLM_CP_CB_WAIT	 0x315
+#define OBD_FAIL_LDLM_OST_FAIL_RACE      0x316
+#define OBD_FAIL_LDLM_INTR_CP_AST	0x317
+#define OBD_FAIL_LDLM_CP_BL_RACE	 0x318
+#define OBD_FAIL_LDLM_NEW_LOCK	   0x319
+#define OBD_FAIL_LDLM_AGL_DELAY	  0x31a
+#define OBD_FAIL_LDLM_AGL_NOLOCK	 0x31b
+#define OBD_FAIL_LDLM_OST_LVB		 0x31c
+
+/* LOCKLESS IO */
+#define OBD_FAIL_LDLM_SET_CONTENTION     0x385
+
+#define OBD_FAIL_OSC		     0x400
+#define OBD_FAIL_OSC_BRW_READ_BULK       0x401
+#define OBD_FAIL_OSC_BRW_WRITE_BULK      0x402
+#define OBD_FAIL_OSC_LOCK_BL_AST	 0x403
+#define OBD_FAIL_OSC_LOCK_CP_AST	 0x404
+#define OBD_FAIL_OSC_MATCH	       0x405
+#define OBD_FAIL_OSC_BRW_PREP_REQ	0x406
+#define OBD_FAIL_OSC_SHUTDOWN	    0x407
+#define OBD_FAIL_OSC_CHECKSUM_RECEIVE    0x408
+#define OBD_FAIL_OSC_CHECKSUM_SEND       0x409
+#define OBD_FAIL_OSC_BRW_PREP_REQ2       0x40a
+#define OBD_FAIL_OSC_CONNECT_CKSUM       0x40b
+#define OBD_FAIL_OSC_CKSUM_ADLER_ONLY    0x40c
+#define OBD_FAIL_OSC_DIO_PAUSE	   0x40d
+#define OBD_FAIL_OSC_OBJECT_CONTENTION   0x40e
+#define OBD_FAIL_OSC_CP_CANCEL_RACE      0x40f
+#define OBD_FAIL_OSC_CP_ENQ_RACE	 0x410
+#define OBD_FAIL_OSC_NO_GRANT	    0x411
+#define OBD_FAIL_OSC_DELAY_SETTIME	 0x412
+
+#define OBD_FAIL_PTLRPC		  0x500
+#define OBD_FAIL_PTLRPC_ACK	      0x501
+#define OBD_FAIL_PTLRPC_RQBD	     0x502
+#define OBD_FAIL_PTLRPC_BULK_GET_NET     0x503
+#define OBD_FAIL_PTLRPC_BULK_PUT_NET     0x504
+#define OBD_FAIL_PTLRPC_DROP_RPC	 0x505
+#define OBD_FAIL_PTLRPC_DELAY_SEND       0x506
+#define OBD_FAIL_PTLRPC_DELAY_RECOV      0x507
+#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB   0x508
+#define OBD_FAIL_PTLRPC_PAUSE_REQ	0x50a
+#define OBD_FAIL_PTLRPC_PAUSE_REP	0x50c
+#define OBD_FAIL_PTLRPC_IMP_DEACTIVE     0x50d
+#define OBD_FAIL_PTLRPC_DUMP_LOG	 0x50e
+#define OBD_FAIL_PTLRPC_LONG_REPL_UNLINK 0x50f
+#define OBD_FAIL_PTLRPC_LONG_BULK_UNLINK 0x510
+#define OBD_FAIL_PTLRPC_HPREQ_TIMEOUT    0x511
+#define OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT  0x512
+#define OBD_FAIL_PTLRPC_DROP_REQ_OPC     0x513
+#define OBD_FAIL_PTLRPC_FINISH_REPLAY    0x514
+#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2  0x515
+#define OBD_FAIL_PTLRPC_DELAY_IMP_FULL   0x516
+#define OBD_FAIL_PTLRPC_CANCEL_RESEND    0x517
+
+#define OBD_FAIL_OBD_PING_NET	    0x600
+#define OBD_FAIL_OBD_LOG_CANCEL_NET      0x601
+#define OBD_FAIL_OBD_LOGD_NET	    0x602
+#define OBD_FAIL_OBD_QC_CALLBACK_NET     0x603
+#define OBD_FAIL_OBD_DQACQ	       0x604
+#define OBD_FAIL_OBD_LLOG_SETUP	  0x605
+#define OBD_FAIL_OBD_LOG_CANCEL_REP      0x606
+#define OBD_FAIL_OBD_IDX_READ_NET	0x607
+#define OBD_FAIL_OBD_IDX_READ_BREAK	 0x608
+#define OBD_FAIL_OBD_NO_LRU		 0x609
+
+#define OBD_FAIL_TGT_REPLY_NET	   0x700
+#define OBD_FAIL_TGT_CONN_RACE	   0x701
+#define OBD_FAIL_TGT_FORCE_RECONNECT     0x702
+#define OBD_FAIL_TGT_DELAY_CONNECT       0x703
+#define OBD_FAIL_TGT_DELAY_RECONNECT     0x704
+#define OBD_FAIL_TGT_DELAY_PRECREATE     0x705
+#define OBD_FAIL_TGT_TOOMANY_THREADS     0x706
+#define OBD_FAIL_TGT_REPLAY_DROP	 0x707
+#define OBD_FAIL_TGT_FAKE_EXP	    0x708
+#define OBD_FAIL_TGT_REPLAY_DELAY	0x709
+#define OBD_FAIL_TGT_LAST_REPLAY	 0x710
+#define OBD_FAIL_TGT_CLIENT_ADD	  0x711
+#define OBD_FAIL_TGT_RCVG_FLAG	   0x712
+
+#define OBD_FAIL_MDC_REVALIDATE_PAUSE    0x800
+#define OBD_FAIL_MDC_ENQUEUE_PAUSE       0x801
+#define OBD_FAIL_MDC_OLD_EXT_FLAGS       0x802
+#define OBD_FAIL_MDC_GETATTR_ENQUEUE     0x803
+#define OBD_FAIL_MDC_RPCS_SEM		 0x804
+#define OBD_FAIL_MDC_LIGHTWEIGHT	 0x805
+
+#define OBD_FAIL_MGS		     0x900
+#define OBD_FAIL_MGS_ALL_REQUEST_NET     0x901
+#define OBD_FAIL_MGS_ALL_REPLY_NET       0x902
+#define OBD_FAIL_MGC_PAUSE_PROCESS_LOG   0x903
+#define OBD_FAIL_MGS_PAUSE_REQ	   0x904
+#define OBD_FAIL_MGS_PAUSE_TARGET_REG    0x905
+
+#define OBD_FAIL_QUOTA_DQACQ_NET			0xA01
+#define OBD_FAIL_QUOTA_EDQUOT	    0xA02
+#define OBD_FAIL_QUOTA_DELAY_REINT       0xA03
+#define OBD_FAIL_QUOTA_RECOVERABLE_ERR   0xA04
+
+#define OBD_FAIL_LPROC_REMOVE	    0xB00
+
+#define OBD_FAIL_GENERAL_ALLOC	   0xC00
+
+#define OBD_FAIL_SEQ		     0x1000
+#define OBD_FAIL_SEQ_QUERY_NET	   0x1001
+#define OBD_FAIL_SEQ_EXHAUST		 0x1002
+
+#define OBD_FAIL_FLD		     0x1100
+#define OBD_FAIL_FLD_QUERY_NET	   0x1101
+
+#define OBD_FAIL_SEC_CTX		 0x1200
+#define OBD_FAIL_SEC_CTX_INIT_NET	0x1201
+#define OBD_FAIL_SEC_CTX_INIT_CONT_NET   0x1202
+#define OBD_FAIL_SEC_CTX_FINI_NET	0x1203
+#define OBD_FAIL_SEC_CTX_HDL_PAUSE       0x1204
+
+#define OBD_FAIL_LLOG			       0x1300
+#define OBD_FAIL_LLOG_ORIGIN_CONNECT_NET	    0x1301
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CREATE_NET      0x1302
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_DESTROY_NET     0x1303
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_READ_HEADER_NET 0x1304
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_NEXT_BLOCK_NET  0x1305
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_PREV_BLOCK_NET  0x1306
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_WRITE_REC_NET   0x1307
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CLOSE_NET       0x1308
+#define OBD_FAIL_LLOG_CATINFO_NET		   0x1309
+#define OBD_FAIL_MDS_SYNC_CAPA_SL		   0x1310
+#define OBD_FAIL_SEQ_ALLOC			  0x1311
+
+#define OBD_FAIL_LLITE			      0x1400
+#define OBD_FAIL_LLITE_FAULT_TRUNC_RACE	     0x1401
+#define OBD_FAIL_LOCK_STATE_WAIT_INTR	       0x1402
+#define OBD_FAIL_LOV_INIT			    0x1403
+#define OBD_FAIL_GLIMPSE_DELAY			    0x1404
+
+#define OBD_FAIL_FID_INDIR	0x1501
+#define OBD_FAIL_FID_INLMA	0x1502
+#define OBD_FAIL_FID_IGIF	0x1504
+#define OBD_FAIL_FID_LOOKUP	0x1505
+#define OBD_FAIL_FID_NOLMA	0x1506
+
+/* LFSCK */
+#define OBD_FAIL_LFSCK_DELAY1		0x1600
+#define OBD_FAIL_LFSCK_DELAY2		0x1601
+#define OBD_FAIL_LFSCK_DELAY3		0x1602
+#define OBD_FAIL_LFSCK_LINKEA_CRASH	0x1603
+#define OBD_FAIL_LFSCK_LINKEA_MORE	0x1604
+#define OBD_FAIL_LFSCK_FATAL1		0x1608
+#define OBD_FAIL_LFSCK_FATAL2		0x1609
+#define OBD_FAIL_LFSCK_CRASH		0x160a
+#define OBD_FAIL_LFSCK_NO_AUTO		0x160b
+#define OBD_FAIL_LFSCK_NO_DOUBLESCAN	0x160c
+
+/* UPDATE */
+#define OBD_FAIL_UPDATE_OBJ_NET			0x1700
+#define OBD_FAIL_UPDATE_OBJ_NET_REP		0x1701
+
+
+/* Map the OBD_FAIL_* names onto their CFS_FAIL_* equivalents so existing callers need no changes */
+#define OBD_FAIL_PRECHECK(id)		   CFS_FAIL_PRECHECK(id)
+#define OBD_FAIL_CHECK(id)		      CFS_FAIL_CHECK(id)
+#define OBD_FAIL_CHECK_VALUE(id, value)	 CFS_FAIL_CHECK_VALUE(id, value)
+#define OBD_FAIL_CHECK_ORSET(id, value)	 CFS_FAIL_CHECK_ORSET(id, value)
+#define OBD_FAIL_CHECK_RESET(id, value)	 CFS_FAIL_CHECK_RESET(id, value)
+#define OBD_FAIL_RETURN(id, ret)		CFS_FAIL_RETURN(id, ret)
+#define OBD_FAIL_TIMEOUT(id, secs)	      CFS_FAIL_TIMEOUT(id, secs)
+#define OBD_FAIL_TIMEOUT_MS(id, ms)	     CFS_FAIL_TIMEOUT_MS(id, ms)
+#define OBD_FAIL_TIMEOUT_ORSET(id, value, secs) CFS_FAIL_TIMEOUT_ORSET(id, value, secs)
+#define OBD_RACE(id)			    CFS_RACE(id)
+#define OBD_FAIL_ONCE			   CFS_FAIL_ONCE
+#define OBD_FAILED			      CFS_FAILED
+
+extern atomic_t libcfs_kmemory;
+
+#ifdef LPROCFS
+#define obd_memory_add(size)	/* add size to the OBD_MEMORY_STAT counter */ \
+	lprocfs_counter_add(obd_memory, OBD_MEMORY_STAT, (long)(size))
+#define obd_memory_sub(size)	/* subtract size from the same counter */ \
+	lprocfs_counter_sub(obd_memory, OBD_MEMORY_STAT, (long)(size))
+#define obd_memory_sum()	/* collected with LPROCFS_FIELDS_FLAGS_SUM */ \
+	lprocfs_stats_collector(obd_memory, OBD_MEMORY_STAT,		  \
+				LPROCFS_FIELDS_FLAGS_SUM)
+#define obd_pages_add(order)	/* adds 1 << order to the pages counter */ \
+	lprocfs_counter_add(obd_memory, OBD_MEMORY_PAGES_STAT,		\
+			    (long)(1 << (order)))
+#define obd_pages_sub(order)	/* subtracts 1 << order from it */	  \
+	lprocfs_counter_sub(obd_memory, OBD_MEMORY_PAGES_STAT,		\
+			    (long)(1 << (order)))
+#define obd_pages_sum()		/* collected with LPROCFS_FIELDS_FLAGS_SUM */ \
+	lprocfs_stats_collector(obd_memory, OBD_MEMORY_PAGES_STAT,	    \
+				LPROCFS_FIELDS_FLAGS_SUM)
+
+extern void obd_update_maxusage(void);
+extern __u64 obd_memory_max(void);
+extern __u64 obd_pages_max(void);
+
+#else
+
+extern __u64 obd_alloc;
+extern __u64 obd_pages;
+
+extern __u64 obd_max_alloc;
+extern __u64 obd_max_pages;
+
+static inline void obd_memory_add(long size)
+{
+	obd_alloc = obd_alloc + size;	/* running byte total */
+	if (obd_max_alloc < obd_alloc)	/* passed the high-water mark */
+		obd_max_alloc = obd_alloc;
+}
+
+static inline void obd_memory_sub(long size)
+{
+	obd_alloc = obd_alloc - size;	/* the recorded maximum is left alone */
+}
+
+static inline void obd_pages_add(int order)
+{
+	obd_pages = obd_pages + (1 << order);	/* 2^order pages */
+	if (obd_max_pages < obd_pages)		/* passed the high-water mark */
+		obd_max_pages = obd_pages;
+}
+
+static inline void obd_pages_sub(int order)
+{
+	obd_pages = obd_pages - (1 << order);	/* 2^order pages released */
+}
+
+#define obd_memory_sum() (obd_alloc)
+#define obd_pages_sum()  (obd_pages)
+
+#define obd_memory_max() (obd_max_alloc)
+#define obd_pages_max() (obd_max_pages)
+
+#endif
+
+#define OBD_DEBUG_MEMUSAGE (1)
+
+#if OBD_DEBUG_MEMUSAGE
+/* Account and trace one allocation; do/while keeps it a single statement. */
+#define OBD_ALLOC_POST(ptr, size, name) do {				\
+	obd_memory_add(size);						\
+	CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n",		\
+	       (int)(size), ptr);					\
+} while (0)
+/* Assert ptr, un-account and trace it, then POISON() the buffer with 0x5a. */
+#define OBD_FREE_PRE(ptr, size, name) do {				\
+	LASSERT(ptr);							\
+	obd_memory_sub(size);						\
+	CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n",		\
+	       (int)(size), ptr);					\
+	POISON(ptr, 0x5a, size);					\
+} while (0)
+#else /* !OBD_DEBUG_MEMUSAGE */
+#define OBD_ALLOC_POST(ptr, size, name) ((void)0)
+#define OBD_FREE_PRE(ptr, size, name)   ((void)0)
+#endif /* !OBD_DEBUG_MEMUSAGE */
+
+#define HAS_FAIL_ALLOC_FLAG OBD_FAIL_CHECK(OBD_FAIL_GENERAL_ALLOC)
+
+#define OBD_ALLOC_FAIL_BITS 24
+#define OBD_ALLOC_FAIL_MASK ((1 << OBD_ALLOC_FAIL_BITS) - 1)
+#define OBD_ALLOC_FAIL_MULT (OBD_ALLOC_FAIL_MASK / 100)
+
+#if defined(LUSTRE_UTILS) /* this version is for utils only */
+#define __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags)		      \
+do {	/* zeroed allocation; logs failure, no fault injection */	      \
+	(ptr) = (cptab) == NULL ?	/* no CPT table: plain kmalloc */     \
+		kmalloc(size, flags) :				      \
+		cfs_cpt_malloc(cptab, cpt, size, flags);		      \
+	if (unlikely((ptr) == NULL)) {					\
+		CERROR("kmalloc of '" #ptr "' (%d bytes) failed at %s:%d\n",  \
+		       (int)(size), __FILE__, __LINE__);		      \
+	} else {							      \
+		memset(ptr, 0, size);	/* callers always get zeroed memory */ \
+		CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %d at %p\n",	      \
+		       (int)(size), ptr);				      \
+	}								      \
+} while (0)
+
+#else /* this version is for the kernel and liblustre */
+#define OBD_FREE_RTN0(ptr)						    \
+({	/* free ptr, set it to NULL, and evaluate to 0 */		    \
+	kfree(ptr);							\
+	(ptr) = NULL;							 \
+	0;								    \
+})
+
+#define __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags)		      \
+do {	/* zeroed allocation with optional fault injection */		      \
+	(ptr) = (cptab) == NULL ?	/* no CPT table: plain kmalloc */     \
+		kmalloc(size, flags) :				      \
+		cfs_cpt_malloc(cptab, cpt, size, flags);		      \
+	if (likely((ptr) != NULL &&	/* order of the || chain matters */   \
+		   (!HAS_FAIL_ALLOC_FLAG || obd_alloc_fail_rate == 0 ||       \
+		    !obd_alloc_fail(ptr, #ptr, "km", size,		    \
+				    __FILE__, __LINE__) ||		    \
+		    OBD_FREE_RTN0(ptr)))){ /* injected failure: ptr freed, NULL */ \
+		memset(ptr, 0, size);					 \
+		OBD_ALLOC_POST(ptr, size, "kmalloced");		       \
+	}								     \
+} while (0)
+#endif
+
+#define OBD_ALLOC_GFP(ptr, size, gfp_mask) /* NULL cptab: plain kmalloc path */ \
+	__OBD_MALLOC_VERBOSE(ptr, NULL, 0, size, gfp_mask)
+
+#define OBD_ALLOC(ptr, size) OBD_ALLOC_GFP(ptr, size, __GFP_IO)
+#define OBD_ALLOC_WAIT(ptr, size) OBD_ALLOC_GFP(ptr, size, GFP_IOFS)
+#define OBD_ALLOC_PTR(ptr) OBD_ALLOC(ptr, sizeof *(ptr)) /* sized from *ptr */
+#define OBD_ALLOC_PTR_WAIT(ptr) OBD_ALLOC_WAIT(ptr, sizeof *(ptr))
+
+#define OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, gfp_mask) /* CPT-aware form */ \
+	__OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, gfp_mask)
+
+#define OBD_CPT_ALLOC(ptr, cptab, cpt, size)	/* same flags as OBD_ALLOC */ \
+	OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, __GFP_IO)
+
+#define OBD_CPT_ALLOC_PTR(ptr, cptab, cpt)	/* sized from *ptr */	      \
+	OBD_CPT_ALLOC(ptr, cptab, cpt, sizeof *(ptr))
+
+# define __OBD_VMALLOC_VEROBSE(ptr, cptab, cpt, size)			      \
+do {	/* zeroed vmalloc; the VEROBSE spelling is kept for existing users */ \
+	(ptr) = (cptab) == NULL ?	/* parenthesised like the kmalloc form */ \
+		vmalloc(size) :						      \
+		cfs_cpt_vmalloc(cptab, cpt, size);			      \
+	if (unlikely((ptr) == NULL)) {					      \
+		CERROR("vmalloc of '" #ptr "' (%d bytes) failed\n",	      \
+		       (int)(size));					      \
+		CERROR(LPU64" total bytes allocated by Lustre, %d by LNET\n", \
+		       obd_memory_sum(), atomic_read(&libcfs_kmemory));	      \
+	} else {							      \
+		memset(ptr, 0, size);	/* callers always get zeroed memory */ \
+		OBD_ALLOC_POST(ptr, size, "vmalloced");			      \
+	}								      \
+} while (0)
+
+# define OBD_VMALLOC(ptr, size)	/* NULL cptab: plain vmalloc path */	      \
+	 __OBD_VMALLOC_VEROBSE(ptr, NULL, 0, size)
+# define OBD_CPT_VMALLOC(ptr, cptab, cpt, size)	/* CPT-aware form */	      \
+	 __OBD_VMALLOC_VEROBSE(ptr, cptab, cpt, size)
+
+
+/* Allocations above this size are considered too big and could not be done
+ * atomically.
+ *
+ * Be very careful when changing this value, especially when decreasing it,
+ * since vmalloc in Linux doesn't perform well on multi-core systems; calling
+ * vmalloc in a critical path would hurt performance badly. See LU-66.
+ */
+#define OBD_ALLOC_BIG (4 * PAGE_CACHE_SIZE)
+
+#define OBD_ALLOC_LARGE(ptr, size)					    \
+do {	/* vmalloc above OBD_ALLOC_BIG, kmalloc otherwise; size is evaluated twice */ \
+	if ((size) > OBD_ALLOC_BIG)					    \
+		OBD_VMALLOC(ptr, size);					    \
+	else								    \
+		OBD_ALLOC(ptr, size);					    \
+} while (0)
+
+#define OBD_CPT_ALLOC_LARGE(ptr, cptab, cpt, size)			      \
+do {	/* CPT-aware form of OBD_ALLOC_LARGE; size is evaluated twice */      \
+	if ((size) > OBD_ALLOC_BIG)					      \
+		OBD_CPT_VMALLOC(ptr, cptab, cpt, size);			      \
+	else								      \
+		OBD_CPT_ALLOC(ptr, cptab, cpt, size);			      \
+} while (0)
+
+#define OBD_FREE_LARGE(ptr, size)					     \
+do {	/* size must equal the allocation size so the matching free is picked */ \
+	if ((size) > OBD_ALLOC_BIG)					     \
+		OBD_VFREE(ptr, size);					     \
+	else								     \
+		OBD_FREE(ptr, size);					     \
+} while (0)
+
+
+#ifdef CONFIG_DEBUG_SLAB /* no-ops; presumably slab debug poisons by itself - confirm */
+#define POISON(ptr, c, s) do {} while (0)
+#define POISON_PTR(ptr)  ((void)0)
+#else /* fill s bytes with c / overwrite the pointer with a recognisable value */
+#define POISON(ptr, c, s) memset((ptr), (c), (s))
+#define POISON_PTR(ptr)  ((ptr) = (void *)0xdeadbeef)
+#endif
+
+#ifdef POISON_BULK /* fill one whole page (PAGE_CACHE_SIZE bytes) with val via kmap */
+#define POISON_PAGE(page, val) do { memset(kmap(page), val, PAGE_CACHE_SIZE);   \
+				    kunmap(page); } while (0)
+#else /* compiled out: expands to an empty statement */
+#define POISON_PAGE(page, val) do { } while (0)
+#endif
+
+#define OBD_FREE(ptr, size)						   \
+do {	/* with OBD_DEBUG_MEMUSAGE on, OBD_FREE_PRE asserts ptr is non-NULL */ \
+	OBD_FREE_PRE(ptr, size, "kfreed");				    \
+	kfree(ptr);							\
+	POISON_PTR(ptr); /* assigns to ptr unless CONFIG_DEBUG_SLAB: needs an lvalue */ \
+} while(0)
+
+
+#define OBD_FREE_RCU(ptr, size, handle)					      \
+do {	/* hand ptr and size to class_handle_free_cb through call_rcu() */    \
+	struct portals_handle *__h = (handle);				      \
+									      \
+	LASSERT(__h != NULL);	/* handle is evaluated exactly once */	      \
+	__h->h_cookie = (unsigned long)(ptr);				      \
+	__h->h_size = (size);						      \
+	call_rcu(&__h->h_rcu, class_handle_free_cb);			      \
+	POISON_PTR(ptr); /* assigns to ptr unless CONFIG_DEBUG_SLAB: needs an lvalue */ \
+} while (0)
+
+
+#define OBD_VFREE(ptr, size)	/* vfree counterpart of OBD_FREE */	\
+	do {						\
+		OBD_FREE_PRE(ptr, size, "vfreed");	\
+		vfree(ptr);			\
+		POISON_PTR(ptr); /* needs an lvalue unless CONFIG_DEBUG_SLAB */ \
+	} while (0)
+
+/* we memset() the slab object to 0 when allocation succeeds, so DO NOT
+ * HAVE A CTOR THAT DOES ANYTHING.  its work will be cleared here.  we'd
+ * love to assert on that, but slab.c keeps kmem_cache_s all to itself. */
+#define OBD_SLAB_FREE_RTN0(ptr, slab)					 \
+({	/* return ptr to slab, set it to NULL, and evaluate to 0 */	    \
+	kmem_cache_free((slab), (ptr));				    \
+	(ptr) = NULL;							 \
+	0;								    \
+})
+
+#define __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, type)	      \
+do {	/* zeroed slab allocation with optional fault injection */	      \
+	LASSERT(ergo((type) != GFP_ATOMIC, !in_interrupt()));	      \
+	(ptr) = (cptab) == NULL ?	/* no CPT table: plain slab alloc */  \
+		kmem_cache_alloc(slab, type) :			      \
+		cfs_mem_cache_cpt_alloc(slab, cptab, cpt, type);	      \
+	if (likely((ptr) != NULL &&	/* order of the || chain matters */   \
+		   (!HAS_FAIL_ALLOC_FLAG || obd_alloc_fail_rate == 0 ||       \
+		    !obd_alloc_fail(ptr, #ptr, "slab-", size,		 \
+				    __FILE__, __LINE__) ||		    \
+		    OBD_SLAB_FREE_RTN0(ptr, slab)))) { /* injected failure: freed, NULL */ \
+		memset(ptr, 0, size);	/* wipes anything a slab ctor set */ \
+		OBD_ALLOC_POST(ptr, size, "slab-alloced");		    \
+	}								     \
+} while(0)
+
+#define OBD_SLAB_ALLOC_GFP(ptr, slab, size, flags)			      \
+	__OBD_SLAB_ALLOC_VERBOSE(ptr, slab, NULL, 0, size, flags)
+#define OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, flags)	      \
+	__OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, flags)
+
+#define OBD_FREE_PTR(ptr) OBD_FREE(ptr, sizeof *(ptr))
+
+#define OBD_SLAB_FREE(ptr, slab, size)					\
+do {	/* with OBD_DEBUG_MEMUSAGE on, OBD_FREE_PRE asserts ptr is non-NULL */ \
+	OBD_FREE_PRE(ptr, size, "slab-freed");				\
+	kmem_cache_free(slab, ptr);					\
+	POISON_PTR(ptr); /* needs an lvalue unless CONFIG_DEBUG_SLAB */    \
+} while(0)
+
+#define OBD_SLAB_ALLOC(ptr, slab, size)					      \
+	OBD_SLAB_ALLOC_GFP(ptr, slab, size, __GFP_IO)
+
+#define OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, size)			      \
+	OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, __GFP_IO)
+
+#define OBD_SLAB_ALLOC_PTR(ptr, slab)					      \
+	OBD_SLAB_ALLOC(ptr, slab, sizeof *(ptr))
+
+#define OBD_SLAB_CPT_ALLOC_PTR(ptr, slab, cptab, cpt)			      \
+	OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, sizeof *(ptr))
+
+#define OBD_SLAB_ALLOC_PTR_GFP(ptr, slab, flags)			      \
+	OBD_SLAB_ALLOC_GFP(ptr, slab, sizeof *(ptr), flags)
+
+#define OBD_SLAB_CPT_ALLOC_PTR_GFP(ptr, slab, cptab, cpt, flags)		      \
+	OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, sizeof *(ptr), flags)
+
+#define OBD_SLAB_FREE_PTR(ptr, slab)					      \
+	OBD_SLAB_FREE((ptr), (slab), sizeof *(ptr))
+
+#define KEY_IS(str) /* prefix match; uses key and keylen from the caller's scope */ \
+	(keylen >= (sizeof(str)-1) && memcmp(key, str, (sizeof(str)-1)) == 0)
+
+/* Wrapper for contiguous page frame allocation */
+#define __OBD_PAGE_ALLOC_VERBOSE(ptr, cptab, cpt, gfp_mask)		      \
+do {	/* allocates exactly one page; CPT-aware when cptab is not NULL */    \
+	(ptr) = (cptab) == NULL ?					      \
+		alloc_page(gfp_mask) :					      \
+		cfs_page_cpt_alloc(cptab, cpt, gfp_mask);		      \
+	if (unlikely((ptr) == NULL)) {					      \
+		CERROR("alloc_pages of '" #ptr "' %d page(s) / "LPU64" bytes "\
+		       "failed\n", (int)1,				      \
+		       (__u64)(1 << PAGE_CACHE_SHIFT));			      \
+		CERROR(LPU64" total bytes and "LPU64" total pages "	      \
+		       "("LPU64" bytes) allocated by Lustre, "		      \
+		       "%d total bytes by LNET\n",			      \
+		       obd_memory_sum(),				      \
+		       obd_pages_sum(),	/* page count first ... */	      \
+		       obd_pages_sum() << PAGE_CACHE_SHIFT, /* ... then bytes */ \
+		       atomic_read(&libcfs_kmemory));			      \
+	} else {							      \
+		obd_pages_add(0);	/* order 0: account one page */	      \
+		CDEBUG(D_MALLOC, "alloc_pages '" #ptr "': %d page(s) / "      \
+		       LPU64" bytes at %p.\n",				      \
+		       (int)1,						      \
+		       (__u64)(1 << PAGE_CACHE_SHIFT), ptr);		      \
+	}								      \
+} while (0)
+
+#define OBD_PAGE_ALLOC(ptr, gfp_mask)					      \
+	__OBD_PAGE_ALLOC_VERBOSE(ptr, NULL, 0, gfp_mask)
+#define OBD_PAGE_CPT_ALLOC(ptr, cptab, cpt, gfp_mask)			      \
+	__OBD_PAGE_ALLOC_VERBOSE(ptr, cptab, cpt, gfp_mask)
+
+#define OBD_PAGE_FREE(ptr)						    \
+do {	/* frees a single page and un-accounts it; ptr must be non-NULL */ \
+	LASSERT(ptr);							 \
+	obd_pages_sub(0);	/* order 0: one page */			     \
+	CDEBUG(D_MALLOC, "free_pages '" #ptr "': %d page(s) / "LPU64" bytes " \
+	       "at %p.\n",						    \
+	       (int)1, (__u64)(1 << PAGE_CACHE_SHIFT),			  \
+	       ptr);							  \
+	__free_page(ptr);						   \
+	(ptr) = (void *)0xdeadbeef; /* always poisoned, POISON_PTR is not used here */ \
+} while (0)
+
+#endif
diff --git a/drivers/staging/lustre/lustre/lclient/glimpse.c b/drivers/staging/lustre/lustre/lclient/glimpse.c
new file mode 100644
index 000000000000..7f3974be1f92
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lclient/glimpse.c
@@ -0,0 +1,274 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * glimpse code shared between vvp and liblustre (and other Lustre clients in
+ * the future).
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Oleg Drokin <oleg.drokin@sun.com>
+ */
+
+#include <linux/libcfs/libcfs.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <obd.h>
+
+# include <lustre_dlm.h>
+# include <lustre_lite.h>
+# include <lustre_mdc.h>
+# include <linux/pagemap.h>
+# include <linux/file.h>
+
+#include "cl_object.h"
+#include "lclient.h"
+# include "../llite/llite_internal.h"
+
+static const struct cl_lock_descr whole_file = { /* copied as a template */
+	.cld_start = 0,
+	.cld_end   = CL_PAGE_EOF,
+	.cld_mode  = CLM_READ	/* cl_glimpse_lock() overrides the mode */
+};
+
+/*
+ * Check whether file has possible unwritten pages.
+ *
+ * \retval 1    file is mmap-ed or has at least one dirty-tagged page
+ * \retval 0    otherwise
+ */
+blkcnt_t dirty_cnt(struct inode *inode)
+{
+	struct ccc_object *vob = cl_inode2ccc(inode);
+	void *page[1];
+	unsigned int found = 0;
+
+	/* Looking up a single dirty-tagged entry is enough to decide. */
+	if (inode->i_mapping != NULL)
+		found = radix_tree_gang_lookup_tag(&inode->i_mapping->page_tree,
+						   page, 0, 1,
+						   PAGECACHE_TAG_DIRTY);
+	if (found != 0)
+		return 1;
+	return atomic_read(&vob->cob_mmap_cnt) > 0 ? 1 : 0;
+}
+
+int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io,
+		    struct inode *inode, struct cl_object *clob, int agl)
+{
+	struct cl_lock_descr *descr = &ccc_env_info(env)->cti_descr;
+	struct cl_inode_info *lli   = cl_i2info(inode);
+	const struct lu_fid  *fid   = lu_object_fid(&clob->co_lu);
+	struct ccc_io	*cio   = ccc_env_io(env);
+	struct cl_lock       *lock;
+	int result;
+	/* Request a glimpse lock over whole_file, then merge the LVB. */
+	ENTRY;
+	result = 0;
+	if (!(lli->lli_flags & LLIF_MDS_SIZE_LOCK)) {
+		CDEBUG(D_DLMTRACE, "Glimpsing inode "DFID"\n", PFID(fid));
+		if (lli->lli_has_smd) {
+			/* NOTE: this looks like DLM lock request, but it may
+			 *       not be one. Due to CEF_ASYNC flag (translated
+			 *       to LDLM_FL_HAS_INTENT by osc), this is
+			 *       glimpse request, that won't revoke any
+			 *       conflicting DLM locks held. Instead,
+			 *       ll_glimpse_callback() will be called on each
+			 *       client holding a DLM lock against this file,
+			 *       and resulting size will be returned for each
+			 *       stripe. DLM lock on [0, EOF] is acquired only
+			 *       if there were no conflicting locks. If there
+			 *       were conflicting locks, enqueuing or waiting
+			 *       fails with -ENAVAIL, but valid inode
+			 *       attributes are returned anyway. */
+			*descr = whole_file;
+			descr->cld_obj   = clob;
+			descr->cld_mode  = CLM_PHANTOM;
+			descr->cld_enq_flags = CEF_ASYNC | CEF_MUST;
+			if (agl)
+				descr->cld_enq_flags |= CEF_AGL;
+			cio->cui_glimpse = 1; /* read by ccc_lock_fits_into() */
+			/*
+			 * CEF_ASYNC is used because glimpse sub-locks cannot
+			 * deadlock (because they never conflict with other
+			 * locks) and, hence, can be enqueued out-of-order.
+			 *
+			 * CEF_MUST protects glimpse lock from conversion into
+			 * a lockless mode.
+			 */
+			lock = cl_lock_request(env, io, descr, "glimpse",
+					       current);
+			cio->cui_glimpse = 0;
+
+			if (lock == NULL) /* no lock to wait on: success */
+				RETURN(0);
+
+			if (IS_ERR(lock))
+				RETURN(PTR_ERR(lock));
+
+			LASSERT(agl == 0); /* NOTE(review): AGL never gets here? */
+			result = cl_wait(env, lock);
+			if (result == 0) {
+				cl_merge_lvb(env, inode);
+				if (cl_isize_read(inode) > 0 &&
+				    inode->i_blocks == 0) {
+					/*
+					 * LU-417: Add dirty pages block count
+					 * lest i_blocks reports 0, some "cp" or
+					 * "tar" may think it's a completely
+					 * sparse file and skip it.
+					 */
+					inode->i_blocks = dirty_cnt(inode);
+				}
+				cl_unuse(env, lock);
+			}
+			cl_lock_release(env, lock, "glimpse", current);
+		} else {
+			CDEBUG(D_DLMTRACE, "No objects for inode\n");
+			cl_merge_lvb(env, inode);
+		}
+	}
+
+	RETURN(result);
+}
+
+/*
+ * Get an env and its thread io (bound to the inode's cl_object): returns 1
+ * with *envout / *ioout set, 0 for a non-regular file, or cl_env_get() error.
+ */
+static int cl_io_get(struct inode *inode, struct lu_env **envout,
+		     struct cl_io **ioout, int *refcheck)
+{
+	struct cl_object *clob = cl_i2info(inode)->lli_clob;
+	struct lu_env *env;
+	struct cl_io *io;
+
+	if (!S_ISREG(cl_inode_mode(inode)))
+		return 0;
+
+	env = cl_env_get(refcheck);
+	if (IS_ERR(env))
+		return PTR_ERR(env);
+	io = ccc_env_thread_io(env);
+	io->ci_obj = clob;
+	*envout = env;
+	*ioout = io;
+	return 1;
+}
+
+int cl_glimpse_size0(struct inode *inode, int agl)
+{
+	/*
+	 * Runs cl_glimpse_lock() for \a inode under a CIT_MISC io. No
+	 * ast_flags argument is needed, because osc_lock_enqueue() takes care
+	 * of the possible deadlock that said argument was introduced to avoid.
+	 */
+	/*
+	 * XXX but note that ll_file_seek() passes LDLM_FL_BLOCK_NOWAIT to
+	 * cl_glimpse_size(), which doesn't make sense: glimpse locks are not
+	 * blocking anyway.
+	 */
+	struct lu_env	  *env = NULL;
+	struct cl_io	   *io  = NULL;
+	int		     result;
+	int		     refcheck;
+
+	ENTRY;
+
+	result = cl_io_get(inode, &env, &io, &refcheck);
+	if (result > 0) {
+	again:
+		io->ci_verify_layout = 1;
+		result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+		if (result > 0)
+			/*
+			 * nothing to do for this io. This currently happens
+			 * when stripe sub-object's are not yet created.
+			 */
+			result = io->ci_result;
+		else if (result == 0)
+			result = cl_glimpse_lock(env, io, inode, io->ci_obj,
+						 agl);
+
+		OBD_FAIL_TIMEOUT(OBD_FAIL_GLIMPSE_DELAY, 2);
+		cl_io_fini(env, io);
+		if (unlikely(io->ci_need_restart)) /* redo from cl_io_init() */
+			goto again;
+		cl_env_put(env, &refcheck);
+	}
+	RETURN(result);
+}
+
+int cl_local_size(struct inode *inode)
+{
+	struct lu_env	   *env = NULL;
+	struct cl_io	    *io  = NULL;
+	struct ccc_thread_info  *cti;
+	struct cl_object	*clob;
+	struct cl_lock_descr    *descr;
+	struct cl_lock	  *lock;
+	int		      result;
+	int		      refcheck;
+	/* Merge LVB if cl_lock_peek() finds a whole_file lock, else -ENODATA. */
+	ENTRY;
+
+	if (!cl_i2info(inode)->lli_has_smd)
+		RETURN(0);
+
+	result = cl_io_get(inode, &env, &io, &refcheck);
+	if (result <= 0) /* not a regular file (0) or no env (< 0) */
+		RETURN(result);
+
+	clob = io->ci_obj;
+	result = cl_io_init(env, io, CIT_MISC, clob);
+	if (result > 0)
+		result = io->ci_result;
+	else if (result == 0) {
+		cti = ccc_env_info(env);
+		descr = &cti->cti_descr;
+
+		*descr = whole_file;
+		descr->cld_obj = clob;
+		lock = cl_lock_peek(env, io, descr, "localsize", current);
+		if (lock != NULL) {
+			cl_merge_lvb(env, inode);
+			cl_unuse(env, lock);
+			cl_lock_release(env, lock, "localsize", current);
+			result = 0;
+		} else
+			result = -ENODATA;
+	}
+	cl_io_fini(env, io);
+	cl_env_put(env, &refcheck);
+	RETURN(result);
+}
diff --git a/drivers/staging/lustre/lustre/lclient/lcommon_cl.c b/drivers/staging/lustre/lustre/lclient/lcommon_cl.c
new file mode 100644
index 000000000000..4a0166687f07
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lclient/lcommon_cl.c
@@ -0,0 +1,1325 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * cl code shared between vvp and liblustre (and other Lustre clients in the
+ * future).
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/fs.h>
+# include <linux/sched.h>
+# include <linux/mm.h>
+# include <linux/quotaops.h>
+# include <linux/highmem.h>
+# include <linux/pagemap.h>
+# include <linux/rbtree.h>
+
+#include <obd.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <lustre_lite.h>
+#include <lustre_dlm.h>
+#include <lustre_ver.h>
+#include <lustre_mdc.h>
+#include <cl_object.h>
+
+#include <lclient.h>
+
+#include "../llite/llite_internal.h"
+
+const struct cl_req_operations ccc_req_ops; /* given to cl_req_slice_add() */
+
+/*
+ * ccc_ prefix stands for "Common Client Code".
+ */
+
+static struct kmem_cache *ccc_lock_kmem;
+static struct kmem_cache *ccc_object_kmem;
+static struct kmem_cache *ccc_thread_kmem;
+static struct kmem_cache *ccc_session_kmem;
+static struct kmem_cache *ccc_req_kmem;
+
+static struct lu_kmem_descr ccc_caches[] = { /* see ccc_global_init()/fini() */
+	{
+		.ckd_cache = &ccc_lock_kmem,
+		.ckd_name  = "ccc_lock_kmem",
+		.ckd_size  = sizeof(struct ccc_lock),
+	},
+	{
+		.ckd_cache = &ccc_object_kmem,
+		.ckd_name  = "ccc_object_kmem",
+		.ckd_size  = sizeof(struct ccc_object),
+	},
+	{
+		.ckd_cache = &ccc_thread_kmem,
+		.ckd_name  = "ccc_thread_kmem",
+		.ckd_size  = sizeof(struct ccc_thread_info),
+	},
+	{
+		.ckd_cache = &ccc_session_kmem,
+		.ckd_name  = "ccc_session_kmem",
+		.ckd_size  = sizeof(struct ccc_session),
+	},
+	{
+		.ckd_cache = &ccc_req_kmem,
+		.ckd_name  = "ccc_req_kmem",
+		.ckd_size  = sizeof(struct ccc_req),
+	},
+	{
+		.ckd_cache = NULL,	/* last entry; presumably ends the table */
+	},
+};
+
+/*****************************************************************************
+ *
+ * Vvp device and device type functions.
+ *
+ */
+
+/* lct_init of ccc_key: allocate a ccc_thread_info, ERR_PTR() on failure. */
+void *ccc_key_init(const struct lu_context *ctx, struct lu_context_key *key)
+{
+	struct ccc_thread_info *info;
+
+	OBD_SLAB_ALLOC_PTR_GFP(info, ccc_thread_kmem, __GFP_IO);
+	if (info != NULL)
+		return info;
+	return ERR_PTR(-ENOMEM);
+}
+
+void ccc_key_fini(const struct lu_context *ctx,
+			 struct lu_context_key *key, void *data)
+{
+	struct ccc_thread_info *info = data; /* lct_fini of ccc_key */
+	OBD_SLAB_FREE_PTR(info, ccc_thread_kmem); /* cache of ccc_key_init() */
+}
+
+/* lct_init of ccc_session_key: allocate a struct ccc_session. */
+void *ccc_session_key_init(const struct lu_context *ctx,
+			   struct lu_context_key *key)
+{
+	struct ccc_session *session;
+
+	OBD_SLAB_ALLOC_PTR_GFP(session, ccc_session_kmem, __GFP_IO);
+	/* callers get ERR_PTR(-ENOMEM), never NULL */
+	return session != NULL ? session : ERR_PTR(-ENOMEM);
+}
+
+void ccc_session_key_fini(const struct lu_context *ctx,
+				 struct lu_context_key *key, void *data)
+{
+	struct ccc_session *session = data; /* lct_fini of ccc_session_key */
+	OBD_SLAB_FREE_PTR(session, ccc_session_kmem); /* same cache as init */
+}
+
+struct lu_context_key ccc_key = { /* value: struct ccc_thread_info */
+	.lct_tags = LCT_CL_THREAD,
+	.lct_init = ccc_key_init,
+	.lct_fini = ccc_key_fini
+};
+
+struct lu_context_key ccc_session_key = { /* value: struct ccc_session */
+	.lct_tags = LCT_SESSION,
+	.lct_init = ccc_session_key_init,
+	.lct_fini = ccc_session_key_fini
+};
+
+
+/* type constructor/destructor: ccc_type_{init,fini,start,stop}(). */
+// LU_TYPE_INIT_FINI(ccc, &ccc_key, &ccc_session_key);
+
+int ccc_device_init(const struct lu_env *env, struct lu_device *d,
+			   const char *name, struct lu_device *next)
+{
+	struct ccc_device  *vdv;
+	int rc;
+	ENTRY;
+	/* Record \a next as the device below \a d and initialize it. */
+	vdv = lu2ccc_dev(d);
+	vdv->cdv_next = lu2cl_dev(next);
+
+	LASSERT(d->ld_site != NULL && next->ld_type != NULL);
+	next->ld_site = d->ld_site; /* both devices share one site */
+	rc = next->ld_type->ldt_ops->ldto_device_init(
+			env, next, next->ld_type->ldt_name, NULL);
+	if (rc == 0) { /* take a reference only on success */
+		lu_device_get(next);
+		lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init);
+	}
+	RETURN(rc);
+}
+
+struct lu_device *ccc_device_fini(const struct lu_env *env,
+					 struct lu_device *d)
+{
+	return cl2lu_dev(lu2ccc_dev(d)->cdv_next); /* the device below \a d */
+}
+
+struct lu_device *ccc_device_alloc(const struct lu_env *env,
+				   struct lu_device_type *t,
+				   struct lustre_cfg *cfg,
+				   const struct lu_device_operations *luops,
+				   const struct cl_device_operations *clops)
+{
+	struct ccc_device *vdv;
+	struct lu_device  *lud;
+	struct cl_site    *site;
+	int rc;
+	ENTRY;
+	/* Allocate a ccc_device and a cl_site for it; ERR_PTR() on failure. */
+	OBD_ALLOC_PTR(vdv);
+	if (vdv == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	lud = &vdv->cdv_cl.cd_lu_dev;
+	cl_device_init(&vdv->cdv_cl, t);
+	ccc2lu_dev(vdv)->ld_ops = luops;
+	vdv->cdv_cl.cd_ops = clops;
+
+	OBD_ALLOC_PTR(site);
+	if (site != NULL) {
+		rc = cl_site_init(site, &vdv->cdv_cl);
+		if (rc == 0)
+			rc = lu_site_init_finish(&site->cs_lu);
+		else {
+			LASSERT(lud->ld_site == NULL);
+			CERROR("Cannot init lu_site, rc %d.\n", rc);
+			OBD_FREE_PTR(site); /* not attached: free it here */
+		}
+	} else
+		rc = -ENOMEM;
+	if (rc != 0) {
+		ccc_device_free(env, lud); /* frees the site too if ld_site set */
+		lud = ERR_PTR(rc);
+	}
+	RETURN(lud);
+}
+
+struct lu_device *ccc_device_free(const struct lu_env *env,
+					 struct lu_device *d)
+{
+	struct ccc_device *vdv  = lu2ccc_dev(d);
+	struct cl_site    *site = lu2cl_site(d->ld_site);
+	struct lu_device  *next = cl2lu_dev(vdv->cdv_next);
+	/* Free the site (if any) and the device; \a next was read beforehand. */
+	if (d->ld_site != NULL) {
+		cl_site_fini(site);
+		OBD_FREE_PTR(site);
+	}
+	cl_device_fini(lu2cl_dev(d));
+	OBD_FREE_PTR(vdv);
+	return next; /* the device that was below \a d */
+}
+
+/* Allocate a ccc_req slice and add it to \a req with ccc_req_ops. */
+int ccc_req_init(const struct lu_env *env, struct cl_device *dev,
+		 struct cl_req *req)
+{
+	struct ccc_req *vrq;
+
+	OBD_SLAB_ALLOC_PTR_GFP(vrq, ccc_req_kmem, __GFP_IO);
+	if (vrq == NULL)
+		return -ENOMEM;
+
+	/* the slice is freed again in ccc_req_completion() */
+	cl_req_slice_add(req, &vrq->crq_cl, dev, &ccc_req_ops);
+	return 0;
+}
+
+/**
+ * An `emergency' environment used by ccc_inode_fini() when cl_env_get()
+ * fails. Access to this environment is serialized by ccc_inode_fini_guard
+ * mutex.
+ */
+static struct lu_env *ccc_inode_fini_env = NULL;
+
+/**
+ * A mutex serializing calls to slp_inode_fini() under extreme memory
+ * pressure, when environments cannot be allocated.
+ */
+static DEFINE_MUTEX(ccc_inode_fini_guard);
+static int dummy_refcheck; /* refcheck slot of ccc_inode_fini_env */
+
+/* Set up caches, the device type and the emergency env; undone on failure. */
+int ccc_global_init(struct lu_device_type *device_type)
+{
+	int result;
+
+	result = lu_kmem_init(ccc_caches);
+	if (result)
+		return result;
+	result = lu_device_type_init(device_type);
+	if (result)
+		goto out_kmem;
+	ccc_inode_fini_env = cl_env_alloc(&dummy_refcheck,
+					  LCT_REMEMBER|LCT_NOREF);
+	if (IS_ERR(ccc_inode_fini_env)) {
+		result = PTR_ERR(ccc_inode_fini_env);
+		/* ccc_global_fini() tests for NULL: don't leave an ERR_PTR. */
+		ccc_inode_fini_env = NULL;
+		goto out_device;
+	}
+	ccc_inode_fini_env->le_ctx.lc_cookie = 0x4;
+	return 0;
+out_device:
+	lu_device_type_fini(device_type);
+out_kmem:
+	lu_kmem_fini(ccc_caches);
+	return result;
+}
+
+void ccc_global_fini(struct lu_device_type *device_type)
+{
+	if (ccc_inode_fini_env != NULL) { /* drop the emergency env */
+		cl_env_put(ccc_inode_fini_env, &dummy_refcheck);
+		ccc_inode_fini_env = NULL;
+	}
+	lu_device_type_fini(device_type); /* reverse of ccc_global_init() */
+	lu_kmem_fini(ccc_caches);
+}
+
+/*****************************************************************************
+ *
+ * Object operations.
+ *
+ */
+
+/* Allocate a ccc_object and add it on top of its own embedded header. */
+struct lu_object *ccc_object_alloc(const struct lu_env *env,
+				   const struct lu_object_header *unused,
+				   struct lu_device *dev,
+				   const struct cl_object_operations *clops,
+				   const struct lu_object_operations *luops)
+{
+	struct cl_object_header *hdr;
+	struct ccc_object *vob;
+	struct lu_object *obj;
+
+	OBD_SLAB_ALLOC_PTR_GFP(vob, ccc_object_kmem, __GFP_IO);
+	if (vob == NULL)
+		return NULL;
+
+	hdr = &vob->cob_header;
+	obj = ccc2lu(vob);
+	cl_object_header_init(hdr);
+	lu_object_init(obj, &hdr->coh_lu, dev);
+	lu_object_add_top(&hdr->coh_lu, obj);
+
+	obj->lo_ops = luops;
+	vob->cob_cl.co_ops = clops;
+	return obj;
+}
+
+/* Take the inode from \a conf and register sizeof(struct ccc_page). */
+int ccc_object_init0(const struct lu_env *env, struct ccc_object *vob,
+		     const struct cl_object_conf *conf)
+{
+	vob->cob_transient_pages = 0;
+	vob->cob_inode = conf->coc_inode;
+	cl_object_page_init(&vob->cob_cl, sizeof(struct ccc_page));
+	return 0;
+}
+
+/*
+ * Object init for the ccc layer: allocate the object of the next device in
+ * the stack, add it below \a obj, then finish with ccc_object_init0().
+ *
+ * \retval -ENOMEM  the lower object could not be allocated
+ */
+int ccc_object_init(const struct lu_env *env, struct lu_object *obj,
+		    const struct lu_object_conf *conf)
+{
+	struct ccc_object *vob = lu2ccc(obj);
+	struct lu_device *under;
+	struct lu_object *below;
+
+	under = &lu2ccc_dev(obj->lo_dev)->cdv_next->cd_lu_dev;
+	below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under);
+	if (below == NULL)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&vob->cob_pending_list);
+	lu_object_add(obj, below);
+	return ccc_object_init0(env, vob, lu2cl_conf(conf));
+}
+
+void ccc_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+	struct ccc_object *vob = lu2ccc(obj);
+	/* Finalize object and header before the memory goes back to the slab. */
+	lu_object_fini(obj);
+	lu_object_header_fini(obj->lo_header);
+	OBD_SLAB_FREE_PTR(vob, ccc_object_kmem); /* see ccc_object_alloc() */
+}
+
+/* Allocate a ccc_lock slice and add it to \a lock with \a lkops. */
+int ccc_lock_init(const struct lu_env *env,
+		  struct cl_object *obj, struct cl_lock *lock,
+		  const struct cl_io *unused,
+		  const struct cl_lock_operations *lkops)
+{
+	struct ccc_lock *clk;
+
+	CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+	OBD_SLAB_ALLOC_PTR_GFP(clk, ccc_lock_kmem, __GFP_IO);
+	if (clk == NULL)
+		return -ENOMEM;
+
+	/* the slice goes back to ccc_lock_kmem in ccc_lock_fini() */
+	cl_lock_slice_add(lock, &clk->clk_cl, obj, lkops);
+	return 0;
+}
+
+int ccc_attr_set(const struct lu_env *env, struct cl_object *obj,
+		 const struct cl_attr *attr, unsigned valid)
+{
+	return 0; /* no-op: nothing is stored here, always succeeds */
+}
+
+int ccc_object_glimpse(const struct lu_env *env,
+		       const struct cl_object *obj, struct ost_lvb *lvb)
+{
+	struct inode *inode = ccc_object_inode(obj);
+	/* Fill the [mac]time of \a lvb from the inode; size is not set here. */
+	ENTRY;
+	lvb->lvb_mtime = cl_inode_mtime(inode);
+	lvb->lvb_atime = cl_inode_atime(inode);
+	lvb->lvb_ctime = cl_inode_ctime(inode);
+	/*
+	 * LU-417: Add dirty pages block count lest i_blocks reports 0, some
+	 * "cp" or "tar" on remote node may think it's a completely sparse file
+	 * and skip it.
+	 */
+	if (lvb->lvb_size > 0 && lvb->lvb_blocks == 0)
+		lvb->lvb_blocks = dirty_cnt(inode); /* 1 or 0, see dirty_cnt() */
+	RETURN(0);
+}
+
+
+
+int ccc_conf_set(const struct lu_env *env, struct cl_object *obj,
+			const struct cl_object_conf *conf)
+{
+	/* TODO: destroy all pages attached to this object. */
+	return 0; /* currently a no-op */
+}
+
+static void ccc_object_size_lock(struct cl_object *obj)
+{
+	struct inode *inode = ccc_object_inode(obj);
+	/* Order: inode size lock first, then the object attr lock. */
+	cl_isize_lock(inode);
+	cl_object_attr_lock(obj);
+}
+
+static void ccc_object_size_unlock(struct cl_object *obj)
+{
+	struct inode *inode = ccc_object_inode(obj);
+	/* Release in the reverse order of ccc_object_size_lock(). */
+	cl_object_attr_unlock(obj);
+	cl_isize_unlock(inode);
+}
+
+/*****************************************************************************
+ *
+ * Page operations.
+ *
+ */
+
+struct page *ccc_page_vmpage(const struct lu_env *env,
+			    const struct cl_page_slice *slice)
+{
+	return cl2vm_page(slice); /* thin wrapper; \a env is unused */
+}
+
+int ccc_page_is_under_lock(const struct lu_env *env,
+			   const struct cl_page_slice *slice,
+			   struct cl_io *io)
+{
+	struct ccc_io	*cio  = ccc_env_io(env);
+	struct cl_lock_descr *desc = &ccc_env_info(env)->cti_descr;
+	struct cl_page       *page = slice->cpl_page;
+	/* -EBUSY for rw/fault io that is group-locked or matches cls_done. */
+	int result;
+
+	ENTRY;
+	/* NOTE(review): cui_fd is NULL-checked in ccc_io_one_lock_index(). */
+	if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE ||
+	    io->ci_type == CIT_FAULT) {
+		if (cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)
+			result = -EBUSY;
+		else {
+			desc->cld_start = page->cp_index; /* one-page extent */
+			desc->cld_end   = page->cp_index;
+			desc->cld_obj   = page->cp_obj;
+			desc->cld_mode  = CLM_READ;
+			result = cl_queue_match(&io->ci_lockset.cls_done,
+						desc) ? -EBUSY : 0;
+		}
+	} else
+		result = 0;
+	RETURN(result);
+}
+
+int ccc_fail(const struct lu_env *env, const struct cl_page_slice *slice)
+{
+	/*
+	 * Cached read? This method unconditionally hits LBUG() when called.
+	 */
+	LBUG();
+	return 0;
+}
+
+void ccc_transient_page_verify(const struct cl_page *page)
+{	/* no checks are implemented */
+}
+
+int ccc_transient_page_own(const struct lu_env *env,
+				   const struct cl_page_slice *slice,
+				   struct cl_io *unused,
+				   int nonblock)
+{
+	ccc_transient_page_verify(slice->cpl_page); /* currently a no-op */
+	return 0; /* always succeeds; \a nonblock is ignored */
+}
+
+void ccc_transient_page_assume(const struct lu_env *env,
+				      const struct cl_page_slice *slice,
+				      struct cl_io *unused)
+{
+	ccc_transient_page_verify(slice->cpl_page); /* currently a no-op */
+}
+
+void ccc_transient_page_unassume(const struct lu_env *env,
+					const struct cl_page_slice *slice,
+					struct cl_io *unused)
+{
+	ccc_transient_page_verify(slice->cpl_page); /* currently a no-op */
+}
+
+void ccc_transient_page_disown(const struct lu_env *env,
+				      const struct cl_page_slice *slice,
+				      struct cl_io *unused)
+{
+	ccc_transient_page_verify(slice->cpl_page); /* currently a no-op */
+}
+
+void ccc_transient_page_discard(const struct lu_env *env,
+				       const struct cl_page_slice *slice,
+				       struct cl_io *unused)
+{
+	struct cl_page *page = slice->cpl_page;
+
+	ccc_transient_page_verify(slice->cpl_page); /* currently a no-op */
+
+	/*
+	 * Discarding a transient page means removing it from the radix tree.
+	 */
+	cl_page_delete(env, page);
+}
+
+int ccc_transient_page_prep(const struct lu_env *env,
+				   const struct cl_page_slice *slice,
+				   struct cl_io *unused)
+{
+	ENTRY;
+	/* A transient page should always be sent, so this returns 0. */
+	RETURN(0);
+}
+
+/*****************************************************************************
+ *
+ * Lock operations.
+ *
+ */
+
+void ccc_lock_delete(const struct lu_env *env,
+		     const struct cl_lock_slice *slice)
+{	/* only checks the object invariant */
+	CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+}
+
+void ccc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice)
+{
+	struct ccc_lock *clk = cl2ccc_lock(slice);
+	OBD_SLAB_FREE_PTR(clk, ccc_lock_kmem); /* pairs with ccc_lock_init() */
+}
+
+int ccc_lock_enqueue(const struct lu_env *env,
+		     const struct cl_lock_slice *slice,
+		     struct cl_io *unused, __u32 enqflags)
+{
+	CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+	return 0; /* only the invariant is checked; always succeeds */
+}
+
+int ccc_lock_unuse(const struct lu_env *env, const struct cl_lock_slice *slice)
+{
+	CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+	return 0; /* only the invariant is checked; always succeeds */
+}
+
+int ccc_lock_wait(const struct lu_env *env, const struct cl_lock_slice *slice)
+{
+	CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+	return 0; /* only the invariant is checked; always succeeds */
+}
+
+/**
+ * Implementation of cl_lock_operations::clo_fits_into() method for ccc
+ * layer, executed every time io finds an existing lock in the lock cache
+ * while creating a new lock: decides whether the cached lock "fits" into io.
+ *
+ * \param slice lock to be checked
+ * \param need  descriptor of the wanted lock (only cld_mode is examined)
+ * \param io    IO that wants a lock.
+ * \retval      non-zero if the cached lock fits, 0 otherwise
+ * \see lov_lock_fits_into().
+ */
+int ccc_lock_fits_into(const struct lu_env *env,
+		       const struct cl_lock_slice *slice,
+		       const struct cl_lock_descr *need,
+		       const struct cl_io *io)
+{
+	const struct cl_lock       *lock  = slice->cls_lock;
+	const struct cl_lock_descr *descr = &lock->cll_descr;
+	const struct ccc_io	*cio   = ccc_env_io(env);
+	int			 result;
+
+	ENTRY;
+	/*
+	 * Work around DLM peculiarity: it assumes that glimpse
+	 * (LDLM_FL_HAS_INTENT) lock is always LCK_PR, and returns a read lock
+	 * when asked for LCK_PW lock with LDLM_FL_HAS_INTENT flag set. Make
+	 * sure that glimpse doesn't get CLM_WRITE top-lock, so that it
+	 * doesn't enqueue CLM_WRITE sub-locks.
+	 */
+	if (cio->cui_glimpse)
+		result = descr->cld_mode != CLM_WRITE;
+
+	/*
+	 * Also, don't match incomplete write locks for read, otherwise read
+	 * would enqueue missing sub-locks in the write mode.
+	 */
+	else if (need->cld_mode != descr->cld_mode)
+		result = lock->cll_state >= CLS_ENQUEUED;
+	else
+		result = 1;
+	RETURN(result);
+}
+
+/**
+ * Implements cl_lock_operations::clo_state() method for ccc layer, invoked
+ * whenever lock state changes. Transfers object attributes that might have
+ * been updated as a result of acquiring the lock into the inode.
+ */
+void ccc_lock_state(const struct lu_env *env,
+		    const struct cl_lock_slice *slice,
+		    enum cl_lock_state state)
+{
+	struct cl_lock *lock = slice->cls_lock;
+	ENTRY;
+
+	/*
+	 * Refresh inode attributes when the lock is moving into CLS_HELD
+	 * state, and only when this is a result of real enqueue, rather than
+	 * of finding lock in the cache.
+	 */
+	if (state == CLS_HELD && lock->cll_state < CLS_HELD) {
+		struct cl_object *obj;
+		struct inode     *inode;
+
+		obj   = slice->cls_obj;
+		inode = ccc_object_inode(obj);
+
+		/* vmtruncate() sets the i_size
+		 * under both a DLM lock and the
+		 * ll_inode_size_lock().  If we don't get the
+		 * ll_inode_size_lock() here we can match the DLM lock and
+		 * reset i_size.  generic_file_write can then trust the
+		 * stale i_size when doing appending writes and effectively
+		 * cancel the result of the truncate.  Getting the
+		 * ll_inode_size_lock() after the enqueue maintains the DLM
+		 * -> ll_inode_size_lock() acquiring order. */
+		if (lock->cll_descr.cld_start == 0 && /* whole-file locks only */
+		    lock->cll_descr.cld_end == CL_PAGE_EOF)
+			cl_merge_lvb(env, inode); /* presumably locks i_size */
+	}
+	EXIT;
+}
+
+/*****************************************************************************
+ *
+ * io operations.
+ *
+ */
+
+void ccc_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+	struct cl_io *io = ios->cis_io;
+	/* Nothing to release here: only the object invariant is checked. */
+	CLOBINVRNT(env, io->ci_obj, ccc_object_invariant(io->ci_obj));
+}
+
+int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io,
+			  __u32 enqflags, enum cl_lock_mode mode,
+			  pgoff_t start, pgoff_t end)
+{
+	struct ccc_io	  *cio   = ccc_env_io(env);
+	struct cl_lock_descr   *descr = &cio->cui_link.cill_descr;
+	struct cl_object       *obj   = io->ci_obj;
+	/* Add one lock on page indices [start, end] of io->ci_obj to \a io. */
+	CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "lock: %d [%lu, %lu]\n", mode, start, end);
+
+	memset(&cio->cui_link, 0, sizeof cio->cui_link); /* clears *descr too */
+
+	if (cio->cui_fd && (cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
+		descr->cld_mode = CLM_GROUP; /* group lock overrides \a mode */
+		descr->cld_gid  = cio->cui_fd->fd_grouplock.cg_gid;
+	} else {
+		descr->cld_mode  = mode;
+	}
+	descr->cld_obj   = obj;
+	descr->cld_start = start;
+	descr->cld_end   = end;
+	descr->cld_enq_flags = enqflags;
+
+	cl_io_lock_add(env, io, &cio->cui_link);
+	RETURN(0);
+}
+
+void ccc_io_update_iov(const struct lu_env *env,
+		       struct ccc_io *cio, struct cl_io *io)
+{
+	int i;
+	size_t size = io->u.ci_rw.crw_count;
+	/* Limit the iovec to crw_count bytes; cui_iov_olen saves a cut len. */
+	cio->cui_iov_olen = 0;
+	if (!cl_is_normalio(env, io) || cio->cui_tot_nrsegs == 0)
+		return;
+
+	for (i = 0; i < cio->cui_tot_nrsegs; i++) {
+		struct iovec *iv = &cio->cui_iov[i];
+
+		if (iv->iov_len < size)
+			size -= iv->iov_len;
+		else {
+			if (iv->iov_len > size) { /* shorten the last segment */
+				cio->cui_iov_olen = iv->iov_len;
+				iv->iov_len = size;
+			}
+			break;
+		}
+	}
+	/* Without a break above, i + 1 > tot_nrsegs and the LASSERTF trips. */
+	cio->cui_nrsegs = i + 1;
+	LASSERTF(cio->cui_tot_nrsegs >= cio->cui_nrsegs,
+		 "tot_nrsegs: %lu, nrsegs: %lu\n",
+		 cio->cui_tot_nrsegs, cio->cui_nrsegs);
+}
+
+/* Byte-offset flavour of ccc_io_one_lock_index(); cl_index() maps offsets. */
+int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io, __u32 enqflags,
+		    enum cl_lock_mode mode, loff_t start, loff_t end)
+{
+	struct cl_object *obj = io->ci_obj;
+	return ccc_io_one_lock_index(env, io, enqflags, mode,
+				     cl_index(obj, start), cl_index(obj, end));
+}
+
+void ccc_io_end(const struct lu_env *env, const struct cl_io_slice *ios)
+{	/* only checks the object invariant */
+	CLOBINVRNT(env, ios->cis_io->ci_obj,
+		   ccc_object_invariant(ios->cis_io->ci_obj));
+}
+
+void ccc_io_advance(const struct lu_env *env,
+		    const struct cl_io_slice *ios,
+		    size_t nob)
+{
+	struct ccc_io    *cio = cl2ccc_io(env, ios);
+	struct cl_io     *io  = ios->cis_io;
+	struct cl_object *obj = ios->cis_io->ci_obj;
+	/* Step cui_iov past cui_nrsegs segments; fix up a shortened last one. */
+	CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+	if (!cl_is_normalio(env, io))
+		return;
+
+	LASSERT(cio->cui_tot_nrsegs >= cio->cui_nrsegs);
+	LASSERT(cio->cui_tot_count  >= nob);
+
+	cio->cui_iov	+= cio->cui_nrsegs;
+	cio->cui_tot_nrsegs -= cio->cui_nrsegs;
+	cio->cui_tot_count  -= nob;
+
+	/* update the iov */
+	if (cio->cui_iov_olen > 0) { /* set by ccc_io_update_iov() */
+		struct iovec *iv;
+
+		cio->cui_iov--; /* step back onto the shortened segment */
+		cio->cui_tot_nrsegs++;
+		iv = &cio->cui_iov[0];
+		if (io->ci_continue) {
+			iv->iov_base += iv->iov_len;
+			LASSERT(cio->cui_iov_olen > iv->iov_len);
+			iv->iov_len = cio->cui_iov_olen - iv->iov_len;
+		} else {
+			/* restore the iov_len, in case of restart io. */
+			iv->iov_len = cio->cui_iov_olen;
+		}
+		cio->cui_iov_olen = 0;
+	}
+}
+
+/**
+ * Helper function that if necessary adjusts file size (inode->i_size), when
+ * position at the offset \a pos is accessed. File size can be arbitrarily stale
+ * on a Lustre client, but client at least knows KMS. If accessed area is
+ * inside [0, KMS], set file size to KMS, otherwise glimpse file size.
+ *
+ * Locking: ccc_object_size_lock() takes cl_isize_lock, which serializes
+ * changes to inode size and protects consistency between inode size and
+ * cl_object attributes, and cl_object_attr_lock(), which protects consistency
+ * between cl_attr's of top-object and sub-objects.
+ */
+int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
+		  struct cl_io *io, loff_t start, size_t count, int *exceed)
+{
+	struct cl_attr *attr  = ccc_env_thread_attr(env);
+	struct inode   *inode = ccc_object_inode(obj);
+	loff_t	  pos   = start + count - 1;
+	loff_t kms;
+	int result;
+
+	/*
+	 * Consistency guarantees: following possibilities exist for the
+	 * relation between region being accessed and real file size at this
+	 * moment:
+	 *
+	 *  (A): the region is completely inside of the file;
+	 *
+	 *  (B-x): x bytes of region are inside of the file, the rest is
+	 *  outside;
+	 *
+	 *  (C): the region is completely outside of the file.
+	 *
+	 * This classification is stable under DLM lock already acquired by
+	 * the caller, because to change the class, other client has to take
+	 * DLM lock conflicting with our lock. Also, any updates to ->i_size
+	 * by other threads on this client are serialized by
+	 * ll_inode_size_lock(). This guarantees that short reads are handled
+	 * correctly in the face of concurrent writes and truncates.
+	 */
+	ccc_object_size_lock(obj);
+	result = cl_object_attr_get(env, obj, attr);
+	if (result == 0) {
+		kms = attr->cat_kms;
+		if (pos > kms) {
+			/*
+			 * A glimpse is necessary to determine whether we
+			 * return a short read (B) or some zeroes at the end
+			 * of the buffer (C)
+			 */
+			ccc_object_size_unlock(obj);
+			result = cl_glimpse_lock(env, io, inode, obj, 0);
+			if (result == 0 && exceed != NULL) {
+				/* If objective page index exceed end-of-file
+				 * page index, return directly. Do not expect
+				 * kernel will check such case correctly.
+				 * linux-2.6.18-128.1.1 miss to do that.
+				 * --bug 17336 */
+				loff_t size = cl_isize_read(inode);
+				unsigned long cur_index = start >> PAGE_CACHE_SHIFT;
+
+				if ((size == 0 && cur_index != 0) ||
+				    (((size - 1) >> PAGE_CACHE_SHIFT) < cur_index))
+					*exceed = 1;
+			}
+			return result;
+		} else {
+			/*
+			 * region is within kms and, hence, within real file
+			 * size (A). We need to increase i_size to cover the
+			 * read region so that generic_file_read() will do its
+			 * job, but that doesn't mean the kms size is
+			 * _correct_, it is only the _minimum_ size. If
+			 * someone does a stat they will get the correct size
+			 * which will always be >= the kms value here.
+			 * b=11081
+			 */
+			if (cl_isize_read(inode) < kms) {
+				cl_isize_write_nolock(inode, kms);
+				CDEBUG(D_VFSTRACE,
+				       DFID" updating i_size "LPU64"\n",
+				       PFID(lu_object_fid(&obj->co_lu)),
+				       (__u64)cl_isize_read(inode));
+
+			}
+		}
+	}
+	ccc_object_size_unlock(obj);
+	return result;
+}
+
+/*****************************************************************************
+ *
+ * Transfer operations.
+ *
+ */
+
+void ccc_req_completion(const struct lu_env *env,
+			const struct cl_req_slice *slice, int ioret)
+{
+	struct ccc_req *vrq = cl2ccc_req(slice);
+
+	/* Only a positive result is tallied before the slice is freed. */
+	if (ioret > 0)
+		cl_stats_tally(slice->crs_dev, slice->crs_req->crq_type, ioret);
+
+	OBD_SLAB_FREE_PTR(vrq, ccc_req_kmem);
+}
+
+/**
+ * ccc-layer implementation of struct cl_req_operations::cro_attr_set().
+ *
+ * Fills the request's obdo (attr->cra_oa) from the inode behind \a obj.
+ * The ccc layer is responsible for:
+ *
+ *    - o_[mac]time, o_mode, o_[ug]id;
+ *
+ *    - o_parent_seq, o_parent_oid, o_parent_ver;
+ *
+ *    - o_ioepoch;
+ *
+ *    - the capability (attr->cra_capa) and the job id (attr->cra_jobid).
+ *
+ * \param env   execution environment (not used here)
+ * \param slice request slice; its request type selects the write-only part
+ * \param obj   object whose inode supplies the attributes
+ * \param attr  request attributes to fill in
+ * \param flags OBD_MD_* bits requested by the caller
+ */
+void ccc_req_attr_set(const struct lu_env *env,
+		      const struct cl_req_slice *slice,
+		      const struct cl_object *obj,
+		      struct cl_req_attr *attr, obd_valid flags)
+{
+	struct inode	 *inode  = ccc_object_inode(obj);
+	struct cl_inode_info *lli    = cl_i2info(inode);
+	struct obdo	  *oa     = attr->cra_oa;
+	obd_flag	      wanted = OBD_MD_FLTYPE;
+
+	/* Look up the capability only when the caller asked for one. */
+	if (flags & OBD_MD_FLOSSCAPA) {
+		LASSERT(attr->cra_capa == NULL);
+		attr->cra_capa = cl_capa_lookup(inode,
+						slice->crs_req->crq_type);
+	}
+
+	/* A write carrying an epoch also sends m/ctime and uid/gid. */
+	if (slice->crs_req->crq_type == CRT_WRITE &&
+	    (flags & OBD_MD_FLEPOCH)) {
+		oa->o_valid |= OBD_MD_FLEPOCH;
+		oa->o_ioepoch = lli->lli_ioepoch;
+		wanted |= OBD_MD_FLMTIME | OBD_MD_FLCTIME |
+			  OBD_MD_FLUID | OBD_MD_FLGID;
+	}
+
+	/* Copy only the attributes that are both wanted and requested. */
+	obdo_from_inode(oa, inode, wanted & flags);
+	obdo_set_parent_fid(oa, &lli->lli_fid);
+	memcpy(attr->cra_jobid, lli->lli_jobid,
+	       JOBSTATS_JOBID_SIZE);
+}
+
+const struct cl_req_operations ccc_req_ops = {
+	.cro_completion = ccc_req_completion,	/* frees the ccc_req slice */
+	.cro_attr_set   = ccc_req_attr_set,	/* fills obdo from the inode */
+};
+
+int cl_setattr_ost(struct inode *inode, const struct iattr *attr,
+		   struct obd_capa *capa)
+{
+	struct lu_env *env;
+	struct cl_io  *io;
+	int	    result;
+	int	    refcheck;
+
+	ENTRY;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	io = ccc_env_thread_io(env);
+	io->ci_obj = cl_i2info(inode)->lli_clob;
+
+	io->u.ci_setattr.sa_valid = attr->ia_valid;
+	io->u.ci_setattr.sa_capa = capa;
+	io->u.ci_setattr.sa_attr.lvb_size = attr->ia_size;
+	io->u.ci_setattr.sa_attr.lvb_atime = LTIME_S(attr->ia_atime);
+	io->u.ci_setattr.sa_attr.lvb_mtime = LTIME_S(attr->ia_mtime);
+	io->u.ci_setattr.sa_attr.lvb_ctime = LTIME_S(attr->ia_ctime);
+
+	/* Run the io, again for as long as it asks to be restarted. */
+	do {
+		if (cl_io_init(env, io, CIT_SETATTR, io->ci_obj) != 0) {
+			result = io->ci_result;
+		} else {
+			struct ccc_io *cio = ccc_env_io(env);
+
+			/* populate the file descriptor for ftruncate to honor
+			 * group lock - see LU-787 */
+			if (attr->ia_valid & ATTR_FILE)
+				cio->cui_fd = cl_iattr2fd(inode, attr);
+
+			result = cl_io_loop(env, io);
+		}
+		cl_io_fini(env, io);
+	} while (unlikely(io->ci_need_restart));
+	cl_env_put(env, &refcheck);
+	RETURN(result);
+}
+
+/*****************************************************************************
+ *
+ * Type conversions. Most map a ccc structure to the generic structure
+ * embedded in it (member address) or back to its container (container_of).
+ */
+
+struct lu_device *ccc2lu_dev(struct ccc_device *vdv)
+{
+	return &vdv->cdv_cl.cd_lu_dev;
+}
+
+struct ccc_device *lu2ccc_dev(const struct lu_device *d)
+{
+	return container_of0(d, struct ccc_device, cdv_cl.cd_lu_dev);
+}
+
+struct ccc_device *cl2ccc_dev(const struct cl_device *d)
+{
+	return container_of0(d, struct ccc_device, cdv_cl);
+}
+
+struct lu_object *ccc2lu(struct ccc_object *vob)
+{
+	return &vob->cob_cl.co_lu;
+}
+
+struct ccc_object *lu2ccc(const struct lu_object *obj)
+{
+	return container_of0(obj, struct ccc_object, cob_cl.co_lu);
+}
+
+struct ccc_object *cl2ccc(const struct cl_object *obj)
+{
+	return container_of0(obj, struct ccc_object, cob_cl);
+}
+
+struct ccc_lock *cl2ccc_lock(const struct cl_lock_slice *slice)
+{
+	return container_of(slice, struct ccc_lock, clk_cl);
+}
+
+struct ccc_io *cl2ccc_io(const struct lu_env *env,
+			 const struct cl_io_slice *slice)
+{
+	struct ccc_io *cio;
+
+	cio = container_of(slice, struct ccc_io, cui_cl);
+	LASSERT(cio == ccc_env_io(env));	/* must be env's own io */
+	return cio;
+}
+
+struct ccc_req *cl2ccc_req(const struct cl_req_slice *slice)
+{
+	return container_of0(slice, struct ccc_req, crq_cl);
+}
+
+struct page *cl2vm_page(const struct cl_page_slice *slice)
+{
+	return cl2ccc_page(slice)->cpg_page;	/* page kept in the slice */
+}
+
+/*****************************************************************************
+ *
+ * Accessors.
+ *
+ */
+int ccc_object_invariant(const struct cl_object *obj)
+{
+	struct inode *inode = ccc_object_inode(obj);
+
+	if (cl_i2info(inode)->lli_clob != obj)
+		return 0;
+	/* i_mode of unlinked inode is zeroed. */
+	return S_ISREG(cl_inode_mode(inode)) || cl_inode_mode(inode) == 0;
+}
+
+struct inode *ccc_object_inode(const struct cl_object *obj)
+{
+	return cl2ccc(obj)->cob_inode;	/* inode stored in the ccc_object */
+}
+
+/**
+ * Returns the cl_page stored in \a vmpage->private without acquiring an
+ * additional reference to it. This is an unsafe version of cl_vmpage_page()
+ * that can only be used under vmpage lock, which KLASSERT() below checks.
+ */
+struct cl_page *ccc_vmpage_page_transient(struct page *vmpage)
+{
+	KLASSERT(PageLocked(vmpage));
+	return (struct cl_page *)vmpage->private;
+}
+
+/**
+ * Set up or refresh the CLIO state of a regular file when new meta-data
+ * arrives from the server: a cl_object is allocated for a new inode, an
+ * already attached one is handed the new configuration via cl_conf_set().
+ *
+ * \param inode regular file inode
+ * \param md    new file metadata from MDS
+ * \retval 0 on success, a non-zero error code otherwise (also logged)
+ */
+int cl_file_inode_init(struct inode *inode, struct lustre_md *md)
+{
+	struct cl_object_conf conf = {
+		.coc_inode = inode,
+		.u = {
+			.coc_md    = md
+		}
+	};
+	struct cl_inode_info *lli;
+	struct cl_object     *clob;
+	struct lu_site       *site;
+	struct lu_fid	*fid;
+	struct lu_env	*env;
+	int refcheck;
+	int result = 0;
+
+	LASSERT(md->body->valid & OBD_MD_FLID);
+	LASSERT(S_ISREG(cl_inode_mode(inode)));
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		return PTR_ERR(env);
+
+	lli  = cl_i2info(inode);
+	fid  = &lli->lli_fid;
+	site = cl_i2sbi(inode)->ll_site;
+	LASSERT(fid_is_sane(fid));
+
+	if (lli->lli_clob != NULL) {
+		/* Object already attached: hand it the new configuration. */
+		result = cl_conf_set(env, lli->lli_clob, &conf);
+	} else {
+		/* The clob is a slave of the inode: an empty lli_clob means
+		 * the inode is new and no clob with this fid is cached, so
+		 * lookup-alloc-lookup-insert is unnecessary; just allocate
+		 * and insert directly. */
+		LASSERT(inode->i_state & I_NEW);
+		conf.coc_lu.loc_flags = LOC_F_NEW;
+		clob = cl_object_find(env, lu2cl_dev(site->ls_top_dev),
+				      fid, &conf);
+		if (IS_ERR(clob)) {
+			result = PTR_ERR(clob);
+		} else {
+			/* No locking is necessary, as the new inode is
+			 * locked by its I_NEW bit. */
+			lli->lli_clob = clob;
+			lli->lli_has_smd = md->lsm != NULL;
+			lu_object_ref_add(&clob->co_lu, "inode", inode);
+		}
+	}
+
+	cl_env_put(env, &refcheck);
+
+	if (result != 0)
+		CERROR("Failure to initialize cl object "DFID": %d\n",
+		       PFID(fid), result);
+	return result;
+}
+
+/**
+ * Wait until all other users have dropped their references to the object,
+ * then drop the last one, which leads to the object being destroyed
+ * immediately. Must be called after cl_object_kill() against this object.
+ *
+ * The reason we want to do this is: destroying the top object waits for its
+ * sub-objects to be destroyed first, so we can't let the bottom layer (e.g.
+ * from ASTs) initiate top object destruction, which may deadlock. See bz22520.
+ */
+static void cl_object_put_last(struct lu_env *env, struct cl_object *obj)
+{
+	struct lu_object_header *header = obj->co_lu.lo_header;
+	wait_queue_t	   waiter;
+
+	if (unlikely(atomic_read(&header->loh_ref) != 1)) {
+		struct lu_site *site = obj->co_lu.lo_dev->ld_site;
+		struct lu_site_bkt_data *bkt;
+
+		bkt = lu_site_bkt_from_fid(site, &header->loh_fid);
+
+		init_waitqueue_entry_current(&waiter);
+		add_wait_queue(&bkt->lsb_marche_funebre, &waiter);
+
+		while (1) {	/* until ours is the only reference left */
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			if (atomic_read(&header->loh_ref) == 1)
+				break;
+			waitq_wait(&waiter, TASK_UNINTERRUPTIBLE);
+		}
+
+		set_current_state(TASK_RUNNING);
+		remove_wait_queue(&bkt->lsb_marche_funebre, &waiter);
+	}
+
+	cl_object_put(env, obj);
+}
+
+void cl_inode_fini(struct inode *inode)
+{
+	struct cl_inode_info *lli = cl_i2info(inode);
+	struct cl_object *clob = lli->lli_clob;
+	struct lu_env *env;
+	void *cookie;
+	int refcheck;
+	int emergency;
+
+	if (clob == NULL)
+		return;
+
+	cookie = cl_env_reenter();
+	env = cl_env_get(&refcheck);
+	emergency = IS_ERR(env);
+	if (emergency) {
+		/* Fall back to the shared env, under its guard mutex. */
+		mutex_lock(&ccc_inode_fini_guard);
+		LASSERT(ccc_inode_fini_env != NULL);
+		cl_env_implant(ccc_inode_fini_env, &refcheck);
+		env = ccc_inode_fini_env;
+	}
+	/* The cl_object cache is a slave to the inode cache (itself a slave
+	 * to the dentry cache): do not keep the cl_object in memory once
+	 * its master is evicted. */
+	cl_object_kill(env, clob);
+	lu_object_ref_del(&clob->co_lu, "inode", inode);
+	cl_object_put_last(env, clob);
+	lli->lli_clob = NULL;
+	if (emergency) {
+		cl_env_unplant(ccc_inode_fini_env, &refcheck);
+		mutex_unlock(&ccc_inode_fini_guard);
+	} else {
+		cl_env_put(env, &refcheck);
+	}
+	cl_env_reexit(cookie);
+}
+
+/**
+ * Return the file type recorded in a lu_dirent entry, or 0 when the entry
+ * has no LUDA_TYPE attribute. The value is IFTODT() of the stored lt_type;
+ * it should be converted to the particular OS file type in the platform
+ * llite module.
+ */
+__u16 ll_dirent_type_get(struct lu_dirent *ent)
+{
+	const unsigned align = sizeof(struct luda_type) - 1;
+	struct luda_type *lt;
+	int len;
+
+	/* Without LUDA_TYPE the entry carries no type record. */
+	if (!(le32_to_cpu(ent->lde_attrs) & LUDA_TYPE))
+		return 0;
+
+	/* The record follows the name, rounded up with the align mask. */
+	len = (le16_to_cpu(ent->lde_namelen) + align) & ~align;
+	lt = (void *)ent->lde_name + len;
+	return IFTODT(le16_to_cpu(lt->lt_type));
+}
+
+/**
+ * build inode number from passed @fid: the 32-bit flattening is used on a
+ * 32-bit kernel or when @api32 is set, the 64-bit flattening otherwise. */
+__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32)
+{
+	if (BITS_PER_LONG == 32 || api32)
+		return fid_flatten32(fid);
+	return fid_flatten(fid);
+}
+
+/**
+ * build inode generation from passed @fid.  If our FID overflows the 32-bit
+ * inode number then return a non-zero generation to distinguish them. */
+__u32 cl_fid_build_gen(const struct lu_fid *fid)
+{
+	__u32 gen;
+	ENTRY;
+
+	/* IGIF fids use lu_igif_gen(); others the flattened high 32 bits. */
+	if (fid_is_igif(fid))
+		gen = lu_igif_gen(fid);
+	else
+		gen = fid_flatten(fid) >> 32;
+
+	RETURN(gen);
+}
+
+/* The lsm is unreliable since the HSM implementation, as the layout can be
+ * changed at any time. This is only to support old, non-clio-ized interfaces.
+ * It will cause deadlock if clio operations are called with this extra layout
+ * refcount, because in case the layout changed during the IO,
+ * ll_layout_refresh() will have to wait for the refcount to become zero to
+ * destroy the older layout.
+ * Notice that the lsm returned by this function may not be valid unless called
+ * inside layout lock - MDS_INODELOCK_LAYOUT. */
+struct lov_stripe_md *ccc_inode_lsm_get(struct inode *inode)
+{
+	return lov_lsm_get(cl_i2info(inode)->lli_clob);
+}
+
+inline void ccc_inode_lsm_put(struct inode *inode, struct lov_stripe_md *lsm)
+{
+	lov_lsm_put(cl_i2info(inode)->lli_clob, lsm); /* see lov_lsm_get() */
+}
diff --git a/drivers/staging/lustre/lustre/lclient/lcommon_misc.c b/drivers/staging/lustre/lustre/lclient/lcommon_misc.c
new file mode 100644
index 000000000000..8ecbef92753d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lclient/lcommon_misc.c
@@ -0,0 +1,194 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * cl code shared between vvp and liblustre (and other Lustre clients in the
+ * future).
+ *
+ */
+#include <obd_class.h>
+#include <obd_support.h>
+#include <obd.h>
+#include <cl_object.h>
+#include <lclient.h>
+
+#include <lustre_lite.h>
+
+
+/* Initialize the default and maximum LOV EA and cookie sizes.  This allows
+ * us to make MDS RPCs with large enough reply buffers to hold the
+ * maximum-sized (= maximum striped) EA and cookie without having to
+ * calculate this (via a call into the LOV + OSCs) each time we make an RPC. */
+int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp)
+{
+	struct lov_stripe_md lsm = { .lsm_magic = LOV_MAGIC_V3 };
+	struct lov_desc desc;
+	__u32 valsize = sizeof(desc);
+	int rc, easize, def_easize, cookiesize;
+	__u16 stripes;
+	ENTRY;
+
+	rc = obd_get_info(NULL, dt_exp, sizeof(KEY_LOVDESC), KEY_LOVDESC,
+			  &valsize, &desc, NULL);
+	if (rc)
+		RETURN(rc);
+
+	/* Maximum: one stripe per target, capped at LOV_MAX_STRIPE_COUNT. */
+	stripes = min(desc.ld_tgt_count, (__u32)LOV_MAX_STRIPE_COUNT);
+	cookiesize = stripes * sizeof(struct llog_cookie);
+	lsm.lsm_stripe_count = stripes;
+	easize = obd_size_diskmd(dt_exp, &lsm);
+
+	/* Default: the descriptor's default stripe count. */
+	lsm.lsm_stripe_count = desc.ld_default_stripe_count;
+	def_easize = obd_size_diskmd(dt_exp, &lsm);
+
+	CDEBUG(D_HA, "updating max_mdsize/max_cookiesize: %d/%d\n",
+	       easize, cookiesize);
+	rc = md_init_ea_size(md_exp, easize, def_easize, cookiesize);
+	RETURN(rc);
+}
+
+/**
+ * Upcall-callback hooked by liblustre and llite clients into the
+ * obd_notify() listeners chain to handle notifications about changes of
+ * import connect_flags. See llu_fsswop_mount() and
+ * lustre_common_fill_super().
+ */
+int cl_ocd_update(struct obd_device *host,
+		  struct obd_device *watched,
+		  enum obd_notify_event ev, void *owner, void *data)
+{
+	struct lustre_client_ocd *lco = owner;
+	struct client_obd	*cli = &watched->u.cli;
+	__u64 flags;
+
+	ENTRY;
+
+	/* Notifications from anything but an OSC device are rejected. */
+	if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME) != 0) {
+		CERROR("unexpected notification from %s %s!\n",
+		       watched->obd_type->typ_name,
+		       watched->obd_name);
+		RETURN(-EINVAL);
+	}
+
+	flags = cli->cl_import->imp_connect_data.ocd_connect_flags;
+	CDEBUG(D_SUPER, "Changing connect_flags: "LPX64" -> "LPX64"\n",
+	       lco->lco_flags, flags);
+
+	mutex_lock(&lco->lco_lock);
+	/* Keep only the flags that this import has as well. */
+	lco->lco_flags &= flags;
+	/* for each osc event update ea size */
+	if (lco->lco_dt_exp)
+		cl_init_ea_size(lco->lco_md_exp, lco->lco_dt_exp);
+	mutex_unlock(&lco->lco_lock);
+
+	RETURN(0);
+}
+
+#define GROUPLOCK_SCOPE "grouplock"
+
+int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock,
+		     struct ccc_grouplock *cg)
+{
+	struct lu_env	  *env;
+	struct cl_io	   *io;
+	struct cl_lock	 *lock;
+	struct cl_lock_descr   *descr;
+	__u32		   enqflags;
+	int		     refcheck;
+	int		     rc;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		return PTR_ERR(env);
+
+	io = ccc_env_thread_io(env);
+	io->ci_obj = obj;
+	io->ci_ignore_layout = 1;
+
+	rc = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+	if (rc) {
+		LASSERT(rc < 0);
+		cl_env_put(env, &refcheck);
+		return rc;
+	}
+	/* Describe a CLM_GROUP lock from 0 to CL_PAGE_EOF on obj for gid. */
+	descr = &ccc_env_info(env)->cti_descr;
+	descr->cld_obj = obj;
+	descr->cld_start = 0;
+	descr->cld_end = CL_PAGE_EOF;
+	descr->cld_gid = gid;
+	descr->cld_mode = CLM_GROUP;
+
+	enqflags = CEF_MUST | (nonblock ? CEF_NONBLOCK : 0);
+	descr->cld_enq_flags = enqflags;
+
+	lock = cl_lock_request(env, io, descr, GROUPLOCK_SCOPE, current);
+	if (IS_ERR(lock)) {
+		cl_io_fini(env, io);
+		cl_env_put(env, &refcheck);
+		return PTR_ERR(lock);
+	}
+	/* Record env, io and lock in cg; cl_put_grouplock() reads them. */
+	cg->cg_env  = cl_env_get(&refcheck);
+	cg->cg_io   = io;
+	cg->cg_lock = lock;
+	cg->cg_gid  = gid;
+	LASSERT(cg->cg_env == env);
+	/* NOTE(review): presumably detaches env from this thread - confirm. */
+	cl_env_unplant(env, &refcheck);
+	return 0;
+}
+
+void cl_put_grouplock(struct ccc_grouplock *cg)
+{
+	struct lu_env  *env  = cg->cg_env;
+	struct cl_io   *io   = cg->cg_io;
+	struct cl_lock *lock = cg->cg_lock;
+	int	     refcheck;
+	/* cg must carry an env and a non-zero gid. */
+	LASSERT(cg->cg_env);
+	LASSERT(cg->cg_gid);
+
+	cl_env_implant(env, &refcheck);
+	cl_env_put(env, &refcheck);
+	/* Release in reverse order of cl_get_grouplock(): lock, io, env. */
+	cl_unuse(env, lock);
+	cl_lock_release(env, lock, GROUPLOCK_SCOPE, current);
+	cl_io_fini(env, io);
+	cl_env_put(env, NULL);
+}
diff --git a/drivers/staging/lustre/lustre/ldlm/interval_tree.c b/drivers/staging/lustre/lustre/ldlm/interval_tree.c
new file mode 100644
index 000000000000..ce90c7e3c488
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/interval_tree.c
@@ -0,0 +1,764 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/interval_tree.c
+ *
+ * Interval tree library used by ldlm extent lock code
+ *
+ * Author: Huang Wei <huangwei@clusterfs.com>
+ * Author: Jay Xiong <jinshan.xiong@sun.com>
+ */
+# include <lustre_dlm.h>
+#include <obd_support.h>
+#include <interval_tree.h>
+
+enum {
+	INTERVAL_RED = 0,	/* values stored in a node's in_color */
+	INTERVAL_BLACK = 1
+};
+
+static inline int node_is_left_child(struct interval_node *node)
+{
+	LASSERT(node->in_parent != NULL);	/* needs a parent to test */
+	return node == node->in_parent->in_left;
+}
+
+static inline int node_is_right_child(struct interval_node *node)
+{
+	LASSERT(node->in_parent != NULL);	/* needs a parent to test */
+	return node == node->in_parent->in_right;
+}
+
+static inline int node_is_red(struct interval_node *node)
+{
+	return node->in_color == INTERVAL_RED;	/* node must not be NULL */
+}
+
+static inline int node_is_black(struct interval_node *node)
+{
+	return node->in_color == INTERVAL_BLACK; /* node must not be NULL */
+}
+
+static inline int extent_compare(struct interval_node_extent *e1,
+				 struct interval_node_extent *e2)
+{
+	/* Three-way comparison returning -1, 0 or 1: extents are ordered
+	 * by their start, and by their end when the starts are equal.
+	 * Zero therefore means both bounds match. */
+	if (e1->start < e2->start)
+		return -1;
+	if (e1->start > e2->start)
+		return 1;
+
+	/* Equal starts: the end decides. */
+	if (e1->end < e2->end)
+		return -1;
+	if (e1->end > e2->end)
+		return 1;
+
+	return 0;
+}
+
+static inline int extent_equal(struct interval_node_extent *e1,
+			       struct interval_node_extent *e2)
+{
+	return (e1->start == e2->start) && (e1->end == e2->end); /* same span */
+}
+
+static inline int extent_overlapped(struct interval_node_extent *e1,
+				    struct interval_node_extent *e2)
+{
+	return (e1->start <= e2->end) && (e2->start <= e1->end); /* inclusive */
+}
+
+static inline int node_compare(struct interval_node *n1,
+			       struct interval_node *n2)
+{
+	return extent_compare(&n1->in_extent, &n2->in_extent); /* -1, 0 or 1 */
+}
+
+static inline int node_equal(struct interval_node *n1,
+			     struct interval_node *n2)
+{
+	return extent_equal(&n1->in_extent, &n2->in_extent); /* same extent */
+}
+
+static inline __u64 max_u64(__u64 x, __u64 y)
+{
+	return x > y ? x : y;	/* larger of the two */
+}
+
+static inline __u64 min_u64(__u64 x, __u64 y)
+{
+	return x < y ? x : y;	/* smaller of the two */
+}
+
+#define interval_for_each(node, root)		   \
+for (node = interval_first(root); node != NULL;	 \
+     node = interval_next(node))	/* leftmost node first */
+
+#define interval_for_each_reverse(node, root)	   \
+for (node = interval_last(root); node != NULL;	  \
+     node = interval_prev(node))	/* rightmost node first */
+
+static struct interval_node *interval_first(struct interval_node *node)
+{
+	ENTRY;
+
+	/* Walk to the leftmost node; an empty subtree yields NULL. */
+	while (node != NULL && node->in_left != NULL)
+		node = node->in_left;
+
+	RETURN(node);
+}
+
+static struct interval_node *interval_last(struct interval_node *node)
+{
+	ENTRY;
+
+	/* Walk to the rightmost node; an empty subtree yields NULL. */
+	while (node != NULL && node->in_right != NULL)
+		node = node->in_right;
+
+	RETURN(node);
+}
+
+static struct interval_node *interval_next(struct interval_node *node)
+{
+	ENTRY;
+
+	/* Prefer the leftmost node of the right subtree, if there is one. */
+	if (node && node->in_right)
+		RETURN(interval_first(node->in_right));
+	/* Otherwise climb for as long as we come from a right child. */
+	while (node && node->in_parent && node_is_right_child(node))
+		node = node->in_parent;
+	RETURN(node ? node->in_parent : NULL);
+}
+
+static struct interval_node *interval_prev(struct interval_node *node)
+{
+	ENTRY;
+
+	/* Prefer the rightmost node of the left subtree, if there is one. */
+	if (node && node->in_left)
+		RETURN(interval_last(node->in_left));
+
+	/* Otherwise climb for as long as we come from a left child; the
+	 * parent reached that way (NULL past the root) is the result. */
+	while (node && node->in_parent && node_is_left_child(node))
+		node = node->in_parent;
+
+	RETURN(node ? node->in_parent : NULL);
+}
+
+enum interval_iter interval_iterate(struct interval_node *root,
+				    interval_callback_t func,
+				    void *data)
+{
+	enum interval_iter rc = INTERVAL_ITER_CONT;
+	struct interval_node *cur;
+	ENTRY;
+
+	/* Visit from the leftmost node on until func asks to stop. */
+	for (cur = interval_first(root); cur != NULL; cur = interval_next(cur)) {
+		rc = func(cur, data);
+		if (rc == INTERVAL_ITER_STOP)
+			break;
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(interval_iterate);
+
+enum interval_iter interval_iterate_reverse(struct interval_node *root,
+					    interval_callback_t func,
+					    void *data)
+{
+	enum interval_iter rc = INTERVAL_ITER_CONT;
+	struct interval_node *cur;
+	ENTRY;
+
+	/* Visit from the rightmost node on until func asks to stop. */
+	for (cur = interval_last(root); cur != NULL; cur = interval_prev(cur)) {
+		rc = func(cur, data);
+		if (rc == INTERVAL_ITER_STOP)
+			break;
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(interval_iterate_reverse);
+
+/*
+ * Look for a node whose interval is exactly @ex in the tree below @root.
+ * Returns a pointer to that node, or NULL when there is none.
+ */
+struct interval_node *interval_find(struct interval_node *root,
+				    struct interval_node_extent *ex)
+{
+	struct interval_node *cur = root;
+	int cmp;
+	ENTRY;
+
+	/* Binary-search descent ordered by extent_compare(). */
+	while (cur != NULL) {
+		cmp = extent_compare(ex, &cur->in_extent);
+		if (cmp == 0)
+			RETURN(cur);
+		cur = cmp < 0 ? cur->in_left : cur->in_right;
+	}
+
+	RETURN(NULL);
+}
+EXPORT_SYMBOL(interval_find);
+
+static void __rotate_change_maxhigh(struct interval_node *node,
+				    struct interval_node *rotate)
+{
+	__u64 high = interval_high(node);
+
+	/* rotate inherits node's maximum; node's own is then recomputed. */
+	rotate->in_max_high = node->in_max_high;
+	high = max_u64(high, node->in_left ? node->in_left->in_max_high : 0);
+	high = max_u64(high, node->in_right ? node->in_right->in_max_high : 0);
+	node->in_max_high = high;
+}
+
+/* The left rotation "pivots" around the link from node to node->right:
+ * - node->right takes node's place under node's parent (or as the root), and
+ * - node becomes its left child, adopting its former left subtree as right. */
+static void __rotate_left(struct interval_node *node,
+			  struct interval_node **root)
+{
+	struct interval_node *right = node->in_right;
+	struct interval_node *parent = node->in_parent;
+
+	node->in_right = right->in_left;
+	if (node->in_right)
+		right->in_left->in_parent = node;
+
+	right->in_left = node;
+	right->in_parent = parent;
+	if (parent) {
+		if (node_is_left_child(node))	/* node->in_parent is still parent */
+			parent->in_left = right;
+		else
+			parent->in_right = right;
+	} else {
+		*root = right;
+	}
+	node->in_parent = right;
+
+	/* update max_high for node and right */
+	__rotate_change_maxhigh(node, right);
+}
+
+/* The right rotation "pivots" around the link from node to node->left:
+ * - node->left takes node's place under node's parent (or as the root), and
+ * - node becomes its right child, adopting its former right subtree as left. */
+static void __rotate_right(struct interval_node *node,
+			   struct interval_node **root)
+{
+	struct interval_node *left = node->in_left;
+	struct interval_node *parent = node->in_parent;
+
+	node->in_left = left->in_right;
+	if (node->in_left)
+		left->in_right->in_parent = node;
+	left->in_right = node;
+
+	left->in_parent = parent;
+	if (parent) {
+		if (node_is_right_child(node))	/* node->in_parent is still parent */
+			parent->in_right = left;
+		else
+			parent->in_left = left;
+	} else {
+		*root = left;
+	}
+	node->in_parent = left;
+
+	/* update max_high for node and left */
+	__rotate_change_maxhigh(node, left);
+}
+
+#define interval_swap(a, b) do {			\
+	struct interval_node *c = a; a = b; b = c;      \
+} while (0)	/* exchanges two node pointers; a and b must be lvalues */
+
+/*
+ * Operations INSERT and DELETE, when run on a tree with n keys,
+ * take O(log n) time. Because they modify the tree, the result
+ * may violate the red-black properties. To restore these properties,
+ * we must change the colors of some of the nodes in the tree
+ * and also change the pointer structure.
+ */
+static void interval_insert_color(struct interval_node *node,
+				  struct interval_node **root)
+{
+	struct interval_node *parent, *gparent;
+	ENTRY;
+
+	while ((parent = node->in_parent) && node_is_red(parent)) {
+		gparent = parent->in_parent;
+		/* Parent is RED, so gparent must not be NULL */
+		if (node_is_left_child(parent)) {
+			struct interval_node *uncle;
+			uncle = gparent->in_right;
+			if (uncle && node_is_red(uncle)) {
+				uncle->in_color = INTERVAL_BLACK;
+				parent->in_color = INTERVAL_BLACK;
+				gparent->in_color = INTERVAL_RED;
+				node = gparent;
+				continue;	/* recheck from gparent */
+			}
+
+			if (parent->in_right == node) {
+				__rotate_left(parent, root);
+				interval_swap(node, parent);
+			}
+
+			parent->in_color = INTERVAL_BLACK;
+			gparent->in_color = INTERVAL_RED;
+			__rotate_right(gparent, root);
+		} else {
+			struct interval_node *uncle;
+			uncle = gparent->in_left;
+			if (uncle && node_is_red(uncle)) {
+				uncle->in_color = INTERVAL_BLACK;
+				parent->in_color = INTERVAL_BLACK;
+				gparent->in_color = INTERVAL_RED;
+				node = gparent;
+				continue;	/* recheck from gparent */
+			}
+
+			if (node_is_left_child(node)) {
+				__rotate_right(parent, root);
+				interval_swap(node, parent);
+			}
+
+			parent->in_color = INTERVAL_BLACK;
+			gparent->in_color = INTERVAL_RED;
+			__rotate_left(gparent, root);
+		}
+	}
+
+	(*root)->in_color = INTERVAL_BLACK;	/* the root always ends up black */
+	EXIT;
+}
+
+struct interval_node *interval_insert(struct interval_node *node,
+				      struct interval_node **root)
+{
+	struct interval_node **link = root;
+	struct interval_node *parent = NULL;
+	ENTRY;
+
+	LASSERT(!interval_is_intree(node));
+	/* Descend to the empty link where node belongs. */
+	while (*link != NULL) {
+		parent = *link;
+		/* An identical extent is already in the tree: report it. */
+		if (node_equal(parent, node))
+			RETURN(parent);
+
+		/* max_high field must be updated after each iteration */
+		if (interval_high(node) > parent->in_max_high)
+			parent->in_max_high = interval_high(node);
+
+		link = node_compare(node, parent) < 0 ?
+		       &parent->in_left : &parent->in_right;
+	}
+
+	/* link node into the tree as a red leaf, then rebalance */
+	node->in_left = NULL;
+	node->in_right = NULL;
+	node->in_color = INTERVAL_RED;
+	node->in_parent = parent;
+	*link = node;
+
+	interval_insert_color(node, root);
+	node->in_intree = 1;
+
+	RETURN(NULL);
+}
+EXPORT_SYMBOL(interval_insert);
+
+static inline int node_is_black_or_0(struct interval_node *node)
+{
+	return !node || node_is_black(node);	/* NULL counts as black */
+}
+
+static void interval_erase_color(struct interval_node *node,
+				 struct interval_node *parent,
+				 struct interval_node **root)
+{
+	struct interval_node *tmp;
+	ENTRY;
+
+	while (node_is_black_or_0(node) && node != *root) {
+		if (parent->in_left == node) {
+			tmp = parent->in_right;	/* node's sibling */
+			if (node_is_red(tmp)) {
+				tmp->in_color = INTERVAL_BLACK;
+				parent->in_color = INTERVAL_RED;
+				__rotate_left(parent, root);
+				tmp = parent->in_right;
+			}
+			if (node_is_black_or_0(tmp->in_left) &&
+			    node_is_black_or_0(tmp->in_right)) {
+				tmp->in_color = INTERVAL_RED;
+				node = parent;	/* continue one level up */
+				parent = node->in_parent;
+			} else {
+				if (node_is_black_or_0(tmp->in_right)) {
+					struct interval_node *o_left;
+					if ((o_left = tmp->in_left))
+					     o_left->in_color = INTERVAL_BLACK;
+					tmp->in_color = INTERVAL_RED;
+					__rotate_right(tmp, root);
+					tmp = parent->in_right;
+				}
+				tmp->in_color = parent->in_color;
+				parent->in_color = INTERVAL_BLACK;
+				if (tmp->in_right)
+				    tmp->in_right->in_color = INTERVAL_BLACK;
+				__rotate_left(parent, root);
+				node = *root;
+				break;
+			}
+		} else {
+			tmp = parent->in_left;	/* node's sibling */
+			if (node_is_red(tmp)) {
+				tmp->in_color = INTERVAL_BLACK;
+				parent->in_color = INTERVAL_RED;
+				__rotate_right(parent, root);
+				tmp = parent->in_left;
+			}
+			if (node_is_black_or_0(tmp->in_left) &&
+			    node_is_black_or_0(tmp->in_right)) {
+				tmp->in_color = INTERVAL_RED;
+				node = parent;	/* continue one level up */
+				parent = node->in_parent;
+			} else {
+				if (node_is_black_or_0(tmp->in_left)) {
+					struct interval_node *o_right;
+					if ((o_right = tmp->in_right))
+					    o_right->in_color = INTERVAL_BLACK;
+					tmp->in_color = INTERVAL_RED;
+					__rotate_left(tmp, root);
+					tmp = parent->in_left;
+				}
+				tmp->in_color = parent->in_color;
+				parent->in_color = INTERVAL_BLACK;
+				if (tmp->in_left)
+					tmp->in_left->in_color = INTERVAL_BLACK;
+				__rotate_right(parent, root);
+				node = *root;
+				break;
+			}
+		}
+	}
+	if (node)	/* node may be NULL here (see node_is_black_or_0) */
+		node->in_color = INTERVAL_BLACK;
+	EXIT;
+}
+
+/* If the max_high value of @node may have changed, walk from @node up
+ * towards the root recomputing max_high, stopping as soon as a node still
+ * reaches @old_maxhigh. */
+static void update_maxhigh(struct interval_node *node,
+			   __u64  old_maxhigh)
+{
+	__u64 high;
+	ENTRY;
+
+	for (; node != NULL; node = node->in_parent) {
+		high = interval_high(node);
+		if (node->in_left)
+			high = max_u64(high, node->in_left->in_max_high);
+		if (node->in_right)
+			high = max_u64(high, node->in_right->in_max_high);
+		node->in_max_high = high;
+		/* Stop at the first node that still reaches the old value. */
+		if (high >= old_maxhigh)
+			break;
+	}
+	EXIT;
+}
+
+void interval_erase(struct interval_node *node,
+		    struct interval_node **root)
+{
+	struct interval_node *child, *parent;
+	int color;
+	ENTRY;
+
+	LASSERT(interval_is_intree(node));
+	node->in_intree = 0;
+	if (!node->in_left) {
+		child = node->in_right;
+	} else if (!node->in_right) {
+		child = node->in_left;
+	} else { /* Both left and right child are not NULL */
+		struct interval_node *old = node;
+
+		node = interval_next(node);	/* successor replaces old */
+		child = node->in_right;
+		parent = node->in_parent;
+		color = node->in_color;
+
+		if (child)
+			child->in_parent = parent;
+		if (parent == old)
+			parent->in_right = child;
+		else
+			parent->in_left = child;
+
+		node->in_color = old->in_color;
+		node->in_right = old->in_right;
+		node->in_left = old->in_left;
+		node->in_parent = old->in_parent;
+
+		if (old->in_parent) {
+			if (node_is_left_child(old))
+				old->in_parent->in_left = node;
+			else
+				old->in_parent->in_right = node;
+		} else {
+			*root = node;
+		}
+
+		old->in_left->in_parent = node;
+		if (old->in_right)
+			old->in_right->in_parent = node;
+		update_maxhigh(child ? : parent, node->in_max_high);
+		update_maxhigh(node, old->in_max_high);
+		if (parent == old)
+			 parent = node;
+		goto color;
+	}
+	parent = node->in_parent;	/* at most one child: splice node out */
+	color = node->in_color;
+
+	if (child)
+		child->in_parent = parent;
+	if (parent) {
+		if (node_is_left_child(node))
+			parent->in_left = child;
+		else
+			parent->in_right = child;
+	} else {
+		*root = child;
+	}
+
+	update_maxhigh(child ? : parent, node->in_max_high);
+
+color:
+	if (color == INTERVAL_BLACK)	/* colour of the unlinked position */
+		interval_erase_color(child, parent, root);
+	EXIT;
+}
+EXPORT_SYMBOL(interval_erase);
+
+static inline int interval_may_overlap(struct interval_node *node,
+					  struct interval_node_extent *ext)
+{
+	return !(ext->start > node->in_max_high ||	/* above in_max_high */
+		 ext->end < interval_low(node));	/* below node's low */
+}
+
+/*
+ * Find the intervals in the tree at @node that overlap @ext and call @func
+ * on each of them in turn; the walk ends early, returning that value, once
+ * @func returns INTERVAL_ITER_STOP.  In lustre this is used to find the
+ * conflicting locks in the granted queue and add them to the ast work list.
+ * Recursive sketch of the traversal (without the early stop):
+ * {
+ *       if (node == NULL)
+ *	       return 0;
+ *       if (ext->end < interval_low(node)) {
+ *	       interval_search(node->in_left, ext, func, data);
+ *       } else if (interval_may_overlap(node, ext)) {
+ *	       if (extent_overlapped(ext, &node->in_extent))
+ *		       func(node, data);
+ *	       interval_search(node->in_left, ext, func, data);
+ *	       interval_search(node->in_right, ext, func, data);
+ *       }
+ *       return 0;
+ * }
+ *
+ */
+enum interval_iter interval_search(struct interval_node *node,
+				   struct interval_node_extent *ext,
+				   interval_callback_t func,
+				   void *data)
+{
+	struct interval_node *parent;
+	enum interval_iter rc = INTERVAL_ITER_CONT;
+
+	LASSERT(ext != NULL);
+	LASSERT(func != NULL);
+
+	while (node) {
+		if (ext->end < interval_low(node)) {
+			if (node->in_left) {
+				node = node->in_left;
+				continue;
+			}
+		} else if (interval_may_overlap(node, ext)) {
+			if (extent_overlapped(ext, &node->in_extent)) {
+				rc = func(node, data);
+				if (rc == INTERVAL_ITER_STOP)
+					break;
+			}
+			/* descend: left subtree first, else right subtree */
+			if (node->in_left) {
+				node = node->in_left;
+				continue;
+			}
+			if (node->in_right) {
+				node = node->in_right;
+				continue;
+			}
+		}
+		/* nothing left below: climb until a right subtree is pending */
+		parent = node->in_parent;
+		while (parent) {
+			if (node_is_left_child(node) &&
+			    parent->in_right) {
+				/* If we ever got the left, it means that the
+				 * parent met ext->end<interval_low(parent), or
+				 * may_overlap(parent). If the former is true,
+				 * we needn't go back. So stop early and check
+				 * may_overlap(parent) after this loop.  */
+				node = parent->in_right;
+				break;
+			}
+			node = parent;
+			parent = parent->in_parent;
+		}
+		if (parent == NULL || !interval_may_overlap(parent, ext))
+			break;
+	}
+
+	return rc;
+}
+EXPORT_SYMBOL(interval_search);
+
+static enum interval_iter interval_overlap_cb(struct interval_node *n,
+					      void *args)
+{
+	*(int *)args = 1;	/* args is an int flag owned by the caller */
+	return INTERVAL_ITER_STOP; /* ask interval_search() to stop here */
+}
+
+int interval_is_overlapped(struct interval_node *root,
+			   struct interval_node_extent *ext)
+{
+	int found = 0;	/* set to 1 by interval_overlap_cb() on a hit */
+	interval_search(root, ext, interval_overlap_cb, &found);
+	return found;
+}
+EXPORT_SYMBOL(interval_is_overlapped);
+
+/* Don't expand to low. Expanding downwards is expensive, and meaningless to
+ * some extents, because programs seldom do IO backward.
+ *
+ * The recursive algorithm of expanding low:
+ * expand_low {
+ *	struct interval_node *tmp;
+ *	static __u64 res = 0;
+ *
+ *	if (root == NULL)
+ *		return res;
+ *	if (root->in_max_high < low) {
+ *		res = max_u64(root->in_max_high + 1, res);
+ *		return res;
+ *	} else if (low < interval_low(root)) {
+ *		interval_expand_low(root->in_left, low);
+ *		return res;
+ *	}
+ *
+ *	if (interval_high(root) < low)
+ *		res = max_u64(interval_high(root) + 1, res);
+ *	interval_expand_low(root->in_left, low);
+ *	interval_expand_low(root->in_right, low);
+ *
+ *	return res;
+ * }
+ *
+ * It would be easy to eliminate the recursion, see interval_search for
+ * an example. -jay
+ */
+static inline __u64 interval_expand_low(struct interval_node *root, __u64 low)
+{
+	/* Only the empty tree is handled right now: with no root the
+	 * result is 0, otherwise @low is handed back unchanged (see the
+	 * comment above for the algorithm that is not implemented). */
+	return root == NULL ? 0 : low;
+}
+
+static inline __u64 interval_expand_high(struct interval_node *node, __u64 high)
+{
+	__u64 limit = ~(__u64)0;
+
+	/* Descend for as long as the subtree's in_max_high is >= @high. */
+	while (node != NULL && node->in_max_high >= high) {
+		if (interval_low(node) <= high) {
+			node = node->in_right;
+			continue;
+		}
+		/* node starts above @high: cap the result just below its
+		 * low end, then continue in the left subtree */
+		limit = interval_low(node) - 1;
+		node = node->in_left;
+	}
+
+	return limit;
+}
+
+/* Expand @ext in place; @limiter (may be NULL) gates which sides may grow. */
+void interval_expand(struct interval_node *root,
+		     struct interval_node_extent *ext,
+		     struct interval_node_extent *limiter)
+{
+	int grow_low = limiter == NULL || limiter->start < ext->start;
+	int grow_high = limiter == NULL || limiter->end > ext->end;
+	LASSERT(interval_is_overlapped(root, ext) == 0); /* NB: costly walk */
+	if (grow_low)
+		ext->start = interval_expand_low(root, ext->start);
+	if (grow_high)
+		ext->end = interval_expand_high(root, ext->end);
+	LASSERT(interval_is_overlapped(root, ext) == 0);
+}
+EXPORT_SYMBOL(interval_expand);
diff --git a/drivers/staging/lustre/lustre/ldlm/l_lock.c b/drivers/staging/lustre/lustre/ldlm/l_lock.c
new file mode 100644
index 000000000000..853409aa945d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/l_lock.c
@@ -0,0 +1,76 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+#include <linux/libcfs/libcfs.h>
+
+#include <lustre_dlm.h>
+#include <lustre_lib.h>
+
+/**
+ * Lock a lock and its resource.
+ *
+ * LDLM locking uses the resource to serialize access to locks, but
+ * there is a case where the resource of a lock is changed upon
+ * enqueue reply. We rely on lock->l_resource = new_res being an
+ * atomic operation. Returns lock->l_resource with lock_res() applied.
+ */
+struct ldlm_resource *lock_res_and_lock(struct ldlm_lock *lock)
+{
+	/* on server-side resource of lock doesn't change */
+	if (!lock->l_ns_srv)
+		spin_lock(&lock->l_lock);
+	/* the resource lock is taken second, after l_lock (when used) */
+	lock_res(lock->l_resource);
+
+	lock->l_res_locked = 1; /* cleared again in unlock_res_and_lock() */
+	return lock->l_resource;
+}
+EXPORT_SYMBOL(lock_res_and_lock);
+
+/**
+ * Unlock a lock and its resource previously locked with lock_res_and_lock
+ */
+void unlock_res_and_lock(struct ldlm_lock *lock)
+{
+	/* clear the flag set by lock_res_and_lock() while still locked */
+	lock->l_res_locked = 0;
+	/* on server-side resource of lock doesn't change: no l_lock there */
+	unlock_res(lock->l_resource);
+	if (!lock->l_ns_srv)
+		spin_unlock(&lock->l_lock);
+}
+EXPORT_SYMBOL(unlock_res_and_lock);
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c b/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c
new file mode 100644
index 000000000000..f7432f78e396
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c
@@ -0,0 +1,242 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_extent.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+/**
+ * This file contains implementation of EXTENT lock type
+ *
+ * EXTENT lock type is for locking a contiguous range of values, represented
+ * by 64-bit starting and ending offsets (inclusive). There are several extent
+ * lock modes, some of which may be mutually incompatible. Extent locks are
+ * considered incompatible if their modes are incompatible and their extents
+ * intersect.  See the lock mode compatibility matrix in lustre_dlm.h.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+# include <linux/libcfs/libcfs.h>
+
+#include <lustre_dlm.h>
+#include <obd_support.h>
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+
+#include "ldlm_internal.h"
+
+
+/* When a lock is cancelled by a client, the KMS may undergo change if this
+ * is the "highest lock".  This function returns the new KMS value.
+ * Caller must hold lr_lock already.
+ *
+ * NB: A lock on [x,y] protects a KMS of up to y + 1 bytes! */
+__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms)
+{
+	struct ldlm_resource *res = lock->l_resource;
+	struct ldlm_lock *granted;
+	__u64 kms = 0;
+	__u64 end;
+	ENTRY;
+
+	/* mark our own lock first so that another thread running
+	 * ldlm_extent_shift_kms just after we finish does not take it
+	 * into account in its calculation of the kms */
+	lock->l_flags |= LDLM_FL_KMS_IGNORE;
+
+	list_for_each_entry(granted, &res->lr_granted, l_res_link) {
+		if (granted->l_flags & LDLM_FL_KMS_IGNORE)
+			continue;
+
+		/* a granted extent reaching old_kms keeps the KMS as is */
+		end = granted->l_policy_data.l_extent.end;
+		if (end >= old_kms)
+			RETURN(old_kms);
+
+		/* end < old_kms here (checked above), so kms can only
+		 * ever be smaller than or the same as old_kms. */
+		if (end + 1 > kms)
+			kms = end + 1;
+	}
+	LASSERTF(kms <= old_kms, "kms "LPU64" old_kms "LPU64"\n", kms, old_kms);
+
+	RETURN(kms);
+}
+EXPORT_SYMBOL(ldlm_extent_shift_kms);
+
+struct kmem_cache *ldlm_interval_slab; /* slab for struct ldlm_interval */
+struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock)
+{
+	struct ldlm_interval *node;
+	ENTRY;
+
+	LASSERT(lock->l_resource->lr_type == LDLM_EXTENT);
+	OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, __GFP_IO);
+	/* on allocation failure NULL is handed straight back */
+	if (node != NULL) {
+		INIT_LIST_HEAD(&node->li_group);
+		ldlm_interval_attach(node, lock);
+	}
+	RETURN(node);
+}
+
+void ldlm_interval_free(struct ldlm_interval *node)
+{
+	if (node == NULL) /* freeing NULL is a no-op */
+		return;
+	LASSERT(list_empty(&node->li_group));
+	LASSERT(!interval_is_intree(&node->li_node));
+	OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node));
+}
+
+/* interval tree, for LDLM_EXTENT: make @l a member of node @n's lock group. */
+void ldlm_interval_attach(struct ldlm_interval *n,
+			  struct ldlm_lock *l)
+{
+	LASSERT(l->l_tree_node == NULL);
+	LASSERT(l->l_resource->lr_type == LDLM_EXTENT);
+	/* back-pointer first, then queue the lock at the tail of the group */
+	l->l_tree_node = n;
+	list_add_tail(&l->l_sl_policy, &n->li_group);
+}
+
+struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l)
+{
+	struct ldlm_interval *n = l->l_tree_node;
+
+	if (n != NULL) {
+		LASSERT(!list_empty(&n->li_group));
+		list_del_init(&l->l_sl_policy);
+		l->l_tree_node = NULL;
+		if (!list_empty(&n->li_group))
+			n = NULL; /* other locks still use the node */
+	}
+	return n;
+}
+
+static inline int lock_mode_to_index(ldlm_mode_t mode)
+{
+	int index = -1;	/* bumped once per shift until mode reaches 0 */
+	LASSERT(mode != 0);
+	LASSERT(IS_PO2(mode));
+	for (; mode != 0; mode >>= 1)
+		index++;
+	LASSERT(index < LCK_MODE_NUM);
+	return index;
+}
+
+/** Add newly granted lock into interval tree for the resource. */
+void ldlm_extent_add_lock(struct ldlm_resource *res,
+			  struct ldlm_lock *lock)
+{
+	struct ldlm_interval_tree *tree;
+	struct ldlm_interval *node;
+	struct interval_node *found;
+	int idx;
+
+	LASSERT(lock->l_granted_mode == lock->l_req_mode);
+
+	node = lock->l_tree_node;
+	LASSERT(node != NULL);
+	LASSERT(!interval_is_intree(&node->li_node));
+
+	idx = lock_mode_to_index(lock->l_granted_mode);
+	LASSERT(lock->l_granted_mode == 1 << idx);
+	LASSERT(lock->l_granted_mode == res->lr_itree[idx].lit_mode);
+	tree = &res->lr_itree[idx];
+
+	/* stamp the lock's extent onto its node before inserting it */
+	interval_set(&node->li_node, lock->l_policy_data.l_extent.start,
+		     lock->l_policy_data.l_extent.end);
+
+	found = interval_insert(&node->li_node, &tree->lit_root);
+	if (found != NULL) { /* The policy group found: join it instead. */
+		struct ldlm_interval *tmp = ldlm_interval_detach(lock);
+		LASSERT(tmp != NULL);
+		ldlm_interval_free(tmp);
+		ldlm_interval_attach(to_ldlm_interval(found), lock);
+	}
+	tree->lit_size++;
+
+	/* even though we use interval tree to manage the extent lock, we also
+	 * add the locks into grant list, for debug purpose, .. */
+	ldlm_resource_add_lock(res, &res->lr_granted, lock);
+}
+
+/** Remove cancelled lock from resource interval tree. */
+void ldlm_extent_unlink_lock(struct ldlm_lock *lock)
+{
+	struct ldlm_interval *node = lock->l_tree_node;
+	struct ldlm_interval_tree *tree;
+	int idx;
+
+	/* duplicate unlink: no node attached, or node already out of tree */
+	if (node == NULL || !interval_is_intree(&node->li_node))
+		return;
+
+	idx = lock_mode_to_index(lock->l_granted_mode);
+	LASSERT(lock->l_granted_mode == 1 << idx);
+	tree = &lock->l_resource->lr_itree[idx];
+
+	LASSERT(tree->lit_root != NULL); /* assure the tree is not null */
+
+	tree->lit_size--;
+	node = ldlm_interval_detach(lock);
+	if (node == NULL) /* node is still shared by other locks */
+		return;
+	interval_erase(&node->li_node, &tree->lit_root);
+	ldlm_interval_free(node);
+}
+
+void ldlm_extent_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+				     ldlm_policy_data_t *lpolicy)
+{
+	memset(lpolicy, 0, sizeof *lpolicy); /* clear what is not set below */
+	lpolicy->l_extent.gid = wpolicy->l_extent.gid;
+	lpolicy->l_extent.end = wpolicy->l_extent.end;
+	lpolicy->l_extent.start = wpolicy->l_extent.start;
+}
+
+void ldlm_extent_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+				     ldlm_wire_policy_data_t *wpolicy)
+{
+	memset(wpolicy, 0, sizeof *wpolicy); /* clear what is not set below */
+	wpolicy->l_extent.gid = lpolicy->l_extent.gid;
+	wpolicy->l_extent.end = lpolicy->l_extent.end;
+	wpolicy->l_extent.start = lpolicy->l_extent.start;
+}
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c b/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c
new file mode 100644
index 000000000000..f100a84bde73
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c
@@ -0,0 +1,849 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003 Hewlett-Packard Development Company LP.
+ * Developed under the sponsorship of the US Government under
+ * Subcontract No. B514193
+ *
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/**
+ * This file implements POSIX lock type for Lustre.
+ * Its policy properties are start and end of extent and PID.
+ *
+ * These locks are only done through MDS due to POSIX semantics requiring
+ * e.g. that locks could be only partially released and as such split into
+ * two parts, and also that two adjacent locks from the same process may be
+ * merged into a single wider lock.
+ *
+ * Lock modes are mapped like this:
+ * PR and PW for READ and WRITE locks
+ * NL to request a releasing of a portion of the lock
+ *
+ * These flock locks never time out.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include <lustre_dlm.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+#include <linux/list.h>
+
+#include "ldlm_internal.h"
+
+int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+			    void *data, int flag); /* used by ldlm_process_flock_lock() */
+
+/**
+ * list_for_remaining_safe - iterate over the remaining entries in a list
+ *	      and safeguard against removal of a list entry.
+ * \param pos   the &struct list_head to use as a loop counter. pos MUST
+ *	      have been initialized prior to using it in this macro.
+ * \param n     another &struct list_head to use as temporary storage
+ * \param head  the head for your list. All arguments are expanded more
+ *	      than once and are parenthesized in the expansion.
+ */
+#define list_for_remaining_safe(pos, n, head) \
+	for ((n) = (pos)->next; (pos) != (head); (pos) = (n), (n) = (pos)->next)
+
+static inline int
+ldlm_same_flock_owner(struct ldlm_lock *lock, struct ldlm_lock *new)
+{
+	return lock->l_export == new->l_export &&
+	       lock->l_policy_data.l_flock.owner ==
+	       new->l_policy_data.l_flock.owner;
+}
+
+static inline int
+ldlm_flocks_overlap(struct ldlm_lock *lock, struct ldlm_lock *new)
+{
+	const struct ldlm_flock *a = &new->l_policy_data.l_flock;
+	const struct ldlm_flock *b = &lock->l_policy_data.l_flock;
+	/* both ends are compared with <= / >=, i.e. treated as inclusive */
+	return a->start <= b->end && a->end >= b->start;
+}
+
+static inline int ldlm_flock_blocking_link(struct ldlm_lock *req,
+					   struct ldlm_lock *lock)
+{
+	struct obd_export *exp = req->l_export;
+	struct ldlm_flock *rflock = &req->l_policy_data.l_flock;
+	int rc;
+
+	/* For server only */
+	if (exp == NULL)
+		return 0;
+
+	/* the export has no flock hash yet: set it up, or fail with rc */
+	if (unlikely(exp->exp_flock_hash == NULL)) {
+		rc = ldlm_init_flock_export(exp);
+		if (rc)
+			return rc;
+	}
+
+	LASSERT(hlist_unhashed(&req->l_exp_flock_hash));
+
+	rflock->blocking_owner = lock->l_policy_data.l_flock.owner;
+	rflock->blocking_export = lock->l_export;
+	rflock->blocking_refs = 0;
+
+	/* hash @req under its own owner id */
+	cfs_hash_add(exp->exp_flock_hash, &rflock->owner,
+		     &req->l_exp_flock_hash);
+	return 0;
+}
+
+static inline void ldlm_flock_blocking_unlink(struct ldlm_lock *req)
+{
+	struct obd_export *exp = req->l_export;
+
+	if (exp == NULL) /* For server only */
+		return;
+	check_res_locked(req->l_resource);
+	if (exp->exp_flock_hash == NULL ||
+	    hlist_unhashed(&req->l_exp_flock_hash))
+		return;
+	cfs_hash_del(exp->exp_flock_hash, &req->l_policy_data.l_flock.owner,
+		     &req->l_exp_flock_hash);
+}
+
+static inline void
+ldlm_flock_destroy(struct ldlm_lock *lock, ldlm_mode_t mode, __u64 flags)
+{
+	int noreproc = (flags == LDLM_FL_WAIT_NOREPROC);
+	ENTRY;
+
+	LDLM_DEBUG(lock, "ldlm_flock_destroy(mode: %d, flags: 0x%llx)",
+		   mode, flags);
+
+	/* Safe to not lock here, since it should be empty anyway */
+	LASSERT(hlist_unhashed(&lock->l_exp_flock_hash));
+
+	list_del_init(&lock->l_res_link);
+	if (noreproc && (lock->l_flags & LDLM_FL_FAILED) == 0) {
+		/* client side - set a flag to prevent sending a CANCEL */
+		lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_CBPENDING;
+
+		/* we get here under lock_res_and_lock(), so the nolock
+		 * version of ldlm_lock_decref_internal has to be used */
+		ldlm_lock_decref_internal_nolock(lock, mode);
+	}
+
+	ldlm_lock_destroy_nolock(lock);
+	EXIT;
+}
+
+/**
+ * POSIX locks deadlock detection code.
+ *
+ * Given a new lock \a req and an existing lock \a bl_lock it conflicts
+ * with, follow the chain of blocked POSIX locks and see if a deadlock
+ * arises (i.e. one client holds a lock on something and wants a lock on
+ * something else while another client has the opposite situation).
+ * \retval 1 the chain leads back to \a req's owner and export, else 0
+ */
+static int
+ldlm_flock_deadlock(struct ldlm_lock *req, struct ldlm_lock *bl_lock)
+{
+	struct obd_export *req_exp = req->l_export;
+	struct obd_export *bl_exp = bl_lock->l_export;
+	__u64 req_owner = req->l_policy_data.l_flock.owner;
+	__u64 bl_owner = bl_lock->l_policy_data.l_flock.owner;
+
+	/* For server only */
+	if (req_exp == NULL)
+		return 0;
+
+	class_export_get(bl_exp);
+	while (1) {
+		struct obd_export *bl_exp_new;
+		struct ldlm_lock *lock = NULL;
+		struct ldlm_flock *flock;
+
+		if (bl_exp->exp_flock_hash != NULL)
+			lock = cfs_hash_lookup(bl_exp->exp_flock_hash,
+					       &bl_owner);
+		if (lock == NULL)
+			break;
+
+		flock = &lock->l_policy_data.l_flock;
+		LASSERT(flock->owner == bl_owner);
+		bl_owner = flock->blocking_owner;
+		bl_exp_new = class_export_get(flock->blocking_export);
+		/* release the hash ref before our ref on bl_exp is dropped */
+		cfs_hash_put(bl_exp->exp_flock_hash, &lock->l_exp_flock_hash);
+		class_export_put(bl_exp);
+		bl_exp = bl_exp_new;
+
+		if (bl_owner == req_owner && bl_exp == req_exp) {
+			class_export_put(bl_exp);
+			return 1;
+		}
+	}
+	class_export_put(bl_exp);
+
+	return 0;
+}
+
+/**
+ * Process a granting attempt for a flock lock.
+ * Must be called under ns lock held.
+ *
+ * This function looks for any conflicts for \a lock in the granted or
+ * waiting queues. The lock is granted if no conflicts are found in
+ * either queue.
+ *
+ * It is also responsible for splitting a lock if a portion of the lock
+ * is released.
+ *
+ * If \a first_enq is 0 (ie, called from ldlm_reprocess_queue):
+ *   - blocking ASTs have already been sent
+ *
+ * If \a first_enq is 1 (ie, called from ldlm_lock_enqueue):
+ *   - blocking ASTs have not been sent yet, so list of conflicting locks
+ *     would be collected and ASTs sent.
+ */
+int
+ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, int first_enq,
+			ldlm_error_t *err, struct list_head *work_list)
+{
+	struct ldlm_resource *res = req->l_resource;
+	struct ldlm_namespace *ns = ldlm_res_to_ns(res);
+	struct list_head *tmp;
+	struct list_head *ownlocks = NULL;
+	struct ldlm_lock *lock = NULL;
+	struct ldlm_lock *new = req;
+	struct ldlm_lock *new2 = NULL;
+	ldlm_mode_t mode = req->l_req_mode;
+	int local = ns_is_client(ns);
+	int added = (mode == LCK_NL); /* NL requests are never queued */
+	int overlaps = 0;
+	int splitted = 0;
+	const struct ldlm_callback_suite null_cbs = { NULL };
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_DLMTRACE, "flags %#llx owner "LPU64" pid %u mode %u start "
+	       LPU64" end "LPU64"\n", *flags,
+	       new->l_policy_data.l_flock.owner,
+	       new->l_policy_data.l_flock.pid, mode,
+	       req->l_policy_data.l_flock.start,
+	       req->l_policy_data.l_flock.end);
+
+	*err = ELDLM_OK;
+
+	if (local) {
+		/* No blocking ASTs are sent to the clients for
+		 * Posix file & record locks */
+		req->l_blocking_ast = NULL;
+	} else {
+		/* Called on the server for lock cancels. */
+		req->l_blocking_ast = ldlm_flock_blocking_ast;
+	}
+
+reprocess:
+	if ((*flags == LDLM_FL_WAIT_NOREPROC) || (mode == LCK_NL)) {
+		/* This loop determines where this process's locks start
+		 * in the resource lr_granted list. */
+		list_for_each(tmp, &res->lr_granted) {
+			lock = list_entry(tmp, struct ldlm_lock,
+					      l_res_link);
+			if (ldlm_same_flock_owner(lock, req)) {
+				ownlocks = tmp;
+				break;
+			}
+		}
+	} else {
+		lockmode_verify(mode);
+
+		/* This loop determines if there are existing locks
+		 * that conflict with the new lock request. */
+		list_for_each(tmp, &res->lr_granted) {
+			lock = list_entry(tmp, struct ldlm_lock,
+					      l_res_link);
+
+			if (ldlm_same_flock_owner(lock, req)) {
+				if (!ownlocks)
+					ownlocks = tmp;
+				continue;
+			}
+
+			/* locks are compatible, overlap doesn't matter */
+			if (lockmode_compat(lock->l_granted_mode, mode))
+				continue;
+
+			if (!ldlm_flocks_overlap(lock, req))
+				continue;
+
+			if (!first_enq)
+				RETURN(LDLM_ITER_CONTINUE);
+
+			if (*flags & LDLM_FL_BLOCK_NOWAIT) {
+				ldlm_flock_destroy(req, mode, *flags);
+				*err = -EAGAIN;
+				RETURN(LDLM_ITER_STOP);
+			}
+
+			if (*flags & LDLM_FL_TEST_LOCK) {
+				ldlm_flock_destroy(req, mode, *flags);
+				req->l_req_mode = lock->l_granted_mode;
+				req->l_policy_data.l_flock.pid =
+					lock->l_policy_data.l_flock.pid;
+				req->l_policy_data.l_flock.start =
+					lock->l_policy_data.l_flock.start;
+				req->l_policy_data.l_flock.end =
+					lock->l_policy_data.l_flock.end;
+				*flags |= LDLM_FL_LOCK_CHANGED;
+				RETURN(LDLM_ITER_STOP);
+			}
+
+			if (ldlm_flock_deadlock(req, lock)) {
+				ldlm_flock_destroy(req, mode, *flags);
+				*err = -EDEADLK;
+				RETURN(LDLM_ITER_STOP);
+			}
+
+			rc = ldlm_flock_blocking_link(req, lock);
+			if (rc) {
+				ldlm_flock_destroy(req, mode, *flags);
+				*err = rc;
+				RETURN(LDLM_ITER_STOP);
+			}
+			ldlm_resource_add_lock(res, &res->lr_waiting, req);
+			*flags |= LDLM_FL_BLOCK_GRANTED;
+			RETURN(LDLM_ITER_STOP);
+		}
+	}
+
+	if (*flags & LDLM_FL_TEST_LOCK) {
+		ldlm_flock_destroy(req, mode, *flags);
+		req->l_req_mode = LCK_NL;
+		*flags |= LDLM_FL_LOCK_CHANGED;
+		RETURN(LDLM_ITER_STOP);
+	}
+
+	/* In case we had slept on this lock request take it off of the
+	 * deadlock detection hash list. */
+	ldlm_flock_blocking_unlink(req);
+
+	/* Scan the locks owned by this process that overlap this request.
+	 * We may have to merge or split existing locks. */
+
+	if (!ownlocks)
+		ownlocks = &res->lr_granted;
+
+	list_for_remaining_safe(ownlocks, tmp, &res->lr_granted) {
+		lock = list_entry(ownlocks, struct ldlm_lock, l_res_link);
+
+		if (!ldlm_same_flock_owner(lock, new))
+			break;
+
+		if (lock->l_granted_mode == mode) {
+			/* If the modes are the same then we need to process
+			 * locks that overlap OR adjoin the new lock. The extra
+			 * logic condition is necessary to deal with arithmetic
+			 * overflow and underflow. */
+			if ((new->l_policy_data.l_flock.start >
+			     (lock->l_policy_data.l_flock.end + 1))
+			    && (lock->l_policy_data.l_flock.end !=
+				OBD_OBJECT_EOF))
+				continue;
+
+			if ((new->l_policy_data.l_flock.end <
+			     (lock->l_policy_data.l_flock.start - 1))
+			    && (lock->l_policy_data.l_flock.start != 0))
+				break;
+
+			if (new->l_policy_data.l_flock.start <
+			    lock->l_policy_data.l_flock.start) {
+				lock->l_policy_data.l_flock.start =
+					new->l_policy_data.l_flock.start;
+			} else {
+				new->l_policy_data.l_flock.start =
+					lock->l_policy_data.l_flock.start;
+			}
+
+			if (new->l_policy_data.l_flock.end >
+			    lock->l_policy_data.l_flock.end) {
+				lock->l_policy_data.l_flock.end =
+					new->l_policy_data.l_flock.end;
+			} else {
+				new->l_policy_data.l_flock.end =
+					lock->l_policy_data.l_flock.end;
+			}
+
+			if (added) {
+				ldlm_flock_destroy(lock, mode, *flags);
+			} else {
+				new = lock;
+				added = 1;
+			}
+			continue;
+		}
+
+		if (new->l_policy_data.l_flock.start >
+		    lock->l_policy_data.l_flock.end)
+			continue;
+
+		if (new->l_policy_data.l_flock.end <
+		    lock->l_policy_data.l_flock.start)
+			break;
+		/* different mode and a real overlap: trim, drop or split */
+		++overlaps;
+
+		if (new->l_policy_data.l_flock.start <=
+		    lock->l_policy_data.l_flock.start) {
+			if (new->l_policy_data.l_flock.end <
+			    lock->l_policy_data.l_flock.end) {
+				lock->l_policy_data.l_flock.start =
+					new->l_policy_data.l_flock.end + 1;
+				break;
+			}
+			ldlm_flock_destroy(lock, lock->l_req_mode, *flags);
+			continue;
+		}
+		if (new->l_policy_data.l_flock.end >=
+		    lock->l_policy_data.l_flock.end) {
+			lock->l_policy_data.l_flock.end =
+				new->l_policy_data.l_flock.start - 1;
+			continue;
+		}
+
+		/* split the existing lock into two locks */
+
+		/* if this is an F_UNLCK operation then we could avoid
+		 * allocating a new lock and use the req lock passed in
+		 * with the request but this would complicate the reply
+		 * processing since updates to req get reflected in the
+		 * reply. The client side replays the lock request so
+		 * it must see the original lock data in the reply. */
+
+		/* XXX - if ldlm_lock_new() can sleep we should
+		 * release the lr_lock, allocate the new lock,
+		 * and restart processing this lock. */
+		if (!new2) {
+			unlock_res_and_lock(req);
+			new2 = ldlm_lock_create(ns, &res->lr_name, LDLM_FLOCK,
+						lock->l_granted_mode, &null_cbs,
+						NULL, 0, LVB_T_NONE);
+			lock_res_and_lock(req);
+			if (!new2) {
+				ldlm_flock_destroy(req, lock->l_granted_mode,
+						   *flags);
+				*err = -ENOLCK;
+				RETURN(LDLM_ITER_STOP);
+			}
+			goto reprocess;
+		}
+
+		splitted = 1;
+
+		new2->l_granted_mode = lock->l_granted_mode;
+		new2->l_policy_data.l_flock.pid =
+			new->l_policy_data.l_flock.pid;
+		new2->l_policy_data.l_flock.owner =
+			new->l_policy_data.l_flock.owner;
+		new2->l_policy_data.l_flock.start =
+			lock->l_policy_data.l_flock.start;
+		new2->l_policy_data.l_flock.end =
+			new->l_policy_data.l_flock.start - 1;
+		lock->l_policy_data.l_flock.start =
+			new->l_policy_data.l_flock.end + 1;
+		new2->l_conn_export = lock->l_conn_export;
+		if (lock->l_export != NULL) {
+			new2->l_export = class_export_lock_get(lock->l_export, new2);
+			if (new2->l_export->exp_lock_hash &&
+			    hlist_unhashed(&new2->l_exp_hash))
+				cfs_hash_add(new2->l_export->exp_lock_hash,
+					     &new2->l_remote_handle,
+					     &new2->l_exp_hash);
+		}
+		if (*flags == LDLM_FL_WAIT_NOREPROC)
+			ldlm_lock_addref_internal_nolock(new2,
+							 lock->l_granted_mode);
+
+		/* insert new2 at lock */
+		ldlm_resource_add_lock(res, ownlocks, new2);
+		LDLM_LOCK_RELEASE(new2);
+		break;
+	}
+
+	/* if new2 is created but never used, destroy it */
+	if (splitted == 0 && new2 != NULL)
+		ldlm_lock_destroy_nolock(new2);
+
+	/* At this point we're granting the lock request. */
+	req->l_granted_mode = req->l_req_mode;
+
+	/* Add req to the granted queue before calling ldlm_reprocess_all(). */
+	if (!added) {
+		list_del_init(&req->l_res_link);
+		/* insert new lock before ownlocks in list. */
+		ldlm_resource_add_lock(res, ownlocks, req);
+	}
+
+	if (*flags != LDLM_FL_WAIT_NOREPROC) {
+		/* The only one possible case for client-side calls flock
+		 * policy function is ldlm_flock_completion_ast inside which
+		 * carries LDLM_FL_WAIT_NOREPROC flag. */
+		CERROR("Illegal parameter for client-side-only module.\n");
+		LBUG();
+	}
+	/* NOTE(review): no ldlm_add_ast_work_item() call is visible above. */
+	/* In case we're reprocessing the requested lock we can't destroy
+	 * it until after calling ldlm_add_ast_work_item() above so that laawi()
+	 * can bump the reference count on \a req. Otherwise \a req
+	 * could be freed before the completion AST can be sent.  */
+	if (added)
+		ldlm_flock_destroy(req, mode, *flags);
+
+	ldlm_resource_dump(D_INFO, res);
+	RETURN(LDLM_ITER_CONTINUE);
+}
+
+struct ldlm_flock_wait_data {	/* arg for ldlm_flock_interrupted_wait() */
+	struct ldlm_lock *fwd_lock;	/* lock the caller sleeps on */
+	int	       fwd_generation;	/* imp_generation read before sleeping */
+};
+
+/* Interrupt hook for the sleep in ldlm_flock_completion_ast(). */
+static void ldlm_flock_interrupted_wait(void *data)
+{
+	struct ldlm_flock_wait_data *fwd = data;
+	struct ldlm_lock *lock = fwd->fwd_lock;
+	ENTRY;
+
+	lock_res_and_lock(lock);
+
+	/* The wait was interrupted: leave the deadlock detection hash. */
+	ldlm_flock_blocking_unlink(lock);
+
+	/* Client side: CBPENDING keeps the lock off the LRU list. */
+	lock->l_flags |= LDLM_FL_CBPENDING;
+
+	unlock_res_and_lock(lock);
+	EXIT;
+}
+
+/**
+ * Flock completion callback function.
+ *
+ * Runs for the enqueue reply (\a data == NULL) and for a completion AST RPC
+ * (\a data != NULL).  A blocked enqueue sleeps here until the lock is
+ * granted or cancelled, or until the wait is interrupted.
+ *
+ * \param lock [in,out]: A lock to be handled
+ * \param flags    [in]: flags
+ * \param *data    [in]: ldlm_work_cp_ast_lock() will use ldlm_cb_set_arg
+ *
+ * \retval 0    : success
+ * \retval <0   : failure (-EIO for a failed lock, else the wait error)
+ */
+int
+ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data)
+{
+	struct file_lock		*getlk = lock->l_ast_data;
+	struct obd_device	      *obd;
+	struct obd_import	      *imp = NULL;
+	struct ldlm_flock_wait_data     fwd = { .fwd_lock = lock };
+	struct l_wait_info	      lwi;
+	ldlm_error_t		    err;
+	int			     rc = 0;
+	ENTRY;
+
+	CDEBUG(D_DLMTRACE, "flags: 0x%llx data: %p getlk: %p\n",
+	       flags, data, getlk);
+
+	/* Import invalidation. We need to actually release the lock
+	 * references being held, so that it can go away. No point in
+	 * holding the lock even if app still believes it has it, since
+	 * server already dropped it anyway. Only for granted locks too. */
+	if ((lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) ==
+	    (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) {
+		if (lock->l_req_mode == lock->l_granted_mode &&
+		    lock->l_granted_mode != LCK_NL &&
+		    NULL == data)
+			ldlm_lock_decref_internal(lock, lock->l_req_mode);
+
+		/* Need to wake up the waiter if we were evicted */
+		wake_up(&lock->l_waitq);
+		RETURN(0);
+	}
+
+	LASSERT(flags != LDLM_FL_WAIT_NOREPROC);
+
+	if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
+		       LDLM_FL_BLOCK_CONV))) {
+		if (NULL == data)
+			/* mds granted the lock in the reply */
+			goto granted;
+		/* CP AST RPC: lock get granted, wake it up */
+		wake_up(&lock->l_waitq);
+		RETURN(0);
+	}
+
+	LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, "
+		   "sleeping");
+	obd = class_exp2obd(lock->l_conn_export);
+
+	/* if this is a local lock, there is no import */
+	if (NULL != obd)
+		imp = obd->u.cli.cl_import;
+
+	/* Without an import fwd_generation keeps its zero initializer. */
+	if (NULL != imp) {
+		spin_lock(&imp->imp_lock);
+		fwd.fwd_generation = imp->imp_generation;
+		spin_unlock(&imp->imp_lock);
+	}
+
+	lwi = LWI_TIMEOUT_INTR(0, NULL, ldlm_flock_interrupted_wait, &fwd);
+
+	/* Go to sleep until the lock is granted. */
+	rc = l_wait_event(lock->l_waitq, is_granted_or_cancelled(lock), &lwi);
+
+	if (rc) {
+		LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)",
+			   rc);
+		RETURN(rc);
+	}
+
+granted:
+	/* rc is 0 on every path that reaches this label. */
+	OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT, 10);
+
+	if (lock->l_destroyed) {
+		LDLM_DEBUG(lock, "client-side enqueue waking up: destroyed");
+		RETURN(0);
+	}
+
+	if (lock->l_flags & LDLM_FL_FAILED) {
+		LDLM_DEBUG(lock, "client-side enqueue waking up: failed");
+		RETURN(-EIO);
+	}
+
+	LDLM_DEBUG(lock, "client-side enqueue granted");
+
+	lock_res_and_lock(lock);
+
+	/* take lock off the deadlock detection hash list. */
+	ldlm_flock_blocking_unlink(lock);
+
+	/* ldlm_lock_enqueue() has already placed lock on the granted list. */
+	list_del_init(&lock->l_res_link);
+
+	if (flags & LDLM_FL_TEST_LOCK) {
+		/* fcntl(F_GETLK) request */
+		/* The old mode was saved in getlk->fl_type so that if the mode
+		 * in the lock changes we can decref the appropriate refcount.*/
+		ldlm_flock_destroy(lock, flock_type(getlk),
+				   LDLM_FL_WAIT_NOREPROC);
+		switch (lock->l_granted_mode) {
+		case LCK_PR:
+			flock_set_type(getlk, F_RDLCK);
+			break;
+		case LCK_PW:
+			flock_set_type(getlk, F_WRLCK);
+			break;
+		default:
+			flock_set_type(getlk, F_UNLCK);
+		}
+		flock_set_pid(getlk, (pid_t)lock->l_policy_data.l_flock.pid);
+		flock_set_start(getlk,
+				(loff_t)lock->l_policy_data.l_flock.start);
+		flock_set_end(getlk,
+			      (loff_t)lock->l_policy_data.l_flock.end);
+	} else {
+		__u64 noreproc = LDLM_FL_WAIT_NOREPROC;
+
+		/* We need to reprocess the lock to do merges or splits
+		 * with existing locks owned by this process.
+		 * NOTE(review): err and the return value are ignored here. */
+		ldlm_process_flock_lock(lock, &noreproc, 1, &err, NULL);
+	}
+	unlock_res_and_lock(lock);
+	RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_flock_completion_ast);
+
+int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+			    void *data, int flag)
+{
+	ENTRY;
+	/* Blocking callback for flock locks; desc and data are unused. */
+	LASSERT(lock);
+	LASSERT(flag == LDLM_CB_CANCELING);	/* only cancel is expected */
+
+	/* take lock off the deadlock detection hash list. */
+	lock_res_and_lock(lock);
+	ldlm_flock_blocking_unlink(lock);
+	unlock_res_and_lock(lock);
+	RETURN(0);
+}
+
+void ldlm_flock_policy_wire18_to_local(const ldlm_wire_policy_data_t *wpolicy,
+				       ldlm_policy_data_t *lpolicy)
+{
+	struct ldlm_flock *flock = &lpolicy->l_flock;
+
+	memset(lpolicy, 0, sizeof(*lpolicy));
+	flock->start = wpolicy->l_flock.lfw_start;
+	flock->end = wpolicy->l_flock.lfw_end;
+	flock->pid = wpolicy->l_flock.lfw_pid;
+	/* Compat: clients older than 2.1 (LU-104, April 2011) have no owner
+	 * field and identify the owner by pid alone, so reuse the pid. */
+	flock->owner = wpolicy->l_flock.lfw_pid;
+}
+
+void ldlm_flock_policy_wire21_to_local(const ldlm_wire_policy_data_t *wpolicy,
+				       ldlm_policy_data_t *lpolicy)
+{
+	memset(lpolicy, 0, sizeof *lpolicy);	/* zero first, then copy 1:1 */
+	lpolicy->l_flock.owner = wpolicy->l_flock.lfw_owner;
+	lpolicy->l_flock.pid = wpolicy->l_flock.lfw_pid;
+	lpolicy->l_flock.end = wpolicy->l_flock.lfw_end;
+	lpolicy->l_flock.start = wpolicy->l_flock.lfw_start;
+}
+
+void ldlm_flock_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+				     ldlm_wire_policy_data_t *wpolicy)
+{
+	memset(wpolicy, 0, sizeof *wpolicy);	/* zero first, then copy 1:1 */
+	wpolicy->l_flock.lfw_owner = lpolicy->l_flock.owner;
+	wpolicy->l_flock.lfw_pid = lpolicy->l_flock.pid;
+	wpolicy->l_flock.lfw_end = lpolicy->l_flock.end;
+	wpolicy->l_flock.lfw_start = lpolicy->l_flock.start;
+}
+
+/*
+ * Export handle<->flock hash operations.
+ */
+static unsigned
+ldlm_export_flock_hash(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+	return cfs_hash_u64_hash(*(const __u64 *)key, mask); /* hs unused */
+}
+
+/* hs_key callback: the hash key is the owner stored in the flock data. */
+static void *ldlm_export_flock_key(struct hlist_node *hnode)
+{
+	struct ldlm_lock *lock = hlist_entry(hnode, struct ldlm_lock,
+					     l_exp_flock_hash);
+
+	return &lock->l_policy_data.l_flock.owner;
+}
+
+/* hs_keycmp callback: non-zero when \a key equals the node's __u64 key. */
+static int ldlm_export_flock_keycmp(const void *key, struct hlist_node *hnode)
+{
+	return memcmp(ldlm_export_flock_key(hnode), key, sizeof(__u64)) == 0;
+}
+
+/* hs_object callback: return the ldlm_lock that embeds \a hnode. */
+static void *ldlm_export_flock_object(struct hlist_node *hnode)
+{
+	return hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash);
+}
+
+/* hs_get callback: take a lock reference plus one on its blocking export. */
+static void ldlm_export_flock_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct ldlm_lock *lock = hlist_entry(hnode, struct ldlm_lock,
+					     l_exp_flock_hash);
+	struct ldlm_flock *flock = &lock->l_policy_data.l_flock;
+
+	LDLM_LOCK_GET(lock);
+
+	/* blocking_refs counts the export references taken this way. */
+	LASSERT(flock->blocking_export != NULL);
+	class_export_get(flock->blocking_export);
+	flock->blocking_refs++;
+}
+
+/* hs_put callback: undo ldlm_export_flock_get(), lock reference last. */
+static void ldlm_export_flock_put(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct ldlm_lock *lock;
+	struct ldlm_flock *flock;
+
+	lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash);
+	flock = &lock->l_policy_data.l_flock;
+	LASSERT(flock->blocking_export != NULL);
+	class_export_put(flock->blocking_export);
+	if (--flock->blocking_refs == 0) {
+		flock->blocking_owner = 0;
+		flock->blocking_export = NULL;
+	}
+	/* Last: this may drop the final ref, so touch nothing afterwards. */
+	LDLM_LOCK_RELEASE(lock);
+}
+
+static cfs_hash_ops_t ldlm_export_flock_ops = {
+	.hs_hash	= ldlm_export_flock_hash,
+	.hs_key		= ldlm_export_flock_key,
+	.hs_keycmp	= ldlm_export_flock_keycmp,
+	.hs_object	= ldlm_export_flock_object,
+	.hs_get		= ldlm_export_flock_get,
+	.hs_put		= ldlm_export_flock_put,
+	.hs_put_locked	= ldlm_export_flock_put, /* shares the hs_put handler */
+};
+
+int ldlm_init_flock_export(struct obd_export *exp)
+{
+	cfs_hash_t *hash;
+
+	/* Create the per-export flock hash; -ENOMEM if that fails. */
+	hash = cfs_hash_create(obd_uuid2str(&exp->exp_client_uuid),
+			       HASH_EXP_LOCK_CUR_BITS,
+			       HASH_EXP_LOCK_MAX_BITS,
+			       HASH_EXP_LOCK_BKT_BITS, 0,
+			       CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA,
+			       &ldlm_export_flock_ops,
+			       CFS_HASH_DEFAULT | CFS_HASH_NBLK_CHANGE);
+	exp->exp_flock_hash = hash;
+	RETURN(hash != NULL ? 0 : -ENOMEM);
+}
+EXPORT_SYMBOL(ldlm_init_flock_export);
+
+void ldlm_destroy_flock_export(struct obd_export *exp)
+{
+	ENTRY;
+	if (exp->exp_flock_hash) {	/* skip when no hash is set */
+		cfs_hash_putref(exp->exp_flock_hash);
+		exp->exp_flock_hash = NULL;	/* a repeated call is a no-op */
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(ldlm_destroy_flock_export);
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c b/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c
new file mode 100644
index 000000000000..574b2ff43b74
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c
@@ -0,0 +1,75 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_inodebits.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+/**
+ * This file contains implementation of IBITS lock type
+ *
+ * IBITS lock type contains a bit mask determining various properties of an
+ * object. The meanings of specific bits are specific to the caller and are
+ * opaque to LDLM code.
+ *
+ * Locks with intersecting bitmasks and conflicting lock modes (e.g.  LCK_PW)
+ * are considered conflicting.  See the lock mode compatibility matrix
+ * in lustre_dlm.h.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include <lustre_dlm.h>
+#include <obd_support.h>
+#include <lustre_lib.h>
+
+#include "ldlm_internal.h"
+
+
+void ldlm_ibits_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+				     ldlm_policy_data_t *lpolicy)
+{
+	memset(lpolicy, 0, sizeof(*lpolicy));	/* only bits is set below */
+	lpolicy->l_inodebits.bits = wpolicy->l_inodebits.bits;
+}
+
+void ldlm_ibits_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+				     ldlm_wire_policy_data_t *wpolicy)
+{
+	memset(wpolicy, 0, sizeof(*wpolicy));	/* only bits is set below */
+	wpolicy->l_inodebits.bits = lpolicy->l_inodebits.bits;
+}
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h b/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h
new file mode 100644
index 000000000000..a08e6d9757be
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h
@@ -0,0 +1,277 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define MAX_STRING_SIZE 128
+
+extern atomic_t ldlm_srv_namespace_nr;
+extern atomic_t ldlm_cli_namespace_nr;
+extern struct mutex ldlm_srv_namespace_lock;
+extern struct list_head ldlm_srv_namespace_list;
+extern struct mutex ldlm_cli_namespace_lock;
+extern struct list_head ldlm_cli_namespace_list;
+
+static inline atomic_t *ldlm_namespace_nr(ldlm_side_t client)
+{
+	return client != LDLM_NAMESPACE_SERVER ? /* every non-server side */
+		&ldlm_cli_namespace_nr : &ldlm_srv_namespace_nr;
+}
+
+static inline struct list_head *ldlm_namespace_list(ldlm_side_t client)
+{
+	return client != LDLM_NAMESPACE_SERVER ? /* every non-server side */
+		&ldlm_cli_namespace_list : &ldlm_srv_namespace_list;
+}
+
+static inline struct mutex *ldlm_namespace_lock(ldlm_side_t client)
+{
+	return client != LDLM_NAMESPACE_SERVER ? /* every non-server side */
+		&ldlm_cli_namespace_lock : &ldlm_srv_namespace_lock;
+}
+
+/* ldlm_request.c */
+/* LRU cancel flags: bit mask telling which locks an LRU cancel may take. */
+enum {
+	LDLM_CANCEL_AGED   = 1 << 0, /* Cancel aged locks (non lru resize). */
+	LDLM_CANCEL_PASSED = 1 << 1, /* Cancel passed number of locks. */
+	LDLM_CANCEL_SHRINK = 1 << 2, /* Cancel locks from shrinker. */
+	LDLM_CANCEL_LRUR   = 1 << 3, /* Cancel locks from lru resize. */
+	LDLM_CANCEL_NO_WAIT = 1 << 4 /* Cancel locks w/o blocking (neither
+				      * sending nor waiting for any rpcs) */
+};
+
+int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr,
+		    ldlm_cancel_flags_t sync, int flags);
+int ldlm_cancel_lru_local(struct ldlm_namespace *ns,
+			  struct list_head *cancels, int count, int max,
+			  ldlm_cancel_flags_t cancel_flags, int flags);
+extern int ldlm_enqueue_min;
+int ldlm_get_enq_timeout(struct ldlm_lock *lock);
+
+/* ldlm_resource.c */
+int ldlm_resource_putref_locked(struct ldlm_resource *res);
+void ldlm_resource_insert_lock_after(struct ldlm_lock *original,
+				     struct ldlm_lock *new);
+void ldlm_namespace_free_prior(struct ldlm_namespace *ns,
+			       struct obd_import *imp, int force);
+void ldlm_namespace_free_post(struct ldlm_namespace *ns);
+/* ldlm_lock.c */
+
+struct ldlm_cb_set_arg { /* presumably shared by the AST work callbacks */
+	struct ptlrpc_request_set	*set;
+	int				 type; /* LDLM_{CP,BL,GL}_CALLBACK */
+	atomic_t			 restart;
+	struct list_head			*list;
+	union ldlm_gl_desc		*gl_desc; /* glimpse AST descriptor */
+};
+
+typedef enum {	/* AST kind, as passed to ldlm_run_ast_work() */
+	LDLM_WORK_BL_AST,	/* blocking AST */
+	LDLM_WORK_CP_AST,	/* completion AST */
+	LDLM_WORK_REVOKE_AST,
+	LDLM_WORK_GL_AST	/* glimpse AST */
+} ldlm_desc_ast_t;
+
+void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list);
+int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill,
+		  enum req_location loc, void *data, int size);
+struct ldlm_lock *
+ldlm_lock_create(struct ldlm_namespace *ns, const struct ldlm_res_id *,
+		 ldlm_type_t type, ldlm_mode_t,
+		 const struct ldlm_callback_suite *cbs,
+		 void *data, __u32 lvb_len, enum lvb_type lvb_type);
+ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *, struct ldlm_lock **,
+			       void *cookie, __u64 *flags);
+void ldlm_lock_addref_internal(struct ldlm_lock *, __u32 mode);
+void ldlm_lock_addref_internal_nolock(struct ldlm_lock *, __u32 mode);
+void ldlm_lock_decref_internal(struct ldlm_lock *, __u32 mode);
+void ldlm_lock_decref_internal_nolock(struct ldlm_lock *, __u32 mode);
+void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new,
+			    struct list_head *work_list);
+int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list,
+		      ldlm_desc_ast_t ast_type);
+int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq);
+int ldlm_lock_remove_from_lru(struct ldlm_lock *lock);
+int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock);
+void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock);
+void ldlm_lock_add_to_lru(struct ldlm_lock *lock);
+void ldlm_lock_touch_in_lru(struct ldlm_lock *lock);
+void ldlm_lock_destroy_nolock(struct ldlm_lock *lock);
+
+void ldlm_cancel_locks_for_export(struct obd_export *export);
+
+/* ldlm_lockd.c */
+int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld,
+			   struct ldlm_lock *lock);
+int ldlm_bl_to_thread_list(struct ldlm_namespace *ns,
+			   struct ldlm_lock_desc *ld,
+			   struct list_head *cancels, int count,
+			   ldlm_cancel_flags_t cancel_flags);
+
+void ldlm_handle_bl_callback(struct ldlm_namespace *ns,
+			     struct ldlm_lock_desc *ld, struct ldlm_lock *lock);
+
+
+/* ldlm_extent.c */
+void ldlm_extent_add_lock(struct ldlm_resource *res, struct ldlm_lock *lock);
+void ldlm_extent_unlink_lock(struct ldlm_lock *lock);
+
+/* ldlm_flock.c */
+int ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags,
+			    int first_enq, ldlm_error_t *err,
+			    struct list_head *work_list);
+int ldlm_init_flock_export(struct obd_export *exp);
+void ldlm_destroy_flock_export(struct obd_export *exp);
+
+/* l_lock.c */
+void l_check_ns_lock(struct ldlm_namespace *ns);
+void l_check_no_ns_lock(struct ldlm_namespace *ns);
+
+extern proc_dir_entry_t *ldlm_svc_proc_dir;
+extern proc_dir_entry_t *ldlm_type_proc_dir;
+
+struct ldlm_state { /* ptlrpc services, client, server conn and bl pool */
+	struct ptlrpc_service *ldlm_cb_service;
+	struct ptlrpc_service *ldlm_cancel_service;
+	struct ptlrpc_client *ldlm_client;
+	struct ptlrpc_connection *ldlm_server_conn;
+	struct ldlm_bl_pool *ldlm_bl_pool;
+};
+
+/* interval tree, for LDLM_EXTENT. */
+extern struct kmem_cache *ldlm_interval_slab; /* slab cache for ldlm_interval */
+extern void ldlm_interval_attach(struct ldlm_interval *n, struct ldlm_lock *l);
+extern struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l);
+extern struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock);
+extern void ldlm_interval_free(struct ldlm_interval *node);
+/* this function must be called with res lock held */
+static inline struct ldlm_extent *
+ldlm_interval_extent(struct ldlm_interval *node)
+{
+	struct ldlm_lock *first;
+
+	LASSERT(!list_empty(&node->li_group));
+	/* Use the extent of the first lock linked on the node's group. */
+	first = list_entry(node->li_group.next, struct ldlm_lock, l_sl_policy);
+	return &first->l_policy_data.l_extent;
+}
+
+int ldlm_init(void);
+void ldlm_exit(void);
+
+enum ldlm_policy_res {	/* users are outside this header */
+	LDLM_POLICY_CANCEL_LOCK,
+	LDLM_POLICY_KEEP_LOCK,
+	LDLM_POLICY_SKIP_LOCK
+};
+
+typedef enum ldlm_policy_res ldlm_policy_res_t;
+
+#define LDLM_POOL_PROC_READER(var, type) /* defines lprocfs_rd_<var>() */ \
+	static int lprocfs_rd_##var(char *page, char **start, off_t off, \
+				    int count, int *eof, void *data) \
+	{ \
+		struct ldlm_pool *pl = data; \
+		type tmp; \
+		/* Copy pl_<var> out while holding pl_lock. */ \
+		spin_lock(&pl->pl_lock); \
+		tmp = pl->pl_##var; \
+		spin_unlock(&pl->pl_lock); \
+		/* Format the copy once the lock has been dropped. */ \
+		return lprocfs_rd_uint(page, start, off, count, eof, &tmp); \
+	} \
+	struct __##var##__dummy_read {;} /* semicolon catcher */
+
+#define LDLM_POOL_PROC_WRITER(var, type) /* defines lprocfs_wr_<var>() */ \
+	int lprocfs_wr_##var(struct file *file, const char *buffer, \
+			     unsigned long count, void *data) \
+	{ \
+		struct ldlm_pool *pl = data; \
+		type tmp; \
+		int rc; \
+		/* Parse first; pl is left untouched on a parse error. */ \
+		rc = lprocfs_wr_uint(file, buffer, count, &tmp); \
+		if (rc < 0) { \
+			CERROR("Can't parse user input, rc = %d\n", rc); \
+			return rc; \
+		} \
+		/* Store the parsed value under pl_lock. */ \
+		spin_lock(&pl->pl_lock); \
+		pl->pl_##var = tmp; \
+		spin_unlock(&pl->pl_lock); \
+		/* rc is what lprocfs_wr_uint() returned (>= 0 here). */ \
+		return rc; \
+	} \
+	struct __##var##__dummy_write {;} /* semicolon catcher */
+
+static inline int is_granted_or_cancelled(struct ldlm_lock *lock)
+{
+	int done;
+
+	lock_res_and_lock(lock);
+	/* Failed or cancelled locks end the wait too, not just grants. */
+	done = (lock->l_flags & (LDLM_FL_FAILED | LDLM_FL_CANCEL)) ||
+	       (lock->l_req_mode == lock->l_granted_mode &&
+		!(lock->l_flags & LDLM_FL_CP_REQD));
+	unlock_res_and_lock(lock);
+
+	return done;
+}
+
+typedef void (*ldlm_policy_wire_to_local_t)(const ldlm_wire_policy_data_t *,
+					    ldlm_policy_data_t *);
+
+typedef void (*ldlm_policy_local_to_wire_t)(const ldlm_policy_data_t *,
+					    ldlm_wire_policy_data_t *);
+
+void ldlm_plain_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+				     ldlm_policy_data_t *lpolicy);
+void ldlm_plain_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+				     ldlm_wire_policy_data_t *wpolicy);
+void ldlm_ibits_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+				     ldlm_policy_data_t *lpolicy);
+void ldlm_ibits_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+				     ldlm_wire_policy_data_t *wpolicy);
+void ldlm_extent_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+				     ldlm_policy_data_t *lpolicy);
+void ldlm_extent_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+				     ldlm_wire_policy_data_t *wpolicy);
+void ldlm_flock_policy_wire18_to_local(const ldlm_wire_policy_data_t *wpolicy,
+				     ldlm_policy_data_t *lpolicy);
+void ldlm_flock_policy_wire21_to_local(const ldlm_wire_policy_data_t *wpolicy,
+				     ldlm_policy_data_t *lpolicy);
+
+void ldlm_flock_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+				     ldlm_wire_policy_data_t *wpolicy);
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c
new file mode 100644
index 000000000000..42df53072dc3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c
@@ -0,0 +1,868 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/**
+ * This file deals with various client/target related logic including recovery.
+ *
+ * TODO: This code more logically belongs in the ptlrpc module than in ldlm and
+ * should be moved.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+# include <linux/libcfs/libcfs.h>
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_dlm.h>
+#include <lustre_net.h>
+#include <lustre_sec.h>
+#include "ldlm_internal.h"
+
+/* Look up, and optionally add, the import connection for \a uuid.
+ * @priority: If non-zero, move the selected connection to the list head.
+ * @create: If zero, only search in existing connections.
+ */
+static int import_set_conn(struct obd_import *imp, struct obd_uuid *uuid,
+			   int priority, int create)
+{
+	struct obd_import_conn *imp_conn = NULL, *item;
+	struct ptlrpc_connection *ptlrpc_conn;
+	int rc = 0;
+	ENTRY;
+
+	if (!create && !priority) {
+		CDEBUG(D_HA, "Nothing to do\n");
+		RETURN(-EINVAL);
+	}
+
+	ptlrpc_conn = ptlrpc_uuid_to_connection(uuid);
+	if (ptlrpc_conn == NULL) {
+		CDEBUG(D_HA, "can't find connection %s\n", uuid->uuid);
+		RETURN(-ENOENT);
+	}
+
+	/* The spare entry is allocated before imp_lock is taken. */
+	if (create) {
+		OBD_ALLOC(imp_conn, sizeof(*imp_conn));
+		if (imp_conn == NULL)
+			GOTO(out_put, rc = -ENOMEM);
+	}
+
+	spin_lock(&imp->imp_lock);
+	list_for_each_entry(item, &imp->imp_conn_list, oic_item) {
+		if (!obd_uuid_equals(uuid, &item->oic_uuid))
+			continue;
+		if (priority) {
+			list_del(&item->oic_item);
+			list_add(&item->oic_item, &imp->imp_conn_list);
+			item->oic_last_attempt = 0;
+		}
+		CDEBUG(D_HA, "imp %p@%s: found existing conn %s%s\n",
+		       imp, imp->imp_obd->obd_name, uuid->uuid,
+		       (priority ? ", moved to head" : ""));
+		spin_unlock(&imp->imp_lock);
+		/* Already listed: free the spare entry, drop the lookup ref. */
+		GOTO(out_free, rc = 0);
+	}
+	/* No existing import connection found for \a uuid. */
+	if (!create) {
+		spin_unlock(&imp->imp_lock);
+		GOTO(out_free, rc = -ENOENT);
+	}
+
+	/* The new entry keeps the connection found above. */
+	imp_conn->oic_conn = ptlrpc_conn;
+	imp_conn->oic_uuid = *uuid;
+	imp_conn->oic_last_attempt = 0;
+	if (priority)
+		list_add(&imp_conn->oic_item, &imp->imp_conn_list);
+	else
+		list_add_tail(&imp_conn->oic_item, &imp->imp_conn_list);
+	CDEBUG(D_HA, "imp %p@%s: add connection %s at %s\n",
+	       imp, imp->imp_obd->obd_name, uuid->uuid,
+	       (priority ? "head" : "tail"));
+	spin_unlock(&imp->imp_lock);
+	RETURN(0);
+out_free:
+	if (imp_conn)
+		OBD_FREE(imp_conn, sizeof(*imp_conn));
+out_put:
+	ptlrpc_connection_put(ptlrpc_conn);
+	RETURN(rc);
+}
+
+int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid)
+{
+	return import_set_conn(imp, uuid, 1, 0); /* priority=1, create=0 */
+}
+
+int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid,
+			   int priority)
+{
+	return import_set_conn(imp, uuid, priority, 1);	/* create=1 */
+}
+EXPORT_SYMBOL(client_import_add_conn);
+
+/* Drop \a uuid from \a imp's connection list: 0, -ENOENT or -EBUSY. */
+int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid)
+{
+	struct obd_import_conn *imp_conn;
+	struct obd_export *dlmexp;
+	int rc = -ENOENT;
+	ENTRY;
+
+	spin_lock(&imp->imp_lock);
+	if (list_empty(&imp->imp_conn_list)) {
+		LASSERT(!imp->imp_connection);
+		GOTO(out, rc);
+	}
+
+	list_for_each_entry(imp_conn, &imp->imp_conn_list, oic_item) {
+		if (!obd_uuid_equals(uuid, &imp_conn->oic_uuid))
+			continue;
+		LASSERT(imp_conn->oic_conn);
+		if (imp_conn == imp->imp_conn_current) {
+			LASSERT(imp_conn->oic_conn == imp->imp_connection);
+			if (imp->imp_state != LUSTRE_IMP_CLOSED &&
+			    imp->imp_state != LUSTRE_IMP_DISCON) {
+				CERROR("can't remove current connection\n");
+				GOTO(out, rc = -EBUSY);
+			}
+
+			ptlrpc_connection_put(imp->imp_connection);
+			imp->imp_connection = NULL;
+			dlmexp = class_conn2export(&imp->imp_dlm_handle);
+			if (dlmexp && dlmexp->exp_connection) {
+				LASSERT(dlmexp->exp_connection ==
+					imp_conn->oic_conn);
+				ptlrpc_connection_put(dlmexp->exp_connection);
+				dlmexp->exp_connection = NULL;
+			}
+			if (dlmexp)	/* drop the lookup reference */
+				class_export_put(dlmexp);
+		}
+
+		list_del(&imp_conn->oic_item);
+		ptlrpc_connection_put(imp_conn->oic_conn);
+		OBD_FREE(imp_conn, sizeof(*imp_conn));
+		CDEBUG(D_HA, "imp %p@%s: remove connection %s\n",
+		       imp, imp->imp_obd->obd_name, uuid->uuid);
+		rc = 0;
+		break;
+	}
+out:
+	spin_unlock(&imp->imp_lock);
+	if (rc == -ENOENT)
+		CERROR("connection %s not found\n", uuid->uuid);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(client_import_del_conn);
+
+/**
+ * Find the first connection UUID of \a imp that reaches server NID \a peer
+ * and copy it to \a uuid.  Returns 0 on a match, -ENOENT otherwise.
+ */
+int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer,
+			    struct obd_uuid *uuid)
+{
+	struct obd_import_conn *item;
+	int rc = -ENOENT;
+	ENTRY;
+
+	spin_lock(&imp->imp_lock);
+	list_for_each_entry(item, &imp->imp_conn_list, oic_item) {
+		if (!class_check_uuid(&item->oic_uuid, peer))
+			continue;
+		/* This conn UUID does have the peer NID: report it. */
+		*uuid = item->oic_uuid;
+		rc = 0;
+		break;
+	}
+	spin_unlock(&imp->imp_lock);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(client_import_find_conn);
+
+void client_destroy_import(struct obd_import *imp)
+{
+	/* Drop security policy instance after all RPCs have finished/aborted
+	 * to let all busy contexts be released. */
+	class_import_get(imp);	/* presumably pins imp; confirm */
+	class_destroy_import(imp);
+	sptlrpc_import_sec_put(imp);
+	class_import_put(imp);	/* pairs with class_import_get() above */
+}
+EXPORT_SYMBOL(client_destroy_import);
+
+/**
+ * Check whether or not the OSC is on MDT.
+ * In the config log,
+ * osc on MDT
+ *	setup 0:{fsname}-OSTxxxx-osc[-MDTxxxx] 1:lustre-OST0000_UUID 2:NID
+ * osc on client
+ *	setup 0:{fsname}-OSTxxxx-osc 1:lustre-OST0000_UUID 2:NID
+ *
+ * Only the text after the last '-' of \a obdname is examined.
+ *
+ * \retval 1 if that text starts with "MDT"
+ * \retval 0 otherwise, including when \a obdname contains no '-'
+ **/
+static int osc_on_mdt(char *obdname)
+{
+	char *dash = strrchr(obdname, '-');
+
+	if (dash != NULL && strncmp(dash + 1, "MDT", 3) == 0)
+		return 1;
+
+	return 0;
+}
+
+/* Configure an RPC client OBD device.
+ *
+ * lcfg parameters:
+ * 1 - client UUID
+ * 2 - server UUID
+ * 3 - inactive-on-startup
+ */
+int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
+{
+	struct client_obd *cli = &obddev->u.cli;
+	struct obd_import *imp;
+	struct obd_uuid server_uuid;
+	int rq_portal, rp_portal, connect_op;
+	char *name = obddev->obd_type->typ_name;
+	ldlm_ns_type_t ns_type = LDLM_NS_TYPE_UNKNOWN;
+	int rc;
+	char	*cli_name = lustre_cfg_buf(lcfg, 0);
+	ENTRY;
+
+	/* Check the UUID buffers before anything reads them: buffer 1 (the
+	 * target UUID) is handed to strstr() by the type dispatch below and
+	 * both buffers are copied with memcpy() further down. */
+	if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
+		CERROR("requires a TARGET UUID\n");
+		RETURN(-EINVAL);
+	}
+
+	if (LUSTRE_CFG_BUFLEN(lcfg, 1) > 37) {
+		CERROR("target UUID must be less than 38 characters\n");
+		RETURN(-EINVAL);
+	}
+
+	if (LUSTRE_CFG_BUFLEN(lcfg, 2) < 1) {
+		CERROR("setup requires a SERVER UUID\n");
+		RETURN(-EINVAL);
+	}
+
+	if (LUSTRE_CFG_BUFLEN(lcfg, 2) > 37) {
+		CERROR("server UUID must be less than 38 characters\n");
+		RETURN(-EINVAL);
+	}
+
+	/* In a more perfect world, we would hang a ptlrpc_client off of
+	 * obd_type and just use the values from there. */
+	if (!strcmp(name, LUSTRE_OSC_NAME) ||
+	    (!(strcmp(name, LUSTRE_OSP_NAME)) &&
+	     (is_osp_on_mdt(cli_name) &&
+	       strstr(lustre_cfg_buf(lcfg, 1), "OST") != NULL))) {
+		/* OSC or OSP_on_MDT for OSTs */
+		rq_portal = OST_REQUEST_PORTAL;
+		rp_portal = OSC_REPLY_PORTAL;
+		connect_op = OST_CONNECT;
+		cli->cl_sp_me = LUSTRE_SP_CLI;
+		cli->cl_sp_to = LUSTRE_SP_OST;
+		ns_type = LDLM_NS_TYPE_OSC;
+	} else if (!strcmp(name, LUSTRE_MDC_NAME) ||
+		   !strcmp(name, LUSTRE_LWP_NAME) ||
+		   (!strcmp(name, LUSTRE_OSP_NAME) &&
+		    (is_osp_on_mdt(cli_name) &&
+		     strstr(lustre_cfg_buf(lcfg, 1), "OST") == NULL))) {
+		/* MDC or OSP_on_MDT for other MDTs */
+		rq_portal = MDS_REQUEST_PORTAL;
+		rp_portal = MDC_REPLY_PORTAL;
+		connect_op = MDS_CONNECT;
+		cli->cl_sp_me = LUSTRE_SP_CLI;
+		cli->cl_sp_to = LUSTRE_SP_MDT;
+		ns_type = LDLM_NS_TYPE_MDC;
+	} else if (!strcmp(name, LUSTRE_MGC_NAME)) {
+		rq_portal = MGS_REQUEST_PORTAL;
+		rp_portal = MGC_REPLY_PORTAL;
+		connect_op = MGS_CONNECT;
+		cli->cl_sp_me = LUSTRE_SP_MGC;
+		cli->cl_sp_to = LUSTRE_SP_MGS;
+		cli->cl_flvr_mgc.sf_rpc = SPTLRPC_FLVR_INVALID;
+		ns_type = LDLM_NS_TYPE_MGC;
+	} else {
+		CERROR("unknown client OBD type \"%s\", can't setup\n",
+		       name);
+		RETURN(-EINVAL);
+	}
+
+	init_rwsem(&cli->cl_sem);
+	sema_init(&cli->cl_mgc_sem, 1);
+	cli->cl_conn_count = 0;
+	memcpy(server_uuid.uuid, lustre_cfg_buf(lcfg, 2),
+	       min_t(unsigned int, LUSTRE_CFG_BUFLEN(lcfg, 2),
+		     sizeof(server_uuid)));
+
+	cli->cl_dirty = 0;
+	cli->cl_avail_grant = 0;
+	/* FIXME: Should limit this for the sum of all cl_dirty_max. */
+	cli->cl_dirty_max = OSC_MAX_DIRTY_DEFAULT * 1024 * 1024;
+	if (cli->cl_dirty_max >> PAGE_CACHE_SHIFT > num_physpages / 8)
+		cli->cl_dirty_max = num_physpages << (PAGE_CACHE_SHIFT - 3);
+	INIT_LIST_HEAD(&cli->cl_cache_waiters);
+	INIT_LIST_HEAD(&cli->cl_loi_ready_list);
+	INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list);
+	INIT_LIST_HEAD(&cli->cl_loi_write_list);
+	INIT_LIST_HEAD(&cli->cl_loi_read_list);
+	client_obd_list_lock_init(&cli->cl_loi_list_lock);
+	atomic_set(&cli->cl_pending_w_pages, 0);
+	atomic_set(&cli->cl_pending_r_pages, 0);
+	cli->cl_r_in_flight = 0;
+	cli->cl_w_in_flight = 0;
+
+	spin_lock_init(&cli->cl_read_rpc_hist.oh_lock);
+	spin_lock_init(&cli->cl_write_rpc_hist.oh_lock);
+	spin_lock_init(&cli->cl_read_page_hist.oh_lock);
+	spin_lock_init(&cli->cl_write_page_hist.oh_lock);
+	spin_lock_init(&cli->cl_read_offset_hist.oh_lock);
+	spin_lock_init(&cli->cl_write_offset_hist.oh_lock);
+
+	/* lru for osc. */
+	INIT_LIST_HEAD(&cli->cl_lru_osc);
+	atomic_set(&cli->cl_lru_shrinkers, 0);
+	atomic_set(&cli->cl_lru_busy, 0);
+	atomic_set(&cli->cl_lru_in_list, 0);
+	INIT_LIST_HEAD(&cli->cl_lru_list);
+	client_obd_list_lock_init(&cli->cl_lru_list_lock);
+
+	init_waitqueue_head(&cli->cl_destroy_waitq);
+	atomic_set(&cli->cl_destroy_in_flight, 0);
+	/* Turn on checksumming by default. */
+	cli->cl_checksum = 1;
+	/*
+	 * The supported checksum types will be worked out at connect time
+	 * Set cl_chksum* to CRC32 for now to avoid returning screwed info
+	 * through procfs.
+	 */
+	cli->cl_cksum_type = cli->cl_supp_cksum_types = OBD_CKSUM_CRC32;
+	atomic_set(&cli->cl_resends, OSC_DEFAULT_RESENDS);
+
+	/* This value may be reduced at connect time in
+	 * ptlrpc_connect_interpret() . We initialize it to only
+	 * 1MB until we know what the performance looks like.
+	 * In the future this should likely be increased. LU-1431 */
+	cli->cl_max_pages_per_rpc = min_t(int, PTLRPC_MAX_BRW_PAGES,
+					  LNET_MTU >> PAGE_CACHE_SHIFT);
+
+	if (!strcmp(name, LUSTRE_MDC_NAME)) {
+		cli->cl_max_rpcs_in_flight = MDC_MAX_RIF_DEFAULT;
+	} else if (num_physpages >> (20 - PAGE_CACHE_SHIFT) <= 128 /* MB */) {
+		cli->cl_max_rpcs_in_flight = 2;
+	} else if (num_physpages >> (20 - PAGE_CACHE_SHIFT) <= 256 /* MB */) {
+		cli->cl_max_rpcs_in_flight = 3;
+	} else if (num_physpages >> (20 - PAGE_CACHE_SHIFT) <= 512 /* MB */) {
+		cli->cl_max_rpcs_in_flight = 4;
+	} else {
+		if (osc_on_mdt(obddev->obd_name))
+			cli->cl_max_rpcs_in_flight = MDS_OSC_MAX_RIF_DEFAULT;
+		else
+			cli->cl_max_rpcs_in_flight = OSC_MAX_RIF_DEFAULT;
+	}
+	rc = ldlm_get_ref();
+	if (rc) {
+		CERROR("ldlm_get_ref failed: %d\n", rc);
+		GOTO(err, rc);
+	}
+
+	ptlrpc_init_client(rq_portal, rp_portal, name,
+			   &obddev->obd_ldlm_client);
+
+	imp = class_new_import(obddev);
+	if (imp == NULL)
+		GOTO(err_ldlm, rc = -ENOENT);
+	imp->imp_client = &obddev->obd_ldlm_client;
+	imp->imp_connect_op = connect_op;
+	memcpy(cli->cl_target_uuid.uuid, lustre_cfg_buf(lcfg, 1),
+	       LUSTRE_CFG_BUFLEN(lcfg, 1));
+	class_import_put(imp);
+
+	rc = client_import_add_conn(imp, &server_uuid, 1);
+	if (rc) {
+		CERROR("can't add initial connection\n");
+		GOTO(err_import, rc);
+	}
+
+	cli->cl_import = imp;
+	/* cli->cl_max_mds_{easize,cookiesize} updated by mdc_init_ea_size() */
+	cli->cl_max_mds_easize = sizeof(struct lov_mds_md_v3);
+	cli->cl_max_mds_cookiesize = sizeof(struct llog_cookie);
+
+	if (LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) {
+		if (!strcmp(lustre_cfg_string(lcfg, 3), "inactive")) {
+			CDEBUG(D_HA, "marking %s %s->%s as inactive\n",
+			       name, obddev->obd_name,
+			       cli->cl_target_uuid.uuid);
+			spin_lock(&imp->imp_lock);
+			imp->imp_deactive = 1;
+			spin_unlock(&imp->imp_lock);
+		}
+	}
+
+	obddev->obd_namespace = ldlm_namespace_new(obddev, obddev->obd_name,
+						   LDLM_NAMESPACE_CLIENT,
+						   LDLM_NAMESPACE_GREEDY,
+						   ns_type);
+	if (obddev->obd_namespace == NULL) {
+		CERROR("Unable to create client namespace - %s\n",
+		       obddev->obd_name);
+		/* The import is destroyed on the error path below; do not
+		 * leave cl_import pointing at it. */
+		cli->cl_import = NULL;
+		GOTO(err_import, rc = -ENOMEM);
+	}
+
+	cli->cl_qchk_stat = CL_NOT_QUOTACHECKED;
+
+	RETURN(rc);
+
+err_import:
+	class_destroy_import(imp);
+err_ldlm:
+	ldlm_put_ref();
+err:
+	RETURN(rc);
+}
+EXPORT_SYMBOL(client_obd_setup);
+
+/**
+ * Counterpart of client_obd_setup(): frees the client namespace and drops
+ * the LDLM reference taken at setup time. The import must already be gone,
+ * which is asserted (cl_import == NULL).
+ *
+ * \retval 0 always
+ */
+int client_obd_cleanup(struct obd_device *obddev)
+{
+	ENTRY;
+
+	ldlm_namespace_free_post(obddev->obd_namespace);
+	obddev->obd_namespace = NULL;
+
+	LASSERT(obddev->u.cli.cl_import == NULL);
+
+	ldlm_put_ref();
+	RETURN(0);
+}
+EXPORT_SYMBOL(client_obd_cleanup);
+
+/*
+ * ->o_connect() method for client side (OSC and MDC and MGC).
+ *
+ * Only one connection is allowed at a time (-EALREADY otherwise). *exp is
+ * NULL unless the connect succeeds. When \a data is given, it seeds the
+ * import's connect data and gets the resulting connect flags written back.
+ */
+int client_connect_import(const struct lu_env *env,
+			  struct obd_export **exp,
+			  struct obd_device *obd, struct obd_uuid *cluuid,
+			  struct obd_connect_data *data, void *localdata)
+{
+	struct lustre_handle	 conn = { 0 };
+	struct client_obd	*cli = &obd->u.cli;
+	struct obd_import	*imp = cli->cl_import;
+	struct obd_connect_data	*ocd;
+	int			 rc;
+	ENTRY;
+
+	*exp = NULL;
+	down_write(&cli->cl_sem);
+	if (cli->cl_conn_count > 0)
+		GOTO(out_sem, rc = -EALREADY);
+
+	rc = class_connect(&conn, obd, cluuid);
+	if (rc != 0)
+		GOTO(out_sem, rc);
+
+	cli->cl_conn_count++;
+	*exp = class_conn2export(&conn);
+
+	LASSERT(obd->obd_namespace);
+
+	imp->imp_dlm_handle = conn;
+	rc = ptlrpc_init_import(imp);
+	if (rc)
+		GOTO(out_ldlm, rc);
+
+	ocd = &imp->imp_connect_data;
+	if (data != NULL) {
+		*ocd = *data;
+		imp->imp_connect_flags_orig = data->ocd_connect_flags;
+	}
+
+	rc = ptlrpc_connect_import(imp);
+	if (rc) {
+		LASSERT(imp->imp_state == LUSTRE_IMP_DISCON);
+		GOTO(out_ldlm, rc);
+	}
+	LASSERT((*exp)->exp_connection);
+
+	if (data != NULL) {
+		/* hand the flags now held by the import back to the caller */
+		LASSERTF((ocd->ocd_connect_flags & data->ocd_connect_flags) ==
+			 ocd->ocd_connect_flags, "old "LPX64", new "LPX64"\n",
+			 data->ocd_connect_flags, ocd->ocd_connect_flags);
+		data->ocd_connect_flags = ocd->ocd_connect_flags;
+	}
+
+	ptlrpc_pinger_add_import(imp);
+
+	EXIT;
+	goto out_sem;
+
+out_ldlm:
+	/* undo class_connect() and the connection count bump */
+	cli->cl_conn_count--;
+	class_disconnect(*exp);
+	*exp = NULL;
+out_sem:
+	up_write(&cli->cl_sem);
+
+	return rc;
+}
+EXPORT_SYMBOL(client_connect_import);
+
+/*
+ * Drop one connection of a client export. When the last connection goes
+ * away the import is marked deactivated, removed from the pinger, unused
+ * locks are cancelled and the import is disconnected and invalidated.
+ * class_disconnect() is called on \a exp in every case once the obd is known.
+ *
+ * Returns 0 on success, -EINVAL for an export without obd or a device that
+ * is not connected, otherwise the first error seen while disconnecting.
+ */
+int client_disconnect_export(struct obd_export *exp)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+	struct client_obd *cli;
+	struct obd_import *imp;
+	int rc = 0, err;
+	ENTRY;
+
+	if (obd == NULL) {
+		CERROR("invalid export for disconnect: exp %p cookie "LPX64"\n",
+		       exp, exp ? exp->exp_handle.h_cookie : -1);
+		RETURN(-EINVAL);
+	}
+
+	cli = &obd->u.cli;
+	imp = cli->cl_import;
+
+	down_write(&cli->cl_sem);
+	CDEBUG(D_INFO, "disconnect %s - %d\n", obd->obd_name,
+	       cli->cl_conn_count);
+
+	if (cli->cl_conn_count == 0) {
+		CERROR("disconnecting disconnected device (%s)\n",
+		       obd->obd_name);
+		GOTO(out_disconnect, rc = -EINVAL);
+	}
+
+	/* only the last connection tears the import down */
+	if (--cli->cl_conn_count != 0)
+		GOTO(out_disconnect, rc = 0);
+
+	/* Mark import deactivated now, so we don't try to reconnect if any
+	 * of the cleanup RPCs fails (e.g. LDLM cancel, etc).  We don't
+	 * fully deactivate the import, or that would drop all requests. */
+	spin_lock(&imp->imp_lock);
+	imp->imp_deactive = 1;
+	spin_unlock(&imp->imp_lock);
+
+	/* Some non-replayable imports (MDS's OSCs) are pinged, so just
+	 * delete it regardless.  (It's safe to delete an import that was
+	 * never added.) */
+	(void)ptlrpc_pinger_del_import(imp);
+
+	if (obd->obd_namespace != NULL) {
+		/* obd_force == local only */
+		ldlm_cli_cancel_unused(obd->obd_namespace, NULL,
+				       obd->obd_force ? LCF_LOCAL : 0, NULL);
+		ldlm_namespace_free_prior(obd->obd_namespace, imp,
+					  obd->obd_force);
+	}
+
+	/* There's no need to hold sem while disconnecting an import,
+	 * and it may actually cause deadlock in GSS. */
+	up_write(&cli->cl_sem);
+	rc = ptlrpc_disconnect_import(imp, 0);
+	down_write(&cli->cl_sem);
+
+	ptlrpc_invalidate_import(imp);
+
+	EXIT;
+
+out_disconnect:
+	/* Use server style - class_disconnect should be always called for
+	 * o_disconnect. */
+	err = class_disconnect(exp);
+	if (rc == 0 && err != 0)
+		rc = err;
+
+	up_write(&cli->cl_sem);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(client_disconnect_export);
+
+
+/**
+ * Packs current SLV and Limit into \a req.
+ *
+ * Both values are reported as 0 when the request has no export, the export
+ * has no obd, or exp_connect_lru_resize() is false for the export.
+ *
+ * \retval 0 always
+ */
+int target_pack_pool_reply(struct ptlrpc_request *req)
+{
+	struct obd_export *exp = req->rq_export;
+	struct obd_device *obd;
+	ENTRY;
+
+	/* Check that we still have all structures alive as this may
+	 * be some late RPC at shutdown time. */
+	if (unlikely(exp == NULL || exp->exp_obd == NULL ||
+		     !exp_connect_lru_resize(exp))) {
+		lustre_msg_set_slv(req->rq_repmsg, 0);
+		lustre_msg_set_limit(req->rq_repmsg, 0);
+		RETURN(0);
+	}
+
+	/* OBD is alive here as export is alive, which we checked above. */
+	obd = exp->exp_obd;
+
+	/* read both values under the pool lock */
+	read_lock(&obd->obd_pool_lock);
+	lustre_msg_set_slv(req->rq_repmsg, obd->obd_pool_slv);
+	lustre_msg_set_limit(req->rq_repmsg, obd->obd_pool_limit);
+	read_unlock(&obd->obd_pool_lock);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(target_pack_pool_reply);
+
+/*
+ * Send the reply message for \a req: an error reply carrying \a rc when it
+ * is non-zero, a normal reply otherwise. When the OBD_FAIL check for
+ * \a fail_id triggers, the reply is dropped and -ECOMM is returned instead.
+ */
+int target_send_reply_msg(struct ptlrpc_request *req, int rc, int fail_id)
+{
+	if (OBD_FAIL_CHECK_ORSET(fail_id & ~OBD_FAIL_ONCE, OBD_FAIL_ONCE)) {
+		DEBUG_REQ(D_ERROR, req, "dropping reply");
+		return -ECOMM;
+	}
+
+	if (likely(rc == 0)) {
+		DEBUG_REQ(D_NET, req, "sending reply");
+		return ptlrpc_send_reply(req, PTLRPC_REPLY_MAYBE_DIFFICULT);
+	}
+
+	DEBUG_REQ(D_NET, req, "processing error (%d)", rc);
+	req->rq_status = rc;
+	return ptlrpc_send_error(req, 1);
+}
+
+/**
+ * Send the reply for \a req (an error reply if \a rc is set).
+ *
+ * Nothing is sent when rq_no_reply is set. A request without a "difficult"
+ * reply state is answered directly. Otherwise the reply state is linked on
+ * the export's exp_uncommitted_replies (when its transno is not committed
+ * yet) and exp_outstanding_replies lists before sending, and afterwards is
+ * either dispatched at once or parked on the service partition's
+ * scp_rep_active list.
+ */
+void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
+{
+	struct ptlrpc_service_part *svcpt;
+	struct ptlrpc_reply_state  *rs;
+	struct obd_export	   *exp;
+	int			    send_rc;
+	ENTRY;
+
+	if (req->rq_no_reply) {
+		EXIT;
+		return;
+	}
+
+	svcpt = req->rq_rqbd->rqbd_svcpt;
+	rs = req->rq_reply_state;
+	if (rs == NULL || !rs->rs_difficult) {
+		/* no notifiers */
+		target_send_reply_msg(req, rc, fail_id);
+		EXIT;
+		return;
+	}
+
+	/* must be an export if locks saved */
+	LASSERT(req->rq_export != NULL);
+	/* req/reply consistent */
+	LASSERT(rs->rs_svcpt == svcpt);
+
+	/* "fresh" reply */
+	LASSERT(!rs->rs_scheduled);
+	LASSERT(!rs->rs_scheduled_ever);
+	LASSERT(!rs->rs_handled);
+	LASSERT(!rs->rs_on_net);
+	LASSERT(rs->rs_export == NULL);
+	LASSERT(list_empty(&rs->rs_obd_list));
+	LASSERT(list_empty(&rs->rs_exp_list));
+
+	exp = class_export_get(req->rq_export);
+
+	/* disable reply scheduling while I'm setting up */
+	rs->rs_scheduled = 1;
+	rs->rs_on_net    = 1;
+	rs->rs_xid       = req->rq_xid;
+	rs->rs_transno   = req->rq_transno;
+	rs->rs_export    = exp;
+	rs->rs_opc       = lustre_msg_get_opc(req->rq_reqmsg);
+
+	spin_lock(&exp->exp_uncommitted_replies_lock);
+	CDEBUG(D_NET, "rs transno = "LPU64", last committed = "LPU64"\n",
+	       rs->rs_transno, exp->exp_last_committed);
+	/* not committed already */
+	if (rs->rs_transno > exp->exp_last_committed)
+		list_add_tail(&rs->rs_obd_list, &exp->exp_uncommitted_replies);
+	spin_unlock(&exp->exp_uncommitted_replies_lock);
+
+	spin_lock(&exp->exp_lock);
+	list_add_tail(&rs->rs_exp_list, &exp->exp_outstanding_replies);
+	spin_unlock(&exp->exp_lock);
+
+	send_rc = target_send_reply_msg(req, rc, fail_id);
+
+	spin_lock(&svcpt->scp_rep_lock);
+
+	atomic_inc(&svcpt->scp_nreps_difficult);
+
+	if (send_rc != 0) {
+		/* error sending: reply is off the net.  Also we need +1
+		 * reply ref until ptlrpc_handle_rs() is done
+		 * with the reply state (if the send was successful, there
+		 * would have been +1 ref for the net, which
+		 * reply_out_callback leaves alone) */
+		rs->rs_on_net = 0;
+		ptlrpc_rs_addref(rs);
+	}
+
+	spin_lock(&rs->rs_lock);
+	if (rs->rs_transno <= exp->exp_last_committed ||
+	    (!rs->rs_on_net && !rs->rs_no_ack) ||
+	    list_empty(&rs->rs_exp_list) ||     /* completed already */
+	    list_empty(&rs->rs_obd_list)) {
+		CDEBUG(D_HA, "Schedule reply immediately\n");
+		ptlrpc_dispatch_difficult_reply(rs);
+	} else {
+		list_add(&rs->rs_list, &svcpt->scp_rep_active);
+		rs->rs_scheduled = 0;	/* allow notifier to schedule */
+	}
+	spin_unlock(&rs->rs_lock);
+	spin_unlock(&svcpt->scp_rep_lock);
+	EXIT;
+}
+EXPORT_SYMBOL(target_send_reply);
+
+/* LCK_COMPAT_* value of every lock mode, indexed by the LCK_* mode itself. */
+ldlm_mode_t lck_compat_array[] = {
+	[LCK_EX]	= LCK_COMPAT_EX,
+	[LCK_PW]	= LCK_COMPAT_PW,
+	[LCK_PR]	= LCK_COMPAT_PR,
+	[LCK_CW]	= LCK_COMPAT_CW,
+	[LCK_CR]	= LCK_COMPAT_CR,
+	[LCK_NL]	= LCK_COMPAT_NL,
+	[LCK_GROUP]	= LCK_COMPAT_GROUP,
+	[LCK_COS]	= LCK_COMPAT_COS,
+};
+
+/**
+ * Rather arbitrary mapping from LDLM error codes to errno values. This should
+ * not escape to the user level.
+ *
+ * Values that are already negative are passed through unchanged; any other
+ * unrecognized code is logged and reported as -EPROTO.
+ *
+ * \param[in] error	LDLM result code to translate
+ * \retval		0 for ELDLM_OK, a negative errno otherwise
+ */
+int ldlm_error2errno(ldlm_error_t error)
+{
+	switch (error) {
+	case ELDLM_OK:
+		return 0;
+	case ELDLM_LOCK_CHANGED:
+		return -ESTALE;
+	case ELDLM_LOCK_ABORTED:
+		return -ENAVAIL;
+	case ELDLM_LOCK_REPLACED:
+		return -ESRCH;
+	case ELDLM_NO_LOCK_DATA:
+		return -ENOENT;
+	case ELDLM_NAMESPACE_EXISTS:
+		return -EEXIST;
+	case ELDLM_BAD_NAMESPACE:
+		return -EBADF;
+	default:
+		break;
+	}
+
+	/* Cast to a signed type, as ldlm_error_t can be unsigned: negative
+	 * values are handed back as they are. */
+	if ((int)error < 0)
+		return error;
+
+	CERROR("Invalid DLM result code: %d\n", error);
+	return -EPROTO;
+}
+EXPORT_SYMBOL(ldlm_error2errno);
+
+/**
+ * Dual to ldlm_error2errno(): maps errno values back to ldlm_error_t.
+ *
+ * Only the values ldlm_error2errno() produces for the ELDLM_* codes are
+ * translated; everything else is returned unchanged.
+ *
+ * \param[in] err_no	value to translate
+ * \retval		the corresponding ldlm_error_t value
+ */
+ldlm_error_t ldlm_errno2error(int err_no)
+{
+	switch (err_no) {
+	case 0:
+		return ELDLM_OK;
+	case -ESTALE:
+		return ELDLM_LOCK_CHANGED;
+	case -ENAVAIL:
+		return ELDLM_LOCK_ABORTED;
+	case -ESRCH:
+		return ELDLM_LOCK_REPLACED;
+	case -ENOENT:
+		return ELDLM_NO_LOCK_DATA;
+	case -EEXIST:
+		return ELDLM_NAMESPACE_EXISTS;
+	case -EBADF:
+		return ELDLM_BAD_NAMESPACE;
+	default:
+		/* no dedicated LDLM code for this value */
+		break;
+	}
+
+	return err_no;
+}
+EXPORT_SYMBOL(ldlm_errno2error);
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+/*
+ * Log every lock still linked on \a exp's exp_locks_list. The list is walked
+ * under exp_locks_list_guard; nothing is printed when it is empty.
+ */
+void ldlm_dump_export_locks(struct obd_export *exp)
+{
+	struct ldlm_lock *lock;
+
+	spin_lock(&exp->exp_locks_list_guard);
+	if (list_empty(&exp->exp_locks_list))
+		goto out;
+
+	CERROR("dumping locks for export %p,"
+	       "ignore if the unmount doesn't hang\n", exp);
+	list_for_each_entry(lock, &exp->exp_locks_list, l_exp_refs_link)
+		LDLM_ERROR(lock, "lock:");
+out:
+	spin_unlock(&exp->exp_locks_list_guard);
+}
+#endif
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c
new file mode 100644
index 000000000000..bd39e1c78527
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c
@@ -0,0 +1,2443 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_lock.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/lustre_intent.h>
+
+#include <obd_class.h>
+#include "ldlm_internal.h"
+
+/* Printable names of the lock modes, indexed by LCK_* mode (0 gives "--"). */
+char *ldlm_lockname[] = {
+	[0]		= "--",
+	[LCK_EX]	= "EX",
+	[LCK_PW]	= "PW",
+	[LCK_PR]	= "PR",
+	[LCK_CW]	= "CW",
+	[LCK_CR]	= "CR",
+	[LCK_NL]	= "NL",
+	[LCK_GROUP]	= "GROUP",
+	[LCK_COS]	= "COS"
+};
+EXPORT_SYMBOL(ldlm_lockname);
+
+/* Printable names of the lock types, indexed by the LDLM_* lock type. */
+char *ldlm_typename[] = {
+	[LDLM_PLAIN]	= "PLN",
+	[LDLM_EXTENT]	= "EXT",
+	[LDLM_FLOCK]	= "FLK",
+	[LDLM_IBITS]	= "IBT",
+};
+EXPORT_SYMBOL(ldlm_typename);
+
+/*
+ * Wire-to-local policy converters, indexed by (lock type - LDLM_MIN_TYPE).
+ * Used by ldlm_convert_policy_to_local() for exports that do not have
+ * OBD_CONNECT_FULL20 set.
+ */
+static ldlm_policy_wire_to_local_t ldlm_policy_wire18_to_local[] = {
+	[LDLM_PLAIN - LDLM_MIN_TYPE]  = ldlm_plain_policy_wire_to_local,
+	[LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_wire_to_local,
+	[LDLM_FLOCK - LDLM_MIN_TYPE]  = ldlm_flock_policy_wire18_to_local,
+	[LDLM_IBITS - LDLM_MIN_TYPE]  = ldlm_ibits_policy_wire_to_local,
+};
+
+/*
+ * Wire-to-local policy converters, indexed by (lock type - LDLM_MIN_TYPE).
+ * Used by ldlm_convert_policy_to_local() for exports that have
+ * OBD_CONNECT_FULL20 set.
+ */
+static ldlm_policy_wire_to_local_t ldlm_policy_wire21_to_local[] = {
+	[LDLM_PLAIN - LDLM_MIN_TYPE]  = ldlm_plain_policy_wire_to_local,
+	[LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_wire_to_local,
+	[LDLM_FLOCK - LDLM_MIN_TYPE]  = ldlm_flock_policy_wire21_to_local,
+	[LDLM_IBITS - LDLM_MIN_TYPE]  = ldlm_ibits_policy_wire_to_local,
+};
+
+/*
+ * Local-to-wire policy converters, indexed by (lock type - LDLM_MIN_TYPE).
+ * Used by ldlm_convert_policy_to_wire().
+ */
+static ldlm_policy_local_to_wire_t ldlm_policy_local_to_wire[] = {
+	[LDLM_PLAIN - LDLM_MIN_TYPE]  = ldlm_plain_policy_local_to_wire,
+	[LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_local_to_wire,
+	[LDLM_FLOCK - LDLM_MIN_TYPE]  = ldlm_flock_policy_local_to_wire,
+	[LDLM_IBITS - LDLM_MIN_TYPE]  = ldlm_ibits_policy_local_to_wire,
+};
+
+/**
+ * Converts lock policy from local format to on the wire lock_desc format,
+ * using the converter stored for \a type in ldlm_policy_local_to_wire.
+ *
+ * NOTE: \a type is not range-checked before it indexes the table.
+ */
+void ldlm_convert_policy_to_wire(ldlm_type_t type,
+				 const ldlm_policy_data_t *lpolicy,
+				 ldlm_wire_policy_data_t *wpolicy)
+{
+	/* the converter tables start at LDLM_MIN_TYPE */
+	int idx = type - LDLM_MIN_TYPE;
+
+	ldlm_policy_local_to_wire[idx](lpolicy, wpolicy);
+}
+
+/**
+ * Converts lock policy from on the wire lock_desc format to local format.
+ *
+ * The converter table is picked by whether \a exp has OBD_CONNECT_FULL20 in
+ * its connect flags (wire21 table) or not (wire18 table).
+ *
+ * NOTE: \a type is not range-checked before it indexes the table.
+ */
+void ldlm_convert_policy_to_local(struct obd_export *exp, ldlm_type_t type,
+				  const ldlm_wire_policy_data_t *wpolicy,
+				  ldlm_policy_data_t *lpolicy)
+{
+	ldlm_policy_wire_to_local_t *table;
+
+	/** some badness for 2.0.0 clients, but 2.0.0 isn't supported */
+	if (exp_connect_flags(exp) & OBD_CONNECT_FULL20)
+		table = ldlm_policy_wire21_to_local;
+	else
+		table = ldlm_policy_wire18_to_local;
+
+	table[type - LDLM_MIN_TYPE](wpolicy, lpolicy);
+}
+
+/*
+ * Returns a printable name for intent \a it. Values that match none of the
+ * known IT_* intents are logged and reported as "UNKNOWN".
+ */
+char *ldlm_it2str(int it)
+{
+	if (it == IT_OPEN)
+		return "open";
+	if (it == IT_CREAT)
+		return "creat";
+	if (it == (IT_OPEN | IT_CREAT))
+		return "open|creat";
+	if (it == IT_READDIR)
+		return "readdir";
+	if (it == IT_GETATTR)
+		return "getattr";
+	if (it == IT_LOOKUP)
+		return "lookup";
+	if (it == IT_UNLINK)
+		return "unlink";
+	if (it == IT_GETXATTR)
+		return "getxattr";
+	if (it == IT_LAYOUT)
+		return "layout";
+
+	CERROR("Unknown intent %d\n", it);
+	return "UNKNOWN";
+}
+EXPORT_SYMBOL(ldlm_it2str);
+
+extern struct kmem_cache *ldlm_lock_slab;
+
+
+/**
+ * Installs \a arg as the policy callback (ns_policy) of namespace \a ns,
+ * replacing whatever was set before.
+ */
+void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg)
+{
+	ns->ns_policy = arg;
+}
+EXPORT_SYMBOL(ldlm_register_intent);
+
+/*
+ * REFCOUNTED LOCK OBJECTS
+ */
+
+
+/**
+ * Get a reference on a lock.
+ *
+ * Lock refcounts, during creation:
+ *   - one special one for allocation, dec'd only once in destroy
+ *   - one for being a lock that's in-use
+ *   - one for the addref associated with a new lock
+ *
+ * NOTE(review): ldlm_lock_new() in this file starts l_refc at 2, not 3;
+ * the list above may be out of date - confirm.
+ *
+ * \retval \a lock itself, with l_refc incremented
+ */
+struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock)
+{
+	atomic_inc(&lock->l_refc);
+	return lock;
+}
+EXPORT_SYMBOL(ldlm_lock_get);
+
+/**
+ * Release lock reference.
+ *
+ * Also frees the lock if it was last reference. By then the lock has to be
+ * destroyed already and be off its resource and pending lists (asserted).
+ */
+void ldlm_lock_put(struct ldlm_lock *lock)
+{
+	struct ldlm_resource *res;
+	ENTRY;
+
+	LASSERT(lock->l_resource != LP_POISON);
+	LASSERT(atomic_read(&lock->l_refc) > 0);
+	if (!atomic_dec_and_test(&lock->l_refc)) {
+		/* other references remain */
+		EXIT;
+		return;
+	}
+
+	LDLM_DEBUG(lock, "final lock_put on destroyed lock, freeing it.");
+
+	res = lock->l_resource;
+	LASSERT(lock->l_destroyed);
+	LASSERT(list_empty(&lock->l_res_link));
+	LASSERT(list_empty(&lock->l_pending_chain));
+
+	/* give up the resource reference held by the lock */
+	lprocfs_counter_decr(ldlm_res_to_ns(res)->ns_stats, LDLM_NSS_LOCKS);
+	lu_ref_del(&res->lr_reference, "lock", lock);
+	ldlm_resource_putref(res);
+	lock->l_resource = NULL;
+
+	if (lock->l_export != NULL) {
+		class_export_lock_put(lock->l_export, lock);
+		lock->l_export = NULL;
+	}
+
+	if (lock->l_lvb_data != NULL)
+		OBD_FREE(lock->l_lvb_data, lock->l_lvb_len);
+
+	ldlm_interval_free(ldlm_interval_detach(lock));
+	lu_ref_fini(&lock->l_reference);
+	OBD_FREE_RCU(lock, sizeof(*lock), &lock->l_handle);
+
+	EXIT;
+}
+EXPORT_SYMBOL(ldlm_lock_put);
+
+/**
+ * Removes LDLM lock \a lock from LRU. Assumes LRU is already locked.
+ *
+ * \retval 1	the lock was on the LRU and has been unlinked
+ * \retval 0	the lock was not on the LRU, nothing was done
+ */
+int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock)
+{
+	struct ldlm_namespace *ns;
+
+	if (list_empty(&lock->l_lru))
+		return 0;
+
+	ns = ldlm_lock_to_ns(lock);
+
+	LASSERT(lock->l_resource->lr_type != LDLM_FLOCK);
+	list_del_init(&lock->l_lru);
+	if (lock->l_flags & LDLM_FL_SKIPPED)
+		lock->l_flags &= ~LDLM_FL_SKIPPED;
+
+	/* keep the namespace's count of unused locks in step */
+	LASSERT(ns->ns_nr_unused > 0);
+	ns->ns_nr_unused--;
+
+	return 1;
+}
+
+/**
+ * Removes LDLM lock \a lock from LRU. Obtains the LRU lock first.
+ *
+ * Locks with l_ns_srv set are asserted not to be on any LRU and are left
+ * alone.
+ *
+ * \retval 1	the lock was on the LRU and has been unlinked
+ * \retval 0	otherwise
+ */
+int ldlm_lock_remove_from_lru(struct ldlm_lock *lock)
+{
+	struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+	int removed;
+	ENTRY;
+
+	if (lock->l_ns_srv) {
+		LASSERT(list_empty(&lock->l_lru));
+		RETURN(0);
+	}
+
+	spin_lock(&ns->ns_lock);
+	removed = ldlm_lock_remove_from_lru_nolock(lock);
+	spin_unlock(&ns->ns_lock);
+
+	EXIT;
+	return removed;
+}
+
+/**
+ * Adds LDLM lock \a lock to namespace LRU. Assumes LRU is already locked.
+ *
+ * The lock is stamped with the current time (l_last_used) and appended to
+ * the tail of ns_unused_list. It must not be on an LRU already and must not
+ * be a flock lock; both conditions are asserted.
+ */
+void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock)
+{
+	struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+	lock->l_last_used = cfs_time_current();
+	LASSERT(list_empty(&lock->l_lru));
+	LASSERT(lock->l_resource->lr_type != LDLM_FLOCK);
+	list_add_tail(&lock->l_lru, &ns->ns_unused_list);
+	/* keep the namespace's count of unused locks in step */
+	LASSERT(ns->ns_nr_unused >= 0);
+	ns->ns_nr_unused++;
+}
+
+/**
+ * Adds LDLM lock \a lock to namespace LRU. Obtains necessary LRU locks
+ * first, i.e. wraps ldlm_lock_add_to_lru_nolock() in the namespace's
+ * ns_lock.
+ */
+void ldlm_lock_add_to_lru(struct ldlm_lock *lock)
+{
+	struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+	ENTRY;
+	spin_lock(&ns->ns_lock);
+	ldlm_lock_add_to_lru_nolock(lock);
+	spin_unlock(&ns->ns_lock);
+	EXIT;
+}
+
+/**
+ * Moves LDLM lock \a lock that is already in namespace LRU to the tail of
+ * the LRU. Performs necessary LRU locking
+ *
+ * Nothing happens for locks with l_ns_srv set (asserted to be off the LRU)
+ * or for locks that are not on the LRU at the moment.
+ */
+void ldlm_lock_touch_in_lru(struct ldlm_lock *lock)
+{
+	struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+	ENTRY;
+	if (lock->l_ns_srv) {
+		LASSERT(list_empty(&lock->l_lru));
+		EXIT;
+		return;
+	}
+
+	spin_lock(&ns->ns_lock);
+	if (!list_empty(&lock->l_lru)) {
+		/* re-adding also refreshes l_last_used */
+		ldlm_lock_remove_from_lru_nolock(lock);
+		ldlm_lock_add_to_lru_nolock(lock);
+	}
+	spin_unlock(&ns->ns_lock);
+	EXIT;
+}
+
+/**
+ * Helper to destroy a locked lock.
+ *
+ * Used by ldlm_lock_destroy and ldlm_lock_destroy_nolock
+ * Must be called with l_lock and lr_lock held.
+ *
+ * Does not actually free the lock data, but rather marks the lock as
+ * destroyed by setting l_destroyed field in the lock to 1.  Destroys a
+ * handle->lock association too, so that the lock can no longer be found
+ * and removes the lock from LRU list.  Actual lock freeing occurs when
+ * last lock reference goes away.
+ *
+ * Original comment (of some historical value):
+ * This used to have a 'strict' flag, which recovery would use to mark an
+ * in-use lock as needing-to-die.  Lest I am ever tempted to put it back, I
+ * shall explain why it's gone: with the new hash table scheme, once you call
+ * ldlm_lock_destroy, you can never drop your final references on this lock.
+ * Because it's not in the hash table anymore.  -phil
+ *
+ * \retval 1	this was the first destroy of \a lock; the callers then drop
+ *		the "hash" reference
+ * \retval 0	\a lock had been destroyed before, nothing was done
+ */
+int ldlm_lock_destroy_internal(struct ldlm_lock *lock)
+{
+	ENTRY;
+
+	/* A lock that still has users or is still linked on its resource
+	 * must never get here. */
+	if (lock->l_readers || lock->l_writers) {
+		LDLM_ERROR(lock, "lock still has references");
+		LBUG();
+	}
+
+	if (!list_empty(&lock->l_res_link)) {
+		LDLM_ERROR(lock, "lock still on resource");
+		LBUG();
+	}
+
+	/* Only the first destroy does any work. */
+	if (lock->l_destroyed) {
+		LASSERT(list_empty(&lock->l_lru));
+		EXIT;
+		return 0;
+	}
+	lock->l_destroyed = 1;
+
+	if (lock->l_export && lock->l_export->exp_lock_hash) {
+		/* NB: it's safe to call cfs_hash_del() even lock isn't
+		 * in exp_lock_hash. */
+		/* In the function below, .hs_keycmp resolves to
+		 * ldlm_export_lock_keycmp() */
+		/* coverity[overrun-buffer-val] */
+		cfs_hash_del(lock->l_export->exp_lock_hash,
+			     &lock->l_remote_handle, &lock->l_exp_hash);
+	}
+
+	/* Make the lock unreachable through the LRU and through its handle. */
+	ldlm_lock_remove_from_lru(lock);
+	class_handle_unhash(&lock->l_handle);
+
+	EXIT;
+	return 1;
+}
+
+/**
+ * Destroys a LDLM lock \a lock. Performs necessary locking first.
+ *
+ * The "hash" reference is released only by the call that actually destroyed
+ * the lock; repeated calls do not drop it again.
+ */
+void ldlm_lock_destroy(struct ldlm_lock *lock)
+{
+	int was_live;
+	ENTRY;
+
+	lock_res_and_lock(lock);
+	was_live = ldlm_lock_destroy_internal(lock);
+	unlock_res_and_lock(lock);
+
+	if (was_live) {
+		/* drop reference from hashtable only for first destroy */
+		lu_ref_del(&lock->l_reference, "hash", lock);
+		LDLM_LOCK_RELEASE(lock);
+	}
+	EXIT;
+}
+
+/**
+ * Destroys a LDLM lock \a lock that is already locked.
+ *
+ * Same as ldlm_lock_destroy() except that no locks are taken here.
+ */
+void ldlm_lock_destroy_nolock(struct ldlm_lock *lock)
+{
+	ENTRY;
+
+	/* drop reference from hashtable only for first destroy */
+	if (ldlm_lock_destroy_internal(lock)) {
+		lu_ref_del(&lock->l_reference, "hash", lock);
+		LDLM_LOCK_RELEASE(lock);
+	}
+
+	EXIT;
+}
+
+/*
+ * hop_addref callback of lock_handle_ops: takes a reference on the lock
+ * behind a handle.
+ * this is called by portals_handle2object with the handle lock taken
+ */
+static void lock_handle_addref(void *lock)
+{
+	LDLM_LOCK_GET((struct ldlm_lock *)lock);
+}
+
+/*
+ * hop_free callback of lock_handle_ops: gives \a lock back to
+ * ldlm_lock_slab. \a size has to be sizeof(struct ldlm_lock).
+ */
+static void lock_handle_free(void *lock, int size)
+{
+	LASSERT(size == sizeof(struct ldlm_lock));
+	OBD_SLAB_FREE(lock, ldlm_lock_slab, size);
+}
+
+/* Handle operations registered for every lock by ldlm_lock_new(). */
+struct portals_handle_ops lock_handle_ops = {
+	.hop_addref = lock_handle_addref,
+	.hop_free   = lock_handle_free,
+};
+
+/**
+ * Allocate and initialize new lock structure.
+ *
+ * usage: pass in a resource on which you have done ldlm_resource_get
+ *	new lock will take over the refcount.
+ * returns: lock with refcount 2 - one for current caller and one for remote,
+ *	or NULL when the allocation from ldlm_lock_slab fails.
+ */
+static struct ldlm_lock *ldlm_lock_new(struct ldlm_resource *resource)
+{
+	struct ldlm_lock *lock;
+	ENTRY;
+
+	if (resource == NULL)
+		LBUG();
+
+	OBD_SLAB_ALLOC_PTR_GFP(lock, ldlm_lock_slab, __GFP_IO);
+	if (lock == NULL)
+		RETURN(NULL);
+
+	spin_lock_init(&lock->l_lock);
+	lock->l_resource = resource;
+	lu_ref_add(&resource->lr_reference, "lock", lock);
+
+	atomic_set(&lock->l_refc, 2);
+	lock->l_blocking_lock = NULL;
+	init_waitqueue_head(&lock->l_waitq);
+
+	/* every list linkage starts out empty */
+	INIT_LIST_HEAD(&lock->l_res_link);
+	INIT_LIST_HEAD(&lock->l_lru);
+	INIT_LIST_HEAD(&lock->l_pending_chain);
+	INIT_LIST_HEAD(&lock->l_bl_ast);
+	INIT_LIST_HEAD(&lock->l_cp_ast);
+	INIT_LIST_HEAD(&lock->l_rk_ast);
+	INIT_LIST_HEAD(&lock->l_sl_mode);
+	INIT_LIST_HEAD(&lock->l_sl_policy);
+	INIT_HLIST_NODE(&lock->l_exp_hash);
+	INIT_HLIST_NODE(&lock->l_exp_flock_hash);
+
+	lprocfs_counter_incr(ldlm_res_to_ns(resource)->ns_stats,
+			     LDLM_NSS_LOCKS);
+
+	/* register the handle through which the lock can be looked up */
+	INIT_LIST_HEAD(&lock->l_handle.h_link);
+	class_handle_hash(&lock->l_handle, &lock_handle_ops);
+
+	lu_ref_init(&lock->l_reference);
+	lu_ref_add(&lock->l_reference, "hash", lock);
+	lock->l_callback_timeout = 0;
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+	INIT_LIST_HEAD(&lock->l_exp_refs_link);
+	lock->l_exp_refs_nr = 0;
+	lock->l_exp_refs_target = NULL;
+#endif
+	INIT_LIST_HEAD(&lock->l_exp_list);
+
+	RETURN(lock);
+}
+
+/**
+ * Moves LDLM lock \a lock to another resource.
+ * This is used on client when server returns some other lock than requested
+ * (typically as a result of intent operation)
+ *
+ * Only valid for client namespaces, and only while the lock is not linked
+ * on its resource (both asserted).
+ *
+ * \retval 0		the lock now lives on \a new_resid, or did so already
+ * \retval -ENOMEM	the new resource could not be obtained
+ */
+int ldlm_lock_change_resource(struct ldlm_namespace *ns, struct ldlm_lock *lock,
+			      const struct ldlm_res_id *new_resid)
+{
+	struct ldlm_resource *oldres = lock->l_resource;
+	struct ldlm_resource *newres;
+	int type;
+	ENTRY;
+
+	LASSERT(ns_is_client(ns));
+
+	lock_res_and_lock(lock);
+	if (memcmp(new_resid, &lock->l_resource->lr_name,
+		   sizeof(lock->l_resource->lr_name)) == 0) {
+		/* Nothing to do */
+		unlock_res_and_lock(lock);
+		RETURN(0);
+	}
+
+	LASSERT(new_resid->name[0] != 0);
+
+	/* This function assumes that the lock isn't on any lists */
+	LASSERT(list_empty(&lock->l_res_link));
+
+	type = oldres->lr_type;
+	/* The locks are dropped while the new resource is looked up; oldres
+	 * is read again from the lock once they have been retaken below. */
+	unlock_res_and_lock(lock);
+
+	newres = ldlm_resource_get(ns, NULL, new_resid, type, 1);
+	if (newres == NULL)
+		RETURN(-ENOMEM);
+
+	lu_ref_add(&newres->lr_reference, "lock", lock);
+	/*
+	 * To flip the lock from the old to the new resource, lock, oldres and
+	 * newres have to be locked. Resource spin-locks are nested within
+	 * lock->l_lock, and are taken in the memory address order to avoid
+	 * dead-locks.
+	 */
+	spin_lock(&lock->l_lock);
+	oldres = lock->l_resource;
+	if (oldres < newres) {
+		lock_res(oldres);
+		lock_res_nested(newres, LRT_NEW);
+	} else {
+		lock_res(newres);
+		lock_res_nested(oldres, LRT_NEW);
+	}
+	LASSERT(memcmp(new_resid, &oldres->lr_name,
+		       sizeof oldres->lr_name) != 0);
+	lock->l_resource = newres;
+	unlock_res(oldres);
+	/* NOTE(review): presumably releases newres (now lock->l_resource)
+	 * together with l_lock - confirm against unlock_res_and_lock(). */
+	unlock_res_and_lock(lock);
+
+	/* ...and the flowers are still standing! */
+	/* the lock no longer refers to oldres: drop the reference it held */
+	lu_ref_del(&oldres->lr_reference, "lock", lock);
+	ldlm_resource_putref(oldres);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_lock_change_resource);
+
+/** \defgroup ldlm_handles LDLM HANDLES
+ * Ways to get hold of locks without any addresses.
+ * @{
+ */
+
+/**
+ * Fills in handle for LDLM lock \a lock into supplied \a lockh
+ * Does not take any references.
+ *
+ * Only the cookie of \a lockh is written; it is copied from the lock's
+ * l_handle.
+ */
+void ldlm_lock2handle(const struct ldlm_lock *lock, struct lustre_handle *lockh)
+{
+	lockh->cookie = lock->l_handle.h_cookie;
+}
+EXPORT_SYMBOL(ldlm_lock2handle);
+
+/**
+ * Obtain a lock reference by handle.
+ *
+ * if \a flags: atomically get the lock and set the flags.
+ *	      Return NULL if flag already set
+ *
+ * NULL is also returned when the handle does not resolve to a lock or when
+ * the lock turns out to be destroyed.
+ */
+struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *handle,
+				     __u64 flags)
+{
+	struct ldlm_lock *lock;
+	ENTRY;
+
+	LASSERT(handle);
+
+	lock = class_handle2object(handle->cookie);
+	if (lock == NULL)
+		RETURN(NULL);
+
+	/* It's unlikely but possible that someone marked the lock as
+	 * destroyed after we did handle2object on it */
+	if (flags == 0 && !lock->l_destroyed) {
+		lu_ref_add(&lock->l_reference, "handle", current);
+		RETURN(lock);
+	}
+
+	lock_res_and_lock(lock);
+
+	LASSERT(lock->l_resource != NULL);
+
+	lu_ref_add_atomic(&lock->l_reference, "handle", current);
+	if (unlikely(lock->l_destroyed)) {
+		unlock_res_and_lock(lock);
+		CDEBUG(D_INFO, "lock already destroyed: lock %p\n", lock);
+		LDLM_LOCK_PUT(lock);
+		RETURN(NULL);
+	}
+
+	if (flags != 0) {
+		if (lock->l_flags & flags) {
+			/* at least one of the requested flags is set already */
+			unlock_res_and_lock(lock);
+			LDLM_LOCK_PUT(lock);
+			RETURN(NULL);
+		}
+		lock->l_flags |= flags;
+	}
+
+	unlock_res_and_lock(lock);
+	RETURN(lock);
+}
+EXPORT_SYMBOL(__ldlm_handle2lock);
+/** @} ldlm_handles */
+
+/**
+ * Build the "on the wire" lock descriptor \a desc for LDLM lock \a lock.
+ *
+ * When \a lock is an inodebits lock and its export did not negotiate
+ * OBD_CONNECT_IBITS, the descriptor is rewritten as a plain lock with CR/CW
+ * modes mapped to PR; otherwise resource, modes and policy are copied
+ * through as they are.
+ */
+void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc)
+{
+	struct obd_export *exp = lock->l_export ?: lock->l_conn_export;
+
+	if (lock->l_resource->lr_type != LDLM_IBITS || exp == NULL ||
+	    (exp_connect_flags(exp) & OBD_CONNECT_IBITS)) {
+		/* Common case: no interoperability conversion needed. */
+		ldlm_res2desc(lock->l_resource, &desc->l_resource);
+		desc->l_req_mode = lock->l_req_mode;
+		desc->l_granted_mode = lock->l_granted_mode;
+		ldlm_convert_policy_to_wire(lock->l_resource->lr_type,
+					    &lock->l_policy_data,
+					    &desc->l_policy_data);
+		return;
+	}
+
+	/* INODEBITS_INTEROP: the other side does not support inodebits, so
+	 * reply with a plain lock descriptor.  Make sure all the right bits
+	 * are set in the lock we are about to describe. */
+	LASSERTF(lock->l_policy_data.l_inodebits.bits ==
+		 (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE |
+		  MDS_INODELOCK_LAYOUT),
+		 "Inappropriate inode lock bits during "
+		 "conversion " LPU64 "\n",
+		 lock->l_policy_data.l_inodebits.bits);
+
+	ldlm_res2desc(lock->l_resource, &desc->l_resource);
+	desc->l_resource.lr_type = LDLM_PLAIN;
+
+	/* Translate "new" lock modes into ones an old peer understands. */
+	switch (lock->l_req_mode) {
+	case LCK_CR:
+	case LCK_CW:
+		desc->l_req_mode = LCK_PR;
+		break;
+	default:
+		desc->l_req_mode = lock->l_req_mode;
+		break;
+	}
+
+	switch (lock->l_granted_mode) {
+	case LCK_CR:
+	case LCK_CW:
+		desc->l_granted_mode = LCK_PR;
+		break;
+	default:
+		/* We never grant PW/EX locks to clients */
+		LASSERT((lock->l_granted_mode != LCK_PW) &&
+			(lock->l_granted_mode != LCK_EX));
+		desc->l_granted_mode = lock->l_granted_mode;
+		break;
+	}
+
+	/* The policy is deliberately not copied: plain locks have none. */
+}
+EXPORT_SYMBOL(ldlm_lock2desc);
+
+/**
+ * Queue \a lock on \a work_list so that a blocking AST is sent for it on
+ * behalf of the conflicting lock \a new.
+ *
+ * Nothing happens if LDLM_FL_AST_SENT is already set on \a lock.  Otherwise
+ * the flag is set, a reference is taken on both locks and \a new is recorded
+ * in l_blocking_lock.
+ */
+void ldlm_add_bl_work_item(struct ldlm_lock *lock, struct ldlm_lock *new,
+			   struct list_head *work_list)
+{
+	/* a blocking AST is queued at most once per lock */
+	if ((lock->l_flags & LDLM_FL_AST_SENT) != 0)
+		return;
+
+	LDLM_DEBUG(lock, "lock incompatible; sending blocking AST.");
+	lock->l_flags |= LDLM_FL_AST_SENT;
+
+	/* If the enqueuing client said so, tell the AST recipient to
+	 * discard dirty data, rather than writing back. */
+	if (new->l_flags & LDLM_AST_DISCARD_DATA)
+		lock->l_flags |= LDLM_FL_DISCARD_DATA;
+
+	LASSERT(list_empty(&lock->l_bl_ast));
+	list_add(&lock->l_bl_ast, work_list);
+	LDLM_LOCK_GET(lock);
+
+	LASSERT(lock->l_blocking_lock == NULL);
+	lock->l_blocking_lock = LDLM_LOCK_GET(new);
+}
+
+/**
+ * Queue the just granted \a lock on \a work_list so that a completion AST
+ * is sent for it.
+ *
+ * Nothing happens if LDLM_FL_CP_REQD is already set; otherwise the flag is
+ * set and a lock reference is taken for the list.
+ */
+void ldlm_add_cp_work_item(struct ldlm_lock *lock, struct list_head *work_list)
+{
+	/* already queued for a completion AST */
+	if ((lock->l_flags & LDLM_FL_CP_REQD) != 0)
+		return;
+
+	lock->l_flags |= LDLM_FL_CP_REQD;
+	LDLM_DEBUG(lock, "lock granted; sending completion AST.");
+
+	LASSERT(list_empty(&lock->l_cp_ast));
+	list_add(&lock->l_cp_ast, work_list);
+	LDLM_LOCK_GET(lock);
+}
+
+/**
+ * Dispatch an AST work item for \a lock onto \a work_list.
+ *
+ * A non-NULL \a new means \a lock conflicts with it and a blocking AST is
+ * queued; a NULL \a new queues a completion AST instead.
+ * Must be called with lr_lock held.
+ */
+void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new,
+			    struct list_head *work_list)
+{
+	ENTRY;
+	check_res_locked(lock->l_resource);
+	if (new == NULL)
+		ldlm_add_cp_work_item(lock, work_list);
+	else
+		ldlm_add_bl_work_item(lock, new, work_list);
+	EXIT;
+}
+
+/**
+ * Add specified reader/writer reference to LDLM lock with handle \a lockh.
+ * r/w reference type is determined by \a mode
+ * Calls ldlm_lock_addref_internal.
+ *
+ * The handle must resolve to a lock: a failed lookup trips the LASSERT.
+ */
+void ldlm_lock_addref(struct lustre_handle *lockh, __u32 mode)
+{
+	struct ldlm_lock *lock;
+
+	lock = ldlm_handle2lock(lockh);
+	LASSERT(lock != NULL);
+	ldlm_lock_addref_internal(lock, mode);
+	/* presumably balances the reference from the handle lookup; the
+	 * r/w reference added above is kept */
+	LDLM_LOCK_PUT(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_addref);
+
+/**
+ * Helper: add a reader and/or writer reference to LDLM lock \a lock.
+ *
+ * \a mode selects the counter: NL/CR/PR bump l_readers, while
+ * EX/CW/PW/GROUP/COS bump l_writers.  The lock is first removed from the
+ * LRU and an extra lock reference is taken on behalf of the user.
+ * Assumes the LDLM lock is already locked.
+ */
+void ldlm_lock_addref_internal_nolock(struct ldlm_lock *lock, __u32 mode)
+{
+	const int as_reader = (mode & (LCK_NL | LCK_CR | LCK_PR)) != 0;
+	const int as_writer = (mode & (LCK_EX | LCK_CW | LCK_PW |
+				       LCK_GROUP | LCK_COS)) != 0;
+
+	ldlm_lock_remove_from_lru(lock);
+
+	if (as_reader) {
+		lock->l_readers++;
+		lu_ref_add_atomic(&lock->l_reference, "reader", lock);
+	}
+
+	if (as_writer) {
+		lock->l_writers++;
+		lu_ref_add_atomic(&lock->l_reference, "writer", lock);
+	}
+
+	/* released again by ldlm_lock_decref_internal_nolock() */
+	LDLM_LOCK_GET(lock);
+	lu_ref_add_atomic(&lock->l_reference, "user", lock);
+	LDLM_DEBUG(lock, "ldlm_lock_addref(%s)", ldlm_lockname[mode]);
+}
+
+/**
+ * Try to add a reader/writer reference to the lock with handle \a lockh.
+ *
+ * The reference is refused when the handle does not resolve to a lock, or
+ * when the lock has LDLM_FL_CBPENDING set and no readers or writers.
+ *
+ * \retval 0 success, lock was addref-ed
+ *
+ * \retval -EAGAIN lock is being canceled (or could not be looked up).
+ */
+int ldlm_lock_addref_try(struct lustre_handle *lockh, __u32 mode)
+{
+	struct ldlm_lock *lock = ldlm_handle2lock(lockh);
+	int rc = -EAGAIN;
+
+	if (lock == NULL)
+		return rc;
+
+	lock_res_and_lock(lock);
+	/* only an unused lock that is pending cancel is off limits */
+	if (!(lock->l_readers == 0 && lock->l_writers == 0 &&
+	      (lock->l_flags & LDLM_FL_CBPENDING))) {
+		ldlm_lock_addref_internal_nolock(lock, mode);
+		rc = 0;
+	}
+	unlock_res_and_lock(lock);
+	LDLM_LOCK_PUT(lock);
+
+	return rc;
+}
+EXPORT_SYMBOL(ldlm_lock_addref_try);
+
+/**
+ * Add specified reader/writer reference to LDLM lock \a lock.
+ * Locks LDLM lock and calls ldlm_lock_addref_internal_nolock to do the work.
+ * Only called for local locks.
+ */
+void ldlm_lock_addref_internal(struct ldlm_lock *lock, __u32 mode)
+{
+	/* the _nolock helper expects the lock to be held by its caller */
+	lock_res_and_lock(lock);
+	ldlm_lock_addref_internal_nolock(lock, mode);
+	unlock_res_and_lock(lock);
+}
+
+/**
+ * Drop a reader and/or writer reference from LDLM lock \a lock.
+ *
+ * \a mode selects the counter exactly as in
+ * ldlm_lock_addref_internal_nolock(); the matching user reference on the
+ * lock is released as well.
+ * Assumes LDLM lock is already locked.
+ * Only called in ldlm_flock_destroy and for local locks.
+ * Does NOT add lock to LRU if no r/w references left, to accommodate flock
+ * locks that cannot be placed in LRU.
+ */
+void ldlm_lock_decref_internal_nolock(struct ldlm_lock *lock, __u32 mode)
+{
+	const int as_reader = (mode & (LCK_NL | LCK_CR | LCK_PR)) != 0;
+	const int as_writer = (mode & (LCK_EX | LCK_CW | LCK_PW |
+				       LCK_GROUP | LCK_COS)) != 0;
+
+	LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]);
+
+	if (as_reader) {
+		LASSERT(lock->l_readers > 0);
+		lu_ref_del(&lock->l_reference, "reader", lock);
+		lock->l_readers--;
+	}
+
+	if (as_writer) {
+		LASSERT(lock->l_writers > 0);
+		lu_ref_del(&lock->l_reference, "writer", lock);
+		lock->l_writers--;
+	}
+
+	lu_ref_del(&lock->l_reference, "user", lock);
+	/* pairs with the LDLM_LOCK_GET() done when the reference was added */
+	LDLM_LOCK_RELEASE(lock);
+}
+
+/**
+ * Removes reader/writer reference for LDLM lock \a lock.
+ * Locks LDLM lock first.
+ * If the lock is determined to be client lock on a client and r/w refcount
+ * drops to zero and the lock is not blocked, the lock is added to LRU lock
+ * on the namespace.
+ * For blocked LDLM locks if r/w count drops to zero, blocking_ast is called.
+ *
+ * After the r/w reference is dropped one of three things happens:
+ *  - last reference on a CBPENDING lock: the blocking callback is handed to
+ *    the bl thread, or run inline if that fails or LDLM_FL_ATOMIC_CB is set;
+ *  - last reference on a client lock without NO_LRU/BL_AST: the lock is put
+ *    on the LRU;
+ *  - otherwise the lock is simply unlocked.
+ */
+void ldlm_lock_decref_internal(struct ldlm_lock *lock, __u32 mode)
+{
+	struct ldlm_namespace *ns;
+	ENTRY;
+
+	lock_res_and_lock(lock);
+
+	ns = ldlm_lock_to_ns(lock);
+
+	ldlm_lock_decref_internal_nolock(lock, mode);
+
+	/* release lvb data for layout lock */
+	if (ns_is_client(ns) && !lock->l_readers && !lock->l_writers &&
+	    ldlm_has_layout(lock) && lock->l_flags & LDLM_FL_LVB_READY) {
+		/* this is the last user of a layout lock and stripe has
+		 * been set up, lvb is no longer used.
+		 * This may be a large amount of memory, so we should free it
+		 * when possible. */
+		if (lock->l_lvb_data != NULL) {
+			OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
+			lock->l_lvb_data = NULL;
+			lock->l_lvb_len = 0;
+		}
+	}
+
+	if (lock->l_flags & LDLM_FL_LOCAL &&
+	    !lock->l_readers && !lock->l_writers) {
+		/* If this is a local lock on a server namespace and this was
+		 * the last reference, cancel the lock. */
+		CDEBUG(D_INFO, "forcing cancel of local lock\n");
+		lock->l_flags |= LDLM_FL_CBPENDING;
+	}
+
+	if (!lock->l_readers && !lock->l_writers &&
+	    (lock->l_flags & LDLM_FL_CBPENDING)) {
+		/* If we received a blocked AST and this was the last reference,
+		 * run the callback. */
+		if (lock->l_ns_srv && lock->l_export)
+			CERROR("FL_CBPENDING set on non-local lock--just a "
+			       "warning\n");
+
+		LDLM_DEBUG(lock, "final decref done on cbpending lock");
+
+		LDLM_LOCK_GET(lock); /* dropped by bl thread */
+		ldlm_lock_remove_from_lru(lock);
+		unlock_res_and_lock(lock);
+
+		/* NOTE(review): l_flags is read here after the lock has been
+		 * dropped */
+		if (lock->l_flags & LDLM_FL_FAIL_LOC)
+			OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE);
+
+		/* fall back to running the callback in this context when it
+		 * must be atomic or cannot be queued to the bl thread */
+		if ((lock->l_flags & LDLM_FL_ATOMIC_CB) ||
+		    ldlm_bl_to_thread_lock(ns, NULL, lock) != 0)
+			ldlm_handle_bl_callback(ns, NULL, lock);
+	} else if (ns_is_client(ns) &&
+		   !lock->l_readers && !lock->l_writers &&
+		   !(lock->l_flags & LDLM_FL_NO_LRU) &&
+		   !(lock->l_flags & LDLM_FL_BL_AST)) {
+
+		LDLM_DEBUG(lock, "add lock into lru list");
+
+		/* If this is a client-side namespace and this was the last
+		 * reference, put it on the LRU. */
+		ldlm_lock_add_to_lru(lock);
+		unlock_res_and_lock(lock);
+
+		if (lock->l_flags & LDLM_FL_FAIL_LOC)
+			OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE);
+
+		/* Call ldlm_cancel_lru() only if EARLY_CANCEL and LRU RESIZE
+		 * are not supported by the server, otherwise, it is done on
+		 * enqueue. */
+		if (!exp_connect_cancelset(lock->l_conn_export) &&
+		    !ns_connect_lru_resize(ns))
+			ldlm_cancel_lru(ns, 0, LCF_ASYNC, 0);
+	} else {
+		LDLM_DEBUG(lock, "do not add lock into lru list");
+		unlock_res_and_lock(lock);
+	}
+
+	EXIT;
+}
+
+/**
+ * Decrease reader/writer refcount for LDLM lock with handle \a lockh
+ *
+ * \a mode selects which r/w counter is dropped.  The handle must resolve to
+ * an existing lock, otherwise the LASSERTF below fires.
+ */
+void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode)
+{
+	struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0);
+	LASSERTF(lock != NULL, "Non-existing lock: "LPX64"\n", lockh->cookie);
+	ldlm_lock_decref_internal(lock, mode);
+	/* presumably balances the reference from the handle lookup above */
+	LDLM_LOCK_PUT(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_decref);
+
+/**
+ * Decrease reader/writer refcount for LDLM lock with handle
+ * \a lockh and mark it for subsequent cancellation once r/w refcount
+ * drops to zero instead of putting into LRU.
+ *
+ * Typical usage is for GROUP locks which we cannot allow to be cached.
+ *
+ * The handle must resolve to an existing lock (asserted).
+ */
+void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode)
+{
+	struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0);
+	ENTRY;
+
+	LASSERT(lock != NULL);
+
+	LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]);
+	/* set CBPENDING first so that the decref below takes the cancel
+	 * path rather than the LRU path once the last r/w user is gone */
+	lock_res_and_lock(lock);
+	lock->l_flags |= LDLM_FL_CBPENDING;
+	unlock_res_and_lock(lock);
+	ldlm_lock_decref_internal(lock, mode);
+	LDLM_LOCK_PUT(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_decref_and_cancel);
+
+/*
+ * Insertion position within a granted list, as filled in by
+ * search_granted_lock() and consumed by ldlm_granted_list_add_lock(): for
+ * each of the three lists a lock is linked on, the entry after which the new
+ * lock is to be added.
+ */
+struct sl_insert_point {
+	struct list_head *res_link;	/* position for l_res_link */
+	struct list_head *mode_link;	/* position for l_sl_mode */
+	struct list_head *policy_link;	/* position for l_sl_policy */
+};
+
+/**
+ * Finds a position to insert the new lock into granted lock list.
+ *
+ * Used for locks eligible for skiplist optimization.
+ *
+ * The queue is walked one mode group at a time; within the matching mode
+ * group of an inodebits resource it is walked one policy group at a time.
+ * A link of \a prev that points at \a req's own list head means that
+ * \a req starts a new group on that list.
+ *
+ * Parameters:
+ *      queue [input]:  the granted list where search acts on;
+ *      req [input]:    the lock whose position to be located;
+ *      prev [output]:  positions within 3 lists to insert @req to
+ * Return Value:
+ *      filled @prev
+ * NOTE: called by
+ *  - ldlm_grant_lock_with_skiplist
+ */
+static void search_granted_lock(struct list_head *queue,
+				struct ldlm_lock *req,
+				struct sl_insert_point *prev)
+{
+	struct list_head *tmp;
+	struct ldlm_lock *lock, *mode_end, *policy_end;
+	ENTRY;
+
+	list_for_each(tmp, queue) {
+		lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+		/* last lock of the mode group that "lock" starts */
+		mode_end = list_entry(lock->l_sl_mode.prev,
+					  struct ldlm_lock, l_sl_mode);
+
+		if (lock->l_req_mode != req->l_req_mode) {
+			/* jump to last lock of mode group */
+			tmp = &mode_end->l_res_link;
+			continue;
+		}
+
+		/* suitable mode group is found */
+		if (lock->l_resource->lr_type == LDLM_PLAIN) {
+			/* insert point is last lock of the mode group */
+			prev->res_link = &mode_end->l_res_link;
+			prev->mode_link = &mode_end->l_sl_mode;
+			prev->policy_link = &req->l_sl_policy;
+			EXIT;
+			return;
+		} else if (lock->l_resource->lr_type == LDLM_IBITS) {
+			for (;;) {
+				/* last lock of the current policy group */
+				policy_end =
+					list_entry(lock->l_sl_policy.prev,
+						       struct ldlm_lock,
+						       l_sl_policy);
+
+				if (lock->l_policy_data.l_inodebits.bits ==
+				    req->l_policy_data.l_inodebits.bits) {
+					/* insert point is last lock of
+					 * the policy group */
+					prev->res_link =
+						&policy_end->l_res_link;
+					prev->mode_link =
+						&policy_end->l_sl_mode;
+					prev->policy_link =
+						&policy_end->l_sl_policy;
+					EXIT;
+					return;
+				}
+
+				if (policy_end == mode_end)
+					/* done with mode group */
+					break;
+
+				/* go to next policy group within mode group */
+				tmp = policy_end->l_res_link.next;
+				lock = list_entry(tmp, struct ldlm_lock,
+						      l_res_link);
+			}  /* loop over policy groups within the mode group */
+
+			/* insert point is last lock of the mode group,
+			 * new policy group is started */
+			prev->res_link = &mode_end->l_res_link;
+			prev->mode_link = &mode_end->l_sl_mode;
+			prev->policy_link = &req->l_sl_policy;
+			EXIT;
+			return;
+		} else {
+			/* only PLAIN and IBITS resources use the skiplist */
+			LDLM_ERROR(lock,"is not LDLM_PLAIN or LDLM_IBITS lock");
+			LBUG();
+		}
+	}
+
+	/* insert point is last lock on the queue,
+	 * new mode group and new policy group are started */
+	prev->res_link = queue->prev;
+	prev->mode_link = &req->l_sl_mode;
+	prev->policy_link = &req->l_sl_policy;
+	EXIT;
+	return;
+}
+
+/**
+ * Add a lock into resource granted list after a position described by
+ * \a prev.
+ *
+ * The resource lock must be held (verified with check_res_locked()).  A lock
+ * already marked destroyed is not added.  \a lock must not be linked on any
+ * of the three lists yet (asserted).
+ *
+ * \param lock  lock to link into the granted list
+ * \param prev  insertion points as filled in by search_granted_lock()
+ */
+static void ldlm_granted_list_add_lock(struct ldlm_lock *lock,
+				       struct sl_insert_point *prev)
+{
+	struct ldlm_resource *res = lock->l_resource;
+	ENTRY;
+
+	check_res_locked(res);
+
+	ldlm_resource_dump(D_INFO, res);
+	LDLM_DEBUG(lock, "About to add lock:");
+
+	if (lock->l_destroyed) {
+		CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n");
+		/* keep the ENTRY/EXIT trace balanced on the early return */
+		EXIT;
+		return;
+	}
+
+	LASSERT(list_empty(&lock->l_res_link));
+	LASSERT(list_empty(&lock->l_sl_mode));
+	LASSERT(list_empty(&lock->l_sl_policy));
+
+	/*
+	 * lock->link == prev->link means lock is first starting the group.
+	 * Don't re-add to itself to suppress kernel warnings.
+	 */
+	if (&lock->l_res_link != prev->res_link)
+		list_add(&lock->l_res_link, prev->res_link);
+	if (&lock->l_sl_mode != prev->mode_link)
+		list_add(&lock->l_sl_mode, prev->mode_link);
+	if (&lock->l_sl_policy != prev->policy_link)
+		list_add(&lock->l_sl_policy, prev->policy_link);
+
+	EXIT;
+}
+
+/**
+ * Link \a lock into the granted list of its resource at the position that
+ * keeps the mode/policy skiplists correct.
+ *
+ * The lock must already have l_granted_mode equal to l_req_mode.
+ */
+static void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock)
+{
+	struct sl_insert_point where;
+	ENTRY;
+
+	LASSERT(lock->l_req_mode == lock->l_granted_mode);
+
+	/* locate the insertion point first, then link the lock there */
+	search_granted_lock(&lock->l_resource->lr_granted, lock, &where);
+	ldlm_granted_list_add_lock(lock, &where);
+	EXIT;
+}
+
+/**
+ * Perform lock granting bookkeeping.
+ *
+ * Sets the granted mode to the requested one, links the lock into the
+ * granted structure appropriate for the resource type, updates the
+ * resource's lr_most_restr, optionally queues a completion AST on
+ * \a work_list and accounts the lock in the namespace pool.
+ *
+ * NOTE: called by
+ *  - ldlm_lock_enqueue
+ *  - ldlm_reprocess_queue
+ *  - ldlm_lock_convert
+ *
+ * must be called with lr_lock held
+ */
+void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list)
+{
+	struct ldlm_resource *res = lock->l_resource;
+	ENTRY;
+
+	check_res_locked(res);
+
+	lock->l_granted_mode = lock->l_req_mode;
+
+	switch (res->lr_type) {
+	case LDLM_PLAIN:
+	case LDLM_IBITS:
+		ldlm_grant_lock_with_skiplist(lock);
+		break;
+	case LDLM_EXTENT:
+		ldlm_extent_add_lock(res, lock);
+		break;
+	default:
+		ldlm_resource_add_lock(res, &res->lr_granted, lock);
+		break;
+	}
+
+	if (res->lr_most_restr > lock->l_granted_mode)
+		res->lr_most_restr = lock->l_granted_mode;
+
+	/* a completion AST is only queued when the caller collects work
+	 * items and the lock actually has a completion callback */
+	if (work_list != NULL && lock->l_completion_ast != NULL)
+		ldlm_add_ast_work_item(lock, NULL, work_list);
+
+	ldlm_pool_add(&ldlm_res_to_ns(res)->ns_pool, lock);
+	EXIT;
+}
+
+/**
+ * Search for a lock with given properties in a queue.
+ *
+ * The walk stops as soon as \a old_lock is reached.  On a match, \a *mode
+ * is overwritten with the matched lock's requested mode.  With
+ * LDLM_FL_TEST_LOCK in \a flags only a plain lock reference is taken and the
+ * lock is touched in the LRU; otherwise a reader/writer reference of the
+ * matched mode is added.
+ *
+ * \retval a referenced lock or NULL.  See the flag descriptions below, in the
+ * comment above ldlm_lock_match
+ */
+static struct ldlm_lock *search_queue(struct list_head *queue,
+				      ldlm_mode_t *mode,
+				      ldlm_policy_data_t *policy,
+				      struct ldlm_lock *old_lock,
+				      __u64 flags, int unref)
+{
+	struct ldlm_lock *lock;
+	struct list_head       *tmp;
+
+	list_for_each(tmp, queue) {
+		ldlm_mode_t match;
+
+		lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+		if (lock == old_lock)
+			break;
+
+		/* llite sometimes wants to match locks that will be
+		 * canceled when their users drop, but we allow it to match
+		 * if it passes in CBPENDING and the lock still has users.
+		 * this is generally only going to be used by children
+		 * whose parents already hold a lock so forward progress
+		 * can still happen. */
+		if (lock->l_flags & LDLM_FL_CBPENDING &&
+		    !(flags & LDLM_FL_CBPENDING))
+			continue;
+		if (!unref && lock->l_flags & LDLM_FL_CBPENDING &&
+		    lock->l_readers == 0 && lock->l_writers == 0)
+			continue;
+
+		/* \a *mode is a bit mask of acceptable modes */
+		if (!(lock->l_req_mode & *mode))
+			continue;
+		match = lock->l_req_mode;
+
+		/* an extent lock must cover the whole requested extent */
+		if (lock->l_resource->lr_type == LDLM_EXTENT &&
+		    (lock->l_policy_data.l_extent.start >
+		     policy->l_extent.start ||
+		     lock->l_policy_data.l_extent.end < policy->l_extent.end))
+			continue;
+
+		/* group extent locks additionally have to agree on the gid */
+		if (unlikely(match == LCK_GROUP) &&
+		    lock->l_resource->lr_type == LDLM_EXTENT &&
+		    lock->l_policy_data.l_extent.gid != policy->l_extent.gid)
+			continue;
+
+		/* We match if we have existing lock with same or wider set
+		   of bits. */
+		if (lock->l_resource->lr_type == LDLM_IBITS &&
+		     ((lock->l_policy_data.l_inodebits.bits &
+		      policy->l_inodebits.bits) !=
+		      policy->l_inodebits.bits))
+			continue;
+
+		/* destroyed or failed locks only match when \a unref is set */
+		if (!unref &&
+		    (lock->l_destroyed || lock->l_flags & LDLM_FL_FAILED ||
+		     lock->l_failed))
+			continue;
+
+		if ((flags & LDLM_FL_LOCAL_ONLY) &&
+		    !(lock->l_flags & LDLM_FL_LOCAL))
+			continue;
+
+		if (flags & LDLM_FL_TEST_LOCK) {
+			LDLM_LOCK_GET(lock);
+			ldlm_lock_touch_in_lru(lock);
+		} else {
+			ldlm_lock_addref_internal_nolock(lock, match);
+		}
+		*mode = match;
+		return lock;
+	}
+
+	return NULL;
+}
+
+/**
+ * Mark \a lock as failed and wake everybody sleeping on its l_waitq.
+ * Does nothing if the lock is already marked failed.
+ * The "_locked" suffix suggests the caller already holds the lock; see
+ * ldlm_lock_fail_match() for the locking variant.
+ */
+void ldlm_lock_fail_match_locked(struct ldlm_lock *lock)
+{
+	/* waiters were already woken when the flag was first set */
+	if (lock->l_failed)
+		return;
+
+	lock->l_failed = 1;
+	wake_up_all(&lock->l_waitq);
+}
+EXPORT_SYMBOL(ldlm_lock_fail_match_locked);
+
+/*
+ * Locking wrapper around ldlm_lock_fail_match_locked(): takes the lock,
+ * marks \a lock as failed (waking its waiters) and unlocks again.
+ */
+void ldlm_lock_fail_match(struct ldlm_lock *lock)
+{
+	lock_res_and_lock(lock);
+	ldlm_lock_fail_match_locked(lock);
+	unlock_res_and_lock(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_fail_match);
+
+/**
+ * Mark lock as "matchable" by OST.
+ *
+ * Used to prevent certain races in LOV/OSC where the lock is granted, but LVB
+ * is not yet valid.
+ * Sets LDLM_FL_LVB_READY and wakes all waiters on l_waitq.
+ * Assumes LDLM lock is already locked.
+ */
+void ldlm_lock_allow_match_locked(struct ldlm_lock *lock)
+{
+	lock->l_flags |= LDLM_FL_LVB_READY;
+	/* ldlm_lock_match() sleeps on l_waitq until LVB_READY shows up */
+	wake_up_all(&lock->l_waitq);
+}
+EXPORT_SYMBOL(ldlm_lock_allow_match_locked);
+
+/**
+ * Mark lock as "matchable" by OST.
+ * Locks the lock and then \see ldlm_lock_allow_match_locked
+ */
+void ldlm_lock_allow_match(struct ldlm_lock *lock)
+{
+	/* the _locked variant expects the lock to be held by its caller */
+	lock_res_and_lock(lock);
+	ldlm_lock_allow_match_locked(lock);
+	unlock_res_and_lock(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_allow_match);
+
+/**
+ * Attempt to find a lock with specified properties.
+ *
+ * Typically returns a reference to matched lock unless LDLM_FL_TEST_LOCK is
+ * set in \a flags
+ *
+ * Can be called in two ways:
+ *
+ * If 'ns' is NULL, then lockh describes an existing lock that we want to look
+ * for a duplicate of.
+ *
+ * Otherwise, all of the fields must be filled in, to match against.
+ *
+ * If 'flags' contains LDLM_FL_LOCAL_ONLY, then only match local locks on the
+ *     server (ie, connh is NULL)
+ * If 'flags' contains LDLM_FL_BLOCK_GRANTED, then only locks on the granted
+ *     list will be considered
+ * If 'flags' contains LDLM_FL_CBPENDING, then locks that have been marked
+ *     to be canceled can still be matched as long as they still have reader
+ *     or writer references
+ * If 'flags' contains LDLM_FL_TEST_LOCK, then don't actually reference a lock,
+ *     just tell us if we would have matched.
+ * If 'flags' contains LDLM_FL_LVB_READY and the matched lock does not have
+ *     that flag yet, the lock's completion AST is run and the caller sleeps
+ *     (with a timeout) until the lock becomes LVB_READY, is destroyed or
+ *     fails.
+ *
+ * \retval the mode of an already-existing compatible lock, if one is found;
+ * in this case, lockh is filled in with a addref()ed lock
+ * \retval 0 if no lock matched
+ *
+ * We also check security context, and if that fails we simply return 0 (to
+ * keep caller code unchanged), the context failure will be discovered by
+ * caller sometime later.
+ */
+ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags,
+			    const struct ldlm_res_id *res_id, ldlm_type_t type,
+			    ldlm_policy_data_t *policy, ldlm_mode_t mode,
+			    struct lustre_handle *lockh, int unref)
+{
+	struct ldlm_resource *res;
+	struct ldlm_lock *lock, *old_lock = NULL;
+	int rc = 0;
+	ENTRY;
+
+	if (ns == NULL) {
+		/* derive all search keys from the lock behind \a lockh */
+		old_lock = ldlm_handle2lock(lockh);
+		LASSERT(old_lock);
+
+		ns = ldlm_lock_to_ns(old_lock);
+		res_id = &old_lock->l_resource->lr_name;
+		type = old_lock->l_resource->lr_type;
+		mode = old_lock->l_req_mode;
+	}
+
+	res = ldlm_resource_get(ns, NULL, res_id, type, 0);
+	if (res == NULL) {
+		LASSERT(old_lock == NULL);
+		RETURN(0);
+	}
+
+	LDLM_RESOURCE_ADDREF(res);
+	lock_res(res);
+
+	/* search order: granted, then converting, then waiting queue */
+	lock = search_queue(&res->lr_granted, &mode, policy, old_lock,
+			    flags, unref);
+	if (lock != NULL)
+		GOTO(out, rc = 1);
+	if (flags & LDLM_FL_BLOCK_GRANTED)
+		GOTO(out, rc = 0);
+	lock = search_queue(&res->lr_converting, &mode, policy, old_lock,
+			    flags, unref);
+	if (lock != NULL)
+		GOTO(out, rc = 1);
+	lock = search_queue(&res->lr_waiting, &mode, policy, old_lock,
+			    flags, unref);
+	if (lock != NULL)
+		GOTO(out, rc = 1);
+
+	EXIT;
+ out:
+	unlock_res(res);
+	LDLM_RESOURCE_DELREF(res);
+	ldlm_resource_putref(res);
+
+	if (lock) {
+		ldlm_lock2handle(lock, lockh);
+		if ((flags & LDLM_FL_LVB_READY) &&
+		    (!(lock->l_flags & LDLM_FL_LVB_READY))) {
+			struct l_wait_info lwi;
+			if (lock->l_completion_ast) {
+				int err = lock->l_completion_ast(lock,
+							  LDLM_FL_WAIT_NOREPROC,
+								 NULL);
+				if (err) {
+					/* undo the reference search_queue()
+					 * took for this kind of match */
+					if (flags & LDLM_FL_TEST_LOCK)
+						LDLM_LOCK_RELEASE(lock);
+					else
+						ldlm_lock_decref_internal(lock,
+									  mode);
+					rc = 0;
+					goto out2;
+				}
+			}
+
+			lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout),
+					       NULL, LWI_ON_SIGNAL_NOOP, NULL);
+
+			/* XXX FIXME see comment on CAN_MATCH in lustre_dlm.h */
+			l_wait_event(lock->l_waitq,
+				     lock->l_flags & LDLM_FL_LVB_READY ||
+				     lock->l_destroyed || lock->l_failed,
+				     &lwi);
+			/* woken up without LVB_READY: treat as no match */
+			if (!(lock->l_flags & LDLM_FL_LVB_READY)) {
+				if (flags & LDLM_FL_TEST_LOCK)
+					LDLM_LOCK_RELEASE(lock);
+				else
+					ldlm_lock_decref_internal(lock, mode);
+				rc = 0;
+			}
+		}
+	}
+ out2:
+	if (rc) {
+		LDLM_DEBUG(lock, "matched ("LPU64" "LPU64")",
+			   (type == LDLM_PLAIN || type == LDLM_IBITS) ?
+				res_id->name[2] : policy->l_extent.start,
+			   (type == LDLM_PLAIN || type == LDLM_IBITS) ?
+				res_id->name[3] : policy->l_extent.end);
+
+		/* check user's security context */
+		if (lock->l_conn_export &&
+		    sptlrpc_import_check_ctx(
+				class_exp2cliimp(lock->l_conn_export))) {
+			if (!(flags & LDLM_FL_TEST_LOCK))
+				ldlm_lock_decref_internal(lock, mode);
+			rc = 0;
+		}
+
+		/* a test-only match never keeps its lock reference */
+		if (flags & LDLM_FL_TEST_LOCK)
+			LDLM_LOCK_RELEASE(lock);
+
+	} else if (!(flags & LDLM_FL_TEST_LOCK)) {/*less verbose for test-only*/
+		LDLM_DEBUG_NOLOCK("not matched ns %p type %u mode %u res "
+				  LPU64"/"LPU64" ("LPU64" "LPU64")", ns,
+				  type, mode, res_id->name[0], res_id->name[1],
+				  (type == LDLM_PLAIN || type == LDLM_IBITS) ?
+					res_id->name[2] :policy->l_extent.start,
+				  (type == LDLM_PLAIN || type == LDLM_IBITS) ?
+					res_id->name[3] : policy->l_extent.end);
+	}
+	if (old_lock)
+		LDLM_LOCK_PUT(old_lock);
+
+	return rc ? mode : 0;
+}
+EXPORT_SYMBOL(ldlm_lock_match);
+
+/*
+ * Check whether the lock behind \a lockh is still usable and, if so, take a
+ * reader/writer reference of its granted mode.
+ *
+ * The lock is rejected when the handle lookup fails, when it is destroyed or
+ * failed, or when it is pending cancel with no readers or writers.
+ * If \a bits is not NULL it receives the lock's inodebits on success.
+ *
+ * Returns the granted mode on success, 0 otherwise.
+ */
+ldlm_mode_t ldlm_revalidate_lock_handle(struct lustre_handle *lockh,
+					__u64 *bits)
+{
+	struct ldlm_lock *lock;
+	ldlm_mode_t mode = 0;
+	ENTRY;
+
+	lock = ldlm_handle2lock(lockh);
+	if (lock != NULL) {
+		lock_res_and_lock(lock);
+		if (lock->l_destroyed || lock->l_flags & LDLM_FL_FAILED ||
+		    lock->l_failed)
+			GOTO(out, mode);
+
+		if (lock->l_flags & LDLM_FL_CBPENDING &&
+		    lock->l_readers == 0 && lock->l_writers == 0)
+			GOTO(out, mode);
+
+		if (bits)
+			*bits = lock->l_policy_data.l_inodebits.bits;
+		mode = lock->l_granted_mode;
+		ldlm_lock_addref_internal_nolock(lock, mode);
+	}
+
+	EXIT;
+
+out:
+	/* reached with the lock still held whenever the lookup succeeded */
+	if (lock != NULL) {
+		unlock_res_and_lock(lock);
+		LDLM_LOCK_PUT(lock);
+	}
+	return mode;
+}
+EXPORT_SYMBOL(ldlm_revalidate_lock_handle);
+
+/**
+ * Copy the lock value block (LVB) carried in \a pill into \a data.
+ *
+ * The LVB layout is selected by lock->l_lvb_type; \a loc selects whether
+ * the client or the server side of the capsule is read, and \a size is the
+ * number of bytes the caller expects.
+ *
+ * The caller must guarantee that the buffer is large enough.
+ *
+ * \retval 0 on success (including a zero-sized layout LVB)
+ * \retval -EPROTO if the capsule holds no LVB
+ * \retval -EINVAL on unexpected \a size or unknown LVB type
+ */
+int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill,
+		  enum req_location loc, void *data, int size)
+{
+	void *lvb;
+	ENTRY;
+
+	LASSERT(data != NULL);
+	LASSERT(size >= 0);
+
+	switch (lock->l_lvb_type) {
+	case LVB_T_OST:
+		if (size == sizeof(struct ost_lvb)) {
+			if (loc == RCL_CLIENT)
+				lvb = req_capsule_client_swab_get(pill,
+						&RMF_DLM_LVB,
+						lustre_swab_ost_lvb);
+			else
+				lvb = req_capsule_server_swab_get(pill,
+						&RMF_DLM_LVB,
+						lustre_swab_ost_lvb);
+			if (unlikely(lvb == NULL)) {
+				LDLM_ERROR(lock, "no LVB");
+				RETURN(-EPROTO);
+			}
+
+			memcpy(data, lvb, size);
+		} else if (size == sizeof(struct ost_lvb_v1)) {
+			/* peer sent the older, shorter LVB format */
+			struct ost_lvb *olvb = data;
+
+			if (loc == RCL_CLIENT)
+				lvb = req_capsule_client_swab_get(pill,
+						&RMF_DLM_LVB,
+						lustre_swab_ost_lvb_v1);
+			else
+				lvb = req_capsule_server_sized_swab_get(pill,
+						&RMF_DLM_LVB, size,
+						lustre_swab_ost_lvb_v1);
+			if (unlikely(lvb == NULL)) {
+				LDLM_ERROR(lock, "no LVB");
+				RETURN(-EPROTO);
+			}
+
+			memcpy(data, lvb, size);
+			/* NOTE(review): these fields are presumably the part
+			 * of struct ost_lvb that v1 lacks, so \a data must be
+			 * a full struct ost_lvb even though only \a size
+			 * bytes were copied -- confirm against the struct
+			 * definitions */
+			olvb->lvb_mtime_ns = 0;
+			olvb->lvb_atime_ns = 0;
+			olvb->lvb_ctime_ns = 0;
+		} else {
+			LDLM_ERROR(lock, "Replied unexpected ost LVB size %d",
+				   size);
+			RETURN(-EINVAL);
+		}
+		break;
+	case LVB_T_LQUOTA:
+		if (size == sizeof(struct lquota_lvb)) {
+			if (loc == RCL_CLIENT)
+				lvb = req_capsule_client_swab_get(pill,
+						&RMF_DLM_LVB,
+						lustre_swab_lquota_lvb);
+			else
+				lvb = req_capsule_server_swab_get(pill,
+						&RMF_DLM_LVB,
+						lustre_swab_lquota_lvb);
+			if (unlikely(lvb == NULL)) {
+				LDLM_ERROR(lock, "no LVB");
+				RETURN(-EPROTO);
+			}
+
+			memcpy(data, lvb, size);
+		} else {
+			LDLM_ERROR(lock, "Replied unexpected lquota LVB size %d",
+				   size);
+			RETURN(-EINVAL);
+		}
+		break;
+	case LVB_T_LAYOUT:
+		/* an empty layout LVB is valid: nothing to copy */
+		if (size == 0)
+			break;
+
+		/* layout LVBs are fetched without a swab callback */
+		if (loc == RCL_CLIENT)
+			lvb = req_capsule_client_get(pill, &RMF_DLM_LVB);
+		else
+			lvb = req_capsule_server_get(pill, &RMF_DLM_LVB);
+		if (unlikely(lvb == NULL)) {
+			LDLM_ERROR(lock, "no LVB");
+			RETURN(-EPROTO);
+		}
+
+		memcpy(data, lvb, size);
+		break;
+	default:
+		LDLM_ERROR(lock, "Unknown LVB type: %d\n", lock->l_lvb_type);
+		libcfs_debug_dumpstack(NULL);
+		RETURN(-EINVAL);
+	}
+
+	RETURN(0);
+}
+
+/**
+ * Create and fill in new LDLM lock with specified properties.
+ *
+ * The resource named by \a res_id is looked up (or created) in \a ns and a
+ * new lock of requested \a mode is attached to it.  Callbacks are taken from
+ * \a cbs when given, \a data becomes the lock's AST data, and an LVB buffer
+ * of \a lvb_len bytes is allocated when \a lvb_len is non-zero.  Extent
+ * locks also get an interval tree node.
+ *
+ * \retval a referenced lock on success
+ * \retval NULL if the resource or lock cannot be obtained or any later
+ *	   allocation fails
+ */
+struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns,
+				   const struct ldlm_res_id *res_id,
+				   ldlm_type_t type,
+				   ldlm_mode_t mode,
+				   const struct ldlm_callback_suite *cbs,
+				   void *data, __u32 lvb_len,
+				   enum lvb_type lvb_type)
+{
+	struct ldlm_lock *lock;
+	struct ldlm_resource *res;
+	ENTRY;
+
+	res = ldlm_resource_get(ns, NULL, res_id, type, 1);
+	if (res == NULL)
+		RETURN(NULL);
+
+	lock = ldlm_lock_new(res);
+	if (lock == NULL) {
+		/* No lock exists to own the resource reference obtained by
+		 * ldlm_resource_get(), so release it here instead of leaking
+		 * it (assumes ldlm_lock_new() does not drop it on failure;
+		 * it is defined outside this hunk). */
+		ldlm_resource_putref(res);
+		RETURN(NULL);
+	}
+
+	lock->l_req_mode = mode;
+	lock->l_ast_data = data;
+	lock->l_pid = current_pid();
+	lock->l_ns_srv = !!ns_is_server(ns);
+	if (cbs) {
+		lock->l_blocking_ast = cbs->lcs_blocking;
+		lock->l_completion_ast = cbs->lcs_completion;
+		lock->l_glimpse_ast = cbs->lcs_glimpse;
+		lock->l_weigh_ast = cbs->lcs_weigh;
+	}
+
+	lock->l_tree_node = NULL;
+	/* if this is the extent lock, allocate the interval tree node */
+	if (type == LDLM_EXTENT) {
+		if (ldlm_interval_alloc(lock) == NULL)
+			GOTO(out, 0);
+	}
+
+	if (lvb_len) {
+		lock->l_lvb_len = lvb_len;
+		OBD_ALLOC(lock->l_lvb_data, lvb_len);
+		if (lock->l_lvb_data == NULL)
+			GOTO(out, 0);
+	}
+
+	lock->l_lvb_type = lvb_type;
+	/* fault injection point for testing the failure path */
+	if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_NEW_LOCK))
+		GOTO(out, 0);
+
+	RETURN(lock);
+
+out:
+	/* from here on the lock owns the resource reference, so destroying
+	 * and releasing the lock is the only cleanup done */
+	ldlm_lock_destroy(lock);
+	LDLM_LOCK_RELEASE(lock);
+	return NULL;
+}
+
+/**
+ * Enqueue (request) a lock.
+ *
+ * Does not block. As a result of enqueue the lock would be put
+ * into granted or waiting list.
+ *
+ * If namespace has intent policy sent and the lock has LDLM_FL_HAS_INTENT flag
+ * set, skip all the enqueueing and delegate lock processing to intent policy
+ * function.
+ *
+ * \a lockp may be replaced by the intent policy, in which case
+ * LDLM_FL_LOCK_CHANGED is set in \a *flags.  In this client-only build a
+ * lock in a non-client namespace that gets past the intent handling ends in
+ * LBUG().
+ *
+ * \retval ELDLM_OK on success
+ * \retval -ENOMEM if an extent lock has no interval node and none could be
+ *	   allocated
+ * \retval the intent policy's return code when the policy fails, or when it
+ *	   succeeds with LDLM_FL_INTENT_ONLY set
+ */
+ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns,
+			       struct ldlm_lock **lockp,
+			       void *cookie, __u64 *flags)
+{
+	struct ldlm_lock *lock = *lockp;
+	struct ldlm_resource *res = lock->l_resource;
+	int local = ns_is_client(ldlm_res_to_ns(res));
+	ldlm_error_t rc = ELDLM_OK;
+	struct ldlm_interval *node = NULL;
+	ENTRY;
+
+	lock->l_last_activity = cfs_time_current_sec();
+	/* policies are not executed on the client or during replay */
+	if ((*flags & (LDLM_FL_HAS_INTENT|LDLM_FL_REPLAY)) == LDLM_FL_HAS_INTENT
+	    && !local && ns->ns_policy) {
+		rc = ns->ns_policy(ns, lockp, cookie, lock->l_req_mode, *flags,
+				   NULL);
+		if (rc == ELDLM_LOCK_REPLACED) {
+			/* The lock that was returned has already been granted,
+			 * and placed into lockp.  If it's not the same as the
+			 * one we passed in, then destroy the old one and our
+			 * work here is done. */
+			if (lock != *lockp) {
+				ldlm_lock_destroy(lock);
+				LDLM_LOCK_RELEASE(lock);
+			}
+			*flags |= LDLM_FL_LOCK_CHANGED;
+			RETURN(0);
+		} else if (rc != ELDLM_OK ||
+			   (rc == ELDLM_OK && (*flags & LDLM_FL_INTENT_ONLY))) {
+			ldlm_lock_destroy(lock);
+			RETURN(rc);
+		}
+	}
+
+	/* For a replaying lock, it might be already in granted list. So
+	 * unlinking the lock will cause the interval node to be freed, we
+	 * have to allocate the interval node early otherwise we can't regrant
+	 * this lock in the future. - jay */
+	if (!local && (*flags & LDLM_FL_REPLAY) && res->lr_type == LDLM_EXTENT)
+		OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, __GFP_IO);
+
+	lock_res_and_lock(lock);
+	if (local && lock->l_req_mode == lock->l_granted_mode) {
+		/* The server returned a blocked lock, but it was granted
+		 * before we got a chance to actually enqueue it.  We don't
+		 * need to do anything else. */
+		*flags &= ~(LDLM_FL_BLOCK_GRANTED |
+			    LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_WAIT);
+		GOTO(out, ELDLM_OK);
+	}
+
+	ldlm_resource_unlink_lock(lock);
+	if (res->lr_type == LDLM_EXTENT && lock->l_tree_node == NULL) {
+		/* the lock lost its interval node: attach the spare one
+		 * allocated above, or fail if there is none */
+		if (node == NULL) {
+			ldlm_lock_destroy_nolock(lock);
+			GOTO(out, rc = -ENOMEM);
+		}
+
+		INIT_LIST_HEAD(&node->li_group);
+		ldlm_interval_attach(node, lock);
+		/* ownership moved to the lock; do not free it at "out" */
+		node = NULL;
+	}
+
+	/* Some flags from the enqueue want to make it into the AST, via the
+	 * lock's l_flags. */
+	lock->l_flags |= *flags & LDLM_AST_DISCARD_DATA;
+
+	/* This distinction between local lock trees is very important; a client
+	 * namespace only has information about locks taken by that client, and
+	 * thus doesn't have enough information to decide for itself if it can
+	 * be granted (below).  In this case, we do exactly what the server
+	 * tells us to do, as dictated by the 'flags'.
+	 *
+	 * We do exactly the same thing during recovery, when the server is
+	 * more or less trusting the clients not to lie.
+	 *
+	 * FIXME (bug 268): Detect obvious lies by checking compatibility in
+	 * granted/converting queues. */
+	if (local) {
+		if (*flags & LDLM_FL_BLOCK_CONV)
+			ldlm_resource_add_lock(res, &res->lr_converting, lock);
+		else if (*flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED))
+			ldlm_resource_add_lock(res, &res->lr_waiting, lock);
+		else
+			ldlm_grant_lock(lock, NULL);
+		GOTO(out, ELDLM_OK);
+	} else {
+		CERROR("This is client-side-only module, cannot handle "
+		       "LDLM_NAMESPACE_SERVER resource type lock.\n");
+		LBUG();
+	}
+
+out:
+	unlock_res_and_lock(lock);
+	/* free the spare interval node if it was not attached */
+	if (node)
+		OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node));
+	return rc;
+}
+
+
+/**
+ * Run the blocking AST for the first lock queued on arg->list (via l_bl_ast).
+ */
+static int
+ldlm_work_bl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq)
+{
+	struct ldlm_cb_set_arg *arg = opaq;
+	struct ldlm_lock_desc   d;
+	int		     rc;
+	struct ldlm_lock       *lock;
+	ENTRY;
+
+	if (list_empty(arg->list))
+		RETURN(-ENOENT);	/* work list drained: nothing to produce */
+
+	lock = list_entry(arg->list->next, struct ldlm_lock, l_bl_ast);
+
+	/* nobody else should touch l_bl_ast; unlink it under the res lock */
+	lock_res_and_lock(lock);
+	list_del_init(&lock->l_bl_ast);
+
+	LASSERT(lock->l_flags & LDLM_FL_AST_SENT);
+	LASSERT(lock->l_bl_ast_run == 0);	/* the AST must not have run yet */
+	LASSERT(lock->l_blocking_lock);
+	lock->l_bl_ast_run++;
+	unlock_res_and_lock(lock);
+
+	ldlm_lock2desc(lock->l_blocking_lock, &d); /* AST gets the blocker's desc */
+
+	rc = lock->l_blocking_ast(lock, &d, (void *)arg, LDLM_CB_BLOCKING);
+	LDLM_LOCK_RELEASE(lock->l_blocking_lock);
+	lock->l_blocking_lock = NULL;
+	LDLM_LOCK_RELEASE(lock); /* presumably the ref taken when queued - confirm */
+
+	RETURN(rc);
+}
+
+/**
+ * Run the completion AST for the first lock queued on arg->list (via l_cp_ast).
+ */
+static int
+ldlm_work_cp_ast_lock(struct ptlrpc_request_set *rqset, void *opaq)
+{
+	struct ldlm_cb_set_arg  *arg = opaq;
+	int		      rc = 0;
+	struct ldlm_lock	*lock;
+	ldlm_completion_callback completion_callback;
+	ENTRY;
+
+	if (list_empty(arg->list))
+		RETURN(-ENOENT);	/* work list drained: nothing to produce */
+
+	lock = list_entry(arg->list->next, struct ldlm_lock, l_cp_ast);
+
+	/* It's possible to receive a completion AST before we've set
+	 * the l_completion_ast pointer: either because the AST arrived
+	 * before the reply, or simply because there's a small race
+	 * window between receiving the reply and finishing the local
+	 * enqueue. (bug 842)
+	 *
+	 * This can't happen with the blocking_ast, however, because we
+	 * will never call the local blocking_ast until we drop our
+	 * reader/writer reference, which we won't do until we get the
+	 * reply and finish enqueueing. */
+
+	/* nobody else should touch l_cp_ast; unlink it under the res lock */
+	lock_res_and_lock(lock);
+	list_del_init(&lock->l_cp_ast);
+	LASSERT(lock->l_flags & LDLM_FL_CP_REQD);
+	/* snapshot l_completion_ast while locked since it can be changed by
+	 * mds_intent_policy(), see bug 14225 */
+	completion_callback = lock->l_completion_ast;
+	lock->l_flags &= ~LDLM_FL_CP_REQD;
+	unlock_res_and_lock(lock);
+
+	if (completion_callback != NULL)	/* rc stays 0 when no AST is set */
+		rc = completion_callback(lock, 0, (void *)arg);
+	LDLM_LOCK_RELEASE(lock);
+
+	RETURN(rc);
+}
+
+/**
+ * Run the revocation AST for the first lock queued on arg->list (via l_rk_ast).
+ */
+static int
+ldlm_work_revoke_ast_lock(struct ptlrpc_request_set *rqset, void *opaq)
+{
+	struct ldlm_cb_set_arg *arg = opaq;
+	struct ldlm_lock_desc   desc;
+	int		     rc;
+	struct ldlm_lock       *lock;
+	ENTRY;
+
+	if (list_empty(arg->list))
+		RETURN(-ENOENT);	/* work list drained: nothing to produce */
+
+	lock = list_entry(arg->list->next, struct ldlm_lock, l_rk_ast);
+	list_del_init(&lock->l_rk_ast); /* NOTE(review): no res lock held here */
+
+	/* fake an exclusive-mode request in the descriptor handed to the AST */
+	ldlm_lock2desc(lock, &desc);
+	desc.l_req_mode = LCK_EX;
+	desc.l_granted_mode = 0;
+
+	rc = lock->l_blocking_ast(lock, &desc, (void*)arg, LDLM_CB_BLOCKING);
+	LDLM_LOCK_RELEASE(lock);
+
+	RETURN(rc);
+}
+
+/**
+ * Run the glimpse AST for the first ldlm_glimpse_work item on arg->list.
+ */
+int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq)
+{
+	struct ldlm_cb_set_arg		*arg = opaq;
+	struct ldlm_glimpse_work	*gl_work;
+	struct ldlm_lock		*lock;
+	int				 rc = 0;
+	ENTRY;
+
+	if (list_empty(arg->list))
+		RETURN(-ENOENT);	/* work list drained: nothing to produce */
+
+	gl_work = list_entry(arg->list->next, struct ldlm_glimpse_work,
+				 gl_list);
+	list_del_init(&gl_work->gl_list);
+
+	lock = gl_work->gl_lock;
+
+	/* transfer the glimpse descriptor to ldlm_cb_set_arg */
+	arg->gl_desc = gl_work->gl_desc;
+
+	/* invoke the actual glimpse callback; its 0 is reported as rc = 1 */
+	if (lock->l_glimpse_ast(lock, (void*)arg) == 0)
+		rc = 1;
+
+	LDLM_LOCK_RELEASE(lock);
+
+	if ((gl_work->gl_flags & LDLM_GL_WORK_NOFREE) == 0) /* owner keeps it if set */
+		OBD_FREE_PTR(gl_work);
+
+	RETURN(rc);
+}
+
+/**
+ * Process list of locks in need of ASTs being sent.
+ *
+ * Used on server to send multiple ASTs together instead of one by one; returns
+ * 0, -ENOMEM on allocation failure, or -ERESTART if arg->restart ends non-zero.
+ */
+int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list,
+		      ldlm_desc_ast_t ast_type)
+{
+	struct ldlm_cb_set_arg *arg;
+	set_producer_func       work_ast_lock;
+	int		     rc;
+
+	if (list_empty(rpc_list))
+		RETURN(0);	/* nothing queued, nothing to send */
+
+	OBD_ALLOC_PTR(arg);
+	if (arg == NULL)
+		RETURN(-ENOMEM);
+
+	atomic_set(&arg->restart, 0);
+	arg->list = rpc_list;
+
+	switch (ast_type) {	/* pick the callback type and per-item producer */
+		case LDLM_WORK_BL_AST:
+			arg->type = LDLM_BL_CALLBACK;
+			work_ast_lock = ldlm_work_bl_ast_lock;
+			break;
+		case LDLM_WORK_CP_AST:
+			arg->type = LDLM_CP_CALLBACK;
+			work_ast_lock = ldlm_work_cp_ast_lock;
+			break;
+		case LDLM_WORK_REVOKE_AST:
+			arg->type = LDLM_BL_CALLBACK;	/* revoke reuses the BL type */
+			work_ast_lock = ldlm_work_revoke_ast_lock;
+			break;
+		case LDLM_WORK_GL_AST:
+			arg->type = LDLM_GL_CALLBACK;
+			work_ast_lock = ldlm_work_gl_ast_lock;
+			break;
+		default:
+			LBUG();	/* unknown ast_type */
+	}
+
+	/* We create a ptlrpc request set with flow control extension.
+	 * This request set will use the work_ast_lock function to produce new
+	 * requests and will send a new request each time one completes in order
+	 * to keep the number of requests in flight to ns_max_parallel_ast */
+	arg->set = ptlrpc_prep_fcset(ns->ns_max_parallel_ast ? : UINT_MAX,
+				     work_ast_lock, arg); /* 0 limit => UINT_MAX */
+	if (arg->set == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	ptlrpc_set_wait(arg->set);
+	ptlrpc_set_destroy(arg->set);
+
+	rc = atomic_read(&arg->restart) ? -ERESTART : 0;
+	GOTO(out, rc);
+out:
+	OBD_FREE_PTR(arg);
+	return rc;
+}
+
+static int reprocess_one_queue(struct ldlm_resource *res, void *closure)
+{	/* iterator adapter around ldlm_reprocess_all(); closure is unused */
+	ldlm_reprocess_all(res);
+	return LDLM_ITER_CONTINUE;	/* never asks the iteration to stop */
+}
+
+static int ldlm_reprocess_res(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			      struct hlist_node *hnode, void *arg)
+{	/* hash-walk callback: reprocess the resource stored at hnode */
+	struct ldlm_resource *res = cfs_hash_object(hs, hnode);
+	int    rc;
+
+	rc = reprocess_one_queue(res, arg);
+
+	return rc == LDLM_ITER_STOP; /* 1 only on STOP; presumably ends the walk */
+}
+
+/**
+ * Iterate through all resources on a namespace attempting to grant waiting
+ * locks; a NULL \a ns is accepted and ignored.
+ */
+void ldlm_reprocess_all_ns(struct ldlm_namespace *ns)
+{
+	ENTRY;
+
+	if (ns != NULL) {
+		cfs_hash_for_each_nolock(ns->ns_rs_hash,
+					 ldlm_reprocess_res, NULL); /* no closure */
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(ldlm_reprocess_all_ns);
+
+/**
+ * Try to grant all waiting locks on a resource (no-op in this client build).
+ *
+ * No queue is reprocessed here: the body only LBUG()s when \a res belongs to
+ * a non-client namespace. NOTE(review): rpc_list is declared but unused.
+ * Typically called after some resource locks are cancelled to see
+ * if anything could be granted as a result of the cancellation.
+ */
+void ldlm_reprocess_all(struct ldlm_resource *res)
+{
+	LIST_HEAD(rpc_list);
+
+	ENTRY;
+	if (!ns_is_client(ldlm_res_to_ns(res))) {
+		CERROR("This is client-side-only module, cannot handle "
+		       "LDLM_NAMESPACE_SERVER resource type lock.\n");
+		LBUG();
+	}
+	EXIT;
+}
+
+/**
+ * Helper function to call blocking AST for LDLM lock \a lock in a
+ * "cancelling" mode; entered and left with the resource lock held.
+ */
+void ldlm_cancel_callback(struct ldlm_lock *lock)
+{
+	check_res_locked(lock->l_resource);
+	if (!(lock->l_flags & LDLM_FL_CANCEL)) {	/* run the AST only once */
+		lock->l_flags |= LDLM_FL_CANCEL;
+		if (lock->l_blocking_ast) {
+			unlock_res_and_lock(lock); /* AST runs without the res lock */
+			lock->l_blocking_ast(lock, NULL, lock->l_ast_data,
+					     LDLM_CB_CANCELING);
+			lock_res_and_lock(lock);
+		} else {
+			LDLM_DEBUG(lock, "no blocking ast");
+		}
+	}
+	lock->l_flags |= LDLM_FL_BL_DONE;	/* set even if the AST was skipped */
+}
+
+/**
+ * Remove skiplist-enabled LDLM lock \a req from granted list
+ */
+void ldlm_unlink_lock_skiplist(struct ldlm_lock *req)
+{
+	if (req->l_resource->lr_type != LDLM_PLAIN &&
+	    req->l_resource->lr_type != LDLM_IBITS)
+		return;	/* only PLAIN and IBITS locks are unlinked here */
+
+	list_del_init(&req->l_sl_policy);
+	list_del_init(&req->l_sl_mode);
+}
+
+/**
+ * Attempts to cancel LDLM lock \a lock that has no reader/writer references.
+ */
+void ldlm_lock_cancel(struct ldlm_lock *lock)
+{
+	struct ldlm_resource *res;
+	struct ldlm_namespace *ns;
+	ENTRY;
+
+	lock_res_and_lock(lock);
+
+	res = lock->l_resource;
+	ns  = ldlm_res_to_ns(res);
+
+	/* Please do not, no matter how tempting, remove this LBUG without
+	 * talking to me first. -phik */
+	if (lock->l_readers || lock->l_writers) {
+		LDLM_ERROR(lock, "lock still has references");
+		LBUG();
+	}
+
+	if (lock->l_waited)
+		ldlm_del_waiting_lock(lock);
+
+	/* Runs the cancel callback; it drops and retakes the res lock. */
+	ldlm_cancel_callback(lock);
+
+	/* Yes, second time, just in case it was added again while we were
+	   running with no res lock in ldlm_cancel_callback */
+	if (lock->l_waited)
+		ldlm_del_waiting_lock(lock);
+
+	ldlm_resource_unlink_lock(lock);
+	ldlm_lock_destroy_nolock(lock);
+
+	if (lock->l_granted_mode == lock->l_req_mode)	/* only granted locks */
+		ldlm_pool_del(&ns->ns_pool, lock);
+
+	/* Reset l_granted_mode so that we cannot be called again for the same
+	 * lock, which would be possible if it were left untouched */
+	lock->l_granted_mode = LCK_MINMODE;
+	unlock_res_and_lock(lock);
+
+	EXIT;
+}
+EXPORT_SYMBOL(ldlm_lock_cancel);
+
+/**
+ * Set opaque data into the lock that only makes sense to upper layer.
+ */
+int ldlm_lock_set_data(struct lustre_handle *lockh, void *data)
+{
+	struct ldlm_lock *lock = ldlm_handle2lock(lockh);
+	int rc = -EINVAL;	/* returned if no lock or other data is already set */
+	ENTRY;
+
+	if (lock) {
+		if (lock->l_ast_data == NULL)	/* never overwrite existing data */
+			lock->l_ast_data = data;
+		if (lock->l_ast_data == data)	/* newly set, or already the same */
+			rc = 0;
+		LDLM_LOCK_PUT(lock);	/* drop the ref from ldlm_handle2lock() */
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_lock_set_data);
+
+struct export_cl_data {	/* closure for ldlm_cancel_locks_for_export_cb() */
+	struct obd_export	*ecl_exp;	/* export named in the debug logs */
+	int			ecl_loop;	/* count of locks handled so far */
+};
+
+/**
+ * Iterator function for ldlm_cancel_locks_for_export.
+ * Cancels passed locks; \a data is a struct export_cl_data. Always returns 0.
+ */
+int ldlm_cancel_locks_for_export_cb(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+				    struct hlist_node *hnode, void *data)
+
+{
+	struct export_cl_data	*ecl = (struct export_cl_data *)data;
+	struct obd_export	*exp  = ecl->ecl_exp;
+	struct ldlm_lock     *lock = cfs_hash_object(hs, hnode);
+	struct ldlm_resource *res;
+
+	res = ldlm_resource_getref(lock->l_resource); /* pin res across cancel */
+	LDLM_LOCK_GET(lock);
+
+	LDLM_DEBUG(lock, "export %p", exp);
+	ldlm_res_lvbo_update(res, NULL, 1);
+	ldlm_lock_cancel(lock);
+	ldlm_reprocess_all(res);
+	ldlm_resource_putref(res);
+	LDLM_LOCK_RELEASE(lock);
+
+	ecl->ecl_loop++;
+	if ((ecl->ecl_loop & -ecl->ecl_loop) == ecl->ecl_loop) { /* power of two */
+		CDEBUG(D_INFO,
+		       "Cancel lock %p for export %p (loop %d), still have "
+		       "%d locks left on hash table.\n",
+		       lock, exp, ecl->ecl_loop,
+		       atomic_read(&hs->hs_count));
+	}
+
+	return 0;
+}
+
+/**
+ * Cancel all locks for given export.
+ *
+ * Typically called on client disconnection/eviction
+ */
+void ldlm_cancel_locks_for_export(struct obd_export *exp)
+{
+	struct export_cl_data	ecl = {
+		.ecl_exp	= exp,
+		.ecl_loop	= 0,	/* incremented once per lock by the callback */
+	};
+
+	cfs_hash_for_each_empty(exp->exp_lock_hash,
+				ldlm_cancel_locks_for_export_cb, &ecl);
+}
+
+/**
+ * Downgrade an exclusive lock.
+ *
+ * A fast variant of ldlm_lock_convert for conversion of exclusive
+ * locks. The conversion is always successful.
+ * Used by Commit on Sharing (COS) code.
+ *
+ * \param lock A lock to convert
+ * \param new_mode new lock mode
+ */
+void ldlm_lock_downgrade(struct ldlm_lock *lock, int new_mode)
+{
+	ENTRY;
+
+	LASSERT(lock->l_granted_mode & (LCK_PW | LCK_EX));
+	LASSERT(new_mode == LCK_COS);	/* LCK_COS is the only accepted target */
+
+	lock_res_and_lock(lock);
+	ldlm_resource_unlink_lock(lock);
+	/*
+	 * Remove the lock from pool as it will be added again in
+	 * ldlm_grant_lock() called below.
+	 */
+	ldlm_pool_del(&ldlm_lock_to_ns(lock)->ns_pool, lock);
+
+	lock->l_req_mode = new_mode;
+	ldlm_grant_lock(lock, NULL);
+	unlock_res_and_lock(lock);
+	ldlm_reprocess_all(lock->l_resource);
+
+	EXIT;
+}
+EXPORT_SYMBOL(ldlm_lock_downgrade);
+
+/**
+ * Attempt to convert already granted lock to a different mode.
+ *
+ * While lock conversion is not currently used, future client-side
+ * optimizations could take advantage of it to avoid discarding cached
+ * pages on a file. Returns the lock's resource, or NULL if allocation fails.
+ */
+struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
+					__u32 *flags)
+{
+	LIST_HEAD(rpc_list);
+	struct ldlm_resource *res;
+	struct ldlm_namespace *ns;
+	int granted = 0;
+	struct ldlm_interval *node;
+	ENTRY;
+
+	/* Just return if mode is unchanged. */
+	if (new_mode == lock->l_granted_mode) {
+		*flags |= LDLM_FL_BLOCK_GRANTED;
+		RETURN(lock->l_resource);
+	}
+
+	/* I can't check the type of lock here because the bitlock of lock
+	 * is not held here, so do the allocation blindly. -jay */
+	OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, __GFP_IO);
+	if (node == NULL)  /* Actually, this causes EDEADLOCK to be returned */
+		RETURN(NULL);
+
+	LASSERTF((new_mode == LCK_PW && lock->l_granted_mode == LCK_PR),
+		 "new_mode %u, granted %u\n", new_mode, lock->l_granted_mode);
+
+	lock_res_and_lock(lock);
+
+	res = lock->l_resource;
+	ns  = ldlm_res_to_ns(res);
+
+	lock->l_req_mode = new_mode;
+	if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS) {
+		ldlm_resource_unlink_lock(lock);
+	} else {
+		ldlm_resource_unlink_lock(lock);
+		if (res->lr_type == LDLM_EXTENT) {
+			/* FIXME: ugly code, I have to attach the lock to a
+			 * interval node again since perhaps it will be granted
+			 * soon */
+			INIT_LIST_HEAD(&node->li_group);
+			ldlm_interval_attach(node, lock);
+			node = NULL;	/* handed to the lock; skip the free below */
+		}
+	}
+
+	/*
+	 * Remove old lock from the pool before adding the lock with new
+	 * mode below in ->policy()
+	 */
+	ldlm_pool_del(&ns->ns_pool, lock);
+
+	/* If this is a local resource, put it on the appropriate list. */
+	if (ns_is_client(ldlm_res_to_ns(res))) {
+		if (*flags & (LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_GRANTED)) {
+			ldlm_resource_add_lock(res, &res->lr_converting, lock);
+		} else {
+			/* This should never happen, because of the way the
+			 * server handles conversions. */
+			LDLM_ERROR(lock, "Erroneous flags %x on local lock\n",
+				   *flags);
+			LBUG();
+
+			ldlm_grant_lock(lock, &rpc_list);
+			granted = 1;
+			/* FIXME: completion handling not with lr_lock held ! */
+			if (lock->l_completion_ast)
+				lock->l_completion_ast(lock, 0, NULL);
+		}
+	} else {
+		CERROR("This is client-side-only module, cannot handle "
+		       "LDLM_NAMESPACE_SERVER resource type lock.\n");
+		LBUG();
+	}
+	unlock_res_and_lock(lock);
+
+	if (granted)	/* only set on the branch that follows LBUG() above */
+		ldlm_run_ast_work(ns, &rpc_list, LDLM_WORK_CP_AST);
+	if (node)	/* interval node was not attached: give it back */
+		OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node));
+	RETURN(res);
+}
+EXPORT_SYMBOL(ldlm_lock_convert);
+
+/**
+ * Print lock with lock handle \a lockh description into debug log.
+ *
+ * Does nothing if \a level is not enabled or the handle resolves to no lock.
+ */
+void ldlm_lock_dump_handle(int level, struct lustre_handle *lockh)
+{
+	struct ldlm_lock *lock;
+
+	if (!((libcfs_debug | D_ERROR) & level))	/* D_ERROR always passes */
+		return;
+
+	lock = ldlm_handle2lock(lockh);
+	if (lock == NULL)
+		return;
+
+	LDLM_DEBUG_LIMIT(level, lock, "###");
+
+	LDLM_LOCK_PUT(lock);	/* drop the ref from ldlm_handle2lock() */
+}
+EXPORT_SYMBOL(ldlm_lock_dump_handle);
+
+/**
+ * Log \a fmt and its varargs via libcfs_debug_vmsg2(), followed by a summary of
+ * \a lock whose layout depends on the resource type (or "??" if no resource).
+ */
+void _ldlm_lock_debug(struct ldlm_lock *lock,
+		      struct libcfs_debug_msg_data *msgdata,
+		      const char *fmt, ...)
+{
+	va_list args;
+	struct obd_export *exp = lock->l_export;
+	struct ldlm_resource *resource = lock->l_resource;
+	char *nid = "local";	/* kept when no export/peer can be resolved */
+
+	va_start(args, fmt);
+
+	if (exp && exp->exp_connection) {
+		nid = libcfs_nid2str(exp->exp_connection->c_peer.nid);
+	} else if (exp && exp->exp_obd != NULL) {
+		struct obd_import *imp = exp->exp_obd->u.cli.cl_import;
+		nid = libcfs_nid2str(imp->imp_connection->c_peer.nid);
+	}
+
+	if (resource == NULL) {	/* no resource: print placeholders instead */
+		libcfs_debug_vmsg2(msgdata, fmt, args,
+		       " ns: \?\? lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
+		       "res: \?\? rrc=\?\? type: \?\?\? flags: "LPX64" nid: %s "
+		       "remote: "LPX64" expref: %d pid: %u timeout: %lu "
+		       "lvb_type: %d\n",
+		       lock,
+		       lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
+		       lock->l_readers, lock->l_writers,
+		       ldlm_lockname[lock->l_granted_mode],
+		       ldlm_lockname[lock->l_req_mode],
+		       lock->l_flags, nid, lock->l_remote_handle.cookie,
+		       exp ? atomic_read(&exp->exp_refcount) : -99,
+		       lock->l_pid, lock->l_callback_timeout, lock->l_lvb_type);
+		va_end(args);
+		return;
+	}
+
+	switch (resource->lr_type) {
+	case LDLM_EXTENT:
+		libcfs_debug_vmsg2(msgdata, fmt, args,
+		       " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
+		       "res: "LPU64"/"LPU64" rrc: %d type: %s ["LPU64"->"LPU64
+		       "] (req "LPU64"->"LPU64") flags: "LPX64" nid: %s remote:"
+		       " "LPX64" expref: %d pid: %u timeout: %lu lvb_type: %d\n",
+		       ldlm_lock_to_ns_name(lock), lock,
+		       lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
+		       lock->l_readers, lock->l_writers,
+		       ldlm_lockname[lock->l_granted_mode],
+		       ldlm_lockname[lock->l_req_mode],
+		       resource->lr_name.name[0],
+		       resource->lr_name.name[1],
+		       atomic_read(&resource->lr_refcount),
+		       ldlm_typename[resource->lr_type],
+		       lock->l_policy_data.l_extent.start,
+		       lock->l_policy_data.l_extent.end,
+		       lock->l_req_extent.start, lock->l_req_extent.end,
+		       lock->l_flags, nid, lock->l_remote_handle.cookie,
+		       exp ? atomic_read(&exp->exp_refcount) : -99,
+		       lock->l_pid, lock->l_callback_timeout, lock->l_lvb_type);
+		break;
+
+	case LDLM_FLOCK:
+		libcfs_debug_vmsg2(msgdata, fmt, args,
+		       " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
+		       "res: "LPU64"/"LPU64" rrc: %d type: %s pid: %d "
+		       "["LPU64"->"LPU64"] flags: "LPX64" nid: %s remote: "LPX64
+		       " expref: %d pid: %u timeout: %lu\n",
+		       ldlm_lock_to_ns_name(lock), lock,
+		       lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
+		       lock->l_readers, lock->l_writers,
+		       ldlm_lockname[lock->l_granted_mode],
+		       ldlm_lockname[lock->l_req_mode],
+		       resource->lr_name.name[0],
+		       resource->lr_name.name[1],
+		       atomic_read(&resource->lr_refcount),
+		       ldlm_typename[resource->lr_type],
+		       lock->l_policy_data.l_flock.pid,
+		       lock->l_policy_data.l_flock.start,
+		       lock->l_policy_data.l_flock.end,
+		       lock->l_flags, nid, lock->l_remote_handle.cookie,
+		       exp ? atomic_read(&exp->exp_refcount) : -99,
+		       lock->l_pid, lock->l_callback_timeout);
+		break;
+
+	case LDLM_IBITS:
+		libcfs_debug_vmsg2(msgdata, fmt, args,
+		       " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
+		       "res: "LPU64"/"LPU64" bits "LPX64" rrc: %d type: %s "
+		       "flags: "LPX64" nid: %s remote: "LPX64" expref: %d "
+		       "pid: %u timeout: %lu lvb_type: %d\n",
+		       ldlm_lock_to_ns_name(lock),
+		       lock, lock->l_handle.h_cookie,
+		       atomic_read (&lock->l_refc),
+		       lock->l_readers, lock->l_writers,
+		       ldlm_lockname[lock->l_granted_mode],
+		       ldlm_lockname[lock->l_req_mode],
+		       resource->lr_name.name[0],
+		       resource->lr_name.name[1],
+		       lock->l_policy_data.l_inodebits.bits,
+		       atomic_read(&resource->lr_refcount),
+		       ldlm_typename[resource->lr_type],
+		       lock->l_flags, nid, lock->l_remote_handle.cookie,
+		       exp ? atomic_read(&exp->exp_refcount) : -99,
+		       lock->l_pid, lock->l_callback_timeout, lock->l_lvb_type);
+		break;
+
+	default:
+		libcfs_debug_vmsg2(msgdata, fmt, args,
+		       " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
+		       "res: "LPU64"/"LPU64" rrc: %d type: %s flags: "LPX64" "
+		       "nid: %s remote: "LPX64" expref: %d pid: %u timeout: %lu"
+		       " lvb_type: %d\n",	/* leading space separates the fields */
+		       ldlm_lock_to_ns_name(lock),
+		       lock, lock->l_handle.h_cookie,
+		       atomic_read (&lock->l_refc),
+		       lock->l_readers, lock->l_writers,
+		       ldlm_lockname[lock->l_granted_mode],
+		       ldlm_lockname[lock->l_req_mode],
+		       resource->lr_name.name[0],
+		       resource->lr_name.name[1],
+		       atomic_read(&resource->lr_refcount),
+		       ldlm_typename[resource->lr_type],
+		       lock->l_flags, nid, lock->l_remote_handle.cookie,
+		       exp ? atomic_read(&exp->exp_refcount) : -99,
+		       lock->l_pid, lock->l_callback_timeout, lock->l_lvb_type);
+		break;
+	}
+	va_end(args);
+}
+EXPORT_SYMBOL(_ldlm_lock_debug);
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c
new file mode 100644
index 000000000000..324d5e4286dc
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c
@@ -0,0 +1,1238 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_lockd.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+# include <linux/libcfs/libcfs.h>
+
+#include <lustre_dlm.h>
+#include <obd_class.h>
+#include <linux/list.h>
+#include "ldlm_internal.h"
+
+static int ldlm_num_threads;
+CFS_MODULE_PARM(ldlm_num_threads, "i", int, 0444,
+		"number of DLM service threads to start");
+
+static char *ldlm_cpts;
+CFS_MODULE_PARM(ldlm_cpts, "s", charp, 0444,
+		"CPU partitions ldlm threads should run on");
+
+extern struct kmem_cache *ldlm_resource_slab;
+extern struct kmem_cache *ldlm_lock_slab;
+static struct mutex	ldlm_ref_mutex;
+static int ldlm_refcount;
+
+struct ldlm_cb_async_args {
+	struct ldlm_cb_set_arg *ca_set_arg;
+	struct ldlm_lock       *ca_lock;
+};
+
+/* LDLM state */
+
+static struct ldlm_state *ldlm_state;
+
+inline cfs_time_t round_timeout(cfs_time_t timeout)
+{	/* presumably: whole seconds of timeout plus one - confirm cfs_* helpers */
+	return cfs_time_seconds((int)cfs_duration_sec(cfs_time_sub(timeout, 0)) + 1);
+}
+
+/* timeout for initial callback (AST) reply (bz10399); never less than 1 */
+static inline unsigned int ldlm_get_rq_timeout(void)
+{
+	/* Non-AT value: the smaller of ldlm_timeout and a third of obd_timeout */
+	unsigned int timeout = min(ldlm_timeout, obd_timeout / 3);
+
+	return timeout < 1 ? 1 : timeout;
+}
+
+#define ELT_STOPPED   0
+#define ELT_READY     1
+#define ELT_TERMINATE 2
+
+struct ldlm_bl_pool {
+	spinlock_t		blp_lock;	/* held while queueing on the lists */
+
+	/*
+	 * blp_prio_list is used for callbacks that should be handled
+	 * as a priority. It is used for LDLM_FL_DISCARD_DATA requests.
+	 * see bug 13843
+	 */
+	struct list_head	      blp_prio_list;
+
+	/*
+	 * blp_list is used for all other callbacks which are likely
+	 * to take longer to process.
+	 */
+	struct list_head	      blp_list;
+
+	wait_queue_head_t	     blp_waitq;	/* woken after an item is queued */
+	struct completion	blp_comp;
+	atomic_t	    blp_num_threads;
+	atomic_t	    blp_busy_threads;
+	int		     blp_min_threads;
+	int		     blp_max_threads;
+};
+
+struct ldlm_bl_work_item {
+	struct list_head	      blwi_entry;	/* link on blp_list/blp_prio_list */
+	struct ldlm_namespace  *blwi_ns;
+	struct ldlm_lock_desc   blwi_ld;	/* copy of the caller's descriptor */
+	struct ldlm_lock       *blwi_lock;	/* single lock, set when count == 0 */
+	struct list_head	      blwi_head;	/* cancel list, used when count != 0 */
+	int		     blwi_count;	/* number of locks on blwi_head */
+	struct completion	blwi_comp;	/* waited on by non-LCF_ASYNC callers */
+	ldlm_cancel_flags_t     blwi_flags;
+	int		     blwi_mem_pressure;	/* 1 if memory_pressure_get() at init */
+};
+
+
+int ldlm_del_waiting_lock(struct ldlm_lock *lock)
+{	/* stub in this build: does nothing with lock */
+	RETURN(0);
+}
+
+int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout)
+{	/* stub in this build: both arguments are ignored */
+	RETURN(0);
+}
+
+
+
+/**
+ * Callback handler for receiving incoming blocking ASTs.
+ *
+ * This can only happen on client side. Releases one reference on \a lock.
+ */
+void ldlm_handle_bl_callback(struct ldlm_namespace *ns,
+			     struct ldlm_lock_desc *ld, struct ldlm_lock *lock)
+{
+	int do_ast;
+	ENTRY;
+
+	LDLM_DEBUG(lock, "client blocking AST callback handler");
+
+	lock_res_and_lock(lock);
+	lock->l_flags |= LDLM_FL_CBPENDING;
+
+	if (lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK)
+		lock->l_flags |= LDLM_FL_CANCEL;
+
+	do_ast = (!lock->l_readers && !lock->l_writers); /* unused locally now? */
+	unlock_res_and_lock(lock);
+
+	if (do_ast) {
+		CDEBUG(D_DLMTRACE, "Lock %p already unused, calling callback (%p)\n",
+		       lock, lock->l_blocking_ast);
+		if (lock->l_blocking_ast != NULL)
+			lock->l_blocking_ast(lock, ld, lock->l_ast_data,
+					     LDLM_CB_BLOCKING);
+	} else {
+		CDEBUG(D_DLMTRACE, "Lock %p is referenced, will be cancelled later\n",
+		       lock);
+	}
+
+	LDLM_DEBUG(lock, "client blocking callback handler END");
+	LDLM_LOCK_RELEASE(lock);
+	EXIT;
+}
+
+/**
+ * Callback handler for receiving incoming completion ASTs.
+ *
+ * This only can happen on client side. Releases one reference on \a lock.
+ */
+static void ldlm_handle_cp_callback(struct ptlrpc_request *req,
+				    struct ldlm_namespace *ns,
+				    struct ldlm_request *dlm_req,
+				    struct ldlm_lock *lock)
+{
+	int lvb_len;
+	LIST_HEAD(ast_list);
+	int rc = 0;
+	ENTRY;
+
+	LDLM_DEBUG(lock, "client completion callback handler START");
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) {
+		int to = cfs_time_seconds(1);
+		while (to > 0) { /* NOTE(review): 'to' never changes; exits via break */
+			schedule_timeout_and_set_state(
+				TASK_INTERRUPTIBLE, to);
+			if (lock->l_granted_mode == lock->l_req_mode ||
+			    lock->l_destroyed)
+				break;
+		}
+	}
+
+	lvb_len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB, RCL_CLIENT);
+	if (lvb_len < 0) {
+		LDLM_ERROR(lock, "Fail to get lvb_len, rc = %d", lvb_len);
+		GOTO(out, rc = lvb_len);
+	} else if (lvb_len > 0) {
+		if (lock->l_lvb_len > 0) {
+			/* for extent lock, lvb contains ost_lvb{}. */
+			LASSERT(lock->l_lvb_data != NULL);
+
+			if (unlikely(lock->l_lvb_len < lvb_len)) {
+				LDLM_ERROR(lock, "Replied LVB is larger than "
+					   "expectation, expected = %d, "
+					   "replied = %d",
+					   lock->l_lvb_len, lvb_len);
+				GOTO(out, rc = -EINVAL);
+			}
+		} else if (ldlm_has_layout(lock)) { /* for layout lock, lvb has
+						     * variable length */
+			void *lvb_data;
+
+			OBD_ALLOC(lvb_data, lvb_len);
+			if (lvb_data == NULL) {
+				LDLM_ERROR(lock, "No memory: %d.\n", lvb_len);
+				GOTO(out, rc = -ENOMEM);
+			}
+
+			lock_res_and_lock(lock);
+			LASSERT(lock->l_lvb_data == NULL);
+			lock->l_lvb_data = lvb_data;
+			lock->l_lvb_len = lvb_len;
+			unlock_res_and_lock(lock);
+		}
+	}
+
+	lock_res_and_lock(lock);
+	if (lock->l_destroyed ||
+	    lock->l_granted_mode == lock->l_req_mode) {
+		/* bug 11300: the lock has already been granted */
+		unlock_res_and_lock(lock);
+		LDLM_DEBUG(lock, "Double grant race happened");
+		GOTO(out, rc = 0);
+	}
+
+	/* If we receive the completion AST before the actual enqueue returned,
+	 * then we might need to switch lock modes, resources, or extents. */
+	if (dlm_req->lock_desc.l_granted_mode != lock->l_req_mode) {
+		lock->l_req_mode = dlm_req->lock_desc.l_granted_mode;
+		LDLM_DEBUG(lock, "completion AST, new lock mode");
+	}
+
+	if (lock->l_resource->lr_type != LDLM_PLAIN) {
+		ldlm_convert_policy_to_local(req->rq_export,
+					  dlm_req->lock_desc.l_resource.lr_type,
+					  &dlm_req->lock_desc.l_policy_data,
+					  &lock->l_policy_data);
+		LDLM_DEBUG(lock, "completion AST, new policy data");
+	}
+
+	ldlm_resource_unlink_lock(lock);
+	if (memcmp(&dlm_req->lock_desc.l_resource.lr_name,
+		   &lock->l_resource->lr_name,
+		   sizeof(lock->l_resource->lr_name)) != 0) {
+		unlock_res_and_lock(lock); /* resource change runs unlocked */
+		rc = ldlm_lock_change_resource(ns, lock,
+				&dlm_req->lock_desc.l_resource.lr_name);
+		if (rc < 0) {
+			LDLM_ERROR(lock, "Failed to allocate resource");
+			GOTO(out, rc);
+		}
+		LDLM_DEBUG(lock, "completion AST, new resource");
+		CERROR("change resource!\n");
+		lock_res_and_lock(lock);
+	}
+
+	if (dlm_req->lock_flags & LDLM_FL_AST_SENT) {
+		/* BL_AST locks are not needed in LRU.
+		 * Let ldlm_cancel_lru() be fast. */
+		ldlm_lock_remove_from_lru(lock);
+		lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST;
+		LDLM_DEBUG(lock, "completion AST includes blocking AST");
+	}
+
+	if (lock->l_lvb_len > 0) {
+		rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_CLIENT,
+				   lock->l_lvb_data, lvb_len);
+		if (rc < 0) {
+			unlock_res_and_lock(lock);
+			GOTO(out, rc);
+		}
+	}
+
+	ldlm_grant_lock(lock, &ast_list);
+	unlock_res_and_lock(lock);
+
+	LDLM_DEBUG(lock, "callback handler finished, about to run_ast_work");
+
+	/* Give enqueue time to call osc_lock_upcall() and initialize
+	 * l_ast_data (fault-injection delay) */
+	OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 2);
+
+	ldlm_run_ast_work(ns, &ast_list, LDLM_WORK_CP_AST);
+
+	LDLM_DEBUG_NOLOCK("client completion callback handler END (lock %p)",
+			  lock);
+	GOTO(out, rc);
+
+out:
+	if (rc < 0) {	/* on error, mark the lock failed and wake its waiters */
+		lock_res_and_lock(lock);
+		lock->l_flags |= LDLM_FL_FAILED;
+		unlock_res_and_lock(lock);
+		wake_up(&lock->l_waitq);
+	}
+	LDLM_LOCK_RELEASE(lock);
+}
+
+/**
+ * Callback handler for receiving incoming glimpse ASTs.
+ *
+ * This only can happen on client side.  After handling the glimpse AST
+ * we also consider dropping the lock here if it is unused locally for a
+ * long time (granted LCK_PW, no readers/writers, last used over 10s ago).
+ */
+static void ldlm_handle_gl_callback(struct ptlrpc_request *req,
+				    struct ldlm_namespace *ns,
+				    struct ldlm_request *dlm_req,
+				    struct ldlm_lock *lock)
+{
+	int rc = -ENOSYS;	/* status used when the lock has no glimpse AST */
+	ENTRY;
+
+	LDLM_DEBUG(lock, "client glimpse AST callback handler");
+
+	if (lock->l_glimpse_ast != NULL)
+		rc = lock->l_glimpse_ast(lock, req);
+
+	if (req->rq_repmsg != NULL) {
+		ptlrpc_reply(req);
+	} else {	/* no reply message was packed: answer with an error */
+		req->rq_status = rc;
+		ptlrpc_error(req);
+	}
+
+	lock_res_and_lock(lock);
+	if (lock->l_granted_mode == LCK_PW &&
+	    !lock->l_readers && !lock->l_writers &&
+	    cfs_time_after(cfs_time_current(),
+			   cfs_time_add(lock->l_last_used,
+					cfs_time_seconds(10)))) {
+		unlock_res_and_lock(lock);
+		if (ldlm_bl_to_thread_lock(ns, NULL, lock)) /* queueing failed */
+			ldlm_handle_bl_callback(ns, NULL, lock);
+
+		EXIT;
+		return;	/* no release here; ldlm_handle_bl_callback() drops the ref */
+	}
+	unlock_res_and_lock(lock);
+	LDLM_LOCK_RELEASE(lock);
+	EXIT;
+}
+
+static int ldlm_callback_reply(struct ptlrpc_request *req, int rc)
+{	/* reply to req with status rc, packing an empty reply first if needed */
+	if (req->rq_no_reply)
+		return 0;	/* request wants no reply at all */
+
+	req->rq_status = rc;
+	if (!req->rq_packed_final) {
+		rc = lustre_pack_reply(req, 1, NULL, NULL);
+		if (rc)
+			return rc;	/* packing failed: nothing is sent */
+	}
+	return ptlrpc_reply(req);
+}
+
+static int __ldlm_bl_to_thread(struct ldlm_bl_work_item *blwi,
+			       ldlm_cancel_flags_t cancel_flags)
+{	/* queue blwi on the blocking pool and wake a waiter; always returns 0 */
+	struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool;
+	ENTRY;
+
+	spin_lock(&blp->blp_lock);
+	if (blwi->blwi_lock &&
+	    blwi->blwi_lock->l_flags & LDLM_FL_DISCARD_DATA) {
+		/* add LDLM_FL_DISCARD_DATA requests to the priority list */
+		list_add_tail(&blwi->blwi_entry, &blp->blp_prio_list);
+	} else {
+		/* other blocking callbacks are added to the regular list */
+		list_add_tail(&blwi->blwi_entry, &blp->blp_list);
+	}
+	spin_unlock(&blp->blp_lock);
+
+	wake_up(&blp->blp_waitq);
+
+	/* can not check blwi->blwi_flags as blwi could be already freed in
+	   LCF_ASYNC mode */
+	if (!(cancel_flags & LCF_ASYNC))	/* synchronous callers block here */
+		wait_for_completion(&blwi->blwi_comp);
+
+	RETURN(0);
+}
+
+static inline void init_blwi(struct ldlm_bl_work_item *blwi,
+			     struct ldlm_namespace *ns,
+			     struct ldlm_lock_desc *ld,
+			     struct list_head *cancels, int count,
+			     struct ldlm_lock *lock,
+			     ldlm_cancel_flags_t cancel_flags)
+{
+	/* Fill in a work item carrying either one lock or a list of locks. */
+	blwi->blwi_ns = ns;
+	blwi->blwi_flags = cancel_flags;
+	init_completion(&blwi->blwi_comp);
+	INIT_LIST_HEAD(&blwi->blwi_head);
+	if (memory_pressure_get())
+		blwi->blwi_mem_pressure = 1;
+	if (ld != NULL)
+		blwi->blwi_ld = *ld;
+	if (!count) {
+		blwi->blwi_lock = lock;
+		return;
+	}
+	/* splice the caller's list onto blwi_head, leaving \a cancels empty */
+	list_add(&blwi->blwi_head, cancels);
+	list_del_init(cancels);
+	blwi->blwi_count = count;
+}
+
+/**
+ * Hand work to the blocking thread pool: either the \a count locks on the
+ * list \a cancels or, when \a count is zero, the single lock \a lock.
+ *
+ * A blocking thread later invokes the ->l_blocking_ast callback for the
+ * queued lock(s).  A non-zero return means nothing was queued and the
+ * caller is expected to call ->l_blocking_ast itself.  An empty \a cancels
+ * list is a successful no-op.
+ */
+static int ldlm_bl_to_thread(struct ldlm_namespace *ns,
+			     struct ldlm_lock_desc *ld,
+			     struct ldlm_lock *lock,
+			     struct list_head *cancels, int count,
+			     ldlm_cancel_flags_t cancel_flags)
+{
+	struct ldlm_bl_work_item on_stack;
+	struct ldlm_bl_work_item *blwi;
+	ENTRY;
+
+	/* an empty cancel list means there is nothing to queue */
+	if (cancels && count == 0)
+		RETURN(0);
+
+	if (!(cancel_flags & LCF_ASYNC)) {
+		/* A synchronous call may come from the kernel shrinker, so
+		 * keep the work item on the stack instead of allocating it.
+		 */
+		blwi = &on_stack;
+		memset(blwi, 0, sizeof(*blwi));
+	} else {
+		/* freed by the blocking thread once it has been processed */
+		OBD_ALLOC(blwi, sizeof(*blwi));
+		if (blwi == NULL)
+			RETURN(-ENOMEM);
+	}
+
+	init_blwi(blwi, ns, ld, cancels, count, lock, cancel_flags);
+
+	RETURN(__ldlm_bl_to_thread(blwi, cancel_flags));
+}
+
+/* Wrappers around ldlm_bl_to_thread(): a single lock (LCF_ASYNC) or a list. */
+int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld,
+			   struct ldlm_lock *lock)
+{
+	return ldlm_bl_to_thread(ns, ld, lock, NULL, 0, LCF_ASYNC);
+}
+
+int ldlm_bl_to_thread_list(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld,
+			   struct list_head *cancels, int count,
+			   ldlm_cancel_flags_t cancel_flags)
+{
+	return ldlm_bl_to_thread(ns, ld, NULL, cancels, count, cancel_flags);
+}
+
+/* Handle a set_info request sent by a server (e.g. MDT) to a client (MDC). */
+static int ldlm_handle_setinfo(struct ptlrpc_request *req)
+{
+	struct obd_device *obd = req->rq_export->exp_obd;
+	char *key;
+	void *val;
+	int keylen, vallen;
+	int rc = -ENOSYS;
+	ENTRY;
+
+	DEBUG_REQ(D_HSM, req, "%s: handle setinfo\n", obd->obd_name);
+
+	req_capsule_set(&req->rq_pill, &RQF_OBD_SET_INFO);
+
+	key = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
+	if (key == NULL) {
+		DEBUG_REQ(D_IOCTL, req, "no set_info key");
+		RETURN(-EFAULT);
+	}
+	keylen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_KEY,
+				      RCL_CLIENT);
+	val = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL);
+	if (val == NULL) {
+		DEBUG_REQ(D_IOCTL, req, "no set_info val");
+		RETURN(-EFAULT);
+	}
+	vallen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_VAL,
+				      RCL_CLIENT);
+
+	/* We are responsible for swabbing contents of val */
+	if (KEY_IS(KEY_HSM_COPYTOOL_SEND))
+		/* Pass it on to mdc (the "export" in this case) */
+		rc = obd_set_info_async(req->rq_svc_thread->t_env,
+					req->rq_export,
+					sizeof(KEY_HSM_COPYTOOL_SEND),
+					KEY_HSM_COPYTOOL_SEND,
+					vallen, val, NULL);
+	else
+		DEBUG_REQ(D_WARNING, req, "ignoring unknown key %.*s",
+			  keylen, key);	/* wire data: not NUL-checked */
+
+	RETURN(rc);
+}
+
+static inline void ldlm_callback_errmsg(struct ptlrpc_request *req,
+					const char *msg, int rc,
+					struct lustre_handle *handle)
+{
+	int level = (req->rq_no_reply || rc) ? D_WARNING : D_DLMTRACE;
+
+	DEBUG_REQ(level, req, "%s: [nid %s] [rc %d] [lock "LPX64"]", msg,
+		  libcfs_id2str(req->rq_peer), rc, handle ? handle->cookie : 0);
+	if (req->rq_no_reply)
+		CWARN("No reply was sent, maybe cause bug 21636.\n");
+	else if (rc)
+		CWARN("Send reply failed, maybe cause bug 21636.\n");
+}
+
+static int ldlm_handle_qc_callback(struct ptlrpc_request *req)
+{
+	struct obd_quotactl *oqctl;
+	struct client_obd *cli = &req->rq_export->exp_obd->u.cli;
+
+	oqctl = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+	if (oqctl == NULL) {
+		CERROR("Can't unpack obd_quotactl\n");
+		return -EPROTO;
+	}
+	/* store the quota-check status carried by the request */
+	cli->cl_qchk_stat = oqctl->qc_stat;
+	return 0;
+}
+
+/* TODO: handle requests in a similar way as MDT: see mdt_handle_common() */
+static int ldlm_callback_handler(struct ptlrpc_request *req)
+{
+	struct ldlm_namespace *ns;
+	struct ldlm_request *dlm_req;
+	struct ldlm_lock *lock;
+	int rc;
+	ENTRY;
+
+	/* Entry point for requests on the LDLM callback service.  Requests
+	 * arrive in sender's byte order.  The ptlrpc service handler has
+	 * already checked and, if needed, byte-swapped the message body, but
+	 * this handler is responsible for the message buffers. */
+
+	/* nothing to do for a security-context finalize request */
+	if (lustre_msg_get_opc(req->rq_reqmsg) == SEC_CTX_FINI)
+		RETURN(0);
+
+	req_capsule_init(&req->rq_pill, req, RCL_SERVER);
+
+	if (req->rq_export == NULL) {
+		rc = ldlm_callback_reply(req, -ENOTCONN);
+		ldlm_callback_errmsg(req, "Operate on unconnected server",
+				     rc, NULL);
+		RETURN(0);
+	}
+
+	LASSERT(req->rq_export != NULL);
+	LASSERT(req->rq_export->exp_obd != NULL);
+
+	switch (lustre_msg_get_opc(req->rq_reqmsg)) {
+	case LDLM_BL_CALLBACK:
+		if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET))
+			RETURN(0);
+		break;
+	case LDLM_CP_CALLBACK:
+		if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CP_CALLBACK_NET))
+			RETURN(0);
+		break;
+	case LDLM_GL_CALLBACK:
+		if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_GL_CALLBACK_NET))
+			RETURN(0);
+		break;
+	case LDLM_SET_INFO:
+		rc = ldlm_handle_setinfo(req);
+		ldlm_callback_reply(req, rc);
+		RETURN(0);
+	case OBD_LOG_CANCEL: /* remove this eventually - for 1.4.0 compat */
+		CERROR("shouldn't be handling OBD_LOG_CANCEL on DLM thread\n");
+		req_capsule_set(&req->rq_pill, &RQF_LOG_CANCEL);
+		if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOG_CANCEL_NET))
+			RETURN(0);
+		rc = llog_origin_handle_cancel(req);
+		if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOG_CANCEL_REP))
+			RETURN(0);
+		ldlm_callback_reply(req, rc);
+		RETURN(0);
+	case LLOG_ORIGIN_HANDLE_CREATE:
+		req_capsule_set(&req->rq_pill, &RQF_LLOG_ORIGIN_HANDLE_CREATE);
+		if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET))
+			RETURN(0);
+		rc = llog_origin_handle_open(req);
+		ldlm_callback_reply(req, rc);
+		RETURN(0);
+	case LLOG_ORIGIN_HANDLE_NEXT_BLOCK:
+		req_capsule_set(&req->rq_pill,
+				&RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK);
+		if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET))
+			RETURN(0);
+		rc = llog_origin_handle_next_block(req);
+		ldlm_callback_reply(req, rc);
+		RETURN(0);
+	case LLOG_ORIGIN_HANDLE_READ_HEADER:
+		req_capsule_set(&req->rq_pill,
+				&RQF_LLOG_ORIGIN_HANDLE_READ_HEADER);
+		if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET))
+			RETURN(0);
+		rc = llog_origin_handle_read_header(req);
+		ldlm_callback_reply(req, rc);
+		RETURN(0);
+	case LLOG_ORIGIN_HANDLE_CLOSE:
+		if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET))
+			RETURN(0);
+		rc = llog_origin_handle_close(req);
+		ldlm_callback_reply(req, rc);
+		RETURN(0);
+	case OBD_QC_CALLBACK:
+		req_capsule_set(&req->rq_pill, &RQF_QC_CALLBACK);
+		if (OBD_FAIL_CHECK(OBD_FAIL_OBD_QC_CALLBACK_NET))
+			RETURN(0);
+		rc = ldlm_handle_qc_callback(req);
+		ldlm_callback_reply(req, rc);
+		RETURN(0);
+	default:
+		CERROR("unknown opcode %u\n",
+		       lustre_msg_get_opc(req->rq_reqmsg));
+		ldlm_callback_reply(req, -EPROTO);
+		RETURN(0);
+	}
+	/* Only the BL/CP/GL lock callbacks fall out of the switch to here. */
+	ns = req->rq_export->exp_obd->obd_namespace;
+	LASSERT(ns != NULL);
+
+	req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK);
+
+	dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+	if (dlm_req == NULL) {
+		rc = ldlm_callback_reply(req, -EPROTO);
+		ldlm_callback_errmsg(req, "Operate without parameter", rc,
+				     NULL);
+		RETURN(0);
+	}
+
+	/* Fault injection: force a known safe race by sending a cancel to the
+	 * server for a lock it has already started a blocking callback on. */
+	if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE) &&
+	    lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) {
+		rc = ldlm_cli_cancel(&dlm_req->lock_handle[0], 0);
+		if (rc < 0)
+			CERROR("ldlm_cli_cancel: %d\n", rc);
+	}
+
+	lock = ldlm_handle2lock_long(&dlm_req->lock_handle[0], 0);
+	if (!lock) {
+		CDEBUG(D_DLMTRACE, "callback on lock "LPX64" - lock "
+		       "disappeared\n", dlm_req->lock_handle[0].cookie);
+		rc = ldlm_callback_reply(req, -EINVAL);
+		ldlm_callback_errmsg(req, "Operate with invalid parameter", rc,
+				     &dlm_req->lock_handle[0]);
+		RETURN(0);
+	}
+
+	if ((lock->l_flags & LDLM_FL_FAIL_LOC) &&
+	    lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK)
+		OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE);
+
+	/* Copy hints/flags (e.g. LDLM_FL_DISCARD_DATA) from AST. */
+	lock_res_and_lock(lock);
+	lock->l_flags |= ldlm_flags_from_wire(dlm_req->lock_flags &
+					      LDLM_AST_FLAGS);
+	if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) {
+		/* If the lock is already being cancelled and its cache has
+		 * been dropped, or the lock failed before the cp_ast reached
+		 * the client, tell the server we have no lock.  Otherwise,
+		 * send the cancel after dropping the cache. */
+		if (((lock->l_flags & LDLM_FL_CANCELING) &&
+		    (lock->l_flags & LDLM_FL_BL_DONE)) ||
+		    (lock->l_flags & LDLM_FL_FAILED)) {
+			LDLM_DEBUG(lock, "callback on lock "
+				   LPX64" - lock disappeared\n",
+				   dlm_req->lock_handle[0].cookie);
+			unlock_res_and_lock(lock);
+			LDLM_LOCK_RELEASE(lock);
+			rc = ldlm_callback_reply(req, -EINVAL);
+			ldlm_callback_errmsg(req, "Operate on stale lock", rc,
+					     &dlm_req->lock_handle[0]);
+			RETURN(0);
+		}
+		/* BL_AST locks are not needed in LRU.
+		 * Let ldlm_cancel_lru() be fast. */
+		ldlm_lock_remove_from_lru(lock);
+		lock->l_flags |= LDLM_FL_BL_AST;
+	}
+	unlock_res_and_lock(lock);
+
+	/* We want the ost thread to get this reply so that it can respond
+	 * to ost requests (write cache writeback) that might be triggered
+	 * in the callback.
+	 *
+	 * But we'd also like to be able to indicate in the reply that we're
+	 * cancelling right now, because it's unused, or have an intent result
+	 * in the reply, so we might have to push the responsibility for sending
+	 * the reply down into the AST handlers, alas. */
+
+	switch (lustre_msg_get_opc(req->rq_reqmsg)) {
+	case LDLM_BL_CALLBACK:
+		CDEBUG(D_INODE, "blocking ast\n");
+		req_capsule_extend(&req->rq_pill, &RQF_LDLM_BL_CALLBACK);
+		if (!(lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK)) {
+			rc = ldlm_callback_reply(req, 0);
+			if (req->rq_no_reply || rc)
+				ldlm_callback_errmsg(req, "Normal process", rc,
+						     &dlm_req->lock_handle[0]);
+		}
+		if (ldlm_bl_to_thread_lock(ns, &dlm_req->lock_desc, lock))
+			ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock);
+		break;
+	case LDLM_CP_CALLBACK:
+		CDEBUG(D_INODE, "completion ast\n");
+		req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK);
+		ldlm_callback_reply(req, 0);
+		ldlm_handle_cp_callback(req, ns, dlm_req, lock);
+		break;
+	case LDLM_GL_CALLBACK:
+		CDEBUG(D_INODE, "glimpse ast\n");
+		req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK);
+		ldlm_handle_gl_callback(req, ns, dlm_req, lock);
+		break;
+	default:
+		LBUG();			 /* checked above */
+	}
+
+	RETURN(0);
+}
+
+/* Dequeue the next work item, or return NULL when both lists are empty. */
+static struct ldlm_bl_work_item *ldlm_bl_get_work(struct ldlm_bl_pool *blp)
+{
+	static unsigned int num_bl = 0;
+	struct list_head *src = NULL;
+	struct ldlm_bl_work_item *blwi;
+
+	spin_lock(&blp->blp_lock);
+	/* process a request from the blp_list at least every blp_num_threads */
+	if (!list_empty(&blp->blp_list) &&
+	    (list_empty(&blp->blp_prio_list) || num_bl == 0))
+		src = &blp->blp_list;
+	else if (!list_empty(&blp->blp_prio_list))
+		src = &blp->blp_prio_list;
+
+	if (src == NULL) {
+		spin_unlock(&blp->blp_lock);
+		return NULL;
+	}
+	blwi = list_entry(src->next, struct ldlm_bl_work_item, blwi_entry);
+	if (++num_bl >= atomic_read(&blp->blp_num_threads))
+		num_bl = 0;
+	list_del(&blwi->blwi_entry);
+	spin_unlock(&blp->blp_lock);
+
+	return blwi;
+}
+
+/* Start-up arguments on the starter's stack; valid until bltd_comp fires */
+struct ldlm_bl_thread_data {
+	char			bltd_name[CFS_CURPROC_COMM_MAX];
+	struct ldlm_bl_pool	*bltd_blp;	/* pool the thread serves */
+	struct completion	bltd_comp;	/* completed by the new thread */
+	int			bltd_num;	/* number used in bltd_name */
+};
+
+static int ldlm_bl_thread_main(void *arg);
+
+/* Start one blocking thread on \a blp; returns once it is running. */
+static int ldlm_bl_thread_start(struct ldlm_bl_pool *blp)
+{
+	struct ldlm_bl_thread_data bltd = { .bltd_blp = blp };
+	task_t *task;
+
+	init_completion(&bltd.bltd_comp);
+	bltd.bltd_num = atomic_read(&blp->blp_num_threads);
+	snprintf(bltd.bltd_name, sizeof(bltd.bltd_name) - 1,
+		"ldlm_bl_%02d", bltd.bltd_num);
+	task = kthread_run(ldlm_bl_thread_main, &bltd, "%s", bltd.bltd_name);
+	if (IS_ERR(task)) {
+		CERROR("cannot start LDLM thread ldlm_bl_%02d: rc %ld\n",
+		       bltd.bltd_num, PTR_ERR(task));
+		return PTR_ERR(task);
+	}
+	wait_for_completion(&bltd.bltd_comp);	/* bltd is on our stack */
+	return 0;
+}
+
+/**
+ * Main loop of a blocking-request processing thread.
+ *
+ * Work items are queued by ldlm_bl_to_thread().  Each item is either a
+ * single lock, handled via ldlm_handle_bl_callback(), or a list of locks
+ * to cancel.  An item with a NULL namespace tells the thread to exit.
+ */
+static int ldlm_bl_thread_main(void *arg)
+{
+	struct ldlm_bl_pool *blp;
+	ENTRY;
+
+	{
+		struct ldlm_bl_thread_data *bltd = arg;
+
+		blp = bltd->bltd_blp;
+
+		atomic_inc(&blp->blp_num_threads);
+		atomic_inc(&blp->blp_busy_threads);
+
+		complete(&bltd->bltd_comp);
+		/* cannot use bltd after this, it is only on caller's stack */
+	}
+
+	while (1) {
+		struct l_wait_info lwi = { 0 };
+		struct ldlm_bl_work_item *blwi = NULL;
+		int busy;
+
+		blwi = ldlm_bl_get_work(blp);
+
+		if (blwi == NULL) {
+			atomic_dec(&blp->blp_busy_threads);
+			l_wait_event_exclusive(blp->blp_waitq,
+					 (blwi = ldlm_bl_get_work(blp)) != NULL,
+					 &lwi);
+			busy = atomic_inc_return(&blp->blp_busy_threads);
+		} else {
+			busy = atomic_read(&blp->blp_busy_threads);
+		}
+
+		if (blwi->blwi_ns == NULL)
+			/* stop marker queued by ldlm_cleanup() */
+			break;
+
+		/* Not fatal if racy and have a few too many threads */
+		if (unlikely(busy < blp->blp_max_threads &&
+			     busy >= atomic_read(&blp->blp_num_threads) &&
+			     !blwi->blwi_mem_pressure))
+			/* discard the return value, we tried */
+			ldlm_bl_thread_start(blp);
+
+		if (blwi->blwi_mem_pressure)
+			memory_pressure_set();
+
+		if (blwi->blwi_count) {
+			int count;
+			/* The special case when we cancel locks in LRU
+			 * asynchronously, we pass the list of locks here.
+			 * Thus locks are marked LDLM_FL_CANCELING, but NOT
+			 * canceled locally yet. */
+			count = ldlm_cli_cancel_list_local(&blwi->blwi_head,
+							   blwi->blwi_count,
+							   LCF_BL_AST);
+			ldlm_cli_cancel_list(&blwi->blwi_head, count, NULL,
+					     blwi->blwi_flags);
+		} else {
+			ldlm_handle_bl_callback(blwi->blwi_ns, &blwi->blwi_ld,
+						blwi->blwi_lock);
+		}
+		if (blwi->blwi_mem_pressure)
+			memory_pressure_clr();
+
+		if (blwi->blwi_flags & LCF_ASYNC)
+			OBD_FREE(blwi, sizeof(*blwi));	/* async: free it */
+		else
+			complete(&blwi->blwi_comp);	/* wake submitter */
+	}
+
+	atomic_dec(&blp->blp_busy_threads);
+	atomic_dec(&blp->blp_num_threads);
+	complete(&blp->blp_comp);
+	RETURN(0);
+}
+
+
+static int ldlm_setup(void);
+static int ldlm_cleanup(void);
+
+/* Take a reference on the LDLM subsystem; the first one runs ldlm_setup(). */
+int ldlm_get_ref(void)
+{
+	int err = 0;
+	ENTRY;
+	mutex_lock(&ldlm_ref_mutex);
+	ldlm_refcount++;
+	if (ldlm_refcount == 1)
+		err = ldlm_setup();
+	if (err)
+		ldlm_refcount--;	/* setup failed: undo the increment */
+	mutex_unlock(&ldlm_ref_mutex);
+	RETURN(err);
+}
+EXPORT_SYMBOL(ldlm_get_ref);
+
+/* Drop a reference; the last one runs ldlm_cleanup() (kept if it fails). */
+void ldlm_put_ref(void)
+{
+	int err = 0;
+	ENTRY;
+	mutex_lock(&ldlm_ref_mutex);
+	if (ldlm_refcount == 1)
+		err = ldlm_cleanup();
+
+	if (err == 0)
+		ldlm_refcount--;
+	else
+		CERROR("ldlm_cleanup failed: %d\n", err);
+	mutex_unlock(&ldlm_ref_mutex);
+
+	EXIT;
+}
+EXPORT_SYMBOL(ldlm_put_ref);
+
+/* Export handle<->lock hash operations: keyed by the remote lock handle. */
+static unsigned
+ldlm_export_lock_hash(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+	const struct lustre_handle *handle = key;
+
+	return cfs_hash_u64_hash(handle->cookie, mask);
+}
+
+/* hs_key: the key of a hashed lock is its remote handle */
+static void *
+ldlm_export_lock_key(struct hlist_node *hnode)
+{
+	struct ldlm_lock *lk = hlist_entry(hnode, struct ldlm_lock, l_exp_hash);
+
+	return &lk->l_remote_handle;
+}
+
+/* hs_keycpy: overwrite the lock's remote handle with \a key */
+static void
+ldlm_export_lock_keycpy(struct hlist_node *hnode, void *key)
+{
+	struct ldlm_lock *lk = hlist_entry(hnode, struct ldlm_lock, l_exp_hash);
+
+	lk->l_remote_handle = *(struct lustre_handle *)key;
+}
+
+/* hs_keycmp: compare \a key with the remote handle of the hashed lock */
+static int ldlm_export_lock_keycmp(const void *key, struct hlist_node *hnode)
+{
+	return lustre_handle_equal(ldlm_export_lock_key(hnode), key);
+}
+
+/* hs_object: the hashed object is the lock that embeds \a hnode */
+static void *ldlm_export_lock_object(struct hlist_node *hnode)
+{
+	return hlist_entry(hnode, struct ldlm_lock, l_exp_hash);
+}
+
+/* hs_get: grab a reference on the hashed lock */
+static void
+ldlm_export_lock_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct ldlm_lock *lk = hlist_entry(hnode, struct ldlm_lock, l_exp_hash);
+
+	LDLM_LOCK_GET(lk);
+}
+
+/* hs_put / hs_put_locked: release a reference on the hashed lock */
+static void
+ldlm_export_lock_put(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct ldlm_lock *lk = hlist_entry(hnode, struct ldlm_lock, l_exp_hash);
+
+	LDLM_LOCK_RELEASE(lk);
+}
+
+static cfs_hash_ops_t ldlm_export_lock_ops = {
+	.hs_hash	= ldlm_export_lock_hash,
+	.hs_key		= ldlm_export_lock_key,
+	.hs_keycmp	= ldlm_export_lock_keycmp,
+	.hs_keycpy	= ldlm_export_lock_keycpy,
+	.hs_object	= ldlm_export_lock_object,
+	.hs_get		= ldlm_export_lock_get,
+	.hs_put		= ldlm_export_lock_put,
+	.hs_put_locked	= ldlm_export_lock_put,	/* same as hs_put */
+};
+
+/* Create the per-export hash of locks, named after the client UUID. */
+int ldlm_init_export(struct obd_export *exp)
+{
+	cfs_hash_t *hash;
+	ENTRY;
+
+	hash = cfs_hash_create(obd_uuid2str(&exp->exp_client_uuid),
+			       HASH_EXP_LOCK_CUR_BITS,
+			       HASH_EXP_LOCK_MAX_BITS,
+			       HASH_EXP_LOCK_BKT_BITS, 0,
+			       CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA,
+			       &ldlm_export_lock_ops,
+			       CFS_HASH_DEFAULT | CFS_HASH_REHASH_KEY |
+			       CFS_HASH_NBLK_CHANGE);
+	exp->exp_lock_hash = hash;
+	if (hash == NULL)
+		RETURN(-ENOMEM);
+	RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_init_export);
+
+void ldlm_destroy_export(struct obd_export *exp)
+{
+	ENTRY;
+	cfs_hash_putref(exp->exp_lock_hash);	/* set by ldlm_init_export() */
+	exp->exp_lock_hash = NULL;
+
+	ldlm_destroy_flock_export(exp);
+	EXIT;
+}
+EXPORT_SYMBOL(ldlm_destroy_export);
+
+static int ldlm_setup(void)
+{
+	static struct ptlrpc_service_conf	conf;
+	struct ldlm_bl_pool			*blp = NULL;
+	int rc = 0;
+	int i;
+	ENTRY;
+	/* One-time LDLM bring-up; any failure unwinds through ldlm_cleanup(). */
+	if (ldlm_state != NULL)
+		RETURN(-EALREADY);
+
+	OBD_ALLOC(ldlm_state, sizeof(*ldlm_state));
+	if (ldlm_state == NULL)
+		RETURN(-ENOMEM);
+
+#ifdef LPROCFS
+	rc = ldlm_proc_setup();
+	if (rc != 0)
+		GOTO(out, rc);
+#endif
+
+	memset(&conf, 0, sizeof(conf));
+	conf = (typeof(conf)) {
+		.psc_name		= "ldlm_cbd",
+		.psc_watchdog_factor	= 2,
+		.psc_buf		= {
+			.bc_nbufs		= LDLM_CLIENT_NBUFS,
+			.bc_buf_size		= LDLM_BUFSIZE,
+			.bc_req_max_size	= LDLM_MAXREQSIZE,
+			.bc_rep_max_size	= LDLM_MAXREPSIZE,
+			.bc_req_portal		= LDLM_CB_REQUEST_PORTAL,
+			.bc_rep_portal		= LDLM_CB_REPLY_PORTAL,
+		},
+		.psc_thr		= {
+			.tc_thr_name		= "ldlm_cb",
+			.tc_thr_factor		= LDLM_THR_FACTOR,
+			.tc_nthrs_init		= LDLM_NTHRS_INIT,
+			.tc_nthrs_base		= LDLM_NTHRS_BASE,
+			.tc_nthrs_max		= LDLM_NTHRS_MAX,
+			.tc_nthrs_user		= ldlm_num_threads,
+			.tc_cpu_affinity	= 1,
+			.tc_ctx_tags		= LCT_MD_THREAD | LCT_DT_THREAD,
+		},
+		.psc_cpt		= {
+			.cc_pattern		= ldlm_cpts,
+		},
+		.psc_ops		= {
+			.so_req_handler		= ldlm_callback_handler,
+		},
+	};
+	ldlm_state->ldlm_cb_service = \
+			ptlrpc_register_service(&conf, ldlm_svc_proc_dir);
+	if (IS_ERR(ldlm_state->ldlm_cb_service)) {
+		CERROR("failed to start service\n");
+		rc = PTR_ERR(ldlm_state->ldlm_cb_service);
+		ldlm_state->ldlm_cb_service = NULL;
+		GOTO(out, rc);
+	}
+
+	/* Set up the pool of blocking-callback (ldlm_bl) threads. */
+	OBD_ALLOC(blp, sizeof(*blp));
+	if (blp == NULL)
+		GOTO(out, rc = -ENOMEM);
+	ldlm_state->ldlm_bl_pool = blp;
+
+	spin_lock_init(&blp->blp_lock);
+	INIT_LIST_HEAD(&blp->blp_list);
+	INIT_LIST_HEAD(&blp->blp_prio_list);
+	init_waitqueue_head(&blp->blp_waitq);
+	atomic_set(&blp->blp_num_threads, 0);
+	atomic_set(&blp->blp_busy_threads, 0);
+
+	if (ldlm_num_threads == 0) {
+		blp->blp_min_threads = LDLM_NTHRS_INIT;
+		blp->blp_max_threads = LDLM_NTHRS_MAX;
+	} else {
+		blp->blp_min_threads = blp->blp_max_threads = \
+			min_t(int, LDLM_NTHRS_MAX, max_t(int, LDLM_NTHRS_INIT,
+							 ldlm_num_threads));
+	}
+
+	for (i = 0; i < blp->blp_min_threads; i++) {
+		rc = ldlm_bl_thread_start(blp);
+		if (rc < 0)
+			GOTO(out, rc);
+	}
+
+	/* The minimum number of threads is running; now set up the pools. */
+	rc = ldlm_pools_init();
+	if (rc) {
+		CERROR("Failed to initialize LDLM pools: %d\n", rc);
+		GOTO(out, rc);
+	}
+	RETURN(0);
+
+ out:
+	ldlm_cleanup();
+	RETURN(rc);
+}
+
+static int ldlm_cleanup(void)
+{
+	ENTRY;
+	/* Tear down what ldlm_setup() built; refuses while namespaces remain. */
+	if (!list_empty(ldlm_namespace_list(LDLM_NAMESPACE_SERVER)) ||
+	    !list_empty(ldlm_namespace_list(LDLM_NAMESPACE_CLIENT))) {
+		CERROR("ldlm still has namespaces; clean these up first.\n");
+		ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE);
+		ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE);
+		RETURN(-EBUSY);
+	}
+	/* NOTE(review): also run by ldlm_setup() failures before pools init */
+	ldlm_pools_fini();
+
+	if (ldlm_state->ldlm_bl_pool != NULL) {
+		struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool;
+
+		while (atomic_read(&blp->blp_num_threads) > 0) {
+			struct ldlm_bl_work_item blwi = { .blwi_ns = NULL };
+
+			init_completion(&blp->blp_comp);
+			/* the NULL-namespace item tells one thread to exit */
+			spin_lock(&blp->blp_lock);
+			list_add_tail(&blwi.blwi_entry, &blp->blp_list);
+			wake_up(&blp->blp_waitq);
+			spin_unlock(&blp->blp_lock);
+
+			wait_for_completion(&blp->blp_comp);
+		}
+
+		OBD_FREE(blp, sizeof(*blp));
+	}
+
+	if (ldlm_state->ldlm_cb_service != NULL)
+		ptlrpc_unregister_service(ldlm_state->ldlm_cb_service);
+
+	ldlm_proc_cleanup();
+
+
+	OBD_FREE(ldlm_state, sizeof(*ldlm_state));
+	ldlm_state = NULL;
+
+	RETURN(0);
+}
+
+int ldlm_init(void)
+{
+	mutex_init(&ldlm_ref_mutex);
+	mutex_init(ldlm_namespace_lock(LDLM_NAMESPACE_SERVER));
+	mutex_init(ldlm_namespace_lock(LDLM_NAMESPACE_CLIENT));
+	ldlm_resource_slab = kmem_cache_create("ldlm_resources",
+					       sizeof(struct ldlm_resource), 0,
+					       SLAB_HWCACHE_ALIGN, NULL);
+	if (!ldlm_resource_slab)
+		return -ENOMEM;
+
+	ldlm_lock_slab = kmem_cache_create("ldlm_locks",
+			      sizeof(struct ldlm_lock), 0,
+			      SLAB_HWCACHE_ALIGN | SLAB_DESTROY_BY_RCU, NULL);
+	if (!ldlm_lock_slab) {
+		kmem_cache_destroy(ldlm_resource_slab);	/* undo step one */
+		return -ENOMEM;
+	}
+
+	ldlm_interval_slab = kmem_cache_create("interval_node",
+					       sizeof(struct ldlm_interval), 0,
+					       SLAB_HWCACHE_ALIGN, NULL);
+	if (!ldlm_interval_slab) {
+		kmem_cache_destroy(ldlm_resource_slab);
+		kmem_cache_destroy(ldlm_lock_slab);
+		return -ENOMEM;
+	}
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+	class_export_dump_hook = ldlm_dump_export_locks;
+#endif
+	return 0;
+}
+
+void ldlm_exit(void)
+{
+	if (ldlm_refcount)
+		CERROR("ldlm_refcount is %d in ldlm_exit!\n", ldlm_refcount);
+	kmem_cache_destroy(ldlm_resource_slab);
+	/* ldlm_lock_put() uses RCU to call ldlm_lock_free(), so call
+	 * synchronize_rcu() to wait for a grace period to elapse, giving
+	 * ldlm_lock_free() a chance to run before the lock slab goes away. */
+	synchronize_rcu();
+	kmem_cache_destroy(ldlm_lock_slab);
+	kmem_cache_destroy(ldlm_interval_slab);
+}
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c b/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c
new file mode 100644
index 000000000000..ec29e28624fe
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c
@@ -0,0 +1,72 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_plain.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+/**
+ * This file contains implementation of PLAIN lock type.
+ *
+ * PLAIN locks are the simplest form of LDLM locking, and are used when
+ * there only needs to be a single lock on a resource. This avoids some
+ * of the complexity of EXTENT and IBITS lock types, but doesn't allow
+ * different "parts" of a resource to be locked concurrently.  Example
+ * use cases for PLAIN locks include locking of MGS configuration logs
+ * and (as of Lustre 2.4) quota records.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include <lustre_dlm.h>
+#include <obd_support.h>
+#include <lustre_lib.h>
+
+#include "ldlm_internal.h"
+
+
+void ldlm_plain_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+				     ldlm_policy_data_t *lpolicy)
+{
+	/* PLAIN locks have no policy: nothing to convert, \a lpolicy untouched */
+}
+
+void ldlm_plain_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+				     ldlm_wire_policy_data_t *wpolicy)
+{
+	/* PLAIN locks have no policy: nothing to convert, \a wpolicy untouched */
+}
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c b/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c
new file mode 100644
index 000000000000..0604295a493d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c
@@ -0,0 +1,1406 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_pool.c
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+/*
+ * Idea of this code is rather simple. Each second, for each server namespace
+ * we have SLV - server lock volume which is calculated on current number of
+ * granted locks, grant speed for past period, etc - that is, locking load.
+ * This SLV number may be thought as a flow definition for simplicity. It is
+ * sent to clients with each occasion to let them know what is current load
+ * situation on the server. By default, at the beginning, SLV on server is
+ * set max value which is calculated as the following: allow to one client
+ * have all locks of limit ->pl_limit for 10h.
+ *
+ * Next, on clients, number of cached locks is not limited artificially in any
+ * way as it was before. Instead, client calculates CLV, that is, client lock
+ * volume for each lock and compares it with last SLV from the server. CLV is
+ * calculated as the number of locks in LRU * lock live time in seconds. If
+ * CLV > SLV - lock is canceled.
+ *
+ * Client has LVF, that is, lock volume factor, which regulates how sensitive
+ * the client should be to the last SLV from server. The higher LVF is, the more
+ * locks will be canceled on client. Default value for it is 1. Setting LVF to 2
+ * means that client will cancel locks 2 times faster.
+ *
+ * Locks on a client will be canceled more intensively in these cases:
+ * (1) if SLV is smaller, that is, load is higher on the server;
+ * (2) client has a lot of locks (the more locks are held by client, the bigger
+ *     chances that some of them should be canceled);
+ * (3) client has old locks (taken some time ago);
+ *
+ * Thus, according to flow paradigm that we use for better understanding SLV,
+ * CLV is the volume of particle in flow described by SLV. According to this,
+ * if flow is getting thinner, more and more particles become outside of it and
+ * as particles are locks, they should be canceled.
+ *
+ * General idea of this belongs to Vitaly Fertman (vitaly@clusterfs.com). Andreas
+ * Dilger (adilger@clusterfs.com) proposed few nice ideas like using LVF and many
+ * cleanups. Flow definition to allow more easy understanding of the logic belongs
+ * to Nikita Danilov (nikita@clusterfs.com) as well as many cleanups and fixes.
+ * And design and implementation are done by Yury Umanets (umka@clusterfs.com).
+ *
+ * Glossary for terms used:
+ *
+ * pl_limit - Number of allowed locks in pool. Applies to server and client
+ * side (tunable);
+ *
+ * pl_granted - Number of granted locks (calculated);
+ * pl_grant_rate - Number of granted locks for last T (calculated);
+ * pl_cancel_rate - Number of canceled locks for last T (calculated);
+ * pl_grant_speed - Grant speed (GR - CR) for last T (calculated);
+ * pl_grant_plan - Planned number of granted locks for next T (calculated);
+ * pl_server_lock_volume - Current server lock volume (calculated);
+ *
+ * As it may be seen from list above, we have few possible tunables which may
+ * affect behavior much. They all may be modified via proc. However, they also
+ * give a possibility for constructing few pre-defined behavior policies. If
+ * none of predefines is suitable for a working pattern being used, new one may
+ * be "constructed" via proc tunables.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+# include <lustre_dlm.h>
+
+#include <cl_object.h>
+
+#include <obd_class.h>
+#include <obd_support.h>
+#include "ldlm_internal.h"
+
+
+/*
+ * Default server pool limit: 50 ldlm locks for each 1MB of RAM.
+ */
+#define LDLM_POOL_HOST_L ((NUM_CACHEPAGES >> (20 - PAGE_CACHE_SHIFT)) * 50)
+
+/*
+ * Maximal possible grant step plan in %.
+ */
+#define LDLM_POOL_MAX_GSP (30)
+
+/*
+ * Minimal possible grant step plan in %.
+ */
+#define LDLM_POOL_MIN_GSP (1)
+
+/*
+ * This controls the speed of reaching LDLM_POOL_MAX_GSP
+ * with increasing thread period, see ldlm_pool_t2gsp().
+ */
+#define LDLM_POOL_GSP_STEP_SHIFT (2)
+
+/*
+ * LDLM_POOL_MAX_GSP% of \a L locks is the default grant plan (GP).
+ */
+#define LDLM_POOL_GP(L)   (((L) * LDLM_POOL_MAX_GSP) / 100)
+
+/*
+ * Max age for locks on clients, in seconds (10 hours).
+ */
+#define LDLM_POOL_MAX_AGE (36000)
+
+/*
+ * The granularity of SLV calculation: fixed-point shift of the SLV factor.
+ */
+#define LDLM_POOL_SLV_SHIFT (10)
+
+extern proc_dir_entry_t *ldlm_ns_proc_dir;
+
+static inline __u64 dru(__u64 val, __u32 shift, int round_up)
+{	/* val / 2^shift, rounded up on request; 1ULL keeps the mask 64-bit */
+	return (val + (round_up ? (1ULL << shift) - 1 : 0)) >> shift;
+}
+
+static inline __u64 ldlm_pool_slv_max(__u32 L)
+{
+	/*
+	 * Upper bound for SLV: one client may hold all locks allowed by
+	 * limit \a L for 10 hrs (LDLM_POOL_MAX_AGE), that is
+	 * limit * 10h / 1 client. \a L is widened first, 64-bit product.
+	 */
+	return ((__u64)L * LDLM_POOL_MAX_AGE) / 1;
+}
+
+static inline __u64 ldlm_pool_slv_min(__u32 L)
+{
+	return 1;	/* lower bound for SLV; limit \a L is not used */
+}
+
+enum {					/* indices of the pl_stats counters */
+	LDLM_POOL_FIRST_STAT = 0,
+	LDLM_POOL_GRANTED_STAT = LDLM_POOL_FIRST_STAT,	/* "granted" */
+	LDLM_POOL_GRANT_STAT,		/* "grant" */
+	LDLM_POOL_CANCEL_STAT,		/* "cancel" */
+	LDLM_POOL_GRANT_RATE_STAT,	/* "grant_rate" */
+	LDLM_POOL_CANCEL_RATE_STAT,	/* "cancel_rate" */
+	LDLM_POOL_GRANT_PLAN_STAT,	/* "grant_plan" */
+	LDLM_POOL_SLV_STAT,		/* "slv" */
+	LDLM_POOL_SHRINK_REQTD_STAT,	/* "shrink_request" */
+	LDLM_POOL_SHRINK_FREED_STAT,	/* "shrink_freed" */
+	LDLM_POOL_RECALC_STAT,		/* "recalc_freed" */
+	LDLM_POOL_TIMING_STAT,		/* "recalc_timing" */
+	LDLM_POOL_LAST_STAT		/* number of counters */
+};
+
+static inline struct ldlm_namespace *ldlm_pl2ns(struct ldlm_pool *pl)
+{	/* \a pl is the ns_pool member embedded in its namespace */
+	return container_of(pl, struct ldlm_namespace, ns_pool);
+}
+
+/**
+ * Calculates suggested grant_step in % of available locks for passed
+ * period \a t. This is later used in grant_plan calculations.
+ */
+static inline int ldlm_pool_t2gsp(unsigned int t)
+{
+	const int range = LDLM_POOL_MAX_GSP - LDLM_POOL_MIN_GSP;
+	const unsigned int halvings = t >> LDLM_POOL_GSP_STEP_SHIFT;
+
+	/*
+	 * Starting from LDLM_POOL_MIN_GSP, the distance to LDLM_POOL_MAX_GSP
+	 * is halved every (1 << LDLM_POOL_GSP_STEP_SHIFT) seconds of period:
+	 * - for thread period 1s we will have grant_step 1%, which is good
+	 * for taking load off the server: clients cannot take lots of new
+	 * locks in a short time while keeping all old ones cached, they
+	 * always have to give some locks back to get new ones;
+	 *
+	 * - for thread period 10s we will have 23%, so clients have enough
+	 * room to take new locks without returning any. Whatever part of
+	 * this 23% is left unused in the current period makes SLV grow,
+	 * i.e. more locks get cached on clients until limit or grant plan
+	 * is reached.
+	 *
+	 * Shifting an int by its width or more is undefined, so a very
+	 * long period is handled explicitly: the step is then the maximum.
+	 */
+	if (halvings >= 8 * sizeof(range))
+		return LDLM_POOL_MAX_GSP;
+	return LDLM_POOL_MAX_GSP - (range >> halvings);
+}
+
+/**
+ * Recalculates the grant plan of \a pl for the next period: locks
+ * granted so far plus grant_step percent of the room below the limit.
+ *
+ * \pre ->pl_lock is locked.
+ */
+static void ldlm_pool_recalc_grant_plan(struct ldlm_pool *pl)
+{
+	int limit = ldlm_pool_get_limit(pl);
+	int granted = atomic_read(&pl->pl_granted);
+	int percent = ldlm_pool_t2gsp(pl->pl_recalc_period);
+	int plan = granted + ((limit - granted) * percent) / 100;
+	int ceiling = (limit * 5) >> 2;
+
+	/* Cap the plan at (limit * 5) >> 2, i.e. 125% of the limit. */
+	if (plan > ceiling)
+		plan = ceiling;
+	pl->pl_grant_plan = plan;
+}
+
+/**
+ * Recalculates next SLV on passed \a pl from the grant plan usage.
+ *
+ * \pre ->pl_lock is locked.
+ */
+static void ldlm_pool_recalc_slv(struct ldlm_pool *pl)
+{
+	int granted;
+	int grant_plan;
+	int round_up;
+	__u64 slv;
+	__u64 slv_factor;
+	__u64 grant_usage;
+	__u32 limit;
+
+	slv = pl->pl_server_lock_volume;
+	grant_plan = pl->pl_grant_plan;
+	limit = ldlm_pool_get_limit(pl);
+	granted = atomic_read(&pl->pl_granted);
+	round_up = granted < limit;	/* compared as unsigned: limit is __u32 */
+
+	grant_usage = max_t(int, limit - (granted - grant_plan), 1);
+
+	/*
+	 * SLV change factor is grant usage relative to limit, kept as a
+	 * fixed-point value with LDLM_POOL_SLV_SHIFT fractional bits.
+	 * The more of the grant plan was left unconsumed by clients in
+	 * the last interval (idle time), the faster SLV grows; the more
+	 * the plan was over-consumed (load time), the faster SLV drops.
+	 * grant_usage is clamped to at least 1 above.
+	 */
+	slv_factor = (grant_usage << LDLM_POOL_SLV_SHIFT);
+	do_div(slv_factor, limit);	/* NOTE(review): limit must not be 0 */
+	slv = slv * slv_factor;
+	slv = dru(slv, LDLM_POOL_SLV_SHIFT, round_up);
+
+	if (slv > ldlm_pool_slv_max(limit)) {
+		slv = ldlm_pool_slv_max(limit);
+	} else if (slv < ldlm_pool_slv_min(limit)) {
+		slv = ldlm_pool_slv_min(limit);
+	}
+
+	pl->pl_server_lock_volume = slv;
+}
+
+/**
+ * Accounts current SLV, granted locks, grant plan and both rates of
+ * \a pl in its proc statistics.
+ *
+ * \pre ->pl_lock is locked.
+ */
+static void ldlm_pool_recalc_stats(struct ldlm_pool *pl)
+{
+	int plan, granted, grants, cancels;
+	__u64 volume;
+
+	/* Snapshot everything first, then feed the counters. */
+	plan = pl->pl_grant_plan;
+	volume = pl->pl_server_lock_volume;
+	granted = atomic_read(&pl->pl_granted);
+	grants = atomic_read(&pl->pl_grant_rate);
+	cancels = atomic_read(&pl->pl_cancel_rate);
+
+	lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SLV_STAT, volume);
+	lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANTED_STAT, granted);
+	lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT, grants);
+	lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT, plan);
+	lprocfs_counter_add(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT, cancels);
+}
+
+/**
+ * Copies the current SLV of \a pl into the obd of its namespace,
+ * i.e. ldlm_pl2ns(pl)->ns_obd.
+ */
+static void ldlm_srv_pool_push_slv(struct ldlm_pool *pl)
+{
+	struct obd_device *obd = ldlm_pl2ns(pl)->ns_obd;
+
+	/*
+	 * The obd keeps a copy of SLV so that it can be used later without
+	 * accessing the pool. This avoids a race between sending a reply
+	 * with the new SLV to a client and server stack cleanup, where the
+	 * namespace is not guaranteed to be alive any more; the obd is
+	 * known to live as long as a valid export does.
+	 */
+	LASSERT(obd != NULL);
+	write_lock(&obd->obd_pool_lock);
+	obd->obd_pool_slv = pl->pl_server_lock_volume;
+	write_unlock(&obd->obd_pool_lock);
+}
+
+/**
+ * Recalculates all pool fields on passed \a pl once its recalc period
+ * has expired; does nothing before that.
+ *
+ * \pre ->pl_lock is not locked.
+ */
+static int ldlm_srv_pool_recalc(struct ldlm_pool *pl)
+{
+	time_t elapsed;
+	ENTRY;
+
+	/* Cheap unlocked check first, repeated under the lock below. */
+	elapsed = cfs_time_current_sec() - pl->pl_recalc_time;
+	if (elapsed < pl->pl_recalc_period)
+		RETURN(0);
+
+	spin_lock(&pl->pl_lock);
+	elapsed = cfs_time_current_sec() - pl->pl_recalc_time;
+	if (elapsed >= pl->pl_recalc_period) {
+		/*
+		 * Order matters: SLV for the period that has just ended
+		 * must be recalculated _before_ the new grant plan.
+		 */
+		ldlm_pool_recalc_slv(pl);
+
+		/*
+		 * Let obd know about the SLV change.
+		 */
+		ldlm_srv_pool_push_slv(pl);
+
+		/*
+		 * Grant plan for the period that starts now.
+		 */
+		ldlm_pool_recalc_grant_plan(pl);
+
+		pl->pl_recalc_time = cfs_time_current_sec();
+		lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT,
+				    elapsed);
+	}
+	spin_unlock(&pl->pl_lock);
+	RETURN(0);
+}
+
+/**
+ * This function is used on server side as main entry point for memory
+ * pressure handling. It decreases SLV on \a pl by \a nr; with \a nr == 0
+ * it only reports the number of granted locks. \a gfp_mask is not used.
+ *
+ * Our goal here is to decrease SLV in such a way that clients hold \a nr
+ * fewer locks in next 10h.
+ */
+static int ldlm_srv_pool_shrink(struct ldlm_pool *pl,
+				int nr, unsigned int gfp_mask)
+{
+	__u32 limit;
+
+	/*
+	 * VM is asking how many entries may be potentially freed.
+	 */
+	if (nr == 0)
+		return atomic_read(&pl->pl_granted);
+
+	/*
+	 * Client already canceled locks but server is already in shrinker
+	 * and can't cancel anything. Let's catch this race.
+	 */
+	if (atomic_read(&pl->pl_granted) == 0)
+		return 0;
+
+	spin_lock(&pl->pl_lock);
+
+	/*
+	 * We want shrinker to possibly cause cancellation of @nr locks from
+	 * clients or grant approximately @nr locks smaller next intervals.
+	 *
+	 * This is why we decrease SLV by @nr. This effect will only be as
+	 * long as one re-calc interval (1s these days) and this should be
+	 * enough to pass this decreased SLV to all clients. On next recalc
+	 * interval pool will either increase SLV if locks load is not high
+	 * or will keep on same level or even decrease again, thus, shrinker
+	 * decreased SLV will affect next recalc intervals and this way will
+	 * make locking load lower.
+	 */
+	if (nr < pl->pl_server_lock_volume) {
+		pl->pl_server_lock_volume = pl->pl_server_lock_volume - nr;
+	} else {
+		limit = ldlm_pool_get_limit(pl);
+		pl->pl_server_lock_volume = ldlm_pool_slv_min(limit);
+	}
+
+	/*
+	 * Make sure that pool informed obd of last SLV changes.
+	 */
+	ldlm_srv_pool_push_slv(pl);
+	spin_unlock(&pl->pl_lock);
+
+	/*
+	 * We did not really free any memory here so far, it will only be
+	 * freed later, maybe, so we return 0 to not confuse VM.
+	 */
+	return 0;
+}
+
+/**
+ * Server side pool setup: stores \a limit both in the obd of the
+ * namespace owning \a pl and in the pool itself. Always returns 0.
+ */
+static int ldlm_srv_pool_setup(struct ldlm_pool *pl, int limit)
+{
+	struct obd_device *obd = ldlm_pl2ns(pl)->ns_obd;
+
+	LASSERT(obd != NULL && obd != LP_POISON);
+	LASSERT(obd->obd_type != LP_POISON);
+	/* obd copy first, under its lock; then the pool copy */
+	write_lock(&obd->obd_pool_lock);
+	obd->obd_pool_limit = limit;
+	write_unlock(&obd->obd_pool_lock);
+	ldlm_pool_set_limit(pl, limit);
+	return 0;
+}
+
+/**
+ * Sets SLV and Limit from ldlm_pl2ns(pl)->ns_obd to passed \a pl.
+ */
+static void ldlm_cli_pool_pop_slv(struct ldlm_pool *pl)
+{
+	struct obd_device *obd = ldlm_pl2ns(pl)->ns_obd;
+
+	LASSERT(obd != NULL);
+
+	/*
+	 * The obd holds the SLV and Limit last received with incoming
+	 * RPCs; copy both into the pool under the obd pool lock.
+	 */
+	read_lock(&obd->obd_pool_lock);
+	pl->pl_server_lock_volume = obd->obd_pool_slv;
+	ldlm_pool_set_limit(pl, obd->obd_pool_limit);
+	read_unlock(&obd->obd_pool_lock);
+}
+
+/**
+ * Recalculates client side pool \a pl according to current SLV and Limit.
+ */
+static int ldlm_cli_pool_recalc(struct ldlm_pool *pl)
+{
+	struct ldlm_namespace *ns = ldlm_pl2ns(pl);
+	time_t elapsed;
+	ENTRY;
+
+	/*
+	 * Nothing to do until the recalc period is over. Checked without
+	 * the lock first and once more under it.
+	 */
+	elapsed = cfs_time_current_sec() - pl->pl_recalc_time;
+	if (elapsed < pl->pl_recalc_period)
+		RETURN(0);
+
+	spin_lock(&pl->pl_lock);
+	elapsed = cfs_time_current_sec() - pl->pl_recalc_time;
+	if (elapsed < pl->pl_recalc_period) {
+		spin_unlock(&pl->pl_lock);
+		RETURN(0);
+	}
+
+	/*
+	 * Fetch the last SLV and Limit from obd into the pool.
+	 */
+	ldlm_cli_pool_pop_slv(pl);
+
+	pl->pl_recalc_time = cfs_time_current_sec();
+	lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT, elapsed);
+	spin_unlock(&pl->pl_lock);
+
+	/*
+	 * With lru resize disabled for this ns no locks are canceled.
+	 */
+	if (!ns_connect_lru_resize(ns))
+		RETURN(0);
+
+	/*
+	 * Canceling on the client needs no sharp timing: locks are simply
+	 * canceled asap according to the new SLV. This may be called when
+	 * SLV has changed much, which is why pl->pl_recalc_time is not
+	 * taken into account here.
+	 */
+	RETURN(ldlm_cancel_lru(ns, 0, LCF_ASYNC, LDLM_CANCEL_LRUR));
+}
+
+/**
+ * This function is main entry point for memory pressure handling on client
+ * side.  Main goal of this function is to cancel some number of locks on
+ * passed \a pl according to \a nr and \a gfp_mask. With \a nr == 0 nothing
+ * is canceled and only the reclaimable estimate is returned.
+ */
+static int ldlm_cli_pool_shrink(struct ldlm_pool *pl,
+				int nr, unsigned int gfp_mask)
+{
+	struct ldlm_namespace *ns = ldlm_pl2ns(pl);
+	int canceled = 0, unused;
+
+	/*
+	 * Do not cancel locks in case lru resize is disabled for this ns.
+	 * Plain return: there is no ENTRY here for RETURN() to pair with.
+	 */
+	if (!ns_connect_lru_resize(ns))
+		return 0;
+
+	/*
+	 * Make sure that pool knows last SLV and Limit from obd.
+	 */
+	ldlm_cli_pool_pop_slv(pl);
+
+	spin_lock(&ns->ns_lock);
+	unused = ns->ns_nr_unused;
+	spin_unlock(&ns->ns_lock);
+
+	if (nr) {
+		canceled = ldlm_cancel_lru(ns, nr, LCF_ASYNC,
+					   LDLM_CANCEL_SHRINK);
+	}
+	/*
+	 * Return the number of potentially reclaimable locks.
+	 */
+	return ((unused - canceled) / 100) * sysctl_vfs_cache_pressure;
+}
+
+struct ldlm_pool_ops ldlm_srv_pool_ops = {	/* callbacks of server pools */
+	.po_recalc = ldlm_srv_pool_recalc,
+	.po_shrink = ldlm_srv_pool_shrink,
+	.po_setup  = ldlm_srv_pool_setup
+};
+
+struct ldlm_pool_ops ldlm_cli_pool_ops = {	/* client pools: no po_setup */
+	.po_recalc = ldlm_cli_pool_recalc,
+	.po_shrink = ldlm_cli_pool_shrink
+};
+
+/**
+ * Pool recalc wrapper. Refreshes the statistics of \a pl and then calls
+ * either client or server pool recalc callback depending what pool
+ * \a pl is used. Returns the callback result, 0 if there is none.
+ */
+int ldlm_pool_recalc(struct ldlm_pool *pl)
+{
+	time_t elapsed;
+	int result;
+
+	elapsed = cfs_time_current_sec() - pl->pl_recalc_time;
+	if (elapsed > 0) {
+		spin_lock(&pl->pl_lock);
+		elapsed = cfs_time_current_sec() - pl->pl_recalc_time;
+		if (elapsed > 0) {
+			/*
+			 * Update pool statistics every 1s.
+			 */
+			ldlm_pool_recalc_stats(pl);
+
+			/*
+			 * Zero out all rates and speed for the last period.
+			 */
+			atomic_set(&pl->pl_grant_rate, 0);
+			atomic_set(&pl->pl_cancel_rate, 0);
+		}
+		spin_unlock(&pl->pl_lock);
+	}
+
+	/*
+	 * Pools without a recalc callback have nothing more to do.
+	 */
+	if (pl->pl_ops->po_recalc == NULL)
+		return 0;
+
+	result = pl->pl_ops->po_recalc(pl);
+	lprocfs_counter_add(pl->pl_stats, LDLM_POOL_RECALC_STAT, result);
+	return result;
+}
+EXPORT_SYMBOL(ldlm_pool_recalc);
+
+/**
+ * Pool shrink wrapper. Will call either client or server pool shrink
+ * callback depending what pool \a pl is used; returns 0 if there is none.
+ */
+int ldlm_pool_shrink(struct ldlm_pool *pl, int nr,
+		     unsigned int gfp_mask)
+{
+	int cancel;
+
+	if (pl->pl_ops->po_shrink == NULL)
+		return 0;
+
+	cancel = pl->pl_ops->po_shrink(pl, nr, gfp_mask);
+	if (nr <= 0)
+		return cancel;
+
+	/* Only real shrink requests (nr > 0) are accounted and traced. */
+	lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SHRINK_REQTD_STAT, nr);
+	lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SHRINK_FREED_STAT,
+			    cancel);
+	CDEBUG(D_DLMTRACE, "%s: request to shrink %d locks, "
+	       "shrunk %d\n", pl->pl_name, nr, cancel);
+	return cancel;
+}
+EXPORT_SYMBOL(ldlm_pool_shrink);
+
+/**
+ * Pool setup wrapper. Will call either client or server pool setup
+ * callback depending what pool \a pl is used.
+ *
+ * Sets passed \a limit into pool \a pl; a pool without po_setup
+ * callback is left untouched and 0 is returned.
+ */
+int ldlm_pool_setup(struct ldlm_pool *pl, int limit)
+{
+	return pl->pl_ops->po_setup == NULL ? 0 :
+		pl->pl_ops->po_setup(pl, limit);
+}
+EXPORT_SYMBOL(ldlm_pool_setup);
+
+/**
+ * proc read handler: prints the state of pool \a data into \a page (SLV,
+ * CLV, LVF, rates, granted and limit; GSP and GP for server pools only).
+ */
+static int lprocfs_rd_pool_state(char *page, char **start, off_t off,
+				 int count, int *eof, void *data)
+{
+	struct ldlm_pool *pl = data;
+	int granted, grant_rate, cancel_rate, grant_speed;
+	int grant_step, grant_plan, lvf, nr = 0;
+	__u64 slv, clv;
+	__u32 limit;
+
+	/* Snapshot all values under the pool lock, print them afterwards. */
+	spin_lock(&pl->pl_lock);
+	slv = pl->pl_server_lock_volume;
+	clv = pl->pl_client_lock_volume;
+	limit = ldlm_pool_get_limit(pl);
+	grant_plan = pl->pl_grant_plan;
+	granted = atomic_read(&pl->pl_granted);
+	grant_rate = atomic_read(&pl->pl_grant_rate);
+	cancel_rate = atomic_read(&pl->pl_cancel_rate);
+	grant_speed = grant_rate - cancel_rate;
+	lvf = atomic_read(&pl->pl_lock_volume_factor);
+	grant_step = ldlm_pool_t2gsp(pl->pl_recalc_period);
+	spin_unlock(&pl->pl_lock);
+
+	nr += snprintf(page + nr, count - nr, "LDLM pool state (%s):\n",
+		       pl->pl_name);
+	nr += snprintf(page + nr, count - nr, "  SLV: "LPU64"\n", slv);
+	nr += snprintf(page + nr, count - nr, "  CLV: "LPU64"\n", clv);
+	nr += snprintf(page + nr, count - nr, "  LVF: %d\n", lvf);
+
+	if (ns_is_server(ldlm_pl2ns(pl))) {
+		nr += snprintf(page + nr, count - nr, "  GSP: %d%%\n",
+			       grant_step);
+		nr += snprintf(page + nr, count - nr, "  GP:  %d\n",
+			       grant_plan);
+	}
+	nr += snprintf(page + nr, count - nr, "  GR:  %d\n", grant_rate);
+	nr += snprintf(page + nr, count - nr, "  CR:  %d\n", cancel_rate);
+	nr += snprintf(page + nr, count - nr, "  GS:  %d\n", grant_speed);
+	nr += snprintf(page + nr, count - nr, "  G:   %d\n", granted);
+	nr += snprintf(page + nr, count - nr, "  L:   %d\n", limit);
+	return nr;
+}
+
+static int lprocfs_rd_grant_speed(char *page, char **start, off_t off,
+				  int count, int *eof, void *data)
+{
+	struct ldlm_pool *pl = data;
+	int speed;
+
+	/* speed = grants - cancels; serialized with ldlm_pool_recalc() */
+	spin_lock(&pl->pl_lock);
+	speed = atomic_read(&pl->pl_grant_rate);
+	speed -= atomic_read(&pl->pl_cancel_rate);
+	spin_unlock(&pl->pl_lock);
+	return lprocfs_rd_uint(page, start, off, count, eof, &speed);
+}
+
+LDLM_POOL_PROC_READER(grant_plan, int);	/* presumably lprocfs_rd_grant_plan() */
+LDLM_POOL_PROC_READER(recalc_period, int);	/* presumably lprocfs_rd_recalc_period() */
+LDLM_POOL_PROC_WRITER(recalc_period, int);	/* presumably lprocfs_wr_recalc_period() */
+
+/**
+ * Registers the "pool" proc directory, tunables and statistics of \a pl.
+ */
+static int ldlm_pool_proc_init(struct ldlm_pool *pl)
+{
+	const unsigned int cntr_flags = LPROCFS_CNTR_AVGMINMAX |
+					LPROCFS_CNTR_STDDEV;
+	struct ldlm_namespace *ns = ldlm_pl2ns(pl);
+	struct proc_dir_entry *parent_ns_proc;
+	struct lprocfs_vars pool_vars[2];
+	char *var_name = NULL;
+	int rc = 0;
+	ENTRY;
+
+	OBD_ALLOC(var_name, MAX_STRING_SIZE + 1);
+	if (!var_name)
+		RETURN(-ENOMEM);
+
+	parent_ns_proc = lprocfs_srch(ldlm_ns_proc_dir, ldlm_ns_name(ns));
+	if (parent_ns_proc == NULL) {
+		CERROR("%s: proc entry is not initialized\n",
+		       ldlm_ns_name(ns));
+		GOTO(out_free_name, rc = -EINVAL);
+	}
+	pl->pl_proc_dir = lprocfs_register("pool", parent_ns_proc, NULL, NULL);
+	if (IS_ERR(pl->pl_proc_dir)) {
+		CERROR("LProcFS failed in ldlm-pool-init\n");
+		rc = PTR_ERR(pl->pl_proc_dir);
+		pl->pl_proc_dir = NULL;	/* no ERR_PTR for ldlm_pool_proc_fini() */
+		GOTO(out_free_name, rc);
+	}
+
+	var_name[MAX_STRING_SIZE] = '\0';
+	memset(pool_vars, 0, sizeof(pool_vars));
+	pool_vars[0].name = var_name;
+
+	/* pool_vars[0] is reused: read-only entries must clear write_fptr */
+	snprintf(var_name, MAX_STRING_SIZE, "server_lock_volume");
+	pool_vars[0].data = &pl->pl_server_lock_volume;
+	pool_vars[0].read_fptr = lprocfs_rd_u64;
+	lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
+
+	snprintf(var_name, MAX_STRING_SIZE, "limit");
+	pool_vars[0].data = &pl->pl_limit;
+	pool_vars[0].read_fptr = lprocfs_rd_atomic;
+	pool_vars[0].write_fptr = lprocfs_wr_atomic;
+	lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
+
+	snprintf(var_name, MAX_STRING_SIZE, "granted");
+	pool_vars[0].data = &pl->pl_granted;
+	pool_vars[0].read_fptr = lprocfs_rd_atomic;
+	pool_vars[0].write_fptr = NULL;
+	lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
+
+	snprintf(var_name, MAX_STRING_SIZE, "grant_speed");
+	pool_vars[0].data = pl;
+	pool_vars[0].read_fptr = lprocfs_rd_grant_speed;
+	pool_vars[0].write_fptr = NULL;
+	lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
+
+	snprintf(var_name, MAX_STRING_SIZE, "cancel_rate");
+	pool_vars[0].data = &pl->pl_cancel_rate;
+	pool_vars[0].read_fptr = lprocfs_rd_atomic;
+	pool_vars[0].write_fptr = NULL;
+	lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
+
+	snprintf(var_name, MAX_STRING_SIZE, "grant_rate");
+	pool_vars[0].data = &pl->pl_grant_rate;
+	pool_vars[0].read_fptr = lprocfs_rd_atomic;
+	pool_vars[0].write_fptr = NULL;
+	lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
+
+	snprintf(var_name, MAX_STRING_SIZE, "grant_plan");
+	pool_vars[0].data = pl;
+	pool_vars[0].read_fptr = lprocfs_rd_grant_plan;
+	pool_vars[0].write_fptr = NULL;
+	lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
+
+	snprintf(var_name, MAX_STRING_SIZE, "recalc_period");
+	pool_vars[0].data = pl;
+	pool_vars[0].read_fptr = lprocfs_rd_recalc_period;
+	pool_vars[0].write_fptr = lprocfs_wr_recalc_period;
+	lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
+
+	snprintf(var_name, MAX_STRING_SIZE, "lock_volume_factor");
+	pool_vars[0].data = &pl->pl_lock_volume_factor;
+	pool_vars[0].read_fptr = lprocfs_rd_atomic;
+	pool_vars[0].write_fptr = lprocfs_wr_atomic;
+	lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
+
+	snprintf(var_name, MAX_STRING_SIZE, "state");
+	pool_vars[0].data = pl;
+	pool_vars[0].read_fptr = lprocfs_rd_pool_state;
+	pool_vars[0].write_fptr = NULL;
+	lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
+
+	pl->pl_stats = lprocfs_alloc_stats(LDLM_POOL_LAST_STAT -
+					   LDLM_POOL_FIRST_STAT, 0);
+	if (!pl->pl_stats)
+		GOTO(out_free_name, rc = -ENOMEM);
+
+	lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANTED_STAT,
+			     cntr_flags, "granted", "locks");
+	lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_STAT,
+			     cntr_flags, "grant", "locks");
+	lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_STAT,
+			     cntr_flags, "cancel", "locks");
+	lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT,
+			     cntr_flags, "grant_rate", "locks/s");
+	lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT,
+			     cntr_flags, "cancel_rate", "locks/s");
+	lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT,
+			     cntr_flags, "grant_plan", "locks/s");
+	lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SLV_STAT,
+			     cntr_flags, "slv", "slv");
+	lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_REQTD_STAT,
+			     cntr_flags, "shrink_request", "locks");
+	lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_FREED_STAT,
+			     cntr_flags, "shrink_freed", "locks");
+	lprocfs_counter_init(pl->pl_stats, LDLM_POOL_RECALC_STAT,
+			     cntr_flags, "recalc_freed", "locks");
+	lprocfs_counter_init(pl->pl_stats, LDLM_POOL_TIMING_STAT,
+			     cntr_flags, "recalc_timing", "sec");
+	rc = lprocfs_register_stats(pl->pl_proc_dir, "stats", pl->pl_stats);
+
+	EXIT;
+out_free_name:
+	OBD_FREE(var_name, MAX_STRING_SIZE + 1);
+	return rc;
+}
+
+static void ldlm_pool_proc_fini(struct ldlm_pool *pl)
+{
+	if (pl->pl_stats) {		/* counters, if they were allocated */
+		lprocfs_free_stats(&pl->pl_stats);
+		pl->pl_stats = NULL;
+	}
+	if (pl->pl_proc_dir) {		/* proc directory of the pool */
+		lprocfs_remove(&pl->pl_proc_dir);
+		pl->pl_proc_dir = NULL;
+	}
+}
+
+/**
+ * Initializes pool \a pl of namespace \a ns as a client or server pool
+ * and registers its proc entries. Returns 0 or the proc init error.
+ */
+int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns,
+		   int idx, ldlm_side_t client)
+{
+	int rc;
+	ENTRY;
+
+	spin_lock_init(&pl->pl_lock);
+	pl->pl_recalc_time = cfs_time_current_sec();
+	atomic_set(&pl->pl_granted, 0);
+	atomic_set(&pl->pl_grant_rate, 0);
+	atomic_set(&pl->pl_cancel_rate, 0);
+	atomic_set(&pl->pl_lock_volume_factor, 1);
+	pl->pl_grant_plan = LDLM_POOL_GP(LDLM_POOL_HOST_L);
+	pl->pl_client_lock_volume = 0;
+	snprintf(pl->pl_name, sizeof(pl->pl_name), "ldlm-pool-%s-%d",
+		 ldlm_ns_name(ns), idx);
+
+	if (client != LDLM_NAMESPACE_SERVER) {
+		pl->pl_ops = &ldlm_cli_pool_ops;
+		pl->pl_recalc_period = LDLM_POOL_CLI_DEF_RECALC_PERIOD;
+		ldlm_pool_set_limit(pl, 1);
+		pl->pl_server_lock_volume = 0;
+	} else {
+		pl->pl_ops = &ldlm_srv_pool_ops;
+		pl->pl_recalc_period = LDLM_POOL_SRV_DEF_RECALC_PERIOD;
+		ldlm_pool_set_limit(pl, LDLM_POOL_HOST_L);
+		pl->pl_server_lock_volume = ldlm_pool_slv_max(LDLM_POOL_HOST_L);
+	}
+	rc = ldlm_pool_proc_init(pl);
+	if (rc == 0)
+		CDEBUG(D_DLMTRACE, "Lock pool %s is initialized\n",
+		       pl->pl_name);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_pool_init);
+
+void ldlm_pool_fini(struct ldlm_pool *pl)
+{
+	ENTRY;
+	ldlm_pool_proc_fini(pl);
+
+	/*
+	 * Pool must not be used after this point. It is embedded in struct
+	 * ldlm_namespace and cannot be freed here, so it is poisoned to
+	 * catch any later use.
+	 */
+	POISON(pl, 0x5a, sizeof(*pl));
+	EXIT;
+}
+EXPORT_SYMBOL(ldlm_pool_fini);
+
+/**
+ * Add new taken ldlm lock \a lock into pool \a pl accounting.
+ */
+void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock)
+{
+	/*
+	 * FLOCK locks are not tracked: they are almost never cancelled
+	 * (a special kind of lock is used to drop them) and there is no
+	 * LRU for them either.
+	 */
+	if (lock->l_resource->lr_type == LDLM_FLOCK)
+		return;
+
+	atomic_inc(&pl->pl_granted);
+	atomic_inc(&pl->pl_grant_rate);
+	lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_GRANT_STAT);
+	/*
+	 * Recalc is done for server pools only. On client side all locks
+	 * which potentially may be canceled have already been packed into
+	 * the enqueue/cancel rpc, and we also do not want to run out of
+	 * stack with too long call paths.
+	 */
+	if (!ns_is_server(ldlm_pl2ns(pl)))
+		return;
+	ldlm_pool_recalc(pl);
+}
+EXPORT_SYMBOL(ldlm_pool_add);
+
+/**
+ * Remove ldlm lock \a lock from pool \a pl accounting.
+ */
+void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock)
+{
+	/*
+	 * FLOCK locks are never accounted, see ldlm_pool_add().
+	 */
+	if (lock->l_resource->lr_type != LDLM_FLOCK) {
+		LASSERT(atomic_read(&pl->pl_granted) > 0);
+		atomic_dec(&pl->pl_granted);
+		atomic_inc(&pl->pl_cancel_rate);
+
+		lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_CANCEL_STAT);
+
+		/* only server pools are recalculated from here */
+		if (ns_is_server(ldlm_pl2ns(pl)))
+			ldlm_pool_recalc(pl);
+	}
+}
+EXPORT_SYMBOL(ldlm_pool_del);
+
+/**
+ * Returns current \a pl SLV, read under ->pl_lock.
+ *
+ * \pre ->pl_lock is not locked.
+ */
+__u64 ldlm_pool_get_slv(struct ldlm_pool *pl)
+{
+	__u64 volume;
+	spin_lock(&pl->pl_lock);
+	volume = pl->pl_server_lock_volume;
+	spin_unlock(&pl->pl_lock);
+	return volume;
+}
+EXPORT_SYMBOL(ldlm_pool_get_slv);
+
+/**
+ * Stores passed \a slv as the SLV of \a pl, under ->pl_lock.
+ *
+ * \pre ->pl_lock is not locked.
+ */
+void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv)
+{
+	spin_lock(&pl->pl_lock);
+	pl->pl_server_lock_volume = slv;
+	spin_unlock(&pl->pl_lock);
+}
+EXPORT_SYMBOL(ldlm_pool_set_slv);
+
+/**
+ * Returns current \a pl CLV, read under ->pl_lock.
+ *
+ * \pre ->pl_lock is not locked.
+ */
+__u64 ldlm_pool_get_clv(struct ldlm_pool *pl)
+{
+	__u64 volume;
+	spin_lock(&pl->pl_lock);
+	volume = pl->pl_client_lock_volume;
+	spin_unlock(&pl->pl_lock);
+	return volume;
+}
+EXPORT_SYMBOL(ldlm_pool_get_clv);
+
+/**
+ * Stores passed \a clv as the CLV of \a pl, under ->pl_lock.
+ *
+ * \pre ->pl_lock is not locked.
+ */
+void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv)
+{
+	spin_lock(&pl->pl_lock);
+	pl->pl_client_lock_volume = clv;
+	spin_unlock(&pl->pl_lock);
+}
+EXPORT_SYMBOL(ldlm_pool_set_clv);
+
+/**
+ * Returns current \a pl limit (atomic read of pl_limit, no lock taken).
+ */
+__u32 ldlm_pool_get_limit(struct ldlm_pool *pl)
+{
+	return atomic_read(&pl->pl_limit);
+}
+EXPORT_SYMBOL(ldlm_pool_get_limit);
+
+/**
+ * Sets passed \a limit to \a pl (atomic set of pl_limit, no lock taken).
+ */
+void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit)
+{
+	atomic_set(&pl->pl_limit, limit);
+}
+EXPORT_SYMBOL(ldlm_pool_set_limit);
+
+/**
+ * Returns current LVF (lock volume factor) from \a pl; no lock is taken.
+ */
+__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl)
+{
+	return atomic_read(&pl->pl_lock_volume_factor);
+}
+EXPORT_SYMBOL(ldlm_pool_get_lvf);
+
+static int ldlm_pool_granted(struct ldlm_pool *pl)
+{
+	return atomic_read(&pl->pl_granted);	/* locks accounted in \a pl */
+}
+
+static struct ptlrpc_thread *ldlm_pools_thread;	/* NOTE(review): presumably the pools recalc thread */
+static struct shrinker *ldlm_pools_srv_shrinker;	/* presumably the server-side shrinker - confirm */
+static struct shrinker *ldlm_pools_cli_shrinker;	/* presumably the client-side shrinker - confirm */
+static struct completion ldlm_pools_comp;	/* NOTE(review): user is outside this excerpt */
+
+/*
+ * Cancel about \a nr locks from all namespaces of side \a client, asking each
+ * namespace for a share proportional to its granted locks. With \a nr == 0
+ * only the total reported by the pools is returned (see the returns below).
+ */
+static int ldlm_pools_shrink(ldlm_side_t client, int nr,
+			     unsigned int gfp_mask)
+{
+	int total = 0, cached = 0, nr_ns;
+	struct ldlm_namespace *ns;
+	void *cookie;
+
+	if (client == LDLM_NAMESPACE_CLIENT && nr != 0 &&
+	    !(gfp_mask & __GFP_FS))
+		return -1;
+
+	CDEBUG(D_DLMTRACE, "Request to shrink %d %s locks from all pools\n",
+	       nr, client == LDLM_NAMESPACE_CLIENT ? "client" : "server");
+
+	cookie = cl_env_reenter();
+
+	/*
+	 * First pass: sum what each pool reports for a zero-sized shrink.
+	 */
+	for (nr_ns = atomic_read(ldlm_namespace_nr(client));
+	     nr_ns > 0; nr_ns--)
+	{
+		mutex_lock(ldlm_namespace_lock(client));
+		if (list_empty(ldlm_namespace_list(client))) {
+			mutex_unlock(ldlm_namespace_lock(client));
+			cl_env_reexit(cookie);
+			return 0;
+		}
+		ns = ldlm_namespace_first_locked(client);
+		ldlm_namespace_get(ns);
+		ldlm_namespace_move_locked(ns, client);
+		mutex_unlock(ldlm_namespace_lock(client));
+		total += ldlm_pool_shrink(&ns->ns_pool, 0, gfp_mask);
+		ldlm_namespace_put(ns);
+	}
+
+	if (nr == 0 || total == 0) {
+		cl_env_reexit(cookie);
+		return total;
+	}
+
+	/*
+	 * Second pass: shrink at least ldlm_namespace_nr(client) namespaces.
+	 */
+	for (nr_ns = atomic_read(ldlm_namespace_nr(client));
+	     nr_ns > 0; nr_ns--)
+	{
+		int cancel, nr_locks;
+
+		/*
+		 * Do not call shrink under ldlm_namespace_lock(client)
+		 */
+		mutex_lock(ldlm_namespace_lock(client));
+		if (list_empty(ldlm_namespace_list(client))) {
+			mutex_unlock(ldlm_namespace_lock(client));
+			/*
+			 * If the list is empty we must not return any
+			 * @cached > 0, as that would probably cause a
+			 * needless shrinker call.
+			 */
+			cached = 0;
+			break;
+		}
+		ns = ldlm_namespace_first_locked(client);
+		ldlm_namespace_get(ns);
+		ldlm_namespace_move_locked(ns, client);
+		mutex_unlock(ldlm_namespace_lock(client));
+
+		nr_locks = ldlm_pool_granted(&ns->ns_pool);
+		cancel = 1 + nr_locks * nr / total;
+		ldlm_pool_shrink(&ns->ns_pool, cancel, gfp_mask);
+		cached += ldlm_pool_granted(&ns->ns_pool);
+		ldlm_namespace_put(ns);
+	}
+	cl_env_reexit(cookie);
+	/* we only decrease the SLV in the server pools shrinker, so return -1
+	 * to the kernel to avoid a needless loop. LU-1128 */
+	return (client == LDLM_NAMESPACE_SERVER) ? -1 : cached;
+}
+
+static int ldlm_pools_srv_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
+{
+	/* Shrinker callback for the server-side namespaces. */
+	return ldlm_pools_shrink(LDLM_NAMESPACE_SERVER,
+		shrink_param(sc, nr_to_scan), shrink_param(sc, gfp_mask));
+}
+
+static int ldlm_pools_cli_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
+{
+	/* Shrinker callback for the client-side namespaces. */
+	return ldlm_pools_shrink(LDLM_NAMESPACE_CLIENT,
+		shrink_param(sc, nr_to_scan), shrink_param(sc, gfp_mask));
+}
+
+void ldlm_pools_recalc(ldlm_side_t client)
+{
+	__u32 nr_l = 0, nr_p = 0, l;
+	struct ldlm_namespace *ns;
+	int nr, equal = 0;
+
+	/*
+	 * Pool limits are set up for server pools only, not for client pools.
+	 */
+	if (client == LDLM_NAMESPACE_SERVER) {
+		/*
+		 * Check all modest namespaces first.
+		 */
+		mutex_lock(ldlm_namespace_lock(client));
+		list_for_each_entry(ns, ldlm_namespace_list(client),
+					ns_list_chain)
+		{
+			if (ns->ns_appetite != LDLM_NAMESPACE_MODEST)
+				continue;
+
+			l = ldlm_pool_granted(&ns->ns_pool);
+			if (l == 0)
+				l = 1;
+
+			/*
+			 * Set the modest pools limit equal to their avg granted
+			 * locks + ~6%.
+			 */
+			l += dru(l, LDLM_POOLS_MODEST_MARGIN_SHIFT, 0);
+			ldlm_pool_setup(&ns->ns_pool, l);
+			nr_l += l;
+			nr_p++;
+		}
+
+		/*
+		 * Make sure that modest namespaces did not eat more than 2/3
+		 * of the limit.
+		 */
+		if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) {
+			CWARN("\"Modest\" pools eat out 2/3 of server locks "
+			      "limit (%d of %lu). This means that you have too "
+			      "many clients for this amount of server RAM. "
+			      "Upgrade server!\n", nr_l, LDLM_POOL_HOST_L);
+			equal = 1;
+		}
+
+		/*
+		 * The rest is given to greedy namespaces.
+		 */
+		list_for_each_entry(ns, ldlm_namespace_list(client),
+					ns_list_chain)
+		{
+			if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY)
+				continue;
+
+			if (equal) {
+				/*
+				 * In the case 2/3 of the locks are eaten
+				 * out by modest pools, we re-setup an
+				 * equal limit for _all_ pools.
+				 */
+				l = LDLM_POOL_HOST_L /
+					atomic_read(
+						ldlm_namespace_nr(client));
+			} else {
+				/*
+				 * The greedy pools share all the remaining
+				 * locks in equal parts.
+				 */
+				l = (LDLM_POOL_HOST_L - nr_l) /
+					(atomic_read(
+						ldlm_namespace_nr(client)) -
+					 nr_p);
+			}
+			ldlm_pool_setup(&ns->ns_pool, l);
+		}
+		mutex_unlock(ldlm_namespace_lock(client));
+	}
+
+	/*
+	 * Recalc at least ldlm_namespace_nr(client) namespaces.
+	 */
+	for (nr = atomic_read(ldlm_namespace_nr(client)); nr > 0; nr--) {
+		int     skip;
+		/*
+		 * Lock the list, get the first @ns in the list, take a
+		 * reference, move it to the tail, unlock and call pool recalc.
+		 * This way recalc is not called under
+		 * ldlm_namespace_lock(client), which gets rid of a potential
+		 * deadlock on client nodes when canceling locks synchronously.
+		 */
+		mutex_lock(ldlm_namespace_lock(client));
+		if (list_empty(ldlm_namespace_list(client))) {
+			mutex_unlock(ldlm_namespace_lock(client));
+			break;
+		}
+		ns = ldlm_namespace_first_locked(client);
+
+		spin_lock(&ns->ns_lock);
+		/*
+		 * skip a ns which is being freed: we don't want to increase
+		 * its refcount again, not even temporarily. bz21519 & LU-499.
+		 */
+		if (ns->ns_stopping) {
+			skip = 1;
+		} else {
+			skip = 0;
+			ldlm_namespace_get(ns);
+		}
+		spin_unlock(&ns->ns_lock);
+
+		ldlm_namespace_move_locked(ns, client);
+		mutex_unlock(ldlm_namespace_lock(client));
+
+		/*
+		 * After setup is done - recalc the pool.
+		 */
+		if (!skip) {
+			ldlm_pool_recalc(&ns->ns_pool);
+			ldlm_namespace_put(ns);
+		}
+	}
+}
+EXPORT_SYMBOL(ldlm_pools_recalc);
+
+static int ldlm_pools_thread_main(void *arg)
+{
+	struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg;
+	ENTRY;
+
+	thread_set_flags(thread, SVC_RUNNING);
+	wake_up(&thread->t_ctl_waitq);
+
+	CDEBUG(D_DLMTRACE, "%s: pool thread starting, process %d\n",
+		"ldlm_poold", current_pid());
+
+	while (1) {
+		struct l_wait_info lwi;
+
+		/*
+		 * Recalc all pools on this tick, server side first.
+		 */
+		ldlm_pools_recalc(LDLM_NAMESPACE_SERVER);
+		ldlm_pools_recalc(LDLM_NAMESPACE_CLIENT);
+
+		/*
+		 * Sleep until the next check time, or until the thread is
+		 * flagged as stopping or as having an event.
+		 */
+		lwi = LWI_TIMEOUT(cfs_time_seconds(LDLM_POOLS_THREAD_PERIOD),
+				  NULL, NULL);
+		l_wait_event(thread->t_ctl_waitq,
+			     thread_is_stopping(thread) ||
+			     thread_is_event(thread),
+			     &lwi);
+
+		if (thread_test_and_clear_flags(thread, SVC_STOPPING))
+			break;
+		else
+			thread_test_and_clear_flags(thread, SVC_EVENT);
+	}
+
+	thread_set_flags(thread, SVC_STOPPED);
+	wake_up(&thread->t_ctl_waitq);
+
+	CDEBUG(D_DLMTRACE, "%s: pool thread exiting, process %d\n",
+		"ldlm_poold", current_pid());
+
+	complete_and_exit(&ldlm_pools_comp, 0);
+}
+
+static int ldlm_pools_thread_start(void)
+{
+	struct l_wait_info lwi = { 0 };
+	task_t *task;
+	ENTRY;
+
+	if (ldlm_pools_thread != NULL) /* already started */
+		RETURN(-EALREADY);
+
+	OBD_ALLOC_PTR(ldlm_pools_thread);
+	if (ldlm_pools_thread == NULL)
+		RETURN(-ENOMEM);
+
+	init_completion(&ldlm_pools_comp); /* completed at thread exit */
+	init_waitqueue_head(&ldlm_pools_thread->t_ctl_waitq);
+
+	task = kthread_run(ldlm_pools_thread_main, ldlm_pools_thread,
+			   "ldlm_poold");
+	if (IS_ERR(task)) {
+		CERROR("Can't start pool thread, error %ld\n", PTR_ERR(task));
+		OBD_FREE(ldlm_pools_thread, sizeof(*ldlm_pools_thread));
+		ldlm_pools_thread = NULL; /* allow a later start attempt */
+		RETURN(PTR_ERR(task));
+	}
+	l_wait_event(ldlm_pools_thread->t_ctl_waitq,
+		     thread_is_running(ldlm_pools_thread), &lwi);
+	RETURN(0);
+}
+
+static void ldlm_pools_thread_stop(void)
+{
+	ENTRY;
+
+	if (ldlm_pools_thread == NULL) { /* never started, or already stopped */
+		EXIT;
+		return;
+	}
+
+	thread_set_flags(ldlm_pools_thread, SVC_STOPPING);
+	wake_up(&ldlm_pools_thread->t_ctl_waitq);
+
+	/*
+	 * Wait for the pools thread to complete ldlm_pools_comp before
+	 * freeing @thread. Freeing it earlier would race with the thread
+	 * and could oops on an access to freed memory.
+	 */
+	wait_for_completion(&ldlm_pools_comp);
+	OBD_FREE_PTR(ldlm_pools_thread);
+	ldlm_pools_thread = NULL;
+	EXIT;
+}
+
+int ldlm_pools_init(void)
+{
+	int rc;
+	ENTRY;
+
+	rc = ldlm_pools_thread_start();
+	if (rc != 0)
+		RETURN(rc);
+
+	/* The pools thread is running: register both shrinkers. */
+	ldlm_pools_srv_shrinker = set_shrinker(DEFAULT_SEEKS,
+					       ldlm_pools_srv_shrink);
+	ldlm_pools_cli_shrinker = set_shrinker(DEFAULT_SEEKS,
+					       ldlm_pools_cli_shrink);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_pools_init);
+
+void ldlm_pools_fini(void)
+{
+	if (ldlm_pools_srv_shrinker != NULL) { /* registered by _init() */
+		remove_shrinker(ldlm_pools_srv_shrinker);
+		ldlm_pools_srv_shrinker = NULL;
+	}
+	if (ldlm_pools_cli_shrinker != NULL) { /* registered by _init() */
+		remove_shrinker(ldlm_pools_cli_shrinker);
+		ldlm_pools_cli_shrinker = NULL;
+	}
+	ldlm_pools_thread_stop(); /* no-op if the thread was never started */
+}
+EXPORT_SYMBOL(ldlm_pools_fini);
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_request.c b/drivers/staging/lustre/lustre/ldlm/ldlm_request.c
new file mode 100644
index 000000000000..1a690edaba03
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_request.c
@@ -0,0 +1,2333 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/**
+ * This file contains Asynchronous System Trap (AST) handlers and related
+ * LDLM request-processing routines.
+ *
+ * An AST is a callback issued on a lock when its state is changed. There are
+ * several different types of ASTs (callbacks) registered for each lock:
+ *
+ * - completion AST: when a lock is enqueued by some process, but cannot be
+ *   granted immediately due to other conflicting locks on the same resource,
+ *   the completion AST is sent to notify the caller when the lock is
+ *   eventually granted
+ *
+ * - blocking AST: when a lock is granted to some process, if another process
+ *   enqueues a conflicting (blocking) lock on a resource, a blocking AST is
+ *   sent to notify the holder(s) of the lock(s) of the conflicting lock
+ *   request. The lock holder(s) must release their lock(s) on that resource in
+ *   a timely manner or be evicted by the server.
+ *
+ * - glimpse AST: this is used when a process wants information about a lock
+ *   (i.e. the lock value block (LVB)) but does not necessarily require holding
+ *   the lock. If the resource is locked, the lock holder(s) are sent glimpse
+ *   ASTs and the LVB is returned to the caller, and lock holder(s) may CANCEL
+ *   their lock(s) if they are idle. If the resource is not locked, the server
+ *   may grant the lock.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include <lustre_dlm.h>
+#include <obd_class.h>
+#include <obd.h>
+
+#include "ldlm_internal.h"
+
+int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT;
+CFS_MODULE_PARM(ldlm_enqueue_min, "i", int, 0644,
+		"lock enqueue timeout minimum");
+
+/* On the client side: whether cached locks are canceled before replay. */
+unsigned int ldlm_cancel_unused_locks_before_replay = 1;
+
+static void interrupted_completion_wait(void *data)
+{ /* no-op: used as the interrupt callback in ldlm_completion_ast() */
+}
+
+struct lock_wait_data { /* handed to the wait callbacks as their data */
+	struct ldlm_lock *lwd_lock; /* the lock being waited on */
+	__u32	     lwd_conn_cnt; /* imp_conn_cnt sampled before the wait */
+};
+
+struct ldlm_async_args {
+	struct lustre_handle lock_handle; /* presumably the lock of an async request -- confirm */
+};
+
+int ldlm_expired_completion_wait(void *data)
+{
+	struct lock_wait_data *lwd = data;
+	struct ldlm_lock *lock = lwd->lwd_lock;
+	struct obd_import *imp;
+	struct obd_device *obd;
+
+	ENTRY;
+	if (lock->l_conn_export == NULL) { /* no export: only warn and dump */
+		static cfs_time_t next_dump = 0, last_dump = 0;
+
+		if (ptlrpc_check_suspend())
+			RETURN(0);
+
+		LCONSOLE_WARN("lock timed out (enqueued at "CFS_TIME_T", "
+			      CFS_DURATION_T"s ago)\n",
+			      lock->l_last_activity,
+			      cfs_time_sub(cfs_time_current_sec(),
+					   lock->l_last_activity));
+		LDLM_DEBUG(lock, "lock timed out (enqueued at "CFS_TIME_T", "
+			   CFS_DURATION_T"s ago); not entering recovery in "
+			   "server code, just going back to sleep",
+			   lock->l_last_activity,
+			   cfs_time_sub(cfs_time_current_sec(),
+					lock->l_last_activity));
+		if (cfs_time_after(cfs_time_current(), next_dump)) {
+			last_dump = next_dump;
+			next_dump = cfs_time_shift(300); /* rate-limits dumps */
+			ldlm_namespace_dump(D_DLMTRACE,
+					    ldlm_lock_to_ns(lock));
+			if (last_dump == 0) /* true only for the first dump */
+				libcfs_debug_dumplog();
+		}
+		RETURN(0);
+	}
+
+	obd = lock->l_conn_export->exp_obd;
+	imp = obd->u.cli.cl_import;
+	ptlrpc_fail_import(imp, lwd->lwd_conn_cnt);
+	LDLM_ERROR(lock, "lock timed out (enqueued at "CFS_TIME_T", "
+		  CFS_DURATION_T"s ago), entering recovery for %s@%s",
+		  lock->l_last_activity,
+		  cfs_time_sub(cfs_time_current_sec(), lock->l_last_activity),
+		  obd2cli_tgt(obd), imp->imp_connection->c_remote_uuid.uuid);
+
+	RETURN(0); /* every path returns 0 */
+}
+EXPORT_SYMBOL(ldlm_expired_completion_wait);
+
+/* Enqueue timeout for \a lock; server-side and client-side callers on a
+   single node share this same basis. */
+int ldlm_get_enq_timeout(struct ldlm_lock *lock)
+{
+	int estimate = at_get(ldlm_lock_to_ns_at(lock));
+	if (AT_OFF)
+		return obd_timeout / 2;
+	/* These timeouts are non-updating, so stay conservative: take 150%
+	   of the estimate, capped at at_max. Some kind of "early reply"
+	   mechanism for lock callbacks would be nice too... */
+	estimate = min_t(int, at_max, estimate + (estimate >> 1));
+	return max(estimate, ldlm_enqueue_min);
+}
+EXPORT_SYMBOL(ldlm_get_enq_timeout);
+
+/**
+ * Common tail of the completion ASTs. Returns -EIO for a destroyed or
+ * failed lock; otherwise feeds the time since l_last_activity into the
+ * namespace's time estimate and returns 0.
+ */
+static int ldlm_completion_tail(struct ldlm_lock *lock)
+{
+	long waited;
+
+	if (lock->l_destroyed || lock->l_flags & LDLM_FL_FAILED) {
+		LDLM_DEBUG(lock, "client-side enqueue: destroyed");
+		return -EIO;
+	}
+
+	/* Time elapsed since the lock's last recorded activity. */
+	waited = cfs_time_sub(cfs_time_current_sec(),
+			      lock->l_last_activity);
+	LDLM_DEBUG(lock, "client-side enqueue: granted after "
+		   CFS_DURATION_T"s", waited);
+
+	/* Update our time estimate */
+	at_measured(ldlm_lock_to_ns_at(lock), waited);
+
+	return 0;
+}
+
+/**
+ * Implementation of ->l_completion_ast() for a client that doesn't wait
+ * until the lock is granted. Suitable for locks enqueued through ptlrpcd,
+ * or other threads that cannot block for long.
+ */
+int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data)
+{
+	ENTRY;
+
+	if (flags == LDLM_FL_WAIT_NOREPROC) {
+		LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock");
+		RETURN(0);
+	}
+
+	if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
+		     LDLM_FL_BLOCK_CONV)) {
+		LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, "
+			   "going forward");
+		ldlm_reprocess_all(lock->l_resource);
+		RETURN(0);
+	}
+
+	wake_up(&lock->l_waitq);
+	RETURN(ldlm_completion_tail(lock));
+}
+EXPORT_SYMBOL(ldlm_completion_ast_async);
+
+/**
+ * Generic LDLM "completion" AST. This is called in several cases:
+ *
+ *     - when a reply to an ENQUEUE RPC is received from the server
+ *       (ldlm_cli_enqueue_fini()). Lock might be granted or not granted at
+ *       this point (determined by flags);
+ *
+ *     - when LDLM_CP_CALLBACK RPC comes to client to notify it that lock has
+ *       been granted;
+ *
+ *     - when ldlm_lock_match(LDLM_FL_LVB_READY) is about to wait until lock
+ *       gets correct lvb;
+ *
+ *     - to force all locks when resource is destroyed (cleanup_resource());
+ *
+ *     - during lock conversion (not used currently).
+ *
+ * If the lock is not granted in the first case, this function sleeps until
+ * the second or penultimate case happens in some other thread.
+ *
+ */
+int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data)
+{
+	/* XXX ALLOCATE - 160 bytes */
+	struct lock_wait_data lwd;
+	struct obd_device *obd;
+	struct obd_import *imp = NULL;
+	struct l_wait_info lwi;
+	__u32 timeout;
+	int rc = 0;
+	ENTRY;
+
+	if (flags == LDLM_FL_WAIT_NOREPROC) {
+		LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock");
+		goto noreproc;
+	}
+
+	if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
+		       LDLM_FL_BLOCK_CONV))) {
+		wake_up(&lock->l_waitq);
+		RETURN(0);
+	}
+
+	LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, "
+		   "sleeping");
+
+noreproc:
+
+	obd = class_exp2obd(lock->l_conn_export);
+
+	/* a local lock has no import; lwd.lwd_conn_cnt then stays unset */
+	if (obd != NULL) {
+		imp = obd->u.cli.cl_import;
+	}
+
+	/* Wait a long time for enqueue - server may have to callback a
+	   lock from another client.  Server will evict the other client if it
+	   doesn't respond reasonably, and then give us the lock. */
+	timeout = ldlm_get_enq_timeout(lock) * 2;
+
+	lwd.lwd_lock = lock;
+
+	if (lock->l_flags & LDLM_FL_NO_TIMEOUT) {
+		LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT");
+		lwi = LWI_INTR(interrupted_completion_wait, &lwd);
+	} else {
+		lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout),
+				       ldlm_expired_completion_wait,
+				       interrupted_completion_wait, &lwd);
+	}
+
+	if (imp != NULL) {
+		spin_lock(&imp->imp_lock);
+		lwd.lwd_conn_cnt = imp->imp_conn_cnt;
+		spin_unlock(&imp->imp_lock);
+	}
+
+	if (ns_is_client(ldlm_lock_to_ns(lock)) &&
+	    OBD_FAIL_CHECK_RESET(OBD_FAIL_LDLM_INTR_CP_AST,
+				 OBD_FAIL_LDLM_CP_BL_RACE | OBD_FAIL_ONCE)) {
+		lock->l_flags |= LDLM_FL_FAIL_LOC;
+		rc = -EINTR;
+	} else {
+		/* Go to sleep until the lock is granted or cancelled. */
+		rc = l_wait_event(lock->l_waitq,
+				  is_granted_or_cancelled(lock), &lwi);
+	}
+
+	if (rc) {
+		LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)",
+			   rc);
+		RETURN(rc);
+	}
+
+	RETURN(ldlm_completion_tail(lock));
+}
+EXPORT_SYMBOL(ldlm_completion_ast);
+
+/**
+ * Common part of blocking AST handlers: deferred lock cancellation.
+ *
+ * Marks \a lock CBPENDING, drops the resource lock (unlock_res_and_lock()),
+ * and calls ldlm_cli_cancel() if the lock has no readers and no writers.
+ *
+ * \param lock the lock blocking or canceling AST was called on
+ * \retval 0
+ * \see mdt_blocking_ast
+ * \see ldlm_blocking_ast
+ */
+int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock)
+{
+	struct lustre_handle lockh;
+	int idle, rc;
+	ENTRY;
+
+	lock->l_flags |= LDLM_FL_CBPENDING;
+	/* Sample the usage counts before the resource lock is dropped. */
+	idle = (!lock->l_readers && !lock->l_writers);
+	unlock_res_and_lock(lock);
+
+	if (!idle) {
+		LDLM_DEBUG(lock, "Lock still has references, will be "
+			   "cancelled later");
+		RETURN(0);
+	}
+
+	LDLM_DEBUG(lock, "already unused, calling ldlm_cli_cancel");
+	ldlm_lock2handle(lock, &lockh);
+	rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
+	if (rc < 0)
+		CERROR("ldlm_cli_cancel: %d\n", rc);
+	RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_blocking_ast_nocheck);
+
+/**
+ * Server blocking AST
+ *
+ * ->l_blocking_ast() callback for LDLM locks acquired by server-side
+ * OBDs.
+ *
+ * \param lock the lock which blocks a request or cancelling lock
+ * \param desc unused
+ * \param data unused
+ * \param flag indicates whether this is a cancelling or blocking callback
+ * \retval 0
+ * \see ldlm_blocking_ast_nocheck
+ */
+int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+		      void *data, int flag)
+{
+	ENTRY;
+
+	/* Don't need to do anything for a cancelling callback. */
+	if (flag == LDLM_CB_CANCELING)
+		RETURN(0);
+
+	lock_res_and_lock(lock);
+	/* Get this: if ldlm_blocking_ast is racing with intent_policy, such
+	 * that ldlm_blocking_ast is called just before intent_policy method
+	 * takes the lr_lock, then by the time we get the lock, we might not
+	 * be the correct blocking function anymore.  So check, and only go
+	 * on (the helper drops the lock) if we still are. */
+	if (lock->l_blocking_ast == ldlm_blocking_ast)
+		RETURN(ldlm_blocking_ast_nocheck(lock));
+
+	/* Not the lock's blocking function anymore: just unlock. */
+	unlock_res_and_lock(lock);
+	RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_blocking_ast);
+
+/**
+ * ->l_glimpse_ast() for DLM extent locks acquired on the server-side.
+ * Always returns -ELDLM_NO_LOCK_DATA; see filter_intent_policy() for why.
+ */
+int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp)
+{
+	/*
+	 * Returning -ELDLM_NO_LOCK_DATA actually works, but the reason for
+	 * that is rather subtle: with OST-side locking, it may so happen that
+	 * _all_ extent locks are held by the OST. If client wants to obtain
+	 * current file size it calls ll{,u}_glimpse_size(), and (as locks are
+	 * on the server), dummy glimpse callback fires and does
+	 * nothing. Client still receives correct file size due to the
+	 * following fragment in filter_intent_policy():
+	 *
+	 * rc = l->l_glimpse_ast(l, NULL); // this will update the LVB
+	 * if (rc != 0 && res->lr_namespace->ns_lvbo &&
+	 *     res->lr_namespace->ns_lvbo->lvbo_update) {
+	 *	 res->lr_namespace->ns_lvbo->lvbo_update(res, NULL, 0, 1);
+	 * }
+	 *
+	 * that is, after glimpse_ast() fails, filter_lvbo_update() runs, and
+	 * returns correct file size to the client.
+	 */
+	return -ELDLM_NO_LOCK_DATA;
+}
+EXPORT_SYMBOL(ldlm_glimpse_ast);
+
+/**
+ * Enqueue a local lock (typically on a server); its handle goes to \a lockh.
+ * Returns -EINVAL for an LDLM_EXTENT lock without \a policy, -ENOMEM when the
+ * lock cannot be created, otherwise the result of ldlm_lock_enqueue().
+ */
+int ldlm_cli_enqueue_local(struct ldlm_namespace *ns,
+			   const struct ldlm_res_id *res_id,
+			   ldlm_type_t type, ldlm_policy_data_t *policy,
+			   ldlm_mode_t mode, __u64 *flags,
+			   ldlm_blocking_callback blocking,
+			   ldlm_completion_callback completion,
+			   ldlm_glimpse_callback glimpse,
+			   void *data, __u32 lvb_len, enum lvb_type lvb_type,
+			   const __u64 *client_cookie,
+			   struct lustre_handle *lockh)
+{
+	struct ldlm_lock *lock;
+	int err;
+	const struct ldlm_callback_suite cbs = { .lcs_completion = completion,
+						 .lcs_blocking   = blocking,
+						 .lcs_glimpse    = glimpse };
+	ENTRY;
+
+	LASSERT(!(*flags & LDLM_FL_REPLAY));
+	if (unlikely(ns_is_client(ns))) {
+		CERROR("Trying to enqueue local lock in a shadow namespace\n");
+		LBUG();
+	}
+	if (unlikely(type == LDLM_EXTENT && policy == NULL))
+		GOTO(out_nolock, err = -EINVAL);
+
+	lock = ldlm_lock_create(ns, res_id, type, mode, &cbs, data, lvb_len,
+				lvb_type);
+	if (unlikely(!lock))
+		GOTO(out_nolock, err = -ENOMEM);
+
+	ldlm_lock2handle(lock, lockh);
+	/* NB: we don't have any lock now (lock_res_and_lock)
+	 * because it's a new lock */
+	ldlm_lock_addref_internal_nolock(lock, mode);
+	lock->l_flags |= LDLM_FL_LOCAL;
+	if (*flags & LDLM_FL_ATOMIC_CB)
+		lock->l_flags |= LDLM_FL_ATOMIC_CB;
+
+	if (policy != NULL)
+		lock->l_policy_data = *policy;
+	if (client_cookie != NULL)
+		lock->l_client_cookie = *client_cookie;
+	if (type == LDLM_EXTENT)
+		lock->l_req_extent = policy->l_extent;
+
+	err = ldlm_lock_enqueue(ns, &lock, policy, flags);
+	if (unlikely(err != ELDLM_OK))
+		GOTO(out, err);
+
+	if (policy != NULL)
+		*policy = lock->l_policy_data;
+	if (lock->l_completion_ast)
+		lock->l_completion_ast(lock, *flags, NULL);
+	LDLM_DEBUG(lock, "client-side local enqueue handler, new lock created");
+	EXIT;
+ out:
+	LDLM_LOCK_RELEASE(lock);
+ out_nolock:
+	return err;
+}
+EXPORT_SYMBOL(ldlm_cli_enqueue_local);
+
+static void failed_lock_cleanup(struct ldlm_namespace *ns,
+				struct ldlm_lock *lock, int mode)
+{
+	int need_cancel = 0;
+
+	/* Set a flag to prevent us from sending a CANCEL (bug 407) */
+	lock_res_and_lock(lock);
+	/* Check that lock is not granted or failed, we might race. */
+	if ((lock->l_req_mode != lock->l_granted_mode) &&
+	    !(lock->l_flags & LDLM_FL_FAILED)) {
+		/* Make sure that this lock will not be found by a racing
+		 * bl_ast and that an -EINVAL reply is sent to the server
+		 * anyway. bug 17645 */
+		lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_FAILED |
+				 LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING;
+		need_cancel = 1;
+	}
+	unlock_res_and_lock(lock);
+
+	if (need_cancel)
+		LDLM_DEBUG(lock,
+			   "setting FL_LOCAL_ONLY | LDLM_FL_FAILED | "
+			   "LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING");
+	else
+		LDLM_DEBUG(lock, "lock was granted or failed in race");
+
+	ldlm_lock_decref_internal(lock, mode);
+
+	/* XXX - HACK because we shouldn't call ldlm_lock_destroy()
+	 *       from llite/file.c/ll_file_flock(). */
+	/* This code makes up for the fact that we do not have a blocking
+	 * handler on a client for flock locks. As such this is the place
+	 * where we must completely kill failed locks (interrupted ones and
+	 * those waiting to be granted when the server evicted us). */
+	if (lock->l_resource->lr_type == LDLM_FLOCK) {
+		lock_res_and_lock(lock);
+		ldlm_resource_unlink_lock(lock);
+		ldlm_lock_destroy_nolock(lock);
+		unlock_res_and_lock(lock);
+	}
+}
+
+/**
+ * Finishing portion of client lock enqueue code, called after receiving the
+ * reply from the server with \a rc as the enqueue result. Two references on
+ * the lock are dropped before returning (see the end of the function).
+ */
+int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
+			  ldlm_type_t type, __u8 with_policy, ldlm_mode_t mode,
+			  __u64 *flags, void *lvb, __u32 lvb_len,
+			  struct lustre_handle *lockh,int rc)
+{
+	struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
+	int is_replay = *flags & LDLM_FL_REPLAY;
+	struct ldlm_lock *lock;
+	struct ldlm_reply *reply;
+	int cleanup_phase = 1;
+	int size = 0;
+	ENTRY;
+
+	lock = ldlm_handle2lock(lockh);
+	/* ldlm_cli_enqueue is holding a reference on this lock. */
+	if (!lock) {
+		LASSERT(type == LDLM_FLOCK);
+		RETURN(-ENOLCK);
+	}
+
+	LASSERTF(ergo(lvb_len != 0, lvb_len == lock->l_lvb_len),
+		 "lvb_len = %d, l_lvb_len = %d\n", lvb_len, lock->l_lvb_len);
+
+	if (rc != ELDLM_OK) {
+		LASSERT(!is_replay);
+		LDLM_DEBUG(lock, "client-side enqueue END (%s)",
+			   rc == ELDLM_LOCK_ABORTED ? "ABORTED" : "FAILED");
+
+		if (rc != ELDLM_LOCK_ABORTED)
+			GOTO(cleanup, rc);
+	}
+
+	/* Before we return, swab the reply */
+	reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+	if (reply == NULL)
+		GOTO(cleanup, rc = -EPROTO);
+
+	if (lvb_len != 0) {
+		LASSERT(lvb != NULL);
+
+		size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB,
+					    RCL_SERVER);
+		if (size < 0) {
+			LDLM_ERROR(lock, "Fail to get lvb_len, rc = %d", size);
+			GOTO(cleanup, rc = size);
+		} else if (unlikely(size > lvb_len)) {
+			LDLM_ERROR(lock, "Replied LVB is larger than "
+				   "expectation, expected = %d, replied = %d",
+				   lvb_len, size);
+			GOTO(cleanup, rc = -EINVAL);
+		}
+	}
+
+	if (rc == ELDLM_LOCK_ABORTED) {
+		if (lvb_len != 0)
+			rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER,
+					   lvb, size);
+		GOTO(cleanup, rc = (rc != 0 ? rc : ELDLM_LOCK_ABORTED));
+	}
+
+	/* lock enqueued on the server */
+	cleanup_phase = 0;
+
+	lock_res_and_lock(lock);
+	/* Key change rehash lock in per-export hash with new key */
+	if (exp->exp_lock_hash) {
+		/* In the function below, .hs_keycmp resolves to
+		 * ldlm_export_lock_keycmp() */
+		/* coverity[overrun-buffer-val] */
+		cfs_hash_rehash_key(exp->exp_lock_hash,
+				    &lock->l_remote_handle,
+				    &reply->lock_handle,
+				    &lock->l_exp_hash);
+	} else {
+		lock->l_remote_handle = reply->lock_handle;
+	}
+
+	*flags = ldlm_flags_from_wire(reply->lock_flags);
+	lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags &
+					      LDLM_INHERIT_FLAGS);
+	/* move NO_TIMEOUT flag to the lock to force ldlm_lock_match()
+	 * to wait with no timeout as well */
+	lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags &
+					      LDLM_FL_NO_TIMEOUT);
+	unlock_res_and_lock(lock);
+
+	CDEBUG(D_INFO, "local: %p, remote cookie: "LPX64", flags: 0x%llx\n",
+	       lock, reply->lock_handle.cookie, *flags);
+
+	/* If enqueue returned a blocked lock but the completion handler has
+	 * already run, then it fixed up the resource and we don't need to do it
+	 * again. */
+	if ((*flags) & LDLM_FL_LOCK_CHANGED) {
+		int newmode = reply->lock_desc.l_req_mode;
+		LASSERT(!is_replay);
+		if (newmode && newmode != lock->l_req_mode) {
+			LDLM_DEBUG(lock, "server returned different mode %s",
+				   ldlm_lockname[newmode]);
+			lock->l_req_mode = newmode;
+		}
+
+		if (memcmp(reply->lock_desc.l_resource.lr_name.name,
+			  lock->l_resource->lr_name.name,
+			  sizeof(struct ldlm_res_id))) {
+			CDEBUG(D_INFO, "remote intent success, locking "
+					"(%ld,%ld,%ld) instead of "
+					"(%ld,%ld,%ld)\n",
+			      (long)reply->lock_desc.l_resource.lr_name.name[0],
+			      (long)reply->lock_desc.l_resource.lr_name.name[1],
+			      (long)reply->lock_desc.l_resource.lr_name.name[2],
+			      (long)lock->l_resource->lr_name.name[0],
+			      (long)lock->l_resource->lr_name.name[1],
+			      (long)lock->l_resource->lr_name.name[2]);
+
+			rc = ldlm_lock_change_resource(ns, lock,
+					&reply->lock_desc.l_resource.lr_name);
+			if (rc || lock->l_resource == NULL)
+				GOTO(cleanup, rc = -ENOMEM);
+			LDLM_DEBUG(lock, "client-side enqueue, new resource");
+		}
+		if (with_policy)
+			if (!(type == LDLM_IBITS &&
+			      !(exp_connect_flags(exp) & OBD_CONNECT_IBITS)))
+				/* We assume lock type cannot change on server*/
+				ldlm_convert_policy_to_local(exp,
+						lock->l_resource->lr_type,
+						&reply->lock_desc.l_policy_data,
+						&lock->l_policy_data);
+		if (type != LDLM_PLAIN)
+			LDLM_DEBUG(lock,"client-side enqueue, new policy data");
+	}
+
+	if ((*flags) & LDLM_FL_AST_SENT ||
+	    /* Cancel extent locks as soon as possible on a liblustre client,
+	     * because it cannot handle asynchronous ASTs robustly (see
+	     * bug 7311). */
+	    (LIBLUSTRE_CLIENT && type == LDLM_EXTENT)) {
+		lock_res_and_lock(lock);
+		lock->l_flags |= LDLM_FL_CBPENDING |  LDLM_FL_BL_AST;
+		unlock_res_and_lock(lock);
+		LDLM_DEBUG(lock, "enqueue reply includes blocking AST");
+	}
+
+	/* If the lock has already been granted by a completion AST, don't
+	 * clobber the LVB with an older one. */
+	if (lvb_len != 0) {
+		/* We must lock or a racing completion might update lvb without
+		 * letting us know and we'll clobber the correct value.
+		 * Cannot unlock after the check either, as that still leaves
+		 * a tiny window for completion to get in */
+		lock_res_and_lock(lock);
+		if (lock->l_req_mode != lock->l_granted_mode)
+			rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER,
+					   lock->l_lvb_data, size);
+		unlock_res_and_lock(lock);
+		if (rc < 0) {
+			cleanup_phase = 1;
+			GOTO(cleanup, rc);
+		}
+	}
+
+	if (!is_replay) {
+		rc = ldlm_lock_enqueue(ns, &lock, NULL, flags);
+		if (lock->l_completion_ast != NULL) {
+			int err = lock->l_completion_ast(lock, *flags, NULL);
+			if (!rc)
+				rc = err;
+			if (rc)
+				cleanup_phase = 1;
+		}
+	}
+
+	if (lvb_len && lvb != NULL) {
+		/* Copy the LVB here, and not earlier, because the completion
+		 * AST (if any) can override what we got in the reply */
+		memcpy(lvb, lock->l_lvb_data, lvb_len);
+	}
+
+	LDLM_DEBUG(lock, "client-side enqueue END");
+	EXIT;
+cleanup:
+	if (cleanup_phase == 1 && rc)
+		failed_lock_cleanup(ns, lock, mode);
+	/* Put lock 2 times, the second reference is held by ldlm_cli_enqueue */
+	LDLM_LOCK_PUT(lock);
+	LDLM_LOCK_RELEASE(lock);
+	return rc;
+}
+EXPORT_SYMBOL(ldlm_cli_enqueue_fini);
+
+/**
+ * Estimate how many lock handles fit into a request that already uses
+ * \a req_size bytes. The PAGE_CACHE_SIZE - 512 cap leaves room for TCP/IP
+ * and LNET headers within a single page on the send/receive side.
+ * XXX: 512 should be changed to a more adequate value.
+ */
+static inline int ldlm_req_handles_avail(int req_size, int off)
+{
+	int room, handles = 0;
+
+	room = min_t(int, LDLM_MAXREQSIZE, PAGE_CACHE_SIZE - 512) - req_size;
+	/* Only non-negative leftover space is converted into handles. */
+	if (likely(room >= 0))
+		handles = room / (int)sizeof(struct lustre_handle);
+
+	/* LDLM_LOCKREQ_HANDLES less \a off is added on top in every case. */
+	handles += LDLM_LOCKREQ_HANDLES - off;
+	return handles;
+}
+
+/* Handles that fit given the current message size of \a pill at \a loc. */
+static inline int ldlm_capsule_handles_avail(struct req_capsule *pill,
+					     enum req_location loc, int off)
+{
+	return ldlm_req_handles_avail(req_capsule_msg_size(pill, loc), off);
+}
+
+/* Handles that fit given the size of format \a fmt at \a loc for \a imp. */
+static inline int ldlm_format_handles_avail(struct obd_import *imp,
+					    const struct req_format *fmt,
+					    enum req_location loc, int off)
+{
+	return ldlm_req_handles_avail(
+		req_capsule_fmt_size(imp->imp_msg_magic, fmt, loc), off);
+}
+
+/**
+ * Prepare \a req for operation \a opc and piggy-back lock cancels on it.
+ *
+ * The \a count locks already linked on \a cancels, plus unused LRU locks
+ * cancelled here while room remains, are handed to ldlm_cli_cancel_list()
+ * together with \a req; the surplus is handed to it without a request.
+ * \a cancels may be NULL, in which case a local list head is used.
+ */
+int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req,
+		      int version, int opc, int canceloff,
+		      struct list_head *cancels, int count)
+{
+	struct req_capsule *pill = &req->rq_pill;
+	struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
+	struct ldlm_request *dlm = NULL;
+	int lru_flags, room, to_free, pack = 0;
+	LIST_HEAD(head);
+	int rc;
+	ENTRY;
+
+	if (cancels == NULL)
+		cancels = &head;
+	if (ns_connect_cancelset(ns)) {
+		/* How many handles still fit into the request buffer? */
+		req_capsule_filled_sizes(pill, RCL_CLIENT);
+		room = ldlm_capsule_handles_avail(pill, RCL_CLIENT, canceloff);
+
+		if (ns_connect_lru_resize(ns))
+			lru_flags = LDLM_CANCEL_LRUR;
+		else
+			lru_flags = LDLM_CANCEL_AGED;
+		to_free = (!ns_connect_lru_resize(ns) &&
+			   opc == LDLM_ENQUEUE) ? 1 : 0;
+
+		/* LRU locks are cancelled here _only_ because the server
+		 * supports EARLY_CANCEL; an extra CANCEL RPC for them
+		 * would just make us slower. */
+		if (room > count)
+			count += ldlm_cancel_lru_local(ns, cancels, to_free,
+						       room - count, 0,
+						       lru_flags);
+		pack = (room > count) ? count : room;
+		req_capsule_set_size(pill, &RMF_DLM_REQ, RCL_CLIENT,
+				     ldlm_request_bufsize(pack, opc));
+	}
+
+	rc = ptlrpc_request_pack(req, version, opc);
+	if (rc != 0) {
+		/* The request could not be packed: put the queued locks. */
+		ldlm_lock_list_put(cancels, l_bl_ast, count);
+		RETURN(rc);
+	}
+
+	if (!ns_connect_cancelset(ns)) {
+		ldlm_lock_list_put(cancels, l_bl_ast, count);
+		RETURN(0);
+	}
+
+	if (canceloff) {
+		dlm = req_capsule_client_get(pill, &RMF_DLM_REQ);
+		LASSERT(dlm);
+		/* Skip the first handle slots: packing increments
+		 * @lock_count by the handles actually written. */
+		dlm->lock_count = canceloff;
+	}
+	/* Pack @pack lock handles into the request ... */
+	ldlm_cli_cancel_list(cancels, pack, req, 0);
+	/* ... and prepare/send a separate cancel RPC for the others. */
+	ldlm_cli_cancel_list(cancels, count - pack, NULL, 0);
+	RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_prep_elc_req);
+
+/* Enqueue flavour of ldlm_prep_elc_req(): version, opcode and offset fixed. */
+int ldlm_prep_enqueue_req(struct obd_export *exp, struct ptlrpc_request *req,
+			  struct list_head *cancels, int count)
+{
+	return ldlm_prep_elc_req(exp, req, LUSTRE_DLM_VERSION, LDLM_ENQUEUE,
+				 LDLM_ENQUEUE_CANCEL_OFF, cancels, count);
+}
+EXPORT_SYMBOL(ldlm_prep_enqueue_req);
+
+/* Allocate and prepare an LDLM_ENQUEUE request; ERR_PTR() on failure. */
+struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len)
+{
+	struct ptlrpc_request *req;
+	int rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE);
+	if (!req)
+		RETURN(ERR_PTR(-ENOMEM));
+	rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
+	if (rc != 0) {
+		ptlrpc_request_free(req);
+		RETURN(ERR_PTR(rc));
+	}
+	req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len);
+	ptlrpc_request_set_replen(req);
+	RETURN(req);
+}
+EXPORT_SYMBOL(ldlm_enqueue_pack);
+
+/**
+ * Client-side lock enqueue.
+ *
+ * A pre-initialised request may be passed in through \a reqp; otherwise one
+ * is allocated here.  With \a async set the prepared request is left in
+ * \a reqp (which must then be non-NULL) and 0 is returned without sending.
+ *
+ * \a policy is mandatory for new LDLM_EXTENT locks: -EINVAL is returned if
+ * it is missing, -ENOMEM if the lock or the request cannot be allocated.
+ */
+int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp,
+		     struct ldlm_enqueue_info *einfo,
+		     const struct ldlm_res_id *res_id,
+		     ldlm_policy_data_t const *policy, __u64 *flags,
+		     void *lvb, __u32 lvb_len, enum lvb_type lvb_type,
+		     struct lustre_handle *lockh, int async)
+{
+	struct ldlm_namespace *ns;
+	struct ldlm_lock      *lock;
+	struct ldlm_request   *body;
+	int		    is_replay = *flags & LDLM_FL_REPLAY;
+	int		    req_passed_in = 1;
+	int		    rc, err;
+	struct ptlrpc_request *req;
+	ENTRY;
+
+	LASSERT(exp != NULL);
+	ns = exp->exp_obd->obd_namespace;
+
+	/* If we're replaying this lock, just check some invariants.
+	 * If we're creating a new lock, get everything all setup nice. */
+	if (is_replay) {
+		lock = ldlm_handle2lock_long(lockh, 0);
+		LASSERT(lock != NULL);
+		LDLM_DEBUG(lock, "client-side enqueue START");
+		LASSERT(exp == lock->l_conn_export);
+	} else {
+		const struct ldlm_callback_suite cbs = {
+			.lcs_completion = einfo->ei_cb_cp,
+			.lcs_blocking   = einfo->ei_cb_bl,
+			.lcs_glimpse    = einfo->ei_cb_gl,
+			.lcs_weigh      = einfo->ei_cb_wg
+		};
+
+		/* Extent locks copy their extent from \a policy below. */
+		if (einfo->ei_type == LDLM_EXTENT && policy == NULL)
+			RETURN(-EINVAL);
+		lock = ldlm_lock_create(ns, res_id, einfo->ei_type,
+					einfo->ei_mode, &cbs, einfo->ei_cbdata,
+					lvb_len, lvb_type);
+		if (lock == NULL)
+			RETURN(-ENOMEM);
+		/* for the local lock, add the reference */
+		ldlm_lock_addref_internal(lock, einfo->ei_mode);
+		ldlm_lock2handle(lock, lockh);
+		if (policy != NULL) {
+			/* INODEBITS_INTEROP: If the server does not support
+			 * inodebits, we will request a plain lock in the
+			 * descriptor (ldlm_lock2desc() below) but use an
+			 * inodebits lock internally with both bits set. */
+			if (einfo->ei_type == LDLM_IBITS &&
+			    !(exp_connect_flags(exp) & OBD_CONNECT_IBITS))
+				lock->l_policy_data.l_inodebits.bits =
+					MDS_INODELOCK_LOOKUP |
+					MDS_INODELOCK_UPDATE;
+			else
+				lock->l_policy_data = *policy;
+		}
+
+		if (einfo->ei_type == LDLM_EXTENT)
+			lock->l_req_extent = policy->l_extent;
+		LDLM_DEBUG(lock, "client-side enqueue START, flags %llx\n",
+			   *flags);
+	}
+
+	lock->l_conn_export = exp;
+	lock->l_export = NULL;
+	lock->l_blocking_ast = einfo->ei_cb_bl;
+	lock->l_flags |= (*flags & LDLM_FL_NO_LRU);
+
+	/* lock not sent to server yet */
+	if (reqp == NULL || *reqp == NULL) {
+		req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+						&RQF_LDLM_ENQUEUE,
+						LUSTRE_DLM_VERSION,
+						LDLM_ENQUEUE);
+		if (req == NULL) {
+			failed_lock_cleanup(ns, lock, einfo->ei_mode);
+			LDLM_LOCK_RELEASE(lock);
+			RETURN(-ENOMEM);
+		}
+		req_passed_in = 0;
+		if (reqp)
+			*reqp = req;
+	} else {
+		int len;
+
+		req = *reqp;
+		len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ,
+					   RCL_CLIENT);
+		LASSERTF(len >= sizeof(*body), "buflen[%d] = %d, not %d\n",
+			 DLM_LOCKREQ_OFF, len, (int)sizeof(*body));
+	}
+
+	/* Dump lock data into the request buffer */
+	body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+	ldlm_lock2desc(lock, &body->lock_desc);
+	body->lock_flags = ldlm_flags_to_wire(*flags);
+	body->lock_handle[0] = *lockh;
+
+	/* Continue as normal. */
+	if (!req_passed_in) {
+		if (lvb_len > 0)
+			req_capsule_extend(&req->rq_pill,
+					   &RQF_LDLM_ENQUEUE_LVB);
+		req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
+				     lvb_len);
+		ptlrpc_request_set_replen(req);
+	}
+
+	/*
+	 * Liblustre client doesn't get extent locks, except for O_APPEND case
+	 * where [0, OBD_OBJECT_EOF] lock is taken, or truncate, where
+	 * [i_size, OBD_OBJECT_EOF] lock is taken.
+	 */
+	LASSERT(ergo(LIBLUSTRE_CLIENT, einfo->ei_type != LDLM_EXTENT ||
+		     policy->l_extent.end == OBD_OBJECT_EOF));
+
+	if (async) {
+		LASSERT(reqp != NULL);
+		RETURN(0);
+	}
+
+	LDLM_DEBUG(lock, "sending request");
+
+	rc = ptlrpc_queue_wait(req);
+
+	err = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, policy ? 1 : 0,
+				    einfo->ei_mode, flags, lvb, lvb_len,
+				    lockh, rc);
+
+	/* If ldlm_cli_enqueue_fini did not find the lock, we need to free
+	 * one reference that we took */
+	if (err == -ENOLCK)
+		LDLM_LOCK_RELEASE(lock);
+	else
+		rc = err;
+
+	if (!req_passed_in && req != NULL) {
+		ptlrpc_req_finished(req);
+		if (reqp)
+			*reqp = NULL;
+	}
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_cli_enqueue);
+
+/* Used by ldlm_cli_convert() for locks without l_conn_export: no RPC is sent.
+ * Puts \a lock before returning 0, or positive EDEADLOCK on failure. */
+static int ldlm_cli_convert_local(struct ldlm_lock *lock, int new_mode,
+				  __u32 *flags)
+{
+	struct ldlm_resource *res;
+	int rc = 0;
+	ENTRY;
+	if (ns_is_client(ldlm_lock_to_ns(lock))) {
+		CERROR("Trying to convert local lock\n");
+		LBUG();
+	}
+	LDLM_DEBUG(lock, "client-side local convert");
+
+	res = ldlm_lock_convert(lock, new_mode, flags);
+	if (res != NULL)
+		ldlm_reprocess_all(res);
+	else
+		rc = EDEADLOCK;	/* ldlm_lock_convert() gave no resource */
+	LDLM_DEBUG(lock, "client-side local convert handler END");
+	LDLM_LOCK_PUT(lock);
+	RETURN(rc);
+}
+
+/* Convert the lock behind \a lockh to \a new_mode; \a flags is reset to 0.
+ * Locks without l_conn_export are converted locally, the rest via an RPC.
+ * FIXME: ldlm_cli_convert or the server should reject conversion of locks on
+ * the waiting or converting queue.  Callers account for readers/writers. */
+int ldlm_cli_convert(struct lustre_handle *lockh, int new_mode, __u32 *flags)
+{
+	struct ldlm_request   *body;
+	struct ldlm_reply     *reply;
+	struct ldlm_lock      *lock;
+	struct ldlm_resource  *res;
+	struct ptlrpc_request *req;
+	int		    rc;
+	ENTRY;
+
+	lock = ldlm_handle2lock(lockh);
+	if (!lock) {
+		LBUG();
+		RETURN(-EINVAL);
+	}
+	*flags = 0;
+
+	if (lock->l_conn_export == NULL)
+		RETURN(ldlm_cli_convert_local(lock, new_mode, flags));
+
+	LDLM_DEBUG(lock, "client-side convert");
+
+	req = ptlrpc_request_alloc_pack(class_exp2cliimp(lock->l_conn_export),
+					&RQF_LDLM_CONVERT, LUSTRE_DLM_VERSION,
+					LDLM_CONVERT);
+	if (req == NULL) {
+		LDLM_LOCK_PUT(lock);
+		RETURN(-ENOMEM);
+	}
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+	body->lock_handle[0] = lock->l_remote_handle;
+
+	body->lock_desc.l_req_mode = new_mode;
+	body->lock_flags = ldlm_flags_to_wire(*flags);
+
+	/* Send the request synchronously and check the three failure points. */
+	ptlrpc_request_set_replen(req);
+	rc = ptlrpc_queue_wait(req);
+	if (rc != ELDLM_OK)
+		GOTO(out, rc);
+
+	reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+	if (reply == NULL)
+		GOTO(out, rc = -EPROTO);
+
+	if (req->rq_status)
+		GOTO(out, rc = req->rq_status);
+
+	res = ldlm_lock_convert(lock, new_mode, &reply->lock_flags);
+	if (res != NULL) {
+		ldlm_reprocess_all(res);
+		/* Go to sleep until the lock is granted. */
+		/* FIXME: or cancelled. */
+		if (lock->l_completion_ast) {
+			rc = lock->l_completion_ast(lock, LDLM_FL_WAIT_NOREPROC,
+						    NULL);
+			if (rc)
+				GOTO(out, rc);
+		}
+	} else {
+		rc = EDEADLOCK;	/* NB: returned as a positive value */
+	}
+	EXIT;
+ out:
+	LDLM_LOCK_PUT(lock);
+	ptlrpc_req_finished(req);
+	return rc;
+}
+EXPORT_SYMBOL(ldlm_cli_convert);
+
+/**
+ * Cancel \a lock locally and report which kind of CANCEL RPC, if any, is due.
+ *
+ * \retval LDLM_FL_LOCAL_ONLY no CANCEL RPC is needed (or lock has no export)
+ * \retval LDLM_FL_BL_AST     lock carries LDLM_FL_BL_AST: separate CANCEL RPC
+ * \retval LDLM_FL_CANCELING  otherwise
+ */
+static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock)
+{
+	__u64 rc = LDLM_FL_LOCAL_ONLY;
+	ENTRY;
+
+	if (lock->l_conn_export) {
+		bool local_only;
+
+		LDLM_DEBUG(lock, "client-side cancel");
+		/* Set CBPENDING to prevent others from getting new references;
+		 * local_only is sampled before the cancel callback runs. */
+		lock_res_and_lock(lock);
+		lock->l_flags |= LDLM_FL_CBPENDING;
+		local_only = !!(lock->l_flags &
+				(LDLM_FL_LOCAL_ONLY|LDLM_FL_CANCEL_ON_BLOCK));
+		ldlm_cancel_callback(lock);
+		rc = (lock->l_flags & LDLM_FL_BL_AST) ?
+			LDLM_FL_BL_AST : LDLM_FL_CANCELING;
+		unlock_res_and_lock(lock);
+
+		if (local_only) {
+			CDEBUG(D_DLMTRACE, "not sending request (at caller's "
+			       "instruction)\n");
+			rc = LDLM_FL_LOCAL_ONLY;
+		}
+		ldlm_lock_cancel(lock);
+	} else {
+		if (ns_is_client(ldlm_lock_to_ns(lock))) {
+			LDLM_ERROR(lock, "Trying to cancel local lock");
+			LBUG();
+		}
+		LDLM_DEBUG(lock, "server-side local cancel");
+		ldlm_lock_cancel(lock);
+		ldlm_reprocess_all(lock->l_resource);
+	}
+
+	RETURN(rc);
+}
+
+/**
+ * Append the remote handles of the first \a count locks on \a head (linked
+ * through l_bl_ast) to the ldlm_request buffer of \a req.
+ */
+static void ldlm_cancel_pack(struct ptlrpc_request *req,
+			     struct list_head *head, int count)
+{
+	struct ldlm_request *dlm;
+	struct ldlm_lock *lock;
+	int room, packed = 0;
+	ENTRY;
+
+	dlm = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+	LASSERT(dlm != NULL);
+
+	/* Handle slots in the buffer; it must hold all we are asked for. */
+	room = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT) -
+		sizeof(struct ldlm_request);
+	room /= sizeof(struct lustre_handle);
+	room += LDLM_LOCKREQ_HANDLES;
+	LASSERT(room >= dlm->lock_count + count);
+
+	/* XXX: packing lock handles grouped by resource would let the
+	 * server cancel call filter_lvbo_update() less frequently. */
+	list_for_each_entry(lock, head, l_bl_ast) {
+		if (count-- == 0)
+			break;
+		LASSERT(lock->l_conn_export);
+		LDLM_DEBUG(lock, "packing");
+		dlm->lock_handle[dlm->lock_count] = lock->l_remote_handle;
+		dlm->lock_count++;
+		packed++;
+	}
+	CDEBUG(D_DLMTRACE, "%d locks packed\n", packed);
+	EXIT;
+}
+
+/**
+ * Prepare and send one batched CANCEL RPC for the first \a count locks on
+ * \a cancels.  Returns how many handles were covered, else the error code. */
+int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels,
+			int count, ldlm_cancel_flags_t flags)
+{
+	struct ptlrpc_request *req = NULL;
+	struct obd_import *imp;
+	int free, sent = 0;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(exp != NULL);
+	LASSERT(count > 0);
+
+	CFS_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL, cfs_fail_val);
+
+	if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_RACE))
+		RETURN(count);
+
+	free = ldlm_format_handles_avail(class_exp2cliimp(exp),
+					 &RQF_LDLM_CANCEL, RCL_CLIENT, 0);
+	if (count > free)
+		count = free;	/* cap at what one request can hold */
+
+	while (1) {
+		imp = class_exp2cliimp(exp);
+		if (imp == NULL || imp->imp_invalid) {
+			CDEBUG(D_DLMTRACE,
+			       "skipping cancel on invalid import %p\n", imp);
+			RETURN(count);
+		}
+
+		req = ptlrpc_request_alloc(imp, &RQF_LDLM_CANCEL);
+		if (req == NULL)
+			GOTO(out, rc = -ENOMEM);
+
+		req_capsule_filled_sizes(&req->rq_pill, RCL_CLIENT);
+		req_capsule_set_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT,
+				     ldlm_request_bufsize(count, LDLM_CANCEL));
+
+		rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CANCEL);
+		if (rc) {
+			ptlrpc_request_free(req);
+			GOTO(out, rc);
+		}
+
+		req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL;
+		req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL;
+		ptlrpc_at_set_req_timeout(req);
+
+		ldlm_cancel_pack(req, cancels, count);
+
+		ptlrpc_request_set_replen(req);
+		if (flags & LCF_ASYNC) {
+			ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
+			sent = count;
+			GOTO(out, 0);
+		} else {
+			rc = ptlrpc_queue_wait(req);
+		}
+		if (rc == ESTALE) {
+			CDEBUG(D_DLMTRACE, "client/server (nid %s) "
+			       "out of sync -- not fatal\n",
+			       libcfs_nid2str(req->rq_import->
+					      imp_connection->c_peer.nid));
+			rc = 0;
+		} else if (rc == -ETIMEDOUT && /* check there was no reconnect*/
+			   req->rq_import_generation == imp->imp_generation) {
+			ptlrpc_req_finished(req);
+			continue;	/* retry with a newly allocated request */
+		} else if (rc != ELDLM_OK) {
+			/* -ESHUTDOWN is common on umount */
+			CDEBUG_LIMIT(rc == -ESHUTDOWN ? D_DLMTRACE : D_ERROR,
+				     "Got rc %d from cancel RPC: "
+				     "canceling anyway\n", rc);
+			break;
+		}
+		sent = count;
+		break;
+	}
+
+	ptlrpc_req_finished(req);
+	EXIT;
+out:
+	return sent ? sent : rc;
+}
+EXPORT_SYMBOL(ldlm_cli_cancel_req);
+
+static inline struct ldlm_pool *ldlm_imp2pl(struct obd_import *imp)
+{
+	LASSERT(imp != NULL);	/* path: import -> obd -> namespace -> pool */
+	return &imp->imp_obd->obd_namespace->ns_pool;
+}
+
+/**
+ * Copy the SLV and lock limit carried by the reply of \a req into the pool
+ * fields of the import's OBD device.
+ *
+ * Nothing is updated when the import or its OBD is missing, when LRU resize
+ * is not enabled on the import, or when either value is zero.  Returns 0.
+ */
+int ldlm_cli_update_pool(struct ptlrpc_request *req)
+{
+	struct obd_device *obd;
+	__u32 new_limit;
+	__u64 new_slv;
+	ENTRY;
+
+	if (unlikely(!req->rq_import || !req->rq_import->imp_obd ||
+		     !imp_connect_lru_resize(req->rq_import))) {
+		/* Do nothing for corner cases. */
+		RETURN(0);
+	}
+
+	/* The RPC may carry a zeroed SLV and limit.  That happens when
+	 * the server does not support the LRU resize feature, and in
+	 * some recovery cases where server-side reqs have no reference
+	 * to the OBD export and so cannot reach the namespace. */
+	if (lustre_msg_get_slv(req->rq_repmsg) == 0 ||
+	    lustre_msg_get_limit(req->rq_repmsg) == 0) {
+		DEBUG_REQ(D_HA, req, "Zero SLV or Limit found "
+			  "(SLV: "LPU64", Limit: %u)",
+			  lustre_msg_get_slv(req->rq_repmsg),
+			  lustre_msg_get_limit(req->rq_repmsg));
+		RETURN(0);
+	}
+
+	new_limit = lustre_msg_get_limit(req->rq_repmsg);
+	new_slv = lustre_msg_get_slv(req->rq_repmsg);
+	obd = req->rq_import->imp_obd;
+
+	/* Publish the values through the OBD fields, under obd_pool_lock,
+	 * for the pool thread to pick up.  obd_namespace and its pool are
+	 * deliberately not touched here: nothing guarantees they are still
+	 * alive at cleanup time, and racing with that could Oops. */
+	write_lock(&obd->obd_pool_lock);
+	obd->obd_pool_limit = new_limit;
+	obd->obd_pool_slv = new_slv;
+	write_unlock(&obd->obd_pool_lock);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_cli_update_pool);
+
+/**
+ * Client side lock cancel; the lock must have no readers or writers left.
+ * When the export supports cancel sets, unused LRU locks are cancelled too
+ * and handed over in the same batch.  Always returns 0.
+ */
+int ldlm_cli_cancel(struct lustre_handle *lockh,
+		    ldlm_cancel_flags_t cancel_flags)
+{
+	struct ldlm_namespace *ns;
+	struct obd_export *exp;
+	struct ldlm_lock *lock;
+	int room, lru_flags, nr = 1;
+	LIST_HEAD(cancels);
+	__u64 how = 0;
+	ENTRY;
+
+	/* concurrent cancels on the same handle can happen */
+	lock = ldlm_handle2lock_long(lockh, LDLM_FL_CANCELING);
+	if (lock == NULL) {
+		LDLM_DEBUG_NOLOCK("lock is already being destroyed\n");
+		RETURN(0);
+	}
+
+	how = ldlm_cli_cancel_local(lock);
+	if (how == LDLM_FL_LOCAL_ONLY) {
+		LDLM_LOCK_RELEASE(lock);
+		RETURN(0);
+	}
+	/* Whether or not the lock is marked LDLM_FL_BL_AST, this is an
+	 * LDLM_CANCEL RPC going to the canceld portal, so other LRU locks
+	 * can be cancelled here and sent along as one LDLM_CANCEL RPC. */
+	LASSERT(list_empty(&lock->l_bl_ast));
+	list_add(&lock->l_bl_ast, &cancels);
+
+	exp = lock->l_conn_export;
+	if (exp_connect_cancelset(exp)) {
+		room = ldlm_format_handles_avail(class_exp2cliimp(exp),
+						 &RQF_LDLM_CANCEL, RCL_CLIENT,
+						 0);
+		LASSERT(room > 0);
+
+		ns = ldlm_lock_to_ns(lock);
+		lru_flags = ns_connect_lru_resize(ns) ?
+			    LDLM_CANCEL_LRUR : LDLM_CANCEL_AGED;
+		nr += ldlm_cancel_lru_local(ns, &cancels, 0, room - 1,
+					    LCF_BL_AST, lru_flags);
+	}
+	ldlm_cli_cancel_list(&cancels, nr, NULL, cancel_flags);
+	RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_cli_cancel);
+
+/**
+ * Locally cancel up to \a count locks in list \a cancels.
+ * Return how many of them are left on \a cancels for a CANCEL RPC.
+ */
+int ldlm_cli_cancel_list_local(struct list_head *cancels, int count,
+			       ldlm_cancel_flags_t flags)
+{
+	LIST_HEAD(head);
+	struct ldlm_lock *lock, *next;
+	int left = count, bl_ast = 0;
+	__u64 rc;
+	ENTRY;
+
+	list_for_each_entry_safe(lock, next, cancels, l_bl_ast) {
+		if (left-- == 0)
+			break;
+
+		if (flags & LCF_LOCAL) {
+			rc = LDLM_FL_LOCAL_ONLY;
+			ldlm_lock_cancel(lock);
+		} else {
+			rc = ldlm_cli_cancel_local(lock);
+		}
+		/* Until we have compound requests and can send LDLM_CANCEL
+		 * requests batched with generic RPCs, we need to send cancels
+		 * with the LDLM_FL_BL_AST flag in a separate RPC from
+		 * the one being generated now. */
+		if (!(flags & LCF_BL_AST) && (rc == LDLM_FL_BL_AST)) {
+			LDLM_DEBUG(lock, "Cancel lock separately");
+			list_del_init(&lock->l_bl_ast);
+			list_add(&lock->l_bl_ast, &head);
+			bl_ast++;
+			continue;
+		}
+		if (rc == LDLM_FL_LOCAL_ONLY) {
+			/* CANCEL RPC should not be sent to server. */
+			list_del_init(&lock->l_bl_ast);
+			LDLM_LOCK_RELEASE(lock);
+			count--;
+		}
+	}
+	if (bl_ast > 0) {
+		count -= bl_ast;
+		ldlm_cli_cancel_list(&head, bl_ast, NULL, 0);
+	}
+
+	RETURN(count);
+}
+EXPORT_SYMBOL(ldlm_cli_cancel_list_local);
+
+/**
+ * NO_WAIT policy: cancel only locks that need no RPC sent (e.g. to write back
+ * dirty data, to close a file, ...) and no in-flight RPC awaited (e.g.
+ * readahead), as decided by ns_cancel_for_recovery; skip and mark the rest.
+ * \a added and \a count are ignored: every unused lock is to be examined.
+ */
+static ldlm_policy_res_t ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns,
+						    struct ldlm_lock *lock,
+						    int unused, int added,
+						    int count)
+{
+	ldlm_policy_res_t result = LDLM_POLICY_CANCEL_LOCK;
+	ldlm_cancel_for_recovery cb = ns->ns_cancel_for_recovery;
+	ENTRY;
+
+	lock_res_and_lock(lock);
+	switch (lock->l_resource->lr_type) {
+	case LDLM_EXTENT:
+	case LDLM_IBITS:
+		if (cb && cb(lock))
+			break;
+		/* fall through */
+	default:
+		result = LDLM_POLICY_SKIP_LOCK;
+		lock->l_flags |= LDLM_FL_SKIPPED;
+		break;
+	}
+	unlock_res_and_lock(lock);
+	RETURN(result);
+}
+
+/**
+ * LRU-resize policy callback.  The lock volume of \a lock is computed as
+ * lvf * (seconds since last use) * \a unused and compared against the
+ * pool's SLV; a non-zero \a count caps the locks \a added in one scan.
+ *
+ * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU and stop scanning
+ *
+ * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
+ */
+static ldlm_policy_res_t ldlm_cancel_lrur_policy(struct ldlm_namespace *ns,
+						 struct ldlm_lock *lock,
+						 int unused, int added,
+						 int count)
+{
+	cfs_time_t now = cfs_time_current();
+	struct ldlm_pool *pool = &ns->ns_pool;
+	cfs_time_t idle_sec;
+	__u64 slv, lvf, lv;
+
+	/* Stop LRU processing once @count (if given) has been reached. */
+	if (count != 0 && added >= count)
+		return LDLM_POLICY_KEEP_LOCK;
+
+	slv = ldlm_pool_get_slv(pool);
+	lvf = ldlm_pool_get_lvf(pool);
+	idle_sec = cfs_duration_sec(cfs_time_sub(now, lock->l_last_used));
+	lv = lvf * idle_sec * unused;
+
+	/* Inform pool about current CLV to see it via proc. */
+	ldlm_pool_set_clv(pool, lv);
+
+	/* Keep the lock while no SLV has come from the server yet, or
+	 * while this lock's volume is still below it. */
+	if (slv == 0 || lv < slv)
+		return LDLM_POLICY_KEEP_LOCK;
+
+	return LDLM_POLICY_CANCEL_LOCK;
+}
+
+/**
+ * Policy callback used for the PASSED and SHRINK flags: locks are cancelled
+ * until \a added, the number taken in the current scan, reaches \a count.
+ * \a ns, \a lock and \a unused are not consulted.
+ *
+ * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU and stop scanning
+ *
+ * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
+ */
+static ldlm_policy_res_t ldlm_cancel_passed_policy(struct ldlm_namespace *ns,
+						   struct ldlm_lock *lock,
+						   int unused, int added,
+						   int count)
+{
+	/* Once @count locks have been taken, keep the rest and stop. */
+	if (added >= count)
+		return LDLM_POLICY_KEEP_LOCK;
+	return LDLM_POLICY_CANCEL_LOCK;
+}
+
+/**
+ * Aged policy callback.  Until \a added reaches \a count every lock is
+ * cancelled; after that a lock is cancelled only if the current time is no
+ * longer before its l_last_used plus the namespace's ns_max_age.
+ *
+ * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU and stop scanning
+ *
+ * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
+ */
+static ldlm_policy_res_t ldlm_cancel_aged_policy(struct ldlm_namespace *ns,
+						 struct ldlm_lock *lock,
+						 int unused, int added,
+						 int count)
+{
+	if (added < count)
+		return LDLM_POLICY_CANCEL_LOCK;
+	if (cfs_time_before(cfs_time_current(),
+			    cfs_time_add(lock->l_last_used, ns->ns_max_age)))
+		return LDLM_POLICY_KEEP_LOCK;
+	return LDLM_POLICY_CANCEL_LOCK;
+}
+
+/**
+ * Default policy callback, used when no other policy is selected: locks are
+ * cancelled until \a added, the number taken in the current scan, reaches
+ * \a count.  \a ns, \a lock and \a unused are not consulted.
+ *
+ * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU and stop scanning
+ *
+ * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
+ */
+static ldlm_policy_res_t ldlm_cancel_default_policy(struct ldlm_namespace *ns,
+						    struct ldlm_lock *lock,
+						    int unused, int added,
+						    int count)
+{
+	/* Below @count keep cancelling; at @count the scan ends. */
+	if (added < count)
+		return LDLM_POLICY_CANCEL_LOCK;
+	return LDLM_POLICY_KEEP_LOCK;
+}
+
+/* Signature shared by the LRU policies: (ns, lock, unused, added, count). */
+typedef ldlm_policy_res_t (*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *,
+						      struct ldlm_lock *, int,
+						      int, int);
+
+/* Select the LRU policy callback matching \a flags and LRU-resize support. */
+static ldlm_cancel_lru_policy_t
+ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags)
+{
+	if (flags & LDLM_CANCEL_NO_WAIT)
+		return ldlm_cancel_no_wait_policy;
+
+	if (!ns_connect_lru_resize(ns)) {
+		if (flags & LDLM_CANCEL_AGED)
+			return ldlm_cancel_aged_policy;
+		return ldlm_cancel_default_policy;
+	}
+	if (flags & LDLM_CANCEL_SHRINK)
+		return ldlm_cancel_passed_policy; /* kill passed number */
+	if (flags & LDLM_CANCEL_LRUR)
+		return ldlm_cancel_lrur_policy;
+	if (flags & LDLM_CANCEL_PASSED)
+		return ldlm_cancel_passed_policy;
+	return ldlm_cancel_default_policy;
+}
+
+/**
+ * - Free space in LRU for \a count new locks,
+ *   redundant unused locks are canceled locally;
+ * - also cancel locally unused aged locks;
+ * - do not cancel more than \a max locks;
+ * - GET the found locks and add them into the \a cancels list.
+ *
+ * A client lock is added to the l_bl_ast list only by whoever marks it
+ * LDLM_FL_CANCELING; if the flag is already set, somebody else is doing
+ * CANCEL.  There are the following use cases:
+ * ldlm_cancel_resource_local(), ldlm_cancel_lru_local() and
+ * ldlm_cli_cancel(), which check and set this flag properly. As any
+ * attempt to cancel a lock relies on this flag, l_bl_ast list is accessed
+ * later without any special locking.
+ *
+ * Calling policies for enabled LRU resize:
+ * ----------------------------------------
+ * flags & LDLM_CANCEL_LRUR - use LRU resize policy (SLV from server) to
+ *			    cancel not more than \a count locks;
+ *
+ * flags & LDLM_CANCEL_PASSED - cancel \a count number of old locks (located at
+ *			      the beginning of LRU list);
+ *
+ * flags & LDLM_CANCEL_SHRINK - cancel not more than \a count locks according to
+ *			      memory pressure policy function;
+ *
+ * flags & LDLM_CANCEL_AGED - cancel \a count locks according to "aged policy".
+ *
+ * flags & LDLM_CANCEL_NO_WAIT - cancel as many unused locks as possible
+ *			       (typically before replaying locks) w/o sending
+ *			       RPCs or waiting for outstanding ones to complete.
+ * Returns the number of locks added to \a cancels.
+ */
+static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, struct list_head *cancels,
+				 int count, int max, int flags)
+{
+	ldlm_cancel_lru_policy_t pf;
+	struct ldlm_lock *lock, *next;
+	int added = 0, unused, remained;
+	ENTRY;
+
+	spin_lock(&ns->ns_lock);
+	unused = ns->ns_nr_unused;
+	remained = unused;
+
+	if (!ns_connect_lru_resize(ns))
+		count += unused - ns->ns_max_unused;
+
+	pf = ldlm_cancel_lru_policy(ns, flags);
+	LASSERT(pf != NULL);
+
+	while (!list_empty(&ns->ns_unused_list)) {
+		ldlm_policy_res_t result;
+
+		/* all unused locks */
+		if (remained-- <= 0)
+			break;
+
+		/* For any flags, stop scanning if @max is reached. */
+		if (max && added >= max)
+			break;
+
+		list_for_each_entry_safe(lock, next, &ns->ns_unused_list,
+					     l_lru) {
+			/* No locks which got blocking requests. */
+			LASSERT(!(lock->l_flags & LDLM_FL_BL_AST));
+
+			if (flags & LDLM_CANCEL_NO_WAIT &&
+			    lock->l_flags & LDLM_FL_SKIPPED)
+				/* already processed */
+				continue;
+
+			/* Stop at the first lock nobody is cancelling yet;
+			 * locks already being cancelled leave the LRU here. */
+			if (!(lock->l_flags & LDLM_FL_CANCELING))
+				break;
+
+			ldlm_lock_remove_from_lru_nolock(lock);
+		}
+		if (&lock->l_lru == &ns->ns_unused_list)
+			break;
+
+		LDLM_LOCK_GET(lock);
+		spin_unlock(&ns->ns_lock);
+		lu_ref_add(&lock->l_reference, __FUNCTION__, current);
+
+		/* Pass the lock through the policy filter and see if it
+		 * should stay in LRU.
+		 *
+		 * Even for shrinker policy we stop scanning if
+		 * we find a lock that should stay in the cache.
+		 * We should take into account lock age anyway
+		 * as a new lock is a valuable resource even if
+		 * it has a low weight.
+		 *
+		 * That is, for shrinker policy we drop only
+		 * old locks, but additionally choose them by
+		 * their weight. Big extent locks will stay in
+		 * the cache. */
+		result = pf(ns, lock, unused, added, count);
+		if (result == LDLM_POLICY_KEEP_LOCK) {
+			lu_ref_del(&lock->l_reference,
+				   __FUNCTION__, current);
+			LDLM_LOCK_RELEASE(lock);
+			spin_lock(&ns->ns_lock);
+			break;
+		}
+		if (result == LDLM_POLICY_SKIP_LOCK) {
+			lu_ref_del(&lock->l_reference,
+				   __func__, current);
+			LDLM_LOCK_RELEASE(lock);
+			spin_lock(&ns->ns_lock);
+			continue;
+		}
+
+		lock_res_and_lock(lock);
+		/* Check flags again under the lock. */
+		if ((lock->l_flags & LDLM_FL_CANCELING) ||
+		    (ldlm_lock_remove_from_lru(lock) == 0)) {
+			/* Another thread is removing lock from LRU, or
+			 * somebody is already doing CANCEL, or there
+			 * is a blocking request which will send cancel
+			 * by itself, or the lock is no longer unused. */
+			unlock_res_and_lock(lock);
+			lu_ref_del(&lock->l_reference,
+				   __FUNCTION__, current);
+			LDLM_LOCK_RELEASE(lock);
+			spin_lock(&ns->ns_lock);
+			continue;
+		}
+		LASSERT(!lock->l_readers && !lock->l_writers);
+
+		/* If we have chosen to cancel this lock voluntarily, we
+		 * better send cancel notification to server, so that it
+		 * frees appropriate state. This might lead to a race
+		 * where while we are doing cancel here, server is also
+		 * silently cancelling this lock. */
+		lock->l_flags &= ~LDLM_FL_CANCEL_ON_BLOCK;
+
+		/* Setting the CBPENDING flag is a little misleading,
+		 * but prevents an important race; namely, once
+		 * CBPENDING is set, the lock can accumulate no more
+		 * readers/writers. Since readers and writers are
+		 * already zero here, ldlm_lock_decref() won't see
+		 * this flag and call l_blocking_ast */
+		lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING;
+
+		/* We can't re-add to l_lru as it confuses the
+		 * refcounting in ldlm_lock_remove_from_lru() if an AST
+		 * arrives after we drop lr_lock below. We use l_bl_ast
+		 * and can't use l_pending_chain as it is used both on
+		 * server and client nevertheless bug 5666 says it is
+		 * used only on server */
+		LASSERT(list_empty(&lock->l_bl_ast));
+		list_add(&lock->l_bl_ast, cancels);
+		unlock_res_and_lock(lock);
+		lu_ref_del(&lock->l_reference, __FUNCTION__, current);
+		spin_lock(&ns->ns_lock);
+		added++;
+		unused--;
+	}
+	spin_unlock(&ns->ns_lock);
+	RETURN(added);
+}
+
+/* Collect unused LRU locks on \a cancels and cancel them locally. */
+int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels,
+			  int count, int max, ldlm_cancel_flags_t cancel_flags,
+			  int flags)
+{
+	int picked = ldlm_prepare_lru_list(ns, cancels, count, max, flags);
+	if (picked <= 0)
+		return picked;
+	return ldlm_cli_cancel_list_local(cancels, picked, cancel_flags);
+}
+
+/**
+ * Cancel unused locks from the given namespace LRU; \a nr is passed on as
+ * the count for ldlm_prepare_lru_list().
+ *
+ * With LCF_ASYNC in \a cancel_flags the blocking callback is handled in a
+ * thread and this function returns once that thread has been asked to run
+ * it; otherwise the callback is presumably completed before returning
+ * (NOTE(review): the old text said "with LCF_ASYNC" for both cases).
+ * Returns the number of locks handed over, or 0 if the hand-over failed.
+ */
+int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr,
+		    ldlm_cancel_flags_t cancel_flags,
+		    int flags)
+{
+	LIST_HEAD(cancels);
+	int picked;
+	ENTRY;
+
+	/* Only the list of locks is prepared here; cancelling them is
+	 * left to ldlm_bl_to_thread_list(). */
+	picked = ldlm_prepare_lru_list(ns, &cancels, nr, 0, flags);
+	if (ldlm_bl_to_thread_list(ns, NULL, &cancels, picked, cancel_flags))
+		picked = 0;
+	RETURN(picked);
+}
+
+/**
+ * Collect the unused granted locks of \a res that conflict with \a mode
+ * (and, for IBITS resources with \a policy given, share an inode bit with
+ * it).  Every lock taken is flagged as being cancelled, gets a reference and
+ * is linked on \a cancels; the list then goes to ldlm_cli_cancel_list_local().
+ */
+int ldlm_cancel_resource_local(struct ldlm_resource *res,
+			       struct list_head *cancels,
+			       ldlm_policy_data_t *policy,
+			       ldlm_mode_t mode, int lock_flags,
+			       ldlm_cancel_flags_t cancel_flags, void *opaque)
+{
+	struct ldlm_lock *lock;
+	int nr_taken = 0;
+	ENTRY;
+
+	lock_res(res);
+	list_for_each_entry(lock, &res->lr_granted, l_res_link) {
+		if (opaque != NULL && lock->l_ast_data != opaque) {
+			LDLM_ERROR(lock, "data %p doesn't match opaque %p",
+				   lock->l_ast_data, opaque);
+			continue;
+		}
+
+		/* Locks that still have users are left alone. */
+		if (lock->l_readers != 0 || lock->l_writers != 0)
+			continue;
+
+		/* So are locks somebody else is already cancelling, and
+		 * locks for which a blocking AST has come in. */
+		if (lock->l_flags & (LDLM_FL_BL_AST | LDLM_FL_CANCELING))
+			continue;
+
+		/* Only modes incompatible with \a mode are wanted. */
+		if (lockmode_compat(lock->l_granted_mode, mode))
+			continue;
+
+		/* A given policy must overlap the bits of an IBITS lock. */
+		if (policy != NULL && lock->l_resource->lr_type == LDLM_IBITS &&
+		    (lock->l_policy_data.l_inodebits.bits &
+		     policy->l_inodebits.bits) == 0)
+			continue;
+
+		/* CBPENDING stops the lock from gaining readers/writers. */
+		lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING |
+				 lock_flags;
+
+		LASSERT(list_empty(&lock->l_bl_ast));
+		list_add(&lock->l_bl_ast, cancels);
+		LDLM_LOCK_GET(lock);
+		nr_taken++;
+	}
+	unlock_res(res);
+
+	RETURN(ldlm_cli_cancel_list_local(cancels, nr_taken, cancel_flags));
+}
+EXPORT_SYMBOL(ldlm_cancel_resource_local);
+
+/**
+ * Cancel client-side locks from a list and send/prepare cancel RPCs to the
+ * server.
+ *
+ * When the export supports cancel sets (EARLY_CANCEL) the handles of all
+ * remaining locks are either packed into \a req (if non-NULL) or sent in one
+ * ldlm_cli_cancel_req() call; otherwise one lock is sent per call.  A failed
+ * send is logged and the remaining locks are treated as done.  The handled
+ * locks are dropped from \a cancels; the function always returns 0.
+ */
+int ldlm_cli_cancel_list(struct list_head *cancels, int count,
+			 struct ptlrpc_request *req, ldlm_cancel_flags_t flags)
+{
+	struct ldlm_lock *lock;
+	int nr_done;
+	ENTRY;
+
+	if (count == 0 || list_empty(cancels))
+		RETURN(0);
+
+	/* XXX: requests (both batched and not) could be sent in parallel.
+	 * One RPC is usually enough, but the LRU or a resource may hold too
+	 * many locks for that; parallel sends would also help when the
+	 * server lacks the batching feature. */
+	while (count > 0) {
+		LASSERT(!list_empty(cancels));
+		lock = list_entry(cancels->next, struct ldlm_lock, l_bl_ast);
+		LASSERT(lock->l_conn_export);
+
+		if (!exp_connect_cancelset(lock->l_conn_export)) {
+			/* No cancel sets: a single lock per request. */
+			nr_done = ldlm_cli_cancel_req(lock->l_conn_export,
+						      cancels, 1, flags);
+		} else if (req != NULL) {
+			/* Piggy-back every remaining handle on \a req. */
+			ldlm_cancel_pack(req, cancels, count);
+			nr_done = count;
+		} else {
+			nr_done = ldlm_cli_cancel_req(lock->l_conn_export,
+						      cancels, count, flags);
+		}
+
+		if (nr_done < 0) {
+			CDEBUG_LIMIT(nr_done == -ESHUTDOWN ?
+				     D_DLMTRACE : D_ERROR,
+				     "ldlm_cli_cancel_list: %d\n", nr_done);
+			/* Give up on whatever is left on the list. */
+			nr_done = count;
+		}
+
+		count -= nr_done;
+		ldlm_lock_list_put(cancels, l_bl_ast, nr_done);
+	}
+	LASSERT(count == 0);
+	RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_cli_cancel_list);
+
+/**
+ * Cancel all locks on a resource that have 0 readers/writers.
+ *
+ * If flags & LCF_LOCAL, throw the locks away without trying to notify the
+ * server.  A resource that does not exist is not an error; returns 0 always.
+ */
+int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
+				    const struct ldlm_res_id *res_id,
+				    ldlm_policy_data_t *policy,
+				    ldlm_mode_t mode,
+				    ldlm_cancel_flags_t flags,
+				    void *opaque)
+{
+	struct ldlm_resource *res;
+	LIST_HEAD(cancels);
+	int nr_picked, err;
+	ENTRY;
+
+	res = ldlm_resource_get(ns, NULL, res_id, 0, 0);
+	if (res == NULL) {
+		/* Nothing to cancel, which is fine. */
+		CDEBUG(D_INFO, "No resource "LPU64"\n", res_id->name[0]);
+		RETURN(0);
+	}
+
+	LDLM_RESOURCE_ADDREF(res);
+	nr_picked = ldlm_cancel_resource_local(res, &cancels, policy, mode, 0,
+					       flags | LCF_BL_AST, opaque);
+	err = ldlm_cli_cancel_list(&cancels, nr_picked, NULL, flags);
+	if (err != ELDLM_OK)
+		CERROR("ldlm_cli_cancel_unused_resource: %d\n", err);
+
+	LDLM_RESOURCE_DELREF(res);
+	ldlm_resource_putref(res);
+	RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_cli_cancel_unused_resource);
+
+struct ldlm_cli_cancel_arg {
+	int     lc_flags;	/* cancel flags passed on for each resource */
+	void   *lc_opaque;	/* opaque pointer passed on alongside them */
+};
+
+/* Hash-walk callback: cancel the unused locks of the resource at \a hnode. */
+static int ldlm_cli_hash_cancel_unused(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+				       struct hlist_node *hnode, void *arg)
+{
+	struct ldlm_cli_cancel_arg *ca = arg;
+	struct ldlm_resource *res = cfs_hash_object(hs, hnode);
+	int err;
+
+	err = ldlm_cli_cancel_unused_resource(ldlm_res_to_ns(res),
+					      &res->lr_name, NULL, LCK_MINMODE,
+					      ca->lc_flags, ca->lc_opaque);
+	if (err != 0)
+		CERROR("ldlm_cli_cancel_unused ("LPU64"): %d\n",
+		       res->lr_name.name[0], err);
+	/* A failure is only logged; the hash iteration needs 0 back. */
+	return 0;
+}
+
+/**
+ * Cancel all locks on a namespace (or a specific resource, if given)
+ * that have 0 readers/writers.
+ *
+ * If flags & LCF_LOCAL, throw the locks away without trying
+ * to notify the server.  A NULL \a ns is accepted and does nothing.
+ */
+int ldlm_cli_cancel_unused(struct ldlm_namespace *ns,
+			   const struct ldlm_res_id *res_id,
+			   ldlm_cancel_flags_t flags, void *opaque)
+{
+	struct ldlm_cli_cancel_arg arg;
+
+	ENTRY;
+
+	if (ns == NULL)
+		RETURN(ELDLM_OK);
+
+	/* One resource named: no need to walk the hash. */
+	if (res_id != NULL)
+		RETURN(ldlm_cli_cancel_unused_resource(ns, res_id, NULL,
+						       LCK_MINMODE, flags,
+						       opaque));
+
+	arg.lc_flags = flags;
+	arg.lc_opaque = opaque;
+	cfs_hash_for_each_nolock(ns->ns_rs_hash,
+				 ldlm_cli_hash_cancel_unused, &arg);
+	RETURN(ELDLM_OK);
+}
+EXPORT_SYMBOL(ldlm_cli_cancel_unused);
+
+/* Lock iterators. */
+
+/**
+ * Call \a iter on every lock of \a res, walking the granted, converting and
+ * waiting lists in that order with the resource lock held.  Stops and returns
+ * LDLM_ITER_STOP once \a iter does; returns LDLM_ITER_CONTINUE otherwise.
+ */
+int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter,
+			  void *closure)
+{
+	struct list_head *queues[3];
+	struct list_head *pos, *nxt;
+	int rc = LDLM_ITER_CONTINUE;
+	int q;
+
+	ENTRY;
+
+	if (res == NULL)
+		RETURN(LDLM_ITER_CONTINUE);
+
+	queues[0] = &res->lr_granted;
+	queues[1] = &res->lr_converting;
+	queues[2] = &res->lr_waiting;
+
+	lock_res(res);
+	for (q = 0; q < 3; q++) {
+		/* the three queues get exactly the same treatment */
+		list_for_each_safe(pos, nxt, queues[q]) {
+			struct ldlm_lock *lock;
+
+			lock = list_entry(pos, struct ldlm_lock, l_res_link);
+			if (iter(lock, closure) == LDLM_ITER_STOP)
+				GOTO(out, rc = LDLM_ITER_STOP);
+		}
+	}
+ out:
+	unlock_res(res);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_resource_foreach);
+
+struct iter_helper_data {
+	ldlm_iterator_t iter;	/* iterator supplied by the caller */
+	void *closure;		/* argument handed to iter with every lock */
+};
+
+static int ldlm_iter_helper(struct ldlm_lock *lock, void *closure)
+{
+	struct iter_helper_data *wrapped = closure; /* unpack iter + argument */
+	return wrapped->iter(lock, wrapped->closure);
+}
+
+/* Per-resource hash callback; non-zero exactly when LDLM_ITER_STOP was hit. */
+static int ldlm_res_iter_helper(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+				struct hlist_node *hnode, void *arg)
+{
+	struct ldlm_resource *res = cfs_hash_object(hs, hnode);
+	int verdict = ldlm_resource_foreach(res, ldlm_iter_helper, arg);
+
+	return verdict == LDLM_ITER_STOP;
+}
+
+/* Run \a iter, with \a closure, over every lock of every resource of \a ns. */
+void ldlm_namespace_foreach(struct ldlm_namespace *ns,
+			    ldlm_iterator_t iter, void *closure)
+{
+	struct iter_helper_data helper = { .iter = iter, .closure = closure };
+
+	/* each hashed resource is visited through ldlm_res_iter_helper() */
+	cfs_hash_for_each_nolock(ns->ns_rs_hash,
+				 ldlm_res_iter_helper, &helper);
+}
+EXPORT_SYMBOL(ldlm_namespace_foreach);
+
+/* non-blocking function to manipulate a lock whose cb_data is being put away.
+ * return  0:  no resource found for \a res_id
+ *       > 0:  LDLM_ITER_STOP/LDLM_ITER_CONTINUE from ldlm_resource_foreach()
+ *       < 0:  errors (none are generated by the code in this function itself)
+ */
+int ldlm_resource_iterate(struct ldlm_namespace *ns,
+			  const struct ldlm_res_id *res_id,
+			  ldlm_iterator_t iter, void *data)
+{
+	struct ldlm_resource *res;
+	int verdict;
+	ENTRY;
+
+	if (ns == NULL) {
+		CERROR("must pass in namespace\n");
+		LBUG();
+	}
+
+	res = ldlm_resource_get(ns, NULL, res_id, 0, 0);
+	if (res == NULL)
+		RETURN(0);
+
+	LDLM_RESOURCE_ADDREF(res);
+	verdict = ldlm_resource_foreach(res, iter, data);
+	LDLM_RESOURCE_DELREF(res);
+	ldlm_resource_putref(res);
+	RETURN(verdict);
+}
+EXPORT_SYMBOL(ldlm_resource_iterate);
+
+/* Lock replay */
+
+/* Iterator callback: queue \a lock on the replay list passed as \a closure. */
+static int ldlm_chain_lock_for_replay(struct ldlm_lock *lock, void *closure)
+{
+	struct list_head *list = closure;
+
+	/* we use l_pending_chain here, because it's unused on clients. */
+	LASSERTF(list_empty(&lock->l_pending_chain),
+		 "lock %p next %p prev %p\n",
+		 lock, lock->l_pending_chain.next, lock->l_pending_chain.prev);
+	/* bug 9573: don't replay locks left after eviction, or
+	 * bug 17614: locks being actively cancelled. Get a reference
+	 * on a lock so that it does not disappear under us (e.g. due to
+	 * cancel) */
+	if (!(lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_CANCELING))) {
+		list_add(&lock->l_pending_chain, list);
+		LDLM_LOCK_GET(lock);
+	}
+	return LDLM_ITER_CONTINUE;
+}
+
+/* Reply handler for the lock replay request queued by replay_one_lock(). */
+static int replay_lock_interpret(const struct lu_env *env,
+				 struct ptlrpc_request *req,
+				 struct ldlm_async_args *aa, int rc)
+{
+	struct ldlm_lock     *lock;
+	struct ldlm_reply    *reply;
+	struct obd_export    *exp;
+
+	ENTRY;
+	atomic_dec(&req->rq_import->imp_replay_inflight);
+	if (rc != ELDLM_OK)
+		GOTO(out, rc);
+
+	reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+	if (reply == NULL)
+		GOTO(out, rc = -EPROTO);
+
+	lock = ldlm_handle2lock(&aa->lock_handle);
+	if (!lock) {
+		CERROR("received replay ack for unknown local cookie "LPX64
+		       " remote cookie "LPX64 " from server %s id %s\n",
+		       aa->lock_handle.cookie, reply->lock_handle.cookie,
+		       req->rq_export->exp_client_uuid.uuid,
+		       libcfs_id2str(req->rq_peer));
+		GOTO(out, rc = -ESTALE);
+	}
+
+	/* Key change: rehash the lock in the per-export hash with its new key */
+	exp = req->rq_export;
+	if (exp && exp->exp_lock_hash) {
+		/* In the function below, .hs_keycmp resolves to
+		 * ldlm_export_lock_keycmp() */
+		/* coverity[overrun-buffer-val] */
+		cfs_hash_rehash_key(exp->exp_lock_hash,
+				    &lock->l_remote_handle,
+				    &reply->lock_handle,
+				    &lock->l_exp_hash);
+	} else {
+		lock->l_remote_handle = reply->lock_handle;
+	}
+
+	LDLM_DEBUG(lock, "replayed lock:");
+	ptlrpc_import_recovery_state_machine(req->rq_import);
+	LDLM_LOCK_PUT(lock);
+out:
+	if (rc != ELDLM_OK)
+		ptlrpc_connect_import(req->rq_import);
+
+	RETURN(rc);
+}
+
+/* Build an LDLM_ENQUEUE replay request for \a lock and queue it via ptlrpcd. */
+static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
+{
+	struct ptlrpc_request *req;
+	struct ldlm_async_args *aa;
+	struct ldlm_request   *body;
+	int flags;
+	ENTRY;
+
+	/* Bug 11974: Do not replay a lock which is actively being canceled */
+	if (lock->l_flags & LDLM_FL_CANCELING) {
+		LDLM_DEBUG(lock, "Not replaying canceled lock:");
+		RETURN(0);
+	}
+
+	/* If this is reply-less callback lock, we cannot replay it, since
+	 * server might have long dropped it, but notification of that event was
+	 * lost by network. (and server granted conflicting lock already) */
+	if (lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK) {
+		LDLM_DEBUG(lock, "Not replaying reply-less lock:");
+		ldlm_lock_cancel(lock);
+		RETURN(0);
+	}
+
+	/*
+	 * If granted mode matches the requested mode, this lock is granted.
+	 *
+	 * If they differ, but we have a granted mode, then we were granted
+	 * one mode and now want another: ergo, converting.
+	 *
+	 * If we haven't been granted anything and are on a resource list,
+	 * then we're blocked/waiting.
+	 *
+	 * If we haven't been granted anything and we're NOT on a resource list,
+	 * then we haven't got a reply yet and don't have a known disposition.
+	 * This happens whenever a lock enqueue is the request that triggers
+	 * recovery.
+	 */
+	if (lock->l_granted_mode == lock->l_req_mode)
+		flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_GRANTED;
+	else if (lock->l_granted_mode)
+		flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_CONV;
+	else if (!list_empty(&lock->l_res_link))
+		flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_WAIT;
+	else
+		flags = LDLM_FL_REPLAY;
+
+	req = ptlrpc_request_alloc_pack(imp, &RQF_LDLM_ENQUEUE,
+					LUSTRE_DLM_VERSION, LDLM_ENQUEUE);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	/* We're part of recovery, so don't wait for it. */
+	req->rq_send_state = LUSTRE_IMP_REPLAY_LOCKS;
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+	ldlm_lock2desc(lock, &body->lock_desc);
+	body->lock_flags = ldlm_flags_to_wire(flags);
+
+	ldlm_lock2handle(lock, &body->lock_handle[0]);
+	if (lock->l_lvb_len > 0)
+		req_capsule_extend(&req->rq_pill, &RQF_LDLM_ENQUEUE_LVB);
+	req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
+			     lock->l_lvb_len);
+	ptlrpc_request_set_replen(req);
+	/* notify the server we've replayed all requests.
+	 * also, we mark the request to be put on a dedicated
+	 * queue to be processed after all request replays.
+	 * bug 6063 */
+	lustre_msg_set_flags(req->rq_reqmsg, MSG_REQ_REPLAY_DONE);
+
+	LDLM_DEBUG(lock, "replaying lock:");
+
+	atomic_inc(&req->rq_import->imp_replay_inflight);
+	CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+	aa = ptlrpc_req_async_args(req);
+	aa->lock_handle = body->lock_handle[0];
+	req->rq_interpret_reply = (ptlrpc_interpterer_t)replay_lock_interpret;
+	ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
+
+	RETURN(0);
+}
+
+/**
+ * Cancel as many unused locks as possible before replay. Since we are
+ * in recovery, we can't wait for any outstanding RPCs to send any RPC
+ * to the server.
+ *
+ * Called only in recovery before replaying locks. There is no need to
+ * replay locks that are unused. Since the clients may hold thousands of
+ * cached unused locks, dropping the unused locks can greatly reduce the
+ * load on the servers at recovery time.
+ */
+static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns)
+{
+	int canceled;
+	LIST_HEAD(cancels);
+
+	CDEBUG(D_DLMTRACE, "Dropping as many unused locks as possible before "
+			   "replay for namespace %s (%d)\n",
+			   ldlm_ns_name(ns), ns->ns_nr_unused);
+
+	/* We don't need to care whether or not LRU resize is enabled
+	 * because the LDLM_CANCEL_NO_WAIT policy doesn't use the
+	 * count parameter */
+	canceled = ldlm_cancel_lru_local(ns, &cancels, ns->ns_nr_unused, 0,
+					 LCF_LOCAL, LDLM_CANCEL_NO_WAIT);
+
+	CDEBUG(D_DLMTRACE, "Canceled %d unused locks from namespace %s\n",
+			   canceled, ldlm_ns_name(ns));
+}
+
+/**
+ * Replay the locks of the import's namespace.  Locks that are neither failed
+ * nor being cancelled are first chained on a private list, then replayed one
+ * by one; after the first error the remaining ones are only released.
+ */
+int ldlm_replay_locks(struct obd_import *imp)
+{
+	struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
+	LIST_HEAD(replay_list);
+	struct ldlm_lock *lock, *tmp;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(atomic_read(&imp->imp_replay_inflight) == 0);
+
+	/* don't replay locks if import failed recovery */
+	if (imp->imp_vbr_failed)
+		RETURN(0);
+
+	/* ensure this doesn't fall to 0 before all have been queued */
+	atomic_inc(&imp->imp_replay_inflight);
+
+	if (ldlm_cancel_unused_locks_before_replay)
+		ldlm_cancel_unused_locks_for_replay(ns);
+
+	ldlm_namespace_foreach(ns, ldlm_chain_lock_for_replay, &replay_list);
+
+	list_for_each_entry_safe(lock, tmp, &replay_list, l_pending_chain) {
+		list_del_init(&lock->l_pending_chain);
+		/* once a replay has failed, only drop the references */
+		if (rc == 0)
+			rc = replay_one_lock(imp, lock);
+		LDLM_LOCK_RELEASE(lock);
+	}
+	atomic_dec(&imp->imp_replay_inflight);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_replay_locks);
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c b/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c
new file mode 100644
index 000000000000..6bdfb428a41a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c
@@ -0,0 +1,1444 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_resource.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Peter Braam <braam@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+# include <lustre_dlm.h>
+
+#include <lustre_fid.h>
+#include <obd_class.h>
+#include "ldlm_internal.h"
+
+struct kmem_cache *ldlm_resource_slab, *ldlm_lock_slab;
+
+atomic_t ldlm_srv_namespace_nr = ATOMIC_INIT(0);
+atomic_t ldlm_cli_namespace_nr = ATOMIC_INIT(0);
+
+struct mutex ldlm_srv_namespace_lock;
+LIST_HEAD(ldlm_srv_namespace_list);
+
+struct mutex ldlm_cli_namespace_lock;
+LIST_HEAD(ldlm_cli_namespace_list);
+
+proc_dir_entry_t *ldlm_type_proc_dir = NULL;	/* OBD_LDLM_DEVICENAME dir */
+proc_dir_entry_t *ldlm_ns_proc_dir = NULL;	/* its "namespaces" subdir */
+proc_dir_entry_t *ldlm_svc_proc_dir = NULL;	/* its "services" subdir */
+
+extern unsigned int ldlm_cancel_unused_locks_before_replay;
+
+/* Cap on the granted locks dumped per resource during a debug dump, so that a
+ * dump cannot flood the log.  Tunable through the "dump_granted_max" entry. */
+unsigned int ldlm_dump_granted_max = 256;
+
+#ifdef LPROCFS
+static int ldlm_proc_dump_ns(struct file *file, const char *buffer,
+			     unsigned long count, void *data)
+{	/* "dump_namespaces" write handler; the written bytes are not examined */
+	ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE);
+	ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE);
+	RETURN(count);	/* NOTE(review): RETURN without a matching ENTRY */
+}
+
+int ldlm_proc_setup(void)
+{
+	int rc;
+	struct lprocfs_vars list[] = {
+		{ "dump_namespaces", NULL, ldlm_proc_dump_ns, NULL },
+		{ "dump_granted_max",
+		  lprocfs_rd_uint, lprocfs_wr_uint,
+		  &ldlm_dump_granted_max, NULL },
+		{ "cancel_unused_locks_before_replay",
+		  lprocfs_rd_uint, lprocfs_wr_uint,
+		  &ldlm_cancel_unused_locks_before_replay, NULL },
+		{ NULL }};
+	ENTRY;
+	LASSERT(ldlm_ns_proc_dir == NULL);
+
+	ldlm_type_proc_dir = lprocfs_register(OBD_LDLM_DEVICENAME,
+					      proc_lustre_root,
+					      NULL, NULL);
+	if (IS_ERR(ldlm_type_proc_dir)) {
+		CERROR("LProcFS failed in ldlm-init\n");
+		rc = PTR_ERR(ldlm_type_proc_dir);
+		GOTO(err, rc);
+	}
+
+	ldlm_ns_proc_dir = lprocfs_register("namespaces",
+					    ldlm_type_proc_dir,
+					    NULL, NULL);
+	if (IS_ERR(ldlm_ns_proc_dir)) {
+		CERROR("LProcFS failed in ldlm-init\n");
+		rc = PTR_ERR(ldlm_ns_proc_dir);
+		GOTO(err_type, rc);
+	}
+
+	ldlm_svc_proc_dir = lprocfs_register("services",
+					    ldlm_type_proc_dir,
+					    NULL, NULL);
+	if (IS_ERR(ldlm_svc_proc_dir)) {
+		CERROR("LProcFS failed in ldlm-init\n");
+		rc = PTR_ERR(ldlm_svc_proc_dir);
+		GOTO(err_ns, rc);
+	}
+
+	rc = lprocfs_add_vars(ldlm_type_proc_dir, list, NULL);
+
+	RETURN(0);
+
+err_ns:
+	lprocfs_remove(&ldlm_ns_proc_dir);
+err_type:
+	lprocfs_remove(&ldlm_type_proc_dir);
+err:
+	ldlm_svc_proc_dir = NULL;
+	RETURN(rc);
+}
+
+/* Remove the proc directories in the reverse order of their creation. */
+void ldlm_proc_cleanup(void)
+{
+	if (ldlm_svc_proc_dir != NULL)
+		lprocfs_remove(&ldlm_svc_proc_dir);
+	if (ldlm_ns_proc_dir != NULL)
+		lprocfs_remove(&ldlm_ns_proc_dir);
+	/* the parent of the two directories above goes last */
+	if (ldlm_type_proc_dir != NULL)
+		lprocfs_remove(&ldlm_type_proc_dir);
+}
+
+/* proc read: sum of the per-bucket entry counts of the resource hash. */
+static int lprocfs_rd_ns_resources(char *page, char **start, off_t off,
+				   int count, int *eof, void *data)
+{
+	struct ldlm_namespace *ns = data;
+	cfs_hash_bd_t bd;
+	__u64 nr_res = 0;
+	int bkt;
+	/* buckets are read one after another: the sum is only approximate */
+	cfs_hash_for_each_bucket(ns->ns_rs_hash, &bd, bkt)
+		nr_res += cfs_hash_bd_count_get(&bd);
+	return lprocfs_rd_u64(page, start, off, count, eof, &nr_res);
+}
+
+/* proc read: the summed LDLM_NSS_LOCKS counter of the namespace statistics. */
+static int lprocfs_rd_ns_locks(char *page, char **start, off_t off,
+			       int count, int *eof, void *data)
+{
+	struct ldlm_namespace *ns = data;
+	__u64 nr_locks = lprocfs_stats_collector(ns->ns_stats, LDLM_NSS_LOCKS,
+						 LPROCFS_FIELDS_FLAGS_SUM);
+
+	return lprocfs_rd_u64(page, start, off, count, eof, &nr_locks);
+}
+
+/* proc read: ns_nr_unused when LRU resize is on, ns_max_unused otherwise. */
+static int lprocfs_rd_lru_size(char *page, char **start, off_t off,
+			       int count, int *eof, void *data)
+{
+	struct ldlm_namespace *ns = data;
+	__u32 *val = ns_connect_lru_resize(ns) ? &ns->ns_nr_unused :
+						 &ns->ns_max_unused;
+
+	return lprocfs_rd_uint(page, start, off, count, eof, val);
+}
+
+/* proc write for lru_size: "clear" cancels the unused locks; a number sets the
+ * LRU size, with 0 meaning "use LRU resize" where the server supported it. */
+static int lprocfs_wr_lru_size(struct file *file, const char *buffer,
+			       unsigned long count, void *data)
+{
+	struct ldlm_namespace *ns = data;
+	char dummy[MAX_STRING_SIZE + 1], *end;
+	unsigned long tmp, len;
+	int lru_resize;
+
+	/* Copy only what was written: the user buffer holds \a count bytes. */
+	len = count < MAX_STRING_SIZE ? count : MAX_STRING_SIZE;
+	if (copy_from_user(dummy, buffer, len))
+		return -EFAULT;
+	dummy[len] = '\0';
+
+	if (strncmp(dummy, "clear", 5) == 0) {
+		CDEBUG(D_DLMTRACE,
+		       "dropping all unused locks from namespace %s\n",
+		       ldlm_ns_name(ns));
+		if (ns_connect_lru_resize(ns)) {
+			int canceled, unused  = ns->ns_nr_unused;
+			/* Try to cancel all @ns_nr_unused locks. */
+			canceled = ldlm_cancel_lru(ns, unused, 0,
+						   LDLM_CANCEL_PASSED);
+			if (canceled < unused) {
+				CDEBUG(D_DLMTRACE,
+				       "not all requested locks are canceled, "
+				       "requested: %d, canceled: %d\n", unused,
+				       canceled);
+				return -EINVAL;
+			}
+		} else {
+			tmp = ns->ns_max_unused;
+			ns->ns_max_unused = 0;
+			ldlm_cancel_lru(ns, 0, 0, LDLM_CANCEL_PASSED);
+			ns->ns_max_unused = tmp;
+		}
+		return count;
+	}
+
+	tmp = simple_strtoul(dummy, &end, 0);
+	if (dummy == end) {
+		CERROR("invalid value written\n");
+		return -EINVAL;
+	}
+	lru_resize = (tmp == 0);
+
+	if (ns_connect_lru_resize(ns)) {
+		if (!lru_resize)
+			ns->ns_max_unused = (unsigned int)tmp;
+		if (tmp > ns->ns_nr_unused)
+			tmp = ns->ns_nr_unused;
+		tmp = ns->ns_nr_unused - tmp;
+
+		CDEBUG(D_DLMTRACE,
+		       "changing namespace %s unused locks from %u to %u\n",
+		       ldlm_ns_name(ns), ns->ns_nr_unused,
+		       (unsigned int)tmp);
+		ldlm_cancel_lru(ns, tmp, LCF_ASYNC, LDLM_CANCEL_PASSED);
+		if (!lru_resize) {
+			CDEBUG(D_DLMTRACE,
+			       "disable lru_resize for namespace %s\n",
+			       ldlm_ns_name(ns));
+			ns->ns_connect_flags &= ~OBD_CONNECT_LRU_RESIZE;
+		}
+	} else {
+		CDEBUG(D_DLMTRACE,
+		       "changing namespace %s max_unused from %u to %u\n",
+		       ldlm_ns_name(ns), ns->ns_max_unused,
+		       (unsigned int)tmp);
+		ns->ns_max_unused = (unsigned int)tmp;
+		ldlm_cancel_lru(ns, 0, LCF_ASYNC, LDLM_CANCEL_PASSED);
+		/* Make sure that LRU resize was originally supported before
+		 * turning it on here. */
+		if (lru_resize &&
+		    (ns->ns_orig_connect_flags & OBD_CONNECT_LRU_RESIZE)) {
+			CDEBUG(D_DLMTRACE,
+			       "enable lru_resize for namespace %s\n",
+			       ldlm_ns_name(ns));
+			ns->ns_connect_flags |= OBD_CONNECT_LRU_RESIZE;
+		}
+	}
+
+	return count;
+}
+
+/* proc read: the value ns_connect_cancelset() reports for the namespace. */
+static int lprocfs_rd_elc(char *page, char **start, off_t off,
+			  int count, int *eof, void *data)
+{
+	struct ldlm_namespace *ns = data;
+	unsigned int elc_on = ns_connect_cancelset(ns);
+	return lprocfs_rd_uint(page, start, off, count, eof, &elc_on);
+}
+
+/* proc write: 0 clears OBD_CONNECT_CANCELSET; any other value sets it again,
+ * but only if the original connect flags of the namespace had it. */
+static int lprocfs_wr_elc(struct file *file, const char *buffer,
+			       unsigned long count, void *data)
+{
+	struct ldlm_namespace *ns = data;
+	unsigned int enable = -1;
+	int rc = lprocfs_wr_uint(file, buffer, count, &enable);
+
+	if (rc < 0)
+		return rc;
+	if (enable == 0)
+		ns->ns_connect_flags &= ~OBD_CONNECT_CANCELSET;
+	else if (ns->ns_orig_connect_flags & OBD_CONNECT_CANCELSET)
+		ns->ns_connect_flags |= OBD_CONNECT_CANCELSET;
+	return count;
+}
+
+/* Remove the proc directory of \a ns and free its statistics, if any. */
+void ldlm_namespace_proc_unregister(struct ldlm_namespace *ns)
+{
+	struct proc_dir_entry *ns_dir = lprocfs_srch(ldlm_ns_proc_dir,
+						     ldlm_ns_name(ns));
+
+	if (ns_dir != NULL)
+		lprocfs_remove(&ns_dir);
+	else
+		CERROR("dlm namespace %s has no procfs dir?\n",
+		       ldlm_ns_name(ns));
+
+	if (ns->ns_stats != NULL)
+		lprocfs_free_stats(&ns->ns_stats);
+}
+
+/* Allocate the namespace statistics and add the per-namespace proc entries
+ * ("<ns name>/<entry>") under ldlm_ns_proc_dir.  Returns 0 or -ENOMEM. */
+int ldlm_namespace_proc_register(struct ldlm_namespace *ns)
+{
+	struct lprocfs_vars lock_vars[2];
+	char lock_name[MAX_STRING_SIZE + 1];
+
+	LASSERT(ns != NULL);
+	LASSERT(ns->ns_rs_hash != NULL);
+
+	ns->ns_stats = lprocfs_alloc_stats(LDLM_NSS_LAST, 0);
+	if (ns->ns_stats == NULL)
+		return -ENOMEM;
+
+	lprocfs_counter_init(ns->ns_stats, LDLM_NSS_LOCKS,
+			     LPROCFS_CNTR_AVGMINMAX, "locks", "locks");
+	lock_name[MAX_STRING_SIZE] = '\0';
+	memset(lock_vars, 0, sizeof(lock_vars));
+	lock_vars[0].name = lock_name;
+
+	snprintf(lock_name, MAX_STRING_SIZE, "%s/resource_count",
+		 ldlm_ns_name(ns));
+	lock_vars[0].data = ns;
+	lock_vars[0].read_fptr = lprocfs_rd_ns_resources;
+	lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
+
+	snprintf(lock_name, MAX_STRING_SIZE, "%s/lock_count",
+		 ldlm_ns_name(ns));
+	lock_vars[0].data = ns;
+	lock_vars[0].read_fptr = lprocfs_rd_ns_locks;
+	lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
+
+	if (ns_is_client(ns)) {
+		snprintf(lock_name, MAX_STRING_SIZE, "%s/lock_unused_count",
+			 ldlm_ns_name(ns));
+		lock_vars[0].data = &ns->ns_nr_unused;
+		lock_vars[0].read_fptr = lprocfs_rd_uint;
+		lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
+
+		snprintf(lock_name, MAX_STRING_SIZE, "%s/lru_size",
+			 ldlm_ns_name(ns));
+		lock_vars[0].data = ns;
+		lock_vars[0].read_fptr = lprocfs_rd_lru_size;
+		lock_vars[0].write_fptr = lprocfs_wr_lru_size;
+		lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
+
+		snprintf(lock_name, MAX_STRING_SIZE, "%s/lru_max_age",
+			 ldlm_ns_name(ns));
+		lock_vars[0].data = &ns->ns_max_age;
+		lock_vars[0].read_fptr = lprocfs_rd_uint;
+		lock_vars[0].write_fptr = lprocfs_wr_uint;
+		lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
+
+		snprintf(lock_name, MAX_STRING_SIZE, "%s/early_lock_cancel",
+			 ldlm_ns_name(ns));
+		lock_vars[0].data = ns;
+		lock_vars[0].read_fptr = lprocfs_rd_elc;
+		lock_vars[0].write_fptr = lprocfs_wr_elc;
+		lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
+	} else {
+		snprintf(lock_name, MAX_STRING_SIZE, "%s/ctime_age_limit",
+			 ldlm_ns_name(ns));
+		lock_vars[0].data = &ns->ns_ctime_age_limit;
+		lock_vars[0].read_fptr = lprocfs_rd_uint;
+		lock_vars[0].write_fptr = lprocfs_wr_uint;
+		lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
+		/* NOTE(review): write_fptr set above is not cleared here */
+		snprintf(lock_name, MAX_STRING_SIZE, "%s/lock_timeouts",
+			 ldlm_ns_name(ns));
+		lock_vars[0].data = &ns->ns_timeouts;
+		lock_vars[0].read_fptr = lprocfs_rd_uint;
+		lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
+
+		snprintf(lock_name, MAX_STRING_SIZE, "%s/max_nolock_bytes",
+			 ldlm_ns_name(ns));
+		lock_vars[0].data = &ns->ns_max_nolock_size;
+		lock_vars[0].read_fptr = lprocfs_rd_uint;
+		lock_vars[0].write_fptr = lprocfs_wr_uint;
+		lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
+
+		snprintf(lock_name, MAX_STRING_SIZE, "%s/contention_seconds",
+			 ldlm_ns_name(ns));
+		lock_vars[0].data = &ns->ns_contention_time;
+		lock_vars[0].read_fptr = lprocfs_rd_uint;
+		lock_vars[0].write_fptr = lprocfs_wr_uint;
+		lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
+
+		snprintf(lock_name, MAX_STRING_SIZE, "%s/contended_locks",
+			 ldlm_ns_name(ns));
+		lock_vars[0].data = &ns->ns_contended_locks;
+		lock_vars[0].read_fptr = lprocfs_rd_uint;
+		lock_vars[0].write_fptr = lprocfs_wr_uint;
+		lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
+
+		snprintf(lock_name, MAX_STRING_SIZE, "%s/max_parallel_ast",
+			 ldlm_ns_name(ns));
+		lock_vars[0].data = &ns->ns_max_parallel_ast;
+		lock_vars[0].read_fptr = lprocfs_rd_uint;
+		lock_vars[0].write_fptr = lprocfs_wr_uint;
+		lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
+	}
+	return 0;
+}
+#undef MAX_STRING_SIZE
+#else /* LPROCFS */
+
+#define ldlm_namespace_proc_unregister(ns)      ({;})	/* no-op w/o LPROCFS */
+#define ldlm_namespace_proc_register(ns)	({0;})	/* evaluates to 0 */
+
+#endif /* LPROCFS */
+
+/* Hash a resource id: the sum of its name words, masked with \a mask. */
+static unsigned ldlm_res_hop_hash(cfs_hash_t *hs,
+				  const void *key, unsigned mask)
+{
+	const struct ldlm_res_id *id = key;
+	unsigned sum = 0, idx = 0;
+
+	while (idx < RES_NAME_SIZE)
+		sum += id->name[idx++];
+	return sum & mask;
+}
+
+/* FID-aware hash: mixes the FID packed in the name words with the hash word. */
+static unsigned ldlm_res_hop_fid_hash(cfs_hash_t *hs,
+				      const void *key, unsigned mask)
+{
+	const struct ldlm_res_id *id = key;
+	struct lu_fid       fid;
+	__u32	       hash;
+	__u32	       val;
+
+	fid.f_seq = id->name[LUSTRE_RES_ID_SEQ_OFF];
+	fid.f_oid = (__u32)id->name[LUSTRE_RES_ID_VER_OID_OFF];
+	fid.f_ver = (__u32)(id->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32);
+
+	hash = fid_flatten32(&fid);
+	hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */
+	if (id->name[LUSTRE_RES_ID_HSH_OFF] != 0) {
+		val = id->name[LUSTRE_RES_ID_HSH_OFF];
+		hash += (val >> 5) + (val << 11);
+	} else {
+		val = fid_oid(&fid);
+	}
+	hash = cfs_hash_long(hash, hs->hs_bkt_bits);
+	/* give me another random factor */
+	hash -= cfs_hash_long((unsigned long)hs, val % 11 + 3);
+
+	hash <<= hs->hs_cur_bits - hs->hs_bkt_bits;
+	hash |= ldlm_res_hop_hash(hs, key, CFS_HASH_NBKT(hs) - 1);
+	return hash & mask;
+}
+
+/* hs_key hook: the key of a node is the lr_name of its resource. */
+static void *ldlm_res_hop_key(struct hlist_node *hnode)
+{
+	struct ldlm_resource *res = hlist_entry(hnode, struct ldlm_resource,
+						lr_hash);
+	return &res->lr_name;
+}
+
+/* hs_keycmp hook: ldlm_res_eq() of \a key and the name of hnode's resource. */
+static int ldlm_res_hop_keycmp(const void *key, struct hlist_node *hnode)
+{
+	const struct ldlm_res_id *wanted = key;
+	struct ldlm_resource *res = hlist_entry(hnode, struct ldlm_resource,
+						lr_hash);
+	return ldlm_res_eq(wanted, (const struct ldlm_res_id *)&res->lr_name);
+}
+
+static void *ldlm_res_hop_object(struct hlist_node *hnode)
+{	/* hs_object hook: the ldlm_resource that embeds hnode as lr_hash */
+	return hlist_entry(hnode, struct ldlm_resource, lr_hash);
+}
+
+/* hs_get hook: take a reference on the resource embedding \a hnode. */
+static void ldlm_res_hop_get_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct ldlm_resource *res = hlist_entry(hnode, struct ldlm_resource,
+						lr_hash);
+	ldlm_resource_getref(res);
+}
+
+/* hs_put_locked hook: drop a reference via ldlm_resource_putref_locked(). */
+static void ldlm_res_hop_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct ldlm_resource *res = hlist_entry(hnode, struct ldlm_resource,
+						lr_hash);
+	/* cfs_hash_for_each_nolock is the only chance we call it */
+	ldlm_resource_putref_locked(res);
+}
+
+/* hs_put hook: drop a reference on the resource embedding \a hnode. */
+static void ldlm_res_hop_put(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct ldlm_resource *res = hlist_entry(hnode, struct ldlm_resource,
+						lr_hash);
+	ldlm_resource_putref(res);
+}
+
+cfs_hash_ops_t ldlm_ns_hash_ops = {
+	.hs_hash	= ldlm_res_hop_hash,	/* masked sum of the name words */
+	.hs_key	 = ldlm_res_hop_key,
+	.hs_keycmp      = ldlm_res_hop_keycmp,
+	.hs_keycpy      = NULL,
+	.hs_object      = ldlm_res_hop_object,
+	.hs_get	 = ldlm_res_hop_get_locked,
+	.hs_put_locked  = ldlm_res_hop_put_locked,
+	.hs_put	 = ldlm_res_hop_put
+};
+
+cfs_hash_ops_t ldlm_ns_fid_hash_ops = {
+	.hs_hash	= ldlm_res_hop_fid_hash, /* only hook differing from above */
+	.hs_key	 = ldlm_res_hop_key,
+	.hs_keycmp      = ldlm_res_hop_keycmp,
+	.hs_keycpy      = NULL,
+	.hs_object      = ldlm_res_hop_object,
+	.hs_get	 = ldlm_res_hop_get_locked,
+	.hs_put_locked  = ldlm_res_hop_put_locked,
+	.hs_put	 = ldlm_res_hop_put
+};
+
+typedef struct {
+	/** namespace type this entry applies to */
+	ldlm_ns_type_t	 nsd_type;
+	/** hash bucket bits */
+	unsigned int	 nsd_bkt_bits;
+	/** hash bits */
+	unsigned int	 nsd_all_bits;
+	cfs_hash_ops_t	*nsd_hops;	/**< hash operations */
+} ldlm_ns_hash_def_t;
+
+/* Hash sizing per namespace type; the LDLM_NS_TYPE_UNKNOWN entry ends it. */
+ldlm_ns_hash_def_t ldlm_ns_hash_defs[] = {
+	{
+		.nsd_type       = LDLM_NS_TYPE_MDC,
+		.nsd_bkt_bits   = 11,
+		.nsd_all_bits   = 16,
+		.nsd_hops       = &ldlm_ns_fid_hash_ops,
+	},
+	{
+		.nsd_type       = LDLM_NS_TYPE_MDT,
+		.nsd_bkt_bits   = 14,
+		.nsd_all_bits   = 21,
+		.nsd_hops       = &ldlm_ns_fid_hash_ops,
+	},
+	{
+		.nsd_type       = LDLM_NS_TYPE_OSC,
+		.nsd_bkt_bits   = 8,
+		.nsd_all_bits   = 12,
+		.nsd_hops       = &ldlm_ns_hash_ops,
+	},
+	{
+		.nsd_type       = LDLM_NS_TYPE_OST,
+		.nsd_bkt_bits   = 11,
+		.nsd_all_bits   = 17,
+		.nsd_hops       = &ldlm_ns_hash_ops,
+	},
+	{
+		.nsd_type       = LDLM_NS_TYPE_MGC,
+		.nsd_bkt_bits   = 4,
+		.nsd_all_bits   = 4,
+		.nsd_hops       = &ldlm_ns_hash_ops,
+	},
+	{
+		.nsd_type       = LDLM_NS_TYPE_MGT,
+		.nsd_bkt_bits   = 4,
+		.nsd_all_bits   = 4,
+		.nsd_hops       = &ldlm_ns_hash_ops,
+	},
+	{
+		.nsd_type       = LDLM_NS_TYPE_UNKNOWN,
+	},
+};
+
+/**
+ * Create, initialize and register a new empty namespace of type \a ns_type.
+ * Returns NULL on failure, after undoing whatever was set up until then. */
+struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name,
+					  ldlm_side_t client,
+					  ldlm_appetite_t apt,
+					  ldlm_ns_type_t ns_type)
+{
+	struct ldlm_namespace *ns = NULL;
+	struct ldlm_ns_bucket *nsb;
+	ldlm_ns_hash_def_t    *nsd;
+	cfs_hash_bd_t	  bd;
+	int		    idx;
+	int		    rc;
+	ENTRY;
+
+	LASSERT(obd != NULL);
+
+	rc = ldlm_get_ref();
+	if (rc) {
+		CERROR("ldlm_get_ref failed: %d\n", rc);
+		RETURN(NULL);
+	}
+	/* Find the hash sizing for ns_type; an UNKNOWN entry ends the table. */
+	for (idx = 0;;idx++) {
+		nsd = &ldlm_ns_hash_defs[idx];
+		if (nsd->nsd_type == LDLM_NS_TYPE_UNKNOWN) {
+			CERROR("Unknown type %d for ns %s\n", ns_type, name);
+			GOTO(out_ref, NULL);
+		}
+
+		if (nsd->nsd_type == ns_type)
+			break;
+	}
+
+	OBD_ALLOC_PTR(ns);
+	if (!ns)
+		GOTO(out_ref, NULL);
+
+	ns->ns_rs_hash = cfs_hash_create(name,
+					 nsd->nsd_all_bits, nsd->nsd_all_bits,
+					 nsd->nsd_bkt_bits, sizeof(*nsb),
+					 CFS_HASH_MIN_THETA,
+					 CFS_HASH_MAX_THETA,
+					 nsd->nsd_hops,
+					 CFS_HASH_DEPTH |
+					 CFS_HASH_BIGNAME |
+					 CFS_HASH_SPIN_BKTLOCK |
+					 CFS_HASH_NO_ITEMREF);
+	if (ns->ns_rs_hash == NULL)
+		GOTO(out_ns, NULL);
+	/* Every bucket carries extra data that points back at this namespace. */
+	cfs_hash_for_each_bucket(ns->ns_rs_hash, &bd, idx) {
+		nsb = cfs_hash_bd_extra_get(ns->ns_rs_hash, &bd);
+		at_init(&nsb->nsb_at_estimate, ldlm_enqueue_min, 0);
+		nsb->nsb_namespace = ns;
+	}
+
+	ns->ns_obd      = obd;
+	ns->ns_appetite = apt;
+	ns->ns_client   = client;
+
+	INIT_LIST_HEAD(&ns->ns_list_chain);
+	INIT_LIST_HEAD(&ns->ns_unused_list);
+	spin_lock_init(&ns->ns_lock);
+	atomic_set(&ns->ns_bref, 0);
+	init_waitqueue_head(&ns->ns_waitq);
+
+	ns->ns_max_nolock_size    = NS_DEFAULT_MAX_NOLOCK_BYTES;
+	ns->ns_contention_time    = NS_DEFAULT_CONTENTION_SECONDS;
+	ns->ns_contended_locks    = NS_DEFAULT_CONTENDED_LOCKS;
+
+	ns->ns_max_parallel_ast   = LDLM_DEFAULT_PARALLEL_AST_LIMIT;
+	ns->ns_nr_unused	  = 0;
+	ns->ns_max_unused	 = LDLM_DEFAULT_LRU_SIZE;
+	ns->ns_max_age	    = LDLM_DEFAULT_MAX_ALIVE;
+	ns->ns_ctime_age_limit    = LDLM_CTIME_AGE_LIMIT;
+	ns->ns_timeouts	   = 0;
+	ns->ns_orig_connect_flags = 0;
+	ns->ns_connect_flags      = 0;
+	ns->ns_stopping	   = 0;
+	rc = ldlm_namespace_proc_register(ns);
+	if (rc != 0) {
+		CERROR("Can't initialize ns proc, rc %d\n", rc);
+		GOTO(out_hash, rc);
+	}
+
+	idx = atomic_read(ldlm_namespace_nr(client));
+	rc = ldlm_pool_init(&ns->ns_pool, ns, idx, client);
+	if (rc) {
+		CERROR("Can't initialize lock pool, rc %d\n", rc);
+		GOTO(out_proc, rc);
+	}
+
+	ldlm_namespace_register(ns, client);
+	RETURN(ns);
+out_proc:
+	ldlm_namespace_proc_unregister(ns);
+	ldlm_namespace_cleanup(ns, 0);
+out_hash:
+	cfs_hash_putref(ns->ns_rs_hash);
+out_ns:
+	OBD_FREE_PTR(ns);
+out_ref:
+	ldlm_put_ref();
+	RETURN(NULL);
+}
+EXPORT_SYMBOL(ldlm_namespace_new);
+
+extern struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock);
+
+/**
+ * Cancel and destroy all locks on the list \a q of resource \a res.
+ *
+ * If \a flags contains LDLM_FL_LOCAL_ONLY, don't try to tell the server,
+ * just clean up.  This is currently only used for recovery, and we make
+ * certain assumptions as a result--notably, that we shouldn't cancel
+ * locks with refs (readers or writers).
+ */
+static void cleanup_resource(struct ldlm_resource *res, struct list_head *q,
+			     __u64 flags)
+{
+	struct list_head *tmp;
+	int rc = 0, client = ns_is_client(ldlm_res_to_ns(res));
+	bool local_only = !!(flags & LDLM_FL_LOCAL_ONLY);
+
+	do {
+		struct ldlm_lock *lock = NULL;
+
+		/* First, look for a lock that has not been cleaned yet;
+		 * locks that were already handled carry the CLEANED flag. */
+		lock_res(res);
+		list_for_each(tmp, q) {
+			lock = list_entry(tmp, struct ldlm_lock,
+					      l_res_link);
+			if (lock->l_flags & LDLM_FL_CLEANED) {
+				lock = NULL;
+				continue;
+			}
+			LDLM_LOCK_GET(lock);
+			lock->l_flags |= LDLM_FL_CLEANED;
+			break;
+		}
+
+		if (lock == NULL) {
+			unlock_res(res);
+			break;
+		}
+
+		/* Set CBPENDING so nothing in the cancellation path
+		 * can match this lock. */
+		lock->l_flags |= LDLM_FL_CBPENDING;
+		lock->l_flags |= LDLM_FL_FAILED;
+		lock->l_flags |= flags;
+
+		/* ... without sending a CANCEL message for local_only. */
+		if (local_only)
+			lock->l_flags |= LDLM_FL_LOCAL_ONLY;
+
+		if (local_only && (lock->l_readers || lock->l_writers)) {
+			/* This is a little bit gross, but much better than the
+			 * alternative: pretend that we got a blocking AST from
+			 * the server, so that when the lock is decref'd, it
+			 * will go away ... */
+			unlock_res(res);
+			LDLM_DEBUG(lock, "setting FL_LOCAL_ONLY");
+			if (lock->l_completion_ast)
+				lock->l_completion_ast(lock, 0, NULL);
+			LDLM_LOCK_RELEASE(lock);
+			continue;
+		}
+
+		if (client) {
+			struct lustre_handle lockh;
+
+			unlock_res(res);
+			ldlm_lock2handle(lock, &lockh);
+			rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
+			if (rc)
+				CERROR("ldlm_cli_cancel: %d\n", rc);
+		} else {
+			ldlm_resource_unlink_lock(lock);
+			unlock_res(res);
+			LDLM_DEBUG(lock, "Freeing a lock still held by a "
+				   "client node");
+			ldlm_lock_destroy(lock);
+		}
+		LDLM_LOCK_RELEASE(lock);
+	} while (1);
+}
+
+/* Hash iterator: clean all three lock lists of one resource; returns 0. */
+static int ldlm_resource_clean(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			       struct hlist_node *hnode, void *arg)
+{
+	struct ldlm_resource *victim = cfs_hash_object(hs, hnode);
+	const __u64 how = *(__u64 *)arg;
+
+	cleanup_resource(victim, &victim->lr_granted, how);
+	cleanup_resource(victim, &victim->lr_converting, how);
+	cleanup_resource(victim, &victim->lr_waiting, how);
+	return 0;
+}
+
+/* Hash iterator: complain about a resource still present after lock cleanup
+ * and dump it at D_ERROR level, all under the resource lock.  Returns 0. */
+static int ldlm_resource_complain(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+				  struct hlist_node *hnode, void *arg)
+{
+	struct ldlm_resource *stale = cfs_hash_object(hs, hnode);
+	const struct ldlm_res_id *id = &stale->lr_name;
+
+	lock_res(stale);
+	CERROR("Namespace %s resource refcount nonzero "
+	       "(%d) after lock cleanup; forcing "
+	       "cleanup.\n",
+	       ldlm_ns_name(ldlm_res_to_ns(stale)),
+	       atomic_read(&stale->lr_refcount) - 1);
+	CERROR("Resource: %p ("LPU64"/"LPU64"/"LPU64"/"
+	       LPU64") (rc: %d)\n", stale,
+	       id->name[0], id->name[1], id->name[2], id->name[3],
+	       atomic_read(&stale->lr_refcount) - 1);
+	ldlm_resource_dump(D_ERROR, stale);
+	unlock_res(stale);
+	return 0;
+}
+
+/**
+ * Cancel and destroy all locks in the namespace \a ns (NULL is tolerated).
+ * Typically used during evictions, when the server has notified the client
+ * that it was evicted and all of its state needs to be destroyed, and
+ * during shutdown.  \a flags reaches every resource; returns ELDLM_OK.
+ */
+int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags)
+{
+	if (ns != NULL) {
+		cfs_hash_t *rs = ns->ns_rs_hash;
+
+		cfs_hash_for_each_nolock(rs, ldlm_resource_clean, &flags);
+		cfs_hash_for_each_nolock(rs, ldlm_resource_complain, NULL);
+	} else {
+		CDEBUG(D_INFO, "NULL ns, skipping cleanup\n");
+	}
+	return ELDLM_OK;
+}
+EXPORT_SYMBOL(ldlm_namespace_cleanup);
+
+/**
+ * Attempt to free namespace \a ns (used when it goes away, e.g. on unmount):
+ * clean up its locks and wait for ns_bref to reach zero.  Returns ELDLM_OK,
+ * or ELDLM_NAMESPACE_EXISTS if references are still held after the wait.
+ */
+static int __ldlm_namespace_free(struct ldlm_namespace *ns, int force)
+{
+	ENTRY;
+
+	/* At shutdown time, don't call the cancellation callback */
+	ldlm_namespace_cleanup(ns, force ? LDLM_FL_LOCAL_ONLY : 0);
+
+	if (atomic_read(&ns->ns_bref) > 0) {
+		struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+		int rc;
+		CDEBUG(D_DLMTRACE,
+		       "dlm namespace %s free waiting on refcount %d\n",
+		       ldlm_ns_name(ns), atomic_read(&ns->ns_bref));
+force_wait:
+		if (force)
+			lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, NULL);
+
+		rc = l_wait_event(ns->ns_waitq,
+				  atomic_read(&ns->ns_bref) == 0, &lwi);
+
+		/* Forced cleanups should be able to reclaim all references,
+		 * so it's safe to wait forever... we can't leak locks... */
+		if (force && rc == -ETIMEDOUT) {
+			LCONSOLE_ERROR("Forced cleanup waiting for %s "
+				       "namespace with %d resources in use, "
+				       "(rc=%d)\n", ldlm_ns_name(ns),
+				       atomic_read(&ns->ns_bref), rc);
+			GOTO(force_wait, rc);
+		}
+
+		if (atomic_read(&ns->ns_bref)) {
+			LCONSOLE_ERROR("Cleanup waiting for %s namespace "
+				       "with %d resources in use, (rc=%d)\n",
+				       ldlm_ns_name(ns),
+				       atomic_read(&ns->ns_bref), rc);
+			RETURN(ELDLM_NAMESPACE_EXISTS);
+		}
+		CDEBUG(D_DLMTRACE, "dlm namespace %s free done waiting\n",
+		       ldlm_ns_name(ns));
+	}
+
+	RETURN(ELDLM_OK);
+}
+
+/**
+ * First half of namespace teardown: get \a ns to drop all of its references.
+ *
+ * The following is done:
+ * (0) \a ns is flagged as stopping (ns_stopping) under ns_lock;
+ * (1) all locks in \a ns are cleared, waiting for the refcount to reach 0;
+ * (2) if that fails, \a imp (when given) is disconnected and invalidated
+ *     and the cleanup is repeated in forced mode, which must succeed.
+ */
+void ldlm_namespace_free_prior(struct ldlm_namespace *ns,
+			       struct obd_import *imp,
+			       int force)
+{
+	int rc;
+	ENTRY;
+	if (ns == NULL) {
+		EXIT;
+		return;
+	}
+
+	spin_lock(&ns->ns_lock);
+	ns->ns_stopping = 1;
+	spin_unlock(&ns->ns_lock);
+
+	/* An unforced attempt may leave references behind (for instance
+	 * when the wait is interrupted); in that case try harder below. */
+	rc = __ldlm_namespace_free(ns, force);
+	if (rc == ELDLM_OK) {
+		EXIT;
+		return;
+	}
+
+	if (imp != NULL) {
+		ptlrpc_disconnect_import(imp, 0);
+		ptlrpc_invalidate_import(imp);
+	}
+
+	/* With all requests dropped and the import inactive we are
+	 * guaranteed that all references will be dropped. */
+	rc = __ldlm_namespace_free(ns, 1);
+	LASSERT(rc == 0);
+	EXIT;
+}
+
+/**
+ * Second half of namespace teardown: free the memory structures related to
+ * \a ns.  Only to be done once ldlm_namespace_free_prior() has removed all
+ * resources referencing \a ns, i.e. its refcount is 0.  A NULL \a ns is a
+ * no-op.  Also drops the ldlm reference that ldlm_namespace_new() took via
+ * ldlm_get_ref().
+ */
+void ldlm_namespace_free_post(struct ldlm_namespace *ns)
+{
+	ENTRY;
+	if (ns != NULL) {
+		/* From here on nobody can find this ns in its list. */
+		ldlm_namespace_unregister(ns, ns->ns_client);
+
+		/* The pool must be finalized _before_ the parent proc dir
+		 * is removed: ldlm_pool_fini() removes its own proc dir,
+		 * which is a child of that parent dir, and removing it
+		 * afterwards may cause an oops. */
+		ldlm_pool_fini(&ns->ns_pool);
+		ldlm_namespace_proc_unregister(ns);
+		cfs_hash_putref(ns->ns_rs_hash);
+
+		/* \a ns must be off the list by now, otherwise the poold
+		 * thread could end up using a freed namespace. */
+		LASSERT(list_empty(&ns->ns_list_chain));
+		OBD_FREE_PTR(ns);
+		ldlm_put_ref();
+	}
+	EXIT;
+}
+
+/**
+ * Clean up the resources of \a ns and then free the namespace itself.
+ *
+ * bug 12864 describes the deadlock that motivated the two-step teardown:
+ * proc1: destroy import
+ *	class_disconnect_export(grab cl_sem) ->
+ *	      -> ldlm_namespace_free ->
+ *	      -> lprocfs_remove(grab _lprocfs_lock).
+ * proc2: read proc info
+ *	lprocfs_fops_read(grab _lprocfs_lock) ->
+ *	      -> osc_rd_active, etc(grab cl_sem).
+ *
+ * Therefore ldlm_namespace_free is split into two parts: the first part,
+ * ldlm_namespace_free_prior, cleans up the resources that are still in
+ * use; the second part, ldlm_namespace_free_post, unregisters the lprocfs
+ * entries and then frees the memory, and is meant to be called without
+ * cli->cl_sem held.
+ */
+void ldlm_namespace_free(struct ldlm_namespace *ns,
+			 struct obd_import *imp,
+			 int force)
+{
+	ldlm_namespace_free_prior(ns, imp, force);
+	ldlm_namespace_free_post(ns);
+}
+EXPORT_SYMBOL(ldlm_namespace_free);
+
+void ldlm_namespace_get(struct ldlm_namespace *ns)
+{
+	atomic_inc(&ns->ns_bref);	/* dropped by ldlm_namespace_put() */
+}
+EXPORT_SYMBOL(ldlm_namespace_get);
+
+void ldlm_namespace_put(struct ldlm_namespace *ns)
+{
+	if (!atomic_dec_and_lock(&ns->ns_bref, &ns->ns_lock))
+		return;	/* references remain, nobody to wake up yet */
+	wake_up(&ns->ns_waitq);
+	spin_unlock(&ns->ns_lock);
+}
+EXPORT_SYMBOL(ldlm_namespace_put);
+
+/** Add \a ns to the \a client side list and increment that side's count. */
+void ldlm_namespace_register(struct ldlm_namespace *ns, ldlm_side_t client)
+{
+	mutex_lock(ldlm_namespace_lock(client));
+	LASSERT(list_empty(&ns->ns_list_chain));
+	list_add(&ns->ns_list_chain, ldlm_namespace_list(client));
+	atomic_inc(ldlm_namespace_nr(client));
+	mutex_unlock(ldlm_namespace_lock(client));
+}
+
+/** Take \a ns off the \a client side list and decrement that side's count. */
+void ldlm_namespace_unregister(struct ldlm_namespace *ns, ldlm_side_t client)
+{
+	mutex_lock(ldlm_namespace_lock(client));
+	LASSERT(!list_empty(&ns->ns_list_chain));
+	/* Some asserts and possibly other parts of the code are still
+	 * using list_empty(&ns->ns_list_chain). This is why it is
+	 * important to use list_del_init() here. */
+	list_del_init(&ns->ns_list_chain);
+	atomic_dec(ldlm_namespace_nr(client));
+	mutex_unlock(ldlm_namespace_lock(client));
+}
+
+/** Move \a ns to the list tail; ldlm_namespace_lock(client) must be held. */
+void ldlm_namespace_move_locked(struct ldlm_namespace *ns, ldlm_side_t client)
+{
+	LASSERT(!list_empty(&ns->ns_list_chain));
+	LASSERT(mutex_is_locked(ldlm_namespace_lock(client)));
+	list_move_tail(&ns->ns_list_chain, ldlm_namespace_list(client));
+}
+
+/** First namespace on the list; ldlm_namespace_lock(client) must be held. */
+struct ldlm_namespace *ldlm_namespace_first_locked(ldlm_side_t client)
+{
+	LASSERT(mutex_is_locked(ldlm_namespace_lock(client)));
+	LASSERT(!list_empty(ldlm_namespace_list(client)));
+	return list_first_entry(ldlm_namespace_list(client),
+				struct ldlm_namespace, ns_list_chain);
+}
+
+/** Allocate a resource and set up its initial state: one reference, empty
+ *  lock lists and interval trees, lr_lvb_mutex held.  NULL on failure. */
+static struct ldlm_resource *ldlm_resource_new(void)
+{
+	struct ldlm_resource *res;
+	int mode;
+
+	OBD_SLAB_ALLOC_PTR_GFP(res, ldlm_resource_slab, __GFP_IO);
+	if (!res)
+		return NULL;
+
+	atomic_set(&res->lr_refcount, 1);
+	spin_lock_init(&res->lr_lock);
+	lu_ref_init(&res->lr_reference);
+
+	INIT_LIST_HEAD(&res->lr_granted);
+	INIT_LIST_HEAD(&res->lr_converting);
+	INIT_LIST_HEAD(&res->lr_waiting);
+
+	/* One interval tree per lock mode, tagged with that mode's bit. */
+	for (mode = 0; mode < LCK_MODE_NUM; mode++) {
+		res->lr_itree[mode].lit_size = 0;
+		res->lr_itree[mode].lit_mode = 1 << mode;
+		res->lr_itree[mode].lit_root = NULL;
+	}
+
+	/* Handed out locked: the creator of the resource must unlock the
+	 * mutex after LVB initialization. */
+	mutex_init(&res->lr_lvb_mutex);
+	mutex_lock(&res->lr_lvb_mutex);
+	return res;
+}
+
+/**
+ * Finish a successful hash lookup: wait until the creator of the resource
+ * is done with LVB initialization; if that failed (negative lr_lvb_len),
+ * drop the caller's reference and return NULL instead of the resource.
+ */
+static struct ldlm_resource *
+ldlm_resource_lookup_done(struct ldlm_namespace *ns, struct hlist_node *hnode)
+{
+	struct ldlm_resource *res;
+
+	res = hlist_entry(hnode, struct ldlm_resource, lr_hash);
+	/* Synchronize with regard to resource creation. */
+	if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) {
+		mutex_lock(&res->lr_lvb_mutex);
+		mutex_unlock(&res->lr_lvb_mutex);
+	}
+	if (unlikely(res->lr_lvb_len < 0)) {
+		ldlm_resource_putref(res);
+		res = NULL;
+	}
+	return res;
+}
+
+/**
+ * Return a reference to the resource named \a name in \a ns, creating it
+ * (with type \a type) if it is missing and \a create is non-zero.
+ * Args: namespace with ns_lock unlocked; \a parent must be NULL
+ * Returns: referenced, unlocked ldlm_resource or NULL
+ */
+struct ldlm_resource *
+ldlm_resource_get(struct ldlm_namespace *ns, struct ldlm_resource *parent,
+		  const struct ldlm_res_id *name, ldlm_type_t type, int create)
+{
+	struct hlist_node     *hnode;
+	struct ldlm_resource *res;
+	cfs_hash_bd_t	 bd;
+	__u64		 version;
+
+	LASSERT(ns != NULL);
+	LASSERT(parent == NULL);
+	LASSERT(ns->ns_rs_hash != NULL);
+	LASSERT(name->name[0] != 0);
+
+	cfs_hash_bd_get_and_lock(ns->ns_rs_hash, (void *)name, &bd, 0);
+	hnode = cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name);
+	if (hnode != NULL) {
+		cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0);
+		return ldlm_resource_lookup_done(ns, hnode);
+	}
+
+	version = cfs_hash_bd_version_get(&bd);
+	cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0);
+
+	if (create == 0)
+		return NULL;
+
+	LASSERTF(type >= LDLM_MIN_TYPE && type < LDLM_MAX_TYPE,
+		 "type: %d\n", type);
+	res = ldlm_resource_new();
+	if (!res)
+		return NULL;
+
+	res->lr_ns_bucket  = cfs_hash_bd_extra_get(ns->ns_rs_hash, &bd);
+	res->lr_name       = *name;
+	res->lr_type       = type;
+	res->lr_most_restr = LCK_NL;
+
+	cfs_hash_bd_lock(ns->ns_rs_hash, &bd, 1);
+	hnode = (version == cfs_hash_bd_version_get(&bd)) ?  NULL :
+		cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name);
+
+	if (hnode != NULL) {
+		/* Someone won the race and already added the resource. */
+		cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1);
+		/* Clean lu_ref for failed resource. */
+		lu_ref_fini(&res->lr_reference);
+		/* We have taken lr_lvb_mutex. Drop it. */
+		mutex_unlock(&res->lr_lvb_mutex);
+		OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res);
+		return ldlm_resource_lookup_done(ns, hnode);
+	}
+	/* We won! Let's add the resource. */
+	cfs_hash_bd_add_locked(ns->ns_rs_hash, &bd, &res->lr_hash);
+	if (cfs_hash_bd_count_get(&bd) == 1)
+		ldlm_namespace_get(ns);
+
+	cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1);
+	if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) {
+		int rc;
+
+		OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CREATE_RESOURCE, 2);
+		rc = ns->ns_lvbo->lvbo_init(res);
+		if (rc < 0) {
+			CERROR("lvbo_init failed for resource "
+			       LPU64": rc %d\n", name->name[0], rc);
+			if (res->lr_lvb_data) {
+				OBD_FREE(res->lr_lvb_data, res->lr_lvb_len);
+				res->lr_lvb_data = NULL;
+			}
+			res->lr_lvb_len = rc;
+			mutex_unlock(&res->lr_lvb_mutex);
+			ldlm_resource_putref(res);
+			return NULL;
+		}
+	}
+
+	/* We create resource with locked lr_lvb_mutex. */
+	mutex_unlock(&res->lr_lvb_mutex);
+
+	return res;
+}
+EXPORT_SYMBOL(ldlm_resource_get);
+
+struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res)
+{
+	LASSERT(res != NULL);
+	LASSERT(res != LP_POISON);
+	atomic_inc(&res->lr_refcount);
+	CDEBUG(D_INFO, "getref res: %p count: %d\n", res,
+	       atomic_read(&res->lr_refcount));
+	return res;	/* the pointer that was passed in */
+}
+
+/*
+ * Last reference on \a res is gone: unhash it from bucket \a bd, whose lock
+ * the caller holds.  A resource that still has locks on any of its lists
+ * at this point is a bug.  When \a bd becomes empty, the namespace
+ * reference taken for the bucket's first resource is dropped as well.
+ * The resource itself is freed by the caller.
+ */
+static void __ldlm_resource_putref_final(cfs_hash_bd_t *bd,
+					 struct ldlm_resource *res)
+{
+	struct ldlm_ns_bucket *nsb = res->lr_ns_bucket;
+	struct ldlm_namespace *ns = nsb->nsb_namespace;
+
+	if (!list_empty(&res->lr_granted) ||
+	    !list_empty(&res->lr_converting) ||
+	    !list_empty(&res->lr_waiting)) {
+		ldlm_resource_dump(D_ERROR, res);
+		LBUG();
+	}
+
+	/* Unhash first, then let go of the namespace if the bucket is empty. */
+	cfs_hash_bd_del_locked(ns->ns_rs_hash, bd, &res->lr_hash);
+	lu_ref_fini(&res->lr_reference);
+	if (cfs_hash_bd_count_get(bd) == 0)
+		ldlm_namespace_put(ns);
+}
+
+/* Drop a reference on \a res.  Returns 1 if it was freed, 0 if it remains. */
+int ldlm_resource_putref(struct ldlm_resource *res)
+{
+	struct ldlm_namespace *ns = ldlm_res_to_ns(res);
+	cfs_hash_t *hs = ns->ns_rs_hash;
+	cfs_hash_bd_t bd;
+
+	LASSERT_ATOMIC_GT_LT(&res->lr_refcount, 0, LI_POISON);
+	CDEBUG(D_INFO, "putref res: %p count: %d\n",
+	       res, atomic_read(&res->lr_refcount) - 1);
+	cfs_hash_bd_get(hs, &res->lr_name, &bd);
+	if (!cfs_hash_bd_dec_and_lock(hs, &bd, &res->lr_refcount))
+		return 0;
+	/* Last reference: unhash under the bucket lock, free outside of it. */
+	__ldlm_resource_putref_final(&bd, res);
+	cfs_hash_bd_unlock(hs, &bd, 1);
+	if (ns->ns_lvbo && ns->ns_lvbo->lvbo_free)
+		ns->ns_lvbo->lvbo_free(res);
+	OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res);
+	return 1;
+}
+EXPORT_SYMBOL(ldlm_resource_putref);
+
+/* Put variant for callers that already hold the hash bucket lock of \a res.
+ * Returns 1 if the resource was freed, 0 if it remains. */
+int ldlm_resource_putref_locked(struct ldlm_resource *res)
+{
+	struct ldlm_namespace *ns = ldlm_res_to_ns(res);
+	cfs_hash_bd_t bd;
+
+	LASSERT_ATOMIC_GT_LT(&res->lr_refcount, 0, LI_POISON);
+	CDEBUG(D_INFO, "putref res: %p count: %d\n",
+	       res, atomic_read(&res->lr_refcount) - 1);
+
+	if (!atomic_dec_and_test(&res->lr_refcount))
+		return 0;
+
+	cfs_hash_bd_get(ns->ns_rs_hash, &res->lr_name, &bd);
+	__ldlm_resource_putref_final(&bd, res);
+
+	/* NB: ns_rs_hash is created with CFS_HASH_NO_ITEMREF, so we should
+	 * never get here while calling cfs_hash_del;
+	 * cfs_hash_for_each_nolock is the only way to get here, and there
+	 * it is safe to release the bucket lock while freeing. */
+	cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1);
+	if (ns->ns_lvbo && ns->ns_lvbo->lvbo_free)
+		ns->ns_lvbo->lvbo_free(res);
+	OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res);
+
+	/* Give the bucket lock back to the caller in the state it expects. */
+	cfs_hash_bd_lock(ns->ns_rs_hash, &bd, 1);
+	return 1;
+}
+
+/**
+ * Append \a lock to the tail of list \a head of resource \a res; the
+ * resource is expected to be locked (see check_res_locked()).  A lock that
+ * has already been destroyed is not added.
+ */
+void ldlm_resource_add_lock(struct ldlm_resource *res, struct list_head *head,
+			    struct ldlm_lock *lock)
+{
+	check_res_locked(res);
+
+	LDLM_DEBUG(lock, "About to add this lock:\n");
+
+	if (!lock->l_destroyed) {
+		LASSERT(list_empty(&lock->l_res_link));
+		list_add_tail(&lock->l_res_link, head);
+	} else {
+		CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n");
+	}
+}
+
+/**
+ * Insert lock \a new into a resource, directly after lock \a original.
+ *
+ * The resource is taken from \a original (the lock we insert after) and is
+ * expected to be locked.  A destroyed \a new is not inserted.
+ */
+void ldlm_resource_insert_lock_after(struct ldlm_lock *original,
+				     struct ldlm_lock *new)
+{
+	struct ldlm_resource *res = original->l_resource;
+
+	check_res_locked(res);
+
+	ldlm_resource_dump(D_INFO, res);
+	LDLM_DEBUG(new, "About to insert this lock after %p:\n", original);
+
+	if (new->l_destroyed) {
+		CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n");
+		return;
+	}
+
+	LASSERT(list_empty(&new->l_res_link));
+
+	list_add(&new->l_res_link, &original->l_res_link);
+}
+
+/* Unlink \a lock from its (locked) resource and any type specific linkage. */
+void ldlm_resource_unlink_lock(struct ldlm_lock *lock)
+{
+	check_res_locked(lock->l_resource);
+	if (lock->l_resource->lr_type == LDLM_EXTENT)
+		ldlm_extent_unlink_lock(lock);
+	else if (lock->l_resource->lr_type == LDLM_IBITS ||
+		 lock->l_resource->lr_type == LDLM_PLAIN)
+		ldlm_unlink_lock_skiplist(lock);
+	list_del_init(&lock->l_res_link);
+}
+EXPORT_SYMBOL(ldlm_resource_unlink_lock);
+
+void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc)
+{
+	desc->lr_type = res->lr_type;
+	desc->lr_name = res->lr_name;	/* copied by value */
+}
+
+/**
+ * Print information about all locks in all namespaces on the \a client
+ * side of this node to the debug log.
+ * Nothing is printed unless \a level is enabled in libcfs_debug or
+ * contains D_ERROR.
+ */
+void ldlm_dump_all_namespaces(ldlm_side_t client, int level)
+{
+	struct ldlm_namespace *ns;
+
+	if (((libcfs_debug | D_ERROR) & level) == 0)
+		return;
+
+	/* The list mutex keeps namespaces from coming and going meanwhile. */
+	mutex_lock(ldlm_namespace_lock(client));
+
+	list_for_each_entry(ns, ldlm_namespace_list(client), ns_list_chain)
+		ldlm_namespace_dump(level, ns);
+
+	mutex_unlock(ldlm_namespace_lock(client));
+}
+EXPORT_SYMBOL(ldlm_dump_all_namespaces);
+
+/* Hash iterator: dump one resource under its lock; \a arg carries the debug
+ * level, cast to a pointer by the caller.  Always returns 0. */
+static int ldlm_res_hash_dump(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			      struct hlist_node *hnode, void *arg)
+{
+	struct ldlm_resource *res = cfs_hash_object(hs, hnode);
+
+	lock_res(res);
+	ldlm_resource_dump((int)(unsigned long)arg, res);
+	unlock_res(res);
+	return 0;
+}
+
+/**
+ * Print information about all locks in namespace \a ns to the debug log.
+ * The summary line is printed whenever \a level is enabled, but resources
+ * are only dumped once ns_next_dump has been reached; each dump moves
+ * ns_next_dump to cfs_time_shift(10).
+ */
+void ldlm_namespace_dump(int level, struct ldlm_namespace *ns)
+{
+	if (((libcfs_debug | D_ERROR) & level) == 0)
+		return;
+
+	CDEBUG(level, "--- Namespace: %s (rc: %d, side: %s)\n",
+	       ldlm_ns_name(ns), atomic_read(&ns->ns_bref),
+	       ns_is_client(ns) ? "client" : "server");
+
+	if (!cfs_time_before(cfs_time_current(), ns->ns_next_dump)) {
+		cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_res_hash_dump,
+					 (void *)(unsigned long)level);
+		spin_lock(&ns->ns_lock);
+		ns->ns_next_dump = cfs_time_shift(10);
+		spin_unlock(&ns->ns_lock);
+	}
+}
+EXPORT_SYMBOL(ldlm_namespace_dump);
+
+/** Print \a res and the locks on its granted, converting and waiting lists
+ * to the debug log at \a level.  Unless \a level has D_CANTMASK bits set,
+ * the granted list is cut short past ldlm_dump_granted_max entries. */
+void ldlm_resource_dump(int level, struct ldlm_resource *res)
+{
+	struct ldlm_lock *lock;
+	unsigned int granted = 0;
+
+	CLASSERT(RES_NAME_SIZE == 4);
+
+	if (!((libcfs_debug | D_ERROR) & level))
+		return;
+
+	CDEBUG(level, "--- Resource: %p ("LPU64"/"LPU64"/"LPU64"/"LPU64
+	       ") (rc: %d)\n", res, res->lr_name.name[0], res->lr_name.name[1],
+	       res->lr_name.name[2], res->lr_name.name[3],
+	       atomic_read(&res->lr_refcount));
+
+	if (!list_empty(&res->lr_granted)) {
+		CDEBUG(level, "Granted locks (in reverse order):\n");
+		list_for_each_entry_reverse(lock, &res->lr_granted,
+						l_res_link) {
+			LDLM_DEBUG_LIMIT(level, lock, "###");
+			if (!(level & D_CANTMASK) &&
+			    ++granted > ldlm_dump_granted_max) {
+				CDEBUG(level, "only dump %d granted locks to "
+				       "avoid DDOS.\n", granted);
+				break;
+			}
+		}
+	}
+	if (!list_empty(&res->lr_converting)) {
+		CDEBUG(level, "Converting locks:\n");
+		list_for_each_entry(lock, &res->lr_converting, l_res_link)
+			LDLM_DEBUG_LIMIT(level, lock, "###");
+	}
+	if (!list_empty(&res->lr_waiting)) {
+		CDEBUG(level, "Waiting locks:\n");
+		list_for_each_entry(lock, &res->lr_waiting, l_res_link)
+			LDLM_DEBUG_LIMIT(level, lock, "###");
+	}
+}
diff --git a/drivers/staging/lustre/lustre/libcfs/Makefile b/drivers/staging/lustre/lustre/libcfs/Makefile
new file mode 100644
index 000000000000..d64a3d0aec74
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/Makefile
@@ -0,0 +1,22 @@
+obj-$(CONFIG_LUSTRE_FS) += libcfs.o
+# Linux specific objects; the linux/ directory prefix is added further down.
+libcfs-linux-objs := linux-tracefile.o linux-debug.o
+libcfs-linux-objs += linux-prim.o linux-mem.o linux-cpu.o
+libcfs-linux-objs += linux-fs.o linux-sync.o linux-tcpip.o
+libcfs-linux-objs += linux-proc.o linux-curproc.o
+libcfs-linux-objs += linux-utils.o linux-module.o
+libcfs-linux-objs += linux-crypto.o linux-crypto-crc32.o
+libcfs-linux-objs += linux-crypto-adler.o
+libcfs-linux-objs += linux-crypto-crc32pclmul.o
+
+libcfs-linux-objs := $(addprefix linux/,$(libcfs-linux-objs))
+# Objects built from this directory itself.
+libcfs-all-objs := debug.o fail.o nidstrings.o module.o tracefile.o \
+		   watchdog.o libcfs_string.o hash.o kernel_user_comm.o \
+		   prng.o workitem.o upcall_cache.o libcfs_cpu.o \
+		   libcfs_mem.o libcfs_lock.o crc32-pclmul_asm.o
+
+libcfs-objs := $(libcfs-linux-objs) $(libcfs-all-objs)
+
+ccflags-y := -I$(src)/../include
+ccflags-y += -I$(src)/
diff --git a/drivers/staging/lustre/lustre/libcfs/crc32-pclmul_asm.S b/drivers/staging/lustre/lustre/libcfs/crc32-pclmul_asm.S
new file mode 100644
index 000000000000..cfaf13f275b3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/crc32-pclmul_asm.S
@@ -0,0 +1,360 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
+ * calculation.
+ * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
+ * PCLMULQDQ (carry-less multiplication) is an Intel instruction set extension
+ * with its own CPUID feature bit, separate from SSE4.2; for reference see:
+ * http://www.intel.com/products/processor/manuals/
+ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
+ * Volume 2B: Instruction Set Reference, N-Z
+ *
+ * Authors:     Gregory Prestas <Gregory_Prestas@us.xyratex.com>
+ *	      Alexander Boyko <Alexander_Boyko@xyratex.com>
+ */
+
+/* gcc 4.1.2 does not support the pclmulqdq instruction, so the opcode is
+ * emitted through macro definitions taken from Linux kernel 2.6.38. */
+
+#define REG_NUM_INVALID	100	/* no register name matched */
+	.macro R32_NUM opd r32
+	\opd = REG_NUM_INVALID
+	.ifc \r32,%eax
+	\opd = 0
+	.endif
+	.ifc \r32,%ecx
+	\opd = 1
+	.endif
+	.ifc \r32,%edx
+	\opd = 2
+	.endif
+	.ifc \r32,%ebx
+	\opd = 3
+	.endif
+	.ifc \r32,%esp
+	\opd = 4
+	.endif
+	.ifc \r32,%ebp
+	\opd = 5
+	.endif
+	.ifc \r32,%esi
+	\opd = 6
+	.endif
+	.ifc \r32,%edi
+	\opd = 7
+	.endif
+	.endm
+/* XMM_NUM: set opd to the number of the named xmm register (0-15), or to
+   REG_NUM_INVALID when the name is not recognized; R32_NUM above does the
+   same for the eight 32-bit general purpose registers. */
+	.macro XMM_NUM opd xmm
+	\opd = REG_NUM_INVALID
+	.ifc \xmm,%xmm0
+	\opd = 0
+	.endif
+	.ifc \xmm,%xmm1
+	\opd = 1
+	.endif
+	.ifc \xmm,%xmm2
+	\opd = 2
+	.endif
+	.ifc \xmm,%xmm3
+	\opd = 3
+	.endif
+	.ifc \xmm,%xmm4
+	\opd = 4
+	.endif
+	.ifc \xmm,%xmm5
+	\opd = 5
+	.endif
+	.ifc \xmm,%xmm6
+	\opd = 6
+	.endif
+	.ifc \xmm,%xmm7
+	\opd = 7
+	.endif
+	.ifc \xmm,%xmm8
+	\opd = 8
+	.endif
+	.ifc \xmm,%xmm9
+	\opd = 9
+	.endif
+	.ifc \xmm,%xmm10
+	\opd = 10
+	.endif
+	.ifc \xmm,%xmm11
+	\opd = 11
+	.endif
+	.ifc \xmm,%xmm12
+	\opd = 12
+	.endif
+	.ifc \xmm,%xmm13
+	\opd = 13
+	.endif
+	.ifc \xmm,%xmm14
+	\opd = 14
+	.endif
+	.ifc \xmm,%xmm15
+	\opd = 15
+	.endif
+	.endm
+/* PFX_OPD_SIZE: emit the 0x66 prefix byte. */
+	.macro PFX_OPD_SIZE
+	.byte 0x66
+	.endm
+/* PFX_REX: emit a REX prefix byte, but only if a register number has bit 3
+   set (i.e. is 8 or above) or W is non-zero. */
+	.macro PFX_REX opd1 opd2 W=0
+	.if ((\opd1 | \opd2) & 8) || \W
+	.byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3)
+	.endif
+	.endm
+/* MODRM: emit mod combined with the low three bits of both operands. */
+	.macro MODRM mod opd1 opd2
+	.byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3)
+	.endm
+/* PCLMULQDQ imm8 xmm1 xmm2: hand-assembled as 66 [REX] 0f 3a 44 modrm imm8. */
+	.macro PCLMULQDQ imm8 xmm1 xmm2
+	XMM_NUM clmul_opd1 \xmm1
+	XMM_NUM clmul_opd2 \xmm2
+	PFX_OPD_SIZE
+	PFX_REX clmul_opd1 clmul_opd2
+	.byte 0x0f, 0x3a, 0x44
+	MODRM 0xc0 clmul_opd1 clmul_opd2
+	.byte \imm8
+	.endm
+/* PEXTRD imm8 xmm1 reg1: hand-assembled as 66 [REX] 0f 3a 16 modrm imm8. */
+	.macro PEXTRD imm8 xmm1 reg1
+	XMM_NUM extrd_opd2 \xmm1
+	R32_NUM extrd_opd1 \reg1
+	PFX_OPD_SIZE
+	PFX_REX extrd_opd1 extrd_opd2
+	.byte 0x0f, 0x3a, 0x16
+	MODRM 0xc0 extrd_opd1 extrd_opd2
+	.byte \imm8
+	.endm
+
+.align 16
+/*
+ * [(x4*128+32 mod P(x) << 32)]' << 1   = 0x154442bd4
+ * #define CONSTANT_R1  0x154442bd4LL
+ *
+ * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
+ * #define CONSTANT_R2  0x1c6e41596LL
+ */
+.Lconstant_R2R1:
+	.octa 0x00000001c6e415960000000154442bd4
+/*
+ * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
+ * #define CONSTANT_R3  0x1751997d0LL
+ *
+ * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
+ * #define CONSTANT_R4  0x0ccaa009eLL
+ */
+.Lconstant_R4R3:
+	.octa 0x00000000ccaa009e00000001751997d0
+/*
+ * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
+ * #define CONSTANT_R5  0x163cd6124LL
+ */
+.Lconstant_R5:
+	.octa 0x00000000000000000000000163cd6124
+.Lconstant_mask32:
+	.octa 0x000000000000000000000000FFFFFFFF
+/*
+ * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
+ *
+ * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL
+ * #define CONSTANT_RU  0x1F7011641LL
+ */
+.Lconstant_RUpoly:
+	.octa 0x00000001F701164100000001DB710641
+
+#define CONSTANT %xmm0
+
+#ifdef __x86_64__
+#define BUF     %rdi
+#define LEN     %rsi
+#define CRC     %edx
+#else
+#define BUF     %eax
+#define LEN     %edx
+#define CRC     %ecx
+#endif
+
+
+
+.text
+/**
+ *      Calculate crc32
+ *      BUF - buffer (16 bytes aligned)
+ *      LEN - sizeof buffer (16 bytes aligned), LEN should be greater than 63
+ *      CRC - initial crc32
+ *      return %eax crc32
+ *      uint crc32_pclmul_le_16(unsigned char const *buffer,
+ *			     size_t len, uint crc32)
+ */
+.globl crc32_pclmul_le_16
+.align 4, 0x90
+crc32_pclmul_le_16:/* buffer and buffer size are 16 bytes aligned */
+	movdqa  (BUF), %xmm1
+	movdqa  0x10(BUF), %xmm2
+	movdqa  0x20(BUF), %xmm3
+	movdqa  0x30(BUF), %xmm4
+	movd    CRC, CONSTANT
+	pxor    CONSTANT, %xmm1
+	sub     $0x40, LEN
+	add     $0x40, BUF
+#ifndef __x86_64__
+	/* 32-bit PIC support: %ecx = address of delta (CRC is consumed by now) */
+	call    delta
+delta:
+	pop     %ecx
+#endif
+	cmp     $0x40, LEN
+	jb      less_64
+
+#ifdef __x86_64__
+	movdqa .Lconstant_R2R1(%rip), CONSTANT
+#else
+	movdqa .Lconstant_R2R1 - delta(%ecx), CONSTANT
+#endif
+
+loop_64:/*  64 bytes Full cache line folding */
+	prefetchnta    0x40(BUF)
+	movdqa  %xmm1, %xmm5
+	movdqa  %xmm2, %xmm6
+	movdqa  %xmm3, %xmm7
+#ifdef __x86_64__
+	movdqa  %xmm4, %xmm8
+#endif
+	PCLMULQDQ 00, CONSTANT, %xmm1
+	PCLMULQDQ 00, CONSTANT, %xmm2
+	PCLMULQDQ 00, CONSTANT, %xmm3
+#ifdef __x86_64__
+	PCLMULQDQ 00, CONSTANT, %xmm4
+#endif
+	PCLMULQDQ 0x11, CONSTANT, %xmm5
+	PCLMULQDQ 0x11, CONSTANT, %xmm6
+	PCLMULQDQ 0x11, CONSTANT, %xmm7
+#ifdef __x86_64__
+	PCLMULQDQ 0x11, CONSTANT, %xmm8
+#endif
+	pxor    %xmm5, %xmm1
+	pxor    %xmm6, %xmm2
+	pxor    %xmm7, %xmm3
+#ifdef __x86_64__
+	pxor    %xmm8, %xmm4
+#else
+	/* xmm8 is not available in 32-bit mode; fold xmm4 through xmm5 */
+	movdqa  %xmm4, %xmm5
+	PCLMULQDQ 00, CONSTANT, %xmm4
+	PCLMULQDQ 0x11, CONSTANT, %xmm5
+	pxor    %xmm5, %xmm4
+#endif
+
+	pxor    (BUF), %xmm1
+	pxor    0x10(BUF), %xmm2
+	pxor    0x20(BUF), %xmm3
+	pxor    0x30(BUF), %xmm4
+
+	sub     $0x40, LEN
+	add     $0x40, BUF
+	cmp     $0x40, LEN
+	jge     loop_64
+less_64:/*  Folding cache line into 128bit */
+#ifdef __x86_64__
+	movdqa  .Lconstant_R4R3(%rip), CONSTANT
+#else
+	movdqa  .Lconstant_R4R3 - delta(%ecx), CONSTANT
+#endif
+	prefetchnta     (BUF)
+
+	movdqa  %xmm1, %xmm5
+	PCLMULQDQ 0x00, CONSTANT, %xmm1
+	PCLMULQDQ 0x11, CONSTANT, %xmm5
+	pxor    %xmm5, %xmm1
+	pxor    %xmm2, %xmm1
+
+	movdqa  %xmm1, %xmm5
+	PCLMULQDQ 0x00, CONSTANT, %xmm1
+	PCLMULQDQ 0x11, CONSTANT, %xmm5
+	pxor    %xmm5, %xmm1
+	pxor    %xmm3, %xmm1
+
+	movdqa  %xmm1, %xmm5
+	PCLMULQDQ 0x00, CONSTANT, %xmm1
+	PCLMULQDQ 0x11, CONSTANT, %xmm5
+	pxor    %xmm5, %xmm1
+	pxor    %xmm4, %xmm1
+
+	cmp     $0x10, LEN
+	jb      fold_64
+loop_16:/* Folding rest buffer into 128bit */
+	movdqa  %xmm1, %xmm5
+	PCLMULQDQ 0x00, CONSTANT, %xmm1
+	PCLMULQDQ 0x11, CONSTANT, %xmm5
+	pxor    %xmm5, %xmm1
+	pxor    (BUF), %xmm1
+	sub     $0x10, LEN
+	add     $0x10, BUF
+	cmp     $0x10, LEN
+	jge     loop_16
+
+fold_64:
+	/* perform the last 64 bit fold, also adds 32 zeroes
+	 * to the input stream */
+	PCLMULQDQ 0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
+	psrldq  $0x08, %xmm1
+	pxor    CONSTANT, %xmm1
+
+	/* final 32-bit fold */
+	movdqa  %xmm1, %xmm2
+#ifdef __x86_64__
+	movdqa  .Lconstant_R5(%rip), CONSTANT
+	movdqa  .Lconstant_mask32(%rip), %xmm3
+#else
+	movdqa  .Lconstant_R5 - delta(%ecx), CONSTANT
+	movdqa  .Lconstant_mask32 - delta(%ecx), %xmm3
+#endif
+	psrldq  $0x04, %xmm2
+	pand    %xmm3, %xmm1
+	PCLMULQDQ 0x00, CONSTANT, %xmm1
+	pxor    %xmm2, %xmm1
+
+	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
+#ifdef __x86_64__
+	movdqa  .Lconstant_RUpoly(%rip), CONSTANT
+#else
+	movdqa  .Lconstant_RUpoly - delta(%ecx), CONSTANT
+#endif
+	movdqa  %xmm1, %xmm2
+	pand    %xmm3, %xmm1
+	PCLMULQDQ 0x10, CONSTANT, %xmm1
+	pand    %xmm3, %xmm1
+	PCLMULQDQ 0x00, CONSTANT, %xmm1
+	pxor    %xmm2, %xmm1
+	PEXTRD  0x01, %xmm1, %eax
+
+	ret
diff --git a/drivers/staging/lustre/lustre/libcfs/debug.c b/drivers/staging/lustre/lustre/libcfs/debug.c
new file mode 100644
index 000000000000..5a87b0832074
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/debug.c
@@ -0,0 +1,476 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/debug.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ *
+ */
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include "tracefile.h"
+
+static char debug_file_name[1024];
+
+unsigned int libcfs_subsystem_debug = ~0;
+CFS_MODULE_PARM(libcfs_subsystem_debug, "i", int, 0644,
+		"Lustre kernel debug subsystem mask");
+EXPORT_SYMBOL(libcfs_subsystem_debug);
+
+unsigned int libcfs_debug = (D_CANTMASK |
+			     D_NETERROR | D_HA | D_CONFIG | D_IOCTL);
+CFS_MODULE_PARM(libcfs_debug, "i", int, 0644,
+		"Lustre kernel debug mask");
+EXPORT_SYMBOL(libcfs_debug);
+
+unsigned int libcfs_debug_mb = 0;
+CFS_MODULE_PARM(libcfs_debug_mb, "i", uint, 0644,
+		"Total debug buffer size.");
+EXPORT_SYMBOL(libcfs_debug_mb);
+
+unsigned int libcfs_printk = D_CANTMASK;
+CFS_MODULE_PARM(libcfs_printk, "i", uint, 0644,
+		"Lustre kernel debug console mask");
+EXPORT_SYMBOL(libcfs_printk);
+
+unsigned int libcfs_console_ratelimit = 1;
+CFS_MODULE_PARM(libcfs_console_ratelimit, "i", uint, 0644,
+		"Lustre kernel debug console ratelimit (0 to disable)");
+EXPORT_SYMBOL(libcfs_console_ratelimit);
+
+unsigned int libcfs_console_max_delay;
+CFS_MODULE_PARM(libcfs_console_max_delay, "l", uint, 0644,
+		"Lustre kernel debug console max delay (jiffies)");
+EXPORT_SYMBOL(libcfs_console_max_delay);
+
+unsigned int libcfs_console_min_delay;
+CFS_MODULE_PARM(libcfs_console_min_delay, "l", uint, 0644,
+		"Lustre kernel debug console min delay (jiffies)");
+EXPORT_SYMBOL(libcfs_console_min_delay);
+
+unsigned int libcfs_console_backoff = CDEBUG_DEFAULT_BACKOFF;
+CFS_MODULE_PARM(libcfs_console_backoff, "i", uint, 0644,
+		"Lustre kernel debug console backoff factor");
+EXPORT_SYMBOL(libcfs_console_backoff);
+
+unsigned int libcfs_debug_binary = 1;
+EXPORT_SYMBOL(libcfs_debug_binary);
+
+unsigned int libcfs_stack = 3 * THREAD_SIZE / 4;
+EXPORT_SYMBOL(libcfs_stack);
+
+unsigned int portal_enter_debugger;
+EXPORT_SYMBOL(portal_enter_debugger);
+
+unsigned int libcfs_catastrophe;
+EXPORT_SYMBOL(libcfs_catastrophe);
+
+unsigned int libcfs_watchdog_ratelimit = 300;
+EXPORT_SYMBOL(libcfs_watchdog_ratelimit);
+
+unsigned int libcfs_panic_on_lbug = 1;
+CFS_MODULE_PARM(libcfs_panic_on_lbug, "i", uint, 0644,
+		"Lustre kernel panic on LBUG");
+EXPORT_SYMBOL(libcfs_panic_on_lbug);
+
+atomic_t libcfs_kmemory = ATOMIC_INIT(0);
+EXPORT_SYMBOL(libcfs_kmemory);
+
+static wait_queue_head_t debug_ctlwq;
+
+char libcfs_debug_file_path_arr[PATH_MAX] = LIBCFS_DEBUG_FILE_PATH_DEFAULT;
+
+/* We need to pass a pointer here, but elsewhere this must be a const */
+char *libcfs_debug_file_path;
+CFS_MODULE_PARM(libcfs_debug_file_path, "s", charp, 0644,
+		"Path for dumping debug logs, "
+		"set 'NONE' to prevent log dumping");
+
+int libcfs_panic_in_progress;
+
+/* Map bit number @subsys to its subsystem name, or NULL for an unused bit.
+ * libcfs_debug_token2mask() expects the returned string in lower-case */
+const char *libcfs_debug_subsys2str(int subsys)
+{
+	/* shift an unsigned 1: "1 << 31" overflows a signed int */
+	switch (1U << subsys) {
+	default:
+		return NULL;
+	case S_UNDEFINED:
+		return "undefined";
+	case S_MDC:
+		return "mdc";
+	case S_MDS:
+		return "mds";
+	case S_OSC:
+		return "osc";
+	case S_OST:
+		return "ost";
+	case S_CLASS:
+		return "class";
+	case S_LOG:
+		return "log";
+	case S_LLITE:
+		return "llite";
+	case S_RPC:
+		return "rpc";
+	case S_LNET:
+		return "lnet";
+	case S_LND:
+		return "lnd";
+	case S_PINGER:
+		return "pinger";
+	case S_FILTER:
+		return "filter";
+	case S_ECHO:
+		return "echo";
+	case S_LDLM:
+		return "ldlm";
+	case S_LOV:
+		return "lov";
+	case S_LQUOTA:
+		return "lquota";
+	case S_OSD:
+		return "osd";
+	case S_LMV:
+		return "lmv";
+	case S_SEC:
+		return "sec";
+	case S_GSS:
+		return "gss";
+	case S_MGC:
+		return "mgc";
+	case S_MGS:
+		return "mgs";
+	case S_FID:
+		return "fid";
+	case S_FLD:
+		return "fld";
+	}
+}
+
+/* Map bit number @debug to its debug-flag name, or NULL for an unused bit.
+ * libcfs_debug_token2mask() expects the returned string in lower-case */
+const char *libcfs_debug_dbg2str(int debug)
+{
+	/* shift an unsigned 1: "1 << 31" overflows a signed int */
+	switch (1U << debug) {
+	default:
+		return NULL;
+	case D_TRACE:
+		return "trace";
+	case D_INODE:
+		return "inode";
+	case D_SUPER:
+		return "super";
+	case D_EXT2:
+		return "ext2";
+	case D_MALLOC:
+		return "malloc";
+	case D_CACHE:
+		return "cache";
+	case D_INFO:
+		return "info";
+	case D_IOCTL:
+		return "ioctl";
+	case D_NETERROR:
+		return "neterror";
+	case D_NET:
+		return "net";
+	case D_WARNING:
+		return "warning";
+	case D_BUFFS:
+		return "buffs";
+	case D_OTHER:
+		return "other";
+	case D_DENTRY:
+		return "dentry";
+	case D_NETTRACE:
+		return "nettrace";
+	case D_PAGE:
+		return "page";
+	case D_DLMTRACE:
+		return "dlmtrace";
+	case D_ERROR:
+		return "error";
+	case D_EMERG:
+		return "emerg";
+	case D_HA:
+		return "ha";
+	case D_RPCTRACE:
+		return "rpctrace";
+	case D_VFSTRACE:
+		return "vfstrace";
+	case D_READA:
+		return "reada";
+	case D_MMAP:
+		return "mmap";
+	case D_CONFIG:
+		return "config";
+	case D_CONSOLE:
+		return "console";
+	case D_QUOTA:
+		return "quota";
+	case D_SEC:
+		return "sec";
+	case D_LFSCK:
+		return "lfsck";
+	}
+}
+
+/* Render @mask into @str (capacity @size) as space-separated token names, or
+ * "0" for an empty mask; output is truncated to fit.  Returns the untruncated
+ * length, not counting the terminating NUL. */
+int libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys)
+{
+	const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str :
+						 libcfs_debug_dbg2str;
+	const char *token;
+	int len = 0;
+	int i;
+
+	if (mask == 0) {			/* "0" */
+		if (size > 0)
+			str[0] = '0';
+		len = 1;
+	}
+
+	for (i = 0; i < 32; i++) {	/* finds no bits when mask == 0 */
+		/* shift an unsigned 1: "1 << 31" overflows a signed int */
+		if (((unsigned int)mask & (1U << i)) == 0)
+			continue;
+
+		token = fn(i);
+		if (token == NULL)		/* unused bit */
+			continue;
+
+		if (len > 0) {			/* separator */
+			if (len < size)
+				str[len] = ' ';
+			len++;
+		}
+
+		for (; *token != 0; token++, len++)
+			if (len < size)
+				str[len] = *token;
+	}
+
+	/* terminate 'str'; with size <= 0 there is no room even for the NUL */
+	if (len < size)
+		str[len] = 0;
+	else if (size > 0)
+		str[size - 1] = 0;
+
+	return len;
+}
+
+/*
+ * Parse @str into *@mask.  A bare number is still accepted for backwards
+ * compatibility; anything else is handed to cfs_str2mask() as a token list.
+ */
+int libcfs_debug_str2mask(int *mask, const char *str, int is_subsys)
+{
+	const char *(*fn)(int bit);
+	int value = 0;
+	int consumed;
+	int end = strlen(str);
+
+	fn = is_subsys ? libcfs_debug_subsys2str : libcfs_debug_dbg2str;
+
+	/* ignore trailing whitespace when deciding if all input was used */
+	while (end > 0 && isspace(str[end - 1]))
+		end--;
+	consumed = end;
+
+	if (sscanf(str, "%i%n", &value, &consumed) < 1 || consumed != end)
+		return cfs_str2mask(str, fn, mask,
+				    is_subsys ? 0 : D_CANTMASK, 0xffffffff);
+
+	/* don't print warning for lctl set_param debug=0 or -1 */
+	if (value != 0 && value != -1)
+		CWARN("You are trying to use a numerical value for the "
+		      "mask - this will be deprecated in a future "
+		      "release.\n");
+	*mask = value;
+
+	return 0;
+}
+
+/**
+ * Dump the Lustre log to "<libcfs_debug_file_path_arr>.<secs>.<arg>" through
+ * cfs_tracefile_dump_all_pages(), unless the path starts with "NONE".
+ */
+void libcfs_debug_dumplog_internal(void *arg)
+{
+	DECL_JOURNAL_DATA;
+
+	PUSH_JOURNAL;
+	if (strncmp(libcfs_debug_file_path_arr, "NONE", 4) != 0) {
+		snprintf(debug_file_name, sizeof(debug_file_name) - 1,
+			 "%s.%ld." LPLD, libcfs_debug_file_path_arr,
+			 cfs_time_current_sec(), (long_ptr_t)arg);
+		printk(KERN_ALERT "LustreError: dumping log to %s\n",
+		       debug_file_name);
+		cfs_tracefile_dump_all_pages(debug_file_name);
+		libcfs_run_debug_log_upcall(debug_file_name);
+	}
+	POP_JOURNAL;
+}
+
+int libcfs_debug_dumplog_thread(void *arg)
+{
+	libcfs_debug_dumplog_internal(arg);
+	wake_up(&debug_ctlwq);	/* wakes the waiter in libcfs_debug_dumplog() */
+	return 0;
+}
+
+void libcfs_debug_dumplog(void)
+{
+	wait_queue_t wait;
+	task_t    *dumper;
+	ENTRY;
+
+	/* Queue ourselves on debug_ctlwq and mark the task sleeping *before*
+	 * starting the dumper thread, so that its wake_up() is not lost even
+	 * if it exits before we get to schedule() */
+	init_waitqueue_entry_current(&wait);
+	set_current_state(TASK_INTERRUPTIBLE);
+	add_wait_queue(&debug_ctlwq, &wait);
+
+	dumper = kthread_run(libcfs_debug_dumplog_thread,
+			     (void *)(long)current_pid(),
+			     "libcfs_debug_dumper");
+	if (IS_ERR(dumper))
+		printk(KERN_ERR "LustreError: cannot start log dump thread:"
+		       " %ld\n", PTR_ERR(dumper));
+	else
+		waitq_wait(&wait, TASK_INTERRUPTIBLE);
+
+	/* be sure to tear down even if kthread_run() failed */
+	remove_wait_queue(&debug_ctlwq, &wait);
+	set_current_state(TASK_RUNNING);
+}
+EXPORT_SYMBOL(libcfs_debug_dumplog);
+
+/* Initialise the debug wait queue, console delays, dump path and trace
+ * buffers; returns the result of cfs_tracefile_init(). */
+int libcfs_debug_init(unsigned long bufsize)
+{
+	unsigned int pages = libcfs_debug_mb;
+	int rc;
+
+	init_waitqueue_head(&debug_ctlwq);
+
+	if (libcfs_console_max_delay == 0 ||	/* not set by user or */
+	    libcfs_console_min_delay == 0 ||	/* set to invalid values */
+	    libcfs_console_min_delay >= libcfs_console_max_delay) {
+		libcfs_console_max_delay = CDEBUG_DEFAULT_MAX_DELAY;
+		libcfs_console_min_delay = CDEBUG_DEFAULT_MIN_DELAY;
+	}
+
+	if (libcfs_debug_file_path != NULL) {
+		memset(libcfs_debug_file_path_arr, 0, PATH_MAX);
+		strncpy(libcfs_debug_file_path_arr,
+			libcfs_debug_file_path, PATH_MAX - 1);
+	}
+
+	/* If libcfs_debug_mb is unset or out of range use TCD_MAX_PAGES;
+	 * otherwise split the megabytes across CPUs and convert to pages */
+	if (pages > cfs_trace_max_debug_mb() || pages < num_possible_cpus())
+		pages = TCD_MAX_PAGES;
+	else
+		pages = (pages / num_possible_cpus()) <<
+			(20 - PAGE_CACHE_SHIFT);
+	rc = cfs_tracefile_init(pages);
+	if (rc == 0)
+		libcfs_register_panic_notifier();
+
+	return rc;
+}
+
+int libcfs_debug_cleanup(void)
+{
+	/* tear down in the reverse order of libcfs_debug_init() */
+	libcfs_unregister_panic_notifier();
+	cfs_tracefile_exit();
+	return 0;
+}
+
+int libcfs_debug_clear_buffer(void)
+{
+	cfs_trace_flush_pages();
+	return 0;	/* unconditional: the flush result is not reported */
+}
+
+/* Debug markers, although printed from this S_LNET file, should not be
+ * tagged as S_LNET, so DEBUG_SUBSYSTEM is temporarily S_UNDEFINED. */
+#undef DEBUG_SUBSYSTEM
+#define DEBUG_SUBSYSTEM S_UNDEFINED
+int libcfs_debug_mark_buffer(const char *text)
+{
+	CDEBUG(D_TRACE,"***************************************************\n");
+	LCONSOLE(D_WARNING, "DEBUG MARKER: %s\n", text);
+	CDEBUG(D_TRACE,"***************************************************\n");
+
+	return 0;
+}
+#undef DEBUG_SUBSYSTEM
+#define DEBUG_SUBSYSTEM S_LNET
+
+/* Replace the libcfs_debug mask with @debug_level, logging the new value. */
+void libcfs_debug_set_level(unsigned int debug_level)
+{
+	printk(KERN_WARNING "Lustre: Setting portals debug level to %08x\n",
+	       debug_level);
+	libcfs_debug = debug_level;
+}
+EXPORT_SYMBOL(libcfs_debug_set_level);
+
+long libcfs_log_return(struct libcfs_debug_msg_data *msgdata, long rc)
+{
+	libcfs_debug_msg(msgdata, "Process leaving (rc=%lu : %ld : %lx)\n",
+			 rc, rc, rc);
+	return rc;
+}
+EXPORT_SYMBOL(libcfs_log_return);
+
+void libcfs_log_goto(struct libcfs_debug_msg_data *msgdata, const char *label,
+		     long_ptr_t rc)
+{
+	libcfs_debug_msg(msgdata, "Process leaving via %s (rc=" LPLU " : " LPLD
+			 " : " LPLX ")\n", label, (ulong_ptr_t)rc, rc, rc);
+}
+EXPORT_SYMBOL(libcfs_log_goto);
diff --git a/drivers/staging/lustre/lustre/libcfs/fail.c b/drivers/staging/lustre/lustre/libcfs/fail.c
new file mode 100644
index 000000000000..c54448d69008
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/fail.c
@@ -0,0 +1,137 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please contact Oracle Corporation, Inc., 500 Oracle Parkway, Redwood Shores,
+ * CA 94065 USA or visit www.oracle.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Oracle Corporation, Inc.
+ */
+
+#include <linux/libcfs/libcfs.h>
+
+unsigned long cfs_fail_loc = 0;
+unsigned int cfs_fail_val = 0;
+wait_queue_head_t cfs_race_waitq;
+int cfs_race_state;
+
+EXPORT_SYMBOL(cfs_fail_loc);
+EXPORT_SYMBOL(cfs_fail_val);
+EXPORT_SYMBOL(cfs_race_waitq);
+EXPORT_SYMBOL(cfs_race_state);
+
+int __cfs_fail_check_set(__u32 id, __u32 value, int set)
+{
+	static atomic_t cfs_fail_count = ATOMIC_INIT(0);
+
+	LASSERT(!(id & CFS_FAIL_ONCE));
+
+	if ((cfs_fail_loc & (CFS_FAILED | CFS_FAIL_ONCE)) ==
+	    (CFS_FAILED | CFS_FAIL_ONCE)) {
+		atomic_set(&cfs_fail_count, 0); /* paranoia */
+		return 0;
+	}
+
+	/* CFS_FAIL_RAND: fail on average once per cfs_fail_val calls */
+	if (cfs_fail_loc & CFS_FAIL_RAND) {
+		if (cfs_fail_val < 2 || cfs_rand() % cfs_fail_val > 0)
+			return 0;
+	}
+
+	/* CFS_FAIL_SKIP: let the first cfs_fail_val calls pass, then fail */
+	if (cfs_fail_loc & CFS_FAIL_SKIP) {
+		if (atomic_inc_return(&cfs_fail_count) <= cfs_fail_val)
+			return 0;
+	}
+
+	/* CFS_FAIL_LOC_VALUE: fail only if cfs_fail_val is -1 or == @value */
+	if (set == CFS_FAIL_LOC_VALUE) {
+		if (cfs_fail_val != -1 && cfs_fail_val != value)
+			return 0;
+	}
+
+	/* CFS_FAIL_SOME: fail cfs_fail_val times, overridden by FAIL_ONCE */
+	if (cfs_fail_loc & CFS_FAIL_SOME &&
+	    (!(cfs_fail_loc & CFS_FAIL_ONCE) || cfs_fail_val <= 1)) {
+		int count = atomic_inc_return(&cfs_fail_count);
+
+		if (count >= cfs_fail_val) {
+			set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc);
+			atomic_set(&cfs_fail_count, 0);
+			/* we lost the race to increment the counter */
+			if (count > cfs_fail_val)
+				return 0;
+		}
+	}
+
+	if ((set == CFS_FAIL_LOC_ORSET || set == CFS_FAIL_LOC_RESET) &&
+	    (value & CFS_FAIL_ONCE))
+		set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc);
+	/* Nonzero: CFS_FAILED_BIT was already set, i.e. we lost the race. */
+	if (test_and_set_bit(CFS_FAILED_BIT, &cfs_fail_loc)) {
+		/* If CFS_FAIL_ONCE is set, only one process may fail;
+		 * otherwise several processes can fail at the same time. */
+		if (cfs_fail_loc & CFS_FAIL_ONCE)
+			return 0;
+	}
+
+	switch (set) {
+		case CFS_FAIL_LOC_NOSET:
+		case CFS_FAIL_LOC_VALUE:
+			break;
+		case CFS_FAIL_LOC_ORSET:
+			cfs_fail_loc |= value & ~(CFS_FAILED | CFS_FAIL_ONCE);
+			break;
+		case CFS_FAIL_LOC_RESET:
+			cfs_fail_loc = value;
+			break;
+		default:
+			LASSERTF(0, "called with bad set %u\n", set);
+			break;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL(__cfs_fail_check_set);
+
+/* Sleep uninterruptibly for @ms msec when __cfs_fail_check_set() fires. */
+int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set)
+{
+	int hit = __cfs_fail_check_set(id, value, set);
+
+	if (!hit)
+		return hit;
+
+	CERROR("cfs_fail_timeout id %x sleeping for %dms\n", id, ms);
+	schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE,
+				       cfs_time_seconds(ms) / 1000);
+	set_current_state(TASK_RUNNING);
+	CERROR("cfs_fail_timeout id %x awake\n", id);
+	return hit;
+}
+EXPORT_SYMBOL(__cfs_fail_timeout_set);
diff --git a/drivers/staging/lustre/lustre/libcfs/hash.c b/drivers/staging/lustre/lustre/libcfs/hash.c
new file mode 100644
index 000000000000..231c678f76df
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/hash.c
@@ -0,0 +1,2135 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/hash.c
+ *
+ * Implement a hash class for hash process in lustre system.
+ *
+ * Author: YuZhangyong <yzy@clusterfs.com>
+ *
+ * 2008-08-15: Brian Behlendorf <behlendorf1@llnl.gov>
+ * - Simplified API and improved documentation
+ * - Added per-hash feature flags:
+ *   * CFS_HASH_DEBUG additional validation
+ *   * CFS_HASH_REHASH dynamic rehashing
+ * - Added per-hash statistics
+ * - General performance enhancements
+ *
+ * 2009-07-31: Liang Zhen <zhen.liang@sun.com>
+ * - move all stuff to libcfs
+ * - don't allow cur_bits != max_bits without setting of CFS_HASH_REHASH
+ * - ignore hs_rwlock if without CFS_HASH_REHASH setting
+ * - buckets are allocated one by one (instead of contiguous memory),
+ *   to avoid unnecessary cacheline conflict
+ *
+ * 2010-03-01: Liang Zhen <zhen.liang@sun.com>
+ * - "bucket" is a group of hlist_head now, user can specify bucket size
+ *   by bkt_bits of cfs_hash_create(), all hlist_heads in a bucket share
+ *   one lock for reducing memory overhead.
+ *
+ * - support lockless hash, caller will take care of locks:
+ *   avoid lock overhead for hash tables that are already protected
+ *   by locking in the caller for another reason
+ *
+ * - support both spin_lock/rwlock for bucket:
+ *   overhead of spinlock contention is lower than read/write
+ *   contention of rwlock, so using spinlock to serialize operations on
+ *   bucket is more reasonable for those frequently changed hash tables
+ *
+ * - support one-single lock mode:
+ *   one lock to protect all hash operations to avoid overhead of
+ *   multiple locks if hash table is always small
+ *
+ * - removed a lot of unnecessary addref & decref on hash element:
+ *   addref & decref are atomic operations in many use-cases which
+ *   are expensive.
+ *
+ * - support non-blocking cfs_hash_add() and cfs_hash_findadd():
+ *   some lustre use-cases require these functions to be strictly
+ *   non-blocking, we need to schedule required rehash on a different
+ *   thread on those cases.
+ *
+ * - safer rehash on large hash table
+ *   In old implementation, rehash function will exclusively lock the
+ *   hash table and finish rehash in one batch, it's dangerous on SMP
+ *   system because rehash millions of elements could take long time.
+ *   New implemented rehash can release lock and relax CPU in middle
+ *   of rehash, it's safe for another thread to search/change on the
+ *   hash table even while it is being rehashed.
+ *
+ * - support two different refcount modes
+ *   . hash table has refcount on element
+ *   . hash table doesn't change refcount on adding/removing element
+ *
+ * - support long name hash table (for param-tree)
+ *
+ * - fix a bug for cfs_hash_rehash_key:
+ *   in old implementation, cfs_hash_rehash_key could screw up the
+ *   hash-table because @key is overwritten without any protection.
+ *   Now we need user to define hs_keycpy for those rehash enabled
+ *   hash tables, cfs_hash_rehash_key will overwrite hash-key
+ *   inside lock by calling hs_keycpy.
+ *
+ * - better hash iteration:
+ *   Now we support both locked iteration & lockless iteration of hash
+ *   table. Also, user can break the iteration by return 1 in callback.
+ */
+
+#include <linux/libcfs/libcfs.h>
+
+#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
+static unsigned int warn_on_depth = 8;
+CFS_MODULE_PARM(warn_on_depth, "i", uint, 0644,
+		"warning when hash depth is high.");
+#endif
+
+struct cfs_wi_sched *cfs_sched_rehash;
+
+/* "no lock" flavour: intentionally empty */
+static inline void cfs_hash_nl_lock(cfs_hash_lock_t *lock, int exclusive) {}
+
+/* "no lock" flavour: intentionally empty */
+static inline void cfs_hash_nl_unlock(cfs_hash_lock_t *lock, int exclusive) {}
+
+/* spinlock flavour: @exclusive is ignored, every holder is exclusive */
+static inline void cfs_hash_spin_lock(cfs_hash_lock_t *lock, int exclusive)
+{
+	spin_lock(&lock->spin);
+}
+
+/* spinlock flavour: @exclusive is ignored, see cfs_hash_spin_lock() */
+static inline void cfs_hash_spin_unlock(cfs_hash_lock_t *lock, int exclusive)
+{
+	spin_unlock(&lock->spin);
+}
+
+/* take lock->rw for writing when @exclusive, otherwise for reading */
+static inline void cfs_hash_rw_lock(cfs_hash_lock_t *lock, int exclusive)
+{
+	if (exclusive)
+		write_lock(&lock->rw);
+	else
+		read_lock(&lock->rw);
+}
+
+/* release lock->rw; @exclusive must match the cfs_hash_rw_lock() call */
+static inline void cfs_hash_rw_unlock(cfs_hash_lock_t *lock, int exclusive)
+{
+	if (exclusive)
+		write_unlock(&lock->rw);
+	else
+		read_unlock(&lock->rw);
+}
+
+/** No lock hash: table-level and bucket-level operations are all no-ops,
+ * the caller takes care of any locking */
+static cfs_hash_lock_ops_t cfs_hash_nl_lops = {
+	.hs_lock	= cfs_hash_nl_lock,
+	.hs_unlock      = cfs_hash_nl_unlock,
+	.hs_bkt_lock    = cfs_hash_nl_lock,
+	.hs_bkt_unlock  = cfs_hash_nl_unlock,
+};
+
+/** no bucket lock, one spinlock to protect everything:
+ * the table-level ops take the spinlock, the bucket ops are no-ops */
+static cfs_hash_lock_ops_t cfs_hash_nbl_lops = {
+	.hs_lock	= cfs_hash_spin_lock,
+	.hs_unlock      = cfs_hash_spin_unlock,
+	.hs_bkt_lock    = cfs_hash_nl_lock,
+	.hs_bkt_unlock  = cfs_hash_nl_unlock,
+};
+
+/** spin bucket lock, rehash is enabled:
+ * rwlock at table level, spinlock at bucket level */
+static cfs_hash_lock_ops_t cfs_hash_bkt_spin_lops = {
+	.hs_lock	= cfs_hash_rw_lock,
+	.hs_unlock      = cfs_hash_rw_unlock,
+	.hs_bkt_lock    = cfs_hash_spin_lock,
+	.hs_bkt_unlock  = cfs_hash_spin_unlock,
+};
+
+/** rw bucket lock, rehash is enabled:
+ * rwlock at table level, rwlock at bucket level */
+static cfs_hash_lock_ops_t cfs_hash_bkt_rw_lops = {
+	.hs_lock	= cfs_hash_rw_lock,
+	.hs_unlock      = cfs_hash_rw_unlock,
+	.hs_bkt_lock    = cfs_hash_rw_lock,
+	.hs_bkt_unlock  = cfs_hash_rw_unlock,
+};
+
+/** spin bucket lock, rehash is disabled:
+ * no-op at table level, spinlock at bucket level */
+static cfs_hash_lock_ops_t cfs_hash_nr_bkt_spin_lops = {
+	.hs_lock	= cfs_hash_nl_lock,
+	.hs_unlock      = cfs_hash_nl_unlock,
+	.hs_bkt_lock    = cfs_hash_spin_lock,
+	.hs_bkt_unlock  = cfs_hash_spin_unlock,
+};
+
+/** rw bucket lock, rehash is disabled:
+ * no-op at table level, rwlock at bucket level */
+static cfs_hash_lock_ops_t cfs_hash_nr_bkt_rw_lops = {
+	.hs_lock	= cfs_hash_nl_lock,
+	.hs_unlock      = cfs_hash_nl_unlock,
+	.hs_bkt_lock    = cfs_hash_rw_lock,
+	.hs_bkt_unlock  = cfs_hash_rw_unlock,
+};
+
+/* Select hs->hs_lops from the hash's lock flags and initialise hs->hs_lock. */
+static void cfs_hash_lock_setup(cfs_hash_t *hs)
+{
+	int rehash;
+
+	if (cfs_hash_with_no_lock(hs)) {
+		hs->hs_lops = &cfs_hash_nl_lops;
+		return;
+	}
+	if (cfs_hash_with_no_bktlock(hs)) {
+		hs->hs_lops = &cfs_hash_nbl_lops;
+		spin_lock_init(&hs->hs_lock.spin);
+		return;
+	}
+
+	/* the table-level rwlock is only initialised when rehash is enabled */
+	rehash = cfs_hash_with_rehash(hs);
+	if (rehash)
+		rwlock_init(&hs->hs_lock.rw);
+	if (cfs_hash_with_rw_bktlock(hs))
+		hs->hs_lops = rehash ? &cfs_hash_bkt_rw_lops :
+				       &cfs_hash_nr_bkt_rw_lops;
+	else if (cfs_hash_with_spin_bktlock(hs))
+		hs->hs_lops = rehash ? &cfs_hash_bkt_spin_lops :
+				       &cfs_hash_nr_bkt_spin_lops;
+	else
+		LBUG();
+}
+
+/**
+ * Simple hash head without depth tracking
+ * new element is always added to head of hlist
+ */
+typedef struct {
+	struct hlist_head	hh_head;	/**< entries list */
+} cfs_hash_head_t;
+
+static int
+cfs_hash_hh_hhead_size(cfs_hash_t *hs)
+{
+	return sizeof(cfs_hash_head_t);
+}
+
+static struct hlist_head *
+cfs_hash_hh_hhead(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+	cfs_hash_head_t *head = (cfs_hash_head_t *)&bd->bd_bucket->hsb_head[0];
+
+	return &head[bd->bd_offset].hh_head;
+}
+
+static int
+cfs_hash_hh_hnode_add(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+		      struct hlist_node *hnode)
+{
+	hlist_add_head(hnode, cfs_hash_hh_hhead(hs, bd));
+	return -1; /* unknown depth */
+}
+
+static int
+cfs_hash_hh_hnode_del(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+		      struct hlist_node *hnode)
+{
+	hlist_del_init(hnode);
+	return -1; /* unknown depth */
+}
+
+/**
+ * Simple hash head with depth tracking
+ * new element is always added to head of hlist
+ */
+typedef struct {
+	struct hlist_head	hd_head;	/**< entries list */
+	unsigned int	    hd_depth;       /**< list length */
+} cfs_hash_head_dep_t;
+
+static int
+cfs_hash_hd_hhead_size(cfs_hash_t *hs)
+{
+	return sizeof(cfs_hash_head_dep_t);
+}
+
+static struct hlist_head *
+cfs_hash_hd_hhead(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+	cfs_hash_head_dep_t   *head;
+
+	head = (cfs_hash_head_dep_t *)&bd->bd_bucket->hsb_head[0];
+	return &head[bd->bd_offset].hd_head;
+}
+
+static int
+cfs_hash_hd_hnode_add(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+		      struct hlist_node *hnode)
+{
+	cfs_hash_head_dep_t *hh = container_of(cfs_hash_hd_hhead(hs, bd),
+					       cfs_hash_head_dep_t, hd_head);
+	hlist_add_head(hnode, &hh->hd_head);
+	return ++hh->hd_depth;
+}
+
+static int
+cfs_hash_hd_hnode_del(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+		      struct hlist_node *hnode)
+{
+	cfs_hash_head_dep_t *hh = container_of(cfs_hash_hd_hhead(hs, bd),
+					       cfs_hash_head_dep_t, hd_head);
+	hlist_del_init(hnode);
+	return --hh->hd_depth;
+}
+
+/**
+ * double links hash head without depth tracking
+ * new element is always added to tail of hlist
+ */
+typedef struct {
+	struct hlist_head	dh_head;	/**< entries list */
+	struct hlist_node       *dh_tail;	/**< the last entry */
+} cfs_hash_dhead_t;
+
+static int
+cfs_hash_dh_hhead_size(cfs_hash_t *hs)
+{
+	return sizeof(cfs_hash_dhead_t);
+}
+
+static struct hlist_head *
+cfs_hash_dh_hhead(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+	cfs_hash_dhead_t *head;
+
+	head = (cfs_hash_dhead_t *)&bd->bd_bucket->hsb_head[0];
+	return &head[bd->bd_offset].dh_head;
+}
+
+/* append @hnode after the cached tail, or as head of an empty list */
+static int cfs_hash_dh_hnode_add(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+				 struct hlist_node *hnode)
+{
+	cfs_hash_dhead_t *dh = container_of(cfs_hash_dh_hhead(hs, bd),
+					    cfs_hash_dhead_t, dh_head);
+
+	if (dh->dh_tail == NULL) /* empty list */
+		hlist_add_head(hnode, &dh->dh_head);
+	else /* not empty */
+		hlist_add_after(dh->dh_tail, hnode);
+	dh->dh_tail = hnode;
+	return -1; /* unknown depth */
+}
+
+/* unlink @hnd; if it was the tail, move dh_tail to its predecessor or NULL */
+static int cfs_hash_dh_hnode_del(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+				 struct hlist_node *hnd)
+{
+	cfs_hash_dhead_t *dh = container_of(cfs_hash_dh_hhead(hs, bd),
+					    cfs_hash_dhead_t, dh_head);
+
+	if (hnd->next == NULL) { /* it's the tail */
+		dh->dh_tail = (hnd->pprev == &dh->dh_head.first) ? NULL :
+			      container_of(hnd->pprev, struct hlist_node, next);
+	}
+	hlist_del_init(hnd);
+	return -1; /* unknown depth */
+}
+
+/**
+ * double links hash head with depth tracking
+ * new element is always added to tail of hlist
+ */
+typedef struct {
+	struct hlist_head	dd_head;	/**< entries list */
+	struct hlist_node       *dd_tail;	/**< the last entry */
+	unsigned int	    dd_depth;       /**< list length */
+} cfs_hash_dhead_dep_t;
+
+static int
+cfs_hash_dd_hhead_size(cfs_hash_t *hs)
+{
+	return sizeof(cfs_hash_dhead_dep_t);
+}
+
+static struct hlist_head *
+cfs_hash_dd_hhead(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+	cfs_hash_dhead_dep_t *head;
+
+	head = (cfs_hash_dhead_dep_t *)&bd->bd_bucket->hsb_head[0];
+	return &head[bd->bd_offset].dd_head;
+}
+
+/* append @hnode after the cached tail and return the new list depth */
+static int cfs_hash_dd_hnode_add(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+				 struct hlist_node *hnode)
+{
+	cfs_hash_dhead_dep_t *dh = container_of(cfs_hash_dd_hhead(hs, bd),
+						cfs_hash_dhead_dep_t, dd_head);
+
+	if (dh->dd_tail == NULL) /* empty list */
+		hlist_add_head(hnode, &dh->dd_head);
+	else /* not empty */
+		hlist_add_after(dh->dd_tail, hnode);
+	dh->dd_tail = hnode;
+	return ++dh->dd_depth;
+}
+
+/* unlink @hnd, fix dd_tail if it was the tail, return the new list depth */
+static int cfs_hash_dd_hnode_del(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+				 struct hlist_node *hnd)
+{
+	cfs_hash_dhead_dep_t *dh = container_of(cfs_hash_dd_hhead(hs, bd),
+						cfs_hash_dhead_dep_t, dd_head);
+
+	if (hnd->next == NULL) { /* it's the tail */
+		dh->dd_tail = (hnd->pprev == &dh->dd_head.first) ? NULL :
+			      container_of(hnd->pprev, struct hlist_node, next);
+	}
+	hlist_del_init(hnd);
+	return --dh->dd_depth;
+}
+
+static cfs_hash_hlist_ops_t cfs_hash_hh_hops = {
+       .hop_hhead      = cfs_hash_hh_hhead,
+       .hop_hhead_size = cfs_hash_hh_hhead_size,
+       .hop_hnode_add  = cfs_hash_hh_hnode_add,
+       .hop_hnode_del  = cfs_hash_hh_hnode_del,
+};
+
+static cfs_hash_hlist_ops_t cfs_hash_hd_hops = {
+       .hop_hhead      = cfs_hash_hd_hhead,
+       .hop_hhead_size = cfs_hash_hd_hhead_size,
+       .hop_hnode_add  = cfs_hash_hd_hnode_add,
+       .hop_hnode_del  = cfs_hash_hd_hnode_del,
+};
+
+static cfs_hash_hlist_ops_t cfs_hash_dh_hops = {
+       .hop_hhead      = cfs_hash_dh_hhead,
+       .hop_hhead_size = cfs_hash_dh_hhead_size,
+       .hop_hnode_add  = cfs_hash_dh_hnode_add,
+       .hop_hnode_del  = cfs_hash_dh_hnode_del,
+};
+
+static cfs_hash_hlist_ops_t cfs_hash_dd_hops = {
+       .hop_hhead      = cfs_hash_dd_hhead,
+       .hop_hhead_size = cfs_hash_dd_hhead_size,
+       .hop_hnode_add  = cfs_hash_dd_hnode_add,
+       .hop_hnode_del  = cfs_hash_dd_hnode_del,
+};
+
+/* Pick the list-head ops: add-to-tail (d*) or add-to-head (h*) flavour,
+ * each with (dd/hd) or without (dh/hh) depth tracking. */
+static void cfs_hash_hlist_setup(cfs_hash_t *hs)
+{
+	if (cfs_hash_with_depth(hs))
+		hs->hs_hops = cfs_hash_with_add_tail(hs) ?
+			      &cfs_hash_dd_hops : &cfs_hash_hd_hops;
+	else
+		hs->hs_hops = cfs_hash_with_add_tail(hs) ?
+			      &cfs_hash_dh_hops : &cfs_hash_hh_hops;
+}
+
+/* hash's low (bits - hs_bkt_bits) bits pick the bucket, the rest the offset */
+static void
+cfs_hash_bd_from_key(cfs_hash_t *hs, cfs_hash_bucket_t **bkts,
+		     unsigned int bits, const void *key, cfs_hash_bd_t *bd)
+{
+	unsigned int index = cfs_hash_id(hs, key, (1U << bits) - 1);
+
+	LASSERT(bits == hs->hs_cur_bits || bits == hs->hs_rehash_bits);
+	bd->bd_bucket = bkts[index & ((1U << (bits - hs->hs_bkt_bits)) - 1)];
+	bd->bd_offset = index >> (bits - hs->hs_bkt_bits);
+}
+
+/* Resolve @key to a bucket descriptor, via the rehash table if one exists. */
+void cfs_hash_bd_get(cfs_hash_t *hs, const void *key, cfs_hash_bd_t *bd)
+{
+	/* NB: caller should hold hs->hs_lock if REHASH is set */
+	if (likely(hs->hs_rehash_buckets == NULL)) {
+		cfs_hash_bd_from_key(hs, hs->hs_buckets,
+				     hs->hs_cur_bits, key, bd);
+	} else {
+		LASSERT(hs->hs_rehash_bits != 0);
+		cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets,
+				     hs->hs_rehash_bits, key, bd);
+	}
+}
+EXPORT_SYMBOL(cfs_hash_bd_get);
+
+/* record @dep_cur if it exceeds the bucket's (and, in debug, the hash's) max */
+static inline void
+cfs_hash_bd_dep_record(cfs_hash_t *hs, cfs_hash_bd_t *bd, int dep_cur)
+{
+	if (likely(dep_cur <= bd->bd_bucket->hsb_depmax))
+		return;
+	bd->bd_bucket->hsb_depmax = dep_cur;
+# if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
+	if (likely(warn_on_depth == 0 ||
+		   max(warn_on_depth, hs->hs_dep_max) >= dep_cur))
+		return;
+
+	spin_lock(&hs->hs_dep_lock);
+	hs->hs_dep_max  = dep_cur;
+	hs->hs_dep_bkt  = bd->bd_bucket->hsb_index;
+	hs->hs_dep_off  = bd->bd_offset;
+	hs->hs_dep_bits = hs->hs_cur_bits;
+	spin_unlock(&hs->hs_dep_lock);
+
+	cfs_wi_schedule(cfs_sched_rehash, &hs->hs_dep_wi);
+# endif
+}
+
+/* Link @hnode into @bd's list and update bucket and hash accounting. */
+void cfs_hash_bd_add_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			    struct hlist_node *hnode)
+{
+	cfs_hash_bucket_t *bkt = bd->bd_bucket;
+	int depth = hs->hs_hops->hop_hnode_add(hs, bd, hnode);
+
+	cfs_hash_bd_dep_record(hs, bd, depth);
+	bkt->hsb_version++;
+	if (unlikely(bkt->hsb_version == 0))	/* 0 is skipped on wrap */
+		bkt->hsb_version++;
+	bkt->hsb_count++;
+
+	if (cfs_hash_with_counter(hs))
+		atomic_inc(&hs->hs_count);
+	if (!cfs_hash_with_no_itemref(hs))
+		cfs_hash_get(hs, hnode);
+}
+EXPORT_SYMBOL(cfs_hash_bd_add_locked);
+
+/* Unlink @hnode from @bd (bucket lock held) and drop counters and refs. */
+void cfs_hash_bd_del_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			    struct hlist_node *hnode)
+{
+	cfs_hash_bucket_t *bkt = bd->bd_bucket;
+
+	hs->hs_hops->hop_hnode_del(hs, bd, hnode);
+	LASSERT(bkt->hsb_count > 0);
+	bkt->hsb_count--;
+	if (unlikely(++bkt->hsb_version == 0)) /* version is never zero */
+		bkt->hsb_version = 1;
+
+	if (cfs_hash_with_counter(hs)) {
+		LASSERT(atomic_read(&hs->hs_count) > 0);
+		atomic_dec(&hs->hs_count);
+	}
+	if (!cfs_hash_with_no_itemref(hs))
+		cfs_hash_put_locked(hs, hnode);
+}
+EXPORT_SYMBOL(cfs_hash_bd_del_locked);
+
+/* Re-link @hnode from @bd_old into @bd_new.  The table-wide item count and
+ * the item's refcount are not touched, only the per-bucket bookkeeping. */
+void cfs_hash_bd_move_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd_old,
+			     cfs_hash_bd_t *bd_new, struct hlist_node *hnode)
+{
+	cfs_hash_bucket_t *src = bd_old->bd_bucket;
+	cfs_hash_bucket_t *dst = bd_new->bd_bucket;
+	int depth;
+
+	if (cfs_hash_bd_compare(bd_old, bd_new) == 0)
+		return;
+
+	/* call the hlist ops directly instead of cfs_hash_bd_del/add_locked,
+	 * which saves the atomic counter and item refcount operations */
+	hs->hs_hops->hop_hnode_del(hs, bd_old, hnode);
+	depth = hs->hs_hops->hop_hnode_add(hs, bd_new, hnode);
+	cfs_hash_bd_dep_record(hs, bd_new, depth);
+
+	LASSERT(src->hsb_count > 0);
+	src->hsb_count--;
+	if (unlikely(++src->hsb_version == 0))
+		src->hsb_version = 1;
+
+	dst->hsb_count++;
+	if (unlikely(++dst->hsb_version == 0))
+		dst->hsb_version = 1;
+}
+EXPORT_SYMBOL(cfs_hash_bd_move_locked);
+
+enum {
+	/** always set, so that a valid intent is never zero */
+	CFS_HS_LOOKUP_MASK_FIND     = 1 << 0,
+	/** take a reference on the entry that is returned */
+	CFS_HS_LOOKUP_MASK_REF      = 1 << 1,
+	/** add the caller's entry if no match exists */
+	CFS_HS_LOOKUP_MASK_ADD      = 1 << 2,
+	/** delete the matching entry; no reference is taken on it */
+	CFS_HS_LOOKUP_MASK_DEL      = 1 << 3,
+};
+
+typedef enum cfs_hash_lookup_intent {
+	/** return the item without taking a reference */
+	CFS_HS_LOOKUP_IT_PEEK       = CFS_HS_LOOKUP_MASK_FIND,
+	/** return the item with a reference taken */
+	CFS_HS_LOOKUP_IT_FIND       = (CFS_HS_LOOKUP_MASK_FIND |
+				       CFS_HS_LOOKUP_MASK_REF),
+	/** return the existing item w/o reference, otherwise add */
+	CFS_HS_LOOKUP_IT_ADD	= (CFS_HS_LOOKUP_MASK_FIND |
+				       CFS_HS_LOOKUP_MASK_ADD),
+	/** return the existing item with a reference, otherwise add */
+	CFS_HS_LOOKUP_IT_FINDADD    = (CFS_HS_LOOKUP_IT_FIND |
+				       CFS_HS_LOOKUP_MASK_ADD),
+	/** delete the item if it exists */
+	CFS_HS_LOOKUP_IT_FINDDEL    = (CFS_HS_LOOKUP_MASK_FIND |
+				       CFS_HS_LOOKUP_MASK_DEL)
+} cfs_hash_lookup_intent_t;
+
+/*
+ * Core lookup on one bucket descriptor: find @key in @bd and then peek, take a
+ * reference, delete, or add @hnode on a miss, as selected by @intent.  Doing
+ * all of it in one pass avoids refcount ops, which are mostly costly atomics.
+ */
+static struct hlist_node *
+cfs_hash_bd_lookup_intent(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			  const void *key, struct hlist_node *hnode,
+			  cfs_hash_lookup_intent_t intent)
+{
+	struct hlist_head *head = cfs_hash_bd_hhead(hs, bd);
+	struct hlist_node *cur;
+	struct hlist_node *wanted = NULL;
+	int adding = (intent & CFS_HS_LOOKUP_MASK_ADD) != 0;
+
+	/* without ADD, a non-NULL @hnode restricts the match to that node */
+	if (!adding)
+		wanted = hnode;
+
+	hlist_for_each(cur, head) {
+		if (!cfs_hash_keycmp(hs, key, cur) ||
+		    (wanted != NULL && wanted != cur))
+			continue;
+
+		/* matched: delete it, or hand it back with/without a ref */
+		if ((intent & CFS_HS_LOOKUP_MASK_DEL) != 0)
+			cfs_hash_bd_del_locked(hs, bd, cur);
+		else if ((intent & CFS_HS_LOOKUP_MASK_REF) != 0)
+			cfs_hash_get(hs, cur);
+		return cur;
+	}
+
+	if (!adding) /* no matching item and nothing to insert */
+		return NULL;
+
+	/* miss with ADD intent: insert the caller's node */
+	LASSERT(hnode != NULL);
+	cfs_hash_bd_add_locked(hs, bd, hnode);
+	return hnode;
+}
+
+struct hlist_node *	/* match with a reference taken, or NULL */
+cfs_hash_bd_lookup_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd, const void *key)
+{
+	return cfs_hash_bd_lookup_intent(hs, bd, key, NULL,
+					 CFS_HS_LOOKUP_IT_FIND);
+}
+EXPORT_SYMBOL(cfs_hash_bd_lookup_locked);
+
+struct hlist_node *	/* match without an extra reference, or NULL */
+cfs_hash_bd_peek_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd, const void *key)
+{
+	return cfs_hash_bd_lookup_intent(hs, bd, key, NULL,
+					 CFS_HS_LOOKUP_IT_PEEK);
+}
+EXPORT_SYMBOL(cfs_hash_bd_peek_locked);
+
+/* Existing match for @key (referenced unless @noref), else insert @hnode. */
+struct hlist_node *
+cfs_hash_bd_findadd_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd, const void *key,
+			   struct hlist_node *hnode, int noref)
+{
+	int it = CFS_HS_LOOKUP_IT_ADD | (noref ? 0 : CFS_HS_LOOKUP_MASK_REF);
+
+	return cfs_hash_bd_lookup_intent(hs, bd, key, hnode, it);
+}
+EXPORT_SYMBOL(cfs_hash_bd_findadd_locked);
+
+struct hlist_node *	/* the node that was unlinked, or NULL */
+cfs_hash_bd_finddel_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			   const void *key, struct hlist_node *hnode)
+{
+	/* @hnode may be NULL; then the first item matching @key is deleted */
+	return cfs_hash_bd_lookup_intent(hs, bd, key, hnode,
+					 CFS_HS_LOOKUP_IT_FINDDEL);
+}
+EXPORT_SYMBOL(cfs_hash_bd_finddel_locked);
+
+/*
+ * Lock each distinct bucket in @bds once.  bds must be in ascending order of
+ * bd->bd_bucket->hsb_index; several bds may share a bucket (with different
+ * bd::bd_offset), which is skipped here so that we don't deadlock on it.
+ */
+static void
+cfs_hash_multi_bd_lock(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+		       unsigned n, int excl)
+{
+	cfs_hash_bucket_t *prev = NULL;
+	unsigned	   i; /* unsigned, like @n it is compared against */
+
+	cfs_hash_for_each_bd(bds, n, i) {
+		if (prev == bds[i].bd_bucket)
+			continue;
+
+		LASSERT(prev == NULL ||
+			prev->hsb_index < bds[i].bd_bucket->hsb_index);
+		cfs_hash_bd_lock(hs, &bds[i], excl);
+		prev = bds[i].bd_bucket;
+	}
+}
+
+/* Unlock each distinct bucket in @bds once; mirror of cfs_hash_multi_bd_lock */
+static void cfs_hash_multi_bd_unlock(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+				     unsigned n, int excl)
+{
+	cfs_hash_bucket_t *prev = NULL;
+	unsigned	   i; /* unsigned, like @n it is compared against */
+
+	cfs_hash_for_each_bd(bds, n, i) {
+		if (prev == bds[i].bd_bucket)
+			continue; /* same bucket as the previous bd */
+		cfs_hash_bd_unlock(hs, &bds[i], excl);
+		prev = bds[i].bd_bucket;
+	}
+}
+
+/* Search the bds in order; the first match is returned with a reference. */
+static struct hlist_node *
+cfs_hash_multi_bd_lookup_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+				unsigned n, const void *key)
+{
+	struct hlist_node *found;
+	unsigned idx;
+
+	cfs_hash_for_each_bd(bds, n, idx) {
+		found = cfs_hash_bd_lookup_locked(hs, &bds[idx], key);
+		if (found != NULL)
+			return found;
+	}
+	return NULL;
+}
+
+/* Return the match for @key found in @bds, else add @hnode and return it. */
+static struct hlist_node *
+cfs_hash_multi_bd_findadd_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+				 unsigned n, const void *key,
+				 struct hlist_node *hnode, int noref)
+{
+	struct hlist_node *found;
+	cfs_hash_bd_t target;
+	int intent = CFS_HS_LOOKUP_IT_PEEK;
+	unsigned idx;
+
+	LASSERT(hnode != NULL);
+	if (!noref)
+		intent |= CFS_HS_LOOKUP_MASK_REF;
+
+	cfs_hash_for_each_bd(bds, n, idx) {
+		found = cfs_hash_bd_lookup_intent(hs, &bds[idx], key,
+						  NULL, intent);
+		if (found != NULL)
+			return found;
+	}
+
+	if (idx == 1) { /* only one bucket */
+		cfs_hash_bd_add_locked(hs, &bds[0], hnode);
+	} else {
+		cfs_hash_bd_get(hs, key, &target);
+		cfs_hash_bd_add_locked(hs, &target, hnode);
+	}
+	return hnode;
+}
+
+/* Delete the first node matching @key (and @hnode, if given) found in @bds. */
+static struct hlist_node *
+cfs_hash_multi_bd_finddel_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+				 unsigned n, const void *key,
+				 struct hlist_node *hnode)
+{
+	struct hlist_node *gone;
+	unsigned idx;
+
+	cfs_hash_for_each_bd(bds, n, idx) {
+		gone = cfs_hash_bd_finddel_locked(hs, &bds[idx], key, hnode);
+		if (gone != NULL)
+			return gone;
+	}
+	return NULL;
+}
+
+/* Normalize a bd pair so that a valid entry comes first, the two are ordered
+ * per cfs_hash_bd_compare(), and @bd2 is cleared (NULL bucket) if it equals
+ * @bd1. */
+static void cfs_hash_bd_order(cfs_hash_bd_t *bd1, cfs_hash_bd_t *bd2)
+{
+	cfs_hash_bd_t saved;
+	int cmp;
+
+	if (bd2->bd_bucket == NULL)
+		return;
+
+	if (bd1->bd_bucket == NULL) { /* only @bd2 is valid: move it up */
+		*bd1 = *bd2;
+		bd2->bd_bucket = NULL;
+		return;
+	}
+
+	cmp = cfs_hash_bd_compare(bd1, bd2);
+	if (cmp > 0) { /* swap bd1 and bd2 */
+		saved = *bd1;
+		*bd1 = *bd2;
+		*bd2 = saved;
+	} else if (cmp == 0) {
+		bd2->bd_bucket = NULL;
+	}
+}
+
+/* Get the bd(s) @key may live in: current table and, if any, rehash table.
+ * NB: caller should hold hs_lock.rw if REHASH is set */
+void cfs_hash_dual_bd_get(cfs_hash_t *hs, const void *key, cfs_hash_bd_t *bds)
+{
+	cfs_hash_bd_t *cur = &bds[0];
+	cfs_hash_bd_t *alt = &bds[1];
+
+	cfs_hash_bd_from_key(hs, hs->hs_buckets, hs->hs_cur_bits, key, cur);
+	if (likely(hs->hs_rehash_buckets == NULL)) {
+		alt->bd_bucket = NULL; /* no rehash or not rehashing */
+		return;
+	}
+
+	LASSERT(hs->hs_rehash_bits != 0);
+	cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets,
+			     hs->hs_rehash_bits, key, alt);
+	cfs_hash_bd_order(cur, alt);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_get);
+
+/* Lock the buckets of the bd pair @bds via cfs_hash_multi_bd_lock(). */
+void cfs_hash_dual_bd_lock(cfs_hash_t *hs, cfs_hash_bd_t *bds, int excl)
+{
+	cfs_hash_multi_bd_lock(hs, bds, 2, excl);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_lock);
+
+/* Unlock the buckets of the bd pair @bds via cfs_hash_multi_bd_unlock(). */
+void cfs_hash_dual_bd_unlock(cfs_hash_t *hs, cfs_hash_bd_t *bds, int excl)
+{
+	cfs_hash_multi_bd_unlock(hs, bds, 2, excl);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_unlock);
+
+/* Look @key up in the bd pair @bds; a match is returned with a reference. */
+struct hlist_node *cfs_hash_dual_bd_lookup_locked(cfs_hash_t *hs,
+				cfs_hash_bd_t *bds, const void *key)
+{
+	return cfs_hash_multi_bd_lookup_locked(hs, bds, 2, key);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_lookup_locked);
+
+/* Find @key in the bd pair (match referenced unless @noref) or add @hnode. */
+struct hlist_node *
+cfs_hash_dual_bd_findadd_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+				const void *key, struct hlist_node *hnode,
+				int noref)
+{
+	return cfs_hash_multi_bd_findadd_locked(hs, bds, 2, key, hnode, noref);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_findadd_locked);
+
+struct hlist_node *	/* the node that was unlinked, or NULL */
+cfs_hash_dual_bd_finddel_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+				const void *key, struct hlist_node *hnode)
+{
+	return cfs_hash_multi_bd_finddel_locked(hs, bds, 2, key, hnode);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_finddel_locked);
+
+/* Free buckets[prev_size..size-1] (skipping NULL slots), then the array. */
+static void cfs_hash_buckets_free(cfs_hash_bucket_t **buckets,
+				  int bkt_size, int prev_size, int size)
+{
+	int slot = prev_size;
+
+	while (slot < size) {
+		if (buckets[slot] != NULL)
+			LIBCFS_FREE(buckets[slot], bkt_size);
+		slot++;
+	}
+	LIBCFS_FREE(buckets, sizeof(buckets[0]) * size);
+}
+
+/*
+ * Create or grow bucket memory. Return old_bkts if no allocation was
+ * needed, the newly allocated array if allocation was needed and
+ * successful, and NULL on error (old buckets are then left untouched).
+ */
+static cfs_hash_bucket_t **
+cfs_hash_buckets_realloc(cfs_hash_t *hs, cfs_hash_bucket_t **old_bkts,
+			 unsigned int old_size, unsigned int new_size)
+{
+	cfs_hash_bucket_t **new_bkts;
+	unsigned int	 i; /* unsigned, like the sizes it is compared to */
+
+	LASSERT(old_size == 0 || old_bkts != NULL);
+
+	if (old_bkts != NULL && old_size == new_size)
+		return old_bkts;
+
+	LIBCFS_ALLOC(new_bkts, sizeof(new_bkts[0]) * new_size);
+	if (new_bkts == NULL)
+		return NULL;
+
+	if (old_bkts != NULL) { /* carry the existing bucket pointers over */
+		memcpy(new_bkts, old_bkts,
+		       min(old_size, new_size) * sizeof(*old_bkts));
+	}
+
+	for (i = old_size; i < new_size; i++) { /* set up the added buckets */
+		struct hlist_head *hhead;
+		cfs_hash_bd_t     bd;
+
+		LIBCFS_ALLOC(new_bkts[i], cfs_hash_bkt_size(hs));
+		if (new_bkts[i] == NULL) { /* free new buckets and the array */
+			cfs_hash_buckets_free(new_bkts, cfs_hash_bkt_size(hs),
+					      old_size, new_size);
+			return NULL;
+		}
+
+		new_bkts[i]->hsb_index   = i;
+		new_bkts[i]->hsb_version = 1;  /* shouldn't be zero */
+		new_bkts[i]->hsb_depmax  = -1; /* unknown */
+		bd.bd_bucket = new_bkts[i];
+		cfs_hash_bd_for_each_hlist(hs, &bd, hhead)
+			INIT_HLIST_HEAD(hhead);
+
+		if (cfs_hash_with_no_lock(hs) ||
+		    cfs_hash_with_no_bktlock(hs))
+			continue; /* no per-bucket lock to initialize */
+
+		if (cfs_hash_with_rw_bktlock(hs))
+			rwlock_init(&new_bkts[i]->hsb_lock.rw);
+		else if (cfs_hash_with_spin_bktlock(hs))
+			spin_lock_init(&new_bkts[i]->hsb_lock.spin);
+		else
+			LBUG(); /* invalid use-case */
+	}
+	return new_bkts;
+}
+
+/**
+ * Initialize new libcfs hash (see cfs_hash_create() below), where:
+ * @name     - Descriptive hash name
+ * @cur_bits - Initial hash table size, in bits
+ * @max_bits - Maximum allowed hash table resize, in bits
+ * @ops      - Registered hash table operations
+ * @flags    - CFS_HASH_REHASH enable dynamic hash resizing
+ *	   - CFS_HASH_SORT enable chained hash sort
+ */
+static int cfs_hash_rehash_worker(cfs_workitem_t *wi);
+
+#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
+/* Workitem callback: print the depth record kept in @hs on the console, then
+ * zero hs_dep_bits to mark the workitem as done. */
+static int cfs_hash_dep_print(cfs_workitem_t *wi)
+{
+	cfs_hash_t *hs = container_of(wi, cfs_hash_t, hs_dep_wi);
+	int max_depth, bucket, offset, nbits;
+
+	/* snapshot the record under its lock */
+	spin_lock(&hs->hs_dep_lock);
+	max_depth = hs->hs_dep_max;
+	bucket = hs->hs_dep_bkt;
+	offset = hs->hs_dep_off;
+	nbits = hs->hs_dep_bits;
+	spin_unlock(&hs->hs_dep_lock);
+
+	LCONSOLE_WARN("#### HASH %s (bits: %d): max depth %d at bucket %d/%d\n",
+		      hs->hs_name, nbits, max_depth, bucket, offset);
+	spin_lock(&hs->hs_dep_lock);
+	hs->hs_dep_bits = 0; /* mark as workitem done */
+	spin_unlock(&hs->hs_dep_lock);
+	return 0;
+}
+
+static void cfs_hash_depth_wi_init(cfs_hash_t *hs)
+{
+	cfs_wi_init(&hs->hs_dep_wi, hs, cfs_hash_dep_print); /* depth report */
+	spin_lock_init(&hs->hs_dep_lock); /* guards the hs_dep_* record */
+}
+
+static void cfs_hash_depth_wi_cancel(cfs_hash_t *hs)
+{
+	if (cfs_wi_deschedule(cfs_sched_rehash, &hs->hs_dep_wi))
+		return; /* NOTE(review): presumably "cancelled before it ran" */
+
+	spin_lock(&hs->hs_dep_lock);
+	while (hs->hs_dep_bits != 0) { /* zeroed by cfs_hash_dep_print() */
+		spin_unlock(&hs->hs_dep_lock);
+		cond_resched(); /* poll with the lock dropped */
+		spin_lock(&hs->hs_dep_lock);
+	}
+	spin_unlock(&hs->hs_dep_lock);
+}
+
+#else /* CFS_HASH_DEBUG_LEVEL < CFS_HASH_DEBUG_1 */
+
+static inline void cfs_hash_depth_wi_init(cfs_hash_t *hs) {} /* no-op stub */
+static inline void cfs_hash_depth_wi_cancel(cfs_hash_t *hs) {} /* no-op stub */
+
+#endif /* CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 */
+
+/* Allocate and initialize a hash table; returns NULL if allocation fails. */
+cfs_hash_t *cfs_hash_create(char *name, unsigned cur_bits, unsigned max_bits,
+			    unsigned bkt_bits, unsigned extra_bytes,
+			    unsigned min_theta, unsigned max_theta,
+			    cfs_hash_ops_t *ops, unsigned flags)
+{
+	cfs_hash_t *hs;
+	int	 len;
+
+	ENTRY;
+
+	CLASSERT(CFS_HASH_THETA_BITS < 15);
+
+	LASSERT(name != NULL);
+	LASSERT(ops != NULL);
+	LASSERT(ops->hs_key);
+	LASSERT(ops->hs_hash);
+	LASSERT(ops->hs_object);
+	LASSERT(ops->hs_keycmp);
+	LASSERT(ops->hs_get != NULL);
+	LASSERT(ops->hs_put_locked != NULL);
+
+	if ((flags & CFS_HASH_REHASH) != 0)
+		flags |= CFS_HASH_COUNTER; /* must have counter */
+
+	LASSERT(cur_bits > 0);
+	LASSERT(cur_bits >= bkt_bits);
+	LASSERT(max_bits >= cur_bits && max_bits < 31);
+	LASSERT(ergo((flags & CFS_HASH_REHASH) == 0, cur_bits == max_bits));
+	LASSERT(ergo((flags & CFS_HASH_REHASH) != 0,
+		     (flags & CFS_HASH_NO_LOCK) == 0));
+	LASSERT(ergo((flags & CFS_HASH_REHASH_KEY) != 0,
+		      ops->hs_keycpy != NULL));
+
+	len = (flags & CFS_HASH_BIGNAME) == 0 ?
+	      CFS_HASH_NAME_LEN : CFS_HASH_BIGNAME_LEN;
+	LIBCFS_ALLOC(hs, offsetof(cfs_hash_t, hs_name[len]));
+	if (hs == NULL)
+		RETURN(NULL);
+
+	strncpy(hs->hs_name, name, len);
+	hs->hs_name[len - 1] = '\0'; /* strncpy may leave it unterminated */
+	hs->hs_flags = flags;
+
+	atomic_set(&hs->hs_refcount, 1);
+	atomic_set(&hs->hs_count, 0);
+
+	cfs_hash_lock_setup(hs);
+	cfs_hash_hlist_setup(hs);
+
+	hs->hs_cur_bits = (__u8)cur_bits;
+	hs->hs_min_bits = (__u8)cur_bits;
+	hs->hs_max_bits = (__u8)max_bits;
+	hs->hs_bkt_bits = (__u8)bkt_bits;
+
+	hs->hs_ops	 = ops;
+	hs->hs_extra_bytes = extra_bytes;
+	hs->hs_rehash_bits = 0;
+	cfs_wi_init(&hs->hs_rehash_wi, hs, cfs_hash_rehash_worker);
+	cfs_hash_depth_wi_init(hs);
+
+	if (cfs_hash_with_rehash(hs))
+		__cfs_hash_set_theta(hs, min_theta, max_theta);
+
+	hs->hs_buckets = cfs_hash_buckets_realloc(hs, NULL, 0,
+						  CFS_HASH_NBKT(hs));
+	if (hs->hs_buckets != NULL)
+		RETURN(hs); /* pairs with ENTRY, like the failure exits */
+
+	LIBCFS_FREE(hs, offsetof(cfs_hash_t, hs_name[len]));
+	RETURN(NULL);
+}
+EXPORT_SYMBOL(cfs_hash_create);
+
+/**
+ * Tear down libcfs hash @hs: drop every remaining item, then free memory.
+ */
+static void
+cfs_hash_destroy(cfs_hash_t *hs)
+{
+	struct hlist_node     *hnode;
+	struct hlist_node     *pos;
+	cfs_hash_bd_t	 bd;
+	int		   i;
+	ENTRY;
+
+	LASSERT(hs != NULL);
+	LASSERT(!cfs_hash_is_exiting(hs) &&
+		!cfs_hash_is_iterating(hs));
+
+	/**
+	 * prohibit further rehashes, don't need any lock because
+	 * I'm the only (last) one who can change it.
+	 */
+	hs->hs_exiting = 1;
+	if (cfs_hash_with_rehash(hs))
+		cfs_hash_rehash_cancel(hs);
+
+	cfs_hash_depth_wi_cancel(hs);
+	/* rehash should be done/canceled */
+	LASSERT(hs->hs_buckets != NULL &&
+		hs->hs_rehash_buckets == NULL);
+
+	cfs_hash_for_each_bucket(hs, &bd, i) {
+		struct hlist_head *hhead;
+
+		LASSERT(bd.bd_bucket != NULL);
+		/* no need to take this lock, just for consistent code */
+		cfs_hash_bd_lock(hs, &bd, 1);
+
+		cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
+			hlist_for_each_safe(hnode, pos, hhead) {
+				LASSERTF(!cfs_hash_with_assert_empty(hs),
+					 "hash %s bucket %u(%u) is not "
+					 " empty: %u items left\n",
+					 hs->hs_name, bd.bd_bucket->hsb_index,
+					 bd.bd_offset, bd.bd_bucket->hsb_count);
+				/* can't assert key validity, because we
+				 * may have interrupted a rehash */
+				cfs_hash_bd_del_locked(hs, &bd, hnode);
+				cfs_hash_exit(hs, hnode);
+			}
+		}
+		LASSERT(bd.bd_bucket->hsb_count == 0);
+		cfs_hash_bd_unlock(hs, &bd, 1);
+		cond_resched();
+	}
+
+	LASSERT(atomic_read(&hs->hs_count) == 0);
+
+	cfs_hash_buckets_free(hs->hs_buckets, cfs_hash_bkt_size(hs),
+			      0, CFS_HASH_NBKT(hs));
+	i = cfs_hash_with_bigname(hs) ? /* same size as used at allocation */
+	    CFS_HASH_BIGNAME_LEN : CFS_HASH_NAME_LEN;
+	LIBCFS_FREE(hs, offsetof(cfs_hash_t, hs_name[i]));
+
+	EXIT;
+}
+
+/* Take a reference on @hs.  Returns @hs, or NULL if its refcount had
+ * already dropped to zero. */
+cfs_hash_t *cfs_hash_getref(cfs_hash_t *hs)
+{
+	return atomic_inc_not_zero(&hs->hs_refcount) ? hs : NULL;
+}
+EXPORT_SYMBOL(cfs_hash_getref);
+
+void cfs_hash_putref(cfs_hash_t *hs)
+{
+	if (atomic_dec_and_test(&hs->hs_refcount))
+		cfs_hash_destroy(hs); /* that was the last reference */
+}
+EXPORT_SYMBOL(cfs_hash_putref);
+
+/*
+ * Decide whether @hs should be resized: returns the new bits value, 0 when no
+ * change is needed, or a negative errno when no rehash can be started now.
+ */
+static inline int cfs_hash_rehash_bits(cfs_hash_t *hs)
+{
+	if (cfs_hash_with_no_lock(hs) || !cfs_hash_with_rehash(hs))
+		return -EOPNOTSUPP;
+
+	if (unlikely(cfs_hash_is_exiting(hs)))
+		return -ESRCH;
+	if (unlikely(cfs_hash_is_rehashing(hs)))
+		return -EALREADY;
+	if (unlikely(cfs_hash_is_iterating(hs)))
+		return -EAGAIN;
+
+	/* XXX: need to handle case with max_theta != 2.0
+	 *      and the case with min_theta != 0.5 */
+	/* grow by one bit when theta exceeds max_theta */
+	if (hs->hs_cur_bits < hs->hs_max_bits &&
+	    __cfs_hash_theta(hs) > hs->hs_max_theta)
+		return hs->hs_cur_bits + 1;
+
+	/* shrink by one bit, if permitted, when theta is below min_theta */
+	if (cfs_hash_with_shrink(hs) &&
+	    hs->hs_cur_bits > hs->hs_min_bits &&
+	    __cfs_hash_theta(hs) < hs->hs_min_theta)
+		return hs->hs_cur_bits - 1;
+
+	return 0;
+}
+
+/**
+ * Tell whether a rehash may be run inline.  It may not when the user asked
+ * for non-blocking changes (add/del), or when the table holds too many items.
+ */
+static inline int cfs_hash_rehash_inline(cfs_hash_t *hs)
+{
+	if (cfs_hash_with_nblk_change(hs))
+		return 0;
+
+	return atomic_read(&hs->hs_count) < CFS_HASH_LOOP_HOG;
+}
+
+/**
+ * Add @hnode to @hs under @key (no duplicate check); takes an item reference
+ * via ops->hs_get unless NO_ITEMREF is set, and may kick off a rehash.
+ */
+void
+cfs_hash_add(cfs_hash_t *hs, const void *key, struct hlist_node *hnode)
+{
+	cfs_hash_bd_t   bd;
+	int	     bits;
+
+	LASSERT(hlist_unhashed(hnode));
+
+	cfs_hash_lock(hs, 0);
+	cfs_hash_bd_get_and_lock(hs, key, &bd, 1);
+
+	cfs_hash_key_validate(hs, key, hnode);
+	cfs_hash_bd_add_locked(hs, &bd, hnode);
+
+	cfs_hash_bd_unlock(hs, &bd, 1);
+
+	bits = cfs_hash_rehash_bits(hs); /* > 0 means a resize is wanted */
+	cfs_hash_unlock(hs, 0);
+	if (bits > 0)
+		cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs));
+}
+EXPORT_SYMBOL(cfs_hash_add);
+
+/* Return the item already under @key, else insert @hnode and return it. */
+static struct hlist_node *cfs_hash_find_or_add(cfs_hash_t *hs, const void *key,
+				struct hlist_node *hnode, int noref)
+{
+	struct hlist_node *found;
+	cfs_hash_bd_t bds[2];
+	int new_bits = 0;
+
+	LASSERT(hlist_unhashed(hnode));
+
+	cfs_hash_lock(hs, 0);
+	cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1);
+
+	cfs_hash_key_validate(hs, key, hnode);
+	found = cfs_hash_dual_bd_findadd_locked(hs, bds, key, hnode, noref);
+	cfs_hash_dual_bd_unlock(hs, bds, 1);
+
+	if (found == hnode) /* new item added */
+		new_bits = cfs_hash_rehash_bits(hs);
+	cfs_hash_unlock(hs, 0);
+
+	if (new_bits > 0)
+		cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs));
+
+	return found;
+}
+
+/**
+ * Insert @hnode under @key unless the key is already present; ops->hs_get is
+ * called only on insertion.  Returns 0 on success, -EALREADY on collision.
+ */
+int
+cfs_hash_add_unique(cfs_hash_t *hs, const void *key, struct hlist_node *hnode)
+{
+	if (cfs_hash_find_or_add(hs, key, hnode, 1) == hnode)
+		return 0;
+	return -EALREADY;
+}
+EXPORT_SYMBOL(cfs_hash_add_unique);
+
+/**
+ * Add item @hnode to libcfs hash @hs using @key, unless @key is already
+ * present.  In that case ops->hs_get is called on the conflicting entry
+ * and the object of that entry is returned to the caller.  Otherwise
+ * ops->hs_get is called on the item which was added, and its object returned.
+ */
+void *
+cfs_hash_findadd_unique(cfs_hash_t *hs, const void *key,
+			struct hlist_node *hnode)
+{
+	struct hlist_node *winner = cfs_hash_find_or_add(hs, key, hnode, 0);
+
+	return cfs_hash_object(hs, winner);
+}
+EXPORT_SYMBOL(cfs_hash_findadd_unique);
+
+/**
+ * Delete item @hnode from the libcfs hash @hs using @key.  The @key
+ * is required to ensure the correct hash bucket is locked since there
+ * is no direct linkage from the item to the bucket.  The object
+ * removed from the hash will be returned and ops->hs_put is called
+ * on the removed object.  With a NULL @hnode the first @key match goes.
+ */
+void *
+cfs_hash_del(cfs_hash_t *hs, const void *key, struct hlist_node *hnode)
+{
+	void	   *obj  = NULL;
+	int	     bits = 0;
+	cfs_hash_bd_t   bds[2];
+
+	cfs_hash_lock(hs, 0);
+	cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1);
+
+	/* NB: nothing is unlinked if @hnode is not in hash table */
+	if (hnode == NULL || !hlist_unhashed(hnode)) {
+		if (bds[1].bd_bucket == NULL && hnode != NULL) {
+			cfs_hash_bd_del_locked(hs, &bds[0], hnode);
+		} else {
+			hnode = cfs_hash_dual_bd_finddel_locked(hs, bds,
+								key, hnode);
+		}
+	}
+
+	if (hnode != NULL) { /* NOTE(review): incl. skipped unhashed @hnode */
+		obj  = cfs_hash_object(hs, hnode);
+		bits = cfs_hash_rehash_bits(hs);
+	}
+
+	cfs_hash_dual_bd_unlock(hs, bds, 1);
+	cfs_hash_unlock(hs, 0);
+	if (bits > 0)
+		cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs));
+
+	return obj;
+}
+EXPORT_SYMBOL(cfs_hash_del);
+
+/**
+ * Delete item given @key in libcfs hash @hs.  The first @key found in
+ * the hash will be removed; if the key exists multiple times in the hash
+ * @hs, this function must be called once per item.  The removed object
+ * will be returned and ops->hs_put is called on the removed object.
+ */
+void *
+cfs_hash_del_key(cfs_hash_t *hs, const void *key)
+{
+	return cfs_hash_del(hs, key, NULL);
+}
+EXPORT_SYMBOL(cfs_hash_del_key);
+
+/**
+ * Look up the item stored under @key in the libcfs hash @hs.
+ *
+ * On a hit, hs->hs_get() is called on the item and the matching object
+ * is returned; it is the caller's responsibility to call the counterpart
+ * ops->hs_put, using the cfs_hash_put() macro, when finished with the
+ * object.
+ *
+ * If @key is not found in @hs, NULL is returned.
+ */
+void *cfs_hash_lookup(cfs_hash_t *hs, const void *key)
+{
+	struct hlist_node *found;
+	cfs_hash_bd_t bds[2];
+	void *obj;
+
+	cfs_hash_lock(hs, 0);
+	cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0);
+
+	found = cfs_hash_dual_bd_lookup_locked(hs, bds, key);
+	obj = found != NULL ? cfs_hash_object(hs, found) : NULL;
+
+	cfs_hash_dual_bd_unlock(hs, bds, 0);
+	cfs_hash_unlock(hs, 0);
+
+	return obj;
+}
+EXPORT_SYMBOL(cfs_hash_lookup);
+
+static void
+cfs_hash_for_each_enter(cfs_hash_t *hs)
+{
+	LASSERT(!cfs_hash_is_exiting(hs));
+
+	if (!cfs_hash_with_rehash(hs))
+		return;
+	/*
+	 * NB: this is racy on cfs_hash_t::hs_iterating, but it doesn't matter
+	 * because it's just an unreliable signal to rehash-thread,
+	 * rehash-thread will try to finish rehash ASAP when seeing this.
+	 */
+	hs->hs_iterating = 1;
+
+	cfs_hash_lock(hs, 1);
+	hs->hs_iterators++;
+
+	/* NB: iteration is mostly called by service thread,
+	 * we tend to cancel pending rehash-request, instead of
+	 * blocking service thread, we will relaunch rehash request
+	 * after iteration */
+	if (cfs_hash_is_rehashing(hs))
+		cfs_hash_rehash_cancel_locked(hs);
+	cfs_hash_unlock(hs, 1);
+}
+
+static void
+cfs_hash_for_each_exit(cfs_hash_t *hs)
+{
+	int remained;
+	int bits;
+
+	if (!cfs_hash_with_rehash(hs))
+		return;
+	cfs_hash_lock(hs, 1);
+	remained = --hs->hs_iterators; /* iterators still running */
+	bits = cfs_hash_rehash_bits(hs);
+	cfs_hash_unlock(hs, 1);
+	/* NB: racy on cfs_hash_t::hs_iterating, see cfs_hash_for_each_enter */
+	if (remained == 0)
+		hs->hs_iterating = 0;
+	if (bits > 0) { /* relaunch the rehash that iteration held back */
+		cfs_hash_rehash(hs, atomic_read(&hs->hs_count) <
+				    CFS_HASH_LOOP_HOG);
+	}
+}
+
+/**
+ * For each item in the libcfs hash @hs call the passed callback @func
+ * and pass to it as an argument each hash item and the private @data.
+ *
+ * a) the function may sleep!
+ * b) during the callback:
+ *    . the bucket lock is held so the callback must never sleep.
+ *    . if @remove_safe is true, user can remove current item by
+ *      cfs_hash_bd_del_locked
+ */
+static __u64
+cfs_hash_for_each_tight(cfs_hash_t *hs, cfs_hash_for_each_cb_t func,
+			void *data, int remove_safe)
+{
+	struct hlist_node     *hnode;
+	struct hlist_node     *pos;
+	cfs_hash_bd_t	 bd;
+	__u64		 count = 0;
+	int		   excl  = !!remove_safe;
+	int		   loop  = 0;
+	int		   i;
+	ENTRY;
+
+	cfs_hash_for_each_enter(hs);
+
+	cfs_hash_lock(hs, 0);
+	LASSERT(!cfs_hash_is_rehashing(hs));
+
+	cfs_hash_for_each_bucket(hs, &bd, i) {
+		struct hlist_head *hhead;
+
+		cfs_hash_bd_lock(hs, &bd, excl);
+		if (func == NULL) { /* only glimpse size */
+			count += bd.bd_bucket->hsb_count;
+			cfs_hash_bd_unlock(hs, &bd, excl);
+			continue;
+		}
+
+		cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
+			hlist_for_each_safe(hnode, pos, hhead) {
+				cfs_hash_bucket_validate(hs, &bd, hnode);
+				count++;
+				loop++;
+				if (func(hs, &bd, hnode, data)) {
+					cfs_hash_bd_unlock(hs, &bd, excl);
+					goto out; /* callback asked to stop */
+				}
+			}
+		}
+		cfs_hash_bd_unlock(hs, &bd, excl);
+		if (loop < CFS_HASH_LOOP_HOG)
+			continue;
+		loop = 0; /* many items visited: give the CPU a chance */
+		cfs_hash_unlock(hs, 0);
+		cond_resched();
+		cfs_hash_lock(hs, 0);
+	}
+ out:
+	cfs_hash_unlock(hs, 0);
+
+	cfs_hash_for_each_exit(hs);
+	RETURN(count);
+}
+
+typedef struct {
+	cfs_hash_cond_opt_cb_t  func;	/* predicate run on each object */
+	void		   *arg;	/* private data handed to @func */
+} cfs_hash_cond_arg_t;
+
+/* Iterator callback: delete @hnode if cond->func accepts its object. */
+static int cfs_hash_cond_del_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+				    struct hlist_node *hnode, void *data)
+{
+	cfs_hash_cond_arg_t *cond = data;
+
+	if (cond->func(cfs_hash_object(hs, hnode), cond->arg))
+		cfs_hash_bd_del_locked(hs, bd, hnode);
+	return 0; /* never asks the iterator to stop */
+}
+
+/**
+ * Delete every item of the libcfs hash @hs for which @func returns true.
+ * Each bucket is walked with its lock held exclusively, to avoid any
+ * object being referenced meanwhile.
+ */
+void
+cfs_hash_cond_del(cfs_hash_t *hs, cfs_hash_cond_opt_cb_t func, void *data)
+{
+	cfs_hash_cond_arg_t cond;
+
+	cond.func = func;
+	cond.arg = data;
+
+	cfs_hash_for_each_tight(hs, cfs_hash_cond_del_locked, &cond, 1);
+}
+EXPORT_SYMBOL(cfs_hash_cond_del);
+
+/* Walk all items calling @func; buckets are locked non-exclusively. */
+void
+cfs_hash_for_each(cfs_hash_t *hs, cfs_hash_for_each_cb_t func, void *data)
+{
+	cfs_hash_for_each_tight(hs, func, data, 0);
+}
+EXPORT_SYMBOL(cfs_hash_for_each);
+
+/* Walk all items calling @func, which may delete the current item. */
+void
+cfs_hash_for_each_safe(cfs_hash_t *hs, cfs_hash_for_each_cb_t func, void *data)
+{
+	cfs_hash_for_each_tight(hs, func, data, 1);
+}
+EXPORT_SYMBOL(cfs_hash_for_each_safe);
+
+/* Iterator callback of cfs_hash_is_empty(): being called means "not empty". */
+static int cfs_hash_peek(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			 struct hlist_node *hnode, void *data)
+{
+	*(int *)data = 0;
+	return 1; /* non-zero breaks the loop */
+}
+
+/* Return 1 if @hs holds no items, 0 as soon as one item is seen. */
+int cfs_hash_is_empty(cfs_hash_t *hs)
+{
+	int no_items = 1; /* cleared by cfs_hash_peek() on the first item */
+
+	cfs_hash_for_each_tight(hs, cfs_hash_peek, &no_items, 0);
+	return no_items;
+}
+EXPORT_SYMBOL(cfs_hash_is_empty);
+
+/* Item count: the atomic counter if kept, else a sum of bucket counts. */
+__u64 cfs_hash_size_get(cfs_hash_t *hs)
+{
+	if (cfs_hash_with_counter(hs))
+		return atomic_read(&hs->hs_count);
+	return cfs_hash_for_each_tight(hs, NULL, NULL, 0);
+}
+EXPORT_SYMBOL(cfs_hash_size_get);
+
+/*
+ * cfs_hash_for_each_relax:
+ * Iterate the hash table and call @func on each item without any lock held
+ * across the callback.  Iteration cannot be guaranteed to finish if:
+ *  a. rehash_key is enabled, as an item can move to another bucket
+ *  b. user can remove a non-zero-ref item, or even change its key and
+ *     re-insert it into another hash bucket.
+ * In both cases the walk of a chain is stopped once the bucket has changed.
+ * A non-zero return from @func ends the whole iteration.
+ * Returns the number of items @func was called on.
+ */
+static int
+cfs_hash_for_each_relax(cfs_hash_t *hs, cfs_hash_for_each_cb_t func, void *data)
+{
+	struct hlist_node *hnode;
+	struct hlist_node *tmp;
+	cfs_hash_bd_t     bd;
+	__u32	     version;
+	int	       count = 0;
+	int	       stop_on_change;
+	int	       rc = 0;
+	int	       i;
+	ENTRY;
+
+	stop_on_change = cfs_hash_with_rehash_key(hs) ||
+			 !cfs_hash_with_no_itemref(hs) ||
+			 CFS_HOP(hs, put_locked) == NULL;
+	cfs_hash_lock(hs, 0);
+	LASSERT(!cfs_hash_is_rehashing(hs));
+
+	cfs_hash_for_each_bucket(hs, &bd, i) {
+		struct hlist_head *hhead;
+
+		cfs_hash_bd_lock(hs, &bd, 0);
+		version = cfs_hash_bd_version_get(&bd);
+
+		cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
+			for (hnode = hhead->first; hnode != NULL;) {
+				cfs_hash_bucket_validate(hs, &bd, hnode);
+				cfs_hash_get(hs, hnode);
+				cfs_hash_bd_unlock(hs, &bd, 0);
+				cfs_hash_unlock(hs, 0);
+
+				rc = func(hs, &bd, hnode, data);
+				if (stop_on_change)
+					cfs_hash_put(hs, hnode);
+				cond_resched();
+				count++;
+
+				cfs_hash_lock(hs, 0);
+				cfs_hash_bd_lock(hs, &bd, 0);
+				if (!stop_on_change) {
+					tmp = hnode->next;
+					cfs_hash_put_locked(hs, hnode);
+					hnode = tmp;
+				} else { /* bucket changed? */
+					if (version !=
+					    cfs_hash_bd_version_get(&bd))
+						break;
+					/* safe to continue because no change */
+					hnode = hnode->next;
+				}
+				if (rc) /* callback wants to break iteration */
+					break;
+			}
+			if (rc) /* don't move on to the next chain either */
+				break;
+		}
+		cfs_hash_bd_unlock(hs, &bd, 0);
+		if (rc) /* ... nor to the next bucket */
+			break;
+	}
+	cfs_hash_unlock(hs, 0);
+
+	RETURN(count);
+}
+
+/*
+ * Call @func on every item with no hash lock held across the callback.
+ * Returns -EOPNOTSUPP for unsuitable hashes (see the checks), else 0.
+ */
+int cfs_hash_for_each_nolock(cfs_hash_t *hs, cfs_hash_for_each_cb_t func,
+			     void *data)
+{
+	ENTRY;
+
+	if (cfs_hash_with_no_lock(hs) || cfs_hash_with_rehash_key(hs) ||
+	    !cfs_hash_with_no_itemref(hs))
+		RETURN(-EOPNOTSUPP);
+
+	if (CFS_HOP(hs, get) == NULL ||
+	    (CFS_HOP(hs, put) == NULL && CFS_HOP(hs, put_locked) == NULL))
+		RETURN(-EOPNOTSUPP);
+
+	cfs_hash_for_each_enter(hs);
+	cfs_hash_for_each_relax(hs, func, data);
+	cfs_hash_for_each_exit(hs);
+	RETURN(0);
+}
+EXPORT_SYMBOL(cfs_hash_for_each_nolock);
+
+/**
+ * For each hash bucket in the libcfs hash @hs call the passed callback
+ * @func until all the hash buckets are empty.  The passed callback @func
+ * or the previously registered callback hs->hs_put must remove the item
+ * from the hash.  You may either use the cfs_hash_del() or hlist_del()
+ * functions.  No rwlocks will be held during the callback @func, so it is
+ * safe to sleep if needed.  This function will not terminate until the
+ * hash is empty.  Note it is still possible to concurrently add new
+ * items in to the hash.  It is the caller's responsibility to ensure
+ * the required locking is in place to prevent concurrent insertions.
+ */
+int
+cfs_hash_for_each_empty(cfs_hash_t *hs,
+			cfs_hash_for_each_cb_t func, void *data)
+{
+	unsigned  i = 0;
+	ENTRY;
+
+	if (cfs_hash_with_no_lock(hs))
+		RETURN(-EOPNOTSUPP); /* RETURN pairs with ENTRY above */
+
+	if (CFS_HOP(hs, get) == NULL ||
+	    (CFS_HOP(hs, put) == NULL &&
+	     CFS_HOP(hs, put_locked) == NULL))
+		RETURN(-EOPNOTSUPP);
+
+	cfs_hash_for_each_enter(hs);
+	while (cfs_hash_for_each_relax(hs, func, data)) {
+		CDEBUG(D_INFO, "Try to empty hash: %s, loop: %u\n",
+		       hs->hs_name, i++);
+	}
+	cfs_hash_for_each_exit(hs);
+	RETURN(0);
+}
+EXPORT_SYMBOL(cfs_hash_for_each_empty);
+
+/*
+ * Call @func on the items of the one hlist selected by @hindex, with the
+ * bucket lock held; @func returning non-zero ends the walk.  An @hindex
+ * of CFS_HASH_NHLIST(hs) or more is ignored.
+ */
+void cfs_hash_hlist_for_each(cfs_hash_t *hs, unsigned hindex,
+			     cfs_hash_for_each_cb_t func, void *data)
+{
+	struct hlist_head *head;
+	struct hlist_node *node;
+	cfs_hash_bd_t bd;
+
+	cfs_hash_for_each_enter(hs);
+	cfs_hash_lock(hs, 0);
+	if (hindex < CFS_HASH_NHLIST(hs)) {
+		cfs_hash_bd_index_set(hs, hindex, &bd);
+		cfs_hash_bd_lock(hs, &bd, 0);
+		head = cfs_hash_bd_hhead(hs, &bd);
+		hlist_for_each(node, head) {
+			if (func(hs, &bd, node, data))
+				break;
+		}
+		cfs_hash_bd_unlock(hs, &bd, 0);
+	}
+	cfs_hash_unlock(hs, 0);
+	cfs_hash_for_each_exit(hs);
+}
+EXPORT_SYMBOL(cfs_hash_hlist_for_each);
+
+/*
+ * For each item in the libcfs hash @hs which matches the @key call
+ * the passed callback @func and pass to it as an argument each hash
+ * item and the private @data. During the callback the bucket lock
+ * is held so the callback must never sleep.
+ *
+ * A non-zero return from @func stops the scan of the hash list that
+ * is currently being walked.
+ * NOTE(review): the break only leaves the inner hlist_for_each(); the
+ * outer cfs_hash_for_each_bd() presumably still moves on to the second
+ * bucket descriptor, if any - confirm this is intended.
+ */
+void
+cfs_hash_for_each_key(cfs_hash_t *hs, const void *key,
+		      cfs_hash_for_each_cb_t func, void *data)
+{
+	struct hlist_node   *hnode;
+	cfs_hash_bd_t       bds[2];
+	unsigned	    i;
+
+	cfs_hash_lock(hs, 0);
+
+	/* look up and lock the (up to two) bucket descriptors for @key */
+	cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0);
+
+	cfs_hash_for_each_bd(bds, 2, i) {
+		struct hlist_head *hlist = cfs_hash_bd_hhead(hs, &bds[i]);
+
+		hlist_for_each(hnode, hlist) {
+			cfs_hash_bucket_validate(hs, &bds[i], hnode);
+
+			/* only nodes whose key compares equal reach @func */
+			if (cfs_hash_keycmp(hs, key, hnode)) {
+				if (func(hs, &bds[i], hnode, data))
+					break;
+			}
+		}
+	}
+
+	cfs_hash_dual_bd_unlock(hs, bds, 0);
+	cfs_hash_unlock(hs, 0);
+}
+EXPORT_SYMBOL(cfs_hash_for_each_key);
+
+/**
+ * Rehash the libcfs hash @hs to the given @bits.  This can be used
+ * to grow the hash size when excessive chaining is detected, or to
+ * shrink the hash when it is larger than needed.  When the CFS_HASH_REHASH
+ * flag is set in @hs the libcfs hash may be dynamically rehashed
+ * during addition or removal if the hash's theta value exceeds
+ * either the hs->hs_min_theta or hs->hs_max_theta values.  By default
+ * these values are tuned to keep the chained hash depth small, and
+ * this approach assumes a reasonably uniform hashing function.  The
+ * theta thresholds for @hs are tunable via cfs_hash_set_theta().
+ */
+/*
+ * NOTE(review): the kernel-doc block directly above reads as the
+ * description of cfs_hash_rehash(), not of this function.
+ *
+ * Cancel a pending rehash of @hs, or wait for a running one to finish.
+ * Must be called with cfs_hash_lock(hs, 1) held; the lock is dropped and
+ * re-taken while waiting, and is held again on return.
+ */
+void
+cfs_hash_rehash_cancel_locked(cfs_hash_t *hs)
+{
+	int     i;
+
+	/* need hold cfs_hash_lock(hs, 1) */
+	LASSERT(cfs_hash_with_rehash(hs) &&
+		!cfs_hash_with_no_lock(hs));
+
+	if (!cfs_hash_is_rehashing(hs))
+		return;
+
+	/* the workitem was descheduled: clear the request and leave */
+	if (cfs_wi_deschedule(cfs_sched_rehash, &hs->hs_rehash_wi)) {
+		hs->hs_rehash_bits = 0;
+		return;
+	}
+
+	/*
+	 * Could not deschedule: release the lock and yield the CPU until
+	 * cfs_hash_is_rehashing() turns false.
+	 */
+	for (i = 2; cfs_hash_is_rehashing(hs); i++) {
+		cfs_hash_unlock(hs, 1);
+		/* raise console warning while waiting too long */
+		CDEBUG(IS_PO2(i >> 3) ? D_WARNING : D_INFO,
+		       "hash %s is still rehashing, rescheded %d\n",
+		       hs->hs_name, i - 1);
+		cond_resched();
+		cfs_hash_lock(hs, 1);
+	}
+}
+EXPORT_SYMBOL(cfs_hash_rehash_cancel_locked);
+
+/*
+ * Unlocked front end of cfs_hash_rehash_cancel_locked(): takes
+ * cfs_hash_lock(hs, 1) around the call.
+ */
+void
+cfs_hash_rehash_cancel(cfs_hash_t *hs)
+{
+	cfs_hash_lock(hs, 1);
+	cfs_hash_rehash_cancel_locked(hs);
+	cfs_hash_unlock(hs, 1);
+}
+EXPORT_SYMBOL(cfs_hash_rehash_cancel);
+
+/*
+ * Start a rehash of @hs if cfs_hash_rehash_bits() asks for one.
+ *
+ * \param hs        hash table; must have rehash enabled and use locking
+ * \param do_rehash zero: only schedule the rehash workitem and return;
+ *                  non-zero: run the rehash worker in the caller's context
+ *
+ * \retval the value of cfs_hash_rehash_bits() when it is <= 0
+ * \retval 0 when the workitem has been scheduled
+ * \retval the return value of cfs_hash_rehash_worker() when @do_rehash
+ *         is non-zero
+ */
+int
+cfs_hash_rehash(cfs_hash_t *hs, int do_rehash)
+{
+	int     rc;
+
+	LASSERT(cfs_hash_with_rehash(hs) && !cfs_hash_with_no_lock(hs));
+
+	cfs_hash_lock(hs, 1);
+
+	rc = cfs_hash_rehash_bits(hs);
+	if (rc <= 0) {
+		cfs_hash_unlock(hs, 1);
+		return rc;
+	}
+
+	/* record the target size; the worker picks it up from here */
+	hs->hs_rehash_bits = rc;
+	if (!do_rehash) {
+		/* launch and return */
+		cfs_wi_schedule(cfs_sched_rehash, &hs->hs_rehash_wi);
+		cfs_hash_unlock(hs, 1);
+		return 0;
+	}
+
+	/* rehash right now */
+	cfs_hash_unlock(hs, 1);
+
+	return cfs_hash_rehash_worker(&hs->hs_rehash_wi);
+}
+EXPORT_SYMBOL(cfs_hash_rehash);
+
+/*
+ * Move every node found in bucket @old of @hs into the bucket selected
+ * for its key in hs->hs_rehash_buckets (sized by hs->hs_rehash_bits).
+ * Returns the number of nodes moved.
+ */
+static int
+cfs_hash_rehash_bd(cfs_hash_t *hs, cfs_hash_bd_t *old)
+{
+	cfs_hash_bd_t      new;
+	struct hlist_head  *hhead;
+	struct hlist_node  *hnode;
+	struct hlist_node  *pos;
+	void	      *key;
+	int		c = 0;
+
+	/* hold cfs_hash_lock(hs, 1), so don't need any bucket lock */
+	cfs_hash_bd_for_each_hlist(hs, old, hhead) {
+		/* _safe variant: the current node is unlinked inside the loop */
+		hlist_for_each_safe(hnode, pos, hhead) {
+			key = cfs_hash_key(hs, hnode);
+			LASSERT(key != NULL);
+			/* Validate hnode is in the correct bucket. */
+			cfs_hash_bucket_validate(hs, old, hnode);
+			/*
+			 * Delete from old hash bucket; move to new bucket.
+			 * ops->hs_key must be defined.
+			 */
+			cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets,
+					     hs->hs_rehash_bits, key, &new);
+			cfs_hash_bd_move_locked(hs, old, &new, hnode);
+			c++;
+		}
+	}
+
+	return c;
+}
+
+/*
+ * Workitem body that performs the rehash requested through
+ * hs->hs_rehash_bits: allocate the resized bucket table, move all nodes
+ * over bucket by bucket, then switch hs->hs_buckets to the new table.
+ *
+ * Returns 1 only when the hash is exiting and cfs_wi_exit() has been
+ * called for @wi, 0 in every other case (including -ENOMEM/-EALREADY
+ * early quits, which are only logged).
+ */
+static int
+cfs_hash_rehash_worker(cfs_workitem_t *wi)
+{
+	cfs_hash_t	 *hs = container_of(wi, cfs_hash_t, hs_rehash_wi);
+	cfs_hash_bucket_t **bkts;
+	cfs_hash_bd_t       bd;
+	unsigned int	old_size;
+	unsigned int	new_size;
+	int		 bsize;
+	int		 count = 0;
+	int		 rc = 0;
+	int		 i;
+
+	LASSERT (hs != NULL && cfs_hash_with_rehash(hs));
+
+	/* sample the current and the requested table sizes under the lock */
+	cfs_hash_lock(hs, 0);
+	LASSERT(cfs_hash_is_rehashing(hs));
+
+	old_size = CFS_HASH_NBKT(hs);
+	new_size = CFS_HASH_RH_NBKT(hs);
+
+	cfs_hash_unlock(hs, 0);
+
+	/*
+	 * don't need hs::hs_rwlock for hs::hs_buckets,
+	 * because nobody can change bkt-table except me.
+	 */
+	bkts = cfs_hash_buckets_realloc(hs, hs->hs_buckets,
+					old_size, new_size);
+	cfs_hash_lock(hs, 1);
+	if (bkts == NULL) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	/* realloc handed back the table already in use */
+	if (bkts == hs->hs_buckets) {
+		bkts = NULL; /* do nothing */
+		goto out;
+	}
+
+	/* theta is back inside [hs_min_theta, hs_max_theta]: rehash not needed */
+	rc = __cfs_hash_theta(hs);
+	if ((rc >= hs->hs_min_theta) && (rc <= hs->hs_max_theta)) {
+		/* free the new allocated bkt-table */
+		/* sizes are swapped for the cfs_hash_buckets_free() call at out: */
+		old_size = new_size;
+		new_size = CFS_HASH_NBKT(hs);
+		rc = -EALREADY;
+		goto out;
+	}
+
+	LASSERT(hs->hs_rehash_buckets == NULL);
+	hs->hs_rehash_buckets = bkts;
+
+	rc = 0;
+	cfs_hash_for_each_bucket(hs, &bd, i) {
+		if (cfs_hash_is_exiting(hs)) {
+			rc = -ESRCH;
+			/* someone wants to destroy the hash, abort now */
+			if (old_size < new_size) /* OK to free old bkt-table */
+				break;
+			/* it's shrinking, need free new bkt-table */
+			hs->hs_rehash_buckets = NULL;
+			old_size = new_size;
+			new_size = CFS_HASH_NBKT(hs);
+			goto out;
+		}
+
+		count += cfs_hash_rehash_bd(hs, &bd);
+		if (count < CFS_HASH_LOOP_HOG ||
+		    cfs_hash_is_iterating(hs)) { /* need to finish ASAP */
+			continue;
+		}
+
+		/* moved CFS_HASH_LOOP_HOG or more nodes: drop the lock and yield */
+		count = 0;
+		cfs_hash_unlock(hs, 1);
+		cond_resched();
+		cfs_hash_lock(hs, 1);
+	}
+
+	hs->hs_rehash_count++;
+
+	/* switch tables; bkts now names the old table, freed at out: */
+	bkts = hs->hs_buckets;
+	hs->hs_buckets = hs->hs_rehash_buckets;
+	hs->hs_rehash_buckets = NULL;
+
+	hs->hs_cur_bits = hs->hs_rehash_bits;
+ out:
+	/* clearing hs_rehash_bits marks the rehash request as finished */
+	hs->hs_rehash_bits = 0;
+	if (rc == -ESRCH) /* never be scheduled again */
+		cfs_wi_exit(cfs_sched_rehash, wi);
+	/* read the bucket size while @hs may still be dereferenced */
+	bsize = cfs_hash_bkt_size(hs);
+	cfs_hash_unlock(hs, 1);
+	/* can't refer to @hs anymore because it could be destroyed */
+	if (bkts != NULL)
+		cfs_hash_buckets_free(bkts, bsize, new_size, old_size);
+	if (rc != 0)
+		CDEBUG(D_INFO, "early quit of of rehashing: %d\n", rc);
+	/* return 1 only if cfs_wi_exit is called */
+	return rc == -ESRCH;
+}
+
+/**
+ * Rehash the object referenced by @hnode in the libcfs hash @hs.  The
+ * @old_key must be provided to locate the objects previous location
+ * in the hash, and the @new_key will be used to reinsert the object.
+ * Use this function instead of a cfs_hash_add() + cfs_hash_del()
+ * combo when it is critical that there is no window in time where the
+ * object is missing from the hash.  When an object is being rehashed
+ * the registered cfs_hash_get() and cfs_hash_put() functions will
+ * not be called.
+ */
+void cfs_hash_rehash_key(cfs_hash_t *hs, const void *old_key,
+			 void *new_key, struct hlist_node *hnode)
+{
+	cfs_hash_bd_t	bds[3];
+	cfs_hash_bd_t	old_bds[2];
+	cfs_hash_bd_t	new_bd;
+
+	/* the node must currently be linked into a hash list */
+	LASSERT(!hlist_unhashed(hnode));
+
+	cfs_hash_lock(hs, 0);
+
+	/* up to two candidate buckets for the old key, one for the new key */
+	cfs_hash_dual_bd_get(hs, old_key, old_bds);
+	cfs_hash_bd_get(hs, new_key, &new_bd);
+
+	bds[0] = old_bds[0];
+	bds[1] = old_bds[1];
+	bds[2] = new_bd;
+
+	/* NB: bds[0] and bds[1] are ordered already */
+	/* two ordering passes place new_bd among the already ordered pair */
+	cfs_hash_bd_order(&bds[1], &bds[2]);
+	cfs_hash_bd_order(&bds[0], &bds[1]);
+
+	cfs_hash_multi_bd_lock(hs, bds, 3, 1);
+	if (likely(old_bds[1].bd_bucket == NULL)) {
+		/* only one old bucket: move the node straight across */
+		cfs_hash_bd_move_locked(hs, &old_bds[0], &new_bd, hnode);
+	} else {
+		/* two old candidates: find and unlink the node, then re-add it */
+		cfs_hash_dual_bd_finddel_locked(hs, old_bds, old_key, hnode);
+		cfs_hash_bd_add_locked(hs, &new_bd, hnode);
+	}
+	/* overwrite key inside locks, otherwise may screw up with
+	 * other operations, i.e: rehash */
+	cfs_hash_keycpy(hs, new_key, hnode);
+
+	cfs_hash_multi_bd_unlock(hs, bds, 3, 1);
+	cfs_hash_unlock(hs, 0);
+}
+EXPORT_SYMBOL(cfs_hash_rehash_key);
+
+/*
+ * Format the column header line matching cfs_hash_debug_str() output into
+ * @str (at most @size bytes) and return what snprintf() returned.
+ */
+int cfs_hash_debug_header(char *str, int size)
+{
+	int len;
+
+	len = snprintf(str, size,
+		       "%-*s%6s%6s%6s%6s%6s%6s%6s%7s%8s%8s%8s%s\n",
+		       CFS_HASH_BIGNAME_LEN, "name",
+		       "cur", "min", "max",
+		       "theta", "t-min", "t-max",
+		       "flags", "rehash", "count",
+		       "maxdep", "maxdepb", " distribution");
+
+	return len;
+}
+EXPORT_SYMBOL(cfs_hash_debug_header);
+
+/*
+ * Pick the bucket table to walk: hs_rehash_buckets while a rehash to a
+ * larger size (hs_rehash_bits > hs_cur_bits) is in flight, hs_buckets in
+ * every other case.
+ */
+static cfs_hash_bucket_t **
+cfs_hash_full_bkts(cfs_hash_t *hs)
+{
+	/* NB: caller should hold hs->hs_rwlock if REHASH is set */
+	if (hs->hs_rehash_buckets != NULL) {
+		LASSERT(hs->hs_rehash_bits != 0);
+
+		if (hs->hs_rehash_bits > hs->hs_cur_bits)
+			return hs->hs_rehash_buckets;
+	}
+
+	return hs->hs_buckets;
+}
+
+/*
+ * Number of buckets in the table chosen by cfs_hash_full_bkts():
+ * CFS_HASH_RH_NBKT() while a rehash to a larger size is in flight,
+ * CFS_HASH_NBKT() in every other case.
+ */
+static unsigned int
+cfs_hash_full_nbkt(cfs_hash_t *hs)
+{
+	/* NB: caller should hold hs->hs_rwlock if REHASH is set */
+	if (hs->hs_rehash_buckets != NULL) {
+		LASSERT(hs->hs_rehash_bits != 0);
+
+		if (hs->hs_rehash_bits > hs->hs_cur_bits)
+			return CFS_HASH_RH_NBKT(hs);
+	}
+
+	return CFS_HASH_NBKT(hs);
+}
+
+/*
+ * Format one line of statistics about @hs into @str.
+ *
+ * \param hs   hash table to describe
+ * \param str  output buffer
+ * \param size size of @str in bytes
+ *
+ * \retval number of characters stored in @str, not counting the
+ *         terminating NUL; 0 when @str is NULL or @size is not positive
+ *
+ * Every field is appended with scnprintf(), which returns the number of
+ * characters actually stored.  With snprintf() the running offset could
+ * step past @size once the output was truncated, which made the next
+ * "size - c" negative and the returned length larger than the buffer.
+ */
+int cfs_hash_debug_str(cfs_hash_t *hs, char *str, int size)
+{
+	int		    dist[8] = { 0, };
+	int		    maxdep  = -1;
+	int		    maxdepb = -1;
+	int		    total   = 0;
+	int		    c       = 0;
+	int		    theta;
+	int		    i;
+
+	if (str == NULL || size <= 0)
+		return 0;
+
+	cfs_hash_lock(hs, 0);
+	theta = __cfs_hash_theta(hs);
+
+	c += scnprintf(str + c, size - c, "%-*s ",
+		       CFS_HASH_BIGNAME_LEN, hs->hs_name);
+	c += scnprintf(str + c, size - c, "%5d ",  1 << hs->hs_cur_bits);
+	c += scnprintf(str + c, size - c, "%5d ",  1 << hs->hs_min_bits);
+	c += scnprintf(str + c, size - c, "%5d ",  1 << hs->hs_max_bits);
+	c += scnprintf(str + c, size - c, "%d.%03d ",
+		       __cfs_hash_theta_int(theta),
+		       __cfs_hash_theta_frac(theta));
+	c += scnprintf(str + c, size - c, "%d.%03d ",
+		       __cfs_hash_theta_int(hs->hs_min_theta),
+		       __cfs_hash_theta_frac(hs->hs_min_theta));
+	c += scnprintf(str + c, size - c, "%d.%03d ",
+		       __cfs_hash_theta_int(hs->hs_max_theta),
+		       __cfs_hash_theta_frac(hs->hs_max_theta));
+	c += scnprintf(str + c, size - c, " 0x%02x ", hs->hs_flags);
+	c += scnprintf(str + c, size - c, "%6d ", hs->hs_rehash_count);
+
+	/*
+	 * The distribution is a summary of the chained hash depth in
+	 * each of the libcfs hash buckets.  Each bucket's hsb_count is
+	 * divided by the hash theta value and used to generate a
+	 * histogram of the hash distribution.  A uniform hash will
+	 * result in all hash buckets being close to the average thus
+	 * only the first few entries in the histogram will be non-zero.
+	 * If your hash function results in a non-uniform hash this will
+	 * be observable as outlier buckets in the distribution histogram.
+	 *
+	 * Uniform hash distribution:      128/128/0/0/0/0/0/0
+	 * Non-Uniform hash distribution:  128/125/0/0/0/0/2/1
+	 */
+	for (i = 0; i < cfs_hash_full_nbkt(hs); i++) {
+		cfs_hash_bd_t  bd;
+
+		bd.bd_bucket = cfs_hash_full_bkts(hs)[i];
+		cfs_hash_bd_lock(hs, &bd, 0);
+		if (maxdep < bd.bd_bucket->hsb_depmax) {
+			maxdep  = bd.bd_bucket->hsb_depmax;
+			/*
+			 * NOTE(review): for maxdep == 0 this evaluates
+			 * ffz(~0), a word without any zero bit - confirm
+			 * that ffz() is well defined for that input.
+			 */
+			maxdepb = ffz(~maxdep);
+		}
+		total += bd.bd_bucket->hsb_count;
+		/* max(theta,1) keeps the division safe when theta is 0 */
+		dist[min(__cfs_fls(bd.bd_bucket->hsb_count/max(theta,1)),7)]++;
+		cfs_hash_bd_unlock(hs, &bd, 0);
+	}
+
+	c += scnprintf(str + c, size - c, "%7d ", total);
+	c += scnprintf(str + c, size - c, "%7d ", maxdep);
+	c += scnprintf(str + c, size - c, "%7d ", maxdepb);
+	for (i = 0; i < 8; i++)
+		c += scnprintf(str + c, size - c, "%d%c",  dist[i],
+			       (i == 7) ? '\n' : '/');
+
+	cfs_hash_unlock(hs, 0);
+
+	return c;
+}
+EXPORT_SYMBOL(cfs_hash_debug_str);
diff --git a/drivers/staging/lustre/lustre/libcfs/heap.c b/drivers/staging/lustre/lustre/libcfs/heap.c
new file mode 100644
index 000000000000..147e4fe4762d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/heap.c
@@ -0,0 +1,475 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ */
+/*
+ * libcfs/libcfs/heap.c
+ *
+ * Author: Eric Barton	<eeb@whamcloud.com>
+ *	   Liang Zhen	<liang@whamcloud.com>
+ */
+/** \addtogroup heap
+ *
+ * @{
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+/*
+ * Allocate one CBH_NOB-sized chunk for heap \a h into \a ptr, on the CPT
+ * the heap is bound to.  GFP_ATOMIC is used when the heap carries
+ * CBH_FLAG_ATOMIC_GROW.  Every use of \a h is parenthesized so that any
+ * pointer expression can be passed as the argument.
+ */
+#define CBH_ALLOC(ptr, h)						\
+do {									\
+	if ((h)->cbh_flags & CBH_FLAG_ATOMIC_GROW)			\
+		LIBCFS_CPT_ALLOC_GFP((ptr), (h)->cbh_cptab,		\
+				     (h)->cbh_cptid, CBH_NOB,		\
+				     GFP_ATOMIC);			\
+	else								\
+		LIBCFS_CPT_ALLOC((ptr), (h)->cbh_cptab,			\
+				 (h)->cbh_cptid, CBH_NOB);		\
+} while (0)
+
+#define CBH_FREE(ptr)	LIBCFS_FREE(ptr, CBH_NOB)
+
+/**
+ * Grows the capacity of a binary heap so that it can handle a larger number of
+ * \e cfs_binheap_node_t objects.
+ *
+ * \param[in] h The binary heap
+ *
+ * \retval 0	   Successfully grew the heap
+ * \retval -ENOMEM OOM error
+ */
+static int
+cfs_binheap_grow(cfs_binheap_t *h)
+{
+	cfs_binheap_node_t ***frag1 = NULL;
+	cfs_binheap_node_t  **frag2;
+	int hwm = h->cbh_hwm;
+
+	/* need a whole new chunk of pointers */
+	LASSERT((h->cbh_hwm & CBH_MASK) == 0);
+
+	if (hwm == 0) {
+		/* first use of single indirect */
+		CBH_ALLOC(h->cbh_elements1, h);
+		if (h->cbh_elements1 == NULL)
+			return -ENOMEM;
+
+		goto out;
+	}
+
+	/* from here on hwm is relative to the start of the double indirect */
+	hwm -= CBH_SIZE;
+	if (hwm < CBH_SIZE * CBH_SIZE) {
+		/* not filled double indirect */
+		CBH_ALLOC(frag2, h);
+		if (frag2 == NULL)
+			return -ENOMEM;
+
+		if (hwm == 0) {
+			/* first use of double indirect */
+			CBH_ALLOC(h->cbh_elements2, h);
+			if (h->cbh_elements2 == NULL) {
+				CBH_FREE(frag2);
+				return -ENOMEM;
+			}
+		}
+
+		h->cbh_elements2[hwm >> CBH_SHIFT] = frag2;
+		goto out;
+	}
+
+	/* from here on hwm is relative to the start of the triple indirect */
+	hwm -= CBH_SIZE * CBH_SIZE;
+#if (CBH_SHIFT * 3 < 32)
+	if (hwm >= CBH_SIZE * CBH_SIZE * CBH_SIZE) {
+		/* filled triple indirect */
+		return -ENOMEM;
+	}
+#endif
+	/* new leaf chunk; it is hooked into the tree only at the very end */
+	CBH_ALLOC(frag2, h);
+	if (frag2 == NULL)
+		return -ENOMEM;
+
+	if (((hwm >> CBH_SHIFT) & CBH_MASK) == 0) {
+		/* first use of this 2nd level index */
+		CBH_ALLOC(frag1, h);
+		if (frag1 == NULL) {
+			CBH_FREE(frag2);
+			return -ENOMEM;
+		}
+	}
+
+	if (hwm == 0) {
+		/* first use of triple indirect */
+		CBH_ALLOC(h->cbh_elements3, h);
+		if (h->cbh_elements3 == NULL) {
+			/* frag1 is non-NULL here: hwm == 0 took the branch above */
+			CBH_FREE(frag2);
+			CBH_FREE(frag1);
+			return -ENOMEM;
+		}
+	}
+
+	if (frag1 != NULL) {
+		/* install the freshly allocated 2nd level table */
+		LASSERT(h->cbh_elements3[hwm >> (2 * CBH_SHIFT)] == NULL);
+		h->cbh_elements3[hwm >> (2 * CBH_SHIFT)] = frag1;
+	} else {
+		/* reuse the 2nd level table installed by an earlier call */
+		frag1 = h->cbh_elements3[hwm >> (2 * CBH_SHIFT)];
+		LASSERT(frag1 != NULL);
+	}
+
+	frag1[(hwm >> CBH_SHIFT) & CBH_MASK] = frag2;
+
+ out:
+	/* the heap can now hold CBH_SIZE more node pointers */
+	h->cbh_hwm += CBH_SIZE;
+	return 0;
+}
+
+/**
+ * Creates and initializes a binary heap instance.
+ *
+ * \param[in] ops   The operations to be used
+ * \param[in] flags The heap flags
+ * \param[in] count The initial heap capacity in # of elements
+ * \param[in] arg   An optional private argument
+ * \param[in] cptab The CPT table this heap instance will operate over
+ * \param[in] cptid The CPT id of \a cptab this heap instance will operate over
+ *
+ * \retval valid-pointer A newly-created and initialized binary heap object
+ * \retval NULL		 error
+ */
+cfs_binheap_t *
+cfs_binheap_create(cfs_binheap_ops_t *ops, unsigned int flags,
+		   unsigned count, void *arg, struct cfs_cpt_table *cptab,
+		   int cptid)
+{
+	cfs_binheap_t *heap;
+	int	       rc = 0;
+
+	LASSERT(ops != NULL);
+	LASSERT(ops->hop_compare != NULL);
+	LASSERT(cptab != NULL);
+	LASSERT(cptid == CFS_CPT_ANY ||
+	       (cptid >= 0 && cptid < cptab->ctb_nparts));
+
+	LIBCFS_CPT_ALLOC(heap, cptab, cptid, sizeof(*heap));
+	if (heap == NULL)
+		return NULL;
+
+	heap->cbh_cptab	    = cptab;
+	heap->cbh_cptid	    = cptid;
+	heap->cbh_ops	    = ops;
+	heap->cbh_private   = arg;
+	heap->cbh_nelements = 0;
+	heap->cbh_hwm	    = 0;
+	/* CBH_FLAG_ATOMIC_GROW stays masked out while preallocating */
+	heap->cbh_flags	    = flags & ~CBH_FLAG_ATOMIC_GROW;
+
+	/* grow chunk by chunk until the requested capacity is reached */
+	while (rc == 0 && heap->cbh_hwm < count)
+		rc = cfs_binheap_grow(heap);
+
+	if (rc != 0) {
+		cfs_binheap_destroy(heap);
+		return NULL;
+	}
+
+	/* hand the caller's atomic-grow request back for later growth */
+	heap->cbh_flags |= flags & CBH_FLAG_ATOMIC_GROW;
+
+	return heap;
+}
+EXPORT_SYMBOL(cfs_binheap_create);
+
+/**
+ * Releases all resources associated with a binary heap instance.
+ *
+ * Deallocates memory for all indirection levels and the binary heap object
+ * itself.
+ *
+ * \param[in] h The binary heap object
+ */
+void
+cfs_binheap_destroy(cfs_binheap_t *h)
+{
+	int left;	/* capacity whose backing chunks are not yet freed */
+	int i;
+	int j;
+
+	LASSERT(h != NULL);
+
+	left = h->cbh_hwm;
+	if (left <= 0)
+		goto free_heap;
+
+	/* single indirect level */
+	CBH_FREE(h->cbh_elements1);
+	left -= CBH_SIZE;
+	if (left <= 0)
+		goto free_heap;
+
+	/* double indirect level: leaf chunks first, then their table */
+	for (i = 0; i < CBH_SIZE && left > 0; i++) {
+		CBH_FREE(h->cbh_elements2[i]);
+		left -= CBH_SIZE;
+	}
+	CBH_FREE(h->cbh_elements2);
+	if (left <= 0)
+		goto free_heap;
+
+	/* triple indirect level: leaves, 2nd level tables, top table */
+	for (i = 0; i < CBH_SIZE && left > 0; i++) {
+		for (j = 0; j < CBH_SIZE && left > 0; j++) {
+			CBH_FREE(h->cbh_elements3[i][j]);
+			left -= CBH_SIZE;
+		}
+		CBH_FREE(h->cbh_elements3[i]);
+	}
+	CBH_FREE(h->cbh_elements3);
+
+free_heap:
+	LIBCFS_FREE(h, sizeof(*h));
+}
+EXPORT_SYMBOL(cfs_binheap_destroy);
+
+/**
+ * Obtains a double pointer to a heap element, given its index into the binary
+ * tree.
+ *
+ * \param[in] h	  The binary heap instance
+ * \param[in] idx The requested node's index
+ *
+ * \retval valid-pointer A double pointer to a heap pointer entry
+ */
+static cfs_binheap_node_t **
+cfs_binheap_pointer(cfs_binheap_t *h, unsigned int idx)
+{
+	cfs_binheap_node_t **leaf;
+
+	/* single indirect: the index selects the slot directly */
+	if (idx < CBH_SIZE)
+		return &h->cbh_elements1[idx];
+
+	/* double indirect: high bits pick the chunk, low bits the slot */
+	idx -= CBH_SIZE;
+	if (idx < CBH_SIZE * CBH_SIZE) {
+		leaf = h->cbh_elements2[idx >> CBH_SHIFT];
+		return &leaf[idx & CBH_MASK];
+	}
+
+	/* triple indirect: one more table level in front of the chunk */
+	idx -= CBH_SIZE * CBH_SIZE;
+	leaf = h->cbh_elements3[idx >> (2 * CBH_SHIFT)]
+			       [(idx >> CBH_SHIFT) & CBH_MASK];
+	return &leaf[idx & CBH_MASK];
+}
+
+/**
+ * Obtains a pointer to a heap element, given its index into the binary tree.
+ *
+ * \param[in] h	  The binary heap
+ * \param[in] idx The requested node's index
+ *
+ * \retval valid-pointer The requested heap node
+ * \retval NULL		 Supplied index is out of bounds
+ */
+cfs_binheap_node_t *
+cfs_binheap_find(cfs_binheap_t *h, unsigned int idx)
+{
+	/* only indices below the current element count name a node */
+	return idx < h->cbh_nelements ? *cfs_binheap_pointer(h, idx) : NULL;
+}
+EXPORT_SYMBOL(cfs_binheap_find);
+
+/**
+ * Moves a node upwards, towards the root of the binary tree.
+ *
+ * \param[in] h The heap
+ * \param[in] e The node
+ *
+ * \retval 1 The position of \a e in the tree was changed at least once
+ * \retval 0 The position of \a e in the tree was not changed
+ */
+static int
+cfs_binheap_bubble(cfs_binheap_t *h, cfs_binheap_node_t *e)
+{
+	unsigned int	     pos   = e->chn_index;
+	cfs_binheap_node_t **slot  = cfs_binheap_pointer(h, pos);
+	int		     moved = 0;
+
+	LASSERT(*slot == e);
+
+	while (pos != 0) {
+		unsigned int	     up      = (pos - 1) >> 1;
+		cfs_binheap_node_t **up_slot = cfs_binheap_pointer(h, up);
+		cfs_binheap_node_t  *parent  = *up_slot;
+
+		LASSERT(parent->chn_index == up);
+
+		/* stop as soon as hop_compare() accepts the parent above e */
+		if (h->cbh_ops->hop_compare(parent, e))
+			break;
+
+		/* pull the parent down into the slot e is vacating */
+		parent->chn_index = pos;
+		*slot = parent;
+
+		slot  = up_slot;
+		pos   = up;
+		moved = 1;
+	}
+
+	/* drop e into the slot it finally reached */
+	e->chn_index = pos;
+	*slot = e;
+
+	return moved;
+}
+
+/**
+ * Moves a node downwards, towards the last level of the binary tree.
+ *
+ * \param[in] h The heap
+ * \param[in] e The node
+ *
+ * \retval 1 The position of \a e in the tree was changed at least once
+ * \retval 0 The position of \a e in the tree was not changed
+ */
+static int
+cfs_binheap_sink(cfs_binheap_t *h, cfs_binheap_node_t *e)
+{
+	unsigned int	     n = h->cbh_nelements;
+	unsigned int	     child_idx;
+	cfs_binheap_node_t **child_ptr;
+	cfs_binheap_node_t  *child;
+	unsigned int	     child2_idx;
+	cfs_binheap_node_t **child2_ptr;
+	cfs_binheap_node_t  *child2;
+	unsigned int	     cur_idx;
+	cfs_binheap_node_t **cur_ptr;
+	int		     did_sth = 0;
+
+	cur_idx = e->chn_index;
+	cur_ptr = cfs_binheap_pointer(h, cur_idx);
+	LASSERT(*cur_ptr == e);
+
+	while (cur_idx < n) {
+		/* left child; no left child means e is already at a leaf */
+		child_idx = (cur_idx << 1) + 1;
+		if (child_idx >= n)
+			break;
+
+		child_ptr = cfs_binheap_pointer(h, child_idx);
+		child = *child_ptr;
+
+		/* right child, if present, replaces the left one when
+		 * hop_compare() ranks it first */
+		child2_idx = child_idx + 1;
+		if (child2_idx < n) {
+			child2_ptr = cfs_binheap_pointer(h, child2_idx);
+			child2 = *child2_ptr;
+
+			if (h->cbh_ops->hop_compare(child2, child)) {
+				child_idx = child2_idx;
+				child_ptr = child2_ptr;
+				child = child2;
+			}
+		}
+
+		LASSERT(child->chn_index == child_idx);
+
+		/* e may stay above the selected child: done */
+		if (h->cbh_ops->hop_compare(e, child))
+			break;
+
+		/* lift the selected child into the slot e is vacating */
+		child->chn_index = cur_idx;
+		*cur_ptr = child;
+		cur_ptr = child_ptr;
+		cur_idx = child_idx;
+		did_sth = 1;
+	}
+
+	/* store e in the slot it finally reached */
+	e->chn_index = cur_idx;
+	*cur_ptr = e;
+
+	return did_sth;
+}
+
+/**
+ * Sort-inserts a node into the binary heap.
+ *
+ * \param[in] h The heap
+ * \param[in] e The node
+ *
+ * \retval 0	Element inserted successfully
+ * \retval != 0 error
+ */
+int
+cfs_binheap_insert(cfs_binheap_t *h, cfs_binheap_node_t *e)
+{
+	unsigned int slot = h->cbh_nelements;
+	int	     rc;
+
+	/* every allocated slot is in use: add one more chunk first */
+	if (slot == h->cbh_hwm) {
+		rc = cfs_binheap_grow(h);
+		if (rc != 0)
+			return rc;
+	}
+
+	/* optional hook; a non-zero result aborts the insertion */
+	if (h->cbh_ops->hop_enter != NULL) {
+		rc = h->cbh_ops->hop_enter(h, e);
+		if (rc != 0)
+			return rc;
+	}
+
+	/* append at the end of the heap, then restore the heap order */
+	*cfs_binheap_pointer(h, slot) = e;
+	e->chn_index = slot;
+	h->cbh_nelements++;
+
+	cfs_binheap_bubble(h, e);
+
+	return 0;
+}
+EXPORT_SYMBOL(cfs_binheap_insert);
+
+/**
+ * Removes a node from the binary heap.
+ *
+ * \param[in] h The heap
+ * \param[in] e The node
+ */
+void
+cfs_binheap_remove(cfs_binheap_t *h, cfs_binheap_node_t *e)
+{
+	unsigned int	     n = h->cbh_nelements;
+	unsigned int	     cur_idx = e->chn_index;
+	cfs_binheap_node_t **cur_ptr;
+	cfs_binheap_node_t  *last;
+
+	LASSERT(cur_idx != CBH_POISON);
+	LASSERT(cur_idx < n);
+
+	cur_ptr = cfs_binheap_pointer(h, cur_idx);
+	LASSERT(*cur_ptr == e);
+
+	/* shrink the heap by one; "last" is the node that loses its slot */
+	n--;
+	last = *cfs_binheap_pointer(h, n);
+	h->cbh_nelements = n;
+
+	if (last != e) {
+		/* move the last node into the hole and re-establish order */
+		last->chn_index = cur_idx;
+		*cur_ptr = last;
+		if (!cfs_binheap_bubble(h, last))
+			cfs_binheap_sink(h, last);
+	}
+
+	/*
+	 * Poison the index and run the exit hook for every removed node.
+	 * This includes the case where \a e was the last element, which
+	 * used to return early and leave \a e with a stale index and
+	 * without its hop_exit() call.
+	 */
+	e->chn_index = CBH_POISON;
+	if (h->cbh_ops->hop_exit)
+		h->cbh_ops->hop_exit(h, e);
+}
+EXPORT_SYMBOL(cfs_binheap_remove);
+
+/** @} heap */
diff --git a/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c b/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c
new file mode 100644
index 000000000000..c152223ed5d3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c
@@ -0,0 +1,336 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: Nathan Rutman <nathan.rutman@sun.com>
+ *
+ * Kernel <-> userspace communication routines.
+ * Using pipes for all arches.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#define D_KUC D_OTHER
+
+#include <linux/libcfs/libcfs.h>
+
+#ifdef LUSTRE_UTILS
+/* This is the userspace side. */
+
+/** Start the userspace side of a KUC pipe.
+ * @param link Private descriptor for pipe/socket.
+ * @param group KUC broadcast group to listen to
+ *	  (can be null for unicast to this pid)
+ */
+int libcfs_ukuc_start(lustre_kernelcomm *link, int group)
+{
+	int fds[2];
+	int rc;
+
+	rc = pipe(fds);
+	if (rc < 0)
+		return -errno;
+
+	/* start from a clean descriptor, then record pipe ends and ids */
+	memset(link, 0, sizeof(*link));
+	link->lk_group = group;
+	link->lk_uid = getpid();
+	link->lk_rfd = fds[0];
+	link->lk_wfd = fds[1];
+
+	return 0;
+}
+
+/*
+ * Close both ends of the pipe held in @link; the write end is closed
+ * only when its descriptor is greater than 0.  Returns the result of
+ * closing the read end.
+ */
+int libcfs_ukuc_stop(lustre_kernelcomm *link)
+{
+	int wfd = link->lk_wfd;
+
+	if (wfd > 0)
+		close(wfd);
+
+	return close(link->lk_rfd);
+}
+
+#define lhsz sizeof(*kuch)
+
+/** Read a message from the link.
+ * Allocates memory, returns handle
+ *
+ * @param link Private descriptor for pipe/socket.
+ * @param buf Buffer to read into, must include size for kuc_hdr
+ * @param maxsize Maximum message size allowed
+ * @param transport Only listen to messages on this transport
+ *      (and the generic transport)
+ */
+int libcfs_ukuc_msg_get(lustre_kernelcomm *link, char *buf, int maxsize,
+			int transport)
+{
+	struct kuc_hdr *kuch;
+	int msglen;
+	int payload;
+	int rc = 0;
+
+	/* the buffer must at least be able to hold a message header */
+	if (maxsize < 0 || (size_t)maxsize < lhsz)
+		return -EINVAL;
+
+	memset(buf, 0, maxsize);
+
+	CDEBUG(D_KUC, "Waiting for message from kernel on fd %d\n",
+	       link->lk_rfd);
+
+	while (1) {
+		/* Read header first to get message size */
+		rc = read(link->lk_rfd, buf, lhsz);
+		if (rc < 0) {
+			rc = -errno;
+			break;
+		}
+		if (rc == 0) {
+			/* end of file: read() leaves errno untouched, so
+			 * -errno would be stale or even 0 here */
+			rc = -EPIPE;
+			break;
+		}
+		if ((size_t)rc < lhsz) {
+			CERROR("short read: got %d of %d bytes\n",
+			       rc, (int)lhsz);
+			rc = -EPROTO;
+			break;
+		}
+		kuch = (struct kuc_hdr *)buf;
+
+		CDEBUG(D_KUC, "Received message mg=%x t=%d m=%d l=%d\n",
+		       kuch->kuc_magic, kuch->kuc_transport, kuch->kuc_msgtype,
+		       kuch->kuc_msglen);
+
+		if (kuch->kuc_magic != KUC_MAGIC) {
+			CERROR("bad message magic %x != %x\n",
+			       kuch->kuc_magic, KUC_MAGIC);
+			rc = -EPROTO;
+			break;
+		}
+
+		/*
+		 * kuc_msglen counts the header too.  Anything shorter than
+		 * the header would make the payload length below wrap
+		 * around and let read() run past the end of buf.
+		 */
+		msglen = kuch->kuc_msglen;
+		if (msglen < (int)lhsz) {
+			CERROR("bad message length %d\n", msglen);
+			rc = -EPROTO;
+			break;
+		}
+
+		if (msglen > maxsize) {
+			rc = -EMSGSIZE;
+			break;
+		}
+
+		/* Read payload */
+		payload = msglen - (int)lhsz;
+		rc = read(link->lk_rfd, buf + lhsz, payload);
+		if (rc < 0) {
+			rc = -errno;
+			break;
+		}
+		if (rc < payload) {
+			CERROR("short read: got %d of %d bytes\n",
+			       rc, kuch->kuc_msglen);
+			rc = -EPROTO;
+			break;
+		}
+
+		if (kuch->kuc_transport == transport ||
+		    kuch->kuc_transport == KUC_TRANSPORT_GENERIC) {
+			return 0;
+		}
+		/* Drop messages for other transports */
+	}
+	return rc;
+}
+
+#else /* LUSTRE_UTILS */
+/* This is the kernel side (liblustre as well). */
+
+/**
+ * libcfs_kkuc_msg_put - send a message from kernel to userspace
+ * @param filp file to send the message to
+ * @param payload Payload data.  First field of payload is always
+ *   struct kuc_hdr
+ */
+int libcfs_kkuc_msg_put(struct file *filp, void *payload)
+{
+	struct kuc_hdr *hdr = payload;
+	loff_t		pos = 0;
+	int		rc;
+
+	if (filp == NULL || IS_ERR(filp))
+		return -EBADF;
+
+	/* refuse to forward anything that does not carry the KUC magic */
+	if (hdr->kuc_magic != KUC_MAGIC) {
+		CERROR("KernelComm: bad magic %x\n", hdr->kuc_magic);
+		return -ENOSYS;
+	}
+
+	/* the header's own length field says how many bytes to write */
+	rc = filp_user_write(filp, payload, hdr->kuc_msglen, &pos);
+	if (rc >= 0)
+		CDEBUG(D_KUC, "Sent message rc=%d, fp=%p\n", rc, filp);
+	else
+		CWARN("message send failed (%d)\n", rc);
+
+	return rc;
+}
+EXPORT_SYMBOL(libcfs_kkuc_msg_put);
+
+/* Broadcast groups are global across all mounted filesystems;
+ * i.e. registering for a group on 1 fs will get messages for that
+ * group from any fs */
+/** A single group registration has a uid and a file pointer */
+struct kkuc_reg {
+	/* linkage on the kkuc_groups[] list of the group */
+	struct list_head	kr_chain;
+	/* receiver id, matched against the uid given to group_rem */
+	int		kr_uid;
+	/* file messages are written to; reset to NULL after -EPIPE */
+	struct file	*kr_fp;
+	/* opaque value handed to the group_foreach callback */
+	__u32		kr_data;
+};
+static struct list_head kkuc_groups[KUC_GRP_MAX+1] = {};
+/* Protect message sending against remove and adds */
+static DECLARE_RWSEM(kg_sem);
+
+/** Add a receiver to a broadcast group
+ * @param filp pipe to write into
+ * @param uid identifier for this receiver
+ * @param group group number
+ */
+int libcfs_kkuc_group_add(struct file *filp, int uid, int group, __u32 data)
+{
+	struct kkuc_reg *reg;
+
+	/* group indexes kkuc_groups[]: reject both ends of the range */
+	if (group < 0 || group > KUC_GRP_MAX) {
+		CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group);
+		return -EINVAL;
+	}
+
+	/* fput in group_rem */
+	if (filp == NULL)
+		return -EBADF;
+
+	/*
+	 * freed in group_rem.  This path sleeps on kg_sem right below, so
+	 * a sleeping allocation is fine; a gfp mask of 0 only made the
+	 * allocation fail more easily.
+	 */
+	reg = kmalloc(sizeof(*reg), GFP_KERNEL);
+	if (reg == NULL)
+		return -ENOMEM;
+
+	reg->kr_fp = filp;
+	reg->kr_uid = uid;
+	reg->kr_data = data;
+
+	down_write(&kg_sem);
+	/* list heads start out zeroed and are set up on first use */
+	if (kkuc_groups[group].next == NULL)
+		INIT_LIST_HEAD(&kkuc_groups[group]);
+	list_add(&reg->kr_chain, &kkuc_groups[group]);
+	up_write(&kg_sem);
+
+	CDEBUG(D_KUC, "Added uid=%d fp=%p to group %d\n", uid, filp, group);
+
+	return 0;
+}
+EXPORT_SYMBOL(libcfs_kkuc_group_add);
+
+/**
+ * Remove receivers from a broadcast group.
+ * @param uid receiver to remove; 0 removes every receiver of the group
+ *	  after broadcasting a KUC_MSG_SHUTDOWN message to it
+ * @param group group number
+ * @return 0 on success (including "nothing registered"), -EINVAL for a
+ *	  group number outside 0..KUC_GRP_MAX
+ */
+int libcfs_kkuc_group_rem(int uid, int group)
+{
+	struct kkuc_reg *reg, *next;
+	ENTRY;
+
+	/* same range check as group_add: group indexes kkuc_groups[] */
+	if (group < 0 || group > KUC_GRP_MAX) {
+		CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group);
+		RETURN(-EINVAL);
+	}
+
+	/* list head never initialised: nobody ever joined this group */
+	if (kkuc_groups[group].next == NULL)
+		RETURN(0);
+
+	if (uid == 0) {
+		/* Broadcast a shutdown message */
+		/* header fields not listed here are zero, not stack garbage */
+		struct kuc_hdr lh = {
+			.kuc_magic	= KUC_MAGIC,
+			.kuc_transport	= KUC_TRANSPORT_GENERIC,
+			.kuc_msgtype	= KUC_MSG_SHUTDOWN,
+			.kuc_msglen	= sizeof(lh),
+		};
+
+		libcfs_kkuc_group_put(group, &lh);
+	}
+
+	down_write(&kg_sem);
+	list_for_each_entry_safe(reg, next, &kkuc_groups[group], kr_chain) {
+		if ((uid == 0) || (uid == reg->kr_uid)) {
+			list_del(&reg->kr_chain);
+			CDEBUG(D_KUC, "Removed uid=%d fp=%p from group %d\n",
+			       reg->kr_uid, reg->kr_fp, group);
+			/* kr_fp is NULL when group_put already dropped it */
+			if (reg->kr_fp != NULL)
+				fput(reg->kr_fp);
+			kfree(reg);
+		}
+	}
+	up_write(&kg_sem);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(libcfs_kkuc_group_rem);
+
+/**
+ * Send a message to every receiver registered in a broadcast group.
+ * @param group group number
+ * @param payload message to send; starts with a struct kuc_hdr
+ * @return 0 when at least one receiver got the message or when nobody
+ *	  is registered, -EINVAL for a bad group number, otherwise the
+ *	  result of the last send attempt
+ */
+int libcfs_kkuc_group_put(int group, void *payload)
+{
+	struct kkuc_reg	*reg;
+	int		 rc = 0;
+	int one_success = 0;
+	ENTRY;
+
+	/* group indexes kkuc_groups[]: reject both ends of the range */
+	if (group < 0 || group > KUC_GRP_MAX) {
+		CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group);
+		RETURN(-EINVAL);
+	}
+
+	/*
+	 * Write lock, not read lock: the loop below drops and clears
+	 * reg->kr_fp on -EPIPE, and two senders doing that at the same
+	 * time would release the same file twice.
+	 */
+	down_write(&kg_sem);
+
+	/*
+	 * The list head is only initialised by the first group_add; walking
+	 * a still-zeroed head would dereference a NULL next pointer.
+	 */
+	if (kkuc_groups[group].next == NULL) {
+		up_write(&kg_sem);
+		RETURN(0);
+	}
+
+	list_for_each_entry(reg, &kkuc_groups[group], kr_chain) {
+		if (reg->kr_fp != NULL) {
+			rc = libcfs_kkuc_msg_put(reg->kr_fp, payload);
+			if (rc == 0)
+				one_success = 1;
+			else if (rc == -EPIPE) {
+				/* stop using this file; the registration
+				 * itself stays until group_rem */
+				fput(reg->kr_fp);
+				reg->kr_fp = NULL;
+			}
+		}
+	}
+	up_write(&kg_sem);
+
+	/* don't return an error if the message has been delivered
+	 * at least to one agent */
+	if (one_success)
+		rc = 0;
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(libcfs_kkuc_group_put);
+
+/**
+ * Calls a callback function for each link of the given kuc group.
+ * @param group the group to call the function on.
+ * @param cb_func the function to be called.
+ * @param cb_arg extra argument to be passed to the callback function.
+ */
+int libcfs_kkuc_group_foreach(int group, libcfs_kkuc_cb_t cb_func,
+			      void *cb_arg)
+{
+	struct kkuc_reg *reg;
+	int rc = 0;
+	ENTRY;
+
+	/* group indexes kkuc_groups[]: reject both ends of the range */
+	if (group < 0 || group > KUC_GRP_MAX) {
+		CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group);
+		RETURN(-EINVAL);
+	}
+
+	/* no link for this group */
+	if (kkuc_groups[group].next == NULL)
+		RETURN(0);
+
+	down_read(&kg_sem);
+	list_for_each_entry(reg, &kkuc_groups[group], kr_chain) {
+		/* entries whose file has been dropped are skipped */
+		if (reg->kr_fp != NULL) {
+			/* the result of the last callback is what is returned */
+			rc = cb_func(reg->kr_data, cb_arg);
+		}
+	}
+	up_read(&kg_sem);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(libcfs_kkuc_group_foreach);
+
+#endif /* LUSTRE_UTILS */
diff --git a/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c b/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c
new file mode 100644
index 000000000000..8e88eb59dd51
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c
@@ -0,0 +1,204 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Please see comments in libcfs/include/libcfs/libcfs_cpu.h for introduction
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+/** Global CPU partition table */
+struct cfs_cpt_table   *cfs_cpt_table __read_mostly = NULL;
+EXPORT_SYMBOL(cfs_cpt_table);
+
+#ifndef HAVE_LIBCFS_CPT
+
+#define CFS_CPU_VERSION_MAGIC	   0xbabecafe
+
+/* Allocate a table for exactly one partition; NULL on bad @ncpt or ENOMEM. */
+struct cfs_cpt_table *
+cfs_cpt_table_alloc(unsigned int ncpt)
+{
+	struct cfs_cpt_table *cptab;
+
+	if (ncpt != 1) {
+		CERROR("Can't support cpu partition number %u\n", ncpt);
+		return NULL;
+	}
+
+	LIBCFS_ALLOC(cptab, sizeof(*cptab));
+	if (cptab == NULL)
+		return NULL;
+	cptab->ctb_version = CFS_CPU_VERSION_MAGIC;
+	cptab->ctb_nparts  = ncpt;
+	return cptab;
+}
+EXPORT_SYMBOL(cfs_cpt_table_alloc);
+
+void
+cfs_cpt_table_free(struct cfs_cpt_table *cptab)
+{
+	LASSERT(cptab->ctb_version == CFS_CPU_VERSION_MAGIC);
+	/* the magic above is the one stamped by cfs_cpt_table_alloc() */
+	LIBCFS_FREE(cptab, sizeof(*cptab));
+}
+EXPORT_SYMBOL(cfs_cpt_table_free);
+
+int
+cfs_cpt_number(struct cfs_cpt_table *cptab)
+{
+	return 1;	/* cfs_cpt_table_alloc() only accepts a single partition */
+}
+EXPORT_SYMBOL(cfs_cpt_number);
+
+int
+cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt)
+{
+	return 1;	/* fixed answer; @cptab and @cpt are not looked at */
+}
+EXPORT_SYMBOL(cfs_cpt_weight);
+
+int
+cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt)
+{
+	return 1;	/* fixed answer; @cptab and @cpt are not looked at */
+}
+EXPORT_SYMBOL(cfs_cpt_online);
+
+int
+cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+	return 1;	/* nothing is recorded; callers always see 1 */
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpu);
+
+void
+cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{	/* intentionally empty: cfs_cpt_set_cpu() recorded nothing */
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpu);
+
+int
+cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{
+	return 1;	/* nothing is recorded; callers always see 1 */
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpumask);
+
+void
+cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{	/* intentionally empty: cfs_cpt_set_cpumask() recorded nothing */
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpumask);
+
+int
+cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+	return 1;	/* nothing is recorded; callers always see 1 */
+}
+EXPORT_SYMBOL(cfs_cpt_set_node);
+
+void
+cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{	/* intentionally empty: cfs_cpt_set_node() recorded nothing */
+}
+EXPORT_SYMBOL(cfs_cpt_unset_node);
+
+int
+cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{
+	return 1;	/* nothing is recorded; callers always see 1 */
+}
+EXPORT_SYMBOL(cfs_cpt_set_nodemask);
+
+void
+cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{	/* intentionally empty: cfs_cpt_set_nodemask() recorded nothing */
+}
+EXPORT_SYMBOL(cfs_cpt_unset_nodemask);
+
+void
+cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt)
+{	/* intentionally empty: the table carries no per-partition CPU state */
+}
+EXPORT_SYMBOL(cfs_cpt_clear);
+
+int
+cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt)
+{
+	return 0;	/* fixed answer; @cptab and @cpt are not looked at */
+}
+EXPORT_SYMBOL(cfs_cpt_spread_node);
+
+int
+cfs_cpt_current(struct cfs_cpt_table *cptab, int remap)
+{
+	return 0;	/* partition 0 is the only one there is */
+}
+EXPORT_SYMBOL(cfs_cpt_current);
+
+int
+cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu)
+{
+	return 0;	/* every cpu is reported as partition 0 */
+}
+EXPORT_SYMBOL(cfs_cpt_of_cpu);
+
+int
+cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
+{
+	return 0;	/* no binding is performed; always returns 0 */
+}
+EXPORT_SYMBOL(cfs_cpt_bind);
+
+void
+cfs_cpu_fini(void)
+{
+	if (cfs_cpt_table == NULL)	/* never set up, or already torn down */
+		return;
+	cfs_cpt_table_free(cfs_cpt_table);
+	cfs_cpt_table = NULL;
+}
+
+int
+cfs_cpu_init(void)
+{
+	/* set up the global table with its single partition; -1 if that fails */
+	cfs_cpt_table = cfs_cpt_table_alloc(1);
+	return cfs_cpt_table == NULL ? -1 : 0;
+}
+
+#endif /* HAVE_LIBCFS_CPT */
diff --git a/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c b/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c
new file mode 100644
index 000000000000..8d6c4adf2ee6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c
@@ -0,0 +1,192 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+
+/** destroy cpu-partition lock, see libcfs_private.h for more detail */
+void
+cfs_percpt_lock_free(struct cfs_percpt_lock *pcl)
+{
+	LASSERT(pcl->pcl_locks != NULL);
+	LASSERT(!pcl->pcl_locked);	/* must not be held exclusively when freed */
+
+	cfs_percpt_free(pcl->pcl_locks);	/* the per-partition spinlocks */
+	LIBCFS_FREE(pcl, sizeof(*pcl));
+}
+EXPORT_SYMBOL(cfs_percpt_lock_free);
+
+/**
+ * create cpu-partition lock, see libcfs_private.h for more detail.
+ *
+ * cpu-partition lock is designed for large-scale SMP systems, so we need to
+ * reduce cacheline conflicts as much as we can; that is the
+ * reason we always allocate cacheline-aligned memory blocks.
+ */
+struct cfs_percpt_lock *
+cfs_percpt_lock_alloc(struct cfs_cpt_table *cptab)
+{
+	struct cfs_percpt_lock	*pcl;
+	spinlock_t		*slot;
+	int			cpt;
+
+	/* NB: cptab can be NULL, pcl will be for HW CPUs on that case */
+	LIBCFS_ALLOC(pcl, sizeof(*pcl));
+	if (pcl == NULL)
+		return NULL;
+
+	pcl->pcl_cptab = cptab;
+	pcl->pcl_locks = cfs_percpt_alloc(cptab, sizeof(spinlock_t));
+	if (pcl->pcl_locks != NULL) {
+		cfs_percpt_for_each(slot, cpt, pcl->pcl_locks)
+			spin_lock_init(slot);
+		return pcl;
+	}
+
+	/* could not get the per-partition array: undo the header */
+	LIBCFS_FREE(pcl, sizeof(*pcl));
+	return NULL;
+}
+EXPORT_SYMBOL(cfs_percpt_lock_alloc);
+
+/**
+ * lock a CPU partition
+ *
+ * \a index != CFS_PERCPT_LOCK_EX
+ *     hold private lock indexed by \a index
+ *
+ * \a index == CFS_PERCPT_LOCK_EX
+ *     exclusively lock @pcl and nobody can take private lock
+ */
+void
+cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index)
+{
+	int	ncpt = cfs_cpt_number(pcl->pcl_cptab);
+	int	i;
+
+	LASSERT(index >= CFS_PERCPT_LOCK_EX && index < ncpt);
+
+	if (ncpt == 1) {
+		index = 0;	/* one partition: every request, even EX, is lock 0 */
+	} else { /* serialize with exclusive lock */
+		while (pcl->pcl_locked)	/* spin until the exclusive holder is done */
+			cpu_relax();
+	}
+
+	if (likely(index != CFS_PERCPT_LOCK_EX)) {
+		spin_lock(pcl->pcl_locks[index]);	/* private lock only */
+		return;
+	}
+
+	/* exclusive lock request */
+	for (i = 0; i < ncpt; i++) {
+		spin_lock(pcl->pcl_locks[i]);
+		if (i == 0) {
+			LASSERT(!pcl->pcl_locked);
+			/* nobody should take private lock after this
+			 * so I wouldn't starve for too long time */
+			pcl->pcl_locked = 1;
+		}
+	}
+}
+EXPORT_SYMBOL(cfs_percpt_lock);
+
+/** unlock a CPU partition */
+void
+cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index)
+{
+	int	ncpt = cfs_cpt_number(pcl->pcl_cptab);
+	int	i;
+
+	index = ncpt == 1 ? 0 : index;	/* same remapping as cfs_percpt_lock() */
+
+	if (likely(index != CFS_PERCPT_LOCK_EX)) {
+		spin_unlock(pcl->pcl_locks[index]);
+		return;
+	}
+
+	for (i = ncpt - 1; i >= 0; i--) {	/* reverse of the acquisition order */
+		if (i == 0) {
+			LASSERT(pcl->pcl_locked);
+			pcl->pcl_locked = 0;	/* cleared while lock 0 is still held */
+		}
+		spin_unlock(pcl->pcl_locks[i]);
+	}
+}
+EXPORT_SYMBOL(cfs_percpt_unlock);
+
+
+/** free cpu-partition refcount */
+void
+cfs_percpt_atomic_free(atomic_t **refs)
+{
+	cfs_percpt_free(refs);	/* pairs with cfs_percpt_alloc() in cfs_percpt_atomic_alloc() */
+}
+EXPORT_SYMBOL(cfs_percpt_atomic_free);
+
+/** allocate cpu-partition refcount with initial value @init_val */
+atomic_t **
+cfs_percpt_atomic_alloc(struct cfs_cpt_table *cptab, int init_val)
+{
+	atomic_t	**counters = cfs_percpt_alloc(cptab, sizeof(atomic_t));
+	atomic_t	*cur;
+	int		cpt;
+
+	if (counters != NULL) {
+		/* every partition starts from the same value */
+		cfs_percpt_for_each(cur, cpt, counters)
+			atomic_set(cur, init_val);
+	}
+	/* NULL here means the per-partition allocation failed */
+	return counters;
+}
+EXPORT_SYMBOL(cfs_percpt_atomic_alloc);
+
+/** return sum of cpu-partition refs */
+int
+cfs_percpt_atomic_summary(atomic_t **refs)
+{
+	atomic_t	*cur;
+	int		total = 0;
+	int		cpt;
+
+	/* plain walk, no locking: counters may change while being summed */
+	cfs_percpt_for_each(cur, cpt, refs)
+		total += atomic_read(cur);
+	return total;
+}
+EXPORT_SYMBOL(cfs_percpt_atomic_summary);
diff --git a/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c b/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c
new file mode 100644
index 000000000000..879137303482
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c
@@ -0,0 +1,205 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+struct cfs_var_array {
+	unsigned int		va_count;	/* # of entries in va_ptrs[] */
+	unsigned int		va_size;	/* size in bytes of each buffer */
+	struct cfs_cpt_table	*va_cptab;	/* cpu partition table */
+	void			*va_ptrs[0];	/* buffer addresses; callers hold &va_ptrs[0] */
+};
+
+/*
+ * free per-cpu data, see more detail in cfs_percpt_alloc
+ */
+void
+cfs_percpt_free(void *vars)
+{
+	struct cfs_var_array *hdr =
+		container_of(vars, struct cfs_var_array, va_ptrs[0]);
+	int slot;
+
+	for (slot = 0; slot < hdr->va_count; slot++) {
+		if (hdr->va_ptrs[slot] == NULL)
+			continue;
+		LIBCFS_FREE(hdr->va_ptrs[slot], hdr->va_size);
+	}
+	/* the header was sized to hold va_count trailing pointers */
+	LIBCFS_FREE(hdr, offsetof(struct cfs_var_array,
+				  va_ptrs[hdr->va_count]));
+}
+EXPORT_SYMBOL(cfs_percpt_free);
+
+/*
+ * allocate per cpu-partition variables, returned value is an array of pointers,
+ * variable can be indexed by CPU partition ID, i.e:
+ *
+ *	arr = cfs_percpt_alloc(cfs_cpu_pt, size);
+ *	then caller can access memory block for CPU 0 by arr[0],
+ *	memory block for CPU 1 by arr[1]...
+ *	memory block for CPU N by arr[N]...
+ *
+ * cacheline aligned.
+ */
+void *
+cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size)
+{
+	int			nparts = cfs_cpt_number(cptab);
+	struct cfs_var_array	*hdr;
+	void			*vars;
+	int			cpt;
+
+	LIBCFS_ALLOC(hdr, offsetof(struct cfs_var_array, va_ptrs[nparts]));
+	if (hdr == NULL)
+		return NULL;
+
+	/* round each block up to a whole number of cache lines */
+	size = L1_CACHE_ALIGN(size);
+	hdr->va_cptab	= cptab;
+	hdr->va_count	= nparts;
+	hdr->va_size	= size;
+	vars = (void *)&hdr->va_ptrs[0];
+	for (cpt = 0; cpt < nparts; cpt++) {
+		LIBCFS_CPT_ALLOC(hdr->va_ptrs[cpt], cptab, cpt, size);
+		if (hdr->va_ptrs[cpt] != NULL)
+			continue;
+		cfs_percpt_free(vars);	/* releases the header and filled slots */
+		return NULL;
+	}
+	return vars;
+}
+EXPORT_SYMBOL(cfs_percpt_alloc);
+
+/*
+ * return number of CPUs (or number of elements in per-cpu data)
+ * according to cptab of @vars
+ */
+int
+cfs_percpt_number(void *vars)
+{
+	/* @vars points at va_ptrs[0]; step back to the enclosing header */
+	struct cfs_var_array *hdr =
+		container_of(vars, struct cfs_var_array, va_ptrs[0]);
+
+	return hdr->va_count;
+}
+EXPORT_SYMBOL(cfs_percpt_number);
+
+/*
+ * return memory block shadowed from current CPU
+ */
+void *
+cfs_percpt_current(void *vars)
+{
+	struct cfs_var_array *hdr =
+		container_of(vars, struct cfs_var_array, va_ptrs[0]);
+	int cpt = cfs_cpt_current(hdr->va_cptab, 0);
+
+	/* a negative partition id means there is no block to hand out */
+	if (cpt >= 0)
+		return hdr->va_ptrs[cpt];
+
+	return NULL;
+}
+EXPORT_SYMBOL(cfs_percpt_current);
+
+void *
+cfs_percpt_index(void *vars, int idx)
+{
+	struct cfs_var_array *hdr =
+		container_of(vars, struct cfs_var_array, va_ptrs[0]);
+
+	/* an out-of-range index trips the assertion instead of being returned */
+	LASSERT(idx >= 0 && idx < hdr->va_count);
+	return hdr->va_ptrs[idx];
+}
+EXPORT_SYMBOL(cfs_percpt_index);
+
+/*
+ * free variable array, see more detail in cfs_array_alloc
+ */
+void
+cfs_array_free(void *vars)
+{
+	struct cfs_var_array *hdr =
+		container_of(vars, struct cfs_var_array, va_ptrs[0]);
+	int slot;
+
+	/* empty slots are skipped, so a partially built array can be freed */
+	for (slot = 0; slot < hdr->va_count; slot++) {
+		if (hdr->va_ptrs[slot] != NULL)
+			LIBCFS_FREE(hdr->va_ptrs[slot], hdr->va_size);
+	}
+
+	/* header size mirrors the offsetof() used by cfs_array_alloc() */
+	LIBCFS_FREE(hdr, offsetof(struct cfs_var_array,
+				  va_ptrs[hdr->va_count]));
+}
+EXPORT_SYMBOL(cfs_array_free);
+
+/*
+ * allocate a variable array, returned value is an array of pointers.
+ * Caller can specify length of array by @count, @size is size of each
+ * memory block in array.
+ */
+void *
+cfs_array_alloc(int count, unsigned int size)
+{
+	struct cfs_var_array	*hdr;
+	void			*vars;
+	int			slot;
+
+	LIBCFS_ALLOC(hdr, offsetof(struct cfs_var_array, va_ptrs[count]));
+	if (hdr == NULL)
+		return NULL;
+
+	hdr->va_size	= size;
+	hdr->va_count	= count;
+	vars = (void *)&hdr->va_ptrs[0];
+	for (slot = 0; slot < count; slot++) {
+		LIBCFS_ALLOC(hdr->va_ptrs[slot], size);
+		if (hdr->va_ptrs[slot] != NULL)
+			continue;
+		/* give back the header and every block obtained so far */
+		cfs_array_free(vars);
+		return NULL;
+	}
+	return vars;
+}
+EXPORT_SYMBOL(cfs_array_alloc);
diff --git a/drivers/staging/lustre/lustre/libcfs/libcfs_string.c b/drivers/staging/lustre/lustre/libcfs/libcfs_string.c
new file mode 100644
index 000000000000..9edccc99683e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/libcfs_string.c
@@ -0,0 +1,647 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * String manipulation functions.
+ *
+ * libcfs/libcfs/libcfs_string.c
+ *
+ * Author: Nathan Rutman <nathan.rutman@sun.com>
+ */
+
+#include <linux/libcfs/libcfs.h>
+
+/* non-0 = don't match */
+int cfs_strncasecmp(const char *s1, const char *s2, size_t n)
+{
+	size_t left;
+
+	if (s1 == NULL || s2 == NULL)
+		return 1;
+	if (n == 0)
+		return 0;
+
+	/* stop on the first difference, on a NUL, or at the n-th character */
+	for (left = n; tolower(*s1) == tolower(*s2); s1++, s2++) {
+		if (--left == 0 || *s1 == '\0' || *s2 == '\0')
+			break;
+	}
+
+	return tolower(*(unsigned char *)s1) - tolower(*(unsigned char *)s2);
+}
+EXPORT_SYMBOL(cfs_strncasecmp);
+
+/* Convert a text string to a bitmask: 0 and *oldmask updated, else -EINVAL */
+int cfs_str2mask(const char *str, const char *(*bit2str)(int bit),
+		 int *oldmask, int minmask, int allmask)
+{
+	const char *debugstr;
+	char op = 0;
+	int newmask = minmask, i, len, found = 0;
+	ENTRY;
+
+	/* <str> must be a list of tokens separated by whitespace
+	 * and optionally an operator ('+' or '-').  If an operator
+	 * appears first in <str>, '*oldmask' is used as the starting point
+	 * (relative), otherwise minmask is used (absolute).  An operator
+	 * applies to all following tokens up to the next operator. */
+	while (*str != 0) {
+		while (isspace(*str))
+			str++;
+		if (*str == 0)
+			break;
+		if (*str == '+' || *str == '-') {
+			op = *str++;
+			if (!found)
+				/* only if first token is relative */
+				newmask = *oldmask;
+			while (isspace(*str))
+				str++;
+			if (*str == 0)	  /* trailing op */
+				RETURN(-EINVAL);
+		}
+
+		/* find token length */
+		for (len = 0; str[len] != 0 && !isspace(str[len]) &&
+		      str[len] != '+' && str[len] != '-'; len++);
+
+		/* match token; 1U keeps bit 31 out of signed-shift territory */
+		found = 0;
+		for (i = 0; i < 32; i++) {
+			debugstr = bit2str(i);
+			if (debugstr != NULL &&
+			    strlen(debugstr) == len &&
+			    cfs_strncasecmp(str, debugstr, len) == 0) {
+				if (op == '-')
+					newmask &= ~(1U << i);
+				else
+					newmask |= (1U << i);
+				found = 1;
+				break;
+			}
+		}
+		if (!found && len == 3 &&
+		    (cfs_strncasecmp(str, "ALL", len) == 0)) {
+			if (op == '-')
+				newmask = minmask;
+			else
+				newmask = allmask;
+			found = 1;
+		}
+		if (!found) {
+			CWARN("unknown mask '%.*s'.\n"
+			      "mask usage: [+|-]<all|type> ...\n", len, str);
+			RETURN(-EINVAL);
+		}
+		str += len;
+	}
+
+	*oldmask = newmask;
+	RETURN(0);
+}
+EXPORT_SYMBOL(cfs_str2mask);
+
+/*
+ * Duplicate a string in a platform-independent way.
+ * @flags is handed to kmalloc() unchanged; returns NULL if that fails,
+ * otherwise a copy the caller is responsible for freeing.
+ */
+char *cfs_strdup(const char *str, u_int32_t flags)
+{
+	size_t	nbytes = strlen(str) + 1;	/* payload plus terminating NUL */
+	char	*copy = kmalloc(nbytes, flags);
+
+	if (copy != NULL)
+		memcpy(copy, str, nbytes);
+
+	/* still NULL here when the allocation failed */
+	return copy;
+}
+EXPORT_SYMBOL(cfs_strdup);
+
+/**
+ * cfs_{v}snprintf() return the actual size that is printed rather than
+ * the size that would be printed in standard functions.
+ */
+int cfs_vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
+{
+	int written;
+
+	LASSERT(size > 0);
+	written = vsnprintf(buf, size, fmt, args);
+	if (written >= size)	/* truncated: report what actually fits */
+		return size - 1;
+	return written;
+}
+EXPORT_SYMBOL(cfs_vsnprintf);
+
+/* safe snprintf */
+int cfs_snprintf(char *buf, size_t size, const char *fmt, ...)
+{
+	va_list ap;
+	int written;
+
+	/* all of the truncation handling lives in cfs_vsnprintf() */
+	va_start(ap, fmt);
+	written = cfs_vsnprintf(buf, size, fmt, ap);
+	va_end(ap);
+	return written;
+}
+EXPORT_SYMBOL(cfs_snprintf);
+
+/*
+ * get the first string out of @str: skips leading whitespace, NUL-terminates
+ * the first word in place and returns a pointer to it
+ */
+char *cfs_firststr(char *str, size_t size)
+{
+	size_t	used = 0;
+	char	*end;
+
+	/* trim leading spaces */
+	for (; used < size && *str && isspace(*str); ++used)
+		++str;
+
+	/* string with all spaces */
+	if (*str == '\0')
+		return str;
+
+	/* walk to the end of the word; @used keeps counting against @size */
+	for (end = str; used < size && *end != '\0' && !isspace(*end); ++used)
+		++end;
+
+	/* NOTE(review): when @used reaches @size this writes one byte past the
+	 * first @size bytes - callers presumably leave room; confirm. */
+	*end = '\0';
+	return str;
+}
+EXPORT_SYMBOL(cfs_firststr);
+
+char *
+cfs_trimwhite(char *str)
+{
+	char *tail;
+
+	/* step over leading whitespace */
+	while (cfs_iswhite(*str))
+		str++;
+
+	/* back up from the terminator over trailing whitespace */
+	tail = str + strlen(str);
+	while (tail > str && cfs_iswhite(tail[-1]))
+		tail--;
+
+	/* cut the string in place right after the last kept character */
+	*tail = 0;
+	return str;
+}
+EXPORT_SYMBOL(cfs_trimwhite);
+
+/**
+ * Extracts tokens from strings.
+ *
+ * Looks for \a delim in string \a next, sets \a res to point to
+ * substring before the delimiter, sets \a next right after the found
+ * delimiter.
+ *
+ * \retval 1 if \a res points to a string of non-whitespace characters
+ * \retval 0 otherwise
+ */
+int
+cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res)
+{
+	char *end;
+
+	if (next->ls_str == NULL)
+		return 0;
+
+	/* skip leading white spaces */
+	while (next->ls_len) {
+		if (!cfs_iswhite(*next->ls_str))
+			break;
+		next->ls_str++;
+		next->ls_len--;
+	}
+
+	if (next->ls_len == 0) /* whitespaces only */
+		return 0;
+
+	if (*next->ls_str == delim) {
+		/* first non-whitespace is the delimiter */
+		return 0;
+	}
+
+	res->ls_str = next->ls_str;
+	end = memchr(next->ls_str, delim, next->ls_len);
+	if (end == NULL) {
+		/* the delimiter does not occur in the string */
+		end = next->ls_str + next->ls_len;
+		next->ls_str = NULL;	/* input used up: the next call returns 0 */
+	} else {
+		next->ls_str = end + 1;	/* resume right after the delimiter */
+		next->ls_len -= (end - res->ls_str + 1);
+	}
+
+	/* skip ending whitespaces */
+	while (--end != res->ls_str) {
+		if (!cfs_iswhite(*end))
+			break;
+	}
+
+	res->ls_len = end - res->ls_str + 1;
+	return 1;
+}
+EXPORT_SYMBOL(cfs_gettok);
+
+/**
+ * Converts string to integer.
+ *
+ * Accepts decimal and hexadecimal number recordings.
+ *
+ * \retval 1 if first \a nob chars of \a str convert to decimal or
+ * hexadecimal integer in the range [\a min, \a max]
+ * \retval 0 otherwise
+ */
+int
+cfs_str2num_check(char *str, int nob, unsigned *num,
+		  unsigned min, unsigned max)
+{
+	char	*end = str + nob;	/* one past the token, before trimming */
+	char	*endp;
+
+	/* cfs_trimwhite() may advance str, so the limit is kept in @end */
+	str = cfs_trimwhite(str);
+	*num = strtoul(str, &endp, 0);
+	if (endp == str || endp > end)	/* no digits, or ran past the token */
+		return 0;
+	for (; endp < end; endp++) {
+		if (!cfs_iswhite(*endp))
+			return 0;
+	}
+	return (*num >= min && *num <= max);
+}
+EXPORT_SYMBOL(cfs_str2num_check);
+
+/**
+ * Parses \<range_expr\> token of the syntax. If \a bracketed is false,
+ * \a src should only have a single token which can be \<number\> or  \*
+ *
+ * On success \a *expr points to a newly allocated range_expr with
+ * range_expr::re_lo, range_expr::re_hi and range_expr::re_stride set.
+ * Accepted forms of \a src when \a bracketed is true:
+ * \<number\> |
+ * \<number\> '-' \<number\> |
+ * \<number\> '-' \<number\> '/' \<number\>
+ * \retval 0 if \a src can be parsed
+ * \retval -EINVAL if it cannot, or if the stride is 0; -ENOMEM on OOM
+ */
+int
+cfs_range_expr_parse(struct cfs_lstr *src, unsigned min, unsigned max,
+		     int bracketed, struct cfs_range_expr **expr)
+{
+	struct cfs_range_expr	*re;
+	struct cfs_lstr		tok;
+
+	LIBCFS_ALLOC(re, sizeof(*re));
+	if (re == NULL)
+		return -ENOMEM;
+
+	if (src->ls_len == 1 && src->ls_str[0] == '*') {
+		re->re_lo = min;
+		re->re_hi = max;
+		re->re_stride = 1;
+		goto out;
+	}
+
+	if (cfs_str2num_check(src->ls_str, src->ls_len,
+			      &re->re_lo, min, max)) {
+		/* <number> is parsed */
+		re->re_hi = re->re_lo;
+		re->re_stride = 1;
+		goto out;
+	}
+
+	if (!bracketed || !cfs_gettok(src, '-', &tok))
+		goto failed;
+
+	/* no '-' found: cfs_gettok() left src->ls_str NULL, nothing follows */
+	if (src->ls_str == NULL ||
+	    !cfs_str2num_check(tok.ls_str, tok.ls_len, &re->re_lo, min, max))
+		goto failed;
+
+	/* <number> - */
+	if (cfs_str2num_check(src->ls_str, src->ls_len,
+			      &re->re_hi, min, max)) {
+		/* <number> - <number> is parsed */
+		re->re_stride = 1;
+		goto out;
+	}
+
+	/* go to check <number> '-' <number> '/' <number> */
+	if (!cfs_gettok(src, '/', &tok) || src->ls_str == NULL)
+		goto failed;
+
+	if (!cfs_str2num_check(tok.ls_str, tok.ls_len, &re->re_hi, min, max))
+		goto failed;
+
+	/* <number> - <number> / <stride>: a stride of 0 would divide by zero */
+	if (!cfs_str2num_check(src->ls_str, src->ls_len,
+			       &re->re_stride, min, max) ||
+	    re->re_stride == 0)
+		goto failed;
+
+ out:
+	*expr = re;
+	return 0;
+
+ failed:
+	LIBCFS_FREE(re, sizeof(*re));
+	return -EINVAL;
+}
+EXPORT_SYMBOL(cfs_range_expr_parse);
+
+/**
+ * Matches value (\a value) against ranges expression list \a expr_list.
+ *
+ * \retval 1 if \a value matches
+ * \retval 0 otherwise
+ */
+int
+cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list)
+{
+	struct cfs_range_expr	*re;
+
+	list_for_each_entry(re, &expr_list->el_exprs, re_link) {
+		if (value < re->re_lo || value > re->re_hi)
+			continue;	/* outside [re_lo, re_hi] */
+		if ((value - re->re_lo) % re->re_stride == 0)
+			return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(cfs_expr_list_match);
+
+/**
+ * Convert expression list (\a expr_list) to an array of all matched values,
+ * returned in \a *valpp (count * sizeof(__u32) bytes from LIBCFS_ALLOC).
+ * \retval N N is total number of all matched values
+ * \retval 0 if expression list is empty, \a *valpp is left untouched
+ * \retval < 0 for failure: -EINVAL above \a max values, -ENOMEM on OOM
+ */
+int
+cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, __u32 **valpp)
+{
+	struct cfs_range_expr	*expr;
+	__u32			*val;
+	int			count = 0;
+	int			i;
+
+	list_for_each_entry(expr, &expr_list->el_exprs, re_link) {
+		for (i = expr->re_lo; i <= expr->re_hi; i++) {
+			if (((i - expr->re_lo) % expr->re_stride) == 0)
+				count++;
+		}
+	}
+
+	if (count == 0) /* empty expression list */
+		return 0;
+
+	if (count > max) {
+		CERROR("Number of values %d exceeds max allowed %d\n",
+		       count, max);
+		return -EINVAL;
+	}
+
+	LIBCFS_ALLOC(val, sizeof(val[0]) * count);
+	if (val == NULL)
+		return -ENOMEM;
+
+	count = 0;
+	list_for_each_entry(expr, &expr_list->el_exprs, re_link) {
+		for (i = expr->re_lo; i <= expr->re_hi; i++) {
+			if (((i - expr->re_lo) % expr->re_stride) == 0)
+				val[count++] = i;
+		}
+	}
+
+	*valpp = val;
+	return count;
+}
+EXPORT_SYMBOL(cfs_expr_list_values);
+
+/**
+ * Frees every cfs_range_expr linked on \a expr_list and then
+ * \a expr_list itself.
+ * \retval none
+ */
+void
+cfs_expr_list_free(struct cfs_expr_list *expr_list)
+{
+	while (!list_empty(&expr_list->el_exprs)) {
+		struct cfs_range_expr *expr;
+
+		expr = list_entry(expr_list->el_exprs.next,
+				      struct cfs_range_expr, re_link);
+		list_del(&expr->re_link);
+		LIBCFS_FREE(expr, sizeof(*expr));
+	}
+
+	LIBCFS_FREE(expr_list, sizeof(*expr_list));
+}
+EXPORT_SYMBOL(cfs_expr_list_free);
+
+void
+cfs_expr_list_print(struct cfs_expr_list *expr_list)
+{
+	struct cfs_range_expr *expr;
+
+	/* dump every range as lo-hi/stride; the fields are unsigned */
+	list_for_each_entry(expr, &expr_list->el_exprs, re_link)
+		CDEBUG(D_WARNING, "%u-%u/%u\n",
+		       expr->re_lo, expr->re_hi, expr->re_stride);
+}
+EXPORT_SYMBOL(cfs_expr_list_print);
+
+/**
+ * Parses \<cfs_expr_list\> token of the syntax.
+ *
+ * \retval 0 if \a str parses to \<number\> | \<expr_list\>; \a *elpp is set
+ * \retval -EINVAL or -ENOMEM otherwise; \a *elpp is not touched
+ */
+int
+cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max,
+		    struct cfs_expr_list **elpp)
+{
+	struct cfs_expr_list	*expr_list;
+	struct cfs_range_expr	*expr;
+	struct cfs_lstr		src;
+	int			rc;
+
+	/* an empty token cannot be parsed and str[len - 1] would be bogus */
+	if (len <= 0)
+		return -EINVAL;
+
+	LIBCFS_ALLOC(expr_list, sizeof(*expr_list));
+	if (expr_list == NULL)
+		return -ENOMEM;
+
+	src.ls_str = str;
+	src.ls_len = len;
+	INIT_LIST_HEAD(&expr_list->el_exprs);
+
+	if (src.ls_str[0] == '[' &&
+	    src.ls_str[src.ls_len - 1] == ']') {
+		src.ls_str++;
+		src.ls_len -= 2;
+
+		rc = -EINVAL;
+		while (src.ls_str != NULL) {
+			struct cfs_lstr tok;
+
+			if (!cfs_gettok(&src, ',', &tok)) {
+				rc = -EINVAL;
+				break;
+			}
+
+			rc = cfs_range_expr_parse(&tok, min, max, 1, &expr);
+			if (rc != 0)
+				break;
+
+			list_add_tail(&expr->re_link, &expr_list->el_exprs);
+		}
+	} else {
+		rc = cfs_range_expr_parse(&src, min, max, 0, &expr);
+		if (rc == 0)
+			list_add_tail(&expr->re_link, &expr_list->el_exprs);
+	}
+
+	if (rc != 0)
+		cfs_expr_list_free(expr_list);
+	else
+		*elpp = expr_list;
+
+	return rc;
+}
+EXPORT_SYMBOL(cfs_expr_list_parse);
+
+/**
+ * Frees cfs_expr_list structures of \a list.
+ *
+ * For each struct cfs_expr_list structure found on \a list it frees
+ * range_expr list attached to it and frees the cfs_expr_list itself.
+ *
+ * \retval none
+ */
+void
+cfs_expr_list_free_list(struct list_head *list)
+{
+	struct cfs_expr_list *el;
+	struct cfs_expr_list *next;
+
+	/* _safe walk: each entry is unlinked and freed while iterating */
+	list_for_each_entry_safe(el, next, list, el_link) {
+		list_del(&el->el_link);
+		cfs_expr_list_free(el);
+	}
+}
+EXPORT_SYMBOL(cfs_expr_list_free_list);
+
+/*
+ * Parse a dotted address pattern of exactly four components into @list.
+ * Returns 0 on success, otherwise a negative errno with @list emptied.
+ */
+int
+cfs_ip_addr_parse(char *str, int len, struct list_head *list)
+{
+	struct cfs_lstr		rest = { .ls_str = str, .ls_len = len };
+	struct cfs_lstr		octet;
+	struct cfs_expr_list	*el;
+	int			noctets = 0;
+	int			rc = -EINVAL;
+
+	/* one expression list per dot-separated component */
+	while (rest.ls_str != NULL) {
+		if (!cfs_gettok(&rest, '.', &octet)) {
+			rc = -EINVAL;
+			break;
+		}
+		rc = cfs_expr_list_parse(octet.ls_str, octet.ls_len,
+					 0, 255, &el);
+		if (rc != 0)
+			break;
+
+		list_add_tail(&el->el_link, list);
+		noctets++;
+	}
+
+	if (rc == 0 && noctets == 4)
+		return 0;
+	if (rc == 0)
+		rc = -EINVAL;	/* parsed cleanly but not exactly four parts */
+
+	/* NB: this drops everything on @list, not just what was added here */
+	cfs_expr_list_free_list(list);
+	return rc;
+}
+EXPORT_SYMBOL(cfs_ip_addr_parse);
+
+/**
+ * Matches address (\a addr) against address set encoded in \a list.
+ *
+ * \retval 1 if \a addr matches
+ * \retval 0 otherwise
+ */
+int
+cfs_ip_addr_match(__u32 addr, struct list_head *list)
+{
+	struct cfs_expr_list	*el;
+	int			nmatched = 0;
+
+	/* the list tail is checked against the lowest byte of @addr first */
+	list_for_each_entry_reverse(el, list, el_link) {
+		if (!cfs_expr_list_match(addr & 0xff, el))
+			return 0;
+		nmatched++;
+		addr >>= 8;
+	}
+	return nmatched == 4;
+}
+EXPORT_SYMBOL(cfs_ip_addr_match);
+
+void
+cfs_ip_addr_free(struct list_head *list)
+{
+	cfs_expr_list_free_list(list);	/* frees every cfs_expr_list linked on @list */
+}
+EXPORT_SYMBOL(cfs_ip_addr_free);
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
new file mode 100644
index 000000000000..6e255ff55e85
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
@@ -0,0 +1,1085 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/cpu.h>
+#include <linux/sched.h>
+#include <linux/libcfs/libcfs.h>
+
+#ifdef CONFIG_SMP
+
+/**
+ * modparam for setting number of partitions
+ *
+ *  0 : estimate best value based on cores or NUMA nodes
+ *  1 : disable multiple partitions
+ * >1 : specify number of partitions
+ */
+static int	cpu_npartitions;	/* defaults to 0: estimate automatically */
+CFS_MODULE_PARM(cpu_npartitions, "i", int, 0444, "# of CPU partitions");
+
+/**
+ * modparam for setting CPU partitions patterns:
+ *
+ * i.e: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID,
+ *      number in bracket is processor ID (core or HT)
+ *
+ * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket
+ *       are NUMA node ID, number before bracket is CPU partition ID.
+ *
+ * NB: If user specified cpu_pattern, cpu_npartitions will be ignored
+ */
+static char	*cpu_pattern = "";	/* empty: cpu_npartitions is used instead */
+CFS_MODULE_PARM(cpu_pattern, "s", charp, 0444, "CPU partitions pattern");
+
+struct cfs_cpt_data {
+	/* protects cpt_version (taken by the hotplug notifier and readers) */
+	spinlock_t		cpt_lock;
+	/* bumped on CPU online/dead events; tables copy it when allocated */
+	unsigned long		cpt_version;
+	/* semaphore initialised to 1, used as a mutex around cpt_cpumask */
+	struct semaphore	cpt_mutex;
+	/* scratch mask for set/unset_node and the *_nsiblings helpers */
+	cpumask_t		*cpt_cpumask;
+};
+
+static struct cfs_cpt_data	cpt_data;	/* set up by cfs_cpu_init() */
+
+void
+cfs_cpu_core_siblings(int cpu, cpumask_t *mask)
+{
+	/* copy the cpumask of cores in the same socket as \a cpu to \a mask */
+	cpumask_copy(mask, topology_core_cpumask(cpu));
+}
+EXPORT_SYMBOL(cfs_cpu_core_siblings);
+
+/* return number of cores in the same socket of \a cpu */
+int
+cfs_cpu_core_nsiblings(int cpu)
+{
+	int	count;
+
+	/* the shared scratch mask may only be touched under cpt_mutex */
+	down(&cpt_data.cpt_mutex);
+	cfs_cpu_core_siblings(cpu, cpt_data.cpt_cpumask);
+	count = cpus_weight(*cpt_data.cpt_cpumask);
+	up(&cpt_data.cpt_mutex);
+
+	/* weight of the socket mask taken while the lock was held */
+	return count;
+}
+EXPORT_SYMBOL(cfs_cpu_core_nsiblings);
+
+/* copy the cpumask of HTs in the same core as \a cpu into \a mask */
+void
+cfs_cpu_ht_siblings(int cpu, cpumask_t *mask)
+{
+	cpumask_copy(mask, topology_thread_cpumask(cpu));
+}
+EXPORT_SYMBOL(cfs_cpu_ht_siblings);
+
+/* return number of HTs in the same core of \a cpu */
+int
+cfs_cpu_ht_nsiblings(int cpu)
+{
+	int	count;
+
+	/* the shared scratch mask may only be touched under cpt_mutex */
+	down(&cpt_data.cpt_mutex);
+	cfs_cpu_ht_siblings(cpu, cpt_data.cpt_cpumask);
+	count = cpus_weight(*cpt_data.cpt_cpumask);
+	up(&cpt_data.cpt_mutex);
+
+	/* weight of the core mask taken while the lock was held */
+	return count;
+}
+EXPORT_SYMBOL(cfs_cpu_ht_nsiblings);
+
+void
+cfs_node_to_cpumask(int node, cpumask_t *mask)
+{
+	cpumask_copy(mask, cpumask_of_node(node));	/* CPUs of \a node */
+}
+EXPORT_SYMBOL(cfs_node_to_cpumask);
+
+void
+cfs_cpt_table_free(struct cfs_cpt_table *cptab)
+{
+	struct cfs_cpu_partition *part;
+	int	idx;
+
+	if (cptab->ctb_cpu2cpt != NULL) {
+		LIBCFS_FREE(cptab->ctb_cpu2cpt,
+			    num_possible_cpus() *
+			    sizeof(cptab->ctb_cpu2cpt[0]));
+	}
+
+	/* per-partition masks go first, then the partition array itself */
+	if (cptab->ctb_parts != NULL) {
+		for (idx = 0; idx < cptab->ctb_nparts; idx++) {
+			part = &cptab->ctb_parts[idx];
+			if (part->cpt_nodemask != NULL) {
+				LIBCFS_FREE(part->cpt_nodemask,
+					    sizeof(*part->cpt_nodemask));
+			}
+			if (part->cpt_cpumask != NULL)
+				LIBCFS_FREE(part->cpt_cpumask, cpumask_size());
+		}
+		LIBCFS_FREE(cptab->ctb_parts,
+			    cptab->ctb_nparts * sizeof(cptab->ctb_parts[0]));
+	}
+
+	if (cptab->ctb_nodemask != NULL)
+		LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
+	if (cptab->ctb_cpumask != NULL)
+		LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size());
+
+	/* finally the table descriptor */
+	LIBCFS_FREE(cptab, sizeof(*cptab));
+}
+EXPORT_SYMBOL(cfs_cpt_table_free);
+/* Allocate a table of \a ncpt empty partitions; returns NULL on failure. */
+struct cfs_cpt_table *
+cfs_cpt_table_alloc(unsigned int ncpt)
+{
+	struct cfs_cpt_table *cptab;
+	int	i;
+
+	LIBCFS_ALLOC(cptab, sizeof(*cptab));
+	if (cptab == NULL)
+		return NULL;
+	/* set early: cfs_cpt_table_free() sizes the partition array by it */
+	cptab->ctb_nparts = ncpt;
+
+	LIBCFS_ALLOC(cptab->ctb_cpumask, cpumask_size());
+	LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
+
+	if (cptab->ctb_cpumask == NULL || cptab->ctb_nodemask == NULL)
+		goto failed;
+
+	LIBCFS_ALLOC(cptab->ctb_cpu2cpt,
+		     num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
+	if (cptab->ctb_cpu2cpt == NULL)
+		goto failed;
+	/* -1 marks a CPU as not assigned to any partition */
+	memset(cptab->ctb_cpu2cpt, -1,
+	       num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
+
+	LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0]));
+	if (cptab->ctb_parts == NULL)
+		goto failed;
+
+	for (i = 0; i < ncpt; i++) {
+		struct cfs_cpu_partition *part = &cptab->ctb_parts[i];
+
+		LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size());
+		LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask));
+		if (part->cpt_cpumask == NULL || part->cpt_nodemask == NULL)
+			goto failed;
+	}
+
+	spin_lock(&cpt_data.cpt_lock);
+	/* snapshot the hotplug version; cfs_cpu_init() compares it later */
+	cptab->ctb_version = cpt_data.cpt_version;
+	spin_unlock(&cpt_data.cpt_lock);
+
+	return cptab;
+	/* presumably LIBCFS_ALLOC zero-fills, so unset pointers are NULL */
+ failed:
+	cfs_cpt_table_free(cptab);
+	return NULL;
+}
+EXPORT_SYMBOL(cfs_cpt_table_alloc);
+/* Format partitions as "N\t: cpu cpu ...\n"; bytes written or -EFBIG. */
+int
+cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len)
+{
+	char	*tmp = buf;
+	int	rc = 0;
+	int	i;
+	int	j;
+
+	for (i = 0; i < cptab->ctb_nparts; i++) {
+		if (len > 0) {
+			rc = snprintf(tmp, len, "%d\t: ", i);
+			len -= rc;
+		}
+		/* no room at all, or the prefix used up the remaining space */
+		if (len <= 0) {
+			rc = -EFBIG;
+			goto out;
+		}
+
+		tmp += rc;
+		for_each_cpu_mask(j, *cptab->ctb_parts[i].cpt_cpumask) {
+			rc = snprintf(tmp, len, "%d ", j);
+			len -= rc;
+			if (len <= 0) {
+				rc = -EFBIG;
+				goto out;
+			}
+			tmp += rc;
+		}
+		/* '\n' replaces snprintf's NUL: output is not NUL-terminated */
+		*tmp = '\n';
+		tmp++;
+		len--;
+	}
+
+ out:
+	if (rc < 0)
+		return rc;
+
+	return tmp - buf;
+}
+EXPORT_SYMBOL(cfs_cpt_table_print);
+
+int
+cfs_cpt_number(struct cfs_cpt_table *cptab)
+{
+	return cptab->ctb_nparts;	/* number of partitions in the table */
+}
+EXPORT_SYMBOL(cfs_cpt_number);
+
+int
+cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt)
+{
+	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+	/* CFS_CPT_ANY counts the CPUs of the whole table */
+	if (cpt == CFS_CPT_ANY)
+		return cpus_weight(*cptab->ctb_cpumask);
+	return cpus_weight(*cptab->ctb_parts[cpt].cpt_cpumask);
+}
+EXPORT_SYMBOL(cfs_cpt_weight);
+
+int
+cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt)
+{
+	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+	/* a result of NR_CPUS is treated as "no online CPU in the mask" */
+	if (cpt == CFS_CPT_ANY)
+		return any_online_cpu(*cptab->ctb_cpumask) != NR_CPUS;
+	return any_online_cpu(*cptab->ctb_parts[cpt].cpt_cpumask) != NR_CPUS;
+}
+EXPORT_SYMBOL(cfs_cpt_online);
+
+cpumask_t *
+cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt)
+{
+	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+	if (cpt == CFS_CPT_ANY)	/* mask covering the whole table */
+		return cptab->ctb_cpumask;
+	return cptab->ctb_parts[cpt].cpt_cpumask;
+}
+EXPORT_SYMBOL(cfs_cpt_cpumask);
+
+nodemask_t *
+cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt)
+{
+	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+	if (cpt == CFS_CPT_ANY)	/* mask covering the whole table */
+		return cptab->ctb_nodemask;
+	return cptab->ctb_parts[cpt].cpt_nodemask;
+}
+EXPORT_SYMBOL(cfs_cpt_nodemask);
+/* Add online \a cpu to partition \a cpt; returns 1 if added, 0 if refused. */
+int
+cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+	int	node;
+
+	LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts);
+
+	if (cpu < 0 || cpu >= NR_CPUS || !cpu_online(cpu)) {
+		CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu);
+		return 0;
+	}
+	/* NOTE(review): table has num_possible_cpus() slots; confirm cpu fits */
+	if (cptab->ctb_cpu2cpt[cpu] != -1) {
+		CDEBUG(D_INFO, "CPU %d is already in partition %d\n",
+		       cpu, cptab->ctb_cpu2cpt[cpu]);
+		return 0;
+	}
+	/* a CPU belongs to at most one partition of a table */
+	cptab->ctb_cpu2cpt[cpu] = cpt;
+
+	LASSERT(!cpu_isset(cpu, *cptab->ctb_cpumask));
+	LASSERT(!cpu_isset(cpu, *cptab->ctb_parts[cpt].cpt_cpumask));
+
+	cpu_set(cpu, *cptab->ctb_cpumask);
+	cpu_set(cpu, *cptab->ctb_parts[cpt].cpt_cpumask);
+
+	node = cpu_to_node(cpu);
+
+	/* first CPU of @node in this CPT table */
+	if (!node_isset(node, *cptab->ctb_nodemask))
+		node_set(node, *cptab->ctb_nodemask);
+
+	/* first CPU of @node in this partition */
+	if (!node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask))
+		node_set(node, *cptab->ctb_parts[cpt].cpt_nodemask);
+
+	return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpu);
+
+/* Remove \a cpu from partition \a cpt (CFS_CPT_ANY: look the partition up)
+ * and clear its NUMA node from the node masks once no CPU of it is left. */
+void
+cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+	int	node;
+	int	i;
+
+	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+	if (cpu < 0 || cpu >= NR_CPUS) {
+		CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu);
+		return;
+	}
+
+	if (cpt == CFS_CPT_ANY) {
+		/* caller doesn't know the partition ID */
+		cpt = cptab->ctb_cpu2cpt[cpu];
+		if (cpt < 0) { /* not set in this CPT-table */
+			/* log the CPU; cpt is just the negative marker here */
+			CDEBUG(D_INFO, "Try to unset cpu %d which is "
+				       "not in CPT-table %p\n", cpu, cptab);
+			return;
+		}
+	} else if (cpt != cptab->ctb_cpu2cpt[cpu]) {
+		CDEBUG(D_INFO,
+		       "CPU %d is not in cpu-partition %d\n", cpu, cpt);
+		return;
+	}
+
+	LASSERT(cpu_isset(cpu, *cptab->ctb_parts[cpt].cpt_cpumask));
+	LASSERT(cpu_isset(cpu, *cptab->ctb_cpumask));
+
+	cpu_clear(cpu, *cptab->ctb_parts[cpt].cpt_cpumask);
+	cpu_clear(cpu, *cptab->ctb_cpumask);
+	cptab->ctb_cpu2cpt[cpu] = -1;
+
+	node = cpu_to_node(cpu);
+
+	LASSERT(node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask));
+	LASSERT(node_isset(node, *cptab->ctb_nodemask));
+
+	for_each_cpu_mask(i, *cptab->ctb_parts[cpt].cpt_cpumask) {
+		/* this CPT has other CPU belonging to this node? */
+		if (cpu_to_node(i) == node)
+			break;
+	}
+
+	if (i == NR_CPUS)
+		node_clear(node, *cptab->ctb_parts[cpt].cpt_nodemask);
+
+	for_each_cpu_mask(i, *cptab->ctb_cpumask) {
+		/* this CPT-table has other CPU belonging to this node? */
+		if (cpu_to_node(i) == node)
+			break;
+	}
+
+	if (i == NR_CPUS)
+		node_clear(node, *cptab->ctb_nodemask);
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpu);
+/* Add every CPU of \a mask to partition \a cpt; 1 on success, 0 on failure. */
+int
+cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{
+	int	i;
+
+	if (cpus_weight(*mask) == 0 || any_online_cpu(*mask) == NR_CPUS) {
+		CDEBUG(D_INFO, "No online CPU is found in the CPU mask "
+			       "for CPU partition %d\n", cpt);
+		return 0;
+	}
+	/* stops at the first refused CPU; CPUs added before it stay set */
+	for_each_cpu_mask(i, *mask) {
+		if (!cfs_cpt_set_cpu(cptab, cpt, i))
+			return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpumask);
+
+void
+cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{
+	int	i;
+	/* hand each CPU of \a mask to cfs_cpt_unset_cpu() in turn */
+	for_each_cpu_mask(i, *mask)
+		cfs_cpt_unset_cpu(cptab, cpt, i);
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpumask);
+
+/* Add all CPUs of NUMA node \a node to partition \a cpt (1 ok, 0 failed). */
+int
+cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+	int	ok;
+
+	if (node < 0 || node >= MAX_NUMNODES) {
+		CDEBUG(D_INFO,
+		       "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
+		return 0;
+	}
+
+	/* the scratch cpumask is shared, so hold cpt_mutex while using it */
+	down(&cpt_data.cpt_mutex);
+
+	/* expand the node into its CPUs, then add them to the partition */
+	cfs_node_to_cpumask(node, cpt_data.cpt_cpumask);
+	ok = cfs_cpt_set_cpumask(cptab, cpt, cpt_data.cpt_cpumask);
+
+	up(&cpt_data.cpt_mutex);
+
+	return ok;
+}
+EXPORT_SYMBOL(cfs_cpt_set_node);
+
+/* Remove all CPUs of NUMA node \a node from partition \a cpt;
+ * an out-of-range \a node is only logged. */
+void
+cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+	if (node < 0 || node >= MAX_NUMNODES) {
+		CDEBUG(D_INFO,
+		       "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
+		return;
+	}
+
+	/* the scratch cpumask is shared, so hold cpt_mutex while using it */
+	down(&cpt_data.cpt_mutex);
+
+	/* expand the node into its CPUs and remove each from the table */
+	cfs_node_to_cpumask(node, cpt_data.cpt_cpumask);
+	cfs_cpt_unset_cpumask(cptab, cpt, cpt_data.cpt_cpumask);
+
+	up(&cpt_data.cpt_mutex);
+}
+EXPORT_SYMBOL(cfs_cpt_unset_node);
+
+int
+cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{
+	int	i;
+	/* add node by node; give up (return 0) at the first node that fails */
+	for_each_node_mask(i, *mask) {
+		if (!cfs_cpt_set_node(cptab, cpt, i))
+			return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_nodemask);
+
+void
+cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{
+	int	i;
+	/* hand each node of \a mask to cfs_cpt_unset_node() in turn */
+	for_each_node_mask(i, *mask)
+		cfs_cpt_unset_node(cptab, cpt, i);
+}
+EXPORT_SYMBOL(cfs_cpt_unset_nodemask);
+
+void
+cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt)
+{
+	int	first = cpt;
+	int	last = cpt;
+	int	cpu;
+
+	/* CFS_CPT_ANY widens the range to every partition of the table */
+	if (cpt == CFS_CPT_ANY) {
+		first = 0;
+		last = cptab->ctb_nparts - 1;
+	}
+
+	for (cpt = first; cpt <= last; cpt++) {
+		for_each_cpu_mask(cpu, *cptab->ctb_parts[cpt].cpt_cpumask)
+			cfs_cpt_unset_cpu(cptab, cpt, cpu);
+	}
+}
+EXPORT_SYMBOL(cfs_cpt_clear);
+
+/* Pick a NUMA node of partition \a cpt round-robin; an out-of-range
+ * \a cpt selects among all nodes of the table. */
+int
+cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt)
+{
+	nodemask_t	*mask;
+	unsigned int	rotor;
+	int		weight;
+	int		node;
+
+	if (cpt < 0 || cpt >= cptab->ctb_nparts) {
+		mask = cptab->ctb_nodemask;
+		rotor = cptab->ctb_spread_rotor++;
+	} else {
+		mask = cptab->ctb_parts[cpt].cpt_nodemask;
+		rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++;
+	}
+
+	weight = nodes_weight(*mask);
+	LASSERT(weight > 0);
+
+	/* rotor is unsigned so a wrapped counter still maps to 0..weight-1 */
+	rotor %= weight;
+	for_each_node_mask(node, *mask) {
+		if (rotor-- == 0)
+			return node;
+	}
+
+	LBUG();	/* not reached: the mask holds more than rotor nodes */
+	return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_spread_node);
+
+int
+cfs_cpt_current(struct cfs_cpt_table *cptab, int remap)
+{
+	int	cpu = smp_processor_id();
+	int	cpt = cptab->ctb_cpu2cpt[cpu];
+
+	/* this CPU is in the table: report its partition */
+	if (cpt >= 0)
+		return cpt;
+	/* unmapped CPU and the caller accepts a negative answer */
+	if (!remap)
+		return cpt;
+
+	/* never hand a negative ID to upper layers that asked for remap:
+	 * fold the unknown CPU onto a valid partition ID instead */
+	return cpu % cptab->ctb_nparts;
+}
+EXPORT_SYMBOL(cfs_cpt_current);
+
+int
+cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu)
+{
+	LASSERT(cpu >= 0 && cpu < NR_CPUS);
+	/* negative (-1) when \a cpu is in no partition of this table */
+	return cptab->ctb_cpu2cpt[cpu];
+}
+EXPORT_SYMBOL(cfs_cpt_of_cpu);
+/* Bind the current task to the CPUs and memory nodes of partition \a cpt. */
+int
+cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
+{
+	cpumask_t	*cpumask;
+	nodemask_t	*nodemask;
+	int		rc;
+	int		i;
+
+	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+	if (cpt == CFS_CPT_ANY) {
+		cpumask = cptab->ctb_cpumask;
+		nodemask = cptab->ctb_nodemask;
+	} else {
+		cpumask = cptab->ctb_parts[cpt].cpt_cpumask;
+		nodemask = cptab->ctb_parts[cpt].cpt_nodemask;
+	}
+
+	if (any_online_cpu(*cpumask) == NR_CPUS) {
+		CERROR("No online CPU found in CPU partition %d, did someone "
+		       "do CPU hotplug on system? You might need to reload "
+		       "Lustre modules to keep system working well.\n", cpt);
+		return -EINVAL;
+	}
+	/* rebinding is only needed if some online CPU is outside the mask */
+	for_each_online_cpu(i) {
+		if (cpu_isset(i, *cpumask))
+			continue;
+		/* found one: restrict CPUs, then (whatever rc is) memory nodes */
+		rc = set_cpus_allowed(current, *cpumask);
+		set_mems_allowed(*nodemask);
+		if (rc == 0)
+			schedule(); /* switch to allowed CPU */
+
+		return rc;
+	}
+
+	/* don't need to set affinity because all online CPUs are covered */
+	return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_bind);
+
+/**
+ * Choose max to \a number CPUs from \a node and set them in \a cpt.
+ * We always prefer to choose CPU in the same core/socket.
+ * CPUs that were chosen are cleared from \a node.
+ * \retval 0		success
+ * \retval -EINVAL	a CPU could not be added to \a cpt
+ * \retval -ENOMEM	no memory for the scratch cpumasks
+ */
+static int
+cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt,
+		     cpumask_t *node, int number)
+{
+	cpumask_t	*socket = NULL;
+	cpumask_t	*core = NULL;
+	int		rc = 0;
+	int		cpu;
+
+	LASSERT(number > 0);
+
+	if (number >= cpus_weight(*node)) {
+		/* the whole node fits: take every CPU of it */
+		while (!cpus_empty(*node)) {
+			cpu = first_cpu(*node);
+			if (!cfs_cpt_set_cpu(cptab, cpt, cpu))
+				return -EINVAL;
+			cpu_clear(cpu, *node);
+		}
+		return 0;
+	}
+
+	/* allocate scratch buffer */
+	LIBCFS_ALLOC(socket, cpumask_size());
+	LIBCFS_ALLOC(core, cpumask_size());
+	if (socket == NULL || core == NULL) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	while (!cpus_empty(*node)) {
+		cpu = first_cpu(*node);
+
+		/* get cpumask for cores in the same socket */
+		cfs_cpu_core_siblings(cpu, socket);
+		cpus_and(*socket, *socket, *node);
+		LASSERT(!cpus_empty(*socket));
+
+		while (!cpus_empty(*socket)) {
+			int     i;
+
+			/* get cpumask for hts in the same core */
+			cfs_cpu_ht_siblings(cpu, core);
+			cpus_and(*core, *core, *node);
+			LASSERT(!cpus_empty(*core));
+
+			for_each_cpu_mask(i, *core) {
+				cpu_clear(i, *socket);
+				cpu_clear(i, *node);
+				/* rc stays 0 unless adding the CPU fails */
+				if (!cfs_cpt_set_cpu(cptab, cpt, i)) {
+					rc = -EINVAL;
+					goto out;
+				}
+
+				if (--number == 0)
+					goto out;
+			}
+			cpu = first_cpu(*socket);
+		}
+	}
+
+ out:
+	if (socket != NULL)
+		LIBCFS_FREE(socket, cpumask_size());
+	if (core != NULL)
+		LIBCFS_FREE(core, cpumask_size());
+	return rc;
+}
+
+#define CPT_WEIGHT_MIN  4u	/* up to this many online CPUs: one partition */
+/* Suggest a partition count from the online CPU and NUMA node counts. */
+static unsigned int
+cfs_cpt_num_estimate(void)
+{
+	unsigned nnode = num_online_nodes();
+	unsigned ncpu  = num_online_cpus();
+	unsigned ncpt;
+
+	if (ncpu <= CPT_WEIGHT_MIN) {
+		ncpt = 1;
+		goto out;
+	}
+
+	/* generate reasonable number of CPU partitions based on total number
+	 * of CPUs: the loop picks the smallest power of two N >= 2 with
+	 * NCPUS <= 2 * N^2 */
+	for (ncpt = 2; ncpu > 2 * ncpt * ncpt; ncpt <<= 1) {}
+
+	if (ncpt <= nnode) { /* fat numa system */
+		while (nnode > ncpt)
+			nnode >>= 1;
+
+	} else { /* ncpt > nnode */
+		while ((nnode << 1) <= ncpt)
+			nnode <<= 1;
+	}
+	/* nnode has been halved or doubled so that it does not exceed ncpt */
+	ncpt = nnode;
+
+ out:
+#if (BITS_PER_LONG == 32)
+	/* config many CPU partitions on 32-bit system could consume
+	 * too much memory */
+	ncpt = min(2U, ncpt);
+#endif
+	while (ncpu % ncpt != 0)
+		ncpt--; /* worst case is 1 */
+
+	return ncpt;
+}
+
+/* Build the default table: \a ncpt partitions (<= 0: use the estimate) of
+ * equal size, filled node by node. Returns NULL on failure. */
+static struct cfs_cpt_table *
+cfs_cpt_table_create(int ncpt)
+{
+	struct cfs_cpt_table *cptab = NULL;
+	cpumask_t	*mask = NULL;
+	int		cpt = 0;
+	int		num;
+	int		rc;
+	int		i;
+
+	rc = cfs_cpt_num_estimate();
+	if (ncpt <= 0)
+		ncpt = rc;
+
+	if (ncpt > num_online_cpus() || ncpt > 4 * rc) {
+		CWARN("CPU partition number %d is larger than suggested "
+		      "value (%d), your system may have performance "
+		      "issue or run out of memory while under pressure\n",
+		      ncpt, rc);
+	}
+
+	if (num_online_cpus() % ncpt != 0) {
+		CERROR("CPU number %d is not multiple of cpu_npartition %d, "
+		       "please try different cpu_npartitions value or "
+		       "set pattern string by cpu_pattern=STRING\n",
+		       (int)num_online_cpus(), ncpt);
+		goto failed;
+	}
+
+	cptab = cfs_cpt_table_alloc(ncpt);
+	if (cptab == NULL) {
+		CERROR("Failed to allocate CPU map(%d)\n", ncpt);
+		goto failed;
+	}
+
+	/* CPUs per partition */
+	num = num_online_cpus() / ncpt;
+	if (num == 0) {
+		CERROR("CPU changed while setting CPU partition\n");
+		goto failed;
+	}
+
+	LIBCFS_ALLOC(mask, cpumask_size());
+	if (mask == NULL) {
+		CERROR("Failed to allocate scratch cpumask\n");
+		goto failed;
+	}
+
+	for_each_online_node(i) {
+		cfs_node_to_cpumask(i, mask);
+		/* spread this node's CPUs over the partitions in order */
+		while (!cpus_empty(*mask)) {
+			struct cfs_cpu_partition *part;
+			int    n;
+
+			if (cpt >= ncpt)
+				goto failed;
+
+			part = &cptab->ctb_parts[cpt];
+			/* CPUs still missing from the current partition */
+			n = num - cpus_weight(*part->cpt_cpumask);
+			LASSERT(n > 0);
+
+			rc = cfs_cpt_choose_ncpus(cptab, cpt, mask, n);
+			if (rc < 0)
+				goto failed;
+
+			LASSERT(num >= cpus_weight(*part->cpt_cpumask));
+			if (num == cpus_weight(*part->cpt_cpumask))
+				cpt++;
+		}
+	}
+
+	if (cpt != ncpt ||
+	    num != cpus_weight(*cptab->ctb_parts[ncpt - 1].cpt_cpumask)) {
+		CERROR("Expect %d(%d) CPU partitions but got %d(%d), "
+		       "CPU hotplug/unplug while setting?\n",
+		       cptab->ctb_nparts, num, cpt,
+		       cpus_weight(*cptab->ctb_parts[ncpt - 1].cpt_cpumask));
+		goto failed;
+	}
+
+	LIBCFS_FREE(mask, cpumask_size());
+	return cptab;
+
+ failed:
+	CERROR("Failed to setup CPU-partition-table with %d "
+	       "CPU-partitions, online HW nodes: %d, HW cpus: %d.\n",
+	       ncpt, num_online_nodes(), num_online_cpus());
+
+	if (mask != NULL)
+		LIBCFS_FREE(mask, cpumask_size());
+	if (cptab != NULL)
+		cfs_cpt_table_free(cptab);
+	return NULL;
+}
+/* Build a table from a pattern such as "N 0[0,1] 1[2,3]"; NULL on error. */
+static struct cfs_cpt_table *
+cfs_cpt_table_create_pattern(char *pattern)
+{
+	struct cfs_cpt_table	*cptab;
+	char			*str	= pattern;
+	int			node	= 0;
+	int			high;
+	int			ncpt;
+	int			c;
+	/* the number of '[' gives the number of partitions described */
+	for (ncpt = 0;; ncpt++) { /* quick scan bracket */
+		str = strchr(str, '[');
+		if (str == NULL)
+			break;
+		str++;
+	}
+	/* a leading 'n'/'N' means the bracketed numbers are NUMA node IDs */
+	str = cfs_trimwhite(pattern);
+	if (*str == 'n' || *str == 'N') {
+		pattern = str + 1;
+		node = 1;
+	}
+
+	if (ncpt == 0 ||
+	    (node && ncpt > num_online_nodes()) ||
+	    (!node && ncpt > num_online_cpus())) {
+		CERROR("Invalid pattern %s, or too many partitions %d\n",
+		       pattern, ncpt);
+		return NULL;
+	}
+	/* largest ID accepted inside the brackets */
+	high = node ? MAX_NUMNODES - 1 : NR_CPUS - 1;
+
+	cptab = cfs_cpt_table_alloc(ncpt);
+	if (cptab == NULL) {
+		CERROR("Failed to allocate cpu partition table\n");
+		return NULL;
+	}
+
+	for (str = cfs_trimwhite(pattern), c = 0;; c++) {
+		struct cfs_range_expr	*range;
+		struct cfs_expr_list	*el;
+		char			*bracket = strchr(str, '[');
+		int			cpt;
+		int			rc;
+		int			i;
+		int			n;
+		/* no bracket left: the rest must be empty and c == ncpt */
+		if (bracket == NULL) {
+			if (*str != 0) {
+				CERROR("Invalid pattern %s\n", str);
+				goto failed;
+			} else if (c != ncpt) {
+				CERROR("expect %d partitions but found %d\n",
+				       ncpt, c);
+				goto failed;
+			}
+			break;
+		}
+		/* partition ID before the bracket; n = characters consumed */
+		if (sscanf(str, "%u%n", &cpt, &n) < 1) { /* NOTE(review): %u into int */
+			CERROR("Invalid cpu pattern %s\n", str);
+			goto failed;
+		}
+
+		if (cpt < 0 || cpt >= ncpt) {
+			CERROR("Invalid partition id %d, total partitions %d\n",
+			       cpt, ncpt);
+			goto failed;
+		}
+
+		if (cfs_cpt_weight(cptab, cpt) != 0) {
+			CERROR("Partition %d has already been set.\n", cpt);
+			goto failed;
+		}
+		/* only whitespace may sit between the ID and its '[' */
+		str = cfs_trimwhite(str + n);
+		if (str != bracket) {
+			CERROR("Invalid pattern %s\n", str);
+			goto failed;
+		}
+
+		bracket = strchr(str, ']');
+		if (bracket == NULL) {
+			CERROR("missing right bracket for cpt %d, %s\n",
+			       cpt, str);
+			goto failed;
+		}
+		/* parse the bracketed list, both brackets included */
+		if (cfs_expr_list_parse(str, (bracket - str) + 1,
+					0, high, &el) != 0) {
+			CERROR("Can't parse number range: %s\n", str);
+			goto failed;
+		}
+		/* visit re_lo..re_hi of every range in steps of re_stride */
+		list_for_each_entry(range, &el->el_exprs, re_link) {
+			for (i = range->re_lo; i <= range->re_hi; i++) {
+				if ((i - range->re_lo) % range->re_stride != 0)
+					continue;
+
+				rc = node ? cfs_cpt_set_node(cptab, cpt, i) :
+					    cfs_cpt_set_cpu(cptab, cpt, i);
+				if (!rc) {
+					cfs_expr_list_free(el);
+					goto failed;
+				}
+			}
+		}
+
+		cfs_expr_list_free(el);
+
+		if (!cfs_cpt_online(cptab, cpt)) {
+			CERROR("No online CPU is found on partition %d\n", cpt);
+			goto failed;
+		}
+		/* continue after the closing bracket */
+		str = cfs_trimwhite(bracket + 1);
+	}
+
+	return cptab;
+
+ failed:
+	cfs_cpt_table_free(cptab);
+	return NULL;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/* Hotplug callback: bump cpt_version on CPU online/dead, warn on any event. */
+static int
+cfs_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+	unsigned int  cpu = (unsigned long)hcpu;
+	switch (action) {
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		spin_lock(&cpt_data.cpt_lock);
+		cpt_data.cpt_version++;
+		spin_unlock(&cpt_data.cpt_lock);
+		/* fall through - every notification is reported below */
+	default:
+		CWARN("Lustre: can't support CPU hotplug well now, "
+		      "performance and stability could be impacted "
+		      "[CPU %u notify: %lx]\n", cpu, action);
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cfs_cpu_notifier = {
+	.notifier_call	= cfs_cpu_notify,	/* registered in cfs_cpu_init() */
+	.priority	= 0
+};
+
+#endif
+
+void
+cfs_cpu_fini(void)	/* undoes cfs_cpu_init(); also used as its error path */
+{
+	if (cfs_cpt_table != NULL)
+		cfs_cpt_table_free(cfs_cpt_table);
+	cfs_cpt_table = NULL;	/* don't leave the global dangling */
+#ifdef CONFIG_HOTPLUG_CPU
+	unregister_hotcpu_notifier(&cfs_cpu_notifier);
+#endif
+	if (cpt_data.cpt_cpumask != NULL)
+		LIBCFS_FREE(cpt_data.cpt_cpumask, cpumask_size());
+}
+/* Set up cpt_data and the global cfs_cpt_table; returns 0 or -1. */
+int
+cfs_cpu_init(void)
+{
+	LASSERT(cfs_cpt_table == NULL);
+
+	memset(&cpt_data, 0, sizeof(cpt_data));
+
+	LIBCFS_ALLOC(cpt_data.cpt_cpumask, cpumask_size());
+	if (cpt_data.cpt_cpumask == NULL) {
+		CERROR("Failed to allocate scratch buffer\n");
+		return -1;
+	}
+
+	spin_lock_init(&cpt_data.cpt_lock);
+	sema_init(&cpt_data.cpt_mutex, 1);
+	/* registered before the table is built, so hotplug bumps cpt_version */
+#ifdef CONFIG_HOTPLUG_CPU
+	register_hotcpu_notifier(&cfs_cpu_notifier);
+#endif
+	/* a non-empty cpu_pattern takes precedence over cpu_npartitions */
+	if (*cpu_pattern != 0) {
+		cfs_cpt_table = cfs_cpt_table_create_pattern(cpu_pattern);
+		if (cfs_cpt_table == NULL) {
+			CERROR("Failed to create cptab from pattern %s\n",
+			       cpu_pattern);
+			goto failed;
+		}
+
+	} else {
+		cfs_cpt_table = cfs_cpt_table_create(cpu_npartitions);
+		if (cfs_cpt_table == NULL) {
+			CERROR("Failed to create ptable with npartitions %d\n",
+			       cpu_npartitions);
+			goto failed;
+		}
+	}
+	/* the table recorded cpt_version when allocated; a change = hotplug */
+	spin_lock(&cpt_data.cpt_lock);
+	if (cfs_cpt_table->ctb_version != cpt_data.cpt_version) {
+		spin_unlock(&cpt_data.cpt_lock);
+		CERROR("CPU hotplug/unplug during setup\n");
+		goto failed;
+	}
+	spin_unlock(&cpt_data.cpt_lock);
+
+	LCONSOLE(0, "HW CPU cores: %d, npartitions: %d\n",
+		 num_online_cpus(), cfs_cpt_number(cfs_cpt_table));
+	return 0;
+
+ failed:
+	cfs_cpu_fini();
+	return -1;
+}
+
+#endif
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c
new file mode 100644
index 000000000000..20b2d61d9ff2
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c
@@ -0,0 +1,144 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ */
+
+/*
+ * These are crypto API shash wrappers around zlib_adler32.
+ */
+
+#include <linux/module.h>
+#include <linux/zutil.h>
+#include <crypto/internal/hash.h>
+
+
+#define CHKSUM_BLOCK_SIZE	1	/* used as .cra_blocksize below */
+#define CHKSUM_DIGEST_SIZE	4	/* the digest is written as one u32 */
+
+
+static u32 __adler32(u32 cksum, unsigned char const *p, size_t len)
+{
+	return zlib_adler32(cksum, p, len);	/* continue cksum over p[0..len) */
+}
+
+static int adler32_cra_init(struct crypto_tfm *tfm)
+{
+	u32 *key = crypto_tfm_ctx(tfm);
+	/* default seed until setkey is called; 1 is Adler-32's start value */
+	*key = 1;
+
+	return 0;
+}
+
+static int adler32_setkey(struct crypto_shash *hash, const u8 *key,
+			  unsigned int keylen)
+{
+	u32 *seed = crypto_shash_ctx(hash);
+	/* accept exactly one 32-bit seed, taken in CPU byte order */
+	if (keylen == sizeof(u32)) {
+		*seed = *(u32 *)key;
+		return 0;
+	}
+	crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+	return -EINVAL;
+}
+
+static int adler32_init(struct shash_desc *desc)
+{
+	u32 *seed = crypto_shash_ctx(desc->tfm);
+	u32 *state = shash_desc_ctx(desc);
+
+	/* each request starts from the transform's current seed */
+	*state = *seed;
+	return 0;
+}
+
+static int adler32_update(struct shash_desc *desc, const u8 *data,
+			  unsigned int len)
+{
+	u32 *cksump = shash_desc_ctx(desc);
+	/* fold \a data into the running checksum kept in the descriptor */
+	*cksump = __adler32(*cksump, data, len);
+	return 0;
+}
+static int __adler32_finup(u32 *cksump, const u8 *data, unsigned int len,
+			   u8 *out)
+{
+	*(u32 *)out = __adler32(*cksump, data, len);	/* *cksump is not updated */
+	return 0;
+}
+
+static int adler32_finup(struct shash_desc *desc, const u8 *data,
+			 unsigned int len, u8 *out)
+{
+	return __adler32_finup(shash_desc_ctx(desc), data, len, out); /* running sum + data */
+}
+
+static int adler32_final(struct shash_desc *desc, u8 *out)
+{
+	u32 *cksump = shash_desc_ctx(desc);
+	/* emit the running checksum as it is, in CPU byte order */
+	*(u32 *)out = *cksump;
+	return 0;
+}
+
+static int adler32_digest(struct shash_desc *desc, const u8 *data,
+			  unsigned int len, u8 *out)
+{
+	return __adler32_finup(crypto_shash_ctx(desc->tfm), data, len,
+				    out);	/* one shot, seeded from the tfm context */
+}
+static struct shash_alg alg = {
+	.setkey		= adler32_setkey,
+	.init		= adler32_init,
+	.update		= adler32_update,
+	.final		= adler32_final,
+	.finup		= adler32_finup,
+	.digest		= adler32_digest,
+	.descsize	= sizeof(u32),	/* running checksum per request */
+	.digestsize	= CHKSUM_DIGEST_SIZE,
+	.base		= {
+		.cra_name		= "adler32",
+		.cra_driver_name	= "adler32-zlib",
+		.cra_priority		= 100,
+		.cra_blocksize		= CHKSUM_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(u32),	/* seed per transform */
+		.cra_module		= THIS_MODULE,
+		.cra_init		= adler32_cra_init,
+	}
+};
+
+
+int cfs_crypto_adler32_register(void)
+{
+	return crypto_register_shash(&alg);	/* result passed to the caller */
+}
+EXPORT_SYMBOL(cfs_crypto_adler32_register);
+
+void cfs_crypto_adler32_unregister(void)
+{
+	crypto_unregister_shash(&alg);	/* its return value is ignored */
+}
+EXPORT_SYMBOL(cfs_crypto_adler32_unregister);
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-crc32.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-crc32.c
new file mode 100644
index 000000000000..83af630c06c6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-crc32.c
@@ -0,0 +1,149 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ */
+
+/*
+ * These are crypto API shash wrappers around crc32_le.
+ */
+
+#include <linux/module.h>
+#include <linux/crc32.h>
+#include <crypto/internal/hash.h>
+
+#define CHKSUM_BLOCK_SIZE	1	/* used as .cra_blocksize below */
+#define CHKSUM_DIGEST_SIZE	4	/* the digest is written as one __le32 */
+
+static u32 __crc32_le(u32 crc, unsigned char const *p, size_t len)
+{
+	return crc32_le(crc, p, len);	/* continue crc over p[0..len) */
+}
+
+/** No default init with ~0: the seed starts at 0 until setkey changes it */
+static int crc32_cra_init(struct crypto_tfm *tfm)
+{
+	u32 *key = crypto_tfm_ctx(tfm);
+
+	*key = 0;
+
+	return 0;
+}
+
+
+/*
+ * Setting the seed allows arbitrary accumulators and flexible XOR policy
+ * If your algorithm starts with ~0, then XOR with ~0 before you set
+ * the seed.
+ */
+static int crc32_setkey(struct crypto_shash *hash, const u8 *key,
+			unsigned int keylen)
+{
+	u32 *seed = crypto_shash_ctx(hash);
+	/* accept exactly one 32-bit little-endian seed */
+	if (keylen == sizeof(u32)) {
+		*seed = le32_to_cpup((__le32 *)key);
+		return 0;
+	}
+	crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+	return -EINVAL;
+}
+
+static int crc32_init(struct shash_desc *desc)
+{
+	u32 *seed = crypto_shash_ctx(desc->tfm);
+	u32 *state = shash_desc_ctx(desc);
+
+	/* each request starts from the transform's current seed */
+	*state = *seed;
+	return 0;
+}
+
+static int crc32_update(struct shash_desc *desc, const u8 *data,
+			unsigned int len)
+{
+	u32 *crcp = shash_desc_ctx(desc);
+	/* fold \a data into the running CRC kept in the descriptor */
+	*crcp = __crc32_le(*crcp, data, len);
+	return 0;
+}
+/* No final XOR 0xFFFFFFFF, like crc32_le; the result is stored as __le32 */
+static int __crc32_finup(u32 *crcp, const u8 *data, unsigned int len,
+			 u8 *out)
+{
+	*(__le32 *)out = cpu_to_le32(__crc32_le(*crcp, data, len));
+	return 0;
+}
+
+static int crc32_finup(struct shash_desc *desc, const u8 *data,
+		       unsigned int len, u8 *out)
+{
+	return __crc32_finup(shash_desc_ctx(desc), data, len, out); /* running crc + data */
+}
+
+static int crc32_final(struct shash_desc *desc, u8 *out)
+{
+	u32 *crcp = shash_desc_ctx(desc);
+	/* emit the running CRC converted to little-endian */
+	*(__le32 *)out = cpu_to_le32p(crcp);
+	return 0;
+}
+
+static int crc32_digest(struct shash_desc *desc, const u8 *data,
+			unsigned int len, u8 *out)
+{
+	return __crc32_finup(crypto_shash_ctx(desc->tfm), data, len,
+			     out);	/* one shot, seeded from the tfm context */
+}
+static struct shash_alg alg = {
+	.setkey		= crc32_setkey,
+	.init		= crc32_init,
+	.update		= crc32_update,
+	.final		= crc32_final,
+	.finup		= crc32_finup,
+	.digest		= crc32_digest,
+	.descsize	= sizeof(u32),	/* running CRC per request */
+	.digestsize	= CHKSUM_DIGEST_SIZE,
+	.base		= {
+		.cra_name		= "crc32",
+		.cra_driver_name	= "crc32-table",
+		.cra_priority		= 100,
+		.cra_blocksize		= CHKSUM_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(u32),	/* seed per transform */
+		.cra_module		= THIS_MODULE,
+		.cra_init		= crc32_cra_init,
+	}
+};
+
+int cfs_crypto_crc32_register(void)
+{
+	return crypto_register_shash(&alg);	/* result passed to the caller */
+}
+EXPORT_SYMBOL(cfs_crypto_crc32_register);
+
+void cfs_crypto_crc32_unregister(void)
+{
+	crypto_unregister_shash(&alg);	/* its return value is ignored */
+}
+EXPORT_SYMBOL(cfs_crypto_crc32_unregister);
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-crc32pclmul.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-crc32pclmul.c
new file mode 100644
index 000000000000..dd29aa502980
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-crc32pclmul.c
@@ -0,0 +1,193 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Wrappers for kernel crypto shash api to pclmulqdq crc32 implementation.
+ *
+ * Author:     Alexander Boyko <Alexander_Boyko@xyratex.com>
+ */
+#include <linux/crc32.h>
+#include <crypto/internal/hash.h>
+#include <linux/crc32.h>
+#include <asm/cpufeature.h>
+#include <asm/i387.h>
+#include <linux/libcfs/libcfs.h>
+
+/* Values reported to the crypto API as cra_blocksize and digestsize. */
+#define CHKSUM_BLOCK_SIZE	1
+#define CHKSUM_DIGEST_SIZE	4
+
+#define PCLMUL_MIN_LEN		64L     /* minimum size of buffer
+					 * for crc32_pclmul_le_16 */
+#define SCALE_F			16L	/* size of xmm register */
+#define SCALE_F_MASK		(SCALE_F - 1)
+
+/*
+ * Not defined in this file.  crc32_pclmul_le() only calls it with a
+ * 16-byte aligned buffer whose length is a multiple of 16 and at least
+ * PCLMUL_MIN_LEN, inside a kernel_fpu_begin()/kernel_fpu_end() section.
+ * NOTE(review): presumably the PCLMULQDQ assembly routine - confirm.
+ */
+u32 crc32_pclmul_le_16(unsigned char const *buffer, size_t len, u32 crc32);
+
+/*
+ * Compute a little-endian CRC32 over @len bytes at @p, continuing from @crc.
+ *
+ * Buffers shorter than PCLMUL_MIN_LEN + SCALE_F_MASK bytes, and any call
+ * made from a context in which the FPU may not be used, are handled
+ * entirely by crc32_le().  Otherwise an unaligned head (at most 15 bytes)
+ * and a tail shorter than 16 bytes go through crc32_le(), while the
+ * 16-byte aligned middle is handed to crc32_pclmul_le_16() between
+ * kernel_fpu_begin() and kernel_fpu_end().
+ *
+ * Returns the updated CRC.
+ */
+static u32 __attribute__((pure))
+	crc32_pclmul_le(u32 crc, unsigned char const *p, size_t len)
+{
+	unsigned int iquotient;
+	unsigned int iremainder;
+	unsigned int prealign;
+
+	/*
+	 * kernel_fpu_begin() must not be entered when irq_fpu_usable()
+	 * reports that the FPU state cannot be used here, so take the
+	 * generic path in that case as well.
+	 */
+	if (len < PCLMUL_MIN_LEN + SCALE_F_MASK || !irq_fpu_usable())
+		return crc32_le(crc, p, len);
+
+	if ((long)p & SCALE_F_MASK) {
+		/* align p to 16 byte */
+		prealign = SCALE_F - ((long)p & SCALE_F_MASK);
+
+		crc = crc32_le(crc, p, prealign);
+		len -= prealign;
+		p = (unsigned char *)(((unsigned long)p + SCALE_F_MASK) &
+				     ~SCALE_F_MASK);
+	}
+	/* Split what is left into whole 16-byte blocks and a short tail. */
+	iquotient = len & (~SCALE_F_MASK);
+	iremainder = len & SCALE_F_MASK;
+
+	kernel_fpu_begin();
+	crc = crc32_pclmul_le_16(p, iquotient, crc);
+	kernel_fpu_end();
+
+	if (iremainder)
+		crc = crc32_le(crc, p + iquotient, iremainder);
+
+	return crc;
+}
+
+/* cra_init hook: a freshly allocated tfm starts with a zero seed. */
+static int crc32_pclmul_cra_init(struct crypto_tfm *tfm)
+{
+	u32 *seed = crypto_tfm_ctx(tfm);
+
+	*seed = 0;
+	return 0;
+}
+
+/*
+ * Setting the seed allows arbitrary accumulators and flexible XOR policy.
+ * If your algorithm starts with ~0, then XOR with ~0 before you set
+ * the seed.
+ *
+ * The key must be exactly sizeof(u32) bytes and is read as little-endian;
+ * any other length sets CRYPTO_TFM_RES_BAD_KEY_LEN and fails with -EINVAL.
+ */
+static int crc32_pclmul_setkey(struct crypto_shash *hash, const u8 *key,
+			unsigned int keylen)
+{
+	u32 *seed;
+
+	if (keylen == sizeof(u32)) {
+		seed = crypto_shash_ctx(hash);
+		*seed = le32_to_cpup((__le32 *)key);
+		return 0;
+	}
+
+	crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+	return -EINVAL;
+}
+
+/* shash ->init(): start a new hash from the seed kept in the tfm context. */
+static int crc32_pclmul_init(struct shash_desc *desc)
+{
+	const u32 *seed = crypto_shash_ctx(desc->tfm);
+	u32 *state = shash_desc_ctx(desc);
+
+	*state = *seed;
+	return 0;
+}
+
+/* shash ->update(): fold @len bytes at @data into the running CRC state. */
+static int crc32_pclmul_update(struct shash_desc *desc, const u8 *data,
+			       unsigned int len)
+{
+	u32 *state = shash_desc_ctx(desc);
+	u32 folded = crc32_pclmul_le(*state, data, len);
+
+	*state = folded;
+	return 0;
+}
+
+/*
+ * Extend the CRC in *crcp over @data and store the result in @out as a
+ * little-endian 32-bit value.  No final XOR 0xFFFFFFFF, like crc32_le.
+ * *crcp itself is not modified.
+ */
+static int __crc32_pclmul_finup(u32 *crcp, const u8 *data, unsigned int len,
+				u8 *out)
+{
+	u32 crc = crc32_pclmul_le(*crcp, data, len);
+	__le32 *digest = (__le32 *)out;
+
+	*digest = cpu_to_le32(crc);
+	return 0;
+}
+
+/* shash ->finup(): hash the last chunk on top of the descriptor state. */
+static int crc32_pclmul_finup(struct shash_desc *desc, const u8 *data,
+			      unsigned int len, u8 *out)
+{
+	u32 *state = shash_desc_ctx(desc);
+
+	return __crc32_pclmul_finup(state, data, len, out);
+}
+
+/* shash ->final(): emit the accumulated CRC as a little-endian u32. */
+static int crc32_pclmul_final(struct shash_desc *desc, u8 *out)
+{
+	const u32 *state = shash_desc_ctx(desc);
+	__le32 *digest = (__le32 *)out;
+
+	*digest = cpu_to_le32(*state);
+	return 0;
+}
+
+/*
+ * shash ->digest(): one-shot hash of @data, seeded from the tfm context
+ * rather than from the descriptor state.
+ */
+static int crc32_pclmul_digest(struct shash_desc *desc, const u8 *data,
+			       unsigned int len, u8 *out)
+{
+	u32 *seed = crypto_shash_ctx(desc->tfm);
+
+	return __crc32_pclmul_finup(seed, data, len, out);
+}
+
+/*
+ * shash algorithm descriptor passed to crypto_register_shash() by
+ * cfs_crypto_crc32_pclmul_register().  It is published under the generic
+ * name "crc32" with driver name "crc32-pclmul" and priority 200; both the
+ * per-descriptor state and the per-tfm context are a single u32.
+ */
+static struct shash_alg alg = {
+	.setkey		= crc32_pclmul_setkey,
+	.init		= crc32_pclmul_init,
+	.update		= crc32_pclmul_update,
+	.final		= crc32_pclmul_final,
+	.finup		= crc32_pclmul_finup,
+	.digest		= crc32_pclmul_digest,
+	.descsize	= sizeof(u32),
+	.digestsize	= CHKSUM_DIGEST_SIZE,
+	.base		= {
+			.cra_name		= "crc32",
+			.cra_driver_name	= "crc32-pclmul",
+			.cra_priority		= 200,
+			.cra_blocksize		= CHKSUM_BLOCK_SIZE,
+			.cra_ctxsize		= sizeof(u32),
+			.cra_module		= THIS_MODULE,
+			.cra_init		= crc32_pclmul_cra_init,
+	}
+};
+
+/* Fallback for builds whose headers do not already define this feature bit. */
+#ifndef X86_FEATURE_PCLMULQDQ
+#define X86_FEATURE_PCLMULQDQ	(4 * 32 + 1)	/* PCLMULQDQ instruction */
+#endif
+
+/*
+ * Register the "crc32-pclmul" shash algorithm, but only when the boot CPU
+ * reports X86_FEATURE_PCLMULQDQ; otherwise return -ENODEV.
+ */
+int cfs_crypto_crc32_pclmul_register(void)
+{
+	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ))
+		return crypto_register_shash(&alg);
+
+	CDEBUG(D_INFO, "PCLMULQDQ-NI instructions are not "
+	       "detected.\n");
+	return -ENODEV;
+}
+
+/*
+ * Remove the "crc32-pclmul" shash algorithm from the kernel crypto API.
+ * NOTE(review): presumably only valid after a successful
+ * cfs_crypto_crc32_pclmul_register() - confirm against callers.
+ */
+void cfs_crypto_crc32_pclmul_unregister(void)
+{
+	crypto_unregister_shash(&alg);
+}
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c
new file mode 100644
index 000000000000..f3899bd94408
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c
@@ -0,0 +1,305 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include <linux/libcfs/libcfs.h>
+#include <linux/libcfs/linux/linux-crypto.h>
+/**
+ * Measured speed of each hash algorithm in MB/s, indexed by algorithm id.
+ * Filled in by cfs_crypto_performance_test(); -1 marks an algorithm whose
+ * test digest failed.
+ */
+static int cfs_crypto_hash_speeds[CFS_HASH_ALG_MAX];
+
+
+
+/**
+ * Allocate and initialise a crypto hash transform for \a alg_id.
+ *
+ * \param alg_id	hash algorithm id, resolved by cfs_crypto_hash_type()
+ * \param type		set to the algorithm descriptor (NULL if unknown)
+ * \param desc		hash descriptor to fill in; on success the caller
+ *			owns desc->tfm and must crypto_free_hash() it
+ * \param key		optional key/seed; when NULL the algorithm's
+ *			default cht_key is installed if it is non-zero
+ * \param key_len	length of \a key in bytes
+ *
+ * \retval 0		success
+ * \retval negative	errno; no transform is left allocated
+ */
+static int cfs_crypto_hash_alloc(unsigned char alg_id,
+				 const struct cfs_crypto_hash_type **type,
+				 struct hash_desc *desc, unsigned char *key,
+				 unsigned int key_len)
+{
+	int     err = 0;
+
+	*type = cfs_crypto_hash_type(alg_id);
+
+	if (*type == NULL) {
+		CWARN("Unsupported hash algorithm id = %d, max id is %d\n",
+		      alg_id, CFS_HASH_ALG_MAX);
+		return -EINVAL;
+	}
+	desc->tfm = crypto_alloc_hash((*type)->cht_name, 0, 0);
+
+	if (desc->tfm == NULL)
+		return -EINVAL;
+
+	if (IS_ERR(desc->tfm)) {
+		CDEBUG(D_INFO, "Failed to alloc crypto hash %s\n",
+		       (*type)->cht_name);
+		return PTR_ERR(desc->tfm);
+	}
+
+	desc->flags = 0;
+
+	/** Shash has different initialization logic than digest:
+	 * shash: crypto_hash_setkey, crypto_hash_init
+	 * digest: crypto_digest_init, crypto_digest_setkey
+	 * The shash order (setkey first, then init) is the one used here.
+	 */
+	if (key != NULL) {
+		err = crypto_hash_setkey(desc->tfm, key, key_len);
+	} else if ((*type)->cht_key != 0) {
+		err = crypto_hash_setkey(desc->tfm,
+					 (unsigned char *)&((*type)->cht_key),
+					 (*type)->cht_size);
+	}
+
+	if (err != 0)
+		goto out_free;
+
+	CDEBUG(D_INFO, "Using crypto hash: %s (%s) speed %d MB/s\n",
+	       (crypto_hash_tfm(desc->tfm))->__crt_alg->cra_name,
+	       (crypto_hash_tfm(desc->tfm))->__crt_alg->cra_driver_name,
+	       cfs_crypto_hash_speeds[alg_id]);
+
+	err = crypto_hash_init(desc);
+	if (err != 0)
+		goto out_free;
+
+	return 0;
+
+out_free:
+	/* Callers do not release the transform when this function fails. */
+	crypto_free_hash(desc->tfm);
+	return err;
+}
+
+/**
+ * One-shot digest of a single buffer.
+ *
+ * \param alg_id	hash algorithm id
+ * \param buf		data to hash; must not be NULL
+ * \param buf_len	length of \a buf; must not be 0
+ * \param key		optional key/seed, forwarded to cfs_crypto_hash_alloc()
+ * \param key_len	length of \a key
+ * \param hash		output buffer for the digest
+ * \param hash_len	in: size of \a hash; set to the required size when
+ *			-ENOSPC is returned
+ *
+ * \retval 0		success
+ * \retval -EINVAL	\a buf is NULL, \a buf_len is 0 or \a hash_len is NULL
+ * \retval -ENOSPC	\a hash is NULL or smaller than the digest
+ * \retval negative	other errno from the allocation or the digest call
+ */
+int cfs_crypto_hash_digest(unsigned char alg_id,
+			   const void *buf, unsigned int buf_len,
+			   unsigned char *key, unsigned int key_len,
+			   unsigned char *hash, unsigned int *hash_len)
+{
+	const struct cfs_crypto_hash_type	*type;
+	struct hash_desc	hdesc;
+	struct scatterlist	sg;
+	int			rc;
+
+	if (buf == NULL || buf_len == 0 || hash_len == NULL)
+		return -EINVAL;
+
+	rc = cfs_crypto_hash_alloc(alg_id, &type, &hdesc, key, key_len);
+	if (rc != 0)
+		return rc;
+
+	if (hash == NULL || *hash_len < type->cht_size) {
+		*hash_len = type->cht_size;
+		rc = -ENOSPC;
+		goto out_free;
+	}
+
+	sg_init_one(&sg, (void *)buf, buf_len);
+	hdesc.flags = 0;
+	rc = crypto_hash_digest(&hdesc, &sg, sg.length, hash);
+
+out_free:
+	crypto_free_hash(hdesc.tfm);
+	return rc;
+}
+EXPORT_SYMBOL(cfs_crypto_hash_digest);
+
+/**
+ * Start an incremental hash.
+ *
+ * Allocates a hash descriptor and its transform.  Data is then added
+ * with cfs_crypto_hash_update() / cfs_crypto_hash_update_page(), and the
+ * descriptor is released by cfs_crypto_hash_final().
+ *
+ * \param alg_id	hash algorithm id
+ * \param key		optional key/seed, forwarded to cfs_crypto_hash_alloc()
+ * \param key_len	length of \a key
+ *
+ * \retval descriptor	on success
+ * \retval ERR_PTR(-errno) on failure
+ */
+struct cfs_crypto_hash_desc *
+	cfs_crypto_hash_init(unsigned char alg_id,
+			     unsigned char *key, unsigned int key_len)
+{
+	const struct cfs_crypto_hash_type       *type;
+	struct hash_desc	*hdesc;
+	int			err;
+
+	/*
+	 * A gfp mask of 0 forbids sleeping and reclaim.  GFP_KERNEL is
+	 * used instead.
+	 * NOTE(review): assumes every caller may sleep, as the transform
+	 * allocation below presumably does anyway - confirm.
+	 */
+	hdesc = kmalloc(sizeof(*hdesc), GFP_KERNEL);
+	if (hdesc == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	err = cfs_crypto_hash_alloc(alg_id, &type, hdesc, key, key_len);
+	if (err) {
+		kfree(hdesc);
+		return ERR_PTR(err);
+	}
+	return (struct cfs_crypto_hash_desc *)hdesc;
+}
+EXPORT_SYMBOL(cfs_crypto_hash_init);
+
+/**
+ * Add \a len bytes of \a page to an incremental hash.  Only the in-page
+ * part of \a offset (offset & ~CFS_PAGE_MASK) is used.
+ *
+ * Returns the result of crypto_hash_update().
+ */
+int cfs_crypto_hash_update_page(struct cfs_crypto_hash_desc *hdesc,
+				struct page *page, unsigned int offset,
+				unsigned int len)
+{
+	struct hash_desc *hd = (struct hash_desc *)hdesc;
+	unsigned int in_page = offset & ~CFS_PAGE_MASK;
+	struct scatterlist sg;
+
+	sg_init_table(&sg, 1);
+	sg_set_page(&sg, page, len, in_page);
+
+	return crypto_hash_update(hd, &sg, sg.length);
+}
+EXPORT_SYMBOL(cfs_crypto_hash_update_page);
+
+/**
+ * Add \a buf_len bytes at \a buf to an incremental hash.
+ *
+ * Returns the result of crypto_hash_update().
+ */
+int cfs_crypto_hash_update(struct cfs_crypto_hash_desc *hdesc,
+			   const void *buf, unsigned int buf_len)
+{
+	struct hash_desc *hd = (struct hash_desc *)hdesc;
+	struct scatterlist sg;
+
+	sg_init_one(&sg, (void *)buf, buf_len);
+
+	return crypto_hash_update(hd, &sg, sg.length);
+}
+EXPORT_SYMBOL(cfs_crypto_hash_update);
+
+/**
+ * Finish an incremental hash and release its descriptor.
+ *
+ * If \a hash_len is NULL the descriptor is destroyed without producing a
+ * digest.  If \a hash is NULL or too small, \a *hash_len is set to the
+ * digest size and -ENOSPC is returned.  In that case, and when
+ * crypto_hash_final() fails, the descriptor is NOT released, so the
+ * caller can fix the problem and call again.
+ */
+int cfs_crypto_hash_final(struct cfs_crypto_hash_desc *hdesc,
+			  unsigned char *hash, unsigned int *hash_len)
+{
+	struct hash_desc *hd = (struct hash_desc *)hdesc;
+	int     size = crypto_hash_digestsize(hd->tfm);
+	int     err = 0;
+
+	if (hash_len != NULL) {
+		if (hash == NULL || *hash_len < size) {
+			*hash_len = size;
+			return -ENOSPC;
+		}
+
+		err = crypto_hash_final(hd, hash);
+		if (err < 0) {
+			/* May be caller can fix error */
+			return err;
+		}
+	}
+
+	crypto_free_hash(hd->tfm);
+	kfree(hdesc);
+	return err;
+}
+EXPORT_SYMBOL(cfs_crypto_hash_final);
+
+/**
+ * Measure the throughput of one hash algorithm.
+ *
+ * Digests \a buf repeatedly for about one second and stores the resulting
+ * speed in MB/s in cfs_crypto_hash_speeds[alg_id], or -1 if a digest call
+ * fails.
+ *
+ * \param alg_id	hash algorithm id to test
+ * \param buf		test data
+ * \param buf_len	length of \a buf in bytes
+ */
+static void cfs_crypto_performance_test(unsigned char alg_id,
+					const unsigned char *buf,
+					unsigned int buf_len)
+{
+	unsigned long		   start, end;
+	int			     bcount, err = 0;
+	int			     sec = 1; /* do test only 1 sec */
+	unsigned char		   hash[64];
+	unsigned int		    hash_len = 64;
+
+	for (start = jiffies, end = start + sec * HZ, bcount = 0;
+	     time_before(jiffies, end); bcount++) {
+		err = cfs_crypto_hash_digest(alg_id, buf, buf_len, NULL, 0,
+					     hash, &hash_len);
+		if (err)
+			break;
+
+	}
+	end = jiffies;
+
+	if (err) {
+		cfs_crypto_hash_speeds[alg_id] =  -1;
+		CDEBUG(D_INFO, "Crypto hash algorithm %s, err = %d\n",
+		       cfs_crypto_hash_name(alg_id), err);
+	} else {
+		unsigned long   tmp;
+
+		/*
+		 * Widen before multiplying: bcount * buf_len would otherwise
+		 * be evaluated as a 32-bit unsigned int and wrap once about
+		 * 4 GiB have been hashed during the test.  unsigned long is
+		 * only wider than that on 64-bit kernels; 32-bit kernels keep
+		 * the old range.
+		 */
+		tmp = (((unsigned long)bcount * buf_len /
+			jiffies_to_msecs(end - start)) * 1000) / (1024 * 1024);
+		cfs_crypto_hash_speeds[alg_id] = (int)tmp;
+	}
+	CDEBUG(D_INFO, "Crypto hash algorithm %s speed = %d MB/s\n",
+	       cfs_crypto_hash_name(alg_id), cfs_crypto_hash_speeds[alg_id]);
+}
+
+/**
+ * Return the measured speed in MB/s of \a hash_alg, or -1 when the id is
+ * out of range (the stored value is also -1 if the speed test failed).
+ */
+int cfs_crypto_hash_speed(unsigned char hash_alg)
+{
+	if (hash_alg >= CFS_HASH_ALG_MAX)
+		return -1;
+
+	return cfs_crypto_hash_speeds[hash_alg];
+}
+EXPORT_SYMBOL(cfs_crypto_hash_speed);
+
+/**
+ * Do performance test for all hash algorithms.
+ *
+ * Fills a 128 KiB scratch buffer with a byte pattern and runs
+ * cfs_crypto_performance_test() on it for every algorithm id below
+ * CFS_HASH_ALG_MAX.
+ *
+ * \retval 0		tests were run
+ * \retval -ENOMEM	the scratch buffer could not be allocated
+ */
+static int cfs_crypto_test_hashes(void)
+{
+	unsigned char	   i;
+	unsigned char	   *data;
+	unsigned int	    j;
+	/* Data block size for testing hash. Maximum
+	 * kmalloc size for 2.6.18 kernel is 128K */
+	unsigned int	    data_len = 1 * 128 * 1024;
+
+	/*
+	 * A gfp mask of 0 forbids sleeping and reclaim, which makes a
+	 * 128 KiB contiguous allocation needlessly likely to fail.
+	 * NOTE(review): GFP_KERNEL assumes this runs in a sleepable
+	 * context such as module initialisation - confirm.
+	 */
+	data = kmalloc(data_len, GFP_KERNEL);
+	if (data == NULL) {
+		CERROR("Failed to allocate mem\n");
+		return -ENOMEM;
+	}
+
+	for (j = 0; j < data_len; j++)
+		data[j] = j & 0xff;
+
+	for (i = 0; i < CFS_HASH_ALG_MAX; i++)
+		cfs_crypto_performance_test(i, data, data_len);
+
+	kfree(data);
+	return 0;
+}
+
+/*
+ * Return codes of the individual algorithm registrations performed by
+ * cfs_crypto_register(); cfs_crypto_unregister() only unregisters an
+ * algorithm whose value here is 0.
+ */
+static int crc32, adler32;
+
+#ifdef CONFIG_X86
+static int crc32pclmul;
+#endif
+
+/*
+ * Register the libcfs hash algorithms (crc32, adler32 and, on x86, the
+ * PCLMUL crc32) and then run the speed test over all hash algorithms.
+ *
+ * Each registration result is only recorded for cfs_crypto_unregister();
+ * neither a failed registration nor a failed speed test is reported to
+ * the caller, so this always returns 0.
+ */
+int cfs_crypto_register(void)
+{
+	crc32 = cfs_crypto_crc32_register();
+	adler32 = cfs_crypto_adler32_register();
+
+#ifdef CONFIG_X86
+	crc32pclmul = cfs_crypto_crc32_pclmul_register();
+#endif
+
+	/* check all algorithms and do performance test */
+	cfs_crypto_test_hashes();
+	return 0;
+}
+/* Unregister exactly those algorithms whose registration returned 0. */
+void cfs_crypto_unregister(void)
+{
+	if (!crc32)
+		cfs_crypto_crc32_unregister();
+	if (!adler32)
+		cfs_crypto_adler32_unregister();
+#ifdef CONFIG_X86
+	if (!crc32pclmul)
+		cfs_crypto_crc32_pclmul_unregister();
+#endif
+}
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c
new file mode 100644
index 000000000000..f236510a2f3f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c
@@ -0,0 +1,339 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/linux/linux-curproc.c
+ *
+ * Lustre curproc API implementation for Linux kernel
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#include <linux/sched.h>
+#include <linux/fs_struct.h>
+
+#include <linux/compat.h>
+#include <linux/thread_info.h>
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+/*
+ * Implementation of cfs_curproc API (see portals/include/libcfs/curproc.h)
+ * for Linux kernel.
+ */
+
+/* Return the ngroups count from the current task's group_info. */
+int cfs_curproc_groups_nr(void)
+{
+	int ngroups;
+
+	task_lock(current);
+	ngroups = current_cred()->group_info->ngroups;
+	task_unlock(current);
+
+	return ngroups;
+}
+
+/*
+ * Copy up to \a size supplementary group ids of the current task into
+ * \a array.  At most group_info->ngroups entries are copied; a size of
+ * zero or less copies nothing.
+ *
+ * group_info keeps its ids in blocks of NGROUPS_PER_BLOCK entries, so the
+ * copy is done block by block instead of reading everything from
+ * blocks[0], which would run past the end of the first block for large
+ * group lists.
+ */
+void   cfs_curproc_groups_dump(gid_t *array, int size)
+{
+	const struct group_info *gi;
+	int done = 0;
+
+	task_lock(current);
+	gi = current_cred()->group_info;
+	size = min_t(int, size, gi->ngroups);
+	while (done < size) {
+		int chunk = min_t(int, size - done, NGROUPS_PER_BLOCK);
+
+		memcpy(array + done, gi->blocks[done / NGROUPS_PER_BLOCK],
+		       chunk * sizeof(__u32));
+		done += chunk;
+	}
+	task_unlock(current);
+}
+
+
+/* Thin wrapper: returns in_group_p(gid) for the current task. */
+int    current_is_in_group(gid_t gid)
+{
+	return in_group_p(gid);
+}
+
+/* Currently all the CFS_CAP_* defines match CAP_* ones, so converting
+ * between the two representations is the identity in both directions. */
+#define cfs_cap_pack(cap) (cap)
+#define cfs_cap_unpack(cap) (cap)
+
+/*
+ * Raise capability \a cap in the effective set of the current task.
+ * Silently does nothing if prepare_creds() fails.
+ */
+void cfs_cap_raise(cfs_cap_t cap)
+{
+	struct cred *new_cred = prepare_creds();
+
+	if (new_cred == NULL)
+		return;
+
+	cap_raise(new_cred->cap_effective, cfs_cap_unpack(cap));
+	commit_creds(new_cred);
+}
+
+/*
+ * Drop capability \a cap from the effective set of the current task.
+ * Silently does nothing if prepare_creds() fails.
+ */
+void cfs_cap_lower(cfs_cap_t cap)
+{
+	struct cred *new_cred = prepare_creds();
+
+	if (new_cred == NULL)
+		return;
+
+	cap_lower(new_cred->cap_effective, cfs_cap_unpack(cap));
+	commit_creds(new_cred);
+}
+
+/* Return cap_raised() for \a cap against current_cap(). */
+int cfs_cap_raised(cfs_cap_t cap)
+{
+	return cap_raised(current_cap(), cfs_cap_unpack(cap));
+}
+
+/*
+ * Convert a kernel capability set \a kcap into the cfs_cap_t written to
+ * \a *cap.  Which form is compiled depends on the capability ABI version
+ * macro; for the array/struct layouts only the first word (index 0) is
+ * copied, so anything stored beyond it is lost.  The build fails if none
+ * of the known versions matches.
+ */
+void cfs_kernel_cap_pack(kernel_cap_t kcap, cfs_cap_t *cap)
+{
+#if defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x19980330
+	*cap = cfs_cap_pack(kcap);
+#elif defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x20071026
+	*cap = cfs_cap_pack(kcap[0]);
+#elif defined(_KERNEL_CAPABILITY_VERSION) && _KERNEL_CAPABILITY_VERSION == 0x20080522
+	/* XXX lost high byte */
+	*cap = cfs_cap_pack(kcap.cap[0]);
+#else
+	#error "need correct _KERNEL_CAPABILITY_VERSION "
+#endif
+}
+
+/*
+ * Inverse of cfs_kernel_cap_pack(): store \a cap into the kernel
+ * capability set \a *kcap.  For the array/struct layouts only the first
+ * word (index 0) is written; the remaining words of \a *kcap are left
+ * as they were.  The build fails if no known capability ABI version
+ * macro matches.
+ */
+void cfs_kernel_cap_unpack(kernel_cap_t *kcap, cfs_cap_t cap)
+{
+#if defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x19980330
+	*kcap = cfs_cap_unpack(cap);
+#elif defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x20071026
+	(*kcap)[0] = cfs_cap_unpack(cap);
+#elif defined(_KERNEL_CAPABILITY_VERSION) && _KERNEL_CAPABILITY_VERSION == 0x20080522
+	kcap->cap[0] = cfs_cap_unpack(cap);
+#else
+	#error "need correct _KERNEL_CAPABILITY_VERSION "
+#endif
+}
+
+/* Return current_cap() converted to cfs_cap_t by cfs_kernel_cap_pack(). */
+cfs_cap_t cfs_curproc_cap_pack(void)
+{
+	cfs_cap_t cap;
+	cfs_kernel_cap_pack(current_cap(), &cap);
+	return cap;
+}
+
+/*
+ * Install \a cap into the effective capability set of the current task
+ * via cfs_kernel_cap_unpack().  Silently does nothing if prepare_creds()
+ * fails.
+ */
+void cfs_curproc_cap_unpack(cfs_cap_t cap)
+{
+	struct cred *new_cred = prepare_creds();
+
+	if (new_cred == NULL)
+		return;
+
+	cfs_kernel_cap_unpack(&new_cred->cap_effective, cap);
+	commit_creds(new_cred);
+}
+
+/* Thin wrapper: returns capable() for \a cap. */
+int cfs_capable(cfs_cap_t cap)
+{
+	return capable(cfs_cap_unpack(cap));
+}
+
+/* Check if task is running in 32-bit API mode, for the purpose of
+ * userspace binary interfaces.  On 32-bit Linux this is (unfortunately)
+ * always true, even if the application is using LARGEFILE64 and 64-bit
+ * APIs, because Linux provides no way for the filesystem to know if it
+ * is called via 32-bit or 64-bit APIs.  Other clients may vary.  On
+ * 64-bit systems, this will only be true if the binary is calling a
+ * 32-bit system call.
+ *
+ * The implementation simply returns is_compat_task(). */
+int current_is_32bit(void)
+{
+	return is_compat_task();
+}
+
+/*
+ * Copy \a len bytes between the address space of \a tsk, starting at
+ * user address \a addr, and the kernel buffer \a buf.  A non-zero
+ * \a write copies from \a buf into the task; zero copies from the task
+ * into \a buf.
+ *
+ * The transfer is done one page at a time with mm->mmap_sem held for
+ * read.  Returns the number of bytes actually transferred, which is less
+ * than \a len if a page could not be obtained, and 0 if the task has
+ * no mm.
+ */
+static int cfs_access_process_vm(struct task_struct *tsk, unsigned long addr,
+				 void *buf, int len, int write)
+{
+	/* Copied from the kernel for kernels which do not
+	 * have access_process_vm() exported */
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	struct page *page;
+	void *old_buf = buf;
+
+	mm = get_task_mm(tsk);
+	if (!mm)
+		return 0;
+
+	down_read(&mm->mmap_sem);
+	/* ignore errors, just check how much was successfully transferred */
+	while (len) {
+		int bytes, rc, offset;
+		void *maddr;
+
+		/* Get exactly one page covering addr. */
+		rc = get_user_pages(tsk, mm, addr, 1,
+				     write, 1, &page, &vma);
+		if (rc <= 0)
+			break;
+
+		/* Never copy across the end of this page. */
+		bytes = len;
+		offset = addr & (PAGE_SIZE-1);
+		if (bytes > PAGE_SIZE-offset)
+			bytes = PAGE_SIZE-offset;
+
+		maddr = kmap(page);
+		if (write) {
+			copy_to_user_page(vma, page, addr,
+					  maddr + offset, buf, bytes);
+			set_page_dirty_lock(page);
+		} else {
+			copy_from_user_page(vma, page, addr,
+					    buf, maddr + offset, bytes);
+		}
+		kunmap(page);
+		page_cache_release(page);
+		len -= bytes;
+		buf += bytes;
+		addr += bytes;
+	}
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+
+	/* Progress made so far is the distance buf has advanced. */
+	return buf - old_buf;
+}
+
+/**
+ * Read the environment variable of current process specified by \a key.
+ *
+ * The environment area [mm->env_start, mm->env_end) of the current task
+ * is read one PAGE_CACHE_SIZE chunk at a time and scanned for a
+ * "key=value" entry.
+ *
+ * \param key		variable name, without the '='
+ * \param value		buffer receiving the value; it is NOT NUL-terminated
+ *			by this function
+ * \param val_len	in: size of \a value; out: length of the value copied
+ *
+ * \retval 0		variable found and copied
+ * \retval -ENOMEM	the scratch buffer could not be allocated
+ * \retval -EINVAL	the task has no mm, or an entry is longer than the
+ *			scratch buffer
+ * \retval -EDEADLK	mmap_sem could not be taken without blocking
+ * \retval -EOVERFLOW	the value does not fit in \a value
+ * \retval -ENOENT	the variable was not found
+ */
+int cfs_get_environ(const char *key, char *value, int *val_len)
+{
+	struct mm_struct *mm;
+	char *buffer;
+	int buf_len = PAGE_CACHE_SIZE;
+	int key_len = strlen(key);
+	unsigned long addr;
+	int rc;
+	ENTRY;
+
+	buffer = kmalloc(buf_len, GFP_USER);
+	if (!buffer)
+		RETURN(-ENOMEM);
+
+	mm = get_task_mm(current);
+	if (!mm) {
+		kfree(buffer);
+		RETURN(-EINVAL);
+	}
+
+	/* Avoid deadlocks on mmap_sem if called from sys_mmap_pgoff(),
+	 * which is already holding mmap_sem for writes.  If some other
+	 * thread gets the write lock in the meantime, this thread will
+	 * block, but at least it won't deadlock on itself.  LU-1735
+	 *
+	 * Leave through "out" so that the scratch buffer and the mm
+	 * reference taken above are released on this path too. */
+	if (down_read_trylock(&mm->mmap_sem) == 0)
+		GOTO(out, rc = -EDEADLK);
+	up_read(&mm->mmap_sem);
+
+	addr = mm->env_start;
+	while (addr < mm->env_end) {
+		int this_len, retval, scan_len;
+		char *env_start, *env_end;
+
+		memset(buffer, 0, buf_len);
+
+		this_len = min_t(int, mm->env_end - addr, buf_len);
+		retval = cfs_access_process_vm(current, addr, buffer,
+					       this_len, 0);
+		if (retval != this_len)
+			break;
+
+		addr += retval;
+
+		/* Parse the buffer to find out the specified key/value pair.
+		 * The "key=value" entries are separated by '\0'. */
+		env_start = buffer;
+		scan_len = this_len;
+		while (scan_len) {
+			char *entry;
+			int entry_len;
+
+			env_end = memscan(env_start, '\0', scan_len);
+			LASSERT(env_end >= env_start &&
+				env_end <= env_start + scan_len);
+
+			/* The last entry of this buffer crosses the buffer
+			 * boundary, reread it in next cycle. */
+			if (unlikely(env_end - env_start == scan_len)) {
+				/* This entry is too large to fit in buffer */
+				if (unlikely(scan_len == this_len)) {
+					CERROR("Too long env variable.\n");
+					GOTO(out, rc = -EINVAL);
+				}
+				addr -= scan_len;
+				break;
+			}
+
+			entry = env_start;
+			entry_len = env_end - env_start;
+
+			/* Key length + length of '='.  The '=' must follow
+			 * the name directly, otherwise a key that is only a
+			 * prefix of another variable's name would match. */
+			if (entry_len > key_len + 1 &&
+			    entry[key_len] == '=' &&
+			    !memcmp(entry, key, key_len)) {
+				entry += key_len + 1;
+				entry_len -= key_len + 1;
+				/* The 'value' buffer passed in is too small.*/
+				if (entry_len >= *val_len)
+					GOTO(out, rc = -EOVERFLOW);
+
+				memcpy(value, entry, entry_len);
+				*val_len = entry_len;
+				GOTO(out, rc = 0);
+			}
+
+			scan_len -= (env_end - env_start + 1);
+			env_start = env_end + 1;
+		}
+	}
+	GOTO(out, rc = -ENOENT);
+
+out:
+	mmput(mm);
+	kfree(buffer);
+	return rc;
+}
+EXPORT_SYMBOL(cfs_get_environ);
+
+EXPORT_SYMBOL(cfs_curproc_groups_nr);
+EXPORT_SYMBOL(cfs_curproc_groups_dump);
+EXPORT_SYMBOL(current_is_in_group);
+EXPORT_SYMBOL(cfs_cap_raise);
+EXPORT_SYMBOL(cfs_cap_lower);
+EXPORT_SYMBOL(cfs_cap_raised);
+EXPORT_SYMBOL(cfs_curproc_cap_pack);
+EXPORT_SYMBOL(cfs_curproc_cap_unpack);
+EXPORT_SYMBOL(cfs_capable);
+EXPORT_SYMBOL(current_is_32bit);
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 80
+ * scroll-step: 1
+ * End:
+ */
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c
new file mode 100644
index 000000000000..e2c195b8dd53
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c
@@ -0,0 +1,264 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/linux/linux-debug.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/notifier.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/interrupt.h>
+#include <asm/uaccess.h>
+#include <linux/completion.h>
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <linux/miscdevice.h>
+#include <linux/version.h>
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/libcfs/linux/portals_compat25.h>
+
+#include "tracefile.h"
+
+#include <linux/kallsyms.h>
+
+/*
+ * Paths of the user-space helpers run by libcfs_run_upcall() and
+ * libcfs_run_debug_log_upcall() respectively.
+ */
+char lnet_upcall[1024] = "/usr/lib/lustre/lnet_upcall";
+char lnet_debug_log_upcall[1024] = "/usr/lib/lustre/lnet_debug_log_upcall";
+
+/**
+ * Upcall function once a Lustre log has been dumped.
+ *
+ * Runs the helper named in lnet_debug_log_upcall with the dumped file as
+ * its only argument.  A missing helper (-ENOENT) is not treated as an
+ * error.
+ *
+ * \param file  path of the dumped log
+ */
+void libcfs_run_debug_log_upcall(char *file)
+{
+	char *envp[] = {
+		"HOME=/",
+		"PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+		NULL
+	};
+	char *argv[3];
+	int   rc;
+	ENTRY;
+
+	LASSERTF(file != NULL, "called on a null filename\n");
+
+	/* The helper only needs the path of the dumped file. */
+	argv[0] = lnet_debug_log_upcall;
+	argv[1] = file;
+	argv[2] = NULL;
+
+	rc = USERMODEHELPER(argv[0], argv, envp);
+	if (rc >= 0 || rc == -ENOENT) {
+		CDEBUG(D_HA, "Invoked LNET debug log upcall %s %s\n",
+		       argv[0], argv[1]);
+	} else {
+		CERROR("Error %d invoking LNET debug log upcall %s %s; "
+		       "check /proc/sys/lnet/debug_log_upcall\n",
+		       rc, argv[0], argv[1]);
+	}
+
+	EXIT;
+}
+
+/**
+ * Run the helper named in lnet_upcall.
+ *
+ * \param argv  NULL-terminated argument vector; argv[0] is overwritten
+ *	      with the helper path and at least argv[1] must be set
+ *
+ * A missing helper (-ENOENT) is not treated as an error.  Only the first
+ * few arguments are included in the log message.
+ */
+void libcfs_run_upcall(char **argv)
+{
+	int   rc;
+	int   argc;
+	char *envp[] = {
+		"HOME=/",
+		"PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+		NULL};
+	ENTRY;
+
+	argv[0] = lnet_upcall;
+	argc = 1;
+	while (argv[argc] != NULL)
+		argc++;
+
+	LASSERT(argc >= 2);
+
+	rc = USERMODEHELPER(argv[0], argv, envp);
+	if (rc < 0 && rc != -ENOENT) {
+		CERROR("Error %d invoking LNET upcall %s %s%s%s%s%s%s%s%s; "
+		       "check /proc/sys/lnet/upcall\n",
+		       rc, argv[0], argv[1],
+		       argc < 3 ? "" : ",", argc < 3 ? "" : argv[2],
+		       argc < 4 ? "" : ",", argc < 4 ? "" : argv[3],
+		       argc < 5 ? "" : ",", argc < 5 ? "" : argv[4],
+		       argc < 6 ? "" : ",...");
+	} else {
+		CDEBUG(D_HA, "Invoked LNET upcall %s %s%s%s%s%s%s%s%s\n",
+		       argv[0], argv[1],
+		       argc < 3 ? "" : ",", argc < 3 ? "" : argv[2],
+		       argc < 4 ? "" : ",", argc < 4 ? "" : argv[3],
+		       argc < 5 ? "" : ",", argc < 5 ? "" : argv[4],
+		       argc < 6 ? "" : ",...");
+	}
+
+	/* Pair the ENTRY above, as libcfs_run_debug_log_upcall() does. */
+	EXIT;
+}
+
+/**
+ * Run the LNET upcall to report an LBUG.
+ *
+ * The helper is invoked with the arguments "LBUG", source file, function
+ * name and line number taken from \a msgdata.
+ */
+void libcfs_run_lbug_upcall(struct libcfs_debug_msg_data *msgdata)
+{
+	char *argv[6];
+	char buf[32];
+
+	ENTRY;
+	snprintf(buf, sizeof(buf), "%d", msgdata->msg_line);
+
+	/* argv[0] is filled in by libcfs_run_upcall(). */
+	argv[1] = "LBUG";
+	argv[2] = (char *)msgdata->msg_file;
+	argv[3] = (char *)msgdata->msg_fn;
+	argv[4] = buf;
+	argv[5] = NULL;
+
+	libcfs_run_upcall(argv);
+
+	/* Pair the ENTRY above, as libcfs_run_debug_log_upcall() does. */
+	EXIT;
+}
+
+/*
+ * Handle a Lustre BUG raised at the location described by \a msgdata.
+ * This function does not return: it either panics or parks the calling
+ * thread forever.
+ */
+/* coverity[+kill] */
+void lbug_with_loc(struct libcfs_debug_msg_data *msgdata)
+{
+	libcfs_catastrophe = 1;
+	libcfs_debug_msg(msgdata, "LBUG\n");
+
+	/* The thread cannot be parked from interrupt context: panic. */
+	if (in_interrupt()) {
+		panic("LBUG in interrupt.\n");
+		/* not reached */
+	}
+
+	libcfs_debug_dumpstack(NULL);
+	/* The debug log is only dumped when no panic is about to follow. */
+	if (!libcfs_panic_on_lbug)
+		libcfs_debug_dumplog();
+	libcfs_run_lbug_upcall(msgdata);
+	if (libcfs_panic_on_lbug)
+		panic("LBUG");
+	/* Otherwise keep this thread off the CPU for good. */
+	set_task_state(current, TASK_UNINTERRUPTIBLE);
+	while (1)
+		schedule();
+}
+
+
+#include <linux/nmi.h>
+#include <asm/stacktrace.h>
+
+
+/* stacktrace_ops ->stack callback: print the stack name, always return 0. */
+static int print_trace_stack(void *data, char *name)
+{
+	printk(" <%s> ", name);
+	return 0;
+}
+
+# define RELIABLE reliable
+# define DUMP_TRACE_CONST const
+/*
+ * stacktrace_ops ->address callback: print one return address together
+ * with its symbol.  Entries not flagged as reliable are prefixed with "? ".
+ */
+static void print_trace_address(void *data, unsigned long addr, int reliable)
+{
+	const char *mark = RELIABLE ? "" : "? ";
+	char fmt[32];
+
+	touch_nmi_watchdog();
+	/* Build the format first; __print_symbol() fills in the "%s". */
+	sprintf(fmt, " [<%016lx>] %s%%s\n", addr, mark);
+	__print_symbol(fmt, addr);
+}
+
+/* Callbacks handed to dump_trace() by libcfs_debug_dumpstack(). */
+static DUMP_TRACE_CONST struct stacktrace_ops print_trace_ops = {
+	.stack = print_trace_stack,
+	.address = print_trace_address,
+	.walk_stack = print_context_stack,
+};
+
+/**
+ * Print the pid, command name and call trace of \a tsk to the kernel log.
+ *
+ * \param tsk  task to dump; NULL means the current task
+ */
+void libcfs_debug_dumpstack(struct task_struct *tsk)
+{
+	struct task_struct *task = (tsk != NULL) ? tsk : current;
+
+	printk("Pid: %d, comm: %.20s\n", task->pid, task->comm);
+	printk("\nCall Trace:\n");
+	dump_trace(task, NULL, NULL, 0, &print_trace_ops, NULL);
+	printk("\n");
+}
+
+/* Return the current task, logging its address with CWARN on every call. */
+task_t *libcfs_current(void)
+{
+	CWARN("current task struct is %p\n", current);
+	return current;
+}
+
+/*
+ * Panic notifier callback: set libcfs_panic_in_progress the first time it
+ * runs.  Always returns 0.
+ */
+static int panic_notifier(struct notifier_block *self, unsigned long unused1,
+			 void *unused2)
+{
+	if (!libcfs_panic_in_progress) {
+		libcfs_panic_in_progress = 1;
+		mb();
+	}
+
+	return 0;
+}
+
+/*
+ * Notifier block hooked into panic_notifier_list by
+ * libcfs_register_panic_notifier().  Written with C99 designated
+ * initializers rather than the obsolete GNU "field : value" form.
+ */
+static struct notifier_block libcfs_panic_notifier = {
+	.notifier_call	= panic_notifier,
+	.next		= NULL,
+	.priority	= 10000,
+};
+
+/* Add libcfs_panic_notifier to the kernel's panic notifier chain. */
+void libcfs_register_panic_notifier(void)
+{
+	atomic_notifier_chain_register(&panic_notifier_list, &libcfs_panic_notifier);
+}
+
+/* Remove libcfs_panic_notifier from the kernel's panic notifier chain. */
+void libcfs_unregister_panic_notifier(void)
+{
+	atomic_notifier_chain_unregister(&panic_notifier_list, &libcfs_panic_notifier);
+}
+
+EXPORT_SYMBOL(libcfs_debug_dumpstack);
+EXPORT_SYMBOL(libcfs_current);
+
+
+EXPORT_SYMBOL(libcfs_run_upcall);
+EXPORT_SYMBOL(libcfs_run_lbug_upcall);
+EXPORT_SYMBOL(lbug_with_loc);
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-fs.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-fs.c
new file mode 100644
index 000000000000..cb969694a38f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-fs.c
@@ -0,0 +1,113 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/fs.h>
+#include <linux/kdev_t.h>
+#include <linux/ctype.h>
+#include <asm/uaccess.h>
+
+#include <linux/libcfs/libcfs.h>
+
+/*
+ * Write \a count bytes from \a buf to \a filp at \a *offset, retrying
+ * after short writes until everything has been written.
+ *
+ * Despite the function's name, the address limit is switched to
+ * KERNEL_DS around vfs_write(), so \a buf is accepted as a kernel-space
+ * address.
+ *
+ * NOTE: this returns 0 on success, not the number of bytes written.
+ * On failure the negative errno from vfs_write() is returned, or -EIO
+ * if vfs_write() makes no progress at all.
+ */
+ssize_t
+filp_user_write(struct file *filp, const void *buf, size_t count,
+		loff_t *offset)
+{
+	mm_segment_t fs;
+	ssize_t size = 0;
+
+	fs = get_fs();
+	set_fs(KERNEL_DS);
+	while ((ssize_t)count > 0) {
+		size = vfs_write(filp, (const void __user *)buf, count, offset);
+		if (size < 0)
+			break;
+		if (size == 0) {
+			/* No progress: fail instead of looping forever. */
+			size = -EIO;
+			break;
+		}
+		count -= size;
+		buf += size;
+		size = 0;
+	}
+	set_fs(fs);
+
+	return size;
+}
+EXPORT_SYMBOL(filp_user_write);
+
+/*
+ * cfs_oflags2univ() converts native open(2) flags into the CFS_O_* flag
+ * set.  If any of the flag pairs compared below differ numerically, the
+ * flags are translated bit by bit; otherwise the value is passed through
+ * unchanged.
+ */
+#if !(CFS_O_CREAT == O_CREAT && CFS_O_EXCL == O_EXCL &&	\
+     CFS_O_NOACCESS == O_NOACCESS &&\
+     CFS_O_TRUNC == O_TRUNC && CFS_O_APPEND == O_APPEND &&\
+     CFS_O_NONBLOCK == O_NONBLOCK && CFS_O_NDELAY == O_NDELAY &&\
+     CFS_O_SYNC == O_SYNC && CFS_O_ASYNC == FASYNC &&\
+     CFS_O_DIRECT == O_DIRECT && CFS_O_LARGEFILE == O_LARGEFILE &&\
+     CFS_O_DIRECTORY == O_DIRECTORY && CFS_O_NOFOLLOW == O_NOFOLLOW)
+
+int cfs_oflags2univ(int flags)
+{
+	int f;
+
+	/* The bits selected by O_NOACCESS are copied over as they are. */
+	f = flags & O_NOACCESS;
+	f |= (flags & O_CREAT) ? CFS_O_CREAT: 0;
+	f |= (flags & O_EXCL) ? CFS_O_EXCL: 0;
+	f |= (flags & O_NOCTTY) ? CFS_O_NOCTTY: 0;
+	f |= (flags & O_TRUNC) ? CFS_O_TRUNC: 0;
+	f |= (flags & O_APPEND) ? CFS_O_APPEND: 0;
+	f |= (flags & O_NONBLOCK) ? CFS_O_NONBLOCK: 0;
+	f |= (flags & O_SYNC)? CFS_O_SYNC: 0;
+	f |= (flags & FASYNC)? CFS_O_ASYNC: 0;
+	f |= (flags & O_DIRECTORY)? CFS_O_DIRECTORY: 0;
+	f |= (flags & O_DIRECT)? CFS_O_DIRECT: 0;
+	f |= (flags & O_LARGEFILE)? CFS_O_LARGEFILE: 0;
+	f |= (flags & O_NOFOLLOW)? CFS_O_NOFOLLOW: 0;
+	f |= (flags & O_NOATIME)? CFS_O_NOATIME: 0;
+	return f;
+}
+#else
+
+/* All compared flag values are identical: nothing to translate. */
+int cfs_oflags2univ(int flags)
+{
+	return (flags);
+}
+#endif
+EXPORT_SYMBOL(cfs_oflags2univ);
+
+/*
+ * XXX Liang: we don't need cfs_univ2oflags() now.
+ *
+ * Identity mapping: the flags are handed back untouched.
+ */
+int cfs_univ2oflags(int flags)
+{
+	return flags;
+}
+EXPORT_SYMBOL(cfs_univ2oflags);
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-lock.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-lock.c
new file mode 100644
index 000000000000..6f7162e71fb3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-lock.c
@@ -0,0 +1,38 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include <arch-linux/cfs_lock.h>
+#include <linux/libcfs/libcfs.h>
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-mem.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-mem.c
new file mode 100644
index 000000000000..3be3ede1148d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-mem.c
@@ -0,0 +1,87 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/libcfs/libcfs.h>
+
+/*
+ * NB: we will rename some of above functions in another patch:
+ * - rename kmalloc to cfs_malloc
+ * - rename kmalloc/free_page to cfs_page_alloc/free
+ * - rename kmalloc/free_large to cfs_vmalloc/vfree
+ */
+
+/*
+ * Allocate \a nr_bytes with kmalloc_node() on the node chosen by
+ * cfs_cpt_spread_node() for partition \a cpt of \a cptab.
+ *
+ * The memory is explicitly cleared when \a flags contains __GFP_ZERO.
+ * Returns NULL if the allocation fails.
+ */
+void *
+cfs_cpt_malloc(struct cfs_cpt_table *cptab, int cpt,
+	       size_t nr_bytes, unsigned int flags)
+{
+	void *mem;
+
+	mem = kmalloc_node(nr_bytes, flags, cfs_cpt_spread_node(cptab, cpt));
+	if (mem == NULL)
+		return NULL;
+
+	if (flags & __GFP_ZERO)
+		memset(mem, 0, nr_bytes);
+
+	return mem;
+}
+EXPORT_SYMBOL(cfs_cpt_malloc);
+
+/*
+ * vmalloc_node() wrapper: allocate \a nr_bytes on the node returned by
+ * cfs_cpt_spread_node() for partition \a cpt of \a cptab.
+ */
+void *
+cfs_cpt_vmalloc(struct cfs_cpt_table *cptab, int cpt, size_t nr_bytes)
+{
+	void *mem;
+
+	mem = vmalloc_node(nr_bytes, cfs_cpt_spread_node(cptab, cpt));
+	return mem;
+}
+EXPORT_SYMBOL(cfs_cpt_vmalloc);
+
+/*
+ * Allocate one page (order 0) through alloc_pages_node() on the node
+ * returned by cfs_cpt_spread_node() for partition \a cpt of \a cptab.
+ */
+struct page *
+cfs_page_cpt_alloc(struct cfs_cpt_table *cptab, int cpt, unsigned int flags)
+{
+	struct page *pg;
+
+	pg = alloc_pages_node(cfs_cpt_spread_node(cptab, cpt), flags, 0);
+	return pg;
+}
+EXPORT_SYMBOL(cfs_page_cpt_alloc);
+
+/*
+ * Allocate one object from \a cachep with kmem_cache_alloc_node() on the
+ * node returned by cfs_cpt_spread_node() for partition \a cpt of \a cptab.
+ */
+void *
+cfs_mem_cache_cpt_alloc(struct kmem_cache *cachep, struct cfs_cpt_table *cptab,
+			int cpt, unsigned int flags)
+{
+	void *obj;
+
+	obj = kmem_cache_alloc_node(cachep, flags,
+				    cfs_cpt_spread_node(cptab, cpt));
+	return obj;
+}
+EXPORT_SYMBOL(cfs_mem_cache_cpt_alloc);
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c
new file mode 100644
index 000000000000..2c7d4a3d660f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c
@@ -0,0 +1,183 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+#define LNET_MINOR 240
+
+/**
+ * Copy a libcfs ioctl request from user space into the kernel buffer
+ * [buf, end) and validate it.
+ *
+ * The fixed-size header is fetched first so that the version and the
+ * total request length can be checked before the whole request is
+ * copied in. On success the inline buffer pointers of the request are
+ * pointed into its own bulk area.
+ *
+ * buf: start of the kernel buffer that receives the request
+ * end: first byte past the kernel buffer
+ * arg: user-space address of the request
+ *
+ * Returns 0 on success, -EFAULT if user memory cannot be read and
+ * -EINVAL for a version mismatch, a bad length or a malformed request.
+ */
+int libcfs_ioctl_getdata(char *buf, char *end, void *arg)
+{
+	struct libcfs_ioctl_hdr   *hdr;
+	struct libcfs_ioctl_data  *data;
+	size_t avail;
+	size_t len;
+	ENTRY;
+
+	hdr = (struct libcfs_ioctl_hdr *)buf;
+	data = (struct libcfs_ioctl_data *)buf;
+
+	/* the kernel buffer must at least be able to hold the header */
+	if (end <= buf || (size_t)(end - buf) < sizeof(*hdr)) {
+		CERROR("PORTALS: user buffer exceeds kernel buffer\n");
+		RETURN(-EINVAL);
+	}
+	avail = end - buf;
+
+	/*
+	 * copy_from_user() returns the number of bytes it could NOT copy,
+	 * not an errno, so a failure has to be mapped to -EFAULT here.
+	 */
+	if (copy_from_user(buf, (void *)arg, sizeof(*hdr)))
+		RETURN(-EFAULT);
+
+	if (hdr->ioc_version != LIBCFS_IOCTL_VERSION) {
+		CERROR("PORTALS: version mismatch kernel vs application\n");
+		RETURN(-EINVAL);
+	}
+
+	/* compare lengths rather than pointers: buf + len could wrap */
+	len = hdr->ioc_len;
+	if (len >= avail) {
+		CERROR("PORTALS: user buffer exceeds kernel buffer\n");
+		RETURN(-EINVAL);
+	}
+
+	if (len < sizeof(struct libcfs_ioctl_data)) {
+		CERROR("PORTALS: user buffer too small for ioctl\n");
+		RETURN(-EINVAL);
+	}
+
+	if (copy_from_user(buf, (void *)arg, len))
+		RETURN(-EFAULT);
+
+	/*
+	 * The header has just been fetched a second time; user space may
+	 * have modified it in between, so make sure the length that was
+	 * validated above is still the one stored in the buffer.
+	 */
+	if (hdr->ioc_len != len) {
+		CERROR("PORTALS: ioctl not correctly formatted\n");
+		RETURN(-EINVAL);
+	}
+
+	if (libcfs_ioctl_is_invalid(data)) {
+		CERROR("PORTALS: ioctl not correctly formatted\n");
+		RETURN(-EINVAL);
+	}
+
+	if (data->ioc_inllen1)
+		data->ioc_inlbuf1 = &data->ioc_bulk[0];
+
+	/* the second inline buffer follows the (rounded up) first one */
+	if (data->ioc_inllen2)
+		data->ioc_inlbuf2 = &data->ioc_bulk[0] +
+			cfs_size_round(data->ioc_inllen1);
+
+	RETURN(0);
+}
+
+/*
+ * Copy \a size bytes of ioctl reply from \a data back to the user-space
+ * address \a arg. Returns 0 on success, -EFAULT if the copy fails.
+ */
+int libcfs_ioctl_popdata(void *arg, void *data, int size)
+{
+	return copy_to_user((char *)arg, data, size) ? -EFAULT : 0;
+}
+
+extern struct cfs_psdev_ops	  libcfs_psdev_ops;
+
+/*
+ * open() handler of the libcfs device: forwards to the p_open hook of
+ * libcfs_psdev_ops, handing it the address of file->private_data.
+ *
+ * Returns -EINVAL without an inode, -EPERM when no hook is installed,
+ * otherwise whatever the hook returns.
+ */
+static int
+libcfs_psdev_open(struct inode * inode, struct file * file)
+{
+	struct libcfs_device_userstate **state;
+
+	if (inode == NULL)
+		return -EINVAL;
+
+	if (libcfs_psdev_ops.p_open == NULL)
+		return -EPERM;
+
+	state = (struct libcfs_device_userstate **)&file->private_data;
+	return libcfs_psdev_ops.p_open(0, (void *)state);
+}
+
+/*
+ * release() handler, called when closing /dev/device: forwards
+ * file->private_data to the p_close hook of libcfs_psdev_ops.
+ *
+ * Returns -EINVAL without an inode, -EPERM when no hook is installed,
+ * otherwise whatever the hook returns.
+ */
+static int
+libcfs_psdev_release(struct inode * inode, struct file * file)
+{
+	struct libcfs_device_userstate *state;
+
+	if (inode == NULL)
+		return -EINVAL;
+
+	if (libcfs_psdev_ops.p_close == NULL)
+		return -EPERM;
+
+	state = file->private_data;
+	return libcfs_psdev_ops.p_close(0, (void *)state);
+}
+
+/*
+ * unlocked_ioctl() handler of the libcfs device.
+ *
+ * Rejects callers whose fsuid is not 0 (-EACCES) and commands outside
+ * the libcfs ioctl type/number range (-EINVAL), handles the
+ * platform-dependent PANIC and MEMHOG capability checks, then passes
+ * the request on to the p_ioctl hook of libcfs_psdev_ops (-EPERM if no
+ * hook is installed).
+ */
+static long libcfs_ioctl(struct file *file,
+			 unsigned int cmd, unsigned long arg)
+{
+	struct cfs_psdev_file	 pfile;
+	int    rc;
+
+	if (current_fsuid() != 0)
+		return -EACCES;
+
+	if (_IOC_TYPE(cmd) != IOC_LIBCFS_TYPE ||
+	    _IOC_NR(cmd) < IOC_LIBCFS_MIN_NR ||
+	    _IOC_NR(cmd) > IOC_LIBCFS_MAX_NR) {
+		CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n",
+		       _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd));
+		return -EINVAL;
+	}
+
+	/* Handle platform-dependent IOC requests */
+	switch (cmd) {
+	case IOC_LIBCFS_PANIC:
+		if (!cfs_capable(CFS_CAP_SYS_BOOT))
+			return -EPERM;
+		panic("debugctl-invoked panic");
+		return 0;
+	case IOC_LIBCFS_MEMHOG:
+		if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+			return -EPERM;
+		/* fall through to the generic handler below */
+	}
+
+	if (libcfs_psdev_ops.p_ioctl == NULL)
+		return -EPERM;
+
+	pfile.off = 0;
+	pfile.private_data = file->private_data;
+	rc = libcfs_psdev_ops.p_ioctl(&pfile, cmd, (void *)arg);
+
+	return rc;
+}
+
+/*
+ * File operations of the libcfs device. Uses C99 designated
+ * initialisers instead of the obsolete GNU "field:" form.
+ */
+static struct file_operations libcfs_fops = {
+	.unlocked_ioctl	= libcfs_ioctl,
+	.open		= libcfs_psdev_open,
+	.release	= libcfs_psdev_release,
+};
+
+psdev_t libcfs_dev = {	/* NOTE(review): positional initialiser; presumably minor, name, fops of a misc device - confirm against psdev_t */
+	LNET_MINOR,	/* defined as 240 near the top of this file */
+	"lnet",
+	&libcfs_fops	/* open/release/ioctl handlers defined in this file */
+};
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c
new file mode 100644
index 000000000000..b652a79a4811
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c
@@ -0,0 +1,259 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs_struct.h>
+#include <linux/sched.h>
+
+#include <linux/libcfs/libcfs.h>
+
+#if defined(CONFIG_KGDB)
+#include <asm/kgdb.h>
+#endif
+
+/*
+ * Cast a pointer to the native wait-queue entry / head types. The
+ * argument is parenthesised so that an expression argument is cast as
+ * a whole rather than only its first operand.
+ */
+#define LINUX_WAITQ(w) ((wait_queue_t *) (w))
+#define LINUX_WAITQ_HEAD(w) ((wait_queue_head_t *) (w))
+
+/* Initialise \a link as a wait-queue entry belonging to the current task. */
+void
+init_waitqueue_entry_current(wait_queue_t *link)
+{
+	wait_queue_t *entry = LINUX_WAITQ(link);
+
+	init_waitqueue_entry(entry, current);
+}
+EXPORT_SYMBOL(init_waitqueue_entry_current);
+
+/**
+ * Add \a link to \a waitq with __add_wait_queue_exclusive(), holding the
+ * queue lock with interrupts saved.
+ *
+ * The wait queue of Linux (version < 2.6.34) is a FIFO list for threads
+ * waiting exclusively. That is not always desirable: every thread gets
+ * woken up over and over even when the user only needs a few of them to
+ * be active most of the time, which hurts performance because the cache
+ * is polluted by many different threads.
+ *
+ * A LIFO list avoids the problem, because by default the most recently
+ * active thread is the one that is woken up.
+ *
+ * NB: please don't mix non-exclusive and exclusive waits on the same
+ * waitq if add_wait_queue_exclusive_head is used.
+ */
+void
+add_wait_queue_exclusive_head(wait_queue_head_t *waitq, wait_queue_t *link)
+{
+	wait_queue_head_t *head = LINUX_WAITQ_HEAD(waitq);
+	wait_queue_t	  *entry = LINUX_WAITQ(link);
+	unsigned long	   irqflags;
+
+	spin_lock_irqsave(&head->lock, irqflags);
+	__add_wait_queue_exclusive(head, entry);
+	spin_unlock_irqrestore(&head->lock, irqflags);
+}
+EXPORT_SYMBOL(add_wait_queue_exclusive_head);
+
+/*
+ * Give up the CPU by calling schedule(). Neither \a link nor \a state is
+ * used here.
+ * NOTE(review): presumably the caller has already set the task state -
+ * confirm against the callers.
+ */
+void
+waitq_wait(wait_queue_t *link, cfs_task_state_t state)
+{
+	schedule();
+}
+EXPORT_SYMBOL(waitq_wait);
+
+/*
+ * Call schedule_timeout() with \a timeout and hand back its result.
+ * Neither \a link nor \a state is used here.
+ */
+int64_t
+waitq_timedwait(wait_queue_t *link, cfs_task_state_t state,
+		    int64_t timeout)
+{
+	int64_t remaining = schedule_timeout(timeout);
+
+	return remaining;
+}
+EXPORT_SYMBOL(waitq_timedwait);
+
+/*
+ * Put the current task into \a state, then call schedule_timeout() with
+ * \a timeout. The value returned by schedule_timeout() is discarded.
+ */
+void
+schedule_timeout_and_set_state(cfs_task_state_t state, int64_t timeout)
+{
+	set_current_state(state);
+	(void)schedule_timeout(timeout);
+}
+EXPORT_SYMBOL(schedule_timeout_and_set_state);
+
+/*
+ * Deschedule for a bit: mark the current task TASK_UNINTERRUPTIBLE and
+ * call schedule_timeout() with \a ticks.
+ */
+void
+cfs_pause(cfs_duration_t ticks)
+{
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	(void)schedule_timeout(ticks);
+}
+EXPORT_SYMBOL(cfs_pause);
+
+/* Thin wrapper: run init_timer() on \a t. */
+void cfs_init_timer(timer_list_t *t)
+{
+	init_timer(t);
+}
+EXPORT_SYMBOL(cfs_init_timer);
+
+/*
+ * Initialise \a t with init_timer() and record \a func as its function
+ * and \a arg (stored as an unsigned long) as its data.
+ */
+void cfs_timer_init(timer_list_t *t, cfs_timer_func_t *func, void *arg)
+{
+	init_timer(t);
+
+	t->data = (unsigned long)arg;
+	t->function = func;
+}
+EXPORT_SYMBOL(cfs_timer_init);
+
+/* Counterpart of cfs_timer_init(); there is nothing to undo here. */
+void cfs_timer_done(timer_list_t *t)
+{
+}
+EXPORT_SYMBOL(cfs_timer_done);
+
+/*
+ * Pass \a t and \a deadline to mod_timer(); the value mod_timer() returns
+ * is discarded.
+ */
+void cfs_timer_arm(timer_list_t *t, cfs_time_t deadline)
+{
+	(void)mod_timer(t, deadline);
+}
+EXPORT_SYMBOL(cfs_timer_arm);
+
+/* Pass \a t to del_timer(); the value del_timer() returns is discarded. */
+void cfs_timer_disarm(timer_list_t *t)
+{
+	(void)del_timer(t);
+}
+EXPORT_SYMBOL(cfs_timer_disarm);
+
+/* Report the result of timer_pending() for \a t. */
+int cfs_timer_is_armed(timer_list_t *t)
+{
+	int pending = timer_pending(t);
+
+	return pending;
+}
+EXPORT_SYMBOL(cfs_timer_is_armed);
+
+/* Return the expiry time currently stored in \a t (t->expires). */
+cfs_time_t cfs_timer_deadline(timer_list_t *t)
+{
+	cfs_time_t when = t->expires;
+
+	return when;
+}
+EXPORT_SYMBOL(cfs_timer_deadline);
+
+/*
+ * Debugger entry hook. It currently does nothing, whether or not
+ * CONFIG_KGDB is set.
+ */
+void cfs_enter_debugger(void)
+{
+	/* nothing */
+}
+
+
+/*
+ * Fill the blocked-signal set of the current task and recompute its
+ * pending state, all under SIGNAL_MASK_LOCK.
+ *
+ * Returns the blocked set as it was before the change, suitable for
+ * cfs_restore_sigs().
+ */
+sigset_t
+cfs_block_allsigs(void)
+{
+	sigset_t      saved;
+	unsigned long irqflags;
+
+	SIGNAL_MASK_LOCK(current, irqflags);
+	saved = current->blocked;
+	sigfillset(&current->blocked);
+	recalc_sigpending();
+	SIGNAL_MASK_UNLOCK(current, irqflags);
+
+	return saved;
+}
+
+/*
+ * Add the signals in the mask \a sigs to the blocked set of the current
+ * task (via sigaddsetmask()) and recompute its pending state, all under
+ * SIGNAL_MASK_LOCK.
+ *
+ * Returns the blocked set as it was before the change.
+ */
+sigset_t cfs_block_sigs(unsigned long sigs)
+{
+	sigset_t      saved;
+	unsigned long irqflags;
+
+	SIGNAL_MASK_LOCK(current, irqflags);
+	saved = current->blocked;
+	sigaddsetmask(&current->blocked, sigs);
+	recalc_sigpending();
+	SIGNAL_MASK_UNLOCK(current, irqflags);
+
+	return saved;
+}
+
+/*
+ * Block all signals except for the @sigs: the complement of the mask
+ * \a sigs is added to the blocked set of the current task with
+ * sigaddsetmask(), so signals that were already blocked stay blocked.
+ *
+ * Returns the blocked set as it was before the change.
+ */
+sigset_t cfs_block_sigsinv(unsigned long sigs)
+{
+	sigset_t      saved;
+	unsigned long irqflags;
+
+	SIGNAL_MASK_LOCK(current, irqflags);
+	saved = current->blocked;
+	sigaddsetmask(&current->blocked, ~sigs);
+	recalc_sigpending();
+	SIGNAL_MASK_UNLOCK(current, irqflags);
+
+	return saved;
+}
+
+/*
+ * Install \a old as the blocked-signal set of the current task and
+ * recompute its pending state, all under SIGNAL_MASK_LOCK.
+ */
+void
+cfs_restore_sigs(sigset_t old)
+{
+	unsigned long irqflags;
+
+	SIGNAL_MASK_LOCK(current, irqflags);
+	current->blocked = old;
+	recalc_sigpending();
+	SIGNAL_MASK_UNLOCK(current, irqflags);
+}
+
+/* Report the result of signal_pending() for the current task. */
+int
+cfs_signal_pending(void)
+{
+	int pending = signal_pending(current);
+
+	return pending;
+}
+
+/*
+ * Clear the TIF_SIGPENDING thread flag of the current task while holding
+ * SIGNAL_MASK_LOCK.
+ */
+void
+cfs_clear_sigpending(void)
+{
+	unsigned long irqflags;
+
+	SIGNAL_MASK_LOCK(current, irqflags);
+	clear_tsk_thread_flag(current, TIF_SIGPENDING);
+	SIGNAL_MASK_UNLOCK(current, irqflags);
+}
+
+/* Architecture-specific set-up hook; nothing to do here, always returns 0. */
+int
+libcfs_arch_init(void)
+{
+	return 0;
+}
+
+/* Architecture-specific tear-down hook; nothing to do here. */
+void
+libcfs_arch_cleanup(void)
+{
+}
+
+EXPORT_SYMBOL(libcfs_arch_init);
+EXPORT_SYMBOL(libcfs_arch_cleanup);
+EXPORT_SYMBOL(cfs_enter_debugger);
+EXPORT_SYMBOL(cfs_block_allsigs);
+EXPORT_SYMBOL(cfs_block_sigs);
+EXPORT_SYMBOL(cfs_block_sigsinv);
+EXPORT_SYMBOL(cfs_restore_sigs);
+EXPORT_SYMBOL(cfs_signal_pending);
+EXPORT_SYMBOL(cfs_clear_sigpending);
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-proc.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-proc.c
new file mode 100644
index 000000000000..522b28e99e41
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-proc.c
@@ -0,0 +1,580 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/linux/linux-proc.c
+ *
+ * Author: Zach Brown <zab@zabbo.net>
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <linux/uio.h>
+
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+
+#include <linux/proc_fs.h>
+#include <linux/sysctl.h>
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include <asm/div64.h>
+#include "tracefile.h"
+
+#ifdef CONFIG_SYSCTL
+static ctl_table_header_t *lnet_table_header = NULL;
+#endif
+extern char lnet_upcall[1024];
+/**
+ * The path of debug log dump upcall script.
+ */
+extern char lnet_debug_log_upcall[1024];
+
+#define CTL_LNET	(0x100)	/* id passed to INIT_CTL_NAME() for the top-level "lnet" entry */
+enum {
+	PSDEV_DEBUG = 1,	  /* "debug": debug mask (libcfs_debug) */
+	PSDEV_SUBSYSTEM_DEBUG,    /* "subsystem_debug": subsystem debug mask */
+	PSDEV_PRINTK,	     /* "printk": force all messages to console */
+	PSDEV_CONSOLE_RATELIMIT,  /* "console_ratelimit": ratelimit console messages */
+	PSDEV_CONSOLE_MAX_DELAY_CS, /* maximum delay over which we skip messages */
+	PSDEV_CONSOLE_MIN_DELAY_CS, /* initial delay over which we skip messages */
+	PSDEV_CONSOLE_BACKOFF,    /* "console_backoff": delay increase factor */
+	PSDEV_DEBUG_PATH,	 /* "debug_path": crashdump log location */
+	PSDEV_DEBUG_DUMP_PATH,    /* crashdump tracelog location; no lnet_table entry in this file */
+	PSDEV_CPT_TABLE,	  /* "cpu_partition_table": information about cpu partitions */
+	PSDEV_LNET_UPCALL,	/* "upcall": user mode upcall script */
+	PSDEV_LNET_MEMUSED,       /* "lnet_memused": bytes currently PORTAL_ALLOCated */
+	PSDEV_LNET_CATASTROPHE,   /* "catastrophe": if we have LBUGged or panic'd */
+	PSDEV_LNET_PANIC_ON_LBUG, /* "panic_on_lbug": flag to panic on LBUG */
+	PSDEV_LNET_DUMP_KERNEL,   /* "dump_kernel": snapshot kernel debug buffer to file */
+	PSDEV_LNET_DAEMON_FILE,   /* "daemon_file": spool kernel debug buffer to file */
+	PSDEV_LNET_DEBUG_MB,      /* "debug_mb": size of debug buffer */
+	PSDEV_LNET_DEBUG_LOG_UPCALL, /* "debug_log_upcall": debug log upcall script */
+	PSDEV_LNET_WATCHDOG_RATELIMIT,  /* "watchdog_ratelimit": ratelimit watchdog messages */
+	PSDEV_LNET_FORCE_LBUG,    /* "force_lbug": hook to force an LBUG */
+	PSDEV_LNET_FAIL_LOC,      /* "fail_loc": control test failures instrumentation */
+	PSDEV_LNET_FAIL_VAL,      /* "fail_val": userdata for fail loc */
+};
+
+/*
+ * Run \a handler for one proc read or write and update the caller's
+ * position and length afterwards.
+ *
+ * A negative handler result is returned unchanged. Otherwise, for a
+ * read the handler result becomes the new *lenp; in both directions
+ * *ppos is advanced by *lenp and 0 is returned.
+ */
+int
+proc_call_handler(void *data, int write,
+		  loff_t *ppos, void *buffer, size_t *lenp,
+		  int (*handler)(void *data, int write,
+				 loff_t pos, void *buffer, int len))
+{
+	int done;
+
+	done = handler(data, write, *ppos, buffer, *lenp);
+	if (done < 0)
+		return done;
+
+	/* a read reports how many bytes the handler produced */
+	if (!write)
+		*lenp = done;
+
+	*ppos += *lenp;
+	return 0;
+}
+EXPORT_SYMBOL(proc_call_handler);
+
+/*
+ * Read or write the bit mask that \a data points to in its string form.
+ *
+ * Read: the mask is rendered with libcfs_debug_mask2str() and the part
+ * starting at \a pos is copied out followed by "\n"; 0 is returned once
+ * \a pos is at or past the end of the string.
+ * Write: the string in \a buffer is parsed with libcfs_debug_str2mask()
+ * into the mask; for libcfs_printk the D_EMERG bit is forced on.
+ */
+static int __proc_dobitmasks(void *data, int write,
+			     loff_t pos, void *buffer, int nob)
+{
+	const int     buflen = 512;
+	unsigned int *mask = data;
+	int	      is_subsys = (mask == &libcfs_subsystem_debug);
+	int	      is_printk = (mask == &libcfs_printk);
+	char	     *str;
+	int	      rc;
+
+	rc = cfs_trace_allocate_string_buffer(&str, buflen);
+	if (rc < 0)
+		return rc;
+
+	if (write) {
+		rc = cfs_trace_copyin_string(str, buflen, buffer, nob);
+		if (rc >= 0) {
+			rc = libcfs_debug_str2mask(mask, str, is_subsys);
+			/* Always print LBUG/LASSERT to console, so keep this mask */
+			if (is_printk)
+				*mask |= D_EMERG;
+		}
+	} else {
+		libcfs_debug_mask2str(str, buflen, *mask, is_subsys);
+		rc = strlen(str);
+
+		if (pos < rc)
+			rc = cfs_trace_copyout_string(buffer, nob,
+						      str + pos, "\n");
+		else
+			rc = 0;
+	}
+
+	cfs_trace_free_string_buffer(str, buflen);
+	return rc;
+}
+
+DECLARE_PROC_HANDLER(proc_dobitmasks)
+
+static int min_watchdog_ratelimit = 0;	  /* lower bound (.extra1) of "watchdog_ratelimit": disable ratelimiting */
+static int max_watchdog_ratelimit = (24*60*60); /* upper bound (.extra2): limit to once per day */
+
+/*
+ * Write-only entry: a write passes the user string to
+ * cfs_trace_dump_debug_buffer_usrstr(); a read yields nothing (0).
+ */
+static int __proc_dump_kernel(void *data, int write,
+			      loff_t pos, void *buffer, int nob)
+{
+	return write ? cfs_trace_dump_debug_buffer_usrstr(buffer, nob) : 0;
+}
+
+DECLARE_PROC_HANDLER(proc_dump_kernel)
+
+/*
+ * Read: copy out cfs_tracefile from offset \a pos, followed by "\n"
+ * (0 once \a pos is at or past the end of the name).
+ * Write: pass the user string to cfs_trace_daemon_command_usrstr().
+ */
+static int __proc_daemon_file(void *data, int write,
+			      loff_t pos, void *buffer, int nob)
+{
+	int namelen;
+
+	if (write)
+		return cfs_trace_daemon_command_usrstr(buffer, nob);
+
+	namelen = strlen(cfs_tracefile);
+	if (pos >= namelen)
+		return 0;
+
+	return cfs_trace_copyout_string(buffer, nob,
+					cfs_tracefile + pos, "\n");
+}
+
+DECLARE_PROC_HANDLER(proc_daemon_file)
+
+/*
+ * Read: format cfs_trace_get_debug_mb() as a decimal number and copy it
+ * out from offset \a pos, followed by "\n" (0 once \a pos is at or past
+ * the end of the number).
+ * Write: pass the user string to cfs_trace_set_debug_mb_usrstr().
+ */
+static int __proc_debug_mb(void *data, int write,
+			   loff_t pos, void *buffer, int nob)
+{
+	char numbuf[32];
+	int  numlen;
+
+	if (write)
+		return cfs_trace_set_debug_mb_usrstr(buffer, nob);
+
+	numlen = snprintf(numbuf, sizeof(numbuf), "%d",
+			  cfs_trace_get_debug_mb());
+	if (pos >= numlen)
+		return 0;
+
+	return cfs_trace_copyout_string(buffer, nob, numbuf + pos, "\n");
+}
+
+DECLARE_PROC_HANDLER(proc_debug_mb)
+
+/*
+ * Handler for "console_max_delay_centisecs".
+ *
+ * proc_dointvec works on a local integer through a copy of \a table.
+ * A read presents libcfs_console_max_delay converted with
+ * cfs_duration_sec(); a write converts the value with
+ * cfs_time_seconds() / 100 and stores it, rejecting (-EINVAL) values
+ * that are not positive, convert to 0 or fall below
+ * libcfs_console_min_delay.
+ */
+int LL_PROC_PROTO(proc_console_max_delay_cs)
+{
+	ctl_table_t    proxy = *table;
+	cfs_duration_t ticks;
+	int	       centisecs;
+	int	       rc;
+
+	proxy.data = &centisecs;
+	proxy.proc_handler = &proc_dointvec;
+
+	if (!write) {
+		/* read */
+		centisecs = cfs_duration_sec(libcfs_console_max_delay * 100);
+		return ll_proc_dointvec(&proxy, write, filp, buffer, lenp, ppos);
+	}
+
+	/* write */
+	centisecs = 0;
+	rc = ll_proc_dointvec(&proxy, write, filp, buffer, lenp, ppos);
+	if (rc < 0)
+		return rc;
+	if (centisecs <= 0)
+		return -EINVAL;
+
+	ticks = cfs_time_seconds(centisecs) / 100;
+	if (ticks == 0 || ticks < libcfs_console_min_delay)
+		return -EINVAL;
+
+	libcfs_console_max_delay = ticks;
+	return rc;
+}
+
+/*
+ * Handler for "console_min_delay_centisecs".
+ *
+ * proc_dointvec works on a local integer through a copy of \a table.
+ * A read presents libcfs_console_min_delay converted with
+ * cfs_duration_sec(); a write converts the value with
+ * cfs_time_seconds() / 100 and stores it, rejecting (-EINVAL) values
+ * that are not positive, convert to 0 or exceed
+ * libcfs_console_max_delay.
+ */
+int LL_PROC_PROTO(proc_console_min_delay_cs)
+{
+	ctl_table_t    proxy = *table;
+	cfs_duration_t ticks;
+	int	       centisecs;
+	int	       rc;
+
+	proxy.data = &centisecs;
+	proxy.proc_handler = &proc_dointvec;
+
+	if (!write) {
+		/* read */
+		centisecs = cfs_duration_sec(libcfs_console_min_delay * 100);
+		return ll_proc_dointvec(&proxy, write, filp, buffer, lenp, ppos);
+	}
+
+	/* write */
+	centisecs = 0;
+	rc = ll_proc_dointvec(&proxy, write, filp, buffer, lenp, ppos);
+	if (rc < 0)
+		return rc;
+	if (centisecs <= 0)
+		return -EINVAL;
+
+	ticks = cfs_time_seconds(centisecs) / 100;
+	if (ticks == 0 || ticks > libcfs_console_max_delay)
+		return -EINVAL;
+
+	libcfs_console_min_delay = ticks;
+	return rc;
+}
+
+/*
+ * Handler for "console_backoff".
+ *
+ * proc_dointvec works on a local integer through a copy of \a table.
+ * A read presents libcfs_console_backoff; a write stores the new value
+ * unless it is not positive (-EINVAL).
+ */
+int LL_PROC_PROTO(proc_console_backoff)
+{
+	ctl_table_t proxy = *table;
+	int	    factor;
+	int	    rc;
+
+	proxy.data = &factor;
+	proxy.proc_handler = &proc_dointvec;
+
+	if (!write) {
+		/* read */
+		factor = libcfs_console_backoff;
+		return ll_proc_dointvec(&proxy, write, filp, buffer, lenp, ppos);
+	}
+
+	/* write */
+	factor = 0;
+	rc = ll_proc_dointvec(&proxy, write, filp, buffer, lenp, ppos);
+	if (rc < 0)
+		return rc;
+	if (factor <= 0)
+		return -EINVAL;
+
+	libcfs_console_backoff = factor;
+	return rc;
+}
+
+/* Handler for "force_lbug": any write invokes LBUG(); a read returns 0. */
+int LL_PROC_PROTO(libcfs_force_lbug)
+{
+	if (!write)
+		return 0;
+
+	LBUG();
+	return 0;
+}
+
+/*
+ * Handler for "fail_loc": let ll_proc_dolongvec() do the access and wake
+ * up cfs_race_waitq whenever cfs_fail_loc ends up with a different value.
+ */
+int LL_PROC_PROTO(proc_fail_loc)
+{
+	long before = cfs_fail_loc;
+	int  rc;
+
+	rc = ll_proc_dolongvec(table, write, filp, buffer, lenp, ppos);
+
+	if (cfs_fail_loc != before)
+		wake_up(&cfs_race_waitq);
+
+	return rc;
+}
+
+/*
+ * Read-only entry that prints the CPU partition table.
+ *
+ * The table is printed into a temporary buffer that starts at 4096
+ * bytes and is doubled for as long as cfs_cpt_table_print() reports
+ * -EFBIG. The text starting at \a pos is then copied out to \a buffer.
+ *
+ * Returns the result of cfs_trace_copyout_string(), 0 once \a pos is at
+ * or past the end of the text, -EPERM for a write, -ENOMEM if the
+ * temporary buffer cannot be allocated, or the error reported by
+ * cfs_cpt_table_print().
+ */
+static int __proc_cpt_table(void *data, int write,
+			    loff_t pos, void *buffer, int nob)
+{
+	char *buf = NULL;
+	int   len = 4096;
+	int   rc  = 0;
+
+	if (write)
+		return -EPERM;
+
+	LASSERT(cfs_cpt_table != NULL);
+
+	while (1) {
+		LIBCFS_ALLOC(buf, len);
+		if (buf == NULL)
+			return -ENOMEM;
+
+		rc = cfs_cpt_table_print(cfs_cpt_table, buf, len);
+		if (rc >= 0)
+			break;
+
+		LIBCFS_FREE(buf, len);
+		/*
+		 * Forget the freed buffer so that the "out" path cannot
+		 * free it a second time when the error is not -EFBIG.
+		 */
+		buf = NULL;
+		if (rc != -EFBIG)
+			goto out;
+
+		/* buffer was too small: retry with twice the size */
+		len <<= 1;
+	}
+
+	if (pos >= rc) {
+		rc = 0;
+		goto out;
+	}
+
+	rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL);
+ out:
+	if (buf != NULL)
+		LIBCFS_FREE(buf, len);
+	return rc;
+}
+DECLARE_PROC_HANDLER(proc_cpt_table)
+
+static ctl_table_t lnet_table[] = {
+	/*
+	 * Entries published below the "lnet" directory (this array is the
+	 * .child of top_table). Each entry names its proc file, the
+	 * variable it exposes (if any), its size, its access mode and the
+	 * handler that services reads and writes. The array is terminated
+	 * by an entry that holds nothing but INIT_CTL_NAME(0).
+	 *
+	 * Entries without .data (the console delay/backoff entries,
+	 * cpu_partition_table, dump_kernel, daemon_file, debug_mb and
+	 * force_lbug) are serviced entirely by handlers defined in this
+	 * file. PSDEV_DEBUG_DUMP_PATH has no entry here.
+	 *
+	 * NB No .strategy entries have been provided since sysctl(8) prefers
+	 * to go via /proc for portability.
+	 */
+	{
+		INIT_CTL_NAME(PSDEV_DEBUG)
+		.procname = "debug",
+		.data     = &libcfs_debug,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dobitmasks,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_SUBSYSTEM_DEBUG)
+		.procname = "subsystem_debug",
+		.data     = &libcfs_subsystem_debug,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dobitmasks,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_PRINTK)
+		.procname = "printk",
+		.data     = &libcfs_printk,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dobitmasks,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_CONSOLE_RATELIMIT)
+		.procname = "console_ratelimit",
+		.data     = &libcfs_console_ratelimit,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		INIT_CTL_NAME(PSDEV_CONSOLE_MAX_DELAY_CS)
+		.procname = "console_max_delay_centisecs",
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_console_max_delay_cs
+	},
+	{
+		INIT_CTL_NAME(PSDEV_CONSOLE_MIN_DELAY_CS)
+		.procname = "console_min_delay_centisecs",
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_console_min_delay_cs
+	},
+	{
+		INIT_CTL_NAME(PSDEV_CONSOLE_BACKOFF)
+		.procname = "console_backoff",
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_console_backoff
+	},
+
+	{
+		INIT_CTL_NAME(PSDEV_DEBUG_PATH)
+		.procname = "debug_path",
+		.data     = libcfs_debug_file_path_arr,
+		.maxlen   = sizeof(libcfs_debug_file_path_arr),
+		.mode     = 0644,
+		.proc_handler = &proc_dostring,
+	},
+
+	{
+		INIT_CTL_NAME(PSDEV_CPT_TABLE)
+		.procname = "cpu_partition_table",
+		.maxlen   = 128,
+		.mode     = 0444,
+		.proc_handler = &proc_cpt_table,
+	},
+
+	{
+		INIT_CTL_NAME(PSDEV_LNET_UPCALL)
+		.procname = "upcall",
+		.data     = lnet_upcall,
+		.maxlen   = sizeof(lnet_upcall),
+		.mode     = 0644,
+		.proc_handler = &proc_dostring,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_DEBUG_LOG_UPCALL)
+		.procname = "debug_log_upcall",
+		.data     = lnet_debug_log_upcall,
+		.maxlen   = sizeof(lnet_debug_log_upcall),
+		.mode     = 0644,
+		.proc_handler = &proc_dostring,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_MEMUSED)
+		.procname = "lnet_memused",
+		.data     = (int *)&libcfs_kmemory.counter,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		INIT_STRATEGY(&sysctl_intvec)
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_CATASTROPHE)
+		.procname = "catastrophe",
+		.data     = &libcfs_catastrophe,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		INIT_STRATEGY(&sysctl_intvec)
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_PANIC_ON_LBUG)
+		.procname = "panic_on_lbug",
+		.data     = &libcfs_panic_on_lbug,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		INIT_STRATEGY(&sysctl_intvec)
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_DUMP_KERNEL)
+		.procname = "dump_kernel",
+		.maxlen   = 256,
+		.mode     = 0200,
+		.proc_handler = &proc_dump_kernel,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_DAEMON_FILE)
+		.procname = "daemon_file",
+		.mode     = 0644,
+		.maxlen   = 256,
+		.proc_handler = &proc_daemon_file,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_DEBUG_MB)
+		.procname = "debug_mb",
+		.mode     = 0644,
+		.proc_handler = &proc_debug_mb,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_WATCHDOG_RATELIMIT)
+		.procname = "watchdog_ratelimit",
+		.data     = &libcfs_watchdog_ratelimit,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec_minmax,
+		.extra1   = &min_watchdog_ratelimit,
+		.extra2   = &max_watchdog_ratelimit,
+	},
+	{       INIT_CTL_NAME(PSDEV_LNET_FORCE_LBUG)
+		.procname = "force_lbug",
+		.data     = NULL,
+		.maxlen   = 0,
+		.mode     = 0200,
+		.proc_handler = &libcfs_force_lbug
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_FAIL_LOC)
+		.procname = "fail_loc",
+		.data     = &cfs_fail_loc,
+		.maxlen   = sizeof(cfs_fail_loc),
+		.mode     = 0644,
+		.proc_handler = &proc_fail_loc
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_FAIL_VAL)
+		.procname = "fail_val",
+		.data     = &cfs_fail_val,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		INIT_CTL_NAME(0)
+	}
+};
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Root of the sysctl tree registered by insert_proc(): a single "lnet"
+ * directory whose children are the entries of lnet_table, followed by
+ * the terminating entry.
+ */
+static ctl_table_t top_table[] = {
+	{
+		INIT_CTL_NAME(CTL_LNET)
+		.procname = "lnet",
+		.child    = lnet_table,
+		.mode     = 0555,
+		.maxlen   = 0,
+		.data     = NULL,
+	},
+	{
+		INIT_CTL_NAME(0)
+	}
+};
+#endif
+
+/*
+ * Register top_table unless a table header is already recorded. Does
+ * nothing without CONFIG_SYSCTL.
+ *
+ * Always returns 0; the result of the registration is only stored in
+ * lnet_table_header and is not checked here.
+ */
+int insert_proc(void)
+{
+#ifdef CONFIG_SYSCTL
+	if (lnet_table_header != NULL)
+		return 0;
+
+	lnet_table_header = cfs_register_sysctl_table(top_table, 0);
+#endif
+	return 0;
+}
+
+/*
+ * Unregister the table recorded by insert_proc(), if any, and forget its
+ * header. Does nothing without CONFIG_SYSCTL.
+ */
+void remove_proc(void)
+{
+#ifdef CONFIG_SYSCTL
+	if (lnet_table_header != NULL) {
+		unregister_sysctl_table(lnet_table_header);
+		lnet_table_header = NULL;
+	}
+#endif
+}
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-sync.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-sync.c
new file mode 100644
index 000000000000..a3043478b7c1
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-sync.c
@@ -0,0 +1,35 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+# define DEBUG_SUBSYSTEM S_LNET
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c
new file mode 100644
index 000000000000..4a018167cb03
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c
@@ -0,0 +1,664 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/libcfs/libcfs.h>
+
+#include <linux/if.h>
+#include <linux/in.h>
+#include <linux/file.h>
+/* For sys_open & sys_close */
+#include <linux/syscalls.h>
+
+/*
+ * Issue ioctl @cmd with argument @arg on a temporary PF_INET stream socket.
+ * Callers pass the address of a kernel buffer in @arg, hence the
+ * set_fs(KERNEL_DS) bracket.  Returns the ioctl result or a negative errno.
+ */
+int
+libcfs_sock_ioctl(int cmd, unsigned long arg)
+{
+	mm_segment_t    oldmm = get_fs();
+	struct socket  *sock;
+	int	     rc;
+	struct file    *sock_filp;
+
+	rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock);
+	if (rc != 0) {
+		CERROR ("Can't create socket: %d\n", rc);
+		return rc;
+	}
+
+	/* sock_alloc_file() reports failure with ERR_PTR(), never NULL */
+	sock_filp = sock_alloc_file(sock, 0, NULL);
+	if (IS_ERR(sock_filp)) {
+		sock_release(sock);
+		return PTR_ERR(sock_filp);
+	}
+
+	set_fs(KERNEL_DS);
+	if (sock_filp->f_op->unlocked_ioctl)
+		rc = sock_filp->f_op->unlocked_ioctl(sock_filp, cmd, arg);
+	set_fs(oldmm);
+
+	/* the file owns the socket now: dropping the file releases it */
+	fput(sock_filp);
+	return rc;
+}
+
+/*
+ * Look up interface @name.  *up tells whether it is up; if so *ip and *mask
+ * receive its IPv4 address and netmask in host byte order, otherwise both
+ * are zeroed.  Returns 0 or a negative errno.
+ */
+int
+libcfs_ipif_query (char *name, int *up, __u32 *ip, __u32 *mask)
+{
+	struct sockaddr_in *sin;
+	struct ifreq	    ifr;
+	int		    rc;
+
+	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ) {
+		CERROR("Interface name %s too long\n", name);
+		return -EINVAL;
+	}
+
+	CLASSERT (sizeof(ifr.ifr_name) >= IFNAMSIZ);
+
+	/* is the interface up at all? */
+	strcpy(ifr.ifr_name, name);
+	rc = libcfs_sock_ioctl(SIOCGIFFLAGS, (unsigned long)&ifr);
+	if (rc != 0) {
+		CERROR("Can't get flags for interface %s\n", name);
+		return rc;
+	}
+
+	*up = (ifr.ifr_flags & IFF_UP) != 0;
+	if (!*up) {
+		CDEBUG(D_NET, "Interface %s down\n", name);
+		*ip = *mask = 0;
+		return 0;
+	}
+
+	/* its IPv4 address ... */
+	strcpy(ifr.ifr_name, name);
+	ifr.ifr_addr.sa_family = AF_INET;
+	rc = libcfs_sock_ioctl(SIOCGIFADDR, (unsigned long)&ifr);
+	if (rc != 0) {
+		CERROR("Can't get IP address for interface %s\n", name);
+		return rc;
+	}
+
+	sin = (struct sockaddr_in *)&ifr.ifr_addr;
+	*ip = ntohl(sin->sin_addr.s_addr);
+
+	/* ... and its netmask */
+	strcpy(ifr.ifr_name, name);
+	ifr.ifr_addr.sa_family = AF_INET;
+	rc = libcfs_sock_ioctl(SIOCGIFNETMASK, (unsigned long)&ifr);
+	if (rc != 0) {
+		CERROR("Can't get netmask for interface %s\n", name);
+		return rc;
+	}
+
+	sin = (struct sockaddr_in *)&ifr.ifr_netmask;
+	*mask = ntohl(sin->sin_addr.s_addr);
+
+	return 0;
+}
+EXPORT_SYMBOL(libcfs_ipif_query);
+
+/*
+ * List the interfaces reported by SIOCGIFCONF.  Returns the number of names
+ * found or a negative errno; when the count is non-zero *namesp is set to a
+ * freshly allocated array (see libcfs_ipif_free_enumeration()).
+ */
+int
+libcfs_ipif_enumerate (char ***namesp)
+{
+	char	   **names;
+	int	     toobig;
+	int	     nalloc;
+	int	     nfound;
+	struct ifreq   *ifr;
+	struct ifconf   ifc;
+	int	     rc;
+	int	     nob;
+	int	     i;
+
+	nalloc = 16;	/* first guess at max interfaces */
+	toobig = 0;
+	for (;;) {
+		if (nalloc * sizeof(*ifr) > PAGE_CACHE_SIZE) {
+			toobig = 1;
+			nalloc = PAGE_CACHE_SIZE/sizeof(*ifr);
+			CWARN("Too many interfaces: only enumerating first %d\n",
+			      nalloc);
+		}
+
+		LIBCFS_ALLOC(ifr, nalloc * sizeof(*ifr));
+		if (ifr == NULL) {
+			CERROR ("ENOMEM enumerating up to %d interfaces\n", nalloc);
+			rc = -ENOMEM;
+			goto out0;
+		}
+
+		ifc.ifc_buf = (char *)ifr;
+		ifc.ifc_len = nalloc * sizeof(*ifr);
+
+		rc = libcfs_sock_ioctl(SIOCGIFCONF, (unsigned long)&ifc);
+		if (rc < 0) {
+			CERROR ("Error %d enumerating interfaces\n", rc);
+			goto out1;
+		}
+
+		LASSERT (rc == 0);
+
+		nfound = ifc.ifc_len/sizeof(*ifr);
+		LASSERT (nfound <= nalloc);
+
+		if (nfound < nalloc || toobig)
+			break;
+		/* buffer came back full: retry with twice the room */
+		LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
+		nalloc *= 2;
+	}
+
+	if (nfound == 0)
+		goto out1;	/* rc is still 0 here */
+
+	LIBCFS_ALLOC(names, nfound * sizeof(*names));
+	if (names == NULL) {
+		rc = -ENOMEM;
+		goto out1;
+	}
+	/* NULL out all names[i] */
+	memset (names, 0, nfound * sizeof(*names));
+
+	for (i = 0; i < nfound; i++) {
+		nob = strnlen (ifr[i].ifr_name, IFNAMSIZ);
+		if (nob == IFNAMSIZ) {
+			/* no space for terminating NULL */
+			CERROR("interface name %.*s too long (%d max)\n",
+			       nob, ifr[i].ifr_name, IFNAMSIZ);
+			rc = -ENAMETOOLONG;
+			goto out2;
+		}
+
+		LIBCFS_ALLOC(names[i], IFNAMSIZ);
+		if (names[i] == NULL) {
+			rc = -ENOMEM;
+			goto out2;
+		}
+
+		memcpy(names[i], ifr[i].ifr_name, nob);
+		names[i][nob] = 0;
+	}
+
+	*namesp = names;
+	rc = nfound;
+
+ out2:
+	if (rc < 0)
+		libcfs_ipif_free_enumeration(names, nfound);
+ out1:
+	LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
+ out0:
+	return rc;
+}
+EXPORT_SYMBOL(libcfs_ipif_enumerate);
+
+/* Free a name array built by libcfs_ipif_enumerate(); stops at a NULL slot */
+void libcfs_ipif_free_enumeration (char **names, int n)
+{
+	int i;
+
+	LASSERT (n > 0);
+	for (i = 0; i < n; i++) {
+		if (names[i] == NULL)
+			break;
+		LIBCFS_FREE(names[i], IFNAMSIZ);
+	}
+	LIBCFS_FREE(names, n * sizeof(*names));
+}
+EXPORT_SYMBOL(libcfs_ipif_free_enumeration);
+
+/*
+ * Send all @nob bytes of @buffer on @sock.  @timeout is in seconds; 0 means
+ * a single non-blocking attempt.  Returns 0 or a negative errno (-EAGAIN
+ * when time is up after a partial send).
+ */
+int
+libcfs_sock_write (struct socket *sock, void *buffer, int nob, int timeout)
+{
+	int	    rc;
+	mm_segment_t   oldmm = get_fs();
+	long	   ticks = timeout * HZ;
+	unsigned long  then;
+	struct timeval tv;
+
+	LASSERT (nob > 0);
+	/* Caller may pass a zero timeout if she thinks the socket buffer is
+	 * empty enough to take the whole message immediately */
+
+	for (;;) {
+		struct iovec  iov = {
+			.iov_base = buffer,
+			.iov_len  = nob
+		};
+		struct msghdr msg = {
+			.msg_name       = NULL,
+			.msg_namelen    = 0,
+			.msg_iov	= &iov,
+			.msg_iovlen     = 1,
+			.msg_control    = NULL,
+			.msg_controllen = 0,
+			.msg_flags      = (timeout == 0) ? MSG_DONTWAIT : 0
+		};
+
+		if (timeout != 0) {
+			/* Set send timeout to remaining time */
+			tv = (struct timeval) {
+				.tv_sec = ticks / HZ,
+				.tv_usec = ((ticks % HZ) * 1000000) / HZ
+			};
+			set_fs(KERNEL_DS);
+			rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO,
+					     (char *)&tv, sizeof(tv));
+			set_fs(oldmm);
+			if (rc != 0) {
+				CERROR("Can't set socket send timeout "
+				       "%ld.%06d: %d\n",
+				       (long)tv.tv_sec, (int)tv.tv_usec, rc);
+				return rc;
+			}
+		}
+
+		set_fs (KERNEL_DS);
+		then = jiffies;
+		rc = sock_sendmsg (sock, &msg, iov.iov_len);
+		ticks -= jiffies - then;
+		set_fs (oldmm);
+
+		if (rc == nob)
+			return 0;
+		if (rc < 0)
+			return rc;
+		if (rc == 0) {
+			CERROR ("Unexpected zero rc\n");
+			return (-ECONNABORTED);
+		}
+		if (ticks <= 0)
+			return -EAGAIN;
+		buffer = ((char *)buffer) + rc;
+		nob -= rc;
+	}
+	return (0);
+}
+EXPORT_SYMBOL(libcfs_sock_write);
+
+/*
+ * Receive exactly @nob bytes into @buffer within @timeout (> 0) seconds.
+ * Returns 0, -ECONNRESET if recvmsg returns 0, -ETIMEDOUT or another errno.
+ */
+int
+libcfs_sock_read (struct socket *sock, void *buffer, int nob, int timeout)
+{
+	int	    rc;
+	mm_segment_t   oldmm = get_fs();
+	long	   ticks = timeout * HZ;
+	unsigned long  then;
+	struct timeval tv;
+
+	LASSERT (nob > 0);
+	LASSERT (ticks > 0);
+
+	for (;;) {
+		struct iovec  iov = {
+			.iov_base = buffer,
+			.iov_len  = nob
+		};
+		struct msghdr msg = {
+			.msg_name       = NULL,
+			.msg_namelen    = 0,
+			.msg_iov	= &iov,
+			.msg_iovlen     = 1,
+			.msg_control    = NULL,
+			.msg_controllen = 0,
+			.msg_flags      = 0
+		};
+
+		/* Set receive timeout to remaining time */
+		tv = (struct timeval) {
+			.tv_sec = ticks / HZ,
+			.tv_usec = ((ticks % HZ) * 1000000) / HZ
+		};
+		set_fs(KERNEL_DS);
+		rc = sock_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
+				     (char *)&tv, sizeof(tv));
+		set_fs(oldmm);
+		if (rc != 0) {
+			CERROR("Can't set socket recv timeout %ld.%06d: %d\n",
+			       (long)tv.tv_sec, (int)tv.tv_usec, rc);
+			return rc;
+		}
+
+		set_fs(KERNEL_DS);
+		then = jiffies;
+		rc = sock_recvmsg(sock, &msg, iov.iov_len, 0);
+		ticks -= jiffies - then;
+		set_fs(oldmm);
+
+		if (rc < 0)
+			return rc;
+		if (rc == 0)
+			return -ECONNRESET;
+
+		buffer = ((char *)buffer) + rc;
+		nob -= rc;
+		if (nob == 0)
+			return 0;
+		if (ticks <= 0)
+			return -ETIMEDOUT;
+	}
+}
+EXPORT_SYMBOL(libcfs_sock_read);
+
+/*
+ * Create a TCP socket with SO_REUSEADDR, bound to @local_ip/@local_port if
+ * either is non-zero.  Only bind() hitting a port in use is not *fatal.
+ */
+static int libcfs_sock_create (struct socket **sockp, int *fatal,
+		    __u32 local_ip, int local_port)
+{
+	struct sockaddr_in  locaddr;
+	struct socket      *sock;
+	int		 rc;
+	int		 option = 1;
+	mm_segment_t	oldmm = get_fs();
+
+	*fatal = 1;
+
+	rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock);
+	if (rc != 0) {
+		CERROR ("Can't create socket: %d\n", rc);
+		return (rc);
+	}
+	*sockp = sock;	/* only now is 'sock' known to be initialised */
+
+	set_fs (KERNEL_DS);
+	rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
+			     (char *)&option, sizeof (option));
+	set_fs (oldmm);
+	if (rc != 0) {
+		CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc);
+		goto failed;
+	}
+
+	if (local_ip != 0 || local_port != 0) {
+		memset(&locaddr, 0, sizeof(locaddr));
+		locaddr.sin_family = AF_INET;
+		locaddr.sin_port = htons(local_port);
+		locaddr.sin_addr.s_addr = (local_ip == 0) ?
+					  INADDR_ANY : htonl(local_ip);
+		rc = sock->ops->bind(sock, (struct sockaddr *)&locaddr,
+				     sizeof(locaddr));
+		if (rc == -EADDRINUSE) {
+			CDEBUG(D_NET, "Port %d already in use\n", local_port);
+			*fatal = 0;
+			goto failed;
+		}
+		if (rc != 0) {
+			CERROR("Error trying to bind to port %d: %d\n",
+			       local_port, rc);
+			goto failed;
+		}
+	}
+
+	return 0;
+
+ failed:
+	sock_release(sock);
+	return rc;
+}
+
+/*
+ * Set the send and/or receive buffer size of @sock; a size of 0 leaves the
+ * corresponding buffer alone.  Returns 0 or the sock_setsockopt() error.
+ */
+int
+libcfs_sock_setbuf (struct socket *sock, int txbufsize, int rxbufsize)
+{
+	mm_segment_t	saved_fs = get_fs();
+	int		rc;
+
+	if (txbufsize != 0) {
+		set_fs (KERNEL_DS);
+		rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
+				     (char *)&txbufsize, sizeof (txbufsize));
+		set_fs (saved_fs);
+		if (rc != 0) {
+			CERROR ("Can't set send buffer %d: %d\n",
+				txbufsize, rc);
+			return rc;
+		}
+	}
+
+	if (rxbufsize != 0) {
+		set_fs (KERNEL_DS);
+		rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVBUF,
+				      (char *)&rxbufsize, sizeof (rxbufsize));
+		set_fs (saved_fs);
+		if (rc != 0) {
+			CERROR ("Can't set receive buffer %d: %d\n",
+				rxbufsize, rc);
+			return rc;
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(libcfs_sock_setbuf);
+
+/*
+ * Fetch the peer (@remote != 0) or local address of @sock.  @ip and @port
+ * are optional outputs in host byte order.  Returns 0 or a negative errno.
+ */
+int libcfs_sock_getaddr (struct socket *sock, int remote, __u32 *ip, int *port)
+{
+	struct sockaddr_in sin;
+	int		addrlen = sizeof (sin);
+	int		rc;
+
+	rc = sock->ops->getname (sock, (struct sockaddr *)&sin, &addrlen,
+				 remote ? 2 : 0);
+	if (rc != 0) {
+		CERROR ("Error %d getting sock %s IP/port\n",
+			rc, remote ? "peer" : "local");
+		return rc;
+	}
+
+	if (ip != NULL)
+		*ip = ntohl (sin.sin_addr.s_addr);
+	if (port != NULL)
+		*port = ntohs (sin.sin_port);
+	return 0;
+}
+EXPORT_SYMBOL(libcfs_sock_getaddr);
+
+/*
+ * Report the send and/or receive buffer size of @sock through whichever
+ * of @txbufsize and @rxbufsize is not NULL.  Always returns 0.
+ */
+int
+libcfs_sock_getbuf (struct socket *sock, int *txbufsize, int *rxbufsize)
+{
+	if (txbufsize != NULL)
+		*txbufsize = sock->sk->sk_sndbuf;
+
+	if (rxbufsize != NULL)
+		*rxbufsize = sock->sk->sk_rcvbuf;
+
+	return 0;
+}
+EXPORT_SYMBOL(libcfs_sock_getbuf);
+
+/*
+ * Create a socket with libcfs_sock_create() and listen on it with @backlog.
+ */
+int libcfs_sock_listen (struct socket **sockp,
+		    __u32 local_ip, int local_port, int backlog)
+{
+	int      fatal;
+	int      rc;
+
+	rc = libcfs_sock_create(sockp, &fatal, local_ip, local_port);
+	if (rc != 0) {
+		if (!fatal)
+			CERROR("Can't create socket: port %d already in use\n",
+			       local_port);
+		return rc;
+	}
+
+	rc = (*sockp)->ops->listen(*sockp, backlog);
+	if (rc != 0) {
+		CERROR("Can't set listen backlog %d: %d\n", backlog, rc);
+		sock_release(*sockp);	/* do not hand back a dead socket */
+	}
+	return rc;
+}
+EXPORT_SYMBOL(libcfs_sock_listen);
+
+/*
+ * Accept one connection on listening socket @sock into *newsockp.  Sleeps
+ * interruptibly, at most once, when nothing is pending.  Returns 0 or < 0.
+ */
+int
+libcfs_sock_accept (struct socket **newsockp, struct socket *sock)
+{
+	wait_queue_t   wait;
+	struct socket *newsock;
+	int	    rc;
+
+	init_waitqueue_entry(&wait, current);
+	/* XXX this should add a ref to sock->ops->owner, if
+	 * TCP could be a module */
+	rc = sock_create_lite(PF_PACKET, sock->type, IPPROTO_TCP, &newsock);
+	if (rc) {
+		CERROR("Can't allocate socket\n");
+		return rc;
+	}
+	newsock->ops = sock->ops;
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	add_wait_queue(cfs_sk_sleep(sock->sk), &wait);
+
+	rc = sock->ops->accept(sock, newsock, O_NONBLOCK);
+	if (rc == -EAGAIN) {
+		/* Nothing ready, so wait for activity */
+		schedule();
+		rc = sock->ops->accept(sock, newsock, O_NONBLOCK);
+	}
+
+	remove_wait_queue(cfs_sk_sleep(sock->sk), &wait);
+	set_current_state(TASK_RUNNING);
+
+	if (rc != 0)
+		goto failed;
+	*newsockp = newsock;
+	return 0;
+
+ failed:
+	sock_release(newsock);
+	return rc;
+}
+EXPORT_SYMBOL(libcfs_sock_accept);
+
+/* Wake every task sleeping on @sock's wait queue (see libcfs_sock_accept) */
+void
+libcfs_sock_abort_accept (struct socket *sock)
+{
+	wake_up_all(cfs_sk_sleep(sock->sk));
+}
+EXPORT_SYMBOL(libcfs_sock_abort_accept);
+
+/*
+ * Create a socket (bound to @local_ip/@local_port if either is non-zero) and
+ * connect it to @peer_ip/@peer_port.  Returns 0 with the socket in *sockp;
+ * on failure the socket is released and *fatal is 0 when the caller may
+ * retry with a different local port.
+ */
+int libcfs_sock_connect (struct socket **sockp, int *fatal,
+		     __u32 local_ip, int local_port,
+		     __u32 peer_ip, int peer_port)
+{
+	struct sockaddr_in  srvaddr;
+	int		 rc;
+
+	rc = libcfs_sock_create(sockp, fatal, local_ip, local_port);
+	if (rc != 0)
+		return rc;
+
+	memset (&srvaddr, 0, sizeof (srvaddr));
+	srvaddr.sin_family = AF_INET;
+	srvaddr.sin_port = htons(peer_port);
+	srvaddr.sin_addr.s_addr = htonl(peer_ip);
+
+	rc = (*sockp)->ops->connect(*sockp, (struct sockaddr *)&srvaddr,
+				    sizeof(srvaddr), 0);
+	if (rc == 0)
+		return 0;
+
+	/* EADDRNOTAVAIL probably means we're already connected to the same
+	 * peer/port on the same local port on a differently typed
+	 * connection.  Let our caller retry with a different local
+	 * port... */
+	*fatal = (rc != -EADDRNOTAVAIL);
+	CDEBUG_LIMIT(*fatal ? D_NETERROR : D_NET,
+	       "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc,
+	       HIPQUAD(local_ip), local_port, HIPQUAD(peer_ip), peer_port);
+	sock_release(*sockp);
+	return rc;
+}
+EXPORT_SYMBOL(libcfs_sock_connect);
+
+/* Exported wrapper: release @sock with sock_release() */
+void
+libcfs_sock_release (struct socket *sock)
+{
+	sock_release(sock);
+}
+EXPORT_SYMBOL(libcfs_sock_release);
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c
new file mode 100644
index 000000000000..6f563436a255
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c
@@ -0,0 +1,275 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#define LUSTRE_TRACEFILE_PRIVATE
+
+#include <linux/libcfs/libcfs.h>
+#include "tracefile.h"
+
+/* Percentage of the total debug memory given to each trace buffer type */
+static unsigned int pages_factor[CFS_TCD_TYPE_MAX] = {
+	[CFS_TCD_TYPE_PROC]	= 80,
+	[CFS_TCD_TYPE_SOFTIRQ]	= 10,
+	[CFS_TCD_TYPE_IRQ]	= 10,
+};
+
+char *cfs_trace_console_buffers[NR_CPUS][CFS_TCD_TYPE_MAX];
+/* taken through the cfs_tracefile_{read,write}_{lock,unlock}() helpers */
+struct rw_semaphore cfs_tracefile_sem;
+
+/*
+ * Allocate the trace data arrays and per-CPU console buffers and initialise
+ * every tcd.  Returns 0, or -ENOMEM once any partial allocation is undone.
+ */
+int cfs_tracefile_init_arch()
+{
+	int    i, j;
+	struct cfs_trace_cpu_data *tcd;
+
+	init_rwsem(&cfs_tracefile_sem);
+	/* initialize trace_data */
+	memset(cfs_trace_data, 0, sizeof(cfs_trace_data));
+	for (i = 0; i < CFS_TCD_TYPE_MAX; i++) {
+		cfs_trace_data[i] =
+			kmalloc(sizeof(union cfs_trace_data_union) *
+				num_possible_cpus(), GFP_KERNEL);
+		if (cfs_trace_data[i] == NULL)
+			goto out;
+	}
+
+	/* arch related info initialized */
+	cfs_tcd_for_each(tcd, i, j) {
+		spin_lock_init(&tcd->tcd_lock);
+		tcd->tcd_pages_factor = pages_factor[i];
+		tcd->tcd_type = i;
+		tcd->tcd_cpu = j;
+	}
+
+	for (i = 0; i < num_possible_cpus(); i++)
+		for (j = 0; j < CFS_TCD_TYPE_MAX; j++) {
+			cfs_trace_console_buffers[i][j] =
+				kmalloc(CFS_TRACE_CONSOLE_BUFFER_SIZE,
+					GFP_KERNEL);
+			if (cfs_trace_console_buffers[i][j] == NULL)
+				goto out;
+		}
+
+	return 0;
+
+out:
+	cfs_tracefile_fini_arch();
+	printk(KERN_ERR "lnet: Not enough memory\n");
+	return -ENOMEM;
+}
+
+/* Undo cfs_tracefile_init_arch(), including a partial one: slots that were
+ * never allocated are still NULL and kfree(NULL) is a no-op */
+void cfs_tracefile_fini_arch()
+{
+	int    i, j;
+
+	for (i = 0; i < num_possible_cpus(); i++)
+		for (j = 0; j < CFS_TCD_TYPE_MAX; j++) {
+			kfree(cfs_trace_console_buffers[i][j]);
+			cfs_trace_console_buffers[i][j] = NULL;
+		}
+
+	for (i = 0; i < CFS_TCD_TYPE_MAX && cfs_trace_data[i] != NULL; i++) {
+		kfree(cfs_trace_data[i]);
+		cfs_trace_data[i] = NULL;
+	}
+
+	fini_rwsem(&cfs_tracefile_sem);
+}
+
+void cfs_tracefile_read_lock()
+{
+	down_read(&cfs_tracefile_sem);	/* take the semaphore for reading */
+}
+
+void cfs_tracefile_read_unlock()
+{
+	up_read(&cfs_tracefile_sem);	/* pairs with cfs_tracefile_read_lock() */
+}
+
+void cfs_tracefile_write_lock()
+{
+	down_write(&cfs_tracefile_sem);	/* take the semaphore for writing */
+}
+
+void cfs_tracefile_write_unlock()
+{
+	up_write(&cfs_tracefile_sem);	/* pairs with cfs_tracefile_write_lock() */
+}
+
+/* Pick the trace buffer type matching the current execution context */
+cfs_trace_buf_type_t cfs_trace_buf_idx_get()
+{
+	if (in_irq())
+		return CFS_TCD_TYPE_IRQ;
+	if (in_softirq())
+		return CFS_TCD_TYPE_SOFTIRQ;
+	return CFS_TCD_TYPE_PROC;
+}
+
+/*
+ * Lock @tcd with the primitive matching its type.  A non-zero @walking means
+ * the caller is iterating over all tcd types, so local irqs must be disabled
+ * as well to avoid deadlocks with locks taken from interrupt context.  See
+ * LU-1311 for details.  Always returns 1.
+ */
+int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking)
+{
+	__LASSERT(tcd->tcd_type < CFS_TCD_TYPE_MAX);
+	if (tcd->tcd_type == CFS_TCD_TYPE_IRQ)
+		spin_lock_irqsave(&tcd->tcd_lock, tcd->tcd_lock_flags);
+	else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ)
+		spin_lock_bh(&tcd->tcd_lock);
+	else if (unlikely(walking))
+		spin_lock_irq(&tcd->tcd_lock);
+	else
+		spin_lock(&tcd->tcd_lock);
+	return 1;
+}
+
+void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking)
+{
+	__LASSERT(tcd->tcd_type < CFS_TCD_TYPE_MAX);
+	if (tcd->tcd_type == CFS_TCD_TYPE_IRQ)	/* restore saved irq flags */
+		spin_unlock_irqrestore(&tcd->tcd_lock, tcd->tcd_lock_flags);
+	else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ)
+		spin_unlock_bh(&tcd->tcd_lock);
+	else if (unlikely(walking))	/* lock side used spin_lock_irq() */
+		spin_unlock_irq(&tcd->tcd_lock);
+	else
+		spin_unlock(&tcd->tcd_lock);
+}
+
+int cfs_tcd_owns_tage(struct cfs_trace_cpu_data *tcd,
+		      struct cfs_trace_page *tage)
+{
+	/*
+	 * True when @tage is tagged with @tcd's CPU.  XXX nikita: do NOT call
+	 * portals_debug_msg() (CDEBUG/ENTRY/EXIT) here: infinite recursion.
+	 */
+	return tcd->tcd_cpu == tage->cpu;
+}
+
+/* Fill @header from @msgdata, @stack and the current time, CPU and task */
+void cfs_set_ptldebug_header(struct ptldebug_header *header,
+			     struct libcfs_debug_msg_data *msgdata,
+			     unsigned long stack)
+{
+	struct timeval now;
+
+	do_gettimeofday(&now);
+	header->ph_sec = (__u32)now.tv_sec;
+	header->ph_usec = now.tv_usec;
+
+	header->ph_subsys = msgdata->msg_subsys;
+	header->ph_mask = msgdata->msg_mask;
+	header->ph_line_num = msgdata->msg_line;
+
+	header->ph_cpu_id = smp_processor_id();
+	header->ph_type = cfs_trace_buf_idx_get();
+	header->ph_stack = stack;
+	header->ph_pid = current->pid;
+	header->ph_extern_pid = 0;
+}
+
+/*
+ * Console prefix for error messages: "LNetError" for the S_LND and S_LNET
+ * subsystems, "LustreError" for everything else.
+ */
+static char *
+dbghdr_to_err_string(struct ptldebug_header *hdr)
+{
+	if (hdr->ph_subsys == S_LND || hdr->ph_subsys == S_LNET)
+		return "LNetError";
+
+	return "LustreError";
+}
+
+/*
+ * Console prefix for non-error messages: "LNet" for the S_LND and S_LNET
+ * subsystems, "Lustre" for everything else.
+ */
+static char *
+dbghdr_to_info_string(struct ptldebug_header *hdr)
+{
+	if (hdr->ph_subsys == S_LND || hdr->ph_subsys == S_LNET)
+		return "LNet";
+
+	return "Lustre";
+}
+
+/* Print @buf on the console at the printk level selected by @mask (KERN_INFO
+ * if no level bit is set), behind the Lustre/LNet prefix of @hdr's subsystem */
+void cfs_print_to_console(struct ptldebug_header *hdr, int mask,
+			  const char *buf, int len, const char *file,
+			  const char *fn)
+{
+	char *prefix = "Lustre", *ptype = KERN_INFO;
+
+	if ((mask & D_EMERG) != 0) {
+		prefix = dbghdr_to_err_string(hdr);
+		ptype = KERN_EMERG;
+	} else if ((mask & D_ERROR) != 0) {
+		prefix = dbghdr_to_err_string(hdr);
+		ptype = KERN_ERR;
+	} else if ((mask & D_WARNING) != 0) {
+		prefix = dbghdr_to_info_string(hdr);
+		ptype = KERN_WARNING;
+	} else if ((mask & (D_CONSOLE | libcfs_printk)) != 0) {
+		prefix = dbghdr_to_info_string(hdr);
+		ptype = KERN_INFO;
+	}
+
+	if ((mask & D_CONSOLE) != 0)
+		printk("%s%s: %.*s", ptype, prefix, len, buf);
+	else
+		printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix,
+		       hdr->ph_pid, hdr->ph_extern_pid, file, hdr->ph_line_num,
+		       fn, len, buf);
+}
+
+/* Cap for the debug buffer size in MB: 80% of RAM, but at least 512 */
+int cfs_trace_max_debug_mb(void)
+{
+	/* stay in unsigned long: the "* 80" overflows an int on huge boxes */
+	return MAX(512UL, ((num_physpages >> (20 - PAGE_SHIFT)) * 80) / 100);
+}
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h b/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h
new file mode 100644
index 000000000000..ba84e4ffddd1
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h
@@ -0,0 +1,48 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_LINUX_TRACEFILE_H__
+#define __LIBCFS_LINUX_TRACEFILE_H__
+
+/**
+ * The three types of trace_data on Linux, one per execution context.
+ */
+typedef enum {
+	CFS_TCD_TYPE_PROC = 0,	/* process context */
+	CFS_TCD_TYPE_SOFTIRQ,	/* softirq context */
+	CFS_TCD_TYPE_IRQ,	/* hard irq context */
+	CFS_TCD_TYPE_MAX	/* number of types, not a type itself */
+} cfs_trace_buf_type_t;
+
+#endif
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-utils.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-utils.c
new file mode 100644
index 000000000000..e73903cde212
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-utils.c
@@ -0,0 +1,75 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/linux/linux-utils.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+/*
+ * miscellaneous libcfs stuff
+ */
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+
+/*
+ * Convert a server error code to client format.  Both sides take their
+ * codes from Linux errno.h, so on a Linux client this is the identity.
+ */
+int convert_server_error(__u64 ecode)
+{
+	return ecode;
+}
+EXPORT_SYMBOL(convert_server_error);
+
+/*
+ * Convert <fcntl.h> flags from client to server: identity, returns 0.
+ */
+int convert_client_oflag(int cflag, int *result)
+{
+	*result = cflag;
+	return 0;
+}
+EXPORT_SYMBOL(convert_client_oflag);
+
+/* Stub: records nothing and leaves @trace untouched */
+void cfs_stack_trace_fill(struct cfs_stack_trace *trace)
+{}
+EXPORT_SYMBOL(cfs_stack_trace_fill);
+
+void *cfs_stack_trace_frame(struct cfs_stack_trace *trace, int frame_no)
+{
+	return NULL;	/* stub: there is never a frame to return */
+}
+EXPORT_SYMBOL(cfs_stack_trace_frame);
diff --git a/drivers/staging/lustre/lustre/libcfs/lwt.c b/drivers/staging/lustre/lustre/libcfs/lwt.c
new file mode 100644
index 000000000000..b631f7dde8e7
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/lwt.c
@@ -0,0 +1,266 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/lwt.c
+ *
+ * Author: Eric Barton <eeb@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+#if LWT_SUPPORT
+
+#if !KLWT_SUPPORT
+int	 lwt_enabled;	/* switched by lwt_init() and lwt_control() */
+lwt_cpu_t   lwt_cpus[NR_CPUS];	/* per-CPU state, holds lwtc_current_page */
+#endif
+
+int	 lwt_pages_per_cpu;	/* computed in lwt_init() */
+
+/* NB only root is allowed to retrieve LWT info; it's an open door into the
+ * kernel... */
+
+/*
+ * Copy the string at kernel address @knl_ptr to @user_ptr (if not NULL) and
+ * store its size, capped at 128 and at a positive @user_size, in *size.
+ */
+int lwt_lookup_string (int *size, char *knl_ptr,
+		   char *user_ptr, int user_size)
+{
+	int   maxsize = 128;
+
+	/* knl_ptr was retrieved from an LWT snapshot and the caller wants to
+	 * turn it into a string.  NB we can crash with an access violation
+	 * trying to determine the string length, so we're trusting our
+	 * caller... */
+
+	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+		return (-EPERM);
+
+	if (user_size > 0 && maxsize > user_size)
+		maxsize = user_size;
+	*size = strnlen (knl_ptr, maxsize - 1) + 1;
+
+	if (user_ptr != NULL) {
+		if (user_size < 4)
+			return (-EINVAL);
+		if (copy_to_user (user_ptr, knl_ptr, *size))
+			return (-EFAULT);
+		/* Did I truncate the string?  Then end it in "..." */
+		if (knl_ptr[*size - 1] != 0 &&
+		    copy_to_user (user_ptr + *size - 4, "...", 4))
+			return (-EFAULT);
+	}
+
+	return (0);
+}
+
+/*
+ * Enable or disable tracing and, if @clear, zero every CPU's event pages.
+ * Root only.  Returns 0, -EPERM, or -ENODATA if a CPU has no pages.
+ */
+int lwt_control (int enable, int clear)
+{
+	lwt_page_t  *p;
+	int	  i;
+	int	  j;
+
+	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+		return (-EPERM);
+
+	if (!enable) {
+		LWT_EVENT(0,0,0,0);
+		lwt_enabled = 0;
+		mb();
+		/* really sleep: give people some time to stop adding traces */
+		schedule_timeout_uninterruptible(10);
+	}
+
+	for (i = 0; i < num_online_cpus(); i++) {
+		p = lwt_cpus[i].lwtc_current_page;
+		if (p == NULL)
+			return (-ENODATA);
+		if (!clear)
+			continue;
+
+		for (j = 0; j < lwt_pages_per_cpu; j++) {
+			memset (p->lwtp_events, 0, PAGE_CACHE_SIZE);
+			p = list_entry (p->lwtp_list.next,
+					    lwt_page_t, lwtp_list);
+		}
+	}
+
+	if (enable) {
+		lwt_enabled = 1;
+		mb();
+		LWT_EVENT(0,0,0,0);
+	}
+
+	return (0);
+}
+
+/*
+ * Report cycle counter, online CPU count and total event size; if @user_ptr
+ * is set, also copy every CPU's event pages there (@user_size is unused).
+ */
+int lwt_snapshot (cfs_cycles_t *now, int *ncpu, int *total_size,
+	      void *user_ptr, int user_size)
+{
+	const int    events_per_page = PAGE_CACHE_SIZE / sizeof(lwt_event_t);
+	const int    bytes_per_page = events_per_page * sizeof(lwt_event_t);
+	lwt_page_t  *page;
+	int	  cpu;
+	int	  pg;
+
+	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+		return (-EPERM);
+
+	*ncpu = num_online_cpus();
+	*total_size = num_online_cpus() * lwt_pages_per_cpu * bytes_per_page;
+	*now = get_cycles();
+
+	if (user_ptr == NULL)
+		return (0);
+
+	for (cpu = 0; cpu < num_online_cpus(); cpu++) {
+		page = lwt_cpus[cpu].lwtc_current_page;
+		if (page == NULL)
+			return (-ENODATA);
+
+		for (pg = 0; pg < lwt_pages_per_cpu; pg++) {
+			if (copy_to_user(user_ptr, page->lwtp_events,
+					 bytes_per_page))
+				return (-EFAULT);
+			user_ptr = ((char *)user_ptr) + bytes_per_page;
+			page = list_entry(page->lwtp_list.next,
+					  lwt_page_t, lwtp_list);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Allocate lwt_pages_per_cpu event pages for each online CPU, link them
+ * into a per-CPU list and enable tracing.  Returns 0, -EALREADY or -ENOMEM.
+ */
+int
+lwt_init ()
+{
+	int     i;
+	int     j;
+
+	for (i = 0; i < num_online_cpus(); i++)
+		if (lwt_cpus[i].lwtc_current_page != NULL)
+			return (-EALREADY);
+	LASSERT (!lwt_enabled);
+
+	/* NULL pointers, zero scalars */
+	memset (lwt_cpus, 0, sizeof (lwt_cpus));
+	lwt_pages_per_cpu =
+		LWT_MEMORY / (num_online_cpus() * PAGE_CACHE_SIZE);
+
+	for (i = 0; i < num_online_cpus(); i++)
+		for (j = 0; j < lwt_pages_per_cpu; j++) {
+			struct page *page = alloc_page (GFP_KERNEL);
+			lwt_page_t  *lwtp;
+
+			if (page == NULL) {
+				CERROR ("Can't allocate page\n");
+				lwt_fini ();
+				return (-ENOMEM);
+			}
+			LIBCFS_ALLOC(lwtp, sizeof (*lwtp));
+			if (lwtp == NULL) {
+				CERROR ("Can't allocate lwtp\n");
+				__free_page(page);
+				lwt_fini ();
+				return (-ENOMEM);
+			}
+
+			lwtp->lwtp_page = page;
+			lwtp->lwtp_events = page_address(page);
+			memset (lwtp->lwtp_events, 0, PAGE_CACHE_SIZE);
+			if (j == 0) {
+				INIT_LIST_HEAD (&lwtp->lwtp_list);
+				lwt_cpus[i].lwtc_current_page = lwtp;
+			} else {
+				list_add (&lwtp->lwtp_list,
+				    &lwt_cpus[i].lwtc_current_page->lwtp_list);
+			}
+		}
+
+	lwt_enabled = 1;
+	mb();
+	LWT_EVENT(0,0,0,0);
+
+	return (0);
+}
+
+/*
+ * Disable tracing and free every CPU's event pages and their descriptors.
+ */
+void
+lwt_fini ()
+{
+	int    i;
+
+	lwt_control(0, 0);
+	for (i = 0; i < num_online_cpus(); i++)
+		while (lwt_cpus[i].lwtc_current_page != NULL) {
+			lwt_page_t *lwtp = lwt_cpus[i].lwtc_current_page;
+
+			if (list_empty (&lwtp->lwtp_list)) {
+				lwt_cpus[i].lwtc_current_page = NULL;
+			} else {
+				lwt_cpus[i].lwtc_current_page =
+					list_entry (lwtp->lwtp_list.next,
+							lwt_page_t, lwtp_list);
+				list_del (&lwtp->lwtp_list);
+			}
+			__free_page (lwtp->lwtp_page);
+			LIBCFS_FREE (lwtp, sizeof (*lwtp));
+		}
+}
+
+EXPORT_SYMBOL(lwt_enabled);
+EXPORT_SYMBOL(lwt_cpus);
+
+EXPORT_SYMBOL(lwt_init);
+EXPORT_SYMBOL(lwt_fini);
+EXPORT_SYMBOL(lwt_lookup_string);
+EXPORT_SYMBOL(lwt_control);
+EXPORT_SYMBOL(lwt_snapshot);
+#endif
diff --git a/drivers/staging/lustre/lustre/libcfs/module.c b/drivers/staging/lustre/lustre/libcfs/module.c
new file mode 100644
index 000000000000..3372537c6f3b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/module.c
@@ -0,0 +1,498 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/libcfs/libcfs_crypto.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/lnet.h>
+#include "tracefile.h"
+
+/*
+ * Release every page held by a memhog allocation and reset its root.
+ *
+ * The pages form a tree rooted at ldu->ldu_memhog_root_page: the root page
+ * is used as an array of struct page pointers to level-1 pages, and each
+ * level-1 page is in turn used as an array of pointers to level-2 pages.
+ * Both arrays are walked until the first NULL slot or the end of the page,
+ * and ldu->ldu_memhog_pages is decremented for every page freed; it is
+ * asserted to be zero on exit.  A NULL root means there is nothing to free.
+ */
+void
+kportal_memhog_free (struct libcfs_device_userstate *ldu)
+{
+	const int      slots = PAGE_CACHE_SIZE / sizeof(struct page *);
+	struct page  **dir;
+	struct page  **leaves;
+	int	       i;
+	int	       j;
+
+	if (ldu->ldu_memhog_root_page != NULL) {
+		dir = (struct page **)page_address(ldu->ldu_memhog_root_page);
+
+		for (i = 0; i < slots && dir[i] != NULL; i++) {
+			leaves = (struct page **)page_address(dir[i]);
+
+			for (j = 0; j < slots && leaves[j] != NULL; j++) {
+				__free_page(leaves[j]);
+				ldu->ldu_memhog_pages--;
+			}
+
+			/* leaves[] lives in dir[i]; free it only afterwards */
+			__free_page(dir[i]);
+			ldu->ldu_memhog_pages--;
+		}
+
+		/* likewise dir[] lives in the root page */
+		__free_page(ldu->ldu_memhog_root_page);
+		ldu->ldu_memhog_pages--;
+		ldu->ldu_memhog_root_page = NULL;
+	}
+
+	LASSERT (ldu->ldu_memhog_pages == 0);
+}
+
+/*
+ * Allocate up to npages pages with alloc_page(flags) and hang them off
+ * ldu->ldu_memhog_root_page: the root and each level-1 page are zeroed and
+ * used as arrays of struct page pointers to their children.  Allocation
+ * stops once ldu->ldu_memhog_pages reaches npages or all slots are used.
+ * The count includes the root and level-1 pages themselves.
+ *
+ * Returns 0 on success (or npages == 0), -EINVAL for negative npages,
+ * -ENOMEM when alloc_page() fails and -EINTR when cfs_signal_pending() is
+ * true.  Pages allocated before a failure are not freed here.
+ */
+int
+kportal_memhog_alloc (struct libcfs_device_userstate *ldu, int npages, int flags)
+{
+	const int      slots = PAGE_CACHE_SIZE / sizeof(struct page *);
+	struct page  **dir;
+	struct page  **leaves;
+	int	       i;
+	int	       j;
+
+	LASSERT (ldu->ldu_memhog_pages == 0);
+	LASSERT (ldu->ldu_memhog_root_page == NULL);
+
+	if (npages < 0)
+		return -EINVAL;
+
+	if (npages == 0)
+		return 0;
+
+	ldu->ldu_memhog_root_page = alloc_page(flags);
+	if (ldu->ldu_memhog_root_page == NULL)
+		return -ENOMEM;
+	ldu->ldu_memhog_pages++;
+
+	dir = (struct page **)page_address(ldu->ldu_memhog_root_page);
+	memset(dir, 0, PAGE_CACHE_SIZE);
+
+	/* one level-1 page per root slot */
+	for (i = 0; ldu->ldu_memhog_pages < npages && i < slots; i++) {
+		if (cfs_signal_pending())
+			return -EINTR;
+
+		dir[i] = alloc_page(flags);
+		if (dir[i] == NULL)
+			return -ENOMEM;
+		ldu->ldu_memhog_pages++;
+
+		leaves = (struct page **)page_address(dir[i]);
+		memset(leaves, 0, PAGE_CACHE_SIZE);
+
+		/* fill this level-1 page with level-2 pages */
+		for (j = 0; ldu->ldu_memhog_pages < npages && j < slots; j++) {
+			if (cfs_signal_pending())
+				return -EINTR;
+
+			leaves[j] = alloc_page(flags);
+			if (leaves[j] == NULL)
+				return -ENOMEM;
+			ldu->ldu_memhog_pages++;
+		}
+	}
+
+	return 0;
+}
+
+/* called when opening /dev/device; -ENODEV if the module is going away */
+static int libcfs_psdev_open(unsigned long flags, void *args)
+{
+	struct libcfs_device_userstate *ldu;
+	ENTRY;
+
+	if (!try_module_get(THIS_MODULE))
+		RETURN(-ENODEV);	/* release would module_put() unpaired */
+
+	LIBCFS_ALLOC(ldu, sizeof(*ldu));
+	if (ldu != NULL) {
+		ldu->ldu_memhog_pages = 0;
+		ldu->ldu_memhog_root_page = NULL;
+	}
+	*(struct libcfs_device_userstate **)args = ldu;	/* NULL if alloc failed */
+	RETURN(0);
+}
+
+/* called when closing /dev/device; args is the per-open state, may be NULL */
+static int libcfs_psdev_release(unsigned long flags, void *args)
+{
+	struct libcfs_device_userstate *ldu = args;
+	ENTRY;
+
+	if (ldu != NULL) {
+		/* give back any memhog pages before dropping the state */
+		kportal_memhog_free(ldu);
+		LIBCFS_FREE(ldu, sizeof(*ldu));
+	}
+
+	module_put(THIS_MODULE);
+	RETURN(0);
+}
+
+static struct rw_semaphore ioctl_list_sem;
+static struct list_head ioctl_list;
+
+/* Append hand to ioctl_list; -EBUSY if hand->item is already linked. */
+int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand)
+{
+	int rc = -EBUSY;
+
+	down_write(&ioctl_list_sem);
+	if (list_empty(&hand->item)) {
+		list_add_tail(&hand->item, &ioctl_list);
+		rc = 0;
+	}
+	up_write(&ioctl_list_sem);
+	return rc;
+}
+EXPORT_SYMBOL(libcfs_register_ioctl);
+
+/* Unlink hand from its list; -ENOENT if hand->item is not linked. */
+int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand)
+{
+	int rc = -ENOENT;
+
+	down_write(&ioctl_list_sem);
+	if (!list_empty(&hand->item)) {
+		list_del_init(&hand->item);
+		rc = 0;
+	}
+	up_write(&ioctl_list_sem);
+	return rc;
+}
+EXPORT_SYMBOL(libcfs_deregister_ioctl);
+
+/* Run ioctl cmd against the request in data; commands not built in here are
+ * offered to each handler on ioctl_list until one claims it (!= -EINVAL). */
+static int libcfs_ioctl_int(struct cfs_psdev_file *pfile,unsigned long cmd,
+			    void *arg, struct libcfs_ioctl_data *data)
+{
+	int err = -EINVAL;
+	ENTRY;
+
+	switch (cmd) {
+	case IOC_LIBCFS_CLEAR_DEBUG:
+		libcfs_debug_clear_buffer();
+		RETURN(0);
+	/* case IOC_LIBCFS_PANIC: handled in arch/cfs_module.c */
+	case IOC_LIBCFS_MARK_DEBUG:
+		/* a zero length would put the terminator check out of bounds */
+		if (data->ioc_inlbuf1 == NULL || data->ioc_inllen1 == 0 ||
+		    data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0')
+			RETURN(-EINVAL);
+		libcfs_debug_mark_buffer(data->ioc_inlbuf1);
+		RETURN(0);
+#if LWT_SUPPORT
+	case IOC_LIBCFS_LWT_CONTROL:
+		err = lwt_control ((data->ioc_flags & 1) != 0,
+				   (data->ioc_flags & 2) != 0);
+		break;
+
+	case IOC_LIBCFS_LWT_SNAPSHOT: {
+		cfs_cycles_t   now;
+		int	    ncpu;
+		int	    total_size;
+
+		err = lwt_snapshot (&now, &ncpu, &total_size,
+				    data->ioc_pbuf1, data->ioc_plen1);
+		data->ioc_u64[0] = now;
+		data->ioc_u32[0] = ncpu;
+		data->ioc_u32[1] = total_size;
+
+		/* Hedge against broken user/kernel typedefs (e.g. cycles_t) */
+		data->ioc_u32[2] = sizeof(lwt_event_t);
+		data->ioc_u32[3] = offsetof(lwt_event_t, lwte_where);
+
+		if (err == 0 &&
+		    libcfs_ioctl_popdata(arg, data, sizeof (*data)))
+			err = -EFAULT;
+		break;
+	}
+
+	case IOC_LIBCFS_LWT_LOOKUP_STRING:
+		err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1,
+					 data->ioc_pbuf2, data->ioc_plen2);
+		if (err == 0 &&
+		    libcfs_ioctl_popdata(arg, data, sizeof (*data)))
+			err = -EFAULT;
+		break;
+#endif
+	case IOC_LIBCFS_MEMHOG:
+		if (pfile->private_data == NULL) {
+			err = -EINVAL;
+		} else {
+			kportal_memhog_free(pfile->private_data);
+			/* XXX The ioc_flags is not GFP flags now, need to be fixed */
+			err = kportal_memhog_alloc(pfile->private_data,
+						   data->ioc_count,
+						   data->ioc_flags);
+			if (err != 0)
+				kportal_memhog_free(pfile->private_data);
+		}
+		break;
+
+	case IOC_LIBCFS_PING_TEST: {
+		extern void (kping_client)(struct libcfs_ioctl_data *);
+		void (*ping)(struct libcfs_ioctl_data *);
+
+		CDEBUG(D_IOCTL, "doing %d pings to nid %s (%s)\n",
+		       data->ioc_count, libcfs_nid2str(data->ioc_nid),
+		       libcfs_nid2str(data->ioc_nid));
+		ping = symbol_get(kping_client);
+		if (!ping)
+			CERROR("symbol_get failed\n");
+		else {
+			ping(data);
+			symbol_put(kping_client);
+		}
+		RETURN(0);
+	}
+
+	default: {
+		struct libcfs_ioctl_handler *hand;
+		err = -EINVAL;
+		down_read(&ioctl_list_sem);
+		list_for_each_entry(hand, &ioctl_list, item) {
+			err = hand->handle_ioctl(cmd, data);
+			if (err != -EINVAL) {
+				if (err == 0)
+					err = libcfs_ioctl_popdata(arg,
+							data, sizeof (*data));
+				break;
+			}
+		}
+		up_read(&ioctl_list_sem);
+		break;
+	}
+	}
+
+	RETURN(err);
+}
+
+/* Stage the request in a 1024-byte buffer, run it, then free the buffer. */
+static int libcfs_ioctl(struct cfs_psdev_file *pfile, unsigned long cmd, void *arg)
+{
+	char    *buf;
+	int err = 0;
+	ENTRY;
+
+	LIBCFS_ALLOC_GFP(buf, 1024, GFP_IOFS);
+	if (buf == NULL)
+		RETURN(-ENOMEM);
+
+	/* 'cmd' and permissions get checked in our arch-specific caller */
+	if (libcfs_ioctl_getdata(buf, buf + 800, (void *)arg)) {
+		CERROR("PORTALS ioctl: data error\n");
+		GOTO(out, err = -EINVAL);
+	}
+
+	/* the request structure is read from the start of buf */
+	err = libcfs_ioctl_int(pfile, cmd, arg,
+			       (struct libcfs_ioctl_data *)buf);
+out:
+	LIBCFS_FREE(buf, 1024);
+	RETURN(err);
+}
+
+
+struct cfs_psdev_ops libcfs_psdev_ops = {	/* positional initializer */
+	libcfs_psdev_open,
+	libcfs_psdev_release,
+	NULL,	/* NOTE(review): the meaning of these two NULL slots comes from */
+	NULL,	/* struct cfs_psdev_ops, which is not visible here - confirm */
+	libcfs_ioctl
+};
+
+extern int insert_proc(void);
+extern void remove_proc(void);
+MODULE_AUTHOR("Peter J. Braam <braam@clusterfs.com>");
+MODULE_DESCRIPTION("Portals v3.1");
+MODULE_LICENSE("GPL");
+
+extern psdev_t libcfs_dev;
+extern struct rw_semaphore cfs_tracefile_sem;
+extern struct mutex cfs_trace_thread_mutex;
+extern struct cfs_wi_sched *cfs_sched_rehash;
+
+extern void libcfs_init_nidstrings(void);
+extern int libcfs_arch_init(void);
+extern void libcfs_arch_cleanup(void);
+
+/* Module init: set libcfs up step by step, unwinding in reverse on failure. */
+static int init_libcfs_module(void)
+{
+	int rc;
+
+	libcfs_arch_init();
+	libcfs_init_nidstrings();
+	init_rwsem(&cfs_tracefile_sem);
+	mutex_init(&cfs_trace_thread_mutex);
+	init_rwsem(&ioctl_list_sem);
+	INIT_LIST_HEAD(&ioctl_list);
+	init_waitqueue_head(&cfs_race_waitq);
+	rc = libcfs_debug_init(5 * 1024 * 1024);
+	if (rc < 0) {
+		printk(KERN_ERR "LustreError: libcfs_debug_init: %d\n", rc);
+		return (rc);
+	}
+
+	rc = cfs_cpu_init();
+	if (rc != 0)
+		goto cleanup_debug;
+
+#if LWT_SUPPORT
+	rc = lwt_init();
+	if (rc != 0) {
+		CERROR("lwt_init: error %d\n", rc);
+		goto cleanup_cpu;	/* cfs_cpu_init() must still be undone */
+	}
+#endif
+	rc = misc_register(&libcfs_dev);
+	if (rc) {
+		CERROR("misc_register: error %d\n", rc);
+		goto cleanup_lwt;
+	}
+
+	rc = cfs_wi_startup();
+	if (rc) {
+		CERROR("initialize workitem: error %d\n", rc);
+		goto cleanup_deregister;
+	}
+
+	/* max to 4 threads, should be enough for rehash */
+	rc = min(cfs_cpt_weight(cfs_cpt_table, CFS_CPT_ANY), 4);
+	rc = cfs_wi_sched_create("cfs_rh", cfs_cpt_table, CFS_CPT_ANY,
+				 rc, &cfs_sched_rehash);
+	if (rc != 0) {
+		CERROR("Startup workitem scheduler: error: %d\n", rc);
+		goto cleanup_wi;	/* cfs_wi_startup() already succeeded */
+	}
+
+	rc = cfs_crypto_register();
+	if (rc) {
+		CERROR("cfs_crypto_regster: error %d\n", rc);
+		goto cleanup_wi;
+	}
+
+	rc = insert_proc();
+	if (rc) {
+		CERROR("insert_proc: error %d\n", rc);
+		goto cleanup_crypto;
+	}
+	CDEBUG (D_OTHER, "portals setup OK\n");
+	return 0;
+ cleanup_crypto:
+	cfs_crypto_unregister();
+ cleanup_wi:
+	cfs_wi_shutdown();
+ cleanup_deregister:
+	misc_deregister(&libcfs_dev);
+ cleanup_lwt:
+#if LWT_SUPPORT
+	lwt_fini();
+ cleanup_cpu:
+#endif
+	cfs_cpu_fini();
+ cleanup_debug:
+	libcfs_debug_cleanup();
+	return rc;
+}
+
+static void exit_libcfs_module(void)	/* exit routine given to cfs_module() */
+{
+	int rc;
+
+	remove_proc();
+
+	CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n",
+	       atomic_read(&libcfs_kmemory));
+
+	if (cfs_sched_rehash != NULL) {	/* created by init_libcfs_module() */
+		cfs_wi_sched_destroy(cfs_sched_rehash);
+		cfs_sched_rehash = NULL;
+	}
+
+	cfs_crypto_unregister();
+	cfs_wi_shutdown();
+
+	rc = misc_deregister(&libcfs_dev);
+	if (rc)
+		CERROR("misc_deregister error %d\n", rc);
+
+#if LWT_SUPPORT
+	lwt_fini();
+#endif
+	cfs_cpu_fini();
+
+	if (atomic_read(&libcfs_kmemory) != 0)	/* non-zero is reported as a leak */
+		CERROR("Portals memory leaked: %d bytes\n",
+		       atomic_read(&libcfs_kmemory));
+
+	rc = libcfs_debug_cleanup();	/* failure is only logged */
+	if (rc)
+		printk(KERN_ERR "LustreError: libcfs_debug_cleanup: %d\n",
+		       rc);
+
+	fini_rwsem(&ioctl_list_sem);
+	fini_rwsem(&cfs_tracefile_sem);
+
+	libcfs_arch_cleanup();
+}
+
+cfs_module(libcfs, "1.0.0", init_libcfs_module, exit_libcfs_module);
diff --git a/drivers/staging/lustre/lustre/libcfs/nidstrings.c b/drivers/staging/lustre/lustre/libcfs/nidstrings.c
new file mode 100644
index 000000000000..9a2d70ce2421
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/nidstrings.c
@@ -0,0 +1,867 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/nidstrings.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+
+/* CAVEAT VENDITOR! Keep the canonical string representation of nets/nids
+ * consistent in all conversion functions.  Some code fragments are copied
+ * around for the sake of clarity...
+ */
+
+/* CAVEAT EMPTOR! Racey temporary buffer allocation!
+ * Choose the number of nidstrings to support the MAXIMUM expected number of
+ * concurrent users.  If there are more, the returned string will be volatile.
+ * NB this number must allow for a process to be descheduled for a timeslice
+ * between getting its string and using it.
+ */
+
+static char      libcfs_nidstrings[LNET_NIDSTR_COUNT][LNET_NIDSTR_SIZE];
+static int       libcfs_nidstring_idx = 0;
+
+static spinlock_t libcfs_nidstring_lock;
+
+void libcfs_init_nidstrings (void)	/* sets up the lock NIDSTR_LOCK() takes */
+{
+	spin_lock_init(&libcfs_nidstring_lock);
+}
+
+# define NIDSTR_LOCK(f)   spin_lock_irqsave(&libcfs_nidstring_lock, f)
+# define NIDSTR_UNLOCK(f) spin_unlock_irqrestore(&libcfs_nidstring_lock, f)
+
+/* Hand out the next slot of libcfs_nidstrings[], wrapping after the last. */
+static char *
+libcfs_next_nidstring (void)
+{
+	const int      nslots = sizeof(libcfs_nidstrings) /
+				sizeof(libcfs_nidstrings[0]);
+	unsigned long  flags;
+	char	  *slot;
+
+	NIDSTR_LOCK(flags);
+	slot = libcfs_nidstrings[libcfs_nidstring_idx];
+	libcfs_nidstring_idx = (libcfs_nidstring_idx + 1) % nslots;
+	NIDSTR_UNLOCK(flags);
+
+	return slot;
+}
+
+static int  libcfs_lo_str2addr(const char *str, int nob, __u32 *addr);
+static void libcfs_ip_addr2str(__u32 addr, char *str);
+static int  libcfs_ip_str2addr(const char *str, int nob, __u32 *addr);
+static void libcfs_decnum_addr2str(__u32 addr, char *str);
+static void libcfs_hexnum_addr2str(__u32 addr, char *str);
+static int  libcfs_num_str2addr(const char *str, int nob, __u32 *addr);
+static int  libcfs_num_parse(char *str, int len, struct list_head *list);
+static int  libcfs_num_match(__u32 addr, struct list_head *list);
+
+struct netstrfns {
+	int	  nf_type;	/* LND type, e.g. SOCKLND; -1 in the placeholder */
+	char	*nf_name;	/* network name, e.g. "tcp" */
+	char	*nf_modname;	/* module name, e.g. "ksocklnd" */
+	void       (*nf_addr2str)(__u32 addr, char *str);	/* addr -> text */
+	int	(*nf_str2addr)(const char *str, int nob, __u32 *addr);	/* text -> addr */
+	int	(*nf_parse_addrlist)(char *str, int len,
+					struct list_head *list);	/* range -> list */
+	int	(*nf_match_addr)(__u32 addr, struct list_head *list);	/* addr in list? */
+};
+
+static struct netstrfns  libcfs_netstrfns[] = {	/* one entry per known LND */
+	{ .nf_type           = LOLND,
+	  .nf_name           = "lo",
+	  .nf_modname        = "klolnd",
+	  .nf_addr2str       = libcfs_decnum_addr2str,
+	  .nf_str2addr       = libcfs_lo_str2addr,
+	  .nf_parse_addrlist = libcfs_num_parse,
+	  .nf_match_addr     = libcfs_num_match },
+	{ .nf_type           = SOCKLND,
+	  .nf_name           = "tcp",
+	  .nf_modname        = "ksocklnd",
+	  .nf_addr2str       = libcfs_ip_addr2str,
+	  .nf_str2addr       = libcfs_ip_str2addr,
+	  .nf_parse_addrlist = cfs_ip_addr_parse,
+	  .nf_match_addr     = cfs_ip_addr_match },
+	{ .nf_type           = O2IBLND,
+	  .nf_name           = "o2ib",
+	  .nf_modname        = "ko2iblnd",
+	  .nf_addr2str       = libcfs_ip_addr2str,
+	  .nf_str2addr       = libcfs_ip_str2addr,
+	  .nf_parse_addrlist = cfs_ip_addr_parse,
+	  .nf_match_addr     = cfs_ip_addr_match },
+	{ .nf_type           = CIBLND,
+	  .nf_name           = "cib",
+	  .nf_modname        = "kciblnd",
+	  .nf_addr2str       = libcfs_ip_addr2str,
+	  .nf_str2addr       = libcfs_ip_str2addr,
+	  .nf_parse_addrlist = cfs_ip_addr_parse,
+	  .nf_match_addr     = cfs_ip_addr_match },
+	{ .nf_type           = OPENIBLND,
+	  .nf_name           = "openib",
+	  .nf_modname        = "kopeniblnd",
+	  .nf_addr2str       = libcfs_ip_addr2str,
+	  .nf_str2addr       = libcfs_ip_str2addr,
+	  .nf_parse_addrlist = cfs_ip_addr_parse,
+	  .nf_match_addr     = cfs_ip_addr_match },
+	{ .nf_type           = IIBLND,
+	  .nf_name           = "iib",
+	  .nf_modname        = "kiiblnd",
+	  .nf_addr2str       = libcfs_ip_addr2str,
+	  .nf_str2addr       = libcfs_ip_str2addr,
+	  .nf_parse_addrlist = cfs_ip_addr_parse,
+	  .nf_match_addr     = cfs_ip_addr_match },
+	{ .nf_type           = VIBLND,
+	  .nf_name           = "vib",
+	  .nf_modname        = "kviblnd",
+	  .nf_addr2str       = libcfs_ip_addr2str,
+	  .nf_str2addr       = libcfs_ip_str2addr,
+	  .nf_parse_addrlist = cfs_ip_addr_parse,
+	  .nf_match_addr     = cfs_ip_addr_match },
+	{ .nf_type           = RALND,
+	  .nf_name           = "ra",
+	  .nf_modname        = "kralnd",
+	  .nf_addr2str       = libcfs_ip_addr2str,
+	  .nf_str2addr       = libcfs_ip_str2addr,
+	  .nf_parse_addrlist = cfs_ip_addr_parse,
+	  .nf_match_addr     = cfs_ip_addr_match },
+	{ .nf_type           = QSWLND,
+	  .nf_name           = "elan",
+	  .nf_modname        = "kqswlnd",
+	  .nf_addr2str       = libcfs_decnum_addr2str,
+	  .nf_str2addr       = libcfs_num_str2addr,
+	  .nf_parse_addrlist = libcfs_num_parse,
+	  .nf_match_addr     = libcfs_num_match },
+	{ .nf_type           = GMLND,
+	  .nf_name           = "gm",
+	  .nf_modname        = "kgmlnd",
+	  .nf_addr2str       = libcfs_hexnum_addr2str,
+	  .nf_str2addr       = libcfs_num_str2addr,
+	  .nf_parse_addrlist = libcfs_num_parse,
+	  .nf_match_addr     = libcfs_num_match },
+	{ .nf_type           = MXLND,
+	  .nf_name           = "mx",
+	  .nf_modname        = "kmxlnd",
+	  .nf_addr2str       = libcfs_ip_addr2str,
+	  .nf_str2addr       = libcfs_ip_str2addr,
+	  .nf_parse_addrlist = cfs_ip_addr_parse,
+	  .nf_match_addr     = cfs_ip_addr_match },
+	{ .nf_type           = PTLLND,
+	  .nf_name           = "ptl",
+	  .nf_modname        = "kptllnd",
+	  .nf_addr2str       = libcfs_decnum_addr2str,
+	  .nf_str2addr       = libcfs_num_str2addr,
+	  .nf_parse_addrlist = libcfs_num_parse,
+	  .nf_match_addr     = libcfs_num_match },
+	{ .nf_type           = GNILND,
+	  .nf_name           = "gni",
+	  .nf_modname        = "kgnilnd",
+	  .nf_addr2str       = libcfs_decnum_addr2str,
+	  .nf_str2addr       = libcfs_num_str2addr,
+	  .nf_parse_addrlist = libcfs_num_parse,
+	  .nf_match_addr     = libcfs_num_match },
+	/* placeholder for net0 alias.  It MUST BE THE LAST ENTRY */
+	{ .nf_type           = -1 },
+};
+
+const int libcfs_nnetstrfns = sizeof(libcfs_netstrfns)/sizeof(libcfs_netstrfns[0]);
+
+int
+libcfs_lo_str2addr(const char *str, int nob, __u32 *addr)
+{
+	*addr = 0;	/* str and nob are ignored: the address is always 0 */
+	return 1;	/* always reports success */
+}
+
+/* Format addr as dotted-quad "a.b.c.d", most significant byte first. */
+void
+libcfs_ip_addr2str(__u32 addr, char *str)
+{
+	/* no name lookup: the address is always printed numerically */
+	snprintf(str, LNET_NIDSTR_SIZE, "%u.%u.%u.%u",
+		 (addr >> 24) & 0xff, (addr >> 16) & 0xff,
+		 (addr >> 8) & 0xff, addr & 0xff);
+}
+
+/* CAVEAT EMPTOR XscanfX
+ * I use "%n" at the end of a sscanf format to detect trailing junk.  However
+ * sscanf may return immediately if it sees the terminating '0' in a string, so
+ * I initialise the %n variable to the expected length.  If sscanf sets it;
+ * fine, if it doesn't, then the scan ended at the end of the string, which is
+ * fine too :) */
+
+/* Parse the dotted-quad in the first nob bytes of str into *addr.
+ * Returns 1 on success, 0 on trailing junk or a component above 255. */
+int
+libcfs_ip_str2addr(const char *str, int nob, __u32 *addr)
+{
+	/* unsigned: matches "%u" and keeps a<<24 free of signed overflow */
+	unsigned int   a, b, c, d;
+	int   n = nob;			  /* XscanfX */
+
+	/* numeric IP? */
+	if (sscanf(str, "%u.%u.%u.%u%n", &a, &b, &c, &d, &n) >= 4 &&
+	    n == nob &&
+	    (a & ~0xff) == 0 && (b & ~0xff) == 0 &&
+	    (c & ~0xff) == 0 && (d & ~0xff) == 0) {
+		*addr = ((a<<24)|(b<<16)|(c<<8)|d);
+		return 1;
+	}
+
+	return 0;
+}
+
+void
+libcfs_decnum_addr2str(__u32 addr, char *str)
+{
+	snprintf(str, LNET_NIDSTR_SIZE, "%u", addr);	/* unsigned decimal */
+}
+
+void
+libcfs_hexnum_addr2str(__u32 addr, char *str)
+{
+	snprintf(str, LNET_NIDSTR_SIZE, "0x%x", addr);	/* "0x" + lowercase hex */
+}
+
+int
+libcfs_num_str2addr(const char *str, int nob, __u32 *addr)
+{
+	int     n;	/* characters consumed, see the XscanfX caveat above */
+
+	n = nob;	/* preset before each try: sscanf may not store %n */
+	if (sscanf(str, "0x%x%n", addr, &n) >= 1 && n == nob)	/* "0x" hex */
+		return 1;
+
+	n = nob;
+	if (sscanf(str, "0X%x%n", addr, &n) >= 1 && n == nob)	/* "0X" hex */
+		return 1;
+
+	n = nob;
+	if (sscanf(str, "%u%n", addr, &n) >= 1 && n == nob)	/* decimal */
+		return 1;
+
+	return 0;	/* no format consumed exactly nob characters */
+}
+
+/* Find the table entry whose nf_type is lnd; NULL if lnd < 0 or unknown. */
+struct netstrfns *
+libcfs_lnd2netstrfns(int lnd)
+{
+	int    idx;
+
+	/* a negative lnd never matches, so the -1 placeholder is never returned */
+	for (idx = 0; lnd >= 0 && idx < libcfs_nnetstrfns; idx++)
+		if (libcfs_netstrfns[idx].nf_type == lnd)
+			return &libcfs_netstrfns[idx];
+	return NULL;
+}
+
+/* First table entry whose name is a prefix of name, e.g. "tcp" for "tcp2". */
+struct netstrfns *
+libcfs_namenum2netstrfns(const char *name)
+{
+	struct netstrfns *nf = libcfs_netstrfns;
+	int	       left;
+
+	for (left = libcfs_nnetstrfns; left > 0; left--, nf++)
+		if (nf->nf_type >= 0 &&
+		    strncmp(name, nf->nf_name, strlen(nf->nf_name)) == 0)
+			return nf;
+
+	return NULL;
+}
+
+/* Table entry whose name equals name exactly; NULL if there is none. */
+struct netstrfns *
+libcfs_name2netstrfns(const char *name)
+{
+	struct netstrfns *nf = libcfs_netstrfns;
+	struct netstrfns *end = nf + libcfs_nnetstrfns;
+
+	for (; nf != end; nf++)
+		if (nf->nf_type >= 0 && strcmp(nf->nf_name, name) == 0)
+			return nf;
+	return NULL;
+}
+
+int
+libcfs_isknown_lnd(int type)
+{
+	return libcfs_lnd2netstrfns(type) != NULL;	/* 1 if type has a table entry */
+}
+
+/* Module name of the table entry for lnd, or NULL when lnd has no entry. */
+char *
+libcfs_lnd2modname(int lnd)
+{
+	struct netstrfns *nf = libcfs_lnd2netstrfns(lnd);
+	return nf != NULL ? nf->nf_modname : NULL;
+}
+
+/* Name for lnd; an unknown type is rendered as "?<lnd>?" in a shared slot. */
+char *
+libcfs_lnd2str(int lnd)
+{
+	struct netstrfns *nf = libcfs_lnd2netstrfns(lnd);
+
+	if (nf == NULL) {
+		char *str = libcfs_next_nidstring();
+		snprintf(str, LNET_NIDSTR_SIZE, "?%u?", lnd);
+		return str;
+	}
+	return nf->nf_name;
+}
+
+/*
+ * LND type whose table name is exactly str, or -1 when there is none.
+ */
+int
+libcfs_str2lnd(const char *str)
+{
+	struct netstrfns *nf = libcfs_name2netstrfns(str);
+
+	return nf != NULL ? nf->nf_type : -1;
+}
+
+/* Render net as "<name>", "<name><num>", or "<lnd:num>" for an unknown LND. */
+char *
+libcfs_net2str(__u32 net)
+{
+	int	       lnd = LNET_NETTYP(net);
+	int	       num = LNET_NETNUM(net);
+	struct netstrfns *nf  = libcfs_lnd2netstrfns(lnd);
+	char	     *buf = libcfs_next_nidstring();
+
+	if (nf != NULL && num != 0)
+		snprintf(buf, LNET_NIDSTR_SIZE, "%s%u", nf->nf_name, num);
+	else if (nf != NULL)
+		snprintf(buf, LNET_NIDSTR_SIZE, "%s", nf->nf_name);
+	else
+		snprintf(buf, LNET_NIDSTR_SIZE, "<%u:%u>", lnd, num);
+	return buf;
+}
+
+/* Render nid as "<addr>@<net>" in a shared slot.  LNET_NID_ANY gives the
+ * literal "<?>"; an unknown LND type gives "<hexaddr>@<lnd:netnum>". */
+char *
+libcfs_nid2str(lnet_nid_t nid)
+{
+	__u32	     addr = LNET_NIDADDR(nid);
+	__u32	     net = LNET_NIDNET(nid);
+	int	       lnd = LNET_NETTYP(net);
+	int	       nnum = LNET_NETNUM(net);
+	struct netstrfns *nf;
+	char	     *str;
+	int	       n;
+
+	if (nid == LNET_NID_ANY)
+		return "<?>";
+
+	nf = libcfs_lnd2netstrfns(lnd);
+	str = libcfs_next_nidstring();
+	if (nf == NULL) {
+		snprintf(str, LNET_NIDSTR_SIZE, "%x@<%u:%u>", addr, lnd, nnum);
+		return str;
+	}
+	/* address first, then "@<name>[<netnum>]" appended after it */
+	nf->nf_addr2str(addr, str);
+	n = strlen(str);
+	if (nnum != 0)
+		snprintf(str + n, LNET_NIDSTR_SIZE - n, "@%s%u",
+			 nf->nf_name, nnum);
+	else
+		snprintf(str + n, LNET_NIDSTR_SIZE - n, "@%s", nf->nf_name);
+	return str;
+}
+
+/*
+ * Parse "<netname>[<number>]" into *net and return the matching table entry.
+ *
+ * The name is matched as a prefix of str; a missing number means net 0.
+ * Returns NULL, leaving *net untouched, for an unknown name, for trailing
+ * junk after the number, or for any suffix on the "lo" network.
+ */
+static struct netstrfns *
+libcfs_str2net_internal(const char *str, __u32 *net)
+{
+	struct netstrfns *nf = libcfs_namenum2netstrfns(str);
+	const char       *digits;
+	int	       netnum = 0;
+	int	       len;
+
+	if (nf == NULL)
+		return NULL;
+
+	/* whatever follows the name has to be the net number */
+	digits = str + strlen(nf->nf_name);
+	if (*digits != '\0') {
+		if (nf->nf_type == LOLND) /* net number not allowed */
+			return NULL;
+
+		/* XscanfX: preset len in case sscanf stops before
+		 * storing %n */
+		len = strlen(digits);
+		if (sscanf(digits, "%u%n", &netnum, &len) < 1 ||
+		    len != (int)strlen(digits))
+			return NULL;
+	}
+
+	/* only a fully parsed string reaches this point */
+	*net = LNET_MKNET(nf->nf_type, netnum);
+	return nf;
+}
+
+/* Net for str, or the net part of LNET_NID_ANY when str does not parse. */
+__u32
+libcfs_str2net(const char *str)
+{
+	__u32  net;
+
+	if (libcfs_str2net_internal(str, &net) == NULL)
+		net = LNET_NIDNET(LNET_NID_ANY);
+	return net;
+}
+
+/* Parse "<addr>[@<net>]" into a NID (the net is SOCKLND 0 without "@");
+ * returns LNET_NID_ANY when either part fails to parse. */
+lnet_nid_t
+libcfs_str2nid(const char *str)
+{
+	const char       *sep = strchr(str, '@');
+	struct netstrfns *nf;
+	__u32	     net, addr;
+
+	if (sep == NULL) {
+		sep = str + strlen(str);
+		net = LNET_MKNET(SOCKLND, 0);
+		nf = libcfs_lnd2netstrfns(SOCKLND);
+		LASSERT (nf != NULL);
+	} else {
+		nf = libcfs_str2net_internal(sep + 1, &net);
+		if (nf == NULL)
+			return LNET_NID_ANY;
+	}
+
+	if (nf->nf_str2addr(str, (int)(sep - str), &addr))
+		return LNET_MKNID(net, addr);
+	return LNET_NID_ANY;
+}
+
+/* Render id as "[U]<pid>-<nid>", or "LNET_PID_ANY-<nid>" for LNET_PID_ANY. */
+char *
+libcfs_id2str(lnet_process_id_t id)
+{
+	char *str = libcfs_next_nidstring();
+
+	if (id.pid != LNET_PID_ANY)
+		snprintf(str, LNET_NIDSTR_SIZE, "%s%u-%s",
+			 ((id.pid & LNET_PID_USERFLAG) != 0) ? "U" : "",
+			 (id.pid & ~LNET_PID_USERFLAG), libcfs_nid2str(id.nid));
+	else
+		snprintf(str, LNET_NIDSTR_SIZE,
+			 "LNET_PID_ANY-%s", libcfs_nid2str(id.nid));
+
+	return str;
+}
+
+/*
+ * Like libcfs_str2nid(), but "*" is accepted and stored as LNET_NID_ANY.
+ * Returns 1 if str was "*" or parsed to a NID, 0 otherwise. */
+int
+libcfs_str2anynid(lnet_nid_t *nidp, const char *str)
+{
+	int wildcard = strcmp(str, "*") == 0;
+
+	*nidp = wildcard ? LNET_NID_ANY : libcfs_str2nid(str);
+	return wildcard || *nidp != LNET_NID_ANY;
+}
+
+/**
+ * Nid range list syntax.
+ * \verbatim
+ *
+ * <nidlist>	 :== <nidrange> [ ' ' <nidrange> ]
+ * <nidrange>	:== <addrrange> '@' <net>
+ * <addrrange>       :== '*' |
+ *		       <ipaddr_range> |
+ *			 <cfs_expr_list>
+ * <ipaddr_range>    :== <cfs_expr_list>.<cfs_expr_list>.<cfs_expr_list>.
+ *			 <cfs_expr_list>
+ * <cfs_expr_list>   :== <number> |
+ *		       <expr_list>
+ * <expr_list>       :== '[' <range_expr> [ ',' <range_expr>] ']'
+ * <range_expr>      :== <number> |
+ *		       <number> '-' <number> |
+ *		       <number> '-' <number> '/' <number>
+ * <net>	     :== <netname> | <netname><number>
+ * <netname>	 :== "lo" | "tcp" | "o2ib" | "cib" | "openib" | "iib" |
+ *		       "vib" | "ra" | "elan" | "gm" | "mx" | "ptl" | "gni"
+ * \endverbatim
+ */
+
+/**
+ * Structure to represent \<nidrange\> token of the syntax.
+ *
+ * One of these is created for each distinct \<net\> parsed.
+ */
+struct nidrange {
+	/**
+	 * Link to the list of these structures, which is built while a
+	 * nid range list is parsed.
+	 */
+	struct list_head nr_link;
+	/**
+	 * List head for addrrange::ar_link.
+	 */
+	struct list_head nr_addrranges;
+	/**
+	 * Flag indicating that *@<net> is found.
+	 */
+	int nr_all;
+	/**
+	 * Pointer to corresponding element of libcfs_netstrfns.
+	 */
+	struct netstrfns *nr_netstrfns;
+	/**
+	 * Number of network. E.g. 5 if \<net\> is "elan5".
+	 */
+	int nr_netnum;
+};
+
+/**
+ * Structure to represent \<addrrange\> token of the syntax.
+ */
+struct addrrange {
+	/**
+	 * Link to nidrange::nr_addrranges.
+	 */
+	struct list_head ar_link;
+	/**
+	 * List head for cfs_expr_list::el_link.
+	 */
+	struct list_head ar_numaddr_ranges;
+};
+
+/**
+ * Nf_parse_addrlist method for networks using numeric addresses.
+ *
+ * Examples of such networks are gm and elan.
+ *
+ * \retval 0 if \a str parsed to numeric address
+ * \retval errno otherwise
+ */
+static int
+libcfs_num_parse(char *str, int len, struct list_head *list)
+{
+	struct cfs_expr_list *el;
+	int	rc = cfs_expr_list_parse(str, len, 0, MAX_NUMERIC_VALUE, &el);
+
+	if (rc != 0)
+		return rc;
+	/* parsed successfully: queue the expression list on the caller's list */
+	list_add_tail(&el->el_link, list);
+	return 0;
+}
+
+/**
+ * Parses \<addrrange\> token on the syntax.
+ *
+ * Allocates struct addrrange and links to \a nidrange via
+ * (nidrange::nr_addrranges)
+ *
+ * \retval 0 if \a src parses to '*' | \<ipaddr_range\> | \<cfs_expr_list\>
+ * \retval non-zero otherwise (-ENOMEM, or the nf_parse_addrlist result)
+ */
+static int
+parse_addrange(const struct cfs_lstr *src, struct nidrange *nidrange)
+{
+	struct addrrange *addrrange;
+
+	if (src->ls_len == 1 && src->ls_str[0] == '*') {
+		nidrange->nr_all = 1;
+		return 0;
+	}
+
+	LIBCFS_ALLOC(addrrange, sizeof(struct addrrange));
+	if (addrrange == NULL)
+		return -ENOMEM;
+	list_add_tail(&addrrange->ar_link, &nidrange->nr_addrranges);
+	INIT_LIST_HEAD(&addrrange->ar_numaddr_ranges);
+
+	/* 0 means success on every path: parse_nidrange() fails on non-zero */
+	return nidrange->nr_netstrfns->nf_parse_addrlist(src->ls_str,
+				src->ls_len, &addrrange->ar_numaddr_ranges);
+}
+
+/**
+ * Finds or creates struct nidrange.
+ *
+ * Checks if \a src is a valid network name, looks for corresponding
+ * nidrange on the list of nidranges (\a nidlist), creates new struct
+ * nidrange if it is not found.
+ *
+ * \retval pointer to struct nidrange matching network specified via \a src
+ * \retval NULL if \a src does not match any network
+ */
+static struct nidrange *
+add_nidrange(const struct cfs_lstr *src,
+	     struct list_head *nidlist)
+{
+	struct netstrfns *nf;
+	struct nidrange *nr;
+	int namelen;
+	int endlen;
+	unsigned netnum = 0;
+
+	/* reject tokens of LNET_NIDSTR_SIZE bytes or more */
+	if (src->ls_len >= LNET_NIDSTR_SIZE)
+		return NULL;
+
+	nf = libcfs_namenum2netstrfns(src->ls_str);
+	if (nf == NULL)
+		return NULL;
+
+	/* A bare network name, e.g. "elan" or "tcp", is net 0.  Otherwise
+	 * the name must be followed by a decimal or hexadecimal number,
+	 * e.g. "elan25" or "tcp23", or the token is refused. */
+	namelen = strlen(nf->nf_name);
+	endlen = src->ls_len - namelen;
+	if (endlen != 0 &&
+	    !cfs_str2num_check(src->ls_str + namelen, endlen, &netnum,
+			       0, MAX_NUMERIC_VALUE))
+		return NULL;
+
+	/* reuse the range already created for this network, if any */
+	list_for_each_entry(nr, nidlist, nr_link) {
+		if (nr->nr_netstrfns == nf &&
+		    nr->nr_netnum == netnum)
+			return nr;
+	}
+
+	/* first time this network is seen: start a new, empty range */
+	LIBCFS_ALLOC(nr, sizeof(struct nidrange));
+	if (nr == NULL)
+		return NULL;
+	list_add_tail(&nr->nr_link, nidlist);
+	INIT_LIST_HEAD(&nr->nr_addrranges);
+	nr->nr_netstrfns = nf;
+	nr->nr_all = 0;
+	nr->nr_netnum = netnum;
+
+	return nr;
+}
+
+/**
+ * Parses \<nidrange\> token of the syntax.
+ *
+ * \retval 1 if \a src parses to \<addrrange\> '@' \<net\>
+ * \retval 0 otherwise
+ */
+static int
+parse_nidrange(struct cfs_lstr *src, struct list_head *nidlist)
+{
+	struct cfs_lstr whole = *src;	/* untouched copy for the warning */
+	struct cfs_lstr addrrange;
+	struct cfs_lstr net;
+	struct nidrange *nr = NULL;
+	int ok;
+
+	/*
+	 * Split at '@': the first token is the address range, the second
+	 * is the network, and src->ls_str must be NULL once the network
+	 * token has been taken.
+	 */
+	ok = cfs_gettok(src, '@', &addrrange) != 0 &&
+	     cfs_gettok(src, '@', &net) != 0 && src->ls_str == NULL;
+	if (ok)
+		nr = add_nidrange(&net, nidlist);
+
+	/* a zero result from parse_addrange() is treated as success */
+	if (nr != NULL && parse_addrange(&addrrange, nr) == 0)
+		return 1;
+
+	CWARN("can't parse nidrange: \"%.*s\"\n", whole.ls_len,
+	      whole.ls_str);
+	return 0;
+}
+
+/**
+ * Frees addrrange structures of \a list.
+ *
+ * For each struct addrrange structure found on \a list it frees
+ * cfs_expr_list list attached to it and frees the addrrange itself.
+ *
+ * \retval none
+ */
+static void
+free_addrranges(struct list_head *list)
+{
+	struct addrrange *ar;
+	struct addrrange *next;
+
+	/* _safe iteration: each entry is unlinked and freed inside the loop */
+	list_for_each_entry_safe(ar, next, list, ar_link) {
+		cfs_expr_list_free_list(&ar->ar_numaddr_ranges);
+		list_del(&ar->ar_link);
+		LIBCFS_FREE(ar, sizeof(struct addrrange));
+	}
+}
+
+/**
+ * Frees nidrange strutures of \a list.
+ *
+ * For each struct nidrange structure found on \a list it frees
+ * addrrange list attached to it and frees the nidrange itself.
+ *
+ * \retval none
+ */
+void
+cfs_free_nidlist(struct list_head *list)
+{
+	struct nidrange *nr;
+	struct nidrange *next;
+
+	/* _safe iteration: each entry is unlinked and freed inside the loop */
+	list_for_each_entry_safe(nr, next, list, nr_link) {
+		free_addrranges(&nr->nr_addrranges);
+		list_del(&nr->nr_link);
+		LIBCFS_FREE(nr, sizeof(struct nidrange));
+	}
+}
+
+/**
+ * Parses nid range list.
+ *
+ * Parses with rigorous syntax and overflow checking \a str into
+ * \<nidrange\> [ ' ' \<nidrange\> ], compiles \a str into set of
+ * structures and links that structure to \a nidlist. The resulting
+ * list can be used to match a NID against the set of NIDs defined by \a
+ * str.
+ * \see cfs_match_nid
+ *
+ * \retval 1 on success
+ * \retval 0 otherwise
+ */
+int
+cfs_parse_nidlist(char *str, int len, struct list_head *nidlist)
+{
+	struct cfs_lstr src;
+	struct cfs_lstr res;
+	ENTRY;
+
+	src.ls_str = str;
+	src.ls_len = len;
+	INIT_LIST_HEAD(nidlist);
+
+	/* take one ' '-separated token at a time until src.ls_str is NULL;
+	 * every token has to parse as a nidrange */
+	while (src.ls_str != NULL) {
+		if (cfs_gettok(&src, ' ', &res) != 0 &&
+		    parse_nidrange(&res, nidlist) != 0)
+			continue;
+
+		/* bad token: drop everything parsed so far */
+		cfs_free_nidlist(nidlist);
+		RETURN(0);
+	}
+
+	RETURN(1);
+}
+
+/*
+ * Nf_match_addr method for networks using numeric addresses
+ *
+ * \retval 1 on match
+ * \retval 0 otherwise
+ */
+static int
+libcfs_num_match(__u32 addr, struct list_head *numaddr)
+{
+	LASSERT(!list_empty(numaddr));
+
+	/* only the first expression list linked on numaddr is consulted */
+	return cfs_expr_list_match(addr,
+				   list_entry(numaddr->next,
+					      struct cfs_expr_list, el_link));
+}
+
+/**
+ * Matches a nid (\a nid) against the compiled list of nidranges (\a nidlist).
+ *
+ * \see cfs_parse_nidlist()
+ *
+ * \retval 1 on match
+ * \retval 0  otherwise
+ */
+int cfs_match_nid(lnet_nid_t nid, struct list_head *nidlist)
+{
+	__u32 net = LNET_NIDNET(nid);
+	struct nidrange *nr;
+	struct addrrange *ar;
+	ENTRY;
+
+	list_for_each_entry(nr, nidlist, nr_link) {
+		if (nr->nr_netstrfns->nf_type != LNET_NETTYP(net) ||
+		    nr->nr_netnum != LNET_NETNUM(net))
+			continue;
+		if (nr->nr_all)
+			RETURN(1);
+		list_for_each_entry(ar, &nr->nr_addrranges, ar_link)
+			if (nr->nr_netstrfns->nf_match_addr(LNET_NIDADDR(nid),
+						       &ar->ar_numaddr_ranges))
+				RETURN(1);
+	}
+	RETURN(0);
+}
+
+
+EXPORT_SYMBOL(libcfs_isknown_lnd);
+EXPORT_SYMBOL(libcfs_lnd2modname);
+EXPORT_SYMBOL(libcfs_lnd2str);
+EXPORT_SYMBOL(libcfs_str2lnd);
+EXPORT_SYMBOL(libcfs_net2str);
+EXPORT_SYMBOL(libcfs_nid2str);
+EXPORT_SYMBOL(libcfs_str2net);
+EXPORT_SYMBOL(libcfs_str2nid);
+EXPORT_SYMBOL(libcfs_id2str);
+EXPORT_SYMBOL(libcfs_str2anynid);
+EXPORT_SYMBOL(cfs_free_nidlist);
+EXPORT_SYMBOL(cfs_parse_nidlist);
+EXPORT_SYMBOL(cfs_match_nid);
diff --git a/drivers/staging/lustre/lustre/libcfs/prng.c b/drivers/staging/lustre/lustre/libcfs/prng.c
new file mode 100644
index 000000000000..69224d84bc4b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/prng.c
@@ -0,0 +1,139 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/prng.c
+ *
+ * concatenation of following two 16-bit multiply with carry generators
+ * x(n)=a*x(n-1)+carry mod 2^16 and y(n)=b*y(n-1)+carry mod 2^16,
+ * number and carry packed within the same 32 bit integer.
+ * algorithm recommended by Marsaglia
+*/
+
+#include <linux/libcfs/libcfs.h>
+
+/*
+From: George Marsaglia <geo@stat.fsu.edu>
+Newsgroups: sci.math
+Subject: Re: A RANDOM NUMBER GENERATOR FOR C
+Date: Tue, 30 Sep 1997 05:29:35 -0700
+
+ * You may replace the two constants 36969 and 18000 by any
+ * pair of distinct constants from this list:
+ * 18000 18030 18273 18513 18879 19074 19098 19164 19215 19584
+ * 19599 19950 20088 20508 20544 20664 20814 20970 21153 21243
+ * 21423 21723 21954 22125 22188 22293 22860 22938 22965 22974
+ * 23109 23124 23163 23208 23508 23520 23553 23658 23865 24114
+ * 24219 24660 24699 24864 24948 25023 25308 25443 26004 26088
+ * 26154 26550 26679 26838 27183 27258 27753 27795 27810 27834
+ * 27960 28320 28380 28689 28710 28794 28854 28959 28980 29013
+ * 29379 29889 30135 30345 30459 30714 30903 30963 31059 31083
+ * (or any other 16-bit constants k for which both k*2^16-1
+ * and k*2^15-1 are prime) */
+
+/* multipliers of the two generators; both values appear in the list above */
+#define RANDOM_CONST_A 18030
+#define RANDOM_CONST_B 29013
+
+/* generator state: stepped by cfs_rand(), replaceable through cfs_srand() */
+static unsigned int seed_x = 521288629;
+static unsigned int seed_y = 362436069;
+
+/**
+ * cfs_rand - step the generator and return the next value
+ *
+ * Both seeds are advanced once; each keeps its 16-bit value in the low half
+ * and the carry in the high half.  The result combines the low half of the
+ * new seed_x (shifted to the top) with the low half of the new seed_y.
+ *
+ * Returns a pseudo-random 32-bit integer
+ */
+unsigned int cfs_rand(void)
+{
+	unsigned int next_x;
+	unsigned int next_y;
+
+	next_x = RANDOM_CONST_A * (seed_x & 0xffff) + (seed_x >> 16);
+	next_y = RANDOM_CONST_B * (seed_y & 0xffff) + (seed_y >> 16);
+
+	seed_x = next_x;
+	seed_y = next_y;
+
+	return (next_x << 16) + (next_y & 0xffff);
+}
+EXPORT_SYMBOL(cfs_rand);
+
+/**
+ * cfs_srand - sets the initial seeds
+ * @seed1 : (seed_x) should have the most entropy in the low bits of the word
+ * @seed2 : (seed_y) should have the most entropy in the high bits of the word
+ *
+ * Replaces the seeds used by cfs_rand().  A parameter equal to 0 leaves the
+ * corresponding seed at its current value.
+ */
+void cfs_srand(unsigned int seed1, unsigned int seed2)
+{
+	if (seed1 != 0)
+		seed_x = seed1;
+
+	if (seed2 != 0)
+		seed_y = seed2;
+}
+EXPORT_SYMBOL(cfs_srand);
+
+/**
+ * cfs_get_random_bytes - generate a bunch of random numbers
+ * @buf : buffer to fill with random numbers
+ * @size: size of passed in buffer, must not be negative
+ *
+ * Fills a buffer with random bytes.  Every int worth of output is the XOR
+ * of get_random_bytes() data and one cfs_rand() value.  The buffer is
+ * handled in three parts: the bytes in front of the first int boundary,
+ * whole aligned ints, and the bytes left over at the end.
+ */
+void cfs_get_random_bytes(void *buf, int size)
+{
+	int *p = buf;
+	int rem, tmp;
+
+	LASSERT(size >= 0);
+
+	/*
+	 * Number of bytes needed to bring buf up to the next int boundary
+	 * (0 if it is already aligned), limited to the buffer size.  Using
+	 * the misalignment offset itself here would leave p misaligned for
+	 * the word stores below.
+	 */
+	rem = min((int)(-(unsigned long)buf & (sizeof(int) - 1)), size);
+	if (rem) {
+		get_random_bytes(&tmp, sizeof(tmp));
+		tmp ^= cfs_rand();
+		memcpy(buf, &tmp, rem);
+		p = buf + rem;
+		size -= rem;
+	}
+
+	/* size is known to be non-negative here, the cast is safe */
+	while ((size_t)size >= sizeof(int)) {
+		get_random_bytes(&tmp, sizeof(tmp));
+		*p = cfs_rand() ^ tmp;
+		size -= sizeof(int);
+		p++;
+	}
+
+	/* fewer than sizeof(int) bytes are left: copy only that many */
+	buf = p;
+	if (size) {
+		get_random_bytes(&tmp, sizeof(tmp));
+		tmp ^= cfs_rand();
+		memcpy(buf, &tmp, size);
+	}
+}
+EXPORT_SYMBOL(cfs_get_random_bytes);
diff --git a/drivers/staging/lustre/lustre/libcfs/tracefile.c b/drivers/staging/lustre/lustre/libcfs/tracefile.c
new file mode 100644
index 000000000000..439e71dfae33
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/tracefile.c
@@ -0,0 +1,1195 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/tracefile.c
+ *
+ * Author: Zach Brown <zab@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+
+#define DEBUG_SUBSYSTEM S_LNET
+#define LUSTRE_TRACEFILE_PRIVATE
+#include "tracefile.h"
+
+#include <linux/libcfs/libcfs.h>
+
+/* XXX move things up to the top, comment */
+/* per type (TCD_MAX_TYPES), a pointer to an array of NR_CPUS trace data */
+union cfs_trace_data_union (*cfs_trace_data[TCD_MAX_TYPES])[NR_CPUS] __cacheline_aligned;
+
+/* path of the file written by tracefiled(); empty when no file is set */
+char cfs_tracefile[TRACEFILE_NAME_SIZE];
+/* file offset at which tracefiled() wraps back to the start of the file */
+long long cfs_tracefile_size = CFS_TRACEFILE_SIZE;
+static struct tracefiled_ctl trace_tctl;
+/* serialises cfs_trace_start_thread() and cfs_trace_stop_thread() */
+struct mutex cfs_trace_thread_mutex;
+/* non-zero between a successful thread start and the following stop */
+static int thread_running = 0;
+
+/* number of trace pages currently allocated by cfs_tage_alloc() */
+atomic_t cfs_tage_allocated = ATOMIC_INIT(0);
+
+static void put_pages_on_tcd_daemon_list(struct page_collection *pc,
+					 struct cfs_trace_cpu_data *tcd);
+
+/* map a list node back to the trace page it is embedded in (as linkage) */
+static inline struct cfs_trace_page *
+cfs_tage_from_list(struct list_head *list)
+{
+	struct cfs_trace_page *tage;
+
+	tage = list_entry(list, struct cfs_trace_page, linkage);
+	return tage;
+}
+
+/*
+ * Allocate a trace page descriptor together with its backing page.
+ *
+ * \a gfp gives the allocation flags; __GFP_NOWARN is always added.
+ *
+ * Returns the new descriptor (accounted in cfs_tage_allocated), or NULL if
+ * an allocation failed or if the caller is outside interrupt context while
+ * memory_pressure_get() is set.
+ */
+static struct cfs_trace_page *cfs_tage_alloc(int gfp)
+{
+	struct cfs_trace_page *tage;
+	struct page *pg;
+
+	/* My caller is trying to free memory */
+	if (!in_interrupt() && memory_pressure_get())
+		return NULL;
+
+	/*
+	 * Allocation failures are reported by the upper layer anyway, so
+	 * keep the allocator from spamming the console.
+	 */
+	gfp |= __GFP_NOWARN;
+
+	pg = alloc_page(gfp);
+	if (pg == NULL)
+		goto fail;
+
+	tage = kmalloc(sizeof(*tage), gfp);
+	if (tage == NULL)
+		goto fail_free_page;
+
+	tage->page = pg;
+	atomic_inc(&cfs_tage_allocated);
+	return tage;
+
+fail_free_page:
+	__free_page(pg);
+fail:
+	return NULL;
+}
+
+/* release a trace page and its descriptor, and drop it from the count */
+static void cfs_tage_free(struct cfs_trace_page *tage)
+{
+	struct page *pg;
+
+	__LASSERT(tage != NULL);
+	__LASSERT(tage->page != NULL);
+
+	pg = tage->page;
+	__free_page(pg);
+	kfree(tage);
+	atomic_dec(&cfs_tage_allocated);
+}
+
+/* unlink \a tage from the list it is on and append it to \a queue */
+static void cfs_tage_to_tail(struct cfs_trace_page *tage,
+			     struct list_head *queue)
+{
+	struct list_head *node;
+
+	__LASSERT(tage != NULL);
+	__LASSERT(queue != NULL);
+
+	node = &tage->linkage;
+	list_move_tail(node, queue);
+}
+
+/*
+ * Allocate trace pages with \a gfp and append them to \a stock until the
+ * number of new pages plus tcd->tcd_cur_stock_pages reaches
+ * TCD_STOCK_PAGES, or an allocation fails.  tcd_cur_stock_pages itself is
+ * not modified here.
+ *
+ * Returns the number of pages added to \a stock.
+ */
+int cfs_trace_refill_stock(struct cfs_trace_cpu_data *tcd, int gfp,
+			   struct list_head *stock)
+{
+	struct cfs_trace_page *tage;
+	int added = 0;
+
+	/*
+	 * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+	 * from here: this will lead to infinite recursion.
+	 */
+
+	while (added + tcd->tcd_cur_stock_pages < TCD_STOCK_PAGES) {
+		tage = cfs_tage_alloc(gfp);
+		if (tage == NULL)
+			break;
+
+		list_add_tail(&tage->linkage, stock);
+		++added;
+	}
+
+	return added;
+}
+
+/*
+ * Return a page that has 'len' bytes left at the end, without discarding
+ * anything: either the newest page of \a tcd, or a fresh page (from the
+ * stock or newly allocated) appended to tcd_pages.  Returns NULL if the
+ * page limit is reached or no page could be allocated.
+ */
+static struct cfs_trace_page *
+cfs_trace_get_tage_try(struct cfs_trace_cpu_data *tcd, unsigned long len)
+{
+	struct cfs_trace_page *tage;
+
+	/* first choice: the newest page, if the record still fits */
+	if (tcd->tcd_cur_pages > 0) {
+		__LASSERT(!list_empty(&tcd->tcd_pages));
+		tage = cfs_tage_from_list(tcd->tcd_pages.prev);
+		if (tage->used + len <= PAGE_CACHE_SIZE)
+			return tage;
+	}
+
+	/* no more pages may be added to this buffer */
+	if (tcd->tcd_cur_pages >= tcd->tcd_max_pages)
+		return NULL;
+
+	if (tcd->tcd_cur_stock_pages > 0) {
+		/* take a page from the stock */
+		tage = cfs_tage_from_list(tcd->tcd_stock_pages.prev);
+		list_del_init(&tage->linkage);
+		--tcd->tcd_cur_stock_pages;
+	} else {
+		tage = cfs_tage_alloc(GFP_ATOMIC);
+		if (unlikely(tage == NULL)) {
+			if ((!memory_pressure_get() || in_interrupt()) &&
+			    printk_ratelimit())
+				printk(KERN_WARNING
+				       "cannot allocate a tage (%ld)\n",
+				       tcd->tcd_cur_pages);
+			return NULL;
+		}
+	}
+
+	tage->used = 0;
+	tage->cpu = smp_processor_id();
+	tage->type = tcd->tcd_type;
+	list_add_tail(&tage->linkage, &tcd->tcd_pages);
+	tcd->tcd_cur_pages++;
+
+	/* wake up tracefiled to process some pages. */
+	if (tcd->tcd_cur_pages > 8 && thread_running)
+		wake_up(&trace_tctl.tctl_waitq);
+
+	return tage;
+}
+
+/*
+ * Move the first tenth (rounded down) of the pages on tcd_pages over to the
+ * daemon list of \a tcd, after printing a rate limited warning.
+ */
+static void cfs_tcd_shrink(struct cfs_trace_cpu_data *tcd)
+{
+	struct page_collection pc;
+	struct cfs_trace_page *tage;
+	struct cfs_trace_page *next;
+	int pgcount = tcd->tcd_cur_pages / 10;
+
+	/*
+	 * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+	 * from here: this will lead to infinite recursion.
+	 */
+
+	if (printk_ratelimit())
+		printk(KERN_WARNING "debug daemon buffer overflowed; "
+		       "discarding 10%% of pages (%d of %ld)\n",
+		       pgcount + 1, tcd->tcd_cur_pages);
+
+	INIT_LIST_HEAD(&pc.pc_pages);
+	spin_lock_init(&pc.pc_lock);
+
+	list_for_each_entry_safe(tage, next, &tcd->tcd_pages, linkage) {
+		if (pgcount == 0)
+			break;
+		pgcount--;
+
+		list_move_tail(&tage->linkage, &pc.pc_pages);
+		tcd->tcd_cur_pages--;
+	}
+
+	put_pages_on_tcd_daemon_list(&pc, tcd);
+}
+
+/*
+ * Return a page that has 'len' bytes left at the end.  If
+ * cfs_trace_get_tage_try() cannot provide one, the buffer is shrunk (only
+ * while the trace thread runs) and the first page of tcd_pages is emptied
+ * and reused.  Returns NULL if 'len' exceeds a page or no page is available.
+ */
+static struct cfs_trace_page *cfs_trace_get_tage(struct cfs_trace_cpu_data *tcd,
+						 unsigned long len)
+{
+	struct cfs_trace_page *tage;
+
+	/*
+	 * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+	 * from here: this will lead to infinite recursion.
+	 */
+
+	if (len > PAGE_CACHE_SIZE) {
+		printk(KERN_ERR
+		       "cowardly refusing to write %lu bytes in a page\n", len);
+		return NULL;
+	}
+
+	tage = cfs_trace_get_tage_try(tcd, len);
+	if (tage == NULL) {
+		if (thread_running)
+			cfs_tcd_shrink(tcd);
+
+		if (tcd->tcd_cur_pages > 0) {
+			/* recycle the page at the head of the list */
+			tage = cfs_tage_from_list(tcd->tcd_pages.next);
+			tage->used = 0;
+			cfs_tage_to_tail(tage, &tcd->tcd_pages);
+		}
+	}
+
+	return tage;
+}
+
+/*
+ * printf-style front end of libcfs_debug_vmsg2(): the variadic arguments
+ * are passed on as the va_list of \a format, with no second format.
+ * Returns whatever libcfs_debug_vmsg2() returns.
+ */
+int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata,
+		     const char *format, ...)
+{
+	va_list ap;
+	int     ret;
+
+	va_start(ap, format);
+	ret = libcfs_debug_vmsg2(msgdata, format, ap, NULL);
+	va_end(ap);
+
+	return ret;
+}
+EXPORT_SYMBOL(libcfs_debug_msg);
+
+/*
+ * Format one debug message and record it in the trace buffer and/or on the
+ * console.
+ *
+ * The message text is format1 expanded with args, immediately followed by
+ * format2 expanded with the variadic arguments; either format may be NULL.
+ * In the trace page the text is preceded by the ptldebug header (only when
+ * libcfs_debug_binary is set), one '.' per nesting level, the file name
+ * with its directory part stripped and, if set, the function name; both
+ * names are stored with their terminating NUL.
+ *
+ * Console output happens only if the message mask intersects libcfs_printk
+ * and is rate limited through msgdata->msg_cdls when that is not NULL.
+ *
+ * Returns 1 if nothing was printed on the console (not requested, or
+ * skipped by the rate limit), 0 otherwise.
+ */
+int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata,
+		       const char *format1, va_list args,
+		       const char *format2, ...)
+{
+	struct cfs_trace_cpu_data *tcd = NULL;
+	struct ptldebug_header     header = {0};
+	struct cfs_trace_page     *tage;
+	/* string_buf is used only if tcd != NULL, and is always set then */
+	char		      *string_buf = NULL;
+	char		      *debug_buf;
+	int			known_size;
+	int			needed = 85; /* average message length */
+	int			max_nob;
+	va_list		    ap;
+	int			depth;
+	int			i;
+	int			remain;
+	int			mask = msgdata->msg_mask;
+	char		      *file = (char *)msgdata->msg_file;
+	cfs_debug_limit_state_t   *cdls = msgdata->msg_cdls;
+
+	/* keep only the part of the file name after the last '/' */
+	if (strchr(file, '/'))
+		file = strrchr(file, '/') + 1;
+
+	tcd = cfs_trace_get_tcd();
+
+	/* cfs_trace_get_tcd() grabs a lock, which disables preemption and
+	 * pins us to a particular CPU.  This avoids an smp_processor_id()
+	 * warning on Linux when debugging is enabled. */
+	cfs_set_ptldebug_header(&header, msgdata, CDEBUG_STACK());
+
+	if (tcd == NULL)		/* arch may not log in IRQ context */
+		goto console;
+
+	if (tcd->tcd_cur_pages == 0)
+		header.ph_flags |= PH_FLAG_FIRST_RECORD;
+
+	if (tcd->tcd_shutting_down) {
+		cfs_trace_put_tcd(tcd);
+		tcd = NULL;
+		goto console;
+	}
+
+	/* known_size: everything stored in front of the message text */
+	depth = __current_nesting_level();
+	known_size = strlen(file) + 1 + depth;
+	if (msgdata->msg_fn)
+		known_size += strlen(msgdata->msg_fn) + 1;
+
+	if (libcfs_debug_binary)
+		known_size += sizeof(header);
+
+	/*
+	 * '2' used because vsnprintf return real size required for output
+	 * _without_ terminating NULL.
+	 * if needed is too small for this format.
+	 */
+	for (i = 0; i < 2; i++) {
+		tage = cfs_trace_get_tage(tcd, needed + known_size + 1);
+		if (tage == NULL) {
+			if (needed + known_size > PAGE_CACHE_SIZE)
+				mask |= D_ERROR;
+
+			cfs_trace_put_tcd(tcd);
+			tcd = NULL;
+			goto console;
+		}
+
+		/* the text is formatted in place, after the known part */
+		string_buf = (char *)page_address(tage->page) +
+					tage->used + known_size;
+
+		max_nob = PAGE_CACHE_SIZE - tage->used - known_size;
+		if (max_nob <= 0) {
+			printk(KERN_EMERG "negative max_nob: %d\n",
+			       max_nob);
+			mask |= D_ERROR;
+			cfs_trace_put_tcd(tcd);
+			tcd = NULL;
+			goto console;
+		}
+
+		needed = 0;
+		if (format1) {
+			va_copy(ap, args);
+			needed = vsnprintf(string_buf, max_nob, format1, ap);
+			va_end(ap);
+		}
+
+		if (format2) {
+			remain = max_nob - needed;
+			if (remain < 0)
+				remain = 0;
+
+			va_start(ap, format2);
+			needed += vsnprintf(string_buf + needed, remain,
+					    format2, ap);
+			va_end(ap);
+		}
+
+		if (needed < max_nob) /* well. printing ok.. */
+			break;
+	}
+
+	/* NOTE(review): with both formats NULL needed is 0 here and the byte
+	 * in front of string_buf is inspected - confirm callers always pass
+	 * a format */
+	if (*(string_buf+needed-1) != '\n')
+		printk(KERN_INFO "format at %s:%d:%s doesn't end in "
+		       "newline\n", file, msgdata->msg_line, msgdata->msg_fn);
+
+	header.ph_len = known_size + needed;
+	debug_buf = (char *)page_address(tage->page) + tage->used;
+
+	if (libcfs_debug_binary) {
+		memcpy(debug_buf, &header, sizeof(header));
+		tage->used += sizeof(header);
+		debug_buf += sizeof(header);
+	}
+
+	/* indent message according to the nesting level */
+	while (depth-- > 0) {
+		*(debug_buf++) = '.';
+		++ tage->used;
+	}
+
+	strcpy(debug_buf, file);
+	tage->used += strlen(file) + 1;
+	debug_buf += strlen(file) + 1;
+
+	if (msgdata->msg_fn) {
+		strcpy(debug_buf, msgdata->msg_fn);
+		tage->used += strlen(msgdata->msg_fn) + 1;
+		debug_buf += strlen(msgdata->msg_fn) + 1;
+	}
+
+	/* the known part must end exactly where the text was formatted */
+	__LASSERT(debug_buf == string_buf);
+
+	tage->used += needed;
+	__LASSERT (tage->used <= PAGE_CACHE_SIZE);
+
+console:
+	if ((mask & libcfs_printk) == 0) {
+		/* no console output requested */
+		if (tcd != NULL)
+			cfs_trace_put_tcd(tcd);
+		return 1;
+	}
+
+	if (cdls != NULL) {
+		if (libcfs_console_ratelimit &&
+		    cdls->cdls_next != 0 &&     /* not first time ever */
+		    !cfs_time_after(cfs_time_current(), cdls->cdls_next)) {
+			/* skipping a console message */
+			cdls->cdls_count++;
+			if (tcd != NULL)
+				cfs_trace_put_tcd(tcd);
+			return 1;
+		}
+
+		if (cfs_time_after(cfs_time_current(), cdls->cdls_next +
+						       libcfs_console_max_delay
+						       + cfs_time_seconds(10))) {
+			/* last timeout was a long time ago */
+			cdls->cdls_delay /= libcfs_console_backoff * 4;
+		} else {
+			cdls->cdls_delay *= libcfs_console_backoff;
+
+			if (cdls->cdls_delay < libcfs_console_min_delay)
+				cdls->cdls_delay = libcfs_console_min_delay;
+			else if (cdls->cdls_delay > libcfs_console_max_delay)
+				cdls->cdls_delay = libcfs_console_max_delay;
+		}
+
+		/* ensure cdls_next is never zero after it's been seen */
+		cdls->cdls_next = (cfs_time_current() + cdls->cdls_delay) | 1;
+	}
+
+	if (tcd != NULL) {
+		/* the text is already formatted in the trace page */
+		cfs_print_to_console(&header, mask, string_buf, needed, file,
+				     msgdata->msg_fn);
+		cfs_trace_put_tcd(tcd);
+	} else {
+		/* nothing was logged: format again into a console buffer */
+		string_buf = cfs_trace_get_console_buffer();
+
+		needed = 0;
+		if (format1 != NULL) {
+			va_copy(ap, args);
+			needed = vsnprintf(string_buf,
+					   CFS_TRACE_CONSOLE_BUFFER_SIZE,
+					   format1, ap);
+			va_end(ap);
+		}
+		if (format2 != NULL) {
+			remain = CFS_TRACE_CONSOLE_BUFFER_SIZE - needed;
+			if (remain > 0) {
+				va_start(ap, format2);
+				needed += vsnprintf(string_buf+needed, remain,
+						    format2, ap);
+				va_end(ap);
+			}
+		}
+		cfs_print_to_console(&header, mask,
+				     string_buf, needed, file, msgdata->msg_fn);
+
+		cfs_trace_put_console_buffer(string_buf);
+	}
+
+	/* report how many messages the rate limit suppressed before this one */
+	if (cdls != NULL && cdls->cdls_count != 0) {
+		string_buf = cfs_trace_get_console_buffer();
+
+		needed = snprintf(string_buf, CFS_TRACE_CONSOLE_BUFFER_SIZE,
+				  "Skipped %d previous similar message%s\n",
+				  cdls->cdls_count,
+				  (cdls->cdls_count > 1) ? "s" : "");
+
+		cfs_print_to_console(&header, mask,
+				     string_buf, needed, file, msgdata->msg_fn);
+
+		cfs_trace_put_console_buffer(string_buf);
+		cdls->cdls_count = 0;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(libcfs_debug_vmsg2);
+
+/*
+ * Report a failed assertion: flag the panic as in progress, print \a str on
+ * the console at D_EMERG level and panic.  Does not return.
+ */
+void
+cfs_trace_assertion_failed(const char *str,
+			   struct libcfs_debug_msg_data *msgdata)
+{
+	struct ptldebug_header header;
+
+	libcfs_panic_in_progress = 1;
+	libcfs_catastrophe = 1;
+	mb();
+
+	cfs_set_ptldebug_header(&header, msgdata, CDEBUG_STACK());
+	cfs_print_to_console(&header, D_EMERG, str, strlen(str),
+			     msgdata->msg_file, msgdata->msg_fn);
+
+	panic("Lustre debug assertion failure\n");
+
+	/* not reached */
+}
+
+/*
+ * Lock-free variant of collect_pages_on_all_cpus() used while panicking.
+ *
+ * Do the collect_pages job on a single CPU: assumes that all other
+ * CPUs have been stopped during a panic.  If this isn't true for some
+ * arch, this will have to be implemented separately in each arch.
+ */
+static void
+panic_collect_pages(struct page_collection *pc)
+{
+	struct cfs_trace_cpu_data *tcd;
+	int			i;
+	int			j;
+
+	INIT_LIST_HEAD(&pc->pc_pages);
+
+	cfs_tcd_for_each(tcd, i, j) {
+		list_splice_init(&tcd->tcd_pages, &pc->pc_pages);
+		tcd->tcd_cur_pages = 0;
+
+		if (!pc->pc_want_daemon_pages)
+			continue;
+
+		list_splice_init(&tcd->tcd_daemon_pages, &pc->pc_pages);
+		tcd->tcd_cur_daemon_pages = 0;
+	}
+}
+
+/*
+ * Splice the pages of every trace buffer of every possible CPU onto
+ * pc->pc_pages, including the daemon pages when pc_want_daemon_pages is set,
+ * and reset the corresponding page counters.
+ */
+static void collect_pages_on_all_cpus(struct page_collection *pc)
+{
+	struct cfs_trace_cpu_data *tcd;
+	int i, cpu;
+
+	spin_lock(&pc->pc_lock);
+	cfs_for_each_possible_cpu(cpu) {
+		cfs_tcd_for_each_type_lock(tcd, i, cpu) {
+			list_splice_init(&tcd->tcd_pages, &pc->pc_pages);
+			tcd->tcd_cur_pages = 0;
+
+			if (!pc->pc_want_daemon_pages)
+				continue;
+
+			list_splice_init(&tcd->tcd_daemon_pages,
+					 &pc->pc_pages);
+			tcd->tcd_cur_daemon_pages = 0;
+		}
+	}
+	spin_unlock(&pc->pc_lock);
+}
+
+/*
+ * Gather trace pages into \a pc, using the lock-free variant while a panic
+ * is in progress.
+ */
+static void collect_pages(struct page_collection *pc)
+{
+	INIT_LIST_HEAD(&pc->pc_pages);
+
+	if (!libcfs_panic_in_progress)
+		collect_pages_on_all_cpus(pc);
+	else
+		panic_collect_pages(pc);
+}
+
+/*
+ * Give the pages of \a pc back to the trace buffer matching their cpu and
+ * type.  They are inserted in front of the pages the buffer held when it
+ * was visited, keeping their relative order.
+ */
+static void put_pages_back_on_all_cpus(struct page_collection *pc)
+{
+	struct cfs_trace_cpu_data *tcd;
+	struct cfs_trace_page *tage;
+	struct cfs_trace_page *next;
+	struct list_head *first;
+	int i, cpu;
+
+	spin_lock(&pc->pc_lock);
+	cfs_for_each_possible_cpu(cpu) {
+		cfs_tcd_for_each_type_lock(tcd, i, cpu) {
+			/* insertion point: the current first entry */
+			first = tcd->tcd_pages.next;
+
+			list_for_each_entry_safe(tage, next, &pc->pc_pages,
+						 linkage) {
+
+				__LASSERT_TAGE_INVARIANT(tage);
+
+				if (tage->cpu == cpu && tage->type == i) {
+					cfs_tage_to_tail(tage, first);
+					tcd->tcd_cur_pages++;
+				}
+			}
+		}
+	}
+	spin_unlock(&pc->pc_lock);
+}
+
+/* undo collect_pages(); nothing is done while a panic is in progress */
+static void put_pages_back(struct page_collection *pc)
+{
+	if (libcfs_panic_in_progress)
+		return;
+
+	put_pages_back_on_all_cpus(pc);
+}
+
+/* Add pages to a per-cpu debug daemon ringbuffer.  This buffer makes sure that
+ * we have a good amount of data at all times for dumping during an LBUG, even
+ * if we have been steadily writing (and otherwise discarding) pages via the
+ * debug daemon.
+ *
+ * Only the pages of \a pc whose cpu and type match \a tcd are taken.  When
+ * the daemon list grows beyond tcd_max_pages its first (oldest) page is
+ * freed. */
+static void put_pages_on_tcd_daemon_list(struct page_collection *pc,
+					 struct cfs_trace_cpu_data *tcd)
+{
+	struct cfs_trace_page *tage;
+	struct cfs_trace_page *next;
+	struct cfs_trace_page *victim;
+
+	spin_lock(&pc->pc_lock);
+	list_for_each_entry_safe(tage, next, &pc->pc_pages, linkage) {
+
+		__LASSERT_TAGE_INVARIANT(tage);
+
+		if (tage->cpu != tcd->tcd_cpu || tage->type != tcd->tcd_type)
+			continue;
+
+		cfs_tage_to_tail(tage, &tcd->tcd_daemon_pages);
+		tcd->tcd_cur_daemon_pages++;
+
+		if (tcd->tcd_cur_daemon_pages <= tcd->tcd_max_pages)
+			continue;
+
+		/* over the limit: drop the page at the head of the list */
+		__LASSERT(!list_empty(&tcd->tcd_daemon_pages));
+		victim = cfs_tage_from_list(tcd->tcd_daemon_pages.next);
+
+		__LASSERT_TAGE_INVARIANT(victim);
+
+		list_del(&victim->linkage);
+		cfs_tage_free(victim);
+		tcd->tcd_cur_daemon_pages--;
+	}
+	spin_unlock(&pc->pc_lock);
+}
+
+/* hand the pages of \a pc to the daemon list of each matching trace buffer */
+static void put_pages_on_daemon_list(struct page_collection *pc)
+{
+	struct cfs_trace_cpu_data *tcd;
+	int i, cpu;
+
+	cfs_for_each_possible_cpu(cpu) {
+		cfs_tcd_for_each_type_lock(tcd, i, cpu) {
+			put_pages_on_tcd_daemon_list(pc, tcd);
+		}
+	}
+}
+
+/*
+ * Collect all trace pages (daemon pages included), print every record they
+ * hold on the console at D_EMERG level and free the pages.
+ *
+ * Each record is read as a ptldebug header followed by the NUL terminated
+ * file and function names and then the message text; hdr->ph_len gives the
+ * length of the whole record.
+ */
+void cfs_trace_debug_print(void)
+{
+	struct page_collection pc;
+	struct cfs_trace_page *tage;
+	struct cfs_trace_page *next;
+
+	spin_lock_init(&pc.pc_lock);
+
+	pc.pc_want_daemon_pages = 1;
+	collect_pages(&pc);
+	list_for_each_entry_safe(tage, next, &pc.pc_pages, linkage) {
+		char *cur;
+		char *end;
+
+		__LASSERT_TAGE_INVARIANT(tage);
+
+		cur = page_address(tage->page);
+		end = cur + tage->used;
+		while (cur < end) {
+			struct ptldebug_header *hdr = (void *)cur;
+			char *file;
+			char *fn;
+			int len;
+
+			file = cur + sizeof(*hdr);
+			fn = file + strlen(file) + 1;
+			cur = fn + strlen(fn) + 1;
+			/* what is left of the record is the message text */
+			len = hdr->ph_len - (int)(cur - (char *)hdr);
+
+			cfs_print_to_console(hdr, D_EMERG, cur, len, file, fn);
+
+			cur += len;
+		}
+
+		list_del(&tage->linkage);
+		cfs_tage_free(tage);
+	}
+}
+
+/*
+ * Write all collected trace pages (daemon pages included) to a new file.
+ *
+ * \a filename is opened with O_CREAT|O_EXCL, so it must not exist yet.
+ * Pages are freed as they are written.  If a write comes up short the
+ * remaining pages are handed back with put_pages_back() and the dump stops.
+ *
+ * Returns 0 on success (also when there was nothing to dump), the error
+ * from filp_open(), the error of a failed write (-EIO for a short write
+ * that reported no error), or otherwise the result of filp_fsync().
+ */
+int cfs_tracefile_dump_all_pages(char *filename)
+{
+	struct page_collection	pc;
+	struct file		*filp;
+	struct cfs_trace_page	*tage;
+	struct cfs_trace_page	*tmp;
+	int write_rc = 0;
+	int rc;
+
+	DECL_MMSPACE;
+
+	cfs_tracefile_write_lock();
+
+	filp = filp_open(filename, O_CREAT|O_EXCL|O_WRONLY|O_LARGEFILE, 0600);
+	if (IS_ERR(filp)) {
+		rc = PTR_ERR(filp);
+		filp = NULL;
+		printk(KERN_ERR "LustreError: can't open %s for dump: rc %d\n",
+		      filename, rc);
+		goto out;
+	}
+
+	spin_lock_init(&pc.pc_lock);
+	pc.pc_want_daemon_pages = 1;
+	collect_pages(&pc);
+	if (list_empty(&pc.pc_pages)) {
+		rc = 0;
+		goto close;
+	}
+
+	/* ok, for now, just write the pages.  in the future we'll be building
+	 * iobufs with the pages and calling generic_direct_IO */
+	MMSPACE_OPEN;
+	list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
+
+		__LASSERT_TAGE_INVARIANT(tage);
+
+		rc = filp_write(filp, page_address(tage->page),
+				tage->used, filp_poff(filp));
+		if (rc != (int)tage->used) {
+			printk(KERN_WARNING "wanted to write %u but wrote "
+			       "%d\n", tage->used, rc);
+			/* rc is reused for the fsync result below, so the
+			 * write failure has to be remembered separately */
+			write_rc = rc < 0 ? rc : -EIO;
+			put_pages_back(&pc);
+			__LASSERT(list_empty(&pc.pc_pages));
+			break;
+		}
+		list_del(&tage->linkage);
+		cfs_tage_free(tage);
+	}
+	MMSPACE_CLOSE;
+	rc = filp_fsync(filp);
+	if (rc)
+		printk(KERN_ERR "sync returns %d\n", rc);
+	/* a failed write takes precedence over the sync result */
+	if (write_rc != 0)
+		rc = write_rc;
+close:
+	filp_close(filp, NULL);
+out:
+	cfs_tracefile_write_unlock();
+	return rc;
+}
+
+/* collect all trace pages (daemon pages included) and free them unwritten */
+void cfs_trace_flush_pages(void)
+{
+	struct page_collection pc;
+	struct cfs_trace_page *tage;
+	struct cfs_trace_page *next;
+
+	spin_lock_init(&pc.pc_lock);
+	pc.pc_want_daemon_pages = 1;
+
+	collect_pages(&pc);
+
+	list_for_each_entry_safe(tage, next, &pc.pc_pages, linkage) {
+		__LASSERT_TAGE_INVARIANT(tage);
+
+		list_del(&tage->linkage);
+		cfs_tage_free(tage);
+	}
+}
+
+/*
+ * Copy a string from user space and NUL terminate it after its last
+ * non-whitespace character.
+ *
+ * \a knl_buffer / \a knl_buffer_nob: destination buffer and its size
+ * \a usr_buffer / \a usr_buffer_nob: user space source and its length
+ *
+ * Returns 0 on success, -EOVERFLOW if the user data is larger than the
+ * kernel buffer or leaves no room for the terminator, -EFAULT if the copy
+ * from user space fails, -EINVAL if the string is empty or consists of
+ * whitespace only.
+ */
+int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob,
+			    const char *usr_buffer, int usr_buffer_nob)
+{
+	int    nob;
+
+	if (usr_buffer_nob > knl_buffer_nob)
+		return -EOVERFLOW;
+
+	if (copy_from_user((void *)knl_buffer,
+			   (void *)usr_buffer, usr_buffer_nob))
+		return -EFAULT;
+
+	/*
+	 * Strip trailing whitespace.  nob is decremented before it is used
+	 * as an index, so knl_buffer[-1] is never touched; on exit it is the
+	 * index of the last character to keep, or -1 if there is none.
+	 */
+	nob = strnlen(knl_buffer, usr_buffer_nob);
+	while (--nob >= 0)
+		if (!isspace(knl_buffer[nob]))
+			break;
+
+	if (nob < 0)			    /* empty string */
+		return -EINVAL;
+
+	/* the terminator goes to index nob + 1, which must be in bounds */
+	if (nob + 1 >= knl_buffer_nob)	      /* no space to terminate */
+		return -EOVERFLOW;
+
+	knl_buffer[nob + 1] = 0;		/* terminate */
+	return 0;
+}
+EXPORT_SYMBOL(cfs_trace_copyin_string);
+
+/*
+ * Copy \a knl_buffer to user space, truncated to \a usr_buffer_nob bytes.
+ *
+ * NB if 'append' != NULL, it's a single character to append to the
+ * copied out string - usually "\n", for /proc entries and "" (i.e. a
+ * terminating zero byte) for sysctl entries.  It is appended only if the
+ * user buffer still has room for it.
+ *
+ * Returns the number of bytes copied out, or -EFAULT if a copy fails.
+ */
+int cfs_trace_copyout_string(char *usr_buffer, int usr_buffer_nob,
+			     const char *knl_buffer, char *append)
+{
+	int   nob = strlen(knl_buffer);
+
+	/* never copy more than the user buffer can hold */
+	if (nob > usr_buffer_nob)
+		nob = usr_buffer_nob;
+
+	if (copy_to_user(usr_buffer, knl_buffer, nob))
+		return -EFAULT;
+
+	/* nothing to append, or no room left for it */
+	if (append == NULL || nob >= usr_buffer_nob)
+		return nob;
+
+	if (copy_to_user(usr_buffer + nob, append, 1))
+		return -EFAULT;
+
+	return nob + 1;
+}
+EXPORT_SYMBOL(cfs_trace_copyout_string);
+
+/*
+ * Allocate a zero-filled buffer of \a nob bytes and store it in \a *str.
+ *
+ * Returns 0 on success, -EINVAL if \a nob exceeds two pages, -ENOMEM if the
+ * allocation fails (\a *str is then NULL).
+ */
+int cfs_trace_allocate_string_buffer(char **str, int nob)
+{
+	char *buf;
+
+	if (nob > 2 * PAGE_CACHE_SIZE)	    /* string must be "sensible" */
+		return -EINVAL;
+
+	buf = kmalloc(nob, GFP_IOFS | __GFP_ZERO);
+	*str = buf;
+
+	return buf == NULL ? -ENOMEM : 0;
+}
+
+/*
+ * Free a buffer obtained from cfs_trace_allocate_string_buffer().
+ * \a nob is not used.
+ */
+void cfs_trace_free_string_buffer(char *str, int nob)
+{
+	kfree(str);
+}
+
+/*
+ * Dump all trace pages to the file named by a user space string.  The name
+ * must start with '/', otherwise -EINVAL is returned.  Other errors come
+ * from the buffer allocation, the copy from user space, or the dump itself.
+ */
+int cfs_trace_dump_debug_buffer_usrstr(void *usr_str, int usr_str_nob)
+{
+	char	 *path;
+	int	   rc;
+
+	rc = cfs_trace_allocate_string_buffer(&path, usr_str_nob + 1);
+	if (rc != 0)
+		return rc;
+
+	rc = cfs_trace_copyin_string(path, usr_str_nob + 1,
+				     usr_str, usr_str_nob);
+	if (rc == 0) {
+		if (path[0] == '/')
+			rc = cfs_tracefile_dump_all_pages(path);
+		else
+			rc = -EINVAL;
+	}
+
+	cfs_trace_free_string_buffer(path, usr_str_nob + 1);
+	return rc;
+}
+
+/*
+ * Execute one debug daemon command:
+ *   "stop"     stop the trace thread and clear the trace file name
+ *   "size=N"   set the trace file size to N MB; values outside 10..20480
+ *		select CFS_TRACEFILE_SIZE instead
+ *   "/path"    set the trace file name and start the trace thread
+ *
+ * Returns 0 on success, -ENAMETOOLONG if a path does not fit into
+ * cfs_tracefile, -EINVAL if it does not start with '/'.
+ */
+int cfs_trace_daemon_command(char *str)
+{
+	int       rc = 0;
+
+	cfs_tracefile_write_lock();
+
+	if (strcmp(str, "stop") == 0) {
+		/* the lock is not held while the thread is being stopped */
+		cfs_tracefile_write_unlock();
+		cfs_trace_stop_thread();
+		cfs_tracefile_write_lock();
+		memset(cfs_tracefile, 0, sizeof(cfs_tracefile));
+		goto unlock;
+	}
+
+	if (strncmp(str, "size=", 5) == 0) {
+		cfs_tracefile_size = simple_strtoul(str + 5, NULL, 0);
+		if (cfs_tracefile_size >= 10 && cfs_tracefile_size <= 20480)
+			cfs_tracefile_size <<= 20;
+		else
+			cfs_tracefile_size = CFS_TRACEFILE_SIZE;
+		goto unlock;
+	}
+
+	if (strlen(str) >= sizeof(cfs_tracefile)) {
+		rc = -ENAMETOOLONG;
+		goto unlock;
+	}
+
+	if (str[0] != '/') {
+		rc = -EINVAL;
+		goto unlock;
+	}
+
+	strcpy(cfs_tracefile, str);
+
+	printk(KERN_INFO
+	       "Lustre: debug daemon will attempt to start writing "
+	       "to %s (%lukB max)\n", cfs_tracefile,
+	       (long)(cfs_tracefile_size >> 10));
+
+	cfs_trace_start_thread();
+
+unlock:
+	cfs_tracefile_write_unlock();
+	return rc;
+}
+
+/*
+ * Copy a debug daemon command from user space and execute it with
+ * cfs_trace_daemon_command().  Returns the first error encountered.
+ */
+int cfs_trace_daemon_command_usrstr(void *usr_str, int usr_str_nob)
+{
+	char *cmd;
+	int   rc;
+
+	rc = cfs_trace_allocate_string_buffer(&cmd, usr_str_nob + 1);
+	if (rc != 0)
+		return rc;
+
+	rc = cfs_trace_copyin_string(cmd, usr_str_nob + 1,
+				     usr_str, usr_str_nob);
+	if (rc != 0)
+		goto out;
+
+	rc = cfs_trace_daemon_command(cmd);
+out:
+	cfs_trace_free_string_buffer(cmd, usr_str_nob + 1);
+	return rc;
+}
+
+/*
+ * Set the total size of the debug buffers to \a mb megabytes.
+ *
+ * The value is clamped to the range from num_possible_cpus() to
+ * cfs_trace_max_debug_mb() (a warning is printed when it is adjusted),
+ * divided evenly between the possible CPUs, and each trace buffer gets
+ * tcd_pages_factor percent of its CPU's share as tcd_max_pages.
+ *
+ * Always returns 0.
+ */
+int cfs_trace_set_debug_mb(int mb)
+{
+	int i;
+	int j;
+	int pages;
+	int limit = cfs_trace_max_debug_mb();
+	/*
+	 * Keep the CPU count in a signed int: comparing mb directly with an
+	 * unsigned count would convert a negative mb to a huge value and let
+	 * it slip past the lower bound.
+	 */
+	int ncpus = num_possible_cpus();
+	struct cfs_trace_cpu_data *tcd;
+
+	if (mb < ncpus) {
+		printk(KERN_WARNING
+		       "Lustre: %d MB is too small for debug buffer size, "
+		       "setting it to %d MB.\n", mb, ncpus);
+		mb = ncpus;
+	}
+
+	if (mb > limit) {
+		printk(KERN_WARNING
+		       "Lustre: %d MB is too large for debug buffer size, "
+		       "setting it to %d MB.\n", mb, limit);
+		mb = limit;
+	}
+
+	/* per-CPU share, converted from MB to pages */
+	mb /= ncpus;
+	pages = mb << (20 - PAGE_CACHE_SHIFT);
+
+	cfs_tracefile_write_lock();
+
+	cfs_tcd_for_each(tcd, i, j)
+		tcd->tcd_max_pages = (pages * tcd->tcd_pages_factor) / 100;
+
+	cfs_tracefile_write_unlock();
+
+	return 0;
+}
+
+/*
+ * Parse a number from a user space string and pass it to
+ * cfs_trace_set_debug_mb().  Returns the copy-in error, if any.
+ */
+int cfs_trace_set_debug_mb_usrstr(void *usr_str, int usr_str_nob)
+{
+	char     buf[32];
+	int      rc;
+
+	rc = cfs_trace_copyin_string(buf, sizeof(buf), usr_str, usr_str_nob);
+	if (rc >= 0)
+		rc = cfs_trace_set_debug_mb(simple_strtoul(buf, NULL, 0));
+
+	return rc;
+}
+
+/*
+ * Return the configured debug buffer size in MB: the sum of tcd_max_pages
+ * over all trace buffers converted to megabytes, plus one.
+ */
+int cfs_trace_get_debug_mb(void)
+{
+	struct cfs_trace_cpu_data *tcd;
+	int total_pages = 0;
+	int i;
+	int j;
+
+	cfs_tracefile_read_lock();
+
+	cfs_tcd_for_each(tcd, i, j) {
+		total_pages += tcd->tcd_max_pages;
+	}
+
+	cfs_tracefile_read_unlock();
+
+	return (total_pages >> (20 - PAGE_CACHE_SHIFT)) + 1;
+}
+
+/*
+ * Main loop of the "ktracefiled" thread started by cfs_trace_start_thread().
+ *
+ * Each pass collects the pages queued on the trace buffers, writes them to
+ * the file named by cfs_tracefile (if a name is set and the file can be
+ * opened) and then hands the pages to the daemon lists.  The write position
+ * wraps to 0 once it reaches cfs_tracefile_size.  Between passes the thread
+ * sleeps on tctl_waitq for up to cfs_time_seconds(1).  After tctl_shutdown
+ * has been set one more pass is made before tctl_stop is completed and the
+ * thread returns 0.
+ *
+ * arg is the struct tracefiled_ctl controlling the thread.
+ */
+static int tracefiled(void *arg)
+{
+	struct page_collection pc;
+	struct tracefiled_ctl *tctl = arg;
+	struct cfs_trace_page *tage;
+	struct cfs_trace_page *tmp;
+	struct file *filp;
+	int last_loop = 0;
+	int rc;
+
+	DECL_MMSPACE;
+
+	/* we're started late enough that we pick up init's fs context */
+	/* this is so broken in uml?  what on earth is going on? */
+
+	spin_lock_init(&pc.pc_lock);
+	/* tell cfs_trace_start_thread() that the thread is up */
+	complete(&tctl->tctl_start);
+
+	while (1) {
+		wait_queue_t __wait;
+
+		pc.pc_want_daemon_pages = 0;
+		collect_pages(&pc);
+		if (list_empty(&pc.pc_pages))
+			goto end_loop;
+
+		filp = NULL;
+		cfs_tracefile_read_lock();
+		if (cfs_tracefile[0] != 0) {
+			filp = filp_open(cfs_tracefile,
+					 O_CREAT | O_RDWR | O_LARGEFILE,
+					 0600);
+			if (IS_ERR(filp)) {
+				rc = PTR_ERR(filp);
+				filp = NULL;
+				printk(KERN_WARNING "couldn't open %s: "
+				       "%d\n", cfs_tracefile, rc);
+			}
+		}
+		cfs_tracefile_read_unlock();
+		if (filp == NULL) {
+			/* no file to write to: keep the pages for a dump */
+			put_pages_on_daemon_list(&pc);
+			__LASSERT(list_empty(&pc.pc_pages));
+			goto end_loop;
+		}
+
+		MMSPACE_OPEN;
+
+		list_for_each_entry_safe(tage, tmp, &pc.pc_pages,
+						   linkage) {
+			/* write position, kept across passes of the thread */
+			static loff_t f_pos;
+
+			__LASSERT_TAGE_INVARIANT(tage);
+
+			if (f_pos >= (off_t)cfs_tracefile_size)
+				f_pos = 0;
+			else if (f_pos > (off_t)filp_size(filp))
+				f_pos = filp_size(filp);
+
+			rc = filp_write(filp, page_address(tage->page),
+					tage->used, &f_pos);
+			if (rc != (int)tage->used) {
+				printk(KERN_WARNING "wanted to write %u "
+				       "but wrote %d\n", tage->used, rc);
+				/* NOTE(review): unlike
+				 * cfs_tracefile_dump_all_pages() there is no
+				 * break after the pages are handed back -
+				 * confirm that continuing the walk is
+				 * intended */
+				put_pages_back(&pc);
+				__LASSERT(list_empty(&pc.pc_pages));
+			}
+		}
+		MMSPACE_CLOSE;
+
+		filp_close(filp, NULL);
+		put_pages_on_daemon_list(&pc);
+		if (!list_empty(&pc.pc_pages)) {
+			int i;
+
+			/* pages no trace buffer accepted: report them */
+			printk(KERN_ALERT "Lustre: trace pages aren't "
+			       " empty\n");
+			printk(KERN_ERR "total cpus(%d): ",
+			       num_possible_cpus());
+			for (i = 0; i < num_possible_cpus(); i++)
+				if (cpu_online(i))
+					printk(KERN_ERR "%d(on) ", i);
+				else
+					printk(KERN_ERR "%d(off) ", i);
+			printk(KERN_ERR "\n");
+
+			i = 0;
+			list_for_each_entry_safe(tage, tmp, &pc.pc_pages,
+						     linkage)
+				printk(KERN_ERR "page %d belongs to cpu "
+				       "%d\n", ++i, tage->cpu);
+			printk(KERN_ERR "There are %d pages unwritten\n",
+			       i);
+		}
+		__LASSERT(list_empty(&pc.pc_pages));
+end_loop:
+		if (atomic_read(&tctl->tctl_shutdown)) {
+			/* make one more pass after shutdown was requested */
+			if (last_loop == 0) {
+				last_loop = 1;
+				continue;
+			} else {
+				break;
+			}
+		}
+		/* sleep until woken through tctl_waitq or the timeout ends */
+		init_waitqueue_entry_current(&__wait);
+		add_wait_queue(&tctl->tctl_waitq, &__wait);
+		set_current_state(TASK_INTERRUPTIBLE);
+		waitq_timedwait(&__wait, TASK_INTERRUPTIBLE,
+				    cfs_time_seconds(1));
+		remove_wait_queue(&tctl->tctl_waitq, &__wait);
+	}
+	/* let cfs_trace_stop_thread() know that the thread is done */
+	complete(&tctl->tctl_stop);
+	return 0;
+}
+
+/*
+ * Start the trace daemon thread unless it is already running, and wait
+ * until it has signalled tctl_start.
+ *
+ * Returns 0 on success or if the thread was already running, -ECHILD if
+ * the thread could not be created.
+ */
+int cfs_trace_start_thread(void)
+{
+	struct tracefiled_ctl *tctl = &trace_tctl;
+	struct task_struct *task;
+	int rc = 0;
+
+	mutex_lock(&cfs_trace_thread_mutex);
+	if (!thread_running) {
+		init_completion(&tctl->tctl_start);
+		init_completion(&tctl->tctl_stop);
+		init_waitqueue_head(&tctl->tctl_waitq);
+		atomic_set(&tctl->tctl_shutdown, 0);
+
+		task = kthread_run(tracefiled, tctl, "ktracefiled");
+		if (IS_ERR(task)) {
+			rc = -ECHILD;
+		} else {
+			wait_for_completion(&tctl->tctl_start);
+			thread_running = 1;
+		}
+	}
+	mutex_unlock(&cfs_trace_thread_mutex);
+
+	return rc;
+}
+
+/*
+ * Ask the trace daemon thread to shut down and wait until it has signalled
+ * tctl_stop.  Does nothing if the thread is not running.
+ */
+void cfs_trace_stop_thread(void)
+{
+	struct tracefiled_ctl *tctl = &trace_tctl;
+
+	mutex_lock(&cfs_trace_thread_mutex);
+	if (!thread_running)
+		goto out;
+
+	printk(KERN_INFO
+	       "Lustre: shutting down debug daemon thread...\n");
+	atomic_set(&tctl->tctl_shutdown, 1);
+	wait_for_completion(&tctl->tctl_stop);
+	thread_running = 0;
+out:
+	mutex_unlock(&cfs_trace_thread_mutex);
+}
+
+/*
+ * Initialise the trace buffers: run the arch specific setup, then give
+ * every buffer empty page lists and a page limit of tcd_pages_factor
+ * percent of \a max_pages (which must come out greater than 0).
+ *
+ * Returns 0 on success or the error from cfs_tracefile_init_arch().
+ */
+int cfs_tracefile_init(int max_pages)
+{
+	struct cfs_trace_cpu_data *tcd;
+	int		    factor;
+	int		    i;
+	int		    j;
+	int		    rc = cfs_tracefile_init_arch();
+
+	if (rc != 0)
+		return rc;
+
+	cfs_tcd_for_each(tcd, i, j) {
+		INIT_LIST_HEAD(&tcd->tcd_pages);
+		INIT_LIST_HEAD(&tcd->tcd_stock_pages);
+		INIT_LIST_HEAD(&tcd->tcd_daemon_pages);
+
+		tcd->tcd_cur_pages = 0;
+		tcd->tcd_cur_stock_pages = 0;
+		tcd->tcd_cur_daemon_pages = 0;
+
+		/* tcd_pages_factor is initialized in tracefile_init_arch. */
+		factor = tcd->tcd_pages_factor;
+		tcd->tcd_max_pages = (max_pages * factor) / 100;
+		LASSERT(tcd->tcd_max_pages > 0);
+
+		tcd->tcd_shutting_down = 0;
+	}
+
+	return 0;
+}
+
+/*
+ * Mark every trace buffer of every possible CPU as shutting down and free
+ * the pages queued on its tcd_pages list.
+ */
+static void trace_cleanup_on_all_cpus(void)
+{
+	struct cfs_trace_cpu_data *tcd;
+	struct cfs_trace_page *tage;
+	struct cfs_trace_page *next;
+	int i, cpu;
+
+	cfs_for_each_possible_cpu(cpu) {
+		cfs_tcd_for_each_type_lock(tcd, i, cpu) {
+			tcd->tcd_shutting_down = 1;
+
+			list_for_each_entry_safe(tage, next, &tcd->tcd_pages,
+						 linkage) {
+				__LASSERT_TAGE_INVARIANT(tage);
+
+				list_del(&tage->linkage);
+				cfs_tage_free(tage);
+			}
+
+			tcd->tcd_cur_pages = 0;
+		}
+	}
+}
+
+/*
+ * Free the pages of all trace buffers, then run the arch specific
+ * teardown.
+ */
+static void cfs_trace_cleanup(void)
+{
+	trace_cleanup_on_all_cpus();
+
+	cfs_tracefile_fini_arch();
+}
+
+/* shut the tracing code down: stop the daemon thread, then free everything */
+void cfs_tracefile_exit(void)
+{
+	cfs_trace_stop_thread();
+
+	cfs_trace_cleanup();
+}
diff --git a/drivers/staging/lustre/lustre/libcfs/tracefile.h b/drivers/staging/lustre/lustre/libcfs/tracefile.h
new file mode 100644
index 000000000000..7e8d17c12b5b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/tracefile.h
@@ -0,0 +1,340 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_TRACEFILE_H__
+#define __LIBCFS_TRACEFILE_H__
+
+#include <linux/libcfs/libcfs.h>
+
+#include "linux/linux-tracefile.h"
+
+/* trace file lock routines */
+
+#define TRACEFILE_NAME_SIZE 1024
+extern char      cfs_tracefile[TRACEFILE_NAME_SIZE];
+extern long long cfs_tracefile_size;
+
+extern void libcfs_run_debug_log_upcall(char *file);
+
+int  cfs_tracefile_init_arch(void);
+void cfs_tracefile_fini_arch(void);
+
+void cfs_tracefile_read_lock(void);
+void cfs_tracefile_read_unlock(void);
+void cfs_tracefile_write_lock(void);
+void cfs_tracefile_write_unlock(void);
+
+int cfs_tracefile_dump_all_pages(char *filename);
+void cfs_trace_debug_print(void);
+void cfs_trace_flush_pages(void);
+int cfs_trace_start_thread(void);
+void cfs_trace_stop_thread(void);
+int cfs_tracefile_init(int max_pages);
+void cfs_tracefile_exit(void);
+
+
+
+int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob,
+			    const char *usr_buffer, int usr_buffer_nob);
+int cfs_trace_copyout_string(char *usr_buffer, int usr_buffer_nob,
+			     const char *knl_str, char *append);
+int cfs_trace_allocate_string_buffer(char **str, int nob);
+void cfs_trace_free_string_buffer(char *str, int nob);
+int cfs_trace_dump_debug_buffer_usrstr(void *usr_str, int usr_str_nob);
+int cfs_trace_daemon_command(char *str);
+int cfs_trace_daemon_command_usrstr(void *usr_str, int usr_str_nob);
+int cfs_trace_set_debug_mb(int mb);
+int cfs_trace_set_debug_mb_usrstr(void *usr_str, int usr_str_nob);
+int cfs_trace_get_debug_mb(void);
+
+extern void libcfs_debug_dumplog_internal(void *arg);
+extern void libcfs_register_panic_notifier(void);
+extern void libcfs_unregister_panic_notifier(void);
+extern int  libcfs_panic_in_progress;
+extern int  cfs_trace_max_debug_mb(void);
+
+#define TCD_MAX_PAGES (5 << (20 - PAGE_CACHE_SHIFT))
+#define TCD_STOCK_PAGES (TCD_MAX_PAGES)
+#define CFS_TRACEFILE_SIZE (500 << 20)
+
+#ifdef LUSTRE_TRACEFILE_PRIVATE
+
+/*
+ * Private declarations for the tracefile implementation; only visible to
+ * files that define LUSTRE_TRACEFILE_PRIVATE before including this header.
+ *
+ * TCD_MAX_PAGES, TCD_STOCK_PAGES and CFS_TRACEFILE_SIZE are defined once,
+ * above, for every includer and are deliberately not repeated here.
+ */
+
+/* Size of the fallback buffer used to format console messages when a page
+ * cannot be obtained from the system */
+#define CFS_TRACE_CONSOLE_BUFFER_SIZE   1024
+
+union cfs_trace_data_union {
+	struct cfs_trace_cpu_data {
+		/*
+		 * Even though this structure is meant to be per-CPU, locking
+		 * is needed because in some places the data may be accessed
+		 * from other CPUs. This lock is directly used in trace_get_tcd
+		 * and trace_put_tcd, which are called in libcfs_debug_vmsg2 and
+		 * tcd_for_each_type_lock
+		 */
+		spinlock_t		tcd_lock;
+		unsigned long	   tcd_lock_flags;
+
+		/*
+		 * pages with trace records not yet processed by tracefiled.
+		 */
+		struct list_head	      tcd_pages;
+		/* number of pages on ->tcd_pages */
+		unsigned long	   tcd_cur_pages;
+
+		/*
+		 * pages with trace records already processed by
+		 * tracefiled. These pages are kept in memory, so that some
+		 * portion of log can be written in the event of LBUG. This
+		 * list is maintained in LRU order.
+		 *
+		 * Pages are moved to ->tcd_daemon_pages by tracefiled()
+		 * (put_pages_on_daemon_list()). LRU pages from this list are
+		 * discarded when list grows too large.
+		 */
+		struct list_head	      tcd_daemon_pages;
+		/* number of pages on ->tcd_daemon_pages */
+		unsigned long	   tcd_cur_daemon_pages;
+
+		/*
+		 * Maximal number of pages allowed on ->tcd_pages and
+		 * ->tcd_daemon_pages each.
+		 * Always TCD_MAX_PAGES * tcd_pages_factor / 100 in current
+		 * implementation.
+		 */
+		unsigned long	   tcd_max_pages;
+
+		/*
+		 * preallocated pages to write trace records into. Pages from
+		 * ->tcd_stock_pages are moved to ->tcd_pages by
+		 * portals_debug_msg().
+		 *
+		 * This list is necessary, because on some platforms it's
+		 * impossible to perform efficient atomic page allocation in a
+		 * non-blockable context.
+		 *
+		 * Such platforms fill ->tcd_stock_pages "on occasion", when
+		 * tracing code is entered in blockable context.
+		 *
+		 * trace_get_tage_try() tries to get a page from
+		 * ->tcd_stock_pages first and resorts to atomic page
+		 * allocation only if this queue is empty. ->tcd_stock_pages
+		 * is replenished when tracing code is entered in blocking
+		 * context (darwin-tracefile.c:trace_get_tcd()). We try to
+		 * maintain TCD_STOCK_PAGES (40 by default) pages in this
+		 * queue. Atomic allocation is only required if more than
+		 * TCD_STOCK_PAGES pagesful are consumed by trace records all
+		 * emitted in non-blocking contexts. Which is quite unlikely.
+		 */
+		struct list_head	      tcd_stock_pages;
+		/* number of pages on ->tcd_stock_pages */
+		unsigned long	   tcd_cur_stock_pages;
+
+		unsigned short	  tcd_shutting_down;
+		unsigned short	  tcd_cpu;
+		unsigned short	  tcd_type;
+		/* The factors to share debug memory. */
+		unsigned short	  tcd_pages_factor;
+	} tcd;
+	char __pad[L1_CACHE_ALIGN(sizeof(struct cfs_trace_cpu_data))];
+};
+
+#define TCD_MAX_TYPES      8
+extern union cfs_trace_data_union (*cfs_trace_data[TCD_MAX_TYPES])[NR_CPUS];
+
+#define cfs_tcd_for_each(tcd, i, j)				       \
+    for (i = 0; cfs_trace_data[i] != NULL; i++)			   \
+	for (j = 0, ((tcd) = &(*cfs_trace_data[i])[j].tcd);	       \
+	     j < num_possible_cpus();				 \
+	     j++, (tcd) = &(*cfs_trace_data[i])[j].tcd)
+
+#define cfs_tcd_for_each_type_lock(tcd, i, cpu)			   \
+    for (i = 0; cfs_trace_data[i] &&				      \
+	 (tcd = &(*cfs_trace_data[i])[cpu].tcd) &&			\
+	 cfs_trace_lock_tcd(tcd, 1); cfs_trace_unlock_tcd(tcd, 1), i++)
+
+/* XXX nikita: this declaration is internal to tracefile.c and should probably
+ * be moved there */
+struct page_collection {
+	struct list_head	pc_pages;
+	/*
+	 * spin-lock protecting ->pc_pages. It is taken by smp_call_function()
+	 * call-back functions. XXX nikita: Which is horrible: all processors
+	 * receive NMI at the same time only to be serialized by this
+	 * lock. Probably ->pc_pages should be replaced with an array of
+	 * NR_CPUS elements accessed locklessly.
+	 */
+	spinlock_t	pc_lock;
+	/*
+	 * if this flag is set, collect_pages() will spill both
+	 * ->tcd_daemon_pages and ->tcd_pages to the ->pc_pages. Otherwise,
+	 * only ->tcd_pages are spilled.
+	 */
+	int		pc_want_daemon_pages;
+};
+
+/* XXX nikita: this declaration is internal to tracefile.c and should probably
+ * be moved there */
+struct tracefiled_ctl {
+	struct completion	tctl_start;
+	struct completion	tctl_stop;
+	wait_queue_head_t		tctl_waitq;
+	pid_t			tctl_pid;
+	atomic_t		tctl_shutdown;
+};
+
+/*
+ * small data-structure for each page owned by tracefiled.
+ */
+/* XXX nikita: this declaration is internal to tracefile.c and should probably
+ * be moved there */
+struct cfs_trace_page {
+	/*
+	 * page itself
+	 */
+	struct page	  *page;
+	/*
+	 * linkage into one of the lists in trace_data_union or
+	 * page_collection
+	 */
+	struct list_head	   linkage;
+	/*
+	 * number of bytes used within this page
+	 */
+	unsigned int	 used;
+	/*
+	 * cpu that owns this page
+	 */
+	unsigned short       cpu;
+	/*
+	 * type(context) of this page
+	 */
+	unsigned short       type;
+};
+
+extern void cfs_set_ptldebug_header(struct ptldebug_header *header,
+				    struct libcfs_debug_msg_data *m,
+				    unsigned long stack);
+extern void cfs_print_to_console(struct ptldebug_header *hdr, int mask,
+				 const char *buf, int len, const char *file,
+				 const char *fn);
+
+extern int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking);
+extern void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking);
+
+/**
+ * trace_buf_type_t, trace_buf_idx_get() and trace_console_buffers[][]
+ * are not public libcfs API; they should be defined in
+ * platform-specific tracefile include files
+ * (see, for example, linux-tracefile.h).
+ */
+
+extern char *cfs_trace_console_buffers[NR_CPUS][CFS_TCD_TYPE_MAX];
+extern cfs_trace_buf_type_t cfs_trace_buf_idx_get(void);
+
+static inline char *
+cfs_trace_get_console_buffer(void)
+{
+	unsigned int i = get_cpu();
+	unsigned int j = cfs_trace_buf_idx_get();
+
+	return cfs_trace_console_buffers[i][j];
+}
+
+static inline void
+cfs_trace_put_console_buffer(char *buffer)
+{
+	put_cpu();
+}
+
+static inline struct cfs_trace_cpu_data *
+cfs_trace_get_tcd(void)
+{
+	struct cfs_trace_cpu_data *tcd =
+		&(*cfs_trace_data[cfs_trace_buf_idx_get()])[get_cpu()].tcd;
+
+	cfs_trace_lock_tcd(tcd, 0);
+
+	return tcd;
+}
+
+static inline void
+cfs_trace_put_tcd (struct cfs_trace_cpu_data *tcd)
+{
+	cfs_trace_unlock_tcd(tcd, 0);
+
+	put_cpu();
+}
+
+int cfs_trace_refill_stock(struct cfs_trace_cpu_data *tcd, int gfp,
+			   struct list_head *stock);
+
+
+int cfs_tcd_owns_tage(struct cfs_trace_cpu_data *tcd,
+		      struct cfs_trace_page *tage);
+
+extern void cfs_trace_assertion_failed(const char *str,
+				       struct libcfs_debug_msg_data *m);
+
+/* ASSERTION that is safe to use within the debug system */
+#define __LASSERT(cond)						 \
+do {								    \
+	if (unlikely(!(cond))) {					\
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_EMERG, NULL);     \
+		cfs_trace_assertion_failed("ASSERTION("#cond") failed", \
+					   &msgdata);		   \
+	}							       \
+} while (0)
+
+#define __LASSERT_TAGE_INVARIANT(tage)				  \
+do {								    \
+	__LASSERT(tage != NULL);					\
+	__LASSERT(tage->page != NULL);				  \
+	__LASSERT(tage->used <= PAGE_CACHE_SIZE);			 \
+	__LASSERT(page_count(tage->page) > 0);		      \
+} while (0)
+
+#endif	/* LUSTRE_TRACEFILE_PRIVATE */
+
+#endif /* __LIBCFS_TRACEFILE_H__ */
diff --git a/drivers/staging/lustre/lustre/libcfs/upcall_cache.c b/drivers/staging/lustre/lustre/libcfs/upcall_cache.c
new file mode 100644
index 000000000000..18c68c3493b8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/upcall_cache.c
@@ -0,0 +1,462 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/upcall_cache.c
+ *
+ * Supplementary groups cache.
+ */
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/libcfs/lucache.h>
+
+static struct upcall_cache_entry *alloc_entry(struct upcall_cache *cache,
+					      __u64 key, void *args)
+{
+	struct upcall_cache_entry *entry;
+
+	LIBCFS_ALLOC(entry, sizeof(*entry));
+	if (!entry)
+		return NULL;
+
+	UC_CACHE_SET_NEW(entry);
+	INIT_LIST_HEAD(&entry->ue_hash);
+	entry->ue_key = key;
+	atomic_set(&entry->ue_refcount, 0);
+	init_waitqueue_head(&entry->ue_waitq);
+	if (cache->uc_ops->init_entry)
+		cache->uc_ops->init_entry(entry, args);
+	return entry;
+}
+
+/* Unlink @entry from its hash chain and free it; caller must hold uc_lock. */
+static void free_entry(struct upcall_cache *cache,
+		       struct upcall_cache_entry *entry)
+{
+	if (cache->uc_ops->free_entry)
+		cache->uc_ops->free_entry(cache, entry);
+
+	list_del(&entry->ue_hash);
+	CDEBUG(D_OTHER, "destroy cache entry %p for key "LPU64"\n",
+	       entry, entry->ue_key);
+	LIBCFS_FREE(entry, sizeof(*entry));
+}
+
+static inline int upcall_compare(struct upcall_cache *cache,
+				 struct upcall_cache_entry *entry,
+				 __u64 key, void *args)
+{
+	if (entry->ue_key != key)
+		return -1;
+
+	if (cache->uc_ops->upcall_compare)
+		return cache->uc_ops->upcall_compare(cache, entry, key, args);
+
+	return 0;
+}
+
+static inline int downcall_compare(struct upcall_cache *cache,
+				   struct upcall_cache_entry *entry,
+				   __u64 key, void *args)
+{
+	if (entry->ue_key != key)
+		return -1;
+
+	if (cache->uc_ops->downcall_compare)
+		return cache->uc_ops->downcall_compare(cache, entry, key, args);
+
+	return 0;
+}
+
+static inline void get_entry(struct upcall_cache_entry *entry)
+{
+	atomic_inc(&entry->ue_refcount);
+}
+
+static inline void put_entry(struct upcall_cache *cache,
+			     struct upcall_cache_entry *entry)
+{
+	if (atomic_dec_and_test(&entry->ue_refcount) &&
+	    (UC_CACHE_IS_INVALID(entry) || UC_CACHE_IS_EXPIRED(entry))) {
+		free_entry(cache, entry);
+	}
+}
+
+static int check_unlink_entry(struct upcall_cache *cache,
+			      struct upcall_cache_entry *entry)
+{
+	if (UC_CACHE_IS_VALID(entry) &&
+	    cfs_time_before(cfs_time_current(), entry->ue_expire))
+		return 0;
+
+	if (UC_CACHE_IS_ACQUIRING(entry)) {
+		if (entry->ue_acquire_expire == 0 ||
+		    cfs_time_before(cfs_time_current(),
+				    entry->ue_acquire_expire))
+			return 0;
+
+		UC_CACHE_SET_EXPIRED(entry);
+		wake_up_all(&entry->ue_waitq);
+	} else if (!UC_CACHE_IS_INVALID(entry)) {
+		UC_CACHE_SET_EXPIRED(entry);
+	}
+
+	list_del_init(&entry->ue_hash);
+	if (!atomic_read(&entry->ue_refcount))
+		free_entry(cache, entry);
+	return 1;
+}
+
+static inline int refresh_entry(struct upcall_cache *cache,
+			 struct upcall_cache_entry *entry)
+{
+	LASSERT(cache->uc_ops->do_upcall);
+	return cache->uc_ops->do_upcall(cache, entry);
+}
+
+/*
+ * Look up the entry for @key, creating it and issuing the do_upcall op when
+ * no usable entry is cached.  On success the entry is returned with one
+ * reference held for the caller; on failure an ERR_PTR is returned: -ENOMEM,
+ * -EREMCHG, -EINTR/-ETIMEDOUT (acquire unfinished) or -EIDRM (entry invalid).
+ */
+struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *cache,
+						  __u64 key, void *args)
+{
+	struct upcall_cache_entry *entry = NULL, *new = NULL, *next;
+	struct list_head *head;
+	wait_queue_t wait;
+	int rc, found;
+	ENTRY;
+
+	LASSERT(cache);
+	head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)];
+find_again:
+	found = 0;
+	spin_lock(&cache->uc_lock);
+	list_for_each_entry_safe(entry, next, head, ue_hash) {
+		/* drop invalid & expired entries while scanning the chain */
+		if (check_unlink_entry(cache, entry))
+			continue;
+		if (upcall_compare(cache, entry, key, args) == 0) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found) {
+		if (!new) {
+			spin_unlock(&cache->uc_lock);
+			new = alloc_entry(cache, key, args);
+			if (!new) {
+				CERROR("fail to alloc entry\n");
+				RETURN(ERR_PTR(-ENOMEM));
+			}
+			goto find_again;
+		} else {
+			list_add(&new->ue_hash, head);
+			entry = new;
+		}
+	} else {
+		if (new) {
+			free_entry(cache, new);
+			new = NULL;
+		}
+		list_move(&entry->ue_hash, head);
+	}
+	get_entry(entry);
+
+	/* a brand-new entry: this thread issues the upcall for it */
+	if (UC_CACHE_IS_NEW(entry)) {
+		UC_CACHE_SET_ACQUIRING(entry);
+		UC_CACHE_CLEAR_NEW(entry);
+		spin_unlock(&cache->uc_lock);
+		rc = refresh_entry(cache, entry);
+		spin_lock(&cache->uc_lock);
+		entry->ue_acquire_expire =
+			cfs_time_shift(cache->uc_acquire_expire);
+		if (rc < 0) {
+			UC_CACHE_CLEAR_ACQUIRING(entry);
+			UC_CACHE_SET_INVALID(entry);
+			wake_up_all(&entry->ue_waitq);
+			if (unlikely(rc == -EREMCHG)) {
+				put_entry(cache, entry);
+				GOTO(out, entry = ERR_PTR(rc));
+			}
+		}
+	}
+	/* an upcall for this entry is in flight (only one at a time); wait */
+	if (UC_CACHE_IS_ACQUIRING(entry)) {
+		long expiry = (entry == new) ?
+			      cfs_time_seconds(cache->uc_acquire_expire) :
+			      MAX_SCHEDULE_TIMEOUT;
+		long left;
+
+		init_waitqueue_entry_current(&wait);
+		add_wait_queue(&entry->ue_waitq, &wait);
+		set_current_state(TASK_INTERRUPTIBLE);
+		spin_unlock(&cache->uc_lock);
+
+		left = waitq_timedwait(&wait, TASK_INTERRUPTIBLE,
+					   expiry);
+
+		spin_lock(&cache->uc_lock);
+		remove_wait_queue(&entry->ue_waitq, &wait);
+		if (UC_CACHE_IS_ACQUIRING(entry)) {
+			/* still acquiring: interrupted or the wait timed out */
+			rc = left > 0 ? -EINTR : -ETIMEDOUT;
+			CERROR("acquire for key "LPU64": error %d\n",
+			       entry->ue_key, rc);
+			put_entry(cache, entry);
+			GOTO(out, entry = ERR_PTR(rc));
+		}
+	}
+
+	/* invalid means the upcall failed; there is no point in retrying */
+	if (UC_CACHE_IS_INVALID(entry)) {
+		put_entry(cache, entry);
+		GOTO(out, entry = ERR_PTR(-EIDRM));
+	}
+
+	/* Check for expiry.  An expired entry cannot be refreshed in place
+	 * because some of its memory might be shared by other processes. */
+	if (check_unlink_entry(cache, entry)) {
+		/* Expired: look it up again.  An entry this thread just
+		 * created that expired at once without any error is still
+		 * handed back so that it can be used at least once. */
+		if (entry != new) {
+			put_entry(cache, entry);
+			spin_unlock(&cache->uc_lock);
+			new = NULL;
+			goto find_again;
+		}
+	}
+
+	/* the entry is usable; the caller's reference is still held */
+out:
+	spin_unlock(&cache->uc_lock);
+	RETURN(entry);
+}
+EXPORT_SYMBOL(upcall_cache_get_entry);
+
+void upcall_cache_put_entry(struct upcall_cache *cache,
+			    struct upcall_cache_entry *entry)
+{
+	ENTRY;
+
+	if (!entry) {
+		EXIT;
+		return;
+	}
+
+	LASSERT(atomic_read(&entry->ue_refcount) > 0);
+	spin_lock(&cache->uc_lock);
+	put_entry(cache, entry);
+	spin_unlock(&cache->uc_lock);
+	EXIT;
+}
+EXPORT_SYMBOL(upcall_cache_put_entry);
+
+/* Complete the upcall for @key.  With @err == 0 and an ACQUIRING entry, @args
+ * is parsed into the entry and it becomes valid; on any error the entry is
+ * invalidated and unhashed.  Returns 0, -EINVAL or the parse error. */
+int upcall_cache_downcall(struct upcall_cache *cache, __u32 err, __u64 key,
+			  void *args)
+{
+	struct upcall_cache_entry *entry = NULL;
+	struct list_head *head;
+	int found = 0, rc = 0;
+	ENTRY;
+
+	LASSERT(cache);
+	head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)];
+	spin_lock(&cache->uc_lock);
+	list_for_each_entry(entry, head, ue_hash) {
+		if (downcall_compare(cache, entry, key, args) == 0) {
+			found = 1;
+			get_entry(entry);
+			break;
+		}
+	}
+
+	if (!found) {
+		CDEBUG(D_OTHER, "%s: upcall for key "LPU64" not expected\n",
+		       cache->uc_name, key);
+		/* haven't found, it's possible */
+		spin_unlock(&cache->uc_lock);
+		RETURN(-EINVAL);
+	}
+
+	if (err) {
+		CDEBUG(D_OTHER, "%s: upcall for key "LPU64" returned %d\n",
+		       cache->uc_name, entry->ue_key, err);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	if (!UC_CACHE_IS_ACQUIRING(entry)) {
+		CDEBUG(D_RPCTRACE,"%s: found uptodate entry %p (key "LPU64")\n",
+		       cache->uc_name, entry, entry->ue_key);
+		GOTO(out, rc = 0);
+	}
+
+	if (UC_CACHE_IS_INVALID(entry) || UC_CACHE_IS_EXPIRED(entry)) {
+		CERROR("%s: found a stale entry %p (key "LPU64") in ioctl\n",
+		       cache->uc_name, entry, entry->ue_key);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	spin_unlock(&cache->uc_lock);
+	if (cache->uc_ops->parse_downcall)
+		rc = cache->uc_ops->parse_downcall(cache, entry, args);
+	spin_lock(&cache->uc_lock);
+	if (rc)
+		GOTO(out, rc);
+	entry->ue_expire = cfs_time_shift(cache->uc_entry_expire);
+	UC_CACHE_SET_VALID(entry);
+	CDEBUG(D_OTHER, "%s: created upcall cache entry %p for key "LPU64"\n",
+	       cache->uc_name, entry, entry->ue_key);
+out:
+	if (rc) {
+		UC_CACHE_SET_INVALID(entry);
+		list_del_init(&entry->ue_hash);
+	}
+	UC_CACHE_CLEAR_ACQUIRING(entry);
+	wake_up_all(&entry->ue_waitq);
+	/* put_entry() may free the entry, which requires uc_lock held */
+	put_entry(cache, entry);
+	spin_unlock(&cache->uc_lock);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(upcall_cache_downcall);
+
+static void cache_flush(struct upcall_cache *cache, int force)
+{
+	struct upcall_cache_entry *entry, *next;
+	int i;
+	ENTRY;
+
+	spin_lock(&cache->uc_lock);
+	for (i = 0; i < UC_CACHE_HASH_SIZE; i++) {
+		list_for_each_entry_safe(entry, next,
+					 &cache->uc_hashtable[i], ue_hash) {
+			if (!force && atomic_read(&entry->ue_refcount)) {
+				UC_CACHE_SET_EXPIRED(entry);
+				continue;
+			}
+			LASSERT(!atomic_read(&entry->ue_refcount));
+			free_entry(cache, entry);
+		}
+	}
+	spin_unlock(&cache->uc_lock);
+	EXIT;
+}
+
+void upcall_cache_flush_idle(struct upcall_cache *cache)
+{
+	cache_flush(cache, 0);
+}
+EXPORT_SYMBOL(upcall_cache_flush_idle);
+
+void upcall_cache_flush_all(struct upcall_cache *cache)
+{
+	cache_flush(cache, 1);
+}
+EXPORT_SYMBOL(upcall_cache_flush_all);
+
+void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key, void *args)
+{
+	struct list_head *head;
+	struct upcall_cache_entry *entry;
+	int found = 0;
+	ENTRY;
+
+	head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)];
+
+	spin_lock(&cache->uc_lock);
+	list_for_each_entry(entry, head, ue_hash) {
+		if (upcall_compare(cache, entry, key, args) == 0) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (found) {
+		CWARN("%s: flush entry %p: key "LPU64", ref %d, fl %x, "
+		      "cur %lu, ex %ld/%ld\n",
+		      cache->uc_name, entry, entry->ue_key,
+		      atomic_read(&entry->ue_refcount), entry->ue_flags,
+		      cfs_time_current_sec(), entry->ue_acquire_expire,
+		      entry->ue_expire);
+		UC_CACHE_SET_EXPIRED(entry);
+		if (!atomic_read(&entry->ue_refcount))
+			free_entry(cache, entry);
+	}
+	spin_unlock(&cache->uc_lock);
+}
+EXPORT_SYMBOL(upcall_cache_flush_one);
+
+struct upcall_cache *upcall_cache_init(const char *name, const char *upcall,
+				       struct upcall_cache_ops *ops)
+{
+	struct upcall_cache *cache;
+	int i;
+	ENTRY;
+
+	LIBCFS_ALLOC(cache, sizeof(*cache));
+	if (!cache)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	spin_lock_init(&cache->uc_lock);
+	rwlock_init(&cache->uc_upcall_rwlock);
+	for (i = 0; i < UC_CACHE_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&cache->uc_hashtable[i]);
+	strncpy(cache->uc_name, name, sizeof(cache->uc_name) - 1);
+	/* upcall pathname proc tunable */
+	strncpy(cache->uc_upcall, upcall, sizeof(cache->uc_upcall) - 1);
+	cache->uc_entry_expire = 20 * 60;
+	cache->uc_acquire_expire = 30;
+	cache->uc_ops = ops;
+
+	RETURN(cache);
+}
+EXPORT_SYMBOL(upcall_cache_init);
+
+void upcall_cache_cleanup(struct upcall_cache *cache)
+{
+	if (!cache)
+		return;
+	upcall_cache_flush_all(cache);
+	LIBCFS_FREE(cache, sizeof(*cache));
+}
+EXPORT_SYMBOL(upcall_cache_cleanup);
diff --git a/drivers/staging/lustre/lustre/libcfs/watchdog.c b/drivers/staging/lustre/lustre/libcfs/watchdog.c
new file mode 100644
index 000000000000..7c385ada3e10
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/watchdog.c
@@ -0,0 +1,516 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/watchdog.c
+ *
+ * Author: Jacob Berkman <jacob@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include "tracefile.h"
+
+struct lc_watchdog {
+	spinlock_t  lcw_lock;     /* check or change lcw_list */
+	int	     lcw_refcount; /* must hold lcw_pending_timers_lock */
+	timer_list_t     lcw_timer;    /* kernel timer */
+	struct list_head      lcw_list;     /* chain on pending list */
+	cfs_time_t      lcw_last_touched; /* last touched stamp */
+	task_t     *lcw_task;     /* owner task */
+	void	  (*lcw_callback)(pid_t, void *);
+	void	   *lcw_data;
+
+	pid_t	   lcw_pid;
+
+	enum {
+		LC_WATCHDOG_DISABLED,
+		LC_WATCHDOG_ENABLED,
+		LC_WATCHDOG_EXPIRED
+	} lcw_state;
+};
+
+#ifdef WITH_WATCHDOG
+/*
+ * The dispatcher will complete lcw_start_completion when it starts,
+ * and lcw_stop_completion when it exits.
+ * Wake lcw_event_waitq to signal timer callback dispatches.
+ */
+static struct completion lcw_start_completion;
+static struct completion  lcw_stop_completion;
+static wait_queue_head_t lcw_event_waitq;
+
+/*
+ * Set this and wake lcw_event_waitq to stop the dispatcher.
+ */
+enum {
+	LCW_FLAG_STOP = 0
+};
+static unsigned long lcw_flags = 0;
+
+/*
+ * Number of outstanding watchdogs.
+ * When it hits 1, we start the dispatcher.
+ * When it hits 0, we stop the dispatcher.
+ */
+static __u32	 lcw_refcount = 0;
+static DEFINE_MUTEX(lcw_refcount_mutex);
+
+/*
+ * List of timers that have fired that need their callbacks run by the
+ * dispatcher.
+ */
+/* BH lock! */
+static DEFINE_SPINLOCK(lcw_pending_timers_lock);
+static struct list_head lcw_pending_timers = LIST_HEAD_INIT(lcw_pending_timers);
+
+/* Last time a watchdog expired */
+static cfs_time_t lcw_last_watchdog_time;
+static int lcw_recent_watchdog_count;
+
+static void
+lcw_dump(struct lc_watchdog *lcw)
+{
+	ENTRY;
+	rcu_read_lock();
+       if (lcw->lcw_task == NULL) {
+		LCONSOLE_WARN("Process " LPPID " was not found in the task "
+			      "list; watchdog callback may be incomplete\n",
+			      (int)lcw->lcw_pid);
+	} else {
+		libcfs_debug_dumpstack(lcw->lcw_task);
+	}
+
+	rcu_read_unlock();
+	EXIT;
+}
+
+static void lcw_cb(ulong_ptr_t data)
+{
+	struct lc_watchdog *lcw = (struct lc_watchdog *)data;
+	ENTRY;
+
+	if (lcw->lcw_state != LC_WATCHDOG_ENABLED) {
+		EXIT;
+		return;
+	}
+
+	lcw->lcw_state = LC_WATCHDOG_EXPIRED;
+
+	spin_lock_bh(&lcw->lcw_lock);
+	LASSERT(list_empty(&lcw->lcw_list));
+
+	spin_lock_bh(&lcw_pending_timers_lock);
+	lcw->lcw_refcount++; /* +1 for pending list */
+	list_add(&lcw->lcw_list, &lcw_pending_timers);
+	wake_up(&lcw_event_waitq);
+
+	spin_unlock_bh(&lcw_pending_timers_lock);
+	spin_unlock_bh(&lcw->lcw_lock);
+	EXIT;
+}
+
+static int is_watchdog_fired(void)
+{
+	int rc;
+
+	if (test_bit(LCW_FLAG_STOP, &lcw_flags))
+		return 1;
+
+	spin_lock_bh(&lcw_pending_timers_lock);
+	rc = !list_empty(&lcw_pending_timers);
+	spin_unlock_bh(&lcw_pending_timers_lock);
+	return rc;
+}
+
+static void lcw_dump_stack(struct lc_watchdog *lcw)
+{
+	cfs_time_t      current_time;
+	cfs_duration_t  delta_time;
+	struct timeval  timediff;
+
+	current_time = cfs_time_current();
+	delta_time = cfs_time_sub(current_time, lcw->lcw_last_touched);
+	cfs_duration_usec(delta_time, &timediff);
+
+	/*
+	 * Check to see if we should throttle the watchdog timer to avoid
+	 * too many dumps going to the console thus triggering an NMI.
+	 */
+	delta_time = cfs_duration_sec(cfs_time_sub(current_time,
+						   lcw_last_watchdog_time));
+
+	if (delta_time < libcfs_watchdog_ratelimit &&
+	    lcw_recent_watchdog_count > 3) {
+		LCONSOLE_WARN("Service thread pid %u was inactive for "
+			      "%lu.%.02lus. Watchdog stack traces are limited "
+			      "to 3 per %d seconds, skipping this one.\n",
+			      (int)lcw->lcw_pid,
+			      timediff.tv_sec,
+			      timediff.tv_usec / 10000,
+			      libcfs_watchdog_ratelimit);
+	} else {
+		if (delta_time < libcfs_watchdog_ratelimit) {
+			lcw_recent_watchdog_count++;
+		} else {
+			memcpy(&lcw_last_watchdog_time, &current_time,
+			       sizeof(current_time));
+			lcw_recent_watchdog_count = 0;
+		}
+
+		LCONSOLE_WARN("Service thread pid %u was inactive for "
+			      "%lu.%.02lus. The thread might be hung, or it "
+			      "might only be slow and will resume later. "
+			      "Dumping the stack trace for debugging purposes:"
+			      "\n",
+			      (int)lcw->lcw_pid,
+			      timediff.tv_sec,
+			      timediff.tv_usec / 10000);
+		lcw_dump(lcw);
+	}
+}
+
+/* Dispatcher thread: sleeps until lcw_cb() queues an expired watchdog or
+ * LCW_FLAG_STOP is set, then dumps the stack and runs the callback of every
+ * pending watchdog, and frees those whose last reference it dropped. */
+static int lcw_dispatch_main(void *data)
+{
+	int		 rc = 0;
+	struct lc_watchdog *lcw;
+	LIST_HEAD(zombies);
+	ENTRY;
+
+	complete(&lcw_start_completion);
+
+	while (1) {
+		int dumplog = 1;
+
+		cfs_wait_event_interruptible(lcw_event_waitq,
+					     is_watchdog_fired(), rc);
+		CDEBUG(D_INFO, "Watchdog got woken up...\n");
+		if (test_bit(LCW_FLAG_STOP, &lcw_flags)) {
+			CDEBUG(D_INFO, "LCW_FLAG_STOP set, shutting down...\n");
+			spin_lock_bh(&lcw_pending_timers_lock);
+			rc = !list_empty(&lcw_pending_timers);
+			spin_unlock_bh(&lcw_pending_timers_lock);
+			if (rc) {
+				CERROR("pending timers list was not empty at "
+				       "time of watchdog dispatch shutdown\n");
+			}
+			break;
+		}
+
+		spin_lock_bh(&lcw_pending_timers_lock);
+		while (!list_empty(&lcw_pending_timers)) {
+			int is_dumplog;
+
+			lcw = list_entry(lcw_pending_timers.next,
+					     struct lc_watchdog, lcw_list);
+			/* +1 ref for callback to make sure lcw is not freed
+			 * after releasing lcw_pending_timers_lock */
+			lcw->lcw_refcount++;
+			spin_unlock_bh(&lcw_pending_timers_lock);
+			/* lock order: lcw_lock, then the pending list lock */
+			spin_lock_bh(&lcw->lcw_lock);
+			spin_lock_bh(&lcw_pending_timers_lock);
+
+			if (list_empty(&lcw->lcw_list)) {
+				/* already removed from pending list */
+				lcw->lcw_refcount--; /* -1 ref for callback */
+				if (lcw->lcw_refcount == 0)
+					list_add(&lcw->lcw_list, &zombies);
+				spin_unlock_bh(&lcw->lcw_lock);
+				/* still hold lcw_pending_timers_lock */
+				continue;
+			}
+
+			list_del_init(&lcw->lcw_list);
+			lcw->lcw_refcount--; /* -1 ref for pending list */
+			spin_unlock_bh(&lcw_pending_timers_lock);
+			spin_unlock_bh(&lcw->lcw_lock);
+
+			CDEBUG(D_INFO, "found lcw for pid " LPPID "\n",
+			       lcw->lcw_pid);
+			lcw_dump_stack(lcw);
+
+			is_dumplog = lcw->lcw_callback == lc_watchdog_dumplog;
+			if (lcw->lcw_state != LC_WATCHDOG_DISABLED &&
+			    (dumplog || !is_dumplog)) {
+				lcw->lcw_callback(lcw->lcw_pid, lcw->lcw_data);
+				if (dumplog && is_dumplog)
+					dumplog = 0;
+			}
+
+			spin_lock_bh(&lcw_pending_timers_lock);
+			lcw->lcw_refcount--; /* -1 ref for callback */
+			if (lcw->lcw_refcount == 0)
+				list_add(&lcw->lcw_list, &zombies);
+		}
+		spin_unlock_bh(&lcw_pending_timers_lock);
+
+		/* free the watchdogs whose last reference was dropped above */
+		while (!list_empty(&zombies)) {
+			lcw = list_entry(zombies.next,
+					 struct lc_watchdog, lcw_list);
+			list_del(&lcw->lcw_list);
+			LIBCFS_FREE(lcw, sizeof(*lcw));
+		}
+	}
+
+	complete(&lcw_stop_completion);
+
+	RETURN(rc);
+}
+
+/* Spawn the watchdog dispatch thread and wait until it reports startup. */
+static void lcw_dispatch_start(void)
+{
+	task_t *dispatcher;
+
+	ENTRY;
+	LASSERT(lcw_refcount == 1);
+
+	/* set up the synchronisation objects before the thread is created */
+	init_waitqueue_head(&lcw_event_waitq);
+	init_completion(&lcw_start_completion);
+	init_completion(&lcw_stop_completion);
+	CDEBUG(D_INFO, "starting dispatch thread\n");
+	dispatcher = kthread_run(lcw_dispatch_main, NULL, "lc_watchdogd");
+	if (!IS_ERR(dispatcher)) {
+		wait_for_completion(&lcw_start_completion);
+		CDEBUG(D_INFO, "watchdog dispatcher initialization complete.\n");
+	} else {
+		CERROR("error spawning watchdog dispatch thread: %ld\n",
+			PTR_ERR(dispatcher));
+	}
+
+	EXIT;
+}
+
+static void lcw_dispatch_stop(void)
+{
+	ENTRY;
+	LASSERT(lcw_refcount == 0);
+	/* Stop the dispatch thread: flag it, wake it, wait for it to finish. */
+	CDEBUG(D_INFO, "trying to stop watchdog dispatcher.\n");
+	/* set the stop flag before the wake-up so the thread can observe it */
+	set_bit(LCW_FLAG_STOP, &lcw_flags);
+	wake_up(&lcw_event_waitq);
+	/* lcw_dispatch_main() completes lcw_stop_completion before returning */
+	wait_for_completion(&lcw_stop_completion);
+
+	CDEBUG(D_INFO, "watchdog dispatcher has shut down.\n");
+
+	EXIT;
+}
+
+/* Allocate a disabled watchdog owned by the current task; ERR_PTR on OOM. */
+struct lc_watchdog *
+lc_watchdog_add(int timeout, void (*callback)(pid_t, void *), void *data)
+{
+	struct lc_watchdog *lcw = NULL;
+	ENTRY;
+
+	LIBCFS_ALLOC(lcw, sizeof(*lcw));
+	if (lcw == NULL) {
+		CDEBUG(D_INFO, "Could not allocate new lc_watchdog\n");
+		RETURN(ERR_PTR(-ENOMEM));
+	}
+
+	/* the creator holds the only reference; NULL selects the log dumper */
+	lcw->lcw_refcount = 1;
+	lcw->lcw_state = LC_WATCHDOG_DISABLED;
+	lcw->lcw_data = data;
+	lcw->lcw_task = current;
+	lcw->lcw_pid = current_pid();
+	lcw->lcw_callback = (callback == NULL) ? lc_watchdog_dumplog : callback;
+	spin_lock_init(&lcw->lcw_lock);
+	INIT_LIST_HEAD(&lcw->lcw_list);
+	cfs_timer_init(&lcw->lcw_timer, lcw_cb, lcw);
+
+	/* the first watchdog in the system starts the dispatch thread */
+	mutex_lock(&lcw_refcount_mutex);
+	if (++lcw_refcount == 1)
+		lcw_dispatch_start();
+	mutex_unlock(&lcw_refcount_mutex);
+
+	/* Keep this working in case we enable them by default */
+	if (lcw->lcw_state == LC_WATCHDOG_ENABLED) {
+		lcw->lcw_last_touched = cfs_time_current();
+		cfs_timer_arm(&lcw->lcw_timer, cfs_time_current() +
+			      cfs_time_seconds(timeout));
+	}
+	RETURN(lcw);
+}
+EXPORT_SYMBOL(lc_watchdog_add);
+
+/* Stamp @lcw as touched now; if it had expired, report how late it was. */
+static void lcw_update_time(struct lc_watchdog *lcw, const char *message)
+{
+	cfs_time_t newtime = cfs_time_current();
+
+	if (lcw->lcw_state == LC_WATCHDOG_EXPIRED) {
+		struct timeval timediff;
+		cfs_time_t delta_time = cfs_time_sub(newtime,
+						     lcw->lcw_last_touched);
+		cfs_duration_usec(delta_time, &timediff);
+		/* tv_usec / 10000 is in 0..99: hundredths of a second */
+		LCONSOLE_WARN("Service thread pid %u %s after %lu.%.02lus. "
+			      "This indicates the system was overloaded (too "
+			      "many service threads, or there were not enough "
+			      "hardware resources).\n",
+			      lcw->lcw_pid, message,
+			      timediff.tv_sec,
+			      timediff.tv_usec / 10000);
+	}
+	lcw->lcw_last_touched = newtime;
+}
+
+/* Unlink @lcw from the pending-timers list, dropping the list's reference. */
+static void lc_watchdog_del_pending(struct lc_watchdog *lcw)
+{
+	spin_lock_bh(&lcw->lcw_lock);
+	if (unlikely(!list_empty(&lcw->lcw_list))) {
+		spin_lock_bh(&lcw_pending_timers_lock);
+		list_del_init(&lcw->lcw_list);
+		lcw->lcw_refcount--; /* -1 ref for pending list */
+		spin_unlock_bh(&lcw_pending_timers_lock);
+	}
+	spin_unlock_bh(&lcw->lcw_lock);
+}
+
+/* Re-enable @lcw and re-arm its timer to fire @timeout seconds from now. */
+void lc_watchdog_touch(struct lc_watchdog *lcw, int timeout)
+{
+	cfs_time_t deadline;
+
+	ENTRY;
+	LASSERT(lcw != NULL);
+	/* drop any queued expiry, then log lateness before changing state */
+	lc_watchdog_del_pending(lcw);
+	lcw_update_time(lcw, "resumed");
+	lcw->lcw_state = LC_WATCHDOG_ENABLED;
+	deadline = cfs_time_current() + cfs_time_seconds(timeout);
+	cfs_timer_arm(&lcw->lcw_timer, deadline);
+	EXIT;
+}
+EXPORT_SYMBOL(lc_watchdog_touch);
+
+void lc_watchdog_disable(struct lc_watchdog *lcw)
+{
+	ENTRY;
+	LASSERT(lcw != NULL);
+	/* Take @lcw off the pending list; the timer itself is not disarmed. */
+	lc_watchdog_del_pending(lcw);
+	/* lcw_update_time() must still see the old state to report lateness */
+	lcw_update_time(lcw, "completed");
+	lcw->lcw_state = LC_WATCHDOG_DISABLED;
+
+	EXIT;
+}
+EXPORT_SYMBOL(lc_watchdog_disable);
+
+/* Drop the owner's reference on @lcw; the last watchdog stops the thread. */
+void lc_watchdog_delete(struct lc_watchdog *lcw)
+{
+	int last_ref;
+
+	ENTRY;
+	LASSERT(lcw != NULL);
+
+	cfs_timer_disarm(&lcw->lcw_timer);
+	lcw_update_time(lcw, "stopped");
+	/* lock order: per-watchdog lock, then the pending-list lock */
+	spin_lock_bh(&lcw->lcw_lock);
+	spin_lock_bh(&lcw_pending_timers_lock);
+	if (unlikely(!list_empty(&lcw->lcw_list))) {
+		/* still queued for dispatch: give back the list's reference */
+		list_del_init(&lcw->lcw_list);
+		lcw->lcw_refcount--;
+	}
+	/* release the owner's reference and see whether any other remains */
+	last_ref = (--lcw->lcw_refcount == 0);
+	spin_unlock_bh(&lcw_pending_timers_lock);
+	spin_unlock_bh(&lcw->lcw_lock);
+
+	/* otherwise the dispatch thread's callback reference is still live */
+	if (last_ref)
+		LIBCFS_FREE(lcw, sizeof(*lcw));
+
+	mutex_lock(&lcw_refcount_mutex);
+	if (--lcw_refcount == 0)
+		lcw_dispatch_stop();
+	mutex_unlock(&lcw_refcount_mutex);
+	EXIT;
+}
+EXPORT_SYMBOL(lc_watchdog_delete);
+
+/*
+ * Provided watchdog handlers
+ */
+/* Default callback: dumps the libcfs debug log with @pid as its argument. */
+void lc_watchdog_dumplog(pid_t pid, void *data)
+{
+	libcfs_debug_dumplog_internal((void *)((long_ptr_t)pid));
+}
+EXPORT_SYMBOL(lc_watchdog_dumplog);
+
+#else   /* !defined(WITH_WATCHDOG) */
+
+/* Built without WITH_WATCHDOG: every caller gets the same static dummy. */
+struct lc_watchdog *
+lc_watchdog_add(int timeout, void (*callback)(pid_t pid, void *), void *data)
+{
+	static struct lc_watchdog dummy_watchdog;
+	return &dummy_watchdog;
+}
+EXPORT_SYMBOL(lc_watchdog_add);
+
+/* No-op: built without WITH_WATCHDOG, there is no timer to re-arm. */
+void lc_watchdog_touch(struct lc_watchdog *lcw, int timeout)
+{ }
+EXPORT_SYMBOL(lc_watchdog_touch);
+
+/* No-op: built without WITH_WATCHDOG, there is nothing to disable. */
+void lc_watchdog_disable(struct lc_watchdog *lcw)
+{ }
+EXPORT_SYMBOL(lc_watchdog_disable);
+
+/* No-op: the stub lc_watchdog_add() hands out a static object, never freed. */
+void lc_watchdog_delete(struct lc_watchdog *lcw)
+{ }
+EXPORT_SYMBOL(lc_watchdog_delete);
+
+#endif
diff --git a/drivers/staging/lustre/lustre/libcfs/workitem.c b/drivers/staging/lustre/lustre/libcfs/workitem.c
new file mode 100644
index 000000000000..b533666c1900
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/workitem.c
@@ -0,0 +1,475 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/workitem.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ *	 Liang Zhen  <zhen.liang@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+#define CFS_WS_NAME_LEN	 16
+
+typedef struct cfs_wi_sched {
+	struct list_head		ws_list;	/* chain on cfs_wi_data::wi_scheds */
+	/** taken by cfs_wi_sched_lock() around the queues and ws_nscheduled */
+	spinlock_t		ws_lock;
+	/** where schedulers sleep */
+	wait_queue_head_t		ws_waitq;
+	/** workitems waiting to run, consumed by cfs_wi_scheduler() */
+	struct list_head		ws_runq;
+	/** rescheduled running-workitems: a workitem can be rescheduled
+	 * while running in wi_action(), but we don't want to execute it
+	 * again until it returns from wi_action(), so it is put on
+	 * ws_rerunq while rescheduling and moved to ws_runq after it
+	 * returns from wi_action() */
+	struct list_head		ws_rerunq;
+	/** CPT-table for this scheduler */
+	struct cfs_cpt_table	*ws_cptab;
+	/** CPT id for affinity */
+	int			ws_cpt;
+	/** number of scheduled workitems */
+	int			ws_nscheduled;
+	/** number of started threads, protected by cfs_wi_data::wi_glock */
+	unsigned int		ws_nthreads:30;
+	/** shutting down, protected by cfs_wi_data::wi_glock */
+	unsigned int		ws_stopping:1;
+	/** serialize starting thread, protected by cfs_wi_data::wi_glock */
+	unsigned int		ws_starting:1;
+	/** scheduler name */
+	char			ws_name[CFS_WS_NAME_LEN];
+} cfs_wi_sched_t;
+
+struct cfs_workitem_data {
+	/** serializes wi_scheds and each scheduler's thread bookkeeping */
+	spinlock_t		wi_glock;
+	/** list of all schedulers, linked through cfs_wi_sched::ws_list */
+	struct list_head		wi_scheds;
+	/** WI module is initialized: set by cfs_wi_startup() */
+	int			wi_init;
+	/** shutting down the whole WI module: set in cfs_wi_shutdown() */
+	int			wi_stopping;
+} cfs_wi_data;
+
+/* Take @sched's ws_lock (a plain spin_lock(), see the in_interrupt asserts). */
+static inline void cfs_wi_sched_lock(cfs_wi_sched_t *sched)
+{
+	spin_lock(&sched->ws_lock);
+}
+
+/* Release @sched's ws_lock taken by cfs_wi_sched_lock(). */
+static inline void cfs_wi_sched_unlock(cfs_wi_sched_t *sched)
+{
+	spin_unlock(&sched->ws_lock);
+}
+
+/*
+ * Return 1 if the scheduler thread may sleep, i.e. the scheduler is not
+ * stopping and ws_runq is empty; otherwise return 0.
+ */
+static inline int
+cfs_wi_sched_cansleep(cfs_wi_sched_t *sched)
+{
+	int idle;
+
+	/* sample both conditions inside one critical section */
+	cfs_wi_sched_lock(sched);
+	idle = !sched->ws_stopping && list_empty(&sched->ws_runq);
+	cfs_wi_sched_unlock(sched);
+
+	return idle;
+}
+
+
+/* XXX:
+ * 0. it only works when called from wi->wi_action.
+ * 1. when it returns no one shall try to schedule the workitem.
+ */
+void
+cfs_wi_exit(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
+{
+	LASSERT(!in_interrupt()); /* because we use plain spinlock */
+	LASSERT(!sched->ws_stopping);
+	/* Retire @wi: unlink any pending schedule and block future ones. */
+	cfs_wi_sched_lock(sched);
+
+	LASSERT(wi->wi_running);
+	if (wi->wi_scheduled) { /* cancel pending schedules */
+		LASSERT(!list_empty(&wi->wi_list));
+		list_del_init(&wi->wi_list);
+
+		LASSERT(sched->ws_nscheduled > 0);
+		sched->ws_nscheduled--;
+	}
+
+	LASSERT(list_empty(&wi->wi_list));
+	/* scheduled but unlinked: cfs_wi_schedule() would trip its LASSERT */
+	wi->wi_scheduled = 1; /* LBUG future schedule attempts */
+	cfs_wi_sched_unlock(sched);
+
+	return;
+}
+EXPORT_SYMBOL(cfs_wi_exit);
+
+/**
+ * cancel schedule request of workitem \a wi
+ */
+int
+cfs_wi_deschedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
+{
+	int	not_running;
+
+	LASSERT(!in_interrupt()); /* because we use plain spinlock */
+	LASSERT(!sched->ws_stopping);
+
+	cfs_wi_sched_lock(sched);
+
+	if (wi->wi_scheduled) {
+		/* still queued on ws_runq or ws_rerunq: pull it off */
+		LASSERT(!list_empty(&wi->wi_list));
+		list_del_init(&wi->wi_list);
+
+		LASSERT(sched->ws_nscheduled > 0);
+		sched->ws_nscheduled--;
+
+		wi->wi_scheduled = 0;
+	}
+	LASSERT (list_empty(&wi->wi_list));
+
+	/*
+	 * 0 tells the caller that wi_action() is executing right now;
+	 * 1 means the workitem is idle and, being descheduled, will
+	 * not race with wi_action().
+	 */
+	not_running = !wi->wi_running;
+	cfs_wi_sched_unlock(sched);
+
+	return not_running;
+}
+EXPORT_SYMBOL(cfs_wi_deschedule);
+
+/*
+ * Workitem scheduled with (serial == 1) is strictly serialised not only with
+ * itself, but also with others scheduled this way.
+ *
+ * Now there's only one static serialised queue, but in the future more might
+ * be added, and even dynamic creation of serialised queues might be supported.
+ */
+void
+cfs_wi_schedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
+{
+	LASSERT(!in_interrupt()); /* because we use plain spinlock */
+	LASSERT(!sched->ws_stopping);
+
+	cfs_wi_sched_lock(sched);
+	if (wi->wi_scheduled)
+		goto queued;	/* duplicate request: already on a list */
+
+	LASSERT (list_empty(&wi->wi_list));
+	wi->wi_scheduled = 1;
+	sched->ws_nscheduled++;
+	if (wi->wi_running) {
+		/* wi_action() is in progress: park it until that returns */
+		list_add(&wi->wi_list, &sched->ws_rerunq);
+	} else {
+		list_add_tail(&wi->wi_list, &sched->ws_runq);
+		wake_up(&sched->ws_waitq);
+	}
+
+queued:
+	LASSERT (!list_empty(&wi->wi_list));
+	cfs_wi_sched_unlock(sched);
+}
+EXPORT_SYMBOL(cfs_wi_schedule);
+
+
+static int
+cfs_wi_scheduler (void *arg)
+{
+	struct cfs_wi_sched	*sched = (cfs_wi_sched_t *)arg;
+	/* Thread body: run workitems queued on @arg until ws_stopping is set */
+	cfs_block_allsigs();
+
+	/* CPT affinity scheduler? */
+	if (sched->ws_cptab != NULL)
+		cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt);
+
+	spin_lock(&cfs_wi_data.wi_glock);
+	/* startup handshake: cfs_wi_sched_create() waits on ws_starting */
+	LASSERT(sched->ws_starting == 1);
+	sched->ws_starting--;
+	sched->ws_nthreads++;
+
+	spin_unlock(&cfs_wi_data.wi_glock);
+
+	cfs_wi_sched_lock(sched);
+
+	while (!sched->ws_stopping) {
+		int	     nloops = 0;
+		int	     rc;
+		cfs_workitem_t *wi;
+
+		while (!list_empty(&sched->ws_runq) &&
+		       nloops < CFS_WI_RESCHED) {
+			wi = list_entry(sched->ws_runq.next,
+					    cfs_workitem_t, wi_list);
+			LASSERT(wi->wi_scheduled && !wi->wi_running);
+
+			list_del_init(&wi->wi_list);
+
+			LASSERT(sched->ws_nscheduled > 0);
+			sched->ws_nscheduled--;
+
+			wi->wi_running   = 1;
+			wi->wi_scheduled = 0;
+
+			/* wi_action() is called without ws_lock held */
+			cfs_wi_sched_unlock(sched);
+			nloops++;
+
+			rc = (*wi->wi_action) (wi);
+
+			cfs_wi_sched_lock(sched);
+			if (rc != 0) /* WI should be dead, even be freed! */
+				continue;
+
+			wi->wi_running = 0;
+			if (list_empty(&wi->wi_list))
+				continue;
+
+			LASSERT(wi->wi_scheduled);
+			/* wi is rescheduled, should be on rerunq now, we
+			 * move it to runq so it can run action now */
+			list_move_tail(&wi->wi_list, &sched->ws_runq);
+		}
+
+		if (!list_empty(&sched->ws_runq)) {
+			cfs_wi_sched_unlock(sched);
+			/* don't sleep because some workitems still
+			 * expect me to come back soon */
+			cond_resched();
+			cfs_wi_sched_lock(sched);
+			continue;
+		}
+		/* runq is empty: sleep until there is work or we must stop */
+		cfs_wi_sched_unlock(sched);
+		cfs_wait_event_interruptible_exclusive(sched->ws_waitq,
+				!cfs_wi_sched_cansleep(sched), rc);
+		cfs_wi_sched_lock(sched);
+	}
+
+	cfs_wi_sched_unlock(sched);
+	/* cfs_wi_sched_destroy()/cfs_wi_shutdown() poll ws_nthreads */
+	spin_lock(&cfs_wi_data.wi_glock);
+	sched->ws_nthreads--;
+	spin_unlock(&cfs_wi_data.wi_glock);
+
+	return 0;
+}
+
+
+/* Stop all threads of @sched, unlink it from the global list and free it. */
+void
+cfs_wi_sched_destroy(struct cfs_wi_sched *sched)
+{
+	int	npolls = 2;
+
+	LASSERT(cfs_wi_data.wi_init);
+	LASSERT(!cfs_wi_data.wi_stopping);
+
+	spin_lock(&cfs_wi_data.wi_glock);
+	if (sched->ws_stopping) {
+		/* a teardown of this scheduler is already under way */
+		CDEBUG(D_INFO, "%s is in progress of stopping\n",
+		       sched->ws_name);
+		spin_unlock(&cfs_wi_data.wi_glock);
+		return;
+	}
+
+	LASSERT(!list_empty(&sched->ws_list));
+	sched->ws_stopping = 1;
+	spin_unlock(&cfs_wi_data.wi_glock);
+
+	/* sleeping threads re-check ws_stopping once they are woken */
+	wake_up_all(&sched->ws_waitq);
+
+	/* poll for thread exit; IS_PO2() picks the louder debug mask */
+	spin_lock(&cfs_wi_data.wi_glock);
+	while (sched->ws_nthreads > 0) {
+		CDEBUG(IS_PO2(++npolls) ? D_WARNING : D_NET,
+		       "waiting for %d threads of WI sched[%s] to terminate\n",
+		       sched->ws_nthreads, sched->ws_name);
+		spin_unlock(&cfs_wi_data.wi_glock);
+		cfs_pause(cfs_time_seconds(1) / 20);
+		spin_lock(&cfs_wi_data.wi_glock);
+	}
+
+	list_del(&sched->ws_list);
+	spin_unlock(&cfs_wi_data.wi_glock);
+
+	LASSERT(sched->ws_nscheduled == 0);
+	LIBCFS_FREE(sched, sizeof(*sched));
+}
+EXPORT_SYMBOL(cfs_wi_sched_destroy);
+
+/* Create scheduler @name with @nthrs threads (bound to @cpt of @cptab if
+ * given) and return it in @sched_pp; on error return a negative errno and
+ * destroy the partially built scheduler. */
+int
+cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab,
+		    int cpt, int nthrs, struct cfs_wi_sched **sched_pp)
+{
+	struct cfs_wi_sched	*sched;
+	int			rc;
+
+	LASSERT(cfs_wi_data.wi_init);
+	LASSERT(!cfs_wi_data.wi_stopping);
+	LASSERT(cptab == NULL || cpt == CFS_CPT_ANY ||
+		(cpt >= 0 && cpt < cfs_cpt_number(cptab)));
+
+	LIBCFS_ALLOC(sched, sizeof(*sched));
+	if (sched == NULL)
+		return -ENOMEM;
+
+	/* strncpy() does not terminate on truncation, so do it by hand */
+	strncpy(sched->ws_name, name, CFS_WS_NAME_LEN - 1);
+	sched->ws_name[CFS_WS_NAME_LEN - 1] = '\0';
+	sched->ws_cptab = cptab;
+	sched->ws_cpt = cpt;
+	spin_lock_init(&sched->ws_lock);
+	init_waitqueue_head(&sched->ws_waitq);
+	INIT_LIST_HEAD(&sched->ws_runq);
+	INIT_LIST_HEAD(&sched->ws_rerunq);
+	INIT_LIST_HEAD(&sched->ws_list);
+
+	while (nthrs > 0)  {
+		char	thr_name[16];	/* named so as not to shadow @name */
+		task_t	*task;
+		/* start one thread at a time: wait for the previous one */
+		spin_lock(&cfs_wi_data.wi_glock);
+		while (sched->ws_starting > 0) {
+			spin_unlock(&cfs_wi_data.wi_glock);
+			schedule();
+			spin_lock(&cfs_wi_data.wi_glock);
+		}
+		sched->ws_starting++;
+		spin_unlock(&cfs_wi_data.wi_glock);
+
+		if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) {
+			snprintf(thr_name, sizeof(thr_name), "%s_%02d_%02d",
+				 sched->ws_name, sched->ws_cpt,
+				 sched->ws_nthreads);
+		} else {
+			snprintf(thr_name, sizeof(thr_name), "%s_%02d",
+				 sched->ws_name, sched->ws_nthreads);
+		}
+		/* kthread_run() takes a format string: never pass data as it */
+		task = kthread_run(cfs_wi_scheduler, sched, "%s", thr_name);
+		if (!IS_ERR(task)) {
+			nthrs--;
+			continue;
+		}
+		rc = PTR_ERR(task);
+		CERROR("Failed to create thread for WI scheduler %s: %d\n",
+		       thr_name, rc);
+
+		spin_lock(&cfs_wi_data.wi_glock);
+		/* make up for cfs_wi_sched_destroy */
+		list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
+		sched->ws_starting--;
+		spin_unlock(&cfs_wi_data.wi_glock);
+
+		cfs_wi_sched_destroy(sched);
+		return rc;
+	}
+	spin_lock(&cfs_wi_data.wi_glock);
+	list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
+	spin_unlock(&cfs_wi_data.wi_glock);
+
+	*sched_pp = sched;
+	return 0;
+}
+EXPORT_SYMBOL(cfs_wi_sched_create);
+
+/* Reset the global workitem state and mark the module initialized. */
+int cfs_wi_startup(void)
+{
+	memset(&cfs_wi_data, 0, sizeof(cfs_wi_data));
+
+	INIT_LIST_HEAD(&cfs_wi_data.wi_scheds);
+	spin_lock_init(&cfs_wi_data.wi_glock);
+	cfs_wi_data.wi_init = 1;
+
+	return 0;
+}
+
+/* Stop every remaining scheduler, wait for its threads, then free it. */
+void
+cfs_wi_shutdown (void)
+{
+	struct cfs_wi_sched	*sched;
+	struct cfs_wi_sched	*next;
+
+	spin_lock(&cfs_wi_data.wi_glock);
+	cfs_wi_data.wi_stopping = 1;
+	spin_unlock(&cfs_wi_data.wi_glock);
+
+	/* nobody should contend on this list */
+	list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
+		sched->ws_stopping = 1;
+		wake_up_all(&sched->ws_waitq);
+	}
+
+	/* second pass: poll until each scheduler has no threads left */
+	list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
+		spin_lock(&cfs_wi_data.wi_glock);
+		while (sched->ws_nthreads != 0) {
+			spin_unlock(&cfs_wi_data.wi_glock);
+			cfs_pause(cfs_time_seconds(1) / 20);
+			spin_lock(&cfs_wi_data.wi_glock);
+		}
+		spin_unlock(&cfs_wi_data.wi_glock);
+	}
+
+	/* final pass: unlink and free every scheduler on the list */
+	list_for_each_entry_safe(sched, next, &cfs_wi_data.wi_scheds, ws_list) {
+		list_del(&sched->ws_list);
+		LIBCFS_FREE(sched, sizeof(*sched));
+	}
+	cfs_wi_data.wi_stopping = 0;
+	cfs_wi_data.wi_init = 0;
+}
diff --git a/drivers/staging/lustre/lustre/llite/Makefile b/drivers/staging/lustre/lustre/llite/Makefile
new file mode 100644
index 000000000000..dff0c0486e77
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/Makefile
@@ -0,0 +1,13 @@
+obj-$(CONFIG_LUSTRE_FS) += lustre.o
+obj-$(CONFIG_LUSTRE_FS) += llite_lloop.o
+lustre-y := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o \
+	    rw.o lproc_llite.o namei.o symlink.o llite_mmap.o \
+	    xattr.o remote_perm.o llite_rmtacl.o llite_capa.o \
+	    rw26.o super25.o statahead.o \
+	    ../lclient/glimpse.o ../lclient/lcommon_cl.o ../lclient/lcommon_misc.o \
+	    vvp_dev.o vvp_page.o vvp_lock.o vvp_io.o vvp_object.o
+# llite_lloop.o is built from the single object lloop.o
+llite_lloop-y := lloop.o
+
+# add the sibling ../include directory to the header search path
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/llite/dcache.c b/drivers/staging/lustre/lustre/llite/dcache.c
new file mode 100644
index 000000000000..e048538d45e6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/dcache.c
@@ -0,0 +1,675 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/quotaops.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <obd_support.h>
+#include <lustre_lite.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_dlm.h>
+
+#include "llite_internal.h"
+
+/* RCU callback: free the ll_dentry_data that embeds @head. */
+static void free_dentry_data(struct rcu_head *head)
+{
+	struct ll_dentry_data *lld = container_of(head, struct ll_dentry_data,
+						  lld_rcu_head);
+	OBD_FREE_PTR(lld);
+}
+
+/* should NOT be called with the dcache lock, see fs/dcache.c */
+static void ll_release(struct dentry *de)
+{
+	struct ll_dentry_data *lld;
+	ENTRY;
+	LASSERT(de != NULL);
+	lld = ll_d2d(de);
+	if (lld == NULL) /* NFS copies the de->d_op methods (bug 4655) */
+		RETURN_EXIT;
+	/* release and free a saved lookup intent, if one is attached */
+	if (lld->lld_it) {
+		ll_intent_release(lld->lld_it);
+		OBD_FREE(lld->lld_it, sizeof(*lld->lld_it));
+	}
+	LASSERT(lld->lld_cwd_count == 0);
+	LASSERT(lld->lld_mnt_count == 0);
+	de->d_fsdata = NULL;
+	call_rcu(&lld->lld_rcu_head, free_dentry_data);
+	/* lld itself is freed later, by free_dentry_data() via call_rcu() */
+	EXIT;
+}
+
+/* Compare if two dentries are the same.  Don't match if the existing dentry
+ * is marked invalid.  Returns 1 if different, 0 if the same.
+ *
+ * This avoids a race where ll_lookup_it() instantiates a dentry, but we get
+ * an AST before calling d_revalidate_it().  The dentry still exists (marked
+ * INVALID) so d_lookup() matches it, but we have no lock on it (so
+ * lock_match() fails) and we spin around real_lookup(). */
+int ll_dcompare(const struct dentry *parent, const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
+{
+	ENTRY;
+
+	/* different length or different bytes: not the same name */
+	if (len != name->len || memcmp(str, name->name, len) != 0)
+		RETURN(1);
+
+	CDEBUG(D_DENTRY, "found name %.*s(%p) flags %#x refc %d\n",
+	       name->len, name->name, dentry, dentry->d_flags,
+	       d_refcount(dentry));
+
+	/* The names match.  A mountpoint is always treated as valid;
+	 * any other dentry matches only while it has not been marked
+	 * invalid. */
+	if (d_mountpoint((struct dentry *)dentry))
+		RETURN(0);
+
+	if (!d_lustre_invalid(dentry))
+		RETURN(0);
+
+	RETURN(1);
+}
+
+/* Iterator callback: skip locks flagged CANCELING and DISCARD_DATA. */
+static inline int return_if_equal(struct ldlm_lock *lock, void *data)
+{
+	if ((lock->l_flags & (LDLM_FL_CANCELING | LDLM_FL_DISCARD_DATA)) !=
+	    (LDLM_FL_CANCELING | LDLM_FL_DISCARD_DATA))
+		return LDLM_ITER_STOP;
+	return LDLM_ITER_CONTINUE;
+}
+
+/* Find any ldlm lock of the inode, looking in mdc first and then in lov.
+ * Returns 0 if none is found, 1 if one is found, < 0 on error. */
+static int find_cbdata(struct inode *inode)
+{
+	struct ll_sb_info *sbi;
+	struct lov_stripe_md *lsm;
+	int rc = 0;
+	ENTRY;
+
+	/* assert first: ll_i2sbi() needs a valid inode to work on */
+	LASSERT(inode);
+	sbi = ll_i2sbi(inode);
+	rc = md_find_cbdata(sbi->ll_md_exp, ll_inode2fid(inode),
+			    return_if_equal, NULL);
+	if (rc != 0)
+		RETURN(rc);
+
+	lsm = ccc_inode_lsm_get(inode);
+	if (lsm == NULL)
+		RETURN(rc);
+
+	rc = obd_find_cbdata(sbi->ll_dt_exp, lsm, return_if_equal, NULL);
+	ccc_inode_lsm_put(inode, lsm);
+
+	RETURN(rc);
+}
+
+/**
+ * Called when last reference to a dentry is dropped and dcache wants to know
+ * whether or not it should cache it:
+ * - return 1 to delete the dentry immediately
+ * - return 0 to cache the dentry
+ * Should NOT be called with the dcache lock, see fs/dcache.c
+ */
+static int ll_ddelete(const struct dentry *de)
+{
+	ENTRY;
+	LASSERT(de);
+
+	CDEBUG(D_DENTRY, "%s dentry %.*s (%p, parent %p, inode %p) %s%s\n",
+	       d_lustre_invalid((struct dentry *)de) ? "deleting" : "keeping",
+	       de->d_name.len, de->d_name.name, de, de->d_parent, de->d_inode,
+	       d_unhashed((struct dentry *)de) ? "" : "hashed,",
+	       list_empty(&de->d_subdirs) ? "" : "subdirs");
+
+	/* kernel >= 2.6.38 last refcount is decreased after this function. */
+	LASSERT(d_refcount(de) == 1);
+
+	/* This piece of code is disabled temporarily because it is called
+	 * inside dcache_lock, so it's not appropriate to do lots of work
+	 * here. ATTENTION: LU-2487 must be resolved before this piece of
+	 * code is enabled again. */
+#if 0
+	/* if not ldlm lock for this inode, set i_nlink to 0 so that
+	 * this inode can be recycled later b=20433 */
+	if (de->d_inode && !find_cbdata(de->d_inode))
+		clear_nlink(de->d_inode);
+#endif
+
+	if (d_lustre_invalid((struct dentry *)de))
+		RETURN(1);
+	RETURN(0);
+}
+
+/* Make sure @de has an ll_dentry_data in d_fsdata; returns 0 or -ENOMEM. */
+static int ll_set_dd(struct dentry *de)
+{
+	struct ll_dentry_data *lld;
+
+	ENTRY;
+	LASSERT(de != NULL);
+
+	CDEBUG(D_DENTRY, "ldd on dentry %.*s (%p) parent %p inode %p refc %d\n",
+		de->d_name.len, de->d_name.name, de, de->d_parent, de->d_inode,
+		d_refcount(de));
+
+	if (de->d_fsdata != NULL)
+		RETURN(0);
+
+	OBD_ALLOC_PTR(lld);
+	if (unlikely(lld == NULL))
+		RETURN(-ENOMEM);
+	/* re-check under d_lock: d_fsdata may have been set in the meantime */
+	spin_lock(&de->d_lock);
+	if (likely(de->d_fsdata == NULL))
+		de->d_fsdata = lld;
+	else
+		OBD_FREE_PTR(lld);
+	spin_unlock(&de->d_lock);
+	RETURN(0);
+}
+
+/* Optionally allocate @de's ll_dentry_data (@block) and reset its
+ * lld_sa_generation (@init_sa); returns 0 or the error from ll_set_dd(). */
+int ll_dops_init(struct dentry *de, int block, int init_sa)
+{
+	struct ll_dentry_data *lld = ll_d2d(de);
+
+	if (lld == NULL && block != 0) {
+		int rc = ll_set_dd(de);
+
+		if (rc != 0)
+			return rc;
+		lld = ll_d2d(de);
+	}
+	if (init_sa != 0 && lld != NULL)
+		lld->lld_sa_generation = 0;
+
+	/* kernel >= 2.6.38 d_op is set in d_alloc() */
+	LASSERT(de->d_op == &ll_d_ops);
+	return 0;
+}
+
+/* Drop the ldlm lock reference(s) recorded in @it; safe to call repeatedly. */
+void ll_intent_drop_lock(struct lookup_intent *it)
+{
+	struct lustre_handle handle;
+
+	if (!it->it_op || !it->d.lustre.it_lock_mode)
+		return;
+
+	handle.cookie = it->d.lustre.it_lock_handle;
+	CDEBUG(D_DLMTRACE, "releasing lock with cookie "LPX64
+	       " from it %p\n", handle.cookie, it);
+	ldlm_lock_decref(&handle, it->d.lustre.it_lock_mode);
+
+	/* bug 494: intent_release may be called multiple times, from
+	 * this thread and we don't want to double-decref this lock */
+	it->d.lustre.it_lock_mode = 0;
+	if (it->d.lustre.it_remote_lock_mode == 0)
+		return;
+	/* a remote lock mode is recorded as well: release it the same way */
+	handle.cookie = it->d.lustre.it_remote_lock_handle;
+	CDEBUG(D_DLMTRACE, "releasing remote lock with cookie"
+	       LPX64" from it %p\n", handle.cookie, it);
+	ldlm_lock_decref(&handle, it->d.lustre.it_remote_lock_mode);
+	it->d.lustre.it_remote_lock_mode = 0;
+}
+
+void ll_intent_release(struct lookup_intent *it)
+{
+	ENTRY;
+	/* Drop the locks and every request reference still held by @it. */
+	CDEBUG(D_INFO, "intent %p released\n", it);
+	ll_intent_drop_lock(it);
+	/* We are still holding extra reference on a request, need to free it */
+	if (it_disposition(it, DISP_ENQ_OPEN_REF))
+		 ptlrpc_req_finished(it->d.lustre.it_data); /* ll_file_open */
+	if (it_disposition(it, DISP_ENQ_CREATE_REF)) /* create rec */
+		ptlrpc_req_finished(it->d.lustre.it_data);
+	if (it_disposition(it, DISP_ENQ_COMPLETE)) /* saved req from revalidate
+						    * to lookup */
+		ptlrpc_req_finished(it->d.lustre.it_data);
+	/* one ptlrpc_req_finished() call per disposition bit that was set */
+	it->d.lustre.it_disposition = 0;
+	it->d.lustre.it_data = NULL;
+	EXIT;
+}
+
+void ll_invalidate_aliases(struct inode *inode)
+{
+	struct dentry *dentry;
+	struct ll_d_hlist_node *p;
+	ENTRY;
+	/* Mark every dentry on @inode's i_dentry alias list as invalid. */
+	LASSERT(inode != NULL);
+
+	CDEBUG(D_INODE, "marking dentries for ino %lu/%u(%p) invalid\n",
+	       inode->i_ino, inode->i_generation, inode);
+	/* the alias list is walked between ll_lock/ll_unlock_dcache() */
+	ll_lock_dcache(inode);
+	ll_d_hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) {
+		CDEBUG(D_DENTRY, "dentry in drop %.*s (%p) parent %p "
+		       "inode %p flags %d\n", dentry->d_name.len,
+		       dentry->d_name.name, dentry, dentry->d_parent,
+		       dentry->d_inode, dentry->d_flags);
+		/* a dentry named "/" is unexpected here: report it loudly */
+		if (dentry->d_name.len == 1 && dentry->d_name.name[0] == '/') {
+			CERROR("called on root (?) dentry=%p, inode=%p "
+			       "ino=%lu\n", dentry, inode, inode->i_ino);
+			lustre_dump_dentry(dentry, 1);
+			libcfs_debug_dumpstack(NULL);
+		}
+		/* the check above only reports; the dentry is invalidated anyway */
+		d_lustre_invalidate(dentry);
+	}
+	ll_unlock_dcache(inode);
+
+	EXIT;
+}
+
+/* Finish a revalidate: 0 without a request, -ENOENT for a negative lookup,
+ * otherwise whatever ll_prep_inode() returns for @de's inode. */
+int ll_revalidate_it_finish(struct ptlrpc_request *request,
+			    struct lookup_intent *it, struct dentry *de)
+{
+	int rc;
+	ENTRY;
+
+	if (request == NULL)
+		RETURN(0);
+
+	if (it_disposition(it, DISP_LOOKUP_NEG))
+		RETURN(-ENOENT);
+
+	rc = ll_prep_inode(&de->d_inode, request, NULL, it);
+	RETURN(rc);
+}
+
+/* Bind the intent's lock to @dentry's inode, then drop lookup/getattr locks. */
+void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry)
+{
+	struct inode *inode;
+
+	LASSERT(it != NULL);
+	LASSERT(dentry != NULL);
+
+	inode = dentry->d_inode;
+	if (inode != NULL && it->d.lustre.it_lock_mode) {
+		CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n",
+		       inode, inode->i_ino, inode->i_generation);
+		ll_set_lock_data(ll_i2sbi(inode)->ll_md_exp, inode, it, NULL);
+	}
+
+	/* on 2.6 there are situation when several lookups and
+	 * revalidations may be requested during single operation.
+	 * therefore, we only drop the lock of a lookup or getattr
+	 * intent here and don't release the intent itself -bzzz */
+	if (it->it_op == IT_LOOKUP || it->it_op == IT_GETATTR)
+		ll_intent_drop_lock(it);
+}
+
+/* Replace a missing or IT_GETXATTR intent in *@itp with the default @deft. */
+void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft)
+{
+	const struct lookup_intent *cur = *itp;
+
+	if (cur == NULL || cur->it_op == IT_GETXATTR)
+		*itp = deft;
+}
+
+int ll_revalidate_it(struct dentry *de, int lookup_flags,
+		     struct lookup_intent *it)
+{
+	struct md_op_data *op_data;
+	struct ptlrpc_request *req = NULL;
+	struct lookup_intent lookup_it = { .it_op = IT_LOOKUP };
+	struct obd_export *exp;
+	struct inode *parent = de->d_parent->d_inode;
+	int rc;
+
+	ENTRY;
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%s,intent=%s\n", de->d_name.name,
+	       LL_IT2STR(it));
+
+	if (de->d_inode == NULL) {
+		__u64 ibits;
+
+		/* We can only use negative dentries if this is stat or lookup,
+		   for opens and stuff we do need to query server. */
+		/* If there is IT_CREAT in intent op set, then we must throw
+		   away this negative dentry and actually do the request to
+		   kernel to create whatever needs to be created (if possible)*/
+		if (it && (it->it_op & IT_CREAT))
+			RETURN(0);
+
+		if (d_lustre_invalid(de))
+			RETURN(0);
+
+		ibits = MDS_INODELOCK_UPDATE;
+		rc = ll_have_md_lock(parent, &ibits, LCK_MINMODE);
+		GOTO(out_sa, rc);
+	}
+
+	/* Never execute intents for mount points.
+	 * Attributes will be fixed up in ll_inode_revalidate_it */
+	if (d_mountpoint(de))
+		GOTO(out_sa, rc = 1);
+
+	/* need to get attributes in case root got changed from other client */
+	if (de == de->d_sb->s_root) {
+		rc = __ll_inode_revalidate_it(de, it, MDS_INODELOCK_LOOKUP);
+		if (rc == 0)
+			rc = 1;
+		GOTO(out_sa, rc);
+	}
+
+	exp = ll_i2mdexp(de->d_inode);
+
+	OBD_FAIL_TIMEOUT(OBD_FAIL_MDC_REVALIDATE_PAUSE, 5);
+	ll_frob_intent(&it, &lookup_it);
+	LASSERT(it);
+
+	if (it->it_op == IT_LOOKUP && !d_lustre_invalid(de))
+		RETURN(1);
+
+	if (it->it_op == IT_OPEN) {
+		struct inode *inode = de->d_inode;
+		struct ll_inode_info *lli = ll_i2info(inode);
+		struct obd_client_handle **och_p;
+		__u64 *och_usecount;
+		__u64 ibits;
+
+		/*
+		 * We used to check for MDS_INODELOCK_OPEN here, but in fact
+		 * just having LOOKUP lock is enough to justify inode is the
+		 * same. And if inode is the same and we have suitable
+		 * openhandle, then there is no point in doing another OPEN RPC
+		 * just to throw away newly received openhandle.  There are no
+		 * security implications too, if file owner or access mode is
+		 * change, LOOKUP lock is revoked.
+		 */
+
+
+		if (it->it_flags & FMODE_WRITE) {
+			och_p = &lli->lli_mds_write_och;
+			och_usecount = &lli->lli_open_fd_write_count;
+		} else if (it->it_flags & FMODE_EXEC) {
+			och_p = &lli->lli_mds_exec_och;
+			och_usecount = &lli->lli_open_fd_exec_count;
+		} else {
+			och_p = &lli->lli_mds_read_och;
+			och_usecount = &lli->lli_open_fd_read_count;
+		}
+		/* Check for the proper lock. */
+		ibits = MDS_INODELOCK_LOOKUP;
+		if (!ll_have_md_lock(inode, &ibits, LCK_MINMODE))
+			goto do_lock;
+		mutex_lock(&lli->lli_och_mutex);
+		if (*och_p) { /* Everything is open already, do nothing */
+			/*(*och_usecount)++;  Do not let them steal our open
+			  handle from under us */
+			SET_BUT_UNUSED(och_usecount);
+			/* XXX The code above was my original idea, but in case
+			   we have the handle, but we cannot use it due to later
+			   checks (e.g. O_CREAT|O_EXCL flags set), nobody
+			   would decrement counter increased here. So we just
+			   hope the lock won't be invalidated in between. But
+			   if it would be, we'll reopen the open request to
+			   MDS later during file open path */
+			mutex_unlock(&lli->lli_och_mutex);
+			RETURN(1);
+		} else {
+			mutex_unlock(&lli->lli_och_mutex);
+		}
+	}
+
+	if (it->it_op == IT_GETATTR) {
+		rc = ll_statahead_enter(parent, &de, 0);
+		if (rc == 1)
+			goto mark;
+		else if (rc != -EAGAIN && rc != 0)
+			GOTO(out, rc = 0);
+	}
+
+do_lock:
+	op_data = ll_prep_md_op_data(NULL, parent, de->d_inode,
+				     de->d_name.name, de->d_name.len,
+				     0, LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+
+	if (!IS_POSIXACL(parent) || !exp_connect_umask(exp))
+		it->it_create_mode &= ~current_umask();
+	it->it_create_mode |= M_CHECK_STALE;
+	rc = md_intent_lock(exp, op_data, NULL, 0, it,
+			    lookup_flags,
+			    &req, ll_md_blocking_ast, 0);
+	it->it_create_mode &= ~M_CHECK_STALE;
+	ll_finish_md_op_data(op_data);
+
+	/* If req is NULL, then md_intent_lock only tried to do a lock match;
+	 * if all was well, it will return 1 if it found locks, 0 otherwise. */
+	if (req == NULL && rc >= 0) {
+		if (!rc)
+			goto do_lookup;
+		GOTO(out, rc);
+	}
+
+	if (rc < 0) {
+		if (rc != -ESTALE) {
+			CDEBUG(D_INFO, "ll_intent_lock: rc %d : it->it_status "
+			       "%d\n", rc, it->d.lustre.it_status);
+		}
+		GOTO(out, rc = 0);
+	}
+
+revalidate_finish:
+	rc = ll_revalidate_it_finish(req, it, de);
+	if (rc != 0) {
+		if (rc != -ESTALE && rc != -ENOENT)
+			ll_intent_release(it);
+		GOTO(out, rc = 0);
+	}
+
+	if ((it->it_op & IT_OPEN) && de->d_inode &&
+	    !S_ISREG(de->d_inode->i_mode) &&
+	    !S_ISDIR(de->d_inode->i_mode)) {
+		ll_release_openhandle(de, it);
+	}
+	rc = 1;
+
+out:
+	/* We do not free request as it may be reused during following lookup
+	 * (see comment in mdc/mdc_locks.c::mdc_intent_lock()), request will
+	 * be freed in ll_lookup_it or in ll_intent_release. But if
+	 * request was not completed, we need to free it. (bug 5154, 9903) */
+	if (req != NULL && !it_disposition(it, DISP_ENQ_COMPLETE))
+		ptlrpc_req_finished(req);
+	if (rc == 0) {
+		/* mdt may grant layout lock for the newly created file, so
+		 * release the lock to avoid leaking */
+		ll_intent_drop_lock(it);
+		ll_invalidate_aliases(de->d_inode);
+	} else {
+		__u64 bits = 0;
+		__u64 matched_bits = 0;
+
+		CDEBUG(D_DENTRY, "revalidated dentry %.*s (%p) parent %p "
+		       "inode %p refc %d\n", de->d_name.len,
+		       de->d_name.name, de, de->d_parent, de->d_inode,
+		       d_refcount(de));
+
+		ll_set_lock_data(exp, de->d_inode, it, &bits);
+
+		/* Note: We have to match both LOOKUP and PERM lock
+		 * here to make sure the dentry is valid and no one
+		 * changing the permission.
+		 * But if the client connects < 2.4 server, which will
+		 * only grant LOOKUP lock, so we can only Match LOOKUP
+		 * lock for old server */
+		if (exp_connect_flags(ll_i2mdexp(de->d_inode)) &&
+							OBD_CONNECT_LVB_TYPE)
+			matched_bits =
+				MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM;
+		else
+			matched_bits = MDS_INODELOCK_LOOKUP;
+
+		if (((bits & matched_bits) == matched_bits) &&
+		    d_lustre_invalid(de))
+			d_lustre_revalidate(de);
+		ll_lookup_finish_locks(it, de);
+	}
+
+mark:
+	if (it != NULL && it->it_op == IT_GETATTR && rc > 0)
+		ll_statahead_mark(parent, de);
+	RETURN(rc);
+
+	/*
+	 * This part is here to combat evil-evil race in real_lookup on 2.6
+	 * kernels.  The race details are: We enter do_lookup() looking for some
+	 * name, there is nothing in dcache for this name yet and d_lookup()
+	 * returns NULL.  We proceed to real_lookup(), and while we do this,
+	 * another process does open on the same file we looking up (most simple
+	 * reproducer), open succeeds and the dentry is added. Now back to
+	 * us. In real_lookup() we do d_lookup() again and suddenly find the
+	 * dentry, so we call d_revalidate on it, but there is no lock, so
+	 * without this code we would return 0, but unpatched real_lookup just
+	 * returns -ENOENT in such a case instead of retrying the lookup. Once
+	 * this is dealt with in real_lookup(), all of this ugly mess can go and
+	 * we can just check locks in ->d_revalidate without doing any RPCs
+	 * ever.
+	 */
+do_lookup:
+	if (it != &lookup_it) {
+		/* MDS_INODELOCK_UPDATE needed for IT_GETATTR case. */
+		if (it->it_op == IT_GETATTR)
+			lookup_it.it_op = IT_GETATTR;
+		ll_lookup_finish_locks(it, de);
+		it = &lookup_it;
+	}
+
+	/* Do real lookup here. */
+	op_data = ll_prep_md_op_data(NULL, parent, NULL, de->d_name.name,
+				     de->d_name.len, 0, (it->it_op & IT_CREAT ?
+							 LUSTRE_OPC_CREATE :
+							 LUSTRE_OPC_ANY), NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+
+	rc = md_intent_lock(exp, op_data, NULL, 0,  it, 0, &req,
+			    ll_md_blocking_ast, 0);
+	if (rc >= 0) {
+		struct mdt_body *mdt_body;
+		struct lu_fid fid = {.f_seq = 0, .f_oid = 0, .f_ver = 0};
+		mdt_body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+
+		if (de->d_inode)
+			fid = *ll_inode2fid(de->d_inode);
+
+		/* see if we got same inode, if not - return error */
+		if (lu_fid_eq(&fid, &mdt_body->fid1)) {
+			ll_finish_md_op_data(op_data);
+			op_data = NULL;
+			goto revalidate_finish;
+		}
+		ll_intent_release(it);
+	}
+	ll_finish_md_op_data(op_data);
+	GOTO(out, rc = 0);
+
+out_sa:
+	/*
+	 * For rc == 1 case, should not return directly to prevent losing
+	 * statahead windows; for rc == 0 case, the "lookup" will be done later.
+	 */
+	if (it != NULL && it->it_op == IT_GETATTR && rc == 1)
+		ll_statahead_enter(parent, &de, 1);
+	goto mark;
+}
+
+/*
+ * d_revalidate hook.  Cached dentries are always trusted (returns 1); the only
+ * work done is to feed the statahead machinery for plain lookups.
+ */
+int ll_revalidate_nd(struct dentry *dentry, unsigned int flags)
+{
+	struct inode *dir = dentry->d_parent->d_inode;
+	int unplug;
+
+	ENTRY;
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%s,flags=%u\n",
+	       dentry->d_name.name, flags);
+
+	/* Parent/open/create lookups skip statahead altogether. */
+	if ((flags & (LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE)) ||
+	    ll_need_statahead(dir, dentry) <= 0)
+		RETURN(1);
+	/* LOOKUP_RCU callers get -ECHILD before statahead is entered. */
+	if (flags & LOOKUP_RCU)
+		RETURN(-ECHILD);
+	unplug = (dentry->d_inode == NULL);
+	do_statahead_enter(dir, &dentry, unplug);
+	ll_statahead_mark(dir, dentry);
+	RETURN(1);
+}
+
+/* ->d_iput hook: clear_nlink() when find_cbdata() is false, then iput(). */
+void ll_d_iput(struct dentry *de, struct inode *inode)
+{
+	LASSERT(inode);
+	if (!find_cbdata(inode))
+		clear_nlink(inode);
+	iput(inode);
+}
+
+struct dentry_operations ll_d_ops = {	/* llite dentry callbacks */
+	.d_revalidate	= ll_revalidate_nd,
+	.d_compare	= ll_dcompare,
+	.d_delete	= ll_ddelete,
+	.d_iput		= ll_d_iput,
+	.d_release	= ll_release,
+};
diff --git a/drivers/staging/lustre/lustre/llite/dir.c b/drivers/staging/lustre/lustre/llite/dir.c
new file mode 100644
index 000000000000..23c61fe81965
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/dir.c
@@ -0,0 +1,1978 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/dir.c
+ *
+ * Directory code for lustre client.
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/mm.h>
+#include <linux/version.h>
+#include <asm/uaccess.h>
+#include <linux/buffer_head.h>   // for wait_on_buffer
+#include <linux/pagevec.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <lustre/lustre_idl.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_lite.h>
+#include <lustre_dlm.h>
+#include <lustre_fid.h>
+#include "llite_internal.h"
+
+/*
+ * (new) readdir implementation overview.
+ *
+ * Original lustre readdir implementation cached exact copy of raw directory
+ * pages on the client. These pages were indexed in client page cache by
+ * logical offset in the directory file. This design, while very simple and
+ * intuitive had some inherent problems:
+ *
+ *     . it implies that byte offset to the directory entry serves as a
+ *     telldir(3)/seekdir(3) cookie, but that offset is not stable: in
+ *     ext3/htree directory entries may move due to splits, and more
+ *     importantly,
+ *
+ *     . it is incompatible with the design of split directories for cmd3,
+ *     that assumes that names are distributed across nodes based on their
+ *     hash, and so readdir should be done in hash order.
+ *
+ * New readdir implementation does readdir in hash order, and uses hash of a
+ * file name as a telldir/seekdir cookie. This led to a number of complications:
+ *
+ *     . hash is not unique, so it cannot be used to index cached directory
+ *     pages on the client (note, that it requires a whole pageful of hash
+ *     collided entries to cause two pages to have identical hashes);
+ *
+ *     . hash is not unique, so it cannot, strictly speaking, be used as an
+ *     entry cookie. ext3/htree has the same problem and lustre implementation
+ *     mimics their solution: seekdir(hash) positions directory at the first
+ *     entry with the given hash.
+ *
+ * Client side.
+ *
+ * 0. caching
+ *
+ * Client caches directory pages using hash of the first entry as an index. As
+ * noted above hash is not unique, so this solution doesn't work as is:
+ * special processing is needed for "page hash chains" (i.e., sequences of
+ * pages filled with entries all having the same hash value).
+ *
+ * First, such chains have to be detected. To this end, server returns to the
+ * client the hash of the first entry on the page next to one returned. When
+ * client detects that this hash is the same as hash of the first entry on the
+ * returned page, page hash collision has to be handled. Pages in the
+ * hash chain, except first one, are termed "overflow pages".
+ *
+ * Solution to index uniqueness problem is to not cache overflow
+ * pages. Instead, when page hash collision is detected, all overflow pages
+ * from emerging chain are immediately requested from the server and placed in
+ * a special data structure (struct ll_dir_chain). This data structure is used
+ * by ll_readdir() to process entries from overflow pages. When readdir
+ * invocation finishes, overflow pages are discarded. If page hash collision
+ * chain wasn't completely processed, next call to readdir will again detect
+ * page hash collision, again read overflow pages in, process next portion of
+ * entries and again discard the pages. This is not as wasteful as it looks,
+ * because, given reasonable hash, page hash collisions are extremely rare.
+ *
+ * 1. directory positioning
+ *
+ * When seekdir(hash) is called, original
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ * Server.
+ *
+ * identification of and access to overflow pages
+ *
+ * page format
+ *
+ * Page in MDS_READPAGE RPC is packed in LU_PAGE_SIZE, and each page contains
+ * a header lu_dirpage which describes the start/end hash, and whether this
+ * page is empty (contains no dir entry) or hash-collides with the next page.
+ * After client receives reply, several pages will be integrated into dir page
+ * in PAGE_CACHE_SIZE (if PAGE_CACHE_SIZE greater than LU_PAGE_SIZE), and the
+ * lu_dirpage for this integrated page will be adjusted. See
+ * lmv_adjust_dirpages().
+ *
+ */
+
+/*
+ * read_cache_page() filler: read the directory page at hash *_hash, and as
+ * many following pages as one md_readpage() RPC carries.  Returns 0 or a
+ * negative errno; page0 is returned unlocked, with its reference, either way.
+ */
+static int ll_dir_filler(void *_hash, struct page *page0)
+{
+	struct inode *inode = page0->mapping->host;
+	int hash64 = ll_i2sbi(inode)->ll_flags & LL_SBI_64BIT_HASH;
+	struct obd_export *exp = ll_i2sbi(inode)->ll_md_exp;
+	struct ptlrpc_request *request = NULL;
+	struct mdt_body *body;
+	struct md_op_data *op_data;
+	__u64 hash = *((__u64 *)_hash);
+	struct page **page_pool;
+	struct page *page;
+	struct lu_dirpage *dp;
+	int max_pages = ll_i2sbi(inode)->ll_md_brw_size >> PAGE_CACHE_SHIFT;
+	int nrdpgs = 0; /* number of pages read actually */
+	int npages, i, rc;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) hash "LPU64"\n",
+	       inode->i_ino, inode->i_generation, inode, hash);
+
+	LASSERT(max_pages > 0 && max_pages <= MD_MAX_BRW_PAGES);
+
+	OBD_ALLOC(page_pool, sizeof(page) * max_pages);
+	if (page_pool != NULL) {
+		page_pool[0] = page0;
+	} else {
+		page_pool = &page0;
+		max_pages = 1;
+	}
+	for (npages = 1; npages < max_pages; npages++) {
+		page = page_cache_alloc_cold(inode->i_mapping);
+		if (!page)
+			break;
+		page_pool[npages] = page;
+	}
+
+	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+				     LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data)) {
+		/* No RPC is sent; fall through to the cleanup with rc < 0. */
+		rc = PTR_ERR(op_data);
+	} else {
+		op_data->op_npages = npages;
+		op_data->op_offset = hash;
+		rc = md_readpage(exp, op_data, page_pool, &request);
+		ll_finish_md_op_data(op_data);
+	}
+	if (rc == 0) {
+		body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
+		/* Checked by mdc_readpage() */
+		LASSERT(body != NULL);
+		if (body->valid & OBD_MD_FLSIZE)
+			cl_isize_write(inode, body->size);
+		nrdpgs = (request->rq_bulk->bd_nob_transferred+PAGE_CACHE_SIZE-1)
+			 >> PAGE_CACHE_SHIFT;
+		SetPageUptodate(page0);
+	}
+	unlock_page(page0);
+	ptlrpc_req_finished(request);
+	CDEBUG(D_VFSTRACE, "read %d/%d pages\n", nrdpgs, npages);
+
+	ll_pagevec_init(&lru_pvec, 0);
+	for (i = 1; i < npages; i++) {
+		unsigned long offset;
+		int ret;
+
+		page = page_pool[i];
+		/* Pages the reply did not fill are simply released. */
+		if (rc < 0 || i >= nrdpgs) {
+			page_cache_release(page);
+			continue;
+		}
+		SetPageUptodate(page);
+		dp = kmap(page);
+		hash = le64_to_cpu(dp->ldp_hash_start);
+		kunmap(page);
+		offset = hash_x_index(hash, hash64);
+		prefetchw(&page->flags);
+		ret = add_to_page_cache_lru(page, inode->i_mapping, offset,
+					    GFP_KERNEL);
+		if (ret == 0) {
+			unlock_page(page);
+			if (ll_pagevec_add(&lru_pvec, page) == 0)
+				ll_pagevec_lru_add_file(&lru_pvec);
+		} else {
+			CDEBUG(D_VFSTRACE, "page %lu add to page cache failed:"
+			       " %d\n", offset, ret);
+		}
+		page_cache_release(page);
+	}
+	ll_pagevec_lru_add_file(&lru_pvec);
+
+	if (page_pool != &page0)
+		OBD_FREE(page_pool, sizeof(struct page *) * max_pages);
+	EXIT;
+	return rc;
+}
+
+static void ll_check_page(struct inode *dir, struct page *page)
+{
+	/* XXX: no format check yet; the page is only flagged as checked */
+	SetPageChecked(page);
+}
+
+void ll_release_page(struct page *page, int remove)
+{
+	kunmap(page);		/* page is handed in kmap()ed */
+	if (remove) {		/* also truncate it out of its mapping */
+		lock_page(page);
+		if (likely(page->mapping != NULL))	/* still attached? */
+			truncate_complete_page(page->mapping, page);
+		unlock_page(page);
+	}
+	page_cache_release(page);	/* drop the reference handed in */
+}
+
+/*
+ * Find the cached page holding *hash: kmap()ed + referenced, NULL or -EIO.
+ */
+static struct page *ll_dir_page_locate(struct inode *dir, __u64 *hash,
+				       __u64 *start, __u64 *end)
+{
+	int hash64 = ll_i2sbi(dir)->ll_flags & LL_SBI_64BIT_HASH;
+	struct address_space *mapping = dir->i_mapping;
+	/*
+	 * Complement of hash is used as an index so that
+	 * radix_tree_gang_lookup() can be used to find a page with starting
+	 * hash _smaller_ than one we are looking for.
+	 */
+	unsigned long offset = hash_x_index(*hash, hash64);
+	struct page *page;
+	int found;
+
+	TREE_READ_LOCK_IRQ(mapping);
+	found = radix_tree_gang_lookup(&mapping->page_tree,
+				       (void **)&page, offset, 1);
+	if (found > 0) {
+		struct lu_dirpage *dp;
+
+		page_cache_get(page);
+		TREE_READ_UNLOCK_IRQ(mapping);
+		/*
+		 * In contrast to find_lock_page() we are sure that directory
+		 * page cannot be truncated (while DLM lock is held) and,
+		 * hence, can avoid restart.
+		 *
+		 * In fact, page cannot be locked here at all, because
+		 * ll_dir_filler() does synchronous io.
+		 */
+		wait_on_page_locked(page);
+		if (PageUptodate(page)) {
+			dp = kmap(page);
+			if (BITS_PER_LONG == 32 && hash64) {
+				*start = le64_to_cpu(dp->ldp_hash_start) >> 32;
+				*end   = le64_to_cpu(dp->ldp_hash_end) >> 32;
+				*hash  = *hash >> 32; /* NB: caller's value changes */
+			} else {
+				*start = le64_to_cpu(dp->ldp_hash_start);
+				*end   = le64_to_cpu(dp->ldp_hash_end);
+			}
+			LASSERTF(*start <= *hash, "start = "LPX64",end = "
+				 LPX64",hash = "LPX64"\n", *start, *end, *hash);
+			CDEBUG(D_VFSTRACE, "page %lu [%llu %llu], hash "LPU64"\n",
+			       offset, *start, *end, *hash);
+			if (*hash > *end) {	/* hash lies past this page */
+				ll_release_page(page, 0);
+				page = NULL;
+			} else if (*end != *start && *hash == *end) {
+				/*
+				 * upon hash collision, remove this page,
+				 * otherwise put page reference, and
+				 * ll_get_dir_page() will issue RPC to fetch
+				 * the page we want.
+				 */
+				ll_release_page(page,
+				    le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE);
+				page = NULL;
+			}
+		} else {
+			page_cache_release(page);
+			page = ERR_PTR(-EIO);	/* cached page is not uptodate */
+		}
+
+	} else {
+		TREE_READ_UNLOCK_IRQ(mapping);
+		page = NULL;
+	}
+	return page;
+}
+
+struct page *ll_get_dir_page(struct inode *dir, __u64 hash,
+			     struct ll_dir_chain *chain)
+{
+	ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} };
+	struct address_space *mapping = dir->i_mapping;
+	struct lustre_handle lockh;
+	struct lu_dirpage *dp;
+	struct page *page;
+	ldlm_mode_t mode;
+	int rc;
+	__u64 start = 0;
+	__u64 end = 0;
+	__u64 lhash = hash;
+	struct ll_inode_info *lli = ll_i2info(dir);
+	int hash64 = ll_i2sbi(dir)->ll_flags & LL_SBI_64BIT_HASH;
+	/* Match or enqueue a PR UPDATE-bit lock first; @chain is unused. */
+	mode = LCK_PR;
+	rc = md_lock_match(ll_i2sbi(dir)->ll_md_exp, LDLM_FL_BLOCK_GRANTED,
+			   ll_inode2fid(dir), LDLM_IBITS, &policy, mode, &lockh);
+	if (!rc) {
+		struct ldlm_enqueue_info einfo = {.ei_type = LDLM_IBITS,
+						  .ei_mode = mode,
+						  .ei_cb_bl =
+						  ll_md_blocking_ast,
+						  .ei_cb_cp =
+						  ldlm_completion_ast,
+						  .ei_cb_gl = NULL,
+						  .ei_cb_wg = NULL,
+						  .ei_cbdata = NULL};
+		struct lookup_intent it = { .it_op = IT_READDIR };
+		struct ptlrpc_request *request;
+		struct md_op_data *op_data;
+
+		op_data = ll_prep_md_op_data(NULL, dir, NULL, NULL, 0, 0,
+		LUSTRE_OPC_ANY, NULL);
+		if (IS_ERR(op_data))
+			return (void *)op_data;
+
+		rc = md_enqueue(ll_i2sbi(dir)->ll_md_exp, &einfo, &it,
+				op_data, &lockh, NULL, 0, NULL, 0);
+
+		ll_finish_md_op_data(op_data);
+
+		request = (struct ptlrpc_request *)it.d.lustre.it_data;
+		if (request)
+			ptlrpc_req_finished(request);
+		if (rc < 0) {
+			CERROR("lock enqueue: "DFID" at "LPU64": rc %d\n",
+				PFID(ll_inode2fid(dir)), hash, rc);
+			return ERR_PTR(rc);
+		}
+
+		CDEBUG(D_INODE, "setting lr_lvb_inode to inode %p (%lu/%u)\n",
+		       dir, dir->i_ino, dir->i_generation);
+		md_set_lock_data(ll_i2sbi(dir)->ll_md_exp,
+				 &it.d.lustre.it_lock_handle, dir, NULL);
+	} else {
+		/* for cross-ref object, l_ast_data of the lock may not be set,
+		 * we reset it here */
+		md_set_lock_data(ll_i2sbi(dir)->ll_md_exp, &lockh.cookie,
+				 dir, NULL);
+	}
+	ldlm_lock_dump_handle(D_OTHER, &lockh);
+	/* lli_readdir_mutex covers both the cache lookup and the page read. */
+	mutex_lock(&lli->lli_readdir_mutex);
+	page = ll_dir_page_locate(dir, &lhash, &start, &end);
+	if (IS_ERR(page)) {
+		CERROR("dir page locate: "DFID" at "LPU64": rc %ld\n",
+		       PFID(ll_inode2fid(dir)), lhash, PTR_ERR(page));
+		GOTO(out_unlock, page);
+	} else if (page != NULL) {
+		/*
+		 * XXX nikita: not entirely correct handling of a corner case:
+		 * suppose hash chain of entries with hash value HASH crosses
+		 * border between pages P0 and P1. First both P0 and P1 are
+		 * cached, seekdir() is called for some entry from the P0 part
+		 * of the chain. Later P0 goes out of cache. telldir(HASH)
+		 * happens and finds P1, as it starts with matching hash
+		 * value. Remaining entries from P0 part of the chain are
+		 * skipped. (Is that really a bug?)
+		 *
+		 * Possible solutions: 0. don't cache P1 is such case, handle
+		 * it as an "overflow" page. 1. invalidate all pages at
+		 * once. 2. use HASH|1 as an index for P1.
+		 */
+		GOTO(hash_collision, page);
+	}
+	/* Nothing usable cached: read the page in through ll_dir_filler(). */
+	page = read_cache_page(mapping, hash_x_index(hash, hash64),
+			       ll_dir_filler, &lhash);
+	if (IS_ERR(page)) {
+		CERROR("read cache page: "DFID" at "LPU64": rc %ld\n",
+		       PFID(ll_inode2fid(dir)), hash, PTR_ERR(page));
+		GOTO(out_unlock, page);
+	}
+
+	wait_on_page_locked(page);
+	(void)kmap(page);	/* undone by ll_release_page() */
+	if (!PageUptodate(page)) {
+		CERROR("page not updated: "DFID" at "LPU64": rc %d\n",
+		       PFID(ll_inode2fid(dir)), hash, -5);
+		goto fail;
+	}
+	if (!PageChecked(page))
+		ll_check_page(dir, page);
+	if (PageError(page)) {
+		CERROR("page error: "DFID" at "LPU64": rc %d\n",
+		       PFID(ll_inode2fid(dir)), hash, -5);
+		goto fail;
+	}
+hash_collision:
+	dp = page_address(page);
+	if (BITS_PER_LONG == 32 && hash64) {
+		start = le64_to_cpu(dp->ldp_hash_start) >> 32;
+		end   = le64_to_cpu(dp->ldp_hash_end) >> 32;
+		lhash = hash >> 32;
+	} else {
+		start = le64_to_cpu(dp->ldp_hash_start);
+		end   = le64_to_cpu(dp->ldp_hash_end);
+		lhash = hash;
+	}
+	if (end == start) {
+		LASSERT(start == lhash);
+		CWARN("Page-wide hash collision: "LPU64"\n", end);
+		if (BITS_PER_LONG == 32 && hash64)
+			CWARN("Real page-wide hash collision at ["LPU64" "LPU64
+			      "] with hash "LPU64"\n",
+			      le64_to_cpu(dp->ldp_hash_start),
+			      le64_to_cpu(dp->ldp_hash_end), hash);
+		/*
+		 * Fetch whole overflow chain...
+		 *
+		 * XXX not yet.
+		 */
+		goto fail;
+	}
+out_unlock:
+	mutex_unlock(&lli->lli_readdir_mutex);
+	ldlm_lock_decref(&lockh, mode);
+	return page;
+
+fail:
+	ll_release_page(page, 1);
+	page = ERR_PTR(-EIO);
+	goto out_unlock;
+}
+
+int ll_dir_read(struct inode *inode, __u64 *_pos, void *cookie,
+		filldir_t filldir)
+{
+	struct ll_inode_info *info       = ll_i2info(inode);
+	struct ll_sb_info    *sbi	= ll_i2sbi(inode);
+	__u64		 pos	= *_pos;
+	int		   api32      = ll_need_32bit_api(sbi);
+	int		   hash64     = sbi->ll_flags & LL_SBI_64BIT_HASH;
+	struct page	  *page;
+	struct ll_dir_chain   chain;
+	int		   done = 0;
+	int		   rc = 0;
+	ENTRY;
+
+	ll_dir_chain_init(&chain);
+	/* Start at the page for *_pos, then follow each page's ldp_hash_end. */
+	page = ll_get_dir_page(inode, pos, &chain);
+
+	while (rc == 0 && !done) {
+		struct lu_dirpage *dp;
+		struct lu_dirent  *ent;
+
+		if (!IS_ERR(page)) {
+			/*
+			 * If page is empty (end of directory is reached),
+			 * use this value.
+			 */
+			__u64 hash = MDS_DIR_END_OFF;
+			__u64 next;
+
+			dp = page_address(page);
+			for (ent = lu_dirent_start(dp); ent != NULL && !done;
+			     ent = lu_dirent_next(ent)) {
+				__u16	  type;
+				int	    namelen;
+				struct lu_fid  fid;
+				__u64	  lhash;
+				__u64	  ino;
+
+				/*
+				 * XXX: implement correct swabbing here.
+				 */
+
+				hash = le64_to_cpu(ent->lde_hash);
+				if (hash < pos)
+					/*
+					 * Skip until we find target hash
+					 * value.
+					 */
+					continue;
+
+				namelen = le16_to_cpu(ent->lde_namelen);
+				if (namelen == 0)
+					/*
+					 * Skip dummy record.
+					 */
+					continue;
+
+				if (api32 && hash64)
+					lhash = hash >> 32;
+				else
+					lhash = hash;
+				fid_le_to_cpu(&fid, &ent->lde_fid);
+				ino = cl_fid_build_ino(&fid, api32);
+				type = ll_dirent_type_get(ent);
+				/* For 'll_nfs_get_name_filldir()', it will try
+				 * to access the 'ent' through its 'lde_name',
+				 * so the parameter 'name' for 'filldir()' must
+				 * be part of the 'ent'. */
+				done = filldir(cookie, ent->lde_name, namelen,
+					       lhash, ino, type);
+			}
+			next = le64_to_cpu(dp->ldp_hash_end);
+			if (!done) {
+				pos = next;
+				if (pos == MDS_DIR_END_OFF) {
+					/*
+					 * End of directory reached.
+					 */
+					done = 1;
+					ll_release_page(page, 0);
+				} else if (1 /* chain is exhausted*/) {
+					/*
+					 * Normal case: continue to the next
+					 * page.
+					 */
+					ll_release_page(page,
+					    le32_to_cpu(dp->ldp_flags) &
+							LDF_COLLIDE);
+					next = pos; /* NOTE(review): value unused */
+					page = ll_get_dir_page(inode, pos,
+							       &chain);
+				} else {
+					/*
+					 * go into overflow page.
+					 */
+					LASSERT(le32_to_cpu(dp->ldp_flags) &
+						LDF_COLLIDE);
+					ll_release_page(page, 1);
+				}
+			} else {
+				pos = hash;
+				ll_release_page(page, 0);
+			}
+		} else {
+			rc = PTR_ERR(page);
+			CERROR("error reading dir "DFID" at %lu: rc %d\n",
+			       PFID(&info->lli_fid), (unsigned long)pos, rc);
+		}
+	}
+	/* Hand the resume position back to the caller, also on error. */
+	*_pos = pos;
+	ll_dir_chain_fini(&chain);
+	RETURN(rc);
+}
+
+/*
+ * ->readdir: resume from the hash cookie saved in the file's private data,
+ * emit entries through ll_dir_read() and publish the new position in f_pos.
+ */
+static int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
+{
+	struct inode *dir = filp->f_dentry->d_inode;
+	struct ll_file_data *fd = LUSTRE_FPRIVATE(filp);
+	struct ll_sb_info *sbi = ll_i2sbi(dir);
+	__u64 cursor = fd->lfd_pos;
+	int hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH;
+	int api32 = ll_need_32bit_api(sbi);
+	struct path path;
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) pos %lu/%llu "
+	       " 32bit_api %d\n", dir->i_ino, dir->i_generation,
+	       dir, (unsigned long)cursor, i_size_read(dir), api32);
+
+	/* Already at end-of-file: nothing to emit. */
+	if (cursor == MDS_DIR_END_OFF)
+		GOTO(out, rc = 0);
+
+	rc = ll_dir_read(dir, &cursor, cookie, filldir);
+	fd->lfd_pos = cursor;
+
+	/* With the 32-bit API, f_pos gets the 32-bit form of the cookie. */
+	if (cursor == MDS_DIR_END_OFF)
+		filp->f_pos = api32 ? LL_DIR_END_OFF_32BIT : LL_DIR_END_OFF;
+	else if (api32 && hash64)
+		filp->f_pos = cursor >> 32;
+	else
+		filp->f_pos = cursor;
+
+	filp->f_version = dir->i_version;
+	path.mnt = filp->f_path.mnt;
+	path.dentry = filp->f_dentry;
+	touch_atime(&path);
+
+out:
+	if (rc == 0)
+		ll_stats_ops_tally(sbi, LPROC_LL_READDIR, 1);
+
+	RETURN(rc);
+}
+
+/* Send @string to @mgc as KEY_SET_INFO; it is cut to fit, NUL included. */
+int ll_send_mgc_param(struct obd_export *mgc, char *string)
+{
+	struct mgs_send_param *msp;
+	int rc;
+
+	OBD_ALLOC_PTR(msp);
+	if (!msp)
+		return -ENOMEM;
+	strncpy(msp->mgs_param, string, MGS_PARAM_MAXLEN - 1);
+	msp->mgs_param[MGS_PARAM_MAXLEN - 1] = '\0';
+	rc = obd_set_info_async(NULL, mgc, sizeof(KEY_SET_INFO), KEY_SET_INFO,
+				sizeof(struct mgs_send_param), msp, NULL);
+	if (rc)
+		CERROR("Failed to set parameter: %d\n", rc);
+	OBD_FREE_PTR(msp);
+	return rc;
+}
+
+/*
+ * Create dir @filename under @dir with @lump as create data (CLI_SET_MEA).
+ */
+int ll_dir_setdirstripe(struct inode *dir, struct lmv_user_md *lump,
+			char *filename)
+{
+	struct ll_sb_info *sbi = ll_i2sbi(dir);
+	struct ptlrpc_request *req = NULL;
+	struct md_op_data *op;
+	int mode, err;
+	ENTRY;
+
+	mode = S_IFDIR | (0755 & (S_IRWXUGO|S_ISVTX) & ~current->fs->umask);
+	op = ll_prep_md_op_data(NULL, dir, NULL, filename, strlen(filename),
+				mode, LUSTRE_OPC_MKDIR, lump);
+	if (IS_ERR(op))
+		GOTO(err_exit, err = PTR_ERR(op));
+
+	op->op_cli_flags |= CLI_SET_MEA;
+	err = md_create(sbi->ll_md_exp, op, lump, sizeof(*lump), mode,
+			current_fsuid(), current_fsgid(),
+			cfs_curproc_cap_pack(), 0, &req);
+	ll_finish_md_op_data(op);
+	if (err)
+		GOTO(err_exit, err);
+err_exit:
+	ptlrpc_req_finished(req);
+	return err;
+}
+
+int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
+		     int set_default)
+{
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct md_op_data *op_data;
+	struct ptlrpc_request *req = NULL;
+	int rc = 0;
+	struct lustre_sb_info *lsi = s2lsi(inode->i_sb);
+	struct obd_device *mgc = lsi->lsi_mgc;
+	int lum_size;
+	ENTRY;
+
+	if (lump != NULL) {
+		/*
+		 * This is coming from userspace, so should be in
+		 * local endian.  But the MDS would like it in little
+		 * endian, so we swab it before we send it.
+		 */
+		switch (lump->lmm_magic) {
+		case LOV_USER_MAGIC_V1: {
+			if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1))
+				lustre_swab_lov_user_md_v1(lump);
+			lum_size = sizeof(struct lov_user_md_v1);
+			break;
+		}
+		case LOV_USER_MAGIC_V3: {
+			if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3))
+				lustre_swab_lov_user_md_v3(
+					(struct lov_user_md_v3 *)lump);
+			lum_size = sizeof(struct lov_user_md_v3);
+			break;
+		}
+		default: {
+			CDEBUG(D_IOCTL, "bad userland LOV MAGIC:"
+					" %#08x != %#08x nor %#08x\n",
+					lump->lmm_magic, LOV_USER_MAGIC_V1,
+					LOV_USER_MAGIC_V3);
+			RETURN(-EINVAL);
+		}
+		}
+	} else {
+		lum_size = sizeof(struct lov_user_md_v1);
+	}
+	/* Send the (possibly NULL) layout to the MDS as a setattr. */
+	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+				     LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+	/* NOTE(review): the switch above already rejected other magics. */
+	if (lump != NULL && lump->lmm_magic == cpu_to_le32(LMV_USER_MAGIC))
+		op_data->op_cli_flags |= CLI_SET_MEA;
+
+	/* swabbing is done in lov_setstripe() on server side */
+	rc = md_setattr(sbi->ll_md_exp, op_data, lump, lum_size,
+			NULL, 0, &req, NULL);
+	ll_finish_md_op_data(op_data);
+	ptlrpc_req_finished(req);
+	if (rc) {
+		if (rc != -EPERM && rc != -EACCES)
+			CERROR("mdc_setattr fails: rc = %d\n", rc);
+	}
+	/* NB: a md_setattr() failure above does not skip the block below. */
+	/* In the following we use the fact that LOV_USER_MAGIC_V1 and
+	 LOV_USER_MAGIC_V3 have the same initial fields so we do not
+	 need to make the distinction between the 2 versions */
+	if (set_default && mgc->u.cli.cl_mgc_mgsexp) {
+		char *param = NULL;
+		char *buf;
+
+		OBD_ALLOC(param, MGS_PARAM_MAXLEN);
+		if (param == NULL)
+			GOTO(end, rc = -ENOMEM);
+
+		buf = param;
+		/* Get fsname and assume devname to be -MDT0000. */
+		ll_get_fsname(inode->i_sb, buf, MTI_NAME_MAXLEN);
+		strcat(buf, "-MDT0000.lov");
+		buf += strlen(buf);
+
+		/* Set root stripesize */
+		sprintf(buf, ".stripesize=%u",
+			lump ? le32_to_cpu(lump->lmm_stripe_size) : 0);
+		rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
+		if (rc)
+			GOTO(end, rc);
+
+		/* Set root stripecount */
+		sprintf(buf, ".stripecount=%hd",
+			lump ? le16_to_cpu(lump->lmm_stripe_count) : 0);
+		rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
+		if (rc)
+			GOTO(end, rc);
+
+		/* Set root stripeoffset */
+		sprintf(buf, ".stripeoffset=%hd",
+			lump ? le16_to_cpu(lump->lmm_stripe_offset) :
+			(typeof(lump->lmm_stripe_offset))(-1));
+		rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
+
+end:
+		if (param != NULL)
+			OBD_FREE(param, MGS_PARAM_MAXLEN);
+	}
+	RETURN(rc);
+}
+
+/*
+ * Fetch the striping EA of directory @inode with md_getattr().  @lmmp,
+ * @lmm_size and @request are only written once md_getattr() has been called.
+ */
+int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp,
+		     int *lmm_size, struct ptlrpc_request **request)
+{
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct ptlrpc_request *req = NULL;
+	struct lov_mds_md *lmm = NULL;
+	struct md_op_data *op_data;
+	struct mdt_body *body;
+	int rc, lmmsize;
+
+	rc = ll_get_max_mdsize(sbi, &lmmsize);
+	if (rc)
+		RETURN(rc);
+
+	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, lmmsize,
+				     LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+
+	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
+	rc = md_getattr(sbi->ll_md_exp, op_data, &req);
+	ll_finish_md_op_data(op_data);
+	if (rc < 0) {
+		CDEBUG(D_INFO, "md_getattr failed on inode "
+		       "%lu/%u: rc %d\n", inode->i_ino,
+		       inode->i_generation, rc);
+		GOTO(out, rc);
+	}
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	LASSERT(body != NULL);
+
+	/* Report the EA size from the reply rather than the maximum. */
+	lmmsize = body->eadatasize;
+	if (lmmsize == 0 ||
+	    (body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) == 0)
+		GOTO(out, rc = -ENODATA);
+
+	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD,
+					   lmmsize);
+	LASSERT(lmm != NULL);
+
+	/*
+	 * The EA comes from the MDS, so it is probably little endian; it is
+	 * swabbed to host order if needed.  Objects are not swabbed for dirs.
+	 */
+	switch (le32_to_cpu(lmm->lmm_magic)) {
+	case LOV_MAGIC_V1:
+		if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC))
+			lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
+		break;
+	case LOV_MAGIC_V3:
+		if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC))
+			lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
+		break;
+	default:
+		CERROR("unknown magic: %lX\n", (unsigned long)lmm->lmm_magic);
+		rc = -EPROTO;
+	}
+out:
+	*lmmp = lmm;
+	*lmm_size = lmmsize;
+	*request = req;
+	return rc;
+}
+
+/*
+ * Look up the index of the MDT holding this inode.
+ *
+ * Issues md_getattr() with MF_GET_MDT_IDX set and returns the op_mds value
+ * found in the op_data afterwards (presumably filled in by the MD layer),
+ * or a negative errno if preparing the op_data or the getattr failed.
+ */
+int ll_get_mdt_idx(struct inode *inode)
+{
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct md_op_data *op;
+	int idx;
+	int rc;
+	ENTRY;
+
+	op = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+				LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op))
+		RETURN(PTR_ERR(op));
+
+	op->op_flags |= MF_GET_MDT_IDX;
+	rc = md_getattr(sbi->ll_md_exp, op, NULL);
+	/* read the index before the op_data is released */
+	idx = op->op_mds;
+	ll_finish_md_op_data(op);
+
+	if (rc >= 0)
+		return idx;
+
+	CDEBUG(D_INFO, "md_getattr_name: %d\n", rc);
+	RETURN(rc);
+}
+
+/**
+ * Pre-copy step of an HSM copy.
+ *
+ * Sends an initial hsm_progress (extent length 0) to the coordinator so that
+ * it knows the copytool has started on the request.
+ *
+ * For an ARCHIVE request the current data version of the file is sampled and
+ * saved in \a copy->hc_data_version.  If the inode cannot be found or the
+ * version cannot be read, the progress report is still sent, flagged with
+ * HP_FLAG_RETRY and carrying the (positive) error value.
+ *
+ * \return the result of sending the progress report; 0 on success.
+ */
+static int ll_ioc_copy_start(struct super_block *sb, struct hsm_copy *copy)
+{
+	struct ll_sb_info		*sbi = ll_s2sbi(sb);
+	struct hsm_progress_kernel	 hpk;
+	struct inode			*inode;
+	__u64				 version = 0;
+	int				 rc;
+	ENTRY;
+
+	/* Build the initial progress report out of the copy descriptor. */
+	hpk.hpk_fid = copy->hc_hai.hai_fid;
+	hpk.hpk_cookie = copy->hc_hai.hai_cookie;
+	hpk.hpk_extent.offset = copy->hc_hai.hai_extent.offset;
+	hpk.hpk_extent.length = 0;
+	hpk.hpk_flags = 0;
+	hpk.hpk_errval = 0;
+	hpk.hpk_data_version = 0;
+
+	/* Only archive requests need the current file version. */
+	if (copy->hc_hai.hai_action != HSMA_ARCHIVE)
+		goto progress;
+
+	/* Find the inode behind the fid */
+	inode = search_inode_for_lustre(sb, &copy->hc_hai.hai_fid);
+	if (IS_ERR(inode)) {
+		hpk.hpk_flags |= HP_FLAG_RETRY;
+		/* hpk_errval carries a non-negative value */
+		hpk.hpk_errval = -PTR_ERR(inode);
+		GOTO(progress, rc = PTR_ERR(inode));
+	}
+
+	/* Sample the data version, then drop the inode reference */
+	rc = ll_data_version(inode, &version, 1);
+	iput(inode);
+	if (rc == 0) {
+		/* Remember it in the hsm_copy for later use by the
+		 * copytool. */
+		copy->hc_data_version = version;
+	} else {
+		CDEBUG(D_HSM, "Could not read file data version of "
+			      DFID" (rc = %d). Archive request ("
+			      LPX64") could not be done.\n",
+			      PFID(&copy->hc_hai.hai_fid), rc,
+			      copy->hc_hai.hai_cookie);
+		hpk.hpk_flags |= HP_FLAG_RETRY;
+		/* hpk_errval carries a non-negative value */
+		hpk.hpk_errval = -rc;
+		GOTO(progress, rc);
+	}
+
+progress:
+	rc = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk),
+			   &hpk, NULL);
+
+	RETURN(rc);
+}
+
+/**
+ * Post-copy step of an HSM copy.
+ *
+ * Sends the final hsm_progress (HP_FLAG_COMPLETED) to the coordinator, with
+ * the flags and error value reported by the copytool in \a copy.
+ *
+ * When the copytool reported no error:
+ * - for ARCHIVE, the file data version is sampled again and compared with the
+ *   one saved by ll_ioc_copy_start(); a mismatch is reported as EBUSY without
+ *   the retry flag.
+ * - for RESTORE, the file data version is sampled and forwarded to the
+ *   coordinator, which is useful if the file was imported as 'released'.
+ *
+ * \return the result of sending the progress report; 0 on success.
+ */
+static int ll_ioc_copy_end(struct super_block *sb, struct hsm_copy *copy)
+{
+	struct ll_sb_info		*sbi = ll_s2sbi(sb);
+	struct hsm_progress_kernel	 hpk;
+	struct inode			*inode;
+	__u64				 version = 0;
+	int				 archiving;
+	int				 rc;
+	ENTRY;
+
+	/* If you modify the logic here, also check llapi_hsm_copy_end(). */
+	/* Take care: copy->hc_hai.hai_action, len, gid and data are not
+	 * initialized if copy_end was called with copy == NULL.
+	 */
+
+	/* Build the final progress report out of the copy descriptor. */
+	hpk.hpk_fid = copy->hc_hai.hai_fid;
+	hpk.hpk_cookie = copy->hc_hai.hai_cookie;
+	hpk.hpk_extent = copy->hc_hai.hai_extent;
+	hpk.hpk_flags = copy->hc_flags | HP_FLAG_COMPLETED;
+	hpk.hpk_errval = copy->hc_errval;
+	hpk.hpk_data_version = 0;
+
+	/* A failed copy is reported as is, no version check needed. */
+	if (copy->hc_errval != 0)
+		goto progress;
+
+	/* Archive: verify the file data did not change meanwhile.
+	 * Restore: send the file data version, useful when the file was
+	 * created using hsm_import.
+	 * Any other action: nothing more to do.
+	 */
+	archiving = (copy->hc_hai.hai_action == HSMA_ARCHIVE);
+	if (!archiving && copy->hc_hai.hai_action != HSMA_RESTORE)
+		goto progress;
+
+	/* Find the inode behind the fid */
+	inode = search_inode_for_lustre(sb, &copy->hc_hai.hai_fid);
+	if (IS_ERR(inode)) {
+		hpk.hpk_flags |= HP_FLAG_RETRY;
+		/* hpk_errval carries a non-negative value */
+		hpk.hpk_errval = -PTR_ERR(inode);
+		GOTO(progress, rc = PTR_ERR(inode));
+	}
+
+	rc = ll_data_version(inode, &version, archiving);
+	iput(inode);
+	if (rc) {
+		CDEBUG(D_HSM, "Could not read file data version. "
+			      "Request could not be confirmed.\n");
+		if (hpk.hpk_errval == 0)
+			hpk.hpk_errval = -rc;
+		GOTO(progress, rc);
+	}
+
+	/* Pass the sampled version on to the coordinator. */
+	hpk.hpk_data_version = version;
+
+	/* The file could have been modified while it was being archived,
+	 * so compare against the version sampled at copy start. */
+	if (archiving && copy->hc_data_version != version) {
+		CDEBUG(D_HSM, "File data version mismatched. "
+		      "File content was changed during archiving. "
+		       DFID", start:"LPX64" current:"LPX64"\n",
+		       PFID(&copy->hc_hai.hai_fid),
+		       copy->hc_data_version, version);
+		/* File was changed, send error to cdt. Do not ask for
+		 * retry because if a file is modified frequently,
+		 * the cdt will loop on retried archive requests.
+		 * The policy engine will ask for a new archive later
+		 * when the file will not be modified for some tunable
+		 * time */
+		/* the caller itself is not notified */
+		hpk.hpk_flags &= ~HP_FLAG_RETRY;
+		/* hpk_errval carries a non-negative value */
+		hpk.hpk_errval = EBUSY;
+	}
+
+progress:
+	rc = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk),
+			   &hpk, NULL);
+
+	RETURN(rc);
+}
+
+
+/**
+ * Copy a fixed-size ioctl argument from user space and pass it to an export.
+ *
+ * \param cmd   ioctl command forwarded to obd_iocontrol()
+ * \param exp   export the command is sent to
+ * \param data  user-space address of the argument
+ * \param len   size of the argument in bytes
+ *
+ * \retval -ENOMEM  the kernel buffer could not be allocated
+ * \retval -EFAULT  the argument could not be read from user space
+ * \retval result of obd_iocontrol() otherwise
+ */
+static int copy_and_ioctl(int cmd, struct obd_export *exp, void *data, int len)
+{
+	void *ptr;
+	int rc;
+
+	OBD_ALLOC(ptr, len);
+	if (ptr == NULL)
+		return -ENOMEM;
+	if (copy_from_user(ptr, data, len)) {
+		OBD_FREE(ptr, len);
+		return -EFAULT;
+	}
+	/* hand over the kernel copy, never the raw user pointer */
+	rc = obd_iocontrol(cmd, exp, len, ptr, NULL);
+	OBD_FREE(ptr, len);
+	return rc;
+}
+
+/**
+ * Execute a quota control request on behalf of the ioctl handlers.
+ *
+ * After a per-command permission check the request takes one of two paths:
+ * - qc_valid != QC_GENERAL: the request targets a specific MDT/OST (by index
+ *   or UUID) and is passed to the matching export via OBD_IOC_QUOTACTL;
+ * - qc_valid == QC_GENERAL: the request is copied into an obd_quotactl and
+ *   sent through obd_quotactl() to the MD export; for Q_GETQUOTA the usage
+ *   may additionally be collected from the OSTs and MDTs.
+ *
+ * \param sbi   superblock info providing the MD and DT exports
+ * \param qctl  request; results are written back into it
+ *
+ * \retval 0 on success, negative errno on failure
+ */
+static int quotactl_ioctl(struct ll_sb_info *sbi, struct if_quotactl *qctl)
+{
+	int cmd = qctl->qc_cmd;
+	int type = qctl->qc_type;
+	int id = qctl->qc_id;
+	int valid = qctl->qc_valid;
+	int rc = 0;
+	ENTRY;
+
+	switch (cmd) {
+	case LUSTRE_Q_INVALIDATE:
+	case LUSTRE_Q_FINVALIDATE:
+	case Q_QUOTAON:
+	case Q_QUOTAOFF:
+	case Q_SETQUOTA:
+	case Q_SETINFO:
+		/* modifying commands: CFS_CAP_SYS_ADMIN required, and never
+		 * allowed when LL_SBI_RMT_CLIENT is set */
+		if (!cfs_capable(CFS_CAP_SYS_ADMIN) ||
+		    sbi->ll_flags & LL_SBI_RMT_CLIENT)
+			RETURN(-EPERM);
+		break;
+	case Q_GETQUOTA:
+		/* querying an id other than the caller's own euid / one of
+		 * its groups is subject to the same restriction */
+		if (((type == USRQUOTA && current_euid() != id) ||
+		     (type == GRPQUOTA && !in_egroup_p(id))) &&
+		    (!cfs_capable(CFS_CAP_SYS_ADMIN) ||
+		     sbi->ll_flags & LL_SBI_RMT_CLIENT))
+			RETURN(-EPERM);
+		break;
+	case Q_GETINFO:
+		break;
+	default:
+		CERROR("unsupported quotactl op: %#x\n", cmd);
+		RETURN(-ENOTTY);
+	}
+
+	if (valid != QC_GENERAL) {
+		if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
+			RETURN(-EOPNOTSUPP);
+
+		/* only the two query commands can be targeted; they are
+		 * rewritten to their Q_GETO* form for the call and restored
+		 * below on success */
+		if (cmd == Q_GETINFO)
+			qctl->qc_cmd = Q_GETOINFO;
+		else if (cmd == Q_GETQUOTA)
+			qctl->qc_cmd = Q_GETOQUOTA;
+		else
+			RETURN(-EINVAL);
+
+		switch (valid) {
+		case QC_MDTIDX:
+			rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp,
+					   sizeof(*qctl), qctl, NULL);
+			break;
+		case QC_OSTIDX:
+			rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_dt_exp,
+					   sizeof(*qctl), qctl, NULL);
+			break;
+		case QC_UUID:
+			/* try the MD export first and fall back to the DT
+			 * export when it answers -EAGAIN */
+			rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp,
+					   sizeof(*qctl), qctl, NULL);
+			if (rc == -EAGAIN)
+				rc = obd_iocontrol(OBD_IOC_QUOTACTL,
+						   sbi->ll_dt_exp,
+						   sizeof(*qctl), qctl, NULL);
+			break;
+		default:
+			rc = -EINVAL;
+			break;
+		}
+
+		/* NOTE(review): on failure qc_cmd keeps its rewritten
+		 * Q_GETO* value; the callers in this file do not copy qctl
+		 * back to user space in that case */
+		if (rc)
+			RETURN(rc);
+
+		qctl->qc_cmd = cmd;
+	} else {
+		struct obd_quotactl *oqctl;
+
+		OBD_ALLOC_PTR(oqctl);
+		if (oqctl == NULL)
+			RETURN(-ENOMEM);
+
+		QCTL_COPY(oqctl, qctl);
+		rc = obd_quotactl(sbi->ll_md_exp, oqctl);
+		if (rc) {
+			/* a failed Q_QUOTAON is followed by a Q_QUOTAOFF
+			 * (result ignored) unless the error is -EALREADY */
+			if (rc != -EALREADY && cmd == Q_QUOTAON) {
+				oqctl->qc_cmd = Q_QUOTAOFF;
+				obd_quotactl(sbi->ll_md_exp, oqctl);
+			}
+			OBD_FREE_PTR(oqctl);
+			RETURN(rc);
+		}
+		/* If QIF_SPACE is not set, client should collect the
+		 * space usage from OSSs by itself */
+		if (cmd == Q_GETQUOTA &&
+		    !(oqctl->qc_dqblk.dqb_valid & QIF_SPACE) &&
+		    !oqctl->qc_dqblk.dqb_curspace) {
+			struct obd_quotactl *oqctl_tmp;
+
+			OBD_ALLOC_PTR(oqctl_tmp);
+			if (oqctl_tmp == NULL)
+				GOTO(out, rc = -ENOMEM);
+
+			oqctl_tmp->qc_cmd = Q_GETOQUOTA;
+			oqctl_tmp->qc_id = oqctl->qc_id;
+			oqctl_tmp->qc_type = oqctl->qc_type;
+
+			/* collect space usage from OSTs */
+			oqctl_tmp->qc_dqblk.dqb_curspace = 0;
+			rc = obd_quotactl(sbi->ll_dt_exp, oqctl_tmp);
+			if (!rc || rc == -EREMOTEIO) {
+				oqctl->qc_dqblk.dqb_curspace =
+					oqctl_tmp->qc_dqblk.dqb_curspace;
+				oqctl->qc_dqblk.dqb_valid |= QIF_SPACE;
+			}
+
+			/* collect space & inode usage from MDTs */
+			oqctl_tmp->qc_dqblk.dqb_curspace = 0;
+			oqctl_tmp->qc_dqblk.dqb_curinodes = 0;
+			/* NOTE(review): rc of the OST query is overwritten
+			 * here, so the function returns the MDT query's rc
+			 * (including -EREMOTEIO, whose data is accepted
+			 * below) - confirm this is intended */
+			rc = obd_quotactl(sbi->ll_md_exp, oqctl_tmp);
+			if (!rc || rc == -EREMOTEIO) {
+				oqctl->qc_dqblk.dqb_curspace +=
+					oqctl_tmp->qc_dqblk.dqb_curspace;
+				oqctl->qc_dqblk.dqb_curinodes =
+					oqctl_tmp->qc_dqblk.dqb_curinodes;
+				oqctl->qc_dqblk.dqb_valid |= QIF_INODES;
+			} else {
+				/* without the MDT part the space figure is
+				 * not marked valid */
+				oqctl->qc_dqblk.dqb_valid &= ~QIF_SPACE;
+			}
+
+			OBD_FREE_PTR(oqctl_tmp);
+		}
+out:
+		/* hand the result back to the caller's structure, also when
+		 * arriving here with rc = -ENOMEM */
+		QCTL_COPY(qctl, oqctl);
+		OBD_FREE_PTR(oqctl);
+	}
+
+	RETURN(rc);
+}
+
+/**
+ * Copy a NUL-terminated path name from user space into a names-cache buffer.
+ *
+ * \param filename  user-space address of the string
+ *
+ * \retval buffer obtained from __getname() holding the string; release it
+ *         with ll_putname()
+ * \retval ERR_PTR(-ENOMEM)        no buffer available
+ * \retval ERR_PTR(-EFAULT)        the string could not be read (the error
+ *                                 returned by strncpy_from_user())
+ * \retval ERR_PTR(-ENOENT)        empty string
+ * \retval ERR_PTR(-ENAMETOOLONG)  no terminating NUL within PATH_MAX bytes
+ */
+static char *
+ll_getname(const char __user *filename)
+{
+	char *tmp = __getname();
+	long len;
+	int ret = 0;
+
+	if (!tmp)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * strncpy_from_user() returns a negative errno on a fault and the
+	 * count itself (here PATH_MAX) when it ran out of room before seeing
+	 * the terminating NUL, in which case tmp is not NUL-terminated.
+	 */
+	len = strncpy_from_user(tmp, filename, PATH_MAX);
+	if (len < 0)
+		ret = len;
+	else if (len == 0)
+		ret = -ENOENT;
+	else if (len >= PATH_MAX)
+		ret = -ENAMETOOLONG;
+
+	if (ret) {
+		__putname(tmp);
+		tmp = ERR_PTR(ret);
+	}
+	return tmp;
+}
+
+#define ll_putname(filename) __putname(filename)
+
+/**
+ * ioctl entry point for Lustre directories.
+ *
+ * Dispatches on \a cmd: inode flags/version, MDT lookups, directory and file
+ * striping, quota control, changelog and HSM requests are handled by the
+ * cases below.  tty ioctls are rejected with -ENOTTY and any command without
+ * a case of its own is forwarded to the data export via obd_iocontrol().
+ *
+ * \param file  open directory the ioctl was issued on
+ * \param cmd   ioctl command number
+ * \param arg   command argument, a user-space pointer for most commands
+ *
+ * \return 0 on success and a negative errno on failure for the commands
+ *         handled here; forwarded commands return what the callee returns.
+ */
+static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct obd_ioctl_data *data;
+	int rc = 0;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), cmd=%#x\n",
+	       inode->i_ino, inode->i_generation, inode, cmd);
+
+	/* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
+	if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
+		return -ENOTTY;
+
+	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
+	switch(cmd) {
+	case FSFILT_IOC_GETFLAGS:
+	case FSFILT_IOC_SETFLAGS:
+		RETURN(ll_iocontrol(inode, file, cmd, arg));
+	case FSFILT_IOC_GETVERSION_OLD:
+	case FSFILT_IOC_GETVERSION:
+		RETURN(put_user(inode->i_generation, (int *)arg));
+	/* We need to special case any other ioctls we want to handle,
+	 * to send them to the MDS/OST as appropriate and to properly
+	 * network encode the arg field.
+	case FSFILT_IOC_SETVERSION_OLD:
+	case FSFILT_IOC_SETVERSION:
+	*/
+	case LL_IOC_GET_MDTIDX: {
+		int mdtidx;
+
+		mdtidx = ll_get_mdt_idx(inode);
+		if (mdtidx < 0)
+			RETURN(mdtidx);
+
+		if (put_user((int)mdtidx, (int*)arg))
+			RETURN(-EFAULT);
+
+		return 0;
+	}
+	case IOC_MDC_LOOKUP: {
+		struct ptlrpc_request *request = NULL;
+		int namelen, len = 0;
+		char *buf = NULL;
+		char *filename;
+		struct md_op_data *op_data;
+
+		rc = obd_ioctl_getdata(&buf, &len, (void *)arg);
+		if (rc)
+			RETURN(rc);
+		data = (void *)buf;
+
+		filename = data->ioc_inlbuf1;
+		namelen = strlen(filename);
+
+		if (namelen < 1) {
+			CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n");
+			GOTO(out_free, rc = -EINVAL);
+		}
+
+		op_data = ll_prep_md_op_data(NULL, inode, NULL, filename, namelen,
+					     0, LUSTRE_OPC_ANY, NULL);
+		if (IS_ERR(op_data))
+			GOTO(out_free, rc = PTR_ERR(op_data));
+
+		/* only the name lookup matters, the reply is dropped */
+		op_data->op_valid = OBD_MD_FLID;
+		rc = md_getattr_name(sbi->ll_md_exp, op_data, &request);
+		ll_finish_md_op_data(op_data);
+		if (rc < 0) {
+			CDEBUG(D_INFO, "md_getattr_name: %d\n", rc);
+			GOTO(out_free, rc);
+		}
+		ptlrpc_req_finished(request);
+		EXIT;
+out_free:
+		obd_ioctl_freedata(buf, len);
+		return rc;
+	}
+	case LL_IOC_LMV_SETSTRIPE: {
+		struct lmv_user_md  *lum;
+		char		*buf = NULL;
+		char		*filename;
+		int		 namelen = 0;
+		int		 lumlen = 0;
+		int		 len;
+		int		 rc;
+
+		rc = obd_ioctl_getdata(&buf, &len, (void *)arg);
+		if (rc)
+			RETURN(rc);
+
+		data = (void *)buf;
+		if (data->ioc_inlbuf1 == NULL || data->ioc_inlbuf2 == NULL ||
+		    data->ioc_inllen1 == 0 || data->ioc_inllen2 == 0)
+			GOTO(lmv_out_free, rc = -EINVAL);
+
+		filename = data->ioc_inlbuf1;
+		namelen = data->ioc_inllen1;
+
+		if (namelen < 1) {
+			CDEBUG(D_INFO,
+			       "LL_IOC_LMV_SETSTRIPE missing filename\n");
+			GOTO(lmv_out_free, rc = -EINVAL);
+		}
+		lum = (struct lmv_user_md *)data->ioc_inlbuf2;
+		lumlen = data->ioc_inllen2;
+
+		if (lum->lum_magic != LMV_USER_MAGIC ||
+		    lumlen != sizeof(*lum)) {
+			/* log the error code that is actually returned */
+			CERROR("%s: wrong lum magic %x or size %d: rc = %d\n",
+			       filename, lum->lum_magic, lumlen, -EINVAL);
+			GOTO(lmv_out_free, rc = -EINVAL);
+		}
+
+		/**
+		 * ll_dir_setdirstripe will be used to set dir stripe
+		 *  mdc_create--->mdt_reint_create (with dirstripe)
+		 */
+		rc = ll_dir_setdirstripe(inode, lum, filename);
+lmv_out_free:
+		obd_ioctl_freedata(buf, len);
+		RETURN(rc);
+
+	}
+	case LL_IOC_LOV_SETSTRIPE: {
+		struct lov_user_md_v3 lumv3;
+		struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
+		struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
+		struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
+
+		int set_default = 0;
+
+		LASSERT(sizeof(lumv3) == sizeof(*lumv3p));
+		LASSERT(sizeof(lumv3.lmm_objects[0]) ==
+			sizeof(lumv3p->lmm_objects[0]));
+		/* first try with v1 which is smaller than v3 */
+		if (copy_from_user(lumv1, lumv1p, sizeof(*lumv1)))
+			RETURN(-EFAULT);
+
+		if ((lumv1->lmm_magic == LOV_USER_MAGIC_V3) ) {
+			if (copy_from_user(&lumv3, lumv3p, sizeof(lumv3)))
+				RETURN(-EFAULT);
+		}
+
+		/* striping set on the filesystem root is also recorded as
+		 * the default */
+		if (inode->i_sb->s_root == file->f_dentry)
+			set_default = 1;
+
+		/* in v1 and v3 cases lumv1 points to data */
+		rc = ll_dir_setstripe(inode, lumv1, set_default);
+
+		RETURN(rc);
+	}
+	case LL_IOC_LMV_GETSTRIPE: {
+		struct lmv_user_md *lump = (struct lmv_user_md *)arg;
+		struct lmv_user_md lum;
+		struct lmv_user_md *tmp;
+		int lum_size;
+		int rc = 0;
+		int mdtindex;
+
+		if (copy_from_user(&lum, lump, sizeof(struct lmv_user_md)))
+			RETURN(-EFAULT);
+
+		if (lum.lum_magic != LMV_MAGIC_V1)
+			RETURN(-EINVAL);
+
+		/* the answer always describes a single stripe */
+		lum_size = lmv_user_md_size(1, LMV_MAGIC_V1);
+		OBD_ALLOC(tmp, lum_size);
+		if (tmp == NULL)
+			GOTO(free_lmv, rc = -ENOMEM);
+
+		memcpy(tmp, &lum, sizeof(lum));
+		tmp->lum_type = LMV_STRIPE_TYPE;
+		tmp->lum_stripe_count = 1;
+		mdtindex = ll_get_mdt_idx(inode);
+		if (mdtindex < 0)
+			/* report the lookup failure itself, it is not an
+			 * allocation failure */
+			GOTO(free_lmv, rc = mdtindex);
+
+		tmp->lum_stripe_offset = mdtindex;
+		tmp->lum_objects[0].lum_mds = mdtindex;
+		memcpy(&tmp->lum_objects[0].lum_fid, ll_inode2fid(inode),
+		       sizeof(struct lu_fid));
+		if (copy_to_user((void *)arg, tmp, lum_size))
+			GOTO(free_lmv, rc = -EFAULT);
+free_lmv:
+		if (tmp)
+			OBD_FREE(tmp, lum_size);
+		RETURN(rc);
+	}
+	case LL_IOC_REMOVE_ENTRY: {
+		char		*filename = NULL;
+		int		 namelen = 0;
+		int		 rc;
+
+		/* Here is a little hack to avoid sending REINT_RMENTRY to
+		 * unsupported server, which might crash the server(LU-2730),
+		 * Because both LVB_TYPE and REINT_RMENTRY will be supported
+		 * on 2.4, we use OBD_CONNECT_LVB_TYPE to detect whether the
+		 * server will support REINT_RMENTRY XXX*/
+		if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_LVB_TYPE))
+			return -ENOTSUPP;
+
+		filename = ll_getname((const char *)arg);
+		if (IS_ERR(filename))
+			RETURN(PTR_ERR(filename));
+
+		namelen = strlen(filename);
+		if (namelen < 1)
+			GOTO(out_rmdir, rc = -EINVAL);
+
+		rc = ll_rmdir_entry(inode, filename, namelen);
+out_rmdir:
+		if (filename)
+			ll_putname(filename);
+		RETURN(rc);
+	}
+	case LL_IOC_LOV_SWAP_LAYOUTS:
+		RETURN(-EPERM);
+	case LL_IOC_OBD_STATFS:
+		RETURN(ll_obd_statfs(inode, (void *)arg));
+	case LL_IOC_LOV_GETSTRIPE:
+	case LL_IOC_MDC_GETINFO:
+	case IOC_MDC_GETFILEINFO:
+	case IOC_MDC_GETFILESTRIPE: {
+		struct ptlrpc_request *request = NULL;
+		struct lov_user_md *lump;
+		struct lov_mds_md *lmm = NULL;
+		struct mdt_body *body;
+		char *filename = NULL;
+		int lmmsize;
+
+		/* the IOC_MDC_GETFILE* variants name an entry of this
+		 * directory, the other two act on the directory itself */
+		if (cmd == IOC_MDC_GETFILEINFO ||
+		    cmd == IOC_MDC_GETFILESTRIPE) {
+			filename = ll_getname((const char *)arg);
+			if (IS_ERR(filename))
+				RETURN(PTR_ERR(filename));
+
+			rc = ll_lov_getstripe_ea_info(inode, filename, &lmm,
+						      &lmmsize, &request);
+		} else {
+			rc = ll_dir_getstripe(inode, &lmm, &lmmsize, &request);
+		}
+
+		if (request) {
+			body = req_capsule_server_get(&request->rq_pill,
+						      &RMF_MDT_BODY);
+			LASSERT(body != NULL);
+		} else {
+			GOTO(out_req, rc);
+		}
+
+		if (rc < 0) {
+			/* a missing EA is not an error for the "info"
+			 * variants, the stat data is still returned */
+			if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO ||
+					       cmd == LL_IOC_MDC_GETINFO))
+				GOTO(skip_lmm, rc = 0);
+			else
+				GOTO(out_req, rc);
+		}
+
+		if (cmd == IOC_MDC_GETFILESTRIPE ||
+		    cmd == LL_IOC_LOV_GETSTRIPE) {
+			lump = (struct lov_user_md *)arg;
+		} else {
+			struct lov_user_mds_data *lmdp;
+			lmdp = (struct lov_user_mds_data *)arg;
+			lump = &lmdp->lmd_lmm;
+		}
+		/* if the full EA cannot be stored, retry with the header
+		 * alone and report -EOVERFLOW */
+		if (copy_to_user(lump, lmm, lmmsize)) {
+			if (copy_to_user(lump, lmm, sizeof(*lump)))
+				GOTO(out_req, rc = -EFAULT);
+			rc = -EOVERFLOW;
+		}
+	skip_lmm:
+		if (cmd == IOC_MDC_GETFILEINFO || cmd == LL_IOC_MDC_GETINFO) {
+			struct lov_user_mds_data *lmdp;
+			lstat_t st = { 0 };
+
+			st.st_dev     = inode->i_sb->s_dev;
+			st.st_mode    = body->mode;
+			st.st_nlink   = body->nlink;
+			st.st_uid     = body->uid;
+			st.st_gid     = body->gid;
+			st.st_rdev    = body->rdev;
+			st.st_size    = body->size;
+			st.st_blksize = PAGE_CACHE_SIZE;
+			st.st_blocks  = body->blocks;
+			st.st_atime   = body->atime;
+			st.st_mtime   = body->mtime;
+			st.st_ctime   = body->ctime;
+			st.st_ino     = inode->i_ino;
+
+			lmdp = (struct lov_user_mds_data *)arg;
+			if (copy_to_user(&lmdp->lmd_st, &st, sizeof(st)))
+				GOTO(out_req, rc = -EFAULT);
+		}
+
+		EXIT;
+	out_req:
+		ptlrpc_req_finished(request);
+		if (filename)
+			ll_putname(filename);
+		return rc;
+	}
+	case IOC_LOV_GETINFO: {
+		struct lov_user_mds_data *lumd;
+		struct lov_stripe_md *lsm;
+		struct lov_user_md *lum;
+		struct lov_mds_md *lmm;
+		int lmmsize;
+		lstat_t st;
+
+		lumd = (struct lov_user_mds_data *)arg;
+		lum = &lumd->lmd_lmm;
+
+		rc = ll_get_max_mdsize(sbi, &lmmsize);
+		if (rc)
+			RETURN(rc);
+
+		OBD_ALLOC_LARGE(lmm, lmmsize);
+		/* the allocation may fail; do not copy into a NULL buffer */
+		if (lmm == NULL)
+			RETURN(-ENOMEM);
+		if (copy_from_user(lmm, lum, lmmsize))
+			GOTO(free_lmm, rc = -EFAULT);
+
+		switch (lmm->lmm_magic) {
+		case LOV_USER_MAGIC_V1:
+			if (LOV_USER_MAGIC_V1 == cpu_to_le32(LOV_USER_MAGIC_V1))
+				break;
+			/* swab objects first so that stripes num will be sane */
+			lustre_swab_lov_user_md_objects(
+				((struct lov_user_md_v1 *)lmm)->lmm_objects,
+				((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
+			lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
+			break;
+		case LOV_USER_MAGIC_V3:
+			if (LOV_USER_MAGIC_V3 == cpu_to_le32(LOV_USER_MAGIC_V3))
+				break;
+			/* swab objects first so that stripes num will be sane */
+			lustre_swab_lov_user_md_objects(
+				((struct lov_user_md_v3 *)lmm)->lmm_objects,
+				((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
+			lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
+			break;
+		default:
+			GOTO(free_lmm, rc = -EINVAL);
+		}
+
+		rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
+		if (rc < 0)
+			GOTO(free_lmm, rc = -ENOMEM);
+
+		/* Perform glimpse_size operation. */
+		memset(&st, 0, sizeof(st));
+
+		rc = ll_glimpse_ioctl(sbi, lsm, &st);
+		if (rc)
+			GOTO(free_lsm, rc);
+
+		if (copy_to_user(&lumd->lmd_st, &st, sizeof(st)))
+			GOTO(free_lsm, rc = -EFAULT);
+
+		EXIT;
+	free_lsm:
+		obd_free_memmd(sbi->ll_dt_exp, &lsm);
+	free_lmm:
+		OBD_FREE_LARGE(lmm, lmmsize);
+		return rc;
+	}
+	case OBD_IOC_LLOG_CATINFO: {
+		RETURN(-EOPNOTSUPP);
+	}
+	case OBD_IOC_QUOTACHECK: {
+		struct obd_quotactl *oqctl;
+		int error = 0;
+
+		if (!cfs_capable(CFS_CAP_SYS_ADMIN) ||
+		    sbi->ll_flags & LL_SBI_RMT_CLIENT)
+			RETURN(-EPERM);
+
+		OBD_ALLOC_PTR(oqctl);
+		if (!oqctl)
+			RETURN(-ENOMEM);
+		oqctl->qc_type = arg;
+		rc = obd_quotacheck(sbi->ll_md_exp, oqctl);
+		if (rc < 0) {
+			CDEBUG(D_INFO, "md_quotacheck failed: rc %d\n", rc);
+			error = rc;
+		}
+
+		/* the DT check runs even if the MD check failed; the MD
+		 * error takes precedence in the return value */
+		rc = obd_quotacheck(sbi->ll_dt_exp, oqctl);
+		if (rc < 0)
+			CDEBUG(D_INFO, "obd_quotacheck failed: rc %d\n", rc);
+
+		OBD_FREE_PTR(oqctl);
+		return error ?: rc;
+	}
+	case OBD_IOC_POLL_QUOTACHECK: {
+		struct if_quotacheck *check;
+
+		if (!cfs_capable(CFS_CAP_SYS_ADMIN) ||
+		    sbi->ll_flags & LL_SBI_RMT_CLIENT)
+			RETURN(-EPERM);
+
+		OBD_ALLOC_PTR(check);
+		if (!check)
+			RETURN(-ENOMEM);
+
+		rc = obd_iocontrol(cmd, sbi->ll_md_exp, 0, (void *)check,
+				   NULL);
+		if (rc) {
+			CDEBUG(D_QUOTA, "mdc ioctl %d failed: %d\n", cmd, rc);
+			if (copy_to_user((void *)arg, check,
+					     sizeof(*check)))
+				CDEBUG(D_QUOTA, "copy_to_user failed\n");
+			GOTO(out_poll, rc);
+		}
+
+		rc = obd_iocontrol(cmd, sbi->ll_dt_exp, 0, (void *)check,
+				   NULL);
+		if (rc) {
+			CDEBUG(D_QUOTA, "osc ioctl %d failed: %d\n", cmd, rc);
+			if (copy_to_user((void *)arg, check,
+					     sizeof(*check)))
+				CDEBUG(D_QUOTA, "copy_to_user failed\n");
+			GOTO(out_poll, rc);
+		}
+	out_poll:
+		OBD_FREE_PTR(check);
+		RETURN(rc);
+	}
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+	case LL_IOC_QUOTACTL_18: {
+		/* copy the old 1.x quota struct for internal use, then copy
+		 * back into old format struct.  For 1.8 compatibility. */
+		struct if_quotactl_18 *qctl_18;
+		struct if_quotactl *qctl_20;
+
+		OBD_ALLOC_PTR(qctl_18);
+		if (!qctl_18)
+			RETURN(-ENOMEM);
+
+		OBD_ALLOC_PTR(qctl_20);
+		if (!qctl_20)
+			GOTO(out_quotactl_18, rc = -ENOMEM);
+
+		/* an unreadable user buffer is a fault, not an allocation
+		 * failure */
+		if (copy_from_user(qctl_18, (void *)arg, sizeof(*qctl_18)))
+			GOTO(out_quotactl_20, rc = -EFAULT);
+
+		QCTL_COPY(qctl_20, qctl_18);
+		qctl_20->qc_idx = 0;
+
+		/* XXX: dqb_valid was borrowed as a flag to mark that
+		 *      only mds quota is wanted */
+		if (qctl_18->qc_cmd == Q_GETQUOTA &&
+		    qctl_18->qc_dqblk.dqb_valid) {
+			qctl_20->qc_valid = QC_MDTIDX;
+			qctl_20->qc_dqblk.dqb_valid = 0;
+		} else if (qctl_18->obd_uuid.uuid[0] != '\0') {
+			qctl_20->qc_valid = QC_UUID;
+			qctl_20->obd_uuid = qctl_18->obd_uuid;
+		} else {
+			qctl_20->qc_valid = QC_GENERAL;
+		}
+
+		rc = quotactl_ioctl(sbi, qctl_20);
+
+		if (rc == 0) {
+			QCTL_COPY(qctl_18, qctl_20);
+			qctl_18->obd_uuid = qctl_20->obd_uuid;
+
+			if (copy_to_user((void *)arg, qctl_18,
+					     sizeof(*qctl_18)))
+				rc = -EFAULT;
+		}
+
+	out_quotactl_20:
+		OBD_FREE_PTR(qctl_20);
+	out_quotactl_18:
+		OBD_FREE_PTR(qctl_18);
+		RETURN(rc);
+	}
+#else
+#warning "remove old LL_IOC_QUOTACTL_18 compatibility code"
+#endif /* LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0) */
+	case LL_IOC_QUOTACTL: {
+		struct if_quotactl *qctl;
+
+		OBD_ALLOC_PTR(qctl);
+		if (!qctl)
+			RETURN(-ENOMEM);
+
+		if (copy_from_user(qctl, (void *)arg, sizeof(*qctl)))
+			GOTO(out_quotactl, rc = -EFAULT);
+
+		rc = quotactl_ioctl(sbi, qctl);
+
+		if (rc == 0 && copy_to_user((void *)arg,qctl,sizeof(*qctl)))
+			rc = -EFAULT;
+
+	out_quotactl:
+		OBD_FREE_PTR(qctl);
+		RETURN(rc);
+	}
+	case OBD_IOC_GETDTNAME:
+	case OBD_IOC_GETMDNAME:
+		RETURN(ll_get_obd_name(inode, cmd, arg));
+	case LL_IOC_FLUSHCTX:
+		RETURN(ll_flush_ctx(inode));
+#ifdef CONFIG_FS_POSIX_ACL
+	case LL_IOC_RMTACL: {
+		/* only meaningful on the root of a remote client mount;
+		 * silently succeeds everywhere else */
+		if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
+		    inode == inode->i_sb->s_root->d_inode) {
+			struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+			LASSERT(fd != NULL);
+			rc = rct_add(&sbi->ll_rct, current_pid(), arg);
+			if (!rc)
+				fd->fd_flags |= LL_FILE_RMTACL;
+			RETURN(rc);
+		} else
+			RETURN(0);
+	}
+#endif
+	case LL_IOC_GETOBDCOUNT: {
+		int count, vallen;
+		struct obd_export *exp;
+
+		if (copy_from_user(&count, (int *)arg, sizeof(int)))
+			RETURN(-EFAULT);
+
+		/* get ost count when count is zero, get mdt count otherwise */
+		exp = count ? sbi->ll_md_exp : sbi->ll_dt_exp;
+		vallen = sizeof(count);
+		rc = obd_get_info(NULL, exp, sizeof(KEY_TGT_COUNT),
+				  KEY_TGT_COUNT, &vallen, &count, NULL);
+		if (rc) {
+			CERROR("get target count failed: %d\n", rc);
+			RETURN(rc);
+		}
+
+		if (copy_to_user((int *)arg, &count, sizeof(int)))
+			RETURN(-EFAULT);
+
+		RETURN(0);
+	}
+	case LL_IOC_PATH2FID:
+		if (copy_to_user((void *)arg, ll_inode2fid(inode),
+				     sizeof(struct lu_fid)))
+			RETURN(-EFAULT);
+		RETURN(0);
+	case LL_IOC_GET_CONNECT_FLAGS: {
+		RETURN(obd_iocontrol(cmd, sbi->ll_md_exp, 0, NULL, (void*)arg));
+	}
+	case OBD_IOC_CHANGELOG_SEND:
+	case OBD_IOC_CHANGELOG_CLEAR:
+		rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void *)arg,
+				    sizeof(struct ioc_changelog));
+		RETURN(rc);
+	case OBD_IOC_FID2PATH:
+		RETURN(ll_fid2path(inode, (void *)arg));
+	case LL_IOC_HSM_REQUEST: {
+		struct hsm_user_request	*hur;
+		int			 totalsize;
+
+		OBD_ALLOC_PTR(hur);
+		if (hur == NULL)
+			RETURN(-ENOMEM);
+
+		/* We don't know the true size yet; copy the fixed-size part */
+		if (copy_from_user(hur, (void *)arg, sizeof(*hur))) {
+			OBD_FREE_PTR(hur);
+			RETURN(-EFAULT);
+		}
+
+		/* Compute the whole struct size */
+		totalsize = hur_len(hur);
+		OBD_FREE_PTR(hur);
+		/* the size is derived from user-supplied counts; refuse a
+		 * value that went negative instead of allocating with it */
+		if (totalsize < 0)
+			RETURN(-E2BIG);
+		OBD_ALLOC_LARGE(hur, totalsize);
+		if (hur == NULL)
+			RETURN(-ENOMEM);
+
+		/* Copy the whole struct */
+		if (copy_from_user(hur, (void *)arg, totalsize)) {
+			OBD_FREE_LARGE(hur, totalsize);
+			RETURN(-EFAULT);
+		}
+
+		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), totalsize,
+				   hur, NULL);
+
+		OBD_FREE_LARGE(hur, totalsize);
+
+		RETURN(rc);
+	}
+	case LL_IOC_HSM_PROGRESS: {
+		struct hsm_progress_kernel	hpk;
+		struct hsm_progress		hp;
+
+		if (copy_from_user(&hp, (void *)arg, sizeof(hp)))
+			RETURN(-EFAULT);
+
+		/* translate the user-space progress record into the kernel
+		 * one; the data version is not supplied by user space */
+		hpk.hpk_fid = hp.hp_fid;
+		hpk.hpk_cookie = hp.hp_cookie;
+		hpk.hpk_extent = hp.hp_extent;
+		hpk.hpk_flags = hp.hp_flags;
+		hpk.hpk_errval = hp.hp_errval;
+		hpk.hpk_data_version = 0;
+
+		/* File may not exist in Lustre; all progress
+		 * reported to Lustre root */
+		rc = obd_iocontrol(cmd, sbi->ll_md_exp, sizeof(hpk), &hpk,
+				   NULL);
+		RETURN(rc);
+	}
+	case LL_IOC_HSM_CT_START:
+		rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void *)arg,
+				    sizeof(struct lustre_kernelcomm));
+		RETURN(rc);
+
+	case LL_IOC_HSM_COPY_START: {
+		struct hsm_copy	*copy;
+		int		 rc;
+
+		OBD_ALLOC_PTR(copy);
+		if (copy == NULL)
+			RETURN(-ENOMEM);
+		if (copy_from_user(copy, (char *)arg, sizeof(*copy))) {
+			OBD_FREE_PTR(copy);
+			RETURN(-EFAULT);
+		}
+
+		/* the descriptor is copied back whatever the outcome */
+		rc = ll_ioc_copy_start(inode->i_sb, copy);
+		if (copy_to_user((char *)arg, copy, sizeof(*copy)))
+			rc = -EFAULT;
+
+		OBD_FREE_PTR(copy);
+		RETURN(rc);
+	}
+	case LL_IOC_HSM_COPY_END: {
+		struct hsm_copy	*copy;
+		int		 rc;
+
+		OBD_ALLOC_PTR(copy);
+		if (copy == NULL)
+			RETURN(-ENOMEM);
+		if (copy_from_user(copy, (char *)arg, sizeof(*copy))) {
+			OBD_FREE_PTR(copy);
+			RETURN(-EFAULT);
+		}
+
+		/* the descriptor is copied back whatever the outcome */
+		rc = ll_ioc_copy_end(inode->i_sb, copy);
+		if (copy_to_user((char *)arg, copy, sizeof(*copy)))
+			rc = -EFAULT;
+
+		OBD_FREE_PTR(copy);
+		RETURN(rc);
+	}
+	default:
+		/* anything else goes to the data export untouched */
+		RETURN(obd_iocontrol(cmd, sbi->ll_dt_exp, 0, NULL,
+				     (void *)arg));
+	}
+}
+
+/**
+ * llseek for Lustre directories.
+ *
+ * Valid positions range from 0 to the end-of-directory offset, which is
+ * LL_DIR_END_OFF_32BIT when ll_need_32bit_api() says so and LL_DIR_END_OFF
+ * otherwise.  Besides file->f_pos the per-open position fd->lfd_pos is
+ * updated; it is set to MDS_DIR_END_OFF at end of directory, and shifted
+ * left by 32 bits for 32-bit API users when LL_SBI_64BIT_HASH is set.
+ *
+ * \retval new position on success
+ * \retval -EINVAL for an unknown \a origin or an out-of-range position
+ */
+static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	int api32 = ll_need_32bit_api(sbi);
+	loff_t end_off = api32 ? LL_DIR_END_OFF_32BIT : LL_DIR_END_OFF;
+	loff_t ret = -EINVAL;
+	ENTRY;
+
+	mutex_lock(&inode->i_mutex);
+
+	/* turn the request into an absolute position */
+	if (origin == SEEK_CUR) {
+		offset += file->f_pos;
+	} else if (origin == SEEK_END) {
+		/* nothing exists past the end of a directory */
+		if (offset > 0)
+			GOTO(out, ret);
+		offset += end_off;
+	} else if (origin != SEEK_SET) {
+		GOTO(out, ret);
+	}
+
+	if (offset < 0 || offset > end_off)
+		GOTO(out, ret);
+
+	if (offset != file->f_pos) {
+		if (offset == end_off)
+			fd->lfd_pos = MDS_DIR_END_OFF;
+		else if (api32 && sbi->ll_flags & LL_SBI_64BIT_HASH)
+			fd->lfd_pos = offset << 32;
+		else
+			fd->lfd_pos = offset;
+		file->f_pos = offset;
+		file->f_version = 0;
+	}
+	ret = offset;
+	GOTO(out, ret);
+
+out:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
+/* Open a directory: identical to opening a regular file, see ll_file_open(). */
+int ll_dir_open(struct inode *inode, struct file *file)
+{
+	int rc;
+	ENTRY;
+
+	rc = ll_file_open(inode, file);
+	RETURN(rc);
+}
+
+/* Release a directory: identical to a regular file, see ll_file_release(). */
+int ll_dir_release(struct inode *inode, struct file *file)
+{
+	int rc;
+	ENTRY;
+
+	rc = ll_file_release(inode, file);
+	RETURN(rc);
+}
+
+/* File operations installed on Lustre directories. */
+struct file_operations ll_dir_operations = {
+	/* open / close */
+	.open		= ll_dir_open,
+	.release	= ll_dir_release,
+	/* positioning and listing */
+	.llseek		= ll_dir_seek,
+	.read		= generic_read_dir,
+	.readdir	= ll_readdir,
+	/* control and persistence */
+	.unlocked_ioctl	= ll_dir_ioctl,
+	.fsync		= ll_fsync,
+};
diff --git a/drivers/staging/lustre/lustre/llite/file.c b/drivers/staging/lustre/lustre/llite/file.c
new file mode 100644
index 000000000000..d423de1eb5da
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/file.c
@@ -0,0 +1,3196 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/file.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+#include <lustre_dlm.h>
+#include <lustre_lite.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include "llite_internal.h"
+#include <lustre/ll_fiemap.h>
+
+#include "cl_object.h"
+
+/*
+ * Allocate a per-open ll_file_data from ll_file_data_slab.
+ *
+ * Returns the new structure with fd_write_failed cleared, or NULL when the
+ * slab allocation fails.  Callers (e.g. ll_file_open()) already test the
+ * result for NULL, so the allocation must not be dereferenced before that
+ * check.
+ */
+struct ll_file_data *ll_file_data_get(void)
+{
+	struct ll_file_data *fd;
+
+	OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
+	if (fd == NULL)
+		return NULL;
+
+	fd->fd_write_failed = false;
+	return fd;
+}
+
+/* Return @fd to ll_file_data_slab; a NULL @fd is silently ignored. */
+static void ll_file_data_put(struct ll_file_data *fd)
+{
+	if (fd == NULL)
+		return;
+
+	OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
+}
+
+/*
+ * Copy the attributes of @inode into @op_data: fid, mode, timestamps, size,
+ * block count, extended attribute flags and IO epoch.  When @fh is non-NULL
+ * it is copied into op_data->op_handle.  MDS_DATA_MODIFIED is added to
+ * op_bias if the inode carries LLIF_DATA_MODIFIED.
+ */
+void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
+			  struct lustre_handle *fh)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct ll_iattr *lattr = (struct ll_iattr *)&op_data->op_attr;
+
+	op_data->op_fid1 = lli->lli_fid;
+
+	op_data->op_attr.ia_mode = inode->i_mode;
+	op_data->op_attr.ia_atime = inode->i_atime;
+	op_data->op_attr.ia_mtime = inode->i_mtime;
+	op_data->op_attr.ia_ctime = inode->i_ctime;
+	op_data->op_attr.ia_size = i_size_read(inode);
+	op_data->op_attr_blocks = inode->i_blocks;
+	lattr->ia_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
+
+	op_data->op_ioepoch = lli->lli_ioepoch;
+	if (fh != NULL)
+		op_data->op_handle = *fh;
+	op_data->op_capa1 = ll_mdscapa_get(inode);
+
+	if (lli->lli_flags & LLIF_DATA_MODIFIED)
+		op_data->op_bias |= MDS_DATA_MODIFIED;
+}
+
+/**
+ * Prepare @op_data for the CLOSE rpc of open handle @och.
+ *
+ * Mode and timestamps are always marked valid.  For a handle opened for
+ * write, either size/blocks are marked valid as well (no SOM support on the
+ * MD export, or not a regular file) or the IO epoch is closed through
+ * ll_ioepoch_close().  Finally the inode attributes are packed into
+ * @op_data.
+ */
+static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
+			     struct obd_client_handle *och)
+{
+	ENTRY;
+
+	op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
+				    ATTR_MTIME_SET | ATTR_CTIME_SET;
+
+	if (och->och_flags & FMODE_WRITE) {
+		if (exp_connect_som(ll_i2mdexp(inode)) &&
+		    S_ISREG(inode->i_mode))
+			ll_ioepoch_close(inode, op_data, &och, 0);
+		else
+			op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
+	}
+
+	ll_pack_inode2opdata(inode, op_data, &och->och_fh);
+	ll_prep_md_op_data(op_data, inode, NULL, NULL,
+			   0, 0, LUSTRE_OPC_ANY, NULL);
+	EXIT;
+}
+
+/*
+ * Send the CLOSE rpc for open handle @och of @inode through @md_exp and
+ * dispose of the handle.
+ *
+ * On -EAGAIN from md_close() the Size-on-MDS attributes are refreshed via
+ * ll_som_update(); a failure there is logged and then ignored.  After a
+ * successful close the LLIF_DATA_MODIFIED inode flag is cleared if it was
+ * sent, and ll_objects_destroy() is run on the reply.
+ *
+ * At the end @och is either left for DONE_WRITING processing (SOM export,
+ * epoch not closed, regular file opened for write) or its replay data is
+ * cleared and it is freed.  The close request, if any, is released.
+ *
+ * Returns 0 or a negative errno.
+ */
+static int ll_close_inode_openhandle(struct obd_export *md_exp,
+				     struct inode *inode,
+				     struct obd_client_handle *och)
+{
+	struct obd_export *exp = ll_i2mdexp(inode);
+	struct md_op_data *op_data;
+	struct ptlrpc_request *req = NULL;
+	struct obd_device *obd = class_exp2obd(exp);
+	/* Defaults to "closed" so the early-exit paths below free @och. */
+	int epoch_close = 1;
+	int rc;
+	ENTRY;
+
+	if (obd == NULL) {
+		/*
+		 * XXX: in case of LMV, is this correct to access
+		 * ->exp_handle?
+		 */
+		CERROR("Invalid MDC connection handle "LPX64"\n",
+		       ll_i2mdexp(inode)->exp_handle.h_cookie);
+		GOTO(out, rc = 0);
+	}
+
+	OBD_ALLOC_PTR(op_data);
+	if (op_data == NULL)
+		GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
+
+	ll_prepare_close(inode, op_data, och);
+	/* Remember whether ll_prepare_close() closed the IO epoch. */
+	epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
+	rc = md_close(md_exp, op_data, och->och_mod, &req);
+	if (rc == -EAGAIN) {
+		/* This close must have the epoch closed. */
+		LASSERT(epoch_close);
+		/* MDS has instructed us to obtain Size-on-MDS attribute from
+		 * OSTs and send setattr to back to MDS. */
+		rc = ll_som_update(inode, op_data);
+		if (rc) {
+			CERROR("inode %lu mdc Size-on-MDS update failed: "
+			       "rc = %d\n", inode->i_ino, rc);
+			/* SOM update failure is not reported to the caller. */
+			rc = 0;
+		}
+	} else if (rc) {
+		CERROR("inode %lu mdc close failed: rc = %d\n",
+		       inode->i_ino, rc);
+	}
+
+	/* DATA_MODIFIED flag was successfully sent on close, cancel data
+	 * modification flag. */
+	if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
+		struct ll_inode_info *lli = ll_i2info(inode);
+
+		spin_lock(&lli->lli_lock);
+		lli->lli_flags &= ~LLIF_DATA_MODIFIED;
+		spin_unlock(&lli->lli_lock);
+	}
+
+	ll_finish_md_op_data(op_data);
+
+	if (rc == 0) {
+		rc = ll_objects_destroy(req, inode);
+		if (rc)
+			CERROR("inode %lu ll_objects destroy: rc = %d\n",
+			       inode->i_ino, rc);
+	}
+
+	EXIT;
+out:
+
+	/* Keep @och only when a DONE_WRITING still has to be sent for it. */
+	if (exp_connect_som(exp) && !epoch_close &&
+	    S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
+		ll_queue_done_writing(inode, LLIF_DONE_WRITING);
+	} else {
+		md_clear_open_replay_data(md_exp, och);
+		/* Free @och if it is not waiting for DONE_WRITING. */
+		och->och_fh.cookie = DEAD_HANDLE_MAGIC;
+		OBD_FREE_PTR(och);
+	}
+	if (req) /* This is close request */
+		ptlrpc_req_finished(req);
+	return rc;
+}
+
+/*
+ * Close the MDS open handle of @inode that corresponds to the access mode in
+ * @flags (write, exec or read, in that order of precedence).
+ *
+ * Nothing is done while the matching open count is non-zero.  Otherwise the
+ * handle is detached from the inode under lli_och_mutex and, if one was
+ * present, closed with ll_close_inode_openhandle().
+ *
+ * Returns 0 or the result of ll_close_inode_openhandle().
+ */
+int ll_md_real_close(struct inode *inode, int flags)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct obd_client_handle **slot;
+	struct obd_client_handle *handle;
+	__u64 *users;
+	int rc = 0;
+	ENTRY;
+
+	if (flags & FMODE_WRITE) {
+		slot = &lli->lli_mds_write_och;
+		users = &lli->lli_open_fd_write_count;
+	} else if (flags & FMODE_EXEC) {
+		slot = &lli->lli_mds_exec_och;
+		users = &lli->lli_open_fd_exec_count;
+	} else {
+		LASSERT(flags & FMODE_READ);
+		slot = &lli->lli_mds_read_och;
+		users = &lli->lli_open_fd_read_count;
+	}
+
+	mutex_lock(&lli->lli_och_mutex);
+	if (*users != 0) {
+		/* The handle is still in use, so it must not be freed. */
+		mutex_unlock(&lli->lli_och_mutex);
+		RETURN(0);
+	}
+	handle = *slot;
+	*slot = NULL;
+	mutex_unlock(&lli->lli_och_mutex);
+
+	/* Somebody else may have raced with us and freed it already. */
+	if (handle != NULL)
+		rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
+					       inode, handle);
+
+	RETURN(rc);
+}
+
+/*
+ * Release the per-open state attached to @file.
+ *
+ * Drops a group lock still held through this descriptor, decrements the
+ * open count matching the open mode and, unless a suitable OPEN lock is
+ * found with md_lock_match(), closes the MDS open handle through
+ * ll_md_real_close().  The ll_file_data is always detached from @file and
+ * freed.
+ *
+ * Returns 0 or the result of ll_md_real_close().
+ */
+int ll_md_close(struct obd_export *md_exp, struct inode *inode,
+		struct file *file)
+{
+	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+	struct ll_inode_info *lli = ll_i2info(inode);
+	int rc = 0;
+	ENTRY;
+
+	/* clear group lock, if present */
+	if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
+		ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
+
+	/* Let's see if we have good enough OPEN lock on the file and if
+	   we can skip talking to MDS */
+	if (file->f_dentry->d_inode) { /* Can this ever be false? */
+		int lockmode;
+		int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
+		struct lustre_handle lockh;
+		/* Named differently from the @inode parameter to avoid
+		 * shadowing it. */
+		struct inode *dinode = file->f_dentry->d_inode;
+		ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
+
+		mutex_lock(&lli->lli_och_mutex);
+		if (fd->fd_omode & FMODE_WRITE) {
+			lockmode = LCK_CW;
+			LASSERT(lli->lli_open_fd_write_count);
+			lli->lli_open_fd_write_count--;
+		} else if (fd->fd_omode & FMODE_EXEC) {
+			lockmode = LCK_PR;
+			LASSERT(lli->lli_open_fd_exec_count);
+			lli->lli_open_fd_exec_count--;
+		} else {
+			lockmode = LCK_CR;
+			LASSERT(lli->lli_open_fd_read_count);
+			lli->lli_open_fd_read_count--;
+		}
+		mutex_unlock(&lli->lli_och_mutex);
+
+		/* No matching OPEN lock: the MDS has to be told. */
+		if (!md_lock_match(md_exp, flags, ll_inode2fid(dinode),
+				   LDLM_IBITS, &policy, lockmode,
+				   &lockh)) {
+			rc = ll_md_real_close(dinode, fd->fd_omode);
+		}
+	} else {
+		/* The message is newline-terminated so that it does not run
+		 * into the next log record. */
+		CERROR("Releasing a file %p with negative dentry %p. Name %s\n",
+		       file, file->f_dentry, file->f_dentry->d_name.name);
+	}
+
+	LUSTRE_FPRIVATE(file) = NULL;
+	ll_file_data_put(fd);
+	ll_capa_close(inode);
+
+	RETURN(rc);
+}
+
+/* While this returns an error code, fput() the caller does not, so we need
+ * to make every effort to clean up all of our state here.  Also, applications
+ * rarely check close errors and even if an error is returned they will not
+ * re-try the close call.
+ *
+ * For the filesystem root dentry only the ll_file_data is released and 0 is
+ * returned; for every other file the result of ll_md_close() is returned.
+ */
+int ll_file_release(struct inode *inode, struct file *file)
+{
+	struct ll_file_data *fd;
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct ll_inode_info *lli = ll_i2info(inode);
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
+	       inode->i_generation, inode);
+
+#ifdef CONFIG_FS_POSIX_ACL
+	/* Remote-client ACL state is tracked on the root inode only; drop
+	 * the entries registered for the current pid. */
+	if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
+	    inode == inode->i_sb->s_root->d_inode) {
+		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+		LASSERT(fd != NULL);
+		if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
+			fd->fd_flags &= ~LL_FILE_RMTACL;
+			rct_del(&sbi->ll_rct, current_pid());
+			et_search_free(&sbi->ll_et, current_pid());
+		}
+	}
+#endif
+
+	/* Releases of the root dentry are not counted in the statistics. */
+	if (inode->i_sb->s_root != file->f_dentry)
+		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
+	fd = LUSTRE_FPRIVATE(file);
+	LASSERT(fd != NULL);
+
+	/* The last ref on @file, maybe not the owner pid of statahead.
+	 * Different processes can open the same dir, "ll_opendir_key" means:
+	 * it is me that should stop the statahead thread. */
+	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
+	    lli->lli_opendir_pid != 0)
+		ll_stop_statahead(inode, lli->lli_opendir_key);
+
+	/* The root dentry only owns an ll_file_data, see ll_file_open(). */
+	if (inode->i_sb->s_root == file->f_dentry) {
+		LUSTRE_FPRIVATE(file) = NULL;
+		ll_file_data_put(fd);
+		RETURN(0);
+	}
+
+	/* Discard any pending asynchronous write error for non-directories. */
+	if (!S_ISDIR(inode->i_mode)) {
+		lov_read_and_clear_async_rc(lli->lli_clob);
+		lli->lli_async_rc = 0;
+	}
+
+	rc = ll_md_close(sbi->ll_md_exp, inode, file);
+
+	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
+		libcfs_debug_dumplog();
+
+	RETURN(rc);
+}
+
+/*
+ * Issue an open-by-FID intent for the dentry behind @file.
+ *
+ * @lmm/@lmmsize optionally carry striping information; when both are
+ * empty an OPEN lock is requested as well.  On success the inode is
+ * refreshed from the reply with ll_prep_inode().  Before returning, one
+ * reference on the intent's request (it_data) is released,
+ * DISP_ENQ_COMPLETE is cleared and the intent lock is dropped.
+ *
+ * Returns 0 or a negative errno (-ENOENT if the dentry has no parent or
+ * the lookup was negative).
+ */
+static int ll_intent_file_open(struct file *file, void *lmm,
+			       int lmmsize, struct lookup_intent *itp)
+{
+	struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
+	struct dentry *parent = file->f_dentry->d_parent;
+	const char *name = file->f_dentry->d_name.name;
+	const int len = file->f_dentry->d_name.len;
+	struct md_op_data *op_data;
+	struct ptlrpc_request *req;
+	__u32 opc = LUSTRE_OPC_ANY;
+	int rc;
+	ENTRY;
+
+	if (!parent)
+		RETURN(-ENOENT);
+
+	/* Usually we come here only for NFSD, and we want open lock.
+	   But we can also get here with pre 2.6.15 patchless kernels, and in
+	   that case that lock is also ok */
+	/* We can also get here if there was cached open handle in revalidate_it
+	 * but it disappeared while we were getting from there to ll_file_open.
+	 * But this means this file was closed and immediately opened which
+	 * makes a good candidate for using OPEN lock */
+	/* If lmmsize & lmm are not 0, we are just setting stripe info
+	 * parameters. No need for the open lock */
+	if (lmm == NULL && lmmsize == 0) {
+		itp->it_flags |= MDS_OPEN_LOCK;
+		if (itp->it_flags & FMODE_WRITE)
+			opc = LUSTRE_OPC_CREATE;
+	}
+
+	op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
+				      file->f_dentry->d_inode, name, len,
+				      O_RDWR, opc, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+
+	itp->it_flags |= MDS_OPEN_BY_FID;
+	rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
+			    0 /*unused */, &req, ll_md_blocking_ast, 0);
+	ll_finish_md_op_data(op_data);
+	if (rc == -ESTALE) {
+		/* -ESTALE keeps its own exit path so that the log is not
+		 * flooded with messages about -ESTALE errors.
+		 */
+		if (!it_disposition(itp, DISP_OPEN_OPEN) ||
+		     it_open_error(DISP_OPEN_OPEN, itp))
+			GOTO(out, rc);
+		/* The open itself succeeded on the server: close it again. */
+		ll_release_openhandle(file->f_dentry, itp);
+		GOTO(out, rc);
+	}
+
+	if (it_disposition(itp, DISP_LOOKUP_NEG))
+		GOTO(out, rc = -ENOENT);
+
+	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
+		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
+		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
+		GOTO(out, rc);
+	}
+
+	rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
+	if (!rc && itp->d.lustre.it_lock_mode)
+		ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
+				 itp, NULL);
+
+out:
+	ptlrpc_req_finished(itp->d.lustre.it_data);
+	it_clear_disposition(itp, DISP_ENQ_COMPLETE);
+	ll_intent_drop_lock(itp);
+
+	RETURN(rc);
+}
+
+/**
+ * Record @ioepoch as the current IO epoch of the client inode.
+ *
+ * A zero @ioepoch or one equal to the stored value is ignored.  No lock is
+ * needed, MDS does not believe attributes if a few ioepoch holders exist.
+ * Attributes for previous ioepoch if new one is opened are also skipped by
+ * MDS.
+ */
+void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
+{
+	if (ioepoch == 0 || lli->lli_ioepoch == ioepoch)
+		return;
+
+	lli->lli_ioepoch = ioepoch;
+	CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
+	       ioepoch, PFID(&lli->lli_fid));
+}
+
+/*
+ * Initialise open handle @och from the open reply stored in intent @it:
+ * file handle, magic, fid and open flags.  The IO epoch from the reply is
+ * recorded in @lli, and the request is registered for open replay.
+ *
+ * Returns the result of md_set_open_replay_data().
+ */
+static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
+		       struct lookup_intent *it, struct obd_client_handle *och)
+{
+	struct ptlrpc_request *req;
+	struct mdt_body *body;
+	int rc;
+
+	LASSERT(och);
+
+	req = it->d.lustre.it_data;
+	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	/* reply already checked out */
+	LASSERT(body != NULL);
+
+	och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
+	och->och_fid = lli->lli_fid;
+	och->och_flags = it->it_flags;
+	memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
+	ll_ioepoch_open(lli, body->ioepoch);
+
+	rc = md_set_open_replay_data(md_exp, och, req);
+	return rc;
+}
+
+/*
+ * Finish an open on the client side: optionally fill the new open handle
+ * @och from the intent reply, then attach @fd to @file, initialise its
+ * readahead state and remember the open mode.
+ *
+ * @file must not have private data yet and @fd must be non-NULL.
+ * Returns 0, or the error from ll_och_fill() (in which case @fd is not
+ * attached).
+ */
+int ll_local_open(struct file *file, struct lookup_intent *it,
+		  struct ll_file_data *fd, struct obd_client_handle *och)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ll_inode_info *lli = ll_i2info(inode);
+	ENTRY;
+
+	LASSERT(!LUSTRE_FPRIVATE(file));
+	LASSERT(fd != NULL);
+
+	if (och != NULL) {
+		struct ptlrpc_request *req = it->d.lustre.it_data;
+		struct mdt_body *body;
+		int rc;
+
+		rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
+		if (rc != 0)
+			RETURN(rc);
+
+		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+		if ((it->it_flags & FMODE_WRITE) &&
+		    (body->valid & OBD_MD_FLSIZE)) {
+			CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
+			       lli->lli_ioepoch, PFID(&lli->lli_fid));
+		}
+	}
+
+	LUSTRE_FPRIVATE(file) = fd;
+	ll_readahead_init(inode, &fd->fd_ras);
+	fd->fd_omode = it->it_flags;
+
+	RETURN(0);
+}
+
+/* Open a file, and (for the very first open) create objects on the OSTs at
+ * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
+ * creation or open until ll_lov_setstripe() ioctl is called.
+ *
+ * If we already have the stripe MD locally then we don't request it in
+ * md_open(), by passing a lmm_size = 0.
+ *
+ * It is up to the application to ensure no other processes open this file
+ * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
+ * used.  We might be able to avoid races of that sort by getting lli_open_sem
+ * before returning in the O_LOV_DELAY_CREATE case and dropping it here
+ * or in ll_file_release(), but I'm not sure that is desirable/necessary.
+ *
+ * A lookup intent may be handed over in file->private_data; if it is absent
+ * or carries no disposition, a local IT_OPEN intent is built from
+ * file->f_flags.  An existing MDS open handle for the access mode is reused
+ * (its use count is incremented); otherwise a new one is allocated and
+ * filled from the intent reply.
+ *
+ * Returns 0 on success or a negative errno.  On failure the ll_file_data
+ * allocated here is freed and a statahead registration made by this open is
+ * undone.
+ */
+int ll_file_open(struct inode *inode, struct file *file)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
+					  .it_flags = file->f_flags };
+	struct obd_client_handle **och_p = NULL;
+	__u64 *och_usecount = NULL;
+	struct ll_file_data *fd;
+	int rc = 0, opendir_set = 0;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
+	       inode->i_generation, inode, file->f_flags);
+
+	it = file->private_data; /* XXX: compat macro */
+	file->private_data = NULL; /* prevent ll_local_open assertion */
+
+	fd = ll_file_data_get();
+	if (fd == NULL)
+		/* lli_och_mutex is not held yet, so the out_och_free path
+		 * (which unlocks it) must be bypassed. */
+		GOTO(out_openerr, rc = -ENOMEM);
+
+	fd->fd_file = file;
+	if (S_ISDIR(inode->i_mode)) {
+		/* The first opener of a directory becomes the statahead
+		 * owner; ll_file_release() compares against this key. */
+		spin_lock(&lli->lli_sa_lock);
+		if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
+		    lli->lli_opendir_pid == 0) {
+			lli->lli_opendir_key = fd;
+			lli->lli_opendir_pid = current_pid();
+			opendir_set = 1;
+		}
+		spin_unlock(&lli->lli_sa_lock);
+	}
+
+	/* The root dentry only gets an ll_file_data, no MDS open handle. */
+	if (inode->i_sb->s_root == file->f_dentry) {
+		LUSTRE_FPRIVATE(file) = fd;
+		RETURN(0);
+	}
+
+	if (!it || !it->d.lustre.it_disposition) {
+		/* Convert f_flags into access mode. We cannot use file->f_mode,
+		 * because everything but O_ACCMODE mask was stripped from
+		 * there */
+		if ((oit.it_flags + 1) & O_ACCMODE)
+			oit.it_flags++;
+		if (file->f_flags & O_TRUNC)
+			oit.it_flags |= FMODE_WRITE;
+
+		/* kernel only call f_op->open in dentry_open.  filp_open calls
+		 * dentry_open after call to open_namei that checks permissions.
+		 * Only nfsd_open call dentry_open directly without checking
+		 * permissions and because of that this code below is safe. */
+		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
+			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
+
+		/* We do not want O_EXCL here, presumably we opened the file
+		 * already? XXX - NFS implications? */
+		oit.it_flags &= ~O_EXCL;
+
+		/* bug20584, if "it_flags" contains O_CREAT, the file will be
+		 * created if necessary, then "IT_CREAT" should be set to keep
+		 * consistent with it */
+		if (oit.it_flags & O_CREAT)
+			oit.it_op |= IT_CREAT;
+
+		it = &oit;
+	}
+
+restart:
+	/* Let's see if we have file open on MDS already. */
+	if (it->it_flags & FMODE_WRITE) {
+		och_p = &lli->lli_mds_write_och;
+		och_usecount = &lli->lli_open_fd_write_count;
+	} else if (it->it_flags & FMODE_EXEC) {
+		och_p = &lli->lli_mds_exec_och;
+		och_usecount = &lli->lli_open_fd_exec_count;
+	} else {
+		och_p = &lli->lli_mds_read_och;
+		och_usecount = &lli->lli_open_fd_read_count;
+	}
+
+	mutex_lock(&lli->lli_och_mutex);
+	if (*och_p) { /* Open handle is present */
+		if (it_disposition(it, DISP_OPEN_OPEN)) {
+			/* Well, there's extra open request that we do not need,
+			   let's close it somehow. This will decref request. */
+			rc = it_open_error(DISP_OPEN_OPEN, it);
+			if (rc) {
+				mutex_unlock(&lli->lli_och_mutex);
+				GOTO(out_openerr, rc);
+			}
+
+			ll_release_openhandle(file->f_dentry, it);
+		}
+		(*och_usecount)++;
+
+		rc = ll_local_open(file, it, fd, NULL);
+		if (rc) {
+			(*och_usecount)--;
+			mutex_unlock(&lli->lli_och_mutex);
+			GOTO(out_openerr, rc);
+		}
+	} else {
+		LASSERT(*och_usecount == 0);
+		if (!it->d.lustre.it_disposition) {
+			/* We cannot just request lock handle now, new ELC code
+			   means that one of other OPEN locks for this file
+			   could be cancelled, and since blocking ast handler
+			   would attempt to grab och_mutex as well, that would
+			   result in a deadlock */
+			mutex_unlock(&lli->lli_och_mutex);
+			it->it_create_mode |= M_CHECK_STALE;
+			rc = ll_intent_file_open(file, NULL, 0, it);
+			it->it_create_mode &= ~M_CHECK_STALE;
+			if (rc)
+				GOTO(out_openerr, rc);
+
+			goto restart;
+		}
+		OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
+		if (!*och_p)
+			GOTO(out_och_free, rc = -ENOMEM);
+
+		(*och_usecount)++;
+
+		/* md_intent_lock() didn't get a request ref if there was an
+		 * open error, so don't do cleanup on the request here
+		 * (bug 3430) */
+		/* XXX (green): Should not we bail out on any error here, not
+		 * just open error? */
+		rc = it_open_error(DISP_OPEN_OPEN, it);
+		if (rc)
+			GOTO(out_och_free, rc);
+
+		LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
+
+		rc = ll_local_open(file, it, fd, *och_p);
+		if (rc)
+			GOTO(out_och_free, rc);
+	}
+	mutex_unlock(&lli->lli_och_mutex);
+	/* @fd now belongs to @file; keep the error path from freeing it. */
+	fd = NULL;
+
+	/* Must do this outside lli_och_mutex lock to prevent deadlock where
+	   different kind of OPEN lock for this same inode gets cancelled
+	   by ldlm_cancel_lru */
+	if (!S_ISREG(inode->i_mode))
+		GOTO(out_och_free, rc);
+
+	ll_capa_open(inode);
+
+	if (!lli->lli_has_smd) {
+		if (file->f_flags & O_LOV_DELAY_CREATE ||
+		    !(file->f_mode & FMODE_WRITE)) {
+			CDEBUG(D_INODE, "object creation was delayed\n");
+			GOTO(out_och_free, rc);
+		}
+	}
+	file->f_flags &= ~O_LOV_DELAY_CREATE;
+	GOTO(out_och_free, rc);
+
+out_och_free:
+	/* Reached with rc != 0 only while lli_och_mutex is held. */
+	if (rc) {
+		if (och_p && *och_p) {
+			OBD_FREE(*och_p, sizeof (struct obd_client_handle));
+			*och_p = NULL; /* OBD_FREE writes some magic there */
+			(*och_usecount)--;
+		}
+		mutex_unlock(&lli->lli_och_mutex);
+
+out_openerr:
+		/* Error paths that do not hold lli_och_mutex enter here. */
+		if (opendir_set != 0)
+			ll_stop_statahead(inode, lli->lli_opendir_key);
+		if (fd != NULL)
+			ll_file_data_put(fd);
+	} else {
+		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
+	}
+
+	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
+		ptlrpc_req_finished(it->d.lustre.it_data);
+		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
+	}
+
+	return rc;
+}
+
+/*
+ * Fill @obdo with the attributes of the objects described by @lsm.
+ *
+ * An asynchronous getattr is issued on @exp and waited for.  With @sync set
+ * the request additionally carries OBD_FL_SRVLOCK.  On success o_valid is
+ * reduced to the size, blocks, block size, timestamp and data version bits.
+ *
+ * Returns 0, -ENOMEM if no request set could be allocated, or the error of
+ * the getattr.
+ */
+static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
+			  struct obd_capa *capa, struct obdo *obdo,
+			  __u64 ioepoch, int sync)
+{
+	struct ptlrpc_request_set *rqset;
+	struct obd_info oinfo = { { { 0 } } };
+	int rc;
+	ENTRY;
+
+	LASSERT(lsm != NULL);
+
+	/* Describe what is being asked for. */
+	obdo->o_oi = lsm->lsm_oi;
+	obdo->o_mode = S_IFREG;
+	obdo->o_ioepoch = ioepoch;
+	obdo->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
+			OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
+			OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
+			OBD_MD_FLMTIME | OBD_MD_FLCTIME |
+			OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
+			OBD_MD_FLDATAVERSION;
+	if (sync) {
+		obdo->o_valid |= OBD_MD_FLFLAGS;
+		obdo->o_flags |= OBD_FL_SRVLOCK;
+	}
+
+	oinfo.oi_md = lsm;
+	oinfo.oi_oa = obdo;
+	oinfo.oi_capa = capa;
+
+	rqset = ptlrpc_prep_set();
+	if (rqset == NULL) {
+		CERROR("can't allocate ptlrpc set\n");
+		RETURN(-ENOMEM);
+	}
+
+	rc = obd_getattr_async(exp, &oinfo, rqset);
+	if (rc == 0)
+		rc = ptlrpc_set_wait(rqset);
+	ptlrpc_set_destroy(rqset);
+	if (rc != 0)
+		RETURN(rc);
+
+	/* Only these attributes are reported back as valid. */
+	obdo->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
+			  OBD_MD_FLATIME | OBD_MD_FLMTIME |
+			  OBD_MD_FLCTIME | OBD_MD_FLSIZE |
+			  OBD_MD_FLDATAVERSION);
+	RETURN(0);
+}
+
+/**
+  * Fetch the object attributes of @inode into @obdo and refresh the inode
+  * fields from them.
+  * If @sync != 0, perform the getattr under the server-side lock.
+  *
+  * Returns 0 or the error from ll_lsm_getattr().
+  */
+int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
+		     __u64 ioepoch, int sync)
+{
+	struct obd_capa *capa = ll_mdscapa_get(inode);
+	struct lov_stripe_md *lsm;
+	int rc;
+	ENTRY;
+
+	lsm = ccc_inode_lsm_get(inode);
+	rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
+			    capa, obdo, ioepoch, sync);
+	capa_put(capa);
+
+	if (rc == 0) {
+		struct ost_id *oi;
+
+		if (lsm)
+			oi = &lsm->lsm_oi;
+		else
+			oi = &obdo->o_oi;
+
+		obdo_refresh_inode(inode, obdo, obdo->o_valid);
+		CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
+		       " blksize %lu\n", POSTID(oi), i_size_read(inode),
+		       (unsigned long long)inode->i_blocks,
+		       (unsigned long)ll_inode_blksize(inode));
+	}
+
+	ccc_inode_lsm_put(inode, lsm);
+	RETURN(rc);
+}
+
+/*
+ * Merge the attributes cached in lli->lli_lvb with those reported by the
+ * cl_object of @inode and store the result in the inode.
+ *
+ * For each timestamp the later of the two values is kept; size and block
+ * count are taken from the cl_object attributes.  The whole update runs
+ * under the inode size lock.  If cl_object_attr_get() fails, only the
+ * timestamps from lli_lvb are written.
+ *
+ * Returns the result of cl_object_attr_get().
+ */
+int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct cl_object *obj = lli->lli_clob;
+	struct cl_attr *attr = ccc_env_thread_attr(env);
+	struct ost_lvb lvb;
+	int rc = 0;
+
+	ENTRY;
+
+	ll_inode_size_lock(inode);
+	/* merge timestamps the most recently obtained from mds with
+	   timestamps obtained from osts */
+	LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
+	LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
+	LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
+	inode_init_lvb(inode, &lvb);
+
+	cl_object_attr_lock(obj);
+	rc = cl_object_attr_get(env, obj, attr);
+	cl_object_attr_unlock(obj);
+
+	if (rc == 0) {
+		/* Keep whichever timestamp is more recent. */
+		if (lvb.lvb_atime < attr->cat_atime)
+			lvb.lvb_atime = attr->cat_atime;
+		if (lvb.lvb_ctime < attr->cat_ctime)
+			lvb.lvb_ctime = attr->cat_ctime;
+		if (lvb.lvb_mtime < attr->cat_mtime)
+			lvb.lvb_mtime = attr->cat_mtime;
+
+		CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
+				PFID(&lli->lli_fid), attr->cat_size);
+		cl_isize_write_nolock(inode, attr->cat_size);
+
+		inode->i_blocks = attr->cat_blocks;
+
+		LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
+		LTIME_S(inode->i_atime) = lvb.lvb_atime;
+		LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
+	}
+	ll_inode_size_unlock(inode);
+
+	RETURN(rc);
+}
+
+/*
+ * Query the attributes of the objects described by @lsm and copy size,
+ * block count and timestamps into @st.  @st is left untouched on error.
+ *
+ * Returns 0 or the error from ll_lsm_getattr().
+ */
+int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
+		     lstat_t *st)
+{
+	struct obdo obdo = { 0 };
+	int rc;
+
+	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
+	if (rc != 0)
+		return rc;
+
+	st->st_size   = obdo.o_size;
+	st->st_blocks = obdo.o_blocks;
+	st->st_mtime  = obdo.o_mtime;
+	st->st_atime  = obdo.o_atime;
+	st->st_ctime  = obdo.o_ctime;
+
+	return 0;
+}
+
+/*
+ * Set up the file-dependent fields of @io for a read (@write == 0) or a
+ * write (@write != 0) on @file: non-blocking flag, append/sync flags for
+ * writes, the target cl_object and the lock requirement.
+ *
+ * Lock requirement: never for "nolock" files (which also disables server
+ * side locking), mandatory for O_APPEND files, "maybe" otherwise.
+ */
+void ll_io_init(struct cl_io *io, const struct file *file, int write)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+
+	io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
+	if (write) {
+		io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
+		io->u.ci_wr.wr_sync = (file->f_flags & (O_SYNC | O_DIRECT)) ||
+				      IS_SYNC(inode);
+	}
+
+	io->ci_obj = ll_i2info(inode)->lli_clob;
+
+	if (ll_file_nolock(file)) {
+		io->ci_lockreq = CILR_NEVER;
+		io->ci_no_srvlock = 1;
+	} else if (file->f_flags & O_APPEND) {
+		io->ci_lockreq = CILR_MANDATORY;
+	} else {
+		io->ci_lockreq = CILR_MAYBE;
+	}
+}
+
+/*
+ * Common read/write/splice engine: run a cl_io of type @iot on @file for
+ * @count bytes starting at *@ppos.
+ *
+ * @args selects the IO flavour (normal iovec IO, sendfile or splice) and
+ * carries its parameters.  Normal writes are serialised with
+ * lli_write_mutex unless the descriptor holds a group lock; normal reads
+ * take lli_trunc_sem for reading.  If the IO transferred nothing and asks
+ * to be restarted, it is retried from the top.
+ *
+ * On return *@ppos is advanced when any bytes were transferred, the
+ * read/write byte statistics are updated, and for writes
+ * fd->fd_write_failed records whether the last write failed.
+ *
+ * Returns the number of bytes transferred, or a negative errno.
+ */
+static ssize_t
+ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
+		   struct file *file, enum cl_io_type iot,
+		   loff_t *ppos, size_t count)
+{
+	struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
+	struct ll_file_data  *fd  = LUSTRE_FPRIVATE(file);
+	struct cl_io	 *io;
+	ssize_t	       result;
+	ENTRY;
+
+restart:
+	io = ccc_env_thread_io(env);
+	ll_io_init(io, file, iot == CIT_WRITE);
+
+	if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
+		struct vvp_io *vio = vvp_env_io(env);
+		struct ccc_io *cio = ccc_env_io(env);
+		int write_mutex_locked = 0;
+
+		cio->cui_fd  = LUSTRE_FPRIVATE(file);
+		vio->cui_io_subtype = args->via_io_subtype;
+
+		/* Hand the flavour-specific arguments over to the IO. */
+		switch (vio->cui_io_subtype) {
+		case IO_NORMAL:
+			cio->cui_iov = args->u.normal.via_iov;
+			cio->cui_nrsegs = args->u.normal.via_nrsegs;
+			cio->cui_tot_nrsegs = cio->cui_nrsegs;
+			cio->cui_iocb = args->u.normal.via_iocb;
+			if ((iot == CIT_WRITE) &&
+			    !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
+				if (mutex_lock_interruptible(&lli->
+							       lli_write_mutex))
+					GOTO(out, result = -ERESTARTSYS);
+				write_mutex_locked = 1;
+			} else if (iot == CIT_READ) {
+				down_read(&lli->lli_trunc_sem);
+			}
+			break;
+		case IO_SENDFILE:
+			vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
+			vio->u.sendfile.cui_target = args->u.sendfile.via_target;
+			break;
+		case IO_SPLICE:
+			vio->u.splice.cui_pipe = args->u.splice.via_pipe;
+			vio->u.splice.cui_flags = args->u.splice.via_flags;
+			break;
+		default:
+			CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
+			LBUG();
+		}
+		result = cl_io_loop(env, io);
+		/* Release whichever lock was taken in the IO_NORMAL case. */
+		if (write_mutex_locked)
+			mutex_unlock(&lli->lli_write_mutex);
+		else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
+			up_read(&lli->lli_trunc_sem);
+	} else {
+		/* cl_io_rw_init() handled IO */
+		result = io->ci_result;
+	}
+
+	/* Any transferred bytes take precedence over an error code. */
+	if (io->ci_nob > 0) {
+		result = io->ci_nob;
+		*ppos = io->u.ci_wr.wr.crw_pos;
+	}
+	GOTO(out, result);
+out:
+	cl_io_fini(env, io);
+	/* If any bit been read/written (result != 0), we just return
+	 * short read/write instead of restart io. */
+	if (result == 0 && io->ci_need_restart) {
+		CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
+		       iot == CIT_READ ? "read" : "write",
+		       file->f_dentry->d_name.name, *ppos, count);
+		LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
+		goto restart;
+	}
+
+	if (iot == CIT_READ) {
+		if (result >= 0)
+			ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
+					   LPROC_LL_READ_BYTES, result);
+	} else if (iot == CIT_WRITE) {
+		if (result >= 0) {
+			ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
+					   LPROC_LL_WRITE_BYTES, result);
+			fd->fd_write_failed = false;
+		} else if (result != -ERESTARTSYS) {
+			/* An interrupted write is not recorded as failed. */
+			fd->fd_write_failed = true;
+		}
+	}
+
+	return result;
+}
+
+
+/*
+ * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
+ *
+ * Validate the @iov array of *@nr_segs segments and compute the total
+ * number of bytes it describes.
+ *
+ * If a segment other than the first fails access_ok(), *@nr_segs is
+ * truncated to the segments before it and their cumulative length is
+ * returned in *@count.
+ *
+ * Returns 0 on success, -EINVAL if a segment length or the running total
+ * is negative when viewed as ssize_t, or -EFAULT if the first segment is
+ * not accessible.  *@count is only written when 0 is returned.
+ */
+static int ll_file_get_iov_count(const struct iovec *iov,
+				 unsigned long *nr_segs, size_t *count)
+{
+	size_t cnt = 0;
+	unsigned long seg;
+
+	for (seg = 0; seg < *nr_segs; seg++) {
+		const struct iovec *iv = &iov[seg];
+
+		/*
+		 * If any segment has a negative length, or the cumulative
+		 * length ever wraps negative then return -EINVAL.
+		 */
+		cnt += iv->iov_len;
+		if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
+			return -EINVAL;
+		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
+			continue;
+		/* A bad first segment leaves nothing usable at all. */
+		if (seg == 0)
+			return -EFAULT;
+		*nr_segs = seg;
+		cnt -= iv->iov_len;   /* This segment is no good */
+		break;
+	}
+	*count = cnt;
+	return 0;
+}
+
+/*
+ * aio_read() handler: validate the iovec, then run a CIT_READ IO on the
+ * file behind @iocb starting at iocb->ki_pos.  The @pos argument itself is
+ * not consulted.
+ *
+ * Returns the number of bytes read or a negative errno.
+ */
+static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos)
+{
+	struct vvp_io_args *io_args;
+	struct lu_env *env;
+	size_t nbytes;
+	ssize_t rc;
+	int refcheck;
+	ENTRY;
+
+	rc = ll_file_get_iov_count(iov, &nr_segs, &nbytes);
+	if (rc != 0)
+		RETURN(rc);
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	io_args = vvp_env_args(env, IO_NORMAL);
+	io_args->u.normal.via_iocb = iocb;
+	io_args->u.normal.via_iov = (struct iovec *)iov;
+	io_args->u.normal.via_nrsegs = nr_segs;
+
+	rc = ll_file_io_generic(env, io_args, iocb->ki_filp, CIT_READ,
+				&iocb->ki_pos, nbytes);
+
+	cl_env_put(env, &refcheck);
+	RETURN(rc);
+}
+
+/*
+ * read() handler: wrap @buf/@count in the per-environment single-segment
+ * iovec and synchronous kiocb, then forward to ll_file_aio_read().
+ * *@ppos is updated from the kiocb position afterwards.
+ *
+ * Returns the number of bytes read or a negative errno.
+ */
+static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
+			    loff_t *ppos)
+{
+	struct lu_env *env;
+	struct iovec *seg;
+	struct kiocb *iocb;
+	ssize_t rc;
+	int refcheck;
+	ENTRY;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	seg = &vvp_env_info(env)->vti_local_iov;
+	iocb = &vvp_env_info(env)->vti_kiocb;
+
+	seg->iov_base = (void __user *)buf;
+	seg->iov_len = count;
+
+	init_sync_kiocb(iocb, file);
+	iocb->ki_pos = *ppos;
+	iocb->ki_left = count;
+
+	rc = ll_file_aio_read(iocb, seg, 1, iocb->ki_pos);
+	*ppos = iocb->ki_pos;
+
+	cl_env_put(env, &refcheck);
+	RETURN(rc);
+}
+
+/*
+ * Write to a file (through the page cache).
+ *
+ * aio_write() handler: validate the iovec, then run a CIT_WRITE IO on the
+ * file behind @iocb starting at iocb->ki_pos.  The @pos argument itself is
+ * not consulted.
+ *
+ * Returns the number of bytes written or a negative errno.
+ */
+static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+				 unsigned long nr_segs, loff_t pos)
+{
+	struct vvp_io_args *io_args;
+	struct lu_env *env;
+	size_t nbytes;
+	ssize_t rc;
+	int refcheck;
+	ENTRY;
+
+	rc = ll_file_get_iov_count(iov, &nr_segs, &nbytes);
+	if (rc != 0)
+		RETURN(rc);
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	io_args = vvp_env_args(env, IO_NORMAL);
+	io_args->u.normal.via_iocb = iocb;
+	io_args->u.normal.via_iov = (struct iovec *)iov;
+	io_args->u.normal.via_nrsegs = nr_segs;
+
+	rc = ll_file_io_generic(env, io_args, iocb->ki_filp, CIT_WRITE,
+				&iocb->ki_pos, nbytes);
+
+	cl_env_put(env, &refcheck);
+	RETURN(rc);
+}
+
+/*
+ * write() handler: wrap @buf/@count in the per-environment single-segment
+ * iovec and synchronous kiocb, then forward to ll_file_aio_write().
+ * *@ppos is updated from the kiocb position afterwards.
+ *
+ * Returns the number of bytes written or a negative errno.
+ */
+static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
+			     loff_t *ppos)
+{
+	struct lu_env *env;
+	struct iovec *seg;
+	struct kiocb *iocb;
+	ssize_t rc;
+	int refcheck;
+	ENTRY;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	seg = &vvp_env_info(env)->vti_local_iov;
+	iocb = &vvp_env_info(env)->vti_kiocb;
+
+	seg->iov_base = (void __user *)buf;
+	seg->iov_len = count;
+
+	init_sync_kiocb(iocb, file);
+	iocb->ki_pos = *ppos;
+	iocb->ki_left = count;
+
+	rc = ll_file_aio_write(iocb, seg, 1, iocb->ki_pos);
+	*ppos = iocb->ki_pos;
+
+	cl_env_put(env, &refcheck);
+	RETURN(rc);
+}
+
+
+
+/*
+ * Send file content (through pagecache) somewhere with helper.
+ *
+ * Records the destination pipe and the splice flags in the IO_SPLICE
+ * argument block of the environment and runs a CIT_READ of \a count bytes
+ * at \a *ppos through ll_file_io_generic().
+ */
+static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
+				   struct pipe_inode_info *pipe, size_t count,
+				   unsigned int flags)
+{
+	struct vvp_io_args *io_args;
+	struct lu_env      *env;
+	ssize_t		    rc;
+	int		    refcheck;
+	ENTRY;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	io_args = vvp_env_args(env, IO_SPLICE);
+	io_args->u.splice.via_flags = flags;
+	io_args->u.splice.via_pipe = pipe;
+
+	rc = ll_file_io_generic(env, io_args, in_file, CIT_READ, ppos, count);
+
+	cl_env_put(env, &refcheck);
+	RETURN(rc);
+}
+
+/*
+ * Issue an obd_create() with OBD_FL_RECREATE_OBJS set for the object
+ * identified by \a oi, passing \a ost_idx down in the obdo's o_nlink field.
+ *
+ * obd_create() is given a scratch copy (lsm2) of the inode's stripe
+ * metadata rather than the inode's own lsm; the copy is freed again before
+ * returning.
+ *
+ * \retval -ENOMEM  the obdo or the scratch lsm could not be allocated
+ * \retval -ENOENT  the inode has no stripe metadata
+ * \retval other    result of obd_create()
+ */
+static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
+			   obd_count ost_idx)
+{
+	struct obd_export *exp = ll_i2dtexp(inode);
+	struct obd_trans_info oti = { 0 };
+	struct obdo *oa = NULL;
+	int lsm_size;
+	int rc = 0;
+	struct lov_stripe_md *lsm = NULL, *lsm2;
+	ENTRY;
+
+	OBDO_ALLOC(oa);
+	if (oa == NULL)
+		RETURN(-ENOMEM);
+
+	lsm = ccc_inode_lsm_get(inode);
+	if (lsm == NULL)
+		GOTO(out, rc = -ENOENT);
+
+	/* size of the lsm header plus one lov_oinfo-sized slot per stripe */
+	lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
+		   (lsm->lsm_stripe_count));
+
+	OBD_ALLOC_LARGE(lsm2, lsm_size);
+	if (lsm2 == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	oa->o_oi = *oi;
+	/* o_nlink is used here to carry the target OST index */
+	oa->o_nlink = ost_idx;
+	oa->o_flags |= OBD_FL_RECREATE_OBJS;
+	oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
+	obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
+				   OBD_MD_FLMTIME | OBD_MD_FLCTIME);
+	obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
+	memcpy(lsm2, lsm, lsm_size);
+	/* the create runs with the inode size lock held */
+	ll_inode_size_lock(inode);
+	rc = obd_create(NULL, exp, oa, &lsm2, &oti);
+	ll_inode_size_unlock(inode);
+
+	OBD_FREE_LARGE(lsm2, lsm_size);
+	GOTO(out, rc);
+out:
+	/* NOTE(review): reached with lsm == NULL on the -ENOENT path;
+	 * ccc_inode_lsm_put() presumably tolerates NULL - confirm. */
+	ccc_inode_lsm_put(inode, lsm);
+	OBDO_FREE(oa);
+	return rc;
+}
+
+/*
+ * Recreate an object described by a struct ll_recreate_obj read from user
+ * space at \a arg.  Requires CFS_CAP_SYS_ADMIN.
+ *
+ * The object id is taken from lrc_id (after ostid_set_seq_mdt0() has
+ * initialised the sequence) and the OST index from lrc_ost_idx; the actual
+ * work is done by ll_lov_recreate().
+ */
+static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
+{
+	struct ost_id		oi;
+	struct ll_recreate_obj	ucreat;
+	int			rc;
+	ENTRY;
+
+	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+		RETURN(-EPERM);
+
+	if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
+			   sizeof(ucreat)) != 0)
+		RETURN(-EFAULT);
+
+	ostid_set_seq_mdt0(&oi);
+	ostid_set_id(&oi, ucreat.lrc_id);
+
+	rc = ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
+	RETURN(rc);
+}
+
+/*
+ * Recreate an object identified by a struct lu_fid read from user space at
+ * \a arg.  Requires CFS_CAP_SYS_ADMIN.
+ *
+ * The FID is converted to an ost_id with fid_to_ostid(); the OST index is
+ * taken from bits 16..31 of the FID sequence.  The actual work is done by
+ * ll_lov_recreate().
+ */
+static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
+{
+	struct ost_id	oi;
+	struct lu_fid	fid;
+	obd_count	ost_idx;
+	int		rc;
+	ENTRY;
+
+	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+		RETURN(-EPERM);
+
+	if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)) != 0)
+		RETURN(-EFAULT);
+
+	fid_to_ostid(&fid, &oi);
+	ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
+
+	rc = ll_lov_recreate(inode, &oi, ost_idx);
+	RETURN(rc);
+}
+
+/*
+ * Apply the layout described by \a lum to a file that has no stripe
+ * metadata yet.
+ *
+ * An IT_OPEN intent carrying \a flags is sent with ll_intent_file_open()
+ * together with \a lum / \a lum_size, and the open handle obtained that way
+ * is closed again with ll_release_openhandle().  The whole exchange runs
+ * with the inode size lock held.
+ *
+ * \retval -EEXIST  the inode already has stripe metadata
+ * \retval <0       error from ll_intent_file_open() or the intent status
+ * \retval 0        success
+ */
+int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
+			     int flags, struct lov_user_md *lum, int lum_size)
+{
+	struct lov_stripe_md *lsm = NULL;
+	struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
+	int rc = 0;
+	ENTRY;
+
+	lsm = ccc_inode_lsm_get(inode);
+	if (lsm != NULL) {
+		ccc_inode_lsm_put(inode, lsm);
+		CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
+		       inode->i_ino);
+		RETURN(-EEXIST);
+	}
+
+	ll_inode_size_lock(inode);
+	rc = ll_intent_file_open(file, lum, lum_size, &oit);
+	if (rc)
+		GOTO(out, rc);
+	/* the open itself went through; now look at the intent's status */
+	rc = oit.d.lustre.it_status;
+	if (rc < 0)
+		GOTO(out_req_free, rc);
+
+	ll_release_openhandle(file->f_dentry, &oit);
+
+ out:
+	ll_inode_size_unlock(inode);
+	ll_intent_release(&oit);
+	/* lsm is always NULL here: the non-NULL case returned -EEXIST above */
+	ccc_inode_lsm_put(inode, lsm);
+	RETURN(rc);
+out_req_free:
+	/* drop the request attached to the intent, then run the common
+	 * cleanup above */
+	ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
+	goto out;
+}
+
+/*
+ * Fetch the striping EA of the entry \a filename looked up under \a inode
+ * by issuing md_getattr_name() with OBD_MD_FLEASIZE | OBD_MD_FLDIREA.
+ *
+ * \param lmmp      [out] EA found in the reply, or NULL; it points into the
+ *                        reply buffer of \a *request
+ * \param lmm_size  [out] EA size from the reply body (or the maximum MD size
+ *                        if the getattr failed)
+ * \param request   [out] the request; it is handed back on every path that
+ *                        reaches the "out" label, including errors.
+ *                        NOTE(review): the caller presumably has to release
+ *                        it - confirm against callers.
+ *
+ * \retval -ENODATA  the reply carries no EA
+ * \retval -EPROTO   the EA magic is neither LOV_MAGIC_V1 nor LOV_MAGIC_V3
+ * \retval <0        other errors from the helpers called here
+ *
+ * Note that the early failures (ll_get_max_mdsize(), ll_prep_md_op_data())
+ * return without touching the output parameters.
+ */
+int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
+			     struct lov_mds_md **lmmp, int *lmm_size,
+			     struct ptlrpc_request **request)
+{
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct mdt_body  *body;
+	struct lov_mds_md *lmm = NULL;
+	struct ptlrpc_request *req = NULL;
+	struct md_op_data *op_data;
+	int rc, lmmsize;
+
+	rc = ll_get_max_mdsize(sbi, &lmmsize);
+	if (rc)
+		RETURN(rc);
+
+	op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
+				     strlen(filename), lmmsize,
+				     LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+
+	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
+	rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
+	ll_finish_md_op_data(op_data);
+	if (rc < 0) {
+		CDEBUG(D_INFO, "md_getattr_name failed "
+		       "on %s: rc %d\n", filename, rc);
+		GOTO(out, rc);
+	}
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	LASSERT(body != NULL); /* checked by mdc_getattr_name */
+
+	/* from here on lmmsize is the EA size reported by the server */
+	lmmsize = body->eadatasize;
+
+	if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
+			lmmsize == 0) {
+		GOTO(out, rc = -ENODATA);
+	}
+
+	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
+	LASSERT(lmm != NULL);
+
+	if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
+	    (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
+		GOTO(out, rc = -EPROTO);
+	}
+
+	/*
+	 * This is coming from the MDS, so is probably in
+	 * little endian.  We convert it to host endian before
+	 * passing it to userspace.
+	 */
+	if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
+		/* If this function was called for a directory, avoid
+		 * swabbing lsm objects that do not exist: the per-object
+		 * array is only swabbed for regular files. */
+		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
+			lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
+			if (S_ISREG(body->mode))
+				lustre_swab_lov_user_md_objects(
+				 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
+				 ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
+		} else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
+			lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
+			if (S_ISREG(body->mode))
+				lustre_swab_lov_user_md_objects(
+				 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
+				 ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
+		}
+	}
+
+out:
+	/* outputs are filled in on both success and failure */
+	*lmmp = lmm;
+	*lmm_size = lmmsize;
+	*request = req;
+	return rc;
+}
+
+/*
+ * Set the striping EA from a user-supplied struct lov_user_md followed by
+ * one struct lov_user_ost_data.  Requires CFS_CAP_SYS_ADMIN.
+ *
+ * The user data is copied into a temporary kernel buffer and applied with
+ * ll_lov_setstripe_ea_info() using MDS_OPEN_HAS_OBJS | FMODE_WRITE as the
+ * open flags.
+ *
+ * \retval -EPERM / -ENOMEM / -EFAULT on the respective local failure,
+ *         otherwise the result of ll_lov_setstripe_ea_info()
+ */
+static int ll_lov_setea(struct inode *inode, struct file *file,
+			    unsigned long arg)
+{
+	struct lov_user_md	*lump;
+	int			 lum_size;
+	int			 flags;
+	int			 rc;
+	ENTRY;
+
+	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+		RETURN(-EPERM);
+
+	flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
+	lum_size = sizeof(struct lov_user_md) +
+		   sizeof(struct lov_user_ost_data);
+
+	OBD_ALLOC_LARGE(lump, lum_size);
+	if (lump == NULL)
+		RETURN(-ENOMEM);
+
+	if (copy_from_user(lump, (struct lov_user_md  *)arg, lum_size) != 0)
+		rc = -EFAULT;
+	else
+		rc = ll_lov_setstripe_ea_info(inode, file, flags, lump,
+					      lum_size);
+
+	/* the temporary copy is released on success and failure alike */
+	OBD_FREE_LARGE(lump, lum_size);
+	RETURN(rc);
+}
+
+/*
+ * Set the striping of a file from a lov_user_md supplied by user space.
+ *
+ * The structure is first read as v1 (the smaller variant); if its magic is
+ * LOV_USER_MAGIC_V3 it is read again in full as v3.  The layout is applied
+ * with ll_lov_setstripe_ea_info() using FMODE_WRITE.  On success the layout
+ * is refreshed and LL_IOC_LOV_GETSTRIPE is issued against the data export
+ * with the same user pointer, presumably to hand the resulting layout back
+ * to the caller.
+ *
+ * \retval -EFAULT  the user structure could not be read
+ * \retval other    result of ll_lov_setstripe_ea_info(), or of the
+ *                  follow-up obd_iocontrol() when the former succeeded
+ */
+static int ll_lov_setstripe(struct inode *inode, struct file *file,
+			    unsigned long arg)
+{
+	struct lov_user_md_v3	 lumv3;
+	/* lumv1 aliases the v3 buffer, so both variants share storage */
+	struct lov_user_md_v1	*lumv1 = (struct lov_user_md_v1 *)&lumv3;
+	struct lov_user_md_v1	*lumv1p = (struct lov_user_md_v1 *)arg;
+	struct lov_user_md_v3	*lumv3p = (struct lov_user_md_v3 *)arg;
+	int			 lum_size, rc;
+	int			 flags = FMODE_WRITE;
+	ENTRY;
+
+	/* first try with v1 which is smaller than v3 */
+	lum_size = sizeof(struct lov_user_md_v1);
+	if (copy_from_user(lumv1, lumv1p, lum_size))
+		RETURN(-EFAULT);
+
+	if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
+		lum_size = sizeof(struct lov_user_md_v3);
+		if (copy_from_user(&lumv3, lumv3p, lum_size))
+			RETURN(-EFAULT);
+	}
+
+	rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
+	if (rc == 0) {
+		struct lov_stripe_md *lsm;
+		__u32 gen;
+
+		/* zero the stripe count in the user buffer before the
+		 * GETSTRIPE call below.
+		 * NOTE(review): the put_user() result is ignored. */
+		put_user(0, &lumv1p->lmm_stripe_count);
+
+		/* NOTE(review): the ll_layout_refresh() result is ignored
+		 * and gen is not used afterwards. */
+		ll_layout_refresh(inode, &gen);
+		lsm = ccc_inode_lsm_get(inode);
+		rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
+				   0, lsm, (void *)arg);
+		ccc_inode_lsm_put(inode, lsm);
+	}
+	RETURN(rc);
+}
+
+/*
+ * Handle a get-stripe request: if the inode has stripe metadata, forward
+ * LL_IOC_LOV_GETSTRIPE with the user pointer \a arg to the data export.
+ *
+ * \retval -ENODATA  the inode has no stripe metadata
+ * \retval other     result of obd_iocontrol()
+ */
+static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
+{
+	struct lov_stripe_md *lsm;
+	int rc;
+	ENTRY;
+
+	lsm = ccc_inode_lsm_get(inode);
+	if (lsm == NULL)
+		rc = -ENODATA;
+	else
+		rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
+				   lsm, (void *)arg);
+
+	/* called for the NULL case as well, exactly like the lookup above */
+	ccc_inode_lsm_put(inode, lsm);
+	RETURN(rc);
+}
+
+/*
+ * Take a group lock with group id \a arg on behalf of the open file
+ * \a file and record it in the file's private data.
+ *
+ * The lock is requested with cl_get_grouplock(); the request is made
+ * non-blocking when the file was opened with O_NONBLOCK.
+ *
+ * \retval -EOPNOTSUPP  ll_file_nolock() is true for this file
+ * \retval -EINVAL      this file already holds a group lock (detected
+ *                      either up front or after racing with another thread)
+ * \retval other        error from cl_get_grouplock()
+ * \retval 0            lock obtained
+ */
+int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
+{
+	struct ll_inode_info   *lli = ll_i2info(inode);
+	struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
+	struct ccc_grouplock    grouplock;
+	int		     rc;
+	ENTRY;
+
+	if (ll_file_nolock(file))
+		RETURN(-EOPNOTSUPP);
+
+	spin_lock(&lli->lli_lock);
+	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
+		CWARN("group lock already existed with gid %lu\n",
+		      fd->fd_grouplock.cg_gid);
+		spin_unlock(&lli->lli_lock);
+		RETURN(-EINVAL);
+	}
+	LASSERT(fd->fd_grouplock.cg_lock == NULL);
+	spin_unlock(&lli->lli_lock);
+
+	/* lli_lock is not held while the group lock is being acquired */
+	rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
+			      arg, (file->f_flags & O_NONBLOCK), &grouplock);
+	if (rc)
+		RETURN(rc);
+
+	/* re-check under the spinlock: another thread may have installed a
+	 * group lock on this file while lli_lock was dropped */
+	spin_lock(&lli->lli_lock);
+	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
+		spin_unlock(&lli->lli_lock);
+		CERROR("another thread just won the race\n");
+		cl_put_grouplock(&grouplock);
+		RETURN(-EINVAL);
+	}
+
+	fd->fd_flags |= LL_FILE_GROUP_LOCKED;
+	fd->fd_grouplock = grouplock;
+	spin_unlock(&lli->lli_lock);
+
+	CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
+	RETURN(0);
+}
+
+/*
+ * Release the group lock recorded for the open file \a file, provided its
+ * group id matches \a arg.
+ *
+ * \retval -EINVAL  the file holds no group lock, or its gid differs from
+ *                  \a arg
+ * \retval 0        lock released
+ */
+int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
+{
+	struct ll_inode_info   *lli = ll_i2info(inode);
+	struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
+	struct ccc_grouplock    grouplock;
+	ENTRY;
+
+	spin_lock(&lli->lli_lock);
+	if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
+		spin_unlock(&lli->lli_lock);
+		CWARN("no group lock held\n");
+		RETURN(-EINVAL);
+	}
+	LASSERT(fd->fd_grouplock.cg_lock != NULL);
+
+	if (fd->fd_grouplock.cg_gid != arg) {
+		CWARN("group lock %lu doesn't match current id %lu\n",
+		       arg, fd->fd_grouplock.cg_gid);
+		spin_unlock(&lli->lli_lock);
+		RETURN(-EINVAL);
+	}
+
+	/* detach the lock from the file under the spinlock and release the
+	 * local copy only after the spinlock has been dropped */
+	grouplock = fd->fd_grouplock;
+	memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
+	fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
+	spin_unlock(&lli->lli_lock);
+
+	cl_put_grouplock(&grouplock);
+	CDEBUG(D_INFO, "group lock %lu released\n", arg);
+	RETURN(0);
+}
+
+/**
+ * Close inode open handle
+ *
+ * Nothing is done for the root dentry of the superblock, nor when the
+ * intent does not carry DISP_OPEN_OPEN; both cases return 0 without
+ * touching the intent.  Otherwise an obd_client_handle is allocated, filled
+ * from the intent and closed with ll_close_inode_openhandle().
+ *
+ * \param dentry [in]     dentry which contains the inode
+ * \param it     [in,out] intent which contains open info and result
+ *
+ * \retval 0     success
+ * \retval <0    failure
+ */
+int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
+{
+	struct inode *inode = dentry->d_inode;
+	struct obd_client_handle *och;
+	int rc;
+	ENTRY;
+
+	LASSERT(inode);
+
+	/* Root ? Do nothing. */
+	if (dentry->d_inode->i_sb->s_root == dentry)
+		RETURN(0);
+
+	/* No open handle to close? Move away */
+	if (!it_disposition(it, DISP_OPEN_OPEN))
+		RETURN(0);
+
+	LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
+
+	OBD_ALLOC(och, sizeof(*och));
+	if (!och)
+		GOTO(out, rc = -ENOMEM);
+
+	ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
+		    ll_i2info(inode), it, och);
+
+	/* och is not freed in this function.
+	 * NOTE(review): presumably ll_close_inode_openhandle() takes
+	 * ownership of it - confirm. */
+	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
+				       inode, och);
+ out:
+	/* this one is in place of ll_file_open */
+	if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
+		ptlrpc_req_finished(it->d.lustre.it_data);
+		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
+	}
+	RETURN(rc);
+}
+
+/**
+ * Get size for inode for which FIEMAP mapping is requested.
+ * Make the FIEMAP get_info call and returns the result.
+ *
+ * \param inode     [in]     inode to map
+ * \param fiemap    [in,out] request on entry; also used as the value buffer
+ *                           handed to obd_get_info()
+ * \param num_bytes [in]     size passed to obd_get_info() as the value
+ *                           length
+ *
+ * \retval -EBADR       unsupported flags were requested; fm_flags is
+ *                      reduced to the offending bits
+ * \retval -ENOENT      the inode has no stripe metadata
+ * \retval -EOPNOTSUPP  more than one stripe but FIEMAP_FLAG_DEVICE_ORDER
+ *                      was not requested
+ * \retval other        result of filemap_fdatawrite() or obd_get_info()
+ */
+int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
+	      int num_bytes)
+{
+	struct obd_export *exp = ll_i2dtexp(inode);
+	struct lov_stripe_md *lsm = NULL;
+	struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
+	int vallen = num_bytes;
+	int rc;
+	ENTRY;
+
+	/* Checks for fiemap flags */
+	if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
+		/* leave only the unsupported bits set for the caller */
+		fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
+		return -EBADR;
+	}
+
+	/* Check for FIEMAP_FLAG_SYNC */
+	if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
+		rc = filemap_fdatawrite(inode->i_mapping);
+		if (rc)
+			return rc;
+	}
+
+	lsm = ccc_inode_lsm_get(inode);
+	if (lsm == NULL)
+		return -ENOENT;
+
+	/* If the stripe_count > 1 and the application does not understand
+	 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
+	 */
+	if (lsm->lsm_stripe_count > 1 &&
+	    !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
+		GOTO(out, rc = -EOPNOTSUPP);
+
+	fm_key.oa.o_oi = lsm->lsm_oi;
+	fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
+
+	obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
+	obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
+	/* If filesize is 0, then there would be no objects for mapping */
+	if (fm_key.oa.o_size == 0) {
+		fiemap->fm_mapped_extents = 0;
+		GOTO(out, rc = 0);
+	}
+
+	/* only the fixed-size header of the request goes into the key */
+	memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
+
+	rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
+			  fiemap, lsm);
+	if (rc)
+		CERROR("obd_get_info failed: rc = %d\n", rc);
+
+out:
+	ccc_inode_lsm_put(inode, lsm);
+	RETURN(rc);
+}
+
+/*
+ * Resolve a FID to a path on behalf of user space.
+ *
+ * \a arg points to a user struct getinfo_fid2path whose gf_pathlen gives
+ * the size of the path buffer that follows the header.  The header is
+ * copied into a kernel buffer large enough for header plus path, the
+ * OBD_IOC_FID2PATH iocontrol is run against the metadata export, and the
+ * whole buffer is copied back to user space on success.
+ *
+ * The caller needs CFS_CAP_DAC_READ_SEARCH unless the superblock has
+ * LL_SBI_USER_FID2PATH set.
+ *
+ * \retval -EPERM   caller is not allowed to use fid2path
+ * \retval -ENOMEM  a kernel buffer could not be allocated
+ * \retval -EFAULT  user memory could not be read or written
+ * \retval -EINVAL  gf_pathlen is larger than PATH_MAX
+ * \retval other    result of obd_iocontrol()
+ */
+int ll_fid2path(struct inode *inode, void *arg)
+{
+	struct obd_export	*exp = ll_i2mdexp(inode);
+	struct getinfo_fid2path	*gfout, *gfin;
+	size_t			 pathlen;
+	int			 outsize, rc;
+	ENTRY;
+
+	if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
+	    !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
+		RETURN(-EPERM);
+
+	/* Need to get the buflen */
+	OBD_ALLOC_PTR(gfin);
+	if (gfin == NULL)
+		RETURN(-ENOMEM);
+	if (copy_from_user(gfin, arg, sizeof(*gfin))) {
+		OBD_FREE_PTR(gfin);
+		RETURN(-EFAULT);
+	}
+
+	/* gf_pathlen comes straight from user space: bound it before it is
+	 * added to the header size, otherwise the int outsize can overflow
+	 * and the output buffer ends up smaller than the path written
+	 * into it. */
+	pathlen = gfin->gf_pathlen;
+	if (pathlen > PATH_MAX) {
+		OBD_FREE_PTR(gfin);
+		RETURN(-EINVAL);
+	}
+
+	outsize = sizeof(*gfout) + pathlen;
+	OBD_ALLOC(gfout, outsize);
+	if (gfout == NULL) {
+		OBD_FREE_PTR(gfin);
+		RETURN(-ENOMEM);
+	}
+	memcpy(gfout, gfin, sizeof(*gfout));
+	OBD_FREE_PTR(gfin);
+
+	/* Call mdc_iocontrol */
+	rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
+	if (rc)
+		GOTO(gf_free, rc);
+
+	if (copy_to_user(arg, gfout, outsize))
+		rc = -EFAULT;
+
+gf_free:
+	OBD_FREE(gfout, outsize);
+	RETURN(rc);
+}
+
+/*
+ * FIEMAP ioctl handler.
+ *
+ * Reads fm_extent_count from the user structure at \a arg, allocates a
+ * kernel buffer for the header plus that many extents, copies in the header
+ * (and the first extent, if any), runs ll_do_fiemap() and copies the header
+ * plus the mapped extents back to user space.
+ *
+ * \retval -EFAULT  user memory could not be read or written
+ * \retval -EINVAL  fm_extent_count is so large that the buffer size would
+ *                  not fit the int length taken by ll_do_fiemap()
+ * \retval -ENOMEM  the kernel buffer could not be allocated
+ * \retval other    result of ll_do_fiemap()
+ */
+static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
+{
+	struct ll_user_fiemap *fiemap_s;
+	size_t num_bytes, ret_bytes;
+	unsigned int extent_count;
+	int rc = 0;
+
+	/* Get the extent count so we can calculate the size of
+	 * required fiemap buffer */
+	if (get_user(extent_count,
+	    &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
+		RETURN(-EFAULT);
+
+	/* extent_count is user controlled.  Refuse values for which the
+	 * buffer size computed below would wrap, or would no longer fit the
+	 * int num_bytes parameter of ll_do_fiemap(). */
+	if (extent_count >=
+	    (INT_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
+		RETURN(-EINVAL);
+
+	num_bytes = sizeof(*fiemap_s) + (extent_count *
+					 sizeof(struct ll_fiemap_extent));
+
+	OBD_ALLOC_LARGE(fiemap_s, num_bytes);
+	if (fiemap_s == NULL)
+		RETURN(-ENOMEM);
+
+	/* get the fiemap value */
+	if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
+			   sizeof(*fiemap_s)))
+		GOTO(error, rc = -EFAULT);
+
+	/* If fm_extent_count is non-zero, read the first extent since
+	 * it is used to calculate end_offset and device from previous
+	 * fiemap call. */
+	if (extent_count) {
+		if (copy_from_user(&fiemap_s->fm_extents[0],
+		    (char __user *)arg + sizeof(*fiemap_s),
+		    sizeof(struct ll_fiemap_extent)))
+			GOTO(error, rc = -EFAULT);
+	}
+
+	rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
+	if (rc)
+		GOTO(error, rc);
+
+	ret_bytes = sizeof(struct ll_user_fiemap);
+
+	if (extent_count != 0)
+		ret_bytes += (fiemap_s->fm_mapped_extents *
+				 sizeof(struct ll_fiemap_extent));
+
+	/* fm_mapped_extents was filled in by the lower layers; never copy
+	 * out more than was allocated, whatever count came back */
+	if (ret_bytes > num_bytes)
+		ret_bytes = num_bytes;
+
+	if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
+		rc = -EFAULT;
+
+error:
+	OBD_FREE_LARGE(fiemap_s, num_bytes);
+	RETURN(rc);
+}
+
+/*
+ * Read the data_version for inode.
+ *
+ * This value is computed using stripe object version on OST.
+ * Version is computed using server side locking.
+ *
+ * An inode without stripe metadata is reported as version 0.
+ *
+ * @param extent_lock  Take extent lock. Not needed if a process is already
+ *		       holding the OST object group locks.
+ *
+ * Returns 0 on success, -ENOMEM if the obdo cannot be allocated,
+ * -EOPNOTSUPP if the reply does not carry OBD_MD_FLDATAVERSION, or the
+ * error from ll_lsm_getattr().
+ */
+int ll_data_version(struct inode *inode, __u64 *data_version,
+		    int extent_lock)
+{
+	struct ll_sb_info	*sbi = ll_i2sbi(inode);
+	struct lov_stripe_md	*lsm;
+	struct obdo		*oa;
+	int			 rc;
+	ENTRY;
+
+	lsm = ccc_inode_lsm_get(inode);
+	if (lsm == NULL) {
+		/* If no stripe, we consider version is 0. */
+		*data_version = 0;
+		CDEBUG(D_INODE, "No object for inode\n");
+		RETURN(0);
+	}
+
+	OBD_ALLOC_PTR(oa);
+	if (oa == NULL) {
+		rc = -ENOMEM;
+		goto out_lsm;
+	}
+
+	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, oa, 0, extent_lock);
+	if (rc == 0) {
+		if (oa->o_valid & OBD_MD_FLDATAVERSION)
+			*data_version = oa->o_data_version;
+		else
+			rc = -EOPNOTSUPP;
+	}
+
+	OBD_FREE_PTR(oa);
+out_lsm:
+	ccc_inode_lsm_put(inode, lsm);
+
+	RETURN(rc);
+}
+
+/*
+ * Working state of ll_swap_layouts().  It is allocated dynamically there
+ * rather than declared as locals; the "1"/"2" members are swapped as a
+ * group when the two files are put into FID order.
+ */
+struct ll_swap_stack {
+	/* saved mtime/atime of inode1 / inode2, restored after the swap */
+	struct iattr		 ia1, ia2;
+	/* data versions the caller expects for inode1 / inode2 */
+	__u64			 dv1, dv2;
+	/* the two inodes whose layouts are exchanged */
+	struct inode		*inode1, *inode2;
+	/* whether dv1 / dv2 must be verified before swapping */
+	bool			 check_dv1, check_dv2;
+};
+
+/*
+ * Exchange the layouts of two open files.
+ *
+ * \param file1  first file
+ * \param file2  second file; must be a regular file on the same superblock
+ * \param lsl    user request: flags, expected data versions and an optional
+ *               group lock id
+ *
+ * Steps:
+ *  - check file type, write permission on both inodes and same superblock;
+ *  - order the two files by FID so that they are always processed in the
+ *    same order (swapping the same file with itself is a successful no-op);
+ *  - if sl_gid is non-zero, take a group lock with that id on both files;
+ *  - if requested, verify that the data versions still match;
+ *  - send LL_IOC_LOV_SWAP_LAYOUTS to the metadata export;
+ *  - drop the group locks and, if requested, restore mtime/atime.
+ *
+ * \retval -ENOMEM  allocation failure
+ * \retval -EINVAL  file2 is not a regular file
+ * \retval -EPERM   no write permission on one of the inodes
+ * \retval -EXDEV   the files live on different superblocks
+ * \retval -EAGAIN  a data version does not match the expected value
+ * \retval other    errors from the helpers called here
+ */
+static int ll_swap_layouts(struct file *file1, struct file *file2,
+			   struct lustre_swap_layouts *lsl)
+{
+	struct mdc_swap_layouts	 msl;
+	struct md_op_data	*op_data;
+	__u32			 gid;
+	__u64			 dv;
+	struct ll_swap_stack	*llss = NULL;
+	int			 rc;
+
+	OBD_ALLOC_PTR(llss);
+	if (llss == NULL)
+		RETURN(-ENOMEM);
+
+	llss->inode1 = file1->f_dentry->d_inode;
+	llss->inode2 = file2->f_dentry->d_inode;
+
+	if (!S_ISREG(llss->inode2->i_mode))
+		GOTO(free, rc = -EINVAL);
+
+	if (ll_permission(llss->inode1, MAY_WRITE, NULL) ||
+	    ll_permission(llss->inode2, MAY_WRITE, NULL))
+		GOTO(free, rc = -EPERM);
+
+	if (llss->inode2->i_sb != llss->inode1->i_sb)
+		GOTO(free, rc = -EXDEV);
+
+	/* we use 2 bool because it is easier to swap than 2 bits */
+	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
+		llss->check_dv1 = true;
+
+	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
+		llss->check_dv2 = true;
+
+	/* we cannot use lsl->sl_dvX directly because we may swap them */
+	llss->dv1 = lsl->sl_dv1;
+	llss->dv2 = lsl->sl_dv2;
+
+	rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
+	if (rc == 0) /* same file, done! */
+		GOTO(free, rc = 0);
+
+	if (rc < 0) { /* sequentialize it */
+		swap(llss->inode1, llss->inode2);
+		swap(file1, file2);
+		swap(llss->dv1, llss->dv2);
+		swap(llss->check_dv1, llss->check_dv2);
+	}
+
+	gid = lsl->sl_gid;
+	if (gid != 0) { /* application asks to flush dirty cache */
+		rc = ll_get_grouplock(llss->inode1, file1, gid);
+		if (rc < 0)
+			GOTO(free, rc);
+
+		rc = ll_get_grouplock(llss->inode2, file2, gid);
+		if (rc < 0) {
+			ll_put_grouplock(llss->inode1, file1, gid);
+			GOTO(free, rc);
+		}
+	}
+
+	/* to be able to restore mtime and atime after swap
+	 * we need to first save them */
+	if (lsl->sl_flags &
+	    (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
+		llss->ia1.ia_mtime = llss->inode1->i_mtime;
+		llss->ia1.ia_atime = llss->inode1->i_atime;
+		llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
+		llss->ia2.ia_mtime = llss->inode2->i_mtime;
+		llss->ia2.ia_atime = llss->inode2->i_atime;
+		llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
+	}
+
+	/* ultimate check, before swapping the layouts we check if
+	 * dataversion has changed (if requested) */
+	if (llss->check_dv1) {
+		rc = ll_data_version(llss->inode1, &dv, 0);
+		if (rc)
+			GOTO(putgl, rc);
+		if (dv != llss->dv1)
+			GOTO(putgl, rc = -EAGAIN);
+	}
+
+	if (llss->check_dv2) {
+		rc = ll_data_version(llss->inode2, &dv, 0);
+		if (rc)
+			GOTO(putgl, rc);
+		if (dv != llss->dv2)
+			GOTO(putgl, rc = -EAGAIN);
+	}
+
+	/* struct md_op_data is used to send the swap args to the mdt
+	 * only flags is missing, so we use struct mdc_swap_layouts
+	 * through the md_op_data->op_data */
+	/* flags from user space have to be converted before they are send to
+	 * server, no flag is sent today, they are only used on the client */
+	msl.msl_flags = 0;
+	op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
+				     0, LUSTRE_OPC_ANY, &msl);
+	/* ll_prep_md_op_data() reports failure through an ERR_PTR value (see
+	 * the IS_ERR() check in ll_lov_getstripe_ea_info()), so a NULL test
+	 * alone would let an error pointer reach obd_iocontrol() and
+	 * ll_finish_md_op_data().  Both failure forms go through putgl so
+	 * that the group locks taken above are released. */
+	if (op_data == NULL)
+		GOTO(putgl, rc = -ENOMEM);
+	if (IS_ERR(op_data))
+		GOTO(putgl, rc = PTR_ERR(op_data));
+
+	rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS,
+			   ll_i2mdexp(llss->inode1),
+			   sizeof(*op_data), op_data, NULL);
+	ll_finish_md_op_data(op_data);
+
+putgl:
+	if (gid != 0) {
+		ll_put_grouplock(llss->inode2, file2, gid);
+		ll_put_grouplock(llss->inode1, file1, gid);
+	}
+
+	/* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
+	if (rc != 0)
+		GOTO(free, rc);
+
+	/* clear useless flags */
+	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
+		llss->ia1.ia_valid &= ~ATTR_MTIME;
+		llss->ia2.ia_valid &= ~ATTR_MTIME;
+	}
+
+	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
+		llss->ia1.ia_valid &= ~ATTR_ATIME;
+		llss->ia2.ia_valid &= ~ATTR_ATIME;
+	}
+
+	/* update time if requested: the layouts have been exchanged, so the
+	 * times saved from inode2 are applied to file1 and vice versa */
+	rc = 0;
+	if (llss->ia2.ia_valid != 0) {
+		mutex_lock(&llss->inode1->i_mutex);
+		rc = ll_setattr(file1->f_dentry, &llss->ia2);
+		mutex_unlock(&llss->inode1->i_mutex);
+	}
+
+	if (llss->ia1.ia_valid != 0) {
+		int rc1;
+
+		mutex_lock(&llss->inode2->i_mutex);
+		rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
+		mutex_unlock(&llss->inode2->i_mutex);
+		/* report the first failure only */
+		if (rc == 0)
+			rc = rc1;
+	}
+
+free:
+	if (llss != NULL)
+		OBD_FREE_PTR(llss);
+
+	RETURN(rc);
+}
+
+/*
+ * ioctl entry point for regular files.
+ *
+ * tty ioctls (type 'T' or 't') are refused with -ENOTTY.  Commands handled
+ * here are dispatched to the matching helper; any other command is first
+ * offered to ll_iocontrol_call() and, unless that returns LLIOC_STOP, is
+ * forwarded to the data export through obd_iocontrol().
+ *
+ * \param file  open file the ioctl was issued on
+ * \param cmd   ioctl command
+ * \param arg   command argument, usually a user-space pointer
+ *
+ * \retval 0 or a positive value on success (command specific)
+ * \retval <0 on failure
+ */
+long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct inode		*inode = file->f_dentry->d_inode;
+	struct ll_file_data	*fd = LUSTRE_FPRIVATE(file);
+	int			 flags, rc;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
+	       inode->i_generation, inode, cmd);
+	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
+
+	/* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
+	if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
+		RETURN(-ENOTTY);
+
+	switch (cmd) {
+	case LL_IOC_GETFLAGS:
+		/* Get the current value of the file flags */
+		return put_user(fd->fd_flags, (int *)arg);
+	case LL_IOC_SETFLAGS:
+	case LL_IOC_CLRFLAGS:
+		/* Set or clear specific file flags */
+		/* XXX This probably needs checks to ensure the flags are
+		 *     not abused, and to handle any flag side effects.
+		 */
+		if (get_user(flags, (int *) arg))
+			RETURN(-EFAULT);
+
+		if (cmd == LL_IOC_SETFLAGS) {
+			if ((flags & LL_FILE_IGNORE_LOCK) &&
+			    !(file->f_flags & O_DIRECT)) {
+				CERROR("%s: unable to disable locking on "
+				       "non-O_DIRECT file\n", current->comm);
+				RETURN(-EINVAL);
+			}
+
+			fd->fd_flags |= flags;
+		} else {
+			fd->fd_flags &= ~flags;
+		}
+		RETURN(0);
+	case LL_IOC_LOV_SETSTRIPE:
+		RETURN(ll_lov_setstripe(inode, file, arg));
+	case LL_IOC_LOV_SETEA:
+		RETURN(ll_lov_setea(inode, file, arg));
+	case LL_IOC_LOV_SWAP_LAYOUTS: {
+		struct file *file2;
+		struct lustre_swap_layouts lsl;
+
+		if (copy_from_user(&lsl, (char *)arg,
+				       sizeof(struct lustre_swap_layouts)))
+			RETURN(-EFAULT);
+
+		/* both files must have been opened for writing */
+		if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
+			RETURN(-EPERM);
+
+		file2 = fget(lsl.sl_fd);
+		if (file2 == NULL)
+			RETURN(-EBADF);
+
+		rc = -EPERM;
+		if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
+			rc = ll_swap_layouts(file, file2, &lsl);
+		fput(file2);
+		RETURN(rc);
+	}
+	case LL_IOC_LOV_GETSTRIPE:
+		RETURN(ll_lov_getstripe(inode, arg));
+	case LL_IOC_RECREATE_OBJ:
+		RETURN(ll_lov_recreate_obj(inode, arg));
+	case LL_IOC_RECREATE_FID:
+		RETURN(ll_lov_recreate_fid(inode, arg));
+	case FSFILT_IOC_FIEMAP:
+		RETURN(ll_ioctl_fiemap(inode, arg));
+	case FSFILT_IOC_GETFLAGS:
+	case FSFILT_IOC_SETFLAGS:
+		RETURN(ll_iocontrol(inode, file, cmd, arg));
+	case FSFILT_IOC_GETVERSION_OLD:
+	case FSFILT_IOC_GETVERSION:
+		RETURN(put_user(inode->i_generation, (int *)arg));
+	case LL_IOC_GROUP_LOCK:
+		RETURN(ll_get_grouplock(inode, file, arg));
+	case LL_IOC_GROUP_UNLOCK:
+		RETURN(ll_put_grouplock(inode, file, arg));
+	case IOC_OBD_STATFS:
+		RETURN(ll_obd_statfs(inode, (void *)arg));
+
+	/* We need to special case any other ioctls we want to handle,
+	 * to send them to the MDS/OST as appropriate and to properly
+	 * network encode the arg field.
+	case FSFILT_IOC_SETVERSION_OLD:
+	case FSFILT_IOC_SETVERSION:
+	*/
+	case LL_IOC_FLUSHCTX:
+		RETURN(ll_flush_ctx(inode));
+	case LL_IOC_PATH2FID: {
+		if (copy_to_user((void *)arg, ll_inode2fid(inode),
+				 sizeof(struct lu_fid)))
+			RETURN(-EFAULT);
+
+		RETURN(0);
+	}
+	case OBD_IOC_FID2PATH:
+		RETURN(ll_fid2path(inode, (void *)arg));
+	case LL_IOC_DATA_VERSION: {
+		struct ioc_data_version	idv;
+
+		if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
+			RETURN(-EFAULT);
+
+		/* the extent lock is taken unless LL_DV_NOFLUSH was given */
+		rc = ll_data_version(inode, &idv.idv_version,
+				!(idv.idv_flags & LL_DV_NOFLUSH));
+
+		if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
+			RETURN(-EFAULT);
+
+		RETURN(rc);
+	}
+
+	case LL_IOC_GET_MDTIDX: {
+		int mdtidx;
+
+		mdtidx = ll_get_mdt_idx(inode);
+		if (mdtidx < 0)
+			RETURN(mdtidx);
+
+		if (put_user((int)mdtidx, (int*)arg))
+			RETURN(-EFAULT);
+
+		RETURN(0);
+	}
+	case OBD_IOC_GETDTNAME:
+	case OBD_IOC_GETMDNAME:
+		RETURN(ll_get_obd_name(inode, cmd, arg));
+	case LL_IOC_HSM_STATE_GET: {
+		struct md_op_data	*op_data;
+		struct hsm_user_state	*hus;
+
+		OBD_ALLOC_PTR(hus);
+		if (hus == NULL)
+			RETURN(-ENOMEM);
+
+		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+					     LUSTRE_OPC_ANY, hus);
+		/* ll_prep_md_op_data() reports failure through an ERR_PTR
+		 * value (see ll_lov_getstripe_ea_info()); testing for NULL
+		 * alone would let an error pointer be dereferenced below */
+		if (op_data == NULL || IS_ERR(op_data)) {
+			OBD_FREE_PTR(hus);
+			RETURN(op_data == NULL ? -ENOMEM : PTR_ERR(op_data));
+		}
+
+		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
+				   op_data, NULL);
+
+		/* NOTE(review): the state is copied out even when the
+		 * iocontrol failed; only a failing copy changes rc */
+		if (copy_to_user((void *)arg, hus, sizeof(*hus)))
+			rc = -EFAULT;
+
+		ll_finish_md_op_data(op_data);
+		OBD_FREE_PTR(hus);
+		RETURN(rc);
+	}
+	case LL_IOC_HSM_STATE_SET: {
+		struct md_op_data	*op_data;
+		struct hsm_state_set	*hss;
+
+		OBD_ALLOC_PTR(hss);
+		if (hss == NULL)
+			RETURN(-ENOMEM);
+		if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
+			OBD_FREE_PTR(hss);
+			RETURN(-EFAULT);
+		}
+
+		/* Non-root users are forbidden to set or clear flags which are
+		 * NOT defined in HSM_USER_MASK. */
+		if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
+		    && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
+			OBD_FREE_PTR(hss);
+			RETURN(-EPERM);
+		}
+
+		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+					     LUSTRE_OPC_ANY, hss);
+		/* failure is reported as an ERR_PTR value, not only NULL */
+		if (op_data == NULL || IS_ERR(op_data)) {
+			OBD_FREE_PTR(hss);
+			RETURN(op_data == NULL ? -ENOMEM : PTR_ERR(op_data));
+		}
+
+		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
+				   op_data, NULL);
+
+		ll_finish_md_op_data(op_data);
+
+		OBD_FREE_PTR(hss);
+		RETURN(rc);
+	}
+	case LL_IOC_HSM_ACTION: {
+		struct md_op_data		*op_data;
+		struct hsm_current_action	*hca;
+
+		OBD_ALLOC_PTR(hca);
+		if (hca == NULL)
+			RETURN(-ENOMEM);
+
+		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+					     LUSTRE_OPC_ANY, hca);
+		/* failure is reported as an ERR_PTR value, not only NULL */
+		if (op_data == NULL || IS_ERR(op_data)) {
+			OBD_FREE_PTR(hca);
+			RETURN(op_data == NULL ? -ENOMEM : PTR_ERR(op_data));
+		}
+
+		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
+				   op_data, NULL);
+
+		/* NOTE(review): copied out even when the iocontrol failed;
+		 * only a failing copy changes rc */
+		if (copy_to_user((char *)arg, hca, sizeof(*hca)))
+			rc = -EFAULT;
+
+		ll_finish_md_op_data(op_data);
+		OBD_FREE_PTR(hca);
+		RETURN(rc);
+	}
+	default: {
+		int err;
+
+		/* give registered handlers a chance first; LLIOC_STOP means
+		 * the command was consumed and err holds its result */
+		if (LLIOC_STOP ==
+		     ll_iocontrol_call(inode, file, cmd, arg, &err))
+			RETURN(err);
+
+		RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
+				     (void *)arg));
+	}
+	}
+}
+
+
+/*
+ * llseek entry point.
+ *
+ * For SEEK_END, SEEK_HOLE and SEEK_DATA the file size is refreshed with
+ * ll_glimpse_size() first and passed on as the end-of-file value; all other
+ * origins use an end-of-file value of 0.  The seek itself is performed by
+ * ll_generic_file_llseek_size() with ll_file_maxbytes() as the limit.
+ *
+ * Returns the value of ll_generic_file_llseek_size(), or the non-zero
+ * result of ll_glimpse_size().
+ */
+loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	loff_t base, target, rc;
+	loff_t eof = 0;
+
+	ENTRY;
+	/* target position, computed for the trace message only */
+	switch (origin) {
+	case SEEK_END:
+		base = i_size_read(inode);
+		break;
+	case SEEK_CUR:
+		base = file->f_pos;
+		break;
+	default:
+		base = 0;
+		break;
+	}
+	target = offset + base;
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
+	       inode->i_ino, inode->i_generation, inode, target, target,
+	       origin);
+	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
+
+	switch (origin) {
+	case SEEK_END:
+	case SEEK_HOLE:
+	case SEEK_DATA:
+		/* these origins depend on an up-to-date file size */
+		rc = ll_glimpse_size(inode);
+		if (rc != 0)
+			RETURN(rc);
+		eof = i_size_read(inode);
+		break;
+	default:
+		break;
+	}
+
+	rc = ll_generic_file_llseek_size(file, offset, origin,
+					 ll_file_maxbytes(inode), eof);
+	RETURN(rc);
+}
+
+/*
+ * flush file operation.
+ *
+ * Collects and clears the asynchronous write-back errors recorded for the
+ * inode (lli_async_rc and the value from lov_read_and_clear_async_rc()).
+ * Any such error is reported as -EIO, unless fd_write_failed shows that the
+ * application has already been told about a write failure, in which case 0
+ * is returned.
+ */
+int ll_flush(struct file *file, fl_owner_t id)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+	int async_rc, lov_rc;
+
+	LASSERT(!S_ISDIR(inode->i_mode));
+
+	/* catch async errors that were recorded back when async writeback
+	 * failed for pages in this mapping. */
+	async_rc = lli->lli_async_rc;
+	lli->lli_async_rc = 0;
+	lov_rc = lov_read_and_clear_async_rc(lli->lli_clob);
+
+	/* The application has been told write failure already.
+	 * Do not report failure again. */
+	if (fd->fd_write_failed)
+		return 0;
+
+	if (async_rc != 0 || lov_rc != 0)
+		return -EIO;
+
+	return 0;
+}
+
+/**
+ * Called to make sure a portion of file has been written out.
+ *
+ * A CIT_FSYNC io covering [\a start, \a end] is set up on the inode's cl
+ * object and run with cl_io_loop().
+ *
+ * \param mode  one of CL_FSYNC_NONE, CL_FSYNC_LOCAL, CL_FSYNC_DISCARD or
+ *              CL_FSYNC_ALL; anything else yields -EINVAL.
+ *              NOTE(review): the original comment said "if @local_only is
+ *              not true, it will send OST_SYNC RPCs to ost", which refers
+ *              to a parameter that no longer exists; presumably \a mode
+ *              now selects that behaviour - confirm.
+ *
+ * Return how many pages have been written (fi_nr_written) when the io
+ * result is 0, otherwise the io result itself.
+ */
+int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
+		       enum cl_fsync_mode mode)
+{
+	struct cl_env_nest nest;
+	struct lu_env *env;
+	struct cl_io *io;
+	struct obd_capa *capa = NULL;
+	struct cl_fsync_io *fio;
+	int result;
+	ENTRY;
+
+	if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
+	    mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
+		RETURN(-EINVAL);
+
+	env = cl_env_nested_get(&nest);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
+
+	io = ccc_env_thread_io(env);
+	io->ci_obj = cl_i2info(inode)->lli_clob;
+	io->ci_ignore_layout = 1;
+
+	/* initialize parameters for sync */
+	fio = &io->u.ci_fsync;
+	fio->fi_capa = capa;
+	fio->fi_start = start;
+	fio->fi_end = end;
+	fio->fi_fid = ll_inode2fid(inode);
+	fio->fi_mode = mode;
+	fio->fi_nr_written = 0;
+
+	if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
+		result = cl_io_loop(env, io);
+	else
+		result = io->ci_result;
+	/* on success the return value becomes the number of pages written */
+	if (result == 0)
+		result = fio->fi_nr_written;
+	cl_io_fini(env, io);
+	cl_env_nested_put(&nest, env);
+
+	capa_put(capa);
+
+	RETURN(result);
+}
+
+/*
+ * When dentry is provided (the 'else' case), *file->f_dentry may be
+ * null and dentry must be used directly rather than pulled from
+ * *file->f_dentry as is done otherwise.
+ *
+ * NOTE(review): the paragraph above does not match the function below,
+ * which takes no dentry argument and always uses file->f_dentry; it
+ * presumably describes an older variant of this function - confirm.
+ *
+ * fsync file operation: writes out and waits for the page range
+ * [start, end], collects asynchronous write-back errors recorded for the
+ * inode, syncs the metadata through md_sync() and, for a datasync on a
+ * regular file, syncs the whole object range with
+ * cl_sync_file_range(CL_FSYNC_ALL).  The first error encountered is the one
+ * returned.
+ */
+
+int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	struct dentry *dentry = file->f_dentry;
+	struct inode *inode = dentry->d_inode;
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct ptlrpc_request *req;
+	struct obd_capa *oc;
+	int rc, err;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
+	       inode->i_generation, inode);
+	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
+
+	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	/* everything below runs with i_mutex held, even if the write-out
+	 * above failed */
+	mutex_lock(&inode->i_mutex);
+
+	/* catch async errors that were recorded back when async writeback
+	 * failed for pages in this mapping. */
+	if (!S_ISDIR(inode->i_mode)) {
+		err = lli->lli_async_rc;
+		lli->lli_async_rc = 0;
+		if (rc == 0)
+			rc = err;
+		err = lov_read_and_clear_async_rc(lli->lli_clob);
+		if (rc == 0)
+			rc = err;
+	}
+
+	oc = ll_mdscapa_get(inode);
+	err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
+		      &req);
+	capa_put(oc);
+	if (!rc)
+		rc = err;
+	/* the request is only released when md_sync() succeeded */
+	if (!err)
+		ptlrpc_req_finished(req);
+
+	if (datasync && S_ISREG(inode->i_mode)) {
+		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+		/* a non-negative result is a page count, not an error */
+		err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
+				CL_FSYNC_ALL);
+		if (rc == 0 && err < 0)
+			rc = err;
+		/* remember the outcome so that ll_flush() does not report
+		 * the same failure a second time */
+		if (rc < 0)
+			fd->fd_write_failed = true;
+		else
+			fd->fd_write_failed = false;
+	}
+
+	mutex_unlock(&inode->i_mutex);
+	RETURN(rc);
+}
+
+int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
+					   .ei_cb_cp =ldlm_flock_completion_ast,
+					   .ei_cbdata = file_lock };
+	struct md_op_data *op_data;
+	struct lustre_handle lockh = {0};
+	ldlm_policy_data_t flock = {{0}};
+	int flags = 0;
+	int rc;
+	int rc2 = 0;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
+	       inode->i_ino, file_lock);
+
+	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
+
+	if (file_lock->fl_flags & FL_FLOCK) {
+		LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
+		/* flocks are whole-file locks */
+		flock.l_flock.end = OFFSET_MAX;
+		/* For flocks owner is determined by the local file descriptor */
+		flock.l_flock.owner = (unsigned long)file_lock->fl_file;
+	} else if (file_lock->fl_flags & FL_POSIX) {
+		flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
+		flock.l_flock.start = file_lock->fl_start;
+		flock.l_flock.end = file_lock->fl_end;
+	} else {
+		RETURN(-EINVAL);
+	}
+	flock.l_flock.pid = file_lock->fl_pid;
+
+	/* Somewhat ugly workaround for svc lockd.
+	 * lockd installs custom fl_lmops->lm_compare_owner that checks
+	 * for the fl_owner to be the same (which it always is on local node
+	 * I guess between lockd processes) and then compares pid.
+	 * As such we assign pid to the owner field to make it all work,
+	 * conflict with normal locks is unlikely since pid space and
+	 * pointer space for current->files are not intersecting */
+	if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
+		flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
+
+	switch (file_lock->fl_type) {
+	case F_RDLCK:
+		einfo.ei_mode = LCK_PR;
+		break;
+	case F_UNLCK:
+		/* An unlock request may or may not have any relation to
+		 * existing locks so we may not be able to pass a lock handle
+		 * via a normal ldlm_lock_cancel() request. The request may even
+		 * unlock a byte range in the middle of an existing lock. In
+		 * order to process an unlock request we need all of the same
+		 * information that is given with a normal read or write record
+		 * lock request. To avoid creating another ldlm unlock (cancel)
+		 * message we'll treat a LCK_NL flock request as an unlock. */
+		einfo.ei_mode = LCK_NL;
+		break;
+	case F_WRLCK:
+		einfo.ei_mode = LCK_PW;
+		break;
+	default:
+		CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
+			file_lock->fl_type);
+		RETURN (-ENOTSUPP);
+	}
+
+	switch (cmd) {
+	case F_SETLKW:
+#ifdef F_SETLKW64
+	case F_SETLKW64:
+#endif
+		flags = 0;
+		break;
+	case F_SETLK:
+#ifdef F_SETLK64
+	case F_SETLK64:
+#endif
+		flags = LDLM_FL_BLOCK_NOWAIT;
+		break;
+	case F_GETLK:
+#ifdef F_GETLK64
+	case F_GETLK64:
+#endif
+		flags = LDLM_FL_TEST_LOCK;
+		/* Save the old mode so that if the mode in the lock changes we
+		 * can decrement the appropriate reader or writer refcount. */
+		file_lock->fl_type = einfo.ei_mode;
+		break;
+	default:
+		CERROR("unknown fcntl lock command: %d\n", cmd);
+		RETURN (-EINVAL);
+	}
+
+	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+				     LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+
+	CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
+	       "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
+	       flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
+
+	rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
+			op_data, &lockh, &flock, 0, NULL /* req */, flags);
+
+	if ((file_lock->fl_flags & FL_FLOCK) &&
+	    (rc == 0 || file_lock->fl_type == F_UNLCK))
+		rc2  = flock_lock_file_wait(file, file_lock);
+	if ((file_lock->fl_flags & FL_POSIX) &&
+	    (rc == 0 || file_lock->fl_type == F_UNLCK) &&
+	    !(flags & LDLM_FL_TEST_LOCK))
+		rc2  = posix_lock_file_wait(file, file_lock);
+
+	if (rc2 && file_lock->fl_type != F_UNLCK) { /* local lock failed: undo */
+		einfo.ei_mode = LCK_NL;
+		md_enqueue(sbi->ll_md_exp, &einfo, NULL,
+			op_data, &lockh, &flock, 0, NULL /* req */, flags);
+		rc = rc2;
+	}
+
+	ll_finish_md_op_data(op_data);
+
+	RETURN(rc);
+}
+
+int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
+{
+	ENTRY;
+
+	RETURN(-ENOSYS); /* every lock request is refused */
+}
+
+/**
+ * Test whether locks covering \a *bits in mode \a l_req_mode are held.
+ * - the bits may be spread over several locks
+ * - every bit that is found is cleared from *bits
+ * - bits that are not found are left set in *bits
+ * \param inode [IN] inode to check; a NULL inode yields 0
+ * \param bits [IN/OUT] lock bits searched for / bits still missing
+ * \param l_req_mode [IN] searched lock mode; LCK_MINMODE means CR/CW/PR/PW
+ * \retval boolean, true iff all bits are found
+ */
+int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
+{
+	struct lustre_handle lockh;
+	ldlm_policy_data_t policy;
+	ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
+				(LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
+	struct lu_fid *fid;
+	__u64 flags;
+	int i;
+	ENTRY;
+
+	if (!inode)
+		RETURN(0);
+
+	fid = &ll_i2info(inode)->lli_fid;
+	CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
+	       ldlm_lockname[mode]);
+
+	flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
+	for (i = 0; i < MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
+		policy.l_inodebits.bits = *bits & (1ULL << i);
+		if (policy.l_inodebits.bits == 0)
+			continue;
+
+		if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
+				  &policy, mode, &lockh)) {
+			struct ldlm_lock *lock;
+
+			lock = ldlm_handle2lock(&lockh);
+			if (lock) {
+				/* clear every bit the matched lock covers */
+				*bits &= ~lock->l_policy_data.l_inodebits.bits;
+				LDLM_LOCK_PUT(lock);
+			} else {
+				*bits &= ~policy.l_inodebits.bits;
+			}
+		}
+	}
+	RETURN(*bits == 0);
+}
+
+ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
+			    struct lustre_handle *lockh, __u64 flags)
+{
+	/* any of CR/CW/PR/PW on the inode's FID is acceptable for \a bits */
+	const ldlm_mode_t any_mode = LCK_CR | LCK_CW | LCK_PR | LCK_PW;
+	ldlm_policy_data_t ibits_policy = { .l_inodebits = { bits } };
+	struct lu_fid *res_fid = &ll_i2info(inode)->lli_fid;
+	ldlm_mode_t matched;
+	ENTRY;
+
+	CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(res_fid));
+	matched = md_lock_match(ll_i2mdexp(inode),
+				LDLM_FL_BLOCK_GRANTED | flags, res_fid,
+				LDLM_IBITS, &ibits_policy, any_mode, lockh);
+	RETURN(matched);
+}
+
+static int ll_inode_revalidate_fini(struct inode *inode, int rc)
+{
+	if (rc == 0)
+		return 0;
+	if (rc != -ENOENT) {
+		CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
+		       ll_get_fsname(inode->i_sb, NULL, 0),
+		       PFID(ll_inode2fid(inode)), rc);
+		return rc;
+	}
+	/* Already unlinked: zero nlink. Success is reported for special
+	 * files only; regular files and directories keep -ENOENT (this is
+	 * only reachable for regular files through obscure races). */
+	clear_nlink(inode);
+	if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+		return 0;
+	return rc;
+}
+
+int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
+			     __u64 ibits)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ptlrpc_request *req = NULL;
+	struct obd_export *exp;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(inode != NULL);
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
+	       inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
+
+	exp = ll_i2mdexp(inode);
+
+	/* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
+	 *      But under CMD case, it caused some lock issues, should be fixed
+	 *      with new CMD ibits lock. See bug 12718 */
+	if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
+		struct lookup_intent oit = { .it_op = IT_GETATTR };
+		struct md_op_data *op_data;
+
+		if (ibits == MDS_INODELOCK_LOOKUP)
+			oit.it_op = IT_LOOKUP;
+
+		/* Call getattr by fid, so do not provide name at all. */
+		op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
+					     dentry->d_inode, NULL, 0, 0,
+					     LUSTRE_OPC_ANY, NULL);
+		if (IS_ERR(op_data))
+			RETURN(PTR_ERR(op_data));
+
+		oit.it_create_mode |= M_CHECK_STALE;
+		rc = md_intent_lock(exp, op_data, NULL, 0,
+				    /* we are not interested in name
+				       based lookup */
+				    &oit, 0, &req,
+				    ll_md_blocking_ast, 0);
+		ll_finish_md_op_data(op_data);
+		oit.it_create_mode &= ~M_CHECK_STALE;
+		if (rc < 0) {
+			rc = ll_inode_revalidate_fini(inode, rc);
+			GOTO (out, rc);
+		}
+
+		rc = ll_revalidate_it_finish(req, &oit, dentry);
+		if (rc != 0) {
+			ll_intent_release(&oit);
+			GOTO(out, rc);
+		}
+
+		/* Unlinked? Unhash the dentry, so it is not picked up later
+		   by do_lookup() -> ll_revalidate_it(). d_drop cannot be
+		   used here, to preserve get_cwd functionality on 2.6.
+		   Bug 10503 */
+		if (!dentry->d_inode->i_nlink)
+			d_lustre_invalidate(dentry);
+
+		ll_lookup_finish_locks(&oit, dentry);
+	} else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
+		struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
+		obd_valid valid = OBD_MD_FLGETATTR;
+		struct md_op_data *op_data;
+		int ealen = 0;
+
+		if (S_ISREG(inode->i_mode)) {
+			rc = ll_get_max_mdsize(sbi, &ealen);
+			if (rc)
+				RETURN(rc);
+			valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
+		}
+
+		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
+					     0, ealen, LUSTRE_OPC_ANY,
+					     NULL);
+		if (IS_ERR(op_data))
+			RETURN(PTR_ERR(op_data));
+
+		op_data->op_valid = valid;
+		/* When OBD_CONNECT_ATTRFID is not supported, no capa can be
+		 * found for this inode, because only the capas of directories
+		 * are kept fresh. */
+		rc = md_getattr(sbi->ll_md_exp, op_data, &req);
+		ll_finish_md_op_data(op_data);
+		if (rc) {
+			rc = ll_inode_revalidate_fini(inode, rc);
+			RETURN(rc);
+		}
+
+		rc = ll_prep_inode(&inode, req, NULL, NULL);
+	}
+out:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
+			   __u64 ibits)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ll_inode_info *lli = ll_i2info(inode);
+	int rc;
+	ENTRY;
+
+	rc = __ll_inode_revalidate_it(dentry, it, ibits);
+	if (rc)
+		RETURN(rc);
+	/* size is validated for regular files only; others copy lvb times */
+	if (S_ISREG(inode->i_mode)) {
+		rc = ll_glimpse_size(inode);
+	} else {
+		LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
+		LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
+		LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
+	}
+	RETURN(rc);
+}
+
+int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
+		  struct lookup_intent *it, struct kstat *stat)
+{
+	struct inode *inode = de->d_inode;
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct ll_inode_info *lli = ll_i2info(inode);
+	int rc;
+
+	/* revalidate first; the call is tallied even when that fails */
+	rc = ll_inode_revalidate_it(de, it,
+				    MDS_INODELOCK_UPDATE | MDS_INODELOCK_LOOKUP);
+	ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
+	if (rc != 0)
+		return rc;
+
+	/* identity and ownership */
+	stat->dev = inode->i_sb->s_dev;
+	stat->ino = ll_need_32bit_api(sbi) ?
+		    cl_fid_build_ino(&lli->lli_fid, 1) : inode->i_ino;
+	stat->rdev = inode->i_rdev;
+	stat->mode = inode->i_mode;
+	stat->nlink = inode->i_nlink;
+	stat->uid = inode->i_uid;
+	stat->gid = inode->i_gid;
+	/* timestamps */
+	stat->atime = inode->i_atime;
+	stat->mtime = inode->i_mtime;
+	stat->ctime = inode->i_ctime;
+	/* size information */
+	stat->size = i_size_read(inode);
+	stat->blocks = inode->i_blocks;
+	stat->blksize = 1 << inode->i_blkbits;
+
+	return 0;
+}
+int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
+{
+	struct lookup_intent getattr_it = { .it_op = IT_GETATTR };
+
+	return ll_getattr_it(mnt, de, &getattr_it, stat);
+}
+
+
+struct posix_acl *ll_get_acl(struct inode *inode, int type)
+{
+	struct ll_inode_info *info = ll_i2info(inode);
+	struct posix_acl *dup;
+	ENTRY;
+
+	/* duplicate under lli_lock; VFS' acl_permission_check->check_acl
+	 * will release the refcount */
+	spin_lock(&info->lli_lock);
+	dup = posix_acl_dup(info->lli_posix_acl);
+	spin_unlock(&info->lli_lock);
+	RETURN(dup);
+}
+
+
+int ll_inode_permission(struct inode *inode, int mask)
+{
+	int rc = 0;
+	ENTRY;
+
+#ifdef MAY_NOT_BLOCK
+	if (mask & MAY_NOT_BLOCK)
+		RETURN(-ECHILD);
+#endif
+
+	/* The root inode is NOT validated by the lookup operation, so it
+	 * has to be revalidated here, before the permission check. */
+	if (inode == inode->i_sb->s_root->d_inode) {
+		struct lookup_intent it = { .it_op = IT_LOOKUP };
+
+		rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
+					      MDS_INODELOCK_LOOKUP);
+		if (rc)
+			RETURN(rc);
+	}
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
+	       inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
+
+	if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT) {
+		rc = lustre_check_remote_perm(inode, mask);
+	} else {
+		/* NOTE(review): "flags" is undeclared here - macro arg? confirm */
+		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
+		rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
+	}
+	RETURN(rc);
+}
+
+#define READ_METHOD aio_read
+#define READ_FUNCTION ll_file_aio_read
+#define WRITE_METHOD aio_write
+#define WRITE_FUNCTION ll_file_aio_write
+
+/* -o localflock: no .flock/.lock set, so flock locks are only local */
+struct file_operations ll_file_operations = {
+	.llseek		= ll_file_seek,
+	.read		= ll_file_read,
+	.READ_METHOD	= READ_FUNCTION,
+	.write		= ll_file_write,
+	.WRITE_METHOD	= WRITE_FUNCTION,
+	.splice_read	= ll_file_splice_read,
+	.mmap		= ll_file_mmap,
+	.unlocked_ioctl	= ll_file_ioctl,
+	.open		= ll_file_open,
+	.flush		= ll_flush,
+	.release	= ll_file_release,
+	.fsync		= ll_fsync,
+};
+
+struct file_operations ll_file_operations_flock = {
+	.llseek		= ll_file_seek,
+	.read		= ll_file_read,
+	.READ_METHOD	= READ_FUNCTION,
+	.write		= ll_file_write,
+	.WRITE_METHOD	= WRITE_FUNCTION,
+	.splice_read	= ll_file_splice_read,
+	.mmap		= ll_file_mmap,
+	.unlocked_ioctl	= ll_file_ioctl,
+	.open		= ll_file_open,
+	.flush		= ll_flush,
+	.release	= ll_file_release,
+	.fsync		= ll_fsync,
+	.flock		= ll_file_flock,
+	.lock		= ll_file_flock,
+};
+
+/* Used for -o noflock: flock and lock calls return ENOSYS */
+struct file_operations ll_file_operations_noflock = {
+	.llseek		= ll_file_seek,
+	.read		= ll_file_read,
+	.READ_METHOD	= READ_FUNCTION,
+	.write		= ll_file_write,
+	.WRITE_METHOD	= WRITE_FUNCTION,
+	.splice_read	= ll_file_splice_read,
+	.mmap		= ll_file_mmap,
+	.unlocked_ioctl	= ll_file_ioctl,
+	.open		= ll_file_open,
+	.flush		= ll_flush,
+	.release	= ll_file_release,
+	.fsync		= ll_fsync,
+	.flock		= ll_file_noflock,
+	.lock		= ll_file_noflock,
+};
+
+struct inode_operations ll_file_inode_operations = {
+	.permission	= ll_inode_permission,
+	.getattr	= ll_getattr,
+	.setattr	= ll_setattr,
+	.getxattr	= ll_getxattr,
+	.setxattr	= ll_setxattr,
+	.listxattr	= ll_listxattr,
+	.removexattr	= ll_removexattr,
+	.get_acl	= ll_get_acl,
+};
+
+/* dynamic ioctl number support routins */
+static struct llioc_ctl_data {
+	struct rw_semaphore	ioc_sem;
+	struct list_head	      ioc_head;
+} llioc = {
+	__RWSEM_INITIALIZER(llioc.ioc_sem),
+	LIST_HEAD_INIT(llioc.ioc_head)
+};
+
+
+struct llioc_data {
+	struct list_head	iocd_list;	/* link on llioc.ioc_head */
+	unsigned int		iocd_size;	/* allocated size in bytes */
+	llioc_callback_t	iocd_cb;	/* handler for iocd_cmd[] */
+	unsigned int		iocd_count;	/* entries in iocd_cmd[] */
+	unsigned int		iocd_cmd[];	/* ioctl numbers (flexible) */
+};
+
+void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
+{
+	struct llioc_data *entry = NULL;
+	unsigned int bytes;
+	ENTRY;
+
+	/* need a callback, a command array and a count in range */
+	if (!cb || !cmd || count < 0 || count > LLIOC_MAX_CMD)
+		RETURN(NULL);
+
+	bytes = sizeof(*entry) + count * sizeof(entry->iocd_cmd[0]);
+	OBD_ALLOC(entry, bytes);
+	if (!entry)
+		RETURN(NULL);
+
+	memset(entry, 0, sizeof(*entry));
+	entry->iocd_cb = cb;
+	entry->iocd_size = bytes;
+	entry->iocd_count = count;
+	memcpy(entry->iocd_cmd, cmd, count * sizeof(entry->iocd_cmd[0]));
+
+	down_write(&llioc.ioc_sem);
+	list_add_tail(&entry->iocd_list, &llioc.ioc_head);
+	up_write(&llioc.ioc_sem);
+
+	RETURN(entry);
+}
+
+void ll_iocontrol_unregister(void *magic)
+{
+	struct llioc_data *pos;
+
+	if (!magic)
+		return;
+
+	down_write(&llioc.ioc_sem);
+	list_for_each_entry(pos, &llioc.ioc_head, iocd_list) {
+		unsigned int size = pos->iocd_size;
+
+		if (pos != magic)
+			continue;
+		/* unlink under the lock, free after dropping it */
+		list_del(&pos->iocd_list);
+		up_write(&llioc.ioc_sem);
+		OBD_FREE(pos, size);
+		return;
+	}
+	up_write(&llioc.ioc_sem);
+
+	CWARN("didn't find iocontrol register block with magic: %p\n", magic);
+}
+
+EXPORT_SYMBOL(ll_iocontrol_register);
+EXPORT_SYMBOL(ll_iocontrol_unregister);
+
+enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
+			unsigned int cmd, unsigned long arg, int *rcp)
+{
+	enum llioc_iter verdict = LLIOC_CONT;
+	struct llioc_data *blk;
+	int cb_rc = -EINVAL;
+	int idx;
+
+	down_read(&llioc.ioc_sem);
+	list_for_each_entry(blk, &llioc.ioc_head, iocd_list) {
+		/* call the block's callback once if it registered cmd */
+		for (idx = 0; idx < blk->iocd_count; idx++)
+			if (blk->iocd_cmd[idx] == cmd)
+				break;
+		if (idx < blk->iocd_count)
+			verdict = blk->iocd_cb(inode, file, cmd, arg, blk,
+					       &cb_rc);
+		if (verdict == LLIOC_STOP)
+			break;
+	}
+	up_read(&llioc.ioc_sem);
+
+	if (rcp != NULL)
+		*rcp = cb_rc;
+	return verdict;
+}
+
+int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct cl_env_nest nest;
+	struct ldlm_lock *lock;
+	struct lu_env *env;
+	int rc;
+	ENTRY;
+
+	if (!lli->lli_clob)
+		RETURN(0);
+
+	env = cl_env_nested_get(&nest);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	rc = cl_conf_set(env, lli->lli_clob, conf);
+	cl_env_nested_put(&nest, env);
+
+	if (conf->coc_opc != OBJECT_CONF_SET)
+		RETURN(rc);
+
+	lock = conf->coc_lock;
+	LASSERT(lock != NULL);
+	LASSERT(ldlm_has_layout(lock));
+	/* The lock may only be allowed to match after the layout has been
+	 * applied to the inode, otherwise a false layout would be seen.
+	 * Applying the layout should happen before dropping the intent
+	 * lock. */
+	if (rc == 0)
+		ldlm_lock_allow_match(lock);
+	RETURN(rc);
+}
+
+/* Fetch the layout from the MDT via getxattr unless the lock has an LVB */
+static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
+
+{
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct obd_capa *oc;
+	struct ptlrpc_request *req;
+	struct mdt_body *body;
+	void *lvbdata;
+	void *lmm;
+	int lmmsize;
+	int rc;
+	ENTRY;
+
+	if (lock->l_lvb_data != NULL)
+		RETURN(0);
+
+	/* If the layout lock was granted right away, the layout is returned
+	 * within the DLM_LVB of the dlm reply; otherwise, if the lock was
+	 * ever blocked and then granted via completion ast, the layout has
+	 * to be fetched here. Note that the LVB buffer in the completion
+	 * AST cannot be used because it is not large enough. */
+	oc = ll_mdscapa_get(inode);
+	rc = ll_get_max_mdsize(sbi, &lmmsize);
+	if (rc == 0)
+		rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
+				OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
+				lmmsize, 0, &req);
+	capa_put(oc);
+	if (rc < 0)
+		RETURN(rc);
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	if (body == NULL || body->eadatasize > lmmsize)
+		GOTO(out, rc = -EPROTO);
+
+	lmmsize = body->eadatasize;
+	if (lmmsize == 0) /* empty layout */
+		GOTO(out, rc = 0);
+
+	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
+	if (lmm == NULL)
+		GOTO(out, rc = -EFAULT);
+
+	OBD_ALLOC_LARGE(lvbdata, lmmsize);
+	if (lvbdata == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	memcpy(lvbdata, lmm, lmmsize);
+	lock_res_and_lock(lock);
+	if (lock->l_lvb_data == NULL) {
+		lock->l_lvb_data = lvbdata;
+		lock->l_lvb_len = lmmsize;
+		lvbdata = NULL;
+	}
+	unlock_res_and_lock(lock);
+
+	if (lvbdata != NULL)
+		OBD_FREE_LARGE(lvbdata, lmmsize);
+	EXIT;
+
+out:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/**
+ * Apply the layout from the lock \a lockh to \a inode. The layout lock is
+ * held in \a mode on entry and is always released (decref'd) in here.
+ */
+static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
+				struct inode *inode, __u32 *gen, bool reconf)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct ll_sb_info    *sbi = ll_i2sbi(inode);
+	struct ldlm_lock *lock;
+	struct lustre_md md = { NULL };
+	struct cl_object_conf conf;
+	int rc = 0;
+	bool lvb_ready;
+	bool wait_layout = false;
+	ENTRY;
+
+	LASSERT(lustre_handle_is_used(lockh));
+
+	lock = ldlm_handle2lock(lockh);
+	LASSERT(lock != NULL);
+	LASSERT(ldlm_has_layout(lock));
+
+	LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
+		inode, PFID(&lli->lli_fid), reconf);
+
+	lock_res_and_lock(lock);
+	lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
+	unlock_res_and_lock(lock);
+	/* checking lvb_ready is racy but this is okay. The worst case is
+	 * that multiple processes configure the file at the same time. */
+	if (lvb_ready || !reconf) {
+		rc = -ENODATA;
+		if (lvb_ready) {
+			/* layout_gen must be valid if the layout lock is not
+			 * cancelled and the stripe has already been set */
+			*gen = lli->lli_layout_gen;
+			rc = 0;
+		}
+		GOTO(out, rc);
+	}
+
+	rc = ll_layout_fetch(inode, lock);
+	if (rc < 0)
+		GOTO(out, rc);
+
+	/* for layout lock, lmm is returned in lock's lvb.
+	 * lvb_data is immutable if the lock is held so it's safe to access it
+	 * without res lock. See the description in ldlm_lock_decref_internal()
+	 * for the condition to free lvb_data of layout lock */
+	if (lock->l_lvb_data != NULL) {
+		rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
+				  lock->l_lvb_data, lock->l_lvb_len);
+		if (rc >= 0) {
+			*gen = LL_LAYOUT_GEN_EMPTY;
+			if (md.lsm != NULL)
+				*gen = md.lsm->lsm_layout_gen;
+			rc = 0;
+		} else {
+			CERROR("%s: file "DFID" unpackmd error: %d\n",
+				ll_get_fsname(inode->i_sb, NULL, 0),
+				PFID(&lli->lli_fid), rc);
+		}
+	}
+	if (rc < 0)
+		GOTO(out, rc);
+
+	/* set the layout on the file. This is unlikely to fail as the old
+	 * layout was surely eliminated */
+	memset(&conf, 0, sizeof conf);
+	conf.coc_opc = OBJECT_CONF_SET;
+	conf.coc_inode = inode;
+	conf.coc_lock = lock;
+	conf.u.coc_md = &md;
+	rc = ll_layout_conf(inode, &conf);
+
+	if (md.lsm != NULL)
+		obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
+
+	/* refreshing the layout failed with -EBUSY, need to wait */
+	wait_layout = rc == -EBUSY;
+	EXIT;
+
+out:
+	LDLM_LOCK_PUT(lock);
+	ldlm_lock_decref(lockh, mode);
+
+	/* wait for IO to complete if it's still being used. */
+	if (wait_layout) {
+		CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
+			ll_get_fsname(inode->i_sb, NULL, 0),
+			inode, PFID(&lli->lli_fid));
+
+		memset(&conf, 0, sizeof conf);
+		conf.coc_opc = OBJECT_CONF_WAIT;
+		conf.coc_inode = inode;
+		rc = ll_layout_conf(inode, &conf);
+		if (rc == 0)
+			rc = -EAGAIN;
+
+		CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
+			PFID(&lli->lli_fid), rc);
+	}
+	RETURN(rc);
+}
+
+/**
+ * This function checks whether a LAYOUT lock exists on the client side,
+ * and enqueues one if there is none in the cache.
+ *
+ * This function does not hold the layout lock, so the lock may be revoked
+ * any time after it returns. Any operation that depends on the layout
+ * should be redone in that case.
+ *
+ * This function should be called before lov_io_init() to get an up-to-date
+ * layout version. The caller should save the version number and, after the
+ * IO is finished, call this function again to verify that the layout did
+ * not change during the IO.
+ */
+int ll_layout_refresh(struct inode *inode, __u32 *gen)
+{
+	struct ll_inode_info  *lli = ll_i2info(inode);
+	struct ll_sb_info     *sbi = ll_i2sbi(inode);
+	struct md_op_data     *op_data;
+	struct lookup_intent   it;
+	struct lustre_handle   lockh;
+	ldlm_mode_t	       mode;
+	struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS,
+					   .ei_mode = LCK_CR,
+					   .ei_cb_bl = ll_md_blocking_ast,
+					   .ei_cb_cp = ldlm_completion_ast,
+					   .ei_cbdata = NULL };
+	int rc;
+	ENTRY;
+
+	*gen = lli->lli_layout_gen;
+	if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
+		RETURN(0);
+
+	/* sanity checks */
+	LASSERT(fid_is_sane(ll_inode2fid(inode)));
+	LASSERT(S_ISREG(inode->i_mode));
+
+	/* the layout lock is mostly cached on the local side, so try to
+	 * match it before grabbing the layout lock mutex. */
+	mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
+	if (mode != 0) { /* hit cached lock */
+		rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
+		if (rc == 0)
+			RETURN(0);
+
+		/* better to hold lli_layout_mutex and try again, otherwise
+		 * there would be a starvation problem. */
+	}
+
+	/* take layout lock mutex to enqueue layout lock exclusively. */
+	mutex_lock(&lli->lli_layout_mutex);
+
+again:
+	/* try again. Maybe somebody else has done this. */
+	mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
+	if (mode != 0) { /* hit cached lock */
+		rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
+		if (rc == -EAGAIN)
+			goto again;
+
+		mutex_unlock(&lli->lli_layout_mutex);
+		RETURN(rc);
+	}
+
+	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
+			0, 0, LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data)) {
+		mutex_unlock(&lli->lli_layout_mutex);
+		RETURN(PTR_ERR(op_data));
+	}
+
+	/* have to enqueue one */
+	memset(&it, 0, sizeof(it));
+	it.it_op = IT_LAYOUT;
+	lockh.cookie = 0ULL;
+
+	LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
+			ll_get_fsname(inode->i_sb, NULL, 0), inode,
+			PFID(&lli->lli_fid));
+
+	rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
+			NULL, 0, NULL, 0);
+	if (it.d.lustre.it_data != NULL)
+		ptlrpc_req_finished(it.d.lustre.it_data);
+	it.d.lustre.it_data = NULL;
+
+	ll_finish_md_op_data(op_data);
+
+	md_set_lock_data(sbi->ll_md_exp, &it.d.lustre.it_lock_handle, inode, NULL);
+
+	mode = it.d.lustre.it_lock_mode;
+	it.d.lustre.it_lock_mode = 0;
+	ll_intent_drop_lock(&it);
+
+	if (rc == 0) {
+		/* set lock data in case this is a new lock */
+		ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
+		rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
+		if (rc == -EAGAIN)
+			goto again;
+	}
+	mutex_unlock(&lli->lli_layout_mutex);
+
+	RETURN(rc);
+}
diff --git a/drivers/staging/lustre/lustre/llite/llite_capa.c b/drivers/staging/lustre/lustre/llite/llite_capa.c
new file mode 100644
index 000000000000..b6fd9593325a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/llite_capa.c
@@ -0,0 +1,661 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/llite_capa.c
+ *
+ * Author: Lai Siyao <lsy@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/fs.h>
+#include <linux/version.h>
+#include <asm/uaccess.h>
+#include <linux/file.h>
+#include <linux/kmod.h>
+
+#include <lustre_lite.h>
+#include "llite_internal.h"
+
+/* for obd_capa.c_list, client capa might stay in three places:
+ * 1. ll_capa_list.
+ * 2. ll_idle_capas.
+ * 3. stand alone: just allocated.
+ */
+
+/* capas for oss writeback and those failed to renew */
+static LIST_HEAD(ll_idle_capas);
+static struct ptlrpc_thread ll_capa_thread;
+static struct list_head *ll_capa_list = &capa_list[CAPA_SITE_CLIENT];
+
+/* llite capa renewal timer */
+struct timer_list ll_capa_timer;
+/* for debug: set when a capa is added, cleared by a "no capability" report */
+static atomic_t ll_capa_debug = ATOMIC_INIT(0);
+static unsigned long long ll_capa_renewed;	   /* renewals issued */
+static unsigned long long ll_capa_renewal_noent;   /* ended with -ENOENT */
+static unsigned long long ll_capa_renewal_failed;  /* ended with an error */
+static unsigned long long ll_capa_renewal_retries; /* -EIO, rescheduled */
+
+static inline void update_capa_timer(struct obd_capa *ocapa, cfs_time_t expiry)
+{
+	if (!cfs_time_before(expiry, ll_capa_timer.expires) &&
+	    timer_pending(&ll_capa_timer))
+		return;	/* pending timer does not fire later than @expiry */
+	mod_timer(&ll_capa_timer, expiry);
+	DEBUG_CAPA(D_SEC, &ocapa->c_capa,
+		   "ll_capa_timer update: %lu/%lu by", expiry, jiffies);
+}
+
+static inline cfs_time_t capa_renewal_time(struct obd_capa *ocapa)
+{
+	return cfs_time_sub(ocapa->c_expiry,	/* expiry minus half timeout */
+			    cfs_time_seconds(ocapa->c_capa.lc_timeout) / 2);
+}
+
+static inline int capa_is_to_expire(struct obd_capa *ocapa) /* renewal due? */
+{
+	return cfs_time_beforeq(capa_renewal_time(ocapa), cfs_time_current());
+}
+
+static inline int have_expired_capa(void)
+{
+	struct obd_capa *ocapa = NULL;
+	int expired = 0;
+
+	/* Return 1 if the head of ll_capa_list is due for renewal or, when
+	 * that list is empty, if the head of ll_idle_capas has expired;
+	 * otherwise re-arm the timer for the head that was examined. */
+	spin_lock(&capa_lock);
+	if (!list_empty(ll_capa_list)) {
+		ocapa = list_entry(ll_capa_list->next, struct obd_capa,
+				       c_list);
+		expired = capa_is_to_expire(ocapa);
+		if (!expired)
+			update_capa_timer(ocapa, capa_renewal_time(ocapa));
+	} else if (!list_empty(&ll_idle_capas)) {
+		ocapa = list_entry(ll_idle_capas.next, struct obd_capa,
+				       c_list);
+		expired = capa_is_expired(ocapa);
+		if (!expired)
+			update_capa_timer(ocapa, ocapa->c_expiry);
+	}
+	if (expired)	/* log before unlocking: deleters take capa_lock too */
+		DEBUG_CAPA(D_SEC, &ocapa->c_capa, "expired");
+	spin_unlock(&capa_lock);
+
+	return expired;
+}
+
+static void sort_add_capa(struct obd_capa *ocapa, struct list_head *head)
+{
+	struct list_head *before = NULL;	/* node to insert after */
+	struct obd_capa *cur;
+
+	/* Scan from the tail for the last entry expiring no later than @ocapa.
+	 * TODO: client capa is sorted by expiry, this could be optimized */
+	list_for_each_entry_reverse(cur, head, c_list) {
+		if (!cfs_time_aftereq(ocapa->c_expiry, cur->c_expiry))
+			continue;
+		before = &cur->c_list;
+		break;
+	}
+	LASSERT(&ocapa->c_list != before);
+	list_add(&ocapa->c_list, before ? before : head);
+}
+
+static inline int obd_capa_open_count(struct obd_capa *oc)
+{
+	/* open count kept in the llite info of the inode that @oc refers to */
+	return atomic_read(&ll_i2info(oc->u.cli.inode)->lli_open_count);
+}
+
+static void ll_delete_capa(struct obd_capa *ocapa) /* caller holds capa_lock */
+{
+	struct ll_inode_info *lli = ll_i2info(ocapa->u.cli.inode);
+
+	if (capa_for_mds(&ocapa->c_capa)) {
+		LASSERT(lli->lli_mds_capa == ocapa);
+		lli->lli_mds_capa = NULL;
+	} else if (capa_for_oss(&ocapa->c_capa)) {
+		list_del_init(&ocapa->u.cli.lli_list);
+	}
+
+	DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free client");
+	list_del_init(&ocapa->c_list);
+	capa_count[CAPA_SITE_CLIENT]--;
+	/* release the reference held since the capa was allocated */
+	capa_put(ocapa);
+}
+
+/* Capa thread body: renews client capas that are due and releases expired
+ * idle ones. Client capa is deleted in three places:
+ * 1. capa_thread_main(), main place to delete expired capa.
+ * 2. ll_clear_inode_capas() in ll_clear_inode().
+ * 3. ll_truncate_free_capa(), explicitly, in ll_setattr_ost(). */
+static int capa_thread_main(void *unused)
+{
+	struct obd_capa *ocapa, *tmp, *next;
+	struct inode *inode = NULL;
+	struct l_wait_info lwi = { 0 };
+	int rc;
+	ENTRY;
+
+	thread_set_flags(&ll_capa_thread, SVC_RUNNING);
+	wake_up(&ll_capa_thread.t_ctl_waitq);
+
+	while (1) {
+		l_wait_event(ll_capa_thread.t_ctl_waitq,
+			     !thread_is_running(&ll_capa_thread) ||
+			     have_expired_capa(),
+			     &lwi);
+
+		if (!thread_is_running(&ll_capa_thread))
+			break;
+
+		next = NULL;	/* first capa that is not yet due */
+
+		spin_lock(&capa_lock);
+		list_for_each_entry_safe(ocapa, tmp, ll_capa_list, c_list) {
+			__u64 ibits;
+
+			LASSERT(ocapa->c_capa.lc_opc != CAPA_OPC_OSS_TRUNC);
+
+			if (!capa_is_to_expire(ocapa)) {
+				next = ocapa;
+				break;
+			}
+
+			list_del_init(&ocapa->c_list);
+
+			/* for MDS capability, only renew those which belong to
+			 * dir, or its inode is opened, or client holds LOOKUP
+			 * lock.
+			 */
+			/* ibits may be changed by ll_have_md_lock() so we have
+			 * to set it each time */
+			ibits = MDS_INODELOCK_LOOKUP;
+			if (capa_for_mds(&ocapa->c_capa) &&
+			    !S_ISDIR(ocapa->u.cli.inode->i_mode) &&
+			    obd_capa_open_count(ocapa) == 0 &&
+			    !ll_have_md_lock(ocapa->u.cli.inode,
+					     &ibits, LCK_MINMODE)) {
+				DEBUG_CAPA(D_SEC, &ocapa->c_capa,
+					   "skip renewal for");
+				sort_add_capa(ocapa, &ll_idle_capas);
+				continue;
+			}
+
+			/* for OSS capability, only renew those whose inode is
+			 * opened.
+			 */
+			if (capa_for_oss(&ocapa->c_capa) &&
+			    obd_capa_open_count(ocapa) == 0) {
+				/* oss capa with open count == 0 won't renew,
+				 * move to idle list */
+				sort_add_capa(ocapa, &ll_idle_capas);
+				continue;
+			}
+
+			/* NB iput() is in ll_update_capa() */
+			inode = igrab(ocapa->u.cli.inode);
+			if (inode == NULL) {
+				DEBUG_CAPA(D_ERROR, &ocapa->c_capa,
+					   "igrab failed for");
+				continue;
+			}
+
+			capa_get(ocapa);
+			ll_capa_renewed++;
+			spin_unlock(&capa_lock);
+			rc = md_renew_capa(ll_i2mdexp(inode), ocapa,
+					   ll_update_capa);
+			spin_lock(&capa_lock);
+			if (rc) {
+				DEBUG_CAPA(D_ERROR, &ocapa->c_capa,
+					   "renew failed: %d", rc);
+				ll_capa_renewal_failed++;
+			}
+		}
+
+		if (next)
+			update_capa_timer(next, capa_renewal_time(next));
+
+		list_for_each_entry_safe(ocapa, tmp, &ll_idle_capas,
+					     c_list) {
+			if (!capa_is_expired(ocapa)) {
+				if (!next)
+					update_capa_timer(ocapa,
+							  ocapa->c_expiry);
+				break;
+			}
+
+			if (atomic_read(&ocapa->c_refc) > 1) {
+				DEBUG_CAPA(D_SEC, &ocapa->c_capa,
+					   "expired(c_refc %d), don't release",
+					   atomic_read(&ocapa->c_refc));
+				/* still referenced elsewhere: only take it
+				 * off the idle list, don't renew any more */
+				list_del_init(&ocapa->c_list);
+				continue;
+			}
+
+			/* expired capa is released. */
+			DEBUG_CAPA(D_SEC, &ocapa->c_capa, "release expired");
+			ll_delete_capa(ocapa);
+		}
+
+		spin_unlock(&capa_lock);
+	}
+
+	thread_set_flags(&ll_capa_thread, SVC_STOPPED);
+	wake_up(&ll_capa_thread.t_ctl_waitq);
+	RETURN(0);
+}
+
+void ll_capa_timer_callback(unsigned long unused)
+{
+	wake_up(&ll_capa_thread.t_ctl_waitq);	/* wakes capa_thread_main() */
+}
+
+int ll_capa_thread_start(void)
+{
+	task_t *kt;
+	ENTRY;
+
+	/* the waitqueue has to exist before the thread can wake it */
+	init_waitqueue_head(&ll_capa_thread.t_ctl_waitq);
+	kt = kthread_run(capa_thread_main, NULL, "ll_capa");
+	if (!IS_ERR(kt)) {
+		/* block until capa_thread_main() reports itself running */
+		wait_event(ll_capa_thread.t_ctl_waitq,
+			       thread_is_running(&ll_capa_thread));
+		RETURN(0);
+	}
+
+	CERROR("cannot start expired capa thread: rc %ld\n", PTR_ERR(kt));
+	RETURN(PTR_ERR(kt));
+}
+
+void ll_capa_thread_stop(void)	/* waits until thread_is_stopped() */
+{
+	thread_set_flags(&ll_capa_thread, SVC_STOPPING);
+	wake_up(&ll_capa_thread.t_ctl_waitq);	/* let thread see flag */
+	wait_event(ll_capa_thread.t_ctl_waitq,
+		       thread_is_stopped(&ll_capa_thread));
+}
+
+struct obd_capa *ll_osscapa_get(struct inode *inode, __u64 opc)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct obd_capa *ocapa;
+	int found = 0;
+
+	ENTRY;
+
+	/* nothing to hand out unless LL_SBI_OSS_CAPA is set */
+	if (!(ll_i2sbi(inode)->ll_flags & LL_SBI_OSS_CAPA))
+		RETURN(NULL);
+
+	LASSERT(opc == CAPA_OPC_OSS_WRITE || opc == CAPA_OPC_OSS_RW ||
+		opc == CAPA_OPC_OSS_TRUNC);
+
+	/* Take the first unexpired capa of the inode that supports WRITE,
+	 * READ or @opc itself; each test runs only if @opc has that bit. */
+	spin_lock(&capa_lock);
+	list_for_each_entry(ocapa, &lli->lli_oss_capas, u.cli.lli_list) {
+		if (capa_is_expired(ocapa))
+			continue;
+		if (((opc & CAPA_OPC_OSS_WRITE) &&
+		     capa_opc_supported(&ocapa->c_capa, CAPA_OPC_OSS_WRITE)) ||
+		    ((opc & CAPA_OPC_OSS_READ) &&
+		     capa_opc_supported(&ocapa->c_capa, CAPA_OPC_OSS_READ)) ||
+		    ((opc & CAPA_OPC_OSS_TRUNC) &&
+		     capa_opc_supported(&ocapa->c_capa, opc))) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found) {
+		/* the cursor is not a real capa once the walk has ended */
+		ocapa = NULL;
+
+		/* report a miss once, until ll_capa_debug is set again */
+		if (atomic_read(&ll_capa_debug)) {
+			CERROR("no capability for "DFID" opc "LPX64"\n",
+			       PFID(&lli->lli_fid), opc);
+			atomic_set(&ll_capa_debug, 0);
+		}
+	} else {
+		LASSERT(lu_fid_eq(capa_fid(&ocapa->c_capa),
+				  ll_inode2fid(inode)));
+		LASSERT(ocapa->c_site == CAPA_SITE_CLIENT);
+
+		/* the returned capa carries an extra reference */
+		capa_get(ocapa);
+		DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found client");
+	}
+	spin_unlock(&capa_lock);
+
+	RETURN(ocapa);
+}
+EXPORT_SYMBOL(ll_osscapa_get);
+
+struct obd_capa *ll_mdscapa_get(struct inode *inode)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct obd_capa *ocapa;
+	ENTRY;
+
+	LASSERT(inode != NULL);
+	if (!(ll_i2sbi(inode)->ll_flags & LL_SBI_MDS_CAPA))
+		RETURN(NULL);
+
+	/* the reference is taken under capa_lock; the result may be NULL */
+	spin_lock(&capa_lock);
+	ocapa = capa_get(lli->lli_mds_capa);
+	spin_unlock(&capa_lock);
+	if (ocapa == NULL && atomic_read(&ll_capa_debug)) {
+		CERROR("no mds capability for "DFID"\n", PFID(&lli->lli_fid));
+		atomic_set(&ll_capa_debug, 0);
+	}
+
+	RETURN(ocapa);
+}
+
+static struct obd_capa *do_add_mds_capa(struct inode *inode,
+					struct obd_capa *ocapa)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct obd_capa *cur = lli->lli_mds_capa;
+	struct lustre_capa *capa = &ocapa->c_capa;
+
+	if (cur == NULL) {
+		/* no MDS capa on the inode yet: install @ocapa itself */
+		ocapa->u.cli.inode = inode;
+		lli->lli_mds_capa = ocapa;
+		capa_count[CAPA_SITE_CLIENT]++;
+		DEBUG_CAPA(D_SEC, capa, "add MDS");
+		return ocapa;
+	}
+
+	/* overwrite the installed capa in place and drop the new one */
+	spin_lock(&cur->c_lock);
+	cur->c_capa = *capa;
+	spin_unlock(&cur->c_lock);
+	DEBUG_CAPA(D_SEC, capa, "update MDS");
+
+	capa_put(ocapa);
+	return cur;
+}
+
+static struct obd_capa *do_lookup_oss_capa(struct inode *inode, int opc)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct obd_capa *ocapa;
+
+	/* inside capa_lock; return first capa granting all bits of @opc */
+	list_for_each_entry(ocapa, &lli->lli_oss_capas, u.cli.lli_list) {
+		if ((capa_opc(&ocapa->c_capa) & opc) != opc)
+			continue;
+
+		LASSERT(lu_fid_eq(capa_fid(&ocapa->c_capa),
+				  ll_inode2fid(inode)));
+		LASSERT(ocapa->c_site == CAPA_SITE_CLIENT);
+
+		DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found client");
+		return ocapa;
+	}
+
+	return NULL;
+}
+
+static inline void inode_add_oss_capa(struct inode *inode,
+				      struct obd_capa *ocapa)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct obd_capa *tmp;
+	struct list_head *next = NULL;
+
+	/* lli_oss_capas is kept in descending expiry order so a lookup from
+	 * the head always finds the latest capa first */
+	list_for_each_entry(tmp, &lli->lli_oss_capas, u.cli.lli_list) {
+		if (cfs_time_after(ocapa->c_expiry, tmp->c_expiry)) {
+			next = &tmp->u.cli.lli_list;
+			break;
+		}
+	}
+	LASSERT(&ocapa->u.cli.lli_list != next);
+	list_move_tail(&ocapa->u.cli.lli_list, next ?: &lli->lli_oss_capas);
+}
+
+static struct obd_capa *do_add_oss_capa(struct inode *inode,
+					struct obd_capa *ocapa)
+{
+	struct lustre_capa *capa = &ocapa->c_capa;
+	struct obd_capa *cur;
+
+	LASSERTF(S_ISREG(inode->i_mode),
+		 "inode has oss capa, but not regular file, mode: %d\n",
+		 inode->i_mode);
+
+	/* FIXME: can't replace it so easily with fine-grained opc */
+	cur = do_lookup_oss_capa(inode, capa_opc(capa) & CAPA_OPC_OSS_ONLY);
+	if (cur != NULL) {
+		/* a listed capa covers these opc bits: overwrite it and
+		 * drop the new one */
+		spin_lock(&cur->c_lock);
+		cur->c_capa = *capa;
+		spin_unlock(&cur->c_lock);
+		DEBUG_CAPA(D_SEC, capa, "update OSS");
+		capa_put(ocapa);
+		ocapa = cur;
+	} else {
+		ocapa->u.cli.inode = inode;
+		INIT_LIST_HEAD(&ocapa->u.cli.lli_list);
+		capa_count[CAPA_SITE_CLIENT]++;
+		DEBUG_CAPA(D_SEC, capa, "add OSS");
+	}
+
+	/* (re)position the surviving capa in the inode's sorted list */
+	inode_add_oss_capa(inode, ocapa);
+	return ocapa;
+}
+
+struct obd_capa *ll_add_capa(struct inode *inode, struct obd_capa *ocapa)
+{
+	spin_lock(&capa_lock);
+	if (capa_for_mds(&ocapa->c_capa))
+		ocapa = do_add_mds_capa(inode, ocapa);
+	else
+		ocapa = do_add_oss_capa(inode, ocapa);
+
+	/* truncate capa won't renew, so it is kept off ll_capa_list */
+	if (ocapa->c_capa.lc_opc != CAPA_OPC_OSS_TRUNC) {
+		set_capa_expiry(ocapa);
+		list_del_init(&ocapa->c_list);
+		sort_add_capa(ocapa, ll_capa_list);
+		update_capa_timer(ocapa, capa_renewal_time(ocapa));
+	}
+	spin_unlock(&capa_lock);
+	/* a capa was added: allow one "no capability" report again */
+	atomic_set(&ll_capa_debug, 1);
+	return ocapa;
+}
+
+static inline void delay_capa_renew(struct obd_capa *oc, cfs_time_t delay)
+{
+	/* NB: fake a @delay seconds later expiry to postpone the renewal */
+	oc->c_expiry = cfs_time_add(oc->c_expiry, cfs_time_seconds(delay));
+}
+
+/* Renewal callback: @capa is the renewed capability or an ERR_PTR() error */
+int ll_update_capa(struct obd_capa *ocapa, struct lustre_capa *capa)
+{
+	struct inode *inode;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(ocapa);		/* checked before the first dereference */
+	inode = ocapa->u.cli.inode;
+
+	if (IS_ERR(capa)) {
+		/* set error code */
+		rc = PTR_ERR(capa);
+		spin_lock(&capa_lock);
+		if (rc == -ENOENT) {
+			DEBUG_CAPA(D_SEC, &ocapa->c_capa,
+				   "renewal canceled because object removed");
+			ll_capa_renewal_noent++;
+		} else {
+			ll_capa_renewal_failed++;
+
+			/* failed capa won't be renewed any longer, but if -EIO,
+			 * client might be doing recovery, retry in 2 min. */
+			if (rc == -EIO && !capa_is_expired(ocapa)) {
+				delay_capa_renew(ocapa, 120);
+				DEBUG_CAPA(D_ERROR, &ocapa->c_capa,
+					   "renewal failed: -EIO, retry in 2 mins");
+				ll_capa_renewal_retries++;
+				GOTO(retry, rc);
+			}
+			DEBUG_CAPA(D_ERROR, &ocapa->c_capa,
+				   "renewal failed(rc: %d) for", rc);
+		}
+
+		list_del_init(&ocapa->c_list);
+		sort_add_capa(ocapa, &ll_idle_capas);
+		spin_unlock(&capa_lock);
+
+		capa_put(ocapa);
+		iput(inode);
+		RETURN(rc);
+	}
+
+	spin_lock(&ocapa->c_lock);
+	LASSERT(!memcmp(&ocapa->c_capa, capa,
+			offsetof(struct lustre_capa, lc_opc)));
+	ocapa->c_capa = *capa;
+	set_capa_expiry(ocapa);
+	spin_unlock(&ocapa->c_lock);
+
+	spin_lock(&capa_lock);
+	if (capa_for_oss(capa))
+		inode_add_oss_capa(inode, ocapa);
+	DEBUG_CAPA(D_SEC, capa, "renew");
+	EXIT;
+retry:
+	list_del_init(&ocapa->c_list);
+	sort_add_capa(ocapa, ll_capa_list);
+	update_capa_timer(ocapa, capa_renewal_time(ocapa));
+	spin_unlock(&capa_lock);
+
+	capa_put(ocapa);
+	iput(inode);
+	return rc;
+}
+
+void ll_capa_open(struct inode *inode)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+
+	/* The open count is only tracked for regular files, and only when
+	 * the MDS or OSS capability flag is set in the superblock info. */
+	if (!(ll_i2sbi(inode)->ll_flags & (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA)))
+		return;
+	if (!S_ISREG(inode->i_mode))
+		return;
+
+	atomic_inc(&lli->lli_open_count);
+}
+
+void ll_capa_close(struct inode *inode)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+
+	/* Mirror of ll_capa_open(): same two conditions, then drop the
+	 * open count that it raised. */
+	if (!(ll_i2sbi(inode)->ll_flags & (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA)))
+		return;
+	if (!S_ISREG(inode->i_mode))
+		return;
+
+	atomic_dec(&lli->lli_open_count);
+}
+
+/* drop the caller's ref and delete a capa that is CAPA_OPC_OSS_TRUNC only */
+void ll_truncate_free_capa(struct obd_capa *ocapa)
+{
+	if (!ocapa)
+		return;
+
+	LASSERT(ocapa->c_capa.lc_opc & CAPA_OPC_OSS_TRUNC);
+	DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free truncate");
+
+	/* release lookup ref; NOTE(review): ocapa is still read below */
+	capa_put(ocapa);
+	if (likely(ocapa->c_capa.lc_opc == CAPA_OPC_OSS_TRUNC)) {
+		spin_lock(&capa_lock);
+		ll_delete_capa(ocapa);
+		spin_unlock(&capa_lock);
+	}
+}
+
+void ll_clear_inode_capas(struct inode *inode)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct obd_capa *oc, *next;
+
+	/* delete the MDS capa, if any, then every OSS capa of the inode */
+	spin_lock(&capa_lock);
+	if (lli->lli_mds_capa != NULL)
+		ll_delete_capa(lli->lli_mds_capa);
+
+	list_for_each_entry_safe(oc, next, &lli->lli_oss_capas,
+				     u.cli.lli_list)
+		ll_delete_capa(oc);
+	spin_unlock(&capa_lock);
+}
+
+void ll_print_capa_stat(struct ll_sb_info *sbi) /* log renewal counters */
+{
+	if (sbi->ll_flags & (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA))
+		LCONSOLE_INFO("Fid capabilities renewed: %llu\n"
+			      "Fid capabilities renewal ENOENT: %llu\n"
+			      "Fid capabilities failed to renew: %llu\n"
+			      "Fid capabilities renewal retries: %llu\n",
+			      ll_capa_renewed, ll_capa_renewal_noent,
+			      ll_capa_renewal_failed, ll_capa_renewal_retries);
+}
diff --git a/drivers/staging/lustre/lustre/llite/llite_close.c b/drivers/staging/lustre/lustre/llite/llite_close.c
new file mode 100644
index 000000000000..00b2b38d4c97
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/llite_close.c
@@ -0,0 +1,412 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/llite_close.c
+ *
+ * Lustre Lite routines to issue a secondary close after writeback
+ */
+
+#include <linux/module.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <lustre_lite.h>
+#include "llite_internal.h"
+
+/** Notes an in-flight write: the inode turns SOM-dirty, @page gets queued. */
+void vvp_write_pending(struct ccc_object *club, struct ccc_page *page)
+{
+	struct ll_inode_info *info = ll_i2info(club->cob_inode);
+
+	ENTRY;
+	spin_lock(&info->lli_lock);
+	/* a NULL or already linked @page only marks the inode dirty */
+	if (page != NULL && list_empty(&page->cpg_pending_linkage))
+		list_add(&page->cpg_pending_linkage, &club->cob_pending_list);
+	info->lli_flags |= LLIF_SOM_DIRTY;
+	spin_unlock(&info->lli_lock);
+	EXIT;
+}
+
+/** Unlinks a completed @page and re-checks whether DONE_WRITING can queue. */
+void vvp_write_complete(struct ccc_object *club, struct ccc_page *page)
+{
+	struct ll_inode_info *lli = ll_i2info(club->cob_inode);
+	int unlinked;
+
+	ENTRY;
+	spin_lock(&lli->lli_lock);
+	unlinked = page != NULL && !list_empty(&page->cpg_pending_linkage);
+	if (unlinked)
+		list_del_init(&page->cpg_pending_linkage);
+	spin_unlock(&lli->lli_lock);
+	/* queue only when a pending page was actually removed */
+	if (unlinked)
+		ll_queue_done_writing(club->cob_inode, 0);
+	EXIT;
+}
+
+/** Queues DONE_WRITING if
+ * - done writing is allowed;
+ * - inode has no dirty pages; */
+void ll_queue_done_writing(struct inode *inode, unsigned long flags)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct ccc_object *club = cl2ccc(ll_i2info(inode)->lli_clob);
+	ENTRY;
+
+	spin_lock(&lli->lli_lock);
+	lli->lli_flags |= flags;
+
+	if ((lli->lli_flags & LLIF_DONE_WRITING) &&
+	    list_empty(&club->cob_pending_list)) {
+		struct ll_close_queue *lcq = ll_i2sbi(inode)->ll_lcq;
+
+		if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
+			CWARN("ino %lu/%u(flags %u) som valid it just after "
+			      "recovery\n",
+			      inode->i_ino, inode->i_generation,
+			      lli->lli_flags);
+		/* DONE_WRITING is allowed and inode has no dirty page. */
+		spin_lock(&lcq->lcq_lock);
+
+		LASSERT(list_empty(&lli->lli_close_list));
+		CDEBUG(D_INODE, "adding inode %lu/%u to close list\n",
+		       inode->i_ino, inode->i_generation);
+		list_add_tail(&lli->lli_close_list, &lcq->lcq_head);
+
+		/* Avoid a concurrent insertion into the close thread queue:
+		 * an inode is already in the close thread, open(), write(),
+		 * close() happen, epoch is closed as the inode is marked as
+		 * LLIF_EPOCH_PENDING. When pages are written inode should not
+		 * be inserted into the queue again, clear this flag to avoid
+		 * it. */
+		lli->lli_flags &= ~LLIF_DONE_WRITING;
+
+		wake_up(&lcq->lcq_waitq);
+		spin_unlock(&lcq->lcq_lock);
+	}
+	spin_unlock(&lli->lli_lock);
+	EXIT;
+}
+
+/** Pack SOM attributes info into @op_data for CLOSE, DONE_WRITING rpc. */
+void ll_done_writing_attr(struct inode *inode, struct md_op_data *op_data)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	ENTRY;
+
+	op_data->op_flags |= MF_SOM_CHANGE;
+	/* Check if Size-on-MDS attributes are valid. */
+	if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
+		CERROR("ino %lu/%u(flags %u) som valid it just after "
+		       "recovery\n", inode->i_ino, inode->i_generation,
+		       lli->lli_flags);
+
+	if (!cl_local_size(inode)) {
+		/* Send Size-on-MDS Attributes if valid. */
+		op_data->op_attr.ia_valid |= ATTR_MTIME_SET | ATTR_CTIME_SET |
+				ATTR_ATIME_SET | ATTR_SIZE | ATTR_BLOCKS;
+	}
+	EXIT;
+}
+
+/** Closes ioepoch and packs Size-on-MDS attribute if needed into @op_data. */
+void ll_ioepoch_close(struct inode *inode, struct md_op_data *op_data,
+		      struct obd_client_handle **och, unsigned long flags)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct ccc_object *club = cl2ccc(ll_i2info(inode)->lli_clob);
+	ENTRY;
+
+	spin_lock(&lli->lli_lock);
+	if (!(list_empty(&club->cob_pending_list))) {
+		if (!(lli->lli_flags & LLIF_EPOCH_PENDING)) {
+			LASSERT(*och != NULL);
+			LASSERT(lli->lli_pending_och == NULL);
+			/* Inode is dirty and there is no pending write done
+			 * request yet, DONE_WRITE is to be sent later. */
+			lli->lli_flags |= LLIF_EPOCH_PENDING;
+			lli->lli_pending_och = *och;
+			spin_unlock(&lli->lli_lock);
+
+			inode = igrab(inode);
+			LASSERT(inode);
+			GOTO(out, 0);
+		}
+		if (flags & LLIF_DONE_WRITING) {
+			/* Some pages are still dirty, it is early to send
+			 * DONE_WRITE. Wait until all pages are flushed
+			 * and try DONE_WRITE again later. */
+			LASSERT(!(lli->lli_flags & LLIF_DONE_WRITING));
+			lli->lli_flags |= LLIF_DONE_WRITING;
+			spin_unlock(&lli->lli_lock);
+
+			inode = igrab(inode);
+			LASSERT(inode);
+			GOTO(out, 0);
+		}
+	}
+	CDEBUG(D_INODE, "Epoch "LPU64" closed on "DFID"\n",
+	       ll_i2info(inode)->lli_ioepoch, PFID(&lli->lli_fid));
+	op_data->op_flags |= MF_EPOCH_CLOSE;
+
+	if (flags & LLIF_DONE_WRITING) {
+		LASSERT(lli->lli_flags & LLIF_SOM_DIRTY);
+		LASSERT(!(lli->lli_flags & LLIF_DONE_WRITING));
+		*och = lli->lli_pending_och;
+		lli->lli_pending_och = NULL;
+		lli->lli_flags &= ~LLIF_EPOCH_PENDING;
+	} else {
+		/* Pack Size-on-MDS inode attributes only if they have changed */
+		if (!(lli->lli_flags & LLIF_SOM_DIRTY)) {
+			spin_unlock(&lli->lli_lock);
+			GOTO(out, 0);
+		}
+
+		/* There is a pending DONE_WRITE -- close epoch with no
+		 * attribute change. */
+		if (lli->lli_flags & LLIF_EPOCH_PENDING) {
+			spin_unlock(&lli->lli_lock);
+			GOTO(out, 0);
+		}
+	}
+
+	LASSERT(list_empty(&club->cob_pending_list));
+	lli->lli_flags &= ~LLIF_SOM_DIRTY;
+	spin_unlock(&lli->lli_lock);
+	ll_done_writing_attr(inode, op_data);
+
+	EXIT;
+out:
+	return;
+}
+
+/**
+ * Client updates SOM attributes on MDS (including llog cookies):
+ * obd_getattr with no lock and md_setattr.
+ */
+int ll_som_update(struct inode *inode, struct md_op_data *op_data)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct ptlrpc_request *request = NULL;
+	__u32 old_flags;
+	struct obdo *oa;
+	int rc;
+	ENTRY;
+
+	LASSERT(op_data != NULL);
+	if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
+		CERROR("ino %lu/%u(flags %u) som valid it just after "
+		       "recovery\n", inode->i_ino, inode->i_generation,
+		       lli->lli_flags);
+
+	OBDO_ALLOC(oa);
+	if (!oa) {
+		CERROR("can't allocate memory for Size-on-MDS update.\n");
+		RETURN(-ENOMEM);
+	}
+
+	old_flags = op_data->op_flags;
+	op_data->op_flags = MF_SOM_CHANGE;
+
+	/* If inode is already in another epoch, skip getattr from OSTs. */
+	if (lli->lli_ioepoch == op_data->op_ioepoch) {
+		rc = ll_inode_getattr(inode, oa, op_data->op_ioepoch,
+				      old_flags & MF_GETATTR_LOCK);
+		if (rc) {
+			oa->o_valid = 0;
+			if (rc != -ENOENT)
+				CERROR("inode_getattr failed (%d): unable to "
+				       "send a Size-on-MDS attribute update "
+				       "for inode %lu/%u\n", rc, inode->i_ino,
+				       inode->i_generation);
+		} else {
+			CDEBUG(D_INODE, "Size-on-MDS update on "DFID"\n",
+			       PFID(&lli->lli_fid));
+		}
+		/* Install attributes into op_data. */
+		md_from_obdo(op_data, oa, oa->o_valid);
+	}
+
+	rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data,
+			NULL, 0, NULL, 0, &request, NULL);
+	ptlrpc_req_finished(request);
+
+	OBDO_FREE(oa);
+	RETURN(rc);
+}
+
+/**
+ * Closes the ioepoch and packs all the attributes into @op_data for
+ * DONE_WRITING rpc.
+ */
+static void ll_prepare_done_writing(struct inode *inode,
+				    struct md_op_data *op_data,
+				    struct obd_client_handle **och)
+{
+	ll_ioepoch_close(inode, op_data, och, LLIF_DONE_WRITING);
+	/* If there is no @och, we do not do D_W (DONE_WRITING) yet. */
+	if (*och == NULL)
+		return;
+
+	ll_pack_inode2opdata(inode, op_data, &(*och)->och_fh);
+	ll_prep_md_op_data(op_data, inode, NULL, NULL,	/* result ignored */
+			   0, 0, LUSTRE_OPC_ANY, NULL);
+}
+
+/** Send a DONE_WRITING rpc. */
+static void ll_done_writing(struct inode *inode)
+{
+	struct obd_client_handle *och = NULL;
+	struct md_op_data *op_data;
+	int rc;
+	ENTRY;
+
+	LASSERT(exp_connect_som(ll_i2mdexp(inode)));
+
+	OBD_ALLOC_PTR(op_data);
+	if (op_data == NULL) {
+		CERROR("can't allocate op_data\n");
+		EXIT;
+		return;
+	}
+
+	ll_prepare_done_writing(inode, op_data, &och);
+	/* If there is no @och, we do not do D_W yet. */
+	if (och == NULL)
+		GOTO(out, 0);
+
+	rc = md_done_writing(ll_i2sbi(inode)->ll_md_exp, op_data, NULL);
+	if (rc == -EAGAIN) {
+		/* MDS has instructed us to obtain Size-on-MDS attribute from
+		 * OSTs and send setattr back to MDS. */
+		rc = ll_som_update(inode, op_data);
+	} else if (rc) {
+		CERROR("inode %lu mdc done_writing failed: rc = %d\n",
+		       inode->i_ino, rc);
+	}
+out:
+	ll_finish_md_op_data(op_data);
+	if (och) {
+		md_clear_open_replay_data(ll_i2sbi(inode)->ll_md_exp, och);
+		OBD_FREE_PTR(och);
+	}
+	EXIT;
+}
+
+static struct ll_inode_info *ll_close_next_lli(struct ll_close_queue *lcq)
+{
+	struct ll_inode_info *next;
+
+	/* pop the head; empty queue: -EALREADY if lcq_stop is set, else NULL */
+	spin_lock(&lcq->lcq_lock);
+	if (list_empty(&lcq->lcq_head)) {
+		next = atomic_read(&lcq->lcq_stop) ? ERR_PTR(-EALREADY) : NULL;
+	} else {
+		next = list_entry(lcq->lcq_head.next, struct ll_inode_info,
+				      lli_close_list);
+		list_del_init(&next->lli_close_list);
+	}
+	spin_unlock(&lcq->lcq_lock);
+	return next;
+}
+
+static int ll_close_thread(void *arg)
+{
+	struct ll_close_queue *lcq = arg;
+	ENTRY;
+
+	complete(&lcq->lcq_comp);	/* startup handshake */
+
+	while (1) {
+		struct l_wait_info lwi = { 0 };
+		struct ll_inode_info *lli;
+		struct inode *inode;
+
+		l_wait_event_exclusive(lcq->lcq_waitq,
+				       (lli = ll_close_next_lli(lcq)) != NULL,
+				       &lwi);
+		if (IS_ERR(lli))	/* empty queue and lcq_stop set */
+			break;
+
+		inode = ll_info2i(lli);
+		CDEBUG(D_INFO, "done_writting for inode %lu/%u\n",
+		       inode->i_ino, inode->i_generation);
+		ll_done_writing(inode);
+		iput(inode);
+	}
+
+	CDEBUG(D_INFO, "ll_close exiting\n");
+	complete(&lcq->lcq_comp);	/* exit handshake */
+	RETURN(0);
+}
+
+int ll_close_thread_start(struct ll_close_queue **lcq_ret)
+{
+	struct ll_close_queue *lcq;
+	task_t *kt;
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CLOSE_THREAD))
+		return -EINTR;
+
+	OBD_ALLOC(lcq, sizeof(*lcq));
+	if (!lcq)
+		return -ENOMEM;
+
+	/* lock, list, waitqueue and completion exist before the thread runs */
+	spin_lock_init(&lcq->lcq_lock);
+	INIT_LIST_HEAD(&lcq->lcq_head);
+	init_waitqueue_head(&lcq->lcq_waitq);
+	init_completion(&lcq->lcq_comp);
+
+	kt = kthread_run(ll_close_thread, lcq, "ll_close");
+	if (!IS_ERR(kt)) {
+		wait_for_completion(&lcq->lcq_comp);
+		*lcq_ret = lcq;
+		return 0;
+	}
+	OBD_FREE(lcq, sizeof(*lcq));
+	return PTR_ERR(kt);
+}
+
+void ll_close_thread_shutdown(struct ll_close_queue *lcq)
+{
+	init_completion(&lcq->lcq_comp); /* re-arm for the exit handshake */
+	atomic_inc(&lcq->lcq_stop);
+	wake_up(&lcq->lcq_waitq);
+	wait_for_completion(&lcq->lcq_comp);	/* thread left its loop */
+	OBD_FREE(lcq, sizeof(*lcq));
+}
diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h
new file mode 100644
index 000000000000..177b4dbd1c67
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/llite_internal.h
@@ -0,0 +1,1578 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef LLITE_INTERNAL_H
+#define LLITE_INTERNAL_H
+#include <lustre_debug.h>
+#include <lustre_ver.h>
+#include <lustre_disk.h>  /* for s2sbi */
+#include <lustre_eacl.h>
+
+/* for struct cl_lock_descr and struct cl_io */
+#include <cl_object.h>
+#include <lclient.h>
+#include <lustre_mdc.h>
+#include <linux/lustre_intent.h>
+
+#ifndef FMODE_EXEC
+#define FMODE_EXEC 0
+#endif
+
+#ifndef VM_FAULT_RETRY
+#define VM_FAULT_RETRY 0
+#endif
+
+/* Kernel 3.1 removed LOOKUP_CONTINUE; LOOKUP_PARENT is equivalent to it.
+ * See kernel commit 49084c3bb2055c401f3493c13edae14d49128ca0. */
+#ifndef LOOKUP_CONTINUE
+#define LOOKUP_CONTINUE LOOKUP_PARENT
+#endif
+
+/** Only used on client-side for indicating the tail of dir hash/offset. */
+#define LL_DIR_END_OFF	  0x7fffffffffffffffULL
+#define LL_DIR_END_OFF_32BIT    0x7fffffffUL
+
+#define LL_IT2STR(it) ((it) ? ldlm_it2str((it)->it_op) : "0")
+#define LUSTRE_FPRIVATE(file) ((file)->private_data)
+
+struct ll_dentry_data {
+	int				lld_cwd_count;
+	int				lld_mnt_count;
+	struct obd_client_handle	lld_cwd_och;
+	struct obd_client_handle	lld_mnt_och;
+	struct lookup_intent		*lld_it;
+	unsigned int			lld_sa_generation;
+	unsigned int			lld_invalid:1;
+	struct rcu_head			lld_rcu_head;
+};
+
+#define ll_d2d(de) ((struct ll_dentry_data*)((de)->d_fsdata))
+
+extern struct file_operations ll_pgcache_seq_fops;
+
+#define LLI_INODE_MAGIC		 0x111d0de5
+#define LLI_INODE_DEAD		  0xdeadd00d
+
+/* remote client permission cache */
+#define REMOTE_PERM_HASHSIZE 16
+
+struct ll_getname_data {
+	char	    *lgd_name;      /* points to a buffer with NAME_MAX+1 size */
+	struct lu_fid    lgd_fid;       /* target fid we are looking for */
+	int	      lgd_found;     /* inode matched? */
+};
+
+/* llite setxid/access permission for user on remote client */
+struct ll_remote_perm {
+	struct hlist_node	lrp_list;
+	uid_t		   lrp_uid;
+	gid_t		   lrp_gid;
+	uid_t		   lrp_fsuid;
+	gid_t		   lrp_fsgid;
+	int		     lrp_access_perm; /* MAY_READ/WRITE/EXEC, this
+						    is access permission with
+						    lrp_fsuid/lrp_fsgid. */
+};
+
+enum lli_flags {
+	/* MDS has an authority for the Size-on-MDS attributes. */
+	LLIF_MDS_SIZE_LOCK      = (1 << 0),
+	/* Epoch close is postponed. */
+	LLIF_EPOCH_PENDING      = (1 << 1),
+	/* DONE WRITING is allowed. */
+	LLIF_DONE_WRITING       = (1 << 2),
+	/* Size-on-MDS attributes are changed. An attribute update needs to
+	 * be sent to MDS. */
+	LLIF_SOM_DIRTY	  = (1 << 3),
+	/* File is contended. */
+	LLIF_CONTENDED	  = (1 << 4),
+	/* Truncate uses server lock for this file. */
+	LLIF_SRVLOCK	    = (1 << 5),
+	/* File data is modified. */
+	LLIF_DATA_MODIFIED      = (1 << 6),
+};
+
+struct ll_inode_info {
+	__u32				lli_inode_magic;
+	__u32				lli_flags;
+	__u64				lli_ioepoch;
+
+	spinlock_t			lli_lock;
+	struct posix_acl		*lli_posix_acl;
+
+	struct hlist_head		*lli_remote_perms;
+	struct mutex				lli_rmtperm_mutex;
+
+	/* identifying fields for both metadata and data stacks. */
+	struct lu_fid		   lli_fid;
+	/* Parent fid for accessing default stripe data on parent directory
+	 * for allocating OST objects after a mknod() and later open-by-FID. */
+	struct lu_fid		   lli_pfid;
+
+	struct list_head		      lli_close_list;
+	struct list_head		      lli_oss_capas;
+	/* open count currently used by capability only, indicate whether
+	 * capability needs renewal */
+	atomic_t		    lli_open_count;
+	struct obd_capa		*lli_mds_capa;
+	cfs_time_t		      lli_rmtperm_time;
+
+	/* handle is to be sent to MDS later on done_writing and setattr.
+	 * Open handle data are needed for the recovery to reconstruct
+	 * the inode state on the MDS. XXX: recovery is not ready yet. */
+	struct obd_client_handle       *lli_pending_och;
+
+	/* We need all three because every inode may be opened in different
+	 * modes */
+	struct obd_client_handle       *lli_mds_read_och;
+	struct obd_client_handle       *lli_mds_write_och;
+	struct obd_client_handle       *lli_mds_exec_och;
+	__u64			   lli_open_fd_read_count;
+	__u64			   lli_open_fd_write_count;
+	__u64			   lli_open_fd_exec_count;
+	/* Protects access to och pointers and their usage counters */
+	struct mutex			lli_och_mutex;
+
+	struct inode			lli_vfs_inode;
+
+	/* the most recent timestamps obtained from mds */
+	struct ost_lvb			lli_lvb;
+	spinlock_t			lli_agl_lock;
+
+	/* Try to make the d::member and f::member are aligned. Before using
+	 * these members, make clear whether it is directory or not. */
+	union {
+		/* for directory */
+		struct {
+			/* serialize normal readdir and statahead-readdir. */
+			struct mutex			d_readdir_mutex;
+
+			/* metadata statahead */
+			/* since parent-child threads can share the same @file
+			 * struct, "opendir_key" is the token when dir close for
+			 * case of parent exit before child -- it is me should
+			 * cleanup the dir readahead. */
+			void			   *d_opendir_key;
+			struct ll_statahead_info       *d_sai;
+			struct posix_acl	       *d_def_acl;
+			/* protect statahead stuff. */
+			spinlock_t			d_sa_lock;
+			/* "opendir_pid" is the token when lookup/revalid
+			 * -- I am the owner of dir statahead. */
+			pid_t			   d_opendir_pid;
+		} d;
+
+#define lli_readdir_mutex       u.d.d_readdir_mutex
+#define lli_opendir_key	 u.d.d_opendir_key
+#define lli_sai		 u.d.d_sai
+#define lli_def_acl	     u.d.d_def_acl
+#define lli_sa_lock	     u.d.d_sa_lock
+#define lli_opendir_pid	 u.d.d_opendir_pid
+
+		/* for non-directory */
+		struct {
+			struct semaphore		f_size_sem;
+			void				*f_size_sem_owner;
+			char				*f_symlink_name;
+			__u64				f_maxbytes;
+			/*
+			 * struct rw_semaphore {
+			 *    signed long	count;     // align d.d_def_acl
+			 *    spinlock_t	wait_lock; // align d.d_sa_lock
+			 *    struct list_head wait_list;
+			 * }
+			 */
+			struct rw_semaphore		f_trunc_sem;
+			struct mutex			f_write_mutex;
+
+			struct rw_semaphore		f_glimpse_sem;
+			cfs_time_t			f_glimpse_time;
+			struct list_head			f_agl_list;
+			__u64				f_agl_index;
+
+			/* for writepage() only to communicate to fsync */
+			int				f_async_rc;
+
+			/* volatile file criteria is based on file name, this
+			 * flag is used to keep the test result, so the strcmp
+			 * is done only once
+			 */
+			bool				f_volatile;
+			/*
+			 * Whenever a process tries to read/write the file, the
+			 * jobid of the process is saved here, and it is packed
+			 * into the write RPC when the data is flushed later.
+			 *
+			 * So the read/write statistics for a jobid will not be
+			 * accurate if the file is shared by different jobs.
+			 */
+			char		     f_jobid[JOBSTATS_JOBID_SIZE];
+		} f;
+
+#define lli_size_sem	    u.f.f_size_sem
+#define lli_size_sem_owner      u.f.f_size_sem_owner
+#define lli_symlink_name	u.f.f_symlink_name
+#define lli_maxbytes	    u.f.f_maxbytes
+#define lli_trunc_sem	   u.f.f_trunc_sem
+#define lli_write_mutex	 u.f.f_write_mutex
+#define lli_glimpse_sem		u.f.f_glimpse_sem
+#define lli_glimpse_time	u.f.f_glimpse_time
+#define lli_agl_list		u.f.f_agl_list
+#define lli_agl_index		u.f.f_agl_index
+#define lli_async_rc		u.f.f_async_rc
+#define lli_jobid		u.f.f_jobid
+#define lli_volatile		u.f.f_volatile
+
+	} u;
+
+	/* XXX: For following frequent used members, although they maybe special
+	 *      used for non-directory object, it is some time-wasting to check
+	 *      whether the object is directory or not before using them. On the
+	 *      other hand, currently, sizeof(f) > sizeof(d), it cannot reduce
+	 *      the "ll_inode_info" size even if moving those members into u.f.
+	 *      So keep them out side.
+	 *
+	 *      In the future, if more members are added only for directory,
+	 *      some of the following members can be moved into u.f.
+	 */
+	bool			    lli_has_smd;
+	struct cl_object	       *lli_clob;
+
+	/* mutex to request for layout lock exclusively. */
+	struct mutex			lli_layout_mutex;
+	/* valid only inside LAYOUT ibits lock, protected by lli_layout_mutex */
+	__u32				lli_layout_gen;
+};
+
+/*
+ * Locking to guarantee consistency of non-atomic updates to long long i_size,
+ * consistency between file size and KMS.
+ *
+ * Implemented by ->lli_size_sem and ->lsm_lock, nested in that order.
+ */
+
+void ll_inode_size_lock(struct inode *inode);
+void ll_inode_size_unlock(struct inode *inode);
+
+/* Return the ll_inode_info that embeds @inode as its lli_vfs_inode member.
+ * FIXME: rename this to LL_I() to conform to kernel naming conventions. */
+static inline struct ll_inode_info *ll_i2info(struct inode *inode)
+{
+	return container_of(inode, struct ll_inode_info, lli_vfs_inode);
+}
+
+/* default to about 40meg of readahead on a given system.  That much tied
+ * up in 512k readahead requests serviced at 40ms each is about 1GB/s. */
+#define SBI_DEFAULT_READAHEAD_MAX (40UL << (20 - PAGE_CACHE_SHIFT))
+
+/* default to read-ahead full files smaller than 2MB on the second read */
+#define SBI_DEFAULT_READAHEAD_WHOLE_MAX (2UL << (20 - PAGE_CACHE_SHIFT))
+
+enum ra_stat {
+	RA_STAT_HIT = 0,
+	RA_STAT_MISS,
+	RA_STAT_DISTANT_READPAGE,
+	RA_STAT_MISS_IN_WINDOW,
+	RA_STAT_FAILED_GRAB_PAGE,
+	RA_STAT_FAILED_MATCH,
+	RA_STAT_DISCARDED,
+	RA_STAT_ZERO_LEN,
+	RA_STAT_ZERO_WINDOW,
+	RA_STAT_EOF,
+	RA_STAT_MAX_IN_FLIGHT,
+	RA_STAT_WRONG_GRAB_PAGE,
+	_NR_RA_STAT,
+};
+
+struct ll_ra_info {
+	atomic_t	      ra_cur_pages;
+	unsigned long	     ra_max_pages;
+	unsigned long	     ra_max_pages_per_file;
+	unsigned long	     ra_max_read_ahead_whole_pages;
+};
+
+/* ra_io_arg will be filled in the beginning of ll_readahead with
+ * ras_lock, then the following ll_read_ahead_pages will read RA
+ * pages according to this arg, all the items in this structure are
+ * counted by page index.
+ */
+struct ra_io_arg {
+	unsigned long ria_start;  /* start offset of read-ahead*/
+	unsigned long ria_end;    /* end offset of read-ahead*/
+	/* If stride read pattern is detected, ria_stoff means where
+	 * stride read is started. Note: for normal read-ahead, the
+	 * value here is meaningless, and also it will not be accessed*/
+	pgoff_t ria_stoff;
+	/* ria_length and ria_pages are the length and pages length in the
+	 * stride I/O mode. And they will also be used to check whether
+	 * it is stride I/O read-ahead in the read-ahead pages*/
+	unsigned long ria_length;
+	unsigned long ria_pages;
+};
+
+/* LL_HIST_MAX=32 causes an overflow */
+#define LL_HIST_MAX 28
+#define LL_HIST_START 12 /* buckets start at 2^12 = 4k */
+#define LL_PROCESS_HIST_MAX 10
+struct per_process_info {
+	pid_t pid;
+	struct obd_histogram pp_r_hist;
+	struct obd_histogram pp_w_hist;
+};
+
+/* pp_extents[LL_PROCESS_HIST_MAX] will hold the combined process info */
+struct ll_rw_extents_info {
+	struct per_process_info pp_extents[LL_PROCESS_HIST_MAX + 1];
+};
+
+#define LL_OFFSET_HIST_MAX 100
+struct ll_rw_process_info {
+	pid_t		     rw_pid;
+	int		       rw_op;
+	loff_t		    rw_range_start;
+	loff_t		    rw_range_end;
+	loff_t		    rw_last_file_pos;
+	loff_t		    rw_offset;
+	size_t		    rw_smallest_extent;
+	size_t		    rw_largest_extent;
+	struct ll_file_data      *rw_last_file;
+};
+
+enum stats_track_type {
+	STATS_TRACK_ALL = 0,  /* track all processes */
+	STATS_TRACK_PID,      /* track process with this pid */
+	STATS_TRACK_PPID,     /* track processes with this ppid */
+	STATS_TRACK_GID,      /* track processes with this gid */
+	STATS_TRACK_LAST,
+};
+
+/* flags for sbi->ll_flags */
+#define LL_SBI_NOLCK	     0x01 /* DLM locking disabled (directio-only) */
+#define LL_SBI_CHECKSUM	  0x02 /* checksum each page as it's written */
+#define LL_SBI_FLOCK	     0x04
+#define LL_SBI_USER_XATTR	0x08 /* support user xattr */
+#define LL_SBI_ACL	       0x10 /* support ACL */
+#define LL_SBI_RMT_CLIENT	0x40 /* remote client */
+#define LL_SBI_MDS_CAPA	  0x80 /* support mds capa */
+#define LL_SBI_OSS_CAPA	 0x100 /* support oss capa */
+#define LL_SBI_LOCALFLOCK       0x200 /* Local flocks support by kernel */
+#define LL_SBI_LRU_RESIZE       0x400 /* lru resize support */
+#define LL_SBI_LAZYSTATFS       0x800 /* lazystatfs mount option */
+#define LL_SBI_SOM_PREVIEW     0x1000 /* SOM preview mount option */
+#define LL_SBI_32BIT_API       0x2000 /* generate 32 bit inodes. */
+#define LL_SBI_64BIT_HASH      0x4000 /* support 64-bits dir hash/offset */
+#define LL_SBI_AGL_ENABLED     0x8000 /* enable agl */
+#define LL_SBI_VERBOSE	0x10000 /* verbose mount/umount */
+#define LL_SBI_LAYOUT_LOCK    0x20000 /* layout lock support */
+#define LL_SBI_USER_FID2PATH  0x40000 /* allow fid2path by unprivileged users */
+
+#define LL_SBI_FLAGS {	/* names for LL_SBI_*, one per bit from bit 0 */ \
+	"nolck",	\
+	"checksum",	\
+	"flock",	\
+	"xattr",	\
+	"acl", "???",	/* "???" fills bit 0x20, which no LL_SBI_* uses */ \
+	"rmt_client",	\
+	"mds_capa",	\
+	"oss_capa",	\
+	"flock",	\
+	"lru_resize",	\
+	"lazy_statfs",	\
+	"som",		\
+	"32bit_api",	\
+	"64bit_hash",	\
+	"agl",		\
+	"verbose",	\
+	"layout",	\
+	"user_fid2path" }
+
+/* default value for ll_sb_info->contention_time */
+#define SBI_DEFAULT_CONTENTION_SECONDS     60
+/* default value for lockless_truncate_enable */
+#define SBI_DEFAULT_LOCKLESS_TRUNCATE_ENABLE 1
+#define RCE_HASHES      32
+
+struct rmtacl_ctl_entry {
+	struct list_head       rce_list;
+	pid_t	    rce_key; /* hash key */
+	int	      rce_ops; /* acl operation type */
+};
+
+struct rmtacl_ctl_table {
+	spinlock_t	rct_lock;
+	struct list_head	rct_entries[RCE_HASHES];
+};
+
+#define EE_HASHES       32
+
+struct eacl_entry {
+	struct list_head	    ee_list;
+	pid_t		 ee_key; /* hash key */
+	struct lu_fid	 ee_fid;
+	int		   ee_type; /* ACL type for ACCESS or DEFAULT */
+	ext_acl_xattr_header *ee_acl;
+};
+
+struct eacl_table {
+	spinlock_t	et_lock;
+	struct list_head	et_entries[EE_HASHES];
+};
+
+struct ll_sb_info {
+	struct list_head		  ll_list;
+	/* this protects pglist and ra_info.  It isn't safe to
+	 * grab from interrupt contexts */
+	spinlock_t		  ll_lock;
+	spinlock_t		  ll_pp_extent_lock; /* pp_extent entry*/
+	spinlock_t		  ll_process_lock; /* ll_rw_process_info */
+	struct obd_uuid	   ll_sb_uuid;
+	struct obd_export	*ll_md_exp;
+	struct obd_export	*ll_dt_exp;
+	struct proc_dir_entry*    ll_proc_root;
+	struct lu_fid	     ll_root_fid; /* root object fid */
+
+	int		       ll_flags;
+	struct list_head		ll_conn_chain; /* per-conn chain of SBs */
+	struct lustre_client_ocd  ll_lco;
+
+	struct list_head		ll_orphan_dentry_list; /*please don't ask -p*/
+	struct ll_close_queue    *ll_lcq;
+
+	struct lprocfs_stats     *ll_stats; /* lprocfs stats counter */
+
+	/* Used to track "unstable" pages on a client, and maintain a
+	 * LRU list of clean pages. An "unstable" page is defined as
+	 * any page which is sent to a server as part of a bulk request,
+	 * but is uncommitted to stable storage. */
+	struct cl_client_cache    ll_cache;
+
+	struct lprocfs_stats     *ll_ra_stats;
+
+	struct ll_ra_info	 ll_ra_info;
+	unsigned int	      ll_namelen;
+	struct file_operations   *ll_fop;
+
+	/* =0 - hold lock over whole read/write
+	 * >0 - max. chunk to be read/written w/o lock re-acquiring */
+	unsigned long	     ll_max_rw_chunk;
+	unsigned int	      ll_md_brw_size; /* used by readdir */
+
+	struct lu_site	   *ll_site;
+	struct cl_device	 *ll_cl;
+	/* Statistics */
+	struct ll_rw_extents_info ll_rw_extents_info;
+	int		       ll_extent_process_count;
+	struct ll_rw_process_info ll_rw_process_info[LL_PROCESS_HIST_MAX];
+	unsigned int	      ll_offset_process_count;
+	struct ll_rw_process_info ll_rw_offset_info[LL_OFFSET_HIST_MAX];
+	unsigned int	      ll_rw_offset_entry_count;
+	int		       ll_stats_track_id;
+	enum stats_track_type     ll_stats_track_type;
+	int		       ll_rw_stats_on;
+
+	/* metadata stat-ahead */
+	unsigned int	      ll_sa_max;     /* max statahead RPCs */
+	atomic_t		  ll_sa_total;   /* statahead thread started
+						  * count */
+	atomic_t		  ll_sa_wrong;   /* statahead thread stopped for
+						  * low hit ratio */
+	atomic_t		  ll_agl_total;  /* AGL thread started count */
+
+	dev_t		     ll_sdev_orig; /* save s_dev before assign for
+						 * clustered nfs */
+	struct rmtacl_ctl_table   ll_rct;
+	struct eacl_table	 ll_et;
+};
+
+#define LL_DEFAULT_MAX_RW_CHUNK      (32 * 1024 * 1024)
+
+struct ll_ra_read {
+	pgoff_t	     lrr_start;
+	pgoff_t	     lrr_count;
+	struct task_struct *lrr_reader;
+	struct list_head	  lrr_linkage;
+};
+
+/*
+ * per file-descriptor read-ahead data.
+ */
+struct ll_readahead_state {
+	spinlock_t  ras_lock;
+	/*
+	 * index of the last page that read(2) needed and that wasn't in the
+	 * cache. Used by ras_update() to detect seeks.
+	 *
+	 * XXX nikita: if access seeks into cached region, Lustre doesn't see
+	 * this.
+	 */
+	unsigned long   ras_last_readpage;
+	/*
+	 * number of pages read after last read-ahead window reset. As window
+	 * is reset on each seek, this is effectively a number of consecutive
+	 * accesses. Maybe ->ras_accessed_in_window is better name.
+	 *
+	 * XXX nikita: window is also reset (by ras_update()) when Lustre
+	 * believes that memory pressure evicts read-ahead pages. In that
+	 * case, it probably doesn't make sense to expand window to
+	 * PTLRPC_MAX_BRW_PAGES on the third access.
+	 */
+	unsigned long   ras_consecutive_pages;
+	/*
+	 * number of read requests after the last read-ahead window reset.
+	 * As window is reset on each seek, this is effectively the number
+	 * of consecutive read requests and is used to trigger read-ahead.
+	 */
+	unsigned long   ras_consecutive_requests;
+	/*
+	 * Parameters of current read-ahead window. Handled by
+	 * ras_update(). On the initial access to the file or after a seek,
+	 * window is reset to 0. After 3 consecutive accesses, window is
+	 * expanded to PTLRPC_MAX_BRW_PAGES. Afterwards, window is enlarged by
+	 * PTLRPC_MAX_BRW_PAGES chunks up to ->ra_max_pages.
+	 */
+	unsigned long   ras_window_start, ras_window_len;
+	/*
+	 * Where next read-ahead should start at. This lies within read-ahead
+	 * window. Read-ahead window is read in pieces rather than at once
+	 * because: 1. lustre limits total number of pages under read-ahead by
+	 * ->ra_max_pages (see ll_ra_count_get()), 2. client cannot read pages
+	 * not covered by DLM lock.
+	 */
+	unsigned long   ras_next_readahead;
+	/*
+	 * Total number of ll_file_read requests issued, reads originating
+	 * due to mmap are not counted in this total.  This value is used to
+	 * trigger full file read-ahead after multiple reads to a small file.
+	 */
+	unsigned long   ras_requests;
+	/*
+	 * Page index with respect to the current request, these value
+	 * will not be accurate when dealing with reads issued via mmap.
+	 */
+	unsigned long   ras_request_index;
+	/*
+	 * list of struct ll_ra_read's, one per read(2) call currently in
+	 * progress against this file descriptor. Used by read-ahead code,
+	 * protected by ->ras_lock.
+	 */
+	struct list_head      ras_read_beads;
+	/*
+	 * The following 3 items are used for detecting the stride I/O
+	 * mode.
+	 * In stride I/O mode,
+	 * ...............|-----data-----|****gap*****|--------|******|....
+	 *    offset      |-stride_pages-|-stride_gap-|
+	 * ras_stride_offset = offset;
+	 * ras_stride_length = stride_pages + stride_gap;
+	 * ras_stride_pages = stride_pages;
+	 * Note: all these three items are counted by pages.
+	 */
+	unsigned long   ras_stride_length;
+	unsigned long   ras_stride_pages;
+	pgoff_t	 ras_stride_offset;
+	/*
+	 * number of consecutive stride requests; it is similar to
+	 * ras_consecutive_requests, but used for stride I/O mode.
+	 * Note: stride read-ahead is enabled only after more than 2
+	 * consecutive stride requests have been detected.
+	 */
+	unsigned long   ras_consecutive_stride_requests;
+};
+
+extern struct kmem_cache *ll_file_data_slab;
+struct lustre_handle;
+struct ll_file_data {
+	struct ll_readahead_state fd_ras;
+	int fd_omode;
+	struct ccc_grouplock fd_grouplock;
+	__u64 lfd_pos;
+	__u32 fd_flags;
+	struct file *fd_file;
+	/* Indicate whether need to report failure when close.
+	 * true: failure is known, not report again.
+	 * false: unknown failure, should report. */
+	bool fd_write_failed;
+};
+
+struct lov_stripe_md;
+
+extern spinlock_t inode_lock;
+
+extern struct proc_dir_entry *proc_lustre_fs_root;
+
+static inline struct inode *ll_info2i(struct ll_inode_info *lli)
+{
+	return &lli->lli_vfs_inode;	/* the VFS inode embedded in @lli */
+}
+
+struct it_cb_data {
+	struct inode  *icbd_parent;
+	struct dentry **icbd_childp;
+	obd_id	hash;
+};
+
+__u32 ll_i2suppgid(struct inode *i);
+void ll_i2gids(__u32 *suppgids, struct inode *i1,struct inode *i2);
+
+static inline int ll_need_32bit_api(struct ll_sb_info *sbi)
+{
+#if BITS_PER_LONG != 32	/* decide from the current task and the sb flags */
+	return unlikely(current_is_32bit() || (sbi->ll_flags & LL_SBI_32BIT_API));
+#else			/* BITS_PER_LONG == 32: unconditionally needed */
+	return 1;
+#endif
+}
+
+#define LLAP_MAGIC 98764321
+
+extern struct kmem_cache *ll_async_page_slab;
+extern size_t ll_async_page_slab_size;
+
+void ll_ra_read_in(struct file *f, struct ll_ra_read *rar);
+void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar);
+struct ll_ra_read *ll_ra_read_get(struct file *f);
+
+/* llite/lproc_llite.c */
+#ifdef LPROCFS
+int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
+				struct super_block *sb, char *osc, char *mdc);
+void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi);
+void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count);
+void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars);
+#else
+static inline int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
+			struct super_block *sb, char *osc, char *mdc){return 0;}
+static inline void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi) {}
+static inline void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count) { /* !LPROCFS: no-op */ }
+static inline void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars)
+{
+	memset(lvars, 0, sizeof(*lvars));	/* !LPROCFS: just zero *lvars */
+}
+#endif
+
+
+/* llite/dir.c */
+void ll_release_page(struct page *page, int remove);
+extern struct file_operations ll_dir_operations;
+extern struct inode_operations ll_dir_inode_operations;
+struct page *ll_get_dir_page(struct inode *dir, __u64 hash,
+			     struct ll_dir_chain *chain);
+int ll_dir_read(struct inode *inode, __u64 *_pos, void *cookie,
+		filldir_t filldir);
+
+int ll_get_mdt_idx(struct inode *inode);
+/* llite/namei.c */
+int ll_objects_destroy(struct ptlrpc_request *request,
+		       struct inode *dir);
+struct inode *ll_iget(struct super_block *sb, ino_t hash,
+		      struct lustre_md *lic);
+int ll_md_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *,
+		       void *data, int flag);
+struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de);
+int ll_rmdir_entry(struct inode *dir, char *name, int namelen);
+
+/* llite/rw.c */
+int ll_prepare_write(struct file *, struct page *, unsigned from, unsigned to);
+int ll_commit_write(struct file *, struct page *, unsigned from, unsigned to);
+int ll_writepage(struct page *page, struct writeback_control *wbc);
+int ll_writepages(struct address_space *, struct writeback_control *wbc);
+void ll_removepage(struct page *page);
+int ll_readpage(struct file *file, struct page *page);
+void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras);
+int ll_file_punch(struct inode *, loff_t, int);
+ssize_t ll_file_lockless_io(struct file *, char *, size_t, loff_t *, int);
+void ll_clear_file_contended(struct inode*);
+int ll_sync_page_range(struct inode *, struct address_space *, loff_t, size_t);
+int ll_readahead(const struct lu_env *env, struct cl_io *io,
+		 struct ll_readahead_state *ras, struct address_space *mapping,
+		 struct cl_page_list *queue, int flags);
+
+/* llite/file.c */
+extern struct file_operations ll_file_operations;
+extern struct file_operations ll_file_operations_flock;
+extern struct file_operations ll_file_operations_noflock;
+extern struct inode_operations ll_file_inode_operations;
+extern int ll_inode_revalidate_it(struct dentry *, struct lookup_intent *,
+				  __u64);
+extern int ll_have_md_lock(struct inode *inode, __u64 *bits,
+			   ldlm_mode_t l_req_mode);
+extern ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
+				   struct lustre_handle *lockh, __u64 flags);
+int __ll_inode_revalidate_it(struct dentry *, struct lookup_intent *,
+			     __u64 bits);
+int ll_revalidate_nd(struct dentry *dentry, unsigned int flags);
+int ll_file_open(struct inode *inode, struct file *file);
+int ll_file_release(struct inode *inode, struct file *file);
+int ll_glimpse_ioctl(struct ll_sb_info *sbi,
+		     struct lov_stripe_md *lsm, lstat_t *st);
+void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch);
+int ll_local_open(struct file *file,
+		  struct lookup_intent *it, struct ll_file_data *fd,
+		  struct obd_client_handle *och);
+int ll_release_openhandle(struct dentry *, struct lookup_intent *);
+int ll_md_close(struct obd_export *md_exp, struct inode *inode,
+		struct file *file);
+int ll_md_real_close(struct inode *inode, int flags);
+void ll_ioepoch_close(struct inode *inode, struct md_op_data *op_data,
+		      struct obd_client_handle **och, unsigned long flags);
+void ll_done_writing_attr(struct inode *inode, struct md_op_data *op_data);
+int ll_som_update(struct inode *inode, struct md_op_data *op_data);
+int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
+		     __u64 ioepoch, int sync);
+int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data,
+		  struct md_open_data **mod);
+void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
+			  struct lustre_handle *fh);
+extern void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid,
+			      struct ll_file_data *file, loff_t pos,
+			      size_t count, int rw);
+int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
+	       struct lookup_intent *it, struct kstat *stat);
+int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat);
+struct ll_file_data *ll_file_data_get(void);
+struct posix_acl * ll_get_acl(struct inode *inode, int type);
+
+int ll_inode_permission(struct inode *inode, int mask);
+
+int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
+			     int flags, struct lov_user_md *lum,
+			     int lum_size);
+int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
+			     struct lov_mds_md **lmm, int *lmm_size,
+			     struct ptlrpc_request **request);
+int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
+		     int set_default);
+int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp,
+		     int *lmm_size, struct ptlrpc_request **request);
+int ll_fsync(struct file *file, loff_t start, loff_t end, int data);
+int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
+	      int num_bytes);
+int ll_merge_lvb(const struct lu_env *env, struct inode *inode);
+int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg);
+int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
+int ll_fid2path(struct inode *inode, void *arg);
+int ll_data_version(struct inode *inode, __u64 *data_version, int extent_lock);
+
+/* llite/dcache.c */
+
+int ll_dops_init(struct dentry *de, int block, int init_sa);
+extern struct dentry_operations ll_d_ops;
+void ll_intent_drop_lock(struct lookup_intent *);
+void ll_intent_release(struct lookup_intent *);
+void ll_invalidate_aliases(struct inode *);
+void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft);
+void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry);
+int ll_dcompare(const struct dentry *parent, const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *d_name);
+int ll_revalidate_it_finish(struct ptlrpc_request *request,
+			    struct lookup_intent *it, struct dentry *de);
+
+/* llite/llite_lib.c */
+extern struct super_operations lustre_super_operations;
+
+char *ll_read_opt(const char *opt, char *data);
+void ll_lli_init(struct ll_inode_info *lli);
+int ll_fill_super(struct super_block *sb, struct vfsmount *mnt);
+void ll_put_super(struct super_block *sb);
+void ll_kill_super(struct super_block *sb);
+struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock);
+struct inode *ll_inode_from_lock(struct ldlm_lock *lock);
+void ll_clear_inode(struct inode *inode);
+int ll_setattr_raw(struct dentry *dentry, struct iattr *attr);
+int ll_setattr(struct dentry *de, struct iattr *attr);
+int ll_statfs(struct dentry *de, struct kstatfs *sfs);
+int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs,
+		       __u64 max_age, __u32 flags);
+void ll_update_inode(struct inode *inode, struct lustre_md *md);
+void ll_read_inode2(struct inode *inode, void *opaque);
+void ll_delete_inode(struct inode *inode);
+int ll_iocontrol(struct inode *inode, struct file *file,
+		 unsigned int cmd, unsigned long arg);
+int ll_flush_ctx(struct inode *inode);
+void ll_umount_begin(struct super_block *sb);
+int ll_remount_fs(struct super_block *sb, int *flags, char *data);
+int ll_show_options(struct seq_file *seq, struct dentry *dentry);
+void ll_dirty_page_discard_warn(struct page *page, int ioret);
+int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req,
+		  struct super_block *, struct lookup_intent *);
+void lustre_dump_dentry(struct dentry *, int recur);
+void lustre_dump_inode(struct inode *);
+int ll_obd_statfs(struct inode *inode, void *arg);
+int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize);
+int ll_process_config(struct lustre_cfg *lcfg);
+struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data,
+				      struct inode *i1, struct inode *i2,
+				      const char *name, int namelen,
+				      int mode, __u32 opc, void *data);
+void ll_finish_md_op_data(struct md_op_data *op_data);
+int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg);
+char *ll_get_fsname(struct super_block *sb, char *buf, int buflen);
+
+/* llite/llite_nfs.c */
+extern struct export_operations lustre_export_operations;
+__u32 get_uuid2int(const char *name, int len);
+struct inode *search_inode_for_lustre(struct super_block *sb,
+				      const struct lu_fid *fid);
+
+/* llite/special.c */
+extern struct inode_operations ll_special_inode_operations;
+extern struct file_operations ll_special_chr_inode_fops;
+extern struct file_operations ll_special_chr_file_fops;
+extern struct file_operations ll_special_blk_inode_fops;
+extern struct file_operations ll_special_fifo_inode_fops;
+extern struct file_operations ll_special_fifo_file_fops;
+extern struct file_operations ll_special_sock_inode_fops;
+
+/* llite/symlink.c */
+extern struct inode_operations ll_fast_symlink_inode_operations;
+
+/* llite/llite_close.c */
+struct ll_close_queue {
+	spinlock_t		lcq_lock;	/* guards lcq_head dequeue */
+	struct list_head		lcq_head;	/* of lli_close_list */
+	wait_queue_head_t		lcq_waitq;	/* ll_close_thread() waits */
+	struct completion	lcq_comp;	/* thread start/exit handshake */
+	atomic_t		lcq_stop;	/* set at thread shutdown */
+};
+
+struct ccc_object *cl_inode2ccc(struct inode *inode);
+
+
+void vvp_write_pending (struct ccc_object *club, struct ccc_page *page);
+void vvp_write_complete(struct ccc_object *club, struct ccc_page *page);
+
+/* specific architecture can implement only part of this list */
+enum vvp_io_subtype {
+	/** normal IO */
+	IO_NORMAL,
+	/** io called from .sendfile */
+	IO_SENDFILE,
+	/** io started from splice_{read|write} */
+	IO_SPLICE
+};
+
+/* IO subtypes */
+struct vvp_io {
+	/** io subtype */
+	enum vvp_io_subtype    cui_io_subtype;
+
+	union {
+		struct {
+			read_actor_t      cui_actor;
+			void	     *cui_target;
+		} sendfile;
+		struct {
+			struct pipe_inode_info *cui_pipe;
+			unsigned int	    cui_flags;
+		} splice;
+		struct vvp_fault_io {
+			/**
+			 * Inode modification time that is checked across DLM
+			 * lock request.
+			 */
+			time_t		 ft_mtime;
+			struct vm_area_struct *ft_vma;
+			/**
+			 *  locked page returned from vvp_io
+			 */
+			struct page	    *ft_vmpage;
+			struct vm_fault_api {
+				/**
+				 * kernel fault info
+				 */
+				struct vm_fault *ft_vmf;
+				/**
+				 * fault API used bitflags for return code.
+				 */
+				unsigned int    ft_flags;
+			} fault;
+		} fault;
+	} u;
+	/**
+	 * Read-ahead state used by read and page-fault IO contexts.
+	 */
+	struct ll_ra_read    cui_bead;
+	/**
+	 * Set when cui_bead has been initialized.
+	 */
+	int		  cui_ra_window_set;
+	/**
+	 * Partially truncated page, that vvp_io_trunc_start() keeps locked
+	 * across truncate.
+	 */
+	struct cl_page      *cui_partpage;
+};
+
+/**
+ * IO arguments for various VFS I/O interfaces.
+ */
+struct vvp_io_args {
+	/** normal/sendfile/splice */
+	enum vvp_io_subtype via_io_subtype;
+
+	union {
+		struct {
+			struct kiocb      *via_iocb;
+			struct iovec      *via_iov;
+			unsigned long      via_nrsegs;
+		} normal;
+		struct {
+			read_actor_t       via_actor;
+			void	      *via_target;
+		} sendfile;
+		struct {
+			struct pipe_inode_info  *via_pipe;
+			unsigned int       via_flags;
+		} splice;
+	} u;
+};
+
+struct ll_cl_context {
+	void	   *lcc_cookie;
+	struct cl_io   *lcc_io;
+	struct cl_page *lcc_page;
+	struct lu_env  *lcc_env;
+	int	     lcc_refcheck;
+	int	     lcc_created;
+};
+
+struct vvp_thread_info {
+	struct ost_lvb       vti_lvb;
+	struct cl_2queue     vti_queue;
+	struct iovec	 vti_local_iov;
+	struct vvp_io_args   vti_args;
+	struct ra_io_arg     vti_ria;
+	struct kiocb	 vti_kiocb;
+	struct ll_cl_context vti_io_ctx;
+};
+
+static inline struct vvp_thread_info *vvp_env_info(const struct lu_env *env)
+{
+	extern struct lu_context_key vvp_key;
+	struct vvp_thread_info      *info;
+
+	info = lu_context_key_get(&env->le_ctx, &vvp_key);
+	LASSERT(info != NULL);
+	return info;
+}
+
+static inline struct vvp_io_args *vvp_env_args(const struct lu_env *env,
+					       enum vvp_io_subtype type)
+{
+	struct vvp_io_args *ret = &vvp_env_info(env)->vti_args;
+
+	ret->via_io_subtype = type;
+
+	return ret;
+}
+
+struct vvp_session {
+	struct vvp_io	 vs_ios;
+};
+
+static inline struct vvp_session *vvp_env_session(const struct lu_env *env)
+{
+	extern struct lu_context_key vvp_session_key;
+	struct vvp_session *ses;
+
+	ses = lu_context_key_get(env->le_ses, &vvp_session_key);
+	LASSERT(ses != NULL);
+	return ses;
+}
+
+static inline struct vvp_io *vvp_env_io(const struct lu_env *env)
+{
+	return &vvp_env_session(env)->vs_ios;
+}
+
+void ll_queue_done_writing(struct inode *inode, unsigned long flags);
+void ll_close_thread_shutdown(struct ll_close_queue *lcq);
+int ll_close_thread_start(struct ll_close_queue **lcq_ret);
+
+/* llite/llite_mmap.c */
+typedef struct rb_root  rb_root_t;
+typedef struct rb_node  rb_node_t;
+
+struct ll_lock_tree_node;
+struct ll_lock_tree {
+	rb_root_t		       lt_root;
+	struct list_head		      lt_locked_list;
+	struct ll_file_data	    *lt_fd;
+};
+
+int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last);
+int ll_file_mmap(struct file * file, struct vm_area_struct * vma);
+struct ll_lock_tree_node * ll_node_from_inode(struct inode *inode, __u64 start,
+					      __u64 end, ldlm_mode_t mode);
+void policy_from_vma(ldlm_policy_data_t *policy,
+		struct vm_area_struct *vma, unsigned long addr, size_t count);
+struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr,
+			       size_t count);
+
+/* Invalidate a locked page; ->index is widened so the shift can't overflow. */
+static inline void ll_invalidate_page(struct page *vmpage)
+{
+	struct address_space *mapping = vmpage->mapping;
+	loff_t offset = (loff_t)vmpage->index << PAGE_CACHE_SHIFT;
+
+	LASSERT(PageLocked(vmpage));
+	if (mapping == NULL)
+		return;
+	ll_teardown_mmaps(mapping, offset, offset + PAGE_CACHE_SIZE);
+	truncate_complete_page(mapping, vmpage);
+}
+
+#define    ll_s2sbi(sb)	(s2lsi(sb)->lsi_llsbi)
+
+/* don't need an addref as the sb_info should be holding one */
+static inline struct obd_export *ll_s2dtexp(struct super_block *sb)
+{
+	return ll_s2sbi(sb)->ll_dt_exp;
+}
+
+/* don't need an addref as the sb_info should be holding one */
+static inline struct obd_export *ll_s2mdexp(struct super_block *sb)
+{
+	return ll_s2sbi(sb)->ll_md_exp;
+}
+
+static inline struct client_obd *sbi2mdc(struct ll_sb_info *sbi)
+{
+	struct obd_device *obd = sbi->ll_md_exp->exp_obd;
+	if (obd == NULL)
+		LBUG();
+	return &obd->u.cli;
+}
+
+// FIXME: replace the name of this with LL_SB to conform to kernel stuff
+static inline struct ll_sb_info *ll_i2sbi(struct inode *inode)
+{
+	return ll_s2sbi(inode->i_sb);
+}
+
+static inline struct obd_export *ll_i2dtexp(struct inode *inode)
+{
+	return ll_s2dtexp(inode->i_sb);
+}
+
+static inline struct obd_export *ll_i2mdexp(struct inode *inode)
+{
+	return ll_s2mdexp(inode->i_sb);
+}
+
+/* Return the FID stored in the ll_inode_info of @inode (asserted non-NULL). */
+static inline struct lu_fid *ll_inode2fid(struct inode *inode)
+{
+	struct ll_inode_info *lli;
+
+	LASSERT(inode != NULL);
+	lli = ll_i2info(inode);
+	return &lli->lli_fid;
+}
+
+static inline int ll_mds_max_easize(struct super_block *sb)
+{
+	return sbi2mdc(ll_s2sbi(sb))->cl_max_mds_easize;
+}
+
+static inline __u64 ll_file_maxbytes(struct inode *inode)
+{
+	return ll_i2info(inode)->lli_maxbytes;
+}
+
+/* llite/xattr.c */
+int ll_setxattr(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags);
+ssize_t ll_getxattr(struct dentry *dentry, const char *name,
+		    void *buffer, size_t size);
+ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size);
+int ll_removexattr(struct dentry *dentry, const char *name);
+
+/* llite/remote_perm.c */
+extern struct kmem_cache *ll_remote_perm_cachep;
+extern struct kmem_cache *ll_rmtperm_hash_cachep;
+
+struct hlist_head *alloc_rmtperm_hash(void);
+void free_rmtperm_hash(struct hlist_head *hash);
+int ll_update_remote_perm(struct inode *inode, struct mdt_remote_perm *perm);
+int lustre_check_remote_perm(struct inode *inode, int mask);
+
+/* llite/llite_capa.c */
+extern timer_list_t ll_capa_timer;
+
+int ll_capa_thread_start(void);
+void ll_capa_thread_stop(void);
+void ll_capa_timer_callback(unsigned long unused);
+
+struct obd_capa *ll_add_capa(struct inode *inode, struct obd_capa *ocapa);
+int ll_update_capa(struct obd_capa *ocapa, struct lustre_capa *capa);
+
+void ll_capa_open(struct inode *inode);
+void ll_capa_close(struct inode *inode);
+
+struct obd_capa *ll_mdscapa_get(struct inode *inode);
+struct obd_capa *ll_osscapa_get(struct inode *inode, __u64 opc);
+
+void ll_truncate_free_capa(struct obd_capa *ocapa);
+void ll_clear_inode_capas(struct inode *inode);
+void ll_print_capa_stat(struct ll_sb_info *sbi);
+
+/* llite/llite_cl.c */
+extern struct lu_device_type vvp_device_type;
+
+/**
+ * Common IO arguments for various VFS I/O interfaces.
+ */
+int cl_sb_init(struct super_block *sb);
+int cl_sb_fini(struct super_block *sb);
+enum cl_lock_mode  vvp_mode_from_vma(struct vm_area_struct *vma);
+void ll_io_init(struct cl_io *io, const struct file *file, int write);
+
+void ras_update(struct ll_sb_info *sbi, struct inode *inode,
+		struct ll_readahead_state *ras, unsigned long index,
+		unsigned hit);
+void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len);
+int ll_is_file_contended(struct file *file);
+void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which);
+
+/* llite/llite_rmtacl.c */
+#ifdef CONFIG_FS_POSIX_ACL
+obd_valid rce_ops2valid(int ops);
+struct rmtacl_ctl_entry *rct_search(struct rmtacl_ctl_table *rct, pid_t key);
+int rct_add(struct rmtacl_ctl_table *rct, pid_t key, int ops);
+int rct_del(struct rmtacl_ctl_table *rct, pid_t key);
+void rct_init(struct rmtacl_ctl_table *rct);
+void rct_fini(struct rmtacl_ctl_table *rct);
+
+void ee_free(struct eacl_entry *ee);
+int ee_add(struct eacl_table *et, pid_t key, struct lu_fid *fid, int type,
+	   ext_acl_xattr_header *header);
+struct eacl_entry *et_search_del(struct eacl_table *et, pid_t key,
+				 struct lu_fid *fid, int type);
+void et_search_free(struct eacl_table *et, pid_t key);
+void et_init(struct eacl_table *et);
+void et_fini(struct eacl_table *et);
+#endif
+
+/* statahead.c */
+
+#define LL_SA_RPC_MIN	   2
+#define LL_SA_RPC_DEF	   32
+#define LL_SA_RPC_MAX	   8192
+
+#define LL_SA_CACHE_BIT	 5
+#define LL_SA_CACHE_SIZE	(1 << LL_SA_CACHE_BIT)
+#define LL_SA_CACHE_MASK	(LL_SA_CACHE_SIZE - 1)
+
+/* per inode struct, for dir only */
+struct ll_statahead_info {
+	struct inode	   *sai_inode;
+	atomic_t	    sai_refcount;   /* hold a refcount while accessing
+						 * this struct */
+	unsigned int	    sai_generation; /* generation for statahead */
+	unsigned int	    sai_max;	/* max ahead of lookup */
+	__u64		   sai_sent;       /* stat requests sent count */
+	__u64		   sai_replied;    /* stat requests which received
+						 * reply */
+	__u64		   sai_index;      /* index of statahead entry */
+	__u64		   sai_index_wait; /* index of the entry which the
+						 * caller is waiting for */
+	__u64		   sai_hit;	/* hit count */
+	__u64		   sai_miss;       /* miss count:
+						 * for "ls -al" case, it includes
+						 * hidden dentry miss;
+						 * for "ls -l" case, it does not
+						 * include hidden dentry miss.
+						 * "sai_miss_hidden" is used for
+						 * the latter case.
+						 */
+	unsigned int	    sai_consecutive_miss; /* consecutive miss */
+	unsigned int	    sai_miss_hidden;/* "ls -al", but first dentry
+						 * is not a hidden one */
+	unsigned int	    sai_skip_hidden;/* skipped hidden dentry count */
+	unsigned int	    sai_ls_all:1,   /* "ls -al", do stat-ahead for
+						 * hidden entries */
+				sai_in_readpage:1,/* statahead is in readdir()*/
+				sai_agl_valid:1;/* AGL is valid for the dir */
+	wait_queue_head_t	     sai_waitq;      /* stat-ahead wait queue */
+	struct ptlrpc_thread    sai_thread;     /* stat-ahead thread */
+	struct ptlrpc_thread    sai_agl_thread; /* AGL thread */
+	struct list_head	      sai_entries;    /* entry list */
+	struct list_head	      sai_entries_received; /* entries returned */
+	struct list_head	      sai_entries_stated;   /* entries stated */
+	struct list_head	      sai_entries_agl; /* AGL entries to be sent */
+	struct list_head	      sai_cache[LL_SA_CACHE_SIZE];
+	spinlock_t		sai_cache_lock[LL_SA_CACHE_SIZE];
+	atomic_t		sai_cache_count; /* entry count in cache */
+};
+
+int do_statahead_enter(struct inode *dir, struct dentry **dentry,
+		       int only_unplug);
+void ll_stop_statahead(struct inode *dir, void *key);
+
+static inline int ll_glimpse_size(struct inode *inode)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	int rc;
+
+	down_read(&lli->lli_glimpse_sem);
+	rc = cl_glimpse_size(inode);
+	lli->lli_glimpse_time = cfs_time_current();
+	up_read(&lli->lli_glimpse_sem);
+	return rc;
+}
+
+/* Copy the statahead generation of @dir into the private data of @dentry. */
+static inline void ll_statahead_mark(struct inode *dir, struct dentry *dentry)
+{
+	struct ll_inode_info     *dir_info = ll_i2info(dir);
+	struct ll_statahead_info *sa_info = dir_info->lli_sai;
+	struct ll_dentry_data    *dentry_data = ll_d2d(dentry);
+
+	/* only the process recorded in lli_opendir_pid may mark entries */
+	if (dir_info->lli_opendir_pid != current_pid())
+		return;
+	if (sa_info == NULL || dentry_data == NULL)
+		return;
+	dentry_data->lld_sa_generation = sa_info->sai_generation;
+}
+
+static inline int
+ll_need_statahead(struct inode *dir, struct dentry *dentryp)
+{
+	struct ll_inode_info  *lli;
+	struct ll_dentry_data *ldd;
+
+	if (ll_i2sbi(dir)->ll_sa_max == 0)
+		return -EAGAIN;
+
+	lli = ll_i2info(dir);
+	/* not the same process, don't statahead */
+	if (lli->lli_opendir_pid != current_pid())
+		return -EAGAIN;
+
+	/* statahead has been stopped */
+	if (lli->lli_opendir_key == NULL)
+		return -EAGAIN;
+
+	ldd = ll_d2d(dentryp);
+	/*
+	 * When a dentry is stat'ed, the system triggers "revalidate" or
+	 * "lookup" more than once: for "getattr", for "getxattr", and maybe
+	 * for others. Under patchless client mode the operation intent is not
+	 * accurate, which may misguide the statahead thread. For example, the
+	 * "revalidate" calls for "getattr" and "getxattr" of a dentry may
+	 * carry the same operation intent -- "IT_GETATTR".
+	 * In fact, one dentry should have only one chance to interact with
+	 * the statahead thread, otherwise the statahead window gets confused.
+	 * The solution is as follows:
+	 * Assign "sai_generation" to "lld_sa_generation" the first time a
+	 * dentry sees "IT_GETATTR"; subsequent "IT_GETATTR" calls then bypass
+	 * the statahead thread because of the check
+	 * "lld_sa_generation == lli_sai->sai_generation".
+	 */
+	if (ldd && lli->lli_sai &&
+	    ldd->lld_sa_generation == lli->lli_sai->sai_generation)
+		return -EAGAIN;
+
+	return 1;
+}
+
+/* Call do_statahead_enter() only when ll_need_statahead() returns > 0. */
+static inline int
+ll_statahead_enter(struct inode *dir, struct dentry **dentryp, int only_unplug)
+{
+	int needed = ll_need_statahead(dir, *dentryp);
+
+	/* non-positive results are handed straight back to the caller */
+	if (needed <= 0)
+		return needed;
+	return do_statahead_enter(dir, dentryp, only_unplug);
+}
+
+/* llite ioctl registration support routines */
+enum llioc_iter {
+	LLIOC_CONT = 0,
+	LLIOC_STOP
+};
+
+#define LLIOC_MAX_CMD	   256
+
+/*
+ * Rules to write a callback function:
+ *
+ * Parameters:
+ *  @magic: Dynamic ioctl call routine will feed this value with the pointer
+ *      returned by ll_iocontrol_register.  Callback functions should use this
+ *      data to check for a potential collision of ioctl cmd. If a collision
+ *      is found, the callback function should return LLIOC_CONT.
+ *  @rcp: The result of ioctl command.
+ *
+ *  Return values:
+ *      If @magic matches the pointer returned by ll_iocontrol_register, the
+ *      callback should return LLIOC_STOP; return LLIOC_CONT otherwise.
+ */
+typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode,
+		struct file *file, unsigned int cmd, unsigned long arg,
+		void *magic, int *rcp);
+
+enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
+		unsigned int cmd, unsigned long arg, int *rcp);
+
+/* export functions */
+/* Register an ioctl block dynamically for a regular file.
+ *
+ * @cmd: the array of ioctl command set
+ * @count: number of commands in the @cmd
+ * @cb: callback function, it will be called if an ioctl command is found to
+ *      belong to the command list @cmd.
+ *
+ * Return value:
+ *      A magic pointer will be returned if success;
+ *      otherwise, NULL will be returned.
+ * */
+void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd);
+void ll_iocontrol_unregister(void *magic);
+
+
+/* lclient compat stuff */
+#define cl_inode_info ll_inode_info
+#define cl_i2info(info) ll_i2info(info)
+#define cl_inode_mode(inode) ((inode)->i_mode)
+#define cl_i2sbi ll_i2sbi
+
+static inline struct ll_file_data *cl_iattr2fd(struct inode *inode,
+					       const struct iattr *attr)
+{
+	LASSERT(attr->ia_valid & ATTR_FILE);
+	return LUSTRE_FPRIVATE(attr->ia_file);
+}
+
+static inline void cl_isize_lock(struct inode *inode)
+{
+	ll_inode_size_lock(inode);
+}
+
+static inline void cl_isize_unlock(struct inode *inode)
+{
+	ll_inode_size_unlock(inode);
+}
+
+static inline void cl_isize_write_nolock(struct inode *inode, loff_t kms)
+{
+	LASSERT(down_trylock(&ll_i2info(inode)->lli_size_sem) != 0);
+	i_size_write(inode, kms);
+}
+
+static inline void cl_isize_write(struct inode *inode, loff_t kms)
+{
+	ll_inode_size_lock(inode);
+	i_size_write(inode, kms);
+	ll_inode_size_unlock(inode);
+}
+
+#define cl_isize_read(inode)	     i_size_read(inode)
+
+static inline int cl_merge_lvb(const struct lu_env *env, struct inode *inode)
+{
+	return ll_merge_lvb(env, inode);
+}
+
+#define cl_inode_atime(inode) LTIME_S((inode)->i_atime)
+#define cl_inode_ctime(inode) LTIME_S((inode)->i_ctime)
+#define cl_inode_mtime(inode) LTIME_S((inode)->i_mtime)
+
+struct obd_capa *cl_capa_lookup(struct inode *inode, enum cl_req_type crt);
+
+int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
+		       enum cl_fsync_mode mode);
+
+/** direct write pages */
+struct ll_dio_pages {
+	/** page array to be written. we don't support
+	 * partial pages except the last one. */
+	struct page **ldp_pages;
+	/* offset of each page */
+	loff_t       *ldp_offsets;
+	/** if ldp_offsets is NULL, the pages to be written are
+	 * sequential, and this is the file offset of the
+	 * first page. */
+	loff_t	ldp_start_offset;
+	/** how many bytes are to be written. */
+	size_t	ldp_size;
+	/** # of pages in the array. */
+	int	   ldp_nr;
+};
+
+static inline void cl_stats_tally(struct cl_device *dev, enum cl_req_type crt,
+				  int rc)
+{
+	int opc = (crt == CRT_READ) ? LPROC_LL_OSC_READ :
+				      LPROC_LL_OSC_WRITE;
+
+	ll_stats_ops_tally(ll_s2sbi(cl2ccc_dev(dev)->cdv_sb), opc, rc);
+}
+
+extern ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io,
+				  int rw, struct inode *inode,
+				  struct ll_dio_pages *pv);
+
+static inline int ll_file_nolock(const struct file *file)
+{
+	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+	struct inode *inode = file->f_dentry->d_inode;
+
+	LASSERT(fd != NULL);
+	return ((fd->fd_flags & LL_FILE_IGNORE_LOCK) ||
+		(ll_i2sbi(inode)->ll_flags & LL_SBI_NOLCK));
+}
+
+static inline void ll_set_lock_data(struct obd_export *exp, struct inode *inode,
+				    struct lookup_intent *it, __u64 *bits)
+{
+	if (!it->d.lustre.it_lock_set) {
+		struct lustre_handle handle;
+
+		/* If this inode is a remote object, it will get two
+		 * separate locks in different namespaces: the master MDT,
+		 * where the name entry is, will grant the LOOKUP lock, and
+		 * the remote MDT, where the object is, will grant the
+		 * UPDATE|PERM lock. The inode will be attached to both
+		 * LOOKUP and PERM locks, so revoking either lock will
+		 * cause the dcache to be cleared */
+		if (it->d.lustre.it_remote_lock_mode) {
+			handle.cookie = it->d.lustre.it_remote_lock_handle;
+			CDEBUG(D_DLMTRACE, "setting l_data to inode %p"
+			       "(%lu/%u) for remote lock "LPX64"\n", inode,
+			       inode->i_ino, inode->i_generation,
+			       handle.cookie);
+			md_set_lock_data(exp, &handle.cookie, inode, NULL);
+		}
+
+		handle.cookie = it->d.lustre.it_lock_handle;
+
+		CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)"
+		       " for lock "LPX64"\n", inode, inode->i_ino,
+		       inode->i_generation, handle.cookie);
+
+		md_set_lock_data(exp, &handle.cookie, inode,
+				 &it->d.lustre.it_lock_bits);
+		it->d.lustre.it_lock_set = 1;
+	}
+
+	if (bits != NULL)
+		*bits = it->d.lustre.it_lock_bits;
+}
+
+static inline void ll_lock_dcache(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+}
+
+static inline void ll_unlock_dcache(struct inode *inode)
+{
+	spin_unlock(&inode->i_lock);
+}
+
+static inline int d_lustre_invalid(const struct dentry *dentry)
+{
+	struct ll_dentry_data *lld = ll_d2d(dentry);
+
+	return (lld == NULL) || lld->lld_invalid;
+}
+
+static inline void __d_lustre_invalidate(struct dentry *dentry)
+{
+	struct ll_dentry_data *lld = ll_d2d(dentry);
+
+	if (lld != NULL)
+		lld->lld_invalid = 1;
+}
+
+/*
+ * Mark dentry INVALID. If the dentry refcount is zero (this is normally the
+ * case for ll_md_blocking_ast), unhash this dentry and let dcache reclaim it
+ * later; else dput() of the last refcount will unhash this dentry and kill it.
+ */
+static inline void d_lustre_invalidate(struct dentry *dentry)
+{
+	CDEBUG(D_DENTRY, "invalidate dentry %.*s (%p) parent %p inode %p "
+	       "refc %d\n", dentry->d_name.len, dentry->d_name.name, dentry,
+	       dentry->d_parent, dentry->d_inode, d_refcount(dentry));
+
+	spin_lock(&dentry->d_lock);
+	__d_lustre_invalidate(dentry);
+	if (d_refcount(dentry) == 0)
+		__d_drop(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
+static inline void d_lustre_revalidate(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	LASSERT(ll_d2d(dentry) != NULL);
+	ll_d2d(dentry)->lld_invalid = 0;
+	spin_unlock(&dentry->d_lock);
+}
+
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+/* Compatibility for old (1.8) compiled userspace quota code */
+struct if_quotactl_18 {
+	__u32		   qc_cmd;
+	__u32		   qc_type;
+	__u32		   qc_id;
+	__u32		   qc_stat;
+	struct obd_dqinfo       qc_dqinfo;
+	struct obd_dqblk	qc_dqblk;
+	char		    obd_type[16];
+	struct obd_uuid	 obd_uuid;
+};
+#define LL_IOC_QUOTACTL_18	      _IOWR('f', 162, struct if_quotactl_18 *)
+/* End compatibility for old (1.8) compiled userspace quota code */
+#else
+#warning "remove old LL_IOC_QUOTACTL_18 compatibility code"
+#endif /* LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0) */
+
+enum {
+	LL_LAYOUT_GEN_NONE  = ((__u32)-2),	/* layout lock was cancelled */
+	LL_LAYOUT_GEN_EMPTY = ((__u32)-1)	/* for empty layout */
+};
+
+int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf);
+int ll_layout_refresh(struct inode *inode, __u32 *gen);
+
+#endif /* LLITE_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c
new file mode 100644
index 000000000000..278b97dd94c9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/llite_lib.c
@@ -0,0 +1,2424 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/llite_lib.c
+ *
+ * Lustre Light Super operations
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/version.h>
+#include <linux/mm.h>
+
+#include <lustre_lite.h>
+#include <lustre_ha.h>
+#include <lustre_dlm.h>
+#include <lprocfs_status.h>
+#include <lustre_disk.h>
+#include <lustre_param.h>
+#include <lustre_log.h>
+#include <cl_object.h>
+#include <obd_cksum.h>
+#include "llite_internal.h"
+
+struct kmem_cache *ll_file_data_slab;
+
+LIST_HEAD(ll_super_blocks);
+DEFINE_SPINLOCK(ll_sb_lock);
+
+#ifndef MS_HAS_NEW_AOPS
+extern struct address_space_operations ll_aops;
+#else
+extern struct address_space_operations_ext ll_aops;
+#endif
+
+#ifndef log2
+#define log2(n) ffz(~(n))
+#endif
+
+/*
+ * Allocate a ll_sb_info, size its page cache and read-ahead limits from the
+ * amount of non-highmem RAM, and append it to ll_super_blocks.
+ * Returns the new object, or NULL if the allocation failed.
+ */
+static struct ll_sb_info *ll_init_sbi(void)
+{
+	struct ll_sb_info *sbi = NULL;
+	unsigned long lowmem_pages, lru_max;
+	struct sysinfo meminfo;
+	class_uuid_t sb_uuid;
+	int idx;
+	ENTRY;
+
+	OBD_ALLOC(sbi, sizeof(*sbi));
+	if (sbi == NULL)
+		RETURN(NULL);
+
+	spin_lock_init(&sbi->ll_lock);
+	mutex_init(&sbi->ll_lco.lco_lock);
+	spin_lock_init(&sbi->ll_pp_extent_lock);
+	spin_lock_init(&sbi->ll_process_lock);
+	sbi->ll_rw_stats_on = 0;
+
+	/* LRU budget: 1/2 of low memory below 512MB of it, 3/4 otherwise */
+	si_meminfo(&meminfo);
+	lowmem_pages = meminfo.totalram - meminfo.totalhigh;
+	lru_max = (lowmem_pages >> (20 - PAGE_CACHE_SHIFT) < 512) ?
+		  lowmem_pages / 2 : (lowmem_pages / 4) * 3;
+
+	/* set up the ll_cache bookkeeping */
+	atomic_set(&sbi->ll_cache.ccc_users, 0);
+	sbi->ll_cache.ccc_lru_max = lru_max;
+	atomic_set(&sbi->ll_cache.ccc_lru_left, lru_max);
+	spin_lock_init(&sbi->ll_cache.ccc_lru_lock);
+	INIT_LIST_HEAD(&sbi->ll_cache.ccc_lru);
+
+	atomic_set(&sbi->ll_cache.ccc_unstable_nr, 0);
+	init_waitqueue_head(&sbi->ll_cache.ccc_unstable_waitq);
+
+	sbi->ll_ra_info.ra_max_pages_per_file = min(lowmem_pages / 32,
+					   SBI_DEFAULT_READAHEAD_MAX);
+	sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file;
+	sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
+					   SBI_DEFAULT_READAHEAD_WHOLE_MAX;
+	INIT_LIST_HEAD(&sbi->ll_conn_chain);
+	INIT_LIST_HEAD(&sbi->ll_orphan_dentry_list);
+
+	ll_generate_random_uuid(sb_uuid);
+	class_uuid_unparse(sb_uuid, &sbi->ll_sb_uuid);
+	CDEBUG(D_CONFIG, "generated uuid: %s\n", sbi->ll_sb_uuid.uuid);
+
+	/* link the new sbi on the global list under ll_sb_lock */
+	spin_lock(&ll_sb_lock);
+	list_add_tail(&sbi->ll_list, &ll_super_blocks);
+	spin_unlock(&ll_sb_lock);
+
+	sbi->ll_flags |= LL_SBI_VERBOSE | LL_SBI_CHECKSUM | LL_SBI_LRU_RESIZE;
+
+	for (idx = 0; idx <= LL_PROCESS_HIST_MAX; idx++) {
+		spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[idx].
+			       pp_r_hist.oh_lock);
+		spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[idx].
+			       pp_w_hist.oh_lock);
+	}
+
+	/* statahead and AGL start out enabled */
+	sbi->ll_sa_max = LL_SA_RPC_DEF;
+	atomic_set(&sbi->ll_sa_total, 0);
+	atomic_set(&sbi->ll_sa_wrong, 0);
+	atomic_set(&sbi->ll_agl_total, 0);
+	sbi->ll_flags |= LL_SBI_AGL_ENABLED;
+
+	RETURN(sbi);
+}
+
+void ll_free_sbi(struct super_block *sb)
+{
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	ENTRY;
+
+	if (sbi != NULL) {
+		spin_lock(&ll_sb_lock);
+		list_del(&sbi->ll_list);
+		spin_unlock(&ll_sb_lock);
+		OBD_FREE(sbi, sizeof(*sbi));
+	}
+	EXIT;
+}
+
+static struct dentry_operations ll_d_root_ops = {
+	.d_compare = ll_dcompare,
+	.d_revalidate = ll_revalidate_nd,
+};
+
+static int client_common_fill_super(struct super_block *sb, char *md, char *dt,
+				    struct vfsmount *mnt)
+{
+	struct inode *root = 0;
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	struct obd_device *obd;
+	struct obd_capa *oc = NULL;
+	struct obd_statfs *osfs = NULL;
+	struct ptlrpc_request *request = NULL;
+	struct obd_connect_data *data = NULL;
+	struct obd_uuid *uuid;
+	struct md_op_data *op_data;
+	struct lustre_md lmd;
+	obd_valid valid;
+	int size, err, checksum;
+	ENTRY;
+
+	obd = class_name2obd(md);
+	if (!obd) {
+		CERROR("MD %s: not setup or attached\n", md);
+		RETURN(-EINVAL);
+	}
+
+	OBD_ALLOC_PTR(data);
+	if (data == NULL)
+		RETURN(-ENOMEM);
+
+	OBD_ALLOC_PTR(osfs);
+	if (osfs == NULL) {
+		OBD_FREE_PTR(data);
+		RETURN(-ENOMEM);
+	}
+
+	if (proc_lustre_fs_root) {
+		err = lprocfs_register_mountpoint(proc_lustre_fs_root, sb,
+						  dt, md);
+		if (err < 0)
+			CERROR("could not register mount in /proc/fs/lustre\n");
+	}
+
+	/* indicate the features supported by this client */
+	data->ocd_connect_flags = OBD_CONNECT_IBITS    | OBD_CONNECT_NODEVOH  |
+				  OBD_CONNECT_ATTRFID  |
+				  OBD_CONNECT_VERSION  | OBD_CONNECT_BRW_SIZE |
+				  OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA |
+				  OBD_CONNECT_CANCELSET | OBD_CONNECT_FID     |
+				  OBD_CONNECT_AT       | OBD_CONNECT_LOV_V3   |
+				  OBD_CONNECT_RMT_CLIENT | OBD_CONNECT_VBR    |
+				  OBD_CONNECT_FULL20   | OBD_CONNECT_64BITHASH|
+				  OBD_CONNECT_EINPROGRESS |
+				  OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
+				  OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS;
+
+	if (sbi->ll_flags & LL_SBI_SOM_PREVIEW)
+		data->ocd_connect_flags |= OBD_CONNECT_SOM;
+
+	if (sbi->ll_flags & LL_SBI_LRU_RESIZE)
+		data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE;
+#ifdef CONFIG_FS_POSIX_ACL
+	data->ocd_connect_flags |= OBD_CONNECT_ACL | OBD_CONNECT_UMASK;
+#endif
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_MDC_LIGHTWEIGHT))
+		/* flag mdc connection as lightweight, only used for test
+		 * purpose, use with care */
+		data->ocd_connect_flags |= OBD_CONNECT_LIGHTWEIGHT;
+
+	data->ocd_ibits_known = MDS_INODELOCK_FULL;
+	data->ocd_version = LUSTRE_VERSION_CODE;
+
+	if (sb->s_flags & MS_RDONLY)
+		data->ocd_connect_flags |= OBD_CONNECT_RDONLY;
+	if (sbi->ll_flags & LL_SBI_USER_XATTR)
+		data->ocd_connect_flags |= OBD_CONNECT_XATTR;
+
+#ifdef HAVE_MS_FLOCK_LOCK
+	/* force vfs to use lustre handler for flock() calls - bug 10743 */
+	sb->s_flags |= MS_FLOCK_LOCK;
+#endif
+#ifdef MS_HAS_NEW_AOPS
+	sb->s_flags |= MS_HAS_NEW_AOPS;
+#endif
+
+	if (sbi->ll_flags & LL_SBI_FLOCK)
+		sbi->ll_fop = &ll_file_operations_flock;
+	else if (sbi->ll_flags & LL_SBI_LOCALFLOCK)
+		sbi->ll_fop = &ll_file_operations;
+	else
+		sbi->ll_fop = &ll_file_operations_noflock;
+
+	/* real client */
+	data->ocd_connect_flags |= OBD_CONNECT_REAL;
+	if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
+		data->ocd_connect_flags |= OBD_CONNECT_RMT_CLIENT_FORCE;
+
+	data->ocd_brw_size = MD_MAX_BRW_SIZE;
+
+	err = obd_connect(NULL, &sbi->ll_md_exp, obd, &sbi->ll_sb_uuid, data, NULL);
+	if (err == -EBUSY) {
+		LCONSOLE_ERROR_MSG(0x14f, "An MDT (md %s) is performing "
+				   "recovery, of which this client is not a "
+				   "part. Please wait for recovery to complete,"
+				   " abort, or time out.\n", md);
+		GOTO(out, err);
+	} else if (err) {
+		CERROR("cannot connect to %s: rc = %d\n", md, err);
+		GOTO(out, err);
+	}
+
+	sbi->ll_md_exp->exp_connect_data = *data;
+
+	err = obd_fid_init(sbi->ll_md_exp->exp_obd, sbi->ll_md_exp,
+			   LUSTRE_SEQ_METADATA);
+	if (err) {
+		CERROR("%s: Can't init metadata layer FID infrastructure, "
+		       "rc = %d\n", sbi->ll_md_exp->exp_obd->obd_name, err);
+		GOTO(out_md, err);
+	}
+
+	/* For mount, we only need fs info from MDT0, and also in DNE, it
+	 * can make sure the client can be mounted as long as MDT0 is
+	 * avaible */
+	err = obd_statfs(NULL, sbi->ll_md_exp, osfs,
+			cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+			OBD_STATFS_FOR_MDT0);
+	if (err)
+		GOTO(out_md_fid, err);
+
+	/* This needs to be after statfs to ensure connect has finished.
+	 * Note that "data" does NOT contain the valid connect reply.
+	 * If connecting to a 1.8 server there will be no LMV device, so
+	 * we can access the MDC export directly and exp_connect_flags will
+	 * be non-zero, but if accessing an upgraded 2.1 server it will
+	 * have the correct flags filled in.
+	 * XXX: fill in the LMV exp_connect_flags from MDC(s). */
+	valid = exp_connect_flags(sbi->ll_md_exp) & CLIENT_CONNECT_MDT_REQD;
+	if (exp_connect_flags(sbi->ll_md_exp) != 0 &&
+	    valid != CLIENT_CONNECT_MDT_REQD) {
+		char *buf;
+
+		OBD_ALLOC_WAIT(buf, PAGE_CACHE_SIZE);
+		obd_connect_flags2str(buf, PAGE_CACHE_SIZE,
+				      valid ^ CLIENT_CONNECT_MDT_REQD, ",");
+		LCONSOLE_ERROR_MSG(0x170, "Server %s does not support "
+				   "feature(s) needed for correct operation "
+				   "of this client (%s). Please upgrade "
+				   "server or downgrade client.\n",
+				   sbi->ll_md_exp->exp_obd->obd_name, buf);
+		OBD_FREE(buf, PAGE_CACHE_SIZE);
+		GOTO(out_md_fid, err = -EPROTO);
+	}
+
+	size = sizeof(*data);
+	err = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_CONN_DATA),
+			   KEY_CONN_DATA,  &size, data, NULL);
+	if (err) {
+		CERROR("%s: Get connect data failed: rc = %d\n",
+		       sbi->ll_md_exp->exp_obd->obd_name, err);
+		GOTO(out_md_fid, err);
+	}
+
+	LASSERT(osfs->os_bsize);
+	sb->s_blocksize = osfs->os_bsize;
+	sb->s_blocksize_bits = log2(osfs->os_bsize);
+	sb->s_magic = LL_SUPER_MAGIC;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sbi->ll_namelen = osfs->os_namelen;
+	sbi->ll_max_rw_chunk = LL_DEFAULT_MAX_RW_CHUNK;
+
+	if ((sbi->ll_flags & LL_SBI_USER_XATTR) &&
+	    !(data->ocd_connect_flags & OBD_CONNECT_XATTR)) {
+		LCONSOLE_INFO("Disabling user_xattr feature because "
+			      "it is not supported on the server\n");
+		sbi->ll_flags &= ~LL_SBI_USER_XATTR;
+	}
+
+	if (data->ocd_connect_flags & OBD_CONNECT_ACL) {
+#ifdef MS_POSIXACL
+		sb->s_flags |= MS_POSIXACL;
+#endif
+		sbi->ll_flags |= LL_SBI_ACL;
+	} else {
+		LCONSOLE_INFO("client wants to enable acl, but mdt not!\n");
+#ifdef MS_POSIXACL
+		sb->s_flags &= ~MS_POSIXACL;
+#endif
+		sbi->ll_flags &= ~LL_SBI_ACL;
+	}
+
+	if (data->ocd_connect_flags & OBD_CONNECT_RMT_CLIENT) {
+		if (!(sbi->ll_flags & LL_SBI_RMT_CLIENT)) {
+			sbi->ll_flags |= LL_SBI_RMT_CLIENT;
+			LCONSOLE_INFO("client is set as remote by default.\n");
+		}
+	} else {
+		if (sbi->ll_flags & LL_SBI_RMT_CLIENT) {
+			sbi->ll_flags &= ~LL_SBI_RMT_CLIENT;
+			LCONSOLE_INFO("client claims to be remote, but server "
+				      "rejected, forced to be local.\n");
+		}
+	}
+
+	if (data->ocd_connect_flags & OBD_CONNECT_MDS_CAPA) {
+		LCONSOLE_INFO("client enabled MDS capability!\n");
+		sbi->ll_flags |= LL_SBI_MDS_CAPA;
+	}
+
+	if (data->ocd_connect_flags & OBD_CONNECT_OSS_CAPA) {
+		LCONSOLE_INFO("client enabled OSS capability!\n");
+		sbi->ll_flags |= LL_SBI_OSS_CAPA;
+	}
+
+	if (data->ocd_connect_flags & OBD_CONNECT_64BITHASH)
+		sbi->ll_flags |= LL_SBI_64BIT_HASH;
+
+	if (data->ocd_connect_flags & OBD_CONNECT_BRW_SIZE)
+		sbi->ll_md_brw_size = data->ocd_brw_size;
+	else
+		sbi->ll_md_brw_size = PAGE_CACHE_SIZE;
+
+	if (data->ocd_connect_flags & OBD_CONNECT_LAYOUTLOCK) {
+		LCONSOLE_INFO("Layout lock feature supported.\n");
+		sbi->ll_flags |= LL_SBI_LAYOUT_LOCK;
+	}
+
+	obd = class_name2obd(dt);
+	if (!obd) {
+		CERROR("DT %s: not setup or attached\n", dt);
+		GOTO(out_md_fid, err = -ENODEV);
+	}
+
+	data->ocd_connect_flags = OBD_CONNECT_GRANT     | OBD_CONNECT_VERSION  |
+				  OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE |
+				  OBD_CONNECT_CANCELSET | OBD_CONNECT_FID      |
+				  OBD_CONNECT_SRVLOCK   | OBD_CONNECT_TRUNCLOCK|
+				  OBD_CONNECT_AT | OBD_CONNECT_RMT_CLIENT |
+				  OBD_CONNECT_OSS_CAPA | OBD_CONNECT_VBR|
+				  OBD_CONNECT_FULL20 | OBD_CONNECT_64BITHASH |
+				  OBD_CONNECT_MAXBYTES |
+				  OBD_CONNECT_EINPROGRESS |
+				  OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
+				  OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS;
+
+	if (sbi->ll_flags & LL_SBI_SOM_PREVIEW)
+		data->ocd_connect_flags |= OBD_CONNECT_SOM;
+
+	if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_CKSUM)) {
+		/* OBD_CONNECT_CKSUM should always be set, even if checksums are
+		 * disabled by default, because it can still be enabled on the
+		 * fly via /proc. As a consequence, we still need to come to an
+		 * agreement on the supported algorithms at connect time */
+		data->ocd_connect_flags |= OBD_CONNECT_CKSUM;
+
+		if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CKSUM_ADLER_ONLY))
+			data->ocd_cksum_types = OBD_CKSUM_ADLER;
+		else
+			data->ocd_cksum_types = cksum_types_supported_client();
+	}
+
+	data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE;
+	if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
+		data->ocd_connect_flags |= OBD_CONNECT_RMT_CLIENT_FORCE;
+
+	CDEBUG(D_RPCTRACE, "ocd_connect_flags: "LPX64" ocd_version: %d "
+	       "ocd_grant: %d\n", data->ocd_connect_flags,
+	       data->ocd_version, data->ocd_grant);
+
+	obd->obd_upcall.onu_owner = &sbi->ll_lco;
+	obd->obd_upcall.onu_upcall = cl_ocd_update;
+
+	data->ocd_brw_size = DT_MAX_BRW_SIZE;
+
+	err = obd_connect(NULL, &sbi->ll_dt_exp, obd, &sbi->ll_sb_uuid, data,
+			  NULL);
+	if (err == -EBUSY) {
+		LCONSOLE_ERROR_MSG(0x150, "An OST (dt %s) is performing "
+				   "recovery, of which this client is not a "
+				   "part.  Please wait for recovery to "
+				   "complete, abort, or time out.\n", dt);
+		GOTO(out_md, err);
+	} else if (err) {
+		CERROR("%s: Cannot connect to %s: rc = %d\n",
+		       sbi->ll_dt_exp->exp_obd->obd_name, dt, err);
+		GOTO(out_md, err);
+	}
+
+	sbi->ll_dt_exp->exp_connect_data = *data;
+
+	err = obd_fid_init(sbi->ll_dt_exp->exp_obd, sbi->ll_dt_exp,
+			   LUSTRE_SEQ_METADATA);
+	if (err) {
+		CERROR("%s: Can't init data layer FID infrastructure, "
+		       "rc = %d\n", sbi->ll_dt_exp->exp_obd->obd_name, err);
+		GOTO(out_dt, err);
+	}
+
+	mutex_lock(&sbi->ll_lco.lco_lock);
+	sbi->ll_lco.lco_flags = data->ocd_connect_flags;
+	sbi->ll_lco.lco_md_exp = sbi->ll_md_exp;
+	sbi->ll_lco.lco_dt_exp = sbi->ll_dt_exp;
+	mutex_unlock(&sbi->ll_lco.lco_lock);
+
+	fid_zero(&sbi->ll_root_fid);
+	err = md_getstatus(sbi->ll_md_exp, &sbi->ll_root_fid, &oc);
+	if (err) {
+		CERROR("cannot mds_connect: rc = %d\n", err);
+		GOTO(out_lock_cn_cb, err);
+	}
+	if (!fid_is_sane(&sbi->ll_root_fid)) {
+		CERROR("%s: Invalid root fid "DFID" during mount\n",
+		       sbi->ll_md_exp->exp_obd->obd_name,
+		       PFID(&sbi->ll_root_fid));
+		GOTO(out_lock_cn_cb, err = -EINVAL);
+	}
+	CDEBUG(D_SUPER, "rootfid "DFID"\n", PFID(&sbi->ll_root_fid));
+
+	sb->s_op = &lustre_super_operations;
+#if THREAD_SIZE >= 8192 /*b=17630*/
+	sb->s_export_op = &lustre_export_operations;
+#endif
+
+	/* make root inode
+	 * XXX: move this to after cbd setup? */
+	valid = OBD_MD_FLGETATTR | OBD_MD_FLBLOCKS | OBD_MD_FLMDSCAPA;
+	if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
+		valid |= OBD_MD_FLRMTPERM;
+	else if (sbi->ll_flags & LL_SBI_ACL)
+		valid |= OBD_MD_FLACL;
+
+	OBD_ALLOC_PTR(op_data);
+	if (op_data == NULL)
+		GOTO(out_lock_cn_cb, err = -ENOMEM);
+
+	op_data->op_fid1 = sbi->ll_root_fid;
+	op_data->op_mode = 0;
+	op_data->op_capa1 = oc;
+	op_data->op_valid = valid;
+
+	err = md_getattr(sbi->ll_md_exp, op_data, &request);
+	if (oc)
+		capa_put(oc);
+	OBD_FREE_PTR(op_data);
+	if (err) {
+		CERROR("%s: md_getattr failed for root: rc = %d\n",
+		       sbi->ll_md_exp->exp_obd->obd_name, err);
+		GOTO(out_lock_cn_cb, err);
+	}
+
+	err = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp,
+			       sbi->ll_md_exp, &lmd);
+	if (err) {
+		CERROR("failed to understand root inode md: rc = %d\n", err);
+		ptlrpc_req_finished(request);
+		GOTO(out_lock_cn_cb, err);
+	}
+
+	LASSERT(fid_is_sane(&sbi->ll_root_fid));
+	root = ll_iget(sb, cl_fid_build_ino(&sbi->ll_root_fid,
+					    ll_need_32bit_api(sbi)),
+		       &lmd);
+	md_free_lustre_md(sbi->ll_md_exp, &lmd);
+	ptlrpc_req_finished(request);
+
+	if (root == NULL || IS_ERR(root)) {
+		if (lmd.lsm)
+			obd_free_memmd(sbi->ll_dt_exp, &lmd.lsm);
+#ifdef CONFIG_FS_POSIX_ACL
+		if (lmd.posix_acl) {
+			posix_acl_release(lmd.posix_acl);
+			lmd.posix_acl = NULL;
+		}
+#endif
+		err = IS_ERR(root) ? PTR_ERR(root) : -EBADF;
+		root = NULL;
+		CERROR("lustre_lite: bad iget4 for root\n");
+		GOTO(out_root, err);
+	}
+
+	err = ll_close_thread_start(&sbi->ll_lcq);
+	if (err) {
+		CERROR("cannot start close thread: rc %d\n", err);
+		GOTO(out_root, err);
+	}
+
+#ifdef CONFIG_FS_POSIX_ACL
+	if (sbi->ll_flags & LL_SBI_RMT_CLIENT) {
+		rct_init(&sbi->ll_rct);
+		et_init(&sbi->ll_et);
+	}
+#endif
+
+	checksum = sbi->ll_flags & LL_SBI_CHECKSUM;
+	err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM),
+				 KEY_CHECKSUM, sizeof(checksum), &checksum,
+				 NULL);
+	cl_sb_init(sb);
+
+	err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CACHE_SET),
+				 KEY_CACHE_SET, sizeof(sbi->ll_cache),
+				 &sbi->ll_cache, NULL);
+
+	sb->s_root = d_make_root(root);
+	if (sb->s_root == NULL) {
+		CERROR("%s: can't make root dentry\n",
+			ll_get_fsname(sb, NULL, 0));
+		GOTO(out_root, err = -ENOMEM);
+	}
+
+	/* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */
+	d_set_d_op(sb->s_root, &ll_d_root_ops);
+	sb->s_d_op = &ll_d_ops;
+
+	sbi->ll_sdev_orig = sb->s_dev;
+
+	/* We set sb->s_dev equal on all lustre clients in order to support
+	 * NFS export clustering.  NFSD requires that the FSID be the same
+	 * on all clients. */
+	/* s_dev is also used in lt_compare() to compare two fs, but that is
+	 * only a node-local comparison. */
+	uuid = obd_get_uuid(sbi->ll_md_exp);
+	if (uuid != NULL)
+		sb->s_dev = get_uuid2int(uuid->uuid, strlen(uuid->uuid));
+
+	if (data != NULL)
+		OBD_FREE_PTR(data);
+	if (osfs != NULL)
+		OBD_FREE_PTR(osfs);
+
+	RETURN(err);
+out_root:
+	if (root)
+		iput(root);
+out_lock_cn_cb:
+	obd_fid_fini(sbi->ll_dt_exp->exp_obd);
+out_dt:
+	obd_disconnect(sbi->ll_dt_exp);
+	sbi->ll_dt_exp = NULL;
+	/* Make sure all OScs are gone, since cl_cache is accessing sbi. */
+	obd_zombie_barrier();
+out_md_fid:
+	obd_fid_fini(sbi->ll_md_exp->exp_obd);
+out_md:
+	obd_disconnect(sbi->ll_md_exp);
+	sbi->ll_md_exp = NULL;
+out:
+	if (data != NULL)
+		OBD_FREE_PTR(data);
+	if (osfs != NULL)
+		OBD_FREE_PTR(osfs);
+	lprocfs_unregister_mountpoint(sbi);
+	return err;
+}
+
+int ll_get_max_mdsize(struct ll_sb_info *sbi, int *lmmsize)
+{
+	int size, rc;
+
+	*lmmsize = obd_size_diskmd(sbi->ll_dt_exp, NULL);
+	size = sizeof(int);	/* == sizeof(*lmmsize), passed by address */
+	rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_MAX_EASIZE),
+			  KEY_MAX_EASIZE, &size, lmmsize, NULL);
+	if (rc)	/* log only; rc is still handed back to the caller */
+		CERROR("Get max mdsize error rc %d \n", rc);
+
+	RETURN(rc);
+}
+
+void ll_dump_inode(struct inode *inode)
+{
+	struct ll_d_hlist_node *tmp;
+	int dentry_count = 0;	/* number of entries on inode->i_dentry */
+
+	LASSERT(inode != NULL);
+
+	ll_d_hlist_for_each(tmp, &inode->i_dentry)
+		dentry_count++;	/* tmp is a cursor only */
+
+	CERROR("inode %p dump: dev=%s ino=%lu mode=%o count=%u, %d dentries\n",
+	       inode, ll_i2mdexp(inode)->exp_obd->obd_name, inode->i_ino,
+	       inode->i_mode, atomic_read(&inode->i_count), dentry_count);
+}
+
+void lustre_dump_dentry(struct dentry *dentry, int recur)
+{
+	struct list_head *tmp;
+	int subdirs = 0;	/* number of entries on dentry->d_subdirs */
+
+	LASSERT(dentry != NULL);
+
+	list_for_each(tmp, &dentry->d_subdirs)
+		subdirs++;
+
+	CERROR("dentry %p dump: name=%.*s parent=%.*s (%p), inode=%p, count=%u,"
+	       " flags=0x%x, fsdata=%p, %d subdirs\n", dentry,
+	       dentry->d_name.len, dentry->d_name.name,
+	       dentry->d_parent->d_name.len, dentry->d_parent->d_name.name,
+	       dentry->d_parent, dentry->d_inode, d_refcount(dentry),
+	       dentry->d_flags, dentry->d_fsdata, subdirs);
+	if (dentry->d_inode != NULL)	/* also dump the attached inode */
+		ll_dump_inode(dentry->d_inode);
+
+	if (recur == 0)	/* recur == 0: do not descend into children */
+		return;
+
+	list_for_each(tmp, &dentry->d_subdirs) {
+		struct dentry *d = list_entry(tmp, struct dentry, d_u.d_child);
+		lustre_dump_dentry(d, recur - 1);
+	}
+}
+
+void client_common_put_super(struct super_block *sb)
+{
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	ENTRY;
+
+#ifdef CONFIG_FS_POSIX_ACL
+	if (sbi->ll_flags & LL_SBI_RMT_CLIENT) {
+		et_fini(&sbi->ll_et);
+		rct_fini(&sbi->ll_rct);
+	}
+#endif
+
+	ll_close_thread_shutdown(sbi->ll_lcq);
+
+	cl_sb_fini(sb);
+
+	list_del(&sbi->ll_conn_chain);
+
+	obd_fid_fini(sbi->ll_dt_exp->exp_obd);	/* data export goes first */
+	obd_disconnect(sbi->ll_dt_exp);
+	sbi->ll_dt_exp = NULL;
+	/* Wait until all OSCs are gone, since cl_cache is accessing sbi;
+	 * see LU-2543. */
+	obd_zombie_barrier();
+
+	lprocfs_unregister_mountpoint(sbi);
+
+	obd_fid_fini(sbi->ll_md_exp->exp_obd);	/* then the metadata export */
+	obd_disconnect(sbi->ll_md_exp);
+	sbi->ll_md_exp = NULL;
+
+	EXIT;
+}
+
+void ll_kill_super(struct super_block *sb)
+{
+	struct ll_sb_info *sbi;
+
+	ENTRY;
+
+	/* sb not initialized (MS_ACTIVE unset)? then nothing to restore */
+	if (!(sb->s_flags & MS_ACTIVE))
+		return;	/* NOTE(review): no EXIT to pair with ENTRY */
+
+	sbi = ll_s2sbi(sb);
+	/* We need to restore the s_dev that was changed for clustered NFS
+	 * before put_super, because new kernels cache s_dev and changing
+	 * sb->s_dev in put_super does not affect the actual device removal. */
+	if (sbi)
+		sb->s_dev = sbi->ll_sdev_orig;
+	EXIT;
+}
+
+char *ll_read_opt(const char *opt, char *data)
+{
+	char *value;
+	char *retval;
+	ENTRY;
+
+	CDEBUG(D_SUPER, "option: %s, data %s\n", opt, data);
+	if (strncmp(opt, data, strlen(opt)))	/* data must begin with opt */
+		RETURN(NULL);
+	if ((value = strchr(data, '=')) == NULL)	/* no "=value" part */
+		RETURN(NULL);
+
+	value++;	/* step over the '=' */
+	OBD_ALLOC(retval, strlen(value) + 1);	/* +1 for the NUL */
+	if (!retval) {
+		CERROR("out of memory!\n");
+		RETURN(NULL);
+	}
+
+	memcpy(retval, value, strlen(value)+1);	/* incl. NUL */
+	CDEBUG(D_SUPER, "Assigned option: %s, value %s\n", opt, retval);
+	RETURN(retval);	/* not freed here; presumably the caller frees it */
+}
+
+static inline int ll_set_opt(const char *opt, char *data, int fl)
+{
+	if (strncmp(opt, data, strlen(opt)) != 0)	/* prefix match */
+		return(0);
+	else
+		return(fl);	/* data begins with opt */
+}
+
+/* client options only; non-client-specific ones are parsed in lmd_parse */
+static int ll_options(char *options, int *flags)
+{
+	int tmp;	/* flag bits of the option matched at s1, or 0 */
+	char *s1 = options, *s2;	/* s1: current option; s2: next ',' */
+	ENTRY;
+
+	if (!options)
+		RETURN(0);
+
+	CDEBUG(D_CONFIG, "Parsing opts %s\n", options);
+
+	while (*s1) {
+		CDEBUG(D_SUPER, "next opt=%s\n", s1);
+		tmp = ll_set_opt("nolock", s1, LL_SBI_NOLCK);
+		if (tmp) {
+			*flags |= tmp;
+			goto next;
+		}
+		tmp = ll_set_opt("flock", s1, LL_SBI_FLOCK);
+		if (tmp) {
+			*flags |= tmp;
+			goto next;
+		}
+		tmp = ll_set_opt("localflock", s1, LL_SBI_LOCALFLOCK);
+		if (tmp) {
+			*flags |= tmp;
+			goto next;
+		}
+		tmp = ll_set_opt("noflock", s1, LL_SBI_FLOCK|LL_SBI_LOCALFLOCK);
+		if (tmp) {
+			*flags &= ~tmp;	/* clears both flock bits */
+			goto next;
+		}
+		tmp = ll_set_opt("user_xattr", s1, LL_SBI_USER_XATTR);
+		if (tmp) {
+			*flags |= tmp;
+			goto next;
+		}
+		tmp = ll_set_opt("nouser_xattr", s1, LL_SBI_USER_XATTR);
+		if (tmp) {
+			*flags &= ~tmp;
+			goto next;
+		}
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 5, 50, 0)
+		tmp = ll_set_opt("acl", s1, LL_SBI_ACL);
+		if (tmp) {
+			/* Ignore deprecated mount option.  The client will
+			 * always try to mount with ACL support, whether this
+			 * is used depends on whether server supports it. */
+			LCONSOLE_ERROR_MSG(0x152, "Ignoring deprecated "
+						  "mount option 'acl'.\n");
+			goto next;
+		}
+		tmp = ll_set_opt("noacl", s1, LL_SBI_ACL);
+		if (tmp) {
+			LCONSOLE_ERROR_MSG(0x152, "Ignoring deprecated "
+						  "mount option 'noacl'.\n");
+			goto next;
+		}
+#else
+#warning "{no}acl options have been deprecated since 1.8, please remove them"
+#endif
+		tmp = ll_set_opt("remote_client", s1, LL_SBI_RMT_CLIENT);
+		if (tmp) {
+			*flags |= tmp;
+			goto next;
+		}
+		tmp = ll_set_opt("user_fid2path", s1, LL_SBI_USER_FID2PATH);
+		if (tmp) {
+			*flags |= tmp;
+			goto next;
+		}
+		tmp = ll_set_opt("nouser_fid2path", s1, LL_SBI_USER_FID2PATH);
+		if (tmp) {
+			*flags &= ~tmp;
+			goto next;
+		}
+
+		tmp = ll_set_opt("checksum", s1, LL_SBI_CHECKSUM);
+		if (tmp) {
+			*flags |= tmp;
+			goto next;
+		}
+		tmp = ll_set_opt("nochecksum", s1, LL_SBI_CHECKSUM);
+		if (tmp) {
+			*flags &= ~tmp;
+			goto next;
+		}
+		tmp = ll_set_opt("lruresize", s1, LL_SBI_LRU_RESIZE);
+		if (tmp) {
+			*flags |= tmp;
+			goto next;
+		}
+		tmp = ll_set_opt("nolruresize", s1, LL_SBI_LRU_RESIZE);
+		if (tmp) {
+			*flags &= ~tmp;
+			goto next;
+		}
+		tmp = ll_set_opt("lazystatfs", s1, LL_SBI_LAZYSTATFS);
+		if (tmp) {
+			*flags |= tmp;
+			goto next;
+		}
+		tmp = ll_set_opt("nolazystatfs", s1, LL_SBI_LAZYSTATFS);
+		if (tmp) {
+			*flags &= ~tmp;
+			goto next;
+		}
+		tmp = ll_set_opt("som_preview", s1, LL_SBI_SOM_PREVIEW);
+		if (tmp) {
+			*flags |= tmp;
+			goto next;
+		}
+		tmp = ll_set_opt("32bitapi", s1, LL_SBI_32BIT_API);
+		if (tmp) {
+			*flags |= tmp;
+			goto next;
+		}
+		tmp = ll_set_opt("verbose", s1, LL_SBI_VERBOSE);
+		if (tmp) {
+			*flags |= tmp;
+			goto next;
+		}
+		tmp = ll_set_opt("noverbose", s1, LL_SBI_VERBOSE);
+		if (tmp) {
+			*flags &= ~tmp;
+			goto next;
+		}
+		LCONSOLE_ERROR_MSG(0x152, "Unknown option '%s', won't mount.\n",
+				   s1);
+		RETURN(-EINVAL);
+
+next:
+		/* Advance s1 past the next ',' or stop if there is none */
+		s2 = strchr(s1, ',');
+		if (s2 == NULL)
+			break;
+		s1 = s2 + 1;
+	}
+	RETURN(0);
+}
+
+void ll_lli_init(struct ll_inode_info *lli)
+{
+	lli->lli_inode_magic = LLI_INODE_MAGIC;
+	lli->lli_flags = 0;
+	lli->lli_ioepoch = 0;
+	lli->lli_maxbytes = MAX_LFS_FILESIZE;
+	spin_lock_init(&lli->lli_lock);
+	lli->lli_posix_acl = NULL;
+	lli->lli_remote_perms = NULL;
+	mutex_init(&lli->lli_rmtperm_mutex);
+	/* Leave lli_fid alone: it has been initialized already. */
+	fid_zero(&lli->lli_pfid);
+	INIT_LIST_HEAD(&lli->lli_close_list);
+	INIT_LIST_HEAD(&lli->lli_oss_capas);
+	atomic_set(&lli->lli_open_count, 0);
+	lli->lli_mds_capa = NULL;
+	lli->lli_rmtperm_time = 0;
+	lli->lli_pending_och = NULL;
+	lli->lli_mds_read_och = NULL;
+	lli->lli_mds_write_och = NULL;
+	lli->lli_mds_exec_och = NULL;
+	lli->lli_open_fd_read_count = 0;
+	lli->lli_open_fd_write_count = 0;
+	lli->lli_open_fd_exec_count = 0;
+	mutex_init(&lli->lli_och_mutex);
+	spin_lock_init(&lli->lli_agl_lock);
+	lli->lli_has_smd = false;
+	lli->lli_layout_gen = LL_LAYOUT_GEN_NONE;
+	lli->lli_clob = NULL;
+
+	LASSERT(lli->lli_vfs_inode.i_mode != 0);
+	if (S_ISDIR(lli->lli_vfs_inode.i_mode)) {
+		mutex_init(&lli->lli_readdir_mutex);	/* directories only */
+		lli->lli_opendir_key = NULL;
+		lli->lli_sai = NULL;
+		lli->lli_def_acl = NULL;
+		spin_lock_init(&lli->lli_sa_lock);
+		lli->lli_opendir_pid = 0;
+	} else {	/* fields set for non-directories */
+		sema_init(&lli->lli_size_sem, 1);
+		lli->lli_size_sem_owner = NULL;
+		lli->lli_symlink_name = NULL;
+		init_rwsem(&lli->lli_trunc_sem);
+		mutex_init(&lli->lli_write_mutex);
+		init_rwsem(&lli->lli_glimpse_sem);
+		lli->lli_glimpse_time = 0;
+		INIT_LIST_HEAD(&lli->lli_agl_list);
+		lli->lli_agl_index = 0;
+		lli->lli_async_rc = 0;
+		lli->lli_volatile = false;
+	}
+	mutex_init(&lli->lli_layout_mutex);
+}
+
+static inline int ll_bdi_register(struct backing_dev_info *bdi)
+{
+	static atomic_t ll_bdi_num = ATOMIC_INIT(0);
+
+	bdi->name = "lustre";
+	return bdi_register(bdi, NULL, "lustre-%d",
+			    atomic_inc_return(&ll_bdi_num));	/* next id */
+}
+
+int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
+{
+	struct lustre_profile *lprof = NULL;
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct ll_sb_info *sbi;
+	char  *dt = NULL, *md = NULL;
+	char  *profilenm = get_profile_name(sb);
+	struct config_llog_instance *cfg;
+	/* %p for void* in printf needs 16+2 characters: 0xffffffffffffffff */
+	const int instlen = sizeof(cfg->cfg_instance) * 2 + 2;
+	int    err;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb);
+
+	OBD_ALLOC_PTR(cfg);
+	if (cfg == NULL)
+		RETURN(-ENOMEM);
+
+	try_module_get(THIS_MODULE);	/* NOTE(review): result ignored */
+
+	/* client additional sb info */
+	lsi->lsi_llsbi = sbi = ll_init_sbi();
+	if (!sbi) {
+		module_put(THIS_MODULE);
+		OBD_FREE_PTR(cfg);
+		RETURN(-ENOMEM);
+	}
+
+	err = ll_options(lsi->lsi_lmd->lmd_opts, &sbi->ll_flags);
+	if (err)
+		GOTO(out_free, err);
+
+	err = bdi_init(&lsi->lsi_bdi);
+	if (err)
+		GOTO(out_free, err);
+	lsi->lsi_flags |= LSI_BDI_INITIALIZED;
+	lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY;
+	err = ll_bdi_register(&lsi->lsi_bdi);
+	if (err)
+		GOTO(out_free, err);
+
+	sb->s_bdi = &lsi->lsi_bdi;
+
+	/* Generate a string unique to this super, in case someone tries
+	 * to mount the same fs at two mount points.
+	 * Use the address of the super itself. */
+	cfg->cfg_instance = sb;
+	cfg->cfg_uuid = lsi->lsi_llsbi->ll_sb_uuid;
+	cfg->cfg_callback = class_config_llog_handler;
+	/* set up client obds */
+	err = lustre_process_log(sb, profilenm, cfg);
+	if (err < 0) {
+		CERROR("Unable to process log: %d\n", err);
+		GOTO(out_free, err);
+	}
+
+	/* Profile set with LCFG_MOUNTOPT so we can find our mdc and osc obds */
+	lprof = class_get_profile(profilenm);
+	if (lprof == NULL) {
+		LCONSOLE_ERROR_MSG(0x156, "The client profile '%s' could not be"
+				   " read from the MGS.  Does that filesystem "
+				   "exist?\n", profilenm);
+		GOTO(out_free, err = -EINVAL);
+	}
+	CDEBUG(D_CONFIG, "Found profile %s: mdc=%s osc=%s\n", profilenm,
+	       lprof->lp_md, lprof->lp_dt);
+
+	OBD_ALLOC(dt, strlen(lprof->lp_dt) + instlen + 2);
+	if (!dt)
+		GOTO(out_free, err = -ENOMEM);
+	sprintf(dt, "%s-%p", lprof->lp_dt, cfg->cfg_instance);
+
+	OBD_ALLOC(md, strlen(lprof->lp_md) + instlen + 2);
+	if (!md)
+		GOTO(out_free, err = -ENOMEM);
+	sprintf(md, "%s-%p", lprof->lp_md, cfg->cfg_instance);
+
+	/* connections, registrations, sb setup */
+	err = client_common_fill_super(sb, md, dt, mnt);
+
+out_free:
+	if (md)	/* md/dt set => lprof is non-NULL */
+		OBD_FREE(md, strlen(lprof->lp_md) + instlen + 2);
+	if (dt)
+		OBD_FREE(dt, strlen(lprof->lp_dt) + instlen + 2);
+	if (err)
+		ll_put_super(sb);	/* undoes the partial setup above */
+	else if (sbi->ll_flags & LL_SBI_VERBOSE)
+		LCONSOLE_WARN("Mounted %s\n", profilenm);
+
+	OBD_FREE_PTR(cfg);
+	RETURN(err);
+} /* ll_fill_super */
+
+
+void lu_context_keys_dump(void);
+
+void ll_put_super(struct super_block *sb)
+{
+	struct config_llog_instance cfg;
+	struct obd_device *obd;
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	char *profilenm = get_profile_name(sb);
+	int ccc_count, next, force = 1, rc = 0;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op: sb %p - %s\n", sb, profilenm);
+
+	ll_print_capa_stat(sbi);
+
+	cfg.cfg_instance = sb;	/* the only cfg field set here */
+	lustre_end_log(sb, profilenm, &cfg);
+
+	if (sbi->ll_md_exp) {
+		obd = class_exp2obd(sbi->ll_md_exp);
+		if (obd)
+			force = obd->obd_force;	/* else stays 1 */
+	}
+
+	/* Wait for unstable pages to be committed to stable storage */
+	if (force == 0) {
+		struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+		rc = l_wait_event(sbi->ll_cache.ccc_unstable_waitq,
+			atomic_read(&sbi->ll_cache.ccc_unstable_nr) == 0,
+			&lwi);
+	}
+
+	ccc_count = atomic_read(&sbi->ll_cache.ccc_unstable_nr);
+	if (force == 0 && rc != -EINTR)	/* waited and not -EINTR */
+		LASSERTF(ccc_count == 0, "count: %i\n", ccc_count);
+
+
+	/* We need to set force before the lov_disconnect in
+	 * lustre_common_put_super, since l_d cleans up osc's as well. */
+	if (force) {
+		next = 0;
+		while ((obd = class_devices_in_group(&sbi->ll_sb_uuid,
+						     &next)) != NULL) {
+			obd->obd_force = force;
+		}
+	}
+
+	if (sbi->ll_lcq) {
+		/* Only if client_common_fill_super succeeded */
+		client_common_put_super(sb);
+	}
+
+	next = 0;
+	while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) !=NULL) {
+		class_manual_cleanup(obd);
+	}
+
+	if (sbi->ll_flags & LL_SBI_VERBOSE)
+		LCONSOLE_WARN("Unmounted %s\n", profilenm ? profilenm : "");
+
+	if (profilenm)
+		class_del_profile(profilenm);
+
+	if (lsi->lsi_flags & LSI_BDI_INITIALIZED) {
+		bdi_destroy(&lsi->lsi_bdi);
+		lsi->lsi_flags &= ~LSI_BDI_INITIALIZED;
+	}
+
+	ll_free_sbi(sb);
+	lsi->lsi_llsbi = NULL;
+
+	lustre_common_put_super(sb);
+
+	module_put(THIS_MODULE);
+
+	EXIT;
+} /* ll_put_super */
+
+struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock)
+{
+	struct inode *inode = NULL;
+
+	/* NOTE: we depend on atomic igrab() -bzzz */
+	lock_res_and_lock(lock);
+	if (lock->l_resource->lr_lvb_inode) {
+		struct ll_inode_info * lli;
+		lli = ll_i2info(lock->l_resource->lr_lvb_inode);
+		if (lli->lli_inode_magic == LLI_INODE_MAGIC) {
+			inode = igrab(lock->l_resource->lr_lvb_inode);
+		} else {
+			inode = lock->l_resource->lr_lvb_inode;
+			LDLM_DEBUG_LIMIT(inode->i_state & I_FREEING ?  D_INFO :
+					 D_WARNING, lock, "lr_lvb_inode %p is "
+					 "bogus: magic %08x",
+					 lock->l_resource->lr_lvb_inode,
+					 lli->lli_inode_magic);
+			inode = NULL;	/* only needed for the message */
+		}
+	}
+	unlock_res_and_lock(lock);
+	return inode;	/* NULL, or whatever igrab() returned */
+}
+
+struct inode *ll_inode_from_lock(struct ldlm_lock *lock)
+{
+	struct inode *inode = NULL;
+	/* NOTE: we depend on atomic igrab() -bzzz */
+	lock_res_and_lock(lock);
+	if (lock->l_ast_data) {
+		struct ll_inode_info *lli = ll_i2info(lock->l_ast_data);
+		if (lli->lli_inode_magic == LLI_INODE_MAGIC) {
+			inode = igrab(lock->l_ast_data);
+		} else {
+			inode = lock->l_ast_data;
+			LDLM_DEBUG_LIMIT(inode->i_state & I_FREEING ?  D_INFO :
+					 D_WARNING, lock, "l_ast_data %p is "
+					 "bogus: magic %08x", lock->l_ast_data,
+					 lli->lli_inode_magic);
+			inode = NULL;	/* only needed for the message */
+		}
+	}
+	unlock_res_and_lock(lock);
+	return inode;	/* NULL, or whatever igrab() returned */
+}
+
+void ll_clear_inode(struct inode *inode)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
+	       inode->i_generation, inode);
+
+	if (S_ISDIR(inode->i_mode)) {
+		/* these should have been cleared in ll_file_release */
+		LASSERT(lli->lli_opendir_key == NULL);
+		LASSERT(lli->lli_sai == NULL);
+		LASSERT(lli->lli_opendir_pid == 0);
+	}
+
+	ll_i2info(inode)->lli_flags &= ~LLIF_MDS_SIZE_LOCK;
+	md_null_inode(sbi->ll_md_exp, ll_inode2fid(inode));
+
+	LASSERT(!lli->lli_open_fd_write_count);
+	LASSERT(!lli->lli_open_fd_read_count);
+	LASSERT(!lli->lli_open_fd_exec_count);
+
+	if (lli->lli_mds_write_och)	/* close any leftover MDS handles */
+		ll_md_real_close(inode, FMODE_WRITE);
+	if (lli->lli_mds_exec_och)
+		ll_md_real_close(inode, FMODE_EXEC);
+	if (lli->lli_mds_read_och)
+		ll_md_real_close(inode, FMODE_READ);
+
+	if (S_ISLNK(inode->i_mode) && lli->lli_symlink_name) {
+		OBD_FREE(lli->lli_symlink_name,
+			 strlen(lli->lli_symlink_name) + 1);	/* + NUL */
+		lli->lli_symlink_name = NULL;
+	}
+
+	if (sbi->ll_flags & LL_SBI_RMT_CLIENT) {
+		LASSERT(lli->lli_posix_acl == NULL);
+		if (lli->lli_remote_perms) {
+			free_rmtperm_hash(lli->lli_remote_perms);
+			lli->lli_remote_perms = NULL;
+		}
+	}
+#ifdef CONFIG_FS_POSIX_ACL
+	else if (lli->lli_posix_acl) {
+		LASSERT(atomic_read(&lli->lli_posix_acl->a_refcount) == 1);
+		LASSERT(lli->lli_remote_perms == NULL);
+		posix_acl_release(lli->lli_posix_acl);
+		lli->lli_posix_acl = NULL;
+	}
+#endif
+	lli->lli_inode_magic = LLI_INODE_DEAD;
+
+	ll_clear_inode_capas(inode);
+	if (!S_ISDIR(inode->i_mode))	/* agl list: non-dirs only */
+		LASSERT(list_empty(&lli->lli_agl_list));
+
+	/*
+	 * XXX This has to be done before lsm is freed below, because
+	 * cl_object still uses inode lsm.
+	 */
+	cl_inode_fini(inode);
+	lli->lli_has_smd = false;
+
+	EXIT;
+}
+
+int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data,
+		  struct md_open_data **mod)
+{
+	struct lustre_md md;
+	struct inode *inode = dentry->d_inode;
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct ptlrpc_request *request = NULL;
+	int rc, ia_valid;
+	ENTRY;
+
+	op_data = ll_prep_md_op_data(op_data, inode, NULL, NULL, 0, 0,
+				     LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+
+	rc = md_setattr(sbi->ll_md_exp, op_data, NULL, 0, NULL, 0,
+			&request, mod);
+	if (rc) {
+		ptlrpc_req_finished(request);
+		if (rc == -ENOENT) {
+			clear_nlink(inode);
+			/* Unlinked special device node? Or just a race?
+			 * Pretend we have done everything. */
+			if (!S_ISREG(inode->i_mode) &&
+			    !S_ISDIR(inode->i_mode)) {
+				ia_valid = op_data->op_attr.ia_valid;
+				op_data->op_attr.ia_valid &= ~TIMES_SET_FLAGS;
+				rc = simple_setattr(dentry, &op_data->op_attr);
+				op_data->op_attr.ia_valid = ia_valid;
+			}
+		} else if (rc != -EPERM && rc != -EACCES && rc != -ETXTBSY) {
+			CERROR("md_setattr fails: rc = %d\n", rc);
+		}
+		RETURN(rc);
+	}
+
+	rc = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp,
+			      sbi->ll_md_exp, &md);
+	if (rc) {
+		ptlrpc_req_finished(request);
+		RETURN(rc);
+	}
+
+	ia_valid = op_data->op_attr.ia_valid;	/* restored after the call */
+	/* inode size will be set in ll_setattr_ost; can't do it now since the
+	 * dirty cache is not cleared yet. */
+	op_data->op_attr.ia_valid &= ~(TIMES_SET_FLAGS | ATTR_SIZE);
+	rc = simple_setattr(dentry, &op_data->op_attr);
+	op_data->op_attr.ia_valid = ia_valid;
+
+	/* Extract epoch data if obtained. */
+	op_data->op_handle = md.body->handle;
+	op_data->op_ioepoch = md.body->ioepoch;
+
+	ll_update_inode(inode, &md);
+	ptlrpc_req_finished(request);
+
+	RETURN(rc);
+}
+
+/* Close IO epoch and send Size-on-MDS attribute update. */
+static int ll_setattr_done_writing(struct inode *inode,
+				   struct md_op_data *op_data,
+				   struct md_open_data *mod)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(op_data != NULL);
+	if (!S_ISREG(inode->i_mode))	/* regular files only */
+		RETURN(0);
+
+	CDEBUG(D_INODE, "Epoch "LPU64" closed on "DFID" for truncate\n",
+	       op_data->op_ioepoch, PFID(&lli->lli_fid));
+
+	op_data->op_flags = MF_EPOCH_CLOSE;
+	ll_done_writing_attr(inode, op_data);
+	ll_pack_inode2opdata(inode, op_data, NULL);
+
+	rc = md_done_writing(ll_i2sbi(inode)->ll_md_exp, op_data, mod);
+	if (rc == -EAGAIN) {
+		/* MDS has instructed us to obtain Size-on-MDS attribute
+		 * from OSTs and send setattr back to MDS. */
+		rc = ll_som_update(inode, op_data);
+	} else if (rc) {
+		CERROR("inode %lu mdc truncate failed: rc = %d\n",
+		       inode->i_ino, rc);
+	}
+	RETURN(rc);
+}
+
+static int ll_setattr_ost(struct inode *inode, struct iattr *attr)
+{
+	struct obd_capa *capa;
+	int rc;
+
+	if (attr->ia_valid & ATTR_SIZE)	/* truncate: use an OSS capa */
+		capa = ll_osscapa_get(inode, CAPA_OPC_OSS_TRUNC);
+	else
+		capa = ll_mdscapa_get(inode);
+
+	rc = cl_setattr_ost(inode, attr, capa);
+
+	if (attr->ia_valid & ATTR_SIZE)	/* release must match the get above */
+		ll_truncate_free_capa(capa);
+	else
+		capa_put(capa);
+
+	return rc;
+}
+
+
+/* If this inode has objects allocated to it (lsm != NULL), then the OST
+ * object(s) determine the file size and mtime.  Otherwise, the MDS will
+ * keep these values until such a time that objects are allocated for it.
+ * We do the MDS operations first, as it is checking permissions for us.
+ * We don't do the MDS RPC if there is nothing that we want to store there,
+ * otherwise there is no harm in updating mtime/atime on the MDS if we are
+ * going to do an RPC anyway.
+ *
+ * If we are doing a truncate, we will send the mtime and ctime updates
+ * to the OST with the punch RPC, otherwise we do an explicit setattr RPC.
+ * I don't believe it is possible to get e.g. ATTR_MTIME_SET and ATTR_SIZE
+ * at the same time.
+ */
+int ll_setattr_raw(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct md_op_data *op_data = NULL;
+	struct md_open_data *mod = NULL;
+	int rc = 0, rc1 = 0;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "%s: setattr inode %p/fid:"DFID" from %llu to %llu, "
+		"valid %x\n", ll_get_fsname(inode->i_sb, NULL, 0), inode,
+		PFID(&lli->lli_fid), i_size_read(inode), attr->ia_size,
+		attr->ia_valid);
+
+	if (attr->ia_valid & ATTR_SIZE) {
+		/* Check new size against VFS/VM file size limit and rlimit */
+		rc = inode_newsize_ok(inode, attr->ia_size);
+		if (rc)
+			RETURN(rc);
+
+		/* The maximum Lustre file size is variable, based on the
+		 * OST maximum object size and number of stripes.  This
+		 * needs another check in addition to the VFS check above. */
+		if (attr->ia_size > ll_file_maxbytes(inode)) {
+			CDEBUG(D_INODE,"file "DFID" too large %llu > "LPU64"\n",
+			       PFID(&lli->lli_fid), attr->ia_size,
+			       ll_file_maxbytes(inode));
+			RETURN(-EFBIG);
+		}
+
+		attr->ia_valid |= ATTR_MTIME | ATTR_CTIME;
+	}
+
+	/* POSIX: check before ATTR_*TIME_SET set (from inode_change_ok) */
+	if (attr->ia_valid & TIMES_SET_FLAGS) {
+		if (current_fsuid() != inode->i_uid &&
+		    !cfs_capable(CFS_CAP_FOWNER))
+			RETURN(-EPERM);
+	}
+
+	/* We mark all of the fields "set" so MDS/OST does not re-set them */
+	if (attr->ia_valid & ATTR_CTIME) {
+		attr->ia_ctime = CFS_CURRENT_TIME;
+		attr->ia_valid |= ATTR_CTIME_SET;
+	}
+	if (!(attr->ia_valid & ATTR_ATIME_SET) &&
+	    (attr->ia_valid & ATTR_ATIME)) {
+		attr->ia_atime = CFS_CURRENT_TIME;
+		attr->ia_valid |= ATTR_ATIME_SET;
+	}
+	if (!(attr->ia_valid & ATTR_MTIME_SET) &&
+	    (attr->ia_valid & ATTR_MTIME)) {
+		attr->ia_mtime = CFS_CURRENT_TIME;
+		attr->ia_valid |= ATTR_MTIME_SET;
+	}
+
+	if (attr->ia_valid & (ATTR_MTIME | ATTR_CTIME))
+		CDEBUG(D_INODE, "setting mtime %lu, ctime %lu, now = %lu\n",
+		       LTIME_S(attr->ia_mtime), LTIME_S(attr->ia_ctime),
+		       cfs_time_current_sec());
+
+	/* If we are changing file size, file content is modified, flag it. */
+	if (attr->ia_valid & ATTR_SIZE) {
+		attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE;
+		spin_lock(&lli->lli_lock);
+		lli->lli_flags |= LLIF_DATA_MODIFIED;
+		spin_unlock(&lli->lli_lock);
+	}
+
+	/* We always do an MDS RPC, even if we're only changing the size;
+	 * only the MDS knows whether truncate() should fail with -ETXTBSY */
+
+	OBD_ALLOC_PTR(op_data);
+	if (op_data == NULL)
+		RETURN(-ENOMEM);
+
+	if (!S_ISDIR(inode->i_mode)) {
+		if (attr->ia_valid & ATTR_SIZE)
+			inode_dio_write_done(inode);
+		mutex_unlock(&inode->i_mutex);	/* retaken at out: below */
+		down_write(&lli->lli_trunc_sem);
+	}
+
+	memcpy(&op_data->op_attr, attr, sizeof(*attr));
+
+	/* Open epoch for truncate. */
+	if (exp_connect_som(ll_i2mdexp(inode)) &&
+	    (attr->ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MTIME_SET)))
+		op_data->op_flags = MF_EPOCH_OPEN;
+
+	rc = ll_md_setattr(dentry, op_data, &mod);
+	if (rc)
+		GOTO(out, rc);
+
+	/* RPC to MDT is sent (rc is 0 here), cancel data modification flag */
+	if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
+		spin_lock(&lli->lli_lock);
+		lli->lli_flags &= ~LLIF_DATA_MODIFIED;
+		spin_unlock(&lli->lli_lock);
+	}
+
+	ll_ioepoch_open(lli, op_data->op_ioepoch);
+	if (!S_ISREG(inode->i_mode))
+		GOTO(out, rc = 0);	/* no OST step for non-regular */
+
+	if (attr->ia_valid & (ATTR_SIZE |
+			      ATTR_ATIME | ATTR_ATIME_SET |
+			      ATTR_MTIME | ATTR_MTIME_SET))
+		/* For truncate and utimes sending attributes to OSTs, setting
+		 * mtime/atime to the past will be performed under PW [0:EOF]
+		 * extent lock (new_size:EOF for truncate).  It may seem
+		 * excessive to send mtime/atime updates to OSTs when not
+		 * setting times to past, but it is necessary due to possible
+		 * time de-synchronization between MDT inode and OST objects */
+		rc = ll_setattr_ost(inode, attr);
+	EXIT;
+out:
+	if (op_data) {
+		if (op_data->op_ioepoch) {
+			rc1 = ll_setattr_done_writing(inode, op_data, mod);
+			if (!rc)	/* keep the first error */
+				rc = rc1;
+		}
+		ll_finish_md_op_data(op_data);
+	}
+	if (!S_ISDIR(inode->i_mode)) {
+		up_write(&lli->lli_trunc_sem);
+		mutex_lock(&inode->i_mutex);
+		if (attr->ia_valid & ATTR_SIZE)
+			inode_dio_wait(inode);
+	}
+
+	ll_stats_ops_tally(ll_i2sbi(inode), (attr->ia_valid & ATTR_SIZE) ?
+			LPROC_LL_TRUNC : LPROC_LL_SETATTR, 1);
+
+	return rc;
+}
+
+int ll_setattr(struct dentry *de, struct iattr *attr)
+{
+	/* Current mode of the inode, and whether the mode carried by @attr
+	 * lacks the setuid bit / the setgid bit of a group-executable file. */
+	int mode = de->d_inode->i_mode;
+	int drop_suid = (mode & S_ISUID) && !(attr->ia_mode & S_ISUID);
+	int drop_sgid = ((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) &&
+			!(attr->ia_mode & S_ISGID);
+
+	/* ctime, size and mode all requested: tag with the owner override */
+	if ((attr->ia_valid & (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) ==
+	    (ATTR_CTIME|ATTR_SIZE|ATTR_MODE))
+		attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE;
+
+	/* size + mode without ATTR_FORCE while losing suid/sgid: force it */
+	if ((drop_suid || drop_sgid) &&
+	    (attr->ia_valid & (ATTR_MODE|ATTR_FORCE|ATTR_SIZE)) ==
+	    (ATTR_SIZE|ATTR_MODE))
+		attr->ia_valid |= ATTR_FORCE;
+
+	if (drop_suid)
+		attr->ia_valid |= ATTR_KILL_SUID;
+	if (drop_sgid)
+		attr->ia_valid |= ATTR_KILL_SGID;
+
+	return ll_setattr_raw(de, attr);
+}
+
+int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs,
+		       __u64 max_age, __u32 flags)
+{
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	struct obd_statfs ost_sfs;
+	int rc;
+	ENTRY;
+
+	/* Ask the metadata export first; its answer lands directly in @osfs. */
+	rc = obd_statfs(NULL, sbi->ll_md_exp, osfs, max_age, flags);
+	if (rc != 0) {
+		CERROR("md_statfs fails: rc = %d\n", rc);
+		RETURN(rc);
+	}
+
+	osfs->os_type = sb->s_magic;
+
+	CDEBUG(D_SUPER, "MDC blocks "LPU64"/"LPU64" objects "LPU64"/"LPU64"\n",
+	       osfs->os_bavail, osfs->os_blocks, osfs->os_ffree,osfs->os_files);
+
+	/* Then the data export, without delay on "lazystatfs" mounts. */
+	if (sbi->ll_flags & LL_SBI_LAZYSTATFS)
+		flags |= OBD_STATFS_NODELAY;
+
+	rc = obd_statfs_rqset(sbi->ll_dt_exp, &ost_sfs, max_age, flags);
+	if (rc != 0) {
+		CERROR("obd_statfs fails: rc = %d\n", rc);
+		RETURN(rc);
+	}
+
+	CDEBUG(D_SUPER, "OSC blocks "LPU64"/"LPU64" objects "LPU64"/"LPU64"\n",
+	       ost_sfs.os_bavail, ost_sfs.os_blocks, ost_sfs.os_ffree,
+	       ost_sfs.os_files);
+
+	/* Block accounting always comes from the data side. */
+	osfs->os_bsize  = ost_sfs.os_bsize;
+	osfs->os_blocks = ost_sfs.os_blocks;
+	osfs->os_bfree  = ost_sfs.os_bfree;
+	osfs->os_bavail = ost_sfs.os_bavail;
+
+	/* Fewer free OST objects than free MDS inodes: shrink the inode
+	 * total by the difference so that "inodes in use" stays correct. */
+	if (ost_sfs.os_ffree < osfs->os_ffree) {
+		osfs->os_files -= osfs->os_ffree - ost_sfs.os_ffree;
+		osfs->os_ffree = ost_sfs.os_ffree;
+	}
+
+	RETURN(rc);
+}
+int ll_statfs(struct dentry *de, struct kstatfs *sfs)
+{
+	struct super_block *sb = de->d_sb;
+	struct obd_statfs osfs;
+	int rc;
+
+	CDEBUG(D_VFSTRACE, "VFS Op: at "LPU64" jiffies\n", get_jiffies_64());
+	ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_STAFS, 1);
+
+	/* The client is allowed to answer from a somewhat stale cache. */
+	rc = ll_statfs_internal(sb, &osfs,
+				cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+				0);
+	if (rc != 0)
+		return rc;
+
+	statfs_unpack(sfs, &osfs);
+
+	/* On kernels where long is narrower than 64 bits we cannot tell
+	 * whether we were entered through sys_statfs64(), so trade block
+	 * count for block size: halve the counters and double f_bsize while
+	 * os_blocks does not fit in an unsigned long.  Stop before f_bsize
+	 * would overflow; an old sys_statfs() caller then risks EOVERFLOW. */
+	while (sizeof(long) < 8 &&
+	       osfs.os_blocks > ~0UL && sfs->f_bsize < 0x40000000) {
+		osfs.os_blocks >>= 1;
+		osfs.os_bfree >>= 1;
+		osfs.os_bavail >>= 1;
+		sfs->f_bsize <<= 1;
+	}
+
+	/* statfs_unpack() ran before the shifting; store the final counts. */
+	sfs->f_blocks = osfs.os_blocks;
+	sfs->f_bfree = osfs.os_bfree;
+	sfs->f_bavail = osfs.os_bavail;
+
+	return 0;
+}
+
+void ll_inode_size_lock(struct inode *inode)
+{
+	struct ll_inode_info *info = ll_i2info(inode);
+
+	/* Not for directories, and not recursive: the owner is tracked. */
+	LASSERT(!S_ISDIR(inode->i_mode));
+	LASSERT(info->lli_size_sem_owner != current);
+
+	down(&info->lli_size_sem);
+	LASSERT(info->lli_size_sem_owner == NULL);
+	info->lli_size_sem_owner = current;
+}
+
+void ll_inode_size_unlock(struct inode *inode)
+{
+	struct ll_inode_info *info = ll_i2info(inode);
+
+	/* Only the task recorded by ll_inode_size_lock() may release. */
+	LASSERT(info->lli_size_sem_owner == current);
+	info->lli_size_sem_owner = NULL;
+	up(&info->lli_size_sem);
+}
+
+/* Refresh the VFS inode and the Lustre inode info of @inode from the MDS
+ * reply in @md.  Fields are copied only when flagged in md->body->valid,
+ * except i_ino, i_generation and i_blkbits, which are always recomputed. */
+void ll_update_inode(struct inode *inode, struct lustre_md *md)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct mdt_body *body = md->body;
+	struct lov_stripe_md *lsm = md->lsm;
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+
+	LASSERT ((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0));
+	if (lsm != NULL) {
+		if (!lli->lli_has_smd &&
+		    !(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
+			cl_file_inode_init(inode, md);
+		lli->lli_maxbytes = lsm->lsm_maxbytes;
+		if (lli->lli_maxbytes > MAX_LFS_FILESIZE)
+			lli->lli_maxbytes = MAX_LFS_FILESIZE;
+	}
+
+	if (sbi->ll_flags & LL_SBI_RMT_CLIENT) {
+		if (body->valid & OBD_MD_FLRMTPERM)
+			ll_update_remote_perm(inode, md->remote_perm);
+	}
+#ifdef CONFIG_FS_POSIX_ACL
+	else if (body->valid & OBD_MD_FLACL) {
+		spin_lock(&lli->lli_lock);
+		if (lli->lli_posix_acl)
+			posix_acl_release(lli->lli_posix_acl);
+		lli->lli_posix_acl = md->posix_acl;
+		spin_unlock(&lli->lli_lock);
+	}
+#endif
+	inode->i_ino = cl_fid_build_ino(&body->fid1, ll_need_32bit_api(sbi));
+	inode->i_generation = cl_fid_build_gen(&body->fid1);
+	/* VFS times only move forward; lvb copies always take the MDS value */
+	if (body->valid & OBD_MD_FLATIME) {
+		if (body->atime > LTIME_S(inode->i_atime))
+			LTIME_S(inode->i_atime) = body->atime;
+		lli->lli_lvb.lvb_atime = body->atime;
+	}
+	if (body->valid & OBD_MD_FLMTIME) {
+		if (body->mtime > LTIME_S(inode->i_mtime)) {
+			CDEBUG(D_INODE, "setting ino %lu mtime from %lu "
+			       "to "LPU64"\n", inode->i_ino,
+			       LTIME_S(inode->i_mtime), body->mtime);
+			LTIME_S(inode->i_mtime) = body->mtime;
+		}
+		lli->lli_lvb.lvb_mtime = body->mtime;
+	}
+	if (body->valid & OBD_MD_FLCTIME) {
+		if (body->ctime > LTIME_S(inode->i_ctime))
+			LTIME_S(inode->i_ctime) = body->ctime;
+		lli->lli_lvb.lvb_ctime = body->ctime;
+	}
+	if (body->valid & OBD_MD_FLMODE)
+		inode->i_mode = (inode->i_mode & S_IFMT)|(body->mode & ~S_IFMT);
+	if (body->valid & OBD_MD_FLTYPE)
+		inode->i_mode = (inode->i_mode & ~S_IFMT)|(body->mode & S_IFMT);
+	LASSERT(inode->i_mode != 0);
+	if (S_ISREG(inode->i_mode))
+		inode->i_blkbits = min(PTLRPC_MAX_BRW_BITS + 1, LL_MAX_BLKSIZE_BITS);
+	else
+		inode->i_blkbits = inode->i_sb->s_blocksize_bits;
+	/* i_uid and i_gid are kuid_t/kgid_t: map the wire ids explicitly */
+	if (body->valid & OBD_MD_FLUID)
+		inode->i_uid = make_kuid(&init_user_ns, body->uid);
+	if (body->valid & OBD_MD_FLGID)
+		inode->i_gid = make_kgid(&init_user_ns, body->gid);
+	if (body->valid & OBD_MD_FLFLAGS)
+		inode->i_flags = ll_ext_to_inode_flags(body->flags);
+	if (body->valid & OBD_MD_FLNLINK)
+		set_nlink(inode, body->nlink);
+	if (body->valid & OBD_MD_FLRDEV)
+		inode->i_rdev = old_decode_dev(body->rdev);
+
+	if (body->valid & OBD_MD_FLID) {
+		/* FID shouldn't be changed! */
+		if (fid_is_sane(&lli->lli_fid)) {
+			LASSERTF(lu_fid_eq(&lli->lli_fid, &body->fid1),
+				 "Trying to change FID "DFID
+				 " to the "DFID", inode %lu/%u(%p)\n",
+				 PFID(&lli->lli_fid), PFID(&body->fid1),
+				 inode->i_ino, inode->i_generation, inode);
+		} else
+			lli->lli_fid = body->fid1;
+	}
+	LASSERT(fid_seq(&lli->lli_fid) != 0);
+
+	if (body->valid & OBD_MD_FLSIZE) {
+		if (exp_connect_som(ll_i2mdexp(inode)) &&
+		    S_ISREG(inode->i_mode)) {
+			struct lustre_handle lockh;
+			ldlm_mode_t mode;
+
+			/* As it is possible a blocking ast has been processed
+			 * by this time, we need to check there is an UPDATE
+			 * lock on the client and set LLIF_MDS_SIZE_LOCK holding
+			 * it. */
+			mode = ll_take_md_lock(inode, MDS_INODELOCK_UPDATE,
+					       &lockh, LDLM_FL_CBPENDING);
+			if (mode) {
+				if (lli->lli_flags & (LLIF_DONE_WRITING |
+						      LLIF_EPOCH_PENDING |
+						      LLIF_SOM_DIRTY)) {
+					CERROR("ino %lu flags %u still has "
+					       "size authority! do not trust "
+					       "the size got from MDS\n",
+					       inode->i_ino, lli->lli_flags);
+				} else {
+					/* Use old size assignment to avoid
+					 * deadlock bz14138 & bz14326 */
+					i_size_write(inode, body->size);
+					lli->lli_flags |= LLIF_MDS_SIZE_LOCK;
+				}
+				ldlm_lock_decref(&lockh, mode);
+			}
+		} else {
+			/* Use old size assignment to avoid
+			 * deadlock bz14138 & bz14326 */
+			i_size_write(inode, body->size);
+
+			CDEBUG(D_VFSTRACE, "inode=%lu, updating i_size %llu\n",
+			       inode->i_ino, (unsigned long long)body->size);
+		}
+		if (body->valid & OBD_MD_FLBLOCKS)
+			inode->i_blocks = body->blocks;
+	}
+
+	if (body->valid & OBD_MD_FLMDSCAPA) {
+		LASSERT(md->mds_capa);
+		ll_add_capa(inode, md->mds_capa);
+	}
+	if (body->valid & OBD_MD_FLOSSCAPA) {
+		LASSERT(md->oss_capa);
+		ll_add_capa(inode, md->oss_capa);
+	}
+}
+
+/* Fill in a freshly allocated inode from the MDS metadata passed as @opaque
+ * (a struct lustre_md) and install the inode/file/address-space operation
+ * tables that match its file type. */
+void ll_read_inode2(struct inode *inode, void *opaque)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct lustre_md *md = opaque;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
+	       PFID(&lli->lli_fid), inode);
+
+	LASSERT(!lli->lli_has_smd);
+
+	/* The VFS does not zero the times of a new inode, so start them at
+	 * zero here: ll_update_inode() only ever moves them forward, and the
+	 * MDS or OST attributes must be able to overwrite them. */
+	LTIME_S(inode->i_atime) = 0;
+	LTIME_S(inode->i_mtime) = 0;
+	LTIME_S(inode->i_ctime) = 0;
+	inode->i_rdev = 0;
+	ll_update_inode(inode, md);
+
+	/* Page cache of this inode is backed by the superblock's bdi. */
+	inode->i_mapping->backing_dev_info = &s2lsi(inode->i_sb)->lsi_bdi;
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFREG:
+		inode->i_op = &ll_file_inode_operations;
+		inode->i_fop = ll_i2sbi(inode)->ll_fop;
+		inode->i_mapping->a_ops =
+			(struct address_space_operations *)&ll_aops;
+		break;
+	case S_IFDIR:
+		inode->i_op = &ll_dir_inode_operations;
+		inode->i_fop = &ll_dir_operations;
+		break;
+	case S_IFLNK:
+		inode->i_op = &ll_fast_symlink_inode_operations;
+		break;
+	default:
+		/* device nodes, FIFOs and sockets */
+		inode->i_op = &ll_special_inode_operations;
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
+		break;
+	}
+
+	EXIT;
+}
+
+/* Discard everything cached for @inode and clear its Lustre and VFS state. */
+void ll_delete_inode(struct inode *inode)
+{
+	struct address_space *mapping = &inode->i_data;
+	struct cl_inode_info *lli = cl_i2info(inode);
+	ENTRY;
+
+	/* discard all dirty pages before truncating them, required by
+	 * osc_extent implementation at LU-1030. */
+	if (S_ISREG(inode->i_mode) && lli->lli_clob != NULL)
+		cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, CL_FSYNC_DISCARD);
+
+	truncate_inode_pages(mapping, 0);
+
+	/* Workaround for LU-118: cycle the tree lock, then recheck the count */
+	if (mapping->nrpages != 0) {
+		TREE_READ_LOCK_IRQ(mapping);
+		TREE_READ_UNLOCK_IRQ(mapping);
+		LASSERTF(mapping->nrpages == 0,
+			 "inode=%lu/%u(%p) nrpages=%lu, see "
+			 "http://jira.whamcloud.com/browse/LU-118\n",
+			 inode->i_ino, inode->i_generation, inode,
+			 mapping->nrpages);
+	}
+
+	ll_clear_inode(inode);
+	clear_inode(inode);
+	EXIT;
+}
+
+int ll_iocontrol(struct inode *inode, struct file *file,
+		 unsigned int cmd, unsigned long arg)
+{
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct ptlrpc_request *req = NULL;
+	int rc, flags = 0;
+	ENTRY;
+	/* Only the two inode-flag ioctls are handled; the rest get -ENOSYS. */
+	switch(cmd) {
+	case FSFILT_IOC_GETFLAGS: {
+		struct mdt_body *body;
+		struct md_op_data *op_data;
+
+		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
+					     0, 0, LUSTRE_OPC_ANY,
+					     NULL);
+		if (IS_ERR(op_data))
+			RETURN(PTR_ERR(op_data));
+		/* ask the metadata export for the flags attribute only */
+		op_data->op_valid = OBD_MD_FLFLAGS;
+		rc = md_getattr(sbi->ll_md_exp, op_data, &req);
+		ll_finish_md_op_data(op_data);
+		if (rc) {
+			CERROR("failure %d inode %lu\n", rc, inode->i_ino);
+			RETURN(-abs(rc));
+		}
+		/* NOTE(review): body is not NULL-checked before use - confirm OK */
+		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+
+		flags = body->flags;
+
+		ptlrpc_req_finished(req);
+
+		RETURN(put_user(flags, (int *)arg));
+	}
+	case FSFILT_IOC_SETFLAGS: {
+		struct lov_stripe_md *lsm;
+		struct obd_info oinfo = { { { 0 } } };
+		struct md_op_data *op_data;
+
+		if (get_user(flags, (int *)arg))
+			RETURN(-EFAULT);
+
+		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+					     LUSTRE_OPC_ANY, NULL);
+		if (IS_ERR(op_data))
+			RETURN(PTR_ERR(op_data));
+		/* update the MDS first; the local i_flags follow on success */
+		((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = flags;
+		op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG;
+		rc = md_setattr(sbi->ll_md_exp, op_data,
+				NULL, 0, NULL, 0, &req, NULL);
+		ll_finish_md_op_data(op_data);
+		ptlrpc_req_finished(req);
+		if (rc)
+			RETURN(rc);
+
+		inode->i_flags = ll_ext_to_inode_flags(flags);
+		/* files with stripe md also update the data export */
+		lsm = ccc_inode_lsm_get(inode);
+		if (lsm == NULL)
+			RETURN(0);
+
+		OBDO_ALLOC(oinfo.oi_oa);
+		if (!oinfo.oi_oa) {
+			ccc_inode_lsm_put(inode, lsm);
+			RETURN(-ENOMEM);
+		}
+		oinfo.oi_md = lsm;
+		oinfo.oi_oa->o_oi = lsm->lsm_oi;
+		oinfo.oi_oa->o_flags = flags;
+		oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS |
+				       OBD_MD_FLGROUP;
+		oinfo.oi_capa = ll_mdscapa_get(inode);
+		obdo_set_parent_fid(oinfo.oi_oa, &ll_i2info(inode)->lli_fid);
+		rc = obd_setattr_rqset(sbi->ll_dt_exp, &oinfo, NULL);
+		capa_put(oinfo.oi_capa);
+		OBDO_FREE(oinfo.oi_oa);
+		ccc_inode_lsm_put(inode, lsm);
+
+		if (rc && rc != -EPERM && rc != -EACCES)
+			CERROR("osc_setattr_async fails: rc = %d\n", rc);
+
+		RETURN(rc);
+	}
+	default:
+		RETURN(-ENOSYS);
+	}
+	/* NOTE(review): looks unreachable, every case above returns */
+	RETURN(0);
+}
+
+/* Send KEY_FLUSH_CTX to the metadata and data exports; always returns 0. */
+int ll_flush_ctx(struct inode *inode)
+{
+	struct ll_sb_info  *sbi = ll_i2sbi(inode);
+
+	/* current_uid() is a kuid_t; convert it before printing with %d */
+	CDEBUG(D_SEC, "flush context for user %d\n",
+	       from_kuid(&init_user_ns, current_uid()));
+	obd_set_info_async(NULL, sbi->ll_md_exp, sizeof(KEY_FLUSH_CTX),
+			   KEY_FLUSH_CTX, 0, NULL, NULL);
+	obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_FLUSH_CTX),
+			   KEY_FLUSH_CTX, 0, NULL, NULL);
+	return 0;
+}
+
+/* umount -f client means force down, don't save state */
+void ll_umount_begin(struct super_block *sb)
+{
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	struct obd_ioctl_data *ioc_data;
+	struct obd_device *dev;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op: superblock %p count %d active %d\n", sb,
+	       sb->s_count, atomic_read(&sb->s_active));
+
+	/* Flag the metadata device, then the data device, as being forced
+	 * down.  An export that maps to no device aborts right there. */
+	dev = class_exp2obd(sbi->ll_md_exp);
+	if (dev == NULL) {
+		CERROR("Invalid MDC connection handle "LPX64"\n",
+		       sbi->ll_md_exp->exp_handle.h_cookie);
+		EXIT;
+		return;
+	}
+	dev->obd_force = 1;
+
+	dev = class_exp2obd(sbi->ll_dt_exp);
+	if (dev == NULL) {
+		CERROR("Invalid LOV connection handle "LPX64"\n",
+		       sbi->ll_dt_exp->exp_handle.h_cookie);
+		EXIT;
+		return;
+	}
+	dev->obd_force = 1;
+
+	/* Best effort: without memory for the request the IOC_OSC_SET_ACTIVE
+	 * calls are skipped.  The freshly allocated request is passed as-is. */
+	OBD_ALLOC_PTR(ioc_data);
+	if (ioc_data != NULL) {
+		obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_md_exp,
+			      sizeof(*ioc_data), ioc_data, NULL);
+
+		obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_dt_exp,
+			      sizeof(*ioc_data), ioc_data, NULL);
+		OBD_FREE_PTR(ioc_data);
+	}
+
+	/* Really, we'd like to wait until there are no requests outstanding,
+	 * and then continue.  For now we only give up the CPU once with
+	 * schedule() after invalidating the requests, and hope. */
+	schedule();
+
+	EXIT;
+}
+
+/* Remount: only a read-only <-> read-write switch is acted upon.  The new
+ * state goes to the metadata export first; s_flags follows on success. */
+int ll_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	char *profilenm = get_profile_name(sb);
+	__u32 read_only = *flags & MS_RDONLY;
+	const char *state = read_only ? "read-only" : "read-write";
+	int err;
+
+	if (read_only == (sb->s_flags & MS_RDONLY))
+		return 0;
+
+	err = obd_set_info_async(NULL, sbi->ll_md_exp, sizeof(KEY_READ_ONLY),
+				 KEY_READ_ONLY, sizeof(read_only), &read_only,
+				 NULL);
+	if (err != 0) {
+		LCONSOLE_WARN("Failed to remount %s %s (%d)\n",
+			      profilenm, state, err);
+		return err;
+	}
+
+	if (read_only)
+		sb->s_flags |= MS_RDONLY;
+	else
+		sb->s_flags &= ~MS_RDONLY;
+	if (sbi->ll_flags & LL_SBI_VERBOSE)
+		LCONSOLE_WARN("Remounted %s %s\n", profilenm, state);
+
+	return 0;
+}
+
+int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req,
+		  struct super_block *sb, struct lookup_intent *it)
+{
+	struct ll_sb_info *sbi = NULL;
+	struct lustre_md md;
+	int rc;
+	ENTRY;
+	/* Unpack @req into md, then update *inode or create it via ll_iget() */
+	LASSERT(*inode || sb);
+	sbi = sb ? ll_s2sbi(sb) : ll_i2sbi(*inode);
+	rc = md_get_lustre_md(sbi->ll_md_exp, req, sbi->ll_dt_exp,
+			      sbi->ll_md_exp, &md);
+	if (rc)
+		RETURN(rc);
+
+	if (*inode) {
+		ll_update_inode(*inode, &md);
+	} else {
+		LASSERT(sb != NULL);
+
+		/*
+		 * At this point the server returns the same fid that the
+		 * client generated for the create, so using ->fid1 is okay.
+		 */
+		LASSERT(fid_is_sane(&md.body->fid1));
+
+		*inode = ll_iget(sb, cl_fid_build_ino(&md.body->fid1,
+						      ll_need_32bit_api(sbi)),
+				 &md);
+		if (*inode == NULL || IS_ERR(*inode)) {
+#ifdef CONFIG_FS_POSIX_ACL
+			if (md.posix_acl) {
+				posix_acl_release(md.posix_acl);
+				md.posix_acl = NULL;
+			}
+#endif
+			rc = IS_ERR(*inode) ? PTR_ERR(*inode) : -ENOMEM;
+			*inode = NULL;
+			CERROR("new_inode -fatal: rc %d\n", rc);
+			GOTO(out, rc);
+		}
+	}
+
+	/* Handling piggyback layout lock.
+	 * Layout lock can be piggybacked by getattr and open request.
+	 * The lsm can be applied to inode only if it comes with a layout lock
+	 * otherwise correct layout may be overwritten, for example:
+	 * 1. proc1: mdt returns a lsm but not granting layout
+	 * 2. layout was changed by another client
+	 * 3. proc2: refresh layout and layout lock granted
+	 * 4. proc1: to apply a stale layout */
+	if (it != NULL && it->d.lustre.it_lock_mode != 0) {
+		struct lustre_handle lockh;
+		struct ldlm_lock *lock;
+
+		lockh.cookie = it->d.lustre.it_lock_handle;
+		lock = ldlm_handle2lock(&lockh);
+		LASSERT(lock != NULL);
+		if (ldlm_has_layout(lock)) {
+			struct cl_object_conf conf;
+
+			memset(&conf, 0, sizeof(conf));
+			conf.coc_opc = OBJECT_CONF_SET;
+			conf.coc_inode = *inode;
+			conf.coc_lock = lock;
+			conf.u.coc_md = &md;
+			(void)ll_layout_conf(*inode, &conf);
+		}
+		LDLM_LOCK_PUT(lock);
+	}
+	/* reached on success and on ll_iget() failure alike: release md */
+out:
+	if (md.lsm != NULL)
+		obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
+	md_free_lustre_md(sbi->ll_md_exp, &md);
+	RETURN(rc);
+}
+
+/* ioctl helper: IOC_OBD_STATFS on the export picked by LL_STATFS_* in @arg */
+int ll_obd_statfs(struct inode *inode, void *arg)
+{
+	struct obd_ioctl_data *data;
+	struct ll_sb_info *sbi = inode != NULL ? ll_i2sbi(inode) : NULL;
+	struct obd_export *exp;
+	char *buf = NULL;
+	__u32 type, flags;
+	int len = 0, rc;
+
+	if (sbi == NULL)
+		GOTO(out_statfs, rc = -EINVAL);
+
+	rc = obd_ioctl_getdata(&buf, &len, arg);
+	if (rc != 0)
+		GOTO(out_statfs, rc);
+
+	/* All four buffers must be present and have exactly the expected sizes */
+	data = (void *)buf;
+	if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2 ||
+	    !data->ioc_pbuf1 || !data->ioc_pbuf2)
+		GOTO(out_statfs, rc = -EINVAL);
+	if (data->ioc_inllen1 != sizeof(__u32) ||
+	    data->ioc_inllen2 != sizeof(__u32) ||
+	    data->ioc_plen1 != sizeof(struct obd_statfs) ||
+	    data->ioc_plen2 != sizeof(struct obd_uuid))
+		GOTO(out_statfs, rc = -EINVAL);
+
+	memcpy(&type, data->ioc_inlbuf1, sizeof(type));
+	if (type & LL_STATFS_LMV)
+		exp = sbi->ll_md_exp;
+	else if (type & LL_STATFS_LOV)
+		exp = sbi->ll_dt_exp;
+	else
+		GOTO(out_statfs, rc = -ENODEV);
+
+	flags = (type & LL_STATFS_NODELAY) ? OBD_STATFS_NODELAY : 0;
+	rc = obd_iocontrol(IOC_OBD_STATFS, exp, len, buf, &flags);
+	if (rc != 0)
+		GOTO(out_statfs, rc);
+out_statfs:
+	if (buf != NULL)
+		obd_ioctl_freedata(buf, len);
+	return rc;
+}
+
+/* Apply a PARAM_LLITE configuration record to the superblock whose address
+ * is encoded, in hex, after the last '-' of the record's first string. */
+int ll_process_config(struct lustre_cfg *lcfg)
+{
+	struct lprocfs_static_vars lvars;
+	unsigned long addr;
+	char *suffix;
+	void *sb;
+	int rc;
+
+	lprocfs_llite_init_vars(&lvars);
+
+	/* The instance name contains the sb: lustre-client-aacfe000 */
+	suffix = strrchr(lustre_cfg_string(lcfg, 0), '-');
+	if (suffix == NULL || *++suffix == '\0' ||
+	    sscanf(suffix, "%lx", &addr) != 1)
+		return -EINVAL;
+
+	/* This better be a real Lustre superblock! */
+	sb = (void *)addr;
+	LASSERT(s2lsi((struct super_block *)sb)->lsi_lmd->lmd_magic == LMD_MAGIC);
+
+	/* Note we have not called client_common_fill_super yet, so
+	   proc fns must be able to handle that! */
+	rc = class_process_proc_param(PARAM_LLITE, lvars.obd_vars, lcfg, sb);
+	/* a positive result is reported as plain success */
+	return rc > 0 ? 0 : rc;
+}
+
+/* Prepare the md_op_data hint that is passed down to the MD stack.  A NULL
+ * @op_data is allocated here.  Returns the filled structure or an ERR_PTR
+ * (-ENAMETOOLONG, -ENOMEM); release it with ll_finish_md_op_data(). */
+struct md_op_data * ll_prep_md_op_data(struct md_op_data *op_data,
+				       struct inode *i1, struct inode *i2,
+				       const char *name, int namelen,
+				       int mode, __u32 opc, void *data)
+{
+	LASSERT(i1 != NULL);
+	if (namelen > ll_i2sbi(i1)->ll_namelen)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	if (op_data == NULL)
+		OBD_ALLOC_PTR(op_data);
+	if (op_data == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	ll_i2gids(op_data->op_suppgids, i1, i2);
+	op_data->op_fid1 = *ll_inode2fid(i1);
+	op_data->op_capa1 = ll_mdscapa_get(i1);
+	if (i2) {
+		op_data->op_fid2 = *ll_inode2fid(i2);
+		op_data->op_capa2 = ll_mdscapa_get(i2);
+	} else {
+		fid_zero(&op_data->op_fid2);
+		op_data->op_capa2 = NULL;
+	}
+
+	op_data->op_name = name;
+	op_data->op_namelen = namelen;
+	op_data->op_mode = mode;
+	op_data->op_mod_time = cfs_time_current_sec();
+	/* op_fsuid/op_fsgid carry plain numeric ids, not kuid_t/kgid_t */
+	op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid());
+	op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid());
+	op_data->op_cap = cfs_curproc_cap_pack();
+	op_data->op_bias = 0;
+	op_data->op_cli_flags = 0;
+	if ((opc == LUSTRE_OPC_CREATE) && (name != NULL) &&
+	     filename_is_volatile(name, namelen, NULL))
+		op_data->op_bias |= MDS_CREATE_VOLATILE;
+	op_data->op_opc = opc;
+	op_data->op_mds = 0;
+	op_data->op_data = data;
+
+	/* If the file is being opened after mknod() (normally due to NFS)
+	 * try to use the default stripe data from parent directory for
+	 * allocating OST objects.  Try to pass the parent FID to MDS. */
+	if (opc == LUSTRE_OPC_CREATE && i1 == i2 && S_ISREG(i2->i_mode) &&
+	    !ll_i2info(i2)->lli_has_smd) {
+		struct ll_inode_info *lli = ll_i2info(i2);
+
+		spin_lock(&lli->lli_lock);
+		if (likely(!lli->lli_has_smd && !fid_is_zero(&lli->lli_pfid)))
+			op_data->op_fid1 = lli->lli_pfid;
+		spin_unlock(&lli->lli_lock);
+		/* The parent's capability is ignored for now. */
+	}
+
+	/* When called by ll_setattr_raw, file is i1. */
+	if (LLIF_DATA_MODIFIED & ll_i2info(i1)->lli_flags)
+		op_data->op_bias |= MDS_DATA_MODIFIED;
+
+	return op_data;
+}
+
+void ll_finish_md_op_data(struct md_op_data *op_data)
+{
+	capa_put(op_data->op_capa2);	/* NULL if prepared without an i2 */
+	capa_put(op_data->op_capa1);	/* taken with ll_mdscapa_get(i1) */
+	OBD_FREE_PTR(op_data);
+}
+
+/* Append the mount options that are currently set in the superblock flags
+ * to @seq, each one preceded by a comma.  Only the flags tested below are
+ * reported.  Always returns 0. */
+int ll_show_options(struct seq_file *seq, struct dentry *dentry)
+{
+	struct ll_sb_info *sbi;
+
+	LASSERT((seq != NULL) && (dentry != NULL));
+	sbi = ll_s2sbi(dentry->d_sb);
+
+	if (sbi->ll_flags & LL_SBI_NOLCK)
+		seq_puts(seq, ",nolock");
+	if (sbi->ll_flags & LL_SBI_FLOCK)
+		seq_puts(seq, ",flock");
+	if (sbi->ll_flags & LL_SBI_LOCALFLOCK)
+		seq_puts(seq, ",localflock");
+	if (sbi->ll_flags & LL_SBI_USER_XATTR)
+		seq_puts(seq, ",user_xattr");
+	if (sbi->ll_flags & LL_SBI_LAZYSTATFS)
+		seq_puts(seq, ",lazystatfs");
+	if (sbi->ll_flags & LL_SBI_USER_FID2PATH)
+		seq_puts(seq, ",user_fid2path");
+
+	/* No ENTRY above, so return plainly instead of through RETURN(),
+	 * the exit half of the ENTRY/RETURN tracing pair. */
+	return 0;
+}
+
+/**
+ * Copy the name of the data (OBD_IOC_GETDTNAME) or metadata
+ * (OBD_IOC_GETMDNAME) obd device, NUL included, to user space at \a arg.
+ * Returns 0, -EINVAL (other \a cmd), -ENOENT (no device) or -EFAULT.
+ */
+int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg)
+{
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct obd_device *obd;
+	ENTRY;
+
+	if (cmd != OBD_IOC_GETDTNAME && cmd != OBD_IOC_GETMDNAME)
+		RETURN(-EINVAL);
+
+	obd = class_exp2obd(cmd == OBD_IOC_GETDTNAME ? sbi->ll_dt_exp :
+						       sbi->ll_md_exp);
+	if (obd == NULL)
+		RETURN(-ENOENT);
+
+	if (copy_to_user((void *)arg, obd->obd_name,
+			 strlen(obd->obd_name) + 1))
+		RETURN(-EFAULT);
+
+	RETURN(0);
+}
+
+/**
+ * Get lustre file system name by \a sb: the mount profile name with a
+ * trailing "-client" removed.  If \a buf is provided(non-NULL), the fsname
+ * is returned in it, truncated to \a buflen - 1 characters; otherwise a
+ * shared static buffer is used to store the fsname and is returned to the
+ * caller.
+ */
+char *ll_get_fsname(struct super_block *sb, char *buf, int buflen)
+{
+	static char fsname_static[MTI_NAME_MAXLEN];
+	const char *profile = s2lsi(sb)->lsi_lmd->lmd_profile;
+	const char *dash = strrchr(profile, '-');
+	int len = strlen(profile);
+
+	/* NULL means the caller wants the static buffer and does not care
+	 * about races on it; usually that is an error reporting path. */
+	if (buf == NULL) {
+		buf = fsname_static;
+		buflen = sizeof(fsname_static);
+	}
+
+	/* a trailing "-client" (7 characters) is not part of the fsname */
+	if (dash != NULL && strcmp(dash, "-client") == 0)
+		len -= 7;
+
+	if (unlikely(len >= buflen))
+		len = buflen - 1;
+	strncpy(buf, profile, len);
+	buf[len] = '\0';
+
+	return buf;
+}
+
+/* Build in @buf the path of @dentry as seen under the mount of the current
+ * task's root directory; returns whatever d_path() returns. */
+static char *ll_d_path(struct dentry *dentry, char *buf, int bufsize)
+{
+	struct path where = { .mnt = current->fs->root.mnt, .dentry = dentry };
+	char *name;
+
+	/* hold the vfsmount/dentry pair while d_path() uses it */
+	path_get(&where);
+	name = d_path(&where, buf, bufsize);
+	path_put(&where);
+
+	return name;
+}
+
+/* Warn that a dirty page is discarded, naming the file by fid and path. */
+void ll_dirty_page_discard_warn(struct page *page, int ioret)
+{
+	struct inode *inode = page->mapping->host;
+	struct ccc_object *obj = cl_inode2ccc(inode);
+	struct dentry *alias = NULL;
+	char *buf, *path = NULL;
+
+	/* this can be called inside spin lock so use GFP_ATOMIC. */
+	buf = (char *)__get_free_page(GFP_ATOMIC);
+	if (buf != NULL)
+		alias = d_find_alias(inode);
+	if (alias != NULL)
+		path = ll_d_path(alias, buf, PAGE_SIZE);
+
+	CWARN("%s: dirty page discard: %s/fid: "DFID"/%s may get corrupted "
+	      "(rc %d)\n", ll_get_fsname(inode->i_sb, NULL, 0),
+	      s2lsi(inode->i_sb)->lsi_lmd->lmd_dev,
+	      PFID(&obj->cob_header.coh_lu.loh_fid),
+	      (path && !IS_ERR(path)) ? path : "", ioret);
+
+	if (alias != NULL)
+		dput(alias);
+	if (buf != NULL)
+		free_page((unsigned long)buf);
+}
diff --git a/drivers/staging/lustre/lustre/llite/llite_mmap.c b/drivers/staging/lustre/lustre/llite/llite_mmap.c
new file mode 100644
index 000000000000..d9590d85634a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/llite_mmap.c
@@ -0,0 +1,507 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/version.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <lustre_lite.h>
+#include "llite_internal.h"
+#include <linux/lustre_compat25.h>
+
+struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
+		       int *type);	/* NOTE(review): not defined in this file */
+/* forward declaration: our_vma() compares against it before the definition */
+static struct vm_operations_struct ll_file_vm_ops;
+
+void policy_from_vma(ldlm_policy_data_t *policy, struct vm_area_struct *vma,
+		     unsigned long addr, size_t count)
+{
+	/* widen vm_pgoff before shifting so 32-bit builds don't truncate */
+	policy->l_extent.start = ((addr - vma->vm_start) & CFS_PAGE_MASK) +
+				 ((__u64)vma->vm_pgoff << PAGE_CACHE_SHIFT);
+	policy->l_extent.end = (policy->l_extent.start + count - 1) |
+			       ~CFS_PAGE_MASK;	/* last byte of the end page */
+}
+
+struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr,
+			       size_t count)
+{
+	struct vm_area_struct *cur, *found = NULL;
+	ENTRY;
+
+	/* the caller holds mmap_sem, so a write trylock has to fail */
+	LASSERT(!down_write_trylock(&mm->mmap_sem));
+
+	cur = find_vma(mm, addr);
+	while (found == NULL && cur != NULL && cur->vm_start < addr + count) {
+		/* only shared mappings using this file's vm_ops qualify */
+		if (cur->vm_ops == &ll_file_vm_ops &&
+		    (cur->vm_flags & VM_SHARED))
+			found = cur;
+		cur = cur->vm_next;
+	}
+	RETURN(found);
+}
+
+/**
+ * API independent part for page fault initialization.
+ * \param vma - virtual memory area the fault is addressed to
+ * \param env_ret - out: lu_env obtained for the fault, NULL on early failure
+ * \param nest - nesting state handed to cl_env_nested_get()
+ * \param index - page index corresponding to the fault
+ * \param ra_flags - out, may be NULL: the vma's original readahead flags
+ *
+ * \return the cl_io of the env, set up for CIT_FAULT; it is returned whether
+ *	   or not cl_io_init() succeeded, callers check io->ci_result
+ * \retval ERR_PTR(-EOPNOTSUPP) for nolock files, ERR_PTR(-EINVAL) without env
+ */
+struct cl_io *ll_fault_io_init(struct vm_area_struct *vma,
+			       struct lu_env **env_ret,
+			       struct cl_env_nest *nest,
+			       pgoff_t index, unsigned long *ra_flags)
+{
+	struct file       *file  = vma->vm_file;
+	struct inode      *inode = file->f_dentry->d_inode;
+	struct cl_io      *io;
+	struct cl_fault_io *fio;
+	struct lu_env     *env;
+	ENTRY;
+
+	*env_ret = NULL;	/* stays NULL on the early error returns below */
+	if (ll_file_nolock(file))
+		RETURN(ERR_PTR(-EOPNOTSUPP));
+
+	/*
+	 * page fault can be called when lustre IO is
+	 * already active for the current thread, e.g., when doing read/write
+	 * against user level buffer mapped from Lustre buffer. To avoid
+	 * stomping on existing context, optionally force an allocation of a new
+	 * one.
+	 */
+	env = cl_env_nested_get(nest);
+	if (IS_ERR(env))
+		 RETURN(ERR_PTR(-EINVAL));	/* error in env is not passed on */
+
+	*env_ret = env;
+
+	io = ccc_env_thread_io(env);
+	io->ci_obj = ll_i2info(inode)->lli_clob;
+	LASSERT(io->ci_obj != NULL);
+
+	fio = &io->u.ci_fault;
+	fio->ft_index      = index;
+	fio->ft_executable = vma->vm_flags&VM_EXEC;
+
+	/*
+	 * disable VM_SEQ_READ and use VM_RAND_READ to make sure that
+	 * the kernel will not read other pages not covered by ldlm in
+	 * filemap_nopage. we do our readahead in ll_readpage.
+	 */
+	if (ra_flags != NULL)
+		*ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ);
+	vma->vm_flags &= ~VM_SEQ_READ;
+	vma->vm_flags |= VM_RAND_READ;
+
+	CDEBUG(D_MMAP, "vm_flags: %lx (%lu %d)\n", vma->vm_flags,
+	       fio->ft_index, fio->ft_executable);
+
+	if (cl_io_init(env, io, CIT_FAULT, io->ci_obj) == 0) {
+		struct ccc_io *cio = ccc_env_io(env);
+		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+		LASSERT(cio->cui_cl.cis_io == io);
+
+		/* mmap lock must be MANDATORY
+		 * it has to cache pages. */
+		io->ci_lockreq = CILR_MANDATORY;
+
+		cio->cui_fd  = fd;
+	}
+
+	return io;	/* NOTE(review): not RETURN(), unlike the exits above */
+}
+
+/* Sharing code of page_mkwrite method for rhel5 and rhel6.
+ * Returns 0 with \a vmpage locked, -ENODATA if it was truncated, -EAGAIN
+ * (and *retry = true) if it was cleaned meanwhile, else the io's errno. */
+static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage,
+			    bool *retry)
+{
+	struct lu_env	   *env;
+	struct cl_io	    *io;
+	struct vvp_io	   *vio;
+	struct cl_env_nest       nest;
+	int		      result;
+	sigset_t	     set;
+	struct inode	     *inode;
+	struct ll_inode_info     *lli;
+	ENTRY;
+
+	LASSERT(vmpage != NULL);
+
+	io = ll_fault_io_init(vma, &env,  &nest, vmpage->index, NULL);
+	if (IS_ERR(io)) {
+		/* neither an io nor an env was set up, so there is nothing
+		 * to finalize: do not hand an ERR_PTR to cl_io_fini() */
+		GOTO(out, result = PTR_ERR(io));
+	}
+
+	result = io->ci_result;
+	if (result < 0)
+		GOTO(out_io, result);
+
+	io->u.ci_fault.ft_mkwrite = 1;
+	io->u.ci_fault.ft_writable = 1;
+
+	vio = vvp_env_io(env);
+	vio->u.fault.ft_vma    = vma;
+	vio->u.fault.ft_vmpage = vmpage;
+
+	set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));
+
+	/* we grab lli_trunc_sem to exclude truncate case.
+	 * Otherwise, we could add dirty pages into osc cache
+	 * while truncate is on-going. */
+	inode = ccc_object_inode(io->ci_obj);
+	lli = ll_i2info(inode);
+	down_read(&lli->lli_trunc_sem);
+	result = cl_io_loop(env, io);
+	up_read(&lli->lli_trunc_sem);
+
+	cfs_restore_sigs(set);
+
+	if (result == 0) {
+		struct inode *inode = vma->vm_file->f_dentry->d_inode;
+		struct ll_inode_info *lli = ll_i2info(inode);
+
+		lock_page(vmpage);
+		if (vmpage->mapping == NULL) {
+			unlock_page(vmpage);
+			/* page was truncated and lock was cancelled, return
+			 * ENODATA so that VM_FAULT_NOPAGE will be returned
+			 * to handle_mm_fault(). */
+			result = -ENODATA;
+		} else if (!PageDirty(vmpage)) {
+			/* race, the page has been cleaned by ptlrpcd after
+			 * it was unlocked, it has to be added into dirty
+			 * cache again otherwise this soon-to-dirty page won't
+			 * consume any grants, even worse if this page is being
+			 * transferred because it will break RPC checksum.
+			 */
+			unlock_page(vmpage);
+			CDEBUG(D_MMAP, "Race on page_mkwrite %p/%lu, page has "
+			       "been written out, retry.\n",
+			       vmpage, vmpage->index);
+
+			*retry = true;
+			result = -EAGAIN;
+		}
+
+		if (result == 0) {
+			spin_lock(&lli->lli_lock);
+			lli->lli_flags |= LLIF_DATA_MODIFIED;
+			spin_unlock(&lli->lli_lock);
+		}
+	}
+	EXIT;
+
+out_io:
+	cl_io_fini(env, io);
+	cl_env_nested_put(&nest, env);
+out:
+	CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result);
+
+	LASSERT(ergo(result == 0, PageLocked(vmpage)));
+	return(result);
+}
+
+
+
+/*
+ * Translate the result of a fault io into the VM_FAULT_* code handed
+ * back to the VM: success means a locked page, -EFAULT and -ENOMEM have
+ * dedicated codes, and anything else is reported as SIGBUS.
+ */
+static inline int to_fault_error(int result)
+{
+	if (result == 0)
+		return VM_FAULT_LOCKED;
+
+	if (result == -EFAULT)
+		return VM_FAULT_NOPAGE;
+
+	if (result == -ENOMEM)
+		return VM_FAULT_OOM;
+
+	return VM_FAULT_SIGBUS;
+}
+
+/**
+ * Lustre implementation of a vm_operations_struct::fault() method, called by
+ * VM to serve a page fault (both in kernel and user space).
+ *
+ * \param vma - the virtual memory area the fault belongs to
+ * \param vmf - structure describing the type and address of the fault
+ *
+ * \return the VM_FAULT_* flags left by the fault io; when the io fails and
+ *	   VM_FAULT_RETRY is not among them, to_fault_error() is or-ed in.
+ *	   The page, if any, is expected in vmf->page (see ll_fault()).
+ */
+static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct lu_env	   *env;
+	struct cl_io	    *io;
+	struct vvp_io	   *vio = NULL;
+	struct page	     *vmpage;
+	unsigned long	    ra_flags;
+	struct cl_env_nest       nest;
+	int		      result;
+	int		      fault_ret = 0;
+	ENTRY;
+
+	io = ll_fault_io_init(vma, &env,  &nest, vmf->pgoff, &ra_flags);
+	if (IS_ERR(io))	/* nothing was set up, so nothing to release */
+		RETURN(to_fault_error(PTR_ERR(io)));
+
+	result = io->ci_result;
+	if (result == 0) {
+		vio = vvp_env_io(env);
+		vio->u.fault.ft_vma       = vma;
+		vio->u.fault.ft_vmpage    = NULL;
+		vio->u.fault.fault.ft_vmf = vmf;
+
+		result = cl_io_loop(env, io);
+
+		fault_ret = vio->u.fault.fault.ft_flags;
+		vmpage = vio->u.fault.ft_vmpage;
+		if (result != 0 && vmpage != NULL) {	/* failed io left a page */
+			page_cache_release(vmpage);
+			vmf->page = NULL;
+		}
+	}
+	cl_io_fini(env, io);
+	cl_env_nested_put(&nest, env);
+
+	vma->vm_flags |= ra_flags;	/* or back the flags saved at init */
+	if (result != 0 && !(fault_ret & VM_FAULT_RETRY))
+		fault_ret |= to_fault_error(result);
+
+	CDEBUG(D_MMAP, "%s fault %d/%d\n",
+	       current->comm, fault_ret, result);
+	RETURN(fault_ret);
+}
+
+static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	int count = 0;
+	bool printed = false;
+	int result;
+	sigset_t set;
+
+	/* Only SIGKILL and SIGTERM is allowed for fault/nopage/mkwrite
+	 * so that it can be killed by admin but not cause segfault by
+	 * other signals. */
+	set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));
+
+restart:
+	result = ll_fault0(vma, vmf);
+	LASSERT(!(result & VM_FAULT_LOCKED));	/* locking is done below */
+	if (result == 0) {
+		struct page *vmpage = vmf->page;
+
+		/* check if this page has been truncated */
+		lock_page(vmpage);
+		if (unlikely(vmpage->mapping == NULL)) { /* unlucky */
+			unlock_page(vmpage);
+			page_cache_release(vmpage);
+			vmf->page = NULL;
+			/* warn once, when more than 16 attempts were needed */
+			if (!printed && ++count > 16) {
+				CWARN("the page is under heavy contention,"
+				      "maybe your app(%s) needs revising :-)\n",
+				      current->comm);
+				printed = true;
+			}
+			/* fault the page in again from scratch */
+			goto restart;
+		}
+
+		result |= VM_FAULT_LOCKED;	/* hand the page back locked */
+	}
+	cfs_restore_sigs(set);
+	return result;
+}
+
+static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	bool again;
+	bool warned = false;
+	int attempts = 0;
+	int rc;
+
+	/* Run ll_page_mkwrite0() for as long as it asks to be retried and
+	 * complain, once, when more than 16 attempts were needed. */
+	for (;;) {
+		again = false;
+		rc = ll_page_mkwrite0(vma, vmf->page, &again);
+
+		if (!warned && ++attempts > 16) {
+			CWARN("app(%s): the page %lu of file %lu is under heavy"
+			      " contention.\n",
+			      current->comm, vmf->pgoff,
+			      vma->vm_file->f_dentry->d_inode->i_ino);
+			warned = true;
+		}
+
+		if (!again)
+			break;
+	}
+
+	/* translate the errno into the VM_FAULT_* code for the VM */
+	if (rc == 0) {
+		LASSERT(PageLocked(vmf->page));
+		return VM_FAULT_LOCKED;
+	}
+
+	if (rc == -ENODATA || rc == -EFAULT)
+		return VM_FAULT_NOPAGE;
+
+	if (rc == -ENOMEM)
+		return VM_FAULT_OOM;
+
+	if (rc == -EAGAIN)
+		return VM_FAULT_RETRY;
+
+	return VM_FAULT_SIGBUS;
+}
+
+/**
+ * To avoid cancelling the locks covering an mmapped region under lock cache
+ * pressure, the number of mapped vmas is kept in ccc_object::cob_mmap_cnt.
+ */
+static void ll_vm_open(struct vm_area_struct *vma)
+{
+	struct ccc_object *vob;
+
+	ENTRY;
+	LASSERT(vma->vm_file);	/* checked before it is dereferenced */
+	vob = cl_inode2ccc(vma->vm_file->f_dentry->d_inode);
+	LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0);
+	atomic_inc(&vob->cob_mmap_cnt);
+	EXIT;
+}
+
+/**
+ * Dual to ll_vm_open(): drops one reference from cob_mmap_cnt.
+ */
+static void ll_vm_close(struct vm_area_struct *vma)
+{
+	struct ccc_object *vob;
+
+	ENTRY;
+	LASSERT(vma->vm_file);	/* checked before it is dereferenced */
+	vob = cl_inode2ccc(vma->vm_file->f_dentry->d_inode);
+	atomic_dec(&vob->cob_mmap_cnt);
+	LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0);
+	EXIT;
+}
+
+
+/* user space address at which \a vma maps the file offset \a byte */
+static inline unsigned long file_to_user(struct vm_area_struct *vma, __u64 byte)
+{
+	__u64 map_start = (__u64)vma->vm_pgoff << PAGE_CACHE_SHIFT;
+	return vma->vm_start + (byte - map_start);
+}
+
+/* Tear down user mappings of \a mapping for the extent [first, last].
+ * Returns 0 if the file was mapped, -ENOENT if there was nothing mapped. */
+int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last)
+{
+	ENTRY;
+
+	LASSERTF(last > first, "last "LPU64" first "LPU64"\n", last, first);
+	if (!mapping_mapped(mapping))
+		RETURN(-ENOENT);
+
+	/* the final 0 is even_cows: private COWed pages are left alone */
+	unmap_mapping_range(mapping, first + PAGE_CACHE_SIZE - 1,
+			    last - first + 1, 0);
+
+	RETURN(0);
+}
+
+static struct vm_operations_struct ll_file_vm_ops = {
+	.fault			= ll_fault,	/* hands back a locked page */
+	.page_mkwrite		= ll_page_mkwrite,
+	.open			= ll_vm_open,	/* cob_mmap_cnt++ */
+	.close			= ll_vm_close,	/* cob_mmap_cnt-- */
+};
+
+int ll_file_mmap(struct file *file, struct vm_area_struct * vma)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	int rc;
+	ENTRY;
+
+	if (ll_file_nolock(file))	/* same refusal as ll_fault_io_init() */
+		RETURN(-EOPNOTSUPP);
+
+	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_MAP, 1);
+	rc = generic_file_mmap(file, vma);
+	if (rc == 0) {
+		vma->vm_ops = &ll_file_vm_ops;
+		vma->vm_ops->open(vma);	/* i.e. ll_vm_open(): count mapping */
+		/* update the inode's size and mtime */
+		rc = ll_glimpse_size(inode);
+	}
+	/* NOTE(review): a glimpse error is returned with open() not undone */
+	RETURN(rc);
+}
diff --git a/drivers/staging/lustre/lustre/llite/llite_nfs.c b/drivers/staging/lustre/lustre/llite/llite_nfs.c
new file mode 100644
index 000000000000..28cc41e90581
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/llite_nfs.c
@@ -0,0 +1,319 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lustre/llite/llite_nfs.c
+ *
+ * NFS export of Lustre Light File System
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ * Author: Huang Hua <huanghua@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+#include <lustre_lite.h>
+#include "llite_internal.h"
+#include <linux/exportfs.h>
+
+__u32 get_uuid2int(const char *name, int len)
+{
+	__u32 cur = 0x12a3fe2d, prev = 0x37abe8f9, mix;
+	for (; len != 0; len--, name++) {	/* one round per input byte */
+		mix = prev + (cur ^ (*name * 7152373));
+		mix -= (mix & 0x80000000) ? 0x7fffffff : 0;
+		prev = cur;
+		cur = mix;
+	}
+	return cur << 1;	/* bit 0 of the result is always clear */
+}
+
+static int ll_nfs_test_inode(struct inode *inode, void *opaque)
+{
+	struct lu_fid *wanted = opaque;	/* ilookup5() match: same FID? */
+	return lu_fid_eq(&ll_i2info(inode)->lli_fid, wanted);
+}
+
+/* Look \a fid up in the inode cache and, on a miss, set the inode up from
+ * the md_getattr() reply for that FID. Returns the inode or an ERR_PTR(). */
+struct inode *search_inode_for_lustre(struct super_block *sb,
+				      const struct lu_fid *fid)
+{
+	struct ll_sb_info     *sbi = ll_s2sbi(sb);
+	struct ptlrpc_request *req = NULL;
+	struct inode	  *inode = NULL;
+	int		   eadatalen = 0;
+	unsigned long	      hash = cl_fid_build_ino(fid,
+						      ll_need_32bit_api(sbi));
+	struct  md_op_data    *op_data;
+	int		   rc;
+	ENTRY;
+
+	CDEBUG(D_INFO, "searching inode for:(%lu,"DFID")\n", hash, PFID(fid));
+	inode = ilookup5(sb, hash, ll_nfs_test_inode, (void *)fid);
+	if (inode)
+		RETURN(inode);
+
+	rc = ll_get_max_mdsize(sbi, &eadatalen);
+	if (rc)
+		RETURN(ERR_PTR(rc));
+
+	/* Because inode is NULL, ll_prep_md_op_data can not
+	 * be used here. So we allocate op_data ourselves */
+	OBD_ALLOC_PTR(op_data);
+	if (op_data == NULL)
+		RETURN(ERR_PTR(-ENOMEM));	/* keeps ENTRY/RETURN paired */
+
+	op_data->op_fid1 = *fid;
+	op_data->op_mode = eadatalen;
+	op_data->op_valid = OBD_MD_FLEASIZE;
+
+	/* mds_fid2dentry ignores f_type */
+	rc = md_getattr(sbi->ll_md_exp, op_data, &req);
+	OBD_FREE_PTR(op_data);
+	if (rc) {
+		CERROR("can't get object attrs, fid "DFID", rc %d\n",
+		       PFID(fid), rc);
+		RETURN(ERR_PTR(rc));
+	}
+	rc = ll_prep_inode(&inode, req, sb, NULL);
+	ptlrpc_req_finished(req);
+	if (rc)
+		RETURN(ERR_PTR(rc));
+	RETURN(inode);
+}
+
+struct lustre_nfs_fid {
+	struct lu_fid   lnf_child;	/* FID of the exported inode */
+	struct lu_fid   lnf_parent;	/* FID of the parent, see ll_encode_fh() */
+};
+
+/*
+ * Get a dentry for \a fid on behalf of the NFS export code. A non-NULL
+ * \a parent is saved as lli_pfid of a regular file that has no lli_has_smd.
+ * Returns the dentry or an ERR_PTR(); insane FIDs and bad inodes: -ESTALE.
+ */
+static struct dentry *
+ll_iget_for_nfs(struct super_block *sb, struct lu_fid *fid, struct lu_fid *parent)
+{
+	struct ll_inode_info *info;
+	struct dentry *alias;
+	struct inode *obj;
+	ENTRY;
+
+	CDEBUG(D_INFO, "Get dentry for fid: "DFID"\n", PFID(fid));
+	if (!fid_is_sane(fid))
+		RETURN(ERR_PTR(-ESTALE));
+
+	obj = search_inode_for_lustre(sb, fid);
+	if (IS_ERR(obj))
+		RETURN(ERR_PTR(PTR_ERR(obj)));
+
+	/* a bad inode means that the right one was not found */
+	if (is_bad_inode(obj)) {
+		iput(obj);
+		RETURN(ERR_PTR(-ESTALE));
+	}
+
+	/* An anonymous dentry without OST objects created yet: keep the
+	 * parent FID, needed to tell MDS how to init lov objects. */
+	info = ll_i2info(obj);
+	if (parent != NULL && S_ISREG(obj->i_mode) && !info->lli_has_smd) {
+		spin_lock(&info->lli_lock);
+		info->lli_pfid = *parent;
+		spin_unlock(&info->lli_lock);
+	}
+
+	alias = d_obtain_alias(obj);
+	if (!IS_ERR(alias))
+		ll_dops_init(alias, 1, 0);
+
+	RETURN(alias);
+}
+
+#define LUSTRE_NFS_FID	  0x97
+
+/**
+ * export_operations::encode_fh(): store the FIDs of \a inode and, unless it
+ * is NULL, of \a parent in \a fh. \a plen counts 4-byte units: room on
+ * entry, size needed or used on return. Returns LUSTRE_NFS_FID, or 255 if
+ * the buffer is too small.
+ */
+static int ll_encode_fh(struct inode *inode, __u32 *fh, int *plen,
+			struct inode *parent)
+{
+	struct lustre_nfs_fid *nfs_fid = (void *)fh;
+	int fileid_len = sizeof(struct lustre_nfs_fid) / 4;
+	ENTRY;
+
+	CDEBUG(D_INFO, "encoding for (%lu,"DFID") maxlen=%d minlen=%d\n",
+	      inode->i_ino, PFID(ll_inode2fid(inode)), *plen,
+	      (int)sizeof(struct lustre_nfs_fid));
+	if (*plen < fileid_len) {
+		*plen = fileid_len;	/* report the size that is required */
+		RETURN(255);
+	}
+
+	nfs_fid->lnf_child = *ll_inode2fid(inode);
+	memset(&nfs_fid->lnf_parent, 0, sizeof(nfs_fid->lnf_parent));
+	if (parent != NULL)	/* the export code may pass no parent */
+		nfs_fid->lnf_parent = *ll_inode2fid(parent);
+	*plen = fileid_len;
+	RETURN(LUSTRE_NFS_FID);
+}
+
+static int ll_nfs_get_name_filldir(void *cookie, const char *name, int namelen,
+				   loff_t hash, u64 ino, unsigned type)
+{
+	/* It is hack to access lde_fid for comparison with lgd_fid.
+	 * So the input 'name' must be part of the 'lu_dirent'. */
+	struct lu_dirent *lde = container_of0(name, struct lu_dirent, lde_name);
+	struct ll_getname_data *lgd = cookie;
+	struct lu_fid fid;
+
+	fid_le_to_cpu(&fid, &lde->lde_fid);	/* compare in CPU byte order */
+	if (lu_fid_eq(&fid, &lgd->lgd_fid)) {
+		memcpy(lgd->lgd_name, name, namelen);
+		lgd->lgd_name[namelen] = 0;	/* assumes room for namelen + 1 */
+		lgd->lgd_found = 1;
+	}
+	return lgd->lgd_found;	/* 1 once matched; presumably ends the scan */
+}
+
+static int ll_get_name(struct dentry *dentry, char *name,
+		       struct dentry *child)
+{
+	struct inode *parent = dentry->d_inode;
+	struct ll_getname_data search;
+	__u64 pos = 0;
+	int rc;
+	ENTRY;
+
+	if (parent == NULL || !S_ISDIR(parent->i_mode))
+		GOTO(out, rc = -ENOTDIR);
+	if (parent->i_fop == NULL)
+		GOTO(out, rc = -EINVAL);
+
+	/* look for the directory entry that carries the FID of the child */
+	search.lgd_found = 0;
+	search.lgd_name = name;
+	search.lgd_fid = ll_i2info(child->d_inode)->lli_fid;
+
+	mutex_lock(&parent->i_mutex);
+	rc = ll_dir_read(parent, &pos, &search, ll_nfs_get_name_filldir);
+	mutex_unlock(&parent->i_mutex);
+	if (rc == 0 && search.lgd_found == 0)
+		rc = -ENOENT;
+	EXIT;
+
+out:
+	return rc;
+}
+
+static struct dentry *ll_fh_to_dentry(struct super_block *sb, struct fid *fid,
+				      int fh_len, int fh_type)
+{
+	struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid;
+
+	/* only handles of our type that hold both FIDs can be decoded */
+	if (fh_type != LUSTRE_NFS_FID || fh_len < (int)(sizeof(*nfs_fid) / 4))
+		RETURN(ERR_PTR(-EPROTO));
+	RETURN(ll_iget_for_nfs(sb, &nfs_fid->lnf_child, &nfs_fid->lnf_parent));
+}
+
+static struct dentry *ll_fh_to_parent(struct super_block *sb, struct fid *fid,
+				      int fh_len, int fh_type)
+{
+	struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid;
+
+	/* lnf_parent is the second FID: the handle must hold both of them */
+	if (fh_type != LUSTRE_NFS_FID || fh_len < (int)(sizeof(*nfs_fid) / 4))
+		RETURN(ERR_PTR(-EPROTO));
+	RETURN(ll_iget_for_nfs(sb, &nfs_fid->lnf_parent, NULL));
+}
+
+static struct dentry *ll_get_parent(struct dentry *dchild)
+{
+	struct ptlrpc_request *req = NULL;
+	struct inode	  *dir = dchild->d_inode;
+	struct ll_sb_info     *sbi;
+	struct dentry	 *result = NULL;
+	struct mdt_body       *body;
+	static char	   dotdot[] = "..";
+	struct md_op_data     *op_data;
+	int		   rc;
+	int		      lmmsize;
+	ENTRY;
+
+	LASSERT(dir && S_ISDIR(dir->i_mode));
+
+	sbi = ll_s2sbi(dir->i_sb);
+
+	CDEBUG(D_INFO, "getting parent for (%lu,"DFID")\n",
+			dir->i_ino, PFID(ll_inode2fid(dir)));
+
+	rc = ll_get_max_mdsize(sbi, &lmmsize);
+	if (rc != 0)
+		RETURN(ERR_PTR(rc));
+	/* get the attributes of the ".." entry of this directory by name */
+	op_data = ll_prep_md_op_data(NULL, dir, NULL, dotdot,
+				     strlen(dotdot), lmmsize,
+				     LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN((void *)op_data);
+
+	rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
+	ll_finish_md_op_data(op_data);
+	if (rc) {
+		CERROR("failure %d inode %lu get parent\n", rc, dir->i_ino);
+		RETURN(ERR_PTR(rc));
+	}
+	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	LASSERT(body->valid & OBD_MD_FLID);
+	/* NOTE(review): body is used without a NULL check - confirm */
+	CDEBUG(D_INFO, "parent for "DFID" is "DFID"\n",
+		PFID(ll_inode2fid(dir)), PFID(&body->fid1));
+
+	result = ll_iget_for_nfs(dir->i_sb, &body->fid1, NULL);
+	/* body was obtained from req, which is therefore released last */
+	ptlrpc_req_finished(req);
+	RETURN(result);
+}
+
+struct export_operations lustre_export_operations = {
+	.encode_fh	= ll_encode_fh,		/* inode -> file handle */
+	.fh_to_dentry	= ll_fh_to_dentry,	/* handle -> child dentry */
+	.fh_to_parent	= ll_fh_to_parent,	/* handle -> parent dentry */
+	.get_name	= ll_get_name,
+	.get_parent	= ll_get_parent,
+};
diff --git a/drivers/staging/lustre/lustre/llite/llite_rmtacl.c b/drivers/staging/lustre/lustre/llite/llite_rmtacl.c
new file mode 100644
index 000000000000..4c610369cb9b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/llite_rmtacl.c
@@ -0,0 +1,301 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/llite_rmtacl.c
+ *
+ * Lustre Remote User Access Control List.
+ *
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#ifdef CONFIG_FS_POSIX_ACL
+
+#include <lustre_lite.h>
+#include <lustre_eacl.h>
+#include "llite_internal.h"
+
+static inline __u32 rce_hashfunc(uid_t id)
+{
+	return (RCE_HASHES - 1) & id;	/* index into rct_entries[] */
+}
+
+static inline __u32 ee_hashfunc(uid_t id)
+{
+	return (EE_HASHES - 1) & id;	/* index into et_entries[] */
+}
+
+obd_valid rce_ops2valid(int ops)
+{
+	/* map a remote ACL operation onto its OBD_MD_FLRMT* flag */
+	if (ops == RMT_LSETFACL)
+		return OBD_MD_FLRMTLSETFACL;
+	if (ops == RMT_LGETFACL)
+		return OBD_MD_FLRMTLGETFACL;
+	if (ops == RMT_RSETFACL)
+		return OBD_MD_FLRMTRSETFACL;
+	if (ops == RMT_RGETFACL)
+		return OBD_MD_FLRMTRGETFACL;
+
+	/* any other operation has no flag */
+	return 0;
+}
+
+static struct rmtacl_ctl_entry *rce_alloc(pid_t key, int ops)
+{
+	struct rmtacl_ctl_entry *entry;
+
+	OBD_ALLOC_PTR(entry);
+	if (entry != NULL) {
+		/* start out unlinked, which rce_free() checks for */
+		INIT_LIST_HEAD(&entry->rce_list);
+		entry->rce_ops = ops;
+		entry->rce_key = key;
+	}
+
+	return entry;
+}
+
+static void rce_free(struct rmtacl_ctl_entry *rce)
+{
+	/* unhook the entry first if it still sits on a hash chain */
+	if (list_empty(&rce->rce_list) == 0)
+		list_del(&rce->rce_list);
+	OBD_FREE_PTR(rce);
+}
+
+static struct rmtacl_ctl_entry *__rct_search(struct rmtacl_ctl_table *rct,
+					   pid_t key)
+{
+	struct list_head *bucket = &rct->rct_entries[rce_hashfunc(key)];
+	struct rmtacl_ctl_entry *cur;
+
+	/* scan only the chain the key hashes to */
+	list_for_each_entry(cur, bucket, rce_list)
+		if (cur->rce_key == key)
+			return cur;
+	return NULL;
+}
+
+struct rmtacl_ctl_entry *rct_search(struct rmtacl_ctl_table *rct, pid_t key)
+{
+	struct rmtacl_ctl_entry *found;
+
+	spin_lock(&rct->rct_lock);
+	found = __rct_search(rct, key);
+	spin_unlock(&rct->rct_lock);
+	return found;	/* NULL if absent; rct_lock is no longer held */
+}
+
+int rct_add(struct rmtacl_ctl_table *rct, pid_t key, int ops)
+{
+	struct rmtacl_ctl_entry *fresh = rce_alloc(key, ops);
+	struct rmtacl_ctl_entry *stale;
+
+	if (fresh == NULL)
+		return -ENOMEM;
+
+	/* replace, rather than duplicate, an entry left over for this key */
+	spin_lock(&rct->rct_lock);
+	stale = __rct_search(rct, key);
+	if (unlikely(stale != NULL)) {
+		CWARN("Unexpected stale rmtacl_entry found: "
+		      "[key: %d] [ops: %d]\n", (int)key, ops);
+		rce_free(stale);
+	}
+	list_add_tail(&fresh->rce_list, &rct->rct_entries[rce_hashfunc(key)]);
+	spin_unlock(&rct->rct_lock);
+	return 0;
+}
+
+int rct_del(struct rmtacl_ctl_table *rct, pid_t key)
+{
+	struct rmtacl_ctl_entry *victim;
+
+	spin_lock(&rct->rct_lock);
+	victim = __rct_search(rct, key);
+	if (victim != NULL)
+		rce_free(victim);
+	spin_unlock(&rct->rct_lock);
+	/* only the pointer value is tested, the entry itself is gone */
+	return victim != NULL ? 0 : -ENOENT;
+}
+
+void rct_init(struct rmtacl_ctl_table *rct)
+{
+	int slot = RCE_HASHES;
+
+	spin_lock_init(&rct->rct_lock);
+	while (slot-- > 0)	/* every hash chain starts out empty */
+		INIT_LIST_HEAD(&rct->rct_entries[slot]);
+}
+
+void rct_fini(struct rmtacl_ctl_table *rct)
+{
+	struct rmtacl_ctl_entry *rce, *next;
+	int i;
+
+	/* free whatever is still registered, chain by chain */
+	spin_lock(&rct->rct_lock);
+	for (i = 0; i < RCE_HASHES; i++) {
+		list_for_each_entry_safe(rce, next, &rct->rct_entries[i],
+					 rce_list)
+			rce_free(rce);
+	}
+	spin_unlock(&rct->rct_lock);
+}
+
+
+static struct eacl_entry *ee_alloc(pid_t key, struct lu_fid *fid, int type,
+				   ext_acl_xattr_header *header)
+{
+	struct eacl_entry *entry;
+
+	OBD_ALLOC_PTR(entry);
+	if (entry != NULL) {
+		/* \a header is kept by pointer; ee_free() releases it */
+		INIT_LIST_HEAD(&entry->ee_list);
+		entry->ee_acl = header;
+		entry->ee_type = type;
+		entry->ee_fid = *fid;
+		entry->ee_key = key;
+	}
+
+	return entry;
+}
+
+void ee_free(struct eacl_entry *ee)
+{
+	/* unhook the entry if it is still on a hash chain */
+	if (list_empty(&ee->ee_list) == 0)
+		list_del(&ee->ee_list);
+	/* the attached ACL, if any, goes away together with the entry */
+	if (ee->ee_acl != NULL)
+		lustre_ext_acl_xattr_free(ee->ee_acl);
+	OBD_FREE_PTR(ee);
+}
+
+static struct eacl_entry *__et_search_del(struct eacl_table *et, pid_t key,
+					struct lu_fid *fid, int type)
+{
+	struct list_head *bucket = &et->et_entries[ee_hashfunc(key)];
+	struct eacl_entry *cur;
+
+	LASSERT(fid != NULL);
+	list_for_each_entry(cur, bucket, ee_list) {
+		/* key, FID and type must all match */
+		if (cur->ee_key != key || !lu_fid_eq(&cur->ee_fid, fid) ||
+		    cur->ee_type != type)
+			continue;
+		/* unlink the match and hand it to the caller */
+		list_del_init(&cur->ee_list);
+		return cur;
+	}
+	return NULL;
+}
+
+struct eacl_entry *et_search_del(struct eacl_table *et, pid_t key,
+				 struct lu_fid *fid, int type)
+{
+	struct eacl_entry *taken;
+
+	spin_lock(&et->et_lock);
+	taken = __et_search_del(et, key, fid, type);
+	spin_unlock(&et->et_lock);
+	return taken;	/* NULL if absent; else unlinked but not freed here */
+}
+
+void et_search_free(struct eacl_table *et, pid_t key)
+{
+	struct list_head *bucket = &et->et_entries[ee_hashfunc(key)];
+	struct eacl_entry *cur, *tmp;
+
+	/* drop every entry of this key; other keys may share the chain */
+	spin_lock(&et->et_lock);
+	list_for_each_entry_safe(cur, tmp, bucket, ee_list)
+		if (cur->ee_key == key)
+			ee_free(cur);
+	spin_unlock(&et->et_lock);
+}
+
+int ee_add(struct eacl_table *et, pid_t key, struct lu_fid *fid, int type,
+	   ext_acl_xattr_header *header)
+{
+	struct eacl_entry *fresh = ee_alloc(key, fid, type, header);
+	struct eacl_entry *stale;
+
+	if (fresh == NULL)
+		return -ENOMEM;
+
+	/* an older entry with the same key, FID and type is replaced */
+	spin_lock(&et->et_lock);
+	stale = __et_search_del(et, key, fid, type);
+	if (unlikely(stale != NULL)) {
+		CWARN("Unexpected stale eacl_entry found: "
+		      "[key: %d] [fid: "DFID"] [type: %d]\n",
+		      (int)key, PFID(fid), type);
+		ee_free(stale);
+	}
+	list_add_tail(&fresh->ee_list, &et->et_entries[ee_hashfunc(key)]);
+	spin_unlock(&et->et_lock);
+	return 0;
+}
+
+void et_init(struct eacl_table *et)
+{
+	int slot = EE_HASHES;
+
+	spin_lock_init(&et->et_lock);
+	while (slot-- > 0)	/* every hash chain starts out empty */
+		INIT_LIST_HEAD(&et->et_entries[slot]);
+}
+
+void et_fini(struct eacl_table *et)
+{
+	struct eacl_entry *ee, *next;
+	int i;
+
+	/* free whatever is still cached, chain by chain */
+	spin_lock(&et->et_lock);
+	for (i = 0; i < EE_HASHES; i++) {
+		list_for_each_entry_safe(ee, next, &et->et_entries[i],
+					 ee_list)
+			ee_free(ee);
+	}
+	spin_unlock(&et->et_lock);
+}
+
+#endif
diff --git a/drivers/staging/lustre/lustre/llite/lloop.c b/drivers/staging/lustre/lustre/llite/lloop.c
new file mode 100644
index 000000000000..b72f25738bab
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/lloop.c
@@ -0,0 +1,869 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/*
+ *  linux/drivers/block/loop.c
+ *
+ *  Written by Theodore Ts'o, 3/29/93
+ *
+ * Copyright 1993 by Theodore Ts'o.  Redistribution of this file is
+ * permitted under the GNU General Public License.
+ *
+ * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994
+ * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996
+ *
+ * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997
+ *
+ * Added devfs support - Richard Gooch <rgooch@atnf.csiro.au> 16-Jan-1998
+ *
+ * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998
+ *
+ * Loadable modules and other fixes by AK, 1998
+ *
+ * Maximum number of loop devices now dynamic via max_loop module parameter.
+ * Russell Kroll <rkroll@exploits.org> 19990701
+ *
+ * Maximum number of loop devices when compiled-in now selectable by passing
+ * max_loop=<1-255> to the kernel on boot.
+ * Erik I. Bols?, <eriki@himolde.no>, Oct 31, 1999
+ *
+ * Completely rewrite request handling to be make_request_fn style and
+ * non blocking, pushing work to a helper thread. Lots of fixes from
+ * Al Viro too.
+ * Jens Axboe <axboe@suse.de>, Nov 2000
+ *
+ * Support up to 256 loop devices
+ * Heinz Mauelshagen <mge@sistina.com>, Feb 2002
+ *
+ * Support for falling back on the write file operation when the address space
+ * operations prepare_write and/or commit_write are not available on the
+ * backing filesystem.
+ * Anton Altaparmakov, 16 Feb 2005
+ *
+ * Still To Fix:
+ * - Advisory locking is ignored here.
+ * - Should use an own CAP_* category instead of CAP_SYS_ADMIN
+ *
+ */
+
+#include <linux/module.h>
+
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/wait.h>
+#include <linux/blkdev.h>
+#include <linux/blkpg.h>
+#include <linux/init.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>		/* for invalidate_bdev() */
+#include <linux/completion.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/swap.h>
+#include <linux/pagevec.h>
+
+#include <asm/uaccess.h>
+
+#include <lustre_lib.h>
+#include <lustre_lite.h>
+#include "llite_internal.h"
+
+/*
+ * Upper bound on the number of pages handled in one batch; it sizes the
+ * per-device page/offset arrays and is also given to the block layer as the
+ * queue's maximum segment count.
+ */
+#define LLOOP_MAX_SEGMENTS	LNET_MAX_IOV
+
+/* Possible states of device */
+enum {
+	LLOOP_UNBOUND,	/* no backing file attached */
+	LLOOP_BOUND,	/* set by the worker thread once it has started */
+	LLOOP_RUNDOWN,	/* detach requested; worker exits once idle */
+};
+
+/*
+ * Per-minor state of one lloop block device.  One instance exists for each
+ * of the max_loop minors; they live in the loop_dev[] array.
+ */
+struct lloop_device {
+	int		  lo_number;	/* index in loop_dev[], used as minor */
+	int		  lo_refcnt;	/* open count, see lo_open/lo_release */
+	loff_t	       lo_offset;	/* byte offset added to every bio position */
+	loff_t	       lo_sizelimit;	/* if > 0, caps the exported size (bytes) */
+	int		  lo_flags;	/* LO_FLAGS_* */
+	/* only ever assigned NULL in this file */
+	int		(*ioctl)(struct lloop_device *, int cmd,
+				    unsigned long arg);
+
+	struct file	 *lo_backing_file;	/* file attached by loop_set_fd() */
+	struct block_device *lo_device;		/* bdev passed to loop_set_fd() */
+	unsigned	     lo_blocksize;	/* set to PAGE_CACHE_SIZE on attach */
+
+	/* gfp mask of the backing mapping before attach; restored on detach */
+	int		  old_gfp_mask;
+
+	spinlock_t		lo_lock;	/* taken around the bio list and lo_state updates */
+	struct bio		*lo_bio;	/* head of the pending bio list */
+	struct bio		*lo_biotail;	/* tail of the pending bio list */
+	int			lo_state;	/* LLOOP_UNBOUND/BOUND/RUNDOWN */
+	struct semaphore	lo_sem;		/* start/stop handshake with loop_thread() */
+	struct mutex		lo_ctl_mutex;	/* held while changing lo_refcnt */
+	atomic_t	 lo_pending;		/* bios queued and not yet completed */
+	wait_queue_head_t	  lo_bh_wait;	/* loop_thread() sleeps here */
+
+	struct request_queue *lo_queue;
+
+	const struct lu_env *lo_env;	/* env obtained by loop_thread() */
+	struct cl_io	 lo_io;		/* reused for every batch of bios */
+	struct ll_dio_pages  lo_pvec;	/* points into lo_requests[0] */
+
+	/* Backing storage for lo_pvec: pages of the current batch and their file offsets. */
+	struct lo_request_data {
+		struct page *lrd_pages[LLOOP_MAX_SEGMENTS];
+		loff_t       lrd_offsets[LLOOP_MAX_SEGMENTS];
+	} lo_requests[1];
+};
+
+/*
+ * Loop flags (stored in lloop_device::lo_flags)
+ */
+enum {
+	/* set when the backing file was not opened for writing */
+	LO_FLAGS_READ_ONLY       = 1,
+};
+
+/* major number returned by register_blkdev() in lloop_init() */
+static int lloop_major;
+#define MAX_LOOP_DEFAULT  16
+/* number of lloop minors; module parameter, validated in lloop_init() */
+static int max_loop = MAX_LOOP_DEFAULT;
+/* array of max_loop devices, allocated in lloop_init() */
+static struct lloop_device *loop_dev;
+/* array of max_loop gendisks, parallel to loop_dev[] */
+static struct gendisk **disks;
+/* held across the attach/detach/info ioctl paths */
+static struct mutex lloop_mutex;
+/* cookie returned by ll_iocontrol_register(); identifies our ioctl callback */
+static void *ll_iocontrol_magic = NULL;
+
+/*
+ * Size of the loop device in 512-byte sectors.
+ *
+ * The usable byte count is the backing inode size minus lo_offset, further
+ * capped by lo_sizelimit when that limit is positive.  The caller is
+ * expected to verify that the result fits into a sector_t.
+ */
+static loff_t get_loop_size(struct lloop_device *lo, struct file *file)
+{
+	loff_t bytes;
+
+	bytes = i_size_read(file->f_mapping->host) - lo->lo_offset;
+	if (lo->lo_sizelimit > 0 && bytes > lo->lo_sizelimit)
+		bytes = lo->lo_sizelimit;
+
+	/* bytes -> 512-byte sectors */
+	return bytes >> 9;
+}
+
+/*
+ * Perform the I/O described by a chain of bios against the Lustre backing
+ * file of @lo.
+ *
+ * @head is a bi_next-linked list of bios that all share the same bi_rw.
+ * Every segment must be a whole, page-aligned page; the pages and their
+ * byte offsets in the backing file are collected into lo->lo_pvec and
+ * submitted in one ll_direct_rw_pages() call under the inode's i_mutex.
+ *
+ * Returns 0 when exactly pvec->ldp_size bytes were transferred, otherwise
+ * the (int-truncated) value returned by ll_direct_rw_pages(), or
+ * io->ci_result if cl_io_init() fails.
+ */
+static int do_bio_lustrebacked(struct lloop_device *lo, struct bio *head)
+{
+	const struct lu_env  *env   = lo->lo_env;
+	struct cl_io	 *io    = &lo->lo_io;
+	struct inode	 *inode = lo->lo_backing_file->f_dentry->d_inode;
+	struct cl_object     *obj = ll_i2info(inode)->lli_clob;
+	loff_t		   offset;
+	int		   ret;
+	int		   i;
+	int		   rw;
+	obd_count	     page_count = 0;
+	struct bio_vec       *bvec;
+	struct bio	   *bio;
+	ssize_t	       bytes;
+
+	struct ll_dio_pages  *pvec = &lo->lo_pvec;
+	struct page	 **pages = pvec->ldp_pages;
+	loff_t	       *offsets = pvec->ldp_offsets;
+
+	truncate_inode_pages(inode->i_mapping, 0);
+
+	/* initialize the IO */
+	memset(io, 0, sizeof(*io));
+	io->ci_obj = obj;
+	ret = cl_io_init(env, io, CIT_MISC, obj);
+	if (ret)
+		return io->ci_result;
+	io->ci_lockreq = CILR_NEVER;
+
+	LASSERT(head != NULL);
+	rw = head->bi_rw;
+	for (bio = head; bio != NULL; bio = bio->bi_next) {
+		LASSERT(rw == bio->bi_rw);
+
+		/*
+		 * Byte offset in the backing file.  Widen to loff_t before
+		 * shifting: pgoff_t/sector_t may be 32 bits wide, which would
+		 * truncate offsets at or beyond 4GiB.
+		 */
+		offset = ((loff_t)bio->bi_sector << 9) + lo->lo_offset;
+		bio_for_each_segment(bvec, bio, i) {
+			BUG_ON(bvec->bv_offset != 0);
+			BUG_ON(bvec->bv_len != PAGE_CACHE_SIZE);
+
+			/* check the bound before writing into the arrays */
+			LASSERT(page_count < LLOOP_MAX_SEGMENTS);
+			pages[page_count] = bvec->bv_page;
+			offsets[page_count] = offset;
+			page_count++;
+			offset += bvec->bv_len;
+		}
+		LASSERT(page_count <= LLOOP_MAX_SEGMENTS);
+	}
+
+	ll_stats_ops_tally(ll_i2sbi(inode),
+			(rw == WRITE) ? LPROC_LL_BRW_WRITE : LPROC_LL_BRW_READ,
+			page_count);
+
+	pvec->ldp_size = page_count << PAGE_CACHE_SHIFT;
+	pvec->ldp_nr = page_count;
+
+	/* FIXME: in ll_direct_rw_pages, it has to allocate many cl_page{}s to
+	 * write those pages into OST. Even worse case is that more pages
+	 * would be asked to write out to swap space, and then finally get here
+	 * again.
+	 * Unfortunately this is NOT easy to fix.
+	 * Thoughts on solution:
+	 * 0. Define a reserved pool for cl_pages, which could be a list of
+	 *    pre-allocated cl_pages;
+	 * 1. Define a new operation in cl_object_operations{}, says clo_depth,
+	 *    which measures how many layers for this lustre object. Generally
+	 *    speaking, the depth would be 2, one for llite, and one for lovsub.
+	 *    However, for SNS, there will be more since we need additional page
+	 *    to store parity;
+	 * 2. Reserve the # of (page_count * depth) cl_pages from the reserved
+	 *    pool. Afterwards, the clio would allocate the pages from reserved
+	 *    pool, this guarantees we needn't allocate the cl_pages from
+	 *    generic cl_page slab cache.
+	 *    Of course, if there is NOT enough pages in the pool, we might
+	 *    be asked to write less pages once, this purely depends on
+	 *    implementation. Anyway, we should be careful to avoid deadlocking.
+	 */
+	mutex_lock(&inode->i_mutex);
+	bytes = ll_direct_rw_pages(env, io, rw, inode, pvec);
+	mutex_unlock(&inode->i_mutex);
+	cl_io_fini(env, io);
+	return (bytes == pvec->ldp_size) ? 0 : (int)bytes;
+}
+
+/*
+ * Add bio to back of pending list and wake the worker thread.
+ *
+ * lo_pending is incremented while lo_lock is still held.  loop_get_bio()
+ * dequeues under the same lock, so the worker can never take a bio off the
+ * list before that bio has been counted; counting after the unlock would
+ * let the worker's "count <= lo_pending" assertion fire.
+ *
+ * The wake-up is unconditional: testing waitqueue_active() without a
+ * memory barrier can miss a worker that is just about to sleep.
+ */
+static void loop_add_bio(struct lloop_device *lo, struct bio *bio)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&lo->lo_lock, flags);
+	if (lo->lo_biotail) {
+		lo->lo_biotail->bi_next = bio;
+		lo->lo_biotail = bio;
+	} else {
+		lo->lo_bio = lo->lo_biotail = bio;
+	}
+	atomic_inc(&lo->lo_pending);
+	spin_unlock_irqrestore(&lo->lo_lock, flags);
+
+	wake_up(&lo->lo_bh_wait);
+}
+
+/*
+ * Detach a batch of bios from the front of the pending list.
+ *
+ * Starting at the head, consecutive bios are taken as long as they have the
+ * same bi_rw as the first one and the running segment total stays within
+ * LLOOP_MAX_SEGMENTS.  The batch is returned through @req as a
+ * NULL-terminated bi_next chain; the remaining bios stay queued.
+ *
+ * Returns the number of bios in the batch, or 0 (leaving *req untouched)
+ * when the list is empty.
+ */
+static unsigned int loop_get_bio(struct lloop_device *lo, struct bio **req)
+{
+	struct bio *first;
+	struct bio **bio;
+	unsigned int count = 0;
+	unsigned int page_count = 0;
+	int rw;
+
+	spin_lock_irq(&lo->lo_lock);
+	first = lo->lo_bio;
+	if (unlikely(first == NULL)) {
+		spin_unlock_irq(&lo->lo_lock);
+		return 0;
+	}
+
+	/* TODO: need to split the bio, too bad. */
+	LASSERT(first->bi_vcnt <= LLOOP_MAX_SEGMENTS);
+
+	rw = first->bi_rw;
+	/* 'bio' always points at the link that leads to the next candidate */
+	bio = &lo->lo_bio;
+	while (*bio && (*bio)->bi_rw == rw) {
+		CDEBUG(D_INFO, "bio sector %llu size %u count %u vcnt%u \n",
+		       (unsigned long long)(*bio)->bi_sector, (*bio)->bi_size,
+		       page_count, (*bio)->bi_vcnt);
+		if (page_count + (*bio)->bi_vcnt > LLOOP_MAX_SEGMENTS)
+			break;
+
+
+		page_count += (*bio)->bi_vcnt;
+		count++;
+		bio = &(*bio)->bi_next;
+	}
+	if (*bio) {
+		/*
+		 * The batch stopped early (different direction or segment
+		 * limit): the rest becomes the new list head and the batch
+		 * is cut off by clearing the link that pointed at it.
+		 */
+		lo->lo_bio = *bio;
+		*bio = NULL;
+	} else {
+		/* Hit the end of queue */
+		lo->lo_biotail = NULL;
+		lo->lo_bio = NULL;
+	}
+	*req = first;
+	spin_unlock_irq(&lo->lo_lock);
+	return count;
+}
+
+/*
+ * make_request_fn of the lloop queue.
+ *
+ * A bio is accepted only while the device is LLOOP_BOUND, and only if it is
+ * a READ, a READA, or a WRITE to a device that is not read-only.  Accepted
+ * bios are queued for the worker thread; everything else is completed
+ * immediately through cfs_bio_io_error().
+ */
+static ll_mrf_ret
+loop_make_request(struct request_queue *q, struct bio *old_bio)
+{
+	struct lloop_device *lo = q->queuedata;
+	int rw = bio_rw(old_bio);
+	int bound;
+
+	if (lo == NULL)
+		goto err;
+
+	CDEBUG(D_INFO, "submit bio sector %llu size %u\n",
+	       (unsigned long long)old_bio->bi_sector, old_bio->bi_size);
+
+	spin_lock_irq(&lo->lo_lock);
+	bound = (lo->lo_state == LLOOP_BOUND);
+	spin_unlock_irq(&lo->lo_lock);
+	if (!bound)
+		goto err;
+
+	/* writes are refused on a read-only device */
+	if (rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY))
+		goto err;
+
+	/* anything that is neither a write nor a (readahead) read is unknown */
+	if (rw != WRITE && rw != READA && rw != READ) {
+		CERROR("lloop: unknown command (%x)\n", rw);
+		goto err;
+	}
+
+	loop_add_bio(lo, old_bio);
+	LL_MRF_RETURN(0);
+err:
+	cfs_bio_io_error(old_bio, old_bio->bi_size);
+	LL_MRF_RETURN(0);
+}
+
+
+/*
+ * Run one batch of bios through do_bio_lustrebacked() and then complete
+ * every bio of the chain with the batch's result, unlinking each bio from
+ * the chain before it is completed.
+ */
+static inline void loop_handle_bio(struct lloop_device *lo, struct bio *bio)
+{
+	struct bio *next;
+	int rc = do_bio_lustrebacked(lo, bio);
+
+	for (; bio != NULL; bio = next) {
+		next = bio->bi_next;
+		bio->bi_next = NULL;
+		cfs_bio_endio(bio, bio->bi_size, rc);
+	}
+}
+
+/*
+ * Wake-up condition of the worker thread: non-zero when bios are pending
+ * or the device has been put into LLOOP_RUNDOWN.
+ */
+static inline int loop_active(struct lloop_device *lo)
+{
+	if (atomic_read(&lo->lo_pending))
+		return 1;
+
+	return lo->lo_state == LLOOP_RUNDOWN;
+}
+
+/*
+ * worker thread that handles reads/writes to file backed loop devices,
+ * to avoid blocking in our make_request_fn.
+ *
+ * Start-up: marks the device LLOOP_BOUND, obtains a cl environment, points
+ * lo_pvec at the preallocated arrays in lo_requests[0] and ups lo_sem so
+ * that loop_set_fd() can return.
+ *
+ * Main loop: sleeps until loop_active(), exits when the device is in
+ * LLOOP_RUNDOWN with nothing pending, otherwise pulls one batch of bios and
+ * processes it.  lo_sem is upped once more on exit, which releases
+ * loop_clr_fd().
+ *
+ * NOTE(review): if cl_env_get() fails, lo_state has already been set to
+ * LLOOP_BOUND and the single up() at "out" lets loop_set_fd() return 0
+ * although no worker is running -- confirm and handle.
+ */
+static int loop_thread(void *data)
+{
+	struct lloop_device *lo = data;
+	struct bio *bio;
+	unsigned int count;
+	/* batching statistics, only used for the periodic debug message */
+	unsigned long times = 0;
+	unsigned long total_count = 0;
+
+	struct lu_env *env;
+	int refcheck;
+	int ret = 0;
+
+	set_user_nice(current, -20);
+
+	lo->lo_state = LLOOP_BOUND;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		GOTO(out, ret = PTR_ERR(env));
+
+	lo->lo_env = env;
+	memset(&lo->lo_pvec, 0, sizeof(lo->lo_pvec));
+	lo->lo_pvec.ldp_pages   = lo->lo_requests[0].lrd_pages;
+	lo->lo_pvec.ldp_offsets = lo->lo_requests[0].lrd_offsets;
+
+	/*
+	 * up sem, we are running
+	 */
+	up(&lo->lo_sem);
+
+	for (;;) {
+		wait_event(lo->lo_bh_wait, loop_active(lo));
+		/* nothing queued: we were woken only to check for rundown */
+		if (!atomic_read(&lo->lo_pending)) {
+			int exiting = 0;
+			spin_lock_irq(&lo->lo_lock);
+			exiting = (lo->lo_state == LLOOP_RUNDOWN);
+			spin_unlock_irq(&lo->lo_lock);
+			if (exiting)
+				break;
+		}
+
+		bio = NULL;
+		count = loop_get_bio(lo, &bio);
+		if (!count) {
+			CWARN("lloop(minor: %d): missing bio\n", lo->lo_number);
+			continue;
+		}
+
+		total_count += count;
+		if (total_count < count) {     /* overflow */
+			total_count = count;
+			times = 1;
+		} else {
+			times++;
+		}
+		/* report the average batch size every 128 batches */
+		if ((times & 127) == 0) {
+			CDEBUG(D_INFO, "total: %lu, count: %lu, avg: %lu\n",
+			       total_count, times, total_count / times);
+		}
+
+		LASSERT(bio != NULL);
+		LASSERT(count <= atomic_read(&lo->lo_pending));
+		loop_handle_bio(lo, bio);
+		/* the batch is complete; drop it from the pending count */
+		atomic_sub(count, &lo->lo_pending);
+	}
+	cl_env_put(env, &refcheck);
+
+out:
+	up(&lo->lo_sem);
+	return ret;
+}
+
+/*
+ * Bind @file as the backing store of @lo and start its worker thread.
+ *
+ * @lo:     device to bind; must be LLOOP_UNBOUND
+ * @unused: ignored
+ * @bdev:   block device of @lo
+ * @file:   regular file on a Lustre filesystem (s_magic LL_SUPER_MAGIC)
+ *
+ * A module reference is taken for the lifetime of the binding.  The device
+ * is read-only when @file was not opened for writing.  On success the
+ * function returns only after the worker thread has signalled lo_sem.
+ *
+ * Returns 0 on success, -ENODEV if the module reference cannot be taken,
+ * -EBUSY if the device is already bound, -EINVAL for an unsuitable file,
+ * -EFBIG if the size does not fit a sector_t, or the kthread_run() error
+ * if the worker thread cannot be created.  On failure the caller still
+ * owns its references on @file and @bdev.
+ */
+static int loop_set_fd(struct lloop_device *lo, struct file *unused,
+		       struct block_device *bdev, struct file *file)
+{
+	struct inode	 *inode;
+	struct address_space *mapping;
+	struct task_struct   *task;
+	int		   lo_flags = 0;
+	int		   error;
+	loff_t		size;
+
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	error = -EBUSY;
+	if (lo->lo_state != LLOOP_UNBOUND)
+		goto out;
+
+	mapping = file->f_mapping;
+	inode = mapping->host;
+
+	error = -EINVAL;
+	if (!S_ISREG(inode->i_mode) || inode->i_sb->s_magic != LL_SUPER_MAGIC)
+		goto out;
+
+	if (!(file->f_mode & FMODE_WRITE))
+		lo_flags |= LO_FLAGS_READ_ONLY;
+
+	size = get_loop_size(lo, file);
+
+	if ((loff_t)(sector_t)size != size) {
+		error = -EFBIG;
+		goto out;
+	}
+
+	/* remove all pages in cache so as dirty pages not to be existent. */
+	truncate_inode_pages(mapping, 0);
+
+	set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
+
+	lo->lo_blocksize = PAGE_CACHE_SIZE;
+	lo->lo_device = bdev;
+	lo->lo_flags = lo_flags;
+	lo->lo_backing_file = file;
+	lo->ioctl = NULL;
+	lo->lo_sizelimit = 0;
+	/* page allocations for the backing mapping must not recurse into I/O */
+	lo->old_gfp_mask = mapping_gfp_mask(mapping);
+	mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
+
+	lo->lo_bio = lo->lo_biotail = NULL;
+
+	/*
+	 * set queue make_request_fn, and add limits based on lower level
+	 * device
+	 */
+	blk_queue_make_request(lo->lo_queue, loop_make_request);
+	lo->lo_queue->queuedata = lo;
+
+	/* queue parameters */
+	CLASSERT(PAGE_CACHE_SIZE < (1 << (sizeof(unsigned short) * 8)));
+	blk_queue_logical_block_size(lo->lo_queue,
+				     (unsigned short)PAGE_CACHE_SIZE);
+	blk_queue_max_hw_sectors(lo->lo_queue,
+				 LLOOP_MAX_SEGMENTS << (PAGE_CACHE_SHIFT - 9));
+	blk_queue_max_segments(lo->lo_queue, LLOOP_MAX_SEGMENTS);
+
+	set_capacity(disks[lo->lo_number], size);
+	bd_set_size(bdev, size << 9);
+
+	set_blocksize(bdev, lo->lo_blocksize);
+
+	task = kthread_run(loop_thread, lo, "lloop%d", lo->lo_number);
+	if (IS_ERR(task)) {
+		/*
+		 * No worker will ever up lo_sem, so waiting on it would hang
+		 * forever.  Undo the binding instead; lo_state is still
+		 * LLOOP_UNBOUND because only the worker sets LLOOP_BOUND.
+		 */
+		error = PTR_ERR(task);
+		set_capacity(disks[lo->lo_number], 0);
+		bd_set_size(bdev, 0);
+		mapping_set_gfp_mask(mapping, lo->old_gfp_mask);
+		lo->lo_backing_file = NULL;
+		lo->lo_device = NULL;
+		lo->lo_flags = 0;
+		goto out;
+	}
+	/* wait until the worker thread reports that it is running */
+	down(&lo->lo_sem);
+	return 0;
+
+out:
+	/* This is safe: open() is still holding a reference. */
+	module_put(THIS_MODULE);
+	return error;
+}
+
+/*
+ * Unbind the backing file from @lo and stop its worker thread.
+ *
+ * @count is the highest lo_refcnt at which the detach is still allowed
+ * (the callers pass the number of opens they account for themselves).
+ *
+ * Returns -ENXIO if the device is not LLOOP_BOUND, -EBUSY if it is opened
+ * more than @count times, -EINVAL if no backing file is recorded, and 0
+ * after a successful detach.  On success the reference on the backing file
+ * and the module reference taken by loop_set_fd() are dropped.
+ */
+static int loop_clr_fd(struct lloop_device *lo, struct block_device *bdev,
+		       int count)
+{
+	struct file *filp = lo->lo_backing_file;
+	int gfp = lo->old_gfp_mask;
+
+	if (lo->lo_state != LLOOP_BOUND)
+		return -ENXIO;
+
+	if (lo->lo_refcnt > count)	/* we needed one fd for the ioctl */
+		return -EBUSY;
+
+	if (filp == NULL)
+		return -EINVAL;
+
+	/* ask the worker to exit once the pending bios are done */
+	spin_lock_irq(&lo->lo_lock);
+	lo->lo_state = LLOOP_RUNDOWN;
+	spin_unlock_irq(&lo->lo_lock);
+	wake_up(&lo->lo_bh_wait);
+
+	/* loop_thread() ups lo_sem on its way out */
+	down(&lo->lo_sem);
+	lo->lo_backing_file = NULL;
+	lo->ioctl = NULL;
+	lo->lo_device = NULL;
+	lo->lo_offset = 0;
+	lo->lo_sizelimit = 0;
+	lo->lo_flags = 0;
+	ll_invalidate_bdev(bdev, 0);
+	set_capacity(disks[lo->lo_number], 0);
+	bd_set_size(bdev, 0);
+	/* restore the gfp mask saved by loop_set_fd() */
+	mapping_set_gfp_mask(filp->f_mapping, gfp);
+	lo->lo_state = LLOOP_UNBOUND;
+	fput(filp);
+	/* This is safe: open() is still holding a reference. */
+	module_put(THIS_MODULE);
+	return 0;
+}
+
+/* Block device open: count one more opener, serialised by lo_ctl_mutex. */
+static int lo_open(struct block_device *bdev, fmode_t mode)
+{
+	struct gendisk *disk = bdev->bd_disk;
+	struct lloop_device *lo = disk->private_data;
+
+	mutex_lock(&lo->lo_ctl_mutex);
+	lo->lo_refcnt += 1;
+	mutex_unlock(&lo->lo_ctl_mutex);
+
+	return 0;
+}
+
+/* Block device release: drop one opener, serialised by lo_ctl_mutex. */
+static int lo_release(struct gendisk *disk, fmode_t mode)
+{
+	struct lloop_device *lo;
+
+	lo = disk->private_data;
+
+	mutex_lock(&lo->lo_ctl_mutex);
+	lo->lo_refcnt -= 1;
+	mutex_unlock(&lo->lo_ctl_mutex);
+
+	return 0;
+}
+
+/*
+ * lloop device node's ioctl function.
+ *
+ * LL_IOC_LLOOP_DETACH: detach the backing file; on success also drop the
+ *	block device reference taken at attach time.
+ * LL_IOC_LLOOP_INFO:   copy the FID of the backing file to the user buffer
+ *	at @arg, or an all-zero FID when the device is not bound.
+ *
+ * Returns 0 on success, -EFAULT if the FID cannot be copied out, -EINVAL
+ * for any other command, or the error from loop_clr_fd().
+ * Serialised against attach/detach by lloop_mutex.
+ */
+static int lo_ioctl(struct block_device *bdev, fmode_t mode,
+		    unsigned int cmd, unsigned long arg)
+{
+	struct lloop_device *lo = bdev->bd_disk->private_data;
+	int err = 0;
+
+	mutex_lock(&lloop_mutex);
+	switch (cmd) {
+	case LL_IOC_LLOOP_DETACH: {
+		err = loop_clr_fd(lo, bdev, 2);
+		if (err == 0)
+			ll_blkdev_put(bdev, 0); /* grabbed in LLOOP_ATTACH */
+		break;
+	}
+
+	case LL_IOC_LLOOP_INFO: {
+		struct lu_fid fid;
+
+		/*
+		 * An unbound device has no backing file.  Userspace may
+		 * issue this ioctl on any lloop node, so this must not be
+		 * an assertion: report a zero FID instead.
+		 */
+		if (lo->lo_state == LLOOP_BOUND &&
+		    lo->lo_backing_file != NULL) {
+			struct inode *inode;
+
+			inode = lo->lo_backing_file->f_dentry->d_inode;
+			fid = ll_i2info(inode)->lli_fid;
+		} else {
+			fid_zero(&fid);
+		}
+
+		if (copy_to_user((struct lu_fid *)arg, &fid, sizeof(fid)))
+			err = -EFAULT;
+		break;
+	}
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+	mutex_unlock(&lloop_mutex);
+
+	return err;
+}
+
+/* Block device operations installed on every lloop gendisk in lloop_init(). */
+static struct block_device_operations lo_fops = {
+	.owner =	THIS_MODULE,
+	.open =	 lo_open,
+	.release =      lo_release,
+	.ioctl =	lo_ioctl,
+};
+
+/* dynamic iocontrol callback.
+ * This callback is registered in lloop_init and will be called by
+ * ll_iocontrol_call.
+ *
+ * This is a llite regular file ioctl function. It takes the responsibility
+ * of attaching a file to a free lloop device, or of detaching a file given
+ * the lloop device number.
+ *
+ * Calls whose @magic is not ours are passed on with LLIOC_CONT.  Otherwise
+ * the result is stored in *@rcp (when non-NULL) and LLIOC_STOP is returned.
+ */
+static enum llioc_iter lloop_ioctl(struct inode *unused, struct file *file,
+				   unsigned int cmd, unsigned long arg,
+				   void *magic, int *rcp)
+{
+	struct lloop_device *lo = NULL;
+	struct block_device *bdev = NULL;
+	int err = 0;
+	dev_t dev;
+
+	if (magic != ll_iocontrol_magic)
+		return LLIOC_CONT;
+
+	if (disks == NULL)
+		GOTO(out1, err = -ENODEV);
+
+	CWARN("Enter llop_ioctl\n");
+
+	mutex_lock(&lloop_mutex);
+	switch (cmd) {
+	case LL_IOC_LLOOP_ATTACH: {
+		struct lloop_device *lo_free = NULL;
+		int i;
+
+		/*
+		 * Remember the first unbound device and look for a device
+		 * that is already backed by this inode.  The "lo = NULL" in
+		 * the increment means lo is non-NULL after the loop only if
+		 * we left it through the break below.
+		 */
+		for (i = 0; i < max_loop; i++, lo = NULL) {
+			lo = &loop_dev[i];
+			if (lo->lo_state == LLOOP_UNBOUND) {
+				if (!lo_free)
+					lo_free = lo;
+				continue;
+			}
+			if (lo->lo_backing_file->f_dentry->d_inode ==
+			    file->f_dentry->d_inode)
+				break;
+		}
+		/* file already attached, or no free device left */
+		if (lo || !lo_free)
+			GOTO(out, err = -EBUSY);
+
+		lo = lo_free;
+		dev = MKDEV(lloop_major, lo->lo_number);
+
+		/* quit if the user pointer is not writable */
+		if (put_user((long)old_encode_dev(dev), (long*)arg))
+			GOTO(out, err = -EFAULT);
+
+		bdev = blkdev_get_by_dev(dev, file->f_mode, NULL);
+		if (IS_ERR(bdev))
+			GOTO(out, err = PTR_ERR(bdev));
+
+		/* the device keeps its own reference on the backing file */
+		get_file(file);
+		err = loop_set_fd(lo, NULL, bdev, file);
+		if (err) {
+			fput(file);
+			ll_blkdev_put(bdev, 0);
+		}
+
+		break;
+	}
+
+	case LL_IOC_LLOOP_DETACH_BYDEV: {
+		int minor;
+
+		/* arg carries the device number in old_encode_dev() format */
+		dev = old_decode_dev(arg);
+		if (MAJOR(dev) != lloop_major)
+			GOTO(out, err = -EINVAL);
+
+		minor = MINOR(dev);
+		if (minor > max_loop - 1)
+			GOTO(out, err = -EINVAL);
+
+		lo = &loop_dev[minor];
+		if (lo->lo_state != LLOOP_BOUND)
+			GOTO(out, err = -EINVAL);
+
+		bdev = lo->lo_device;
+		err = loop_clr_fd(lo, bdev, 1);
+		if (err == 0)
+			ll_blkdev_put(bdev, 0); /* grabbed in LLOOP_ATTACH */
+
+		break;
+	}
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+out:
+	mutex_unlock(&lloop_mutex);
+out1:
+	if (rcp)
+		*rcp = err;
+	return LLIOC_STOP;
+}
+
+/*
+ * Module initialisation.
+ *
+ * Validates max_loop, registers a dynamic block major, allocates the device
+ * and gendisk tables with one request queue per device, registers the
+ * attach/detach ioctl callback with llite and finally adds the disks.
+ *
+ * The ioctl callback is registered only after lloop_mutex and all device
+ * state have been initialised, because lloop_ioctl() may run as soon as it
+ * is registered.  For the same reason no callback is registered on any of
+ * the error paths when the tables are freed.
+ *
+ * Returns 0 on success, the register_blkdev() error if no major could be
+ * obtained, or -ENOMEM if any later step fails.
+ */
+static int __init lloop_init(void)
+{
+	int	i;
+	unsigned int cmdlist[] = {
+		LL_IOC_LLOOP_ATTACH,
+		LL_IOC_LLOOP_DETACH_BYDEV,
+	};
+
+	if (max_loop < 1 || max_loop > 256) {
+		max_loop = MAX_LOOP_DEFAULT;
+		CWARN("lloop: invalid max_loop (must be between"
+		      " 1 and 256), using default (%u)\n", max_loop);
+	}
+
+	mutex_init(&lloop_mutex);
+
+	lloop_major = register_blkdev(0, "lloop");
+	if (lloop_major < 0)
+		return lloop_major;
+
+	CDEBUG(D_CONFIG, "registered lloop major %d with %u minors\n",
+	       lloop_major, max_loop);
+
+	OBD_ALLOC_WAIT(loop_dev, max_loop * sizeof(*loop_dev));
+	if (!loop_dev)
+		goto out_mem1;
+
+	OBD_ALLOC_WAIT(disks, max_loop * sizeof(*disks));
+	if (!disks)
+		goto out_mem2;
+
+	for (i = 0; i < max_loop; i++) {
+		disks[i] = alloc_disk(1);
+		if (!disks[i])
+			goto out_mem3;
+	}
+
+	for (i = 0; i < max_loop; i++) {
+		struct lloop_device *lo = &loop_dev[i];
+		struct gendisk *disk = disks[i];
+
+		lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
+		if (!lo->lo_queue)
+			goto out_mem4;
+
+		mutex_init(&lo->lo_ctl_mutex);
+		sema_init(&lo->lo_sem, 0);
+		init_waitqueue_head(&lo->lo_bh_wait);
+		lo->lo_number = i;
+		spin_lock_init(&lo->lo_lock);
+		disk->major = lloop_major;
+		disk->first_minor = i;
+		disk->fops = &lo_fops;
+		sprintf(disk->disk_name, "lloop%d", i);
+		disk->private_data = lo;
+		disk->queue = lo->lo_queue;
+	}
+
+	/* everything lloop_ioctl() touches is ready now */
+	ll_iocontrol_magic = ll_iocontrol_register(lloop_ioctl, 2, cmdlist);
+	if (ll_iocontrol_magic == NULL) {
+		/* all max_loop queues exist and must be cleaned up */
+		i = max_loop;
+		goto out_mem4;
+	}
+
+	/* We cannot fail after we call this, so another loop!*/
+	for (i = 0; i < max_loop; i++)
+		add_disk(disks[i]);
+	return 0;
+
+out_mem4:
+	/* i is the number of queues that were allocated */
+	while (i--)
+		blk_cleanup_queue(loop_dev[i].lo_queue);
+	i = max_loop;
+out_mem3:
+	/* i is the number of disks that were allocated */
+	while (i--)
+		put_disk(disks[i]);
+	OBD_FREE(disks, max_loop * sizeof(*disks));
+out_mem2:
+	OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev));
+out_mem1:
+	unregister_blkdev(lloop_major, "lloop");
+	CERROR("lloop: ran out of memory\n");
+	return -ENOMEM;
+}
+
+/*
+ * Module teardown: unregister the ioctl callback first so that no new
+ * attach/detach request can arrive, then remove every disk together with
+ * its queue, release the block major and free the device tables.
+ */
+static void lloop_exit(void)
+{
+	int i;
+
+	ll_iocontrol_unregister(ll_iocontrol_magic);
+	for (i = 0; i < max_loop; i++) {
+		del_gendisk(disks[i]);
+		blk_cleanup_queue(loop_dev[i].lo_queue);
+		put_disk(disks[i]);
+	}
+	if (ll_unregister_blkdev(lloop_major, "lloop"))
+		CWARN("lloop: cannot unregister blkdev\n");
+	else
+		CDEBUG(D_CONFIG, "unregistered lloop major %d\n", lloop_major);
+
+	OBD_FREE(disks, max_loop * sizeof(*disks));
+	OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev));
+}
+
+/* Module entry/exit points and metadata. */
+module_init(lloop_init);
+module_exit(lloop_exit);
+
+/* max_loop is read-only (0444) once the module is loaded */
+CFS_MODULE_PARM(max_loop, "i", int, 0444, "maximum of lloop_device");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre virtual block device");
+MODULE_LICENSE("GPL");
diff --git a/drivers/staging/lustre/lustre/llite/lproc_llite.c b/drivers/staging/lustre/lustre/llite/lproc_llite.c
new file mode 100644
index 000000000000..22e19a6c0461
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/lproc_llite.c
@@ -0,0 +1,1397 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/version.h>
+#include <lustre_lite.h>
+#include <lprocfs_status.h>
+#include <linux/seq_file.h>
+#include <obd_support.h>
+
+#include "llite_internal.h"
+
+/* Defined unconditionally, i.e. also when LPROCFS is not set. */
+struct proc_dir_entry *proc_lustre_fs_root;
+
+#ifdef LPROCFS
+/* /proc/lustre/llite mount point registration */
+extern struct file_operations vvp_dump_pgcache_file_ops;
+/* file operations for the read/write extent and offset statistics files */
+struct file_operations ll_rw_extents_stats_fops;
+struct file_operations ll_rw_extents_stats_pp_fops;
+struct file_operations ll_rw_offset_stats_fops;
+
+/*
+ * procfs read handler: print the filesystem block size (os_bsize) taken
+ * from a non-blocking statfs of the superblock passed in @data.
+ * Returns the snprintf() length, or the statfs error.
+ */
+static int ll_rd_blksize(char *page, char **start, off_t off, int count,
+			 int *eof, void *data)
+{
+	struct super_block *sb = data;
+	struct obd_statfs osfs;
+	int rc;
+
+	LASSERT(sb != NULL);
+	rc = ll_statfs_internal(sb, &osfs,
+				cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+				OBD_STATFS_NODELAY);
+	if (rc)
+		return rc;
+
+	*eof = 1;
+	return snprintf(page, count, "%u\n", osfs.os_bsize);
+}
+
+/*
+ * procfs read handler: print the total filesystem size in kilobytes.
+ * os_blocks is scaled by doubling it once for every power of two by which
+ * the block size exceeds 1KiB.
+ * Returns the snprintf() length, or the statfs error.
+ */
+static int ll_rd_kbytestotal(char *page, char **start, off_t off, int count,
+			     int *eof, void *data)
+{
+	struct super_block *sb = data;
+	struct obd_statfs osfs;
+	__u32 kb_per_block;
+	__u64 kbytes;
+	int rc;
+
+	LASSERT(sb != NULL);
+	rc = ll_statfs_internal(sb, &osfs,
+				cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+				OBD_STATFS_NODELAY);
+	if (rc)
+		return rc;
+
+	kbytes = osfs.os_blocks;
+	kb_per_block = osfs.os_bsize >> 10;
+	for (kb_per_block >>= 1; kb_per_block != 0; kb_per_block >>= 1)
+		kbytes <<= 1;
+
+	*eof = 1;
+	return snprintf(page, count, LPU64"\n", kbytes);
+}
+
+/*
+ * procfs read handler: print the free space in kilobytes.
+ * os_bfree is scaled by doubling it once for every power of two by which
+ * the block size exceeds 1KiB.
+ * Returns the snprintf() length, or the statfs error.
+ */
+static int ll_rd_kbytesfree(char *page, char **start, off_t off, int count,
+			    int *eof, void *data)
+{
+	struct super_block *sb = data;
+	struct obd_statfs osfs;
+	__u32 kb_per_block;
+	__u64 kbytes;
+	int rc;
+
+	LASSERT(sb != NULL);
+	rc = ll_statfs_internal(sb, &osfs,
+				cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+				OBD_STATFS_NODELAY);
+	if (rc)
+		return rc;
+
+	kbytes = osfs.os_bfree;
+	kb_per_block = osfs.os_bsize >> 10;
+	for (kb_per_block >>= 1; kb_per_block != 0; kb_per_block >>= 1)
+		kbytes <<= 1;
+
+	*eof = 1;
+	return snprintf(page, count, LPU64"\n", kbytes);
+}
+
+/*
+ * procfs read handler: print the available space in kilobytes.
+ * os_bavail is scaled by doubling it once for every power of two by which
+ * the block size exceeds 1KiB.
+ * Returns the snprintf() length, or the statfs error.
+ */
+static int ll_rd_kbytesavail(char *page, char **start, off_t off, int count,
+			     int *eof, void *data)
+{
+	struct super_block *sb = data;
+	struct obd_statfs osfs;
+	__u32 kb_per_block;
+	__u64 kbytes;
+	int rc;
+
+	LASSERT(sb != NULL);
+	rc = ll_statfs_internal(sb, &osfs,
+				cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+				OBD_STATFS_NODELAY);
+	if (rc)
+		return rc;
+
+	kbytes = osfs.os_bavail;
+	kb_per_block = osfs.os_bsize >> 10;
+	for (kb_per_block >>= 1; kb_per_block != 0; kb_per_block >>= 1)
+		kbytes <<= 1;
+
+	*eof = 1;
+	return snprintf(page, count, LPU64"\n", kbytes);
+}
+
+/*
+ * procfs read handler: print os_files from a non-blocking statfs.
+ * Returns the snprintf() length, or the statfs error.
+ */
+static int ll_rd_filestotal(char *page, char **start, off_t off, int count,
+			    int *eof, void *data)
+{
+	struct super_block *sb = data;
+	struct obd_statfs osfs;
+	int rc;
+
+	LASSERT(sb != NULL);
+	rc = ll_statfs_internal(sb, &osfs,
+				cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+				OBD_STATFS_NODELAY);
+	if (rc)
+		return rc;
+
+	*eof = 1;
+	return snprintf(page, count, LPU64"\n", osfs.os_files);
+}
+
+/*
+ * procfs read handler: print os_ffree from a non-blocking statfs.
+ * Returns the snprintf() length, or the statfs error.
+ */
+static int ll_rd_filesfree(char *page, char **start, off_t off, int count,
+			   int *eof, void *data)
+{
+	struct super_block *sb = data;
+	struct obd_statfs osfs;
+	int rc;
+
+	LASSERT(sb != NULL);
+	rc = ll_statfs_internal(sb, &osfs,
+				cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+				OBD_STATFS_NODELAY);
+	if (rc)
+		return rc;
+
+	*eof = 1;
+	return snprintf(page, count, LPU64"\n", osfs.os_ffree);
+}
+
+/*
+ * procfs read handler: print "remote client" when LL_SBI_RMT_CLIENT is set
+ * in the superblock info flags, "local client" otherwise.
+ */
+static int ll_rd_client_type(char *page, char **start, off_t off, int count,
+			    int *eof, void *data)
+{
+	struct super_block *sb = data;
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+	LASSERT(sbi != NULL);
+
+	*eof = 1;
+	if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
+		return snprintf(page, count, "remote client\n");
+
+	return snprintf(page, count, "local client\n");
+}
+
+/* procfs read handler: print the filesystem type name of the superblock. */
+static int ll_rd_fstype(char *page, char **start, off_t off, int count,
+			int *eof, void *data)
+{
+	struct super_block *sb = data;
+	const char *fsname;
+
+	LASSERT(sb != NULL);
+	fsname = sb->s_type->name;
+
+	*eof = 1;
+	return snprintf(page, count, "%s\n", fsname);
+}
+
+/* procfs read handler: print the UUID string stored in the superblock info. */
+static int ll_rd_sb_uuid(char *page, char **start, off_t off, int count,
+			 int *eof, void *data)
+{
+	struct super_block *sb = data;
+	struct ll_sb_info *sbi;
+
+	LASSERT(sb != NULL);
+	sbi = ll_s2sbi(sb);
+
+	*eof = 1;
+	return snprintf(page, count, "%s\n", sbi->ll_sb_uuid.uuid);
+}
+
+/*
+ * procfs read handler: print the statistical counters of the cl_site that
+ * belongs to this superblock.  See the description of the counters in
+ * struct cl_site and struct lu_site.
+ */
+static int ll_rd_site_stats(char *page, char **start, off_t off,
+			    int count, int *eof, void *data)
+{
+	struct super_block *sb = data;
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+	return cl_site_stats_print(lu2cl_site(sbi->ll_site), page, count);
+}
+
+/*
+ * procfs read handler: print ra_max_pages in megabytes.  The value is
+ * sampled under ll_lock and formatted by lprocfs_read_frac_helper() with
+ * the number of pages per megabyte as divisor.
+ */
+static int ll_rd_max_readahead_mb(char *page, char **start, off_t off,
+				   int count, int *eof, void *data)
+{
+	struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+	int pages_per_mb = 1 << (20 - PAGE_CACHE_SHIFT);
+	long ra_pages;
+
+	spin_lock(&sbi->ll_lock);
+	ra_pages = sbi->ll_ra_info.ra_max_pages;
+	spin_unlock(&sbi->ll_lock);
+
+	return lprocfs_read_frac_helper(page, count, ra_pages, pages_per_mb);
+}
+
+/*
+ * procfs write handler: set ra_max_pages from a value given in megabytes.
+ * Values that are negative or exceed half of num_physpages are rejected
+ * with -ERANGE.  Returns @count on success or the parse error.
+ */
+static int ll_wr_max_readahead_mb(struct file *file, const char *buffer,
+				  unsigned long count, void *data)
+{
+	struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+	int pages_per_mb = 1 << (20 - PAGE_CACHE_SHIFT);
+	int pages_number;
+	int rc;
+
+	rc = lprocfs_write_frac_helper(buffer, count, &pages_number,
+				       pages_per_mb);
+	if (rc != 0)
+		return rc;
+
+	if (pages_number < 0 || pages_number > num_physpages / 2) {
+		CERROR("can't set file readahead more than %lu MB\n",
+		       num_physpages >> (20 - PAGE_CACHE_SHIFT + 1)); /*1/2 of RAM*/
+		return -ERANGE;
+	}
+
+	spin_lock(&sbi->ll_lock);
+	sbi->ll_ra_info.ra_max_pages = pages_number;
+	spin_unlock(&sbi->ll_lock);
+
+	return count;
+}
+
+/*
+ * procfs read handler: print ra_max_pages_per_file in megabytes.  The
+ * value is sampled under ll_lock and formatted by
+ * lprocfs_read_frac_helper() with the pages-per-megabyte divisor.
+ */
+static int ll_rd_max_readahead_per_file_mb(char *page, char **start, off_t off,
+					   int count, int *eof, void *data)
+{
+	struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+	int pages_per_mb = 1 << (20 - PAGE_CACHE_SHIFT);
+	long ra_pages;
+
+	spin_lock(&sbi->ll_lock);
+	ra_pages = sbi->ll_ra_info.ra_max_pages_per_file;
+	spin_unlock(&sbi->ll_lock);
+
+	return lprocfs_read_frac_helper(page, count, ra_pages, pages_per_mb);
+}
+
+/*
+ * procfs write handler: set ra_max_pages_per_file from a value given in
+ * megabytes.
+ *
+ * The value must not be negative and must not exceed the global
+ * ra_max_pages limit; otherwise -ERANGE is returned and the limit is
+ * logged, converted from pages to megabytes.
+ * Returns @count on success or the parse error.
+ */
+static int ll_wr_max_readahead_per_file_mb(struct file *file, const char *buffer,
+					  unsigned long count, void *data)
+{
+	struct super_block *sb = data;
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	int mult, rc, pages_number;
+
+	/* pages per megabyte */
+	mult = 1 << (20 - PAGE_CACHE_SHIFT);
+	rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+	if (rc)
+		return rc;
+
+	if (pages_number < 0 ||
+	    pages_number > sbi->ll_ra_info.ra_max_pages) {
+		/* ra_max_pages counts pages; shift to report megabytes */
+		CERROR("can't set file readahead more than "
+		       "max_read_ahead_mb %lu MB\n",
+		       sbi->ll_ra_info.ra_max_pages >> (20 - PAGE_CACHE_SHIFT));
+		return -ERANGE;
+	}
+
+	spin_lock(&sbi->ll_lock);
+	sbi->ll_ra_info.ra_max_pages_per_file = pages_number;
+	spin_unlock(&sbi->ll_lock);
+
+	return count;
+}
+
+/*
+ * procfs read handler: print ra_max_read_ahead_whole_pages in megabytes.
+ * The value is sampled under ll_lock and formatted by
+ * lprocfs_read_frac_helper() with the pages-per-megabyte divisor.
+ */
+static int ll_rd_max_read_ahead_whole_mb(char *page, char **start, off_t off,
+					 int count, int *eof, void *data)
+{
+	struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+	int pages_per_mb = 1 << (20 - PAGE_CACHE_SHIFT);
+	long ra_pages;
+
+	spin_lock(&sbi->ll_lock);
+	ra_pages = sbi->ll_ra_info.ra_max_read_ahead_whole_pages;
+	spin_unlock(&sbi->ll_lock);
+
+	return lprocfs_read_frac_helper(page, count, ra_pages, pages_per_mb);
+}
+
+/*
+ * procfs write handler: set ra_max_read_ahead_whole_pages from a value
+ * given in megabytes.  Negative values and values above
+ * ra_max_pages_per_file are rejected with -ERANGE.
+ * Returns @count on success or the parse error.
+ */
+static int ll_wr_max_read_ahead_whole_mb(struct file *file, const char *buffer,
+					 unsigned long count, void *data)
+{
+	struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+	int pages_per_mb = 1 << (20 - PAGE_CACHE_SHIFT);
+	int pages_number;
+	int rc;
+
+	rc = lprocfs_write_frac_helper(buffer, count, &pages_number,
+				       pages_per_mb);
+	if (rc != 0)
+		return rc;
+
+	/* Cap this at the current max readahead window size, the readahead
+	 * algorithm does this anyway so it's pointless to set it larger. */
+	if (pages_number < 0 ||
+	    pages_number > sbi->ll_ra_info.ra_max_pages_per_file) {
+		CERROR("can't set max_read_ahead_whole_mb more than "
+		       "max_read_ahead_per_file_mb: %lu\n",
+			sbi->ll_ra_info.ra_max_pages_per_file >> (20 - PAGE_CACHE_SHIFT));
+		return -ERANGE;
+	}
+
+	spin_lock(&sbi->ll_lock);
+	sbi->ll_ra_info.ra_max_read_ahead_whole_pages = pages_number;
+	spin_unlock(&sbi->ll_lock);
+
+	return count;
+}
+
+/*
+ * procfs read handler for "max_cached_mb".
+ *
+ * Prints the number of cache users, the LRU budget (ccc_lru_max), the used
+ * and unused parts of that budget (derived from ccc_lru_left), all converted
+ * from pages to MB, plus the ccc_lru_shrinkers counter.  Sets *eof.
+ */
+static int ll_rd_max_cached_mb(char *page, char **start, off_t off,
+			       int count, int *eof, void *data)
+{
+	struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+	struct cl_client_cache *lru = &sbi->ll_cache;
+	int mb_shift = 20 - PAGE_CACHE_SHIFT;
+	int total_mb;
+	int free_mb;
+	int used_mb;
+
+	*eof = 1;
+
+	total_mb = lru->ccc_lru_max >> mb_shift;
+	free_mb = atomic_read(&lru->ccc_lru_left) >> mb_shift;
+	used_mb = total_mb - free_mb;
+
+	return snprintf(page, count,
+			"users: %d\n"
+			"max_cached_mb: %d\n"
+			"used_mb: %d\n"
+			"unused_mb: %d\n"
+			"reclaim_count: %u\n",
+			atomic_read(&lru->ccc_users),
+			total_mb,
+			used_mb,
+			free_mb,
+			lru->ccc_lru_shrinkers);
+}
+
+/*
+ * procfs write handler for "max_cached_mb": resize the client LRU budget.
+ *
+ * Growing the budget simply adds the difference to ccc_lru_left.  Shrinking
+ * it first takes slots out of ccc_lru_left and, while that is not enough,
+ * asks the data export to drop LRU slots (KEY_CACHE_LRU_SHRINK) and retries.
+ * ccc_lru_max is only updated once the whole difference has been handled;
+ * on failure the slots already taken are given back to ccc_lru_left.
+ *
+ * Returns \a count on success; -ERANGE if the value is negative or larger
+ * than num_physpages; -ENODEV if ll_dt_exp is not set; otherwise the error
+ * from lprocfs_write_frac_helper() or obd_set_info_async().
+ *
+ * NOTE(review): the function opens with ENTRY but the final statement is a
+ * plain return rather than RETURN() - confirm whether that is intended.
+ */
+static int ll_wr_max_cached_mb(struct file *file, const char *buffer,
+			       unsigned long count, void *data)
+{
+	struct super_block *sb = data;
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	struct cl_client_cache *cache = &sbi->ll_cache;
+	int mult, rc, pages_number;
+	int diff = 0;
+	int nrpages = 0;
+	ENTRY;
+
+	/* pages per MB */
+	mult = 1 << (20 - PAGE_CACHE_SHIFT);
+	/* NOTE(review): presumably skips an optional "max_cached_mb:" prefix
+	 * and adjusts count accordingly - confirm against the helper. */
+	buffer = lprocfs_find_named_value(buffer, "max_cached_mb:", &count);
+	rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+	if (rc)
+		RETURN(rc);
+
+	if (pages_number < 0 || pages_number > num_physpages) {
+		CERROR("%s: can't set max cache more than %lu MB\n",
+		       ll_get_fsname(sb, NULL, 0),
+		       num_physpages >> (20 - PAGE_CACHE_SHIFT));
+		RETURN(-ERANGE);
+	}
+
+	if (sbi->ll_dt_exp == NULL)
+		RETURN(-ENODEV);
+
+	/* signed change in budget: positive grows, negative shrinks */
+	spin_lock(&sbi->ll_lock);
+	diff = pages_number - cache->ccc_lru_max;
+	spin_unlock(&sbi->ll_lock);
+
+	/* easy - add more LRU slots. */
+	if (diff >= 0) {
+		atomic_add(diff, &cache->ccc_lru_left);
+		GOTO(out, rc = 0);
+	}
+
+	/* from here on diff is the number of slots still to be removed */
+	diff = -diff;
+	while (diff > 0) {
+		int tmp;
+
+		/* reduce LRU budget from free slots. */
+		do {
+			int ov, nv;
+
+			ov = atomic_read(&cache->ccc_lru_left);
+			if (ov == 0)
+				break;
+
+			/* take as many free slots as needed, at most all */
+			nv = ov > diff ? ov - diff : 0;
+			/* rc temporarily holds the cmpxchg result; the swap
+			 * took effect only if it equals the value read */
+			rc = cfs_atomic_cmpxchg(&cache->ccc_lru_left, ov, nv);
+			if (likely(ov == rc)) {
+				diff -= ov - nv;
+				/* remember what was taken for the error path */
+				nrpages += ov - nv;
+				break;
+			}
+		} while (1);
+
+		if (diff <= 0)
+			break;
+
+		/* difficult - have to ask OSCs to drop LRU slots. */
+		/* request twice the remaining shortfall */
+		tmp = diff << 1;
+		rc = obd_set_info_async(NULL, sbi->ll_dt_exp,
+				sizeof(KEY_CACHE_LRU_SHRINK),
+				KEY_CACHE_LRU_SHRINK,
+				sizeof(tmp), &tmp, NULL);
+		if (rc < 0)
+			break;
+	}
+
+out:
+	if (rc >= 0) {
+		/* commit the new budget */
+		spin_lock(&sbi->ll_lock);
+		cache->ccc_lru_max = pages_number;
+		spin_unlock(&sbi->ll_lock);
+		rc = count;
+	} else {
+		/* undo: return the slots taken above to the free pool */
+		atomic_add(nrpages, &cache->ccc_lru_left);
+	}
+	return rc;
+}
+
+/*
+ * procfs read handler for "checksum_pages": prints 1 if LL_SBI_CHECKSUM is
+ * set in ll_flags, 0 otherwise.
+ */
+static int ll_rd_checksum(char *page, char **start, off_t off,
+			  int count, int *eof, void *data)
+{
+	struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+	int enabled = 0;
+
+	if (sbi->ll_flags & LL_SBI_CHECKSUM)
+		enabled = 1;
+
+	return snprintf(page, count, "%u\n", enabled);
+}
+
+/*
+ * procfs write handler for "checksum_pages".
+ *
+ * A non-zero value sets LL_SBI_CHECKSUM in ll_flags, zero clears it.  The
+ * value is then passed to the data export with KEY_CHECKSUM; a failure of
+ * that call is only logged and the write still returns \a count.
+ *
+ * Returns -EAGAIN if ll_dt_exp is not set, or the error from
+ * lprocfs_write_helper().
+ */
+static int ll_wr_checksum(struct file *file, const char *buffer,
+			  unsigned long count, void *data)
+{
+	struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+	int enable;
+	int rc;
+
+	/* Not set up yet */
+	if (sbi->ll_dt_exp == NULL)
+		return -EAGAIN;
+
+	rc = lprocfs_write_helper(buffer, count, &enable);
+	if (rc != 0)
+		return rc;
+
+	if (enable != 0)
+		sbi->ll_flags |= LL_SBI_CHECKSUM;
+	else
+		sbi->ll_flags &= ~LL_SBI_CHECKSUM;
+
+	rc = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM),
+				KEY_CHECKSUM, sizeof(enable), &enable, NULL);
+	if (rc != 0)
+		CWARN("Failed to set OSC checksum flags: %d\n", rc);
+
+	return count;
+}
+
+/* procfs read handler for "max_rw_chunk": prints ll_max_rw_chunk. */
+static int ll_rd_max_rw_chunk(char *page, char **start, off_t off,
+			  int count, int *eof, void *data)
+{
+	struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+
+	return snprintf(page, count, "%lu\n", sbi->ll_max_rw_chunk);
+}
+
+/*
+ * procfs write handler for "max_rw_chunk": stores the integer produced by
+ * lprocfs_write_helper() in ll_max_rw_chunk.  Returns \a count, or the
+ * helper's error code.
+ */
+static int ll_wr_max_rw_chunk(struct file *file, const char *buffer,
+			  unsigned long count, void *data)
+{
+	struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+	int chunk;
+	int rc;
+
+	rc = lprocfs_write_helper(buffer, count, &chunk);
+	if (rc != 0)
+		return rc;
+
+	sbi->ll_max_rw_chunk = chunk;
+	return count;
+}
+
+/*
+ * Common read helper for the stats_track_* entries.
+ *
+ * Prints ll_stats_track_id when the current tracking type equals \a type,
+ * "0 (all)" when the type is STATS_TRACK_ALL, and "untracked" otherwise.
+ */
+static int ll_rd_track_id(char *page, int count, void *data,
+			  enum stats_track_type type)
+{
+	struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+
+	if (sbi->ll_stats_track_type == type)
+		return snprintf(page, count, "%d\n", sbi->ll_stats_track_id);
+
+	if (sbi->ll_stats_track_type == STATS_TRACK_ALL)
+		return snprintf(page, count, "0 (all)\n");
+
+	return snprintf(page, count, "untracked\n");
+}
+
+/*
+ * Common write helper for the stats_track_* entries.
+ *
+ * Stores the written id in ll_stats_track_id.  An id of 0 switches the
+ * tracking type to STATS_TRACK_ALL, any other id selects \a type.  The
+ * ll_stats counters are cleared afterwards.  Returns \a count, or the error
+ * from lprocfs_write_helper().
+ */
+static int ll_wr_track_id(const char *buffer, unsigned long count, void *data,
+			  enum stats_track_type type)
+{
+	struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+	int id;
+	int rc;
+
+	rc = lprocfs_write_helper(buffer, count, &id);
+	if (rc != 0)
+		return rc;
+
+	sbi->ll_stats_track_id = id;
+	if (id != 0)
+		sbi->ll_stats_track_type = type;
+	else
+		sbi->ll_stats_track_type = STATS_TRACK_ALL;
+
+	lprocfs_clear_stats(sbi->ll_stats);
+	return count;
+}
+
+/* procfs read handler for "stats_track_pid"; see ll_rd_track_id(). */
+static int ll_rd_track_pid(char *page, char **start, off_t off,
+			   int count, int *eof, void *data)
+{
+	return ll_rd_track_id(page, count, data, STATS_TRACK_PID);
+}
+
+/* procfs write handler for "stats_track_pid"; see ll_wr_track_id(). */
+static int ll_wr_track_pid(struct file *file, const char *buffer,
+			   unsigned long count, void *data)
+{
+	return ll_wr_track_id(buffer, count, data, STATS_TRACK_PID);
+}
+
+/* procfs read handler for "stats_track_ppid"; see ll_rd_track_id(). */
+static int ll_rd_track_ppid(char *page, char **start, off_t off,
+			    int count, int *eof, void *data)
+{
+	return ll_rd_track_id(page, count, data, STATS_TRACK_PPID);
+}
+
+/* procfs write handler for "stats_track_ppid"; see ll_wr_track_id(). */
+static int ll_wr_track_ppid(struct file *file, const char *buffer,
+			    unsigned long count, void *data)
+{
+	return ll_wr_track_id(buffer, count, data, STATS_TRACK_PPID);
+}
+
+/* procfs read handler for "stats_track_gid"; see ll_rd_track_id(). */
+static int ll_rd_track_gid(char *page, char **start, off_t off,
+			   int count, int *eof, void *data)
+{
+	return ll_rd_track_id(page, count, data, STATS_TRACK_GID);
+}
+
+/* procfs write handler for "stats_track_gid"; see ll_wr_track_id(). */
+static int ll_wr_track_gid(struct file *file, const char *buffer,
+			   unsigned long count, void *data)
+{
+	return ll_wr_track_id(buffer, count, data, STATS_TRACK_GID);
+}
+
+/* procfs read handler for "statahead_max": prints ll_sa_max. */
+static int ll_rd_statahead_max(char *page, char **start, off_t off,
+			       int count, int *eof, void *data)
+{
+	struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+
+	return snprintf(page, count, "%u\n", sbi->ll_sa_max);
+}
+
+/*
+ * procfs write handler for "statahead_max".
+ *
+ * Values in [0, LL_SA_RPC_MAX] are stored in ll_sa_max.  A value outside
+ * that range is logged and ignored, and the write still returns \a count.
+ * A failure of lprocfs_write_helper() is returned as is.
+ */
+static int ll_wr_statahead_max(struct file *file, const char *buffer,
+			       unsigned long count, void *data)
+{
+	struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+	int val;
+	int rc;
+
+	rc = lprocfs_write_helper(buffer, count, &val);
+	if (rc != 0)
+		return rc;
+
+	if (val < 0 || val > LL_SA_RPC_MAX) {
+		CERROR("Bad statahead_max value %d. Valid values are in the "
+		       "range [0, %d]\n", val, LL_SA_RPC_MAX);
+	} else {
+		sbi->ll_sa_max = val;
+	}
+
+	return count;
+}
+
+/*
+ * procfs read handler for "statahead_agl": prints 1 if LL_SBI_AGL_ENABLED is
+ * set in ll_flags, 0 otherwise.
+ */
+static int ll_rd_statahead_agl(char *page, char **start, off_t off,
+			       int count, int *eof, void *data)
+{
+	struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+	int enabled = 0;
+
+	if (sbi->ll_flags & LL_SBI_AGL_ENABLED)
+		enabled = 1;
+
+	return snprintf(page, count, "%u\n", enabled);
+}
+
+/*
+ * procfs write handler for "statahead_agl": a non-zero value sets
+ * LL_SBI_AGL_ENABLED in ll_flags, zero clears it.  Returns \a count, or the
+ * error from lprocfs_write_helper().
+ */
+static int ll_wr_statahead_agl(struct file *file, const char *buffer,
+			       unsigned long count, void *data)
+{
+	struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+	int enable;
+	int rc;
+
+	rc = lprocfs_write_helper(buffer, count, &enable);
+	if (rc != 0)
+		return rc;
+
+	if (enable != 0)
+		sbi->ll_flags |= LL_SBI_AGL_ENABLED;
+	else
+		sbi->ll_flags &= ~LL_SBI_AGL_ENABLED;
+
+	return count;
+}
+
+/*
+ * procfs read handler for "statahead_stats": prints the ll_sa_total,
+ * ll_sa_wrong and ll_agl_total counters.
+ */
+static int ll_rd_statahead_stats(char *page, char **start, off_t off,
+				 int count, int *eof, void *data)
+{
+	struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+
+	return snprintf(page, count,
+			"statahead total: %u\n"
+			"statahead wrong: %u\n"
+			"agl total: %u\n",
+			atomic_read(&sbi->ll_sa_total),
+			atomic_read(&sbi->ll_sa_wrong),
+			atomic_read(&sbi->ll_agl_total));
+}
+
+/*
+ * procfs read handler for "lazystatfs": prints 1 if LL_SBI_LAZYSTATFS is set
+ * in ll_flags, 0 otherwise.
+ */
+static int ll_rd_lazystatfs(char *page, char **start, off_t off,
+			    int count, int *eof, void *data)
+{
+	struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+	int enabled = 0;
+
+	if (sbi->ll_flags & LL_SBI_LAZYSTATFS)
+		enabled = 1;
+
+	return snprintf(page, count, "%u\n", enabled);
+}
+
+/*
+ * procfs write handler for "lazystatfs": a non-zero value sets
+ * LL_SBI_LAZYSTATFS in ll_flags, zero clears it.  Returns \a count, or the
+ * error from lprocfs_write_helper().
+ */
+static int ll_wr_lazystatfs(struct file *file, const char *buffer,
+			    unsigned long count, void *data)
+{
+	struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+	int enable;
+	int rc;
+
+	rc = lprocfs_write_helper(buffer, count, &enable);
+	if (rc != 0)
+		return rc;
+
+	if (enable != 0)
+		sbi->ll_flags |= LL_SBI_LAZYSTATFS;
+	else
+		sbi->ll_flags &= ~LL_SBI_LAZYSTATFS;
+
+	return count;
+}
+
+/*
+ * procfs read handler for "max_easize": prints the size reported by
+ * ll_get_max_mdsize(), or returns that function's error code.
+ */
+static int ll_rd_maxea_size(char *page, char **start, off_t off,
+			    int count, int *eof, void *data)
+{
+	struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+	unsigned int max_size;
+	int rc;
+
+	rc = ll_get_max_mdsize(sbi, &max_size);
+	if (rc != 0)
+		return rc;
+
+	return snprintf(page, count, "%u\n", max_size);
+}
+
+/*
+ * procfs read handler for "sbi_flags".
+ *
+ * Walks ll_flags from the lowest bit upwards and prints the LL_SBI_FLAGS
+ * name of every set bit, each followed by a space.  If anything was printed,
+ * a backspace and a newline are appended.
+ *
+ * Returns the number of characters reported by snprintf(), or -EINVAL if a
+ * bit is still pending beyond the end of the LL_SBI_FLAGS name array.
+ */
+static int ll_rd_sbi_flags(char *page, char **start, off_t off,
+				int count, int *eof, void *data)
+{
+	const char *str[] = LL_SBI_FLAGS;
+	struct super_block *sb = data;
+	int pending = ll_s2sbi(sb)->ll_flags;
+	int bit;
+	int len = 0;
+
+	for (bit = 0; pending != 0; pending >>= 1, ++bit) {
+		if (ARRAY_SIZE(str) <= bit) {
+			CERROR("%s: Revise array LL_SBI_FLAGS to match sbi "
+				"flags please.\n", ll_get_fsname(sb, NULL, 0));
+			return -EINVAL;
+		}
+
+		if (pending & 0x1)
+			len += snprintf(page + len, count - len, "%s ",
+					str[bit]);
+	}
+
+	if (len > 0)
+		len += snprintf(page + len, count - len, "\b\n");
+
+	return len;
+}
+
+/*
+ * procfs read handler for "unstable_stats": prints ccc_unstable_nr both as a
+ * page count and converted to MB.
+ */
+static int ll_rd_unstable_stats(char *page, char **start, off_t off,
+			      int count, int *eof, void *data)
+{
+	struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+	struct cl_client_cache *cache = &sbi->ll_cache;
+	int nr_pages;
+	int nr_mb;
+
+	nr_pages = atomic_read(&cache->ccc_unstable_nr);
+	nr_mb = (nr_pages * PAGE_CACHE_SIZE) >> 20;
+
+	return snprintf(page, count, "unstable_pages: %8d\n"
+				     "unstable_mb:    %8d\n", nr_pages, nr_mb);
+}
+
+/*
+ * Table of the per-mount procfs entries served by the handlers in this file.
+ *
+ * Each initializer gives the entry name, then its read handler, then its
+ * write handler (0 for entries that cannot be written).
+ * NOTE(review): the meaning of the trailing field is not visible here -
+ * check the definition of struct lprocfs_vars.
+ * The table is terminated by an all-zero entry.
+ */
+static struct lprocfs_vars lprocfs_llite_obd_vars[] = {
+	{ "uuid",	 ll_rd_sb_uuid,	  0, 0 },
+	//{ "mntpt_path",   ll_rd_path,	     0, 0 },
+	{ "fstype",       ll_rd_fstype,	   0, 0 },
+	{ "site",	 ll_rd_site_stats,       0, 0 },
+	{ "blocksize",    ll_rd_blksize,	  0, 0 },
+	{ "kbytestotal",  ll_rd_kbytestotal,      0, 0 },
+	{ "kbytesfree",   ll_rd_kbytesfree,       0, 0 },
+	{ "kbytesavail",  ll_rd_kbytesavail,      0, 0 },
+	{ "filestotal",   ll_rd_filestotal,       0, 0 },
+	{ "filesfree",    ll_rd_filesfree,	0, 0 },
+	{ "client_type",  ll_rd_client_type,      0, 0 },
+	//{ "filegroups",   lprocfs_rd_filegroups,  0, 0 },
+	{ "max_read_ahead_mb", ll_rd_max_readahead_mb,
+			       ll_wr_max_readahead_mb, 0 },
+	{ "max_read_ahead_per_file_mb", ll_rd_max_readahead_per_file_mb,
+					ll_wr_max_readahead_per_file_mb, 0 },
+	{ "max_read_ahead_whole_mb", ll_rd_max_read_ahead_whole_mb,
+				     ll_wr_max_read_ahead_whole_mb, 0 },
+	{ "max_cached_mb",    ll_rd_max_cached_mb, ll_wr_max_cached_mb, 0 },
+	{ "checksum_pages",   ll_rd_checksum, ll_wr_checksum, 0 },
+	{ "max_rw_chunk",     ll_rd_max_rw_chunk, ll_wr_max_rw_chunk, 0 },
+	{ "stats_track_pid",  ll_rd_track_pid, ll_wr_track_pid, 0 },
+	{ "stats_track_ppid", ll_rd_track_ppid, ll_wr_track_ppid, 0 },
+	{ "stats_track_gid",  ll_rd_track_gid, ll_wr_track_gid, 0 },
+	{ "statahead_max",    ll_rd_statahead_max, ll_wr_statahead_max, 0 },
+	{ "statahead_agl",    ll_rd_statahead_agl, ll_wr_statahead_agl, 0 },
+	{ "statahead_stats",  ll_rd_statahead_stats, 0, 0 },
+	{ "lazystatfs",       ll_rd_lazystatfs, ll_wr_lazystatfs, 0 },
+	{ "max_easize",       ll_rd_maxea_size, 0, 0 },
+	{ "sbi_flags",	ll_rd_sbi_flags, 0, 0 },
+	{ "unstable_stats",   ll_rd_unstable_stats, 0, 0},
+	{ 0 }
+};
+
+#define MAX_STRING_SIZE 128
+
+/*
+ * Description of one llite operation counter:
+ *   opcode - counter index (an LPROC_LL_* value)
+ *   type   - LPROCFS_TYPE_* unit flag, optionally or'ed with
+ *            LPROCFS_CNTR_AVGMINMAX
+ *   opname - name under which the counter is registered
+ *
+ * llite_opcode_table holds LPROC_LL_FILE_OPCODES such entries and is walked
+ * by lprocfs_register_mountpoint() to initialise the "stats" counters.
+ */
+struct llite_file_opcode {
+	__u32       opcode;
+	__u32       type;
+	const char *opname;
+} llite_opcode_table[LPROC_LL_FILE_OPCODES] = {
+	/* file operation */
+	{ LPROC_LL_DIRTY_HITS,     LPROCFS_TYPE_REGS, "dirty_pages_hits" },
+	{ LPROC_LL_DIRTY_MISSES,   LPROCFS_TYPE_REGS, "dirty_pages_misses" },
+	{ LPROC_LL_READ_BYTES,     LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES,
+				   "read_bytes" },
+	{ LPROC_LL_WRITE_BYTES,    LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES,
+				   "write_bytes" },
+	{ LPROC_LL_BRW_READ,       LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES,
+				   "brw_read" },
+	{ LPROC_LL_BRW_WRITE,      LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES,
+				   "brw_write" },
+	{ LPROC_LL_OSC_READ,       LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES,
+				   "osc_read" },
+	{ LPROC_LL_OSC_WRITE,      LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES,
+				   "osc_write" },
+	{ LPROC_LL_IOCTL,	  LPROCFS_TYPE_REGS, "ioctl" },
+	{ LPROC_LL_OPEN,	   LPROCFS_TYPE_REGS, "open" },
+	{ LPROC_LL_RELEASE,	LPROCFS_TYPE_REGS, "close" },
+	{ LPROC_LL_MAP,	    LPROCFS_TYPE_REGS, "mmap" },
+	{ LPROC_LL_LLSEEK,	 LPROCFS_TYPE_REGS, "seek" },
+	{ LPROC_LL_FSYNC,	  LPROCFS_TYPE_REGS, "fsync" },
+	{ LPROC_LL_READDIR,	LPROCFS_TYPE_REGS, "readdir" },
+	/* inode operation */
+	{ LPROC_LL_SETATTR,	LPROCFS_TYPE_REGS, "setattr" },
+	{ LPROC_LL_TRUNC,	  LPROCFS_TYPE_REGS, "truncate" },
+	{ LPROC_LL_FLOCK,	  LPROCFS_TYPE_REGS, "flock" },
+	{ LPROC_LL_GETATTR,	LPROCFS_TYPE_REGS, "getattr" },
+	/* dir inode operation */
+	{ LPROC_LL_CREATE,	 LPROCFS_TYPE_REGS, "create" },
+	{ LPROC_LL_LINK,	   LPROCFS_TYPE_REGS, "link" },
+	{ LPROC_LL_UNLINK,	 LPROCFS_TYPE_REGS, "unlink" },
+	{ LPROC_LL_SYMLINK,	LPROCFS_TYPE_REGS, "symlink" },
+	{ LPROC_LL_MKDIR,	  LPROCFS_TYPE_REGS, "mkdir" },
+	{ LPROC_LL_RMDIR,	  LPROCFS_TYPE_REGS, "rmdir" },
+	{ LPROC_LL_MKNOD,	  LPROCFS_TYPE_REGS, "mknod" },
+	{ LPROC_LL_RENAME,	 LPROCFS_TYPE_REGS, "rename" },
+	/* special inode operation */
+	{ LPROC_LL_STAFS,	  LPROCFS_TYPE_REGS, "statfs" },
+	{ LPROC_LL_ALLOC_INODE,    LPROCFS_TYPE_REGS, "alloc_inode" },
+	{ LPROC_LL_SETXATTR,       LPROCFS_TYPE_REGS, "setxattr" },
+	{ LPROC_LL_GETXATTR,       LPROCFS_TYPE_REGS, "getxattr" },
+	{ LPROC_LL_LISTXATTR,      LPROCFS_TYPE_REGS, "listxattr" },
+	{ LPROC_LL_REMOVEXATTR,    LPROCFS_TYPE_REGS, "removexattr" },
+	{ LPROC_LL_INODE_PERM,     LPROCFS_TYPE_REGS, "inode_permission" },
+};
+
+/*
+ * Add \a count to counter \a op of sbi->ll_stats, subject to the tracking
+ * filter: always for STATS_TRACK_ALL; for STATS_TRACK_PID, STATS_TRACK_PPID
+ * and STATS_TRACK_GID only when ll_stats_track_id equals the current task's
+ * pid, its parent's pid, or current_gid() respectively.  Does nothing when
+ * ll_stats is not set or for any other tracking type.
+ */
+void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count)
+{
+	int tracked;
+
+	if (sbi->ll_stats == NULL)
+		return;
+
+	switch (sbi->ll_stats_track_type) {
+	case STATS_TRACK_ALL:
+		tracked = 1;
+		break;
+	case STATS_TRACK_PID:
+		tracked = (sbi->ll_stats_track_id == current->pid);
+		break;
+	case STATS_TRACK_PPID:
+		tracked = (sbi->ll_stats_track_id == current->parent->pid);
+		break;
+	case STATS_TRACK_GID:
+		tracked = (sbi->ll_stats_track_id == current_gid());
+		break;
+	default:
+		tracked = 0;
+		break;
+	}
+
+	if (tracked)
+		lprocfs_counter_add(sbi->ll_stats, op, count);
+}
+EXPORT_SYMBOL(ll_stats_ops_tally);
+
+/*
+ * Counter names for the "read_ahead_stats" file, indexed by RA_STAT_* value.
+ * lprocfs_register_mountpoint() sizes the ll_ra_stats counter set from this
+ * array and registers one "pages" counter per element.
+ */
+static const char *ra_stat_string[] = {
+	[RA_STAT_HIT] = "hits",
+	[RA_STAT_MISS] = "misses",
+	[RA_STAT_DISTANT_READPAGE] = "readpage not consecutive",
+	[RA_STAT_MISS_IN_WINDOW] = "miss inside window",
+	[RA_STAT_FAILED_GRAB_PAGE] = "failed grab_cache_page",
+	[RA_STAT_FAILED_MATCH] = "failed lock match",
+	[RA_STAT_DISCARDED] = "read but discarded",
+	[RA_STAT_ZERO_LEN] = "zero length file",
+	[RA_STAT_ZERO_WINDOW] = "zero size window",
+	[RA_STAT_EOF] = "read-ahead to EOF",
+	[RA_STAT_MAX_IN_FLIGHT] = "hit max r-a issue",
+	[RA_STAT_WRONG_GRAB_PAGE] = "wrong page from grab_cache_page",
+};
+
+
+/*
+ * Create the procfs directory for one llite mount and populate it.
+ *
+ * \param parent  proc directory under which the mount directory is created
+ * \param sb      super block of the mount; also used as handler data for
+ *                the lprocfs_llite_obd_vars entries
+ * \param osc     name looked up with class_name2obd() for the OSC entries
+ * \param mdc     name looked up with class_name2obd() for the MDC entries
+ *
+ * The directory is named "<profile>-<sb pointer>", where a trailing
+ * "-client" is dropped from the profile name.  It receives the seq files
+ * dump_page_cache, extents_stats, extents_stats_per_process and
+ * offset_stats, the "stats" and "read_ahead_stats" counter sets, every
+ * entry of lprocfs_llite_obd_vars, and "<type>/common_name" plus
+ * "<type>/uuid" entries for the MDC and OSC devices.
+ *
+ * Returns 0 on success or a negative errno.  Failing to create one of the
+ * four seq files is only logged.  Any failure after that removes the proc
+ * directory again and frees both counter sets.
+ */
+int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
+				struct super_block *sb, char *osc, char *mdc)
+{
+	struct lprocfs_vars lvars[2];
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	struct obd_device *obd;
+	char name[MAX_STRING_SIZE + 1], *ptr;
+	int err, id, len, rc;
+	ENTRY;
+
+	/* lvars[1] stays all-zero and terminates the one-entry table */
+	memset(lvars, 0, sizeof(lvars));
+
+	/* lvars[0] uses the name buffer directly, so each snprintf() into
+	 * name below also changes the entry that gets registered next */
+	name[MAX_STRING_SIZE] = '\0';
+	lvars[0].name = name;
+
+	LASSERT(sbi != NULL);
+	LASSERT(mdc != NULL);
+	LASSERT(osc != NULL);
+
+	/* Get fsname */
+	len = strlen(lsi->lsi_lmd->lmd_profile);
+	ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-');
+	/* 7 is the length of "-client" */
+	if (ptr && (strcmp(ptr, "-client") == 0))
+		len -= 7;
+
+	/* Mount info */
+	snprintf(name, MAX_STRING_SIZE, "%.*s-%p", len,
+		 lsi->lsi_lmd->lmd_profile, sb);
+
+	sbi->ll_proc_root = lprocfs_register(name, parent, NULL, NULL);
+	if (IS_ERR(sbi->ll_proc_root)) {
+		err = PTR_ERR(sbi->ll_proc_root);
+		sbi->ll_proc_root = NULL;
+		RETURN(err);
+	}
+
+	/* the four seq files below are best-effort: failures only warn */
+	rc = lprocfs_seq_create(sbi->ll_proc_root, "dump_page_cache", 0444,
+				&vvp_dump_pgcache_file_ops, sbi);
+	if (rc)
+		CWARN("Error adding the dump_page_cache file\n");
+
+	rc = lprocfs_seq_create(sbi->ll_proc_root, "extents_stats", 0644,
+				&ll_rw_extents_stats_fops, sbi);
+	if (rc)
+		CWARN("Error adding the extent_stats file\n");
+
+	rc = lprocfs_seq_create(sbi->ll_proc_root, "extents_stats_per_process",
+				0644, &ll_rw_extents_stats_pp_fops, sbi);
+	if (rc)
+		CWARN("Error adding the extents_stats_per_process file\n");
+
+	rc = lprocfs_seq_create(sbi->ll_proc_root, "offset_stats", 0644,
+				&ll_rw_offset_stats_fops, sbi);
+	if (rc)
+		CWARN("Error adding the offset_stats file\n");
+
+	/* File operations stats */
+	sbi->ll_stats = lprocfs_alloc_stats(LPROC_LL_FILE_OPCODES,
+					    LPROCFS_STATS_FLAG_NONE);
+	if (sbi->ll_stats == NULL)
+		GOTO(out, err = -ENOMEM);
+	/* do counter init */
+	for (id = 0; id < LPROC_LL_FILE_OPCODES; id++) {
+		__u32 type = llite_opcode_table[id].type;
+		/* unit string; note this shadows the outer ptr */
+		void *ptr = NULL;
+		if (type & LPROCFS_TYPE_REGS)
+			ptr = "regs";
+		else if (type & LPROCFS_TYPE_BYTES)
+			ptr = "bytes";
+		else if (type & LPROCFS_TYPE_PAGES)
+			ptr = "pages";
+		lprocfs_counter_init(sbi->ll_stats,
+				     llite_opcode_table[id].opcode,
+				     (type & LPROCFS_CNTR_AVGMINMAX),
+				     llite_opcode_table[id].opname, ptr);
+	}
+	err = lprocfs_register_stats(sbi->ll_proc_root, "stats", sbi->ll_stats);
+	if (err)
+		GOTO(out, err);
+
+	/* Read-ahead stats: one "pages" counter per ra_stat_string entry */
+	sbi->ll_ra_stats = lprocfs_alloc_stats(ARRAY_SIZE(ra_stat_string),
+					       LPROCFS_STATS_FLAG_NONE);
+	if (sbi->ll_ra_stats == NULL)
+		GOTO(out, err = -ENOMEM);
+
+	for (id = 0; id < ARRAY_SIZE(ra_stat_string); id++)
+		lprocfs_counter_init(sbi->ll_ra_stats, id, 0,
+				     ra_stat_string[id], "pages");
+	err = lprocfs_register_stats(sbi->ll_proc_root, "read_ahead_stats",
+				     sbi->ll_ra_stats);
+	if (err)
+		GOTO(out, err);
+
+
+	/* per-mount tunables; the handlers receive sb as their data */
+	err = lprocfs_add_vars(sbi->ll_proc_root, lprocfs_llite_obd_vars, sb);
+	if (err)
+		GOTO(out, err);
+
+	/* MDC info */
+	obd = class_name2obd(mdc);
+
+	LASSERT(obd != NULL);
+	LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+	LASSERT(obd->obd_type->typ_name != NULL);
+
+	snprintf(name, MAX_STRING_SIZE, "%s/common_name",
+		 obd->obd_type->typ_name);
+	lvars[0].read_fptr = lprocfs_rd_name;
+	err = lprocfs_add_vars(sbi->ll_proc_root, lvars, obd);
+	if (err)
+		GOTO(out, err);
+
+	snprintf(name, MAX_STRING_SIZE, "%s/uuid", obd->obd_type->typ_name);
+	lvars[0].read_fptr = lprocfs_rd_uuid;
+	err = lprocfs_add_vars(sbi->ll_proc_root, lvars, obd);
+	if (err)
+		GOTO(out, err);
+
+	/* OSC */
+	obd = class_name2obd(osc);
+
+	LASSERT(obd != NULL);
+	LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+	LASSERT(obd->obd_type->typ_name != NULL);
+
+	snprintf(name, MAX_STRING_SIZE, "%s/common_name",
+		 obd->obd_type->typ_name);
+	lvars[0].read_fptr = lprocfs_rd_name;
+	err = lprocfs_add_vars(sbi->ll_proc_root, lvars, obd);
+	if (err)
+		GOTO(out, err);
+
+	snprintf(name, MAX_STRING_SIZE, "%s/uuid", obd->obd_type->typ_name);
+	lvars[0].read_fptr = lprocfs_rd_uuid;
+	err = lprocfs_add_vars(sbi->ll_proc_root, lvars, obd);
+out:
+	/* on any failure undo everything created above */
+	if (err) {
+		lprocfs_remove(&sbi->ll_proc_root);
+		lprocfs_free_stats(&sbi->ll_ra_stats);
+		lprocfs_free_stats(&sbi->ll_stats);
+	}
+	RETURN(err);
+}
+
+/*
+ * Remove the mount's proc directory and free both counter sets.  Does
+ * nothing when ll_proc_root is not set.
+ */
+void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi)
+{
+	if (sbi->ll_proc_root == NULL)
+		return;
+
+	lprocfs_remove(&sbi->ll_proc_root);
+	lprocfs_free_stats(&sbi->ll_ra_stats);
+	lprocfs_free_stats(&sbi->ll_stats);
+}
+#undef MAX_STRING_SIZE
+
+/* Integer percentage of a relative to b; 0 when b is 0.  Arguments are
+ * parenthesized so that expressions can be passed safely. */
+#define pct(a, b) ((b) ? (a) * 100 / (b) : 0)
+
+/*
+ * Print the read and write extent histograms of slot \a which of
+ * \a io_extents to \a seq, one line per bucket.
+ *
+ * Every line shows the bucket's size range, then for reads and for writes
+ * the call count, its percentage of the total and the cumulative
+ * percentage.  The range bounds are printed with a unit letter that
+ * advances each time a bound reaches 1024.  Output stops at the bucket
+ * where both cumulative counts have reached their totals.
+ */
+static void ll_display_extents_info(struct ll_rw_extents_info *io_extents,
+				   struct seq_file *seq, int which)
+{
+	struct per_process_info *info = &io_extents->pp_extents[which];
+	unsigned long rd_total = 0, wr_total = 0;
+	unsigned long rd_running = 0, wr_running = 0;
+	unsigned long lo = 0, hi, rd, wr;
+	char *suffix = "KMGTPEZY";
+	int units = 10;
+	int bucket;
+
+	/* first pass: totals, needed for the percentage columns */
+	for (bucket = 0; bucket < LL_HIST_MAX; bucket++) {
+		rd_total += info->pp_r_hist.oh_buckets[bucket];
+		wr_total += info->pp_w_hist.oh_buckets[bucket];
+	}
+
+	for (bucket = 0; bucket < LL_HIST_MAX; bucket++) {
+		rd = info->pp_r_hist.oh_buckets[bucket];
+		wr = info->pp_w_hist.oh_buckets[bucket];
+		rd_running += rd;
+		wr_running += wr;
+		hi = 1 << (bucket + LL_HIST_START - units);
+
+		/* the last bucket is open-ended and marked with '+' */
+		seq_printf(seq, "%4lu%c - %4lu%c%c: %14lu %4lu %4lu  | "
+			   "%14lu %4lu %4lu\n", lo, *suffix, hi, *suffix,
+			   (bucket == LL_HIST_MAX - 1) ? '+' : ' ',
+			   rd, pct(rd, rd_total), pct(rd_running, rd_total),
+			   wr, pct(wr, wr_total), pct(wr_running, wr_total));
+
+		lo = hi;
+		if (lo == 1 << 10) {
+			/* move on to the next unit letter */
+			lo = 1;
+			units += 10;
+			suffix++;
+		}
+
+		if (rd_running == rd_total && wr_running == wr_total)
+			break;
+	}
+}
+
+/*
+ * seq_file show method for "extents_stats_per_process".
+ *
+ * Prints a usage hint while statistics are switched off.  Otherwise prints
+ * a timestamp and column headers, followed by the extent histogram of every
+ * slot whose pid is non-zero, walking the slots under ll_pp_extent_lock.
+ */
+static int ll_rw_extents_stats_pp_seq_show(struct seq_file *seq, void *v)
+{
+	struct ll_sb_info *sbi = seq->private;
+	struct ll_rw_extents_info *extents = &sbi->ll_rw_extents_info;
+	struct timeval tv;
+	int slot;
+
+	do_gettimeofday(&tv);
+
+	if (!sbi->ll_rw_stats_on) {
+		seq_printf(seq, "disabled\n"
+				"write anything in this file to activate, "
+				"then 0 or \"[D/d]isabled\" to deactivate\n");
+		return 0;
+	}
+
+	seq_printf(seq, "snapshot_time:	 %lu.%lu (secs.usecs)\n",
+		   tv.tv_sec, tv.tv_usec);
+	seq_printf(seq, "%15s %19s       | %20s\n", " ", "read", "write");
+	seq_printf(seq, "%13s   %14s %4s %4s  | %14s %4s %4s\n",
+		   "extents", "calls", "%", "cum%",
+		   "calls", "%", "cum%");
+
+	spin_lock(&sbi->ll_pp_extent_lock);
+	for (slot = 0; slot < LL_PROCESS_HIST_MAX; slot++) {
+		/* unused slot */
+		if (extents->pp_extents[slot].pid == 0)
+			continue;
+
+		seq_printf(seq, "\nPID: %d\n",
+			   extents->pp_extents[slot].pid);
+		ll_display_extents_info(extents, seq, slot);
+	}
+	spin_unlock(&sbi->ll_pp_extent_lock);
+
+	return 0;
+}
+
+/*
+ * Write method for "extents_stats_per_process".
+ *
+ * Every write resets the per-process extent histograms.  It also switches
+ * statistics gathering: a value that lprocfs_write_helper() stores as 0, or
+ * the text "disabled" / "Disabled" (one trailing newline is ignored),
+ * switches it off; anything else switches it on.
+ *
+ * Returns \a len, or -EFAULT if the written text cannot be copied in.
+ */
+static ssize_t ll_rw_extents_stats_pp_seq_write(struct file *file,
+						const char *buf, size_t len,
+						loff_t *off)
+{
+	struct seq_file *seq = file->private_data;
+	struct ll_sb_info *sbi = seq->private;
+	struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
+	char kernbuf[16];
+	int i;
+	int value = 1, rc = 0;
+
+	rc = lprocfs_write_helper(buf, len, &value);
+	if (rc < 0 && len > 0 && len < sizeof(kernbuf)) {
+		/*
+		 * buf is the buffer handed to a file write method, i.e. it
+		 * points to user memory and is not NUL-terminated: copy it
+		 * into kernel space before comparing it as a string.
+		 */
+		if (copy_from_user(kernbuf, buf, len))
+			return -EFAULT;
+		kernbuf[len] = '\0';
+		if (kernbuf[len - 1] == '\n')
+			kernbuf[len - 1] = '\0';
+
+		if (strcmp(kernbuf, "disabled") == 0 ||
+		    strcmp(kernbuf, "Disabled") == 0)
+			value = 0;
+	}
+
+	if (value == 0)
+		sbi->ll_rw_stats_on = 0;
+	else
+		sbi->ll_rw_stats_on = 1;
+
+	/* forget every tracked process and its histograms */
+	spin_lock(&sbi->ll_pp_extent_lock);
+	for (i = 0; i < LL_PROCESS_HIST_MAX; i++) {
+		io_extents->pp_extents[i].pid = 0;
+		lprocfs_oh_clear(&io_extents->pp_extents[i].pp_r_hist);
+		lprocfs_oh_clear(&io_extents->pp_extents[i].pp_w_hist);
+	}
+	spin_unlock(&sbi->ll_pp_extent_lock);
+	return len;
+}
+
+LPROC_SEQ_FOPS(ll_rw_extents_stats_pp);
+
+/*
+ * seq_file show method for "extents_stats".
+ *
+ * Prints a usage hint while statistics are switched off.  Otherwise prints
+ * a timestamp and column headers, followed by the aggregate extent
+ * histogram kept in slot LL_PROCESS_HIST_MAX of pp_extents.
+ */
+static int ll_rw_extents_stats_seq_show(struct seq_file *seq, void *v)
+{
+	struct timeval now;
+	struct ll_sb_info *sbi = seq->private;
+	struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
+
+	do_gettimeofday(&now);
+
+	if (!sbi->ll_rw_stats_on) {
+		seq_printf(seq, "disabled\n"
+				"write anything in this file to activate, "
+				"then 0 or \"[D/d]isabled\" to deactivate\n");
+		return 0;
+	}
+	seq_printf(seq, "snapshot_time:	 %lu.%lu (secs.usecs)\n",
+		   now.tv_sec, now.tv_usec);
+
+	seq_printf(seq, "%15s %19s       | %20s\n", " ", "read", "write");
+	seq_printf(seq, "%13s   %14s %4s %4s  | %14s %4s %4s\n",
+		   "extents", "calls", "%", "cum%",
+		   "calls", "%", "cum%");
+	/*
+	 * The aggregate slot is updated by ll_rw_stats_tally() and cleared
+	 * by the write method under ll_pp_extent_lock, so read it under the
+	 * same lock.
+	 */
+	spin_lock(&sbi->ll_pp_extent_lock);
+	ll_display_extents_info(io_extents, seq, LL_PROCESS_HIST_MAX);
+	spin_unlock(&sbi->ll_pp_extent_lock);
+
+	return 0;
+}
+
+/*
+ * Write method for "extents_stats".
+ *
+ * Every write resets all extent histograms, including the aggregate slot
+ * at index LL_PROCESS_HIST_MAX.  It also switches statistics gathering: a
+ * value that lprocfs_write_helper() stores as 0, or the text "disabled" /
+ * "Disabled" (one trailing newline is ignored), switches it off; anything
+ * else switches it on.
+ *
+ * Returns \a len, or -EFAULT if the written text cannot be copied in.
+ */
+static ssize_t ll_rw_extents_stats_seq_write(struct file *file, const char *buf,
+					size_t len, loff_t *off)
+{
+	struct seq_file *seq = file->private_data;
+	struct ll_sb_info *sbi = seq->private;
+	struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
+	char kernbuf[16];
+	int i;
+	int value = 1, rc = 0;
+
+	rc = lprocfs_write_helper(buf, len, &value);
+	if (rc < 0 && len > 0 && len < sizeof(kernbuf)) {
+		/*
+		 * buf is the buffer handed to a file write method, i.e. it
+		 * points to user memory and is not NUL-terminated: copy it
+		 * into kernel space before comparing it as a string.
+		 */
+		if (copy_from_user(kernbuf, buf, len))
+			return -EFAULT;
+		kernbuf[len] = '\0';
+		if (kernbuf[len - 1] == '\n')
+			kernbuf[len - 1] = '\0';
+
+		if (strcmp(kernbuf, "disabled") == 0 ||
+		    strcmp(kernbuf, "Disabled") == 0)
+			value = 0;
+	}
+
+	if (value == 0)
+		sbi->ll_rw_stats_on = 0;
+	else
+		sbi->ll_rw_stats_on = 1;
+	spin_lock(&sbi->ll_pp_extent_lock);
+	/* "<=" on purpose: the last slot holds the aggregate histogram */
+	for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) {
+		io_extents->pp_extents[i].pid = 0;
+		lprocfs_oh_clear(&io_extents->pp_extents[i].pp_r_hist);
+		lprocfs_oh_clear(&io_extents->pp_extents[i].pp_w_hist);
+	}
+	spin_unlock(&sbi->ll_pp_extent_lock);
+
+	return len;
+}
+
+LPROC_SEQ_FOPS(ll_rw_extents_stats);
+
+/*
+ * Account one read or write for the extent and offset statistics.
+ *
+ * \param sbi    per-mount info holding the statistics
+ * \param pid    process the I/O is accounted to
+ * \param file   file data of the I/O, compared against rw_last_file
+ * \param pos    file offset at which the I/O starts
+ * \param count  size of the I/O, used to pick the histogram bucket
+ * \param rw     0 for a read, anything else for a write
+ *
+ * Does nothing unless ll_rw_stats_on is set.  The extent histograms are
+ * updated under ll_pp_extent_lock, the offset records under
+ * ll_process_lock.
+ */
+void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid,
+		       struct ll_file_data *file, loff_t pos,
+		       size_t count, int rw)
+{
+	int i, cur = -1;
+	struct ll_rw_process_info *process;
+	struct ll_rw_process_info *offset;
+	int *off_count = &sbi->ll_rw_offset_entry_count;
+	int *process_count = &sbi->ll_offset_process_count;
+	struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
+
+	if(!sbi->ll_rw_stats_on)
+		return;
+	process = sbi->ll_rw_process_info;
+	offset = sbi->ll_rw_offset_info;
+
+	spin_lock(&sbi->ll_pp_extent_lock);
+	/* Extent statistics */
+	/* look for the slot already assigned to this pid */
+	for(i = 0; i < LL_PROCESS_HIST_MAX; i++) {
+		if(io_extents->pp_extents[i].pid == pid) {
+			cur = i;
+			break;
+		}
+	}
+
+	if (cur == -1) {
+		/* new process */
+		/* take the next slot round-robin, overwriting its contents */
+		sbi->ll_extent_process_count =
+			(sbi->ll_extent_process_count + 1) % LL_PROCESS_HIST_MAX;
+		cur = sbi->ll_extent_process_count;
+		io_extents->pp_extents[cur].pid = pid;
+		lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_r_hist);
+		lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_w_hist);
+	}
+
+	/* empty-bodied loop: i ends up as the bucket for count, where each
+	 * bucket doubles the size limit and the last bucket is the catch-all */
+	for(i = 0; (count >= (1 << LL_HIST_START << i)) &&
+	     (i < (LL_HIST_MAX - 1)); i++);
+	/* count the I/O for this process and in the aggregate slot */
+	if (rw == 0) {
+		io_extents->pp_extents[cur].pp_r_hist.oh_buckets[i]++;
+		io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_r_hist.oh_buckets[i]++;
+	} else {
+		io_extents->pp_extents[cur].pp_w_hist.oh_buckets[i]++;
+		io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_w_hist.oh_buckets[i]++;
+	}
+	spin_unlock(&sbi->ll_pp_extent_lock);
+
+	spin_lock(&sbi->ll_process_lock);
+	/* Offset statistics */
+	for (i = 0; i < LL_PROCESS_HIST_MAX; i++) {
+		if (process[i].rw_pid == pid) {
+			if (process[i].rw_last_file != file) {
+				/* different file: restart the record for this
+				 * process (rw_op is left unchanged here) */
+				process[i].rw_range_start = pos;
+				process[i].rw_last_file_pos = pos + count;
+				process[i].rw_smallest_extent = count;
+				process[i].rw_largest_extent = count;
+				process[i].rw_offset = 0;
+				process[i].rw_last_file = file;
+				spin_unlock(&sbi->ll_process_lock);
+				return;
+			}
+			if (process[i].rw_last_file_pos != pos) {
+				/* the I/O does not start where the previous
+				 * one ended: save the finished range in the
+				 * next offset slot and start a new range */
+				*off_count =
+				    (*off_count + 1) % LL_OFFSET_HIST_MAX;
+				offset[*off_count].rw_op = process[i].rw_op;
+				offset[*off_count].rw_pid = pid;
+				offset[*off_count].rw_range_start =
+					process[i].rw_range_start;
+				offset[*off_count].rw_range_end =
+					process[i].rw_last_file_pos;
+				offset[*off_count].rw_smallest_extent =
+					process[i].rw_smallest_extent;
+				offset[*off_count].rw_largest_extent =
+					process[i].rw_largest_extent;
+				offset[*off_count].rw_offset =
+					process[i].rw_offset;
+				process[i].rw_op = rw;
+				process[i].rw_range_start = pos;
+				process[i].rw_smallest_extent = count;
+				process[i].rw_largest_extent = count;
+				/* distance from the end of the previous I/O */
+				process[i].rw_offset = pos -
+					process[i].rw_last_file_pos;
+			}
+			/* extend the current range with this I/O */
+			if(process[i].rw_smallest_extent > count)
+				process[i].rw_smallest_extent = count;
+			if(process[i].rw_largest_extent < count)
+				process[i].rw_largest_extent = count;
+			process[i].rw_last_file_pos = pos + count;
+			spin_unlock(&sbi->ll_process_lock);
+			return;
+		}
+	}
+	/* pid not tracked yet: take the next process slot round-robin */
+	*process_count = (*process_count + 1) % LL_PROCESS_HIST_MAX;
+	process[*process_count].rw_pid = pid;
+	process[*process_count].rw_op = rw;
+	process[*process_count].rw_range_start = pos;
+	process[*process_count].rw_last_file_pos = pos + count;
+	process[*process_count].rw_smallest_extent = count;
+	process[*process_count].rw_largest_extent = count;
+	process[*process_count].rw_offset = 0;
+	process[*process_count].rw_last_file = file;
+	spin_unlock(&sbi->ll_process_lock);
+}
+
+/*
+ * seq_file show method for "offset_stats".
+ *
+ * Prints a usage hint while statistics are switched off.  Otherwise prints
+ * a timestamp and a header line, then one line for every saved offset
+ * record and one line for the current record of every tracked process;
+ * records whose rw_pid is 0 are skipped.  The records are read under
+ * ll_process_lock.
+ */
+static int ll_rw_offset_stats_seq_show(struct seq_file *seq, void *v)
+{
+	struct timeval now;
+	struct ll_sb_info *sbi = seq->private;
+	struct ll_rw_process_info *offset = sbi->ll_rw_offset_info;
+	struct ll_rw_process_info *process = sbi->ll_rw_process_info;
+	int i;
+
+	do_gettimeofday(&now);
+
+	if (!sbi->ll_rw_stats_on) {
+		seq_printf(seq, "disabled\n"
+				"write anything in this file to activate, "
+				"then 0 or \"[D/d]isabled\" to deactivate\n");
+		return 0;
+	}
+	spin_lock(&sbi->ll_process_lock);
+
+	seq_printf(seq, "snapshot_time:	 %lu.%lu (secs.usecs)\n",
+		   now.tv_sec, now.tv_usec);
+	seq_printf(seq, "%3s %10s %14s %14s %17s %17s %14s\n",
+		   "R/W", "PID", "RANGE START", "RANGE END",
+		   "SMALLEST EXTENT", "LARGEST EXTENT", "OFFSET");
+	/* We stored the discontiguous offsets here; print them first */
+	for (i = 0; i < LL_OFFSET_HIST_MAX; i++) {
+		/* each record ends with a newline so it gets its own line */
+		if (offset[i].rw_pid != 0)
+			seq_printf(seq,
+				   "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu\n",
+				   offset[i].rw_op ? 'W' : 'R',
+				   offset[i].rw_pid,
+				   offset[i].rw_range_start,
+				   offset[i].rw_range_end,
+				   (unsigned long)offset[i].rw_smallest_extent,
+				   (unsigned long)offset[i].rw_largest_extent,
+				   offset[i].rw_offset);
+	}
+	/* Then print the current offsets for each process */
+	for (i = 0; i < LL_PROCESS_HIST_MAX; i++) {
+		if (process[i].rw_pid != 0)
+			seq_printf(seq,
+				   "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu\n",
+				   process[i].rw_op ? 'W' : 'R',
+				   process[i].rw_pid,
+				   process[i].rw_range_start,
+				   process[i].rw_last_file_pos,
+				   (unsigned long)process[i].rw_smallest_extent,
+				   (unsigned long)process[i].rw_largest_extent,
+				   process[i].rw_offset);
+	}
+	spin_unlock(&sbi->ll_process_lock);
+
+	return 0;
+}
+
+/*
+ * Write method for "offset_stats".
+ *
+ * Every write clears all offset records and resets both ring indices.  It
+ * also switches statistics gathering: a value that lprocfs_write_helper()
+ * stores as 0, or the text "disabled" / "Disabled" (one trailing newline is
+ * ignored), switches it off; anything else switches it on.
+ *
+ * Returns \a len, or -EFAULT if the written text cannot be copied in.
+ */
+static ssize_t ll_rw_offset_stats_seq_write(struct file *file, const char *buf,
+				       size_t len, loff_t *off)
+{
+	struct seq_file *seq = file->private_data;
+	struct ll_sb_info *sbi = seq->private;
+	struct ll_rw_process_info *process_info = sbi->ll_rw_process_info;
+	struct ll_rw_process_info *offset_info = sbi->ll_rw_offset_info;
+	char kernbuf[16];
+	int value = 1, rc = 0;
+
+	rc = lprocfs_write_helper(buf, len, &value);
+
+	if (rc < 0 && len > 0 && len < sizeof(kernbuf)) {
+		/*
+		 * buf is the buffer handed to a file write method, i.e. it
+		 * points to user memory and is not NUL-terminated: copy it
+		 * into kernel space before comparing it as a string.
+		 */
+		if (copy_from_user(kernbuf, buf, len))
+			return -EFAULT;
+		kernbuf[len] = '\0';
+		if (kernbuf[len - 1] == '\n')
+			kernbuf[len - 1] = '\0';
+
+		if (strcmp(kernbuf, "disabled") == 0 ||
+		    strcmp(kernbuf, "Disabled") == 0)
+			value = 0;
+	}
+
+	if (value == 0)
+		sbi->ll_rw_stats_on = 0;
+	else
+		sbi->ll_rw_stats_on = 1;
+
+	spin_lock(&sbi->ll_process_lock);
+	sbi->ll_offset_process_count = 0;
+	sbi->ll_rw_offset_entry_count = 0;
+	memset(process_info, 0, sizeof(struct ll_rw_process_info) *
+	       LL_PROCESS_HIST_MAX);
+	memset(offset_info, 0, sizeof(struct ll_rw_process_info) *
+	       LL_OFFSET_HIST_MAX);
+	spin_unlock(&sbi->ll_process_lock);
+
+	return len;
+}
+
+LPROC_SEQ_FOPS(ll_rw_offset_stats);
+
+/*
+ * Fill \a lvars with the llite proc tables: lprocfs_llite_obd_vars as the
+ * obd-level table and no module-level table.
+ */
+void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars)
+{
+	lvars->obd_vars = lprocfs_llite_obd_vars;
+	lvars->module_vars = NULL;
+}
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/llite/namei.c b/drivers/staging/lustre/lustre/llite/namei.c
new file mode 100644
index 000000000000..e6b3f54abbe3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/namei.c
@@ -0,0 +1,1279 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/quotaops.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/security.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <lustre_lite.h>
+#include <lustre_dlm.h>
+#include <lustre_ver.h>
+#include "llite_internal.h"
+
+static int ll_create_it(struct inode *, struct dentry *,
+			int, struct lookup_intent *);
+
+/*
+ * Report whether something is mounted on the child: use @dchild when given,
+ * otherwise look @name up under @dparent.  Returns 0 when no dentry exists.
+ */
+static int ll_d_mountpoint(struct dentry *dparent, struct dentry *dchild,
+			   struct qstr *name)
+{
+	struct dentry *found;
+	int busy;
+
+	if (unlikely(dchild))
+		return d_mountpoint(dchild);
+	found = dparent ? d_lookup(dparent, name) : NULL;
+	if (!found)
+		return 0;
+
+	busy = d_mountpoint(found);
+	dput(found);
+	return busy;
+}
+
+int ll_unlock(__u32 mode, struct lustre_handle *lockh)
+{
+	ENTRY;
+	/* Hand @lockh and @mode to ldlm_lock_decref(); always returns 0. */
+	ldlm_lock_decref(lockh, mode);
+
+	RETURN(0);
+}
+
+
+/* iget5_locked() match callback, run by find_inode() under inode_lock */
+static int ll_test_inode(struct inode *inode, void *opaque)
+{
+	struct lustre_md *md = opaque;
+	struct mdt_body *body = md->body;
+
+	if (unlikely(!(body->valid & OBD_MD_FLID))) {
+		CERROR("MDS body missing FID\n");
+		return 0;
+	}
+
+	/* match only when the cached FID equals the FID in the MDS reply */
+	if (lu_fid_eq(&ll_i2info(inode)->lli_fid, &body->fid1))
+		return 1;
+	return 0;
+}
+
+static int ll_set_inode(struct inode *inode, void *opaque)
+{
+	struct mdt_body *rep = ((struct lustre_md *)opaque)->body;
+	struct ll_inode_info *info = ll_i2info(inode);
+
+	/* iget5_locked() init callback: seed a new inode from the MDS reply */
+	if (unlikely((rep->valid & OBD_MD_FLID) == 0)) {
+		CERROR("MDS body missing FID\n");
+		return -EINVAL;
+	}
+	info->lli_fid = rep->fid1;
+
+	if (unlikely((rep->valid & OBD_MD_FLTYPE) == 0)) {
+		CERROR("Can not initialize inode "DFID" without object type: "
+		       "valid = "LPX64"\n", PFID(&info->lli_fid), rep->valid);
+		return -EINVAL;
+	}
+
+	/* only the file-type bits are taken from the reply */
+	inode->i_mode = (inode->i_mode & ~S_IFMT) | (rep->mode & S_IFMT);
+	if (unlikely(inode->i_mode == 0)) {
+		CERROR("Invalid inode "DFID" type\n", PFID(&info->lli_fid));
+		return -EINVAL;
+	}
+	ll_lli_init(info);
+	return 0;
+}
+
+
+/*
+ * Get an inode by inode number (already instantiated by the intent lookup).
+ * Returns the inode, NULL, or ERR_PTR(rc) when cl_file_inode_init() fails.
+ */
+struct inode *ll_iget(struct super_block *sb, ino_t hash,
+		      struct lustre_md *md)
+{
+	struct inode	 *inode;
+	ENTRY;
+
+	LASSERT(hash != 0);
+	inode = iget5_locked(sb, hash, ll_test_inode, ll_set_inode, md);
+	/* an I_NEW inode is filled from @md; a cached one is only updated */
+	if (inode) {
+		if (inode->i_state & I_NEW) {
+			int rc = 0;
+
+			ll_read_inode2(inode, md);
+			if (S_ISREG(inode->i_mode) &&
+			    ll_i2info(inode)->lli_clob == NULL) {
+				CDEBUG(D_INODE,
+					"%s: apply lsm %p to inode "DFID".\n",
+					ll_get_fsname(sb, NULL, 0), md->lsm,
+					PFID(ll_inode2fid(inode)));
+				rc = cl_file_inode_init(inode, md);
+			}
+			if (rc != 0) {
+				make_bad_inode(inode);
+				unlock_new_inode(inode);
+				iput(inode);
+				inode = ERR_PTR(rc);
+			} else
+				unlock_new_inode(inode);
+		} else if (!(inode->i_state & (I_FREEING | I_CLEAR)))
+			ll_update_inode(inode, md);
+		CDEBUG(D_VFSTRACE, "got inode: %p for "DFID"\n",
+		       inode, PFID(&md->body->fid1));
+	}
+	RETURN(inode);
+}
+
+static void ll_invalidate_negative_children(struct inode *dir)
+{
+	struct dentry *alias, *kid, *next;
+	struct ll_d_hlist_node *p;
+
+	/* Walk every alias of @dir and invalidate each child that has no
+	 * inode, i.e. the negative dentries cached below it. */
+	ll_lock_dcache(dir);
+	ll_d_hlist_for_each_entry(alias, p, &dir->i_dentry, d_alias) {
+		spin_lock(&alias->d_lock);
+		/* an empty d_subdirs simply makes the _safe walk a no-op,
+		 * so no list_empty() pre-check is needed */
+		list_for_each_entry_safe(kid, next, &alias->d_subdirs,
+					 d_u.d_child) {
+			if (kid->d_inode != NULL)
+				continue;
+			d_lustre_invalidate(kid);
+		}
+		spin_unlock(&alias->d_lock);
+	}
+	ll_unlock_dcache(dir);
+}
+
+int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+		       void *data, int flag)
+{
+	int rc;
+	struct lustre_handle lockh;
+	ENTRY;
+	/* MD lock callback: @flag picks the blocking or canceling path */
+	switch (flag) {
+	case LDLM_CB_BLOCKING:
+		ldlm_lock2handle(lock, &lockh);
+		rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
+		if (rc < 0) {
+			CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
+			RETURN(rc);
+		}
+		break;
+	case LDLM_CB_CANCELING: {
+		struct inode *inode = ll_inode_from_resource_lock(lock);
+		struct ll_inode_info *lli;
+		__u64 bits = lock->l_policy_data.l_inodebits.bits;
+		struct lu_fid *fid;
+		ldlm_mode_t mode = lock->l_req_mode;
+
+		/* Inode is set to lock->l_resource->lr_lvb_inode
+		 * for mdc - bug 24555 */
+		LASSERT(lock->l_ast_data == NULL);
+
+		/* Invalidate all dentries associated with this inode */
+		if (inode == NULL)
+			break;
+
+		LASSERT(lock->l_flags & LDLM_FL_CANCELING);
+		/* For OPEN locks we differentiate between lock modes
+		 * LCK_CR, LCK_CW, LCK_PR - bug 22891 */
+		if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE |
+			    MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM))
+			ll_have_md_lock(inode, &bits, LCK_MINMODE);
+
+		if (bits & MDS_INODELOCK_OPEN)
+			ll_have_md_lock(inode, &bits, mode);
+		/* the resource name should carry the inode's FID (seq, oid, ver) */
+		fid = ll_inode2fid(inode);
+		if (lock->l_resource->lr_name.name[0] != fid_seq(fid) ||
+		    lock->l_resource->lr_name.name[1] != fid_oid(fid) ||
+		    lock->l_resource->lr_name.name[2] != fid_ver(fid)) {
+			LDLM_ERROR(lock, "data mismatch with object "
+				   DFID" (%p)", PFID(fid), inode);
+		}
+
+		if (bits & MDS_INODELOCK_OPEN) {
+			int flags = 0;
+			switch (lock->l_req_mode) {
+			case LCK_CW:
+				flags = FMODE_WRITE;
+				break;
+			case LCK_PR:
+				flags = FMODE_EXEC;
+				break;
+			case LCK_CR:
+				flags = FMODE_READ;
+				break;
+			default:
+				CERROR("Unexpected lock mode for OPEN lock "
+				       "%d, inode %ld\n", lock->l_req_mode,
+				       inode->i_ino);
+			}
+			ll_md_real_close(inode, flags);
+		}
+
+		lli = ll_i2info(inode);
+		if (bits & MDS_INODELOCK_LAYOUT) {
+			struct cl_object_conf conf = { { 0 } };
+
+			conf.coc_opc = OBJECT_CONF_INVALIDATE;
+			conf.coc_inode = inode;
+			rc = ll_layout_conf(inode, &conf);
+			if (rc)
+				CDEBUG(D_INODE, "invaliding layout %d.\n", rc);
+		}
+
+		if (bits & MDS_INODELOCK_UPDATE)
+			lli->lli_flags &= ~LLIF_MDS_SIZE_LOCK;
+		/* directories also drop cached pages and negative children */
+		if (S_ISDIR(inode->i_mode) &&
+		     (bits & MDS_INODELOCK_UPDATE)) {
+			CDEBUG(D_INODE, "invalidating inode %lu\n",
+			       inode->i_ino);
+			truncate_inode_pages(inode->i_mapping, 0);
+			ll_invalidate_negative_children(inode);
+		}
+		/* the aliases of the filesystem root inode are never invalidated */
+		if (inode->i_sb->s_root &&
+		    inode != inode->i_sb->s_root->d_inode &&
+		    (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)))
+			ll_invalidate_aliases(inode);
+		iput(inode);
+		break;
+	}
+	default:
+		LBUG();
+	}
+
+	RETURN(0);
+}
+
+__u32 ll_i2suppgid(struct inode *i)
+{
+	/* the inode's gid when current_is_in_group() accepts it, else -1 */
+	if (!current_is_in_group(i->i_gid))
+		return (__u32)(-1);
+	return (__u32)i->i_gid;
+}
+
+/* Pack the required supplementary groups into the supplied groups array.
+ * suppgids[0] comes from @i1 and suppgids[1] from @i2 (-1 when @i2 is NULL);
+ * see ll_i2suppgid().  The #if 0 code that would fill unused slots from the
+ * user's own supplementary groups is compiled out. */
+void ll_i2gids(__u32 *suppgids, struct inode *i1, struct inode *i2)
+{
+#if 0
+	int i;
+#endif
+
+	LASSERT(i1 != NULL);
+	LASSERT(suppgids != NULL);
+
+	suppgids[0] = ll_i2suppgid(i1);
+
+	if (i2)
+		suppgids[1] = ll_i2suppgid(i2);
+	else
+		suppgids[1] = -1;
+
+#if 0
+	for (i = 0; i < current_ngroups; i++) {
+		if (suppgids[0] == -1) {
+			if (current_groups[i] != suppgids[1])
+				suppgids[0] = current_groups[i];
+			continue;
+		}
+		if (suppgids[1] == -1) {
+			if (current_groups[i] != suppgids[0])
+				suppgids[1] = current_groups[i];
+			continue;
+		}
+		break;
+	}
+#endif
+}
+
+/*
+ * try to reuse three types of dentry:
+ * 1. unhashed alias, this one is unhashed by d_invalidate (but it may be valid
+ *    by concurrent .revalidate).
+ * 2. INVALID alias (common case for no valid ldlm lock held, but this flag may
+ *    be cleared by others calling d_lustre_revalidate).
+ * 3. DISCONNECTED alias.
+ */
+static struct dentry *ll_find_alias(struct inode *inode, struct dentry *dentry)
+{
+	struct dentry *alias, *discon_alias, *invalid_alias;
+	struct ll_d_hlist_node *p;
+
+	if (ll_d_hlist_empty(&inode->i_dentry))
+		return NULL;
+
+	discon_alias = invalid_alias = NULL;
+
+	ll_lock_dcache(inode);
+	ll_d_hlist_for_each_entry(alias, p, &inode->i_dentry, d_alias) {
+		LASSERT(alias != dentry);
+
+		spin_lock(&alias->d_lock);
+		if (alias->d_flags & DCACHE_DISCONNECTED)
+			/* LASSERT(last_discon == NULL); LU-405, bz 20055 */
+			discon_alias = alias;
+		else if (alias->d_parent == dentry->d_parent	     &&
+			 alias->d_name.hash == dentry->d_name.hash       &&
+			 alias->d_name.len == dentry->d_name.len	 &&
+			 memcmp(alias->d_name.name, dentry->d_name.name,
+				dentry->d_name.len) == 0)
+			invalid_alias = alias;
+		spin_unlock(&alias->d_lock);
+		/* a same-parent, same-name alias is preferred: stop scanning */
+		if (invalid_alias)
+			break;
+	}
+	alias = invalid_alias ?: discon_alias ?: NULL;
+	if (alias) {
+		spin_lock(&alias->d_lock);
+		dget_dlock(alias);
+		spin_unlock(&alias->d_lock);
+	}
+	ll_unlock_dcache(inode);
+
+	return alias;
+}
+
+/*
+ * Like d_splice_alias(), except that an invalid alias is handled the same
+ * way as a DCACHE_DISCONNECTED one: it is reused whenever one is found.
+ */
+struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de)
+{
+	struct dentry *reuse = inode ? ll_find_alias(inode, de) : NULL;
+
+	if (reuse == NULL) {
+		/* nothing to recycle: hash @de itself, marked invalid */
+		ll_dops_init(de, 1, 1);
+		__d_lustre_invalidate(de);
+		d_add(de, inode);
+		CDEBUG(D_DENTRY, "Add dentry %p inode %p refc %d flags %#x\n",
+		       de, de->d_inode, d_refcount(de), de->d_flags);
+		return de;
+	}
+
+	/* An existing alias was found: move it over @de and drop one
+	 * reference on @inode. */
+	ll_dops_init(reuse, 1, 1);
+	d_move(reuse, de);
+	iput(inode);
+	CDEBUG(D_DENTRY, "Reuse dentry %p inode %p refc %d flags %#x\n",
+	       reuse, reuse->d_inode, d_refcount(reuse), reuse->d_flags);
+	return reuse;
+}
+
+int ll_lookup_it_finish(struct ptlrpc_request *request,
+			struct lookup_intent *it, void *data)
+{
+	struct it_cb_data *icbd = data;
+	struct dentry **de = icbd->icbd_childp;
+	struct inode *parent = icbd->icbd_parent;
+	struct inode *inode = NULL;
+	__u64 bits = 0;
+	int rc;
+	ENTRY;
+	/* Bind the lookup reply to *de; returns 0 or the ll_prep_inode() rc */
+	/* NB 1 request reference will be taken away by ll_intent_lock()
+	 * when I return */
+	CDEBUG(D_DENTRY, "it %p it_disposition %x\n", it,
+	       it->d.lustre.it_disposition);
+	if (!it_disposition(it, DISP_LOOKUP_NEG)) {
+		rc = ll_prep_inode(&inode, request, (*de)->d_sb, it);
+		if (rc)
+			RETURN(rc);
+
+		ll_set_lock_data(ll_i2sbi(parent)->ll_md_exp, inode, it, &bits);
+
+		/* We used to query real size from OSTs here, but actually
+		   this is not needed. For stat() calls size would be updated
+		   from subsequent do_revalidate()->ll_inode_revalidate_it() in
+		   2.4 and
+		   vfs_getattr_it->ll_getattr()->ll_inode_revalidate_it() in 2.6
+		   Everybody else who needs correct file size would call
+		   ll_glimpse_size or some equivalent themselves anyway.
+		   Also see bug 7198. */
+	}
+
+	/* Only hash *de if it is unhashed (new dentry).
+	 * atomic_open may pass in hashed dentries for open.
+	 */
+	if (d_unhashed(*de))
+		*de = ll_splice_alias(inode, *de);
+
+	if (!it_disposition(it, DISP_LOOKUP_NEG)) {
+		/* we have lookup lock - unhide dentry */
+		if (bits & MDS_INODELOCK_LOOKUP)
+			d_lustre_revalidate(*de);
+	} else if (!it_disposition(it, DISP_OPEN_CREATE)) {
+		/* If file created on server, don't depend on parent UPDATE
+		 * lock to unhide it. It is left hidden and next lookup can
+		 * find it in ll_splice_alias.
+		 */
+		/* Check that parent has UPDATE lock. */
+		struct lookup_intent parent_it = {
+					.it_op = IT_GETATTR,
+					.d.lustre.it_lock_handle = 0 };
+
+		if (md_revalidate_lock(ll_i2mdexp(parent), &parent_it,
+				       &ll_i2info(parent)->lli_fid, NULL)) {
+			d_lustre_revalidate(*de);
+			ll_intent_release(&parent_it);
+		}
+	}
+
+	RETURN(0);
+}
+
+static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry,
+				   struct lookup_intent *it, int lookup_flags)
+{
+	struct lookup_intent lookup_it = { .it_op = IT_LOOKUP };
+	struct dentry *save = dentry, *retval;
+	struct ptlrpc_request *req = NULL;
+	struct md_op_data *op_data;
+	struct it_cb_data icbd;
+	__u32 opc;
+	int rc;
+	ENTRY;
+	/* Intent-driven lookup of @dentry in @parent via md_intent_lock(). */
+	if (dentry->d_name.len > ll_i2sbi(parent)->ll_namelen)
+		RETURN(ERR_PTR(-ENAMETOOLONG));
+
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),intent=%s\n",
+	       dentry->d_name.len, dentry->d_name.name, parent->i_ino,
+	       parent->i_generation, parent, LL_IT2STR(it));
+
+	if (d_mountpoint(dentry))
+		CERROR("Tell Peter, lookup on mtpt, it %s\n", LL_IT2STR(it));
+
+	ll_frob_intent(&it, &lookup_it);
+
+	/* As do_lookup is called before follow_mount, root dentry may be left
+	 * not valid, revalidate it here. */
+	if (parent->i_sb->s_root && (parent->i_sb->s_root->d_inode == parent) &&
+	    (it->it_op & (IT_OPEN | IT_CREAT))) {
+		rc = ll_inode_revalidate_it(parent->i_sb->s_root, it,
+					    MDS_INODELOCK_LOOKUP);
+		if (rc)
+			RETURN(ERR_PTR(rc));
+	}
+
+	if (it->it_op == IT_GETATTR) {
+		rc = ll_statahead_enter(parent, &dentry, 0);
+		if (rc == 1) {
+			if (dentry == save)
+				GOTO(out, retval = NULL);
+			GOTO(out, retval = dentry);
+		}
+	}
+
+	icbd.icbd_childp = &dentry;
+	icbd.icbd_parent = parent;
+
+	if (it->it_op & IT_CREAT ||
+	    (it->it_op & IT_OPEN && it->it_create_mode & O_CREAT))
+		opc = LUSTRE_OPC_CREATE;
+	else
+		opc = LUSTRE_OPC_ANY;
+
+	op_data = ll_prep_md_op_data(NULL, parent, NULL, dentry->d_name.name,
+				     dentry->d_name.len, lookup_flags, opc,
+				     NULL);
+	if (IS_ERR(op_data))
+		RETURN((void *)op_data);
+
+	/* enforce umask if acl disabled or MDS doesn't support umask */
+	if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent)))
+		it->it_create_mode &= ~current_umask();
+
+	rc = md_intent_lock(ll_i2mdexp(parent), op_data, NULL, 0, it,
+			    lookup_flags, &req, ll_md_blocking_ast, 0);
+	ll_finish_md_op_data(op_data);
+	if (rc < 0)
+		GOTO(out, retval = ERR_PTR(rc));
+
+	rc = ll_lookup_it_finish(req, it, &icbd);
+	if (rc != 0) {
+		ll_intent_release(it);
+		GOTO(out, retval = ERR_PTR(rc));
+	}
+
+	if ((it->it_op & IT_OPEN) && dentry->d_inode &&
+	    !S_ISREG(dentry->d_inode->i_mode) &&
+	    !S_ISDIR(dentry->d_inode->i_mode)) {
+		ll_release_openhandle(dentry, it);
+	}
+	ll_lookup_finish_locks(it, dentry);
+	/* return NULL when the caller's own dentry was used, else the alias */
+	if (dentry == save)
+		GOTO(out, retval = NULL);
+	else
+		GOTO(out, retval = dentry);
+ out:
+	if (req)
+		ptlrpc_req_finished(req);
+	if (it->it_op == IT_GETATTR && (retval == NULL || retval == dentry))
+		ll_statahead_mark(parent, dentry);
+	return retval;
+}
+
+static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry,
+				   unsigned int flags)
+{
+	const unsigned int no_intent = LOOKUP_PARENT | LOOKUP_OPEN |
+				       LOOKUP_CREATE;
+	struct lookup_intent getattr_it = { .it_op = IT_GETATTR };
+	struct dentry *found;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),flags=%u\n",
+	       dentry->d_name.len, dentry->d_name.name, parent->i_ino,
+	       parent->i_generation, parent, flags);
+
+	/* Pure create (CREATE without OPEN): just hash an invalid negative
+	 * dentry here and let .create handle the race. */
+	if ((flags & (LOOKUP_CREATE | LOOKUP_OPEN)) == LOOKUP_CREATE) {
+		ll_dops_init(dentry, 1, 1);
+		__d_lustre_invalidate(dentry);
+		d_add(dentry, NULL);
+		return NULL;
+	}
+
+	/* parent/open/create lookups run without the local GETATTR intent */
+	if (flags & no_intent)
+		return ll_lookup_it(parent, dentry, NULL, 0);
+
+	found = ll_lookup_it(parent, dentry, &getattr_it, 0);
+	ll_intent_release(&getattr_it);
+	return found;
+}
+
+/*
+ * For cached negative dentry and new dentry, handle lookup/create/open
+ * together.
+ */
+static int ll_atomic_open(struct inode *dir, struct dentry *dentry,
+			  struct file *file, unsigned open_flags,
+			  umode_t mode, int *opened)
+{
+	struct lookup_intent *it;
+	struct dentry *de;
+	long long lookup_flags = LOOKUP_OPEN;
+	int rc = 0;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),file %p,"
+			   "open_flags %x,mode %x opened %d\n",
+	       dentry->d_name.len, dentry->d_name.name, dir->i_ino,
+	       dir->i_generation, dir, file, open_flags, mode, *opened);
+
+	OBD_ALLOC(it, sizeof(*it));
+	if (!it)
+		RETURN(-ENOMEM);
+	/* any non-zero @mode turns the open intent into open+create */
+	it->it_op = IT_OPEN;
+	if (mode) {
+		it->it_op |= IT_CREAT;
+		lookup_flags |= LOOKUP_CREATE;
+	}
+	it->it_create_mode = (mode & S_IALLUGO) | S_IFREG;
+	it->it_flags = (open_flags & ~O_ACCMODE) | OPEN_FMODE(open_flags);
+
+	/* Dentry added to dcache tree in ll_lookup_it */
+	de = ll_lookup_it(dir, dentry, it, lookup_flags);
+	if (IS_ERR(de))
+		rc = PTR_ERR(de);
+	else if (de != NULL)
+		dentry = de;
+
+	if (!rc) {
+		if (it_disposition(it, DISP_OPEN_CREATE)) {
+			/* Dentry instantiated in ll_create_it. */
+			rc = ll_create_it(dir, dentry, mode, it);
+			if (rc) {
+				/* We dget in ll_splice_alias. */
+				if (de != NULL)
+					dput(de);
+				goto out_release;
+			}
+
+			*opened |= FILE_CREATED;
+		}
+		if (dentry->d_inode && it_disposition(it, DISP_OPEN_OPEN)) {
+			/* Open dentry. */
+			if (S_ISFIFO(dentry->d_inode->i_mode)) {
+				/* We cannot call open here as it would
+				 * deadlock.
+				 */
+				if (it_disposition(it, DISP_ENQ_OPEN_REF))
+					ptlrpc_req_finished(
+						       (struct ptlrpc_request *)
+							  it->d.lustre.it_data);
+				rc = finish_no_open(file, de);
+			} else {
+				file->private_data = it;
+				rc = finish_open(file, dentry, NULL, opened);
+				/* We dget in ll_splice_alias. finish_open takes
+				 * care of dget for fd open.
+				 */
+				if (de != NULL)
+					dput(de);
+			}
+		} else {
+			rc = finish_no_open(file, de);
+		}
+	}
+	/* every path, success or failure, releases and frees the intent */
+out_release:
+	ll_intent_release(it);
+	OBD_FREE(it, sizeof(*it));
+
+	RETURN(rc);
+}
+
+
+/* We depend on "mode" being set with the proper file type/umask by now */
+static struct inode *ll_create_node(struct inode *dir, const char *name,
+				    int namelen, const void *data, int datalen,
+				    int mode, __u64 extra,
+				    struct lookup_intent *it)
+{
+	struct inode *inode = NULL;
+	struct ptlrpc_request *request = NULL;
+	struct ll_sb_info *sbi = ll_i2sbi(dir);
+	int rc;
+	ENTRY;
+
+	LASSERT(it && it->d.lustre.it_disposition);
+	/* the create enqueue left its request in @it; it is finished at out */
+	LASSERT(it_disposition(it, DISP_ENQ_CREATE_REF));
+	request = it->d.lustre.it_data;
+	it_clear_disposition(it, DISP_ENQ_CREATE_REF);
+	rc = ll_prep_inode(&inode, request, dir->i_sb, it);
+	if (rc)
+		GOTO(out, inode = ERR_PTR(rc));
+
+	LASSERT(ll_d_hlist_empty(&inode->i_dentry));
+
+	/* We asked for a lock on the directory, but were granted a
+	 * lock on the inode.  Since we finally have an inode pointer,
+	 * stuff it in the lock. */
+	CDEBUG(D_DLMTRACE, "setting l_ast_data to inode %p (%lu/%u)\n",
+	       inode, inode->i_ino, inode->i_generation);
+	ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL);
+	EXIT;
+ out:
+	ptlrpc_req_finished(request);
+	return inode;
+}
+
+/*
+ * When this runs the directory cache entry for the new file already exists,
+ * but it is still negative, i.e. it has no inode attached.
+ *
+ * Creation of the OBD object(s) is deferred until open.  That keeps the
+ * intent and non-intent code paths alike, and the MDS inode number (needed
+ * for LOV) is not known before ll_create_node() is called, so creating
+ * them here would cost one more RPC to the MDS to store the LOV EA data.
+ * If it ever became necessary, the PACKED lmm could be passed as data with
+ * lmm_size in datalen (the MDS still has code which will handle that).
+ *
+ * On success the inode information is filled in
+ * with d_instantiate().
+ */
+static int ll_create_it(struct inode *dir, struct dentry *dentry, int mode,
+			struct lookup_intent *it)
+{
+	struct qstr *name = &dentry->d_name;
+	struct inode *created;
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),intent=%s\n",
+	       dentry->d_name.len, dentry->d_name.name, dir->i_ino,
+	       dir->i_generation, dir, LL_IT2STR(it));
+
+	rc = it_open_error(DISP_OPEN_CREATE, it);
+	if (rc)
+		RETURN(rc);
+
+	created = ll_create_node(dir, name->name, name->len, NULL, 0, mode,
+				 0, it);
+	if (IS_ERR(created))
+		RETURN(PTR_ERR(created));
+
+	if (filename_is_volatile(name->name, name->len, NULL))
+		ll_i2info(created)->lli_volatile = true;
+	d_instantiate(dentry, created);
+	RETURN(0);
+}
+
+static void ll_update_times(struct ptlrpc_request *request,
+			    struct inode *inode)
+{
+	struct mdt_body *reply;	/* MDT body of @request's server reply */
+
+	reply = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
+	LASSERT(reply);
+	if ((reply->valid & OBD_MD_FLMTIME) &&
+	    reply->mtime > LTIME_S(inode->i_mtime)) {
+		CDEBUG(D_INODE, "setting ino %lu mtime from %lu to "LPU64"\n",
+		       inode->i_ino, LTIME_S(inode->i_mtime), reply->mtime);
+		LTIME_S(inode->i_mtime) = reply->mtime;
+	}
+	if ((reply->valid & OBD_MD_FLCTIME) &&
+	    reply->ctime > LTIME_S(inode->i_ctime))
+		LTIME_S(inode->i_ctime) = reply->ctime;
+}
+
+static int ll_new_node(struct inode *dir, struct qstr *name,
+		       const char *tgt, int mode, int rdev,
+		       struct dentry *dchild, __u32 opc)
+{
+	struct ptlrpc_request *request = NULL;
+	struct md_op_data *op_data;
+	struct inode *inode = NULL;
+	struct ll_sb_info *sbi = ll_i2sbi(dir);
+	int tgt_len = 0;
+	int err;
+
+	ENTRY;
+	if (unlikely(tgt != NULL))
+		tgt_len = strlen(tgt) + 1;	/* length includes the NUL */
+	/* create @name on the MDS, then instantiate @dchild if one is given */
+	op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name,
+				     name->len, 0, opc, NULL);
+	if (IS_ERR(op_data))
+		GOTO(err_exit, err = PTR_ERR(op_data));
+
+	err = md_create(sbi->ll_md_exp, op_data, tgt, tgt_len, mode,
+			current_fsuid(), current_fsgid(),
+			cfs_curproc_cap_pack(), rdev, &request);
+	ll_finish_md_op_data(op_data);
+	if (err)
+		GOTO(err_exit, err);
+
+	ll_update_times(request, dir);
+
+	if (dchild) {
+		err = ll_prep_inode(&inode, request, dchild->d_sb, NULL);
+		if (err)
+		     GOTO(err_exit, err);
+
+		d_instantiate(dchild, inode);
+	}
+	EXIT;
+err_exit:
+	ptlrpc_req_finished(request);
+
+	return err;
+}
+
+static int ll_mknod_generic(struct inode *dir, struct qstr *name, int mode,
+			    unsigned rdev, struct dentry *dchild)
+{
+	int err;
+	ENTRY;
+	/* Create a non-directory node @name in @dir; S_IFDIR gives -EPERM. */
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p) mode %o dev %x\n",
+	       name->len, name->name, dir->i_ino, dir->i_generation, dir,
+	       mode, rdev);
+	/* enforce umask if acl disabled or MDS doesn't support umask */
+	if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir)))
+		mode &= ~current_umask();
+
+	switch (mode & S_IFMT) {
+	case 0:
+		mode |= S_IFREG; /* for mode = 0 case, fallthrough */
+	case S_IFREG:
+	case S_IFCHR:
+	case S_IFBLK:
+	case S_IFIFO:
+	case S_IFSOCK:
+		err = ll_new_node(dir, name, NULL, mode, rdev, dchild,
+				  LUSTRE_OPC_MKNOD);
+		break;
+	case S_IFDIR:
+		err = -EPERM;
+		break;
+	default:
+		err = -EINVAL;
+	}
+
+	if (!err)
+		ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKNOD, 1);
+
+	RETURN(err);
+}
+
+/*
+ * Plain create. Intent create is handled in atomic_open.
+ */
+static int ll_create_nd(struct inode *dir, struct dentry *dentry,
+			umode_t mode, bool want_excl)
+{
+	int rc;
+	/* NOTE(review): the trace below prints @mode under the label "flags" */
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),"
+			   "flags=%u, excl=%d\n",
+	       dentry->d_name.len, dentry->d_name.name, dir->i_ino,
+	       dir->i_generation, dir, mode, want_excl);
+
+	rc = ll_mknod_generic(dir, &dentry->d_name, mode, 0, dentry);
+	/* NOTE(review): tallied even when the create failed - intended? */
+	ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_CREATE, 1);
+
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, unhashed %d\n",
+	       dentry->d_name.len, dentry->d_name.name, d_unhashed(dentry));
+
+	return rc;
+}
+
+static int ll_symlink_generic(struct inode *dir, struct qstr *name,
+			      const char *tgt, struct dentry *dchild)
+{
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),target=%.*s\n",
+	       name->len, name->name, dir->i_ino, dir->i_generation,
+	       dir, 3000, tgt);
+
+	/* a symlink is created with mode S_IFLNK | S_IRWXUGO and rdev 0 */
+	rc = ll_new_node(dir, name, tgt, S_IFLNK | S_IRWXUGO, 0, dchild,
+			 LUSTRE_OPC_SYMLINK);
+	if (rc == 0)
+		ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_SYMLINK, 1);
+
+	RETURN(rc);
+}
+
+static int ll_link_generic(struct inode *src,  struct inode *dir,
+			   struct qstr *name, struct dentry *dchild)
+{
+	struct ll_sb_info *sbi = ll_i2sbi(dir);
+	struct ptlrpc_request *request = NULL;
+	struct md_op_data *op_data;
+	int err;
+
+	ENTRY;
+	CDEBUG(D_VFSTRACE,
+	       "VFS Op: inode=%lu/%u(%p), dir=%lu/%u(%p), target=%.*s\n",
+	       src->i_ino, src->i_generation, src, dir->i_ino,
+	       dir->i_generation, dir, name->len, name->name);
+	/* issue md_link() for @src under @name in @dir; @dchild is not used */
+	op_data = ll_prep_md_op_data(NULL, src, dir, name->name, name->len,
+				     0, LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+
+	err = md_link(sbi->ll_md_exp, op_data, &request);
+	ll_finish_md_op_data(op_data);
+	if (err)
+		GOTO(out, err);
+
+	ll_update_times(request, dir);
+	ll_stats_ops_tally(sbi, LPROC_LL_LINK, 1);
+	EXIT;
+out:
+	ptlrpc_req_finished(request);
+	RETURN(err);
+}
+
+/* Create directory @name in @dir; a zero return is tallied as one mkdir. */
+static int ll_mkdir_generic(struct inode *dir, struct qstr *name,
+			    int mode, struct dentry *dchild)
+{
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
+	       name->len, name->name, dir->i_ino, dir->i_generation, dir);
+
+	/* enforce umask if acl disabled or MDS doesn't support umask */
+	if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir)))
+		mode &= ~current_umask();
+	mode = S_IFDIR | (mode & (S_IRWXUGO | S_ISVTX));
+	rc = ll_new_node(dir, name, NULL, mode, 0, dchild, LUSTRE_OPC_MKDIR);
+	if (rc == 0)
+		ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKDIR, 1);
+
+	RETURN(rc);
+}
+
+/* Look @name up under an alias of @dir; when the child found has an inode,
+ * copy that inode's FID to @fid.  Otherwise @fid is left untouched. */
+static void ll_get_child_fid(struct inode * dir, struct qstr *name,
+			     struct lu_fid *fid)
+{
+	struct dentry *parent, *child;
+	/* NOTE(review): assumes @dir has an alias; i_dentry is read unlocked */
+	parent = ll_d_hlist_entry(dir->i_dentry, struct dentry, d_alias);
+	child = d_lookup(parent, name);
+	if (child) {
+		if (child->d_inode)
+			*fid = *ll_inode2fid(child->d_inode);
+		dput(child);
+	}
+}
+
+static int ll_rmdir_generic(struct inode *dir, struct dentry *dparent,
+			    struct dentry *dchild, struct qstr *name)
+{
+	struct ptlrpc_request *request = NULL;
+	struct md_op_data *op_data;
+	int rc;
+	ENTRY;
+	/* Unlink directory @name from @dir via the MDS; -EBUSY if mounted on */
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
+	       name->len, name->name, dir->i_ino, dir->i_generation, dir);
+
+	if (unlikely(ll_d_mountpoint(dparent, dchild, name)))
+		RETURN(-EBUSY);
+
+	op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name, name->len,
+				     S_IFDIR, LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+	/* pass the child's FID, when it is cached, as both fid3 and fid2 */
+	ll_get_child_fid(dir, name, &op_data->op_fid3);
+	op_data->op_fid2 = op_data->op_fid3;
+	rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request);
+	ll_finish_md_op_data(op_data);
+	if (rc == 0) {
+		ll_update_times(request, dir);
+		ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, 1);
+	}
+
+	ptlrpc_req_finished(request);
+	RETURN(rc);
+}
+
+/**
+ * Remove dir entry @name from @dir; the unlink is sent with CLI_RM_ENTRY set
+ **/
+int ll_rmdir_entry(struct inode *dir, char *name, int namelen)
+{
+	struct ptlrpc_request *request = NULL;
+	struct md_op_data *op_data;
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
+	       namelen, name, dir->i_ino, dir->i_generation, dir);
+	/* NOTE(review): uses strlen(name), not @namelen - presumably equal */
+	op_data = ll_prep_md_op_data(NULL, dir, NULL, name, strlen(name),
+				     S_IFDIR, LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+	op_data->op_cli_flags |= CLI_RM_ENTRY;
+	rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request);
+	ll_finish_md_op_data(op_data);
+	if (rc == 0) {
+		ll_update_times(request, dir);
+		ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, 1);
+	}
+
+	ptlrpc_req_finished(request);
+	RETURN(rc);
+}
+
+/* Destroy the OST objects named by the EA in an unlink/rename reply. */
+int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir)
+{
+	struct mdt_body *body;
+	struct lov_mds_md *eadata;
+	struct lov_stripe_md *lsm = NULL;
+	struct obd_trans_info oti = { 0 };
+	struct obdo *oa;
+	struct obd_capa *oc = NULL;
+	int rc;
+	ENTRY;
+	/* req is swabbed so this is safe */
+	body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
+	if (!(body->valid & OBD_MD_FLEASIZE))
+		RETURN(0);
+
+	if (body->eadatasize == 0) {
+		CERROR("OBD_MD_FLEASIZE set but eadatasize zero\n");
+		GOTO(out, rc = -EPROTO);
+	}
+
+	/* The MDS sent back the EA because we unlinked the last reference
+	 * to this file. Use this EA to unlink the objects on the OST.
+	 * It's opaque so we don't swab here; we leave it to obd_unpackmd() to
+	 * check it is complete and sensible. */
+	eadata = req_capsule_server_sized_get(&request->rq_pill, &RMF_MDT_MD,
+					      body->eadatasize);
+	LASSERT(eadata != NULL);
+
+	rc = obd_unpackmd(ll_i2dtexp(dir), &lsm, eadata, body->eadatasize);
+	if (rc < 0) {
+		CERROR("obd_unpackmd: %d\n", rc);
+		GOTO(out, rc);
+	}
+	LASSERT(rc >= sizeof(*lsm));
+	/* if oa cannot be allocated only lsm is freed: skip OBDO_FREE(NULL) */
+	OBDO_ALLOC(oa);
+	if (oa == NULL)
+		GOTO(out_free_memmd, rc = -ENOMEM);
+
+	oa->o_oi = lsm->lsm_oi;
+	oa->o_mode = body->mode & S_IFMT;
+	oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLGROUP;
+
+	if (body->valid & OBD_MD_FLCOOKIE) {
+		oa->o_valid |= OBD_MD_FLCOOKIE;
+		oti.oti_logcookies = req_capsule_server_sized_get(
+					&request->rq_pill, &RMF_LOGCOOKIES,
+					sizeof(struct llog_cookie) *
+					lsm->lsm_stripe_count);
+		if (oti.oti_logcookies == NULL) {
+			oa->o_valid &= ~OBD_MD_FLCOOKIE;
+			body->valid &= ~OBD_MD_FLCOOKIE;
+		}
+	}
+
+	if (body->valid & OBD_MD_FLOSSCAPA) {
+		rc = md_unpack_capa(ll_i2mdexp(dir), request, &RMF_CAPA2, &oc);
+		if (rc)
+			GOTO(out_free_oa, rc);
+	}
+
+	rc = obd_destroy(NULL, ll_i2dtexp(dir), oa, lsm, &oti,
+			 ll_i2mdexp(dir), oc);
+	capa_put(oc);
+	if (rc)
+		CERROR("obd destroy objid "DOSTID" error %d\n",
+		       POSTID(&lsm->lsm_oi), rc);
+out_free_oa:
+	OBDO_FREE(oa);
+out_free_memmd:
+	obd_free_memmd(ll_i2dtexp(dir), &lsm);
+out:
+	return rc;
+}
+
+/* ll_unlink_generic() doesn't update the inode with the new link count.
+ * Instead, ll_ddelete() and ll_d_iput() will update it based upon if there
+ * is any lock existing. They will recycle dentries and inodes based upon locks
+ * too. b=20433 */
+static int ll_unlink_generic(struct inode *dir, struct dentry *dparent,
+			     struct dentry *dchild, struct qstr *name)
+{
+	struct ptlrpc_request *request = NULL;
+	struct md_op_data *op_data;
+	int rc;
+	ENTRY;
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
+	       name->len, name->name, dir->i_ino, dir->i_generation, dir);
+
+	/*
+	 * XXX: unlink bind mountpoint maybe call to here,
+	 * just check it as vfs_unlink does.
+	 */
+	if (unlikely(ll_d_mountpoint(dparent, dchild, name)))
+		RETURN(-EBUSY);
+
+	op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name,
+				     name->len, 0, LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+	/* pass the child's FID, when it is cached, as both fid3 and fid2 */
+	ll_get_child_fid(dir, name, &op_data->op_fid3);
+	op_data->op_fid2 = op_data->op_fid3;
+	rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request);
+	ll_finish_md_op_data(op_data);
+	if (rc)
+		GOTO(out, rc);
+
+	ll_update_times(request, dir);
+	ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_UNLINK, 1);
+	/* ll_objects_destroy() acts only if the reply has OBD_MD_FLEASIZE */
+	rc = ll_objects_destroy(request, dir);
+ out:
+	ptlrpc_req_finished(request);
+	RETURN(rc);
+}
+
+static int ll_rename_generic(struct inode *src, struct dentry *src_dparent,
+			     struct dentry *src_dchild, struct qstr *src_name,
+			     struct inode *tgt, struct dentry *tgt_dparent,
+			     struct dentry *tgt_dchild, struct qstr *tgt_name)
+{
+	struct ptlrpc_request *request = NULL;
+	struct ll_sb_info *sbi = ll_i2sbi(src);
+	struct md_op_data *op_data;
+	int err;
+	ENTRY;
+	CDEBUG(D_VFSTRACE,"VFS Op:oldname=%.*s,src_dir=%lu/%u(%p),newname=%.*s,"
+	       "tgt_dir=%lu/%u(%p)\n", src_name->len, src_name->name,
+	       src->i_ino, src->i_generation, src, tgt_name->len,
+	       tgt_name->name, tgt->i_ino, tgt->i_generation, tgt);
+
+	if (unlikely(ll_d_mountpoint(src_dparent, src_dchild, src_name) ||
+	    ll_d_mountpoint(tgt_dparent, tgt_dchild, tgt_name)))
+		RETURN(-EBUSY);
+
+	op_data = ll_prep_md_op_data(NULL, src, tgt, NULL, 0, 0,
+				     LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+
+	ll_get_child_fid(src, src_name, &op_data->op_fid3);
+	ll_get_child_fid(tgt, tgt_name, &op_data->op_fid4);
+	err = md_rename(sbi->ll_md_exp, op_data,
+			src_name->name, src_name->len,
+			tgt_name->name, tgt_name->len, &request);
+	ll_finish_md_op_data(op_data);
+	if (!err) {
+		ll_update_times(request, src);
+		ll_update_times(request, tgt);
+		ll_stats_ops_tally(sbi, LPROC_LL_RENAME, 1);
+		err = ll_objects_destroy(request, src);
+	}
+
+	ptlrpc_req_finished(request);
+
+	RETURN(err);
+}
+
+static int ll_mknod(struct inode *dir, struct dentry *dchild, ll_umode_t mode,
+		    dev_t rdev)
+{
+	return ll_mknod_generic(dir, &dchild->d_name, mode,
+				old_encode_dev(rdev), dchild);
+}
+
+static int ll_unlink(struct inode * dir, struct dentry *dentry)
+{
+	return ll_unlink_generic(dir, NULL, dentry, &dentry->d_name);
+}
+
+static int ll_mkdir(struct inode *dir, struct dentry *dentry, ll_umode_t mode)
+{
+	return ll_mkdir_generic(dir, &dentry->d_name, mode, dentry);
+}
+
+static int ll_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	return ll_rmdir_generic(dir, NULL, dentry, &dentry->d_name);
+}
+
+static int ll_symlink(struct inode *dir, struct dentry *dentry,
+		      const char *oldname)
+{
+	return ll_symlink_generic(dir, &dentry->d_name, oldname, dentry);
+}
+
+static int ll_link(struct dentry *old_dentry, struct inode *dir,
+		   struct dentry *new_dentry)
+{
+	return ll_link_generic(old_dentry->d_inode, dir, &new_dentry->d_name,
+			       new_dentry);
+}
+
+static int ll_rename(struct inode *old_dir, struct dentry *old_dentry,
+		     struct inode *new_dir, struct dentry *new_dentry)
+{
+	int rc = ll_rename_generic(old_dir, NULL, old_dentry,
+				   &old_dentry->d_name, new_dir, NULL,
+				   new_dentry, &new_dentry->d_name);
+
+	/* move the dentry only when ll_rename_generic() succeeded */
+	if (rc)
+		return rc;
+	d_move(old_dentry, new_dentry);
+	return 0;
+}
+
+struct inode_operations ll_dir_inode_operations = {
+	.mknod	      = ll_mknod,
+	.atomic_open	    = ll_atomic_open,
+	.lookup	     = ll_lookup_nd,
+	.create	     = ll_create_nd,
+	/* We need all these non-raw things for NFSD, to not patch it. */
+	.unlink	     = ll_unlink,
+	.mkdir	      = ll_mkdir,
+	.rmdir	      = ll_rmdir,
+	.symlink	    = ll_symlink,
+	.link	       = ll_link,
+	.rename	     = ll_rename,
+	.setattr	    = ll_setattr,
+	.getattr	    = ll_getattr,
+	.permission	 = ll_inode_permission,
+	.setxattr	   = ll_setxattr,
+	.getxattr	   = ll_getxattr,
+	.listxattr	  = ll_listxattr,
+	.removexattr	= ll_removexattr,
+	.get_acl	    = ll_get_acl,
+};
+
+struct inode_operations ll_special_inode_operations = {
+	.setattr	= ll_setattr,
+	.getattr	= ll_getattr,
+	.permission     = ll_inode_permission,
+	.setxattr       = ll_setxattr,
+	.getxattr       = ll_getxattr,
+	.listxattr      = ll_listxattr,
+	.removexattr    = ll_removexattr,
+	.get_acl	    = ll_get_acl,
+};
diff --git a/drivers/staging/lustre/lustre/llite/remote_perm.c b/drivers/staging/lustre/lustre/llite/remote_perm.c
new file mode 100644
index 000000000000..68b2dc4a7b62
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/remote_perm.c
@@ -0,0 +1,333 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/remote_perm.c
+ *
+ * Lustre Permission Cache for Remote Client
+ *
+ * Author: Lai Siyao <lsy@clusterfs.com>
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/version.h>
+
+#include <lustre_lite.h>
+#include <lustre_ha.h>
+#include <lustre_dlm.h>
+#include <lprocfs_status.h>
+#include <lustre_disk.h>
+#include <lustre_param.h>
+#include "llite_internal.h"
+
+struct kmem_cache *ll_remote_perm_cachep = NULL;
+struct kmem_cache *ll_rmtperm_hash_cachep = NULL;
+
+static inline struct ll_remote_perm *alloc_ll_remote_perm(void)
+{
+	struct ll_remote_perm *lrp;
+
+	OBD_SLAB_ALLOC_PTR_GFP(lrp, ll_remote_perm_cachep, GFP_KERNEL);
+	if (lrp)
+		INIT_HLIST_NODE(&lrp->lrp_list);
+	return lrp;
+}
+
+static inline void free_ll_remote_perm(struct ll_remote_perm *lrp)
+{
+	if (!lrp)
+		return;
+
+	if (!hlist_unhashed(&lrp->lrp_list))
+		hlist_del(&lrp->lrp_list);
+	OBD_SLAB_FREE(lrp, ll_remote_perm_cachep, sizeof(*lrp));
+}
+
+struct hlist_head *alloc_rmtperm_hash(void)
+{
+	struct hlist_head *hash;
+	int i;
+
+	OBD_SLAB_ALLOC_GFP(hash, ll_rmtperm_hash_cachep,
+			   REMOTE_PERM_HASHSIZE * sizeof(*hash),
+			   GFP_IOFS);
+	if (!hash)
+		return NULL;
+
+	for (i = 0; i < REMOTE_PERM_HASHSIZE; i++)
+		INIT_HLIST_HEAD(hash + i);
+
+	return hash;
+}
+
+void free_rmtperm_hash(struct hlist_head *hash)
+{
+	int i;
+	struct ll_remote_perm *lrp;
+	struct hlist_node *next;
+
+	if(!hash)
+		return;
+
+	for (i = 0; i < REMOTE_PERM_HASHSIZE; i++)
+		hlist_for_each_entry_safe(lrp, next, hash + i,
+					      lrp_list)
+			free_ll_remote_perm(lrp);
+	OBD_SLAB_FREE(hash, ll_rmtperm_hash_cachep,
+		      REMOTE_PERM_HASHSIZE * sizeof(*hash));
+}
+
+static inline int remote_perm_hashfunc(uid_t uid)
+{
+	return uid & (REMOTE_PERM_HASHSIZE - 1);
+}
+
+/* Check \a mask against the cached remote permission of the current user.
+ * NB: setxid is not checked here, the MDT does it when perms are fetched. */
+static int do_check_remote_perm(struct ll_inode_info *lli, int mask)
+{
+	struct hlist_head *head;
+	struct ll_remote_perm *lrp;
+	int found = 0, rc;
+	ENTRY;
+
+	if (!lli->lli_remote_perms)
+		RETURN(-ENOENT);
+
+	head = lli->lli_remote_perms + remote_perm_hashfunc(current_uid());
+
+	spin_lock(&lli->lli_lock);
+	hlist_for_each_entry(lrp, head, lrp_list) {
+		if (lrp->lrp_uid != current_uid())
+			continue;
+		if (lrp->lrp_gid != current_gid())
+			continue;
+		if (lrp->lrp_fsuid != current_fsuid())
+			continue;
+		if (lrp->lrp_fsgid != current_fsgid())
+			continue;
+		found = 1;
+		break;
+	}
+
+	if (!found)
+		GOTO(out, rc = -ENOENT);
+
+	CDEBUG(D_SEC, "found remote perm: %u/%u/%u/%u - %#x\n",
+	       lrp->lrp_uid, lrp->lrp_gid, lrp->lrp_fsuid, lrp->lrp_fsgid,
+	       lrp->lrp_access_perm);
+	rc = ((lrp->lrp_access_perm & mask) == mask) ? 0 : -EACCES;
+
+out:
+	spin_unlock(&lli->lli_lock);
+	RETURN(rc);
+}
+
+int ll_update_remote_perm(struct inode *inode, struct mdt_remote_perm *perm)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct ll_remote_perm *lrp = NULL, *tmp = NULL;
+	struct hlist_head *head, *perm_hash = NULL;
+	ENTRY;
+
+	LASSERT(ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT);
+
+#if 0
+	if (perm->rp_uid != current->uid ||
+	    perm->rp_gid != current->gid ||
+	    perm->rp_fsuid != current->fsuid ||
+	    perm->rp_fsgid != current->fsgid) {
+		/* user might setxid in this small period */
+		CDEBUG(D_SEC,
+		       "remote perm user %u/%u/%u/%u != current %u/%u/%u/%u\n",
+		       perm->rp_uid, perm->rp_gid, perm->rp_fsuid,
+		       perm->rp_fsgid, current->uid, current->gid,
+		       current->fsuid, current->fsgid);
+		RETURN(-EAGAIN);
+	}
+#endif
+
+	if (!lli->lli_remote_perms) {
+		perm_hash = alloc_rmtperm_hash();
+		if (perm_hash == NULL) {
+			CERROR("alloc lli_remote_perms failed!\n");
+			RETURN(-ENOMEM);
+		}
+	}
+
+	spin_lock(&lli->lli_lock);
+
+	if (!lli->lli_remote_perms)
+		lli->lli_remote_perms = perm_hash;
+	else if (perm_hash)
+		free_rmtperm_hash(perm_hash);
+
+	head = lli->lli_remote_perms + remote_perm_hashfunc(perm->rp_uid);
+
+again:
+	hlist_for_each_entry(tmp, head, lrp_list) {
+		if (tmp->lrp_uid != perm->rp_uid)
+			continue;
+		if (tmp->lrp_gid != perm->rp_gid)
+			continue;
+		if (tmp->lrp_fsuid != perm->rp_fsuid)
+			continue;
+		if (tmp->lrp_fsgid != perm->rp_fsgid)
+			continue;
+		if (lrp)
+			free_ll_remote_perm(lrp);
+		lrp = tmp;
+		break;
+	}
+
+	if (!lrp) {
+		spin_unlock(&lli->lli_lock);
+		lrp = alloc_ll_remote_perm();
+		if (!lrp) {
+			CERROR("alloc memory for ll_remote_perm failed!\n");
+			RETURN(-ENOMEM);
+		}
+		spin_lock(&lli->lli_lock);
+		goto again;
+	}
+
+	lrp->lrp_access_perm = perm->rp_access_perm;
+	if (lrp != tmp) {
+		lrp->lrp_uid	 = perm->rp_uid;
+		lrp->lrp_gid	 = perm->rp_gid;
+		lrp->lrp_fsuid       = perm->rp_fsuid;
+		lrp->lrp_fsgid       = perm->rp_fsgid;
+		hlist_add_head(&lrp->lrp_list, head);
+	}
+	lli->lli_rmtperm_time = cfs_time_current();
+	spin_unlock(&lli->lli_lock);
+
+	CDEBUG(D_SEC, "new remote perm@%p: %u/%u/%u/%u - %#x\n",
+	       lrp, lrp->lrp_uid, lrp->lrp_gid, lrp->lrp_fsuid, lrp->lrp_fsgid,
+	       lrp->lrp_access_perm);
+
+	RETURN(0);
+}
+
+int lustre_check_remote_perm(struct inode *inode, int mask)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct ptlrpc_request *req = NULL;
+	struct mdt_remote_perm *perm;
+	struct obd_capa *oc;
+	cfs_time_t save;
+	int i = 0, rc;
+	ENTRY;
+
+	do {
+		save = lli->lli_rmtperm_time;
+		rc = do_check_remote_perm(lli, mask);
+		if (!rc || (rc != -ENOENT && i))
+			break;
+
+		might_sleep();
+
+		mutex_lock(&lli->lli_rmtperm_mutex);
+		/* check again */
+		if (save != lli->lli_rmtperm_time) {
+			rc = do_check_remote_perm(lli, mask);
+			if (!rc || (rc != -ENOENT && i)) {
+				mutex_unlock(&lli->lli_rmtperm_mutex);
+				break;
+			}
+		}
+
+		if (i++ > 5) {
+			CERROR("check remote perm falls in dead loop!\n");
+			LBUG();
+		}
+
+		oc = ll_mdscapa_get(inode);
+		rc = md_get_remote_perm(sbi->ll_md_exp, ll_inode2fid(inode), oc,
+					ll_i2suppgid(inode), &req);
+		capa_put(oc);
+		if (rc) {
+			mutex_unlock(&lli->lli_rmtperm_mutex);
+			break;
+		}
+
+		perm = req_capsule_server_swab_get(&req->rq_pill, &RMF_ACL,
+						   lustre_swab_mdt_remote_perm);
+		if (unlikely(perm == NULL)) {
+			mutex_unlock(&lli->lli_rmtperm_mutex);
+			rc = -EPROTO;
+			break;
+		}
+
+		rc = ll_update_remote_perm(inode, perm);
+		mutex_unlock(&lli->lli_rmtperm_mutex);
+		if (rc == -ENOMEM)
+			break;
+
+		ptlrpc_req_finished(req);
+		req = NULL;
+	} while (1);
+	ptlrpc_req_finished(req);
+	RETURN(rc);
+}
+
+#if 0  /* NB: remote perms can't be freed in ll_mdc_blocking_ast of UPDATE lock,
+	* because it will fail sanity test 48.
+	*/
+void ll_free_remote_perms(struct inode *inode)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct hlist_head *hash = lli->lli_remote_perms;
+	struct ll_remote_perm *lrp;
+	struct hlist_node *node, *next;
+	int i;
+
+	LASSERT(hash);
+
+	spin_lock(&lli->lli_lock);
+
+	for (i = 0; i < REMOTE_PERM_HASHSIZE; i++) {
+		hlist_for_each_entry_safe(lrp, node, next, hash + i,
+					      lrp_list)
+			free_ll_remote_perm(lrp);
+	}
+
+	spin_unlock(&lli->lli_lock);
+}
+#endif
diff --git a/drivers/staging/lustre/lustre/llite/rw.c b/drivers/staging/lustre/lustre/llite/rw.c
new file mode 100644
index 000000000000..0a0ac262eaaa
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/rw.c
@@ -0,0 +1,1307 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/rw.c
+ *
+ * Lustre Lite I/O page cache routines shared by different kernel revs
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/writeback.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+/* current_is_kswapd() */
+#include <linux/swap.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <lustre_lite.h>
+#include <obd_cksum.h>
+#include "llite_internal.h"
+#include <linux/lustre_compat25.h>
+
+/**
+ * Finalizes cl-data before exiting typical address_space operation. Dual to
+ * ll_cl_init().
+ */
+static void ll_cl_fini(struct ll_cl_context *lcc)
+{
+	struct lu_env  *env  = lcc->lcc_env;
+	struct cl_io   *io   = lcc->lcc_io;
+	struct cl_page *page = lcc->lcc_page;
+
+	LASSERT(lcc->lcc_cookie == current);
+	LASSERT(env != NULL);
+
+	if (page != NULL) {
+		lu_ref_del(&page->cp_reference, "cl_io", io);
+		cl_page_put(env, page);
+	}
+
+	if (io && lcc->lcc_created) {
+		cl_io_end(env, io);
+		cl_io_unlock(env, io);
+		cl_io_iter_fini(env, io);
+		cl_io_fini(env, io);
+	}
+	cl_env_put(env, &lcc->lcc_refcheck);
+}
+
+/**
+ * Initializes common cl-data at the typical address_space operation entry
+ * point; returns the context or an ERR_PTR(). Dual to ll_cl_fini().
+ */
+static struct ll_cl_context *ll_cl_init(struct file *file,
+					struct page *vmpage, int create)
+{
+	struct ll_cl_context *lcc;
+	struct lu_env    *env;
+	struct cl_io     *io;
+	struct cl_object *clob;
+	struct ccc_io    *cio;
+
+	int refcheck;
+	int result = 0;
+
+	clob = ll_i2info(vmpage->mapping->host)->lli_clob;
+	LASSERT(clob != NULL);
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		return ERR_CAST(env);
+
+	lcc = &vvp_env_info(env)->vti_io_ctx;
+	memset(lcc, 0, sizeof(*lcc));
+	lcc->lcc_env = env;
+	lcc->lcc_refcheck = refcheck;
+	lcc->lcc_cookie = current;
+
+	cio = ccc_env_io(env);
+	io = cio->cui_cl.cis_io;
+	if (io == NULL && create) {
+		struct inode *inode = vmpage->mapping->host;
+		loff_t pos;
+
+		if (mutex_trylock(&inode->i_mutex)) {
+			mutex_unlock(&(inode)->i_mutex);
+
+			/* this is too bad. Someone is trying to write the
+			 * page w/o holding inode mutex. This means we can
+			 * add dirty pages into cache during truncate */
+			CERROR("Proc %s is dirting page w/o inode lock, this "
+			       "will break truncate.\n", current->comm);
+			libcfs_debug_dumpstack(NULL);
+			LBUG();
+			return ERR_PTR(-EIO);
+		}
+
+		/*
+		 * Loop-back driver calls ->prepare_write() and ->sendfile()
+		 * methods directly, bypassing file system ->write() operation,
+		 * so cl_io has to be created here.
+		 */
+		io = ccc_env_thread_io(env);
+		ll_io_init(io, file, 1);
+
+		/* No lock at all for this kind of IO - we can't do it because
+		 * we have held page lock, it would cause deadlock.
+		 * XXX: This causes poor performance to loop device - One page
+		 *      per RPC.
+		 *      In order to get better performance, users should use
+		 *      lloop driver instead.
+		 */
+		io->ci_lockreq = CILR_NEVER;
+
+		pos = ((loff_t)vmpage->index << PAGE_CACHE_SHIFT);
+
+		/* Create a temp IO to serve write. */
+		result = cl_io_rw_init(env, io, CIT_WRITE, pos, PAGE_CACHE_SIZE);
+		if (result == 0) {
+			cio->cui_fd = LUSTRE_FPRIVATE(file);
+			cio->cui_iov = NULL;
+			cio->cui_nrsegs = 0;
+			result = cl_io_iter_init(env, io);
+			if (result == 0) {
+				result = cl_io_lock(env, io);
+				if (result == 0)
+					result = cl_io_start(env, io);
+			}
+		} else
+			result = io->ci_result;
+		lcc->lcc_created = 1;
+	}
+
+	lcc->lcc_io = io;
+	if (io == NULL)
+		result = -EIO;
+	if (result == 0) {
+		struct cl_page   *page;
+
+		LASSERT(io != NULL);
+		LASSERT(io->ci_state == CIS_IO_GOING);
+		LASSERT(cio->cui_fd == LUSTRE_FPRIVATE(file));
+		page = cl_page_find(env, clob, vmpage->index, vmpage,
+				    CPT_CACHEABLE);
+		if (!IS_ERR(page)) {
+			lcc->lcc_page = page;
+			lu_ref_add(&page->cp_reference, "cl_io", io);
+			result = 0;
+		} else
+			result = PTR_ERR(page);
+	}
+	if (result) {
+		ll_cl_fini(lcc);
+		lcc = ERR_PTR(result);
+	}
+
+	CDEBUG(D_VFSTRACE, "%lu@"DFID" -> %d %p %p\n",
+	       vmpage->index, PFID(lu_object_fid(&clob->co_lu)), result,
+	       env, io);
+	return lcc;
+}
+
+static struct ll_cl_context *ll_cl_get(void)
+{
+	struct ll_cl_context *lcc;
+	struct lu_env *env;
+	int refcheck;
+
+	env = cl_env_get(&refcheck);
+	LASSERT(!IS_ERR(env));
+	lcc = &vvp_env_info(env)->vti_io_ctx;
+	LASSERT(env == lcc->lcc_env);
+	LASSERT(current == lcc->lcc_cookie);
+	cl_env_put(env, &refcheck);
+
+	/* env has got in ll_cl_init, so it is still usable. */
+	return lcc;
+}
+
+/**
+ * ->prepare_write() address space operation called by generic_file_write()
+ * for every page during write.
+ */
+int ll_prepare_write(struct file *file, struct page *vmpage, unsigned from,
+		     unsigned to)
+{
+	struct ll_cl_context *lcc;
+	int result;
+	ENTRY;
+
+	lcc = ll_cl_init(file, vmpage, 1);
+	if (!IS_ERR(lcc)) {
+		struct lu_env  *env = lcc->lcc_env;
+		struct cl_io   *io  = lcc->lcc_io;
+		struct cl_page *page = lcc->lcc_page;
+
+		cl_page_assume(env, io, page);
+
+		result = cl_io_prepare_write(env, io, page, from, to);
+		if (result == 0) {
+			/*
+			 * Add a reference, so that page is not evicted from
+			 * the cache until ->commit_write() is called.
+			 */
+			cl_page_get(page);
+			lu_ref_add(&page->cp_reference, "prepare_write",
+				   current);
+		} else {
+			cl_page_unassume(env, io, page);
+			ll_cl_fini(lcc);
+		}
+		/* returning 0 in prepare assumes commit must be called
+		 * afterwards */
+	} else {
+		result = PTR_ERR(lcc);
+	}
+	RETURN(result);
+}
+
+int ll_commit_write(struct file *file, struct page *vmpage, unsigned from,
+		    unsigned to)
+{
+	struct ll_cl_context *lcc;
+	struct lu_env    *env;
+	struct cl_io     *io;
+	struct cl_page   *page;
+	int result = 0;
+	ENTRY;
+
+	lcc  = ll_cl_get();
+	env  = lcc->lcc_env;
+	page = lcc->lcc_page;
+	io   = lcc->lcc_io;
+
+	LASSERT(cl_page_is_owned(page, io));
+	LASSERT(from <= to);
+	if (from != to) /* handle short write case. */
+		result = cl_io_commit_write(env, io, page, from, to);
+	if (cl_page_is_owned(page, io))
+		cl_page_unassume(env, io, page);
+
+	/*
+	 * Release reference acquired by ll_prepare_write().
+	 */
+	lu_ref_del(&page->cp_reference, "prepare_write", current);
+	cl_page_put(env, page);
+	ll_cl_fini(lcc);
+	RETURN(result);
+}
+
+struct obd_capa *cl_capa_lookup(struct inode *inode, enum cl_req_type crt)
+{
+	__u64 opc;
+
+	opc = crt == CRT_WRITE ? CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_RW;
+	return ll_osscapa_get(inode, opc);
+}
+
+static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which);
+
+/**
+ * Get readahead pages from the filesystem readahead pool of the client for a
+ * thread.
+ *
+ * \param sbi superblock for filesystem readahead state ll_ra_info
+ * \param ria per-thread readahead state
+ * \param pages number of pages requested for readahead for the thread.
+ *
+ * WARNING: This algorithm is used to reduce contention on sbi->ll_lock.
+ * It should work well if the ra_max_pages is much greater than the single
+ * file's read-ahead window, and not too many threads contending for
+ * these readahead pages.
+ *
+ * TODO: There may be a 'global sync problem' if many threads are trying
+ * to get an ra budget that is larger than the remaining readahead pages
+ * and reach here at exactly the same time. They will compute \a ret to
+ * consume the remaining pages, but will fail at atomic_add_return() and
+ * get a zero ra window, although there is still ra space remaining. - Jay */
+
+static unsigned long ll_ra_count_get(struct ll_sb_info *sbi,
+				     struct ra_io_arg *ria,
+				     unsigned long pages)
+{
+	struct ll_ra_info *ra = &sbi->ll_ra_info;
+	long ret;
+	ENTRY;
+
+	/* If read-ahead pages left are less than 1M, do not do read-ahead,
+	 * otherwise it will form small read RPC(< 1M), which hurt server
+	 * performance a lot. */
+	ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages), pages);
+	if (ret < 0 || ret < min_t(long, PTLRPC_MAX_BRW_PAGES, pages))
+		GOTO(out, ret = 0);
+
+	/* If the non-strided (ria_pages == 0) readahead window
+	 * (ria_start + ret) has grown across an RPC boundary, then trim
+	 * readahead size by the amount beyond the RPC so it ends on an
+	 * RPC boundary. If the readahead window is already ending on
+	 * an RPC boundary (beyond_rpc == 0), or smaller than a full
+	 * RPC (beyond_rpc < ret) the readahead size is unchanged.
+	 * The (beyond_rpc != 0) check is skipped since the conditional
+	 * branch is more expensive than subtracting zero from the result.
+	 *
+	 * Strided read is left unaligned to avoid small fragments beyond
+	 * the RPC boundary from needing an extra read RPC. */
+	if (ria->ria_pages == 0) {
+		long beyond_rpc = (ria->ria_start + ret) % PTLRPC_MAX_BRW_PAGES;
+		if (/* beyond_rpc != 0 && */ beyond_rpc < ret)
+			ret -= beyond_rpc;
+	}
+
+	if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) {
+		atomic_sub(ret, &ra->ra_cur_pages);
+		ret = 0;
+	}
+
+out:
+	RETURN(ret);
+}
+
+void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len)
+{
+	struct ll_ra_info *ra = &sbi->ll_ra_info;
+	atomic_sub(len, &ra->ra_cur_pages);
+}
+
+static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which)
+{
+	LASSERTF(which >= 0 && which < _NR_RA_STAT, "which: %u\n", which);
+	lprocfs_counter_incr(sbi->ll_ra_stats, which);
+}
+
+void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which)
+{
+	struct ll_sb_info *sbi = ll_i2sbi(mapping->host);
+	ll_ra_stats_inc_sbi(sbi, which);
+}
+
+#define RAS_CDEBUG(ras) \
+	CDEBUG(D_READA,						      \
+	       "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu "   \
+	       "csr %lu sf %lu sp %lu sl %lu \n",			    \
+	       (ras)->ras_last_readpage, (ras)->ras_consecutive_requests,    \
+	       (ras)->ras_consecutive_pages, (ras)->ras_window_start,	    \
+	       (ras)->ras_window_len, (ras)->ras_next_readahead,	     \
+	       (ras)->ras_requests, (ras)->ras_request_index,		    \
+	       (ras)->ras_consecutive_stride_requests,			    \
+	       (ras)->ras_stride_offset, (ras)->ras_stride_pages, (ras)->ras_stride_length)
+
+static int index_in_window(unsigned long index, unsigned long point,
+			   unsigned long before, unsigned long after)
+{
+	unsigned long start = point - before, end = point + after;
+
+	if (start > point)
+	       start = 0;
+	if (end < point)
+	       end = ~0;
+
+	return start <= index && index <= end;
+}
+
+static struct ll_readahead_state *ll_ras_get(struct file *f)
+{
+	struct ll_file_data       *fd;
+
+	fd = LUSTRE_FPRIVATE(f);
+	return &fd->fd_ras;
+}
+
+void ll_ra_read_in(struct file *f, struct ll_ra_read *rar)
+{
+	struct ll_readahead_state *ras;
+
+	ras = ll_ras_get(f);
+
+	spin_lock(&ras->ras_lock);
+	ras->ras_requests++;
+	ras->ras_request_index = 0;
+	ras->ras_consecutive_requests++;
+	rar->lrr_reader = current;
+
+	list_add(&rar->lrr_linkage, &ras->ras_read_beads);
+	spin_unlock(&ras->ras_lock);
+}
+
+void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar)
+{
+	struct ll_readahead_state *ras;
+
+	ras = ll_ras_get(f);
+
+	spin_lock(&ras->ras_lock);
+	list_del_init(&rar->lrr_linkage);
+	spin_unlock(&ras->ras_lock);
+}
+
+static struct ll_ra_read *ll_ra_read_get_locked(struct ll_readahead_state *ras)
+{
+	struct ll_ra_read *scan;
+
+	list_for_each_entry(scan, &ras->ras_read_beads, lrr_linkage) {
+		if (scan->lrr_reader == current)
+			return scan;
+	}
+	return NULL;
+}
+
+struct ll_ra_read *ll_ra_read_get(struct file *f)
+{
+	struct ll_readahead_state *ras;
+	struct ll_ra_read	 *bead;
+
+	ras = ll_ras_get(f);
+
+	spin_lock(&ras->ras_lock);
+	bead = ll_ra_read_get_locked(ras);
+	spin_unlock(&ras->ras_lock);
+	return bead;
+}
+
+static int cl_read_ahead_page(const struct lu_env *env, struct cl_io *io,
+			      struct cl_page_list *queue, struct cl_page *page,
+			      struct page *vmpage)
+{
+	struct ccc_page *cp;
+	int	      rc;
+
+	ENTRY;
+
+	rc = 0;
+	cl_page_assume(env, io, page);
+	lu_ref_add(&page->cp_reference, "ra", current);
+	cp = cl2ccc_page(cl_page_at(page, &vvp_device_type));
+	if (!cp->cpg_defer_uptodate && !PageUptodate(vmpage)) {
+		rc = cl_page_is_under_lock(env, io, page);
+		if (rc == -EBUSY) {
+			cp->cpg_defer_uptodate = 1;
+			cp->cpg_ra_used = 0;
+			cl_page_list_add(queue, page);
+			rc = 1;
+		} else {
+			cl_page_delete(env, page);
+			rc = -ENOLCK;
+		}
+	} else {
+		/* skip completed pages */
+		cl_page_unassume(env, io, page);
+	}
+	lu_ref_del(&page->cp_reference, "ra", current);
+	cl_page_put(env, page);
+	RETURN(rc);
+}
+
+/**
+ * Initiates read-ahead of a page with given index.
+ *
+ * The page is grabbed with grab_cache_page_nowait(); a failure to queue it
+ * is accounted in the read-ahead statistics through ll_ra_stats_inc().
+ *
+ * \retval     +ve: page was added to \a queue.
+ *
+ * \retval -ENOLCK: there is no extent lock for this part of a file, stop
+ *		  read-ahead.
+ *
+ * \retval  -ve, 0: page wasn't added to \a queue for other reason.
+ */
+static int ll_read_ahead_page(const struct lu_env *env, struct cl_io *io,
+			      struct cl_page_list *queue,
+			      pgoff_t index, struct address_space *mapping)
+{
+	struct page      *vmpage;
+	struct cl_object *clob  = ll_i2info(mapping->host)->lli_clob;
+	struct cl_page   *page;
+	enum ra_stat      which = _NR_RA_STAT; /* keep gcc happy */
+	int	       rc    = 0;
+	const char       *msg   = NULL;
+
+	ENTRY;
+
+	vmpage = grab_cache_page_nowait(mapping, index);
+	if (vmpage != NULL) {
+		/* Check if vmpage was truncated or reclaimed */
+		if (vmpage->mapping == mapping) {
+			page = cl_page_find(env, clob, vmpage->index,
+					    vmpage, CPT_CACHEABLE);
+			if (!IS_ERR(page)) {
+				rc = cl_read_ahead_page(env, io, queue,
+							page, vmpage);
+				if (rc == -ENOLCK) {
+					which = RA_STAT_FAILED_MATCH;
+					msg   = "lock match failed";
+				}
+			} else {
+				which = RA_STAT_FAILED_GRAB_PAGE;
+				msg   = "cl_page_find failed";
+			}
+		} else {
+			which = RA_STAT_WRONG_GRAB_PAGE;
+			msg   = "g_c_p_n returned invalid page";
+		}
+		/* leave the vmpage locked only when it was queued (rc == 1) */
+		if (rc != 1)
+			unlock_page(vmpage);
+		page_cache_release(vmpage);
+	} else {
+		which = RA_STAT_FAILED_GRAB_PAGE;
+		msg   = "g_c_p_n failed";
+	}
+	/* msg is set only on the failure paths above */
+	if (msg != NULL) {
+		ll_ra_stats_inc(mapping, which);
+		CDEBUG(D_READA, "%s\n", msg);
+	}
+	RETURN(rc);
+}
+
+#define RIA_DEBUG(ria)						       \
+	CDEBUG(D_READA, "rs %lu re %lu ro %lu rl %lu rp %lu\n",       \
+	ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\
+	ria->ria_pages)
+
+/* Limit this to the blocksize instead of PTLRPC_BRW_MAX_SIZE, since we don't
+ * know what the actual RPC size is.  If this needs to change, it makes more
+ * sense to tune the i_blkbits value for the file based on the OSTs it is
+ * striped over, rather than having a constant value for all files here. */
+
+/* RAS_INCREASE_STEP should be (1UL << (inode->i_blkbits - PAGE_CACHE_SHIFT)).
+ * Temporarily set RAS_INCREASE_STEP to 1MB. After 4MB RPC is enabled
+ * by default, this should be adjusted in line with max_read_ahead_mb
+ * and max_read_ahead_per_file_mb, otherwise the readahead budget can be used
+ * up quickly, which will affect read performance significantly. See LU-2816 */
+#define RAS_INCREASE_STEP(inode) (ONE_MB_BRW_SIZE >> PAGE_CACHE_SHIFT)
+
+static inline int stride_io_mode(struct ll_readahead_state *ras)
+{
+	return ras->ras_consecutive_stride_requests > 1;
+}
+/* The function calculates how many pages will be read in
+ * [off, off + length], in such stride IO area,
+ * stride_offset = st_off, stride_length = st_len,
+ * stride_pages = st_pgs
+ *
+ *   |------------------|*****|------------------|*****|------------|*****|....
+ * st_off
+ *   |--- st_pgs     ---|
+ *   |-----     st_len   -----|
+ *
+ *	      How many pages it should read in such pattern
+ *	      |-------------------------------------------------------------|
+ *	      off
+ *	      |<------		  length		      ------->|
+ *
+ *	  =   |<----->|  +  |-------------------------------------| +   |---|
+ *	     start_left		 st_pgs * i		    end_left
+ */
+static unsigned long
+stride_pg_count(pgoff_t st_off, unsigned long st_len, unsigned long st_pgs,
+		unsigned long off, unsigned long length)
+{
+	__u64 start = off > st_off ? off - st_off : 0;
+	__u64 end = off + length > st_off ? off + length - st_off : 0;
+	unsigned long start_left = 0;
+	unsigned long end_left = 0;
+	unsigned long pg_count;
+
+	if (st_len == 0 || length == 0 || end == 0)
+		return length;
+
+	start_left = do_div(start, st_len);
+	if (start_left < st_pgs)
+		start_left = st_pgs - start_left;
+	else
+		start_left = 0;
+
+	end_left = do_div(end, st_len);
+	if (end_left > st_pgs)
+		end_left = st_pgs;
+
+	CDEBUG(D_READA, "start "LPU64", end "LPU64" start_left %lu end_left %lu \n",
+	       start, end, start_left, end_left);
+
+	if (start == end)
+		pg_count = end_left - (st_pgs - start_left);
+	else
+		pg_count = start_left + st_pgs * (end - start - 1) + end_left;
+
+	CDEBUG(D_READA, "st_off %lu, st_len %lu st_pgs %lu off %lu length %lu "
+	       "pgcount %lu\n", st_off, st_len, st_pgs, off, length, pg_count);
+
+	return pg_count;
+}
+
+static int ria_page_count(struct ra_io_arg *ria)
+{
+	__u64 length = ria->ria_end >= ria->ria_start ?
+		       ria->ria_end - ria->ria_start + 1 : 0;
+
+	return stride_pg_count(ria->ria_stoff, ria->ria_length,
+			       ria->ria_pages, ria->ria_start,
+			       length);
+}
+
+/* Tell whether page index @idx falls inside the read-ahead window of @ria. */
+static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria)
+{
+	/* ria_length == 0 or ria_length == ria_pages means non-stride I/O
+	 * mode, in which every index counts as inside the window. */
+	if (ria->ria_length == 0 || ria->ria_length == ria->ria_pages)
+		return 1;
+	/* Stride I/O mode: idx must be within the ria_pages of its stride. */
+	return idx >= ria->ria_stoff &&
+	       (idx - ria->ria_stoff) % ria->ria_length < ria->ria_pages;
+}
+
+static int ll_read_ahead_pages(const struct lu_env *env,
+			       struct cl_io *io, struct cl_page_list *queue,
+			       struct ra_io_arg *ria,
+			       unsigned long *reserved_pages,
+			       struct address_space *mapping,
+			       unsigned long *ra_end)
+{
+	int rc, count = 0, stride_ria;	/* count: pages for which rc == 1 */
+	unsigned long page_idx;
+
+	LASSERT(ria != NULL);
+	RIA_DEBUG(ria);
+
+	stride_ria = ria->ria_length > ria->ria_pages && ria->ria_pages > 0;
+	for (page_idx = ria->ria_start; page_idx <= ria->ria_end &&
+			*reserved_pages > 0; page_idx++) {
+		if (ras_inside_ra_window(page_idx, ria)) {
+			/* If the page is inside the read-ahead window*/
+			rc = ll_read_ahead_page(env, io, queue,
+						page_idx, mapping);
+			if (rc == 1) {
+				(*reserved_pages)--; /* one reservation consumed */
+				count ++;
+			} else if (rc == -ENOLCK)
+				break;
+		} else if (stride_ria) {
+			/* page_idx is in the gap between two stride chunks:
+			 * jump to the last gap page, so that the loop's own
+			 * page_idx++ lands on the first page of the next chunk */
+			pgoff_t offset;
+			/* FIXME: This assertion only is valid when it is for
+			 * forward read-ahead, it will be fixed when backward
+			 * read-ahead is implemented */
+			LASSERTF(page_idx > ria->ria_stoff, "Invalid page_idx %lu "
+				"rs %lu re %lu ro %lu rl %lu rp %lu\n", page_idx,
+				ria->ria_start, ria->ria_end, ria->ria_stoff,
+				ria->ria_length, ria->ria_pages);
+			offset = page_idx - ria->ria_stoff;
+			offset = offset % (ria->ria_length);
+			if (offset >= ria->ria_pages) {
+				page_idx += ria->ria_length - offset - 1;
+				CDEBUG(D_READA, "i %lu skip %lu \n", page_idx,
+				       ria->ria_length - offset - 1);
+				continue;
+			}
+		}
+	}
+	*ra_end = page_idx;	/* first index the loop did not process */
+	return count;
+}
+
+int ll_readahead(const struct lu_env *env, struct cl_io *io,
+		 struct ll_readahead_state *ras, struct address_space *mapping,
+		 struct cl_page_list *queue, int flags) /* flags is not referenced below */
+{
+	struct vvp_io *vio = vvp_env_io(env);
+	struct vvp_thread_info *vti = vvp_env_info(env);
+	struct cl_attr *attr = ccc_env_thread_attr(env);
+	unsigned long start = 0, end = 0, reserved; /* start/end: first/last page index */
+	unsigned long ra_end, len;
+	struct inode *inode;
+	struct ll_ra_read *bead;
+	struct ra_io_arg *ria = &vti->vti_ria;
+	struct ll_inode_info *lli;
+	struct cl_object *clob;
+	int ret = 0;
+	__u64 kms;
+	ENTRY;
+
+	inode = mapping->host;
+	lli = ll_i2info(inode);
+	clob = lli->lli_clob;
+
+	memset(ria, 0, sizeof *ria); /* stride fields stay 0 unless stride mode is on */
+
+	cl_object_attr_lock(clob);
+	ret = cl_object_attr_get(env, clob, attr);
+	cl_object_attr_unlock(clob);
+
+	if (ret != 0)
+		RETURN(ret);
+	kms = attr->cat_kms;
+	if (kms == 0) { /* nothing to read ahead; also keeps (kms - 1) below safe */
+		ll_ra_stats_inc(mapping, RA_STAT_ZERO_LEN);
+		RETURN(0);
+	}
+
+	spin_lock(&ras->ras_lock);
+	if (vio->cui_ra_window_set)
+		bead = &vio->cui_bead;
+	else
+		bead = NULL;
+
+	/* Enlarge the RA window so that it covers the whole read request */
+	if (bead != NULL && ras->ras_window_start + ras->ras_window_len <
+	    bead->lrr_start + bead->lrr_count) {
+		ras->ras_window_len = bead->lrr_start + bead->lrr_count -
+				      ras->ras_window_start;
+	}
+	/* Reserve a part of the read-ahead window that we'll be issuing */
+	if (ras->ras_window_len) {
+		start = ras->ras_next_readahead;
+		end = ras->ras_window_start + ras->ras_window_len - 1;
+	}
+	if (end != 0) {
+		unsigned long rpc_boundary;
+		/*
+		 * Align RA window to an optimal boundary.
+		 *
+		 * XXX This would be better to align to cl_max_pages_per_rpc
+		 * instead of PTLRPC_MAX_BRW_PAGES, because the RPC size may
+		 * be aligned to the RAID stripe size in the future and that
+		 * is more important than the RPC size.
+		 */
+		/* Note: we only trim the RPC, instead of extending the RPC
+		 * to the boundary, to avoid reading too many pages during
+		 * random reading. */
+		rpc_boundary = ((end + 1) & (~(PTLRPC_MAX_BRW_PAGES - 1)));
+		if (rpc_boundary > 0)
+			rpc_boundary--; /* last page index before the boundary */
+
+		if (rpc_boundary  > start)
+			end = rpc_boundary;
+
+		/* Truncate RA window to end of file */
+		end = min(end, (unsigned long)((kms - 1) >> PAGE_CACHE_SHIFT));
+
+		ras->ras_next_readahead = max(end, end + 1); /* end + 1, unless that wraps */
+		RAS_CDEBUG(ras);
+	}
+	ria->ria_start = start;
+	ria->ria_end = end;
+	/* If stride I/O mode is detected, copy the stride window into ria */
+	if (stride_io_mode(ras)) {
+		ria->ria_stoff = ras->ras_stride_offset;
+		ria->ria_length = ras->ras_stride_length;
+		ria->ria_pages = ras->ras_stride_pages;
+	}
+	spin_unlock(&ras->ras_lock);
+
+	if (end == 0) {
+		ll_ra_stats_inc(mapping, RA_STAT_ZERO_WINDOW);
+		RETURN(0);
+	}
+	len = ria_page_count(ria); /* pages wanted, stride gaps excluded */
+	if (len == 0)
+		RETURN(0);
+
+	reserved = ll_ra_count_get(ll_i2sbi(inode), ria, len);
+	if (reserved < len) /* got fewer pages than asked for */
+		ll_ra_stats_inc(mapping, RA_STAT_MAX_IN_FLIGHT);
+
+	CDEBUG(D_READA, "reserved page %lu ra_cur %d ra_max %lu\n", reserved,
+	       atomic_read(&ll_i2sbi(inode)->ll_ra_info.ra_cur_pages),
+	       ll_i2sbi(inode)->ll_ra_info.ra_max_pages);
+
+	ret = ll_read_ahead_pages(env, io, queue,
+				  ria, &reserved, mapping, &ra_end);
+
+	LASSERTF(reserved >= 0, "reserved %lu\n", reserved); /* NOTE(review): reserved is unsigned, so this always holds */
+	if (reserved != 0) /* hand back the reservations that were not used */
+		ll_ra_count_put(ll_i2sbi(inode), reserved);
+
+	if (ra_end == end + 1 && ra_end == (kms >> PAGE_CACHE_SHIFT))
+		ll_ra_stats_inc(mapping, RA_STAT_EOF);
+
+	/* If we didn't get to the end of the region we reserved from
+	 * the ras, we need to go back and update the ras so that the
+	 * next read-ahead tries from where we left off.  We only do so
+	 * if the region we failed to issue read-ahead on is still ahead
+	 * of the app and behind the next index to start read-ahead from. */
+	CDEBUG(D_READA, "ra_end %lu end %lu stride end %lu \n",
+	       ra_end, end, ria->ria_end);
+
+	if (ra_end != end + 1) {
+		spin_lock(&ras->ras_lock);
+		if (ra_end < ras->ras_next_readahead &&
+		    index_in_window(ra_end, ras->ras_window_start, 0,
+				    ras->ras_window_len)) {
+			ras->ras_next_readahead = ra_end;
+			RAS_CDEBUG(ras);
+		}
+		spin_unlock(&ras->ras_lock);
+	}
+
+	RETURN(ret); /* value returned by ll_read_ahead_pages() */
+}
+
+static void ras_set_start(struct inode *inode, struct ll_readahead_state *ras,
+			  unsigned long index)
+{
+	ras->ras_window_start = index & (~(RAS_INCREASE_STEP(inode) - 1)); /* index with the (step - 1) bits cleared */
+}
+
+/* Restart RA tracking at @index with an empty window.  Called with the ras_lock held or from places where it doesn't matter */
+static void ras_reset(struct inode *inode, struct ll_readahead_state *ras,
+		      unsigned long index)
+{
+	ras->ras_last_readpage = index;
+	ras->ras_consecutive_requests = 0;
+	ras->ras_consecutive_pages = 0;
+	ras->ras_window_len = 0;
+	ras_set_start(inode, ras, index); /* must precede the max() below, which reads ras_window_start */
+	ras->ras_next_readahead = max(ras->ras_window_start, index);
+
+	RAS_CDEBUG(ras);
+}
+
+/* Forget the recorded stride pattern; call with the ras_lock held or from places where it doesn't matter */
+static void ras_stride_reset(struct ll_readahead_state *ras)
+{
+	ras->ras_stride_pages = 0;
+	ras->ras_stride_length = 0;
+	ras->ras_consecutive_stride_requests = 0;
+	RAS_CDEBUG(ras);
+}
+
+void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras)
+{
+	spin_lock_init(&ras->ras_lock);
+	ras_reset(inode, ras, 0); /* start tracking at page 0 with an empty window */
+	ras->ras_requests = 0;
+	INIT_LIST_HEAD(&ras->ras_read_beads);
+}
+
+/*
+ * Decide whether a read at @index continues the recorded stride pattern.
+ * Returns 1 when it does, 0 when it does not or when no stride is recorded.
+ */
+static int index_in_stride_window(struct ll_readahead_state *ras,
+				  unsigned long index)
+{
+	unsigned long gap;
+
+	if (ras->ras_stride_pages == 0 || ras->ras_stride_length == 0 ||
+	    ras->ras_stride_length == ras->ras_stride_pages)
+		return 0;
+
+	/* pages skipped since the last page read */
+	gap = index - ras->ras_last_readpage - 1;
+	if (gap != 0)
+		/* a jump must follow a full chunk and span the stride gap */
+		return ras->ras_consecutive_pages == ras->ras_stride_pages &&
+		       gap == ras->ras_stride_length - ras->ras_stride_pages;
+
+	/* contiguous read: is there room left in the current chunk? */
+	return ras->ras_consecutive_pages + 1 <= ras->ras_stride_pages;
+}
+
+static void ras_update_stride_detector(struct ll_readahead_state *ras,
+				       unsigned long index)
+{
+	unsigned long stride_gap = index - ras->ras_last_readpage - 1; /* wraps when index <= ras_last_readpage */
+
+	if (!stride_io_mode(ras) && (stride_gap != 0 ||
+	     ras->ras_consecutive_stride_requests == 0)) {
+		ras->ras_stride_pages = ras->ras_consecutive_pages;
+		ras->ras_stride_length = stride_gap +ras->ras_consecutive_pages;
+	} /* NOTE(review): both fields are reset or assigned the same values again below */
+	LASSERT(ras->ras_request_index == 0);
+	LASSERT(ras->ras_consecutive_stride_requests == 0);
+
+	if (index <= ras->ras_last_readpage) {
+		/* Not a forward read: drop the stride window */
+		ras_stride_reset(ras);
+		return;
+	}
+
+	ras->ras_stride_pages = ras->ras_consecutive_pages; /* chunk = pages just read in a row */
+	ras->ras_stride_length = stride_gap +ras->ras_consecutive_pages; /* chunk + gap */
+
+	RAS_CDEBUG(ras);
+	return;
+}
+
+static unsigned long
+stride_page_count(struct ll_readahead_state *ras, unsigned long len)
+{
+	return stride_pg_count(ras->ras_stride_offset, ras->ras_stride_length,
+			       ras->ras_stride_pages, ras->ras_stride_offset,
+			       len); /* range of len pages starting at the stride offset itself */
+}
+
+/* Grow the stride read-ahead window by inc_len data pages, following the
+ * stride I/O pattern; the new length is kept only if within the per-file cap */
+static void ras_stride_increase_window(struct ll_readahead_state *ras,
+				       struct ll_ra_info *ra,
+				       unsigned long inc_len)
+{
+	unsigned long left, step, window_len;
+	unsigned long stride_len;
+
+	LASSERT(ras->ras_stride_length > 0);
+	LASSERTF(ras->ras_window_start + ras->ras_window_len
+		 >= ras->ras_stride_offset, "window_start %lu, window_len %lu"
+		 " stride_offset %lu\n", ras->ras_window_start,
+		 ras->ras_window_len, ras->ras_stride_offset);
+
+	stride_len = ras->ras_window_start + ras->ras_window_len -
+		     ras->ras_stride_offset; /* window end, relative to the stride offset */
+
+	left = stride_len % ras->ras_stride_length; /* part of the window in its last stride */
+	window_len = ras->ras_window_len - left; /* window without that partial stride */
+
+	if (left < ras->ras_stride_pages)
+		left += inc_len;
+	else /* window ended in the gap: count only the data pages */
+		left = ras->ras_stride_pages + inc_len;
+
+	LASSERT(ras->ras_stride_pages != 0); /* guards the division below */
+
+	step = left / ras->ras_stride_pages; /* whole strides to add */
+	left %= ras->ras_stride_pages; /* data pages in the new partial stride */
+
+	window_len += step * ras->ras_stride_length + left;
+
+	if (stride_page_count(ras, window_len) <= ra->ra_max_pages_per_file)
+		ras->ras_window_len = window_len; /* otherwise the window is left unchanged */
+
+	RAS_CDEBUG(ras);
+}
+
+static void ras_increase_window(struct inode *inode,
+				struct ll_readahead_state *ras,
+				struct ll_ra_info *ra)
+{
+	/* FIXME later: the window growth should be aligned with the max RPC
+	 * size, but the current clio architecture cannot retrieve that
+	 * information from the lower layer. */
+	if (stride_io_mode(ras)) {
+		ras_stride_increase_window(ras, ra, RAS_INCREASE_STEP(inode));
+		return;
+	}
+	ras->ras_window_len = min(ras->ras_window_len +
+				  RAS_INCREASE_STEP(inode),
+				  ra->ra_max_pages_per_file);
+}
+
+void ras_update(struct ll_sb_info *sbi, struct inode *inode,
+		struct ll_readahead_state *ras, unsigned long index,
+		unsigned hit) /* hit: non-zero is counted as RA_STAT_HIT, zero as RA_STAT_MISS */
+{
+	struct ll_ra_info *ra = &sbi->ll_ra_info;
+	int zero = 0, stride_detect = 0, ra_miss = 0;
+	ENTRY;
+
+	spin_lock(&ras->ras_lock); /* held until out_unlock */
+
+	ll_ra_stats_inc_sbi(sbi, hit ? RA_STAT_HIT : RA_STAT_MISS);
+
+	/* reset the read-ahead window in two cases.  First when the app seeks
+	 * or reads to some other part of the file.  Secondly if we get a
+	 * read-ahead miss that we think we've previously issued.  This can
+	 * be a symptom of there being so many read-ahead pages that the VM is
+	 * reclaiming it before we get to it. */
+	if (!index_in_window(index, ras->ras_last_readpage, 8, 8)) {
+		zero = 1;
+		ll_ra_stats_inc_sbi(sbi, RA_STAT_DISTANT_READPAGE);
+	} else if (!hit && ras->ras_window_len &&
+		   index < ras->ras_next_readahead &&
+		   index_in_window(index, ras->ras_window_start, 0,
+				   ras->ras_window_len)) {
+		ra_miss = 1;
+		ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW);
+	}
+
+	/* On the second access to a file smaller than the tunable
+	 * ra_max_read_ahead_whole_pages trigger RA on all pages in the
+	 * file up to ra_max_pages_per_file.  This is simply a best effort
+	 * and only occurs once per open file.  Normal RA behavior is reverted
+	 * to for subsequent IO.  The mmap case does not increment
+	 * ras_requests and thus can never trigger this behavior. */
+	if (ras->ras_requests == 2 && !ras->ras_request_index) {
+		__u64 kms_pages;
+
+		kms_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+			    PAGE_CACHE_SHIFT; /* file size in pages, rounded up */
+
+		CDEBUG(D_READA, "kmsp "LPU64" mwp %lu mp %lu\n", kms_pages,
+		       ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages_per_file);
+
+		if (kms_pages &&
+		    kms_pages <= ra->ra_max_read_ahead_whole_pages) {
+			ras->ras_window_start = 0;
+			ras->ras_last_readpage = 0;
+			ras->ras_next_readahead = 0;
+			ras->ras_window_len = min(ra->ra_max_pages_per_file,
+				ra->ra_max_read_ahead_whole_pages);
+			GOTO(out_unlock, 0);
+		}
+	}
+	if (zero) {
+		/* distant read: check whether it fits the stride I/O pattern */
+		if (!index_in_stride_window(ras, index)) {
+			if (ras->ras_consecutive_stride_requests == 0 &&
+			    ras->ras_request_index == 0) {
+				ras_update_stride_detector(ras, index);
+				ras->ras_consecutive_stride_requests++;
+			} else {
+				ras_stride_reset(ras);
+			}
+			ras_reset(inode, ras, index);
+			ras->ras_consecutive_pages++; /* count the page at index itself */
+			GOTO(out_unlock, 0);
+		} else {
+			ras->ras_consecutive_pages = 0;
+			ras->ras_consecutive_requests = 0;
+			if (++ras->ras_consecutive_stride_requests > 1)
+				stride_detect = 1;
+			RAS_CDEBUG(ras);
+		}
+	} else {
+		if (ra_miss) {
+			if (index_in_stride_window(ras, index) &&
+			    stride_io_mode(ras)) {
+				/* A cache miss during stride read-ahead does
+				 * not reset the stride detector, to avoid the
+				 * overhead of re-detecting the stride mode */
+				if (index != ras->ras_last_readpage + 1)
+					ras->ras_consecutive_pages = 0;
+				ras_reset(inode, ras, index);
+				RAS_CDEBUG(ras);
+			} else {
+				/* Reset both stride window and normal RA
+				 * window */
+				ras_reset(inode, ras, index);
+				ras->ras_consecutive_pages++;
+				ras_stride_reset(ras);
+				GOTO(out_unlock, 0);
+			}
+		} else if (stride_io_mode(ras)) {
+			/* If this is contiguous read but in stride I/O mode
+			 * currently, check whether stride step still is valid,
+			 * if invalid, it will reset the stride ra window*/
+			if (!index_in_stride_window(ras, index)) {
+				/* Shrink stride read-ahead window to be zero */
+				ras_stride_reset(ras);
+				ras->ras_window_len = 0;
+				ras->ras_next_readahead = index;
+			}
+		}
+	}
+	ras->ras_consecutive_pages++;
+	ras->ras_last_readpage = index;
+	ras_set_start(inode, ras, index);
+
+	if (stride_io_mode(ras))
+		/* Stride read-ahead is sensitive to the read-ahead
+		 * offset, so use the original index here instead
+		 * of ras_window_start, which is RPC aligned */
+		ras->ras_next_readahead = max(index, ras->ras_next_readahead);
+	else
+		ras->ras_next_readahead = max(ras->ras_window_start,
+					      ras->ras_next_readahead);
+	RAS_CDEBUG(ras);
+
+	/* Trigger RA in the mmap case where ras_consecutive_requests
+	 * is not incremented and thus can't be used to trigger RA */
+	if (!ras->ras_window_len && ras->ras_consecutive_pages == 4) {
+		ras->ras_window_len = RAS_INCREASE_STEP(inode);
+		GOTO(out_unlock, 0);
+	}
+
+	/* Initially reset the stride window offset to next_readahead*/
+	if (ras->ras_consecutive_stride_requests == 2 && stride_detect) {
+		/**
+		 * Once stride IO mode is detected, next_readahead should be
+		 * reset to make sure next_readahead > stride offset
+		 */
+		ras->ras_next_readahead = max(index, ras->ras_next_readahead);
+		ras->ras_stride_offset = index;
+		ras->ras_window_len = RAS_INCREASE_STEP(inode);
+	}
+
+	/* The initial ras_window_len is set to the request size.  To avoid
+	 * uselessly reading and discarding pages for random IO the window is
+	 * only increased once per consecutive request received. */
+	if ((ras->ras_consecutive_requests > 1 || stride_detect) &&
+	    !ras->ras_request_index)
+		ras_increase_window(inode, ras, ra);
+	EXIT;
+out_unlock:
+	RAS_CDEBUG(ras);
+	ras->ras_request_index++; /* incremented on every path through this function */
+	spin_unlock(&ras->ras_lock);
+	return;
+}
+
+int ll_writepage(struct page *vmpage, struct writeback_control *wbc)
+{
+	struct inode	       *inode = vmpage->mapping->host;
+	struct ll_inode_info   *lli   = ll_i2info(inode);
+	struct lu_env	  *env;
+	struct cl_io	   *io;
+	struct cl_page	 *page;
+	struct cl_object       *clob;
+	struct cl_env_nest      nest;
+	bool redirtied = false; /* set when a failed flush re-dirtied the page */
+	bool unlocked = false; /* set once cl_page_disown() has been called on the page */
+	int result;
+	ENTRY;
+
+	LASSERT(PageLocked(vmpage));
+	LASSERT(!PageWriteback(vmpage));
+
+	LASSERT(ll_i2dtexp(inode) != NULL);
+
+	env = cl_env_nested_get(&nest);
+	if (IS_ERR(env))
+		GOTO(out, result = PTR_ERR(env));
+
+	clob  = ll_i2info(inode)->lli_clob;
+	LASSERT(clob != NULL);
+
+	io = ccc_env_thread_io(env);
+	io->ci_obj = clob;
+	io->ci_ignore_layout = 1;
+	result = cl_io_init(env, io, CIT_MISC, clob);
+	if (result == 0) {
+		page = cl_page_find(env, clob, vmpage->index,
+				    vmpage, CPT_CACHEABLE);
+		if (!IS_ERR(page)) {
+			lu_ref_add(&page->cp_reference, "writepage",
+				   current);
+			cl_page_assume(env, io, page);
+			result = cl_page_flush(env, io, page);
+			if (result != 0) {
+				/*
+				 * Re-dirty page on error so it retries write,
+				 * but not in case when IO has actually
+				 * occurred and completed with an error.
+				 */
+				if (!PageError(vmpage)) {
+					redirty_page_for_writepage(wbc, vmpage);
+					result = 0;
+					redirtied = true;
+				}
+			}
+			cl_page_disown(env, io, page);
+			unlocked = true;
+			lu_ref_del(&page->cp_reference,
+				   "writepage", current);
+			cl_page_put(env, page); /* drops the reference from cl_page_find() */
+		} else {
+			result = PTR_ERR(page);
+		}
+	}
+	cl_io_fini(env, io);
+
+	if (redirtied && wbc->sync_mode == WB_SYNC_ALL) {
+		loff_t offset = cl_offset(clob, vmpage->index);
+
+		/* Flush page failed because the extent is being written out.
+		 * Wait for the write of extent to be finished to avoid
+		 * breaking kernel which assumes ->writepage should mark
+		 * PageWriteback or clean the page. */
+		result = cl_sync_file_range(inode, offset,
+					    offset + PAGE_CACHE_SIZE - 1,
+					    CL_FSYNC_LOCAL);
+		if (result > 0) {
+			/* actually we may have written more than one page.
+			 * decreasing this page because the caller will count
+			 * it. */
+			wbc->nr_to_write -= result - 1;
+			result = 0;
+		}
+	}
+
+	cl_env_nested_put(&nest, env);
+	GOTO(out, result);
+
+out:
+	if (result < 0) {
+		if (!lli->lli_async_rc) /* record only the first error */
+			lli->lli_async_rc = result;
+		SetPageError(vmpage);
+		if (!unlocked)
+			unlock_page(vmpage);
+	}
+	return result;
+}
+
+int ll_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+	struct inode *inode = mapping->host;
+	loff_t start;	/* byte range handed to cl_sync_file_range() */
+	loff_t end;
+	enum cl_fsync_mode mode;
+	int range_whole = 0;	/* set when the request covers [0, LLONG_MAX] */
+	int result;
+	ENTRY;
+
+	if (wbc->range_cyclic) { /* cast below: a pgoff_t shift can overflow on 32-bit */
+		start = (loff_t)mapping->writeback_index << PAGE_CACHE_SHIFT;
+		end = OBD_OBJECT_EOF;
+	} else {
+		start = wbc->range_start;
+		end = wbc->range_end;
+		if (end == LLONG_MAX) {
+			end = OBD_OBJECT_EOF;
+			range_whole = start == 0;
+		}
+	}
+
+	mode = CL_FSYNC_NONE;
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		mode = CL_FSYNC_LOCAL;
+
+	result = cl_sync_file_range(inode, start, end, mode);
+	if (result > 0) { /* a positive result is charged against nr_to_write */
+		wbc->nr_to_write -= result;
+		result = 0;
+	}
+
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) {
+		if (end == OBD_OBJECT_EOF)
+			end = i_size_read(inode);
+		mapping->writeback_index = (end >> PAGE_CACHE_SHIFT) + 1;
+	}
+	RETURN(result);
+}
+
+int ll_readpage(struct file *file, struct page *vmpage)
+{
+	struct ll_cl_context *lcc;
+	struct cl_page *page;
+	int result;
+	ENTRY;
+
+	lcc = ll_cl_init(file, vmpage, 0);
+	if (IS_ERR(lcc)) {
+		/* No cl context: unlock the page and pass the error up. */
+		unlock_page(vmpage);
+		result = PTR_ERR(lcc);
+		RETURN(result);
+	}
+
+	page = lcc->lcc_page;
+	LASSERT(page->cp_type == CPT_CACHEABLE);
+	if (unlikely(PageUptodate(vmpage))) {
+		/* Page from a non-object file. */
+		unlock_page(vmpage);
+		result = 0;
+	} else {
+		cl_page_assume(lcc->lcc_env, lcc->lcc_io, page);
+		result = cl_io_read_page(lcc->lcc_env, lcc->lcc_io, page);
+	}
+	ll_cl_fini(lcc);
+	RETURN(result);
+}
diff --git a/drivers/staging/lustre/lustre/llite/rw26.c b/drivers/staging/lustre/lustre/llite/rw26.c
new file mode 100644
index 000000000000..27e4e64bc1e7
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/rw26.c
@@ -0,0 +1,586 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lustre/llite/rw26.c
+ *
+ * Lustre Lite I/O page cache routines for the 2.5/2.6 kernel version
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <asm/uaccess.h>
+
+#include <linux/migrate.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include <linux/writeback.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <lustre_lite.h>
+#include "llite_internal.h"
+#include <linux/lustre_compat25.h>
+
+/**
+ * Implements Linux VM address_space::invalidatepage() method. This method is
+ * called when the page is truncated from a file, either as a result of
+ * explicit truncate, or when inode is removed from memory (as a result of
+ * final iput(), umount, or memory pressure induced icache shrinking).
+ *
+ * [0, offset] bytes of the page remain valid (this is for a case of not-page
+ * aligned truncate). Lustre leaves partially truncated page in the cache,
+ * relying on struct inode::i_size to limit further accesses.
+ */
+static void ll_invalidatepage(struct page *vmpage, unsigned long offset)
+{
+	struct inode     *inode;
+	struct lu_env    *env;
+	struct cl_page   *page;
+	struct cl_object *obj;
+
+	int refcheck;
+
+	LASSERT(PageLocked(vmpage));
+	LASSERT(!PageWriteback(vmpage));
+
+	/*
+	 * It is safe to not check anything in invalidatepage/releasepage
+	 * below because they are run with page locked and all our io is
+	 * happening with locked page too
+	 */
+	if (offset == 0) { /* whole page invalidated; a non-zero offset does nothing here */
+		env = cl_env_get(&refcheck);
+		if (!IS_ERR(env)) { /* if no env can be obtained the page is left as is */
+			inode = vmpage->mapping->host;
+			obj = ll_i2info(inode)->lli_clob;
+			if (obj != NULL) {
+				page = cl_vmpage_page(vmpage, obj);
+				if (page != NULL) {
+					lu_ref_add(&page->cp_reference,
+						   "delete", vmpage);
+					cl_page_delete(env, page);
+					lu_ref_del(&page->cp_reference,
+						   "delete", vmpage);
+					cl_page_put(env, page);
+				}
+			} else
+				LASSERT(vmpage->private == 0); /* no cl_object: page must carry no private data */
+			cl_env_put(env, &refcheck);
+		}
+	}
+}
+
+#ifdef HAVE_RELEASEPAGE_WITH_INT /* selects the type of the second releasepage argument */
+#define RELEASEPAGE_ARG_TYPE int
+#else
+#define RELEASEPAGE_ARG_TYPE gfp_t
+#endif
+static int ll_releasepage(struct page *vmpage, RELEASEPAGE_ARG_TYPE gfp_mask) /* returns 1 if the page may be freed, 0 if not */
+{
+	struct cl_env_nest nest;
+	struct lu_env     *env;
+	struct cl_object  *obj;
+	struct cl_page    *page;
+	struct address_space *mapping;
+	int result;
+
+	LASSERT(PageLocked(vmpage));
+	if (PageWriteback(vmpage) || PageDirty(vmpage))
+		return 0;
+
+	mapping = vmpage->mapping;
+	if (mapping == NULL)
+		return 1;
+
+	obj = ll_i2info(mapping->host)->lli_clob;
+	if (obj == NULL)
+		return 1;
+
+	/* 1 for page allocator, 1 for cl_page and 1 for page cache */
+	if (page_count(vmpage) > 3) /* more references than those three: keep the page */
+		return 0;
+
+	/* TODO: determine what gfp should be used by @gfp_mask. */
+	env = cl_env_nested_get(&nest);
+	if (IS_ERR(env))
+		/* If we can't allocate an env we won't call cl_page_put()
+		 * later on which further means it's impossible to drop
+		 * page refcount by cl_page, so ask kernel to not free
+		 * this page. */
+		return 0;
+
+	page = cl_vmpage_page(vmpage, obj);
+	result = page == NULL; /* no cl_page attached: releasable */
+	if (page != NULL) {
+		if (!cl_page_in_use(page)) {
+			result = 1;
+			cl_page_delete(env, page);
+		}
+		cl_page_put(env, page);
+	}
+	cl_env_nested_put(&nest, env);
+	return result;
+}
+
+static int ll_set_page_dirty(struct page *vmpage)
+{
+#if 0 /* compiled out; only the call after #endif is built */
+	struct cl_page    *page = vvp_vmpage_page_transient(vmpage);
+	struct vvp_object *obj  = cl_inode2vvp(vmpage->mapping->host);
+	struct vvp_page   *cpg;
+
+	/*
+	 * XXX should page method be called here?
+	 */
+	LASSERT(&obj->co_cl == page->cp_obj);
+	cpg = cl2vvp_page(cl_page_at(page, &vvp_device_type));
+	/*
+	 * XXX cannot do much here, because page is possibly not locked:
+	 * sys_munmap()->...
+	 *     ->unmap_page_range()->zap_pte_range()->set_page_dirty().
+	 */
+	vvp_write_pending(obj, cpg);
+#endif
+	RETURN(__set_page_dirty_nobuffers(vmpage)); /* NOTE(review): RETURN() without an ENTRY in this function — confirm intended */
+}
+
+#define MAX_DIRECTIO_SIZE (2 * 1024 * 1024 * 1024UL) /* 2GB; parenthesized so it expands safely */
+
+static inline int ll_get_user_pages(int rw, unsigned long user_addr,
+				    size_t size, struct page ***pages,
+				    int *max_pages)
+{
+	int result = -ENOMEM;	/* returned when the page array cannot be allocated */
+
+	/* set an arbitrary limit to prevent arithmetic overflow */
+	if (size > MAX_DIRECTIO_SIZE) {
+		*pages = NULL;
+		return -EFBIG;
+	}
+
+	*max_pages = (user_addr + size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	*max_pages -= user_addr >> PAGE_CACHE_SHIFT; /* pages spanned by the buffer */
+
+	OBD_ALLOC_LARGE(*pages, *max_pages * sizeof(**pages));
+	if (*pages) {
+		down_read(&current->mm->mmap_sem);
+		result = get_user_pages(current, current->mm, user_addr,
+					*max_pages, (rw == READ), 0, *pages,
+					NULL); /* a file READ writes into user memory */
+		up_read(&current->mm->mmap_sem);
+		if (unlikely(result <= 0)) /* nothing pinned: free the array here */
+			OBD_FREE_LARGE(*pages, *max_pages * sizeof(**pages));
+	}
+
+	return result;	/* pages pinned (> 0), or 0 / negative errno on failure */
+}
+
+/* Release the pages of @pages up to @npages or the first NULL slot, marking
+ * each dirty first when @do_dirty is set, then free the array itself. */
+static void ll_free_user_pages(struct page **pages, int npages, int do_dirty)
+{
+	int idx = 0;
+
+	while (idx < npages && pages[idx] != NULL) {
+		struct page *pg = pages[idx++];
+
+		if (do_dirty)
+			set_page_dirty_lock(pg);
+		page_cache_release(pg);
+	}
+
+	OBD_FREE_LARGE(pages, npages * sizeof(*pages));
+}
+
+ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io,
+			   int rw, struct inode *inode,
+			   struct ll_dio_pages *pv) /* returns pv->ldp_size on success, else a non-zero rc */
+{
+	struct cl_page    *clp;
+	struct cl_2queue  *queue;
+	struct cl_object  *obj = io->ci_obj;
+	int i;
+	ssize_t rc = 0;
+	loff_t file_offset  = pv->ldp_start_offset;
+	long size	   = pv->ldp_size; /* bytes not yet assigned to a page */
+	int page_count      = pv->ldp_nr;
+	struct page **pages = pv->ldp_pages;
+	long page_size      = cl_page_size(obj);
+	bool do_io;
+	int  io_pages       = 0; /* pages actually queued for transfer */
+	ENTRY;
+
+	queue = &io->ci_queue;
+	cl_2queue_init(queue);
+	for (i = 0; i < page_count; i++) {
+		if (pv->ldp_offsets) /* per-page offsets override the running offset */
+		    file_offset = pv->ldp_offsets[i];
+
+		LASSERT(!(file_offset & (page_size - 1)));
+		clp = cl_page_find(env, obj, cl_index(obj, file_offset),
+				   pv->ldp_pages[i], CPT_TRANSIENT);
+		if (IS_ERR(clp)) {
+			rc = PTR_ERR(clp);
+			break;
+		}
+
+		rc = cl_page_own(env, io, clp);
+		if (rc) {
+			LASSERT(clp->cp_state == CPS_FREEING);
+			cl_page_put(env, clp);
+			break;
+		}
+
+		do_io = true;
+
+		/* check the page type: if the page is a host page, then do
+		 * write directly */
+		if (clp->cp_type == CPT_CACHEABLE) {
+			struct page *vmpage = cl_page_vmpage(env, clp);
+			struct page *src_page;
+			struct page *dst_page;
+			void       *src;
+			void       *dst;
+
+			src_page = (rw == WRITE) ? pages[i] : vmpage;
+			dst_page = (rw == WRITE) ? vmpage : pages[i];
+
+			src = ll_kmap_atomic(src_page, KM_USER0);
+			dst = ll_kmap_atomic(dst_page, KM_USER1);
+			memcpy(dst, src, min(page_size, size)); /* last page may be partial */
+			ll_kunmap_atomic(dst, KM_USER1);
+			ll_kunmap_atomic(src, KM_USER0);
+
+			/* make sure page will be added to the transfer by
+			 * cl_io_submit()->...->vvp_page_prep_write(). */
+			if (rw == WRITE)
+				set_page_dirty(vmpage);
+
+			if (rw == READ) {
+				/* do not issue the page for read, since it
+				 * may reread a ra page which has NOT uptodate
+				 * bit set. */
+				cl_page_disown(env, io, clp);
+				do_io = false;
+			}
+		}
+
+		if (likely(do_io)) {
+			cl_2queue_add(queue, clp);
+
+			/*
+			 * Set page clip to tell transfer formation engine
+			 * that page has to be sent even if it is beyond KMS.
+			 */
+			cl_page_clip(env, clp, 0, min(size, page_size));
+
+			++io_pages;
+		}
+
+		/* drop the reference count for cl_page_find */
+		cl_page_put(env, clp);
+		size -= page_size;
+		file_offset += page_size;
+	}
+
+	if (rc == 0 && io_pages) { /* submit only if no page failed and something was queued */
+		rc = cl_io_submit_sync(env, io,
+				       rw == READ ? CRT_READ : CRT_WRITE,
+				       queue, 0);
+	}
+	if (rc == 0)
+		rc = pv->ldp_size;
+
+	cl_2queue_discard(env, io, queue);
+	cl_2queue_disown(env, io, queue);
+	cl_2queue_fini(env, queue);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ll_direct_rw_pages);
+
+static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io,
+				   int rw, struct inode *inode,
+				   struct address_space *mapping,
+				   size_t size, loff_t file_offset,
+				   struct page **pages, int page_count)
+{
+	/* One contiguous run: no per-page offsets, just a start offset. */
+	struct ll_dio_pages pvec = {
+		.ldp_start_offset = file_offset, .ldp_size = size,
+		.ldp_nr = page_count, .ldp_pages = pages,
+		.ldp_offsets = NULL,
+	};
+
+	return ll_direct_rw_pages(env, io, rw, inode, &pvec);
+}
+
+#ifdef KMALLOC_MAX_SIZE
+#define MAX_MALLOC KMALLOC_MAX_SIZE
+#else /* fallback when KMALLOC_MAX_SIZE is not defined */
+#define MAX_MALLOC (128 * 1024)
+#endif
+
+/* This is the maximum size of a single O_DIRECT request, based on the
+ * kmalloc limit.  We need to fit all of the brw_page structs, each one
+ * representing PAGE_SIZE worth of user data, into a single buffer, and
+ * then round this down to a multiple of DT_MAX_BRW_SIZE.  For 4kB PAGE_SIZE
+ * this is up to 22MB for 128kB kmalloc and up to 682MB for 4MB kmalloc. */
+#define MAX_DIO_SIZE ((MAX_MALLOC / sizeof(struct brw_page) * PAGE_CACHE_SIZE) & \
+		      ~(DT_MAX_BRW_SIZE - 1))
+static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb,
+			       const struct iovec *iov, loff_t file_offset,
+			       unsigned long nr_segs)
+{
+	struct lu_env *env;
+	struct cl_io *io;
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	struct ccc_object *obj = cl_inode2ccc(inode);
+	long count = iov_length(iov, nr_segs);
+	long tot_bytes = 0, result = 0;
+	struct ll_inode_info *lli = ll_i2info(inode);
+	unsigned long seg = 0;
+	long size = MAX_DIO_SIZE;
+	int refcheck;
+	ENTRY;
+	/* Overview: check alignment, then submit each iovec in <= "size" chunks. */
+	if (!lli->lli_has_smd)
+		RETURN(-EBADF);
+
+	/* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */
+	if ((file_offset & ~CFS_PAGE_MASK) || (count & ~CFS_PAGE_MASK))
+		RETURN(-EINVAL);
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), size=%lu (max %lu), "
+	       "offset=%lld=%llx, pages %lu (max %lu)\n",
+	       inode->i_ino, inode->i_generation, inode, count, MAX_DIO_SIZE,
+	       file_offset, file_offset, count >> PAGE_CACHE_SHIFT,
+	       MAX_DIO_SIZE >> PAGE_CACHE_SHIFT);
+
+	/* Check that all user buffers are aligned as well */
+	for (seg = 0; seg < nr_segs; seg++) {
+		if (((unsigned long)iov[seg].iov_base & ~CFS_PAGE_MASK) ||
+		    (iov[seg].iov_len & ~CFS_PAGE_MASK))
+			RETURN(-EINVAL);
+	}
+	/* NOTE(review): a cl_env_get() failure asserts instead of being returned */
+	env = cl_env_get(&refcheck);
+	LASSERT(!IS_ERR(env));
+	io = ccc_env_io(env)->cui_cl.cis_io;
+	LASSERT(io != NULL);
+
+	/* 0. Need locking between buffered and direct access, and against
+	 *    size changes by concurrent truncates and writes.
+	 * 1. Need inode mutex to operate transient pages.  Only READ locks it
+	 *    here; presumably writers already hold it - TODO confirm. */
+	if (rw == READ)
+		mutex_lock(&inode->i_mutex);
+
+	LASSERT(obj->cob_transient_pages == 0);
+	for (seg = 0; seg < nr_segs; seg++) {
+		long iov_left = iov[seg].iov_len;
+		unsigned long user_addr = (unsigned long)iov[seg].iov_base;
+
+		if (rw == READ) {
+			if (file_offset >= i_size_read(inode))
+				break;
+			if (file_offset + iov_left > i_size_read(inode))
+				iov_left = i_size_read(inode) - file_offset;
+		}
+		/* walk the (possibly clipped) segment in chunks of <= size */
+		while (iov_left > 0) {
+			struct page **pages;
+			int page_count, max_pages = 0;
+			long bytes;
+			/* a short page_count trims "bytes" to match */
+			bytes = min(size, iov_left);
+			page_count = ll_get_user_pages(rw, user_addr, bytes,
+						       &pages, &max_pages);
+			if (likely(page_count > 0)) {
+				if (unlikely(page_count <  max_pages))
+					bytes = page_count << PAGE_CACHE_SHIFT;
+				result = ll_direct_IO_26_seg(env, io, rw, inode,
+							     file->f_mapping,
+							     bytes, file_offset,
+							     pages, page_count);
+				ll_free_user_pages(pages, max_pages, rw==READ);
+			} else if (page_count == 0) {
+				GOTO(out, result = -EFAULT);
+			} else {
+				result = page_count;
+			}
+			if (unlikely(result <= 0)) {
+				/* If we can't allocate a large enough buffer
+				 * for the request, shrink it to a smaller
+				 * PAGE_SIZE multiple and try again.
+				 * We should always be able to kmalloc for a
+				 * page worth of page pointers = 4MB on i386. */
+				if (result == -ENOMEM &&
+				    size > (PAGE_CACHE_SIZE / sizeof(*pages)) *
+					   PAGE_CACHE_SIZE) {
+					size = ((((size / 2) - 1) |
+						 ~CFS_PAGE_MASK) + 1) &
+						CFS_PAGE_MASK;
+					CDEBUG(D_VFSTRACE,"DIO size now %lu\n",
+					       size);
+					continue;
+				}
+
+				GOTO(out, result);
+			}
+			/* advance by the byte count the segment call reported */
+			tot_bytes += result;
+			file_offset += result;
+			iov_left -= result;
+			user_addr += result;
+		}
+	}
+out:
+	LASSERT(obj->cob_transient_pages == 0);
+	if (rw == READ)
+		mutex_unlock(&inode->i_mutex);
+	/* a write that moved data reports its end offset to obd_adjust_kms() */
+	if (tot_bytes > 0) {
+		if (rw == WRITE) {
+			struct lov_stripe_md *lsm;
+
+			lsm = ccc_inode_lsm_get(inode);
+			LASSERT(lsm != NULL);
+			lov_stripe_lock(lsm);
+			obd_adjust_kms(ll_i2dtexp(inode), lsm, file_offset, 0);
+			lov_stripe_unlock(lsm);
+			ccc_inode_lsm_put(inode, lsm);
+		}
+	}
+
+	cl_env_put(env, &refcheck);
+	RETURN(tot_bytes ? : result);
+}
+
+/* ->write_begin(): grab the page covering @pos and prepare it for writing. */
+static int ll_write_begin(struct file *file, struct address_space *mapping,
+			 loff_t pos, unsigned len, unsigned flags,
+			 struct page **pagep, void **fsdata)
+{
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	struct page *page;
+	int rc;
+	ENTRY;
+
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		RETURN(-ENOMEM);
+	rc = ll_prepare_write(file, page, from, from + len);
+	if (rc) {
+		unlock_page(page);
+		page_cache_release(page);
+		page = NULL;	/* never hand a released page to the caller */
+	}
+	*pagep = page;
+	RETURN(rc);
+}
+
+/* ->write_end(): commit the copied range, then unlock and drop the page. */
+static int ll_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+	int err = ll_commit_write(file, page, start, start + copied);
+
+	unlock_page(page);
+	page_cache_release(page);
+
+	return err != 0 ? err : copied;
+}
+
+#ifdef CONFIG_MIGRATION
+/* ->migratepage() stub: every migration request is rejected with -EIO. */
+int ll_migratepage(struct address_space *mapping,
+		   struct page *newpage, struct page *page,
+		   enum migrate_mode mode)
+{
+	/* Refuse page migration until a proper implementation exists. */
+	return -EIO;
+}
+#endif
+
+#ifndef MS_HAS_NEW_AOPS	/* plain address_space_operations table */
+struct address_space_operations ll_aops = {
+	.readpage       = ll_readpage,
+	/* .readpages is left unset: the ll_readpages hook is disabled for now */
+	.direct_IO      = ll_direct_IO_26,
+	.writepage      = ll_writepage,
+	.writepages     = ll_writepages,
+	.set_page_dirty = ll_set_page_dirty,
+	.write_begin    = ll_write_begin,
+	.write_end      = ll_write_end,
+	.invalidatepage = ll_invalidatepage,
+	.releasepage    = (void *)ll_releasepage, /* NOTE(review): why the cast? */
+#ifdef CONFIG_MIGRATION
+	.migratepage    = ll_migratepage,
+#endif
+	.bmap	   = NULL
+};
+#else	/* MS_HAS_NEW_AOPS: extended table embedding the original one */
+struct address_space_operations_ext ll_aops = {
+	.orig_aops.readpage       = ll_readpage,
+	/* .orig_aops.readpages is left unset, as in the plain table above */
+	.orig_aops.direct_IO      = ll_direct_IO_26,
+	.orig_aops.writepage      = ll_writepage,
+	.orig_aops.writepages     = ll_writepages,
+	.orig_aops.set_page_dirty = ll_set_page_dirty,
+	.orig_aops.prepare_write  = ll_prepare_write,
+	.orig_aops.commit_write   = ll_commit_write,
+	.orig_aops.invalidatepage = ll_invalidatepage,
+	.orig_aops.releasepage    = ll_releasepage,
+#ifdef CONFIG_MIGRATION
+	.orig_aops.migratepage    = ll_migratepage,
+#endif
+	.orig_aops.bmap	   = NULL,
+	.write_begin    = ll_write_begin,
+	.write_end      = ll_write_end
+};
+#endif	/* MS_HAS_NEW_AOPS */
diff --git a/drivers/staging/lustre/lustre/llite/statahead.c b/drivers/staging/lustre/lustre/llite/statahead.c
new file mode 100644
index 000000000000..7747f8f2079d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/statahead.c
@@ -0,0 +1,1722 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <obd_support.h>
+#include <lustre_lite.h>
+#include <lustre_dlm.h>
+#include "llite_internal.h"
+
+#define SA_OMITTED_ENTRY_MAX 8ULL	/* extra slack, see is_omitted_entry() */
+
+typedef enum {
+	/** entry state kept in se_stat; negative values are for error cases */
+	SA_ENTRY_INIT = 0,      /** just allocated, no result yet */
+	SA_ENTRY_SUCC = 1,      /** stat succeeded */
+	SA_ENTRY_INVA = 2,      /** stat failed, entry is invalid */
+	SA_ENTRY_DEST = 3,      /** entry is being destroyed */
+} se_stat_t;
+
+struct ll_sa_entry {	/* per-name statahead record */
+	/* link into sai->sai_entries, which is walked in index order */
+	struct list_head	      se_link;
+	/* link into sai->sai_entries_{received,stated} */
+	struct list_head	      se_list;
+	/* link into one bucket of the sai-local hash table, sai_cache[] */
+	struct list_head	      se_hash;
+	/* entry reference count, starts at 2 in ll_sa_entry_alloc() */
+	atomic_t	    se_refcount;
+	/* entry index in the sai */
+	__u64		   se_index;
+	/* low layer ldlm lock handle */
+	__u64		   se_handle;
+	/* entry status, one of the SA_ENTRY_* values */
+	se_stat_t	       se_stat;
+	/* allocation size in bytes, including the inline name buffer */
+	int		     se_size;
+	/* pointer to async getattr enqueue info */
+	struct md_enqueue_info *se_minfo;
+	/* pointer to the async getattr request */
+	struct ptlrpc_request  *se_req;
+	/* pointer to the target inode */
+	struct inode	   *se_inode;
+	/* entry name; se_qstr.name points just past this struct */
+	struct qstr	     se_qstr;
+};
+
+static unsigned int sai_generation = 0;	/* last id issued by ll_sai_alloc() */
+static DEFINE_SPINLOCK(sai_generation_lock);	/* serializes sai_generation */
+
+static inline int ll_sa_entry_unhashed(struct ll_sa_entry *entry)
+{	/* non-zero while the entry is on no sai_cache[] bucket list */
+	return list_empty(&entry->se_hash);
+}
+
+/* Non-zero once the entry has left SA_ENTRY_INIT.  Only the caller can release
+ * the entry.  NOTE(review): the original note mentions holding a lock, but
+ * none is taken here - confirm what callers are expected to hold. */
+static inline int ll_sa_entry_stated(struct ll_sa_entry *entry)
+{
+	smp_rmb();
+	return (entry->se_stat != SA_ENTRY_INIT);
+}
+
+static inline int ll_sa_entry_hash(int val)
+{	/* fold a name hash into a bucket index for sai_cache[] */
+	return val & LL_SA_CACHE_MASK;
+}
+
+/* Append @entry to the sai_cache[] bucket picked by its name hash, under
+ * that bucket's spinlock. */
+static inline void
+ll_sa_entry_enhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
+{
+	int bucket = ll_sa_entry_hash(entry->se_qstr.hash);
+	spinlock_t *lock = &sai->sai_cache_lock[bucket];
+
+	spin_lock(lock);
+	list_add_tail(&entry->se_hash, &sai->sai_cache[bucket]);
+	spin_unlock(lock);
+}
+
+/* Unlink @entry from its sai_cache[] bucket, under that bucket's spinlock;
+ * se_hash is re-initialised so the entry then reads as unhashed. */
+static inline void
+ll_sa_entry_unhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
+{
+	int bucket = ll_sa_entry_hash(entry->se_qstr.hash);
+	spinlock_t *lock = &sai->sai_cache_lock[bucket];
+
+	spin_lock(lock);
+	list_del_init(&entry->se_hash);
+	spin_unlock(lock);
+}
+
+static inline int agl_should_run(struct ll_statahead_info *sai,
+				 struct inode *inode)
+{	/* only for regular files, and only while sai_agl_valid is set */
+	return (inode != NULL && S_ISREG(inode->i_mode) && sai->sai_agl_valid);
+}
+
+static inline struct ll_sa_entry *
+sa_first_received_entry(struct ll_statahead_info *sai)
+{	/* meaningful only while sai_entries_received is not empty */
+	return list_first_entry(&sai->sai_entries_received,
+				struct ll_sa_entry, se_list);
+}
+
+static inline struct ll_inode_info *
+agl_first_entry(struct ll_statahead_info *sai)
+{	/* meaningful only while sai_entries_agl is not empty */
+	return list_first_entry(&sai->sai_entries_agl,
+				struct ll_inode_info, lli_agl_list);
+}
+
+static inline int sa_sent_full(struct ll_statahead_info *sai)
+{	/* the count of allocated entries has reached the sai_max window */
+	return atomic_read(&sai->sai_cache_count) >= sai->sai_max;
+}
+
+static inline int sa_received_empty(struct ll_statahead_info *sai)
+{	/* no entry is queued on sai_entries_received */
+	return list_empty(&sai->sai_entries_received);
+}
+
+static inline int agl_list_empty(struct ll_statahead_info *sai)
+{	/* no inode is queued on sai_entries_agl */
+	return list_empty(&sai->sai_entries_agl);
+}
+
+/**
+ * Report a poorly performing statahead: either more than 8 misses in a row,
+ * or, once at least 8 hits are recorded, fewer than 4 hits per miss (a hit
+ * ratio below 80%).
+ */
+static inline int sa_low_hit(struct ll_statahead_info *sai)
+{
+	if (sai->sai_consecutive_miss > 8)
+		return 1;
+	return (sai->sai_hit > 7 && sai->sai_hit < 4 * sai->sai_miss);
+}
+
+/* An index counts as omitted (too old to keep) once it trails the current
+ * statahead index, sai_index, by more than the window size sai_max plus
+ * SA_OMITTED_ENTRY_MAX. */
+static inline int is_omitted_entry(struct ll_statahead_info *sai, __u64 index)
+{
+	__u64 limit = (__u64)sai->sai_max + index + SA_OMITTED_ENTRY_MAX;
+
+	return limit < sai->sai_index;
+}
+
+/* Allocate an entry for @name with the name stored inline, link it at the
+ * tail of sai_entries and hash it.  Returns the new entry holding two
+ * references, or ERR_PTR(-ENOMEM). */
+static struct ll_sa_entry *
+ll_sa_entry_alloc(struct ll_statahead_info *sai, __u64 index,
+		  const char *name, int len)
+{
+	struct ll_inode_info *lli;
+	struct ll_sa_entry   *entry;
+	int		   entry_size;
+	char		 *dname;
+	ENTRY;
+	/* room for the name and its NUL, rounded up to a multiple of 4 bytes */
+	entry_size = sizeof(struct ll_sa_entry) + (len & ~3) + 4;
+	OBD_ALLOC(entry, entry_size);
+	if (unlikely(entry == NULL))
+		RETURN(ERR_PTR(-ENOMEM));
+
+	CDEBUG(D_READA, "alloc sa entry %.*s(%p) index "LPU64"\n",
+	       len, name, entry, index);
+
+	entry->se_index = index;
+
+	/*
+	 * Statahead entry reference rules:
+	 *
+	 * 1) When statahead entry is initialized, its reference is set as 2.
+	 *    One reference is used by the directory scanner. When the scanner
+	 *    searches the statahead cache for the given name, it can perform
+	 *    lockless hash lookup (only the scanner can remove entry from hash
+	 *    list), and once found, it need not call "atomic_inc()" for the
+	 *    entry reference. So the performance is improved. After using the
+	 *    statahead entry, the scanner will call "atomic_dec()" to drop the
+	 *    reference held since initialization. If it is the last reference,
+	 *    the statahead entry will be freed.
+	 *
+	 * 2) All other threads, including statahead thread and ptlrpcd thread,
+	 *    when they process the statahead entry, the reference for target
+	 *    should be held to guarantee the entry will not be released by the
+	 *    directory scanner. After processing the entry, these threads will
+	 *    drop the entry reference. If it is the last reference, the entry
+	 *    will be freed.
+	 *
+	 *    The second reference set at initialization is used by the
+	 *    statahead thread, following the rule 2).
+	 */
+	atomic_set(&entry->se_refcount, 2);
+	entry->se_stat = SA_ENTRY_INIT;
+	entry->se_size = entry_size;
+	dname = (char *)entry + sizeof(struct ll_sa_entry);
+	memcpy(dname, name, len);
+	dname[len] = 0;
+	entry->se_qstr.hash = full_name_hash(name, len);
+	entry->se_qstr.len = len;
+	entry->se_qstr.name = dname;
+
+	lli = ll_i2info(sai->sai_inode);
+	spin_lock(&lli->lli_sa_lock);
+	list_add_tail(&entry->se_link, &sai->sai_entries);
+	INIT_LIST_HEAD(&entry->se_list);
+	ll_sa_entry_enhash(sai, entry);
+	spin_unlock(&lli->lli_sa_lock);
+
+	atomic_inc(&sai->sai_cache_count);
+
+	RETURN(entry);
+}
+
+/*
+ * Look up a statahead entry by name on behalf of the directory scanner.
+ * The bucket is walked without its lock because only the caller unhashes
+ * entries.  No reference is taken either: the caller already owns the initial
+ * reference on the entry and is responsible for dropping it.
+ */
+static struct ll_sa_entry *
+ll_sa_entry_get_byname(struct ll_statahead_info *sai, const struct qstr *qstr)
+{
+	struct list_head *head = &sai->sai_cache[ll_sa_entry_hash(qstr->hash)];
+	struct ll_sa_entry *cur;
+
+	list_for_each_entry(cur, head, se_hash) {
+		if (cur->se_qstr.hash != qstr->hash ||
+		    cur->se_qstr.len != qstr->len)
+			continue;
+		if (memcmp(cur->se_qstr.name, qstr->name, qstr->len) == 0)
+			return cur;
+	}
+	return NULL;
+}
+
+/*
+ * Find the entry with the given index for the async getattr callback.
+ *
+ * Must run under lli_sa_lock so the list cannot change during the walk.  A
+ * reference is taken on the match before it is returned, so that nobody else
+ * can free it.  The walk stops at the first entry with a larger index.
+ */
+static struct ll_sa_entry *
+ll_sa_entry_get_byindex(struct ll_statahead_info *sai, __u64 index)
+{
+	struct ll_sa_entry *entry;
+
+	list_for_each_entry(entry, &sai->sai_entries, se_link) {
+		if (entry->se_index > index)
+			break;
+		if (entry->se_index < index)
+			continue;
+		LASSERT(atomic_read(&entry->se_refcount) > 0);
+		atomic_inc(&entry->se_refcount);
+		return entry;
+	}
+	return NULL;
+}
+
+/* Release the getattr state (enqueue info and request) still held by @entry. */
+static void ll_sa_entry_cleanup(struct ll_statahead_info *sai,
+				 struct ll_sa_entry *entry)
+{
+	struct md_enqueue_info *info = entry->se_minfo;
+	struct ptlrpc_request *request = entry->se_req;
+
+	if (info != NULL) {
+		entry->se_minfo = NULL;
+		ll_intent_release(&info->mi_it);
+		iput(info->mi_dir);
+		OBD_FREE_PTR(info);
+	}
+	if (request != NULL) {
+		entry->se_req = NULL;
+		ptlrpc_req_finished(request);
+	}
+}
+
+/* Drop one reference on @entry; the last put tears the entry down. */
+static void ll_sa_entry_put(struct ll_statahead_info *sai,
+			     struct ll_sa_entry *entry)
+{
+	if (!atomic_dec_and_test(&entry->se_refcount))
+		return;
+
+	CDEBUG(D_READA, "free sa entry %.*s(%p) index "LPU64"\n",
+	       entry->se_qstr.len, entry->se_qstr.name, entry, entry->se_index);
+	/* by now the entry must be off every list and out of the hash */
+	LASSERT(list_empty(&entry->se_link));
+	LASSERT(list_empty(&entry->se_list));
+	LASSERT(ll_sa_entry_unhashed(entry));
+
+	ll_sa_entry_cleanup(sai, entry);
+	if (entry->se_inode)
+		iput(entry->se_inode);
+	OBD_FREE(entry, entry->se_size);
+	atomic_dec(&sai->sai_cache_count);
+}
+
+static inline void
+do_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
+{
+	struct ll_inode_info *lli = ll_i2info(sai->sai_inode);
+
+	LASSERT(!ll_sa_entry_unhashed(entry));
+	LASSERT(!list_empty(&entry->se_link));
+	/* Detach @entry: unhash it, then unlink it under lli_sa_lock. */
+	ll_sa_entry_unhash(sai, entry);
+
+	spin_lock(&lli->lli_sa_lock);
+	entry->se_stat = SA_ENTRY_DEST;	/* checked by ll_sa_entry_to_stated() */
+	list_del_init(&entry->se_link);
+	if (likely(!list_empty(&entry->se_list)))
+		list_del_init(&entry->se_list);
+	spin_unlock(&lli->lli_sa_lock);
+	/* drop one reference; the entry is freed if this was the last one */
+	ll_sa_entry_put(sai, entry);
+}
+
+/*
+ * Finish @entry if given, then retire head entries that are already omitted.
+ */
+static void
+ll_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
+{
+	struct ll_sa_entry *old, *tmp;
+
+	if (entry != NULL)
+		do_sa_entry_fini(sai, entry);
+
+	/* only the 'scanner' process drops old entries, so no lock is taken */
+	list_for_each_entry_safe(old, tmp, &sai->sai_entries, se_link) {
+		if (!is_omitted_entry(sai, old->se_index))
+			break;
+		do_sa_entry_fini(sai, old);
+	}
+}
+
+/* Move @entry onto sai_entries_stated, keeping that list sorted by index, and
+ * record @stat.  Must be called with lli_sa_lock held. */
+static void
+do_sa_entry_to_stated(struct ll_statahead_info *sai,
+		      struct ll_sa_entry *entry, se_stat_t stat)
+{
+	struct list_head *stated = &sai->sai_entries_stated;
+	struct list_head *after = stated;
+	struct ll_sa_entry *cur;
+
+	if (!list_empty(&entry->se_list))
+		list_del_init(&entry->se_list);
+	/* scan backwards for the last entry with a smaller index */
+	list_for_each_entry_reverse(cur, stated, se_list) {
+		if (cur->se_index >= entry->se_index)
+			continue;
+		after = &cur->se_list;
+		break;
+	}
+
+	list_add(&entry->se_list, after);
+	entry->se_stat = stat;
+}
+
+/*
+ * Release the entry's getattr state, then file it on sai_entries_stated in
+ * index order unless it has already been marked for destruction.
+ * \retval 1    -- entry is being destroyed, it was not inserted.
+ * \retval 0    -- entry is inserted into stated list.
+ */
+static int
+ll_sa_entry_to_stated(struct ll_statahead_info *sai,
+		      struct ll_sa_entry *entry, se_stat_t stat)
+{
+	struct ll_inode_info *dir_info = ll_i2info(sai->sai_inode);
+	int destroyed;
+
+	ll_sa_entry_cleanup(sai, entry);
+
+	spin_lock(&dir_info->lli_sa_lock);
+	destroyed = unlikely(entry->se_stat == SA_ENTRY_DEST);
+	if (!destroyed)
+		do_sa_entry_to_stated(sai, entry, stat);
+	spin_unlock(&dir_info->lli_sa_lock);
+
+	return destroyed;
+}
+
+/* Queue @inode on sai_entries_agl unless it is already queued, taking an inode
+ * reference for the list.  @index is __u64 like ll_sa_entry::se_index, which
+ * callers pass in; the former "int" parameter silently narrowed it. */
+static void ll_agl_add(struct ll_statahead_info *sai,
+		       struct inode *inode, __u64 index)
+{
+	struct ll_inode_info *child  = ll_i2info(inode);
+	struct ll_inode_info *parent = ll_i2info(sai->sai_inode);
+	int		   added  = 0;
+
+	spin_lock(&child->lli_agl_lock);
+	if (child->lli_agl_index == 0) {
+		child->lli_agl_index = index;
+		spin_unlock(&child->lli_agl_lock);
+
+		LASSERT(list_empty(&child->lli_agl_list));
+
+		igrab(inode);
+		spin_lock(&parent->lli_agl_lock);
+		if (agl_list_empty(sai))
+			added = 1;
+		list_add_tail(&child->lli_agl_list, &sai->sai_entries_agl);
+		spin_unlock(&parent->lli_agl_lock);
+	} else {
+		spin_unlock(&child->lli_agl_lock);
+	}
+
+	if (added > 0)
+		wake_up(&sai->sai_agl_thread.t_ctl_waitq);
+}
+
+/* Allocate a statahead info with one reference and a fresh, non-zero id. */
+static struct ll_statahead_info *ll_sai_alloc(void)
+{
+	struct ll_statahead_info *sai;
+	int bucket;
+	ENTRY;
+
+	OBD_ALLOC_PTR(sai);
+	if (sai == NULL)
+		RETURN(NULL);
+
+	/* generation 0 is never handed out, so skip it on wrap-around */
+	spin_lock(&sai_generation_lock);
+	if (unlikely(++sai_generation == 0))
+		++sai_generation;
+	sai->sai_generation = sai_generation;
+	spin_unlock(&sai_generation_lock);
+
+	atomic_set(&sai->sai_refcount, 1);
+	atomic_set(&sai->sai_cache_count, 0);
+	sai->sai_max = LL_SA_RPC_MIN;
+	sai->sai_index = 1;
+
+	init_waitqueue_head(&sai->sai_waitq);
+	init_waitqueue_head(&sai->sai_thread.t_ctl_waitq);
+	init_waitqueue_head(&sai->sai_agl_thread.t_ctl_waitq);
+	INIT_LIST_HEAD(&sai->sai_entries);
+	INIT_LIST_HEAD(&sai->sai_entries_received);
+	INIT_LIST_HEAD(&sai->sai_entries_stated);
+	INIT_LIST_HEAD(&sai->sai_entries_agl);
+	for (bucket = 0; bucket < LL_SA_CACHE_SIZE; bucket++) {
+		INIT_LIST_HEAD(&sai->sai_cache[bucket]);
+		spin_lock_init(&sai->sai_cache_lock[bucket]);
+	}
+
+	RETURN(sai);
+}
+
+static inline struct ll_statahead_info *
+ll_sai_get(struct ll_statahead_info *sai)
+{
+	atomic_inc(&sai->sai_refcount);	/* dropped again by ll_sai_put() */
+	return sai;
+}
+
+static void ll_sai_put(struct ll_statahead_info *sai)
+{
+	struct inode	 *inode = sai->sai_inode;
+	struct ll_inode_info *lli   = ll_i2info(inode);
+	ENTRY;
+	/* Drop one reference; the last put detaches the sai and frees it. */
+	if (atomic_dec_and_lock(&sai->sai_refcount, &lli->lli_sa_lock)) {
+		struct ll_sa_entry *entry, *next;
+
+		if (unlikely(atomic_read(&sai->sai_refcount) > 0)) {
+			/* Lost a race: the interpret callback has just taken
+			 * a new reference, so the sai must stay alive. */
+			spin_unlock(&lli->lli_sa_lock);
+			RETURN_EXIT;
+		}
+
+		LASSERT(lli->lli_opendir_key == NULL);
+		LASSERT(thread_is_stopped(&sai->sai_thread));
+		LASSERT(thread_is_stopped(&sai->sai_agl_thread));
+
+		lli->lli_sai = NULL;
+		lli->lli_opendir_pid = 0;
+		spin_unlock(&lli->lli_sa_lock);
+
+		if (sai->sai_sent > sai->sai_replied)
+			CDEBUG(D_READA,"statahead for dir "DFID" does not "
+			      "finish: [sent:"LPU64"] [replied:"LPU64"]\n",
+			      PFID(&lli->lli_fid),
+			      sai->sai_sent, sai->sai_replied);
+		/* tear down every entry that is still linked on sai_entries */
+		list_for_each_entry_safe(entry, next,
+					     &sai->sai_entries, se_link)
+			do_sa_entry_fini(sai, entry);
+
+		LASSERT(list_empty(&sai->sai_entries));
+		LASSERT(sa_received_empty(sai));
+		LASSERT(list_empty(&sai->sai_entries_stated));
+
+		LASSERT(atomic_read(&sai->sai_cache_count) == 0);
+		LASSERT(agl_list_empty(sai));
+		/* iput() the directory inode, then free the sai itself */
+		iput(inode);
+		OBD_FREE_PTR(sai);
+	}
+
+	EXIT;
+}
+
+/* Glimpse @inode asynchronously; every path drops the AGL list's inode ref. */
+static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai)
+{
+	struct ll_inode_info *lli   = ll_i2info(inode);
+	__u64		 index = lli->lli_agl_index;
+	int		   rc;
+	ENTRY;
+
+	LASSERT(list_empty(&lli->lli_agl_list));
+
+	/* AGL may fall behind statahead by one entry */
+	if (is_omitted_entry(sai, index + 1)) {
+		lli->lli_agl_index = 0;
+		iput(inode);
+		RETURN_EXIT;
+	}
+
+	/* Someone is in glimpse (sync or async), do nothing. */
+	rc = down_write_trylock(&lli->lli_glimpse_sem);
+	if (rc == 0) {
+		lli->lli_agl_index = 0;
+		iput(inode);
+		RETURN_EXIT;
+	}
+
+	/*
+	 * Someone triggered glimpse within 1 sec before.
+	 * 1) The former glimpse succeeded with glimpse lock granted by OST, and
+	 *    if the lock is still cached on client, AGL needs to do nothing. If
+	 *    it is cancelled by other client, AGL may not be able to obtain a
+	 *    new lock, as AGL triggers no glimpse callback.
+	 * 2) The former glimpse succeeded, but OST did not grant glimpse lock.
+	 *    Under such case, it is quite possible that the OST will not grant
+	 *    glimpse lock for AGL also.
+	 * 3) The former glimpse failed, compared with other two cases, it is
+	 *    relatively rare. AGL can ignore such case, and it will not much
+	 *    affect the performance.
+	 */
+	if (lli->lli_glimpse_time != 0 &&
+	    cfs_time_before(cfs_time_shift(-1), lli->lli_glimpse_time)) {
+		up_write(&lli->lli_glimpse_sem);
+		lli->lli_agl_index = 0;
+		iput(inode);
+		RETURN_EXIT;
+	}
+
+	CDEBUG(D_READA, "Handling (init) async glimpse: inode = "
+	       DFID", idx = "LPU64"\n", PFID(&lli->lli_fid), index);
+
+	cl_agl(inode);
+	lli->lli_agl_index = 0;
+	lli->lli_glimpse_time = cfs_time_current();
+	up_write(&lli->lli_glimpse_sem);
+	/* NOTE(review): rc below is still the trylock result, not cl_agl()'s */
+	CDEBUG(D_READA, "Handled (init) async glimpse: inode= "
+	       DFID", idx = "LPU64", rc = %d\n",
+	       PFID(&lli->lli_fid), index, rc);
+
+	iput(inode);
+
+	EXIT;
+}
+
+static void ll_post_statahead(struct ll_statahead_info *sai)
+{
+	struct inode	   *dir   = sai->sai_inode;
+	struct inode	   *child;
+	struct ll_inode_info   *lli   = ll_i2info(dir);
+	struct ll_sa_entry     *entry;
+	struct md_enqueue_info *minfo;
+	struct lookup_intent   *it;
+	struct ptlrpc_request  *req;
+	struct mdt_body	*body;
+	int		     rc    = 0;
+	ENTRY;
+	/* Handle the first received entry: prepare its inode, then mark it stated. */
+	spin_lock(&lli->lli_sa_lock);
+	if (unlikely(sa_received_empty(sai))) {
+		spin_unlock(&lli->lli_sa_lock);
+		RETURN_EXIT;
+	}
+	entry = sa_first_received_entry(sai);
+	atomic_inc(&entry->se_refcount);
+	list_del_init(&entry->se_list);
+	spin_unlock(&lli->lli_sa_lock);
+
+	LASSERT(entry->se_handle != 0);
+
+	minfo = entry->se_minfo;
+	it = &minfo->mi_it;
+	req = entry->se_req;
+	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	if (body == NULL)
+		GOTO(out, rc = -EFAULT);
+
+	child = entry->se_inode;
+	if (child == NULL) {
+		/*
+		 * lookup.
+		 */
+		LASSERT(fid_is_zero(&minfo->mi_data.op_fid2));
+
+		/* XXX: No fid in reply, this is probably cross-ref case.
+		 * SA can't handle it yet. */
+		if (body->valid & OBD_MD_MDS)
+			GOTO(out, rc = -EAGAIN);
+	} else {
+		/*
+		 * revalidate.
+		 */
+		/* unlinked and re-created with the same name */
+		if (unlikely(!lu_fid_eq(&minfo->mi_data.op_fid2, &body->fid1))){
+			entry->se_inode = NULL;
+			iput(child);
+			child = NULL;
+		}
+	}
+	/* put back the lock handle saved in se_handle and revalidate it */
+	it->d.lustre.it_lock_handle = entry->se_handle;
+	rc = md_revalidate_lock(ll_i2mdexp(dir), it, ll_inode2fid(dir), NULL);
+	if (rc != 1)
+		GOTO(out, rc = -EAGAIN);
+
+	rc = ll_prep_inode(&child, req, dir->i_sb, it);
+	if (rc)
+		GOTO(out, rc);
+
+	CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n",
+	       child, child->i_ino, child->i_generation);
+	ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL);
+
+	entry->se_inode = child;
+
+	if (agl_should_run(sai, child))
+		ll_agl_add(sai, child, entry->se_index);
+
+	EXIT;
+
+out:
+	/* The "ll_sa_entry_to_stated()" will drop related ldlm ibits lock
+	 * reference count by calling "ll_intent_drop_lock()" in spite of the
+	 * above operations failed or not. Do not worry about calling
+	 * "ll_intent_drop_lock()" more than once. */
+	rc = ll_sa_entry_to_stated(sai, entry,
+				   rc < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC);
+	if (rc == 0 && entry->se_index == sai->sai_index_wait)
+		wake_up(&sai->sai_waitq);
+	ll_sa_entry_put(sai, entry);
+}
+
+static int ll_statahead_interpret(struct ptlrpc_request *req,
+				  struct md_enqueue_info *minfo, int rc)
+{
+	struct lookup_intent     *it  = &minfo->mi_it;
+	struct inode	     *dir = minfo->mi_dir;
+	struct ll_inode_info     *lli = ll_i2info(dir);
+	struct ll_statahead_info *sai = NULL;
+	struct ll_sa_entry       *entry;
+	int		       wakeup;
+	ENTRY;
+	/* Async getattr completion: attach the reply to its entry or drop it. */
+	if (it_disposition(it, DISP_LOOKUP_NEG))
+		rc = -ENOENT;
+
+	spin_lock(&lli->lli_sa_lock);
+	/* stale entry: the sai is gone or belongs to another generation */
+	if (unlikely(lli->lli_sai == NULL ||
+		     lli->lli_sai->sai_generation != minfo->mi_generation)) {
+		spin_unlock(&lli->lli_sa_lock);
+		GOTO(out, rc = -ESTALE);
+	} else {
+		sai = ll_sai_get(lli->lli_sai);
+		if (unlikely(!thread_is_running(&sai->sai_thread))) {
+			sai->sai_replied++;
+			spin_unlock(&lli->lli_sa_lock);
+			GOTO(out, rc = -EBADFD);
+		}
+
+		entry = ll_sa_entry_get_byindex(sai, minfo->mi_cbdata);
+		if (entry == NULL) {
+			sai->sai_replied++;
+			spin_unlock(&lli->lli_sa_lock);
+			GOTO(out, rc = -EIDRM);
+		}
+
+		if (rc != 0) {
+			do_sa_entry_to_stated(sai, entry, SA_ENTRY_INVA);
+			wakeup = (entry->se_index == sai->sai_index_wait);
+		} else {
+			entry->se_minfo = minfo;
+			entry->se_req = ptlrpc_request_addref(req);
+			/* Release the async ibits lock ASAP to avoid deadlock
+			 * when statahead thread tries to enqueue lock on parent
+			 * for readpage and other tries to enqueue lock on child
+			 * with parent's lock held, for example: unlink. */
+			entry->se_handle = it->d.lustre.it_lock_handle;
+			ll_intent_drop_lock(it);
+			wakeup = sa_received_empty(sai);
+			list_add_tail(&entry->se_list,
+					  &sai->sai_entries_received);
+		}
+		sai->sai_replied++;
+		spin_unlock(&lli->lli_sa_lock);
+
+		ll_sa_entry_put(sai, entry);
+		if (wakeup)
+			wake_up(&sai->sai_thread.t_ctl_waitq);
+	}
+
+	EXIT;
+
+out:
+	if (rc != 0) {	/* on success minfo stays attached to the entry */
+		ll_intent_release(it);
+		iput(dir);
+		OBD_FREE_PTR(minfo);
+	}
+	if (sai != NULL)
+		ll_sai_put(sai);
+	return rc;
+}
+
+static void sa_args_fini(struct md_enqueue_info *minfo,
+			 struct ldlm_enqueue_info *einfo)
+{	/* undo sa_args_init(): inode ref, both capas, then both structs */
+	LASSERT(minfo && einfo);
+	iput(minfo->mi_dir);
+	capa_put(minfo->mi_data.op_capa1);
+	capa_put(minfo->mi_data.op_capa2);
+	OBD_FREE_PTR(minfo);
+	OBD_FREE_PTR(einfo);
+}
+
+/**
+ * Build the md_enqueue_info / ldlm_enqueue_info pair for one async getattr.
+ *
+ * "ll_statahead_interpret" may run before the caller gets to "capa_put" the
+ * "op_data.op_capa[1,2]" references taken here, and by then the op_data may
+ * already be filled with POISON.  The two capa pointers are therefore saved
+ * in "pcapa" before "md_intent_getattr_async" is called, and the caller puts
+ * them through that copy.  Returns 0 or a negative errno.
+ */
+static int sa_args_init(struct inode *dir, struct inode *child,
+			struct ll_sa_entry *entry, struct md_enqueue_info **pmi,
+			struct ldlm_enqueue_info **pei,
+			struct obd_capa **pcapa)
+{
+	struct qstr	      *qstr = &entry->se_qstr;
+	struct ll_inode_info     *lli  = ll_i2info(dir);
+	struct md_enqueue_info   *minfo;
+	struct ldlm_enqueue_info *einfo;
+	struct md_op_data	*op_data;
+
+	OBD_ALLOC_PTR(einfo);
+	if (einfo == NULL)
+		return -ENOMEM;
+
+	OBD_ALLOC_PTR(minfo);
+	if (minfo == NULL) {
+		OBD_FREE_PTR(einfo);
+		return -ENOMEM;
+	}
+
+	op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, qstr->name,
+				     qstr->len, 0, LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data)) {
+		OBD_FREE_PTR(einfo);
+		OBD_FREE_PTR(minfo);
+		return PTR_ERR(op_data);
+	}
+	/* the callback finds its sai and entry via mi_generation and mi_cbdata */
+	minfo->mi_it.it_op = IT_GETATTR;
+	minfo->mi_dir = igrab(dir);
+	minfo->mi_cb = ll_statahead_interpret;
+	minfo->mi_generation = lli->lli_sai->sai_generation;
+	minfo->mi_cbdata = entry->se_index;
+
+	einfo->ei_type   = LDLM_IBITS;
+	einfo->ei_mode   = it_to_lock_mode(&minfo->mi_it);
+	einfo->ei_cb_bl  = ll_md_blocking_ast;
+	einfo->ei_cb_cp  = ldlm_completion_ast;
+	einfo->ei_cb_gl  = NULL;
+	einfo->ei_cbdata = NULL;
+
+	*pmi = minfo;
+	*pei = einfo;
+	pcapa[0] = op_data->op_capa1;
+	pcapa[1] = op_data->op_capa2;
+
+	return 0;
+}
+
+/* Queue an async getattr by name for @entry; no child inode is passed. */
+static int do_sa_lookup(struct inode *dir, struct ll_sa_entry *entry)
+{
+	struct ldlm_enqueue_info *lock_args;
+	struct md_enqueue_info *md_args;
+	struct obd_capa *capas[2];
+	int err;
+	ENTRY;
+
+	err = sa_args_init(dir, NULL, entry, &md_args, &lock_args, capas);
+	if (err != 0)
+		RETURN(err);
+
+	err = md_intent_getattr_async(ll_i2mdexp(dir), md_args, lock_args);
+	if (err != 0) {
+		sa_args_fini(md_args, lock_args);
+		RETURN(err);
+	}
+	capa_put(capas[0]);
+	capa_put(capas[1]);
+	RETURN(0);
+}
+
+/**
+ * similar to ll_revalidate_it().
+ * \retval      1 -- dentry valid
+ * \retval      0 -- will send stat-ahead request
+ * \retval others -- prepare stat-ahead request failed
+ */
+static int do_sa_revalidate(struct inode *dir, struct ll_sa_entry *entry,
+			    struct dentry *dentry)
+{
+	struct inode	     *inode = dentry->d_inode;
+	struct lookup_intent      it = { .it_op = IT_GETATTR,
+					 .d.lustre.it_lock_handle = 0 };
+	struct md_enqueue_info   *minfo;
+	struct ldlm_enqueue_info *einfo;
+	struct obd_capa	  *capas[2];
+	int rc;
+	ENTRY;
+
+	if (unlikely(inode == NULL))
+		RETURN(1);
+
+	if (d_mountpoint(dentry))
+		RETURN(1);
+
+	if (unlikely(dentry == dentry->d_sb->s_root))
+		RETURN(1);
+	/* the entry holds its own inode reference from here on */
+	entry->se_inode = igrab(inode);
+	rc = md_revalidate_lock(ll_i2mdexp(dir), &it, ll_inode2fid(inode),NULL);
+	if (rc == 1) {
+		entry->se_handle = it.d.lustre.it_lock_handle;
+		ll_intent_release(&it);
+		RETURN(1);
+	}
+	/* revalidation did not return 1: send an async getattr instead */
+	rc = sa_args_init(dir, inode, entry, &minfo, &einfo, capas);
+	if (rc) {
+		entry->se_inode = NULL;
+		iput(inode);
+		RETURN(rc);
+	}
+
+	rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo);
+	if (!rc) {
+		capa_put(capas[0]);
+		capa_put(capas[1]);
+	} else {
+		entry->se_inode = NULL;
+		iput(inode);
+		sa_args_fini(minfo, einfo);
+	}
+
+	RETURN(rc);
+}
+
+static void ll_statahead_one(struct dentry *parent, const char* entry_name,
+			     int entry_name_len)
+{
+	struct inode	     *dir    = parent->d_inode;
+	struct ll_inode_info     *lli    = ll_i2info(dir);
+	struct ll_statahead_info *sai    = lli->lli_sai;
+	struct dentry	    *dentry = NULL;
+	struct ll_sa_entry       *entry;
+	int		       rc;
+	int		       rc1;
+	ENTRY;
+	/* One name: create its entry, then start a lookup or a revalidation. */
+	entry = ll_sa_entry_alloc(sai, sai->sai_index, entry_name,
+				  entry_name_len);
+	if (IS_ERR(entry))
+		RETURN_EXIT;
+
+	dentry = d_lookup(parent, &entry->se_qstr);
+	if (!dentry) {
+		rc = do_sa_lookup(dir, entry);
+	} else {
+		rc = do_sa_revalidate(dir, entry, dentry);
+		if (rc == 1 && agl_should_run(sai, dentry->d_inode))
+			ll_agl_add(sai, dentry->d_inode, entry->se_index);
+	}
+
+	if (dentry != NULL)
+		dput(dentry);
+	/* rc != 0: no request was queued, so the entry is settled right here */
+	if (rc) {
+		rc1 = ll_sa_entry_to_stated(sai, entry,
+					rc < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC);
+		if (rc1 == 0 && entry->se_index == sai->sai_index_wait)
+			wake_up(&sai->sai_waitq);
+	} else {
+		sai->sai_sent++;
+	}
+
+	sai->sai_index++;
+	/* drop one refcount on entry by ll_sa_entry_alloc */
+	ll_sa_entry_put(sai, entry);
+
+	EXIT;
+}
+
+/*
+ * Body of the "ll_agl_%u" kernel thread started by ll_start_agl().
+ *
+ * Repeatedly takes the first inode off the statahead instance's AGL list
+ * and passes it to ll_agl_trigger() until the thread is asked to stop, then
+ * drains whatever is left on the list and reports SVC_STOPPED.
+ *
+ * \param arg	parent dentry (struct dentry *) of the directory being scanned
+ *
+ * \retval 0	always
+ */
+static int ll_agl_thread(void *arg)
+{
+	struct dentry	    *parent = (struct dentry *)arg;
+	struct inode	     *dir    = parent->d_inode;
+	struct ll_inode_info     *plli   = ll_i2info(dir);
+	struct ll_inode_info     *clli;
+	struct ll_sb_info	*sbi    = ll_i2sbi(dir);
+	/* reference on the statahead info, dropped by ll_sai_put() below */
+	struct ll_statahead_info *sai    = ll_sai_get(plli->lli_sai);
+	struct ptlrpc_thread     *thread = &sai->sai_agl_thread;
+	struct l_wait_info	lwi    = { 0 };
+	ENTRY;
+
+	CDEBUG(D_READA, "agl thread started: [pid %d] [parent %.*s]\n",
+	       current_pid(), parent->d_name.len, parent->d_name.name);
+
+	atomic_inc(&sbi->ll_agl_total);
+	/* Publish the running state and wake up ll_start_agl(), which waits
+	 * on t_ctl_waitq for the thread to be running or stopped. */
+	spin_lock(&plli->lli_agl_lock);
+	sai->sai_agl_valid = 1;
+	thread_set_flags(thread, SVC_RUNNING);
+	spin_unlock(&plli->lli_agl_lock);
+	wake_up(&thread->t_ctl_waitq);
+
+	while (1) {
+		/* Sleep until there is AGL work or the thread state changes. */
+		l_wait_event(thread->t_ctl_waitq,
+			     !agl_list_empty(sai) ||
+			     !thread_is_running(thread),
+			     &lwi);
+
+		if (!thread_is_running(thread))
+			break;
+
+		spin_lock(&plli->lli_agl_lock);
+		/* The statahead thread maybe help to process AGL entries,
+		 * so check whether list empty again. */
+		if (!agl_list_empty(sai)) {
+			clli = agl_first_entry(sai);
+			list_del_init(&clli->lli_agl_list);
+			/* the lock is released before ll_agl_trigger() runs */
+			spin_unlock(&plli->lli_agl_lock);
+			ll_agl_trigger(&clli->lli_vfs_inode, sai);
+		} else {
+			spin_unlock(&plli->lli_agl_lock);
+		}
+	}
+
+	/* Shutdown: mark AGL invalid and drain the list. The lock is dropped
+	 * around each iput() and re-taken before the next list check. */
+	spin_lock(&plli->lli_agl_lock);
+	sai->sai_agl_valid = 0;
+	while (!agl_list_empty(sai)) {
+		clli = agl_first_entry(sai);
+		list_del_init(&clli->lli_agl_list);
+		spin_unlock(&plli->lli_agl_lock);
+		clli->lli_agl_index = 0;
+		iput(&clli->lli_vfs_inode);
+		spin_lock(&plli->lli_agl_lock);
+	}
+	/* Tell the statahead thread (waiting in its "out" path) we are done. */
+	thread_set_flags(thread, SVC_STOPPED);
+	spin_unlock(&plli->lli_agl_lock);
+	wake_up(&thread->t_ctl_waitq);
+	ll_sai_put(sai);
+	/* NOTE(review): "parent" is still dereferenced here although the
+	 * waiter has already been woken; confirm the parent dentry is
+	 * guaranteed to outlive this message. */
+	CDEBUG(D_READA, "agl thread stopped: [pid %d] [parent %.*s]\n",
+	       current_pid(), parent->d_name.len, parent->d_name.name);
+	RETURN(0);
+}
+
+/*
+ * Start the AGL helper thread for a statahead instance and wait until it
+ * reports itself running or stopped. If the thread cannot be created, the
+ * AGL thread state is set to SVC_STOPPED.
+ *
+ * \param parent dentry of the directory being scanned; handed to the thread
+ * \param sai	 statahead instance owning the AGL thread descriptor
+ */
+static void ll_start_agl(struct dentry *parent, struct ll_statahead_info *sai)
+{
+	struct ll_inode_info *plli   = ll_i2info(parent->d_inode);
+	struct ptlrpc_thread *thread = &sai->sai_agl_thread;
+	struct l_wait_info    lwi    = { 0 };
+	task_t		     *task;
+	ENTRY;
+
+	CDEBUG(D_READA, "start agl thread: [pid %d] [parent %.*s]\n",
+	       current_pid(), parent->d_name.len, parent->d_name.name);
+
+	task = kthread_run(ll_agl_thread, parent,
+			   "ll_agl_%u", plli->lli_opendir_pid);
+	if (!IS_ERR(task)) {
+		/* wait for ll_agl_thread() to publish its state */
+		l_wait_event(thread->t_ctl_waitq,
+			     thread_is_running(thread) ||
+			     thread_is_stopped(thread),
+			     &lwi);
+	} else {
+		CERROR("can't start ll_agl thread, rc: %ld\n", PTR_ERR(task));
+		thread_set_flags(thread, SVC_STOPPED);
+	}
+
+	EXIT;
+}
+
+/*
+ * Body of the "ll_sa_%u" kernel thread started by do_statahead_enter().
+ *
+ * Walks the pages of the parent directory and calls ll_statahead_one() for
+ * every eligible entry. ".", "..", dummy records, the first eligible entry
+ * and (unless sai_ls_all is set) names starting with '.' are skipped.
+ * While the send window is full the thread post-processes received replies
+ * and helps with queued AGL entries.
+ *
+ * \param arg	parent dentry (struct dentry *); the reference taken by the
+ *		starter is dropped here just before returning
+ *
+ * \retval 0	   stopped on request or end of directory reached
+ * \retval others  error returned while reading a directory page
+ */
+static int ll_statahead_thread(void *arg)
+{
+	struct dentry	    *parent = (struct dentry *)arg;
+	struct inode	     *dir    = parent->d_inode;
+	struct ll_inode_info     *plli   = ll_i2info(dir);
+	struct ll_inode_info     *clli;
+	struct ll_sb_info	*sbi    = ll_i2sbi(dir);
+	struct ll_statahead_info *sai    = ll_sai_get(plli->lli_sai);
+	struct ptlrpc_thread     *thread = &sai->sai_thread;
+	struct ptlrpc_thread *agl_thread = &sai->sai_agl_thread;
+	struct page	      *page;
+	__u64		     pos    = 0;
+	int		       first  = 0;
+	int		       rc     = 0;
+	struct ll_dir_chain       chain;
+	struct l_wait_info	lwi    = { 0 };
+	ENTRY;
+
+	CDEBUG(D_READA, "statahead thread started: [pid %d] [parent %.*s]\n",
+	       current_pid(), parent->d_name.len, parent->d_name.name);
+
+	if (sbi->ll_flags & LL_SBI_AGL_ENABLED)
+		ll_start_agl(parent, sai);
+
+	atomic_inc(&sbi->ll_sa_total);
+	/* Publish the running state and wake up do_statahead_enter(). */
+	spin_lock(&plli->lli_sa_lock);
+	thread_set_flags(thread, SVC_RUNNING);
+	spin_unlock(&plli->lli_sa_lock);
+	wake_up(&thread->t_ctl_waitq);
+
+	ll_dir_chain_init(&chain);
+	page = ll_get_dir_page(dir, pos, &chain);
+
+	while (1) {
+		struct lu_dirpage *dp;
+		struct lu_dirent  *ent;
+
+		if (IS_ERR(page)) {
+			rc = PTR_ERR(page);
+			CDEBUG(D_READA, "error reading dir "DFID" at "LPU64
+			       "/"LPU64": [rc %d] [parent %u]\n",
+			       PFID(ll_inode2fid(dir)), pos, sai->sai_index,
+			       rc, plli->lli_opendir_pid);
+			GOTO(out, rc);
+		}
+
+		dp = page_address(page);
+		for (ent = lu_dirent_start(dp); ent != NULL;
+		     ent = lu_dirent_next(ent)) {
+			__u64 hash;
+			int namelen;
+			char *name;
+
+			hash = le64_to_cpu(ent->lde_hash);
+			if (unlikely(hash < pos))
+				/*
+				 * Skip until we find target hash value.
+				 */
+				continue;
+
+			namelen = le16_to_cpu(ent->lde_namelen);
+			if (unlikely(namelen == 0))
+				/*
+				 * Skip dummy record.
+				 */
+				continue;
+
+			name = ent->lde_name;
+			if (name[0] == '.') {
+				if (namelen == 1) {
+					/*
+					 * skip "."
+					 */
+					continue;
+				} else if (name[1] == '.' && namelen == 2) {
+					/*
+					 * skip ".."
+					 */
+					continue;
+				} else if (!sai->sai_ls_all) {
+					/*
+					 * skip hidden files.
+					 */
+					sai->sai_skip_hidden++;
+					continue;
+				}
+			}
+
+			/*
+			 * don't stat-ahead first entry.
+			 */
+			if (unlikely(++first == 1))
+				continue;
+
+keep_it:
+			/* Wait for room in the send window, a received reply,
+			 * pending AGL work, or a request to stop. */
+			l_wait_event(thread->t_ctl_waitq,
+				     !sa_sent_full(sai) ||
+				     !sa_received_empty(sai) ||
+				     !agl_list_empty(sai) ||
+				     !thread_is_running(thread),
+				     &lwi);
+
+interpret_it:
+			while (!sa_received_empty(sai))
+				ll_post_statahead(sai);
+
+			if (unlikely(!thread_is_running(thread))) {
+				ll_release_page(page, 0);
+				GOTO(out, rc = 0);
+			}
+
+			/* If no window for metadata statahead, but there are
+			 * some AGL entries to be triggered, then try to help
+			 * to process the AGL entries. */
+			if (sa_sent_full(sai)) {
+				spin_lock(&plli->lli_agl_lock);
+				while (!agl_list_empty(sai)) {
+					clli = agl_first_entry(sai);
+					list_del_init(&clli->lli_agl_list);
+					spin_unlock(&plli->lli_agl_lock);
+					ll_agl_trigger(&clli->lli_vfs_inode,
+						       sai);
+
+					/* every exit below leaves the loop
+					 * with lli_agl_lock NOT held */
+					if (!sa_received_empty(sai))
+						goto interpret_it;
+
+					if (unlikely(
+						!thread_is_running(thread))) {
+						ll_release_page(page, 0);
+						GOTO(out, rc = 0);
+					}
+
+					if (!sa_sent_full(sai))
+						goto do_it;
+
+					spin_lock(&plli->lli_agl_lock);
+				}
+				spin_unlock(&plli->lli_agl_lock);
+
+				goto keep_it;
+			}
+
+do_it:
+			ll_statahead_one(parent, name, namelen);
+		}
+		pos = le64_to_cpu(dp->ldp_hash_end);
+		if (pos == MDS_DIR_END_OFF) {
+			/*
+			 * End of directory reached.
+			 */
+			ll_release_page(page, 0);
+			/* Wait until every sent request has been replied to
+			 * and post-processed, or we are told to stop. */
+			while (1) {
+				l_wait_event(thread->t_ctl_waitq,
+					     !sa_received_empty(sai) ||
+					     sai->sai_sent == sai->sai_replied||
+					     !thread_is_running(thread),
+					     &lwi);
+
+				while (!sa_received_empty(sai))
+					ll_post_statahead(sai);
+
+				if (unlikely(!thread_is_running(thread)))
+					GOTO(out, rc = 0);
+
+				if (sai->sai_sent == sai->sai_replied &&
+				    sa_received_empty(sai))
+					break;
+			}
+
+			/* Finish whatever AGL work is still queued. */
+			spin_lock(&plli->lli_agl_lock);
+			while (!agl_list_empty(sai) &&
+			       thread_is_running(thread)) {
+				clli = agl_first_entry(sai);
+				list_del_init(&clli->lli_agl_list);
+				spin_unlock(&plli->lli_agl_lock);
+				ll_agl_trigger(&clli->lli_vfs_inode, sai);
+				spin_lock(&plli->lli_agl_lock);
+			}
+			spin_unlock(&plli->lli_agl_lock);
+
+			GOTO(out, rc = 0);
+		} else if (1) {
+			/*
+			 * chain is exhausted.
+			 * Normal case: continue to the next page.
+			 */
+			ll_release_page(page, le32_to_cpu(dp->ldp_flags) &
+					      LDF_COLLIDE);
+			/* sai_in_readpage lets do_statahead_enter() help with
+			 * post-processing while we are busy reading a page */
+			sai->sai_in_readpage = 1;
+			page = ll_get_dir_page(dir, pos, &chain);
+			sai->sai_in_readpage = 0;
+		} else {
+			LASSERT(le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE);
+			ll_release_page(page, 1);
+			/*
+			 * go into overflow page.
+			 */
+		}
+	}
+	EXIT;
+
+out:
+	if (sai->sai_agl_valid) {
+		/* Ask the AGL thread to stop and wait until it has. */
+		spin_lock(&plli->lli_agl_lock);
+		thread_set_flags(agl_thread, SVC_STOPPING);
+		spin_unlock(&plli->lli_agl_lock);
+		wake_up(&agl_thread->t_ctl_waitq);
+
+		CDEBUG(D_READA, "stop agl thread: [pid %d]\n",
+		       current_pid());
+		l_wait_event(agl_thread->t_ctl_waitq,
+			     thread_is_stopped(agl_thread),
+			     &lwi);
+	} else {
+		/* Set agl_thread flags anyway. */
+		thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED);
+	}
+	ll_dir_chain_fini(&chain);
+	spin_lock(&plli->lli_sa_lock);
+	if (!sa_received_empty(sai)) {
+		thread_set_flags(thread, SVC_STOPPING);
+		spin_unlock(&plli->lli_sa_lock);
+
+		/* To release the resources held by received entries. */
+		while (!sa_received_empty(sai))
+			ll_post_statahead(sai);
+
+		spin_lock(&plli->lli_sa_lock);
+	}
+	thread_set_flags(thread, SVC_STOPPED);
+	spin_unlock(&plli->lli_sa_lock);
+	wake_up(&sai->sai_waitq);
+	wake_up(&thread->t_ctl_waitq);
+	ll_sai_put(sai);
+	CDEBUG(D_READA, "statahead thread stopped: [pid %d] [parent %.*s]\n",
+	       current_pid(), parent->d_name.len, parent->d_name.name);
+	/*
+	 * Drop the parent reference taken in do_statahead_enter() only after
+	 * the last use of "parent" above: once dput() returns the dentry may
+	 * already have been freed.
+	 */
+	dput(parent);
+	return rc;
+}
+
+/**
+ * called in ll_file_release().
+ *
+ * Stop the statahead thread of \a dir, if any, and drop the reference on
+ * the statahead info held since the first statahead enter. Does nothing
+ * unless \a key matches the directory's current opendir key.
+ *
+ * \param dir	directory inode whose statahead is to be stopped
+ * \param key	opendir key of the caller; NULL is ignored
+ */
+void ll_stop_statahead(struct inode *dir, void *key)
+{
+	struct ll_inode_info *lli = ll_i2info(dir);
+
+	if (unlikely(key == NULL))
+		return;
+
+	spin_lock(&lli->lli_sa_lock);
+	/* only the holder of the current opendir key may stop statahead */
+	if (lli->lli_opendir_key != key || lli->lli_opendir_pid == 0) {
+		spin_unlock(&lli->lli_sa_lock);
+		return;
+	}
+
+	lli->lli_opendir_key = NULL;
+
+	if (lli->lli_sai) {
+		struct l_wait_info lwi = { 0 };
+		struct ptlrpc_thread *thread = &lli->lli_sai->sai_thread;
+
+		if (!thread_is_stopped(thread)) {
+			/* Request the stop under the lock, then wait outside
+			 * of it for the thread to report SVC_STOPPED. */
+			thread_set_flags(thread, SVC_STOPPING);
+			spin_unlock(&lli->lli_sa_lock);
+			wake_up(&thread->t_ctl_waitq);
+
+			CDEBUG(D_READA, "stop statahead thread: [pid %d]\n",
+			       current_pid());
+			l_wait_event(thread->t_ctl_waitq,
+				     thread_is_stopped(thread),
+				     &lwi);
+		} else {
+			spin_unlock(&lli->lli_sa_lock);
+		}
+
+		/*
+		 * Put the ref which was held when first statahead_enter.
+		 * It maybe not the last ref for some statahead requests
+		 * maybe inflight.
+		 */
+		ll_sai_put(lli->lli_sai);
+	} else {
+		/* no statahead instance: just release the opendir owner */
+		lli->lli_opendir_pid = 0;
+		spin_unlock(&lli->lli_sa_lock);
+	}
+}
+
+/* Result codes of is_first_dirent(). */
+enum {
+	/**
+	 * the target is not the first candidate dirent of the directory
+	 */
+	LS_NONE_FIRST_DE = 0,
+	/**
+	 * the target is the first dirent and its name does not start
+	 * with '.'
+	 */
+	LS_FIRST_DE,
+	/**
+	 * the target is the first dirent and it is a hidden one, that is
+	 * its name starts with '.' (but is neither "." nor "..")
+	 */
+	LS_FIRST_DOT_DE
+};
+
+/*
+ * Check whether \a dentry names the first candidate entry of \a dir.
+ *
+ * The directory pages are scanned from hash 0. Dummy records, "." and ".."
+ * are ignored, and hidden entries (names starting with '.') are ignored as
+ * well unless the target name itself starts with '.'. The first remaining
+ * entry is compared against the target name and the scan ends there.
+ *
+ * \param dir	 directory inode to scan
+ * \param dentry dentry whose name is looked for
+ *
+ * \retval LS_FIRST_DE	     target is the first entry and is not hidden
+ * \retval LS_FIRST_DOT_DE  target is the first entry and is hidden
+ * \retval LS_NONE_FIRST_DE target is not the first entry, or the end of
+ *			     the directory was reached without a candidate
+ * \retval negative	     error from reading a directory page
+ */
+static int is_first_dirent(struct inode *dir, struct dentry *dentry)
+{
+	struct ll_dir_chain   chain;
+	struct qstr	  *target = &dentry->d_name;
+	struct page	  *page;
+	__u64		 pos    = 0;
+	int		   dot_de;
+	int		   rc     = LS_NONE_FIRST_DE;
+	ENTRY;
+
+	ll_dir_chain_init(&chain);
+	page = ll_get_dir_page(dir, pos, &chain);
+
+	while (1) {
+		struct lu_dirpage *dp;
+		struct lu_dirent  *ent;
+
+		if (IS_ERR(page)) {
+			struct ll_inode_info *lli = ll_i2info(dir);
+
+			/* the page error becomes the return value */
+			rc = PTR_ERR(page);
+			CERROR("error reading dir "DFID" at "LPU64": "
+			       "[rc %d] [parent %u]\n",
+			       PFID(ll_inode2fid(dir)), pos,
+			       rc, lli->lli_opendir_pid);
+			break;
+		}
+
+		dp = page_address(page);
+		for (ent = lu_dirent_start(dp); ent != NULL;
+		     ent = lu_dirent_next(ent)) {
+			__u64 hash;
+			int namelen;
+			char *name;
+
+			hash = le64_to_cpu(ent->lde_hash);
+			/* The ll_get_dir_page() can return any page containing
+			 * the given hash which may be not the start hash. */
+			if (unlikely(hash < pos))
+				continue;
+
+			namelen = le16_to_cpu(ent->lde_namelen);
+			if (unlikely(namelen == 0))
+				/*
+				 * skip dummy record.
+				 */
+				continue;
+
+			name = ent->lde_name;
+			if (name[0] == '.') {
+				if (namelen == 1)
+					/*
+					 * skip "."
+					 */
+					continue;
+				else if (name[1] == '.' && namelen == 2)
+					/*
+					 * skip ".."
+					 */
+					continue;
+				else
+					dot_de = 1;
+			} else {
+				dot_de = 0;
+			}
+
+			/* a hidden entry cannot be "first" for a target
+			 * whose own name is not hidden */
+			if (dot_de && target->name[0] != '.') {
+				CDEBUG(D_READA, "%.*s skip hidden file %.*s\n",
+				       target->len, target->name,
+				       namelen, name);
+				continue;
+			}
+
+			/* First candidate found: the verdict depends only on
+			 * whether it is the target, then the scan stops. */
+			if (target->len != namelen ||
+			    memcmp(target->name, name, namelen) != 0)
+				rc = LS_NONE_FIRST_DE;
+			else if (!dot_de)
+				rc = LS_FIRST_DE;
+			else
+				rc = LS_FIRST_DOT_DE;
+
+			ll_release_page(page, 0);
+			GOTO(out, rc);
+		}
+		pos = le64_to_cpu(dp->ldp_hash_end);
+		if (pos == MDS_DIR_END_OFF) {
+			/*
+			 * End of directory reached.
+			 */
+			ll_release_page(page, 0);
+			break;
+		} else if (1) {
+			/*
+			 * chain is exhausted
+			 * Normal case: continue to the next page.
+			 */
+			ll_release_page(page, le32_to_cpu(dp->ldp_flags) &
+					      LDF_COLLIDE);
+			page = ll_get_dir_page(dir, pos, &chain);
+		} else {
+			/*
+			 * go into overflow page.
+			 * (unreachable: the branch above is always taken)
+			 */
+			LASSERT(le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE);
+			ll_release_page(page, 1);
+		}
+	}
+	EXIT;
+
+out:
+	ll_dir_chain_fini(&chain);
+	return rc;
+}
+
+/*
+ * Account one statahead hit or miss, finish \a entry and wake up the
+ * statahead thread.
+ *
+ * A hit (entry present with state SA_ENTRY_SUCC) resets the consecutive
+ * miss counter and doubles the statahead window, capped at the per-sb
+ * ll_sa_max. A miss bumps the miss counters and, when sa_low_hit() says
+ * the hit ratio is too low, asks the running statahead thread to stop.
+ *
+ * \param sai	statahead instance
+ * \param entry	entry being consumed; callers in this file may pass NULL
+ */
+static void
+ll_sai_unplug(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
+{
+	struct ptlrpc_thread *thread = &sai->sai_thread;
+	struct ll_sb_info    *sbi    = ll_i2sbi(sai->sai_inode);
+	int		   hit;
+	ENTRY;
+
+	/* decide before ll_sa_entry_fini(): entry is not used afterwards */
+	if (entry != NULL && entry->se_stat == SA_ENTRY_SUCC)
+		hit = 1;
+	else
+		hit = 0;
+
+	ll_sa_entry_fini(sai, entry);
+	if (hit) {
+		sai->sai_hit++;
+		sai->sai_consecutive_miss = 0;
+		sai->sai_max = min(2 * sai->sai_max, sbi->ll_sa_max);
+	} else {
+		struct ll_inode_info *lli = ll_i2info(sai->sai_inode);
+
+		sai->sai_miss++;
+		sai->sai_consecutive_miss++;
+		if (sa_low_hit(sai) && thread_is_running(thread)) {
+			atomic_inc(&sbi->ll_sa_wrong);
+			CDEBUG(D_READA, "Statahead for dir "DFID" hit "
+			       "ratio too low: hit/miss "LPU64"/"LPU64
+			       ", sent/replied "LPU64"/"LPU64", stopping "
+			       "statahead thread: pid %d\n",
+			       PFID(&lli->lli_fid), sai->sai_hit,
+			       sai->sai_miss, sai->sai_sent,
+			       sai->sai_replied, current_pid());
+			/* re-check under the lock so that an already stopped
+			 * thread is not moved back to SVC_STOPPING */
+			spin_lock(&lli->lli_sa_lock);
+			if (!thread_is_stopped(thread))
+				thread_set_flags(thread, SVC_STOPPING);
+			spin_unlock(&lli->lli_sa_lock);
+		}
+	}
+
+	/* let the statahead thread re-evaluate its wait condition */
+	if (!thread_is_stopped(thread))
+		wake_up(&thread->t_ctl_waitq);
+
+	EXIT;
+}
+
+/**
+ * Start statahead thread if this is the first dir entry.
+ * Otherwise if a thread is started already, wait it until it is ahead of me.
+ *
+ * \param dir	      directory inode; the caller must be the opendir owner
+ *		      (lli_opendir_pid == current pid)
+ * \param dentryp     in/out: dentry being looked up or revalidated; may be
+ *		      replaced by the alias returned from ll_splice_alias()
+ * \param only_unplug when non-zero, only account the hit/miss for the entry
+ *		      and do not wait for or consume the statahead result
+ *
+ * \retval 1       -- find entry with lock in cache, the caller needs to do
+ *		    nothing.
+ * \retval 0       -- find entry in cache, but without lock, the caller needs
+ *		    refresh from MDS.
+ * \retval others  -- the caller need to process as non-statahead.
+ */
+int do_statahead_enter(struct inode *dir, struct dentry **dentryp,
+		       int only_unplug)
+{
+	struct ll_inode_info     *lli   = ll_i2info(dir);
+	struct ll_statahead_info *sai   = lli->lli_sai;
+	struct dentry	    *parent;
+	struct ll_sa_entry       *entry;
+	struct ptlrpc_thread     *thread;
+	struct l_wait_info	lwi   = { 0 };
+	int		       rc    = 0;
+	struct ll_inode_info     *plli;
+	struct task_struct	 *task;
+	ENTRY;
+
+	LASSERT(lli->lli_opendir_pid == current_pid());
+
+	if (sai) {
+		thread = &sai->sai_thread;
+		if (unlikely(thread_is_stopped(thread) &&
+			     list_empty(&sai->sai_entries_stated))) {
+			/* to release resource */
+			ll_stop_statahead(dir, lli->lli_opendir_key);
+			RETURN(-EAGAIN);
+		}
+
+		if ((*dentryp)->d_name.name[0] == '.') {
+			if (sai->sai_ls_all ||
+			    sai->sai_miss_hidden >= sai->sai_skip_hidden) {
+				/*
+				 * Hidden dentry is the first one, or statahead
+				 * thread does not skip so many hidden dentries
+				 * before "sai_ls_all" enabled as below.
+				 */
+			} else {
+				if (!sai->sai_ls_all)
+					/*
+					 * It maybe because hidden dentry is not
+					 * the first one, "sai_ls_all" was not
+					 * set, then "ls -al" missed. Enable
+					 * "sai_ls_all" for such case.
+					 */
+					sai->sai_ls_all = 1;
+
+				/*
+				 * Such "getattr" has been skipped before
+				 * "sai_ls_all" enabled as above.
+				 */
+				sai->sai_miss_hidden++;
+				RETURN(-EAGAIN);
+			}
+		}
+
+		entry = ll_sa_entry_get_byname(sai, &(*dentryp)->d_name);
+		if (entry == NULL || only_unplug) {
+			ll_sai_unplug(sai, entry);
+			RETURN(entry ? 1 : -EAGAIN);
+		}
+
+		/* if statahead is busy in readdir, help it do post-work */
+		while (!ll_sa_entry_stated(entry) &&
+		       sai->sai_in_readpage &&
+		       !sa_received_empty(sai))
+			ll_post_statahead(sai);
+
+		if (!ll_sa_entry_stated(entry)) {
+			/* wait (interruptibly, at most 30s) for the statahead
+			 * thread to reach this entry or to stop */
+			sai->sai_index_wait = entry->se_index;
+			lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(30), NULL,
+					       LWI_ON_SIGNAL_NOOP, NULL);
+			rc = l_wait_event(sai->sai_waitq,
+					  ll_sa_entry_stated(entry) ||
+					  thread_is_stopped(thread),
+					  &lwi);
+			if (rc < 0) {
+				ll_sai_unplug(sai, entry);
+				RETURN(-EAGAIN);
+			}
+		}
+
+		if (entry->se_stat == SA_ENTRY_SUCC &&
+		    entry->se_inode != NULL) {
+			struct inode *inode = entry->se_inode;
+			struct lookup_intent it = { .it_op = IT_GETATTR,
+						    .d.lustre.it_lock_handle =
+						     entry->se_handle };
+			__u64 bits;
+
+			rc = md_revalidate_lock(ll_i2mdexp(dir), &it,
+						ll_inode2fid(inode), &bits);
+			if (rc == 1) {
+				if ((*dentryp)->d_inode == NULL) {
+					*dentryp = ll_splice_alias(inode,
+								   *dentryp);
+				} else if ((*dentryp)->d_inode != inode) {
+					/* revalidate, but inode is recreated */
+					CDEBUG(D_READA,
+					      "stale dentry %.*s inode %lu/%u, "
+					      "statahead inode %lu/%u\n",
+					      (*dentryp)->d_name.len,
+					      (*dentryp)->d_name.name,
+					      (*dentryp)->d_inode->i_ino,
+					      (*dentryp)->d_inode->i_generation,
+					      inode->i_ino,
+					      inode->i_generation);
+					/* md_revalidate_lock() returned 1, so
+					 * the intent must be released on this
+					 * path exactly as on the normal one */
+					ll_intent_release(&it);
+					ll_sai_unplug(sai, entry);
+					RETURN(-ESTALE);
+				} else {
+					/* dentry already has this inode: drop
+					 * the reference held by the entry */
+					iput(inode);
+				}
+				entry->se_inode = NULL;
+
+				if ((bits & MDS_INODELOCK_LOOKUP) &&
+				    d_lustre_invalid(*dentryp))
+					d_lustre_revalidate(*dentryp);
+				ll_intent_release(&it);
+			}
+		}
+
+		ll_sai_unplug(sai, entry);
+		RETURN(rc);
+	}
+
+	/* I am the "lli_opendir_pid" owner, only me can set "lli_sai". */
+	rc = is_first_dirent(dir, *dentryp);
+	if (rc == LS_NONE_FIRST_DE)
+		/* It is not "ls -{a}l" operation, no need statahead for it. */
+		GOTO(out, rc = -EAGAIN);
+
+	sai = ll_sai_alloc();
+	if (sai == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	sai->sai_ls_all = (rc == LS_FIRST_DOT_DE);
+	sai->sai_inode = igrab(dir);
+	if (unlikely(sai->sai_inode == NULL)) {
+		CWARN("Do not start stat ahead on dying inode "DFID"\n",
+		      PFID(&lli->lli_fid));
+		GOTO(out, rc = -ESTALE);
+	}
+
+	/* get parent reference count here, and put it in ll_statahead_thread */
+	parent = dget((*dentryp)->d_parent);
+	if (unlikely(sai->sai_inode != parent->d_inode)) {
+		struct ll_inode_info *nlli = ll_i2info(parent->d_inode);
+
+		CWARN("Race condition, someone changed %.*s just now: "
+		      "old parent "DFID", new parent "DFID"\n",
+		      (*dentryp)->d_name.len, (*dentryp)->d_name.name,
+		      PFID(&lli->lli_fid), PFID(&nlli->lli_fid));
+		dput(parent);
+		iput(sai->sai_inode);
+		GOTO(out, rc = -EAGAIN);
+	}
+
+	CDEBUG(D_READA, "start statahead thread: [pid %d] [parent %.*s]\n",
+	       current_pid(), parent->d_name.len, parent->d_name.name);
+
+	lli->lli_sai = sai;
+
+	plli = ll_i2info(parent->d_inode);
+	/* Test the returned pointer itself with IS_ERR(), as ll_start_agl()
+	 * does; squeezing it into an int first could misclassify it. */
+	task = kthread_run(ll_statahead_thread, parent,
+			   "ll_sa_%u", plli->lli_opendir_pid);
+	thread = &sai->sai_thread;
+	if (IS_ERR(task)) {
+		rc = PTR_ERR(task);
+		CERROR("can't start ll_sa thread, rc: %d\n", rc);
+		dput(parent);
+		lli->lli_opendir_key = NULL;
+		thread_set_flags(thread, SVC_STOPPED);
+		thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED);
+		ll_sai_put(sai);
+		LASSERT(lli->lli_sai == NULL);
+		RETURN(-EAGAIN);
+	}
+
+	l_wait_event(thread->t_ctl_waitq,
+		     thread_is_running(thread) || thread_is_stopped(thread),
+		     &lwi);
+
+	/*
+	 * We don't stat-ahead for the first dirent since we are already in
+	 * lookup.
+	 */
+	RETURN(-EAGAIN);
+
+out:
+	/* statahead was not started: free the unused instance and give up
+	 * the opendir ownership */
+	if (sai != NULL)
+		OBD_FREE_PTR(sai);
+	spin_lock(&lli->lli_sa_lock);
+	lli->lli_opendir_key = NULL;
+	lli->lli_opendir_pid = 0;
+	spin_unlock(&lli->lli_sa_lock);
+	return rc;
+}
diff --git a/drivers/staging/lustre/lustre/llite/super25.c b/drivers/staging/lustre/lustre/llite/super25.c
new file mode 100644
index 000000000000..4101c52ed5d7
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/super25.c
@@ -0,0 +1,226 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/version.h>
+#include <lustre_lite.h>
+#include <lustre_ha.h>
+#include <lustre_dlm.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <lprocfs_status.h>
+#include "llite_internal.h"
+
+static struct kmem_cache *ll_inode_cachep;
+
+/*
+ * super_operations::alloc_inode: allocate a Lustre inode from
+ * ll_inode_cachep and hand back the VFS inode embedded in it.
+ * Returns NULL when the slab allocation fails.
+ */
+static struct inode *ll_alloc_inode(struct super_block *sb)
+{
+	struct ll_inode_info *lli;
+	struct inode *vfs_inode = NULL;
+
+	ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_ALLOC_INODE, 1);
+	OBD_SLAB_ALLOC_PTR_GFP(lli, ll_inode_cachep, __GFP_IO);
+	if (lli != NULL) {
+		vfs_inode = &lli->lli_vfs_inode;
+		inode_init_once(vfs_inode);
+	}
+
+	return vfs_inode;
+}
+
+/*
+ * RCU callback queued by ll_destroy_inode(): return the ll_inode_info that
+ * embeds the inode owning \a head to ll_inode_cachep.
+ */
+static void ll_inode_destroy_callback(struct rcu_head *head)
+{
+	struct ll_inode_info *ptr;
+
+	ptr = ll_i2info(container_of(head, struct inode, i_rcu));
+	OBD_SLAB_FREE_PTR(ptr, ll_inode_cachep);
+}
+
+/*
+ * super_operations::destroy_inode: defer the actual free of \a inode to
+ * ll_inode_destroy_callback() via call_rcu().
+ */
+static void ll_destroy_inode(struct inode *inode)
+{
+	struct rcu_head *head = &inode->i_rcu;
+
+	call_rcu(head, ll_inode_destroy_callback);
+}
+
+/*
+ * Create the slab cache backing struct ll_inode_info.
+ * Returns 0 on success, -ENOMEM when the cache cannot be created.
+ */
+int ll_init_inodecache(void)
+{
+	ll_inode_cachep = kmem_cache_create("lustre_inode_cache",
+					    sizeof(struct ll_inode_info),
+					    0, SLAB_HWCACHE_ALIGN, NULL);
+
+	return ll_inode_cachep != NULL ? 0 : -ENOMEM;
+}
+
+/*
+ * Destroy the slab cache backing struct ll_inode_info.
+ */
+void ll_destroy_inodecache(void)
+{
+	/*
+	 * ll_destroy_inode() frees inodes through call_rcu(). Make sure all
+	 * of those delayed frees have run before the cache they free into
+	 * is destroyed.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(ll_inode_cachep);
+}
+
+/* exported operations */
+/*
+ * Superblock operations of the Lustre client. Inode allocation and
+ * destruction are implemented in this file; the remaining handlers are
+ * defined elsewhere.
+ */
+struct super_operations lustre_super_operations =
+{
+	.alloc_inode   = ll_alloc_inode,
+	.destroy_inode = ll_destroy_inode,
+	.evict_inode   = ll_delete_inode,
+	.put_super     = ll_put_super,
+	.statfs	= ll_statfs,
+	.umount_begin  = ll_umount_begin,
+	.remount_fs    = ll_remount_fs,
+	.show_options  = ll_show_options,
+};
+
+
+/*
+ * Prototypes of functions defined outside this file; they are called from
+ * the module init/exit routines below.
+ */
+void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg));
+
+int vvp_global_init(void);
+void vvp_global_fini(void);
+
+/*
+ * Module init for the Lustre client.
+ *
+ * Creates the slab caches, registers the "llite" proc directory and the
+ * client callbacks, seeds the random number generator, then starts the capa
+ * thread and the VVP layer. On any failure everything set up so far is
+ * undone in reverse order, so a failed load leaves no caches or callbacks
+ * behind.
+ *
+ * \retval 0	   success
+ * \retval -ENOMEM a slab cache could not be created
+ * \retval others  error from ll_capa_thread_start() or vvp_global_init()
+ */
+static int __init init_lustre_lite(void)
+{
+	int i, rc, seed[2];
+	struct timeval tv;
+	lnet_process_id_t lnet_id;
+
+	CLASSERT(sizeof(LUSTRE_VOLATILE_HDR) == LUSTRE_VOLATILE_HDR_LEN + 1);
+
+	/* print an address of _any_ initialized kernel symbol from this
+	 * module, to allow debugging with gdb that doesn't support data
+	 * symbols from modules.*/
+	CDEBUG(D_INFO, "Lustre client module (%p).\n",
+	       &lustre_super_operations);
+
+	rc = ll_init_inodecache();
+	if (rc)
+		return -ENOMEM;
+
+	/* every cache creation failure below reports -ENOMEM */
+	rc = -ENOMEM;
+	ll_file_data_slab = kmem_cache_create("ll_file_data",
+						 sizeof(struct ll_file_data), 0,
+						 SLAB_HWCACHE_ALIGN, NULL);
+	if (ll_file_data_slab == NULL)
+		goto out_inodecache;
+
+	ll_remote_perm_cachep = kmem_cache_create("ll_remote_perm_cache",
+						  sizeof(struct ll_remote_perm),
+						      0, 0, NULL);
+	if (ll_remote_perm_cachep == NULL)
+		goto out_file_data;
+
+	ll_rmtperm_hash_cachep = kmem_cache_create("ll_rmtperm_hash_cache",
+						   REMOTE_PERM_HASHSIZE *
+						   sizeof(struct list_head),
+						   0, 0, NULL);
+	if (ll_rmtperm_hash_cachep == NULL)
+		goto out_remote_perm;
+
+	proc_lustre_fs_root = proc_lustre_root ?
+			      lprocfs_register("llite", proc_lustre_root, NULL, NULL) : NULL;
+
+	lustre_register_client_fill_super(ll_fill_super);
+	lustre_register_kill_super_cb(ll_kill_super);
+
+	lustre_register_client_process_config(ll_process_config);
+
+	cfs_get_random_bytes(seed, sizeof(seed));
+
+	/* Nodes with small feet have little entropy
+	 * the NID for this node gives the most entropy in the low bits */
+	for (i=0; ; i++) {
+		if (LNetGetId(i, &lnet_id) == -ENOENT) {
+			break;
+		}
+		if (LNET_NETTYP(LNET_NIDNET(lnet_id.nid)) != LOLND) {
+			seed[0] ^= LNET_NIDADDR(lnet_id.nid);
+		}
+	}
+
+	do_gettimeofday(&tv);
+	cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]);
+
+	init_timer(&ll_capa_timer);
+	ll_capa_timer.function = ll_capa_timer_callback;
+	rc = ll_capa_thread_start();
+	if (rc != 0)
+		goto out_unregister;
+
+	rc = vvp_global_init();
+	if (rc != 0)
+		goto out_capa_thread;
+
+	return 0;
+
+	/* Unwind in reverse order of setup, mirroring exit_lustre_lite(). */
+out_capa_thread:
+	del_timer(&ll_capa_timer);
+	ll_capa_thread_stop();
+out_unregister:
+	lustre_register_client_fill_super(NULL);
+	lustre_register_kill_super_cb(NULL);
+	lustre_register_client_process_config(NULL);
+	if (proc_lustre_fs_root)
+		lprocfs_remove(&proc_lustre_fs_root);
+	kmem_cache_destroy(ll_rmtperm_hash_cachep);
+	ll_rmtperm_hash_cachep = NULL;
+out_remote_perm:
+	kmem_cache_destroy(ll_remote_perm_cachep);
+	ll_remote_perm_cachep = NULL;
+out_file_data:
+	kmem_cache_destroy(ll_file_data_slab);
+	ll_file_data_slab = NULL;
+out_inodecache:
+	ll_destroy_inodecache();
+	return rc;
+}
+
+/*
+ * Module exit for the Lustre client: tear down what init_lustre_lite() set
+ * up. The VVP layer and capa thread are stopped first, then the client
+ * callbacks are unregistered, and finally the slab caches and the proc
+ * directory are removed.
+ */
+static void __exit exit_lustre_lite(void)
+{
+	vvp_global_fini();
+	del_timer(&ll_capa_timer);
+	ll_capa_thread_stop();
+	/* all client capabilities must be gone by now */
+	LASSERTF(capa_count[CAPA_SITE_CLIENT] == 0,
+		 "client remaining capa count %d\n",
+		 capa_count[CAPA_SITE_CLIENT]);
+
+	/* unregister the callbacks registered at init time */
+	lustre_register_client_fill_super(NULL);
+	lustre_register_kill_super_cb(NULL);
+
+	lustre_register_client_process_config(NULL);
+
+	ll_destroy_inodecache();
+
+	kmem_cache_destroy(ll_rmtperm_hash_cachep);
+	ll_rmtperm_hash_cachep = NULL;
+
+	kmem_cache_destroy(ll_remote_perm_cachep);
+	ll_remote_perm_cachep = NULL;
+
+	kmem_cache_destroy(ll_file_data_slab);
+	/* the proc root is only set when registration was attempted */
+	if (proc_lustre_fs_root)
+		lprocfs_remove(&proc_lustre_fs_root);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Lite Client File System");
+MODULE_LICENSE("GPL");
+
+module_init(init_lustre_lite);
+module_exit(exit_lustre_lite);
diff --git a/drivers/staging/lustre/lustre/llite/symlink.c b/drivers/staging/lustre/lustre/llite/symlink.c
new file mode 100644
index 000000000000..5260e989a4e5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/symlink.c
@@ -0,0 +1,192 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/stat.h>
+#include <linux/version.h>
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <lustre_lite.h>
+#include "llite_internal.h"
+
+/*
+ * Fetch the target of a symlink, from the per-inode cache if available or
+ * from the MDS otherwise.
+ *
+ * \param inode	  symlink inode
+ * \param request out: request whose reply buffer may back \a symname; set
+ *		  to NULL first, and left NULL when the cached name is used.
+ *		  Both callers in this file release it with
+ *		  ptlrpc_req_finished(), also after a failure.
+ * \param symname out: NUL-terminated link target. Points at
+ *		  lli_symlink_name when the name is (or could be) cached,
+ *		  otherwise into the reply buffer of \a request.
+ *
+ * \retval 0	   success
+ * \retval -EPROTO the reply is malformed (missing, mis-sized or
+ *		   unterminated link name)
+ * \retval others  error from preparing or sending the getattr
+ */
+static int ll_readlink_internal(struct inode *inode,
+				struct ptlrpc_request **request, char **symname)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	/* length of the target including the terminating NUL */
+	int rc, symlen = i_size_read(inode) + 1;
+	struct mdt_body *body;
+	struct md_op_data *op_data;
+	ENTRY;
+
+	*request = NULL;
+
+	if (lli->lli_symlink_name) {
+		int print_limit = min_t(int, PAGE_SIZE - 128, symlen);
+
+		*symname = lli->lli_symlink_name;
+		/* If the total CDEBUG() size is larger than a page, it
+		 * will print a warning to the console, avoid this by
+		 * printing just the last part of the symlink. */
+		CDEBUG(D_INODE, "using cached symlink %s%.*s, len = %d\n",
+		       print_limit < symlen ? "..." : "", print_limit,
+		       (*symname) + symlen - print_limit, symlen);
+		RETURN(0);
+	}
+
+	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, symlen,
+				     LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+
+	op_data->op_valid = OBD_MD_LINKNAME;
+	rc = md_getattr(sbi->ll_md_exp, op_data, request);
+	ll_finish_md_op_data(op_data);
+	if (rc) {
+		if (rc != -ENOENT)
+			CERROR("inode %lu: rc = %d\n", inode->i_ino, rc);
+		GOTO (failed, rc);
+	}
+
+	body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY);
+	LASSERT(body != NULL);
+	if ((body->valid & OBD_MD_LINKNAME) == 0) {
+		CERROR("OBD_MD_LINKNAME not set on reply\n");
+		GOTO(failed, rc = -EPROTO);
+	}
+
+	LASSERT(symlen != 0);
+	/* the reply must carry exactly i_size bytes plus the NUL */
+	if (body->eadatasize != symlen) {
+		CERROR("inode %lu: symlink length %d not expected %d\n",
+			inode->i_ino, body->eadatasize - 1, symlen - 1);
+		GOTO(failed, rc = -EPROTO);
+	}
+
+	*symname = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_MD);
+	if (*symname == NULL ||
+	    strnlen(*symname, symlen) != symlen - 1) {
+		/* not full/NULL terminated */
+		CERROR("inode %lu: symlink not NULL terminated string "
+			"of length %d\n", inode->i_ino, symlen - 1);
+		GOTO(failed, rc = -EPROTO);
+	}
+
+	OBD_ALLOC(lli->lli_symlink_name, symlen);
+	/* do not return an error if we cannot cache the symlink locally */
+	if (lli->lli_symlink_name) {
+		memcpy(lli->lli_symlink_name, *symname, symlen);
+		*symname = lli->lli_symlink_name;
+	}
+	RETURN(0);
+
+failed:
+	RETURN (rc);
+}
+
+/*
+ * inode_operations::readlink: copy the link target of \a dentry into the
+ * caller's buffer. The target is fetched under the inode size lock, and
+ * the request that may back it is released once the copy is done.
+ */
+static int ll_readlink(struct dentry *dentry, char *buffer, int buflen)
+{
+	struct ptlrpc_request *req;
+	struct inode *link = dentry->d_inode;
+	char *target;
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op\n");
+
+	ll_inode_size_lock(link);
+	rc = ll_readlink_internal(link, &req, &target);
+	if (rc != 0)
+		GOTO(out, rc);
+
+	rc = vfs_readlink(dentry, buffer, buflen, target);
+ out:
+	/* req was initialised by ll_readlink_internal() on every path */
+	ptlrpc_req_finished(req);
+	ll_inode_size_unlock(link);
+	RETURN(rc);
+}
+
+/*
+ * inode_operations::follow_link: resolve the link target and pass it to
+ * the VFS with nd_set_link(). The returned cookie is the request that may
+ * back the target string (or NULL); ll_put_link() releases it.
+ */
+static void *ll_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ptlrpc_request *request = NULL;
+	int rc;
+	char *symname;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op\n");
+	/* Bound symlink recursion to keep kernel stack use in check:
+	 * refuse with -ELOOP once link_count reaches 6 on stacks smaller
+	 * than 8k, or 8 on stacks of exactly 8k. */
+	if ((THREAD_SIZE < 8192 && current->link_count >= 6) ||
+	    (THREAD_SIZE == 8192 && current->link_count >= 8)) {
+		rc = -ELOOP;
+	} else {
+		ll_inode_size_lock(inode);
+		rc = ll_readlink_internal(inode, &request, &symname);
+		ll_inode_size_unlock(inode);
+	}
+
+	if (rc != 0) {
+		/* hand the error to the VFS through the link pointer */
+		ptlrpc_req_finished(request);
+		request = NULL;
+		symname = ERR_PTR(rc);
+	}
+
+	nd_set_link(nd, symname);
+	/* symname may contain a pointer to the request message buffer,
+	 * we delay request releasing until ll_put_link then.
+	 */
+	RETURN(request);
+}
+
+/*
+ * inode_operations::put_link: release the request returned as cookie by
+ * ll_follow_link().
+ */
+static void ll_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+{
+	struct ptlrpc_request *request = cookie;
+
+	ptlrpc_req_finished(request);
+}
+
+/*
+ * Inode operations for Lustre symlinks. The readlink, follow_link and
+ * put_link handlers are implemented in this file; the attribute,
+ * permission and xattr handlers are defined elsewhere.
+ */
+struct inode_operations ll_fast_symlink_inode_operations = {
+	.readlink	= ll_readlink,
+	.setattr	= ll_setattr,
+	.follow_link	= ll_follow_link,
+	.put_link	= ll_put_link,
+	.getattr	= ll_getattr,
+	.permission	= ll_inode_permission,
+	.setxattr	= ll_setxattr,
+	.getxattr	= ll_getxattr,
+	.listxattr	= ll_listxattr,
+	.removexattr	= ll_removexattr,
+};
diff --git a/drivers/staging/lustre/lustre/llite/vvp_dev.c b/drivers/staging/lustre/lustre/llite/vvp_dev.c
new file mode 100644
index 000000000000..60daf750e2e1
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/vvp_dev.c
@@ -0,0 +1,547 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * cl_device and cl_device_type implementation for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+
+#include <obd.h>
+#include <lustre_lite.h>
+
+#include "vvp_internal.h"
+
+/*****************************************************************************
+ *
+ * Vvp device and device type functions.
+ *
+ */
+
+/*
+ * vvp_ prefix stands for "Vfs Vm Posix". It corresponds to historical
+ * "llite_" (var. "ll_") prefix.
+ */
+
+struct kmem_cache *vvp_thread_kmem;	/* declared extern in vvp_internal.h */
+static struct kmem_cache *vvp_session_kmem;
+static struct lu_kmem_descr vvp_caches[] = {	/* used by vvp_global_{init,fini} */
+	{	/* cache of struct vvp_thread_info, see vvp_key_init() */
+		.ckd_cache = &vvp_thread_kmem,
+		.ckd_name  = "vvp_thread_kmem",
+		.ckd_size  = sizeof (struct vvp_thread_info),
+	},
+	{	/* cache of struct vvp_session, see vvp_session_key_init() */
+		.ckd_cache = &vvp_session_kmem,
+		.ckd_name  = "vvp_session_kmem",
+		.ckd_size  = sizeof (struct vvp_session)
+	},
+	{	/* NULL cache pointer: presumably the end-of-table marker */
+		.ckd_cache = NULL
+	}
+};
+
+static void *vvp_key_init(const struct lu_context *ctx,
+			  struct lu_context_key *key)
+{
+	struct vvp_thread_info *info;
+
+	/* Allocate a vvp_thread_info from vvp_thread_kmem; a failed
+	 * allocation is reported to the caller as ERR_PTR(-ENOMEM). */
+	OBD_SLAB_ALLOC_PTR_GFP(info, vvp_thread_kmem, __GFP_IO);
+	return info != NULL ? info : ERR_PTR(-ENOMEM);
+}
+
+static void vvp_key_fini(const struct lu_context *ctx,
+			 struct lu_context_key *key, void *data)
+{
+	struct vvp_thread_info *info = data; /* value made by vvp_key_init() */
+	OBD_SLAB_FREE_PTR(info, vvp_thread_kmem);
+}
+
+static void *vvp_session_key_init(const struct lu_context *ctx,
+				  struct lu_context_key *key)
+{
+	struct vvp_session *session;
+
+	/* Allocate a vvp_session from vvp_session_kmem; a failed
+	 * allocation is reported to the caller as ERR_PTR(-ENOMEM). */
+	OBD_SLAB_ALLOC_PTR_GFP(session, vvp_session_kmem, __GFP_IO);
+	return session != NULL ? session : ERR_PTR(-ENOMEM);
+}
+
+static void vvp_session_key_fini(const struct lu_context *ctx,
+				 struct lu_context_key *key, void *data)
+{
+	struct vvp_session *session = data; /* from vvp_session_key_init() */
+	OBD_SLAB_FREE_PTR(session, vvp_session_kmem);
+}
+
+
+struct lu_context_key vvp_key = {	/* context key holding a vvp_thread_info */
+	.lct_tags = LCT_CL_THREAD,
+	.lct_init = vvp_key_init,
+	.lct_fini = vvp_key_fini
+};
+
+struct lu_context_key vvp_session_key = {	/* context key holding a vvp_session */
+	.lct_tags = LCT_SESSION,
+	.lct_init = vvp_session_key_init,
+	.lct_fini = vvp_session_key_fini
+};
+
+/* Generates vvp_type_{init,fini,start,stop}() for the four keys listed. */
+LU_TYPE_INIT_FINI(vvp, &ccc_key, &ccc_session_key, &vvp_key, &vvp_session_key);
+
+static const struct lu_device_operations vvp_lu_ops = {	/* for ccc_device_alloc() */
+	.ldo_object_alloc      = vvp_object_alloc
+};
+
+static const struct cl_device_operations vvp_cl_ops = {	/* for ccc_device_alloc() */
+	.cdo_req_init = ccc_req_init
+};
+
+static struct lu_device *vvp_device_alloc(const struct lu_env *env,
+					  struct lu_device_type *t,
+					  struct lustre_cfg *cfg)
+{	/* delegate to the ccc layer, passing in the vvp operation tables */
+	return ccc_device_alloc(env, t, cfg, &vvp_lu_ops, &vvp_cl_ops);
+}
+
+static const struct lu_device_type_operations vvp_device_type_ops = {
+	.ldto_init = vvp_type_init,
+	.ldto_fini = vvp_type_fini,
+	/* the four vvp_type_*() hooks come from LU_TYPE_INIT_FINI(vvp, ...) */
+	.ldto_start = vvp_type_start,
+	.ldto_stop  = vvp_type_stop,
+	/* device methods: only allocation is vvp specific, the rest is ccc */
+	.ldto_device_alloc = vvp_device_alloc,
+	.ldto_device_free  = ccc_device_free,
+	.ldto_device_init  = ccc_device_init,
+	.ldto_device_fini  = ccc_device_fini
+};
+
+struct lu_device_type vvp_device_type = {	/* passed to ccc_global_init() */
+	.ldt_tags     = LU_DEVICE_CL,
+	.ldt_name     = LUSTRE_VVP_NAME,
+	.ldt_ops      = &vvp_device_type_ops,
+	.ldt_ctx_tags = LCT_CL_THREAD
+};
+
+/**
+ * Global VVP set-up: initialise the caches described by vvp_caches with
+ * lu_kmem_init(), then call ccc_global_init() for vvp_device_type. If the
+ * latter fails the caches are finalised again. Returns 0 or the error.
+ */
+int vvp_global_init(void)
+{
+	int rc = lu_kmem_init(vvp_caches);
+
+	if (rc != 0)
+		return rc;
+	rc = ccc_global_init(&vvp_device_type);
+	if (rc != 0)
+		lu_kmem_fini(vvp_caches);
+	return rc;
+}
+
+void vvp_global_fini(void)
+{
+	ccc_global_fini(&vvp_device_type); /* reverse order of vvp_global_init() */
+	lu_kmem_fini(vvp_caches);
+}
+
+
+/*****************************************************************************
+ *
+ * mirror obd-devices into cl devices.
+ *
+ */
+
+/* Set up the cl device stack of \a sb; returns 0 or a negative errno. */
+int cl_sb_init(struct super_block *sb)
+{
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	struct cl_device  *cl;
+	struct lu_env     *env;
+	int rc = 0, refcheck;
+	ENTRY;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+	cl = cl_type_setup(env, NULL, &vvp_device_type,
+			   sbi->ll_dt_exp->exp_obd->obd_lu_dev);
+	if (IS_ERR(cl))	/* report a failed setup instead of returning 0 */
+		GOTO(out, rc = PTR_ERR(cl));
+	cl2ccc_dev(cl)->cdv_sb = sb;
+	sbi->ll_cl = cl;
+	sbi->ll_site = cl2lu_dev(cl)->ld_site;
+out:
+	cl_env_put(env, &refcheck);
+	RETURN(rc);
+}
+
+int cl_sb_fini(struct super_block *sb)
+{
+	struct ll_sb_info *sbi;
+	struct cl_device  *stack;
+	struct lu_env     *env;
+	int		refcheck;
+	int		rc = 0;
+
+	ENTRY;
+	sbi = ll_s2sbi(sb);
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env)) {
+		CERROR("Cannot cleanup cl-stack due to memory shortage.\n");
+		rc = PTR_ERR(env);
+	} else {
+		stack = sbi->ll_cl;
+		if (stack != NULL) {
+			/* tear the device stack down and forget it */
+			cl_stack_fini(env, stack);
+			sbi->ll_cl = NULL;
+			sbi->ll_site = NULL;
+		}
+		cl_env_put(env, &refcheck);
+	}
+	/*
+	 * If the mount failed (sbi->ll_cl == NULL) and there are no other
+	 * mounts, device types are stopped manually here; this usually
+	 * happens automatically when the last device is destroyed.
+	 * The call is made on the error path above as well.
+	 */
+	lu_types_stop();
+	RETURN(rc);
+}
+
+/****************************************************************************
+ *
+ * /proc/fs/lustre/llite/$MNT/dump_page_cache
+ *
+ ****************************************************************************/
+
+/*
+ * To represent contents of a page cache as a byte stream, the following
+ * information is encoded in a 64bit offset:
+ *
+ *       - file hash bucket in lu_site::ls_hash[]       28bits
+ *
+ *       - how far file is from bucket head	      4bits
+ *
+ *       - page index				   32bits
+ *
+ * First two data identify a file in the cache uniquely.
+ */
+
+#define PGC_OBJ_SHIFT (32 + 4)	/* bucket: above index (32) + depth (4) bits */
+#define PGC_DEPTH_SHIFT (32)	/* depth: just above the 32bit page index */
+
+struct vvp_pgcache_id {
+	unsigned		 vpi_bucket;	/* hash bucket number */
+	unsigned		 vpi_depth;	/* entry position in the bucket */
+	uint32_t		 vpi_index;	/* page index in the object */
+	/* lookup scratch state, not part of the packed position: */
+	unsigned		 vpi_curdep;	/* entries still to skip */
+	struct lu_object_header *vpi_obj;	/* object found, or NULL */
+};
+
+static void vvp_pgcache_id_unpack(loff_t pos, struct vvp_pgcache_id *id)
+{
+	/* layout: bucket from bit 36 up, depth in bits 32..35, index below */
+	CLASSERT(sizeof(pos) == sizeof(__u64));
+	id->vpi_bucket = ((unsigned long long)pos >> PGC_OBJ_SHIFT);
+	id->vpi_depth  = (pos >> PGC_DEPTH_SHIFT) & 0xf;
+	id->vpi_index  = pos & 0xffffffff;
+}
+
+static loff_t vvp_pgcache_id_pack(struct vvp_pgcache_id *id)
+{
+	__u64 packed = (__u64)id->vpi_index; /* low 32 bits: page index */
+	packed |= (__u64)id->vpi_depth  << PGC_DEPTH_SHIFT;
+	packed |= (__u64)id->vpi_bucket << PGC_OBJ_SHIFT;
+	return packed;
+}
+
+static int vvp_pgcache_obj_get(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			       struct hlist_node *hnode, void *data)
+{
+	struct vvp_pgcache_id   *id  = data;
+	struct lu_object_header *hdr = cfs_hash_object(hs, hnode);
+	/* skip vpi_curdep entries; the unsigned post-decrement wraps at 0 */
+	if (id->vpi_curdep-- > 0)
+		return 0; /* continue */
+	/* wanted entry reached: a dying object ends the walk with no result */
+	if (lu_object_is_dying(hdr))
+		return 1;
+	/* take a hash reference and pass the header back through the id */
+	cfs_hash_get(hs, hnode);
+	id->vpi_obj = hdr;
+	return 1;
+}
+
+static struct cl_object *vvp_pgcache_obj(const struct lu_env *env,
+					 struct lu_device *dev,
+					 struct vvp_pgcache_id *id)
+{
+	struct lu_object *slice;
+
+	LASSERT(lu_device_is_cl(dev));
+	/* reset the lookup state; the callback counts vpi_curdep down */
+	id->vpi_depth &= 0xf;
+	id->vpi_obj    = NULL;
+	id->vpi_curdep = id->vpi_depth;
+	/* walk the bucket; on a hit vpi_obj is set by vvp_pgcache_obj_get() */
+	cfs_hash_hlist_for_each(dev->ld_site->ls_obj_hash, id->vpi_bucket,
+				vvp_pgcache_obj_get, id);
+	if (id->vpi_obj == NULL) {
+		if (id->vpi_curdep > 0)
+			id->vpi_depth = 0xf; /* caller's ++depth wraps to 0 */
+		return NULL;
+	}
+	slice = lu_object_locate(id->vpi_obj, dev->ld_type);
+	if (slice == NULL) {
+		lu_object_put(env, lu_object_top(id->vpi_obj));
+		return NULL;
+	}
+	lu_object_ref_add(slice, "dump", current);
+	return lu2cl(slice);
+}
+
+static loff_t vvp_pgcache_find(const struct lu_env *env,
+			       struct lu_device *dev, loff_t pos)
+{
+	struct cl_object     *clob;
+	struct lu_site       *site;
+	struct vvp_pgcache_id id;
+
+	site = dev->ld_site;
+	vvp_pgcache_id_unpack(pos, &id);
+	/* scan (bucket, depth) slots from pos on until a cached page turns up */
+	while (1) {
+		if (id.vpi_bucket >= CFS_HASH_NHLIST(site->ls_obj_hash))
+			return ~0ULL; /* past the last bucket: nothing more */
+		clob = vvp_pgcache_obj(env, dev, &id);
+		if (clob != NULL) {
+			struct cl_object_header *hdr;
+			int		      nr;
+			struct cl_page	  *pg;
+
+			/* got an object. Find next page. */
+			hdr = cl_object_header(clob);
+			/* first page at or after vpi_index, under the guard */
+			spin_lock(&hdr->coh_page_guard);
+			nr = radix_tree_gang_lookup(&hdr->coh_tree,
+						    (void **)&pg,
+						    id.vpi_index, 1);
+			if (nr > 0) {
+				id.vpi_index = pg->cp_index;
+				/* Can't support files over 16T */
+				nr = !(pg->cp_index > 0xffffffff);
+			}
+			spin_unlock(&hdr->coh_page_guard);
+			/* drop the "dump" reference from vvp_pgcache_obj() */
+			lu_object_ref_del(&clob->co_lu, "dump", current);
+			cl_object_put(env, clob);
+			if (nr > 0)
+				return vvp_pgcache_id_pack(&id);
+		}
+		/* to the next object. */
+		++id.vpi_depth;
+		id.vpi_depth &= 0xf;
+		if (id.vpi_depth == 0 && ++id.vpi_bucket == 0)
+			return ~0ULL;
+		id.vpi_index = 0;
+	}
+}
+
+#define seq_page_flag(seq, page, flag, has_flags) do {			\
+	if (test_bit(PG_##flag, &(page)->flags)) { /* print if set */	\
+		seq_printf(seq, "%s"#flag, has_flags ? "|" : "");	\
+		has_flags = 1;						\
+	}								\
+} while (0)
+
+static void vvp_pgcache_page_show(const struct lu_env *env,
+				  struct seq_file *seq, struct cl_page *page)
+{
+	struct ccc_page *cpg = cl2ccc_page(cl_page_at(page, &vvp_device_type));
+	struct page     *vmpage = cpg->cpg_page;
+	struct inode    *host = vmpage->mapping->host;
+	int              has_flags = 0;
+
+	/* fixed columns: ccc/cl page, queue state, then the VM page itself */
+	seq_printf(seq," %5i | %p %p %s %s %s %s | %p %lu/%u(%p) %lu %u [",
+		   0 /* gen */,
+		   cpg, page,
+		   "none",
+		   cpg->cpg_write_queued ? "wq" : "- ",
+		   cpg->cpg_defer_uptodate ? "du" : "- ",
+		   PageWriteback(vmpage) ? "wb" : "-",
+		   vmpage, host->i_ino,
+		   host->i_generation,
+		   host, vmpage->index,
+		   page_count(vmpage));
+	/* then the set page flags, '|'-separated, or "-" when none is set */
+	seq_page_flag(seq, vmpage, locked, has_flags);
+	seq_page_flag(seq, vmpage, error, has_flags);
+	seq_page_flag(seq, vmpage, referenced, has_flags);
+	seq_page_flag(seq, vmpage, uptodate, has_flags);
+	seq_page_flag(seq, vmpage, dirty, has_flags);
+	seq_page_flag(seq, vmpage, writeback, has_flags);
+	seq_printf(seq, "%s]\n", has_flags ? "" : "-");
+}
+
+static int vvp_pgcache_show(struct seq_file *f, void *v)
+{
+	loff_t			 pos;
+	struct ll_sb_info	*sbi;
+	struct cl_object	*clob;
+	struct lu_env		*env;
+	struct cl_page		*page;
+	struct cl_object_header *hdr;
+	struct vvp_pgcache_id	 id;
+	int			 refcheck;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		return PTR_ERR(env);
+	/* v is the position cookie handed out by ->start()/->next() */
+	pos = *(loff_t *) v;
+	vvp_pgcache_id_unpack(pos, &id);
+	sbi = f->private;
+	clob = vvp_pgcache_obj(env, &sbi->ll_cl->cd_lu_dev, &id);
+	if (clob == NULL) {
+		/* no usable object at this position any more */
+		seq_printf(f, "%llx missing\n", pos);
+		goto out;
+	}
+	hdr = cl_object_header(clob);
+	spin_lock(&hdr->coh_page_guard);
+	page = cl_page_lookup(hdr, id.vpi_index);
+	spin_unlock(&hdr->coh_page_guard);
+
+	seq_printf(f, "%8x@"DFID": ",
+		   id.vpi_index, PFID(&hdr->coh_lu.loh_fid));
+	if (page != NULL) {
+		vvp_pgcache_page_show(env, f, page);
+		cl_page_put(env, page);
+	} else
+		seq_puts(f, "missing\n");
+	lu_object_ref_del(&clob->co_lu, "dump", current);
+	cl_object_put(env, clob);
+out:
+	cl_env_put(env, &refcheck);
+	return 0;
+}
+
+static void *vvp_pgcache_start(struct seq_file *f, loff_t *pos)
+{
+	struct ll_sb_info *sbi = f->private;
+	struct lu_env     *env;
+	int		refcheck;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		return pos;	/* no environment: pos is handed back as is */
+
+	if (sbi->ll_site->ls_obj_hash->hs_cur_bits > 64 - PGC_OBJ_SHIFT) {
+		/* the hash has more bucket bits than fit above
+		 * PGC_OBJ_SHIFT in the packed 64bit position */
+		pos = ERR_PTR(-EFBIG);
+	} else {
+		/* move to the first cached page at or after *pos */
+		*pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, *pos);
+		if (*pos == ~0ULL)
+			pos = NULL;
+	}
+	cl_env_put(env, &refcheck);
+	return pos;
+}
+
+static void *vvp_pgcache_next(struct seq_file *f, void *v, loff_t *pos)
+{
+	struct ll_sb_info *sbi = f->private;
+	struct lu_env     *env;
+	int		refcheck;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		return pos;
+	/* search on from the next position; ~0ULL means nothing is left */
+	*pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, *pos + 1);
+	if (*pos == ~0ULL)
+		pos = NULL;
+	cl_env_put(env, &refcheck);
+	return pos;
+}
+
+static void vvp_pgcache_stop(struct seq_file *f, void *v)
+{
+	/* Nothing to do: start/next release their env before returning */
+}
+
+static const struct seq_operations vvp_pgcache_ops = { /* read-only table */
+	.start = vvp_pgcache_start,
+	.next  = vvp_pgcache_next,
+	.stop  = vvp_pgcache_stop,
+	.show  = vvp_pgcache_show
+};
+
+static int vvp_dump_pgcache_seq_open(struct inode *inode, struct file *filp)
+{
+	struct ll_sb_info *sbi = PDE(inode)->data;
+	struct seq_file   *seq;
+	int		   rc;
+
+	rc = seq_open(filp, &vvp_pgcache_ops);
+	if (rc != 0)
+		return rc;
+	/* hand the sbi to the iterator callbacks through seq->private */
+	seq = filp->private_data;
+	seq->private = sbi;
+	return 0;
+}
+
+struct file_operations vvp_dump_pgcache_file_ops = { /* dump_page_cache file */
+	.owner   = THIS_MODULE,
+	.open    = vvp_dump_pgcache_seq_open,
+	.read    = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release,
+};
diff --git a/drivers/staging/lustre/lustre/llite/vvp_internal.h b/drivers/staging/lustre/lustre/llite/vvp_internal.h
new file mode 100644
index 000000000000..c82bf17f55a6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/vvp_internal.h
@@ -0,0 +1,62 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Internal definitions for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#ifndef VVP_INTERNAL_H
+#define VVP_INTERNAL_H
+
+
+#include <cl_object.h>
+#include "llite_internal.h"
+
+int vvp_io_init(const struct lu_env *env,
+		struct cl_object *obj, struct cl_io *io);
+int vvp_lock_init(const struct lu_env *env,
+		  struct cl_object *obj, struct cl_lock *lock,
+		  const struct cl_io *io);
+int vvp_page_init(const struct lu_env *env,
+		  struct cl_object *obj,
+		  struct cl_page *page, struct page *vmpage);
+struct lu_object *vvp_object_alloc(const struct lu_env *env,
+				   const struct lu_object_header *hdr,
+				   struct lu_device *dev);
+
+struct ccc_object *cl_inode2ccc(struct inode *inode);
+/* slab cache of struct vvp_thread_info, defined in vvp_dev.c */
+extern struct kmem_cache *vvp_thread_kmem;
+
+#endif /* VVP_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c
new file mode 100644
index 000000000000..8504d448aac8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/vvp_io.c
@@ -0,0 +1,1175 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_io for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+
+#include <obd.h>
+#include <lustre_lite.h>
+
+#include "vvp_internal.h"
+
+static struct vvp_io *cl2vvp_io(const struct lu_env *env,
+				const struct cl_io_slice *slice);
+
+/**
+ * Tell whether the read/write \a io is a normal io (IO_NORMAL) as
+ * opposed to sendfile() / splice_{read|write}. Asserts that \a io is
+ * of type CIT_READ or CIT_WRITE.
+ */
+int cl_is_normalio(const struct lu_env *env, const struct cl_io *io)
+{
+	LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+
+	return vvp_env_io(env)->cui_io_subtype == IO_NORMAL;
+}
+
+/**
+ * For swapping layout. The file's layout may have changed.
+ * To avoid populating pages to a wrong stripe, we have to verify the
+ * correctness of layout. It works because swapping layout processes
+ * have to acquire group lock.
+ * Only CIT_READ and CIT_WRITE ios are subject to the check.
+ *
+ * \retval true   pages may be populated for \a io
+ * \retval false  the layout generation changed under a read or write;
+ *		  \a io is flagged for restart and ends as a short io
+ */
+static bool can_populate_pages(const struct lu_env *env, struct cl_io *io,
+				struct inode *inode)
+{
+	struct ll_inode_info	*lli = ll_i2info(inode);
+	struct ccc_io		*cio = ccc_env_io(env);
+
+	/* Only reads and writes are checked. Fault is okay because we've
+	 * already had a page, and the other io types always pass. */
+	if (io->ci_type != CIT_READ && io->ci_type != CIT_WRITE)
+		return true;
+
+	/* don't need lock here to check lli_layout_gen as we have held
+	 * extent lock and GROUP lock has to hold to swap layout */
+	if (lli->lli_layout_gen == cio->cui_layout_gen)
+		return true;
+
+	io->ci_need_restart = 1;
+	/* this will return application a short read/write */
+	io->ci_continue = 0;
+	return false;
+}
+
+/*****************************************************************************
+ *
+ * io operations.
+ *
+ */
+
+static int vvp_io_fault_iter_init(const struct lu_env *env,
+				  const struct cl_io_slice *ios)
+{
+	struct inode *inode = ccc_object_inode(ios->cis_obj);
+
+	/* the slice's object must be the inode behind the io's open file */
+	LASSERT(inode ==
+		cl2ccc_io(env, ios)->cui_fd->fd_file->f_dentry->d_inode);
+	cl2vvp_io(env, ios)->u.fault.ft_mtime = LTIME_S(inode->i_mtime);
+	return 0;
+}
+
+static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+	struct cl_io     *io  = ios->cis_io;
+	struct cl_object *obj = io->ci_obj;
+	struct ccc_io    *cio = cl2ccc_io(env, ios);
+	__u32             gen = 0;
+
+	CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+	CDEBUG(D_VFSTRACE, "ignore/verify layout %d/%d, layout version %d.\n",
+		io->ci_ignore_layout, io->ci_verify_layout, cio->cui_layout_gen);
+
+	if (io->ci_ignore_layout || !io->ci_verify_layout)
+		return;
+	/* check layout version: refresh it and request a restart of the
+	 * io if it differs from the generation recorded in the ccc_io */
+	ll_layout_refresh(ccc_object_inode(obj), &gen);
+	io->ci_need_restart = cio->cui_layout_gen != gen;
+	if (io->ci_need_restart)
+		CDEBUG(D_VFSTRACE, "layout changed from %d to %d.\n",
+			cio->cui_layout_gen, gen);
+}
+
+static void vvp_io_fault_fini(const struct lu_env *env,
+			      const struct cl_io_slice *ios)
+{
+	struct cl_io   *io = ios->cis_io;
+	struct cl_page *pg = io->u.ci_fault.ft_page;
+
+	CLOBINVRNT(env, io->ci_obj, ccc_object_invariant(io->ci_obj));
+	/* release the page still attached to the fault io, if any */
+	if (pg != NULL) {
+		lu_ref_del(&pg->cp_reference, "fault", io);
+		cl_page_put(env, pg);
+		io->u.ci_fault.ft_page = NULL;
+	}
+	vvp_io_fini(env, ios);
+}
+
+enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma)
+{
+	const unsigned long shared_rw = VM_SHARED | VM_WRITE;
+
+	/*
+	 * we only want to hold PW locks if the mmap() can generate
+	 * writes back to the file and that only happens in shared
+	 * writable vmas, i.e. those with both flags set
+	 */
+	return (vma->vm_flags & shared_rw) == shared_rw ? CLM_WRITE : CLM_READ;
+}
+
+/* For every iovec segment of a normal read/write \a io, queue a cl lock on
+ * each vma that our_vma() reports inside the segment's address range.
+ * Returns 0 on success or the negative cl_io_lock_alloc_add() result. */
+static int vvp_mmap_locks(const struct lu_env *env,
+			  struct ccc_io *vio, struct cl_io *io)
+{
+	struct ccc_thread_info *cti = ccc_env_info(env);
+	struct mm_struct       *mm = current->mm;
+	struct vm_area_struct  *vma;
+	struct cl_lock_descr   *descr = &cti->cti_descr;
+	ldlm_policy_data_t      policy;
+	unsigned long	   addr;
+	unsigned long	   seg;
+	ssize_t		 count;
+	int		     result = 0;
+	ENTRY;
+
+	LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+
+	if (!cl_is_normalio(env, io))
+		RETURN(0);
+
+	if (vio->cui_iov == NULL) /* nfs or loop back device write */
+		RETURN(0);
+
+	/* No MM (e.g. NFS)? No vmas too. */
+	if (mm == NULL)
+		RETURN(0);
+
+	for (seg = 0; seg < vio->cui_nrsegs; seg++) {
+		const struct iovec *iv = &vio->cui_iov[seg];
+
+		addr = (unsigned long)iv->iov_base;
+		count = iv->iov_len;
+		if (count == 0)
+			continue;
+
+		count += addr & (~CFS_PAGE_MASK);
+		addr &= CFS_PAGE_MASK;
+
+		down_read(&mm->mmap_sem);
+		while((vma = our_vma(mm, addr, count)) != NULL) {
+			struct inode *inode = vma->vm_file->f_dentry->d_inode;
+			int flags = CEF_MUST;
+
+			/* nolock file: a lockless lock will be generated */
+			if (ll_file_nolock(vma->vm_file))
+				flags = CEF_NEVER;
+
+			/*
+			 * XXX: Required lock mode can be weakened: CIT_WRITE
+			 * io only ever reads user level buffer, and CIT_READ
+			 * only writes on it.
+			 */
+			policy_from_vma(&policy, vma, addr, count);
+			descr->cld_mode = vvp_mode_from_vma(vma);
+			descr->cld_obj = ll_i2info(inode)->lli_clob;
+			descr->cld_start = cl_index(descr->cld_obj,
+						    policy.l_extent.start);
+			descr->cld_end = cl_index(descr->cld_obj,
+						  policy.l_extent.end);
+			descr->cld_enq_flags = flags;
+			result = cl_io_lock_alloc_add(env, io, descr);
+
+			CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n",
+			       descr->cld_mode, descr->cld_start,
+			       descr->cld_end);
+
+			/* On error leave through the common exit below: the
+			 * function must not return with mmap_sem held. */
+			if (result < 0 || vma->vm_end - addr >= count)
+				break;
+
+			count -= vma->vm_end - addr;
+			addr = vma->vm_end;
+		}
+		up_read(&mm->mmap_sem);
+		if (result < 0)
+			RETURN(result);
+	}
+	RETURN(0);
+}
+
+static int vvp_io_rw_lock(const struct lu_env *env, struct cl_io *io,
+			  enum cl_lock_mode mode, loff_t start, loff_t end)
+{
+	struct ccc_io *cio = ccc_env_io(env);
+	int ast_flags;
+	int rc;
+
+	LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+	ENTRY;
+
+	ccc_io_update_iov(env, cio, io);
+
+	ast_flags = io->u.ci_rw.crw_nonblock ? CEF_NONBLOCK : 0;
+	/* locks for mmapped user buffers first, then the io extent itself */
+	rc = vvp_mmap_locks(env, cio, io);
+	if (rc != 0)
+		RETURN(rc);
+	RETURN(ccc_io_one_lock(env, io, ast_flags, mode, start, end));
+}
+
+static int vvp_io_read_lock(const struct lu_env *env,
+			    const struct cl_io_slice *ios)
+{
+	struct cl_io	 *io  = ios->cis_io;
+	struct ll_inode_info *lli = ll_i2info(ccc_object_inode(io->ci_obj));
+	loff_t first;
+	loff_t last;
+
+	ENTRY;
+	/* XXX: Layer violation, we shouldn't see lsm at llite level. */
+	if (!lli->lli_has_smd) /* lsm-less file doesn't need to lock */
+		RETURN(0);
+
+	/* inclusive byte range [crw_pos, crw_pos + crw_count - 1] */
+	first = io->u.ci_rd.rd.crw_pos;
+	last  = first + io->u.ci_rd.rd.crw_count - 1;
+	RETURN(vvp_io_rw_lock(env, io, CLM_READ, first, last));
+}
+
+static int vvp_io_fault_lock(const struct lu_env *env,
+			     const struct cl_io_slice *ios)
+{
+	struct cl_io *io   = ios->cis_io;
+	struct vvp_io *vio = cl2vvp_io(env, ios);
+	enum cl_lock_mode mode = vvp_mode_from_vma(vio->u.fault.ft_vma);
+
+	/* XXX LDLM_FL_CBPENDING; the lock covers the faulting page only */
+	return ccc_io_one_lock_index(env, io, 0, mode,
+				     io->u.ci_fault.ft_index,
+				     io->u.ci_fault.ft_index);
+}
+
+static int vvp_io_write_lock(const struct lu_env *env,
+			     const struct cl_io_slice *ios)
+{
+	struct cl_io *io = ios->cis_io;
+	loff_t first = 0;
+	loff_t last  = OBD_OBJECT_EOF;
+
+	/* An append write locks the whole object, [0, EOF]. Any other
+	 * write locks just the inclusive byte range it covers,
+	 * [crw_pos, crw_pos + crw_count - 1]. */
+	if (!io->u.ci_wr.wr_append) {
+		first = io->u.ci_wr.wr.crw_pos;
+		last  = first + io->u.ci_wr.wr.crw_count - 1;
+	}
+	return vvp_io_rw_lock(env, io, CLM_WRITE, first, last);
+}
+
+static int vvp_io_setattr_iter_init(const struct lu_env *env,
+				    const struct cl_io_slice *ios)
+{
+	return 0; /* nothing to set up */
+}
+
+/**
+ * Implementation of cl_io_operations::cio_lock() method for CIT_SETATTR io.
+ * Takes a CLM_WRITE extent lock from the new size (truncate) or from 0 up
+ * to EOF. Handles "lockless io" mode when extent locking is done by server.
+ */
+static int vvp_io_setattr_lock(const struct lu_env *env,
+			       const struct cl_io_slice *ios)
+{
+	struct ccc_io *cio = ccc_env_io(env);
+	struct cl_io  *io  = ios->cis_io;
+	__u64 lock_start = 0;
+	__u32 enqflags = 0;
+
+	if (cl_io_is_trunc(io)) {
+		/* lock from the new size; truncate to 0 may discard data */
+		lock_start = io->u.ci_setattr.sa_attr.lvb_size;
+		if (lock_start == 0)
+			enqflags = CEF_DISCARD_DATA;
+	} else if ((io->u.ci_setattr.sa_attr.lvb_mtime >=
+		    io->u.ci_setattr.sa_attr.lvb_ctime) ||
+		   (io->u.ci_setattr.sa_attr.lvb_atime >=
+		    io->u.ci_setattr.sa_attr.lvb_ctime)) {
+		/* no extent lock is taken for this time update */
+		return 0;
+	}
+	cio->u.setattr.cui_local_lock = SETATTR_EXTENT_LOCK;
+	return ccc_io_one_lock(env, io, enqflags, CLM_WRITE,
+			       lock_start, OBD_OBJECT_EOF);
+}
+
+static int vvp_do_vmtruncate(struct inode *inode, size_t size)
+{
+	int rc;
+
+	/*
+	 * Only ll_inode_size_lock is taken at this level. The new size
+	 * is applied only if inode_newsize_ok() accepts it; otherwise
+	 * its negative result is returned and no size is set.
+	 */
+	ll_inode_size_lock(inode);
+	rc = inode_newsize_ok(inode, size);
+	if (rc >= 0)
+		truncate_setsize(inode, size);
+	ll_inode_size_unlock(inode);
+	return rc;
+}
+
+static int vvp_io_setattr_trunc(const struct lu_env *env,
+				const struct cl_io_slice *ios,
+				struct inode *inode, loff_t size)
+{
+	inode_dio_wait(inode); /* size is applied in vvp_io_setattr_end() */
+	return 0;
+}
+
+static int vvp_io_setattr_time(const struct lu_env *env,
+			       const struct cl_io_slice *ios)
+{
+	struct cl_io     *io    = ios->cis_io;
+	struct cl_object *obj   = io->ci_obj;
+	struct cl_attr   *attr  = ccc_env_thread_attr(env);
+	unsigned          valid = CAT_CTIME;
+	int               rc;
+
+	/* ctime is always updated; mtime/atime only when ATTR_*_SET asks */
+	cl_object_attr_lock(obj);
+	attr->cat_ctime = io->u.ci_setattr.sa_attr.lvb_ctime;
+	if (io->u.ci_setattr.sa_valid & ATTR_MTIME_SET) {
+		valid |= CAT_MTIME;
+		attr->cat_mtime = io->u.ci_setattr.sa_attr.lvb_mtime;
+	}
+	if (io->u.ci_setattr.sa_valid & ATTR_ATIME_SET) {
+		valid |= CAT_ATIME;
+		attr->cat_atime = io->u.ci_setattr.sa_attr.lvb_atime;
+	}
+	rc = cl_object_attr_set(env, obj, attr, valid);
+	cl_object_attr_unlock(obj);
+	return rc;
+}
+
+static int vvp_io_setattr_start(const struct lu_env *env,
+				const struct cl_io_slice *ios)
+{
+	struct cl_io *io    = ios->cis_io;
+	struct inode *inode = ccc_object_inode(io->ci_obj);
+
+	/* i_mutex stays held on return; vvp_io_setattr_end() drops it */
+	mutex_lock(&inode->i_mutex);
+	if (!cl_io_is_trunc(io))
+		return vvp_io_setattr_time(env, ios);
+	return vvp_io_setattr_trunc(env, ios, inode,
+				    io->u.ci_setattr.sa_attr.lvb_size);
+}
+
+static void vvp_io_setattr_end(const struct lu_env *env,
+			       const struct cl_io_slice *ios)
+{
+	struct cl_io *io    = ios->cis_io;
+	struct inode *inode = ccc_object_inode(io->ci_obj);
+
+	if (cl_io_is_trunc(io)) {
+		/* Truncate in memory pages - they must be clean pages
+		 * because osc has already notified to destroy osc_extents. */
+		vvp_do_vmtruncate(inode, io->u.ci_setattr.sa_attr.lvb_size);
+		inode_dio_write_done(inode);
+	}
+	mutex_unlock(&inode->i_mutex); /* locked in vvp_io_setattr_start() */
+}
+
+static void vvp_io_setattr_fini(const struct lu_env *env,
+				const struct cl_io_slice *ios)
+{
+	vvp_io_fini(env, ios); /* only the common io teardown is needed */
+}
+
+static ssize_t lustre_generic_file_read(struct file *file,
+					struct ccc_io *vio, loff_t *ppos)
+{	/* 'file' is unused: iocb, iovec and segment count come from vio */
+	return generic_file_aio_read(vio->cui_iocb, vio->cui_iov,
+				     vio->cui_nrsegs, *ppos);
+}
+
+static ssize_t lustre_generic_file_write(struct file *file,
+					struct ccc_io *vio, loff_t *ppos)
+{	/* 'file' is unused: iocb, iovec and segment count come from vio */
+	return generic_file_aio_write(vio->cui_iocb, vio->cui_iov,
+				      vio->cui_nrsegs, *ppos);
+}
+
+static int vvp_io_read_start(const struct lu_env *env,
+			     const struct cl_io_slice *ios)
+{
+	struct vvp_io     *vio   = cl2vvp_io(env, ios);
+	struct ccc_io     *cio   = cl2ccc_io(env, ios);
+	struct cl_io      *io    = ios->cis_io;
+	struct cl_object  *obj   = io->ci_obj;
+	struct inode      *inode = ccc_object_inode(obj);
+	struct ll_ra_read *bead  = &vio->cui_bead;
+	struct file       *file  = cio->cui_fd->fd_file;
+
+	int     result;
+	loff_t  pos = io->u.ci_rd.rd.crw_pos;
+	long    cnt = io->u.ci_rd.rd.crw_count;
+	long    tot = cio->cui_tot_count;
+	int     exceed = 0;
+
+	CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+	CDEBUG(D_VFSTRACE, "read: -> [%lli, %lli)\n", pos, pos + cnt);
+	/* layout changed: read nothing, the io has been flagged for restart */
+	if (!can_populate_pages(env, io, inode))
+		return 0;
+	/* NOTE(review): "exceed" presumably means the read starts past EOF */
+	result = ccc_prep_size(env, obj, io, pos, tot, &exceed);
+	if (result != 0)
+		return result;
+	else if (exceed != 0)
+		goto out; /* result is 0: accounted below as a 0 byte read */
+
+	LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu,
+			"Read ino %lu, %lu bytes, offset %lld, size %llu\n",
+			inode->i_ino, cnt, pos, i_size_read(inode));
+
+	/* turn off the kernel's read-ahead */
+	cio->cui_fd->fd_file->f_ra.ra_pages = 0;
+
+	/* initialize read-ahead window once per syscall */
+	if (!vio->cui_ra_window_set) {
+		vio->cui_ra_window_set = 1;
+		bead->lrr_start = cl_index(obj, pos);
+		/*
+		 * XXX: explicit PAGE_CACHE_SIZE
+		 */
+		bead->lrr_count = cl_index(obj, tot + PAGE_CACHE_SIZE - 1);
+		ll_ra_read_in(file, bead);
+	}
+
+	/* BUG: 5972 */
+	file_accessed(file);
+	switch (vio->cui_io_subtype) {
+	case IO_NORMAL:
+		 result = lustre_generic_file_read(file, cio, &pos);
+		 break;
+	case IO_SPLICE:
+		result = generic_file_splice_read(file, &pos,
+				vio->u.splice.cui_pipe, cnt,
+				vio->u.splice.cui_flags);
+		/* LU-1109: do splice read stripe by stripe otherwise if it
+		 * may make nfsd stuck if this read occupied all internal pipe
+		 * buffers. */
+		io->ci_continue = 0;
+		break;
+	default:
+		CERROR("Wrong IO type %u\n", vio->cui_io_subtype);
+		LBUG();
+	}
+
+out:
+	if (result >= 0) {
+		if (result < cnt) /* short read: do not continue the io */
+			io->ci_continue = 0;
+		io->ci_nob += result;
+		ll_rw_stats_tally(ll_i2sbi(inode), current->pid,
+				  cio->cui_fd, pos, result, 0);
+		result = 0;
+	}
+	return result;
+}
+
+static void vvp_io_read_fini(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+	struct vvp_io *vio = cl2vvp_io(env, ios);
+	struct ccc_io *cio = cl2ccc_io(env, ios);
+
+	if (vio->cui_ra_window_set) /* set when ll_ra_read_in() was called */
+		ll_ra_read_ex(cio->cui_fd->fd_file, &vio->cui_bead);
+
+	vvp_io_fini(env, ios);
+}
+
+/*
+ * cio_start handler for CIT_WRITE.
+ *
+ * Writes the current chunk through lustre_generic_file_write() and accounts
+ * the bytes written in io->ci_nob. A short write clears io->ci_continue.
+ *
+ * Returns 0 when can_populate_pages() refuses the IO or when at least one
+ * byte was written; otherwise returns the non-positive result of the write
+ * unchanged.
+ */
+static int vvp_io_write_start(const struct lu_env *env,
+			      const struct cl_io_slice *ios)
+{
+	struct ccc_io    *cio    = cl2ccc_io(env, ios);
+	struct cl_io     *io     = ios->cis_io;
+	struct cl_object *obj    = io->ci_obj;
+	struct inode     *inode  = ccc_object_inode(obj);
+	struct file      *file   = cio->cui_fd->fd_file;
+	loff_t            pos    = io->u.ci_wr.wr.crw_pos;
+	size_t            cnt    = io->u.ci_wr.wr.crw_count;
+	ssize_t           result = 0;
+
+	ENTRY;
+
+	if (!can_populate_pages(env, io, inode))
+		return 0;
+
+	if (cl_io_is_append(io)) {
+		/*
+		 * Appends always start at the current end of file.
+		 *
+		 * PARALLEL IO This has to be changed for parallel IO doing
+		 * out-of-order writes.
+		 */
+		io->u.ci_wr.wr.crw_pos = i_size_read(inode);
+		pos = io->u.ci_wr.wr.crw_pos;
+		cio->cui_iocb->ki_pos = pos;
+	}
+
+	CDEBUG(D_VFSTRACE, "write: [%lli, %lli)\n", pos, pos + (long long)cnt);
+
+	/*
+	 * A NULL cui_iov means this is a temp io from ll_cl_init(); there is
+	 * nothing to write and result stays 0.
+	 */
+	if (cio->cui_iov != NULL)
+		result = lustre_generic_file_write(file, cio, &pos);
+
+	/* errors and zero-length results are passed through untouched */
+	if (result <= 0)
+		RETURN(result);
+
+	/* short write: do not continue with further chunks */
+	if (result < cnt)
+		io->ci_continue = 0;
+	io->ci_nob += result;
+	/*
+	 * NOTE(review): the last argument is 0 here, exactly as in the read
+	 * path of this file. If that argument selects read vs. write, writes
+	 * are being tallied as reads -- confirm against ll_rw_stats_tally().
+	 */
+	ll_rw_stats_tally(ll_i2sbi(inode), current->pid,
+			  cio->cui_fd, pos, result, 0);
+	result = 0;
+
+	RETURN(result);
+}
+
+/**
+ * Run the generic page-cache fault handler for a fault IO and translate its
+ * outcome into an errno-style result.
+ *
+ * filemap_fault() is called and its return flags are stored in
+ * cfio->fault.ft_flags. When it produced a page, that page is locked on
+ * return (the lock is taken here if filemap_fault() did not already hold
+ * it), VM_FAULT_LOCKED is set in ft_flags to record that, and the page is
+ * stored in cfio->ft_vmpage.
+ *
+ * \retval 0        a locked page is available in cfio->ft_vmpage
+ * \retval -EFAULT  filemap_fault() reported VM_FAULT_SIGBUS
+ * \retval -ENOMEM  filemap_fault() reported VM_FAULT_OOM
+ * \retval -EAGAIN  filemap_fault() reported VM_FAULT_RETRY
+ * \retval -EINVAL  no page was returned and none of the flags above is set
+ */
+static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
+{
+	struct vm_fault *vmf = cfio->fault.ft_vmf;
+
+	cfio->fault.ft_flags = filemap_fault(cfio->ft_vma, vmf);
+
+	if (vmf->page) {
+		LL_CDEBUG_PAGE(D_PAGE, vmf->page, "got addr %p type NOPAGE\n",
+			       vmf->virtual_address);
+		if (unlikely(!(cfio->fault.ft_flags & VM_FAULT_LOCKED))) {
+			lock_page(vmf->page);
+			/*
+			 * Record that the page is now locked. This must OR
+			 * the bit in: the bit is known to be clear in this
+			 * branch, so masking with it would wipe every other
+			 * flag filemap_fault() returned and still leave
+			 * VM_FAULT_LOCKED unset.
+			 */
+			cfio->fault.ft_flags |= VM_FAULT_LOCKED;
+		}
+
+		cfio->ft_vmpage = vmf->page;
+		return 0;
+	}
+
+	if (cfio->fault.ft_flags & VM_FAULT_SIGBUS) {
+		CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", vmf->virtual_address);
+		return -EFAULT;
+	}
+
+	if (cfio->fault.ft_flags & VM_FAULT_OOM) {
+		CDEBUG(D_PAGE, "got addr %p - OOM\n", vmf->virtual_address);
+		return -ENOMEM;
+	}
+
+	if (cfio->fault.ft_flags & VM_FAULT_RETRY)
+		return -EAGAIN;
+
+	CERROR("unknow error in page fault %d!\n", cfio->fault.ft_flags);
+	return -EINVAL;
+}
+
+
+/**
+ * cio_start handler for CIT_FAULT (see vvp_io_ops).
+ *
+ * Obtains a locked VM page for the faulting index: the page already stored
+ * in cfio->ft_vmpage for a mkwrite fault, otherwise the page produced by
+ * vvp_io_kernel_fault(). The page is then re-checked against a concurrent
+ * local truncate. On success the matching cl_page is stored in fio->ft_page
+ * and the number of bytes of the page that lie inside the file is stored in
+ * fio->ft_nob. For a mkwrite fault the page is additionally dirtied and, if
+ * it was not dirty before, added to the write cache.
+ *
+ * VM_FAULT_LOCKED is always cleared from cfio->fault.ft_flags on the way out
+ * through the "out" label, and the VM page is unlocked there unless the
+ * local vmpage pointer was reset to NULL after cl_page_assume().
+ *
+ * \retval 0         success, fio->ft_page and fio->ft_nob are set
+ * \retval +1        the page was truncated under us; per the comment below
+ *                   this stops cl_io_loop() so that ll_fault() retries
+ * \retval -ENODATA  mkwrite raced with truncate, page is past end of file
+ * \retval -ve       other errors from ccc_prep_size(), vvp_io_kernel_fault(),
+ *                   cl_page_find() or cl_page_cache_add() (-EDQUOT from the
+ *                   latter is reported as -ENOSPC)
+ */
+static int vvp_io_fault_start(const struct lu_env *env,
+			      const struct cl_io_slice *ios)
+{
+	struct vvp_io       *vio     = cl2vvp_io(env, ios);
+	struct cl_io	*io      = ios->cis_io;
+	struct cl_object    *obj     = io->ci_obj;
+	struct inode	*inode   = ccc_object_inode(obj);
+	struct cl_fault_io  *fio     = &io->u.ci_fault;
+	struct vvp_fault_io *cfio    = &vio->u.fault;
+	loff_t	       offset;
+	int		  result  = 0;
+	struct page	  *vmpage  = NULL;
+	struct cl_page      *page;
+	loff_t	       size;
+	pgoff_t	      last; /* last page in a file data region */
+
+	/* only warn: a changed mtime on an executable mapping is not fatal */
+	if (fio->ft_executable &&
+	    LTIME_S(inode->i_mtime) != vio->u.fault.ft_mtime)
+		CWARN("binary "DFID
+		      " changed while waiting for the page fault lock\n",
+		      PFID(lu_object_fid(&obj->co_lu)));
+
+	/* offset of the last byte on the page */
+	offset = cl_offset(obj, fio->ft_index + 1) - 1;
+	LASSERT(cl_index(obj, offset) == fio->ft_index);
+	result = ccc_prep_size(env, obj, io, 0, offset + 1, NULL);
+	if (result != 0)
+		return result;
+
+	/* must return locked page */
+	if (fio->ft_mkwrite) {
+		/* mkwrite: the page is supplied by the caller, lock it here */
+		LASSERT(cfio->ft_vmpage != NULL);
+		lock_page(cfio->ft_vmpage);
+	} else {
+		/* regular fault: vvp_io_kernel_fault() returns a locked page */
+		result = vvp_io_kernel_fault(cfio);
+		if (result != 0)
+			return result;
+	}
+
+	vmpage = cfio->ft_vmpage;
+	LASSERT(PageLocked(vmpage));
+
+	/* fault-injection hook: invalidate the page when the fail check fires */
+	if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_FAULT_TRUNC_RACE))
+		ll_invalidate_page(vmpage);
+
+	size = i_size_read(inode);
+	/* Even though a cl_lock is already held on this page, it can
+	 * still be truncated locally. */
+	if (unlikely((vmpage->mapping != inode->i_mapping) ||
+		     (page_offset(vmpage) > size))) {
+		CDEBUG(D_PAGE, "llite: fault and truncate race happened!\n");
+
+		/* return +1 to stop cl_io_loop() and ll_fault() will catch
+		 * and retry. */
+		GOTO(out, result = +1);
+	}
+
+
+	if (fio->ft_mkwrite ) {
+		pgoff_t last_index;
+		/*
+		 * Capture the size while holding the lli_trunc_sem from above
+		 * we want to make sure that we complete the mkwrite action
+		 * while holding this lock. We need to make sure that we are
+		 * not past the end of the file.
+		 */
+		last_index = cl_index(obj, size - 1);
+		if (last_index < fio->ft_index) {
+			CDEBUG(D_PAGE,
+				"llite: mkwrite and truncate race happened: "
+				"%p: 0x%lx 0x%lx\n",
+				vmpage->mapping,fio->ft_index,last_index);
+			/*
+			 * We need to return if we are
+			 * passed the end of the file. This will propagate
+			 * up the call stack to ll_page_mkwrite where
+			 * we will return VM_FAULT_NOPAGE. Any non-negative
+			 * value returned here will be silently
+			 * converted to 0. If the vmpage->mapping is null
+			 * the error code would be converted back to ENODATA
+			 * in ll_page_mkwrite0. Thus we return -ENODATA
+			 * to handle both cases
+			 */
+			GOTO(out, result = -ENODATA);
+		}
+	}
+
+	page = cl_page_find(env, obj, fio->ft_index, vmpage, CPT_CACHEABLE);
+	if (IS_ERR(page))
+		GOTO(out, result = PTR_ERR(page));
+
+	/* if page is going to be written, we should add this page into cache
+	 * earlier. */
+	if (fio->ft_mkwrite) {
+		wait_on_page_writeback(vmpage);
+		/* only a page that was not dirty before is added to the cache */
+		if (set_page_dirty(vmpage)) {
+			struct ccc_page *cp;
+
+			/* vvp_page_assume() calls wait_on_page_writeback(). */
+			cl_page_assume(env, io, page);
+
+			cp = cl2ccc_page(cl_page_at(page, &vvp_device_type));
+			vvp_write_pending(cl2ccc(obj), cp);
+
+			/* Do not set Dirty bit here so that in case IO is
+			 * started before the page is really made dirty, we
+			 * still have chance to detect it. */
+			result = cl_page_cache_add(env, io, page, CRT_WRITE);
+			LASSERT(cl_page_is_owned(page, io));
+
+			/*
+			 * From here on the "out" path must not unlock the VM
+			 * page itself; both branches below end with
+			 * cl_page_disown(), which presumably drops the page
+			 * lock -- TODO confirm.
+			 */
+			vmpage = NULL;
+			if (result < 0) {
+				cl_page_unmap(env, io, page);
+				cl_page_discard(env, io, page);
+				cl_page_disown(env, io, page);
+
+				/* drop the reference taken by cl_page_find() */
+				cl_page_put(env, page);
+
+				/* we're in big trouble, what can we do now? */
+				if (result == -EDQUOT)
+					result = -ENOSPC;
+				GOTO(out, result);
+			} else
+				cl_page_disown(env, io, page);
+		}
+	}
+
+	last = cl_index(obj, size - 1);
+	/*
+	 * The ft_index is only used in the case of
+	 * a mkwrite action. We need to check
+	 * our assertions are correct, since
+	 * we should have caught this above
+	 */
+	LASSERT(!fio->ft_mkwrite || fio->ft_index <= last);
+	if (fio->ft_index == last)
+		/*
+		 * Last page is mapped partially.
+		 */
+		fio->ft_nob = size - cl_offset(obj, fio->ft_index);
+	else
+		fio->ft_nob = cl_page_size(obj);
+
+	/* hand the cl_page (and its reference) over to the fault IO */
+	lu_ref_add(&page->cp_reference, "fault", io);
+	fio->ft_page = page;
+	EXIT;
+
+out:
+	/* return unlocked vmpage to avoid deadlocking */
+	if (vmpage != NULL)
+		unlock_page(vmpage);
+	cfio->fault.ft_flags &= ~VM_FAULT_LOCKED;
+	return result;
+}
+
+/*
+ * cio_start handler for CIT_FSYNC (see vvp_io_ops).
+ *
+ * Does nothing at this layer and always returns 0; neither argument is used.
+ */
+static int vvp_io_fsync_start(const struct lu_env *env,
+			      const struct cl_io_slice *ios)
+{
+	/* we should mark TOWRITE bit to each dirty page in radix tree to
+	 * verify pages have been written, but this is difficult because of
+	 * race. */
+	return 0;
+}
+
+/*
+ * cio_read_page handler: queue one page of the file for reading.
+ *
+ * When read-ahead is enabled (both ra_max_pages_per_file and ra_max_pages
+ * are non-zero) the read-ahead state is updated first and ll_readahead() is
+ * called after the page has been queued.
+ *
+ * Returns 0 once the page is on io->ci_queue, or the result of
+ * cl_page_is_under_lock() when that is neither -EBUSY nor -ENODATA.
+ */
+static int vvp_io_read_page(const struct lu_env *env,
+			    const struct cl_io_slice *ios,
+			    const struct cl_page_slice *slice)
+{
+	struct cl_io              *io     = ios->cis_io;
+	struct cl_object          *obj    = slice->cpl_obj;
+	struct ccc_page           *cp     = cl2ccc_page(slice);
+	struct cl_page            *page   = slice->cpl_page;
+	struct inode              *inode  = ccc_object_inode(obj);
+	struct ll_sb_info         *sbi    = ll_i2sbi(inode);
+	struct ll_file_data       *fd     = cl2ccc_io(env, ios)->cui_fd;
+	struct ll_readahead_state *ras    = &fd->fd_ras;
+	struct page               *vmpage = cp->cpg_page;
+	struct cl_2queue          *queue  = &io->ci_queue;
+	int rc;
+
+	CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+	LASSERT(slice->cpl_obj == obj);
+
+	ENTRY;
+
+	if (sbi->ll_ra_info.ra_max_pages_per_file &&
+	    sbi->ll_ra_info.ra_max_pages)
+		ras_update(sbi, inode, ras, page->cp_index,
+			   cp->cpg_defer_uptodate);
+
+	/*
+	 * Sanity check: the page is expected to be covered by a lock, which
+	 * cl_page_is_under_lock() reports as -EBUSY. Anything else is
+	 * logged; only -ENODATA (no lock) is tolerated beyond that.
+	 */
+	rc = cl_page_is_under_lock(env, io, page);
+	if (rc != -EBUSY) {
+		const char *why = "match failed";
+
+		if (rc == -ENODATA)
+			why = "without a lock";
+		CL_PAGE_HEADER(D_WARNING, env, page, "%s: %d\n", why, rc);
+		if (rc != -ENODATA)
+			RETURN(rc);
+	}
+
+	/* data already brought in by read-ahead: note it was used, export it */
+	if (cp->cpg_defer_uptodate) {
+		cp->cpg_ra_used = 1;
+		cl_page_export(env, page, 1);
+	}
+
+	/*
+	 * The page goes onto the queue even if it was just marked uptodate:
+	 * it will be unlocked automatically as part of
+	 * cl_page_list_disown().
+	 */
+	cl_2queue_add(queue, page);
+
+	if (sbi->ll_ra_info.ra_max_pages_per_file &&
+	    sbi->ll_ra_info.ra_max_pages)
+		ll_readahead(env, io, ras,
+			     vmpage->mapping, &queue->c2_qin, fd->fd_flags);
+
+	RETURN(0);
+}
+
+/*
+ * Synchronously transfer a single page using the io's own 2-queue.
+ *
+ * Only valid for CIT_READ and CIT_WRITE ios. Returns the result of
+ * cl_io_submit_sync(). For CRT_READ the page is disowned before returning;
+ * for CRT_WRITE it is left locked, even on error.
+ */
+static int vvp_page_sync_io(const struct lu_env *env, struct cl_io *io,
+			    struct cl_page *page, struct ccc_page *cp,
+			    enum cl_req_type crt)
+{
+	struct cl_2queue *q = &io->ci_queue;
+	int rc;
+
+	LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+
+	cl_2queue_init_page(q, page);
+	rc = cl_io_submit_sync(env, io, crt, q, 0);
+	LASSERT(cl_page_is_owned(page, io));
+
+	if (crt == CRT_READ) {
+		/*
+		 * in CRT_WRITE case page is left locked even in case of
+		 * error.
+		 */
+		cl_page_list_disown(env, io, &q->c2_qin);
+	}
+	cl_2queue_fini(env, q);
+
+	return rc;
+}
+
+/**
+ * Prepare a page that is only partially overwritten for a write.
+ *
+ * Depending on the object's known minimum size (KMS) the page is zeroed,
+ * taken from read-ahead as is, or read in synchronously. On success the page
+ * is exported as uptodate.
+ *
+ * \a from and \a to are not used here.
+ *
+ * \retval 0   page is ready and has been exported
+ * \retval !0  result of cl_object_attr_get() or vvp_page_sync_io()
+ */
+static int vvp_io_prepare_partial(const struct lu_env *env, struct cl_io *io,
+				  struct cl_object *obj, struct cl_page *pg,
+				  struct ccc_page *cp,
+				  unsigned from, unsigned to)
+{
+	struct cl_attr *attr      = ccc_env_thread_attr(env);
+	loff_t          pg_start  = cl_offset(obj, pg->cp_index);
+	int             rc;
+
+	cl_object_attr_lock(obj);
+	rc = cl_object_attr_get(env, obj, attr);
+	cl_object_attr_unlock(obj);
+	if (rc != 0)
+		return rc;
+
+	if (attr->cat_kms > pg_start) {
+		if (cp->cpg_defer_uptodate)
+			/* read-ahead already supplied the data */
+			cp->cpg_ra_used = 1;
+		else
+			rc = vvp_page_sync_io(env, io, pg, cp, CRT_READ);
+	} else {
+		/*
+		 * Writing to a new page: there is no old data to read.
+		 * The extent locking will have updated the KMS, and for our
+		 * purposes here we can treat it like i_size.
+		 */
+		char *kaddr = ll_kmap_atomic(cp->cpg_page, KM_USER0);
+
+		memset(kaddr, 0, cl_page_size(obj));
+		ll_kunmap_atomic(kaddr, KM_USER0);
+	}
+
+	/*
+	 * In older implementations, obdo_refresh_inode is called here
+	 * to update the inode because the write might modify the
+	 * object info at OST. However, this has been proven useless,
+	 * since LVB functions will be called when user space program
+	 * tries to retrieve inode attribute.  Also, see bug 15909 for
+	 * details. -jay
+	 */
+	if (rc == 0)
+		cl_page_export(env, pg, 1);
+
+	return rc;
+}
+
+/*
+ * cio_prepare_write handler: get a page ready to receive the byte range
+ * [from, to].
+ *
+ * Nothing is done for a page that is already uptodate, or for a page that is
+ * about to be overwritten completely; otherwise the existing content is
+ * prepared through vvp_io_prepare_partial(), whose result is returned.
+ */
+static int vvp_io_prepare_write(const struct lu_env *env,
+				const struct cl_io_slice *ios,
+				const struct cl_page_slice *slice,
+				unsigned from, unsigned to)
+{
+	struct cl_object *obj    = slice->cpl_obj;
+	struct ccc_page  *cp     = cl2ccc_page(slice);
+	struct cl_page   *pg     = slice->cpl_page;
+	struct page      *vmpage = cp->cpg_page;
+	int               result = 0;
+
+	ENTRY;
+
+	LINVRNT(cl_page_is_vmlocked(env, pg));
+	LASSERT(vmpage->mapping->host == ccc_object_inode(obj));
+
+	CL_PAGE_HEADER(D_PAGE, env, pg, "preparing: [%d, %d]\n", from, to);
+
+	if (PageUptodate(vmpage)) {
+		CL_PAGE_HEADER(D_PAGE, env, pg, "uptodate\n");
+	} else if (from == 0 && to == PAGE_CACHE_SIZE) {
+		/*
+		 * We're completely overwriting an existing page, so _don't_
+		 * set it up to date until commit_write
+		 */
+		CL_PAGE_HEADER(D_PAGE, env, pg, "full page write\n");
+		POISON_PAGE(page, 0x11);
+	} else {
+		result = vvp_io_prepare_partial(env, ios->cis_io, obj,
+						pg, cp, from, to);
+	}
+
+	RETURN(result);
+}
+
+/**
+ * cio_commit_write handler: commit the byte range [from, to] written into a
+ * page.
+ *
+ * A page that is not yet dirty is added to the write cache through
+ * cl_page_cache_add(); if that fails with -EDQUOT the page is written out
+ * synchronously instead. A page that is already dirty needs no further work.
+ * On success the inode is flagged LLIF_DATA_MODIFIED, i_size is extended if
+ * the write went past it, and the page is exported as uptodate. On failure
+ * the page is discarded if it ends beyond i_size.
+ *
+ * \retval 0   page committed
+ * \retval !0  result of cl_page_cache_add() or of vvp_page_sync_io()
+ */
+static int vvp_io_commit_write(const struct lu_env *env,
+			       const struct cl_io_slice *ios,
+			       const struct cl_page_slice *slice,
+			       unsigned from, unsigned to)
+{
+	struct cl_object  *obj    = slice->cpl_obj;
+	struct cl_io      *io     = ios->cis_io;
+	struct ccc_page   *cp     = cl2ccc_page(slice);
+	struct cl_page    *pg     = slice->cpl_page;
+	struct inode      *inode  = ccc_object_inode(obj);
+	struct ll_sb_info *sbi    = ll_i2sbi(inode);
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct page	*vmpage = cp->cpg_page;
+
+	int    result;
+	int    tallyop;
+	loff_t size;
+
+	ENTRY;
+
+	LINVRNT(cl_page_is_vmlocked(env, pg));
+	LASSERT(vmpage->mapping->host == inode);
+
+	LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu, "commiting page write\n");
+	CL_PAGE_HEADER(D_PAGE, env, pg, "committing: [%d, %d]\n", from, to);
+
+	/*
+	 * queue a write for some time in the future the first time we
+	 * dirty the page.
+	 *
+	 * This is different from what other file systems do: they usually
+	 * just mark page (and some of its buffers) dirty and rely on
+	 * balance_dirty_pages() to start a write-back. Lustre wants write-back
+	 * to be started earlier for the following reasons:
+	 *
+	 *     (1) with a large number of clients we need to limit the amount
+	 *     of cached data on the clients a lot;
+	 *
+	 *     (2) large compute jobs generally want compute-only then io-only
+	 *     and the IO should complete as quickly as possible;
+	 *
+	 *     (3) IO is batched up to the RPC size and is async until the
+	 *     client max cache is hit
+	 *     (/proc/fs/lustre/osc/OSC.../max_dirty_mb)
+	 *
+	 */
+	if (!PageDirty(vmpage)) {
+		tallyop = LPROC_LL_DIRTY_MISSES;
+		result = cl_page_cache_add(env, io, pg, CRT_WRITE);
+		if (result == 0) {
+			/* page was added into cache successfully. */
+			set_page_dirty(vmpage);
+			vvp_write_pending(cl2ccc(obj), cp);
+		} else if (result == -EDQUOT) {
+			pgoff_t last_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;
+			bool need_clip = true;
+
+			/*
+			 * Client ran out of disk space grant. Possible
+			 * strategies are:
+			 *
+			 *     (a) do a sync write, renewing grant;
+			 *
+			 *     (b) stop writing on this stripe, switch to the
+			 *     next one.
+			 *
+			 * (b) is a part of "parallel io" design that is the
+			 * ultimate goal. (a) is what "old" client did, and
+			 * what the new code continues to do for the time
+			 * being.
+			 */
+			/*
+			 * Work out how much of the page to write. Note that
+			 * "to" is modified here and is reused further down
+			 * when the new file size is computed.
+			 */
+			if (last_index > pg->cp_index) {
+				/* page lies wholly inside the file: write all of it */
+				to = PAGE_CACHE_SIZE;
+				need_clip = false;
+			} else if (last_index == pg->cp_index) {
+				/* page holds EOF: write at least up to i_size */
+				int size_to = i_size_read(inode) & ~CFS_PAGE_MASK;
+				if (to < size_to)
+					to = size_to;
+			}
+			if (need_clip)
+				cl_page_clip(env, pg, 0, to);
+			result = vvp_page_sync_io(env, io, pg, cp, CRT_WRITE);
+			if (result)
+				CERROR("Write page %lu of inode %p failed %d\n",
+				       pg->cp_index, inode, result);
+		}
+	} else {
+		/* already dirty: nothing new to queue */
+		tallyop = LPROC_LL_DIRTY_HITS;
+		result = 0;
+	}
+	ll_stats_ops_tally(sbi, tallyop, 1);
+
+	/* Inode should be marked DIRTY even if no new page was marked DIRTY
+	 * because page could have been not flushed between 2 modifications.
+	 * It is important the file is marked DIRTY as soon as the I/O is done
+	 * Indeed, when cache is flushed, file could be already closed and it
+	 * is too late to warn the MDT.
+	 * It is acceptable that file is marked DIRTY even if I/O is dropped
+	 * for some reasons before being flushed to OST.
+	 */
+	if (result == 0) {
+		spin_lock(&lli->lli_lock);
+		lli->lli_flags |= LLIF_DATA_MODIFIED;
+		spin_unlock(&lli->lli_lock);
+	}
+
+	/* file offset just past the last byte committed on this page */
+	size = cl_offset(obj, pg->cp_index) + to;
+
+	ll_inode_size_lock(inode);
+	if (result == 0) {
+		if (size > i_size_read(inode)) {
+			cl_isize_write_nolock(inode, size);
+			CDEBUG(D_VFSTRACE, DFID" updating i_size %lu\n",
+			       PFID(lu_object_fid(&obj->co_lu)),
+			       (unsigned long)size);
+		}
+		cl_page_export(env, pg, 1);
+	} else {
+		/* failed write that would have extended the file: drop the page */
+		if (size > i_size_read(inode))
+			cl_page_discard(env, io, pg);
+	}
+	ll_inode_size_unlock(inode);
+	RETURN(result);
+}
+
+/*
+ * IO operation table of the VVP layer; vvp_io_init() passes it to
+ * cl_io_slice_add(). The .op[] array holds the per-IO-type handlers, indexed
+ * by CIT_* type; the page-level handlers that follow it are not per-type.
+ */
+static const struct cl_io_operations vvp_io_ops = {
+	.op = {
+		[CIT_READ] = {
+			.cio_fini      = vvp_io_read_fini,
+			.cio_lock      = vvp_io_read_lock,
+			.cio_start     = vvp_io_read_start,
+			.cio_advance   = ccc_io_advance
+		},
+		[CIT_WRITE] = {
+			.cio_fini      = vvp_io_fini,
+			.cio_lock      = vvp_io_write_lock,
+			.cio_start     = vvp_io_write_start,
+			.cio_advance   = ccc_io_advance
+		},
+		[CIT_SETATTR] = {
+			.cio_fini       = vvp_io_setattr_fini,
+			.cio_iter_init  = vvp_io_setattr_iter_init,
+			.cio_lock       = vvp_io_setattr_lock,
+			.cio_start      = vvp_io_setattr_start,
+			.cio_end	= vvp_io_setattr_end
+		},
+		[CIT_FAULT] = {
+			.cio_fini      = vvp_io_fault_fini,
+			.cio_iter_init = vvp_io_fault_iter_init,
+			.cio_lock      = vvp_io_fault_lock,
+			.cio_start     = vvp_io_fault_start,
+			.cio_end       = ccc_io_end
+		},
+		/* fsync: cio_start is a no-op at this layer (vvp_io_fsync_start) */
+		[CIT_FSYNC] = {
+			.cio_start  = vvp_io_fsync_start,
+			.cio_fini   = vvp_io_fini
+		},
+		[CIT_MISC] = {
+			.cio_fini   = vvp_io_fini
+		}
+	},
+	.cio_read_page     = vvp_io_read_page,
+	.cio_prepare_write = vvp_io_prepare_write,
+	.cio_commit_write  = vvp_io_commit_write
+};
+
+/*
+ * Attach the VVP slice to \a io and perform the per-type setup.
+ *
+ * Returns 1 for a zero-length read or write, otherwise 0 or, when the layout
+ * is not being ignored, the result of ll_layout_refresh().
+ */
+int vvp_io_init(const struct lu_env *env, struct cl_object *obj,
+		struct cl_io *io)
+{
+	struct vvp_io *vio    = vvp_env_io(env);
+	struct ccc_io *cio    = ccc_env_io(env);
+	struct inode  *inode  = ccc_object_inode(obj);
+	int            result = 0;
+
+	CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+	ENTRY;
+
+	CL_IO_SLICE_CLEAN(cio, cui_cl);
+	cl_io_slice_add(io, &cio->cui_cl, obj, &vvp_io_ops);
+	vio->cui_ra_window_set = 0;
+
+	switch (io->ci_type) {
+	case CIT_READ:
+	case CIT_WRITE: {
+		struct ll_inode_info *lli   = ll_i2info(inode);
+		size_t                count = io->u.ci_rw.crw_count;
+
+		if (count != 0) {
+			cio->cui_tot_count = count;
+			cio->cui_tot_nrsegs = 0;
+		} else {
+			/* "If nbyte is 0, read() will return 0 and have no
+			 *  other results."  -- Single Unix Spec */
+			result = 1;
+		}
+		/* for read/write, we store the jobid in the inode, and
+		 * it'll be fetched by osc when building RPC.
+		 *
+		 * it's not accurate if the file is shared by different
+		 * jobs.
+		 */
+		lustre_get_jobid(lli->lli_jobid);
+		break;
+	}
+	case CIT_SETATTR:
+		if (!cl_io_is_trunc(io))
+			io->ci_lockreq = CILR_MANDATORY;
+		break;
+	case CIT_MISC:
+		/* ignore layout change for generic CIT_MISC but not for
+		 * glimpse. io context for glimpse must set ci_verify_layout
+		 * to true, see cl_glimpse_size0() for details. */
+		if (!io->ci_verify_layout)
+			io->ci_ignore_layout = 1;
+		break;
+	default:
+		break;
+	}
+
+	/* Enqueue layout lock and get layout version. We need to do this
+	 * even for operations requiring to open file, such as read and write,
+	 * because it might not grant layout lock in IT_OPEN. */
+	if (result == 0 && !io->ci_ignore_layout)
+		result = ll_layout_refresh(inode, &cio->cui_layout_gen);
+
+	RETURN(result);
+}
+
+/*
+ * Return the vvp_io kept in the environment \a env for IO slice \a slice.
+ */
+static struct vvp_io *cl2vvp_io(const struct lu_env *env,
+				const struct cl_io_slice *slice)
+{
+	struct vvp_io *vio;
+
+	/* Called only for the assertion it performs; the result is unused. */
+	(void)cl2ccc_io(env, slice);
+
+	vio = vvp_env_io(env);
+	return vio;
+}
diff --git a/drivers/staging/lustre/lustre/llite/vvp_lock.c b/drivers/staging/lustre/lustre/llite/vvp_lock.c
new file mode 100644
index 000000000000..9b8712bccd92
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/vvp_lock.c
@@ -0,0 +1,85 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_lock for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+
+#include <obd.h>
+#include <lustre_lite.h>
+
+#include "vvp_internal.h"
+
+/*****************************************************************************
+ *
+ * Vvp lock functions.
+ *
+ */
+
+/**
+ * Estimate how valuable a lock is, for managing the lock cache under memory
+ * pressure.
+ *
+ * A lock on an object that currently has memory mappings (cob_mmap_cnt > 0)
+ * weighs ~0UL >> 2; every other lock weighs 0. The mapped weight is large
+ * but deliberately not the maximum, so that mapped locks can still be
+ * ordered among themselves by weights contributed from other layers.
+ */
+static unsigned long vvp_lock_weigh(const struct lu_env *env,
+				    const struct cl_lock_slice *slice)
+{
+	struct ccc_object *cob    = cl2ccc(slice->cls_obj);
+	unsigned long      weight = 0;
+
+	ENTRY;
+	if (atomic_read(&cob->cob_mmap_cnt) > 0)
+		weight = ~0UL >> 2;
+	RETURN(weight);
+}
+
+/*
+ * Lock operation table of the VVP layer, handed to ccc_lock_init() by
+ * vvp_lock_init(). Every handler is the common ccc_* implementation except
+ * clo_weigh, which is VVP specific.
+ */
+static const struct cl_lock_operations vvp_lock_ops = {
+	.clo_delete    = ccc_lock_delete,
+	.clo_fini      = ccc_lock_fini,
+	.clo_enqueue   = ccc_lock_enqueue,
+	.clo_wait      = ccc_lock_wait,
+	.clo_unuse     = ccc_lock_unuse,
+	.clo_fits_into = ccc_lock_fits_into,
+	.clo_state     = ccc_lock_state,
+	.clo_weigh     = vvp_lock_weigh
+};
+
+/*
+ * Lock initialization entry point of the VVP layer: forwards all arguments
+ * to ccc_lock_init() together with vvp_lock_ops and returns its result.
+ */
+int vvp_lock_init(const struct lu_env *env, struct cl_object *obj,
+		  struct cl_lock *lock, const struct cl_io *io)
+{
+	return ccc_lock_init(env, obj, lock, io, &vvp_lock_ops);
+}
diff --git a/drivers/staging/lustre/lustre/llite/vvp_object.c b/drivers/staging/lustre/lustre/llite/vvp_object.c
new file mode 100644
index 000000000000..01edc5b63e13
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/vvp_object.c
@@ -0,0 +1,186 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * cl_object implementation for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd.h>
+#include <lustre_lite.h>
+
+#include "vvp_internal.h"
+
+/*****************************************************************************
+ *
+ * Object operations.
+ *
+ */
+
+/*
+ * loo_object_print handler: describe a VVP object through printer \a p.
+ *
+ * Prints whether the pending list is empty ("-") or not ("+"), the transient
+ * page count, the mmap count and the inode pointer; when an inode is
+ * attached, its number, generation, mode, link count, reference count,
+ * cl_object pointer and FID follow. Always returns 0.
+ */
+static int vvp_object_print(const struct lu_env *env, void *cookie,
+			    lu_printer_t p, const struct lu_object *o)
+{
+	struct ccc_object *cob   = lu2ccc(o);
+	struct inode      *inode = cob->cob_inode;
+	const char        *pending;
+
+	pending = list_empty(&cob->cob_pending_list) ? "-" : "+";
+	p(env, cookie, "(%s %d %d) inode: %p ",
+	  pending, cob->cob_transient_pages,
+	  atomic_read(&cob->cob_mmap_cnt), inode);
+
+	if (inode != NULL) {
+		struct ll_inode_info *lli = ll_i2info(inode);
+
+		p(env, cookie, "%lu/%u %o %u %d %p "DFID,
+		  inode->i_ino, inode->i_generation, inode->i_mode,
+		  inode->i_nlink, atomic_read(&inode->i_count),
+		  lli->lli_clob, PFID(&lli->lli_fid));
+	}
+
+	return 0;
+}
+
+/*
+ * coo_attr_get handler: fill \a attr from the VFS inode of \a obj.
+ *
+ * Size, timestamps, block count and ownership are copied from the inode.
+ * The KMS is not known by this layer and is left untouched. Always returns 0.
+ */
+static int vvp_attr_get(const struct lu_env *env, struct cl_object *obj,
+			struct cl_attr *attr)
+{
+	struct inode *vfs_inode = ccc_object_inode(obj);
+
+	/*
+	 * lov overwrites most of these fields in
+	 * lov_attr_get()->...lov_merge_lvb_kms(), except when inode
+	 * attributes are newer.
+	 */
+
+	/* size and space usage */
+	attr->cat_size   = i_size_read(vfs_inode);
+	attr->cat_blocks = vfs_inode->i_blocks;
+
+	/* timestamps */
+	attr->cat_mtime = LTIME_S(vfs_inode->i_mtime);
+	attr->cat_atime = LTIME_S(vfs_inode->i_atime);
+	attr->cat_ctime = LTIME_S(vfs_inode->i_ctime);
+
+	/* ownership */
+	attr->cat_uid = vfs_inode->i_uid;
+	attr->cat_gid = vfs_inode->i_gid;
+
+	/* layers below have to fill in the rest */
+	return 0;
+}
+
+/*
+ * coo_attr_set handler: copy the attributes selected by the CAT_* bits in
+ * \a valid from \a attr into the VFS inode of \a obj.
+ *
+ * Only uid, gid and the three timestamps are actually applied; the size
+ * update and the mark_inode_dirty() call are compiled out by a constant
+ * false condition. Always returns 0.
+ */
+static int vvp_attr_set(const struct lu_env *env, struct cl_object *obj,
+			const struct cl_attr *attr, unsigned valid)
+{
+	struct inode *inode = ccc_object_inode(obj);
+
+	if (valid & CAT_UID)
+		inode->i_uid = attr->cat_uid;
+	if (valid & CAT_GID)
+		inode->i_gid = attr->cat_gid;
+	if (valid & CAT_ATIME)
+		LTIME_S(inode->i_atime) = attr->cat_atime;
+	if (valid & CAT_MTIME)
+		LTIME_S(inode->i_mtime) = attr->cat_mtime;
+	if (valid & CAT_CTIME)
+		LTIME_S(inode->i_ctime) = attr->cat_ctime;
+	/* deliberately disabled: "0 &&" makes this branch unreachable */
+	if (0 && valid & CAT_SIZE)
+		cl_isize_write_nolock(inode, attr->cat_size);
+	/* not currently necessary */
+	if (0 && valid & (CAT_UID|CAT_GID|CAT_SIZE))
+		mark_inode_dirty(inode);
+	return 0;
+}
+
+/*
+ * coo_conf_set handler: track layout changes in the llite inode info.
+ *
+ * OBJECT_CONF_INVALIDATE resets the cached layout generation to
+ * LL_LAYOUT_GEN_NONE. OBJECT_CONF_SET records whether a striping descriptor
+ * is present together with its layout generation. Any other operation is
+ * ignored. Always returns 0.
+ */
+int vvp_conf_set(const struct lu_env *env, struct cl_object *obj,
+		const struct cl_object_conf *conf)
+{
+	struct ll_inode_info *lli = ll_i2info(conf->coc_inode);
+
+	switch (conf->coc_opc) {
+	case OBJECT_CONF_INVALIDATE:
+		lli->lli_layout_gen = LL_LAYOUT_GEN_NONE;
+		return 0;
+	case OBJECT_CONF_SET:
+		break;
+	default:
+		return 0;
+	}
+
+	if (conf->u.coc_md == NULL || conf->u.coc_md->lsm == NULL) {
+		/* no striping descriptor: the layout is gone */
+		CDEBUG(D_VFSTRACE, "layout lock destroyed: %u.\n",
+			lli->lli_layout_gen);
+
+		lli->lli_has_smd = false;
+		lli->lli_layout_gen = LL_LAYOUT_GEN_EMPTY;
+	} else {
+		/* adopt the generation of the new striping descriptor */
+		CDEBUG(D_VFSTRACE, "layout lock change: %u -> %u\n",
+			lli->lli_layout_gen,
+			conf->u.coc_md->lsm->lsm_layout_gen);
+
+		lli->lli_has_smd = true;
+		lli->lli_layout_gen = conf->u.coc_md->lsm->lsm_layout_gen;
+	}
+
+	return 0;
+}
+
+/*
+ * cl_object operation table of the VVP layer, handed to ccc_object_alloc()
+ * by vvp_object_alloc().
+ */
+static const struct cl_object_operations vvp_ops = {
+	.coo_page_init = vvp_page_init,
+	.coo_lock_init = vvp_lock_init,
+	.coo_io_init   = vvp_io_init,
+	.coo_attr_get  = vvp_attr_get,
+	.coo_attr_set  = vvp_attr_set,
+	.coo_conf_set  = vvp_conf_set,
+	.coo_glimpse   = ccc_object_glimpse
+};
+
+/*
+ * lu_object operation table of the VVP layer, handed to ccc_object_alloc()
+ * by vvp_object_alloc(). Init and free use the common ccc_* implementations;
+ * only printing is VVP specific.
+ */
+static const struct lu_object_operations vvp_lu_obj_ops = {
+	.loo_object_init  = ccc_object_init,
+	.loo_object_free  = ccc_object_free,
+	.loo_object_print = vvp_object_print
+};
+
+/*
+ * Map a VFS inode to the ccc_object of its VVP layer.
+ *
+ * The inode must already have a cl_object attached (lli_clob), and that
+ * object's header must contain a slice of vvp_device_type; both conditions
+ * are asserted.
+ */
+struct ccc_object *cl_inode2ccc(struct inode *inode)
+{
+	struct cl_inode_info *info = cl_i2info(inode);
+	struct cl_object     *obj;
+	struct lu_object     *lu;
+
+	obj = info->lli_clob;
+	LASSERT(obj != NULL);
+
+	lu = lu_object_locate(obj->co_lu.lo_header, &vvp_device_type);
+	LASSERT(lu != NULL);
+
+	return lu2ccc(lu);
+}
+
+/*
+ * Object allocation entry point of the VVP layer: forwards to
+ * ccc_object_alloc() with the VVP cl_object and lu_object operation tables
+ * and returns its result.
+ */
+struct lu_object *vvp_object_alloc(const struct lu_env *env,
+				   const struct lu_object_header *hdr,
+				   struct lu_device *dev)
+{
+	return ccc_object_alloc(env, hdr, dev, &vvp_ops, &vvp_lu_obj_ops);
+}
diff --git a/drivers/staging/lustre/lustre/llite/vvp_page.c b/drivers/staging/lustre/lustre/llite/vvp_page.c
new file mode 100644
index 000000000000..4568e69bb9f0
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/vvp_page.c
@@ -0,0 +1,558 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_page for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+
+#include <obd.h>
+#include <lustre_lite.h>
+
+#include "vvp_internal.h"
+
+/*****************************************************************************
+ *
+ * Page operations.
+ *
+ */
+
+static void vvp_page_fini_common(struct ccc_page *cp)
+{
+	struct page *vmpage = cp->cpg_page;
+
+	LASSERT(vmpage != NULL);
+	page_cache_release(vmpage);	/* pairs with get in vvp_page_init() */
+}
+
+static void vvp_page_fini(const struct lu_env *env,
+			  struct cl_page_slice *slice)
+{
+	struct ccc_page *cp = cl2ccc_page(slice);
+	struct page *vmpage  = cp->cpg_page;
+
+	/*
+	 * vmpage->private was already cleared (vvp_page_delete() zeroes it)
+	 * when page was moved into VPG_FREEING state.
+	 */
+	LASSERT((struct cl_page *)vmpage->private != slice->cpl_page);
+	vvp_page_fini_common(cp);
+}
+
+static int vvp_page_own(const struct lu_env *env,
+			const struct cl_page_slice *slice, struct cl_io *io,
+			int nonblock)
+{
+	struct page *vmpage = cl2ccc_page(slice)->cpg_page;
+
+	LASSERT(vmpage != NULL);
+	if (!nonblock) {
+		/* blocking mode: sleep for the lock, then for writeback */
+		lock_page(vmpage);
+		wait_on_page_writeback(vmpage);
+		return 0;
+	}
+
+	/* non-blocking mode: give up with -EAGAIN rather than sleep */
+	if (!trylock_page(vmpage))
+		return -EAGAIN;
+
+	if (likely(!PageWriteback(vmpage)))
+		return 0;
+	/* still under writeback: release the lock we just took */
+	unlock_page(vmpage);
+	return -EAGAIN;
+}
+
+static void vvp_page_assume(const struct lu_env *env,
+			    const struct cl_page_slice *slice,
+			    struct cl_io *unused)
+{
+	struct page *vmpage = cl2vm_page(slice);
+
+	LASSERT(vmpage != NULL);
+	LASSERT(PageLocked(vmpage));
+	wait_on_page_writeback(vmpage);	/* the lock itself is already held */
+}
+
+static void vvp_page_unassume(const struct lu_env *env,
+			      const struct cl_page_slice *slice,
+			      struct cl_io *unused)
+{
+	struct page *vmpage = cl2vm_page(slice);
+
+	LASSERT(vmpage != NULL);
+	LASSERT(PageLocked(vmpage));	/* check only: the lock is kept */
+}
+
+static void vvp_page_disown(const struct lu_env *env,
+			    const struct cl_page_slice *slice, struct cl_io *io)
+{
+	struct page *vmpage = cl2vm_page(slice);
+
+	LASSERT(vmpage != NULL);
+	LASSERT(PageLocked(vmpage));
+
+	unlock_page(cl2vm_page(slice));	/* same lookup as vmpage above */
+}
+
+static void vvp_page_discard(const struct lu_env *env,
+			     const struct cl_page_slice *slice,
+			     struct cl_io *unused)
+{
+	struct page	   *vmpage  = cl2vm_page(slice);
+	struct address_space *mapping;
+	struct ccc_page      *cpg     = cl2ccc_page(slice);
+
+	LASSERT(vmpage != NULL);
+	LASSERT(PageLocked(vmpage));
+
+	mapping = vmpage->mapping;
+	/* NOTE(review): looks like "read ahead but never used" -- confirm */
+	if (cpg->cpg_defer_uptodate && !cpg->cpg_ra_used)
+		ll_ra_stats_inc(mapping, RA_STAT_DISCARDED);
+
+	/*
+	 * truncate_complete_page() calls
+	 * a_ops->invalidatepage()->cl_page_delete()->vvp_page_delete().
+	 */
+	truncate_complete_page(mapping, vmpage);
+}
+
+static int vvp_page_unmap(const struct lu_env *env,
+			  const struct cl_page_slice *slice,
+			  struct cl_io *unused)
+{
+	struct page *vmpage = cl2vm_page(slice);
+	__u64       offset;
+
+	LASSERT(vmpage != NULL);
+	LASSERT(PageLocked(vmpage));
+	/* byte offset of the page; widen first, index is unsigned long */
+	offset = (__u64)vmpage->index << PAGE_CACHE_SHIFT;
+
+	/*
+	 * XXX is it safe to call this with the page lock held?
+	 */
+	ll_teardown_mmaps(vmpage->mapping, offset, offset + PAGE_CACHE_SIZE);
+	return 0;
+}
+
+static void vvp_page_delete(const struct lu_env *env,
+			    const struct cl_page_slice *slice)
+{
+	struct page       *vmpage = cl2vm_page(slice);
+	struct inode     *inode  = vmpage->mapping->host;
+	struct cl_object *obj    = slice->cpl_obj;
+
+	LASSERT(PageLocked(vmpage));
+	LASSERT((struct cl_page *)vmpage->private == slice->cpl_page);
+	LASSERT(inode == ccc_object_inode(obj));
+
+	vvp_write_complete(cl2ccc(obj), cl2ccc_page(slice));
+	ClearPagePrivate(vmpage);
+	vmpage->private = 0;	/* no longer points at slice->cpl_page */
+	/*
+	 * Reference from vmpage to cl_page is removed, but the reference back
+	 * is still here. It is removed later in vvp_page_fini().
+	 */
+}
+
+static void vvp_page_export(const struct lu_env *env,
+			    const struct cl_page_slice *slice,
+			    int uptodate)
+{
+	struct page *vmpage = cl2vm_page(slice);
+
+	LASSERT(vmpage != NULL);
+	LASSERT(PageLocked(vmpage));
+	if (uptodate)	/* mirror the flag into the VM page's uptodate bit */
+		SetPageUptodate(vmpage);
+	else
+		ClearPageUptodate(vmpage);
+}
+
+static int vvp_page_is_vmlocked(const struct lu_env *env,
+				const struct cl_page_slice *slice)
+{	/* -EBUSY while the VM page is locked, -ENODATA otherwise */
+	return PageLocked(cl2vm_page(slice)) ? -EBUSY : -ENODATA;
+}
+
+static int vvp_page_prep_read(const struct lu_env *env,
+			      const struct cl_page_slice *slice,
+			      struct cl_io *unused)
+{
+	ENTRY;
+	/* A page already marked PG_uptodate is skipped with -EALREADY. */
+	RETURN(PageUptodate(cl2vm_page(slice)) ? -EALREADY : 0);
+}
+
+static int vvp_page_prep_write(const struct lu_env *env,
+			       const struct cl_page_slice *slice,
+			       struct cl_io *unused)
+{
+	struct page *vmpage = cl2vm_page(slice);
+
+	LASSERT(PageLocked(vmpage));
+	LASSERT(!PageDirty(vmpage));
+	/* the page is locked and clean here; put it under writeback */
+	set_page_writeback(vmpage);
+	vvp_write_pending(cl2ccc(slice->cpl_obj), cl2ccc_page(slice));
+
+	return 0;
+}
+
+/**
+ * Handles page transfer errors at VM level.
+ *
+ * This takes inode as a separate argument, because inode on which error is to
+ * be set can be different from \a vmpage inode in case of direct-io.
+ */
+static void vvp_vmpage_error(struct inode *inode, struct page *vmpage, int ioret)
+{
+	struct ccc_object *obj = cl_inode2ccc(inode);
+
+	if (ioret == 0) {
+		ClearPageError(vmpage);
+		obj->cob_discard_page_warned = 0;	/* re-arm the warning */
+	} else {
+		SetPageError(vmpage);
+		if (ioret == -ENOSPC)
+			set_bit(AS_ENOSPC, &inode->i_mapping->flags);
+		else
+			set_bit(AS_EIO, &inode->i_mapping->flags);
+
+		if ((ioret == -ESHUTDOWN || ioret == -EINTR) &&
+		     obj->cob_discard_page_warned == 0) {
+			obj->cob_discard_page_warned = 1;	/* warn once */
+			ll_dirty_page_discard_warn(vmpage, ioret);
+		}
+	}
+}
+
+static void vvp_page_completion_read(const struct lu_env *env,
+				     const struct cl_page_slice *slice,
+				     int ioret)
+{
+	struct ccc_page *cp     = cl2ccc_page(slice);
+	struct page      *vmpage = cp->cpg_page;
+	struct cl_page  *page   = cl_page_top(slice->cpl_page);
+	struct inode    *inode  = ccc_object_inode(page->cp_obj);
+	ENTRY;
+
+	LASSERT(PageLocked(vmpage));
+	CL_PAGE_HEADER(D_PAGE, env, page, "completing READ with %d\n", ioret);
+
+	if (cp->cpg_defer_uptodate)
+		ll_ra_count_put(ll_i2sbi(inode), 1);
+
+	if (ioret == 0)  {
+		if (!cp->cpg_defer_uptodate)
+			cl_page_export(env, page, 1);
+	} else
+		cp->cpg_defer_uptodate = 0;
+	/* with cp_sync_io set the page is left locked by this function */
+	if (page->cp_sync_io == NULL)
+		unlock_page(vmpage);
+
+	EXIT;
+}
+
+static void vvp_page_completion_write(const struct lu_env *env,
+				      const struct cl_page_slice *slice,
+				      int ioret)
+{
+	struct ccc_page *cp     = cl2ccc_page(slice);
+	struct cl_page  *pg     = slice->cpl_page;
+	struct page      *vmpage = cp->cpg_page;
+	ENTRY;
+
+	LASSERT(ergo(pg->cp_sync_io != NULL, PageLocked(vmpage)));
+	LASSERT(PageWriteback(vmpage));
+
+	CL_PAGE_HEADER(D_PAGE, env, pg, "completing WRITE with %d\n", ioret);
+
+	/*
+	 * TODO: Actually it makes sense to add the page into oap pending
+	 * list again and so that we don't need to take the page out from
+	 * SoM write pending list, if we just meet a recoverable error,
+	 * -ENOMEM, etc.
+	 * To implement this, we just need to return a non zero value in
+	 * ->cpo_completion method. The underlying transfer should be notified
+	 * and then re-add the page into pending transfer queue.  -jay
+	 */
+
+	cp->cpg_write_queued = 0;
+	vvp_write_complete(cl2ccc(slice->cpl_obj), cp);
+
+	/*
+	 * Mark the page error only when it's an async write, because
+	 * applications won't wait for IO to finish.
+	 */
+	if (pg->cp_sync_io == NULL)
+		vvp_vmpage_error(ccc_object_inode(pg->cp_obj), vmpage, ioret);
+
+	end_page_writeback(vmpage);
+	EXIT;
+}
+
+/**
+ * Implements cl_page_operations::cpo_make_ready() method.
+ *
+ * This is called to yank a page from the transfer cache and to send it out as
+ * a part of transfer. The page is taken with lock_page(), so this may wait
+ * for a concurrent lock holder. A dirty page has its dirty bit cleared, is
+ * put under writeback and is passed to vvp_write_pending(). Any state other
+ * than "dirty" or CPS_PAGEOUT is treated as fatal (LBUG).
+ *
+ * \retval 0      success, page can be placed into transfer
+ *
+ * \retval -EALREADY page was not dirty and is in CPS_PAGEOUT state
+ * (presumably made ready by another path already). Skip it.
+ */
+static int vvp_page_make_ready(const struct lu_env *env,
+			       const struct cl_page_slice *slice)
+{
+	struct page *vmpage = cl2vm_page(slice);
+	struct cl_page *pg = slice->cpl_page;
+	int result = 0;
+
+	lock_page(vmpage);
+	if (clear_page_dirty_for_io(vmpage)) {
+		LASSERT(pg->cp_state == CPS_CACHED);
+		/* This actually clears the dirty bit in the radix
+		 * tree. */
+		set_page_writeback(vmpage);
+		vvp_write_pending(cl2ccc(slice->cpl_obj),
+				cl2ccc_page(slice));
+		CL_PAGE_HEADER(D_PAGE, env, pg, "readied\n");
+	} else if (pg->cp_state == CPS_PAGEOUT) {
+		/* is it possible for osc_flush_async_page() to already
+		 * make it ready? */
+		result = -EALREADY;
+	} else {
+		CL_PAGE_DEBUG(D_ERROR, env, pg, "Unexpecting page state %d.\n",
+			      pg->cp_state);
+		LBUG();
+	}
+	unlock_page(vmpage);
+	RETURN(result);
+}
+
+static int vvp_page_print(const struct lu_env *env,
+			  const struct cl_page_slice *slice,
+			  void *cookie, lu_printer_t printer)
+{
+	struct ccc_page *vp = cl2ccc_page(slice);
+	struct page      *vmpage = vp->cpg_page;
+	/* cpg_defer_uptodate:cpg_ra_used:cpg_write_queued, then the VM page */
+	(*printer)(env, cookie, LUSTRE_VVP_NAME"-page@%p(%d:%d:%d) "
+		   "vm@%p ",
+		   vp, vp->cpg_defer_uptodate, vp->cpg_ra_used,
+		   vp->cpg_write_queued, vmpage);
+	if (vmpage != NULL) {
+		(*printer)(env, cookie, "%lx %d:%d %lx %lu %slru",
+			   (long)vmpage->flags, page_count(vmpage),
+			   page_mapcount(vmpage), vmpage->private,
+			   page_index(vmpage),
+			   list_empty(&vmpage->lru) ? "not-" : "");
+	}
+	(*printer)(env, cookie, "\n");
+	return 0;
+}
+
+static const struct cl_page_operations vvp_page_ops = {
+	.cpo_own	   = vvp_page_own,
+	.cpo_assume	= vvp_page_assume,
+	.cpo_unassume      = vvp_page_unassume,
+	.cpo_disown	= vvp_page_disown,
+	.cpo_vmpage	= ccc_page_vmpage,
+	.cpo_discard       = vvp_page_discard,
+	.cpo_delete	= vvp_page_delete,
+	.cpo_unmap	 = vvp_page_unmap,
+	.cpo_export	= vvp_page_export,
+	.cpo_is_vmlocked   = vvp_page_is_vmlocked,
+	.cpo_fini	  = vvp_page_fini,
+	.cpo_print	 = vvp_page_print,
+	.cpo_is_under_lock = ccc_page_is_under_lock,
+	.io = {
+		[CRT_READ] = {
+			.cpo_prep	= vvp_page_prep_read,
+			.cpo_completion  = vvp_page_completion_read,
+			.cpo_make_ready  = ccc_fail,
+		},
+		[CRT_WRITE] = {
+			.cpo_prep	= vvp_page_prep_write,
+			.cpo_completion  = vvp_page_completion_write,
+			.cpo_make_ready  = vvp_page_make_ready,
+		}
+	}
+};	/* installed by vvp_page_init() for CPT_CACHEABLE pages */
+
+static void vvp_transient_page_verify(const struct cl_page *page)
+{
+	struct inode *inode = ccc_object_inode(page->cp_obj);
+	/* a failing trylock is taken as proof that i_mutex is held */
+	LASSERT(!mutex_trylock(&inode->i_mutex));
+}
+
+static int vvp_transient_page_own(const struct lu_env *env,
+				  const struct cl_page_slice *slice,
+				  struct cl_io *unused, int nonblock)
+{
+	vvp_transient_page_verify(slice->cpl_page);
+	return 0;	/* nothing is locked here; nonblock is not consulted */
+}
+
+static void vvp_transient_page_assume(const struct lu_env *env,
+				      const struct cl_page_slice *slice,
+				      struct cl_io *unused)
+{
+	vvp_transient_page_verify(slice->cpl_page);	/* check only */
+}
+
+static void vvp_transient_page_unassume(const struct lu_env *env,
+					const struct cl_page_slice *slice,
+					struct cl_io *unused)
+{
+	vvp_transient_page_verify(slice->cpl_page);	/* check only */
+}
+
+static void vvp_transient_page_disown(const struct lu_env *env,
+				      const struct cl_page_slice *slice,
+				      struct cl_io *unused)
+{
+	vvp_transient_page_verify(slice->cpl_page);	/* check only */
+}
+
+static void vvp_transient_page_discard(const struct lu_env *env,
+				       const struct cl_page_slice *slice,
+				       struct cl_io *unused)
+{
+	struct cl_page *page = slice->cpl_page;
+
+	vvp_transient_page_verify(slice->cpl_page);
+
+	/*
+	 * For transient pages, remove it from the radix tree.
+	 */
+	cl_page_delete(env, page);	/* page is slice->cpl_page */
+}
+
+static int vvp_transient_page_is_vmlocked(const struct lu_env *env,
+					  const struct cl_page_slice *slice)
+{
+	struct mutex *lock = &ccc_object_inode(slice->cpl_obj)->i_mutex;
+
+	/* probe i_mutex without keeping it: already held means "locked" */
+	if (!mutex_trylock(lock))
+		return -EBUSY;
+	mutex_unlock(lock);
+	return -ENODATA;
+}
+
+static void
+vvp_transient_page_completion(const struct lu_env *env,
+			      const struct cl_page_slice *slice,
+			      int ioret)
+{
+	vvp_transient_page_verify(slice->cpl_page);	/* ioret is unused */
+}
+
+static void vvp_transient_page_fini(const struct lu_env *env,
+				    struct cl_page_slice *slice)
+{
+	struct ccc_page *cp = cl2ccc_page(slice);
+	struct cl_page *clp = slice->cpl_page;
+	struct ccc_object *clobj = cl2ccc(clp->cp_obj);
+
+	vvp_page_fini_common(cp);
+	LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex));
+	clobj->cob_transient_pages--;	/* undoes ++ in vvp_page_init() */
+}
+
+static const struct cl_page_operations vvp_transient_page_ops = {
+	.cpo_own	   = vvp_transient_page_own,
+	.cpo_assume	= vvp_transient_page_assume,
+	.cpo_unassume      = vvp_transient_page_unassume,
+	.cpo_disown	= vvp_transient_page_disown,
+	.cpo_discard       = vvp_transient_page_discard,
+	.cpo_vmpage	= ccc_page_vmpage,
+	.cpo_fini	  = vvp_transient_page_fini,
+	.cpo_is_vmlocked   = vvp_transient_page_is_vmlocked,
+	.cpo_print	 = vvp_page_print,
+	.cpo_is_under_lock = ccc_page_is_under_lock,
+	.io = {
+		[CRT_READ] = {
+			.cpo_prep	= ccc_transient_page_prep,
+			.cpo_completion  = vvp_transient_page_completion,
+		},
+		[CRT_WRITE] = {
+			.cpo_prep	= ccc_transient_page_prep,
+			.cpo_completion  = vvp_transient_page_completion,
+		}
+	}
+};	/* installed by vvp_page_init() for non-CPT_CACHEABLE pages */
+
+int vvp_page_init(const struct lu_env *env, struct cl_object *obj,
+		struct cl_page *page, struct page *vmpage)
+{
+	struct ccc_page *cpg = cl_object_page_slice(obj, page);
+
+	CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+	/* the slice keeps its own reference on the VM page it points at */
+	cpg->cpg_page = vmpage;
+	page_cache_get(vmpage);
+
+	INIT_LIST_HEAD(&cpg->cpg_pending_linkage);
+	if (page->cp_type == CPT_CACHEABLE) {
+		SetPagePrivate(vmpage);
+		vmpage->private = (unsigned long)page;
+		cl_page_slice_add(page, &cpg->cpg_cl, obj,
+				&vvp_page_ops);
+	} else {
+		struct ccc_object *clobj = cl2ccc(obj);
+
+		LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex));
+		cl_page_slice_add(page, &cpg->cpg_cl, obj,
+				&vvp_transient_page_ops);
+		clobj->cob_transient_pages++;
+	}
+	return 0;
+}
diff --git a/drivers/staging/lustre/lustre/llite/xattr.c b/drivers/staging/lustre/lustre/llite/xattr.c
new file mode 100644
index 000000000000..4176264984bb
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/xattr.c
@@ -0,0 +1,578 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/selinux.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <obd_support.h>
+#include <lustre_lite.h>
+#include <lustre_dlm.h>
+#include <lustre_ver.h>
+#include <lustre_eacl.h>
+
+#include "llite_internal.h"
+
+#define XATTR_USER_T	    (1)	/* name starts with XATTR_USER_PREFIX */
+#define XATTR_TRUSTED_T	 (2)	/* name starts with XATTR_TRUSTED_PREFIX */
+#define XATTR_SECURITY_T	(3)	/* name starts with XATTR_SECURITY_PREFIX */
+#define XATTR_ACL_ACCESS_T      (4)	/* name is POSIX_ACL_XATTR_ACCESS */
+#define XATTR_ACL_DEFAULT_T     (5)	/* name is POSIX_ACL_XATTR_DEFAULT */
+#define XATTR_LUSTRE_T	  (6)	/* name starts with XATTR_LUSTRE_PREFIX */
+#define XATTR_OTHER_T	   (7)	/* none of the above */
+
+static
+int get_xattr_type(const char *name)
+{
+	/* The two POSIX ACL attributes are recognised by their full name. */
+	if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0)
+		return XATTR_ACL_ACCESS_T;
+	if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0)
+		return XATTR_ACL_DEFAULT_T;
+
+	/*
+	 * Everything else is classified by namespace prefix alone; a name
+	 * matching none of the prefixes is XATTR_OTHER_T.
+	 */
+	if (strncmp(name, XATTR_USER_PREFIX,
+		    sizeof(XATTR_USER_PREFIX) - 1) == 0)
+		return XATTR_USER_T;
+	if (strncmp(name, XATTR_TRUSTED_PREFIX,
+		    sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0)
+		return XATTR_TRUSTED_T;
+	if (strncmp(name, XATTR_SECURITY_PREFIX,
+		    sizeof(XATTR_SECURITY_PREFIX) - 1) == 0)
+		return XATTR_SECURITY_T;
+	if (strncmp(name, XATTR_LUSTRE_PREFIX,
+		    sizeof(XATTR_LUSTRE_PREFIX) - 1) == 0)
+		return XATTR_LUSTRE_T;
+	return XATTR_OTHER_T;
+}
+
+static
+int xattr_type_filter(struct ll_sb_info *sbi, int xattr_type)
+{
+	switch (xattr_type) {
+	case XATTR_ACL_ACCESS_T:	/* both ACL kinds need LL_SBI_ACL */
+	case XATTR_ACL_DEFAULT_T:
+		return (sbi->ll_flags & LL_SBI_ACL) ? 0 : -EOPNOTSUPP;
+	case XATTR_USER_T:
+		return (sbi->ll_flags & LL_SBI_USER_XATTR) ? 0 : -EOPNOTSUPP;
+	case XATTR_TRUSTED_T:
+		return cfs_capable(CFS_CAP_SYS_ADMIN) ? 0 : -EPERM;
+	case XATTR_OTHER_T:	/* unrecognised namespace */
+		return -EOPNOTSUPP;
+	default:
+		return 0;
+	}
+}
+
+static
+int ll_setxattr_common(struct inode *inode, const char *name,
+		       const void *value, size_t size,
+		       int flags, __u64 valid)
+{
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct ptlrpc_request *req;
+	int xattr_type, rc;
+	struct obd_capa *oc;
+	posix_acl_xattr_header *new_value = NULL;
+	struct rmtacl_ctl_entry *rce = NULL;
+	ext_acl_xattr_header *acl = NULL;
+	const char *pv = value;
+	ENTRY;
+	/* common backend of ll_setxattr() and ll_removexattr() */
+	xattr_type = get_xattr_type(name);
+	rc = xattr_type_filter(sbi, xattr_type);
+	if (rc)
+		RETURN(rc);
+
+	/* b10667: ignore lustre special xattr for now */
+	if ((xattr_type == XATTR_TRUSTED_T && strcmp(name, "trusted.lov") == 0) ||
+	    (xattr_type == XATTR_LUSTRE_T && strcmp(name, "lustre.lov") == 0))
+		RETURN(0);
+
+	/* b15587: ignore security.capability xattr for now */
+	if ((xattr_type == XATTR_SECURITY_T &&
+	    strcmp(name, "security.capability") == 0))
+		RETURN(0);
+
+	/* LU-549:  Disable security.selinux when selinux is disabled */
+	if (xattr_type == XATTR_SECURITY_T && !selinux_is_enabled() &&
+	    strcmp(name, "security.selinux") == 0)
+		RETURN(-EOPNOTSUPP);
+
+#ifdef CONFIG_FS_POSIX_ACL
+	if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
+	    (xattr_type == XATTR_ACL_ACCESS_T ||
+	    xattr_type == XATTR_ACL_DEFAULT_T)) {
+		rce = rct_search(&sbi->ll_rct, current_pid());
+		if (rce == NULL ||
+		    (rce->rce_ops != RMT_LSETFACL &&
+		    rce->rce_ops != RMT_RSETFACL))
+			RETURN(-EOPNOTSUPP);
+
+		if (rce->rce_ops == RMT_LSETFACL) {
+			struct eacl_entry *ee;
+
+			ee = et_search_del(&sbi->ll_et, current_pid(),
+					   ll_inode2fid(inode), xattr_type);
+			LASSERT(ee != NULL);
+			if (valid & OBD_MD_FLXATTR) {
+				acl = lustre_acl_xattr_merge2ext(
+						(posix_acl_xattr_header *)value,
+						size, ee->ee_acl);
+				if (IS_ERR(acl)) {
+					ee_free(ee);
+					RETURN(PTR_ERR(acl));
+				}
+				size =  CFS_ACL_XATTR_SIZE(\
+						le32_to_cpu(acl->a_count), \
+						ext_acl_xattr);
+				pv = (const char *)acl;
+			}
+			ee_free(ee);
+		} else if (rce->rce_ops == RMT_RSETFACL) {
+			rc = lustre_posix_acl_xattr_filter(
+						(posix_acl_xattr_header *)value,
+						size, &new_value);
+			if (unlikely(rc < 0))
+				RETURN(rc);
+			size = rc;	/* signed check first: size is unsigned */
+			pv = (const char *)new_value;
+		} else
+			RETURN(-EOPNOTSUPP);
+
+		valid |= rce_ops2valid(rce->rce_ops);
+	}
+#endif
+	oc = ll_mdscapa_get(inode);
+	rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
+			 valid, name, pv, size, 0, flags, ll_i2suppgid(inode),
+			 &req);
+	capa_put(oc);
+#ifdef CONFIG_FS_POSIX_ACL
+	if (new_value != NULL)
+		lustre_posix_acl_xattr_free(new_value, size);
+	if (acl != NULL)
+		lustre_ext_acl_xattr_free(acl);
+#endif
+	if (rc) {
+		if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) {
+			LCONSOLE_INFO("Disabling user_xattr feature because "
+				      "it is not supported on the server\n");
+			sbi->ll_flags &= ~LL_SBI_USER_XATTR;
+		}
+		RETURN(rc);
+	}
+
+	ptlrpc_req_finished(req);
+	RETURN(0);
+}
+
+int ll_setxattr(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags)
+{
+	struct inode *inode = dentry->d_inode;
+
+	LASSERT(inode);
+	LASSERT(name);
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n",
+	       inode->i_ino, inode->i_generation, inode, name);
+	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_SETXATTR, 1);
+
+	if ((strncmp(name, XATTR_TRUSTED_PREFIX,
+		     sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0 &&
+	     strcmp(name + sizeof(XATTR_TRUSTED_PREFIX) - 1, "lov") == 0) ||
+	    (strncmp(name, XATTR_LUSTRE_PREFIX,
+		     sizeof(XATTR_LUSTRE_PREFIX) - 1) == 0 &&
+	     strcmp(name + sizeof(XATTR_LUSTRE_PREFIX) - 1, "lov") == 0)) {
+		struct lov_user_md *lump = (struct lov_user_md *)value;
+		int rc = 0;
+
+		/* never read fields beyond what the caller supplied */
+		if (lump != NULL && size < sizeof(*lump))
+			return -EINVAL;
+		/* Attributes that are saved via getxattr will always have
+		 * the stripe_offset as 0.  Instead, the MDS should be
+		 * allowed to pick the starting OST index.   b=17846 */
+		if (lump != NULL && lump->lmm_stripe_offset == 0)
+			lump->lmm_stripe_offset = -1;
+
+		if (lump != NULL && S_ISREG(inode->i_mode)) {
+			struct file f;
+			int flags = FMODE_WRITE;
+			int lum_size = (lump->lmm_magic == LOV_USER_MAGIC_V1) ?
+				sizeof(*lump) : sizeof(struct lov_user_md_v3);
+
+			f.f_dentry = dentry;
+			/* b10667: rc stays 0; short values are skipped */
+			if (size >= lum_size)
+				ll_lov_setstripe_ea_info(inode, &f, flags,
+							 lump, lum_size);
+		} else if (S_ISDIR(inode->i_mode)) {
+			rc = ll_dir_setstripe(inode, lump, 0);
+		}
+
+		return rc;
+	} else if (strcmp(name, XATTR_NAME_LMA) == 0 ||
+		   strcmp(name, XATTR_NAME_LINK) == 0)
+		return 0;
+
+	return ll_setxattr_common(inode, name, value, size, flags,
+				  OBD_MD_FLXATTR);
+}
+
+int ll_removexattr(struct dentry *dentry, const char *name)
+{
+	struct inode *inode = dentry->d_inode;
+
+	LASSERT(inode);
+	LASSERT(name);
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n",
+	       inode->i_ino, inode->i_generation, inode, name);
+	/* removal reuses the set path: no value, OBD_MD_FLXATTRRM instead */
+	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REMOVEXATTR, 1);
+	return ll_setxattr_common(inode, name, NULL, 0, 0,
+				  OBD_MD_FLXATTRRM);
+}
+
+static
+int ll_getxattr_common(struct inode *inode, const char *name,
+		       void *buffer, size_t size, __u64 valid)
+{
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct ptlrpc_request *req = NULL;
+	struct mdt_body *body;
+	int xattr_type, rc;
+	void *xdata;
+	struct obd_capa *oc;
+	struct rmtacl_ctl_entry *rce = NULL;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n",
+	       inode->i_ino, inode->i_generation, inode);
+
+	/* listxattr behaves slightly differently from ext3:
+	 * without 'user_xattr' ext3 will list all xattr names but
+	 * filter out "^user..*"; we list them all for simplicity.
+	 */
+	if (!name) {
+		xattr_type = XATTR_OTHER_T;
+		goto do_getxattr;
+	}
+
+	xattr_type = get_xattr_type(name);
+	rc = xattr_type_filter(sbi, xattr_type);
+	if (rc)
+		RETURN(rc);
+
+	/* b15587: ignore security.capability xattr for now */
+	if ((xattr_type == XATTR_SECURITY_T &&
+	    strcmp(name, "security.capability") == 0))
+		RETURN(-ENODATA);
+
+	/* LU-549:  Disable security.selinux when selinux is disabled */
+	if (xattr_type == XATTR_SECURITY_T && !selinux_is_enabled() &&
+	    strcmp(name, "security.selinux") == 0)
+		RETURN(-EOPNOTSUPP);
+
+#ifdef CONFIG_FS_POSIX_ACL
+	if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
+	    (xattr_type == XATTR_ACL_ACCESS_T ||
+	    xattr_type == XATTR_ACL_DEFAULT_T)) {
+		rce = rct_search(&sbi->ll_rct, current_pid());
+		if (rce == NULL ||
+		    (rce->rce_ops != RMT_LSETFACL &&
+		    rce->rce_ops != RMT_LGETFACL &&
+		    rce->rce_ops != RMT_RSETFACL &&
+		    rce->rce_ops != RMT_RGETFACL))
+			RETURN(-EOPNOTSUPP);
+	}
+
+	/* posix acl is under protection of LOOKUP lock. when calling to this,
+	 * we just have path resolution to the target inode, so we have great
+	 * chance that cached ACL is uptodate.
+	 */
+	if (xattr_type == XATTR_ACL_ACCESS_T &&
+	    !(sbi->ll_flags & LL_SBI_RMT_CLIENT)) {
+		struct ll_inode_info *lli = ll_i2info(inode);
+		struct posix_acl *acl;
+
+		spin_lock(&lli->lli_lock);
+		acl = posix_acl_dup(lli->lli_posix_acl);
+		spin_unlock(&lli->lli_lock);
+
+		if (!acl)
+			RETURN(-ENODATA);
+
+		rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
+		posix_acl_release(acl);
+		RETURN(rc);
+	}
+	if (xattr_type == XATTR_ACL_DEFAULT_T && !S_ISDIR(inode->i_mode))
+		RETURN(-ENODATA);
+#endif
+
+do_getxattr:
+	oc = ll_mdscapa_get(inode);
+	rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
+			 valid | (rce ? rce_ops2valid(rce->rce_ops) : 0),
+			 name, NULL, 0, size, 0, &req);
+	capa_put(oc);
+	if (rc) {
+		if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) {
+			LCONSOLE_INFO("Disabling user_xattr feature because "
+				      "it is not supported on the server\n");
+			sbi->ll_flags &= ~LL_SBI_USER_XATTR;
+		}
+		RETURN(rc);
+	}
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	LASSERT(body);
+
+	/* only detect the xattr size */
+	if (size == 0)
+		GOTO(out, rc = body->eadatasize);
+
+	if (size < body->eadatasize) {
+		CERROR("server bug: replied size %u > %u\n",
+		       body->eadatasize, (int)size);
+		GOTO(out, rc = -ERANGE);
+	}
+
+	if (body->eadatasize == 0)
+		GOTO(out, rc = -ENODATA);
+
+	/* do not need swab xattr data */
+	xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA,
+					     body->eadatasize);
+	if (!xdata)
+		GOTO(out, rc = -EFAULT);
+
+#ifdef CONFIG_FS_POSIX_ACL
+	if (body->eadatasize >= 0 && rce && rce->rce_ops == RMT_LSETFACL) {
+		ext_acl_xattr_header *acl;
+
+		acl = lustre_posix_acl_xattr_2ext((posix_acl_xattr_header *)xdata,
+						  body->eadatasize);
+		if (IS_ERR(acl))
+			GOTO(out, rc = PTR_ERR(acl));
+
+		rc = ee_add(&sbi->ll_et, current_pid(), ll_inode2fid(inode),
+			    xattr_type, acl);
+		if (unlikely(rc < 0)) {
+			lustre_ext_acl_xattr_free(acl);
+			GOTO(out, rc);
+		}
+	}
+#endif
+	/* NOTE(review): eadatasize == 0 already left via "out" above */
+	if (body->eadatasize == 0) {
+		rc = -ENODATA;
+	} else {
+		LASSERT(buffer);
+		memcpy(buffer, xdata, body->eadatasize);
+		rc = body->eadatasize;
+	}
+	EXIT;
+out:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+ssize_t ll_getxattr(struct dentry *dentry, const char *name,
+		    void *buffer, size_t size)
+{
+	struct inode *inode = dentry->d_inode;
+
+	LASSERT(inode);
+	LASSERT(name);
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n",
+	       inode->i_ino, inode->i_generation, inode, name);
+
+	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, 1);
+
+	if ((strncmp(name, XATTR_TRUSTED_PREFIX,
+		     sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0 &&
+	     strcmp(name + sizeof(XATTR_TRUSTED_PREFIX) - 1, "lov") == 0) ||
+	    (strncmp(name, XATTR_LUSTRE_PREFIX,
+		     sizeof(XATTR_LUSTRE_PREFIX) - 1) == 0 &&
+	     strcmp(name + sizeof(XATTR_LUSTRE_PREFIX) - 1, "lov") == 0)) {
+		struct lov_stripe_md *lsm;
+		struct lov_user_md *lump;
+		struct lov_mds_md *lmm = NULL;
+		struct ptlrpc_request *request = NULL;
+		int rc = 0, lmmsize = 0;
+
+		if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+			return -ENODATA;
+
+		if (size == 0 && S_ISDIR(inode->i_mode)) {
+			/* XXX directory EA is fix for now, optimize to save
+			 * RPC transfer */
+			GOTO(out, rc = sizeof(struct lov_user_md));
+		}
+		/* with an lsm: pack it; without: only directories fall back */
+		lsm = ccc_inode_lsm_get(inode);
+		if (lsm == NULL) {
+			if (S_ISDIR(inode->i_mode)) {
+				rc = ll_dir_getstripe(inode, &lmm,
+						      &lmmsize, &request);
+			} else {
+				rc = -ENODATA;
+			}
+		} else {
+			/* LSM is present already after lookup/getattr call.
+			 * we need to grab layout lock once it is implemented */
+			rc = obd_packmd(ll_i2dtexp(inode), &lmm, lsm);
+			lmmsize = rc;
+		}
+		ccc_inode_lsm_put(inode, lsm);
+
+		if (rc < 0)
+		       GOTO(out, rc);
+
+		if (size == 0) {
+			/* used to call ll_get_max_mdsize() forward to get
+			 * the maximum buffer size, while some apps (such as
+			 * rsync 3.0.x) care much about the exact xattr value
+			 * size */
+			rc = lmmsize;
+			GOTO(out, rc);
+		}
+
+		if (size < lmmsize) {
+			CERROR("server bug: replied size %d > %d for %s (%s)\n",
+			       lmmsize, (int)size, dentry->d_name.name, name);
+			GOTO(out, rc = -ERANGE);
+		}
+
+		lump = (struct lov_user_md *)buffer;
+		memcpy(lump, lmm, lmmsize);
+		/* do not return layout gen for getxattr otherwise it would
+		 * confuse tar --xattr by recognizing layout gen as stripe
+		 * offset when the file is restored. See LU-2809. */
+		lump->lmm_layout_gen = 0;
+
+		rc = lmmsize;
+out:
+		if (request)
+			ptlrpc_req_finished(request);
+		else if (lmm)
+			obd_free_diskmd(ll_i2dtexp(inode), &lmm);
+		return(rc);
+	}
+
+	return ll_getxattr_common(inode, name, buffer, size, OBD_MD_FLXATTR);
+}
+
+ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	struct inode *inode = dentry->d_inode;
+	int rc = 0, rc2 = 0;
+	struct lov_mds_md *lmm = NULL;
+	struct ptlrpc_request *request = NULL;
+	int lmmsize;
+
+	LASSERT(inode);
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n",
+	       inode->i_ino, inode->i_generation, inode);
+
+	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LISTXATTR, 1);
+	/* get the raw name list first, then filter it in place below */
+	rc = ll_getxattr_common(inode, NULL, buffer, size, OBD_MD_FLXATTRLS);
+	if (rc < 0)
+		GOTO(out, rc);
+
+	if (buffer != NULL) {
+		struct ll_sb_info *sbi = ll_i2sbi(inode);
+		char *xattr_name = buffer;
+		int xlen, rem = rc;
+
+		while (rem > 0) {
+			xlen = strnlen(xattr_name, rem - 1) + 1;
+			rem -= xlen;
+			if (xattr_type_filter(sbi,
+					get_xattr_type(xattr_name)) == 0) {
+				/* skip OK xattr type
+				 * leave it in buffer
+				 */
+				xattr_name += xlen;
+				continue;
+			}
+			/* move up remaining xattrs in buffer
+			 * removing the xattr that is not OK
+			 */
+			memmove(xattr_name, xattr_name + xlen, rem);
+			rc -= xlen;
+		}
+	}
+	if (S_ISREG(inode->i_mode)) {
+		if (!ll_i2info(inode)->lli_has_smd)
+			rc2 = -1;
+	} else if (S_ISDIR(inode->i_mode)) {
+		rc2 = ll_dir_getstripe(inode, &lmm, &lmmsize, &request);
+	}
+	/* append the XATTR_LUSTRE_PREFIX "lov" name unless rc2 < 0 above */
+	if (rc2 < 0) {
+		GOTO(out, rc2 = 0);
+	} else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) {
+		const int prefix_len = sizeof(XATTR_LUSTRE_PREFIX) - 1;
+		const size_t name_len   = sizeof("lov") - 1;
+		const size_t total_len  = prefix_len + name_len + 1;
+
+		rc2 = total_len;
+		if (buffer != NULL && (rc + total_len) > size) {
+			rc = -ERANGE;	/* the extra name would not fit */
+			rc2 = 0;
+		} else if (buffer != NULL) {
+			buffer += rc;
+			memcpy(buffer, XATTR_LUSTRE_PREFIX, prefix_len);
+			memcpy(buffer + prefix_len, "lov", name_len);
+			buffer[prefix_len + name_len] = '\0';
+		}
+	}
+out:
+	ptlrpc_req_finished(request);
+	return rc + rc2;
+}
diff --git a/drivers/staging/lustre/lustre/lmv/Makefile b/drivers/staging/lustre/lustre/lmv/Makefile
new file mode 100644
index 000000000000..8cc81ade126c
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lmv/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LUSTRE_FS) += lmv.o
+lmv-y := lmv_obd.o lmv_intent.o lmv_fld.o lproc_lmv.o
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/lmv/lmv_fld.c b/drivers/staging/lustre/lustre/lmv/lmv_fld.c
new file mode 100644
index 000000000000..a4805aefa684
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lmv/lmv_fld.c
@@ -0,0 +1,88 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LMV
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <asm/div64.h>
+#include <linux/seq_file.h>
+
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_fid.h>
+#include <lustre_lib.h>
+#include <lustre_net.h>
+#include <lustre_dlm.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include "lmv_internal.h"
+
+/* Resolve the MDS index serving @fid's sequence through the LMV's FLD client.
+ * Stores it in @mds; returns 0, the lookup error, or -EINVAL when the index
+ * is not below lmv->desc.ld_tgt_count. */
+int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid, mdsno_t *mds)
+{
+	int rc;
+	ENTRY;
+
+	/* FIXME: Currently ZFS still use local seq for ROOT unfortunately, and
+	 * this fid_is_local check should be removed once LU-2240 is fixed */
+	LASSERTF((fid_seq_in_fldb(fid_seq(fid)) ||
+		  fid_seq_is_local_file(fid_seq(fid))) &&
+		 fid_is_sane(fid), DFID" is insane!\n", PFID(fid));
+
+	rc = fld_client_lookup(&lmv->lmv_fld, fid_seq(fid), mds,
+			       LU_SEQ_RANGE_MDT, NULL);
+	if (rc != 0) {
+		CERROR("Error while looking for mds number. Seq "LPX64
+		       ", err = %d\n", fid_seq(fid), rc);
+		RETURN(rc);
+	}
+
+	CDEBUG(D_INODE, "FLD lookup got mds #%x for fid="DFID"\n",
+	       *mds, PFID(fid));
+
+	if (*mds < lmv->desc.ld_tgt_count)
+		RETURN(0);
+
+	CERROR("FLD lookup got invalid mds #%x (max: %x) "
+	       "for fid="DFID"\n", *mds, lmv->desc.ld_tgt_count,
+	       PFID(fid));
+	RETURN(-EINVAL);
+}
diff --git a/drivers/staging/lustre/lustre/lmv/lmv_intent.c b/drivers/staging/lustre/lustre/lmv/lmv_intent.c
new file mode 100644
index 000000000000..7eefab5ef5d0
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lmv/lmv_intent.c
@@ -0,0 +1,328 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LMV
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <asm/div64.h>
+#include <linux/seq_file.h>
+#include <linux/namei.h>
+#include <linux/lustre_intent.h>
+
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_lib.h>
+#include <lustre_net.h>
+#include <lustre_dlm.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include "lmv_internal.h"
+
+/* Re-issue intent @it on the MDT owning body->fid1 of the reply in @*reqp.
+ * Except on -EPROTO, the old reply is released and @*reqp is replaced by the
+ * new request (NULL if none was sent). Returns 0 or an error code. */
+static int lmv_intent_remote(struct obd_export *exp, void *lmm,
+			     int lmmsize, struct lookup_intent *it,
+			     const struct lu_fid *parent_fid, int flags,
+			     struct ptlrpc_request **reqp,
+			     ldlm_blocking_callback cb_blocking,
+			     __u64 extra_lock_flags)
+{
+	struct obd_device	*obd = exp->exp_obd;
+	struct lmv_obd		*lmv = &obd->u.lmv;
+	struct ptlrpc_request	*req = NULL;
+	/* zeroed: the cookie is copied back to @it even when no lock was held */
+	struct lustre_handle	plock = { 0 };
+	struct md_op_data	*op_data;
+	struct lmv_tgt_desc	*tgt;
+	struct mdt_body		*body;
+	int			pmode;
+	int			rc = 0;
+	ENTRY;
+
+	body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY);
+	if (body == NULL)
+		RETURN(-EPROTO);
+
+	LASSERT((body->valid & OBD_MD_MDS));
+
+	/* Unfortunately, we have to lie to MDC/MDS to retrieve
+	 * attributes llite needs and provide proper locking. */
+	if (it->it_op & IT_LOOKUP)
+		it->it_op = IT_GETATTR;
+
+	/* We got LOOKUP lock, but we really need attrs. */
+	pmode = it->d.lustre.it_lock_mode;
+	if (pmode) {
+		plock.cookie = it->d.lustre.it_lock_handle;
+		it->d.lustre.it_lock_mode = 0;
+		it->d.lustre.it_data = NULL;
+	}
+
+	LASSERT(fid_is_sane(&body->fid1));
+
+	tgt = lmv_find_target(lmv, &body->fid1);
+	if (IS_ERR(tgt))
+		GOTO(out, rc = PTR_ERR(tgt));
+
+	OBD_ALLOC_PTR(op_data);
+	if (op_data == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	op_data->op_fid1 = body->fid1;
+	/* Sent the parent FID to the remote MDT */
+	if (parent_fid != NULL) {
+		/* The parent fid is only for remote open to
+		 * check whether the open is from OBF,
+		 * see mdt_cross_open */
+		LASSERT(it->it_op & IT_OPEN);
+		op_data->op_fid2 = *parent_fid;
+		/* Add object FID to op_fid3, in case it needs to check stale
+		 * (M_CHECK_STALE), see mdc_finish_intent_lock */
+		op_data->op_fid3 = body->fid1;
+	}
+
+	op_data->op_bias = MDS_CROSS_REF;
+	CDEBUG(D_INODE, "REMOTE_INTENT with fid="DFID" -> mds #%d\n",
+	       PFID(&body->fid1), tgt->ltd_idx);
+
+	it->d.lustre.it_disposition &= ~DISP_ENQ_COMPLETE;
+	rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it,
+			    flags, &req, cb_blocking, extra_lock_flags);
+	if (rc)
+		GOTO(out_free_op_data, rc);
+
+	/*
+	 * LLite needs LOOKUP lock to track dentry revocation in order to
+	 * maintain dcache consistency. Thus drop UPDATE|PERM lock here
+	 * and put LOOKUP in request.
+	 */
+	if (it->d.lustre.it_lock_mode != 0) {
+		it->d.lustre.it_remote_lock_handle =
+					it->d.lustre.it_lock_handle;
+		it->d.lustre.it_remote_lock_mode = it->d.lustre.it_lock_mode;
+	}
+
+	it->d.lustre.it_lock_handle = plock.cookie;
+	it->d.lustre.it_lock_mode = pmode;
+
+	EXIT;
+out_free_op_data:
+	OBD_FREE_PTR(op_data);
+out:
+	if (rc && pmode)
+		ldlm_lock_decref(&plock, pmode);
+
+	ptlrpc_req_finished(*reqp);
+	*reqp = req;
+	return rc;
+}
+
+/*
+ * IT_OPEN handler: open (and possibly create) an object on the MDS picked by
+ * lmv_locate_mds(); a cross-ref reply is followed via lmv_intent_remote().
+ */
+int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data,
+		    void *lmm, int lmmsize, struct lookup_intent *it,
+		    int flags, struct ptlrpc_request **reqp,
+		    ldlm_blocking_callback cb_blocking,
+		    __u64 extra_lock_flags)
+{
+	struct obd_device	*obd = exp->exp_obd;
+	struct lmv_obd		*lmv = &obd->u.lmv;
+	struct lmv_tgt_desc	*tgt;
+	struct mdt_body		*body;
+	int			rc;
+	ENTRY;
+
+	tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	/* If it is ready to open the file by FID, do not need
+	 * allocate FID at all, otherwise it will confuse MDT */
+	if ((it->it_op & IT_CREAT) &&
+	    !(it->it_flags & MDS_OPEN_BY_FID)) {
+		/*
+		 * For open with IT_CREAT allocate a new fid into op_fid2 (the
+		 * previous op_fid2 is kept in op_fid3) and set up FLD for it.
+		 */
+		op_data->op_fid3 = op_data->op_fid2;
+		rc = lmv_fid_alloc(exp, &op_data->op_fid2, op_data);
+		if (rc != 0)
+			RETURN(rc);
+	}
+
+	CDEBUG(D_INODE, "OPEN_INTENT with fid1="DFID", fid2="DFID","
+	       " name='%s' -> mds #%d\n", PFID(&op_data->op_fid1),
+	       PFID(&op_data->op_fid2), op_data->op_name, tgt->ltd_idx);
+
+	rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it, flags,
+			    reqp, cb_blocking, extra_lock_flags);
+	if (rc != 0)
+		RETURN(rc);
+	/*
+	 * Nothing is found, do not access body->fid1 as it is zero and thus
+	 * pointless.
+	 */
+	if ((it->d.lustre.it_disposition & DISP_LOOKUP_NEG) &&
+	    !(it->d.lustre.it_disposition & DISP_OPEN_CREATE) &&
+	    !(it->d.lustre.it_disposition & DISP_OPEN_OPEN))
+		RETURN(rc);
+
+	body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY);
+	if (body == NULL)
+		RETURN(-EPROTO);
+	/*
+	 * Not cross-ref case, just get out of here.
+	 */
+	if (likely(!(body->valid & OBD_MD_MDS)))
+		RETURN(0);
+
+	/*
+	 * Okay, MDS has returned success. Probably name has been resolved in
+	 * remote inode.
+	 */
+	rc = lmv_intent_remote(exp, lmm, lmmsize, it, &op_data->op_fid1, flags,
+			       reqp, cb_blocking, extra_lock_flags);
+	if (rc != 0) {
+		LASSERT(rc < 0);
+		/*
+		 * A userspace application may try to open a file as a
+		 * directory, giving -ENOTDIR here. That is a normal situation,
+		 * so only debug info is printed, not an error.
+		 * NOTE(review): "%*s" pads, it does not bound op_name; confirm.
+		 */
+		CDEBUG(D_INODE, "Can't handle remote %s: dir "DFID"("DFID"):"
+		       "%*s: %d\n", LL_IT2STR(it), PFID(&op_data->op_fid2),
+		       PFID(&op_data->op_fid1), op_data->op_namelen,
+		       op_data->op_name, rc);
+		RETURN(rc);
+	}
+
+	RETURN(rc);
+}
+
+/* Handler for: getattr, lookup and revalidate cases. Locks on the MDS from
+ * lmv_locate_mds(); a cross-ref (OBD_MD_MDS) reply is then resolved through
+ * lmv_intent_remote(). */
+int lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data,
+		      void *lmm, int lmmsize, struct lookup_intent *it,
+		      int flags, struct ptlrpc_request **reqp,
+		      ldlm_blocking_callback cb_blocking,
+		      __u64 extra_lock_flags)
+{
+	struct obd_device      *obd = exp->exp_obd;
+	struct lmv_obd	 *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc    *tgt = NULL;
+	struct mdt_body	*body;
+	int		     rc = 0;
+	ENTRY;
+
+	tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	if (!fid_is_sane(&op_data->op_fid2))
+		fid_zero(&op_data->op_fid2);
+
+	CDEBUG(D_INODE, "LOOKUP_INTENT with fid1="DFID", fid2="DFID
+	       ", name='%s' -> mds #%d\n", PFID(&op_data->op_fid1),
+	       PFID(&op_data->op_fid2),
+	       op_data->op_name ? op_data->op_name : "<NULL>",
+	       tgt->ltd_idx);
+
+	op_data->op_bias &= ~MDS_CROSS_REF;
+
+	rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it,
+			     flags, reqp, cb_blocking, extra_lock_flags);
+	/* an error, or no reply to inspect: return rc as it is */
+	if (rc < 0 || *reqp == NULL)
+		RETURN(rc);
+
+	/*
+	 * MDS has returned success. Probably name has been resolved in
+	 * remote inode. Let's check this.
+	 */
+	body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY);
+	if (body == NULL)
+		RETURN(-EPROTO);
+	/* Not cross-ref case, just get out of here. */
+	if (likely(!(body->valid & OBD_MD_MDS)))
+		RETURN(0);
+
+	rc = lmv_intent_remote(exp, lmm, lmmsize, it, NULL, flags, reqp,
+			       cb_blocking, extra_lock_flags);
+
+	RETURN(rc);
+}
+
+/* Entry point for intent locks: ensure the LMV is connected, then dispatch on
+ * it->it_op to the lookup-style or the open handler; any other op is a bug. */
+int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data,
+		    void *lmm, int lmmsize, struct lookup_intent *it,
+		    int flags, struct ptlrpc_request **reqp,
+		    ldlm_blocking_callback cb_blocking,
+		    __u64 extra_lock_flags)
+{
+	int	rc;
+	ENTRY;
+
+	LASSERT(it != NULL);
+	LASSERT(fid_is_sane(&op_data->op_fid1));
+	CDEBUG(D_INODE, "INTENT LOCK '%s' for '%*s' on "DFID"\n",
+	       LL_IT2STR(it), op_data->op_namelen, op_data->op_name,
+	       PFID(&op_data->op_fid1));
+	rc = lmv_check_connect(exp->exp_obd);
+	if (rc != 0)
+		RETURN(rc);
+
+	if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_LAYOUT)) {
+		rc = lmv_intent_lookup(exp, op_data, lmm, lmmsize, it,
+				       flags, reqp, cb_blocking,
+				       extra_lock_flags);
+		RETURN(rc);
+	}
+	if (!(it->it_op & IT_OPEN))
+		LBUG();
+	rc = lmv_intent_open(exp, op_data, lmm, lmmsize, it,
+			     flags, reqp, cb_blocking,
+			     extra_lock_flags);
+	RETURN(rc);
+}
diff --git a/drivers/staging/lustre/lustre/lmv/lmv_internal.h b/drivers/staging/lustre/lustre/lmv/lmv_internal.h
new file mode 100644
index 000000000000..f75b0a987681
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lmv/lmv_internal.h
@@ -0,0 +1,159 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LMV_INTERNAL_H_
+#define _LMV_INTERNAL_H_
+
+#include <lustre/lustre_idl.h>
+#include <obd.h>
+
+#define LMV_MAX_TGT_COUNT 128
+/* Lock helpers for lmv->init_mutex; expressions, so callers supply the ';'. */
+#define lmv_init_lock(lmv)   mutex_lock(&(lmv)->init_mutex)
+#define lmv_init_unlock(lmv) mutex_unlock(&(lmv)->init_mutex)
+/* Intent op as a string via ldlm_it2str(); "0" when @it is NULL. */
+#define LL_IT2STR(it)					\
+	((it) ? ldlm_it2str((it)->it_op) : "0")
+
+int lmv_check_connect(struct obd_device *obd);
+
+int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data,
+		    void *lmm, int lmmsize, struct lookup_intent *it,
+		    int flags, struct ptlrpc_request **reqp,
+		    ldlm_blocking_callback cb_blocking,
+		    __u64 extra_lock_flags);
+
+int lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data,
+		      void *lmm, int lmmsize, struct lookup_intent *it,
+		      int flags, struct ptlrpc_request **reqp,
+		      ldlm_blocking_callback cb_blocking,
+		      __u64 extra_lock_flags);
+
+int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data,
+		    void *lmm, int lmmsize, struct lookup_intent *it,
+		    int flags, struct ptlrpc_request **reqp,
+		    ldlm_blocking_callback cb_blocking,
+		    __u64 extra_lock_flags);
+
+int lmv_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *,
+		     void *, int);
+int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid,
+		   mdsno_t *mds);
+int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid,
+		    mdsno_t mds);
+int lmv_fid_alloc(struct obd_export *exp, struct lu_fid *fid,
+		  struct md_op_data *op_data);
+
+/* Return the stripe EA (MEA) carried in the reply @req, or NULL when the body
+ * is missing, is not a directory, has no EA data, or the EA has a zero count
+ * or an unrecognised magic. */
+static inline struct lmv_stripe_md *lmv_get_mea(struct ptlrpc_request *req)
+{
+	struct lmv_stripe_md	*mea;
+	struct mdt_body		*body;
+
+	LASSERT(req != NULL);
+	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	if (body == NULL || !S_ISDIR(body->mode) || body->eadatasize == 0)
+		return NULL;
+
+	mea = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD,
+					   body->eadatasize);
+	LASSERT(mea != NULL);
+	if (mea->mea_count == 0)
+		return NULL;
+
+	if (mea->mea_magic == MEA_MAGIC_LAST_CHAR ||
+	    mea->mea_magic == MEA_MAGIC_ALL_CHARS ||
+	    mea->mea_magic == MEA_MAGIC_HASH_SEGMENT)
+		return mea;
+	return NULL;
+}
+
+/* EA size: stripe MD header plus one lu_fid per target in ld_tgt_count. */
+static inline int lmv_get_easize(struct lmv_obd *lmv)
+{
+	return sizeof(struct lmv_stripe_md) +
+		lmv->desc.ld_tgt_count * sizeof(struct lu_fid);
+}
+
+/* Find the target whose ltd_idx equals @mds; ERR_PTR(-ENODEV) if none. */
+static inline struct lmv_tgt_desc *
+lmv_get_target(struct lmv_obd *lmv, mdsno_t mds)
+{
+	int ntgts = lmv->desc.ld_tgt_count;
+	int idx;
+
+	for (idx = 0; idx < ntgts; idx++) {
+		struct lmv_tgt_desc *tgt = lmv->tgts[idx];
+
+		/* unpopulated slots are skipped */
+		if (tgt != NULL && tgt->ltd_idx == mds)
+			return tgt;
+	}
+	return ERR_PTR(-ENODEV);
+}
+
+/* Map @fid to its target; FLD is asked only when there are several targets. */
+static inline struct lmv_tgt_desc *
+lmv_find_target(struct lmv_obd *lmv, const struct lu_fid *fid)
+{
+	mdsno_t mds = 0;
+
+	if (lmv->desc.ld_tgt_count > 1) {
+		int rc = lmv_fld_lookup(lmv, fid, &mds);
+
+		if (rc != 0)
+			return ERR_PTR(rc);
+	}
+	return lmv_get_target(lmv, mds);
+}
+
+struct lmv_tgt_desc
+*lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data,
+		struct lu_fid *fid);
+/* lproc_lmv.c provides this under LPROCFS; otherwise the stub zeroes @lvars */
+#ifdef LPROCFS
+void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars);
+#else
+static inline void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars)
+{
+	memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+extern struct file_operations lmv_proc_target_fops;
+
+#endif
diff --git a/drivers/staging/lustre/lustre/lmv/lmv_obd.c b/drivers/staging/lustre/lustre/lmv/lmv_obd.c
new file mode 100644
index 000000000000..a13eead0a6cf
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lmv/lmv_obd.c
@@ -0,0 +1,2734 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LMV
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/mm.h>
+#include <asm/div64.h>
+#include <linux/seq_file.h>
+#include <linux/namei.h>
+
+#include <lustre/lustre_idl.h>
+#include <obd_support.h>
+#include <lustre_lib.h>
+#include <lustre_net.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include <lustre_lite.h>
+#include <lustre_fid.h>
+#include "lmv_internal.h"
+
+/* Flip @tgt's active flag and keep ld_active_tgt_count in step. */
+static void lmv_activate_target(struct lmv_obd *lmv,
+				struct lmv_tgt_desc *tgt,
+				int activate)
+{
+	if (tgt->ltd_active != activate) {
+		tgt->ltd_active = activate;
+		lmv->desc.ld_active_tgt_count += (activate ? 1 : -1);
+	}
+}
+
+/**
+ * Mark the target matching @uuid (in)active under lmv_lock. Returns 0 or:
+ *
+ *  -EINVAL  : UUID can't be found in the LMV's target list
+ *  -ENOTCONN: The UUID is found, but the target connection is bad (!)
+ *  (an OBD of the wrong type trips the LASSERT below instead of -EBADF)
+ */
+static int lmv_set_mdc_active(struct lmv_obd *lmv, struct obd_uuid *uuid,
+			      int activate)
+{
+	struct lmv_tgt_desc    *tgt;
+	struct obd_device      *obd;
+	int		     i;
+	int		     rc = 0;
+	ENTRY;
+
+	CDEBUG(D_INFO, "Searching in lmv %p for uuid %s (activate=%d)\n",
+	       lmv, uuid->uuid, activate);
+
+	spin_lock(&lmv->lmv_lock);
+	for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+		tgt = lmv->tgts[i];
+		if (tgt == NULL || tgt->ltd_exp == NULL)
+			continue;
+
+		CDEBUG(D_INFO, "Target idx %d is %s conn "LPX64"\n", i,
+		       tgt->ltd_uuid.uuid, tgt->ltd_exp->exp_handle.h_cookie);
+
+		if (obd_uuid_equals(uuid, &tgt->ltd_uuid))
+			break;
+	}
+	/* loop ran to the end: no target with an export has this uuid */
+	if (i == lmv->desc.ld_tgt_count)
+		GOTO(out_lmv_lock, rc = -EINVAL);
+
+	obd = class_exp2obd(tgt->ltd_exp);
+	if (obd == NULL)
+		GOTO(out_lmv_lock, rc = -ENOTCONN);
+
+	CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LMV idx %d\n",
+	       obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd,
+	       obd->obd_type->typ_name, i);
+	LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0);
+
+	if (tgt->ltd_active == activate) {
+		CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd,
+		       activate ? "" : "in");
+		GOTO(out_lmv_lock, rc);
+	}
+
+	CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd,
+	       activate ? "" : "in");
+	lmv_activate_target(lmv, tgt, activate);
+	EXIT;
+
+ out_lmv_lock:
+	spin_unlock(&lmv->lmv_lock);
+	return rc;
+}
+
+/* Return the UUID of the first target's export; tgts[0] is dereferenced
+ * without a NULL check. */
+struct obd_uuid *lmv_get_uuid(struct obd_export *exp)
+{
+	return obd_get_uuid(exp->exp_obd->u.lmv.tgts[0]->ltd_exp);
+}
+/* MDC event callback: update target state, then pass the event upward. */
+static int lmv_notify(struct obd_device *obd, struct obd_device *watched,
+		      enum obd_notify_event ev, void *data)
+{
+	struct obd_connect_data *conn_data;
+	struct lmv_obd	  *lmv = &obd->u.lmv;
+	struct obd_uuid	 *uuid;
+	int		      rc = 0;
+	ENTRY;
+
+	if (strcmp(watched->obd_type->typ_name, LUSTRE_MDC_NAME)) {
+		CERROR("unexpected notification of %s %s!\n",
+		       watched->obd_type->typ_name,
+		       watched->obd_name);
+		RETURN(-EINVAL);
+	}
+
+	uuid = &watched->u.cli.cl_target_uuid;
+	if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) {
+		/*
+		 * Set MDC as active before notifying the observer, so the
+		 * observer can use the MDC normally.
+		 */
+		rc = lmv_set_mdc_active(lmv, uuid,
+					ev == OBD_NOTIFY_ACTIVE);
+		if (rc) {
+			CERROR("%sactivation of %s failed: %d\n",
+			       ev == OBD_NOTIFY_ACTIVE ? "" : "de",
+			       uuid->uuid, rc);
+			RETURN(rc);
+		}
+	} else if (ev == OBD_NOTIFY_OCD) {
+		conn_data = &watched->u.cli.cl_import->imp_connect_data;
+		/*
+		 * XXX: Make sure that ocd_connect_flags from all targets are
+		 * the same. Otherwise one of MDTs runs wrong version or
+		 * something like this.  --umka
+		 */
+		obd->obd_self_export->exp_connect_data = *conn_data;
+	}
+#if 0
+	else if (ev == OBD_NOTIFY_DISCON) {
+		/*
+		 * For disconnect event, flush fld cache for failout MDS case.
+		 */
+		fld_client_flush(&lmv->lmv_fld);
+	}
+#endif
+	/*
+	 * Pass the notification up the chain.
+	 */
+	if (obd->obd_observer)
+		rc = obd_notify(obd->obd_observer, watched, ev, data);
+
+	RETURN(rc);
+}
+
+/**
+ * Fake connect: set up the LMV export and report success; the real target
+ * connections happen in lmv_check_connect() (at once for OBD_CONNECT_REAL).
+ */
+static int lmv_connect(const struct lu_env *env,
+		       struct obd_export **exp, struct obd_device *obd,
+		       struct obd_uuid *cluuid, struct obd_connect_data *data,
+		       void *localdata)
+{
+	struct proc_dir_entry *lmv_proc_dir;
+	struct lmv_obd	*lmv = &obd->u.lmv;
+	struct lustre_handle  conn = { 0 };
+	int		    rc = 0;
+	ENTRY;
+
+	/*
+	 * We don't want to actually do the underlying connections more than
+	 * once, so keep track.
+	 */
+	lmv->refcount++;
+	if (lmv->refcount > 1) {
+		*exp = NULL;
+		RETURN(0);
+	}
+
+	rc = class_connect(&conn, obd, cluuid);
+	if (rc) {
+		CERROR("class_connection() returned %d\n", rc);
+		RETURN(rc);
+	}
+
+	*exp = class_conn2export(&conn);
+	class_export_get(*exp);
+
+	lmv->exp = *exp;
+	lmv->connected = 0;
+	lmv->cluuid = *cluuid;
+
+	if (data)
+		lmv->conn_data = *data;
+
+	lmv_proc_dir = lprocfs_register("target_obds", obd->obd_proc_entry,
+					NULL, NULL);
+	if (IS_ERR(lmv_proc_dir)) {
+		CERROR("could not register /proc/fs/lustre/%s/%s/target_obds.",
+		       obd->obd_type->typ_name, obd->obd_name);
+		lmv_proc_dir = NULL;
+	}
+
+	/*
+	 * All real clients should perform actual connection right away, because
+	 * it is possible, that LMV will not have opportunity to connect targets
+	 * and MDC stuff will be called directly, for instance while reading
+	 * ../mdc/../kbytesfree procfs file, etc. @data is optional (see the
+	 * copy above), so it is checked before its flags are read.
+	 */
+	if (data && (data->ocd_connect_flags & OBD_CONNECT_REAL))
+		rc = lmv_check_connect(obd);
+
+	if (rc) {
+		if (lmv_proc_dir)
+			lprocfs_remove(&lmv_proc_dir);
+	}
+
+	RETURN(rc);
+}
+
+/* Send KEY_INTERMDS to each active, connected target export. Does nothing
+ * unless server_timeout is set and the LMV is already connected. */
+static void lmv_set_timeouts(struct obd_device *obd)
+{
+	struct lmv_obd	*lmv = &obd->u.lmv;
+	int		 idx;
+
+	if (lmv->server_timeout == 0 || lmv->connected == 0)
+		return;
+
+	for (idx = 0; idx < lmv->desc.ld_tgt_count; idx++) {
+		struct lmv_tgt_desc *tgt = lmv->tgts[idx];
+
+		/* skip empty, unconnected or inactive slots */
+		if (tgt == NULL || tgt->ltd_exp == NULL)
+			continue;
+		if (tgt->ltd_active == 0)
+			continue;
+		obd_set_info_async(NULL, tgt->ltd_exp, sizeof(KEY_INTERMDS),
+				   KEY_INTERMDS, 0, NULL, NULL);
+	}
+}
+
+/* Grow the cached maximum EA, default-EA and cookie sizes. If any of them
+ * grew and the LMV is connected, hand the requested sizes to every usable
+ * target export. Returns 0 or the first md_init_ea_size() failure. */
+static int lmv_init_ea_size(struct obd_export *exp, int easize,
+			    int def_easize, int cookiesize)
+{
+	struct obd_device	*obd = exp->exp_obd;
+	struct lmv_obd		*lmv = &obd->u.lmv;
+	int			 grew = 0;
+	int			 rc = 0;
+	int			 idx;
+	ENTRY;
+
+	if (lmv->max_easize < easize) {
+		lmv->max_easize = easize;
+		grew = 1;
+	}
+	if (lmv->max_def_easize < def_easize) {
+		lmv->max_def_easize = def_easize;
+		grew = 1;
+	}
+	if (lmv->max_cookiesize < cookiesize) {
+		lmv->max_cookiesize = cookiesize;
+		grew = 1;
+	}
+	if (grew == 0 || lmv->connected == 0)
+		RETURN(0);
+
+	for (idx = 0; idx < lmv->desc.ld_tgt_count; idx++) {
+		struct lmv_tgt_desc *tgt = lmv->tgts[idx];
+
+		if (tgt == NULL || tgt->ltd_exp == NULL ||
+		    tgt->ltd_active == 0) {
+			CWARN("%s: NULL export for %d\n", obd->obd_name, idx);
+			continue;
+		}
+		rc = md_init_ea_size(tgt->ltd_exp, easize, def_easize,
+				     cookiesize);
+		if (rc == 0)
+			continue;
+		CERROR("%s: obd_init_ea_size() failed on MDT target %d:"
+		       " rc = %d.\n", obd->obd_name, idx, rc);
+		break;
+	}
+	RETURN(rc);
+}
+
+#define MAX_STRING_SIZE 128
+
+/* Connect to the MDC for @tgt and register it with this LMV (fld target,
+ * observer, procfs symlink). Returns 0 or the error of the failing step. */
+int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt)
+{
+	struct proc_dir_entry   *lmv_proc_dir;
+	struct lmv_obd	  *lmv = &obd->u.lmv;
+	struct obd_uuid	 *cluuid = &lmv->cluuid;
+	struct obd_uuid	  lmv_mdc_uuid = { "LMV_MDC_UUID" };
+	struct obd_device       *mdc_obd;
+	struct obd_export       *mdc_exp;
+	struct lu_fld_target     target;
+	int		      rc;
+	ENTRY;
+
+	mdc_obd = class_find_client_obd(&tgt->ltd_uuid, LUSTRE_MDC_NAME,
+					&obd->obd_uuid);
+	if (!mdc_obd) {
+		CERROR("target %s not attached\n", tgt->ltd_uuid.uuid);
+		RETURN(-EINVAL);
+	}
+
+	CDEBUG(D_CONFIG, "connect to %s(%s) - %s, %s FOR %s\n",
+		mdc_obd->obd_name, mdc_obd->obd_uuid.uuid,
+		tgt->ltd_uuid.uuid, obd->obd_uuid.uuid,
+		cluuid->uuid);
+
+	if (!mdc_obd->obd_set_up) {
+		CERROR("target %s is not set up\n", tgt->ltd_uuid.uuid);
+		RETURN(-EINVAL);
+	}
+
+	rc = obd_connect(NULL, &mdc_exp, mdc_obd, &lmv_mdc_uuid,
+			 &lmv->conn_data, NULL);
+	if (rc) {
+		CERROR("target %s connect error %d\n", tgt->ltd_uuid.uuid, rc);
+		RETURN(rc);
+	}
+
+	/* Init fid sequence client for this mdc and add new fld target. */
+	rc = obd_fid_init(mdc_obd, mdc_exp, LUSTRE_SEQ_METADATA);
+	if (rc) {
+		obd_disconnect(mdc_exp);
+		RETURN(rc);
+	}
+
+	target.ft_srv = NULL;
+	target.ft_exp = mdc_exp;
+	target.ft_idx = tgt->ltd_idx;
+
+	fld_client_add_target(&lmv->lmv_fld, &target);
+
+	rc = obd_register_observer(mdc_obd, obd);
+	if (rc) {
+		obd_disconnect(mdc_exp);
+		CERROR("target %s register_observer error %d\n",
+		       tgt->ltd_uuid.uuid, rc);
+		RETURN(rc);
+	}
+
+	if (obd->obd_observer) {
+		/* Tell the observer about the new target. NOTE(review): tgt
+		 * and tgts[0] are separate allocations; confirm this value. */
+		rc = obd_notify(obd->obd_observer, mdc_exp->exp_obd,
+				OBD_NOTIFY_ACTIVE,
+				(void *)(tgt - lmv->tgts[0]));
+		if (rc) {
+			obd_disconnect(mdc_exp);
+			RETURN(rc);
+		}
+	}
+
+	tgt->ltd_active = 1;
+	tgt->ltd_exp = mdc_exp;
+	lmv->desc.ld_active_tgt_count++;
+
+	md_init_ea_size(tgt->ltd_exp, lmv->max_easize,
+			lmv->max_def_easize, lmv->max_cookiesize);
+
+	CDEBUG(D_CONFIG, "Connected to %s(%s) successfully (%d)\n",
+		mdc_obd->obd_name, mdc_obd->obd_uuid.uuid,
+		atomic_read(&obd->obd_refcount));
+
+	lmv_proc_dir = lprocfs_srch(obd->obd_proc_entry, "target_obds");
+	if (lmv_proc_dir) {
+		struct proc_dir_entry *mdc_symlink;
+
+		LASSERT(mdc_obd->obd_type != NULL);
+		LASSERT(mdc_obd->obd_type->typ_name != NULL);
+		mdc_symlink = lprocfs_add_symlink(mdc_obd->obd_name,
+						  lmv_proc_dir,
+						  "../../../%s/%s",
+						  mdc_obd->obd_type->typ_name,
+						  mdc_obd->obd_name);
+		if (mdc_symlink == NULL) {
+			CERROR("Could not register LMV target "
+			       "/proc/fs/lustre/%s/%s/target_obds/%s.",
+			       obd->obd_type->typ_name, obd->obd_name,
+			       mdc_obd->obd_name);
+			lprocfs_remove(&lmv_proc_dir);
+		}
+	}
+	RETURN(0);
+}
+
+/* Free the target descriptor stored at @index, if there is one, and clear
+ * the slot so it reads as unpopulated afterwards. */
+static void lmv_del_target(struct lmv_obd *lmv, int index)
+{
+	if (lmv->tgts[index] != NULL) {
+		OBD_FREE_PTR(lmv->tgts[index]);
+		lmv->tgts[index] = NULL;
+	}
+}
+/* Add target @uuidp at slot @index, growing tgts[] and connecting as needed. */
+static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
+			   __u32 index, int gen)
+{
+	struct lmv_obd      *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc *tgt;
+	int		  rc = 0;
+	ENTRY;
+
+	CDEBUG(D_CONFIG, "Target uuid: %s. index %d\n", uuidp->uuid, index);
+
+	lmv_init_lock(lmv);
+	/* with no targets recorded yet, an MDC for @uuidp must be attached */
+	if (lmv->desc.ld_tgt_count == 0) {
+		struct obd_device *mdc_obd;
+
+		mdc_obd = class_find_client_obd(uuidp, LUSTRE_MDC_NAME,
+						&obd->obd_uuid);
+		if (!mdc_obd) {
+			lmv_init_unlock(lmv);
+			CERROR("%s: Target %s not attached: rc = %d\n",
+			       obd->obd_name, uuidp->uuid, -EINVAL);
+			RETURN(-EINVAL);
+		}
+	}
+
+	if ((index < lmv->tgts_size) && (lmv->tgts[index] != NULL)) {
+		tgt = lmv->tgts[index];
+		CERROR("%s: UUID %s already assigned at LOV target index %d:"
+		       " rc = %d\n", obd->obd_name,
+		       obd_uuid2str(&tgt->ltd_uuid), index, -EEXIST);
+		lmv_init_unlock(lmv);
+		RETURN(-EEXIST);
+	}
+
+	if (index >= lmv->tgts_size) {
+		/* We need to reallocate the lmv target array. */
+		struct lmv_tgt_desc **newtgts, **old = NULL;
+		__u32 newsize = 1;
+		__u32 oldsize = 0;
+
+		while (newsize < index + 1)
+			newsize = newsize << 1;
+		OBD_ALLOC(newtgts, sizeof(*newtgts) * newsize);
+		if (newtgts == NULL) {
+			lmv_init_unlock(lmv);
+			RETURN(-ENOMEM);
+		}
+
+		if (lmv->tgts_size) {
+			memcpy(newtgts, lmv->tgts,
+			       sizeof(*newtgts) * lmv->tgts_size);
+			old = lmv->tgts;
+			oldsize = lmv->tgts_size;
+		}
+
+		lmv->tgts = newtgts;
+		lmv->tgts_size = newsize;
+		smp_rmb();	/* NOTE(review): a write barrier may be meant */
+		if (old)
+			OBD_FREE(old, sizeof(*old) * oldsize);
+
+		CDEBUG(D_CONFIG, "tgts: %p size: %d\n", lmv->tgts,
+		       lmv->tgts_size);
+	}
+
+	OBD_ALLOC_PTR(tgt);
+	if (!tgt) {
+		lmv_init_unlock(lmv);
+		RETURN(-ENOMEM);
+	}
+
+	mutex_init(&tgt->ltd_fid_mutex);
+	tgt->ltd_idx = index;
+	tgt->ltd_uuid = *uuidp;
+	tgt->ltd_active = 0;
+	lmv->tgts[index] = tgt;
+	if (index >= lmv->desc.ld_tgt_count)
+		lmv->desc.ld_tgt_count = index + 1;
+	/* NOTE(review): on failure the zeroed tgt stays in tgts[index]; confirm */
+	if (lmv->connected) {
+		rc = lmv_connect_mdc(obd, tgt);
+		if (rc) {
+			spin_lock(&lmv->lmv_lock);
+			lmv->desc.ld_tgt_count--;
+			memset(tgt, 0, sizeof(*tgt));
+			spin_unlock(&lmv->lmv_lock);
+		} else {
+			int easize = sizeof(struct lmv_stripe_md) +
+				     lmv->desc.ld_tgt_count *
+				     sizeof(struct lu_fid);
+			lmv_init_ea_size(obd->obd_self_export, easize, 0, 0);
+		}
+	}
+
+	lmv_init_unlock(lmv);
+	RETURN(rc);
+}
+
+/*
+ * Connect the LMV to every registered target the first time it is needed.
+ *
+ * lmv->connected is tested once without the init lock and once more with
+ * it held.  On a connect failure the targets at lower indices are
+ * disconnected again, the LMV export is disconnected and the error is
+ * returned.
+ *
+ * \retval 0		already connected, or all targets connected
+ * \retval -EINVAL	no targets configured
+ * \retval other	error returned by lmv_connect_mdc()
+ */
+int lmv_check_connect(struct obd_device *obd)
+{
+	struct lmv_obd       *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc  *mdc;
+	int		   easize;
+	int		   idx;
+	int		   rc;
+	ENTRY;
+
+	if (lmv->connected)
+		RETURN(0);
+
+	lmv_init_lock(lmv);
+	if (lmv->connected) {
+		lmv_init_unlock(lmv);
+		RETURN(0);
+	}
+
+	if (lmv->desc.ld_tgt_count == 0) {
+		lmv_init_unlock(lmv);
+		CERROR("%s: no targets configured.\n", obd->obd_name);
+		RETURN(-EINVAL);
+	}
+
+	CDEBUG(D_CONFIG, "Time to connect %s to %s\n",
+	       lmv->cluuid.uuid, obd->obd_name);
+
+	LASSERT(lmv->tgts != NULL);
+
+	for (idx = 0; idx < lmv->desc.ld_tgt_count; idx++) {
+		mdc = lmv->tgts[idx];
+		if (mdc != NULL) {
+			rc = lmv_connect_mdc(obd, mdc);
+			if (rc)
+				GOTO(out_disc, rc);
+		}
+	}
+
+	lmv_set_timeouts(obd);
+	class_export_put(lmv->exp);
+	lmv->connected = 1;
+	easize = lmv_get_easize(lmv);
+	lmv_init_ea_size(obd->obd_self_export, easize, 0, 0);
+	lmv_init_unlock(lmv);
+	RETURN(0);
+
+ out_disc:
+	/* Walk back over the slots below the one that failed. */
+	for (idx--; idx >= 0; idx--) {
+		int err;
+
+		mdc = lmv->tgts[idx];
+		if (mdc == NULL)
+			continue;
+
+		mdc->ltd_active = 0;
+		if (mdc->ltd_exp == NULL)
+			continue;
+
+		--lmv->desc.ld_active_tgt_count;
+		err = obd_disconnect(mdc->ltd_exp);
+		if (err) {
+			CERROR("LMV target %s disconnect on "
+			       "MDC idx %d: error %d\n",
+			       mdc->ltd_uuid.uuid, idx, err);
+		}
+	}
+	class_disconnect(lmv->exp);
+	lmv_init_unlock(lmv);
+	RETURN(rc);
+}
+
+/*
+ * Disconnect the LMV from one MDC target.
+ *
+ * Copies the force/fail/no_recov flags of the LMV device to the MDC device,
+ * removes the target's entry under "target_obds" in procfs, shuts down the
+ * FID client of the target, unregisters the observer, disconnects the
+ * export and finally marks the target inactive and clears tgt->ltd_exp.
+ *
+ * Errors of the individual steps are only logged.
+ *
+ * \retval 0 always
+ */
+static int lmv_disconnect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt)
+{
+	struct proc_dir_entry  *lmv_proc_dir;
+	struct lmv_obd	 *lmv = &obd->u.lmv;
+	struct obd_device      *mdc_obd;
+	int		     rc;
+	ENTRY;
+
+	LASSERT(tgt != NULL);
+	LASSERT(obd != NULL);
+
+	mdc_obd = class_exp2obd(tgt->ltd_exp);
+
+	if (mdc_obd) {
+		mdc_obd->obd_force = obd->obd_force;
+		mdc_obd->obd_fail = obd->obd_fail;
+		mdc_obd->obd_no_recov = obd->obd_no_recov;
+	}
+
+	lmv_proc_dir = lprocfs_srch(obd->obd_proc_entry, "target_obds");
+	/*
+	 * mdc_obd is treated as possibly NULL just above, so do not
+	 * dereference it here without the same check.
+	 */
+	if (lmv_proc_dir && mdc_obd) {
+		struct proc_dir_entry *mdc_symlink;
+
+		mdc_symlink = lprocfs_srch(lmv_proc_dir, mdc_obd->obd_name);
+		if (mdc_symlink) {
+			lprocfs_remove(&mdc_symlink);
+		} else {
+			CERROR("/proc/fs/lustre/%s/%s/target_obds/%s missing\n",
+			       obd->obd_type->typ_name, obd->obd_name,
+			       mdc_obd->obd_name);
+		}
+	}
+	rc = obd_fid_fini(tgt->ltd_exp->exp_obd);
+	if (rc)
+		CERROR("Can't finanize fids factory\n");
+
+	/* This message is emitted before obd_disconnect() is called below. */
+	CDEBUG(D_INFO, "Disconnected from %s(%s) successfully\n",
+	       tgt->ltd_exp->exp_obd->obd_name,
+	       tgt->ltd_exp->exp_obd->obd_uuid.uuid);
+
+	obd_register_observer(tgt->ltd_exp->exp_obd, NULL);
+	rc = obd_disconnect(tgt->ltd_exp);
+	if (rc) {
+		/* A failure on an inactive target is not reported. */
+		if (tgt->ltd_active) {
+			CERROR("Target %s disconnect error %d\n",
+			       tgt->ltd_uuid.uuid, rc);
+		}
+	}
+
+	lmv_activate_target(lmv, tgt, 0);
+	tgt->ltd_exp = NULL;
+	RETURN(0);
+}
+
+/*
+ * Drop one connection to the LMV.
+ *
+ * The targets themselves are disconnected, and the "target_obds" procfs
+ * directory removed, only when the reference count reaches zero.  The
+ * export @exp is disconnected in every case.
+ *
+ * \retval result of class_disconnect()
+ */
+static int lmv_disconnect(struct obd_export *exp)
+{
+	struct obd_device     *obd = class_exp2obd(exp);
+	struct lmv_obd	*lmv = &obd->u.lmv;
+	struct proc_dir_entry *targets_dir;
+	int		    idx;
+	int		    rc;
+	ENTRY;
+
+	if (lmv->tgts == NULL)
+		goto out_local;
+
+	/* Lower layers go away with the last reference only. */
+	if (--lmv->refcount != 0)
+		goto out_local;
+
+	for (idx = 0; idx < lmv->desc.ld_tgt_count; idx++) {
+		struct lmv_tgt_desc *tgt = lmv->tgts[idx];
+
+		if (tgt != NULL && tgt->ltd_exp != NULL)
+			lmv_disconnect_mdc(obd, tgt);
+	}
+
+	targets_dir = lprocfs_srch(obd->obd_proc_entry, "target_obds");
+	if (targets_dir == NULL) {
+		CERROR("/proc/fs/lustre/%s/%s/target_obds missing\n",
+		       obd->obd_type->typ_name, obd->obd_name);
+	} else {
+		lprocfs_remove(&targets_dir);
+	}
+
+out_local:
+	/*
+	 * When lmv_check_connect() never established a real connection, an
+	 * export reference is put here instead.
+	 */
+	if (!lmv->connected)
+		class_export_put(exp);
+	rc = class_disconnect(exp);
+	if (lmv->refcount == 0)
+		lmv->connected = 0;
+	RETURN(rc);
+}
+
+/*
+ * Resolve a FID to a path, following the path across MDTs when needed.
+ *
+ * The request in @karg (a struct getinfo_fid2path) is sent to the target
+ * owning gf_fid.  While a target answers -EREMOTE, the FID it left in the
+ * reply is looked up again on its own target using a scratch request, and
+ * the path segment obtained there is prepended, followed by '/', to the
+ * path already collected in @karg.
+ *
+ * \retval 0		path complete
+ * \retval -EOVERFLOW	combined path does not fit in the caller's buffer
+ * \retval -ENOMEM	scratch request could not be allocated
+ * \retval -EINVAL	FID returned by a target is not sane or has no target
+ * \retval other	error from lmv_find_target() or obd_iocontrol()
+ */
+static int lmv_fid2path(struct obd_export *exp, int len, void *karg, void *uarg)
+{
+	struct obd_device	*obddev = class_exp2obd(exp);
+	struct lmv_obd		*lmv = &obddev->u.lmv;
+	struct getinfo_fid2path *gf;
+	struct lmv_tgt_desc     *tgt;
+	struct getinfo_fid2path *remote_gf = NULL;
+	int			remote_gf_size = 0;
+	int			rc;
+	ENTRY;
+
+	gf = (struct getinfo_fid2path *)karg;
+	tgt = lmv_find_target(lmv, &gf->gf_fid);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+repeat_fid2path:
+	rc = obd_iocontrol(OBD_IOC_FID2PATH, tgt->ltd_exp, len, gf, uarg);
+	if (rc != 0 && rc != -EREMOTE)
+		GOTO(out_fid2path, rc);
+
+	/* If remote_gf != NULL, it means just building the
+	 * path on the remote MDT, copy this path segment to gf */
+	if (remote_gf != NULL) {
+		struct getinfo_fid2path *ori_gf;
+		size_t ori_len;
+		size_t seg_len;
+		char *ptr;
+
+		ori_gf = (struct getinfo_fid2path *)karg;
+		ori_len = strlen(ori_gf->gf_path);
+		seg_len = strlen(gf->gf_path);
+
+		/* Result is "<segment>/<original>" plus the trailing NUL. */
+		if (seg_len + 1 + ori_len + 1 > ori_gf->gf_pathlen)
+			GOTO(out_fid2path, rc = -EOVERFLOW);
+
+		ptr = ori_gf->gf_path;
+
+		/* Shift the collected path, terminator included. */
+		memmove(ptr + seg_len + 1, ptr, ori_len + 1);
+
+		memcpy(ptr, gf->gf_path, seg_len);
+		ptr[seg_len] = '/';
+	}
+
+	CDEBUG(D_INFO, "%s: get path %s "DFID" rec: "LPU64" ln: %u\n",
+	       tgt->ltd_exp->exp_obd->obd_name,
+	       gf->gf_path, PFID(&gf->gf_fid), gf->gf_recno,
+	       gf->gf_linkno);
+
+	if (rc == 0)
+		GOTO(out_fid2path, rc);
+
+	/* sigh, has to go to another MDT to do path building further */
+	if (remote_gf == NULL) {
+		remote_gf_size = sizeof(*remote_gf) + PATH_MAX;
+		OBD_ALLOC(remote_gf, remote_gf_size);
+		if (remote_gf == NULL)
+			GOTO(out_fid2path, rc = -ENOMEM);
+		remote_gf->gf_pathlen = PATH_MAX;
+	}
+
+	if (!fid_is_sane(&gf->gf_fid)) {
+		CERROR("%s: invalid FID "DFID": rc = %d\n",
+		       tgt->ltd_exp->exp_obd->obd_name,
+		       PFID(&gf->gf_fid), -EINVAL);
+		GOTO(out_fid2path, rc = -EINVAL);
+	}
+
+	tgt = lmv_find_target(lmv, &gf->gf_fid);
+	if (IS_ERR(tgt))
+		GOTO(out_fid2path, rc = -EINVAL);
+
+	/* Restart the lookup on the next MDT with a clean scratch request. */
+	remote_gf->gf_fid = gf->gf_fid;
+	remote_gf->gf_recno = -1;
+	remote_gf->gf_linkno = -1;
+	memset(remote_gf->gf_path, 0, remote_gf->gf_pathlen);
+	gf = remote_gf;
+	goto repeat_fid2path;
+
+out_fid2path:
+	if (remote_gf != NULL)
+		OBD_FREE(remote_gf, remote_gf_size);
+	RETURN(rc);
+}
+
+/*
+ * ioctl entry point of the LMV.
+ *
+ * Commands that address a single MDT (statfs, quotactl, changelog, connect
+ * flags, fid2path, HSM and layout swap) are routed to that target.  Any
+ * other command is forwarded to every connected target.
+ *
+ * \param cmd	ioctl command
+ * \param exp	LMV export
+ * \param len	length passed through to the targets
+ * \param karg	kernel copy of the argument; its type depends on @cmd
+ * \param uarg	user pointer passed through to the targets
+ *
+ * \retval 0		success
+ * \retval -ENOTTY	no targets are configured
+ * \retval other	negative errno; for broadcast commands, the first
+ *			error seen on an active target, or -EIO when no
+ *			target succeeded and none reported an error
+ */
+static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp,
+			 int len, void *karg, void *uarg)
+{
+	struct obd_device    *obddev = class_exp2obd(exp);
+	struct lmv_obd       *lmv = &obddev->u.lmv;
+	int		   i = 0;
+	int		   rc = 0;
+	int		   set = 0;
+	int		   count = lmv->desc.ld_tgt_count;
+	ENTRY;
+
+	if (count == 0)
+		RETURN(-ENOTTY);
+
+	switch (cmd) {
+	case IOC_OBD_STATFS: {
+		struct obd_ioctl_data *data = karg;
+		struct obd_device *mdc_obd;
+		struct obd_statfs stat_buf = {0};
+		size_t copy_len;
+		__u32 index;
+
+		memcpy(&index, data->ioc_inlbuf2, sizeof(__u32));
+		if ((index >= count))
+			RETURN(-ENODEV);
+
+		if (lmv->tgts[index] == NULL ||
+		    lmv->tgts[index]->ltd_active == 0)
+			RETURN(-ENODATA);
+
+		mdc_obd = class_exp2obd(lmv->tgts[index]->ltd_exp);
+		if (!mdc_obd)
+			RETURN(-EINVAL);
+
+		/*
+		 * copy UUID; the user-supplied length is clamped as an
+		 * unsigned quantity so that a huge value cannot turn
+		 * negative and slip past the limit.
+		 */
+		copy_len = data->ioc_plen2;
+		if (copy_len > sizeof(struct obd_uuid))
+			copy_len = sizeof(struct obd_uuid);
+		if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(mdc_obd),
+				 copy_len))
+			RETURN(-EFAULT);
+
+		rc = obd_statfs(NULL, lmv->tgts[index]->ltd_exp, &stat_buf,
+				cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+				0);
+		if (rc)
+			RETURN(rc);
+
+		copy_len = data->ioc_plen1;
+		if (copy_len > sizeof(stat_buf))
+			copy_len = sizeof(stat_buf);
+		if (copy_to_user(data->ioc_pbuf1, &stat_buf, copy_len))
+			RETURN(-EFAULT);
+		break;
+	}
+	case OBD_IOC_QUOTACTL: {
+		struct if_quotactl *qctl = karg;
+		struct lmv_tgt_desc *tgt = NULL;
+		struct obd_quotactl *oqctl;
+
+		if (qctl->qc_valid == QC_MDTIDX) {
+			if (qctl->qc_idx < 0 || count <= qctl->qc_idx)
+				RETURN(-EINVAL);
+
+			tgt = lmv->tgts[qctl->qc_idx];
+			if (tgt == NULL || tgt->ltd_exp == NULL)
+				RETURN(-EINVAL);
+		} else if (qctl->qc_valid == QC_UUID) {
+			/* Look the target up by UUID. */
+			for (i = 0; i < count; i++) {
+				tgt = lmv->tgts[i];
+				if (tgt == NULL)
+					continue;
+				if (!obd_uuid_equals(&tgt->ltd_uuid,
+						     &qctl->obd_uuid))
+					continue;
+
+				if (tgt->ltd_exp == NULL)
+					RETURN(-EINVAL);
+
+				break;
+			}
+		} else {
+			RETURN(-EINVAL);
+		}
+
+		/* Only reachable with i >= count when no UUID matched. */
+		if (i >= count)
+			RETURN(-EAGAIN);
+
+		LASSERT(tgt && tgt->ltd_exp);
+		OBD_ALLOC_PTR(oqctl);
+		if (!oqctl)
+			RETURN(-ENOMEM);
+
+		QCTL_COPY(oqctl, qctl);
+		rc = obd_quotactl(tgt->ltd_exp, oqctl);
+		if (rc == 0) {
+			QCTL_COPY(qctl, oqctl);
+			qctl->qc_valid = QC_MDTIDX;
+			qctl->obd_uuid = tgt->ltd_uuid;
+		}
+		OBD_FREE_PTR(oqctl);
+		break;
+	}
+	case OBD_IOC_CHANGELOG_SEND:
+	case OBD_IOC_CHANGELOG_CLEAR: {
+		struct ioc_changelog *icc = karg;
+
+		if (icc->icc_mdtindex >= count)
+			RETURN(-ENODEV);
+
+		if (lmv->tgts[icc->icc_mdtindex] == NULL ||
+		    lmv->tgts[icc->icc_mdtindex]->ltd_exp == NULL ||
+		    lmv->tgts[icc->icc_mdtindex]->ltd_active == 0)
+			RETURN(-ENODEV);
+		rc = obd_iocontrol(cmd, lmv->tgts[icc->icc_mdtindex]->ltd_exp,
+				   sizeof(*icc), icc, NULL);
+		break;
+	}
+	case LL_IOC_GET_CONNECT_FLAGS: {
+		/* Always answered by the target in slot 0. */
+		if (lmv->tgts[0] == NULL)
+			RETURN(-ENODATA);
+		rc = obd_iocontrol(cmd, lmv->tgts[0]->ltd_exp, len, karg, uarg);
+		break;
+	}
+	case OBD_IOC_FID2PATH: {
+		rc = lmv_fid2path(exp, len, karg, uarg);
+		break;
+	}
+	case LL_IOC_HSM_STATE_GET:
+	case LL_IOC_HSM_STATE_SET:
+	case LL_IOC_HSM_ACTION:
+	case LL_IOC_LOV_SWAP_LAYOUTS: {
+		struct md_op_data	*op_data = karg;
+		struct lmv_tgt_desc	*tgt1, *tgt2;
+
+		tgt1 = lmv_find_target(lmv, &op_data->op_fid1);
+		if (IS_ERR(tgt1))
+			RETURN(PTR_ERR(tgt1));
+
+		tgt2 = lmv_find_target(lmv, &op_data->op_fid2);
+		if (IS_ERR(tgt2))
+			RETURN(PTR_ERR(tgt2));
+
+		if ((tgt1->ltd_exp == NULL) || (tgt2->ltd_exp == NULL))
+			RETURN(-EINVAL);
+
+		/* only files on same MDT can have their layouts swapped */
+		if (tgt1->ltd_idx != tgt2->ltd_idx)
+			RETURN(-EPERM);
+
+		rc = obd_iocontrol(cmd, tgt1->ltd_exp, len, karg, uarg);
+		break;
+	}
+	default:
+		for (i = 0; i < count; i++) {
+			struct obd_device *mdc_obd;
+			int err;
+
+			if (lmv->tgts[i] == NULL ||
+			    lmv->tgts[i]->ltd_exp == NULL)
+				continue;
+			/* ll_umount_begin() sets force flag but for lmv, not
+			 * mdc. Let's pass it through */
+			mdc_obd = class_exp2obd(lmv->tgts[i]->ltd_exp);
+			mdc_obd->obd_force = obddev->obd_force;
+			err = obd_iocontrol(cmd, lmv->tgts[i]->ltd_exp, len,
+					    karg, uarg);
+			if (err == -ENODATA && cmd == OBD_IOC_POLL_QUOTACHECK) {
+				RETURN(err);
+			} else if (err) {
+				/* Errors of inactive targets are ignored. */
+				if (lmv->tgts[i]->ltd_active) {
+					CERROR("error: iocontrol MDC %s on MDT "
+					       "idx %d cmd %x: err = %d\n",
+						lmv->tgts[i]->ltd_uuid.uuid,
+						i, cmd, err);
+					if (!rc)
+						rc = err;
+				}
+			} else
+				set = 1;
+		}
+		if (!set && !rc)
+			rc = -EIO;
+	}
+	RETURN(rc);
+}
+
+/*
+ * Compiled-out placement helpers (name hash and NID based).
+ * NOTE(review): lmv_nid_policy() below still reads lmv->tgts[0].ltd_exp,
+ * whereas the live code in this file treats lmv->tgts[] entries as pointers
+ * (lmv->tgts[0]->ltd_exp), so this block presumably no longer builds if it
+ * is re-enabled.
+ */
+#if 0
+/* Sum the @len characters of @name and reduce the sum modulo @count. */
+static int lmv_all_chars_policy(int count, const char *name,
+				int len)
+{
+	unsigned int c = 0;
+
+	while (len > 0)
+		c += name[--len];
+	c = c % count;
+	return c;
+}
+
+/*
+ * Fold the c_self value of the import connection of target 0 into 32 bits
+ * and reduce it modulo the target count.
+ */
+static int lmv_nid_policy(struct lmv_obd *lmv)
+{
+	struct obd_import *imp;
+	__u32	      id;
+
+	/*
+	 * XXX: To get nid we assume that underlying obd device is mdc.
+	 */
+	imp = class_exp2cliimp(lmv->tgts[0].ltd_exp);
+	id = imp->imp_connection->c_self ^ (imp->imp_connection->c_self >> 32);
+	return id % lmv->desc.ld_tgt_count;
+}
+
+/*
+ * Dispatch on @placement to one of the two policies above; any other value
+ * is logged and rejected with -EINVAL.
+ */
+static int lmv_choose_mds(struct lmv_obd *lmv, struct md_op_data *op_data,
+			  placement_policy_t placement)
+{
+	switch (placement) {
+	case PLACEMENT_CHAR_POLICY:
+		return lmv_all_chars_policy(lmv->desc.ld_tgt_count,
+					    op_data->op_name,
+					    op_data->op_namelen);
+	case PLACEMENT_NID_POLICY:
+		return lmv_nid_policy(lmv);
+
+	default:
+		break;
+	}
+
+	CERROR("Unsupported placement policy %x\n", placement);
+	return -EINVAL;
+}
+#endif
+
+/**
+ * Pick the MDT on which a new inode is to be placed (this is an _inode_
+ * placement policy, not a name placement policy).
+ *
+ * With a single target the answer is always 0.  Otherwise an explicit
+ * stripe offset supplied with CLI_SET_MEA (setdirstripe -i xx) wins, and
+ * failing that the MDT recorded in op_data->op_mds is used.
+ *
+ * \param[out] mds	chosen MDT index
+ *
+ * \retval 0		*mds has been set
+ * \retval -ERANGE	requested stripe offset is not below the MDT count
+ */
+static int lmv_placement_policy(struct obd_device *obd,
+				struct md_op_data *op_data,
+				mdsno_t *mds)
+{
+	struct lmv_obd	  *lmv = &obd->u.lmv;
+	struct lmv_user_md *lum;
+	ENTRY;
+
+	LASSERT(mds != NULL);
+
+	if (lmv->desc.ld_tgt_count == 1) {
+		*mds = 0;
+		RETURN(0);
+	}
+
+	if (!(op_data->op_cli_flags & CLI_SET_MEA))
+		goto use_parent_mds;
+
+	lum = (struct lmv_user_md *)op_data->op_data;
+	if (lum->lum_type != LMV_STRIPE_TYPE || lum->lum_stripe_offset == -1)
+		goto use_parent_mds;
+
+	/* The caller asked for a specific MDT. */
+	if (lum->lum_stripe_offset >= lmv->desc.ld_tgt_count) {
+		CERROR("%s: Stripe_offset %d > MDT count %d:"
+		       " rc = %d\n", obd->obd_name,
+		       lum->lum_stripe_offset,
+		       lmv->desc.ld_tgt_count, -ERANGE);
+		RETURN(-ERANGE);
+	}
+	*mds = lum->lum_stripe_offset;
+	RETURN(0);
+
+use_parent_mds:
+	/* Fall back to the MDT already recorded in the operation data. */
+	*mds = op_data->op_mds;
+	RETURN(0);
+}
+
+/*
+ * Allocate a new FID on MDT @mds.
+ *
+ * The target's ltd_fid_mutex is held across the allocation.  A positive
+ * return of obd_fid_alloc() is folded into 0.
+ *
+ * \retval 0		@fid holds a new FID
+ * \retval -ENODEV	target is inactive or has no export
+ * \retval other	error from lmv_get_target() or obd_fid_alloc()
+ */
+int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid,
+		    mdsno_t mds)
+{
+	struct lmv_tgt_desc	*mdt;
+	int			 rc;
+	ENTRY;
+
+	mdt = lmv_get_target(lmv, mds);
+	if (IS_ERR(mdt))
+		RETURN(PTR_ERR(mdt));
+
+	/*
+	 * Sequence allocation and FLD setup have to look atomic; otherwise
+	 * the server may be handed a FID whose sequence it does not know yet.
+	 */
+	mutex_lock(&mdt->ltd_fid_mutex);
+
+	if (!mdt->ltd_active || !mdt->ltd_exp)
+		GOTO(out, rc = -ENODEV);
+
+	/* Let the layer below hand out the FID. */
+	rc = obd_fid_alloc(mdt->ltd_exp, fid, NULL);
+	if (rc > 0) {
+		LASSERT(fid_is_sane(fid));
+		rc = 0;
+	}
+
+	EXIT;
+out:
+	mutex_unlock(&mdt->ltd_fid_mutex);
+	return rc;
+}
+
+/*
+ * Allocate a FID for a new object: ask lmv_placement_policy() for the MDT
+ * and then allocate the FID there through __lmv_fid_alloc().
+ *
+ * \retval 0 on success, otherwise the error of the failing step
+ */
+int lmv_fid_alloc(struct obd_export *exp, struct lu_fid *fid,
+		  struct md_op_data *op_data)
+{
+	struct obd_device     *obd = class_exp2obd(exp);
+	mdsno_t		mds = 0;
+	int		    rc;
+	ENTRY;
+
+	LASSERT(op_data != NULL);
+	LASSERT(fid != NULL);
+
+	rc = lmv_placement_policy(obd, op_data, &mds);
+	if (rc) {
+		CERROR("Can't get target for allocating fid, "
+		       "rc %d\n", rc);
+		RETURN(rc);
+	}
+
+	rc = __lmv_fid_alloc(&obd->u.lmv, fid, mds);
+	if (rc)
+		CERROR("Can't alloc new fid, rc %d\n", rc);
+
+	RETURN(rc);
+}
+
+/*
+ * Set up an LMV device from its configuration record.
+ *
+ * Buffer 1 of @lcfg must carry a struct lmv_desc.  The function allocates
+ * the initial target table (32 slots), resets the descriptor counters,
+ * initialises the locks, registers the procfs entries and initialises the
+ * FLD client.
+ *
+ * \retval 0		success
+ * \retval -EINVAL	descriptor missing or too short
+ * \retval -ENOMEM	target table could not be allocated
+ * \retval other	error from fld_client_init()
+ */
+static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	struct lmv_obd	     *lmv = &obd->u.lmv;
+	struct lprocfs_static_vars  lvars;
+	struct lmv_desc	    *desc;
+	int			 rc;
+	ENTRY;
+
+	if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
+		CERROR("LMV setup requires a descriptor\n");
+		RETURN(-EINVAL);
+	}
+
+	desc = (struct lmv_desc *)lustre_cfg_buf(lcfg, 1);
+	if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) {
+		CERROR("Lmv descriptor size wrong: %d > %d\n",
+		       (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1));
+		RETURN(-EINVAL);
+	}
+
+	OBD_ALLOC(lmv->tgts, sizeof(*lmv->tgts) * 32);
+	if (lmv->tgts == NULL)
+		RETURN(-ENOMEM);
+	lmv->tgts_size = 32;
+
+	obd_str2uuid(&lmv->desc.ld_uuid, desc->ld_uuid.uuid);
+	lmv->desc.ld_tgt_count = 0;
+	lmv->desc.ld_active_tgt_count = 0;
+	lmv->max_cookiesize = 0;
+	lmv->max_def_easize = 0;
+	lmv->max_easize = 0;
+	lmv->lmv_placement = PLACEMENT_CHAR_POLICY;
+
+	spin_lock_init(&lmv->lmv_lock);
+	mutex_init(&lmv->init_mutex);
+
+	lprocfs_lmv_init_vars(&lvars);
+
+	lprocfs_obd_setup(obd, lvars.obd_vars);
+#ifdef LPROCFS
+	{
+		/* A missing target_obd file is not fatal, only reported. */
+		rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd",
+					0444, &lmv_proc_target_fops, obd);
+		if (rc)
+			CWARN("%s: error adding LMV target_obd file: rc = %d\n",
+			       obd->obd_name, rc);
+	}
+#endif
+	rc = fld_client_init(&lmv->lmv_fld, obd->obd_name,
+			     LUSTRE_CLI_FLD_HASH_DHT);
+	if (rc) {
+		CERROR("Can't init FLD, err %d\n", rc);
+		GOTO(out_free_tgts, rc);
+	}
+
+	RETURN(0);
+
+out_free_tgts:
+	/*
+	 * Do not leak the target table allocated above.
+	 * NOTE(review): the procfs entries registered by lprocfs_obd_setup()
+	 * are still left in place on this path; confirm whether the caller
+	 * tears them down after a failed setup.
+	 */
+	OBD_FREE(lmv->tgts, sizeof(*lmv->tgts) * lmv->tgts_size);
+	lmv->tgts = NULL;
+	lmv->tgts_size = 0;
+	return rc;
+}
+
+/*
+ * Tear down an LMV device: finalise the FLD client, release every target
+ * descriptor below ld_tgt_count and free the target table itself.
+ *
+ * \retval 0 always
+ */
+static int lmv_cleanup(struct obd_device *obd)
+{
+	struct lmv_obd   *lmv = &obd->u.lmv;
+	int		 slot;
+	ENTRY;
+
+	fld_client_fini(&lmv->lmv_fld);
+	if (lmv->tgts == NULL)
+		RETURN(0);
+
+	/* lmv_del_target() skips slots that are already empty. */
+	for (slot = 0; slot < lmv->desc.ld_tgt_count; slot++)
+		lmv_del_target(lmv, slot);
+
+	OBD_FREE(lmv->tgts, sizeof(*lmv->tgts) * lmv->tgts_size);
+	lmv->tgts_size = 0;
+	RETURN(0);
+}
+
+/*
+ * Handle a configuration record addressed to the LMV.
+ *
+ * Only LCFG_ADD_MDC is understood: buffer 1 carries the target UUID,
+ * buffer 2 the target index and buffer 3 the generation, both as decimal
+ * strings.  The target is then registered with lmv_add_target().
+ *
+ * \retval 0		target added
+ * \retval -EINVAL	unknown command, or a buffer is missing, too long or
+ *			not a number
+ * \retval other	error from lmv_add_target()
+ */
+static int lmv_process_config(struct obd_device *obd, obd_count len, void *buf)
+{
+	struct lustre_cfg	*lcfg = buf;
+	struct obd_uuid		obd_uuid;
+	int			gen;
+	__u32			index;
+	int			rc;
+	ENTRY;
+
+	switch (lcfg->lcfg_command) {
+	case LCFG_ADD_MDC:
+		/* modify_mdc_tgts add 0:lustre-clilmv  1:lustre-MDT0000_UUID
+		 * 2:0  3:1  4:lustre-MDT0000-mdc_UUID */
+		if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 ||
+		    LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid))
+			GOTO(out, rc = -EINVAL);
+
+		/*
+		 * Make sure the index and generation buffers exist before
+		 * handing them to sscanf().
+		 * NOTE(review): assumes lustre_cfg_buf() yields no usable
+		 * string for an absent/empty buffer - confirm.
+		 */
+		if (LUSTRE_CFG_BUFLEN(lcfg, 2) < 1 ||
+		    LUSTRE_CFG_BUFLEN(lcfg, 3) < 1)
+			GOTO(out, rc = -EINVAL);
+
+		obd_str2uuid(&obd_uuid,  lustre_cfg_buf(lcfg, 1));
+
+		/* index is a __u32, so it must be scanned as unsigned. */
+		if (sscanf(lustre_cfg_buf(lcfg, 2), "%u", &index) != 1)
+			GOTO(out, rc = -EINVAL);
+		if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", &gen) != 1)
+			GOTO(out, rc = -EINVAL);
+		rc = lmv_add_target(obd, &obd_uuid, index, gen);
+		GOTO(out, rc);
+	default:
+		CERROR("Unknown command: %d\n", lcfg->lcfg_command);
+		GOTO(out, rc = -EINVAL);
+	}
+out:
+	RETURN(rc);
+}
+
+/*
+ * Collect filesystem statistics from the connected MDTs into @osfs.
+ *
+ * The reply of the target in slot 0 is copied as a whole; for every other
+ * target only os_bavail, os_blocks, os_ffree and os_files are added on top.
+ * With OBD_STATFS_FOR_MDT0 in @flags the function stops right after slot 0.
+ *
+ * \retval 0		success
+ * \retval -ENOMEM	scratch buffer could not be allocated
+ * \retval other	error from lmv_check_connect() or obd_statfs()
+ */
+static int lmv_statfs(const struct lu_env *env, struct obd_export *exp,
+		      struct obd_statfs *osfs, __u64 max_age, __u32 flags)
+{
+	struct obd_device     *obd = class_exp2obd(exp);
+	struct lmv_obd	*lmv = &obd->u.lmv;
+	struct obd_statfs     *temp;
+	int		    idx;
+	int		    rc;
+	ENTRY;
+
+	rc = lmv_check_connect(obd);
+	if (rc)
+		RETURN(rc);
+
+	OBD_ALLOC(temp, sizeof(*temp));
+	if (temp == NULL)
+		RETURN(-ENOMEM);
+
+	for (idx = 0; idx < lmv->desc.ld_tgt_count; idx++) {
+		struct lmv_tgt_desc *tgt = lmv->tgts[idx];
+
+		if (tgt == NULL || tgt->ltd_exp == NULL)
+			continue;
+
+		rc = obd_statfs(env, tgt->ltd_exp, temp, max_age, flags);
+		if (rc) {
+			CERROR("can't stat MDS #%d (%s), error %d\n", idx,
+			       tgt->ltd_exp->exp_obd->obd_name,
+			       rc);
+			GOTO(out_free_temp, rc);
+		}
+
+		if (idx != 0) {
+			/* Merge the counters of the additional MDTs. */
+			osfs->os_bavail += temp->os_bavail;
+			osfs->os_blocks += temp->os_blocks;
+			osfs->os_ffree += temp->os_ffree;
+			osfs->os_files += temp->os_files;
+			continue;
+		}
+
+		*osfs = *temp;
+		/*
+		 * A statfs issued for mount only needs the data of MDT0,
+		 * not the merged view of all MDTs; this also lets clients
+		 * mount as long as MDT0 is in service.
+		 */
+		if (flags & OBD_STATFS_FOR_MDT0)
+			GOTO(out_free_temp, rc);
+	}
+
+	EXIT;
+out_free_temp:
+	OBD_FREE(temp, sizeof(*temp));
+	return rc;
+}
+
+/*
+ * Forward a getstatus request to the target in slot 0 once the LMV is
+ * connected.
+ *
+ * \retval error from lmv_check_connect(), else result of md_getstatus()
+ */
+static int lmv_getstatus(struct obd_export *exp,
+			 struct lu_fid *fid,
+			 struct obd_capa **pc)
+{
+	struct obd_device    *obd = exp->exp_obd;
+	int		   rc;
+	ENTRY;
+
+	rc = lmv_check_connect(obd);
+	if (rc == 0)
+		rc = md_getstatus(obd->u.lmv.tgts[0]->ltd_exp, fid, pc);
+
+	RETURN(rc);
+}
+
+/*
+ * Route a getxattr request to the MDT that owns @fid.
+ *
+ * \retval error from lmv_check_connect() or lmv_find_target(), else the
+ *	   result of md_getxattr()
+ */
+static int lmv_getxattr(struct obd_export *exp, const struct lu_fid *fid,
+			struct obd_capa *oc, obd_valid valid, const char *name,
+			const char *input, int input_size, int output_size,
+			int flags, struct ptlrpc_request **request)
+{
+	struct obd_device      *obd = exp->exp_obd;
+	struct lmv_tgt_desc    *mdt;
+	int		     rc;
+	ENTRY;
+
+	rc = lmv_check_connect(obd);
+	if (rc)
+		RETURN(rc);
+
+	mdt = lmv_find_target(&obd->u.lmv, fid);
+	if (IS_ERR(mdt))
+		RETURN(PTR_ERR(mdt));
+
+	rc = md_getxattr(mdt->ltd_exp, fid, oc, valid, name, input,
+			 input_size, output_size, flags, request);
+	RETURN(rc);
+}
+
+/*
+ * Route a setxattr request to the MDT that owns @fid.
+ *
+ * \retval error from lmv_check_connect() or lmv_find_target(), else the
+ *	   result of md_setxattr()
+ */
+static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid,
+			struct obd_capa *oc, obd_valid valid, const char *name,
+			const char *input, int input_size, int output_size,
+			int flags, __u32 suppgid,
+			struct ptlrpc_request **request)
+{
+	struct obd_device      *obd = exp->exp_obd;
+	struct lmv_tgt_desc    *mdt;
+	int		     rc;
+	ENTRY;
+
+	rc = lmv_check_connect(obd);
+	if (rc)
+		RETURN(rc);
+
+	mdt = lmv_find_target(&obd->u.lmv, fid);
+	if (IS_ERR(mdt))
+		RETURN(PTR_ERR(mdt));
+
+	rc = md_setxattr(mdt->ltd_exp, fid, oc, valid, name, input,
+			 input_size, output_size, flags, suppgid,
+			 request);
+	RETURN(rc);
+}
+
+/*
+ * Route a getattr request to the MDT that owns op_data->op_fid1.
+ *
+ * When MF_GET_MDT_IDX is set in op_data->op_flags, only the index of that
+ * MDT is stored in op_data->op_mds and md_getattr() is not called.
+ *
+ * \retval error from lmv_check_connect() or lmv_find_target(), 0 for the
+ *	   index-only case, else the result of md_getattr()
+ */
+static int lmv_getattr(struct obd_export *exp, struct md_op_data *op_data,
+		       struct ptlrpc_request **request)
+{
+	struct obd_device       *obd = exp->exp_obd;
+	struct lmv_tgt_desc     *mdt;
+	int		      rc;
+	ENTRY;
+
+	rc = lmv_check_connect(obd);
+	if (rc)
+		RETURN(rc);
+
+	mdt = lmv_find_target(&obd->u.lmv, &op_data->op_fid1);
+	if (IS_ERR(mdt))
+		RETURN(PTR_ERR(mdt));
+
+	if (!(op_data->op_flags & MF_GET_MDT_IDX)) {
+		rc = md_getattr(mdt->ltd_exp, op_data, request);
+		RETURN(rc);
+	}
+
+	/* The caller only wants to know where the object lives. */
+	op_data->op_mds = mdt->ltd_idx;
+	RETURN(0);
+}
+
+/*
+ * Call md_null_inode() for @fid on every connected target.  The results of
+ * the individual calls are not examined.
+ *
+ * \retval error from lmv_check_connect(), else 0
+ */
+static int lmv_null_inode(struct obd_export *exp, const struct lu_fid *fid)
+{
+	struct obd_device   *obd = exp->exp_obd;
+	struct lmv_obd      *lmv = &obd->u.lmv;
+	int		  idx;
+	int		  rc;
+	ENTRY;
+
+	rc = lmv_check_connect(obd);
+	if (rc)
+		RETURN(rc);
+
+	CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid));
+
+	/*
+	 * Under DNE an object may hold locks in two namespaces: the lookup
+	 * lock on the MDT storing the direntry and the update/open lock on
+	 * the MDT storing the inode.  Hence every target is visited.
+	 */
+	for (idx = 0; idx < lmv->desc.ld_tgt_count; idx++) {
+		struct lmv_tgt_desc *tgt = lmv->tgts[idx];
+
+		if (tgt != NULL && tgt->ltd_exp != NULL)
+			md_null_inode(tgt->ltd_exp, fid);
+	}
+
+	RETURN(0);
+}
+
+/*
+ * Run md_find_cbdata() for @fid on the connected targets in index order and
+ * stop at the first one that returns non-zero.
+ *
+ * \retval error from lmv_check_connect(), else the first non-zero result
+ *	   of md_find_cbdata(), else 0
+ */
+static int lmv_find_cbdata(struct obd_export *exp, const struct lu_fid *fid,
+			   ldlm_iterator_t it, void *data)
+{
+	struct obd_device   *obd = exp->exp_obd;
+	struct lmv_obd      *lmv = &obd->u.lmv;
+	int		  idx;
+	int		  rc;
+	ENTRY;
+
+	rc = lmv_check_connect(obd);
+	if (rc)
+		RETURN(rc);
+
+	CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid));
+
+	/*
+	 * Under DNE an object may hold locks in two namespaces: the lookup
+	 * lock on the MDT storing the direntry and the update/open lock on
+	 * the MDT storing the inode.  Hence every target is visited.
+	 */
+	for (idx = 0; idx < lmv->desc.ld_tgt_count; idx++) {
+		struct lmv_tgt_desc *tgt = lmv->tgts[idx];
+
+		if (tgt == NULL || tgt->ltd_exp == NULL)
+			continue;
+
+		rc = md_find_cbdata(tgt->ltd_exp, fid, it, data);
+		if (rc != 0)
+			break;
+	}
+
+	RETURN(rc);
+}
+
+
+/*
+ * Route a close request to the MDT that owns op_data->op_fid1.
+ *
+ * \retval error from lmv_check_connect() or lmv_find_target(), else the
+ *	   result of md_close()
+ */
+static int lmv_close(struct obd_export *exp, struct md_op_data *op_data,
+		     struct md_open_data *mod, struct ptlrpc_request **request)
+{
+	struct obd_device     *obd = exp->exp_obd;
+	struct lmv_tgt_desc   *mdt;
+	int		    rc;
+	ENTRY;
+
+	rc = lmv_check_connect(obd);
+	if (rc)
+		RETURN(rc);
+
+	mdt = lmv_find_target(&obd->u.lmv, &op_data->op_fid1);
+	if (IS_ERR(mdt))
+		RETURN(PTR_ERR(mdt));
+
+	CDEBUG(D_INODE, "CLOSE "DFID"\n", PFID(&op_data->op_fid1));
+	RETURN(md_close(mdt->ltd_exp, op_data, mod, request));
+}
+
+/*
+ * Find the target that owns @fid and, on success, record its index in
+ * op_data->op_mds.
+ *
+ * \retval target descriptor, or the ERR_PTR returned by lmv_find_target()
+ */
+struct lmv_tgt_desc
+*lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data,
+		struct lu_fid *fid)
+{
+	struct lmv_tgt_desc *owner = lmv_find_target(lmv, fid);
+
+	if (!IS_ERR(owner))
+		op_data->op_mds = owner->ltd_idx;
+
+	return owner;
+}
+
+/*
+ * Create a new object.
+ *
+ * The request goes to the MDT owning op_data->op_fid1; the FID of the new
+ * object is allocated into op_data->op_fid2 by lmv_fid_alloc() before the
+ * request is sent.
+ *
+ * \retval -EIO		no target is active
+ * \retval other	error from lmv_check_connect(), lmv_locate_mds() or
+ *			lmv_fid_alloc(), else the result of md_create()
+ */
+int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
+	       const void *data, int datalen, int mode, __u32 uid,
+	       __u32 gid, cfs_cap_t cap_effective, __u64 rdev,
+	       struct ptlrpc_request **request)
+{
+	struct obd_device       *obd = exp->exp_obd;
+	struct lmv_obd	  *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc     *parent_mdt;
+	int		      rc;
+	ENTRY;
+
+	rc = lmv_check_connect(obd);
+	if (rc)
+		RETURN(rc);
+
+	if (lmv->desc.ld_active_tgt_count == 0)
+		RETURN(-EIO);
+
+	parent_mdt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+	if (IS_ERR(parent_mdt))
+		RETURN(PTR_ERR(parent_mdt));
+
+	rc = lmv_fid_alloc(exp, &op_data->op_fid2, op_data);
+	if (rc)
+		RETURN(rc);
+
+	CDEBUG(D_INODE, "CREATE '%*s' on "DFID" -> mds #%x\n",
+	       op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1),
+	       op_data->op_mds);
+
+	op_data->op_flags |= MF_MDC_CANCEL_FID1;
+	rc = md_create(parent_mdt->ltd_exp, op_data, data, datalen, mode, uid,
+		       gid, cap_effective, rdev, request);
+
+	/* The success message is only logged when a reply came back. */
+	if (rc == 0 && *request != NULL)
+		CDEBUG(D_INODE, "Created - "DFID"\n", PFID(&op_data->op_fid2));
+
+	RETURN(rc);
+}
+
+/*
+ * Route a done_writing request to the MDT that owns op_data->op_fid1.
+ *
+ * \retval error from lmv_check_connect() or lmv_find_target(), else the
+ *	   result of md_done_writing()
+ */
+static int lmv_done_writing(struct obd_export *exp,
+			    struct md_op_data *op_data,
+			    struct md_open_data *mod)
+{
+	struct obd_device     *obd = exp->exp_obd;
+	struct lmv_tgt_desc   *mdt;
+	int		    rc;
+	ENTRY;
+
+	rc = lmv_check_connect(obd);
+	if (rc)
+		RETURN(rc);
+
+	mdt = lmv_find_target(&obd->u.lmv, &op_data->op_fid1);
+	if (IS_ERR(mdt))
+		RETURN(PTR_ERR(mdt));
+
+	RETURN(md_done_writing(mdt->ltd_exp, op_data, mod));
+}
+
+/*
+ * Second stage of an enqueue whose reply points at another MDT.
+ *
+ * If the reply body attached to @it does not have OBD_MD_MDS set, nothing
+ * is done.  Otherwise the lock obtained so far is remembered, the intent is
+ * reset, the first request is released and a new enqueue for the FID from
+ * the reply (marked MDS_CROSS_REF) is sent to the MDT owning that FID.
+ * The remembered lock is dropped again before returning, whether or not
+ * the second enqueue succeeded.
+ *
+ * \param extra_lock_flags  passed through to md_enqueue(); kept as __u64
+ *			    so that no flag bits supplied by lmv_enqueue()
+ *			    are truncated
+ *
+ * \retval 0		nothing to do, or remote enqueue succeeded
+ * \retval -ENOMEM	operation data could not be allocated
+ * \retval other	error from lmv_find_target() or md_enqueue()
+ */
+static int
+lmv_enqueue_remote(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
+		   struct lookup_intent *it, struct md_op_data *op_data,
+		   struct lustre_handle *lockh, void *lmm, int lmmsize,
+		   __u64 extra_lock_flags)
+{
+	struct ptlrpc_request      *req = it->d.lustre.it_data;
+	struct obd_device	  *obd = exp->exp_obd;
+	struct lmv_obd	     *lmv = &obd->u.lmv;
+	struct lustre_handle	plock;
+	struct lmv_tgt_desc	*tgt;
+	struct md_op_data	  *rdata;
+	struct lu_fid	       fid1;
+	struct mdt_body	    *body;
+	int			 rc = 0;
+	int			 pmode;
+	ENTRY;
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	LASSERT(body != NULL);
+
+	if (!(body->valid & OBD_MD_MDS))
+		RETURN(0);
+
+	CDEBUG(D_INODE, "REMOTE_ENQUEUE '%s' on "DFID" -> "DFID"\n",
+	       LL_IT2STR(it), PFID(&op_data->op_fid1), PFID(&body->fid1));
+
+	/*
+	 * We got LOOKUP lock, but we really need attrs.
+	 */
+	pmode = it->d.lustre.it_lock_mode;
+	LASSERT(pmode != 0);
+	memcpy(&plock, lockh, sizeof(plock));
+	it->d.lustre.it_lock_mode = 0;
+	it->d.lustre.it_data = NULL;
+	/* Copy the FID out of the reply before the request is released. */
+	fid1 = body->fid1;
+
+	it->d.lustre.it_disposition &= ~DISP_ENQ_COMPLETE;
+	ptlrpc_req_finished(req);
+
+	tgt = lmv_find_target(lmv, &fid1);
+	if (IS_ERR(tgt))
+		GOTO(out, rc = PTR_ERR(tgt));
+
+	OBD_ALLOC_PTR(rdata);
+	if (rdata == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	rdata->op_fid1 = fid1;
+	rdata->op_bias = MDS_CROSS_REF;
+
+	rc = md_enqueue(tgt->ltd_exp, einfo, it, rdata, lockh,
+			lmm, lmmsize, NULL, extra_lock_flags);
+	OBD_FREE_PTR(rdata);
+	EXIT;
+out:
+	ldlm_lock_decref(&plock, pmode);
+	return rc;
+}
+
+/*
+ * Enqueue a lock on the MDT owning op_data->op_fid1.  For a successful
+ * IT_OPEN intent, lmv_enqueue_remote() is then given the chance to
+ * continue on another MDT.
+ *
+ * \retval error from lmv_check_connect() or lmv_locate_mds(), else the
+ *	   result of md_enqueue() / lmv_enqueue_remote()
+ */
+static int
+lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
+	    struct lookup_intent *it, struct md_op_data *op_data,
+	    struct lustre_handle *lockh, void *lmm, int lmmsize,
+	    struct ptlrpc_request **req, __u64 extra_lock_flags)
+{
+	struct obd_device	*obd = exp->exp_obd;
+	struct lmv_tgt_desc      *mdt;
+	int		       rc;
+	ENTRY;
+
+	rc = lmv_check_connect(obd);
+	if (rc)
+		RETURN(rc);
+
+	CDEBUG(D_INODE, "ENQUEUE '%s' on "DFID"\n",
+	       LL_IT2STR(it), PFID(&op_data->op_fid1));
+
+	mdt = lmv_locate_mds(&obd->u.lmv, op_data, &op_data->op_fid1);
+	if (IS_ERR(mdt))
+		RETURN(PTR_ERR(mdt));
+
+	CDEBUG(D_INODE, "ENQUEUE '%s' on "DFID" -> mds #%d\n",
+	       LL_IT2STR(it), PFID(&op_data->op_fid1), mdt->ltd_idx);
+
+	rc = md_enqueue(mdt->ltd_exp, einfo, it, op_data, lockh,
+			lmm, lmmsize, req, extra_lock_flags);
+	if (rc != 0)
+		RETURN(rc);
+
+	if (it != NULL && it->it_op == IT_OPEN)
+		rc = lmv_enqueue_remote(exp, einfo, it, op_data, lockh,
+					lmm, lmmsize, extra_lock_flags);
+
+	RETURN(rc);
+}
+
+/*
+ * Get attributes by name from the MDT owning op_data->op_fid1.
+ *
+ * When the reply has OBD_MD_MDS set, the FID from the reply is queried
+ * again, without a name and with OBD_MD_FLCROSSREF, on the MDT that owns
+ * it; the first request is released and *request is replaced by the second
+ * one.
+ *
+ * \retval error from lmv_check_connect(), lmv_locate_mds() or
+ *	   lmv_find_target(), else the result of md_getattr_name()
+ */
+static int
+lmv_getattr_name(struct obd_export *exp, struct md_op_data *op_data,
+		 struct ptlrpc_request **request)
+{
+	struct ptlrpc_request   *remote_req = NULL;
+	struct obd_device       *obd = exp->exp_obd;
+	struct lmv_obd	  *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc     *mdt;
+	struct mdt_body	 *body;
+	struct lu_fid		 rid;
+	int		      rc;
+	ENTRY;
+
+	rc = lmv_check_connect(obd);
+	if (rc)
+		RETURN(rc);
+
+	mdt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+	if (IS_ERR(mdt))
+		RETURN(PTR_ERR(mdt));
+
+	CDEBUG(D_INODE, "GETATTR_NAME for %*s on "DFID" -> mds #%d\n",
+	       op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1),
+	       mdt->ltd_idx);
+
+	rc = md_getattr_name(mdt->ltd_exp, op_data, request);
+	if (rc != 0)
+		RETURN(rc);
+
+	body = req_capsule_server_get(&(*request)->rq_pill,
+				      &RMF_MDT_BODY);
+	LASSERT(body != NULL);
+
+	/* Done unless the reply refers to an object on another MDT. */
+	if (!(body->valid & OBD_MD_MDS))
+		RETURN(rc);
+
+	rid = body->fid1;
+	CDEBUG(D_INODE, "Request attrs for "DFID"\n",
+	       PFID(&rid));
+
+	mdt = lmv_find_target(lmv, &rid);
+	if (IS_ERR(mdt)) {
+		ptlrpc_req_finished(*request);
+		RETURN(PTR_ERR(mdt));
+	}
+
+	op_data->op_fid1 = rid;
+	op_data->op_valid |= OBD_MD_FLCROSSREF;
+	op_data->op_namelen = 0;
+	op_data->op_name = NULL;
+	rc = md_getattr_name(mdt->ltd_exp, op_data, &remote_req);
+	ptlrpc_req_finished(*request);
+	*request = remote_req;
+
+	RETURN(rc);
+}
+
+/*
+ * Map one MF_MDC_CANCEL_FIDn flag to the address of the matching op_fidN
+ * member of @op_data; any other value yields NULL.  Both arguments are
+ * parenthesized so that expression arguments expand safely; @fl may be
+ * evaluated several times.
+ */
+#define md_op_data_fid(op_data, fl)				\
+	((fl) == MF_MDC_CANCEL_FID1 ? &(op_data)->op_fid1 :	\
+	 (fl) == MF_MDC_CANCEL_FID2 ? &(op_data)->op_fid2 :	\
+	 (fl) == MF_MDC_CANCEL_FID3 ? &(op_data)->op_fid3 :	\
+	 (fl) == MF_MDC_CANCEL_FID4 ? &(op_data)->op_fid4 :	\
+	 NULL)
+
+/*
+ * Cancel unused locks on the FID selected by @flag before an operation is
+ * sent to target @op_tgt.
+ *
+ * If the FID lives on a different MDT than @op_tgt, the cancel is issued to
+ * that MDT asynchronously.  If it lives on @op_tgt itself, @flag is merely
+ * set in op_data->op_flags.  A FID that is not sane is skipped.
+ *
+ * \retval 0 when skipped or only flagged, error from lmv_find_target(),
+ *	   else the result of md_cancel_unused()
+ */
+static int lmv_early_cancel(struct obd_export *exp, struct md_op_data *op_data,
+			    int op_tgt, ldlm_mode_t mode, int bits, int flag)
+{
+	struct lu_fid	  *fid = md_op_data_fid(op_data, flag);
+	struct obd_device      *obd = exp->exp_obd;
+	struct lmv_tgt_desc    *owner;
+	ldlm_policy_data_t      policy = {{0}};
+	int		     rc = 0;
+	ENTRY;
+
+	if (!fid_is_sane(fid))
+		RETURN(0);
+
+	owner = lmv_find_target(&obd->u.lmv, fid);
+	if (IS_ERR(owner))
+		RETURN(PTR_ERR(owner));
+
+	if (owner->ltd_idx == op_tgt) {
+		CDEBUG(D_INODE,
+		       "EARLY_CANCEL skip operation target %d on "DFID"\n",
+		       op_tgt, PFID(fid));
+		op_data->op_flags |= flag;
+		rc = 0;
+	} else {
+		CDEBUG(D_INODE, "EARLY_CANCEL on "DFID"\n", PFID(fid));
+		policy.l_inodebits.bits = bits;
+		rc = md_cancel_unused(owner->ltd_exp, fid, &policy,
+				      mode, LCF_ASYNC, NULL);
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Create a hard link.  llite passes the FID of the target inode in
+ * op_data->op_fid1 and the FID of the directory in op_data->op_fid2; the
+ * request is sent to the MDT owning the directory.
+ *
+ * \retval error from lmv_check_connect(), lmv_locate_mds() or
+ *	   lmv_early_cancel(), else the result of md_link()
+ */
+static int lmv_link(struct obd_export *exp, struct md_op_data *op_data,
+		    struct ptlrpc_request **request)
+{
+	struct obd_device       *obd = exp->exp_obd;
+	struct lmv_tgt_desc     *dir_mdt;
+	int		      rc;
+	ENTRY;
+
+	rc = lmv_check_connect(obd);
+	if (rc)
+		RETURN(rc);
+
+	LASSERT(op_data->op_namelen != 0);
+
+	CDEBUG(D_INODE, "LINK "DFID":%*s to "DFID"\n",
+	       PFID(&op_data->op_fid2), op_data->op_namelen,
+	       op_data->op_name, PFID(&op_data->op_fid1));
+
+	/* The operation runs with the credentials of the calling task. */
+	op_data->op_fsuid = current_fsuid();
+	op_data->op_fsgid = current_fsgid();
+	op_data->op_cap = cfs_curproc_cap_pack();
+
+	dir_mdt = lmv_locate_mds(&obd->u.lmv, op_data, &op_data->op_fid2);
+	if (IS_ERR(dir_mdt))
+		RETURN(PTR_ERR(dir_mdt));
+
+	/* Early-cancel the UPDATE lock on the child (fid1). */
+	op_data->op_flags |= MF_MDC_CANCEL_FID2;
+	rc = lmv_early_cancel(exp, op_data, dir_mdt->ltd_idx, LCK_EX,
+			      MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1);
+	if (rc == 0)
+		rc = md_link(dir_mdt->ltd_exp, op_data, request);
+
+	RETURN(rc);
+}
+
+/*
+ * Rename @old in directory op_fid1 to @new in directory op_fid2.
+ *
+ * The request is sent to the MDT owning the source directory.  Before that,
+ * locks on the target parent (fid2) and the target child (fid4) are
+ * early-cancelled relative to that MDT; the first failing step ends the
+ * operation.
+ *
+ * \retval error from lmv_check_connect(), lmv_locate_mds() or
+ *	   lmv_early_cancel(), else the result of md_rename()
+ */
+static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data,
+		      const char *old, int oldlen, const char *new, int newlen,
+		      struct ptlrpc_request **request)
+{
+	struct obd_device       *obd = exp->exp_obd;
+	struct lmv_obd	  *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc     *src_mdt;
+	struct lmv_tgt_desc     *dst_mdt;
+	int			rc;
+	ENTRY;
+
+	LASSERT(oldlen != 0);
+
+	CDEBUG(D_INODE, "RENAME %*s in "DFID" to %*s in "DFID"\n",
+	       oldlen, old, PFID(&op_data->op_fid1),
+	       newlen, new, PFID(&op_data->op_fid2));
+
+	rc = lmv_check_connect(obd);
+	if (rc)
+		RETURN(rc);
+
+	/* The operation runs with the credentials of the calling task. */
+	op_data->op_fsuid = current_fsuid();
+	op_data->op_fsgid = current_fsgid();
+	op_data->op_cap = cfs_curproc_cap_pack();
+
+	src_mdt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+	if (IS_ERR(src_mdt))
+		RETURN(PTR_ERR(src_mdt));
+
+	/* This second lookup leaves the target MDT index in op_mds. */
+	dst_mdt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2);
+	if (IS_ERR(dst_mdt))
+		RETURN(PTR_ERR(dst_mdt));
+
+	/*
+	 * The LOOKUP lock on the source child (fid3) is also to be cancelled
+	 * for src_mdt in mdc_rename.
+	 */
+	op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3;
+
+	/* UPDATE locks on the target parent (fid2). */
+	rc = lmv_early_cancel(exp, op_data, src_mdt->ltd_idx,
+			      LCK_EX, MDS_INODELOCK_UPDATE,
+			      MF_MDC_CANCEL_FID2);
+	if (rc != 0)
+		RETURN(rc);
+
+	/* LOOKUP locks on the target child (fid4). */
+	rc = lmv_early_cancel(exp, op_data, src_mdt->ltd_idx,
+			      LCK_EX, MDS_INODELOCK_LOOKUP,
+			      MF_MDC_CANCEL_FID4);
+	if (rc != 0)
+		RETURN(rc);
+
+	/* All remaining locks on the target child (fid4). */
+	rc = lmv_early_cancel(exp, op_data, src_mdt->ltd_idx,
+			      LCK_EX, MDS_INODELOCK_FULL,
+			      MF_MDC_CANCEL_FID4);
+	if (rc != 0)
+		RETURN(rc);
+
+	rc = md_rename(src_mdt->ltd_exp, op_data, old, oldlen,
+		       new, newlen, request);
+	RETURN(rc);
+}
+
+/* Send a setattr for op_fid1 to the target lmv_find_target() selects. */
+static int lmv_setattr(struct obd_export *exp, struct md_op_data *op_data,
+		       void *ea, int ealen, void *ea2, int ea2len,
+		       struct ptlrpc_request **request,
+		       struct md_open_data **mod)
+{
+	struct obd_device	*obd = exp->exp_obd;
+	struct lmv_tgt_desc	*mds;
+	int			 err;
+	ENTRY;
+
+	err = lmv_check_connect(obd);
+	if (err != 0)
+		RETURN(err);
+
+	CDEBUG(D_INODE, "SETATTR for "DFID", valid 0x%x\n",
+	       PFID(&op_data->op_fid1), op_data->op_attr.ia_valid);
+
+	/* The flag is set before the lookup, so it stays set on failure. */
+	op_data->op_flags |= MF_MDC_CANCEL_FID1;
+	mds = lmv_find_target(&obd->u.lmv, &op_data->op_fid1);
+	if (IS_ERR(mds))
+		RETURN(PTR_ERR(mds));
+
+	err = md_setattr(mds->ltd_exp, op_data, ea, ealen, ea2,
+			 ea2len, request, mod);
+	RETURN(err);
+}
+
+/* Forward a sync of @fid to the target lmv_find_target() returns for it. */
+static int lmv_sync(struct obd_export *exp, const struct lu_fid *fid,
+		    struct obd_capa *oc, struct ptlrpc_request **request)
+{
+	struct obd_device	*obd = exp->exp_obd;
+	struct lmv_tgt_desc	*mds;
+	int			 err;
+	ENTRY;
+
+	err = lmv_check_connect(obd);
+	if (err != 0)
+		RETURN(err);
+
+	mds = lmv_find_target(&obd->u.lmv, fid);
+	if (IS_ERR(mds))
+		RETURN(PTR_ERR(mds));
+
+	err = md_sync(mds->ltd_exp, fid, oc, request);
+	RETURN(err);
+}
+
+/*
+ * Adjust a set of pages, each page containing an array of lu_dirpages,
+ * so that each page can be used as a single logical lu_dirpage.
+ *
+ * A lu_dirpage is laid out as follows, where s = ldp_hash_start,
+ * e = ldp_hash_end, f = ldp_flags, p = padding, and each "ent" is a
+ * struct lu_dirent.  It has size up to LU_PAGE_SIZE. The ldp_hash_end
+ * value is used as a cookie to request the next lu_dirpage in a
+ * directory listing that spans multiple pages (two in this example):
+ *   ________
+ *  |        |
+ * .|--------v-------   -----.
+ * |s|e|f|p|ent|ent| ... |ent|
+ * '--|--------------   -----'   Each CFS_PAGE contains a single
+ *    '------.                   lu_dirpage.
+ * .---------v-------   -----.
+ * |s|e|f|p|ent| 0 | ... | 0 |
+ * '-----------------   -----'
+ *
+ * However, on hosts where the native VM page size (PAGE_CACHE_SIZE) is
+ * larger than LU_PAGE_SIZE, a single host page may contain multiple
+ * lu_dirpages. After reading the lu_dirpages from the MDS, the
+ * ldp_hash_end of the first lu_dirpage refers to the one immediately
+ * after it in the same CFS_PAGE (arrows simplified for brevity, but
+ * in general e0==s1, e1==s2, etc.):
+ *
+ * .--------------------   -----.
+ * |s0|e0|f0|p|ent|ent| ... |ent|
+ * |---v----------------   -----|
+ * |s1|e1|f1|p|ent|ent| ... |ent|
+ * |---v----------------   -----|  Here, each CFS_PAGE contains
+ *             ...                 multiple lu_dirpages.
+ * |---v----------------   -----|
+ * |s'|e'|f'|p|ent|ent| ... |ent|
+ * '---|----------------   -----'
+ *     v
+ * .----------------------------.
+ * |        next CFS_PAGE       |
+ *
+ * This structure is transformed into a single logical lu_dirpage as follows:
+ *
+ * - Replace e0 with e' so the request for the next lu_dirpage gets the page
+ *   labeled 'next CFS_PAGE'.
+ *
+ * - Copy the LDF_COLLIDE flag from f' to f0 to correctly reflect whether
+ *   a hash collision with the next page exists.
+ *
+ * - Adjust the lde_reclen of the ending entry of each lu_dirpage to span
+ *   to the first entry of the next lu_dirpage.
+ */
+#if PAGE_CACHE_SIZE > LU_PAGE_SIZE
+static void lmv_adjust_dirpages(struct page **pages, int ncfspgs, int nlupgs)
+{
+	int i;
+	/* One pass per host page; nlupgs is not reset between pages. */
+	for (i = 0; i < ncfspgs; i++) {
+		struct lu_dirpage	*dp = kmap(pages[i]);
+		struct lu_dirpage	*first = dp;
+		struct lu_dirent	*end_dirent = NULL;
+		struct lu_dirent	*ent;
+		__u64			hash_end = dp->ldp_hash_end;
+		__u32			flags = dp->ldp_flags;
+		/* Visit the lu_dirpages packed into this host page. */
+		for (; nlupgs > 1; nlupgs--) {
+			ent = lu_dirent_start(dp);
+			for (end_dirent = ent; ent != NULL;
+			     end_dirent = ent, ent = lu_dirent_next(ent));
+			/* end_dirent: last entry walked, NULL if none. */
+			/* Advance dp to next lu_dirpage. */
+			dp = (struct lu_dirpage *)((char *)dp + LU_PAGE_SIZE);
+
+			/* Check if we've reached the end of the CFS_PAGE. */
+			if (!((unsigned long)dp & ~CFS_PAGE_MASK))
+				break;
+
+			/* Save the hash and flags of this lu_dirpage. */
+			hash_end = dp->ldp_hash_end;
+			flags = dp->ldp_flags;
+
+			/* Check if lu_dirpage contains no entries. */
+			if (!end_dirent)
+				break;
+
+			/* Enlarge the end entry lde_reclen from 0 to
+			 * first entry of next lu_dirpage. */
+			LASSERT(le16_to_cpu(end_dirent->lde_reclen) == 0);
+			end_dirent->lde_reclen =
+				cpu_to_le16((char *)(dp->ldp_entries) -
+					    (char *)end_dirent);
+		}
+		/* Store the last saved hash_end and LDF_COLLIDE bit in first. */
+		first->ldp_hash_end = hash_end;
+		first->ldp_flags &= ~cpu_to_le32(LDF_COLLIDE);
+		first->ldp_flags |= flags & cpu_to_le32(LDF_COLLIDE);
+
+		kunmap(pages[i]);
+	}
+}
+#else
+#define lmv_adjust_dirpages(pages, ncfspgs, nlupgs) do {} while (0)
+#endif	/* PAGE_CACHE_SIZE > LU_PAGE_SIZE */
+
+/* Read directory pages for op_fid1, then fix them up for this host. */
+static int lmv_readpage(struct obd_export *exp, struct md_op_data *op_data,
+			struct page **pages, struct ptlrpc_request **request)
+{
+	struct obd_device	*obd = exp->exp_obd;
+	struct lmv_tgt_desc	*mds;
+	__u64			 offset = op_data->op_offset;
+	int			 ncfspgs; /* pages read in PAGE_CACHE_SIZE */
+	int			 nlupgs; /* pages read in LU_PAGE_SIZE */
+	int			 err;
+	ENTRY;
+
+	err = lmv_check_connect(obd);
+	if (err != 0)
+		RETURN(err);
+
+	CDEBUG(D_INODE, "READPAGE at "LPX64" from "DFID"\n",
+	       offset, PFID(&op_data->op_fid1));
+
+	mds = lmv_find_target(&obd->u.lmv, &op_data->op_fid1);
+	if (IS_ERR(mds))
+		RETURN(PTR_ERR(mds));
+
+	err = md_readpage(mds->ltd_exp, op_data, pages, request);
+	if (err != 0)
+		RETURN(err);
+
+	/* Host pages round up; the transfer must be whole lu_dirpages. */
+	ncfspgs = ((*request)->rq_bulk->bd_nob_transferred +
+		   PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	nlupgs = (*request)->rq_bulk->bd_nob_transferred >> LU_PAGE_SHIFT;
+	LASSERT(!((*request)->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK));
+	LASSERT(ncfspgs > 0 && ncfspgs <= op_data->op_npages);
+
+	CDEBUG(D_INODE, "read %d(%d)/%d pages\n", ncfspgs, nlupgs,
+	       op_data->op_npages);
+
+	lmv_adjust_dirpages(pages, ncfspgs, nlupgs);
+	RETURN(err);
+}
+
+/* Unlink a name, resending to another MDT for as long as the reply body
+ * carries OBD_MD_MDS (see the note at the bottom of the loop). */
+static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data,
+		      struct ptlrpc_request **request)
+{
+	struct obd_device	*obd = exp->exp_obd;
+	struct lmv_obd		*lmv = &obd->u.lmv;
+	struct lmv_tgt_desc	*mds;
+	struct mdt_body		*reply;
+	int			 err;
+	ENTRY;
+
+	err = lmv_check_connect(obd);
+	if (err != 0)
+		RETURN(err);
+
+	for (;;) {
+		/* Send unlink requests to the MDT where the child is located */
+		if (likely(!fid_is_zero(&op_data->op_fid2)))
+			mds = lmv_locate_mds(lmv, op_data, &op_data->op_fid2);
+		else
+			mds = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+		if (IS_ERR(mds))
+			RETURN(PTR_ERR(mds));
+
+		op_data->op_fsuid = current_fsuid();
+		op_data->op_fsgid = current_fsgid();
+		op_data->op_cap = cfs_curproc_cap_pack();
+
+		/*
+		 * If child's fid is given, cancel unused locks for it if it
+		 * is from another export than parent.  LOOKUP lock for child
+		 * (fid3) should also be cancelled on parent tgt_tgt in
+		 * mdc_unlink().
+		 */
+		op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3;
+
+		/* Cancel FULL locks on child (fid3). */
+		err = lmv_early_cancel(exp, op_data, mds->ltd_idx, LCK_EX,
+				       MDS_INODELOCK_FULL, MF_MDC_CANCEL_FID3);
+		if (err != 0)
+			RETURN(err);
+
+		CDEBUG(D_INODE, "unlink with fid="DFID"/"DFID" -> mds #%d\n",
+		       PFID(&op_data->op_fid1), PFID(&op_data->op_fid2),
+		       mds->ltd_idx);
+
+		err = md_unlink(mds->ltd_exp, op_data, request);
+		if (err != 0 && err != -EREMOTE)
+			RETURN(err);
+
+		reply = req_capsule_server_get(&(*request)->rq_pill,
+					       &RMF_MDT_BODY);
+		if (reply == NULL)
+			RETURN(-EPROTO);
+
+		/* Not cross-ref case, just get out of here. */
+		if (likely(!(reply->valid & OBD_MD_MDS)))
+			RETURN(0);
+
+		CDEBUG(D_INODE, "%s: try unlink to another MDT for "DFID"\n",
+		       exp->exp_obd->obd_name, PFID(&reply->fid1));
+
+		/* This is a remote object, try remote MDT, Note: it may
+		 * try more than 1 time here, Considering following case
+		 * /mnt/lustre is root on MDT0, remote1 is on MDT1
+		 * 1. Initially A does not know where remote1 is, it send
+		 *    unlink RPC to MDT0, MDT0 return -EREMOTE, it will
+		 *    resend unlink RPC to MDT1 (retry 1st time).
+		 *
+		 * 2. During the unlink RPC in flight,
+		 *    client B mv /mnt/lustre/remote1 /mnt/lustre/remote2
+		 *    and create new remote1, but on MDT0
+		 *
+		 * 3. MDT1 get unlink RPC(from A), then do remote lock on
+		 *    /mnt/lustre, then lookup get fid of remote1, and find
+		 *    it is remote dir again, and replay -EREMOTE again.
+		 *
+		 * 4. Then A will resend unlink RPC to MDT0. (retry 2nd times).
+		 *
+		 * In theory, it might try unlimited time here, but it should
+		 * be very rare case.  */
+		op_data->op_fid2 = reply->fid1;
+		ptlrpc_req_finished(*request);
+		*request = NULL;
+	}
+}
+
+/*
+ * Pre-cleanup hook; all the work happens at stage OBD_CLEANUP_EXPORTS.
+ */
+static int lmv_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+	struct lmv_obd *lmv = &obd->u.lmv;
+	int rc = 0;
+
+	/*
+	 * XXX: OBD_CLEANUP_EARLY should be calling obd_precleanup() down
+	 * to stack; every stage other than OBD_CLEANUP_EXPORTS is a no-op.
+	 */
+	if (stage == OBD_CLEANUP_EXPORTS) {
+		fld_client_proc_fini(&lmv->lmv_fld);
+		lprocfs_obd_cleanup(obd);
+	}
+
+	RETURN(rc);
+}
+
+/* get_info handler: dispatch on @key; unhandled keys yield -EINVAL. */
+static int lmv_get_info(const struct lu_env *env, struct obd_export *exp,
+			__u32 keylen, void *key, __u32 *vallen, void *val,
+			struct lov_stripe_md *lsm)
+{
+	struct obd_device	*obd;
+	struct lmv_obd		*lmv;
+	int			 rc;
+	ENTRY;
+
+	obd = class_exp2obd(exp);
+	if (obd == NULL) {
+		CDEBUG(D_IOCTL, "Invalid client cookie "LPX64"\n",
+		       exp->exp_handle.h_cookie);
+		RETURN(-EINVAL);
+	}
+	lmv = &obd->u.lmv;
+
+	if (keylen >= strlen("remote_flag") && !strcmp(key, "remote_flag")) {
+		int idx;
+
+		rc = lmv_check_connect(obd);
+		if (rc != 0)
+			RETURN(rc);
+
+		LASSERT(*vallen == sizeof(__u32));
+		for (idx = 0; idx < lmv->desc.ld_tgt_count; idx++) {
+			struct lmv_tgt_desc *mds = lmv->tgts[idx];
+
+			/* All tgts should be connected by now. */
+			if (mds == NULL || mds->ltd_exp == NULL)
+				continue;
+			rc = obd_get_info(env, mds->ltd_exp, keylen, key,
+					  vallen, val, NULL);
+			if (rc == 0)
+				RETURN(0);
+		}
+		RETURN(-EINVAL);
+	}
+
+	if (KEY_IS(KEY_MAX_EASIZE) || KEY_IS(KEY_CONN_DATA)) {
+		rc = lmv_check_connect(obd);
+		if (rc != 0)
+			RETURN(rc);
+
+		/* Forwarded to the first MDS, it should know LOV desc. */
+		rc = obd_get_info(env, lmv->tgts[0]->ltd_exp, keylen, key,
+				  vallen, val, NULL);
+		if (rc == 0 && KEY_IS(KEY_CONN_DATA))
+			exp->exp_connect_data = *(struct obd_connect_data *)val;
+		RETURN(rc);
+	}
+
+	if (KEY_IS(KEY_TGT_COUNT)) {
+		*((int *)val) = lmv->desc.ld_tgt_count;
+		RETURN(0);
+	}
+
+	CDEBUG(D_IOCTL, "Invalid key\n");
+	RETURN(-EINVAL);
+}
+
+/* Broadcast KEY_READ_ONLY/KEY_FLUSH_CTX to every connected target. */
+int lmv_set_info_async(const struct lu_env *env, struct obd_export *exp,
+		       obd_count keylen, void *key, obd_count vallen,
+		       void *val, struct ptlrpc_request_set *set)
+{
+	struct obd_device	*obd;
+	struct lmv_obd		*lmv;
+	int			 idx;
+	int			 first_err = 0;
+	ENTRY;
+
+	obd = class_exp2obd(exp);
+	if (obd == NULL) {
+		CDEBUG(D_IOCTL, "Invalid client cookie "LPX64"\n",
+		       exp->exp_handle.h_cookie);
+		RETURN(-EINVAL);
+	}
+	lmv = &obd->u.lmv;
+
+	/* No other key is supported here. */
+	if (!KEY_IS(KEY_READ_ONLY) && !KEY_IS(KEY_FLUSH_CTX))
+		RETURN(-EINVAL);
+
+	for (idx = 0; idx < lmv->desc.ld_tgt_count; idx++) {
+		struct lmv_tgt_desc *mds = lmv->tgts[idx];
+		int err;
+
+		if (mds == NULL || mds->ltd_exp == NULL)
+			continue;
+
+		err = obd_set_info_async(env, mds->ltd_exp,
+					 keylen, key, vallen, val, set);
+		/* Keep going on failure, but report the first error seen. */
+		if (err != 0 && first_err == 0)
+			first_err = err;
+	}
+	RETURN(first_err);
+}
+
+/* Pack @lsm into little-endian form in *lmmp, or query the size (@lmmp
+ * NULL) or free *lmmp (@lsm NULL). */
+int lmv_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
+	       struct lov_stripe_md *lsm)
+{
+	struct obd_device	*obd = class_exp2obd(exp);
+	struct lmv_obd		*lmv = &obd->u.lmv;
+	struct lmv_stripe_md	*disk;
+	struct lmv_stripe_md	*mem = (struct lmv_stripe_md *)lsm;
+	int			 size;
+	int			 idx;
+	ENTRY;
+
+	size = lmv_get_easize(lmv);
+	if (lmmp == NULL)
+		RETURN(size);
+
+	if (*lmmp != NULL && mem == NULL) {
+		OBD_FREE_LARGE(*lmmp, size);
+		*lmmp = NULL;
+		RETURN(0);
+	}
+
+	if (*lmmp == NULL) {
+		OBD_ALLOC_LARGE(*lmmp, size);
+		if (*lmmp == NULL)
+			RETURN(-ENOMEM);
+	}
+
+	if (mem == NULL)
+		RETURN(size);
+
+	if (mem->mea_magic != MEA_MAGIC_LAST_CHAR &&
+	    mem->mea_magic != MEA_MAGIC_ALL_CHARS)
+		RETURN(-EINVAL);
+
+	disk = (struct lmv_stripe_md *)*lmmp;
+	disk->mea_magic = cpu_to_le32(mem->mea_magic);
+	disk->mea_count = cpu_to_le32(mem->mea_count);
+	disk->mea_master = cpu_to_le32(mem->mea_master);
+
+	for (idx = 0; idx < lmv->desc.ld_tgt_count; idx++) {
+		disk->mea_ids[idx] = mem->mea_ids[idx];
+		fid_cpu_to_le(&disk->mea_ids[idx], &mem->mea_ids[idx]);
+	}
+
+	RETURN(size);
+}
+
+/* Unpack little-endian @lmm into a new CPU-endian copy in *lsmp; a NULL
+ * @lsmp queries the size, a NULL @lmm frees an existing *lsmp. */
+int lmv_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
+		 struct lov_mds_md *lmm, int lmm_size)
+{
+	struct obd_device	  *obd = class_exp2obd(exp);
+	struct lmv_stripe_md      **tmea = (struct lmv_stripe_md **)lsmp;
+	struct lmv_stripe_md       *mea = (struct lmv_stripe_md *)lmm;
+	struct lmv_obd	     *lmv = &obd->u.lmv;
+	int			 mea_size;
+	int			 i;
+	__u32		       magic;
+	ENTRY;
+
+	mea_size = lmv_get_easize(lmv);
+	if (lsmp == NULL)
+		RETURN(mea_size);
+
+	if (*lsmp != NULL && lmm == NULL) {
+		OBD_FREE_LARGE(*tmea, mea_size);
+		*lsmp = NULL;
+		RETURN(0);
+	}
+
+	LASSERT(mea_size == lmm_size);
+	OBD_ALLOC_LARGE(*tmea, mea_size);
+	if (*tmea == NULL)
+		RETURN(-ENOMEM);
+
+	if (!lmm)
+		RETURN(mea_size);
+
+	/* Convert before comparing: the packed magic is little-endian. */
+	magic = le32_to_cpu(mea->mea_magic);
+	if (magic != MEA_MAGIC_LAST_CHAR && magic != MEA_MAGIC_ALL_CHARS &&
+	    magic != MEA_MAGIC_HASH_SEGMENT) {
+		/* Old mea is not handled here. */
+		CERROR("Old not supportable EA is found\n");
+		LBUG();
+	}
+
+	(*tmea)->mea_magic = magic;
+	(*tmea)->mea_count = le32_to_cpu(mea->mea_count);
+	(*tmea)->mea_master = le32_to_cpu(mea->mea_master);
+	/* Both buffers are sized for ld_tgt_count ids; never copy more. */
+	if ((*tmea)->mea_count > lmv->desc.ld_tgt_count) {
+		OBD_FREE_LARGE(*tmea, mea_size);
+		*tmea = NULL;
+		RETURN(-EPROTO);
+	}
+	for (i = 0; i < (*tmea)->mea_count; i++)
+		fid_le_to_cpu(&(*tmea)->mea_ids[i], &mea->mea_ids[i]);
+	RETURN(mea_size);
+}
+
+static int lmv_cancel_unused(struct obd_export *exp, const struct lu_fid *fid,
+			     ldlm_policy_data_t *policy, ldlm_mode_t mode,
+			     ldlm_cancel_flags_t flags, void *opaque)
+{
+	struct obd_device       *obd = exp->exp_obd;
+	struct lmv_obd	  *lmv = &obd->u.lmv;
+	int		      rc = 0;
+	int		      err;
+	int		      i;
+	ENTRY;
+
+	LASSERT(fid != NULL);
+
+	for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+		if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL ||
+		    lmv->tgts[i]->ltd_active == 0)
+			continue;
+
+		err = md_cancel_unused(lmv->tgts[i]->ltd_exp, fid,
+				       policy, mode, flags, opaque);
+		if (!rc)
+			rc = err;
+	}
+	RETURN(rc);
+}
+
+/* Forwarded unchanged to target 0. */
+int lmv_set_lock_data(struct obd_export *exp, __u64 *lockh, void *data,
+		      __u64 *bits)
+{
+	struct obd_export *mds_exp;
+	ENTRY;
+
+	mds_exp = exp->exp_obd->u.lmv.tgts[0]->ltd_exp;
+	RETURN(md_set_lock_data(mds_exp, lockh, data, bits));
+}
+
+/* Look for a matching lock on each active target in turn. */
+ldlm_mode_t lmv_lock_match(struct obd_export *exp, __u64 flags,
+			   const struct lu_fid *fid, ldlm_type_t type,
+			   ldlm_policy_data_t *policy, ldlm_mode_t mode,
+			   struct lustre_handle *lockh)
+{
+	struct lmv_obd		*lmv = &exp->exp_obd->u.lmv;
+	ldlm_mode_t		 matched;
+	int			 idx;
+	ENTRY;
+
+	CDEBUG(D_INODE, "Lock match for "DFID"\n", PFID(fid));
+
+	/*
+	 * With CMD every object can have two locks in different namespaces:
+	 * lookup lock in space of mds storing direntry and update/open lock in
+	 * space of mds storing inode. Thus we check all targets, not only that
+	 * one fid was created in.  The first non-zero match mode is returned.
+	 */
+	for (idx = 0; idx < lmv->desc.ld_tgt_count; idx++) {
+		struct lmv_tgt_desc *mds = lmv->tgts[idx];
+
+		if (mds == NULL || mds->ltd_exp == NULL || mds->ltd_active == 0)
+			continue;
+
+		matched = md_lock_match(mds->ltd_exp, flags, fid,
+					type, policy, mode, lockh);
+		if (matched)
+			RETURN(matched);
+	}
+
+	RETURN(0);
+}
+
+int lmv_get_lustre_md(struct obd_export *exp, struct ptlrpc_request *req,
+		      struct obd_export *dt_exp, struct obd_export *md_exp,
+		      struct lustre_md *md)
+{
+	/* Delegated to target 0. */
+	return md_get_lustre_md(exp->exp_obd->u.lmv.tgts[0]->ltd_exp, req,
+				dt_exp, md_exp, md);
+}
+
+/* Release md->mea (if any), then let target 0 free the rest of @md. */
+int lmv_free_lustre_md(struct obd_export *exp, struct lustre_md *md)
+{
+	struct lmv_obd *lmv = &exp->exp_obd->u.lmv;
+	ENTRY;
+
+	if (md->mea != NULL)
+		obd_free_memmd(exp, (void *)&md->mea);
+	RETURN(md_free_lustre_md(lmv->tgts[0]->ltd_exp, md));
+}
+
+/* Route to the target that lmv_find_target() returns for och->och_fid. */
+int lmv_set_open_replay_data(struct obd_export *exp,
+			     struct obd_client_handle *och,
+			     struct ptlrpc_request *open_req)
+{
+	struct lmv_obd		*lmv = &exp->exp_obd->u.lmv;
+	struct lmv_tgt_desc	*mds;
+	ENTRY;
+
+	mds = lmv_find_target(lmv, &och->och_fid);
+	if (IS_ERR(mds))
+		RETURN(PTR_ERR(mds));
+
+	RETURN(md_set_open_replay_data(mds->ltd_exp, och, open_req));
+}
+
+/* Route to the target that lmv_find_target() returns for och->och_fid. */
+int lmv_clear_open_replay_data(struct obd_export *exp,
+			       struct obd_client_handle *och)
+{
+	struct lmv_obd		*lmv = &exp->exp_obd->u.lmv;
+	struct lmv_tgt_desc	*mds;
+	ENTRY;
+
+	mds = lmv_find_target(lmv, &och->och_fid);
+	if (IS_ERR(mds))
+		RETURN(PTR_ERR(mds));
+
+	RETURN(md_clear_open_replay_data(mds->ltd_exp, och));
+}
+
+/* Ask the target lmv_find_target() returns for @fid for remote perms. */
+static int lmv_get_remote_perm(struct obd_export *exp,
+			       const struct lu_fid *fid,
+			       struct obd_capa *oc, __u32 suppgid,
+			       struct ptlrpc_request **request)
+{
+	struct obd_device	*obd = exp->exp_obd;
+	struct lmv_tgt_desc	*mds;
+	int			 err;
+	ENTRY;
+
+	err = lmv_check_connect(obd);
+	if (err != 0)
+		RETURN(err);
+
+	mds = lmv_find_target(&obd->u.lmv, fid);
+	if (IS_ERR(mds))
+		RETURN(PTR_ERR(mds));
+
+	err = md_get_remote_perm(mds->ltd_exp, fid, oc, suppgid, request);
+	RETURN(err);
+}
+
+/* Renew @oc on the target found for the fid stored in oc->c_capa.lc_fid. */
+static int lmv_renew_capa(struct obd_export *exp, struct obd_capa *oc,
+			  renew_capa_cb_t cb)
+{
+	struct obd_device	*obd = exp->exp_obd;
+	struct lmv_tgt_desc	*mds;
+	int			 err;
+	ENTRY;
+
+	err = lmv_check_connect(obd);
+	if (err != 0)
+		RETURN(err);
+
+	mds = lmv_find_target(&obd->u.lmv, &oc->c_capa.lc_fid);
+	if (IS_ERR(mds))
+		RETURN(PTR_ERR(mds));
+
+	err = md_renew_capa(mds->ltd_exp, oc, cb);
+	RETURN(err);
+}
+
+int lmv_unpack_capa(struct obd_export *exp, struct ptlrpc_request *req,
+		    const struct req_msg_field *field, struct obd_capa **oc)
+{
+	/* Delegated to target 0. */
+	return md_unpack_capa(exp->exp_obd->u.lmv.tgts[0]->ltd_exp, req,
+			      field, oc);
+}
+
+/* Issue the async getattr on the target found for minfo's op_fid1. */
+int lmv_intent_getattr_async(struct obd_export *exp,
+			     struct md_enqueue_info *minfo,
+			     struct ldlm_enqueue_info *einfo)
+{
+	struct md_op_data	*op_data = &minfo->mi_data;
+	struct obd_device	*obd = exp->exp_obd;
+	struct lmv_tgt_desc	*mds;
+	int			 err;
+	ENTRY;
+
+	err = lmv_check_connect(obd);
+	if (err != 0)
+		RETURN(err);
+
+	mds = lmv_find_target(&obd->u.lmv, &op_data->op_fid1);
+	if (IS_ERR(mds))
+		RETURN(PTR_ERR(mds));
+
+	err = md_intent_getattr_async(mds->ltd_exp, minfo, einfo);
+	RETURN(err);
+}
+
+/* Revalidate the lock on the target lmv_find_target() returns for @fid. */
+int lmv_revalidate_lock(struct obd_export *exp, struct lookup_intent *it,
+			struct lu_fid *fid, __u64 *bits)
+{
+	struct obd_device	*obd = exp->exp_obd;
+	struct lmv_tgt_desc	*mds;
+	int			 err;
+	ENTRY;
+
+	err = lmv_check_connect(obd);
+	if (err != 0)
+		RETURN(err);
+
+	mds = lmv_find_target(&obd->u.lmv, fid);
+	if (IS_ERR(mds))
+		RETURN(PTR_ERR(mds));
+
+	err = md_revalidate_lock(mds->ltd_exp, it, fid, bits);
+	RETURN(err);
+}
+
+/**
+ * For lmv, only need to send request to master MDT, and the master MDT will
+ * process with other slave MDTs. The only exception is Q_GETOQUOTA for which
+ * we directly fetch data from the slave MDTs.
+ */
+int lmv_quotactl(struct obd_device *unused, struct obd_export *exp,
+		 struct obd_quotactl *oqctl)
+{
+	struct obd_device   *obd = class_exp2obd(exp);
+	struct lmv_obd      *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc *tgt = lmv->tgts[0];
+	int		  rc = 0, i;
+	__u64		curspace, curinodes;
+	ENTRY;
+
+	/* tgts[] slots may be empty, so check the master before using it. */
+	if (!lmv->desc.ld_tgt_count || tgt == NULL || tgt->ltd_exp == NULL ||
+	    !tgt->ltd_active) {
+		CERROR("master lmv inactive\n");
+		RETURN(-EIO);
+	}
+
+	if (oqctl->qc_cmd != Q_GETOQUOTA)
+		RETURN(obd_quotactl(tgt->ltd_exp, oqctl));
+
+	/* Q_GETOQUOTA: sum the usage reported by every active target. */
+	curspace = curinodes = 0;
+	for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+		int err;
+		tgt = lmv->tgts[i];
+
+		if (tgt == NULL || tgt->ltd_exp == NULL)
+			continue;
+		if (!tgt->ltd_active) {
+			CDEBUG(D_HA, "mdt %d is inactive.\n", i);
+			continue;
+		}
+
+		err = obd_quotactl(tgt->ltd_exp, oqctl);
+		if (err) {
+			CERROR("getquota on mdt %d failed. %d\n", i, err);
+			if (!rc)
+				rc = err;
+		} else {
+			curspace += oqctl->qc_dqblk.dqb_curspace;
+			curinodes += oqctl->qc_dqblk.dqb_curinodes;
+		}
+	}
+	oqctl->qc_dqblk.dqb_curspace = curspace;
+	oqctl->qc_dqblk.dqb_curinodes = curinodes;
+	RETURN(rc);
+}
+
+/* Run quotacheck on every target; any unusable target fails it with -EIO. */
+int lmv_quotacheck(struct obd_device *unused, struct obd_export *exp,
+		   struct obd_quotactl *oqctl)
+{
+	struct lmv_obd *lmv = &class_exp2obd(exp)->u.lmv;
+	int idx, first_err = 0;
+	ENTRY;
+
+	for (idx = 0; idx < lmv->desc.ld_tgt_count; idx++) {
+		struct lmv_tgt_desc *mds = lmv->tgts[idx];
+		int err;
+
+		if (mds == NULL || mds->ltd_exp == NULL || !mds->ltd_active) {
+			CERROR("lmv idx %d inactive\n", idx);
+			RETURN(-EIO);
+		}
+
+		err = obd_quotacheck(mds->ltd_exp, oqctl);
+		if (err != 0 && first_err == 0)
+			first_err = err;
+	}
+
+	RETURN(first_err);
+}
+
+struct obd_ops lmv_obd_ops = {	/* passed to class_register_type() */
+	.o_owner = THIS_MODULE,
+	.o_setup = lmv_setup,
+	.o_cleanup = lmv_cleanup,
+	.o_precleanup = lmv_precleanup,
+	.o_process_config = lmv_process_config,
+	.o_connect = lmv_connect,
+	.o_disconnect = lmv_disconnect,
+	.o_statfs = lmv_statfs,
+	.o_get_info = lmv_get_info,
+	.o_set_info_async = lmv_set_info_async,
+	.o_packmd = lmv_packmd,
+	.o_unpackmd = lmv_unpackmd,
+	.o_notify = lmv_notify,
+	.o_get_uuid = lmv_get_uuid,
+	.o_iocontrol = lmv_iocontrol,
+	.o_quotacheck = lmv_quotacheck,
+	.o_quotactl = lmv_quotactl,
+};
+
+struct md_ops lmv_md_ops = {	/* passed to class_register_type() */
+	.m_getstatus = lmv_getstatus,
+	.m_null_inode = lmv_null_inode,
+	.m_find_cbdata = lmv_find_cbdata,
+	.m_close = lmv_close,
+	.m_create = lmv_create,
+	.m_done_writing = lmv_done_writing,
+	.m_enqueue = lmv_enqueue,
+	.m_getattr = lmv_getattr,
+	.m_getxattr = lmv_getxattr,
+	.m_getattr_name = lmv_getattr_name,
+	.m_intent_lock = lmv_intent_lock,
+	.m_link = lmv_link,
+	.m_rename = lmv_rename,
+	.m_setattr = lmv_setattr,
+	.m_setxattr = lmv_setxattr,
+	.m_sync = lmv_sync,
+	.m_readpage = lmv_readpage,
+	.m_unlink = lmv_unlink,
+	.m_init_ea_size = lmv_init_ea_size,
+	.m_cancel_unused = lmv_cancel_unused,
+	.m_set_lock_data = lmv_set_lock_data,
+	.m_lock_match = lmv_lock_match,
+	.m_get_lustre_md = lmv_get_lustre_md,
+	.m_free_lustre_md = lmv_free_lustre_md,
+	.m_set_open_replay_data = lmv_set_open_replay_data,
+	.m_clear_open_replay_data = lmv_clear_open_replay_data,
+	.m_renew_capa = lmv_renew_capa,
+	.m_unpack_capa = lmv_unpack_capa,
+	.m_get_remote_perm = lmv_get_remote_perm,
+	.m_intent_getattr_async = lmv_intent_getattr_async,
+	.m_revalidate_lock = lmv_revalidate_lock,
+};
+
+/* Module init: register the LMV obd type together with its obd and md
+ * method tables and the module-level lprocfs variables. */
+int __init lmv_init(void)
+{
+	struct lprocfs_static_vars lvars;
+
+	lprocfs_lmv_init_vars(&lvars);
+
+	return class_register_type(&lmv_obd_ops, &lmv_md_ops,
+				   lvars.module_vars, LUSTRE_LMV_NAME, NULL);
+}
+
+static void __exit lmv_exit(void)
+{
+	class_unregister_type(LUSTRE_LMV_NAME);	/* undoes lmv_init() */
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Logical Metadata Volume OBD driver");
+MODULE_LICENSE("GPL");
+/* lmv_init() is the module's init hook and lmv_exit() its exit hook. */
+module_init(lmv_init);
+module_exit(lmv_exit);
diff --git a/drivers/staging/lustre/lustre/lmv/lproc_lmv.c b/drivers/staging/lustre/lustre/lmv/lproc_lmv.c
new file mode 100644
index 000000000000..4bbe0241c93d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lmv/lproc_lmv.c
@@ -0,0 +1,239 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/version.h>
+#include <linux/seq_file.h>
+#include <asm/statfs.h>
+#include <lprocfs_status.h>
+#include <obd_class.h>
+
+#ifndef LPROCFS
+static struct lprocfs_vars lprocfs_lmv_module_vars[] = { {0} }; /* stub */
+struct lprocfs_vars lprocfs_lmv_obd_vars[] = { {0} };	/* !LPROCFS stub */
+#else
+/* procfs read handler: prints desc.ld_tgt_count followed by a newline
+ * and sets *eof. */
+static int lmv_rd_numobd(char *page, char **start, off_t off, int count,
+			 int *eof, void *data)
+{
+	struct obd_device *dev = (struct obd_device *)data;
+
+	LASSERT(dev != NULL);
+	*eof = 1;
+
+	return snprintf(page, count, "%u\n", dev->u.lmv.desc.ld_tgt_count);
+}
+
+static const char *placement_name[] = {	/* indexed by placement_policy_t */
+	[PLACEMENT_CHAR_POLICY] = "CHAR",
+	[PLACEMENT_NID_POLICY] = "NID",
+	[PLACEMENT_INVAL_POLICY] = "INVAL",
+};
+
+/* First policy whose name agrees with @name over @len chars, else INVAL. */
+static placement_policy_t placement_name2policy(char *name, int len)
+{
+	int idx;
+
+	for (idx = 0; idx < PLACEMENT_MAX_POLICY; idx++)
+		if (strncmp(placement_name[idx], name, len) == 0)
+			return idx;
+	return PLACEMENT_INVAL_POLICY;
+}
+
+static const char *placement_policy2name(placement_policy_t placement)
+{
+	LASSERT(placement < PLACEMENT_MAX_POLICY);	/* bounds check */
+	return placement_name[placement];
+}
+
+/* procfs read handler: print the name of the current placement policy. */
+static int lmv_rd_placement(char *page, char **start, off_t off, int count,
+			    int *eof, void *data)
+{
+	struct obd_device *dev = (struct obd_device *)data;
+	const char *name;
+
+	LASSERT(dev != NULL);
+	*eof = 1;
+	name = placement_policy2name(dev->u.lmv.lmv_placement);
+
+	return snprintf(page, count, "%s\n", name);
+}
+
+#define MAX_POLICY_STRING_SIZE 64
+
+/* procfs write handler: install the placement policy named by the user
+ * buffer.  Returns @count on success, -EFAULT or -EINVAL on failure. */
+static int lmv_wr_placement(struct file *file, const char *buffer,
+			    unsigned long count, void *data)
+{
+	struct obd_device *dev = (struct obd_device *)data;
+	char dummy[MAX_POLICY_STRING_SIZE + 1];
+	int len = count;
+	placement_policy_t policy;
+
+	LASSERT(dev != NULL);
+	/* Clamp first so we never read past what the caller supplied. */
+	if (count > MAX_POLICY_STRING_SIZE)
+		len = MAX_POLICY_STRING_SIZE;
+	if (copy_from_user(dummy, buffer, len))
+		return -EFAULT;
+
+	/* Strip one trailing newline; len is 0 for an empty write. */
+	if (len > 0 && dummy[len - 1] == '\n')
+		len--;
+	dummy[len] = '\0';
+
+	policy = placement_name2policy(dummy, len);
+	if (policy == PLACEMENT_INVAL_POLICY) {
+		CERROR("Invalid placement policy \"%s\"!\n", dummy);
+		return -EINVAL;
+	}
+
+	spin_lock(&dev->u.lmv.lmv_lock);
+	dev->u.lmv.lmv_placement = policy;
+	spin_unlock(&dev->u.lmv.lmv_lock);
+	return count;
+}
+
+/* procfs read handler: print desc.ld_active_tgt_count. */
+static int lmv_rd_activeobd(char *page, char **start, off_t off, int count,
+			    int *eof, void *data)
+{
+	struct obd_device *dev = (struct obd_device *)data;
+
+	LASSERT(dev != NULL);
+	*eof = 1;
+
+	return snprintf(page, count, "%u\n", dev->u.lmv.desc.ld_active_tgt_count);
+}
+
+/* procfs read handler: print the uuid string stored in the LMV desc. */
+static int lmv_rd_desc_uuid(char *page, char **start, off_t off, int count,
+			    int *eof, void *data)
+{
+	struct obd_device *dev = (struct obd_device *)data;
+
+	LASSERT(dev != NULL);
+	*eof = 1;
+
+	return snprintf(page, count, "%s\n", dev->u.lmv.desc.ld_uuid.uuid);
+}
+
+/* seq_file iterator over lmv->tgts[]; empty (NULL) slots are skipped. */
+static void *lmv_tgt_seq_start(struct seq_file *p, loff_t *pos)
+{
+	struct lmv_obd *lmv = &((struct obd_device *)p->private)->u.lmv;
+	for (; *pos < lmv->desc.ld_tgt_count; ++*pos)
+		if (lmv->tgts[*pos] != NULL)
+			return lmv->tgts[*pos];
+	return NULL;
+}
+
+static void lmv_tgt_seq_stop(struct seq_file *p, void *v)
+{
+}
+
+static void *lmv_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+	++*pos;
+	return lmv_tgt_seq_start(p, pos);
+}
+
+/* seq_file ->show: emit "<idx>: <uuid> [IN]ACTIVE" for one target slot. */
+static int lmv_tgt_seq_show(struct seq_file *p, void *v)
+{
+	struct lmv_tgt_desc *desc = v;
+	if (!desc)
+		return 0;
+	return seq_printf(p, "%d: %s %sACTIVE\n", desc->ltd_idx,
+			  desc->ltd_uuid.uuid, desc->ltd_active ? "" : "IN");
+}
+
+struct seq_operations lmv_tgt_sops = {	/* used by lmv_target_seq_open() */
+	.start = lmv_tgt_seq_start,
+	.stop = lmv_tgt_seq_stop,
+	.next = lmv_tgt_seq_next,
+	.show = lmv_tgt_seq_show,
+};
+
+/* open(): start a seq_file over lmv_tgt_sops and attach the PDE's data. */
+static int lmv_target_seq_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int err;
+
+	err = seq_open(file, &lmv_tgt_sops);
+	if (err != 0)
+		return err;
+
+	/* seq_open() left the seq_file in file->private_data. */
+	seq = file->private_data;
+	seq->private = PDE(inode)->data;
+	return 0;
+}
+
+struct lprocfs_vars lprocfs_lmv_obd_vars[] = {	/* name, read, write, data */
+	{ "numobd", lmv_rd_numobd, 0, 0 },
+	{ "placement", lmv_rd_placement, lmv_wr_placement, 0 },
+	{ "activeobd", lmv_rd_activeobd, 0, 0 },
+	{ "uuid", lprocfs_rd_uuid, 0, 0 },
+	{ "desc_uuid", lmv_rd_desc_uuid, 0, 0 },
+	{ 0 }
+};
+
+static struct lprocfs_vars lprocfs_lmv_module_vars[] = { /* 0-terminated */
+	{ "num_refs", lprocfs_rd_numrefs, 0, 0 },
+	{ 0 }
+};
+
+struct file_operations lmv_proc_target_fops = {	/* seq_file-backed */
+	.owner = THIS_MODULE,
+	.open = lmv_target_seq_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+#endif /* LPROCFS */
+void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars)
+{	/* Publish this file's two lprocfs tables through @lvars. */
+	lvars->module_vars    = lprocfs_lmv_module_vars;
+	lvars->obd_vars       = lprocfs_lmv_obd_vars;
+}
diff --git a/drivers/staging/lustre/lustre/lov/Makefile b/drivers/staging/lustre/lustre/lov/Makefile
new file mode 100644
index 000000000000..67eaec29bef1
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/Makefile
@@ -0,0 +1,9 @@
+# Build the LOV (logical object volume) layer as lov.o.
+obj-$(CONFIG_LUSTRE_FS) += lov.o
+
+lov-y := lov_log.o lov_obd.o lov_pack.o lproc_lov.o lov_offset.o lov_merge.o \
+	 lov_request.o lov_ea.o lov_dev.o lov_object.o lov_page.o \
+	 lov_lock.o lov_io.o lovsub_dev.o lovsub_object.o lovsub_page.o \
+	 lovsub_lock.o lovsub_io.o lov_pool.o
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/lov/lov_cl_internal.h b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h
new file mode 100644
index 000000000000..28801b8b5fdf
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h
@@ -0,0 +1,820 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Internal interfaces of LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@intel.com>
+ */
+
+#ifndef LOV_CL_INTERNAL_H
+#define LOV_CL_INTERNAL_H
+
+# include <linux/libcfs/libcfs.h>
+
+#include <obd.h>
+#include <cl_object.h>
+#include "lov_internal.h"
+
+/** \defgroup lov lov
+ * Logical object volume layer. This layer implements data striping (raid0).
+ *
+ * At the lov layer top-entity (object, page, lock, io) is connected to one or
+ * more sub-entities: top-object, representing a file is connected to a set of
+ * sub-objects, each representing a stripe, file-level top-lock is connected
+ * to a set of per-stripe sub-locks, top-page is connected to a (single)
+ * sub-page, and a top-level IO is connected to a set of (potentially
+ * concurrent) sub-IO's.
+ *
+ * Sub-object, sub-page, and sub-io have well-defined top-object and top-page
+ * respectively, while a single sub-lock can be part of multiple top-locks.
+ *
+ * Reference counting models are different for different types of entities:
+ *
+ *     - top-object keeps a reference to its sub-objects, and destroys them
+ *       when it is destroyed.
+ *
+ *     - top-page keeps a reference to its sub-page, and destroys it when it
+ *       is destroyed.
+ *
+ *     - sub-lock keeps a reference to its top-locks. Top-lock keeps a
+ *       reference (and a hold, see cl_lock_hold()) on its sub-locks when it
+ *       is actively using them (that is, in cl_lock_state::CLS_QUEUING,
+ *       cl_lock_state::CLS_ENQUEUED, cl_lock_state::CLS_HELD states). When
+ *       moving into cl_lock_state::CLS_CACHED state, top-lock releases a
+ *       hold. From this moment top-lock has only a 'weak' reference to its
+ *       sub-locks. This reference is protected by top-lock
+ *       cl_lock::cll_guard, and will be automatically cleared by the sub-lock
+ *       when the latter is destroyed. When a sub-lock is canceled, a
+ *       reference to it is removed from the top-lock array, and top-lock is
+ *       moved into CLS_NEW state. It is guaranteed that all sub-locks exist
+ *       while their top-lock is in CLS_HELD or CLS_CACHED states.
+ *
+ *     - IO's are not reference counted.
+ *
+ * To implement a connection between top and sub entities, lov layer is split
+ * into two pieces: lov ("upper half"), and lovsub ("bottom half"), both
+ * implementing full set of cl-interfaces. For example, top-object has vvp and
+ * lov layers, and its sub-object has lovsub and osc layers. lovsub layer is
+ * used to track child-parent relationship.
+ *
+ * @{
+ */
+
+struct lovsub_device;
+struct lovsub_object;
+struct lovsub_lock;
+
+enum lov_device_flags {
+	LOV_DEV_INITIALIZED = 0x1	/* set by lov_device_init() on success */
+};
+
+/*
+ * Upper half.
+ */
+
+/**
+ * Resources that are used in memory-cleaning path, and whose allocation
+ * cannot fail even when memory is tight. They are preallocated in sufficient
+ * quantities in lov_device::ld_emrg[], and access to them is serialized by
+ * lov_device::ld_mutex.
+ */
+struct lov_device_emerg {
+	/**
+	 * Page list used to submit IO when memory is under pressure.
+	 */
+	struct cl_page_list emrg_page_list;
+	/**
+	 * sub-io shared by all threads accessing this device when memory is
+	 * too low to allocate sub-io's dynamically.
+	 */
+	struct cl_io	emrg_subio;
+	/**
+	 * Environment used by the sub-io in
+	 * lov_device_emerg::emrg_subio.
+	 */
+	struct lu_env      *emrg_env;
+	/**
+	 * Refcheck for lov_device_emerg::emrg_env.
+	 *
+	 * \see cl_env_get()
+	 */
+	int		 emrg_refcheck;
+};
+
+struct lov_device {
+	/*
+	 * XXX Locking of lov-private data is missing.
+	 */
+	struct cl_device	  ld_cl;	/* embedded; see lu2lov_dev() */
+	struct lov_obd	   *ld_lov;
+	/** size of lov_device::ld_target[] array */
+	__u32		     ld_target_nr;
+	struct lovsub_device    **ld_target;	/* per-target slots, may be NULL */
+	__u32		     ld_flags;	/* bits from enum lov_device_flags */
+
+	/** Emergency resources used in memory-cleansing paths. */
+	struct lov_device_emerg **ld_emrg;
+	/**
+	 * Serializes access to lov_device::ld_emrg in low-memory
+	 * conditions.
+	 */
+	struct mutex		  ld_mutex;
+};
+
+/**
+ * Layout type.
+ */
+enum lov_layout_type {
+	/** empty file without body */
+	LLT_EMPTY = 0,
+	/** striped file */
+	LLT_RAID0 = 1,
+	LLT_NR    = 2	/* count of the layout types listed above */
+};
+
+/**
+ * lov-specific file state.
+ *
+ * lov object has particular layout type, determining how top-object is built
+ * on top of sub-objects. Layout type can change dynamically. When this
+ * happens, lov_object::lo_type_guard semaphore is taken in exclusive mode,
+ * all state pertaining to the old layout type is destroyed, and new state is
+ * constructed. All object methods take said semaphore in the shared mode,
+ * providing serialization against transition between layout types.
+ *
+ * To avoid multiple `if' or `switch' statements, selecting behavior for the
+ * current layout type, object methods perform double-dispatch, invoking
+ * function corresponding to the current layout type.
+ */
+struct lov_object {
+	struct cl_object       lo_cl;	/* embedded; see cl2lov() and lu2lov() */
+	/**
+	 * Serializes object operations with transitions between layout types.
+	 *
+	 * This semaphore is taken in shared mode by all object methods, and
+	 * is taken in exclusive mode when object type is changed.
+	 *
+	 * \see lov_object::lo_type
+	 */
+	struct rw_semaphore	lo_type_guard;
+	/**
+	 * Type of an object. Protected by lov_object::lo_type_guard.
+	 */
+	enum lov_layout_type	lo_type;
+	/**
+	 * True if layout is invalid. NOTE(review): the original text says the
+	 * bit is "cleared" when layout lock is lost; that reads inverted.
+	 */
+	bool			lo_layout_invalid;
+	/**
+	 * How many IOs are ongoing on this object. Layout can be changed
+	 * only if there is no active IO.
+	 */
+	atomic_t	       lo_active_ios;
+	/**
+	 * Waitq - used to wait until no one else is using lo_lsm
+	 */
+	wait_queue_head_t	       lo_waitq;
+	/**
+	 * Layout metadata. NULL if empty layout.
+	 */
+	struct lov_stripe_md  *lo_lsm;
+
+	union lov_layout_state {
+		struct lov_layout_raid0 {
+			unsigned	       lo_nr;
+			/**
+			 * When this is true, lov_object::lo_attr contains
+			 * valid up to date attributes for a top-level
+			 * object. This field is reset to 0 when attributes of
+			 * any sub-object change.
+			 */
+			int		       lo_attr_valid;
+			/**
+			 * Array of sub-objects. Allocated when top-object is
+			 * created (lov_init_raid0()).
+			 *
+			 * Top-object is a strict master of its sub-objects:
+			 * it is created before them, and outlives its
+			 * children (the latter is necessary so that basic
+			 * functions like cl_object_top() always
+			 * work). Top-object keeps a reference on every
+			 * sub-object.
+			 *
+			 * When top-object is destroyed (lov_delete_raid0())
+			 * it releases its reference to a sub-object and waits
+			 * until the latter is finally destroyed.
+			 */
+			struct lovsub_object **lo_sub;
+			/**
+			 * protect lo_sub
+			 */
+			spinlock_t		lo_sub_lock;
+			/**
+			 * Cached object attribute, built from sub-object
+			 * attributes.
+			 */
+			struct cl_attr	 lo_attr;
+		} raid0;	/* returned by lov_r0() for LLT_RAID0 objects */
+		struct lov_layout_state_empty {
+		} empty;
+	} u;
+	/**
+	 * Thread that acquired lov_object::lo_type_guard in an exclusive
+	 * mode.
+	 */
+	task_t	    *lo_owner;
+};
+
+/**
+ * Flags that top-lock can set on each of its sub-locks.
+ */
+enum lov_sub_flags {
+	/** The top-lock has acquired a hold (cl_lock_hold()) on this sub-lock. */
+	LSF_HELD = 0x1
+};
+
+/**
+ * State lov_lock keeps for each sub-lock.
+ */
+struct lov_lock_sub {
+	/** sub-lock itself */
+	struct lovsub_lock  *sub_lock;
+	/** Bitmask of per-sub-lock flags, taken from enum lov_sub_flags */
+	unsigned	     sub_flags;
+	int		  sub_stripe;
+	struct cl_lock_descr sub_descr;
+	struct cl_lock_descr sub_got;
+};
+
+/**
+ * lov-specific lock state.
+ */
+struct lov_lock {
+	struct cl_lock_slice   lls_cl;	/* embedded; see cl2lov_lock() */
+	/** Number of sub-locks in this lock */
+	int		    lls_nr;
+	/**
+	 * Number of existing sub-locks.
+	 */
+	unsigned	       lls_nr_filled;
+	/**
+	 * Set when sub-lock was canceled, while top-lock was being
+	 * used, or unused.
+	 */
+	unsigned int	       lls_cancel_race:1;
+	/**
+	 * An array of sub-locks
+	 *
+	 * There are two issues with managing sub-locks:
+	 *
+	 *     - sub-locks are concurrently canceled, and
+	 *
+	 *     - sub-locks are shared with other top-locks.
+	 *
+	 * To manage cancellation, top-lock acquires a hold on a sublock
+	 * (lov_sublock_adopt()) when the latter is inserted into
+	 * lov_lock::lls_sub[]. This hold is released (lov_sublock_release())
+	 * when top-lock is going into CLS_CACHED state or destroyed. Hold
+	 * prevents sub-lock from cancellation.
+	 *
+	 * Sub-lock sharing means, among other things, that top-lock that is
+	 * in the process of creation (i.e., not yet inserted into lock list)
+	 * is already accessible to other threads once at least one of its
+	 * sub-locks is created, see lov_lock_sub_init().
+	 *
+	 * Sub-lock can be in one of the following states:
+	 *
+	 *     - doesn't exist, lov_lock::lls_sub[]::sub_lock == NULL. Such
+	 *       sub-lock was either never created (top-lock is in CLS_NEW
+	 *       state), or it was created, then canceled, then destroyed
+	 *       (lov_lock_unlink() cleared sub-lock pointer in the top-lock).
+	 *
+	 *     - sub-lock exists and is on hold
+	 *       (lov_lock::lls_sub[]::sub_flags & LSF_HELD). This is a
+	 *       normal state of a sub-lock in CLS_HELD and CLS_CACHED states
+	 *       of a top-lock.
+	 *
+	 *     - sub-lock exists, but is not held by the top-lock. This
+	 *       happens after top-lock released a hold on sub-locks before
+	 *       going into cache (lov_lock_unuse()).
+	 *
+	 * \todo To support wide-striping, array has to be replaced with a set
+	 * of queues to avoid scanning.
+	 */
+	struct lov_lock_sub   *lls_sub;
+	/**
+	 * Original description with which lock was enqueued.
+	 */
+	struct cl_lock_descr   lls_orig;
+};
+
+struct lov_page {
+	struct cl_page_slice lps_cl;	/* embedded; see cl2lov_page() */
+	int		  lps_invalid;	/* NOTE(review): meaning not documented here */
+};
+
+/*
+ * Bottom half.
+ */
+
+struct lovsub_device {
+	struct cl_device   acid_cl;	/* embedded; see cl2lovsub_dev() */
+	struct lov_device *acid_super;	/* owning lov device, set in lov_device_init() */
+	int		acid_idx;	/* slot in lov_device::ld_target[] */
+	struct cl_device  *acid_next;
+};
+
+struct lovsub_object {
+	struct cl_object_header lso_header;
+	struct cl_object	lso_cl;	/* embedded; see cl2lovsub() and lu2lovsub() */
+	struct lov_object      *lso_super;	/* NOTE(review): presumably the top-object -- confirm */
+	int		     lso_index;
+};
+
+/**
+ * A link between a top-lock and a sub-lock. Separate data-structure is
+ * necessary, because top-locks and sub-locks are in M:N relationship.
+ *
+ * \todo This can be optimized for a (by far) most frequent case of a single
+ * top-lock per sub-lock.
+ */
+struct lov_lock_link {
+	struct lov_lock *lll_super;	/* NOTE(review): presumably the top-lock end of the link */
+	/** An index within parent lock. */
+	int	      lll_idx;
+	/**
+	 * A linkage into the per sub-lock list of all corresponding top-locks,
+	 * hanging off lovsub_lock::lss_parents.
+	 */
+	struct list_head       lll_list;
+};
+
+/**
+ * Lock state at lovsub layer.
+ */
+struct lovsub_lock {
+	struct cl_lock_slice  lss_cl;	/* embedded; see cl2lovsub_lock() */
+	/**
+	 * List of top-locks that have given sub-lock as their part. Protected
+	 * by cl_lock::cll_guard mutex.
+	 */
+	struct list_head	    lss_parents;
+	/**
+	 * Top-lock that initiated current operation on this sub-lock. This is
+	 * only set during top-to-bottom lock operations like enqueue, and is
+	 * used to optimize state change notification. Protected by
+	 * cl_lock::cll_guard mutex.
+	 *
+	 * \see lovsub_lock_state_one().
+	 */
+	struct cl_lock       *lss_active;
+};
+
+/**
+ * Describe the environment settings for sublocks.
+ */
+struct lov_sublock_env {
+	const struct lu_env *lse_env;	/* NOTE(review): fields are undocumented here; */
+	struct cl_io	*lse_io;	/* presumably the env, io and sub-io used for */
+	struct lov_io_sub   *lse_sub;	/* sub-lock operations -- confirm with callers */
+};
+
+struct lovsub_page {
+	struct cl_page_slice lsb_cl;	/* embedded; see cl2lovsub_page() */
+};
+
+
+struct lov_thread_info {	/* allocated by lov_key_init(), found via lov_env_info() */
+	struct cl_object_conf   lti_stripe_conf;
+	struct lu_fid	   lti_fid;
+	struct cl_lock_descr    lti_ldescr;
+	struct ost_lvb	  lti_lvb;
+	struct cl_2queue	lti_cl2q;
+	struct cl_lock_closure  lti_closure;	/* clc_list is initialized empty in lov_key_init() */
+	wait_queue_t	  lti_waiter;
+};
+
+/**
+ * State that lov_io maintains for every sub-io.
+ */
+struct lov_io_sub {
+	int		  sub_stripe;
+	/**
+	 * sub-io for a stripe. Ideally sub-io's can be stopped and resumed
+	 * independently, with lov acting as a scheduler to maximize overall
+	 * throughput.
+	 */
+	struct cl_io	*sub_io;
+	/**
+	 * Linkage into a list (hanging off lov_io::lis_active) of all
+	 * sub-io's active for the current IO iteration.
+	 */
+	struct list_head	   sub_linkage;
+	/**
+	 * True iff cl_io_init() was successfully executed against
+	 * lov_io_sub::sub_io.
+	 */
+	int		  sub_io_initialized;
+	/**
+	 * True iff lov_io_sub::sub_io and lov_io_sub::sub_env weren't
+	 * allocated, but borrowed from a per-device emergency pool.
+	 */
+	int		  sub_borrowed;
+	/**
+	 * Environment in which the sub-io executes.
+	 */
+	struct lu_env *sub_env;
+	/**
+	 * Refcheck of lov_io_sub::sub_env.
+	 *
+	 * \see cl_env_get()
+	 */
+	int		  sub_refcheck;
+	int		  sub_refcheck2;	/* NOTE(review): the three fields from */
+	int		  sub_reenter;	/* here on are not documented in this */
+	void		*sub_cookie;	/* header -- confirm with their users */
+};
+
+/**
+ * IO state private for LOV.
+ */
+struct lov_io {
+	/** super-class */
+	struct cl_io_slice lis_cl;
+	/**
+	 * Pointer to the object slice. This is a duplicate of
+	 * lov_io::lis_cl::cis_object.
+	 */
+	struct lov_object *lis_object;
+	/**
+	 * Original end-of-io position for this IO, set by the upper layer as
+	 * cl_io::u::ci_rw::pos + cl_io::u::ci_rw::count. lov remembers this,
+	 * changes pos and count to fit IO into a single stripe and uses saved
+	 * value to determine when IO iterations have to stop.
+	 *
+	 * This is used only for CIT_READ and CIT_WRITE io's.
+	 */
+	loff_t	     lis_io_endpos;
+
+	/**
+	 * starting position within a file, for the current io loop iteration
+	 * (stripe), used by ci_io_loop().
+	 */
+	obd_off	    lis_pos;
+	/**
+	 * end position within a file, for the current stripe io. This is
+	 * exclusive (i.e., next offset after last byte affected by io).
+	 */
+	obd_off	    lis_endpos;
+
+	int		lis_mem_frozen;
+	int		lis_stripe_count;
+	int		lis_active_subios;
+
+	/**
+	 * the index of lis_single_subio in the lis_subs array
+	 */
+	int		lis_single_subio_index;
+	struct cl_io       lis_single_subio;
+
+	/**
+	 * size of the lis_subs array, actually the highest stripe #
+	 */
+	int		lis_nr_subios;
+	struct lov_io_sub *lis_subs;
+	/**
+	 * List of active sub-io's.
+	 */
+	struct list_head	 lis_active;
+};
+
+struct lov_session {	/* allocated by lov_session_key_init() */
+	struct lov_io	  ls_io;	/* what lov_env_io() returns */
+	struct lov_sublock_env ls_subenv;
+};
+
+/**
+ * State of transfer for lov.
+ */
+struct lov_req {
+	struct cl_req_slice lr_cl;	/* embedded; see cl2lov_req() */
+};
+
+/**
+ * State of transfer for lovsub.
+ */
+struct lovsub_req {
+	struct cl_req_slice lsrq_cl;	/* embedded; see cl2lovsub_req() */
+};
+
+extern struct lu_device_type lov_device_type;
+extern struct lu_device_type lovsub_device_type;
+
+extern struct lu_context_key lov_key;		/* tagged LCT_CL_THREAD in lov_dev.c */
+extern struct lu_context_key lov_session_key;	/* tagged LCT_SESSION in lov_dev.c */
+
+extern struct kmem_cache *lov_lock_kmem;	/* caches below: see lov_caches[] */
+extern struct kmem_cache *lov_object_kmem;
+extern struct kmem_cache *lov_thread_kmem;
+extern struct kmem_cache *lov_session_kmem;
+extern struct kmem_cache *lov_req_kmem;
+
+extern struct kmem_cache *lovsub_lock_kmem;
+extern struct kmem_cache *lovsub_object_kmem;
+extern struct kmem_cache *lovsub_req_kmem;
+
+extern struct kmem_cache *lov_lock_link_kmem;
+
+int   lov_object_init     (const struct lu_env *env, struct lu_object *obj,
+			   const struct lu_object_conf *conf);
+int   lovsub_object_init  (const struct lu_env *env, struct lu_object *obj,
+			   const struct lu_object_conf *conf);
+int   lov_lock_init       (const struct lu_env *env, struct cl_object *obj,
+			   struct cl_lock *lock, const struct cl_io *io);
+int   lov_io_init	 (const struct lu_env *env, struct cl_object *obj,
+			   struct cl_io *io);
+int   lovsub_lock_init    (const struct lu_env *env, struct cl_object *obj,
+			   struct cl_lock *lock, const struct cl_io *io);
+
+int   lov_lock_init_raid0 (const struct lu_env *env, struct cl_object *obj,
+			   struct cl_lock *lock, const struct cl_io *io);
+int   lov_lock_init_empty (const struct lu_env *env, struct cl_object *obj,
+			   struct cl_lock *lock, const struct cl_io *io);
+int   lov_io_init_raid0   (const struct lu_env *env, struct cl_object *obj,
+			   struct cl_io *io);
+int   lov_io_init_empty   (const struct lu_env *env, struct cl_object *obj,
+			   struct cl_io *io);
+void  lov_lock_unlink     (const struct lu_env *env, struct lov_lock_link *link,
+			   struct lovsub_lock *sub);
+
+struct lov_io_sub *lov_sub_get(const struct lu_env *env, struct lov_io *lio,
+			       int stripe);
+void  lov_sub_put	     (struct lov_io_sub *sub);
+int   lov_sublock_modify  (const struct lu_env *env, struct lov_lock *lov,
+			   struct lovsub_lock *sublock,
+			   const struct cl_lock_descr *d, int idx);
+
+
+int   lov_page_init       (const struct lu_env *env, struct cl_object *ob,
+			   struct cl_page *page, struct page *vmpage);
+int   lovsub_page_init    (const struct lu_env *env, struct cl_object *ob,
+			   struct cl_page *page, struct page *vmpage);
+
+int   lov_page_init_empty (const struct lu_env *env,
+			   struct cl_object *obj,
+			   struct cl_page *page, struct page *vmpage);
+int   lov_page_init_raid0 (const struct lu_env *env,
+			   struct cl_object *obj,
+			   struct cl_page *page, struct page *vmpage);
+struct lu_object *lov_object_alloc   (const struct lu_env *env,
+				      const struct lu_object_header *hdr,
+				      struct lu_device *dev);
+struct lu_object *lovsub_object_alloc(const struct lu_env *env,
+				      const struct lu_object_header *hdr,
+				      struct lu_device *dev);
+
+struct lov_lock_link *lov_lock_link_find(const struct lu_env *env,
+					 struct lov_lock *lck,
+					 struct lovsub_lock *sub);
+struct lov_io_sub    *lov_page_subio    (const struct lu_env *env,
+					 struct lov_io *lio,
+					 const struct cl_page_slice *slice);
+
+void lov_lsm_decref(struct lov_object *lov, struct lov_stripe_md *lsm);
+struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov);
+
+#define lov_foreach_target(lov, var) /* var walks 0..lov_targets_nr(lov)-1 */ \
+	for ((var) = 0; (var) < lov_targets_nr(lov); ++(var))
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ * Accessors.
+ *
+ */
+
+static inline struct lov_session *lov_env_session(const struct lu_env *env)
+{
+	/* The lov_session stored in env->le_ses under lov_session_key. */
+	struct lov_session *ses = lu_context_key_get(env->le_ses,
+						     &lov_session_key);
+	LASSERT(ses != NULL);
+	return ses;
+}
+
+static inline struct lov_io *lov_env_io(const struct lu_env *env)
+{
+	return &(lov_env_session(env)->ls_io);	/* io embedded in the session */
+}
+
+static inline int lov_is_object(const struct lu_object *obj)
+{
+	return &lov_device_type == obj->lo_dev->ld_type; /* on a lov device? */
+}
+
+static inline int lovsub_is_object(const struct lu_object *obj)
+{
+	return &lovsub_device_type == obj->lo_dev->ld_type; /* on a lovsub device? */
+}
+
+static inline struct lu_device *lov2lu_dev(struct lov_device *lov)
+{
+	return &(lov->ld_cl.cd_lu_dev);	/* lu_device embedded in ld_cl */
+}
+
+static inline struct lov_device *lu2lov_dev(const struct lu_device *d)
+{
+	LINVRNT(d->ld_type == &lov_device_type); /* d must be a lov device */
+	return container_of0(d, struct lov_device, ld_cl.cd_lu_dev);
+}
+
+static inline struct cl_device *lovsub2cl_dev(struct lovsub_device *lovsub)
+{
+	return &(lovsub->acid_cl);	/* the embedded cl_device */
+}
+
+static inline struct lu_device *lovsub2lu_dev(struct lovsub_device *lovsub)
+{
+	return &lovsub->acid_cl.cd_lu_dev;	/* lu_device inside acid_cl */
+}
+
+static inline struct lovsub_device *lu2lovsub_dev(const struct lu_device *d)
+{
+	LINVRNT(d->ld_type == &lovsub_device_type); /* d must be a lovsub device */
+	return container_of0(d, struct lovsub_device, acid_cl.cd_lu_dev);
+}
+
+static inline struct lovsub_device *cl2lovsub_dev(const struct cl_device *d)
+{
+	LINVRNT(d->cd_lu_dev.ld_type == &lovsub_device_type); /* lovsub only */
+	return container_of0(d, struct lovsub_device, acid_cl);
+}
+
+static inline struct lu_object *lov2lu(struct lov_object *lov)
+{
+	return &(lov->lo_cl.co_lu);	/* lu_object inside the embedded lo_cl */
+}
+
+static inline struct cl_object *lov2cl(struct lov_object *lov)
+{
+	return &(lov->lo_cl);	/* the embedded cl_object */
+}
+
+static inline struct lov_object *lu2lov(const struct lu_object *obj)
+{
+	LINVRNT(lov_is_object(obj));	/* obj must sit on a lov device */
+	return container_of0(obj, struct lov_object, lo_cl.co_lu);
+}
+
+static inline struct lov_object *cl2lov(const struct cl_object *obj)
+{
+	LINVRNT(lov_is_object(&obj->co_lu));	/* obj must sit on a lov device */
+	return container_of0(obj, struct lov_object, lo_cl);
+}
+
+static inline struct lu_object *lovsub2lu(struct lovsub_object *los)
+{
+	return &(los->lso_cl.co_lu);	/* lu_object inside the embedded lso_cl */
+}
+
+static inline struct cl_object *lovsub2cl(struct lovsub_object *los)
+{
+	return &(los->lso_cl);	/* the embedded cl_object */
+}
+
+static inline struct lovsub_object *cl2lovsub(const struct cl_object *obj)
+{
+	LINVRNT(lovsub_is_object(&obj->co_lu));	/* lovsub objects only */
+	return container_of0(obj, struct lovsub_object, lso_cl);
+}
+
+static inline struct lovsub_object *lu2lovsub(const struct lu_object *obj)
+{
+	LINVRNT(lovsub_is_object(obj));	/* lovsub objects only */
+	return container_of0(obj, struct lovsub_object, lso_cl.co_lu);
+}
+
+static inline struct lovsub_lock *
+cl2lovsub_lock(const struct cl_lock_slice *slice)
+{
+	LINVRNT(lovsub_is_object(&slice->cls_obj->co_lu)); /* lovsub slices only */
+	return container_of(slice, struct lovsub_lock, lss_cl);
+}
+
+static inline struct lovsub_lock *cl2sub_lock(const struct cl_lock *lock)
+{
+	/* Slice that cl_lock_at() returns for lovsub_device_type. */
+	const struct cl_lock_slice *slice = cl_lock_at(lock,
+						       &lovsub_device_type);
+	LASSERT(slice != NULL);
+	return cl2lovsub_lock(slice);
+}
+
+static inline struct lov_lock *cl2lov_lock(const struct cl_lock_slice *slice)
+{
+	LINVRNT(lov_is_object(&slice->cls_obj->co_lu));	/* lov slices only */
+	return container_of(slice, struct lov_lock, lls_cl);
+}
+
+static inline struct lov_page *cl2lov_page(const struct cl_page_slice *slice)
+{
+	LINVRNT(lov_is_object(&slice->cpl_obj->co_lu));	/* lov slices only */
+	return container_of0(slice, struct lov_page, lps_cl);
+}
+
+static inline struct lov_req *cl2lov_req(const struct cl_req_slice *slice)
+{
+	return container_of0(slice, struct lov_req, lr_cl); /* no type check here */
+}
+
+static inline struct lovsub_page *
+cl2lovsub_page(const struct cl_page_slice *slice)
+{
+	LINVRNT(lovsub_is_object(&slice->cpl_obj->co_lu)); /* lovsub slices only */
+	return container_of0(slice, struct lovsub_page, lsb_cl);
+}
+
+static inline struct lovsub_req *cl2lovsub_req(const struct cl_req_slice *slice)
+{
+	return container_of0(slice, struct lovsub_req, lsrq_cl); /* no type check here */
+}
+
+static inline struct cl_page *lov_sub_page(const struct cl_page_slice *slice)
+{
+	return slice->cpl_page->cp_child;	/* child of the slice's own page */
+}
+
+static inline struct lov_io *cl2lov_io(const struct lu_env *env,
+				       const struct cl_io_slice *ios)
+{
+	struct lov_io *lio = container_of(ios, struct lov_io, lis_cl);
+
+	/* The slice must belong to the lov_io kept in env's session. */
+	LASSERT(lio == lov_env_io(env));
+	return lio;
+}
+
+static inline int lov_targets_nr(const struct lov_device *lov)
+{
+	return (int)lov->ld_lov->desc.ld_tgt_count;	/* count from the lov_obd */
+}
+
+static inline struct lov_thread_info *lov_env_info(const struct lu_env *env)
+{
+	/* The lov_thread_info stored in env->le_ctx under lov_key. */
+	struct lov_thread_info *info = lu_context_key_get(&env->le_ctx,
+							  &lov_key);
+	LASSERT(info != NULL);
+	return info;
+}
+
+static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov)
+{
+	LASSERT(lov->lo_type == LLT_RAID0);	/* raid0 state is only valid here */
+	LASSERT(lov->lo_lsm->lsm_wire.lw_magic == LOV_MAGIC ||
+		lov->lo_lsm->lsm_wire.lw_magic == LOV_MAGIC_V3); /* dereferences lo_lsm */
+	return &lov->u.raid0;
+}
+
+/** @} lov */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/lov/lov_dev.c b/drivers/staging/lustre/lustre/lov/lov_dev.c
new file mode 100644
index 000000000000..f94f8d9d33d7
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_dev.c
@@ -0,0 +1,533 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_device and cl_device_type for LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+/* class_name2obd() */
+#include <obd_class.h>
+
+#include "lov_cl_internal.h"
+
+struct kmem_cache *lov_lock_kmem;	/* struct lov_lock */
+struct kmem_cache *lov_object_kmem;	/* struct lov_object */
+struct kmem_cache *lov_thread_kmem;	/* struct lov_thread_info */
+struct kmem_cache *lov_session_kmem;	/* struct lov_session */
+struct kmem_cache *lov_req_kmem;	/* struct lov_req */
+
+struct kmem_cache *lovsub_lock_kmem;	/* struct lovsub_lock */
+struct kmem_cache *lovsub_object_kmem;	/* struct lovsub_object */
+struct kmem_cache *lovsub_req_kmem;	/* struct lovsub_req */
+
+struct kmem_cache *lov_lock_link_kmem;	/* struct lov_lock_link */
+
+/** Lock class of lov_device::ld_mutex. */
+struct lock_class_key cl_lov_device_mutex_class;
+
+struct lu_kmem_descr lov_caches[] = {	/* one cache per lov/lovsub structure */
+	{
+		.ckd_name  = "lov_lock_kmem",
+		.ckd_cache = &lov_lock_kmem,
+		.ckd_size  = sizeof(struct lov_lock),
+	},
+	{
+		.ckd_name  = "lov_object_kmem",
+		.ckd_cache = &lov_object_kmem,
+		.ckd_size  = sizeof(struct lov_object),
+	},
+	{
+		.ckd_name  = "lov_thread_kmem",
+		.ckd_cache = &lov_thread_kmem,
+		.ckd_size  = sizeof(struct lov_thread_info),
+	},
+	{
+		.ckd_name  = "lov_session_kmem",
+		.ckd_cache = &lov_session_kmem,
+		.ckd_size  = sizeof(struct lov_session),
+	},
+	{
+		.ckd_name  = "lov_req_kmem",
+		.ckd_cache = &lov_req_kmem,
+		.ckd_size  = sizeof(struct lov_req),
+	},
+	{
+		.ckd_name  = "lovsub_lock_kmem",
+		.ckd_cache = &lovsub_lock_kmem,
+		.ckd_size  = sizeof(struct lovsub_lock),
+	},
+	{
+		.ckd_name  = "lovsub_object_kmem",
+		.ckd_cache = &lovsub_object_kmem,
+		.ckd_size  = sizeof(struct lovsub_object),
+	},
+	{
+		.ckd_name  = "lovsub_req_kmem",
+		.ckd_cache = &lovsub_req_kmem,
+		.ckd_size  = sizeof(struct lovsub_req),
+	},
+	{
+		.ckd_name  = "lov_lock_link_kmem",
+		.ckd_cache = &lov_lock_link_kmem,
+		.ckd_size  = sizeof(struct lov_lock_link),
+	},
+	{
+		.ckd_cache = NULL,	/* last entry: no cache */
+	}
+};
+
+/*****************************************************************************
+ *
+ * Lov transfer operations.
+ *
+ */
+
+static void lov_req_completion(const struct lu_env *env,
+			       const struct cl_req_slice *slice, int ioret)
+{
+	struct lov_req *lr = cl2lov_req(slice);
+
+	ENTRY;
+	/* Give the lov slice of this request back to lov_req_kmem. */
+	OBD_SLAB_FREE_PTR(lr, lov_req_kmem);
+	EXIT;
+}
+
+static const struct cl_req_operations lov_req_ops = {
+	.cro_completion = lov_req_completion,	/* frees the struct lov_req */
+};
+
+/*****************************************************************************
+ *
+ * Lov device and device type functions.
+ *
+ */
+
+static void *lov_key_init(const struct lu_context *ctx,
+			  struct lu_context_key *key)
+{
+	struct lov_thread_info *info;
+
+	OBD_SLAB_ALLOC_PTR_GFP(info, lov_thread_kmem, __GFP_IO);
+	if (info == NULL)
+		return ERR_PTR(-ENOMEM);
+	/* Start with an empty closure list; lov_key_fini() checks for that. */
+	INIT_LIST_HEAD(&info->lti_closure.clc_list);
+	return info;
+}
+
+static void lov_key_fini(const struct lu_context *ctx,
+			 struct lu_context_key *key, void *data)
+{
+	struct lov_thread_info *info = data; /* presumably from lov_key_init() */
+	LINVRNT(list_empty(&info->lti_closure.clc_list));
+	OBD_SLAB_FREE_PTR(info, lov_thread_kmem);	/* back to its slab cache */
+}
+
+struct lu_context_key lov_key = {
+	.lct_init = lov_key_init,	/* allocates a struct lov_thread_info */
+	.lct_fini = lov_key_fini,	/* frees it again */
+	.lct_tags = LCT_CL_THREAD,
+};
+
+static void *lov_session_key_init(const struct lu_context *ctx,
+				  struct lu_context_key *key)
+{
+	struct lov_session *info;
+
+	OBD_SLAB_ALLOC_PTR_GFP(info, lov_session_kmem, __GFP_IO);
+
+	/* Allocation failure is reported as ERR_PTR(-ENOMEM), never NULL. */
+	return info != NULL ? (void *)info : ERR_PTR(-ENOMEM);
+}
+
+static void lov_session_key_fini(const struct lu_context *ctx,
+				 struct lu_context_key *key, void *data)
+{
+	struct lov_session *info = data; /* presumably from lov_session_key_init() */
+	OBD_SLAB_FREE_PTR(info, lov_session_kmem);	/* back to its slab cache */
+}
+
+struct lu_context_key lov_session_key = {
+	.lct_init = lov_session_key_init,	/* allocates a struct lov_session */
+	.lct_fini = lov_session_key_fini,	/* frees it again */
+	.lct_tags = LCT_SESSION,
+};
+
+/* lov_type_{init,fini,start,stop}(): type ctor/dtor, given both context keys */
+LU_TYPE_INIT_FINI(lov, &lov_key, &lov_session_key);
+
+static struct lu_device *lov_device_fini(const struct lu_env *env,
+					 struct lu_device *d)
+{
+	struct lov_device *ld = lu2lov_dev(d);
+	int idx;
+
+	LASSERT(ld->ld_lov != NULL);
+	if (ld->ld_target == NULL)
+		RETURN(NULL);
+
+	/* Finalize every sub-device stack that is set, then clear its slot. */
+	lov_foreach_target(ld, idx) {
+		struct lovsub_device *sub = ld->ld_target[idx];
+
+		if (sub == NULL)
+			continue;
+		cl_stack_fini(env, lovsub2cl_dev(sub));
+		ld->ld_target[idx] = NULL;
+	}
+	RETURN(NULL);
+}
+
+static int lov_device_init(const struct lu_env *env, struct lu_device *d,
+			   const char *name, struct lu_device *next)
+{
+	struct lov_device *ld = lu2lov_dev(d);
+	int rc = 0;
+	int idx;
+
+	LASSERT(d->ld_site != NULL);
+	if (ld->ld_target == NULL)
+		RETURN(rc);
+
+	/* Set up a lovsub device over every non-NULL target descriptor. */
+	lov_foreach_target(ld, idx) {
+		struct lov_tgt_desc *tgt = ld->ld_lov->lov_tgts[idx];
+		struct lovsub_device *sub;
+		struct cl_device *subdev;
+
+		if (tgt == NULL)
+			continue;
+
+		subdev = cl_type_setup(env, d->ld_site, &lovsub_device_type,
+				       tgt->ltd_obd->obd_lu_dev);
+		if (IS_ERR(subdev)) {
+			rc = PTR_ERR(subdev);
+			break;
+		}
+		sub = cl2lovsub_dev(subdev);
+		sub->acid_super = ld;
+		sub->acid_idx = idx;
+		ld->ld_target[idx] = sub;
+	}
+
+	/* Undo partial setup on failure; otherwise mark the device ready. */
+	if (rc == 0)
+		ld->ld_flags |= LOV_DEV_INITIALIZED;
+	else
+		lov_device_fini(env, d);
+	RETURN(rc);
+}
+
+static int lov_req_init(const struct lu_env *env, struct cl_device *dev,
+			struct cl_req *req)
+{
+	struct lov_req *lr;
+
+	ENTRY;
+	OBD_SLAB_ALLOC_PTR_GFP(lr, lov_req_kmem, __GFP_IO);
+	if (lr == NULL)
+		RETURN(-ENOMEM);
+
+	/* Attach the lov slice; lov_req_completion() frees it later. */
+	cl_req_slice_add(req, &lr->lr_cl, dev, &lov_req_ops);
+
+	RETURN(0);
+}
+
+/* cl_device method table for LOV; only request initialization is provided. */
+static const struct cl_device_operations lov_cl_ops = {
+	.cdo_req_init = lov_req_init
+};
+
+/*
+ * Release an array of \a nr emergency descriptors as built by
+ * lov_emerg_alloc(), including partially built arrays: NULL slots and
+ * descriptors without an environment are tolerated.
+ *
+ * Each descriptor's page list must already be empty.
+ */
+static void lov_emerg_free(struct lov_device_emerg **emrg, int nr)
+{
+	int slot;
+
+	for (slot = 0; slot < nr; slot++) {
+		struct lov_device_emerg *em = emrg[slot];
+
+		if (em == NULL)
+			continue;
+
+		LASSERT(em->emrg_page_list.pl_nr == 0);
+		if (em->emrg_env != NULL)
+			cl_env_put(em->emrg_env, &em->emrg_refcheck);
+		OBD_FREE_PTR(em);
+	}
+	OBD_FREE(emrg, nr * sizeof(emrg[0]));
+}
+
+/*
+ * Final destructor of a LOV device: finalize the embedded cl_device, release
+ * the target and emergency arrays if they were allocated, then free the
+ * device itself.
+ *
+ * \retval NULL always
+ */
+static struct lu_device *lov_device_free(const struct lu_env *env,
+					 struct lu_device *d)
+{
+	struct lov_device *ld = lu2lov_dev(d);
+	/* Both per-target arrays hold ld_target_nr entries. */
+	const int nr = ld->ld_target_nr;
+
+	cl_device_fini(lu2cl_dev(d));
+
+	if (ld->ld_target != NULL)
+		OBD_FREE(ld->ld_target, nr * sizeof(ld->ld_target[0]));
+
+	if (ld->ld_emrg != NULL)
+		lov_emerg_free(ld->ld_emrg, nr);
+
+	OBD_FREE_PTR(ld);
+	return NULL;
+}
+
+/*
+ * Detach the sub-device stack stored at \a index, if any, and clear the slot.
+ * An empty slot is not an error.
+ *
+ * NOTE(review): \a index is not range-checked here and ld_target is assumed
+ * to be allocated -- confirm that callers guarantee both.
+ */
+static void lov_cl_del_target(const struct lu_env *env, struct lu_device *dev,
+			      __u32 index)
+{
+	struct lov_device *ld = lu2lov_dev(dev);
+	struct lovsub_device *sub;
+	ENTRY;
+
+	sub = ld->ld_target[index];
+	if (sub != NULL) {
+		cl_stack_fini(env, lovsub2cl_dev(sub));
+		ld->ld_target[index] = NULL;
+	}
+	EXIT;
+}
+
+/*
+ * Allocate an array of \a nr emergency descriptors, each with an initialized
+ * page list and its own environment obtained from cl_env_alloc() with
+ * LCT_REMEMBER|LCT_NOREF.  Every environment's context cookie is set to 0x2
+ * (the meaning of that value is not visible in this file).
+ *
+ * On any failure the partially built array is released again.
+ *
+ * \retval array pointer  success
+ * \retval ERR_PTR(-ENOMEM) or ERR_PTR(error from cl_env_alloc())  failure
+ */
+static struct lov_device_emerg **lov_emerg_alloc(int nr)
+{
+	struct lov_device_emerg **emerg;
+	int result = 0;
+	int i;
+
+	OBD_ALLOC(emerg, nr * sizeof(emerg[0]));
+	if (emerg == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = 0; i < nr; i++) {
+		struct lov_device_emerg *em;
+
+		OBD_ALLOC_PTR(em);
+		if (em == NULL) {
+			result = -ENOMEM;
+			break;
+		}
+
+		/* Publish the slot first so the error path can free it. */
+		emerg[i] = em;
+		cl_page_list_init(&em->emrg_page_list);
+		em->emrg_env = cl_env_alloc(&em->emrg_refcheck,
+					    LCT_REMEMBER|LCT_NOREF);
+		if (IS_ERR(em->emrg_env)) {
+			result = PTR_ERR(em->emrg_env);
+			/* lov_emerg_free() must not see an ERR_PTR here. */
+			em->emrg_env = NULL;
+			break;
+		}
+		em->emrg_env->le_ctx.lc_cookie = 0x2;
+	}
+
+	if (result != 0) {
+		lov_emerg_free(emerg, nr);
+		return ERR_PTR(result);
+	}
+	return emerg;
+}
+
+/*
+ * Grow the device's per-target arrays (ld_target and ld_emrg) so that they
+ * cover lov_tgt_size entries.  Nothing happens when they are already large
+ * enough.
+ *
+ * Both new arrays are allocated before ld_mutex is taken; existing ld_target
+ * entries are copied over, while the emergency array is replaced wholesale.
+ *
+ * \retval 0        arrays are large enough
+ * \retval -ENOMEM  or the error reported by lov_emerg_alloc()
+ */
+static int lov_expand_targets(const struct lu_env *env, struct lov_device *dev)
+{
+	struct lovsub_device    **newd;
+	struct lov_device_emerg **emerg;
+	const size_t sz = sizeof(newd[0]);
+	__u32 tgt_size;
+	__u32 sub_size;
+
+	ENTRY;
+	tgt_size = dev->ld_lov->lov_tgt_size;
+	sub_size = dev->ld_target_nr;
+	if (sub_size >= tgt_size)
+		RETURN(0);
+
+	emerg = lov_emerg_alloc(tgt_size);
+	if (IS_ERR(emerg))
+		RETURN(PTR_ERR(emerg));
+
+	OBD_ALLOC(newd, tgt_size * sz);
+	if (newd == NULL) {
+		lov_emerg_free(emerg, tgt_size);
+		RETURN(-ENOMEM);
+	}
+
+	mutex_lock(&dev->ld_mutex);
+	if (sub_size > 0) {
+		memcpy(newd, dev->ld_target, sub_size * sz);
+		OBD_FREE(dev->ld_target, sub_size * sz);
+	}
+	dev->ld_target    = newd;
+	dev->ld_target_nr = tgt_size;
+
+	/* The old emergency array still has the old length. */
+	if (dev->ld_emrg != NULL)
+		lov_emerg_free(dev->ld_emrg, sub_size);
+	dev->ld_emrg = emerg;
+	mutex_unlock(&dev->ld_mutex);
+
+	RETURN(0);
+}
+
+/*
+ * Attach the target at \a index to the LOV cl device.
+ *
+ * Grows the per-target arrays if needed and, when the device has already been
+ * initialized (LOV_DEV_INITIALIZED), sets up a lovsub device stack on top of
+ * the target's lu_device and records it in ld_target[index].
+ *
+ * The reference taken with obd_getref() is dropped on every exit path,
+ * including the "target not set up" one.
+ *
+ * \param env    execution environment
+ * \param dev    LOV device, as a generic lu_device
+ * \param index  slot of the target in the LOV target array
+ *
+ * \retval 0         success (nothing is set up if the device is not yet
+ *                   initialized)
+ * \retval -EINVAL   the target obd is not set up
+ * \retval negative  error from lov_expand_targets() or cl_type_setup()
+ */
+static int lov_cl_add_target(const struct lu_env *env, struct lu_device *dev,
+			     __u32 index)
+{
+	struct obd_device    *obd = dev->ld_obd;
+	struct lov_device    *ld  = lu2lov_dev(dev);
+	struct lov_tgt_desc  *tgt;
+	struct lovsub_device *lsd;
+	struct cl_device     *cl;
+	int rc;
+	ENTRY;
+
+	obd_getref(obd);
+
+	tgt = obd->u.lov.lov_tgts[index];
+	LASSERT(tgt != NULL);
+	LASSERT(tgt->ltd_obd != NULL);
+
+	if (!tgt->ltd_obd->obd_set_up) {
+		CERROR("Target %s not set up\n", obd_uuid2str(&tgt->ltd_uuid));
+		/* Do not return directly: the obd reference is still held. */
+		rc = -EINVAL;
+		goto out;
+	}
+
+	rc = lov_expand_targets(env, ld);
+	if (rc == 0 && (ld->ld_flags & LOV_DEV_INITIALIZED)) {
+		LASSERT(dev->ld_site != NULL);
+
+		cl = cl_type_setup(env, dev->ld_site, &lovsub_device_type,
+				   tgt->ltd_obd->obd_lu_dev);
+		if (!IS_ERR(cl)) {
+			lsd = cl2lovsub_dev(cl);
+			lsd->acid_idx = index;
+			lsd->acid_super = ld;
+			ld->ld_target[index] = lsd;
+		} else {
+			/* Fetch the error first so the message reports it. */
+			rc = PTR_ERR(cl);
+			CERROR("add failed (%d), deleting %s\n", rc,
+			       obd_uuid2str(&tgt->ltd_uuid));
+			lov_cl_del_target(env, dev, index);
+		}
+	}
+out:
+	obd_putref(obd);
+	RETURN(rc);
+}
+
+/*
+ * lu_device_operations::ldo_process_config() for LOV.
+ *
+ * The generic part of the command is handled by lov_process_config_base(),
+ * which also yields the affected target index.  On success the cl-layer
+ * bookkeeping is updated: targets are attached for LCFG_LOV_ADD_OBD and
+ * LCFG_LOV_ADD_INA and detached for LCFG_LOV_DEL_OBD.  If attaching fails,
+ * the target is removed again with lov_del_target().  Other commands need
+ * no cl-layer work.
+ *
+ * \retval 0         success
+ * \retval negative  error from lov_process_config_base() or
+ *                   lov_cl_add_target()
+ */
+static int lov_process_config(const struct lu_env *env,
+			      struct lu_device *d, struct lustre_cfg *cfg)
+{
+	struct obd_device *obd = d->ld_obd;
+	__u32 index;
+	int cmd;
+	int gen;
+	int rc;
+
+	obd_getref(obd);
+
+	cmd = cfg->lcfg_command;
+	rc = lov_process_config_base(d->ld_obd, cfg, &index, &gen);
+	if (rc != 0)
+		goto out;
+
+	if (cmd == LCFG_LOV_ADD_OBD || cmd == LCFG_LOV_ADD_INA) {
+		rc = lov_cl_add_target(env, d, index);
+		if (rc != 0)
+			lov_del_target(d->ld_obd, index, NULL, 0);
+	} else if (cmd == LCFG_LOV_DEL_OBD) {
+		lov_cl_del_target(env, d, index);
+	}
+out:
+	obd_putref(obd);
+	RETURN(rc);
+}
+
+/* lu_device method table for LOV: object allocation and config processing. */
+static const struct lu_device_operations lov_lu_ops = {
+	.ldo_object_alloc      = lov_object_alloc,
+	.ldo_process_config    = lov_process_config,
+};
+
+/*
+ * lu_device_type_operations::ldto_device_alloc() for LOV.
+ *
+ * Allocates the lov_device, installs the lu/cl method tables, initializes
+ * ld_mutex and then sets up the LOV obd named by string 0 of \a cfg.  The
+ * obd is asserted to exist.
+ *
+ * \retval device pointer  success; ld_lov points at the obd's lov state
+ * \retval ERR_PTR(-ENOMEM) or ERR_PTR(error from lov_setup())  failure
+ */
+static struct lu_device *lov_device_alloc(const struct lu_env *env,
+					  struct lu_device_type *t,
+					  struct lustre_cfg *cfg)
+{
+	struct lov_device *ld;
+	struct lu_device *d;
+	struct obd_device *obd;
+	int rc;
+
+	OBD_ALLOC_PTR(ld);
+	if (ld == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	cl_device_init(&ld->ld_cl, t);
+	ld->ld_cl.cd_ops = &lov_cl_ops;
+	d = lov2lu_dev(ld);
+	d->ld_ops = &lov_lu_ops;
+
+	mutex_init(&ld->ld_mutex);
+	lockdep_set_class(&ld->ld_mutex, &cl_lov_device_mutex_class);
+
+	/* setup the LOV OBD */
+	obd = class_name2obd(lustre_cfg_string(cfg, 0));
+	LASSERT(obd != NULL);
+	rc = lov_setup(obd, cfg);
+	if (rc == 0) {
+		ld->ld_lov = &obd->u.lov;
+		RETURN(d);
+	}
+
+	lov_device_free(env, d);
+	RETURN(ERR_PTR(rc));
+}
+
+/*
+ * lu_device_type method table for LOV.  The init/fini/start/stop entries are
+ * the lov_type_*() functions instantiated by LU_TYPE_INIT_FINI(lov, ...);
+ * the remaining entries are this file's device alloc/free/init/fini
+ * routines.
+ */
+static const struct lu_device_type_operations lov_device_type_ops = {
+	.ldto_init = lov_type_init,
+	.ldto_fini = lov_type_fini,
+
+	.ldto_start = lov_type_start,
+	.ldto_stop  = lov_type_stop,
+
+	.ldto_device_alloc = lov_device_alloc,
+	.ldto_device_free  = lov_device_free,
+
+	.ldto_device_init    = lov_device_init,
+	.ldto_device_fini    = lov_device_fini
+};
+
+/*
+ * The LOV device type: a cl-layer device (LU_DEVICE_CL) registered under
+ * LUSTRE_LOV_NAME, with contexts tagged LCT_CL_THREAD.  Exported for use
+ * outside this module.
+ */
+struct lu_device_type lov_device_type = {
+	.ldt_tags     = LU_DEVICE_CL,
+	.ldt_name     = LUSTRE_LOV_NAME,
+	.ldt_ops      = &lov_device_type_ops,
+	.ldt_ctx_tags = LCT_CL_THREAD
+};
+EXPORT_SYMBOL(lov_device_type);
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lov_ea.c b/drivers/staging/lustre/lustre/lov/lov_ea.c
new file mode 100644
index 000000000000..481e8631fc3e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_ea.c
@@ -0,0 +1,334 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_ea.c
+ *
+ * Author: Wang Di <wangdi@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <asm/div64.h>
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_class.h>
+#include <obd_lov.h>
+#include <lustre/lustre_idl.h>
+
+#include "lov_internal.h"
+
+/*
+ * Pairs a stripe descriptor with an integer cursor.
+ * NOTE(review): no user is visible in this part of the file; presumably a
+ * callback argument for EA unpacking -- confirm before relying on it.
+ */
+struct lovea_unpack_args {
+	struct lov_stripe_md *lsm;
+	int		   cursor;
+};
+
+/*
+ * Sanity checks shared by the V1 and V3 EA verifiers.
+ *
+ * Rejects a stripe count of zero or above LOV_V1_INSANE_STRIPE_COUNT, a zero
+ * object id, a pattern other than LOV_PATTERN_RAID0 and a stripe size that
+ * is zero or not a multiple of LOV_MIN_STRIPE_SIZE.  Each rejection logs the
+ * reason and dumps the EA at D_WARNING.
+ *
+ * \param lmm           packed (little-endian) EA
+ * \param lmm_bytes     size of the EA buffer; not examined here
+ * \param stripe_count  stripe count already decoded by the caller
+ *
+ * \retval 0        EA looks sane
+ * \retval -EINVAL  one of the checks failed
+ */
+static int lsm_lmm_verify_common(struct lov_mds_md *lmm, int lmm_bytes,
+				 __u16 stripe_count)
+{
+	__u32 stripe_size;
+
+	if (stripe_count == 0 || stripe_count > LOV_V1_INSANE_STRIPE_COUNT) {
+		CERROR("bad stripe count %d\n", stripe_count);
+		goto invalid;
+	}
+
+	if (lmm_oi_id(&lmm->lmm_oi) == 0) {
+		CERROR("zero object id\n");
+		goto invalid;
+	}
+
+	if (lmm->lmm_pattern != cpu_to_le32(LOV_PATTERN_RAID0)) {
+		CERROR("bad striping pattern\n");
+		goto invalid;
+	}
+
+	stripe_size = le32_to_cpu(lmm->lmm_stripe_size);
+	if (stripe_size == 0 ||
+	    (stripe_size & (LOV_MIN_STRIPE_SIZE - 1)) != 0) {
+		CERROR("bad stripe size %u\n", stripe_size);
+		goto invalid;
+	}
+	return 0;
+
+invalid:
+	lov_dump_lmm(D_WARNING, lmm);
+	return -EINVAL;
+}
+
+/*
+ * Allocate a lov_stripe_md with room for \a stripe_count lov_oinfo pointers
+ * and one slab-allocated lov_oinfo per stripe.
+ *
+ * \param stripe_count  number of stripes; asserted <= LOV_MAX_STRIPE_COUNT
+ * \param size          out: byte size of the lov_stripe_md allocation
+ *                      (written even when the allocation fails)
+ *
+ * \retval descriptor with lsm_stripe_count set  success
+ * \retval NULL  allocation failure; nothing is left allocated
+ */
+struct lov_stripe_md *lsm_alloc_plain(__u16 stripe_count, int *size)
+{
+	struct lov_stripe_md *lsm;
+	struct lov_oinfo     *loi;
+	int		   i;
+
+	LASSERT(stripe_count <= LOV_MAX_STRIPE_COUNT);
+
+	*size = sizeof(struct lov_stripe_md) +
+		sizeof(struct lov_oinfo *) * stripe_count;
+
+	OBD_ALLOC_LARGE(lsm, *size);
+	if (lsm == NULL)
+		return NULL;
+
+	for (i = 0; i < stripe_count; i++) {
+		OBD_SLAB_ALLOC_PTR_GFP(loi, lov_oinfo_slab, __GFP_IO);
+		if (loi == NULL)
+			goto unwind;
+		lsm->lsm_oinfo[i] = loi;
+	}
+	lsm->lsm_stripe_count = stripe_count;
+	return lsm;
+
+unwind:
+	/* Release the oinfos allocated before the failing one. */
+	for (i--; i >= 0; i--)
+		OBD_SLAB_FREE(lsm->lsm_oinfo[i], lov_oinfo_slab, sizeof(*loi));
+	OBD_FREE_LARGE(lsm, *size);
+	return NULL;
+}
+
+/*
+ * Counterpart of lsm_alloc_plain(): return every per-stripe lov_oinfo to
+ * lov_oinfo_slab, then free the descriptor itself.  The stripe count is
+ * sampled first because it determines the size of the descriptor.
+ */
+void lsm_free_plain(struct lov_stripe_md *lsm)
+{
+	const __u16 stripe_count = lsm->lsm_stripe_count;
+	int i;
+
+	for (i = 0; i < stripe_count; i++) {
+		OBD_SLAB_FREE(lsm->lsm_oinfo[i], lov_oinfo_slab,
+			      sizeof(struct lov_oinfo));
+	}
+
+	OBD_FREE_LARGE(lsm, sizeof(struct lov_stripe_md) +
+		       stripe_count * sizeof(struct lov_oinfo *));
+}
+
+/*
+ * Decode the EA header fields into \a lsm: object id, stripe size, pattern
+ * and layout generation are converted from little-endian, and the pool name
+ * is reset to the empty string.
+ *
+ * This relies on lov_mds_md_v1 and lov_mds_md_v3 starting with the same
+ * fields, so that a V3 EA can be passed in as a V1 one.
+ */
+static void lsm_unpackmd_common(struct lov_stripe_md *lsm,
+				struct lov_mds_md *lmm)
+{
+	lsm->lsm_pool_name[0] = '\0';
+	lsm->lsm_pattern = le32_to_cpu(lmm->lmm_pattern);
+	lsm->lsm_stripe_size = le32_to_cpu(lmm->lmm_stripe_size);
+	lsm->lsm_layout_gen = le16_to_cpu(lmm->lmm_layout_gen);
+	lmm_oi_le_to_cpu(&lsm->lsm_oi, &lmm->lmm_oi);
+}
+
+/*
+ * lsm_operations::lsm_stripe_by_index for plain layouts: report the stripe
+ * width (stripe size times stripe count) through \a swidth when the caller
+ * asked for it.  \a stripeno and \a lov_off are left untouched.
+ */
+static void
+lsm_stripe_by_index_plain(struct lov_stripe_md *lsm, int *stripeno,
+			   obd_off *lov_off, obd_off *swidth)
+{
+	if (swidth == NULL)
+		return;
+
+	/* Widen before multiplying so the product is computed as obd_off. */
+	*swidth = (obd_off)lsm->lsm_stripe_size * lsm->lsm_stripe_count;
+}
+
+/*
+ * lsm_operations::lsm_stripe_by_offset for plain layouts: report the stripe
+ * width (stripe size times stripe count) through \a swidth when the caller
+ * asked for it.  \a stripeno and \a lov_off are left untouched.
+ */
+static void
+lsm_stripe_by_offset_plain(struct lov_stripe_md *lsm, int *stripeno,
+			   obd_off *lov_off, obd_off *swidth)
+{
+	if (swidth == NULL)
+		return;
+
+	/* Widen before multiplying so the product is computed as obd_off. */
+	*swidth = (obd_off)lsm->lsm_stripe_size * lsm->lsm_stripe_count;
+}
+
+/*
+ * lsm_operations::lsm_destroy for plain layouts: nothing to do, always
+ * reports success.  None of the arguments are used.
+ */
+static int lsm_destroy_plain(struct lov_stripe_md *lsm, struct obdo *oa,
+			     struct obd_export *md_exp)
+{
+	return 0;
+}
+
+/*
+ * Fold one target's per-object size limit into the running minimum kept in
+ * \a stripe_maxbytes.
+ *
+ * A target without an import, an inactive target, or one whose import is not
+ * in LUSTRE_IMP_FULL state (or did not report a positive ocd_maxbytes
+ * together with OBD_CONNECT_MAXBYTES) sets \a stripe_maxbytes to
+ * LUSTRE_STRIPE_MAXBYTES outright instead of taking a minimum.
+ *
+ * The import's connect data is read under imp_lock.
+ */
+static void lov_tgt_maxbytes(struct lov_tgt_desc *tgt, __u64 *stripe_maxbytes)
+{
+	struct obd_import *imp = tgt->ltd_obd->u.cli.cl_import;
+
+	if (imp != NULL && tgt->ltd_active) {
+		spin_lock(&imp->imp_lock);
+		if (imp->imp_state != LUSTRE_IMP_FULL ||
+		    !(imp->imp_connect_data.ocd_connect_flags &
+		      OBD_CONNECT_MAXBYTES) ||
+		    imp->imp_connect_data.ocd_maxbytes <= 0)
+			*stripe_maxbytes = LUSTRE_STRIPE_MAXBYTES;
+		else if (*stripe_maxbytes > imp->imp_connect_data.ocd_maxbytes)
+			*stripe_maxbytes = imp->imp_connect_data.ocd_maxbytes;
+		spin_unlock(&imp->imp_lock);
+		return;
+	}
+
+	*stripe_maxbytes = LUSTRE_STRIPE_MAXBYTES;
+}
+
+/*
+ * Validate a packed V1 EA before it is unpacked.
+ *
+ * The buffer must hold at least the fixed header and then the full EA for
+ * the stripe count found in that header; the remaining checks are done by
+ * lsm_lmm_verify_common().
+ *
+ * \param lmm           packed (little-endian) EA
+ * \param lmm_bytes     size of the buffer holding \a lmm
+ * \param stripe_count  out: stripe count decoded from the header (only
+ *                      written once the header is known to fit)
+ *
+ * \retval 0        EA is acceptable
+ * \retval -EINVAL  buffer too small or contents invalid
+ */
+static int lsm_lmm_verify_v1(struct lov_mds_md_v1 *lmm, int lmm_bytes,
+			     __u16 *stripe_count)
+{
+	/*
+	 * Compare as signed: against a bare sizeof the int would be converted
+	 * to an unsigned type and a negative length would pass this check.
+	 */
+	if (lmm_bytes < (int)sizeof(*lmm)) {
+		CERROR("lov_mds_md_v1 too small: %d, need at least %d\n",
+		       lmm_bytes, (int)sizeof(*lmm));
+		return -EINVAL;
+	}
+
+	*stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
+
+	if (lmm_bytes < lov_mds_md_size(*stripe_count, LOV_MAGIC_V1)) {
+		CERROR("LOV EA V1 too small: %d, need %d\n",
+		       lmm_bytes, lov_mds_md_size(*stripe_count, LOV_MAGIC_V1));
+		lov_dump_lmm_v1(D_WARNING, lmm);
+		return -EINVAL;
+	}
+
+	return lsm_lmm_verify_common(lmm, lmm_bytes, *stripe_count);
+}
+
+/*
+ * Unpack a V1 EA into \a lsm.
+ *
+ * After the common header, every stripe's object id, OST index and OST
+ * generation are decoded into the matching lov_oinfo.  Each OST index must
+ * be below the LOV's target count and refer to a populated target slot.
+ * lsm_maxbytes is set to the per-stripe limit accumulated by
+ * lov_tgt_maxbytes() multiplied by the stripe count.
+ *
+ * \retval 0        success
+ * \retval -EINVAL  a stripe refers to an out-of-range or missing OST; the
+ *                  stripes decoded up to that point stay filled in
+ */
+int lsm_unpackmd_v1(struct lov_obd *lov, struct lov_stripe_md *lsm,
+		    struct lov_mds_md_v1 *lmm)
+{
+	__u64 stripe_maxbytes = OBD_OBJECT_EOF;
+	int i;
+
+	lsm_unpackmd_common(lsm, lmm);
+
+	for (i = 0; i < lsm->lsm_stripe_count; i++) {
+		struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+		struct lov_tgt_desc *tgt;
+
+		/* XXX LOV STACKING call down to osc_unpackmd() */
+		ostid_le_to_cpu(&lmm->lmm_objects[i].l_ost_oi, &loi->loi_oi);
+		loi->loi_ost_idx = le32_to_cpu(lmm->lmm_objects[i].l_ost_idx);
+		loi->loi_ost_gen = le32_to_cpu(lmm->lmm_objects[i].l_ost_gen);
+
+		if (loi->loi_ost_idx >= lov->desc.ld_tgt_count) {
+			CERROR("OST index %d more than OST count %d\n",
+			       loi->loi_ost_idx, lov->desc.ld_tgt_count);
+			goto bad_ea;
+		}
+
+		tgt = lov->lov_tgts[loi->loi_ost_idx];
+		if (tgt == NULL) {
+			CERROR("OST index %d missing\n", loi->loi_ost_idx);
+			goto bad_ea;
+		}
+
+		/* calculate the minimum stripe max bytes */
+		lov_tgt_maxbytes(tgt, &stripe_maxbytes);
+	}
+
+	lsm->lsm_maxbytes = stripe_maxbytes * lsm->lsm_stripe_count;
+	return 0;
+
+bad_ea:
+	lov_dump_lmm_v1(D_WARNING, lmm);
+	return -EINVAL;
+}
+
+/*
+ * lsm_operations table for V1 EAs (struct lov_mds_md_v1).  Only the verify
+ * and unpack methods are V1-specific; the rest are the plain-layout helpers.
+ */
+const struct lsm_operations lsm_v1_ops = {
+	.lsm_free	    = lsm_free_plain,
+	.lsm_destroy	 = lsm_destroy_plain,
+	.lsm_stripe_by_index    = lsm_stripe_by_index_plain,
+	.lsm_stripe_by_offset   = lsm_stripe_by_offset_plain,
+	.lsm_lmm_verify	 = lsm_lmm_verify_v1,
+	.lsm_unpackmd	   = lsm_unpackmd_v1,
+};
+
+/*
+ * Validate a packed V3 EA before it is unpacked.
+ *
+ * The EA arrives typed as the generic lov_mds_md and is reinterpreted as
+ * lov_mds_md_v3.  The buffer must hold at least the fixed V3 header and then
+ * the full EA for the stripe count found in that header; the remaining
+ * checks are done by lsm_lmm_verify_common().
+ *
+ * \param lmmv1         packed (little-endian) EA
+ * \param lmm_bytes     size of the buffer holding the EA
+ * \param stripe_count  out: stripe count decoded from the header (only
+ *                      written once the header is known to fit)
+ *
+ * \retval 0        EA is acceptable
+ * \retval -EINVAL  buffer too small or contents invalid
+ */
+static int lsm_lmm_verify_v3(struct lov_mds_md *lmmv1, int lmm_bytes,
+			     __u16 *stripe_count)
+{
+	struct lov_mds_md_v3 *lmm;
+
+	lmm = (struct lov_mds_md_v3 *)lmmv1;
+
+	/*
+	 * Compare as signed: against a bare sizeof the int would be converted
+	 * to an unsigned type and a negative length would pass this check.
+	 */
+	if (lmm_bytes < (int)sizeof(*lmm)) {
+		CERROR("lov_mds_md_v3 too small: %d, need at least %d\n",
+		       lmm_bytes, (int)sizeof(*lmm));
+		return -EINVAL;
+	}
+
+	*stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
+
+	if (lmm_bytes < lov_mds_md_size(*stripe_count, LOV_MAGIC_V3)) {
+		CERROR("LOV EA V3 too small: %d, need %d\n",
+		       lmm_bytes, lov_mds_md_size(*stripe_count, LOV_MAGIC_V3));
+		lov_dump_lmm_v3(D_WARNING, lmm);
+		return -EINVAL;
+	}
+
+	return lsm_lmm_verify_common((struct lov_mds_md_v1 *)lmm, lmm_bytes,
+				     *stripe_count);
+}
+
+/*
+ * Unpack a V3 EA into \a lsm.
+ *
+ * Works like the V1 variant and additionally copies the pool name.  Every
+ * stripe's object id, OST index and OST generation are decoded into the
+ * matching lov_oinfo; each OST index must be below the LOV's target count
+ * and refer to a populated target slot.  lsm_maxbytes is set to the
+ * per-stripe limit accumulated by lov_tgt_maxbytes() multiplied by the
+ * stripe count.
+ *
+ * \param lov    LOV the layout belongs to
+ * \param lsm    in-memory descriptor to fill in
+ * \param lmmv1  packed EA, reinterpreted as lov_mds_md_v3
+ *
+ * \retval 0        success
+ * \retval -E2BIG   the pool name does not fit into lsm_pool_name
+ * \retval -EINVAL  a stripe refers to an out-of-range or missing OST
+ */
+int lsm_unpackmd_v3(struct lov_obd *lov, struct lov_stripe_md *lsm,
+		    struct lov_mds_md *lmmv1)
+{
+	struct lov_mds_md_v3 *lmm;
+	struct lov_oinfo *loi;
+	int i;
+	__u64 stripe_maxbytes = OBD_OBJECT_EOF;
+	size_t cplen;
+
+	lmm = (struct lov_mds_md_v3 *)lmmv1;
+
+	lsm_unpackmd_common(lsm, (struct lov_mds_md_v1 *)lmm);
+
+	/*
+	 * The packed EA is external data, so its fixed-size pool name field
+	 * is not guaranteed to be NUL-terminated.  Bound the length scan to
+	 * the field instead of letting a strlen()-based copy run past it.
+	 */
+	cplen = strnlen(lmm->lmm_pool_name, sizeof(lmm->lmm_pool_name));
+	if (cplen >= sizeof(lsm->lsm_pool_name))
+		return -E2BIG;
+	memcpy(lsm->lsm_pool_name, lmm->lmm_pool_name, cplen);
+	lsm->lsm_pool_name[cplen] = '\0';
+
+	for (i = 0; i < lsm->lsm_stripe_count; i++) {
+		/* XXX LOV STACKING call down to osc_unpackmd() */
+		loi = lsm->lsm_oinfo[i];
+		ostid_le_to_cpu(&lmm->lmm_objects[i].l_ost_oi, &loi->loi_oi);
+		loi->loi_ost_idx = le32_to_cpu(lmm->lmm_objects[i].l_ost_idx);
+		loi->loi_ost_gen = le32_to_cpu(lmm->lmm_objects[i].l_ost_gen);
+		if (loi->loi_ost_idx >= lov->desc.ld_tgt_count) {
+			CERROR("OST index %d more than OST count %d\n",
+			       loi->loi_ost_idx, lov->desc.ld_tgt_count);
+			lov_dump_lmm_v3(D_WARNING, lmm);
+			return -EINVAL;
+		}
+		if (!lov->lov_tgts[loi->loi_ost_idx]) {
+			CERROR("OST index %d missing\n", loi->loi_ost_idx);
+			lov_dump_lmm_v3(D_WARNING, lmm);
+			return -EINVAL;
+		}
+		/* calculate the minimum stripe max bytes */
+		lov_tgt_maxbytes(lov->lov_tgts[loi->loi_ost_idx],
+				 &stripe_maxbytes);
+	}
+
+	lsm->lsm_maxbytes = stripe_maxbytes * lsm->lsm_stripe_count;
+
+	return 0;
+}
+
+/*
+ * lsm_operations table for V3 EAs (struct lov_mds_md_v3).  Only the verify
+ * and unpack methods are V3-specific; the rest are the plain-layout helpers.
+ */
+const struct lsm_operations lsm_v3_ops = {
+	.lsm_free	    = lsm_free_plain,
+	.lsm_destroy	 = lsm_destroy_plain,
+	.lsm_stripe_by_index    = lsm_stripe_by_index_plain,
+	.lsm_stripe_by_offset   = lsm_stripe_by_offset_plain,
+	.lsm_lmm_verify	 = lsm_lmm_verify_v3,
+	.lsm_unpackmd	   = lsm_unpackmd_v3,
+};
diff --git a/drivers/staging/lustre/lustre/lov/lov_internal.h b/drivers/staging/lustre/lustre/lov/lov_internal.h
new file mode 100644
index 000000000000..146d5e310283
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_internal.h
@@ -0,0 +1,322 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef LOV_INTERNAL_H
+#define LOV_INTERNAL_H
+
+#include <obd_class.h>
+#include <obd_lov.h>
+#include <lustre/lustre_user.h>
+
+/*
+ * Reference-counted set of per-stripe lock handles, itself addressable
+ * through a portals handle.  Released by lov_llh_put().
+ */
+struct lov_lock_handles {
+	struct portals_handle   llh_handle;	/* handle this set is hashed under */
+	atomic_t	    llh_refcount;	/* dropped by lov_llh_put() */
+	int		     llh_stripe_count;	/* number of llh_handles[] entries */
+	struct lustre_handle    llh_handles[0];	/* trailing variable-size array */
+};
+
+/*
+ * One per-target sub-request of a lov_request_set.
+ * NOTE(review): rq_link presumably chains the request on
+ * lov_request_set::set_list -- confirm in lov_request.c.
+ */
+struct lov_request {
+	struct obd_info	  rq_oi;
+	struct lov_request_set  *rq_rqset;	/* owning request set */
+
+	struct list_head	       rq_link;
+
+	int		      rq_idx;	/* index in lov->tgts array */
+	int		      rq_stripe;     /* stripe number */
+	int		      rq_complete;
+	int		      rq_rc;
+	int		      rq_buflen;     /* length of sub_md */
+
+	obd_count		rq_oabufs;
+	obd_count		rq_pgaidx;
+};
+
+/*
+ * A group of lov_request sub-requests issued on behalf of one LOV-level
+ * operation.  The set is reference counted through lov_get_reqset() and
+ * lov_put_reqset(); dropping the last reference calls lov_finish_set().
+ */
+struct lov_request_set {
+	struct ldlm_enqueue_info	*set_ei;
+	struct obd_info			*set_oi;
+	atomic_t			set_refcount;
+	struct obd_export		*set_exp;
+	/* XXX: There is @set_exp already, however obd_statfs gets obd_device
+	   only. */
+	struct obd_device		*set_obd;
+	int				set_count;
+	atomic_t			set_completes;
+	atomic_t			set_success;
+	atomic_t			set_finish_checked;
+	struct llog_cookie		*set_cookies;
+	int				set_cookie_sent;
+	struct obd_trans_info		*set_oti;
+	obd_count			set_oabufs;
+	struct brw_page			*set_pga;
+	struct lov_lock_handles		*set_lockh;
+	struct list_head			set_list;
+	wait_queue_head_t			set_waitq;
+	spinlock_t			set_lock;
+};
+
+extern struct kmem_cache *lov_oinfo_slab;
+
+void lov_finish_set(struct lov_request_set *set);
+
+/*
+ * Take an additional reference on \a set.  The caller must already hold one:
+ * a NULL set or a count that is not positive trips an assertion.
+ */
+static inline void lov_get_reqset(struct lov_request_set *set)
+{
+	LASSERT(set != NULL);
+	LASSERT(atomic_read(&set->set_refcount) > 0);
+	atomic_inc(&set->set_refcount);
+}
+
+/*
+ * Drop one reference on \a set; the holder of the last reference finishes
+ * the set through lov_finish_set().
+ */
+static inline void lov_put_reqset(struct lov_request_set *set)
+{
+	if (!atomic_dec_and_test(&set->set_refcount))
+		return;
+
+	lov_finish_set(set);
+}
+
+/*
+ * Translate a lustre handle into its lov_lock_handles via the handle's
+ * cookie; returns whatever class_handle2object() yields for that cookie.
+ * \a handle must not be NULL.
+ */
+static inline struct lov_lock_handles *
+lov_handle2llh(struct lustre_handle *handle)
+{
+	LASSERT(handle != NULL);
+
+	return class_handle2object(handle->cookie);
+}
+
+/*
+ * Drop one reference on \a llh.  When the count reaches zero the handle is
+ * unhashed and, unless a new reference has appeared in the meantime, the
+ * structure (header plus llh_stripe_count trailing handles) is freed after
+ * an RCU grace period via OBD_FREE_RCU().
+ */
+static inline void lov_llh_put(struct lov_lock_handles *llh)
+{
+	CDEBUG(D_INFO, "PUTting llh %p : new refcount %d\n", llh,
+	       atomic_read(&llh->llh_refcount) - 1);
+	/* Catch both underflow and an implausibly large count. */
+	LASSERT(atomic_read(&llh->llh_refcount) > 0 &&
+		atomic_read(&llh->llh_refcount) < 0x5a5a);
+	if (atomic_dec_and_test(&llh->llh_refcount)) {
+		class_handle_unhash(&llh->llh_handle);
+		/* The structure may still be held by other threads because of
+		 * RCU: if the count is non-zero again, leave it to them.
+		 *   -jxiong */
+		if (atomic_read(&llh->llh_refcount))
+			return;
+
+		OBD_FREE_RCU(llh, sizeof *llh +
+			     sizeof(*llh->llh_handles) * llh->llh_stripe_count,
+			     &llh->llh_handle);
+	}
+}
+
+/*
+ * UUID string of target \a index of LOV \a lv.  The target slot is
+ * dereferenced unconditionally, so it must be populated.
+ */
+#define lov_uuid2str(lv, index) \
+	(char *)((lv)->lov_tgts[index]->ltd_uuid.uuid)
+
+/* lov_merge.c */
+void lov_merge_attrs(struct obdo *tgt, struct obdo *src, obd_valid valid,
+		     struct lov_stripe_md *lsm, int stripeno, int *set);
+int lov_merge_lvb(struct obd_export *exp, struct lov_stripe_md *lsm,
+		  struct ost_lvb *lvb, int kms_only);
+int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm,
+		   obd_off size, int shrink);
+int lov_merge_lvb_kms(struct lov_stripe_md *lsm,
+		      struct ost_lvb *lvb, __u64 *kms_place);
+
+/* lov_offset.c */
+obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size,
+			 int stripeno);
+int lov_stripe_offset(struct lov_stripe_md *lsm, obd_off lov_off,
+		      int stripeno, obd_off *obd_off);
+obd_off lov_size_to_stripe(struct lov_stripe_md *lsm, obd_off file_size,
+			   int stripeno);
+int lov_stripe_intersects(struct lov_stripe_md *lsm, int stripeno,
+			  obd_off start, obd_off end,
+			  obd_off *obd_start, obd_off *obd_end);
+int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off);
+
+/* lov_qos.c */
+#define LOV_USES_ASSIGNED_STRIPE	0
+#define LOV_USES_DEFAULT_STRIPE	 1
+int qos_add_tgt(struct obd_device *obd, __u32 index);
+int qos_del_tgt(struct obd_device *obd, struct lov_tgt_desc *tgt);
+void qos_shrink_lsm(struct lov_request_set *set);
+int qos_prep_create(struct obd_export *exp, struct lov_request_set *set);
+void qos_update(struct lov_obd *lov);
+void qos_statfs_done(struct lov_obd *lov);
+void qos_statfs_update(struct obd_device *obd, __u64 max_age, int wait);
+int qos_remedy_create(struct lov_request_set *set, struct lov_request *req);
+
+/* lov_request.c */
+void lov_set_add_req(struct lov_request *req, struct lov_request_set *set);
+int lov_set_finished(struct lov_request_set *set, int idempotent);
+void lov_update_set(struct lov_request_set *set,
+		    struct lov_request *req, int rc);
+int lov_update_common_set(struct lov_request_set *set,
+			  struct lov_request *req, int rc);
+int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx);
+int lov_prep_create_set(struct obd_export *exp, struct obd_info *oifo,
+			struct lov_stripe_md **ea, struct obdo *src_oa,
+			struct obd_trans_info *oti,
+			struct lov_request_set **reqset);
+int cb_create_update(void *cookie, int rc);
+int lov_fini_create_set(struct lov_request_set *set, struct lov_stripe_md **ea);
+int lov_prep_brw_set(struct obd_export *exp, struct obd_info *oinfo,
+		     obd_count oa_bufs, struct brw_page *pga,
+		     struct obd_trans_info *oti,
+		     struct lov_request_set **reqset);
+int lov_fini_brw_set(struct lov_request_set *set);
+int lov_prep_getattr_set(struct obd_export *exp, struct obd_info *oinfo,
+			 struct lov_request_set **reqset);
+int lov_fini_getattr_set(struct lov_request_set *set);
+int lov_prep_destroy_set(struct obd_export *exp, struct obd_info *oinfo,
+			 struct obdo *src_oa, struct lov_stripe_md *lsm,
+			 struct obd_trans_info *oti,
+			 struct lov_request_set **reqset);
+int lov_update_destroy_set(struct lov_request_set *set,
+			   struct lov_request *req, int rc);
+int lov_fini_destroy_set(struct lov_request_set *set);
+int lov_prep_setattr_set(struct obd_export *exp, struct obd_info *oinfo,
+			 struct obd_trans_info *oti,
+			 struct lov_request_set **reqset);
+int lov_update_setattr_set(struct lov_request_set *set,
+			   struct lov_request *req, int rc);
+int lov_fini_setattr_set(struct lov_request_set *set);
+int lov_prep_punch_set(struct obd_export *exp, struct obd_info *oinfo,
+		       struct obd_trans_info *oti,
+		       struct lov_request_set **reqset);
+int lov_fini_punch_set(struct lov_request_set *set);
+int lov_prep_sync_set(struct obd_export *exp, struct obd_info *obd_info,
+		      obd_off start, obd_off end,
+		      struct lov_request_set **reqset);
+int lov_fini_sync_set(struct lov_request_set *set);
+int lov_prep_enqueue_set(struct obd_export *exp, struct obd_info *oinfo,
+			 struct ldlm_enqueue_info *einfo,
+			 struct lov_request_set **reqset);
+int lov_fini_enqueue_set(struct lov_request_set *set, __u32 mode, int rc,
+			 struct ptlrpc_request_set *rqset);
+int lov_prep_match_set(struct obd_export *exp, struct obd_info *oinfo,
+		       struct lov_stripe_md *lsm,
+		       ldlm_policy_data_t *policy, __u32 mode,
+		       struct lustre_handle *lockh,
+		       struct lov_request_set **reqset);
+int lov_fini_match_set(struct lov_request_set *set, __u32 mode, int flags);
+int lov_prep_cancel_set(struct obd_export *exp, struct obd_info *oinfo,
+			struct lov_stripe_md *lsm,
+			__u32 mode, struct lustre_handle *lockh,
+			struct lov_request_set **reqset);
+int lov_fini_cancel_set(struct lov_request_set *set);
+int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo,
+			struct lov_request_set **reqset);
+void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs,
+		       int success);
+int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs,
+		    int success);
+int lov_fini_statfs_set(struct lov_request_set *set);
+int lov_statfs_interpret(struct ptlrpc_request_set *rqset, void *data, int rc);
+
+/* lov_obd.c */
+void lov_fix_desc(struct lov_desc *desc);
+void lov_fix_desc_stripe_size(__u64 *val);
+void lov_fix_desc_stripe_count(__u32 *val);
+void lov_fix_desc_pattern(__u32 *val);
+void lov_fix_desc_qos_maxage(__u32 *val);
+__u16 lov_get_stripecnt(struct lov_obd *lov, __u32 magic, __u16 stripe_count);
+int lov_connect_obd(struct obd_device *obd, __u32 index, int activate,
+		    struct obd_connect_data *data);
+int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
+int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg,
+			    __u32 *indexp, int *genp);
+int lov_del_target(struct obd_device *obd, __u32 index,
+		   struct obd_uuid *uuidp, int gen);
+/* lov_log.c */
+int lov_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+		  struct obd_device *tgt, int *idx);
+int lov_llog_finish(struct obd_device *obd, int count);
+
+/* lov_pack.c */
+int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmm,
+	       struct lov_stripe_md *lsm);
+int lov_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
+		 struct lov_mds_md *lmm, int lmm_bytes);
+int lov_setstripe(struct obd_export *exp, int max_lmm_size,
+		  struct lov_stripe_md **lsmp, struct lov_user_md *lump);
+int lov_setea(struct obd_export *exp, struct lov_stripe_md **lsmp,
+	      struct lov_user_md *lump);
+int lov_getstripe(struct obd_export *exp,
+		  struct lov_stripe_md *lsm, struct lov_user_md *lump);
+int lov_alloc_memmd(struct lov_stripe_md **lsmp, __u16 stripe_count,
+		    int pattern, int magic);
+int lov_free_memmd(struct lov_stripe_md **lsmp);
+
+void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm);
+void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm);
+void lov_dump_lmm(int level, void *lmm);
+
+/* lov_ea.c */
+struct lov_stripe_md *lsm_alloc_plain(__u16 stripe_count, int *size);
+void lsm_free_plain(struct lov_stripe_md *lsm);
+
+int lovea_destroy_object(struct lov_obd *lov, struct lov_stripe_md *lsm,
+			 struct obdo *oa, void *data);
+/* lproc_lov.c */
+extern struct file_operations lov_proc_target_fops;
+#ifdef LPROCFS
+void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars);
+#else
+/* Stub used when LPROCFS is not defined: just zero-fill \a lvars. */
+static inline void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars)
+{
+	memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+
+/* lov_cl.c */
+extern struct lu_device_type lov_device_type;
+
+/* pools */
+extern cfs_hash_ops_t pool_hash_operations;
+/* ost_pool methods */
+int lov_ost_pool_init(struct ost_pool *op, unsigned int count);
+int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count);
+int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count);
+int lov_ost_pool_remove(struct ost_pool *op, __u32 idx);
+int lov_ost_pool_free(struct ost_pool *op);
+
+/* high level pool methods */
+int lov_pool_new(struct obd_device *obd, char *poolname);
+int lov_pool_del(struct obd_device *obd, char *poolname);
+int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname);
+int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname);
+void lov_dump_pool(int level, struct pool_desc *pool);
+struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname);
+int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool);
+void lov_pool_putref(struct pool_desc *pool);
+
+/*
+ * Take an additional reference on \a lsm and return it, so that the call can
+ * be used inline in an expression.  The caller must already hold a
+ * reference: a count that is not positive trips the assertion.
+ */
+static inline struct lov_stripe_md *lsm_addref(struct lov_stripe_md *lsm)
+{
+	LASSERT(atomic_read(&lsm->lsm_refc) > 0);
+	atomic_inc(&lsm->lsm_refc);
+	return lsm;
+}
+
+#endif
diff --git a/drivers/staging/lustre/lustre/lov/lov_io.c b/drivers/staging/lustre/lustre/lov/lov_io.c
new file mode 100644
index 000000000000..1a87abdf0953
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_io.c
@@ -0,0 +1,967 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_io for LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+static inline void lov_sub_enter(struct lov_io_sub *sub)
+{
+	sub->sub_reenter++;	/* bump the sub-io's re-entry counter */
+}
+static inline void lov_sub_exit(struct lov_io_sub *sub)
+{
+	sub->sub_reenter--;	/* undo one lov_sub_enter() */
+}
+
+static void lov_io_sub_fini(const struct lu_env *env, struct lov_io *lio,
+			    struct lov_io_sub *sub)
+{
+	ENTRY;	/* release io and env of one sub-io slot; slot is reusable */
+	if (sub->sub_io != NULL) {
+		if (sub->sub_io_initialized) {
+			lov_sub_enter(sub);
+			cl_io_fini(sub->sub_env, sub->sub_io);
+			lov_sub_exit(sub);
+			sub->sub_io_initialized = 0;
+			lio->lis_active_subios--;
+		}
+		if (sub->sub_stripe == lio->lis_single_subio_index)
+			lio->lis_single_subio_index = -1; /* embedded io */
+		else if (!sub->sub_borrowed)
+			OBD_FREE_PTR(sub->sub_io);
+		sub->sub_io = NULL;
+	}
+	if (sub->sub_env != NULL) {
+		if (!IS_ERR(sub->sub_env) && !sub->sub_borrowed)
+			cl_env_put(sub->sub_env, &sub->sub_refcheck);
+		/* also clears an ERR_PTR: lov_io_sub_init() asserts NULL */
+		sub->sub_env = NULL;
+	}
+	EXIT;
+}
+
+static void lov_io_sub_inherit(struct cl_io *io, struct lov_io *lio,
+			       int stripe, loff_t start, loff_t end)
+{
+	struct lov_stripe_md *lsm    = lio->lis_object->lo_lsm;
+	struct cl_io	 *parent = lio->lis_cl.cis_io;
+
+	switch(io->ci_type) {	/* per-type copy: parent io -> sub-io \a io */
+	case CIT_SETATTR: {
+		io->u.ci_setattr.sa_attr = parent->u.ci_setattr.sa_attr;
+		io->u.ci_setattr.sa_valid = parent->u.ci_setattr.sa_valid;
+		io->u.ci_setattr.sa_capa = parent->u.ci_setattr.sa_capa;
+		if (cl_io_is_trunc(io)) {
+			loff_t new_size = parent->u.ci_setattr.sa_attr.lvb_size;
+
+			/* truncate: map the parent size onto this stripe */
+			new_size = lov_size_to_stripe(lsm, new_size, stripe);
+			io->u.ci_setattr.sa_attr.lvb_size = new_size;
+		}
+		break;
+	}
+	case CIT_FAULT: {
+		struct cl_object *obj = parent->ci_obj;
+		loff_t off = cl_offset(obj, parent->u.ci_fault.ft_index);
+
+		io->u.ci_fault = parent->u.ci_fault; /* whole struct first */
+		off = lov_size_to_stripe(lsm, off, stripe);
+		io->u.ci_fault.ft_index = cl_index(obj, off); /* then remap */
+		break;
+	}
+	case CIT_FSYNC: {
+		io->u.ci_fsync.fi_start = start; /* caller-supplied range */
+		io->u.ci_fsync.fi_end = end;
+		io->u.ci_fsync.fi_capa = parent->u.ci_fsync.fi_capa;
+		io->u.ci_fsync.fi_fid = parent->u.ci_fsync.fi_fid;
+		io->u.ci_fsync.fi_mode = parent->u.ci_fsync.fi_mode;
+		break;
+	}
+	case CIT_READ:	/* shares the CIT_WRITE body below */
+	case CIT_WRITE: {
+		io->u.ci_wr.wr_sync = cl_io_is_sync_write(parent);
+		if (cl_io_is_append(parent)) {
+			io->u.ci_wr.wr_append = 1; /* start/end are unused */
+		} else {
+			io->u.ci_rw.crw_pos = start;
+			io->u.ci_rw.crw_count = end - start;
+		}
+		break;
+	}
+	default:	/* other io types inherit nothing here */
+		break;
+	}
+}
+
+static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio,
+			   struct lov_io_sub *sub)
+{
+	struct lov_object *lov = lio->lis_object;
+	struct lov_device *ld  = lu2lov_dev(lov2cl(lov)->co_lu.lo_dev);
+	struct cl_io      *sub_io;
+	struct cl_object  *sub_obj;
+	struct cl_io      *io  = lio->lis_cl.cis_io;
+
+	int stripe = sub->sub_stripe;
+	int result;
+
+	LASSERT(sub->sub_io == NULL);	/* slot must be unused on entry */
+	LASSERT(sub->sub_env == NULL);
+	LASSERT(sub->sub_stripe < lio->lis_stripe_count);
+	ENTRY;
+
+	result = 0;
+	sub->sub_io_initialized = 0;
+	sub->sub_borrowed = 0;
+
+	if (lio->lis_mem_frozen) {	/* set by lov_io_submit() */
+		LASSERT(mutex_is_locked(&ld->ld_mutex));
+		sub->sub_io  = &ld->ld_emrg[stripe]->emrg_subio;
+		sub->sub_env = ld->ld_emrg[stripe]->emrg_env;
+		sub->sub_borrowed = 1;	/* lov_io_sub_fini() won't free */
+	} else {
+		void *cookie;
+
+		/* obtain new environment */
+		cookie = cl_env_reenter();
+		sub->sub_env = cl_env_get(&sub->sub_refcheck);
+		cl_env_reexit(cookie);
+		if (IS_ERR(sub->sub_env))
+			result = PTR_ERR(sub->sub_env);
+
+		if (result == 0) {
+			/*
+			 * First sub-io. Use ->lis_single_subio to
+			 * avoid dynamic allocation.
+			 */
+			if (lio->lis_active_subios == 0) {
+				sub->sub_io = &lio->lis_single_subio;
+				lio->lis_single_subio_index = stripe;
+			} else {
+				OBD_ALLOC_PTR(sub->sub_io);
+				if (sub->sub_io == NULL)
+					result = -ENOMEM;
+			}
+		}
+	}
+
+	if (result == 0) {
+		sub_obj = lovsub2cl(lov_r0(lov)->lo_sub[stripe]);
+		sub_io  = sub->sub_io;
+
+		sub_io->ci_obj    = sub_obj;
+		sub_io->ci_result = 0;
+
+		sub_io->ci_parent  = io;	/* settings copied from parent */
+		sub_io->ci_lockreq = io->ci_lockreq;
+		sub_io->ci_type    = io->ci_type;
+		sub_io->ci_no_srvlock = io->ci_no_srvlock;
+
+		lov_sub_enter(sub);
+		result = cl_io_sub_init(sub->sub_env, sub_io,
+					io->ci_type, sub_obj);
+		lov_sub_exit(sub);
+		if (result >= 0) {	/* positive results count as success */
+			lio->lis_active_subios++;
+			sub->sub_io_initialized = 1;
+			result = 0;
+		}
+	}
+	if (result != 0)
+		lov_io_sub_fini(env, lio, sub);	/* unwind partial setup */
+	RETURN(result);
+}
+
+struct lov_io_sub *lov_sub_get(const struct lu_env *env,
+			       struct lov_io *lio, int stripe)
+{
+	int rc;
+	struct lov_io_sub *sub = &lio->lis_subs[stripe];
+
+	LASSERT(stripe < lio->lis_stripe_count);
+	ENTRY;
+
+	if (!sub->sub_io_initialized) {	/* set up the slot on first use */
+		sub->sub_stripe = stripe;
+		rc = lov_io_sub_init(env, lio, sub);
+	} else
+		rc = 0;
+	if (rc == 0)
+		lov_sub_enter(sub);	/* balanced by lov_sub_put() */
+	else
+		sub = ERR_PTR(rc);	/* callers must test with IS_ERR() */
+	RETURN(sub);
+}
+
+void lov_sub_put(struct lov_io_sub *sub)
+{
+	lov_sub_exit(sub);	/* drops the enter taken by lov_sub_get() */
+}
+
+/*****************************************************************************
+ *
+ * Lov io operations.
+ *
+ */
+
+static int lov_page_stripe(const struct cl_page *page)
+{
+	struct lovsub_object *subobj;
+
+	ENTRY;
+	subobj = lu2lovsub(	/* lovsub layer of the child page's object */
+		lu_object_locate(page->cp_child->cp_obj->co_lu.lo_header,
+				 &lovsub_device_type));
+	LASSERT(subobj != NULL);
+	RETURN(subobj->lso_index);	/* used by callers as stripe index */
+}
+
+struct lov_io_sub *lov_page_subio(const struct lu_env *env, struct lov_io *lio,
+				  const struct cl_page_slice *slice)
+{
+	struct lov_stripe_md *lsm  = lio->lis_object->lo_lsm;
+	struct cl_page       *page = slice->cpl_page;
+	int stripe;
+
+	LASSERT(lio->lis_cl.cis_io != NULL);
+	LASSERT(cl2lov(slice->cpl_obj) == lio->lis_object);
+	LASSERT(lsm != NULL);	/* lsm is only used for this check */
+	LASSERT(lio->lis_nr_subios > 0);
+	ENTRY;
+
+	stripe = lov_page_stripe(page);
+	RETURN(lov_sub_get(env, lio, stripe)); /* may be ERR_PTR; put after use */
+}
+
+
+/* Allocate the per-stripe sub-io table of \a lio; returns 0 or -ENOMEM. */
+static int lov_io_subio_init(const struct lu_env *env, struct lov_io *lio,
+			     struct cl_io *io)
+{
+	int result;
+
+	LASSERT(lio->lis_object != NULL); /* checked before any dereference */
+	ENTRY;
+
+	/*
+	 * Needs to be optimized: we can't afford to allocate a piece of
+	 * memory when writing a page. -jay
+	 */
+	OBD_ALLOC_LARGE(lio->lis_subs,	/* same count lov_io_fini() frees */
+			lio->lis_stripe_count * sizeof lio->lis_subs[0]);
+	if (lio->lis_subs != NULL) {
+		lio->lis_nr_subios = lio->lis_stripe_count;
+		lio->lis_single_subio_index = -1; /* embedded io unused */
+		lio->lis_active_subios = 0;
+		result = 0;
+	} else
+		result = -ENOMEM;
+	RETURN(result);
+}
+
+static void lov_io_slice_init(struct lov_io *lio,
+			      struct lov_object *obj, struct cl_io *io)
+{
+	ENTRY;
+
+	io->ci_result = 0;
+	lio->lis_object = obj;
+
+	LASSERT(obj->lo_lsm != NULL);
+	lio->lis_stripe_count = obj->lo_lsm->lsm_stripe_count;
+
+	switch (io->ci_type) {	/* derive [lis_pos, lis_endpos) per io type */
+	case CIT_READ:
+	case CIT_WRITE:
+		lio->lis_pos = io->u.ci_rw.crw_pos;
+		lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count;
+		lio->lis_io_endpos = lio->lis_endpos; /* kept across append */
+		if (cl_io_is_append(io)) {
+			LASSERT(io->ci_type == CIT_WRITE);
+			lio->lis_pos = 0;	/* append spans [0, EOF) */
+			lio->lis_endpos = OBD_OBJECT_EOF;
+		}
+		break;
+
+	case CIT_SETATTR:
+		if (cl_io_is_trunc(io))	/* truncate starts at the new size */
+			lio->lis_pos = io->u.ci_setattr.sa_attr.lvb_size;
+		else
+			lio->lis_pos = 0;
+		lio->lis_endpos = OBD_OBJECT_EOF;
+		break;
+
+	case CIT_FAULT: {	/* exactly the faulting page */
+		pgoff_t index = io->u.ci_fault.ft_index;
+		lio->lis_pos = cl_offset(io->ci_obj, index);
+		lio->lis_endpos = cl_offset(io->ci_obj, index + 1);
+		break;
+	}
+
+	case CIT_FSYNC: {	/* range supplied by the caller */
+		lio->lis_pos = io->u.ci_fsync.fi_start;
+		lio->lis_endpos = io->u.ci_fsync.fi_end;
+		break;
+	}
+
+	case CIT_MISC:
+		lio->lis_pos = 0;
+		lio->lis_endpos = OBD_OBJECT_EOF;
+		break;
+
+	default:
+		LBUG();	/* io type not handled by this layer */
+	}
+
+	EXIT;
+}
+
+static void lov_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+	struct lov_io *lio = cl2lov_io(env, ios);
+	struct lov_object *lov = cl2lov(ios->cis_obj);
+	int i;
+
+	ENTRY;
+	if (lio->lis_subs != NULL) {
+		for (i = 0; i < lio->lis_nr_subios; i++) /* every slot */
+			lov_io_sub_fini(env, lio, &lio->lis_subs[i]);
+		OBD_FREE_LARGE(lio->lis_subs,
+			 lio->lis_nr_subios * sizeof lio->lis_subs[0]);
+		lio->lis_nr_subios = 0;
+	}
+
+	LASSERT(atomic_read(&lov->lo_active_ios) > 0);
+	if (atomic_dec_and_test(&lov->lo_active_ios)) /* last active io */
+		wake_up_all(&lov->lo_waitq);
+	EXIT;
+}
+
+/* Add \a delta to \a val unless \a val is the OBD_OBJECT_EOF sentinel. */
+static obd_off lov_offset_mod(obd_off val, int delta)
+{
+	/* The EOF marker is passed through untouched. */
+	return val == OBD_OBJECT_EOF ? val : val + delta;
+}
+
+static int lov_io_iter_init(const struct lu_env *env,
+			    const struct cl_io_slice *ios)
+{
+	struct lov_io	*lio = cl2lov_io(env, ios);
+	struct lov_stripe_md *lsm = lio->lis_object->lo_lsm;
+	struct lov_io_sub    *sub;
+	obd_off endpos;
+	obd_off start;
+	obd_off end;
+	int stripe;
+	int rc = 0;
+
+	ENTRY;
+	endpos = lov_offset_mod(lio->lis_endpos, -1); /* last byte, not end */
+	for (stripe = 0; stripe < lio->lis_stripe_count; stripe++) {
+		if (!lov_stripe_intersects(lsm, stripe, lio->lis_pos,
+					   endpos, &start, &end))
+			continue;	/* io range misses this stripe */
+
+		end = lov_offset_mod(end, +1);	/* back to exclusive end */
+		sub = lov_sub_get(env, lio, stripe);
+		if (!IS_ERR(sub)) {
+			lov_io_sub_inherit(sub->sub_io, lio, stripe,
+					   start, end);
+			rc = cl_io_iter_init(sub->sub_env, sub->sub_io);
+			lov_sub_put(sub);
+			CDEBUG(D_VFSTRACE, "shrink: %d ["LPU64", "LPU64")\n",
+			       stripe, start, end);
+		} else
+			rc = PTR_ERR(sub);
+
+		if (!rc)	/* queue sub-io for lov_io_call() */
+			list_add_tail(&sub->sub_linkage, &lio->lis_active);
+		else
+			break;	/* first failure aborts the scan */
+	}
+	RETURN(rc);
+}
+
+static int lov_io_rw_iter_init(const struct lu_env *env,
+			       const struct cl_io_slice *ios)
+{
+	struct lov_io	*lio = cl2lov_io(env, ios);
+	struct cl_io	 *io  = ios->cis_io;
+	struct lov_stripe_md *lsm = lio->lis_object->lo_lsm;
+	loff_t start = io->u.ci_rw.crw_pos;
+	loff_t next;
+	unsigned long ssize = lsm->lsm_stripe_size;
+
+	LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+	ENTRY;
+
+	/* One sub-io or append skips the clipping; others clip to a chunk. */
+	if (lio->lis_nr_subios != 1 && !cl_io_is_append(io)) {
+
+		lov_do_div64(start, ssize); /* presumably start /= ssize */
+		next = (start + 1) * ssize; /* start of the next chunk */
+		if (next <= start * ssize) /* multiplication wrapped */
+			next = ~0ull; /* NOTE(review): -1 in signed loff_t */
+
+		io->ci_continue = next < lio->lis_io_endpos;
+		io->u.ci_rw.crw_count = min_t(loff_t, lio->lis_io_endpos,
+					      next) - io->u.ci_rw.crw_pos;
+		lio->lis_pos    = io->u.ci_rw.crw_pos;
+		lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count;
+		CDEBUG(D_VFSTRACE, "stripe: "LPU64" chunk: ["LPU64", "LPU64") "
+		       LPU64"\n", (__u64)start, lio->lis_pos, lio->lis_endpos,
+		       (__u64)lio->lis_io_endpos);
+	}
+	/*
+	 * XXX The following call should be optimized: we know, that
+	 * [lio->lis_pos, lio->lis_endpos) intersects with exactly one stripe.
+	 */
+	RETURN(lov_io_iter_init(env, ios));
+}
+
+static int lov_io_call(const struct lu_env *env, struct lov_io *lio,
+		       int (*iofunc)(const struct lu_env *, struct cl_io *))
+{
+	struct cl_io *parent = lio->lis_cl.cis_io;
+	struct lov_io_sub *sub;
+	int rc = 0;
+
+	ENTRY;	/* apply \a iofunc to each active sub-io, in list order */
+	list_for_each_entry(sub, &lio->lis_active, sub_linkage) {
+		lov_sub_enter(sub);
+		rc = iofunc(sub->sub_env, sub->sub_io);
+		lov_sub_exit(sub);
+		if (rc)
+			break;	/* stop at first failure; rc is returned */
+
+		if (parent->ci_result == 0) /* keep the first non-zero result */
+			parent->ci_result = sub->sub_io->ci_result;
+	}
+	RETURN(rc);
+}
+
+static int lov_io_lock(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+	ENTRY;	/* cl_io_lock() on every active sub-io */
+	RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_lock));
+}
+
+static int lov_io_start(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+	ENTRY;	/* cl_io_start() on every active sub-io */
+	RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_start));
+}
+
+static int lov_io_end_wrapper(const struct lu_env *env, struct cl_io *io)
+{
+	ENTRY;
+	/*
+	 * It's possible that lov_io_start() wasn't called against this
+	 * sub-io, either because previous sub-io failed, or upper layer
+	 * completed IO.
+	 */
+	if (io->ci_state == CIS_IO_GOING)
+		cl_io_end(env, io);
+	else
+		io->ci_state = CIS_IO_FINISHED;	/* never started: just mark */
+	RETURN(0);	/* always 0 so lov_io_call() visits every sub-io */
+}
+
+static int lov_io_iter_fini_wrapper(const struct lu_env *env, struct cl_io *io)
+{
+	cl_io_iter_fini(env, io);	/* result, if any, is not used */
+	RETURN(0);	/* int-returning shape required by lov_io_call() */
+}
+
+static int lov_io_unlock_wrapper(const struct lu_env *env, struct cl_io *io)
+{
+	cl_io_unlock(env, io);	/* result, if any, is not used */
+	RETURN(0);	/* int-returning shape required by lov_io_call() */
+}
+
+static void lov_io_end(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+	int rc;
+
+	rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_end_wrapper);
+	LASSERT(rc == 0);	/* the wrapper always returns 0 */
+}
+
+static void lov_io_iter_fini(const struct lu_env *env,
+			     const struct cl_io_slice *ios)
+{
+	struct lov_io *lio = cl2lov_io(env, ios);
+	int rc;
+
+	ENTRY;
+	rc = lov_io_call(env, lio, lov_io_iter_fini_wrapper);
+	LASSERT(rc == 0);	/* the wrapper always returns 0 */
+	while (!list_empty(&lio->lis_active))	/* empty the active list */
+		list_del_init(lio->lis_active.next);
+	EXIT;
+}
+
+static void lov_io_unlock(const struct lu_env *env,
+			  const struct cl_io_slice *ios)
+{
+	int rc;
+
+	ENTRY;
+	rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_unlock_wrapper);
+	LASSERT(rc == 0);	/* the wrapper always returns 0 */
+	EXIT;
+}
+
+
+static struct cl_page_list *lov_io_submit_qin(struct lov_device *ld,
+					      struct cl_page_list *qin,
+					      int idx, int alloc)
+{
+	return alloc ? &qin[idx] : &ld->ld_emrg[idx]->emrg_page_list; /* else: emergency list */
+}
+
+/**
+ * lov implementation of cl_operations::cio_submit() method. It takes a list
+ * of pages in \a queue, splits it into per-stripe sub-lists, invokes
+ * cl_io_submit() on underlying devices to submit sub-lists, and then splices
+ * everything back.
+ *
+ * Major complication of this function is a need to handle memory cleansing:
+ * cl_io_submit() is called to write out pages as a part of VM memory
+ * reclamation, and hence it may not fail due to memory shortages (system
+ * dead-locks otherwise). To deal with this, some resources (sub-lists,
+ * sub-environment, etc.) are allocated per-device on "startup" (i.e., in a
+ * not-memory cleansing context), and in case of memory shortage, these
+ * pre-allocated resources are used by lov_io_submit() under
+ * lov_device::ld_mutex mutex.
+ */
+static int lov_io_submit(const struct lu_env *env,
+			 const struct cl_io_slice *ios,
+			 enum cl_req_type crt, struct cl_2queue *queue)
+{
+	struct lov_io	  *lio = cl2lov_io(env, ios);
+	struct lov_object      *obj = lio->lis_object;
+	struct lov_device       *ld = lu2lov_dev(lov2cl(obj)->co_lu.lo_dev);
+	struct cl_page_list    *qin = &queue->c2_qin;
+	struct cl_2queue      *cl2q = &lov_env_info(env)->lti_cl2q;
+	struct cl_page_list *stripes_qin = NULL;
+	struct cl_page *page;
+	struct cl_page *tmp;
+	int stripe;
+
+#define QIN(stripe) lov_io_submit_qin(ld, stripes_qin, stripe, alloc)
+
+	int rc = 0;
+	int alloc =	/* 0 when running in memory-reclaim context */
+		!(current->flags & PF_MEMALLOC);
+	ENTRY;
+	if (lio->lis_active_subios == 1) {	/* one sub-io: pass through */
+		int idx = lio->lis_single_subio_index;
+		struct lov_io_sub *sub;
+
+		LASSERT(idx >= 0 && idx < lio->lis_nr_subios); /* -1: unused */
+		sub = lov_sub_get(env, lio, idx);
+		LASSERT(!IS_ERR(sub));
+		LASSERT(sub->sub_io == &lio->lis_single_subio);
+		rc = cl_io_submit_rw(sub->sub_env, sub->sub_io,
+				     crt, queue);
+		lov_sub_put(sub);
+		RETURN(rc);
+	}
+
+	LASSERT(lio->lis_subs != NULL);
+	if (alloc) {
+		OBD_ALLOC_LARGE(stripes_qin,
+				sizeof(*stripes_qin) * lio->lis_nr_subios);
+		if (stripes_qin == NULL)
+			RETURN(-ENOMEM);
+
+		for (stripe = 0; stripe < lio->lis_nr_subios; stripe++)
+			cl_page_list_init(&stripes_qin[stripe]);
+	} else {
+		/*
+		 * If we get here, pageout & swap did not help. To avoid
+		 * making things worse, do not even try to allocate memory
+		 * with __GFP_NOWARN; use the emergency resources. -jay
+		 */
+		mutex_lock(&ld->ld_mutex);
+		lio->lis_mem_frozen = 1;	/* seen by lov_io_sub_init() */
+	}
+
+	cl_2queue_init(cl2q);
+	cl_page_list_for_each_safe(page, tmp, qin) {	/* sort by stripe */
+		stripe = lov_page_stripe(page);
+		cl_page_list_move(QIN(stripe), qin, page);
+	}
+
+	for (stripe = 0; stripe < lio->lis_nr_subios; stripe++) {
+		struct lov_io_sub   *sub;
+		struct cl_page_list *sub_qin = QIN(stripe);
+
+		if (list_empty(&sub_qin->pl_pages))
+			continue;
+
+		cl_page_list_splice(sub_qin, &cl2q->c2_qin);
+		sub = lov_sub_get(env, lio, stripe);
+		if (!IS_ERR(sub)) {
+			rc = cl_io_submit_rw(sub->sub_env, sub->sub_io,
+					     crt, cl2q);
+			lov_sub_put(sub);
+		} else
+			rc = PTR_ERR(sub);
+		cl_page_list_splice(&cl2q->c2_qin,  &queue->c2_qin);
+		cl_page_list_splice(&cl2q->c2_qout, &queue->c2_qout);
+		if (rc != 0)
+			break;	/* later stripes are handed back below */
+	}
+
+	for (stripe = 0; stripe < lio->lis_nr_subios; stripe++) {
+		struct cl_page_list *sub_qin = QIN(stripe);
+
+		if (list_empty(&sub_qin->pl_pages))
+			continue;
+
+		cl_page_list_splice(sub_qin, qin); /* unsubmitted pages */
+	}
+
+	if (alloc) {
+		OBD_FREE_LARGE(stripes_qin,
+			 sizeof(*stripes_qin) * lio->lis_nr_subios);
+	} else {
+		int i;
+
+		for (i = 0; i < lio->lis_nr_subios; i++) {
+			struct cl_io *cio = lio->lis_subs[i].sub_io;
+
+			/* give back only sub-ios borrowed from ld_emrg */
+			if (cio && cio == &ld->ld_emrg[i]->emrg_subio)
+				lov_io_sub_fini(env, lio, &lio->lis_subs[i]);
+		}
+		lio->lis_mem_frozen = 0;
+		mutex_unlock(&ld->ld_mutex);
+	}
+
+	RETURN(rc);
+#undef QIN
+}
+
+/* Forward cio_prepare_write to the sub-io of the stripe backing the page. */
+static int lov_io_prepare_write(const struct lu_env *env,
+				const struct cl_io_slice *ios,
+				const struct cl_page_slice *slice,
+				unsigned from, unsigned to)
+{
+	struct lov_io     *lio  = cl2lov_io(env, ios);
+	struct cl_page    *page = lov_sub_page(slice);
+	struct lov_io_sub *sub;
+	int rc;
+
+	ENTRY;
+	sub = lov_page_subio(env, lio, slice);
+	if (IS_ERR(sub))
+		RETURN(PTR_ERR(sub));
+
+	rc = cl_io_prepare_write(sub->sub_env, sub->sub_io, page, from, to);
+	lov_sub_put(sub);
+	RETURN(rc);
+}
+
+/* Forward cio_commit_write to the sub-io of the stripe backing the page. */
+static int lov_io_commit_write(const struct lu_env *env,
+			       const struct cl_io_slice *ios,
+			       const struct cl_page_slice *slice,
+			       unsigned from, unsigned to)
+{
+	struct lov_io     *lio  = cl2lov_io(env, ios);
+	struct cl_page    *page = lov_sub_page(slice);
+	struct lov_io_sub *sub;
+	int rc;
+
+	ENTRY;
+	sub = lov_page_subio(env, lio, slice);
+	if (IS_ERR(sub))
+		RETURN(PTR_ERR(sub));
+
+	rc = cl_io_commit_write(sub->sub_env, sub->sub_io, page, from, to);
+	lov_sub_put(sub);
+	RETURN(rc);
+}
+
+static int lov_io_fault_start(const struct lu_env *env,
+			      const struct cl_io_slice *ios)
+{
+	struct cl_fault_io *fio = &ios->cis_io->u.ci_fault;
+	struct lov_io      *lio = cl2lov_io(env, ios);
+	struct lov_io_sub  *sub;
+
+	ENTRY;	/* pass ft_nob to the faulting stripe, then start sub-ios */
+	sub = lov_sub_get(env, lio, lov_page_stripe(fio->ft_page));
+	if (IS_ERR(sub))	/* sub-io setup failed: no deref of ERR_PTR */
+		RETURN(PTR_ERR(sub));
+	sub->sub_io->u.ci_fault.ft_nob = fio->ft_nob;
+	lov_sub_put(sub);
+	RETURN(lov_io_start(env, ios));
+}
+
+static void lov_io_fsync_end(const struct lu_env *env,
+			     const struct cl_io_slice *ios)
+{
+	struct lov_io *lio = cl2lov_io(env, ios);
+	struct lov_io_sub *sub;
+	unsigned int *written = &ios->cis_io->u.ci_fsync.fi_nr_written;
+	ENTRY;
+
+	*written = 0;	/* parent total is rebuilt from the sub-ios */
+	list_for_each_entry(sub, &lio->lis_active, sub_linkage) {
+		struct cl_io *subio = sub->sub_io;
+
+		lov_sub_enter(sub);
+		lov_io_end_wrapper(sub->sub_env, subio);
+		lov_sub_exit(sub);
+
+		if (subio->ci_result == 0)	/* skip failed sub-ios */
+			*written += subio->u.ci_fsync.fi_nr_written;
+	}
+	RETURN_EXIT;
+}
+
+static const struct cl_io_operations lov_io_ops = { /* lov_io_init_raid0() */
+	.op = {
+		[CIT_READ] = {
+			.cio_fini      = lov_io_fini,
+			.cio_iter_init = lov_io_rw_iter_init, /* rw variant */
+			.cio_iter_fini = lov_io_iter_fini,
+			.cio_lock      = lov_io_lock,
+			.cio_unlock    = lov_io_unlock,
+			.cio_start     = lov_io_start,
+			.cio_end       = lov_io_end
+		},
+		[CIT_WRITE] = {
+			.cio_fini      = lov_io_fini,
+			.cio_iter_init = lov_io_rw_iter_init, /* rw variant */
+			.cio_iter_fini = lov_io_iter_fini,
+			.cio_lock      = lov_io_lock,
+			.cio_unlock    = lov_io_unlock,
+			.cio_start     = lov_io_start,
+			.cio_end       = lov_io_end
+		},
+		[CIT_SETATTR] = {
+			.cio_fini      = lov_io_fini,
+			.cio_iter_init = lov_io_iter_init,
+			.cio_iter_fini = lov_io_iter_fini,
+			.cio_lock      = lov_io_lock,
+			.cio_unlock    = lov_io_unlock,
+			.cio_start     = lov_io_start,
+			.cio_end       = lov_io_end
+		},
+		[CIT_FAULT] = {
+			.cio_fini      = lov_io_fini,
+			.cio_iter_init = lov_io_iter_init,
+			.cio_iter_fini = lov_io_iter_fini,
+			.cio_lock      = lov_io_lock,
+			.cio_unlock    = lov_io_unlock,
+			.cio_start     = lov_io_fault_start, /* sets ft_nob */
+			.cio_end       = lov_io_end
+		},
+		[CIT_FSYNC] = {
+			.cio_fini      = lov_io_fini,
+			.cio_iter_init = lov_io_iter_init,
+			.cio_iter_fini = lov_io_iter_fini,
+			.cio_lock      = lov_io_lock,
+			.cio_unlock    = lov_io_unlock,
+			.cio_start     = lov_io_start,
+			.cio_end       = lov_io_fsync_end /* sums fi_nr_written */
+		},
+		[CIT_MISC] = {
+			.cio_fini   = lov_io_fini /* teardown only */
+		}
+	},
+	.req_op = {
+		 [CRT_READ] = {
+			 .cio_submit    = lov_io_submit
+		 },
+		 [CRT_WRITE] = {
+			 .cio_submit    = lov_io_submit
+		 }
+	 },
+	.cio_prepare_write = lov_io_prepare_write,
+	.cio_commit_write  = lov_io_commit_write
+};
+
+/*****************************************************************************
+ *
+ * Empty lov io operations.
+ *
+ */
+
+static void lov_empty_io_fini(const struct lu_env *env,
+			      const struct cl_io_slice *ios)
+{
+	struct lov_object *lov = cl2lov(ios->cis_obj);
+	ENTRY;
+
+	if (atomic_dec_and_test(&lov->lo_active_ios)) /* last active io */
+		wake_up_all(&lov->lo_waitq);
+	EXIT;
+}
+
+static void lov_empty_impossible(const struct lu_env *env,
+				 struct cl_io_slice *ios)
+{
+	LBUG();	/* placeholder for slots that must never be invoked */
+}
+
+#define LOV_EMPTY_IMPOSSIBLE ((void *)lov_empty_impossible) /* fits any slot */
+
+/**
+ * An io operation vector for files without stripes.
+ */
+static const struct cl_io_operations lov_empty_io_ops = {
+	.op = {
+		[CIT_READ] = {
+			.cio_fini       = lov_empty_io_fini,
+#if 0 /* disabled: lov_io_init_empty() accepts CIT_READ */
+			.cio_iter_init  = LOV_EMPTY_IMPOSSIBLE,
+			.cio_lock       = LOV_EMPTY_IMPOSSIBLE,
+			.cio_start      = LOV_EMPTY_IMPOSSIBLE,
+			.cio_end	= LOV_EMPTY_IMPOSSIBLE
+#endif
+		},
+		[CIT_WRITE] = {	/* lov_io_init_empty() fails with -EBADF */
+			.cio_fini      = lov_empty_io_fini,
+			.cio_iter_init = LOV_EMPTY_IMPOSSIBLE,
+			.cio_lock      = LOV_EMPTY_IMPOSSIBLE,
+			.cio_start     = LOV_EMPTY_IMPOSSIBLE,
+			.cio_end       = LOV_EMPTY_IMPOSSIBLE
+		},
+		[CIT_SETATTR] = {	/* init returns +1: no slice added */
+			.cio_fini      = lov_empty_io_fini,
+			.cio_iter_init = LOV_EMPTY_IMPOSSIBLE,
+			.cio_lock      = LOV_EMPTY_IMPOSSIBLE,
+			.cio_start     = LOV_EMPTY_IMPOSSIBLE,
+			.cio_end       = LOV_EMPTY_IMPOSSIBLE
+		},
+		[CIT_FAULT] = {	/* lov_io_init_empty() fails with -EFAULT */
+			.cio_fini      = lov_empty_io_fini,
+			.cio_iter_init = LOV_EMPTY_IMPOSSIBLE,
+			.cio_lock      = LOV_EMPTY_IMPOSSIBLE,
+			.cio_start     = LOV_EMPTY_IMPOSSIBLE,
+			.cio_end       = LOV_EMPTY_IMPOSSIBLE
+		},
+		[CIT_FSYNC] = {
+			.cio_fini   = lov_empty_io_fini
+		},
+		[CIT_MISC] = {
+			.cio_fini   = lov_empty_io_fini
+		}
+	},
+	.req_op = {
+		 [CRT_READ] = {
+			 .cio_submit    = LOV_EMPTY_IMPOSSIBLE
+		 },
+		 [CRT_WRITE] = {
+			 .cio_submit    = LOV_EMPTY_IMPOSSIBLE
+		 }
+	 },
+	.cio_commit_write = LOV_EMPTY_IMPOSSIBLE /* cio_prepare_write unset */
+};
+
+int lov_io_init_raid0(const struct lu_env *env, struct cl_object *obj,
+		      struct cl_io *io)
+{
+	struct lov_io       *lio = lov_env_io(env);
+	struct lov_object   *lov = cl2lov(obj);
+
+	ENTRY;
+	INIT_LIST_HEAD(&lio->lis_active);
+	lov_io_slice_init(lio, lov, io);	/* sets io->ci_result = 0 */
+	if (io->ci_result == 0) {
+		io->ci_result = lov_io_subio_init(env, lio, io);
+		if (io->ci_result == 0) {
+			cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops);
+			atomic_inc(&lov->lo_active_ios); /* dropped in lov_io_fini() */
+		}
+	}
+	RETURN(io->ci_result);	/* 0 or the lov_io_subio_init() error */
+}
+
+int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj,
+		      struct cl_io *io)
+{
+	struct lov_object *lov = cl2lov(obj);
+	struct lov_io *lio = lov_env_io(env);
+	int result;
+	ENTRY;
+
+	lio->lis_object = lov;
+	switch (io->ci_type) {
+	default:
+		LBUG();	/* NOTE(review): falls into CIT_MISC if this returns */
+	case CIT_MISC:
+	case CIT_READ:
+		result = 0;	/* accepted: slice is added below */
+		break;
+	case CIT_FSYNC:
+	case CIT_SETATTR:
+		result = +1;	/* no slice added, ci_result stays 0 */
+		break;
+	case CIT_WRITE:
+		result = -EBADF;
+		break;
+	case CIT_FAULT:
+		result = -EFAULT;
+		CERROR("Page fault on a file without stripes: "DFID"\n",
+		       PFID(lu_object_fid(&obj->co_lu)));
+		break;
+	}
+	if (result == 0) {
+		cl_io_slice_add(io, &lio->lis_cl, obj, &lov_empty_io_ops);
+		atomic_inc(&lov->lo_active_ios); /* see lov_empty_io_fini() */
+	}
+
+	io->ci_result = result < 0 ? result : 0; /* only errors recorded */
+	RETURN(result != 0);	/* 0 only when the slice was added */
+}
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lov_lock.c b/drivers/staging/lustre/lustre/lov/lov_lock.c
new file mode 100644
index 000000000000..bdf3334e0c9f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_lock.c
@@ -0,0 +1,1253 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_lock for LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+static struct cl_lock_closure *lov_closure_get(const struct lu_env *env,
+					       struct cl_lock *parent);
+
+static int lov_lock_unuse(const struct lu_env *env,
+			  const struct cl_lock_slice *slice);
+/*****************************************************************************
+ *
+ * Lov lock operations.
+ *
+ */
+
+static struct lov_sublock_env *lov_sublock_env_get(const struct lu_env *env,
+						   struct cl_lock *parent,
+						   struct lov_lock_sub *lls)
+{
+	struct lov_sublock_env *subenv;
+	struct lov_io	  *lio    = lov_env_io(env);
+	struct cl_io	   *io     = lio->lis_cl.cis_io;
+	struct lov_io_sub      *sub;
+
+	subenv = &lov_env_session(env)->ls_subenv;
+
+	/*
+	 * FIXME: We tend to use the subio's env & io to call the sublock
+	 * lock operations because osc lock sometimes stores some control
+	 * variables in thread's IO information (now only lockless information).
+	 * However, if the lock's host(object) is different from the object
+	 * for current IO, we have no way to get the subenv and subio because
+	 * they are not initialized at all. As a temp fix, in this case,
+	 * we still borrow the parent's env to call sublock operations.
+	 */
+	if (!io || !cl_object_same(io->ci_obj, parent->cll_descr.cld_obj)) {
+		subenv->lse_env = env;
+		subenv->lse_io  = io;
+		subenv->lse_sub = NULL; /* lov_sublock_env_put() does nothing */
+	} else {
+		sub = lov_sub_get(env, lio, lls->sub_stripe);
+		if (!IS_ERR(sub)) {
+			subenv->lse_env = sub->sub_env;
+			subenv->lse_io  = sub->sub_io;
+			subenv->lse_sub = sub; /* for lov_sublock_env_put() */
+		} else {
+			subenv = ERR_CAST(sub); /* keep the encoded errno */
+		}
+	}
+	return subenv; /* session subenv, or ERR_PTR from lov_sub_get() */
+}
+
+static void lov_sublock_env_put(struct lov_sublock_env *subenv)
+{
+	if (subenv == NULL || subenv->lse_sub == NULL)
+		return; /* no sub-io reference recorded: nothing to drop */
+	lov_sub_put(subenv->lse_sub);
+}
+
+static void lov_sublock_adopt(const struct lu_env *env, struct lov_lock *lck,
+			      struct cl_lock *sublock, int idx,
+			      struct lov_lock_link *link)
+{
+	struct lovsub_lock *lsl;
+	struct cl_lock     *parent = lck->lls_cl.cls_lock;
+	int		 rc;
+
+	LASSERT(cl_lock_is_mutexed(parent)); /* caller holds both mutexes */
+	LASSERT(cl_lock_is_mutexed(sublock));
+	ENTRY;
+
+	lsl = cl2sub_lock(sublock);
+	/*
+	 * Adopt @sublock into slot @idx; it must not be linked to @lck yet.
+	 */
+	LASSERT(lov_lock_link_find(env, lck, lsl) == NULL);
+	LASSERT(idx < lck->lls_nr);
+
+	lck->lls_sub[idx].sub_lock = lsl;
+	lck->lls_nr_filled++;
+	LASSERT(lck->lls_nr_filled <= lck->lls_nr);
+	list_add_tail(&link->lll_list, &lsl->lss_parents);
+	link->lll_idx = idx;
+	link->lll_super = lck;
+	cl_lock_get(parent); /* put back by lov_lock_unlink() */
+	lu_ref_add(&parent->cll_reference, "lov-child", sublock);
+	lck->lls_sub[idx].sub_flags |= LSF_HELD;
+	cl_lock_user_add(env, sublock);
+
+	rc = lov_sublock_modify(env, lck, lsl, &sublock->cll_descr, idx);
+	LASSERT(rc == 0); /* there is no way this can fail, currently */
+	EXIT;
+}
+
+static struct cl_lock *lov_sublock_alloc(const struct lu_env *env,
+					 const struct cl_io *io,
+					 struct lov_lock *lck,
+					 int idx, struct lov_lock_link **out)
+{
+	struct cl_lock       *sublock;
+	struct cl_lock       *parent;
+	struct lov_lock_link *link;
+
+	LASSERT(idx < lck->lls_nr);
+	ENTRY;
+
+	OBD_SLAB_ALLOC_PTR_GFP(link, lov_lock_link_kmem, __GFP_IO);
+	if (link != NULL) {
+		struct lov_sublock_env *subenv;
+		struct lov_lock_sub  *lls;
+		struct cl_lock_descr *descr;
+
+		parent = lck->lls_cl.cls_lock;
+		lls    = &lck->lls_sub[idx];
+		descr  = &lls->sub_got; /* extent the sub-lock is taken on */
+
+		subenv = lov_sublock_env_get(env, parent, lls);
+		if (!IS_ERR(subenv)) {
+			/* CAVEAT: Don't try to add a field in lov_lock_sub
+			 * to remember the subio. This is because lock is able
+			 * to be cached, but this is not true for IO. This
+			 * further means a sublock might be referenced in
+			 * different io context. -jay */
+
+			sublock = cl_lock_hold(subenv->lse_env, subenv->lse_io,
+					       descr, "lov-parent", parent);
+			lov_sublock_env_put(subenv);
+		} else {
+			/* pass the lov_sublock_env_get() error to the caller */
+			sublock = ERR_CAST(subenv);
+		}
+
+		if (!IS_ERR(sublock))
+			*out = link; /* set on success only */
+		else
+			OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem);
+	} else
+		sublock = ERR_PTR(-ENOMEM);
+	RETURN(sublock); /* sub-lock from cl_lock_hold(), or an ERR_PTR */
+}
+
+static void lov_sublock_unlock(const struct lu_env *env,
+			       struct lovsub_lock *lsl,
+			       struct cl_lock_closure *closure,
+			       struct lov_sublock_env *subenv)
+{
+	ENTRY;
+	lov_sublock_env_put(subenv); /* @subenv may be NULL */
+	lsl->lss_active = NULL; /* was set by lov_sublock_lock() */
+	cl_lock_disclosure(env, closure);
+	EXIT;
+}
+
+static int lov_sublock_lock(const struct lu_env *env,
+			    struct lov_lock *lck,
+			    struct lov_lock_sub *lls,
+			    struct cl_lock_closure *closure,
+			    struct lov_sublock_env **lsep)
+{
+	struct lovsub_lock *sublock;
+	struct cl_lock     *child;
+	int		 result = 0;
+	ENTRY;
+
+	LASSERT(list_empty(&closure->clc_list));
+
+	sublock = lls->sub_lock;
+	child = sublock->lss_cl.cls_lock;
+	result = cl_lock_closure_build(env, child, closure);
+	if (result == 0) {
+		struct cl_lock *parent = closure->clc_origin;
+
+		LASSERT(cl_lock_is_mutexed(child));
+		sublock->lss_active = parent; /* reset in lov_sublock_unlock() */
+
+		if (unlikely((child->cll_state == CLS_FREEING) ||
+			     (child->cll_flags & CLF_CANCELLED))) {
+			struct lov_lock_link *link;
+			/*
+			 * we could race with lock deletion which temporarily
+			 * puts the lock in freeing state, bug 19080.
+			 */
+			LASSERT(!(lls->sub_flags & LSF_HELD));
+
+			link = lov_lock_link_find(env, lck, sublock);
+			LASSERT(link != NULL);
+			lov_lock_unlink(env, link, sublock);
+			lov_sublock_unlock(env, sublock, closure, NULL);
+			lck->lls_cancel_race = 1; /* read by lov_lock_{use,unuse}() */
+			result = CLO_REPEAT;
+		} else if (lsep) {
+			struct lov_sublock_env *subenv;
+			subenv = lov_sublock_env_get(env, parent, lls);
+			if (IS_ERR(subenv)) {
+				lov_sublock_unlock(env, sublock,
+						   closure, NULL);
+				result = PTR_ERR(subenv);
+			} else {
+				*lsep = subenv; /* only set when 0 is returned */
+			}
+		}
+	}
+	RETURN(result);
+}
+
+/**
+ * Updates the result of a top-lock operation from a result of sub-lock
+ * sub-operations. Top-operations like lov_lock_{enqueue,use,unuse}() iterate
+ * over sub-locks and lov_subresult() is used to calculate return value of a
+ * top-operation. To this end, possible return values of sub-operations are
+ * ordered as
+ *
+ *     - 0		  success
+ *     - CLO_WAIT	   wait for event
+ *     - CLO_REPEAT	 repeat top-operation
+ *     - -ne		fundamental error
+ *
+ * Top-level return code can only go down through this list. CLO_REPEAT
+ * overwrites CLO_WAIT, because lock mutex was released and sleeping condition
+ * has to be rechecked by the upper layer.
+ */
+static int lov_subresult(int result, int rc)
+{
+	int result_rank;
+	int rc_rank;
+
+	ENTRY;
+
+	LASSERTF(result <= 0 || result == CLO_REPEAT || result == CLO_WAIT,
+		 "result = %d\n", result);
+	LASSERTF(rc <= 0 || rc == CLO_REPEAT || rc == CLO_WAIT,
+		 "rc = %d\n", rc);
+	CLASSERT(CLO_WAIT < CLO_REPEAT);
+
+	/* calculate ranks in the ordering above */
+	result_rank = result < 0 ? 1 + CLO_REPEAT : result;
+	rc_rank = rc < 0 ? 1 + CLO_REPEAT : rc;
+
+	if (result_rank < rc_rank) /* @rc ranks strictly later: it wins */
+		result = rc;
+	RETURN(result);
+}
+
+/**
+ * Creates sub-locks for a given lov_lock for the first time.
+ *
+ * Goes through all sub-objects of top-object, and creates sub-locks on every
+ * sub-object intersecting with top-lock extent. This is complicated by the
+ * fact that top-lock (that is being created) can be accessed concurrently
+ * through already created sub-locks (possibly shared with other top-locks).
+ */
+static int lov_lock_sub_init(const struct lu_env *env,
+			     struct lov_lock *lck, const struct cl_io *io)
+{
+	int result = 0;
+	int i;
+	int nr;
+	obd_off start;
+	obd_off end;
+	obd_off file_start;
+	obd_off file_end;
+
+	struct lov_object       *loo    = cl2lov(lck->lls_cl.cls_obj);
+	struct lov_layout_raid0 *r0     = lov_r0(loo);
+	struct cl_lock	  *parent = lck->lls_cl.cls_lock;
+
+	ENTRY;
+
+	lck->lls_orig = parent->cll_descr;
+	file_start = cl_offset(lov2cl(loo), parent->cll_descr.cld_start);
+	file_end   = cl_offset(lov2cl(loo), parent->cll_descr.cld_end + 1) - 1;
+
+	for (i = 0, nr = 0; i < r0->lo_nr; i++) {
+		/*
+		 * XXX for wide striping a smarter algorithm is desirable,
+		 * one that breaks out of the loop early.
+		 */
+		if (lov_stripe_intersects(loo->lo_lsm, i,
+					  file_start, file_end, &start, &end))
+			nr++;
+	}
+	LASSERT(nr > 0);
+	OBD_ALLOC_LARGE(lck->lls_sub, nr * sizeof lck->lls_sub[0]);
+	if (lck->lls_sub == NULL)
+		RETURN(-ENOMEM);
+
+	lck->lls_nr = nr; /* lls_sub[] is freed by lov_lock_fini() */
+	/*
+	 * First, fill in sub-lock descriptions in
+	 * lck->lls_sub[].sub_descr and copy them to .sub_got, which is what
+	 * lov_sublock_alloc() (called below in this function, and through
+	 * lov_sublock_fill() by lov_lock_enqueue()) uses to create
+	 * sub-locks. At this moment, no other thread can access top-lock.
+	 */
+	for (i = 0, nr = 0; i < r0->lo_nr; ++i) {
+		if (lov_stripe_intersects(loo->lo_lsm, i,
+					  file_start, file_end, &start, &end)) {
+			struct cl_lock_descr *descr;
+
+			descr = &lck->lls_sub[nr].sub_descr;
+
+			LASSERT(descr->cld_obj == NULL);
+			descr->cld_obj   = lovsub2cl(r0->lo_sub[i]);
+			descr->cld_start = cl_index(descr->cld_obj, start);
+			descr->cld_end   = cl_index(descr->cld_obj, end);
+			descr->cld_mode  = parent->cll_descr.cld_mode;
+			descr->cld_gid   = parent->cll_descr.cld_gid;
+			descr->cld_enq_flags   = parent->cll_descr.cld_enq_flags;
+			/* XXX has no effect */
+			lck->lls_sub[nr].sub_got = *descr;
+			lck->lls_sub[nr].sub_stripe = i;
+			nr++;
+		}
+	}
+	LASSERT(nr == lck->lls_nr);
+	/*
+	 * Then, create sub-locks. Once at least one sub-lock was created,
+	 * top-lock can be reached by other threads.
+	 */
+	for (i = 0; i < lck->lls_nr; ++i) {
+		struct cl_lock       *sublock;
+		struct lov_lock_link *link;
+
+		if (lck->lls_sub[i].sub_lock == NULL) {
+			sublock = lov_sublock_alloc(env, io, lck, i, &link);
+			if (IS_ERR(sublock)) {
+				result = PTR_ERR(sublock);
+				break;
+			}
+			cl_lock_get_trust(sublock);
+			cl_lock_mutex_get(env, sublock);
+			cl_lock_mutex_get(env, parent);
+			/*
+			 * recheck under mutex that sub-lock wasn't created
+			 * concurrently, and that top-lock is still alive.
+			 */
+			if (lck->lls_sub[i].sub_lock == NULL &&
+			    parent->cll_state < CLS_FREEING) {
+				lov_sublock_adopt(env, lck, sublock, i, link);
+				cl_lock_mutex_put(env, parent);
+			} else {
+				OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem);
+				cl_lock_mutex_put(env, parent);
+				cl_lock_unhold(env, sublock,
+					       "lov-parent", parent);
+			}
+			cl_lock_mutex_put(env, sublock);
+			cl_lock_put(env, sublock);
+		}
+	}
+	/*
+	 * Some sub-locks can be missing at this point. This is not a problem,
+	 * because enqueue will create them anyway. Main duty of this function
+	 * is to fill in sub-lock descriptions in a race free manner.
+	 */
+	RETURN(result); /* 0, or the error from lov_sublock_alloc() */
+}
+
+static int lov_sublock_release(const struct lu_env *env, struct lov_lock *lck,
+			       int i, int deluser, int rc)
+{
+	struct cl_lock *parent = lck->lls_cl.cls_lock;
+
+	LASSERT(cl_lock_is_mutexed(parent));
+	ENTRY;
+
+	if (lck->lls_sub[i].sub_flags & LSF_HELD) { /* else: nothing to do */
+		struct cl_lock    *sublock;
+		int dying;
+
+		LASSERT(lck->lls_sub[i].sub_lock != NULL);
+		sublock = lck->lls_sub[i].sub_lock->lss_cl.cls_lock;
+		LASSERT(cl_lock_is_mutexed(sublock));
+
+		lck->lls_sub[i].sub_flags &= ~LSF_HELD;
+		if (deluser) /* caller chooses whether the user count drops */
+			cl_lock_user_del(env, sublock);
+		/*
+		 * If the last hold is released, and cancellation is pending
+		 * for a sub-lock, release parent mutex, to avoid keeping it
+		 * while sub-lock is being paged out.
+		 */
+		dying = (sublock->cll_descr.cld_mode == CLM_PHANTOM ||
+			 sublock->cll_descr.cld_mode == CLM_GROUP ||
+			 (sublock->cll_flags & (CLF_CANCELPEND|CLF_DOOMED))) &&
+			sublock->cll_holds == 1;
+		if (dying)
+			cl_lock_mutex_put(env, parent);
+		cl_lock_unhold(env, sublock, "lov-parent", parent);
+		if (dying) {
+			cl_lock_mutex_get(env, parent);
+			rc = lov_subresult(rc, CLO_REPEAT);
+		}
+		/*
+		 * From now on lck->lls_sub[i].sub_lock is a "weak" pointer,
+		 * not backed by a reference on a
+		 * sub-lock. lovsub_lock_delete() will clear
+		 * lck->lls_sub[i].sub_lock under semaphores, just before
+		 * sub-lock is destroyed.
+		 */
+	}
+	RETURN(rc); /* @rc, merged with CLO_REPEAT if parent mutex dropped */
+}
+
+static void lov_sublock_hold(const struct lu_env *env, struct lov_lock *lck,
+			     int i)
+{
+	struct cl_lock *parent = lck->lls_cl.cls_lock;
+
+	LASSERT(cl_lock_is_mutexed(parent));
+	ENTRY;
+
+	if (!(lck->lls_sub[i].sub_flags & LSF_HELD)) { /* no-op if held */
+		struct cl_lock *sublock;
+
+		LASSERT(lck->lls_sub[i].sub_lock != NULL);
+		sublock = lck->lls_sub[i].sub_lock->lss_cl.cls_lock;
+		LASSERT(cl_lock_is_mutexed(sublock));
+		LASSERT(sublock->cll_state != CLS_FREEING);
+
+		lck->lls_sub[i].sub_flags |= LSF_HELD;
+
+		cl_lock_get_trust(sublock); /* dropped by cl_lock_put() below */
+		cl_lock_hold_add(env, sublock, "lov-parent", parent);
+		cl_lock_user_add(env, sublock);
+		cl_lock_put(env, sublock);
+	}
+	EXIT;
+}
+
+static void lov_lock_fini(const struct lu_env *env,
+			  struct cl_lock_slice *slice)
+{
+	struct lov_lock *lck;
+	int i;
+
+	ENTRY;
+	lck = cl2lov_lock(slice);
+	LASSERT(lck->lls_nr_filled == 0);
+	if (lck->lls_sub != NULL) {
+		for (i = 0; i < lck->lls_nr; ++i)
+			/*
+			 * No sub-locks exist at this point, as a sub-lock
+			 * has a reference on its parent.
+			 */
+			LASSERT(lck->lls_sub[i].sub_lock == NULL);
+		OBD_FREE_LARGE(lck->lls_sub,
+			       lck->lls_nr * sizeof lck->lls_sub[0]);
+	}
+	OBD_SLAB_FREE_PTR(lck, lov_lock_kmem); /* frees the lov_lock itself */
+	EXIT;
+}
+
+static int lov_lock_enqueue_wait(const struct lu_env *env,
+				 struct lov_lock *lck,
+				 struct cl_lock *sublock)
+{
+	struct cl_lock *lock = lck->lls_cl.cls_lock;
+	int	     result;
+	ENTRY;
+
+	LASSERT(cl_lock_is_mutexed(lock));
+
+	cl_lock_mutex_put(env, lock); /* top-lock mutex not held in the wait */
+	result = cl_lock_enqueue_wait(env, sublock, 0);
+	cl_lock_mutex_get(env, lock);
+	RETURN(result ?: CLO_REPEAT); /* 0 is reported as CLO_REPEAT */
+}
+
+/**
+ * Tries to advance a state machine of a given sub-lock toward enqueuing of
+ * the top-lock.
+ *
+ * \retval 0 if state-transition can proceed
+ * \retval otherwise the -ve error or CLO_* code of the sub-lock operation.
+ */
+static int lov_lock_enqueue_one(const struct lu_env *env, struct lov_lock *lck,
+				struct cl_lock *sublock,
+				struct cl_io *io, __u32 enqflags, int last)
+{
+	int result;
+	ENTRY;
+
+	/* first, try to enqueue a sub-lock ... */
+	result = cl_enqueue_try(env, sublock, io, enqflags);
+	if ((sublock->cll_state == CLS_ENQUEUED) && !(enqflags & CEF_AGL)) {
+		/* if it is enqueued, try to `wait' on it---maybe it's already
+		 * granted */
+		result = cl_wait_try(env, sublock);
+		if (result == CLO_REENQUEUED) /* callers only see CLO_WAIT */
+			result = CLO_WAIT;
+	}
+	/*
+	 * If CEF_ASYNC flag is set, then all sub-locks can be enqueued in
+	 * parallel, otherwise---enqueue has to wait until sub-lock is granted
+	 * before proceeding to the next one.
+	 */
+	if ((result == CLO_WAIT) && (sublock->cll_state <= CLS_HELD) &&
+	    (enqflags & CEF_ASYNC) && (!last || (enqflags & CEF_AGL)))
+		result = 0;
+	RETURN(result);
+}
+
+/**
+ * Helper function for lov_lock_enqueue() that creates missing sub-lock.
+ */
+static int lov_sublock_fill(const struct lu_env *env, struct cl_lock *parent,
+			    struct cl_io *io, struct lov_lock *lck, int idx)
+{
+	struct lov_lock_link *link;
+	struct cl_lock       *sublock;
+	int		   result;
+
+	LASSERT(parent->cll_depth == 1);
+	cl_lock_mutex_put(env, parent); /* not held across the allocation */
+	sublock = lov_sublock_alloc(env, io, lck, idx, &link);
+	if (!IS_ERR(sublock))
+		cl_lock_mutex_get(env, sublock);
+	cl_lock_mutex_get(env, parent);
+
+	if (!IS_ERR(sublock)) {
+		cl_lock_get_trust(sublock);
+		if (parent->cll_state == CLS_QUEUING &&
+		    lck->lls_sub[idx].sub_lock == NULL) {
+			lov_sublock_adopt(env, lck, sublock, idx, link);
+		} else {
+			OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem);
+			/* other thread allocated sub-lock, or enqueue is no
+			 * longer going on */
+			cl_lock_mutex_put(env, parent);
+			cl_lock_unhold(env, sublock, "lov-parent", parent);
+			cl_lock_mutex_get(env, parent);
+		}
+		cl_lock_mutex_put(env, sublock);
+		cl_lock_put(env, sublock);
+		result = CLO_REPEAT; /* parent mutex was released above */
+	} else
+		result = PTR_ERR(sublock);
+	return result; /* CLO_REPEAT, or the lov_sublock_alloc() error */
+}
+
+/**
+ * Implementation of cl_lock_operations::clo_enqueue() for lov layer. This
+ * function is rather subtle, as it enqueues top-lock (i.e., advances top-lock
+ * state machine from CLS_QUEUING to CLS_ENQUEUED states) by juggling sub-lock
+ * state machines in the face of sub-locks sharing (by multiple top-locks),
+ * and concurrent sub-lock cancellations.
+ */
+static int lov_lock_enqueue(const struct lu_env *env,
+			    const struct cl_lock_slice *slice,
+			    struct cl_io *io, __u32 enqflags)
+{
+	struct cl_lock	 *lock    = slice->cls_lock;
+	struct lov_lock	*lck     = cl2lov_lock(slice);
+	struct cl_lock_closure *closure = lov_closure_get(env, lock);
+	int i;
+	int result;
+	enum cl_lock_state minstate;
+
+	ENTRY;
+
+	for (result = 0, minstate = CLS_FREEING, i = 0; i < lck->lls_nr; ++i) {
+		int rc;
+		struct lovsub_lock     *sub;
+		struct lov_lock_sub    *lls;
+		struct cl_lock	 *sublock;
+		struct lov_sublock_env *subenv;
+
+		if (lock->cll_state != CLS_QUEUING) {
+			/*
+			 * Lock might have left QUEUING state if previous
+			 * iteration released its mutex. Stop enqueuing in
+			 * this case and let the upper layer decide what to do.
+			 */
+			LASSERT(i > 0 && result != 0);
+			break;
+		}
+
+		lls = &lck->lls_sub[i];
+		sub = lls->sub_lock;
+		/*
+		 * Sub-lock might have been canceled, while top-lock was
+		 * cached.
+		 */
+		if (sub == NULL) {
+			result = lov_sublock_fill(env, lock, io, lck, i);
+			/* lov_sublock_fill() released @lock mutex,
+			 * restart. */
+			break;
+		}
+		sublock = sub->lss_cl.cls_lock;
+		rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
+		if (rc == 0) {
+			lov_sublock_hold(env, lck, i);
+			rc = lov_lock_enqueue_one(subenv->lse_env, lck, sublock,
+						  subenv->lse_io, enqflags,
+						  i == lck->lls_nr - 1);
+			minstate = min(minstate, sublock->cll_state);
+			if (rc == CLO_WAIT) {
+				switch (sublock->cll_state) {
+				case CLS_QUEUING:
+					/* take recursive mutex, the lock is
+					 * released in lov_lock_enqueue_wait.
+					 */
+					cl_lock_mutex_get(env, sublock);
+					lov_sublock_unlock(env, sub, closure,
+							   subenv);
+					rc = lov_lock_enqueue_wait(env, lck,
+								   sublock);
+					break;
+				case CLS_CACHED:
+					cl_lock_get(sublock);
+					/* take recursive mutex of sublock */
+					cl_lock_mutex_get(env, sublock);
+					/* need to release all locks in closure
+					 * otherwise it may deadlock. LU-2683.*/
+					lov_sublock_unlock(env, sub, closure,
+							   subenv);
+					/* sublock and parent are held. */
+					rc = lov_sublock_release(env, lck, i,
+								 1, rc);
+					cl_lock_mutex_put(env, sublock);
+					cl_lock_put(env, sublock);
+					break;
+				default:
+					lov_sublock_unlock(env, sub, closure,
+							   subenv);
+					break;
+				}
+			} else {
+				LASSERT(sublock->cll_conflict == NULL);
+				lov_sublock_unlock(env, sub, closure, subenv);
+			}
+		}
+		result = lov_subresult(result, rc);
+		if (result != 0) /* first non-zero result ends the scan */
+			break;
+	}
+	cl_lock_closure_fini(closure);
+	RETURN(result ?: minstate >= CLS_ENQUEUED ? 0 : CLO_WAIT);
+}
+
+static int lov_lock_unuse(const struct lu_env *env,
+			  const struct cl_lock_slice *slice)
+{
+	struct lov_lock	*lck     = cl2lov_lock(slice);
+	struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+	int i;
+	int result;
+
+	ENTRY;
+
+	for (result = 0, i = 0; i < lck->lls_nr; ++i) {
+		int rc;
+		struct lovsub_lock     *sub;
+		struct cl_lock	 *sublock;
+		struct lov_lock_sub    *lls;
+		struct lov_sublock_env *subenv;
+
+		/* top-lock state cannot change concurrently, because a single
+		 * thread (the one that released the last hold) carries
+		 * unlocking to completion. */
+		LASSERT(slice->cls_lock->cll_state == CLS_INTRANSIT);
+		lls = &lck->lls_sub[i];
+		sub = lls->sub_lock;
+		if (sub == NULL) /* empty slot: nothing to unuse */
+			continue;
+
+		sublock = sub->lss_cl.cls_lock;
+		rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
+		if (rc == 0) {
+			if (lls->sub_flags & LSF_HELD) {
+				LASSERT(sublock->cll_state == CLS_HELD ||
+					sublock->cll_state == CLS_ENQUEUED);
+				rc = cl_unuse_try(subenv->lse_env, sublock);
+				rc = lov_sublock_release(env, lck, i, 0, rc);
+			}
+			lov_sublock_unlock(env, sub, closure, subenv);
+		}
+		result = lov_subresult(result, rc);
+	}
+
+	if (result == 0 && lck->lls_cancel_race) { /* set in lov_sublock_lock() */
+		lck->lls_cancel_race = 0;
+		result = -ESTALE;
+	}
+	cl_lock_closure_fini(closure);
+	RETURN(result);
+}
+
+
+static void lov_lock_cancel(const struct lu_env *env,
+			   const struct cl_lock_slice *slice)
+{
+	struct lov_lock	*lck     = cl2lov_lock(slice);
+	struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+	int i;
+	int result;
+
+	ENTRY;
+
+	for (result = 0, i = 0; i < lck->lls_nr; ++i) {
+		int rc;
+		struct lovsub_lock     *sub;
+		struct cl_lock	 *sublock;
+		struct lov_lock_sub    *lls;
+		struct lov_sublock_env *subenv;
+
+		/* top-lock state cannot change concurrently, because a single
+		 * thread (the one that released the last hold) carries
+		 * unlocking to completion. */
+		lls = &lck->lls_sub[i];
+		sub = lls->sub_lock;
+		if (sub == NULL)
+			continue;
+
+		sublock = sub->lss_cl.cls_lock;
+		rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
+		if (rc == 0) {
+			if (!(lls->sub_flags & LSF_HELD)) {
+				lov_sublock_unlock(env, sub, closure, subenv);
+				continue;
+			}
+
+			switch(sublock->cll_state) {
+			case CLS_HELD:
+				rc = cl_unuse_try(subenv->lse_env, sublock);
+				lov_sublock_release(env, lck, i, 0, 0);
+				break;
+			default: /* not HELD: drop the user count as well */
+				lov_sublock_release(env, lck, i, 1, 0);
+				break;
+			}
+			lov_sublock_unlock(env, sub, closure, subenv);
+		}
+
+		if (rc == CLO_REPEAT) {
+			--i; /* the loop's ++i then retries the same slot */
+			continue;
+		}
+
+		result = lov_subresult(result, rc);
+	}
+
+	if (result) /* nothing is returned, so the failure is only logged */
+		CL_LOCK_DEBUG(D_ERROR, env, slice->cls_lock,
+			      "lov_lock_cancel fails with %d.\n", result);
+
+	cl_lock_closure_fini(closure);
+}
+
+static int lov_lock_wait(const struct lu_env *env,
+			 const struct cl_lock_slice *slice)
+{
+	struct lov_lock	*lck     = cl2lov_lock(slice);
+	struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+	enum cl_lock_state      minstate;
+	int		     reenqueued;
+	int		     result;
+	int		     i;
+
+	ENTRY;
+
+again:
+	for (result = 0, minstate = CLS_FREEING, i = 0, reenqueued = 0;
+	     i < lck->lls_nr; ++i) {
+		int rc;
+		struct lovsub_lock     *sub;
+		struct cl_lock	 *sublock;
+		struct lov_lock_sub    *lls;
+		struct lov_sublock_env *subenv;
+
+		lls = &lck->lls_sub[i];
+		sub = lls->sub_lock;
+		LASSERT(sub != NULL);
+		sublock = sub->lss_cl.cls_lock;
+		rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
+		if (rc == 0) {
+			LASSERT(sublock->cll_state >= CLS_ENQUEUED);
+			if (sublock->cll_state < CLS_HELD)
+				rc = cl_wait_try(env, sublock);
+
+			minstate = min(minstate, sublock->cll_state);
+			lov_sublock_unlock(env, sub, closure, subenv);
+		}
+		if (rc == CLO_REENQUEUED) { /* counted, then treated as success */
+			reenqueued++;
+			rc = 0;
+		}
+		result = lov_subresult(result, rc);
+		if (result != 0)
+			break;
+	}
+	/* Each sublock can only be reenqueued once, so this will not loop
+	 * forever. */
+	if (result == 0 && reenqueued != 0)
+		goto again;
+	cl_lock_closure_fini(closure);
+	RETURN(result ?: minstate >= CLS_HELD ? 0 : CLO_WAIT);
+}
+
+static int lov_lock_use(const struct lu_env *env,
+			const struct cl_lock_slice *slice)
+{
+	struct lov_lock	*lck     = cl2lov_lock(slice);
+	struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+	int		     result;
+	int		     i;
+
+	LASSERT(slice->cls_lock->cll_state == CLS_INTRANSIT);
+	ENTRY;
+
+	for (result = 0, i = 0; i < lck->lls_nr; ++i) {
+		int rc;
+		struct lovsub_lock     *sub;
+		struct cl_lock	 *sublock;
+		struct lov_lock_sub    *lls;
+		struct lov_sublock_env *subenv;
+
+		LASSERT(slice->cls_lock->cll_state == CLS_INTRANSIT);
+
+		lls = &lck->lls_sub[i];
+		sub = lls->sub_lock;
+		if (sub == NULL) {
+			/*
+			 * Sub-lock might have been canceled, while top-lock was
+			 * cached.
+			 */
+			result = -ESTALE;
+			break;
+		}
+
+		sublock = sub->lss_cl.cls_lock;
+		rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
+		if (rc == 0) {
+			LASSERT(sublock->cll_state != CLS_FREEING);
+			lov_sublock_hold(env, lck, i);
+			if (sublock->cll_state == CLS_CACHED) {
+				rc = cl_use_try(subenv->lse_env, sublock, 0);
+				if (rc != 0) /* undo lov_sublock_hold() above */
+					rc = lov_sublock_release(env, lck,
+								 i, 1, rc);
+			} else if (sublock->cll_state == CLS_NEW) {
+				/* Sub-lock might have been canceled, while
+				 * top-lock was cached. */
+				result = -ESTALE;
+				lov_sublock_release(env, lck, i, 1, result);
+			}
+			lov_sublock_unlock(env, sub, closure, subenv);
+		}
+		result = lov_subresult(result, rc);
+		if (result != 0)
+			break;
+	}
+
+	if (lck->lls_cancel_race) {
+		/*
+		 * If unlocking happened at the same time, then the sub-lock
+		 * state should be FREEING, and lov_sublock_lock() should
+		 * have returned CLO_REPEAT. In this case, return -ESTALE,
+		 * and the upper layer should reset the lock state to NEW.
+		 */
+		lck->lls_cancel_race = 0;
+		LASSERT(result != 0);
+		result = -ESTALE;
+	}
+	cl_lock_closure_fini(closure);
+	RETURN(result);
+}
+
+#if 0
+static int lock_lock_multi_match()
+{
+	struct cl_lock	  *lock    = slice->cls_lock;
+	struct cl_lock_descr    *subneed = &lov_env_info(env)->lti_ldescr;
+	struct lov_object       *loo     = cl2lov(lov->lls_cl.cls_obj);
+	struct lov_layout_raid0 *r0      = lov_r0(loo);
+	struct lov_lock_sub     *sub;
+	struct cl_object	*subobj;
+	obd_off  fstart;
+	obd_off  fend;
+	obd_off  start;
+	obd_off  end;
+	int i;
+
+	fstart = cl_offset(need->cld_obj, need->cld_start);
+	fend   = cl_offset(need->cld_obj, need->cld_end + 1) - 1;
+	subneed->cld_mode = need->cld_mode;
+	cl_lock_mutex_get(env, lock);
+	for (i = 0; i < lov->lls_nr; ++i) {
+		sub = &lov->lls_sub[i];
+		if (sub->sub_lock == NULL)
+			continue;
+		subobj = sub->sub_descr.cld_obj;
+		if (!lov_stripe_intersects(loo->lo_lsm, sub->sub_stripe,
+					   fstart, fend, &start, &end))
+			continue;
+		subneed->cld_start = cl_index(subobj, start);
+		subneed->cld_end   = cl_index(subobj, end);
+		subneed->cld_obj   = subobj;
+		if (!cl_lock_ext_match(&sub->sub_got, subneed)) {
+			result = 0;
+			break;
+		}
+	}
+	cl_lock_mutex_put(env, lock);
+}
+#endif
+
+/**
+ * Tells whether the extent \a descr falls within \a stripe only and, once
+ * mapped onto that stripe, is matched by the sub-lock extent \a child.
+ */
+static int lov_lock_stripe_is_matching(const struct lu_env *env,
+				       struct lov_object *lov, int stripe,
+				       const struct cl_lock_descr *child,
+				       const struct cl_lock_descr *descr)
+{
+	struct lov_stripe_md *md = lov->lo_lsm;
+	struct cl_lock_descr *mapped;
+	obd_off first;
+	obd_off last;
+	obd_off sub_first;
+	obd_off sub_last;
+	int hit;
+
+	/* single stripe: compare the two extents directly */
+	if (lov_r0(lov)->lo_nr == 1)
+		return cl_lock_ext_match(child, descr);
+
+	first = cl_offset(&lov->lo_cl, descr->cld_start);
+	last  = cl_offset(&lov->lo_cl, descr->cld_end + 1) - 1;
+
+	/* multi-stripe object: @descr may only cover child's stripe ... */
+	if (last - first > md->lsm_stripe_size)
+		return 0;
+	if (stripe != lov_stripe_number(md, first) ||
+	    stripe != lov_stripe_number(md, last))
+		return 0;
+
+	/* ... and its extent, mapped onto the stripe, must match @child */
+	mapped = &lov_env_info(env)->lti_ldescr;
+	mapped->cld_obj  = NULL;   /* don't need sub object at all */
+	mapped->cld_mode = descr->cld_mode;
+	mapped->cld_gid  = descr->cld_gid;
+	hit = lov_stripe_intersects(md, stripe, first, last,
+				    &sub_first, &sub_last);
+	LASSERT(hit);
+	mapped->cld_start = cl_index(child->cld_obj, sub_first);
+	mapped->cld_end   = cl_index(child->cld_obj, sub_last);
+
+	return cl_lock_ext_match(child, mapped);
+}
+
+/**
+ * An implementation of cl_lock_operations::clo_fits_into() method.
+ *
+ * Checks whether a lock (given by \a slice) is suitable for \a
+ * io. Multi-stripe locks can be used only for "quick" io, like truncate, or
+ * O_APPEND write.
+ *
+ * \see ccc_lock_fits_into().
+ */
+static int lov_lock_fits_into(const struct lu_env *env,
+			      const struct cl_lock_slice *slice,
+			      const struct cl_lock_descr *need,
+			      const struct cl_io *io)
+{
+	struct lov_lock   *lov = cl2lov_lock(slice);
+	struct lov_object *obj = cl2lov(slice->cls_obj);
+	int result;
+
+	LASSERT(cl_object_same(need->cld_obj, slice->cls_obj));
+	LASSERT(lov->lls_nr > 0);
+
+	ENTRY;
+
+	/* for top lock, it's necessary to match enq flags otherwise it will
+	 * run into problem if a sublock is missing and reenqueue. */
+	if (need->cld_enq_flags != lov->lls_orig.cld_enq_flags)
+		RETURN(0); /* pair ENTRY with RETURN on this exit as well */
+
+	if (need->cld_mode == CLM_GROUP)
+		/*
+		 * always allow to match group lock.
+		 */
+		result = cl_lock_ext_match(&lov->lls_orig, need);
+	else if (lov->lls_nr == 1) {
+		struct cl_lock_descr *got = &lov->lls_sub[0].sub_got;
+		result = lov_lock_stripe_is_matching(env,
+						     cl2lov(slice->cls_obj),
+						     lov->lls_sub[0].sub_stripe,
+						     got, need);
+	} else if (io->ci_type != CIT_SETATTR && io->ci_type != CIT_MISC &&
+		   !cl_io_is_append(io) && need->cld_mode != CLM_PHANTOM)
+		/*
+		 * Multi-stripe locks are only suitable for `quick' IO and for
+		 * glimpse.
+		 */
+		result = 0;
+	else
+		/*
+		 * Most general case: multi-stripe existing lock, and
+		 * (potentially) multi-stripe @need lock. Check that @need is
+		 * covered by @lov's sub-locks.
+		 *
+		 * For now, ignore lock expansions made by the server, and
+		 * match against original lock extent.
+		 */
+		result = cl_lock_ext_match(&lov->lls_orig, need);
+	CDEBUG(D_DLMTRACE, DDESCR"/"DDESCR" %d %d/%d: %d\n",
+	       PDESCR(&lov->lls_orig), PDESCR(&lov->lls_sub[0].sub_got),
+	       lov->lls_sub[0].sub_stripe, lov->lls_nr, lov_r0(obj)->lo_nr,
+	       result);
+	RETURN(result);
+}
+
+void lov_lock_unlink(const struct lu_env *env,
+		     struct lov_lock_link *link, struct lovsub_lock *sub)
+{
+	struct lov_lock *lck    = link->lll_super;
+	struct cl_lock  *parent = lck->lls_cl.cls_lock;
+
+	LASSERT(cl_lock_is_mutexed(parent)); /* caller holds both mutexes */
+	LASSERT(cl_lock_is_mutexed(sub->lss_cl.cls_lock));
+	ENTRY;
+
+	list_del_init(&link->lll_list);
+	LASSERT(lck->lls_sub[link->lll_idx].sub_lock == sub);
+	/* yank this sub-lock from parent's array */
+	lck->lls_sub[link->lll_idx].sub_lock = NULL;
+	LASSERT(lck->lls_nr_filled > 0);
+	lck->lls_nr_filled--;
+	lu_ref_del(&parent->cll_reference, "lov-child", sub->lss_cl.cls_lock);
+	cl_lock_put(env, parent); /* ref taken in lov_sublock_adopt() */
+	OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem); /* @link is freed here */
+	EXIT;
+}
+
+struct lov_lock_link *lov_lock_link_find(const struct lu_env *env,
+					 struct lov_lock *lck,
+					 struct lovsub_lock *sub)
+{
+	struct lov_lock_link *pos;
+
+	/* the caller is expected to hold the sub-lock mutex */
+	LASSERT(cl_lock_is_mutexed(sub->lss_cl.cls_lock));
+	ENTRY;
+	list_for_each_entry(pos, &sub->lss_parents, lll_list)
+		if (pos->lll_super == lck)
+			RETURN(pos);
+	/* none of the links on @sub belongs to @lck */
+	RETURN(NULL);
+}
+
+/**
+ * An implementation of cl_lock_operations::clo_delete() method. This is
+ * invoked for "top-to-bottom" delete, when lock destruction starts from the
+ * top-lock, e.g., as a result of inode destruction.
+ *
+ * Unlinks top-lock from all its sub-locks. Sub-locks are not deleted here:
+ * this is done separately elsewhere:
+ *
+ *     - for inode destruction, lov_object_delete() calls cl_object_kill() for
+ *       each sub-object, purging its locks;
+ *
+ *     - in other cases (e.g., a fatal error with a top-lock) sub-locks are
+ *       left in the cache.
+ */
+static void lov_lock_delete(const struct lu_env *env,
+			    const struct cl_lock_slice *slice)
+{
+	struct lov_lock	*lck     = cl2lov_lock(slice);
+	struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+	struct lov_lock_link   *link;
+	int		     rc;
+	int		     i;
+
+	LASSERT(slice->cls_lock->cll_state == CLS_FREEING);
+	ENTRY;
+
+	for (i = 0; i < lck->lls_nr; ++i) {
+		struct lov_lock_sub *lls = &lck->lls_sub[i];
+		struct lovsub_lock  *lsl = lls->sub_lock;
+
+		if (lsl == NULL) /* already removed */
+			continue;
+
+		rc = lov_sublock_lock(env, lck, lls, closure, NULL);
+		if (rc == CLO_REPEAT) {
+			--i; /* cancel the loop increment: retry this slot */
+			continue;
+		}
+
+		LASSERT(rc == 0);
+		LASSERT(lsl->lss_cl.cls_lock->cll_state < CLS_FREEING);
+
+		if (lls->sub_flags & LSF_HELD)
+			lov_sublock_release(env, lck, i, 1, 0);
+		/* drop the link between the top-lock and this sub-lock */
+		link = lov_lock_link_find(env, lck, lsl);
+		LASSERT(link != NULL);
+		lov_lock_unlink(env, link, lsl);
+		LASSERT(lck->lls_sub[i].sub_lock == NULL);
+
+		lov_sublock_unlock(env, lsl, closure, NULL);
+	}
+
+	cl_lock_closure_fini(closure);
+	EXIT;
+}
+
+/* clo_print: dump the sub-lock count, then one entry per sub-lock slot. */
+static int lov_lock_print(const struct lu_env *env, void *cookie,
+			  lu_printer_t p, const struct cl_lock_slice *slice)
+{
+	struct lov_lock     *top = cl2lov_lock(slice);
+	struct lov_lock_sub *cur;
+	int		     idx;
+
+	(*p)(env, cookie, "%d\n", top->lls_nr);
+	for (idx = 0; idx < top->lls_nr; ++idx) {
+		cur = &top->lls_sub[idx];
+		(*p)(env, cookie, "    %d %x: ", idx, cur->sub_flags);
+		if (cur->sub_lock == NULL) {
+			(*p)(env, cookie, "---\n");
+			continue;
+		}
+		cl_lock_print(env, cookie, p, cur->sub_lock->lss_cl.cls_lock);
+	}
+	return 0;
+}
+
+static const struct cl_lock_operations lov_lock_ops = {
+	.clo_enqueue   = lov_lock_enqueue,
+	.clo_wait      = lov_lock_wait,
+	.clo_use       = lov_lock_use,
+	.clo_unuse     = lov_lock_unuse,
+	.clo_cancel    = lov_lock_cancel,
+	.clo_delete    = lov_lock_delete,
+	.clo_fini      = lov_lock_fini,
+	.clo_fits_into = lov_lock_fits_into,
+	.clo_print     = lov_lock_print,
+};
+
+/* Allocate the lov slice of @lock and initialise its sub-locks (raid0). */
+int lov_lock_init_raid0(const struct lu_env *env, struct cl_object *obj,
+			struct cl_lock *lock, const struct cl_io *io)
+{
+	struct lov_lock *slice;
+
+	ENTRY;
+	OBD_SLAB_ALLOC_PTR_GFP(slice, lov_lock_kmem, __GFP_IO);
+	if (slice == NULL)
+		RETURN(-ENOMEM);
+
+	/* register the slice with @lock before creating the sub-locks */
+	cl_lock_slice_add(lock, &slice->lls_cl, obj, &lov_lock_ops);
+	RETURN(lov_lock_sub_init(env, slice, io));
+}
+
+static void lov_empty_lock_fini(const struct lu_env *env,
+				struct cl_lock_slice *slice)
+{
+	struct lov_lock *empty = cl2lov_lock(slice);
+	OBD_SLAB_FREE_PTR(empty, lov_lock_kmem); /* only the slice itself */
+}
+
+static int lov_empty_lock_print(const struct lu_env *env, void *cookie,
+			lu_printer_t p, const struct cl_lock_slice *slice)
+{
+	p(env, cookie, "empty\n"); /* fixed tag, no per-lock state is read */
+	return 0;
+}
+
+/* XXX: only fini and print so far; more methods will be added later. */
+static const struct cl_lock_operations lov_empty_lock_ops = {
+	.clo_print = lov_empty_lock_print,
+	.clo_fini  = lov_empty_lock_fini,
+};
+
+/* Allocate a lov slice for @lock that uses the empty-layout operations. */
+int lov_lock_init_empty(const struct lu_env *env, struct cl_object *obj,
+			struct cl_lock *lock, const struct cl_io *io)
+{
+	struct lov_lock *slice;
+
+	ENTRY;
+	OBD_SLAB_ALLOC_PTR_GFP(slice, lov_lock_kmem, __GFP_IO);
+	if (slice == NULL)
+		RETURN(-ENOMEM);
+
+	cl_lock_slice_add(lock, &slice->lls_cl, obj, &lov_empty_lock_ops);
+	slice->lls_orig = lock->cll_descr;
+	RETURN(0);
+}
+
+/* Initialise the per-env closure for @parent and return it (must be idle). */
+static struct cl_lock_closure *lov_closure_get(const struct lu_env *env,
+					       struct cl_lock *parent)
+{
+	struct cl_lock_closure *clc = &lov_env_info(env)->lti_closure;
+
+	LASSERT(list_empty(&clc->clc_list));
+	cl_lock_closure_init(env, clc, parent, 1);
+	return clc;
+}
+
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lov_log.c b/drivers/staging/lustre/lustre/lov/lov_log.c
new file mode 100644
index 000000000000..63b7f8d3182f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_log.c
@@ -0,0 +1,278 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_log.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Mike Shaver <shaver@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_support.h>
+#include <lustre_lib.h>
+#include <lustre_net.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_dlm.h>
+#include <lustre_mds.h>
+#include <obd_class.h>
+#include <obd_lov.h>
+#include <obd_ost.h>
+#include <lprocfs_status.h>
+#include <lustre_log.h>
+
+#include "lov_internal.h"
+
+/* Add a log record for each OSC that this object is striped over and fill
+ * one cookie per stripe.  Cookies are kept in stripe order so that the right
+ * cookies are passed back to the right OSTs at the client side; the cookie of
+ * a stripe whose record could not be added is zeroed (an all-zero cookie never
+ * occurs naturally).  Returns the number of cookie slots that were filled. */
+static int lov_llog_origin_add(const struct lu_env *env,
+			       struct llog_ctxt *ctxt,
+			       struct llog_rec_hdr *rec,
+			       struct lov_stripe_md *lsm,
+			       struct llog_cookie *logcookies, int numcookies)
+{
+	struct obd_device *obd = ctxt->loc_obd;
+	struct lov_obd *lov = &obd->u.lov;
+	int i, rc = 0, cookies = 0;
+	ENTRY;
+
+	LASSERTF(logcookies && numcookies >= lsm->lsm_stripe_count,
+		 "logcookies %p, numcookies %d lsm->lsm_stripe_count %d \n",
+		 logcookies, numcookies, lsm->lsm_stripe_count);
+
+	for (i = 0; i < lsm->lsm_stripe_count; i++) {
+		struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+		struct obd_device *child =
+			lov->lov_tgts[loi->loi_ost_idx]->ltd_exp->exp_obd;
+		struct llog_ctxt *cctxt = llog_get_context(child, ctxt->loc_idx);
+
+		/* put this stripe's object id into the unlink/setattr record */
+		switch (rec->lrh_type) {
+		case MDS_UNLINK_REC: {
+			struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec;
+			lur->lur_oid = ostid_id(&loi->loi_oi);
+			lur->lur_oseq = (__u32)ostid_seq(&loi->loi_oi);
+			break;
+		}
+		case MDS_SETATTR64_REC: {
+			struct llog_setattr64_rec *lsr = (struct llog_setattr64_rec *)rec;
+			lsr->lsr_oi = loi->loi_oi;
+			break;
+		}
+		default:
+			break;
+		}
+
+		/* inject error in llog_obd_add() below via a NULL context */
+		if (OBD_FAIL_CHECK(OBD_FAIL_MDS_FAIL_LOV_LOG_ADD)) {
+			llog_ctxt_put(cctxt);
+			cctxt = NULL;
+		}
+		rc = llog_obd_add(env, cctxt, rec, NULL, logcookies + cookies,
+				  numcookies - cookies);
+		llog_ctxt_put(cctxt);
+		if (rc < 0) {
+			CERROR("Can't add llog (rc = %d) for stripe %d\n",
+			       rc, cookies);
+			memset(logcookies + cookies, 0,
+			       sizeof(struct llog_cookie));
+			rc = 1; /* skip this cookie */
+		}
+		/* Note that rc is always 1 if llog_obd_add was successful */
+		cookies += rc;
+	}
+	RETURN(cookies);
+}
+
+/* llog_connect() every active target, or only the one matching @uuid. */
+static int lov_llog_origin_connect(struct llog_ctxt *ctxt,
+				   struct llog_logid *logid,
+				   struct llog_gen *gen,
+				   struct obd_uuid *uuid)
+{
+	struct obd_device *obd = ctxt->loc_obd;
+	struct lov_obd *lov = &obd->u.lov;
+	int first_err = 0;
+	int idx;
+	ENTRY;
+
+	obd_getref(obd);
+	for (idx = 0; idx < lov->desc.ld_tgt_count; idx++) {
+		struct lov_tgt_desc *tgt = lov->lov_tgts[idx];
+		struct llog_ctxt *cctxt;
+		int rc;
+
+		if (!tgt || !tgt->ltd_active)
+			continue;
+		if (uuid && !obd_uuid_equals(uuid, &tgt->ltd_uuid))
+			continue;
+		CDEBUG(D_CONFIG, "connect %d/%d\n", idx, lov->desc.ld_tgt_count);
+		cctxt = llog_get_context(tgt->ltd_exp->exp_obd, ctxt->loc_idx);
+		rc = llog_connect(cctxt, logid, gen, uuid);
+		llog_ctxt_put(cctxt);
+		if (rc == 0)
+			continue;
+		CERROR("error osc_llog_connect tgt %d (%d)\n", idx, rc);
+		if (first_err == 0)
+			first_err = rc;
+	}
+	obd_putref(obd);
+	RETURN(first_err);
+}
+
+/* the replicators commit callback: cancel one cookie per stripe of @lsm */
+static int lov_llog_repl_cancel(const struct lu_env *env,
+				struct llog_ctxt *ctxt,
+				struct lov_stripe_md *lsm,
+				int count, struct llog_cookie *cookies,
+				int flags)
+{
+	struct obd_device *obd = ctxt->loc_obd;
+	struct lov_obd *lov = &obd->u.lov;
+	int first_err = 0;
+	int idx;
+	ENTRY;
+
+	LASSERT(lsm != NULL);
+	LASSERT(count == lsm->lsm_stripe_count);
+
+	obd_getref(obd);
+	for (idx = 0; idx < count; idx++) {
+		struct lov_oinfo *loi = lsm->lsm_oinfo[idx];
+		struct lov_tgt_desc *tgt = lov->lov_tgts[loi->loi_ost_idx];
+		struct llog_ctxt *cctxt;
+		int err;
+
+		cctxt = llog_get_context(tgt->ltd_exp->exp_obd, ctxt->loc_idx);
+		err = llog_cancel(env, cctxt, NULL, 1, &cookies[idx], flags);
+		llog_ctxt_put(cctxt);
+		/* failures on inactive targets are not reported */
+		if (err == 0 || !tgt->ltd_active)
+			continue;
+		CERROR("%s: objid "DOSTID" subobj "DOSTID
+		       " on OST idx %d: rc = %d\n",
+		       obd->obd_name, POSTID(&lsm->lsm_oi),
+		       POSTID(&loi->loi_oi), loi->loi_ost_idx, err);
+		if (first_err == 0)
+			first_err = err;
+	}
+	obd_putref(obd);
+	RETURN(first_err);
+}
+
+static struct llog_operations lov_mds_ost_orig_logops = {
+	.lop_obd_add	= lov_llog_origin_add,	   /* one record per stripe */
+	.lop_connect	= lov_llog_origin_connect, /* connect per target */
+};
+
+static struct llog_operations lov_size_repl_logops = {
+	.lop_cancel	= lov_llog_repl_cancel,	/* one cookie per stripe */
+};
+
+int lov_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+		  struct obd_device *disk_obd, int *index)
+{
+	struct lov_obd *lov = &obd->u.lov;
+	struct obd_device *child;
+	int i, rc = 0;
+	ENTRY;
+	/* set up both llog contexts of @obd, then each target's llogs */
+	LASSERT(olg == &obd->obd_olg);
+	rc = llog_setup(NULL, obd, olg, LLOG_MDS_OST_ORIG_CTXT, disk_obd,
+			&lov_mds_ost_orig_logops);
+	if (rc)
+		RETURN(rc);
+
+	rc = llog_setup(NULL, obd, olg, LLOG_SIZE_REPL_CTXT, disk_obd,
+			&lov_size_repl_logops);
+	if (rc)
+		GOTO(err_cleanup, rc);
+
+	obd_getref(obd);
+	/* count may not match lov->desc.ld_tgt_count during dynamic ost add */
+	for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+		if (!lov->lov_tgts[i])
+			continue;
+		/* a non-NULL @index limits the setup to that one target */
+		if (index && i != *index)
+			continue;
+
+		child = lov->lov_tgts[i]->ltd_obd;
+		rc = obd_llog_init(child, &child->obd_olg, disk_obd, &i);
+		if (rc)
+			CERROR("error osc_llog_init idx %d osc '%s' tgt '%s' "
+			       "(rc=%d)\n", i, child->obd_name,
+			       disk_obd->obd_name, rc);
+		rc = 0; /* logged above, deliberately not propagated */
+	}
+	obd_putref(obd);
+	GOTO(err_cleanup, rc);
+err_cleanup:
+	if (rc) { /* non-zero only when the second llog_setup() failed */
+		struct llog_ctxt *ctxt =
+			llog_get_context(obd, LLOG_SIZE_REPL_CTXT);
+		if (ctxt)
+			llog_cleanup(NULL, ctxt);
+		ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT);
+		if (ctxt)
+			llog_cleanup(NULL, ctxt);
+	}
+	return rc;
+}
+
+/* Clean up whichever of the two lov llog contexts exist on @obd. */
+int lov_llog_finish(struct obd_device *obd, int count)
+{
+	static const int ids[] = { LLOG_MDS_OST_ORIG_CTXT,
+				   LLOG_SIZE_REPL_CTXT };
+	struct llog_ctxt *ctxt;
+	unsigned int i;
+
+	ENTRY;
+	/* cleanup our llogs only if the ctxts have been setup
+	 * (client lov doesn't setup, mds lov does). */
+	for (i = 0; i < ARRAY_SIZE(ids); i++) {
+		ctxt = llog_get_context(obd, ids[i]);
+		if (ctxt)
+			llog_cleanup(NULL, ctxt);
+	}
+	/* lov->tgt llogs are cleaned during osc_cleanup. */
+	RETURN(0);
+}
diff --git a/drivers/staging/lustre/lustre/lov/lov_merge.c b/drivers/staging/lustre/lustre/lov/lov_merge.c
new file mode 100644
index 000000000000..ddbac1220263
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_merge.c
@@ -0,0 +1,218 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_class.h>
+#include <obd_lov.h>
+
+#include "lov_internal.h"
+
+/** Merge the per-stripe lock value blocks and KMS of \a lsm into \a lvb and
+ * \a kms_place; the caller holds the stripe lock and pre-loads \a lvb with
+ * its current atime, mtime, ctime so that a more uptodate local time is not
+ * regressed. Stripes whose lvb carries an error are skipped and the error
+ * of the last such stripe is returned (0 if there was none). */
+int lov_merge_lvb_kms(struct lov_stripe_md *lsm,
+		      struct ost_lvb *lvb, __u64 *kms_place)
+{
+	__u64 size = 0;
+	__u64 kms = 0;
+	__u64 blocks = 0;
+	obd_time current_mtime = lvb->lvb_mtime;
+	obd_time current_atime = lvb->lvb_atime;
+	obd_time current_ctime = lvb->lvb_ctime;
+	int i;
+	int rc = 0;
+
+	assert_spin_locked(&lsm->lsm_lock); /* works on UP builds too */
+	LASSERT(lsm->lsm_lock_owner == current_pid());
+
+	CDEBUG(D_INODE, "MDT ID "DOSTID" initial value: s="LPU64" m="LPU64
+	       " a="LPU64" c="LPU64" b="LPU64"\n", POSTID(&lsm->lsm_oi),
+	       lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, lvb->lvb_ctime,
+	       lvb->lvb_blocks);
+	for (i = 0; i < lsm->lsm_stripe_count; i++) {
+		struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+		obd_size lov_size, tmpsize;
+
+		if (OST_LVB_IS_ERR(loi->loi_lvb.lvb_blocks)) {
+			rc = OST_LVB_GET_ERR(loi->loi_lvb.lvb_blocks);
+			continue;
+		}
+
+		tmpsize = loi->loi_kms;
+		lov_size = lov_stripe_size(lsm, tmpsize, i);
+		if (lov_size > kms)
+			kms = lov_size;
+
+		if (loi->loi_lvb.lvb_size > tmpsize)
+			tmpsize = loi->loi_lvb.lvb_size;
+
+		lov_size = lov_stripe_size(lsm, tmpsize, i);
+		if (lov_size > size)
+			size = lov_size;
+		/* merge blocks, mtime, atime */
+		blocks += loi->loi_lvb.lvb_blocks;
+		if (loi->loi_lvb.lvb_mtime > current_mtime)
+			current_mtime = loi->loi_lvb.lvb_mtime;
+		if (loi->loi_lvb.lvb_atime > current_atime)
+			current_atime = loi->loi_lvb.lvb_atime;
+		if (loi->loi_lvb.lvb_ctime > current_ctime)
+			current_ctime = loi->loi_lvb.lvb_ctime;
+
+		CDEBUG(D_INODE, "MDT ID "DOSTID" on OST[%u]: s="LPU64" m="LPU64
+		       " a="LPU64" c="LPU64" b="LPU64"\n", POSTID(&lsm->lsm_oi),
+		       loi->loi_ost_idx, loi->loi_lvb.lvb_size,
+		       loi->loi_lvb.lvb_mtime, loi->loi_lvb.lvb_atime,
+		       loi->loi_lvb.lvb_ctime, loi->loi_lvb.lvb_blocks);
+	}
+
+	*kms_place = kms;
+	lvb->lvb_size = size;
+	lvb->lvb_blocks = blocks;
+	lvb->lvb_mtime = current_mtime;
+	lvb->lvb_atime = current_atime;
+	lvb->lvb_ctime = current_ctime;
+	RETURN(rc);
+}
+
+/** Merge the lock value block(&lvb) attributes from each of the stripes in a
+ * file into a single lvb, taking the stripe lock around the merge. It is
+ * expected that the caller initializes the current atime, mtime, ctime to
+ * avoid regressing a more uptodate time on the local client.
+ *
+ * If \a kms_only is set then we do not consider the recently seen size (rss)
+ * when updating the known minimum size (kms).  Even when merging RSS, we will
+ * take the KMS value if it's larger.  This prevents getattr from stomping on
+ * dirty cached pages which extend the file size. */
+int lov_merge_lvb(struct obd_export *exp,
+		  struct lov_stripe_md *lsm, struct ost_lvb *lvb, int kms_only)
+{
+	__u64 merged_kms;
+	int   status;
+
+	ENTRY;
+	lov_stripe_lock(lsm);
+	status = lov_merge_lvb_kms(lsm, lvb, &merged_kms);
+	lov_stripe_unlock(lsm);
+	if (kms_only)
+		lvb->lvb_size = merged_kms;
+
+	CDEBUG(D_INODE, "merged for ID "DOSTID" s="LPU64" m="LPU64" a="LPU64
+	       " c="LPU64" b="LPU64"\n", POSTID(&lsm->lsm_oi), lvb->lvb_size,
+	       lvb->lvb_mtime, lvb->lvb_atime, lvb->lvb_ctime, lvb->lvb_blocks);
+	RETURN(status);
+}
+
+/* Update per-stripe KMS for file size \a size.  With \a shrink every stripe's
+ * KMS and lvb_size are overwritten; otherwise only the KMS of the stripe
+ * chosen for offset size - 1 may grow.  Call under the lov_stripe_lock(). */
+int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm,
+		   obd_off size, int shrink)
+{
+	struct lov_oinfo *loi;
+	int stripe = 0;
+	__u64 kms;
+	ENTRY;
+
+	assert_spin_locked(&lsm->lsm_lock); /* works on UP builds too */
+	LASSERT(lsm->lsm_lock_owner == current_pid());
+
+	if (shrink) {
+		for (; stripe < lsm->lsm_stripe_count; stripe++) {
+			loi = lsm->lsm_oinfo[stripe];
+			kms = lov_size_to_stripe(lsm, size, stripe);
+			CDEBUG(D_INODE,
+			       "stripe %d KMS %sing "LPU64"->"LPU64"\n",
+			       stripe, kms > loi->loi_kms ? "increas":"shrink",
+			       loi->loi_kms, kms);
+			loi_kms_set(loi, loi->loi_lvb.lvb_size = kms);
+		}
+		RETURN(0);
+	}
+
+	if (size > 0)
+		stripe = lov_stripe_number(lsm, size - 1);
+	kms = lov_size_to_stripe(lsm, size, stripe);
+	loi = lsm->lsm_oinfo[stripe];
+	CDEBUG(D_INODE, "stripe %d KMS %sincreasing "LPU64"->"LPU64"\n",
+	       stripe, kms > loi->loi_kms ? "" : "not ", loi->loi_kms, kms);
+	if (kms > loi->loi_kms)
+		loi_kms_set(loi, kms);
+	RETURN(0);
+}
+
+/* Fold the attributes @src of stripe @stripeno into the aggregate @tgt. */
+void lov_merge_attrs(struct obdo *tgt, struct obdo *src, obd_valid valid,
+		     struct lov_stripe_md *lsm, int stripeno, int *set)
+{
+	valid &= src->o_valid;
+	if (*set == 0) {
+		/* nothing merged yet: start from a copy of @src */
+		memcpy(tgt, src, sizeof(*tgt));
+		tgt->o_oi = lsm->lsm_oi;
+		if (valid & OBD_MD_FLSIZE)
+			tgt->o_size = lov_stripe_size(lsm, src->o_size,
+						      stripeno);
+	} else {
+		if (valid & OBD_MD_FLSIZE) {
+			/* this handles sparse files properly */
+			obd_size end;
+
+			end = lov_stripe_size(lsm, src->o_size, stripeno);
+			if (end > tgt->o_size)
+				tgt->o_size = end;
+		}
+		if (valid & OBD_MD_FLBLOCKS)
+			tgt->o_blocks += src->o_blocks;
+		if (valid & OBD_MD_FLBLKSZ)
+			tgt->o_blksize += src->o_blksize;
+		if ((valid & OBD_MD_FLCTIME) && src->o_ctime > tgt->o_ctime)
+			tgt->o_ctime = src->o_ctime;
+		if ((valid & OBD_MD_FLMTIME) && src->o_mtime > tgt->o_mtime)
+			tgt->o_mtime = src->o_mtime;
+		if (valid & OBD_MD_FLDATAVERSION)
+			tgt->o_data_version += src->o_data_version;
+	}
+
+	/* data_version needs to be valid on all stripes to be correct! */
+	if (!(valid & OBD_MD_FLDATAVERSION))
+		tgt->o_valid &= ~OBD_MD_FLDATAVERSION;
+	*set += 1;
+}
diff --git a/drivers/staging/lustre/lustre/lov/lov_obd.c b/drivers/staging/lustre/lustre/lov/lov_obd.c
new file mode 100644
index 000000000000..8089f03a200e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_obd.c
@@ -0,0 +1,2923 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_obd.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Mike Shaver <shaver@clusterfs.com>
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_support.h>
+#include <lustre_lib.h>
+#include <lustre_net.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_dlm.h>
+#include <lustre_mds.h>
+#include <lustre_debug.h>
+#include <obd_class.h>
+#include <obd_lov.h>
+#include <obd_ost.h>
+#include <lprocfs_status.h>
+#include <lustre_param.h>
+#include <cl_object.h>
+#include <lclient.h>
+#include <lustre/ll_fiemap.h>
+#include <lustre_log.h>
+#include <lustre_fid.h>
+
+#include "lov_internal.h"
+
+/* Keep a refcount of lov->tgt usage to prevent racing with addition/deletion.
+ * Any function that expects lov_tgts to remain stationary must take a ref;
+ * the reference is dropped again by lov_putref(). */
+static void lov_getref(struct obd_device *obd)
+{
+	struct lov_obd *lov = &obd->u.lov;
+
+	/* nobody gets through here until lov_putref is done */
+	mutex_lock(&lov->lov_lock);
+	atomic_inc(&lov->lov_refcount);
+	mutex_unlock(&lov->lov_lock);
+}
+
+static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt);
+
+/* Drop a reference; the final put destroys the targets flagged ltd_reap. */
+static void lov_putref(struct obd_device *obd)
+{
+	struct lov_obd *lov = &obd->u.lov;
+	struct lov_tgt_desc *tgt, *next;
+	LIST_HEAD(kill);
+	int idx;
+
+	mutex_lock(&lov->lov_lock);
+	/* ok to dec to 0 more than once -- ltd_exp's will be null */
+	if (!atomic_dec_and_test(&lov->lov_refcount) || !lov->lov_death_row) {
+		mutex_unlock(&lov->lov_lock);
+		return;
+	}
+
+	CDEBUG(D_CONFIG, "destroying %d lov targets\n", lov->lov_death_row);
+	for (idx = 0; idx < lov->desc.ld_tgt_count; idx++) {
+		tgt = lov->lov_tgts[idx];
+		if (tgt == NULL || !tgt->ltd_reap)
+			continue;
+		list_add(&tgt->ltd_kill, &kill);
+		/* XXX - right now there is a dependency on ld_tgt_count
+		 * being the maximum tgt index for computing the
+		 * mds_max_easize. So we can't shrink it. */
+		lov_ost_pool_remove(&lov->lov_packed, idx);
+		lov->lov_tgts[idx] = NULL;
+		lov->lov_death_row--;
+	}
+	mutex_unlock(&lov->lov_lock);
+
+	/* disconnect the collected targets after dropping lov_lock */
+	list_for_each_entry_safe(tgt, next, &kill, ltd_kill) {
+		list_del(&tgt->ltd_kill);
+		__lov_del_obd(obd, tgt);
+	}
+}
+
+static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid,
+			      enum obd_notify_event ev);
+static int lov_notify(struct obd_device *obd, struct obd_device *watched,
+		      enum obd_notify_event ev, void *data);
+
+
+#define MAX_STRING_SIZE 128
+/* Connect lov @obd to the target at @index: register @obd as the target's
+ * observer, connect an export and add a symlink under "target_obds".
+ * Returns 0 on success or if the import is invalid (no connect attempted). */
+int lov_connect_obd(struct obd_device *obd, __u32 index, int activate,
+		    struct obd_connect_data *data)
+{
+	struct lov_obd *lov = &obd->u.lov;
+	struct obd_uuid *tgt_uuid;
+	struct obd_device *tgt_obd;
+	static struct obd_uuid lov_osc_uuid = { "LOV_OSC_UUID" };
+	struct obd_import *imp;
+	proc_dir_entry_t *lov_proc_dir;
+	int rc;
+	ENTRY;
+
+	if (!lov->lov_tgts[index])
+		RETURN(-EINVAL);
+
+	tgt_uuid = &lov->lov_tgts[index]->ltd_uuid;
+	tgt_obd = lov->lov_tgts[index]->ltd_obd;
+
+	if (!tgt_obd->obd_set_up) {
+		CERROR("Target %s not set up\n", obd_uuid2str(tgt_uuid));
+		RETURN(-EINVAL);
+	}
+
+	/* override the sp_me from lov */
+	tgt_obd->u.cli.cl_sp_me = lov->lov_sp_me;
+
+	if (data && (data->ocd_connect_flags & OBD_CONNECT_INDEX))
+		data->ocd_index = index;
+
+	/* Divine LOV knows that OBDs under it are OSCs. */
+	imp = tgt_obd->u.cli.cl_import;
+
+	if (activate) {
+		tgt_obd->obd_no_recov = 0;
+		/* FIXME this is probably supposed to be
+		   ptlrpc_set_import_active.  Horrible naming. */
+		ptlrpc_activate_import(imp);
+	}
+
+	rc = obd_register_observer(tgt_obd, obd);
+	if (rc) {
+		CERROR("Target %s register_observer error %d\n",
+		       obd_uuid2str(tgt_uuid), rc);
+		RETURN(rc);
+	}
+
+	if (imp->imp_invalid) {
+		CDEBUG(D_CONFIG, "not connecting OSC %s; administratively "
+		       "disabled\n", obd_uuid2str(tgt_uuid));
+		RETURN(0);
+	}
+
+	rc = obd_connect(NULL, &lov->lov_tgts[index]->ltd_exp, tgt_obd,
+			 &lov_osc_uuid, data, NULL);
+	if (rc || !lov->lov_tgts[index]->ltd_exp) {
+		CERROR("Target %s connect error %d\n",
+		       obd_uuid2str(tgt_uuid), rc);
+		RETURN(-ENODEV);
+	}
+
+	lov->lov_tgts[index]->ltd_reap = 0;
+
+	CDEBUG(D_CONFIG, "Connected tgt idx %d %s (%s) %sactive\n", index,
+	       obd_uuid2str(tgt_uuid), tgt_obd->obd_name, activate ? "":"in");
+
+	lov_proc_dir = lprocfs_srch(obd->obd_proc_entry, "target_obds");
+	if (lov_proc_dir) {
+		struct obd_device *osc_obd = lov->lov_tgts[index]->ltd_exp->exp_obd;
+		proc_dir_entry_t *osc_symlink;
+
+		LASSERT(osc_obd != NULL);
+		LASSERT(osc_obd->obd_magic == OBD_DEVICE_MAGIC);
+		LASSERT(osc_obd->obd_type->typ_name != NULL);
+
+		osc_symlink = lprocfs_add_symlink(osc_obd->obd_name,
+						  lov_proc_dir,
+						  "../../../%s/%s",
+						  osc_obd->obd_type->typ_name,
+						  osc_obd->obd_name);
+		if (osc_symlink == NULL) {
+			/* the connect itself succeeded: log and carry on */
+			CERROR("could not register LOV target "
+				"/proc/fs/lustre/%s/%s/target_obds/%s.\n",
+				obd->obd_type->typ_name, obd->obd_name,
+				osc_obd->obd_name);
+			lprocfs_remove(&lov_proc_dir);
+		}
+	}
+	RETURN(0);
+}
+
+/* Connect an export to the lov and try to connect every configured target. */
+static int lov_connect(const struct lu_env *env,
+		       struct obd_export **exp, struct obd_device *obd,
+		       struct obd_uuid *cluuid, struct obd_connect_data *data,
+		       void *localdata)
+{
+	struct lov_obd *lov = &obd->u.lov;
+	struct lustre_handle conn;
+	int i;
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_CONFIG, "connect #%d\n", lov->lov_connects);
+
+	rc = class_connect(&conn, obd, cluuid);
+	if (rc)
+		RETURN(rc);
+	*exp = class_conn2export(&conn);
+
+	/* Why should there ever be more than 1 connect? */
+	lov->lov_connects++;
+	LASSERT(lov->lov_connects == 1);
+
+	memset(&lov->lov_ocd, 0, sizeof(lov->lov_ocd));
+	if (data)
+		lov->lov_ocd = *data;
+
+	obd_getref(obd);
+	for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+		struct lov_tgt_desc *tgt = lov->lov_tgts[i];
+
+		if (tgt == NULL || obd_uuid_empty(&tgt->ltd_uuid))
+			continue;
+		/* Flags will be lowest common denominator */
+		rc = lov_connect_obd(obd, i, tgt->ltd_activate, &lov->lov_ocd);
+		if (rc) {
+			CERROR("%s: lov connect tgt %d failed: %d\n",
+			       obd->obd_name, i, rc);
+			continue;
+		}
+		/* connect to administrative disabled ost */
+		if (tgt->ltd_exp == NULL)
+			continue;
+		rc = lov_notify(obd, tgt->ltd_exp->exp_obd,
+				OBD_NOTIFY_CONNECT, (void *)&i);
+		if (rc)
+			CERROR("%s error sending notify %d\n",
+			       obd->obd_name, rc);
+	}
+	obd_putref(obd);
+
+	/* failures of individual targets were logged and are not reported */
+	RETURN(0);
+}
+
+/* Detach @tgt from lov @obd: mark it inactive, drop its procfs symlink and
+ * observer registration, then disconnect the export.  Always returns 0. */
+static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt)
+{
+	proc_dir_entry_t *lov_proc_dir;
+	struct lov_obd *lov = &obd->u.lov;
+	struct obd_device *osc_obd;
+	int rc;
+	ENTRY;
+
+	osc_obd = class_exp2obd(tgt->ltd_exp);
+	CDEBUG(D_CONFIG, "%s: disconnecting target %s\n",
+	       obd->obd_name, osc_obd->obd_name);
+
+	if (tgt->ltd_active) {
+		tgt->ltd_active = 0;
+		lov->desc.ld_active_tgt_count--;
+		tgt->ltd_exp->exp_obd->obd_inactive = 1;
+	}
+
+	lov_proc_dir = lprocfs_srch(obd->obd_proc_entry, "target_obds");
+	if (lov_proc_dir) {
+		proc_dir_entry_t *osc_symlink;
+
+		osc_symlink = lprocfs_srch(lov_proc_dir, osc_obd->obd_name);
+		if (osc_symlink) {
+			lprocfs_remove(&osc_symlink);
+		} else {
+			CERROR("/proc/fs/lustre/%s/%s/target_obds/%s "
+			       "missing.\n",
+			       obd->obd_type->typ_name, obd->obd_name,
+			       osc_obd->obd_name);
+		}
+	}
+
+	if (osc_obd) {
+		/* Pass it on to our clients.  XXX This should be an argument
+		 * to disconnect, not a back-door flag on the OBD.  Ah well.
+		 * NOTE(review): osc_obd was already dereferenced above, so
+		 * this NULL check cannot protect those earlier uses. */
+		osc_obd->obd_force = obd->obd_force;
+		osc_obd->obd_fail = obd->obd_fail;
+		osc_obd->obd_no_recov = obd->obd_no_recov;
+	}
+
+	obd_register_observer(osc_obd, NULL);
+	rc = obd_disconnect(tgt->ltd_exp);
+	if (rc) /* logged only: the target is dropped regardless */
+		CERROR("Target %s disconnect error %d\n",
+		       tgt->ltd_uuid.uuid, rc);
+
+	tgt->ltd_exp = NULL;
+	RETURN(0);
+}
+
+/* Disconnect @exp; the final disconnect also deletes the connected targets. */
+static int lov_disconnect(struct obd_export *exp)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+	struct lov_obd *lov = &obd->u.lov;
+	int idx, rc;
+	ENTRY;
+
+	if (lov->lov_tgts == NULL)
+		goto out;
+
+	/* Only disconnect the underlying layers on the final disconnect. */
+	if (--lov->lov_connects != 0) {
+		/* why should there be more than 1 connect? */
+		CERROR("disconnect #%d\n", lov->lov_connects);
+		goto out;
+	}
+
+	/* Let's hold another reference so lov_del_obd doesn't spin through
+	 * putref every time */
+	obd_getref(obd);
+	for (idx = 0; idx < lov->desc.ld_tgt_count; idx++) {
+		struct lov_tgt_desc *tgt = lov->lov_tgts[idx];
+
+		/* Disconnection is the last we know about an obd */
+		if (tgt != NULL && tgt->ltd_exp != NULL)
+			lov_del_target(obd, idx, 0, tgt->ltd_gen);
+	}
+	obd_putref(obd);
+
+out:
+	rc = class_disconnect(exp); /* bz 9811 */
+	RETURN(rc);
+}
+
+/* Apply notify event @ev to the target with @uuid.  Returns:
+ *
+ *  -EINVAL  : UUID can't be found among the connected LOV targets
+ *  any >= 0 : index of the matching target in lov_tgts
+ *
+ * (-ENOTCONN and -EBADF, once listed here, are not returned by this code.)
+ */
+static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid,
+			      enum obd_notify_event ev)
+{
+	struct lov_obd *lov = &obd->u.lov;
+	struct lov_tgt_desc *tgt;
+	int index, activate, active;
+	ENTRY;
+
+	CDEBUG(D_INFO, "Searching in lov %p for uuid %s event(%d)\n",
+	       lov, uuid->uuid, ev);
+
+	obd_getref(obd);
+	for (index = 0; index < lov->desc.ld_tgt_count; index++) {
+		tgt = lov->lov_tgts[index];
+		if (!tgt)
+			continue;
+		/*
+		 * LU-642, initially inactive OSC could miss the obd_connect,
+		 * we make up for it here.
+		 */
+		if (ev == OBD_NOTIFY_ACTIVATE && tgt->ltd_exp == NULL &&
+		    obd_uuid_equals(uuid, &tgt->ltd_uuid)) {
+			struct obd_uuid lov_osc_uuid = {"LOV_OSC_UUID"};
+
+			obd_connect(NULL, &tgt->ltd_exp, tgt->ltd_obd,
+				    &lov_osc_uuid, &lov->lov_ocd, NULL);
+		}
+		if (!tgt->ltd_exp)
+			continue;
+
+		CDEBUG(D_INFO, "lov idx %d is %s conn "LPX64"\n",
+		       index, obd_uuid2str(&tgt->ltd_uuid),
+		       tgt->ltd_exp->exp_handle.h_cookie);
+		if (obd_uuid_equals(uuid, &tgt->ltd_uuid))
+			break;
+	}
+
+	if (index == lov->desc.ld_tgt_count)
+		GOTO(out, index = -EINVAL);
+
+	if (ev == OBD_NOTIFY_DEACTIVATE || ev == OBD_NOTIFY_ACTIVATE) {
+		activate = (ev == OBD_NOTIFY_ACTIVATE) ? 1 : 0;
+
+		if (lov->lov_tgts[index]->ltd_activate == activate) {
+			CDEBUG(D_INFO, "OSC %s already %sactivate!\n",
+			       uuid->uuid, activate ? "" : "de");
+		} else {
+			lov->lov_tgts[index]->ltd_activate = activate;
+			CDEBUG(D_CONFIG, "%sactivate OSC %s\n",
+			       activate ? "" : "de", obd_uuid2str(uuid));
+		}
+
+	} else if (ev == OBD_NOTIFY_INACTIVE || ev == OBD_NOTIFY_ACTIVE) {
+		active = (ev == OBD_NOTIFY_ACTIVE) ? 1 : 0;
+
+		if (lov->lov_tgts[index]->ltd_active == active) {
+			CDEBUG(D_INFO, "OSC %s already %sactive!\n",
+			       uuid->uuid, active ? "" : "in");
+			GOTO(out, index);
+		} else {
+			CDEBUG(D_CONFIG, "Marking OSC %s %sactive\n",
+			       obd_uuid2str(uuid), active ? "" : "in");
+		}
+
+		lov->lov_tgts[index]->ltd_active = active;
+		if (active) {
+			lov->desc.ld_active_tgt_count++;
+			lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 0;
+		} else {
+			lov->desc.ld_active_tgt_count--;
+			lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 1;
+		}
+	} else {
+		CERROR("Unknown event(%d) for uuid %s\n", ev, uuid->uuid);
+	}
+
+ out:
+	obd_putref(obd);
+	RETURN(index);
+}
+
+/* Handle a notification about @watched (or about all targets when @watched
+ * is NULL) and forward it to the observer of @obd.
+ *
+ * Runs under the read side of lov_notify_lock and does nothing (returns 0)
+ * while the LOV has no connections.  For the four activation/activity events
+ * @watched must be an OSC; the target state is updated first through
+ * lov_set_osc_active() and the resulting target index replaces @data.
+ *
+ * Returns 0 or a negative errno; for the NULL-@watched case the result of
+ * the last observer call is returned.
+ */
+static int lov_notify(struct obd_device *obd, struct obd_device *watched,
+		      enum obd_notify_event ev, void *data)
+{
+	int rc = 0;
+	struct lov_obd *lov = &obd->u.lov;
+	ENTRY;
+
+	down_read(&lov->lov_notify_lock);
+	if (!lov->lov_connects) {
+		up_read(&lov->lov_notify_lock);
+		RETURN(rc);
+	}
+
+	if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE ||
+	    ev == OBD_NOTIFY_ACTIVATE || ev == OBD_NOTIFY_DEACTIVATE) {
+		struct obd_uuid *uuid;
+
+		LASSERT(watched);
+
+		if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) {
+			up_read(&lov->lov_notify_lock);
+			CERROR("unexpected notification of %s %s!\n",
+			       watched->obd_type->typ_name,
+			       watched->obd_name);
+			RETURN(-EINVAL);
+		}
+		uuid = &watched->u.cli.cl_target_uuid;
+
+		/* Set OSC as active before notifying the observer, so the
+		 * observer can use the OSC normally.
+		 */
+		rc = lov_set_osc_active(obd, uuid, ev);
+		if (rc < 0) {
+			up_read(&lov->lov_notify_lock);
+			CERROR("event(%d) of %s failed: %d\n", ev,
+			       obd_uuid2str(uuid), rc);
+			RETURN(rc);
+		}
+		/* an activity event passes the lov target index as data;
+		 * rc holds that index until the observer call overwrites it */
+		data = &rc;
+	}
+
+	/* Pass the notification up the chain. */
+	if (watched) {
+		rc = obd_notify_observer(obd, watched, ev, data);
+	} else {
+		/* NULL watched means all osc's in the lov (only for syncs) */
+		/* a sync event sends the lov idx as data */
+		int i, is_sync;
+
+		data = &i;
+		is_sync = (ev == OBD_NOTIFY_SYNC) ||
+			  (ev == OBD_NOTIFY_SYNC_NONBLOCK);
+
+		obd_getref(obd);
+		for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+			if (!lov->lov_tgts[i])
+				continue;
+
+			/* don't send sync event if target not
+			 * connected/activated */
+			if (is_sync &&  !lov->lov_tgts[i]->ltd_active)
+				continue;
+
+			rc = obd_notify_observer(obd, lov->lov_tgts[i]->ltd_obd,
+						 ev, data);
+			if (rc) {
+				CERROR("%s: notify %s of %s failed %d\n",
+				       obd->obd_name,
+				       obd->obd_observer->obd_name,
+				       lov->lov_tgts[i]->ltd_obd->obd_name,
+				       rc);
+			}
+		}
+		obd_putref(obd);
+	}
+
+	up_read(&lov->lov_notify_lock);
+	RETURN(rc);
+}
+
+/* Register the OSC named by @uuidp as LOV target @index with generation @gen.
+ *
+ * The target array is grown on demand under lov_lock, a new descriptor is
+ * installed, and - if the LOV is already connected - the target is connected
+ * and announced right away.  If any step after the descriptor was installed
+ * fails, the target is scheduled for deletion again.
+ *
+ * Returns 0 on success, -EINVAL for a bad generation or unknown OSC,
+ * -EEXIST if @index is already occupied, -ENOMEM on allocation failure, or
+ * the error of the failing connect/notify step.
+ */
+static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
+			  __u32 index, int gen, int active)
+{
+	struct lov_obd *lov = &obd->u.lov;
+	struct lov_tgt_desc *tgt;
+	struct obd_device *tgt_obd;
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_CONFIG, "uuid:%s idx:%d gen:%d active:%d\n",
+	       uuidp->uuid, index, gen, active);
+
+	if (gen <= 0) {
+		CERROR("request to add OBD %s with invalid generation: %d\n",
+		       uuidp->uuid, gen);
+		RETURN(-EINVAL);
+	}
+
+	tgt_obd = class_find_client_obd(uuidp, LUSTRE_OSC_NAME,
+					&obd->obd_uuid);
+	if (tgt_obd == NULL)
+		RETURN(-EINVAL);
+
+	/* lov_lock covers the target array and ld_tgt_count updates below */
+	mutex_lock(&lov->lov_lock);
+
+	if ((index < lov->lov_tgt_size) && (lov->lov_tgts[index] != NULL)) {
+		tgt = lov->lov_tgts[index];
+		CERROR("UUID %s already assigned at LOV target index %d\n",
+		       obd_uuid2str(&tgt->ltd_uuid), index);
+		mutex_unlock(&lov->lov_lock);
+		RETURN(-EEXIST);
+	}
+
+	if (index >= lov->lov_tgt_size) {
+		/* We need to reallocate the lov target array. */
+		struct lov_tgt_desc **newtgts, **old = NULL;
+		__u32 newsize, oldsize = 0;
+
+		/* grow by doubling, starting from at least two slots.
+		 * NOTE(review): for an index near the top of the __u32 range
+		 * "index + 1" wraps and the shift can overflow - confirm that
+		 * callers bound the index. */
+		newsize = max(lov->lov_tgt_size, (__u32)2);
+		while (newsize < index + 1)
+			newsize = newsize << 1;
+		OBD_ALLOC(newtgts, sizeof(*newtgts) * newsize);
+		if (newtgts == NULL) {
+			mutex_unlock(&lov->lov_lock);
+			RETURN(-ENOMEM);
+		}
+
+		if (lov->lov_tgt_size) {
+			memcpy(newtgts, lov->lov_tgts, sizeof(*newtgts) *
+			       lov->lov_tgt_size);
+			old = lov->lov_tgts;
+			oldsize = lov->lov_tgt_size;
+		}
+
+		/* publish the new array, then release the old one.
+		 * NOTE(review): a read barrier follows two stores here; a
+		 * write barrier may have been intended - confirm. */
+		lov->lov_tgts = newtgts;
+		lov->lov_tgt_size = newsize;
+		smp_rmb();
+		if (old)
+			OBD_FREE(old, sizeof(*old) * oldsize);
+
+		CDEBUG(D_CONFIG, "tgts: %p size: %d\n",
+		       lov->lov_tgts, lov->lov_tgt_size);
+	}
+
+	OBD_ALLOC_PTR(tgt);
+	if (!tgt) {
+		mutex_unlock(&lov->lov_lock);
+		RETURN(-ENOMEM);
+	}
+
+	rc = lov_ost_pool_add(&lov->lov_packed, index, lov->lov_tgt_size);
+	if (rc) {
+		mutex_unlock(&lov->lov_lock);
+		OBD_FREE_PTR(tgt);
+		RETURN(rc);
+	}
+
+	/* fill in the descriptor before making it visible in the array */
+	tgt->ltd_uuid = *uuidp;
+	tgt->ltd_obd = tgt_obd;
+	/* XXX - add a sanity check on the generation number. */
+	tgt->ltd_gen = gen;
+	tgt->ltd_index = index;
+	tgt->ltd_activate = active;
+	lov->lov_tgts[index] = tgt;
+	if (index >= lov->desc.ld_tgt_count)
+		lov->desc.ld_tgt_count = index + 1;
+
+	mutex_unlock(&lov->lov_lock);
+
+	CDEBUG(D_CONFIG, "idx=%d ltd_gen=%d ld_tgt_count=%d\n",
+		index, tgt->ltd_gen, lov->desc.ld_tgt_count);
+
+	/* the result of this notification is not acted upon: rc is either
+	 * dropped by the RETURN(0) below or overwritten by the connect */
+	rc = obd_notify(obd, tgt_obd, OBD_NOTIFY_CREATE, &index);
+
+	if (lov->lov_connects == 0) {
+		/* lov_connect hasn't been called yet. We'll do the
+		   lov_connect_obd on this target when that fn first runs,
+		   because we don't know the connect flags yet. */
+		RETURN(0);
+	}
+
+	/* keep a reference on the LOV while connecting and notifying */
+	obd_getref(obd);
+
+	rc = lov_connect_obd(obd, index, active, &lov->lov_ocd);
+	if (rc)
+		GOTO(out, rc);
+
+	/* connect to administrative disabled ost */
+	if (!tgt->ltd_exp)
+		GOTO(out, rc = 0);
+
+	/* hand the client cache to the new target when one is set */
+	if (lov->lov_cache != NULL) {
+		rc = obd_set_info_async(NULL, tgt->ltd_exp,
+				sizeof(KEY_CACHE_SET), KEY_CACHE_SET,
+				sizeof(struct cl_client_cache), lov->lov_cache,
+				NULL);
+		if (rc < 0)
+			GOTO(out, rc);
+	}
+
+	rc = lov_notify(obd, tgt->ltd_exp->exp_obd,
+			active ? OBD_NOTIFY_CONNECT : OBD_NOTIFY_INACTIVE,
+			(void *)&index);
+
+out:
+	/* on failure, schedule the half-added target for deletion again */
+	if (rc) {
+		CERROR("add failed (%d), deleting %s\n", rc,
+		       obd_uuid2str(&tgt->ltd_uuid));
+		lov_del_target(obd, index, 0, 0);
+	}
+	obd_putref(obd);
+	RETURN(rc);
+}
+
+/* Mark the LOV target at @index for reaping.
+ *
+ * When @uuidp is given it must match the UUID stored for that target.  The
+ * target is only flagged (ltd_reap) and counted on the death row here; the
+ * existing code comment states that the real deletion happens from
+ * obd_putref.  Returns 0, or -EINVAL for a bad index, an empty slot or a
+ * UUID mismatch.
+ */
+int lov_del_target(struct obd_device *obd, __u32 index,
+		   struct obd_uuid *uuidp, int gen)
+{
+	struct lov_obd *lov = &obd->u.lov;
+	struct lov_tgt_desc *tgt;
+	int count = lov->desc.ld_tgt_count;
+	int rc = 0;
+	ENTRY;
+
+	if (index >= count) {
+		CERROR("LOV target index %d >= number of LOV OBDs %d.\n",
+		       index, count);
+		RETURN(-EINVAL);
+	}
+
+	/* taking the write side keeps any lov_notify() out while we work */
+	down_write(&lov->lov_notify_lock);
+	obd_getref(obd);
+
+	tgt = lov->lov_tgts[index];
+	if (tgt == NULL) {
+		CERROR("LOV target at index %d is not setup.\n", index);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	if (uuidp != NULL && !obd_uuid_equals(uuidp, &tgt->ltd_uuid)) {
+		CERROR("LOV target UUID %s at index %d doesn't match %s.\n",
+		       lov_uuid2str(lov, index), index,
+		       obd_uuid2str(uuidp));
+		GOTO(out, rc = -EINVAL);
+	}
+
+	CDEBUG(D_CONFIG, "uuid: %s idx: %d gen: %d exp: %p active: %d\n",
+	       lov_uuid2str(lov, index), index,
+	       tgt->ltd_gen, tgt->ltd_exp, tgt->ltd_active);
+
+	/* flag only; the target itself is torn down from obd_putref */
+	tgt->ltd_reap = 1;
+	lov->lov_death_row++;
+out:
+	obd_putref(obd);
+	up_write(&lov->lov_notify_lock);
+
+	RETURN(rc);
+}
+
+/* Tear down a target descriptor that was previously flagged for reaping:
+ * disconnect its export (if any), free the descriptor and manually clean up
+ * the OSC device that stood behind it.
+ */
+static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt)
+{
+	struct obd_device *osc_obd;
+
+	LASSERT(tgt);
+	LASSERT(tgt->ltd_reap);
+
+	/* remember the OSC now; tgt is gone by the time we clean it up */
+	osc_obd = class_exp2obd(tgt->ltd_exp);
+
+	CDEBUG(D_CONFIG, "Removing tgt %s : %s\n",
+	       tgt->ltd_uuid.uuid,
+	       osc_obd ? osc_obd->obd_name : "<no obd>");
+
+	if (tgt->ltd_exp != NULL)
+		lov_disconnect_obd(obd, tgt);
+
+	OBD_FREE_PTR(tgt);
+
+	/* There are no cleanup logs that would take care of the osc's, so
+	   the cleanup is done by hand here.  lov_cleanup cannot do it,
+	   because the only reference to the osc was just dropped. */
+	if (osc_obd != NULL)
+		class_manual_cleanup(osc_obd);
+}
+
+/* Sanitize a default stripe size in place: values below
+ * LOV_DEFAULT_STRIPE_SIZE are raised to it, other values are rounded down to
+ * a multiple of LOV_MIN_STRIPE_SIZE.  A console warning is printed whenever
+ * the value is changed.
+ */
+void lov_fix_desc_stripe_size(__u64 *val)
+{
+	if (*val < LOV_DEFAULT_STRIPE_SIZE) {
+		LCONSOLE_WARN("Increasing default stripe size to min %u\n",
+			      LOV_DEFAULT_STRIPE_SIZE);
+		*val = LOV_DEFAULT_STRIPE_SIZE;
+		return;
+	}
+
+	/* already aligned: nothing to fix */
+	if ((*val & (LOV_MIN_STRIPE_SIZE - 1)) == 0)
+		return;
+
+	*val &= ~(LOV_MIN_STRIPE_SIZE - 1);
+	LCONSOLE_WARN("Changing default stripe size to "LPU64" (a "
+		      "multiple of %u)\n",
+		      *val, LOV_MIN_STRIPE_SIZE);
+}
+
+/* Replace a zero default stripe count with 1; other values are kept. */
+void lov_fix_desc_stripe_count(__u32 *val)
+{
+	if (!*val)
+		*val = 1;
+}
+
+/* Reset an unrecognised stripe pattern to 0 with a console warning; only 0
+ * and LOV_PATTERN_RAID0 are accepted (same rule as lov_setstripe, per the
+ * original comment).
+ */
+void lov_fix_desc_pattern(__u32 *val)
+{
+	if (*val == 0 || *val == LOV_PATTERN_RAID0)
+		return;
+
+	LCONSOLE_WARN("Unknown stripe pattern: %#x\n", *val);
+	*val = 0;
+}
+
+/* Replace an unset (zero) qos_maxage with QOS_DEFAULT_MAXAGE. */
+void lov_fix_desc_qos_maxage(__u32 *val)
+{
+	if (!*val)
+		*val = QOS_DEFAULT_MAXAGE;
+}
+
+/* Sanitize the tunable fields of a LOV descriptor in place by running each
+ * per-field fixer: stripe size, stripe count, pattern and qos_maxage.
+ */
+void lov_fix_desc(struct lov_desc *desc)
+{
+	/* striping defaults */
+	lov_fix_desc_stripe_size(&desc->ld_default_stripe_size);
+	lov_fix_desc_stripe_count(&desc->ld_default_stripe_count);
+	lov_fix_desc_pattern(&desc->ld_pattern);
+
+	/* QOS tunable */
+	lov_fix_desc_qos_maxage(&desc->ld_qos_maxage);
+}
+
+/* Set up a LOV device from the configuration record @lcfg.
+ *
+ * Buffer 1 of @lcfg must hold a struct lov_desc; it is byte-swapped if its
+ * magic is in the opposite endianness, sanitized with lov_fix_desc() and
+ * copied into the device.  The locks, the pools hash, the pool list, the
+ * packed OST pool and the proc entries are then initialized.
+ *
+ * Returns 0 on success, -EINVAL for a missing/short/bad descriptor, -ENOMEM
+ * if the pools hash cannot be created, or the error of lov_ost_pool_init().
+ * On the latter failure the pools hash is released again.
+ */
+int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	struct lprocfs_static_vars lvars = { 0 };
+	struct lov_desc *desc;
+	struct lov_obd *lov = &obd->u.lov;
+	int rc;
+	ENTRY;
+
+	if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
+		CERROR("LOV setup requires a descriptor\n");
+		RETURN(-EINVAL);
+	}
+
+	desc = (struct lov_desc *)lustre_cfg_buf(lcfg, 1);
+
+	if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) {
+		CERROR("descriptor size wrong: %d > %d\n",
+		       (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1));
+		RETURN(-EINVAL);
+	}
+
+	/* accept a descriptor written with the opposite byte order */
+	if (desc->ld_magic != LOV_DESC_MAGIC) {
+		if (desc->ld_magic == __swab32(LOV_DESC_MAGIC)) {
+			CDEBUG(D_OTHER, "%s: Swabbing lov desc %p\n",
+			       obd->obd_name, desc);
+			lustre_swab_lov_desc(desc);
+		} else {
+			CERROR("%s: Bad lov desc magic: %#x\n",
+			       obd->obd_name, desc->ld_magic);
+			RETURN(-EINVAL);
+		}
+	}
+
+	lov_fix_desc(desc);
+
+	/* no target is active until it is connected and reported so */
+	desc->ld_active_tgt_count = 0;
+	lov->desc = *desc;
+	lov->lov_tgt_size = 0;
+
+	mutex_init(&lov->lov_lock);
+	atomic_set(&lov->lov_refcount, 0);
+	lov->lov_sp_me = LUSTRE_SP_CLI;
+
+	init_rwsem(&lov->lov_notify_lock);
+
+	lov->lov_pools_hash_body = cfs_hash_create("POOLS", HASH_POOLS_CUR_BITS,
+						   HASH_POOLS_MAX_BITS,
+						   HASH_POOLS_BKT_BITS, 0,
+						   CFS_HASH_MIN_THETA,
+						   CFS_HASH_MAX_THETA,
+						   &pool_hash_operations,
+						   CFS_HASH_DEFAULT);
+	/* lov_cleanup() dereferences the hash unconditionally, so a failed
+	 * creation must fail the setup */
+	if (lov->lov_pools_hash_body == NULL)
+		RETURN(-ENOMEM);
+	INIT_LIST_HEAD(&lov->lov_pool_list);
+	lov->lov_pool_count = 0;
+	rc = lov_ost_pool_init(&lov->lov_packed, 0);
+	if (rc)
+		GOTO(out, rc);
+
+	lprocfs_lov_init_vars(&lvars);
+	lprocfs_obd_setup(obd, lvars.obd_vars);
+#ifdef LPROCFS
+	/* a missing proc file is only worth a warning, not a failure */
+	rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd",
+				0444, &lov_proc_target_fops, obd);
+	if (rc)
+		CWARN("Error adding the target_obd file\n");
+#endif
+	lov->lov_pool_proc_entry = lprocfs_register("pools",
+						    obd->obd_proc_entry,
+						    NULL, NULL);
+
+	RETURN(0);
+
+out:
+	/* undo the hash creation so a failed setup does not leak it */
+	cfs_hash_putref(lov->lov_pools_hash_body);
+	lov->lov_pools_hash_body = NULL;
+	RETURN(rc);
+}
+
+/* Pre-cleanup hook of the LOV device.
+ *
+ * OBD_CLEANUP_EARLY  : forward the early pre-cleanup to the OBD behind every
+ *                      active target (results are not collected).
+ * OBD_CLEANUP_EXPORTS: finish the llog subsystem and return its result.
+ * Other stages are ignored and yield 0.
+ */
+static int lov_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+	struct lov_obd *lov = &obd->u.lov;
+	struct lov_tgt_desc *tgt;
+	int idx;
+	int rc = 0;
+	ENTRY;
+
+	switch (stage) {
+	case OBD_CLEANUP_EARLY:
+		for (idx = 0; idx < lov->desc.ld_tgt_count; idx++) {
+			tgt = lov->lov_tgts[idx];
+			if (tgt != NULL && tgt->ltd_active)
+				obd_precleanup(class_exp2obd(tgt->ltd_exp),
+					       OBD_CLEANUP_EARLY);
+		}
+		break;
+	case OBD_CLEANUP_EXPORTS:
+		rc = obd_llog_finish(obd, 0);
+		if (rc != 0)
+			CERROR("failed to cleanup llogging subsystems\n");
+		break;
+	}
+	RETURN(rc);
+}
+
+/* Final cleanup of a LOV device: delete all pools, drop the pools hash and
+ * the packed OST pool, remove the proc entries, schedule every remaining
+ * target for deletion and free the target array.  Always returns 0.
+ */
+static int lov_cleanup(struct obd_device *obd)
+{
+	struct lov_obd *lov = &obd->u.lov;
+	struct pool_desc *pool, *next;
+	ENTRY;
+
+	/* the _safe iterator is used because lov_pool_del() is called on the
+	 * entry currently being visited */
+	list_for_each_entry_safe(pool, next, &lov->lov_pool_list, pool_list) {
+		CDEBUG(D_INFO, "delete pool %p\n", pool);
+		/* In the function below, .hs_keycmp resolves to
+		 * pool_hashkey_keycmp() */
+		/* coverity[overrun-buffer-val] */
+		lov_pool_del(obd, pool->pool_name);
+	}
+	cfs_hash_putref(lov->lov_pools_hash_body);
+	lov_ost_pool_free(&lov->lov_packed);
+
+	lprocfs_obd_cleanup(obd);
+
+	if (lov->lov_tgts != NULL) {
+		struct lov_tgt_desc *tgt;
+		int i;
+
+		obd_getref(obd);
+		for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+			tgt = lov->lov_tgts[i];
+			if (tgt == NULL)
+				continue;
+
+			/* Inactive targets may never have connected.  An
+			 * active target, or a non-zero LOV refcount, is
+			 * unexpected at this point: such targets should have
+			 * been removed in the disconnect. */
+			if (tgt->ltd_active ||
+			    atomic_read(&lov->lov_refcount)) {
+				CERROR("lov tgt %d not cleaned!"
+				       " deathrow=%d, lovrc=%d\n",
+				       i, lov->lov_death_row,
+				       atomic_read(&lov->lov_refcount));
+			}
+			/* every remaining target is scheduled for deletion */
+			lov_del_target(obd, i, 0, 0);
+		}
+		obd_putref(obd);
+
+		OBD_FREE(lov->lov_tgts, sizeof(*lov->lov_tgts) *
+			 lov->lov_tgt_size);
+		lov->lov_tgt_size = 0;
+	}
+	RETURN(0);
+}
+
+/* Process one LOV configuration record.
+ *
+ * LCFG_LOV_ADD_OBD / LCFG_LOV_ADD_INA / LCFG_LOV_DEL_OBD: buffer 1 is the
+ * target UUID, buffer 2 the target index and buffer 3 the generation.  The
+ * parsed index and generation are stored through @indexp and @genp and the
+ * target is added (active or inactive) or scheduled for deletion.
+ * LCFG_PARAM: the record is handed to the proc parameter parser.
+ * LCFG_POOL_*: accepted here without any action.
+ *
+ * Returns 0 on success or a negative errno; unknown commands and unparsable
+ * buffers yield -EINVAL.
+ */
+int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg,
+			    __u32 *indexp, int *genp)
+{
+	struct obd_uuid obd_uuid;
+	int cmd;
+	int rc = 0;
+	ENTRY;
+
+	switch (cmd = lcfg->lcfg_command) {
+	case LCFG_LOV_ADD_OBD:
+	case LCFG_LOV_ADD_INA:
+	case LCFG_LOV_DEL_OBD: {
+		__u32 index;
+		int gen;
+		/* lov_modify_tgts add  0:lov_mdsA  1:ost1_UUID  2:0  3:1 */
+		if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid))
+			GOTO(out, rc = -EINVAL);
+
+		obd_str2uuid(&obd_uuid,  lustre_cfg_buf(lcfg, 1));
+
+		/* the index is unsigned: parse it as such so that the format
+		 * matches the __u32 destination and a negative value is not
+		 * turned into a huge index */
+		if (sscanf(lustre_cfg_buf(lcfg, 2), "%u", indexp) != 1)
+			GOTO(out, rc = -EINVAL);
+		if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", genp) != 1)
+			GOTO(out, rc = -EINVAL);
+		index = *indexp;
+		gen = *genp;
+		if (cmd == LCFG_LOV_ADD_OBD)
+			rc = lov_add_target(obd, &obd_uuid, index, gen, 1);
+		else if (cmd == LCFG_LOV_ADD_INA)
+			rc = lov_add_target(obd, &obd_uuid, index, gen, 0);
+		else
+			rc = lov_del_target(obd, index, &obd_uuid, gen);
+		GOTO(out, rc);
+	}
+	case LCFG_PARAM: {
+		struct lprocfs_static_vars lvars = { 0 };
+
+		lprocfs_lov_init_vars(&lvars);
+
+		rc = class_process_proc_param(PARAM_LOV, lvars.obd_vars,
+					      lcfg, obd);
+		/* a positive result is not an error for the caller */
+		if (rc > 0)
+			rc = 0;
+		GOTO(out, rc);
+	}
+	case LCFG_POOL_NEW:
+	case LCFG_POOL_ADD:
+	case LCFG_POOL_DEL:
+	case LCFG_POOL_REM:
+		/* pool commands need no work at this level */
+		GOTO(out, rc);
+
+	default: {
+		CERROR("Unknown command: %d\n", lcfg->lcfg_command);
+		GOTO(out, rc = -EINVAL);
+
+	}
+	}
+out:
+	RETURN(rc);
+}
+
+/* Recreate one specific object on the OST whose index is carried in
+ * src_oa->o_nlink.
+ *
+ * The striping in *@ea must contain a stripe on that OST and the object id
+ * of that stripe must equal the id in @src_oa; otherwise -EINVAL is
+ * returned.  A scratch stripe md is allocated for the obd_create() call and
+ * freed again before returning.
+ */
+static int lov_recreate(struct obd_export *exp, struct obdo *src_oa,
+			struct lov_stripe_md **ea, struct obd_trans_info *oti)
+{
+	struct lov_obd *lov = &exp->exp_obd->u.lov;
+	struct lov_stripe_md *obj_mdp, *lsm;
+	unsigned ost_idx;
+	int rc, i;
+	ENTRY;
+
+	LASSERT(src_oa->o_valid & OBD_MD_FLFLAGS &&
+		src_oa->o_flags & OBD_FL_RECREATE_OBJS);
+
+	OBD_ALLOC(obj_mdp, sizeof(*obj_mdp));
+	if (obj_mdp == NULL)
+		RETURN(-ENOMEM);
+
+	ost_idx = src_oa->o_nlink;
+	lsm = *ea;
+	if (lsm == NULL)
+		GOTO(out, rc = -EINVAL);
+	if (ost_idx >= lov->desc.ld_tgt_count || !lov->lov_tgts[ost_idx])
+		GOTO(out, rc = -EINVAL);
+
+	/* locate the first stripe that lives on the requested OST */
+	for (i = 0; i < lsm->lsm_stripe_count; i++)
+		if (lsm->lsm_oinfo[i]->loi_ost_idx == ost_idx)
+			break;
+
+	if (i == lsm->lsm_stripe_count)
+		GOTO(out, rc = -EINVAL);
+
+	/* the stripe must describe the very object we are asked to recreate */
+	if (ostid_id(&lsm->lsm_oinfo[i]->loi_oi) != ostid_id(&src_oa->o_oi))
+		GOTO(out, rc = -EINVAL);
+
+	rc = obd_create(NULL, lov->lov_tgts[ost_idx]->ltd_exp,
+			src_oa, &obj_mdp, oti);
+out:
+	OBD_FREE(obj_mdp, sizeof(*obj_mdp));
+	RETURN(rc);
+}
+
+/* Create entry point of the LOV; the LOV expects oa->o_id to be set to the
+ * LOV object id.
+ *
+ * Only the "recreate objects" request is acted upon here (see
+ * lov_recreate()); any other request returns 0 without doing anything.
+ * Returns -EINVAL without an export and -EIO when no target is active.
+ */
+static int lov_create(const struct lu_env *env, struct obd_export *exp,
+		      struct obdo *src_oa, struct lov_stripe_md **ea,
+		      struct obd_trans_info *oti)
+{
+	struct obd_device *obd;
+	struct lov_obd *lov;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(ea != NULL);
+	if (exp == NULL)
+		RETURN(-EINVAL);
+
+	/* orphan deletion must not be routed through the LOV any more */
+	if ((src_oa->o_valid & OBD_MD_FLFLAGS) &&
+	    src_oa->o_flags == OBD_FL_DELORPHAN)
+		LBUG();
+
+	obd = exp->exp_obd;
+	lov = &obd->u.lov;
+	if (lov->desc.ld_active_tgt_count == 0)
+		RETURN(-EIO);
+
+	obd_getref(obd);
+	/* Recreate a specific object id at the given OST index */
+	if ((src_oa->o_valid & OBD_MD_FLFLAGS) &&
+	    (src_oa->o_flags & OBD_FL_RECREATE_OBJS))
+		rc = lov_recreate(exp, src_oa, ea, oti);
+	obd_putref(obd);
+
+	RETURN(rc);
+}
+
+/* Assert that @lsmp is non-NULL and carries one of the known LOV magics
+ * (LOV_MAGIC_V1 or LOV_MAGIC_V3).  @lsmp is evaluated more than once. */
+#define ASSERT_LSM_MAGIC(lsmp)						\
+do {									\
+	LASSERT((lsmp) != NULL);					\
+	LASSERTF(((lsmp)->lsm_magic == LOV_MAGIC_V1 ||			\
+		 (lsmp)->lsm_magic == LOV_MAGIC_V3),			\
+		 "%p->lsm_magic=%x\n", (lsmp), (lsmp)->lsm_magic);	\
+} while (0)
+
+/* Destroy every sub-object of the striped object described by @lsm.
+ *
+ * A destroy request is issued for each entry of the prepared request set;
+ * failures are logged and the first one is remembered, but the remaining
+ * sub-objects are still processed.  If all destroys succeeded, the
+ * lsm_destroy method matching the lsm magic is called.  Returns the first
+ * error seen, else the result of finishing the request set.
+ */
+static int lov_destroy(const struct lu_env *env, struct obd_export *exp,
+		       struct obdo *oa, struct lov_stripe_md *lsm,
+		       struct obd_trans_info *oti, struct obd_export *md_exp,
+		       void *capa)
+{
+	struct lov_request_set *set;
+	struct lov_request *req;
+	struct obd_info oinfo;
+	struct lov_obd *lov;
+	int rc = 0, err = 0;
+	ENTRY;
+
+	ASSERT_LSM_MAGIC(lsm);
+
+	if (exp == NULL || exp->exp_obd == NULL)
+		RETURN(-ENODEV);
+
+	/* unlink cookies can only be handed out through a valid oti */
+	if (oa->o_valid & OBD_MD_FLCOOKIE) {
+		LASSERT(oti);
+		LASSERT(oti->oti_logcookies);
+	}
+
+	lov = &exp->exp_obd->u.lov;
+	obd_getref(exp->exp_obd);
+	rc = lov_prep_destroy_set(exp, &oinfo, oa, lsm, oti, &set);
+	if (rc != 0)
+		GOTO(out, rc);
+
+	list_for_each_entry(req, &set->set_list, rq_link) {
+		/* point the oti at the cookie slot of this stripe */
+		if (oa->o_valid & OBD_MD_FLCOOKIE)
+			oti->oti_logcookies = set->set_cookies + req->rq_stripe;
+
+		err = obd_destroy(env, lov->lov_tgts[req->rq_idx]->ltd_exp,
+				  req->rq_oi.oi_oa, NULL, oti, NULL, capa);
+		err = lov_update_common_set(set, req, err);
+		if (err == 0)
+			continue;
+
+		CERROR("%s: destroying objid "DOSTID" subobj "
+		       DOSTID" on OST idx %d: rc = %d\n",
+		       exp->exp_obd->obd_name, POSTID(&oa->o_oi),
+		       POSTID(&req->rq_oi.oi_oa->o_oi),
+		       req->rq_idx, err);
+		/* keep going, but remember the first failure */
+		if (rc == 0)
+			rc = err;
+	}
+
+	if (rc == 0) {
+		LASSERT(lsm_op_find(lsm->lsm_magic) != NULL);
+		rc = lsm_op_find(lsm->lsm_magic)->lsm_destroy(lsm, oa, md_exp);
+	}
+	err = lov_fini_destroy_set(set);
+out:
+	obd_putref(exp->exp_obd);
+	RETURN(rc ? rc : err);
+}
+
+/* Synchronously fetch the attributes of every sub-object of the striped
+ * object in @oinfo.
+ *
+ * The loop stops at the first sub-request that the request set reports as
+ * failed.  The request set is always finished; the loop error, if any,
+ * takes precedence over the result of finishing the set.
+ */
+static int lov_getattr(const struct lu_env *env, struct obd_export *exp,
+		       struct obd_info *oinfo)
+{
+	struct lov_request_set *set;
+	struct lov_request *req;
+	struct lov_obd *lov;
+	int err = 0, rc = 0;
+	ENTRY;
+
+	LASSERT(oinfo);
+	ASSERT_LSM_MAGIC(oinfo->oi_md);
+
+	if (exp == NULL || exp->exp_obd == NULL)
+		RETURN(-ENODEV);
+
+	lov = &exp->exp_obd->u.lov;
+
+	rc = lov_prep_getattr_set(exp, oinfo, &set);
+	if (rc != 0)
+		RETURN(rc);
+
+	list_for_each_entry(req, &set->set_list, rq_link) {
+		CDEBUG(D_INFO, "objid "DOSTID"[%d] has subobj "DOSTID" at idx"
+		       " %u\n", POSTID(&oinfo->oi_oa->o_oi), req->rq_stripe,
+		       POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx);
+
+		rc = obd_getattr(env, lov->lov_tgts[req->rq_idx]->ltd_exp,
+				 &req->rq_oi);
+		err = lov_update_common_set(set, req, rc);
+		if (err == 0)
+			continue;
+
+		CERROR("%s: getattr objid "DOSTID" subobj "
+		       DOSTID" on OST idx %d: rc = %d\n",
+		       exp->exp_obd->obd_name,
+		       POSTID(&oinfo->oi_oa->o_oi),
+		       POSTID(&req->rq_oi.oi_oa->o_oi),
+		       req->rq_idx, err);
+		break;
+	}
+
+	rc = lov_fini_getattr_set(set);
+	RETURN(err ? err : rc);
+}
+
+/* Completion callback for lov_getattr_async(): finish the LOV request set
+ * passed in @data.  When the async operation failed (@rc != 0) the completion
+ * count is cleared first so that no attribute merge is done.  Returns @rc if
+ * set, else the result of finishing the set.
+ */
+static int lov_getattr_interpret(struct ptlrpc_request_set *rqset,
+				 void *data, int rc)
+{
+	struct lov_request_set *lovset = data;
+	int err;
+	ENTRY;
+
+	if (rc != 0)
+		atomic_set(&lovset->set_completes, 0);
+
+	err = lov_fini_getattr_set(lovset);
+	RETURN(rc ? rc : err);
+}
+
+/* Queue an asynchronous getattr for every sub-object of the striped object
+ * in @oinfo on @rqset.
+ *
+ * If requests were queued, lov_getattr_interpret() is installed as the set's
+ * interpret callback and takes over the LOV request set.  Otherwise (error,
+ * or nothing queued) the LOV request set is finished here.  Returns 0 or a
+ * negative errno.
+ */
+static int lov_getattr_async(struct obd_export *exp, struct obd_info *oinfo,
+			      struct ptlrpc_request_set *rqset)
+{
+	struct lov_request_set *lovset;
+	struct lov_obd *lov;
+	struct list_head *pos;
+	struct lov_request *req;
+	int rc = 0, err;
+	ENTRY;
+
+	LASSERT(oinfo);
+	ASSERT_LSM_MAGIC(oinfo->oi_md);
+
+	if (!exp || !exp->exp_obd)
+		RETURN(-ENODEV);
+
+	lov = &exp->exp_obd->u.lov;
+
+	rc = lov_prep_getattr_set(exp, oinfo, &lovset);
+	if (rc)
+		RETURN(rc);
+
+	CDEBUG(D_INFO, "objid "DOSTID": %ux%u byte stripes\n",
+	       POSTID(&oinfo->oi_md->lsm_oi), oinfo->oi_md->lsm_stripe_count,
+	       oinfo->oi_md->lsm_stripe_size);
+
+	list_for_each(pos, &lovset->set_list) {
+		req = list_entry(pos, struct lov_request, rq_link);
+
+		/* same wording as the message in lov_getattr(): keep a blank
+		 * between "idx" and the index */
+		CDEBUG(D_INFO, "objid "DOSTID"[%d] has subobj "DOSTID" at idx"
+		       " %u\n", POSTID(&oinfo->oi_oa->o_oi), req->rq_stripe,
+		       POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx);
+		rc = obd_getattr_async(lov->lov_tgts[req->rq_idx]->ltd_exp,
+				       &req->rq_oi, rqset);
+		if (rc) {
+			CERROR("%s: getattr objid "DOSTID" subobj "
+			       DOSTID" on OST idx %d: rc = %d\n",
+			       exp->exp_obd->obd_name,
+			       POSTID(&oinfo->oi_oa->o_oi),
+			       POSTID(&req->rq_oi.oi_oa->o_oi),
+			       req->rq_idx, rc);
+			GOTO(out, rc);
+		}
+	}
+
+	/* requests are in flight: the interpret callback finishes the set */
+	if (!list_empty(&rqset->set_requests)) {
+		LASSERT(rc == 0);
+		LASSERT (rqset->set_interpret == NULL);
+		rqset->set_interpret = lov_getattr_interpret;
+		rqset->set_arg = (void *)lovset;
+		RETURN(rc);
+	}
+out:
+	/* on failure suppress the attribute merge, as the interpret
+	 * callback does */
+	if (rc)
+		atomic_set(&lovset->set_completes, 0);
+	err = lov_fini_getattr_set(lovset);
+	RETURN(rc ? rc : err);
+}
+
+/* Synchronously set attributes on every sub-object of the striped object in
+ * @oinfo.
+ *
+ * Only the attribute bits listed in the assertion below may be set in
+ * o_valid.  Every sub-request is issued even if an earlier one failed;
+ * failures reported by the request set are logged.  The request set is
+ * finished before returning.
+ */
+static int lov_setattr(const struct lu_env *env, struct obd_export *exp,
+		       struct obd_info *oinfo, struct obd_trans_info *oti)
+{
+	struct lov_request_set *set;
+	struct lov_request *req;
+	struct lov_obd *lov;
+	int err = 0, rc = 0;
+	ENTRY;
+
+	LASSERT(oinfo);
+	ASSERT_LSM_MAGIC(oinfo->oi_md);
+
+	if (exp == NULL || exp->exp_obd == NULL)
+		RETURN(-ENODEV);
+
+	/* for now, we only expect the following updates here */
+	LASSERT(!(oinfo->oi_oa->o_valid & ~(OBD_MD_FLID | OBD_MD_FLTYPE |
+					    OBD_MD_FLMODE | OBD_MD_FLATIME |
+					    OBD_MD_FLMTIME | OBD_MD_FLCTIME |
+					    OBD_MD_FLFLAGS | OBD_MD_FLSIZE |
+					    OBD_MD_FLGROUP | OBD_MD_FLUID |
+					    OBD_MD_FLGID | OBD_MD_FLFID |
+					    OBD_MD_FLGENER)));
+	lov = &exp->exp_obd->u.lov;
+	rc = lov_prep_setattr_set(exp, oinfo, oti, &set);
+	if (rc != 0)
+		RETURN(rc);
+
+	list_for_each_entry(req, &set->set_list, rq_link) {
+		rc = obd_setattr(env, lov->lov_tgts[req->rq_idx]->ltd_exp,
+				 &req->rq_oi, NULL);
+		err = lov_update_setattr_set(set, req, rc);
+		if (err == 0)
+			continue;
+
+		CERROR("%s: setattr objid "DOSTID" subobj "
+		       DOSTID" on OST idx %d: rc = %d\n",
+		       exp->exp_obd->obd_name,
+		       POSTID(&set->set_oi->oi_oa->o_oi),
+		       POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx,
+		       err);
+		if (rc == 0)
+			rc = err;
+	}
+
+	err = lov_fini_setattr_set(set);
+	if (rc == 0)
+		rc = err;
+	RETURN(rc);
+}
+
+/* Completion callback for lov_setattr_async(): finish the LOV request set
+ * passed in @data, clearing its completion count first when the async
+ * operation failed.  Returns @rc if set, else the result of finishing the
+ * set.
+ */
+static int lov_setattr_interpret(struct ptlrpc_request_set *rqset,
+				 void *data, int rc)
+{
+	struct lov_request_set *lovset = data;
+	int err;
+	ENTRY;
+
+	if (rc != 0)
+		atomic_set(&lovset->set_completes, 0);
+
+	err = lov_fini_setattr_set(lovset);
+	RETURN(rc ? rc : err);
+}
+
+/* Queue an asynchronous setattr for every sub-object of the striped object
+ * in @oinfo.
+ *
+ * If @oti is given, the request goes from MDS and responses from OSTs are not
+ * needed. Otherwise, a client is waiting for responses.
+ *
+ * When requests were queued on @rqset, lov_setattr_interpret() is installed
+ * as its interpret callback and takes over the LOV request set; on error, or
+ * when nothing is waiting for responses, the set is finished here.  Returns 0
+ * or a negative errno.
+ */
+static int lov_setattr_async(struct obd_export *exp, struct obd_info *oinfo,
+			     struct obd_trans_info *oti,
+			     struct ptlrpc_request_set *rqset)
+{
+	struct lov_request_set *set;
+	struct lov_request *req;
+	struct list_head *pos;
+	struct lov_obd *lov;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(oinfo);
+	ASSERT_LSM_MAGIC(oinfo->oi_md);
+	/* cookies can only be handed out through a valid oti */
+	if (oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE) {
+		LASSERT(oti);
+		LASSERT(oti->oti_logcookies);
+	}
+
+	if (!exp || !exp->exp_obd)
+		RETURN(-ENODEV);
+
+	lov = &exp->exp_obd->u.lov;
+	rc = lov_prep_setattr_set(exp, oinfo, oti, &set);
+	if (rc)
+		RETURN(rc);
+
+	CDEBUG(D_INFO, "objid "DOSTID": %ux%u byte stripes\n",
+	       POSTID(&oinfo->oi_md->lsm_oi),
+	       oinfo->oi_md->lsm_stripe_count,
+	       oinfo->oi_md->lsm_stripe_size);
+
+	list_for_each(pos, &set->set_list) {
+		req = list_entry(pos, struct lov_request, rq_link);
+
+		/* point the oti at the cookie slot of this stripe */
+		if (oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE)
+			oti->oti_logcookies = set->set_cookies + req->rq_stripe;
+
+		/* keep a blank between "idx" and the index in the message */
+		CDEBUG(D_INFO, "objid "DOSTID"[%d] has subobj "DOSTID" at idx"
+		       " %u\n", POSTID(&oinfo->oi_oa->o_oi), req->rq_stripe,
+		       POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx);
+
+		rc = obd_setattr_async(lov->lov_tgts[req->rq_idx]->ltd_exp,
+				       &req->rq_oi, oti, rqset);
+		if (rc) {
+			CERROR("error: setattr objid "DOSTID" subobj "
+			       DOSTID" on OST idx %d: rc = %d\n",
+			       POSTID(&set->set_oi->oi_oa->o_oi),
+			       POSTID(&req->rq_oi.oi_oa->o_oi),
+			       req->rq_idx, rc);
+			break;
+		}
+	}
+
+	/* If we are not waiting for responses on async requests, return. */
+	if (rc || !rqset || list_empty(&rqset->set_requests)) {
+		int err;
+		if (rc)
+			atomic_set(&set->set_completes, 0);
+		err = lov_fini_setattr_set(set);
+		RETURN(rc ? rc : err);
+	}
+
+	LASSERT(rqset->set_interpret == NULL);
+	rqset->set_interpret = lov_setattr_interpret;
+	rqset->set_arg = (void *)set;
+
+	RETURN(0);
+}
+
+/* Completion callback for lov_punch(): finish the LOV request set passed in
+ * @data, clearing its completion count first when the async operation
+ * failed.  Returns @rc if set, else the result of finishing the set.
+ */
+static int lov_punch_interpret(struct ptlrpc_request_set *rqset,
+			       void *data, int rc)
+{
+	struct lov_request_set *lovset = data;
+	int err;
+	ENTRY;
+
+	if (rc != 0)
+		atomic_set(&lovset->set_completes, 0);
+
+	err = lov_fini_punch_set(lovset);
+	RETURN(rc ? rc : err);
+}
+
+/* Issue a punch to every sub-object in the prepared request set.
+ *
+ * FIXME: maybe we'll just make one node the authoritative attribute node, then
+ * we can send this 'punch' to just the authoritative node and the nodes
+ * that the punch will affect.
+ *
+ * The loop stops at the first failing sub-request.  If requests were queued
+ * on @rqset, lov_punch_interpret() is installed and takes over the LOV
+ * request set; otherwise the set is finished here.
+ */
+static int lov_punch(const struct lu_env *env, struct obd_export *exp,
+		     struct obd_info *oinfo, struct obd_trans_info *oti,
+		     struct ptlrpc_request_set *rqset)
+{
+	struct lov_request_set *set;
+	struct lov_request *req;
+	struct lov_obd *lov;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(oinfo);
+	ASSERT_LSM_MAGIC(oinfo->oi_md);
+
+	if (exp == NULL || exp->exp_obd == NULL)
+		RETURN(-ENODEV);
+
+	lov = &exp->exp_obd->u.lov;
+	rc = lov_prep_punch_set(exp, oinfo, oti, &set);
+	if (rc != 0)
+		RETURN(rc);
+
+	list_for_each_entry(req, &set->set_list, rq_link) {
+		rc = obd_punch(env, lov->lov_tgts[req->rq_idx]->ltd_exp,
+			       &req->rq_oi, NULL, rqset);
+		if (rc == 0)
+			continue;
+
+		CERROR("%s: punch objid "DOSTID" subobj "DOSTID
+		       " on OST idx %d: rc = %d\n",
+		       exp->exp_obd->obd_name,
+		       POSTID(&set->set_oi->oi_oa->o_oi),
+		       POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx, rc);
+		break;
+	}
+
+	/* failure, or nothing in flight: finish the set right away */
+	if (rc != 0 || list_empty(&rqset->set_requests)) {
+		int err = lov_fini_punch_set(set);
+
+		RETURN(rc ? rc : err);
+	}
+
+	LASSERT(rqset->set_interpret == NULL);
+	rqset->set_interpret = lov_punch_interpret;
+	rqset->set_arg = (void *)set;
+
+	RETURN(0);
+}
+
+/* Completion callback for lov_sync(): finish the LOV request set passed in
+ * @data, clearing its completion count first when the async operation
+ * failed.  Returns @rc if set, else the result of finishing the set.
+ */
+static int lov_sync_interpret(struct ptlrpc_request_set *rqset,
+			      void *data, int rc)
+{
+	struct lov_request_set *lovset = data;
+	int err;
+	ENTRY;
+
+	if (rc != 0)
+		atomic_set(&lovset->set_completes, 0);
+
+	err = lov_fini_sync_set(lovset);
+	RETURN(rc ? rc : err);
+}
+
+/* Sync the byte range [@start, @end] of the striped object in @oinfo by
+ * issuing a sync for the per-stripe extent of each sub-request.
+ *
+ * @rqset must not be NULL.  The loop stops at the first failing
+ * sub-request.  If requests were queued on @rqset, lov_sync_interpret() is
+ * installed and takes over the LOV request set; otherwise the set is
+ * finished here.
+ */
+static int lov_sync(const struct lu_env *env, struct obd_export *exp,
+		    struct obd_info *oinfo, obd_off start, obd_off end,
+		    struct ptlrpc_request_set *rqset)
+{
+	struct lov_request_set *set = NULL;
+	struct lov_request *req;
+	struct lov_obd *lov;
+	int rc = 0;
+	ENTRY;
+
+	ASSERT_LSM_MAGIC(oinfo->oi_md);
+	LASSERT(rqset != NULL);
+
+	if (exp->exp_obd == NULL)
+		RETURN(-ENODEV);
+
+	lov = &exp->exp_obd->u.lov;
+	rc = lov_prep_sync_set(exp, oinfo, start, end, &set);
+	if (rc != 0)
+		RETURN(rc);
+
+	CDEBUG(D_INFO, "fsync objid "DOSTID" ["LPX64", "LPX64"]\n",
+	       POSTID(&set->set_oi->oi_oa->o_oi), start, end);
+
+	list_for_each_entry(req, &set->set_list, rq_link) {
+		rc = obd_sync(env, lov->lov_tgts[req->rq_idx]->ltd_exp,
+			      &req->rq_oi, req->rq_oi.oi_policy.l_extent.start,
+			      req->rq_oi.oi_policy.l_extent.end, rqset);
+		if (rc == 0)
+			continue;
+
+		CERROR("%s: fsync objid "DOSTID" subobj "DOSTID
+		       " on OST idx %d: rc = %d\n",
+		       exp->exp_obd->obd_name,
+		       POSTID(&set->set_oi->oi_oa->o_oi),
+		       POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx,
+		       rc);
+		break;
+	}
+
+	/* failure, or no async request to wait for: finish the set now */
+	if (rc != 0 || list_empty(&rqset->set_requests)) {
+		int err = lov_fini_sync_set(set);
+
+		RETURN(rc ? rc : err);
+	}
+
+	LASSERT(rqset->set_interpret == NULL);
+	rqset->set_interpret = lov_sync_interpret;
+	rqset->set_arg = (void *)set;
+
+	RETURN(0);
+}
+
+/* Check whether a bulk I/O over the pages in @pga has a chance to succeed.
+ *
+ * For each page that passes the stripe intersection test, the target serving
+ * the page's stripe must exist and be active (-EIO otherwise) and must
+ * accept an OBD_BRW_CHECK request.  Returns 0, -EIO, or the first error from
+ * a target.
+ */
+static int lov_brw_check(struct lov_obd *lov, struct obd_info *lov_oinfo,
+			 obd_count oa_bufs, struct brw_page *pga)
+{
+	struct obd_info oinfo = { { { 0 } } };
+	struct lov_stripe_md *lsm = lov_oinfo->oi_md;
+	int i, rc = 0;
+
+	oinfo.oi_oa = lov_oinfo->oi_oa;
+
+	for (i = 0; i < oa_bufs; i++) {
+		struct brw_page *pg = &pga[i];
+		int stripe = lov_stripe_number(lsm, pg->off);
+		int ost = lsm->lsm_oinfo[stripe]->loi_ost_idx;
+		struct lov_tgt_desc *tgt;
+		obd_off start, end;
+
+		/* NOTE(review): the page index i, not the computed stripe
+		 * number, is passed as the second argument here - confirm
+		 * against the lov_stripe_intersects() prototype. */
+		if (!lov_stripe_intersects(lsm, i, pg->off,
+					   pg->off + pg->count - 1,
+					   &start, &end))
+			continue;
+
+		tgt = lov->lov_tgts[ost];
+		if (tgt == NULL || !tgt->ltd_active) {
+			CDEBUG(D_HA, "lov idx %d inactive\n", ost);
+			return -EIO;
+		}
+
+		rc = obd_brw(OBD_BRW_CHECK, tgt->ltd_exp, &oinfo,
+			     1, pg, NULL);
+		if (rc != 0)
+			break;
+	}
+	return rc;
+}
+
+/* Synchronous bulk read/write on a striped object.
+ *
+ * OBD_BRW_CHECK is answered by lov_brw_check().  Otherwise the pages are
+ * split into per-target sub-requests, each is sent in turn and the loop
+ * stops at the first failure.  The request set is always finished; a loop
+ * error takes precedence over the result of finishing the set.
+ */
+static int lov_brw(int cmd, struct obd_export *exp, struct obd_info *oinfo,
+		   obd_count oa_bufs, struct brw_page *pga,
+		   struct obd_trans_info *oti)
+{
+	struct lov_obd *lov = &exp->exp_obd->u.lov;
+	struct lov_request_set *set;
+	struct lov_request *req;
+	int err, rc = 0;
+	ENTRY;
+
+	ASSERT_LSM_MAGIC(oinfo->oi_md);
+
+	if (cmd == OBD_BRW_CHECK) {
+		rc = lov_brw_check(lov, oinfo, oa_bufs, pga);
+		RETURN(rc);
+	}
+
+	rc = lov_prep_brw_set(exp, oinfo, oa_bufs, pga, oti, &set);
+	if (rc != 0)
+		RETURN(rc);
+
+	list_for_each_entry(req, &set->set_list, rq_link) {
+		/* each request covers rq_oabufs pages starting at rq_pgaidx */
+		rc = obd_brw(cmd, lov->lov_tgts[req->rq_idx]->ltd_exp,
+			     &req->rq_oi, req->rq_oabufs,
+			     set->set_pga + req->rq_pgaidx, oti);
+		if (rc != 0)
+			break;
+		lov_update_common_set(set, req, rc);
+	}
+
+	err = lov_fini_brw_set(set);
+	if (rc == 0)
+		rc = err;
+	RETURN(rc);
+}
+
+static int lov_enqueue_interpret(struct ptlrpc_request_set *rqset,
+				 void *data, int rc)
+{
+	struct lov_request_set *lovset = (struct lov_request_set *)data;
+	ENTRY;	/* finish the enqueue set passed in data; return that result */
+	rc = lov_fini_enqueue_set(lovset, lovset->set_ei->ei_mode, rc, rqset);
+	RETURN(rc);
+}
+
+static int lov_enqueue(struct obd_export *exp, struct obd_info *oinfo,
+		       struct ldlm_enqueue_info *einfo,
+		       struct ptlrpc_request_set *rqset)
+{
+	ldlm_mode_t mode = einfo->ei_mode;
+	struct lov_request_set *set;
+	struct lov_request *req;
+	struct list_head *pos;
+	struct lov_obd *lov;
+	ldlm_error_t rc;
+	ENTRY;
+	/* Call obd_enqueue() for every request of the prepared enqueue set */
+	LASSERT(oinfo);
+	ASSERT_LSM_MAGIC(oinfo->oi_md);
+	LASSERT(mode == (mode & -mode));	/* at most one bit set in mode */
+
+	/* we should never be asked to replay a lock this way. */
+	LASSERT((oinfo->oi_flags & LDLM_FL_REPLAY) == 0);
+
+	if (!exp || !exp->exp_obd)
+		RETURN(-ENODEV);
+
+	lov = &exp->exp_obd->u.lov;
+	rc = lov_prep_enqueue_set(exp, oinfo, einfo, &set);
+	if (rc)
+		RETURN(rc);
+
+	list_for_each (pos, &set->set_list) {
+		req = list_entry(pos, struct lov_request, rq_link);
+
+		rc = obd_enqueue(lov->lov_tgts[req->rq_idx]->ltd_exp,
+				 &req->rq_oi, einfo, rqset);
+		if (rc != ELDLM_OK)
+			GOTO(out, rc);	/* first failure ends the loop */
+	}
+	/* requests queued on rqset: lov_enqueue_interpret() finishes the set */
+	if (rqset && !list_empty(&rqset->set_requests)) {
+		LASSERT(rc == 0);
+		LASSERT(rqset->set_interpret == NULL);
+		rqset->set_interpret = lov_enqueue_interpret;
+		rqset->set_arg = (void *)set;
+		RETURN(rc);
+	}
+out:	/* finish the set now: failure, no rqset, or nothing queued on it */
+	rc = lov_fini_enqueue_set(set, mode, rc, rqset);
+	RETURN(rc);
+}
+
+static int lov_change_cbdata(struct obd_export *exp,
+			     struct lov_stripe_md *lsm, ldlm_iterator_t it,
+			     void *data)
+{
+	struct lov_obd *lov;
+	int rc = 0, i;
+	ENTRY;
+	/* Run obd_change_cbdata() on each stripe; returns the last call's rc */
+	ASSERT_LSM_MAGIC(lsm);
+
+	if (!exp || !exp->exp_obd)
+		RETURN(-ENODEV);
+
+	lov = &exp->exp_obd->u.lov;
+	for (i = 0; i < lsm->lsm_stripe_count; i++) {
+		struct lov_stripe_md submd;
+		struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+
+		if (!lov->lov_tgts[loi->loi_ost_idx]) {
+			CDEBUG(D_HA, "lov idx %d NULL \n", loi->loi_ost_idx);
+			continue;
+		}
+		/* submd: only lsm_oi and lsm_stripe_count are set */
+		submd.lsm_oi = loi->loi_oi;
+		submd.lsm_stripe_count = 0;
+		rc = obd_change_cbdata(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp,
+				       &submd, it, data);
+	}
+	RETURN(rc);
+}
+
+/* Find any ldlm lock of the inode in lov, stopping at the first stripe
+ * whose obd_find_cbdata() result is non-zero.
+ * return 0 not found, 1 found one, < 0 error
+ * (-ENODEV when exp or its obd is missing) */
+static int lov_find_cbdata(struct obd_export *exp,
+			   struct lov_stripe_md *lsm, ldlm_iterator_t it,
+			   void *data)
+{
+	struct lov_obd *lov;
+	int rc = 0, i;
+	ENTRY;
+
+	ASSERT_LSM_MAGIC(lsm);
+
+	if (!exp || !exp->exp_obd)
+		RETURN(-ENODEV);
+
+	lov = &exp->exp_obd->u.lov;
+	for (i = 0; i < lsm->lsm_stripe_count; i++) {
+		struct lov_stripe_md submd;
+		struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+
+		if (!lov->lov_tgts[loi->loi_ost_idx]) {
+			CDEBUG(D_HA, "lov idx %d NULL \n", loi->loi_ost_idx);
+			continue;
+		}
+		submd.lsm_oi = loi->loi_oi;
+		submd.lsm_stripe_count = 0;
+		rc = obd_find_cbdata(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp,
+				     &submd, it, data);
+		if (rc != 0)
+			RETURN(rc);
+	}
+	RETURN(rc);
+}
+
+static int lov_cancel(struct obd_export *exp, struct lov_stripe_md *lsm,
+		      __u32 mode, struct lustre_handle *lockh)
+{
+	struct lov_request_set *set;
+	struct obd_info oinfo;
+	struct lov_request *req;
+	struct list_head *pos;
+	struct lov_obd *lov;
+	struct lustre_handle *lov_lockhp;
+	int err = 0, rc = 0;
+	ENTRY;
+	/* Cancel each request's lock handle; err keeps the last non-zero rc */
+	ASSERT_LSM_MAGIC(lsm);
+
+	if (!exp || !exp->exp_obd)
+		RETURN(-ENODEV);
+
+	LASSERT(lockh);
+	lov = &exp->exp_obd->u.lov;
+	rc = lov_prep_cancel_set(exp, &oinfo, lsm, mode, lockh, &set);
+	if (rc)
+		RETURN(rc);
+
+	list_for_each(pos, &set->set_list) {
+		req = list_entry(pos, struct lov_request, rq_link);
+		lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe;
+		/* rc is replaced by lov_update_common_set()'s result */
+		rc = obd_cancel(lov->lov_tgts[req->rq_idx]->ltd_exp,
+				req->rq_oi.oi_md, mode, lov_lockhp);
+		rc = lov_update_common_set(set, req, rc);
+		if (rc) {
+			CERROR("%s: cancel objid "DOSTID" subobj "
+			       DOSTID" on OST idx %d: rc = %d\n",
+			       exp->exp_obd->obd_name, POSTID(&lsm->lsm_oi),
+			       POSTID(&req->rq_oi.oi_md->lsm_oi),
+			       req->rq_idx, rc);
+			err = rc;
+		}
+
+	}
+	lov_fini_cancel_set(set);	/* its return value is not used */
+	RETURN(err);
+}
+
+static int lov_cancel_unused(struct obd_export *exp,
+			     struct lov_stripe_md *lsm,
+			     ldlm_cancel_flags_t flags, void *opaque)
+{
+	struct lov_obd *lov;
+	int rc = 0, i;
+	ENTRY;
+	/* Run obd_cancel_unused() on the targets; the first error is returned */
+	if (!exp || !exp->exp_obd)
+		RETURN(-ENODEV);
+
+	lov = &exp->exp_obd->u.lov;
+	if (lsm == NULL) {	/* no layout: visit every target with an export */
+		for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+			int err;
+			if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp)
+				continue;
+
+			err = obd_cancel_unused(lov->lov_tgts[i]->ltd_exp, NULL,
+						flags, opaque);
+			if (!rc)
+				rc = err;
+		}
+		RETURN(rc);
+	}
+
+	ASSERT_LSM_MAGIC(lsm);
+	/* with a layout: one call per stripe whose target slot is non-NULL */
+	for (i = 0; i < lsm->lsm_stripe_count; i++) {
+		struct lov_stripe_md submd;
+		struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+		int idx = loi->loi_ost_idx;
+		int err;
+
+		if (!lov->lov_tgts[idx]) {
+			CDEBUG(D_HA, "lov idx %d NULL\n", idx);
+			continue;
+		}
+		/* an inactive target is only logged; the call is still made */
+		if (!lov->lov_tgts[idx]->ltd_active)
+			CDEBUG(D_HA, "lov idx %d inactive\n", idx);
+
+		submd.lsm_oi = loi->loi_oi;
+		submd.lsm_stripe_count = 0;
+		err = obd_cancel_unused(lov->lov_tgts[idx]->ltd_exp,
+					&submd, flags, opaque);
+		if (err && lov->lov_tgts[idx]->ltd_active) {
+			CERROR("%s: cancel unused objid "DOSTID
+			       " subobj "DOSTID" on OST idx %d: rc = %d\n",
+			       exp->exp_obd->obd_name, POSTID(&lsm->lsm_oi),
+			       POSTID(&loi->loi_oi), idx, err);
+			if (!rc)
+				rc = err;
+		}
+	}
+	RETURN(rc);
+}
+
+int lov_statfs_interpret(struct ptlrpc_request_set *rqset, void *data, int rc)
+{
+	struct lov_request_set *lovset = (struct lov_request_set *)data;
+	int err;
+	ENTRY;
+	/* on error, zero set_completes before the statfs set is finished */
+	if (rc)
+		atomic_set(&lovset->set_completes, 0);
+	/* the incoming rc, when non-zero, takes precedence over err */
+	err = lov_fini_statfs_set(lovset);
+	RETURN(rc ? rc : err);
+}
+
+static int lov_statfs_async(struct obd_export *exp, struct obd_info *oinfo,
+			    __u64 max_age, struct ptlrpc_request_set *rqset)
+{
+	struct obd_device      *obd = class_exp2obd(exp);
+	struct lov_request_set *set;
+	struct lov_request *req;
+	struct list_head *pos;
+	struct lov_obd *lov;
+	int rc = 0;
+	ENTRY;
+	/* Call obd_statfs_async() with rqset for each request of the set */
+	LASSERT(oinfo != NULL);
+	LASSERT(oinfo->oi_osfs != NULL);
+
+	lov = &obd->u.lov;
+	rc = lov_prep_statfs_set(obd, oinfo, &set);
+	if (rc)
+		RETURN(rc);
+
+	list_for_each (pos, &set->set_list) {
+		req = list_entry(pos, struct lov_request, rq_link);
+		rc = obd_statfs_async(lov->lov_tgts[req->rq_idx]->ltd_exp,
+				      &req->rq_oi, max_age, rqset);
+		if (rc)
+			break;
+	}
+	/* failure or nothing queued: finish the set here (rqset is dereferenced) */
+	if (rc || list_empty(&rqset->set_requests)) {
+		int err;
+		if (rc)
+			atomic_set(&set->set_completes, 0);
+		err = lov_fini_statfs_set(set);
+		RETURN(rc ? rc : err);
+	}
+	/* otherwise lov_statfs_interpret() finishes the set via rqset */
+	LASSERT(rqset->set_interpret == NULL);
+	rqset->set_interpret = lov_statfs_interpret;
+	rqset->set_arg = (void *)set;
+	RETURN(0);
+}
+
+static int lov_statfs(const struct lu_env *env, struct obd_export *exp,
+		      struct obd_statfs *osfs, __u64 max_age, __u32 flags)
+{
+	struct ptlrpc_request_set *set = NULL;
+	struct obd_info oinfo = { { { 0 } } };
+	int rc = 0;
+	ENTRY;
+	/* Synchronous statfs: run lov_statfs_async() on a private request set
+	 * and wait for it; env is not used here. */
+	/* for obdclass we forbid using obd_statfs_rqset, but prefer using async
+	 * statfs requests */
+	set = ptlrpc_prep_set();
+	if (set == NULL)
+		RETURN(-ENOMEM);
+
+	oinfo.oi_osfs = osfs;
+	oinfo.oi_flags = flags;
+	rc = lov_statfs_async(exp, &oinfo, max_age, set);
+	if (rc == 0)
+		rc = ptlrpc_set_wait(set);	/* wait only if queuing worked */
+	ptlrpc_set_destroy(set);
+
+	RETURN(rc);
+}
+
+static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
+			 void *karg, void *uarg)
+{
+	struct obd_device *obddev = class_exp2obd(exp);
+	struct lov_obd *lov = &obddev->u.lov;
+	int i = 0, rc = 0, count = lov->desc.ld_tgt_count;
+	struct obd_uuid *uuidp;
+	ENTRY;
+	/* Handle LOV ioctls here; unknown commands go to every connected OSC */
+	switch (cmd) {
+	case IOC_OBD_STATFS: {
+		struct obd_ioctl_data *data = karg;
+		struct obd_device *osc_obd;
+		struct obd_statfs stat_buf = {0};
+		__u32 index;
+		__u32 flags;
+
+		memcpy(&index, data->ioc_inlbuf2, sizeof(__u32));
+		if ((index >= count))
+			RETURN(-ENODEV);
+
+		if (!lov->lov_tgts[index])
+			/* Try again with the next index */
+			RETURN(-EAGAIN);
+		if (!lov->lov_tgts[index]->ltd_active)
+			RETURN(-ENODATA);
+
+		osc_obd = class_exp2obd(lov->lov_tgts[index]->ltd_exp);
+		if (!osc_obd)
+			RETURN(-EINVAL);
+
+		/* copy UUID; clamp unsigned so a huge ioc_plen2 is not negative */
+		if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(osc_obd),
+				 min_t(unsigned long, data->ioc_plen2,
+				       sizeof(struct obd_uuid))))
+			RETURN(-EFAULT);
+
+		flags = uarg ? *(__u32*)uarg : 0;
+		/* got statfs data */
+		rc = obd_statfs(NULL, lov->lov_tgts[index]->ltd_exp, &stat_buf,
+				cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+				flags);
+		if (rc)
+			RETURN(rc);
+		if (copy_to_user(data->ioc_pbuf1, &stat_buf,
+				 min_t(unsigned long, data->ioc_plen1,
+				       sizeof(stat_buf))))
+			RETURN(-EFAULT);
+		break;
+	}
+	case OBD_IOC_LOV_GET_CONFIG: {
+		struct obd_ioctl_data *data;
+		struct lov_desc *desc;
+		char *buf = NULL;
+		__u32 *genp;
+
+		len = 0;
+		if (obd_ioctl_getdata(&buf, &len, (void *)uarg))
+			RETURN(-EINVAL);
+
+		data = (struct obd_ioctl_data *)buf;
+		/* each inline buffer must hold the descriptor / per-target data */
+		if (sizeof(*desc) > data->ioc_inllen1) {
+			obd_ioctl_freedata(buf, len);
+			RETURN(-EINVAL);
+		}
+
+		if (sizeof(uuidp->uuid) * count > data->ioc_inllen2) {
+			obd_ioctl_freedata(buf, len);
+			RETURN(-EINVAL);
+		}
+
+		if (sizeof(__u32) * count > data->ioc_inllen3) {
+			obd_ioctl_freedata(buf, len);
+			RETURN(-EINVAL);
+		}
+
+		desc = (struct lov_desc *)data->ioc_inlbuf1;
+		memcpy(desc, &(lov->desc), sizeof(*desc));
+
+		uuidp = (struct obd_uuid *)data->ioc_inlbuf2;
+		genp = (__u32 *)data->ioc_inlbuf3;
+		/* the uuid will be empty for deleted OSTs */
+		for (i = 0; i < count; i++, uuidp++, genp++) {
+			if (!lov->lov_tgts[i])
+				continue;
+			*uuidp = lov->lov_tgts[i]->ltd_uuid;
+			*genp = lov->lov_tgts[i]->ltd_gen;
+		}
+
+		if (copy_to_user((void *)uarg, buf, len))
+			rc = -EFAULT;
+		obd_ioctl_freedata(buf, len);
+		break;
+	}
+	case LL_IOC_LOV_SETSTRIPE:
+		rc = lov_setstripe(exp, len, karg, uarg);
+		break;
+	case LL_IOC_LOV_GETSTRIPE:
+		rc = lov_getstripe(exp, karg, uarg);
+		break;
+	case LL_IOC_LOV_SETEA:
+		rc = lov_setea(exp, karg, uarg);
+		break;
+	case OBD_IOC_QUOTACTL: {
+		struct if_quotactl *qctl = karg;
+		struct lov_tgt_desc *tgt = NULL;
+		struct obd_quotactl *oqctl;
+		/* pick one target, by index (QC_OSTIDX) or by uuid (QC_UUID) */
+		if (qctl->qc_valid == QC_OSTIDX) {
+			if (qctl->qc_idx < 0 || count <= qctl->qc_idx)
+				RETURN(-EINVAL);
+
+			tgt = lov->lov_tgts[qctl->qc_idx];
+			if (!tgt || !tgt->ltd_exp)
+				RETURN(-EINVAL);
+		} else if (qctl->qc_valid == QC_UUID) {
+			for (i = 0; i < count; i++) {
+				tgt = lov->lov_tgts[i];
+				if (!tgt ||
+				    !obd_uuid_equals(&tgt->ltd_uuid,
+						     &qctl->obd_uuid))
+					continue;
+
+				if (tgt->ltd_exp == NULL)
+					RETURN(-EINVAL);
+
+				break;
+			}
+		} else {
+			RETURN(-EINVAL);
+		}
+		/* i == count only when the uuid search found no match */
+		if (i >= count)
+			RETURN(-EAGAIN);
+
+		LASSERT(tgt && tgt->ltd_exp);
+		OBD_ALLOC_PTR(oqctl);
+		if (!oqctl)
+			RETURN(-ENOMEM);
+
+		QCTL_COPY(oqctl, qctl);
+		rc = obd_quotactl(tgt->ltd_exp, oqctl);
+		if (rc == 0) {
+			QCTL_COPY(qctl, oqctl);
+			qctl->qc_valid = QC_OSTIDX;
+			qctl->obd_uuid = tgt->ltd_uuid;
+		}
+		OBD_FREE_PTR(oqctl);
+		break;
+	}
+	default: {
+		int set = 0;	/* becomes 1 once any target call succeeds */
+
+		if (count == 0)
+			RETURN(-ENOTTY);
+
+		for (i = 0; i < count; i++) {
+			int err;
+			struct obd_device *osc_obd;
+
+			/* OST was disconnected */
+			if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp)
+				continue;
+
+			/* ll_umount_begin() sets force flag but for lov, not
+			 * osc. Let's pass it through */
+			osc_obd = class_exp2obd(lov->lov_tgts[i]->ltd_exp);
+			osc_obd->obd_force = obddev->obd_force;
+			err = obd_iocontrol(cmd, lov->lov_tgts[i]->ltd_exp,
+					    len, karg, uarg);
+			if (err == -ENODATA && cmd == OBD_IOC_POLL_QUOTACHECK) {
+				RETURN(err);
+			} else if (err) {
+				if (lov->lov_tgts[i]->ltd_active) {
+					CDEBUG(err == -ENOTTY ?
+					       D_IOCTL : D_WARNING,
+					       "iocontrol OSC %s on OST "
+					       "idx %d cmd %x: err = %d\n",
+					       lov_uuid2str(lov, i),
+					       i, cmd, err);
+					if (!rc)
+						rc = err;
+				}
+			} else {
+				set = 1;
+			}
+		}
+		if (!set && !rc)
+			rc = -EIO;
+	}
+	}
+
+	RETURN(rc);
+}
+
+#define FIEMAP_BUFFER_SIZE 4096
+
+/**
+ * Non-zero fe_logical indicates that this is a continuation FIEMAP
+ * call. The local end offset and the device are sent in the first
+ * fm_extent. The stripe number is looked up from that OST index and the
+ * stripe on which mapping is to be restarted is stored in *start_stripe.
+ *
+ * Returns fm_end_offset, the in-OST offset at which mapping should be
+ * restarted; 0 means the caller re-calculates the offset in the next stripe.
+ * -EINVAL (as obd_size) is returned if no stripe lives on fe_device.
+ * Note that the first extent is passed to lov_get_info via the value field.
+ *
+ * \param fiemap fiemap request header
+ * \param lsm striping information for the file
+ * \param fm_start logical start of mapping
+ * \param fm_end logical end of mapping
+ * \param start_stripe starting stripe will be returned in this
+ */
+obd_size fiemap_calc_fm_end_offset(struct ll_user_fiemap *fiemap,
+				   struct lov_stripe_md *lsm, obd_size fm_start,
+				   obd_size fm_end, int *start_stripe)
+{
+	obd_size local_end;
+	obd_off lun_start, lun_end;
+	obd_size fm_end_offset;
+	int stripe_no = -1, i;
+
+	if (fiemap->fm_extent_count == 0 ||
+	    fiemap->fm_extents[0].fe_logical == 0)
+		return 0;
+	local_end = fiemap->fm_extents[0].fe_logical; /* read only if present */
+	/* Find out stripe_no from ost_index saved in the fe_device */
+	for (i = 0; i < lsm->lsm_stripe_count; i++) {
+		if (lsm->lsm_oinfo[i]->loi_ost_idx ==
+					fiemap->fm_extents[0].fe_device) {
+			stripe_no = i;
+			break;
+		}
+	}
+	if (stripe_no == -1)
+		return -EINVAL;
+
+	/* If we have finished mapping on previous device, shift logical
+	 * offset to start of next device */
+	if ((lov_stripe_intersects(lsm, stripe_no, fm_start, fm_end,
+				   &lun_start, &lun_end)) != 0 &&
+				   local_end < lun_end) {
+		fm_end_offset = local_end;
+		*start_stripe = stripe_no;
+	} else {
+		/* This is a special value to indicate that caller should
+		 * calculate offset in next stripe. */
+		fm_end_offset = 0;
+		*start_stripe = (stripe_no + 1) % lsm->lsm_stripe_count;
+	}
+
+	return fm_end_offset;
+}
+
+/**
+ * We calculate on which OST the mapping will end. If the length of mapping
+ * is greater than (stripe_size * stripe_count) then the last_stripe
+ * will be the one just before start_stripe. Else we check if the mapping
+ * intersects each OST and find last_stripe.
+ * This function returns the last_stripe and also sets the stripe_count
+ * over which the mapping is spread
+ *
+ * \param lsm striping information for the file
+ * \param fm_start logical start of mapping
+ * \param fm_end logical end of mapping
+ * \param start_stripe starting stripe of the mapping
+ * \param stripe_count the number of stripes across which to map is returned
+ *
+ * \retval last_stripe return the last stripe of the mapping
+ */
+int fiemap_calc_last_stripe(struct lov_stripe_md *lsm, obd_size fm_start,
+			    obd_size fm_end, int start_stripe,
+			    int *stripe_count)
+{
+	int last_stripe;
+	obd_off obd_start, obd_end;
+	int i, j;
+	/* mapping longer than one full stripe round: every stripe is touched */
+	if (fm_end - fm_start > lsm->lsm_stripe_size * lsm->lsm_stripe_count) {
+		last_stripe = (start_stripe < 1 ? lsm->lsm_stripe_count - 1 :
+							      start_stripe - 1);
+		*stripe_count = lsm->lsm_stripe_count;
+	} else {
+		for (j = 0, i = start_stripe; j < lsm->lsm_stripe_count;
+		     i = (i + 1) % lsm->lsm_stripe_count, j++) {
+			if ((lov_stripe_intersects(lsm, i, fm_start, fm_end,
+						   &obd_start, &obd_end)) == 0)
+				break;
+		}
+		*stripe_count = j;	/* stripes walked before the first miss */
+		last_stripe = (start_stripe + j - 1) %lsm->lsm_stripe_count;
+	}
+
+	return last_stripe;
+}
+
+/**
+ * Set fe_device and copy extents from local buffer into main return buffer.
+ *
+ * \param fiemap fiemap request header
+ * \param lcl_fm_ext array of local fiemap extents to be copied
+ * \param ost_index OST index to be written into the fe_device field for each
+ *		    extent
+ * \param ext_count number of extents to be copied
+ * \param current_extent where to start copying in main extent array
+ */
+void fiemap_prepare_and_copy_exts(struct ll_user_fiemap *fiemap,
+				  struct ll_fiemap_extent *lcl_fm_ext,
+				  int ost_index, unsigned int ext_count,
+				  int current_extent)
+{
+	char *to;
+	int ext;
+	/* tag each local extent with its OST and the FIEMAP_EXTENT_NET flag */
+	for (ext = 0; ext < ext_count; ext++) {
+		lcl_fm_ext[ext].fe_device = ost_index;
+		lcl_fm_ext[ext].fe_flags |= FIEMAP_EXTENT_NET;
+	}
+
+	/* Copy fm_extent's from fm_local to return buffer */
+	to = (char *)fiemap + fiemap_count_to_size(current_extent);
+	memcpy(to, lcl_fm_ext, ext_count * sizeof(struct ll_fiemap_extent));
+}
+
+/**
+ * Break down the FIEMAP request and send appropriate calls to individual OSTs.
+ * This also handles the restarting of FIEMAP calls in case mapping overflows
+ * the available number of extents in single call.
+ */
+static int lov_fiemap(struct lov_obd *lov, __u32 keylen, void *key,
+		      __u32 *vallen, void *val, struct lov_stripe_md *lsm)
+{
+	struct ll_fiemap_info_key *fm_key = key;
+	struct ll_user_fiemap *fiemap = val;
+	struct ll_user_fiemap *fm_local = NULL;
+	struct ll_fiemap_extent *lcl_fm_ext;
+	int count_local;
+	unsigned int get_num_extents = 0;
+	int ost_index = 0, actual_start_stripe, start_stripe;
+	obd_size fm_start, fm_end, fm_length, fm_end_offset;
+	obd_size curr_loc;
+	int current_extent = 0, rc = 0, i;
+	int ost_eof = 0; /* EOF for object */
+	int ost_done = 0; /* done with required mapping for this OST? */
+	int last_stripe;
+	int cur_stripe = 0, cur_stripe_wrap = 0, stripe_count;
+	unsigned int buffer_size = FIEMAP_BUFFER_SIZE;
+	/* Nothing is allocated yet: return directly, never free a NULL buffer */
+	if (lsm == NULL)
+		return 0;
+
+	if (fiemap_count_to_size(fm_key->fiemap.fm_extent_count) < buffer_size)
+		buffer_size = fiemap_count_to_size(fm_key->fiemap.fm_extent_count);
+
+	OBD_ALLOC_LARGE(fm_local, buffer_size);
+	if (fm_local == NULL)
+		return -ENOMEM;
+	lcl_fm_ext = &fm_local->fm_extents[0];
+
+	count_local = fiemap_size_to_count(buffer_size);
+
+	memcpy(fiemap, &fm_key->fiemap, sizeof(*fiemap));
+	fm_start = fiemap->fm_start;
+	fm_length = fiemap->fm_length;
+	/* Calculate start stripe, last stripe and length of mapping */
+	actual_start_stripe = start_stripe = lov_stripe_number(lsm, fm_start);
+	fm_end = (fm_length == ~0ULL ? fm_key->oa.o_size :
+						fm_start + fm_length - 1);
+	/* If fm_length != ~0ULL but fm_start+fm_length-1 exceeds file size */
+	if (fm_end > fm_key->oa.o_size)
+		fm_end = fm_key->oa.o_size;
+
+	last_stripe = fiemap_calc_last_stripe(lsm, fm_start, fm_end,
+					    actual_start_stripe, &stripe_count);
+
+	fm_end_offset = fiemap_calc_fm_end_offset(fiemap, lsm, fm_start,
+						  fm_end, &start_stripe);
+	if (fm_end_offset == -EINVAL)
+		GOTO(out, rc = -EINVAL);
+
+	if (fiemap->fm_extent_count == 0) {
+		get_num_extents = 1;
+		count_local = 0;
+	}
+
+	/* Check each stripe */
+	for (cur_stripe = start_stripe, i = 0; i < stripe_count;
+	     i++, cur_stripe = (cur_stripe + 1) % lsm->lsm_stripe_count) {
+		obd_size req_fm_len; /* Stores length of required mapping */
+		obd_size len_mapped_single_call;
+		obd_off lun_start, lun_end, obd_object_end;
+		unsigned int ext_count;
+
+		cur_stripe_wrap = cur_stripe;
+
+		/* Find out range of mapping on this stripe */
+		if ((lov_stripe_intersects(lsm, cur_stripe, fm_start, fm_end,
+					   &lun_start, &obd_object_end)) == 0)
+			continue;
+
+		/* If this is a continuation FIEMAP call and we are on
+		 * starting stripe then lun_start needs to be set to
+		 * fm_end_offset */
+		if (fm_end_offset != 0 && cur_stripe == start_stripe)
+			lun_start = fm_end_offset;
+
+		if (fm_length != ~0ULL) {
+			/* Handle fm_start + fm_length overflow */
+			if (fm_start + fm_length < fm_start)
+				fm_length = ~0ULL - fm_start;
+			lun_end = lov_size_to_stripe(lsm, fm_start + fm_length,
+						     cur_stripe);
+		} else {
+			lun_end = ~0ULL;
+		}
+
+		if (lun_start == lun_end)
+			continue;
+
+		req_fm_len = obd_object_end - lun_start;
+		fm_local->fm_length = 0;
+		len_mapped_single_call = 0;
+
+		/* If the output buffer is very large and the objects have many
+		 * extents we may need to loop on a single OST repeatedly */
+		ost_eof = 0;
+		ost_done = 0;
+		do {
+			if (get_num_extents == 0) {
+				/* Don't get too many extents. */
+				if (current_extent + count_local >
+				    fiemap->fm_extent_count)
+					count_local = fiemap->fm_extent_count -
+								 current_extent;
+			}
+
+			lun_start += len_mapped_single_call;
+			fm_local->fm_length = req_fm_len - len_mapped_single_call;
+			req_fm_len = fm_local->fm_length;
+			fm_local->fm_extent_count = count_local;
+			fm_local->fm_mapped_extents = 0;
+			fm_local->fm_flags = fiemap->fm_flags;
+
+			fm_key->oa.o_oi = lsm->lsm_oinfo[cur_stripe]->loi_oi;
+			ost_index = lsm->lsm_oinfo[cur_stripe]->loi_ost_idx;
+
+			if (ost_index < 0 || ost_index >=lov->desc.ld_tgt_count)
+				GOTO(out, rc = -EINVAL);
+
+			/* If OST is inactive, return extent with UNKNOWN flag */
+			if (!lov->lov_tgts[ost_index]->ltd_active) {
+				fm_local->fm_flags |= FIEMAP_EXTENT_LAST;
+				fm_local->fm_mapped_extents = 1;
+
+				lcl_fm_ext[0].fe_logical = lun_start;
+				lcl_fm_ext[0].fe_length = obd_object_end -
+								      lun_start;
+				lcl_fm_ext[0].fe_flags |= FIEMAP_EXTENT_UNKNOWN;
+
+				goto inactive_tgt;
+			}
+
+			fm_local->fm_start = lun_start;
+			fm_local->fm_flags &= ~FIEMAP_FLAG_DEVICE_ORDER;
+			memcpy(&fm_key->fiemap, fm_local, sizeof(*fm_local));
+			*vallen=fiemap_count_to_size(fm_local->fm_extent_count);
+			rc = obd_get_info(NULL,
+					  lov->lov_tgts[ost_index]->ltd_exp,
+					  keylen, key, vallen, fm_local, lsm);
+			if (rc != 0)
+				GOTO(out, rc);
+
+inactive_tgt:
+			ext_count = fm_local->fm_mapped_extents;
+			if (ext_count == 0) {
+				ost_done = 1;
+				/* If last stripe has hole at the end,
+				 * then we need to return */
+				if (cur_stripe_wrap == last_stripe) {
+					fiemap->fm_mapped_extents = 0;
+					goto finish;
+				}
+				break;
+			}
+
+			/* If we just need num of extents then go to next device */
+			if (get_num_extents) {
+				current_extent += ext_count;
+				break;
+			}
+
+			len_mapped_single_call = lcl_fm_ext[ext_count-1].fe_logical -
+				  lun_start + lcl_fm_ext[ext_count - 1].fe_length;
+
+			/* Have we finished mapping on this device? */
+			if (req_fm_len <= len_mapped_single_call)
+				ost_done = 1;
+
+			/* Clear the EXTENT_LAST flag which can be present on
+			 * last extent */
+			if (lcl_fm_ext[ext_count-1].fe_flags & FIEMAP_EXTENT_LAST)
+				lcl_fm_ext[ext_count - 1].fe_flags &=
+							    ~FIEMAP_EXTENT_LAST;
+
+			curr_loc = lov_stripe_size(lsm,
+					   lcl_fm_ext[ext_count - 1].fe_logical+
+					   lcl_fm_ext[ext_count - 1].fe_length,
+					   cur_stripe);
+			if (curr_loc >= fm_key->oa.o_size)
+				ost_eof = 1;
+
+			fiemap_prepare_and_copy_exts(fiemap, lcl_fm_ext,
+						     ost_index, ext_count,
+						     current_extent);
+
+			current_extent += ext_count;
+
+			/* Ran out of available extents? */
+			if (current_extent >= fiemap->fm_extent_count)
+				goto finish;
+		} while (ost_done == 0 && ost_eof == 0);
+
+		if (cur_stripe_wrap == last_stripe)
+			goto finish;
+	}
+
+finish:
+	/* Indicate that we are returning device offsets unless file just has
+	 * single stripe */
+	if (lsm->lsm_stripe_count > 1)
+		fiemap->fm_flags |= FIEMAP_FLAG_DEVICE_ORDER;
+
+	if (get_num_extents)
+		goto skip_last_device_calc;
+
+	/* If the last stripe was reached and its mapping is done, flag the last
+	 * returned extent; with no extent returned there is nothing to flag. */
+	if (cur_stripe_wrap == last_stripe) {
+		if ((ost_done || ost_eof) && current_extent > 0)
+			fiemap->fm_extents[current_extent - 1].fe_flags |=
+							     FIEMAP_EXTENT_LAST;
+	}
+
+skip_last_device_calc:
+	fiemap->fm_mapped_extents = current_extent;
+
+out:	/* only reached after fm_local was successfully allocated */
+	OBD_FREE_LARGE(fm_local, buffer_size);
+	return rc;
+}
+
+static int lov_get_info(const struct lu_env *env, struct obd_export *exp,
+			__u32 keylen, void *key, __u32 *vallen, void *val,
+			struct lov_stripe_md *lsm)
+{
+	struct obd_device *obddev = class_exp2obd(exp);
+	struct lov_obd *lov = &obddev->u.lov;
+	int i, rc;
+	ENTRY;
+	/* Answer the query named by key into val; -EINVAL for unknown keys */
+	if (!vallen || !val)
+		RETURN(-EFAULT);
+
+	obd_getref(obddev);
+
+	if (KEY_IS(KEY_LOCK_TO_STRIPE)) {
+		struct {
+			char name[16];
+			struct ldlm_lock *lock;
+		} *data = key;
+		struct ldlm_res_id *res_id = &data->lock->l_resource->lr_name;
+		struct lov_oinfo *loi;
+		__u32 *stripe = val;
+
+		if (*vallen < sizeof(*stripe))
+			GOTO(out, rc = -EFAULT);
+		*vallen = sizeof(*stripe);
+
+		/* XXX This is another one of those bits that will need to
+		 * change if we ever actually support nested LOVs.  It uses
+		 * the lock's export to find out which stripe it is. */
+		/* XXX - it's assumed all the locks for deleted OSTs have
+		 * been cancelled. Also, the export for deleted OSTs will
+		 * be NULL and won't match the lock's export. */
+		for (i = 0; i < lsm->lsm_stripe_count; i++) {
+			loi = lsm->lsm_oinfo[i];
+			if (!lov->lov_tgts[loi->loi_ost_idx])
+				continue;
+			if (lov->lov_tgts[loi->loi_ost_idx]->ltd_exp ==
+			    data->lock->l_conn_export &&
+			    ostid_res_name_eq(&loi->loi_oi, res_id)) {
+				*stripe = i;
+				GOTO(out, rc = 0);
+			}
+		}
+		LDLM_ERROR(data->lock, "lock on inode without such object");
+		dump_lsm(D_ERROR, lsm);
+		GOTO(out, rc = -ENXIO);
+	} else if (KEY_IS(KEY_LAST_ID)) {
+		struct obd_id_info *info = val;
+		__u32 size = sizeof(obd_id);
+		struct lov_tgt_desc *tgt;
+
+		LASSERT(*vallen == sizeof(struct obd_id_info));
+		if (info->idx >= lov->desc.ld_tgt_count)	/* bad index */
+			GOTO(out, rc = -ESRCH);
+		tgt = lov->lov_tgts[info->idx];
+		if (!tgt || !tgt->ltd_active)
+			GOTO(out, rc = -ESRCH);
+		rc = obd_get_info(env, tgt->ltd_exp, keylen, key,
+				  &size, info->data, NULL);
+		GOTO(out, rc);	/* report the target's result, not always 0 */
+	} else if (KEY_IS(KEY_LOVDESC)) {
+		struct lov_desc *desc_ret = val;
+		*desc_ret = lov->desc;
+
+		GOTO(out, rc = 0);
+	} else if (KEY_IS(KEY_FIEMAP)) {
+		rc = lov_fiemap(lov, keylen, key, vallen, val, lsm);
+		GOTO(out, rc);
+	} else if (KEY_IS(KEY_CONNECT_FLAG)) {
+		struct lov_tgt_desc *tgt;
+		__u64 ost_idx = *((__u64*)val);
+
+		LASSERT(*vallen == sizeof(__u64));
+		LASSERT(ost_idx < lov->desc.ld_tgt_count);
+		tgt = lov->lov_tgts[ost_idx];
+
+		if (!tgt || !tgt->ltd_exp)
+			GOTO(out, rc = -ESRCH);
+
+		*((__u64 *)val) = exp_connect_flags(tgt->ltd_exp);
+		GOTO(out, rc = 0);
+	} else if (KEY_IS(KEY_TGT_COUNT)) {
+		*((int *)val) = lov->desc.ld_tgt_count;
+		GOTO(out, rc = 0);
+	}
+
+	rc = -EINVAL;
+
+out:
+	obd_putref(obddev);
+	RETURN(rc);
+}
+
+static int lov_set_info_async(const struct lu_env *env, struct obd_export *exp,
+			      obd_count keylen, void *key, obd_count vallen,
+			      void *val, struct ptlrpc_request_set *set)
+{
+	struct obd_device *obddev = class_exp2obd(exp);
+	struct lov_obd *lov = &obddev->u.lov;
+	obd_count count;
+	int i, rc = 0, err;
+	struct lov_tgt_desc *tgt;
+	unsigned incr, check_uuid,
+		 do_inactive, no_set;
+	unsigned next_id = 0,  mds_con = 0, capa = 0;
+	ENTRY;
+	/* Forward key/val to the targets; with set == NULL a private set is
+	 * created here and waited on before returning. */
+	incr = check_uuid = do_inactive = no_set = 0;
+	if (set == NULL) {
+		no_set = 1;
+		set = ptlrpc_prep_set();
+		if (!set)
+			RETURN(-ENOMEM);
+	}
+
+	obd_getref(obddev);
+	count = lov->desc.ld_tgt_count;
+	/* per-key setup: iteration count, val stride and target filtering */
+	if (KEY_IS(KEY_NEXT_ID)) {
+		count = vallen / sizeof(struct obd_id_info);
+		vallen = sizeof(obd_id);
+		incr = sizeof(struct obd_id_info);
+		do_inactive = 1;
+		next_id = 1;
+	} else if (KEY_IS(KEY_CHECKSUM)) {
+		do_inactive = 1;
+	} else if (KEY_IS(KEY_EVICT_BY_NID)) {
+		/* use defaults:  do_inactive = incr = 0; */
+	} else if (KEY_IS(KEY_MDS_CONN)) {
+		mds_con = 1;
+	} else if (KEY_IS(KEY_CAPA_KEY)) {
+		capa = 1;
+	} else if (KEY_IS(KEY_CACHE_SET)) {
+		LASSERT(lov->lov_cache == NULL);
+		lov->lov_cache = val;
+		do_inactive = 1;
+	}
+	/* val advances by incr bytes per iteration (0 unless KEY_NEXT_ID) */
+	for (i = 0; i < count; i++, val = (char *)val + incr) {
+		if (next_id) {
+			tgt = lov->lov_tgts[((struct obd_id_info*)val)->idx];
+		} else {
+			tgt = lov->lov_tgts[i];
+		}
+		/* OST was disconnected */
+		if (!tgt || !tgt->ltd_exp)
+			continue;
+
+		/* OST is inactive and we don't want inactive OSCs */
+		if (!tgt->ltd_active && !do_inactive)
+			continue;
+
+		if (mds_con) {
+			struct mds_group_info *mgi;
+
+			LASSERT(vallen == sizeof(*mgi));
+			mgi = (struct mds_group_info *)val;
+
+			/* Only want a specific OSC */
+			if (mgi->uuid && !obd_uuid_equals(mgi->uuid,
+						&tgt->ltd_uuid))
+				continue;
+
+			err = obd_set_info_async(env, tgt->ltd_exp,
+					 keylen, key, sizeof(int),
+					 &mgi->group, set);
+		} else if (next_id) {
+			err = obd_set_info_async(env, tgt->ltd_exp,
+					 keylen, key, vallen,
+					 ((struct obd_id_info*)val)->data, set);
+		} else if (capa) {
+			struct mds_capa_info *info = (struct mds_capa_info*)val;
+
+			LASSERT(vallen == sizeof(*info));
+
+			 /* Only want a specific OSC */
+			if (info->uuid &&
+			    !obd_uuid_equals(info->uuid, &tgt->ltd_uuid))
+				continue;
+
+			err = obd_set_info_async(env, tgt->ltd_exp, keylen,
+						 key, sizeof(*info->capa),
+						 info->capa, set);
+		} else {
+			/* check_uuid is never set here, so no filtering occurs */
+			if (check_uuid &&
+			    !obd_uuid_equals(val, &tgt->ltd_uuid))
+				continue;
+
+			err = obd_set_info_async(env, tgt->ltd_exp,
+					 keylen, key, vallen, val, set);
+		}
+		/* keep only the first non-zero result */
+		if (!rc)
+			rc = err;
+	}
+
+	obd_putref(obddev);
+	if (no_set) {
+		err = ptlrpc_set_wait(set);
+		if (!rc)
+			rc = err;
+		ptlrpc_set_destroy(set);
+	}
+	RETURN(rc);
+}
+
+static int lov_extent_calc(struct obd_export *exp, struct lov_stripe_md *lsm,
+			   int cmd, __u64 *offset)
+{
+	__u32 ssize = lsm->lsm_stripe_size;
+	__u64 start;
+	/* Set *offset to start (a multiple of ssize) or to start + ssize - 1 */
+	start = *offset;
+	lov_do_div64(start, ssize);	/* presumably do_div-style: confirm */
+	start = start * ssize;
+
+	CDEBUG(D_DLMTRACE, "offset "LPU64", stripe %u, start "LPU64
+			   ", end "LPU64"\n", *offset, ssize, start,
+			   start + ssize - 1);
+	if (cmd == OBD_CALC_STRIPE_END) {
+		*offset = start + ssize - 1;
+	} else if (cmd == OBD_CALC_STRIPE_START) {
+		*offset = start;
+	} else {
+		LBUG();	/* any other cmd is treated as a bug */
+	}
+
+	RETURN(0);
+}
+
+void lov_stripe_lock(struct lov_stripe_md *md)
+{
+	LASSERT(md->lsm_lock_owner != current_pid());	/* not already ours */
+	spin_lock(&md->lsm_lock);
+	LASSERT(md->lsm_lock_owner == 0);
+	md->lsm_lock_owner = current_pid();	/* record the holder's pid */
+}
+EXPORT_SYMBOL(lov_stripe_lock);
+
+void lov_stripe_unlock(struct lov_stripe_md *md)
+{
+	LASSERT(md->lsm_lock_owner == current_pid());	/* only the holder */
+	md->lsm_lock_owner = 0;	/* cleared while lsm_lock is still held */
+	spin_unlock(&md->lsm_lock);
+}
+EXPORT_SYMBOL(lov_stripe_unlock);
+
+static int lov_quotactl(struct obd_device *obd, struct obd_export *exp,
+			struct obd_quotactl *oqctl)
+{
+	struct lov_obd      *lov = &obd->u.lov;
+	struct lov_tgt_desc *tgt;
+	__u64		curspace = 0;
+	__u64		bhardlimit = 0;
+	int		  i, rc = 0;
+	ENTRY;
+	/* Run the quota command on each usable target; sum Q_GETOQUOTA data */
+	if (oqctl->qc_cmd != LUSTRE_Q_QUOTAON &&
+	    oqctl->qc_cmd != LUSTRE_Q_QUOTAOFF &&
+	    oqctl->qc_cmd != Q_GETOQUOTA &&
+	    oqctl->qc_cmd != Q_INITQUOTA &&
+	    oqctl->qc_cmd != LUSTRE_Q_SETQUOTA &&
+	    oqctl->qc_cmd != Q_FINVALIDATE) {
+		CERROR("bad quota opc %x for lov obd\n", oqctl->qc_cmd);
+		RETURN(-EFAULT);
+	}
+
+	/* for lov tgt */
+	obd_getref(obd);
+	for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+		int err;
+
+		tgt = lov->lov_tgts[i];
+
+		if (!tgt)
+			continue;
+		/* skipped target: an error only for Q_GETOQUOTA with ltd_activate */
+		if (!tgt->ltd_active || tgt->ltd_reap) {
+			if (oqctl->qc_cmd == Q_GETOQUOTA &&
+			    lov->lov_tgts[i]->ltd_activate) {
+				rc = -EREMOTEIO;
+				CERROR("ost %d is inactive\n", i);
+			} else {
+				CDEBUG(D_HA, "ost %d is inactive\n", i);
+			}
+			continue;
+		}
+
+		err = obd_quotactl(tgt->ltd_exp, oqctl);
+		if (err) {
+			if (tgt->ltd_active && !rc)
+				rc = err;
+			continue;
+		}
+		/* accumulate usage and hard limit over the answering targets */
+		if (oqctl->qc_cmd == Q_GETOQUOTA) {
+			curspace += oqctl->qc_dqblk.dqb_curspace;
+			bhardlimit += oqctl->qc_dqblk.dqb_bhardlimit;
+		}
+	}
+	obd_putref(obd);
+	/* the totals are stored even when rc reports an error */
+	if (oqctl->qc_cmd == Q_GETOQUOTA) {
+		oqctl->qc_dqblk.dqb_curspace = curspace;
+		oqctl->qc_dqblk.dqb_bhardlimit = bhardlimit;
+	}
+	RETURN(rc);
+}
+
+static int lov_quotacheck(struct obd_device *obd, struct obd_export *exp,
+			  struct obd_quotactl *oqctl)
+{
+	struct lov_obd *lov = &obd->u.lov;
+	int	     i, rc = 0;
+	ENTRY;
+
+	obd_getref(obd);
+
+	for (i = 0; i < lov->desc.ld_tgt_count; i++) { /* pass 1: validate */
+		if (!lov->lov_tgts[i])
+			continue;
+
+		/* Skip quota check on the administratively disabled OSTs. */
+		if (!lov->lov_tgts[i]->ltd_activate) {
+			CWARN("lov idx %d was administratively disabled, "
+			      "skip quotacheck on it.\n", i);
+			continue;
+		}
+
+		if (!lov->lov_tgts[i]->ltd_active) {
+			CERROR("lov idx %d inactive\n", i);
+			rc = -EIO;
+			goto out;	/* no check was started */
+		}
+	}
+
+	for (i = 0; i < lov->desc.ld_tgt_count; i++) { /* pass 2: run checks */
+		int err;
+
+		if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_activate)
+			continue;
+
+		err = obd_quotacheck(lov->lov_tgts[i]->ltd_exp, oqctl);
+		if (err && !rc)		/* remember only the first failure */
+			rc = err;
+	}
+
+out:
+	obd_putref(obd);
+
+	RETURN(rc);
+}
+
+struct obd_ops lov_obd_ops = {
+	.o_owner	       = THIS_MODULE,
+	.o_setup	       = lov_setup,
+	.o_precleanup	  = lov_precleanup,
+	.o_cleanup	     = lov_cleanup,
+	//.o_process_config      = lov_process_config,
+	.o_connect	     = lov_connect,
+	.o_disconnect	  = lov_disconnect,
+	.o_statfs	      = lov_statfs,
+	.o_statfs_async	= lov_statfs_async,
+	.o_packmd	      = lov_packmd,
+	.o_unpackmd	    = lov_unpackmd,
+	.o_create	      = lov_create,
+	.o_destroy	     = lov_destroy,
+	.o_getattr	     = lov_getattr,
+	.o_getattr_async       = lov_getattr_async,
+	.o_setattr	     = lov_setattr,
+	.o_setattr_async       = lov_setattr_async,
+	.o_brw		 = lov_brw,
+	.o_merge_lvb	   = lov_merge_lvb,
+	.o_adjust_kms	  = lov_adjust_kms,
+	.o_punch	       = lov_punch,
+	.o_sync		= lov_sync,
+	.o_enqueue	     = lov_enqueue,
+	.o_change_cbdata       = lov_change_cbdata,
+	.o_find_cbdata	 = lov_find_cbdata,
+	.o_cancel	      = lov_cancel,
+	.o_cancel_unused       = lov_cancel_unused,
+	.o_iocontrol	   = lov_iocontrol,
+	.o_get_info	    = lov_get_info,
+	.o_set_info_async      = lov_set_info_async,
+	.o_extent_calc	 = lov_extent_calc,
+	.o_llog_init	   = lov_llog_init,
+	.o_llog_finish	 = lov_llog_finish,
+	.o_notify	      = lov_notify,
+	.o_pool_new	    = lov_pool_new,
+	.o_pool_rem	    = lov_pool_remove,
+	.o_pool_add	    = lov_pool_add,
+	.o_pool_del	    = lov_pool_del,
+	.o_getref	      = lov_getref,
+	.o_putref	      = lov_putref,
+	.o_quotactl	    = lov_quotactl,
+	.o_quotacheck	  = lov_quotacheck,
+};
+
+struct kmem_cache *lov_oinfo_slab;
+
+extern struct lu_kmem_descr lov_caches[];
+
+/* Module init: create the LOV caches, then register the LOV obd type. */
+int __init lov_init(void)
+{
+	struct lprocfs_static_vars lvars = { 0 };
+	int rc;
+	ENTRY;
+
+	/* print an address of _any_ initialized kernel symbol from this
+	 * module, to allow debugging with gdb that doesn't support data
+	 * symbols from modules.*/
+	CDEBUG(D_INFO, "Lustre LOV module (%p).\n", &lov_caches);
+
+	rc = lu_kmem_init(lov_caches);
+	if (rc)
+		RETURN(rc);
+
+	lov_oinfo_slab = kmem_cache_create("lov_oinfo",
+					      sizeof(struct lov_oinfo),
+					      0, SLAB_HWCACHE_ALIGN, NULL);
+	if (lov_oinfo_slab == NULL) {
+		lu_kmem_fini(lov_caches);
+		RETURN(-ENOMEM);
+	}
+	lprocfs_lov_init_vars(&lvars);
+
+	rc = class_register_type(&lov_obd_ops, NULL, lvars.module_vars,
+				 LUSTRE_LOV_NAME, &lov_device_type);
+	if (rc) {	/* registration failed: undo both allocations */
+		kmem_cache_destroy(lov_oinfo_slab);
+		lu_kmem_fini(lov_caches);
+	}
+
+	RETURN(rc);
+}
+
+static void /*__exit*/ lov_exit(void)
+{
+	class_unregister_type(LUSTRE_LOV_NAME);
+	kmem_cache_destroy(lov_oinfo_slab);
+
+	lu_kmem_fini(lov_caches);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Logical Object Volume OBD driver");
+MODULE_LICENSE("GPL");
+
+cfs_module(lov, LUSTRE_VERSION_STRING, lov_init, lov_exit);
diff --git a/drivers/staging/lustre/lustre/lov/lov_object.c b/drivers/staging/lustre/lustre/lov/lov_object.c
new file mode 100644
index 000000000000..aa8ae80e8121
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_object.c
@@ -0,0 +1,942 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_object for LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+#include <lustre_debug.h>
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Layout operations.
+ *
+ */
+
+struct lov_layout_operations {
+	int (*llo_init)(const struct lu_env *env, struct lov_device *dev,
+			struct lov_object *lov,
+			const struct cl_object_conf *conf,
+			union lov_layout_state *state);
+	int (*llo_delete)(const struct lu_env *env, struct lov_object *lov,
+			   union lov_layout_state *state);
+	void (*llo_fini)(const struct lu_env *env, struct lov_object *lov,
+			 union lov_layout_state *state);
+	void (*llo_install)(const struct lu_env *env, struct lov_object *lov,
+			    union lov_layout_state *state);
+	int  (*llo_print)(const struct lu_env *env, void *cookie,
+			  lu_printer_t p, const struct lu_object *o);
+	int  (*llo_page_init)(const struct lu_env *env, struct cl_object *obj,
+				struct cl_page *page, struct page *vmpage);
+	int  (*llo_lock_init)(const struct lu_env *env,
+			      struct cl_object *obj, struct cl_lock *lock,
+			      const struct cl_io *io);
+	int  (*llo_io_init)(const struct lu_env *env,
+			    struct cl_object *obj, struct cl_io *io);
+	int  (*llo_getattr)(const struct lu_env *env, struct cl_object *obj,
+			    struct cl_attr *attr);
+};
+
+static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov);
+
+/*****************************************************************************
+ *
+ * Lov object layout operations.
+ *
+ */
+
+static void lov_install_empty(const struct lu_env *env,
+			      struct lov_object *lov,
+			      union  lov_layout_state *state)
+{
+	/*
+	 * File without objects.
+	 */
+}
+
+static int lov_init_empty(const struct lu_env *env,
+			  struct lov_device *dev, struct lov_object *lov,
+			  const struct cl_object_conf *conf,
+			  union  lov_layout_state *state)
+{
+	return 0;
+}
+
+static void lov_install_raid0(const struct lu_env *env,
+			      struct lov_object *lov,
+			      union  lov_layout_state *state)
+{
+}
+
+static struct cl_object *lov_sub_find(const struct lu_env *env,
+				      struct cl_device *dev,
+				      const struct lu_fid *fid,
+				      const struct cl_object_conf *conf)
+{
+	struct lu_object *o;
+
+	ENTRY;
+	o = lu_object_find_at(env, cl2lu_dev(dev), fid, &conf->coc_lu);
+	LASSERT(ergo(!IS_ERR(o), o->lo_dev->ld_type == &lovsub_device_type));
+	RETURN(lu2cl(o));
+}
+
+static int lov_init_sub(const struct lu_env *env, struct lov_object *lov,
+			struct cl_object *stripe,
+			struct lov_layout_raid0 *r0, int idx)
+{
+	struct cl_object_header *hdr;
+	struct cl_object_header *subhdr;
+	struct cl_object_header *parent;
+	struct lov_oinfo	*oinfo;
+	int result;
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_LOV_INIT)) {
+		/* For sanity:test_206.
+		 * Do not leave the object in cache to avoid accessing
+		 * freed memory. This is because osc_object is referring to
+		 * lov_oinfo of lsm_stripe_data which will be freed due to
+		 * this failure. */
+		cl_object_kill(env, stripe);
+		cl_object_put(env, stripe);
+		return -EIO;
+	}
+
+	hdr    = cl_object_header(lov2cl(lov));
+	subhdr = cl_object_header(stripe);
+	parent = subhdr->coh_parent;
+
+	oinfo = lov->lo_lsm->lsm_oinfo[idx];
+	CDEBUG(D_INODE, DFID"@%p[%d] -> "DFID"@%p: ostid: "DOSTID
+	       " idx: %d gen: %d\n",
+	       PFID(&subhdr->coh_lu.loh_fid), subhdr, idx,
+	       PFID(&hdr->coh_lu.loh_fid), hdr, POSTID(&oinfo->loi_oi),
+	       oinfo->loi_ost_idx, oinfo->loi_ost_gen);
+
+	if (parent == NULL) {
+		subhdr->coh_parent = hdr;
+		subhdr->coh_nesting = hdr->coh_nesting + 1;
+		lu_object_ref_add(&stripe->co_lu, "lov-parent", lov);
+		r0->lo_sub[idx] = cl2lovsub(stripe);
+		r0->lo_sub[idx]->lso_super = lov;
+		r0->lo_sub[idx]->lso_index = idx;
+		result = 0;
+	} else {
+		struct lu_object  *old_obj;
+		struct lov_object *old_lov;
+		unsigned int mask = D_INODE;
+
+		old_obj = lu_object_locate(&parent->coh_lu, &lov_device_type);
+		LASSERT(old_obj != NULL);
+		old_lov = cl2lov(lu2cl(old_obj));
+		if (old_lov->lo_layout_invalid) {
+			/* the object's layout has already changed but isn't
+			 * refreshed */
+			lu_object_unhash(env, &stripe->co_lu);
+			result = -EAGAIN;
+		} else {
+			mask = D_ERROR;
+			result = -EIO;
+		}
+
+		LU_OBJECT_DEBUG(mask, env, &stripe->co_lu,
+				"stripe %d is already owned.\n", idx);
+		LU_OBJECT_DEBUG(mask, env, old_obj, "owned.\n");
+		LU_OBJECT_HEADER(mask, env, lov2lu(lov), "try to own.\n");
+		cl_object_put(env, stripe);
+	}
+	return result;
+}
+
+static int lov_init_raid0(const struct lu_env *env,
+			  struct lov_device *dev, struct lov_object *lov,
+			  const struct cl_object_conf *conf,
+			  union  lov_layout_state *state)
+{
+	int result;
+	int i;
+
+	struct cl_object	*stripe;
+	struct lov_thread_info  *lti     = lov_env_info(env);
+	struct cl_object_conf   *subconf = &lti->lti_stripe_conf;
+	struct lov_stripe_md    *lsm     = conf->u.coc_md->lsm;
+	struct lu_fid	   *ofid    = &lti->lti_fid;
+	struct lov_layout_raid0 *r0      = &state->raid0;
+
+	ENTRY;
+
+	if (lsm->lsm_magic != LOV_MAGIC_V1 && lsm->lsm_magic != LOV_MAGIC_V3) {
+		dump_lsm(D_ERROR, lsm);
+		LASSERTF(0, "magic mismatch, expected %d/%d, actual %d.\n",
+			 LOV_MAGIC_V1, LOV_MAGIC_V3, lsm->lsm_magic);
+	}
+
+	LASSERT(lov->lo_lsm == NULL);
+	lov->lo_lsm = lsm_addref(lsm);
+	r0->lo_nr  = lsm->lsm_stripe_count;
+	LASSERT(r0->lo_nr <= lov_targets_nr(dev));
+
+	OBD_ALLOC_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]);
+	if (r0->lo_sub != NULL) {
+		result = 0;
+		subconf->coc_inode = conf->coc_inode;
+		spin_lock_init(&r0->lo_sub_lock);
+		/*
+		 * Create stripe cl_objects.
+		 */
+		for (i = 0; i < r0->lo_nr && result == 0; ++i) {
+			struct cl_device *subdev;
+			struct lov_oinfo *oinfo = lsm->lsm_oinfo[i];
+			int ost_idx = oinfo->loi_ost_idx;
+
+			result = ostid_to_fid(ofid, &oinfo->loi_oi,
+					      oinfo->loi_ost_idx);
+			if (result != 0)
+				GOTO(out, result);
+
+			subdev = lovsub2cl_dev(dev->ld_target[ost_idx]);
+			subconf->u.coc_oinfo = oinfo;
+			LASSERTF(subdev != NULL, "not init ost %d\n", ost_idx);
+			/* In the function below, .hs_keycmp resolves to
+			 * lu_obj_hop_keycmp() */
+			/* coverity[overrun-buffer-val] */
+			stripe = lov_sub_find(env, subdev, ofid, subconf);
+			if (!IS_ERR(stripe)) {
+				result = lov_init_sub(env, lov, stripe, r0, i);
+				if (result == -EAGAIN) { /* try again */
+					--i;
+					result = 0;
+				}
+			} else {
+				result = PTR_ERR(stripe);
+			}
+		}
+	} else
+		result = -ENOMEM;
+out:
+	RETURN(result);
+}
+
+static int lov_delete_empty(const struct lu_env *env, struct lov_object *lov,
+			    union lov_layout_state *state)
+{
+	LASSERT(lov->lo_type == LLT_EMPTY);
+
+	lov_layout_wait(env, lov);
+
+	cl_object_prune(env, &lov->lo_cl);
+	return 0;
+}
+
+static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov,
+			       struct lovsub_object *los, int idx)
+{
+	struct cl_object	*sub;
+	struct lov_layout_raid0 *r0;
+	struct lu_site	  *site;
+	struct lu_site_bkt_data *bkt;
+	wait_queue_t	  *waiter;
+
+	r0  = &lov->u.raid0;
+	LASSERT(r0->lo_sub[idx] == los);
+
+	sub  = lovsub2cl(los);
+	site = sub->co_lu.lo_dev->ld_site;
+	bkt  = lu_site_bkt_from_fid(site, &sub->co_lu.lo_header->loh_fid);
+
+	cl_object_kill(env, sub);
+	/* release a reference to the sub-object and ... */
+	lu_object_ref_del(&sub->co_lu, "lov-parent", lov);
+	cl_object_put(env, sub);
+
+	/* ... wait until it is actually destroyed---sub-object clears its
+	 * ->lo_sub[] slot in lovsub_object_fini() */
+	if (r0->lo_sub[idx] == los) {
+		waiter = &lov_env_info(env)->lti_waiter;
+		init_waitqueue_entry_current(waiter);
+		add_wait_queue(&bkt->lsb_marche_funebre, waiter);
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		while (1) {
+			/* this wait-queue is signaled at the end of
+			 * lu_object_free(). */
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			spin_lock(&r0->lo_sub_lock);
+			if (r0->lo_sub[idx] == los) {
+				spin_unlock(&r0->lo_sub_lock);
+				waitq_wait(waiter, TASK_UNINTERRUPTIBLE);
+			} else {
+				spin_unlock(&r0->lo_sub_lock);
+				set_current_state(TASK_RUNNING);
+				break;
+			}
+		}
+		remove_wait_queue(&bkt->lsb_marche_funebre, waiter);
+	}
+	LASSERT(r0->lo_sub[idx] == NULL);
+}
+
+static int lov_delete_raid0(const struct lu_env *env, struct lov_object *lov,
+			    union lov_layout_state *state)
+{
+	struct lov_layout_raid0 *r0 = &state->raid0;
+	struct lov_stripe_md    *lsm = lov->lo_lsm;
+	int i;
+
+	ENTRY;
+
+	dump_lsm(D_INODE, lsm);
+
+	lov_layout_wait(env, lov);
+	if (r0->lo_sub != NULL) {
+		for (i = 0; i < r0->lo_nr; ++i) {
+			struct lovsub_object *los = r0->lo_sub[i];
+
+			if (los != NULL) {
+				cl_locks_prune(env, &los->lso_cl, 1);
+				/*
+				 * If top-level object is to be evicted from
+				 * the cache, so are its sub-objects.
+				 */
+				lov_subobject_kill(env, lov, los, i);
+			}
+		}
+	}
+	cl_object_prune(env, &lov->lo_cl);
+	RETURN(0);
+}
+
+static void lov_fini_empty(const struct lu_env *env, struct lov_object *lov,
+			   union lov_layout_state *state)
+{
+	LASSERT(lov->lo_type == LLT_EMPTY);
+}
+
+static void lov_fini_raid0(const struct lu_env *env, struct lov_object *lov,
+			   union lov_layout_state *state)
+{
+	struct lov_layout_raid0 *r0 = &state->raid0;
+	ENTRY;
+
+	if (r0->lo_sub != NULL) {
+		OBD_FREE_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]);
+		r0->lo_sub = NULL;
+	}
+
+	dump_lsm(D_INODE, lov->lo_lsm);
+	lov_free_memmd(&lov->lo_lsm);
+
+	EXIT;
+}
+
+static int lov_print_empty(const struct lu_env *env, void *cookie,
+			   lu_printer_t p, const struct lu_object *o)
+{
+	(*p)(env, cookie, "empty %d\n", lu2lov(o)->lo_layout_invalid);
+	return 0;
+}
+
+static int lov_print_raid0(const struct lu_env *env, void *cookie,
+			   lu_printer_t p, const struct lu_object *o)
+{
+	struct lov_object       *lov = lu2lov(o);
+	struct lov_layout_raid0 *r0  = lov_r0(lov);
+	struct lov_stripe_md    *lsm = lov->lo_lsm;
+	int i;
+
+	(*p)(env, cookie, "stripes: %d, %svalid, lsm{%p 0x%08X %d %u %u}: \n",
+		r0->lo_nr, lov->lo_layout_invalid ? "in" : "", lsm,
+		lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
+		lsm->lsm_stripe_count, lsm->lsm_layout_gen);
+	for (i = 0; i < r0->lo_nr; ++i) {
+		struct lu_object *sub;
+
+		if (r0->lo_sub[i] != NULL) {
+			sub = lovsub2lu(r0->lo_sub[i]);
+			lu_object_print(env, cookie, p, sub);
+		} else
+			(*p)(env, cookie, "sub %d absent\n", i);
+	}
+	return 0;
+}
+
+/**
+ * Implements cl_object_operations::coo_attr_get() method for an object
+ * without stripes (LLT_EMPTY layout type).
+ *
+ * The only attribute this layer is authoritative for in this case is
+ * cl_attr::cat_blocks, which is always 0.
+ */
+static int lov_attr_get_empty(const struct lu_env *env, struct cl_object *obj,
+			      struct cl_attr *attr)
+{
+	attr->cat_blocks = 0;
+	return 0;
+}
+
+static int lov_attr_get_raid0(const struct lu_env *env, struct cl_object *obj,
+			      struct cl_attr *attr)
+{
+	struct lov_object	*lov = cl2lov(obj);
+	struct lov_layout_raid0 *r0 = lov_r0(lov);
+	struct cl_attr		*lov_attr = &r0->lo_attr;
+	int			 result = 0;
+
+	ENTRY;
+
+	/* this is called w/o holding type guard mutex, so it must be inside
+	 * an on going IO otherwise lsm may be replaced.
+	 * LU-2117: it turns out there exists one exception. For mmaped files,
+	 * the lock of those files may be requested in the other file's IO
+	 * context, and this function is called in ccc_lock_state(), it will
+	 * hit this assertion.
+	 * Anyway, it's still okay to call attr_get w/o type guard as layout
+	 * can't go if locks exist. */
+	/* LASSERT(atomic_read(&lsm->lsm_refc) > 1); */
+
+	if (!r0->lo_attr_valid) {
+		struct lov_stripe_md    *lsm = lov->lo_lsm;
+		struct ost_lvb	  *lvb = &lov_env_info(env)->lti_lvb;
+		__u64		    kms = 0;
+
+		memset(lvb, 0, sizeof(*lvb));
+		/* XXX: timestamps can be negative by sanity:test_39m,
+		 * how can it be? */
+		lvb->lvb_atime = LLONG_MIN;
+		lvb->lvb_ctime = LLONG_MIN;
+		lvb->lvb_mtime = LLONG_MIN;
+
+		/*
+		 * XXX that should be replaced with a loop over sub-objects,
+		 * doing cl_object_attr_get() on them. But for now, let's
+		 * reuse old lov code.
+		 */
+
+		/*
+		 * XXX take lsm spin-lock to keep lov_merge_lvb_kms()
+		 * happy. It's not needed, because new code uses
+		 * ->coh_attr_guard spin-lock to protect consistency of
+		 * sub-object attributes.
+		 */
+		lov_stripe_lock(lsm);
+		result = lov_merge_lvb_kms(lsm, lvb, &kms);
+		lov_stripe_unlock(lsm);
+		if (result == 0) {
+			cl_lvb2attr(lov_attr, lvb);
+			lov_attr->cat_kms = kms;
+			r0->lo_attr_valid = 1;
+		}
+	}
+	if (result == 0) { /* merge results */
+		attr->cat_blocks = lov_attr->cat_blocks;
+		attr->cat_size = lov_attr->cat_size;
+		attr->cat_kms = lov_attr->cat_kms;
+		if (attr->cat_atime < lov_attr->cat_atime)
+			attr->cat_atime = lov_attr->cat_atime;
+		if (attr->cat_ctime < lov_attr->cat_ctime)
+			attr->cat_ctime = lov_attr->cat_ctime;
+		if (attr->cat_mtime < lov_attr->cat_mtime)
+			attr->cat_mtime = lov_attr->cat_mtime;
+	}
+	RETURN(result);
+}
+
+static const struct lov_layout_operations lov_dispatch[] = { /* by lo_type */
+	[LLT_EMPTY] = {		/* file without objects */
+		.llo_init      = lov_init_empty,
+		.llo_delete    = lov_delete_empty,
+		.llo_fini      = lov_fini_empty,
+		.llo_install   = lov_install_empty,
+		.llo_print     = lov_print_empty,
+		.llo_page_init = lov_page_init_empty,
+		.llo_lock_init = lov_lock_init_empty,
+		.llo_io_init   = lov_io_init_empty,
+		.llo_getattr   = lov_attr_get_empty
+	},
+	[LLT_RAID0] = {		/* file striped over sub-objects */
+		.llo_init      = lov_init_raid0,
+		.llo_delete    = lov_delete_raid0,
+		.llo_fini      = lov_fini_raid0,
+		.llo_install   = lov_install_raid0,
+		.llo_print     = lov_print_raid0,
+		.llo_page_init = lov_page_init_raid0,
+		.llo_lock_init = lov_lock_init_raid0,
+		.llo_io_init   = lov_io_init_raid0,
+		.llo_getattr   = lov_attr_get_raid0
+	}
+};
+
+
+/**
+ * Performs a double-dispatch based on the layout type of an object.
+ */
+#define LOV_2DISPATCH_NOLOCK(obj, op, ...)			      \
+({								      \
+	struct lov_object		      *__obj = (obj);	  \
+	enum lov_layout_type		    __llt;		  \
+									\
+	__llt = __obj->lo_type;					 \
+	LASSERT(0 <= __llt && __llt < ARRAY_SIZE(lov_dispatch));	\
+	lov_dispatch[__llt].op(__VA_ARGS__);			    \
+})
+
+static inline void lov_conf_freeze(struct lov_object *lov)
+{
+	if (lov->lo_owner != current)	/* owner already has the write lock */
+		down_read(&lov->lo_type_guard);
+}
+
+static inline void lov_conf_thaw(struct lov_object *lov)
+{
+	if (lov->lo_owner != current)	/* the owner took no read lock */
+		up_read(&lov->lo_type_guard);
+}
+
+#define LOV_2DISPATCH_MAYLOCK(obj, op, lock, ...)		       \
+({								      \
+	struct lov_object		      *__obj = (obj);	  \
+	int				     __lock = !!(lock);      \
+	typeof(lov_dispatch[0].op(__VA_ARGS__)) __result;	       \
+									\
+	if (__lock)						     \
+		lov_conf_freeze(__obj);					\
+	__result = LOV_2DISPATCH_NOLOCK(obj, op, __VA_ARGS__);	  \
+	if (__lock)						     \
+		lov_conf_thaw(__obj);					\
+	__result;						       \
+})
+
+/**
+ * Performs a locked double-dispatch based on the layout type of an object.
+ */
+#define LOV_2DISPATCH(obj, op, ...)		     \
+	LOV_2DISPATCH_MAYLOCK(obj, op, 1, __VA_ARGS__)
+
+#define LOV_2DISPATCH_VOID(obj, op, ...)				\
+do {								    \
+	struct lov_object		      *__obj = (obj);	  \
+	enum lov_layout_type		    __llt;		  \
+									\
+	lov_conf_freeze(__obj);						\
+	__llt = __obj->lo_type;					 \
+	LASSERT(0 <= __llt && __llt < ARRAY_SIZE(lov_dispatch));	\
+	lov_dispatch[__llt].op(__VA_ARGS__);			    \
+	lov_conf_thaw(__obj);						\
+} while (0)
+
+static void lov_conf_lock(struct lov_object *lov)
+{
+	LASSERT(lov->lo_owner != current);	/* not recursive */
+	down_write(&lov->lo_type_guard);
+	LASSERT(lov->lo_owner == NULL);
+	lov->lo_owner = current;	/* freeze/thaw skip the rwsem for us */
+}
+
+static void lov_conf_unlock(struct lov_object *lov)
+{
+	lov->lo_owner = NULL;	/* clear before the write lock is released */
+	up_write(&lov->lo_type_guard);
+}
+
+static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov)
+{
+	struct l_wait_info lwi = { 0 };
+	ENTRY;
+
+	while (atomic_read(&lov->lo_active_ios) > 0) {
+		CDEBUG(D_INODE, "file:"DFID" wait for active IO, now: %d.\n",
+			PFID(lu_object_fid(lov2lu(lov))),
+			atomic_read(&lov->lo_active_ios));
+
+		l_wait_event(lov->lo_waitq,
+			     atomic_read(&lov->lo_active_ios) == 0, &lwi);
+	}
+	RETURN(0);
+}
+
+static int lov_layout_change(const struct lu_env *unused,
+			     struct lov_object *lov,
+			     const struct cl_object_conf *conf)
+{
+	int result;
+	enum lov_layout_type llt = LLT_EMPTY;
+	union lov_layout_state *state = &lov->u;
+	const struct lov_layout_operations *old_ops;
+	const struct lov_layout_operations *new_ops;
+
+	struct cl_object_header *hdr = cl_object_header(&lov->lo_cl);
+	void *cookie;
+	struct lu_env *env;
+	int refcheck;
+	ENTRY;
+
+	LASSERT(0 <= lov->lo_type && lov->lo_type < ARRAY_SIZE(lov_dispatch));
+
+	if (conf->u.coc_md != NULL && conf->u.coc_md->lsm != NULL)
+		llt = LLT_RAID0; /* only raid0 is supported. */
+	LASSERT(0 <= llt && llt < ARRAY_SIZE(lov_dispatch));
+
+	cookie = cl_env_reenter();
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env)) {
+		cl_env_reexit(cookie);
+		RETURN(PTR_ERR(env));
+	}
+
+	old_ops = &lov_dispatch[lov->lo_type];
+	new_ops = &lov_dispatch[llt];
+
+	result = old_ops->llo_delete(env, lov, &lov->u);
+	if (result == 0) {
+		old_ops->llo_fini(env, lov, &lov->u);
+
+		LASSERT(atomic_read(&lov->lo_active_ios) == 0);
+		LASSERT(hdr->coh_tree.rnode == NULL);
+		LASSERT(hdr->coh_pages == 0);
+
+		lov->lo_type = LLT_EMPTY;
+		result = new_ops->llo_init(env,
+					lu2lov_dev(lov->lo_cl.co_lu.lo_dev),
+					lov, conf, state);
+		if (result == 0) {
+			new_ops->llo_install(env, lov, state);
+			lov->lo_type = llt;
+		} else {
+			new_ops->llo_delete(env, lov, state);
+			new_ops->llo_fini(env, lov, state);
+			/* this file becomes an EMPTY file. */
+		}
+	}
+
+	cl_env_put(env, &refcheck);
+	cl_env_reexit(cookie);
+	RETURN(result);
+}
+
+/*****************************************************************************
+ *
+ * Lov object operations.
+ *
+ */
+
+int lov_object_init(const struct lu_env *env, struct lu_object *obj,
+		    const struct lu_object_conf *conf)
+{
+	struct lov_device	    *dev   = lu2lov_dev(obj->lo_dev);
+	struct lov_object	    *lov   = lu2lov(obj);
+	const struct cl_object_conf  *cconf = lu2cl_conf(conf);
+	union  lov_layout_state      *set   = &lov->u;
+	const struct lov_layout_operations *ops;
+	int result;
+
+	ENTRY;
+	init_rwsem(&lov->lo_type_guard);
+	atomic_set(&lov->lo_active_ios, 0);
+	init_waitqueue_head(&lov->lo_waitq);
+
+	cl_object_page_init(lu2cl(obj), sizeof(struct lov_page));
+
+	/* no locking is necessary, as object is being created */
+	lov->lo_type = cconf->u.coc_md->lsm != NULL ? LLT_RAID0 : LLT_EMPTY;
+	ops = &lov_dispatch[lov->lo_type];
+	result = ops->llo_init(env, dev, lov, cconf, set);
+	if (result == 0)
+		ops->llo_install(env, lov, set);
+	RETURN(result);
+}
+
+/* coo_conf_set method: invalidate, wait on, or replace the object layout
+ * while holding lo_type_guard; returns 0 or a negative errno. */
+static int lov_conf_set(const struct lu_env *env, struct cl_object *obj,
+			const struct cl_object_conf *conf)
+{
+	struct lov_stripe_md *lsm = NULL;
+	struct lov_object *lov = cl2lov(obj);
+	int result = 0;
+	ENTRY;
+
+	lov_conf_lock(lov);
+	if (conf->coc_opc == OBJECT_CONF_INVALIDATE) {
+		lov->lo_layout_invalid = true;
+		GOTO(out, result = 0);
+	}
+
+	if (conf->coc_opc == OBJECT_CONF_WAIT) {
+		if (lov->lo_layout_invalid &&
+		    atomic_read(&lov->lo_active_ios) > 0) {
+			lov_conf_unlock(lov);
+			result = lov_layout_wait(env, lov);
+			lov_conf_lock(lov);
+		}
+		GOTO(out, result);
+	}
+
+	LASSERT(conf->coc_opc == OBJECT_CONF_SET);
+	if (conf->u.coc_md != NULL)
+		lsm = conf->u.coc_md->lsm;
+	if ((lsm == NULL && lov->lo_lsm == NULL) ||
+	    (lsm != NULL && lov->lo_lsm != NULL &&
+	     lov->lo_lsm->lsm_layout_gen == lsm->lsm_layout_gen)) {
+		/* same version of layout */
+		lov->lo_layout_invalid = false;
+		GOTO(out, result = 0);
+	}
+
+	/* will change layout - check if there still exists active IO. */
+	if (atomic_read(&lov->lo_active_ios) > 0) {
+		lov->lo_layout_invalid = true;
+		GOTO(out, result = -EBUSY);
+	}
+
+	/* hand a failed layout change back to the caller, not just a flag */
+	result = lov_layout_change(env, lov, conf);
+	lov->lo_layout_invalid = result != 0;
+out:
+	lov_conf_unlock(lov);
+	RETURN(result);
+}
+
+static void lov_object_delete(const struct lu_env *env, struct lu_object *obj)
+{
+	struct lov_object *lov = lu2lov(obj);
+
+	ENTRY;
+	LOV_2DISPATCH_VOID(lov, llo_delete, env, lov, &lov->u);
+	EXIT;
+}
+
+static void lov_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+	struct lov_object *lov = lu2lov(obj);
+
+	ENTRY;
+	LOV_2DISPATCH_VOID(lov, llo_fini, env, lov, &lov->u);
+	lu_object_fini(obj);
+	OBD_SLAB_FREE_PTR(lov, lov_object_kmem);
+	EXIT;
+}
+
+static int lov_object_print(const struct lu_env *env, void *cookie,
+			    lu_printer_t p, const struct lu_object *o)
+{
+	return LOV_2DISPATCH_NOLOCK(lu2lov(o), llo_print, env, cookie, p, o);
+}
+
+int lov_page_init(const struct lu_env *env, struct cl_object *obj,
+		struct cl_page *page, struct page *vmpage)
+{
+	return LOV_2DISPATCH_NOLOCK(cl2lov(obj),
+				    llo_page_init, env, obj, page, vmpage);
+}
+
+/**
+ * Implements cl_object_operations::coo_io_init() method for lov
+ * layer. Dispatches to the appropriate layout io initialization method.
+ */
+int lov_io_init(const struct lu_env *env, struct cl_object *obj,
+		struct cl_io *io)
+{
+	CL_IO_SLICE_CLEAN(lov_env_io(env), lis_cl);
+	return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_io_init,
+				     !io->ci_ignore_layout, env, obj, io);
+}
+
+/**
+ * An implementation of cl_object_operations::coo_attr_get() method for lov
+ * layer. For raid0 layout this collects and merges attributes of all
+ * sub-objects.
+ */
+static int lov_attr_get(const struct lu_env *env, struct cl_object *obj,
+			struct cl_attr *attr)
+{
+	/* do not take lock, as this function is called under a
+	 * spin-lock. Layout is protected from changing by ongoing IO. */
+	return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_getattr, env, obj, attr);
+}
+
+static int lov_attr_set(const struct lu_env *env, struct cl_object *obj,
+			const struct cl_attr *attr, unsigned valid)
+{
+	/*
+	 * No dispatch is required here, as no layout implements this.
+	 */
+	return 0;
+}
+
+int lov_lock_init(const struct lu_env *env, struct cl_object *obj,
+		  struct cl_lock *lock, const struct cl_io *io)
+{
+	/* No need to lock because we've taken one refcount of layout.  */
+	return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_lock_init, env, obj, lock,
+				    io);
+}
+
+static const struct cl_object_operations lov_ops = {
+	.coo_page_init = lov_page_init,
+	.coo_lock_init = lov_lock_init,
+	.coo_io_init   = lov_io_init,
+	.coo_attr_get  = lov_attr_get,
+	.coo_attr_set  = lov_attr_set,
+	.coo_conf_set  = lov_conf_set
+};
+
+static const struct lu_object_operations lov_lu_obj_ops = {
+	.loo_object_init      = lov_object_init,
+	.loo_object_delete    = lov_object_delete,
+	.loo_object_release   = NULL,
+	.loo_object_free      = lov_object_free,
+	.loo_object_print     = lov_object_print,
+	.loo_object_invariant = NULL
+};
+
+/* Allocate a lov_object from lov_object_kmem and hook up its cl and lu
+ * operation tables; returns its lu_object, or NULL when allocation fails. */
+struct lu_object *lov_object_alloc(const struct lu_env *env,
+				   const struct lu_object_header *unused,
+				   struct lu_device *dev)
+{
+	struct lov_object *lov;
+	struct lu_object  *lu;
+
+	ENTRY;
+	OBD_SLAB_ALLOC_PTR_GFP(lov, lov_object_kmem, __GFP_IO);
+	if (lov == NULL)
+		RETURN(NULL);
+
+	lu = lov2lu(lov);
+	lu_object_init(lu, NULL, dev);
+	lov->lo_cl.co_ops = &lov_ops;
+	/* -1 is not a valid layout type: it catches use of an object that
+	 * lov_object_init() has not set up yet */
+	lov->lo_type = -1;
+	lu->lo_ops = &lov_lu_obj_ops;
+
+	RETURN(lu);
+}
+
+struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov)
+{
+	struct lov_stripe_md *lsm = NULL;
+
+	lov_conf_freeze(lov);
+	if (lov->lo_lsm != NULL) {
+		lsm = lsm_addref(lov->lo_lsm);
+		CDEBUG(D_INODE, "lsm %p addref %d/%d by %p.\n",
+			lsm, atomic_read(&lsm->lsm_refc),
+			lov->lo_layout_invalid, current);
+	}
+	lov_conf_thaw(lov);
+	return lsm;
+}
+
+void lov_lsm_decref(struct lov_object *lov, struct lov_stripe_md *lsm)
+{
+	if (lsm == NULL)
+		return;
+
+	CDEBUG(D_INODE, "lsm %p decref %d by %p.\n",
+		lsm, atomic_read(&lsm->lsm_refc), current);
+
+	lov_free_memmd(&lsm);
+}
+
+/* Return a referenced lsm from the LOV layer of @clobj, NULL if none. */
+struct lov_stripe_md *lov_lsm_get(struct cl_object *clobj)
+{
+	struct lu_object *lov_layer;
+
+	if (clobj == NULL)
+		return NULL;
+
+	lov_layer = lu_object_locate(&cl_object_header(clobj)->coh_lu,
+				     &lov_device_type);
+	if (lov_layer == NULL)
+		return NULL;
+	return lov_lsm_addref(lu2lov(lov_layer));
+}
+EXPORT_SYMBOL(lov_lsm_get);
+
+void lov_lsm_put(struct cl_object *unused, struct lov_stripe_md *lsm)
+{
+	if (lsm != NULL)
+		lov_free_memmd(&lsm);
+}
+EXPORT_SYMBOL(lov_lsm_put);
+
+int lov_read_and_clear_async_rc(struct cl_object *clob)
+{
+	struct lu_object *luobj;
+	int rc = 0;
+	ENTRY;
+
+	luobj = lu_object_locate(&cl_object_header(clob)->coh_lu,
+				 &lov_device_type);
+	if (luobj != NULL) {
+		struct lov_object *lov = lu2lov(luobj);
+
+		lov_conf_freeze(lov);
+		switch (lov->lo_type) {
+		case LLT_RAID0: {
+			struct lov_stripe_md *lsm;
+			int i;
+
+			lsm = lov->lo_lsm;
+			LASSERT(lsm != NULL);
+			for (i = 0; i < lsm->lsm_stripe_count; i++) {
+				struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+				if (loi->loi_ar.ar_rc && !rc) /* keep first */
+					rc = loi->loi_ar.ar_rc;
+				loi->loi_ar.ar_rc = 0;	/* always reset */
+			}
+		} /* fall through */
+		case LLT_EMPTY:
+			break;
+		default:
+			LBUG();
+		}
+		lov_conf_thaw(lov);
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(lov_read_and_clear_async_rc);
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lov_offset.c b/drivers/staging/lustre/lustre/lov/lov_offset.c
new file mode 100644
index 000000000000..f62b7e53b665
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_offset.c
@@ -0,0 +1,267 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_class.h>
+#include <obd_lov.h>
+
+#include "lov_internal.h"
+
+/* file size implied by stripe "stripeno" holding "ost_size" bytes (0 -> 0) */
+obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size,
+			 int stripeno)
+{
+	unsigned long ssize = lsm->lsm_stripe_size;
+	unsigned long stripe_size;
+	obd_off swidth;
+	int magic = lsm->lsm_magic;
+	ENTRY;
+
+	if (ost_size == 0)
+		RETURN(0);
+
+	LASSERT(lsm_op_find(magic) != NULL);
+	lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, NULL, &swidth);
+
+	/* lov_do_div64(a, b) returns a % b, and a = a / b */
+	stripe_size = lov_do_div64(ost_size, ssize);
+	if (stripe_size == 0) {
+		/* ends on a stripe boundary: count the last stripe as full */
+		ost_size--;
+		stripe_size = ssize;
+	}
+	/* widen stripeno first: stripeno * ssize can overflow a 32-bit long */
+	RETURN(ost_size * swidth + (obd_size)stripeno * ssize + stripe_size);
+}
+
+/* we have an offset in file backed by an lov and want to find out where
+ * that offset lands in our given stripe of the file.  for the easy
+ * case where the offset is within the stripe, we just have to scale the
+ * offset down to make it relative to the stripe instead of the lov.
+ *
+ * the harder case is what to do when the offset doesn't intersect the
+ * stripe.  callers will want start offsets clamped ahead to the start
+ * of the nearest stripe in the file.  end offsets similarly clamped to the
+ * nearest ending byte of a stripe in the file:
+ *
+ * all this function does is move offsets to the nearest region of the
+ * stripe, and it does its work "mod" the full length of all the stripes.
+ * consider a file with 3 stripes:
+ *
+ *	     S					      E
+ * ---------------------------------------------------------------------
+ * |    0    |     1     |     2     |    0    |     1     |     2     |
+ * ---------------------------------------------------------------------
+ *
+ * to find stripe 1's offsets for S and E, it divides by the full stripe
+ * width and does its math in the context of a single set of stripes:
+ *
+ *	     S	 E
+ * -----------------------------------
+ * |    0    |     1     |     2     |
+ * -----------------------------------
+ *
+ * it'll notice that E is outside stripe 1 and clamp it to the end of the
+ * stripe, then multiply it back out by lov_off to give the real offsets in
+ * the stripe:
+ *
+ *   S		   E
+ * ---------------------------------------------------------------------
+ * |    1    |     1     |     1     |    1    |     1     |     1     |
+ * ---------------------------------------------------------------------
+ *
+ * it would have done similarly and pulled S forward to the start of a 1
+ * stripe if, say, S had landed in a 0 stripe.
+ *
+ * this rounding isn't always correct.  consider an E lov offset that lands
+ * on a 0 stripe, the "mod stripe width" math will pull it forward to the
+ * start of a 1 stripe, when in fact it wanted to be rounded back to the end
+ * of a previous 1 stripe.  this logic is handled by callers and this is why:
+ *
+ * this function returns < 0 when the offset was "before" the stripe and
+ * was moved forward to the start of the stripe in question;  0 when it
+ * falls in the stripe and no shifting was done; > 0 when the offset
+ * was outside the stripe and was pulled back to its final byte. */
+int lov_stripe_offset(struct lov_stripe_md *lsm, obd_off lov_off,
+		      int stripeno, obd_off *obdoff)
+{
+	unsigned long ssize = lsm->lsm_stripe_size;
+	obd_off in_row, stripe_start, swidth;
+	int magic = lsm->lsm_magic;
+	int side;
+
+	if (lov_off == OBD_OBJECT_EOF) {
+		*obdoff = OBD_OBJECT_EOF;
+		return 0;
+	}
+
+	LASSERT(lsm_op_find(magic) != NULL);
+	lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &lov_off,
+						&swidth);
+
+	/* lov_off becomes the quotient; in_row is the remainder mod swidth */
+	in_row = lov_do_div64(lov_off, swidth);
+	stripe_start = (obd_off)stripeno * ssize;
+
+	if (in_row < stripe_start) {
+		/* before our stripe: move forward to its start */
+		in_row = 0;
+		side = -1;
+	} else if (in_row - stripe_start >= ssize) {
+		/* beyond our stripe: pull back to its end (offset ssize) */
+		in_row = ssize;
+		side = 1;
+	} else {
+		in_row -= stripe_start;
+		side = 0;
+	}
+
+	*obdoff = lov_off * ssize + in_row;
+	return side;
+}
+
+/* Given a whole-file size and a stripe number, give the file size which
+ * corresponds to the individual object of that stripe.
+ *
+ * This behaves basically in the same way as lov_stripe_offset, except that
+ * file sizes falling before the beginning of a stripe are clamped to the end
+ * of the previous stripe, not the beginning of the next:
+ *
+ *					       S
+ * ---------------------------------------------------------------------
+ * |    0    |     1     |     2     |    0    |     1     |     2     |
+ * ---------------------------------------------------------------------
+ *
+ * if clamped to stripe 2 becomes:
+ *
+ *				   S
+ * ---------------------------------------------------------------------
+ * |    0    |     1     |     2     |    0    |     1     |     2     |
+ * ---------------------------------------------------------------------
+ */
+obd_off lov_size_to_stripe(struct lov_stripe_md *lsm, obd_off file_size,
+			   int stripeno)
+{
+	unsigned long ssize = lsm->lsm_stripe_size;
+	obd_off in_row, stripe_start, swidth;
+	int magic = lsm->lsm_magic;
+
+	if (file_size == OBD_OBJECT_EOF)
+		return OBD_OBJECT_EOF;
+
+	LASSERT(lsm_op_find(magic) != NULL);
+
+	lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &file_size,
+						&swidth);
+
+	/* lov_do_div64() leaves the quotient (file_size / swidth) in
+	 * file_size and hands back the remainder (file_size % swidth) */
+	in_row = lov_do_div64(file_size, swidth);
+	stripe_start = (obd_off)stripeno * ssize;
+
+	if (in_row >= stripe_start) {
+		/* at or past our stripe: clamp to the end of this stripe */
+		in_row -= stripe_start;
+
+		if (in_row >= ssize)
+			in_row = ssize;
+	} else if (file_size > 0) {
+		/* before our stripe: move to the end of the previous one */
+		file_size--;
+		in_row = ssize;
+	} else {
+		/* before our stripe with no previous one: zero */
+		in_row = 0;
+	}
+
+	return file_size * ssize + in_row;
+}
+
+/* given an extent in an lov and a stripe, calculate the extent of the stripe
+ * that is contained within the lov extent.  this returns true if the given
+ * stripe does intersect with the lov extent. */
+int lov_stripe_intersects(struct lov_stripe_md *lsm, int stripeno,
+			  obd_off start, obd_off end,
+			  obd_off *obd_start, obd_off *obd_end)
+{
+	int start_side, end_side;
+
+	start_side = lov_stripe_offset(lsm, start, stripeno, obd_start);
+	end_side = lov_stripe_offset(lsm, end, stripeno, obd_end);
+	/* a non-zero side means lov_stripe_offset() had to clamp that end */
+	CDEBUG(D_INODE, "["LPU64"->"LPU64"] -> [(%d) "LPU64"->"LPU64" (%d)]\n",
+	       start, end, start_side, *obd_start, *obd_end, end_side);
+
+	/* this stripe doesn't intersect the file extent when neither
+	 * start nor end fell inside the stripe and obd_start and
+	 * obd_end got clamped to the same value. */
+	if (start_side != 0 && end_side != 0 && *obd_start == *obd_end)
+		return 0;
+
+	/* as mentioned in the lov_stripe_offset commentary, end
+	 * might have been shifted in the wrong direction.  This
+	 * happens when an end offset is before the stripe when viewed
+	 * through the "mod stripe size" math. we detect it being shifted
+	 * in the wrong direction and touch it up.
+	 * interestingly, this can't underflow since end must be > start
+	 * if we passed through the previous check.
+	 * (should we assert for that somewhere?) */
+	if (end_side != 0)
+		(*obd_end)--;
+
+	return 1;
+}
+
+/* compute which stripe number "lov_off" will be written into */
+int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off)
+{
+	unsigned long ssize = lsm->lsm_stripe_size;
+	obd_off in_row, swidth;
+	int magic = lsm->lsm_magic;
+
+	LASSERT(lsm_op_find(magic) != NULL);
+	lsm_op_find(magic)->lsm_stripe_by_offset(lsm, NULL, &lov_off, &swidth);
+
+	/* remainder of lov_off modulo the full stripe width */
+	in_row = lov_do_div64(lov_off, swidth);
+
+	/* divides in place: in_row now holds in_row / ssize */
+	lov_do_div64(in_row, ssize);
+	return in_row;
+}
diff --git a/drivers/staging/lustre/lustre/lov/lov_pack.c b/drivers/staging/lustre/lustre/lov/lov_pack.c
new file mode 100644
index 000000000000..8bb57aa5f418
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_pack.c
@@ -0,0 +1,677 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_pack.c
+ *
+ * (Un)packing of OST/MDS requests
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <lustre_net.h>
+#include <obd.h>
+#include <obd_lov.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_user.h>
+
+#include "lov_internal.h"
+
+static void lov_dump_lmm_common(int level, void *lmmp)
+{
+	struct lov_mds_md *hdr = lmmp;
+	struct ost_id cpu_oi;
+
+	lmm_oi_le_to_cpu(&cpu_oi, &hdr->lmm_oi);	/* LE -> CPU order */
+	CDEBUG(level, "objid "DOSTID", magic 0x%08x, pattern %#x\n",
+	       POSTID(&cpu_oi), le32_to_cpu(hdr->lmm_magic),
+	       le32_to_cpu(hdr->lmm_pattern));
+	CDEBUG(level, "stripe_size %u, stripe_count %u, layout_gen %u\n",
+	       le32_to_cpu(hdr->lmm_stripe_size),
+	       le16_to_cpu(hdr->lmm_stripe_count),
+	       le16_to_cpu(hdr->lmm_layout_gen));
+}
+
+static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod,
+				 int stripe_count)
+{
+	struct ost_id oi;
+	int i;
+
+	if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) {
+		CDEBUG(level, "bad stripe_count %u > max_stripe_count %u\n",
+		       stripe_count, LOV_V1_INSANE_STRIPE_COUNT);
+		return;	/* untrusted count: don't read past the EA */
+	}
+	/* one debug line per stripe, object ids converted to CPU order */
+	for (i = 0; i < stripe_count; ++i, ++lod) {
+		ostid_le_to_cpu(&lod->l_ost_oi, &oi);
+		CDEBUG(level, "stripe %u idx %u subobj "DOSTID"\n", i,
+		       le32_to_cpu(lod->l_ost_idx), POSTID(&oi));
+	}
+}
+
+void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm)
+{
+	lov_dump_lmm_common(level, lmm);	/* header fields first */
+	lov_dump_lmm_objects(level, lmm->lmm_objects,
+			     le16_to_cpu(lmm->lmm_stripe_count)); /* LE count */
+}
+
+void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm)
+{
+	lov_dump_lmm_common(level, lmm);	/* header fields first */
+	CDEBUG(level,"pool_name "LOV_POOLNAMEF"\n", lmm->lmm_pool_name);
+	lov_dump_lmm_objects(level, lmm->lmm_objects,
+			     le16_to_cpu(lmm->lmm_stripe_count)); /* LE count */
+}
+
+/* Dump an on-disk (little-endian) LOV EA at debug @level.  The magic
+ * selects the v1 or v3 dumper; anything else is only reported as an
+ * error, since its layout is unknown. */
+void lov_dump_lmm(int level, void *lmm)
+{
+	/* lmm_magic is little-endian like every other field of the EA */
+	int magic = le32_to_cpu(((struct lov_mds_md_v1 *)lmm)->lmm_magic);
+
+	if (magic == LOV_MAGIC_V1)
+		lov_dump_lmm_v1(level, lmm);
+	else if (magic == LOV_MAGIC_V3)
+		lov_dump_lmm_v3(level, lmm);
+	else
+		CERROR("Cannot recognize lmm_magic %x\n", magic);
+}
+
+#define LMM_ASSERT(test)	/* uses 'lmm' from the caller's scope */ \
+do {									\
+	if (!(test)) lov_dump_lmm(D_ERROR, lmm);			\
+	LASSERT(test);	/* so we know what assertion failed */		\
+} while (0)
+
+/* Pack LOV object metadata for disk storage.  It is packed in LE byte
+ * order and is opaque to the networking layer.
+ *
+ * With @lmmp == NULL only the EA size is returned; with *@lmmp set and
+ * @lsm == NULL the buffer is freed (returns 0); otherwise *@lmmp is
+ * allocated if needed and filled from @lsm, returning the EA size or a
+ * negative errno.  XXX LOV stacking: sizes assume one obd_id per stripe.
+ */
+int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
+	       struct lov_stripe_md *lsm)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+	struct lov_obd *lov = &obd->u.lov;
+	struct lov_mds_md_v1 *lmmv1;
+	struct lov_mds_md_v3 *lmmv3;
+	__u16 stripe_count;
+	struct lov_ost_data_v1 *lmm_objects;
+	int lmm_size, lmm_magic;
+	int i;
+	int cplen = 0;
+	ENTRY;
+
+	if (lsm) {
+		lmm_magic = lsm->lsm_magic;
+	} else {
+		if (lmmp && *lmmp)
+			lmm_magic = le32_to_cpu((*lmmp)->lmm_magic);
+		else
+			/* neither an lsm nor an existing EA: use the default */
+			lmm_magic = LOV_MAGIC;
+	}
+
+	if ((lmm_magic != LOV_MAGIC_V1) &&
+	    (lmm_magic != LOV_MAGIC_V3)) {
+		CERROR("bad mem LOV MAGIC: 0x%08X != 0x%08X nor 0x%08X\n",
+			lmm_magic, LOV_MAGIC_V1, LOV_MAGIC_V3);
+		RETURN(-EINVAL);
+
+	}
+
+	if (lsm) {
+		/* If we are just sizing the EA, limit the stripe count
+		 * to the actual number of OSTs in this filesystem. */
+		if (!lmmp) {
+			stripe_count = lov_get_stripecnt(lov, lmm_magic,
+							 lsm->lsm_stripe_count);
+			lsm->lsm_stripe_count = stripe_count;
+		} else {
+			stripe_count = lsm->lsm_stripe_count;
+		}
+	} else {
+		/* No need to allocate more than maximum supported stripes.
+		 * Anyway, this is pretty inaccurate since ld_tgt_count now
+		 * represents max index and we should rely on the actual number
+		 * of OSTs instead */
+		stripe_count = lov_mds_md_stripecnt(lov->lov_ocd.ocd_max_easize,
+						    lmm_magic);
+		if (stripe_count > lov->desc.ld_tgt_count)
+			stripe_count = lov->desc.ld_tgt_count;
+	}
+
+	/* XXX LOV STACKING call into osc for sizes */
+	lmm_size = lov_mds_md_size(stripe_count, lmm_magic);
+	/* size-only query: nothing to allocate or fill */
+	if (!lmmp)
+		RETURN(lmm_size);
+	/* a buffer but no lsm: free it, sized from its own stripe count */
+	if (*lmmp && !lsm) {
+		stripe_count = le16_to_cpu((*lmmp)->lmm_stripe_count);
+		lmm_size = lov_mds_md_size(stripe_count, lmm_magic);
+		OBD_FREE_LARGE(*lmmp, lmm_size);
+		*lmmp = NULL;
+		RETURN(0);
+	}
+
+	if (!*lmmp) {
+		OBD_ALLOC_LARGE(*lmmp, lmm_size);
+		if (!*lmmp)
+			RETURN(-ENOMEM);
+	}
+
+	CDEBUG(D_INFO, "lov_packmd: LOV_MAGIC 0x%08X, lmm_size = %d \n",
+	       lmm_magic, lmm_size);
+
+	lmmv1 = *lmmp;
+	lmmv3 = (struct lov_mds_md_v3 *)*lmmp;
+	if (lmm_magic == LOV_MAGIC_V3)
+		lmmv3->lmm_magic = cpu_to_le32(LOV_MAGIC_V3);
+	else
+		lmmv1->lmm_magic = cpu_to_le32(LOV_MAGIC_V1);
+	/* no lsm: hand back the buffer with only the magic filled in */
+	if (!lsm)
+		RETURN(lmm_size);
+
+	/* lmmv1 and lmmv3 alias the same buffer and share their leading
+	 * fields.  NOTE(review): the -E2BIG exit below returns with *lmmp
+	 * still allocated; confirm that callers free it. */
+	lmm_oi_cpu_to_le(&lmmv1->lmm_oi, &lsm->lsm_oi);
+	lmmv1->lmm_stripe_size = cpu_to_le32(lsm->lsm_stripe_size);
+	lmmv1->lmm_stripe_count = cpu_to_le16(stripe_count);
+	lmmv1->lmm_pattern = cpu_to_le32(lsm->lsm_pattern);
+	lmmv1->lmm_layout_gen = cpu_to_le16(lsm->lsm_layout_gen);
+	if (lsm->lsm_magic == LOV_MAGIC_V3) {
+		cplen = strlcpy(lmmv3->lmm_pool_name, lsm->lsm_pool_name,
+				sizeof(lmmv3->lmm_pool_name));
+		if (cplen >= sizeof(lmmv3->lmm_pool_name))
+			RETURN(-E2BIG);
+		lmm_objects = lmmv3->lmm_objects;
+	} else {
+		lmm_objects = lmmv1->lmm_objects;
+	}
+
+	for (i = 0; i < stripe_count; i++) {
+		struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+		/* XXX LOV STACKING call down to osc_packmd() to do packing */
+		LASSERTF(ostid_id(&loi->loi_oi) != 0, "lmm_oi "DOSTID
+			 " stripe %u/%u idx %u\n", POSTID(&lmmv1->lmm_oi),
+			 i, stripe_count, loi->loi_ost_idx);
+		ostid_cpu_to_le(&loi->loi_oi, &lmm_objects[i].l_ost_oi);
+		lmm_objects[i].l_ost_gen = cpu_to_le32(loi->loi_ost_gen);
+		lmm_objects[i].l_ost_idx = cpu_to_le32(loi->loi_ost_idx);
+	}
+
+	RETURN(lmm_size);
+}
+
+/* Find the max stripecount we should use */
+__u16 lov_get_stripecnt(struct lov_obd *lov, __u32 magic, __u16 stripe_count)
+{
+	__u32 limit = LOV_MAX_STRIPE_COUNT_OLD;
+
+	/* 0 selects the configured default stripe count */
+	if (stripe_count == 0)
+		stripe_count = lov->desc.ld_default_stripe_count;
+
+	/* clamp to the active target count, but always use at least one */
+	if (stripe_count > lov->desc.ld_active_tgt_count)
+		stripe_count = lov->desc.ld_active_tgt_count;
+	if (stripe_count == 0)
+		stripe_count = 1;
+
+	/* when the connect data carries a max EA size, the limit is taken
+	 * from lov_mds_md_stripecnt() instead of the old fixed maximum */
+	if ((lov->lov_ocd.ocd_connect_flags & OBD_CONNECT_MAX_EASIZE) &&
+	    lov->lov_ocd.ocd_max_easize != 0)
+		limit = lov_mds_md_stripecnt(lov->lov_ocd.ocd_max_easize,
+					     magic);
+
+	return stripe_count > limit ? limit : stripe_count;
+}
+
+
+/* Validate an on-disk LOV EA through the lsm_lmm_verify hook of its magic.
+ * An unknown magic is logged with a hex dump and rejected with -EINVAL. */
+static int lov_verify_lmm(void *lmm, int lmm_bytes, __u16 *stripe_count)
+{
+	if (lsm_op_find(le32_to_cpu(*(__u32 *)lmm)) == NULL) {
+		char *buffer;
+		int sz;
+
+		CERROR("bad disk LOV MAGIC: 0x%08X; dumping LMM (size=%d):\n",
+		       le32_to_cpu(*(__u32 *)lmm), lmm_bytes);
+		sz = lmm_bytes * 2 + 1;
+		OBD_ALLOC_LARGE(buffer, sz);
+		if (buffer != NULL) {
+			int i;
+
+			/* not char: negative bytes print 8 digits, overflow */
+			for (i = 0; i < lmm_bytes; i++)
+				sprintf(buffer+2*i, "%.2X", ((__u8 *)lmm)[i]);
+			buffer[sz - 1] = '\0';
+			CERROR("%s\n", buffer);
+			OBD_FREE_LARGE(buffer, sz);
+		}
+		return -EINVAL;
+	}
+	return lsm_op_find(le32_to_cpu(*(__u32 *)lmm))->lsm_lmm_verify(lmm,
+				     lmm_bytes, stripe_count);
+}
+
+int lov_alloc_memmd(struct lov_stripe_md **lsmp, __u16 stripe_count,
+		    int pattern, int magic)
+{
+	int i, lsm_size;
+	ENTRY;
+
+	CDEBUG(D_INFO, "alloc lsm, stripe_count %d\n", stripe_count);
+	/* returns the size reported by lsm_alloc_plain(), or -ENOMEM */
+	*lsmp = lsm_alloc_plain(stripe_count, &lsm_size);
+	if (!*lsmp) {
+		CERROR("can't allocate lsmp stripe_count %d\n", stripe_count);
+		RETURN(-ENOMEM);
+	}
+	/* new lsm: a single reference, empty pool name, generation 0 */
+	atomic_set(&(*lsmp)->lsm_refc, 1);
+	spin_lock_init(&(*lsmp)->lsm_lock);
+	(*lsmp)->lsm_magic = magic;
+	(*lsmp)->lsm_stripe_count = stripe_count;
+	(*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES * stripe_count;
+	(*lsmp)->lsm_pattern = pattern;
+	(*lsmp)->lsm_pool_name[0] = '\0';
+	(*lsmp)->lsm_layout_gen = 0;
+	if (stripe_count > 0)	/* no lsm_oinfo[0] without stripes */
+		(*lsmp)->lsm_oinfo[0]->loi_ost_idx = ~0;
+	for (i = 0; i < stripe_count; i++)
+		loi_init((*lsmp)->lsm_oinfo[i]);
+
+	RETURN(lsm_size);
+}
+
+int lov_free_memmd(struct lov_stripe_md **lsmp)
+{
+	struct lov_stripe_md *md = *lsmp;
+	int left;
+
+	*lsmp = NULL;	/* the caller's pointer is cleared in every case */
+	LASSERT(atomic_read(&md->lsm_refc) > 0);
+	if ((left = atomic_dec_return(&md->lsm_refc)) == 0) {
+		LASSERT(lsm_op_find(md->lsm_magic) != NULL);
+		lsm_op_find(md->lsm_magic)->lsm_free(md);
+	}
+	return left;	/* reference count after the decrement */
+}
+
+
+/* Unpack LE on-disk LOV metadata @lmm into a newly allocated *@lsmp.
+ * Returns the lsm size, 0 after freeing *@lsmp (@lmm == NULL), or -errno.
+ */
+int lov_unpackmd(struct obd_export *exp,  struct lov_stripe_md **lsmp,
+		 struct lov_mds_md *lmm, int lmm_bytes)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+	struct lov_obd *lov = &obd->u.lov;
+	int rc = 0, lsm_size;
+	__u16 stripe_count;
+	__u32 magic;
+	ENTRY;
+
+	/* If passed an MDS struct use values from there, otherwise defaults */
+	if (lmm) {
+		rc = lov_verify_lmm(lmm, lmm_bytes, &stripe_count);
+		if (rc)
+			RETURN(rc);
+		magic = le32_to_cpu(lmm->lmm_magic);
+	} else {
+		magic = LOV_MAGIC;
+		stripe_count = lov_get_stripecnt(lov, magic, 0);
+	}
+
+	/* If we aren't passed an lsmp struct, we just want the size */
+	if (!lsmp) {
+		/* XXX LOV STACKING call into osc for sizes */
+		LBUG();	/* asserts first, so the RETURN below is not expected */
+		RETURN(lov_stripe_md_size(stripe_count));
+	}
+	/* If we are passed an allocated struct but nothing to unpack, free */
+	if (*lsmp && !lmm) {
+		lov_free_memmd(lsmp);
+		RETURN(0);
+	}
+	/* NOTE(review): an lsm already in *lsmp is overwritten, not freed */
+	lsm_size = lov_alloc_memmd(lsmp, stripe_count, LOV_PATTERN_RAID0,
+				   magic);
+	if (lsm_size < 0)
+		RETURN(lsm_size);
+
+	/* If we are passed a pointer but nothing to unpack, we only alloc */
+	if (!lmm)
+		RETURN(lsm_size);
+
+	LASSERT(lsm_op_find(magic) != NULL);
+	rc = lsm_op_find(magic)->lsm_unpackmd(lov, *lsmp, lmm);
+	if (rc) {
+		lov_free_memmd(lsmp);
+		RETURN(rc);
+	}
+
+	RETURN(lsm_size);
+}
+
+/* Validate the striping request @lump (taken through a local copy), clamp
+ * the stripe count to what the LOV, @max_lmm_size and any named pool allow,
+ * and allocate *@lsmp for it.  Returns 0 or a negative errno. */
+static int __lov_setstripe(struct obd_export *exp, int max_lmm_size,
+			   struct lov_stripe_md **lsmp,
+			   struct lov_user_md *lump)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+	struct lov_obd *lov = &obd->u.lov;
+	char buffer[sizeof(struct lov_user_md_v3)];
+	struct lov_user_md_v3 *lumv3 = (struct lov_user_md_v3 *)&buffer[0];
+	struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&buffer[0];
+	int lmm_magic, rc, cplen;
+	__u16 stripe_count;
+	ENTRY;
+
+	rc = lov_lum_swab_if_needed(lumv3, &lmm_magic, lump);
+	if (rc)
+		RETURN(rc);
+
+	/* in the rest of the tests, as *lumv1 and lumv3 have the same
+	 * fields, we use lumv1 to avoid code duplication */
+	if (lumv1->lmm_pattern == 0) {
+		lumv1->lmm_pattern = lov->desc.ld_pattern ?
+			lov->desc.ld_pattern : LOV_PATTERN_RAID0;
+	}
+
+	if (lumv1->lmm_pattern != LOV_PATTERN_RAID0) {
+		CDEBUG(D_IOCTL, "bad userland stripe pattern: %#x\n",
+		       lumv1->lmm_pattern);
+		RETURN(-EINVAL);
+	}
+
+	/* 64kB is the largest common page size we see (ia64), and matches the
+	 * check in lfs */
+	if (lumv1->lmm_stripe_size & (LOV_MIN_STRIPE_SIZE - 1)) {
+		CDEBUG(D_IOCTL, "stripe size %u not multiple of %u, fixing\n",
+		       lumv1->lmm_stripe_size, LOV_MIN_STRIPE_SIZE);
+		lumv1->lmm_stripe_size = LOV_MIN_STRIPE_SIZE;
+	}
+
+	if ((lumv1->lmm_stripe_offset >= lov->desc.ld_tgt_count) &&
+	    (lumv1->lmm_stripe_offset !=
+	     (typeof(lumv1->lmm_stripe_offset))(-1))) {
+		CDEBUG(D_IOCTL, "stripe offset %u > number of OSTs %u\n",
+		       lumv1->lmm_stripe_offset, lov->desc.ld_tgt_count);
+		RETURN(-EINVAL);
+	}
+	stripe_count = lov_get_stripecnt(lov, lmm_magic,
+					 lumv1->lmm_stripe_count);
+
+	if (max_lmm_size) {
+		int max_stripes = (max_lmm_size -
+				   lov_mds_md_size(0, lmm_magic)) /
+				   sizeof(struct lov_ost_data_v1);
+		if (unlikely(max_stripes < stripe_count)) {
+			CDEBUG(D_IOCTL, "stripe count reset from %d to %d\n",
+			       stripe_count, max_stripes);
+			stripe_count = max_stripes;
+		}
+	}
+
+	if (lmm_magic == LOV_USER_MAGIC_V3) {
+		struct pool_desc *pool;
+
+		/* In the function below, .hs_keycmp resolves to
+		 * pool_hashkey_keycmp() */
+		/* coverity[overrun-buffer-val] */
+		pool = lov_find_pool(lov, lumv3->lmm_pool_name);
+		if (pool != NULL) {
+			if (lumv3->lmm_stripe_offset !=
+			    (typeof(lumv3->lmm_stripe_offset))(-1)) {
+				rc = lov_check_index_in_pool(
+					lumv3->lmm_stripe_offset, pool);
+				if (rc < 0) {
+					lov_pool_putref(pool);
+					RETURN(-EINVAL);
+				}
+			}
+
+			if (stripe_count > pool_tgt_count(pool))
+				stripe_count = pool_tgt_count(pool);
+
+			lov_pool_putref(pool);
+		}
+	}
+
+	rc = lov_alloc_memmd(lsmp, stripe_count, lumv1->lmm_pattern, lmm_magic);
+	if (rc < 0)
+		RETURN(rc);
+	if (stripe_count > 0)	/* the clamps above can leave no stripes */
+		(*lsmp)->lsm_oinfo[0]->loi_ost_idx = lumv1->lmm_stripe_offset;
+	(*lsmp)->lsm_stripe_size = lumv1->lmm_stripe_size;
+	if (lmm_magic == LOV_USER_MAGIC_V3) {
+		cplen = strlcpy((*lsmp)->lsm_pool_name, lumv3->lmm_pool_name,
+				sizeof((*lsmp)->lsm_pool_name));
+		if (cplen >= sizeof((*lsmp)->lsm_pool_name)) {
+			lov_free_memmd(lsmp);	/* don't leak the new lsm */
+			RETURN(-E2BIG);
+		}
+	}
+
+	RETURN(0);
+}
+
+/* Configure object striping information on a new file.
+ *
+ * @lump is a pointer to a user struct with one or more of the fields set to
+ * indicate the application preference: lmm_stripe_count, lmm_stripe_size,
+ * lmm_stripe_offset, and lmm_pattern.  lmm_magic must be LOV_MAGIC.
+ * @lsmp is a pointer to an in-core stripe MD that needs to be filled in.
+ */
+int lov_setstripe(struct obd_export *exp, int max_lmm_size,
+		  struct lov_stripe_md **lsmp, struct lov_user_md *lump)
+{
+	mm_segment_t saved_fs = get_fs();
+	int rc;
+
+	/* presumably lets @lump be read from kernel memory - TODO confirm */
+	set_fs(KERNEL_DS);
+	rc = __lov_setstripe(exp, max_lmm_size, lsmp, lump);
+	set_fs(saved_fs);
+
+	RETURN(rc);
+}
+
+int lov_setea(struct obd_export *exp, struct lov_stripe_md **lsmp,
+	      struct lov_user_md *lump)
+{
+	int i, rc;
+	struct obd_export *oexp;
+	struct lov_obd *lov = &exp->exp_obd->u.lov;
+	obd_id last_id = 0;
+	struct lov_user_ost_data_v1 *lmm_objects;
+	ENTRY;
+
+	if (lump->lmm_magic == LOV_USER_MAGIC_V3)
+		lmm_objects = ((struct lov_user_md_v3 *)lump)->lmm_objects;
+	else
+		lmm_objects = lump->lmm_objects;
+	/* check every object first: valid OST index, id not past last id */
+	for (i = 0; i < lump->lmm_stripe_count; i++) {
+		__u32 len = sizeof(last_id);
+		if (lmm_objects[i].l_ost_idx >= lov->desc.ld_tgt_count)
+			RETURN(-EINVAL);	/* would index past lov_tgts */
+		oexp = lov->lov_tgts[lmm_objects[i].l_ost_idx]->ltd_exp;
+		rc = obd_get_info(NULL, oexp, sizeof(KEY_LAST_ID), KEY_LAST_ID,
+				  &len, &last_id, NULL);
+		if (rc)
+			RETURN(rc);
+		if (ostid_id(&lmm_objects[i].l_ost_oi) > last_id) {
+			CERROR("Setting EA for object > than last id on"
+			       " ost idx %d "DOSTID" > "LPD64" \n",
+			       lmm_objects[i].l_ost_idx,
+			       POSTID(&lmm_objects[i].l_ost_oi), last_id);
+			RETURN(-EINVAL);
+		}
+	}
+	/* allocate the lsm, then record the given objects in its stripes */
+	rc = lov_setstripe(exp, 0, lsmp, lump);
+	if (rc)
+		RETURN(rc);
+	/* the lsm may hold fewer stripes than requested: stay in bounds */
+	for (i = 0; i < lump->lmm_stripe_count &&
+		    i < (*lsmp)->lsm_stripe_count; i++) {
+		(*lsmp)->lsm_oinfo[i]->loi_ost_idx = lmm_objects[i].l_ost_idx;
+		(*lsmp)->lsm_oinfo[i]->loi_oi = lmm_objects[i].l_ost_oi;
+	}
+	RETURN(0);
+}
+
+
+/* Retrieve object striping information.
+ *
+ * @lump is a pointer to an in-core struct with lmm_stripe_count indicating
+ * the maximum number of OST entries which will fit in the caller's buffer
+ * (0: header only).  lmm_magic must be LOV_USER_MAGIC or LOV_USER_MAGIC_V3.
+ */
+int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm,
+		  struct lov_user_md *lump)
+{
+	/*
+	 * XXX huge struct allocated on stack.
+	 */
+	/* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */
+	struct lov_user_md_v3 lum;
+	struct lov_mds_md *lmmk = NULL;
+	int rc, lmm_size;
+	int lum_size;
+	mm_segment_t seg;
+	ENTRY;
+
+	if (!lsm)
+		RETURN(-ENODATA);
+
+	/*
+	 * "Switch to kernel segment" to allow copying from kernel space by
+	 * copy_{to,from}_user().
+	 */
+	seg = get_fs();
+	set_fs(KERNEL_DS);
+
+	/* we only need the header part from user space to get lmm_magic and
+	 * lmm_stripe_count, (the header part is common to v1 and v3) */
+	lum_size = sizeof(struct lov_user_md_v1);
+	if (copy_from_user(&lum, lump, lum_size))
+		GOTO(out_set, rc = -EFAULT);
+	else if ((lum.lmm_magic != LOV_USER_MAGIC) &&
+		 (lum.lmm_magic != LOV_USER_MAGIC_V3))
+		GOTO(out_set, rc = -EINVAL);
+
+	if (lum.lmm_stripe_count &&
+	    (lum.lmm_stripe_count < lsm->lsm_stripe_count)) {
+		/* Tell the user the real stripe count (best effort) */
+		lum.lmm_stripe_count = lsm->lsm_stripe_count;
+		rc = copy_to_user(lump, &lum, lum_size);
+		GOTO(out_set, rc = -EOVERFLOW);
+	}
+	rc = lov_packmd(exp, &lmmk, lsm);
+	if (rc < 0)
+		GOTO(out_set, rc);
+	lmm_size = rc;
+	rc = 0;
+
+	/* FIXME: Bug 1185 - copy fields properly when structs change */
+	/* struct lov_user_md_v3 and struct lov_mds_md_v3 must be the same */
+	CLASSERT(sizeof(lum) == sizeof(struct lov_mds_md_v3));
+	CLASSERT(sizeof lum.lmm_objects[0] == sizeof lmmk->lmm_objects[0]);
+
+	if ((cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) &&
+	    ((lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) ||
+	    (lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)))) {
+		lustre_swab_lov_mds_md(lmmk);
+		lustre_swab_lov_user_md_objects(
+				(struct lov_user_ost_data*)lmmk->lmm_objects,
+				lmmk->lmm_stripe_count);
+	}
+	if (lum.lmm_magic == LOV_USER_MAGIC) {
+		/* User request for v1, we need skip lmm_pool_name: move the
+		 * objects down to where the v1 layout keeps lmm_objects */
+		if (lmmk->lmm_magic == LOV_MAGIC_V3) {
+			memmove(lmmk->lmm_objects,
+				((struct lov_mds_md_v3*)lmmk)->lmm_objects,
+				lmmk->lmm_stripe_count *
+				sizeof(struct lov_ost_data_v1));
+			lmm_size -= LOV_MAXPOOLNAME;
+		}
+	} else {
+		/* if v3 we just have to update the lum_size */
+		lum_size = sizeof(struct lov_user_md_v3);
+	}
+
+	/* User wasn't expecting this many OST entries (lmmk is freed) */
+	if (lum.lmm_stripe_count == 0)
+		lmm_size = lum_size;
+	else if (lum.lmm_stripe_count < lmmk->lmm_stripe_count)
+		GOTO(out_free, rc = -EOVERFLOW);
+	/*
+	 * Have a difference between lov_mds_md & lov_user_md.
+	 * So we have to re-order the data before copy to user.
+	 */
+	lum.lmm_stripe_count = lmmk->lmm_stripe_count;
+	lum.lmm_layout_gen = lmmk->lmm_layout_gen;
+	((struct lov_user_md *)lmmk)->lmm_layout_gen = lum.lmm_layout_gen;
+	((struct lov_user_md *)lmmk)->lmm_stripe_count = lum.lmm_stripe_count;
+	if (copy_to_user(lump, lmmk, lmm_size))
+		rc = -EFAULT;
+out_free:
+	obd_free_diskmd(exp, &lmmk);
+out_set:
+	set_fs(seg);
+	RETURN(rc);
+}
diff --git a/drivers/staging/lustre/lustre/lov/lov_page.c b/drivers/staging/lustre/lustre/lov/lov_page.c
new file mode 100644
index 000000000000..65790d684720
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_page.c
@@ -0,0 +1,235 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_page for LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Lov page operations.
+ *
+ */
+
+/* Invariant: a sub-page, if any, is linked both ways and shares cp_state. */
+static int lov_page_invariant(const struct cl_page_slice *slice)
+{
+	const struct cl_page *parent = slice->cpl_page;
+	const struct cl_page *child = lov_sub_page(slice);
+
+	return ergo(child != NULL, parent->cp_child == child &&
+		    child->cp_parent == parent &&
+		    parent->cp_state == child->cp_state);
+}
+
+/* cpo_fini: unlink the sub-page (if any) from its parent and put it. */
+static void lov_page_fini(const struct lu_env *env,
+			  struct cl_page_slice *slice)
+{
+	struct cl_page *child = lov_sub_page(slice);
+
+	LINVRNT(lov_page_invariant(slice));
+	ENTRY;
+	if (child != NULL) {
+		LASSERT(child->cp_state == CPS_FREEING);
+		lu_ref_del(&child->cp_reference, "lov", child->cp_parent);
+		child->cp_parent = NULL;
+		slice->cpl_page->cp_child = NULL;
+		cl_page_put(env, child);
+	}
+	EXIT;
+}
+
+static int lov_page_own(const struct lu_env *env,
+			const struct cl_page_slice *slice, struct cl_io *io,
+			int nonblock)
+{
+	struct lov_io     *lio = lov_env_io(env);
+	struct lov_io_sub *sub;
+
+	LINVRNT(lov_page_invariant(slice));
+	LINVRNT(!cl2lov_page(slice)->lps_invalid);
+	ENTRY;
+	/* Record sub->sub_io as owner of the sub-page; always returns 0. */
+	sub = lov_page_subio(env, lio, slice);
+	if (!IS_ERR(sub)) {
+		lov_sub_page(slice)->cp_owner = sub->sub_io;
+		lov_sub_put(sub);
+	} else
+		LBUG(); /* failing to get the sub-io is treated as a bug */
+	RETURN(0);
+}
+
+static void lov_page_assume(const struct lu_env *env,
+			    const struct cl_page_slice *slice, struct cl_io *io)
+{
+	lov_page_own(env, slice, io, 0); /* same work as own; result unused */
+}
+
+/* cpo_cache_add (CRT_WRITE): add the sub-page to the cache via its sub-io. */
+static int lov_page_cache_add(const struct lu_env *env,
+			      const struct cl_page_slice *slice,
+			      struct cl_io *io)
+{
+	struct lov_io *lio = lov_env_io(env);
+	struct lov_io_sub *subio;
+	int rc = 0;
+
+	LINVRNT(lov_page_invariant(slice));
+	LINVRNT(!cl2lov_page(slice)->lps_invalid);
+	ENTRY;
+	subio = lov_page_subio(env, lio, slice);
+	if (IS_ERR(subio)) {
+		rc = PTR_ERR(subio);
+		CL_PAGE_DEBUG(D_ERROR, env, slice->cpl_page, "rc = %d\n", rc);
+	} else {
+		rc = cl_page_cache_add(subio->sub_env, subio->sub_io,
+				       slice->cpl_page->cp_child, CRT_WRITE);
+		lov_sub_put(subio);
+	}
+	RETURN(rc);
+}
+
+/* cpo_print: emit the lov_page address through the supplied printer. */
+static int lov_page_print(const struct lu_env *env,
+			  const struct cl_page_slice *slice,
+			  void *cookie, lu_printer_t printer)
+{
+	return printer(env, cookie, LUSTRE_LOV_NAME"-page@%p\n",
+		       cl2lov_page(slice));
+}
+
+static const struct cl_page_operations lov_page_ops = { /* raid0 page slices */
+	.cpo_fini   = lov_page_fini,
+	.cpo_own    = lov_page_own,
+	.cpo_assume = lov_page_assume,
+	.io = {
+		[CRT_WRITE] = { /* only the write slot sets a method here */
+			.cpo_cache_add = lov_page_cache_add
+		}
+	},
+	.cpo_print  = lov_page_print
+};
+
+static void lov_empty_page_fini(const struct lu_env *env,
+				struct cl_page_slice *slice)
+{
+	LASSERT(slice->cpl_page->cp_child == NULL); /* must have no sub-page */
+}
+
+/* Init the LOV slice of @page for a striped object and look up the sub-page
+ * in the stripe's sub-object; lps_invalid stays set unless that succeeds. */
+int lov_page_init_raid0(const struct lu_env *env, struct cl_object *obj,
+			struct cl_page *page, struct page *vmpage)
+{
+	struct lov_object *lov_obj = cl2lov(obj);
+	struct lov_layout_raid0 *raid0 = lov_r0(lov_obj);
+	struct lov_io *lio = lov_env_io(env);
+	struct cl_page *child;
+	struct cl_object *child_obj;
+	struct lov_io_sub *subio;
+	struct lov_page *lpg = cl_object_page_slice(obj, page);
+	loff_t file_off;
+	obd_off stripe_off;
+	int stripe, rc;
+	ENTRY;
+
+	file_off = cl_offset(obj, page->cp_index);
+	stripe = lov_stripe_number(lov_obj->lo_lsm, file_off);
+	LASSERT(stripe < raid0->lo_nr);
+	rc = lov_stripe_offset(lov_obj->lo_lsm, file_off, stripe, &stripe_off);
+	LASSERT(rc == 0);
+
+	lpg->lps_invalid = 1;
+	cl_page_slice_add(page, &lpg->lps_cl, obj, &lov_page_ops);
+
+	subio = lov_sub_get(env, lio, stripe);
+	if (IS_ERR(subio))
+		GOTO(out, rc = PTR_ERR(subio));
+
+	child_obj = lovsub2cl(raid0->lo_sub[stripe]);
+	child = cl_page_find_sub(subio->sub_env, child_obj,
+				 cl_index(child_obj, stripe_off), vmpage, page);
+	lov_sub_put(subio);
+	if (IS_ERR(child))
+		GOTO(out, rc = PTR_ERR(child));
+
+	if (unlikely(child->cp_parent != page)) {
+		CL_PAGE_DEBUG(D_ERROR, env, page, "parent page\n");
+		CL_PAGE_DEBUG(D_ERROR, env, child, "child page\n");
+		LASSERT(0);
+	} else {
+		lu_ref_add(&child->cp_reference, "lov", page);
+		lpg->lps_invalid = 0;
+		rc = 0;
+	}
+
+	EXIT;
+out:
+	return rc;
+}
+
+
+static const struct cl_page_operations lov_empty_page_ops = {
+	.cpo_fini   = lov_empty_page_fini,
+	.cpo_print  = lov_page_print /* shared with lov_page_ops */
+};
+
+int lov_page_init_empty(const struct lu_env *env, struct cl_object *obj,
+			struct cl_page *page, struct page *vmpage)
+{
+	struct lov_page *lpg = cl_object_page_slice(obj, page);
+	void *addr;
+	ENTRY;
+	/* Add the slice, zero cl_page_size(obj) bytes of @vmpage, export. */
+	cl_page_slice_add(page, &lpg->lps_cl, obj, &lov_empty_page_ops);
+	addr = kmap(vmpage);
+	memset(addr, 0, cl_page_size(obj));
+	kunmap(vmpage);
+	cl_page_export(env, page, 1);
+	RETURN(0);
+}
+
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lov_pool.c b/drivers/staging/lustre/lustre/lov/lov_pool.c
new file mode 100644
index 000000000000..0f3f96dee915
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_pool.c
@@ -0,0 +1,682 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see [sun.com URL with a
+ * copy of GPLv2].
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_pool.c
+ *
+ * OST pool methods
+ *
+ * Author: Jacques-Charles LAFOUCRIERE <jc.lafoucriere@cea.fr>
+ * Author: Alex Lyashkov <Alexey.Lyashkov@Sun.COM>
+ * Author: Nathaniel Rutman <Nathan.Rutman@Sun.COM>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd.h>
+#include "lov_internal.h"
+/* Target descriptor of the _i-th member of pool _p (callers check for NULL). */
+#define pool_tgt(_p, _i) \
+		(_p)->pool_lobd->u.lov.lov_tgts[(_p)->pool_obds.op_array[_i]]
+
+static void lov_pool_getref(struct pool_desc *pool)
+{
+	CDEBUG(D_INFO, "pool %p\n", pool);
+	atomic_inc(&pool->pool_refcount); /* undone by lov_pool_putref*() */
+}
+
+void lov_pool_putref(struct pool_desc *pool)
+{
+	CDEBUG(D_INFO, "pool %p\n", pool);
+	if (atomic_dec_and_test(&pool->pool_refcount)) { /* last reference */
+		LASSERT(hlist_unhashed(&pool->pool_hash));
+		LASSERT(list_empty(&pool->pool_list));
+		LASSERT(pool->pool_proc_entry == NULL);
+		lov_ost_pool_free(&(pool->pool_rr.lqr_pool));
+		lov_ost_pool_free(&(pool->pool_obds));
+		OBD_FREE_PTR(pool);
+		EXIT; /* NOTE(review): no matching ENTRY in this function */
+	}
+}
+
+void lov_pool_putref_locked(struct pool_desc *pool)
+{
+	CDEBUG(D_INFO, "pool %p\n", pool);
+	LASSERT(atomic_read(&pool->pool_refcount) > 1);
+	/* asserted above not to be the last reference: nothing is freed */
+	atomic_dec(&pool->pool_refcount);
+}
+
+/*
+ * Rotating hash of the pool name, reduced modulo @mask.  Hashing stops at
+ * the terminating NUL or after LOV_MAXPOOLNAME characters.
+ *
+ * Knuth, D. The Art of Computer Programming, Volume 3: Sorting and
+ * Searching, Chapter 6.4. Addison Wesley, 1973
+ */
+static __u32 pool_hashfn(cfs_hash_t *hash_body, const void *key, unsigned mask)
+{
+	const char *name = key;
+	__u32 hash = 0;
+	int pos;
+
+	for (pos = 0; pos < LOV_MAXPOOLNAME; pos++) {
+		char c = name[pos];
+
+		if (c == '\0')
+			break;
+		hash = (hash << 4) ^ (hash >> 28) ^ c;
+	}
+	return hash % mask;
+}
+
+/* cfs_hash hs_key: the key of a hashed pool is its name string. */
+static void *pool_key(struct hlist_node *hnode)
+{
+	struct pool_desc *p = hlist_entry(hnode, struct pool_desc, pool_hash);
+
+	return p->pool_name;
+}
+
+/* cfs_hash hs_keycmp: nonzero when @key names the pool behind the node. */
+static int pool_hashkey_keycmp(const void *key, struct hlist_node *compared_hnode)
+{
+	const char *wanted = key;
+	struct pool_desc *p;
+
+	p = hlist_entry(compared_hnode, struct pool_desc, pool_hash);
+	return strncmp(wanted, p->pool_name, LOV_MAXPOOLNAME) == 0;
+}
+
+static void *pool_hashobject(struct hlist_node *hnode)
+{
+	return hlist_entry(hnode, struct pool_desc, pool_hash); /* the pool */
+}
+
+/* cfs_hash hs_get: take one more reference on the pool behind @hnode. */
+static void pool_hashrefcount_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct pool_desc *p = hlist_entry(hnode, struct pool_desc, pool_hash);
+
+	lov_pool_getref(p);
+}
+
+/* cfs_hash hs_put_locked: drop a reference that must not be the last one. */
+static void pool_hashrefcount_put_locked(cfs_hash_t *hs,
+					 struct hlist_node *hnode)
+{
+	struct pool_desc *p = hlist_entry(hnode, struct pool_desc, pool_hash);
+
+	lov_pool_putref_locked(p);
+}
+
+cfs_hash_ops_t pool_hash_operations = { /* pools are keyed by pool_name */
+	.hs_hash	= pool_hashfn,
+	.hs_key	 = pool_key,
+	.hs_keycmp      = pool_hashkey_keycmp,
+	.hs_object      = pool_hashobject,
+	.hs_get	 = pool_hashrefcount_get,
+	.hs_put_locked  = pool_hashrefcount_put_locked,
+
+};
+
+#ifdef LPROCFS
+/* ifdef needed for liblustre support */
+/*
+ * pool /proc seq_file methods
+ */
+/*
+ * The iterator walks the target entries of a pool.
+ * idx is the current entry index in the pool_obds.op_array[] array;
+ * idx >= pos, the position returned to the seq_file interface.
+ * pos runs from 0 to (pool->pool_obds.op_count - 1).
+ */
+#define POOL_IT_MAGIC 0xB001CEA0
+struct pool_iterator {
+	int magic; /* POOL_IT_MAGIC; lets stop() recognise an iterator */
+	struct pool_desc *pool; /* pool being listed */
+	int idx;	/* from 0 to pool_tgt_size - 1 */
+};
+
+static void *pool_proc_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct pool_iterator *iter = (struct pool_iterator *)s->private;
+	int prev_idx;
+	/* seq_file ->next: s->private must be the iterator set by ->start */
+	LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X", iter->magic);
+
+	/* position is already past the last pool member: end of sequence */
+	if (*pos >= pool_tgt_count(iter->pool))
+		return NULL;
+
+	/* step to the next entry; the count is re-read under the read lock */
+	prev_idx = iter->idx;
+	down_read(&pool_tgt_rw_sem(iter->pool));
+	iter->idx++;
+	if (iter->idx == pool_tgt_count(iter->pool)) {
+		iter->idx = prev_idx; /* we stay on the last entry */
+		up_read(&pool_tgt_rw_sem(iter->pool));
+		return NULL;
+	}
+	up_read(&pool_tgt_rw_sem(iter->pool));
+	(*pos)++;
+	/* return != NULL to continue */
+	return iter;
+}
+
+/* seq_file ->start: pin the pool, allocate an iterator and seek it to *pos.
+ * Every path that does not install the iterator drops the pool reference. */
+static void *pool_proc_start(struct seq_file *s, loff_t *pos)
+{
+	struct pool_desc *pool = (struct pool_desc *)s->private;
+	struct pool_iterator *iter;
+
+	lov_pool_getref(pool);
+	if (pool_tgt_count(pool) == 0 || *pos >= pool_tgt_count(pool)) {
+		/* no iterator is created, so stop() cannot drop the ref */
+		lov_pool_putref(pool);
+		return NULL;
+	}
+
+	OBD_ALLOC_PTR(iter);
+	if (!iter) {
+		lov_pool_putref(pool);
+		return ERR_PTR(-ENOMEM);
+	}
+	iter->magic = POOL_IT_MAGIC;
+	iter->pool = pool;
+	iter->idx = 0;
+
+	/* s->private remembers the iterator so stop() can free it;
+	 * stop() must restore it to the pool before freeing */
+	s->private = iter;
+	if (*pos > 0) {
+		loff_t i = 0;
+		void *ptr;
+
+		do {
+			ptr = pool_proc_next(s, iter, &i);
+		} while ((i < *pos) && (ptr != NULL));
+		return ptr;
+	}
+	return iter;
+}
+
+/* seq_file ->stop: free the iterator, if any, and unpin its pool. */
+static void pool_proc_stop(struct seq_file *s, void *v)
+{
+	struct pool_iterator *it = s->private;
+
+	/* stop() may run twice without start() in between (see seq_read()
+	 * in fs/seq_file.c); only act when s->private is an iterator */
+	if (it == NULL || it->magic != POOL_IT_MAGIC)
+		return;
+
+	/* point s->private back at the pool so that the next
+	 * pool_proc_start() works */
+	s->private = it->pool;
+	lov_pool_putref(it->pool);
+	OBD_FREE_PTR(it);
+}
+
+/* seq_file ->show: print the UUID of the pool target at iter->idx. */
+static int pool_proc_show(struct seq_file *s, void *v)
+{
+	struct pool_iterator *iter = (struct pool_iterator *)v;
+	struct lov_tgt_desc *tgt = NULL;
+
+	LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X", iter->magic);
+	LASSERT(iter->pool != NULL);
+	LASSERT(iter->idx <= pool_tgt_count(iter->pool));
+	down_read(&pool_tgt_rw_sem(iter->pool));
+	if (iter->idx < pool_tgt_count(iter->pool)) /* skip a stale slot */
+		tgt = pool_tgt(iter->pool, iter->idx);
+	up_read(&pool_tgt_rw_sem(iter->pool));
+	if (tgt)
+		seq_printf(s, "%s\n", obd_uuid2str(&(tgt->ltd_uuid)));
+	return 0;
+}
+
+static struct seq_operations pool_proc_ops = { /* one record per pool member */
+	.start	  = pool_proc_start,
+	.next	   = pool_proc_next,
+	.stop	   = pool_proc_stop,
+	.show	   = pool_proc_show,
+};
+
+static int pool_proc_open(struct inode *inode, struct file *file)
+{
+	int rc = seq_open(file, &pool_proc_ops);
+
+	if (rc == 0) {
+		/* pass the proc entry's data pointer on to the seq_file */
+		struct seq_file *seq = file->private_data;
+		seq->private = PROC_I(inode)->pde->data;
+	}
+	return rc;
+}
+
+static struct file_operations pool_proc_operations = { /* seq_file backed */
+	.open	   = pool_proc_open,
+	.read	   = seq_read,
+	.llseek	 = seq_lseek,
+	.release	= seq_release,
+};
+#endif /* LPROCFS */
+
+/* Log the pool's member count and the UUID of each member with an export. */
+void lov_dump_pool(int level, struct pool_desc *pool)
+{
+	struct lov_tgt_desc *tgt;
+	int n;
+
+	lov_pool_getref(pool);
+	CDEBUG(level, "pool "LOV_POOLNAMEF" has %d members\n",
+	       pool->pool_name, pool->pool_obds.op_count);
+
+	down_read(&pool_tgt_rw_sem(pool));
+	for (n = 0; n < pool_tgt_count(pool); n++) {
+		tgt = pool_tgt(pool, n);
+		if (tgt == NULL || tgt->ltd_exp == NULL)
+			continue;
+		CDEBUG(level, "pool "LOV_POOLNAMEF"[%d] = %s\n",
+		       pool->pool_name, n, obd_uuid2str(&tgt->ltd_uuid));
+	}
+	up_read(&pool_tgt_rw_sem(pool));
+	lov_pool_putref(pool);
+}
+
+#define LOV_POOL_INIT_COUNT 2
+int lov_ost_pool_init(struct ost_pool *op, unsigned int count)
+{
+	ENTRY;
+	/* Start empty with room for @count indices (default when 0). */
+	if (count == 0)
+		count = LOV_POOL_INIT_COUNT;
+	op->op_array = NULL;
+	op->op_count = 0;
+	init_rwsem(&op->op_rw_sem);
+	op->op_size = count;
+	OBD_ALLOC(op->op_array, op->op_size * sizeof(op->op_array[0]));
+	if (op->op_array == NULL) {
+		op->op_size = 0; /* so lov_ost_pool_free() is a no-op */
+		RETURN(-ENOMEM);
+	}
+	EXIT;
+	return 0;
+}
+
+/* Grow op_array once it is full.  Caller must hold op_rw_sem for write. */
+int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count)
+{
+	__u32 *new;
+	int new_size;
+
+	LASSERT(min_count != 0);
+	/* still at least one free slot: nothing to do */
+	if (op->op_count < op->op_size)
+		return 0;
+	/* at least double the array, or jump straight to @min_count */
+	new_size = max(min_count, 2 * op->op_size);
+	OBD_ALLOC(new, new_size * sizeof(op->op_array[0]));
+	if (new == NULL)
+		return -ENOMEM;
+
+	/* copy old array to new one */
+	memcpy(new, op->op_array, op->op_size * sizeof(op->op_array[0]));
+	OBD_FREE(op->op_array, op->op_size * sizeof(op->op_array[0]));
+	op->op_array = new;
+	op->op_size = new_size;
+	return 0;
+}
+
+int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count)
+{
+	int rc = 0, i;
+	ENTRY;
+	/* Append @idx; 0, -EEXIST if already present, or the extend error. */
+	down_write(&op->op_rw_sem);
+
+	rc = lov_ost_pool_extend(op, min_count);
+	if (rc)
+		GOTO(out, rc);
+
+	/* refuse duplicates: look for @idx among the current members */
+	for (i = 0; i < op->op_count; i++) {
+		if (op->op_array[i] == idx)
+			GOTO(out, rc = -EEXIST);
+	}
+	/* not a member yet: store it in the slot extend made available */
+	op->op_array[op->op_count] = idx;
+	op->op_count++;
+	EXIT;
+out:
+	up_write(&op->op_rw_sem);
+	return rc;
+}
+
+int lov_ost_pool_remove(struct ost_pool *op, __u32 idx)
+{
+	int i;
+	ENTRY;
+	/* Remove @idx from the pool array; -EINVAL when it is not a member. */
+	down_write(&op->op_rw_sem);
+	/* on a match, shift the tail of the array down over the freed slot */
+	for (i = 0; i < op->op_count; i++) {
+		if (op->op_array[i] == idx) {
+			memmove(&op->op_array[i], &op->op_array[i + 1],
+				(op->op_count - i - 1) * sizeof(op->op_array[0]));
+			op->op_count--;
+			up_write(&op->op_rw_sem);
+			EXIT;
+			return 0;
+		}
+	}
+
+	up_write(&op->op_rw_sem);
+	RETURN(-EINVAL);
+}
+
+int lov_ost_pool_free(struct ost_pool *op)
+{
+	ENTRY;
+	/* op_size == 0: the array was never allocated or is already freed */
+	if (op->op_size == 0)
+		RETURN(0);
+
+	down_write(&op->op_rw_sem);
+
+	OBD_FREE(op->op_array, op->op_size * sizeof(op->op_array[0]));
+	op->op_array = NULL;
+	op->op_count = 0;
+	op->op_size = 0;
+
+	up_write(&op->op_rw_sem);
+	RETURN(0);
+}
+
+
+/*
+ * Create an empty pool named @poolname on @obd, publish its /proc file and
+ * make it findable by name.  Returns 0, -ENAMETOOLONG, -ENOMEM or -EEXIST.
+ */
+int lov_pool_new(struct obd_device *obd, char *poolname)
+{
+	struct lov_obd *lov;
+	struct pool_desc *new_pool;
+	int rc;
+	ENTRY;
+
+	lov = &(obd->u.lov);
+
+	if (strlen(poolname) > LOV_MAXPOOLNAME)
+		RETURN(-ENAMETOOLONG);
+
+	OBD_ALLOC_PTR(new_pool);
+	if (new_pool == NULL)
+		RETURN(-ENOMEM);
+
+	strncpy(new_pool->pool_name, poolname, LOV_MAXPOOLNAME);
+	new_pool->pool_name[LOV_MAXPOOLNAME] = '\0';
+	new_pool->pool_lobd = obd;
+	/* a pool is created with one reference, dropped when it is deleted */
+	atomic_set(&new_pool->pool_refcount, 1);
+	rc = lov_ost_pool_init(&new_pool->pool_obds, 0);
+	if (rc)	/* not on any list yet: out_err would unlink and uncount it */
+		GOTO(out_free_pool, rc);
+
+	memset(&(new_pool->pool_rr), 0, sizeof(struct lov_qos_rr));
+	rc = lov_ost_pool_init(&new_pool->pool_rr.lqr_pool, 0);
+	if (rc)
+		GOTO(out_free_pool_obds, rc);
+
+	INIT_HLIST_NODE(&new_pool->pool_hash);
+
+#ifdef LPROCFS
+	/* the /proc file keeps its own reference on the pool */
+	lov_pool_getref(new_pool);
+	new_pool->pool_proc_entry =
+		lprocfs_add_simple(lov->lov_pool_proc_entry, poolname, NULL,
+				   NULL, new_pool, &pool_proc_operations);
+	if (IS_ERR(new_pool->pool_proc_entry)) {
+		CWARN("Cannot add proc pool entry "LOV_POOLNAMEF"\n", poolname);
+		new_pool->pool_proc_entry = NULL;
+		lov_pool_putref(new_pool);
+	}
+	CDEBUG(D_INFO, "pool %p - proc %p\n", new_pool, new_pool->pool_proc_entry);
+#endif
+
+	spin_lock(&obd->obd_dev_lock);
+	list_add_tail(&new_pool->pool_list, &lov->lov_pool_list);
+	lov->lov_pool_count++;
+	spin_unlock(&obd->obd_dev_lock);
+
+	/* make the pool findable only once it is fully set up */
+	rc = cfs_hash_add_unique(lov->lov_pools_hash_body, poolname,
+				 &new_pool->pool_hash);
+	if (rc)
+		GOTO(out_err, rc = -EEXIST);
+
+	CDEBUG(D_CONFIG, LOV_POOLNAMEF" is pool #%d\n",
+	       poolname, lov->lov_pool_count);
+
+	RETURN(0);
+
+out_err:
+	spin_lock(&obd->obd_dev_lock);
+	list_del_init(&new_pool->pool_list);
+	lov->lov_pool_count--;
+	spin_unlock(&obd->obd_dev_lock);
+
+	lprocfs_remove(&new_pool->pool_proc_entry);
+	lov_ost_pool_free(&new_pool->pool_rr.lqr_pool);
+out_free_pool_obds:
+	lov_ost_pool_free(&new_pool->pool_obds);
+out_free_pool:
+	OBD_FREE_PTR(new_pool);
+	return rc;
+}
+
+int lov_pool_del(struct obd_device *obd, char *poolname)
+{
+	struct lov_obd *lov;
+	struct pool_desc *pool;
+	ENTRY;
+	/* Unhash pool @poolname, remove its /proc file and unlist it. */
+	lov = &(obd->u.lov);
+
+	/* lookup and kill hash reference */
+	pool = cfs_hash_del_key(lov->lov_pools_hash_body, poolname);
+	if (pool == NULL)
+		RETURN(-ENOENT);
+	/* the /proc file held its own reference (taken in lov_pool_new()) */
+	if (pool->pool_proc_entry != NULL) {
+		CDEBUG(D_INFO, "proc entry %p\n", pool->pool_proc_entry);
+		lprocfs_remove(&pool->pool_proc_entry);
+		lov_pool_putref(pool);
+	}
+
+	spin_lock(&obd->obd_dev_lock);
+	list_del_init(&pool->pool_list);
+	lov->lov_pool_count--;
+	spin_unlock(&obd->obd_dev_lock);
+
+	/* release the reference the pool was created with */
+	lov_pool_putref(pool);
+
+	RETURN(0);
+}
+
+
+/*
+ * Add the OST whose UUID is @ostname to pool @poolname.  Returns -ENOENT for
+ * an unknown pool, -EINVAL for an unknown OST, else lov_ost_pool_add()'s rc.
+ */
+int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname)
+{
+	struct lov_obd *lov = &(obd->u.lov);
+	struct obd_uuid ost_uuid;
+	struct pool_desc *pool;
+	unsigned int lov_idx;
+	int rc;
+	ENTRY;
+
+	pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname);
+	if (pool == NULL)
+		RETURN(-ENOENT);
+
+	obd_str2uuid(&ost_uuid, ostname);
+
+	/* find the index of the OST in the lov target array */
+	obd_getref(obd);
+	for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) {
+		struct lov_tgt_desc *tgt = lov->lov_tgts[lov_idx];
+
+		if (tgt != NULL && obd_uuid_equals(&ost_uuid, &tgt->ltd_uuid))
+			break;
+	}
+	/* ran off the end: no target carries that UUID */
+	if (lov_idx == lov->desc.ld_tgt_count)
+		GOTO(out, rc = -EINVAL);
+
+	rc = lov_ost_pool_add(&pool->pool_obds, lov_idx, lov->lov_tgt_size);
+	if (rc)
+		GOTO(out, rc);
+
+	pool->pool_rr.lqr_dirty = 1;
+
+	CDEBUG(D_CONFIG, "Added %s to "LOV_POOLNAMEF" as member %d\n",
+	       ostname, poolname,  pool_tgt_count(pool));
+
+	EXIT;
+out:
+	obd_putref(obd);
+	lov_pool_putref(pool); /* drop the reference the lookup gave us */
+	return rc;
+}
+
+int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname)
+{
+	struct obd_uuid ost_uuid;
+	struct lov_obd *lov;
+	struct pool_desc *pool;
+	unsigned int lov_idx;
+	int rc = 0;
+	ENTRY;
+	/* Remove OST @ostname from pool @poolname; -ENOENT if no such pool. */
+	lov = &(obd->u.lov);
+
+	pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname);
+	if (pool == NULL)
+		RETURN(-ENOENT);
+
+	obd_str2uuid(&ost_uuid, ostname);
+
+	obd_getref(obd);
+	/* search ost in lov array, to get index */
+	for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) {
+		if (!lov->lov_tgts[lov_idx])
+			continue;
+
+		if (obd_uuid_equals(&ost_uuid,
+				    &(lov->lov_tgts[lov_idx]->ltd_uuid)))
+			break;
+	}
+
+	/* test if ost found in lov */
+	if (lov_idx == lov->desc.ld_tgt_count)
+		GOTO(out, rc = -EINVAL);
+	/* NOTE(review): result ignored; rc stays 0 if the OST is not a member */
+	lov_ost_pool_remove(&pool->pool_obds, lov_idx);
+
+	pool->pool_rr.lqr_dirty = 1;
+
+	CDEBUG(D_CONFIG, "%s removed from "LOV_POOLNAMEF"\n", ostname,
+	       poolname);
+
+	EXIT;
+out:
+	obd_putref(obd);
+	lov_pool_putref(pool);
+	return rc;
+}
+
+int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool)
+{
+	int i, rc;
+	ENTRY;
+
+	/* Return 0 if OST index @idx is a member of @pool, else -ENOENT.
+	 * The caller may not hold a reference on the pool if it got it
+	 * without calling lov_find_pool() (e.g. by walking the lov pool
+	 * list), so one is taken for the duration of the check. */
+	lov_pool_getref(pool);
+
+	down_read(&pool_tgt_rw_sem(pool));
+
+	for (i = 0; i < pool_tgt_count(pool); i++) {
+		if (pool_tgt_array(pool)[i] == idx)
+			GOTO(out, rc = 0);
+	}
+	rc = -ENOENT;
+	EXIT;
+out:
+	up_read(&pool_tgt_rw_sem(pool));
+
+	lov_pool_putref(pool);
+	return rc;
+}
+
+/* Look up a non-empty pool by name; the caller owns the returned reference.
+ * An empty name, an unknown pool and an empty pool all yield NULL. */
+struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname)
+{
+	struct pool_desc *found;
+
+	if (poolname[0] == '\0')
+		return NULL;
+	found = cfs_hash_lookup(lov->lov_pools_hash_body, poolname);
+	if (found == NULL) {
+		CWARN("Request for an unknown pool ("LOV_POOLNAMEF")\n",
+		      poolname);
+	} else if (pool_tgt_count(found) == 0) {
+		CWARN("Request for an empty pool ("LOV_POOLNAMEF")\n",
+		       poolname);
+		lov_pool_putref(found); /* pool is ignored: drop our ref */
+		found = NULL;
+	}
+	return found;
+}
diff --git a/drivers/staging/lustre/lustre/lov/lov_request.c b/drivers/staging/lustre/lustre/lov/lov_request.c
new file mode 100644
index 000000000000..13f1637bc700
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_request.c
@@ -0,0 +1,1551 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_class.h>
+#include <obd_lov.h>
+#include <lustre/lustre_idl.h>
+
+#include "lov_internal.h"
+
+static void lov_init_set(struct lov_request_set *set)
+{
+	set->set_count = 0; /* bumped by lov_set_add_req() */
+	atomic_set(&set->set_completes, 0); /* bumped by lov_update_set() */
+	atomic_set(&set->set_success, 0);
+	atomic_set(&set->set_finish_checked, 0);
+	set->set_cookies = 0;
+	INIT_LIST_HEAD(&set->set_list);
+	atomic_set(&set->set_refcount, 1);
+	init_waitqueue_head(&set->set_waitq);
+	spin_lock_init(&set->set_lock);
+}
+
+/* Release every request queued on the set together with the buffers it
+ * owns, then the set's own resources and the set itself. */
+void lov_finish_set(struct lov_request_set *set)
+{
+	struct lov_request *req, *next;
+	ENTRY;
+
+	LASSERT(set);
+	list_for_each_entry_safe(req, next, &set->set_list, rq_link) {
+		list_del_init(&req->rq_link);
+
+		if (req->rq_oi.oi_oa)
+			OBDO_FREE(req->rq_oi.oi_oa);
+		if (req->rq_oi.oi_md)
+			OBD_FREE_LARGE(req->rq_oi.oi_md, req->rq_buflen);
+		if (req->rq_oi.oi_osfs)
+			OBD_FREE(req->rq_oi.oi_osfs,
+				 sizeof(*req->rq_oi.oi_osfs));
+		OBD_FREE(req, sizeof(*req));
+	}
+
+	if (set->set_pga) {
+		int len = set->set_oabufs * sizeof(*set->set_pga);
+
+		OBD_FREE_LARGE(set->set_pga, len);
+	}
+	if (set->set_lockh)
+		lov_llh_put(set->set_lockh);
+
+	OBD_FREE(set, sizeof(*set));
+	EXIT;
+}
+
+/* Nonzero once all requests completed; unless @idempotent, only the first
+ * caller to observe completion gets a nonzero answer. */
+int lov_set_finished(struct lov_request_set *set, int idempotent)
+{
+	int done = atomic_read(&set->set_completes);
+
+	CDEBUG(D_INFO, "check set %d/%d\n", done, set->set_count);
+	if (done != set->set_count)
+		return 0;
+	if (idempotent)
+		return 1;
+	/* claim the "finished" notification exactly once */
+	return atomic_inc_return(&set->set_finish_checked) == 1;
+}
+
+void lov_update_set(struct lov_request_set *set,
+		    struct lov_request *req, int rc)
+{
+	req->rq_complete = 1;
+	req->rq_rc = rc;
+	/* count the completion, and the success when rc == 0 */
+	atomic_inc(&set->set_completes);
+	if (rc == 0)
+		atomic_inc(&set->set_success);
+	/* wake anyone sleeping on the set's wait queue */
+	wake_up(&set->set_waitq);
+}
+
+int lov_update_common_set(struct lov_request_set *set,
+			  struct lov_request *req, int rc)
+{
+	struct lov_obd *lov = &set->set_exp->exp_obd->u.lov;
+	ENTRY;
+
+	lov_update_set(set, req, rc);
+
+	/* an error from a missing or inactive OST is not passed on */
+	if (rc != 0 && (lov->lov_tgts[req->rq_idx] == NULL ||
+			!lov->lov_tgts[req->rq_idx]->ltd_active))
+		rc = 0;
+
+	/* FIXME in raid1 regime, should return 0 */
+	RETURN(rc);
+}
+
+void lov_set_add_req(struct lov_request *req, struct lov_request_set *set)
+{
+	list_add_tail(&req->rq_link, &set->set_list);
+	set->set_count++; /* lov_set_finished() compares completions to this */
+	req->rq_rqset = set;
+}
+
+/* True once target @idx is gone, active, or has imp_connect_tried set. */
+static int lov_check_set(struct lov_obd *lov, int idx)
+{
+	struct lov_tgt_desc *tgt;
+	int stop_waiting;
+
+	mutex_lock(&lov->lov_lock);
+	tgt = lov->lov_tgts[idx];
+	stop_waiting = tgt == NULL || tgt->ltd_active ||
+		       (tgt->ltd_exp != NULL &&
+			class_exp2cliimp(tgt->ltd_exp)->imp_connect_tried);
+	mutex_unlock(&lov->lov_lock);
+	return stop_waiting;
+}
+
+/* Check if the OSC connection exists and is active.
+ * If the OSC has not yet had a chance to connect to the OST the first time,
+ * wait once for it to connect instead of returning an error.
+ * Returns 1 if the target is active, 0 otherwise.
+ */
+int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx)
+{
+	wait_queue_head_t waitq;
+	struct l_wait_info lwi;
+	struct lov_tgt_desc *tgt;
+	int rc = 0;
+
+	mutex_lock(&lov->lov_lock);
+	tgt = lov->lov_tgts[ost_idx];
+
+	if (unlikely(tgt == NULL))
+		GOTO(out, rc = 0);
+
+	if (likely(tgt->ltd_active))
+		GOTO(out, rc = 1);
+
+	if (tgt->ltd_exp && class_exp2cliimp(tgt->ltd_exp)->imp_connect_tried)
+		GOTO(out, rc = 0);
+
+	mutex_unlock(&lov->lov_lock);
+
+	init_waitqueue_head(&waitq);
+	lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(obd_timeout),
+				   cfs_time_seconds(1), NULL, NULL);
+
+	rc = l_wait_event(waitq, lov_check_set(lov, ost_idx), &lwi);
+	/* lov_lock was dropped for the wait, so the target slot may have
+	 * changed: re-read it instead of using the earlier pointer */
+	mutex_lock(&lov->lov_lock);
+	tgt = lov->lov_tgts[ost_idx];
+	rc = (tgt != NULL && tgt->ltd_active) ? 1 : 0;
+out:
+	mutex_unlock(&lov->lov_lock);
+	return rc;
+}
+
+extern void osc_update_enqueue(struct lustre_handle *lov_lockhp,
+			       struct lov_oinfo *loi, int flags,
+			       struct ost_lvb *lvb, __u32 mode, int rc);
+
+static int lov_update_enqueue_lov(struct obd_export *exp,
+				  struct lustre_handle *lov_lockhp,
+				  struct lov_oinfo *loi, int flags, int idx,
+				  struct ost_id *oi, int rc)
+{
+	struct lov_obd *lov = &exp->exp_obd->u.lov;
+	/* on failure zero the handle; missing/inactive OST is not an error */
+	if (rc != ELDLM_OK &&
+	    !(rc == ELDLM_LOCK_ABORTED && (flags & LDLM_FL_HAS_INTENT))) {
+		memset(lov_lockhp, 0, sizeof(*lov_lockhp));
+		if (lov->lov_tgts[idx] && lov->lov_tgts[idx]->ltd_active) {
+			/* -EUSERS used by OST to report file contention */
+			if (rc != -EINTR && rc != -EUSERS)
+				CERROR("%s: enqueue objid "DOSTID" subobj"
+				       DOSTID" on OST idx %d: rc %d\n",
+				       exp->exp_obd->obd_name,
+				       POSTID(oi), POSTID(&loi->loi_oi),
+				       loi->loi_ost_idx, rc);
+		} else
+			rc = ELDLM_OK;
+	}
+	return rc;
+}
+
+int lov_update_enqueue_set(struct lov_request *req, __u32 mode, int rc)
+{
+	struct lov_request_set *set = req->rq_rqset;
+	struct lustre_handle *lov_lockhp;
+	struct obd_info *oi = set->set_oi;
+	struct lov_oinfo *loi;
+	ENTRY;
+	/* Fold one stripe's enqueue result into the set; returns the new rc. */
+	LASSERT(oi != NULL);
+
+	lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe;
+	loi = oi->oi_md->lsm_oinfo[req->rq_stripe];
+
+	/* XXX LOV STACKING: OSC gets a copy, created in lov_prep_enqueue_set
+	 * and that copy can be arbitrarily out of date.
+	 *
+	 * The LOV API is due for a serious rewriting anyways, and this
+	 * can be addressed then. */
+	/* the stripe lock is held across the update and the rc decision */
+	lov_stripe_lock(oi->oi_md);
+	osc_update_enqueue(lov_lockhp, loi, oi->oi_flags,
+			   &req->rq_oi.oi_md->lsm_oinfo[0]->loi_lvb, mode, rc);
+	if (rc == ELDLM_LOCK_ABORTED && (oi->oi_flags & LDLM_FL_HAS_INTENT))
+		memset(lov_lockhp, 0, sizeof *lov_lockhp);
+	rc = lov_update_enqueue_lov(set->set_exp, lov_lockhp, loi, oi->oi_flags,
+				    req->rq_idx, &oi->oi_md->lsm_oi, rc);
+	lov_stripe_unlock(oi->oi_md);
+	lov_update_set(set, req, rc);
+	RETURN(rc);
+}
+
+/* osc_enqueue callback: @cookie is the obd_info embedded in a lov_request;
+ * forward the result to the request's set using the set's enqueue mode. */
+static int cb_update_enqueue(void *cookie, int rc)
+{
+	struct lov_request *req;
+
+	req = container_of((struct obd_info *)cookie, struct lov_request,
+			   rq_oi);
+
+	return lov_update_enqueue_set(req, req->rq_rqset->set_ei->ei_mode, rc);
+}
+
+/* Unless every request succeeded, cancel the locks that were obtained (a
+ * request whose target slot is now NULL is skipped) and put the handles. */
+static int enqueue_done(struct lov_request_set *set, __u32 mode)
+{
+	struct lov_request *req;
+	struct lov_obd *lov = &set->set_exp->exp_obd->u.lov;
+	int completes = atomic_read(&set->set_completes);
+	int rc = 0;
+	ENTRY;
+
+	/* enqueue/match success, just return */
+	if (completes && completes == atomic_read(&set->set_success))
+		RETURN(0);
+	/* cancel enqueued/matched locks */
+	list_for_each_entry(req, &set->set_list, rq_link) {
+		struct lustre_handle *lov_lockhp;
+
+		if (!req->rq_complete || req->rq_rc)
+			continue;
+		lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe;
+		LASSERT(lov_lockhp);
+		if (!lustre_handle_is_used(lov_lockhp) ||
+		    lov->lov_tgts[req->rq_idx] == NULL)
+			continue;
+		rc = obd_cancel(lov->lov_tgts[req->rq_idx]->ltd_exp,
+				req->rq_oi.oi_md, mode, lov_lockhp);
+		if (rc && lov->lov_tgts[req->rq_idx] &&
+		    lov->lov_tgts[req->rq_idx]->ltd_active)
+			CERROR("%s: cancelling obdjid "DOSTID" on OST "
+			       "idx %d error: rc = %d\n",
+			       set->set_exp->exp_obd->obd_name,
+			       POSTID(&req->rq_oi.oi_md->lsm_oi),
+			       req->rq_idx, rc);
+	}
+	if (set->set_lockh)
+		lov_llh_put(set->set_lockh);
+	RETURN(rc);
+}
+
+/* Finish an enqueue set.  Without @rqset (sync) enqueue_done() is run, after
+ * zeroing set_completes if @rc is set.  Returns @rc, or else that result. */
+int lov_fini_enqueue_set(struct lov_request_set *set, __u32 mode, int rc,
+			 struct ptlrpc_request_set *rqset)
+{
+	int done_rc = 0;
+	ENTRY;
+
+	if (!set)
+		RETURN(0);
+	LASSERT(set->set_exp);
+	if (rqset != NULL) {
+		if (set->set_lockh)
+			lov_llh_put(set->set_lockh);
+	} else {
+		if (rc != 0)
+			atomic_set(&set->set_completes, 0);
+		done_rc = enqueue_done(set, mode);
+	}
+	lov_put_reqset(set);
+	RETURN(rc != 0 ? rc : done_rc);
+}
+
+static void lov_llh_addref(void *llhp)
+{
+	struct lov_lock_handles *handles = llhp;
+	/* .hop_addref hook of lov_handle_ops: bump llh_refcount, then trace */
+	atomic_inc(&handles->llh_refcount);
+	CDEBUG(D_INFO, "GETting llh %p : new refcount %d\n", handles,
+	       atomic_read(&handles->llh_refcount));
+}
+
+static struct portals_handle_ops lov_handle_ops = {
+	.hop_free   = NULL,		/* no free hook is registered */
+	.hop_addref = lov_llh_addref,	/* increments llh_refcount */
+};
+
+/* Allocate and hash a handle array sized for @lsm's stripe count; it starts
+ * with llh_refcount == 2.  Returns NULL if the allocation fails. */
+static struct lov_lock_handles *lov_llh_new(struct lov_stripe_md *lsm)
+{
+	struct lov_lock_handles *llh;
+
+	OBD_ALLOC(llh, sizeof(*llh) +
+		  lsm->lsm_stripe_count * sizeof(llh->llh_handles[0]));
+	if (!llh)
+		return NULL;
+	atomic_set(&llh->llh_refcount, 2);
+	llh->llh_stripe_count = lsm->lsm_stripe_count;
+	INIT_LIST_HEAD(&llh->llh_handle.h_link);
+	class_handle_hash(&llh->llh_handle, &lov_handle_ops);
+	return llh;
+}
+
+int lov_prep_enqueue_set(struct obd_export *exp, struct obd_info *oinfo,
+			 struct ldlm_enqueue_info *einfo,
+			 struct lov_request_set **reqset)
+{
+	struct lov_obd *lov = &exp->exp_obd->u.lov;
+	struct lov_request_set *set;
+	int i, rc = 0;
+	ENTRY;
+	/* Builds one sub-request per active stripe overlapping oi_policy. */
+	OBD_ALLOC(set, sizeof(*set));
+	if (set == NULL)
+		RETURN(-ENOMEM);
+	lov_init_set(set);
+
+	set->set_exp = exp;
+	set->set_oi = oinfo;
+	set->set_ei = einfo;
+	set->set_lockh = lov_llh_new(oinfo->oi_md);
+	if (set->set_lockh == NULL)
+		GOTO(out_set, rc = -ENOMEM);
+	oinfo->oi_lockh->cookie = set->set_lockh->llh_handle.h_cookie;
+
+	for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) {
+		struct lov_oinfo *loi;
+		struct lov_request *req;
+		obd_off start, end;
+
+		loi = oinfo->oi_md->lsm_oinfo[i];
+		if (!lov_stripe_intersects(oinfo->oi_md, i,
+					   oinfo->oi_policy.l_extent.start,
+					   oinfo->oi_policy.l_extent.end,
+					   &start, &end))
+			continue;
+		/* an inactive target only drops this stripe from the set */
+		if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+			CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+			continue;
+		}
+
+		OBD_ALLOC(req, sizeof(*req));
+		if (req == NULL)
+			GOTO(out_set, rc = -ENOMEM);
+		/* one buffer: lsm, one lsm_oinfo[] slot, one lov_oinfo */
+		req->rq_buflen = sizeof(*req->rq_oi.oi_md) +
+			sizeof(struct lov_oinfo *) +
+			sizeof(struct lov_oinfo);
+		OBD_ALLOC_LARGE(req->rq_oi.oi_md, req->rq_buflen);
+		if (req->rq_oi.oi_md == NULL) {
+			OBD_FREE(req, sizeof(*req));
+			GOTO(out_set, rc = -ENOMEM);
+		}
+		req->rq_oi.oi_md->lsm_oinfo[0] =
+			((void *)req->rq_oi.oi_md) + sizeof(*req->rq_oi.oi_md) +
+			sizeof(struct lov_oinfo *);
+
+		/* Set lov request specific parameters. */
+		req->rq_oi.oi_lockh = set->set_lockh->llh_handles + i;
+		req->rq_oi.oi_cb_up = cb_update_enqueue;
+		req->rq_oi.oi_flags = oinfo->oi_flags;
+
+		LASSERT(req->rq_oi.oi_lockh);
+
+		req->rq_oi.oi_policy.l_extent.gid =
+			oinfo->oi_policy.l_extent.gid;
+		req->rq_oi.oi_policy.l_extent.start = start;
+		req->rq_oi.oi_policy.l_extent.end = end;
+
+		req->rq_idx = loi->loi_ost_idx;
+		req->rq_stripe = i;
+
+		/* XXX LOV STACKING: submd should be from the subobj */
+		req->rq_oi.oi_md->lsm_oi = loi->loi_oi;
+		req->rq_oi.oi_md->lsm_stripe_count = 0;
+		req->rq_oi.oi_md->lsm_oinfo[0]->loi_kms_valid =
+			loi->loi_kms_valid;
+		req->rq_oi.oi_md->lsm_oinfo[0]->loi_kms = loi->loi_kms;
+		req->rq_oi.oi_md->lsm_oinfo[0]->loi_lvb = loi->loi_lvb;
+
+		lov_set_add_req(req, set);
+	}
+	if (!set->set_count)
+		GOTO(out_set, rc = -EIO);	/* no stripe qualified */
+	*reqset = set;
+	RETURN(0);
+out_set:
+	lov_fini_enqueue_set(set, einfo->ei_mode, rc, NULL);
+	RETURN(rc);
+}
+
+/* Finish a match set; NULL @set is a no-op.  Returns enqueue_done()'s rc.  A
+ * fully successful LDLM_FL_TEST_LOCK match also puts the handle array. */
+int lov_fini_match_set(struct lov_request_set *set, __u32 mode, int flags)
+{
+	int done_rc;
+	ENTRY;
+
+	if (!set)
+		RETURN(0);
+	LASSERT(set->set_exp);
+	done_rc = enqueue_done(set, mode);
+	if ((flags & LDLM_FL_TEST_LOCK) &&
+	    atomic_read(&set->set_success) == set->set_count)
+		lov_llh_put(set->set_lockh);
+	lov_put_reqset(set);
+	RETURN(done_rc);
+}
+
+/* Prepare a request set for matching locks on @lsm over @policy's extent:
+ * one sub-request per intersecting stripe, and every such stripe's target
+ * must be active.  @lockh receives the cookie of the new handle array.
+ * Returns 0 with *reqset set, else -ENOMEM or -EIO after cleanup. */
+int lov_prep_match_set(struct obd_export *exp, struct obd_info *oinfo,
+		       struct lov_stripe_md *lsm, ldlm_policy_data_t *policy,
+		       __u32 mode, struct lustre_handle *lockh,
+		       struct lov_request_set **reqset)
+{
+	struct lov_obd *lov = &exp->exp_obd->u.lov;
+	struct lov_request_set *set;
+	int stripe, rc = 0;
+	ENTRY;
+
+	OBD_ALLOC(set, sizeof(*set));
+	if (!set)
+		RETURN(-ENOMEM);
+	lov_init_set(set);
+	set->set_exp = exp;
+	set->set_oi = oinfo;
+	set->set_oi->oi_md = lsm;
+	set->set_lockh = lov_llh_new(lsm);
+	if (!set->set_lockh)
+		GOTO(out_set, rc = -ENOMEM);
+	lockh->cookie = set->set_lockh->llh_handle.h_cookie;
+
+	for (stripe = 0; stripe < lsm->lsm_stripe_count; stripe++) {
+		struct lov_oinfo *loi = lsm->lsm_oinfo[stripe];
+		struct lov_request *req;
+		obd_off start, end;
+
+		if (!lov_stripe_intersects(lsm, stripe, policy->l_extent.start,
+					   policy->l_extent.end, &start, &end))
+			continue;
+
+		/* FIXME raid1 should grace this error */
+		if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+			CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+			GOTO(out_set, rc = -EIO);
+		}
+
+		OBD_ALLOC(req, sizeof(*req));
+		if (!req)
+			GOTO(out_set, rc = -ENOMEM);
+
+		req->rq_buflen = sizeof(*req->rq_oi.oi_md);
+		OBD_ALLOC_LARGE(req->rq_oi.oi_md, req->rq_buflen);
+		if (!req->rq_oi.oi_md) {
+			OBD_FREE(req, sizeof(*req));
+			GOTO(out_set, rc = -ENOMEM);
+		}
+
+		req->rq_idx = loi->loi_ost_idx;
+		req->rq_stripe = stripe;
+		req->rq_oi.oi_policy.l_extent.gid = policy->l_extent.gid;
+		req->rq_oi.oi_policy.l_extent.start = start;
+		req->rq_oi.oi_policy.l_extent.end = end;
+
+		/* XXX LOV STACKING: submd should be from the subobj */
+		req->rq_oi.oi_md->lsm_oi = loi->loi_oi;
+		req->rq_oi.oi_md->lsm_stripe_count = 0;
+		lov_set_add_req(req, set);
+	}
+	if (set->set_count == 0)
+		GOTO(out_set, rc = -EIO);
+	*reqset = set;
+	RETURN(rc);
+out_set:
+	lov_fini_match_set(set, mode, 0);
+	RETURN(rc);
+}
+
+/* Finish a cancel set: lov_llh_put() its handle array, if any, then call
+ * lov_put_reqset().  A NULL @set is accepted.  Always returns 0. */
+int lov_fini_cancel_set(struct lov_request_set *set)
+{
+	ENTRY;
+
+	if (!set)
+		RETURN(0);
+	LASSERT(set->set_exp);
+
+	if (set->set_lockh != NULL)
+		lov_llh_put(set->set_lockh);
+	lov_put_reqset(set);
+
+	RETURN(0);
+}
+
+/* Prepare a request set for cancelling the sub-locks recorded in the handle
+ * array that @lockh refers to: one sub-request per stripe whose handle is in
+ * use.  Returns 0 with *reqset set; -EINVAL for a bad @lockh, -ENOMEM, or
+ * -EIO when no stripe handle is in use. */
+int lov_prep_cancel_set(struct obd_export *exp, struct obd_info *oinfo,
+			struct lov_stripe_md *lsm, __u32 mode,
+			struct lustre_handle *lockh,
+			struct lov_request_set **reqset)
+{
+	struct lov_request_set *set;
+	int stripe, rc = 0;
+	ENTRY;
+
+	OBD_ALLOC(set, sizeof(*set));
+	if (!set)
+		RETURN(-ENOMEM);
+	lov_init_set(set);
+	set->set_exp = exp;
+	set->set_oi = oinfo;
+	set->set_oi->oi_md = lsm;
+	set->set_lockh = lov_handle2llh(lockh);
+	if (!set->set_lockh) {
+		CERROR("LOV: invalid lov lock handle %p\n", lockh);
+		GOTO(out_set, rc = -EINVAL);
+	}
+	lockh->cookie = set->set_lockh->llh_handle.h_cookie;
+
+	for (stripe = 0; stripe < lsm->lsm_stripe_count; stripe++) {
+		struct lov_oinfo *loi = lsm->lsm_oinfo[stripe];
+		struct lustre_handle *lov_lockhp;
+		struct lov_request *req;
+
+		lov_lockhp = &set->set_lockh->llh_handles[stripe];
+		if (!lustre_handle_is_used(lov_lockhp)) {
+			CDEBUG(D_INFO, "lov idx %d subobj "DOSTID" no lock\n",
+			       loi->loi_ost_idx, POSTID(&loi->loi_oi));
+			continue;
+		}
+
+		OBD_ALLOC(req, sizeof(*req));
+		if (!req)
+			GOTO(out_set, rc = -ENOMEM);
+		req->rq_buflen = sizeof(*req->rq_oi.oi_md);
+		OBD_ALLOC_LARGE(req->rq_oi.oi_md, req->rq_buflen);
+		if (!req->rq_oi.oi_md) {
+			OBD_FREE(req, sizeof(*req));
+			GOTO(out_set, rc = -ENOMEM);
+		}
+
+		req->rq_stripe = stripe;
+		req->rq_idx = loi->loi_ost_idx;
+		/* XXX LOV STACKING: submd should be from the subobj */
+		req->rq_oi.oi_md->lsm_oi = loi->loi_oi;
+		req->rq_oi.oi_md->lsm_stripe_count = 0;
+		lov_set_add_req(req, set);
+	}
+	if (set->set_count == 0)
+		GOTO(out_set, rc = -EIO);
+	*reqset = set;
+	RETURN(rc);
+out_set:
+	lov_fini_cancel_set(set);
+	RETURN(rc);
+}
+/* Merge, via lov_merge_attrs(), the obdo of every successful sub-request of
+ * @set into set->set_oi->oi_oa.  Returns 0 (also when oi_oa is NULL), -ENOMEM,
+ * or -EIO if nothing succeeded or too few stripes had valid attributes. */
+static int common_attr_done(struct lov_request_set *set)
+{
+	struct lov_request *req;
+	struct obdo *tmp_oa;
+	struct obdo *dst_oa;
+	int rc = 0, attrset = 0;
+	ENTRY;
+
+	LASSERT(set->set_oi != NULL);
+	dst_oa = set->set_oi->oi_oa;
+	if (dst_oa == NULL)
+		RETURN(0);
+	if (atomic_read(&set->set_success) == 0)
+		RETURN(-EIO);
+
+	OBDO_ALLOC(tmp_oa);
+	if (tmp_oa == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	list_for_each_entry(req, &set->set_list, rq_link) {
+		struct obdo *sub_oa = req->rq_oi.oi_oa;
+
+		if (!req->rq_complete || req->rq_rc)
+			continue;
+		if (sub_oa->o_valid == 0)	/* inactive stripe */
+			continue;
+		lov_merge_attrs(tmp_oa, sub_oa, sub_oa->o_valid,
+				set->set_oi->oi_md, req->rq_stripe, &attrset);
+	}
+	if (attrset == 0) {
+		/* rc is set, but the epoch check and the copy below still run */
+		CERROR("No stripes had valid attrs\n");
+		rc = -EIO;
+	}
+	if ((dst_oa->o_valid & OBD_MD_FLEPOCH) &&
+	    set->set_oi->oi_md->lsm_stripe_count != attrset) {
+		/* When we take attributes of some epoch, we require all the
+		 * ost to be active. */
+		CERROR("Not all the stripes had valid attrs\n");
+		GOTO(out, rc = -EIO);
+	}
+	tmp_oa->o_oi = dst_oa->o_oi;
+	memcpy(dst_oa, tmp_oa, sizeof(*dst_oa));
+out:
+	if (tmp_oa != NULL)
+		OBDO_FREE(tmp_oa);
+	RETURN(rc);
+}
+
+/* Called from lov_fini_brw_set(): for each sub-request that completed
+ * without error and reports OBD_MD_FLBLOCKS, copy o_blocks into the
+ * matching stripe's lvb_blocks.  Always returns 0. */
+static int brw_done(struct lov_request_set *set)
+{
+	struct lov_stripe_md *lsm = set->set_oi->oi_md;
+	struct lov_request *req;
+	ENTRY;
+
+	list_for_each_entry(req, &set->set_list, rq_link) {
+		struct obdo *oa = req->rq_oi.oi_oa;
+
+		if (!req->rq_complete || req->rq_rc)
+			continue;
+		if (!(oa->o_valid & OBD_MD_FLBLOCKS))
+			continue;
+		lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_blocks =
+			oa->o_blocks;
+	}
+
+	RETURN(0);
+}
+
+/* Finish a brw set: run brw_done() if any sub-request completed, then call
+ * lov_put_reqset().  NULL @set is a no-op.  Returns brw_done()'s rc or 0. */
+int lov_fini_brw_set(struct lov_request_set *set)
+{
+	int rc = 0;
+	ENTRY;
+
+	if (!set)
+		RETURN(0);
+	LASSERT(set->set_exp);
+	/* FIXME update qos data here */
+	if (atomic_read(&set->set_completes) != 0)
+		rc = brw_done(set);
+	lov_put_reqset(set);
+	RETURN(rc);
+}
+
+int lov_prep_brw_set(struct obd_export *exp, struct obd_info *oinfo,
+		     obd_count oa_bufs, struct brw_page *pga,
+		     struct obd_trans_info *oti,
+		     struct lov_request_set **reqset)
+{
+	struct {
+		obd_count       index;
+		obd_count       count;
+		obd_count       off;
+	} *info = NULL;
+	struct lov_request_set *set;
+	struct lov_obd *lov = &exp->exp_obd->u.lov;
+	int rc = 0, i, shift;
+	ENTRY;
+	/* Splits the pga[] pages into one sub-request per stripe they hit. */
+	OBD_ALLOC(set, sizeof(*set));
+	if (set == NULL)
+		RETURN(-ENOMEM);
+	lov_init_set(set);
+
+	set->set_exp = exp;
+	set->set_oti = oti;
+	set->set_oi = oinfo;
+	set->set_oabufs = oa_bufs;
+	OBD_ALLOC_LARGE(set->set_pga, oa_bufs * sizeof(*set->set_pga));
+	if (!set->set_pga)
+		GOTO(out, rc = -ENOMEM);
+	/* info[]: per-stripe page count, first set_pga slot and fill cursor */
+	OBD_ALLOC_LARGE(info, sizeof(*info) * oinfo->oi_md->lsm_stripe_count);
+	if (!info)
+		GOTO(out, rc = -ENOMEM);
+	/* NOTE(review): counting assumes info[] starts zero-filled - confirm */
+	/* calculate the page count for each stripe */
+	for (i = 0; i < oa_bufs; i++) {
+		int stripe = lov_stripe_number(oinfo->oi_md, pga[i].off);
+		info[stripe].count++;
+	}
+
+	/* alloc and initialize lov request */
+	shift = 0;
+	for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++){
+		struct lov_oinfo *loi = NULL;
+		struct lov_request *req;
+
+		if (info[i].count == 0)
+			continue;
+
+		loi = oinfo->oi_md->lsm_oinfo[i];
+		if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+			CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+			GOTO(out, rc = -EIO);
+		}
+
+		OBD_ALLOC(req, sizeof(*req));
+		if (req == NULL)
+			GOTO(out, rc = -ENOMEM);
+
+		OBDO_ALLOC(req->rq_oi.oi_oa);
+		if (req->rq_oi.oi_oa == NULL) {
+			OBD_FREE(req, sizeof(*req));
+			GOTO(out, rc = -ENOMEM);
+		}
+
+		if (oinfo->oi_oa) {
+			memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
+			       sizeof(*req->rq_oi.oi_oa));
+		}
+		req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+		req->rq_oi.oi_oa->o_stripe_idx = i;
+
+		req->rq_buflen = sizeof(*req->rq_oi.oi_md);
+		OBD_ALLOC_LARGE(req->rq_oi.oi_md, req->rq_buflen);
+		if (req->rq_oi.oi_md == NULL) {
+			OBDO_FREE(req->rq_oi.oi_oa);
+			OBD_FREE(req, sizeof(*req));
+			GOTO(out, rc = -ENOMEM);
+		}
+
+		req->rq_idx = loi->loi_ost_idx;
+		req->rq_stripe = i;
+
+		/* XXX LOV STACKING */
+		req->rq_oi.oi_md->lsm_oi = loi->loi_oi;
+		req->rq_oabufs = info[i].count;
+		req->rq_pgaidx = shift;
+		shift += req->rq_oabufs;
+
+		/* remember the index for sort brw_page array */
+		info[i].index = req->rq_pgaidx;
+
+		req->rq_oi.oi_capa = oinfo->oi_capa;
+
+		lov_set_add_req(req, set);
+	}
+	if (!set->set_count)
+		GOTO(out, rc = -EIO);
+
+	/* rotate & sort the brw_page array */
+	for (i = 0; i < oa_bufs; i++) {
+		int stripe = lov_stripe_number(oinfo->oi_md, pga[i].off);
+
+		shift = info[stripe].index + info[stripe].off;
+		LASSERT(shift < oa_bufs);
+		set->set_pga[shift] = pga[i];
+		lov_stripe_offset(oinfo->oi_md, pga[i].off, stripe,
+				  &set->set_pga[shift].off);
+		info[stripe].off++;
+	}
+out:
+	if (info)
+		OBD_FREE_LARGE(info,
+			       sizeof(*info) * oinfo->oi_md->lsm_stripe_count);
+	/* only a fully built set is handed back; otherwise it is torn down */
+	if (rc == 0)
+		*reqset = set;
+	else
+		lov_fini_brw_set(set);
+
+	RETURN(rc);
+}
+
+/* Finish a getattr set: run common_attr_done() if any sub-request
+ * completed, then lov_put_reqset().  NULL @set returns 0. */
+int lov_fini_getattr_set(struct lov_request_set *set)
+{
+	int rc = 0;
+	ENTRY;
+
+	if (!set)
+		RETURN(0);
+	LASSERT(set->set_exp);
+	if (atomic_read(&set->set_completes) != 0)
+		rc = common_attr_done(set);
+	lov_put_reqset(set);
+	RETURN(rc);
+}
+
+/* Callback for osc_getattr_async: passes the result @rc of the sub-request
+ * that embeds @cookie (as rq_oi) to lov_update_common_set(). */
+static int cb_getattr_update(void *cookie, int rc)
+{
+	struct obd_info *sub_oi = cookie;
+	struct lov_request *sub_req =
+		container_of(sub_oi, struct lov_request, rq_oi);
+	return lov_update_common_set(sub_req->rq_rqset, sub_req, rc);
+}
+
+/* Prepare a getattr request set: one sub-request, carrying a copy of
+ * oinfo->oi_oa, per stripe whose target is active.  With OBD_MD_FLEPOCH an
+ * inactive target fails the whole call.  Returns 0 with *reqset set, else
+ * -ENOMEM or -EIO after cleanup. */
+int lov_prep_getattr_set(struct obd_export *exp, struct obd_info *oinfo,
+			 struct lov_request_set **reqset)
+{
+	struct lov_request_set *set;
+	struct lov_obd *lov = &exp->exp_obd->u.lov;
+	int rc = 0, stripe;
+	ENTRY;
+
+	OBD_ALLOC(set, sizeof(*set));
+	if (!set)
+		RETURN(-ENOMEM);
+	lov_init_set(set);
+	set->set_exp = exp;
+	set->set_oi = oinfo;
+
+	for (stripe = 0; stripe < oinfo->oi_md->lsm_stripe_count; stripe++) {
+		struct lov_oinfo *loi = oinfo->oi_md->lsm_oinfo[stripe];
+		struct lov_request *req;
+
+		if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+			CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+			/* SOM requires all the OSTs to be active. */
+			if (oinfo->oi_oa->o_valid & OBD_MD_FLEPOCH)
+				GOTO(out_set, rc = -EIO);
+			continue;
+		}
+
+		OBD_ALLOC(req, sizeof(*req));
+		if (!req)
+			GOTO(out_set, rc = -ENOMEM);
+		OBDO_ALLOC(req->rq_oi.oi_oa);
+		if (!req->rq_oi.oi_oa) {
+			OBD_FREE(req, sizeof(*req));
+			GOTO(out_set, rc = -ENOMEM);
+		}
+
+		memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
+		       sizeof(*req->rq_oi.oi_oa));
+		req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+		req->rq_oi.oi_cb_up = cb_getattr_update;
+		req->rq_oi.oi_capa = oinfo->oi_capa;
+		req->rq_stripe = stripe;
+		req->rq_idx = loi->loi_ost_idx;
+		lov_set_add_req(req, set);
+	}
+	if (set->set_count == 0)
+		GOTO(out_set, rc = -EIO);
+	*reqset = set;
+	RETURN(rc);
+out_set:
+	lov_fini_getattr_set(set);
+	RETURN(rc);
+}
+
+/* Finish a destroy set: call lov_put_reqset() on it.  NULL @set is a no-op.
+ * Always returns 0. */
+int lov_fini_destroy_set(struct lov_request_set *set)
+{
+	ENTRY;
+
+	if (!set)
+		RETURN(0);
+	LASSERT(set->set_exp);
+	if (atomic_read(&set->set_completes) != 0) {
+		/* FIXME update qos data here */
+	}
+	lov_put_reqset(set);
+	RETURN(0);
+}
+
+/* Prepare a destroy request set for @lsm: one sub-request, carrying a copy
+ * of @src_oa, per stripe whose target is active (inactive ones are skipped).
+ * Returns 0 with *reqset set, else -ENOMEM or -EIO after cleanup. */
+int lov_prep_destroy_set(struct obd_export *exp, struct obd_info *oinfo,
+			 struct obdo *src_oa, struct lov_stripe_md *lsm,
+			 struct obd_trans_info *oti,
+			 struct lov_request_set **reqset)
+{
+	struct lov_request_set *set;
+	struct lov_obd *lov = &exp->exp_obd->u.lov;
+	int rc = 0, stripe;
+	ENTRY;
+
+	OBD_ALLOC(set, sizeof(*set));
+	if (!set)
+		RETURN(-ENOMEM);
+	lov_init_set(set);
+	set->set_exp = exp;
+	set->set_oti = oti;
+	set->set_oi = oinfo;
+	set->set_oi->oi_md = lsm;
+	set->set_oi->oi_oa = src_oa;
+	if (oti != NULL && (src_oa->o_valid & OBD_MD_FLCOOKIE))
+		set->set_cookies = oti->oti_logcookies;
+
+	for (stripe = 0; stripe < lsm->lsm_stripe_count; stripe++) {
+		struct lov_oinfo *loi = lsm->lsm_oinfo[stripe];
+		struct lov_request *req;
+
+		if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+			CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+			continue;
+		}
+
+		OBD_ALLOC(req, sizeof(*req));
+		if (!req)
+			GOTO(out_set, rc = -ENOMEM);
+		OBDO_ALLOC(req->rq_oi.oi_oa);
+		if (!req->rq_oi.oi_oa) {
+			OBD_FREE(req, sizeof(*req));
+			GOTO(out_set, rc = -ENOMEM);
+		}
+
+		memcpy(req->rq_oi.oi_oa, src_oa, sizeof(*req->rq_oi.oi_oa));
+		req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+		req->rq_stripe = stripe;
+		req->rq_idx = loi->loi_ost_idx;
+		lov_set_add_req(req, set);
+	}
+	if (set->set_count == 0)
+		GOTO(out_set, rc = -EIO);
+	*reqset = set;
+	RETURN(rc);
+out_set:
+	lov_fini_destroy_set(set);
+	RETURN(rc);
+}
+
+/* Finish a setattr set: run common_attr_done() if any sub-request
+ * completed, then lov_put_reqset().  NULL @set returns 0. */
+int lov_fini_setattr_set(struct lov_request_set *set)
+{
+	int rc = 0;
+	ENTRY;
+
+	if (!set)
+		RETURN(0);
+	LASSERT(set->set_exp);
+	/* FIXME update qos data here */
+	if (atomic_read(&set->set_completes) != 0)
+		rc = common_attr_done(set);
+	lov_put_reqset(set);
+	RETURN(rc);
+}
+
+int lov_update_setattr_set(struct lov_request_set *set,
+			   struct lov_request *req, int rc)
+{
+	struct lov_obd *lov = &req->rq_rqset->set_exp->exp_obd->u.lov;
+	struct lov_stripe_md *lsm = req->rq_rqset->set_oi->oi_md;
+	ENTRY;
+
+	lov_update_set(set, req, rc);
+
+	/* grace error on inactive ost */
+	if (rc && !(lov->lov_tgts[req->rq_idx] &&
+		    lov->lov_tgts[req->rq_idx]->ltd_active))
+		rc = 0;
+
+	if (rc == 0) {
+		if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLCTIME)
+			lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_ctime =
+				req->rq_oi.oi_oa->o_ctime;
+		if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLMTIME)
+			lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_mtime =
+				req->rq_oi.oi_oa->o_mtime;
+		if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLATIME)
+			lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_atime =
+				req->rq_oi.oi_oa->o_atime;
+	}
+
+	RETURN(rc);
+}
+
+/* Callback for osc_setattr_async: passes the result @rc of the sub-request
+ * that embeds @cookie (as rq_oi) to lov_update_setattr_set(). */
+static int cb_setattr_update(void *cookie, int rc)
+{
+	struct obd_info *sub_oi = cookie;
+	struct lov_request *sub_req =
+		container_of(sub_oi, struct lov_request, rq_oi);
+	return lov_update_setattr_set(sub_req->rq_rqset, sub_req, rc);
+}
+
+/* Prepare a setattr request set: one sub-request with a copy of oinfo->oi_oa
+ * per stripe whose target is active; with OBD_MD_FLSIZE, o_size is run
+ * through lov_stripe_offset().  Returns 0 with *reqset set, else an error. */
+int lov_prep_setattr_set(struct obd_export *exp, struct obd_info *oinfo,
+			 struct obd_trans_info *oti,
+			 struct lov_request_set **reqset)
+{
+	struct lov_request_set *set;
+	struct lov_obd *lov = &exp->exp_obd->u.lov;
+	int rc = 0, stripe;
+	ENTRY;
+
+	OBD_ALLOC(set, sizeof(*set));
+	if (!set)
+		RETURN(-ENOMEM);
+	lov_init_set(set);
+	set->set_exp = exp;
+	set->set_oti = oti;
+	set->set_oi = oinfo;
+	if (oti != NULL && (oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE))
+		set->set_cookies = oti->oti_logcookies;
+
+	for (stripe = 0; stripe < oinfo->oi_md->lsm_stripe_count; stripe++) {
+		struct lov_oinfo *loi = oinfo->oi_md->lsm_oinfo[stripe];
+		struct lov_request *req;
+
+		if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+			CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+			continue;
+		}
+
+		OBD_ALLOC(req, sizeof(*req));
+		if (!req)
+			GOTO(out_set, rc = -ENOMEM);
+		OBDO_ALLOC(req->rq_oi.oi_oa);
+		if (!req->rq_oi.oi_oa) {
+			OBD_FREE(req, sizeof(*req));
+			GOTO(out_set, rc = -ENOMEM);
+		}
+		req->rq_stripe = stripe;
+		req->rq_idx = loi->loi_ost_idx;
+		memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
+		       sizeof(*req->rq_oi.oi_oa));
+		req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+		req->rq_oi.oi_oa->o_stripe_idx = stripe;
+		req->rq_oi.oi_cb_up = cb_setattr_update;
+		req->rq_oi.oi_capa = oinfo->oi_capa;
+		if (oinfo->oi_oa->o_valid & OBD_MD_FLSIZE) {
+			int off = lov_stripe_offset(oinfo->oi_md,
+						    oinfo->oi_oa->o_size,
+						    stripe,
+						    &req->rq_oi.oi_oa->o_size);
+
+			if (off < 0 && req->rq_oi.oi_oa->o_size)
+				req->rq_oi.oi_oa->o_size--;
+			CDEBUG(D_INODE, "stripe %d has size "LPU64"/"LPU64"\n",
+			       stripe, req->rq_oi.oi_oa->o_size,
+			       oinfo->oi_oa->o_size);
+		}
+		lov_set_add_req(req, set);
+	}
+	if (set->set_count == 0)
+		GOTO(out_set, rc = -EIO);
+	*reqset = set;
+	RETURN(rc);
+out_set:
+	lov_fini_setattr_set(set);
+	RETURN(rc);
+}
+
+/* Finish a punch set.  If any sub-request completed, the result is
+ * common_attr_done()'s rc when at least one succeeded, else -EIO; if none
+ * completed it is 0.  NULL @set is a no-op. */
+int lov_fini_punch_set(struct lov_request_set *set)
+{
+	int rc = 0;
+	ENTRY;
+
+	if (!set)
+		RETURN(0);
+	LASSERT(set->set_exp);
+	/* FIXME update qos data here */
+	if (atomic_read(&set->set_completes) != 0)
+		rc = atomic_read(&set->set_success) != 0 ?
+			common_attr_done(set) : -EIO;
+
+	lov_put_reqset(set);
+	RETURN(rc);
+}
+
+int lov_update_punch_set(struct lov_request_set *set,
+			 struct lov_request *req, int rc)
+{
+	struct lov_obd *lov = &req->rq_rqset->set_exp->exp_obd->u.lov;
+	struct lov_stripe_md *lsm = req->rq_rqset->set_oi->oi_md;
+	ENTRY;
+
+	lov_update_set(set, req, rc);
+
+	/* grace error on inactive ost */
+	if (rc && !lov->lov_tgts[req->rq_idx]->ltd_active)
+		rc = 0;
+
+	if (rc == 0) {
+		lov_stripe_lock(lsm);
+		if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLBLOCKS) {
+			lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_blocks =
+				req->rq_oi.oi_oa->o_blocks;
+		}
+
+		lov_stripe_unlock(lsm);
+	}
+
+	RETURN(rc);
+}
+
+/* Callback for osc_punch: passes the result @rc of the sub-request that
+ * embeds @cookie (as rq_oi) to lov_update_punch_set(). */
+static int cb_update_punch(void *cookie, int rc)
+{
+	struct obd_info *sub_oi = cookie;
+	struct lov_request *sub_req =
+		container_of(sub_oi, struct lov_request, rq_oi);
+	return lov_update_punch_set(sub_req->rq_rqset, sub_req, rc);
+}
+
+/* Prepare a punch request set: one sub-request, carrying a copy of
+ * oinfo->oi_oa, per stripe that intersects oinfo->oi_policy's extent; every
+ * such stripe's target must be active.  Returns 0 with *reqset set, else
+ * -ENOMEM or -EIO after cleanup. */
+int lov_prep_punch_set(struct obd_export *exp, struct obd_info *oinfo,
+		       struct obd_trans_info *oti,
+		       struct lov_request_set **reqset)
+{
+	struct lov_request_set *set;
+	struct lov_obd *lov = &exp->exp_obd->u.lov;
+	int rc = 0, stripe;
+	ENTRY;
+
+	OBD_ALLOC(set, sizeof(*set));
+	if (!set)
+		RETURN(-ENOMEM);
+	lov_init_set(set);
+	set->set_exp = exp;
+	set->set_oi = oinfo;
+
+	for (stripe = 0; stripe < oinfo->oi_md->lsm_stripe_count; stripe++) {
+		struct lov_oinfo *loi = oinfo->oi_md->lsm_oinfo[stripe];
+		struct lov_request *req;
+		obd_off rs, re;
+
+		if (!lov_stripe_intersects(oinfo->oi_md, stripe,
+					   oinfo->oi_policy.l_extent.start,
+					   oinfo->oi_policy.l_extent.end,
+					   &rs, &re))
+			continue;
+
+		if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+			CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+			GOTO(out_set, rc = -EIO);
+		}
+
+		OBD_ALLOC(req, sizeof(*req));
+		if (!req)
+			GOTO(out_set, rc = -ENOMEM);
+		OBDO_ALLOC(req->rq_oi.oi_oa);
+		if (!req->rq_oi.oi_oa) {
+			OBD_FREE(req, sizeof(*req));
+			GOTO(out_set, rc = -ENOMEM);
+		}
+
+		req->rq_stripe = stripe;
+		req->rq_idx = loi->loi_ost_idx;
+		memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
+		       sizeof(*req->rq_oi.oi_oa));
+		req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+		req->rq_oi.oi_oa->o_valid |= OBD_MD_FLGROUP;
+		req->rq_oi.oi_oa->o_stripe_idx = stripe;
+
+		req->rq_oi.oi_policy.l_extent.start = rs;
+		req->rq_oi.oi_policy.l_extent.end = re;
+		req->rq_oi.oi_policy.l_extent.gid = -1;
+		req->rq_oi.oi_cb_up = cb_update_punch;
+		req->rq_oi.oi_capa = oinfo->oi_capa;
+		lov_set_add_req(req, set);
+	}
+	if (set->set_count == 0)
+		GOTO(out_set, rc = -EIO);
+	*reqset = set;
+	RETURN(rc);
+out_set:
+	lov_fini_punch_set(set);
+	RETURN(rc);
+}
+
+/* Finish a sync set.  Returns -EIO when sub-requests completed but none
+ * succeeded, otherwise 0.  NULL @set is a no-op. */
+int lov_fini_sync_set(struct lov_request_set *set)
+{
+	int rc = 0;
+	ENTRY;
+
+	if (!set)
+		RETURN(0);
+	LASSERT(set->set_exp);
+	/* FIXME update qos data here */
+	if (atomic_read(&set->set_completes) != 0 &&
+	    atomic_read(&set->set_success) == 0)
+		rc = -EIO;
+
+	lov_put_reqset(set);
+	RETURN(rc);
+}
+
+/* Callback for osc_sync: passes the result @rc of the sub-request that
+ * embeds @cookie (as rq_oi) to lov_update_common_set(). */
+static int cb_sync_update(void *cookie, int rc)
+{
+	struct obd_info *sub_oi = cookie;
+	struct lov_request *sub_req =
+		container_of(sub_oi, struct lov_request, rq_oi);
+
+	return lov_update_common_set(sub_req->rq_rqset, sub_req, rc);
+}
+
+/* Prepare a sync request set for the extent @start..@end: one sub-request,
+ * carrying a copy of oinfo->oi_oa, per active stripe that intersects it.
+ * Returns 0 with *reqset set, else -ENOMEM or -EIO after cleanup. */
+int lov_prep_sync_set(struct obd_export *exp, struct obd_info *oinfo,
+		      obd_off start, obd_off end,
+		      struct lov_request_set **reqset)
+{
+	struct lov_request_set *set;
+	struct lov_obd *lov = &exp->exp_obd->u.lov;
+	int rc = 0, stripe;
+	ENTRY;
+
+	OBD_ALLOC_PTR(set);
+	if (!set)
+		RETURN(-ENOMEM);
+	lov_init_set(set);
+	set->set_exp = exp;
+	set->set_oi = oinfo;
+
+	for (stripe = 0; stripe < oinfo->oi_md->lsm_stripe_count; stripe++) {
+		struct lov_oinfo *loi = oinfo->oi_md->lsm_oinfo[stripe];
+		struct lov_request *req;
+		obd_off rs, re;
+
+		if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+			CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+			continue;
+		}
+		if (!lov_stripe_intersects(oinfo->oi_md, stripe, start, end,
+					   &rs, &re))
+			continue;
+
+		OBD_ALLOC_PTR(req);
+		if (!req)
+			GOTO(out_set, rc = -ENOMEM);
+		OBDO_ALLOC(req->rq_oi.oi_oa);
+		if (!req->rq_oi.oi_oa) {
+			OBD_FREE(req, sizeof(*req));
+			GOTO(out_set, rc = -ENOMEM);
+		}
+
+		req->rq_stripe = stripe;
+		req->rq_idx = loi->loi_ost_idx;
+		*req->rq_oi.oi_oa = *oinfo->oi_oa;
+		req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+		req->rq_oi.oi_oa->o_stripe_idx = stripe;
+
+		req->rq_oi.oi_policy.l_extent.start = rs;
+		req->rq_oi.oi_policy.l_extent.end = re;
+		req->rq_oi.oi_policy.l_extent.gid = -1;
+		req->rq_oi.oi_cb_up = cb_sync_update;
+		lov_set_add_req(req, set);
+	}
+	if (set->set_count == 0)
+		GOTO(out_set, rc = -EIO);
+	*reqset = set;
+	RETURN(rc);
+out_set:
+	lov_fini_sync_set(set);
+	RETURN(rc);
+}
+
+#define LOV_U64_MAX ((__u64)~0ULL)	/* value LOV_SUM_MAX clamps to */
+#define LOV_SUM_MAX(tot, add)						\
+	do {								\
+		if ((tot) + (add) >= (tot))				\
+			(tot) += (add);					\
+		else							\
+			(tot) = LOV_U64_MAX;				\
+	} while (0)
+
+/* Returns -EIO if @success is 0.  Else runs os_files/os_ffree through
+ * lov_do_div64() (unless LOV_U64_MAX), caches @osfs in @obd and returns 0. */
+int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs,int success)
+{
+	__u32 expected_stripes;
+	ENTRY;
+
+	if (!success)
+		RETURN(-EIO);
+	expected_stripes = lov_get_stripecnt(&obd->u.lov, LOV_MAGIC, 0);
+	if (osfs->os_files != LOV_U64_MAX)
+		lov_do_div64(osfs->os_files, expected_stripes);
+	if (osfs->os_ffree != LOV_U64_MAX)
+		lov_do_div64(osfs->os_ffree, expected_stripes);
+
+	spin_lock(&obd->obd_osfs_lock);
+	memcpy(&obd->obd_osfs, osfs, sizeof(*osfs));
+	obd->obd_osfs_age = cfs_time_current_64();
+	spin_unlock(&obd->obd_osfs_lock);
+	RETURN(0);
+}
+
+/* Finish a statfs set: if any sub-request completed, lov_fini_statfs() is
+ * run with the success count.  NULL @set is a no-op returning 0. */
+int lov_fini_statfs_set(struct lov_request_set *set)
+{
+	int rc = 0;
+	ENTRY;
+
+	if (!set)
+		RETURN(0);
+	if (atomic_read(&set->set_completes) != 0)
+		rc = lov_fini_statfs(set->set_obd, set->set_oi->oi_osfs,
+				     atomic_read(&set->set_success));
+	lov_put_reqset(set);
+	RETURN(rc);
+}
+
+void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs,
+		       int success)
+{
+	int shift = 0, quit = 0;
+	__u64 tmp;
+	/* success == 0: osfs takes a plain copy; else lov_sfs is added in */
+	if (success == 0) {
+		memcpy(osfs, lov_sfs, sizeof(*lov_sfs));
+	} else {
+		if (osfs->os_bsize != lov_sfs->os_bsize) {
+			/* assume all block sizes are always powers of 2 */
+			/* get the bits difference: distance between set bits */
+			tmp = osfs->os_bsize | lov_sfs->os_bsize;
+			for (shift = 0; shift <= 64; ++shift) {
+				if (tmp & 1) {
+					if (quit)
+						break;
+					else
+						quit = 1;
+					shift = 0;
+				}
+				tmp >>= 1;
+			}
+		}
+		/* NOTE(review): the loop leaves shift == 65 if one bsize is 0 */
+		if (osfs->os_bsize < lov_sfs->os_bsize) {
+			osfs->os_bsize = lov_sfs->os_bsize;
+			/* osfs had the smaller blocks: rescale its counters */
+			osfs->os_bfree  >>= shift;
+			osfs->os_bavail >>= shift;
+			osfs->os_blocks >>= shift;
+		} else if (shift != 0) {
+			lov_sfs->os_bfree  >>= shift;
+			lov_sfs->os_bavail >>= shift;
+			lov_sfs->os_blocks >>= shift;
+		}
+		osfs->os_bfree += lov_sfs->os_bfree;
+		osfs->os_bavail += lov_sfs->os_bavail;
+		osfs->os_blocks += lov_sfs->os_blocks;
+		/* XXX not sure about this one - depends on policy.
+		 *   - could be minimum if we always stripe on all OBDs
+		 *     (but that would be wrong for any other policy,
+		 *     if one of the OBDs has no more objects left)
+		 *   - could be sum if we stripe whole objects
+		 *   - could be average, just to give a nice number
+		 *
+		 * To give a "reasonable" (if not wholly accurate)
+		 * number, we divide the total number of free objects
+		 * by expected stripe count (watch out for overflow).
+		 */
+		LOV_SUM_MAX(osfs->os_files, lov_sfs->os_files);
+		LOV_SUM_MAX(osfs->os_ffree, lov_sfs->os_ffree);
+	}
+}
+
+/* The callback for osc_statfs_async that finalizes a request info when a
+ * response is received.  Always returns 0. */
+static int cb_statfs_update(void *cookie, int rc)
+{
+	struct obd_info *oinfo = cookie;
+	struct lov_request *lovreq;
+	struct lov_request_set *set;
+	struct obd_statfs *osfs, *lov_sfs;
+	struct lov_obd *lov;
+	struct lov_tgt_desc *tgt;
+	struct obd_device *lovobd, *tgtobd;
+	int success;
+	ENTRY;
+
+	lovreq = container_of(oinfo, struct lov_request, rq_oi);
+	set = lovreq->rq_rqset;
+	lovobd = set->set_obd;
+	lov = &lovobd->u.lov;
+	osfs = set->set_oi->oi_osfs;
+	lov_sfs = oinfo->oi_osfs;
+	success = atomic_read(&set->set_success);
+	/* XXX: the same is done in lov_update_common_set, however
+	   lovset->set_exp is not initialized. */
+	lov_update_set(set, lovreq, rc);
+	if (rc)
+		GOTO(out, rc);
+
+	obd_getref(lovobd);
+	tgt = lov->lov_tgts[lovreq->rq_idx];
+	if (!tgt || !tgt->ltd_active)
+		GOTO(out_update, rc);
+	/* store the reply in the target obd, stamping it unless FROM_CACHE */
+	tgtobd = class_exp2obd(tgt->ltd_exp);
+	spin_lock(&tgtobd->obd_osfs_lock);
+	memcpy(&tgtobd->obd_osfs, lov_sfs, sizeof(*lov_sfs));
+	if ((oinfo->oi_flags & OBD_STATFS_FROM_CACHE) == 0)
+		tgtobd->obd_osfs_age = cfs_time_current_64();
+	spin_unlock(&tgtobd->obd_osfs_lock);
+
+out_update:
+	lov_update_statfs(osfs, lov_sfs, success);
+	obd_putref(lovobd);
+
+out:
+	if (set->set_oi->oi_flags & OBD_STATFS_PTLRPCD &&
+	    lov_set_finished(set, 0)) {
+		lov_statfs_interpret(NULL, set, set->set_count !=
+				     atomic_read(&set->set_success));
+	}
+	/* the callback itself always reports success */
+	RETURN(0);
+}
+
+/* Prepare a statfs request set: one sub-request per target that exists and
+ * has an export; a target found inactive is skipped only if oi_flags has
+ * OBD_STATFS_NODELAY.  Returns 0 with *reqset set, else -ENOMEM or -EIO. */
+int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo,
+			struct lov_request_set **reqset)
+{
+	struct lov_request_set *set;
+	struct lov_obd *lov = &obd->u.lov;
+	int rc = 0, tgt_idx;
+	ENTRY;
+
+	OBD_ALLOC(set, sizeof(*set));
+	if (!set)
+		RETURN(-ENOMEM);
+	lov_init_set(set);
+	set->set_obd = obd;
+	set->set_oi = oinfo;
+
+	/* We only get block data from the OBD */
+	for (tgt_idx = 0; tgt_idx < lov->desc.ld_tgt_count; tgt_idx++) {
+		struct lov_request *req;
+
+		if (!lov->lov_tgts[tgt_idx] ||
+		    (!lov_check_and_wait_active(lov, tgt_idx) &&
+		     (oinfo->oi_flags & OBD_STATFS_NODELAY))) {
+			CDEBUG(D_HA, "lov idx %d inactive\n", tgt_idx);
+			continue;
+		}
+
+		/* skip targets that have been explicitly disabled by the
+		 * administrator */
+		if (!lov->lov_tgts[tgt_idx]->ltd_exp) {
+			CDEBUG(D_HA, "lov idx %d administratively disabled\n",
+			       tgt_idx);
+			continue;
+		}
+
+		OBD_ALLOC(req, sizeof(*req));
+		if (!req)
+			GOTO(out_set, rc = -ENOMEM);
+		OBD_ALLOC(req->rq_oi.oi_osfs, sizeof(*req->rq_oi.oi_osfs));
+		if (!req->rq_oi.oi_osfs) {
+			OBD_FREE(req, sizeof(*req));
+			GOTO(out_set, rc = -ENOMEM);
+		}
+		req->rq_idx = tgt_idx;
+		req->rq_oi.oi_cb_up = cb_statfs_update;
+		req->rq_oi.oi_flags = oinfo->oi_flags;
+		lov_set_add_req(req, set);
+	}
+	if (set->set_count == 0)
+		GOTO(out_set, rc = -EIO);
+	*reqset = set;
+	RETURN(rc);
+out_set:
+	lov_fini_statfs_set(set);
+	RETURN(rc);
+}
diff --git a/drivers/staging/lustre/lustre/lov/lovsub_dev.c b/drivers/staging/lustre/lustre/lov/lovsub_dev.c
new file mode 100644
index 000000000000..204ecd0b8639
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lovsub_dev.c
@@ -0,0 +1,211 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_device and cl_device_type for LOVSUB layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Lovsub transfer operations.
+ *
+ */
+
+/* cro_completion() method: give the lovsub request slice back to its slab. */
+static void lovsub_req_completion(const struct lu_env *env,
+				  const struct cl_req_slice *slice, int ioret)
+{
+	struct lovsub_req *lsr = cl2lovsub_req(slice);
+
+	ENTRY;
+	OBD_SLAB_FREE_PTR(lsr, lovsub_req_kmem);
+	EXIT;
+}
+
+/**
+ * lovsub implementation of struct cl_req_operations::cro_attr_set().
+ * The only field lov and lovsub are responsible for is struct
+ * obdo::o_stripe_idx, which is filled in here.
+ */
+static void lovsub_req_attr_set(const struct lu_env *env,
+				const struct cl_req_slice *slice,
+				const struct cl_object *obj,
+				struct cl_req_attr *attr, obd_valid flags)
+{
+	const struct lovsub_object *sub = cl2lovsub(obj);
+
+	ENTRY;
+	/*
+	 * obdo::o_stripe_idx has no OBD_MD_* flag of its own, so \a flags
+	 * cannot select it: store it unconditionally. It never changes
+	 * anyway.
+	 */
+	attr->cra_oa->o_stripe_idx = sub->lso_index;
+	EXIT;
+}
+
+static const struct cl_req_operations lovsub_req_ops = {
+	.cro_completion = lovsub_req_completion,	/* frees the slice */
+	.cro_attr_set   = lovsub_req_attr_set,		/* sets o_stripe_idx */
+};
+
+/*****************************************************************************
+ *
+ * Lov-sub device and device type functions.
+ *
+ */
+
+/* ldto_device_init(): initialize the @next device on the site of @d, then take
+ * a reference on it and remember it in acid_next. Returns the rc of @next. */
+static int lovsub_device_init(const struct lu_env *env, struct lu_device *d,
+			      const char *name, struct lu_device *next)
+{
+	struct lovsub_device  *lsd = lu2lovsub_dev(d);
+	struct lu_device_type *ldt = next->ld_type;
+	int		       rc;
+
+	ENTRY;
+	LASSERT(ldt != NULL);
+	next->ld_site = d->ld_site;
+	rc = ldt->ldt_ops->ldto_device_init(env, next, ldt->ldt_name, NULL);
+	if (rc != 0) {
+		next->ld_site = NULL;
+		RETURN(rc);
+	}
+	lu_device_get(next);
+	lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init);
+	lsd->acid_next = lu2cl_dev(next);
+	RETURN(0);
+}
+
+/* ldto_device_fini(): clear the stack links and return the device below. */
+static struct lu_device *lovsub_device_fini(const struct lu_env *env,
+					    struct lu_device *d)
+{
+	struct lovsub_device *lsd = lu2lovsub_dev(d);
+	struct lu_device *below;
+
+	ENTRY;
+	below = cl2lu_dev(lsd->acid_next);
+	lsd->acid_next = NULL;
+	lsd->acid_super = NULL;
+	RETURN(below);
+}
+
+static struct lu_device *lovsub_device_free(const struct lu_env *env,
+					    struct lu_device *d)
+{
+	struct lovsub_device *lsd = lu2lovsub_dev(d);
+	struct lu_device *below = cl2lu_dev(lsd->acid_next);
+	/* ldto_device_free(): finalize the cl_device, free it, return @below */
+	cl_device_fini(lu2cl_dev(d));
+	OBD_FREE_PTR(lsd);
+	return below;
+}
+
+/* cdo_req_init(): allocate a lovsub request slice and add it to @req.
+ * Returns 0 on success or -ENOMEM when the slab allocation fails. */
+static int lovsub_req_init(const struct lu_env *env, struct cl_device *dev,
+			   struct cl_req *req)
+{
+	struct lovsub_req *lsr;
+
+	OBD_SLAB_ALLOC_PTR_GFP(lsr, lovsub_req_kmem, __GFP_IO);
+	if (lsr == NULL)
+		return -ENOMEM;
+
+	cl_req_slice_add(req, &lsr->lsrq_cl, dev, &lovsub_req_ops);
+	return 0;
+}
+
+static const struct lu_device_operations lovsub_lu_ops = {
+	.ldo_recovery_complete = NULL,	/* hook not provided by lovsub */
+	.ldo_process_config    = NULL,	/* hook not provided by lovsub */
+	.ldo_object_alloc      = lovsub_object_alloc,
+};
+
+static const struct cl_device_operations lovsub_cl_ops = {
+	.cdo_req_init = lovsub_req_init,	/* adds the lovsub_req slice */
+};
+
+/* ldto_device_alloc(): allocate and initialize a lovsub device. Returns its
+ * lu_device, or an ERR_PTR(); the allocation is freed again on failure. */
+static struct lu_device *lovsub_device_alloc(const struct lu_env *env,
+					     struct lu_device_type *t,
+					     struct lustre_cfg *cfg)
+{
+	struct lovsub_device *lsd;
+	int		      result;
+
+	OBD_ALLOC_PTR(lsd);
+	if (lsd == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	result = cl_device_init(&lsd->acid_cl, t);
+	if (result != 0) {
+		OBD_FREE_PTR(lsd);	/* do not leak the device */
+		return ERR_PTR(result);
+	}
+	lsd->acid_cl.cd_ops = &lovsub_cl_ops;
+	lovsub2lu_dev(lsd)->ld_ops = &lovsub_lu_ops;
+	return lovsub2lu_dev(lsd);
+}
+
+static const struct lu_device_type_operations lovsub_device_type_ops = {
+	.ldto_device_init  = lovsub_device_init,
+	.ldto_device_fini  = lovsub_device_fini,
+	/* allocation and release of struct lovsub_device */
+	.ldto_device_alloc = lovsub_device_alloc,
+	.ldto_device_free  = lovsub_device_free,
+};
+
+#define LUSTRE_LOVSUB_NAME	 "lovsub"
+
+/* Device type of the lovsub layer, named LUSTRE_LOVSUB_NAME. */
+struct lu_device_type lovsub_device_type = {
+	.ldt_name     = LUSTRE_LOVSUB_NAME,
+	.ldt_tags     = LU_DEVICE_CL,
+	.ldt_ctx_tags = LCT_CL_THREAD,
+	.ldt_ops      = &lovsub_device_type_ops,
+};
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lovsub_io.c b/drivers/staging/lustre/lustre/lov/lovsub_io.c
new file mode 100644
index 000000000000..783ec687a4e7
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lovsub_io.c
@@ -0,0 +1,55 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_io for LOVSUB layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Lovsub io operations.
+ *
+ */
+
+/* All trivial */
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lovsub_lock.c b/drivers/staging/lustre/lustre/lov/lovsub_lock.c
new file mode 100644
index 000000000000..03bab17ccc64
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lovsub_lock.c
@@ -0,0 +1,485 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_lock for LOVSUB layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Lovsub lock operations.
+ *
+ */
+
+/* clo_fini(): free the lovsub lock slice; its parent list must be empty. */
+static void lovsub_lock_fini(const struct lu_env *env,
+			     struct cl_lock_slice *slice)
+{
+	struct lovsub_lock *lsl = cl2lovsub_lock(slice);
+
+	ENTRY;
+	LASSERT(list_empty(&lsl->lss_parents));
+	OBD_SLAB_FREE_PTR(lsl, lovsub_lock_kmem);
+	EXIT;
+}
+
+/* Take a reference on the top-lock of @lov, then acquire its mutex. */
+static void lovsub_parent_lock(const struct lu_env *env, struct lov_lock *lov)
+{
+	struct cl_lock *top = lov->lls_cl.cls_lock;
+
+	ENTRY;
+	cl_lock_get(top);
+	lu_ref_add(&top->cll_reference, "lovsub-parent", current);
+	cl_lock_mutex_get(env, top);
+	EXIT;
+}
+
+/* Counterpart of lovsub_parent_lock(): mutex first, then the reference. */
+static void lovsub_parent_unlock(const struct lu_env *env, struct lov_lock *lov)
+{
+	struct cl_lock *top = lov->lls_cl.cls_lock;
+
+	ENTRY;
+	cl_lock_mutex_put(env, top);
+	lu_ref_del(&top->cll_reference, "lovsub-parent", current);
+	cl_lock_put(env, top);
+	EXIT;
+}
+
+/**
+ * Implements cl_lock_operations::clo_state() method for lovsub layer, which
+ * is called whenever sub-lock state changes. Propagates the state change to
+ * the top-locks by signalling each parent other than lovsub_lock::lss_active.
+ */
+static void lovsub_lock_state(const struct lu_env *env,
+			      const struct cl_lock_slice *slice,
+			      enum cl_lock_state state)
+{
+	struct lovsub_lock   *sub = cl2lovsub_lock(slice);
+	struct lov_lock_link *link;
+
+	LASSERT(cl_lock_is_mutexed(slice->cls_lock));
+	ENTRY;
+
+	list_for_each_entry(link, &sub->lss_parents, lll_list) {
+		struct cl_lock *top = link->lll_super->lls_cl.cls_lock;
+
+		/* the parent recorded in lss_active is not signalled */
+		if (top == sub->lss_active)
+			continue;
+		lovsub_parent_lock(env, link->lll_super);
+		cl_lock_signal(env, top);
+		lovsub_parent_unlock(env, link->lll_super);
+	}
+	EXIT;
+}
+
+/**
+ * Implementation of cl_lock_operations::clo_weigh(): the weight of a sub-lock
+ * is estimated by asking its first parent top-lock; 0 without any parent.
+ */
+static unsigned long lovsub_lock_weigh(const struct lu_env *env,
+				       const struct cl_lock_slice *slice)
+{
+	struct lovsub_lock *lock = cl2lovsub_lock(slice);
+	struct lov_lock    *lov;
+	unsigned long       weight;
+
+	ENTRY;
+
+	LASSERT(cl_lock_is_mutexed(slice->cls_lock));
+
+	if (list_empty(&lock->lss_parents))
+		RETURN(0);
+
+	/*
+	 * It is not clear whether every parent should be asked and the
+	 * estimations summed, or whether asking one suffices. One is always
+	 * enough for the current users, so only the first link is consulted.
+	 */
+	lov = container_of(lock->lss_parents.next,
+			   struct lov_lock_link, lll_list)->lll_super;
+
+	lovsub_parent_lock(env, lov);
+	weight = cl_lock_weigh(env, lov->lls_cl.cls_lock);
+	lovsub_parent_unlock(env, lov);
+
+	RETURN(weight);
+}
+
+/**
+ * Maps start/end page offsets within stripe \a stripe to offsets within the
+ * file. A CL_PAGE_EOF end stays CL_PAGE_EOF, and so does one that overflows.
+ */
+static void lovsub_lock_descr_map(const struct cl_lock_descr *in,
+				  struct lov_object *lov,
+				  int stripe, struct cl_lock_descr *out)
+{
+	pgoff_t start = in->cld_start;
+	pgoff_t end   = in->cld_end;
+
+	ENTRY;
+
+	if (lov->lo_lsm->lsm_stripe_count > 1) {
+		/* stripe size in pages */
+		pgoff_t size = cl_index(lov2cl(lov),
+					lov->lo_lsm->lsm_stripe_size);
+		/* pages in every stripe occupied by "other" stripes */
+		pgoff_t skip = (lov->lo_lsm->lsm_stripe_count - 1) * size;
+		/* pages preceding this stripe within one round of stripes */
+		pgoff_t base = stripe * size;
+
+		/* XXX overflow check here? */
+		start += start / size * skip + base;
+
+		if (end != CL_PAGE_EOF) {
+			end += end / size * skip + base;
+			/* a wrapped-around end is treated as end of file */
+			if (end < in->cld_end)
+				end = CL_PAGE_EOF;
+		}
+	}
+
+	out->cld_start = start;
+	out->cld_end   = end;
+	EXIT;
+}
+
+/**
+ * Adjusts parent lock extent when a sub-lock is attached to a parent. This is
+ * called in two ways:
+ *
+ *     - as part of receive call-back, when server returns granted extent to
+ *       the client, and
+ *     - when top-lock finds existing sub-lock in the cache.
+ *
+ * Note, that lock mode is not propagated to the parent: i.e., if CLM_READ
+ * top-lock matches CLM_WRITE sub-lock, top-lock is still CLM_READ.
+ * Returns 0 if the parent extent already matches, else cl_lock_modify()'s rc.
+ */
+int lov_sublock_modify(const struct lu_env *env, struct lov_lock *lov,
+		       struct lovsub_lock *sublock,
+		       const struct cl_lock_descr *d, int idx)
+{
+	struct cl_lock       *parent       = lov->lls_cl.cls_lock;
+	struct cl_lock_descr *parent_descr = &parent->cll_descr;
+	struct lovsub_object *subobj;
+	struct cl_lock_descr *pd;
+
+	LASSERT(cl_lock_mode_match(d->cld_mode, parent_descr->cld_mode));
+
+	subobj = cl2lovsub(sublock->lss_cl.cls_obj);
+	pd     = &lov_env_info(env)->lti_ldescr;
+
+	/* object, mode and gid come from the parent, the extent is mapped */
+	pd->cld_gid  = parent_descr->cld_gid;
+	pd->cld_mode = parent_descr->cld_mode;
+	pd->cld_obj  = parent_descr->cld_obj;
+	lovsub_lock_descr_map(d, subobj->lso_super, subobj->lso_index, pd);
+
+	/* remember the descriptor this sub-lock got */
+	lov->lls_sub[idx].sub_got = *d;
+
+	/*
+	 * Notify top-lock about modification, if lock description changes
+	 * materially.
+	 */
+	if (cl_lock_ext_match(parent_descr, pd))
+		return 0;
+
+	return cl_lock_modify(env, parent, pd);
+}
+
+/* clo_modify(): apply @d to all parent top-locks; first non-zero rc wins. */
+static int lovsub_lock_modify(const struct lu_env *env,
+			      const struct cl_lock_slice *s,
+			      const struct cl_lock_descr *d)
+{
+	struct lovsub_lock   *lock = cl2lovsub_lock(s);
+	struct lov_lock_link *link;
+	int		      first_err = 0;
+
+	ENTRY;
+	LASSERT(cl_lock_mode_match(d->cld_mode,
+				   s->cls_lock->cll_descr.cld_mode));
+	list_for_each_entry(link, &lock->lss_parents, lll_list) {
+		struct lov_lock *lov = link->lll_super;
+		int rc;
+
+		lovsub_parent_lock(env, lov);
+		rc = lov_sublock_modify(env, lov, lock, d, link->lll_idx);
+		lovsub_parent_unlock(env, lov);
+		if (first_err == 0)
+			first_err = rc;
+	}
+	RETURN(first_err);
+}
+
+/* clo_closure(): add every parent top-lock to @closure, stopping at the
+ * first non-zero cl_lock_closure_build() result, which is returned. */
+static int lovsub_lock_closure(const struct lu_env *env,
+			       const struct cl_lock_slice *slice,
+			       struct cl_lock_closure *closure)
+{
+	struct lovsub_lock   *sub;
+	struct lov_lock_link *link;
+	int		      rc = 0;
+
+	LASSERT(cl_lock_is_mutexed(slice->cls_lock));
+	ENTRY;
+
+	sub = cl2lovsub_lock(slice);
+	list_for_each_entry(link, &sub->lss_parents, lll_list) {
+		struct cl_lock *top = link->lll_super->lls_cl.cls_lock;
+
+		rc = cl_lock_closure_build(env, top, closure);
+		if (rc != 0)
+			break;
+	}
+	RETURN(rc);
+}
+
+/**
+ * A helper function for lovsub_lock_delete() that deals with a given parent
+ * top-lock. Returns 1 when the \a child mutex was released here, 0 otherwise.
+ */
+static int lovsub_lock_delete_one(const struct lu_env *env,
+				  struct cl_lock *child, struct lov_lock *lov)
+{
+	struct cl_lock *parent;
+	int	     result;
+	ENTRY;
+
+	parent = lov->lls_cl.cls_lock;
+	if (parent->cll_error)
+		RETURN(0);
+
+	result = 0;
+	switch (parent->cll_state) {
+	case CLS_ENQUEUED:
+		/* See LU-1355 for the case that a glimpse lock is
+		 * interrupted by signal */
+		LASSERT(parent->cll_flags & CLF_CANCELLED);
+		break;
+	case CLS_QUEUING:
+	case CLS_FREEING:
+		cl_lock_signal(env, parent);
+		break;
+	case CLS_INTRANSIT:
+		/*
+		 * Here lies a problem: a sub-lock is canceled while top-lock
+		 * is being unlocked. Top-lock cannot be moved into CLS_NEW
+		 * state, because unlocking has to succeed eventually by
+		 * placing lock into CLS_CACHED (or failing it), see
+		 * cl_unuse_try(). Nor can top-lock be left in CLS_CACHED
+		 * state, because lov maintains an invariant that all
+		 * sub-locks exist in CLS_CACHED (this allows cached top-lock
+		 * to be reused immediately). Nor can we wait for top-lock
+		 * state to change, because this can be synchronous to the
+		 * current thread.
+		 *
+		 * We know for sure that lov_lock_unuse() will be called at
+		 * least one more time to finish un-using, so leave a mark on
+		 * the top-lock, that will be seen by the next call to
+		 * lov_lock_unuse().
+		 */
+		if (cl_lock_is_intransit(parent))
+			lov->lls_cancel_race = 1;
+		break;
+	case CLS_CACHED:
+		/*
+		 * if a sub-lock is canceled move its top-lock into CLS_NEW
+		 * state to preserve an invariant that a top-lock in
+		 * CLS_CACHED is immediately ready for re-use (i.e., has all
+		 * sub-locks), and so that next attempt to re-use the top-lock
+		 * enqueues missing sub-lock.
+		 */
+		cl_lock_state_set(env, parent, CLS_NEW);
+		/* fall through */
+	case CLS_NEW:
+		/*
+		 * if last sub-lock is canceled, destroy the top-lock (which
+		 * is now `empty') proactively.
+		 */
+		if (lov->lls_nr_filled == 0) {
+			/* ... but unfortunately, this cannot be done easily,
+			 * as cancellation of a top-lock might acquire mutexes
+			 * of its other sub-locks, violating lock ordering,
+			 * see cl_lock_{cancel,delete}() preconditions.
+			 *
+			 * To work around this, the mutex of this sub-lock is
+			 * released, top-lock is destroyed, and sub-lock mutex
+			 * acquired again. The list of parents has to be
+			 * re-scanned from the beginning after this.
+			 *
+			 * Only do this if no mutexes other than on @child and
+			 * @parent are held by the current thread.
+			 *
+			 * TODO: The lock model here is too complex, because
+			 * the lock may be canceled and deleted voluntarily:
+			 *    cl_lock_request
+			 *      -> osc_lock_enqueue_wait
+			 *	-> osc_lock_cancel_wait
+			 *	  -> cl_lock_delete
+			 *	    -> lovsub_lock_delete
+			 *	      -> cl_lock_cancel/delete
+			 *		-> ...
+			 *
+			 * The better choice is to spawn a kernel thread for
+			 * this purpose. -jay
+			 */
+			if (cl_lock_nr_mutexed(env) == 2) {
+				cl_lock_mutex_put(env, child);
+				cl_lock_cancel(env, parent);
+				cl_lock_delete(env, parent);
+				result = 1;
+			}
+		}
+		break;
+	case CLS_HELD: /* logs, then falls through to the LBUG() below */
+		CL_LOCK_DEBUG(D_ERROR, env, parent, "Delete CLS_HELD lock\n");
+	default:
+		CERROR("Impossible state: %d\n", parent->cll_state);
+		LBUG();
+		break;
+	}
+
+	RETURN(result);
+}
+
+/**
+ * An implementation of cl_lock_operations::clo_delete() method. This is
+ * invoked in "bottom-to-top" delete, when lock destruction starts from the
+ * sub-lock (e.g., as a result of ldlm lock LRU policy).
+ */
+static void lovsub_lock_delete(const struct lu_env *env,
+			       const struct cl_lock_slice *slice)
+{
+	struct cl_lock     *child = slice->cls_lock;
+	struct lovsub_lock *sub   = cl2lovsub_lock(slice);
+	int restart;
+
+	LASSERT(cl_lock_is_mutexed(child));
+
+	ENTRY;
+	/*
+	 * Destruction of a sub-lock might take multiple iterations, because
+	 * when the last sub-lock of a given top-lock is deleted, top-lock is
+	 * canceled proactively, and this requires to release sub-lock
+	 * mutex. Once sub-lock mutex has been released, list of its parents
+	 * has to be re-scanned from the beginning.
+	 */
+	do {
+		struct lov_lock      *lov;
+		struct lov_lock_link *scan;
+		struct lov_lock_link *temp;
+		struct lov_lock_sub  *subdata;
+
+		restart = 0;
+		list_for_each_entry_safe(scan, temp,
+					     &sub->lss_parents, lll_list) {
+			lov     = scan->lll_super;
+			subdata = &lov->lls_sub[scan->lll_idx];
+			lovsub_parent_lock(env, lov);
+			subdata->sub_got = subdata->sub_descr;
+			lov_lock_unlink(env, scan, sub);
+			restart = lovsub_lock_delete_one(env, child, lov);
+			lovsub_parent_unlock(env, lov);
+			/* child mutex was dropped: retake it and rescan */
+			if (restart) {
+				cl_lock_mutex_get(env, child);
+				break;
+			}
+	       }
+	} while (restart);
+	EXIT;
+}
+
+/* clo_print(): for every parent link print "[<idx> <lov_lock pointer> ",
+ * the parent top-lock descriptor (if the link has a lov_lock) and "] ". */
+static int lovsub_lock_print(const struct lu_env *env, void *cookie,
+			     lu_printer_t p, const struct cl_lock_slice *slice)
+{
+	struct lovsub_lock   *sub = cl2lovsub_lock(slice);
+	struct lov_lock_link *link;
+
+	list_for_each_entry(link, &sub->lss_parents, lll_list) {
+		p(env, cookie, "[%d %p ", link->lll_idx, link->lll_super);
+		if (link->lll_super != NULL)
+			cl_lock_descr_print(env, cookie, p,
+				&link->lll_super->lls_cl.cls_lock->cll_descr);
+		p(env, cookie, "] ");
+	}
+	return 0;
+}
+
+static const struct cl_lock_operations lovsub_lock_ops = {
+	.clo_closure = lovsub_lock_closure,
+	.clo_delete  = lovsub_lock_delete,
+	.clo_fini    = lovsub_lock_fini,
+	.clo_modify  = lovsub_lock_modify,
+	.clo_print   = lovsub_lock_print,
+	.clo_state   = lovsub_lock_state,
+	.clo_weigh   = lovsub_lock_weigh,
+};
+
+/* coo_lock_init(): allocate the lovsub slice of @lock with an empty parent
+ * list and attach it. Returns 0, or -ENOMEM if the allocation fails. */
+int lovsub_lock_init(const struct lu_env *env, struct cl_object *obj,
+		     struct cl_lock *lock, const struct cl_io *io)
+{
+	struct lovsub_lock *lsk;
+
+	ENTRY;
+	OBD_SLAB_ALLOC_PTR_GFP(lsk, lovsub_lock_kmem, __GFP_IO);
+	if (lsk == NULL)
+		RETURN(-ENOMEM);
+
+	INIT_LIST_HEAD(&lsk->lss_parents);
+	cl_lock_slice_add(lock, &lsk->lss_cl, obj, &lovsub_lock_ops);
+	RETURN(0);
+}
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lovsub_object.c b/drivers/staging/lustre/lustre/lov/lovsub_object.c
new file mode 100644
index 000000000000..1b83d9081c40
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lovsub_object.c
@@ -0,0 +1,170 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_object for LOVSUB layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Lovsub object operations.
+ *
+ */
+
+/*
+ * loo_object_init(): allocate the object of the device below and add it
+ * under @obj. Returns 0, or -ENOMEM when the lower allocation fails.
+ */
+int lovsub_object_init(const struct lu_env *env, struct lu_object *obj,
+		       const struct lu_object_conf *conf)
+{
+	struct lovsub_device *dev = lu2lovsub_dev(obj->lo_dev);
+	struct lu_device     *under;
+	struct lu_object     *below;
+
+	ENTRY;
+	under = &dev->acid_next->cd_lu_dev;
+	below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under);
+	if (below == NULL)
+		RETURN(-ENOMEM);
+
+	lu_object_add(obj, below);
+	cl_object_page_init(lu2cl(obj), sizeof(struct lovsub_page));
+	RETURN(0);
+}
+
+/* loo_object_free(): clear the parent's slot for this object, then free it. */
+static void lovsub_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+	struct lovsub_object *los = lu2lovsub(obj);
+	struct lov_object    *lov = los->lso_super;
+	ENTRY;
+
+	/* lov may still be unset here, because of the shadow object handling
+	 * in lu_object_find. */
+	if (lov != NULL) {
+		LASSERT(lov->lo_type == LLT_RAID0);
+		LASSERT(lov->u.raid0.lo_sub[los->lso_index] == los);
+		spin_lock(&lov->u.raid0.lo_sub_lock);
+		lov->u.raid0.lo_sub[los->lso_index] = NULL;
+		spin_unlock(&lov->u.raid0.lo_sub_lock);
+	}
+
+	lu_object_fini(obj);
+	lu_object_header_fini(&los->lso_header.coh_lu);
+	OBD_SLAB_FREE_PTR(los, lovsub_object_kmem);
+	EXIT;
+}
+
+/* loo_object_print(): print lovsub_object::lso_index as "[%d]" through @p
+ * and return whatever @p returned. */
+static int lovsub_object_print(const struct lu_env *env, void *cookie,
+			       lu_printer_t p, const struct lu_object *obj)
+{
+	return p(env, cookie, "[%d]", lu2lovsub(obj)->lso_index);
+}
+
+/* coo_attr_set(): clears lo_attr_valid on the parent lov object (lov_r0()). */
+static int lovsub_attr_set(const struct lu_env *env, struct cl_object *obj,
+			   const struct cl_attr *attr, unsigned valid)
+{
+	ENTRY;
+	/* @attr and @valid are not looked at; this always returns 0 */
+	lov_r0(cl2lovsub(obj)->lso_super)->lo_attr_valid = 0;
+	RETURN(0);
+}
+
+/* coo_glimpse(): forward the glimpse to the parent lov object's cl_object. */
+static int lovsub_object_glimpse(const struct lu_env *env,
+				 const struct cl_object *obj,
+				 struct ost_lvb *lvb)
+{
+	struct lov_object *lov = cl2lovsub(obj)->lso_super;
+
+	ENTRY;
+	RETURN(cl_object_glimpse(env, &lov->lo_cl, lvb));
+}
+
+/* cl_object methods of lovsub; slots not listed here are left unset. */
+static const struct cl_object_operations lovsub_ops = {
+	.coo_lock_init = lovsub_lock_init,
+	.coo_page_init = lovsub_page_init,
+	.coo_glimpse   = lovsub_object_glimpse,
+	.coo_attr_set  = lovsub_attr_set,
+};
+
+static const struct lu_object_operations lovsub_lu_obj_ops = {
+	.loo_object_init      = lovsub_object_init,
+	.loo_object_free      = lovsub_object_free,
+	.loo_object_print     = lovsub_object_print,
+	.loo_object_delete    = NULL,	/* not provided by lovsub */
+	.loo_object_release   = NULL,	/* not provided by lovsub */
+	.loo_object_invariant = NULL,	/* not provided by lovsub */
+};
+
+/* ldo_object_alloc(): new lovsub object with its own header, or NULL. */
+struct lu_object *lovsub_object_alloc(const struct lu_env *env,
+				      const struct lu_object_header *unused,
+				      struct lu_device *dev)
+{
+	struct cl_object_header *hdr;
+	struct lovsub_object    *los;
+	struct lu_object	*obj;
+
+	ENTRY;
+	OBD_SLAB_ALLOC_PTR_GFP(los, lovsub_object_kmem, __GFP_IO);
+	if (los == NULL)
+		RETURN(NULL);
+
+	hdr = &los->lso_header;
+	obj = lovsub2lu(los);
+	cl_object_header_init(hdr);
+	lu_object_init(obj, &hdr->coh_lu, dev);
+	lu_object_add_top(&hdr->coh_lu, obj);
+	obj->lo_ops = &lovsub_lu_obj_ops;
+	los->lso_cl.co_ops = &lovsub_ops;
+	RETURN(obj);
+}
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lovsub_page.c b/drivers/staging/lustre/lustre/lov/lovsub_page.c
new file mode 100644
index 000000000000..bc9e683968da
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lovsub_page.c
@@ -0,0 +1,72 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_page for LOVSUB layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Lovsub page operations.
+ *
+ */
+/* cpo_fini() method: empty, lovsub_page_init() allocates nothing to undo. */
+static void lovsub_page_fini(const struct lu_env *env,
+			     struct cl_page_slice *slice)
+{
+}
+
+static const struct cl_page_operations lovsub_page_ops = {
+	.cpo_fini = lovsub_page_fini,	/* the only method set by lovsub */
+};
+
+int lovsub_page_init(const struct lu_env *env, struct cl_object *obj,
+		     struct cl_page *page, struct page *unused)
+{
+	struct lovsub_page *lsb = cl_object_page_slice(obj, page);
+	/* coo_page_init(): attach the lovsub slice of @page; always returns 0 */
+	ENTRY;
+	cl_page_slice_add(page, &lsb->lsb_cl, obj, &lovsub_page_ops);
+	RETURN(0);
+}
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lproc_lov.c b/drivers/staging/lustre/lustre/lov/lproc_lov.c
new file mode 100644
index 000000000000..732d5c70edbb
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lproc_lov.c
@@ -0,0 +1,304 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/version.h>
+#include <asm/statfs.h>
+#include <lprocfs_status.h>
+#include <obd_class.h>
+#include <linux/seq_file.h>
+#include "lov_internal.h"
+
+#ifdef LPROCFS
+/* procfs read: print the LOV default stripe size followed by a newline. */
+static int lov_rd_stripesize(char *page, char **start, off_t off, int count,
+			     int *eof, void *data)
+{
+	struct obd_device *obd = data;
+
+	LASSERT(obd != NULL);
+	*eof = 1;
+	return snprintf(page, count, LPU64"\n",
+			obd->u.lov.desc.ld_default_stripe_size);
+}
+
+/* procfs write: parse a u64, let lov_fix_desc_stripe_size() adjust it and
+ * store it as the default stripe size. Returns @count or the parse error. */
+static int lov_wr_stripesize(struct file *file, const char *buffer,
+			     unsigned long count, void *data)
+{
+	struct obd_device *obd = data;
+	__u64 stripe_size;
+	int rc;
+
+	LASSERT(obd != NULL);
+	rc = lprocfs_write_u64_helper(buffer, count, &stripe_size);
+	if (rc != 0)
+		return rc;
+
+	lov_fix_desc_stripe_size(&stripe_size);
+	obd->u.lov.desc.ld_default_stripe_size = stripe_size;
+	return count;
+}
+
+/* procfs read: print the LOV default stripe offset followed by a newline. */
+static int lov_rd_stripeoffset(char *page, char **start, off_t off, int count,
+			       int *eof, void *data)
+{
+	struct obd_device *obd = data;
+
+	LASSERT(obd != NULL);
+	*eof = 1;
+	return snprintf(page, count, LPU64"\n",
+			obd->u.lov.desc.ld_default_stripe_offset);
+}
+
+/* procfs write: parse a u64 from @buffer and store it, unmodified, as the
+ * default stripe offset. Returns @count or the parse helper's error. */
+static int lov_wr_stripeoffset(struct file *file, const char *buffer,
+			       unsigned long count, void *data)
+{
+	struct obd_device *obd = data;
+	__u64 stripe_offset;
+	int rc;
+
+	LASSERT(obd != NULL);
+	rc = lprocfs_write_u64_helper(buffer, count, &stripe_offset);
+	if (rc != 0)
+		return rc;
+
+	obd->u.lov.desc.ld_default_stripe_offset = stripe_offset;
+	return count;
+}
+
+/* procfs read: print the LOV striping pattern as an unsigned integer. */
+static int lov_rd_stripetype(char *page, char **start, off_t off, int count,
+			     int *eof, void *data)
+{
+	struct obd_device *obd = data;
+
+	LASSERT(obd != NULL);
+	*eof = 1;
+	return snprintf(page, count, "%u\n",
+			obd->u.lov.desc.ld_pattern);
+}
+
+/* procfs write: parse an int, let lov_fix_desc_pattern() adjust it and store
+ * it as the striping pattern. Returns @count or the parse helper's error. */
+static int lov_wr_stripetype(struct file *file, const char *buffer,
+			     unsigned long count, void *data)
+{
+	struct obd_device *obd = data;
+	int pattern;
+	int rc;
+
+	LASSERT(obd != NULL);
+	rc = lprocfs_write_helper(buffer, count, &pattern);
+	if (rc != 0)
+		return rc;
+	lov_fix_desc_pattern(&pattern);
+	obd->u.lov.desc.ld_pattern = pattern;
+	return count;
+}
+
+/* procfs read: print the default stripe count; 0xffff reads back as -1. */
+static int lov_rd_stripecount(char *page, char **start, off_t off, int count,
+			      int *eof, void *data)
+{
+	struct obd_device *obd = data;
+	__s16 wrapped;
+
+	LASSERT(obd != NULL);
+	wrapped = (__s16)(obd->u.lov.desc.ld_default_stripe_count + 1);
+	*eof = 1;
+	return snprintf(page, count, "%d\n", wrapped - 1);
+}
+
+/* procfs write: parse an int, let lov_fix_desc_stripe_count() adjust it and
+ * store it as the default stripe count. Returns @count or the parse error. */
+static int lov_wr_stripecount(struct file *file, const char *buffer,
+			      unsigned long count, void *data)
+{
+	struct obd_device *obd = data;
+	int stripe_count;
+	int rc;
+
+	LASSERT(obd != NULL);
+	rc = lprocfs_write_helper(buffer, count, &stripe_count);
+	if (rc != 0)
+		return rc;
+	lov_fix_desc_stripe_count(&stripe_count);
+	obd->u.lov.desc.ld_default_stripe_count = stripe_count;
+	return count;
+}
+
+/* procfs read: print the target count (ld_tgt_count) recorded in the LOV
+ * descriptor, followed by a newline. */
+static int lov_rd_numobd(char *page, char **start, off_t off, int count,
+			 int *eof, void *data)
+{
+	struct obd_device *obd = data;
+
+	LASSERT(obd != NULL);
+	*eof = 1;
+	return snprintf(page, count, "%u\n",
+			obd->u.lov.desc.ld_tgt_count);
+}
+
+/* procfs read: print the active target count from the LOV descriptor. */
+static int lov_rd_activeobd(char *page, char **start, off_t off, int count,
+			    int *eof, void *data)
+{
+	struct obd_device *obd = data;
+
+	LASSERT(obd != NULL);
+	*eof = 1;
+	return snprintf(page, count, "%u\n",
+			obd->u.lov.desc.ld_active_tgt_count);
+}
+
+/* procfs read: print the UUID string stored in the LOV descriptor. */
+static int lov_rd_desc_uuid(char *page, char **start, off_t off, int count,
+			    int *eof, void *data)
+{
+	struct obd_device *obd = data;
+
+	LASSERT(obd != NULL);
+	*eof = 1;
+	return snprintf(page, count, "%s\n",
+			obd->u.lov.desc.ld_uuid.uuid);
+}
+
+/* seq_file start: advance *pos to the first non-NULL target slot at or after
+ * it and return that target, or NULL once *pos reaches ld_tgt_count. */
+static void *lov_tgt_seq_start(struct seq_file *p, loff_t *pos)
+{
+	struct obd_device *obd = p->private;
+	struct lov_obd *lov = &obd->u.lov;
+
+	for (; *pos < lov->desc.ld_tgt_count; ++*pos)
+		if (lov->lov_tgts[*pos] != NULL)
+			return lov->lov_tgts[*pos];
+	return NULL;
+}
+
+static void lov_tgt_seq_stop(struct seq_file *p, void *v)
+{	/* empty: lov_tgt_seq_start() takes nothing that would need releasing */
+}
+
+/* seq_file next: move *pos to the following non-NULL target slot, if any. */
+static void *lov_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+	struct obd_device *obd = p->private;
+	struct lov_obd *lov = &obd->u.lov;
+
+	for (++*pos; *pos < lov->desc.ld_tgt_count; ++*pos)
+		if (lov->lov_tgts[*pos] != NULL)
+			return lov->lov_tgts[*pos];
+	return NULL;
+}
+
+/* seq_file show: print "<index>: <uuid> ACTIVE" (or INACTIVE) for target @v. */
+static int lov_tgt_seq_show(struct seq_file *p, void *v)
+{
+	struct lov_tgt_desc *t = v;
+	return seq_printf(p, "%d: %s %sACTIVE\n", t->ltd_index,
+			  obd_uuid2str(&t->ltd_uuid), t->ltd_active ? "" : "IN");
+}
+
+struct seq_operations lov_tgt_sops = {	/* iterator over lov->lov_tgts[] */
+	.start	= lov_tgt_seq_start,
+	.next	= lov_tgt_seq_next,
+	.stop	= lov_tgt_seq_stop,
+	.show	= lov_tgt_seq_show,
+};
+
+/* Open the target list via seq_file, with dp->data as its private state. */
+static int lov_target_seq_open(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *dp = PDE(inode);
+	struct seq_file *seq;
+	int rc;
+
+	LPROCFS_ENTRY_AND_CHECK(dp);
+	rc = seq_open(file, &lov_tgt_sops);
+	if (rc == 0) {
+		seq = file->private_data;
+		seq->private = dp->data;
+		return 0;
+	}
+	LPROCFS_EXIT();
+	return rc;
+}
+
+struct lprocfs_vars lprocfs_lov_obd_vars[] = {	/* { name, read, write, 0 } */
+	{ "uuid",         lprocfs_rd_uuid,        0, 0 },
+	{ "stripesize",   lov_rd_stripesize,      lov_wr_stripesize, 0 },
+	{ "stripeoffset", lov_rd_stripeoffset,    lov_wr_stripeoffset, 0 },
+	{ "stripecount",  lov_rd_stripecount,     lov_wr_stripecount, 0 },
+	{ "stripetype",   lov_rd_stripetype,      lov_wr_stripetype, 0 },
+	{ "numobd",       lov_rd_numobd,          0, 0 },
+	{ "activeobd",    lov_rd_activeobd,       0, 0 },
+	{ "filestotal",   lprocfs_rd_filestotal,  0, 0 },
+	{ "filesfree",    lprocfs_rd_filesfree,   0, 0 },
+	/*{ "filegroups", lprocfs_rd_filegroups,  0, 0 },*/
+	{ "blocksize",    lprocfs_rd_blksize,     0, 0 },
+	{ "kbytestotal",  lprocfs_rd_kbytestotal, 0, 0 },
+	{ "kbytesfree",   lprocfs_rd_kbytesfree,  0, 0 },
+	{ "kbytesavail",  lprocfs_rd_kbytesavail, 0, 0 },
+	{ "desc_uuid",    lov_rd_desc_uuid,       0, 0 },
+	{ 0 }	/* terminator */
+};
+
+static struct lprocfs_vars lprocfs_lov_module_vars[] = {
+	{ "num_refs", lprocfs_rd_numrefs, 0, 0 },
+	{ 0 }	/* terminator */
+};
+
+void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars)
+{
+	lvars->obd_vars = lprocfs_lov_obd_vars;		/* per-device table */
+	lvars->module_vars = lprocfs_lov_module_vars;	/* per-module table */
+}
+
+struct file_operations lov_proc_target_fops = {	/* seq_file-backed */
+	.owner		= THIS_MODULE,
+	.open		= lov_target_seq_open,
+	.llseek		= seq_lseek,
+	.read		= seq_read,
+	.release	= lprocfs_seq_release,
+};
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/lvfs/Makefile b/drivers/staging/lustre/lustre/lvfs/Makefile
new file mode 100644
index 000000000000..f50b1c574385
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lvfs/Makefile
@@ -0,0 +1,6 @@
+obj-$(CONFIG_LUSTRE_FS) += lvfs.o
+
+lvfs-y := lvfs_linux.o fsfilt.o lvfs_lib.o
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/lvfs/fsfilt.c b/drivers/staging/lustre/lustre/lvfs/fsfilt.c
new file mode 100644
index 000000000000..064445cbdb57
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lvfs/fsfilt.c
@@ -0,0 +1,138 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_FILTER
+
+#include <linux/fs.h>
+#include <linux/jbd.h>
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/libcfs/libcfs.h>
+#include <lustre_fsfilt.h>
+
+LIST_HEAD(fsfilt_types);	/* registered fsfilt_operations, linked via fs_list */
+
+/* Return the registered fsfilt_operations whose fs_type equals @type, or
+ * NULL when none matches.
+ * NOTE(review): no locking of fsfilt_types is visible here or in callers. */
+static struct fsfilt_operations *fsfilt_search_type(const char *type)
+{
+	struct fsfilt_operations *ops;
+
+	list_for_each_entry(ops, &fsfilt_types, fs_list) {
+		if (strcmp(ops->fs_type, type) == 0)
+			return ops;
+	}
+	return NULL;
+}
+
+/*
+ * Register @fs_ops under its fs_type. Registering the same table twice is a
+ * no-op; a different table for an already-known type yields -EEXIST.
+ */
+int fsfilt_register_ops(struct fsfilt_operations *fs_ops)
+{
+	struct fsfilt_operations *known = fsfilt_search_type(fs_ops->fs_type);
+
+	/* TODO: fsfilt_types is not locked anywhere in this file */
+	if (known == NULL) {
+		try_module_get(THIS_MODULE);
+		list_add(&fs_ops->fs_list, &fsfilt_types);
+		return 0;
+	}
+	if (known != fs_ops) {
+		CERROR("different operations for type %s\n", fs_ops->fs_type);
+		RETURN(-EEXIST);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(fsfilt_register_ops);
+
+/*
+ * Remove @fs_ops from fsfilt_types if it is currently registered and drop
+ * one reference on this module; do nothing when it is not on the list.
+ */
+void fsfilt_unregister_ops(struct fsfilt_operations *fs_ops)
+{
+	struct fsfilt_operations *cur;
+
+	/* TODO: fsfilt_types is not locked anywhere in this file */
+	list_for_each_entry(cur, &fsfilt_types, fs_list) {
+		if (cur != fs_ops)
+			continue;
+		list_del(&cur->fs_list);
+		module_put(THIS_MODULE);
+		break;
+	}
+}
+EXPORT_SYMBOL(fsfilt_unregister_ops);
+
+/* Return the ops for @type with a reference on fs_owner held, loading module
+ * "fsfilt_<type>" if needed; returns an ERR_PTR() on failure.
+ * TODO: fsfilt_types is still accessed without any locking. */
+struct fsfilt_operations *fsfilt_get_ops(const char *type)
+{
+	struct fsfilt_operations *fs_ops;
+
+	fs_ops = fsfilt_search_type(type);
+	if (fs_ops == NULL) {
+		char name[32];
+		int rc;
+
+		snprintf(name, sizeof(name), "fsfilt_%s", type);
+		rc = request_module("%s", name);
+		if (rc == 0) {
+			fs_ops = fsfilt_search_type(type);
+			CDEBUG(D_INFO, "Loaded module '%s'\n", name);
+			if (fs_ops == NULL)
+				rc = -ENOENT;
+		}
+		if (rc != 0) {
+			CERROR("Can't find %s interface\n", name);
+			RETURN(ERR_PTR(rc < 0 ? rc : -rc));
+		}
+	}
+	/* a failed get means the owner is unloading: don't hand out its ops */
+	if (!try_module_get(fs_ops->fs_owner))
+		fs_ops = ERR_PTR(-ENODEV);
+	return fs_ops;
+}
+EXPORT_SYMBOL(fsfilt_get_ops);
+
+void fsfilt_put_ops(struct fsfilt_operations *fs_ops)
+{
+	module_put(fs_ops->fs_owner);	/* pairs with fsfilt_get_ops() */
+}
+EXPORT_SYMBOL(fsfilt_put_ops);
diff --git a/drivers/staging/lustre/lustre/lvfs/fsfilt_ext3.c b/drivers/staging/lustre/lustre/lvfs/fsfilt_ext3.c
new file mode 100644
index 000000000000..c1e99b37572e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lvfs/fsfilt_ext3.c
@@ -0,0 +1,761 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lvfs/fsfilt_ext3.c
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FILTER
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <ldiskfs/ldiskfs_config.h>
+#include <ext4/ext4.h>
+#include <ext4/ext4_jbd2.h>
+#include <linux/version.h>
+#include <linux/bitops.h>
+#include <linux/quota.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <lustre_fsfilt.h>
+#include <obd.h>
+#include <linux/lustre_compat25.h>
+#include <linux/lprocfs_status.h>
+
+#include <ext4/ext4_extents.h>
+
+#ifdef HAVE_EXT_PBLOCK /* Name changed to ext4_ext_pblock for kernel 2.6.35 */
+#define ext3_ext_pblock(ex) ext_pblock((ex))
+#endif
+
+/* for kernels 2.6.18 and later */
+#define FSFILT_SINGLEDATA_TRANS_BLOCKS(sb) EXT3_SINGLEDATA_TRANS_BLOCKS(sb)
+
+#define fsfilt_ext3_ext_insert_extent(handle, inode, path, newext, flag) \
+	       ext3_ext_insert_extent(handle, inode, path, newext, flag)
+
+#define ext3_mb_discard_inode_preallocations(inode) \
+		 ext3_discard_preallocations(inode)
+
+#define fsfilt_log_start_commit(journal, tid) jbd2_log_start_commit(journal, tid)
+#define fsfilt_log_wait_commit(journal, tid) jbd2_log_wait_commit(journal, tid)
+
+static struct kmem_cache *fcb_cache;	/* slab of struct fsfilt_cb_data */
+
+struct fsfilt_cb_data {
+	struct ext4_journal_cb_entry cb_jcb;	/* private data - MUST BE FIRST */
+	fsfilt_cb_t cb_func;			/* MDS/OBD completion function */
+	struct obd_device *cb_obd;		/* MDS/OBD completion device */
+	__u64 cb_last_rcvd;			/* MDS/OST last committed operation */
+	void *cb_data;				/* MDS/OST completion function data */
+};
+
+static char *fsfilt_ext3_get_label(struct super_block *sb)
+{
+	return EXT3_SB(sb)->s_es->s_volume_name; /* points into s_es, not a copy */
+}
+
+/* kernel has ext4_blocks_for_truncate since linux-3.1.1 */
+# include <ext4/truncate.h>
+
+/*
+ * We don't currently need any additional blocks for rmdir and
+ * unlink transactions because we are storing the OST oa_id inside
+ * the inode (which we will be changing anyways as part of this
+ * transaction).
+ */
+static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private,
+			       int logs)
+{
+	/* For updates to the last received file */
+	int nblocks = FSFILT_SINGLEDATA_TRANS_BLOCKS(inode->i_sb);
+	journal_t *journal;
+	void *handle;
+	/* returns what ext3_journal_start() returned: a handle or an ERR_PTR */
+	if (current->journal_info) {
+		CDEBUG(D_INODE, "increasing refcount on %p\n",
+		       current->journal_info);
+		goto journal_start;
+	}
+	/* current->journal_info is unset: size the credits according to @op */
+	switch(op) {
+	case FSFILT_OP_UNLINK:
+		/* delete one file + create/update logs for each stripe */
+		nblocks += EXT3_DELETE_TRANS_BLOCKS(inode->i_sb);
+		nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS +
+			    FSFILT_SINGLEDATA_TRANS_BLOCKS(inode->i_sb)) * logs;
+		break;
+	case FSFILT_OP_CANCEL_UNLINK:
+		LASSERT(logs == 1);
+
+		/* blocks for log header bitmap update OR
+		 * blocks for catalog header bitmap update + unlink of logs +
+		 * blocks for delete the inode (include blocks truncating). */
+		nblocks = (LLOG_CHUNK_SIZE >> inode->i_blkbits) +
+			  EXT3_DELETE_TRANS_BLOCKS(inode->i_sb) +
+			  ext4_blocks_for_truncate(inode) + 3;
+		break;
+	default: CERROR("unknown transaction start op %d\n", op);
+		LBUG();
+	}
+	/* cap the request at the journal's j_max_transaction_buffers */
+	LASSERT(current->journal_info == desc_private);
+	journal = EXT3_SB(inode->i_sb)->s_journal;
+	if (nblocks > journal->j_max_transaction_buffers) {
+		CWARN("too many credits %d for op %ux%u using %d instead\n",
+		       nblocks, op, logs, journal->j_max_transaction_buffers);
+		nblocks = journal->j_max_transaction_buffers;
+	}
+	/* the early goto above arrives here with only the initial nblocks */
+ journal_start:
+	LASSERTF(nblocks > 0, "can't start %d credit transaction\n", nblocks);
+	handle = ext3_journal_start(inode, nblocks);
+
+	if (!IS_ERR(handle))
+		LASSERT(current->journal_info == handle);
+	else
+		CERROR("error starting handle for op %u (%u credits): rc %ld\n",
+		       op, nblocks, PTR_ERR(handle));
+	return handle;
+}
+
+/*
+ * Stop journal handle @h, first setting h_sync on it when @force_sync is
+ * non-zero. Returns whatever ext3_journal_stop() returns.
+ */
+static int fsfilt_ext3_commit(struct inode *inode, void *h, int force_sync)
+{
+	handle_t *handle = h;
+
+	LASSERT(current->journal_info == handle);
+	if (force_sync)
+		handle->h_sync = 1; /* recovery likes this */
+	return ext3_journal_stop(handle);
+}
+
+#ifndef EXT3_EXTENTS_FL
+#define EXT3_EXTENTS_FL		 0x00080000 /* Inode uses extents */
+#endif
+
+#ifndef EXT_ASSERT
+#define EXT_ASSERT(cond)  BUG_ON(!(cond))
+#endif
+
+#define EXT_GENERATION(inode)	   (EXT4_I(inode)->i_ext_generation)
+#define ext3_ext_base		   inode
+#define ext3_ext_base2inode(inode)      (inode)
+#define EXT_DEPTH(inode)		ext_depth(inode)
+#define fsfilt_ext3_ext_walk_space(inode, block, num, cb, cbdata) \
+	ext3_ext_walk_space(inode, block, num, cb, cbdata) /* caller adds ';' */
+
+struct bpointers {
+	unsigned long *blocks;	/* next output slot for a physical block number */
+	unsigned long start;	/* logical block that slot corresponds to */
+	int num;		/* slots still to be filled */
+	int init_num;		/* value of num when the walk began */
+	int create;		/* 0: unmapped ranges are reported as block 0 */
+};
+
+/* Suggest a physical block near which logical @block of @inode should be
+ * placed; @aflags is accepted but neither read nor written. */
+static long ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path,
+			       unsigned long block, int *aflags)
+{
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	unsigned long bg_start;
+	unsigned long colour;
+
+	if (path != NULL) {
+		int depth = path->p_depth;
+		struct ext3_extent *ex = path[depth].p_ext;
+
+		/* try to predict block placement */
+		if (ex != NULL)
+			return ext4_ext_pblock(ex) +
+			       (block - le32_to_cpu(ex->ee_block));
+		/* the index looks empty: try to start from the index itself */
+		if (path[depth].p_bh != NULL)
+			return path[depth].p_bh->b_blocknr;
+	}
+
+	/* OK. use inode's group */
+	bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
+		le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
+	colour = (current->pid % 16) *
+		(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
+	return bg_start + colour + block;
+}
+
+#define ll_unmap_underlying_metadata(sb, blocknr) \
+	unmap_underlying_metadata((sb)->s_bdev, blocknr)
+
+#ifndef EXT3_MB_HINT_GROUP_ALLOC /* allocator called as (inode, goal, count, flags) */
+static unsigned long new_blocks(handle_t *handle, struct ext3_ext_base *base,
+				struct ext3_ext_path *path, unsigned long block,
+				unsigned long *count, int *err)
+{
+	unsigned long pblock, goal;
+	int aflags = 0;
+	struct inode *inode = ext3_ext_base2inode(base);
+
+	goal = ext3_ext_find_goal(inode, path, block, &aflags);
+	aflags |= 2; /* block have been already reserved */
+	pblock = ext3_mb_new_blocks(handle, inode, goal, count, aflags, err);
+	return pblock;
+
+}
+#else /* allocator called with a struct ext3_allocation_request */
+static unsigned long new_blocks(handle_t *handle, struct ext3_ext_base *base,
+				struct ext3_ext_path *path, unsigned long block,
+				unsigned long *count, int *err)
+{
+	struct inode *inode = ext3_ext_base2inode(base);
+	struct ext3_allocation_request ar;
+	unsigned long pblock;
+	int aflags; /* only handed to ext3_ext_find_goal(), which ignores it */
+
+	/* find neighbour allocated blocks */
+	ar.lleft = block;
+	*err = ext3_ext_search_left(base, path, &ar.lleft, &ar.pleft);
+	if (*err)
+		return 0;
+	ar.lright = block;
+	*err = ext3_ext_search_right(base, path, &ar.lright, &ar.pright);
+	if (*err)
+		return 0;
+
+	/* allocate new block */
+	ar.goal = ext3_ext_find_goal(inode, path, block, &aflags);
+	ar.inode = inode;
+	ar.logical = block;
+	ar.len = *count;
+	ar.flags = EXT3_MB_HINT_DATA;
+	pblock = ext3_mb_new_blocks(handle, &ar, err);
+	*count = ar.len; /* report back the length left in the request */
+	return pblock;
+}
+#endif /* EXT3_MB_HINT_GROUP_ALLOC */
+
+/* Extent-walk callback for fsfilt_map_nblocks(): store the physical blocks of
+ * @cex through @cbdata (a struct bpointers), first allocating and inserting a
+ * new extent when @cex is not an existing extent and bp->create is set. */
+static int ext3_ext_new_extent_cb(struct ext3_ext_base *base,
+				  struct ext3_ext_path *path,
+				  struct ext3_ext_cache *cex,
+#ifdef HAVE_EXT_PREPARE_CB_EXTENT
+				   struct ext3_extent *ex,
+#endif
+				  void *cbdata)
+{
+	struct bpointers *bp = cbdata;
+	struct inode *inode = ext3_ext_base2inode(base);
+	struct ext3_extent nex;
+	unsigned long pblock, tgen, count;
+	int err, i;
+	handle_t *handle;
+
+#ifdef EXT3_EXT_CACHE_EXTENT
+	if (cex->ec_type == EXT3_EXT_CACHE_EXTENT)
+#else
+	if ((cex->ec_len != 0) && (cex->ec_start != 0))
+#endif
+						   {
+		err = EXT_CONTINUE;
+		goto map;
+	}
+
+	if (bp->create == 0) {
+		i = 0;
+		if (cex->ec_block < bp->start)
+			i = bp->start - cex->ec_block;
+		if (i >= cex->ec_len)
+			CERROR("nothing to do?! i = %d, e_num = %u\n",
+					i, cex->ec_len);
+		for (; i < cex->ec_len && bp->num; i++) {
+			*(bp->blocks) = 0;
+			bp->blocks++;
+			bp->num--;
+			bp->start++;
+		}
+
+		return EXT_CONTINUE;
+	}
+
+	tgen = EXT_GENERATION(base);
+	count = ext3_ext_calc_credits_for_insert(base, path);
+
+	handle = ext3_journal_start(inode, count+EXT3_ALLOC_NEEDED+1);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (tgen != EXT_GENERATION(base)) {
+		/* the tree has changed, so path may be invalid by now */
+		ext3_journal_stop(handle);
+		return EXT_REPEAT;
+	}
+
+	/* In 2.6.32 kernel, ext4_ext_walk_space()'s callback func is not
+	 * protected by i_data_sem as whole. so we patch it to store
+	 * generation to path and now verify the tree hasn't changed */
+	down_write((&EXT4_I(inode)->i_data_sem));
+
+	/* validate extent, make sure the extent tree has not changed */
+	if (EXT_GENERATION(base) != path[0].p_generation) {
+		/* cex is invalid, try again */
+		up_write(&EXT4_I(inode)->i_data_sem);
+		ext3_journal_stop(handle);
+		return EXT_REPEAT;
+	}
+
+	count = cex->ec_len;
+	pblock = new_blocks(handle, base, path, cex->ec_block, &count, &err);
+	if (!pblock)
+		goto out;
+	EXT_ASSERT(count <= cex->ec_len);
+
+	/* insert new extent */
+	nex.ee_block = cpu_to_le32(cex->ec_block);
+	ext3_ext_store_pblock(&nex, pblock);
+	nex.ee_len = cpu_to_le16(count);
+	err = fsfilt_ext3_ext_insert_extent(handle, base, path, &nex, 0);
+	if (err) {
+		/* free data blocks we just allocated */
+		/* not a good idea to call discard here directly,
+		 * but otherwise we'd need to call it every free() */
+#ifdef EXT3_MB_HINT_GROUP_ALLOC
+		ext3_mb_discard_inode_preallocations(inode);
+#endif
+#ifdef HAVE_EXT_FREE_BLOCK_WITH_BUFFER_HEAD /* Introduced in 2.6.32-rc7 */
+		ext3_free_blocks(handle, inode, NULL, ext4_ext_pblock(&nex),
+				 le16_to_cpu(nex.ee_len), 0);
+#else
+		ext3_free_blocks(handle, inode, ext4_ext_pblock(&nex),
+				 le16_to_cpu(nex.ee_len), 0);
+#endif
+		goto out;
+	}
+
+	/*
+	 * Putting len of the actual extent we just inserted,
+	 * we are asking ext3_ext_walk_space() to continue
+	 * scanning after that block
+	 */
+	cex->ec_len = le16_to_cpu(nex.ee_len);
+	cex->ec_start = ext4_ext_pblock(&nex);
+	BUG_ON(le16_to_cpu(nex.ee_len) == 0);
+	BUG_ON(le32_to_cpu(nex.ee_block) != cex->ec_block);
+
+out:
+	up_write((&EXT4_I(inode)->i_data_sem));
+	ext3_journal_stop(handle);
+map:
+	if (err >= 0) {
+		/* map blocks */
+		if (bp->num == 0) {
+			CERROR("hmm. why do we find this extent?\n");
+			CERROR("initial space: %lu:%u\n",
+				bp->start, bp->init_num);
+#ifdef EXT3_EXT_CACHE_EXTENT
+			CERROR("current extent: %u/%u/%llu %d\n",
+				cex->ec_block, cex->ec_len,
+				(unsigned long long)cex->ec_start,
+				cex->ec_type);
+#else
+			CERROR("current extent: %u/%u/%llu\n",
+				cex->ec_block, cex->ec_len,
+				(unsigned long long)cex->ec_start);
+#endif
+		}
+		i = 0;
+		if (cex->ec_block < bp->start)
+			i = bp->start - cex->ec_block;
+		if (i >= cex->ec_len)
+			CERROR("nothing to do?! i = %d, e_num = %u\n",
+					i, cex->ec_len);
+		for (; i < cex->ec_len && bp->num; i++) {
+			*(bp->blocks) = cex->ec_start + i;
+#ifdef EXT3_EXT_CACHE_EXTENT
+			if (cex->ec_type != EXT3_EXT_CACHE_EXTENT)
+#else
+			if ((cex->ec_len == 0) || (cex->ec_start == 0))
+#endif
+									{
+				/* unmap any possible underlying metadata from
+				 * the block device mapping.  bug 6998. */
+				ll_unmap_underlying_metadata(inode->i_sb,
+							     *(bp->blocks));
+			}
+			bp->blocks++;
+			bp->num--;
+			bp->start++;
+		}
+	}
+	return err;
+}
+
+/* Map @num blocks of @inode from @block into @blocks, allocating if @create. */
+int fsfilt_map_nblocks(struct inode *inode, unsigned long block,
+		       unsigned long num, unsigned long *blocks,
+		       int create)
+{
+	struct ext3_ext_base *base = inode;
+	struct bpointers bp = {
+		.blocks   = blocks,
+		.start    = block,
+		.num      = num,
+		.init_num = num,
+		.create   = create,
+	};
+	int err;
+
+	CDEBUG(D_OTHER, "blocks %lu-%lu requested for inode %u\n",
+	       block, block + num - 1, (unsigned) inode->i_ino);
+	err = fsfilt_ext3_ext_walk_space(base, block, num,
+					 ext3_ext_new_extent_cb, &bp);
+	ext3_ext_invalidate_cache(base);
+	return err;
+}
+
+int fsfilt_ext3_map_ext_inode_pages(struct inode *inode, struct page **page,
+				    int pages, unsigned long *blocks,
+				    int create)
+{
+	int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
+	int rc = 0, i = 0;
+	struct page *fp = NULL; /* first page of the run being collected */
+	int clen = 0; /* number of pages in that run so far */
+
+	CDEBUG(D_OTHER, "inode %lu: map %d pages from %lu\n",
+		inode->i_ino, pages, (*page)->index);
+
+	/* pages are sorted already. so, we just have to find
+	 * contig. space and process them properly */
+	while (i < pages) {
+		if (fp == NULL) {
+			/* start new extent */
+			fp = *page++;
+			clen = 1;
+			i++;
+			continue;
+		} else if (fp->index + clen == (*page)->index) {
+			/* continue the extent */
+			page++;
+			clen++;
+			i++;
+			continue;
+		}
+
+		/* process found extent */
+		rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page,
+					clen * blocks_per_page, blocks,
+					create);
+		if (rc)
+			GOTO(cleanup, rc);
+
+		/* look for next extent */
+		fp = NULL;
+		blocks += blocks_per_page * clen;
+	}
+	/* the loop ends with the last run collected but not yet mapped */
+	if (fp)
+		rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page,
+					clen * blocks_per_page, blocks,
+					create);
+cleanup:
+	return rc;
+}
+
+/* Map each page via ext3_map_inode_page(); stop at the first failure. */
+int fsfilt_ext3_map_bm_inode_pages(struct inode *inode, struct page **page,
+				   int pages, unsigned long *blocks,
+				   int create)
+{
+	int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
+	int rc = 0, i;
+
+	for (i = 0; i < pages; i++) {
+		unsigned long *b = blocks + i * blocks_per_page;
+
+		rc = ext3_map_inode_page(inode, page[i], b, create);
+		if (rc == 0)
+			continue;
+		CERROR("ino %lu, blk %lu create %d: rc %d\n",
+		       inode->i_ino, *b, create, rc);
+		break;
+	}
+	return rc;
+}
+
+/* Use the extent-based mapper for EXT3_EXTENTS_FL inodes, otherwise the
+ * block-map mapper, which runs under @optional_mutex when one is given. */
+int fsfilt_ext3_map_inode_pages(struct inode *inode, struct page **page,
+				int pages, unsigned long *blocks,
+				int create, struct mutex *optional_mutex)
+{
+	int rc;
+
+	if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
+		return fsfilt_ext3_map_ext_inode_pages(inode, page, pages,
+						       blocks, create);
+
+	if (optional_mutex != NULL)
+		mutex_lock(optional_mutex);
+	rc = fsfilt_ext3_map_bm_inode_pages(inode, page, pages, blocks, create);
+	if (optional_mutex != NULL)
+		mutex_unlock(optional_mutex);
+	return rc;
+}
+
+/* Read @size bytes of @inode at *@offs into @buf, clamped to i_size; holes are
+ * zero-filled. Returns the original @size, 0 at EOF, or a negative errno. */
+int fsfilt_ext3_read(struct inode *inode, void *buf, int size, loff_t *offs)
+{
+	struct buffer_head *bh;
+	int err, blocksize, csize, boffs, osize = size;
+
+	/* prevent reading after eof */
+	spin_lock(&inode->i_lock);
+	if (i_size_read(inode) < *offs + size) {
+		size = i_size_read(inode) - *offs;
+		spin_unlock(&inode->i_lock);
+		if (size < 0) {
+			CDEBUG(D_EXT2, "size %llu is too short for read @%llu\n",
+			       i_size_read(inode), *offs);
+			return -EBADR;
+		} else if (size == 0) {
+			return 0;
+		}
+	} else {
+		spin_unlock(&inode->i_lock);
+	}
+
+	blocksize = 1 << inode->i_blkbits;
+	while (size > 0) {
+		boffs = *offs & (blocksize - 1);
+		csize = min(blocksize - boffs, size);
+		bh = ext3_bread(NULL, inode, *offs >> inode->i_blkbits, 0, &err);
+		if (!bh && err) {
+			CERROR("can't read block: %d\n", err);
+			return err;
+		}
+		if (bh)
+			memcpy(buf, bh->b_data + boffs, csize);
+		else	/* NULL without an error: unallocated block (hole) */
+			memset(buf, 0, csize);
+		brelse(bh);
+		*offs += csize;
+		buf += csize;
+		size -= csize;
+	}
+	return osize;
+}
+EXPORT_SYMBOL(fsfilt_ext3_read);
+
+/* Record read: report 0 whenever fsfilt_ext3_read() returns a positive
+ * count, otherwise pass its result (0 or a negative error) through. */
+static int fsfilt_ext3_read_record(struct file * file, void *buf,
+				   int size, loff_t *offs)
+{
+	int rc = fsfilt_ext3_read(file->f_dentry->d_inode, buf, size, offs);
+
+	return rc > 0 ? 0 : rc;
+}
+
+int fsfilt_ext3_write_handle(struct inode *inode, void *buf, int bufsize,
+				loff_t *offs, handle_t *handle)
+{
+	struct buffer_head *bh = NULL;
+	loff_t old_size = i_size_read(inode), offset = *offs;
+	loff_t new_size = i_size_read(inode);
+	unsigned long block;
+	int err = 0, blocksize = 1 << inode->i_blkbits, size, boffs;
+	/* copy @buf one block at a time through buffers journaled by @handle */
+	while (bufsize > 0) {
+		if (bh != NULL)
+			brelse(bh);
+
+		block = offset >> inode->i_blkbits;
+		boffs = offset & (blocksize - 1);
+		size = min(blocksize - boffs, bufsize);
+		bh = ext3_bread(handle, inode, block, 1, &err);
+		if (!bh) {
+			CERROR("can't read/create block: %d\n", err);
+			break;
+		}
+
+		err = ext3_journal_get_write_access(handle, bh);
+		if (err) {
+			CERROR("journal_get_write_access() returned error %d\n",
+			       err);
+			break;
+		}
+		LASSERT(bh->b_data + boffs + size <= bh->b_data + bh->b_size);
+		memcpy(bh->b_data + boffs, buf, size);
+		err = ext3_journal_dirty_metadata(handle, bh);
+		if (err) {
+			CERROR("journal_dirty_metadata() returned error %d\n",
+			       err);
+			break;
+		}
+		if (offset + size > new_size)
+			new_size = offset + size;
+		offset += size;
+		bufsize -= size;
+		buf += size;
+	}
+	if (bh)
+		brelse(bh);
+
+	/* correct in-core and on-disk sizes */
+	if (new_size > i_size_read(inode)) {
+		spin_lock(&inode->i_lock);
+		if (new_size > i_size_read(inode))
+			i_size_write(inode, new_size);
+		if (i_size_read(inode) > EXT3_I(inode)->i_disksize)
+			EXT3_I(inode)->i_disksize = i_size_read(inode);
+		if (i_size_read(inode) > old_size) {
+			spin_unlock(&inode->i_lock);
+			mark_inode_dirty(inode);
+		} else {
+			spin_unlock(&inode->i_lock);
+		}
+	}
+	/* *offs is only advanced when no step above reported an error */
+	if (err == 0)
+		*offs = offset;
+	return err;
+}
+EXPORT_SYMBOL(fsfilt_ext3_write_handle);
+
+/* Write @bufsize bytes from @buf at *@offs inside a journal handle started
+ * here; with @force_sync a successful write also sets h_sync on the handle. */
+static int fsfilt_ext3_write_record(struct file *file, void *buf, int bufsize,
+				    loff_t *offs, int force_sync)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	int blocksize = 1 << inode->i_blkbits;
+	int block_count, credits, err;
+	handle_t *handle;
+
+	/* Determine how many transaction credits are needed */
+	block_count = (*offs & (blocksize - 1)) + bufsize;
+	block_count = (block_count + blocksize - 1) >> inode->i_blkbits;
+	credits = block_count * EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + 2;
+
+	handle = ext3_journal_start(inode, credits);
+	if (IS_ERR(handle)) {
+		CERROR("can't start transaction for %d blocks (%d bytes)\n",
+		       credits, bufsize);
+		return PTR_ERR(handle);
+	}
+
+	err = fsfilt_ext3_write_handle(inode, buf, bufsize, offs, handle);
+	if (err == 0 && force_sync)
+		handle->h_sync = 1; /* recovery likes this */
+
+	ext3_journal_stop(handle);
+
+	return err;
+}
+
+/* Check that @sb has a journal (else -EINVAL), enable PDIROPS where
+ * S_PDIROPS is defined, and warn when dir_index is not enabled. */
+static int fsfilt_ext3_setup(struct super_block *sb)
+{
+	if (!EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
+		CERROR("ext3 mounted without journal\n");
+		return -EINVAL;
+	}
+#ifdef S_PDIROPS
+	CWARN("Enabling PDIROPS\n");
+	set_opt(EXT3_SB(sb)->s_mount_opt, PDIROPS);
+	sb->s_flags |= S_PDIROPS;
+#endif
+	if (!EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
+		CWARN("filesystem doesn't have dir_index feature enabled\n");
+	return 0;
+}
+static struct fsfilt_operations fsfilt_ext3_ops = {	/* "ext3" backend */
+	.fs_type		= "ext3",
+	.fs_owner		= THIS_MODULE,
+	.fs_getlabel		= fsfilt_ext3_get_label,
+	.fs_start		= fsfilt_ext3_start,
+	.fs_commit		= fsfilt_ext3_commit,
+	.fs_map_inode_pages	= fsfilt_ext3_map_inode_pages,
+	.fs_write_record	= fsfilt_ext3_write_record,
+	.fs_read_record		= fsfilt_ext3_read_record,
+	.fs_setup		= fsfilt_ext3_setup,
+};
+
+/* Module init: create the journal-callback slab cache and register the ext3
+ * fsfilt operations; the cache is destroyed again if registration fails. */
+static int __init fsfilt_ext3_init(void)
+{
+	int rc;
+
+	fcb_cache = kmem_cache_create("fsfilt_ext3_fcb",
+				      sizeof(struct fsfilt_cb_data), 0, 0,
+				      NULL);	/* 5th argument: no constructor */
+	if (!fcb_cache) {
+		CERROR("error allocating fsfilt journal callback cache\n");
+		GOTO(out, rc = -ENOMEM);
+	}
+
+	rc = fsfilt_register_ops(&fsfilt_ext3_ops);
+	if (rc)
+		kmem_cache_destroy(fcb_cache);	/* returns void */
+out:
+	return rc;
+}
+
+/* Module exit: unregister the ext3 fsfilt operations, then destroy the
+ * callback slab cache created by fsfilt_ext3_init(). */
+static void __exit fsfilt_ext3_exit(void)
+{
+	fsfilt_unregister_ops(&fsfilt_ext3_ops);
+	/* kmem_cache_destroy() returns void, so there is no status to assert */
+	kmem_cache_destroy(fcb_cache);
+}
+
+module_init(fsfilt_ext3_init);
+module_exit(fsfilt_ext3_exit);
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre ext3 Filesystem Helper v0.1");
+MODULE_LICENSE("GPL");
diff --git a/drivers/staging/lustre/lustre/lvfs/lvfs_lib.c b/drivers/staging/lustre/lustre/lvfs/lvfs_lib.c
new file mode 100644
index 000000000000..97a8be2300dd
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lvfs/lvfs_lib.c
@@ -0,0 +1,173 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lvfs/lvfs_lib.c
+ *
+ * Lustre filesystem abstraction routines
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+#include <linux/module.h>
+#include <lustre_lib.h>
+#include <lprocfs_status.h>
+
+#ifdef LPROCFS
+/*
+ * Record one event of size @amount on counter @idx of @stats: bump the count
+ * and, for AVGMINMAX counters, the sum, min, max and optional sum of squares.
+ */
+void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, long amount)
+{
+	struct lprocfs_counter_header	*hdr;
+	struct lprocfs_counter		*cntr;
+	unsigned long			flags = 0;
+	int				cpu;
+
+	if (stats == NULL)
+		return;
+
+	/* Per-client stats only allocate a single CPU area, so the id
+	 * returned here should always be 0 in that case. */
+	cpu = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags);
+	if (cpu < 0)
+		return;
+
+	hdr = &stats->ls_cnt_header[idx];
+	cntr = lprocfs_stats_counter_get(stats, cpu, idx);
+	cntr->lc_count++;
+	if (!(hdr->lc_config & LPROCFS_CNTR_AVGMINMAX))
+		goto unlock;
+
+	/*
+	 * This can run in interrupt context: a memory allocation may trigger
+	 * the shrinker, whose ldlm_pool_shrink() calls back in here (LU-1727).
+	 * Only obd_memory sets LPROCFS_STATS_FLAG_IRQ_SAFE, as it needs exact
+	 * counts for the memory leak check; it gets a separate IRQ sum.
+	 */
+	if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) == 0 ||
+	    !in_interrupt())
+		cntr->lc_sum += amount;
+	else
+		cntr->lc_sum_irq += amount;
+	if (hdr->lc_config & LPROCFS_CNTR_STDDEV)
+		cntr->lc_sumsquare += (__s64)amount * amount;
+	if (amount < cntr->lc_min)
+		cntr->lc_min = amount;
+	if (amount > cntr->lc_max)
+		cntr->lc_max = amount;
+unlock:
+	lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags);
+}
+EXPORT_SYMBOL(lprocfs_counter_add);
+
+/* Subtract @amount from the running sum of counter @idx in @stats; only
+ * AVGMINMAX counters keep a sum, and count/min/max are left untouched. */
+void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx, long amount)
+{
+	struct lprocfs_counter_header	*hdr;
+	struct lprocfs_counter		*cntr;
+	unsigned long			flags = 0;
+	int				cpu;
+
+	if (stats == NULL)
+		return;
+
+	/* Per-client stats only allocate a single CPU area, so the id
+	 * returned here should always be 0 in that case. */
+	cpu = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags);
+	if (cpu < 0)
+		return;
+
+	hdr = &stats->ls_cnt_header[idx];
+	cntr = lprocfs_stats_counter_get(stats, cpu, idx);
+	if (!(hdr->lc_config & LPROCFS_CNTR_AVGMINMAX))
+		goto unlock;
+
+	/*
+	 * Memory is sometimes freed from RCU callbacks, which may run in
+	 * softirq context and end up here (bz20650); that is the only softirq
+	 * caller, and it uses a separate counter.  Only obd_memory sets
+	 * LPROCFS_STATS_FLAG_IRQ_SAFE, as it needs exact counts for the
+	 * memory leak check.
+	 */
+	if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) == 0 ||
+	    !in_interrupt())
+		cntr->lc_sum -= amount;
+	else
+		cntr->lc_sum_irq -= amount;
+unlock:
+	lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags);
+}
+EXPORT_SYMBOL(lprocfs_counter_sub);
+
+/* Allocate and initialise the counter area of @stats for CPU slot @cpuid.
+ * Returns 0 on success or -ENOMEM if the allocation left the slot NULL. */
+int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, unsigned int cpuid)
+{
+	unsigned long	flags = 0;
+	unsigned int	size;
+	int		irqsafe;
+	int		i;
+
+	LASSERT(stats->ls_percpu[cpuid] == NULL);
+	LASSERT((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0);
+
+	size = lprocfs_stats_counter_size(stats);
+	LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[cpuid], size);
+	if (stats->ls_percpu[cpuid] == NULL)
+		return -ENOMEM;
+
+	/* ls_biggest_alloc_num is re-checked under ls_lock before updating */
+	if (unlikely(stats->ls_biggest_alloc_num <= cpuid)) {
+		irqsafe = (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0;
+		if (irqsafe)
+			spin_lock_irqsave(&stats->ls_lock, flags);
+		else
+			spin_lock(&stats->ls_lock);
+		if (stats->ls_biggest_alloc_num <= cpuid)
+			stats->ls_biggest_alloc_num = cpuid + 1;
+		if (irqsafe)
+			spin_unlock_irqrestore(&stats->ls_lock, flags);
+		else
+			spin_unlock(&stats->ls_lock);
+	}
+	/* give every counter's lc_min its non-zero initial value */
+	for (i = 0; i < stats->ls_num; ++i)
+		lprocfs_stats_counter_get(stats, cpuid, i)->lc_min =
+			LC_MIN_INIT;
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_stats_alloc_one);
+#endif  /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/lvfs/lvfs_linux.c b/drivers/staging/lustre/lustre/lvfs/lvfs_linux.c
new file mode 100644
index 000000000000..1e6f32c3549b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lvfs/lvfs_linux.c
@@ -0,0 +1,295 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lvfs/lvfs_linux.c
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FILTER
+
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <asm/unistd.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/version.h>
+#include <linux/libcfs/libcfs.h>
+#include <lustre_fsfilt.h>
+#include <obd.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/lustre_compat25.h>
+#include <lvfs.h>
+
+#include <obd.h>
+#include <lustre_lib.h>
+
+struct lprocfs_stats *obd_memory;
+EXPORT_SYMBOL(obd_memory);
+/* refine later and change to seqlock or similar from libcfs */
+
+/* Context assertions; they expand to no-ops unless OBD_CTXT_DEBUG is set */
+#ifdef OBD_CTXT_DEBUG
+# define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC)
+# define ASSERT_NOT_KERNEL_CTXT(msg) LASSERTF(!segment_eq(get_fs(), get_ds()),\
+					      msg)
+# define ASSERT_KERNEL_CTXT(msg) LASSERTF(segment_eq(get_fs(), get_ds()), msg)
+#else
+# define ASSERT_CTXT_MAGIC(magic) do {} while(0)
+# define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0)
+# define ASSERT_KERNEL_CTXT(msg) do {} while(0)
+#endif /* OBD_CTXT_DEBUG */
+
+/* Install @ginfo for current; no task_lock(), as prepare_creds() may sleep. */
+static void push_group_info(struct lvfs_run_ctxt *save,
+			    struct group_info *ginfo)
+{
+	struct cred *cred;
+
+	if (!ginfo) {
+		save->ngroups = current_ngroups;
+		current_ngroups = 0;
+		return;
+	}
+	save->group_info = current_cred()->group_info;
+	if ((cred = prepare_creds())) {
+		cred->group_info = ginfo;
+		commit_creds(cred);
+	}
+}
+
+/* Undo push_group_info(); no task_lock(), since prepare_creds() may sleep. */
+static void pop_group_info(struct lvfs_run_ctxt *save,
+			   struct group_info *ginfo)
+{
+	struct cred *cred;
+
+	if (!ginfo) {
+		current_ngroups = save->ngroups;
+		return;
+	}
+	if ((cred = prepare_creds())) {
+		cred->group_info = save->group_info;
+		commit_creds(cred);
+	}
+}
+
+/* push / pop to root of obd store */
+void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx,
+	       struct lvfs_ucred *uc)
+{
+	/* if there is an underlying dt_device then push_ctxt is not needed */
+	if (new_ctx->dt != NULL)
+		return;
+
+	//ASSERT_NOT_KERNEL_CTXT("already in kernel context!\n");
+	ASSERT_CTXT_MAGIC(new_ctx->magic);
+	OBD_SET_CTXT_MAGIC(save);
+	/* save the caller's get_fs() value, cwd, umask and ngroups in @save */
+	save->fs = get_fs();
+	LASSERT(d_refcount(cfs_fs_pwd(current->fs)));
+	LASSERT(d_refcount(new_ctx->pwd));
+	save->pwd = dget(cfs_fs_pwd(current->fs));
+	save->pwdmnt = mntget(cfs_fs_mnt(current->fs));
+	save->luc.luc_umask = current_umask();
+	save->ngroups = current_cred()->group_info->ngroups;
+
+	LASSERT(save->pwd);
+	LASSERT(save->pwdmnt);
+	LASSERT(new_ctx->pwd);
+	LASSERT(new_ctx->pwdmnt);
+	/* with @uc, also save the caller's ids/caps and switch to those in @uc */
+	if (uc) {
+		struct cred *cred;
+		save->luc.luc_uid = current_uid();
+		save->luc.luc_gid = current_gid();
+		save->luc.luc_fsuid = current_fsuid();
+		save->luc.luc_fsgid = current_fsgid();
+		save->luc.luc_cap = current_cap();
+
+		if ((cred = prepare_creds())) {
+			cred->uid = uc->luc_uid;
+			cred->gid = uc->luc_gid;
+			cred->fsuid = uc->luc_fsuid;
+			cred->fsgid = uc->luc_fsgid;
+			cred->cap_effective = uc->luc_cap;
+			commit_creds(cred);
+		}
+		/* NOTE(review): a failed prepare_creds() above is silently skipped */
+		push_group_info(save,
+				uc->luc_ginfo ?:
+				uc->luc_identity ? uc->luc_identity->mi_ginfo :
+						   NULL);
+	}
+	current->fs->umask = 0; /* umask already applied on client */
+	set_fs(new_ctx->fs);
+	ll_set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd);
+}
+EXPORT_SYMBOL(push_ctxt);
+
+void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx,
+	      struct lvfs_ucred *uc)
+{
+	/* if there is an underlying dt_device then pop_ctxt is not needed */
+	if (new_ctx->dt != NULL)
+		return;
+
+	ASSERT_CTXT_MAGIC(saved->magic);
+	ASSERT_KERNEL_CTXT("popping non-kernel context!\n");
+	/* current's cwd and mount must still be the ones held in @new_ctx */
+	LASSERTF(cfs_fs_pwd(current->fs) == new_ctx->pwd, "%p != %p\n",
+		 cfs_fs_pwd(current->fs), new_ctx->pwd);
+	LASSERTF(cfs_fs_mnt(current->fs) == new_ctx->pwdmnt, "%p != %p\n",
+		 cfs_fs_mnt(current->fs), new_ctx->pwdmnt);
+	/* put back the saved get_fs() value and working directory */
+	set_fs(saved->fs);
+	ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd);
+	/* release the references push_ctxt() took with dget()/mntget() */
+	dput(saved->pwd);
+	mntput(saved->pwdmnt);
+	current->fs->umask = saved->luc.luc_umask;
+	if (uc) {
+		struct cred *cred;
+		if ((cred = prepare_creds())) {
+			cred->uid = saved->luc.luc_uid;
+			cred->gid = saved->luc.luc_gid;
+			cred->fsuid = saved->luc.luc_fsuid;
+			cred->fsgid = saved->luc.luc_fsgid;
+			cred->cap_effective = saved->luc.luc_cap;
+			commit_creds(cred);
+		}
+		/* NOTE(review): a failed prepare_creds() above is silently skipped */
+		pop_group_info(saved,
+			       uc->luc_ginfo ?:
+			       uc->luc_identity ? uc->luc_identity->mi_ginfo :
+						  NULL);
+	}
+}
+EXPORT_SYMBOL(pop_ctxt);
+
+/* utility to rename a file */
+int lustre_rename(struct dentry *dir, struct vfsmount *mnt,
+		  char *oldname, char *newname)
+{
+	struct dentry *src, *dst;
+	int oldlen = strlen(oldname);
+	int newlen = strlen(newname);
+	int rc = 0;
+	ENTRY;
+
+	ASSERT_KERNEL_CTXT("kernel doing rename outside kernel context\n");
+	CDEBUG(D_INODE, "renaming file %.*s to %.*s\n",
+	       oldlen, oldname, newlen, newname);
+
+	src = ll_lookup_one_len(oldname, dir, oldlen);
+	if (IS_ERR(src))
+		RETURN(PTR_ERR(src));
+	if (src->d_inode == NULL)
+		GOTO(put_old, rc = -ENOENT);
+
+	dst = ll_lookup_one_len(newname, dir, newlen);
+	if (IS_ERR(dst))
+		GOTO(put_old, rc = PTR_ERR(dst));
+
+	/* both names live in @dir, so it is old and new parent alike */
+	rc = ll_vfs_rename(dir->d_inode, src, mnt, dir->d_inode, dst, mnt);
+	dput(dst);
+put_old:
+	dput(src);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(lustre_rename);
+
+/* Note: dput(dchild) will *not* be called if there is an error */
+struct l_file *l_dentry_open(struct lvfs_run_ctxt *ctxt, struct l_dentry *de,
+			     int flags)
+{
+	struct path where;
+
+	where.dentry = de;
+	where.mnt = ctxt->pwdmnt;
+	return ll_dentry_open(&where, flags, current_cred());
+}
+EXPORT_SYMBOL(l_dentry_open);
+
+#ifdef LPROCFS
+/*
+ * Return the value selected by @field from counter @lc and its @header, or
+ * 0 when either pointer is NULL or @field is not recognised.  @flags are the
+ * stats flags: with LPROCFS_STATS_FLAG_IRQ_SAFE set the sum also includes
+ * lc_sum_irq.  Plain return is used because the function never calls ENTRY.
+ */
+__s64 lprocfs_read_helper(struct lprocfs_counter *lc,
+			  struct lprocfs_counter_header *header,
+			  enum lprocfs_stats_flags flags,
+			  enum lprocfs_fields_flags field)
+{
+	__s64 ret;
+
+	if (lc == NULL || header == NULL)
+		return 0;
+
+	switch (field) {
+	case LPROCFS_FIELDS_FLAGS_CONFIG:
+		return header->lc_config;
+	case LPROCFS_FIELDS_FLAGS_SUM:
+		ret = lc->lc_sum;
+		if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+			ret += lc->lc_sum_irq;
+		return ret;
+	case LPROCFS_FIELDS_FLAGS_MIN:
+		return lc->lc_min;
+	case LPROCFS_FIELDS_FLAGS_MAX:
+		return lc->lc_max;
+	case LPROCFS_FIELDS_FLAGS_AVG:
+		/* NOTE(review): half the min-max spread, not lc_sum / lc_count;
+		 * kept as is -- confirm what readers of this field expect. */
+		return (lc->lc_max - lc->lc_min) / 2;
+	case LPROCFS_FIELDS_FLAGS_SUMSQUARE:
+		return lc->lc_sumsquare;
+	case LPROCFS_FIELDS_FLAGS_COUNT:
+		return lc->lc_count;
+	default:
+		return 0;
+	}
+}
+EXPORT_SYMBOL(lprocfs_read_helper);
+#endif /* LPROCFS */
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre VFS Filesystem Helper v0.1");
+MODULE_LICENSE("GPL");
diff --git a/drivers/staging/lustre/lustre/mdc/Makefile b/drivers/staging/lustre/lustre/mdc/Makefile
new file mode 100644
index 000000000000..93bae242e761
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mdc/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LUSTRE_FS) += mdc.o
+mdc-y := mdc_request.o mdc_reint.o lproc_mdc.o mdc_lib.o mdc_locks.o
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/mdc/lproc_mdc.c b/drivers/staging/lustre/lustre/mdc/lproc_mdc.c
new file mode 100644
index 000000000000..a6a8a0d3d009
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mdc/lproc_mdc.c
@@ -0,0 +1,183 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/version.h>
+#include <linux/vfs.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+
+#ifdef LPROCFS
+
+/* proc read handler: print the client's max_rpcs_in_flight under list lock */
+static int mdc_rd_max_rpcs_in_flight(char *page, char **start, off_t off,
+				     int count, int *eof, void *data)
+{
+	struct client_obd *cli = &((struct obd_device *)data)->u.cli;
+	int len;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	len = snprintf(page, count, "%u\n", cli->cl_max_rpcs_in_flight);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+	return len;
+}
+
+/* proc write handler: set max_rpcs_in_flight, limited to 1..MDC_MAX_RIF_MAX.
+ * Returns @count on success, -ERANGE or the parse error otherwise. */
+static int mdc_wr_max_rpcs_in_flight(struct file *file, const char *buffer,
+				     unsigned long count, void *data)
+{
+	struct client_obd *cli = &((struct obd_device *)data)->u.cli;
+	int val;
+	int rc = lprocfs_write_helper(buffer, count, &val);
+
+	if (rc != 0)
+		return rc;
+	if (val > MDC_MAX_RIF_MAX || val < 1)
+		return -ERANGE;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	cli->cl_max_rpcs_in_flight = val;
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+	return count;
+}
+
+/* temporary for testing */
+/* Test hook: send a mocked-up two-item HSM action list to the fd written to
+ * this file (fd 0 broadcasts to KUC_GRP_HSM).  Returns @count or -errno. */
+static int mdc_wr_kuc(struct file *file, const char *buffer,
+		      unsigned long count, void *data)
+{
+	struct obd_device	*obd = data;
+	struct kuc_hdr		*lh;
+	struct hsm_action_list	*hal;
+	struct hsm_action_item	*hai;
+	struct file		*fp;
+	int			 len;
+	int			 fd, rc;
+	ENTRY;
+
+	rc = lprocfs_write_helper(buffer, count, &fd);
+	if (rc)
+		RETURN(rc);
+	if (fd < 0)
+		RETURN(-ERANGE);
+	CWARN("message to fd %d\n", fd);
+	len = sizeof(*lh) + sizeof(*hal) + MTI_NAME_MAXLEN +
+		/* for mockup below */ 2 * cfs_size_round(sizeof(*hai));
+	OBD_ALLOC(lh, len);
+	if (lh == NULL)		/* the header is written to right below */
+		RETURN(-ENOMEM);
+
+	lh->kuc_magic = KUC_MAGIC;
+	lh->kuc_transport = KUC_TRANSPORT_HSM;
+	lh->kuc_msgtype = HMT_ACTION_LIST;
+	lh->kuc_msglen = len;
+	hal = (struct hsm_action_list *)(lh + 1);
+	hal->hal_version = HAL_VERSION;
+	hal->hal_archive_id = 1;
+	hal->hal_flags = 0;
+	obd_uuid2fsname(hal->hal_fsname, obd->obd_name, MTI_NAME_MAXLEN);
+	/* mock up an action list */
+	hal->hal_count = 2;
+	hai = hai_zero(hal);
+	hai->hai_action = HSMA_ARCHIVE;
+	hai->hai_fid.f_oid = 5;
+	hai->hai_len = sizeof(*hai);
+	hai = hai_next(hai);
+	hai->hai_action = HSMA_RESTORE;
+	hai->hai_fid.f_oid = 10;
+	hai->hai_len = sizeof(*hai);
+
+	/* This works for either broadcast or unicast to a single fd */
+	if (fd == 0) {
+		rc = libcfs_kkuc_group_put(KUC_GRP_HSM, lh);
+	} else if ((fp = fget(fd)) != NULL) {
+		rc = libcfs_kkuc_msg_put(fp, lh);
+		fput(fp);
+	} else {
+		rc = -EBADF;	/* fget() found no open file for this fd */
+	}
+	OBD_FREE(lh, len);
+	if (rc < 0)
+		RETURN(rc);
+	RETURN(count);
+}
+
+static struct lprocfs_vars lprocfs_mdc_obd_vars[] = {
+	{ "uuid",	    lprocfs_rd_uuid,	0, 0 },
+	{ "ping",	    0, lprocfs_wr_ping,     0, 0, 0222 },
+	{ "connect_flags",   lprocfs_rd_connect_flags, 0, 0 },
+	{ "blocksize",       lprocfs_rd_blksize,     0, 0 },
+	{ "kbytestotal",     lprocfs_rd_kbytestotal, 0, 0 },
+	{ "kbytesfree",      lprocfs_rd_kbytesfree,  0, 0 },
+	{ "kbytesavail",     lprocfs_rd_kbytesavail, 0, 0 },
+	{ "filestotal",      lprocfs_rd_filestotal,  0, 0 },
+	{ "filesfree",       lprocfs_rd_filesfree,   0, 0 },
+	/*{ "filegroups",      lprocfs_rd_filegroups,  0, 0 },*/
+	{ "mds_server_uuid", lprocfs_rd_server_uuid, 0, 0 },
+	{ "mds_conn_uuid",   lprocfs_rd_conn_uuid,   0, 0 },
+	/*
+	 * FIXME: the proc entry below is provided but not used; instead
+	 * sbi->sb_md_brw_size is used. The per-obd variable should be used
+	 * when CMD is enabled and dir pages are managed in the MDC layer.
+	 * Remember to enable the proc write function.
+	 */
+	{ "max_pages_per_rpc",  lprocfs_obd_rd_max_pages_per_rpc,
+				/* lprocfs_obd_wr_max_pages_per_rpc */0, 0 },
+	{ "max_rpcs_in_flight", mdc_rd_max_rpcs_in_flight,
+				mdc_wr_max_rpcs_in_flight, 0 },
+	{ "timeouts",	lprocfs_rd_timeouts,    0, 0 },
+	{ "import",	  lprocfs_rd_import,      lprocfs_wr_import, 0 },
+	{ "state",	   lprocfs_rd_state,       0, 0 },
+	{ "hsm_nl",	  0, mdc_wr_kuc,	  0, 0, 0200 },
+	{ "pinger_recov",    lprocfs_rd_pinger_recov,
+			     lprocfs_wr_pinger_recov, 0, 0 },
+	{ 0 }
+};
+
+static struct lprocfs_vars lprocfs_mdc_module_vars[] = {
+	{ "num_refs",	lprocfs_rd_numrefs,     0, 0 },
+	{ 0 }	/* presumably the end-of-table sentinel -- TODO confirm */
+};
+
+void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars)
+{
+	lvars->obd_vars = lprocfs_mdc_obd_vars;
+	lvars->module_vars = lprocfs_mdc_module_vars;
+}
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_internal.h b/drivers/staging/lustre/lustre/mdc/mdc_internal.h
new file mode 100644
index 000000000000..2aeff0ecec34
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mdc/mdc_internal.h
@@ -0,0 +1,180 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _MDC_INTERNAL_H
+#define _MDC_INTERNAL_H
+
+#include <lustre_mdc.h>
+#include <lustre_mds.h>
+
+#ifdef LPROCFS
+void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars);
+#else	/* !LPROCFS: inline stub that only zeroes @lvars */
+static inline void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars)
+{
+	memset(lvars, 0, sizeof(*lvars));
+}
+#endif /* LPROCFS */
+
+void mdc_pack_body(struct ptlrpc_request *req, const struct lu_fid *fid,
+		   struct obd_capa *oc, __u64 valid, int ea_size,
+		   __u32 suppgid, int flags);
+void mdc_pack_capa(struct ptlrpc_request *req,
+		   const struct req_msg_field *field, struct obd_capa *oc);
+int mdc_pack_req(struct ptlrpc_request *req, int version, int opc);
+void mdc_is_subdir_pack(struct ptlrpc_request *req, const struct lu_fid *pfid,
+			const struct lu_fid *cfid, int flags);
+void mdc_swap_layouts_pack(struct ptlrpc_request *req,
+			   struct md_op_data *op_data);
+void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff, __u32 size,
+		      const struct lu_fid *fid, struct obd_capa *oc);
+void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, int flags,
+		      struct md_op_data *data, int ea_size);
+void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+		     void *ea, int ealen, void *ea2, int ea2len);
+void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+		     const void *data, int datalen, __u32 mode, __u32 uid,
+		     __u32 gid, cfs_cap_t capability, __u64 rdev);
+void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+		   __u32 mode, __u64 rdev, __u32 flags, const void *data,
+		   int datalen);
+void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data);
+void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data);
+void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+		     const char *old, int oldlen, const char *new, int newlen);
+void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data);
+int mdc_enter_request(struct client_obd *cli);
+void mdc_exit_request(struct client_obd *cli);
+
+/* mdc/mdc_locks.c */
+int mdc_set_lock_data(struct obd_export *exp,
+		      __u64 *lockh, void *data, __u64 *bits);
+
+int mdc_null_inode(struct obd_export *exp, const struct lu_fid *fid);
+
+int mdc_find_cbdata(struct obd_export *exp, const struct lu_fid *fid,
+		    ldlm_iterator_t it, void *data);
+
+int mdc_intent_lock(struct obd_export *exp,
+		    struct md_op_data *,
+		    void *lmm, int lmmsize,
+		    struct lookup_intent *, int,
+		    struct ptlrpc_request **reqp,
+		    ldlm_blocking_callback cb_blocking,
+		    __u64 extra_lock_flags);
+int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
+		struct lookup_intent *it, struct md_op_data *op_data,
+		struct lustre_handle *lockh, void *lmm, int lmmsize,
+		struct ptlrpc_request **req, __u64 extra_lock_flags);
+
+int mdc_resource_get_unused(struct obd_export *exp, struct lu_fid *fid,
+			    struct list_head *cancels, ldlm_mode_t mode,
+			    __u64 bits);
+/* mdc/mdc_request.c */
+int mdc_fid_alloc(struct obd_export *exp, struct lu_fid *fid,
+		  struct md_op_data *op_data);
+
+int mdc_open(struct obd_export *exp, obd_id ino, int type, int flags,
+	     struct lov_mds_md *lmm, int lmm_size, struct lustre_handle *fh,
+	     struct ptlrpc_request **);
+
+struct obd_client_handle;
+
+int mdc_get_lustre_md(struct obd_export *md_exp, struct ptlrpc_request *req,
+		      struct obd_export *dt_exp, struct obd_export *lmv_exp,
+		      struct lustre_md *md);
+
+int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md);
+
+int mdc_set_open_replay_data(struct obd_export *exp,
+			     struct obd_client_handle *och,
+			     struct ptlrpc_request *open_req);
+
+int mdc_clear_open_replay_data(struct obd_export *exp,
+			       struct obd_client_handle *och);
+void mdc_commit_open(struct ptlrpc_request *req);
+void mdc_replay_open(struct ptlrpc_request *req);
+
+int mdc_create(struct obd_export *exp, struct md_op_data *op_data,
+	       const void *data, int datalen, int mode, __u32 uid, __u32 gid,
+	       cfs_cap_t capability, __u64 rdev,
+	       struct ptlrpc_request **request);
+int mdc_link(struct obd_export *exp, struct md_op_data *op_data,
+	     struct ptlrpc_request **request);
+int mdc_rename(struct obd_export *exp, struct md_op_data *op_data,
+	       const char *old, int oldlen, const char *new, int newlen,
+	       struct ptlrpc_request **request);
+int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data,
+		void *ea, int ealen, void *ea2, int ea2len,
+		struct ptlrpc_request **request, struct md_open_data **mod);
+int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data,
+	       struct ptlrpc_request **request);
+int mdc_cancel_unused(struct obd_export *exp, const struct lu_fid *fid,
+		      ldlm_policy_data_t *policy, ldlm_mode_t mode,
+		      ldlm_cancel_flags_t flags, void *opaque);
+
+/* With no capability, shrink the client-side @field of @req to zero bytes. */
+static inline void mdc_set_capa_size(struct ptlrpc_request *req,
+				     const struct req_msg_field *field,
+				     struct obd_capa *oc)
+{
+	/* otherwise it is already calculated as sizeof struct obd_capa */
+	if (oc != NULL)
+		return;
+	req_capsule_set_size(&req->rq_pill, field, RCL_CLIENT, 0);
+}
+
+int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it,
+			struct lu_fid *fid, __u64 *bits);
+
+int mdc_intent_getattr_async(struct obd_export *exp,
+			     struct md_enqueue_info *minfo,
+			     struct ldlm_enqueue_info *einfo);
+
+ldlm_mode_t mdc_lock_match(struct obd_export *exp, __u64 flags,
+			   const struct lu_fid *fid, ldlm_type_t type,
+			   ldlm_policy_data_t *policy, ldlm_mode_t mode,
+			   struct lustre_handle *lockh);
+/* ldlm_prep_elc_req() wrapper passing LUSTRE_MDS_VERSION and 0 as 5th arg */
+static inline int mdc_prep_elc_req(struct obd_export *exp,
+				   struct ptlrpc_request *req, int opc,
+				   struct list_head *cancels, int count)
+{
+	return ldlm_prep_elc_req(exp, req, LUSTRE_MDS_VERSION, opc, 0, cancels,
+				 count);
+}
+
+#endif
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_lib.c b/drivers/staging/lustre/lustre/mdc/mdc_lib.c
new file mode 100644
index 000000000000..05c6968119c8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mdc/mdc_lib.c
@@ -0,0 +1,564 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_MDC
+#include <lustre_net.h>
+#include <lustre/lustre_idl.h>
+#include "mdc_internal.h"
+
+
+static void __mdc_pack_body(struct mdt_body *b, __u32 suppgid)
+{
+	/* Stamp @b with the calling task's ids and capabilities and @suppgid. */
+	LASSERT(b != NULL);
+	b->capability = cfs_curproc_cap_pack();
+	b->fsgid = current_fsgid();
+	b->fsuid = current_fsuid();
+	b->gid = current_gid();
+	b->uid = current_uid();
+	b->suppgid = suppgid;
+}
+
+void mdc_pack_capa(struct ptlrpc_request *req, const struct req_msg_field *field,
+		   struct obd_capa *oc)
+{
+	struct req_capsule *pill = &req->rq_pill;
+
+	if (oc != NULL) {
+		struct lustre_capa *dst = req_capsule_client_get(pill, field);
+
+		LASSERT(dst != NULL);
+		capa_cpy(dst, oc);
+		DEBUG_CAPA(D_SEC, dst, "pack");
+		return;
+	}
+	/* Without a capability the buffer must have been sized to zero. */
+	LASSERT(req_capsule_get_size(pill, field, RCL_CLIENT) == 0);
+}
+
+void mdc_is_subdir_pack(struct ptlrpc_request *req, const struct lu_fid *pfid,
+			const struct lu_fid *cfid, int flags)
+{
+	struct mdt_body *body;
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY);
+	body->flags = flags;
+	if (cfid != NULL)
+		body->fid2 = *cfid;
+	if (pfid != NULL) {	/* OBD_MD_FLID is set only with a parent fid */
+		body->valid = OBD_MD_FLID;
+		body->fid1 = *pfid;
+	}
+}
+
+void mdc_swap_layouts_pack(struct ptlrpc_request *req,
+			   struct md_op_data *op_data)
+{
+	struct mdt_body *body;
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY);
+	__mdc_pack_body(body, op_data->op_suppgids[0]);
+	body->valid |= OBD_MD_FLID;
+	body->fid2 = op_data->op_fid2;
+	body->fid1 = op_data->op_fid1;
+	/* op_capa1 and op_capa2 fill the CAPA1 and CAPA2 buffers in turn. */
+	mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+	mdc_pack_capa(req, &RMF_CAPA2, op_data->op_capa2);
+}
+
+void mdc_pack_body(struct ptlrpc_request *req,
+		   const struct lu_fid *fid, struct obd_capa *oc,
+		   __u64 valid, int ea_size, __u32 suppgid, int flags)
+{
+	struct mdt_body *body;
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY);
+	LASSERT(body != NULL);
+	__mdc_pack_body(body, suppgid);
+	body->flags = flags;
+	body->eadatasize = ea_size;
+	body->valid = (fid != NULL) ? (valid | OBD_MD_FLID) : valid;
+	if (fid == NULL)	/* no fid: no capability is packed either */
+		return;
+	body->fid1 = *fid;
+	mdc_pack_capa(req, &RMF_CAPA1, oc);
+}
+
+void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff,
+		      __u32 size, const struct lu_fid *fid, struct obd_capa *oc)
+{	/* pack the MDT body of a readdir request for directory @fid */
+	struct mdt_body *b = req_capsule_client_get(&req->rq_pill,
+						    &RMF_MDT_BODY);
+	b->fid1 = *fid;
+	b->valid |= OBD_MD_FLID;
+	b->size = pgoff;	/* !! the size field carries the page offset */
+	b->nlink = size;	/* !! the nlink field carries the @size argument */
+	__mdc_pack_body(b, -1);	/* credentials, with a suppgid of -1 */
+	b->mode = LUDA_FID | LUDA_TYPE;	/* the mode field carries LUDA_* flags */
+	/* @oc may be NULL; mdc_pack_capa() handles that case itself. */
+	mdc_pack_capa(req, &RMF_CAPA1, oc);
+}
+
+/* Pack a REINT_CREATE record, the entry name and an optional data blob. */
+void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+		     const void *data, int datalen, __u32 mode,
+		     __u32 uid, __u32 gid, cfs_cap_t cap_effective, __u64 rdev)
+{
+	struct mdt_rec_create	*rec;
+	char			*buf;
+	__u64			 cr_flags;
+
+	CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_create));
+	rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+
+	/* op_flags masked by MF_SOM_LOCAL_FLAGS, plus VOLATILE on request. */
+	cr_flags = op_data->op_flags & MF_SOM_LOCAL_FLAGS;
+	if (op_data->op_bias & MDS_CREATE_VOLATILE)
+		cr_flags |= MDS_OPEN_VOLATILE;
+	set_mrc_cr_flags(rec, cr_flags);
+
+	rec->cr_opcode   = REINT_CREATE;
+	rec->cr_mode     = mode;
+	rec->cr_rdev     = rdev;
+	rec->cr_umask    = current_umask();
+	rec->cr_bias     = op_data->op_bias;
+	rec->cr_time     = op_data->op_mod_time;
+	rec->cr_cap      = cap_effective;
+	rec->cr_fsuid    = uid;
+	rec->cr_fsgid    = gid;
+	rec->cr_fid1     = op_data->op_fid1;
+	rec->cr_fid2     = op_data->op_fid2;
+	rec->cr_suppgid1 = op_data->op_suppgids[0];
+	rec->cr_suppgid2 = op_data->op_suppgids[1];
+
+	mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+	buf = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+	LOGL0(op_data->op_name, op_data->op_namelen, buf);
+	if (data == NULL)
+		return;
+	/* The optional blob is copied verbatim into the EADATA buffer. */
+	buf = req_capsule_client_get(&req->rq_pill, &RMF_EADATA);
+	memcpy(buf, data, datalen);
+}
+
+/*
+ * Build the open flags that go on the wire from the caller's @flags.
+ * The bits named in the first mask are copied through unchanged; every
+ * other recognised flag is mapped to its MDS_* counterpart one by one.
+ * @mode is accepted but not used.
+ */
+static __u64 mds_pack_open_flags(__u32 flags, __u32 mode)
+{
+	__u64 wire;
+
+	wire = flags & (FMODE_READ | FMODE_WRITE | MDS_OPEN_HAS_EA |
+			MDS_OPEN_HAS_OBJS | MDS_OPEN_OWNEROVERRIDE |
+			MDS_OPEN_LOCK | MDS_OPEN_BY_FID);
+
+	wire |= (flags & O_CREAT) ? MDS_OPEN_CREAT : 0;
+	wire |= (flags & O_EXCL) ? MDS_OPEN_EXCL : 0;
+	wire |= (flags & O_TRUNC) ? MDS_OPEN_TRUNC : 0;
+	wire |= (flags & O_APPEND) ? MDS_OPEN_APPEND : 0;
+	wire |= (flags & O_SYNC) ? MDS_OPEN_SYNC : 0;
+	wire |= (flags & O_DIRECTORY) ? MDS_OPEN_DIRECTORY : 0;
+#ifdef FMODE_EXEC
+	wire |= (flags & FMODE_EXEC) ? MDS_FMODE_EXEC : 0;
+#endif
+	wire |= (flags & O_LOV_DELAY_CREATE) ? MDS_OPEN_DELAY_CREATE : 0;
+
+	/* Either O_NOACCESS or O_NONBLOCK asks for MDS_OPEN_NORESTORE. */
+	wire |= (flags & (O_NOACCESS | O_NONBLOCK)) ? MDS_OPEN_NORESTORE : 0;
+
+	return wire;
+}
+
+/* Pack a REINT_OPEN record, optional name and LOV EA (@lmm) into @req.
+ * @op_data must not be NULL: it is dereferenced unconditionally below. */
+void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+		   __u32 mode, __u64 rdev, __u32 flags, const void *lmm,
+		   int lmmlen)
+{
+	struct mdt_rec_create *rec;
+	char *tmp;
+	__u64 cr_flags;
+
+	CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_create));
+	LASSERT(op_data != NULL);
+	rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+
+	/* XXX do something about time, uid, gid */
+	rec->cr_opcode   = REINT_OPEN;
+	rec->cr_fsuid    = current_fsuid();
+	rec->cr_fsgid    = current_fsgid();
+	rec->cr_cap      = cfs_curproc_cap_pack();
+	rec->cr_fid1     = op_data->op_fid1;
+	rec->cr_fid2     = op_data->op_fid2;
+	rec->cr_mode     = mode;
+	cr_flags = mds_pack_open_flags(flags, mode);
+	rec->cr_rdev     = rdev;
+	rec->cr_time     = op_data->op_mod_time;
+	rec->cr_suppgid1 = op_data->op_suppgids[0];
+	rec->cr_suppgid2 = op_data->op_suppgids[1];
+	rec->cr_bias     = op_data->op_bias;
+	rec->cr_umask    = current_umask();
+
+	mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+	/* the next buffer is child capa, which is used for replay,
+	 * will be packed from the data in reply message. */
+
+	if (op_data->op_name) {
+		tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+		LOGL0(op_data->op_name, op_data->op_namelen, tmp);
+		if (op_data->op_bias & MDS_CREATE_VOLATILE)
+			cr_flags |= MDS_OPEN_VOLATILE;
+	}
+
+	if (lmm) {
+		cr_flags |= MDS_OPEN_HAS_EA;
+		tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA);
+		memcpy(tmp, lmm, lmmlen);
+	}
+	set_mrc_cr_flags(rec, cr_flags);
+}
+
+/*
+ * Translate the ATTR_* bits of @ia_valid into the MDS_ATTR_* bits stored
+ * in the setattr record.  Each set source bit contributes its mapped
+ * destination bit; bits without a table entry are ignored.
+ */
+static inline __u64 attr_pack(unsigned int ia_valid)
+{
+	static const struct {
+		unsigned int	from;
+		__u64		to;
+	} attr_map[] = {
+		{ ATTR_MODE,		MDS_ATTR_MODE },
+		{ ATTR_UID,		MDS_ATTR_UID },
+		{ ATTR_GID,		MDS_ATTR_GID },
+		{ ATTR_SIZE,		MDS_ATTR_SIZE },
+		{ ATTR_ATIME,		MDS_ATTR_ATIME },
+		{ ATTR_MTIME,		MDS_ATTR_MTIME },
+		{ ATTR_CTIME,		MDS_ATTR_CTIME },
+		{ ATTR_ATIME_SET,	MDS_ATTR_ATIME_SET },
+		{ ATTR_MTIME_SET,	MDS_ATTR_MTIME_SET },
+		{ ATTR_FORCE,		MDS_ATTR_FORCE },
+		{ ATTR_ATTR_FLAG,	MDS_ATTR_ATTR_FLAG },
+		{ ATTR_KILL_SUID,	MDS_ATTR_KILL_SUID },
+		{ ATTR_KILL_SGID,	MDS_ATTR_KILL_SGID },
+		{ ATTR_CTIME_SET,	MDS_ATTR_CTIME_SET },
+		{ ATTR_FROM_OPEN,	MDS_ATTR_FROM_OPEN },
+		{ ATTR_BLOCKS,		MDS_ATTR_BLOCKS },
+		/* NFSD hack (see bug 5781) */
+		{ MDS_OPEN_OWNEROVERRIDE, MDS_OPEN_OWNEROVERRIDE },
+	};
+	__u64 sa_valid = 0;
+	unsigned int i;
+
+	for (i = 0; i < sizeof(attr_map) / sizeof(attr_map[0]); i++) {
+		if (ia_valid & attr_map[i].from)
+			sa_valid |= attr_map[i].to;
+	}
+
+	return sa_valid;
+}
+
+/* Fill a setattr record from @op_data and the calling task's credentials. */
+static void mdc_setattr_pack_rec(struct mdt_rec_setattr *rec,
+				 struct md_op_data *op_data)
+{
+	rec->sa_opcode  = REINT_SETATTR;
+	rec->sa_fsuid   = current_fsuid();
+	rec->sa_fsgid   = current_fsgid();
+	rec->sa_cap     = cfs_curproc_cap_pack();
+
+	rec->sa_fid    = op_data->op_fid1;
+	rec->sa_valid  = attr_pack(op_data->op_attr.ia_valid);
+	rec->sa_mode   = op_data->op_attr.ia_mode;
+	rec->sa_uid    = op_data->op_attr.ia_uid;
+	rec->sa_gid    = op_data->op_attr.ia_gid;
+	rec->sa_size   = op_data->op_attr.ia_size;
+	rec->sa_blocks = op_data->op_attr_blocks;
+	rec->sa_atime  = LTIME_S(op_data->op_attr.ia_atime);
+	rec->sa_mtime  = LTIME_S(op_data->op_attr.ia_mtime);
+	rec->sa_ctime  = LTIME_S(op_data->op_attr.ia_ctime);
+	rec->sa_attr_flags = ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags;
+	/* sa_suppgid: the new gid if the caller is a member, else suppgids[0] */
+	if ((op_data->op_attr.ia_valid & ATTR_GID) &&
+	    current_is_in_group(op_data->op_attr.ia_gid))
+		rec->sa_suppgid = op_data->op_attr.ia_gid;
+	else
+		rec->sa_suppgid = op_data->op_suppgids[0];
+	rec->sa_bias = op_data->op_bias;
+}
+
+static void mdc_ioepoch_pack(struct mdt_ioepoch *epoch,
+			     struct md_op_data *op_data)
+{	/* copy handle, ioepoch and the MF_SOM_LOCAL_FLAGS part of op_flags */
+	epoch->flags = op_data->op_flags & MF_SOM_LOCAL_FLAGS;
+	epoch->ioepoch = op_data->op_ioepoch;
+	memcpy(&epoch->handle, &op_data->op_handle, sizeof(epoch->handle));
+}
+
+void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+		      void *ea, int ealen, void *ea2, int ea2len)
+{
+	struct req_capsule *pill = &req->rq_pill;
+	struct mdt_rec_setattr *rec;
+	struct lov_user_md *lum;
+
+	CLASSERT(sizeof(struct mdt_rec_reint) ==
+		 sizeof(struct mdt_rec_setattr));
+	rec = req_capsule_client_get(pill, &RMF_REC_REINT);
+	mdc_setattr_pack_rec(rec, op_data);
+	mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+
+	/* The epoch buffer is filled only with MF_SOM_CHANGE/MF_EPOCH_OPEN. */
+	if (op_data->op_flags & (MF_SOM_CHANGE | MF_EPOCH_OPEN))
+		mdc_ioepoch_pack(req_capsule_client_get(pill, &RMF_MDT_EPOCH),
+				 op_data);
+
+	if (ealen == 0)
+		return;
+
+	lum = req_capsule_client_get(pill, &RMF_EADATA);
+	if (ea != NULL) {
+		memcpy(lum, ea, ealen);
+	} else {
+		/* Remove LOV EA */
+		lum->lmm_magic = LOV_USER_MAGIC_V1;
+		lum->lmm_stripe_size = 0;
+		lum->lmm_stripe_count = 0;
+		lum->lmm_stripe_offset = (typeof(lum->lmm_stripe_offset))(-1);
+	}
+
+	/* A second blob, when present, goes into the LOGCOOKIES buffer. */
+	if (ea2len != 0)
+		memcpy(req_capsule_client_get(pill, &RMF_LOGCOOKIES), ea2,
+		       ea2len);
+}
+
+void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data)
+{
+	struct mdt_rec_unlink *rec;
+	char *name;
+
+	CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_unlink));
+	rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+	LASSERT(rec != NULL);
+	/* CLI_RM_ENTRY selects REINT_RMENTRY instead of REINT_UNLINK. */
+	if (op_data->op_cli_flags & CLI_RM_ENTRY)
+		rec->ul_opcode = REINT_RMENTRY;
+	else
+		rec->ul_opcode = REINT_UNLINK;
+	rec->ul_fid1     = op_data->op_fid1;
+	rec->ul_fid2     = op_data->op_fid2;
+	rec->ul_mode     = op_data->op_mode;
+	rec->ul_time     = op_data->op_mod_time;
+	rec->ul_bias     = op_data->op_bias;
+	rec->ul_cap      = op_data->op_cap;
+	rec->ul_fsuid    = op_data->op_fsuid;
+	rec->ul_fsgid    = op_data->op_fsgid;
+	rec->ul_suppgid1 = op_data->op_suppgids[0];
+	rec->ul_suppgid2 = -1;
+	mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+	name = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+	LASSERT(name != NULL);
+	LOGL0(op_data->op_name, op_data->op_namelen, name);
+}
+
+void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data)
+{
+	struct mdt_rec_link *rec;
+	char *name;
+
+	CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_link));
+	rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+	LASSERT(rec != NULL);
+
+	/* Credentials are taken from @op_data, not from the current task. */
+	rec->lk_opcode   = REINT_LINK;
+	rec->lk_fid1     = op_data->op_fid1;
+	rec->lk_fid2     = op_data->op_fid2;
+	rec->lk_time     = op_data->op_mod_time;
+	rec->lk_bias     = op_data->op_bias;
+	rec->lk_cap      = op_data->op_cap;
+	rec->lk_fsuid    = op_data->op_fsuid;
+	rec->lk_fsgid    = op_data->op_fsgid;
+	rec->lk_suppgid1 = op_data->op_suppgids[0];
+	rec->lk_suppgid2 = op_data->op_suppgids[1];
+	mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+	mdc_pack_capa(req, &RMF_CAPA2, op_data->op_capa2);
+
+	name = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+	LOGL0(op_data->op_name, op_data->op_namelen, name);
+}
+
+void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+		     const char *old, int oldlen, const char *new, int newlen)
+{
+	struct mdt_rec_rename *rec;
+	char *buf;
+
+	CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_rename));
+	rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+
+	/* XXX do something about time, uid, gid */
+	rec->rn_opcode   = REINT_RENAME;
+	rec->rn_fid1     = op_data->op_fid1;
+	rec->rn_fid2     = op_data->op_fid2;
+	rec->rn_mode     = op_data->op_mode;
+	rec->rn_time     = op_data->op_mod_time;
+	rec->rn_bias     = op_data->op_bias;
+	rec->rn_cap      = op_data->op_cap;
+	rec->rn_fsuid    = op_data->op_fsuid;
+	rec->rn_fsgid    = op_data->op_fsgid;
+	rec->rn_suppgid1 = op_data->op_suppgids[0];
+	rec->rn_suppgid2 = op_data->op_suppgids[1];
+
+	mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+	mdc_pack_capa(req, &RMF_CAPA2, op_data->op_capa2);
+
+	/* @old goes into NAME; @new, when given, goes into SYMTGT. */
+	buf = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+	LOGL0(old, oldlen, buf);
+	if (new == NULL)
+		return;
+	buf = req_capsule_client_get(&req->rq_pill, &RMF_SYMTGT);
+	LOGL0(new, newlen, buf);
+}
+
+void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, int flags,
+		      struct md_op_data *op_data, int ea_size)
+{
+	struct mdt_body *body;
+	char *name;
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY);
+
+	/* Fold OBD_MD_FLID and the bias-driven bits into the valid mask. */
+	valid |= OBD_MD_FLID;
+	if (op_data->op_bias & MDS_CHECK_SPLIT)
+		valid |= OBD_MD_FLCKSPLIT;
+	if (op_data->op_bias & MDS_CROSS_REF)
+		valid |= OBD_MD_FLCROSSREF;
+	body->valid = valid;
+	body->eadatasize = ea_size;
+	body->flags = flags;
+	__mdc_pack_body(body, op_data->op_suppgids[0]);
+	body->fid1 = op_data->op_fid1;
+	body->fid2 = op_data->op_fid2;
+
+	mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+	if (op_data->op_name == NULL)
+		return;
+	name = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+	LOGL0(op_data->op_name, op_data->op_namelen, name);
+}
+
+void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data)
+{
+	struct mdt_ioepoch *epoch;
+	struct mdt_rec_setattr *rec;
+
+	epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH);
+	rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+	/* Same record packing as setattr, then the capability and the epoch. */
+	mdc_setattr_pack_rec(rec, op_data);
+	mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+	mdc_ioepoch_pack(epoch, op_data);
+}
+
+static int mdc_req_avail(struct client_obd *cli, struct mdc_cache_waiter *mcw)
+{	/* wait condition: non-zero once @mcw is no longer on a waiter list */
+	int rc;
+	ENTRY;
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	rc = list_empty(&mcw->mcw_entry);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+	RETURN(rc);
+}
+
+/* We record requests in flight in cli->cl_r_in_flight here.
+ * There is only one write rpc possible in mdc anyway. If this is to change
+ * in the future - the code may need to be revisited. */
+int mdc_enter_request(struct client_obd *cli)
+{
+	int rc = 0;
+	struct mdc_cache_waiter mcw;
+	struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) {
+		list_add_tail(&mcw.mcw_entry, &cli->cl_cache_waiters);
+		init_waitqueue_head(&mcw.mcw_waitq);
+		client_obd_list_unlock(&cli->cl_loi_list_lock);
+		rc = l_wait_event(mcw.mcw_waitq, mdc_req_avail(cli, &mcw), &lwi);
+		if (rc) {	/* wait ended with an error: undo our state */
+			client_obd_list_lock(&cli->cl_loi_list_lock);
+			if (list_empty(&mcw.mcw_entry))
+				cli->cl_r_in_flight--;	/* got a slot */
+			list_del_init(&mcw.mcw_entry);
+			client_obd_list_unlock(&cli->cl_loi_list_lock);
+		}
+	} else {	/* a slot is free: take it right away */
+		cli->cl_r_in_flight++;
+		client_obd_list_unlock(&cli->cl_loi_list_lock);
+	}
+	return rc;	/* 0, or the non-zero l_wait_event() result */
+}
+
+void mdc_exit_request(struct client_obd *cli)
+{	/* give back the caller's slot, then pass free slots to waiters */
+	struct list_head *l, *tmp;
+	struct mdc_cache_waiter *mcw;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	cli->cl_r_in_flight--;
+	list_for_each_safe(l, tmp, &cli->cl_cache_waiters) {
+		if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) {
+			/* No free request slots anymore */
+			break;
+		}
+		/* Account the slot on the waiter's behalf before waking it. */
+		mcw = list_entry(l, struct mdc_cache_waiter, mcw_entry);
+		list_del_init(&mcw->mcw_entry);
+		cli->cl_r_in_flight++;
+		wake_up(&mcw->mcw_waitq);
+	}
+	/* The caller's own slot was already released before the loop. */
+
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+}
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_locks.c b/drivers/staging/lustre/lustre/mdc/mdc_locks.c
new file mode 100644
index 000000000000..1cc90b635fb5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mdc/mdc_locks.c
@@ -0,0 +1,1229 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_MDC
+
+# include <linux/module.h>
+# include <linux/pagemap.h>
+# include <linux/miscdevice.h>
+# include <linux/init.h>
+
+#include <lustre_acl.h>
+#include <obd_class.h>
+#include <lustre_dlm.h>
+/* fid_res_name_eq() */
+#include <lustre_fid.h>
+#include <lprocfs_status.h>
+#include "mdc_internal.h"
+
+struct mdc_getattr_args {	/* NOTE(review): presumably async getattr state */
+	struct obd_export	   *ga_exp;
+	struct md_enqueue_info      *ga_minfo;
+	struct ldlm_enqueue_info    *ga_einfo;
+};	/* member types match mdc_intent_getattr_async()'s three parameters */
+
+int it_disposition(struct lookup_intent *it, int flag)
+{	/* non-zero iff any bit of @flag is set in the intent's disposition */
+	return it->d.lustre.it_disposition & flag;
+}
+EXPORT_SYMBOL(it_disposition);
+
+void it_set_disposition(struct lookup_intent *it, int flag)
+{	/* set the bits of @flag in the intent's disposition */
+	it->d.lustre.it_disposition |= flag;
+}
+EXPORT_SYMBOL(it_set_disposition);
+
+void it_clear_disposition(struct lookup_intent *it, int flag)
+{	/* clear the bits of @flag in the intent's disposition */
+	it->d.lustre.it_disposition &= ~flag;
+}
+EXPORT_SYMBOL(it_clear_disposition);
+
+/*
+ * it_open_error() - status of an intent as seen from a given phase
+ * @phase: one of the DISP_* values, naming the phase being asked about
+ * @it: intent whose disposition bits and status are examined
+ *
+ * The disposition bits are probed in the fixed order DISP_OPEN_OPEN,
+ * DISP_OPEN_CREATE, DISP_LOOKUP_EXECD, DISP_IT_EXECD and only the first
+ * bit found set decides the result: it_status is returned when @phase is
+ * numerically at least that bit, 0 otherwise.  An intent with none of
+ * the four bits set is reported and treated as a bug (LBUG).
+ */
+int it_open_error(int phase, struct lookup_intent *it)
+{
+	static const int disp_order[] = {
+		DISP_OPEN_OPEN,
+		DISP_OPEN_CREATE,
+		DISP_LOOKUP_EXECD,
+		DISP_IT_EXECD,
+	};
+	unsigned int i;
+
+	for (i = 0; i < sizeof(disp_order) / sizeof(disp_order[0]); i++) {
+		if (!it_disposition(it, disp_order[i]))
+			continue;
+		if (phase >= disp_order[i])
+			return it->d.lustre.it_status;
+		return 0;
+	}
+
+	CERROR("it disp: %X, status: %d\n", it->d.lustre.it_disposition,
+	       it->d.lustre.it_status);
+	LBUG();
+	return 0;
+}
+EXPORT_SYMBOL(it_open_error);
+
+/* this must be called on a lockh that is known to have a referenced lock */
+int mdc_set_lock_data(struct obd_export *exp, __u64 *lockh, void *data,
+		      __u64 *bits)
+{	/* store @data as the lr_lvb_inode of the lock's resource; returns 0 */
+	struct ldlm_lock *lock;
+	struct inode *new_inode = data;
+	ENTRY;
+
+	if(bits)
+		*bits = 0;
+	/* A zero handle: nothing to attach @data to, @bits stays 0. */
+	if (!*lockh)
+		RETURN(0);
+
+	lock = ldlm_handle2lock((struct lustre_handle *)lockh);
+
+	LASSERT(lock != NULL);
+	lock_res_and_lock(lock);
+	if (lock->l_resource->lr_lvb_inode &&
+	    lock->l_resource->lr_lvb_inode != data) {
+		struct inode *old_inode = lock->l_resource->lr_lvb_inode;
+		LASSERTF(old_inode->i_state & I_FREEING,
+			 "Found existing inode %p/%lu/%u state %lu in lock: "
+			 "setting data to %p/%lu/%u\n", old_inode,
+			 old_inode->i_ino, old_inode->i_generation,
+			 old_inode->i_state,
+			 new_inode, new_inode->i_ino, new_inode->i_generation);
+	}
+	lock->l_resource->lr_lvb_inode = new_inode;
+	if (bits)
+		*bits = lock->l_policy_data.l_inodebits.bits;
+
+	unlock_res_and_lock(lock);
+	LDLM_LOCK_PUT(lock);
+
+	RETURN(0);
+}
+
+ldlm_mode_t mdc_lock_match(struct obd_export *exp, __u64 flags,
+			   const struct lu_fid *fid, ldlm_type_t type,
+			   ldlm_policy_data_t *policy, ldlm_mode_t mode,
+			   struct lustre_handle *lockh)
+{
+	struct ldlm_namespace *ns = class_exp2obd(exp)->obd_namespace;
+	struct ldlm_res_id res_id;
+	ldlm_mode_t rc;
+	ENTRY;
+	/* Match against the resource name derived from @fid. */
+	fid_build_reg_res_name(fid, &res_id);
+	rc = ldlm_lock_match(ns, flags, &res_id, type, policy, mode, lockh, 0);
+	RETURN(rc);
+}
+
+/* Run ldlm_cli_cancel_unused_resource() on the resource named after @fid.
+ * All other arguments are handed through unchanged and its return value
+ * is passed back to the caller. */
+int mdc_cancel_unused(struct obd_export *exp, const struct lu_fid *fid,
+		      ldlm_policy_data_t *policy, ldlm_mode_t mode,
+		      ldlm_cancel_flags_t flags, void *opaque)
+{
+	struct ldlm_namespace *ns = class_exp2obd(exp)->obd_namespace;
+	struct ldlm_res_id res_id;
+	int rc;
+
+	ENTRY;
+
+	fid_build_reg_res_name(fid, &res_id);
+	rc = ldlm_cli_cancel_unused_resource(ns, &res_id, policy, mode, flags,
+					     opaque);
+	RETURN(rc);
+}
+
+/* Reset lr_lvb_inode to NULL on the LDLM resource named after @fid.
+ * Always returns 0, including when ldlm_resource_get() finds nothing. */
+int mdc_null_inode(struct obd_export *exp,
+		   const struct lu_fid *fid)
+{
+	struct ldlm_namespace *ns = class_exp2obd(exp)->obd_namespace;
+	struct ldlm_resource *res;
+	struct ldlm_res_id res_id;
+	ENTRY;
+
+	LASSERTF(ns != NULL, "no namespace passed\n");
+
+	fid_build_reg_res_name(fid, &res_id);
+	res = ldlm_resource_get(ns, NULL, &res_id, 0, 0);
+	if (res != NULL) {
+		lock_res(res);
+		res->lr_lvb_inode = NULL;
+		unlock_res(res);
+
+		ldlm_resource_putref(res);
+	}
+	RETURN(0);
+}
+
+/* Run iterator @it with @data over the ldlm locks of the inode @fid.
+ * Return:  0 - the iterator finished with LDLM_ITER_CONTINUE (none found),
+ *	    1 - the iterator finished with LDLM_ITER_STOP (found one),
+ *	    any other result of ldlm_resource_iterate() is returned as is. */
+int mdc_find_cbdata(struct obd_export *exp,
+		    const struct lu_fid *fid,
+		    ldlm_iterator_t it, void *data)
+{
+	struct ldlm_res_id res_id;
+	int rc;
+	ENTRY;
+
+	fid_build_reg_res_name(fid, &res_id);
+	rc = ldlm_resource_iterate(class_exp2obd(exp)->obd_namespace, &res_id,
+				   it, data);
+	if (rc == LDLM_ITER_STOP)
+		RETURN(1);
+	else if (rc == LDLM_ITER_CONTINUE)
+		RETURN(0);
+	RETURN(rc);
+}
+
+static inline void mdc_clear_replay_flag(struct ptlrpc_request *req, int rc)
+{	/* clear rq_replay; a transno together with an error @rc is fatal */
+	/* Don't hold error requests for replay. */
+	if (req->rq_replay) {
+		spin_lock(&req->rq_lock);
+		req->rq_replay = 0;
+		spin_unlock(&req->rq_lock);
+	}
+	if (rc && req->rq_transno != 0) {
+		DEBUG_REQ(D_ERROR, req, "transno returned on error rc %d", rc);
+		LBUG();
+	}
+}
+
+/* Save a large LOV EA into the request buffer so that it is available
+ * for replay.  We don't do this in the initial request because the
+ * original request doesn't need this buffer (at most it sends just the
+ * lov_mds_md) and it is a waste of RAM/bandwidth to send the empty
+ * buffer and may also be difficult to allocate and save a very large
+ * request buffer for each open. (bug 5707)
+ *
+ * OOM here may cause recovery failure if lmm is needed (only for the
+ * original open if the MDS crashed just when this client also OOM'd)
+ * but this is incredibly unlikely, and questionable whether the client
+ * could do MDS recovery under OOM anyways... */
+static void mdc_realloc_openmsg(struct ptlrpc_request *req,
+				struct mdt_body *body)
+{
+	const int segment = DLM_INTENT_REC_OFF + 4;
+
+	/* FIXME: remove this explicit offset. */
+	if (sptlrpc_cli_enlarge_reqbuf(req, segment, body->eadatasize) == 0)
+		return;
+
+	CERROR("Can't enlarge segment %d size to %d\n",
+	       segment, body->eadatasize);
+	/* No room could be made: drop the EA size and its valid bit. */
+	body->eadatasize = 0;
+	body->valid &= ~OBD_MD_FLEASIZE;
+}
+
+/* Build the enqueue request for an open intent.  Returns the request or an
+ * ERR_PTR() value, never NULL, in line with the other mdc_intent_*_pack()
+ * helpers.  @cb_data is not used by this function. */
+static struct ptlrpc_request *mdc_intent_open_pack(struct obd_export *exp,
+						   struct lookup_intent *it,
+						   struct md_op_data *op_data,
+						   void *lmm, int lmmsize,
+						   void *cb_data)
+{
+	struct ptlrpc_request *req;
+	struct obd_device     *obddev = class_exp2obd(exp);
+	struct ldlm_intent    *lit;
+	LIST_HEAD(cancels);
+	int		    count = 0;
+	int		    mode;
+	int		    rc;
+	ENTRY;
+
+	it->it_create_mode = (it->it_create_mode & ~S_IFMT) | S_IFREG;
+	/* XXX: openlock is not cancelled for cross-refs. */
+	/* If inode is known, cancel conflicting OPEN locks. */
+	if (fid_is_sane(&op_data->op_fid2)) {
+		if (it->it_flags & (FMODE_WRITE|MDS_OPEN_TRUNC))
+			mode = LCK_CW;
+#ifdef FMODE_EXEC
+		else if (it->it_flags & FMODE_EXEC)
+			mode = LCK_PR;
+#endif
+		else
+			mode = LCK_CR;
+		count = mdc_resource_get_unused(exp, &op_data->op_fid2,
+						&cancels, mode,
+						MDS_INODELOCK_OPEN);
+	}
+
+	/* If CREATE, cancel parent's UPDATE lock. */
+	if (it->it_op & IT_CREAT)
+		mode = LCK_EX;
+	else
+		mode = LCK_CR;
+	count += mdc_resource_get_unused(exp, &op_data->op_fid1,
+					 &cancels, mode,
+					 MDS_INODELOCK_UPDATE);
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_LDLM_INTENT_OPEN);
+	if (req == NULL) {
+		ldlm_lock_list_put(&cancels, l_bl_ast, count);
+		RETURN(ERR_PTR(-ENOMEM));
+	}
+
+	/* parent capability */
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+	/* child capability, reserve the size according to parent capa, it will
+	 * be filled after we get the reply */
+	mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa1);
+
+	req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+			     op_data->op_namelen + 1);
+	req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT,
+			     max(lmmsize, obddev->u.cli.cl_default_mds_easize));
+
+	rc = ldlm_prep_enqueue_req(exp, req, &cancels, count);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(ERR_PTR(rc));
+	}
+
+	spin_lock(&req->rq_lock);
+	req->rq_replay = req->rq_import->imp_replayable;
+	spin_unlock(&req->rq_lock);
+
+	/* pack the intent */
+	lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
+	lit->opc = (__u64)it->it_op;
+	/* pack the intended request */
+	mdc_open_pack(req, op_data, it->it_create_mode, 0, it->it_flags, lmm,
+		      lmmsize);
+	/* for remote client, fetch remote perm for current user */
+	if (client_is_remote(exp))
+		req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
+				     sizeof(struct mdt_remote_perm));
+	ptlrpc_request_set_replen(req);
+	RETURN(req);
+}
+
+static struct ptlrpc_request *mdc_intent_unlink_pack(struct obd_export *exp,
+						     struct lookup_intent *it,
+						     struct md_op_data *op_data)
+{	/* build an unlink intent enqueue request; returns it or an ERR_PTR() */
+	struct ptlrpc_request *req;
+	struct obd_device     *obddev = class_exp2obd(exp);
+	struct ldlm_intent    *lit;
+	int		    rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_LDLM_INTENT_UNLINK);
+	if (req == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+	/* Size the client-side capability and name buffers first. */
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+	req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+			     op_data->op_namelen + 1);
+
+	rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(ERR_PTR(rc));
+	}
+
+	/* pack the intent */
+	lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
+	lit->opc = (__u64)it->it_op;
+
+	/* pack the intended request */
+	mdc_unlink_pack(req, op_data);
+	/* NOTE(review): the cookie size is applied to RMF_ACL - confirm. */
+	req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+			     obddev->u.cli.cl_max_mds_easize);
+	req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
+			     obddev->u.cli.cl_max_mds_cookiesize);
+	ptlrpc_request_set_replen(req);
+	RETURN(req);
+}
+
+static struct ptlrpc_request *mdc_intent_getattr_pack(struct obd_export *exp,
+						      struct lookup_intent *it,
+						      struct md_op_data *op_data)
+{	/* build a getattr intent enqueue request; returns it or an ERR_PTR() */
+	struct ptlrpc_request *req;
+	struct obd_device     *obddev = class_exp2obd(exp);
+	obd_valid	      valid = OBD_MD_FLGETATTR | OBD_MD_FLEASIZE |
+				       OBD_MD_FLMODEASIZE | OBD_MD_FLDIREA |
+				       OBD_MD_FLMDSCAPA | OBD_MD_MEA |
+				       (client_is_remote(exp) ?
+					       OBD_MD_FLRMTPERM : OBD_MD_FLACL);
+	struct ldlm_intent    *lit;
+	int		    rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_LDLM_INTENT_GETATTR);
+	if (req == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+	/* Size the client-side capability and name buffers first. */
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+	req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+			     op_data->op_namelen + 1);
+
+	rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(ERR_PTR(rc));
+	}
+
+	/* pack the intent */
+	lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
+	lit->opc = (__u64)it->it_op;
+
+	/* pack the intended request */
+	mdc_getattr_pack(req, valid, it->it_flags, op_data,
+			 obddev->u.cli.cl_max_mds_easize);
+	/* Server-side sizes: MD at cl_max_mds_easize, ACL only if remote. */
+	req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+			     obddev->u.cli.cl_max_mds_easize);
+	if (client_is_remote(exp))
+		req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
+				     sizeof(struct mdt_remote_perm));
+	ptlrpc_request_set_replen(req);
+	RETURN(req);
+}
+
+static struct ptlrpc_request *mdc_intent_layout_pack(struct obd_export *exp,
+						     struct lookup_intent *it,
+						     struct md_op_data *unused)
+{	/* build a layout intent enqueue request; returns it or an ERR_PTR() */
+	struct obd_device     *obd = class_exp2obd(exp);
+	struct ptlrpc_request *req;
+	struct ldlm_intent    *lit;
+	struct layout_intent  *layout;
+	int rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				&RQF_LDLM_INTENT_LAYOUT);
+	if (req == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+	/* The client-side EADATA buffer is sized to zero for this request. */
+	req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, 0);
+	rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(ERR_PTR(rc));
+	}
+
+	/* pack the intent */
+	lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
+	lit->opc = (__u64)it->it_op;
+
+	/* pack the layout intent request */
+	layout = req_capsule_client_get(&req->rq_pill, &RMF_LAYOUT_INTENT);
+	/* LAYOUT_INTENT_ACCESS is generic, specific operation will be
+	 * set for replication */
+	layout->li_opc = LAYOUT_INTENT_ACCESS;
+	/* The server-side LVB buffer is sized to cl_max_mds_easize. */
+	req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
+			obd->u.cli.cl_max_mds_easize);
+	ptlrpc_request_set_replen(req);
+	RETURN(req);
+}
+
+static struct ptlrpc_request *
+mdc_enqueue_pack(struct obd_export *exp, int lvb_len)
+{	/* build a plain RQF_LDLM_ENQUEUE request; returns it or an ERR_PTR() */
+	struct ptlrpc_request *req;
+	int rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE);
+	if (req == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(ERR_PTR(rc));
+	}
+	/* The server-side LVB buffer is sized to the caller's @lvb_len. */
+	req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len);
+	ptlrpc_request_set_replen(req);
+	RETURN(req);
+}
+
+static int mdc_finish_enqueue(struct obd_export *exp,
+			      struct ptlrpc_request *req,
+			      struct ldlm_enqueue_info *einfo,
+			      struct lookup_intent *it,
+			      struct lustre_handle *lockh,
+			      int rc)
+{
+	struct req_capsule  *pill = &req->rq_pill;
+	struct ldlm_request *lockreq;
+	struct ldlm_reply   *lockrep;
+	struct lustre_intent_data *intent = &it->d.lustre;
+	struct ldlm_lock    *lock;
+	void		*lvb_data = NULL;
+	int		  lvb_len = 0;
+	ENTRY;
+	/* Digest an enqueue reply: fix up the lock, fill in @it->d.lustre. */
+	LASSERT(rc >= 0);
+	/* If we're going to replay this request, we don't want the replay to
+	 * actually get a lock, just perform the intent. */
+	if (req->rq_transno || req->rq_replay) {
+		lockreq = req_capsule_client_get(pill, &RMF_DLM_REQ);
+		lockreq->lock_flags |= ldlm_flags_to_wire(LDLM_FL_INTENT_ONLY);
+	}
+
+	if (rc == ELDLM_LOCK_ABORTED) {
+		einfo->ei_mode = 0;
+		memset(lockh, 0, sizeof(*lockh));
+		rc = 0;
+	} else { /* rc = 0 */
+		lock = ldlm_handle2lock(lockh);
+		LASSERT(lock != NULL);
+
+		/* If the server gave us back a different lock mode, we should
+		 * fix up our variables and move our reference to that mode. */
+		if (lock->l_req_mode != einfo->ei_mode) {
+			ldlm_lock_addref(lockh, lock->l_req_mode);
+			ldlm_lock_decref(lockh, einfo->ei_mode);
+			einfo->ei_mode = lock->l_req_mode;
+		}
+		LDLM_LOCK_PUT(lock);
+	}
+
+	lockrep = req_capsule_server_get(pill, &RMF_DLM_REP);
+	LASSERT(lockrep != NULL); /* checked by ldlm_cli_enqueue() */
+
+	intent->it_disposition = (int)lockrep->lock_policy_res1;
+	intent->it_status = (int)lockrep->lock_policy_res2;
+	intent->it_lock_mode = einfo->ei_mode;
+	intent->it_lock_handle = lockh->cookie;
+	intent->it_data = req;
+
+	/* Technically speaking rq_transno must already be zero if
+	 * it_status is in error, so the check is a bit redundant */
+	if ((!req->rq_transno || intent->it_status < 0) && req->rq_replay)
+		mdc_clear_replay_flag(req, intent->it_status);
+
+	/* If we're doing an IT_OPEN which did not result in an actual
+	 * successful open, then we need to remove the bit which saves
+	 * this request for unconditional replay.
+	 *
+	 * It's important that we do this first!  Otherwise we might exit the
+	 * function without doing so, and try to replay a failed create
+	 * (bug 3440) */
+	if (it->it_op & IT_OPEN && req->rq_replay &&
+	    (!it_disposition(it, DISP_OPEN_OPEN) ||intent->it_status != 0))
+		mdc_clear_replay_flag(req, intent->it_status);
+
+	DEBUG_REQ(D_RPCTRACE, req, "op: %d disposition: %x, status: %d",
+		  it->it_op, intent->it_disposition, intent->it_status);
+
+	/* We know what to expect, so we do any byte flipping required here */
+	if (it->it_op & (IT_OPEN | IT_UNLINK | IT_LOOKUP | IT_GETATTR)) {
+		struct mdt_body *body;
+
+		body = req_capsule_server_get(pill, &RMF_MDT_BODY);
+		if (body == NULL) {
+			CERROR ("Can't swab mdt_body\n");
+			RETURN (-EPROTO);
+		}
+
+		if (it_disposition(it, DISP_OPEN_OPEN) &&
+		    !it_open_error(DISP_OPEN_OPEN, it)) {
+			/*
+			 * If this is a successful OPEN request, we need to set
+			 * replay handler and data early, so that if replay
+			 * happens immediately after swabbing below, new reply
+			 * is swabbed by that handler correctly.
+			 */
+			mdc_set_open_replay_data(NULL, NULL, req);
+		}
+
+		if ((body->valid & (OBD_MD_FLDIREA | OBD_MD_FLEASIZE)) != 0) {
+			void *eadata;
+
+			mdc_update_max_ea_from_body(exp, body);
+
+			/*
+			 * The eadata is opaque; just check that it is there.
+			 * Eventually, obd_unpackmd() will check the contents.
+			 */
+			eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD,
+							      body->eadatasize);
+			if (eadata == NULL)
+				RETURN(-EPROTO);
+
+			/* save lvb data and length in case this is for layout
+			 * lock */
+			lvb_data = eadata;
+			lvb_len = body->eadatasize;
+
+			/*
+			 * We save the reply LOV EA in case we have to replay a
+			 * create for recovery.  If we didn't allocate a large
+			 * enough request buffer above we need to reallocate it
+			 * here to hold the actual LOV EA.
+			 *
+			 * The LOV EA is not saved if the request is not going
+			 * to be replayed (for example, one that failed).
+			 */
+			if ((it->it_op & IT_OPEN) && req->rq_replay) {
+				void *lmm;
+				if (req_capsule_get_size(pill, &RMF_EADATA,
+							 RCL_CLIENT) <
+				    body->eadatasize)
+					mdc_realloc_openmsg(req, body);
+				else
+					req_capsule_shrink(pill, &RMF_EADATA,
+							   body->eadatasize,
+							   RCL_CLIENT);
+
+				req_capsule_set_size(pill, &RMF_EADATA,
+						     RCL_CLIENT,
+						     body->eadatasize);
+
+				lmm = req_capsule_client_get(pill, &RMF_EADATA);
+				if (lmm)
+					memcpy(lmm, eadata, body->eadatasize);
+			}
+		}
+
+		if (body->valid & OBD_MD_FLRMTPERM) {
+			struct mdt_remote_perm *perm;
+
+			LASSERT(client_is_remote(exp));
+			perm = req_capsule_server_swab_get(pill, &RMF_ACL,
+						lustre_swab_mdt_remote_perm);
+			if (perm == NULL)
+				RETURN(-EPROTO);
+		}
+		if (body->valid & OBD_MD_FLMDSCAPA) {
+			struct lustre_capa *capa, *p;
+
+			capa = req_capsule_server_get(pill, &RMF_CAPA1);
+			if (capa == NULL)
+				RETURN(-EPROTO);
+
+			if (it->it_op & IT_OPEN) {
+				/* client fid capa will be checked in replay */
+				p = req_capsule_client_get(pill, &RMF_CAPA2);
+				LASSERT(p);
+				*p = *capa;
+			}
+		}
+		if (body->valid & OBD_MD_FLOSSCAPA) {
+			struct lustre_capa *capa;
+
+			capa = req_capsule_server_get(pill, &RMF_CAPA2);
+			if (capa == NULL)
+				RETURN(-EPROTO);
+		}
+	} else if (it->it_op & IT_LAYOUT) {
+		/* maybe the lock was granted right away and layout
+		 * is packed into RMF_DLM_LVB of req */
+		lvb_len = req_capsule_get_size(pill, &RMF_DLM_LVB, RCL_SERVER);
+		if (lvb_len > 0) {
+			lvb_data = req_capsule_server_sized_get(pill,
+							&RMF_DLM_LVB, lvb_len);
+			if (lvb_data == NULL)
+				RETURN(-EPROTO);
+		}
+	}
+
+	/* fill in stripe data for layout lock */
+	lock = ldlm_handle2lock(lockh);
+	if (lock != NULL && ldlm_has_layout(lock) && lvb_data != NULL) {
+		void *lmm;
+
+		LDLM_DEBUG(lock, "layout lock returned by: %s, lvb_len: %d\n",
+			ldlm_it2str(it->it_op), lvb_len);
+
+		OBD_ALLOC_LARGE(lmm, lvb_len);
+		if (lmm == NULL) {
+			LDLM_LOCK_PUT(lock);
+			RETURN(-ENOMEM);
+		}
+		memcpy(lmm, lvb_data, lvb_len);
+
+		/* install lvb_data, unless the lock already carries one */
+		lock_res_and_lock(lock);
+		if (lock->l_lvb_data == NULL) {
+			lock->l_lvb_data = lmm;
+			lock->l_lvb_len = lvb_len;
+			lmm = NULL;
+		}
+		unlock_res_and_lock(lock);
+		if (lmm != NULL)
+			OBD_FREE_LARGE(lmm, lvb_len);
+	}
+	if (lock != NULL)
+		LDLM_LOCK_PUT(lock);
+
+	RETURN(rc);
+}
+
+/* We always reserve enough space in the reply packet for a stripe MD, because
+ * we don't know in advance the file type. */
+int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
+		struct lookup_intent *it, struct md_op_data *op_data,
+		struct lustre_handle *lockh, void *lmm, int lmmsize,
+		struct ptlrpc_request **reqp, __u64 extra_lock_flags)
+{
+	struct obd_device     *obddev = class_exp2obd(exp);
+	struct ptlrpc_request *req = NULL;
+	__u64		  flags, saved_flags = extra_lock_flags;
+	int		    rc;
+	struct ldlm_res_id res_id;
+	static const ldlm_policy_data_t lookup_policy =
+			    { .l_inodebits = { MDS_INODELOCK_LOOKUP } };
+	static const ldlm_policy_data_t update_policy =
+			    { .l_inodebits = { MDS_INODELOCK_UPDATE } };
+	static const ldlm_policy_data_t layout_policy =
+			    { .l_inodebits = { MDS_INODELOCK_LAYOUT } };
+	ldlm_policy_data_t const *policy = &lookup_policy;
+	int		    generation, resends = 0;
+	struct ldlm_reply     *lockrep;
+	enum lvb_type	       lvb_type = 0;
+	ENTRY;
+	/* Enqueue a lock: an intent RPC when @it is set, otherwise a flock. */
+	LASSERTF(!it || einfo->ei_type == LDLM_IBITS, "lock type %d\n",
+		 einfo->ei_type);
+
+	fid_build_reg_res_name(&op_data->op_fid1, &res_id);
+
+	if (it) {
+		saved_flags |= LDLM_FL_HAS_INTENT;
+		if (it->it_op & (IT_UNLINK | IT_GETATTR | IT_READDIR))
+			policy = &update_policy;
+		else if (it->it_op & IT_LAYOUT)
+			policy = &layout_policy;
+	}
+
+	LASSERT(reqp == NULL); /* @reqp is not used by this function */
+
+	generation = obddev->u.cli.cl_import->imp_generation;
+resend:
+	flags = saved_flags;
+	if (!it) {
+		/* The only way right now is FLOCK, in this case we hide flock
+		   policy as lmm, but lmmsize is 0 */
+		LASSERT(lmm && lmmsize == 0);
+		LASSERTF(einfo->ei_type == LDLM_FLOCK, "lock type %d\n",
+			 einfo->ei_type);
+		policy = (ldlm_policy_data_t *)lmm;
+		res_id.name[3] = LDLM_FLOCK;
+	} else if (it->it_op & IT_OPEN) {
+		req = mdc_intent_open_pack(exp, it, op_data, lmm, lmmsize,
+					   einfo->ei_cbdata);
+		policy = &update_policy;
+		einfo->ei_cbdata = NULL;
+		lmm = NULL;
+	} else if (it->it_op & IT_UNLINK) {
+		req = mdc_intent_unlink_pack(exp, it, op_data);
+	} else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) {
+		req = mdc_intent_getattr_pack(exp, it, op_data);
+	} else if (it->it_op & IT_READDIR) {
+		req = mdc_enqueue_pack(exp, 0);
+	} else if (it->it_op & IT_LAYOUT) {
+		if (!imp_connect_lvb_type(class_exp2cliimp(exp)))
+			RETURN(-EOPNOTSUPP);
+
+		req = mdc_intent_layout_pack(exp, it, op_data);
+		lvb_type = LVB_T_LAYOUT;
+	} else {
+		LBUG();
+		RETURN(-EINVAL);
+	}
+
+	if (IS_ERR(req)) /* the pack helpers report failure as ERR_PTR() */
+		RETURN(PTR_ERR(req));
+
+	if (req != NULL && it && it->it_op & IT_CREAT)
+		/* ask ptlrpc not to resend on EINPROGRESS since we have our own
+		 * retry logic */
+		req->rq_no_retry_einprogress = 1;
+
+	if (resends) {
+		req->rq_generation_set = 1;
+		req->rq_import_generation = generation;
+		req->rq_sent = cfs_time_current_sec() + resends;
+	}
+
+	/* It is important to obtain rpc_lock first (if applicable), so that
+	 * threads that are serialised with rpc_lock are not polluting our
+	 * rpcs in flight counter. We do not do flock request limiting, though*/
+	if (it) {
+		mdc_get_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
+		rc = mdc_enter_request(&obddev->u.cli);
+		if (rc != 0) {
+			mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
+			mdc_clear_replay_flag(req, 0);
+			ptlrpc_req_finished(req);
+			RETURN(rc);
+		}
+	}
+
+	rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL,
+			      0, lvb_type, lockh, 0);
+	if (!it) {
+		/* For flock requests we immediately return without further
+		   delay and let caller deal with the rest, since rest of
+		   this function metadata processing makes no sense for flock
+		   requests anyway */
+		RETURN(rc);
+	}
+
+	mdc_exit_request(&obddev->u.cli);
+	mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
+
+	if (rc < 0) {
+		CERROR("ldlm_cli_enqueue: %d\n", rc);
+		mdc_clear_replay_flag(req, rc);
+		ptlrpc_req_finished(req);
+		RETURN(rc);
+	}
+
+	lockrep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+	LASSERT(lockrep != NULL);
+
+	/* Retry the create infinitely when we get -EINPROGRESS from
+	 * server. This is required by the new quota design. */
+	if (it && it->it_op & IT_CREAT &&
+	    (int)lockrep->lock_policy_res2 == -EINPROGRESS) {
+		mdc_clear_replay_flag(req, rc);
+		ptlrpc_req_finished(req);
+		resends++;
+
+		CDEBUG(D_HA, "%s: resend:%d op:%d "DFID"/"DFID"\n",
+		       obddev->obd_name, resends, it->it_op,
+		       PFID(&op_data->op_fid1), PFID(&op_data->op_fid2));
+
+		if (generation == obddev->u.cli.cl_import->imp_generation) {
+			goto resend;
+		} else {
+			CDEBUG(D_HA, "resend cross eviction\n");
+			RETURN(-EIO);
+		}
+	}
+
+	rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc);
+	if (rc < 0) {
+		if (lustre_handle_is_used(lockh)) {
+			ldlm_lock_decref(lockh, einfo->ei_mode);
+			memset(lockh, 0, sizeof(*lockh));
+		}
+		ptlrpc_req_finished(req);
+	}
+	RETURN(rc);
+}
+
+static int mdc_finish_intent_lock(struct obd_export *exp,
+				  struct ptlrpc_request *request,
+				  struct md_op_data *op_data,
+				  struct lookup_intent *it,
+				  struct lustre_handle *lockh)
+{
+	struct lustre_handle old_lock;
+	struct mdt_body *mdt_body;
+	struct ldlm_lock *lock;
+	int rc;
+	ENTRY;
+	/* Check the intent reply, take request refs and settle on one lock. */
+	LASSERT(request != NULL);
+	LASSERT(request != LP_POISON);
+	LASSERT(request->rq_repmsg != LP_POISON);
+
+	if (!it_disposition(it, DISP_IT_EXECD)) {
+		/* The server failed before it even started executing the
+		 * intent, i.e. because it couldn't unpack the request. */
+		LASSERT(it->d.lustre.it_status != 0);
+		RETURN(it->d.lustre.it_status);
+	}
+	rc = it_open_error(DISP_IT_EXECD, it);
+	if (rc)
+		RETURN(rc);
+
+	mdt_body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
+	LASSERT(mdt_body != NULL);      /* mdc_enqueue checked */
+
+	/* If we were revalidating a fid/name pair, mark the intent in
+	 * case we fail and get called again from lookup */
+	if (fid_is_sane(&op_data->op_fid2) &&
+	    it->it_create_mode & M_CHECK_STALE &&
+	    it->it_op != IT_GETATTR) {
+		it_set_disposition(it, DISP_ENQ_COMPLETE);
+
+		/* Also: did we find the same inode? */
+		/* server can return one of two fids:
+		 * op_fid2 - new allocated fid - if file is created.
+		 * op_fid3 - existent fid - if file only open.
+		 * op_fid3 is saved in lmv_intent_open */
+		if ((!lu_fid_eq(&op_data->op_fid2, &mdt_body->fid1)) &&
+		    (!lu_fid_eq(&op_data->op_fid3, &mdt_body->fid1))) {
+			CDEBUG(D_DENTRY, "Found stale data "DFID"("DFID")/"DFID
+			       "\n", PFID(&op_data->op_fid2),
+			       PFID(&op_data->op_fid3), PFID(&mdt_body->fid1));
+			RETURN(-ESTALE);
+		}
+	}
+
+	rc = it_open_error(DISP_LOOKUP_EXECD, it);
+	if (rc)
+		RETURN(rc);
+
+	/* keep requests around for the multiple phases of the call
+	 * this shows the DISP_XX must guarantee we make it into the call
+	 */
+	if (!it_disposition(it, DISP_ENQ_CREATE_REF) &&
+	    it_disposition(it, DISP_OPEN_CREATE) &&
+	    !it_open_error(DISP_OPEN_CREATE, it)) {
+		it_set_disposition(it, DISP_ENQ_CREATE_REF);
+		ptlrpc_request_addref(request); /* balanced in ll_create_node */
+	}
+	if (!it_disposition(it, DISP_ENQ_OPEN_REF) &&
+	    it_disposition(it, DISP_OPEN_OPEN) &&
+	    !it_open_error(DISP_OPEN_OPEN, it)) {
+		it_set_disposition(it, DISP_ENQ_OPEN_REF);
+		ptlrpc_request_addref(request); /* balanced in ll_file_open */
+		/* BUG 11546 - eviction in the middle of open rpc processing */
+		OBD_FAIL_TIMEOUT(OBD_FAIL_MDC_ENQUEUE_PAUSE, obd_timeout);
+	}
+
+	if (it->it_op & IT_CREAT) {
+		/* XXX this belongs in ll_create_it */
+	} else if (it->it_op == IT_OPEN) {
+		LASSERT(!it_disposition(it, DISP_OPEN_CREATE));
+	} else {
+		LASSERT(it->it_op & (IT_GETATTR | IT_LOOKUP | IT_LAYOUT));
+	}
+
+	/* If we already have a matching lock, then cancel the new
+	 * one.  We have to set the data here instead of in
+	 * mdc_enqueue, because we need to use the child's inode as
+	 * the l_ast_data to match, and that's not available until
+	 * intent_finish has performed the iget().) */
+	lock = ldlm_handle2lock(lockh);
+	if (lock) {
+		ldlm_policy_data_t policy = lock->l_policy_data;
+		LDLM_DEBUG(lock, "matching against this");
+
+		LASSERTF(fid_res_name_eq(&mdt_body->fid1,
+					 &lock->l_resource->lr_name),
+			 "Lock res_id: %lu/%lu/%lu, fid: %lu/%lu/%lu.\n",
+			 (unsigned long)lock->l_resource->lr_name.name[0],
+			 (unsigned long)lock->l_resource->lr_name.name[1],
+			 (unsigned long)lock->l_resource->lr_name.name[2],
+			 (unsigned long)fid_seq(&mdt_body->fid1),
+			 (unsigned long)fid_oid(&mdt_body->fid1),
+			 (unsigned long)fid_ver(&mdt_body->fid1));
+		LDLM_LOCK_PUT(lock);
+
+		memcpy(&old_lock, lockh, sizeof(*lockh));
+		if (ldlm_lock_match(NULL, LDLM_FL_BLOCK_GRANTED, NULL,
+				    LDLM_IBITS, &policy, LCK_NL, &old_lock, 0)) {
+			ldlm_lock_decref_and_cancel(lockh,
+						    it->d.lustre.it_lock_mode);
+			memcpy(lockh, &old_lock, sizeof(old_lock));
+			it->d.lustre.it_lock_handle = lockh->cookie;
+		}
+	}
+	CDEBUG(D_DENTRY,"D_IT dentry %.*s intent: %s status %d disp %x rc %d\n",
+	       op_data->op_namelen, op_data->op_name, ldlm_it2str(it->it_op),
+	       it->d.lustre.it_status, it->d.lustre.it_disposition, rc);
+	RETURN(rc);
+}
+
+int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it,
+			struct lu_fid *fid, __u64 *bits)
+{
+	/*
+	 * Check whether a usable lock is already held for this intent.  If the
+	 * intent carries a lock handle, that handle is revalidated; otherwise
+	 * the namespace is searched for a matching IBITS lock on @fid, with
+	 * the inodebit chosen from the intent operation.  The intent's lock
+	 * handle and mode are updated; returns 1 for a non-zero mode, else 0.
+	 */
+	struct lustre_handle handle;
+	struct ldlm_res_id resname;
+	ldlm_policy_data_t ibits;
+	ldlm_mode_t granted;
+	ENTRY;
+
+	if (it->d.lustre.it_lock_handle == 0) {
+		if (it->it_op == IT_GETATTR)
+			ibits.l_inodebits.bits = MDS_INODELOCK_UPDATE;
+		else if (it->it_op == IT_LAYOUT)
+			ibits.l_inodebits.bits = MDS_INODELOCK_LAYOUT;
+		else
+			ibits.l_inodebits.bits = MDS_INODELOCK_LOOKUP;
+		fid_build_reg_res_name(fid, &resname);
+		granted = ldlm_lock_match(exp->exp_obd->obd_namespace,
+					  LDLM_FL_BLOCK_GRANTED, &resname,
+					  LDLM_IBITS, &ibits,
+					  LCK_CR|LCK_CW|LCK_PR|LCK_PW,
+					  &handle, 0);
+	} else {
+		handle.cookie = it->d.lustre.it_lock_handle;
+		granted = ldlm_revalidate_lock_handle(&handle, bits);
+	}
+
+	if (granted) {
+		it->d.lustre.it_lock_handle = handle.cookie;
+		it->d.lustre.it_lock_mode = granted;
+	} else {
+		it->d.lustre.it_lock_handle = 0;
+		it->d.lustre.it_lock_mode = 0;
+	}
+
+	RETURN(!!granted);
+}
+
+/*
+ * This long block is all about fixing up the lock and request state
+ * so that it is correct as of the moment _before_ the operation was
+ * applied; that way, the VFS will think that everything is normal and
+ * call Lustre's regular VFS methods.
+ *
+ * If we're performing a creation, that means that unless the creation
+ * failed with EEXIST, we should fake up a negative dentry.
+ *
+ * For everything else, we want the lookup to succeed.
+ *
+ * One additional note: if CREATE or OPEN succeeded, we add an extra
+ * reference to the request because we need to keep it around until
+ * ll_create/ll_open gets called.
+ *
+ * The server will return to us, in it_disposition, an indication of
+ * exactly what d.lustre.it_status refers to.
+ *
+ * If DISP_OPEN_OPEN is set, then d.lustre.it_status refers to the open() call,
+ * otherwise if DISP_OPEN_CREATE is set, then it_status is the
+ * creation failure mode.  In either case, one of DISP_LOOKUP_NEG or
+ * DISP_LOOKUP_POS will be set, indicating whether the child lookup
+ * was successful.
+ *
+ * Else, if DISP_LOOKUP_EXECD then d.lustre.it_status is the rc of the
+ * child lookup.
+ */
+int mdc_intent_lock(struct obd_export *exp, struct md_op_data *op_data,
+		    void *lmm, int lmmsize, struct lookup_intent *it,
+		    int lookup_flags, struct ptlrpc_request **reqp,
+		    ldlm_blocking_callback cb_blocking,
+		    __u64 extra_lock_flags)
+{
+	struct lustre_handle lockh;
+	int rc = 0;
+	ENTRY;
+	LASSERT(it);
+	/* Revalidate or enqueue the lock for @it, then finish the intent. */
+	CDEBUG(D_DLMTRACE, "(name: %.*s,"DFID") in obj "DFID
+	       ", intent: %s flags %#o\n", op_data->op_namelen,
+	       op_data->op_name, PFID(&op_data->op_fid2),
+	       PFID(&op_data->op_fid1), ldlm_it2str(it->it_op),
+	       it->it_flags);
+
+	lockh.cookie = 0;
+	if (fid_is_sane(&op_data->op_fid2) &&
+	    (it->it_op & (IT_LOOKUP | IT_GETATTR))) {
+		/* We could just return 1 immediately, but since we should only
+		 * be called in revalidate_it if we already have a lock, let's
+		 * verify that. */
+		it->d.lustre.it_lock_handle = 0;
+		rc = mdc_revalidate_lock(exp, it, &op_data->op_fid2, NULL);
+		/* Only return failure if it was not GETATTR by cfid
+		   (from inode_revalidate) */
+		if (rc || op_data->op_namelen != 0)
+			RETURN(rc);
+	}
+
+	/* lookup_it may be called only after revalidate_it has run, because
+	 * revalidate_it cannot return errors, only zero.  Returning zero causes
+	 * this call to lookup, which *can* return an error.
+	 *
+	 * We only want to execute the request associated with the intent one
+	 * time, however, so don't send the request again.  Instead, skip past
+	 * this and use the request from revalidate.  In this case, revalidate
+	 * never dropped its reference, so the refcounts are all OK */
+	if (!it_disposition(it, DISP_ENQ_COMPLETE)) {
+		struct ldlm_enqueue_info einfo =
+			{ LDLM_IBITS, it_to_lock_mode(it), cb_blocking,
+			  ldlm_completion_ast, NULL, NULL, NULL };
+
+		/* If the upper layer did not allocate a fid, do it now. */
+		if (!fid_is_sane(&op_data->op_fid2) && it->it_op & IT_CREAT) {
+			rc = mdc_fid_alloc(exp, &op_data->op_fid2, op_data);
+			if (rc < 0) {
+				CERROR("Can't alloc new fid, rc %d\n", rc);
+				RETURN(rc);
+			}
+		}
+		rc = mdc_enqueue(exp, &einfo, it, op_data, &lockh,
+				 lmm, lmmsize, NULL, extra_lock_flags);
+		if (rc < 0)
+			RETURN(rc);
+	} else if (!fid_is_sane(&op_data->op_fid2) ||
+		   !(it->it_create_mode & M_CHECK_STALE)) {
+		/* DISP_ENQ_COMPLETE set means there is extra reference on
+		 * request referenced from this intent, saved for subsequent
+		 * lookup.  This path is executed when we proceed to this
+		 * lookup, so we clear DISP_ENQ_COMPLETE */
+		it_clear_disposition(it, DISP_ENQ_COMPLETE);
+	}
+	*reqp = it->d.lustre.it_data; /* the request saved in the intent */
+	rc = mdc_finish_intent_lock(exp, *reqp, op_data, it, &lockh);
+	RETURN(rc);
+}
+
+static int mdc_intent_getattr_async_interpret(const struct lu_env *env,
+					      struct ptlrpc_request *req,
+					      void *args, int rc)
+{
+	struct mdc_getattr_args  *ga = args;
+	struct obd_export	*exp = ga->ga_exp;
+	struct md_enqueue_info   *minfo = ga->ga_minfo;
+	struct ldlm_enqueue_info *einfo = ga->ga_einfo;
+	struct lookup_intent     *it;
+	struct lustre_handle     *lockh;
+	struct obd_device	*obddev;
+	__u64		     flags = LDLM_FL_HAS_INTENT;
+	ENTRY;
+	/* Reply handler for requests queued by mdc_intent_getattr_async(). */
+	it    = &minfo->mi_it;
+	lockh = &minfo->mi_lockh;
+
+	obddev = class_exp2obd(exp);
+
+	mdc_exit_request(&obddev->u.cli);
+	if (OBD_FAIL_CHECK(OBD_FAIL_MDC_GETATTR_ENQUEUE))
+		rc = -ETIMEDOUT;
+
+	rc = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, 1, einfo->ei_mode,
+				   &flags, NULL, 0, lockh, rc);
+	if (rc < 0) {
+		CERROR("ldlm_cli_enqueue_fini: %d\n", rc);
+		mdc_clear_replay_flag(req, rc);
+		GOTO(out, rc);
+	}
+
+	rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc);
+	if (rc)
+		GOTO(out, rc);
+
+	rc = mdc_finish_intent_lock(exp, req, &minfo->mi_data, it, lockh);
+	EXIT;
+	/* free @einfo and report the final rc through the @minfo callback */
+out:
+	OBD_FREE_PTR(einfo);
+	minfo->mi_cb(req, minfo, rc);
+	return 0;
+}
+
+int mdc_intent_getattr_async(struct obd_export *exp,
+			     struct md_enqueue_info *minfo,
+			     struct ldlm_enqueue_info *einfo)
+{
+	struct md_op_data       *op_data = &minfo->mi_data;
+	struct lookup_intent    *it = &minfo->mi_it;
+	struct ptlrpc_request   *req;
+	struct mdc_getattr_args *ga;
+	struct obd_device       *obddev = class_exp2obd(exp);
+	struct ldlm_res_id       res_id;
+	/*XXX: Both MDS_INODELOCK_LOOKUP and MDS_INODELOCK_UPDATE are needed
+	 *     for statahead currently. Consider CMD in future, such two bits
+	 *     maybe managed by different MDS, should be adjusted then. */
+	ldlm_policy_data_t       policy = {
+					.l_inodebits = { MDS_INODELOCK_LOOKUP |
+							 MDS_INODELOCK_UPDATE }
+				 };
+	int		      rc = 0;
+	__u64		    flags = LDLM_FL_HAS_INTENT;
+	ENTRY;
+	/* Send the getattr intent of @minfo asynchronously via ptlrpcd. */
+	CDEBUG(D_DLMTRACE,"name: %.*s in inode "DFID", intent: %s flags %#o\n",
+	       op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1),
+	       ldlm_it2str(it->it_op), it->it_flags);
+
+	fid_build_reg_res_name(&op_data->op_fid1, &res_id);
+	req = mdc_intent_getattr_pack(exp, it, op_data);
+	if (IS_ERR(req)) /* pack failure is an ERR_PTR(), as in mdc_enqueue() */
+		RETURN(PTR_ERR(req));
+
+	rc = mdc_enter_request(&obddev->u.cli);
+	if (rc != 0) {
+		ptlrpc_req_finished(req);
+		RETURN(rc);
+	}
+
+	rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, &policy, &flags, NULL,
+			      0, LVB_T_NONE, &minfo->mi_lockh, 1);
+	if (rc < 0) {
+		mdc_exit_request(&obddev->u.cli);
+		ptlrpc_req_finished(req);
+		RETURN(rc);
+	}
+
+	CLASSERT(sizeof(*ga) <= sizeof(req->rq_async_args));
+	ga = ptlrpc_req_async_args(req);
+	ga->ga_exp = exp;
+	ga->ga_minfo = minfo;
+	ga->ga_einfo = einfo;
+	/* the reply is processed by mdc_intent_getattr_async_interpret() */
+	req->rq_interpret_reply = mdc_intent_getattr_async_interpret;
+	ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
+
+	RETURN(0);
+}
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_reint.c b/drivers/staging/lustre/lustre/mdc/mdc_reint.c
new file mode 100644
index 000000000000..5e25a07c52bd
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mdc/mdc_reint.c
@@ -0,0 +1,489 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_MDC
+
+# include <linux/module.h>
+# include <linux/kernel.h>
+
+#include <obd_class.h>
+#include "mdc_internal.h"
+#include <lustre_fid.h>
+
+/* Send @request under @rpc_lock; -EPROTO if the reply lacks an mdt_body. */
+static int mdc_reint(struct ptlrpc_request *request,
+		     struct mdc_rpc_lock *rpc_lock,
+		     int level)
+{
+	int err;
+
+	request->rq_send_state = level;
+	mdc_get_rpc_lock(rpc_lock, NULL);
+	err = ptlrpc_queue_wait(request);
+	mdc_put_rpc_lock(rpc_lock, NULL);
+	if (err != 0) {
+		CDEBUG(D_INFO, "error in handling %d\n", err);
+		return err;
+	}
+	if (req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY) == NULL)
+		return -EPROTO;
+	return 0;
+}
+
+/* Locally cancel the locks on the resource named by @fid that match inode
+ * @bits and @mode, collecting them on the @cancels list.  Returns the number
+ * of locks added to @cancels. */
+int mdc_resource_get_unused(struct obd_export *exp, struct lu_fid *fid,
+			    struct list_head *cancels, ldlm_mode_t mode,
+			    __u64 bits)
+{
+	struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
+	ldlm_policy_data_t ibits_policy = {{0}};
+	struct ldlm_resource *resource;
+	struct ldlm_res_id resname;
+	int nr;
+	ENTRY;
+
+	/* Cancel nothing only when ELC is supported (flag in the export) but
+	 * has been disabled through procfs (flag in the namespace).
+	 *
+	 * When ELC is not supported at all we still want to cancel locks in
+	 * advance; they are then just cancelled locally, without sending
+	 * any RPC. */
+	if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns))
+		RETURN(0);
+
+	fid_build_reg_res_name(fid, &resname);
+	resource = ldlm_resource_get(ns, NULL, &resname, 0, 0);
+	if (!resource)
+		RETURN(0);
+
+	/* only the inodebits part of the policy is filled in */
+	ibits_policy.l_inodebits.bits = bits;
+	LDLM_RESOURCE_ADDREF(resource);
+	nr = ldlm_cancel_resource_local(resource, cancels, &ibits_policy,
+					mode, 0, 0, NULL);
+	LDLM_RESOURCE_DELREF(resource);
+	ldlm_resource_putref(resource);
+	RETURN(nr);
+}
+
+int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data,
+		void *ea, int ealen, void *ea2, int ea2len,
+		struct ptlrpc_request **request, struct md_open_data **mod)
+{
+	LIST_HEAD(cancels);
+	struct ptlrpc_request *req;
+	struct mdc_rpc_lock *rpc_lock;
+	struct obd_device *obd = exp->exp_obd;
+	int count = 0, rc;
+	__u64 bits;
+	ENTRY;
+	/* Send an MDS_REINT setattr; the sent request is stored in *@request. */
+	LASSERT(op_data != NULL);
+
+	bits = MDS_INODELOCK_UPDATE;
+	if (op_data->op_attr.ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID))
+		bits |= MDS_INODELOCK_LOOKUP;
+	if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+	    (fid_is_sane(&op_data->op_fid1)) &&
+	    !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET))
+		count = mdc_resource_get_unused(exp, &op_data->op_fid1,
+						&cancels, LCK_EX, bits);
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_MDS_REINT_SETATTR);
+	if (req == NULL) {
+		ldlm_lock_list_put(&cancels, l_bl_ast, count);
+		RETURN(-ENOMEM);
+	}
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+	if ((op_data->op_flags & (MF_SOM_CHANGE | MF_EPOCH_OPEN)) == 0)
+		req_capsule_set_size(&req->rq_pill, &RMF_MDT_EPOCH, RCL_CLIENT,
+				     0);
+	req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, ealen);
+	req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_CLIENT,
+			     ea2len);
+
+	rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	rpc_lock = obd->u.cli.cl_rpc_lock;
+
+	if (op_data->op_attr.ia_valid & (ATTR_MTIME | ATTR_CTIME))
+		CDEBUG(D_INODE, "setting mtime "CFS_TIME_T
+		       ", ctime "CFS_TIME_T"\n",
+		       LTIME_S(op_data->op_attr.ia_mtime),
+		       LTIME_S(op_data->op_attr.ia_ctime));
+	mdc_setattr_pack(req, op_data, ea, ealen, ea2, ea2len);
+
+	ptlrpc_request_set_replen(req);
+	if (mod && (op_data->op_flags & MF_EPOCH_OPEN) &&
+	    req->rq_import->imp_replayable)
+	{
+		LASSERT(*mod == NULL);
+
+		*mod = obd_mod_alloc();
+		if (*mod == NULL) {
+			DEBUG_REQ(D_ERROR, req, "Can't allocate "
+				  "md_open_data");
+		} else {
+			req->rq_replay = 1;
+			req->rq_cb_data = *mod;
+			(*mod)->mod_open_req = req;
+			req->rq_commit_cb = mdc_commit_open;
+			/**
+			 * Take an extra reference on \var mod, it protects \var
+			 * mod from being freed on eviction (commit callback is
+			 * called despite rq_replay flag).
+			 * Will be put on mdc_done_writing().
+			 */
+			obd_mod_get(*mod);
+		}
+	}
+
+	rc = mdc_reint(req, rpc_lock, LUSTRE_IMP_FULL);
+
+	/* Save the obtained info in the original RPC for the replay case. */
+	if (rc == 0 && (op_data->op_flags & MF_EPOCH_OPEN)) {
+		struct mdt_ioepoch *epoch;
+		struct mdt_body  *body;
+
+		epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH);
+		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+		LASSERT(epoch != NULL);
+		LASSERT(body != NULL);
+		epoch->handle = body->handle;
+		epoch->ioepoch = body->ioepoch;
+		req->rq_replay_cb = mdc_replay_open;
+	/** bug 3633, open may be committed and estale answer is not error */
+	} else if (rc == -ESTALE && (op_data->op_flags & MF_SOM_CHANGE)) {
+		rc = 0;
+	} else if (rc == -ERESTARTSYS) {
+		rc = 0;
+	}
+	*request = req;
+	if (rc && req->rq_commit_cb) {
+		/* Drop the extra reference on \var mod taken above on error. */
+		obd_mod_put(*mod);
+		req->rq_commit_cb(req);
+	}
+	RETURN(rc);
+}
+
+int mdc_create(struct obd_export *exp, struct md_op_data *op_data,
+	       const void *data, int datalen, int mode, __u32 uid, __u32 gid,
+	       cfs_cap_t cap_effective, __u64 rdev,
+	       struct ptlrpc_request **request)
+{
+	struct ptlrpc_request *req;
+	int level, rc;
+	int count, resends = 0;
+	struct obd_import *import = exp->exp_obd->u.cli.cl_import;
+	int generation = import->imp_generation;
+	LIST_HEAD(cancels);
+	ENTRY;
+	/* Send an MDS_REINT create, rebuilding the RPC while -EINPROGRESS. */
+	/* If the upper layer did not allocate a fid, do it now. */
+	if (!fid_is_sane(&op_data->op_fid2)) {
+		/*
+		 * mdc_fid_alloc() may return errno 1 in case of switch to new
+		 * sequence, handle this.
+		 */
+		rc = mdc_fid_alloc(exp, &op_data->op_fid2, op_data);
+		if (rc < 0) {
+			CERROR("Can't alloc new fid, rc %d\n", rc);
+			RETURN(rc);
+		}
+	}
+
+rebuild:
+	count = 0;
+	if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+	    (fid_is_sane(&op_data->op_fid1)))
+		count = mdc_resource_get_unused(exp, &op_data->op_fid1,
+						&cancels, LCK_EX,
+						MDS_INODELOCK_UPDATE);
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_MDS_REINT_CREATE_RMT_ACL);
+	if (req == NULL) {
+		ldlm_lock_list_put(&cancels, l_bl_ast, count);
+		RETURN(-ENOMEM);
+	}
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+	req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+			     op_data->op_namelen + 1);
+	req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT,
+			     data && datalen ? datalen : 0);
+
+	rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	/*
+	 * mdc_create_pack() fills msg->bufs[1] with name and msg->bufs[2] with
+	 * tgt, for symlinks or lov MD data.
+	 */
+	mdc_create_pack(req, op_data, data, datalen, mode, uid,
+			gid, cap_effective, rdev);
+
+	ptlrpc_request_set_replen(req);
+
+	/* ask ptlrpc not to resend on EINPROGRESS since we have our own retry
+	 * logic here */
+	req->rq_no_retry_einprogress = 1;
+
+	if (resends) {
+		req->rq_generation_set = 1;
+		req->rq_import_generation = generation;
+		req->rq_sent = cfs_time_current_sec() + resends;
+	}
+	level = LUSTRE_IMP_FULL;
+ resend:
+	rc = mdc_reint(req, exp->exp_obd->u.cli.cl_rpc_lock, level);
+
+	/* Resend if we were told to. */
+	if (rc == -ERESTARTSYS) {
+		level = LUSTRE_IMP_RECOVER;
+		goto resend;
+	} else if (rc == -EINPROGRESS) {
+		/* Retry the create until it succeeds or fails with another
+		 * error code. */
+		ptlrpc_req_finished(req);
+		resends++;
+
+		CDEBUG(D_HA, "%s: resend:%d create on "DFID"/"DFID"\n",
+		       exp->exp_obd->obd_name, resends,
+		       PFID(&op_data->op_fid1), PFID(&op_data->op_fid2));
+
+		if (generation == import->imp_generation) {
+			goto rebuild;
+		} else {
+			CDEBUG(D_HA, "resend cross eviction\n");
+			RETURN(-EIO);
+		}
+	} else if (rc == 0) {
+		struct mdt_body *body;
+		struct lustre_capa *capa;
+
+		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+		LASSERT(body);
+		if (body->valid & OBD_MD_FLMDSCAPA) {
+			capa = req_capsule_server_get(&req->rq_pill,
+						      &RMF_CAPA1);
+			if (capa == NULL)
+				rc = -EPROTO;
+		}
+	}
+
+	*request = req; /* handed back to the caller, including on error */
+	RETURN(rc);
+}
+
+int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data,
+	       struct ptlrpc_request **request)
+{
+	LIST_HEAD(cancels);
+	struct obd_device *obd = class_exp2obd(exp);
+	struct ptlrpc_request *req = *request;
+	int count = 0, rc;
+	ENTRY;
+	/* Send an MDS_REINT unlink RPC; *request must be NULL on entry. */
+	LASSERT(req == NULL);
+	/* Collect locks into 'cancels'; mdc_prep_elc_req() takes the list. */
+	if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+	    (fid_is_sane(&op_data->op_fid1)) &&
+	    !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET))
+		count = mdc_resource_get_unused(exp, &op_data->op_fid1,
+						&cancels, LCK_EX,
+						MDS_INODELOCK_UPDATE);
+	if ((op_data->op_flags & MF_MDC_CANCEL_FID3) &&
+	    (fid_is_sane(&op_data->op_fid3)) &&
+	    !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET))
+		count += mdc_resource_get_unused(exp, &op_data->op_fid3,
+						 &cancels, LCK_EX,
+						 MDS_INODELOCK_FULL);
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_MDS_REINT_UNLINK);
+	if (req == NULL) {
+		ldlm_lock_list_put(&cancels, l_bl_ast, count);
+		RETURN(-ENOMEM);
+	}
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+	req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+			     op_data->op_namelen + 1);
+
+	rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	mdc_unlink_pack(req, op_data);
+
+	req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+			     obd->u.cli.cl_max_mds_easize);
+	req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_SERVER,
+			     obd->u.cli.cl_max_mds_cookiesize);
+	ptlrpc_request_set_replen(req);
+	/* *request is set before the RPC is sent, so it is valid on error too. */
+	*request = req;
+
+	rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL);
+	if (rc == -ERESTARTSYS)
+		rc = 0;
+	RETURN(rc);
+}
+
+int mdc_link(struct obd_export *exp, struct md_op_data *op_data,
+	     struct ptlrpc_request **request)
+{
+	LIST_HEAD(cancels);
+	struct obd_device *obd = exp->exp_obd;
+	struct ptlrpc_request *req;
+	int count = 0, rc;
+	ENTRY;
+	/* Send an MDS_REINT link RPC; once sent, the RPC is left in *request. */
+	if ((op_data->op_flags & MF_MDC_CANCEL_FID2) &&
+	    (fid_is_sane(&op_data->op_fid2)))
+		count = mdc_resource_get_unused(exp, &op_data->op_fid2,
+						&cancels, LCK_EX,
+						MDS_INODELOCK_UPDATE);
+	if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+	    (fid_is_sane(&op_data->op_fid1)))
+		count += mdc_resource_get_unused(exp, &op_data->op_fid1,
+						 &cancels, LCK_EX,
+						 MDS_INODELOCK_UPDATE);
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_REINT_LINK);
+	if (req == NULL) {
+		ldlm_lock_list_put(&cancels, l_bl_ast, count);
+		RETURN(-ENOMEM);
+	}
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+	mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa2);
+	req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+			     op_data->op_namelen + 1);
+
+	rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	mdc_link_pack(req, op_data);
+	ptlrpc_request_set_replen(req);
+
+	rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL);
+	*request = req;
+	if (rc == -ERESTARTSYS)
+		rc = 0;
+
+	RETURN(rc);
+}
+
+int mdc_rename(struct obd_export *exp, struct md_op_data *op_data,
+	       const char *old, int oldlen, const char *new, int newlen,
+	       struct ptlrpc_request **request)
+{
+	LIST_HEAD(cancels);
+	struct obd_device *obd = exp->exp_obd;
+	struct ptlrpc_request *req;
+	int count = 0, rc;
+	ENTRY;
+	/* Send an MDS_REINT rename RPC; once sent, the RPC is in *request. */
+	if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+	    (fid_is_sane(&op_data->op_fid1)))
+		count = mdc_resource_get_unused(exp, &op_data->op_fid1,
+						&cancels, LCK_EX,
+						MDS_INODELOCK_UPDATE);
+	if ((op_data->op_flags & MF_MDC_CANCEL_FID2) &&
+	    (fid_is_sane(&op_data->op_fid2)))
+		count += mdc_resource_get_unused(exp, &op_data->op_fid2,
+						 &cancels, LCK_EX,
+						 MDS_INODELOCK_UPDATE);
+	if ((op_data->op_flags & MF_MDC_CANCEL_FID3) &&
+	    (fid_is_sane(&op_data->op_fid3)))
+		count += mdc_resource_get_unused(exp, &op_data->op_fid3,
+						 &cancels, LCK_EX,
+						 MDS_INODELOCK_LOOKUP);
+	if ((op_data->op_flags & MF_MDC_CANCEL_FID4) &&
+	     (fid_is_sane(&op_data->op_fid4)))
+		count += mdc_resource_get_unused(exp, &op_data->op_fid4,
+						 &cancels, LCK_EX,
+						 MDS_INODELOCK_FULL);
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_MDS_REINT_RENAME);
+	if (req == NULL) {
+		ldlm_lock_list_put(&cancels, l_bl_ast, count);
+		RETURN(-ENOMEM);
+	}
+
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+	mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa2);
+	req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, oldlen + 1);
+	req_capsule_set_size(&req->rq_pill, &RMF_SYMTGT, RCL_CLIENT, newlen+1);
+
+	rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+	/* NOTE(review): req was NULL-checked above; "&& req" is redundant. */
+	if (exp_connect_cancelset(exp) && req)
+		ldlm_cli_cancel_list(&cancels, count, req, 0);
+
+	mdc_rename_pack(req, op_data, old, oldlen, new, newlen);
+
+	req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+			     obd->u.cli.cl_max_mds_easize);
+	req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_SERVER,
+			     obd->u.cli.cl_max_mds_cookiesize);
+	ptlrpc_request_set_replen(req);
+
+	rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL);
+	*request = req;
+	if (rc == -ERESTARTSYS)
+		rc = 0;
+
+	RETURN(rc);
+}
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_request.c b/drivers/staging/lustre/lustre/mdc/mdc_request.c
new file mode 100644
index 000000000000..88454bf75a71
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mdc/mdc_request.c
@@ -0,0 +1,2752 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_MDC
+
+# include <linux/module.h>
+# include <linux/pagemap.h>
+# include <linux/miscdevice.h>
+# include <linux/init.h>
+# include <linux/utsname.h>
+
+#include <lustre_acl.h>
+#include <obd_class.h>
+#include <lustre_fid.h>
+#include <lprocfs_status.h>
+#include <lustre_param.h>
+#include <lustre_log.h>
+
+#include "mdc_internal.h"
+
+#define REQUEST_MINOR 244
+
+struct mdc_renew_capa_args {
+	struct obd_capa	*ra_oc;	/* presumably the capa being renewed - confirm */
+	renew_capa_cb_t	 ra_cb;	/* presumably the completion callback - confirm */
+};
+
+static int mdc_cleanup(struct obd_device *obd);
+
+int mdc_unpack_capa(struct obd_export *exp, struct ptlrpc_request *req,
+		    const struct req_msg_field *field, struct obd_capa **oc)
+{
+	struct lustre_capa *wire_capa;
+	struct obd_capa *ocapa;
+	ENTRY;
+
+	/* Copy reply capability 'field' into a newly allocated obd_capa for
+	 * *oc. The reply was swabbed already in mdc_enqueue. */
+	wire_capa = req_capsule_server_get(&req->rq_pill, field);
+	if (!wire_capa)
+		RETURN(-EPROTO);
+
+	ocapa = alloc_capa(CAPA_SITE_CLIENT);
+	if (IS_ERR(ocapa)) {
+		CDEBUG(D_INFO, "alloc capa failed!\n");
+		RETURN(PTR_ERR(ocapa));
+	}
+	ocapa->c_capa = *wire_capa;
+	*oc = ocapa;
+	RETURN(0);
+}
+
+static inline int mdc_queue_wait(struct ptlrpc_request *req)
+{
+	struct client_obd *client = &req->rq_import->imp_obd->u.cli;
+	int status;
+
+	/* mdc_enter_request() ensures that this client has no more
+	 * than cl_max_rpcs_in_flight RPCs simultaneously in flight
+	 * against an MDT. */
+	status = mdc_enter_request(client);
+	if (status == 0) {
+		/* Admitted: send, wait for completion, then leave. */
+		status = ptlrpc_queue_wait(req);
+		mdc_exit_request(client);
+	}
+
+	return status;
+}
+
+/* Helper that implements most of mdc_getstatus and signal_completed_replay. */
+/* XXX this should become mdc_get_info("key"), sending MDS_GET_INFO RPC */
+static int send_getstatus(struct obd_import *imp, struct lu_fid *rootfid,
+			  struct obd_capa **pc, int level, int msg_flags)
+{
+	struct ptlrpc_request *req;
+	struct mdt_body       *body;
+	int		    rc;
+	ENTRY;
+	/* Send MDS_GETSTATUS and copy the reply's fid1 into *rootfid. */
+	req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_GETSTATUS,
+					LUSTRE_MDS_VERSION, MDS_GETSTATUS);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	mdc_pack_body(req, NULL, NULL, 0, 0, -1, 0);
+	lustre_msg_add_flags(req->rq_reqmsg, msg_flags);
+	req->rq_send_state = level;
+
+	ptlrpc_request_set_replen(req);
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		GOTO(out, rc);
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	if (body == NULL)
+		GOTO(out, rc = -EPROTO);
+	/* A capability is unpacked into *pc only when the reply flags one. */
+	if (body->valid & OBD_MD_FLMDSCAPA) {
+		rc = mdc_unpack_capa(NULL, req, &RMF_CAPA1, pc);
+		if (rc)
+			GOTO(out, rc);
+	}
+
+	*rootfid = body->fid1;
+	CDEBUG(D_NET,
+	       "root fid="DFID", last_committed="LPU64"\n",
+	       PFID(rootfid),
+	       lustre_msg_get_last_committed(req->rq_repmsg));
+	EXIT;
+out:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/* Fetch the root FID from the MDS; should become mdc_get_info("rootfid"). */
+int mdc_getstatus(struct obd_export *exp, struct lu_fid *rootfid,
+		  struct obd_capa **pc)
+{
+	struct obd_import *imp = class_exp2cliimp(exp);
+	return send_getstatus(imp, rootfid, pc, LUSTRE_IMP_FULL, 0);
+}
+
+/*
+ * This function is known to always declare that it will receive 4 buffers
+ * from the server. Even when acl_size and md_size are zero, the RPC header
+ * will contain 4 fields and the RPC itself will contain zero-size fields. This
+ * is because mdt_getattr*() _always_ returns 4 fields, but if the ACL is not
+ * needed and thus zero, it shrinks it, making it zero size. The same goes for
+ * md_size. And this is the cause of problems when the client waits for a
+ * smaller number of fields. This issue will be fixed later when the client
+ * becomes aware of RPC layouts.  --umka
+ */
+static int mdc_getattr_common(struct obd_export *exp,
+			      struct ptlrpc_request *req)
+{
+	struct req_capsule *pill = &req->rq_pill;
+	struct mdt_body    *body;
+	void	       *eadata;
+	int		 rc;
+	ENTRY;
+
+	/* Request message already built: send it and wait for the reply. */
+	rc = ptlrpc_queue_wait(req);
+	if (rc != 0)
+		RETURN(rc);
+
+	/* Sanity check for the reply: each buffer the body announces must
+	 * be present. */
+	body = req_capsule_server_get(pill, &RMF_MDT_BODY);
+	if (body == NULL)
+		RETURN(-EPROTO);
+
+	CDEBUG(D_NET, "mode: %o\n", body->mode);
+
+	if (body->eadatasize != 0) {
+		mdc_update_max_ea_from_body(exp, body);
+
+		eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD,
+						      body->eadatasize);
+		if (eadata == NULL)
+			RETURN(-EPROTO);
+	}
+
+	if (body->valid & OBD_MD_FLRMTPERM) {
+		struct mdt_remote_perm *perm;
+
+		LASSERT(client_is_remote(exp));
+		perm = req_capsule_server_swab_get(pill, &RMF_ACL,
+						lustre_swab_mdt_remote_perm);
+		if (perm == NULL)
+			RETURN(-EPROTO);
+	}
+
+	if (body->valid & OBD_MD_FLMDSCAPA) {
+		struct lustre_capa *capa;
+		capa = req_capsule_server_get(pill, &RMF_CAPA1);
+		if (capa == NULL)
+			RETURN(-EPROTO);
+	}
+
+	RETURN(0);
+}
+
+int mdc_getattr(struct obd_export *exp, struct md_op_data *op_data,
+		struct ptlrpc_request **request)
+{
+	struct ptlrpc_request *req;
+	int		    rc;
+	ENTRY;
+	/* Send an MDS_GETATTR RPC for op_fid1; on success *request is the RPC.
+	 * Single MDS without an LMV case: just report MDT index 0, no RPC. */
+	if (op_data->op_flags & MF_GET_MDT_IDX) {
+		op_data->op_mds = 0;
+		RETURN(0);
+	}
+	*request = NULL;
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1,
+		      op_data->op_valid, op_data->op_mode, -1, 0);
+	/* Note: op_mode is used here as the size of the reply MD buffer. */
+	req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+			     op_data->op_mode);
+	if (op_data->op_valid & OBD_MD_FLRMTPERM) {
+		LASSERT(client_is_remote(exp));
+		req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
+				     sizeof(struct mdt_remote_perm));
+	}
+	ptlrpc_request_set_replen(req);
+
+	rc = mdc_getattr_common(exp, req);
+	if (rc)
+		ptlrpc_req_finished(req);
+	else
+		*request = req;
+	RETURN(rc);
+}
+
+int mdc_getattr_name(struct obd_export *exp, struct md_op_data *op_data,
+		     struct ptlrpc_request **request)
+{
+	struct ptlrpc_request *req;
+	int		    rc;
+	ENTRY;
+	/* Send MDS_GETATTR_NAME for op_fid1/op_name; RPC returned on success. */
+	*request = NULL;
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_MDS_GETATTR_NAME);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+	req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+			     op_data->op_namelen + 1);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR_NAME);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1,
+		      op_data->op_valid, op_data->op_mode,
+		      op_data->op_suppgids[0], 0);
+	/* Copy op_namelen bytes; the buffer was sized op_namelen + 1. */
+	if (op_data->op_name) {
+		char *name = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+		LASSERT(strnlen(op_data->op_name, op_data->op_namelen) ==
+				op_data->op_namelen);
+		memcpy(name, op_data->op_name, op_data->op_namelen);
+	}
+
+	req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+			     op_data->op_mode);
+	ptlrpc_request_set_replen(req);
+
+	rc = mdc_getattr_common(exp, req);
+	if (rc)
+		ptlrpc_req_finished(req);
+	else
+		*request = req;
+	RETURN(rc);
+}
+
+static int mdc_is_subdir(struct obd_export *exp,
+			 const struct lu_fid *pfid,
+			 const struct lu_fid *cfid,
+			 struct ptlrpc_request **request)
+{
+	struct obd_import *imp = class_exp2cliimp(exp);
+	struct ptlrpc_request *rpc;
+	int status;
+	ENTRY;
+
+	/* Send MDS_IS_SUBDIR for (pfid, cfid). The request is handed back in
+	 * *request on success and on -EREMOTE; otherwise it is released. */
+	*request = NULL;
+	rpc = ptlrpc_request_alloc_pack(imp, &RQF_MDS_IS_SUBDIR,
+					LUSTRE_MDS_VERSION, MDS_IS_SUBDIR);
+	if (!rpc)
+		RETURN(-ENOMEM);
+
+	mdc_is_subdir_pack(rpc, pfid, cfid, 0);
+	ptlrpc_request_set_replen(rpc);
+	status = ptlrpc_queue_wait(rpc);
+	if (status == 0 || status == -EREMOTE)
+		*request = rpc;
+	else
+		ptlrpc_req_finished(rpc);
+	RETURN(status);
+}
+
+static int mdc_xattr_common(struct obd_export *exp,const struct req_format *fmt,
+			    const struct lu_fid *fid,
+			    struct obd_capa *oc, int opcode, obd_valid valid,
+			    const char *xattr_name, const char *input,
+			    int input_size, int output_size, int flags,
+			    __u32 suppgid, struct ptlrpc_request **request)
+{
+	struct ptlrpc_request *req;
+	int   xattr_namelen = 0;
+	char *tmp;
+	int   rc;
+	ENTRY;
+	/* Build and send an xattr RPC in format 'fmt' with opcode 'opcode'. */
+	*request = NULL;
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), fmt);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	mdc_set_capa_size(req, &RMF_CAPA1, oc);
+	if (xattr_name) {
+		xattr_namelen = strlen(xattr_name) + 1;
+		req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+				     xattr_namelen);
+	}
+	if (input_size) {
+		LASSERT(input);
+		req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT,
+				     input_size);
+	}
+
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, opcode);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+	/* MDS_REINT carries a setxattr reint record; others a plain body. */
+	if (opcode == MDS_REINT) {
+		struct mdt_rec_setxattr *rec;
+
+		CLASSERT(sizeof(struct mdt_rec_setxattr) ==
+			 sizeof(struct mdt_rec_reint));
+		rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+		rec->sx_opcode = REINT_SETXATTR;
+		/* TODO:
+		 *  cfs_curproc_fs{u,g}id() should replace
+		 *  current->fs{u,g}id for portability.
+		 */
+		rec->sx_fsuid  = current_fsuid();
+		rec->sx_fsgid  = current_fsgid();
+		rec->sx_cap    = cfs_curproc_cap_pack();
+		rec->sx_suppgid1 = suppgid;
+		rec->sx_suppgid2 = -1;
+		rec->sx_fid    = *fid;
+		rec->sx_valid  = valid | OBD_MD_FLCTIME;
+		rec->sx_time   = cfs_time_current_sec();
+		rec->sx_size   = output_size;
+		rec->sx_flags  = flags;
+
+		mdc_pack_capa(req, &RMF_CAPA1, oc);
+	} else {
+		mdc_pack_body(req, fid, oc, valid, output_size, suppgid, flags);
+	}
+
+	if (xattr_name) {
+		tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+		memcpy(tmp, xattr_name, xattr_namelen);
+	}
+	if (input_size) {
+		tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA);
+		memcpy(tmp, input, input_size);
+	}
+
+	if (req_capsule_has_field(&req->rq_pill, &RMF_EADATA, RCL_SERVER))
+		req_capsule_set_size(&req->rq_pill, &RMF_EADATA,
+				     RCL_SERVER, output_size);
+	ptlrpc_request_set_replen(req);
+
+	/* Send the RPC; MDS_REINT requests are sent under cl_rpc_lock. */
+	if (opcode == MDS_REINT)
+		mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+
+	rc = ptlrpc_queue_wait(req);
+
+	if (opcode == MDS_REINT)
+		mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+
+	if (rc)
+		ptlrpc_req_finished(req);
+	else
+		*request = req;
+	RETURN(rc);
+}
+
+int mdc_setxattr(struct obd_export *exp, const struct lu_fid *fid,
+		 struct obd_capa *oc, obd_valid valid, const char *xattr_name,
+		 const char *input, int input_size, int output_size,
+		 int flags, __u32 suppgid, struct ptlrpc_request **request)
+{
+	/* Set an xattr: go through the common helper as an MDS_REINT RPC. */
+	return mdc_xattr_common(exp, &RQF_MDS_REINT_SETXATTR, fid, oc,
+				MDS_REINT, valid, xattr_name, input, input_size,
+				output_size, flags, suppgid, request);
+}
+
+int mdc_getxattr(struct obd_export *exp, const struct lu_fid *fid,
+		 struct obd_capa *oc, obd_valid valid, const char *xattr_name,
+		 const char *input, int input_size, int output_size,
+		 int flags, struct ptlrpc_request **request)
+{
+	/* Get an xattr via the common helper; -1 is passed as suppgid. */
+	return mdc_xattr_common(exp, &RQF_MDS_GETXATTR, fid, oc, MDS_GETXATTR,
+				valid, xattr_name, input, input_size,
+				output_size, flags, -1, request);
+}
+
+#ifdef CONFIG_FS_POSIX_ACL
+static int mdc_unpack_acl(struct ptlrpc_request *req, struct lustre_md *md)
+{
+	struct req_capsule     *pill = &req->rq_pill;
+	struct mdt_body	*body = md->body;
+	struct posix_acl       *acl;
+	void		   *buf;
+	int		     rc;
+	ENTRY;
+	/* Parse and validate the reply ACL, then store it in md->posix_acl. */
+	if (!body->aclsize)
+		RETURN(0);
+
+	buf = req_capsule_server_sized_get(pill, &RMF_ACL, body->aclsize);
+
+	if (!buf)
+		RETURN(-EPROTO);
+
+	acl = posix_acl_from_xattr(&init_user_ns, buf, body->aclsize);
+	if (IS_ERR(acl)) {
+		rc = PTR_ERR(acl);
+		CERROR("convert xattr to acl: %d\n", rc);
+		RETURN(rc);
+	}
+
+	rc = posix_acl_valid(acl);
+	if (rc) {
+		CERROR("validate acl: %d\n", rc);
+		posix_acl_release(acl);
+		RETURN(rc);
+	}
+
+	md->posix_acl = acl;
+	RETURN(0);
+}
+#else
+#define mdc_unpack_acl(req, md) 0
+#endif
+
+int mdc_get_lustre_md(struct obd_export *exp, struct ptlrpc_request *req,
+		      struct obd_export *dt_exp, struct obd_export *md_exp,
+		      struct lustre_md *md)
+{
+	struct req_capsule *pill = &req->rq_pill;
+	int rc;
+	ENTRY;
+	/* Fill @md from the reply in @req: body, EA, ACL/perm and capas. */
+	LASSERT(md);
+	memset(md, 0, sizeof(*md));
+
+	md->body = req_capsule_server_get(pill, &RMF_MDT_BODY);
+	LASSERT(md->body != NULL);
+
+	if (md->body->valid & OBD_MD_FLEASIZE) {
+		int lmmsize;
+		struct lov_mds_md *lmm;
+
+		if (!S_ISREG(md->body->mode)) {
+			CDEBUG(D_INFO, "OBD_MD_FLEASIZE set, should be a "
+			       "regular file, but is not\n");
+			GOTO(out, rc = -EPROTO);
+		}
+
+		if (md->body->eadatasize == 0) {
+			CDEBUG(D_INFO, "OBD_MD_FLEASIZE set, "
+			       "but eadatasize 0\n");
+			GOTO(out, rc = -EPROTO);
+		}
+		lmmsize = md->body->eadatasize;
+		lmm = req_capsule_server_sized_get(pill, &RMF_MDT_MD, lmmsize);
+		if (!lmm)
+			GOTO(out, rc = -EPROTO);
+
+		rc = obd_unpackmd(dt_exp, &md->lsm, lmm, lmmsize);
+		if (rc < 0)
+			GOTO(out, rc);
+
+		if (rc < sizeof(*md->lsm)) {
+			CDEBUG(D_INFO, "lsm size too small: "
+			       "rc < sizeof (*md->lsm) (%d < %d)\n",
+			       rc, (int)sizeof(*md->lsm));
+			GOTO(out, rc = -EPROTO);
+		}
+
+	} else if (md->body->valid & OBD_MD_FLDIREA) {
+		int lmvsize;
+		struct lov_mds_md *lmv;
+
+		if (!S_ISDIR(md->body->mode)) {
+			CDEBUG(D_INFO, "OBD_MD_FLDIREA set, should be a "
+			       "directory, but is not\n");
+			GOTO(out, rc = -EPROTO);
+		}
+
+		if (md->body->eadatasize == 0) {
+			CDEBUG(D_INFO, "OBD_MD_FLDIREA is set, "
+			       "but eadatasize 0\n");
+			/* Leave through 'out' like every other error exit. */
+			GOTO(out, rc = -EPROTO);
+		}
+		if (md->body->valid & OBD_MD_MEA) {
+			lmvsize = md->body->eadatasize;
+			lmv = req_capsule_server_sized_get(pill, &RMF_MDT_MD,
+							   lmvsize);
+			if (!lmv)
+				GOTO(out, rc = -EPROTO);
+			rc = obd_unpackmd(md_exp, (void *)&md->mea, lmv,
+					  lmvsize);
+			if (rc < 0)
+				GOTO(out, rc);
+
+			if (rc < sizeof(*md->mea)) {
+				CDEBUG(D_INFO, "size too small:  "
+				       "rc < sizeof(*md->mea) (%d < %d)\n",
+					rc, (int)sizeof(*md->mea));
+				GOTO(out, rc = -EPROTO);
+			}
+		}
+	}
+	rc = 0;
+
+	if (md->body->valid & OBD_MD_FLRMTPERM) {
+		/* remote permission */
+		LASSERT(client_is_remote(exp));
+		md->remote_perm = req_capsule_server_swab_get(pill, &RMF_ACL,
+						lustre_swab_mdt_remote_perm);
+		if (!md->remote_perm)
+			GOTO(out, rc = -EPROTO);
+	} else if (md->body->valid & OBD_MD_FLACL) {
+		/* for ACL, it's possible that FLACL is set but aclsize is zero.
+		 * only when aclsize != 0 there's an actual segment for ACL
+		 * in reply buffer.
+		 */
+		if (md->body->aclsize) {
+			rc = mdc_unpack_acl(req, md);
+			if (rc)
+				GOTO(out, rc);
+#ifdef CONFIG_FS_POSIX_ACL
+		} else {
+			md->posix_acl = NULL;
+#endif
+		}
+	}
+	if (md->body->valid & OBD_MD_FLMDSCAPA) {
+		struct obd_capa *oc = NULL;
+
+		rc = mdc_unpack_capa(NULL, req, &RMF_CAPA1, &oc);
+		if (rc)
+			GOTO(out, rc);
+		md->mds_capa = oc;
+	}
+
+	if (md->body->valid & OBD_MD_FLOSSCAPA) {
+		struct obd_capa *oc = NULL;
+
+		rc = mdc_unpack_capa(NULL, req, &RMF_CAPA2, &oc);
+		if (rc)
+			GOTO(out, rc);
+		md->oss_capa = oc;
+	}
+	/* NOTE(review): md->mea is not freed on error below - possible leak? */
+	EXIT;
+out:
+	if (rc) {
+		if (md->oss_capa) {
+			capa_put(md->oss_capa);
+			md->oss_capa = NULL;
+		}
+		if (md->mds_capa) {
+			capa_put(md->mds_capa);
+			md->mds_capa = NULL;
+		}
+#ifdef CONFIG_FS_POSIX_ACL
+		posix_acl_release(md->posix_acl);
+		md->posix_acl = NULL;	/* do not leave a dangling pointer */
+#endif
+		if (md->lsm)
+			obd_free_memmd(dt_exp, &md->lsm);
+	}
+	return rc;
+}
+
+int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md)
+{
+	ENTRY;		/* no-op: nothing in @md is released here */
+	RETURN(0);
+}
+
+/**
+ * Replay callback for both OPEN and SETATTR RPCs (OPEN-CLOSE and
+ * SETATTR-DONE_WRITING RPC chains): propagates the replayed file handle.
+ */
+void mdc_replay_open(struct ptlrpc_request *req)
+{
+	struct md_open_data *mod = req->rq_cb_data;
+	struct ptlrpc_request *close_req;
+	struct obd_client_handle *och;
+	struct lustre_handle old;
+	struct mdt_body *body;
+	ENTRY;
+
+	if (mod == NULL) {
+		DEBUG_REQ(D_ERROR, req,
+			  "Can't properly replay without open data.");
+		EXIT;
+		return;
+	}
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	LASSERT(body != NULL);
+	/* Update the client open handle, remembering its old value in 'old'. */
+	och = mod->mod_och;
+	if (och != NULL) {
+		struct lustre_handle *file_fh;
+
+		LASSERT(och->och_magic == OBD_CLIENT_HANDLE_MAGIC);
+
+		file_fh = &och->och_fh;
+		CDEBUG(D_HA, "updating handle from "LPX64" to "LPX64"\n",
+		       file_fh->cookie, body->handle.cookie);
+		old = *file_fh;
+		*file_fh = body->handle;
+	}
+	close_req = mod->mod_close_req;
+	if (close_req != NULL) {
+		__u32 opc = lustre_msg_get_opc(close_req->rq_reqmsg);
+		struct mdt_ioepoch *epoch;
+
+		LASSERT(opc == MDS_CLOSE || opc == MDS_DONE_WRITING);
+		epoch = req_capsule_client_get(&close_req->rq_pill,
+					       &RMF_MDT_EPOCH);
+		LASSERT(epoch);
+		/* 'old' is only set, and only compared, when och != NULL. */
+		if (och != NULL)
+			LASSERT(!memcmp(&old, &epoch->handle, sizeof(old)));
+		DEBUG_REQ(D_HA, close_req, "updating close body with new fh");
+		epoch->handle = body->handle;
+	}
+	EXIT;
+}
+
+void mdc_commit_open(struct ptlrpc_request *req)
+{
+	struct md_open_data *mod = req->rq_cb_data;
+	if (mod == NULL)
+		return;
+	/* Commit callback for opens; set by mdc_set_open_replay_data(). */
+	/**
+	 * No need to touch md_open_data::mod_och, it holds a reference on
+	 * \var mod and will zero references to each other, \var mod will be
+	 * freed after that when md_open_data::mod_och will put the reference.
+	 */
+
+	/**
+	 * Do not let the open request disappear, as it may still be needed
+	 * for the close RPC to happen (this may happen on evict only, otherwise
+	 * ptlrpc_request::rq_replay does not let mdc_commit_open() be
+	 * called); just mark this RPC as committed to distinguish these 2
+	 * cases, see mdc_close() for details. The open request reference will
+	 * be put along with freeing \var mod.
+	 */
+	ptlrpc_request_addref(req);
+	spin_lock(&req->rq_lock);
+	req->rq_committed = 1;
+	spin_unlock(&req->rq_lock);
+	req->rq_cb_data = NULL;
+	obd_mod_put(mod);
+}
+
+int mdc_set_open_replay_data(struct obd_export *exp,
+			     struct obd_client_handle *och,
+			     struct ptlrpc_request *open_req)
+{
+	struct md_open_data   *mod;
+	struct mdt_rec_create *rec;
+	struct mdt_body       *body;
+	struct obd_import     *imp = open_req->rq_import;
+	ENTRY;
+	/* Set up replay state on an open request; no-op unless rq_replay. */
+	if (!open_req->rq_replay)
+		RETURN(0);
+
+	rec = req_capsule_client_get(&open_req->rq_pill, &RMF_REC_REINT);
+	body = req_capsule_server_get(&open_req->rq_pill, &RMF_MDT_BODY);
+	LASSERT(rec != NULL);
+	/* Incoming message in my byte order (it's been swabbed). */
+	/* Outgoing messages always in my byte order. */
+	LASSERT(body != NULL);
+
+	/* Only if the import is replayable, we set replay_open data */
+	if (och && imp->imp_replayable) {
+		mod = obd_mod_alloc();
+		if (mod == NULL) {
+			DEBUG_REQ(D_ERROR, open_req,
+				  "Can't allocate md_open_data");
+			RETURN(0);
+		}
+
+		/**
+		 * Take a reference on \var mod, to be freed on mdc_close().
+		 * It protects \var mod from being freed on eviction (commit
+		 * callback is called despite rq_replay flag).
+		 * Another reference for \var och.
+		 */
+		obd_mod_get(mod);
+		obd_mod_get(mod);
+
+		spin_lock(&open_req->rq_lock);
+		och->och_mod = mod;
+		mod->mod_och = och;
+		mod->mod_open_req = open_req;
+		open_req->rq_cb_data = mod;
+		open_req->rq_commit_cb = mdc_commit_open;
+		spin_unlock(&open_req->rq_lock);
+	}
+	/* Copy reply values into the saved request record used for replay. */
+	rec->cr_fid2 = body->fid1;
+	rec->cr_ioepoch = body->ioepoch;
+	rec->cr_old_handle.cookie = body->handle.cookie;
+	open_req->rq_replay_cb = mdc_replay_open;
+	if (!fid_is_sane(&body->fid1)) {
+		DEBUG_REQ(D_ERROR, open_req, "Saving replay request with "
+			  "insane fid");
+		LBUG();
+	}
+
+	DEBUG_REQ(D_RPCTRACE, open_req, "Set up open replay data");
+	RETURN(0);
+}
+
+int mdc_clear_open_replay_data(struct obd_export *exp,
+			       struct obd_client_handle *och)
+{
+	struct md_open_data *open_data = och->och_mod;
+	ENTRY;
+
+	/**
+	 * Unlink \a och from its open replay data and put one reference.
+	 * There may be no replay data at all in a case of eviction between
+	 * lookup and ll_file_open(); then there is nothing to undo.
+	 **/
+	if (open_data != NULL) {
+		LASSERT(open_data != LP_POISON);
+
+		open_data->mod_och = NULL;
+		och->och_mod = NULL;
+		obd_mod_put(open_data);
+	}
+
+	RETURN(0);
+}
+
+/* On -EAGAIN, prepare the close request for replay based on its reply. */
+static void mdc_close_handle_reply(struct ptlrpc_request *req,
+				   struct md_op_data *op_data, int rc)
+{
+	struct mdt_ioepoch *ioepoch;
+	struct mdt_body *reply;
+
+	if (req == NULL || rc != -EAGAIN)
+		return;
+	reply = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	ioepoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH);
+	ioepoch->flags |= MF_SOM_AU;
+	if (reply->valid & OBD_MD_FLGETATTRLOCK)
+		op_data->op_flags |= MF_GETATTR_LOCK;
+}
+
+int mdc_close(struct obd_export *exp, struct md_op_data *op_data,
+	      struct md_open_data *mod, struct ptlrpc_request **request)
+{
+	struct obd_device     *obd = class_exp2obd(exp);
+	struct ptlrpc_request *req;
+	int		    rc;
+	ENTRY;
+	/* Send an MDS_CLOSE RPC; once sent, the request is left in *request. */
+	*request = NULL;
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_CLOSE);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_CLOSE);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	/* To avoid a livelock (bug 7034), we need to send CLOSE RPCs to a
+	 * portal whose threads are not taking any DLM locks and are therefore
+	 * always progressing */
+	req->rq_request_portal = MDS_READPAGE_PORTAL;
+	ptlrpc_at_set_req_timeout(req);
+
+	/* Ensure that this close's handle is fixed up during replay. */
+	if (likely(mod != NULL)) {
+		LASSERTF(mod->mod_open_req != NULL &&
+			 mod->mod_open_req->rq_type != LI_POISON,
+			 "POISONED open %p!\n", mod->mod_open_req);
+
+		mod->mod_close_req = req;
+
+		DEBUG_REQ(D_HA, mod->mod_open_req, "matched open");
+		/* We no longer want to preserve this open for replay even
+		 * though the open was committed. b=3632, b=3633 */
+		spin_lock(&mod->mod_open_req->rq_lock);
+		mod->mod_open_req->rq_replay = 0;
+		spin_unlock(&mod->mod_open_req->rq_lock);
+	} else {
+		 CDEBUG(D_HA, "couldn't find open req; expecting close error\n");
+	}
+
+	mdc_close_pack(req, op_data);
+
+	req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+			     obd->u.cli.cl_max_mds_easize);
+	req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_SERVER,
+			     obd->u.cli.cl_max_mds_cookiesize);
+
+	ptlrpc_request_set_replen(req);
+
+	mdc_get_rpc_lock(obd->u.cli.cl_close_lock, NULL);
+	rc = ptlrpc_queue_wait(req);
+	mdc_put_rpc_lock(obd->u.cli.cl_close_lock, NULL);
+
+	if (req->rq_repmsg == NULL) {
+		CDEBUG(D_RPCTRACE, "request failed to send: %p, %d\n", req,
+		       req->rq_status);
+		if (rc == 0)
+			rc = req->rq_status ?: -EIO;
+	} else if (rc == 0 || rc == -EAGAIN) {
+		struct mdt_body *body;
+
+		rc = lustre_msg_get_status(req->rq_repmsg);
+		if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) {
+			DEBUG_REQ(D_ERROR, req, "type == PTL_RPC_MSG_ERR, err "
+				  "= %d", rc);
+			if (rc > 0)
+				rc = -rc;
+		}
+		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+		if (body == NULL)
+			rc = -EPROTO;
+	} else if (rc == -ESTALE) {
+		/**
+		 * -ESTALE is allowed after bug 3633 if the open was committed
+		 * and the server failed before the close was sent. If mod
+		 * exists and its open was committed, return no error.
+		 */
+		if (mod) {
+			DEBUG_REQ(D_HA, req, "Reset ESTALE = %d", rc);
+			LASSERT(mod->mod_open_req != NULL);
+			if (mod->mod_open_req->rq_committed)
+				rc = 0;
+		}
+	}
+
+	if (mod) {
+		if (rc != 0)
+			mod->mod_close_req = NULL;
+		/* Since now, mod is accessed through open_req only,
+		 * thus close req does not keep a reference on mod anymore. */
+		obd_mod_put(mod);
+	}
+	*request = req;
+	mdc_close_handle_reply(req, op_data, rc);
+	RETURN(rc);
+}
+
+/*
+ * Send an MDS_DONE_WRITING request for @op_data and wait for the reply.
+ *
+ * When @mod is supplied, the request is recorded as its close request, the
+ * matching open request is taken out of replay, and the reference on @mod
+ * is dropped before returning.  -ESTALE from the server is turned into
+ * success when the open request was already committed.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int mdc_done_writing(struct obd_export *exp, struct md_op_data *op_data,
+		     struct md_open_data *mod)
+{
+	struct obd_device     *obd = class_exp2obd(exp);
+	struct ptlrpc_request *req;
+	int		       rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_MDS_DONE_WRITING);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_DONE_WRITING);
+	if (rc != 0) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	if (mod != NULL) {
+		struct ptlrpc_request *open_req = mod->mod_open_req;
+
+		LASSERTF(open_req != NULL &&
+			 open_req->rq_type != LI_POISON,
+			 "POISONED setattr %p!\n", open_req);
+
+		mod->mod_close_req = req;
+		DEBUG_REQ(D_HA, open_req, "matched setattr");
+		/* The setattr must not be kept for replay any longer, even
+		 * though the open was committed. b=3632, b=3633 */
+		spin_lock(&open_req->rq_lock);
+		open_req->rq_replay = 0;
+		spin_unlock(&open_req->rq_lock);
+	}
+
+	mdc_close_pack(req, op_data);
+	ptlrpc_request_set_replen(req);
+
+	/* the RPC is issued with the close lock held */
+	mdc_get_rpc_lock(obd->u.cli.cl_close_lock, NULL);
+	rc = ptlrpc_queue_wait(req);
+	mdc_put_rpc_lock(obd->u.cli.cl_close_lock, NULL);
+
+	if (mod != NULL) {
+		/**
+		 * -ESTALE is an allowed error after 3633 if open or setattr
+		 * were committed and the server failed before close was sent;
+		 * report success in that case.
+		 */
+		if (rc == -ESTALE) {
+			LASSERT(mod->mod_open_req != NULL);
+			if (mod->mod_open_req->rq_committed)
+				rc = 0;
+		}
+
+		if (rc != 0)
+			mod->mod_close_req = NULL;
+		/* From now on mod is reached through the setattr req only,
+		 * so the DW req gives up its reference on mod. */
+		obd_mod_put(mod);
+	}
+
+	mdc_close_handle_reply(req, op_data, rc);
+	ptlrpc_req_finished(req);
+	RETURN(rc);
+}
+
+
+/*
+ * Read directory pages with an MDS_READPAGE bulk RPC.
+ *
+ * @pages supplies op_data->op_npages pages that are attached to the bulk
+ * descriptor.  On success the finished request is handed back through
+ * @request; on any failure *request stays NULL.  A timed-out RPC is
+ * re-sent after a delay that grows with the number of attempts, until
+ * client_should_resend() says to give up (-EIO).
+ */
+int mdc_readpage(struct obd_export *exp, struct md_op_data *op_data,
+		 struct page **pages, struct ptlrpc_request **request)
+{
+	struct ptlrpc_request   *req;
+	struct ptlrpc_bulk_desc *bulk;
+	struct l_wait_info	 lwi;
+	wait_queue_head_t	 resend_waitq;
+	int			 nr_resends = 0;
+	int			 idx;
+	int			 rc;
+	ENTRY;
+
+	*request = NULL;
+	init_waitqueue_head(&resend_waitq);
+
+restart_bulk:
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_READPAGE);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_READPAGE);
+	if (rc != 0) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	req->rq_request_portal = MDS_READPAGE_PORTAL;
+	ptlrpc_at_set_req_timeout(req);
+
+	bulk = ptlrpc_prep_bulk_imp(req, op_data->op_npages, 1, BULK_PUT_SINK,
+				    MDS_BULK_PORTAL);
+	if (bulk == NULL) {
+		ptlrpc_request_free(req);
+		RETURN(-ENOMEM);
+	}
+
+	/* NB req now owns the descriptor and frees it along with itself */
+	idx = 0;
+	while (idx < op_data->op_npages) {
+		ptlrpc_prep_bulk_page_pin(bulk, pages[idx], 0,
+					  PAGE_CACHE_SIZE);
+		idx++;
+	}
+
+	mdc_readdir_pack(req, op_data->op_offset,
+			 PAGE_CACHE_SIZE * op_data->op_npages,
+			 &op_data->op_fid1, op_data->op_capa1);
+
+	ptlrpc_request_set_replen(req);
+	rc = ptlrpc_queue_wait(req);
+	if (rc != 0) {
+		ptlrpc_req_finished(req);
+		/* only a timeout is worth another attempt */
+		if (rc != -ETIMEDOUT)
+			RETURN(rc);
+
+		nr_resends++;
+		if (!client_should_resend(nr_resends, &exp->exp_obd->u.cli)) {
+			CERROR("too many resend retries, returning error\n");
+			RETURN(-EIO);
+		}
+		/* nothing wakes this queue; the wait ends on timeout */
+		lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(nr_resends), NULL, NULL, NULL);
+		l_wait_event(resend_waitq, 0, &lwi);
+
+		goto restart_bulk;
+	}
+
+	rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk,
+					  req->rq_bulk->bd_nob_transferred);
+	if (rc < 0) {
+		ptlrpc_req_finished(req);
+		RETURN(rc);
+	}
+
+	/* the transferred byte count must be a multiple of the LU page size */
+	if (req->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK) {
+		CERROR("Unexpected # bytes transferred: %d (%ld expected)\n",
+			req->rq_bulk->bd_nob_transferred,
+			PAGE_CACHE_SIZE * op_data->op_npages);
+		ptlrpc_req_finished(req);
+		RETURN(-EPROTO);
+	}
+
+	*request = req;
+	RETURN(0);
+}
+
+/*
+ * Fetch filesystem statistics from the MDS into @osfs.
+ *
+ * A reference on the client import is taken under cl_sem and dropped before
+ * returning; -ENODEV is returned when there is no import.  With
+ * OBD_STATFS_NODELAY in @flags the request is marked no-resend/no-delay.
+ * @env and @max_age are not used here.
+ */
+static int mdc_statfs(const struct lu_env *env,
+		      struct obd_export *exp, struct obd_statfs *osfs,
+		      __u64 max_age, __u32 flags)
+{
+	struct obd_device     *obd = class_exp2obd(exp);
+	struct client_obd     *cli = &obd->u.cli;
+	struct obd_import     *imp = NULL;
+	struct obd_statfs     *reply_sfs;
+	struct ptlrpc_request *req;
+	int		       rc;
+	ENTRY;
+
+	/*
+	 * The request may also originate from lprocfs, so this has to be
+	 * synchronised with client_disconnect_export (Bug15684).
+	 */
+	down_read(&cli->cl_sem);
+	if (cli->cl_import)
+		imp = class_import_get(cli->cl_import);
+	up_read(&cli->cl_sem);
+	if (imp == NULL)
+		RETURN(-ENODEV);
+
+	req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_STATFS,
+					LUSTRE_MDS_VERSION, MDS_STATFS);
+	if (req == NULL)
+		GOTO(output, rc = -ENOMEM);
+
+	ptlrpc_request_set_replen(req);
+
+	if (flags & OBD_STATFS_NODELAY) {
+		/* procfs requests must not sit waiting, to avoid deadlock */
+		req->rq_no_resend = 1;
+		req->rq_no_delay = 1;
+	}
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc != 0) {
+		/* a recorded connection error takes precedence */
+		if (imp->imp_connect_error)
+			rc = imp->imp_connect_error;
+		GOTO(out, rc);
+	}
+
+	reply_sfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
+	if (reply_sfs == NULL)
+		GOTO(out, rc = -EPROTO);
+
+	*osfs = *reply_sfs;
+	EXIT;
+out:
+	ptlrpc_req_finished(req);
+output:
+	class_import_put(imp);
+	return rc;
+}
+
+/*
+ * Resolve a FID to a path through obd_get_info().
+ *
+ * The lookup key is KEY_FID2PATH (padded with cfs_size_round()) followed by
+ * a copy of @gf; the result is written back into @gf.  gf_pathlen must lie
+ * in [2, PATH_MAX].  Returns 0 or -EREMOTE as reported by obd_get_info(),
+ * otherwise a negative errno.
+ */
+static int mdc_ioc_fid2path(struct obd_export *exp, struct getinfo_fid2path *gf)
+{
+	__u32 keylen, vallen, gf_off;
+	void *key;
+	int rc;
+
+	if (gf->gf_pathlen > PATH_MAX)
+		RETURN(-ENAMETOOLONG);
+	if (gf->gf_pathlen < 2)
+		RETURN(-EOVERFLOW);
+
+	/* Key layout: rounded KEY_FID2PATH, then the getinfo_fid2path */
+	gf_off = cfs_size_round(sizeof(KEY_FID2PATH));
+	keylen = gf_off + sizeof(*gf);
+	OBD_ALLOC(key, keylen);
+	if (key == NULL)
+		RETURN(-ENOMEM);
+	memcpy(key, KEY_FID2PATH, sizeof(KEY_FID2PATH));
+	memcpy(key + gf_off, gf, sizeof(*gf));
+
+	CDEBUG(D_IOCTL, "path get "DFID" from "LPU64" #%d\n",
+	       PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno);
+
+	if (!fid_is_sane(&gf->gf_fid))
+		GOTO(out, rc = -EINVAL);
+
+	/* The value is the getinfo_fid2path result followed by the path */
+	vallen = sizeof(*gf) + gf->gf_pathlen;
+
+	rc = obd_get_info(NULL, exp, keylen, key, &vallen, gf, NULL);
+	if (rc != 0 && rc != -EREMOTE)
+		GOTO(out, rc);
+
+	/* the reply must carry some path bytes and fit the caller's buffer */
+	if (vallen <= sizeof(*gf))
+		GOTO(out, rc = -EPROTO);
+	if (vallen > sizeof(*gf) + gf->gf_pathlen)
+		GOTO(out, rc = -EOVERFLOW);
+
+	CDEBUG(D_IOCTL, "path get "DFID" from "LPU64" #%d\n%s\n",
+	       PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno, gf->gf_path);
+
+out:
+	OBD_FREE(key, keylen);
+	return rc;
+}
+
+/*
+ * Send the HSM progress report @hpk to the MDS with an MDS_HSM_PROGRESS RPC.
+ * Returns the result of mdc_queue_wait() or a negative errno.
+ */
+static int mdc_ioc_hsm_progress(struct obd_export *exp,
+				struct hsm_progress_kernel *hpk)
+{
+	struct obd_import		*imp = class_exp2cliimp(exp);
+	struct ptlrpc_request		*req;
+	struct hsm_progress_kernel	*wire_hpk;
+	int				 rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_PROGRESS,
+					LUSTRE_MDS_VERSION, MDS_HSM_PROGRESS);
+	if (req == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0);
+
+	/* Place the caller's progress record into the request buffer */
+	wire_hpk = req_capsule_client_get(&req->rq_pill,
+					  &RMF_MDS_HSM_PROGRESS);
+	if (wire_hpk == NULL)
+		GOTO(out, rc = -EPROTO);
+
+	*wire_hpk = *hpk;
+
+	ptlrpc_request_set_replen(req);
+
+	rc = mdc_queue_wait(req);
+	GOTO(out, rc);
+out:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/*
+ * Register a copytool with the MDS through an MDS_HSM_CT_REGISTER RPC,
+ * sending @archives as the archive mask.
+ * Returns the result of mdc_queue_wait() or a negative errno.
+ */
+static int mdc_ioc_hsm_ct_register(struct obd_import *imp, __u32 archives)
+{
+	struct ptlrpc_request	*req;
+	__u32			*wire_mask;
+	int			 rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_REGISTER,
+					LUSTRE_MDS_VERSION,
+					MDS_HSM_CT_REGISTER);
+	if (req == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0);
+
+	/* Store the archive mask in the request buffer */
+	wire_mask = req_capsule_client_get(&req->rq_pill,
+					   &RMF_MDS_HSM_ARCHIVE);
+	if (wire_mask == NULL)
+		GOTO(out, rc = -EPROTO);
+
+	*wire_mask = archives;
+
+	ptlrpc_request_set_replen(req);
+
+	rc = mdc_queue_wait(req);
+	GOTO(out, rc);
+out:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/*
+ * Query the current HSM action for op_data->op_fid1 with an MDS_HSM_ACTION
+ * RPC and copy the reply into the hsm_current_action that
+ * op_data->op_data points to.  Returns 0 or a negative errno.
+ */
+static int mdc_ioc_hsm_current_action(struct obd_export *exp,
+				      struct md_op_data *op_data)
+{
+	struct hsm_current_action	*hca = op_data->op_data;
+	struct hsm_current_action	*reply_hca;
+	struct ptlrpc_request		*req;
+	int				 rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_MDS_HSM_ACTION);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_ACTION);
+	if (rc != 0) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1,
+		      OBD_MD_FLRMTPERM, 0, op_data->op_suppgids[0], 0);
+
+	ptlrpc_request_set_replen(req);
+
+	rc = mdc_queue_wait(req);
+	if (rc != 0)
+		GOTO(out, rc);
+
+	reply_hca = req_capsule_server_get(&req->rq_pill,
+					   &RMF_MDS_HSM_CURRENT_ACTION);
+	if (reply_hca == NULL)
+		GOTO(out, rc = -EPROTO);
+
+	/* hand the server's answer back to the caller */
+	*hca = *reply_hca;
+
+	EXIT;
+out:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/*
+ * Unregister a copytool from the MDS through an MDS_HSM_CT_UNREGISTER RPC.
+ * Returns the result of mdc_queue_wait() or -ENOMEM.
+ */
+static int mdc_ioc_hsm_ct_unregister(struct obd_import *imp)
+{
+	struct ptlrpc_request	*req;
+	int			 rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_UNREGISTER,
+					LUSTRE_MDS_VERSION,
+					MDS_HSM_CT_UNREGISTER);
+	if (req == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	/* the request carries only the generic body */
+	mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0);
+	ptlrpc_request_set_replen(req);
+
+	rc = mdc_queue_wait(req);
+	GOTO(out, rc);
+out:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/*
+ * Fetch the HSM state of op_data->op_fid1 with an MDS_HSM_STATE_GET RPC and
+ * copy the reply into the hsm_user_state that op_data->op_data points to.
+ * Returns 0 or a negative errno.
+ */
+static int mdc_ioc_hsm_state_get(struct obd_export *exp,
+				 struct md_op_data *op_data)
+{
+	struct hsm_user_state	*hus = op_data->op_data;
+	struct hsm_user_state	*reply_hus;
+	struct ptlrpc_request	*req;
+	int			 rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_MDS_HSM_STATE_GET);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_GET);
+	if (rc != 0) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1,
+		      OBD_MD_FLRMTPERM, 0, op_data->op_suppgids[0], 0);
+
+	ptlrpc_request_set_replen(req);
+
+	rc = mdc_queue_wait(req);
+	if (rc != 0)
+		GOTO(out, rc);
+
+	reply_hus = req_capsule_server_get(&req->rq_pill, &RMF_HSM_USER_STATE);
+	if (reply_hus == NULL)
+		GOTO(out, rc = -EPROTO);
+
+	/* hand the server's answer back to the caller */
+	*hus = *reply_hus;
+
+	EXIT;
+out:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/*
+ * Change the HSM state of op_data->op_fid1 with an MDS_HSM_STATE_SET RPC.
+ * The hsm_state_set that op_data->op_data points to is copied into the
+ * request.  Returns the result of mdc_queue_wait() or a negative errno.
+ */
+static int mdc_ioc_hsm_state_set(struct obd_export *exp,
+				 struct md_op_data *op_data)
+{
+	struct hsm_state_set	*hss = op_data->op_data;
+	struct hsm_state_set	*req_hss;
+	struct ptlrpc_request	*req;
+	int			 rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_MDS_HSM_STATE_SET);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_SET);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1,
+		      OBD_MD_FLRMTPERM, 0, op_data->op_suppgids[0], 0);
+
+	/* Copy states */
+	req_hss = req_capsule_client_get(&req->rq_pill, &RMF_HSM_STATE_SET);
+	if (req_hss == NULL)
+		GOTO(out, rc = -EPROTO);
+	*req_hss = *hss;
+
+	ptlrpc_request_set_replen(req);
+
+	rc = mdc_queue_wait(req);
+	/* GOTO() already traces the exit; no separate EXIT is reachable */
+	GOTO(out, rc);
+
+out:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/*
+ * Send an HSM request to the MDS with an MDS_HSM_REQUEST RPC.
+ *
+ * The request carries three pieces taken from @hur: the hsm_request header,
+ * hr_itemcount hsm_user_item entries, and hr_data_len bytes of opaque data.
+ * Returns the result of mdc_queue_wait() or a negative errno.
+ */
+static int mdc_ioc_hsm_request(struct obd_export *exp,
+			       struct hsm_user_request *hur)
+{
+	struct obd_import	*imp = class_exp2cliimp(exp);
+	struct ptlrpc_request	*req;
+	struct hsm_request	*wire_hr;
+	struct hsm_user_item	*wire_items;
+	char			*wire_data;
+	int			 rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(imp, &RQF_MDS_HSM_REQUEST);
+	if (req == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	/* size the two variable-length fields before packing */
+	req_capsule_set_size(&req->rq_pill, &RMF_MDS_HSM_USER_ITEM, RCL_CLIENT,
+			     hur->hur_request.hr_itemcount
+			     * sizeof(struct hsm_user_item));
+	req_capsule_set_size(&req->rq_pill, &RMF_GENERIC_DATA, RCL_CLIENT,
+			     hur->hur_request.hr_data_len);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_REQUEST);
+	if (rc != 0) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0);
+
+	/* 1: the hsm_request header */
+	wire_hr = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_REQUEST);
+	if (wire_hr == NULL)
+		GOTO(out, rc = -EPROTO);
+	*wire_hr = hur->hur_request;
+
+	/* 2: the array of hsm_user_item */
+	wire_items = req_capsule_client_get(&req->rq_pill,
+					    &RMF_MDS_HSM_USER_ITEM);
+	if (wire_items == NULL)
+		GOTO(out, rc = -EPROTO);
+	memcpy(wire_items, hur->hur_user_item,
+	       hur->hur_request.hr_itemcount * sizeof(struct hsm_user_item));
+
+	/* 3: the opaque data */
+	wire_data = req_capsule_client_get(&req->rq_pill, &RMF_GENERIC_DATA);
+	if (wire_data == NULL)
+		GOTO(out, rc = -EPROTO);
+	memcpy(wire_data, hur_data(hur), hur->hur_request.hr_data_len);
+
+	ptlrpc_request_set_replen(req);
+
+	rc = mdc_queue_wait(req);
+	GOTO(out, rc);
+
+out:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/*
+ * Fill in a changelog KUC header at the start of @buf and return it.
+ * @len is stored as the message length and must not exceed CR_MAXSIZE;
+ * the message type is preset to CL_RECORD.
+ */
+static struct kuc_hdr *changelog_kuc_hdr(char *buf, int len, int flags)
+{
+	struct kuc_hdr *hdr;
+
+	LASSERT(len <= CR_MAXSIZE);
+
+	hdr = (struct kuc_hdr *)buf;
+	hdr->kuc_magic     = KUC_MAGIC;
+	hdr->kuc_transport = KUC_TRANSPORT_CHANGELOG;
+	hdr->kuc_flags     = flags;
+	hdr->kuc_msgtype   = CL_RECORD;
+	hdr->kuc_msglen    = len;
+
+	return hdr;
+}
+
+/* Debug mask handed to CDEBUG() for the changelog messages in this file. */
+#define D_CHANGELOG 0
+
+/*
+ * State shared between mdc_ioc_changelog_send(), which allocates and fills
+ * it, and mdc_changelog_send_thread(), which consumes and frees it.
+ */
+struct changelog_show {
+	/* records with cr_index below this value are skipped */
+	__u64		cs_startrec;
+	/* flags stored in the KUC header of every message sent */
+	__u32		cs_flags;
+	/* file obtained with fget(); messages are put to it */
+	struct file	*cs_fp;
+	/* CR_MAXSIZE scratch buffer in which each message is built */
+	char		*cs_buf;
+	/* device whose LLOG_CHANGELOG_REPL_CTXT llog context is read */
+	struct obd_device *cs_obd;
+};
+
+/*
+ * llog callback that forwards one changelog record to the file in @data
+ * (a struct changelog_show).
+ *
+ * Records that are not changelog records are rejected with -EINVAL; records
+ * with an index below cs_startrec are skipped (return 0).  Anything else is
+ * wrapped in a KUC header inside cs_buf and passed to
+ * libcfs_kkuc_msg_put(), whose result is returned.
+ */
+static int changelog_show_cb(const struct lu_env *env, struct llog_handle *llh,
+			     struct llog_rec_hdr *hdr, void *data)
+{
+	struct llog_changelog_rec *rec = (struct llog_changelog_rec *)hdr;
+	struct changelog_show *cs = data;
+	struct kuc_hdr *msg;
+	int len;
+	int rc;
+	ENTRY;
+
+	if ((rec->cr_hdr.lrh_type != CHANGELOG_REC) ||
+	    (rec->cr.cr_type >= CL_LAST)) {
+		CERROR("Not a changelog rec %d/%d\n", rec->cr_hdr.lrh_type,
+		       rec->cr.cr_type);
+		RETURN(-EINVAL);
+	}
+
+	/* Entries older than the requested start record are of no interest */
+	if (rec->cr.cr_index < cs->cs_startrec) {
+		CDEBUG(D_CHANGELOG, "rec="LPU64" start="LPU64"\n",
+		       rec->cr.cr_index, cs->cs_startrec);
+		RETURN(0);
+	}
+
+	CDEBUG(D_CHANGELOG, LPU64" %02d%-5s "LPU64" 0x%x t="DFID" p="DFID
+		" %.*s\n", rec->cr.cr_index, rec->cr.cr_type,
+		changelog_type2str(rec->cr.cr_type), rec->cr.cr_time,
+		rec->cr.cr_flags & CLF_FLAGMASK,
+		PFID(&rec->cr.cr_tfid), PFID(&rec->cr.cr_pfid),
+		rec->cr.cr_namelen, changelog_rec_name(&rec->cr));
+
+	/* total message length: KUC header + record + name */
+	len = sizeof(*msg) + changelog_rec_size(&rec->cr) + rec->cr.cr_namelen;
+
+	/* Build the message: header first, record payload right behind it */
+	msg = changelog_kuc_hdr(cs->cs_buf, len, cs->cs_flags);
+	memcpy(msg + 1, &rec->cr, len - sizeof(*msg));
+
+	rc = libcfs_kkuc_msg_put(cs->cs_fp, msg);
+	CDEBUG(D_CHANGELOG, "kucmsg fp %p len %d rc %d\n", cs->cs_fp, len,rc);
+
+	RETURN(rc);
+}
+
+/*
+ * Thread body that streams changelog records to cs_fp.
+ *
+ * @csdata is a struct changelog_show.  The thread opens the changelog
+ * catalog, runs changelog_show_cb() over it, then sends a CL_EOF message
+ * whatever the processing result was.  On exit it drops the file reference
+ * and frees both cs_buf and the changelog_show itself.
+ */
+static int mdc_changelog_send_thread(void *csdata)
+{
+	struct changelog_show *cs = csdata;
+	struct llog_handle *cat = NULL;
+	struct llog_ctxt *llog_ctx = NULL;
+	struct kuc_hdr *eof;
+	int rc;
+
+	CDEBUG(D_CHANGELOG, "changelog to fp=%p start "LPU64"\n",
+	       cs->cs_fp, cs->cs_startrec);
+
+	OBD_ALLOC(cs->cs_buf, CR_MAXSIZE);
+	if (cs->cs_buf == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	/* Get at the remote catalog */
+	llog_ctx = llog_get_context(cs->cs_obd, LLOG_CHANGELOG_REPL_CTXT);
+	if (llog_ctx == NULL)
+		GOTO(out, rc = -ENOENT);
+
+	rc = llog_open(NULL, llog_ctx, &cat, NULL, CHANGELOG_CATALOG,
+		       LLOG_OPEN_EXISTS);
+	if (rc != 0) {
+		CERROR("%s: fail to open changelog catalog: rc = %d\n",
+		       cs->cs_obd->obd_name, rc);
+		GOTO(out, rc);
+	}
+
+	rc = llog_init_handle(NULL, cat, LLOG_F_IS_CAT, NULL);
+	if (rc != 0) {
+		CERROR("llog_init_handle failed %d\n", rc);
+		GOTO(out, rc);
+	}
+
+	rc = llog_cat_process(NULL, cat, changelog_show_cb, cs, 0, 0);
+
+	/* EOF goes out regardless of how processing went */
+	eof = changelog_kuc_hdr(cs->cs_buf, sizeof(*eof), cs->cs_flags);
+	if (eof) {
+		eof->kuc_msgtype = CL_EOF;
+		libcfs_kkuc_msg_put(cs->cs_fp, eof);
+	}
+
+out:
+	fput(cs->cs_fp);
+	if (cat)
+		llog_cat_close(NULL, cat);
+	if (llog_ctx)
+		llog_ctxt_put(llog_ctx);
+	if (cs->cs_buf)
+		OBD_FREE(cs->cs_buf, CR_MAXSIZE);
+	OBD_FREE_PTR(cs);
+	return rc;
+}
+
+/*
+ * Start a kernel thread that streams changelog records, beginning at
+ * icc->icc_recno, to the file descriptor icc->icc_id.
+ *
+ * Returns 0 once the thread is started, -ENOMEM if the thread state cannot
+ * be allocated, -EBADF if icc->icc_id is not an open descriptor, or the
+ * kthread_run() error.  After a successful start the thread owns both the
+ * changelog_show and the file reference.
+ */
+static int mdc_ioc_changelog_send(struct obd_device *obd,
+				  struct ioc_changelog *icc)
+{
+	struct changelog_show *cs;
+	struct task_struct *task;
+	int rc;
+
+	/* Freed in mdc_changelog_send_thread */
+	OBD_ALLOC_PTR(cs);
+	if (!cs)
+		return -ENOMEM;
+
+	cs->cs_obd = obd;
+	cs->cs_startrec = icc->icc_recno;
+	cs->cs_flags = icc->icc_flags;
+	/* matching fput in mdc_changelog_send_thread */
+	cs->cs_fp = fget(icc->icc_id);
+	if (cs->cs_fp == NULL) {
+		/* the thread would otherwise fput() a NULL file */
+		OBD_FREE_PTR(cs);
+		return -EBADF;
+	}
+
+	/*
+	 * New thread because we should return to user app before
+	 * writing into our pipe
+	 */
+	task = kthread_run(mdc_changelog_send_thread, cs,
+			   "mdc_clg_send_thread");
+	if (!IS_ERR(task)) {
+		CDEBUG(D_CHANGELOG, "start changelog thread\n");
+		return 0;
+	}
+
+	/* test the pointer itself: squeezing it into an int first can
+	 * make a valid task pointer look like an errno */
+	rc = PTR_ERR(task);
+	CERROR("Failed to start changelog thread: %d\n", rc);
+	/* no thread exists to release these, so do it here */
+	fput(cs->cs_fp);
+	OBD_FREE_PTR(cs);
+	return rc;
+}
+
+/* Forward declaration: mdc_iocontrol() calls this ahead of its definition. */
+static int mdc_ioc_hsm_ct_start(struct obd_export *exp,
+				struct lustre_kernelcomm *lk);
+
+/*
+ * Send an MDS_QUOTACHECK RPC carrying @oqctl.
+ *
+ * cl_qchk_stat is set to -ENODATA before the RPC is sent and replaced by
+ * the RPC result if sending fails.  Returns the ptlrpc_queue_wait() result
+ * or -ENOMEM.
+ */
+static int mdc_quotacheck(struct obd_device *unused, struct obd_export *exp,
+			  struct obd_quotactl *oqctl)
+{
+	struct client_obd       *cli = &exp->exp_obd->u.cli;
+	struct obd_quotactl     *wire_qctl;
+	struct ptlrpc_request   *req;
+	int			 rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+					&RQF_MDS_QUOTACHECK, LUSTRE_MDS_VERSION,
+					MDS_QUOTACHECK);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	wire_qctl = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+	*wire_qctl = *oqctl;
+
+	ptlrpc_request_set_replen(req);
+
+	/* a poll issued from now on sees -ENODATA, meaning quotacheck is
+	 * in progress */
+	cli->cl_qchk_stat = -ENODATA;
+	rc = ptlrpc_queue_wait(req);
+	if (rc != 0)
+		cli->cl_qchk_stat = rc;
+	ptlrpc_req_finished(req);
+	RETURN(rc);
+}
+
+/*
+ * Report the recorded quotacheck status.
+ *
+ * Fills @qchk with the target UUID and the MDS type name, then returns
+ * cl_qchk_stat, with CL_NOT_QUOTACHECKED mapped to -EINTR.
+ */
+static int mdc_quota_poll_check(struct obd_export *exp,
+				struct if_quotacheck *qchk)
+{
+	struct client_obd *cli = &exp->exp_obd->u.cli;
+	int status;
+	ENTRY;
+
+	qchk->obd_uuid = cli->cl_target_uuid;
+	memcpy(qchk->obd_type, LUSTRE_MDS_NAME, strlen(LUSTRE_MDS_NAME));
+
+	status = cli->cl_qchk_stat;
+	/* this client is not the one that started the check */
+	if (status == CL_NOT_QUOTACHECKED)
+		status = -EINTR;
+	RETURN(status);
+}
+
+/*
+ * Send an MDS_QUOTACTL RPC carrying @oqctl; the request is never resent.
+ *
+ * If a reply message exists and holds an obd_quotactl, it is copied back
+ * into @oqctl even when the RPC reported an error.  If the RPC succeeded
+ * but no obd_quotactl could be obtained, -EPROTO is returned.
+ */
+static int mdc_quotactl(struct obd_device *unused, struct obd_export *exp,
+			struct obd_quotactl *oqctl)
+{
+	struct ptlrpc_request   *req;
+	struct obd_quotactl     *wire_qctl;
+	struct obd_quotactl     *reply_qctl = NULL;
+	int			 rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+					&RQF_MDS_QUOTACTL, LUSTRE_MDS_VERSION,
+					MDS_QUOTACTL);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	wire_qctl = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+	*wire_qctl = *oqctl;
+
+	ptlrpc_request_set_replen(req);
+	ptlrpc_at_set_req_timeout(req);
+	req->rq_no_resend = 1;
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc != 0)
+		CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc);
+
+	/* the reply is only looked at when a reply message exists */
+	if (req->rq_repmsg)
+		reply_qctl = req_capsule_server_get(&req->rq_pill,
+						    &RMF_OBD_QUOTACTL);
+
+	if (reply_qctl) {
+		*oqctl = *reply_qctl;
+	} else if (rc == 0) {
+		CERROR ("Can't unpack obd_quotactl\n");
+		rc = -EPROTO;
+	}
+	ptlrpc_req_finished(req);
+
+	RETURN(rc);
+}
+
+/*
+ * Ask the MDT to swap the layouts of op_data->op_fid1 and op_data->op_fid2
+ * with an MDS_SWAP_LAYOUTS RPC.  The mdc_swap_layouts that
+ * op_data->op_data points to is copied into the request.
+ * Returns the ptlrpc_queue_wait() result or a negative errno.
+ */
+static int mdc_ioc_swap_layouts(struct obd_export *exp,
+				struct md_op_data *op_data)
+{
+	LIST_HEAD(cancels);
+	struct mdc_swap_layouts *swap_args = op_data->op_data;
+	struct mdc_swap_layouts *wire_args;
+	struct ptlrpc_request	*req;
+	int			 nr_cancel;
+	int			 rc;
+	ENTRY;
+
+	/* On receiving MDS_SWAP_LAYOUTS the MDT first cancels the two
+	 * layout locks held by this client.  Cancelling them here and
+	 * sending the cancels along with the request saves extra RPC
+	 * round trips.
+	 */
+	nr_cancel = mdc_resource_get_unused(exp, &op_data->op_fid1, &cancels,
+					    LCK_CR, MDS_INODELOCK_LAYOUT);
+	nr_cancel += mdc_resource_get_unused(exp, &op_data->op_fid2, &cancels,
+					     LCK_CR, MDS_INODELOCK_LAYOUT);
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_MDS_SWAP_LAYOUTS);
+	if (req == NULL) {
+		ldlm_lock_list_put(&cancels, l_bl_ast, nr_cancel);
+		RETURN(-ENOMEM);
+	}
+
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+	mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa2);
+
+	rc = mdc_prep_elc_req(exp, req, MDS_SWAP_LAYOUTS, &cancels, nr_cancel);
+	if (rc != 0) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	mdc_swap_layouts_pack(req, op_data);
+
+	wire_args = req_capsule_client_get(&req->rq_pill, &RMF_SWAP_LAYOUTS);
+	LASSERT(wire_args);
+
+	*wire_args = *swap_args;
+
+	ptlrpc_request_set_replen(req);
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc != 0)
+		GOTO(out, rc);
+	EXIT;
+
+out:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/*
+ * Dispatch an ioctl issued against an MDC export.
+ *
+ * @cmd  selects the operation, @karg is the kernel-side argument whose type
+ * depends on @cmd, and @uarg is the user-space pointer used by
+ * LL_IOC_GET_CONNECT_FLAGS.  @len is not used here.
+ *
+ * A module reference is held for the whole call, so every path taken after
+ * try_module_get() succeeds must leave through the "out" label.
+ *
+ * Returns the result of the selected handler, -EINVAL if the module
+ * reference cannot be taken, or -ENOTTY for an unrecognised @cmd.
+ */
+static int mdc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
+			 void *karg, void *uarg)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct obd_ioctl_data *data = karg;
+	struct obd_import *imp = obd->u.cli.cl_import;
+	struct llog_ctxt *ctxt;
+	int rc;
+	ENTRY;
+
+	if (!try_module_get(THIS_MODULE)) {
+		CERROR("Can't get module. Is it alive?");
+		return -EINVAL;
+	}
+	switch (cmd) {
+	case OBD_IOC_CHANGELOG_SEND:
+		rc = mdc_ioc_changelog_send(obd, karg);
+		GOTO(out, rc);
+	case OBD_IOC_CHANGELOG_CLEAR: {
+		struct ioc_changelog *icc = karg;
+		struct changelog_setinfo cs =
+			{.cs_recno = icc->icc_recno, .cs_id = icc->icc_id};
+		rc = obd_set_info_async(NULL, exp, strlen(KEY_CHANGELOG_CLEAR),
+					KEY_CHANGELOG_CLEAR, sizeof(cs), &cs,
+					NULL);
+		GOTO(out, rc);
+	}
+	case OBD_IOC_FID2PATH:
+		rc = mdc_ioc_fid2path(exp, karg);
+		GOTO(out, rc);
+	case LL_IOC_HSM_CT_START:
+		rc = mdc_ioc_hsm_ct_start(exp, karg);
+		GOTO(out, rc);
+	case LL_IOC_HSM_PROGRESS:
+		rc = mdc_ioc_hsm_progress(exp, karg);
+		GOTO(out, rc);
+	case LL_IOC_HSM_STATE_GET:
+		rc = mdc_ioc_hsm_state_get(exp, karg);
+		GOTO(out, rc);
+	case LL_IOC_HSM_STATE_SET:
+		rc = mdc_ioc_hsm_state_set(exp, karg);
+		/* must not fall through into LL_IOC_HSM_ACTION, which would
+		 * send a second RPC and overwrite rc */
+		GOTO(out, rc);
+	case LL_IOC_HSM_ACTION:
+		rc = mdc_ioc_hsm_current_action(exp, karg);
+		GOTO(out, rc);
+	case LL_IOC_HSM_REQUEST:
+		rc = mdc_ioc_hsm_request(exp, karg);
+		GOTO(out, rc);
+	case OBD_IOC_CLIENT_RECOVER:
+		rc = ptlrpc_recover_import(imp, data->ioc_inlbuf1, 0);
+		if (rc < 0)
+			GOTO(out, rc);
+		/* a positive result is reported as success */
+		GOTO(out, rc = 0);
+	case IOC_OSC_SET_ACTIVE:
+		rc = ptlrpc_set_import_active(imp, data->ioc_offset);
+		GOTO(out, rc);
+	case OBD_IOC_PARSE: {
+		ctxt = llog_get_context(exp->exp_obd, LLOG_CONFIG_REPL_CTXT);
+		rc = class_config_parse_llog(NULL, ctxt, data->ioc_inlbuf1,
+					     NULL);
+		llog_ctxt_put(ctxt);
+		GOTO(out, rc);
+	}
+	case OBD_IOC_LLOG_INFO:
+	case OBD_IOC_LLOG_PRINT: {
+		ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT);
+		rc = llog_ioctl(NULL, ctxt, cmd, data);
+		llog_ctxt_put(ctxt);
+		GOTO(out, rc);
+	}
+	case OBD_IOC_POLL_QUOTACHECK:
+		rc = mdc_quota_poll_check(exp, (struct if_quotacheck *)karg);
+		GOTO(out, rc);
+	case OBD_IOC_PING_TARGET:
+		rc = ptlrpc_obd_ping(obd);
+		GOTO(out, rc);
+	/*
+	 * Normally IOC_OBD_STATFS, OBD_IOC_QUOTACTL iocontrol are handled by
+	 * LMV instead of MDC. But when the cluster is upgraded from 1.8,
+	 * there'd be no LMV layer thus we might be called here. Eventually
+	 * this code should be removed.
+	 * bz20731, LU-592.
+	 */
+	case IOC_OBD_STATFS: {
+		struct obd_statfs stat_buf = {0};
+
+		/* only index 0 is accepted here */
+		if (*((__u32 *) data->ioc_inlbuf2) != 0)
+			GOTO(out, rc = -ENODEV);
+
+		/* copy UUID */
+		if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(obd),
+				     min((int) data->ioc_plen2,
+					 (int) sizeof(struct obd_uuid))))
+			GOTO(out, rc = -EFAULT);
+
+		rc = mdc_statfs(NULL, obd->obd_self_export, &stat_buf,
+				cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+				0);
+		if (rc != 0)
+			GOTO(out, rc);
+
+		if (copy_to_user(data->ioc_pbuf1, &stat_buf,
+				     min((int) data->ioc_plen1,
+					 (int) sizeof(stat_buf))))
+			GOTO(out, rc = -EFAULT);
+
+		GOTO(out, rc = 0);
+	}
+	case OBD_IOC_QUOTACTL: {
+		struct if_quotactl *qctl = karg;
+		struct obd_quotactl *oqctl;
+
+		OBD_ALLOC_PTR(oqctl);
+		if (!oqctl)
+			/* leave through "out" so the module reference
+			 * taken above is released */
+			GOTO(out, rc = -ENOMEM);
+
+		QCTL_COPY(oqctl, qctl);
+		rc = obd_quotactl(exp, oqctl);
+		if (rc == 0) {
+			QCTL_COPY(qctl, oqctl);
+			qctl->qc_valid = QC_MDTIDX;
+			qctl->obd_uuid = obd->u.cli.cl_target_uuid;
+		}
+		OBD_FREE_PTR(oqctl);
+		break;
+	}
+	case LL_IOC_GET_CONNECT_FLAGS: {
+		if (copy_to_user(uarg,
+				     exp_connect_flags_ptr(exp),
+				     sizeof(__u64)))
+			GOTO(out, rc = -EFAULT);
+		else
+			GOTO(out, rc = 0);
+	}
+	case LL_IOC_LOV_SWAP_LAYOUTS: {
+		rc = mdc_ioc_swap_layouts(exp, karg);
+		break;
+	}
+	default:
+		CERROR("mdc_ioctl(): unrecognised ioctl %#x\n", cmd);
+		GOTO(out, rc = -ENOTTY);
+	}
+out:
+	module_put(THIS_MODULE);
+
+	return rc;
+}
+
+/*
+ * Fetch a value from the MDS with an MDS_GET_INFO RPC.
+ *
+ * The request carries @key (@keylen bytes) and @vallen; on success, or on
+ * -EREMOTE, @vallen bytes of the reply are copied into @val.  A
+ * KEY_FID2PATH value is byte-swapped when the reply needs swabbing.
+ * Returns the ptlrpc_queue_wait() result or a negative errno.
+ */
+int mdc_get_info_rpc(struct obd_export *exp,
+		     obd_count keylen, void *key,
+		     int vallen, void *val)
+{
+	struct obd_import      *imp = class_exp2cliimp(exp);
+	struct ptlrpc_request  *req;
+	char		       *buf;
+	int			rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(imp, &RQF_MDS_GET_INFO);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_KEY,
+			     RCL_CLIENT, keylen);
+	req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VALLEN,
+			     RCL_CLIENT, sizeof(__u32));
+
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GET_INFO);
+	if (rc != 0) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	/* request: the key, then the value length the caller expects */
+	buf = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_KEY);
+	memcpy(buf, key, keylen);
+	buf = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_VALLEN);
+	memcpy(buf, &vallen, sizeof(__u32));
+
+	req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VAL,
+			     RCL_SERVER, vallen);
+	ptlrpc_request_set_replen(req);
+
+	rc = ptlrpc_queue_wait(req);
+	/* -EREMOTE signals a partial get_info result that has to be
+	 * continued on another MDT, see fid2path part in lmv_iocontrol */
+	if (rc == 0 || rc == -EREMOTE) {
+		buf = req_capsule_server_get(&req->rq_pill, &RMF_GETINFO_VAL);
+		memcpy(val, buf, vallen);
+		if (ptlrpc_rep_need_swab(req) && KEY_IS(KEY_FID2PATH))
+			lustre_swab_fid2path(val);
+	}
+	ptlrpc_req_finished(req);
+
+	RETURN(rc);
+}
+
+/* Byte-swap every field of one HSM action item in place. */
+static void lustre_swab_hai(struct hsm_action_item *h)
+{
+	/* 32-bit fields */
+	__swab32s(&h->hai_len);
+	__swab32s(&h->hai_action);
+
+	/* FIDs */
+	lustre_swab_lu_fid(&h->hai_fid);
+	lustre_swab_lu_fid(&h->hai_dfid);
+
+	/* 64-bit fields */
+	__swab64s(&h->hai_cookie);
+	__swab64s(&h->hai_extent.offset);
+	__swab64s(&h->hai_extent.length);
+	__swab64s(&h->hai_gid);
+}
+
+/*
+ * Byte-swap an HSM action list in place: the list header first, then each
+ * of the hal_count action items that follow it.
+ */
+static void lustre_swab_hal(struct hsm_action_list *h)
+{
+	struct hsm_action_item	*item;
+	int			 idx;
+
+	__swab32s(&h->hal_version);
+	__swab32s(&h->hal_count);
+	__swab32s(&h->hal_archive_id);
+	__swab64s(&h->hal_flags);
+
+	/* hal_count is in host order by now and bounds the walk */
+	for (idx = 0, item = hai_zero(h); idx < h->hal_count; idx++) {
+		lustre_swab_hai(item);
+		item = hai_next(item);
+	}
+}
+
+/* Byte-swap the 16-bit fields of a KUC message header in place. */
+static void lustre_swab_kuch(struct kuc_hdr *l)
+{
+	/* kuc_transport is a __u8 and needs no swabbing */
+	__swab16s(&l->kuc_magic);
+	__swab16s(&l->kuc_msgtype);
+	__swab16s(&l->kuc_msglen);
+}
+
+/*
+ * Start or stop a copytool described by @lk.
+ *
+ * With LK_FLG_STOP set, the copytool is removed from its KUC group and, if
+ * that worked, unregistered from the coordinator.  Otherwise the file
+ * behind lk_wfd is added to the KUC group and, if that worked, the
+ * copytool is registered with lk_data as its archive value.
+ * Only the KUC_GRP_HSM group is accepted (-EINVAL otherwise).
+ */
+static int mdc_ioc_hsm_ct_start(struct obd_export *exp,
+				struct lustre_kernelcomm *lk)
+{
+	struct obd_import  *imp = class_exp2cliimp(exp);
+	__u32		    archive = lk->lk_data;
+	struct file	   *fp;
+	int		    rc;
+
+	if (lk->lk_group != KUC_GRP_HSM) {
+		CERROR("Bad copytool group %d\n", lk->lk_group);
+		return -EINVAL;
+	}
+
+	CDEBUG(D_HSM, "CT start r%d w%d u%d g%d f%#x\n", lk->lk_rfd, lk->lk_wfd,
+	       lk->lk_uid, lk->lk_group, lk->lk_flags);
+
+	if (lk->lk_flags & LK_FLG_STOP) {
+		rc = libcfs_kkuc_group_rem(lk->lk_uid, lk->lk_group);
+		if (rc != 0)
+			return rc;
+		/* Unregister with the coordinator */
+		return mdc_ioc_hsm_ct_unregister(imp);
+	}
+
+	fp = fget(lk->lk_wfd);
+	rc = libcfs_kkuc_group_add(fp, lk->lk_uid, lk->lk_group,
+				   lk->lk_data);
+	if (rc != 0) {
+		/* the group did not take the file; drop our reference */
+		if (fp)
+			fput(fp);
+		return rc;
+	}
+
+	return mdc_ioc_hsm_ct_register(imp, archive);
+}
+
+/**
+ * Broadcast a KUC message to the copytools listening on KUC_GRP_HSM.
+ * A message whose magic is byte-swapped is converted in place first.
+ * @param len total length of message
+ * @param val KUC message (kuc_hdr + hsm_action_list)
+ * @return result of libcfs_kkuc_group_put(), or -EPROTO when the message
+ *	   is too short or carries an unknown magic
+ */
+static int mdc_hsm_copytool_send(int len, void *val)
+{
+	struct kuc_hdr		*hdr = (struct kuc_hdr *)val;
+	struct hsm_action_list	*actions = (struct hsm_action_list *)(hdr + 1);
+	int			 rc;
+	ENTRY;
+
+	if (len < sizeof(*hdr) + sizeof(*actions)) {
+		CERROR("Short HSM message %d < %d\n", len,
+		       (int) (sizeof(*hdr) + sizeof(*actions)));
+		RETURN(-EPROTO);
+	}
+
+	if (hdr->kuc_magic == __swab16(KUC_MAGIC)) {
+		/* sender has the opposite byte order */
+		lustre_swab_kuch(hdr);
+		lustre_swab_hal(actions);
+	} else if (hdr->kuc_magic != KUC_MAGIC) {
+		CERROR("Bad magic %x!=%x\n", hdr->kuc_magic, KUC_MAGIC);
+		RETURN(-EPROTO);
+	}
+
+	CDEBUG(D_HSM, " Received message mg=%x t=%d m=%d l=%d actions=%d "
+	       "on %s\n",
+	       hdr->kuc_magic, hdr->kuc_transport, hdr->kuc_msgtype,
+	       hdr->kuc_msglen, actions->hal_count, actions->hal_fsname);
+
+	/* Hand the message to every HSM listener */
+	rc = libcfs_kkuc_group_put(KUC_GRP_HSM, hdr);
+
+	RETURN(rc);
+}
+
+/**
+ * Callback handed to kuc to re-register one HSM copytool running on MDC,
+ * after MDT shutdown/recovery.
+ * @param data archive id served by the copytool
+ * @param cb_arg callback argument (obd_import)
+ * @return 0 on success or when the copytool is already registered
+ *	   (-EEXIST), otherwise the registration error
+ */
+static int mdc_hsm_ct_reregister(__u32 data, void *cb_arg)
+{
+	struct obd_import	*imp = (struct obd_import *)cb_arg;
+	__u32			 archive = data;
+	int			 rc;
+
+	CDEBUG(D_HA, "recover copytool registration to MDT (archive=%#x)\n",
+	       archive);
+	rc = mdc_ioc_hsm_ct_register(imp, archive);
+
+	/* an already registered copytool is not an error */
+	if (rc == -EEXIST)
+		return 0;
+
+	return rc;
+}
+
+/**
+ * Re-establish all kuc contexts with MDT
+ * after MDT shutdown/recovery.
+ * @param imp import passed to the per-copytool callback
+ * @return result of libcfs_kkuc_group_foreach()
+ */
+static int mdc_kuc_reregister(struct obd_import *imp)
+{
+	int rc;
+
+	/* run the re-registration callback for each HSM agent */
+	rc = libcfs_kkuc_group_foreach(KUC_GRP_HSM, mdc_hsm_ct_reregister,
+				       (void *)imp);
+	return rc;
+}
+
+/*
+ * Handle a set_info request on an MDC export, selected by @key:
+ *
+ *  KEY_READ_ONLY         set or clear OBD_CONNECT_RDONLY on the import,
+ *                        then forward the key to the MDS
+ *  KEY_SPTLRPC_CONF      call sptlrpc_conf_client_adapt()
+ *  KEY_FLUSH_CTX         call sptlrpc_import_flush_my_ctx()
+ *  KEY_MDS_CONN          mark the import as an mds-mds import
+ *  KEY_CHANGELOG_CLEAR   forward the key to the MDS
+ *  KEY_HSM_COPYTOOL_SEND pass @val to mdc_hsm_copytool_send()
+ *
+ * Any other key is rejected with -EINVAL.  @env is not used here.
+ */
+int mdc_set_info_async(const struct lu_env *env,
+		       struct obd_export *exp,
+		       obd_count keylen, void *key,
+		       obd_count vallen, void *val,
+		       struct ptlrpc_request_set *set)
+{
+	struct obd_import	*imp = class_exp2cliimp(exp);
+	int			 rc;
+	ENTRY;
+
+	if (KEY_IS(KEY_READ_ONLY)) {
+		if (vallen != sizeof(int))
+			RETURN(-EINVAL);
+
+		/* keep both copies of the connect flags in step */
+		spin_lock(&imp->imp_lock);
+		if (*((int *)val) != 0) {
+			imp->imp_connect_flags_orig |= OBD_CONNECT_RDONLY;
+			imp->imp_connect_data.ocd_connect_flags |=
+							OBD_CONNECT_RDONLY;
+		} else {
+			imp->imp_connect_flags_orig &= ~OBD_CONNECT_RDONLY;
+			imp->imp_connect_data.ocd_connect_flags &=
+							~OBD_CONNECT_RDONLY;
+		}
+		spin_unlock(&imp->imp_lock);
+
+		rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION,
+				       keylen, key, vallen, val, set);
+		RETURN(rc);
+	}
+
+	if (KEY_IS(KEY_SPTLRPC_CONF)) {
+		sptlrpc_conf_client_adapt(exp->exp_obd);
+		RETURN(0);
+	}
+
+	if (KEY_IS(KEY_FLUSH_CTX)) {
+		sptlrpc_import_flush_my_ctx(imp);
+		RETURN(0);
+	}
+
+	if (KEY_IS(KEY_MDS_CONN)) {
+		/* mds-mds import */
+		spin_lock(&imp->imp_lock);
+		imp->imp_server_timeout = 1;
+		spin_unlock(&imp->imp_lock);
+		imp->imp_client->cli_request_portal = MDS_MDS_PORTAL;
+		CDEBUG(D_OTHER, "%s: timeout / 2\n", exp->exp_obd->obd_name);
+		RETURN(0);
+	}
+
+	if (KEY_IS(KEY_CHANGELOG_CLEAR)) {
+		rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION,
+				       keylen, key, vallen, val, set);
+		RETURN(rc);
+	}
+
+	if (KEY_IS(KEY_HSM_COPYTOOL_SEND)) {
+		rc = mdc_hsm_copytool_send(vallen, val);
+		RETURN(rc);
+	}
+
+	CERROR("Unknown key %s\n", (char *)key);
+	RETURN(-EINVAL);
+}
+
+/*
+ * Handle a get_info request on an MDC export, selected by @key:
+ *
+ *  KEY_MAX_EASIZE  raise cl_max_mds_easize to the int in @val if that is
+ *                  larger, then write the resulting maximum back to @val
+ *  KEY_CONN_DATA   copy the import's connect data into @val
+ *  KEY_TGT_COUNT   store 1 in @val
+ *
+ * Every other key is sent to the MDS through mdc_get_info_rpc().
+ * @env and @lsm are not used here.
+ */
+int mdc_get_info(const struct lu_env *env, struct obd_export *exp,
+		 __u32 keylen, void *key, __u32 *vallen, void *val,
+		 struct lov_stripe_md *lsm)
+{
+	int rc;
+
+	if (KEY_IS(KEY_MAX_EASIZE)) {
+		struct client_obd *cli = &exp->exp_obd->u.cli;
+		int *easize = val;
+		int wanted;
+
+		if (*vallen != sizeof(int))
+			RETURN(-EINVAL);
+
+		wanted = *easize;
+		if (wanted > cli->cl_max_mds_easize)
+			cli->cl_max_mds_easize = wanted;
+		*easize = cli->cl_max_mds_easize;
+		RETURN(0);
+	}
+
+	if (KEY_IS(KEY_CONN_DATA)) {
+		struct obd_import *imp = class_exp2cliimp(exp);
+		struct obd_connect_data *ocd = val;
+
+		if (*vallen != sizeof(*ocd))
+			RETURN(-EINVAL);
+
+		*ocd = imp->imp_connect_data;
+		RETURN(0);
+	}
+
+	if (KEY_IS(KEY_TGT_COUNT)) {
+		*((int *)val) = 1;
+		RETURN(0);
+	}
+
+	/* not answered locally: ask the MDS */
+	rc = mdc_get_info_rpc(exp, keylen, key, *vallen, val);
+
+	RETURN(rc);
+}
+
+static int mdc_pin(struct obd_export *exp, const struct lu_fid *fid,
+		   struct obd_capa *oc, struct obd_client_handle *handle,
+		   int flags)
+{
+	struct ptlrpc_request *req;
+	struct mdt_body       *body;
+	int		    rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_PIN);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	mdc_set_capa_size(req, &RMF_CAPA1, oc);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_PIN);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	mdc_pack_body(req, fid, oc, 0, 0, -1, flags);
+
+	ptlrpc_request_set_replen(req);
+	/* cl_rpc_lock is held across the ptlrpc_queue_wait() call */
+	mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+	rc = ptlrpc_queue_wait(req);
+	mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+	if (rc) {
+		CERROR("Pin failed: %d\n", rc);
+		GOTO(err_out, rc);
+	}
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	if (body == NULL)
+		GOTO(err_out, rc = -EPROTO);
+	/* fill in the caller's handle from the returned mdt_body */
+	handle->och_fh = body->handle;
+	handle->och_magic = OBD_CLIENT_HANDLE_MAGIC;
+
+	handle->och_mod = obd_mod_alloc();
+	if (handle->och_mod == NULL) {
+		DEBUG_REQ(D_ERROR, req, "can't allocate md_open_data");
+		GOTO(err_out, rc = -ENOMEM);
+	}
+	handle->och_mod->mod_open_req = req; /* will be dropped by unpin */
+
+	RETURN(0);
+
+err_out:
+	ptlrpc_req_finished(req);
+	RETURN(rc);
+}
+
+static int mdc_unpin(struct obd_export *exp, struct obd_client_handle *handle,
+		     int flag)
+{
+	struct ptlrpc_request *req;
+	struct mdt_body       *body;
+	int		    rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_MDS_UNPIN,
+					LUSTRE_MDS_VERSION, MDS_UNPIN);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+	/* NOTE(review): body is used without a NULL check -- confirm */
+	body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY);
+	body->handle = handle->och_fh;
+	body->flags = flag;
+
+	ptlrpc_request_set_replen(req);
+
+	mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+	rc = ptlrpc_queue_wait(req);
+	mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+
+	if (rc != 0)
+		CERROR("Unpin failed: %d\n", rc);
+	/* release this request, then the one mdc_pin() parked in och_mod */
+	ptlrpc_req_finished(req);
+	ptlrpc_req_finished(handle->och_mod->mod_open_req);
+
+	obd_mod_put(handle->och_mod);
+	RETURN(rc);
+}
+
+int mdc_sync(struct obd_export *exp, const struct lu_fid *fid,
+	     struct obd_capa *oc, struct ptlrpc_request **request)
+{
+	struct ptlrpc_request *req;
+	int		    rc;
+	ENTRY;
+
+	*request = NULL;
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_SYNC);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	mdc_set_capa_size(req, &RMF_CAPA1, oc);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_SYNC);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	mdc_pack_body(req, fid, oc, 0, 0, -1, 0);
+
+	ptlrpc_request_set_replen(req);
+	/* a successful request is handed to the caller through *request */
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		ptlrpc_req_finished(req);
+	else
+		*request = req;
+	RETURN(rc);
+}
+
+static int mdc_import_event(struct obd_device *obd, struct obd_import *imp,
+			    enum obd_import_event event)
+{
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(imp->imp_obd == obd);
+
+	switch (event) {
+	case IMP_EVENT_DISCON:
+		/*
+		 * Not passed up to observers: the OBD_NOTIFY_DISCON call
+		 * that used to sit here was compiled out with #if 0.
+		 */
+		break;
+	case IMP_EVENT_INACTIVE: {
+		struct client_obd *cli = &obd->u.cli;
+		/*
+		 * Flush current sequence to make client obtain new one
+		 * from server in case of disconnect/reconnect.
+		 */
+		if (cli->cl_seq != NULL)
+			seq_client_flush(cli->cl_seq);
+
+		rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE, NULL);
+		break;
+	}
+	case IMP_EVENT_INVALIDATE: {
+		struct ldlm_namespace *ns = obd->obd_namespace;
+
+		ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
+
+		break;
+	}
+	case IMP_EVENT_ACTIVE:
+		rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL);
+		/* re-establish kuc registration after reconnecting */
+		if (rc == 0)
+			rc = mdc_kuc_reregister(imp);
+		break;
+	case IMP_EVENT_OCD:
+		rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD, NULL);
+		break;
+	case IMP_EVENT_DEACTIVATE:
+	case IMP_EVENT_ACTIVATE:
+		break;
+	default:
+		CERROR("Unknown import event %x\n", event);
+		LBUG();
+	}
+	RETURN(rc);
+}
+
+int mdc_fid_alloc(struct obd_export *exp, struct lu_fid *fid,
+		  struct md_op_data *op_data)
+{
+	/* op_data is not consulted; only the export's cl_seq is used */
+	struct lu_client_seq *seq = exp->exp_obd->u.cli.cl_seq;
+	ENTRY;
+	RETURN(seq_client_alloc_fid(NULL, seq, fid));
+}
+
+struct obd_uuid *mdc_get_uuid(struct obd_export *exp)
+{
+	return &exp->exp_obd->u.cli.cl_target_uuid; /* owned by the obd */
+}
+
+/**
+ * Decide whether a lock may be cancelled rather than replayed during
+ * recovery.  Returns a non-zero value if the lock can be cancelled, or
+ * zero if it cannot.
+ */
+static int mdc_cancel_for_recovery(struct ldlm_lock *lock)
+{
+	if (lock->l_resource->lr_type != LDLM_IBITS)
+		RETURN(0);
+
+	/* FIXME: if we ever get into a situation where there are too many
+	 * opened files with open locks on a single node, then we really
+	 * should replay these open locks to re-acquire them */
+	if (lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_OPEN)
+		RETURN(0);
+
+	RETURN(1);
+}
+
+static int mdc_resource_inode_free(struct ldlm_resource *res)
+{
+	/* forget the inode pointer kept on the resource; always returns 0 */
+	res->lr_lvb_inode = NULL;
+
+	return 0;
+}
+
+struct ldlm_valblock_ops inode_lvbo = {
+	.lvbo_free = mdc_resource_inode_free,	/* clears lr_lvb_inode */
+};
+
+static int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg)
+{
+	struct client_obd *cli = &obd->u.cli;
+	struct lprocfs_static_vars lvars = { 0 };
+	int rc;
+	ENTRY;
+
+	OBD_ALLOC(cli->cl_rpc_lock, sizeof (*cli->cl_rpc_lock));
+	if (!cli->cl_rpc_lock)
+		RETURN(-ENOMEM);
+	mdc_init_rpc_lock(cli->cl_rpc_lock);
+	/* balanced by ptlrpcd_decref() below and in mdc_cleanup() */
+	ptlrpcd_addref();
+
+	OBD_ALLOC(cli->cl_close_lock, sizeof (*cli->cl_close_lock));
+	if (!cli->cl_close_lock)
+		GOTO(err_rpc_lock, rc = -ENOMEM);
+	mdc_init_rpc_lock(cli->cl_close_lock);
+
+	rc = client_obd_setup(obd, cfg);
+	if (rc)
+		GOTO(err_close_lock, rc);
+	lprocfs_mdc_init_vars(&lvars);
+	lprocfs_obd_setup(obd, lvars.obd_vars);
+	sptlrpc_lprocfs_cliobd_attach(obd);
+	ptlrpc_lprocfs_register_obd(obd);
+
+	ns_register_cancel(obd->obd_namespace, mdc_cancel_for_recovery);
+
+	obd->obd_namespace->ns_lvbo = &inode_lvbo;
+	/* NOTE(review): lprocfs setup is not visibly undone on failure */
+	rc = obd_llog_init(obd, &obd->obd_olg, obd, NULL);
+	if (rc) {
+		mdc_cleanup(obd);
+		CERROR("failed to setup llogging subsystems\n");
+	}
+
+	RETURN(rc);
+
+err_close_lock:
+	OBD_FREE(cli->cl_close_lock, sizeof (*cli->cl_close_lock));
+err_rpc_lock:
+	OBD_FREE(cli->cl_rpc_lock, sizeof (*cli->cl_rpc_lock));
+	ptlrpcd_decref();
+	RETURN(rc);
+}
+
+/* Initialize the default and maximum LOV EA and cookie sizes.  This allows
+ * us to make MDS RPCs with large enough reply buffers to hold the
+ * maximum-sized (= maximum striped) EA and cookie without having to
+ * calculate this (via a call into the LOV + OSCs) each time we make an RPC. */
+static int mdc_init_ea_size(struct obd_export *exp, int easize,
+		     int def_easize, int cookiesize)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct client_obd *cli = &obd->u.cli;
+	ENTRY;
+
+	if (cli->cl_max_mds_easize < easize)
+		cli->cl_max_mds_easize = easize;
+
+	if (cli->cl_default_mds_easize < def_easize)
+		cli->cl_default_mds_easize = def_easize;
+
+	if (cli->cl_max_mds_cookiesize < cookiesize)
+		cli->cl_max_mds_cookiesize = cookiesize;
+
+	RETURN(0);
+}
+
+static int mdc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+	int rc = 0;
+	ENTRY;
+
+	switch (stage) {
+	case OBD_CLEANUP_EARLY:
+		break;
+	case OBD_CLEANUP_EXPORTS:
+		/* Failsafe, ok if racy */
+		if (obd->obd_type->typ_refcnt <= 1)
+			libcfs_kkuc_group_rem(0, KUC_GRP_HSM);
+
+		obd_cleanup_client_import(obd);
+		ptlrpc_lprocfs_unregister_obd(obd);
+		lprocfs_obd_cleanup(obd);
+
+		rc = obd_llog_finish(obd, 0);
+		if (rc != 0)
+			CERROR("failed to cleanup llogging subsystems\n");
+		break;
+	}
+	RETURN(rc);
+}
+
+static int mdc_cleanup(struct obd_device *obd)
+{
+	struct client_obd *cli = &obd->u.cli;
+
+	OBD_FREE(cli->cl_rpc_lock, sizeof (*cli->cl_rpc_lock));
+	OBD_FREE(cli->cl_close_lock, sizeof (*cli->cl_close_lock));
+
+	ptlrpcd_decref();
+
+	return client_obd_cleanup(obd);
+}
+
+
+static int mdc_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+			 struct obd_device *tgt, int *index)
+{
+	struct llog_ctxt	*ctxt;
+	int			 rc;
+
+	ENTRY;
+
+	LASSERT(olg == &obd->obd_olg);
+
+	rc = llog_setup(NULL, obd, olg, LLOG_CHANGELOG_REPL_CTXT, tgt,
+			&llog_client_ops);
+	if (rc)
+		RETURN(rc);
+
+	ctxt = llog_group_get_ctxt(olg, LLOG_CHANGELOG_REPL_CTXT);
+	llog_initiator_connect(ctxt);
+	llog_ctxt_put(ctxt);
+
+	RETURN(0);
+}
+
+static int mdc_llog_finish(struct obd_device *obd, int count)
+{
+	struct llog_ctxt *ctxt;
+
+	ENTRY;
+
+	ctxt = llog_get_context(obd, LLOG_CHANGELOG_REPL_CTXT);
+	if (ctxt)
+		llog_cleanup(NULL, ctxt);
+
+	RETURN(0);
+}
+
+static int mdc_process_config(struct obd_device *obd, obd_count len, void *buf)
+{
+	struct lprocfs_static_vars lvars = { 0 };
+	struct lustre_cfg *cfg = buf;
+	int ret;
+
+	lprocfs_mdc_init_vars(&lvars);
+
+	/*
+	 * Every lcfg_command is treated alike: the record is handed to
+	 * class_process_proc_param().  Positive results are folded into 0;
+	 * zero and negative results are returned unchanged.
+	 */
+	ret = class_process_proc_param(PARAM_MDC, lvars.obd_vars, cfg, obd);
+
+	return ret > 0 ? 0 : ret;
+}
+
+
+/* get remote permission for current user on fid */
+int mdc_get_remote_perm(struct obd_export *exp, const struct lu_fid *fid,
+			struct obd_capa *oc, __u32 suppgid,
+			struct ptlrpc_request **request)
+{
+	struct ptlrpc_request  *req;
+	int		    rc;
+	ENTRY;
+
+	LASSERT(client_is_remote(exp));
+
+	*request = NULL;
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	mdc_set_capa_size(req, &RMF_CAPA1, oc);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	mdc_pack_body(req, fid, oc, OBD_MD_FLRMTPERM, 0, suppgid, 0);
+	/* size the RCL_SERVER RMF_ACL field for one mdt_remote_perm */
+	req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
+			     sizeof(struct mdt_remote_perm));
+
+	ptlrpc_request_set_replen(req);
+	/* a successful request is handed to the caller through *request */
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		ptlrpc_req_finished(req);
+	else
+		*request = req;
+	RETURN(rc);
+}
+
+static int mdc_interpret_renew_capa(const struct lu_env *env,
+				    struct ptlrpc_request *req, void *args,
+				    int status)
+{
+	struct mdc_renew_capa_args *ra = args;
+	struct mdt_body *body = NULL;
+	struct lustre_capa *capa;
+	ENTRY;
+
+	if (status)
+		GOTO(out, capa = ERR_PTR(status));
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	if (body == NULL)
+		GOTO(out, capa = ERR_PTR(-EFAULT));
+
+	if ((body->valid & OBD_MD_FLOSSCAPA) == 0)
+		GOTO(out, capa = ERR_PTR(-ENOENT));
+
+	capa = req_capsule_server_get(&req->rq_pill, &RMF_CAPA2);
+	if (!capa)
+		GOTO(out, capa = ERR_PTR(-EFAULT));
+	EXIT;
+out:
+	ra->ra_cb(ra->ra_oc, capa);
+	return 0;
+}
+
+static int mdc_renew_capa(struct obd_export *exp, struct obd_capa *oc,
+			  renew_capa_cb_t cb)
+{
+	struct ptlrpc_request *req;
+	struct mdc_renew_capa_args *ra;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_MDS_GETATTR,
+					LUSTRE_MDS_VERSION, MDS_GETATTR);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	/* NB, OBD_MD_FLOSSCAPA is set here, but it doesn't necessarily mean the
+	 * capa to renew is oss capa.
+	 */
+	mdc_pack_body(req, &oc->c_capa.lc_fid, oc, OBD_MD_FLOSSCAPA, 0, -1, 0);
+	ptlrpc_request_set_replen(req);
+
+	CLASSERT(sizeof(*ra) <= sizeof(req->rq_async_args));
+	ra = ptlrpc_req_async_args(req);
+	ra->ra_oc = oc;
+	ra->ra_cb = cb;
+	req->rq_interpret_reply = mdc_interpret_renew_capa;
+	ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
+	RETURN(0);
+}
+
+static int mdc_connect(const struct lu_env *env,
+		       struct obd_export **exp,
+		       struct obd_device *obd, struct obd_uuid *cluuid,
+		       struct obd_connect_data *data,
+		       void *localdata)
+{
+	struct obd_import *imp = obd->u.cli.cl_import;
+
+	/* mds-mds import features */
+	if (data && (data->ocd_connect_flags & OBD_CONNECT_MDS_MDS)) {
+		spin_lock(&imp->imp_lock);
+		imp->imp_server_timeout = 1;
+		spin_unlock(&imp->imp_lock);
+		imp->imp_client->cli_request_portal = MDS_MDS_PORTAL;
+		CDEBUG(D_OTHER, "%s: Set 'mds' portal and timeout\n",
+		       obd->obd_name);
+	}
+
+	return client_connect_import(env, exp, obd, cluuid, data, NULL);
+}
+
+struct obd_ops mdc_obd_ops = {
+	.o_owner	    = THIS_MODULE,
+	.o_setup	    = mdc_setup,
+	.o_precleanup       = mdc_precleanup,
+	.o_cleanup	  = mdc_cleanup,
+	.o_add_conn	 = client_import_add_conn,
+	.o_del_conn	 = client_import_del_conn,
+	.o_connect	  = mdc_connect,
+	.o_disconnect       = client_disconnect_export,
+	.o_iocontrol	= mdc_iocontrol,
+	.o_set_info_async   = mdc_set_info_async,
+	.o_statfs	   = mdc_statfs,
+	.o_pin	      = mdc_pin,
+	.o_unpin	    = mdc_unpin,
+	.o_fid_init	    = client_fid_init,
+	.o_fid_fini	    = client_fid_fini,
+	.o_fid_alloc	= mdc_fid_alloc,
+	.o_import_event     = mdc_import_event,
+	.o_llog_init	= mdc_llog_init,
+	.o_llog_finish      = mdc_llog_finish,
+	.o_get_info	 = mdc_get_info,
+	.o_process_config   = mdc_process_config,
+	.o_get_uuid	 = mdc_get_uuid,
+	.o_quotactl	 = mdc_quotactl,
+	.o_quotacheck       = mdc_quotacheck
+};
+
+struct md_ops mdc_md_ops = {
+	.m_getstatus	= mdc_getstatus,
+	.m_null_inode	    = mdc_null_inode,
+	.m_find_cbdata      = mdc_find_cbdata,
+	.m_close	    = mdc_close,
+	.m_create	   = mdc_create,
+	.m_done_writing     = mdc_done_writing,
+	.m_enqueue	  = mdc_enqueue,
+	.m_getattr	  = mdc_getattr,
+	.m_getattr_name     = mdc_getattr_name,
+	.m_intent_lock      = mdc_intent_lock,
+	.m_link	     = mdc_link,
+	.m_is_subdir	= mdc_is_subdir,
+	.m_rename	   = mdc_rename,
+	.m_setattr	  = mdc_setattr,
+	.m_setxattr	 = mdc_setxattr,
+	.m_getxattr	 = mdc_getxattr,
+	.m_sync	     = mdc_sync,
+	.m_readpage	 = mdc_readpage,
+	.m_unlink	   = mdc_unlink,
+	.m_cancel_unused    = mdc_cancel_unused,
+	.m_init_ea_size     = mdc_init_ea_size,
+	.m_set_lock_data    = mdc_set_lock_data,
+	.m_lock_match       = mdc_lock_match,
+	.m_get_lustre_md    = mdc_get_lustre_md,
+	.m_free_lustre_md   = mdc_free_lustre_md,
+	.m_set_open_replay_data = mdc_set_open_replay_data,
+	.m_clear_open_replay_data = mdc_clear_open_replay_data,
+	.m_renew_capa       = mdc_renew_capa,
+	.m_unpack_capa      = mdc_unpack_capa,
+	.m_get_remote_perm  = mdc_get_remote_perm,
+	.m_intent_getattr_async = mdc_intent_getattr_async,
+	.m_revalidate_lock      = mdc_revalidate_lock
+};
+
+int __init mdc_init(void)
+{
+	struct lprocfs_static_vars lvars = { 0 };
+
+	lprocfs_mdc_init_vars(&lvars);
+
+	/* plain return: there is no ENTRY here for RETURN() to pair with */
+	return class_register_type(&mdc_obd_ops, &mdc_md_ops, lvars.module_vars,
+				   LUSTRE_MDC_NAME, NULL);
+}
+
+static void /*__exit*/ mdc_exit(void)
+{
+	class_unregister_type(LUSTRE_MDC_NAME);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Metadata Client");
+MODULE_LICENSE("GPL");
+
+module_init(mdc_init);
+module_exit(mdc_exit);
diff --git a/drivers/staging/lustre/lustre/mgc/Makefile b/drivers/staging/lustre/lustre/mgc/Makefile
new file mode 100644
index 000000000000..267246344e1c
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mgc/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LUSTRE_FS) += mgc.o
+mgc-y := mgc_request.o lproc_mgc.o
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/mgc/libmgc.c b/drivers/staging/lustre/lustre/mgc/libmgc.c
new file mode 100644
index 000000000000..442146cc7e60
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mgc/libmgc.c
@@ -0,0 +1,166 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mgc/libmgc.c
+ *
+ * Lustre Management Client
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+/* Minimal MGC for liblustre: only used to read the config log from the MGS
+   at setup time, no updates. */
+
+#define DEBUG_SUBSYSTEM S_MGC
+
+#include <liblustre.h>
+
+#include <obd_class.h>
+#include <lustre_dlm.h>
+#include <lustre_log.h>
+#include <lustre_fsfilt.h>
+#include <lustre_disk.h>
+
+
+static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	int rc;
+	ENTRY;
+
+	ptlrpcd_addref();
+	/* balanced by ptlrpcd_decref() at err_decref and in mgc_cleanup() */
+	rc = client_obd_setup(obd, lcfg);
+	if (rc)
+		GOTO(err_decref, rc);
+
+	/* liblustre only supports the null flavor towards the MGS */
+	obd->u.cli.cl_flvr_mgc.sf_rpc = SPTLRPC_FLVR_NULL;
+
+	rc = obd_llog_init(obd, &obd->obd_olg, obd, NULL);
+	if (rc) {
+		CERROR("failed to setup llogging subsystems\n");
+		GOTO(err_cleanup, rc);
+	}
+
+	RETURN(rc);
+
+err_cleanup:
+	client_obd_cleanup(obd);
+err_decref:
+	ptlrpcd_decref();
+	RETURN(rc);
+}
+
+static int mgc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+	int rc = 0;
+	ENTRY;
+
+	switch (stage) {
+	case OBD_CLEANUP_EARLY:	/* no break: handled exactly like EXPORTS */
+	case OBD_CLEANUP_EXPORTS:
+		obd_cleanup_client_import(obd);
+		rc = obd_llog_finish(obd, 0);
+		if (rc != 0)
+			CERROR("failed to cleanup llogging subsystems\n");
+		break;
+	}
+	RETURN(rc);
+}
+
+static int mgc_cleanup(struct obd_device *obd)
+{
+	struct client_obd *cli = &obd->u.cli;
+	int rc;
+	ENTRY;
+
+	LASSERT(cli->cl_mgc_vfsmnt == NULL);
+
+	ptlrpcd_decref();
+
+	rc = client_obd_cleanup(obd);
+	RETURN(rc);
+}
+
+static int mgc_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+			 struct obd_device *tgt, int *index)
+{
+	struct llog_ctxt *ctxt;
+	int rc;
+	ENTRY;
+
+	LASSERT(olg == &obd->obd_olg);
+	rc = llog_setup(NULL, obd, olg, LLOG_CONFIG_REPL_CTXT, tgt,
+			&llog_client_ops);
+	if (rc < 0)
+		RETURN(rc);
+
+	ctxt = llog_group_get_ctxt(olg, LLOG_CONFIG_REPL_CTXT);
+	llog_initiator_connect(ctxt);
+	llog_ctxt_put(ctxt);
+
+	RETURN(rc);
+}
+
+static int mgc_llog_finish(struct obd_device *obd, int count)
+{
+	struct llog_ctxt *ctxt;
+
+	ENTRY;
+
+	ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT);
+	if (ctxt)
+		llog_cleanup(NULL, ctxt);
+
+	RETURN(0);
+}
+
+struct obd_ops mgc_obd_ops = {
+	.o_owner	= THIS_MODULE,
+	.o_setup	= mgc_setup,
+	.o_precleanup   = mgc_precleanup,
+	.o_cleanup      = mgc_cleanup,
+	.o_add_conn     = client_import_add_conn,
+	.o_del_conn     = client_import_del_conn,
+	.o_connect      = client_connect_import,
+	.o_disconnect   = client_disconnect_export,
+	.o_llog_init    = mgc_llog_init,
+	.o_llog_finish  = mgc_llog_finish,
+};
+
+int __init mgc_init(void)
+{
+	return class_register_type(&mgc_obd_ops, NULL,
+				   NULL, LUSTRE_MGC_NAME, NULL);
+}
diff --git a/drivers/staging/lustre/lustre/mgc/lproc_mgc.c b/drivers/staging/lustre/lustre/mgc/lproc_mgc.c
new file mode 100644
index 000000000000..041f365beabf
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mgc/lproc_mgc.c
@@ -0,0 +1,68 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/version.h>
+#include <linux/vfs.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include "mgc_internal.h"
+
+#ifdef LPROCFS
+
+static struct lprocfs_vars lprocfs_mgc_obd_vars[] = {
+	{ "uuid",	    lprocfs_rd_uuid,	  0, 0 },
+	{ "ping",	    0, lprocfs_wr_ping,       0, 0, 0222 },
+	{ "connect_flags",   lprocfs_rd_connect_flags, 0, 0 },
+	{ "mgs_server_uuid", lprocfs_rd_server_uuid,   0, 0 },
+	{ "mgs_conn_uuid",   lprocfs_rd_conn_uuid,     0, 0 },
+	{ "import",	  lprocfs_rd_import,	0, 0 },
+	{ "state",	   lprocfs_rd_state,	 0, 0 },
+	{ "ir_state",	lprocfs_mgc_rd_ir_state,  0, 0 },
+	{ 0 }
+};
+
+static struct lprocfs_vars lprocfs_mgc_module_vars[] = {
+	{ "num_refs",	lprocfs_rd_numrefs,       0, 0 },
+	{ 0 }
+};
+
+void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars)
+{
+	lvars->module_vars = lprocfs_mgc_module_vars;
+	lvars->obd_vars    = lprocfs_mgc_obd_vars;
+}
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/mgc/mgc_internal.h b/drivers/staging/lustre/lustre/mgc/mgc_internal.h
new file mode 100644
index 000000000000..111db90f33a7
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mgc/mgc_internal.h
@@ -0,0 +1,75 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _MGC_INTERNAL_H
+#define _MGC_INTERNAL_H
+
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_lib.h>
+#include <lustre_dlm.h>
+#include <lustre_log.h>
+#include <lustre_export.h>
+
+#ifdef LPROCFS
+void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars);
+int lprocfs_mgc_rd_ir_state(char *page, char **start, off_t off,
+			    int count, int *eof, void *data);
+#else
+static inline void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars)
+{
+	memset(lvars, 0, sizeof(*lvars));	/* no proc vars without LPROCFS */
+}
+static inline int lprocfs_mgc_rd_ir_state(char *page, char **start,
+	off_t off, int count, int *eof, void *data)
+{
+	return 0;
+}
+#endif  /* LPROCFS */
+
+int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld);
+
+static inline int cld_is_sptlrpc(struct config_llog_data *cld)
+{
+	return cld->cld_type == CONFIG_T_SPTLRPC;
+}
+
+static inline int cld_is_recover(struct config_llog_data *cld)
+{
+	return cld->cld_type == CONFIG_T_RECOVER;
+}
+
+#endif  /* _MGC_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/mgc/mgc_request.c b/drivers/staging/lustre/lustre/mgc/mgc_request.c
new file mode 100644
index 000000000000..74232f4c1004
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mgc/mgc_request.c
@@ -0,0 +1,1863 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mgc/mgc_request.c
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_MGC
+#define D_MGC D_CONFIG /*|D_WARNING*/
+
+# include <linux/module.h>
+# include <linux/pagemap.h>
+# include <linux/miscdevice.h>
+# include <linux/init.h>
+
+#include <obd_class.h>
+#include <lustre_dlm.h>
+#include <lprocfs_status.h>
+#include <lustre_log.h>
+#include <lustre_fsfilt.h>
+#include <lustre_disk.h>
+#include "mgc_internal.h"
+
+/*
+ * Build an LDLM resource id from the first @len bytes of @name.
+ *
+ * The bytes are copied into a zeroed 64-bit value which is stored through
+ * cpu_to_le64() in res_id->name[0].  res_id->name[1] is 0 for
+ * CONFIG_T_CONFIG and CONFIG_T_SPTLRPC, and @type for CONFIG_T_RECOVER;
+ * any other @type hits LBUG().
+ *
+ * Returns 0 on success, or -EINVAL when @len is outside 1..8.
+ */
+static int mgc_name2resid(char *name, int len, struct ldlm_res_id *res_id,
+			  int type)
+{
+	__u64 word = 0;
+
+	if (len <= 0) {
+		CERROR("missing name: %s\n", name);
+		return -EINVAL;
+	}
+	if (len > 8) {
+		CERROR("name too long: %s\n", name);
+		return -EINVAL;
+	}
+
+	/* Always use the same endianness for the resid */
+	memcpy(&word, name, len);
+	memset(res_id, 0, sizeof(*res_id));
+	res_id->name[0] = cpu_to_le64(word);
+
+	/* XXX: unfortunately, sptlrpc and config llog share one lock */
+	switch (type) {
+	case CONFIG_T_RECOVER:
+		word = type;
+		break;
+	case CONFIG_T_CONFIG:
+	case CONFIG_T_SPTLRPC:
+		word = 0;
+		break;
+	default:
+		LBUG();
+	}
+	res_id->name[1] = cpu_to_le64(word);
+
+	CDEBUG(D_MGC, "log %s to resid "LPX64"/"LPX64" (%.8s)\n", name,
+	       res_id->name[0], res_id->name[1], (char *)&res_id->name[0]);
+	return 0;
+}
+
+/*
+ * Convert a filesystem name into a resource id, using the whole string.
+ * fsname is at most 8 chars long and may contain "-",
+ * e.g. "lustre", "SUN-000".
+ */
+int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type)
+{
+	int fsname_len = strlen(fsname);
+
+	return mgc_name2resid(fsname, fsname_len, res_id, type);
+}
+EXPORT_SYMBOL(mgc_fsname2resid);
+
+/*
+ * Convert a log name into a resource id.
+ *
+ * logname consists of "fsname-nodetype", e.g. "lustre-MDT0001",
+ * "SUN-000-client"; everything before the last '-' is used as the name.
+ * A logname without any '-' trips the LASSERT.
+ */
+int mgc_logname2resid(char *logname, struct ldlm_res_id *res_id, int type)
+{
+	char *last_dash = strrchr(logname, '-');
+
+	LASSERT(last_dash);
+	return mgc_name2resid(logname, last_dash - logname, res_id, type);
+}
+
+/********************** config llog list **********************/
+/* All config_llog_data entries, linked through their cld_list_chain. */
+static LIST_HEAD(config_llog_list);
+/* Taken around walks and updates of config_llog_list and around rq_state. */
+static DEFINE_SPINLOCK(config_list_lock);
+
+/*
+ * Take a reference to a config log.
+ *
+ * Increments cld_refcount, prints the count read afterwards at D_INFO,
+ * and always returns 0.
+ */
+static int config_log_get(struct config_llog_data *cld)
+{
+	ENTRY;
+	atomic_inc(&cld->cld_refcount);
+	CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname,
+	       atomic_read(&cld->cld_refcount));
+	RETURN(0);
+}
+
+/*
+ * Drop a reference to a config log.  When no longer referenced,
+ * we can free the config log data.
+ *
+ * On the final put the entry is unlinked from config_llog_list, the
+ * references it holds on cld_recover, cld_sptlrpc and cld_mgcexp are
+ * dropped, and the structure (including the trailing log name) is freed.
+ */
+static void config_log_put(struct config_llog_data *cld)
+{
+	ENTRY;
+
+	CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname,
+	       atomic_read(&cld->cld_refcount));
+	LASSERT(atomic_read(&cld->cld_refcount) > 0);
+
+	/* spinlock to make sure no item with 0 refcount in the list */
+	if (atomic_dec_and_lock(&cld->cld_refcount, &config_list_lock)) {
+		/* count reached zero and config_list_lock is now held */
+		list_del(&cld->cld_list_chain);
+		spin_unlock(&config_list_lock);
+
+		CDEBUG(D_MGC, "dropping config log %s\n", cld->cld_logname);
+
+		/* release the logs this one depends on (recursive put) */
+		if (cld->cld_recover)
+			config_log_put(cld->cld_recover);
+		if (cld->cld_sptlrpc)
+			config_log_put(cld->cld_sptlrpc);
+		/* pairs with sptlrpc_conf_log_start() in do_config_log_add() */
+		if (cld_is_sptlrpc(cld))
+			sptlrpc_conf_log_stop(cld->cld_logname);
+
+		/* pairs with class_export_get() in do_config_log_add() */
+		class_export_put(cld->cld_mgcexp);
+		/* size must match the OBD_ALLOC() in do_config_log_add() */
+		OBD_FREE(cld, sizeof(*cld) + strlen(cld->cld_logname) + 1);
+	}
+
+	EXIT;
+}
+
+/*
+ * Find a config log by name and instance.
+ *
+ * Walks config_llog_list under config_list_lock looking for an entry whose
+ * cfg_instance equals cfg->cfg_instance (NULL when @cfg is NULL) and whose
+ * name equals @logname.  A match is returned with one extra reference
+ * taken; NULL is returned when nothing matches.
+ */
+static
+struct config_llog_data *config_log_find(char *logname,
+					 struct config_llog_instance *cfg)
+{
+	struct config_llog_data *pos;
+	struct config_llog_data *match = NULL;
+	void *wanted = NULL;
+	ENTRY;
+
+	LASSERT(logname != NULL);
+
+	if (cfg)
+		wanted = cfg->cfg_instance;
+
+	spin_lock(&config_list_lock);
+	list_for_each_entry(pos, &config_llog_list, cld_list_chain) {
+		/* the instance may be NULL, so the name is always compared
+		 * as well */
+		if (pos->cld_cfg.cfg_instance != wanted ||
+		    strcmp(logname, pos->cld_logname) != 0)
+			continue;
+
+		match = pos;
+		atomic_inc(&match->cld_refcount);
+		LASSERT(match->cld_stopping == 0 || cld_is_sptlrpc(match) == 0);
+		break;
+	}
+	spin_unlock(&config_list_lock);
+	RETURN(match);
+}
+
+/*
+ * Allocate a config_llog_data for @logname, initialise it and link it into
+ * config_llog_list with a reference count of 1.
+ *
+ * @obd:     MGC device; a reference on its self export is held by the entry
+ * @logname: log name, copied into the tail of the allocation
+ * @type:    stored in cld_type and used to build the resource id
+ * @cfg:     optional instance to copy; when NULL only cfg_callback is set
+ * @sb:      stored in cld_cfg.cfg_sb
+ *
+ * Returns the new entry, ERR_PTR(-ENOMEM) on allocation failure, or
+ * ERR_PTR(rc) when mgc_logname2resid() fails.  For sptlrpc logs the log is
+ * processed once immediately; a failure there is only reported.
+ */
+static
+struct config_llog_data *do_config_log_add(struct obd_device *obd,
+					   char *logname,
+					   int type,
+					   struct config_llog_instance *cfg,
+					   struct super_block *sb)
+{
+	struct config_llog_data *cld;
+	int		      rc;
+	ENTRY;
+
+	CDEBUG(D_MGC, "do adding config log %s:%p\n", logname,
+	       cfg ? cfg->cfg_instance : 0);
+
+	/* the log name is stored right after the structure */
+	OBD_ALLOC(cld, sizeof(*cld) + strlen(logname) + 1);
+	if (!cld)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	strcpy(cld->cld_logname, logname);
+	if (cfg)
+		cld->cld_cfg = *cfg;
+	else
+		cld->cld_cfg.cfg_callback = class_config_llog_handler;
+	mutex_init(&cld->cld_lock);
+	/* these fields are reset even when @cfg was copied above */
+	cld->cld_cfg.cfg_last_idx = 0;
+	cld->cld_cfg.cfg_flags = 0;
+	cld->cld_cfg.cfg_sb = sb;
+	cld->cld_type = type;
+	atomic_set(&cld->cld_refcount, 1);
+
+	/* Keep the mgc around until we are done */
+	cld->cld_mgcexp = class_export_get(obd->obd_self_export);
+
+	if (cld_is_sptlrpc(cld)) {
+		sptlrpc_conf_log_start(logname);
+		cld->cld_cfg.cfg_obdname = obd->obd_name;
+	}
+
+	rc = mgc_logname2resid(logname, &cld->cld_resid, type);
+
+	/* linked before rc is checked: the config_log_put() on the error
+	 * path below unlinks the entry with list_del() */
+	spin_lock(&config_list_lock);
+	list_add(&cld->cld_list_chain, &config_llog_list);
+	spin_unlock(&config_list_lock);
+
+	if (rc) {
+		config_log_put(cld);
+		RETURN(ERR_PTR(rc));
+	}
+
+	if (cld_is_sptlrpc(cld)) {
+		rc = mgc_process_log(obd, cld);
+		/* -ENOENT is not reported; other errors are logged only */
+		if (rc && rc != -ENOENT)
+			CERROR("failed processing sptlrpc log: %d\n", rc);
+	}
+
+	RETURN(cld);
+}
+
+/*
+ * Add the recovery log that accompanies a regular config log.
+ *
+ * Returns NULL without adding anything for an OST, and for an MDT whose
+ * copied instance is non-NULL.  Otherwise the log name is @fsname followed
+ * by "-mdtir" (server) or "-cliir" (client) and the result of
+ * do_config_log_add() with CONFIG_T_RECOVER is returned.
+ */
+static struct config_llog_data *config_recover_log_add(struct obd_device *obd,
+	char *fsname,
+	struct config_llog_instance *cfg,
+	struct super_block *sb)
+{
+	struct config_llog_instance instance = *cfg;
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	const char *suffix;
+	char logname[32];
+
+	if (IS_OST(lsi))
+		return NULL;
+
+	/* for osp-on-ost, see lustre_start_osp() */
+	if (IS_MDT(lsi) && instance.cfg_instance)
+		return NULL;
+
+	/* we have to use different llog for clients and mdts for cmd
+	 * where only clients are notified if one of cmd server restarts */
+	LASSERT(strlen(fsname) < sizeof(logname) / 2);
+
+	if (IS_SERVER(lsi)) { /* mdt */
+		LASSERT(instance.cfg_instance == NULL);
+		instance.cfg_instance = sb;
+		suffix = "-mdtir";
+	} else {
+		LASSERT(instance.cfg_instance != NULL);
+		suffix = "-cliir";
+	}
+
+	strcpy(logname, fsname);
+	strcat(logname, suffix);
+
+	return do_config_log_add(obd, logname, CONFIG_T_RECOVER, &instance, sb);
+}
+
+
+/** Add this log to the list of active logs watched by an MGC.
+ * Active means we're watching for updates.
+ * We have one active log per "mount" - client instance or servername.
+ * Each instance may be at a different point in the log.
+ *
+ * Besides the regular log this finds or creates the shared
+ * "<fsname>-sptlrpc" log and, unless LMD_FLG_NOIR is set in the mount
+ * flags, adds a recovery log.  Both are attached to the new entry.
+ *
+ * Returns 0 on success, -EINVAL when \a logname has no '-' or its prefix
+ * is longer than 8 characters, or the error from creating one of the logs.
+ */
+static int config_log_add(struct obd_device *obd, char *logname,
+			  struct config_llog_instance *cfg,
+			  struct super_block *sb)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct config_llog_data *cld;
+	struct config_llog_data *sptlrpc_cld;
+	char		     seclogname[32];
+	char		    *ptr;
+	ENTRY;
+
+	CDEBUG(D_MGC, "adding config log %s:%p\n", logname, cfg->cfg_instance);
+
+	/*
+	 * for each regular log, the depended sptlrpc log name is
+	 * <fsname>-sptlrpc. multiple regular logs may share one sptlrpc log.
+	 */
+	ptr = strrchr(logname, '-');
+	if (ptr == NULL || ptr - logname > 8) {
+		CERROR("logname %s is too long\n", logname);
+		RETURN(-EINVAL);
+	}
+
+	/* at most 8 prefix bytes plus "-sptlrpc" and the NUL fit in 32 */
+	memcpy(seclogname, logname, ptr - logname);
+	strcpy(seclogname + (ptr - logname), "-sptlrpc");
+
+	/* config_log_find() returns the entry with a reference taken */
+	sptlrpc_cld = config_log_find(seclogname, NULL);
+	if (sptlrpc_cld == NULL) {
+		sptlrpc_cld = do_config_log_add(obd, seclogname,
+						CONFIG_T_SPTLRPC, NULL, NULL);
+		if (IS_ERR(sptlrpc_cld)) {
+			CERROR("can't create sptlrpc log: %s\n", seclogname);
+			RETURN(PTR_ERR(sptlrpc_cld));
+		}
+	}
+
+	cld = do_config_log_add(obd, logname, CONFIG_T_CONFIG, cfg, sb);
+	if (IS_ERR(cld)) {
+		CERROR("can't create log: %s\n", logname);
+		config_log_put(sptlrpc_cld);
+		RETURN(PTR_ERR(cld));
+	}
+
+	/* the sptlrpc reference is now owned by cld and dropped with it */
+	cld->cld_sptlrpc = sptlrpc_cld;
+
+	LASSERT(lsi->lsi_lmd);
+	if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR)) {
+		struct config_llog_data *recover_cld;
+		/* cut "-sptlrpc" off again, leaving the bare prefix */
+		*strrchr(seclogname, '-') = 0;
+		recover_cld = config_recover_log_add(obd, seclogname, cfg, sb);
+		if (IS_ERR(recover_cld)) {
+			config_log_put(cld);
+			RETURN(PTR_ERR(recover_cld));
+		}
+		/* may be NULL: config_recover_log_add() adds nothing for
+		 * some target types */
+		cld->cld_recover = recover_cld;
+	}
+
+	RETURN(0);
+}
+
+/* NOTE(review): non-static mutex; no user is visible in this part of the
+ * file, so what it serialises should be confirmed at its call sites. */
+DEFINE_MUTEX(llog_process_lock);
+
+/** Stop watching for updates on this log.
+ *
+ * Marks the log (and its recovery log, if any) as stopping, detaches and
+ * releases the recovery and sptlrpc logs, then drops both the lookup
+ * reference and the start reference.
+ *
+ * Returns -ENOENT when no matching log exists, 0 otherwise.
+ */
+static int config_log_end(char *logname, struct config_llog_instance *cfg)
+{
+	struct config_llog_data *cld;
+	struct config_llog_data *cld_sptlrpc = NULL;
+	struct config_llog_data *cld_recover = NULL;
+	int rc = 0;
+	ENTRY;
+
+	cld = config_log_find(logname, cfg);
+	if (cld == NULL)
+		RETURN(-ENOENT);
+
+	mutex_lock(&cld->cld_lock);
+	/*
+	 * if cld_stopping is set, it means we didn't start the log thus
+	 * not owning the start ref. this can happen after previous umount:
+	 * the cld still hanging there waiting for lock cancel, and we
+	 * remount again but failed in the middle and call log_end without
+	 * calling start_log.
+	 */
+	if (unlikely(cld->cld_stopping)) {
+		mutex_unlock(&cld->cld_lock);
+		/* drop the ref from the find */
+		config_log_put(cld);
+		RETURN(rc);
+	}
+
+	cld->cld_stopping = 1;
+
+	/* detach the recovery log under cld_lock; it is released below */
+	cld_recover = cld->cld_recover;
+	cld->cld_recover = NULL;
+	mutex_unlock(&cld->cld_lock);
+
+	if (cld_recover) {
+		mutex_lock(&cld_recover->cld_lock);
+		cld_recover->cld_stopping = 1;
+		mutex_unlock(&cld_recover->cld_lock);
+		config_log_put(cld_recover);
+	}
+
+	/* the sptlrpc pointer is detached under the list lock instead */
+	spin_lock(&config_list_lock);
+	cld_sptlrpc = cld->cld_sptlrpc;
+	cld->cld_sptlrpc = NULL;
+	spin_unlock(&config_list_lock);
+
+	if (cld_sptlrpc)
+		config_log_put(cld_sptlrpc);
+
+	/* drop the ref from the find */
+	config_log_put(cld);
+	/* drop the start ref */
+	config_log_put(cld);
+
+	CDEBUG(D_MGC, "end config log %s (%d)\n", logname ? logname : "client",
+	       rc);
+	RETURN(rc);
+}
+
+/*
+ * procfs read handler reporting the imperative recovery state.
+ *
+ * Writes whether the import's connect data has the IMP_RECOV flag, then
+ * one line per config log that has a recovery log attached, giving the log
+ * name and the recovery log's cfg_last_idx.
+ *
+ * @page:  output buffer
+ * @count: size of @page in bytes
+ * @data:  the MGC obd_device
+ * @start, @off and @eof are not used.
+ *
+ * Returns the number of bytes stored in @page (excluding the trailing NUL).
+ * scnprintf() is used so the running total can never exceed @count; with
+ * snprintf() a truncated line would push the total past @count and make
+ * the remaining-size argument of the next call negative.
+ */
+int lprocfs_mgc_rd_ir_state(char *page, char **start, off_t off,
+			    int count, int *eof, void *data)
+{
+	struct obd_device       *obd = data;
+	struct obd_import       *imp = obd->u.cli.cl_import;
+	struct obd_connect_data *ocd = &imp->imp_connect_data;
+	struct config_llog_data *cld;
+	int rc = 0;
+	ENTRY;
+
+	/* nothing can be written into an empty buffer */
+	if (count <= 0)
+		RETURN(0);
+
+	rc = scnprintf(page, count, "imperative_recovery: %s\n",
+		       OCD_HAS_FLAG(ocd, IMP_RECOV) ? "ENABLED" : "DISABLED");
+	rc += scnprintf(page + rc, count - rc, "client_state:\n");
+
+	spin_lock(&config_list_lock);
+	list_for_each_entry(cld, &config_llog_list, cld_list_chain) {
+		if (cld->cld_recover == NULL)
+			continue;
+		rc += scnprintf(page + rc, count - rc,
+				"    - { client: %s, nidtbl_version: %u }\n",
+				cld->cld_logname,
+				cld->cld_recover->cld_cfg.cfg_last_idx);
+	}
+	spin_unlock(&config_list_lock);
+
+	RETURN(rc);
+}
+
+/* reenqueue any lost locks */
+/* Bits of rq_state, which is read and written under config_list_lock. */
+#define RQ_RUNNING 0x1	/* set by the requeue thread while it runs */
+#define RQ_NOW     0x2	/* set to wake the requeue thread for a new pass */
+#define RQ_LATER   0x4	/* only ever cleared in the code visible here */
+#define RQ_STOP    0x8	/* set by mgc_precleanup() to end the thread */
+static int		    rq_state = 0;
+/* the requeue thread sleeps here; initialised in mgc_setup() */
+static wait_queue_head_t	    rq_waitq;
+/* completed by the requeue thread right before it returns */
+static DECLARE_COMPLETION(rq_exit);
+
+/*
+ * Re-process one config log unless its MGC is disconnected.
+ *
+ * The caller must hold a reference on @cld.  The connection count is
+ * checked with the client semaphore held for reading, so that the check
+ * and the call to mgc_process_log() cannot race with a disconnect.
+ */
+static void do_requeue(struct config_llog_data *cld)
+{
+	struct obd_device *mgc;
+	struct client_obd *cli;
+	ENTRY;
+
+	LASSERT(atomic_read(&cld->cld_refcount) > 0);
+
+	mgc = cld->cld_mgcexp->exp_obd;
+	cli = &mgc->u.cli;
+
+	/* Do not run mgc_process_log on a disconnected export or an
+	   export which is being disconnected. */
+	down_read(&cli->cl_sem);
+	if (cli->cl_conn_count == 0) {
+		CDEBUG(D_MGC, "disconnecting, won't update log %s\n",
+		       cld->cld_logname);
+	} else {
+		CDEBUG(D_MGC, "updating log %s\n", cld->cld_logname);
+		mgc_process_log(mgc, cld);
+	}
+	up_read(&cli->cl_sem);
+
+	EXIT;
+}
+
+/* How long the MGC waits before requeueing config and recover locks to the
+ * MGS.  The delay is a fixed minimum plus a random part, so that clients
+ * do not flood the MGS all at once.
+ *
+ * MGC_TIMEOUT_MIN_SECONDS is the fixed part in seconds;
+ * MGC_TIMEOUT_RAND_CENTISEC is the bit mask applied to a random number to
+ * get the extra delay in centiseconds (0x1ff, i.e. 0..511).
+ */
+#define MGC_TIMEOUT_MIN_SECONDS   5
+#define MGC_TIMEOUT_RAND_CENTISEC 0x1ff /* ~500 */
+
+/*
+ * Kernel thread that re-processes config logs whose lock was lost.
+ *
+ * Each pass waits MGC_TIMEOUT_MIN_SECONDS plus a random delay (or until
+ * RQ_STOP is set), then walks config_llog_list and, for every entry with
+ * cld_lostlock set, clears the flag, re-processes the log via do_requeue()
+ * and drops the reference taken in mgc_requeue_add().  After a pass it
+ * sleeps until RQ_NOW or RQ_STOP is set.  When RQ_STOP was seen at the
+ * start of a pass the list is still scanned, without requeueing, so the
+ * pending references are dropped before the thread exits.
+ *
+ * @data is unused.  Always returns 0; rq_exit is completed on the way out.
+ */
+static int mgc_requeue_thread(void *data)
+{
+	int rc = 0;
+	ENTRY;
+
+	CDEBUG(D_MGC, "Starting requeue thread\n");
+
+	/* Keep trying failed locks periodically */
+	spin_lock(&config_list_lock);
+	rq_state |= RQ_RUNNING;
+	while (1) {
+		struct l_wait_info lwi;
+		struct config_llog_data *cld, *cld_prev;
+		int rand = cfs_rand() & MGC_TIMEOUT_RAND_CENTISEC;
+		/* sampled under config_list_lock, used for the whole pass */
+		int stopped = !!(rq_state & RQ_STOP);
+		int to;
+
+		/* Any new or requeued lostlocks will change the state */
+		rq_state &= ~(RQ_NOW | RQ_LATER);
+		spin_unlock(&config_list_lock);
+
+		/* Always wait a few seconds to allow the server who
+		   caused the lock revocation to finish its setup, plus some
+		   random so everyone doesn't try to reconnect at once. */
+		to = MGC_TIMEOUT_MIN_SECONDS * HZ;
+		to += rand * HZ / 100; /* rand is centi-seconds */
+		lwi = LWI_TIMEOUT(to, NULL, NULL);
+		l_wait_event(rq_waitq, rq_state & RQ_STOP, &lwi);
+
+		/*
+		 * iterate & processing through the list. for each cld, process
+		 * its depending sptlrpc cld firstly (if any) and then itself.
+		 *
+		 * it's guaranteed any item in the list must have
+		 * reference > 0; and if cld_lostlock is set, at
+		 * least one reference is taken by the previous enqueue.
+		 */
+		cld_prev = NULL;
+
+		spin_lock(&config_list_lock);
+		list_for_each_entry(cld, &config_llog_list,
+					cld_list_chain) {
+			if (!cld->cld_lostlock)
+				continue;
+
+			/* the list lock is dropped while this entry is
+			 * processed and retaken before the walk continues */
+			spin_unlock(&config_list_lock);
+
+			LASSERT(atomic_read(&cld->cld_refcount) > 0);
+
+			/* Whether we enqueued again or not in mgc_process_log,
+			 * we're done with the ref from the old enqueue */
+			/* the put of the current entry is deferred by one
+			 * iteration (via cld_prev), so it stays referenced
+			 * until the walk has moved past it */
+			if (cld_prev)
+				config_log_put(cld_prev);
+			cld_prev = cld;
+
+			cld->cld_lostlock = 0;
+			if (likely(!stopped))
+				do_requeue(cld);
+
+			spin_lock(&config_list_lock);
+		}
+		spin_unlock(&config_list_lock);
+		if (cld_prev)
+			config_log_put(cld_prev);
+
+		/* break after scanning the list so that we can drop
+		 * refcount to losing lock clds */
+		if (unlikely(stopped)) {
+			/* retake the lock: the code after the loop expects
+			 * config_list_lock to be held */
+			spin_lock(&config_list_lock);
+			break;
+		}
+
+		/* Wait a bit to see if anyone else needs a requeue */
+		lwi = (struct l_wait_info) { 0 };
+		l_wait_event(rq_waitq, rq_state & (RQ_NOW | RQ_STOP),
+			     &lwi);
+		spin_lock(&config_list_lock);
+	}
+	/* spinlock and while guarantee RQ_NOW and RQ_LATER are not set */
+	rq_state &= ~RQ_RUNNING;
+	spin_unlock(&config_list_lock);
+
+	/* wakes the waiter in mgc_precleanup() */
+	complete(&rq_exit);
+
+	CDEBUG(D_MGC, "Ending requeue thread\n");
+	RETURN(rc);
+}
+
+/*
+ * Mark a cld as needing a requeue and wake the requeue thread.
+ * We are responsible for dropping the config log reference from here on out.
+ *
+ * Nothing is done when the log is stopping or is already marked.
+ * Otherwise a reference is taken and cld_lostlock is set; if the requeue
+ * thread is being stopped (RQ_STOP) both are undone again right away,
+ * else RQ_NOW is set and the thread is woken.
+ */
+static void mgc_requeue_add(struct config_llog_data *cld)
+{
+	ENTRY;
+
+	CDEBUG(D_INFO, "log %s: requeue (r=%d sp=%d st=%x)\n",
+	       cld->cld_logname, atomic_read(&cld->cld_refcount),
+	       cld->cld_stopping, rq_state);
+	LASSERT(atomic_read(&cld->cld_refcount) > 0);
+
+	mutex_lock(&cld->cld_lock);
+	if (cld->cld_stopping || cld->cld_lostlock) {
+		mutex_unlock(&cld->cld_lock);
+		RETURN_EXIT;
+	}
+	/* this refcount will be released in mgc_requeue_thread. */
+	config_log_get(cld);
+	cld->cld_lostlock = 1;
+	mutex_unlock(&cld->cld_lock);
+
+	/* Hold lock for rq_state */
+	spin_lock(&config_list_lock);
+	if (rq_state & RQ_STOP) {
+		spin_unlock(&config_list_lock);
+		/* NOTE(review): cleared here without cld_lock held */
+		cld->cld_lostlock = 0;
+		config_log_put(cld);
+	} else {
+		rq_state |= RQ_NOW;
+		spin_unlock(&config_list_lock);
+		wake_up(&rq_waitq);
+	}
+	EXIT;
+}
+
+/********************** class fns **********************/
+
+/*
+ * Attach the MGC to a mounted backing filesystem.
+ *
+ * Takes cl_mgc_sem, looks up the fsfilt operations for the filesystem
+ * type, sets up the lvfs run context and looks up the MOUNT_CONFIGS_DIR
+ * directory.  On success cl_mgc_sem stays held and an obd reference is
+ * taken; both are released by mgc_fs_cleanup().
+ *
+ * @obd: MGC device
+ * @sb:  Lustre super block; its lsi_srv_mnt must equal @mnt
+ * @mnt: mount of the backing filesystem
+ *
+ * Returns 0 on success or a negative errno, in which case the semaphore
+ * has been released and obd_fsops / cl_mgc_vfsmnt are left NULL.
+ */
+static int mgc_fs_setup(struct obd_device *obd, struct super_block *sb,
+			struct vfsmount *mnt)
+{
+	struct lvfs_run_ctxt saved;
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct client_obd *cli = &obd->u.cli;
+	struct dentry *dentry;
+	char *label;
+	int err = 0;
+	ENTRY;
+
+	LASSERT(lsi);
+	LASSERT(lsi->lsi_srv_mnt == mnt);
+
+	/* The mgc fs exclusion sem. Only one fs can be setup at a time. */
+	down(&cli->cl_mgc_sem);
+
+	cfs_cleanup_group_info();
+
+	obd->obd_fsops = fsfilt_get_ops(lsi->lsi_fstype);
+	if (IS_ERR(obd->obd_fsops)) {
+		err = PTR_ERR(obd->obd_fsops);
+		/* do not leave an ERR_PTR behind: mgc_fs_cleanup() treats
+		 * any non-NULL obd_fsops as a valid ops pointer */
+		obd->obd_fsops = NULL;
+		up(&cli->cl_mgc_sem);
+		/* device name first, then the fs type, as the format says */
+		CERROR("%s: No fstype %s: rc = %d\n", obd->obd_name,
+		       lsi->lsi_fstype, err);
+		RETURN(err);
+	}
+
+	cli->cl_mgc_vfsmnt = mnt;
+	err = fsfilt_setup(obd, mnt->mnt_sb);
+	if (err)
+		GOTO(err_ops, err);
+
+	OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt);
+	obd->obd_lvfs_ctxt.pwdmnt = mnt;
+	obd->obd_lvfs_ctxt.pwd = mnt->mnt_root;
+	obd->obd_lvfs_ctxt.fs = get_ds();
+
+	/* the lookup runs inside the context that was just filled in */
+	push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+	dentry = ll_lookup_one_len(MOUNT_CONFIGS_DIR, cfs_fs_pwd(current->fs),
+				   strlen(MOUNT_CONFIGS_DIR));
+	pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+	if (IS_ERR(dentry)) {
+		err = PTR_ERR(dentry);
+		CERROR("cannot lookup %s directory: rc = %d\n",
+		       MOUNT_CONFIGS_DIR, err);
+		GOTO(err_ops, err);
+	}
+	cli->cl_mgc_configs_dir = dentry;
+
+	/* We take an obd ref to insure that we can't get to mgc_cleanup
+	   without calling mgc_fs_cleanup first. */
+	class_incref(obd, "mgc_fs", obd);
+
+	label = fsfilt_get_label(obd, mnt->mnt_sb);
+	if (label)
+		CDEBUG(D_MGC, "MGC using disk labelled=%s\n", label);
+
+	/* We keep the cl_mgc_sem until mgc_fs_cleanup */
+	RETURN(0);
+
+err_ops:
+	fsfilt_put_ops(obd->obd_fsops);
+	obd->obd_fsops = NULL;
+	cli->cl_mgc_vfsmnt = NULL;
+	up(&cli->cl_mgc_sem);
+	RETURN(err);
+}
+
+/*
+ * Undo mgc_fs_setup(): drop the configs directory dentry and the "mgc_fs"
+ * obd reference, put the fsfilt ops and release cl_mgc_sem.
+ *
+ * Must only be called after a successful mgc_fs_setup() (asserted through
+ * cl_mgc_vfsmnt).  Always returns 0.
+ */
+static int mgc_fs_cleanup(struct obd_device *obd)
+{
+	struct client_obd *cli = &obd->u.cli;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(cli->cl_mgc_vfsmnt != NULL);
+
+	if (cli->cl_mgc_configs_dir != NULL) {
+		struct lvfs_run_ctxt saved;
+		push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+		l_dput(cli->cl_mgc_configs_dir);
+		cli->cl_mgc_configs_dir = NULL;
+		pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+		/* pairs with class_incref() in mgc_fs_setup() */
+		class_decref(obd, "mgc_fs", obd);
+	}
+
+	cli->cl_mgc_vfsmnt = NULL;
+	/* NOTE(review): obd_fsops is not reset to NULL after the put */
+	if (obd->obd_fsops)
+		fsfilt_put_ops(obd->obd_fsops);
+
+	/* taken in mgc_fs_setup() */
+	up(&cli->cl_mgc_sem);
+
+	RETURN(rc);
+}
+
+/* Count of MGC devices; incremented in mgc_setup(), decremented below. */
+static atomic_t mgc_count = ATOMIC_INIT(0);
+
+/*
+ * Pre-cleanup hook.
+ *
+ * OBD_CLEANUP_EARLY does nothing.  At OBD_CLEANUP_EXPORTS the MGC count is
+ * dropped; when it reaches zero and the requeue thread is running, the
+ * thread is told to stop and waited for.  Then the client import is
+ * cleaned up and the llog subsystem is finished.
+ *
+ * Returns the result of obd_llog_finish() for OBD_CLEANUP_EXPORTS and 0
+ * for every other stage.
+ */
+static int mgc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+	int rc = 0;
+	ENTRY;
+
+	switch (stage) {
+	case OBD_CLEANUP_EARLY:
+		break;
+	case OBD_CLEANUP_EXPORTS:
+		if (atomic_dec_and_test(&mgc_count)) {
+			int running;
+			/* stop requeue thread */
+			spin_lock(&config_list_lock);
+			running = rq_state & RQ_RUNNING;
+			if (running)
+				rq_state |= RQ_STOP;
+			spin_unlock(&config_list_lock);
+			if (running) {
+				wake_up(&rq_waitq);
+				/* completed by mgc_requeue_thread() on exit */
+				wait_for_completion(&rq_exit);
+			}
+		}
+		obd_cleanup_client_import(obd);
+		rc = obd_llog_finish(obd, 0);
+		if (rc != 0)
+			CERROR("failed to cleanup llogging subsystems\n");
+		break;
+	}
+	RETURN(rc);
+}
+
+/*
+ * Final cleanup of an MGC device: removes profiles when the obd type's
+ * refcount is at most 1, tears down the procfs entries, drops the ptlrpcd
+ * reference taken in mgc_setup() and cleans up the client obd.
+ *
+ * mgc_fs_cleanup() must have run already (asserted via cl_mgc_vfsmnt).
+ * Returns the result of client_obd_cleanup().
+ */
+static int mgc_cleanup(struct obd_device *obd)
+{
+	struct client_obd *cli = &obd->u.cli;
+	int rc;
+	ENTRY;
+
+	LASSERT(cli->cl_mgc_vfsmnt == NULL);
+
+	/* COMPAT_146 - old config logs may have added profiles we don't
+	   know about */
+	if (obd->obd_type->typ_refcnt <= 1)
+		/* Only for the last mgc */
+		class_del_profiles();
+
+	lprocfs_obd_cleanup(obd);
+	/* pairs with ptlrpcd_addref() in mgc_setup() */
+	ptlrpcd_decref();
+
+	rc = client_obd_cleanup(obd);
+	RETURN(rc);
+}
+
+/*
+ * Set up an MGC device.
+ *
+ * Takes a ptlrpcd reference, sets up the client obd, the llog subsystem
+ * and the procfs entries, and, for the first MGC, starts the requeue
+ * thread.
+ *
+ * Returns 0 on success or a negative errno.  On failure the client obd is
+ * cleaned up (when it was set up) and the ptlrpcd reference is dropped.
+ * NOTE(review): the llog and procfs setup are not undone when the thread
+ * cannot be started.
+ */
+static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	struct lprocfs_static_vars lvars;
+	struct task_struct *task;
+	int rc;
+	ENTRY;
+
+	ptlrpcd_addref();
+
+	rc = client_obd_setup(obd, lcfg);
+	if (rc)
+		GOTO(err_decref, rc);
+
+	rc = obd_llog_init(obd, &obd->obd_olg, obd, NULL);
+	if (rc) {
+		CERROR("failed to setup llogging subsystems\n");
+		GOTO(err_cleanup, rc);
+	}
+
+	lprocfs_mgc_init_vars(&lvars);
+	lprocfs_obd_setup(obd, lvars.obd_vars);
+	sptlrpc_lprocfs_cliobd_attach(obd);
+
+	if (atomic_inc_return(&mgc_count) == 1) {
+		rq_state = 0;
+		init_waitqueue_head(&rq_waitq);
+
+		/* start requeue thread */
+		/* Keep the result as a pointer and test it with IS_ERR():
+		 * squeezing a valid task pointer into an int and testing
+		 * that with IS_ERR_VALUE() can misreport it as an error. */
+		task = kthread_run(mgc_requeue_thread, NULL,
+				   "ll_cfg_requeue");
+		if (IS_ERR(task)) {
+			rc = PTR_ERR(task);
+			CERROR("%s: Cannot start requeue thread (%d),"
+			       "no more log updates!\n",
+			       obd->obd_name, rc);
+			/* undo the increment so that a later mgc_setup()
+			 * tries to start the thread again */
+			atomic_dec(&mgc_count);
+			GOTO(err_cleanup, rc);
+		}
+	}
+
+	RETURN(0);
+
+err_cleanup:
+	client_obd_cleanup(obd);
+err_decref:
+	ptlrpcd_decref();
+	RETURN(rc);
+}
+
+/*
+ * Blocking/cancel callback for MGC locks (based on ll_mdc_blocking_ast).
+ *
+ * LDLM_CB_BLOCKING:  cancels the lock asynchronously and returns the result
+ *                    of ldlm_cli_cancel().
+ * LDLM_CB_CANCELING: drops the reference on the config log passed in @data
+ *                    and, unless the log is stopping or the MGC is
+ *                    disconnecting, queues the log for a requeue first.
+ * Any other @flag hits LBUG().
+ *
+ * @data is the struct config_llog_data of the lock and may be NULL.
+ */
+static int mgc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+			    void *data, int flag)
+{
+	struct lustre_handle lockh;
+	struct config_llog_data *cld = (struct config_llog_data *)data;
+	int rc = 0;
+	ENTRY;
+
+	switch (flag) {
+	case LDLM_CB_BLOCKING:
+		/* mgs wants the lock, give it up... */
+		LDLM_DEBUG(lock, "MGC blocking CB");
+		ldlm_lock2handle(lock, &lockh);
+		rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
+		break;
+	case LDLM_CB_CANCELING:
+		/* We've given up the lock, prepare ourselves to update. */
+		LDLM_DEBUG(lock, "MGC cancel CB");
+
+		CDEBUG(D_MGC, "Lock res "LPX64" (%.8s)\n",
+		       lock->l_resource->lr_name.name[0],
+		       (char *)&lock->l_resource->lr_name.name[0]);
+
+		if (!cld) {
+			CDEBUG(D_INFO, "missing data, won't requeue\n");
+			break;
+		}
+
+		/* held at mgc_process_log(). */
+		LASSERT(atomic_read(&cld->cld_refcount) > 0);
+		/* Are we done with this log? */
+		if (cld->cld_stopping) {
+			CDEBUG(D_MGC, "log %s: stopping, won't requeue\n",
+			       cld->cld_logname);
+			config_log_put(cld);
+			break;
+		}
+		/* Make sure not to re-enqueue when the mgc is stopping
+		   (we get called from client_disconnect_export) */
+		if (!lock->l_conn_export ||
+		    !lock->l_conn_export->exp_obd->u.cli.cl_conn_count) {
+			CDEBUG(D_MGC, "log %.8s: disconnecting, won't requeue\n",
+			       cld->cld_logname);
+			config_log_put(cld);
+			break;
+		}
+
+		/* Re-enqueue now */
+		/* mgc_requeue_add() takes its own reference when it queues
+		 * the log, so the lock's reference is dropped here anyway */
+		mgc_requeue_add(cld);
+		config_log_put(cld);
+		break;
+	default:
+		LBUG();
+	}
+
+	RETURN(rc);
+}
+
+/* Not sure where this should go... */
+/* Values assigned to a request's rq_delay_limit before it is sent. */
+#define  MGC_ENQUEUE_LIMIT 50		/* lock enqueue, see mgc_enqueue() */
+#define  MGC_TARGET_REG_LIMIT 10	/* see mgc_target_register() */
+#define  MGC_SEND_PARAM_LIMIT 10	/* see mgc_set_mgs_param() */
+
+/*
+ * Send a parameter to the MGS.
+ *
+ * Packs @msp into an MGS_SET_INFO request, waits for the reply and copies
+ * the returned parameter back into @msp.
+ *
+ * Returns 0 on success, -ENOMEM when the request or its buffer cannot be
+ * obtained, -EPROTO when a successful reply carries no parameter buffer,
+ * or the error from ptlrpc_queue_wait().
+ */
+static int mgc_set_mgs_param(struct obd_export *exp,
+			     struct mgs_send_param *msp)
+{
+	struct ptlrpc_request *req;
+	struct mgs_send_param *req_msp, *rep_msp;
+	int rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+					&RQF_MGS_SET_INFO, LUSTRE_MGS_VERSION,
+					MGS_SET_INFO);
+	if (!req)
+		RETURN(-ENOMEM);
+
+	req_msp = req_capsule_client_get(&req->rq_pill, &RMF_MGS_SEND_PARAM);
+	if (!req_msp) {
+		ptlrpc_req_finished(req);
+		RETURN(-ENOMEM);
+	}
+
+	memcpy(req_msp, msp, sizeof(*req_msp));
+	ptlrpc_request_set_replen(req);
+
+	/* Limit how long we will wait for the enqueue to complete */
+	req->rq_delay_limit = MGC_SEND_PARAM_LIMIT;
+	rc = ptlrpc_queue_wait(req);
+	if (!rc) {
+		rep_msp = req_capsule_server_get(&req->rq_pill,
+						 &RMF_MGS_SEND_PARAM);
+		/* do not copy from a reply that lacks the buffer */
+		if (rep_msp)
+			memcpy(msp, rep_msp, sizeof(*rep_msp));
+		else
+			rc = -EPROTO;
+	}
+
+	ptlrpc_req_finished(req);
+
+	RETURN(rc);
+}
+
+/*
+ * Take a config lock so we can get cancel notifications.
+ *
+ * @data is the struct config_llog_data whose cld_resid is locked; @type
+ * and @mode fill the enqueue info, @flags and @lockh are passed through
+ * to ldlm_cli_enqueue().  @lsm, @policy, @bl_cb, @cp_cb, @gl_cb, @lvb_len
+ * and @lvb_swabber are not used: the callbacks are always
+ * mgc_blocking_ast and ldlm_completion_ast.
+ *
+ * The request's delay limit is 5 for sptlrpc logs and for servers,
+ * MGC_ENQUEUE_LIMIT otherwise.
+ *
+ * Returns -ENOMEM when no request can be allocated, otherwise the result
+ * of ldlm_cli_enqueue().
+ */
+static int mgc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm,
+		       __u32 type, ldlm_policy_data_t *policy, __u32 mode,
+		       __u64 *flags, void *bl_cb, void *cp_cb, void *gl_cb,
+		       void *data, __u32 lvb_len, void *lvb_swabber,
+		       struct lustre_handle *lockh)
+{
+	struct config_llog_data *cld = (struct config_llog_data *)data;
+	struct ldlm_enqueue_info einfo = { type, mode, mgc_blocking_ast,
+			 ldlm_completion_ast, NULL, NULL, NULL };
+	struct ptlrpc_request *req;
+	int short_limit = cld_is_sptlrpc(cld);
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_MGC, "Enqueue for %s (res "LPX64")\n", cld->cld_logname,
+	       cld->cld_resid.name[0]);
+
+	/* We need a callback for every lockholder, so don't try to
+	   ldlm_lock_match (see rev 1.1.2.11.2.47) */
+	req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+					&RQF_LDLM_ENQUEUE, LUSTRE_DLM_VERSION,
+					LDLM_ENQUEUE);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	/* no LVB is requested in the reply */
+	req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, 0);
+	ptlrpc_request_set_replen(req);
+
+	/* check if this is server or client */
+	if (cld->cld_cfg.cfg_sb) {
+		struct lustre_sb_info *lsi = s2lsi(cld->cld_cfg.cfg_sb);
+		if (lsi && IS_SERVER(lsi))
+			short_limit = 1;
+	}
+	/* Limit how long we will wait for the enqueue to complete */
+	req->rq_delay_limit = short_limit ? 5 : MGC_ENQUEUE_LIMIT;
+	rc = ldlm_cli_enqueue(exp, &req, &einfo, &cld->cld_resid, NULL, flags,
+			      NULL, 0, LVB_T_NONE, lockh, 0);
+	/* A failed enqueue should still call the mgc_blocking_ast,
+	   where it will be requeued if needed ("grant failed"). */
+	ptlrpc_req_finished(req);
+	RETURN(rc);
+}
+
+/*
+ * Release a lock reference: drops one @mode reference on the lock behind
+ * @lockh.  @exp and @md are not used.  Always returns 0.
+ */
+static int mgc_cancel(struct obd_export *exp, struct lov_stripe_md *md,
+		      __u32 mode, struct lustre_handle *lockh)
+{
+	ENTRY;
+
+	ldlm_lock_decref(lockh, mode);
+
+	RETURN(0);
+}
+
+/*
+ * Called when the import becomes active again (see mgc_import_event()):
+ * sets RQ_NOW under config_list_lock and wakes the requeue thread.
+ * The obd argument is not used.
+ */
+static void mgc_notify_active(struct obd_device *unused)
+{
+	/* wakeup mgc_requeue_thread to requeue mgc lock */
+	spin_lock(&config_list_lock);
+	rq_state |= RQ_NOW;
+	spin_unlock(&config_list_lock);
+	wake_up(&rq_waitq);
+
+	/* TODO: Help the MGS rebuild nidtbl. -jay */
+}
+
+/*
+ * Send target_reg message to MGS.
+ *
+ * Packs @mti into an MGS_TARGET_REG request, waits for the reply and
+ * copies the returned target info back into @mti.
+ *
+ * Returns 0 on success, -ENOMEM when the request or its buffer cannot be
+ * obtained, -EPROTO when a successful reply carries no target info buffer,
+ * or the error from ptlrpc_queue_wait().
+ */
+static int mgc_target_register(struct obd_export *exp,
+			       struct mgs_target_info *mti)
+{
+	struct ptlrpc_request  *req;
+	struct mgs_target_info *req_mti, *rep_mti;
+	int		     rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+					&RQF_MGS_TARGET_REG, LUSTRE_MGS_VERSION,
+					MGS_TARGET_REG);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	req_mti = req_capsule_client_get(&req->rq_pill, &RMF_MGS_TARGET_INFO);
+	if (!req_mti) {
+		ptlrpc_req_finished(req);
+		RETURN(-ENOMEM);
+	}
+
+	memcpy(req_mti, mti, sizeof(*req_mti));
+	ptlrpc_request_set_replen(req);
+	CDEBUG(D_MGC, "register %s\n", mti->mti_svname);
+	/* Limit how long we will wait for the enqueue to complete */
+	req->rq_delay_limit = MGC_TARGET_REG_LIMIT;
+
+	rc = ptlrpc_queue_wait(req);
+	if (!rc) {
+		rep_mti = req_capsule_server_get(&req->rq_pill,
+						 &RMF_MGS_TARGET_INFO);
+		/* do not copy from a reply that lacks the buffer */
+		if (rep_mti) {
+			memcpy(mti, rep_mti, sizeof(*rep_mti));
+			CDEBUG(D_MGC, "register %s got index = %d\n",
+			       mti->mti_svname, mti->mti_stripe_index);
+		} else {
+			rc = -EPROTO;
+		}
+	}
+	ptlrpc_req_finished(req);
+
+	RETURN(rc);
+}
+
+/*
+ * Handle the "set info" keys understood by the MGC.
+ *
+ * KEY_INIT_RECOV_BACKUP: @val is an int; reconnects the import when it is
+ *                        neither FULL nor NEW, or when the value is > 1.
+ * KEY_REGISTER_TARGET:   @val is a struct mgs_target_info that is sent to
+ *                        the MGS and updated from the reply.
+ * KEY_SET_FS:            @val is a struct super_block; runs mgc_fs_setup().
+ * KEY_CLEAR_FS:          no value; runs mgc_fs_cleanup().
+ * KEY_SET_INFO:          @val is a struct mgs_send_param sent to the MGS.
+ * KEY_MGSSEC:            @val is an sptlrpc flavor name; an empty value
+ *                        means "keep the current flavor, or use null".
+ *
+ * @env and @set are not used.  Returns 0 on success, -EINVAL for an
+ * unknown key or a value of the wrong size, -EPERM when KEY_MGSSEC asks
+ * for a flavor different from the one in use, or the error of the
+ * operation that was run.
+ */
+int mgc_set_info_async(const struct lu_env *env, struct obd_export *exp,
+		       obd_count keylen, void *key, obd_count vallen,
+		       void *val, struct ptlrpc_request_set *set)
+{
+	int rc = -EINVAL;
+	ENTRY;
+
+	/* Turn off initial_recov after we try all backup servers once */
+	if (KEY_IS(KEY_INIT_RECOV_BACKUP)) {
+		struct obd_import *imp = class_exp2cliimp(exp);
+		int value;
+		if (vallen != sizeof(int))
+			RETURN(-EINVAL);
+		value = *(int *)val;
+		CDEBUG(D_MGC, "InitRecov %s %d/d%d:i%d:r%d:or%d:%s\n",
+		       imp->imp_obd->obd_name, value,
+		       imp->imp_deactive, imp->imp_invalid,
+		       imp->imp_replayable, imp->imp_obd->obd_replayable,
+		       ptlrpc_import_state_name(imp->imp_state));
+		/* Resurrect if we previously died */
+		if ((imp->imp_state != LUSTRE_IMP_FULL &&
+		     imp->imp_state != LUSTRE_IMP_NEW) || value > 1)
+			ptlrpc_reconnect_import(imp);
+		RETURN(0);
+	}
+	/* FIXME move this to mgc_process_config */
+	if (KEY_IS(KEY_REGISTER_TARGET)) {
+		struct mgs_target_info *mti;
+		if (vallen != sizeof(struct mgs_target_info))
+			RETURN(-EINVAL);
+		mti = (struct mgs_target_info *)val;
+		CDEBUG(D_MGC, "register_target %s %#x\n",
+		       mti->mti_svname, mti->mti_flags);
+		rc =  mgc_target_register(exp, mti);
+		RETURN(rc);
+	}
+	if (KEY_IS(KEY_SET_FS)) {
+		struct super_block *sb = (struct super_block *)val;
+		struct lustre_sb_info *lsi;
+		if (vallen != sizeof(struct super_block))
+			RETURN(-EINVAL);
+		lsi = s2lsi(sb);
+		rc = mgc_fs_setup(exp->exp_obd, sb, lsi->lsi_srv_mnt);
+		if (rc) {
+			CERROR("set_fs got %d\n", rc);
+		}
+		RETURN(rc);
+	}
+	if (KEY_IS(KEY_CLEAR_FS)) {
+		if (vallen != 0)
+			RETURN(-EINVAL);
+		rc = mgc_fs_cleanup(exp->exp_obd);
+		if (rc) {
+			CERROR("clear_fs got %d\n", rc);
+		}
+		RETURN(rc);
+	}
+	if (KEY_IS(KEY_SET_INFO)) {
+		struct mgs_send_param *msp;
+
+		/* NOTE(review): unlike the other keys, vallen is not
+		 * checked against the size of the structure here */
+		msp = (struct mgs_send_param *)val;
+		rc =  mgc_set_mgs_param(exp, msp);
+		RETURN(rc);
+	}
+	if (KEY_IS(KEY_MGSSEC)) {
+		struct client_obd     *cli = &exp->exp_obd->u.cli;
+		struct sptlrpc_flavor  flvr;
+
+		/*
+		 * An empty string means "use the current flavor"; if none
+		 * has been set yet, it is set to null.
+		 *
+		 * If a flavor has been set previously, the requested flavor
+		 * must match the existing one.
+		 */
+		if (vallen == 0) {
+			if (cli->cl_flvr_mgc.sf_rpc != SPTLRPC_FLVR_INVALID)
+				RETURN(0);
+			val = "null";
+			vallen = 4;
+		}
+
+		rc = sptlrpc_parse_flavor(val, &flvr);
+		if (rc) {
+			CERROR("invalid sptlrpc flavor %s to MGS\n",
+			       (char *) val);
+			RETURN(rc);
+		}
+
+		/*
+		 * the caller already holds a mutex
+		 */
+		if (cli->cl_flvr_mgc.sf_rpc == SPTLRPC_FLVR_INVALID) {
+			cli->cl_flvr_mgc = flvr;
+		} else if (memcmp(&cli->cl_flvr_mgc, &flvr,
+				  sizeof(flvr)) != 0) {
+			char    str[20];
+
+			sptlrpc_flavor2name(&cli->cl_flvr_mgc,
+					    str, sizeof(str));
+			LCONSOLE_ERROR("asking sptlrpc flavor %s to MGS but "
+				       "currently %s is in use\n",
+				       (char *) val, str);
+			rc = -EPERM;
+		}
+		RETURN(rc);
+	}
+
+	/* unknown key: rc is still -EINVAL */
+	RETURN(rc);
+}
+
+/*
+ * Handle "get info" requests.  Only KEY_CONN_DATA is understood: the
+ * import's connect data is copied into @val when *@vallen equals the size
+ * of struct obd_connect_data.
+ *
+ * Returns 0 when the data was copied, -EINVAL for any other key or size.
+ */
+static int mgc_get_info(const struct lu_env *env, struct obd_export *exp,
+			__u32 keylen, void *key, __u32 *vallen, void *val,
+			struct lov_stripe_md *unused)
+{
+	struct obd_connect_data *out = val;
+	struct obd_import *imp;
+
+	if (!KEY_IS(KEY_CONN_DATA))
+		return -EINVAL;
+
+	imp = class_exp2cliimp(exp);
+	if (*vallen != sizeof(*out))
+		return -EINVAL;
+
+	*out = imp->imp_connect_data;
+	return 0;
+}
+
+/*
+ * React to a state change of the MGC import.
+ *
+ * DISCON:     reports imperative recovery as down when IMP_RECOV is set.
+ * INVALIDATE: cleans up the local locks of the obd's namespace.
+ * ACTIVE:     clears obd_no_recov, wakes the requeue thread and reports
+ *             imperative recovery as up when IMP_RECOV is set.
+ * INACTIVE, OCD, DEACTIVATE, ACTIVATE: nothing to do.
+ * Any other event is logged and hits LBUG().
+ *
+ * Always returns 0.
+ */
+static int mgc_import_event(struct obd_device *obd,
+			    struct obd_import *imp,
+			    enum obd_import_event event)
+{
+	LASSERT(imp->imp_obd == obd);
+	CDEBUG(D_MGC, "import event %#x\n", event);
+
+	switch (event) {
+	case IMP_EVENT_INACTIVE:
+	case IMP_EVENT_OCD:
+	case IMP_EVENT_DEACTIVATE:
+	case IMP_EVENT_ACTIVATE:
+		/* no action needed */
+		break;
+	case IMP_EVENT_DISCON:
+		/* MGC imports should not wait for recovery */
+		if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV))
+			ptlrpc_pinger_ir_down();
+		break;
+	case IMP_EVENT_INVALIDATE:
+		ldlm_namespace_cleanup(obd->obd_namespace, LDLM_FL_LOCAL_ONLY);
+		break;
+	case IMP_EVENT_ACTIVE:
+		CDEBUG(D_INFO, "%s: Reactivating import\n", obd->obd_name);
+		/* Clearing obd_no_recov allows us to continue pinging */
+		obd->obd_no_recov = 0;
+		mgc_notify_active(obd);
+		if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV))
+			ptlrpc_pinger_ir_up();
+		break;
+	default:
+		CERROR("Unknown import event %#x\n", event);
+		LBUG();
+	}
+	RETURN(0);
+}
+/* Set up the MGC's LLOG_CONFIG_REPL_CTXT llog context (.o_llog_init). */
+static int mgc_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+			 struct obd_device *tgt, int *index)
+{
+	struct llog_ctxt *ctxt;
+	int rc;
+	ENTRY;
+
+	LASSERT(olg == &obd->obd_olg);
+
+	/* the REPL context is registered with llog_client_ops */
+	rc = llog_setup(NULL, obd, olg, LLOG_CONFIG_REPL_CTXT, tgt,
+			&llog_client_ops);
+	if (rc)
+		GOTO(out, rc);
+
+	ctxt = llog_group_get_ctxt(olg, LLOG_CONFIG_REPL_CTXT);
+	if (!ctxt)
+		GOTO(out, rc = -ENODEV);
+
+	llog_initiator_connect(ctxt);
+	llog_ctxt_put(ctxt);
+
+	RETURN(0);
+out:	/* NOTE(review): tears down ORIG_CTXT, though REPL_CTXT was set up */
+	ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+	if (ctxt)
+		llog_cleanup(NULL, ctxt);
+	RETURN(rc);
+}
+/* Clean up the REPL and ORIG config llog contexts, if any; @count is unused. */
+static int mgc_llog_finish(struct obd_device *obd, int count)
+{
+	struct llog_ctxt *ctxt;
+
+	ENTRY;
+
+	ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT);
+	if (ctxt)
+		llog_cleanup(NULL, ctxt);
+
+	ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+	if (ctxt)
+		llog_cleanup(NULL, ctxt);
+	RETURN(0); /* always succeeds */
+}
+/* Bulk buffer sizes, in pages, used for CONFIG_READ RPCs. */
+enum {
+	CONFIG_READ_NRPAGES_INIT = 1 << (20 - PAGE_CACHE_SHIFT), /* 1MB of pages */
+	CONFIG_READ_NRPAGES      = 4 /* used once cfg_last_idx is non-zero */
+};
+
+/**
+ * Walk the mgs_nidtbl_entry records in @data (@datalen bytes); for each one
+ * build and process an LCFG_PARAM "<osc|mdc>.import=connection=..." record
+ * for the matching local obd. Entries newer than @max_version are rejected;
+ * @mne_swab requests a byte swap. Returns 0 or an error code.
+ */
+static int mgc_apply_recover_logs(struct obd_device *mgc,
+				  struct config_llog_data *cld,
+				  __u64 max_version,
+				  void *data, int datalen, bool mne_swab)
+{
+	struct config_llog_instance *cfg = &cld->cld_cfg;
+	struct lustre_sb_info       *lsi = s2lsi(cfg->cfg_sb);
+	struct mgs_nidtbl_entry *entry;
+	struct lustre_cfg       *lcfg;
+	struct lustre_cfg_bufs   bufs;
+	u64   prev_version = 0;
+	char *inst, *buf;
+	int   bufsz, pos;
+	int   rc  = 0, off = 0;
+	ENTRY;
+
+	LASSERT(cfg->cfg_instance != NULL);
+	LASSERT(cfg->cfg_sb == cfg->cfg_instance);
+
+	OBD_ALLOC(inst, PAGE_CACHE_SIZE);
+	if (inst == NULL)
+		RETURN(-ENOMEM);
+
+	if (!IS_SERVER(lsi)) {
+		pos = snprintf(inst, PAGE_CACHE_SIZE, "%p", cfg->cfg_instance);
+		if (pos >= PAGE_CACHE_SIZE) {
+			OBD_FREE(inst, PAGE_CACHE_SIZE);
+			RETURN(-E2BIG);
+		}
+	} else {
+		LASSERT(IS_MDT(lsi));
+		rc = server_name2svname(lsi->lsi_svname, inst, NULL,
+					PAGE_CACHE_SIZE);
+		if (rc) {
+			OBD_FREE(inst, PAGE_CACHE_SIZE);
+			RETURN(-EINVAL);
+		}
+		pos = strlen(inst);
+	}
+
+	++pos; /* keep inst's NUL; the scratch buffer starts right after it */
+	buf   = inst + pos;
+	bufsz = PAGE_CACHE_SIZE - pos;
+
+	while (datalen > 0) {
+		int   entry_len = sizeof(*entry);
+		int   is_ost;
+		struct obd_device *obd;
+		char *obdname, *cname, *params, *uuid;
+
+		rc = -EINVAL; /* result of every sanity-check "break" below */
+		if (datalen < sizeof(*entry))
+			break;
+
+		entry = (typeof(entry))(data + off);
+
+		/* sanity check */
+		if (entry->mne_nid_type != 0) /* only support type 0 for ipv4 */
+			break;
+		if (entry->mne_nid_count == 0) /* at least one nid entry */
+			break;
+		if (entry->mne_nid_size != sizeof(lnet_nid_t))
+			break;
+
+		entry_len += entry->mne_nid_count * entry->mne_nid_size;
+		if (datalen < entry_len) /* must have entry_len at least */
+			break;
+
+		/* Keep this swab for normal mixed endian handling. LU-1644 */
+		if (mne_swab)
+			lustre_swab_mgs_nidtbl_entry(entry);
+		if (entry->mne_length > PAGE_CACHE_SIZE) {
+			CERROR("MNE too large (%u)\n", entry->mne_length);
+			break;
+		}
+
+		if (entry->mne_length < entry_len)
+			break;
+
+		off     += entry->mne_length;
+		datalen -= entry->mne_length;
+		if (datalen < 0)
+			break;
+
+		if (entry->mne_version > max_version) {
+			CERROR("entry index(%lld) is over max_index(%lld)\n",
+			       entry->mne_version, max_version);
+			break;
+		}
+
+		if (prev_version >= entry->mne_version) {
+			CERROR("index unsorted, prev %lld, now %lld\n",
+			       prev_version, entry->mne_version);
+			break;
+		}
+		prev_version = entry->mne_version;
+
+		/*
+		 * Write a string with format "nid::instance" to
+		 * lustre/<osc|mdc>/<target>-<osc|mdc>-<instance>/import.
+		 */
+
+		is_ost = entry->mne_type == LDD_F_SV_TYPE_OST;
+		memset(buf, 0, bufsz);
+		obdname = buf;
+		pos = 0;
+
+		/* lustre-OST0001-osc-<instance #> */
+		strcpy(obdname, cld->cld_logname);
+		cname = strrchr(obdname, '-');
+		if (cname == NULL) {
+			CERROR("mgc %s: invalid logname %s\n",
+			       mgc->obd_name, obdname);
+			break;
+		}
+
+		pos = cname - obdname;
+		obdname[pos] = 0;
+		pos += sprintf(obdname + pos, "-%s%04x",
+				  is_ost ? "OST" : "MDT", entry->mne_index);
+
+		cname = is_ost ? "osc" : "mdc";
+		pos += sprintf(obdname + pos, "-%s-%s", cname, inst);
+		lustre_cfg_bufs_reset(&bufs, obdname);
+
+		/* find the obd by obdname */
+		obd = class_name2obd(obdname);
+		if (obd == NULL) {
+			CDEBUG(D_INFO, "mgc %s: cannot find obdname %s\n",
+			       mgc->obd_name, obdname);
+			rc = 0;
+			/* this is a safe race, when the ost is starting up...*/
+			continue;
+		}
+
+		/* osc.import = "connection=<Conn UUID>::<target instance>" */
+		++pos;
+		params = buf + pos;
+		pos += sprintf(params, "%s.import=%s", cname, "connection=");
+		uuid = buf + pos;
+
+		down_read(&obd->u.cli.cl_sem);
+		if (obd->u.cli.cl_import == NULL) {
+			/* client does not connect to the OST yet */
+			up_read(&obd->u.cli.cl_sem);
+			rc = 0;
+			continue;
+		}
+
+		/* TODO: iterate all nids to find one */
+		/* find uuid by nid */
+		rc = client_import_find_conn(obd->u.cli.cl_import,
+					     entry->u.nids[0],
+					     (struct obd_uuid *)uuid);
+		up_read(&obd->u.cli.cl_sem);
+		if (rc < 0) {
+			CERROR("mgc: cannot find uuid by nid %s\n",
+			       libcfs_nid2str(entry->u.nids[0]));
+			break;
+		}
+
+		CDEBUG(D_INFO, "Find uuid %s by nid %s\n",
+		       uuid, libcfs_nid2str(entry->u.nids[0]));
+
+		pos += strlen(uuid);
+		pos += sprintf(buf + pos, "::%u", entry->mne_instance);
+		LASSERT(pos < bufsz);
+
+		lustre_cfg_bufs_set_string(&bufs, 1, params);
+
+		rc = -ENOMEM;
+		lcfg = lustre_cfg_new(LCFG_PARAM, &bufs);
+		if (lcfg == NULL) {
+			CERROR("mgc: cannot allocate memory\n");
+			break;
+		}
+
+		CDEBUG(D_INFO, "ir apply logs "LPD64"/"LPD64" for %s -> %s\n",
+		       prev_version, max_version, obdname, params);
+
+		rc = class_process_config(lcfg);
+		lustre_cfg_free(lcfg);
+		if (rc)
+			CDEBUG(D_INFO, "process config for %s error %d\n",
+			       obdname, rc);
+
+		/* continue, even one with error */
+	}
+
+	OBD_FREE(inst, PAGE_CACHE_SIZE);
+	RETURN(rc);
+}
+
+/**
+ * Called when the MGS has notified this client that a target restarted.
+ * Sends CONFIG_READ RPCs to fetch the recovery logs and applies each page;
+ * keeps reading until the MGS reports no more data. Returns 0 or an error. */
+static int mgc_process_recover_log(struct obd_device *obd,
+				   struct config_llog_data *cld)
+{
+	struct ptlrpc_request *req = NULL;
+	struct config_llog_instance *cfg = &cld->cld_cfg;
+	struct mgs_config_body *body;
+	struct mgs_config_res  *res;
+	struct ptlrpc_bulk_desc *desc;
+	struct page **pages;
+	int nrpages;
+	bool eof = true;
+	bool mne_swab = false;
+	int i;
+	int ealen;
+	int rc;
+	ENTRY;
+
+	/* allocate buffer for bulk transfer.
+	 * if this is the first time for this mgs to read logs,
+	 * CONFIG_READ_NRPAGES_INIT will be used since it will read all logs
+	 * once; otherwise, it only reads increment of logs, this should be
+	 * small and CONFIG_READ_NRPAGES will be used.
+	 */
+	nrpages = CONFIG_READ_NRPAGES;
+	if (cfg->cfg_last_idx == 0) /* the first time */
+		nrpages = CONFIG_READ_NRPAGES_INIT;
+
+	OBD_ALLOC(pages, sizeof(*pages) * nrpages);
+	if (pages == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	for (i = 0; i < nrpages; i++) {
+		pages[i] = alloc_page(GFP_IOFS);
+		if (pages[i] == NULL)
+			GOTO(out, rc = -ENOMEM);
+	}
+
+again:
+	LASSERT(cld_is_recover(cld));
+	LASSERT(mutex_is_locked(&cld->cld_lock));
+	req = ptlrpc_request_alloc(class_exp2cliimp(cld->cld_mgcexp),
+				   &RQF_MGS_CONFIG_READ);
+	if (req == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_MGS_VERSION, MGS_CONFIG_READ);
+	if (rc)
+		GOTO(out, rc);
+
+	/* pack request */
+	body = req_capsule_client_get(&req->rq_pill, &RMF_MGS_CONFIG_BODY);
+	LASSERT(body != NULL);
+	LASSERT(sizeof(body->mcb_name) > strlen(cld->cld_logname));
+	if (strlcpy(body->mcb_name, cld->cld_logname, sizeof(body->mcb_name))
+	    >= sizeof(body->mcb_name))
+		GOTO(out, rc = -E2BIG);
+	body->mcb_offset = cfg->cfg_last_idx + 1;
+	body->mcb_type   = cld->cld_type;
+	body->mcb_bits   = PAGE_CACHE_SHIFT;
+	body->mcb_units  = nrpages;
+
+	/* allocate bulk transfer descriptor */
+	desc = ptlrpc_prep_bulk_imp(req, nrpages, 1, BULK_PUT_SINK,
+				    MGS_BULK_PORTAL);
+	if (desc == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	for (i = 0; i < nrpages; i++)
+		ptlrpc_prep_bulk_page_pin(desc, pages[i], 0, PAGE_CACHE_SIZE);
+
+	ptlrpc_request_set_replen(req);
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		GOTO(out, rc);
+
+	res = req_capsule_server_get(&req->rq_pill, &RMF_MGS_CONFIG_RES);
+	if (res == NULL || res->mcr_size < res->mcr_offset) /* bad reply */
+		GOTO(out, rc = -EINVAL);
+
+	/* always update the index even though it might have errors with
+	 * handling the recover logs */
+	cfg->cfg_last_idx = res->mcr_offset;
+	eof = res->mcr_offset == res->mcr_size;
+
+	CDEBUG(D_INFO, "Latest version "LPD64", more %d.\n",
+	       res->mcr_offset, eof == false);
+
+	ealen = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, 0);
+	if (ealen < 0)
+		GOTO(out, rc = ealen);
+
+	if (ealen > nrpages << PAGE_CACHE_SHIFT)
+		GOTO(out, rc = -EINVAL);
+
+	if (ealen == 0) { /* no logs transferred */
+		if (!eof)
+			rc = -EINVAL;
+		GOTO(out, rc);
+	}
+
+	mne_swab = !!ptlrpc_rep_need_swab(req);
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0)
+	/* This import flag means the server did an extra swab of IR MNE
+	 * records (fixed in LU-1252), reverse it here if needed. LU-1644 */
+	if (unlikely(req->rq_import->imp_need_mne_swab))
+		mne_swab = !mne_swab;
+#else
+#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab"
+#endif
+
+	for (i = 0; i < nrpages && ealen > 0; i++) {
+		int rc2;
+		void *ptr;
+
+		ptr = kmap(pages[i]);
+		rc2 = mgc_apply_recover_logs(obd, cld, res->mcr_offset, ptr,
+					     min_t(int, ealen, PAGE_CACHE_SIZE),
+					     mne_swab);
+		kunmap(pages[i]);
+		if (rc2 < 0) { /* logged only; rc itself is left untouched */
+			CWARN("Process recover log %s error %d\n",
+			      cld->cld_logname, rc2);
+			break;
+		}
+
+		ealen -= PAGE_CACHE_SIZE;
+	}
+
+out:
+	if (req)
+		ptlrpc_req_finished(req);
+
+	if (rc == 0 && !eof)
+		goto again; /* more log data pending: fetch the next batch */
+
+	if (pages) {
+		for (i = 0; i < nrpages; i++) {
+			if (pages[i] == NULL) /* allocation stopped here */
+				break;
+			__free_page(pages[i]);
+		}
+		OBD_FREE(pages, sizeof(*pages) * nrpages);
+	}
+	RETURN(rc);
+}
+
+/* Parse @cld's config llog via the REPL llog context; cld_lock must be held.
+ * local_only means it cannot get remote llogs: 0 for sptlrpc, else -EIO. */
+static int mgc_process_cfg_log(struct obd_device *mgc,
+			       struct config_llog_data *cld,
+			       int local_only)
+{
+	struct llog_ctxt *ctxt, *lctxt = NULL;
+	struct lvfs_run_ctxt *saved_ctxt;
+	struct lustre_sb_info *lsi = NULL;
+	int rc = 0, must_pop = 0;
+	bool sptlrpc_started = false;
+	ENTRY;
+
+	LASSERT(cld);
+	LASSERT(mutex_is_locked(&cld->cld_lock));
+
+	/*
+	 * local copy of sptlrpc log is controlled elsewhere, don't try to
+	 * read it up here.
+	 */
+	if (cld_is_sptlrpc(cld) && local_only)
+		RETURN(0);
+
+	if (cld->cld_cfg.cfg_sb)
+		lsi = s2lsi(cld->cld_cfg.cfg_sb);
+
+	ctxt = llog_get_context(mgc, LLOG_CONFIG_REPL_CTXT);
+	if (!ctxt) {
+		CERROR("missing llog context\n");
+		RETURN(-EINVAL);
+	}
+
+	OBD_ALLOC_PTR(saved_ctxt);
+	if (saved_ctxt == NULL) {
+		llog_ctxt_put(ctxt); /* don't leak the ctxt reference */
+		RETURN(-ENOMEM);
+	}
+
+	lctxt = llog_get_context(mgc, LLOG_CONFIG_ORIG_CTXT);
+
+	if (local_only) /* no local log at client side */
+		GOTO(out_pop, rc = -EIO);
+
+	if (cld_is_sptlrpc(cld)) {
+		sptlrpc_conf_log_update_begin(cld->cld_logname);
+		sptlrpc_started = true;
+	}
+
+	/* logname and instance info should be the same, so use our
+	   copy of the instance for the update.  The cfg_last_idx will
+	   be updated here. */
+	rc = class_config_parse_llog(NULL, ctxt, cld->cld_logname,
+				     &cld->cld_cfg);
+	EXIT;
+
+out_pop:
+	llog_ctxt_put(ctxt);
+	if (lctxt)
+		llog_ctxt_put(lctxt);
+	if (must_pop)
+		pop_ctxt(saved_ctxt, &mgc->obd_lvfs_ctxt, NULL);
+
+	OBD_FREE_PTR(saved_ctxt);
+	/*
+	 * update settings on existing OBDs. doing it inside
+	 * of llog_process_lock so no device is attaching/detaching
+	 * in parallel.
+	 * the logname must be <fsname>-sptlrpc
+	 */
+	if (sptlrpc_started) {
+		LASSERT(cld_is_sptlrpc(cld));
+		sptlrpc_conf_log_update_end(cld->cld_logname);
+		class_notify_sptlrpc_conf(cld->cld_logname,
+					  strlen(cld->cld_logname) -
+					  strlen("-sptlrpc"));
+	}
+
+	RETURN(rc);
+}
+
+/** Get a config log from the MGS and process it (clients and servers).
+ * Under cld_lock: enqueue a CR cfg lock, then run the recover-log or
+ * config-log handler. The cfg lock is cancelled again before returning.
+ */
+int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld)
+{
+	struct lustre_handle lockh = { 0 };
+	__u64 flags = LDLM_FL_NO_LRU;
+	int rc = 0, rcl;
+	ENTRY;
+
+	LASSERT(cld);
+
+	/* I don't want multiple processes running process_log at once --
+	   sounds like badness.  It actually might be fine, as long as
+	   we're not trying to update from the same log
+	   simultaneously (in which case we should use a per-log sem.) */
+	mutex_lock(&cld->cld_lock);
+	if (cld->cld_stopping) {
+		mutex_unlock(&cld->cld_lock);
+		RETURN(0);
+	}
+
+	OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20);
+
+	CDEBUG(D_MGC, "Process log %s:%p from %d\n", cld->cld_logname,
+	       cld->cld_cfg.cfg_instance, cld->cld_cfg.cfg_last_idx + 1);
+
+	/* Get the cfg lock on the llog */
+	rcl = mgc_enqueue(mgc->u.cli.cl_mgc_mgsexp, NULL, LDLM_PLAIN, NULL,
+			  LCK_CR, &flags, NULL, NULL, NULL,
+			  cld, 0, NULL, &lockh);
+	if (rcl == 0) {
+		/* Get the cld, it will be released in mgc_blocking_ast. */
+		config_log_get(cld);
+		rc = ldlm_lock_set_data(&lockh, (void *)cld);
+		LASSERT(rc == 0);
+	} else {
+		CDEBUG(D_MGC, "Can't get cfg lock: %d\n", rcl);
+
+		/* mark cld_lostlock so that it will requeue
+		 * after MGC becomes available. */
+		cld->cld_lostlock = 1;
+		/* Get extra reference, it will be put in requeue thread */
+		config_log_get(cld);
+	}
+
+	/* a recover log is only fetched when the cfg lock was granted */
+	if (cld_is_recover(cld)) {
+		rc = 0; /* this is not a fatal error for recover log */
+		if (rcl == 0)
+			rc = mgc_process_recover_log(mgc, cld);
+	} else {
+		rc = mgc_process_cfg_log(mgc, cld, rcl != 0); /* local_only */
+	}
+
+	CDEBUG(D_MGC, "%s: configuration from log '%s' %sed (%d).\n",
+	       mgc->obd_name, cld->cld_logname, rc ? "fail" : "succeed", rc);
+
+	mutex_unlock(&cld->cld_lock);
+
+	/* Now drop the lock so MGS can revoke it */
+	if (!rcl) {
+		rcl = mgc_cancel(mgc->u.cli.cl_mgc_mgsexp, NULL,
+				 LCK_CR, &lockh);
+		if (rcl)
+			CERROR("Can't drop cfg lock: %d\n", rcl);
+	}
+
+	RETURN(rc);
+}
+
+
+/** Dispatch an lcfg command sent to the MGC (called from lustre_process_log).
+ * LCFG_LOG_START gets the config log from the MGS, processes it to start
+ * any services, and adds it to the list of logs to watch (follow).
+ */
+static int mgc_process_config(struct obd_device *obd, obd_count len, void *buf)
+{
+	struct lustre_cfg *lcfg = buf;
+	struct config_llog_instance *cfg = NULL;
+	char *logname;
+	int rc = 0;
+	ENTRY;
+
+	switch(lcfg->lcfg_command) {
+	case LCFG_LOV_ADD_OBD: {
+		/* Overloading this cfg command: register a new target */
+		struct mgs_target_info *mti;
+
+		if (LUSTRE_CFG_BUFLEN(lcfg, 1) !=
+		    sizeof(struct mgs_target_info))
+			GOTO(out, rc = -EINVAL);
+
+		mti = (struct mgs_target_info *)lustre_cfg_buf(lcfg, 1);
+		CDEBUG(D_MGC, "add_target %s %#x\n",
+		       mti->mti_svname, mti->mti_flags);
+		rc = mgc_target_register(obd->u.cli.cl_mgc_mgsexp, mti);
+		break;
+	}
+	case LCFG_LOV_DEL_OBD:
+		/* Unregister has no meaning at the moment. */
+		CERROR("lov_del_obd unimplemented\n");
+		rc = -ENOSYS;
+		break;
+	case LCFG_SPTLRPC_CONF: {
+		rc = sptlrpc_process_config(lcfg);
+		break;
+	}
+	case LCFG_LOG_START: {
+		struct config_llog_data *cld;
+		struct super_block *sb;
+
+		logname = lustre_cfg_string(lcfg, 1); /* buf 1: log name */
+		cfg = (struct config_llog_instance *)lustre_cfg_buf(lcfg, 2);
+		sb = *(struct super_block **)lustre_cfg_buf(lcfg, 3);
+
+		CDEBUG(D_MGC, "parse_log %s from %d\n", logname,
+		       cfg->cfg_last_idx);
+
+		/* We're only called through here on the initial mount */
+		rc = config_log_add(obd, logname, cfg, sb);
+		if (rc)
+			break;
+		cld = config_log_find(logname, cfg);
+		if (cld == NULL) {
+			rc = -ENOENT;
+			break;
+		}
+
+		/* COMPAT_146 */
+		/* FIXME only set this for old logs!  Right now this forces
+		   us to always skip the "inside markers" check */
+		cld->cld_cfg.cfg_flags |= CFG_F_COMPAT146;
+
+		rc = mgc_process_log(obd, cld);
+		if (rc == 0 && cld->cld_recover != NULL) {
+			if (OCD_HAS_FLAG(&obd->u.cli.cl_import->
+					 imp_connect_data, IMP_RECOV)) {
+				rc = mgc_process_log(obd, cld->cld_recover);
+			} else { /* no IMP_RECOV flag: drop the recover log */
+				struct config_llog_data *cir = cld->cld_recover;
+				cld->cld_recover = NULL;
+				config_log_put(cir);
+			}
+			if (rc)
+				CERROR("Cannot process recover llog %d\n", rc);
+		}
+		config_log_put(cld); /* pairs with config_log_find() above */
+
+		break;
+	}
+	case LCFG_LOG_END: {
+		logname = lustre_cfg_string(lcfg, 1);
+
+		if (lcfg->lcfg_bufcount >= 2) /* otherwise cfg stays NULL */
+			cfg = (struct config_llog_instance *)lustre_cfg_buf(
+				lcfg, 2);
+		rc = config_log_end(logname, cfg);
+		break;
+	}
+	default: {
+		CERROR("Unknown command: %d\n", lcfg->lcfg_command);
+		GOTO(out, rc = -EINVAL);
+
+	}
+	}
+out:
+	RETURN(rc);
+}
+/* Method table registered for the MGC obd type by mgc_init(). */
+struct obd_ops mgc_obd_ops = {
+	.o_owner	= THIS_MODULE,
+	.o_setup	= mgc_setup,
+	.o_precleanup   = mgc_precleanup,
+	.o_cleanup      = mgc_cleanup,
+	.o_add_conn     = client_import_add_conn,
+	.o_del_conn     = client_import_del_conn,
+	.o_connect      = client_connect_import,
+	.o_disconnect   = client_disconnect_export,
+	/* .o_enqueue is not wired up; mgc_enqueue() is called directly */
+	.o_cancel       = mgc_cancel,
+	/* .o_iocontrol is not wired up */
+	.o_set_info_async = mgc_set_info_async,
+	.o_get_info       = mgc_get_info,
+	.o_import_event = mgc_import_event,
+	.o_llog_init    = mgc_llog_init,
+	.o_llog_finish  = mgc_llog_finish,
+	.o_process_config = mgc_process_config,
+};
+/* Module init: register the MGC obd type and its method table. */
+int __init mgc_init(void)
+{
+	return class_register_type(&mgc_obd_ops, NULL, NULL,
+				   LUSTRE_MGC_NAME, NULL);
+}
+/* Module exit: unregister the MGC obd type. */
+static void /*__exit*/ mgc_exit(void)
+{
+	class_unregister_type(LUSTRE_MGC_NAME);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Management Client");
+MODULE_LICENSE("GPL");
+
+module_init(mgc_init);
+module_exit(mgc_exit);
diff --git a/drivers/staging/lustre/lustre/obdclass/Makefile b/drivers/staging/lustre/lustre/obdclass/Makefile
new file mode 100644
index 000000000000..d2763f3f83cd
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/Makefile
@@ -0,0 +1,13 @@
+obj-$(CONFIG_LUSTRE_FS) += obdclass.o llog_test.o
+
+obdclass-y := linux/linux-module.o linux/linux-obdo.o linux/linux-sysctl.o \
+	      llog.o llog_cat.o llog_obd.o llog_swab.o class_obd.o debug.o \
+	      genops.o uuid.o llog_ioctl.o lprocfs_status.o		   \
+	      lprocfs_jobstats.o lustre_handles.o lustre_peer.o llog_osd.o \
+	      local_storage.o statfs_pack.o obdo.o obd_config.o obd_mount.o\
+	      mea.o lu_object.o dt_object.o capa.o cl_object.o   \
+	      cl_page.o cl_lock.o cl_io.o lu_ref.o acl.o idmap.o	   \
+	      md_local_object.o
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/obdclass/acl.c b/drivers/staging/lustre/lustre/obdclass/acl.c
new file mode 100644
index 000000000000..c2a6702c9f2c
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/acl.c
@@ -0,0 +1,546 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/acl.c
+ *
+ * Lustre Access Control List.
+ *
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <lu_object.h>
+#include <lustre_acl.h>
+#include <lustre_eacl.h>
+#include <obd_support.h>
+
+#ifdef CONFIG_FS_POSIX_ACL
+
+#define CFS_ACL_XATTR_VERSION POSIX_ACL_XATTR_VERSION
+/* Values stored (little-endian) in ext_acl_xattr_entry::e_stat. */
+enum {
+	ES_UNK  = 0,    /* unknown state (set when built from a posix ACL) */
+	ES_UNC  = 1,    /* ACL entry is not changed */
+	ES_MOD  = 2,    /* ACL entry is modified */
+	ES_ADD  = 3,    /* ACL entry is added */
+	ES_DEL  = 4     /* ACL entry is deleted */
+};
+/* Convert an extended ACL entry from little-endian to CPU byte order. */
+static inline void lustre_ext_acl_le_to_cpu(ext_acl_xattr_entry *d,
+					    ext_acl_xattr_entry *s)
+{
+	d->e_stat = le32_to_cpu(s->e_stat);
+	d->e_id   = le32_to_cpu(s->e_id);
+	d->e_perm = le16_to_cpu(s->e_perm);
+	d->e_tag  = le16_to_cpu(s->e_tag);
+}
+/* Convert an extended ACL entry from CPU to little-endian byte order. */
+static inline void lustre_ext_acl_cpu_to_le(ext_acl_xattr_entry *d,
+					    ext_acl_xattr_entry *s)
+{
+	d->e_stat = cpu_to_le32(s->e_stat);
+	d->e_id   = cpu_to_le32(s->e_id);
+	d->e_perm = cpu_to_le16(s->e_perm);
+	d->e_tag  = cpu_to_le16(s->e_tag);
+}
+/* Convert a posix ACL entry from little-endian to CPU byte order. */
+static inline void lustre_posix_acl_le_to_cpu(posix_acl_xattr_entry *d,
+					      posix_acl_xattr_entry *s)
+{
+	d->e_id   = le32_to_cpu(s->e_id);
+	d->e_perm = le16_to_cpu(s->e_perm);
+	d->e_tag  = le16_to_cpu(s->e_tag);
+}
+/* Convert a posix ACL entry from CPU to little-endian byte order. */
+static inline void lustre_posix_acl_cpu_to_le(posix_acl_xattr_entry *d,
+					      posix_acl_xattr_entry *s)
+{
+	d->e_id   = cpu_to_le32(s->e_id);
+	d->e_perm = cpu_to_le16(s->e_perm);
+	d->e_tag  = cpu_to_le16(s->e_tag);
+}
+
+/* Shrink *header to new_count entries; return its size in bytes or -ENOMEM.
+ * if "new_count == 0", then "new = {a_version, NULL}", NOT NULL. */
+static int lustre_posix_acl_xattr_reduce_space(posix_acl_xattr_header **header,
+					       int old_count, int new_count)
+{
+	int old_size = CFS_ACL_XATTR_SIZE(old_count, posix_acl_xattr);
+	int new_size = CFS_ACL_XATTR_SIZE(new_count, posix_acl_xattr);
+	posix_acl_xattr_header *new;
+
+	if (unlikely(old_count <= new_count)) /* nothing to trim */
+		return old_size;
+
+	OBD_ALLOC(new, new_size);
+	if (unlikely(new == NULL)) /* *header is left untouched */
+		return -ENOMEM;
+
+	memcpy(new, *header, new_size);
+	OBD_FREE(*header, old_size);
+	*header = new;
+	return new_size;
+}
+/* Reallocate *header, sized for old_count entries, down to its a_count.
+ * Returns 0 or -ENOMEM. if "a_count == 0", "new = {0, NULL}", NOT NULL. */
+static int lustre_ext_acl_xattr_reduce_space(ext_acl_xattr_header **header,
+					     int old_count)
+{
+	int ext_count = le32_to_cpu((*header)->a_count);
+	int ext_size = CFS_ACL_XATTR_SIZE(ext_count, ext_acl_xattr);
+	int old_size = CFS_ACL_XATTR_SIZE(old_count, ext_acl_xattr);
+	ext_acl_xattr_header *new;
+
+	if (unlikely(old_count <= ext_count)) /* nothing to trim */
+		return 0;
+
+	OBD_ALLOC(new, ext_size);
+	if (unlikely(new == NULL)) /* *header is left untouched */
+		return -ENOMEM;
+
+	memcpy(new, *header, ext_size);
+	OBD_FREE(*header, old_size);
+	*header = new;
+	return 0;
+}
+
+/* Generate a new extended ACL from the posix ACL in @header (@size bytes),
+ * marking every entry ES_UNK. Returns the allocation, or ERR_PTR(-EINVAL)
+ * for a negative @size, or ERR_PTR(-ENOMEM). */
+ext_acl_xattr_header *
+lustre_posix_acl_xattr_2ext(posix_acl_xattr_header *header, int size)
+{
+	int count, i, esize;
+	ext_acl_xattr_header *new;
+	ENTRY;
+
+	if (unlikely(size < 0))
+		RETURN(ERR_PTR(-EINVAL));
+	else if (!size) /* empty input: @header is not read at all */
+		count = 0;
+	else
+		count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr);
+	esize = CFS_ACL_XATTR_SIZE(count, ext_acl_xattr);
+	OBD_ALLOC(new, esize);
+	if (unlikely(new == NULL))
+		RETURN(ERR_PTR(-ENOMEM));
+
+	new->a_count = cpu_to_le32(count);
+	for (i = 0; i < count; i++) { /* fields are copied without conversion */
+		new->a_entries[i].e_tag  = header->a_entries[i].e_tag;
+		new->a_entries[i].e_perm = header->a_entries[i].e_perm;
+		new->a_entries[i].e_id   = header->a_entries[i].e_id;
+		new->a_entries[i].e_stat = cpu_to_le32(ES_UNK);
+	}
+
+	RETURN(new);
+}
+EXPORT_SYMBOL(lustre_posix_acl_xattr_2ext);
+
+/* Filter out the "nobody" entries in the posix ACL, storing a new copy in
+ * *out. Returns the new size in bytes, 0 when @size is 0 (*out untouched),
+ * or a negative errno (-EINVAL, -ENOMEM, -EIO for a bad tag or id). */
+int lustre_posix_acl_xattr_filter(posix_acl_xattr_header *header, int size,
+				  posix_acl_xattr_header **out)
+{
+	int count, i, j, rc = 0;
+	__u32 id;
+	posix_acl_xattr_header *new;
+	ENTRY;
+
+	if (unlikely(size < 0))
+		RETURN(-EINVAL);
+	else if (!size)
+		RETURN(0);
+
+	OBD_ALLOC(new, size);
+	if (unlikely(new == NULL))
+		RETURN(-ENOMEM);
+
+	new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION);
+	count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr);
+	for (i = 0, j = 0; i < count; i++) { /* j counts the entries kept */
+		id = le32_to_cpu(header->a_entries[i].e_id);
+		switch (le16_to_cpu(header->a_entries[i].e_tag)) {
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			if (id != ACL_UNDEFINED_ID)
+				GOTO(_out, rc = -EIO);
+
+			memcpy(&new->a_entries[j++], &header->a_entries[i],
+			       sizeof(posix_acl_xattr_entry));
+			break;
+		case ACL_USER:
+			if (id != NOBODY_UID)
+				memcpy(&new->a_entries[j++],
+				       &header->a_entries[i],
+				       sizeof(posix_acl_xattr_entry));
+			break;
+		case ACL_GROUP:
+			if (id != NOBODY_GID)
+				memcpy(&new->a_entries[j++],
+				       &header->a_entries[i],
+				       sizeof(posix_acl_xattr_entry));
+			break;
+		default:
+			GOTO(_out, rc = -EIO);
+		}
+	}
+
+	/* free unused space. */
+	rc = lustre_posix_acl_xattr_reduce_space(&new, count, j);
+	if (rc >= 0) { /* rc is the size of the (possibly shrunk) buffer */
+		size = rc;
+		*out = new;
+		rc = 0;
+	}
+	EXIT;
+
+_out:
+	if (rc) { /* failure: release the copy and return the errno */
+		OBD_FREE(new, size);
+		size = rc;
+	}
+	return size;
+}
+EXPORT_SYMBOL(lustre_posix_acl_xattr_filter);
+
+/*
+ * Release the posix ACL space; @size is handed to OBD_FREE() as its size.
+ */
+void lustre_posix_acl_xattr_free(posix_acl_xattr_header *header, int size)
+{
+	OBD_FREE(header, size);
+}
+EXPORT_SYMBOL(lustre_posix_acl_xattr_free);
+
+/* Release the extended ACL space; the allocation size is recomputed from
+ * the entry count stored in the header. */
+void lustre_ext_acl_xattr_free(ext_acl_xattr_header *header)
+{
+	int nr_entries = le32_to_cpu(header->a_count);
+
+	OBD_FREE(header, CFS_ACL_XATTR_SIZE(nr_entries, ext_acl_xattr));
+}
+EXPORT_SYMBOL(lustre_ext_acl_xattr_free);
+/* Circular search of @header for an entry whose tag and id equal @entry's,
+ * starting at *pos; on a hit *pos is moved to the following slot. */
+static ext_acl_xattr_entry *
+lustre_ext_acl_xattr_search(ext_acl_xattr_header *header,
+			    posix_acl_xattr_entry *entry, int *pos)
+{
+	int nr = le32_to_cpu(header->a_count);
+	int pass, lo, hi, idx;
+
+	/* pass 0 scans [*pos, nr), pass 1 wraps around to [0, *pos) */
+	for (pass = 0; pass < 2; pass++) {
+		if (pass == 0) {
+			lo = *pos;
+			hi = nr;
+		} else {
+			lo = 0;
+			hi = *pos;
+		}
+		for (idx = lo; idx < hi; idx++) {
+			ext_acl_xattr_entry *cur = &header->a_entries[idx];
+
+			/* both sides are compared in their stored byte order */
+			if (cur->e_tag != entry->e_tag ||
+			    cur->e_id != entry->e_id)
+				continue;
+
+			*pos = (idx + 1 >= nr) ? 0 : idx + 1;
+			return cur;
+		}
+	}
+	return NULL;
+}
+
+/* Merge the posix ACL and the extended ACL into a new posix ACL in *out.
+ * Returns the new ACL's size in bytes, or a negative errno (-EINVAL,
+ * -ENOMEM, or -EIO for an inconsistent extended ACL). */
+int lustre_acl_xattr_merge2posix(posix_acl_xattr_header *posix_header, int size,
+				 ext_acl_xattr_header *ext_header,
+				 posix_acl_xattr_header **out)
+{
+	int posix_count, posix_size, i, j;
+	int ext_count = le32_to_cpu(ext_header->a_count), pos = 0, rc = 0;
+	posix_acl_xattr_entry pe = {ACL_MASK, 0, ACL_UNDEFINED_ID};
+	posix_acl_xattr_header *new;
+	ext_acl_xattr_entry *ee, ae;
+	ENTRY;
+
+	lustre_posix_acl_cpu_to_le(&pe, &pe);
+	ee = lustre_ext_acl_xattr_search(ext_header, &pe, &pos);
+	if (ee == NULL || le32_to_cpu(ee->e_stat) == ES_DEL) {
+		/* there are only base ACL entries at most. */
+		posix_count = 3;
+		posix_size = CFS_ACL_XATTR_SIZE(posix_count, posix_acl_xattr);
+		OBD_ALLOC(new, posix_size);
+		if (unlikely(new == NULL))
+			RETURN(-ENOMEM);
+
+		new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION);
+		for (i = 0, j = 0; i < ext_count; i++) {
+			lustre_ext_acl_le_to_cpu(&ae,
+						 &ext_header->a_entries[i]);
+			switch (ae.e_tag) {
+			case ACL_USER_OBJ:
+			case ACL_GROUP_OBJ:
+			case ACL_OTHER:
+				/* also refuse to write past the 3 base slots */
+				if (ae.e_id != ACL_UNDEFINED_ID ||
+				    (ae.e_stat != ES_DEL && j >= posix_count))
+					GOTO(_out, rc = -EIO);
+				if (ae.e_stat != ES_DEL) {
+					new->a_entries[j].e_tag =
+						ext_header->a_entries[i].e_tag;
+					new->a_entries[j].e_perm =
+						ext_header->a_entries[i].e_perm;
+					new->a_entries[j++].e_id =
+						ext_header->a_entries[i].e_id;
+				}
+				break;
+			case ACL_MASK:
+			case ACL_USER:
+			case ACL_GROUP:
+				if (ae.e_stat == ES_DEL)
+					break; /* else fall through: -EIO */
+			default:
+				GOTO(_out, rc = -EIO);
+			}
+		}
+	} else {
+		/* maybe there are valid ACL_USER or ACL_GROUP entries in the
+		 * original server-side ACL, they are regarded as ES_UNC stat.*/
+		int ori_posix_count;
+
+		if (unlikely(size < 0))
+			RETURN(-EINVAL);
+		else if (!size)
+			ori_posix_count = 0;
+		else
+			ori_posix_count =
+				CFS_ACL_XATTR_COUNT(size, posix_acl_xattr);
+		posix_count = ori_posix_count + ext_count;
+		posix_size = CFS_ACL_XATTR_SIZE(posix_count, posix_acl_xattr);
+		OBD_ALLOC(new, posix_size);
+		if (unlikely(new == NULL))
+			RETURN(-ENOMEM);
+
+		new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION);
+		/* 1. process the unchanged ACL entries
+		 *    in the original server-side ACL. */
+		pos = 0;
+		for (i = 0, j = 0; i < ori_posix_count; i++) {
+			ee = lustre_ext_acl_xattr_search(ext_header,
+					&posix_header->a_entries[i], &pos);
+			if (ee == NULL)
+				memcpy(&new->a_entries[j++],
+				       &posix_header->a_entries[i],
+				       sizeof(posix_acl_xattr_entry));
+		}
+
+		/* 2. process the non-deleted entries
+		 *    from client-side extended ACL. */
+		for (i = 0; i < ext_count; i++) {
+			if (le32_to_cpu(ext_header->a_entries[i].e_stat) !=
+			    ES_DEL) {
+				new->a_entries[j].e_tag =
+						ext_header->a_entries[i].e_tag;
+				new->a_entries[j].e_perm =
+						ext_header->a_entries[i].e_perm;
+				new->a_entries[j++].e_id =
+						ext_header->a_entries[i].e_id;
+			}
+		}
+	}
+
+	/* free unused space. */
+	rc = lustre_posix_acl_xattr_reduce_space(&new, posix_count, j);
+	if (rc >= 0) {
+		posix_size = rc;
+		*out = new;
+		rc = 0;
+	}
+	EXIT;
+
+_out:
+	if (rc) {
+		OBD_FREE(new, posix_size);
+		posix_size = rc;
+	}
+	return posix_size;
+}
+EXPORT_SYMBOL(lustre_acl_xattr_merge2posix);
+
+/* Merge the posix ACL and the extended ACL into a new extended ACL whose
+ * entries carry ES_UNC/ES_MOD/ES_ADD/ES_DEL; matched @ext_header entries
+ * get their e_stat updated too. Returns the ACL or ERR_PTR(errno). */
+ext_acl_xattr_header *
+lustre_acl_xattr_merge2ext(posix_acl_xattr_header *posix_header, int size,
+			   ext_acl_xattr_header *ext_header)
+{
+	int ori_ext_count, posix_count, ext_count, ext_size;
+	int i, j, pos = 0, rc = 0;
+	posix_acl_xattr_entry pae;
+	ext_acl_xattr_header *new;
+	ext_acl_xattr_entry *ee, eae;
+	ENTRY;
+
+	if (unlikely(size < 0))
+		RETURN(ERR_PTR(-EINVAL));
+	else if (!size)
+		posix_count = 0;
+	else
+		posix_count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr);
+	ori_ext_count = le32_to_cpu(ext_header->a_count);
+	ext_count = posix_count + ori_ext_count; /* upper bound on entries */
+	ext_size = CFS_ACL_XATTR_SIZE(ext_count, ext_acl_xattr);
+
+	OBD_ALLOC(new, ext_size);
+	if (unlikely(new == NULL))
+		RETURN(ERR_PTR(-ENOMEM));
+
+	for (i = 0, j = 0; i < posix_count; i++) {
+		lustre_posix_acl_le_to_cpu(&pae, &posix_header->a_entries[i]);
+		switch (pae.e_tag) {
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			if (pae.e_id != ACL_UNDEFINED_ID)
+				GOTO(out, rc = -EIO); /* else fall through */
+		case ACL_USER:
+			/* ignore "nobody" entry. */
+			if (pae.e_id == NOBODY_UID)
+				break;
+
+			new->a_entries[j].e_tag =
+					posix_header->a_entries[i].e_tag;
+			new->a_entries[j].e_perm =
+					posix_header->a_entries[i].e_perm;
+			new->a_entries[j].e_id =
+					posix_header->a_entries[i].e_id;
+			ee = lustre_ext_acl_xattr_search(ext_header,
+					&posix_header->a_entries[i], &pos);
+			if (ee) {
+				if (posix_header->a_entries[i].e_perm !=
+								ee->e_perm)
+					/* entry modified. */
+					ee->e_stat =
+					new->a_entries[j++].e_stat =
+							cpu_to_le32(ES_MOD);
+				else
+					/* entry unchanged. */
+					ee->e_stat =
+					new->a_entries[j++].e_stat =
+							cpu_to_le32(ES_UNC);
+			} else {
+				/* new entry. */
+				new->a_entries[j++].e_stat =
+							cpu_to_le32(ES_ADD);
+			}
+			break;
+		case ACL_GROUP: /* same handling as ACL_USER, keyed on GID */
+			/* ignore "nobody" entry. */
+			if (pae.e_id == NOBODY_GID)
+				break;
+			new->a_entries[j].e_tag =
+					posix_header->a_entries[i].e_tag;
+			new->a_entries[j].e_perm =
+					posix_header->a_entries[i].e_perm;
+			new->a_entries[j].e_id =
+					posix_header->a_entries[i].e_id;
+			ee = lustre_ext_acl_xattr_search(ext_header,
+					&posix_header->a_entries[i], &pos);
+			if (ee) {
+				if (posix_header->a_entries[i].e_perm !=
+								ee->e_perm)
+					/* entry modified. */
+					ee->e_stat =
+					new->a_entries[j++].e_stat =
+							cpu_to_le32(ES_MOD);
+				else
+					/* entry unchanged. */
+					ee->e_stat =
+					new->a_entries[j++].e_stat =
+							cpu_to_le32(ES_UNC);
+			} else {
+				/* new entry. */
+				new->a_entries[j++].e_stat =
+							cpu_to_le32(ES_ADD);
+			}
+			break;
+		default:
+			GOTO(out, rc = -EIO);
+		}
+	}
+
+	/* process deleted entries. */
+	for (i = 0; i < ori_ext_count; i++) {
+		lustre_ext_acl_le_to_cpu(&eae, &ext_header->a_entries[i]);
+		if (eae.e_stat == ES_UNK) { /* still ES_UNK: no match above */
+			/* ignore "nobody" entry. */
+			if ((eae.e_tag == ACL_USER && eae.e_id == NOBODY_UID) ||
+			    (eae.e_tag == ACL_GROUP && eae.e_id == NOBODY_GID))
+				continue;
+
+			new->a_entries[j].e_tag =
+						ext_header->a_entries[i].e_tag;
+			new->a_entries[j].e_perm =
+						ext_header->a_entries[i].e_perm;
+			new->a_entries[j].e_id = ext_header->a_entries[i].e_id;
+			new->a_entries[j++].e_stat = cpu_to_le32(ES_DEL);
+		}
+	}
+
+	new->a_count = cpu_to_le32(j);
+	/* free unused space. */
+	rc = lustre_ext_acl_xattr_reduce_space(&new, ext_count);
+	EXIT;
+
+out:
+	if (rc) { /* new still has its original ext_size allocation here */
+		OBD_FREE(new, ext_size);
+		new = ERR_PTR(rc);
+	}
+	return new;
+}
+EXPORT_SYMBOL(lustre_acl_xattr_merge2ext);
+
+#endif
diff --git a/drivers/staging/lustre/lustre/obdclass/capa.c b/drivers/staging/lustre/lustre/obdclass/capa.c
new file mode 100644
index 000000000000..3e532f5106e4
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/capa.c
@@ -0,0 +1,401 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/capa.c
+ *
+ * Lustre Capability Hash Management
+ *
+ * Author: Lai Siyao<lsy@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <asm/unistd.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <obd_class.h>
+#include <lustre_debug.h>
+#include <lustre/lustre_idl.h>
+
+#include <linux/list.h>
+#include <lustre_capa.h>
+
+/* Number of hash buckets initialized and walked in each capa hash table. */
+#define NR_CAPAHASH 32
+/* Server capa count above which capa_add() trims the LRU list. */
+#define CAPA_HASH_SIZE 3000	      /* for MDS & OSS */
+
+/* NOTE(review): presumably the slab cache backing obd_capa allocations; it is
+ * only defined and exported in this file -- confirm against alloc_capa(). */
+struct kmem_cache *capa_cachep = NULL;
+
+/* lock for capa hash/capa_list/fo_capa_keys */
+DEFINE_SPINLOCK(capa_lock);
+
+/* Per-site capa lists.  The server list is maintained in LRU order:
+ * capa_add() and capa_lookup() put entries at the tail, capa_delete_lru()
+ * reclaims from the head. */
+struct list_head capa_list[CAPA_SITE_MAX];
+
+/* HMAC algorithm descriptors, indexed by capa_alg() in capa_hmac(). */
+static struct capa_hmac_alg capa_hmac_algs[] = {
+	DEF_CAPA_HMAC_ALG("sha1", SHA1, 20, 20),
+};
+/* number of capas currently accounted to each site */
+int capa_count[CAPA_SITE_MAX] = { 0, };
+
+EXPORT_SYMBOL(capa_cachep);
+EXPORT_SYMBOL(capa_list);
+EXPORT_SYMBOL(capa_lock);
+EXPORT_SYMBOL(capa_count);
+
+/*
+ * Allocate one page for a capability hash table and initialize its first
+ * NR_CAPAHASH bucket heads.
+ *
+ * Returns the new table, or NULL when the allocation fails.
+ */
+struct hlist_head *init_capa_hash(void)
+{
+	struct hlist_head *hash;
+	int nr_hash;
+	int idx;
+
+	OBD_ALLOC(hash, PAGE_CACHE_SIZE);
+	if (hash == NULL)
+		return NULL;
+
+	/* the page must have room for more heads than the buckets in use */
+	nr_hash = PAGE_CACHE_SIZE / sizeof(struct hlist_head);
+	LASSERT(nr_hash > NR_CAPAHASH);
+
+	idx = 0;
+	while (idx < NR_CAPAHASH) {
+		INIT_HLIST_HEAD(&hash[idx]);
+		idx++;
+	}
+
+	return hash;
+}
+EXPORT_SYMBOL(init_capa_hash);
+
+/* Returns 1 when @ocapa is accounted to the server site, 0 otherwise. */
+static inline int capa_on_server(struct obd_capa *ocapa)
+{
+	if (ocapa->c_site != CAPA_SITE_SERVER)
+		return 0;
+	return 1;
+}
+
+/*
+ * Unlink a server capa from its hash bucket and from the site list, adjust
+ * the per-site counter and drop one reference on it.
+ */
+static inline void capa_delete(struct obd_capa *ocapa)
+{
+	LASSERT(capa_on_server(ocapa));
+
+	/* the two unlink operations are independent of each other */
+	list_del_init(&ocapa->c_list);
+	hlist_del_init(&ocapa->u.tgt.c_hash);
+	capa_count[ocapa->c_site] -= 1;
+
+	/* release the reference held since allocation */
+	capa_put(ocapa);
+}
+
+/*
+ * Delete every capa hashed in @hash (under capa_lock) and then free the page
+ * that holds the table.
+ */
+void cleanup_capa_hash(struct hlist_head *hash)
+{
+	struct hlist_head *bucket;
+	struct hlist_node *next;
+	struct obd_capa *oc;
+
+	spin_lock(&capa_lock);
+	for (bucket = hash; bucket < hash + NR_CAPAHASH; bucket++) {
+		/* _safe variant: capa_delete() unhooks the current entry */
+		hlist_for_each_entry_safe(oc, next, bucket, u.tgt.c_hash)
+			capa_delete(oc);
+	}
+	spin_unlock(&capa_lock);
+
+	OBD_FREE(hash, PAGE_CACHE_SIZE);
+}
+EXPORT_SYMBOL(cleanup_capa_hash);
+
+/* Map @fid to a bucket index in [0, NR_CAPAHASH). */
+static inline int capa_hashfn(struct lu_fid *fid)
+{
+	unsigned long product;
+
+	/* mix object id and version, then scale by (sequence + 1) */
+	product = (fid_oid(fid) ^ fid_ver(fid)) *
+		  (unsigned long)(fid_seq(fid) + 1);
+
+	return product % NR_CAPAHASH;
+}
+
+/*
+ * True once two thirds of the capa timeout before its expiry has been
+ * reached.  This renewal check fires earlier than the one on the client, so
+ * that a client does not renew a capa right after obtaining it.
+ */
+static inline int capa_is_to_expire(struct obd_capa *oc)
+{
+	cfs_time_t renew_at;
+
+	renew_at = cfs_time_sub(oc->c_expiry,
+				cfs_time_seconds(oc->c_capa.lc_timeout) * 2 / 3);
+
+	return cfs_time_before(renew_at, cfs_time_current());
+}
+
+/*
+ * Search bucket @head for a capa whose lustre_capa matches @capa.
+ *
+ * With @alive set only the bytes preceding lc_keyid are compared and entries
+ * that are about to expire are skipped; otherwise the whole structure must
+ * match.  Returns the entry found (no reference is taken here) or NULL.
+ */
+static struct obd_capa *find_capa(struct lustre_capa *capa,
+				  struct hlist_head *head, int alive)
+{
+	struct obd_capa *ocapa;
+	int len;
+
+	if (alive)
+		len = offsetof(struct lustre_capa, lc_keyid);
+	else
+		len = sizeof(*capa);
+
+	hlist_for_each_entry(ocapa, head, u.tgt.c_hash) {
+		/* a match must agree on the compared prefix and, in the
+		 * "alive" case, must not be close to expiring */
+		if (memcmp(&ocapa->c_capa, capa, len) == 0 &&
+		    !(alive && capa_is_to_expire(ocapa))) {
+			LASSERT(capa_on_server(ocapa));
+
+			DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found");
+			return ocapa;
+		}
+	}
+
+	return NULL;
+}
+
+#define LRU_CAPA_DELETE_COUNT 12
+/*
+ * Examine at most LRU_CAPA_DELETE_COUNT entries from the front of the LRU
+ * list @head and delete those whose reference count reads zero.  Entries
+ * that are still referenced are skipped but count against the limit, so
+ * fewer than LRU_CAPA_DELETE_COUNT capas may actually be freed.
+ *
+ * capa_add() calls this with capa_lock held.
+ */
+static inline void capa_delete_lru(struct list_head *head)
+{
+	struct obd_capa *ocapa;
+	struct list_head *node = head->next;
+	int count = 0;
+
+	/*
+	 * Stop as soon as the walk wraps around to @head: the list head is
+	 * not embedded in an obd_capa and must never go through list_entry().
+	 */
+	while (node != head && count++ < LRU_CAPA_DELETE_COUNT) {
+		ocapa = list_entry(node, struct obd_capa, c_list);
+		/* advance first: capa_delete() unlinks ocapa->c_list */
+		node = node->next;
+		if (atomic_read(&ocapa->c_refc))
+			continue;
+
+		DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free lru");
+		capa_delete(ocapa);
+	}
+}
+
+/*
+ * Add @capa to the server hash @hash, or reuse an identical entry that is
+ * already hashed.
+ *
+ * Returns the hashed capa with an extra reference taken for the caller, or
+ * NULL when a new capa cannot be allocated.
+ */
+struct obd_capa *capa_add(struct hlist_head *hash, struct lustre_capa *capa)
+{
+	struct hlist_head *head = &hash[capa_hashfn(&capa->lc_fid)];
+	struct list_head *list = &capa_list[CAPA_SITE_SERVER];
+	struct obd_capa *ocapa;
+	struct obd_capa *old;
+
+	/* allocate up front so that nothing is allocated under capa_lock */
+	ocapa = alloc_capa(CAPA_SITE_SERVER);
+	if (IS_ERR(ocapa))
+		return NULL;
+
+	spin_lock(&capa_lock);
+	old = find_capa(capa, head, 0);
+	if (old != NULL) {
+		/* already present: hand out the old one, drop the spare */
+		capa_get(old);
+		spin_unlock(&capa_lock);
+		capa_put(ocapa);
+		return old;
+	}
+
+	ocapa->c_capa = *capa;
+	set_capa_expiry(ocapa);
+	hlist_add_head(&ocapa->u.tgt.c_hash, head);
+	list_add_tail(&ocapa->c_list, list);
+	capa_get(ocapa);
+	capa_count[CAPA_SITE_SERVER]++;
+	/* too many capas on the server site: reclaim from the LRU head */
+	if (capa_count[CAPA_SITE_SERVER] > CAPA_HASH_SIZE)
+		capa_delete_lru(list);
+	spin_unlock(&capa_lock);
+
+	return ocapa;
+}
+EXPORT_SYMBOL(capa_add);
+
+/*
+ * Look @capa up in the server hash @hash (see find_capa() for the meaning of
+ * @alive).  A hit is moved to the tail of the server LRU list and returned
+ * with an extra reference; NULL is returned on a miss.
+ */
+struct obd_capa *capa_lookup(struct hlist_head *hash, struct lustre_capa *capa,
+			     int alive)
+{
+	struct hlist_head *bucket;
+	struct obd_capa *ocapa;
+
+	spin_lock(&capa_lock);
+	bucket = hash + capa_hashfn(&capa->lc_fid);
+	ocapa = find_capa(capa, bucket, alive);
+	if (ocapa != NULL) {
+		/* most recently used entries live at the tail */
+		list_move_tail(&ocapa->c_list, &capa_list[CAPA_SITE_SERVER]);
+		capa_get(ocapa);
+	}
+	spin_unlock(&capa_lock);
+
+	return ocapa;
+}
+EXPORT_SYMBOL(capa_lookup);
+
+/*
+ * Compute the HMAC of @capa with @key and store it in @hmac.  The bytes
+ * covered are those of @capa that precede its lc_hmac field.
+ *
+ * Returns 0 once the digest has been requested, -EFAULT when the capability
+ * does not use the SHA1 HMAC algorithm, and -ENOMEM (or the error encoded in
+ * the returned pointer) when the hash transform cannot be allocated.
+ */
+int capa_hmac(__u8 *hmac, struct lustre_capa *capa, __u8 *key)
+{
+	struct ll_crypto_hash *tfm;
+	struct capa_hmac_alg  *alg;
+	int keylen;
+	struct scatterlist sl;
+
+	if (capa_alg(capa) != CAPA_HMAC_ALG_SHA1) {
+		CERROR("unknown capability hmac algorithm!\n");
+		return -EFAULT;
+	}
+
+	alg = &capa_hmac_algs[capa_alg(capa)];
+
+	tfm = ll_crypto_alloc_hash(alg->ha_name, 0, 0);
+	/*
+	 * Accept both failure conventions, NULL and ERR_PTR(), so that an
+	 * encoded error is never dereferenced below.  The cipher allocations
+	 * in this file are checked with IS_ERR() as well.
+	 */
+	if (tfm == NULL || IS_ERR(tfm)) {
+		CERROR("crypto_alloc_tfm failed, check whether your kernel "
+		       "has crypto support!\n");
+		return tfm == NULL ? -ENOMEM : PTR_ERR(tfm);
+	}
+	keylen = alg->ha_keylen;
+
+	/* describe the part of the capa located in front of lc_hmac */
+	sg_set_page(&sl, virt_to_page(capa),
+		    offsetof(struct lustre_capa, lc_hmac),
+		    (unsigned long)(capa) % PAGE_CACHE_SIZE);
+
+	ll_crypto_hmac(tfm, key, &keylen, &sl, sl.length, hmac);
+	ll_crypto_free_hash(tfm);
+
+	return 0;
+}
+EXPORT_SYMBOL(capa_hmac);
+
+/*
+ * Run one 16-byte AES encryption over the buffers @d and @s (presumably
+ * destination and source, assuming the ll_crypto wrapper keeps the kernel's
+ * dst/src argument order -- TODO confirm).
+ *
+ * The cipher is keyed with the first "minimum key size" bytes of @key;
+ * @keylen must not be smaller than that.  Returns 0 on success, -EINVAL for
+ * a short key, or the error reported by the crypto layer.
+ */
+int capa_encrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen)
+{
+	/* passing "aes" in a variable instead of a constant string keeps gcc
+	 * 4.3.2 happy */
+	char alg[CRYPTO_MAX_ALG_NAME+1] = "aes";
+	struct blkcipher_desc desc;
+	struct scatterlist sd;
+	struct scatterlist ss;
+	struct ll_crypto_cipher *tfm;
+	unsigned int min;
+	int rc;
+	ENTRY;
+
+	tfm = ll_crypto_alloc_blkcipher(alg, 0, 0);
+	if (IS_ERR(tfm)) {
+		CERROR("failed to load transform for aes\n");
+		RETURN(PTR_ERR(tfm));
+	}
+
+	min = ll_crypto_tfm_alg_min_keysize(tfm);
+	if (keylen < min) {
+		CERROR("keylen at least %d bits for aes\n", min * 8);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	/* only the minimum key size is ever used, whatever @keylen says */
+	rc = ll_crypto_blkcipher_setkey(tfm, key, min);
+	if (rc != 0) {
+		CERROR("failed to setting key for aes\n");
+		GOTO(out, rc);
+	}
+
+	/* one 16-byte block on each side */
+	sg_set_page(&ss, virt_to_page(s), 16,
+		    (unsigned long)(s) % PAGE_CACHE_SIZE);
+	sg_set_page(&sd, virt_to_page(d), 16,
+		    (unsigned long)(d) % PAGE_CACHE_SIZE);
+
+	desc.flags = 0;
+	desc.info  = NULL;
+	desc.tfm   = tfm;
+	rc = ll_crypto_blkcipher_encrypt(&desc, &sd, &ss, 16);
+	if (rc != 0) {
+		CERROR("failed to encrypt for aes\n");
+		GOTO(out, rc);
+	}
+
+	EXIT;
+
+out:
+	ll_crypto_free_blkcipher(tfm);
+	return rc;
+}
+EXPORT_SYMBOL(capa_encrypt_id);
+
+/*
+ * Run one 16-byte AES decryption over the buffers @d and @s (presumably
+ * destination and source, assuming the ll_crypto wrapper keeps the kernel's
+ * dst/src argument order -- TODO confirm).
+ *
+ * The cipher is keyed with the first "minimum key size" bytes of @key;
+ * @keylen must not be smaller than that.  Returns 0 on success, -EINVAL for
+ * a short key, or the error reported by the crypto layer.
+ */
+int capa_decrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen)
+{
+	/* passing "aes" in a variable instead of a constant string keeps gcc
+	 * 4.3.2 happy */
+	char alg[CRYPTO_MAX_ALG_NAME+1] = "aes";
+	struct blkcipher_desc desc;
+	struct scatterlist sd;
+	struct scatterlist ss;
+	struct ll_crypto_cipher *tfm;
+	unsigned int min;
+	int rc;
+	ENTRY;
+
+	tfm = ll_crypto_alloc_blkcipher(alg, 0, 0);
+	if (IS_ERR(tfm)) {
+		CERROR("failed to load transform for aes\n");
+		RETURN(PTR_ERR(tfm));
+	}
+
+	min = ll_crypto_tfm_alg_min_keysize(tfm);
+	if (keylen < min) {
+		CERROR("keylen at least %d bits for aes\n", min * 8);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	/* only the minimum key size is ever used, whatever @keylen says */
+	rc = ll_crypto_blkcipher_setkey(tfm, key, min);
+	if (rc != 0) {
+		CERROR("failed to setting key for aes\n");
+		GOTO(out, rc);
+	}
+
+	/* one 16-byte block on each side */
+	sg_set_page(&ss, virt_to_page(s), 16,
+		    (unsigned long)(s) % PAGE_CACHE_SIZE);
+	sg_set_page(&sd, virt_to_page(d), 16,
+		    (unsigned long)(d) % PAGE_CACHE_SIZE);
+
+	desc.flags = 0;
+	desc.info  = NULL;
+	desc.tfm   = tfm;
+	rc = ll_crypto_blkcipher_decrypt(&desc, &sd, &ss, 16);
+	if (rc != 0) {
+		CERROR("failed to decrypt for aes\n");
+		GOTO(out, rc);
+	}
+
+	EXIT;
+
+out:
+	ll_crypto_free_blkcipher(tfm);
+	return rc;
+}
+EXPORT_SYMBOL(capa_decrypt_id);
+
+/*
+ * Copy the lustre_capa held by @ocapa into the buffer @capa while holding
+ * @ocapa's c_lock.  @capa must have room for a struct lustre_capa.
+ */
+void capa_cpy(void *capa, struct obd_capa *ocapa)
+{
+	struct lustre_capa *dst = capa;
+
+	spin_lock(&ocapa->c_lock);
+	*dst = ocapa->c_capa;
+	spin_unlock(&ocapa->c_lock);
+}
+EXPORT_SYMBOL(capa_cpy);
+
+/*
+ * Emit a debug message for capability @c.
+ *
+ * The caller's @fmt and variadic arguments are handed to
+ * libcfs_debug_vmsg2() together with a second, fixed format that dumps the
+ * capability: pointer, fid, opc, uid, gid, flags, alg, keyid, timeout and
+ * expiry.  @msgdata is passed through unchanged.
+ */
+void _debug_capa(struct lustre_capa *c,
+		 struct libcfs_debug_msg_data *msgdata,
+		 const char *fmt, ... )
+{
+	va_list args;
+	va_start(args, fmt);
+	/* the trailing arguments must stay in the order of the fixed format */
+	libcfs_debug_vmsg2(msgdata, fmt, args,
+			   " capability@%p fid "DFID" opc "LPX64" uid "LPU64
+			   " gid "LPU64" flags %u alg %d keyid %u timeout %u "
+			   "expiry %u\n", c, PFID(capa_fid(c)), capa_opc(c),
+			   capa_uid(c), capa_gid(c), capa_flags(c),
+			   capa_alg(c), capa_keyid(c), capa_timeout(c),
+			   capa_expiry(c));
+	va_end(args);
+}
+EXPORT_SYMBOL(_debug_capa);
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_internal.h b/drivers/staging/lustre/lustre/obdclass/cl_internal.h
new file mode 100644
index 000000000000..7eb0ad7b3644
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/cl_internal.h
@@ -0,0 +1,121 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Internal cl interfaces.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+#ifndef _CL_INTERNAL_H
+#define _CL_INTERNAL_H
+
+#define CLT_PVEC_SIZE (14)
+
+/**
+ * Possible levels of the nesting. Currently this is 2: there are "top"
+ * entities (files, extent locks), and "sub" entities (stripes and stripe
+ * locks). This is used only for debugging counters right now.
+ */
+enum clt_nesting_level {
+	/** "top" entities: files, extent locks. */
+	CNL_TOP,
+	/** "sub" entities: stripes and stripe locks. */
+	CNL_SUB,
+	/** Number of nesting levels; sizes cl_thread_info::clt_counters. */
+	CNL_NR
+};
+
+/**
+ * Counters used to check correctness of cl_lock interface usage.
+ *
+ * One instance exists per nesting level, see cl_thread_info::clt_counters.
+ */
+struct cl_thread_counters {
+	/**
+	 * Number of outstanding calls to cl_lock_mutex_get() made by the
+	 * current thread. For debugging.
+	 */
+	int	   ctc_nr_locks_locked;
+	/** List of locked locks. */
+	struct lu_ref ctc_locks_locked;
+	/** Number of outstanding holds on locks. */
+	int	   ctc_nr_held;
+	/** Number of outstanding uses on locks. */
+	int	   ctc_nr_used;
+	/**
+	 * Number of held extent locks. cl_io_unlock() asserts that this is
+	 * zero for the CNL_TOP level once an io has released its locks.
+	 */
+	int	   ctc_nr_locks_acquired;
+};
+
+/**
+ * Thread local state internal for generic cl-code.
+ *
+ * Obtained from an environment through cl_env_info().
+ */
+struct cl_thread_info {
+	/*
+	 * Common fields.
+	 */
+	struct cl_io	 clt_io;
+	struct cl_2queue     clt_queue;
+
+	/*
+	 * Fields used by cl_lock.c
+	 */
+	struct cl_lock_descr clt_descr;
+	struct cl_page_list  clt_list;
+	/**
+	 * Counters for every level of lock nesting, indexed by
+	 * enum clt_nesting_level. Used for debugging.
+	 */
+	struct cl_thread_counters clt_counters[CNL_NR];
+
+	/*
+	 * Fields used by cl_page.c
+	 */
+	/** Array of CLT_PVEC_SIZE page pointers. */
+	struct cl_page      *clt_pvec[CLT_PVEC_SIZE];
+
+	/*
+	 * Fields used by cl_io.c
+	 */
+	/**
+	 * Pointer to the topmost ongoing IO in this thread. Set by
+	 * cl_io_init() and cl_io_sub_init(), cleared by cl_io_fini().
+	 */
+	struct cl_io	*clt_current_io;
+	/**
+	 * Used for submitting a sync io.
+	 */
+	struct cl_sync_io    clt_anchor;
+	/**
+	 * Fields used by cl_lock_discard_pages().
+	 */
+	pgoff_t	      clt_next_index;
+	pgoff_t	      clt_fn_index; /* first non-overlapped index */
+};
+
+struct cl_thread_info *cl_env_info(const struct lu_env *env);
+
+#endif /* _CL_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_io.c b/drivers/staging/lustre/lustre/obdclass/cl_io.c
new file mode 100644
index 000000000000..75c9be8875e0
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/cl_io.c
@@ -0,0 +1,1753 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client IO.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <linux/list.h>
+#include <cl_object.h>
+#include "cl_internal.h"
+
+/*****************************************************************************
+ *
+ * cl_io interface.
+ *
+ */
+
+/*
+ * Iterate over the layers (slices) of \a io in list order, which the callers
+ * in this file describe as top-to-bottom.  The \a io argument is
+ * parenthesized so that any pointer expression can be passed safely.
+ */
+#define cl_io_for_each(slice, io) \
+	list_for_each_entry((slice), &(io)->ci_layers, cis_linkage)
+/* Same as cl_io_for_each(), but in reverse list order (bottom-to-top). */
+#define cl_io_for_each_reverse(slice, io)		 \
+	list_for_each_entry_reverse((slice), &(io)->ci_layers, cis_linkage)
+
+/* True iff \a type lies in the range [CIT_READ, CIT_OP_NR). */
+static inline int cl_io_type_is_valid(enum cl_io_type type)
+{
+	if (type < CIT_READ)
+		return 0;
+	return type < CIT_OP_NR;
+}
+
+/* True iff \a io has a valid type other than CIT_MISC. */
+static inline int cl_io_is_loopable(const struct cl_io *io)
+{
+	if (!cl_io_type_is_valid(io->ci_type))
+		return 0;
+	return io->ci_type != CIT_MISC;
+}
+
+/**
+ * Tells whether an IO is currently in progress in environment \a env, i.e.,
+ * whether the thread info of \a env records a current io.
+ */
+int cl_io_is_going(const struct lu_env *env)
+{
+	const struct cl_thread_info *info = cl_env_info(env);
+
+	return info->clt_current_io != NULL;
+}
+EXPORT_SYMBOL(cl_io_is_going);
+
+/**
+ * Invariant of cl_io that must hold whenever one of the exported cl_io_*()
+ * functions is entered or left.
+ */
+static int cl_io_invariant(const struct cl_io *io)
+{
+	int may_own_pages;
+
+	/*
+	 * Pages can be owned only by an ongoing io. A sub-io (one that has a
+	 * parent) may still be in CIS_LOCKED state while its top-io is
+	 * already in CIS_IO_GOING.
+	 */
+	may_own_pages = io->ci_state == CIS_IO_GOING ||
+			(io->ci_state == CIS_LOCKED && io->ci_parent != NULL);
+
+	return ergo(io->ci_owned_nr > 0, may_own_pages);
+}
+
+/**
+ * Tears down \a io: every layer is unlinked and given a chance to clean up
+ * through cl_io_operations::cio_fini(), last layer first (bottom-to-top).
+ */
+void cl_io_fini(const struct lu_env *env, struct cl_io *io)
+{
+	struct cl_io_slice    *slice;
+	struct cl_thread_info *info;
+
+	LINVRNT(cl_io_type_is_valid(io->ci_type));
+	LINVRNT(cl_io_invariant(io));
+	ENTRY;
+
+	for (;;) {
+		if (list_empty(&io->ci_layers))
+			break;
+
+		/* always take the layer at the tail of the list */
+		slice = list_entry(io->ci_layers.prev, struct cl_io_slice,
+				   cis_linkage);
+		list_del_init(&slice->cis_linkage);
+		if (slice->cis_iop->op[io->ci_type].cio_fini != NULL)
+			slice->cis_iop->op[io->ci_type].cio_fini(env, slice);
+		/*
+		 * Poison the slice so that use after free is caught. This
+		 * relies on slices being allocated within the session, so
+		 * that they may still be written after ->cio_fini().
+		 */
+		slice->cis_io = NULL;
+	}
+	io->ci_state = CIS_FINI;
+
+	/* this io is no longer the current one of the thread */
+	info = cl_env_info(env);
+	if (info->clt_current_io == io)
+		info->clt_current_io = NULL;
+
+	/* sanity check for layout change */
+	switch (io->ci_type) {
+	case CIT_READ:
+	case CIT_WRITE:
+		/* nothing to verify for plain read and write */
+		break;
+	case CIT_FAULT:
+	case CIT_FSYNC:
+		LASSERT(!io->ci_need_restart);
+		break;
+	case CIT_SETATTR:
+	case CIT_MISC:
+		/* Check ignore layout change conf */
+		LASSERT(ergo(io->ci_ignore_layout || !io->ci_verify_layout,
+				!io->ci_need_restart));
+		break;
+	default:
+		LBUG();
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_io_fini);
+
+/*
+ * Common part of cl_io_init() and cl_io_sub_init(): records the io type,
+ * resets the lock set and the layer list, then calls ->coo_io_init() of
+ * every layer of \a obj until one of them fails.
+ *
+ * Returns 0 (and moves \a io to CIS_INIT) or the first non-zero result.
+ */
+static int cl_io_init0(const struct lu_env *env, struct cl_io *io,
+		       enum cl_io_type iot, struct cl_object *obj)
+{
+	struct cl_object *scan;
+	int result = 0;
+
+	LINVRNT(io->ci_state == CIS_ZERO || io->ci_state == CIS_FINI);
+	LINVRNT(cl_io_type_is_valid(iot));
+	LINVRNT(cl_io_invariant(io));
+	ENTRY;
+
+	io->ci_type = iot;
+	INIT_LIST_HEAD(&io->ci_layers);
+	INIT_LIST_HEAD(&io->ci_lockset.cls_done);
+	INIT_LIST_HEAD(&io->ci_lockset.cls_curr);
+	INIT_LIST_HEAD(&io->ci_lockset.cls_todo);
+
+	cl_object_for_each(scan, obj) {
+		/* layers without an init method are simply skipped */
+		if (scan->co_ops->coo_io_init == NULL)
+			continue;
+		result = scan->co_ops->coo_io_init(env, scan, io);
+		if (result != 0)
+			break;
+	}
+	if (result == 0)
+		io->ci_state = CIS_INIT;
+	RETURN(result);
+}
+
+/**
+ * Initializes a sub-io through cl_io_init0(). When the thread has no current
+ * io yet, \a io becomes the current one.
+ *
+ * \pre obj != cl_object_top(obj)
+ */
+int cl_io_sub_init(const struct lu_env *env, struct cl_io *io,
+		   enum cl_io_type iot, struct cl_object *obj)
+{
+	struct cl_thread_info *info;
+	int result;
+
+	info = cl_env_info(env);
+	LASSERT(obj != cl_object_top(obj));
+
+	/* never displace an io that is already registered as current */
+	if (info->clt_current_io == NULL)
+		info->clt_current_io = io;
+
+	result = cl_io_init0(env, io, iot, obj);
+	return result;
+}
+EXPORT_SYMBOL(cl_io_sub_init);
+
+/**
+ * Initializes top-level \a io through cl_io_init0() and registers it as the
+ * current io of the calling thread.
+ *
+ * Whatever this function returns, the caller has to call cl_io_fini()
+ * afterwards.
+ *
+ * \pre obj == cl_object_top(obj)
+ * \pre cl_io_type_is_valid(iot)
+ * \post cl_io_type_is_valid(io->ci_type) && io->ci_type == iot
+ */
+int cl_io_init(const struct lu_env *env, struct cl_io *io,
+	       enum cl_io_type iot, struct cl_object *obj)
+{
+	struct cl_thread_info *info;
+	int result;
+
+	info = cl_env_info(env);
+	LASSERT(obj == cl_object_top(obj));
+	LASSERT(info->clt_current_io == NULL);
+
+	info->clt_current_io = io;
+
+	result = cl_io_init0(env, io, iot, obj);
+	return result;
+}
+EXPORT_SYMBOL(cl_io_init);
+
+/**
+ * Initializes a read or write io covering \a count bytes starting at \a pos,
+ * then completes the initialization with cl_io_init() on io->ci_obj.
+ *
+ * \pre iot == CIT_READ || iot == CIT_WRITE
+ */
+int cl_io_rw_init(const struct lu_env *env, struct cl_io *io,
+		  enum cl_io_type iot, loff_t pos, size_t count)
+{
+	struct cl_io_rw_common *rw = &io->u.ci_rw;
+
+	LINVRNT(iot == CIT_READ || iot == CIT_WRITE);
+	LINVRNT(io->ci_obj != NULL);
+	ENTRY;
+
+	/* trace the requested range before it is stored into the io */
+	LU_OBJECT_HEADER(D_VFSTRACE, env, &io->ci_obj->co_lu,
+			 "io range: %u ["LPU64", "LPU64") %u %u\n",
+			 iot, (__u64)pos, (__u64)pos + count,
+			 rw->crw_nonblock, io->u.ci_wr.wr_append);
+	rw->crw_count = count;
+	rw->crw_pos   = pos;
+	RETURN(cl_io_init(env, io, iot, io->ci_obj));
+}
+EXPORT_SYMBOL(cl_io_rw_init);
+
+/* Returns the fid of the object that lock description \a descr refers to. */
+static inline const struct lu_fid *
+cl_lock_descr_fid(const struct cl_lock_descr *descr)
+{
+	const struct cl_object *obj = descr->cld_obj;
+
+	return lu_object_fid(&obj->co_lu);
+}
+
+/*
+ * Ordering used to sort lock descriptions: by fid first and, for equal fids,
+ * by start offset.
+ */
+static int cl_lock_descr_sort(const struct cl_lock_descr *d0,
+			      const struct cl_lock_descr *d1)
+{
+	int rc;
+
+	rc = lu_fid_cmp(cl_lock_descr_fid(d0), cl_lock_descr_fid(d1));
+	if (rc != 0)
+		return rc;
+
+	return __diff_normalize(d0->cld_start, d1->cld_start);
+}
+
+/*
+ * Compare two lock descriptions by fid and then by extent.
+ *
+ * Returns the result of lu_fid_cmp() when the fids differ. For equal fids
+ * returns -1 when \a d0 ends before \a d1 starts, +1 when \a d0 starts after
+ * \a d1 ends, and 0 when the [cld_start, cld_end] extents overlap.
+ */
+static int cl_lock_descr_cmp(const struct cl_lock_descr *d0,
+			     const struct cl_lock_descr *d1)
+{
+	int ret;
+
+	ret = lu_fid_cmp(cl_lock_descr_fid(d0), cl_lock_descr_fid(d1));
+	if (ret)
+		return ret;
+	if (d0->cld_end < d1->cld_start)
+		return -1;
+	/*
+	 * Compare against the end of the OTHER extent: testing d0 against
+	 * itself is never true for a valid extent and made a lock located
+	 * entirely after \a d1 look like an overlapping one.
+	 */
+	if (d0->cld_start > d1->cld_end)
+		return 1;
+	return 0;
+}
+
+/*
+ * Widen \a d0 so that its extent also covers the extent of \a d1, and take
+ * over the mode of \a d1 when that mode is CLM_WRITE or CLM_GROUP.
+ */
+static void cl_lock_descr_merge(struct cl_lock_descr *d0,
+				const struct cl_lock_descr *d1)
+{
+	if (d1->cld_start < d0->cld_start)
+		d0->cld_start = d1->cld_start;
+	if (d1->cld_end > d0->cld_end)
+		d0->cld_end = d1->cld_end;
+
+	/* storing a mode d0 already has leaves it unchanged */
+	if (d1->cld_mode == CLM_WRITE || d1->cld_mode == CLM_GROUP)
+		d0->cld_mode = d1->cld_mode;
+}
+
+/*
+ * Sort locks in lexicographical order of their (fid, start-offset) pairs.
+ *
+ * Operates on the cls_todo list of the io's lock set, ordering neighbours
+ * with cl_lock_descr_sort(). Passes over the list are repeated until one
+ * pass completes without moving any link.
+ */
+static void cl_io_locks_sort(struct cl_io *io)
+{
+	int done = 0;
+
+	ENTRY;
+	/* hidden treasure: bubble sort for now. */
+	do {
+		struct cl_io_lock_link *curr;
+		struct cl_io_lock_link *prev;
+		struct cl_io_lock_link *temp;
+
+		/* assume this pass is the last one until a swap happens */
+		done = 1;
+		prev = NULL;
+
+		list_for_each_entry_safe(curr, temp,
+					     &io->ci_lockset.cls_todo,
+					     cill_linkage) {
+			if (prev != NULL) {
+				switch (cl_lock_descr_sort(&prev->cill_descr,
+							  &curr->cill_descr)) {
+				case 0:
+					/*
+					 * IMPOSSIBLE: Identical locks are
+					 *	     already removed at
+					 *	     this point.
+					 */
+					/* deliberately falls into LBUG() */
+				default:
+					LBUG();
+				case +1:
+					/* out of order: put curr right in
+					 * front of prev */
+					list_move_tail(&curr->cill_linkage,
+							   &prev->cill_linkage);
+					done = 0;
+					continue; /* don't change prev: it's
+						   * still "previous" */
+				case -1: /* already in order */
+					break;
+				}
+			}
+			prev = curr;
+		}
+	} while (!done);
+	EXIT;
+}
+
+/**
+ * Looks for a lock in \a queue that matches \a need, as decided by
+ * cl_lock_descr_match().
+ *
+ * \retval +ve a matching lock exists in the \a queue
+ * \retval   0 no lock in the \a queue matches
+ */
+int cl_queue_match(const struct list_head *queue,
+		   const struct cl_lock_descr *need)
+{
+	struct cl_io_lock_link *scan;
+	int found = 0;
+
+	ENTRY;
+	list_for_each_entry(scan, queue, cill_linkage) {
+		if (cl_lock_descr_match(&scan->cill_descr, need)) {
+			found = +1;
+			break;
+		}
+	}
+	RETURN(found);
+}
+EXPORT_SYMBOL(cl_queue_match);
+
+/*
+ * Merges \a need into the first lock of \a queue for which
+ * cl_lock_descr_cmp() reports 0.
+ *
+ * Returns +1 when a lock was found and widened, 0 when none qualified.
+ */
+static int cl_queue_merge(const struct list_head *queue,
+			  const struct cl_lock_descr *need)
+{
+	struct cl_io_lock_link *scan;
+
+	ENTRY;
+	list_for_each_entry(scan, queue, cill_linkage) {
+		if (cl_lock_descr_cmp(&scan->cill_descr, need) == 0) {
+			cl_lock_descr_merge(&scan->cill_descr, need);
+			CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n",
+			       scan->cill_descr.cld_mode,
+			       scan->cill_descr.cld_start,
+			       scan->cill_descr.cld_end);
+			RETURN(+1);
+		}
+	}
+	RETURN(0);
+}
+
+/*
+ * Returns 1 when the cls_curr or the cls_done queue of \a set holds a lock
+ * matching \a need, 0 otherwise.
+ */
+static int cl_lockset_match(const struct cl_lockset *set,
+			    const struct cl_lock_descr *need)
+{
+	if (cl_queue_match(&set->cls_curr, need))
+		return 1;
+	if (cl_queue_match(&set->cls_done, need))
+		return 1;
+	return 0;
+}
+
+/*
+ * Returns 1 when \a need could be merged into a lock of the cls_todo queue,
+ * or is matched by a lock of the cls_curr/cls_done queues; 0 otherwise.
+ */
+static int cl_lockset_merge(const struct cl_lockset *set,
+			    const struct cl_lock_descr *need)
+{
+	/* merging is tried first; matching only when nothing was merged */
+	if (cl_queue_merge(&set->cls_todo, need))
+		return 1;
+	if (cl_lockset_match(set, need))
+		return 1;
+	return 0;
+}
+
+/*
+ * Requests the lock described by \a link and moves the link to the cls_curr
+ * queue of \a set. Unless the CEF_ASYNC flag is set the lock is also waited
+ * for, and the link moves on to cls_done when the wait succeeds.
+ *
+ * Returns 0 on success, or the error of the request or of the wait.
+ */
+static int cl_lockset_lock_one(const struct lu_env *env,
+			       struct cl_io *io, struct cl_lockset *set,
+			       struct cl_io_lock_link *link)
+{
+	struct cl_lock *lock;
+	int	     result;
+
+	ENTRY;
+
+	lock = cl_lock_request(env, io, &link->cill_descr, "io", io);
+	if (IS_ERR(lock)) {
+		result = PTR_ERR(lock);
+		RETURN(result);
+	}
+
+	link->cill_lock = lock;
+	list_move(&link->cill_linkage, &set->cls_curr);
+
+	if (link->cill_descr.cld_enq_flags & CEF_ASYNC) {
+		/* asynchronous enqueue: the lock is waited for later */
+		result = 0;
+	} else {
+		result = cl_wait(env, lock);
+		if (result == 0)
+			list_move(&link->cill_linkage, &set->cls_done);
+	}
+	RETURN(result);
+}
+
+/*
+ * Unlinks \a link from its queue, releases the lock it holds (if any) and
+ * invokes the link's ->cill_fini() method when one is set.
+ */
+static void cl_lock_link_fini(const struct lu_env *env, struct cl_io *io,
+			      struct cl_io_lock_link *link)
+{
+	ENTRY;
+	list_del_init(&link->cill_linkage);
+
+	if (link->cill_lock != NULL) {
+		cl_lock_release(env, link->cill_lock, "io", io);
+		link->cill_lock = NULL;
+	}
+
+	/* ->cill_fini() may free the link: it must be the last access */
+	if (link->cill_fini != NULL)
+		link->cill_fini(env, link);
+	EXIT;
+}
+
+/*
+ * Acquires the locks queued in the cls_todo list of \a set.
+ *
+ * Links already covered by a lock in cls_curr/cls_done are finalized, the
+ * others are enqueued with cl_lockset_lock_one(). Afterwards every lock left
+ * in cls_curr is waited for and moved to cls_done.
+ *
+ * Returns 0 on success, or the first error met.
+ */
+static int cl_lockset_lock(const struct lu_env *env, struct cl_io *io,
+			   struct cl_lockset *set)
+{
+	struct cl_io_lock_link *link;
+	struct cl_io_lock_link *temp;
+	int result;
+
+	ENTRY;
+	list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) {
+		if (cl_lockset_match(set, &link->cill_descr)) {
+			/* redundant request: drop the link */
+			cl_lock_link_fini(env, io, link);
+			continue;
+		}
+		/* XXX some locking to guarantee that locks aren't
+		 * expanded in between. */
+		result = cl_lockset_lock_one(env, io, set, link);
+		if (result != 0)
+			RETURN(result);
+	}
+
+	list_for_each_entry_safe(link, temp, &set->cls_curr, cill_linkage) {
+		result = cl_wait(env, link->cill_lock);
+		if (result != 0)
+			RETURN(result);
+		list_move(&link->cill_linkage, &set->cls_done);
+	}
+	RETURN(0);
+}
+
+/**
+ * Takes the locks needed by the current iteration of \a io.
+ *
+ * cl_io_operations::cio_lock() is called top-to-bottom so that each layer can
+ * queue the locks it requires. The collected locks are then sorted (to avoid
+ * dead-locks) and acquired. On failure everything is undone through
+ * cl_io_unlock(); on success the io enters CIS_LOCKED state.
+ */
+int cl_io_lock(const struct lu_env *env, struct cl_io *io)
+{
+	const struct cl_io_slice *scan;
+	int result = 0;
+
+	LINVRNT(cl_io_is_loopable(io));
+	LINVRNT(io->ci_state == CIS_IT_STARTED);
+	LINVRNT(cl_io_invariant(io));
+
+	ENTRY;
+	cl_io_for_each(scan, io) {
+		if (scan->cis_iop->op[io->ci_type].cio_lock != NULL) {
+			result = scan->cis_iop->op[io->ci_type].cio_lock(env,
+									 scan);
+			if (result != 0)
+				break;
+		}
+	}
+	if (result == 0) {
+		cl_io_locks_sort(io);
+		result = cl_lockset_lock(env, io, &io->ci_lockset);
+	}
+
+	if (result == 0)
+		io->ci_state = CIS_LOCKED;
+	else
+		cl_io_unlock(env, io);
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_lock);
+
+/**
+ * Releases the locks taken by \a io.
+ *
+ * All three queues of the lock set are drained (locks in cls_done are
+ * unused first), then cl_io_operations::cio_unlock() is called
+ * bottom-to-top and the io enters CIS_UNLOCKED state.
+ */
+void cl_io_unlock(const struct lu_env *env, struct cl_io *io)
+{
+	struct cl_lockset	*set = &io->ci_lockset;
+	struct cl_io_lock_link   *link;
+	struct cl_io_lock_link   *temp;
+	const struct cl_io_slice *scan;
+
+	LASSERT(cl_io_is_loopable(io));
+	LASSERT(CIS_IT_STARTED <= io->ci_state && io->ci_state < CIS_UNLOCKED);
+	LINVRNT(cl_io_invariant(io));
+
+	ENTRY;
+
+	/* locks that were never requested */
+	list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) {
+		cl_lock_link_fini(env, io, link);
+	}
+
+	/* locks that were requested, but not waited for successfully */
+	list_for_each_entry_safe(link, temp, &set->cls_curr, cill_linkage) {
+		cl_lock_link_fini(env, io, link);
+	}
+
+	/* locks that were fully acquired */
+	list_for_each_entry_safe(link, temp, &set->cls_done, cill_linkage) {
+		cl_unuse(env, link->cill_lock);
+		cl_lock_link_fini(env, io, link);
+	}
+
+	cl_io_for_each_reverse(scan, io) {
+		if (scan->cis_iop->op[io->ci_type].cio_unlock == NULL)
+			continue;
+		scan->cis_iop->op[io->ci_type].cio_unlock(env, scan);
+	}
+	io->ci_state = CIS_UNLOCKED;
+	LASSERT(!cl_env_info(env)->clt_counters[CNL_TOP].ctc_nr_locks_acquired);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_io_unlock);
+
+/**
+ * Prepares the next iteration of \a io.
+ *
+ * cl_io_operations::cio_iter_init() is called top-to-bottom, which gives
+ * layers a chance to modify io parameters, e.g., so that lov can restrict io
+ * to a single stripe. The io enters CIS_IT_STARTED state when every layer
+ * succeeds; otherwise the first non-zero result is returned.
+ */
+int cl_io_iter_init(const struct lu_env *env, struct cl_io *io)
+{
+	const struct cl_io_slice *scan;
+	int result = 0;
+
+	LINVRNT(cl_io_is_loopable(io));
+	LINVRNT(io->ci_state == CIS_INIT || io->ci_state == CIS_IT_ENDED);
+	LINVRNT(cl_io_invariant(io));
+
+	ENTRY;
+	cl_io_for_each(scan, io) {
+		if (scan->cis_iop->op[io->ci_type].cio_iter_init != NULL) {
+			result = scan->cis_iop->op[io->ci_type].cio_iter_init(
+								env, scan);
+			if (result != 0)
+				break;
+		}
+	}
+	if (result == 0)
+		io->ci_state = CIS_IT_STARTED;
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_iter_init);
+
+/**
+ * Ends the current iteration of \a io.
+ *
+ * cl_io_operations::cio_iter_fini() is called bottom-to-top, after which the
+ * io enters CIS_IT_ENDED state.
+ */
+void cl_io_iter_fini(const struct lu_env *env, struct cl_io *io)
+{
+	const struct cl_io_slice *scan;
+
+	LINVRNT(cl_io_is_loopable(io));
+	LINVRNT(io->ci_state == CIS_UNLOCKED);
+	LINVRNT(cl_io_invariant(io));
+
+	ENTRY;
+	cl_io_for_each_reverse(scan, io) {
+		if (scan->cis_iop->op[io->ci_type].cio_iter_fini == NULL)
+			continue;
+		scan->cis_iop->op[io->ci_type].cio_iter_fini(env, scan);
+	}
+	io->ci_state = CIS_IT_ENDED;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_io_iter_fini);
+
+/**
+ * Records that read or write \a io moved \a nob bytes forward: the position
+ * grows and the remaining count shrinks by \a nob, then the layers are told
+ * through cl_io_operations::cio_advance(), bottom-to-top.
+ */
+void cl_io_rw_advance(const struct lu_env *env, struct cl_io *io, size_t nob)
+{
+	struct cl_io_rw_common *rw = &io->u.ci_rw;
+	const struct cl_io_slice *scan;
+
+	LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE ||
+		nob == 0);
+	LINVRNT(cl_io_is_loopable(io));
+	LINVRNT(cl_io_invariant(io));
+
+	ENTRY;
+
+	rw->crw_count -= nob;
+	rw->crw_pos   += nob;
+
+	/* layers have to be notified. */
+	cl_io_for_each_reverse(scan, io) {
+		if (scan->cis_iop->op[io->ci_type].cio_advance == NULL)
+			continue;
+		scan->cis_iop->op[io->ci_type].cio_advance(env, scan, nob);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_io_rw_advance);
+
+/**
+ * Adds the lock described by \a link to the lock set of \a io.
+ *
+ * \retval +1 the request was merged into, or is matched by, a lock already
+ *            in the set; \a link was not queued
+ * \retval  0 \a link was queued in the cls_todo list
+ */
+int cl_io_lock_add(const struct lu_env *env, struct cl_io *io,
+		   struct cl_io_lock_link *link)
+{
+	ENTRY;
+	if (cl_lockset_merge(&io->ci_lockset, &link->cill_descr))
+		RETURN(+1);
+
+	list_add(&link->cill_linkage, &io->ci_lockset.cls_todo);
+	RETURN(0);
+}
+EXPORT_SYMBOL(cl_io_lock_add);
+
+/*
+ * ->cill_fini() method installed by cl_io_lock_alloc_add(): frees \a link.
+ * \a env is unused.
+ */
+static void cl_free_io_lock_link(const struct lu_env *env,
+				 struct cl_io_lock_link *link)
+{
+	OBD_FREE_PTR(link);
+}
+
+/**
+ * Allocates a lock link for \a descr and adds it to the lock set of \a io.
+ *
+ * The link frees itself through its ->cill_fini() method. When the request
+ * is already covered by the lock set (cl_io_lock_add() returned non-zero)
+ * the link is released right away.
+ *
+ * Returns the result of cl_io_lock_add(), or -ENOMEM.
+ */
+int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io,
+			 struct cl_lock_descr *descr)
+{
+	struct cl_io_lock_link *link;
+	int result;
+
+	ENTRY;
+	OBD_ALLOC_PTR(link);
+	if (link == NULL) {
+		result = -ENOMEM;
+		RETURN(result);
+	}
+
+	link->cill_fini  = cl_free_io_lock_link;
+	link->cill_descr = *descr;
+
+	result = cl_io_lock_add(env, io, link);
+	if (result != 0) /* lock match */
+		link->cill_fini(env, link);
+
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_lock_alloc_add);
+
+/**
+ * Starts \a io: the state becomes CIS_IO_GOING and
+ * cl_io_operations::cio_start() is called top-to-bottom until a layer
+ * returns non-zero.
+ *
+ * Negative results are returned as is; positive ones are reported as 0.
+ */
+int cl_io_start(const struct lu_env *env, struct cl_io *io)
+{
+	const struct cl_io_slice *scan;
+	int result = 0;
+
+	LINVRNT(cl_io_is_loopable(io));
+	LINVRNT(io->ci_state == CIS_LOCKED);
+	LINVRNT(cl_io_invariant(io));
+	ENTRY;
+
+	io->ci_state = CIS_IO_GOING;
+	cl_io_for_each(scan, io) {
+		if (scan->cis_iop->op[io->ci_type].cio_start != NULL) {
+			result = scan->cis_iop->op[io->ci_type].cio_start(env,
+									  scan);
+			if (result != 0)
+				break;
+		}
+	}
+	/* a positive value stops the loop, but is not an error */
+	if (result > 0)
+		result = 0;
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_start);
+
+/**
+ * Wait until current io iteration is finished by calling
+ * cl_io_operations::cio_end() bottom-to-top.
+ *
+ * Layers without a cio_end() method are skipped. Once every layer was
+ * visited the io is moved to CIS_IO_FINISHED.
+ */
+void cl_io_end(const struct lu_env *env, struct cl_io *io)
+{
+	const struct cl_io_slice *scan;
+
+	LINVRNT(cl_io_is_loopable(io));
+	LINVRNT(io->ci_state == CIS_IO_GOING);
+	LINVRNT(cl_io_invariant(io));
+	ENTRY;
+
+	cl_io_for_each_reverse(scan, io) {
+		if (scan->cis_iop->op[io->ci_type].cio_end == NULL)
+			continue;
+		/* TODO: error handling. */
+		scan->cis_iop->op[io->ci_type].cio_end(env, scan);
+	}
+	io->ci_state = CIS_IO_FINISHED;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_io_end);
+
+/**
+ * Returns the slice of \a page that belongs to the same layer as \a ios.
+ *
+ * The layer is identified by the device type of the object \a ios is
+ * attached to; the lookup itself is done by cl_page_at(). The result is
+ * checked with LINVRNT() to be non-NULL.
+ */
+static const struct cl_page_slice *
+cl_io_slice_page(const struct cl_io_slice *ios, struct cl_page *page)
+{
+	const struct cl_page_slice *slice;
+
+	slice = cl_page_at(page, ios->cis_obj->co_lu.lo_dev->ld_type);
+	LINVRNT(slice != NULL);
+	return slice;
+}
+
+/**
+ * True iff \a page is within \a io range.
+ *
+ * - CIT_READ / CIT_WRITE: an append io accepts every page; otherwise the
+ *   page extent [start, end) has to overlap [pos, pos + count).
+ * - CIT_FAULT: the page index has to equal the fault index.
+ * - any other io type is a bug (LBUG()).
+ */
+static int cl_page_in_io(const struct cl_page *page, const struct cl_io *io)
+{
+	const struct cl_io_rw_common *rw;
+	pgoff_t index = page->cp_index;
+	loff_t  first;
+	loff_t  last;
+
+	switch (io->ci_type) {
+	case CIT_READ:
+	case CIT_WRITE:
+		if (cl_io_is_append(io))
+			return 1;
+		/*
+		 * check that [first, last) and [pos, pos + count) extents
+		 * overlap.
+		 */
+		rw    = &io->u.ci_rw;
+		first = cl_offset(page->cp_obj, index);
+		last  = cl_offset(page->cp_obj, index + 1);
+		return rw->crw_pos < last &&
+		       first < rw->crw_pos + rw->crw_count;
+	case CIT_FAULT:
+		return io->u.ci_fault.ft_index == index;
+	default:
+		LBUG();
+	}
+	return 1;
+}
+
+/**
+ * Called by read io, when page has to be read from the server.
+ *
+ * The cio_read_page() methods of the layers are called top-to-bottom; if
+ * none of them returns non-zero, io->ci_queue is submitted for reading
+ * with cl_io_submit_rw(). Before returning, pages still sitting in the
+ * incoming list are disowned and the queue is finalized.
+ *
+ * \see cl_io_operations::cio_read_page()
+ */
+int cl_io_read_page(const struct lu_env *env, struct cl_io *io,
+		    struct cl_page *page)
+{
+	const struct cl_io_slice *scan;
+	struct cl_2queue *queue = &io->ci_queue;
+	int rc = 0;
+
+	LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_FAULT);
+	LINVRNT(cl_page_is_owned(page, io));
+	LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED);
+	LINVRNT(cl_page_in_io(page, io));
+	LINVRNT(cl_io_invariant(io));
+	ENTRY;
+
+	cl_2queue_init(queue);
+	/*
+	 * The ->cio_read_page() methods invoked below are expected never to
+	 * block waiting for the network. The one subtle case is creation of
+	 * read-ahead pages, which may shrink the cache; currently only clean
+	 * pages are shrunk, and that needs no network io.
+	 *
+	 * If this ever starts blocking, a retry loop would be needed for
+	 * "parallel io" (see CLO_REPEAT loops in cl_lock.c).
+	 */
+	cl_io_for_each(scan, io) {
+		const struct cl_page_slice *slice;
+
+		if (scan->cis_iop->cio_read_page == NULL)
+			continue;
+
+		slice = cl_io_slice_page(scan, page);
+		LINVRNT(slice != NULL);
+		rc = scan->cis_iop->cio_read_page(env, scan, slice);
+		if (rc != 0)
+			break;
+	}
+	if (rc == 0)
+		rc = cl_io_submit_rw(env, io, CRT_READ, queue);
+
+	/* Unlock unsent pages in case of error. */
+	cl_page_list_disown(env, io, &queue->c2_qin);
+	cl_2queue_fini(env, queue);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_io_read_page);
+
+/**
+ * Called by write io to prepare page to receive data from user buffer.
+ *
+ * Layers are walked with cl_io_for_each_reverse(); the first non-zero
+ * value returned by a layer's cio_prepare_write() ends the walk and is
+ * returned to the caller.
+ *
+ * \see cl_io_operations::cio_prepare_write()
+ */
+int cl_io_prepare_write(const struct lu_env *env, struct cl_io *io,
+			struct cl_page *page, unsigned from, unsigned to)
+{
+	const struct cl_io_slice *scan;
+	int rc = 0;
+
+	LINVRNT(io->ci_type == CIT_WRITE);
+	LINVRNT(cl_page_is_owned(page, io));
+	LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED);
+	LINVRNT(cl_io_invariant(io));
+	LASSERT(cl_page_in_io(page, io));
+	ENTRY;
+
+	cl_io_for_each_reverse(scan, io) {
+		const struct cl_page_slice *slice;
+
+		if (scan->cis_iop->cio_prepare_write == NULL)
+			continue;
+
+		slice = cl_io_slice_page(scan, page);
+		rc = scan->cis_iop->cio_prepare_write(env, scan, slice,
+						      from, to);
+		if (rc != 0)
+			break;
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_io_prepare_write);
+
+/**
+ * Called by write io after user data were copied into a page.
+ *
+ * Layers are walked with cl_io_for_each(); the first non-zero value
+ * returned by a layer's cio_commit_write() ends the walk. The final result
+ * is checked (LINVRNT) to be zero or negative.
+ *
+ * \see cl_io_operations::cio_commit_write()
+ */
+int cl_io_commit_write(const struct lu_env *env, struct cl_io *io,
+		       struct cl_page *page, unsigned from, unsigned to)
+{
+	const struct cl_io_slice *scan;
+	int rc = 0;
+
+	LINVRNT(io->ci_type == CIT_WRITE);
+	LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED);
+	LINVRNT(cl_io_invariant(io));
+	/*
+	 * XXX Uh... not nice. Top level cl_io_commit_write() call (vvp->lov)
+	 * already called cl_page_cache_add(), moving page into CPS_CACHED
+	 * state. Better (and more general) way of dealing with such situation
+	 * is needed.
+	 */
+	LASSERT(cl_page_is_owned(page, io) || page->cp_parent != NULL);
+	LASSERT(cl_page_in_io(page, io));
+	ENTRY;
+
+	cl_io_for_each(scan, io) {
+		const struct cl_page_slice *slice;
+
+		if (scan->cis_iop->cio_commit_write == NULL)
+			continue;
+
+		slice = cl_io_slice_page(scan, page);
+		rc = scan->cis_iop->cio_commit_write(env, scan, slice,
+						     from, to);
+		if (rc != 0)
+			break;
+	}
+	LINVRNT(rc <= 0);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_io_commit_write);
+
+/**
+ * Submits a list of pages for immediate io.
+ *
+ * After the function returns, the submitted pages have been moved to
+ * queue->c2_qout, while queue->c2_qin holds both the pages that did not
+ * need to be submitted and the pages that failed to be submitted.
+ *
+ * \returns 0 unless the cio_submit() method of some layer failed, in which
+ * case its error code is returned and queue->c2_qout is asserted to be
+ * empty.
+ * \see cl_io_operations::cio_submit()
+ */
+int cl_io_submit_rw(const struct lu_env *env, struct cl_io *io,
+		    enum cl_req_type crt, struct cl_2queue *queue)
+{
+	const struct cl_io_slice *scan;
+	int rc = 0;
+
+	LINVRNT(crt < ARRAY_SIZE(scan->cis_iop->req_op));
+	ENTRY;
+
+	cl_io_for_each(scan, io) {
+		if (scan->cis_iop->req_op[crt].cio_submit != NULL) {
+			rc = scan->cis_iop->req_op[crt].cio_submit(env, scan,
+								   crt, queue);
+			if (rc != 0)
+				break;
+		}
+	}
+	/* If ->cio_submit() failed, no pages were sent. */
+	LASSERT(ergo(rc != 0, list_empty(&queue->c2_qout.pl_pages)));
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_io_submit_rw);
+
+/**
+ * Submit a sync_io and wait for the IO to be finished, or error happens.
+ * If \a timeout is zero, it means to wait for the IO unconditionally.
+ *
+ * Every page of queue->c2_qin is tied to the per-environment anchor before
+ * submission. If submission fails the pages are detached from the anchor
+ * again and the error is returned without waiting.
+ */
+int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io,
+		      enum cl_req_type iot, struct cl_2queue *queue,
+		      long timeout)
+{
+	struct cl_sync_io *anchor = &cl_env_info(env)->clt_anchor;
+	struct cl_page *pg;
+	int rc;
+
+	cl_page_list_for_each(pg, &queue->c2_qin) {
+		LASSERT(pg->cp_sync_io == NULL);
+		pg->cp_sync_io = anchor;
+	}
+
+	cl_sync_io_init(anchor, queue->c2_qin.pl_nr);
+	rc = cl_io_submit_rw(env, io, iot, queue);
+	if (rc != 0) {
+		/* Nothing was sent: just undo the anchoring. */
+		LASSERT(list_empty(&queue->c2_qout.pl_pages));
+		cl_page_list_for_each(pg, &queue->c2_qin)
+			pg->cp_sync_io = NULL;
+		return rc;
+	}
+
+	/*
+	 * Pages that were not sent for whatever reason (e.g., read found
+	 * up-to-date pages in the cache, or write found clean pages) are
+	 * counted as completed, otherwise the wait below would never end.
+	 */
+	cl_page_list_for_each(pg, &queue->c2_qin) {
+		pg->cp_sync_io = NULL;
+		cl_sync_io_note(anchor, +1);
+	}
+
+	/* wait for the IO to be finished. */
+	return cl_sync_io_wait(env, io, &queue->c2_qout, anchor, timeout);
+}
+EXPORT_SYMBOL(cl_io_submit_sync);
+
+/**
+ * Cancel an IO which has been submitted by cl_io_submit_rw.
+ *
+ * cl_page_cancel() is called for every page of \a queue; a failure on one
+ * page does not stop the walk.
+ *
+ * \retval 0     every cl_page_cancel() call returned 0 (or \a queue is empty)
+ * \retval other the first non-zero value returned by cl_page_cancel()
+ */
+int cl_io_cancel(const struct lu_env *env, struct cl_io *io,
+		 struct cl_page_list *queue)
+{
+	struct cl_page *page;
+	int result = 0;
+
+	CERROR("Canceling ongoing page transmission\n");
+	cl_page_list_for_each(page, queue) {
+		int rc;
+
+		LINVRNT(cl_page_in_io(page, io));
+		rc = cl_page_cancel(env, page);
+		/* remember only the first failure */
+		if (result == 0)
+			result = rc;
+	}
+	return result;
+}
+EXPORT_SYMBOL(cl_io_cancel);
+
+/**
+ * Main io loop.
+ *
+ * Pumps io through iterations calling
+ *
+ *    - cl_io_iter_init()
+ *
+ *    - cl_io_lock()
+ *
+ *    - cl_io_start()
+ *
+ *    - cl_io_end()
+ *
+ *    - cl_io_unlock()
+ *
+ *    - cl_io_iter_fini()
+ *
+ * repeatedly until there is no more io to do, that is, until an iteration
+ * fails or leaves cl_io::ci_continue cleared.
+ *
+ * \retval 0   no iteration failed and cl_io::ci_result is not negative
+ * \retval -ve error from an iteration, or a negative cl_io::ci_result
+ */
+int cl_io_loop(const struct lu_env *env, struct cl_io *io)
+{
+	int result   = 0;
+
+	LINVRNT(cl_io_is_loopable(io));
+	ENTRY;
+
+	do {
+		size_t nob;
+
+		/* layers set ci_continue again if another pass is needed */
+		io->ci_continue = 0;
+		result = cl_io_iter_init(env, io);
+		if (result == 0) {
+			/* remember ci_nob as it was before this iteration */
+			nob    = io->ci_nob;
+			result = cl_io_lock(env, io);
+			if (result == 0) {
+				/*
+				 * Notify layers that locks has been taken,
+				 * and do actual i/o.
+				 *
+				 *   - llite: kms, short read;
+				 *   - llite: generic_file_read();
+				 */
+				result = cl_io_start(env, io);
+				/*
+				 * Send any remaining pending
+				 * io, etc.
+				 *
+				 *   - llite: ll_rw_stats_tally.
+				 *
+				 * Note that end/unlock/advance run even when
+				 * cl_io_start() failed.
+				 */
+				cl_io_end(env, io);
+				cl_io_unlock(env, io);
+				/* advance by how much ci_nob grew in this pass */
+				cl_io_rw_advance(env, io, io->ci_nob - nob);
+			}
+		}
+		/* always paired with cl_io_iter_init(), even on failure */
+		cl_io_iter_fini(env, io);
+	} while (result == 0 && io->ci_continue);
+	if (result == 0)
+		result = io->ci_result;
+	/* positive values are not propagated to the caller */
+	RETURN(result < 0 ? result : 0);
+}
+EXPORT_SYMBOL(cl_io_loop);
+
+/**
+ * Adds io slice to the cl_io.
+ *
+ * This is called by cl_object_operations::coo_io_init() methods to add a
+ * per-layer state to the io. New state is added at the end of
+ * cl_io::ci_layers list, that is, it is at the bottom of the stack.
+ *
+ * The slice must not be linked anywhere yet: its linkage is asserted to be
+ * either all-NULL or an empty list head.
+ *
+ * \see cl_lock_slice_add(), cl_req_slice_add(), cl_page_slice_add()
+ */
+void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice,
+		     struct cl_object *obj,
+		     const struct cl_io_operations *ops)
+{
+	LASSERT((slice->cis_linkage.prev == NULL &&
+		 slice->cis_linkage.next == NULL) ||
+		list_empty(&slice->cis_linkage));
+	ENTRY;
+
+	slice->cis_io  = io;
+	slice->cis_obj = obj;
+	slice->cis_iop = ops;
+	list_add_tail(&slice->cis_linkage, &io->ci_layers);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_io_slice_add);
+
+
+/**
+ * Initializes page list.
+ *
+ * The list starts empty (pl_nr == 0) and records the calling task
+ * ("current") as its owner; the other cl_page_list_*() functions check
+ * that ownership with LINVRNT().
+ */
+void cl_page_list_init(struct cl_page_list *plist)
+{
+	ENTRY;
+	plist->pl_nr = 0;
+	INIT_LIST_HEAD(&plist->pl_pages);
+	plist->pl_owner = current;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_init);
+
+/**
+ * Adds a page to a page list.
+ *
+ * The page has to be owned by some io and must not be on another list
+ * (cp_batch empty). Its cp_mutex is taken here and is still held on return;
+ * cl_page_list_del() and cl_page_list_disown() release it when the page
+ * leaves the list. A page reference and a "queue" lu_ref are acquired too.
+ */
+void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page)
+{
+	ENTRY;
+	/* it would be better to check that page is owned by "current" io, but
+	 * it is not passed here. */
+	LASSERT(page->cp_owner != NULL);
+	LINVRNT(plist->pl_owner == current);
+
+	/* lockdep checking is switched off around this acquisition */
+	lockdep_off();
+	mutex_lock(&page->cp_mutex);
+	lockdep_on();
+	LASSERT(list_empty(&page->cp_batch));
+	list_add_tail(&page->cp_batch, &plist->pl_pages);
+	++plist->pl_nr;
+	/* keep the link so that it can be handed to lu_ref_del_at() later */
+	page->cp_queue_ref = lu_ref_add(&page->cp_reference, "queue", plist);
+	cl_page_get(page);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_add);
+
+/**
+ * Removes a page from a page list.
+ *
+ * Undoes cl_page_list_add(): the page is unlinked, its cp_mutex is
+ * released, and the "queue" lu_ref and the page reference are dropped.
+ */
+void cl_page_list_del(const struct lu_env *env,
+		      struct cl_page_list *plist, struct cl_page *page)
+{
+	LASSERT(plist->pl_nr > 0);
+	LINVRNT(plist->pl_owner == current);
+
+	ENTRY;
+	list_del_init(&page->cp_batch);
+	/* lockdep checking is switched off around the unlock */
+	lockdep_off();
+	mutex_unlock(&page->cp_mutex);
+	lockdep_on();
+	--plist->pl_nr;
+	lu_ref_del_at(&page->cp_reference, page->cp_queue_ref, "queue", plist);
+	cl_page_put(env, page);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_del);
+
+/**
+ * Moves a page from one page list to another.
+ *
+ * The page is appended to the tail of \a dst, the counters of both lists
+ * are adjusted, and the "queue" lu_ref is re-targeted from \a src to
+ * \a dst. Neither cp_mutex nor the page reference is touched here.
+ */
+void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src,
+		       struct cl_page *page)
+{
+	LASSERT(src->pl_nr > 0);
+	LINVRNT(dst->pl_owner == current);
+	LINVRNT(src->pl_owner == current);
+
+	ENTRY;
+	list_move_tail(&page->cp_batch, &dst->pl_pages);
+	--src->pl_nr;
+	++dst->pl_nr;
+	lu_ref_set_at(&page->cp_reference,
+		      page->cp_queue_ref, "queue", src, dst);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_move);
+
+/**
+ * Splices the cl_page_list \a list into \a head, just as list head does.
+ *
+ * Every page of \a list is moved, one at a time and in order, to the tail
+ * of \a head with cl_page_list_move().
+ */
+void cl_page_list_splice(struct cl_page_list *list, struct cl_page_list *head)
+{
+	struct cl_page *cur;
+	struct cl_page *next;
+
+	LINVRNT(list->pl_owner == current);
+	LINVRNT(head->pl_owner == current);
+
+	ENTRY;
+	cl_page_list_for_each_safe(cur, next, list) {
+		cl_page_list_move(head, list, cur);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_splice);
+
+void cl_page_disown0(const struct lu_env *env,
+		     struct cl_io *io, struct cl_page *pg);
+
+/**
+ * Disowns pages in a queue.
+ *
+ * Every page is taken off \a plist (dropping the cp_mutex acquired by
+ * cl_page_list_add()), disowned from \a io with cl_page_disown0(), and
+ * then has its "queue" lu_ref and page reference released. On return the
+ * list is empty.
+ */
+void cl_page_list_disown(const struct lu_env *env,
+			 struct cl_io *io, struct cl_page_list *plist)
+{
+	struct cl_page *page;
+	struct cl_page *temp;
+
+	LINVRNT(plist->pl_owner == current);
+
+	ENTRY;
+	cl_page_list_for_each_safe(page, temp, plist) {
+		LASSERT(plist->pl_nr > 0);
+
+		list_del_init(&page->cp_batch);
+		lockdep_off();
+		mutex_unlock(&page->cp_mutex);
+		lockdep_on();
+		--plist->pl_nr;
+		/*
+		 * cl_page_disown0 rather than usual cl_page_disown() is used,
+		 * because pages are possibly in CPS_FREEING state already due
+		 * to the call to cl_page_list_discard().
+		 */
+		/*
+		 * XXX cl_page_disown0() will fail if page is not locked.
+		 */
+		cl_page_disown0(env, io, page);
+		/*
+		 * Drop the reference through the link saved in cp_queue_ref,
+		 * the same way cl_page_list_del() does, instead of looking it
+		 * up by scope name.
+		 */
+		lu_ref_del_at(&page->cp_reference, page->cp_queue_ref, "queue",
+			      plist);
+		cl_page_put(env, page);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_disown);
+
+/**
+ * Releases pages from queue.
+ *
+ * Each page is removed with cl_page_list_del(); afterwards the list is
+ * asserted to be empty.
+ */
+void cl_page_list_fini(const struct lu_env *env, struct cl_page_list *plist)
+{
+	struct cl_page *cur;
+	struct cl_page *next;
+
+	LINVRNT(plist->pl_owner == current);
+
+	ENTRY;
+	cl_page_list_for_each_safe(cur, next, plist) {
+		cl_page_list_del(env, plist, cur);
+	}
+	LASSERT(plist->pl_nr == 0);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_fini);
+
+/**
+ * Owns all pages in a queue.
+ *
+ * Pages are expected in non-decreasing cp_index order (asserted). A page
+ * that cl_page_own() fails to own is dropped from \a plist.
+ *
+ * \returns the first non-zero cp_error found among the pages that were
+ * owned successfully, 0 if there is none.
+ */
+int cl_page_list_own(const struct lu_env *env,
+		     struct cl_io *io, struct cl_page_list *plist)
+{
+	struct cl_page *page;
+	struct cl_page *next;
+	pgoff_t last_index = 0;
+	int result = 0;
+
+	LINVRNT(plist->pl_owner == current);
+
+	ENTRY;
+	cl_page_list_for_each_safe(page, next, plist) {
+		LASSERT(last_index <= page->cp_index);
+		last_index = page->cp_index;
+
+		if (cl_page_own(env, io, page) != 0) {
+			cl_page_list_del(env, plist, page);
+			continue;
+		}
+		if (result == 0)
+			result = page->cp_error;
+	}
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_list_own);
+
+/**
+ * Assumes all pages in a queue.
+ *
+ * Calls cl_page_assume() on behalf of \a io for each page of \a plist; the
+ * list itself is not modified.
+ */
+void cl_page_list_assume(const struct lu_env *env,
+			 struct cl_io *io, struct cl_page_list *plist)
+{
+	struct cl_page *page;
+
+	LINVRNT(plist->pl_owner == current);
+
+	cl_page_list_for_each(page, plist)
+		cl_page_assume(env, io, page);
+}
+EXPORT_SYMBOL(cl_page_list_assume);
+
+/**
+ * Discards all pages in a queue.
+ *
+ * Calls cl_page_discard() on behalf of \a io for each page of \a plist; the
+ * pages stay on the list.
+ */
+void cl_page_list_discard(const struct lu_env *env, struct cl_io *io,
+			  struct cl_page_list *plist)
+{
+	struct cl_page *page;
+
+	LINVRNT(plist->pl_owner == current);
+	ENTRY;
+	cl_page_list_for_each(page, plist)
+		cl_page_discard(env, io, page);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_discard);
+
+/**
+ * Unmaps all pages in a queue from user virtual memory.
+ *
+ * Stops at the first page for which cl_page_unmap() returns non-zero.
+ *
+ * \returns 0 if every page was unmapped (or the list is empty), otherwise
+ * the value returned by the failing cl_page_unmap() call.
+ */
+int cl_page_list_unmap(const struct lu_env *env, struct cl_io *io,
+			struct cl_page_list *plist)
+{
+	struct cl_page *page;
+	int rc = 0;
+
+	LINVRNT(plist->pl_owner == current);
+	ENTRY;
+	cl_page_list_for_each(page, plist) {
+		rc = cl_page_unmap(env, io, page);
+		if (rc != 0)
+			RETURN(rc);
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_page_list_unmap);
+
+/**
+ * Initialize dual page queue.
+ *
+ * Both the incoming (c2_qin) and the outgoing (c2_qout) page lists are
+ * initialized with cl_page_list_init().
+ */
+void cl_2queue_init(struct cl_2queue *queue)
+{
+	ENTRY;
+	cl_page_list_init(&queue->c2_qin);
+	cl_page_list_init(&queue->c2_qout);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_init);
+
+/**
+ * Add a page to the incoming page list of 2-queue.
+ *
+ * Thin wrapper around cl_page_list_add() on queue->c2_qin.
+ */
+void cl_2queue_add(struct cl_2queue *queue, struct cl_page *page)
+{
+	ENTRY;
+	cl_page_list_add(&queue->c2_qin, page);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_add);
+
+/**
+ * Disown pages in both lists of a 2-queue.
+ *
+ * The incoming list is processed first, then the outgoing one; see
+ * cl_page_list_disown().
+ */
+void cl_2queue_disown(const struct lu_env *env,
+		      struct cl_io *io, struct cl_2queue *queue)
+{
+	ENTRY;
+	cl_page_list_disown(env, io, &queue->c2_qin);
+	cl_page_list_disown(env, io, &queue->c2_qout);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_disown);
+
+/**
+ * Discard (truncate) pages in both lists of a 2-queue.
+ *
+ * The incoming list is processed first, then the outgoing one; see
+ * cl_page_list_discard().
+ */
+void cl_2queue_discard(const struct lu_env *env,
+		       struct cl_io *io, struct cl_2queue *queue)
+{
+	ENTRY;
+	cl_page_list_discard(env, io, &queue->c2_qin);
+	cl_page_list_discard(env, io, &queue->c2_qout);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_discard);
+
+/**
+ * Assume to own the pages in cl_2queue
+ *
+ * The incoming list is processed first, then the outgoing one; see
+ * cl_page_list_assume().
+ */
+void cl_2queue_assume(const struct lu_env *env,
+		      struct cl_io *io, struct cl_2queue *queue)
+{
+	cl_page_list_assume(env, io, &queue->c2_qin);
+	cl_page_list_assume(env, io, &queue->c2_qout);
+}
+EXPORT_SYMBOL(cl_2queue_assume);
+
+/**
+ * Finalize both page lists of a 2-queue.
+ *
+ * Unlike the other cl_2queue_*() helpers, the outgoing list is processed
+ * before the incoming one; see cl_page_list_fini().
+ */
+void cl_2queue_fini(const struct lu_env *env, struct cl_2queue *queue)
+{
+	ENTRY;
+	cl_page_list_fini(env, &queue->c2_qout);
+	cl_page_list_fini(env, &queue->c2_qin);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_fini);
+
+/**
+ * Initialize a 2-queue to contain \a page in its incoming page list.
+ *
+ * Equivalent to cl_2queue_init() followed by cl_2queue_add().
+ */
+void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page)
+{
+	ENTRY;
+	cl_2queue_init(queue);
+	cl_2queue_add(queue, page);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_init_page);
+
+/**
+ * Returns top-level io.
+ *
+ * Follows the cl_io::ci_parent chain starting at \a io until an io without
+ * a parent is reached; \a io itself is returned when it has no parent.
+ *
+ * \see cl_object_top(), cl_page_top().
+ */
+struct cl_io *cl_io_top(struct cl_io *io)
+{
+	struct cl_io *top = io;
+
+	ENTRY;
+	while (top->ci_parent != NULL)
+		top = top->ci_parent;
+	RETURN(top);
+}
+EXPORT_SYMBOL(cl_io_top);
+
+/**
+ * Meant to print a human readable representation of \a io through
+ * \a printer and \a cookie.
+ *
+ * Currently an empty stub: nothing is printed and no argument is used.
+ */
+void cl_io_print(const struct lu_env *env, void *cookie,
+		 lu_printer_t printer, const struct cl_io *io)
+{
+}
+
+/**
+ * Adds request slice to the compound request.
+ *
+ * This is called by cl_device_operations::cdo_req_init() methods to add a
+ * per-layer state to the request. New state is added at the end of
+ * cl_req::crq_layers list, that is, it is at the bottom of the stack.
+ *
+ * \see cl_lock_slice_add(), cl_page_slice_add(), cl_io_slice_add()
+ */
+void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice,
+		      struct cl_device *dev,
+		      const struct cl_req_operations *ops)
+{
+	ENTRY;
+	slice->crs_req = req;
+	slice->crs_dev = dev;
+	slice->crs_ops = ops;
+	list_add_tail(&slice->crs_linkage, &req->crq_layers);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_req_slice_add);
+
+/**
+ * Frees a request that has no pages and no layers left.
+ *
+ * Every object recorded in the crq_o array has its "cl_req" lu_ref and its
+ * object reference dropped; then the array and the request itself are
+ * freed.
+ */
+static void cl_req_free(const struct lu_env *env, struct cl_req *req)
+{
+	unsigned i;
+
+	LASSERT(list_empty(&req->crq_pages));
+	LASSERT(req->crq_nrpages == 0);
+	LINVRNT(list_empty(&req->crq_layers));
+	LINVRNT(equi(req->crq_nrobjs > 0, req->crq_o != NULL));
+	ENTRY;
+
+	if (req->crq_o != NULL) {
+		for (i = 0; i < req->crq_nrobjs; ++i) {
+			struct cl_req_obj *rqo = &req->crq_o[i];
+
+			/* unused slot */
+			if (rqo->ro_obj == NULL)
+				continue;
+			lu_object_ref_del_at(&rqo->ro_obj->co_lu,
+					     rqo->ro_obj_ref, "cl_req", req);
+			cl_object_put(env, rqo->ro_obj);
+		}
+		OBD_FREE(req->crq_o, req->crq_nrobjs * sizeof req->crq_o[0]);
+	}
+	OBD_FREE_PTR(req);
+	EXIT;
+}
+
+/**
+ * Lets every layer of \a page initialize its part of \a req.
+ *
+ * Starting at the top page (cl_page_top()) and following cp_child, the
+ * cdo_req_init() method of the device behind each page slice is called, if
+ * present. The walk stops at the first non-zero result, which is returned.
+ */
+static int cl_req_init(const struct lu_env *env, struct cl_req *req,
+		       struct cl_page *page)
+{
+	struct cl_device     *dev;
+	struct cl_page_slice *slice;
+	int result;
+
+	ENTRY;
+	result = 0;
+	page = cl_page_top(page);
+	do {
+		list_for_each_entry(slice, &page->cp_layers, cpl_linkage) {
+			dev = lu2cl_dev(slice->cpl_obj->co_lu.lo_dev);
+			if (dev->cd_ops->cdo_req_init != NULL) {
+				result = dev->cd_ops->cdo_req_init(env,
+								   dev, req);
+				/* leaves the inner loop only; the outer loop
+				 * ends through its "result == 0" condition */
+				if (result != 0)
+					break;
+			}
+		}
+		page = page->cp_child;
+	} while (page != NULL && result == 0);
+	RETURN(result);
+}
+
+/**
+ * Invokes per-request transfer completion call-backs
+ * (cl_req_operations::cro_completion()) bottom-to-top.
+ *
+ * Each slice is unlinked from cl_req::crq_layers before its call-back runs.
+ * When no slice is left the request is released with cl_req_free().
+ */
+void cl_req_completion(const struct lu_env *env, struct cl_req *req, int rc)
+{
+	struct list_head *layers = &req->crq_layers;
+	struct cl_req_slice *last;
+
+	ENTRY;
+	/*
+	 * Open-coded reverse walk that tolerates removal, for the lack of
+	 * list_for_each_entry_reverse_safe()...
+	 */
+	while (!list_empty(layers)) {
+		last = list_entry(layers->prev, struct cl_req_slice,
+				  crs_linkage);
+		list_del_init(&last->crs_linkage);
+		if (last->crs_ops->cro_completion != NULL)
+			last->crs_ops->cro_completion(env, last, rc);
+	}
+	cl_req_free(env, req);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_req_completion);
+
+/**
+ * Allocates new transfer request.
+ *
+ * The request gets room for \a nr_objects objects and is then handed to
+ * every layer of \a page for initialization (cl_req_init()).
+ *
+ * \returns the new request, or ERR_PTR(-ENOMEM) if an allocation failed,
+ * or ERR_PTR() of the error reported by a layer. On failure a partially
+ * set up request is torn down through cl_req_completion().
+ */
+struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page,
+			    enum cl_req_type crt, int nr_objects)
+{
+	struct cl_req *req;
+	int result;
+
+	LINVRNT(nr_objects > 0);
+	ENTRY;
+
+	OBD_ALLOC_PTR(req);
+	if (req == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	/*
+	 * Set up the list heads before anything else can fail: the error path
+	 * below runs cl_req_completion() and cl_req_free(), and both of them
+	 * inspect crq_layers and crq_pages.
+	 */
+	req->crq_type = crt;
+	INIT_LIST_HEAD(&req->crq_pages);
+	INIT_LIST_HEAD(&req->crq_layers);
+
+	OBD_ALLOC(req->crq_o, nr_objects * sizeof req->crq_o[0]);
+	if (req->crq_o != NULL) {
+		req->crq_nrobjs = nr_objects;
+		result = cl_req_init(env, req, page);
+	} else
+		result = -ENOMEM;
+
+	if (result != 0) {
+		/* also frees req */
+		cl_req_completion(env, req, result);
+		req = ERR_PTR(result);
+	}
+	RETURN(req);
+}
+EXPORT_SYMBOL(cl_req_alloc);
+
+/**
+ * Adds a page to a request.
+ *
+ * The top page of \a page is linked into cl_req::crq_pages and made to
+ * point at \a req. The top object of the page is recorded in the first
+ * free slot of cl_req::crq_o (taking an object reference and a "cl_req"
+ * lu_ref) unless it is recorded there already. It is a bug (LASSERT) if
+ * the object is new and no slot is free.
+ */
+void cl_req_page_add(const struct lu_env *env,
+		     struct cl_req *req, struct cl_page *page)
+{
+	struct cl_object  *obj;
+	struct cl_req_obj *rqo;
+	int i;
+
+	ENTRY;
+	page = cl_page_top(page);
+
+	LASSERT(list_empty(&page->cp_flight));
+	LASSERT(page->cp_req == NULL);
+
+	CL_PAGE_DEBUG(D_PAGE, env, page, "req %p, %d, %u\n",
+		      req, req->crq_type, req->crq_nrpages);
+
+	list_add_tail(&page->cp_flight, &req->crq_pages);
+	++req->crq_nrpages;
+	page->cp_req = req;
+	obj = cl_object_top(page->cp_obj);
+	/*
+	 * Look for the slot already holding obj, or for the first free one.
+	 * The scan is bounded by crq_nrobjs so that a full array is never
+	 * read past its end before the assertion below can fire.
+	 */
+	for (i = 0, rqo = req->crq_o;
+	     i < req->crq_nrobjs && obj != rqo->ro_obj; ++i, ++rqo) {
+		if (rqo->ro_obj == NULL) {
+			rqo->ro_obj = obj;
+			cl_object_get(obj);
+			rqo->ro_obj_ref = lu_object_ref_add(&obj->co_lu,
+							    "cl_req", req);
+			break;
+		}
+	}
+	LASSERT(i < req->crq_nrobjs);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_req_page_add);
+
+/**
+ * Removes a page from a request.
+ *
+ * Works on the top page of \a page, which is where cl_req_page_add() links
+ * the page into the request and stores the cp_req back pointer: the page
+ * is unlinked from cl_req::crq_pages, the page counter of the request is
+ * decremented and cp_req is reset.
+ */
+void cl_req_page_done(const struct lu_env *env, struct cl_page *page)
+{
+	struct cl_req *req;
+
+	ENTRY;
+	page = cl_page_top(page);
+	/* cp_req is kept on the top page, so read it only after cl_page_top() */
+	req = page->cp_req;
+
+	LASSERT(!list_empty(&page->cp_flight));
+	LASSERT(req != NULL);
+	LASSERT(req->crq_nrpages > 0);
+
+	list_del_init(&page->cp_flight);
+	--req->crq_nrpages;
+	page->cp_req = NULL;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_req_page_done);
+
+/**
+ * Notifies layers that request is about to depart by calling
+ * cl_req_operations::cro_prep() top-to-bottom.
+ *
+ * \returns 0 if no layer objected, otherwise the first non-zero value
+ * returned by a cro_prep() method (later layers are not called).
+ */
+int cl_req_prep(const struct lu_env *env, struct cl_req *req)
+{
+	const struct cl_req_slice *slice;
+	int rc = 0;
+	int i;
+
+	ENTRY;
+	/*
+	 * Every slot of crq_o has to be in use by now, i.e. the caller of
+	 * cl_req_alloc() didn't lie about the number of objects.
+	 */
+	for (i = 0; i < req->crq_nrobjs; ++i)
+		LASSERT(req->crq_o[i].ro_obj != NULL);
+
+	list_for_each_entry(slice, &req->crq_layers, crs_linkage) {
+		if (slice->crs_ops->cro_prep == NULL)
+			continue;
+		rc = slice->crs_ops->cro_prep(env, slice);
+		if (rc != 0)
+			break;
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_req_prep);
+
+/**
+ * Fills in attributes that are passed to server together with transfer. Only
+ * attributes from \a flags may be touched. This can be called multiple times
+ * for the same request.
+ *
+ * \a attr is indexed per request object: entry attr[i] is handed to the
+ * cro_attr_set() method of every layer for i in [0, crq_nrobjs). The
+ * request must hold at least one page (asserted).
+ */
+void cl_req_attr_set(const struct lu_env *env, struct cl_req *req,
+		     struct cl_req_attr *attr, obd_valid flags)
+{
+	const struct cl_req_slice *slice;
+	struct cl_page	    *page;
+	int i;
+
+	LASSERT(!list_empty(&req->crq_pages));
+	ENTRY;
+
+	/* Take any page to use as a model. */
+	page = list_entry(req->crq_pages.next, struct cl_page, cp_flight);
+
+	for (i = 0; i < req->crq_nrobjs; ++i) {
+		list_for_each_entry(slice, &req->crq_layers, crs_linkage) {
+			const struct cl_page_slice *scan;
+			const struct cl_object     *obj;
+
+			/* page slice of the layer this request slice is in */
+			scan = cl_page_at(page,
+					  slice->crs_dev->cd_lu_dev.ld_type);
+			LASSERT(scan != NULL);
+			obj = scan->cpl_obj;
+			if (slice->crs_ops->cro_attr_set != NULL)
+				slice->crs_ops->cro_attr_set(env, slice, obj,
+							     attr + i, flags);
+		}
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_req_attr_set);
+
+/* XXX complete(), init_completion(), and wait_for_completion(), until they are
+ * implemented in libcfs. */
+# include <linux/sched.h>
+
+/**
+ * Initialize synchronous io wait anchor, for transfer of \a nrpages pages.
+ *
+ * csi_sync_nr starts at \a nrpages and is counted down by
+ * cl_sync_io_note(); csi_barrier is raised only when there is at least one
+ * page to wait for; the accumulated result csi_sync_rc starts at 0.
+ */
+void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages)
+{
+	ENTRY;
+	init_waitqueue_head(&anchor->csi_waitq);
+	atomic_set(&anchor->csi_sync_nr, nrpages);
+	atomic_set(&anchor->csi_barrier, nrpages > 0);
+	anchor->csi_sync_rc = 0;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_sync_io_init);
+
+/**
+ * Wait until all transfer completes. Transfer completion routine has to call
+ * cl_sync_io_note() for every page.
+ *
+ * If the first wait fails (l_wait_event() returns a negative value), the
+ * pages of \a queue are cancelled with cl_io_cancel() and the wait is
+ * repeated with a zeroed l_wait_info. Once csi_sync_nr has reached zero the
+ * pages of \a queue are assumed by \a io and \a anchor is poisoned, so it
+ * must be re-initialized before it is used again.
+ *
+ * \a timeout must not be negative (asserted).
+ *
+ * \returns the negative result of the first wait if it failed, otherwise
+ * cl_sync_io::csi_sync_rc as recorded by cl_sync_io_note().
+ */
+int cl_sync_io_wait(const struct lu_env *env, struct cl_io *io,
+		    struct cl_page_list *queue, struct cl_sync_io *anchor,
+		    long timeout)
+{
+	struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout),
+						  NULL, NULL, NULL);
+	int rc;
+	ENTRY;
+
+	LASSERT(timeout >= 0);
+
+	rc = l_wait_event(anchor->csi_waitq,
+			  atomic_read(&anchor->csi_sync_nr) == 0,
+			  &lwi);
+	if (rc < 0) {
+		CERROR("SYNC IO failed with error: %d, try to cancel "
+		       "%d remaining pages\n",
+		       rc, atomic_read(&anchor->csi_sync_nr));
+
+		(void)cl_io_cancel(env, io, queue);
+
+		/* second wait uses a zeroed l_wait_info -- presumably this
+		 * means "no timeout"; confirm against l_wait_event() */
+		lwi = (struct l_wait_info) { 0 };
+		(void)l_wait_event(anchor->csi_waitq,
+				   atomic_read(&anchor->csi_sync_nr) == 0,
+				   &lwi);
+	} else {
+		rc = anchor->csi_sync_rc;
+	}
+	LASSERT(atomic_read(&anchor->csi_sync_nr) == 0);
+	cl_page_list_assume(env, io, queue);
+
+	/* wait until cl_sync_io_note() has done wakeup */
+	while (unlikely(atomic_read(&anchor->csi_barrier) != 0)) {
+		cpu_relax();
+	}
+
+	/* the anchor content is invalid from here on */
+	POISON(anchor, 0x5a, sizeof *anchor);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_sync_io_wait);
+
+/**
+ * Indicate that transfer of a single page completed.
+ *
+ * The first negative \a ioret seen is latched in cl_sync_io::csi_sync_rc;
+ * non-negative values are ignored. When the last outstanding page is
+ * accounted for, waiters on csi_waitq are woken up and csi_barrier is
+ * cleared, which lets cl_sync_io_wait() proceed to poison the anchor.
+ */
+void cl_sync_io_note(struct cl_sync_io *anchor, int ioret)
+{
+	ENTRY;
+	if (anchor->csi_sync_rc == 0 && ioret < 0)
+		anchor->csi_sync_rc = ioret;
+	/*
+	 * Synchronous IO done without releasing page lock (e.g., as a part of
+	 * ->{prepare,commit}_write()). Completion is used to signal the end of
+	 * IO.
+	 */
+	LASSERT(atomic_read(&anchor->csi_sync_nr) > 0);
+	if (atomic_dec_and_test(&anchor->csi_sync_nr)) {
+		wake_up_all(&anchor->csi_waitq);
+		/* it's safe to nuke or reuse anchor now */
+		atomic_set(&anchor->csi_barrier, 0);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_sync_io_note);
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_lock.c b/drivers/staging/lustre/lustre/obdclass/cl_lock.c
new file mode 100644
index 000000000000..d34e044fc854
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/cl_lock.c
@@ -0,0 +1,2304 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client Extent Lock.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <linux/list.h>
+#include <cl_object.h>
+#include "cl_internal.h"
+
+/** Lock class of cl_lock::cll_guard */
+static struct lock_class_key cl_lock_guard_class;
+static struct kmem_cache *cl_lock_kmem;	/* slab cache for struct cl_lock */
+
+static struct lu_kmem_descr cl_lock_caches[] = {
+	{
+		.ckd_cache = &cl_lock_kmem,
+		.ckd_name  = "cl_lock_kmem",
+		.ckd_size  = sizeof (struct cl_lock)
+	},
+	{
+		.ckd_cache = NULL	/* last entry; presumably the terminator */
+	}
+};
+
+#define CS_LOCK_INC(o, item)		/* these four hooks expand to nothing */
+#define CS_LOCK_DEC(o, item)
+#define CS_LOCKSTATE_INC(o, state)
+#define CS_LOCKSTATE_DEC(o, state)
+
+/**
+ * Basic lock invariant that holds at all times. The caller either owns a
+ * reference to \a lock or otherwise guarantees that \a lock cannot be freed.
+ *
+ * \see cl_lock_invariant()
+ */
+static int cl_lock_invariant_trusted(const struct lu_env *env,
+				     const struct cl_lock *lock)
+{
+	if (!ergo(lock->cll_state == CLS_FREEING, lock->cll_holds == 0) ||
+	    atomic_read(&lock->cll_ref) < lock->cll_holds)
+		return 0;
+	if (lock->cll_holds < lock->cll_users || lock->cll_holds < 0)
+		return 0;
+	return lock->cll_users >= 0 && lock->cll_depth >= 0;
+}
+
+/**
+ * Stronger invariant: additionally requires a positive reference count.
+ *
+ * \see cl_lock_invariant_trusted()
+ */
+static int cl_lock_invariant(const struct lu_env *env,
+			     const struct cl_lock *lock)
+{
+	int holds;
+
+	holds = atomic_read(&lock->cll_ref) > 0 &&
+		cl_lock_invariant_trusted(env, lock);
+	if (env != NULL && !holds)
+		CL_LOCK_DEBUG(D_ERROR, env, lock, "invariant broken");
+	return holds;
+}
+
+/** Reports lock "nesting": 0 for a top-lock and 1 for a sub-lock. */
+static enum clt_nesting_level cl_lock_nesting(const struct cl_lock *lock)
+{
+	struct cl_object_header *hd = cl_object_header(lock->cll_descr.cld_obj);
+
+	return hd->coh_nesting;
+}
+
+/**
+ * Picks the per-thread counter set matching the nesting level of \a lock.
+ */
+static struct cl_thread_counters *cl_lock_counters(const struct lu_env *env,
+						   const struct cl_lock *lock)
+{
+	struct cl_thread_info *cti = cl_env_info(env);
+	enum clt_nesting_level level;
+
+	level = cl_lock_nesting(lock);
+	LASSERT(level < ARRAY_SIZE(cti->clt_counters));
+
+	return &cti->clt_counters[level];
+}
+
+static void cl_lock_trace0(int level, const struct lu_env *env,
+			   const char *prefix, const struct cl_lock *lock,
+			   const char *func, const int line)
+{	/* logs ref, guarder, depth, state, error, holds, users and flags */
+	struct cl_object_header *h = cl_object_header(lock->cll_descr.cld_obj);
+	CDEBUG(level, "%s: %p@(%d %p %d %d %d %d %d %lx)"
+		      "(%p/%d/%d) at %s():%d\n",
+	       prefix, lock, atomic_read(&lock->cll_ref),
+	       lock->cll_guarder, lock->cll_depth,
+	       lock->cll_state, lock->cll_error, lock->cll_holds,
+	       lock->cll_users, lock->cll_flags,
+	       env, h->coh_nesting, cl_lock_nr_mutexed(env),
+	       func, line);	/* call site, supplied by cl_lock_trace() */
+}
+#define cl_lock_trace(level, env, prefix, lock)			 \
+	cl_lock_trace0(level, env, prefix, lock, __FUNCTION__, __LINE__)
+/* Return address of the current function, as an unsigned long. */
+#define RETIP ((unsigned long)__builtin_return_address(0))
+
+#ifdef CONFIG_LOCKDEP
+static struct lock_class_key cl_lock_key;
+/* Gives \a lock the lockdep class cl_lock_key under the name "EXT". */
+static void cl_lock_lockdep_init(struct cl_lock *lock)
+{
+	lockdep_set_class_and_name(lock, &cl_lock_key, "EXT");
+}
+/* Counts the acquisition per thread and tells lockdep; enqflags is unused. */
+static void cl_lock_lockdep_acquire(const struct lu_env *env,
+				    struct cl_lock *lock, __u32 enqflags)
+{
+	cl_lock_counters(env, lock)->ctc_nr_locks_acquired++;
+	lock_map_acquire(&lock->dep_map);
+}
+/* Reverses cl_lock_lockdep_acquire(): decrements the counter, then releases. */
+static void cl_lock_lockdep_release(const struct lu_env *env,
+				    struct cl_lock *lock)
+{
+	cl_lock_counters(env, lock)->ctc_nr_locks_acquired--;
+	lock_release(&lock->dep_map, 0, RETIP);
+}
+
+#else /* !CONFIG_LOCKDEP */
+/* Without CONFIG_LOCKDEP all three hooks have empty bodies. */
+static void cl_lock_lockdep_init(struct cl_lock *lock)
+{}
+static void cl_lock_lockdep_acquire(const struct lu_env *env,
+				    struct cl_lock *lock, __u32 enqflags)
+{}
+static void cl_lock_lockdep_release(const struct lu_env *env,
+				    struct cl_lock *lock)
+{}
+
+#endif /* !CONFIG_LOCKDEP */
+
+/**
+ * Attaches a per-layer slice to a compound lock.
+ *
+ * cl_object_operations::coo_lock_init() methods call this to hang their
+ * layer-private state off \a lock. The slice is appended to the tail of
+ * cl_lock::cll_layers, that is, it becomes the bottom of the stack.
+ *
+ * \see cl_req_slice_add(), cl_page_slice_add(), cl_io_slice_add()
+ */
+void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice,
+		       struct cl_object *obj,
+		       const struct cl_lock_operations *ops)
+{
+	ENTRY;
+	slice->cls_lock = lock;	/* back pointer to the compound lock */
+	list_add_tail(&slice->cls_linkage, &lock->cll_layers);
+	slice->cls_obj = obj;	/* object of the layer owning this slice */
+	slice->cls_ops = ops;	/* that layer's lock methods */
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_slice_add);
+
+/**
+ * Tells whether a lock held in mode \a has gives at least the guarantees of
+ * a lock requested in mode \a need.
+ */
+int cl_lock_mode_match(enum cl_lock_mode has, enum cl_lock_mode need)
+{
+	LINVRNT(need == CLM_READ || need == CLM_WRITE ||
+		need == CLM_PHANTOM || need == CLM_GROUP);
+	LINVRNT(has == CLM_READ || has == CLM_WRITE ||
+		has == CLM_PHANTOM || has == CLM_GROUP);
+	CLASSERT(CLM_PHANTOM < CLM_READ);
+	CLASSERT(CLM_READ < CLM_WRITE);
+	CLASSERT(CLM_WRITE < CLM_GROUP);
+
+	/* a group lock satisfies group requests only; others go by order */
+	if (has == CLM_GROUP)
+		return need == CLM_GROUP;
+	return need <= has;
+}
+EXPORT_SYMBOL(cl_lock_mode_match);
+
+/**
+ * Tells whether the extent, mode and group id of \a has cover \a need.
+ */
+int cl_lock_ext_match(const struct cl_lock_descr *has,
+		      const struct cl_lock_descr *need)
+{
+	if (has->cld_start > need->cld_start || has->cld_end < need->cld_end)
+		return 0;
+	if (!cl_lock_mode_match(has->cld_mode, need->cld_mode))
+		return 0;
+	return has->cld_mode != CLM_GROUP || has->cld_gid == need->cld_gid;
+}
+EXPORT_SYMBOL(cl_lock_ext_match);
+
+/**
+ * Tells whether a lock described by \a has gives at least the guarantees of
+ * a lock described by \a need: same object and a matching extent.
+ */
+int cl_lock_descr_match(const struct cl_lock_descr *has,
+			const struct cl_lock_descr *need)
+{
+	if (!cl_object_same(has->cld_obj, need->cld_obj))
+		return 0;
+	return cl_lock_ext_match(has, need) != 0;
+}
+EXPORT_SYMBOL(cl_lock_descr_match);
+
+static void cl_lock_free(const struct lu_env *env, struct cl_lock *lock)
+{	/* finalizes every slice, then returns \a lock to cl_lock_kmem */
+	struct cl_object *obj = lock->cll_descr.cld_obj;
+
+	LINVRNT(!cl_lock_is_mutexed(lock));
+
+	ENTRY;
+	cl_lock_trace(D_DLMTRACE, env, "free lock", lock);
+	might_sleep();
+	while (!list_empty(&lock->cll_layers)) {
+		struct cl_lock_slice *slice;
+		/* unlink the first remaining slice, then run its ->clo_fini() */
+		slice = list_entry(lock->cll_layers.next,
+				       struct cl_lock_slice, cls_linkage);
+		list_del_init(lock->cll_layers.next);
+		slice->cls_ops->clo_fini(env, slice);
+	}
+	CS_LOCK_DEC(obj, total);
+	CS_LOCKSTATE_DEC(obj, lock->cll_state);
+	lu_object_ref_del_at(&obj->co_lu, lock->cll_obj_ref, "cl_lock", lock);
+	cl_object_put(env, obj); /* pairs with cl_object_get() in cl_lock_alloc() */
+	lu_ref_fini(&lock->cll_reference);
+	lu_ref_fini(&lock->cll_holders);
+	mutex_destroy(&lock->cll_guard);
+	OBD_SLAB_FREE_PTR(lock, cl_lock_kmem);
+	EXIT;
+}
+
+/**
+ * Releases a reference on a lock.
+ *
+ * When last reference is released, lock is returned to the cache, unless it
+ * is in cl_lock_state::CLS_FREEING state, in which case it is destroyed
+ * immediately.
+ *
+ * \see cl_object_put(), cl_page_put()
+ */
+void cl_lock_put(const struct lu_env *env, struct cl_lock *lock)
+{
+	struct cl_object	*obj;
+
+	LINVRNT(cl_lock_invariant(env, lock));
+	ENTRY;
+	obj = lock->cll_descr.cld_obj; /* fetched before the lock may be freed */
+	LINVRNT(obj != NULL);
+
+	CDEBUG(D_TRACE, "releasing reference: %d %p %lu\n",
+	       atomic_read(&lock->cll_ref), lock, RETIP);
+	/* only the drop of the last reference can free a CLS_FREEING lock */
+	if (atomic_dec_and_test(&lock->cll_ref)) {
+		if (lock->cll_state == CLS_FREEING) {
+			LASSERT(list_empty(&lock->cll_linkage));
+			cl_lock_free(env, lock);
+		}
+		CS_LOCK_DEC(obj, busy); /* takes obj only, never the lock */
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_put);
+
+/**
+ * Acquires an additional reference to a lock.
+ *
+ * This can be called only by caller already possessing a reference to \a
+ * lock.
+ *
+ * \see cl_object_get(), cl_page_get()
+ */
+void cl_lock_get(struct cl_lock *lock)
+{
+	LINVRNT(cl_lock_invariant(NULL, lock)); /* NULL env: no debug dump */
+	CDEBUG(D_TRACE, "acquiring reference: %d %p %lu\n",
+	       atomic_read(&lock->cll_ref), lock, RETIP);
+	atomic_inc(&lock->cll_ref);
+}
+EXPORT_SYMBOL(cl_lock_get);
+
+/**
+ * Acquires a reference to a lock.
+ *
+ * This is much like cl_lock_get(), except that this function can be used to
+ * acquire initial reference to the cached lock. Caller has to deal with all
+ * possible races. Use with care!
+ *
+ * \see cl_page_get_trust()
+ */
+void cl_lock_get_trust(struct cl_lock *lock)
+{
+	CDEBUG(D_TRACE, "acquiring trusted reference: %d %p %lu\n",
+	       atomic_read(&lock->cll_ref), lock, RETIP);
+	if (atomic_inc_return(&lock->cll_ref) == 1) /* count went 0 -> 1 */
+		CS_LOCK_INC(lock->cll_descr.cld_obj, busy);
+}
+EXPORT_SYMBOL(cl_lock_get_trust);
+
+/**
+ * Helper function destroying the lock that wasn't completely initialized.
+ *
+ * Other threads can acquire references to the top-lock through its
+ * sub-locks. Hence, it cannot be cl_lock_free()-ed immediately.
+ */
+static void cl_lock_finish(const struct lu_env *env, struct cl_lock *lock)
+{
+	cl_lock_mutex_get(env, lock);	/* cancel and delete run under the mutex */
+	cl_lock_cancel(env, lock);
+	cl_lock_delete(env, lock);
+	cl_lock_mutex_put(env, lock);
+	cl_lock_put(env, lock);		/* drops one reference on the lock */
+}
+
+static struct cl_lock *cl_lock_alloc(const struct lu_env *env,
+				     struct cl_object *obj,
+				     const struct cl_io *io,
+				     const struct cl_lock_descr *descr)
+{
+	struct lu_object_header *hdr;
+	struct cl_lock *lock;
+
+	ENTRY;
+	OBD_SLAB_ALLOC_PTR_GFP(lock, cl_lock_kmem, __GFP_IO);
+	if (lock == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	/* the new lock starts in CLS_NEW with a single reference */
+	atomic_set(&lock->cll_ref, 1);
+	lock->cll_descr = *descr;
+	lock->cll_state = CLS_NEW;
+	cl_object_get(obj);
+	lock->cll_obj_ref = lu_object_ref_add(&obj->co_lu, "cl_lock", lock);
+	INIT_LIST_HEAD(&lock->cll_layers);
+	INIT_LIST_HEAD(&lock->cll_linkage);
+	INIT_LIST_HEAD(&lock->cll_inclosure);
+	lu_ref_init(&lock->cll_reference);
+	lu_ref_init(&lock->cll_holders);
+	mutex_init(&lock->cll_guard);
+	lockdep_set_class(&lock->cll_guard, &cl_lock_guard_class);
+	init_waitqueue_head(&lock->cll_wq);
+	hdr = obj->co_lu.lo_header;
+	CS_LOCKSTATE_INC(obj, CLS_NEW);
+	CS_LOCK_INC(obj, total);
+	CS_LOCK_INC(obj, create);
+	cl_lock_lockdep_init(lock);
+
+	/* let every layer of the object attach its slice; obj is the cursor */
+	list_for_each_entry(obj, &hdr->loh_layers, co_lu.lo_linkage) {
+		int rc = obj->co_ops->coo_lock_init(env, obj, lock, io);
+
+		if (rc != 0) {
+			cl_lock_finish(env, lock);
+			lock = ERR_PTR(rc);
+			break;
+		}
+	}
+	RETURN(lock);
+}
+
+/**
+ * Transfer the lock into INTRANSIT state and return the original state.
+ *
+ * \pre  state: CLS_CACHED, CLS_HELD or CLS_ENQUEUED
+ * \post state: CLS_INTRANSIT
+ * \see CLS_INTRANSIT
+ */
+enum cl_lock_state cl_lock_intransit(const struct lu_env *env,
+				     struct cl_lock *lock)
+{
+	enum cl_lock_state state = lock->cll_state;
+
+	LASSERT(cl_lock_is_mutexed(lock));
+	LASSERT(state != CLS_INTRANSIT);
+	LASSERTF(state >= CLS_ENQUEUED && state <= CLS_CACHED,
+		 "Malformed lock state %d.\n", state);
+	/* switch state, record the owner, and add an "intransit" hold */
+	cl_lock_state_set(env, lock, CLS_INTRANSIT);
+	lock->cll_intransit_owner = current;
+	cl_lock_hold_add(env, lock, "intransit", current);
+	return state;	/* the state the lock had on entry */
+}
+EXPORT_SYMBOL(cl_lock_intransit);
+
+/**
+ * Exits the intransit state, moving the lock to \a state (not INTRANSIT).
+ */
+void cl_lock_extransit(const struct lu_env *env, struct cl_lock *lock,
+		       enum cl_lock_state state)
+{
+	LASSERT(cl_lock_is_mutexed(lock));
+	LASSERT(lock->cll_state == CLS_INTRANSIT);
+	LASSERT(state != CLS_INTRANSIT);
+	LASSERT(lock->cll_intransit_owner == current); /* only the owner exits */
+
+	lock->cll_intransit_owner = NULL;
+	cl_lock_state_set(env, lock, state);
+	cl_lock_unhold(env, lock, "intransit", current); /* "intransit" hold */
+}
+EXPORT_SYMBOL(cl_lock_extransit);
+
+/**
+ * Checks whether the lock is in intransit state owned by another thread.
+ */
+int cl_lock_is_intransit(struct cl_lock *lock)
+{
+	LASSERT(cl_lock_is_mutexed(lock));
+	return lock->cll_state == CLS_INTRANSIT &&
+	       lock->cll_intransit_owner != current; /* own transit: false */
+}
+EXPORT_SYMBOL(cl_lock_is_intransit);
+/**
+ * True iff every layer accepts \a lock for \a io. E.g., truncate/O_APPEND locks
+ * cannot serve read/non-append-write: multi-stripe, cascading timeouts.
+ */
+static int cl_lock_fits_into(const struct lu_env *env,
+			     const struct cl_lock *lock,
+			     const struct cl_lock_descr *need,
+			     const struct cl_io *io)
+{
+	const struct cl_lock_slice *layer;
+
+	LINVRNT(cl_lock_invariant_trusted(env, lock));
+	ENTRY;
+	list_for_each_entry(layer, &lock->cll_layers, cls_linkage) {
+		if (layer->cls_ops->clo_fits_into == NULL)
+			continue;
+		if (!layer->cls_ops->clo_fits_into(env, layer, need, io))
+			RETURN(0);
+	}
+	RETURN(1);
+}
+
+static struct cl_lock *cl_lock_lookup(const struct lu_env *env,
+				      struct cl_object *obj,
+				      const struct cl_io *io,
+				      const struct cl_lock_descr *need)
+{
+	struct cl_object_header *hdr = cl_object_header(obj);
+	struct cl_lock *scan;
+
+	ENTRY;
+	/* the list walk expects coh_lock_guard to be held */
+	LINVRNT(spin_is_locked(&hdr->coh_lock_guard));
+	CS_LOCK_INC(obj, lookup);
+	list_for_each_entry(scan, &hdr->coh_locks, cll_linkage) {
+		int matched;
+
+		matched = cl_lock_ext_match(&scan->cll_descr, need) &&
+			  scan->cll_state < CLS_FREEING &&
+			  scan->cll_error == 0 &&
+			  !(scan->cll_flags & CLF_CANCELLED) &&
+			  cl_lock_fits_into(env, scan, need, io);
+		CDEBUG(D_DLMTRACE, "has: "DDESCR"(%d) need: "DDESCR": %d\n",
+		       PDESCR(&scan->cll_descr), scan->cll_state, PDESCR(need),
+		       matched);
+		if (!matched)
+			continue;
+		/* first match wins; it is returned with a reference taken */
+		cl_lock_get_trust(scan);
+		CS_LOCK_INC(obj, hit);
+		RETURN(scan);
+	}
+	RETURN(NULL);
+}
+
+/**
+ * Returns a lock matching description \a need.
+ *
+ * This is the main entry point into the cl_lock caching interface. First, a
+ * cache (implemented as a per-object linked list) is consulted. If lock is
+ * found there, it is returned immediately. Otherwise new lock is allocated
+ * and returned. In any case, additional reference to lock is acquired.
+ *
+ * \see cl_object_find(), cl_page_find()
+ */
+static struct cl_lock *cl_lock_find(const struct lu_env *env,
+				    const struct cl_io *io,
+				    const struct cl_lock_descr *need)
+{
+	struct cl_object_header *head;
+	struct cl_object	*obj;
+	struct cl_lock	  *lock;
+
+	ENTRY;
+
+	obj  = need->cld_obj;
+	head = cl_object_header(obj);
+
+	spin_lock(&head->coh_lock_guard);
+	lock = cl_lock_lookup(env, obj, io, need);
+	spin_unlock(&head->coh_lock_guard);
+	/* miss: allocate outside the spinlock, then look again under it */
+	if (lock == NULL) {
+		lock = cl_lock_alloc(env, obj, io, need);
+		if (!IS_ERR(lock)) {
+			struct cl_lock *ghost;
+			/* ghost: a match that appeared since the first lookup */
+			spin_lock(&head->coh_lock_guard);
+			ghost = cl_lock_lookup(env, obj, io, need);
+			if (ghost == NULL) {
+				list_add_tail(&lock->cll_linkage,
+						  &head->coh_locks);
+				spin_unlock(&head->coh_lock_guard);
+				CS_LOCK_INC(obj, busy);
+			} else {
+				spin_unlock(&head->coh_lock_guard);
+				/*
+				 * Other threads can acquire references to the
+				 * top-lock through its sub-locks. Hence, it
+				 * cannot be cl_lock_free()-ed immediately.
+				 */
+				cl_lock_finish(env, lock);
+				lock = ghost;
+			}
+		}
+	}
+	RETURN(lock);	/* may be an ERR_PTR() coming from cl_lock_alloc() */
+}
+
+/**
+ * Returns existing lock matching given description. This is similar to
+ * cl_lock_find() except that no new lock is created, and returned lock is
+ * guaranteed to be in enum cl_lock_state::CLS_HELD state.
+ */
+struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io,
+			     const struct cl_lock_descr *need,
+			     const char *scope, const void *source)
+{
+	struct cl_object_header *head;
+	struct cl_object	*obj;
+	struct cl_lock	  *lock;
+
+	obj  = need->cld_obj;
+	head = cl_object_header(obj);
+
+	do {
+		spin_lock(&head->coh_lock_guard);
+		lock = cl_lock_lookup(env, obj, io, need);
+		spin_unlock(&head->coh_lock_guard);
+		if (lock == NULL)
+			return NULL;
+		/* a referenced match was found; inspect it under its mutex */
+		cl_lock_mutex_get(env, lock);
+		if (lock->cll_state == CLS_INTRANSIT)
+			/* Don't care return value. */
+			cl_lock_state_wait(env, lock);
+		if (lock->cll_state == CLS_FREEING) {
+			cl_lock_mutex_put(env, lock);
+			cl_lock_put(env, lock);
+			lock = NULL;
+		}
+	} while (lock == NULL);	/* look up again if the match was being freed */
+
+	cl_lock_hold_add(env, lock, scope, source);
+	cl_lock_user_add(env, lock);
+	if (lock->cll_state == CLS_CACHED)
+		cl_use_try(env, lock, 1);
+	if (lock->cll_state == CLS_HELD) {
+		cl_lock_mutex_put(env, lock);
+		cl_lock_lockdep_acquire(env, lock, 0);
+		cl_lock_put(env, lock);	/* the hold and user stay with the caller */
+	} else {
+		cl_unuse_try(env, lock);	/* not held: back everything out */
+		cl_lock_unhold(env, lock, scope, source);
+		cl_lock_mutex_put(env, lock);
+		cl_lock_put(env, lock);
+		lock = NULL;
+	}
+
+	return lock;
+}
+EXPORT_SYMBOL(cl_lock_peek);
+
+/**
+ * Finds the slice of \a lock that belongs to the layer whose device type is
+ * \a dtype; NULL is returned when no layer of the lock has that type.
+ *
+ * \see cl_page_at()
+ */
+const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock,
+				       const struct lu_device_type *dtype)
+{
+	const struct cl_lock_slice *layer;
+
+	LINVRNT(cl_lock_invariant_trusted(NULL, lock));
+	ENTRY;
+
+	list_for_each_entry(layer, &lock->cll_layers, cls_linkage)
+		if (layer->cls_obj->co_lu.lo_dev->ld_type == dtype)
+			RETURN(layer);
+
+	RETURN(NULL);
+}
+EXPORT_SYMBOL(cl_lock_at);
+
+/* Bookkeeping shared by the mutex get/try paths once the mutex is owned. */
+static void cl_lock_mutex_tail(const struct lu_env *env, struct cl_lock *lock)
+{
+	struct cl_thread_counters *ctc = cl_lock_counters(env, lock);
+
+	lock->cll_depth++;
+	ctc->ctc_nr_locks_locked++;
+	lu_ref_add(&ctc->ctc_locks_locked, "cll_guard", lock);
+	cl_lock_trace(D_TRACE, env, "got mutex", lock);
+}
+
+/**
+ * Locks cl_lock object.
+ *
+ * This is used to manipulate cl_lock fields, and to serialize state
+ * transitions in the lock state machine.
+ *
+ * \post cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_mutex_put()
+ */
+void cl_lock_mutex_get(const struct lu_env *env, struct cl_lock *lock)
+{
+	LINVRNT(cl_lock_invariant(env, lock));
+
+	if (lock->cll_guarder == current) { /* re-entry by the owning thread */
+		LINVRNT(cl_lock_is_mutexed(lock));
+		LINVRNT(lock->cll_depth > 0);
+	} else {
+		struct cl_object_header *hdr;
+		struct cl_thread_info   *info;
+		int i;
+
+		LINVRNT(lock->cll_guarder != current);
+		hdr = cl_object_header(lock->cll_descr.cld_obj);
+		/*
+		 * Check that mutices are taken in the bottom-to-top order.
+		 */
+		info = cl_env_info(env);
+		for (i = 0; i < hdr->coh_nesting; ++i)
+			LASSERT(info->clt_counters[i].ctc_nr_locks_locked == 0);
+		/* the nesting level is passed as the lockdep subclass */
+		mutex_lock_nested(&lock->cll_guard, hdr->coh_nesting);
+		lock->cll_guarder = current;
+		LINVRNT(lock->cll_depth == 0);
+	}
+	cl_lock_mutex_tail(env, lock);	/* bumps cll_depth and thread counters */
+}
+EXPORT_SYMBOL(cl_lock_mutex_get);
+
+/**
+ * Non-blocking variant of cl_lock_mutex_get().
+ *
+ * \retval 0 \a lock is now locked by the current thread
+ *
+ * \retval -EBUSY \a lock is locked by somebody else right now
+ *
+ * \post ergo(result == 0, cl_lock_is_mutexed(lock))
+ *
+ * \see cl_lock_mutex_get()
+ */
+int cl_lock_mutex_try(const struct lu_env *env, struct cl_lock *lock)
+{
+	int rc = 0;
+
+	LINVRNT(cl_lock_invariant_trusted(env, lock));
+	ENTRY;
+
+	if (lock->cll_guarder == current) {
+		LINVRNT(lock->cll_depth > 0);
+	} else if (mutex_trylock(&lock->cll_guard)) {
+		LINVRNT(lock->cll_depth == 0);
+		lock->cll_guarder = current;
+	} else {
+		rc = -EBUSY;
+	}
+	if (rc == 0)
+		cl_lock_mutex_tail(env, lock);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_lock_mutex_try);
+
+/**
+ * Unlocks cl_lock object.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_mutex_get()
+ */
+void cl_lock_mutex_put(const struct lu_env *env, struct cl_lock *lock)
+{
+	struct cl_thread_counters *counters;
+
+	LINVRNT(cl_lock_invariant(env, lock));
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(lock->cll_guarder == current);
+	LINVRNT(lock->cll_depth > 0);
+
+	counters = cl_lock_counters(env, lock);
+	LINVRNT(counters->ctc_nr_locks_locked > 0);
+	/* undo the bookkeeping done by cl_lock_mutex_tail() */
+	cl_lock_trace(D_TRACE, env, "put mutex", lock);
+	lu_ref_del(&counters->ctc_locks_locked, "cll_guard", lock);
+	counters->ctc_nr_locks_locked--;
+	if (--lock->cll_depth == 0) {	/* outermost put: really unlock */
+		lock->cll_guarder = NULL;
+		mutex_unlock(&lock->cll_guard);
+	}
+}
+EXPORT_SYMBOL(cl_lock_mutex_put);
+
+/**
+ * Returns true iff lock's mutex is owned by the current thread.
+ */
+int cl_lock_is_mutexed(struct cl_lock *lock)
+{
+	return lock->cll_guarder == current; /* owner is tracked in cll_guarder */
+}
+EXPORT_SYMBOL(cl_lock_is_mutexed);
+
+/**
+ * Counts the cl_lock mutices held by the current thread (environment).
+ */
+int cl_lock_nr_mutexed(const struct lu_env *env)
+{
+	struct cl_thread_info *cti = cl_env_info(env);
+	int total = 0;
+	int level;
+
+	/*
+	 * NOTE: should summing over all nesting levels (currently 2) ever
+	 *       prove too expensive, struct cl_thread_info can carry a
+	 *       summary counter instead.
+	 */
+	for (level = 0; level < ARRAY_SIZE(cti->clt_counters); ++level)
+		total += cti->clt_counters[level].ctc_nr_locks_locked;
+
+	return total;
+}
+EXPORT_SYMBOL(cl_lock_nr_mutexed);
+
+static void cl_lock_cancel0(const struct lu_env *env, struct cl_lock *lock)
+{	/* marks the lock CLF_CANCELLED and runs ->clo_cancel() on each layer */
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+	ENTRY;
+	if (!(lock->cll_flags & CLF_CANCELLED)) { /* cancel at most once */
+		const struct cl_lock_slice *slice;
+
+		lock->cll_flags |= CLF_CANCELLED;
+		list_for_each_entry_reverse(slice, &lock->cll_layers,
+						cls_linkage) {
+			if (slice->cls_ops->clo_cancel != NULL)
+				slice->cls_ops->clo_cancel(env, slice);
+		}
+	}
+	EXIT;
+}
+
+static void cl_lock_delete0(const struct lu_env *env, struct cl_lock *lock)
+{
+	struct cl_object_header    *head;
+	const struct cl_lock_slice *slice;
+
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+
+	ENTRY;
+	if (lock->cll_state < CLS_FREEING) {	/* no-op once already freeing */
+		LASSERT(lock->cll_state != CLS_INTRANSIT);
+		cl_lock_state_set(env, lock, CLS_FREEING);
+
+		head = cl_object_header(lock->cll_descr.cld_obj);
+		/* take the lock off the per-object list searched by lookups */
+		spin_lock(&head->coh_lock_guard);
+		list_del_init(&lock->cll_linkage);
+		spin_unlock(&head->coh_lock_guard);
+
+		/*
+		 * From now on, no new references to this lock can be acquired
+		 * by cl_lock_lookup().
+		 */
+		list_for_each_entry_reverse(slice, &lock->cll_layers,
+						cls_linkage) {
+			if (slice->cls_ops->clo_delete != NULL)
+				slice->cls_ops->clo_delete(env, slice);
+		}
+		/*
+		 * From now on, no new references to this lock can be acquired
+		 * by layer-specific means (like a pointer from struct
+		 * ldlm_lock in osc, or a pointer from top-lock to sub-lock in
+		 * lov).
+		 *
+		 * Lock will be finally freed in cl_lock_put() when last of
+		 * existing references goes away.
+		 */
+	}
+	EXIT;
+}
+
+/**
+ * Adjusts cl_lock::cll_holds of \a lock by \a delta. For a top-lock
+ * (nesting == 0) the change is mirrored in the per-thread debugging
+ * counters. Sub-lock holds can be released by a thread different from the
+ * one that acquired them.
+ */
+static void cl_lock_hold_mod(const struct lu_env *env, struct cl_lock *lock,
+			     int delta)
+{
+	lock->cll_holds += delta;
+
+	/* only top-locks feed the per-thread debugging counters */
+	if (cl_lock_nesting(lock) == CNL_TOP) {
+		struct cl_thread_counters *top;
+
+		top = &cl_env_info(env)->clt_counters[CNL_TOP];
+		top->ctc_nr_held += delta;
+		LASSERT(top->ctc_nr_held >= 0);
+	}
+}
+
+/**
+ * Adjusts cl_lock::cll_users of \a lock by \a delta. See cl_lock_hold_mod()
+ * for the explanation of the debugging code.
+ */
+static void cl_lock_used_mod(const struct lu_env *env, struct cl_lock *lock,
+			     int delta)
+{
+	lock->cll_users += delta;
+
+	/* only top-locks feed the per-thread debugging counters */
+	if (cl_lock_nesting(lock) == CNL_TOP) {
+		struct cl_thread_counters *top;
+
+		top = &cl_env_info(env)->clt_counters[CNL_TOP];
+		top->ctc_nr_used += delta;
+		LASSERT(top->ctc_nr_used >= 0);
+	}
+}
+
+void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock,
+			  const char *scope, const void *source)
+{
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+	LASSERT(lock->cll_holds > 0);
+	/* Drops one hold; the last one runs any pending cancel and delete. */
+	ENTRY;
+	cl_lock_trace(D_DLMTRACE, env, "hold release lock", lock);
+	lu_ref_del(&lock->cll_holders, scope, source);
+	cl_lock_hold_mod(env, lock, -1);
+	if (lock->cll_holds == 0) {
+		CL_LOCK_ASSERT(lock->cll_state != CLS_HELD, env, lock);
+		if (lock->cll_descr.cld_mode == CLM_PHANTOM ||
+		    lock->cll_descr.cld_mode == CLM_GROUP ||
+		    lock->cll_state != CLS_CACHED)
+			/*
+			 * Phantom and group locks, and any lock that is not
+			 * CLS_CACHED, are destroyed once the last hold is gone.
+			 */
+			lock->cll_flags |= CLF_CANCELPEND|CLF_DOOMED;
+		if (lock->cll_flags & CLF_CANCELPEND) {
+			lock->cll_flags &= ~CLF_CANCELPEND;
+			cl_lock_cancel0(env, lock);
+		}
+		if (lock->cll_flags & CLF_DOOMED) {
+			/* no longer doomed: it's dead... Jim. */
+			lock->cll_flags &= ~CLF_DOOMED;
+			cl_lock_delete0(env, lock);
+		}
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_hold_release);
+
+/**
+ * Waits until lock state is changed.
+ *
+ * This function is called with cl_lock mutex locked, atomically releases
+ * mutex and goes to sleep, waiting for a lock state change (signaled by
+ * cl_lock_signal()), and re-acquires the mutex before return.
+ *
+ * This function is used to wait until lock state machine makes some progress
+ * and to emulate synchronous operations on top of asynchronous lock
+ * interface.
+ *
+ * \retval -ERESTARTSYS wait was interrupted by a signal
+ *
+ * \retval 0 wait wasn't interrupted
+ * \retval lock->cll_error if it is already set on entry (no wait is done)
+ * \pre cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_signal()
+ */
+int cl_lock_state_wait(const struct lu_env *env, struct cl_lock *lock)
+{
+	wait_queue_t waiter;
+	sigset_t blocked;
+	int result;
+
+	ENTRY;
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+	LASSERT(lock->cll_depth == 1); /* so the put below really unlocks */
+	LASSERT(lock->cll_state != CLS_FREEING); /* too late to wait */
+
+	cl_lock_trace(D_DLMTRACE, env, "state wait lock", lock);
+	result = lock->cll_error;
+	if (result == 0) {
+		/* To avoid being interrupted by the 'non-fatal' signals
+		 * (SIGCHLD, for instance), we'd block them temporarily.
+		 * LU-305 */
+		blocked = cfs_block_sigsinv(LUSTRE_FATAL_SIGS);
+		/* get queued on cll_wq before the mutex is dropped */
+		init_waitqueue_entry_current(&waiter);
+		add_wait_queue(&lock->cll_wq, &waiter);
+		set_current_state(TASK_INTERRUPTIBLE);
+		cl_lock_mutex_put(env, lock);
+
+		LASSERT(cl_lock_nr_mutexed(env) == 0);
+
+		/* Returning ERESTARTSYS instead of EINTR so syscalls
+		 * can be restarted if signals are pending here */
+		result = -ERESTARTSYS;
+		if (likely(!OBD_FAIL_CHECK(OBD_FAIL_LOCK_STATE_WAIT_INTR))) {
+			waitq_wait(&waiter, TASK_INTERRUPTIBLE);
+			if (!cfs_signal_pending())
+				result = 0;
+		}
+		/* take the mutex back before leaving the wait queue */
+		cl_lock_mutex_get(env, lock);
+		set_current_state(TASK_RUNNING);
+		remove_wait_queue(&lock->cll_wq, &waiter);
+
+		/* Restore old blocked signals */
+		cfs_restore_sigs(blocked);
+	}
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_state_wait);
+
+static void cl_lock_state_signal(const struct lu_env *env, struct cl_lock *lock,
+				 enum cl_lock_state state)
+{
+	const struct cl_lock_slice *layer;
+
+	ENTRY;
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+	list_for_each_entry(layer, &lock->cll_layers, cls_linkage) {
+		if (layer->cls_ops->clo_state != NULL)
+			layer->cls_ops->clo_state(env, layer, state);
+	}
+	wake_up_all(&lock->cll_wq);
+	EXIT;
+}
+
+/**
+ * Notifies waiters that lock state changed.
+ *
+ * Wakes up all waiters sleeping in cl_lock_state_wait(), also notifies all
+ * layers about state change by calling cl_lock_operations::clo_state()
+ * top-to-bottom.
+ */
+void cl_lock_signal(const struct lu_env *env, struct cl_lock *lock)
+{
+	ENTRY;
+	cl_lock_trace(D_DLMTRACE, env, "state signal lock", lock);
+	cl_lock_state_signal(env, lock, lock->cll_state); /* state unchanged */
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_signal);
+
+/**
+ * Changes lock state.
+ *
+ * This function is invoked to notify layers that lock state changed, possible
+ * as a result of an asynchronous event such as call-back reception.
+ *
+ * \post lock->cll_state == state
+ *
+ * \see cl_lock_operations::clo_state()
+ */
+void cl_lock_state_set(const struct lu_env *env, struct cl_lock *lock,
+		       enum cl_lock_state state)
+{
+	ENTRY;
+	LASSERT(lock->cll_state <= state ||
+		(lock->cll_state == CLS_CACHED &&
+		 (state == CLS_HELD || /* lock found in cache */
+		  state == CLS_NEW  ||   /* sub-lock canceled */
+		  state == CLS_INTRANSIT)) ||
+		/* lock is in transit state */
+		lock->cll_state == CLS_INTRANSIT);
+
+	if (lock->cll_state != state) {
+		CS_LOCKSTATE_DEC(lock->cll_descr.cld_obj, lock->cll_state);
+		CS_LOCKSTATE_INC(lock->cll_descr.cld_obj, state);
+		/* layers see the new state before cll_state itself changes */
+		cl_lock_state_signal(env, lock, state);
+		lock->cll_state = state;
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_state_set);
+
+static int cl_unuse_try_internal(const struct lu_env *env, struct cl_lock *lock)
+{
+	const struct cl_lock_slice *layer;
+	int rc;
+
+	do {
+		LINVRNT(cl_lock_is_mutexed(lock));
+		LINVRNT(cl_lock_invariant(env, lock));
+		LASSERT(lock->cll_state == CLS_INTRANSIT);
+
+		/* -ENOSYS survives only if no layer provides ->clo_unuse() */
+		rc = -ENOSYS;
+		list_for_each_entry_reverse(layer, &lock->cll_layers,
+						cls_linkage) {
+			if (layer->cls_ops->clo_unuse == NULL)
+				continue;
+
+			rc = layer->cls_ops->clo_unuse(env, layer);
+			if (rc != 0)
+				break;
+		}
+		LASSERT(rc != -ENOSYS);
+	} while (rc == CLO_REPEAT);
+
+	return rc;
+}
+
+/**
+ * Yanks lock from the cache (cl_lock_state::CLS_CACHED state) by calling
+ * cl_lock_operations::clo_use() top-to-bottom to notify layers.
+ * If \a atomic is non-zero, a failed use is backed off by unusing the lock
+ * again, so that the use either fully happens or not at all.
+ */
+int cl_use_try(const struct lu_env *env, struct cl_lock *lock, int atomic)
+{
+	const struct cl_lock_slice *slice;
+	int result;
+	enum cl_lock_state state;
+
+	ENTRY;
+	cl_lock_trace(D_DLMTRACE, env, "use lock", lock);
+
+	LASSERT(lock->cll_state == CLS_CACHED);
+	if (lock->cll_error)
+		RETURN(lock->cll_error);
+
+	result = -ENOSYS;	/* must be replaced by some layer's ->clo_use() */
+	state = cl_lock_intransit(env, lock);
+	list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+		if (slice->cls_ops->clo_use != NULL) {
+			result = slice->cls_ops->clo_use(env, slice);
+			if (result != 0)
+				break;
+		}
+	}
+	LASSERT(result != -ENOSYS);
+
+	LASSERTF(lock->cll_state == CLS_INTRANSIT, "Wrong state %d.\n",
+		 lock->cll_state);
+
+	if (result == 0) {
+		state = CLS_HELD;
+	} else {
+		if (result == -ESTALE) {
+			/*
+			 * ESTALE means sublock being cancelled
+			 * at this time, and set lock state to
+			 * be NEW here and ask the caller to repeat.
+			 */
+			state = CLS_NEW;
+			result = CLO_REPEAT;
+		}
+
+		/* @atomic means back-off-on-failure. */
+		if (atomic) {
+			int rc;
+			rc = cl_unuse_try_internal(env, lock);
+			/* Vet the results. */
+			if (rc < 0 && result > 0)
+				result = rc;
+		}
+
+	}
+	cl_lock_extransit(env, lock, state); /* HELD, NEW or the entry state */
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_use_try);
+
+/**
+ * Invokes ->clo_enqueue() on the layers of \a lock, top-to-bottom, stopping
+ * at the first layer that returns non-zero. Helper for cl_enqueue_try().
+ */
+static int cl_enqueue_kick(const struct lu_env *env,
+			   struct cl_lock *lock,
+			   struct cl_io *io, __u32 flags)
+{
+	const struct cl_lock_slice *layer;
+	int rc = -ENOSYS;
+
+	ENTRY;
+	list_for_each_entry(layer, &lock->cll_layers, cls_linkage) {
+		/* layers without the method are skipped */
+		if (layer->cls_ops->clo_enqueue == NULL)
+			continue;
+		rc = layer->cls_ops->clo_enqueue(env, layer, io, flags);
+		if (rc != 0)
+			break;
+	}
+	/* at least one layer must implement ->clo_enqueue() */
+	LASSERT(rc != -ENOSYS);
+	RETURN(rc);
+}
+
+/**
+ * Tries to enqueue a lock.
+ *
+ * This function is called repeatedly by cl_enqueue() until either lock is
+ * enqueued, or error occurs. This function does not block waiting for
+ * networking communication to complete.
+ *
+ * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+ *			 lock->cll_state == CLS_HELD)
+ *
+ * \see cl_enqueue() cl_lock_operations::clo_enqueue()
+ * \see cl_lock_state::CLS_ENQUEUED
+ */
+int cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock,
+		   struct cl_io *io, __u32 flags)
+{
+	int result;
+
+	ENTRY;
+	cl_lock_trace(D_DLMTRACE, env, "enqueue lock", lock);
+	do {
+		LINVRNT(cl_lock_is_mutexed(lock));
+
+		result = lock->cll_error; /* a failed lock is not enqueued */
+		if (result != 0)
+			break;
+
+		switch (lock->cll_state) {
+		case CLS_NEW:
+			cl_lock_state_set(env, lock, CLS_QUEUING);
+			/* fall-through */
+		case CLS_QUEUING:
+			/* kick layers: ->clo_enqueue() on each of them. */
+			result = cl_enqueue_kick(env, lock, io, flags);
+			/* For AGL case, the cl_lock::cll_state may
+			 * become CLS_HELD already. */
+			if (result == 0 && lock->cll_state == CLS_QUEUING)
+				cl_lock_state_set(env, lock, CLS_ENQUEUED);
+			break;
+		case CLS_INTRANSIT:
+			LASSERT(cl_lock_is_intransit(lock));
+			result = CLO_WAIT; /* caller waits for a state change */
+			break;
+		case CLS_CACHED:
+			/* yank lock from the cache, without back-off. */
+			result = cl_use_try(env, lock, 0);
+			break;
+		case CLS_ENQUEUED:
+		case CLS_HELD:
+			result = 0; /* already enqueued */
+			break;
+		default:
+		case CLS_FREEING:
+			/*
+			 * impossible, only held locks with increased
+			 * ->cll_holds can be enqueued, and they cannot be
+			 * freed.
+			 */
+			LBUG();
+		}
+	} while (result == CLO_REPEAT);
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_enqueue_try);
+
+/**
+ * Cancels the conflicting lock found during previous enqueue and waits for it
+ * to reach CLS_FREEING; re-takes the mutex of \a lock iff \a keep_mutex.
+ * \retval 0 conflicting lock has been canceled.
+ * \retval -ve error code.
+ */
+int cl_lock_enqueue_wait(const struct lu_env *env,
+			 struct cl_lock *lock,
+			 int keep_mutex)
+{
+	struct cl_lock  *conflict;
+	int	      rc = 0;
+	ENTRY;
+
+	LASSERT(cl_lock_is_mutexed(lock));
+	LASSERT(lock->cll_state == CLS_QUEUING);
+	LASSERT(lock->cll_conflict != NULL);
+
+	conflict = lock->cll_conflict;
+	lock->cll_conflict = NULL; /* reference now owned by 'conflict' */
+
+	cl_lock_mutex_put(env, lock);
+	LASSERT(cl_lock_nr_mutexed(env) == 0);
+
+	cl_lock_mutex_get(env, conflict);
+	cl_lock_trace(D_DLMTRACE, env, "enqueue wait", conflict);
+	cl_lock_cancel(env, conflict);
+	cl_lock_delete(env, conflict);
+
+	while (conflict->cll_state != CLS_FREEING) {
+		rc = cl_lock_state_wait(env, conflict);
+		if (rc != 0)
+			break; /* stop waiting on the first failure */
+	}
+	cl_lock_mutex_put(env, conflict);
+	lu_ref_del(&conflict->cll_reference, "cancel-wait", lock);
+	cl_lock_put(env, conflict);
+
+	if (keep_mutex)
+		cl_lock_mutex_get(env, lock);
+
+	LASSERT(rc <= 0);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_lock_enqueue_wait);
+
+static int cl_enqueue_locked(const struct lu_env *env, struct cl_lock *lock,
+			     struct cl_io *io, __u32 enqflags)
+{
+	int result;
+
+	ENTRY;
+
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+	LASSERT(lock->cll_holds > 0);
+
+	cl_lock_user_add(env, lock); /* undone by cl_unuse_try() on failure */
+	do {
+		result = cl_enqueue_try(env, lock, io, enqflags);
+		if (result == CLO_WAIT) {
+			/* either cancel the conflicting lock or wait for a
+			 * state change, then try again */
+			if (lock->cll_conflict != NULL)
+				result = cl_lock_enqueue_wait(env, lock, 1);
+			else
+				result = cl_lock_state_wait(env, lock);
+			if (result == 0)
+				continue; /* wait succeeded: retry enqueue */
+		}
+		break;
+	} while (1);
+	if (result != 0)
+		cl_unuse_try(env, lock);
+	LASSERT(ergo(result == 0 && !(enqflags & CEF_AGL),
+		     lock->cll_state == CLS_ENQUEUED ||
+		     lock->cll_state == CLS_HELD));
+	RETURN(result);
+}
+
+/**
+ * Enqueues a lock.
+ *
+ * \pre current thread or io owns a hold on lock.
+ *
+ * \post ergo(result == 0, lock->cll_users increased)
+ * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+ *			 lock->cll_state == CLS_HELD)
+ */
+int cl_enqueue(const struct lu_env *env, struct cl_lock *lock,
+	       struct cl_io *io, __u32 enqflags)
+{
+	int result;
+
+	ENTRY;
+
+	cl_lock_lockdep_acquire(env, lock, enqflags);
+	cl_lock_mutex_get(env, lock);
+	result = cl_enqueue_locked(env, lock, io, enqflags);
+	cl_lock_mutex_put(env, lock);
+	if (result != 0)
+		cl_lock_lockdep_release(env, lock); /* undo acquire */
+	LASSERT(ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+		     lock->cll_state == CLS_HELD));
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_enqueue);
+
+/**
+ * Tries to unlock a lock.
+ *
+ * This function is called to release underlying resource:
+ * 1. for top lock, the resource is sublocks it held;
+ * 2. for sublock, the resource is the reference to dlmlock.
+ *
+ * cl_unuse_try is a one-shot operation, so it must NOT return CLO_WAIT.
+ *
+ * \see cl_unuse() cl_lock_operations::clo_unuse()
+ * \see cl_lock_state::CLS_CACHED
+ */
+int cl_unuse_try(const struct lu_env *env, struct cl_lock *lock)
+{
+	int			 result;
+	enum cl_lock_state	  state = CLS_NEW;
+
+	ENTRY;
+	cl_lock_trace(D_DLMTRACE, env, "unuse lock", lock);
+
+	if (lock->cll_users > 1) {
+		/* other users remain: only drop this user */
+		cl_lock_user_del(env, lock);
+		RETURN(0);
+	}
+
+	/* Only if the lock is in CLS_HELD or CLS_ENQUEUED state, it can hold
+	 * underlying resources. */
+	if (!(lock->cll_state == CLS_HELD || lock->cll_state == CLS_ENQUEUED)) {
+		cl_lock_user_del(env, lock);
+		RETURN(0);
+	}
+
+	/*
+	 * New lock users (->cll_users) are not protecting unlocking
+	 * from proceeding. From this point, lock eventually reaches
+	 * CLS_CACHED, is reinitialized to CLS_NEW or fails into
+	 * CLS_FREEING.
+	 */
+	state = cl_lock_intransit(env, lock);
+
+	result = cl_unuse_try_internal(env, lock);
+	LASSERT(lock->cll_state == CLS_INTRANSIT);
+	LASSERT(result != CLO_WAIT);
+	cl_lock_user_del(env, lock);
+	if (result == 0 || result == -ESTALE) {
+		/*
+		 * Return lock back to the cache. This is the only
+		 * place where lock is moved into CLS_CACHED state.
+		 *
+		 * If one of ->clo_unuse() methods returned -ESTALE, lock
+		 * cannot be placed into cache and has to be
+		 * re-initialized. This happens e.g., when a sub-lock was
+		 * canceled while unlocking was in progress.
+		 */
+		if (state == CLS_HELD && result == 0)
+			state = CLS_CACHED;
+		else
+			state = CLS_NEW;
+		cl_lock_extransit(env, lock, state);
+
+		/*
+		 * Hide -ESTALE error.
+		 * If the lock is a glimpse lock, and it has multiple
+		 * stripes. Assuming that one of its sublock returned -ENAVAIL,
+		 * and other sublocks are matched write locks. In this case,
+		 * we can't set this lock to error because otherwise some of
+		 * its sublocks may not be canceled, and then some dirty
+		 * pages would not be written to OSTs. -jay
+		 */
+		result = 0;
+	} else {
+		CERROR("result = %d, this is unlikely!\n", result);
+		state = CLS_NEW;
+		cl_lock_extransit(env, lock, state);
+	}
+	RETURN(result ?: lock->cll_error);
+}
+EXPORT_SYMBOL(cl_unuse_try);
+
+/* Unuses \a lock (mutex held by caller) and logs a non-zero result. */
+static void cl_unuse_locked(const struct lu_env *env, struct cl_lock *lock)
+{
+	int rc;
+
+	ENTRY;
+	rc = cl_unuse_try(env, lock);
+	if (rc != 0)
+		CL_LOCK_DEBUG(D_ERROR, env, lock, "unuse return %d\n", rc);
+	EXIT;
+}
+
+/**
+ * Unlocks a lock: unuses it under its mutex, then drops the lockdep record.
+ */
+void cl_unuse(const struct lu_env *env, struct cl_lock *lock)
+{
+	ENTRY;
+	cl_lock_mutex_get(env, lock);
+	cl_unuse_locked(env, lock);
+	cl_lock_mutex_put(env, lock);
+	cl_lock_lockdep_release(env, lock);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_unuse);
+
+/**
+ * Tries to wait for a lock.
+ *
+ * This function is called repeatedly by cl_wait() until either lock is
+ * granted, or error occurs. This function does not block waiting for network
+ * communication to complete.
+ *
+ * \see cl_wait() cl_lock_operations::clo_wait()
+ * \see cl_lock_state::CLS_HELD
+ */
+int cl_wait_try(const struct lu_env *env, struct cl_lock *lock)
+{
+	const struct cl_lock_slice *slice;
+	int			 result;
+
+	ENTRY;
+	cl_lock_trace(D_DLMTRACE, env, "wait lock try", lock);
+	do {
+		LINVRNT(cl_lock_is_mutexed(lock));
+		LINVRNT(cl_lock_invariant(env, lock));
+		LASSERTF(lock->cll_state == CLS_QUEUING ||
+			 lock->cll_state == CLS_ENQUEUED ||
+			 lock->cll_state == CLS_HELD ||
+			 lock->cll_state == CLS_INTRANSIT,
+			 "lock state: %d\n", lock->cll_state);
+		LASSERT(lock->cll_users > 0);
+		LASSERT(lock->cll_holds > 0);
+
+		result = lock->cll_error; /* a failed lock is not waited for */
+		if (result != 0)
+			break;
+
+		if (cl_lock_is_intransit(lock)) {
+			result = CLO_WAIT; /* caller waits for a state change */
+			break;
+		}
+
+		if (lock->cll_state == CLS_HELD)
+			/* already granted: nothing to do, result is 0 */
+			break;
+
+		result = -ENOSYS; /* stays if no layer has ->clo_wait() */
+		list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+			if (slice->cls_ops->clo_wait != NULL) {
+				result = slice->cls_ops->clo_wait(env, slice);
+				if (result != 0)
+					break;
+			}
+		}
+		LASSERT(result != -ENOSYS);
+		if (result == 0) {
+			LASSERT(lock->cll_state != CLS_INTRANSIT);
+			cl_lock_state_set(env, lock, CLS_HELD);
+		}
+	} while (result == CLO_REPEAT);
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_wait_try);
+
+/**
+ * Waits until enqueued lock is granted.
+ *
+ * \pre current thread or io owns a hold on the lock
+ * \pre lock->cll_state == CLS_ENQUEUED ||
+ *      lock->cll_state == CLS_HELD
+ *
+ * \post ergo(result == 0, lock->cll_state == CLS_HELD)
+ */
+int cl_wait(const struct lu_env *env, struct cl_lock *lock)
+{
+	int result;
+
+	ENTRY;
+	cl_lock_mutex_get(env, lock);
+
+	LINVRNT(cl_lock_invariant(env, lock));
+	LASSERTF(lock->cll_state == CLS_ENQUEUED || lock->cll_state == CLS_HELD,
+		 "Wrong state %d \n", lock->cll_state);
+	LASSERT(lock->cll_holds > 0);
+
+	do {
+		result = cl_wait_try(env, lock);
+		if (result == CLO_WAIT) {
+			result = cl_lock_state_wait(env, lock);
+			if (result == 0)
+				continue; /* state changed: try again */
+		}
+		break;
+	} while (1);
+	if (result < 0) {
+		/* on error the lock is unused and its lockdep record dropped */
+		cl_unuse_try(env, lock);
+		cl_lock_lockdep_release(env, lock);
+	}
+	cl_lock_trace(D_DLMTRACE, env, "wait lock", lock);
+	cl_lock_mutex_put(env, lock);
+	LASSERT(ergo(result == 0, lock->cll_state == CLS_HELD));
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_wait);
+
+/**
+ * Estimates the value of \a lock as the sum of cl_lock_operations::clo_weigh()
+ * over its layers; the sum saturates at ~0UL instead of wrapping.
+ */
+unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock)
+{
+	const struct cl_lock_slice *layer;
+	unsigned long total = 0;
+	unsigned long part;
+
+	ENTRY;
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+
+	list_for_each_entry_reverse(layer, &lock->cll_layers, cls_linkage) {
+		if (layer->cls_ops->clo_weigh == NULL)
+			continue;
+		part = layer->cls_ops->clo_weigh(env, layer);
+		if (total + part < part) /* wrapped: saturate */
+			total = ~0UL;
+		else
+			total += part;
+	}
+	RETURN(total);
+}
+EXPORT_SYMBOL(cl_lock_weigh);
+
+/**
+ * Notifies layers that lock description changed.
+ *
+ * The server can grant client a lock different from one that was requested
+ * (e.g., larger in extent). This method is called when the actually granted
+ * lock description becomes known, to let layers accommodate the changed
+ * description.
+ *
+ * \see cl_lock_operations::clo_modify()
+ */
+int cl_lock_modify(const struct lu_env *env, struct cl_lock *lock,
+		   const struct cl_lock_descr *desc)
+{
+	const struct cl_lock_slice *slice;
+	struct cl_object	   *obj = lock->cll_descr.cld_obj;
+	struct cl_object_header    *hdr = cl_object_header(obj);
+	int result;
+
+	ENTRY;
+	cl_lock_trace(D_DLMTRACE, env, "modify lock", lock);
+	/* don't allow object to change */
+	LASSERT(obj == desc->cld_obj);
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+
+	list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) {
+		if (slice->cls_ops->clo_modify != NULL) {
+			result = slice->cls_ops->clo_modify(env, slice, desc);
+			if (result != 0)
+				RETURN(result); /* description is left as is */
+		}
+	}
+	CL_LOCK_DEBUG(D_DLMTRACE, env, lock, " -> "DDESCR"@"DFID"\n",
+		      PDESCR(desc), PFID(lu_object_fid(&desc->cld_obj->co_lu)));
+	/*
+	 * Just replace description in place. Nothing more is needed for
+	 * now. If locks were indexed according to their extent and/or mode,
+	 * that index would have to be updated here.
+	 */
+	spin_lock(&hdr->coh_lock_guard);
+	lock->cll_descr = *desc;
+	spin_unlock(&hdr->coh_lock_guard);
+	RETURN(0);
+}
+EXPORT_SYMBOL(cl_lock_modify);
+
+/**
+ * Prepares an empty \a closure: no enclosed locks, \a origin as its origin
+ * and \a wait stored as its wait flag.
+ * \see cl_lock_closure
+ */
+void cl_lock_closure_init(const struct lu_env *env,
+			  struct cl_lock_closure *closure,
+			  struct cl_lock *origin, int wait)
+{
+	LINVRNT(cl_lock_is_mutexed(origin));
+	LINVRNT(cl_lock_invariant(env, origin));
+
+	closure->clc_origin = origin;
+	closure->clc_wait = wait;
+	closure->clc_nr = 0;
+	INIT_LIST_HEAD(&closure->clc_list);
+}
+EXPORT_SYMBOL(cl_lock_closure_init);
+
+/**
+ * Builds a closure of \a lock.
+ *
+ * Building of a closure consists of adding initial lock (\a lock) into it,
+ * and calling cl_lock_operations::clo_closure() methods of \a lock. These
+ * methods might call cl_lock_closure_build() recursively again, adding more
+ * locks to the closure, etc. On any failure the closure is emptied again.
+ *
+ * \see cl_lock_closure
+ */
+int cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock,
+			  struct cl_lock_closure *closure)
+{
+	const struct cl_lock_slice *slice;
+	int result;
+
+	ENTRY;
+	LINVRNT(cl_lock_is_mutexed(closure->clc_origin));
+	LINVRNT(cl_lock_invariant(env, closure->clc_origin));
+
+	result = cl_lock_enclosure(env, lock, closure);
+	if (result == 0) {
+		list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+			if (slice->cls_ops->clo_closure != NULL) {
+				result = slice->cls_ops->clo_closure(env, slice,
+								     closure);
+				if (result != 0)
+					break;
+			}
+		}
+	}
+	if (result != 0)
+		cl_lock_disclosure(env, closure); /* undo partial closure */
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_closure_build);
+
+/**
+ * Adds new lock to a closure.
+ *
+ * Try-locks \a lock and if succeeded, adds it to the closure (never more than
+ * once). If try-lock failed, empties the closure and returns CLO_REPEAT, after
+ * optionally (closure->clc_wait) waiting until \a lock has been unlocked.
+ */
+int cl_lock_enclosure(const struct lu_env *env, struct cl_lock *lock,
+		      struct cl_lock_closure *closure)
+{
+	int result = 0;
+	ENTRY;
+	cl_lock_trace(D_DLMTRACE, env, "enclosure lock", lock);
+	if (!cl_lock_mutex_try(env, lock)) {
+		/*
+		 * If lock->cll_inclosure is not empty, lock is already in
+		 * this closure.
+		 */
+		if (list_empty(&lock->cll_inclosure)) {
+			cl_lock_get_trust(lock);
+			lu_ref_add(&lock->cll_reference, "closure", closure);
+			list_add(&lock->cll_inclosure, &closure->clc_list);
+			closure->clc_nr++;
+		} else
+			cl_lock_mutex_put(env, lock); /* undo the try-lock */
+		result = 0;
+	} else {
+		cl_lock_disclosure(env, closure);
+		if (closure->clc_wait) {
+			cl_lock_get_trust(lock);
+			lu_ref_add(&lock->cll_reference, "closure-w", closure);
+			cl_lock_mutex_put(env, closure->clc_origin);
+
+			LASSERT(cl_lock_nr_mutexed(env) == 0);
+			cl_lock_mutex_get(env, lock); /* wait for holder */
+			cl_lock_mutex_put(env, lock);
+
+			cl_lock_mutex_get(env, closure->clc_origin);
+			lu_ref_del(&lock->cll_reference, "closure-w", closure);
+			cl_lock_put(env, lock);
+		}
+		result = CLO_REPEAT;
+	}
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_enclosure);
+
+/** Unlinks every enclosed lock, releases its mutex and drops its reference. */
+void cl_lock_disclosure(const struct lu_env *env,
+			struct cl_lock_closure *closure)
+{
+	struct cl_lock *item;
+	struct cl_lock *next;
+
+	cl_lock_trace(D_DLMTRACE, env, "disclosure lock", closure->clc_origin);
+	list_for_each_entry_safe(item, next, &closure->clc_list,
+				 cll_inclosure) {
+		list_del_init(&item->cll_inclosure);
+		cl_lock_mutex_put(env, item);
+		lu_ref_del(&item->cll_reference, "closure", closure);
+		cl_lock_put(env, item);
+		closure->clc_nr--;
+	}
+	LASSERT(closure->clc_nr == 0);
+}
+EXPORT_SYMBOL(cl_lock_disclosure);
+
+/** Finalizes a closure, which must already be empty. */
+void cl_lock_closure_fini(struct cl_lock_closure *closure)
+{
+	LASSERT(closure->clc_nr == 0);
+	LASSERT(list_empty(&closure->clc_list));
+}
+EXPORT_SYMBOL(cl_lock_closure_fini);
+
+/**
+ * Destroys this lock. Notifies layers (bottom-to-top) that lock is being
+ * destroyed, then destroy the lock. If there are holds on the lock, postpone
+ * destruction until all holds are released. This is called when a decision is
+ * made to destroy the lock in the future. E.g., when a blocking AST is
+ * received on it, or fatal communication error happens.
+ *
+ * Caller must have a reference on this lock to prevent a situation, when
+ * deleted lock lingers in memory for indefinite time, because nobody calls
+ * cl_lock_put() to finish it.
+ *
+ * \pre atomic_read(&lock->cll_ref) > 0
+ * \pre ergo(cl_lock_nesting(lock) == CNL_TOP,
+ *	   cl_lock_nr_mutexed(env) == 1)
+ *      [i.e., if a top-lock is deleted, mutexes of no other locks can be
+ *      held, as deletion of sub-locks might require releasing a top-lock
+ *      mutex]
+ *
+ * \see cl_lock_operations::clo_delete()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock)
+{
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+	LASSERT(ergo(cl_lock_nesting(lock) == CNL_TOP,
+		     cl_lock_nr_mutexed(env) == 1));
+
+	ENTRY;
+	cl_lock_trace(D_DLMTRACE, env, "delete lock", lock);
+	if (lock->cll_holds == 0)
+		cl_lock_delete0(env, lock);
+	else
+		lock->cll_flags |= CLF_DOOMED; /* delete once unheld */
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_delete);
+
+/**
+ * Mark lock as irrecoverably failed, and mark it for destruction. This
+ * happens when, e.g., server fails to grant a lock to us, or networking
+ * time-out happens.
+ *
+ * \pre atomic_read(&lock->cll_ref) > 0
+ *
+ * \see cl_lock_delete()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_error(const struct lu_env *env, struct cl_lock *lock, int error)
+{
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+
+	ENTRY;
+	/* only the first non-zero error is recorded */
+	if (lock->cll_error == 0 && error != 0) {
+		cl_lock_trace(D_DLMTRACE, env, "set lock error", lock);
+		lock->cll_error = error;
+		cl_lock_signal(env, lock);
+		cl_lock_cancel(env, lock);
+		cl_lock_delete(env, lock);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_error);
+
+/**
+ * Cancels this lock. Notifies layers
+ * (bottom-to-top) that lock is being cancelled. If there are holds on the
+ * lock, cancellation is postponed (CLF_CANCELPEND is set) until
+ * all holds are released.
+ *
+ * Cancellation notification is delivered to layers at most once.
+ *
+ * \see cl_lock_operations::clo_cancel()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock)
+{
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+
+	ENTRY;
+	cl_lock_trace(D_DLMTRACE, env, "cancel lock", lock);
+	if (lock->cll_holds == 0)
+		cl_lock_cancel0(env, lock);
+	else
+		lock->cll_flags |= CLF_CANCELPEND; /* cancel once unheld */
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_cancel);
+
+/**
+ * Finds a lock covering \a index that is not \a except. Non-zero \a pending /
+ * \a canceld also admit locks flagged CLF_CANCELPEND / CLF_CANCELLED.
+ */
+struct cl_lock *cl_lock_at_pgoff(const struct lu_env *env,
+				 struct cl_object *obj, pgoff_t index,
+				 struct cl_lock *except,
+				 int pending, int canceld)
+{
+	struct cl_object_header *head;
+	struct cl_lock	  *scan;
+	struct cl_lock	  *lock;
+	struct cl_lock_descr    *need;
+
+	ENTRY;
+
+	head = cl_object_header(obj);
+	need = &cl_env_info(env)->clt_descr;
+	lock = NULL;
+
+	need->cld_mode = CLM_READ; /* CLM_READ matches both READ & WRITE, but
+				    * not PHANTOM */
+	need->cld_start = need->cld_end = index;
+	need->cld_enq_flags = 0;
+
+	spin_lock(&head->coh_lock_guard);
+	/* It is fine to match any group lock since there could be only one
+	 * with a unique gid and it conflicts with all other lock modes too */
+	list_for_each_entry(scan, &head->coh_locks, cll_linkage) {
+		if (scan != except &&
+		    (scan->cll_descr.cld_mode == CLM_GROUP ||
+		    cl_lock_ext_match(&scan->cll_descr, need)) &&
+		    scan->cll_state >= CLS_HELD &&
+		    scan->cll_state < CLS_FREEING &&
+		    /*
+		     * This check is racy as the lock can be canceled right
+		     * after it is done, but this is fine, because page exists
+		     * already.
+		     */
+		    (canceld || !(scan->cll_flags & CLF_CANCELLED)) &&
+		    (pending || !(scan->cll_flags & CLF_CANCELPEND))) {
+			/* Don't increase cs_hit here since this
+			 * is just a helper function. */
+			cl_lock_get_trust(scan);
+			lock = scan;
+			break; /* the first match is returned */
+		}
+	}
+	spin_unlock(&head->coh_lock_guard);
+	RETURN(lock);
+}
+EXPORT_SYMBOL(cl_lock_at_pgoff);
+
+/**
+ * Returns the index of \a page as seen at the layer \a lock belongs to.
+ * At the time of this writing, @page is top page and @lock is sub lock.
+ */
+static pgoff_t pgoff_at_lock(struct cl_page *page, struct cl_lock *lock)
+{
+	const struct cl_page_slice *layer_page;
+
+	/* slice lookup is keyed by the device type of the lock's object */
+	layer_page = cl_page_at(page,
+				lock->cll_descr.cld_obj->co_lu.lo_dev->ld_type);
+	LASSERT(layer_page != NULL);
+	return layer_page->cpl_page->cp_index;
+}
+
+/**
+ * Gang-lookup callback: keeps @page if another lock covers it, else discards it.
+ */
+static int check_and_discard_cb(const struct lu_env *env, struct cl_io *io,
+				struct cl_page *page, void *cbdata)
+{
+	struct cl_thread_info *info = cl_env_info(env);
+	struct cl_lock *lock = cbdata;
+	pgoff_t index = pgoff_at_lock(page, lock);
+
+	if (index >= info->clt_fn_index) {
+		struct cl_lock *tmp;
+
+		/* refresh non-overlapped index */
+		tmp = cl_lock_at_pgoff(env, lock->cll_descr.cld_obj, index,
+					lock, 1, 0);
+		if (tmp != NULL) {
+			/* Cache the first-non-overlapped index so as to skip
+			 * all pages within [index, clt_fn_index). This
+			 * is safe because if tmp lock is canceled, it will
+			 * discard these pages. */
+			info->clt_fn_index = tmp->cll_descr.cld_end + 1;
+			if (tmp->cll_descr.cld_end == CL_PAGE_EOF)
+				info->clt_fn_index = CL_PAGE_EOF;
+			cl_lock_put(env, tmp);
+		} else if (cl_page_own(env, io, page) == 0) {
+			/* no other lock covers the page: discard it */
+			cl_page_unmap(env, io, page);
+			cl_page_discard(env, io, page);
+			cl_page_disown(env, io, page);
+		} else {
+			LASSERT(page->cp_state == CPS_FREEING);
+		}
+	}
+
+	info->clt_next_index = index + 1; /* where the lookup resumes */
+	return CLP_GANG_OKAY;
+}
+
+static int discard_cb(const struct lu_env *env, struct cl_io *io,
+		      struct cl_page *page, void *cbdata)
+{
+	struct cl_thread_info *info = cl_env_info(env);
+	struct cl_lock *lock   = cbdata;
+
+	LASSERT(lock->cll_descr.cld_mode >= CLM_WRITE);
+	KLASSERT(ergo(page->cp_type == CPT_CACHEABLE,
+		      !PageWriteback(cl_page_vmpage(env, page))));
+	KLASSERT(ergo(page->cp_type == CPT_CACHEABLE,
+		      !PageDirty(cl_page_vmpage(env, page))));
+
+	/* record where the gang lookup resumes */
+	info->clt_next_index = pgoff_at_lock(page, lock) + 1;
+	if (cl_page_own(env, io, page) == 0) {
+		/* discard the page unconditionally */
+		cl_page_unmap(env, io, page);
+		cl_page_discard(env, io, page);
+		cl_page_disown(env, io, page);
+	} else {
+		LASSERT(page->cp_state == CPS_FREEING);
+	}
+
+	return CLP_GANG_OKAY;
+}
+
+/**
+ * Discards pages protected by the given lock. This function traverses radix
+ * tree to find all covered pages and discards them. For a CLM_READ lock, a
+ * page that is also covered by another lock remains in cache.
+ *
+ * If error happens on any step, the process continues anyway (the reasoning
+ * behind this being that lock cancellation cannot be delayed indefinitely).
+ */
+int cl_lock_discard_pages(const struct lu_env *env, struct cl_lock *lock)
+{
+	struct cl_thread_info *info  = cl_env_info(env);
+	struct cl_io	  *io    = &info->clt_io;
+	struct cl_lock_descr  *descr = &lock->cll_descr;
+	cl_page_gang_cb_t      cb;
+	int res;
+	int result;
+
+	LINVRNT(cl_lock_invariant(env, lock));
+	ENTRY;
+
+	io->ci_obj = cl_object_top(descr->cld_obj);
+	io->ci_ignore_layout = 1;
+	result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+	if (result != 0)
+		GOTO(out, result);
+
+	cb = descr->cld_mode == CLM_READ ? check_and_discard_cb : discard_cb;
+	info->clt_fn_index = info->clt_next_index = descr->cld_start;
+	do {
+		res = cl_page_gang_lookup(env, descr->cld_obj, io,
+					  info->clt_next_index, descr->cld_end,
+					  cb, (void *)lock);
+		if (info->clt_next_index > descr->cld_end)
+			break; /* walked past the lock extent */
+
+		if (res == CLP_GANG_RESCHED)
+			cond_resched();
+	} while (res != CLP_GANG_OKAY);
+out:
+	cl_io_fini(env, io);
+	RETURN(result); /* only the cl_io_init() status is reported */
+}
+EXPORT_SYMBOL(cl_lock_discard_pages);
+
+/**
+ * Eliminate all locks for a given object.
+ *
+ * Caller has to guarantee that no lock is in active use.
+ *
+ * \param cancel when this is set, cl_locks_prune() cancels locks before
+ *	       destroying.
+ */
+void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int cancel)
+{
+	struct cl_object_header *head;
+	struct cl_lock	  *lock;
+
+	ENTRY;
+	head = cl_object_header(obj);
+	/*
+	 * If locks are destroyed without cancellation, all pages must be
+	 * already destroyed (as otherwise they will be left unprotected).
+	 */
+	LASSERT(ergo(!cancel,
+		     head->coh_tree.rnode == NULL && head->coh_pages == 0));
+
+	spin_lock(&head->coh_lock_guard);
+	while (!list_empty(&head->coh_locks)) {
+		lock = container_of(head->coh_locks.next,
+				    struct cl_lock, cll_linkage);
+		cl_lock_get_trust(lock);
+		spin_unlock(&head->coh_lock_guard);
+		lu_ref_add(&lock->cll_reference, "prune", current);
+
+again:
+		cl_lock_mutex_get(env, lock);
+		if (lock->cll_state < CLS_FREEING) {
+			LASSERT(lock->cll_users <= 1);
+			if (unlikely(lock->cll_users == 1)) {
+				struct l_wait_info lwi = { 0 };
+
+				cl_lock_mutex_put(env, lock);
+				l_wait_event(lock->cll_wq,
+					     lock->cll_users == 0,
+					     &lwi);
+				goto again; /* re-take mutex and re-check */
+			}
+
+			if (cancel)
+				cl_lock_cancel(env, lock);
+			cl_lock_delete(env, lock);
+		}
+		cl_lock_mutex_put(env, lock);
+		lu_ref_del(&lock->cll_reference, "prune", current);
+		cl_lock_put(env, lock);
+		spin_lock(&head->coh_lock_guard); /* held for list_empty() */
+	}
+	spin_unlock(&head->coh_lock_guard);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_locks_prune);
+
+static struct cl_lock *cl_lock_hold_mutex(const struct lu_env *env,
+					  const struct cl_io *io,
+					  const struct cl_lock_descr *need,
+					  const char *scope, const void *source)
+{
+	struct cl_lock *lock;
+
+	ENTRY;
+
+	while (1) {
+		lock = cl_lock_find(env, io, need);
+		if (IS_ERR(lock))
+			break; /* the error pointer is returned as is */
+		cl_lock_mutex_get(env, lock);
+		if (lock->cll_state < CLS_FREEING &&
+		    !(lock->cll_flags & CLF_CANCELLED)) {
+			cl_lock_hold_mod(env, lock, +1);
+			lu_ref_add(&lock->cll_holders, scope, source);
+			lu_ref_add(&lock->cll_reference, scope, source);
+			break; /* returned mutexed, with a hold */
+		}
+		cl_lock_mutex_put(env, lock);
+		cl_lock_put(env, lock); /* unusable lock: look up again */
+	}
+	RETURN(lock);
+}
+
+/**
+ * Looks up a lock matching \a need and returns it with a reference and a hold
+ * on it, or returns an error pointer.
+ *
+ * Unlike plain cl_lock_find(), the returned lock is guaranteed not to be in
+ * the CLS_FREEING state; its mutex is released before returning.
+ */
+struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io,
+			     const struct cl_lock_descr *need,
+			     const char *scope, const void *source)
+{
+	struct cl_lock *held;
+
+	ENTRY;
+	held = cl_lock_hold_mutex(env, io, need, scope, source);
+	if (IS_ERR(held))
+		RETURN(held);
+
+	cl_lock_mutex_put(env, held);
+	RETURN(held);
+}
+EXPORT_SYMBOL(cl_lock_hold);
+
+/**
+ * Main high-level entry point of cl_lock interface that finds existing or
+ * enqueues new lock matching given description.
+ */
+struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io,
+				const struct cl_lock_descr *need,
+				const char *scope, const void *source)
+{
+	struct cl_lock       *lock;
+	int		   rc;
+	__u32		 enqflags = need->cld_enq_flags;
+
+	ENTRY;
+	do {
+		lock = cl_lock_hold_mutex(env, io, need, scope, source);
+		if (IS_ERR(lock))
+			break;
+
+		rc = cl_enqueue_locked(env, lock, io, enqflags);
+		if (rc == 0) {
+			if (cl_lock_fits_into(env, lock, need, io)) {
+				if (!(enqflags & CEF_AGL)) {
+					cl_lock_mutex_put(env, lock);
+					cl_lock_lockdep_acquire(env, lock,
+								enqflags);
+					break; /* success: lock is returned */
+				}
+				rc = 1; /* AGL succeeded */
+			}
+			cl_unuse_locked(env, lock);
+		}
+		cl_lock_trace(D_DLMTRACE, env,
+			      rc <= 0 ? "enqueue failed" : "agl succeed", lock);
+		cl_lock_hold_release(env, lock, scope, source);
+		cl_lock_mutex_put(env, lock);
+		lu_ref_del(&lock->cll_reference, scope, source);
+		cl_lock_put(env, lock);
+		if (rc > 0) {
+			LASSERT(enqflags & CEF_AGL);
+			lock = NULL; /* AGL returns no lock to the caller */
+		} else if (rc != 0) {
+			lock = ERR_PTR(rc);
+		}
+	} while (rc == 0); /* rc == 0 here: lock did not fit, start over */
+	RETURN(lock);
+}
+EXPORT_SYMBOL(cl_lock_request);
+
+/**
+ * Adds a hold and a reference to a known, mutexed lock that is not being freed.
+ */
+void cl_lock_hold_add(const struct lu_env *env, struct cl_lock *lock,
+		      const char *scope, const void *source)
+{
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+	LASSERT(lock->cll_state != CLS_FREEING);
+
+	ENTRY;
+	cl_lock_hold_mod(env, lock, +1);
+	cl_lock_get(lock);
+	lu_ref_add(&lock->cll_holders, scope, source);
+	lu_ref_add(&lock->cll_reference, scope, source);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_hold_add);
+
+/**
+ * Releases a hold and a reference on a lock, on which caller acquired a
+ * mutex. Unlike cl_lock_release(), the mutex is neither taken nor dropped here.
+ */
+void cl_lock_unhold(const struct lu_env *env, struct cl_lock *lock,
+		    const char *scope, const void *source)
+{
+	LINVRNT(cl_lock_invariant(env, lock));
+	ENTRY;
+	cl_lock_hold_release(env, lock, scope, source);
+	lu_ref_del(&lock->cll_reference, scope, source);
+	cl_lock_put(env, lock);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_unhold);
+
+/**
+ * Releases a hold and a reference on a lock, obtained by cl_lock_hold().
+ */
+void cl_lock_release(const struct lu_env *env, struct cl_lock *lock,
+		     const char *scope, const void *source)
+{
+	LINVRNT(cl_lock_invariant(env, lock));
+	ENTRY;
+	cl_lock_trace(D_DLMTRACE, env, "release lock", lock);
+	cl_lock_mutex_get(env, lock); /* the hold is released under mutex */
+	cl_lock_hold_release(env, lock, scope, source);
+	cl_lock_mutex_put(env, lock);
+	lu_ref_del(&lock->cll_reference, scope, source);
+	cl_lock_put(env, lock);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_release);
+
+void cl_lock_user_add(const struct lu_env *env, struct cl_lock *lock)
+{
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+
+	ENTRY;
+	cl_lock_used_mod(env, lock, +1); /* presumably bumps ->cll_users */
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_user_add);
+
+void cl_lock_user_del(const struct lu_env *env, struct cl_lock *lock)
+{
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+	LASSERT(lock->cll_users > 0);
+
+	ENTRY;
+	cl_lock_used_mod(env, lock, -1);
+	if (lock->cll_users == 0)
+		wake_up_all(&lock->cll_wq); /* wakes cl_locks_prune() */
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_user_del);
+
+/* Maps a lock mode to a one-letter tag; "U" for out-of-range values. */
+const char *cl_lock_mode_name(const enum cl_lock_mode mode)
+{
+	static const char * const names[] = {
+		[CLM_PHANTOM] = "P",
+		[CLM_READ]    = "R",
+		[CLM_WRITE]   = "W",
+		[CLM_GROUP]   = "G"
+	};
+	if (0 <= mode && mode < ARRAY_SIZE(names))
+		return names[mode];
+	return "U";
+}
+EXPORT_SYMBOL(cl_lock_mode_name);
+
+/**
+ * Prints a lock description followed by '@' and the FID of its object.
+ */
+void cl_lock_descr_print(const struct lu_env *env, void *cookie,
+		       lu_printer_t printer,
+		       const struct cl_lock_descr *descr)
+{
+	const struct lu_fid  *fid;
+
+	fid = lu_object_fid(&descr->cld_obj->co_lu);
+	(*printer)(env, cookie, DDESCR"@"DFID, PDESCR(descr), PFID(fid));
+}
+EXPORT_SYMBOL(cl_lock_descr_print);
+
+/**
+ * Prints human readable representation of \a lock through \a printer.
+ */
+void cl_lock_print(const struct lu_env *env, void *cookie,
+		   lu_printer_t printer, const struct cl_lock *lock)
+{
+	const struct cl_lock_slice *slice;
+	(*printer)(env, cookie, "lock@%p[%d %d %d %d %d %08lx] ",
+		   lock, atomic_read(&lock->cll_ref),
+		   lock->cll_state, lock->cll_error, lock->cll_holds,
+		   lock->cll_users, lock->cll_flags);
+	cl_lock_descr_print(env, cookie, printer, &lock->cll_descr);
+	(*printer)(env, cookie, " {\n");
+
+	/* one line per layer: device type name, slice address, layer details */
+	list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+		(*printer)(env, cookie, "    %s@%p: ",
+			   slice->cls_obj->co_lu.lo_dev->ld_type->ldt_name,
+			   slice);
+		if (slice->cls_ops->clo_print != NULL)
+			slice->cls_ops->clo_print(env, cookie, printer, slice);
+		(*printer)(env, cookie, "\n");
+	}
+	(*printer)(env, cookie, "} lock@%p\n", lock);
+}
+EXPORT_SYMBOL(cl_lock_print);
+
+int cl_lock_init(void)
+{
+	return lu_kmem_init(cl_lock_caches); /* status is passed through */
+}
+
+void cl_lock_fini(void)
+{
+	lu_kmem_fini(cl_lock_caches); /* counterpart of cl_lock_init() */
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_object.c b/drivers/staging/lustre/lustre/obdclass/cl_object.c
new file mode 100644
index 000000000000..faa9ef63a3f5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/cl_object.c
@@ -0,0 +1,1155 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client Lustre Object.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+/*
+ * Locking.
+ *
+ *  i_mutex
+ *      PG_locked
+ *	  ->coh_page_guard
+ *	  ->coh_lock_guard
+ *	  ->coh_attr_guard
+ *	  ->ls_guard
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/libcfs/libcfs.h>
+/* class_put_type() */
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <linux/list.h>
+#include <linux/libcfs/libcfs_hash.h> /* for cfs_hash stuff */
+#include <cl_object.h>
+#include "cl_internal.h"
+
+static struct kmem_cache *cl_env_kmem;
+
+/** Lock class of cl_object_header::coh_page_guard */
+static struct lock_class_key cl_page_guard_class;
+/** Lock class of cl_object_header::coh_lock_guard */
+static struct lock_class_key cl_lock_guard_class;
+/** Lock class of cl_object_header::coh_attr_guard */
+static struct lock_class_key cl_attr_guard_class;
+
+extern __u32 lu_context_tags_default;
+extern __u32 lu_session_tags_default;
+/**
+ * Initialize cl_object_header.
+ */
+int cl_object_header_init(struct cl_object_header *h)
+{
+	int rc;
+
+	ENTRY;
+	rc = lu_object_header_init(&h->coh_lu);
+	if (rc != 0)
+		RETURN(rc);
+	spin_lock_init(&h->coh_page_guard);
+	lockdep_set_class(&h->coh_page_guard, &cl_page_guard_class);
+	spin_lock_init(&h->coh_lock_guard);
+	lockdep_set_class(&h->coh_lock_guard, &cl_lock_guard_class);
+	spin_lock_init(&h->coh_attr_guard);
+	lockdep_set_class(&h->coh_attr_guard, &cl_attr_guard_class);
+	h->coh_pages = 0;
+	/* XXX hard coded GFP_* mask. */
+	INIT_RADIX_TREE(&h->coh_tree, GFP_ATOMIC);
+	INIT_LIST_HEAD(&h->coh_locks);
+	h->coh_page_bufsize = ALIGN(sizeof(struct cl_page), 8);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_object_header_init);
+
+/**
+ * Finalize cl_object_header; asserts that the coh_locks list is empty first.
+ */
+void cl_object_header_fini(struct cl_object_header *h)
+{
+	LASSERT(list_empty(&h->coh_locks));
+	lu_object_header_fini(&h->coh_lu);
+}
+EXPORT_SYMBOL(cl_object_header_fini);
+
+/**
+ * Returns a cl_object with a given \a fid on device \a cd.
+ *
+ * Returns either cached or newly created object. Additional reference on the
+ * returned object is acquired. Annotated with might_sleep().
+ *
+ * \see lu_object_find(), cl_page_find(), cl_lock_find()
+ */
+struct cl_object *cl_object_find(const struct lu_env *env,
+				 struct cl_device *cd, const struct lu_fid *fid,
+				 const struct cl_object_conf *c)
+{
+	might_sleep();
+	return lu2cl(lu_object_find_slice(env, cl2lu_dev(cd), fid, &c->coc_lu));
+}
+EXPORT_SYMBOL(cl_object_find);
+
+/**
+ * Releases a reference on \a o by calling lu_object_put() on its lu_object.
+ *
+ * When the last reference is released the object is returned to the cache,
+ * unless lu_object_header_flags::LU_OBJECT_HEARD_BANSHEE is set in its header.
+ *
+ * \see cl_page_put(), cl_lock_put().
+ */
+void cl_object_put(const struct lu_env *env, struct cl_object *o)
+{
+	lu_object_put(env, &o->co_lu);
+}
+EXPORT_SYMBOL(cl_object_put);
+
+/**
+ * Acquires an additional reference to the object \a o via lu_object_get().
+ *
+ * This can only be used to acquire an _additional_ reference, i.e., the caller
+ * must already hold at least one reference to \a o before calling this.
+ *
+ * \see cl_page_get(), cl_lock_get().
+ */
+void cl_object_get(struct cl_object *o)
+{
+	lu_object_get(&o->co_lu);
+}
+EXPORT_SYMBOL(cl_object_get);
+
+/**
+ * Returns the top-object for a given \a o.
+ *
+ * \see cl_page_top(), cl_io_top()
+ */
+struct cl_object *cl_object_top(struct cl_object *o)
+{
+	struct cl_object_header *root;
+	struct cl_object *result;
+
+	/* Climb the coh_parent chain up to the header without a parent. */
+	for (root = cl_object_header(o); root->coh_parent != NULL; )
+		root = root->coh_parent;
+	result = lu2cl(lu_object_top(&root->coh_lu));
+	CDEBUG(D_TRACE, "%p -> %p\n", o, result);
+	return result;
+}
+EXPORT_SYMBOL(cl_object_top);
+
+/**
+ * Returns a pointer to the lock protecting data-attributes of the given
+ * object \a o.
+ *
+ * Data-attributes are protected by the cl_object_header::coh_attr_guard
+ * spin-lock in the top-object, as found by cl_object_top().
+ *
+ * \see cl_attr, cl_object_attr_lock(), cl_object_operations::coo_attr_get().
+ */
+static spinlock_t *cl_object_attr_guard(struct cl_object *o)
+{
+	return &cl_object_header(cl_object_top(o))->coh_attr_guard;
+}
+
+/**
+ * Locks data-attributes.
+ *
+ * Prevents data-attributes from changing until the lock is released by
+ * cl_object_attr_unlock(). This has to be called before calls to
+ * cl_object_attr_get(), cl_object_attr_set(), which assert that it is held.
+ */
+void cl_object_attr_lock(struct cl_object *o)
+{
+	spin_lock(cl_object_attr_guard(o));
+}
+EXPORT_SYMBOL(cl_object_attr_lock);
+
+/**
+ * Releases the data-attributes lock acquired by cl_object_attr_lock().
+ */
+void cl_object_attr_unlock(struct cl_object *o)
+{
+	spin_unlock(cl_object_attr_guard(o));
+}
+EXPORT_SYMBOL(cl_object_attr_unlock);
+
+/**
+ * Returns data-attributes of an object \a obj.
+ *
+ * Every layer is asked (by calling cl_object_operations::coo_attr_get())
+ * top-to-bottom to fill in parts of \a attr that this layer is responsible
+ * for.
+ */
+int cl_object_attr_get(const struct lu_env *env, struct cl_object *obj,
+		       struct cl_attr *attr)
+{
+	struct lu_object_header *top;
+	int rc = 0;
+
+	LASSERT(spin_is_locked(cl_object_attr_guard(obj)));
+	ENTRY;
+
+	top = obj->co_lu.lo_header;
+	list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) {
+		if (obj->co_ops->coo_attr_get == NULL)
+			continue;
+		rc = obj->co_ops->coo_attr_get(env, obj, attr);
+		if (rc == 0)
+			continue;
+		/* Positive result stops the walk but is not an error. */
+		if (rc > 0)
+			rc = 0;
+		break;
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_object_attr_get);
+
+/**
+ * Updates data-attributes of an object \a obj.
+ *
+ * Only attributes, mentioned in a validness bit-mask \a v are
+ * updated. Calls cl_object_operations::coo_attr_set() on every layer, bottom
+ * to top.
+ */
+int cl_object_attr_set(const struct lu_env *env, struct cl_object *obj,
+		       const struct cl_attr *attr, unsigned v)
+{
+	struct lu_object_header *top;
+	int rc = 0;
+
+	LASSERT(spin_is_locked(cl_object_attr_guard(obj)));
+	ENTRY;
+
+	top = obj->co_lu.lo_header;
+	list_for_each_entry_reverse(obj, &top->loh_layers, co_lu.lo_linkage) {
+		/* Layers without a coo_attr_set() method are skipped. */
+		if (obj->co_ops->coo_attr_set == NULL)
+			continue;
+		rc = obj->co_ops->coo_attr_set(env, obj, attr, v);
+		if (rc == 0)
+			continue;
+		/* Positive result stops the walk but is not an error. */
+		if (rc > 0)
+			rc = 0;
+		break;
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_object_attr_set);
+
+/**
+ * Notifies layers (bottom-to-top) that glimpse AST was received.
+ *
+ * Layers have to fill \a lvb fields with information that will be shipped
+ * back to glimpse issuer.
+ *
+ * \see cl_lock_operations::clo_glimpse()
+ */
+int cl_object_glimpse(const struct lu_env *env, struct cl_object *obj,
+		      struct ost_lvb *lvb)
+{
+	struct lu_object_header *top;
+	int rc = 0;
+
+	ENTRY;
+	top = obj->co_lu.lo_header;
+	list_for_each_entry_reverse(obj, &top->loh_layers, co_lu.lo_linkage) {
+		if (obj->co_ops->coo_glimpse == NULL)
+			continue;
+		rc = obj->co_ops->coo_glimpse(env, obj, lvb);
+		/* The first layer reporting non-zero ends the walk. */
+		if (rc != 0)
+			break;
+	}
+	/* Trace whatever the layers left in lvb, success or not. */
+	LU_OBJECT_HEADER(D_DLMTRACE, env, lu_object_top(top),
+			 "size: "LPU64" mtime: "LPU64" atime: "LPU64" "
+			 "ctime: "LPU64" blocks: "LPU64"\n",
+			 lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime,
+			 lvb->lvb_ctime, lvb->lvb_blocks);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_object_glimpse);
+
+/**
+ * Updates a configuration of an object \a obj.
+ */
+int cl_conf_set(const struct lu_env *env, struct cl_object *obj,
+		const struct cl_object_conf *conf)
+{
+	struct lu_object_header *top;
+	int rc = 0;
+
+	ENTRY;
+	top = obj->co_lu.lo_header;
+	/* Offer the configuration to each layer in loh_layers list order. */
+	list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) {
+		if (obj->co_ops->coo_conf_set == NULL)
+			continue;
+		rc = obj->co_ops->coo_conf_set(env, obj, conf);
+		if (rc != 0)
+			break;
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_conf_set);
+
+/**
+ * Helper function removing all object locks, and marking object for
+ * deletion. All object pages must have been deleted at this point.
+ *
+ * This is called by cl_inode_fini() and lov_object_delete() to destroy top-
+ * and sub- objects respectively.
+ */
+void cl_object_kill(const struct lu_env *env, struct cl_object *obj)
+{
+	struct cl_object_header *hdr = cl_object_header(obj);
+
+	/* The page cache of the object must already be empty. */
+	LASSERT(hdr->coh_tree.rnode == NULL);
+	LASSERT(hdr->coh_pages == 0);
+
+	set_bit(LU_OBJECT_HEARD_BANSHEE, &hdr->coh_lu.loh_flags);
+	/*
+	 * Destroy all locks. Object destruction (including cl_inode_fini())
+	 * cannot cancel the locks: with a local client, where client and
+	 * server share the same thread running prune_icache(), cancelling
+	 * here can dead-lock with ldlm_cancel_handler() waiting on
+	 * __wait_on_freeing_inode().
+	 */
+	cl_locks_prune(env, obj, 0);
+}
+EXPORT_SYMBOL(cl_object_kill);
+
+/**
+ * Prunes caches of pages (cl_pages_prune()) and then locks for this object.
+ */
+void cl_object_prune(const struct lu_env *env, struct cl_object *obj)
+{
+	ENTRY;
+	cl_pages_prune(env, obj);
+	cl_locks_prune(env, obj, 1);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_object_prune);
+
+/**
+ * Check if the object has locks.
+ */
+int cl_object_has_locks(struct cl_object *obj)
+{
+	struct cl_object_header *hdr = cl_object_header(obj);
+	int no_locks;
+
+	/* Sample the coh_locks list while holding coh_lock_guard. */
+	spin_lock(&hdr->coh_lock_guard);
+	no_locks = list_empty(&hdr->coh_locks);
+	spin_unlock(&hdr->coh_lock_guard);
+	return !no_locks;
+}
+EXPORT_SYMBOL(cl_object_has_locks);
+
+void cache_stats_init(struct cache_stats *cs, const char *name)
+{
+	int idx;
+
+	cs->cs_name = name;
+	for (idx = CS_NR - 1; idx >= 0; idx--)
+		atomic_set(&cs->cs_stats[idx], 0);
+}
+
+int cache_stats_print(const struct cache_stats *cs,
+		      char *page, int count, int h)
+{
+	int nob = 0;
+	int i;
+	/*
+	 *   lookup    hit    total  cached create
+	 * env: ...... ...... ...... ...... ......
+	 * scnprintf() returns bytes stored, so nob cannot run past count.
+	 */
+	if (h) {
+		const char *names[CS_NR] = CS_NAMES;
+
+		nob += scnprintf(page + nob, count - nob, "%6s", " ");
+		for (i = 0; i < CS_NR; i++)
+			nob += scnprintf(page + nob, count - nob,
+					 "%8s", names[i]);
+		nob += scnprintf(page + nob, count - nob, "\n");
+	}
+	nob += scnprintf(page + nob, count - nob, "%5.5s:", cs->cs_name);
+	for (i = 0; i < CS_NR; i++)
+		nob += scnprintf(page + nob, count - nob, "%8u",
+				 atomic_read(&cs->cs_stats[i]));
+	return nob;
+}
+
+/**
+ * Initialize client site.
+ *
+ * Perform common initialization (lu_site_init()) and, on success, zero the
+ * page/lock statistics and all per-state counters. Returns lu_site_init() rc.
+ */
+int cl_site_init(struct cl_site *s, struct cl_device *d)
+{
+	int i;
+	int result;
+
+	result = lu_site_init(&s->cs_lu, &d->cd_lu_dev);
+	if (result == 0) {
+		cache_stats_init(&s->cs_pages, "pages");
+		cache_stats_init(&s->cs_locks, "locks");
+		for (i = 0; i < ARRAY_SIZE(s->cs_pages_state); ++i)
+			atomic_set(&s->cs_pages_state[i], 0);
+		for (i = 0; i < ARRAY_SIZE(s->cs_locks_state); ++i)
+			atomic_set(&s->cs_locks_state[i], 0);
+	}
+	return result;
+}
+EXPORT_SYMBOL(cl_site_init);
+
+/**
+ * Finalize client site via lu_site_fini(). Dual to cl_site_init().
+ */
+void cl_site_fini(struct cl_site *s)
+{
+	lu_site_fini(&s->cs_lu);
+}
+EXPORT_SYMBOL(cl_site_fini);
+
+static struct cache_stats cl_env_stats = {
+	.cs_name    = "envs",
+	.cs_stats = { ATOMIC_INIT(0), }	/* remaining counters are zero too */
+};
+
+/**
+ * Outputs client site statistical counters into a buffer. Suitable for
+ * ll_rd_*()-style functions.
+ */
+int cl_site_stats_print(const struct cl_site *site, char *page, int count)
+{
+	int nob, i;
+	static const char *pstate[] = {
+		[CPS_CACHED]  = "c",
+		[CPS_OWNED]   = "o",
+		[CPS_PAGEOUT] = "w",
+		[CPS_PAGEIN]  = "r",
+		[CPS_FREEING] = "f"
+	};
+	static const char *lstate[] = {
+		[CLS_NEW]       = "n",
+		[CLS_QUEUING]   = "q",
+		[CLS_ENQUEUED]  = "e",
+		[CLS_HELD]      = "h",
+		[CLS_INTRANSIT] = "t",
+		[CLS_CACHED]    = "c",
+		[CLS_FREEING]   = "f"
+	};
+/*
+       lookup    hit  total   busy create
+pages: ...... ...... ...... ...... ...... [...... ...... ...... ......]
+locks: ...... ...... ...... ...... ...... [...... ...... ...... ...... ......]
+  env: ...... ...... ...... ...... ......
+ */
+	/* scnprintf() reports bytes really stored, keeping nob within count. */
+	nob = lu_site_stats_print(&site->cs_lu, page, count);
+	nob += cache_stats_print(&site->cs_pages, page + nob, count - nob, 1);
+	nob += scnprintf(page + nob, count - nob, " [");
+	for (i = 0; i < ARRAY_SIZE(site->cs_pages_state); ++i)
+		nob += scnprintf(page + nob, count - nob, "%s: %u ",
+				 pstate[i],
+				 atomic_read(&site->cs_pages_state[i]));
+	nob += scnprintf(page + nob, count - nob, "]\n");
+	nob += cache_stats_print(&site->cs_locks, page + nob, count - nob, 0);
+	nob += scnprintf(page + nob, count - nob, " [");
+	for (i = 0; i < ARRAY_SIZE(site->cs_locks_state); ++i)
+		nob += scnprintf(page + nob, count - nob, "%s: %u ",
+				 lstate[i],
+				 atomic_read(&site->cs_locks_state[i]));
+	nob += scnprintf(page + nob, count - nob, "]\n");
+	nob += cache_stats_print(&cl_env_stats, page + nob, count - nob, 0);
+	nob += scnprintf(page + nob, count - nob, "\n");
+	return nob;
+}
+EXPORT_SYMBOL(cl_site_stats_print);
+
+/*****************************************************************************
+ *
+ * lu_env handling on client.
+ *
+ */
+
+/**
+ * The most efficient way is to store cl_env pointer in task specific
+ * structures. On Linux, it won't be easy to use task_struct->journal_info
+ * because Lustre code may call into other fs which has certain assumptions
+ * about journal_info. Currently the following fields in task_struct are
+ * identified as usable for this purpose:
+ *  - cl_env: for liblustre.
+ *  - tux_info: only on RedHat kernel.
+ *  - ...
+ * \note As long as we use task_struct to store cl_env, we assume that once
+ * called into Lustre, we'll never call into the other part of the kernel
+ * which will use those fields in task_struct without explicitly exiting
+ * Lustre.
+ *
+ * If no space is available in task_struct, a hash will be used.
+ * bz20044, bz22683.
+ */
+
+struct cl_env {
+	void	     *ce_magic;	/* tag: &cl_env_init0, see cl_env_new() */
+	struct lu_env     ce_lu;
+	struct lu_context ce_ses;
+
+	/**
+	 * This allows cl_env to be entered into cl_env_hash which implements
+	 * the current thread -> client environment lookup.
+	 */
+	struct hlist_node  ce_node;
+	/**
+	 * Owner for the current cl_env.
+	 *
+	 * If LL_TASK_CL_ENV is defined, this points to the owning current,
+	 * only for debugging purposes;
+	 * otherwise a hash is used, and this is the key for cfs_hash.
+	 * Currently the pid of the owning thread is stored. Note that using
+	 * the thread pointer would lead to an unbalanced hash because of its
+	 * specific allocation locality, which may vary between platforms,
+	 * OSes and even OS versions.
+	 */
+	void	     *ce_owner;
+
+	/*
+	 * Linkage into global list of all client environments. Used for
+	 * garbage collection.
+	 */
+	struct list_head	ce_linkage;
+	/*
+	 * Reference counter: bumped in cl_env_peek(), dropped in cl_env_put().
+	 */
+	int	       ce_ref;
+	/*
+	 * Debugging field: address of the caller who made original
+	 * allocation.
+	 */
+	void	     *ce_debug;
+};
+
+#define CL_ENV_INC(counter)	/* expands to nothing */
+#define CL_ENV_DEC(counter)	/* expands to nothing */
+
+static void cl_env_init0(struct cl_env *cle, void *debug)
+{
+	/* Only an unused, correctly tagged environment may be set up. */
+	LASSERT(cle->ce_ref == 0);
+	LASSERT(cle->ce_magic == &cl_env_init0);
+	LASSERT(cle->ce_debug == NULL && cle->ce_owner == NULL);
+	cle->ce_debug = debug;
+	cle->ce_ref = 1;
+	CL_ENV_INC(busy);
+}
+
+
+/*
+ * Hash table connecting a thread (keyed by its pid) to its cl_env.
+ */
+
+static cfs_hash_t *cl_env_hash;
+
+static unsigned cl_env_hops_hash(cfs_hash_t *lh,
+				 const void *key, unsigned mask)
+{
+#if BITS_PER_LONG == 64
+	return cfs_hash_u64_hash((__u64)key, mask);
+#else /* BITS_PER_LONG != 64 */
+	return cfs_hash_u32_hash((__u32)key, mask);
+#endif /* BITS_PER_LONG == 64 */
+}
+
+static void *cl_env_hops_obj(struct hlist_node *hn)
+{
+	struct cl_env *cle = hlist_entry(hn, struct cl_env, ce_node);
+	LASSERT(cle->ce_magic == &cl_env_init0);	/* sanity-check the tag */
+	return (void *)cle;
+}
+
+static int cl_env_hops_keycmp(const void *key, struct hlist_node *hn)
+{
+	struct cl_env *cle = cl_env_hops_obj(hn);
+
+	LASSERT(cle->ce_owner != NULL);
+	return (key == cle->ce_owner);	/* non-zero when the keys match */
+}
+
+static void cl_env_hops_noop(cfs_hash_t *hs, struct hlist_node *hn)
+{
+	struct cl_env *cle = hlist_entry(hn, struct cl_env, ce_node);
+	LASSERT(cle->ce_magic == &cl_env_init0);	/* only checks the tag */
+}
+
+static cfs_hash_ops_t cl_env_hops = {
+	.hs_hash	= cl_env_hops_hash,
+	.hs_key	 = cl_env_hops_obj,	/* NOTE(review): entry, not ce_owner? */
+	.hs_keycmp      = cl_env_hops_keycmp,
+	.hs_object      = cl_env_hops_obj,
+	.hs_get	 = cl_env_hops_noop,	/* get/put only check the tag */
+	.hs_put_locked  = cl_env_hops_noop,
+};
+
+static inline struct cl_env *cl_env_fetch(void)
+{
+	void *key = (void *) (long) current->pid;
+	struct cl_env *cle = cfs_hash_lookup(cl_env_hash, key);
+
+	LASSERT(ergo(cle, cle->ce_magic == &cl_env_init0));
+	return cle;
+}
+
+static inline void cl_env_attach(struct cl_env *cle)
+{
+	int rc;
+
+	/* Nothing to attach: callers may pass a NULL cookie. */
+	if (cle == NULL)
+		return;
+	LASSERT(cle->ce_owner == NULL);
+	cle->ce_owner = (void *) (long) current->pid;
+	rc = cfs_hash_add_unique(cl_env_hash, cle->ce_owner, &cle->ce_node);
+	LASSERT(rc == 0);
+}
+
+static inline void cl_env_do_detach(struct cl_env *cle)
+{
+	void *cookie;
+
+	LASSERT(cle->ce_owner == (void *) (long) current->pid);
+	/* The hash must hand back exactly the entry being removed. */
+	cookie = cfs_hash_del(cl_env_hash, cle->ce_owner, &cle->ce_node);
+	LASSERT(cookie == cle);
+	cle->ce_owner = NULL;
+}
+
+static int cl_env_store_init(void)
+{
+	cl_env_hash = cfs_hash_create("cl_env", HASH_CL_ENV_BITS,
+				      HASH_CL_ENV_BITS, HASH_CL_ENV_BKT_BITS, 0,
+				      CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA,
+				      &cl_env_hops, CFS_HASH_RW_BKTLOCK);
+	if (cl_env_hash == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void cl_env_store_fini(void) {
+	cfs_hash_putref(cl_env_hash);	/* drops the cl_env_store_init() ref */
+}
+
+
+static inline struct cl_env *cl_env_detach(struct cl_env *cle)
+{
+	/* No environment given: fall back to the current thread's one. */
+	if (!cle)
+		cle = cl_env_fetch();
+	/* Only environments that are attached (have an owner) are removed. */
+	if (cle != NULL && cle->ce_owner != NULL)
+		cl_env_do_detach(cle);
+	return cle;
+}
+
+static struct lu_env *cl_env_new(__u32 ctx_tags, __u32 ses_tags, void *debug)
+{
+	struct cl_env *cle;
+	struct lu_env *env;
+	int rc;
+
+	OBD_SLAB_ALLOC_PTR_GFP(cle, cl_env_kmem, __GFP_IO);
+	if (cle == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&cle->ce_linkage);
+	cle->ce_magic = &cl_env_init0;
+	env = &cle->ce_lu;
+	rc = lu_env_init(env, LCT_CL_THREAD|ctx_tags);
+	if (rc != 0)
+		goto out_free;
+
+	rc = lu_context_init(&cle->ce_ses, LCT_SESSION | ses_tags);
+	if (rc != 0) {
+		lu_env_fini(env);
+		goto out_free;
+	}
+	/* Both contexts are ready: enter the session and mark cle in use. */
+	lu_context_enter(&cle->ce_ses);
+	env->le_ses = &cle->ce_ses;
+	cl_env_init0(cle, debug);
+	CL_ENV_INC(create);
+	CL_ENV_INC(total);
+	return env;
+
+out_free:
+	OBD_SLAB_FREE_PTR(cle, cl_env_kmem);
+	return ERR_PTR(rc);
+}
+
+static void cl_env_fini(struct cl_env *cle)
+{
+	CL_ENV_DEC(total);
+	lu_context_fini(&cle->ce_lu.le_ctx);
+	lu_context_fini(&cle->ce_ses);
+	OBD_SLAB_FREE_PTR(cle, cl_env_kmem);	/* cle is invalid after this */
+}
+
+static inline struct cl_env *cl_env_container(struct lu_env *env)
+{
+	return container_of(env, struct cl_env, ce_lu);	/* env is ce_lu */
+}
+
+struct lu_env *cl_env_peek(int *refcheck)
+{
+	struct cl_env *cle;
+	struct lu_env *env = NULL;
+
+	/* check that we don't go far from untrusted pointer */
+	CLASSERT(offsetof(struct cl_env, ce_magic) == 0);
+
+	CL_ENV_INC(lookup);
+	cle = cl_env_fetch();
+	if (cle != NULL) {
+		/* Found: take one more reference and report it. */
+		CL_ENV_INC(hit);
+		cle->ce_ref++;
+		*refcheck = cle->ce_ref;
+		env = &cle->ce_lu;
+	}
+	CDEBUG(D_OTHER, "%d@%p\n", cle ? cle->ce_ref : 0, cle);
+	return env;
+}
+EXPORT_SYMBOL(cl_env_peek);
+
+/**
+ * Returns lu_env: if there already is an environment associated with the
+ * current thread, it is returned, otherwise, new environment is allocated.
+ *
+ * \param refcheck pointer to a counter used to detect environment leaks. In
+ * the usual case cl_env_get() and cl_env_put() are called in the same lexical
+ * scope and pointer to the same integer is passed as \a refcheck. This is
+ * used to detect missed cl_env_put().
+ *
+ * \see cl_env_put()
+ */
+struct lu_env *cl_env_get(int *refcheck)
+{
+	struct cl_env *cle;
+	struct lu_env *env;
+
+	/* Fast path: this thread already has an environment attached. */
+	env = cl_env_peek(refcheck);
+	if (env != NULL)
+		return env;
+	env = cl_env_new(lu_context_tags_default, lu_session_tags_default,
+			 __builtin_return_address(0));
+	if (IS_ERR(env))
+		return env;
+
+	/* Bind the fresh environment to the current thread. */
+	cle = cl_env_container(env);
+	cl_env_attach(cle);
+	*refcheck = cle->ce_ref;
+	CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+	return env;
+}
+EXPORT_SYMBOL(cl_env_get);
+
+/**
+ * Forces an allocation of a fresh environment with given tags.
+ *
+ * \see cl_env_get()
+ */
+struct lu_env *cl_env_alloc(int *refcheck, __u32 tags)
+{
+	struct cl_env *cle;
+	struct lu_env *env;
+
+	LASSERT(cl_env_peek(refcheck) == NULL);
+	/* The same tag set is used for both the context and the session. */
+	env = cl_env_new(tags, tags, __builtin_return_address(0));
+	if (IS_ERR(env))
+		return env;
+	cle = cl_env_container(env);
+	*refcheck = cle->ce_ref;
+	CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+	return env;
+}
+EXPORT_SYMBOL(cl_env_alloc);
+
+static void cl_env_exit(struct cl_env *cle)
+{
+	LASSERT(cle->ce_owner == NULL);	/* must already be detached */
+	lu_context_exit(&cle->ce_lu.le_ctx);
+	lu_context_exit(&cle->ce_ses);
+}
+
+/**
+ * Release an environment.
+ *
+ * Decrement \a env reference counter. When counter drops to 0, nothing in
+ * this thread is using environment and it is returned to the allocation
+ * cache, or freed straight away, if cache is large enough.
+ */
+void cl_env_put(struct lu_env *env, int *refcheck)
+{
+	struct cl_env *cle = cl_env_container(env);
+
+	LASSERT(cle->ce_ref > 0);
+	LASSERT(ergo(refcheck != NULL, cle->ce_ref == *refcheck));
+
+	CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+	cle->ce_ref--;
+	if (cle->ce_ref != 0)
+		return;
+	/* Last reference: detach from the thread and free right away. */
+	CL_ENV_DEC(busy);
+	cl_env_detach(cle);
+	cle->ce_debug = NULL;
+	cl_env_exit(cle);
+	cl_env_fini(cle);
+}
+EXPORT_SYMBOL(cl_env_put);
+
+/**
+ * Declares a point of re-entrancy: detaches the current thread's environment
+ * (if any) and returns it as a cookie to hand back to cl_env_reexit().
+ * \see cl_env_reexit()
+ */
+void *cl_env_reenter(void)
+{
+	return cl_env_detach(NULL);
+}
+EXPORT_SYMBOL(cl_env_reenter);
+
+/**
+ * Exits re-entrancy: drops the current attachment, re-attaches \a cookie.
+ */
+void cl_env_reexit(void *cookie)
+{
+	cl_env_detach(NULL);
+	cl_env_attach(cookie);
+}
+EXPORT_SYMBOL(cl_env_reexit);
+
+/**
+ * Setup user-supplied \a env as a current environment. This is to be used to
+ * guarantee that an environment exists even when cl_env_get() fails. It is up
+ * to the user to ensure proper concurrency control.
+ *
+ * \see cl_env_unplant()
+ */
+void cl_env_implant(struct lu_env *env, int *refcheck)
+{
+	struct cl_env *cle = cl_env_container(env);
+
+	LASSERT(cle->ce_ref > 0);
+
+	cl_env_attach(cle);
+	cl_env_get(refcheck);	/* return value is not checked here */
+	CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+}
+EXPORT_SYMBOL(cl_env_implant);
+
+/**
+ * Detach environment installed earlier by cl_env_implant() and drop one ref.
+ */
+void cl_env_unplant(struct lu_env *env, int *refcheck)
+{
+	struct cl_env *cle = cl_env_container(env);
+
+	LASSERT(cle->ce_ref > 1);
+
+	CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+
+	cl_env_detach(cle);
+	cl_env_put(env, refcheck);
+}
+EXPORT_SYMBOL(cl_env_unplant);
+
+struct lu_env *cl_env_nested_get(struct cl_env_nest *nest)
+{
+	struct lu_env *env;
+
+	nest->cen_cookie = NULL;
+	env = cl_env_peek(&nest->cen_refcheck);
+	/* An attached environment with no IO in progress is reused as-is. */
+	if (env != NULL && !cl_io_is_going(env))
+		return env;
+	if (env != NULL) {
+		/* IO in flight: set the busy environment aside for now. */
+		cl_env_put(env, &nest->cen_refcheck);
+		nest->cen_cookie = cl_env_reenter();
+	}
+	env = cl_env_get(&nest->cen_refcheck);
+	if (IS_ERR(env)) {
+		/* No new environment: restore the original attachment. */
+		cl_env_reexit(nest->cen_cookie);
+		return env;
+	}
+	LASSERT(!cl_io_is_going(env));
+	return env;
+}
+EXPORT_SYMBOL(cl_env_nested_get);
+
+void cl_env_nested_put(struct cl_env_nest *nest, struct lu_env *env)
+{
+	cl_env_put(env, &nest->cen_refcheck);
+	cl_env_reexit(nest->cen_cookie);	/* cookie from cl_env_nested_get() */
+}
+EXPORT_SYMBOL(cl_env_nested_put);
+
+/**
+ * Converts struct cl_attr to struct ost_lvb by copying size, mtime, atime,
+ * ctime and blocks from \a attr into \a lvb.
+ * \see cl_lvb2attr
+ */
+void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr)
+{
+	ENTRY;
+	lvb->lvb_size   = attr->cat_size;
+	lvb->lvb_mtime  = attr->cat_mtime;
+	lvb->lvb_atime  = attr->cat_atime;
+	lvb->lvb_ctime  = attr->cat_ctime;
+	lvb->lvb_blocks = attr->cat_blocks;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_attr2lvb);
+
+/**
+ * Converts struct ost_lvb to struct cl_attr by copying size, mtime, atime,
+ * ctime and blocks from \a lvb into \a attr.
+ * \see cl_attr2lvb
+ */
+void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb)
+{
+	ENTRY;
+	attr->cat_size   = lvb->lvb_size;
+	attr->cat_mtime  = lvb->lvb_mtime;
+	attr->cat_atime  = lvb->lvb_atime;
+	attr->cat_ctime  = lvb->lvb_ctime;
+	attr->cat_blocks = lvb->lvb_blocks;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lvb2attr);
+
+/*****************************************************************************
+ *
+ * Temporary prototype thing: mirror obd-devices into cl devices.
+ *
+ */
+
+struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site,
+				struct lu_device_type *ldt,
+				struct lu_device *next)
+{
+	const char       *typename;
+	struct lu_device *d;
+	int rc;
+
+	LASSERT(ldt != NULL);
+
+	typename = ldt->ldt_name;
+	d = ldt->ldt_ops->ldto_device_alloc(env, ldt, NULL);
+	if (IS_ERR(d)) {
+		CERROR("Cannot allocate device: '%s'\n", typename);
+		return lu2cl_dev(d);
+	}
+
+	if (site != NULL)
+		d->ld_site = site;
+	rc = ldt->ldt_ops->ldto_device_init(env, d, typename, next);
+	if (rc != 0) {
+		/* Initialization failed: release the device, report rc. */
+		ldt->ldt_ops->ldto_device_free(env, d);
+		CERROR("can't init device '%s', %d\n", typename, rc);
+		return lu2cl_dev(ERR_PTR(rc));
+	}
+	lu_device_get(d);
+	lu_ref_add(&d->ld_reference, "lu-stack", &lu_site_init);
+	return lu2cl_dev(d);
+}
+EXPORT_SYMBOL(cl_type_setup);
+
+/**
+ * Finalize device stack by calling lu_stack_fini() on \a cl's lu_device.
+ */
+void cl_stack_fini(const struct lu_env *env, struct cl_device *cl)
+{
+	lu_stack_fini(env, cl2lu_dev(cl));
+}
+EXPORT_SYMBOL(cl_stack_fini);
+
+int  cl_lock_init(void);	/* used by cl_global_init() */
+void cl_lock_fini(void);	/* used by cl_global_init()/cl_global_fini() */
+
+int  cl_page_init(void);	/* used by cl_global_init() */
+void cl_page_fini(void);	/* used by cl_global_fini() */
+
+static struct lu_context_key cl_key;	/* forward declaration */
+
+struct cl_thread_info *cl_env_info(const struct lu_env *env)
+{
+	return lu_context_key_get(&env->le_ctx, &cl_key);	/* cl_key value */
+}
+
+/* defines cl0_key_{init,fini}(), wrapped by cl_key_{init,fini}() below */
+LU_KEY_INIT_FINI(cl0, struct cl_thread_info);
+
+static void *cl_key_init(const struct lu_context *ctx,
+			 struct lu_context_key *key)
+{
+	struct cl_thread_info *info = cl0_key_init(ctx, key);
+	int idx;
+
+	/* An error pointer is handed back to the caller untouched. */
+	if (IS_ERR(info))
+		return info;
+
+	for (idx = 0; idx < ARRAY_SIZE(info->clt_counters); ++idx)
+		lu_ref_init(&info->clt_counters[idx].ctc_locks_locked);
+	return info;
+}
+
+static void cl_key_fini(const struct lu_context *ctx,
+			struct lu_context_key *key, void *data)
+{
+	struct cl_thread_info *info;
+	int i;
+
+	info = data;
+	for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i)
+		lu_ref_fini(&info->clt_counters[i].ctc_locks_locked);
+	cl0_key_fini(ctx, key, data);
+}
+
+static void cl_key_exit(const struct lu_context *ctx,
+			struct lu_context_key *key, void *data)
+{
+	struct cl_thread_info *info = data;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i) {
+		LASSERT(info->clt_counters[i].ctc_nr_held == 0);
+		LASSERT(info->clt_counters[i].ctc_nr_used == 0);
+		LASSERT(info->clt_counters[i].ctc_nr_locks_acquired == 0);
+		LASSERT(info->clt_counters[i].ctc_nr_locks_locked == 0);
+		lu_ref_fini(&info->clt_counters[i].ctc_locks_locked);
+		lu_ref_init(&info->clt_counters[i].ctc_locks_locked);
+	}
+}
+
+static struct lu_context_key cl_key = {
+	.lct_tags = LCT_CL_THREAD,	/* same tag cl_env_new() always sets */
+	.lct_init = cl_key_init,
+	.lct_fini = cl_key_fini,
+	.lct_exit = cl_key_exit
+};
+
+static struct lu_kmem_descr cl_object_caches[] = {
+	{
+		.ckd_cache = &cl_env_kmem,
+		.ckd_name  = "cl_env_kmem",
+		.ckd_size  = sizeof (struct cl_env)
+	},
+	{
+		.ckd_cache = NULL	/* presumably the end marker — confirm */
+	}
+};
+
+/**
+ * Global initialization of cl-data: env hash, kmem caches, lu_context_key,
+ * lock and page subsystems. On failure, completed steps are undone in reverse.
+ *
+ * \see cl_global_fini()
+ */
+int cl_global_init(void)
+{
+	int result;
+
+	result = cl_env_store_init();
+	if (result)
+		return result;
+
+	result = lu_kmem_init(cl_object_caches);
+	if (result)
+		goto out_store;
+
+	LU_CONTEXT_KEY_INIT(&cl_key);
+	result = lu_context_key_register(&cl_key);
+	if (result)
+		goto out_kmem;
+
+	result = cl_lock_init();
+	if (result)
+		goto out_context;
+
+	result = cl_page_init();
+	if (result)
+		goto out_lock;
+
+	return 0;
+out_lock:
+	cl_lock_fini();
+out_context:
+	lu_context_key_degister(&cl_key);
+out_kmem:
+	lu_kmem_fini(cl_object_caches);
+out_store:
+	cl_env_store_fini();
+	return result;
+}
+
+/**
+ * Finalization of global cl-data. Dual to cl_global_init().
+ */
+void cl_global_fini(void)
+{
+	cl_lock_fini();	/* NOTE(review): init order was lock, then page */
+	cl_page_fini();
+	lu_context_key_degister(&cl_key);
+	lu_kmem_fini(cl_object_caches);
+	cl_env_store_fini();
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_page.c b/drivers/staging/lustre/lustre/obdclass/cl_page.c
new file mode 100644
index 000000000000..bb9335911c34
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/cl_page.c
@@ -0,0 +1,1605 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client Lustre Page.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/libcfs/libcfs.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <linux/list.h>
+
+#include <cl_object.h>
+#include "cl_internal.h"
+
+/*
+ * Forward declaration: the page construction error paths (cl_page_alloc(),
+ * cl_page_find0()) call this before its definition further down.
+ */
+static void
+cl_page_delete0(const struct lu_env *env, struct cl_page *pg, int radix);
+
+/*
+ * Page assertion. When \a expr evaluates to false, \a page is dumped through
+ * CL_PAGE_DEBUG() at D_ERROR level, with the stringified expression as the
+ * message, and the assertion then fails via LASSERT(0).
+ */
+# define PASSERT(env, page, expr)				       \
+  do {								    \
+	  if (unlikely(!(expr))) {				      \
+		  CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n");    \
+		  LASSERT(0);					   \
+	  }							     \
+  } while (0)
+
+/*
+ * Page invariant check, compiled out in this configuration: the arguments
+ * appear only as operands of sizeof, so they are type-checked by the
+ * compiler but not evaluated at run time.
+ */
+# define PINVRNT(env, page, exp) \
+	((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp))
+
+/*
+ * Page statistics are disabled by default because of their huge performance
+ * penalty: each of the counter macros below expands to nothing.
+ */
+#define CS_PAGE_INC(o, item)
+#define CS_PAGE_DEC(o, item)
+#define CS_PAGESTATE_INC(o, state)
+#define CS_PAGESTATE_DEC(o, state)
+
+/**
+ * Internal version of cl_page_top(). It may only be called when the page is
+ * known not to be freed underneath the caller, e.g., because the page is
+ * referenced, the radix tree lock is held, or the page is owned.
+ */
+static struct cl_page *cl_page_top_trusted(struct cl_page *page)
+{
+	struct cl_page *top;
+
+	for (top = page; top->cp_parent != NULL; top = top->cp_parent)
+		; /* climb until a page without a parent is reached */
+	return top;
+}
+
+/**
+ * Internal version of cl_page_get().
+ *
+ * Takes one more reference on \a page; the reference count must already be
+ * positive. This can be used to obtain the initial reference to a previously
+ * unreferenced cached object, provided that concurrent page reclamation is
+ * somehow prevented, e.g., by locking the page radix-tree
+ * (cl_object_header::coh_page_guard), or by keeping a lock on the VM page
+ * associated with \a page.
+ *
+ * Use with care! Not exported.
+ */
+static void cl_page_get_trust(struct cl_page *page)
+{
+	atomic_t *refcount = &page->cp_ref;
+
+	LASSERT(atomic_read(&page->cp_ref) > 0);
+	atomic_inc(refcount);
+}
+
+/**
+ * Looks up the slice of a page that belongs to the layer whose device type
+ * is \a dtype. The search starts at the top page of the stack and descends
+ * through the child pages; NULL is returned when no slice matches.
+ *
+ * \see cl_lock_at()
+ */
+static const struct cl_page_slice *
+cl_page_at_trusted(const struct cl_page *page,
+		   const struct lu_device_type *dtype)
+{
+	const struct cl_page_slice *layer;
+	ENTRY;
+
+	for (page = cl_page_top_trusted((struct cl_page *)page);
+	     page != NULL; page = page->cp_child) {
+		list_for_each_entry(layer, &page->cp_layers, cpl_linkage) {
+			if (layer->cpl_obj->co_lu.lo_dev->ld_type == dtype)
+				RETURN(layer);
+		}
+	}
+	RETURN(NULL);
+}
+
+/**
+ * Returns the page with the given \a index in the object described by
+ * \a hdr, or NULL if no such page is found. A reference is acquired on the
+ * returned page.
+ *
+ * Locking: called under cl_object_header::coh_page_guard spin-lock.
+ */
+struct cl_page *cl_page_lookup(struct cl_object_header *hdr, pgoff_t index)
+{
+	struct cl_page *page;
+
+	/*
+	 * assert_spin_locked() rather than LASSERT(spin_is_locked()):
+	 * spin_is_locked() always evaluates to 0 on uniprocessor builds
+	 * without spinlock debugging, which would trip the assertion even
+	 * though the caller holds the lock.
+	 */
+	assert_spin_locked(&hdr->coh_page_guard);
+
+	page = radix_tree_lookup(&hdr->coh_tree, index);
+	if (page != NULL)
+		cl_page_get_trust(page);
+	return page;
+}
+EXPORT_SYMBOL(cl_page_lookup);
+
+/**
+ * Invokes \a cb on the cached pages of \a obj whose index lies within
+ * [\a start, \a end].
+ *
+ * Pages are collected from the radix tree in batches of up to CLT_PVEC_SIZE
+ * under cl_object_header::coh_page_guard; pages in CPS_FREEING state are
+ * skipped. For every other page, the slice matching the device type of the
+ * top object is located and its page is referenced. The spin-lock is then
+ * dropped and \a cb is called with \a io and \a cbdata for each page of the
+ * batch. Once \a cb returns something other than CLP_GANG_OKAY no further
+ * call-backs are made, but the references taken for the current batch are
+ * still released.
+ *
+ * Gang tree lookup (radix_tree_gang_lookup()) optimization is absolutely
+ * crucial in the face of [offset, EOF] locks.
+ *
+ * \retval CLP_GANG_OKAY    the scan ran to completion and every call-back
+ *			    returned CLP_GANG_OKAY
+ * \retval CLP_GANG_RESCHED need_resched() was true between two batches; the
+ *			    scan was abandoned to avoid hogging the CPU and
+ *			    the caller should implement a retry logic
+ * \retval other	    the first non-CLP_GANG_OKAY value returned by
+ *			    \a cb
+ */
+int cl_page_gang_lookup(const struct lu_env *env, struct cl_object *obj,
+			struct cl_io *io, pgoff_t start, pgoff_t end,
+			cl_page_gang_cb_t cb, void *cbdata)
+{
+	struct cl_object_header *hdr;
+	struct cl_page	  *page;
+	struct cl_page	 **pvec;
+	const struct cl_page_slice  *slice;
+	const struct lu_device_type *dtype;
+	pgoff_t		  idx;
+	unsigned int	     nr;
+	unsigned int	     i;
+	unsigned int	     j;
+	int		      res = CLP_GANG_OKAY;
+	int		      tree_lock = 1; /* is coh_page_guard held? */
+	ENTRY;
+
+	idx = start;
+	hdr = cl_object_header(obj);
+	/* per-thread scratch vector for one batch of pages */
+	pvec = cl_env_info(env)->clt_pvec;
+	dtype = cl_object_top(obj)->co_lu.lo_dev->ld_type;
+	spin_lock(&hdr->coh_page_guard);
+	while ((nr = radix_tree_gang_lookup(&hdr->coh_tree, (void **)pvec,
+					    idx, CLT_PVEC_SIZE)) > 0) {
+		int end_of_region = 0;
+		/* the next batch starts right after the last page found */
+		idx = pvec[nr - 1]->cp_index + 1;
+		/* i scans the batch, j counts the pages kept in pvec[] */
+		for (i = 0, j = 0; i < nr; ++i) {
+			page = pvec[i];
+			pvec[i] = NULL;
+
+			LASSERT(page->cp_type == CPT_CACHEABLE);
+			if (page->cp_index > end) {
+				end_of_region = 1;
+				break;
+			}
+			if (page->cp_state == CPS_FREEING)
+				continue;
+
+			slice = cl_page_at_trusted(page, dtype);
+			/*
+			 * Pages for lsm-less file has no underneath sub-page
+			 * for osc, in case of ...
+			 */
+			PASSERT(env, page, slice != NULL);
+
+			page = slice->cpl_page;
+			/*
+			 * Can safely call cl_page_get_trust() under
+			 * radix-tree spin-lock.
+			 *
+			 * XXX not true, because @page is from object another
+			 * than @hdr and protected by different tree lock.
+			 */
+			cl_page_get_trust(page);
+			lu_ref_add_atomic(&page->cp_reference,
+					  "gang_lookup", current);
+			pvec[j++] = page;
+		}
+
+		/*
+		 * Here a delicate locking dance is performed. Current thread
+		 * holds a reference to each page of the batch, but the
+		 * call-back is invoked without the radix-tree lock (the
+		 * original note here says that owning a page implies
+		 * waiting), so the lock is released first. The references
+		 * taken above keep the pages from being freed meanwhile; the
+		 * call-back has to cope with pages truncated in between.
+		 */
+		spin_unlock(&hdr->coh_page_guard);
+		tree_lock = 0;
+
+		for (i = 0; i < j; ++i) {
+			page = pvec[i];
+			/* after the first failure only drop the references */
+			if (res == CLP_GANG_OKAY)
+				res = (*cb)(env, io, page, cbdata);
+			lu_ref_del(&page->cp_reference,
+				   "gang_lookup", current);
+			cl_page_put(env, page);
+		}
+		/* a short batch or a page beyond \a end: nothing left */
+		if (nr < CLT_PVEC_SIZE || end_of_region)
+			break;
+
+		if (res == CLP_GANG_OKAY && need_resched())
+			res = CLP_GANG_RESCHED;
+		if (res != CLP_GANG_OKAY)
+			break;
+
+		spin_lock(&hdr->coh_page_guard);
+		tree_lock = 1;
+	}
+	if (tree_lock)
+		spin_unlock(&hdr->coh_page_guard);
+	RETURN(res);
+}
+EXPORT_SYMBOL(cl_page_gang_lookup);
+
+/*
+ * Destroys \a page: finalizes and unlinks every layer slice, drops the
+ * reference held on the object the page belongs to, and releases the page
+ * buffer.
+ *
+ * The page must be in CPS_FREEING state, have neither owner nor parent, and
+ * be neither on a batch list nor attached to a request. May sleep.
+ */
+static void cl_page_free(const struct lu_env *env, struct cl_page *page)
+{
+	struct cl_object *clob = page->cp_obj;
+	int bufsize = cl_object_header(clob)->coh_page_bufsize;
+	struct cl_page_slice *layer;
+
+	PASSERT(env, page, list_empty(&page->cp_batch));
+	PASSERT(env, page, page->cp_owner == NULL);
+	PASSERT(env, page, page->cp_req == NULL);
+	PASSERT(env, page, page->cp_parent == NULL);
+	PASSERT(env, page, page->cp_state == CPS_FREEING);
+
+	ENTRY;
+	might_sleep();
+	/* pop the layers one by one from the head of the list */
+	while (!list_empty(&page->cp_layers)) {
+		layer = list_entry(page->cp_layers.next,
+				   struct cl_page_slice, cpl_linkage);
+		list_del_init(&layer->cpl_linkage);
+		layer->cpl_ops->cpo_fini(env, layer);
+	}
+	CS_PAGE_DEC(clob, total);
+	CS_PAGESTATE_DEC(clob, page->cp_state);
+	lu_object_ref_del_at(&clob->co_lu, page->cp_obj_ref, "cl_page", page);
+	cl_object_put(env, clob);
+	lu_ref_fini(&page->cp_reference);
+	OBD_FREE(page, bufsize);
+	EXIT;
+}
+
+/**
+ * Low-level helper that stores \a state into cl_page::cp_state without any
+ * checking. This is the only place in the code where that field is mutated.
+ */
+static inline void cl_page_state_set_trust(struct cl_page *page,
+					   enum cl_page_state state)
+{
+	enum cl_page_state *slot;
+
+	/* the cast bypasses the const qualifier of the field */
+	slot = (enum cl_page_state *)&page->cp_state;
+	*slot = state;
+}
+
+/*
+ * Allocates and initializes a cl_page of the given \a type for index \a ind
+ * of object \a o.
+ *
+ * The new page starts in CPS_CACHED state with one reference for the caller
+ * and, for CPT_CACHEABLE pages, a second one on behalf of the radix tree. A
+ * reference on \a o is taken for the lifetime of the page. Every layer of
+ * the object that provides ->coo_page_init() is then asked to set up its
+ * slice for \a vmpage; when a layer fails, the partially built page is torn
+ * down again.
+ *
+ * \retval valid pointer   the new page
+ * \retval ERR_PTR(-ENOMEM) the page buffer could not be allocated
+ * \retval ERR_PTR(result)  non-zero \a result of a failed ->coo_page_init()
+ */
+static struct cl_page *cl_page_alloc(const struct lu_env *env,
+		struct cl_object *o, pgoff_t ind, struct page *vmpage,
+		enum cl_page_type type)
+{
+	struct cl_page	  *page;
+	struct cl_object	*layer;
+	struct lu_object_header *head;
+
+	ENTRY;
+	OBD_ALLOC_GFP(page, cl_object_header(o)->coh_page_bufsize,
+			__GFP_IO);
+	if (page != NULL) {
+		int result = 0;
+		atomic_set(&page->cp_ref, 1);
+		if (type == CPT_CACHEABLE) /* for radix tree */
+			atomic_inc(&page->cp_ref);
+		page->cp_obj = o;
+		cl_object_get(o);
+		page->cp_obj_ref = lu_object_ref_add(&o->co_lu, "cl_page",page);
+		page->cp_index = ind;
+		cl_page_state_set_trust(page, CPS_CACHED);
+		page->cp_type = type;
+		INIT_LIST_HEAD(&page->cp_layers);
+		INIT_LIST_HEAD(&page->cp_batch);
+		INIT_LIST_HEAD(&page->cp_flight);
+		mutex_init(&page->cp_mutex);
+		lu_ref_init(&page->cp_reference);
+		head = o->co_lu.lo_header;
+		/*
+		 * Walk the layers with a dedicated cursor: after a complete
+		 * list_for_each_entry() pass the cursor no longer points to
+		 * a real object, and \a o is still needed below.
+		 */
+		list_for_each_entry(layer, &head->loh_layers,
+					co_lu.lo_linkage) {
+			if (layer->co_ops->coo_page_init != NULL) {
+				result = layer->co_ops->coo_page_init(env,
+								layer, page,
+								vmpage);
+				if (result != 0) {
+					cl_page_delete0(env, page, 0);
+					cl_page_free(env, page);
+					page = ERR_PTR(result);
+					break;
+				}
+			}
+		}
+		if (result == 0) {
+			CS_PAGE_INC(o, total);
+			CS_PAGE_INC(o, create);
+			/*
+			 * The page enters CPS_CACHED here; cl_page_free()
+			 * performs the matching decrement.
+			 */
+			CS_PAGESTATE_INC(o, CPS_CACHED);
+		}
+	} else {
+		page = ERR_PTR(-ENOMEM);
+	}
+	RETURN(page);
+}
+
+/**
+ * Returns a cl_page with index \a idx at the object \a o, and associated with
+ * the VM page \a vmpage.
+ *
+ * This is the main entry point into the cl_page caching interface. For
+ * CPT_CACHEABLE pages the cl_page already attached to \a vmpage is looked up
+ * first (through cl_vmpage_page(), which takes a reference) and returned
+ * immediately if found. Otherwise a new page is allocated; a CPT_TRANSIENT
+ * page is returned right away, a CPT_CACHEABLE one is first inserted into
+ * the per-object radix tree. In any case the returned page carries a
+ * reference for the caller.
+ *
+ * When \a parent is not NULL, the new page is linked below it as its child.
+ *
+ * \retval valid pointer the page, referenced
+ * \retval ERR_PTR(-ve)  allocation failed, a layer failed to initialize its
+ *			 slice, or the radix tree insertion failed
+ *
+ * \see cl_object_find(), cl_lock_find()
+ */
+static struct cl_page *cl_page_find0(const struct lu_env *env,
+				     struct cl_object *o,
+				     pgoff_t idx, struct page *vmpage,
+				     enum cl_page_type type,
+				     struct cl_page *parent)
+{
+	struct cl_page	  *page = NULL;
+	struct cl_page	  *ghost = NULL; /* page that lost the insertion */
+	struct cl_object_header *hdr;
+	int err;
+
+	LASSERT(type == CPT_CACHEABLE || type == CPT_TRANSIENT);
+	might_sleep();
+
+	ENTRY;
+
+	hdr = cl_object_header(o);
+	CS_PAGE_INC(o, lookup);
+
+	CDEBUG(D_PAGE, "%lu@"DFID" %p %lx %d\n",
+	       idx, PFID(&hdr->coh_lu.loh_fid), vmpage, vmpage->private, type);
+	/* fast path. */
+	if (type == CPT_CACHEABLE) {
+		/* vmpage lock is used to protect the child/parent
+		 * relationship */
+		KLASSERT(PageLocked(vmpage));
+		/*
+		 * cl_vmpage_page() can be called here without any locks as
+		 *
+		 *     - "vmpage" is locked (which prevents ->private from
+		 *       concurrent updates), and
+		 *
+		 *     - "o" cannot be destroyed while current thread holds a
+		 *       reference on it.
+		 */
+		page = cl_vmpage_page(vmpage, o);
+		PINVRNT(env, page,
+			ergo(page != NULL,
+			     cl_page_vmpage(env, page) == vmpage &&
+			     (void *)radix_tree_lookup(&hdr->coh_tree,
+						       idx) == page));
+	}
+
+	if (page != NULL) {
+		CS_PAGE_INC(o, hit);
+		RETURN(page);
+	}
+
+	/* allocate and initialize cl_page */
+	page = cl_page_alloc(env, o, idx, vmpage, type);
+	if (IS_ERR(page))
+		RETURN(page);
+
+	/* transient pages are never entered into the radix tree */
+	if (type == CPT_TRANSIENT) {
+		if (parent) {
+			LASSERT(page->cp_parent == NULL);
+			page->cp_parent = parent;
+			parent->cp_child = page;
+		}
+		RETURN(page);
+	}
+
+	/*
+	 * XXX optimization: use radix_tree_preload() here, and change tree
+	 * gfp mask to GFP_KERNEL in cl_object_header_init().
+	 */
+	spin_lock(&hdr->coh_page_guard);
+	err = radix_tree_insert(&hdr->coh_tree, idx, page);
+	if (err != 0) {
+		ghost = page;
+		/*
+		 * Noted by Jay: a lock on \a vmpage protects cl_page_find()
+		 * from this race, but
+		 *
+		 *     0. it's better to have cl_page interface "locally
+		 *     consistent" so that its correctness can be reasoned
+		 *     about without appealing to the (obscure world of) VM
+		 *     locking.
+		 *
+		 *     1. handling this race allows ->coh_tree to remain
+		 *     consistent even when VM locking is somehow busted,
+		 *     which is very useful during diagnosing and debugging.
+		 */
+		page = ERR_PTR(err);
+		CL_PAGE_DEBUG(D_ERROR, env, ghost,
+			      "fail to insert into radix tree: %d\n", err);
+	} else {
+		if (parent) {
+			LASSERT(page->cp_parent == NULL);
+			page->cp_parent = parent;
+			parent->cp_child = page;
+		}
+		hdr->coh_pages++;
+	}
+	spin_unlock(&hdr->coh_page_guard);
+
+	/* tear the rejected page down outside of the spin-lock */
+	if (unlikely(ghost != NULL)) {
+		cl_page_delete0(env, ghost, 0);
+		cl_page_free(env, ghost);
+	}
+	RETURN(page);
+}
+
+/**
+ * Finds or creates the page with index \a idx of object \a o for \a vmpage.
+ * The page is not linked to any parent page.
+ *
+ * \see cl_page_find0()
+ */
+struct cl_page *cl_page_find(const struct lu_env *env, struct cl_object *o,
+			     pgoff_t idx, struct page *vmpage,
+			     enum cl_page_type type)
+{
+	struct cl_page *no_parent = NULL;
+
+	return cl_page_find0(env, o, idx, vmpage, type, no_parent);
+}
+EXPORT_SYMBOL(cl_page_find);
+
+
+/**
+ * Finds or creates a sub-page of \a parent: the page with index \a idx of
+ * object \a o. The sub-page has the same type as \a parent.
+ *
+ * \see cl_page_find0()
+ */
+struct cl_page *cl_page_find_sub(const struct lu_env *env, struct cl_object *o,
+				 pgoff_t idx, struct page *vmpage,
+				 struct cl_page *parent)
+{
+	enum cl_page_type type = parent->cp_type;
+
+	return cl_page_find0(env, o, idx, vmpage, type, parent);
+}
+EXPORT_SYMBOL(cl_page_find_sub);
+
+/*
+ * Consistency check of \a pg against its parent, child and owner, and
+ * against the radix tree of its object. Returns 1 when every condition
+ * holds and 0 otherwise.
+ */
+static inline int cl_page_invariant(const struct cl_page *pg)
+{
+	struct cl_object_header *header;
+	struct cl_page	  *parent;
+	struct cl_page	  *child;
+	struct cl_io	    *owner;
+
+	/*
+	 * Page invariant is protected by a VM lock.
+	 */
+	LINVRNT(cl_page_is_vmlocked(NULL, pg));
+
+	header = cl_object_header(pg->cp_obj);
+	parent = pg->cp_parent;
+	child  = pg->cp_child;
+	owner  = pg->cp_owner;
+
+	if (!cl_page_in_use(pg))
+		return 0;
+
+	/* parent and child have to point back at this page */
+	if (parent != NULL && parent->cp_child != pg)
+		return 0;
+	if (child != NULL && child->cp_parent != pg)
+		return 0;
+
+	/* adjacent pages of a stack belong to different objects */
+	if (child != NULL && pg->cp_obj == child->cp_obj)
+		return 0;
+	if (parent != NULL && pg->cp_obj == parent->cp_obj)
+		return 0;
+
+	/* owners of adjacent pages are linked through ->ci_parent */
+	if (owner != NULL && parent != NULL &&
+	    parent->cp_owner != pg->cp_owner->ci_parent)
+		return 0;
+	if (owner != NULL && child != NULL &&
+	    child->cp_owner->ci_parent != owner)
+		return 0;
+
+	/*
+	 * Either page is early in initialization (has neither child
+	 * nor parent yet), or it is in the object radix tree.
+	 */
+	if (pg->cp_state < CPS_FREEING && pg->cp_type == CPT_CACHEABLE)
+		return (void *)radix_tree_lookup(&header->coh_tree,
+						 pg->cp_index) == pg ||
+		       (child == NULL && parent == NULL);
+
+	return 1;
+}
+
+/*
+ * Moves \a page and all pages below it (following ->cp_child) into \a state.
+ *
+ * The transition from the current state of \a page to \a state must be
+ * listed in the allowed_transitions matrix below. Every page of the stack
+ * must be in the same old state, and must have an owner if and only if the
+ * new state is CPS_OWNED.
+ */
+static void cl_page_state_set0(const struct lu_env *env,
+			       struct cl_page *page, enum cl_page_state state)
+{
+	enum cl_page_state old;
+
+	/*
+	 * Matrix of allowed state transitions [old][new], for sanity
+	 * checking.
+	 */
+	static const int allowed_transitions[CPS_NR][CPS_NR] = {
+		[CPS_CACHED] = {
+			[CPS_CACHED]  = 0,
+			[CPS_OWNED]   = 1, /* io finds existing cached page */
+			[CPS_PAGEIN]  = 0,
+			[CPS_PAGEOUT] = 1, /* write-out from the cache */
+			[CPS_FREEING] = 1, /* eviction on the memory pressure */
+		},
+		[CPS_OWNED] = {
+			[CPS_CACHED]  = 1, /* release to the cache */
+			[CPS_OWNED]   = 0,
+			[CPS_PAGEIN]  = 1, /* start read immediately */
+			[CPS_PAGEOUT] = 1, /* start write immediately */
+			[CPS_FREEING] = 1, /* lock invalidation or truncate */
+		},
+		[CPS_PAGEIN] = {
+			[CPS_CACHED]  = 1, /* io completion */
+			[CPS_OWNED]   = 0,
+			[CPS_PAGEIN]  = 0,
+			[CPS_PAGEOUT] = 0,
+			[CPS_FREEING] = 0,
+		},
+		[CPS_PAGEOUT] = {
+			[CPS_CACHED]  = 1, /* io completion */
+			[CPS_OWNED]   = 0,
+			[CPS_PAGEIN]  = 0,
+			[CPS_PAGEOUT] = 0,
+			[CPS_FREEING] = 0,
+		},
+		/* CPS_FREEING is terminal: no transition leaves it */
+		[CPS_FREEING] = {
+			[CPS_CACHED]  = 0,
+			[CPS_OWNED]   = 0,
+			[CPS_PAGEIN]  = 0,
+			[CPS_PAGEOUT] = 0,
+			[CPS_FREEING] = 0,
+		}
+	};
+
+	ENTRY;
+	old = page->cp_state;
+	PASSERT(env, page, allowed_transitions[old][state]);
+	CL_PAGE_HEADER(D_TRACE, env, page, "%d -> %d\n", old, state);
+	/* apply the new state to the page and to every page below it */
+	for (; page != NULL; page = page->cp_child) {
+		PASSERT(env, page, page->cp_state == old);
+		PASSERT(env, page,
+			equi(state == CPS_OWNED, page->cp_owner != NULL));
+
+		CS_PAGESTATE_DEC(page->cp_obj, page->cp_state);
+		CS_PAGESTATE_INC(page->cp_obj, state);
+		cl_page_state_set_trust(page, state);
+	}
+	EXIT;
+}
+
+/*
+ * Switches \a page into \a state. Currently a plain wrapper that forwards
+ * to cl_page_state_set0().
+ */
+static void cl_page_state_set(const struct lu_env *env,
+			      struct cl_page *page, enum cl_page_state state)
+{
+	cl_page_state_set0(env, page, state);
+}
+
+/**
+ * Acquires an additional reference to a page.
+ *
+ * This can be called only by a caller already possessing a reference to
+ * \a page: the underlying cl_page_get_trust() asserts that the reference
+ * count is positive before incrementing it.
+ *
+ * \see cl_object_get(), cl_lock_get().
+ */
+void cl_page_get(struct cl_page *page)
+{
+	ENTRY;
+	cl_page_get_trust(page);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_get);
+
+/**
+ * Drops one reference to \a page.
+ *
+ * Nothing else happens while other references remain. Dropping the last
+ * reference destroys the page immediately; at that point the page is
+ * required to be in cl_page_state::CPS_FREEING state, without an owner and
+ * off any batch list.
+ *
+ * \see cl_object_put(), cl_lock_put().
+ */
+void cl_page_put(const struct lu_env *env, struct cl_page *page)
+{
+	PASSERT(env, page, atomic_read(&page->cp_ref) > !!page->cp_parent);
+
+	ENTRY;
+	CL_PAGE_HEADER(D_TRACE, env, page, "%d\n",
+		       atomic_read(&page->cp_ref));
+
+	if (!atomic_dec_and_test(&page->cp_ref)) {
+		/* not the last reference, the page stays alive */
+		EXIT;
+		return;
+	}
+
+	LASSERT(page->cp_state == CPS_FREEING);
+
+	LASSERT(atomic_read(&page->cp_ref) == 0);
+	PASSERT(env, page, page->cp_owner == NULL);
+	PASSERT(env, page, list_empty(&page->cp_batch));
+	/*
+	 * Nobody else can reach the page any longer, so it is torn down
+	 * right here.
+	 */
+	cl_page_free(env, page);
+
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_put);
+
+/**
+ * Returns the VM page associated with a given cl_page.
+ *
+ * The page stack is scanned from the top page downwards and the result of
+ * the first (uppermost) layer implementing ->cpo_vmpage() is returned. It is
+ * a fatal error (LBUG) if no layer in the stack defines that method.
+ */
+struct page *cl_page_vmpage(const struct lu_env *env, struct cl_page *page)
+{
+	const struct cl_page_slice *layer;
+
+	for (page = cl_page_top(page); page != NULL; page = page->cp_child) {
+		list_for_each_entry(layer, &page->cp_layers, cpl_linkage) {
+			if (layer->cpl_ops->cpo_vmpage == NULL)
+				continue;
+			RETURN(layer->cpl_ops->cpo_vmpage(env, layer));
+		}
+	}
+	LBUG(); /* ->cpo_vmpage() has to be defined somewhere in the stack */
+}
+EXPORT_SYMBOL(cl_page_vmpage);
+
+/**
+ * Returns the cl_page that is associated with the VM page \a vmpage and
+ * belongs to the object \a obj, or NULL if there is none. A reference is
+ * acquired on the returned page. \a vmpage has to be locked.
+ */
+struct cl_page *cl_vmpage_page(struct page *vmpage, struct cl_object *obj)
+{
+	struct cl_page *page;
+
+	ENTRY;
+	KLASSERT(PageLocked(vmpage));
+
+	/*
+	 * NOTE: absence of races and liveness of data are guaranteed by page
+	 *       lock on a "vmpage". That works because object destruction has
+	 *       bottom-to-top pass.
+	 */
+
+	/*
+	 * The scan below assumes that ->private points to the top-most
+	 * page. This can be rectified easily.
+	 */
+	page = (struct cl_page *)vmpage->private;
+	if (page == NULL)
+		RETURN(NULL);
+
+	/* descend until a page of \a obj is met or the stack is exhausted */
+	while (page != NULL && !cl_object_same(page->cp_obj, obj))
+		page = page->cp_child;
+	if (page != NULL)
+		cl_page_get_trust(page);
+	LASSERT(ergo(page, page->cp_type == CPT_CACHEABLE));
+	RETURN(page);
+}
+EXPORT_SYMBOL(cl_vmpage_page);
+
+/**
+ * Returns the top-most page of the stack \a page belongs to, i.e., the
+ * ancestor that has no parent (which may be \a page itself).
+ *
+ * \see cl_object_top(), cl_io_top()
+ */
+struct cl_page *cl_page_top(struct cl_page *page)
+{
+	struct cl_page *top = cl_page_top_trusted(page);
+
+	return top;
+}
+EXPORT_SYMBOL(cl_page_top);
+
+/**
+ * Returns the slice of \a page that belongs to the layer of device type
+ * \a dtype, or NULL if no layer in the page stack matches.
+ *
+ * \see cl_page_at_trusted()
+ */
+const struct cl_page_slice *cl_page_at(const struct cl_page *page,
+				       const struct lu_device_type *dtype)
+{
+	const struct cl_page_slice *slice = cl_page_at_trusted(page, dtype);
+
+	return slice;
+}
+EXPORT_SYMBOL(cl_page_at);
+
+/*
+ * Identifies a page method by the byte offset of its pointer within struct
+ * cl_page_operations; consumed by the CL_PAGE_INVO* macros below.
+ */
+#define CL_PAGE_OP(opname) offsetof(struct cl_page_operations, opname)
+
+/*
+ * Calls the int-returning method found at offset \a _op of the operation
+ * vector of every slice, starting with the layers of the top page of
+ * \a _page and descending through the child pages. \a _proto is the
+ * parenthesized parameter list used to type the method pointer; additional
+ * arguments are passed to the method after the environment and the slice.
+ *
+ * Layers without the method are skipped. Iteration stops at the first
+ * method that returns non-zero. The macro evaluates to that result, except
+ * that a positive result is turned into 0; so the value is either 0 or
+ * negative.
+ */
+#define CL_PAGE_INVOKE(_env, _page, _op, _proto, ...)		   \
+({								      \
+	const struct lu_env	*__env  = (_env);		    \
+	struct cl_page	     *__page = (_page);		   \
+	const struct cl_page_slice *__scan;			     \
+	int			 __result;			   \
+	ptrdiff_t		   __op   = (_op);		     \
+	int		       (*__method)_proto;		    \
+									\
+	__result = 0;						   \
+	__page = cl_page_top(__page);				   \
+	do {							    \
+		list_for_each_entry(__scan, &__page->cp_layers,     \
+					cpl_linkage) {		  \
+			__method = *(void **)((char *)__scan->cpl_ops + \
+					      __op);		    \
+			if (__method != NULL) {			 \
+				__result = (*__method)(__env, __scan,   \
+						       ## __VA_ARGS__); \
+				if (__result != 0)		      \
+					break;			  \
+			}					       \
+		}						       \
+		__page = __page->cp_child;			      \
+	} while (__page != NULL && __result == 0);		      \
+	if (__result > 0)					       \
+		__result = 0;					   \
+	__result;						       \
+})
+
+/*
+ * Void counterpart of CL_PAGE_INVOKE(): calls the method found at offset
+ * \a _op for every slice that implements it, starting with the layers of
+ * the top page of \a _page and descending through the child pages. Every
+ * layer is visited; there is no result and no early termination.
+ */
+#define CL_PAGE_INVOID(_env, _page, _op, _proto, ...)		   \
+do {								    \
+	const struct lu_env	*__env  = (_env);		    \
+	struct cl_page	     *__page = (_page);		   \
+	const struct cl_page_slice *__scan;			     \
+	ptrdiff_t		   __op   = (_op);		     \
+	void		      (*__method)_proto;		    \
+									\
+	__page = cl_page_top(__page);				   \
+	do {							    \
+		list_for_each_entry(__scan, &__page->cp_layers,     \
+					cpl_linkage) {		  \
+			__method = *(void **)((char *)__scan->cpl_ops + \
+					      __op);		    \
+			if (__method != NULL)			   \
+				(*__method)(__env, __scan,	      \
+					    ## __VA_ARGS__);	    \
+		}						       \
+		__page = __page->cp_child;			      \
+	} while (__page != NULL);				       \
+} while (0)
+
+/*
+ * Bottom-up variant of CL_PAGE_INVOID(): descends from \a _page to the
+ * bottom-most page of the stack, then calls the method found at offset
+ * \a _op for every slice that implements it, walking the layers of each
+ * page in reverse order and climbing through ->cp_parent until the top of
+ * the stack has been processed.
+ */
+#define CL_PAGE_INVOID_REVERSE(_env, _page, _op, _proto, ...)	       \
+do {									\
+	const struct lu_env	*__env  = (_env);			\
+	struct cl_page	     *__page = (_page);		       \
+	const struct cl_page_slice *__scan;				 \
+	ptrdiff_t		   __op   = (_op);			 \
+	void		      (*__method)_proto;			\
+									    \
+	/* get to the bottom page. */				       \
+	while (__page->cp_child != NULL)				    \
+		__page = __page->cp_child;				  \
+	do {								\
+		list_for_each_entry_reverse(__scan, &__page->cp_layers, \
+						cpl_linkage) {	      \
+			__method = *(void **)((char *)__scan->cpl_ops +     \
+					      __op);			\
+			if (__method != NULL)			       \
+				(*__method)(__env, __scan,		  \
+					    ## __VA_ARGS__);		\
+		}							   \
+		__page = __page->cp_parent;				 \
+	} while (__page != NULL);					   \
+} while (0)
+
+/*
+ * Calls the page method identified by \a op (see CL_PAGE_OP()), taking \a io
+ * as its only extra argument, on the layers of \a page top-to-bottom.
+ * Returns the value of CL_PAGE_INVOKE(): 0, or the negative result of the
+ * first failing layer.
+ */
+static int cl_page_invoke(const struct lu_env *env,
+			  struct cl_io *io, struct cl_page *page, ptrdiff_t op)
+{
+	int rc;
+
+	PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj));
+	ENTRY;
+	rc = CL_PAGE_INVOKE(env, page, op,
+			    (const struct lu_env *,
+			     const struct cl_page_slice *, struct cl_io *),
+			    io);
+	RETURN(rc);
+}
+
+/*
+ * Calls the void page method identified by \a op (see CL_PAGE_OP()), taking
+ * \a io as its only extra argument, on every layer of \a page
+ * top-to-bottom.
+ */
+static void cl_page_invoid(const struct lu_env *env,
+			   struct cl_io *io, struct cl_page *page, ptrdiff_t op)
+
+{
+	PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj));
+	ENTRY;
+	CL_PAGE_INVOID(env, page, op,
+		       (const struct lu_env *,
+			const struct cl_page_slice *, struct cl_io *), io);
+	EXIT;
+}
+
+/*
+ * Detaches every page of the stack \a page belongs to from its owner: for
+ * each page that has one, the owned-pages counter of the owning io is
+ * decremented and both ->cp_owner and ->cp_task are reset.
+ */
+static void cl_page_owner_clear(struct cl_page *page)
+{
+	ENTRY;
+	page = cl_page_top(page);
+	while (page != NULL) {
+		if (page->cp_owner != NULL) {
+			LASSERT(page->cp_owner->ci_owned_nr > 0);
+			page->cp_owner->ci_owned_nr--;
+			page->cp_task = NULL;
+			page->cp_owner = NULL;
+		}
+		page = page->cp_child;
+	}
+	EXIT;
+}
+
+/*
+ * Accounts every page of the stack \a page belongs to in the owned-pages
+ * counter of its owner io. Each page of the stack must already have
+ * ->cp_owner set.
+ */
+static void cl_page_owner_set(struct cl_page *page)
+{
+	ENTRY;
+	page = cl_page_top(page);
+	while (page != NULL) {
+		LASSERT(page->cp_owner != NULL);
+		page->cp_owner->ci_owned_nr++;
+		page = page->cp_child;
+	}
+	EXIT;
+}
+
+/*
+ * Gives up the ownership of \a pg: clears the owner of every page in the
+ * stack, returns a CPS_OWNED page to CPS_CACHED (a page in another state,
+ * e.g. CPS_FREEING, keeps its state), and notifies the layers through
+ * ->cpo_disown().
+ */
+void cl_page_disown0(const struct lu_env *env,
+		     struct cl_io *io, struct cl_page *pg)
+{
+	enum cl_page_state state = pg->cp_state;
+
+	ENTRY;
+	PINVRNT(env, pg, state == CPS_OWNED || state == CPS_FREEING);
+	PINVRNT(env, pg, cl_page_invariant(pg));
+	cl_page_owner_clear(pg);
+
+	if (state == CPS_OWNED) {
+		cl_page_state_set(env, pg, CPS_CACHED);
+	}
+	/*
+	 * The call-backs run bottom-up, so that the uppermost layer (llite),
+	 * which is responsible for VFS/VM interaction, comes last and can
+	 * release locks safely.
+	 */
+	CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_disown),
+			       (const struct lu_env *,
+				const struct cl_page_slice *, struct cl_io *),
+			       io);
+	EXIT;
+}
+
+/**
+ * Returns true iff \a pg is in cl_page_state::CPS_OWNED state and its owner
+ * is the given \a io.
+ */
+int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io)
+{
+	int owned;
+
+	LINVRNT(cl_object_same(pg->cp_obj, io->ci_obj));
+	ENTRY;
+	owned = pg->cp_state == CPS_OWNED && pg->cp_owner == io;
+	RETURN(owned);
+}
+EXPORT_SYMBOL(cl_page_is_owned);
+
+/**
+ * Try to own a page by IO.
+ *
+ * The layers are asked for the page through ->cpo_own(), which receives
+ * \a nonblock; on success the top page and top io are bound together and the
+ * page is switched into cl_page_state::CPS_OWNED state. If the page turns
+ * out to be in cl_page_state::CPS_FREEING (before or after the layers were
+ * consulted), ownership is not granted.
+ *
+ * \pre  !cl_page_is_owned(pg, io)
+ * \post result == 0 iff cl_page_is_owned(pg, io)
+ *
+ * \retval 0   success
+ *
+ * \retval -ve failure, e.g., page was destroyed (and landed in
+ *	     cl_page_state::CPS_FREEING instead of cl_page_state::CPS_CACHED),
+ *	     or page was owned by another thread, or in IO.
+ *
+ * \see cl_page_disown()
+ * \see cl_page_operations::cpo_own()
+ * \see cl_page_own_try()
+ * \see cl_page_own
+ */
+static int cl_page_own0(const struct lu_env *env, struct cl_io *io,
+			struct cl_page *pg, int nonblock)
+{
+	int result;
+
+	PINVRNT(env, pg, !cl_page_is_owned(pg, io));
+
+	ENTRY;
+	pg = cl_page_top(pg);
+	io = cl_io_top(io);
+
+	if (pg->cp_state == CPS_FREEING) {
+		result = -ENOENT;
+		goto out;
+	}
+
+	result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(cpo_own),
+				(const struct lu_env *,
+				 const struct cl_page_slice *,
+				 struct cl_io *, int),
+				io, nonblock);
+	if (result != 0)
+		goto out;
+
+	PASSERT(env, pg, pg->cp_owner == NULL);
+	PASSERT(env, pg, pg->cp_req == NULL);
+	pg->cp_owner = io;
+	pg->cp_task  = current;
+	cl_page_owner_set(pg);
+	if (pg->cp_state == CPS_FREEING) {
+		/* the page is being freed: undo the ownership just set up */
+		cl_page_disown0(env, io, pg);
+		result = -ENOENT;
+	} else {
+		cl_page_state_set(env, pg, CPS_OWNED);
+	}
+out:
+	PINVRNT(env, pg, ergo(result == 0, cl_page_invariant(pg)));
+	RETURN(result);
+}
+
+/**
+ * Owns a page on behalf of \a io; the call might block.
+ *
+ * \see cl_page_own0()
+ */
+int cl_page_own(const struct lu_env *env, struct cl_io *io, struct cl_page *pg)
+{
+	const int nonblock = 0;
+
+	return cl_page_own0(env, io, pg, nonblock);
+}
+EXPORT_SYMBOL(cl_page_own);
+
+/**
+ * Non-blocking version of cl_page_own().
+ *
+ * \see cl_page_own0()
+ */
+int cl_page_own_try(const struct lu_env *env, struct cl_io *io,
+		    struct cl_page *pg)
+{
+	const int nonblock = 1;
+
+	return cl_page_own0(env, io, pg, nonblock);
+}
+EXPORT_SYMBOL(cl_page_own_try);
+
+
+/**
+ * Assume page ownership.
+ *
+ * Called when page is already locked by the hosting VM. The layers are
+ * notified through ->cpo_assume() (top-to-bottom), after which the top page
+ * is bound to the top io and the current task, and the page is switched into
+ * cl_page_state::CPS_OWNED state.
+ *
+ * \pre !cl_page_is_owned(pg, io)
+ * \post cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_operations::cpo_assume()
+ */
+void cl_page_assume(const struct lu_env *env,
+		    struct cl_io *io, struct cl_page *pg)
+{
+	PINVRNT(env, pg, cl_object_same(pg->cp_obj, io->ci_obj));
+
+	ENTRY;
+	/* ownership is always recorded on the top page and the top io */
+	pg = cl_page_top(pg);
+	io = cl_io_top(io);
+
+	cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_assume));
+	PASSERT(env, pg, pg->cp_owner == NULL);
+	pg->cp_owner = io;
+	pg->cp_task = current;
+	cl_page_owner_set(pg);
+	cl_page_state_set(env, pg, CPS_OWNED);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_assume);
+
+/**
+ * Releases page ownership without unlocking the page.
+ *
+ * Moves page into cl_page_state::CPS_CACHED without releasing a lock on the
+ * underlying VM page (as VM is supposed to do this itself). The layers are
+ * notified through ->cpo_unassume(), bottom-to-top.
+ *
+ * \pre   cl_page_is_owned(pg, io)
+ * \post !cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_assume()
+ */
+void cl_page_unassume(const struct lu_env *env,
+		      struct cl_io *io, struct cl_page *pg)
+{
+	PINVRNT(env, pg, cl_page_is_owned(pg, io));
+	PINVRNT(env, pg, cl_page_invariant(pg));
+
+	ENTRY;
+	pg = cl_page_top(pg);
+	io = cl_io_top(io);
+	/* drop the owner first, then leave the CPS_OWNED state */
+	cl_page_owner_clear(pg);
+	cl_page_state_set(env, pg, CPS_CACHED);
+	CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_unassume),
+			       (const struct lu_env *,
+				const struct cl_page_slice *, struct cl_io *),
+			       io);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_unassume);
+
+/**
+ * Releases page ownership.
+ *
+ * Moves page into cl_page_state::CPS_CACHED. The work is done by
+ * cl_page_disown0() on the top page and the top io.
+ *
+ * \pre   cl_page_is_owned(pg, io)
+ * \post !cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_own()
+ * \see cl_page_operations::cpo_disown()
+ */
+void cl_page_disown(const struct lu_env *env,
+		    struct cl_io *io, struct cl_page *pg)
+{
+	PINVRNT(env, pg, cl_page_is_owned(pg, io));
+
+	ENTRY;
+	pg = cl_page_top(pg);
+	io = cl_io_top(io);
+	cl_page_disown0(env, io, pg);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_disown);
+
+/**
+ * Called when page is to be removed from the object, e.g., as a result of
+ * truncate.
+ *
+ * Calls cl_page_operations::cpo_discard() top-to-bottom; the function itself
+ * does not change the state of the page.
+ *
+ * \pre cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_operations::cpo_discard()
+ */
+void cl_page_discard(const struct lu_env *env,
+		     struct cl_io *io, struct cl_page *pg)
+{
+	PINVRNT(env, pg, cl_page_is_owned(pg, io));
+	PINVRNT(env, pg, cl_page_invariant(pg));
+
+	cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_discard));
+}
+EXPORT_SYMBOL(cl_page_discard);
+
+/**
+ * Version of cl_page_delete() that can be called for not fully constructed
+ * pages, e.g., in the error handling cl_page_find()->cl_page_delete0()
+ * path. Doesn't check page invariant.
+ *
+ * \param radix non-zero if the top page \a pg itself is in the radix tree of
+ *		its object and has to be removed from it; zero if only the
+ *		pages below \a pg are to be removed from their trees
+ *
+ * \pre pg == cl_page_top(pg)
+ * \pre pg->cp_state != CPS_FREEING
+ */
+static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg,
+			    int radix)
+{
+	struct cl_page *tmp = pg;
+	ENTRY;
+
+	PASSERT(env, pg, pg == cl_page_top(pg));
+	PASSERT(env, pg, pg->cp_state != CPS_FREEING);
+
+	/*
+	 * Sever all ways to obtain new pointers to @pg.
+	 */
+	cl_page_owner_clear(pg);
+
+	/*
+	 * unexport the page firstly before freeing it so that
+	 * the page content is considered to be invalid.
+	 * We have to do this because a CPS_FREEING cl_page may
+	 * be NOT under the protection of a cl_lock.
+	 * Afterwards, if this page is found by other threads, then this
+	 * page will be forced to reread.
+	 */
+	cl_page_export(env, pg, 0);
+	cl_page_state_set0(env, pg, CPS_FREEING);
+
+	CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_delete),
+		       (const struct lu_env *, const struct cl_page_slice *));
+
+	/* only cacheable pages live in radix trees */
+	if (tmp->cp_type == CPT_CACHEABLE) {
+		if (!radix)
+			/* !radix means that @pg is not yet in the radix tree,
+			 * skip removing it.
+			 */
+			tmp = pg->cp_child;
+		/* each page of the stack sits in the tree of its own object */
+		for (; tmp != NULL; tmp = tmp->cp_child) {
+			void		    *value;
+			struct cl_object_header *hdr;
+
+			hdr = cl_object_header(tmp->cp_obj);
+			spin_lock(&hdr->coh_page_guard);
+			value = radix_tree_delete(&hdr->coh_tree,
+						  tmp->cp_index);
+			PASSERT(env, tmp, value == tmp);
+			PASSERT(env, tmp, hdr->coh_pages > 0);
+			hdr->coh_pages--;
+			spin_unlock(&hdr->coh_page_guard);
+			/* drop the reference taken on behalf of the tree */
+			cl_page_put(env, tmp);
+		}
+	}
+
+	EXIT;
+}
+
+/**
+ * Called when a decision is made to throw page out of memory.
+ *
+ * Notifies all layers about page destruction by calling
+ * cl_page_operations::cpo_delete() method top-to-bottom.
+ *
+ * Moves page into cl_page_state::CPS_FREEING state (this is the only place
+ * where transition to this state happens).
+ *
+ * Eliminates all avenues through which new references to the page can be
+ * obtained:
+ *
+ *     - removes page from the radix trees,
+ *
+ *     - breaks linkage from VM page to cl_page.
+ *
+ * Once page reaches cl_page_state::CPS_FREEING, all remaining references will
+ * drain after some time, at which point page will be recycled.
+ *
+ * \pre  pg == cl_page_top(pg)
+ * \pre  VM page is locked
+ * \post pg->cp_state == CPS_FREEING
+ *
+ * \see cl_page_operations::cpo_delete()
+ */
+void cl_page_delete(const struct lu_env *env, struct cl_page *pg)
+{
+	PINVRNT(env, pg, cl_page_invariant(pg));
+	ENTRY;
+	/* radix == 1: the top page is removed from its radix tree as well */
+	cl_page_delete0(env, pg, 1);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_delete);
+
+/**
+ * Unmaps page from user virtual memory.
+ *
+ * Calls cl_page_operations::cpo_unmap() through all layers top-to-bottom. The
+ * layer responsible for VM interaction has to unmap page from user space
+ * virtual memory.
+ *
+ * \pre cl_page_is_owned(pg, io)
+ *
+ * \return whatever cl_page_invoke() returns for the cpo_unmap() method
+ *
+ * \see cl_page_operations::cpo_unmap()
+ */
+int cl_page_unmap(const struct lu_env *env,
+		  struct cl_io *io, struct cl_page *pg)
+{
+	PINVRNT(env, pg, cl_page_is_owned(pg, io));
+	PINVRNT(env, pg, cl_page_invariant(pg));
+
+	return cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_unmap));
+}
+EXPORT_SYMBOL(cl_page_unmap);
+
+/**
+ * Marks page up-to-date, or clears that mark.
+ *
+ * Call cl_page_operations::cpo_export() through all layers top-to-bottom. The
+ * layer responsible for VM interaction has to mark/clear page as up-to-date
+ * by the \a uptodate argument.
+ *
+ * \param uptodate passed unchanged to every layer's cpo_export() method
+ *
+ * \see cl_page_operations::cpo_export()
+ */
+void cl_page_export(const struct lu_env *env, struct cl_page *pg, int uptodate)
+{
+	PINVRNT(env, pg, cl_page_invariant(pg));
+	CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_export),
+		       (const struct lu_env *,
+			const struct cl_page_slice *, int), uptodate);
+}
+EXPORT_SYMBOL(cl_page_export);
+
+/**
+ * Tells whether \a pg is VM locked, in a suitable sense, by the calling
+ * thread: non-zero when it is, zero when it is not.
+ */
+int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg)
+{
+	const struct cl_page_slice *slice;
+	int result;
+
+	ENTRY;
+	pg = cl_page_top_trusted((struct cl_page *)pg);
+	/* only the first slice on the top page's layer list is consulted */
+	slice = list_entry(pg->cp_layers.next,
+			   const struct cl_page_slice, cpl_linkage);
+	PASSERT(env, pg, slice->cpl_ops->cpo_is_vmlocked != NULL);
+	/*
+	 * The method is called by hand, not via CL_PAGE_INVOKE(), since
+	 * cl_page_invariant() itself depends on cl_page_is_vmlocked().
+	 */
+	result = slice->cpl_ops->cpo_is_vmlocked(env, slice);
+	/* -EBUSY and -ENODATA are the only answers a layer may give */
+	PASSERT(env, pg, result == -EBUSY || result == -ENODATA);
+	if (result == -EBUSY)
+		RETURN(1);
+	RETURN(0);
+}
+EXPORT_SYMBOL(cl_page_is_vmlocked);
+
+/* Maps a request type onto the page state used while that transfer runs. */
+static enum cl_page_state cl_req_type_state(enum cl_req_type crt)
+{
+	ENTRY;
+	if (crt == CRT_WRITE)
+		RETURN(CPS_PAGEOUT);
+	/* every other request type is treated as a read */
+	RETURN(CPS_PAGEIN);
+}
+
+/*
+ * Switches \a pg into the transfer state that corresponds to \a crt (see
+ * cl_req_type_state()). The page owner is cleared before the state changes.
+ */
+static void cl_page_io_start(const struct lu_env *env,
+			     struct cl_page *pg, enum cl_req_type crt)
+{
+	/*
+	 * Page is queued for IO, change its state.
+	 */
+	ENTRY;
+	cl_page_owner_clear(pg);
+	cl_page_state_set(env, pg, cl_req_type_state(crt));
+	EXIT;
+}
+
+/**
+ * Prepares page for immediate transfer. cl_page_operations::cpo_prep() is
+ * called top-to-bottom. Every layer either agrees to submit this page (by
+ * returning 0), or requests to omit this page (by returning -EALREADY). Layer
+ * handling interactions with the VM also has to inform VM that page is under
+ * transfer now.
+ *
+ * \pre cl_page_is_owned(pg, io)
+ *
+ * \retval 0        all layers agreed; the page owner has been cleared and the
+ *                  page moved into the transfer state matching \a crt
+ * \retval -EINVAL  \a crt is not below CRT_NR
+ * \retval other    the value cl_page_invoke() returned for cpo_prep()
+ */
+int cl_page_prep(const struct lu_env *env, struct cl_io *io,
+		 struct cl_page *pg, enum cl_req_type crt)
+{
+	int result;
+
+	PINVRNT(env, pg, cl_page_is_owned(pg, io));
+	PINVRNT(env, pg, cl_page_invariant(pg));
+	PINVRNT(env, pg, crt < CRT_NR);
+
+	/*
+	 * XXX this has to be called bottom-to-top, so that llite can set up
+	 * PG_writeback without risking other layers deciding to skip this
+	 * page.
+	 */
+	/* explicit run-time range check, on top of the PINVRNT() above */
+	if (crt >= CRT_NR)
+		return -EINVAL;
+	result = cl_page_invoke(env, io, pg, CL_PAGE_OP(io[crt].cpo_prep));
+	if (result == 0)
+		cl_page_io_start(env, pg, crt);
+
+	/*
+	 * For a cacheable page being written, success and PageWriteback() on
+	 * the VM page must go together.
+	 */
+	KLASSERT(ergo(crt == CRT_WRITE && pg->cp_type == CPT_CACHEABLE,
+		      equi(result == 0,
+			   PageWriteback(cl_page_vmpage(env, pg)))));
+	CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result);
+	return result;
+}
+EXPORT_SYMBOL(cl_page_prep);
+
+/**
+ * Notify layers about transfer completion.
+ *
+ * Invoked by transfer sub-system (which is a part of osc) to notify layers
+ * that a transfer, of which this page is a part of has completed.
+ *
+ * Completion call-backs are executed in the bottom-up order, so that
+ * uppermost layer (llite), responsible for the VFS/VM interaction runs last
+ * and can release locks safely.
+ *
+ * \param crt   type of the transfer that completed
+ * \param ioret transfer result; handed to every cpo_completion() method and
+ *              to cl_sync_io_note() when the page has a sync-io anchor
+ *
+ * \pre  pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT
+ * \post pg->cp_state == CPS_CACHED
+ *
+ * \see cl_page_operations::cpo_completion()
+ */
+void cl_page_completion(const struct lu_env *env,
+			struct cl_page *pg, enum cl_req_type crt, int ioret)
+{
+	/* sampled once up front; compared against pg->cp_sync_io below */
+	struct cl_sync_io *anchor = pg->cp_sync_io;
+
+	PASSERT(env, pg, crt < CRT_NR);
+	/* cl_page::cp_req already cleared by the caller (osc_completion()) */
+	PASSERT(env, pg, pg->cp_req == NULL);
+	PASSERT(env, pg, pg->cp_state == cl_req_type_state(crt));
+
+	ENTRY;
+	CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, ioret);
+	/* a successful read is recorded on the page, and only once */
+	if (crt == CRT_READ && ioret == 0) {
+		PASSERT(env, pg, !(pg->cp_flags & CPF_READ_COMPLETED));
+		pg->cp_flags |= CPF_READ_COMPLETED;
+	}
+
+	cl_page_state_set(env, pg, CPS_CACHED);
+	/*
+	 * NOTE(review): this bail-out happens after the state change above
+	 * and skips cl_page_put(), cl_sync_io_note() and EXIT; confirm that
+	 * is acceptable for an out-of-range \a crt.
+	 */
+	if (crt >= CRT_NR)
+		return;
+	CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(io[crt].cpo_completion),
+			       (const struct lu_env *,
+				const struct cl_page_slice *, int), ioret);
+	/* detach the anchor from the page before the reference is dropped */
+	if (anchor) {
+		LASSERT(cl_page_is_vmlocked(env, pg));
+		LASSERT(pg->cp_sync_io == anchor);
+		pg->cp_sync_io = NULL;
+	}
+	/*
+	 * As page->cp_obj is pinned by a reference from page->cp_req, it is
+	 * safe to call cl_page_put() without risking object destruction in a
+	 * non-blocking context.
+	 */
+	cl_page_put(env, pg);
+
+	/* the anchor is notified last; \a pg is not touched after the put */
+	if (anchor)
+		cl_sync_io_note(anchor, ioret);
+
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_completion);
+
+/**
+ * Notify layers that transfer formation engine decided to yank this page from
+ * the cache and to make it a part of a transfer.
+ *
+ * \pre  pg->cp_state == CPS_CACHED
+ * \post pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT
+ *
+ * \retval 0        all layers agreed; the page was moved into the transfer
+ *                  state matching \a crt
+ * \retval -EINVAL  \a crt is not below CRT_NR
+ * \retval other    the value CL_PAGE_INVOKE() produced for cpo_make_ready();
+ *                  the page state is left alone in that case
+ *
+ * \see cl_page_operations::cpo_make_ready()
+ */
+int cl_page_make_ready(const struct lu_env *env, struct cl_page *pg,
+		       enum cl_req_type crt)
+{
+	int result;
+
+	PINVRNT(env, pg, crt < CRT_NR);
+
+	ENTRY;
+	/* explicit run-time range check, on top of the PINVRNT() above */
+	if (crt >= CRT_NR)
+		RETURN(-EINVAL);
+	result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(io[crt].cpo_make_ready),
+				(const struct lu_env *,
+				 const struct cl_page_slice *));
+	if (result == 0) {
+		PASSERT(env, pg, pg->cp_state == CPS_CACHED);
+		cl_page_io_start(env, pg, crt);
+	}
+	CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result);
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_make_ready);
+
+/**
+ * Asks the layers to place \a pg, on behalf of high level io, into a cache
+ * from which it will be transferred later.
+ *
+ * It is up to the layer implementing the transfer engine (osc) to register
+ * the page in its queues.
+ *
+ * \pre  cl_page_is_owned(pg, io)
+ * \post cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_operations::cpo_cache_add()
+ */
+int cl_page_cache_add(const struct lu_env *env, struct cl_io *io,
+		      struct cl_page *pg, enum cl_req_type crt)
+{
+	const struct cl_page_slice *slice;
+	int rc = 0;
+
+	PINVRNT(env, pg, crt < CRT_NR);
+	PINVRNT(env, pg, cl_page_is_owned(pg, io));
+	PINVRNT(env, pg, cl_page_invariant(pg));
+
+	ENTRY;
+
+	if (crt >= CRT_NR)
+		RETURN(-EINVAL);
+
+	/* walk the layers in list order, stopping at the first failure */
+	list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) {
+		if (slice->cpl_ops->io[crt].cpo_cache_add != NULL) {
+			rc = slice->cpl_ops->io[crt].cpo_cache_add(env, slice,
+								   io);
+			if (rc != 0)
+				break;
+		}
+	}
+	CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, rc);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_page_cache_add);
+
+/**
+ * Called if a page is being written back by kernel's intention.
+ *
+ * \pre  cl_page_is_owned(pg, io)
+ * \post ergo(result == 0, pg->cp_state == CPS_PAGEOUT)
+ *
+ * \return whatever cl_page_invoke() returns for the cpo_flush() method
+ *
+ * \see cl_page_operations::cpo_flush()
+ */
+int cl_page_flush(const struct lu_env *env, struct cl_io *io,
+		  struct cl_page *pg)
+{
+	int result;
+
+	PINVRNT(env, pg, cl_page_is_owned(pg, io));
+	PINVRNT(env, pg, cl_page_invariant(pg));
+
+	ENTRY;
+
+	result = cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_flush));
+
+	CL_PAGE_HEADER(D_TRACE, env, pg, "%d\n", result);
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_flush);
+
+/**
+ * Checks whether page is protected by any extent lock in at least the
+ * required mode.
+ *
+ * \return the same as in cl_page_operations::cpo_is_under_lock() method;
+ *         the result is asserted to be non-zero.
+ * \see cl_page_operations::cpo_is_under_lock()
+ */
+int cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io,
+			  struct cl_page *page)
+{
+	int rc;
+
+	PINVRNT(env, page, cl_page_invariant(page));
+
+	ENTRY;
+	rc = CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_is_under_lock),
+			    (const struct lu_env *,
+			     const struct cl_page_slice *, struct cl_io *),
+			    io);
+	PASSERT(env, page, rc != 0);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_page_is_under_lock);
+
+/*
+ * Per-page callback handed to cl_page_gang_lookup() by cl_pages_prune():
+ * owns the page, unmaps and discards it, then gives ownership up again.
+ * \a cbdata is not used. Always reports CLP_GANG_OKAY.
+ *
+ * NOTE(review): the return values of cl_page_own() and cl_page_unmap() are
+ * not checked here - confirm that they cannot fail in this context.
+ */
+static int page_prune_cb(const struct lu_env *env, struct cl_io *io,
+			 struct cl_page *page, void *cbdata)
+{
+	cl_page_own(env, io, page);
+	cl_page_unmap(env, io, page);
+	cl_page_discard(env, io, page);
+	cl_page_disown(env, io, page);
+	return CLP_GANG_OKAY;
+}
+
+/**
+ * Purges all cached pages of the top object of \a clobj.
+ *
+ * \return io->ci_result when the io cannot be initialised, otherwise the
+ *         final cl_page_gang_lookup() result (CLP_GANG_OKAY)
+ */
+int cl_pages_prune(const struct lu_env *env, struct cl_object *clobj)
+{
+	struct cl_object *top = cl_object_top(clobj);
+	struct cl_io *io;
+	int rc;
+
+	ENTRY;
+	io = &cl_env_info(env)->clt_io;
+
+	/*
+	 * No IO is ever done by this function; the io is set up only
+	 * because the cl_page_list functions want one. Ugly. -jay
+	 */
+	io->ci_obj = top;
+	io->ci_ignore_layout = 1;
+	rc = cl_io_init(env, io, CIT_MISC, top);
+	if (rc != 0) {
+		cl_io_fini(env, io);
+		RETURN(io->ci_result);
+	}
+
+	/* repeat the lookup until a pass finishes with CLP_GANG_OKAY */
+	for (;;) {
+		rc = cl_page_gang_lookup(env, top, io, 0, CL_PAGE_EOF,
+					 page_prune_cb, NULL);
+		if (rc == CLP_GANG_OKAY)
+			break;
+		if (rc == CLP_GANG_RESCHED)
+			cond_resched();
+	}
+
+	cl_io_fini(env, io);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_pages_prune);
+
+/**
+ * Tells transfer engine that only part of a page is to be transmitted.
+ *
+ * \a from and \a to are handed unchanged to every layer's cpo_clip() method.
+ * NOTE(review): presumably byte offsets within the page - confirm against
+ * the cpo_clip() implementations.
+ *
+ * \see cl_page_operations::cpo_clip()
+ */
+void cl_page_clip(const struct lu_env *env, struct cl_page *pg,
+		  int from, int to)
+{
+	PINVRNT(env, pg, cl_page_invariant(pg));
+
+	CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", from, to);
+	CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_clip),
+		       (const struct lu_env *,
+			const struct cl_page_slice *,int, int),
+		       from, to);
+}
+EXPORT_SYMBOL(cl_page_clip);
+
+/**
+ * Prints a one-line human readable summary of \a pg through \a printer.
+ *
+ * The line lists, in order: reference count, object and index, parent and
+ * child page, state, error, type, owner, request and flags. \a cookie is
+ * passed through to \a printer untouched.
+ */
+void cl_page_header_print(const struct lu_env *env, void *cookie,
+			  lu_printer_t printer, const struct cl_page *pg)
+{
+	(*printer)(env, cookie,
+		   "page@%p[%d %p:%lu ^%p_%p %d %d %d %p %p %#x]\n",
+		   pg, atomic_read(&pg->cp_ref), pg->cp_obj,
+		   pg->cp_index, pg->cp_parent, pg->cp_child,
+		   pg->cp_state, pg->cp_error, pg->cp_type,
+		   pg->cp_owner, pg->cp_req, pg->cp_flags);
+}
+EXPORT_SYMBOL(cl_page_header_print);
+
+/**
+ * Prints a human readable dump of \a pg through \a printer: one header line
+ * for each page from the top page down the cp_child chain, then whatever
+ * the layers' cpo_print() methods add, then an "end page" marker.
+ */
+void cl_page_print(const struct lu_env *env, void *cookie,
+		   lu_printer_t printer, const struct cl_page *pg)
+{
+	struct cl_page *cur = cl_page_top((struct cl_page *)pg);
+
+	while (cur != NULL) {
+		cl_page_header_print(env, cookie, printer, cur);
+		cur = cur->cp_child;
+	}
+	CL_PAGE_INVOKE(env, (struct cl_page *)pg, CL_PAGE_OP(cpo_print),
+		       (const struct lu_env *env,
+			const struct cl_page_slice *slice,
+			void *cookie, lu_printer_t p), cookie, printer);
+	(*printer)(env, cookie, "end page@%p\n", pg);
+}
+EXPORT_SYMBOL(cl_page_print);
+
+/**
+ * Cancel a page which is still in a transfer.
+ */
+int cl_page_cancel(const struct lu_env *env, struct cl_page *page)
+{
+	return CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_cancel),
+			      (const struct lu_env *,
+			       const struct cl_page_slice *));
+}
+EXPORT_SYMBOL(cl_page_cancel);
+
+/**
+ * Converts a page index into a byte offset within object \a obj.
+ *
+ * \a obj is currently not consulted: the offset is simply \a idx shifted
+ * left by PAGE_CACHE_SHIFT.
+ */
+loff_t cl_offset(const struct cl_object *obj, pgoff_t idx)
+{
+	/*
+	 * XXX for now.
+	 */
+	return (loff_t)idx << PAGE_CACHE_SHIFT;
+}
+EXPORT_SYMBOL(cl_offset);
+
+/**
+ * Converts a byte offset within object \a obj into a page index.
+ *
+ * \a obj is currently not consulted: the index is simply \a offset shifted
+ * right by PAGE_CACHE_SHIFT.
+ */
+pgoff_t cl_index(const struct cl_object *obj, loff_t offset)
+{
+	/*
+	 * XXX for now.
+	 */
+	return offset >> PAGE_CACHE_SHIFT;
+}
+EXPORT_SYMBOL(cl_index);
+
+/**
+ * Returns the page size in bytes, 1 << PAGE_CACHE_SHIFT. \a obj is currently
+ * not consulted.
+ */
+int cl_page_size(const struct cl_object *obj)
+{
+	return 1 << PAGE_CACHE_SHIFT;
+}
+EXPORT_SYMBOL(cl_page_size);
+
+/**
+ * Adds page slice to the compound page.
+ *
+ * This is called by cl_object_operations::coo_page_init() methods to add a
+ * per-layer state to the page. New state is added at the end of
+ * cl_page::cp_layers list, that is, it is at the bottom of the stack.
+ *
+ * \param page  page the slice is attached to (stored in cpl_page)
+ * \param slice per-layer state to link into \a page
+ * \param obj   object recorded in the slice (cpl_obj)
+ * \param ops   method table recorded in the slice (cpl_ops)
+ *
+ * \see cl_lock_slice_add(), cl_req_slice_add(), cl_io_slice_add()
+ */
+void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice,
+		       struct cl_object *obj,
+		       const struct cl_page_operations *ops)
+{
+	ENTRY;
+	list_add_tail(&slice->cpl_linkage, &page->cp_layers);
+	slice->cpl_obj  = obj;
+	slice->cpl_ops  = ops;
+	slice->cpl_page = page;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_slice_add);
+
+/* Page-layer set-up hook; there is nothing to set up, always returns 0. */
+int  cl_page_init(void)
+{
+	return 0;
+}
+
+/* Counterpart of cl_page_init(); there is nothing to release. */
+void cl_page_fini(void)
+{
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/class_obd.c b/drivers/staging/lustre/lustre/obdclass/class_obd.c
new file mode 100644
index 000000000000..20d9eaf46ae1
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/class_obd.c
@@ -0,0 +1,691 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+# include <asm/atomic.h>
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <linux/lnet/lnetctl.h>
+#include <lustre_debug.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_build_version.h>
+#include <linux/list.h>
+#include <cl_object.h>
+#include "llog_internal.h"
+
+
+/* Device table; every slot is reset to NULL by init_obdclass(). */
+struct obd_device *obd_devs[MAX_OBD_DEVICES];
+EXPORT_SYMBOL(obd_devs);
+/* List head, initialised by init_obdclass(). */
+struct list_head obd_types;
+DEFINE_RWLOCK(obd_dev_lock);
+
+/*
+ * High-water marks of obd_pages_sum() and obd_memory_sum(), raised by
+ * obd_update_maxusage(). Both are accessed under obd_updatemax_lock.
+ */
+__u64 obd_max_pages = 0;
+__u64 obd_max_alloc = 0;
+DEFINE_SPINLOCK(obd_updatemax_lock);
+
+/* The following are visible and mutable through /proc/sys/lustre/. */
+/* Compared against masked cfs_rand() output in obd_alloc_fail(). */
+unsigned int obd_alloc_fail_rate = 0;
+EXPORT_SYMBOL(obd_alloc_fail_rate);
+unsigned int obd_debug_peer_on_timeout;
+EXPORT_SYMBOL(obd_debug_peer_on_timeout);
+unsigned int obd_dump_on_timeout;
+EXPORT_SYMBOL(obd_dump_on_timeout);
+unsigned int obd_dump_on_eviction;
+EXPORT_SYMBOL(obd_dump_on_eviction);
+/* Initial value only: init_obdclass() replaces it based on num_physpages. */
+unsigned int obd_max_dirty_pages = 256;
+EXPORT_SYMBOL(obd_max_dirty_pages);
+atomic_t obd_unstable_pages;
+EXPORT_SYMBOL(obd_unstable_pages);
+atomic_t obd_dirty_pages;
+EXPORT_SYMBOL(obd_dirty_pages);
+unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT;   /* seconds */
+EXPORT_SYMBOL(obd_timeout);
+unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */
+EXPORT_SYMBOL(ldlm_timeout);
+unsigned int obd_timeout_set;
+EXPORT_SYMBOL(obd_timeout_set);
+unsigned int ldlm_timeout_set;
+EXPORT_SYMBOL(ldlm_timeout_set);
+/* Adaptive timeout defs here instead of ptlrpc module for /proc/sys/ access */
+unsigned int at_min = 0;
+EXPORT_SYMBOL(at_min);
+unsigned int at_max = 600;
+EXPORT_SYMBOL(at_max);
+unsigned int at_history = 600;
+EXPORT_SYMBOL(at_history);
+int at_early_margin = 5;
+EXPORT_SYMBOL(at_early_margin);
+int at_extra = 30;
+EXPORT_SYMBOL(at_extra);
+
+atomic_t obd_dirty_transit_pages;
+EXPORT_SYMBOL(obd_dirty_transit_pages);
+
+/*
+ * Selects how lustre_get_jobid() builds the job id: JOBSTATS_DISABLE,
+ * JOBSTATS_PROCNAME_UID, or the name of an environment variable to read.
+ */
+char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE;
+EXPORT_SYMBOL(obd_jobid_var);
+
+/* Fill \a jobid (JOBSTATS_JOBID_SIZE bytes) with the job id of the current
+ * process, as selected by obd_jobid_var:
+ *  - JOBSTATS_DISABLE: \a jobid is left zeroed;
+ *  - JOBSTATS_PROCNAME_UID: "<process name>.<fsuid>";
+ *  - anything else: value of the environment variable of that name, read
+ *    from between "env_start" & "env_end" of the task struct.
+ *
+ * Returns 0 on success, otherwise the error from cfs_get_environ().
+ *
+ * TODO:
+ * Cache the jobid for later use if an efficient way exists; the cl_env code
+ * could probably be reused for that.
+ *
+ * For job schedulers that do not keep the jobid in "env_start/end", an
+ * upcall to the userspace tools/api could be issued here. The jobid would
+ * then have to be cached.
+ */
+int lustre_get_jobid(char *jobid)
+{
+	static bool warned;
+	int len = JOBSTATS_JOBID_SIZE;
+	int rc;
+	ENTRY;
+
+	memset(jobid, 0, JOBSTATS_JOBID_SIZE);
+
+	/* jobstats switched off: hand back the zeroed buffer */
+	if (!strcmp(obd_jobid_var, JOBSTATS_DISABLE))
+		RETURN(0);
+
+	/* process name + fsuid serve as the jobid */
+	if (!strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID)) {
+		snprintf(jobid, JOBSTATS_JOBID_SIZE, "%s.%u",
+			 current_comm(), current_fsuid());
+		RETURN(0);
+	}
+
+	rc = cfs_get_environ(obd_jobid_var, jobid, &len);
+	if (rc == 0)
+		RETURN(rc);
+
+	if (rc != -EOVERFLOW) {
+		CDEBUG((rc == -ENOENT || rc == -EINVAL ||
+			rc == -EDEADLK) ? D_INFO : D_ERROR,
+		       "Get jobid for (%s) failed: rc = %d\n",
+		       obd_jobid_var, rc);
+		RETURN(rc);
+	}
+
+	/* PBS_JOBID and LOADL_STEP_ID are variable length strings rather
+	 * than plain numbers. Keeping their unique part could be better
+	 * than failing: read into a larger temp buffer, then cut the string
+	 * at some separator so that it fits. Fix later if needed.
+	 * The complaint below is printed only once. */
+	if (unlikely(!warned)) {
+		LCONSOLE_ERROR_MSG(0x16b, "%s value too large "
+				   "for JobID buffer (%d)\n",
+				   obd_jobid_var, len);
+		warned = true;
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(lustre_get_jobid);
+
+/**
+ * Decides whether an allocation has to be treated as failed, and logs it.
+ *
+ * The allocation counts as failed when \a ptr is NULL, or when the masked
+ * cfs_rand() value is below obd_alloc_fail_rate (a forced failure; the
+ * message is then prefixed with "force ").
+ *
+ * \param ptr   result of the allocation, may be NULL
+ * \param name  name of the allocated object, for the message
+ * \param type  allocator prefix, for the message
+ * \param size  requested size in bytes
+ * \param file  source file of the allocation site
+ * \param line  source line of the allocation site
+ *
+ * \retval 1 the allocation failed or was forced to fail
+ * \retval 0 the allocation is good
+ */
+int obd_alloc_fail(const void *ptr, const char *name, const char *type,
+		   size_t size, const char *file, int line)
+{
+	if (ptr == NULL ||
+	    (cfs_rand() & OBD_ALLOC_FAIL_MASK) < obd_alloc_fail_rate) {
+		CERROR("%s%salloc of %s ("LPU64" bytes) failed at %s:%d\n",
+		       ptr ? "force " :"", type, name, (__u64)size, file,
+		       line);
+		/* argument order follows the format: bytes, pages, then
+		 * the pages expressed in bytes */
+		CERROR(LPU64" total bytes and "LPU64" total pages "
+		       "("LPU64" bytes) allocated by Lustre, "
+		       "%d total bytes by LNET\n",
+		       obd_memory_sum(),
+		       obd_pages_sum(),
+		       obd_pages_sum() << PAGE_CACHE_SHIFT,
+		       atomic_read(&libcfs_kmemory));
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(obd_alloc_fail);
+
+/* Builds \a conn from ioctl data: zeroes it, then copies ioc_cookie in. */
+static inline void obd_data2conn(struct lustre_handle *conn,
+				 struct obd_ioctl_data *data)
+{
+	memset(conn, 0, sizeof *conn);
+	conn->cookie = data->ioc_cookie;
+}
+
+/* Stores the cookie of \a conn into the ioc_cookie field of \a data. */
+static inline void obd_conn2data(struct obd_ioctl_data *data,
+				 struct lustre_handle *conn)
+{
+	data->ioc_cookie = conn->cookie;
+}
+
+/**
+ * Resolves a device name to a device number.
+ *
+ * \param len  size of the \a name buffer, including the terminating NUL
+ * \param name device name; must be NUL terminated at name[len - 1]
+ *
+ * \return the value of class_name2dev() on success, -EINVAL when no name
+ *         was passed, the name is not NUL terminated, or no device has
+ *         that name
+ */
+int class_resolve_dev_name(__u32 len, const char *name)
+{
+	int rc;
+	int dev;
+
+	ENTRY;
+	if (!len || !name) {
+		CERROR("No name passed,!\n");
+		GOTO(out, rc = -EINVAL);
+	}
+	/* only the last byte of the buffer is checked for the terminator */
+	if (name[len - 1] != 0) {
+		CERROR("Name not nul terminated!\n");
+		GOTO(out, rc = -EINVAL);
+	}
+
+	CDEBUG(D_IOCTL, "device name %s\n", name);
+	dev = class_name2dev(name);
+	/* -1 is the "not found" answer of class_name2dev() */
+	if (dev == -1) {
+		CDEBUG(D_IOCTL, "No device for name %s!\n", name);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	CDEBUG(D_IOCTL, "device name %s, dev %d\n", name, dev);
+	rc = dev;
+
+out:
+	RETURN(rc);
+}
+
+/**
+ * Handles an obdclass ioctl.
+ *
+ * \a arg is the user-space address of the ioctl payload. For
+ * LIBCFS_IOC_DEBUG_MASK that is a struct libcfs_debug_ioctl_data; for every
+ * other command it is a struct obd_ioctl_data, fetched with
+ * obd_ioctl_getdata() and, where a reply is produced, written back with
+ * obd_ioctl_popdata().
+ *
+ * The commands of the first switch need no device. All remaining commands
+ * act on the device selected by ioc_dev (or by the name in ioc_inlbuf4 when
+ * ioc_dev == OBD_DEV_BY_DEVNAME); that device must be set up and must not be
+ * stopping.
+ *
+ * \retval 0        success
+ * \retval -EFAULT  data could not be copied from or to user space
+ * \retval -EINVAL  malformed request or unusable device
+ * \retval other    error of the command handler
+ */
+int class_handle_ioctl(unsigned int cmd, unsigned long arg)
+{
+	char *buf = NULL;
+	struct obd_ioctl_data *data;
+	struct libcfs_debug_ioctl_data debug_data;
+	struct obd_device *obd = NULL;
+	int err = 0, len = 0;
+	ENTRY;
+
+	/* only for debugging */
+	if (cmd == LIBCFS_IOC_DEBUG_MASK) {
+		/* \a arg is a user-space address, like for all the other
+		 * commands: copy the payload in rather than dereferencing
+		 * the pointer */
+		if (copy_from_user(&debug_data, (void *)arg,
+				   sizeof(debug_data)))
+			RETURN(-EFAULT);
+		libcfs_subsystem_debug = debug_data.subs;
+		libcfs_debug = debug_data.debug;
+		RETURN(0);
+	}
+
+	CDEBUG(D_IOCTL, "cmd = %x\n", cmd);
+	if (obd_ioctl_getdata(&buf, &len, (void *)arg)) {
+		CERROR("OBD ioctl: data error\n");
+		RETURN(-EINVAL);
+	}
+	data = (struct obd_ioctl_data *)buf;
+
+	/* commands that do not need a device */
+	switch (cmd) {
+	case OBD_IOC_PROCESS_CFG: {
+		struct lustre_cfg *lcfg;
+
+		if (!data->ioc_plen1 || !data->ioc_pbuf1) {
+			CERROR("No config buffer passed!\n");
+			GOTO(out, err = -EINVAL);
+		}
+		OBD_ALLOC(lcfg, data->ioc_plen1);
+		if (lcfg == NULL)
+			GOTO(out, err = -ENOMEM);
+		/* copy_from_user() returns the number of bytes it could not
+		 * copy, which is not an errno */
+		if (copy_from_user(lcfg, data->ioc_pbuf1, data->ioc_plen1))
+			err = -EFAULT;
+		if (!err)
+			err = lustre_cfg_sanity_check(lcfg, data->ioc_plen1);
+		if (!err)
+			err = class_process_config(lcfg);
+
+		OBD_FREE(lcfg, data->ioc_plen1);
+		GOTO(out, err);
+	}
+
+	case OBD_GET_VERSION:
+		if (!data->ioc_inlbuf1) {
+			CERROR("No buffer passed in ioctl\n");
+			GOTO(out, err = -EINVAL);
+		}
+
+		if (strlen(BUILD_VERSION) + 1 > data->ioc_inllen1) {
+			CERROR("ioctl buffer too small to hold version\n");
+			GOTO(out, err = -EINVAL);
+		}
+
+		memcpy(data->ioc_bulk, BUILD_VERSION,
+		       strlen(BUILD_VERSION) + 1);
+
+		err = obd_ioctl_popdata((void *)arg, data, len);
+		if (err)
+			err = -EFAULT;
+		GOTO(out, err);
+
+	case OBD_IOC_NAME2DEV: {
+		/* Resolve a device name.  This does not change the
+		 * currently selected device.
+		 */
+		int dev;
+
+		dev = class_resolve_dev_name(data->ioc_inllen1,
+					     data->ioc_inlbuf1);
+		data->ioc_dev = dev;
+		if (dev < 0)
+			GOTO(out, err = -EINVAL);
+
+		err = obd_ioctl_popdata((void *)arg, data, sizeof(*data));
+		if (err)
+			err = -EFAULT;
+		GOTO(out, err);
+	}
+
+	case OBD_IOC_UUID2DEV: {
+		/* Resolve a device uuid.  This does not change the
+		 * currently selected device.
+		 */
+		int dev;
+		struct obd_uuid uuid;
+
+		if (!data->ioc_inllen1 || !data->ioc_inlbuf1) {
+			CERROR("No UUID passed!\n");
+			GOTO(out, err = -EINVAL);
+		}
+		if (data->ioc_inlbuf1[data->ioc_inllen1 - 1] != 0) {
+			CERROR("UUID not NUL terminated!\n");
+			GOTO(out, err = -EINVAL);
+		}
+
+		CDEBUG(D_IOCTL, "device name %s\n", data->ioc_inlbuf1);
+		obd_str2uuid(&uuid, data->ioc_inlbuf1);
+		dev = class_uuid2dev(&uuid);
+		data->ioc_dev = dev;
+		if (dev == -1) {
+			CDEBUG(D_IOCTL, "No device for UUID %s!\n",
+			       data->ioc_inlbuf1);
+			GOTO(out, err = -EINVAL);
+		}
+
+		CDEBUG(D_IOCTL, "device name %s, dev %d\n", data->ioc_inlbuf1,
+		       dev);
+		err = obd_ioctl_popdata((void *)arg, data, sizeof(*data));
+		if (err)
+			err = -EFAULT;
+		GOTO(out, err);
+	}
+
+	case OBD_IOC_CLOSE_UUID: {
+		CDEBUG(D_IOCTL, "closing all connections to uuid %s (NOOP)\n",
+		       data->ioc_inlbuf1);
+		GOTO(out, err = 0);
+	}
+
+	case OBD_IOC_GETDEVICE: {
+		int index = data->ioc_count;
+		char *status, *str;
+
+		if (!data->ioc_inlbuf1) {
+			CERROR("No buffer passed in ioctl\n");
+			GOTO(out, err = -EINVAL);
+		}
+		if (data->ioc_inllen1 < 128) {
+			CERROR("ioctl buffer too small to hold version\n");
+			GOTO(out, err = -EINVAL);
+		}
+
+		obd = class_num2obd(index);
+		if (!obd)
+			GOTO(out, err = -ENOENT);
+
+		if (obd->obd_stopping)
+			status = "ST";
+		else if (obd->obd_set_up)
+			status = "UP";
+		else if (obd->obd_attached)
+			status = "AT";
+		else
+			status = "--";
+		str = (char *)data->ioc_bulk;
+		snprintf(str, len - sizeof(*data), "%3d %s %s %s %s %d",
+			 (int)index, status, obd->obd_type->typ_name,
+			 obd->obd_name, obd->obd_uuid.uuid,
+			 atomic_read(&obd->obd_refcount));
+		/* report a failed copy-out like the other commands do,
+		 * instead of claiming success */
+		err = obd_ioctl_popdata((void *)arg, data, len);
+		if (err)
+			err = -EFAULT;
+		GOTO(out, err);
+	}
+
+	}
+
+	/* everything below operates on a device: look it up first */
+	if (data->ioc_dev == OBD_DEV_BY_DEVNAME) {
+		if (data->ioc_inllen4 <= 0 || data->ioc_inlbuf4 == NULL)
+			GOTO(out, err = -EINVAL);
+		if (strnlen(data->ioc_inlbuf4, MAX_OBD_NAME) >= MAX_OBD_NAME)
+			GOTO(out, err = -EINVAL);
+		obd = class_name2obd(data->ioc_inlbuf4);
+	} else if (data->ioc_dev < class_devno_max()) {
+		obd = class_num2obd(data->ioc_dev);
+	} else {
+		CERROR("OBD ioctl: No device\n");
+		GOTO(out, err = -EINVAL);
+	}
+
+	if (obd == NULL) {
+		CERROR("OBD ioctl : No Device %d\n", data->ioc_dev);
+		GOTO(out, err = -EINVAL);
+	}
+	LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+
+	if (!obd->obd_set_up || obd->obd_stopping) {
+		CERROR("OBD ioctl: device not setup %d \n", data->ioc_dev);
+		GOTO(out, err = -EINVAL);
+	}
+
+	switch(cmd) {
+	case OBD_IOC_NO_TRANSNO: {
+		if (!obd->obd_attached) {
+			CERROR("Device %d not attached\n", obd->obd_minor);
+			GOTO(out, err = -ENODEV);
+		}
+		CDEBUG(D_HA, "%s: disabling committed-transno notification\n",
+		       obd->obd_name);
+		obd->obd_no_transno = 1;
+		GOTO(out, err = 0);
+	}
+
+	default: {
+		/* anything else is passed on to the device itself */
+		err = obd_iocontrol(cmd, obd->obd_self_export, len, data, NULL);
+		if (err)
+			GOTO(out, err);
+
+		err = obd_ioctl_popdata((void *)arg, data, len);
+		if (err)
+			err = -EFAULT;
+		GOTO(out, err);
+	}
+	}
+
+ out:
+	if (buf)
+		obd_ioctl_freedata(buf, len);
+	RETURN(err);
+} /* class_handle_ioctl */
+
+extern psdev_t obd_psdev;
+
+#define OBD_INIT_CHECK
+/**
+ * Start-up self checks of 64-bit handling: assignment and comparison of
+ * OBD_OBJECT_EOF, shifting, do_div(), the output length of the LPX64, LPU64
+ * and LPD64 format macros, and masking with CFS_PAGE_MASK.
+ *
+ * \retval 0          all checks passed
+ * \retval -EINVAL    a format length or the page mask check failed; the
+ *                    remaining checks are still run
+ * \retval -EOVERFLOW a comparison, shift or do_div() check failed; the
+ *                    shift and do_div() failures return at once.
+ *                    init_obdclass() aborts only on this value.
+ */
+int obd_init_checks(void)
+{
+	__u64 u64val, div64val;
+	char buf[64];
+	int len, ret = 0;
+
+	CDEBUG(D_INFO, "LPU64=%s, LPD64=%s, LPX64=%s\n", LPU64, LPD64, LPX64);
+
+	CDEBUG(D_INFO, "OBD_OBJECT_EOF = "LPX64"\n", (__u64)OBD_OBJECT_EOF);
+
+	u64val = OBD_OBJECT_EOF;
+	CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = "LPX64"\n", u64val);
+	if (u64val != OBD_OBJECT_EOF) {
+		CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n",
+		       u64val, (int)sizeof(u64val));
+		ret = -EINVAL;
+	}
+	/* 18 characters expected: "0x" followed by 16 hex digits */
+	len = snprintf(buf, sizeof(buf), LPX64, u64val);
+	if (len != 18) {
+		CWARN("LPX64 wrong length! strlen(%s)=%d != 18\n", buf, len);
+		ret = -EINVAL;
+	}
+
+	div64val = OBD_OBJECT_EOF;
+	/*
+	 * NOTE(review): the next check tests u64val again although div64val
+	 * was just assigned - possibly div64val was intended; confirm.
+	 */
+	CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = "LPX64"\n", u64val);
+	if (u64val != OBD_OBJECT_EOF) {
+		CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n",
+		       u64val, (int)sizeof(u64val));
+		ret = -EOVERFLOW;
+	}
+	if (u64val >> 8 != OBD_OBJECT_EOF >> 8) {
+		CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n",
+		       u64val, (int)sizeof(u64val));
+		return -EOVERFLOW;
+	}
+	/* do_div() leaves the quotient in div64val and yields the remainder */
+	if (do_div(div64val, 256) != (u64val & 255)) {
+		CERROR("do_div("LPX64",256) != "LPU64"\n", u64val, u64val &255);
+		return -EOVERFLOW;
+	}
+	if (u64val >> 8 != div64val) {
+		CERROR("do_div("LPX64",256) "LPU64" != "LPU64"\n",
+		       u64val, div64val, u64val >> 8);
+		return -EOVERFLOW;
+	}
+	len = snprintf(buf, sizeof(buf), LPX64, u64val);
+	if (len != 18) {
+		CWARN("LPX64 wrong length! strlen(%s)=%d != 18\n", buf, len);
+		ret = -EINVAL;
+	}
+	/* 20 characters expected: the decimal digits of 2^64 - 1 */
+	len = snprintf(buf, sizeof(buf), LPU64, u64val);
+	if (len != 20) {
+		CWARN("LPU64 wrong length! strlen(%s)=%d != 20\n", buf, len);
+		ret = -EINVAL;
+	}
+	/* 2 characters expected: the all-ones value printed signed is "-1" */
+	len = snprintf(buf, sizeof(buf), LPD64, u64val);
+	if (len != 2) {
+		CWARN("LPD64 wrong length! strlen(%s)=%d != 2\n", buf, len);
+		ret = -EINVAL;
+	}
+	/* the bits outside CFS_PAGE_MASK must stay below the page size */
+	if ((u64val & ~CFS_PAGE_MASK) >= PAGE_CACHE_SIZE) {
+		CWARN("mask failed: u64val "LPU64" >= "LPU64"\n", u64val,
+		      (__u64)PAGE_CACHE_SIZE);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+extern spinlock_t obd_types_lock;
+extern int class_procfs_init(void);
+extern int class_procfs_clean(void);
+
+/**
+ * Module initialisation of obdclass.
+ *
+ * The sub-systems are brought up one after another. When a step fails,
+ * everything initialised before it is torn down again, with the same calls
+ * cleanup_obdclass() uses, so that a failed load does not leave the misc
+ * device, caches, procfs entries etc. behind.
+ *
+ * \retval 0     success
+ * \retval other negative errno of the step that failed
+ */
+static int __init init_obdclass(void)
+{
+	int i, err;
+	int lustre_register_fs(void);
+
+	for (i = CAPA_SITE_CLIENT; i < CAPA_SITE_MAX; i++)
+		INIT_LIST_HEAD(&capa_list[i]);
+
+	LCONSOLE_INFO("Lustre: Build Version: "BUILD_VERSION"\n");
+
+	spin_lock_init(&obd_types_lock);
+	obd_zombie_impexp_init();
+#ifdef LPROCFS
+	obd_memory = lprocfs_alloc_stats(OBD_STATS_NUM,
+					 LPROCFS_STATS_FLAG_NONE |
+					 LPROCFS_STATS_FLAG_IRQ_SAFE);
+	if (obd_memory == NULL) {
+		CERROR("kmalloc of 'obd_memory' failed\n");
+		/* no stats to free yet, only the zombie thread to stop */
+		obd_zombie_impexp_stop();
+		return -ENOMEM;
+	}
+
+	lprocfs_counter_init(obd_memory, OBD_MEMORY_STAT,
+			     LPROCFS_CNTR_AVGMINMAX,
+			     "memused", "bytes");
+	lprocfs_counter_init(obd_memory, OBD_MEMORY_PAGES_STAT,
+			     LPROCFS_CNTR_AVGMINMAX,
+			     "pagesused", "pages");
+#endif
+	/* of the obd_init_checks() failures only -EOVERFLOW is fatal */
+	err = obd_init_checks();
+	if (err == -EOVERFLOW)
+		goto out_zombie;
+
+	class_init_uuidlist();
+	err = class_handle_init();
+	if (err)
+		goto out_uuidlist;
+
+	INIT_LIST_HEAD(&obd_types);
+
+	err = misc_register(&obd_psdev);
+	if (err) {
+		CERROR("cannot register %d err %d\n", OBD_DEV_MINOR, err);
+		goto out_handle;
+	}
+
+	/* This struct is already zeroed for us (static global) */
+	for (i = 0; i < class_devno_max(); i++)
+		obd_devs[i] = NULL;
+
+	/* Default the dirty page cache cap to 1/2 of system memory, or to
+	 * 1/4 on machines with at most 512MB: with less memory, a larger
+	 * fraction is needed for other purposes (mostly for BGL). */
+	if (num_physpages <= 512 << (20 - PAGE_CACHE_SHIFT))
+		obd_max_dirty_pages = num_physpages / 4;
+	else
+		obd_max_dirty_pages = num_physpages / 2;
+
+	err = obd_init_caches();
+	if (err)
+		goto out_misc;
+	err = class_procfs_init();
+	if (err)
+		goto out_caches;
+
+	err = lu_global_init();
+	if (err)
+		goto out_procfs;
+
+	err = cl_global_init();
+	if (err != 0)
+		goto out_lu;
+
+	err = llog_info_init();
+	if (err)
+		goto out_cl;
+
+	err = lustre_register_fs();
+	if (err)
+		goto out_llog;
+
+	return 0;
+
+	/* unwind in reverse order of initialisation */
+out_llog:
+	llog_info_fini();
+out_cl:
+	cl_global_fini();
+out_lu:
+	lu_global_fini();
+out_procfs:
+	/* NOTE(review): obd_sysctl_clean() is paired with class_procfs_init()
+	 * here because cleanup_obdclass() calls both; confirm where the
+	 * sysctl table is registered. */
+	obd_sysctl_clean();
+	class_procfs_clean();
+out_caches:
+	obd_cleanup_caches();
+out_misc:
+	misc_deregister(&obd_psdev);
+out_handle:
+	class_handle_cleanup();
+out_uuidlist:
+	class_exit_uuidlist();
+out_zombie:
+	obd_zombie_impexp_stop();
+	/* the stats go last, as in cleanup_obdclass() */
+	lprocfs_free_stats(&obd_memory);
+	return err;
+}
+
+/*
+ * Raises the recorded high-water marks (obd_max_pages, obd_max_alloc) when
+ * the current page or memory totals exceed them. The totals are sampled
+ * before obd_updatemax_lock is taken.
+ */
+void obd_update_maxusage(void)
+{
+	__u64 pages_now = obd_pages_sum();
+	__u64 bytes_now = obd_memory_sum();
+
+	spin_lock(&obd_updatemax_lock);
+	if (obd_max_pages < pages_now)
+		obd_max_pages = pages_now;
+	if (obd_max_alloc < bytes_now)
+		obd_max_alloc = bytes_now;
+	spin_unlock(&obd_updatemax_lock);
+}
+EXPORT_SYMBOL(obd_update_maxusage);
+
+#ifdef LPROCFS
+/* Returns obd_max_alloc, read under obd_updatemax_lock. */
+__u64 obd_memory_max(void)
+{
+	__u64 peak;
+
+	spin_lock(&obd_updatemax_lock);
+	peak = obd_max_alloc;
+	spin_unlock(&obd_updatemax_lock);
+
+	return peak;
+}
+EXPORT_SYMBOL(obd_memory_max);
+
+/* Returns obd_max_pages, read under obd_updatemax_lock. */
+__u64 obd_pages_max(void)
+{
+	__u64 peak;
+
+	spin_lock(&obd_updatemax_lock);
+	peak = obd_max_pages;
+	spin_unlock(&obd_updatemax_lock);
+
+	return peak;
+}
+EXPORT_SYMBOL(obd_pages_max);
+#endif
+
+/* liblustre doesn't call cleanup_obdclass, apparently.  we carry on in this
+ * ifdef to the end of the file to cover module and versioning goo.*/
+/*
+ * Module exit: unregisters the filesystem and the misc device, detaches the
+ * devices that are still set up, shuts the sub-systems down and finally
+ * reports the peak and the leaked memory/page counts.
+ */
+static void cleanup_obdclass(void)
+{
+	int i;
+	int lustre_unregister_fs(void);
+	__u64 memory_leaked, pages_leaked;
+	__u64 memory_max, pages_max;
+	ENTRY;
+
+	lustre_unregister_fs();
+
+	misc_deregister(&obd_psdev);
+	/* detach every device that is still set up and has a detach method */
+	for (i = 0; i < class_devno_max(); i++) {
+		struct obd_device *obd = class_num2obd(i);
+		if (obd && obd->obd_set_up &&
+		    OBT(obd) && OBP(obd, detach)) {
+			/* XXX should this call generic detach otherwise? */
+			LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+			OBP(obd, detach)(obd);
+		}
+	}
+	llog_info_fini();
+	cl_global_fini();
+	lu_global_fini();
+
+	obd_cleanup_caches();
+	obd_sysctl_clean();
+
+	class_procfs_clean();
+
+	class_handle_cleanup();
+	class_exit_uuidlist();
+	obd_zombie_impexp_stop();
+
+	/* sample the counters before the stats behind them are freed */
+	memory_leaked = obd_memory_sum();
+	pages_leaked = obd_pages_sum();
+
+	memory_max = obd_memory_max();
+	pages_max = obd_pages_max();
+
+	lprocfs_free_stats(&obd_memory);
+	/* a non-zero leak is logged at D_ERROR, otherwise at D_INFO */
+	CDEBUG((memory_leaked) ? D_ERROR : D_INFO,
+	       "obd_memory max: "LPU64", leaked: "LPU64"\n",
+	       memory_max, memory_leaked);
+	CDEBUG((pages_leaked) ? D_ERROR : D_INFO,
+	       "obd_memory_pages max: "LPU64", leaked: "LPU64"\n",
+	       pages_max, pages_leaked);
+
+	EXIT;
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Class Driver Build Version: " BUILD_VERSION);
+MODULE_LICENSE("GPL");
+
+cfs_module(obdclass, LUSTRE_VERSION_STRING, init_obdclass, cleanup_obdclass);
diff --git a/drivers/staging/lustre/lustre/obdclass/debug.c b/drivers/staging/lustre/lustre/obdclass/debug.c
new file mode 100644
index 000000000000..15f71bbb7276
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/debug.c
@@ -0,0 +1,124 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/debug.c
+ *
+ * Helper routines for dumping data structs for debugging.
+ */
+
+#define DEBUG_SUBSYSTEM D_OTHER
+
+
+#include <obd_ost.h>
+#include <obd_support.h>
+#include <lustre_debug.h>
+#include <lustre_net.h>
+
+void dump_lniobuf(struct niobuf_local *nb)
+{
+	CDEBUG(D_RPCTRACE, /* first message: scalar fields of the niobuf */
+	       "niobuf_local: file_offset="LPD64", len=%d, page=%p, rc=%d\n",
+	       nb->lnb_file_offset, nb->len, nb->page, nb->rc);
+	CDEBUG(D_RPCTRACE, "nb->page: index = %ld\n",
+			nb->page ? page_index(nb->page) : -1); /* -1: no page */
+}
+EXPORT_SYMBOL(dump_lniobuf);
+
+void dump_lsm(int level, struct lov_stripe_md *lsm)
+{	/* logs the fields of lsm using level as the CDEBUG mask */
+	CDEBUG(level, "lsm %p, objid "DOSTID", maxbytes "LPX64", magic 0x%08X,"
+	       " stripe_size %u, stripe_count %u, refc: %d,"
+	       " layout_gen %u, pool ["LOV_POOLNAMEF"]\n", lsm,
+	       POSTID(&lsm->lsm_oi), lsm->lsm_maxbytes, lsm->lsm_magic,
+	       lsm->lsm_stripe_size, lsm->lsm_stripe_count,
+	       atomic_read(&lsm->lsm_refc), lsm->lsm_layout_gen,
+	       lsm->lsm_pool_name); /* lsm is dereferenced: must not be NULL */
+}
+EXPORT_SYMBOL(dump_lsm);
+
+#define LPDS sizeof(__u64)
+/* Stamp converted (off, id) pairs at the start and end of the len bytes at addr. */
+int block_debug_setup(void *addr, int len, __u64 off, __u64 id)
+{
+	void *tail;	/* where the trailing stamp begins */
+
+	LASSERT(addr);
+	tail = addr + (len - LPDS - LPDS);
+	off = cpu_to_le64 (off);
+	id = cpu_to_le64 (id);
+	memcpy(addr, (char *)&off, LPDS);
+	memcpy(addr + LPDS, (char *)&id, LPDS);
+	memcpy(tail, (char *)&off, LPDS);
+	memcpy(tail + LPDS, (char *)&id, LPDS);
+	return 0;
+}
+EXPORT_SYMBOL(block_debug_setup);
+
+int block_debug_check(char *who, void *addr, int end, __u64 off, __u64 id)
+{
+	__u64 ne_off;
+	int err = 0; /* becomes -EINVAL as soon as any stamp mismatches */
+
+	LASSERT(addr);
+	/* first compare the (off, id) stamp at the start of the buffer */
+	ne_off = le64_to_cpu (off);
+	id = le64_to_cpu (id); /* NOTE(review): messages print this converted id */
+	if (memcmp(addr, (char *)&ne_off, LPDS)) {
+		CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" off: "LPX64" != "
+		       LPX64"\n", who, id, off, *(__u64 *)addr, ne_off);
+		err = -EINVAL;
+	}
+	if (memcmp(addr + LPDS, (char *)&id, LPDS)) {
+		CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" id: "LPX64" != "LPX64"\n",
+		       who, id, off, *(__u64 *)(addr + LPDS), id);
+		err = -EINVAL;
+	}
+	/* then the stamp occupying the last 2 * LPDS bytes before end */
+	addr += end - LPDS - LPDS;
+	if (memcmp(addr, (char *)&ne_off, LPDS)) {
+		CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" end off: "LPX64" != "
+		       LPX64"\n", who, id, off, *(__u64 *)addr, ne_off);
+		err = -EINVAL;
+	}
+	if (memcmp(addr + LPDS, (char *)&id, LPDS)) {
+		CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" end id: "LPX64" != "
+		       LPX64"\n", who, id, off, *(__u64 *)(addr + LPDS), id);
+		err = -EINVAL;
+	}
+
+	return err;
+}
+EXPORT_SYMBOL(block_debug_check);
+#undef LPDS
diff --git a/drivers/staging/lustre/lustre/obdclass/dt_object.c b/drivers/staging/lustre/lustre/obdclass/dt_object.c
new file mode 100644
index 000000000000..4303698ca643
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/dt_object.c
@@ -0,0 +1,1055 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/dt_object.c
+ *
+ * Dt Object.
+ * Generic functions from dt_object.h
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd.h>
+#include <dt_object.h>
+#include <linux/list.h>
+/* fid_be_to_cpu() */
+#include <lustre_fid.h>
+
+#include <lustre_quota.h>
+
+/* context key constructor/destructor: dt_global_key_init, dt_global_key_fini */
+LU_KEY_INIT(dt_global, struct dt_thread_info);
+LU_KEY_FINI(dt_global, struct dt_thread_info);
+
+struct lu_context_key dt_key = {
+	.lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD | LCT_LOCAL,
+	.lct_init = dt_global_key_init, /* from LU_KEY_INIT(dt_global, ...) */
+	.lct_fini = dt_global_key_fini /* from LU_KEY_FINI(dt_global, ...) */
+};
+EXPORT_SYMBOL(dt_key);
+
+/* no lock is necessary to protect the list, because call-backs
+ * are added during system startup. Please refer to "struct dt_device".
+ */
+void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb)
+{
+	list_add(&cb->dtc_linkage, &dev->dd_txn_callbacks); /* add at list head */
+}
+EXPORT_SYMBOL(dt_txn_callback_add);
+
+void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb)
+{
+	list_del_init(&cb->dtc_linkage); /* unlink cb; dev is not consulted */
+}
+EXPORT_SYMBOL(dt_txn_callback_del);
+
+int dt_txn_hook_start(const struct lu_env *env,
+		      struct dt_device *dev, struct thandle *th)
+{
+	struct dt_txn_callback *hook;
+	int rc = 0;
+
+	/* local transactions run no hooks; others stop at the first failure */
+	if (th->th_local)
+		return 0;
+	list_for_each_entry(hook, &dev->dd_txn_callbacks, dtc_linkage) {
+		if (hook->dtc_txn_start != NULL &&
+		    (hook->dtc_tag & env->le_ctx.lc_tags) != 0) {
+			rc = hook->dtc_txn_start(env, th, hook->dtc_cookie);
+			if (rc < 0)
+				break;
+		}
+	}
+	return rc;
+}
+EXPORT_SYMBOL(dt_txn_hook_start);
+
+int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn)
+{
+	struct dt_txn_callback *hook;
+	struct dt_device *dev = txn->th_dev;
+	int rc = 0;
+
+	/* local transactions run no hooks; others stop at the first failure */
+	if (txn->th_local)
+		return 0;
+	list_for_each_entry(hook, &dev->dd_txn_callbacks, dtc_linkage) {
+		if (hook->dtc_txn_stop != NULL &&
+		    (hook->dtc_tag & env->le_ctx.lc_tags) != 0) {
+			rc = hook->dtc_txn_stop(env, txn, hook->dtc_cookie);
+			if (rc < 0)
+				break;
+		}
+	}
+	return rc;
+}
+EXPORT_SYMBOL(dt_txn_hook_stop);
+
+void dt_txn_hook_commit(struct thandle *txn)
+{
+	struct dt_txn_callback *hook;
+	struct dt_device *dev;
+
+	if (txn->th_local)	/* local transactions run no commit hooks */
+		return;
+	dev = txn->th_dev;
+	list_for_each_entry(hook, &dev->dd_txn_callbacks, dtc_linkage) {
+		if (hook->dtc_txn_commit != NULL)
+			hook->dtc_txn_commit(txn, hook->dtc_cookie);
+	}
+}
+EXPORT_SYMBOL(dt_txn_hook_commit);
+
+int dt_device_init(struct dt_device *dev, struct lu_device_type *t)
+{
+	/* start with an empty transaction-callback list */
+	INIT_LIST_HEAD(&dev->dd_txn_callbacks);
+	return lu_device_init(&dev->dd_lu_dev, t);
+}
+EXPORT_SYMBOL(dt_device_init);
+
+void dt_device_fini(struct dt_device *dev)
+{
+	lu_device_fini(&dev->dd_lu_dev); /* finalize the embedded lu_device */
+}
+EXPORT_SYMBOL(dt_device_fini);
+
+/* Initialize the lu_object embedded in obj (obj->do_lu) via lu_object_init(). */
+int dt_object_init(struct dt_object *obj,
+		   struct lu_object_header *h, struct lu_device *d)
+{
+	return lu_object_init(&obj->do_lu, h, d);
+}
+EXPORT_SYMBOL(dt_object_init);
+
+void dt_object_fini(struct dt_object *obj)
+{
+	lu_object_fini(&obj->do_lu); /* finalize the embedded lu_object */
+}
+EXPORT_SYMBOL(dt_object_fini);
+
+int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj)
+{
+	if (obj->do_index_ops == NULL) /* the try's own return value is ignored */
+		obj->do_ops->do_index_try(env, obj, &dt_directory_features);
+	return obj->do_index_ops != NULL; /* nonzero iff index ops are now set */
+}
+EXPORT_SYMBOL(dt_try_as_dir);
+
+enum dt_format_type dt_mode_to_dft(__u32 mode)
+{
+	enum dt_format_type result;
+
+	switch (mode & S_IFMT) { /* only the file-type bits of mode matter */
+	case S_IFDIR:
+		result = DFT_DIR;
+		break;
+	case S_IFREG:
+		result = DFT_REGULAR;
+		break;
+	case S_IFLNK:
+		result = DFT_SYM;
+		break;
+	case S_IFCHR: /* all special files share DFT_NODE */
+	case S_IFBLK:
+	case S_IFIFO:
+	case S_IFSOCK:
+		result = DFT_NODE;
+		break;
+	default: /* NOTE(review): result stays unset; relies on LBUG() - confirm */
+		LBUG();
+		break;
+	}
+	return result;
+}
+EXPORT_SYMBOL(dt_mode_to_dft);
+
+/**
+ * Look up the fid of the entry named \a name in directory \a dir.
+ * \retval -ENOTDIR if \a dir cannot be used as a directory index
+ */
+int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir,
+		  const char *name, struct lu_fid *fid)
+{
+	if (!dt_try_as_dir(env, dir))
+		return -ENOTDIR;
+	return dt_lookup(env, dir, (struct dt_rec *)fid,
+			 (const struct dt_key *)name, BYPASS_CAPA);
+}
+EXPORT_SYMBOL(dt_lookup_dir);
+
+/* Like dt_locate(), but the top device is passed in as top_dev instead of
+ * being taken from the lu_site.  Returns the layer of the object that belongs
+ * to dev, an ERR_PTR from the lookup, or ERR_PTR(-ENOENT) if dev has no layer. */
+struct dt_object *dt_locate_at(const struct lu_env *env,
+			       struct dt_device *dev, const struct lu_fid *fid,
+			       struct lu_device *top_dev)
+{
+	struct lu_object *lo, *n;
+	ENTRY;
+
+	lo = lu_object_find_at(env, top_dev, fid, NULL);
+	if (IS_ERR(lo))
+		return (void *)lo;
+	LASSERT(lo != NULL);
+	list_for_each_entry(n, &lo->lo_header->loh_layers, lo_linkage) {
+		if (n->lo_dev == &dev->dd_lu_dev)
+			return container_of0(n, struct dt_object, do_lu);
+	}
+	lu_object_put(env, lo);	/* no layer for dev: drop the lookup reference */
+	return ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL(dt_locate_at);
+
+/**
+ * Find the object named \a entry in directory \a dfh->dfh_o; store it there.
+ */
+static int dt_find_entry(const struct lu_env *env, const char *entry, void *data)
+{
+	struct dt_find_hint  *dfh = data;
+	struct dt_device     *dt = dfh->dfh_dt;
+	struct lu_fid	*fid = dfh->dfh_fid;
+	struct dt_object     *obj = dfh->dfh_o;
+	int		   result;
+
+	result = dt_lookup_dir(env, obj, entry, fid);
+	lu_object_put(env, &obj->do_lu); /* directory is released either way */
+	if (result == 0) {
+		obj = dt_locate(env, dt, fid);
+		if (IS_ERR(obj))
+			result = PTR_ERR(obj);
+	}
+	dfh->dfh_o = obj; /* on failure: an ERR_PTR or the already-put directory */
+	return result;
+}
+
+/**
+ * Split \a path in place at each '/' and feed every non-empty component to
+ * \a entry_func together with \a data.
+ *
+ * \retval 0 if every call returned 0 (or no component was found)
+ * \retval first non-zero value returned by \a entry_func
+ */
+int dt_path_parser(const struct lu_env *env,
+		   char *path, dt_entry_func_t entry_func,
+		   void *data)
+{
+	char *comp;
+	int rc = 0;
+
+	while ((comp = strsep(&path, "/")) != NULL) {
+		if (comp[0] == '\0') {
+			/* empty component: stop if nothing follows it */
+			if (path == NULL || path[0] == '\0')
+				break;
+			continue;
+		}
+		rc = entry_func(env, comp, data);
+		if (rc != 0)
+			break;
+	}
+
+	return rc;
+}
+
+/*
+ * Resolve \a path, component by component, starting from the root of \a dt.
+ * \a fid is handed to every lookup along the way.  Returns the resolved
+ * object or an ERR_PTR.
+ */
+struct dt_object *
+dt_store_resolve(const struct lu_env *env, struct dt_device *dt,
+		 const char *path, struct lu_fid *fid)
+{
+	struct dt_thread_info *info = dt_info(env);
+	struct dt_find_hint   *hint = &info->dti_dfh;
+	char		      *copy = info->dti_buf;
+	struct dt_object      *root;
+	int		       rc;
+
+	hint->dfh_dt = dt;
+	hint->dfh_fid = fid;
+
+	/* dt_path_parser() modifies its input, so work on a bounded copy */
+	strncpy(copy, path, DT_MAX_PATH);
+	copy[DT_MAX_PATH - 1] = '\0';
+
+	rc = dt->dd_ops->dt_root_get(env, dt, fid);
+	if (rc != 0)
+		return ERR_PTR(rc);
+
+	root = dt_locate(env, dt, fid);
+	if (IS_ERR(root))
+		return root;
+	hint->dfh_o = root;
+	rc = dt_path_parser(env, copy, dt_find_entry, hint);
+	return rc != 0 ? ERR_PTR(rc) : hint->dfh_o;
+}
+EXPORT_SYMBOL(dt_store_resolve);
+
+/*
+ * Look up \a name in directory \a p and locate the object it names on \a dt.
+ * Returns the result of dt_locate(), or an ERR_PTR if the lookup fails.
+ */
+static struct dt_object *dt_reg_open(const struct lu_env *env,
+				     struct dt_device *dt,
+				     struct dt_object *p,
+				     const char *name,
+				     struct lu_fid *fid)
+{
+	int rc;
+
+	rc = dt_lookup_dir(env, p, name, fid);
+	if (rc != 0)
+		return ERR_PTR(rc);
+
+	return dt_locate(env, dt, fid);
+}
+
+/**
+ * Open dt object named \a filename from \a dirname directory.
+ *      \param  dt        dt device
+ *      \param  dirname   path of the directory, resolved by dt_store_resolve()
+ *      \param  filename  name looked up in that directory
+ *      \param  fid       on success, object fid is stored in *fid
+ *      \retval object on success, ERR_PTR on failure
+ */
+struct dt_object *dt_store_open(const struct lu_env *env,
+				struct dt_device *dt,
+				const char *dirname,
+				const char *filename,
+				struct lu_fid *fid)
+{
+	struct dt_object *parent;
+	struct dt_object *child;
+
+	parent = dt_store_resolve(env, dt, dirname, fid);
+	if (IS_ERR(parent))
+		return parent;
+	child = dt_reg_open(env, dt, parent, filename, fid);
+	lu_object_put(env, &parent->do_lu);
+	return child;
+}
+EXPORT_SYMBOL(dt_store_open);
+
+/* Locate \a fid on \a dt; create it in a local transaction if it is missing. */
+struct dt_object *dt_find_or_create(const struct lu_env *env,
+				    struct dt_device *dt,
+				    const struct lu_fid *fid,
+				    struct dt_object_format *dof,
+				    struct lu_attr *at)
+{
+	struct dt_object *dto;
+	struct thandle *th;
+	int rc;
+	ENTRY;
+
+	dto = dt_locate(env, dt, fid);
+	if (IS_ERR(dto))
+		RETURN(dto);
+
+	LASSERT(dto != NULL);
+	if (dt_object_exists(dto)) /* already there: nothing to create */
+		RETURN(dto);
+
+	th = dt_trans_create(env, dt);
+	if (IS_ERR(th))
+		GOTO(out, rc = PTR_ERR(th));
+
+	rc = dt_declare_create(env, dto, at, NULL, dof, th);
+	if (rc)
+		GOTO(trans_stop, rc);
+
+	rc = dt_trans_start_local(env, dt, th);
+	if (rc)
+		GOTO(trans_stop, rc);
+
+	dt_write_lock(env, dto, 0);
+	if (dt_object_exists(dto)) /* re-check now that the write lock is held */
+		GOTO(unlock, rc = 0);
+
+	CDEBUG(D_OTHER, "create new object "DFID"\n", PFID(fid));
+
+	rc = dt_create(env, dto, at, NULL, dof, th);
+	if (rc)
+		GOTO(unlock, rc);
+	LASSERT(dt_object_exists(dto));
+unlock:
+	dt_write_unlock(env, dto);
+trans_stop:
+	dt_trans_stop(env, dt, th);
+out:
+	if (rc) { /* failure: give back the reference obtained by dt_locate() */
+		lu_object_put(env, &dto->do_lu);
+		RETURN(ERR_PTR(rc));
+	}
+	RETURN(dto);
+}
+EXPORT_SYMBOL(dt_find_or_create);
+
+/*
+ * dt class init function: set up dt_key and register it as a context key.
+ * Returns the status of the registration.
+ */
+int dt_global_init(void)
+{
+	LU_CONTEXT_KEY_INIT(&dt_key);
+	return lu_context_key_register(&dt_key);
+}
+
+void dt_global_fini(void)
+{
+	lu_context_key_degister(&dt_key); /* undoes the dt_global_init() register */
+}
+
+/**
+ * Generic read helper; returns the result of dt's dbo_read method unchanged
+ * (how partial reads are reported is up to that dbo_read implementation).
+ * \param env  lustre environment
+ * \param dt   object to be read
+ * \param buf  lu_buf to be filled, with buffer pointer and length
+ * \param pos position to start reading, updated as data is read
+ *
+ * \retval real size of data read
+ * \retval -ve errno on failure
+ */
+int dt_read(const struct lu_env *env, struct dt_object *dt,
+	    struct lu_buf *buf, loff_t *pos)
+{
+	LASSERTF(dt != NULL, "dt is NULL when we want to read record\n");
+	return dt->do_body_ops->dbo_read(env, dt, buf, pos, BYPASS_CAPA);
+}
+EXPORT_SYMBOL(dt_read);
+
+/**
+ * Read a fixed-size structure from storage.  In contrast to dt_read(), a
+ * read that does not fill the whole buffer is reported as an error.
+ *
+ * \param env  lustre environment
+ * \param dt   object to be read
+ * \param buf  lu_buf to be filled, with buffer pointer and length
+ * \param pos  position to start reading, updated as data is read
+ *
+ * \retval 0 on successfully reading full buffer
+ * \retval -EFAULT on short read
+ * \retval -ve errno on failure
+ */
+int dt_record_read(const struct lu_env *env, struct dt_object *dt,
+		   struct lu_buf *buf, loff_t *pos)
+{
+	int nread;
+
+	LASSERTF(dt != NULL, "dt is NULL when we want to read record\n");
+
+	nread = dt->do_body_ops->dbo_read(env, dt, buf, pos, BYPASS_CAPA);
+	if (nread == buf->lb_len)
+		return 0;
+
+	/* any other non-negative count is a short read and becomes -EFAULT;
+	 * negative values are passed through as they are */
+	return nread >= 0 ? -EFAULT : nread;
+}
+EXPORT_SYMBOL(dt_record_read);
+
+int dt_record_write(const struct lu_env *env, struct dt_object *dt,
+		    const struct lu_buf *buf, loff_t *pos, struct thandle *th)
+{
+	int nwritten;
+
+	LASSERTF(dt != NULL, "dt is NULL when we want to write record\n");
+	LASSERT(th != NULL);
+	LASSERT(dt->do_body_ops);
+	LASSERT(dt->do_body_ops->dbo_write);
+
+	nwritten = dt->do_body_ops->dbo_write(env, dt, buf, pos, th, BYPASS_CAPA, 1);
+	if (nwritten == buf->lb_len)
+		return 0;
+	/* a short write is reported as -EFAULT; negative values pass through */
+	return nwritten >= 0 ? -EFAULT : nwritten;
+}
+EXPORT_SYMBOL(dt_record_write);
+
+/* Declare to \a th a set of o's XATTR_NAME_VERSION xattr (dt_obj_version_t sized). */
+int dt_declare_version_set(const struct lu_env *env, struct dt_object *o,
+			   struct thandle *th)
+{
+	struct lu_buf vbuf;
+	char *xname = XATTR_NAME_VERSION;
+
+	LASSERT(o);
+	vbuf.lb_buf = NULL;
+	vbuf.lb_len = sizeof(dt_obj_version_t);
+	return dt_declare_xattr_set(env, o, &vbuf, xname, 0, th);
+}
+EXPORT_SYMBOL(dt_declare_version_set);
+
+void dt_version_set(const struct lu_env *env, struct dt_object *o,
+		    dt_obj_version_t version, struct thandle *th)
+{
+	char *xattr = XATTR_NAME_VERSION;
+	struct lu_buf value;
+	int rc;
+
+	LASSERT(o);
+	value.lb_buf = &version;
+	value.lb_len = sizeof(version);
+
+	/* a failure is only logged; nothing is reported to the caller */
+	rc = dt_xattr_set(env, o, &value, xattr, 0, th, BYPASS_CAPA);
+	if (rc < 0)
+		CDEBUG(D_INODE, "Can't set version, rc %d\n", rc);
+}
+EXPORT_SYMBOL(dt_version_set);
+
+dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o)
+{
+	char *xattr = XATTR_NAME_VERSION;
+	dt_obj_version_t version;
+	struct lu_buf value;
+	int rc;
+
+	LASSERT(o);
+	value.lb_buf = &version;
+	value.lb_len = sizeof(version);
+	rc = dt_xattr_get(env, o, &value, xattr, BYPASS_CAPA);
+	if (rc == sizeof(version))
+		return version;
+	/* error or unexpected size: log it and fall back to version 0 */
+	CDEBUG(D_INODE, "Can't get version, rc %d\n", rc);
+	return 0;
+}
+EXPORT_SYMBOL(dt_version_get);
+
+/* list of all supported index types */
+
+/* directories: defined here without an initializer, hence zero-initialized */
+const struct dt_index_features dt_directory_features;
+EXPORT_SYMBOL(dt_directory_features);
+
+/* scrub iterator: defined here without an initializer, hence zero-initialized */
+const struct dt_index_features dt_otable_features;
+EXPORT_SYMBOL(dt_otable_features);
+
+/* lfsck: fixed-size keys (one struct lu_fid) and fixed one-byte records */
+const struct dt_index_features dt_lfsck_features = {
+	.dif_flags		= DT_IND_UPDATE,
+	.dif_keysize_min	= sizeof(struct lu_fid),
+	.dif_keysize_max	= sizeof(struct lu_fid),
+	.dif_recsize_min	= sizeof(__u8),
+	.dif_recsize_max	= sizeof(__u8),
+	.dif_ptrsize		= 4
+};
+EXPORT_SYMBOL(dt_lfsck_features);
+
+/* accounting indexes: fixed 64-bit keys, fixed struct lquota_acct_rec records */
+const struct dt_index_features dt_acct_features = {
+	.dif_flags		= DT_IND_UPDATE,
+	.dif_keysize_min	= sizeof(__u64), /* 64-bit uid/gid */
+	.dif_keysize_max	= sizeof(__u64), /* 64-bit uid/gid */
+	.dif_recsize_min	= sizeof(struct lquota_acct_rec), /* 16 bytes */
+	.dif_recsize_max	= sizeof(struct lquota_acct_rec), /* 16 bytes */
+	.dif_ptrsize		= 4
+};
+EXPORT_SYMBOL(dt_acct_features);
+
+/* global quota files: fixed 64-bit keys, fixed struct lquota_glb_rec records */
+const struct dt_index_features dt_quota_glb_features = {
+	.dif_flags		= DT_IND_UPDATE,
+	/* a different key would have to be used for per-directory quota */
+	.dif_keysize_min	= sizeof(__u64), /* 64-bit uid/gid */
+	.dif_keysize_max	= sizeof(__u64), /* 64-bit uid/gid */
+	.dif_recsize_min	= sizeof(struct lquota_glb_rec), /* 32 bytes */
+	.dif_recsize_max	= sizeof(struct lquota_glb_rec), /* 32 bytes */
+	.dif_ptrsize		= 4
+};
+EXPORT_SYMBOL(dt_quota_glb_features);
+
+/* slave quota files: fixed 64-bit keys, fixed struct lquota_slv_rec records */
+const struct dt_index_features dt_quota_slv_features = {
+	.dif_flags		= DT_IND_UPDATE,
+	/* a different key would have to be used for per-directory quota */
+	.dif_keysize_min	= sizeof(__u64), /* 64-bit uid/gid */
+	.dif_keysize_max	= sizeof(__u64), /* 64-bit uid/gid */
+	.dif_recsize_min	= sizeof(struct lquota_slv_rec), /* 8 bytes */
+	.dif_recsize_max	= sizeof(struct lquota_slv_rec), /* 8 bytes */
+	.dif_ptrsize		= 4
+};
+EXPORT_SYMBOL(dt_quota_slv_features);
+
+/* helper function returning what dt_index_features structure should be used
+ * based on the FID sequence. This is used by OBD_IDX_READ RPC
+ *
+ * Returns ERR_PTR(-ENOENT) for a quota index that is not a regular file,
+ * ERR_PTR(-ENOTDIR) for a namespace object that is not a directory and
+ * ERR_PTR(-EOPNOTSUPP) for any other sequence. */
+static inline const struct dt_index_features *dt_index_feat_select(__u64 seq,
+								   __u32 mode)
+{
+	if (seq == FID_SEQ_QUOTA_GLB)
+		/* global quota index should be a regular file */
+		return S_ISREG(mode) ? &dt_quota_glb_features :
+				       ERR_PTR(-ENOENT);
+
+	if (seq == FID_SEQ_QUOTA)
+		/* slave index should be a regular file */
+		return S_ISREG(mode) ? &dt_quota_slv_features :
+				       ERR_PTR(-ENOENT);
+
+	if (seq >= FID_SEQ_NORMAL)
+		/* object is part of the namespace, so it has to be a
+		 * directory; sorry, we can only deal with directory */
+		return S_ISDIR(mode) ? &dt_directory_features :
+				       ERR_PTR(-ENOTDIR);
+
+	/* no index features are defined for any other sequence */
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+/*
+ * Fill a lu_idxpage with key/record pairs read for transfer via OBD_IDX_READ
+ * RPC
+ * Returns 0 when the container was filled, > 0 at end of index, < 0 on error.
+ * \param env - is the environment passed by the caller
+ * \param lp  - is a pointer to the lu_page to fill
+ * \param nob - is the maximum number of bytes that should be copied
+ * \param iops - is the index operation vector associated with the index object
+ * \param it   - is a pointer to the current iterator
+ * \param attr - is the index attribute to pass to iops->rec()
+ * \param arg  - is a pointer to the idx_info structure
+ */
+static int dt_index_page_build(const struct lu_env *env, union lu_page *lp,
+			       int nob, const struct dt_it_ops *iops,
+			       struct dt_it *it, __u32 attr, void *arg)
+{
+	struct idx_info		*ii = (struct idx_info *)arg;
+	struct lu_idxpage	*lip = &lp->lp_idx;
+	char			*entry;
+	int			 rc, size;
+	ENTRY;
+
+	/* no support for variable key & record size for now */
+	LASSERT((ii->ii_flags & II_FL_VARKEY) == 0);
+	LASSERT((ii->ii_flags & II_FL_VARREC) == 0);
+
+	/* initialize the header of the new container */
+	memset(lip, 0, LIP_HDR_SIZE);
+	lip->lip_magic = LIP_MAGIC;
+	nob	   -= LIP_HDR_SIZE;
+
+	/* compute size needed to store a key/record pair */
+	size = ii->ii_recsize + ii->ii_keysize;
+	if ((ii->ii_flags & II_FL_NOHASH) == 0)
+		/* add hash if the client wants it */
+		size += sizeof(__u64);
+
+	entry = lip->lip_entries;
+	do {
+		char		*tmp_entry = entry;
+		struct dt_key	*key;
+		__u64		 hash;
+
+		/* fetch 64-bit hash value */
+		hash = iops->store(env, it);
+		ii->ii_hash_end = hash;
+
+		if (OBD_FAIL_CHECK(OBD_FAIL_OBD_IDX_READ_BREAK)) {
+			if (lip->lip_nr != 0)
+				GOTO(out, rc = 0);
+		}
+
+		if (nob < size) { /* no room left for another entry */
+			if (lip->lip_nr == 0)
+				GOTO(out, rc = -EINVAL);
+			GOTO(out, rc = 0);
+		}
+
+		if ((ii->ii_flags & II_FL_NOHASH) == 0) {
+			/* client wants the 64-bit hash value associated with
+			 * each record */
+			memcpy(tmp_entry, &hash, sizeof(hash));
+			tmp_entry += sizeof(hash);
+		}
+
+		/* then the key value */
+		LASSERT(iops->key_size(env, it) == ii->ii_keysize);
+		key = iops->key(env, it);
+		memcpy(tmp_entry, key, ii->ii_keysize);
+		tmp_entry += ii->ii_keysize;
+
+		/* and finally the record */
+		rc = iops->rec(env, it, (struct dt_rec *)tmp_entry, attr);
+		if (rc != -ESTALE) { /* on -ESTALE the entry is simply not counted */
+			if (rc != 0)
+				GOTO(out, rc);
+
+			/* hash/key/record successfully copied! */
+			lip->lip_nr++;
+			if (unlikely(lip->lip_nr == 1 && ii->ii_count == 0))
+				ii->ii_hash_start = hash;
+			entry = tmp_entry + ii->ii_recsize;
+			nob -= size;
+		}
+
+		/* move on to the next record */
+		do {
+			rc = iops->next(env, it);
+		} while (rc == -ESTALE);
+
+	} while (rc == 0);
+
+	GOTO(out, rc);
+out:
+	if (rc >= 0 && lip->lip_nr > 0)
+		/* one more container */
+		ii->ii_count++;
+	if (rc > 0)
+		/* no more entries */
+		ii->ii_hash_end = II_END_OFF;
+	return rc;
+}
+
+/*
+ * Walk index and fill lu_page containers with key/record pairs
+ *
+ * \param env - is the environment passed by the caller
+ * \param obj - is the index object to parse
+ * \param rdpg - is the lu_rdpg descriptor associated with the transfer
+ * \param filler - is the callback function responsible for filling a lu_page
+ *		 with key/record pairs in the format wanted by the caller
+ * \param arg    - is an opaque argument passed to the filler function
+ *
+ * \retval sum (in bytes) of all filled lu_pages
+ * \retval -ve errno on failure
+ */
+int dt_index_walk(const struct lu_env *env, struct dt_object *obj,
+		  const struct lu_rdpg *rdpg, dt_index_page_build_t filler,
+		  void *arg)
+{
+	struct dt_it		*it;
+	const struct dt_it_ops	*iops;
+	unsigned int		 pageidx, nob, nlupgs = 0;
+	int			 rc;
+	ENTRY;
+
+	LASSERT(rdpg->rp_pages != NULL);
+	LASSERT(obj->do_index_ops != NULL);
+
+	nob = rdpg->rp_count;
+	if (nob <= 0)
+		RETURN(-EFAULT);
+
+	/* Iterate through index and fill containers from @rdpg */
+	iops = &obj->do_index_ops->dio_it;
+	LASSERT(iops != NULL);
+	it = iops->init(env, obj, rdpg->rp_attrs, BYPASS_CAPA);
+	if (IS_ERR(it))
+		RETURN(PTR_ERR(it));
+
+	rc = iops->load(env, it, rdpg->rp_hash);
+	if (rc == 0) {
+		/*
+		 * Iterator didn't find record with exactly the key requested.
+		 *
+		 * It is currently either
+		 *
+		 *     - positioned above record with key less than
+		 *     requested---skip it.
+		 *     - or not positioned at all (is in IAM_IT_SKEWED
+		 *     state)---position it on the next item.
+		 */
+		rc = iops->next(env, it);
+	} else if (rc > 0) {
+		rc = 0;
+	}
+
+	/* Fill containers one after the other. There might be multiple
+	 * containers per physical page.
+	 *
+	 * At this point and across for-loop:
+	 *  rc == 0 -> ok, proceed.
+	 *  rc >  0 -> end of index.
+	 *  rc <  0 -> error. */
+	for (pageidx = 0; rc == 0 && nob > 0; pageidx++) {
+		union lu_page	*lp;
+		int		 i;
+
+		LASSERT(pageidx < rdpg->rp_npages);
+		lp = kmap(rdpg->rp_pages[pageidx]);
+
+		/* fill lu pages */
+		for (i = 0; i < LU_PAGE_COUNT; i++, lp++, nob -= LU_PAGE_SIZE) {
+			rc = filler(env, lp, min_t(int, nob, LU_PAGE_SIZE),
+				    iops, it, rdpg->rp_attrs, arg);
+			if (rc < 0)
+				break;
+			/* one more lu_page */
+			nlupgs++;
+			if (rc > 0)
+				/* end of index */
+				break;
+		}
+		kunmap(rdpg->rp_pages[pageidx]); /* the page kmap()ed above, not [i] */
+	}
+
+	iops->put(env, it);
+	iops->fini(env, it);
+
+	if (rc >= 0) /* success: report bytes filled, capped at the request size */
+		rc = min_t(unsigned int, nlupgs * LU_PAGE_SIZE, rdpg->rp_count);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(dt_index_walk);
+
+/**
+ * Walk key/record pairs of an index and copy them into 4KB containers to be
+ * transferred over the network. This is the common handler for OBD_IDX_READ
+ * RPC processing.
+ *
+ * \param env - is the environment passed by the caller
+ * \param dev - is the dt_device storing the index
+ * \param ii  - is the idx_info structure packed by the client in the
+ *	      OBD_IDX_READ request
+ * \param rdpg - is the lu_rdpg descriptor
+ *
+ * \retval on success, return sum (in bytes) of all filled containers
+ * \retval appropriate error otherwise.
+ */
+int dt_index_read(const struct lu_env *env, struct dt_device *dev,
+		  struct idx_info *ii, const struct lu_rdpg *rdpg)
+{
+	const struct dt_index_features	*feat;
+	struct dt_object		*obj;
+	int				 rc;
+	ENTRY;
+
+	/* rp_count shouldn't be null and should be a multiple of the container
+	 * size; either violation alone makes the request invalid */
+	if (rdpg->rp_count <= 0 || (rdpg->rp_count & (LU_PAGE_SIZE - 1)) != 0)
+		RETURN(-EFAULT);
+
+	if (fid_seq(&ii->ii_fid) >= FID_SEQ_NORMAL)
+		/* we don't support directory transfer via OBD_IDX_READ for the
+		 * time being */
+		RETURN(-EOPNOTSUPP);
+
+	if (!fid_is_quota(&ii->ii_fid))
+		/* block access to all local files except quota files */
+		RETURN(-EPERM);
+
+	/* lookup index object subject to the transfer */
+	obj = dt_locate(env, dev, &ii->ii_fid);
+	if (IS_ERR(obj))
+		RETURN(PTR_ERR(obj));
+	if (dt_object_exists(obj) == 0)
+		GOTO(out, rc = -ENOENT);
+
+	/* fetch index features associated with index object */
+	feat = dt_index_feat_select(fid_seq(&ii->ii_fid),
+				    lu_object_attr(&obj->do_lu));
+	if (IS_ERR(feat))
+		GOTO(out, rc = PTR_ERR(feat));
+
+	/* load index feature if not done already */
+	if (obj->do_index_ops == NULL) {
+		rc = obj->do_ops->do_index_try(env, obj, feat);
+		if (rc)
+			GOTO(out, rc);
+	}
+
+	/* fill ii_flags with supported index features */
+	ii->ii_flags &= II_FL_NOHASH;
+
+	ii->ii_keysize = feat->dif_keysize_max;
+	if ((feat->dif_flags & DT_IND_VARKEY) != 0) {
+		/* key size is variable */
+		ii->ii_flags |= II_FL_VARKEY;
+		/* we don't support variable key size for the time being */
+		GOTO(out, rc = -EOPNOTSUPP);
+	}
+
+	ii->ii_recsize = feat->dif_recsize_max;
+	if ((feat->dif_flags & DT_IND_VARREC) != 0) {
+		/* record size is variable */
+		ii->ii_flags |= II_FL_VARREC;
+		/* we don't support variable record size for the time being */
+		GOTO(out, rc = -EOPNOTSUPP);
+	}
+
+	if ((feat->dif_flags & DT_IND_NONUNQ) != 0)
+		/* key isn't necessarily unique */
+		ii->ii_flags |= II_FL_NONUNQ;
+
+	dt_read_lock(env, obj, 0);
+	/* fetch object version before walking the index */
+	ii->ii_version = dt_version_get(env, obj);
+
+	/* walk the index and fill lu_idxpages with key/record pairs */
+	rc = dt_index_walk(env, obj, rdpg, dt_index_page_build, ii);
+	dt_read_unlock(env, obj);
+
+	if (rc == 0) {
+		/* index is empty */
+		LASSERT(ii->ii_count == 0);
+		ii->ii_hash_end = II_END_OFF;
+	}
+
+	GOTO(out, rc);
+out:
+	lu_object_put(env, &obj->do_lu); /* drop the dt_locate() reference */
+	return rc;
+}
+EXPORT_SYMBOL(dt_index_read);
+
+#ifdef LPROCFS
+
+int lprocfs_dt_rd_blksize(char *page, char **start, off_t off,
+			  int count, int *eof, void *data)
+{
+	struct dt_device *dt = data;
+	struct obd_statfs osfs;
+
+	int rc = dt_statfs(NULL, dt, &osfs);
+	if (rc != 0) {
+		*eof = 1;
+		rc = snprintf(page, count, "%d\n",
+				(unsigned) osfs.os_bsize);
+	}
+
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_blksize);
+
+/* procfs read: print os_blocks scaled to kilobytes; statfs errors are returned. */
+int lprocfs_dt_rd_kbytestotal(char *page, char **start, off_t off,
+			      int count, int *eof, void *data)
+{
+	struct dt_device *dt = data;
+	struct obd_statfs osfs;
+	int rc = dt_statfs(NULL, dt, &osfs);
+
+	if (rc == 0) {	/* osfs is only valid when statfs succeeded */
+		__u32 blk_size = osfs.os_bsize >> 10;
+		__u64 result = osfs.os_blocks;
+
+		/* blocks -> KiB; exact for power-of-two os_bsize >= 1024 */
+		while (blk_size >>= 1)
+			result <<= 1;
+		*eof = 1;
+		rc = snprintf(page, count, LPU64"\n", result);
+	}
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_kbytestotal);
+
+/* procfs read: print os_bfree scaled to kilobytes; statfs errors are returned. */
+int lprocfs_dt_rd_kbytesfree(char *page, char **start, off_t off,
+			     int count, int *eof, void *data)
+{
+	struct dt_device *dt = data;
+	struct obd_statfs osfs;
+	int rc = dt_statfs(NULL, dt, &osfs);
+
+	if (rc == 0) {	/* osfs is only valid when statfs succeeded */
+		__u32 blk_size = osfs.os_bsize >> 10;
+		__u64 result = osfs.os_bfree;
+
+		/* blocks -> KiB; exact for power-of-two os_bsize >= 1024 */
+		while (blk_size >>= 1)
+			result <<= 1;
+		*eof = 1;
+		rc = snprintf(page, count, LPU64"\n", result);
+	}
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_kbytesfree);
+
+/* procfs read: print os_bavail scaled to kilobytes; statfs errors are returned. */
+int lprocfs_dt_rd_kbytesavail(char *page, char **start, off_t off,
+			      int count, int *eof, void *data)
+{
+	struct dt_device *dt = data;
+	struct obd_statfs osfs;
+	int rc = dt_statfs(NULL, dt, &osfs);
+
+	if (rc == 0) {	/* osfs is only valid when statfs succeeded */
+		__u32 blk_size = osfs.os_bsize >> 10;
+		__u64 result = osfs.os_bavail;
+
+		/* blocks -> KiB; exact for power-of-two os_bsize >= 1024 */
+		while (blk_size >>= 1)
+			result <<= 1;
+		*eof = 1;
+		rc = snprintf(page, count, LPU64"\n", result);
+	}
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_kbytesavail);
+
+int lprocfs_dt_rd_filestotal(char *page, char **start, off_t off,
+			     int count, int *eof, void *data)
+{
+	struct dt_device *dt = data;
+	struct obd_statfs osfs;
+
+	int rc = dt_statfs(NULL, dt, &osfs);
+	if (rc != 0) {
+		*eof = 1;
+		rc = snprintf(page, count, LPU64"\n", osfs.os_files);
+	}
+
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_filestotal);
+
+int lprocfs_dt_rd_filesfree(char *page, char **start, off_t off,
+			    int count, int *eof, void *data)
+{
+	struct dt_device *dt = data;
+	struct obd_statfs osfs;
+
+	int rc = dt_statfs(NULL, dt, &osfs);
+	if (rc != 0) {
+		*eof = 1;
+		rc = snprintf(page, count, LPU64"\n", osfs.os_ffree);
+	}
+
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_filesfree);
+
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/obdclass/genops.c b/drivers/staging/lustre/lustre/obdclass/genops.c
new file mode 100644
index 000000000000..1cc9b55890ca
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/genops.c
@@ -0,0 +1,1855 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/genops.c
+ *
+ * These are the only exported functions, they provide some generic
+ * infrastructure for managing object devices
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#include <obd_ost.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+
+/* List of registered obd types (linked through typ_chain); defined elsewhere. */
+extern struct list_head obd_types;
+/* Taken around every walk or modification of obd_types in this file. */
+spinlock_t obd_types_lock;
+
+/* Slab caches set up by obd_init_caches() and torn down by
+ * obd_cleanup_caches(). */
+struct kmem_cache *obd_device_cachep;
+struct kmem_cache *obdo_cachep;
+EXPORT_SYMBOL(obdo_cachep);
+struct kmem_cache *import_cachep;
+
+/* Zombie import/export bookkeeping; class_export_put() and
+ * class_import_put() hand objects over through the *_add() helpers
+ * declared below once their last reference is dropped. */
+struct list_head      obd_zombie_imports;
+struct list_head      obd_zombie_exports;
+spinlock_t  obd_zombie_impexp_lock;
+static void obd_zombie_impexp_notify(void);
+static void obd_zombie_export_add(struct obd_export *exp);
+static void obd_zombie_import_add(struct obd_import *imp);
+static void print_export_data(struct obd_export *exp,
+			      const char *status, int locks);
+
+/* Hook used by the export/import destroy paths to drop a ptlrpc
+ * connection; presumably installed by the ptlrpc module - TODO confirm. */
+int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c);
+EXPORT_SYMBOL(ptlrpc_put_connection_superhack);
+
+/*
+ * support functions: we could use inter-module communication, but this
+ * is more portable to other OS's
+ */
+/* Allocate an obd_device from obd_device_cachep and stamp its magic.
+ * Returns NULL when the allocation fails. */
+static struct obd_device *obd_device_alloc(void)
+{
+	struct obd_device *dev;
+
+	OBD_SLAB_ALLOC_PTR_GFP(dev, obd_device_cachep, __GFP_IO);
+	if (dev == NULL)
+		return NULL;
+
+	dev->obd_magic = OBD_DEVICE_MAGIC;
+	return dev;
+}
+
+/* Return an obd_device to obd_device_cachep after checking its magic.
+ * A namespace still attached to the device is reported and then LBUG()s. */
+static void obd_device_free(struct obd_device *obd)
+{
+	LASSERT(obd != NULL);
+	LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+		 "obd %p obd_magic %08x != %08x\n",
+		 obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+
+	if (obd->obd_namespace != NULL) {
+		CERROR("obd %p: namespace %p was not properly cleaned up (obd_force=%d)!\n",
+		       obd, obd->obd_namespace, obd->obd_force);
+		LBUG();
+	}
+
+	lu_ref_fini(&obd->obd_reference);
+	OBD_SLAB_FREE_PTR(obd, obd_device_cachep);
+}
+
+/* Look up a registered obd type by exact name under obd_types_lock.
+ * Returns the matching entry or NULL; no reference is taken on it. */
+struct obd_type *class_search_type(const char *name)
+{
+	struct obd_type *cur;
+	struct obd_type *found = NULL;
+
+	spin_lock(&obd_types_lock);
+	list_for_each_entry(cur, &obd_types, typ_chain) {
+		if (strcmp(cur->typ_name, name) == 0) {
+			found = cur;
+			break;
+		}
+	}
+	spin_unlock(&obd_types_lock);
+
+	return found;
+}
+EXPORT_SYMBOL(class_search_type);
+
+/* Find the obd type called \a name, trying to load its module when it is
+ * not registered yet, and take a reference on it (typ_refcnt plus a module
+ * reference on the dt ops owner).  Returns NULL when the type is unknown. */
+struct obd_type *class_get_type(const char *name)
+{
+	struct obd_type *type;
+
+	type = class_search_type(name);
+	if (type == NULL) {
+		const char *modname = name;
+
+		/* some type names are provided by a differently named module */
+		if (strcmp(modname, "obdfilter") == 0)
+			modname = "ofd";
+
+		if (strcmp(modname, LUSTRE_LWP_NAME) == 0)
+			modname = LUSTRE_OSP_NAME;
+
+		if (strncmp(modname, LUSTRE_MDS_NAME,
+			    strlen(LUSTRE_MDS_NAME)) == 0)
+			modname = LUSTRE_MDT_NAME;
+
+		if (request_module("%s", modname) == 0) {
+			CDEBUG(D_INFO, "Loaded module '%s'\n", modname);
+			/* the freshly loaded module should have registered it */
+			type = class_search_type(name);
+		} else {
+			LCONSOLE_ERROR_MSG(0x158, "Can't load module '%s'\n",
+					   modname);
+		}
+	}
+
+	if (type == NULL)
+		return NULL;
+
+	spin_lock(&type->obd_type_lock);
+	type->typ_refcnt++;
+	try_module_get(type->typ_dt_ops->o_owner);
+	spin_unlock(&type->obd_type_lock);
+
+	return type;
+}
+EXPORT_SYMBOL(class_get_type);
+
+/* Drop one reference taken by class_get_type(): decrement typ_refcnt and
+ * release the module reference on the dt ops owner. */
+void class_put_type(struct obd_type *type)
+{
+	LASSERT(type);
+
+	spin_lock(&type->obd_type_lock);
+	type->typ_refcnt--;
+	module_put(type->typ_dt_ops->o_owner);
+	spin_unlock(&type->obd_type_lock);
+}
+EXPORT_SYMBOL(class_put_type);
+
+#define CLASS_MAX_NAME 1024
+
+/**
+ * Register a new obd type under \a name.
+ *
+ * Private copies of \a dt_ops, \a md_ops (if given) and \a name are made,
+ * the proc directory is registered when LPROCFS is defined, \a ldt is
+ * initialised if supplied, and the type is linked onto obd_types.
+ *
+ * \param[in] dt_ops  obd operations, copied into the new type
+ * \param[in] md_ops  metadata operations, copied when non-NULL
+ * \param[in] vars    lprocfs variables passed to lprocfs_register()
+ * \param[in] name    type name; must be shorter than CLASS_MAX_NAME
+ * \param[in] ldt     optional lu device type to initialise
+ *
+ * \retval 0        on success
+ * \retval -EEXIST  a type with this name is already registered
+ * \retval -ENOMEM  an allocation failed
+ * \retval other    error from lprocfs_register() or lu_device_type_init()
+ */
+int class_register_type(struct obd_ops *dt_ops, struct md_ops *md_ops,
+			struct lprocfs_vars *vars, const char *name,
+			struct lu_device_type *ldt)
+{
+	struct obd_type *type;
+	int rc = 0;
+	ENTRY;
+
+	/* sanity check */
+	LASSERT(strnlen(name, CLASS_MAX_NAME) < CLASS_MAX_NAME);
+
+	/* NOTE(review): this lookup and the list_add() below are done under
+	 * separate obd_types_lock holds, so the duplicate check is not
+	 * atomic with the insertion. */
+	if (class_search_type(name)) {
+		CDEBUG(D_IOCTL, "Type %s already registered\n", name);
+		RETURN(-EEXIST);
+	}
+
+	rc = -ENOMEM;
+	OBD_ALLOC(type, sizeof(*type));
+	if (type == NULL)
+		RETURN(rc);
+
+	/* attempt all three allocations, then check them together */
+	OBD_ALLOC_PTR(type->typ_dt_ops);
+	OBD_ALLOC_PTR(type->typ_md_ops);
+	OBD_ALLOC(type->typ_name, strlen(name) + 1);
+
+	if (type->typ_dt_ops == NULL ||
+	    type->typ_md_ops == NULL ||
+	    type->typ_name == NULL)
+		GOTO (failed, rc);
+
+	*(type->typ_dt_ops) = *dt_ops;
+	/* md_ops is optional */
+	if (md_ops)
+		*(type->typ_md_ops) = *md_ops;
+	strcpy(type->typ_name, name);
+	spin_lock_init(&type->obd_type_lock);
+
+#ifdef LPROCFS
+	type->typ_procroot = lprocfs_register(type->typ_name, proc_lustre_root,
+					      vars, type);
+	if (IS_ERR(type->typ_procroot)) {
+		rc = PTR_ERR(type->typ_procroot);
+		type->typ_procroot = NULL;
+		GOTO (failed, rc);
+	}
+#endif
+	if (ldt != NULL) {
+		type->typ_lu = ldt;
+		rc = lu_device_type_init(ldt);
+		/* NOTE(review): the failed path below does not remove the
+		 * proc entry registered above - confirm whether it leaks. */
+		if (rc != 0)
+			GOTO (failed, rc);
+	}
+
+	spin_lock(&obd_types_lock);
+	list_add(&type->typ_chain, &obd_types);
+	spin_unlock(&obd_types_lock);
+
+	RETURN (0);
+
+ failed:
+	/* free whichever of the private copies were allocated */
+	if (type->typ_name != NULL)
+		OBD_FREE(type->typ_name, strlen(name) + 1);
+	if (type->typ_md_ops != NULL)
+		OBD_FREE_PTR(type->typ_md_ops);
+	if (type->typ_dt_ops != NULL)
+		OBD_FREE_PTR(type->typ_dt_ops);
+	OBD_FREE(type, sizeof(*type));
+	RETURN(rc);
+}
+EXPORT_SYMBOL(class_register_type);
+
+/**
+ * Unregister the obd type called \a name and free it.
+ *
+ * \param[in] name  type name as given to class_register_type()
+ *
+ * \retval 0        type unlinked from obd_types and freed
+ * \retval -EINVAL  no type with this name is registered
+ * \retval -EBUSY   the type still has references; it stays registered
+ *                  with its ops tables released
+ */
+int class_unregister_type(const char *name)
+{
+	struct obd_type *type = class_search_type(name);
+	ENTRY;
+
+	if (!type) {
+		CERROR("unknown obd type\n");
+		RETURN(-EINVAL);
+	}
+
+	if (type->typ_refcnt) {
+		CERROR("type %s has refcount (%d)\n", name, type->typ_refcnt);
+		/* This is a bad situation, let's make the best of it */
+		/* Remove ops, but leave the name for debugging */
+		/* Clear the pointers after freeing: the type stays on
+		 * obd_types, so a repeated unregister must not free the
+		 * same tables twice (the NULL checks below rely on this). */
+		if (type->typ_dt_ops != NULL) {
+			OBD_FREE_PTR(type->typ_dt_ops);
+			type->typ_dt_ops = NULL;
+		}
+		if (type->typ_md_ops != NULL) {
+			OBD_FREE_PTR(type->typ_md_ops);
+			type->typ_md_ops = NULL;
+		}
+		RETURN(-EBUSY);
+	}
+
+	/* we do not use type->typ_procroot as for compatibility purposes
+	 * other modules can share names (i.e. lod can use lov entry). so
+	 * we can't reference pointer as it can get invalided when another
+	 * module removes the entry */
+	lprocfs_try_remove_proc_entry(type->typ_name, proc_lustre_root);
+
+	if (type->typ_lu)
+		lu_device_type_fini(type->typ_lu);
+
+	spin_lock(&obd_types_lock);
+	list_del(&type->typ_chain);
+	spin_unlock(&obd_types_lock);
+	/* name matched typ_name via strcmp(), so the lengths are equal */
+	OBD_FREE(type->typ_name, strlen(name) + 1);
+	if (type->typ_dt_ops != NULL)
+		OBD_FREE_PTR(type->typ_dt_ops);
+	if (type->typ_md_ops != NULL)
+		OBD_FREE_PTR(type->typ_md_ops);
+	OBD_FREE(type, sizeof(*type));
+	RETURN(0);
+} /* class_unregister_type */
+EXPORT_SYMBOL(class_unregister_type);
+
+/**
+ * Create a new obd device.
+ *
+ * Find an empty slot in ::obd_devs[], create a new obd device in it.
+ * A reference on the obd type is taken via class_get_type() and kept by
+ * the new device; it is dropped again on every failure path.
+ *
+ * \param[in] type_name obd device type string.
+ * \param[in] name      obd device name, shorter than MAX_OBD_NAME.
+ *
+ * \retval the new obd device on success
+ * \retval ERR_PTR(-EINVAL)    \a name is too long
+ * \retval ERR_PTR(-ENODEV)    \a type_name is not a known type
+ * \retval ERR_PTR(-ENOMEM)    the device could not be allocated
+ * \retval ERR_PTR(-EEXIST)    a device called \a name already exists
+ * \retval ERR_PTR(-EOVERFLOW) every slot in ::obd_devs[] is in use
+ */
+struct obd_device *class_newdev(const char *type_name, const char *name)
+{
+	struct obd_device *result = NULL;
+	struct obd_device *newdev;
+	struct obd_type *type = NULL;
+	int i;
+	int new_obd_minor = 0;
+	ENTRY;
+
+	if (strlen(name) >= MAX_OBD_NAME) {
+		CERROR("name/uuid must be < %u bytes long\n", MAX_OBD_NAME);
+		RETURN(ERR_PTR(-EINVAL));
+	}
+
+	type = class_get_type(type_name);
+	if (type == NULL){
+		CERROR("OBD: unknown type: %s\n", type_name);
+		RETURN(ERR_PTR(-ENODEV));
+	}
+
+	newdev = obd_device_alloc();
+	if (newdev == NULL)
+		GOTO(out_type, result = ERR_PTR(-ENOMEM));
+
+	LASSERT(newdev->obd_magic == OBD_DEVICE_MAGIC);
+
+	/* One pass over every slot: claim the first free one, but keep
+	 * scanning so a later device with the same name is still detected. */
+	write_lock(&obd_dev_lock);
+	for (i = 0; i < class_devno_max(); i++) {
+		struct obd_device *obd = class_num2obd(i);
+
+		if (obd && (strcmp(name, obd->obd_name) == 0)) {
+			CERROR("Device %s already exists at %d, won't add\n",
+			       name, i);
+			/* a slot was already claimed earlier in this scan:
+			 * give it back before reporting the duplicate */
+			if (result) {
+				LASSERTF(result->obd_magic == OBD_DEVICE_MAGIC,
+					 "%p obd_magic %08x != %08x\n", result,
+					 result->obd_magic, OBD_DEVICE_MAGIC);
+				LASSERTF(result->obd_minor == new_obd_minor,
+					 "%p obd_minor %d != %d\n", result,
+					 result->obd_minor, new_obd_minor);
+
+				obd_devs[result->obd_minor] = NULL;
+				result->obd_name[0]='\0';
+			 }
+			result = ERR_PTR(-EEXIST);
+			break;
+		}
+		if (!result && !obd) {
+			result = newdev;
+			result->obd_minor = i;
+			new_obd_minor = i;
+			result->obd_type = type;
+			/* length was checked above, so the copy is not cut
+			 * short; NOTE(review): termination relies on the
+			 * last byte of obd_name already being 0 - confirm
+			 * the allocator zeroes the device */
+			strncpy(result->obd_name, name,
+				sizeof(result->obd_name) - 1);
+			obd_devs[i] = result;
+		}
+	}
+	write_unlock(&obd_dev_lock);
+
+	/* scan finished without finding a free slot or a duplicate */
+	if (result == NULL && i >= class_devno_max()) {
+		CERROR("all %u OBD devices used, increase MAX_OBD_DEVICES\n",
+		       class_devno_max());
+		GOTO(out, result = ERR_PTR(-EOVERFLOW));
+	}
+
+	if (IS_ERR(result))
+		GOTO(out, result);
+
+	CDEBUG(D_IOCTL, "Adding new device %s (%p)\n",
+	       result->obd_name, result);
+
+	RETURN(result);
+out:
+	obd_device_free(newdev);
+out_type:
+	class_put_type(type);
+	return result;
+}
+
+/* Remove \a obd from its obd_devs[] slot, free it and drop the type
+ * reference it was holding. */
+void class_release_dev(struct obd_device *obd)
+{
+	struct obd_type *obd_type = obd->obd_type;
+
+	LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+		 "%p obd_magic %08x != %08x\n",
+		 obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+	LASSERTF(obd == obd_devs[obd->obd_minor],
+		 "obd %p != obd_devs[%d] %p\n",
+		 obd, obd->obd_minor, obd_devs[obd->obd_minor]);
+	LASSERT(obd_type != NULL);
+
+	CDEBUG(D_INFO, "Release obd device %s at %d obd_type name =%s\n",
+	       obd->obd_name, obd->obd_minor, obd_type->typ_name);
+
+	/* empty the slot before the device memory goes away */
+	write_lock(&obd_dev_lock);
+	obd_devs[obd->obd_minor] = NULL;
+	write_unlock(&obd_dev_lock);
+
+	obd_device_free(obd);
+	/* obd is gone; use the type pointer saved on entry */
+	class_put_type(obd_type);
+}
+
+/* Map a device name to its obd_devs[] index.  Returns -1 when \a name is
+ * NULL, when no device has that name, or when the matching device has not
+ * finished attaching. */
+int class_name2dev(const char *name)
+{
+	int idx;
+	int found = -1;
+
+	if (name == NULL)
+		return -1;
+
+	read_lock(&obd_dev_lock);
+	for (idx = 0; idx < class_devno_max(); idx++) {
+		struct obd_device *obd = class_num2obd(idx);
+
+		if (obd == NULL || strcmp(name, obd->obd_name) != 0)
+			continue;
+
+		/* Make sure we finished attaching before we give
+		   out any references */
+		LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+		if (obd->obd_attached)
+			found = idx;
+		/* only the first device with this name is considered */
+		break;
+	}
+	read_unlock(&obd_dev_lock);
+
+	return found;
+}
+EXPORT_SYMBOL(class_name2dev);
+
+/* Look up an attached obd device by name.  Returns NULL when
+ * class_name2dev() finds no usable device.  The lookup and the fetch are
+ * two separate steps, so no obd_dev_lock is held across them. */
+struct obd_device *class_name2obd(const char *name)
+{
+	int dev = class_name2dev(name);
+
+	/* valid indices are [0, class_devno_max()) */
+	if (dev < 0 || dev >= class_devno_max())
+		return NULL;
+	return class_num2obd(dev);
+}
+EXPORT_SYMBOL(class_name2obd);
+
+/* Map a device uuid to its obd_devs[] index, or -1 when no device
+ * carries that uuid. */
+int class_uuid2dev(struct obd_uuid *uuid)
+{
+	int idx;
+	int found = -1;
+
+	read_lock(&obd_dev_lock);
+	for (idx = 0; idx < class_devno_max(); idx++) {
+		struct obd_device *obd = class_num2obd(idx);
+
+		if (obd == NULL || !obd_uuid_equals(uuid, &obd->obd_uuid))
+			continue;
+
+		LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+		found = idx;
+		break;
+	}
+	read_unlock(&obd_dev_lock);
+
+	return found;
+}
+EXPORT_SYMBOL(class_uuid2dev);
+
+/* Look up an obd device by uuid; NULL when class_uuid2dev() finds none. */
+struct obd_device *class_uuid2obd(struct obd_uuid *uuid)
+{
+	int idx = class_uuid2dev(uuid);
+
+	return idx < 0 ? NULL : class_num2obd(idx);
+}
+EXPORT_SYMBOL(class_uuid2obd);
+
+/**
+ * Get obd device from ::obd_devs[]
+ *
+ * No locking is done here; the callers in this file hold obd_dev_lock
+ * around their scans.
+ *
+ * \param num [in] array index
+ *
+ * \retval NULL if \a num is outside [0, class_devno_max()) or
+ *	 ::obd_devs[\a num] does not contain an obd device,
+ *	 otherwise return the obd device there.
+ */
+struct obd_device *class_num2obd(int num)
+{
+	struct obd_device *obd;
+
+	/* reject negative indices as well as too-large ones */
+	if (num < 0 || num >= class_devno_max())
+		return NULL;
+
+	obd = obd_devs[num];
+	if (obd == NULL)
+		return NULL;
+
+	LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+		 "%p obd_magic %08x != %08x\n",
+		 obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+	LASSERTF(obd->obd_minor == num,
+		 "%p obd_minor %0d != %0d\n",
+		 obd, obd->obd_minor, num);
+
+	return obd;
+}
+EXPORT_SYMBOL(class_num2obd);
+
+/**
+ * Get obd devices count. Device in any
+ *    state are counted
+ * \retval obd device count
+ */
+int get_devices_count(void)
+{
+	int index, max_index = class_devno_max(), dev_count = 0;
+
+	read_lock(&obd_dev_lock);
+	/* valid slots are [0, max_index); do not probe one past the end */
+	for (index = 0; index < max_index; index++) {
+		struct obd_device *obd = class_num2obd(index);
+
+		if (obd != NULL)
+			dev_count++;
+	}
+	read_unlock(&obd_dev_lock);
+
+	return dev_count;
+}
+EXPORT_SYMBOL(get_devices_count);
+
+/* Print one console line per existing obd device: index, state tag
+ * (ST stopping, UP set up, AT attached, -- none of these), type name,
+ * device name, uuid and reference count. */
+void class_obd_list(void)
+{
+	int i;
+
+	read_lock(&obd_dev_lock);
+	for (i = 0; i < class_devno_max(); i++) {
+		struct obd_device *obd = class_num2obd(i);
+		const char *status = "--";
+
+		if (obd == NULL)
+			continue;
+
+		/* the most advanced state wins */
+		if (obd->obd_stopping)
+			status = "ST";
+		else if (obd->obd_set_up)
+			status = "UP";
+		else if (obd->obd_attached)
+			status = "AT";
+
+		LCONSOLE(D_CONFIG, "%3d %s %s %s %s %d\n",
+			 i, status, obd->obd_type->typ_name,
+			 obd->obd_name, obd->obd_uuid.uuid,
+			 atomic_read(&obd->obd_refcount));
+	}
+	read_unlock(&obd_dev_lock);
+}
+
+/* Find a client OBD whose type name starts with typ_name and whose
+   target uuid equals tgt_uuid.  When grp_uuid is given, the device's own
+   uuid must equal it as well; otherwise the first client connected to the
+   target is returned.  Returns NULL when nothing matches. */
+struct obd_device * class_find_client_obd(struct obd_uuid *tgt_uuid,
+					  const char * typ_name,
+					  struct obd_uuid *grp_uuid)
+{
+	struct obd_device *found = NULL;
+	int i;
+
+	read_lock(&obd_dev_lock);
+	for (i = 0; i < class_devno_max(); i++) {
+		struct obd_device *obd = class_num2obd(i);
+
+		if (obd == NULL)
+			continue;
+		/* type name is matched as a prefix of length strlen(typ_name) */
+		if (strncmp(obd->obd_type->typ_name, typ_name,
+			    strlen(typ_name)) != 0)
+			continue;
+		if (!obd_uuid_equals(tgt_uuid, &obd->u.cli.cl_target_uuid))
+			continue;
+		if (grp_uuid != NULL &&
+		    !obd_uuid_equals(grp_uuid, &obd->obd_uuid))
+			continue;
+
+		found = obd;
+		break;
+	}
+	read_unlock(&obd_dev_lock);
+
+	return found;
+}
+EXPORT_SYMBOL(class_find_client_obd);
+
+/* Scan the obd_device table for a device whose uuid equals grp_uuid.
+   The scan starts at *next, and when a device is found the index after it
+   is stored back into *next.  With next == NULL the scan starts at 0, so
+   the first matching device is always returned.  Returns NULL when *next
+   is out of range or nothing matches. */
+struct obd_device * class_devices_in_group(struct obd_uuid *grp_uuid, int *next)
+{
+	struct obd_device *found = NULL;
+	int i = 0;
+
+	if (next != NULL) {
+		if (*next < 0 || *next >= class_devno_max())
+			return NULL;
+		i = *next;
+	}
+
+	read_lock(&obd_dev_lock);
+	for (; i < class_devno_max(); i++) {
+		struct obd_device *obd = class_num2obd(i);
+
+		if (obd == NULL || !obd_uuid_equals(grp_uuid, &obd->obd_uuid))
+			continue;
+
+		/* remember where the caller should resume */
+		if (next != NULL)
+			*next = i + 1;
+		found = obd;
+		break;
+	}
+	read_unlock(&obd_dev_lock);
+
+	return found;
+}
+EXPORT_SYMBOL(class_devices_in_group);
+
+/**
+ * Notify every relevant OBD that the sptlrpc log for \a fsname has
+ * changed, so that it can adjust its sptlrpc settings accordingly.
+ *
+ * Only devices that are set up, not stopping, of type mdc/osc/mdt/ost and
+ * whose name starts with the first \a namelen bytes of \a fsname are
+ * notified, via obd_set_info_async(KEY_SPTLRPC_CONF).
+ *
+ * \param[in] fsname   filesystem name used as device-name prefix
+ * \param[in] namelen  number of bytes of \a fsname to compare; must be > 0
+ *
+ * \retval 0 when every notification returned 0 (or none was sent)
+ * \retval the first non-zero obd_set_info_async() result otherwise
+ */
+int class_notify_sptlrpc_conf(const char *fsname, int namelen)
+{
+	struct obd_device  *obd;
+	const char	 *type;
+	int		 i, rc = 0, rc2;
+
+	LASSERT(namelen > 0);
+
+	read_lock(&obd_dev_lock);
+	for (i = 0; i < class_devno_max(); i++) {
+		obd = class_num2obd(i);
+
+		if (obd == NULL || obd->obd_set_up == 0 || obd->obd_stopping)
+			continue;
+
+		/* only notify mdc, osc, mdt, ost */
+		type = obd->obd_type->typ_name;
+		if (strcmp(type, LUSTRE_MDC_NAME) != 0 &&
+		    strcmp(type, LUSTRE_OSC_NAME) != 0 &&
+		    strcmp(type, LUSTRE_MDT_NAME) != 0 &&
+		    strcmp(type, LUSTRE_OST_NAME) != 0)
+			continue;
+
+		if (strncmp(obd->obd_name, fsname, namelen))
+			continue;
+
+		/* pin the device, then drop obd_dev_lock around the
+		 * notification; the lock is retaken before the scan
+		 * continues with the next index */
+		class_incref(obd, __FUNCTION__, obd);
+		read_unlock(&obd_dev_lock);
+		rc2 = obd_set_info_async(NULL, obd->obd_self_export,
+					 sizeof(KEY_SPTLRPC_CONF),
+					 KEY_SPTLRPC_CONF, 0, NULL, NULL);
+		/* keep the first error, but still notify the rest */
+		rc = rc ? rc : rc2;
+		class_decref(obd, __FUNCTION__, obd);
+		read_lock(&obd_dev_lock);
+	}
+	read_unlock(&obd_dev_lock);
+	return rc;
+}
+EXPORT_SYMBOL(class_notify_sptlrpc_conf);
+
+/* Destroy each of the four slab caches that exists and reset its pointer
+ * to NULL, so this is safe on a partially initialised set. */
+void obd_cleanup_caches(void)
+{
+	ENTRY;
+
+	if (obd_device_cachep != NULL) {
+		kmem_cache_destroy(obd_device_cachep);
+		obd_device_cachep = NULL;
+	}
+
+	if (obdo_cachep != NULL) {
+		kmem_cache_destroy(obdo_cachep);
+		obdo_cachep = NULL;
+	}
+
+	if (import_cachep != NULL) {
+		kmem_cache_destroy(import_cachep);
+		import_cachep = NULL;
+	}
+
+	if (capa_cachep != NULL) {
+		kmem_cache_destroy(capa_cachep);
+		capa_cachep = NULL;
+	}
+
+	EXIT;
+}
+
+/* Create the obd_device, obdo, import and capa slab caches.  Each cache
+ * pointer must still be NULL on entry.  Returns 0 on success; on any
+ * failure the caches created so far are destroyed and -ENOMEM is
+ * returned. */
+int obd_init_caches(void)
+{
+	ENTRY;
+
+	LASSERT(obd_device_cachep == NULL);
+	obd_device_cachep = kmem_cache_create("ll_obd_dev_cache",
+					      sizeof(struct obd_device),
+					      0, 0, NULL);
+	if (obd_device_cachep == NULL)
+		GOTO(out, -ENOMEM);
+
+	LASSERT(obdo_cachep == NULL);
+	obdo_cachep = kmem_cache_create("ll_obdo_cache",
+					sizeof(struct obdo),
+					0, 0, NULL);
+	if (obdo_cachep == NULL)
+		GOTO(out, -ENOMEM);
+
+	LASSERT(import_cachep == NULL);
+	import_cachep = kmem_cache_create("ll_import_cache",
+					  sizeof(struct obd_import),
+					  0, 0, NULL);
+	if (import_cachep == NULL)
+		GOTO(out, -ENOMEM);
+
+	LASSERT(capa_cachep == NULL);
+	capa_cachep = kmem_cache_create("capa_cache",
+					sizeof(struct obd_capa),
+					0, 0, NULL);
+	if (capa_cachep == NULL)
+		GOTO(out, -ENOMEM);
+
+	RETURN(0);
+
+ out:
+	/* undo whatever was created before the failure */
+	obd_cleanup_caches();
+	RETURN(-ENOMEM);
+}
+
+/* Map a connection handle to its export through class_handle2object().
+ * Returns NULL for a NULL handle or for the cookie value -1, which means
+ * "assign a new connection". */
+struct obd_export *class_conn2export(struct lustre_handle *conn)
+{
+	struct obd_export *exp;
+	ENTRY;
+
+	if (conn == NULL) {
+		CDEBUG(D_CACHE, "looking for null handle\n");
+		RETURN(NULL);
+	}
+
+	/* this means assign a new connection */
+	if (conn->cookie == -1) {
+		CDEBUG(D_CACHE, "want a new connection\n");
+		RETURN(NULL);
+	}
+
+	CDEBUG(D_INFO, "looking for export cookie "LPX64"\n", conn->cookie);
+	exp = class_handle2object(conn->cookie);
+	RETURN(exp);
+}
+EXPORT_SYMBOL(class_conn2export);
+
+/* Return the obd device an export belongs to, or NULL for a NULL export. */
+struct obd_device *class_exp2obd(struct obd_export *exp)
+{
+	return exp != NULL ? exp->exp_obd : NULL;
+}
+EXPORT_SYMBOL(class_exp2obd);
+
+/* Map a connection handle to the obd device of its export, or NULL when
+ * the handle resolves to no export.  The export reference obtained for
+ * the lookup is dropped before returning. */
+struct obd_device *class_conn2obd(struct lustre_handle *conn)
+{
+	struct obd_export *exp = class_conn2export(conn);
+	struct obd_device *obd;
+
+	if (exp == NULL)
+		return NULL;
+
+	obd = exp->exp_obd;
+	class_export_put(exp);
+	return obd;
+}
+EXPORT_SYMBOL(class_conn2obd);
+
+/* Return the client import of the obd device behind \a exp, or NULL when
+ * the export has no device.  \a exp itself must not be NULL. */
+struct obd_import *class_exp2cliimp(struct obd_export *exp)
+{
+	struct obd_device *obd = exp->exp_obd;
+
+	return obd != NULL ? obd->u.cli.cl_import : NULL;
+}
+EXPORT_SYMBOL(class_exp2cliimp);
+
+/* Return the client import of the obd device a connection handle maps
+ * to, or NULL when class_conn2obd() finds no device. */
+struct obd_import *class_conn2cliimp(struct lustre_handle *conn)
+{
+	struct obd_device *obd = class_conn2obd(conn);
+
+	return obd != NULL ? obd->u.cli.cl_import : NULL;
+}
+EXPORT_SYMBOL(class_conn2cliimp);
+
+/* Export management functions */
+/* Final teardown of an export whose refcount has reached zero: drop its
+ * connection (if any), let the obd type destroy its private state, drop
+ * the "export" reference on the obd device and free the export through
+ * OBD_FREE_RCU(). */
+static void class_export_destroy(struct obd_export *exp)
+{
+	struct obd_device *obd = exp->exp_obd;
+	ENTRY;
+
+	LASSERT_ATOMIC_ZERO(&exp->exp_refcount);
+	LASSERT(obd != NULL);
+
+	CDEBUG(D_IOCTL, "destroying export %p/%s for %s\n", exp,
+	       exp->exp_client_uuid.uuid, obd->obd_name);
+
+	/* "Local" exports (lctl, LOV->{mdc,osc}) have no connection. */
+	if (exp->exp_connection)
+		ptlrpc_put_connection_superhack(exp->exp_connection);
+
+	/* nothing may still be queued on the export at this point */
+	LASSERT(list_empty(&exp->exp_outstanding_replies));
+	LASSERT(list_empty(&exp->exp_uncommitted_replies));
+	LASSERT(list_empty(&exp->exp_req_replay_queue));
+	LASSERT(list_empty(&exp->exp_hp_rpcs));
+	obd_destroy_export(exp);
+	/* pairs with class_incref(obd, "export", ...) in class_new_export() */
+	class_decref(obd, "export", exp);
+
+	OBD_FREE_RCU(exp, sizeof(*exp), &exp->exp_handle);
+	EXIT;
+}
+
+/* Handle-table addref callback: take a reference on the export. */
+static void export_handle_addref(void *export)
+{
+	class_export_get(export);
+}
+
+/* Handle operations registered for exports in class_new_export(); no
+ * hop_free callback is supplied. */
+static struct portals_handle_ops export_handle_ops = {
+	.hop_addref = export_handle_addref,
+	.hop_free   = NULL,
+};
+
+/* Take a reference on \a exp and return it for call chaining. */
+struct obd_export *class_export_get(struct obd_export *exp)
+{
+	atomic_inc(&exp->exp_refcount);
+
+	CDEBUG(D_INFO, "GETting export %p : new refcount %d\n",
+	       exp, atomic_read(&exp->exp_refcount));
+
+	return exp;
+}
+EXPORT_SYMBOL(class_export_get);
+
+/* Drop a reference on \a exp.  When the last reference goes away the
+ * export's nid stats are cleaned up and the export is handed to
+ * obd_zombie_export_add(). */
+void class_export_put(struct obd_export *exp)
+{
+	LASSERT(exp != NULL);
+	LASSERT_ATOMIC_GT_LT(&exp->exp_refcount, 0, LI_POISON);
+	CDEBUG(D_INFO, "PUTting export %p : new refcount %d\n", exp,
+	       atomic_read(&exp->exp_refcount) - 1);
+
+	if (atomic_dec_and_test(&exp->exp_refcount)) {
+		/* the export must still be linked on one of the obd lists */
+		LASSERT(!list_empty(&exp->exp_obd_chain));
+		CDEBUG(D_IOCTL, "final put %p/%s\n",
+		       exp, exp->exp_client_uuid.uuid);
+
+		/* release nid stat reference */
+		lprocfs_exp_cleanup(exp);
+
+		obd_zombie_export_add(exp);
+	}
+}
+EXPORT_SYMBOL(class_export_put);
+
+/* Creates a new export, adds it to the hash table, and returns a
+ * pointer to it. The refcount is 2: one for the hash reference, and
+ * one for the pointer returned by this function.
+ *
+ * The export is also added to the obd's uuid hash (unless \a cluuid is the
+ * obd's own uuid) and linked onto the obd's export lists.
+ *
+ * Returns the new export, or ERR_PTR() with:
+ *   -ENOMEM    the export could not be allocated
+ *   -ENODEV    the obd is stopping or has no uuid hash
+ *   -EALREADY  an export for \a cluuid already exists in the uuid hash */
+struct obd_export *class_new_export(struct obd_device *obd,
+				    struct obd_uuid *cluuid)
+{
+	struct obd_export *export;
+	cfs_hash_t *hash = NULL;
+	int rc = 0;
+	ENTRY;
+
+	OBD_ALLOC_PTR(export);
+	/* NOTE(review): plain return after ENTRY, unlike the RETURN() used
+	 * on the success path - confirm this is intentional */
+	if (!export)
+		return ERR_PTR(-ENOMEM);
+
+	export->exp_conn_cnt = 0;
+	export->exp_lock_hash = NULL;
+	export->exp_flock_hash = NULL;
+	atomic_set(&export->exp_refcount, 2);
+	atomic_set(&export->exp_rpc_count, 0);
+	atomic_set(&export->exp_cb_count, 0);
+	atomic_set(&export->exp_locks_count, 0);
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+	INIT_LIST_HEAD(&export->exp_locks_list);
+	spin_lock_init(&export->exp_locks_list_guard);
+#endif
+	atomic_set(&export->exp_replay_count, 0);
+	export->exp_obd = obd;
+	INIT_LIST_HEAD(&export->exp_outstanding_replies);
+	spin_lock_init(&export->exp_uncommitted_replies_lock);
+	INIT_LIST_HEAD(&export->exp_uncommitted_replies);
+	INIT_LIST_HEAD(&export->exp_req_replay_queue);
+	INIT_LIST_HEAD(&export->exp_handle.h_link);
+	INIT_LIST_HEAD(&export->exp_hp_rpcs);
+	class_handle_hash(&export->exp_handle, &export_handle_ops);
+	export->exp_last_request_time = cfs_time_current_sec();
+	spin_lock_init(&export->exp_lock);
+	spin_lock_init(&export->exp_rpc_lock);
+	INIT_HLIST_NODE(&export->exp_uuid_hash);
+	INIT_HLIST_NODE(&export->exp_nid_hash);
+	spin_lock_init(&export->exp_bl_list_lock);
+	INIT_LIST_HEAD(&export->exp_bl_list);
+
+	export->exp_sp_peer = LUSTRE_SP_ANY;
+	export->exp_flvr.sf_rpc = SPTLRPC_FLVR_INVALID;
+	export->exp_client_uuid = *cluuid;
+	obd_init_export(export);
+
+	spin_lock(&obd->obd_dev_lock);
+	/* shouldn't happen, but might race */
+	if (obd->obd_stopping)
+		GOTO(exit_unlock, rc = -ENODEV);
+
+	/* hold our own reference on the uuid hash while obd_dev_lock is
+	 * dropped for the insertion below */
+	hash = cfs_hash_getref(obd->obd_uuid_hash);
+	if (hash == NULL)
+		GOTO(exit_unlock, rc = -ENODEV);
+	spin_unlock(&obd->obd_dev_lock);
+
+	/* an export carrying the obd's own uuid is not put in the hash */
+	if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) {
+		rc = cfs_hash_add_unique(hash, cluuid, &export->exp_uuid_hash);
+		if (rc != 0) {
+			LCONSOLE_WARN("%s: denying duplicate export for %s, %d\n",
+				      obd->obd_name, cluuid->uuid, rc);
+			GOTO(exit_err, rc = -EALREADY);
+		}
+	}
+
+	/* recheck under the lock: the obd may have started stopping while
+	 * the lock was dropped, in which case the hash insertion is undone */
+	spin_lock(&obd->obd_dev_lock);
+	if (obd->obd_stopping) {
+		cfs_hash_del(hash, cluuid, &export->exp_uuid_hash);
+		GOTO(exit_unlock, rc = -ENODEV);
+	}
+
+	class_incref(obd, "export", export);
+	list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports);
+	list_add_tail(&export->exp_obd_chain_timed,
+			  &export->exp_obd->obd_exports_timed);
+	export->exp_obd->obd_num_exports++;
+	spin_unlock(&obd->obd_dev_lock);
+	cfs_hash_putref(hash);
+	RETURN(export);
+
+exit_unlock:
+	spin_unlock(&obd->obd_dev_lock);
+exit_err:
+	/* unwind: hash reference, handle registration, type-private state */
+	if (hash)
+		cfs_hash_putref(hash);
+	class_handle_unhash(&export->exp_handle);
+	LASSERT(hlist_unhashed(&export->exp_uuid_hash));
+	obd_destroy_export(export);
+	OBD_FREE_PTR(export);
+	return ERR_PTR(rc);
+}
+EXPORT_SYMBOL(class_new_export);
+
+/* Detach \a exp from its obd: remove its handle, remove it from the uuid
+ * hash if present, move it to the obd's unlinked-exports list, take it off
+ * the timed list, and finally drop one export reference. */
+void class_unlink_export(struct obd_export *exp)
+{
+	class_handle_unhash(&exp->exp_handle);
+
+	spin_lock(&exp->exp_obd->obd_dev_lock);
+	/* delete an uuid-export hashitem from hashtables */
+	if (!hlist_unhashed(&exp->exp_uuid_hash))
+		cfs_hash_del(exp->exp_obd->obd_uuid_hash,
+			     &exp->exp_client_uuid,
+			     &exp->exp_uuid_hash);
+
+	list_move(&exp->exp_obd_chain, &exp->exp_obd->obd_unlinked_exports);
+	list_del_init(&exp->exp_obd_chain_timed);
+	exp->exp_obd->obd_num_exports--;
+	spin_unlock(&exp->exp_obd->obd_dev_lock);
+	/* may be the final put, so exp must not be touched afterwards */
+	class_export_put(exp);
+}
+EXPORT_SYMBOL(class_unlink_export);
+
+/* Import management functions */
+/* Final teardown of an import whose refcount has reached zero: drop its
+ * current connection and every entry on imp_conn_list, drop the "import"
+ * reference on the obd device and free the import through
+ * OBD_FREE_RCU(). */
+void class_import_destroy(struct obd_import *imp)
+{
+	ENTRY;
+
+	CDEBUG(D_IOCTL, "destroying import %p for %s\n", imp,
+		imp->imp_obd->obd_name);
+
+	LASSERT_ATOMIC_ZERO(&imp->imp_refcount);
+
+	ptlrpc_put_connection_superhack(imp->imp_connection);
+
+	/* pop and release each connection entry in turn */
+	while (!list_empty(&imp->imp_conn_list)) {
+		struct obd_import_conn *imp_conn;
+
+		imp_conn = list_entry(imp->imp_conn_list.next,
+					  struct obd_import_conn, oic_item);
+		list_del_init(&imp_conn->oic_item);
+		ptlrpc_put_connection_superhack(imp_conn->oic_conn);
+		OBD_FREE(imp_conn, sizeof(*imp_conn));
+	}
+
+	LASSERT(imp->imp_sec == NULL);
+	/* pairs with class_incref(obd, "import", ...) in class_new_import() */
+	class_decref(imp->imp_obd, "import", imp);
+	OBD_FREE_RCU(imp, sizeof(*imp), &imp->imp_handle);
+	EXIT;
+}
+
+/* Handle-table addref callback: take a reference on the import. */
+static void import_handle_addref(void *import)
+{
+	class_import_get(import);
+}
+
+/* Handle operations registered for imports in class_new_import(); no
+ * hop_free callback is supplied. */
+static struct portals_handle_ops import_handle_ops = {
+	.hop_addref = import_handle_addref,
+	.hop_free   = NULL,
+};
+
+/* Take a reference on \a import and return it for call chaining. */
+struct obd_import *class_import_get(struct obd_import *import)
+{
+	atomic_inc(&import->imp_refcount);
+
+	CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n",
+	       import, atomic_read(&import->imp_refcount),
+	       import->imp_obd->obd_name);
+
+	return import;
+}
+EXPORT_SYMBOL(class_import_get);
+
+/* Drop a reference on \a imp.  When the last reference goes away the
+ * import is handed to obd_zombie_import_add().  The import must not be on
+ * a zombie list already. */
+void class_import_put(struct obd_import *imp)
+{
+	ENTRY;
+
+	LASSERT(list_empty(&imp->imp_zombie_chain));
+	LASSERT_ATOMIC_GT_LT(&imp->imp_refcount, 0, LI_POISON);
+
+	CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", imp,
+	       atomic_read(&imp->imp_refcount) - 1,
+	       imp->imp_obd->obd_name);
+
+	if (atomic_dec_and_test(&imp->imp_refcount)) {
+		CDEBUG(D_INFO, "final put import %p\n", imp);
+		obd_zombie_import_add(imp);
+	}
+
+	/* catch possible import put race */
+	/* NOTE(review): after a final put imp has already been given to
+	 * obd_zombie_import_add(), yet it is still read here - confirm it
+	 * cannot have been freed by this point */
+	LASSERT_ATOMIC_GE_LT(&imp->imp_refcount, 0, LI_POISON);
+	EXIT;
+}
+EXPORT_SYMBOL(class_import_put);
+
+/* Initialise the adaptive-timeout state of an import: the network latency
+ * estimate and one service estimate per portal. */
+static void init_imp_at(struct imp_at *at)
+{
+	int portal;
+
+	at_init(&at->iat_net_latency, 0, 0);
+
+	/* max service estimates are tracked on the server side, so
+	   don't use the AT history here, just use the last reported
+	   val. (But keep hist for proc histogram, worst_ever) */
+	for (portal = 0; portal < IMP_AT_MAX_PORTALS; portal++)
+		at_init(&at->iat_service_estimate[portal],
+			INITIAL_CONNECT_TIMEOUT, AT_FLG_NOHIST);
+}
+
+/* Allocate and initialise a new import for \a obd.  The import starts in
+ * state LUSTRE_IMP_NEW with a refcount of 2, holds an "import" reference
+ * on \a obd and is registered in the handle table.  Returns NULL when the
+ * allocation fails. */
+struct obd_import *class_new_import(struct obd_device *obd)
+{
+	struct obd_import *imp;
+
+	OBD_ALLOC(imp, sizeof(*imp));
+	if (imp == NULL)
+		return NULL;
+
+	INIT_LIST_HEAD(&imp->imp_pinger_chain);
+	INIT_LIST_HEAD(&imp->imp_zombie_chain);
+	INIT_LIST_HEAD(&imp->imp_replay_list);
+	INIT_LIST_HEAD(&imp->imp_sending_list);
+	INIT_LIST_HEAD(&imp->imp_delayed_list);
+	spin_lock_init(&imp->imp_lock);
+	imp->imp_last_success_conn = 0;
+	imp->imp_state = LUSTRE_IMP_NEW;
+	/* released again in class_import_destroy() */
+	imp->imp_obd = class_incref(obd, "import", imp);
+	mutex_init(&imp->imp_sec_mutex);
+	init_waitqueue_head(&imp->imp_recovery_waitq);
+
+	atomic_set(&imp->imp_refcount, 2);
+	atomic_set(&imp->imp_unregistering, 0);
+	atomic_set(&imp->imp_inflight, 0);
+	atomic_set(&imp->imp_replay_inflight, 0);
+	atomic_set(&imp->imp_inval_count, 0);
+	INIT_LIST_HEAD(&imp->imp_conn_list);
+	INIT_LIST_HEAD(&imp->imp_handle.h_link);
+	class_handle_hash(&imp->imp_handle, &import_handle_ops);
+	init_imp_at(&imp->imp_at);
+
+	/* the default magic is V2, will be used in connect RPC, and
+	 * then adjusted according to the flags in request/reply. */
+	imp->imp_msg_magic = LUSTRE_MSG_MAGIC_V2;
+
+	return imp;
+}
+EXPORT_SYMBOL(class_new_import);
+
+/* Start destroying \a import: remove its handle, bump imp_generation
+ * under imp_lock and drop one import reference. */
+void class_destroy_import(struct obd_import *import)
+{
+	LASSERT(import != NULL);
+	LASSERT(import != LP_POISON);
+
+	class_handle_unhash(&import->imp_handle);
+
+	spin_lock(&import->imp_lock);
+	import->imp_generation++;
+	spin_unlock(&import->imp_lock);
+	/* may be the final put, so import must not be touched afterwards */
+	class_import_put(import);
+}
+EXPORT_SYMBOL(class_destroy_import);
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+
+/* Record one more export reference held through \a lock.  The first
+ * reference links the lock onto exp_locks_list and sets \a exp as the
+ * lock's target export; a different pre-existing target is only warned
+ * about. */
+void __class_export_add_lock_ref(struct obd_export *exp, struct ldlm_lock *lock)
+{
+	spin_lock(&exp->exp_locks_list_guard);
+
+	LASSERT(lock->l_exp_refs_nr >= 0);
+
+	if (lock->l_exp_refs_target != NULL &&
+	    lock->l_exp_refs_target != exp) {
+		LCONSOLE_WARN("setting export %p for lock %p which already has export %p\n",
+			      exp, lock, lock->l_exp_refs_target);
+	}
+
+	/* first reference: start tracking the lock on this export */
+	if (lock->l_exp_refs_nr == 0) {
+		list_add(&lock->l_exp_refs_link, &exp->exp_locks_list);
+		lock->l_exp_refs_target = exp;
+	}
+	lock->l_exp_refs_nr++;
+
+	CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n",
+	       lock, exp, lock->l_exp_refs_nr);
+
+	spin_unlock(&exp->exp_locks_list_guard);
+}
+EXPORT_SYMBOL(__class_export_add_lock_ref);
+
+/* Drop one export reference held through \a lock.  When the count
+ * reaches zero the lock is unlinked from the export's list and its
+ * target export is cleared; a mismatching target is only warned about. */
+void __class_export_del_lock_ref(struct obd_export *exp, struct ldlm_lock *lock)
+{
+	spin_lock(&exp->exp_locks_list_guard);
+
+	LASSERT(lock->l_exp_refs_nr > 0);
+
+	if (lock->l_exp_refs_target != exp) {
+		LCONSOLE_WARN("lock %p, "
+			      "mismatching export pointers: %p, %p\n",
+			      lock, lock->l_exp_refs_target, exp);
+	}
+
+	lock->l_exp_refs_nr--;
+	/* last reference: stop tracking the lock */
+	if (lock->l_exp_refs_nr == 0) {
+		list_del_init(&lock->l_exp_refs_link);
+		lock->l_exp_refs_target = NULL;
+	}
+
+	CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n",
+	       lock, exp, lock->l_exp_refs_nr);
+
+	spin_unlock(&exp->exp_locks_list_guard);
+}
+EXPORT_SYMBOL(__class_export_del_lock_ref);
+#endif
+
+/* A connection defines an export context in which preallocation can
+   be managed. This releases the export pointer reference, and returns
+   the export handle, so the export refcount is 1 when this function
+   returns.
+
+   The handle cookie of the new export is stored in conn->cookie.
+   Returns 0 on success or the error carried by class_new_export(). */
+int class_connect(struct lustre_handle *conn, struct obd_device *obd,
+		  struct obd_uuid *cluuid)
+{
+	struct obd_export *export;
+	LASSERT(conn != NULL);
+	LASSERT(obd != NULL);
+	LASSERT(cluuid != NULL);
+	ENTRY;
+
+	export = class_new_export(obd, cluuid);
+	if (IS_ERR(export))
+		RETURN(PTR_ERR(export));
+
+	conn->cookie = export->exp_handle.h_cookie;
+	/* drop the reference for the returned pointer; the caller only
+	 * gets the cookie */
+	class_export_put(export);
+
+	CDEBUG(D_IOCTL, "connect: client %s, cookie "LPX64"\n",
+	       cluuid->uuid, conn->cookie);
+	RETURN(0);
+}
+EXPORT_SYMBOL(class_connect);
+
+/* if export is involved in recovery then clean up related things:
+ * adjust the obd's delayed/connected/stale client counters and clear the
+ * export's request-replay and lock-replay flags along with the matching
+ * obd counters */
+void class_export_recovery_cleanup(struct obd_export *exp)
+{
+	struct obd_device *obd = exp->exp_obd;
+
+	spin_lock(&obd->obd_recovery_task_lock);
+	if (exp->exp_delayed)
+		obd->obd_delayed_clients--;
+	if (obd->obd_recovering) {
+		if (exp->exp_in_recovery) {
+			/* exp_lock guards the export's flag bits */
+			spin_lock(&exp->exp_lock);
+			exp->exp_in_recovery = 0;
+			spin_unlock(&exp->exp_lock);
+			LASSERT_ATOMIC_POS(&obd->obd_connected_clients);
+			atomic_dec(&obd->obd_connected_clients);
+		}
+
+		/* if called during recovery then should update
+		 * obd_stale_clients counter,
+		 * lightweight exports are not counted */
+		if (exp->exp_failed &&
+		    (exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) == 0)
+			exp->exp_obd->obd_stale_clients++;
+	}
+	spin_unlock(&obd->obd_recovery_task_lock);
+	/** Cleanup req replay fields */
+	if (exp->exp_req_replay_needed) {
+		spin_lock(&exp->exp_lock);
+		exp->exp_req_replay_needed = 0;
+		spin_unlock(&exp->exp_lock);
+		/* the counter must still include this export */
+		LASSERT(atomic_read(&obd->obd_req_replay_clients));
+		atomic_dec(&obd->obd_req_replay_clients);
+	}
+	/** Cleanup lock replay data */
+	if (exp->exp_lock_replay_needed) {
+		spin_lock(&exp->exp_lock);
+		exp->exp_lock_replay_needed = 0;
+		spin_unlock(&exp->exp_lock);
+		/* the counter must still include this export */
+		LASSERT(atomic_read(&obd->obd_lock_replay_clients));
+		atomic_dec(&obd->obd_lock_replay_clients);
+	}
+}
+
+/* Disconnect @export. Removes 1-3 references from the export:
+ * 1 - the caller's, held through the pointer passed in (always),
+ * and, only for the first call that really disconnects,
+ * 2 - on removal from the nid hash
+ * 3 - in class_unlink_export()
+ * so @export may be destroyed on return. Returns 0, or -EINVAL for NULL. */
+int class_disconnect(struct obd_export *export)
+{
+	int already_disconnected;
+	ENTRY;
+
+	if (export == NULL) {
+		CWARN("attempting to free NULL export %p\n", export);
+		RETURN(-EINVAL);
+	}
+
+	spin_lock(&export->exp_lock);	/* test-and-set exp_disconnected */
+	already_disconnected = export->exp_disconnected;
+	export->exp_disconnected = 1;
+	spin_unlock(&export->exp_lock);
+
+	/* class_cleanup(), abort_recovery(), and class_fail_export()
+	 * all end up in here, and if any of them race we shouldn't
+	 * call extra class_export_puts(). */
+	if (already_disconnected) {
+		LASSERT(hlist_unhashed(&export->exp_nid_hash));
+		GOTO(no_disconn, already_disconnected);
+	}
+
+	CDEBUG(D_IOCTL, "disconnect: cookie "LPX64"\n",
+	       export->exp_handle.h_cookie);
+
+	if (!hlist_unhashed(&export->exp_nid_hash))
+		cfs_hash_del(export->exp_obd->obd_nid_hash,
+			     &export->exp_connection->c_peer.nid,
+			     &export->exp_nid_hash);
+
+	class_export_recovery_cleanup(export);
+	class_unlink_export(export);
+no_disconn:
+	class_export_put(export);	/* the caller's reference, on every path */
+	RETURN(0);
+}
+EXPORT_SYMBOL(class_disconnect);
+
+/* Return non-zero when @exp is non-NULL and has a positive exp_conn_cnt */
+int class_connected_export(struct obd_export *exp)
+{
+	int is_conn;
+
+	if (exp == NULL)
+		return 0;
+	spin_lock(&exp->exp_lock);
+	is_conn = exp->exp_conn_cnt > 0;
+	spin_unlock(&exp->exp_lock);
+	return is_conn;
+}
+EXPORT_SYMBOL(class_connected_export);
+
+static void class_disconnect_export_list(struct list_head *list,
+					 enum obd_option flags)
+{
+	int rc;
+	struct obd_export *exp;
+	ENTRY;
+
+	/* Disconnect every export on @list, always taking the head. An export
+	 * may disconnect itself, but nothing else will be added to this list. */
+	while (!list_empty(list)) {
+		exp = list_entry(list->next, struct obd_export,
+				     exp_obd_chain);
+		/* extra ref keeps exp valid for the CDEBUG after obd_disconnect */
+		class_export_get(exp);
+
+		spin_lock(&exp->exp_lock);
+		exp->exp_flags = flags;
+		spin_unlock(&exp->exp_lock);
+
+		if (obd_uuid_equals(&exp->exp_client_uuid,
+				    &exp->exp_obd->obd_uuid)) {
+			CDEBUG(D_HA,
+			       "exp %p export uuid == obd uuid, don't discon\n",
+			       exp);
+			/* Need to delete this now so we don't end up pointing
+			 * to work_list later when this export is cleaned up. */
+			list_del_init(&exp->exp_obd_chain);
+			class_export_put(exp);
+			continue;
+		}
+
+		class_export_get(exp);	/* ref for obd_disconnect() to drop */
+		CDEBUG(D_HA, "%s: disconnecting export at %s (%p), "
+		       "last request at "CFS_TIME_T"\n",
+		       exp->exp_obd->obd_name, obd_export_nid2str(exp),
+		       exp, exp->exp_last_request_time);
+		/* drops a ref; presumably unlinks exp from @list too - confirm */
+		rc = obd_disconnect(exp);
+
+		CDEBUG(D_HA, "disconnected export at %s (%p): rc %d\n",
+		       obd_export_nid2str(exp), exp, rc);
+		class_export_put(exp);
+	}
+	EXIT;
+}
+
+void class_disconnect_exports(struct obd_device *obd)
+{
+	struct list_head pending;
+	ENTRY;
+
+	/* Detach active and delayed exports under obd_dev_lock in one go. */
+	INIT_LIST_HEAD(&pending);
+	spin_lock(&obd->obd_dev_lock);
+	list_splice_init(&obd->obd_exports, &pending);
+	list_splice_init(&obd->obd_delayed_exports, &pending);
+	spin_unlock(&obd->obd_dev_lock);
+	if (list_empty(&pending)) {
+		CDEBUG(D_HA, "OBD device %d (%p) has no exports\n",
+		       obd->obd_minor, obd);
+	} else {
+		CDEBUG(D_HA, "OBD device %d (%p) has exports, "
+		       "disconnecting them\n", obd->obd_minor, obd);
+		class_disconnect_export_list(&pending,
+					     exp_flags_from_obd(obd));
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(class_disconnect_exports);
+
+/* Evict exports of @obd for which @test_export() returns 0: mark them failed,
+ * move them to a private list and disconnect with OBD_OPT_ABORT_RECOV. */
+void class_disconnect_stale_exports(struct obd_device *obd,
+				    int (*test_export)(struct obd_export *))
+{
+	struct list_head work_list;
+	struct obd_export *exp, *n;
+	int evicted = 0;
+	ENTRY;
+
+	INIT_LIST_HEAD(&work_list);
+	spin_lock(&obd->obd_dev_lock);
+	list_for_each_entry_safe(exp, n, &obd->obd_exports,
+				     exp_obd_chain) {
+		/* don't count self-export as client */
+		if (obd_uuid_equals(&exp->exp_client_uuid,
+				    &exp->exp_obd->obd_uuid))
+			continue;
+
+		/* don't evict clients which have no slot in last_rcvd
+		 * (e.g. lightweight connection) */
+		if (exp->exp_target_data.ted_lr_idx == -1)
+			continue;
+
+		spin_lock(&exp->exp_lock);	/* test_export() runs locked */
+		if (exp->exp_failed || test_export(exp)) {
+			spin_unlock(&exp->exp_lock);
+			continue;
+		}
+		exp->exp_failed = 1;
+		spin_unlock(&exp->exp_lock);
+
+		list_move(&exp->exp_obd_chain, &work_list);
+		evicted++;
+		CDEBUG(D_HA, "%s: disconnect stale client %s@%s\n",
+		       obd->obd_name, exp->exp_client_uuid.uuid,
+		       exp->exp_connection == NULL ? "<unknown>" :
+		       libcfs_nid2str(exp->exp_connection->c_peer.nid));
+		print_export_data(exp, "EVICTING", 0);
+	}
+	spin_unlock(&obd->obd_dev_lock);
+
+	if (evicted)
+		LCONSOLE_WARN("%s: disconnecting %d stale clients\n",
+			      obd->obd_name, evicted);
+
+	class_disconnect_export_list(&work_list, exp_flags_from_obd(obd) |
+						 OBD_OPT_ABORT_RECOV);
+	EXIT;
+}
+EXPORT_SYMBOL(class_disconnect_stale_exports);
+
+void class_fail_export(struct obd_export *exp)
+{
+	int rc, already_failed;
+
+	spin_lock(&exp->exp_lock);	/* test-and-set exp_failed */
+	already_failed = exp->exp_failed;
+	exp->exp_failed = 1;
+	spin_unlock(&exp->exp_lock);
+
+	if (already_failed) {	/* only the first caller disconnects */
+		CDEBUG(D_HA, "disconnecting dead export %p/%s; skipping\n",
+		       exp, exp->exp_client_uuid.uuid);
+		return;
+	}
+
+	CDEBUG(D_HA, "disconnecting export %p/%s\n",
+	       exp, exp->exp_client_uuid.uuid);
+
+	if (obd_dump_on_timeout)
+		libcfs_debug_dumplog();
+
+	/* extra ref keeps exp valid for the logging after obd_disconnect */
+	class_export_get(exp);
+
+	/* Most callers into obd_disconnect are removing their own reference
+	 * (request, for example) in addition to the one from the hash table.
+	 * We don't have such a reference here, so make one. */
+	class_export_get(exp);
+	rc = obd_disconnect(exp);
+	if (rc)
+		CERROR("disconnecting export %p failed: %d\n", exp, rc);
+	else
+		CDEBUG(D_HA, "disconnected export %p/%s\n",
+		       exp, exp->exp_client_uuid.uuid);
+	class_export_put(exp);	/* pairs with the first get above */
+}
+EXPORT_SYMBOL(class_fail_export);
+
+char *obd_export_nid2str(struct obd_export *exp)
+{
+	/* exports without a connection have no NID to format */
+	if (exp->exp_connection == NULL)
+		return "(no nid)";
+	return libcfs_nid2str(exp->exp_connection->c_peer.nid);
+}
+EXPORT_SYMBOL(obd_export_nid2str);
+
+int obd_export_evict_by_nid(struct obd_device *obd, const char *nid)
+{
+	cfs_hash_t *nid_hash;
+	struct obd_export *doomed_exp = NULL;
+	int exports_evicted = 0;
+	/* Fail every export of @obd hashed under @nid; returns how many. */
+	lnet_nid_t nid_key = libcfs_str2nid((char *)nid);
+
+	spin_lock(&obd->obd_dev_lock);
+	/* umount has run already, so evict thread should leave
+	 * its task to umount thread now */
+	if (obd->obd_stopping) {
+		spin_unlock(&obd->obd_dev_lock);
+		return exports_evicted;
+	}
+	nid_hash = obd->obd_nid_hash;
+	cfs_hash_getref(nid_hash);	/* pin the hash across the unlock */
+	spin_unlock(&obd->obd_dev_lock);
+
+	do {
+		doomed_exp = cfs_hash_lookup(nid_hash, &nid_key);
+		if (doomed_exp == NULL)
+			break;
+
+		LASSERTF(doomed_exp->exp_connection->c_peer.nid == nid_key,
+			 "nid %s found, wanted nid %s, requested nid %s\n",
+			 obd_export_nid2str(doomed_exp),
+			 libcfs_nid2str(nid_key), nid);
+		LASSERTF(doomed_exp != obd->obd_self_export,
+			 "self-export is hashed by NID?\n");
+		exports_evicted++;
+		LCONSOLE_WARN("%s: evicting %s (at %s) by administrative "
+			      "request\n", obd->obd_name,
+			      obd_uuid2str(&doomed_exp->exp_client_uuid),
+			      obd_export_nid2str(doomed_exp));
+		class_fail_export(doomed_exp);
+		class_export_put(doomed_exp);	/* presumably the lookup's ref */
+	} while (1);	/* until nothing is hashed under nid_key */
+
+	cfs_hash_putref(nid_hash);
+
+	if (!exports_evicted)
+		CDEBUG(D_HA,"%s: can't disconnect NID '%s': no exports found\n",
+		       obd->obd_name, nid);
+	return exports_evicted;
+}
+EXPORT_SYMBOL(obd_export_evict_by_nid);
+
+int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid)
+{
+	cfs_hash_t *uuid_hash;
+	struct obd_export *doomed_exp = NULL;
+	struct obd_uuid doomed_uuid;
+	int exports_evicted = 0;
+	/* Fail the export of @obd with client uuid @uuid; returns 0 or 1. */
+	spin_lock(&obd->obd_dev_lock);
+	if (obd->obd_stopping) {	/* device is stopping: evict nothing */
+		spin_unlock(&obd->obd_dev_lock);
+		return exports_evicted;
+	}
+	uuid_hash = obd->obd_uuid_hash;
+	cfs_hash_getref(uuid_hash);	/* pin the hash across the unlock */
+	spin_unlock(&obd->obd_dev_lock);
+
+	obd_str2uuid(&doomed_uuid, uuid);
+	if (obd_uuid_equals(&doomed_uuid, &obd->obd_uuid)) {
+		CERROR("%s: can't evict myself\n", obd->obd_name);
+		cfs_hash_putref(uuid_hash);
+		return exports_evicted;
+	}
+
+	doomed_exp = cfs_hash_lookup(uuid_hash, &doomed_uuid);
+
+	if (doomed_exp == NULL) {
+		CERROR("%s: can't disconnect %s: no exports found\n",
+		       obd->obd_name, uuid);
+	} else {
+		CWARN("%s: evicting %s at adminstrative request\n",
+		       obd->obd_name, doomed_exp->exp_client_uuid.uuid);
+		class_fail_export(doomed_exp);
+		class_export_put(doomed_exp);	/* presumably the lookup's ref */
+		exports_evicted++;
+	}
+	cfs_hash_putref(uuid_hash);
+
+	return exports_evicted;
+}
+EXPORT_SYMBOL(obd_export_evict_by_uuid);
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+void (*class_export_dump_hook)(struct obd_export *);	/* NULL until set */
+EXPORT_SYMBOL(class_export_dump_hook);
+#endif
+
+static void print_export_data(struct obd_export *exp, const char *status,
+			      int locks)
+{
+	struct ptlrpc_reply_state *rs;
+	struct ptlrpc_reply_state *first_reply = NULL;
+	int nreplies = 0;
+	/* Log one D_HA line of state for @exp, tagged with @status. */
+	spin_lock(&exp->exp_lock);	/* count replies, remember the first */
+	list_for_each_entry(rs, &exp->exp_outstanding_replies,
+				rs_exp_list) {
+		if (nreplies == 0)
+			first_reply = rs;
+		nreplies++;
+	}
+	spin_unlock(&exp->exp_lock);
+
+	CDEBUG(D_HA, "%s: %s %p %s %s %d (%d %d %d) %d %d %d %d: %p %s "LPU64"\n",
+	       exp->exp_obd->obd_name, status, exp, exp->exp_client_uuid.uuid,
+	       obd_export_nid2str(exp), atomic_read(&exp->exp_refcount),
+	       atomic_read(&exp->exp_rpc_count),
+	       atomic_read(&exp->exp_cb_count),
+	       atomic_read(&exp->exp_locks_count),
+	       exp->exp_disconnected, exp->exp_delayed, exp->exp_failed,
+	       nreplies, first_reply, nreplies > 3 ? "..." : "",
+	       exp->exp_last_committed);
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+	if (locks && class_export_dump_hook != NULL)	/* optional lock dump */
+		class_export_dump_hook(exp);
+#endif
+}
+
+void dump_exports(struct obd_device *obd, int locks)
+{
+	struct obd_export *exp;
+	/* Log every export of @obd, tagged with the list it was found on. */
+	spin_lock(&obd->obd_dev_lock);	/* covers the three per-obd lists */
+	list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain)
+		print_export_data(exp, "ACTIVE", locks);
+	list_for_each_entry(exp, &obd->obd_unlinked_exports, exp_obd_chain)
+		print_export_data(exp, "UNLINKED", locks);
+	list_for_each_entry(exp, &obd->obd_delayed_exports, exp_obd_chain)
+		print_export_data(exp, "DELAYED", locks);
+	spin_unlock(&obd->obd_dev_lock);
+	spin_lock(&obd_zombie_impexp_lock);	/* zombie list is global */
+	list_for_each_entry(exp, &obd_zombie_exports, exp_obd_chain)
+		print_export_data(exp, "ZOMBIE", locks);
+	spin_unlock(&obd_zombie_impexp_lock);
+}
+EXPORT_SYMBOL(dump_exports);
+
+void obd_exports_barrier(struct obd_device *obd)
+{
+	int waited = 2;	/* length of the next sleep in seconds; doubles */
+	LASSERT(list_empty(&obd->obd_exports));
+	spin_lock(&obd->obd_dev_lock);
+	while (!list_empty(&obd->obd_unlinked_exports)) {
+		spin_unlock(&obd->obd_dev_lock);	/* never sleep locked */
+		schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE,
+						   cfs_time_seconds(waited));
+		if (waited > 5 && IS_PO2(waited)) {
+			LCONSOLE_WARN("%s is waiting for obd_unlinked_exports "
+				      "more than %d seconds. "
+				      "The obd refcount = %d. Is it stuck?\n",
+				      obd->obd_name, waited,
+				      atomic_read(&obd->obd_refcount));
+			dump_exports(obd, 1);
+		}
+		waited *= 2;
+		spin_lock(&obd->obd_dev_lock);	/* re-check the list locked */
+	}
+	spin_unlock(&obd->obd_dev_lock);
+}
+EXPORT_SYMBOL(obd_exports_barrier);
+
+/* Zombie imports + exports queued or being destroyed; starts at zero */
+static int zombies_count;
+
+/**
+ * Destroy queued zombie imports and exports until both lists are empty.
+ */
+void obd_zombie_impexp_cull(void)
+{
+	struct obd_import *import;
+	struct obd_export *export;
+	ENTRY;
+
+	do {	/* each pass takes at most one import and one export */
+		spin_lock(&obd_zombie_impexp_lock);
+
+		import = NULL;
+		if (!list_empty(&obd_zombie_imports)) {
+			import = list_entry(obd_zombie_imports.next,
+						struct obd_import,
+						imp_zombie_chain);
+			list_del_init(&import->imp_zombie_chain);
+		}
+
+		export = NULL;
+		if (!list_empty(&obd_zombie_exports)) {
+			export = list_entry(obd_zombie_exports.next,
+						struct obd_export,
+						exp_obd_chain);
+			list_del_init(&export->exp_obd_chain);
+		}
+
+		spin_unlock(&obd_zombie_impexp_lock);
+
+		if (import != NULL) {
+			class_import_destroy(import);	/* lock not held */
+			spin_lock(&obd_zombie_impexp_lock);
+			zombies_count--;	/* only after the destroy */
+			spin_unlock(&obd_zombie_impexp_lock);
+		}
+
+		if (export != NULL) {
+			class_export_destroy(export);	/* lock not held */
+			spin_lock(&obd_zombie_impexp_lock);
+			zombies_count--;	/* only after the destroy */
+			spin_unlock(&obd_zombie_impexp_lock);
+		}
+
+		cond_resched();
+	} while (import != NULL || export != NULL);
+	EXIT;
+}
+
+static struct completion	obd_zombie_start;	/* thread has started */
+static struct completion	obd_zombie_stop;	/* thread left its loop */
+static unsigned long		obd_zombie_flags;	/* OBD_ZOMBIE_* bits */
+static wait_queue_head_t		obd_zombie_waitq;	/* thread + barrier */
+static pid_t			obd_zombie_pid;	/* pid of the zombie thread */
+
+enum {
+	OBD_ZOMBIE_STOP		= 0x0001,	/* NOTE(review): used as a bit NUMBER */
+};
+
+/**
+ * Non-zero means "nothing to do": no zombies queued and no stop requested.
+ */
+static int obd_zombie_impexp_check(void *arg)	/* @arg is unused */
+{
+	int rc;
+
+	spin_lock(&obd_zombie_impexp_lock);
+	rc = (zombies_count == 0) &&
+	     !test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags);
+	spin_unlock(&obd_zombie_impexp_lock);
+
+	RETURN(rc);	/* NOTE(review): no matching ENTRY in this function */
+}
+
+/**
+ * Unlink @exp from its obd list, queue it as a zombie and wake the thread.
+ */
+static void obd_zombie_export_add(struct obd_export *exp) {
+	spin_lock(&exp->exp_obd->obd_dev_lock);
+	LASSERT(!list_empty(&exp->exp_obd_chain));	/* must be on a list */
+	list_del_init(&exp->exp_obd_chain);
+	spin_unlock(&exp->exp_obd->obd_dev_lock);
+	spin_lock(&obd_zombie_impexp_lock);
+	zombies_count++;
+	list_add(&exp->exp_obd_chain, &obd_zombie_exports);
+	spin_unlock(&obd_zombie_impexp_lock);
+
+	obd_zombie_impexp_notify();
+}
+
+/**
+ * Queue @imp on the zombie import list and wake the zombie thread.
+ */
+static void obd_zombie_import_add(struct obd_import *imp) {
+	LASSERT(imp->imp_sec == NULL);
+	LASSERT(imp->imp_rq_pool == NULL);
+	spin_lock(&obd_zombie_impexp_lock);
+	LASSERT(list_empty(&imp->imp_zombie_chain));	/* not queued twice */
+	zombies_count++;
+	list_add(&imp->imp_zombie_chain, &obd_zombie_imports);
+	spin_unlock(&obd_zombie_impexp_lock);
+
+	obd_zombie_impexp_notify();
+}
+
+/**
+ * Notify the import/export destroy thread about a new zombie.
+ */
+static void obd_zombie_impexp_notify(void)
+{
+	/*
+	 * Wake every waiter so obd_zombie_impexp_thread is sure to see this.
+	 * Waking a single waiter could reach only an obd_zombie_barrier()
+	 * caller, which would consume the wakeup, go back to sleep and hang.
+	 */
+	wake_up_all(&obd_zombie_waitq);
+}
+
+/**
+ * Non-zero when zombies_count is zero; must not be called once stopping.
+ */
+static int obd_zombie_is_idle(void)
+{
+	int idle;
+
+	LASSERT(!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags));
+	spin_lock(&obd_zombie_impexp_lock);
+	idle = (zombies_count == 0);
+	spin_unlock(&obd_zombie_impexp_lock);
+	return idle;
+}
+
+/**
+ * Block until the zombie import/export queues have been drained.
+ */
+void obd_zombie_barrier(void)
+{
+	struct l_wait_info lwi = { 0 };
+
+	/* the zombie thread itself must never wait for its own queue */
+	if (obd_zombie_pid != current_pid()) {
+		l_wait_event(obd_zombie_waitq, obd_zombie_is_idle(), &lwi);
+	}
+}
+EXPORT_SYMBOL(obd_zombie_barrier);
+
+
+/**
+ * Zombie thread body: sleep until work or stop, cull, wake barrier waiters.
+ */
+static int obd_zombie_impexp_thread(void *unused)
+{
+	unshare_fs_struct();
+	complete(&obd_zombie_start);	/* obd_zombie_impexp_init() waits on this */
+
+	obd_zombie_pid = current_pid();	/* compared in obd_zombie_barrier() */
+
+	while (!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags)) {
+		struct l_wait_info lwi = { 0 };
+
+		l_wait_event(obd_zombie_waitq,
+			     !obd_zombie_impexp_check(NULL), &lwi);
+		obd_zombie_impexp_cull();
+
+		/*
+		 * Notify obd_zombie_barrier callers that queues
+		 * may be empty.
+		 */
+		wake_up(&obd_zombie_waitq);
+	}
+
+	complete(&obd_zombie_stop);	/* obd_zombie_impexp_stop() waits on this */
+
+	RETURN(0);
+}
+
+
+/**
+ * Set up the zombie lists, lock and waitqueue, then start the zombie thread.
+ */
+int obd_zombie_impexp_init(void)
+{
+	task_t *task;
+
+	INIT_LIST_HEAD(&obd_zombie_imports);
+	INIT_LIST_HEAD(&obd_zombie_exports);
+	spin_lock_init(&obd_zombie_impexp_lock);
+	init_completion(&obd_zombie_start);
+	init_completion(&obd_zombie_stop);
+	init_waitqueue_head(&obd_zombie_waitq);
+	obd_zombie_pid = 0;
+
+	task = kthread_run(obd_zombie_impexp_thread, NULL, "obd_zombid");
+	if (IS_ERR(task))
+		RETURN(PTR_ERR(task));	/* thread could not be created */
+
+	wait_for_completion(&obd_zombie_start);	/* thread is running */
+	RETURN(0);
+}
+/**
+ * Ask the zombie thread to stop and wait until it has left its loop.
+ */
+void obd_zombie_impexp_stop(void)
+{
+	set_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags);
+	obd_zombie_impexp_notify();	/* wake the thread so it sees the flag */
+	wait_for_completion(&obd_zombie_stop);
+}
+
+/***** Kernel-userspace comm helpers *******/
+
+/* Total message size: one struct kuc_hdr followed by the payload */
+int kuc_len(int payload_len)
+{
+	return payload_len + sizeof(struct kuc_hdr);
+}
+EXPORT_SYMBOL(kuc_len);
+
+/* Step back from a payload pointer to the kuc header in front of it
+ * @param p Pointer to payload area
+ * @returns Pointer to kuc header (its magic is asserted)
+ */
+struct kuc_hdr *kuc_ptr(void *p)
+{
+	struct kuc_hdr *hdr = (struct kuc_hdr *)p - 1;
+	LASSERT(hdr->kuc_magic == KUC_MAGIC);
+	return hdr;
+}
+EXPORT_SYMBOL(kuc_ptr);
+
+/* Tell whether @p is the payload of a kuc message
+ * @param p Pointer to payload area
+ * @returns 1 if a kuc header precedes @p, 0 otherwise
+ */
+int kuc_ispayload(void *p)
+{
+	const struct kuc_hdr *hdr = (const struct kuc_hdr *)p - 1;
+
+	/* the header sits immediately before the payload, so a matching
+	 * magic there identifies @p as a kuc payload; "==" yields exactly
+	 * 1 or 0 */
+	return hdr->kuc_magic == KUC_MAGIC;
+}
+EXPORT_SYMBOL(kuc_ispayload);
+
+/* Allocate a message for @payload_len bytes and fill in its kuc header.
+ * @return pointer to the payload area, or ERR_PTR(-ENOMEM)
+ */
+void *kuc_alloc(int payload_len, int transport, int type)
+{
+	int msg_len = kuc_len(payload_len);
+	struct kuc_hdr *hdr;
+
+	OBD_ALLOC(hdr, msg_len);
+	if (hdr == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	hdr->kuc_msglen = msg_len;
+	hdr->kuc_msgtype = type;
+	hdr->kuc_transport = transport;
+	hdr->kuc_magic = KUC_MAGIC;
+	/* the payload starts right behind the header */
+	return hdr + 1;
+}
+EXPORT_SYMBOL(kuc_alloc);
+
+/* Free a message from kuc_alloc(); @p is its payload, @payload_len its size */
+void kuc_free(void *p, int payload_len)
+{
+	struct kuc_hdr *lh = kuc_ptr(p);
+	OBD_FREE(lh, kuc_len(payload_len));
+}
+EXPORT_SYMBOL(kuc_free);
diff --git a/drivers/staging/lustre/lustre/obdclass/idmap.c b/drivers/staging/lustre/lustre/obdclass/idmap.c
new file mode 100644
index 000000000000..622f8d165275
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/idmap.c
@@ -0,0 +1,474 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/idmap.c
+ *
+ * Lustre user identity mapping.
+ *
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <lustre_idmap.h>
+#include <md_object.h>
+#include <obd_support.h>
+
+/* Take a reference on @group_info by bumping its usage count. */
+#define lustre_get_group_info(group_info) atomic_inc(&(group_info)->usage)
+
+/* Drop a reference on @group_info; the last put calls groups_free(). */
+#define lustre_put_group_info(group_info) do {		\
+	if (atomic_dec_and_test(&(group_info)->usage))	\
+		groups_free(group_info);		\
+} while (0)
+
+/*
+ * Binary search of a sorted group list; returns 1 if @grp is present.
+ * gids are compared directly: a signed "grp - gid" difference wraps.
+ */
+static int lustre_groups_search(group_info_t *group_info,
+				gid_t grp)
+{
+	int left, right;
+
+	if (!group_info)
+		return 0;
+
+	left = 0;
+	right = group_info->ngroups;
+	while (left < right) {
+		int mid = (left + right) / 2;
+		gid_t cur = CFS_GROUP_AT(group_info, mid);
+
+		if (grp > cur)
+			left = mid + 1;
+		else if (grp < cur)
+			right = mid;
+		else
+			return 1;
+	}
+	return 0;
+}
+
+void lustre_groups_from_list(group_info_t *ginfo, gid_t *glist)
+{
+	int remaining = ginfo->ngroups;
+	int blk;
+
+	/* copy the flat gid array into ginfo's blocks, one block per pass,
+	 * until either the blocks or the gids run out */
+	for (blk = 0; blk < ginfo->nblocks && remaining > 0; blk++) {
+		int chunk = min(CFS_NGROUPS_PER_BLOCK, remaining);
+		gid_t *src = glist + blk * CFS_NGROUPS_PER_BLOCK;
+
+		memcpy(ginfo->blocks[blk], src, chunk * sizeof(*glist));
+		remaining -= chunk;
+	}
+}
+EXPORT_SYMBOL(lustre_groups_from_list);
+
+/* Sort @group_info's gids in place, ascending. groups_sort() is copied
+ * from the linux kernel: a simple shell-metzner sort. */
+void lustre_groups_sort(group_info_t *group_info)
+{
+	int base, max, stride;
+	int gidsetsize = group_info->ngroups;
+
+	for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
+		; /* nothing */
+	stride /= 3;	/* step back to the last stride below gidsetsize */
+
+	while (stride) {
+		max = gidsetsize - stride;
+		for (base = 0; base < max; base++) {
+			int left = base;
+			int right = left + stride;
+			gid_t tmp = CFS_GROUP_AT(group_info, right);
+
+			while (left >= 0 &&
+			       CFS_GROUP_AT(group_info, left) > tmp) {
+				CFS_GROUP_AT(group_info, right) =
+				    CFS_GROUP_AT(group_info, left);
+				right = left;
+				left -= stride;
+			}
+			CFS_GROUP_AT(group_info, right) = tmp;
+		}
+		stride /= 3;	/* next smaller gap; ends once it reaches 0 */
+	}
+}
+EXPORT_SYMBOL(lustre_groups_sort);
+
+int lustre_in_group_p(struct lu_ucred *mu, gid_t grp)
+{
+	group_info_t *ginfo = NULL;
+	int found;
+
+	/* the fsgid always counts as a member group */
+	if (grp == mu->uc_fsgid)
+		return 1;
+
+	/* inline supplementary gids only count in these credential states */
+	if ((mu->uc_ginfo || !mu->uc_identity || mu->uc_valid == UCRED_OLD) &&
+	    (grp == mu->uc_suppgids[0] || grp == mu->uc_suppgids[1]))
+		return 1;
+
+	/* prefer the ucred's own group list over the identity's */
+	if (mu->uc_ginfo)
+		ginfo = mu->uc_ginfo;
+	else if (mu->uc_identity)
+		ginfo = mu->uc_identity->mi_ginfo;
+	if (!ginfo)
+		return 0;
+
+	lustre_get_group_info(ginfo);
+	found = lustre_groups_search(ginfo, grp);
+	lustre_put_group_info(ginfo);
+	return found;
+}
+EXPORT_SYMBOL(lustre_in_group_p);
+
+struct lustre_idmap_entry {	/* one uid+gid mapping, linked on four hashes */
+	struct list_head       lie_rmt_uid_hash; /* link, bucket by lie_rmt_uid */
+	struct list_head       lie_lcl_uid_hash; /* link, bucket by lie_lcl_uid */
+	struct list_head       lie_rmt_gid_hash; /* link, bucket by lie_rmt_gid */
+	struct list_head       lie_lcl_gid_hash; /* link, bucket by lie_lcl_gid */
+	uid_t	    lie_rmt_uid;      /* remote uid */
+	uid_t	    lie_lcl_uid;      /* local uid */
+	gid_t	    lie_rmt_gid;      /* remote gid */
+	gid_t	    lie_lcl_gid;      /* local gid */
+};
+
+static inline __u32 lustre_idmap_hashfunc(__u32 id)
+{
+	return (CFS_IDMAP_HASHSIZE - 1) & id;	/* mask id down to a bucket */
+}
+
+static
+struct lustre_idmap_entry *idmap_entry_alloc(uid_t rmt_uid, uid_t lcl_uid,
+					     gid_t rmt_gid, gid_t lcl_gid)
+{
+	struct lustre_idmap_entry *ent;
+
+	OBD_ALLOC_PTR(ent);
+	if (ent == NULL)
+		return NULL;
+
+	/* record the mapping; the entry is not on any hash list yet */
+	ent->lie_rmt_uid = rmt_uid;
+	ent->lie_lcl_uid = lcl_uid;
+	ent->lie_rmt_gid = rmt_gid;
+	ent->lie_lcl_gid = lcl_gid;
+	INIT_LIST_HEAD(&ent->lie_rmt_uid_hash);
+	INIT_LIST_HEAD(&ent->lie_lcl_uid_hash);
+	INIT_LIST_HEAD(&ent->lie_rmt_gid_hash);
+	INIT_LIST_HEAD(&ent->lie_lcl_gid_hash);
+	return ent;
+}
+
+static void idmap_entry_free(struct lustre_idmap_entry *e)
+{
+	if (!list_empty(&e->lie_rmt_uid_hash))	/* skip links never hashed */
+		list_del(&e->lie_rmt_uid_hash);
+	if (!list_empty(&e->lie_lcl_uid_hash))
+		list_del(&e->lie_lcl_uid_hash);
+	if (!list_empty(&e->lie_rmt_gid_hash))
+		list_del(&e->lie_rmt_gid_hash);
+	if (!list_empty(&e->lie_lcl_gid_hash))
+		list_del(&e->lie_lcl_gid_hash);
+	OBD_FREE_PTR(e);	/* off all four hashes now; release the entry */
+}
+
+/*
+ * Search @t for the exact (rmt_uid, lcl_uid, rmt_gid, lcl_gid) mapping.
+ * NULL: no such entry, and no conflicting mapping either
+ * ERR_PTR(-EACCES): rmt_uid or rmt_gid already maps to another local id
+ * others: the entry matching all four ids
+ */
+static
+struct lustre_idmap_entry *idmap_search_entry(struct lustre_idmap_table *t,
+					      uid_t rmt_uid, uid_t lcl_uid,
+					      gid_t rmt_gid, gid_t lcl_gid)
+{
+	struct list_head *head;
+	struct lustre_idmap_entry *e;
+
+	head = &t->lit_idmaps[RMT_UIDMAP_IDX][lustre_idmap_hashfunc(rmt_uid)];
+	list_for_each_entry(e, head, lie_rmt_uid_hash)
+		if (e->lie_rmt_uid == rmt_uid) {
+			if (e->lie_lcl_uid == lcl_uid) {
+				if (e->lie_rmt_gid == rmt_gid &&
+				    e->lie_lcl_gid == lcl_gid)
+					/* must be quaternion match */
+					return e;
+			} else {
+				/* 1:N uid mapping */
+				CERROR("rmt uid %u already be mapped to %u"
+				       " (new %u)\n", e->lie_rmt_uid,
+				       e->lie_lcl_uid, lcl_uid);
+				return ERR_PTR(-EACCES);
+			}
+		}
+
+	head = &t->lit_idmaps[RMT_GIDMAP_IDX][lustre_idmap_hashfunc(rmt_gid)];
+	list_for_each_entry(e, head, lie_rmt_gid_hash)
+		if (e->lie_rmt_gid == rmt_gid) {
+			if (e->lie_lcl_gid == lcl_gid) {
+				if (unlikely(e->lie_rmt_uid == rmt_uid &&
+				    e->lie_lcl_uid == lcl_uid))
+					/* the uid pass above would already
+					 * have returned this full match */
+					LBUG();
+			} else {
+				/* 1:N gid mapping */
+				CERROR("rmt gid %u already be mapped to %u"
+				       " (new %u)\n", e->lie_rmt_gid,
+				       e->lie_lcl_gid, lcl_gid);
+				return ERR_PTR(-EACCES);
+			}
+		}
+
+	return NULL;
+}
+
+static __u32 idmap_lookup_uid(struct list_head *hash, int reverse,
+			      __u32 uid)
+{
+	struct list_head *bucket = &hash[lustre_idmap_hashfunc(uid)];
+	struct lustre_idmap_entry *ent;
+
+	if (reverse) {	/* local -> remote */
+		list_for_each_entry(ent, bucket, lie_lcl_uid_hash)
+			if (ent->lie_lcl_uid == uid)
+				return ent->lie_rmt_uid;
+		return CFS_IDMAP_NOTFOUND;
+	}
+	/* remote -> local */
+	list_for_each_entry(ent, bucket, lie_rmt_uid_hash)
+		if (ent->lie_rmt_uid == uid)
+			return ent->lie_lcl_uid;
+	return CFS_IDMAP_NOTFOUND;
+}
+
+static __u32 idmap_lookup_gid(struct list_head *hash, int reverse, __u32 gid)
+{
+	struct list_head *bucket = &hash[lustre_idmap_hashfunc(gid)];
+	struct lustre_idmap_entry *ent;
+
+	if (reverse) {	/* local -> remote */
+		list_for_each_entry(ent, bucket, lie_lcl_gid_hash)
+			if (ent->lie_lcl_gid == gid)
+				return ent->lie_rmt_gid;
+		return CFS_IDMAP_NOTFOUND;
+	}
+	/* remote -> local */
+	list_for_each_entry(ent, bucket, lie_rmt_gid_hash)
+		if (ent->lie_rmt_gid == gid)
+			return ent->lie_lcl_gid;
+	return CFS_IDMAP_NOTFOUND;
+}
+
+int lustre_idmap_add(struct lustre_idmap_table *t,
+		     uid_t ruid, uid_t luid,
+		     gid_t rgid, gid_t lgid)
+{
+	struct lustre_idmap_entry *e0, *e1;
+
+	LASSERT(t);
+	/* Add the mapping unless it exists; 0, -ENOMEM or the search error. */
+	spin_lock(&t->lit_lock);
+	e0 = idmap_search_entry(t, ruid, luid, rgid, lgid);
+	spin_unlock(&t->lit_lock);
+	if (!e0) {
+		e0 = idmap_entry_alloc(ruid, luid, rgid, lgid);
+		if (!e0)
+			return -ENOMEM;
+		/* the lock was dropped for the allocation, so search again */
+		spin_lock(&t->lit_lock);
+		e1 = idmap_search_entry(t, ruid, luid, rgid, lgid);
+		if (e1 == NULL) {
+			list_add_tail(&e0->lie_rmt_uid_hash,
+					  &t->lit_idmaps[RMT_UIDMAP_IDX]
+					  [lustre_idmap_hashfunc(ruid)]);
+			list_add_tail(&e0->lie_lcl_uid_hash,
+					  &t->lit_idmaps[LCL_UIDMAP_IDX]
+					  [lustre_idmap_hashfunc(luid)]);
+			list_add_tail(&e0->lie_rmt_gid_hash,
+					  &t->lit_idmaps[RMT_GIDMAP_IDX]
+					  [lustre_idmap_hashfunc(rgid)]);
+			list_add_tail(&e0->lie_lcl_gid_hash,
+					  &t->lit_idmaps[LCL_GIDMAP_IDX]
+					  [lustre_idmap_hashfunc(lgid)]);
+		}
+		spin_unlock(&t->lit_lock);
+		if (e1 != NULL) {	/* lost a race, or hit a conflict */
+			idmap_entry_free(e0);	/* e0 was never hashed */
+			if (IS_ERR(e1))
+				return PTR_ERR(e1);
+		}
+	} else if (IS_ERR(e0)) {
+		return PTR_ERR(e0);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(lustre_idmap_add);
+
+int lustre_idmap_del(struct lustre_idmap_table *t,
+		    uid_t ruid, uid_t luid,
+		    gid_t rgid, gid_t lgid)
+{
+	struct lustre_idmap_entry *match;
+	int rc = 0;
+
+	LASSERT(t);
+	/* search and unlink under one lit_lock hold; no match is not an error */
+	spin_lock(&t->lit_lock);
+	match = idmap_search_entry(t, ruid, luid, rgid, lgid);
+	if (IS_ERR(match))
+		rc = PTR_ERR(match);
+	else if (match != NULL)
+		idmap_entry_free(match);
+	spin_unlock(&t->lit_lock);
+
+	return rc;
+}
+EXPORT_SYMBOL(lustre_idmap_del);
+
+int lustre_idmap_lookup_uid(struct lu_ucred *mu,
+			    struct lustre_idmap_table *t,
+			    int reverse, uid_t uid)
+{
+	struct list_head *buckets;
+
+	/* a ucred in OLD/NEW state carries its own mapping; try it first */
+	if (mu && (mu->uc_valid == UCRED_OLD || mu->uc_valid == UCRED_NEW)) {
+		if (reverse) {
+			if (uid == mu->uc_uid)
+				return mu->uc_o_uid;
+			if (uid == mu->uc_fsuid)
+				return mu->uc_o_fsuid;
+		} else {
+			if (uid == mu->uc_o_uid)
+				return mu->uc_uid;
+			if (uid == mu->uc_o_fsuid)
+				return mu->uc_fsuid;
+		}
+	}
+
+	if (t == NULL)
+		return CFS_IDMAP_NOTFOUND;
+
+	/* pick the hash keyed by the side we are translating from */
+	buckets = t->lit_idmaps[reverse ? LCL_UIDMAP_IDX : RMT_UIDMAP_IDX];
+	spin_lock(&t->lit_lock);
+	uid = idmap_lookup_uid(buckets, reverse, uid);
+	spin_unlock(&t->lit_lock);
+	return uid;
+}
+EXPORT_SYMBOL(lustre_idmap_lookup_uid);
+
+int lustre_idmap_lookup_gid(struct lu_ucred *mu, struct lustre_idmap_table *t,
+			    int reverse, gid_t gid)
+{
+	struct list_head *buckets;
+
+	/* a ucred in OLD/NEW state carries its own mapping; try it first */
+	if (mu && (mu->uc_valid == UCRED_OLD || mu->uc_valid == UCRED_NEW)) {
+		if (reverse) {
+			if (gid == mu->uc_gid)
+				return mu->uc_o_gid;
+			if (gid == mu->uc_fsgid)
+				return mu->uc_o_fsgid;
+		} else {
+			if (gid == mu->uc_o_gid)
+				return mu->uc_gid;
+			if (gid == mu->uc_o_fsgid)
+				return mu->uc_fsgid;
+		}
+	}
+
+	if (t == NULL)
+		return CFS_IDMAP_NOTFOUND;
+
+	/* pick the hash keyed by the side we are translating from */
+	buckets = t->lit_idmaps[reverse ? LCL_GIDMAP_IDX : RMT_GIDMAP_IDX];
+	spin_lock(&t->lit_lock);
+	gid = idmap_lookup_gid(buckets, reverse, gid);
+	spin_unlock(&t->lit_lock);
+	return gid;
+}
+EXPORT_SYMBOL(lustre_idmap_lookup_gid);
+
+struct lustre_idmap_table *lustre_idmap_init(void)
+{
+	struct lustre_idmap_table *t;
+	int idx, slot;
+
+	OBD_ALLOC_PTR(t);
+	if (unlikely(t == NULL))
+		return ERR_PTR(-ENOMEM);
+	/* failure is reported as ERR_PTR(-ENOMEM), never NULL */
+	spin_lock_init(&t->lit_lock);
+	for (idx = 0; idx < ARRAY_SIZE(t->lit_idmaps); idx++)
+		for (slot = 0; slot < ARRAY_SIZE(t->lit_idmaps[idx]); slot++)
+			INIT_LIST_HEAD(&t->lit_idmaps[idx][slot]);
+
+	return t;
+}
+EXPORT_SYMBOL(lustre_idmap_init);
+
+void lustre_idmap_fini(struct lustre_idmap_table *t)
+{
+	struct list_head *list;
+	struct lustre_idmap_entry *e;
+	int i;
+	LASSERT(t);
+
+	list = t->lit_idmaps[RMT_UIDMAP_IDX];	/* walk this one map only */
+	spin_lock(&t->lit_lock);
+	for (i = 0; i < CFS_IDMAP_HASHSIZE; i++)
+		while (!list_empty(&list[i])) {
+			e = list_entry(list[i].next,
+					   struct lustre_idmap_entry,
+					   lie_rmt_uid_hash);
+			idmap_entry_free(e);	/* unlinks e from all four maps */
+		}
+	spin_unlock(&t->lit_lock);
+
+	OBD_FREE_PTR(t);	/* finally release the table itself */
+}
+EXPORT_SYMBOL(lustre_idmap_fini);
diff --git a/drivers/staging/lustre/lustre/obdclass/linkea.c b/drivers/staging/lustre/lustre/obdclass/linkea.c
new file mode 100644
index 000000000000..b5c19ac1470f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/linkea.c
@@ -0,0 +1,194 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2013, Intel Corporation.
+ * Use is subject to license terms.
+ *
+ * Author: Di Wang <di.wang@intel.com>
+ */
+
+#include <lustre/lustre_idl.h>
+#include <obd.h>
+#include <lustre_linkea.h>
+
+int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf)
+{
+	ldata->ld_buf = lu_buf_check_and_alloc(buf, PAGE_CACHE_SIZE);
+	if (!ldata->ld_buf->lb_buf) /* no backing memory was obtained */
+		return -ENOMEM;
+	ldata->ld_leh = ldata->ld_buf->lb_buf; /* header is at buffer start */
+	ldata->ld_leh->leh_reccount = 0; /* no records yet ... */
+	ldata->ld_leh->leh_len = sizeof(*ldata->ld_leh); /* ... header only */
+	ldata->ld_leh->leh_magic = LINK_EA_MAGIC;
+	return 0;
+}
+EXPORT_SYMBOL(linkea_data_new);
+
+/* Validate the header in ld_buf (byte-swapping it if needed), set ld_leh. */
+int linkea_init(struct linkea_data *ldata)
+{
+	struct link_ea_header *hdr;
+
+	LASSERT(ldata->ld_buf != NULL);
+	hdr = ldata->ld_buf->lb_buf;
+	if (hdr->leh_magic == __swab32(LINK_EA_MAGIC)) {
+		/* entries are swabbed by linkea_entry_unpack */
+		hdr->leh_len = __swab64(hdr->leh_len);
+		hdr->leh_reccount = __swab32(hdr->leh_reccount);
+		hdr->leh_magic = LINK_EA_MAGIC;
+	} else if (hdr->leh_magic != LINK_EA_MAGIC) {
+		return -EINVAL;
+	}
+	if (!hdr->leh_reccount)
+		return -ENODATA;
+	ldata->ld_leh = hdr;
+	return 0;
+}
+EXPORT_SYMBOL(linkea_init);
+
+/**
+ * Serialize one link EA record into \a lee.
+ * Fields are written bytewise so no alignment is required; the parent FID
+ * and the two-byte record length are stored big-endian.
+ * \retval record length in bytes (entry header plus name)
+ */
+static int linkea_entry_pack(struct link_ea_entry *lee,
+			     const struct lu_name *lname,
+			     const struct lu_fid *pfid)
+{
+	const int total = sizeof(struct link_ea_entry) + lname->ln_namelen;
+	struct lu_fid be_fid;
+
+	fid_cpu_to_be(&be_fid, pfid);
+	if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LINKEA_CRASH))
+		be_fid.f_ver = ~0;
+
+	memcpy(&lee->lee_parent_fid, &be_fid, sizeof(be_fid));
+	memcpy(lee->lee_name, lname->ln_name, lname->ln_namelen);
+
+	lee->lee_reclen[0] = (total >> 8) & 0xff;
+	lee->lee_reclen[1] = total & 0xff;
+	return total;
+}
+
+void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen,
+			 struct lu_name *lname, struct lu_fid *pfid)
+{
+	*reclen = lee->lee_reclen[1] | (lee->lee_reclen[0] << 8);
+	memcpy(pfid, &lee->lee_parent_fid, sizeof(struct lu_fid));
+	fid_be_to_cpu(pfid, pfid); /* converted in place */
+	lname->ln_name = lee->lee_name; /* points into @lee, nothing copied */
+	lname->ln_namelen = *reclen - sizeof(*lee);
+}
+EXPORT_SYMBOL(linkea_entry_unpack);
+
+/**
+ * Append one (name, parent FID) record after the existing link EA records,
+ * asking for a larger buffer first when the record would not fit.
+ **/
+int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname,
+		   const struct lu_fid *pfid)
+{
+	LASSERT(ldata->ld_leh != NULL);
+	if (!lname || !pfid)
+		return -EINVAL;
+
+	ldata->ld_reclen = lname->ln_namelen + sizeof(struct link_ea_entry);
+	if (ldata->ld_leh->leh_len + ldata->ld_reclen > ldata->ld_buf->lb_len &&
+	    lu_buf_check_and_grow(ldata->ld_buf, ldata->ld_leh->leh_len +
+						 ldata->ld_reclen) < 0)
+		return -ENOMEM;
+
+	/* re-read lb_buf: presumably it can change when the buffer grows */
+	ldata->ld_leh = ldata->ld_buf->lb_buf;
+	ldata->ld_lee = ldata->ld_buf->lb_buf + ldata->ld_leh->leh_len;
+
+	ldata->ld_reclen = linkea_entry_pack(ldata->ld_lee, lname, pfid);
+	ldata->ld_leh->leh_reccount++;
+	ldata->ld_leh->leh_len += ldata->ld_reclen;
+	CDEBUG(D_INODE, "New link_ea name '%.*s' is added\n",
+	       lname->ln_namelen, lname->ln_name);
+
+	return 0;
+}
+EXPORT_SYMBOL(linkea_add_buf);
+
+/** Remove the record at ld_lee by sliding the following records over it */
+void linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname)
+{
+	char *victim = (char *)ldata->ld_lee, *ea_end;
+
+	LASSERT(ldata->ld_leh != NULL && ldata->ld_lee != NULL);
+	ldata->ld_leh->leh_len -= ldata->ld_reclen;
+	ldata->ld_leh->leh_reccount--;
+	ea_end = (char *)ldata->ld_leh + ldata->ld_leh->leh_len;
+	memmove(victim, victim + ldata->ld_reclen, ea_end - victim);
+	CDEBUG(D_INODE, "Old link_ea name '%.*s' is removed\n",
+	       lname->ln_namelen, lname->ln_name);
+}
+EXPORT_SYMBOL(linkea_del_buf);
+
+/**
+ * Search the link EA for a record matching both name and parent FID.
+ *
+ * \param ldata link data to search; afterwards ld_lee points at the match,
+ *	  or is NULL when nothing matched
+ * \param lname name in the parent's directory entry pointing to this object
+ * \param pfid parent fid the link to be found for
+ *
+ * \retval   0 a matching record was found
+ * \retval -ENOENT link does not exist
+ */
+int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname,
+		      const struct lu_fid  *pfid)
+{
+	struct lu_name cur_name;
+	struct lu_fid  cur_fid;
+	int idx;
+
+	LASSERT(ldata->ld_leh != NULL);
+
+	/* records start right after the header */
+	ldata->ld_lee = (struct link_ea_entry *)(ldata->ld_leh + 1);
+
+	for (idx = 0; idx < ldata->ld_leh->leh_reccount; idx++) {
+		linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen,
+				    &cur_name, &cur_fid);
+		if (cur_name.ln_namelen == lname->ln_namelen &&
+		    lu_fid_eq(&cur_fid, pfid) &&
+		    strncmp(cur_name.ln_name, lname->ln_name,
+			    cur_name.ln_namelen) == 0)
+			return 0;
+
+		/* step over this record using its unpacked length */
+		ldata->ld_lee = (struct link_ea_entry *)((char *)ldata->ld_lee +
+							 ldata->ld_reclen);
+	}
+
+	CDEBUG(D_INODE, "Old link_ea name '%.*s' not found\n",
+	       lname->ln_namelen, lname->ln_name);
+	ldata->ld_lee = NULL;
+
+	return -ENOENT;
+}
+EXPORT_SYMBOL(linkea_links_find);
diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c
new file mode 100644
index 000000000000..16208ba9d072
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c
@@ -0,0 +1,430 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/linux/linux-module.c
+ *
+ * Object Devices Class Driver
+ * These are the only exported functions, they provide some generic
+ * infrastructure for managing object devices
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/sched.h>
+#include <linux/lp.h>
+#include <linux/slab.h>
+#include <linux/ioport.h>
+#include <linux/fcntl.h>
+#include <linux/delay.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <asm/io.h>
+#include <asm/ioctls.h>
+#include <asm/poll.h>
+#include <asm/uaccess.h>
+#include <linux/miscdevice.h>
+#include <linux/seq_file.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <linux/lnet/lnetctl.h>
+#include <lprocfs_status.h>
+#include <lustre_ver.h>
+#include <lustre/lustre_build_version.h>
+
+int proc_version;
+
+/* Copy the ioctl request at user pointer @arg into a newly allocated kernel
+ * buffer returned through @buf/@len (it is not freed here on success) and
+ * aim the ioc_inlbuf1..4 pointers into ioc_bulk.  Returns 0 or -errno. */
+int obd_ioctl_getdata(char **buf, int *len, void *arg)
+{
+	struct obd_ioctl_hdr hdr;
+	struct obd_ioctl_data *data;
+	int err;
+	int offset = 0;
+	ENTRY;
+
+	/* copy_from_user() returns the bytes NOT copied, never an errno */
+	if (copy_from_user(&hdr, (void *)arg, sizeof(hdr)))
+		RETURN(-EFAULT);
+
+	if (hdr.ioc_version != OBD_IOCTL_VERSION) {
+		CERROR("Version mismatch kernel (%x) vs application (%x)\n",
+		       OBD_IOCTL_VERSION, hdr.ioc_version);
+		RETURN(-EINVAL);
+	}
+	if (hdr.ioc_len > OBD_MAX_IOCTL_BUFFER) {
+		CERROR("User buffer len %d exceeds %d max buffer\n",
+		       hdr.ioc_len, OBD_MAX_IOCTL_BUFFER);
+		RETURN(-EINVAL);
+	}
+	if (hdr.ioc_len < sizeof(struct obd_ioctl_data)) {
+		CERROR("User buffer too small for ioctl (%d)\n", hdr.ioc_len);
+		RETURN(-EINVAL);
+	}
+
+	/* When there are lots of processes calling vmalloc on multi-core
+	 * system, the high lock contention will hurt performance badly,
+	 * obdfilter-survey is an example, which relies on ioctl. So we'd
+	 * better avoid vmalloc on ioctl path. LU-66 */
+	OBD_ALLOC_LARGE(*buf, hdr.ioc_len);
+	if (*buf == NULL) {
+		CERROR("Cannot allocate control buffer of len %d\n",
+		       hdr.ioc_len);
+		RETURN(-EINVAL);
+	}
+	*len = hdr.ioc_len;
+	data = (struct obd_ioctl_data *)*buf;
+
+	err = -EFAULT;
+	if (copy_from_user(*buf, (void *)arg, hdr.ioc_len))
+		goto out_free;
+
+	/* userspace may have rewritten the header between the two copies */
+	err = -EINVAL;
+	if (data->ioc_len != hdr.ioc_len || obd_ioctl_is_invalid(data)) {
+		CERROR("ioctl not correctly formatted\n");
+		goto out_free;
+	}
+
+	if (data->ioc_inllen1) {
+		data->ioc_inlbuf1 = &data->ioc_bulk[0];
+		offset += cfs_size_round(data->ioc_inllen1);
+	}
+
+	if (data->ioc_inllen2) {
+		data->ioc_inlbuf2 = &data->ioc_bulk[0] + offset;
+		offset += cfs_size_round(data->ioc_inllen2);
+	}
+
+	if (data->ioc_inllen3) {
+		data->ioc_inlbuf3 = &data->ioc_bulk[0] + offset;
+		offset += cfs_size_round(data->ioc_inllen3);
+	}
+
+	if (data->ioc_inllen4)
+		data->ioc_inlbuf4 = &data->ioc_bulk[0] + offset;
+	EXIT;
+	return 0;
+out_free:
+	OBD_FREE_LARGE(*buf, hdr.ioc_len);
+	RETURN(err);
+}
+EXPORT_SYMBOL(obd_ioctl_getdata);
+
+/* Copy @len bytes of @data out to the user pointer @arg; 0 or -EFAULT. */
+int obd_ioctl_popdata(void *arg, void *data, int len)
+{
+	/* any uncopied byte counts as a fault */
+	if (copy_to_user(arg, data, len))
+		return -EFAULT;
+
+	return 0;
+}
+EXPORT_SYMBOL(obd_ioctl_popdata);
+
+/* opening /dev/obd: fail if the module ref (dropped in release) is refused */
+static int obd_class_open(struct inode *inode, struct file *file)
+{
+	ENTRY;
+	if (!try_module_get(THIS_MODULE))
+		RETURN(-ENODEV);
+	RETURN(0);
+}
+
+/* closing /dev/obd: drop the module reference taken in obd_class_open() */
+static int obd_class_release(struct inode *inode, struct file *file)
+{
+	ENTRY;
+	module_put(THIS_MODULE);
+
+	RETURN(0);
+}
+
+/* ioctl entry point for /dev/obd */
+static long obd_class_ioctl(struct file *filp, unsigned int cmd,
+			    unsigned long arg)
+{
+	int rc;
+	ENTRY;
+
+	/* Allow non-root access for OBD_IOC_PING_TARGET - used by lfs check */
+	if (!cfs_capable(CFS_CAP_SYS_ADMIN) && cmd != OBD_IOC_PING_TARGET)
+		RETURN(-EACCES);
+	/* ignore all tty ioctls */
+	if ((cmd & 0xffffff00) == ((int)'T') << 8)
+		RETURN(-ENOTTY);
+
+	rc = class_handle_ioctl(cmd, arg);
+	RETURN(rc);
+}
+
+/* file operations of the /dev/obd character device */
+static struct file_operations obd_psdev_fops = {
+	.owner	  = THIS_MODULE,
+	.unlocked_ioctl = obd_class_ioctl, /* control requests */
+	.open	   = obd_class_open,      /* takes a module reference */
+	.release	= obd_class_release,   /* drops that reference */
+};
+
+/* device descriptor binding OBD_DEV_MINOR/OBD_DEV_NAME to obd_psdev_fops */
+psdev_t obd_psdev = {
+	.minor = OBD_DEV_MINOR,
+	.name  = OBD_DEV_NAME,
+	.fops  = &obd_psdev_fops,
+};
+
+
+#ifdef LPROCFS
+int obd_proc_read_version(char *page, char **start, off_t off, int count,
+			  int *eof, void *data)
+{
+	*eof = 1; /* the whole answer is produced by this one call */
+	return snprintf(page, count,
+			"lustre: %s\nkernel: %s\nbuild:  %s\n",
+			LUSTRE_VERSION_STRING, "patchless_client", BUILD_VERSION);
+}
+
+/* proc read handler: always reports the fixed string "on" */
+int obd_proc_read_pinger(char *page, char **start, off_t off, int count,
+			 int *eof, void *data)
+{
+	*eof = 1;
+
+	return snprintf(page, count, "%s\n", "on");
+}
+
+/**
+ * Check all obd devices health
+ *
+ * \param page  [out] buffer receiving the report
+ * \param start [in]  unused
+ * \param off   [in]  unused
+ * \param count [in]  size of \a page in bytes
+ * \param eof   [out] always set to 1
+ * \param data  [in]  unused
+ *
+ * The prototype is the procfs read callback, please refer to kernel
+ * code fs/proc/generic.c proc_file_read()
+ *
+ * \retval number of characters stored in \a page
+ */
+static int obd_proc_read_health(char *page, char **start, off_t off,
+				int count, int *eof, void *data)
+{
+	int rc = 0, i;
+	*eof = 1;
+
+	/* scnprintf() returns the bytes stored, keeping rc below count */
+	if (libcfs_catastrophe)
+		rc += scnprintf(page + rc, count - rc, "LBUG\n");
+
+	read_lock(&obd_dev_lock);
+	for (i = 0; i < class_devno_max(); i++) {
+		struct obd_device *obd;
+
+		obd = class_num2obd(i);
+		if (obd == NULL || !obd->obd_attached || !obd->obd_set_up)
+			continue;
+
+		LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+		if (obd->obd_stopping)
+			continue;
+
+		class_incref(obd, __FUNCTION__, current);
+		read_unlock(&obd_dev_lock);
+		if (obd_health_check(NULL, obd)) {
+			rc += scnprintf(page + rc, count - rc,
+					"device %s reported unhealthy\n",
+					obd->obd_name);
+		}
+		class_decref(obd, __FUNCTION__, current);
+		read_lock(&obd_dev_lock);
+	}
+	read_unlock(&obd_dev_lock);
+
+	if (rc == 0)
+		return scnprintf(page, count, "healthy\n");
+
+	rc += scnprintf(page + rc, count - rc, "NOT HEALTHY\n");
+	return rc;
+}
+
+static int obd_proc_rd_jobid_var(char *page, char **start, off_t off,
+				int count, int *eof, void *data)
+{	/* prints obd_jobid_var plus '\n'; note that *eof is left untouched */
+	return snprintf(page, count, "%s\n", obd_jobid_var);
+}
+
+static int obd_proc_wr_jobid_var(struct file *file, const char *buffer,
+				unsigned long count, void *data)
+{
+	if (!count || count > JOBSTATS_JOBID_VAR_MAX_LEN)
+		return -EINVAL;
+	memset(obd_jobid_var, 0, JOBSTATS_JOBID_VAR_MAX_LEN + 1);
+	if (copy_from_user(obd_jobid_var, buffer, count)) /* user pointer */
+		return -EFAULT;
+	if (obd_jobid_var[count - 1] == '\n') /* trim the trailing '\n' */
+		obd_jobid_var[count - 1] = '\0';
+	return count;
+}
+/* Root for /proc/fs/lustre */
+struct proc_dir_entry *proc_lustre_root = NULL;
+EXPORT_SYMBOL(proc_lustre_root);
+
+struct lprocfs_vars lprocfs_base[] = { /* registered under "fs/lustre" */
+	{ "version", obd_proc_read_version, NULL, NULL },
+	{ "pinger", obd_proc_read_pinger, NULL, NULL },
+	{ "health_check", obd_proc_read_health, NULL, NULL },
+	{ "jobid_var", obd_proc_rd_jobid_var,
+		       obd_proc_wr_jobid_var, NULL }, /* the only writable one */
+	{ 0 } /* presumably the end-of-table marker */
+};
+#else
+#define lprocfs_base NULL
+#endif /* LPROCFS */
+
+static void *obd_device_list_seq_start(struct seq_file *p, loff_t *pos)
+{
+	/* the cursor is the position itself; NULL ends the walk */
+	bool in_range = *pos < class_devno_max();
+
+	return in_range ? pos : NULL;
+}
+
+static void obd_device_list_seq_stop(struct seq_file *p, void *v)
+{ /* intentionally empty: start/next acquire nothing that needs release */
+}
+
+static void *obd_device_list_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+	*pos += 1; /* advanced even when this step ends the walk */
+
+	if (*pos < class_devno_max())
+		return pos;
+	return NULL;
+}
+
+/* Emit one "index status type name uuid refcount" line per existing device. */
+static int obd_device_list_seq_show(struct seq_file *p, void *v)
+{
+	int devno = (int)*(loff_t *)v;
+	struct obd_device *dev = class_num2obd(devno);
+	char *state = "--";
+
+	if (!dev)
+		return 0;
+	LASSERT(dev->obd_magic == OBD_DEVICE_MAGIC);
+
+	/* first matching flag wins: stopping, inactive, set up, attached */
+	if (dev->obd_stopping)
+		state = "ST";
+	else if (dev->obd_inactive)
+		state = "IN";
+	else if (dev->obd_set_up)
+		state = "UP";
+	else if (dev->obd_attached)
+		state = "AT";
+
+	return seq_printf(p, "%3d %s %s %s %s %d\n", devno, state,
+			  dev->obd_type->typ_name, dev->obd_name,
+			  dev->obd_uuid.uuid,
+			  atomic_read(&dev->obd_refcount));
+}
+
+struct seq_operations obd_device_list_sops = {
+	.start = obd_device_list_seq_start, /* NULL once *pos is out of range */
+	.stop = obd_device_list_seq_stop, /* empty */
+	.next = obd_device_list_seq_next, /* increments *pos */
+	.show = obd_device_list_seq_show, /* prints one device line */
+};
+
+static int obd_device_list_open(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *entry = PDE(inode);
+	int rc;
+
+	rc = seq_open(file, &obd_device_list_sops);
+	if (rc != 0)
+		return rc;
+
+	/* after seq_open() the seq_file is read from file->private_data */
+	((struct seq_file *)file->private_data)->private = entry->data;
+
+	return 0;
+}
+
+struct file_operations obd_device_list_fops = { /* used for "devices" */
+	.owner   = THIS_MODULE,
+	.open    = obd_device_list_open, /* attaches obd_device_list_sops */
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+};
+
+/* Register sysctls and "fs/lustre"; a "devices" failure is only logged. */
+int class_procfs_init(void)
+{
+	ENTRY;
+
+	obd_sysctl_init();
+	proc_lustre_root = lprocfs_register("fs/lustre", NULL,
+					    lprocfs_base, NULL);
+	if (lprocfs_seq_create(proc_lustre_root, "devices", 0444,
+			       &obd_device_list_fops, NULL) != 0)
+		CERROR("error adding /proc/fs/lustre/devices file\n");
+
+	RETURN(0);
+}
+
+/* Remove proc_lustre_root if it is set; always returns 0. */
+int class_procfs_clean(void)
+{
+	ENTRY;
+	if (proc_lustre_root != NULL)
+		lprocfs_remove(&proc_lustre_root);
+	RETURN(0);
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c
new file mode 100644
index 000000000000..6ee347153a16
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c
@@ -0,0 +1,222 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/linux/linux-obdo.c
+ *
+ * Object Devices Class Driver
+ * These are the only exported functions, they provide some generic
+ * infrastructure for managing object devices
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/module.h>
+#include <obd_class.h>
+#include <lustre/lustre_idl.h>
+
+#include <linux/fs.h>
+#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
+
+/* FIXME: copy of obdo_from_inode. Fill @dst from the @la fields in @valid. */
+void obdo_from_la(struct obdo *dst, struct lu_attr *la, __u64 valid)
+{
+	obd_flag filled = 0;
+
+	if (valid & LA_ATIME) {
+		filled |= OBD_MD_FLATIME;
+		dst->o_atime = la->la_atime;
+	}
+	if (valid & LA_MTIME) {
+		filled |= OBD_MD_FLMTIME;
+		dst->o_mtime = la->la_mtime;
+	}
+	if (valid & LA_CTIME) {
+		filled |= OBD_MD_FLCTIME;
+		dst->o_ctime = la->la_ctime;
+	}
+	if (valid & LA_SIZE) {
+		filled |= OBD_MD_FLSIZE;
+		dst->o_size = la->la_size;
+	}
+	if (valid & LA_BLOCKS) {  /* allocation of space (x512 bytes) */
+		filled |= OBD_MD_FLBLOCKS;
+		dst->o_blocks = la->la_blocks;
+	}
+	if (valid & LA_TYPE) {
+		dst->o_mode &= S_IALLUGO;
+		dst->o_mode |= la->la_mode & S_IFMT;
+		filled |= OBD_MD_FLTYPE;
+	}
+	if (valid & LA_MODE) {
+		dst->o_mode &= S_IFMT;
+		dst->o_mode |= la->la_mode & S_IALLUGO;
+		filled |= OBD_MD_FLMODE;
+	}
+	if (valid & LA_UID) {
+		filled |= OBD_MD_FLUID;
+		dst->o_uid = la->la_uid;
+	}
+	if (valid & LA_GID) {
+		filled |= OBD_MD_FLGID;
+		dst->o_gid = la->la_gid;
+	}
+	dst->o_valid |= filled;
+}
+EXPORT_SYMBOL(obdo_from_la);
+
+/* FIXME: copy of obdo_from_inode. Fill @dst from @obdo; la_valid is reset. */
+void la_from_obdo(struct lu_attr *dst, struct obdo *obdo, obd_flag valid)
+{
+	__u64 filled = 0;
+
+	valid &= obdo->o_valid;
+
+	if (valid & OBD_MD_FLATIME) {
+		filled |= LA_ATIME;
+		dst->la_atime = obdo->o_atime;
+	}
+	if (valid & OBD_MD_FLMTIME) {
+		filled |= LA_MTIME;
+		dst->la_mtime = obdo->o_mtime;
+	}
+	if (valid & OBD_MD_FLCTIME) {
+		filled |= LA_CTIME;
+		dst->la_ctime = obdo->o_ctime;
+	}
+	if (valid & OBD_MD_FLSIZE) {
+		filled |= LA_SIZE;
+		dst->la_size = obdo->o_size;
+	}
+	if (valid & OBD_MD_FLBLOCKS) {
+		filled |= LA_BLOCKS;
+		dst->la_blocks = obdo->o_blocks;
+	}
+	if (valid & OBD_MD_FLTYPE) {
+		dst->la_mode &= S_IALLUGO;
+		dst->la_mode |= obdo->o_mode & S_IFMT;
+		filled |= LA_TYPE;
+	}
+	if (valid & OBD_MD_FLMODE) {
+		dst->la_mode &= S_IFMT;
+		dst->la_mode |= obdo->o_mode & S_IALLUGO;
+		filled |= LA_MODE;
+	}
+	if (valid & OBD_MD_FLUID) {
+		filled |= LA_UID;
+		dst->la_uid = obdo->o_uid;
+	}
+	if (valid & OBD_MD_FLGID) {
+		filled |= LA_GID;
+		dst->la_gid = obdo->o_gid;
+	}
+	dst->la_valid = filled;
+}
+EXPORT_SYMBOL(la_from_obdo);
+
+/* Refresh @dst from @src: times, block size and block count only move
+ * forward here, while the size is simply overwritten. */
+void obdo_refresh_inode(struct inode *dst, struct obdo *src, obd_flag valid)
+{
+	valid &= src->o_valid;
+
+	if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
+		CDEBUG(D_INODE,
+		       "valid "LPX64", cur time %lu/%lu, new "LPU64"/"LPU64"\n",
+		       src->o_valid, LTIME_S(dst->i_mtime),
+		       LTIME_S(dst->i_ctime), src->o_mtime, src->o_ctime);
+
+	if ((valid & OBD_MD_FLATIME) && LTIME_S(dst->i_atime) < src->o_atime)
+		LTIME_S(dst->i_atime) = src->o_atime;
+	if ((valid & OBD_MD_FLMTIME) && LTIME_S(dst->i_mtime) < src->o_mtime)
+		LTIME_S(dst->i_mtime) = src->o_mtime;
+	if ((valid & OBD_MD_FLCTIME) && LTIME_S(dst->i_ctime) < src->o_ctime)
+		LTIME_S(dst->i_ctime) = src->o_ctime;
+	if (valid & OBD_MD_FLSIZE)
+		i_size_write(dst, src->o_size);
+	/* optimum IO size */
+	if ((valid & OBD_MD_FLBLKSZ) && (1 << dst->i_blkbits) < src->o_blksize)
+		dst->i_blkbits = ffs(src->o_blksize) - 1;
+
+	if (dst->i_blkbits < PAGE_CACHE_SHIFT)
+		dst->i_blkbits = PAGE_CACHE_SHIFT;
+
+	/* allocation of space */
+	if ((valid & OBD_MD_FLBLOCKS) && dst->i_blocks < src->o_blocks)
+		/* XXX shouldn't overflow be checked here like in
+		 * obdo_to_inode(). */
+		dst->i_blocks = src->o_blocks;
+}
+EXPORT_SYMBOL(obdo_refresh_inode);
+
+/* Apply @src to @dst for the bits in @valid; only ctime is forward-only. */
+void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid)
+{
+	valid &= src->o_valid;
+
+	LASSERTF(!(valid & (OBD_MD_FLTYPE | OBD_MD_FLGENER | OBD_MD_FLFID |
+			    OBD_MD_FLID | OBD_MD_FLGROUP)),
+		 "object "DOSTID", valid %x\n", POSTID(&src->o_oi), valid);
+	if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
+		CDEBUG(D_INODE,
+		       "valid "LPX64", cur time %lu/%lu, new "LPU64"/"LPU64"\n",
+		       src->o_valid, LTIME_S(dst->i_mtime),
+		       LTIME_S(dst->i_ctime), src->o_mtime, src->o_ctime);
+
+	if (valid & OBD_MD_FLATIME)
+		LTIME_S(dst->i_atime) = src->o_atime;
+	if (valid & OBD_MD_FLMTIME)
+		LTIME_S(dst->i_mtime) = src->o_mtime;
+	if ((valid & OBD_MD_FLCTIME) && LTIME_S(dst->i_ctime) < src->o_ctime)
+		LTIME_S(dst->i_ctime) = src->o_ctime;
+	if (valid & OBD_MD_FLSIZE)
+		i_size_write(dst, src->o_size);
+	if (valid & OBD_MD_FLBLOCKS) {
+		/* allocation of space; saturate if the value did not fit */
+		dst->i_blocks = src->o_blocks;
+		if (dst->i_blocks < src->o_blocks)
+			dst->i_blocks = -1;
+	}
+	if (valid & OBD_MD_FLBLKSZ)
+		dst->i_blkbits = ffs(src->o_blksize) - 1;
+	if (valid & OBD_MD_FLMODE)
+		dst->i_mode = (src->o_mode & ~S_IFMT) | (dst->i_mode & S_IFMT);
+	if (valid & OBD_MD_FLUID)
+		dst->i_uid = src->o_uid;
+	if (valid & OBD_MD_FLGID)
+		dst->i_gid = src->o_gid;
+	if (valid & OBD_MD_FLFLAGS)
+		dst->i_flags = src->o_flags;
+}
+EXPORT_SYMBOL(obdo_to_inode);
diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c
new file mode 100644
index 000000000000..46aad6813cab
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c
@@ -0,0 +1,445 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include <linux/version.h>
+#include <linux/proc_fs.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/ctype.h>
+#include <asm/bitops.h>
+#include <asm/uaccess.h>
+#include <linux/utsname.h>
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_support.h>
+#include <lprocfs_status.h>
+
+#ifdef CONFIG_SYSCTL
+ctl_table_header_t *obd_table_header = NULL;
+#endif
+
+
+#define OBD_SYSCTL 300
+
+enum {
+	OBD_TIMEOUT = 3,	/* RPC timeout before recovery/intr */
+	OBD_DUMP_ON_TIMEOUT,    /* dump kernel debug log upon timeout */
+	OBD_MEMUSED,	    /* bytes currently OBD_ALLOCated */
+	OBD_PAGESUSED,	  /* pages currently OBD_PAGE_ALLOCated */
+	OBD_MAXMEMUSED,	 /* maximum bytes OBD_ALLOCated concurrently */
+	OBD_MAXPAGESUSED,       /* maximum pages OBD_PAGE_ALLOCated concurrently */
+	OBD_SYNCFILTER,	 /* XXX temporary, as we play with sync osts.. */
+	OBD_LDLM_TIMEOUT,       /* LDLM timeout for ASTs before client eviction */
+	OBD_DUMP_ON_EVICTION,   /* dump kernel debug log upon eviction */
+	OBD_DEBUG_PEER_ON_TIMEOUT, /* dump peer debug when RPC times out */
+	OBD_ALLOC_FAIL_RATE,    /* memory allocation random failure rate */
+	OBD_MAX_DIRTY_PAGES,    /* maximum dirty pages */
+	OBD_AT_MIN,	     /* Adaptive timeouts params */
+	OBD_AT_MAX,
+	OBD_AT_EXTRA,
+	OBD_AT_EARLY_MARGIN,
+	OBD_AT_HISTORY,
+};
+
+
+int LL_PROC_PROTO(proc_set_timeout)
+{
+	int rc = ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+
+	/* ldlm_timeout caught up with obd_timeout: cut it to a third, min 1 */
+	if (obd_timeout <= ldlm_timeout)
+		ldlm_timeout = max(obd_timeout / 3, 1U);
+	return rc;
+}
+
+int LL_PROC_PROTO(proc_memory_alloc)
+{
+	char text[22];
+	int nbytes;
+	DECLARE_LL_PROC_PPOS_DECL;
+
+	if (*lenp == 0 || (*ppos != 0 && !write)) {
+		*lenp = 0;
+		return 0;
+	}
+	if (write)	/* this value can only be read */
+		return -EINVAL;
+
+	nbytes = snprintf(text, sizeof(text), LPU64"\n", obd_memory_sum());
+	if (nbytes > *lenp)	/* never hand back more than was asked for */
+		nbytes = *lenp;
+	text[nbytes] = '\0';
+	if (copy_to_user(buffer, text, nbytes))
+		return -EFAULT;
+	*ppos += nbytes;
+	*lenp = nbytes;
+	return 0;
+}
+
+int LL_PROC_PROTO(proc_pages_alloc)
+{
+	char text[22];
+	int nbytes;
+	DECLARE_LL_PROC_PPOS_DECL;
+
+	if (*lenp == 0 || (*ppos != 0 && !write)) {
+		*lenp = 0;
+		return 0;
+	}
+	if (write)	/* this value can only be read */
+		return -EINVAL;
+
+	nbytes = snprintf(text, sizeof(text), LPU64"\n", obd_pages_sum());
+	if (nbytes > *lenp)	/* never hand back more than was asked for */
+		nbytes = *lenp;
+	text[nbytes] = '\0';
+	if (copy_to_user(buffer, text, nbytes))
+		return -EFAULT;
+	*ppos += nbytes;
+	*lenp = nbytes;
+	return 0;
+}
+
+int LL_PROC_PROTO(proc_mem_max)
+{
+	char text[22];
+	int nbytes;
+	DECLARE_LL_PROC_PPOS_DECL;
+
+	if (*lenp == 0 || (*ppos != 0 && !write)) {
+		*lenp = 0;
+		return 0;
+	}
+	if (write)	/* this value can only be read */
+		return -EINVAL;
+
+	nbytes = snprintf(text, sizeof(text), LPU64"\n", obd_memory_max());
+	if (nbytes > *lenp)	/* never hand back more than was asked for */
+		nbytes = *lenp;
+	text[nbytes] = '\0';
+	if (copy_to_user(buffer, text, nbytes))
+		return -EFAULT;
+	*ppos += nbytes;
+	*lenp = nbytes;
+	return 0;
+}
+
+int LL_PROC_PROTO(proc_pages_max)
+{
+	char text[22];
+	int nbytes;
+	DECLARE_LL_PROC_PPOS_DECL;
+
+	if (*lenp == 0 || (*ppos != 0 && !write)) {
+		*lenp = 0;
+		return 0;
+	}
+	if (write)	/* this value can only be read */
+		return -EINVAL;
+
+	nbytes = snprintf(text, sizeof(text), LPU64"\n", obd_pages_max());
+	if (nbytes > *lenp)	/* never hand back more than was asked for */
+		nbytes = *lenp;
+	text[nbytes] = '\0';
+	if (copy_to_user(buffer, text, nbytes))
+		return -EFAULT;
+	*ppos += nbytes;
+	*lenp = nbytes;
+	return 0;
+}
+
+int LL_PROC_PROTO(proc_max_dirty_pages_in_mb)
+{
+	int rc = 0;
+	DECLARE_LL_PROC_PPOS_DECL;
+
+	if (!table->data || !table->maxlen || !*lenp || (*ppos && !write)) {
+		*lenp = 0;
+		return 0;
+	}
+	if (!write) {
+		char text[21];
+		int nbytes;
+
+		nbytes = lprocfs_read_frac_helper(text, sizeof(text),
+						  *(unsigned int*)table->data,
+						  1 << (20 - PAGE_CACHE_SHIFT));
+		if (nbytes > *lenp)
+			nbytes = *lenp;
+		text[nbytes] = '\0';
+		if (copy_to_user(buffer, text, nbytes))
+			return -EFAULT;
+		*lenp = nbytes;
+	} else {
+		rc = lprocfs_write_frac_helper(buffer, *lenp,
+					       (unsigned int*)table->data,
+					       1 << (20 - PAGE_CACHE_SHIFT));
+		/* Clamp the new limit: at most 90% of system memory and
+		 * never below the hard minimum of 4MB. */
+		if (obd_max_dirty_pages > ((num_physpages / 10) * 9)) {
+			CERROR("Refusing to set max dirty pages to %u, which "
+			       "is more than 90%% of available RAM; setting "
+			       "to %lu\n", obd_max_dirty_pages,
+			       ((num_physpages / 10) * 9));
+			obd_max_dirty_pages = ((num_physpages / 10) * 9);
+		} else if (obd_max_dirty_pages < 4 << (20 - PAGE_CACHE_SHIFT)) {
+			obd_max_dirty_pages = 4 << (20 - PAGE_CACHE_SHIFT);
+		}
+	}
+	*ppos += *lenp;
+	return rc;
+}
+
+int LL_PROC_PROTO(proc_alloc_fail_rate)
+{
+	int rc = 0;
+	DECLARE_LL_PROC_PPOS_DECL;
+
+	if (!table->data || !table->maxlen || !*lenp || (*ppos && !write)) {
+		*lenp = 0;
+		return 0;
+	}
+	if (!write) {
+		char text[21];
+		int  nbytes;
+
+		nbytes = lprocfs_read_frac_helper(text, sizeof(text),
+						  *(unsigned int*)table->data,
+						  OBD_ALLOC_FAIL_MULT);
+		if (nbytes > *lenp)
+			nbytes = *lenp;
+		text[nbytes] = '\0';
+		if (copy_to_user(buffer, text, nbytes))
+			return -EFAULT;
+		*lenp = nbytes;
+	} else {
+		rc = lprocfs_write_frac_helper(buffer, *lenp,
+					       (unsigned int*)table->data,
+					       OBD_ALLOC_FAIL_MULT);
+	}
+	*ppos += *lenp;
+	return rc;
+}
+
+int LL_PROC_PROTO(proc_at_min)
+{	/* forwards unchanged to ll_proc_dointvec() */
+	return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+int LL_PROC_PROTO(proc_at_max)
+{	/* forwards unchanged to ll_proc_dointvec() */
+	return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+int LL_PROC_PROTO(proc_at_extra)
+{	/* forwards unchanged to ll_proc_dointvec() */
+	return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+int LL_PROC_PROTO(proc_at_early_margin)
+{	/* forwards unchanged to ll_proc_dointvec() */
+	return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+int LL_PROC_PROTO(proc_at_history)
+{	/* forwards unchanged to ll_proc_dointvec() */
+	return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+
+#ifdef CONFIG_SYSCTL
+static ctl_table_t obd_table[] = { /* children of the "lustre" sysctl dir */
+	{
+		INIT_CTL_NAME(OBD_TIMEOUT)
+		.procname = "timeout",
+		.data     = &obd_timeout,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_set_timeout /* may also adjust ldlm_timeout */
+	},
+	{
+		INIT_CTL_NAME(OBD_DEBUG_PEER_ON_TIMEOUT)
+		.procname = "debug_peer_on_timeout",
+		.data     = &obd_debug_peer_on_timeout,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		INIT_CTL_NAME(OBD_DUMP_ON_TIMEOUT)
+		.procname = "dump_on_timeout",
+		.data     = &obd_dump_on_timeout,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		INIT_CTL_NAME(OBD_DUMP_ON_EVICTION)
+		.procname = "dump_on_eviction",
+		.data     = &obd_dump_on_eviction,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		INIT_CTL_NAME(OBD_MEMUSED)
+		.procname = "memused",
+		.data     = NULL,
+		.maxlen   = 0,
+		.mode     = 0444,
+		.proc_handler = &proc_memory_alloc /* handler rejects writes */
+	},
+	{
+		INIT_CTL_NAME(OBD_PAGESUSED)
+		.procname = "pagesused",
+		.data     = NULL,
+		.maxlen   = 0,
+		.mode     = 0444,
+		.proc_handler = &proc_pages_alloc /* handler rejects writes */
+	},
+	{
+		INIT_CTL_NAME(OBD_MAXMEMUSED)
+		.procname = "memused_max",
+		.data     = NULL,
+		.maxlen   = 0,
+		.mode     = 0444,
+		.proc_handler = &proc_mem_max /* handler rejects writes */
+	},
+	{
+		INIT_CTL_NAME(OBD_MAXPAGESUSED)
+		.procname = "pagesused_max",
+		.data     = NULL,
+		.maxlen   = 0,
+		.mode     = 0444,
+		.proc_handler = &proc_pages_max /* handler rejects writes */
+	},
+	{
+		INIT_CTL_NAME(OBD_LDLM_TIMEOUT)
+		.procname = "ldlm_timeout",
+		.data     = &ldlm_timeout,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_set_timeout /* same handler as "timeout" */
+	},
+	{
+		INIT_CTL_NAME(OBD_ALLOC_FAIL_RATE)
+		.procname = "alloc_fail_rate",
+		.data     = &obd_alloc_fail_rate,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_alloc_fail_rate
+	},
+	{
+		INIT_CTL_NAME(OBD_MAX_DIRTY_PAGES)
+		.procname = "max_dirty_mb",
+		.data     = &obd_max_dirty_pages,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_max_dirty_pages_in_mb
+	},
+	{
+		INIT_CTL_NAME(OBD_AT_MIN)
+		.procname = "at_min",
+		.data     = &at_min,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_at_min
+	},
+	{
+		INIT_CTL_NAME(OBD_AT_MAX)
+		.procname = "at_max",
+		.data     = &at_max,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_at_max
+	},
+	{
+		INIT_CTL_NAME(OBD_AT_EXTRA)
+		.procname = "at_extra",
+		.data     = &at_extra,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_at_extra
+	},
+	{
+		INIT_CTL_NAME(OBD_AT_EARLY_MARGIN)
+		.procname = "at_early_margin",
+		.data     = &at_early_margin,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_at_early_margin
+	},
+	{
+		INIT_CTL_NAME(OBD_AT_HISTORY)
+		.procname = "at_history",
+		.data     = &at_history,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_at_history
+	},
+	{       INIT_CTL_NAME(0)    } /* presumably the end-of-table marker */
+};
+
+static ctl_table_t parent_table[] = { /* passed to the sysctl registration */
+	{
+		INIT_CTL_NAME(OBD_SYSCTL)
+		.procname = "lustre",
+		.data     = NULL,
+		.maxlen   = 0,
+		.mode     = 0555,
+		.child    = obd_table /* the entries live one level down */
+	},
+	{       INIT_CTL_NAME(0)   } /* presumably the end-of-table marker */
+};
+#endif
+
+void obd_sysctl_init(void)
+{
+#ifdef CONFIG_SYSCTL
+	if (obd_table_header == NULL) /* skip when already registered */
+		obd_table_header = cfs_register_sysctl_table(parent_table, 0);
+#endif
+}
+
+void obd_sysctl_clean(void)
+{
+#ifdef CONFIG_SYSCTL
+	if (obd_table_header != NULL)
+		unregister_sysctl_table(obd_table_header);
+	obd_table_header = NULL; /* lets obd_sysctl_init() register again */
+#endif
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/llog.c b/drivers/staging/lustre/lustre/obdclass/llog.c
new file mode 100644
index 000000000000..b1d215e56991
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog.c
@@ -0,0 +1,966 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog.c
+ *
+ * OST<->MDS recovery logging infrastructure.
+ * Invariants in implementation:
+ * - we do not share logs among different OST<->MDS connections, so that
+ *   if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ * Author: Alex Zhuravlev <bzzz@whamcloud.com>
+ * Author: Mikhail Pershin <tappro@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include "llog_internal.h"
+
+/*
+ * Allocate a new log or catalog handle for llog_open(), holding one
+ * reference. Returns ERR_PTR(-ENOMEM), never NULL, on failure.
+ */
+struct llog_handle *llog_alloc_handle(void)
+{
+	struct llog_handle *lgh;
+
+	OBD_ALLOC_PTR(lgh);
+	if (!lgh)
+		return ERR_PTR(-ENOMEM);
+
+	/* the creator owns the initial reference */
+	atomic_set(&lgh->lgh_refcount, 1);
+	INIT_LIST_HEAD(&lgh->u.phd.phd_entry);
+	spin_lock_init(&lgh->lgh_hdr_lock);
+	init_rwsem(&lgh->lgh_lock);
+	return lgh;
+}
+
+/*
+ * Free an llog handle and, if one is attached, its header buffer.
+ */
+void llog_free_handle(struct llog_handle *loghandle)
+{
+	struct llog_log_hdr *hdr;
+
+	LASSERT(loghandle != NULL);
+	hdr = loghandle->lgh_hdr;
+	/* no header attached, e.g. after a failed llog_init_handle */
+	if (hdr != NULL) {
+		if (hdr->llh_flags & LLOG_F_IS_PLAIN)
+			LASSERT(list_empty(&loghandle->u.phd.phd_entry));
+		else if (hdr->llh_flags & LLOG_F_IS_CAT)
+			LASSERT(list_empty(&loghandle->u.chd.chd_head));
+		LASSERT(sizeof(*hdr) == LLOG_CHUNK_SIZE);
+		OBD_FREE(loghandle->lgh_hdr, LLOG_CHUNK_SIZE);
+	}
+	OBD_FREE_PTR(loghandle);
+}
+
+void llog_handle_get(struct llog_handle *loghandle)
+{
+	atomic_inc(&loghandle->lgh_refcount);	/* paired with llog_handle_put() */
+}
+
+void llog_handle_put(struct llog_handle *loghandle)
+{
+	LASSERT(atomic_read(&loghandle->lgh_refcount) > 0);
+	if (atomic_dec_and_test(&loghandle->lgh_refcount))
+		llog_free_handle(loghandle);	/* last reference dropped */
+}
+
+/* Cancel record @index: <0 on error, 0 on success, 1 if log got destroyed */
+int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle,
+		    int index)
+{
+	struct llog_log_hdr *llh = loghandle->lgh_hdr;
+	int rc = 0;
+	ENTRY;
+
+	CDEBUG(D_RPCTRACE, "Canceling %d in log "DOSTID"\n",
+	       index, POSTID(&loghandle->lgh_id.lgl_oi));
+	/* bitmap bit 0 belongs to the header record, see llog_read_header() */
+	if (index == 0) {
+		CERROR("Can't cancel index 0 which is header\n");
+		RETURN(-EINVAL);
+	}
+	/* drop the record's bit; an already-clear bit means nothing to cancel */
+	spin_lock(&loghandle->lgh_hdr_lock);
+	if (!ext2_clear_bit(index, llh->llh_bitmap)) {
+		spin_unlock(&loghandle->lgh_hdr_lock);
+		CDEBUG(D_RPCTRACE, "Catalog index %d already clear?\n", index);
+		RETURN(-ENOENT);
+	}
+
+	llh->llh_count--;
+	/* only the header is left and the last bitmap slot is used: zap log */
+	if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) &&
+	    (llh->llh_count == 1) &&
+	    (loghandle->lgh_last_idx == (LLOG_BITMAP_BYTES * 8) - 1)) {
+		spin_unlock(&loghandle->lgh_hdr_lock);
+		rc = llog_destroy(env, loghandle);
+		if (rc < 0) {
+			CERROR("%s: can't destroy empty llog #"DOSTID
+			       "#%08x: rc = %d\n",
+			       loghandle->lgh_ctxt->loc_obd->obd_name,
+			       POSTID(&loghandle->lgh_id.lgl_oi),
+			       loghandle->lgh_id.lgl_ogen, rc);
+			GOTO(out_err, rc);
+		}
+		RETURN(1);
+	}
+	spin_unlock(&loghandle->lgh_hdr_lock);
+	/* write the header record back so the new bitmap and count are stored */
+	rc = llog_write(env, loghandle, &llh->llh_hdr, NULL, 0, NULL, 0);
+	if (rc < 0) {
+		CERROR("%s: fail to write header for llog #"DOSTID
+		       "#%08x: rc = %d\n",
+		       loghandle->lgh_ctxt->loc_obd->obd_name,
+		       POSTID(&loghandle->lgh_id.lgl_oi),
+		       loghandle->lgh_id.lgl_ogen, rc);
+		GOTO(out_err, rc);
+	}
+	RETURN(0);
+out_err:	/* undo the in-memory cancel: set the bit again, restore count */
+	spin_lock(&loghandle->lgh_hdr_lock);
+	ext2_set_bit(index, llh->llh_bitmap);
+	llh->llh_count++;
+	spin_unlock(&loghandle->lgh_hdr_lock);
+	return rc;
+}
+EXPORT_SYMBOL(llog_cancel_rec);
+
+/* Load the llog header via lop_read_header; when that returns LLOG_EEMPTY
+ * set handle->lgh_hdr up as a fresh header (tagged with @uuid if given). */
+static int llog_read_header(const struct lu_env *env,
+			    struct llog_handle *handle, struct obd_uuid *uuid)
+{
+	struct llog_operations *lop;
+	int rc;
+
+	rc = llog_handle2ops(handle, &lop);
+	if (rc)
+		return rc;
+	if (lop->lop_read_header == NULL)
+		return -EOPNOTSUPP;
+
+	rc = lop->lop_read_header(env, handle);
+	if (rc == LLOG_EEMPTY) {
+		struct llog_log_hdr *llh = handle->lgh_hdr;
+
+		handle->lgh_last_idx = 0; /* header is record with index 0 */
+		llh->llh_count = 1;	 /* for the header record */
+		llh->llh_hdr.lrh_type = LLOG_HDR_MAGIC;
+		llh->llh_hdr.lrh_len = llh->llh_tail.lrt_len = LLOG_CHUNK_SIZE;
+		llh->llh_hdr.lrh_index = llh->llh_tail.lrt_index = 0;
+		llh->llh_timestamp = cfs_time_current_sec();
+		if (uuid)
+			memcpy(&llh->llh_tgtuuid, uuid,
+			       sizeof(llh->llh_tgtuuid));
+		llh->llh_bitmap_offset = offsetof(typeof(*llh), llh_bitmap);
+		ext2_set_bit(0, llh->llh_bitmap);
+		rc = 0;
+	}
+	return rc;
+}
+
+/*
+ * Allocate a header for @handle, load it with llog_read_header() and check
+ * the log type and @uuid against it. Returns 0 or a negative errno; on any
+ * failure the header is freed and handle->lgh_hdr is NULL again.
+ */
+int llog_init_handle(const struct lu_env *env, struct llog_handle *handle,
+		     int flags, struct obd_uuid *uuid)
+{
+	struct llog_log_hdr	*llh;
+	int			 rc;
+
+	ENTRY;
+	LASSERT(handle->lgh_hdr == NULL);
+
+	OBD_ALLOC_PTR(llh);
+	if (llh == NULL)
+		RETURN(-ENOMEM);
+	handle->lgh_hdr = llh;
+	/* first assign flags to use llog_client_ops */
+	llh->llh_flags = flags;
+	rc = llog_read_header(env, handle, uuid);
+	if (rc)	/* bail out now so the flag checks below cannot mask rc */
+		GOTO(out, rc);
+	if (unlikely((llh->llh_flags & LLOG_F_IS_PLAIN &&
+		      flags & LLOG_F_IS_CAT) ||
+		     (llh->llh_flags & LLOG_F_IS_CAT &&
+		      flags & LLOG_F_IS_PLAIN))) {
+		CERROR("%s: llog type is %s but initializing %s\n",
+		       handle->lgh_ctxt->loc_obd->obd_name,
+		       llh->llh_flags & LLOG_F_IS_CAT ? "catalog" : "plain",
+		       flags & LLOG_F_IS_CAT ? "catalog" : "plain");
+		GOTO(out, rc = -EINVAL);
+	} else if (llh->llh_flags & (LLOG_F_IS_PLAIN | LLOG_F_IS_CAT)) {
+		/* it is possible to open llog without specifying llog
+		 * type so it is taken from llh_flags */
+		flags = llh->llh_flags;
+	} else {
+		/* for some reason the llh_flags has no type set */
+		CERROR("llog type is not specified!\n");
+		GOTO(out, rc = -EINVAL);
+	}
+	if (unlikely(uuid && !obd_uuid_equals(uuid, &llh->llh_tgtuuid))) {
+		CERROR("%s: llog uuid mismatch: %s/%s\n",
+		       handle->lgh_ctxt->loc_obd->obd_name,
+		       (char *)uuid->uuid, (char *)llh->llh_tgtuuid.uuid);
+		GOTO(out, rc = -EEXIST);
+	}
+
+	if (flags & LLOG_F_IS_CAT) {
+		LASSERT(list_empty(&handle->u.chd.chd_head));
+		INIT_LIST_HEAD(&handle->u.chd.chd_head);
+		llh->llh_size = sizeof(struct llog_logid_rec);
+	} else if (!(flags & LLOG_F_IS_PLAIN)) {
+		CERROR("%s: unknown flags: %#x (expected %#x or %#x)\n",
+		       handle->lgh_ctxt->loc_obd->obd_name,
+		       flags, LLOG_F_IS_CAT, LLOG_F_IS_PLAIN);
+		rc = -EINVAL;
+	}
+out:
+	if (rc) {
+		OBD_FREE_PTR(llh);
+		handle->lgh_hdr = NULL;
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_init_handle);
+
+/*
+ * Record callback (same arguments as the cb run by llog_process_thread()):
+ * writes the payload of @rec to the llog handle passed in @data and logs the
+ * payload's lustre_cfg command and first two strings at D_INFO.
+ */
+int llog_copy_handler(const struct lu_env *env, struct llog_handle *llh,
+		      struct llog_rec_hdr *rec, void *data)
+{
+	struct llog_handle *dst = data;
+	struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1);
+	struct llog_rec_hdr hdr = *rec;
+	int rc;
+	ENTRY;
+
+	/* Append all records */
+	hdr.lrh_len -= sizeof(*rec) + sizeof(struct llog_rec_tail);
+	rc = llog_write(env, dst, &hdr, NULL, 0, lcfg, -1);
+
+	CDEBUG(D_INFO, "idx=%d, rc=%d, len=%d, cmd %x %s %s\n",
+	       rec->lrh_index, rc, rec->lrh_len, lcfg->lcfg_command,
+	       lustre_cfg_string(lcfg, 0), lustre_cfg_string(lcfg, 1));
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_copy_handler);
+
+static int llog_process_thread(void *arg)
+{
+	struct llog_process_info	*lpi = arg;
+	struct llog_handle		*loghandle = lpi->lpi_loghandle;
+	struct llog_log_hdr		*llh = loghandle->lgh_hdr;
+	struct llog_process_cat_data	*cd  = lpi->lpi_catdata;
+	char				*buf;
+	__u64				 cur_offset = LLOG_CHUNK_SIZE;
+	__u64				 last_offset;
+	int				 rc = 0, index = 1, last_index;
+	int				 saved_index = 0;
+	int				 last_called_index = 0;
+
+	ENTRY;
+	/* walk the bitmap, read chunk after chunk, run lpi_cb on set records */
+	LASSERT(llh);
+	/* one LLOG_CHUNK_SIZE buffer is reused for every block read below */
+	OBD_ALLOC(buf, LLOG_CHUNK_SIZE);
+	if (!buf) {
+		lpi->lpi_rc = -ENOMEM;
+		RETURN(0);
+	}
+	/* @cd, if given, bounds the range: (lpcd_first_idx, lpcd_last_idx] */
+	if (cd != NULL) {
+		last_called_index = cd->lpcd_first_idx;
+		index = cd->lpcd_first_idx + 1;
+	}
+	if (cd != NULL && cd->lpcd_last_idx)
+		last_index = cd->lpcd_last_idx;
+	else
+		last_index = LLOG_BITMAP_BYTES * 8 - 1;
+
+	while (rc == 0) {
+		struct llog_rec_hdr *rec;
+
+		/* skip records not set in bitmap */
+		while (index <= last_index &&
+		       !ext2_test_bit(index, llh->llh_bitmap))
+			++index;
+
+		LASSERT(index <= last_index + 1);
+		if (index == last_index + 1)
+			break;
+repeat:
+		CDEBUG(D_OTHER, "index: %d last_index %d\n",
+		       index, last_index);
+
+		/* get the buf with our target record; avoid old garbage */
+		memset(buf, 0, LLOG_CHUNK_SIZE);
+		last_offset = cur_offset;
+		rc = llog_next_block(lpi->lpi_env, loghandle, &saved_index,
+				     index, &cur_offset, buf, LLOG_CHUNK_SIZE);
+		if (rc)
+			GOTO(out, rc);
+
+		/* NB: when rec->lrh_len is accessed it is already swabbed
+		 * since it is used at the "end" of the loop and the rec
+		 * swabbing is done at the beginning of the loop. */
+		for (rec = (struct llog_rec_hdr *)buf;
+		     (char *)rec < buf + LLOG_CHUNK_SIZE;
+		     rec = (struct llog_rec_hdr *)((char *)rec + rec->lrh_len)){
+
+			CDEBUG(D_OTHER, "processing rec 0x%p type %#x\n",
+			       rec, rec->lrh_type);
+
+			if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+				lustre_swab_llog_rec(rec);
+
+			CDEBUG(D_OTHER, "after swabbing, type=%#x idx=%d\n",
+			       rec->lrh_type, rec->lrh_index);
+			/* presumably the zeroed rest of buf: no record stored here */
+			if (rec->lrh_index == 0) {
+				/* probably another rec just got added? */
+				if (index <= loghandle->lgh_last_idx)
+					GOTO(repeat, rc = 0);
+				GOTO(out, rc = 0); /* no more records */
+			}
+			if (rec->lrh_len == 0 ||
+			    rec->lrh_len > LLOG_CHUNK_SIZE) {
+				CWARN("invalid length %d in llog record for "
+				      "index %d/%d\n", rec->lrh_len,
+				      rec->lrh_index, index);
+				GOTO(out, rc = -EINVAL);
+			}
+
+			if (rec->lrh_index < index) {
+				CDEBUG(D_OTHER, "skipping lrh_index %d\n",
+				       rec->lrh_index);
+				continue;
+			}
+
+			CDEBUG(D_OTHER,
+			       "lrh_index: %d lrh_len: %d (%d remains)\n",
+			       rec->lrh_index, rec->lrh_len,
+			       (int)(buf + LLOG_CHUNK_SIZE - (char *)rec));
+
+			loghandle->lgh_cur_idx = rec->lrh_index;
+			loghandle->lgh_cur_offset = (char *)rec - (char *)buf +
+						    last_offset;
+
+			/* if set, process the callback on this record */
+			if (ext2_test_bit(index, llh->llh_bitmap)) {
+				rc = lpi->lpi_cb(lpi->lpi_env, loghandle, rec,
+						 lpi->lpi_cbdata);
+				last_called_index = index;
+				if (rc == LLOG_PROC_BREAK) {
+					GOTO(out, rc);
+				} else if (rc == LLOG_DEL_RECORD) {
+					llog_cancel_rec(lpi->lpi_env,
+							loghandle,
+							rec->lrh_index);
+					rc = 0;
+				}
+				if (rc)
+					GOTO(out, rc);
+			} else {
+				CDEBUG(D_OTHER, "Skipped index %d\n", index);
+			}
+
+			/* next record, still in buffer? */
+			++index;
+			if (index > last_index)
+				GOTO(out, rc = 0);
+		}
+	}
+
+out:
+	if (cd != NULL)
+		cd->lpcd_last_idx = last_called_index;	/* how far lpi_cb got */
+
+	OBD_FREE(buf, LLOG_CHUNK_SIZE);
+	lpi->lpi_rc = rc;	/* result goes here; return value is always 0 */
+	return 0;
+}
+
+/* kthread entry: run llog_process_thread() with a thread-local lu_env */
+static int llog_process_thread_daemonize(void *arg)
+{
+	struct llog_process_info	*lpi = arg;
+	struct lu_env			 env;
+	int				 rc;
+
+	unshare_fs_struct();
+	/* client env has no keys, tags is just 0 */
+	rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD);
+	if (rc) {
+		lpi->lpi_rc = rc;	/* the waiter only reads lpi_rc */
+		goto out;
+	}
+	lpi->lpi_env = &env;
+	rc = llog_process_thread(arg);
+	lu_env_fini(&env);
+out:
+	complete(&lpi->lpi_completion);
+	return rc;
+}
+
+/* Process @loghandle with @cb, inline or (@fork) in a kthread we wait for */
+int llog_process_or_fork(const struct lu_env *env,
+			 struct llog_handle *loghandle,
+			 llog_cb_t cb, void *data, void *catdata, bool fork)
+{
+	struct llog_process_info *lpi;
+	struct task_struct	 *task;
+	int		      rc;
+
+	ENTRY;
+	OBD_ALLOC_PTR(lpi);
+	if (lpi == NULL) {
+		CERROR("cannot alloc pointer\n");
+		RETURN(-ENOMEM);
+	}
+	lpi->lpi_loghandle = loghandle;
+	lpi->lpi_cb	= cb;
+	lpi->lpi_cbdata    = data;
+	lpi->lpi_catdata   = catdata;
+	/* The new thread can't use parent env,
+	 * init the new one in llog_process_thread_daemonize. */
+	lpi->lpi_env = fork ? NULL : env;
+	if (fork) {
+		init_completion(&lpi->lpi_completion);
+		task = kthread_run(llog_process_thread_daemonize, lpi,
+				   "llog_process_thread");
+		if (IS_ERR(task)) {	/* check the pointer, not an int copy */
+			rc = PTR_ERR(task);
+			CERROR("%s: cannot start thread: rc = %d\n",
+			       loghandle->lgh_ctxt->loc_obd->obd_name, rc);
+			OBD_FREE_PTR(lpi);
+			RETURN(rc);
+		}
+		wait_for_completion(&lpi->lpi_completion);
+	} else {
+		llog_process_thread(lpi);
+	}
+	rc = lpi->lpi_rc;
+	OBD_FREE_PTR(lpi);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_process_or_fork);
+
+int llog_process(const struct lu_env *env, struct llog_handle *loghandle,
+		 llog_cb_t cb, void *data, void *catdata)
+{	/* forked variant: the work runs in a kthread, this call waits for it */
+	return llog_process_or_fork(env, loghandle, cb, data, catdata, true);
+}
+EXPORT_SYMBOL(llog_process);
+
+inline int llog_get_size(struct llog_handle *loghandle)
+{
+	if (loghandle == NULL || loghandle->lgh_hdr == NULL)
+		return 0;	/* no handle or no header attached */
+	return loghandle->lgh_hdr->llh_count;	/* counts the header too */
+}
+EXPORT_SYMBOL(llog_get_size);
+
+int llog_reverse_process(const struct lu_env *env,
+			 struct llog_handle *loghandle, llog_cb_t cb,
+			 void *data, void *catdata)
+{
+	struct llog_log_hdr *llh = loghandle->lgh_hdr;
+	struct llog_process_cat_data *cd = catdata;
+	void *buf;
+	int rc = 0, first_index = 1, index, idx;
+	ENTRY;
+	/* visit set records from the highest index down, calling @cb on each */
+	OBD_ALLOC(buf, LLOG_CHUNK_SIZE);
+	if (!buf)
+		RETURN(-ENOMEM);
+	/* @cd, if given, bounds the range: (lpcd_first_idx, lpcd_last_idx] */
+	if (cd != NULL)
+		first_index = cd->lpcd_first_idx + 1;
+	if (cd != NULL && cd->lpcd_last_idx)
+		index = cd->lpcd_last_idx;
+	else
+		index = LLOG_BITMAP_BYTES * 8 - 1;
+
+	while (rc == 0) {
+		struct llog_rec_hdr *rec;
+		struct llog_rec_tail *tail;
+
+		/* skip records not set in bitmap */
+		while (index >= first_index &&
+		       !ext2_test_bit(index, llh->llh_bitmap))
+			--index;
+
+		LASSERT(index >= first_index - 1);
+		if (index == first_index - 1)
+			break;
+
+		/* get the buf with our target record; avoid old garbage */
+		memset(buf, 0, LLOG_CHUNK_SIZE);
+		rc = llog_prev_block(env, loghandle, index, buf,
+				     LLOG_CHUNK_SIZE);
+		if (rc)
+			GOTO(out, rc);
+		/* step through buf from its first record up to the one at @index */
+		rec = buf;
+		idx = rec->lrh_index;	/* NOTE(review): read before any swab check */
+		CDEBUG(D_RPCTRACE, "index %u : idx %u\n", index, idx);
+		while (idx < index) {
+			rec = (void *)rec + rec->lrh_len;
+			if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+				lustre_swab_llog_rec(rec);
+			idx ++;
+		}
+		LASSERT(idx == index);
+		tail = (void *)rec + rec->lrh_len - sizeof(*tail);
+
+		/* process records in buffer, starting where we found one */
+		while ((void *)tail > buf) {
+			if (tail->lrt_index == 0)
+				GOTO(out, rc = 0); /* no more records */
+
+			/* if set, process the callback on this record */
+			if (ext2_test_bit(index, llh->llh_bitmap)) {
+				rec = (void *)tail - tail->lrt_len +
+				      sizeof(*tail);
+
+				rc = cb(env, loghandle, rec, data);
+				if (rc == LLOG_PROC_BREAK) {
+					GOTO(out, rc);
+				} else if (rc == LLOG_DEL_RECORD) {
+					llog_cancel_rec(env, loghandle,
+							tail->lrt_index);
+					rc = 0;
+				}
+				if (rc)
+					GOTO(out, rc);
+			}
+
+			/* previous record, still in buffer? */
+			--index;
+			if (index < first_index)
+				GOTO(out, rc = 0);
+			tail = (void *)tail - tail->lrt_len;
+		}
+	}
+
+out:
+	if (buf)	/* always true here: allocation failure returned above */
+		OBD_FREE(buf, LLOG_CHUNK_SIZE);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_reverse_process);
+
+/**
+ * new llog API
+ *
+ * API functions:
+ *      llog_open - open llog, may not exist
+ *      llog_exist - check if llog exists
+ *      llog_close - close opened llog, pair for open, frees llog_handle
+ *      llog_declare_create - declare llog creation
+ *      llog_create - create new llog on disk, need transaction handle
+ *      llog_declare_write_rec - declaration of llog write
+ *      llog_write_rec - write llog record on disk, need transaction handle
+ *      llog_declare_add - declare llog catalog record addition
+ *      llog_add - add llog record in catalog, need transaction handle
+ */
+int llog_exist(struct llog_handle *loghandle)
+{
+	struct llog_operations	*ops;
+	int			 rc;
+
+	ENTRY;
+
+	/*
+	 * Hand back whatever the handle's lop_exist method reports; a
+	 * handle2ops failure or a missing method comes back as an error.
+	 */
+	rc = llog_handle2ops(loghandle, &ops);
+	if (rc == 0)
+		rc = ops->lop_exist ? ops->lop_exist(loghandle) : -EOPNOTSUPP;
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_exist);
+
+int llog_declare_create(const struct lu_env *env,
+			struct llog_handle *loghandle, struct thandle *th)
+{
+	struct llog_operations	*ops;
+	int			 had_cap, rc;
+
+	ENTRY;
+	rc = llog_handle2ops(loghandle, &ops);
+	if (rc != 0)
+		RETURN(rc);
+	if (!ops->lop_declare_create)
+		RETURN(-EOPNOTSUPP);
+
+	/* raise CFS_CAP_SYS_RESOURCE for the call unless it is already set */
+	had_cap = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+	if (!had_cap)
+		cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+	rc = ops->lop_declare_create(env, loghandle, th);
+	if (!had_cap)
+		cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_declare_create);
+
+int llog_create(const struct lu_env *env, struct llog_handle *handle,
+		struct thandle *th)
+{
+	struct llog_operations	*ops;
+	int			 had_cap, rc;
+
+	ENTRY;
+	rc = llog_handle2ops(handle, &ops);
+	if (rc != 0)
+		RETURN(rc);
+	if (!ops->lop_create)
+		RETURN(-EOPNOTSUPP);
+
+	/* raise CFS_CAP_SYS_RESOURCE for the call unless it is already set */
+	had_cap = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+	if (!had_cap)
+		cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+	rc = ops->lop_create(env, handle, th);
+	if (!had_cap)
+		cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_create);
+
+int llog_declare_write_rec(const struct lu_env *env,
+			   struct llog_handle *handle,
+			   struct llog_rec_hdr *rec, int idx,
+			   struct thandle *th)
+{
+	struct llog_operations	*ops;
+	int			 had_cap, rc;
+
+	ENTRY;
+	rc = llog_handle2ops(handle, &ops);
+	if (rc != 0)
+		RETURN(rc);
+	LASSERT(ops);
+	if (!ops->lop_declare_write_rec)
+		RETURN(-EOPNOTSUPP);
+
+	/* raise CFS_CAP_SYS_RESOURCE for the call unless it is already set */
+	had_cap = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+	if (!had_cap)
+		cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+	rc = ops->lop_declare_write_rec(env, handle, rec, idx, th);
+	if (!had_cap)
+		cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_declare_write_rec);
+
+int llog_write_rec(const struct lu_env *env, struct llog_handle *handle,
+		   struct llog_rec_hdr *rec, struct llog_cookie *logcookies,
+		   int numcookies, void *buf, int idx, struct thandle *th)
+{
+	struct llog_operations	*ops;
+	int			 had_cap, rc, buflen;
+
+	ENTRY;
+
+	rc = llog_handle2ops(handle, &ops);
+	if (rc != 0)
+		RETURN(rc);
+	LASSERT(ops);
+	if (!ops->lop_write_rec)
+		RETURN(-EOPNOTSUPP);
+
+	/* with a separate @buf the header and tail sizes are added on top of
+	 * lrh_len; the resulting length must already be size-rounded */
+	buflen = rec->lrh_len;
+	if (buf)
+		buflen += sizeof(struct llog_rec_hdr) +
+			  sizeof(struct llog_rec_tail);
+	LASSERT(cfs_size_round(buflen) == buflen);
+
+	had_cap = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+	if (!had_cap)
+		cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+	rc = ops->lop_write_rec(env, handle, rec, logcookies, numcookies,
+				buf, idx, th);
+	if (!had_cap)
+		cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_write_rec);
+
+int llog_add(const struct lu_env *env, struct llog_handle *lgh,
+	     struct llog_rec_hdr *rec, struct llog_cookie *logcookies,
+	     void *buf, struct thandle *th)
+{
+	int had_cap, rc;
+
+	ENTRY;
+	if (!lgh->lgh_logops->lop_add)
+		RETURN(-EOPNOTSUPP);
+
+	/* raise CFS_CAP_SYS_RESOURCE for the call unless it is already set */
+	had_cap = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+	if (!had_cap)
+		cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+	rc = lgh->lgh_logops->lop_add(env, lgh, rec, logcookies, buf, th);
+	if (!had_cap)
+		cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_add);
+
+int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh,
+		     struct llog_rec_hdr *rec, struct thandle *th)
+{
+	int had_cap, rc;
+
+	ENTRY;
+	if (!lgh->lgh_logops->lop_declare_add)
+		RETURN(-EOPNOTSUPP);
+
+	/* raise CFS_CAP_SYS_RESOURCE for the call unless it is already set */
+	had_cap = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+	if (!had_cap)
+		cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+	rc = lgh->lgh_logops->lop_declare_add(env, lgh, rec, th);
+	if (!had_cap)
+		cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_declare_add);
+
+/**
+ * Helper function to open an llog, creating it if it does not exist yet.
+ * It hides all transaction handling from the caller; on a create failure
+ * the handle is closed again and the error is returned. */
+int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt,
+		     struct llog_handle **res, struct llog_logid *logid,
+		     char *name)
+{
+	struct thandle	*th;
+	int		 rc;
+
+	ENTRY;
+
+	rc = llog_open(env, ctxt, res, logid, name, LLOG_OPEN_NEW);
+	if (rc)
+		RETURN(rc);
+	/* NOTE(review): a negative llog_exist() result also returns 0 here */
+	if (llog_exist(*res))
+		RETURN(0);
+	/* not there yet: create it, inside a transaction if lgh_obj is set */
+	if ((*res)->lgh_obj != NULL) {
+		struct dt_device *d;
+
+		d = lu2dt_dev((*res)->lgh_obj->do_lu.lo_dev);
+		/* declare, start, create; the transaction is always stopped */
+		th = dt_trans_create(env, d);
+		if (IS_ERR(th))
+			GOTO(out, rc = PTR_ERR(th));
+
+		rc = llog_declare_create(env, *res, th);
+		if (rc == 0) {
+			rc = dt_trans_start_local(env, d, th);
+			if (rc == 0)
+				rc = llog_create(env, *res, th);
+		}
+		dt_trans_stop(env, d, th);
+	} else {
+		/* lvfs compat code */
+		LASSERT((*res)->lgh_file == NULL);
+		rc = llog_create(env, *res, NULL);
+	}
+out:
+	if (rc)
+		llog_close(env, *res);	/* NOTE(review): *res is not reset */
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_open_create);
+
+/**
+ * Helper function to delete existent llog: open it, initialise the handle
+ * as a plain log, destroy it and close the handle again.
+ */
+int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt,
+	       struct llog_logid *logid, char *name)
+{
+	struct llog_handle	*lgh;
+	int			 rc, close_rc;
+
+	ENTRY;
+
+	/* nothing to erase */
+	if (logid == NULL && name == NULL)
+		RETURN(0);
+
+	rc = llog_open(env, ctxt, &lgh, logid, name, LLOG_OPEN_EXISTS);
+	if (rc < 0)
+		RETURN(rc);
+
+	rc = llog_init_handle(env, lgh, LLOG_F_IS_PLAIN, NULL);
+	if (rc == 0)
+		rc = llog_destroy(env, lgh);
+
+	/* always close; a close error counts only if nothing failed before */
+	close_rc = llog_close(env, lgh);
+	RETURN(rc ? rc : close_rc);
+}
+EXPORT_SYMBOL(llog_erase);
+
+/*
+ * Helper function to write a record into an llog.
+ * It hides all transaction handling from caller.
+ * Valid only with local llog.
+ */
+int llog_write(const struct lu_env *env, struct llog_handle *loghandle,
+	       struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+	       int cookiecount, void *buf, int idx)
+{
+	int rc;
+
+	ENTRY;
+
+	LASSERT(loghandle);
+	LASSERT(loghandle->lgh_ctxt);
+	/* with lgh_obj set the write is wrapped in its own dt transaction */
+	if (loghandle->lgh_obj != NULL) {
+		struct dt_device	*dt;
+		struct thandle		*th;
+
+		dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev);
+
+		th = dt_trans_create(env, dt);
+		if (IS_ERR(th))
+			RETURN(PTR_ERR(th));
+
+		rc = llog_declare_write_rec(env, loghandle, rec, idx, th);
+		if (rc)
+			GOTO(out_trans, rc);
+
+		rc = dt_trans_start_local(env, dt, th);
+		if (rc)
+			GOTO(out_trans, rc);
+		/* lgh_lock is write-held only around the record write itself */
+		down_write(&loghandle->lgh_lock);
+		rc = llog_write_rec(env, loghandle, rec, reccookie,
+				    cookiecount, buf, idx, th);
+		up_write(&loghandle->lgh_lock);
+out_trans:	/* reached on success and on failure alike */
+		dt_trans_stop(env, dt, th);
+	} else { /* lvfs compatibility */
+		down_write(&loghandle->lgh_lock);
+		rc = llog_write_rec(env, loghandle, rec, reccookie,
+				    cookiecount, buf, idx, NULL);
+		up_write(&loghandle->lgh_lock);
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_write);
+
+/* Allocate a handle bound to @ctxt and open the log through lop_open.
+ * *lgh is the new handle on success and NULL on every failure path. */
+int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt,
+	      struct llog_handle **lgh, struct llog_logid *logid,
+	      char *name, enum llog_open_param open_param)
+{
+	int	 raised, rc;
+
+	ENTRY;
+	LASSERT(ctxt);
+	LASSERT(ctxt->loc_logops);
+	if (ctxt->loc_logops->lop_open == NULL) {
+		*lgh = NULL;
+		RETURN(-EOPNOTSUPP);
+	}
+	*lgh = llog_alloc_handle();
+	if (IS_ERR_OR_NULL(*lgh)) {	/* failure comes back as an ERR_PTR */
+		*lgh = NULL;
+		RETURN(-ENOMEM);
+	}
+	(*lgh)->lgh_ctxt = ctxt;
+	(*lgh)->lgh_logops = ctxt->loc_logops;
+
+	raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+	if (!raised)
+		cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+	rc = ctxt->loc_logops->lop_open(env, *lgh, logid, name, open_param);
+	if (!raised)
+		cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+	if (rc) {
+		llog_free_handle(*lgh);
+		*lgh = NULL;
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_open);
+
+int llog_close(const struct lu_env *env, struct llog_handle *loghandle)
+{
+	struct llog_operations	*ops;
+	int			 rc;
+
+	ENTRY;
+	rc = llog_handle2ops(loghandle, &ops);
+	if (rc == 0) {
+		if (ops->lop_close != NULL)
+			rc = ops->lop_close(env, loghandle);
+		else
+			rc = -EOPNOTSUPP;
+	}
+	/* the reference is dropped even when the close op failed or is missing */
+	llog_handle_put(loghandle);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_close);
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_cat.c b/drivers/staging/lustre/lustre/obdclass/llog_cat.c
new file mode 100644
index 000000000000..cf00b2f550ac
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_cat.c
@@ -0,0 +1,833 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_cat.c
+ *
+ * OST<->MDS recovery logging infrastructure.
+ *
+ * Invariants in implementation:
+ * - we do not share logs among different OST<->MDS connections, so that
+ *   if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ * Author: Alexey Zhuravlev <alexey.zhuravlev@intel.com>
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include <obd_class.h>
+
+#include "llog_internal.h"
+
+/* Create the plain llog behind loghandle and register it in the catalog:
+ * reserve the next slot in the catalog header bitmap and write a logid record
+ * for the new llog into that slot.  An already existing llog (-EEXIST from
+ * llog_create()) is left untouched and 0 is returned.
+ *
+ * Assumes caller has already pushed us into the kernel context and is locking.
+ */
+static int llog_cat_new_log(const struct lu_env *env,
+			    struct llog_handle *cathandle,
+			    struct llog_handle *loghandle,
+			    struct thandle *th)
+{
+	struct llog_log_hdr *llh;
+	struct llog_logid_rec rec = { { 0 }, };
+	int rc, index, bitmap_size;
+	ENTRY;
+
+	llh = cathandle->lgh_hdr;
+	bitmap_size = LLOG_BITMAP_SIZE(llh);
+
+	index = (cathandle->lgh_last_idx + 1) % bitmap_size;
+
+	/* maximum number of available slots in catlog is bitmap_size - 2 */
+	if (llh->llh_cat_idx == index) {
+		CERROR("no free catalog slots for log...\n");
+		RETURN(-ENOSPC);
+	}
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_MDS_LLOG_CREATE_FAILED))
+		RETURN(-ENOSPC);
+
+	rc = llog_create(env, loghandle, th);
+	/* if llog is already created, no need to initialize it */
+	if (rc == -EEXIST) {
+		RETURN(0);
+	} else if (rc != 0) {
+		CERROR("%s: can't create new plain llog in catalog: rc = %d\n",
+		       loghandle->lgh_ctxt->loc_obd->obd_name, rc);
+		RETURN(rc);
+	}
+
+	rc = llog_init_handle(env, loghandle,
+			      LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY,
+			      &cathandle->lgh_hdr->llh_tgtuuid);
+	if (rc)
+		GOTO(out_destroy, rc);
+
+	if (index == 0)
+		index = 1;	/* slot 0 is never handed out */
+	/* llh is the catalog header: take the catalog's header lock */
+	spin_lock(&cathandle->lgh_hdr_lock);
+	llh->llh_count++;
+	if (ext2_set_bit(index, llh->llh_bitmap)) {
+		CERROR("argh, index %u already set in log bitmap?\n", index);
+		spin_unlock(&cathandle->lgh_hdr_lock);
+		LBUG(); /* should never happen */
+	}
+	spin_unlock(&cathandle->lgh_hdr_lock);
+
+	cathandle->lgh_last_idx = index;
+	llh->llh_tail.lrt_index = index;
+
+	CDEBUG(D_RPCTRACE, "new recovery log "DOSTID":%x for index %u of "
+	       "catalog "DOSTID"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+	       loghandle->lgh_id.lgl_ogen, index,
+	       POSTID(&cathandle->lgh_id.lgl_oi));
+	/* build the record for this log in the catalog */
+	rec.lid_hdr.lrh_len = sizeof(rec);
+	rec.lid_hdr.lrh_index = index;
+	rec.lid_hdr.lrh_type = LLOG_LOGID_MAGIC;
+	rec.lid_id = loghandle->lgh_id;
+	rec.lid_tail.lrt_len = sizeof(rec);
+	rec.lid_tail.lrt_index = index;
+
+	/* update the catalog: header and record */
+	rc = llog_write_rec(env, cathandle, &rec.lid_hdr,
+			    &loghandle->u.phd.phd_cookie, 1, NULL, index, th);
+	if (rc < 0)
+		GOTO(out_destroy, rc);
+
+	loghandle->lgh_hdr->llh_cat_idx = index;
+	RETURN(0);
+out_destroy:
+	llog_destroy(env, loghandle);
+	RETURN(rc);
+}
+
+/* Find or open the plain llog logid that belongs to catalog cathandle.
+ * A handle already on the catalog's open list is reused; otherwise the
+ * existing log is opened, initialized and added to that list.
+ *
+ * Assumes caller has already pushed us into the kernel context and is locking.
+ *
+ * On success (0) *res carries an extra reference from llog_handle_get()
+ * which the caller must drop using llog_handle_put(); else returns -errno.
+ */
+int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle,
+		       struct llog_handle **res, struct llog_logid *logid)
+{
+	struct llog_handle	*loghandle;
+	int			 rc = 0;
+
+	ENTRY;
+
+	if (cathandle == NULL)
+		RETURN(-EBADF);
+
+	down_write(&cathandle->lgh_lock);
+	list_for_each_entry(loghandle, &cathandle->u.chd.chd_head,
+				u.phd.phd_entry) {
+		struct llog_logid *cgl = &loghandle->lgh_id;
+
+		if (ostid_id(&cgl->lgl_oi) == ostid_id(&logid->lgl_oi) &&
+		    ostid_seq(&cgl->lgl_oi) == ostid_seq(&logid->lgl_oi)) {
+			if (cgl->lgl_ogen != logid->lgl_ogen) {
+				CERROR("%s: log "DOSTID" generation %x != %x\n",
+				       loghandle->lgh_ctxt->loc_obd->obd_name,
+				       POSTID(&logid->lgl_oi), cgl->lgl_ogen,
+				       logid->lgl_ogen);
+				continue;
+			}
+			loghandle->u.phd.phd_cat_handle = cathandle;
+			llog_handle_get(loghandle); /* pin under list lock */
+			up_write(&cathandle->lgh_lock);
+			GOTO(out, rc = 0);
+		}
+	}
+	up_write(&cathandle->lgh_lock);
+
+	rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, logid, NULL,
+		       LLOG_OPEN_EXISTS);
+	if (rc < 0) {
+		CERROR("%s: error opening log id "DOSTID":%x: rc = %d\n",
+		       cathandle->lgh_ctxt->loc_obd->obd_name,
+		       POSTID(&logid->lgl_oi), logid->lgl_ogen, rc);
+		RETURN(rc);
+	}
+
+	rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, NULL);
+	if (rc < 0) {
+		llog_close(env, loghandle);
+		RETURN(rc);
+	}
+
+	loghandle->u.phd.phd_cat_handle = cathandle;
+	loghandle->u.phd.phd_cookie.lgc_lgl = cathandle->lgh_id;
+	loghandle->u.phd.phd_cookie.lgc_index =
+				loghandle->lgh_hdr->llh_cat_idx;
+	/* set up and pin the handle before it becomes visible on the list */
+	llog_handle_get(loghandle);
+	down_write(&cathandle->lgh_lock);
+	list_add(&loghandle->u.phd.phd_entry, &cathandle->u.chd.chd_head);
+	up_write(&cathandle->lgh_lock);
+	EXIT;
+out:
+	*res = loghandle;
+	return 0;
+}
+
+int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle)
+{
+	struct llog_handle	*loghandle, *n;
+	int			 rc;
+
+	ENTRY;
+	/* Close every plain llog on the open list, then the catalog itself. */
+	list_for_each_entry_safe(loghandle, n, &cathandle->u.chd.chd_head,
+				     u.phd.phd_entry) {
+		struct llog_log_hdr	*llh = loghandle->lgh_hdr;
+		int			 index;
+
+		/* unlink open-not-created llogs */
+		list_del_init(&loghandle->u.phd.phd_entry);
+		/* destroy zap-when-empty llogs whose llh_count is down to 1 */
+		if (loghandle->lgh_obj != NULL && llh != NULL &&
+		    (llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) &&
+		    (llh->llh_count == 1)) {
+			rc = llog_destroy(env, loghandle);
+			if (rc)
+				CERROR("%s: failure destroying log during "
+				       "cleanup: rc = %d\n",
+				       loghandle->lgh_ctxt->loc_obd->obd_name,
+				       rc);
+
+			index = loghandle->u.phd.phd_cookie.lgc_index;
+			llog_cat_cleanup(env, cathandle, NULL, index);
+		}
+		llog_close(env, loghandle);
+	}
+	/* if handle was stored in ctxt, remove it too */
+	if (cathandle->lgh_ctxt->loc_handle == cathandle)
+		cathandle->lgh_ctxt->loc_handle = NULL;
+	rc = llog_close(env, cathandle);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_close);
+
+/**
+ * lockdep markers for nested struct llog_handle::lgh_lock locking.
+ */
+enum {
+	LLOGH_CAT,	/* subclass passed when locking a catalog handle */
+	LLOGH_LOG	/* subclass passed when locking a plain log handle */
+};
+
+/** Return the currently active log handle.  If the current log's last index
+ * has reached the end of its bitmap, switch to the catalog's next log
+ * (chd_next_log), which must already be set, and return that one instead.
+ *
+ * The th argument is not used here.
+ *
+ * Assumes caller has already pushed us into the kernel context and is locking.
+ *
+ * NOTE: loghandle is write-locked upon successful return
+ */
+static struct llog_handle *llog_cat_current_log(struct llog_handle *cathandle,
+						struct thandle *th)
+{
+	struct llog_handle *loghandle = NULL;
+	ENTRY;
+
+	down_read_nested(&cathandle->lgh_lock, LLOGH_CAT);
+	loghandle = cathandle->u.chd.chd_current_log;
+	if (loghandle) {
+		struct llog_log_hdr *llh;
+
+		down_write_nested(&loghandle->lgh_lock, LLOGH_LOG);
+		llh = loghandle->lgh_hdr;
+		if (llh == NULL ||
+		    loghandle->lgh_last_idx < LLOG_BITMAP_SIZE(llh) - 1) {
+			up_read(&cathandle->lgh_lock);
+			RETURN(loghandle);
+		} else {
+			up_write(&loghandle->lgh_lock);
+		}
+	}
+	up_read(&cathandle->lgh_lock);
+
+	/* time to use next log */
+
+	/* first, we have to make sure the state hasn't changed */
+	down_write_nested(&cathandle->lgh_lock, LLOGH_CAT);
+	loghandle = cathandle->u.chd.chd_current_log;
+	if (loghandle) {
+		struct llog_log_hdr *llh;
+
+		down_write_nested(&loghandle->lgh_lock, LLOGH_LOG);
+		llh = loghandle->lgh_hdr;
+		LASSERT(llh);
+		if (loghandle->lgh_last_idx < LLOG_BITMAP_SIZE(llh) - 1) {
+			up_write(&cathandle->lgh_lock);
+			RETURN(loghandle);
+		} else {
+			up_write(&loghandle->lgh_lock);
+		}
+	}
+
+	CDEBUG(D_INODE, "use next log\n");
+
+	loghandle = cathandle->u.chd.chd_next_log;
+	cathandle->u.chd.chd_current_log = loghandle;
+	cathandle->u.chd.chd_next_log = NULL;
+	LASSERT(loghandle);	/* check before the handle is dereferenced */
+	down_write_nested(&loghandle->lgh_lock, LLOGH_LOG);
+	up_write(&cathandle->lgh_lock);
+	RETURN(loghandle);
+}
+
+/* Add a single record to the current plain llog of a catalog; on -ENOSPC the
+ * write is retried once on the log that llog_cat_current_log() returns next.
+ * Returns as llog_write_rec().
+ * Assumes caller has already pushed us into the kernel context.
+ */
+int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle,
+		     struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+		     void *buf, struct thandle *th)
+{
+	struct llog_handle *loghandle;
+	int rc;
+	ENTRY;
+
+	LASSERT(rec->lrh_len <= LLOG_CHUNK_SIZE);
+	loghandle = llog_cat_current_log(cathandle, th);
+	LASSERT(!IS_ERR(loghandle));
+
+	/* loghandle is already locked by llog_cat_current_log() for us */
+	if (!llog_exist(loghandle)) {
+		rc = llog_cat_new_log(env, cathandle, loghandle, th);
+		if (rc < 0) {
+			up_write(&loghandle->lgh_lock);
+			RETURN(rc);
+		}
+	}
+	/* now let's try to add the record */
+	rc = llog_write_rec(env, loghandle, rec, reccookie, 1, buf, -1, th);
+	if (rc < 0)
+		CDEBUG_LIMIT(rc == -ENOSPC ? D_HA : D_ERROR,
+			     "llog_write_rec %d: lh=%p\n", rc, loghandle);
+	up_write(&loghandle->lgh_lock);
+	if (rc == -ENOSPC) {
+		/* try to use next log */
+		loghandle = llog_cat_current_log(cathandle, th);
+		LASSERT(!IS_ERR(loghandle));
+		/* new llog can be created concurrently */
+		if (!llog_exist(loghandle)) {
+			rc = llog_cat_new_log(env, cathandle, loghandle, th);
+			if (rc < 0) {
+				up_write(&loghandle->lgh_lock);
+				RETURN(rc);
+			}
+		}
+		/* now let's try to add the record */
+		rc = llog_write_rec(env, loghandle, rec, reccookie, 1, buf,
+				    -1, th);
+		if (rc < 0)
+			CERROR("llog_write_rec %d: lh=%p\n", rc, loghandle);
+		up_write(&loghandle->lgh_lock);
+	}
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_add_rec);
+
+int llog_cat_declare_add_rec(const struct lu_env *env,
+			     struct llog_handle *cathandle,
+			     struct llog_rec_hdr *rec, struct thandle *th)
+{
+	struct llog_handle	*loghandle, *next, **slot = NULL;
+	int			 rc = 0;
+
+	ENTRY;
+
+	/* Declare in th everything llog_cat_add_rec() may write: creation of
+	 * plain llogs that do not exist yet, their catalog records and the new
+	 * record itself in both the current and the next plain llog.
+	 */
+	if (cathandle->u.chd.chd_current_log == NULL)
+		slot = &cathandle->u.chd.chd_current_log;
+	else if (cathandle->u.chd.chd_next_log == NULL)
+		slot = &cathandle->u.chd.chd_next_log;
+	if (slot != NULL) {
+		down_write(&cathandle->lgh_lock);
+		/* recheck under the lock: a concurrent caller may have won */
+		if (*slot == NULL) {
+			rc = llog_open(env, cathandle->lgh_ctxt, &loghandle,
+				       NULL, NULL, LLOG_OPEN_NEW);
+			if (rc == 0) {
+				*slot = loghandle;
+				list_add_tail(&loghandle->u.phd.phd_entry,
+						  &cathandle->u.chd.chd_head);
+			}
+		}
+		up_write(&cathandle->lgh_lock);
+	}
+	if (rc)
+		GOTO(out, rc);
+
+	if (!llog_exist(cathandle->u.chd.chd_current_log)) {
+		rc = llog_declare_create(env, cathandle->u.chd.chd_current_log,
+					 th);
+		if (rc)
+			GOTO(out, rc);
+		rc = llog_declare_write_rec(env, cathandle, NULL, -1, th);
+		if (rc)
+			GOTO(out, rc);
+	}
+	/* declare records in the llogs */
+	rc = llog_declare_write_rec(env, cathandle->u.chd.chd_current_log,
+				    rec, -1, th);
+	if (rc)
+		GOTO(out, rc);
+
+	next = cathandle->u.chd.chd_next_log;
+	if (next) {
+		if (!llog_exist(next)) {
+			rc = llog_declare_create(env, next, th);
+			if (rc == 0)
+				rc = llog_declare_write_rec(env, cathandle,
+							    NULL, -1, th);
+		}
+		if (rc == 0)
+			rc = llog_declare_write_rec(env, next, rec, -1, th);
+	}
+out:
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_declare_add_rec);
+
+int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle,
+		 struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+		 void *buf)
+{
+	struct llog_ctxt	*ctxt;
+	struct dt_device	*dt;
+	struct thandle		*th = NULL;
+	int			 rc;
+
+	ctxt = cathandle->lgh_ctxt;
+	LASSERT(ctxt);
+	LASSERT(ctxt->loc_exp);
+	/* lgh_obj set: declare and add rec inside a local dt transaction */
+	if (cathandle->lgh_obj != NULL) {
+		dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt;
+		LASSERT(dt);
+
+		th = dt_trans_create(env, dt);
+		if (IS_ERR(th))
+			RETURN(PTR_ERR(th));
+
+		rc = llog_cat_declare_add_rec(env, cathandle, rec, th);
+		if (rc)
+			GOTO(out_trans, rc);
+
+		rc = dt_trans_start_local(env, dt, th);
+		if (rc)
+			GOTO(out_trans, rc);
+		rc = llog_cat_add_rec(env, cathandle, rec, reccookie, buf, th);
+out_trans:
+		dt_trans_stop(env, dt, th);	/* also after a failure above */
+	} else { /* lvfs compat code */
+		LASSERT(cathandle->lgh_file != NULL);
+		rc = llog_cat_declare_add_rec(env, cathandle, rec, th);
+		if (rc == 0)
+			rc = llog_cat_add_rec(env, cathandle, rec, reccookie,
+					      buf, th);
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_add);
+
+/* For each cookie in the cookie array, we clear the log in-use bit and either:
+ * - the log is empty, so mark it free in the catalog header and delete it
+ * - the log is not empty, just write out the log header
+ * The cookies may be in different log files, so each one is looked up anew.
+ * Every cookie is tried even after a failure; returns 0 or the last error
+ * seen (-ENOENT never replaces another error).  Assumes caller has already
+ * pushed us into the kernel context.
+ */
+int llog_cat_cancel_records(const struct lu_env *env,
+			    struct llog_handle *cathandle, int count,
+			    struct llog_cookie *cookies)
+{
+	int i, rc = 0, failed = 0;
+
+	ENTRY;
+
+	for (i = 0; i < count; i++, cookies++) {
+		struct llog_handle	*loghandle;
+		struct llog_logid	*lgl = &cookies->lgc_lgl;
+		int			 lrc;
+
+		lrc = llog_cat_id2handle(env, cathandle, &loghandle, lgl);
+		if (lrc) {
+			CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
+			       cathandle->lgh_ctxt->loc_obd->obd_name,
+			       POSTID(&lgl->lgl_oi), lrc);
+			failed++;
+			rc = lrc;
+			continue;
+		}
+
+		lrc = llog_cancel_rec(env, loghandle, cookies->lgc_index);
+		if (lrc == 1) {	  /* log has been destroyed */
+			lrc = llog_cat_cleanup(env, cathandle, loghandle,
+					loghandle->u.phd.phd_cookie.lgc_index);
+			if (lrc != 0)
+				rc = lrc;
+		} else if (lrc == -ENOENT) {
+			if (rc == 0) /* ENOENT shouldn't rewrite any error */
+				rc = lrc;
+		} else if (lrc < 0) {
+			failed++;
+			rc = lrc;
+		}
+		llog_handle_put(loghandle);
+	}
+	if (rc)
+		CERROR("%s: fail to cancel %d of %d llog-records: rc = %d\n",
+		       cathandle->lgh_ctxt->loc_obd->obd_name, failed, count,
+		       rc);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_cancel_records);
+
+int llog_cat_process_cb(const struct lu_env *env, struct llog_handle *cat_llh,
+			struct llog_rec_hdr *rec, void *data)
+{
+	struct llog_process_data *d = data;
+	struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
+	struct llog_handle *llh;
+	int rc;
+
+	ENTRY;
+	/* Catalog callback: run d->lpd_cb over the plain llog named by rec. */
+	if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+		CERROR("invalid record in catalog\n");
+		RETURN(-EINVAL);
+	}
+	CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog "
+	       DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen,
+	       rec->lrh_index, POSTID(&cat_llh->lgh_id.lgl_oi));
+
+	rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id);
+	if (rc) {
+		CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
+		       cat_llh->lgh_ctxt->loc_obd->obd_name,
+		       POSTID(&lir->lid_id.lgl_oi), rc);
+		RETURN(rc);
+	}
+
+	if (rec->lrh_index < d->lpd_startcat) {
+		/* Skip processing of the logs until startcat */
+		rc = 0;
+	} else if (d->lpd_startidx > 0) {
+		struct llog_process_cat_data cd;
+
+		cd.lpcd_first_idx = d->lpd_startidx;
+		cd.lpcd_last_idx = 0;
+		rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data,
+					  &cd, false);
+		/* Continue processing the next log from idx 0 */
+		d->lpd_startidx = 0;
+	} else {
+		rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data,
+					  NULL, false);
+	}
+	/* drop the reference llog_cat_id2handle() took, on every path */
+	llog_handle_put(llh);
+	RETURN(rc);
+}
+
+int llog_cat_process_or_fork(const struct lu_env *env,
+			     struct llog_handle *cat_llh,
+			     llog_cb_t cb, void *data, int startcat,
+			     int startidx, bool fork)
+{
+	struct llog_process_data d;
+	struct llog_log_hdr *llh = cat_llh->lgh_hdr;
+	int rc;
+	ENTRY;
+	/* Run cb on the records of every plain llog listed in the catalog. */
+	LASSERT(llh->llh_flags & LLOG_F_IS_CAT);
+	d.lpd_data = data;
+	d.lpd_cb = cb;
+	d.lpd_startcat = startcat;
+	d.lpd_startidx = startidx;
+	/* first index beyond last index: the used range wraps past zero */
+	if (llh->llh_cat_idx > cat_llh->lgh_last_idx) {
+		struct llog_process_cat_data cd;
+
+		CWARN("catlog "DOSTID" crosses index zero\n",
+		      POSTID(&cat_llh->lgh_id.lgl_oi));
+		/* pass 1: from llh_cat_idx, presumably to the end of the log */
+		cd.lpcd_first_idx = llh->llh_cat_idx;
+		cd.lpcd_last_idx = 0;
+		rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb,
+					  &d, &cd, fork);
+		if (rc != 0)
+			RETURN(rc);
+		/* pass 2: from index 0 up to lgh_last_idx */
+		cd.lpcd_first_idx = 0;
+		cd.lpcd_last_idx = cat_llh->lgh_last_idx;
+		rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb,
+					  &d, &cd, fork);
+	} else {
+		rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb,
+					  &d, NULL, fork);
+	}
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_process_or_fork);
+
+int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh,
+		     llog_cb_t cb, void *data, int startcat, int startidx)
+{
+	return llog_cat_process_or_fork(env, cat_llh, cb, data, startcat,
+					startidx, false);	/* fork = false */
+}
+EXPORT_SYMBOL(llog_cat_process);
+
+static int llog_cat_reverse_process_cb(const struct lu_env *env,
+				       struct llog_handle *cat_llh,
+				       struct llog_rec_hdr *rec, void *data)
+{
+	struct llog_process_data *d = data;
+	struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
+	struct llog_handle *llh;
+	int rc;
+	/* Catalog callback: reverse-process the plain llog named by rec. */
+	if (le32_to_cpu(rec->lrh_type) != LLOG_LOGID_MAGIC) {
+		CERROR("invalid record in catalog\n");
+		RETURN(-EINVAL);
+	}
+	CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog "
+	       DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen,
+	       le32_to_cpu(rec->lrh_index), POSTID(&cat_llh->lgh_id.lgl_oi));
+	/* NOTE(review): llog_cat_process_cb uses no le32_to_cpu(); confirm */
+	rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id);
+	if (rc) {
+		CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
+		       cat_llh->lgh_ctxt->loc_obd->obd_name,
+		       POSTID(&lir->lid_id.lgl_oi), rc);
+		RETURN(rc);
+	}
+
+	rc = llog_reverse_process(env, llh, d->lpd_cb, d->lpd_data, NULL);
+	llog_handle_put(llh);	/* pairs with llog_cat_id2handle() */
+	RETURN(rc);
+}
+
+int llog_cat_reverse_process(const struct lu_env *env,
+			     struct llog_handle *cat_llh,
+			     llog_cb_t cb, void *data)
+{
+	struct llog_process_data d;
+	struct llog_process_cat_data cd;
+	struct llog_log_hdr *llh = cat_llh->lgh_hdr;
+	int rc;
+	ENTRY;
+	/* Reverse-process every plain llog listed in the catalog with cb. */
+	LASSERT(llh->llh_flags & LLOG_F_IS_CAT);
+	d.lpd_data = data;
+	d.lpd_cb = cb;
+	/* startcat/startidx stay unset: the reverse cb never reads them */
+	if (llh->llh_cat_idx > cat_llh->lgh_last_idx) {
+		CWARN("catalog "DOSTID" crosses index zero\n",
+		      POSTID(&cat_llh->lgh_id.lgl_oi));
+		/* wrapped catalog: pass 1 covers index 0 up to lgh_last_idx */
+		cd.lpcd_first_idx = 0;
+		cd.lpcd_last_idx = cat_llh->lgh_last_idx;
+		rc = llog_reverse_process(env, cat_llh,
+					  llog_cat_reverse_process_cb,
+					  &d, &cd);
+		if (rc != 0)
+			RETURN(rc);
+		/* pass 2: from llh_cat_idx; NOTE(review): lone le32_to_cpu() */
+		cd.lpcd_first_idx = le32_to_cpu(llh->llh_cat_idx);
+		cd.lpcd_last_idx = 0;
+		rc = llog_reverse_process(env, cat_llh,
+					  llog_cat_reverse_process_cb,
+					  &d, &cd);
+	} else {
+		rc = llog_reverse_process(env, cat_llh,
+					  llog_cat_reverse_process_cb,
+					  &d, NULL);
+	}
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_reverse_process);
+
+int llog_cat_set_first_idx(struct llog_handle *cathandle, int index)
+{
+	struct llog_log_hdr *llh = cathandle->lgh_hdr;
+	int i, bitmap_size, idx;
+	ENTRY;
+	/* Advance llh_cat_idx when index is the slot right after it. */
+	bitmap_size = LLOG_BITMAP_SIZE(llh);
+	if (llh->llh_cat_idx == (index - 1)) {
+		idx = llh->llh_cat_idx + 1;
+		llh->llh_cat_idx = idx;
+		if (idx == cathandle->lgh_last_idx)
+			goto out;
+		for (i = (index + 1) % bitmap_size;
+		     i != cathandle->lgh_last_idx;
+		     i = (i + 1) % bitmap_size) {
+			if (!ext2_test_bit(i, llh->llh_bitmap)) {
+				idx = llh->llh_cat_idx + 1;
+				llh->llh_cat_idx = idx;
+			} else if (i == 0) {
+				llh->llh_cat_idx = 0;	/* wrapped around */
+			} else {
+				break;	/* bit still set: stop here */
+			}
+		}
+out:
+		CDEBUG(D_RPCTRACE, "set catlog "DOSTID" first idx %u\n",
+		       POSTID(&cathandle->lgh_id.lgl_oi), llh->llh_cat_idx);
+	}
+
+	RETURN(0);	/* never fails */
+}
+
+/* Cleanup deleted plain llog traces from catalog; loghandle may be NULL */
+int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle,
+		     struct llog_handle *loghandle, int index)
+{
+	int rc;
+
+	LASSERT(index);	/* index must be non-zero */
+	if (loghandle != NULL) {
+		/* remove destroyed llog from catalog list and
+		 * chd_current_log variable */
+		down_write(&cathandle->lgh_lock);
+		if (cathandle->u.chd.chd_current_log == loghandle)
+			cathandle->u.chd.chd_current_log = NULL;
+		list_del_init(&loghandle->u.phd.phd_entry);
+		up_write(&cathandle->lgh_lock);
+		LASSERT(index == loghandle->u.phd.phd_cookie.lgc_index);
+		/* llog was opened and kept in a list, close it now */
+		llog_close(env, loghandle);
+	}
+	/* remove plain llog entry from catalog by index */
+	llog_cat_set_first_idx(cathandle, index);
+	rc = llog_cancel_rec(env, cathandle, index);
+	if (rc == 0)
+		CDEBUG(D_HA, "cancel plain log at index"
+		       " %u of catalog "DOSTID"\n",
+		       index, POSTID(&cathandle->lgh_id.lgl_oi));
+	return rc;
+}
+
+int cat_cancel_cb(const struct lu_env *env, struct llog_handle *cathandle,
+		  struct llog_rec_hdr *rec, void *data)
+{
+	struct llog_logid_rec	*lir = (struct llog_logid_rec *)rec;
+	struct llog_handle	*loghandle;
+	struct llog_log_hdr	*llh;
+	int			 rc;
+
+	ENTRY;
+	/* Catalog callback: destroy the plain llog in rec once it is empty. */
+	if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+		CERROR("invalid record in catalog\n");
+		RETURN(-EINVAL);
+	}
+
+	CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog "
+	       DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen,
+	       rec->lrh_index, POSTID(&cathandle->lgh_id.lgl_oi));
+
+	rc = llog_cat_id2handle(env, cathandle, &loghandle, &lir->lid_id);
+	if (rc) {
+		CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
+		       cathandle->lgh_ctxt->loc_obd->obd_name,
+		       POSTID(&lir->lid_id.lgl_oi), rc);
+		if (rc == -ENOENT || rc == -ESTALE) {
+			/* remove index from catalog */
+			llog_cat_cleanup(env, cathandle, NULL, rec->lrh_index);
+		}
+		RETURN(rc);
+	}
+
+	llh = loghandle->lgh_hdr;
+	if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) &&
+	    (llh->llh_count == 1)) {
+		rc = llog_destroy(env, loghandle);
+		if (rc)
+			CERROR("%s: fail to destroy empty log: rc = %d\n",
+			       loghandle->lgh_ctxt->loc_obd->obd_name, rc);
+		/* the catalog slot is released even if the destroy failed */
+		llog_cat_cleanup(env, cathandle, loghandle,
+				 loghandle->u.phd.phd_cookie.lgc_index);
+	}
+	llog_handle_put(loghandle);	/* pairs with llog_cat_id2handle() */
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cat_cancel_cb);
+
+/* Initialize llh as a catalog and run cat_cancel_cb over its records */
+int llog_cat_init_and_process(const struct lu_env *env,
+			      struct llog_handle *llh)
+{
+	int rc;
+
+	rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, NULL);
+	if (rc)
+		RETURN(rc);
+
+	rc = llog_process_or_fork(env, llh, cat_cancel_cb, NULL, NULL, false);
+	if (rc)
+		CERROR("%s: llog_process() with cat_cancel_cb failed: rc = "
+		       "%d\n", llh->lgh_ctxt->loc_obd->obd_name, rc);
+	RETURN(0);	/* a processing failure is only logged */
+}
+EXPORT_SYMBOL(llog_cat_init_and_process);
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_internal.h b/drivers/staging/lustre/lustre/obdclass/llog_internal.h
new file mode 100644
index 000000000000..539e1d4f9d4c
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_internal.h
@@ -0,0 +1,98 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LLOG_INTERNAL_H__
+#define __LLOG_INTERNAL_H__
+
+#include <lustre_log.h>
+
+struct llog_process_info {
+	struct llog_handle *lpi_loghandle;
+	llog_cb_t	   lpi_cb;
+	void	       *lpi_cbdata;
+	void	       *lpi_catdata;
+	int		 lpi_rc;
+	struct completion	lpi_completion;
+	const struct lu_env	*lpi_env;
+	/* NOTE(review): fields look like llog_process_or_fork() arguments */
+};
+
+struct llog_thread_info {	/* looked up per lu_env by llog_info() */
+	struct lu_attr			 lgi_attr;
+	struct lu_fid			 lgi_fid;
+	struct dt_object_format		 lgi_dof;
+	struct lu_buf			 lgi_buf;
+	loff_t				 lgi_off;
+	struct llog_rec_hdr		 lgi_lrh;
+	struct llog_rec_tail		 lgi_tail;
+};
+
+extern struct lu_context_key llog_thread_key;
+
+static inline struct llog_thread_info *llog_info(const struct lu_env *env)
+{
+	struct llog_thread_info *lgi;
+	/* fetch the value stored under llog_thread_key in env's context */
+	lgi = lu_context_key_get(&env->le_ctx, &llog_thread_key);
+	LASSERT(lgi);	/* callers never see NULL */
+	return lgi;
+}
+
+static inline void
+lustre_build_llog_lvfs_oid(struct llog_logid *logid, __u64 ino, __u32 gen)
+{
+	ostid_set_seq_llog(&logid->lgl_oi);
+	ostid_set_id(&logid->lgl_oi, ino);	/* object id taken from ino */
+	logid->lgl_ogen = gen;	/* generation taken from gen */
+}
+
+int llog_info_init(void);
+void llog_info_fini(void);
+
+void llog_handle_get(struct llog_handle *loghandle);
+void llog_handle_put(struct llog_handle *loghandle);
+int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle,
+		       struct llog_handle **res, struct llog_logid *logid);
+int class_config_dump_handler(const struct lu_env *env,
+			      struct llog_handle *handle,
+			      struct llog_rec_hdr *rec, void *data);
+int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size);
+int llog_process_or_fork(const struct lu_env *env,
+			 struct llog_handle *loghandle,
+			 llog_cb_t cb, void *data, void *catdata, bool fork);
+int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle,
+		     struct llog_handle *loghandle, int index);
+#endif
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_ioctl.c b/drivers/staging/lustre/lustre/obdclass/llog_ioctl.c
new file mode 100644
index 000000000000..0732874e26c5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_ioctl.c
@@ -0,0 +1,427 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include "llog_internal.h"
+
+static int str2logid(struct llog_logid *logid, char *str, int len)
+{
+	char *start, *end, *endp;
+	__u64 id, seq;
+	/* Parse "#<id>#<seq>#<ogen>" from @str into @logid; 0 or -EINVAL. */
+	ENTRY;
+	start = str;
+	if (*start != '#')
+		RETURN(-EINVAL);
+	/* first field: object id, base auto-detected (base 0) */
+	start++;
+	if (start - str >= len - 1)
+		RETURN(-EINVAL);
+	end = strchr(start, '#');
+	if (end == NULL || end == start)
+		RETURN(-EINVAL);
+	/* NB: each '#' separator is overwritten with NUL: @str is modified */
+	*end = '\0';
+	id = simple_strtoull(start, &endp, 0);
+	if (endp != end)
+		RETURN(-EINVAL);
+	/* second field: sequence, parsed the same way */
+	start = ++end;
+	if (start - str >= len - 1)
+		RETURN(-EINVAL);
+	end = strchr(start, '#');
+	if (end == NULL || end == start)
+		RETURN(-EINVAL);
+
+	*end = '\0';
+	seq = simple_strtoull(start, &endp, 0);
+	if (endp != end)
+		RETURN(-EINVAL);
+
+	ostid_set_seq(&logid->lgl_oi, seq);
+	ostid_set_id(&logid->lgl_oi, id);
+	/* last field: generation, hex, must run to the end of the string */
+	start = ++end;
+	if (start - str >= len - 1)
+		RETURN(-EINVAL);
+	logid->lgl_ogen = simple_strtoul(start, &endp, 16);
+	if (*endp != '\0')
+		RETURN(-EINVAL);
+
+	RETURN(0);
+}
+
+static int llog_check_cb(const struct lu_env *env, struct llog_handle *handle,
+			 struct llog_rec_hdr *rec, void *data)
+{
+	struct obd_ioctl_data *ioc_data = (struct obd_ioctl_data *)data;
+	static int l, remains, from, to;	/* kept across calls */
+	static char *out;			/* write cursor, also kept */
+	char *endp;
+	int cur_index, rc = 0;
+
+	ENTRY;
+	/* setup runs once per request: ioc_inllen1 is zeroed at the end of it */
+	if (ioc_data && ioc_data->ioc_inllen1 > 0) {
+		l = 0;
+		remains = ioc_data->ioc_inllen4 +
+			cfs_size_round(ioc_data->ioc_inllen1) +
+			cfs_size_round(ioc_data->ioc_inllen2) +
+			cfs_size_round(ioc_data->ioc_inllen3);
+		from = simple_strtol(ioc_data->ioc_inlbuf2, &endp, 0);
+		if (*endp != '\0')
+			RETURN(-EINVAL);
+		to = simple_strtol(ioc_data->ioc_inlbuf3, &endp, 0);
+		if (*endp != '\0')
+			RETURN(-EINVAL);
+		ioc_data->ioc_inllen1 = 0;
+		out = ioc_data->ioc_bulk;
+	}
+	/* records below @from are skipped; above @to (if to > 0) stops the walk */
+	cur_index = rec->lrh_index;
+	if (cur_index < from)
+		RETURN(0);
+	if (to > 0 && cur_index > to)
+		RETURN(-LLOG_EEMPTY);
+
+	if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) {
+		struct llog_logid_rec	*lir = (struct llog_logid_rec *)rec;
+		struct llog_handle	*loghandle;
+		/* NB: out/remains are not advanced after this message */
+		if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+			l = snprintf(out, remains, "[index]: %05d  [type]: "
+				     "%02x  [len]: %04d failed\n",
+				     cur_index, rec->lrh_type,
+				     rec->lrh_len);
+		}
+		if (handle->lgh_ctxt == NULL)
+			RETURN(-EOPNOTSUPP);
+		rc = llog_cat_id2handle(env, handle, &loghandle, &lir->lid_id);
+		if (rc) {
+			CDEBUG(D_IOCTL, "cannot find log #"DOSTID"#%08x\n",
+			       POSTID(&lir->lid_id.lgl_oi),
+			       lir->lid_id.lgl_ogen);
+			RETURN(rc);
+		}
+		rc = llog_process(env, loghandle, llog_check_cb, NULL, NULL);
+		llog_handle_put(loghandle);
+	} else {
+		bool ok;
+		/* plain log: report whether the record type is a known one */
+		switch (rec->lrh_type) {
+		case OST_SZ_REC:
+		case MDS_UNLINK_REC:
+		case MDS_UNLINK64_REC:
+		case MDS_SETATTR64_REC:
+		case OBD_CFG_REC:
+		case LLOG_GEN_REC:
+		case LLOG_HDR_MAGIC:
+			ok = true;
+			break;
+		default:
+			ok = false;
+		}
+
+		l = snprintf(out, remains, "[index]: %05d  [type]: "
+			     "%02x  [len]: %04d %s\n",
+			     cur_index, rec->lrh_type, rec->lrh_len,
+			     ok ? "ok" : "failed");
+		out += l;
+		remains -= l;
+		if (remains <= 0) {
+			CERROR("%s: no space to print log records\n",
+			       handle->lgh_ctxt->loc_obd->obd_name);
+			RETURN(-LLOG_EEMPTY);
+		}
+	}
+	RETURN(rc);
+}
+
+static int llog_print_cb(const struct lu_env *env, struct llog_handle *handle,
+			 struct llog_rec_hdr *rec, void *data)
+{
+	struct obd_ioctl_data *ioc_data = (struct obd_ioctl_data *)data;
+	static int l, remains, from, to;	/* kept across calls */
+	static char *out;			/* write cursor, also kept */
+	char *endp;
+	int cur_index;
+	/* setup runs once per request: ioc_inllen1 is zeroed at the end of it */
+	ENTRY;
+	if (ioc_data != NULL && ioc_data->ioc_inllen1 > 0) {
+		l = 0;
+		remains = ioc_data->ioc_inllen4 +
+			cfs_size_round(ioc_data->ioc_inllen1) +
+			cfs_size_round(ioc_data->ioc_inllen2) +
+			cfs_size_round(ioc_data->ioc_inllen3);
+		from = simple_strtol(ioc_data->ioc_inlbuf2, &endp, 0);
+		if (*endp != '\0')
+			RETURN(-EINVAL);
+		to = simple_strtol(ioc_data->ioc_inlbuf3, &endp, 0);
+		if (*endp != '\0')
+			RETURN(-EINVAL);
+		out = ioc_data->ioc_bulk;
+		ioc_data->ioc_inllen1 = 0;
+	}
+	/* records below @from are skipped; above @to (if to > 0) stops the walk */
+	cur_index = rec->lrh_index;
+	if (cur_index < from)
+		RETURN(0);
+	if (to > 0 && cur_index > to)
+		RETURN(-LLOG_EEMPTY);
+
+	if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) {
+		struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
+
+		if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+			CERROR("invalid record in catalog\n");
+			RETURN(-EINVAL);
+		}
+		/* catalog entry: print the logid it refers to */
+		l = snprintf(out, remains,
+			     "[index]: %05d  [logid]: #"DOSTID"#%08x\n",
+			     cur_index, POSTID(&lir->lid_id.lgl_oi),
+			     lir->lid_id.lgl_ogen);
+	} else if (rec->lrh_type == OBD_CFG_REC) {
+		int rc;
+		/* a non-negative rc is used as the number of bytes produced */
+		rc = class_config_parse_rec(rec, out, remains);
+		if (rc < 0)
+			RETURN(rc);
+		l = rc;
+	} else {
+		l = snprintf(out, remains,
+			     "[index]: %05d  [type]: %02x  [len]: %04d\n",
+			     cur_index, rec->lrh_type, rec->lrh_len);
+	}
+	out += l;
+	remains -= l;
+	if (remains <= 0) {
+		CERROR("not enough space for print log records\n");
+		RETURN(-LLOG_EEMPTY);
+	}
+
+	RETURN(0);
+}
+static int llog_remove_log(const struct lu_env *env, struct llog_handle *cat,
+			   struct llog_logid *logid)
+{
+	struct llog_handle	*log;
+	int			 rc;
+
+	ENTRY;
+	/* Look up @logid in @cat, destroy it, then llog_cat_cleanup() it. */
+	rc = llog_cat_id2handle(env, cat, &log, logid);
+	if (rc) {
+		CDEBUG(D_IOCTL, "cannot find log #"DOSTID"#%08x\n",
+		       POSTID(&logid->lgl_oi), logid->lgl_ogen);
+		RETURN(-ENOENT);	/* the lookup rc is not propagated */
+	}
+
+	rc = llog_destroy(env, log);
+	if (rc) {
+		CDEBUG(D_IOCTL, "cannot destroy log\n");
+		GOTO(out, rc);
+	}
+	llog_cat_cleanup(env, cat, log, log->u.phd.phd_cookie.lgc_index);
+out:
+	llog_handle_put(log);	/* reached on success and on destroy failure */
+	RETURN(rc);
+
+}
+
+static int llog_delete_cb(const struct lu_env *env, struct llog_handle *handle,
+			  struct llog_rec_hdr *rec, void *data)
+{
+	struct llog_logid_rec	*lir = (struct llog_logid_rec *)rec;
+	int			 rc;
+
+	ENTRY;		/* per-record callback used by OBD_IOC_LLOG_REMOVE */
+	if (rec->lrh_type != LLOG_LOGID_MAGIC)
+		RETURN(-EINVAL);
+	rc = llog_remove_log(env, handle, &lir->lid_id);
+	/* @handle is passed as the catalog, the record supplies the logid */
+	RETURN(rc);
+}
+
+/* Serve an OBD_IOC_LLOG_* @cmd on the log that data->ioc_inlbuf1 names. */
+int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd,
+	       struct obd_ioctl_data *data)
+{
+	struct llog_logid	 logid;
+	int			 rc = 0;
+	struct llog_handle	*handle = NULL;
+
+	ENTRY;
+	/* "#<id>#<seq>#<ogen>" opens by logid, "$<name>" opens by name */
+	if (*data->ioc_inlbuf1 == '#') {
+		rc = str2logid(&logid, data->ioc_inlbuf1, data->ioc_inllen1);
+		if (rc)
+			RETURN(rc);
+		rc = llog_open(env, ctxt, &handle, &logid, NULL,
+			       LLOG_OPEN_EXISTS);
+		if (rc)
+			RETURN(rc);
+	} else if (*data->ioc_inlbuf1 == '$') {
+		char *name = data->ioc_inlbuf1 + 1;
+
+		rc = llog_open(env, ctxt, &handle, NULL, name,
+			       LLOG_OPEN_EXISTS);
+		if (rc)
+			RETURN(rc);
+	} else {
+		RETURN(-EINVAL);
+	}
+
+	rc = llog_init_handle(env, handle, 0, NULL);
+	if (rc)
+		GOTO(out_close, rc = -ENOENT);
+
+	switch (cmd) {
+	case OBD_IOC_LLOG_INFO: {
+		int	 l;
+		int	 remains = data->ioc_inllen2 +
+				   cfs_size_round(data->ioc_inllen1);
+		char	*out = data->ioc_bulk;
+
+		l = snprintf(out, remains,
+			     "logid:	    #"DOSTID"#%08x\n"
+			     "flags:	    %x (%s)\n"
+			     "records count:    %d\n"
+			     "last index:       %d\n",
+			     POSTID(&handle->lgh_id.lgl_oi),
+			     handle->lgh_id.lgl_ogen,
+			     handle->lgh_hdr->llh_flags,
+			     handle->lgh_hdr->llh_flags &
+			     LLOG_F_IS_CAT ? "cat" : "plain",
+			     handle->lgh_hdr->llh_count,
+			     handle->lgh_last_idx);
+		out += l;
+		remains -= l;
+		if (remains <= 0) {
+			CERROR("%s: not enough space for log header info\n",
+			       ctxt->loc_obd->obd_name);
+			rc = -ENOSPC;
+		}
+		break;
+	}
+	case OBD_IOC_LLOG_CHECK:
+		LASSERT(data->ioc_inllen1 > 0);
+		rc = llog_process(env, handle, llog_check_cb, data, NULL);
+		if (rc == -LLOG_EEMPTY)
+			rc = 0;	/* the callback's stop signal */
+		else if (rc)
+			GOTO(out_close, rc);
+		break;
+	case OBD_IOC_LLOG_PRINT:
+		LASSERT(data->ioc_inllen1 > 0);
+		rc = llog_process(env, handle, llog_print_cb, data, NULL);
+		if (rc == -LLOG_EEMPTY)
+			rc = 0;	/* the callback's stop signal */
+		else if (rc)
+			GOTO(out_close, rc);
+		break;
+	case OBD_IOC_LLOG_CANCEL: {
+		struct llog_cookie cookie;
+		struct llog_logid plain;
+		char *endp;
+
+		cookie.lgc_index = simple_strtoul(data->ioc_inlbuf3, &endp, 0);
+		if (*endp != '\0')
+			GOTO(out_close, rc = -EINVAL);
+
+		if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) {
+			rc = llog_cancel_rec(env, handle, cookie.lgc_index);
+			GOTO(out_close, rc);
+		} else if (!(handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) {
+			GOTO(out_close, rc = -EINVAL);
+		}
+
+		if (data->ioc_inlbuf2 == NULL) /* catalog but no logid */
+			GOTO(out_close, rc = -ENOTTY);
+
+		rc = str2logid(&plain, data->ioc_inlbuf2, data->ioc_inllen2);
+		if (rc)
+			GOTO(out_close, rc);
+		cookie.lgc_lgl = plain;
+		rc = llog_cat_cancel_records(env, handle, 1, &cookie);
+		if (rc)
+			GOTO(out_close, rc);
+		break;
+	}
+	case OBD_IOC_LLOG_REMOVE: {
+		struct llog_logid plain;
+
+		if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) {
+			rc = llog_destroy(env, handle);
+			GOTO(out_close, rc);
+		} else if (!(handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) {
+			GOTO(out_close, rc = -EINVAL);
+		}
+
+		if (data->ioc_inlbuf2 != NULL) {
+			/* remove the indicated log from the catalog */
+			rc = str2logid(&plain, data->ioc_inlbuf2,
+				       data->ioc_inllen2);
+			if (rc)
+				GOTO(out_close, rc);
+			rc = llog_remove_log(env, handle, &plain);
+		} else {
+			/* remove all the logs of the catalog */
+			rc = llog_process(env, handle, llog_delete_cb, NULL,
+					  NULL);
+			if (rc)
+				GOTO(out_close, rc);
+		}
+		break;
+	}
+	default:
+		CERROR("%s: Unknown ioctl cmd %#x\n",
+		       ctxt->loc_obd->obd_name, cmd);
+		GOTO(out_close, rc = -ENOTTY);
+	}
+
+out_close:
+	if (handle->lgh_hdr &&
+	    handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)
+		llog_cat_close(env, handle);
+	else
+		llog_close(env, handle);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_ioctl);
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_lvfs.c b/drivers/staging/lustre/lustre/obdclass/llog_lvfs.c
new file mode 100644
index 000000000000..7e12dc62141f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_lvfs.c
@@ -0,0 +1,862 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_lvfs.c
+ *
+ * OST<->MDS recovery logging infrastructure.
+ * Invariants in implementation:
+ * - we do not share logs among different OST<->MDS connections, so that
+ *   if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_log.h>
+#include <obd_ost.h>
+#include <linux/list.h>
+#include <lvfs.h>
+#include <lustre_fsfilt.h>
+#include <lustre_disk.h>
+#include "llog_internal.h"
+
+#if  defined(LLOG_LVFS)
+
+static int llog_lvfs_pad(struct obd_device *obd, struct l_file *file,
+				int len, int index)
+{
+	struct llog_rec_hdr rec = { 0 };
+	struct llog_rec_tail tail;
+	int rc;
+	ENTRY;
+	/* Write a LLOG_PAD_MAGIC record of @len bytes at file->f_pos. */
+	LASSERT(len >= LLOG_MIN_REC_SIZE && (len & 0x7) == 0);
+
+	tail.lrt_len = rec.lrh_len = len;
+	tail.lrt_index = rec.lrh_index = index;
+	rec.lrh_type = LLOG_PAD_MAGIC;
+
+	rc = fsfilt_write_record(obd, file, &rec, sizeof(rec), &file->f_pos, 0);
+	if (rc) {
+		CERROR("error writing padding record: rc %d\n", rc);
+		goto out;
+	}
+	/* skip the body: only the header and the tail are written */
+	file->f_pos += len - sizeof(rec) - sizeof(tail);
+	rc = fsfilt_write_record(obd, file, &tail, sizeof(tail),&file->f_pos,0);
+	if (rc) {
+		CERROR("error writing padding record: rc %d\n", rc);
+		goto out;
+	}
+
+ out:
+	RETURN(rc);
+}
+
+static int llog_lvfs_write_blob(struct obd_device *obd, struct l_file *file,
+				struct llog_rec_hdr *rec, void *buf, loff_t off)
+{
+	int rc;
+	struct llog_rec_tail end;
+	loff_t saved_off = file->f_pos;
+	int buflen = rec->lrh_len;
+
+	ENTRY;
+	/* Write @rec (plus @buf, if any) at @off; rc is 0 or negative. */
+	file->f_pos = off;
+
+	if (buflen == 0)
+		CWARN("0-length record\n");
+	/* no @buf: @rec is written as one piece of lrh_len bytes */
+	if (!buf) {
+		rc = fsfilt_write_record(obd, file, rec, buflen,&file->f_pos,0);
+		if (rc) {
+			CERROR("error writing log record: rc %d\n", rc);
+			goto out;
+		}
+		GOTO(out, rc = 0);
+	}
+
+	/* the buf case: header, @buf and tail are written one after another */
+	rec->lrh_len = sizeof(*rec) + buflen + sizeof(end);
+	rc = fsfilt_write_record(obd, file, rec, sizeof(*rec), &file->f_pos, 0);
+	if (rc) {
+		CERROR("error writing log hdr: rc %d\n", rc);
+		goto out;
+	}
+
+	rc = fsfilt_write_record(obd, file, buf, buflen, &file->f_pos, 0);
+	if (rc) {
+		CERROR("error writing log buffer: rc %d\n", rc);
+		goto out;
+	}
+
+	end.lrt_len = rec->lrh_len;
+	end.lrt_index = rec->lrh_index;
+	rc = fsfilt_write_record(obd, file, &end, sizeof(end), &file->f_pos, 0);
+	if (rc) {
+		CERROR("error writing log tail: rc %d\n", rc);
+		goto out;
+	}
+
+	rc = 0;
+ out:
+	if (saved_off > file->f_pos)	/* f_pos never ends below its old value */
+		file->f_pos = saved_off;
+	LASSERT(rc <= 0);
+	RETURN(rc);
+}
+
+static int llog_lvfs_read_blob(struct obd_device *obd, struct l_file *file,
+				void *buf, int size, loff_t off)
+{
+	loff_t offset = off;
+	int rc;
+	ENTRY;
+	/* read @size bytes at @off through a local copy of the offset */
+	rc = fsfilt_read_record(obd, file, buf, size, &offset);
+	if (rc) {
+		CERROR("error reading log record: rc %d\n", rc);
+		RETURN(rc);
+	}
+	RETURN(0);
+}
+
+static int llog_lvfs_read_header(const struct lu_env *env,
+				 struct llog_handle *handle)
+{
+	struct obd_device *obd;
+	int rc;
+	ENTRY;
+	/* Read the header chunk into handle->lgh_hdr and sanity-check it. */
+	LASSERT(sizeof(*handle->lgh_hdr) == LLOG_CHUNK_SIZE);
+
+	obd = handle->lgh_ctxt->loc_exp->exp_obd;
+
+	if (i_size_read(handle->lgh_file->f_dentry->d_inode) == 0) {
+		CDEBUG(D_HA, "not reading header from 0-byte log\n");
+		RETURN(LLOG_EEMPTY);	/* NB: positive, not -LLOG_EEMPTY */
+	}
+
+	rc = llog_lvfs_read_blob(obd, handle->lgh_file, handle->lgh_hdr,
+				 LLOG_CHUNK_SIZE, 0);
+	if (rc) {
+		CERROR("error reading log header from %.*s\n",
+		       handle->lgh_file->f_dentry->d_name.len,
+		       handle->lgh_file->f_dentry->d_name.name);
+	} else {
+		struct llog_rec_hdr *llh_hdr = &handle->lgh_hdr->llh_hdr;
+
+		if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr))
+			lustre_swab_llog_hdr(handle->lgh_hdr);
+		/* the header must carry the header magic and the chunk size */
+		if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) {
+			CERROR("bad log %.*s header magic: %#x (expected %#x)\n",
+			       handle->lgh_file->f_dentry->d_name.len,
+			       handle->lgh_file->f_dentry->d_name.name,
+			       llh_hdr->lrh_type, LLOG_HDR_MAGIC);
+			rc = -EIO;
+		} else if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) {
+			CERROR("incorrectly sized log %.*s header: %#x "
+			       "(expected %#x)\n",
+			       handle->lgh_file->f_dentry->d_name.len,
+			       handle->lgh_file->f_dentry->d_name.name,
+			       llh_hdr->lrh_len, LLOG_CHUNK_SIZE);
+			CERROR("you may need to re-run lconf --write_conf.\n");
+			rc = -EIO;
+		}
+	}
+	/* NB: both fields below are updated even when rc != 0 */
+	handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index;
+	handle->lgh_file->f_pos = i_size_read(handle->lgh_file->f_dentry->d_inode);
+
+	RETURN(rc);
+}
+
+/* returns negative on error; 0 if success && reccookie == 0; 1 otherwise */
+/* appends if idx == -1, otherwise overwrites record idx. */
+static int llog_lvfs_write_rec(const struct lu_env *env,
+			       struct llog_handle *loghandle,
+			       struct llog_rec_hdr *rec,
+			       struct llog_cookie *reccookie, int cookiecount,
+			       void *buf, int idx, struct thandle *th)
+{
+	struct llog_log_hdr *llh;
+	int reclen = rec->lrh_len, index, rc;
+	struct llog_rec_tail *lrt;
+	struct obd_device *obd;
+	struct file *file;
+	size_t left;
+	ENTRY;
+	/* with @buf: @rec is only a header, @buf holds lrh_len payload bytes */
+	llh = loghandle->lgh_hdr;
+	file = loghandle->lgh_file;
+	obd = loghandle->lgh_ctxt->loc_exp->exp_obd;
+
+	/* record length should not be bigger than LLOG_CHUNK_SIZE */
+	if (buf)
+		rc = (reclen > LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr) -
+		      sizeof(struct llog_rec_tail)) ? -E2BIG : 0;
+	else
+		rc = (reclen > LLOG_CHUNK_SIZE) ? -E2BIG : 0;
+	if (rc)
+		RETURN(rc);
+
+	if (buf)
+		/* write_blob adds header and tail to lrh_len. */
+		reclen = sizeof(*rec) + rec->lrh_len +
+			 sizeof(struct llog_rec_tail);
+
+	if (idx != -1) {
+		loff_t saved_offset;
+
+		/* no header: only allowed to insert record 1 */
+		if (idx != 1 && !i_size_read(file->f_dentry->d_inode)) {
+			CERROR("idx != -1 in empty log\n");
+			LBUG();
+		}
+
+		if (idx && llh->llh_size && llh->llh_size != rec->lrh_len)
+			RETURN(-EINVAL);
+
+		if (!ext2_test_bit(idx, llh->llh_bitmap))
+			CERROR("Modify unset record %u\n", idx);
+		if (idx != rec->lrh_index)
+			CERROR("Index mismatch %d %u\n", idx, rec->lrh_index);
+		/* the log header is rewritten at offset 0 first */
+		rc = llog_lvfs_write_blob(obd, file, &llh->llh_hdr, NULL, 0);
+		/* we are done if we only write the header or on error */
+		if (rc || idx == 0)
+			RETURN(rc);
+
+		if (buf) {
+			/* We assume that caller has set lgh_cur_* */
+			saved_offset = loghandle->lgh_cur_offset;
+			CDEBUG(D_OTHER,
+			       "modify record "DOSTID": idx:%d/%u/%d, len:%u "
+			       "offset %llu\n",
+			       POSTID(&loghandle->lgh_id.lgl_oi), idx, rec->lrh_index,
+			       loghandle->lgh_cur_idx, rec->lrh_len,
+			       (long long)(saved_offset - sizeof(*llh)));
+			if (rec->lrh_index != loghandle->lgh_cur_idx) {
+				CERROR("modify idx mismatch %u/%d\n",
+				       idx, loghandle->lgh_cur_idx);
+				RETURN(-EFAULT);
+			}
+		} else {
+			/* Assumes constant lrh_len */
+			saved_offset = sizeof(*llh) + (idx - 1) * reclen;
+		}
+
+		rc = llog_lvfs_write_blob(obd, file, rec, buf, saved_offset);
+		if (rc == 0 && reccookie) {
+			reccookie->lgc_lgl = loghandle->lgh_id;
+			reccookie->lgc_index = idx;
+			rc = 1;
+		}
+		RETURN(rc);
+	}
+
+	/* Make sure that records don't cross a chunk boundary, so we can
+	 * process them page-at-a-time if needed.  If it will cross a chunk
+	 * boundary, write in a fake (but referenced) entry to pad the chunk.
+	 *
+	 * We know that llog_current_log() will return a loghandle that is
+	 * big enough to hold reclen, so all we care about is padding here.
+	 */
+	left = LLOG_CHUNK_SIZE - (file->f_pos & (LLOG_CHUNK_SIZE - 1));
+
+	/* NOTE: padding is a record, but no bit is set */
+	if (left != 0 && left != reclen &&
+	    left < (reclen + LLOG_MIN_REC_SIZE)) {
+		 index = loghandle->lgh_last_idx + 1;
+		 rc = llog_lvfs_pad(obd, file, left, index);
+		 if (rc)
+			 RETURN(rc);
+		 loghandle->lgh_last_idx++; /* for pad rec */
+	 }
+	 /* if it's the last idx in log file, then return -ENOSPC */
+	 if (loghandle->lgh_last_idx >= LLOG_BITMAP_SIZE(llh) - 1)
+		 RETURN(-ENOSPC);
+	loghandle->lgh_last_idx++;
+	index = loghandle->lgh_last_idx;
+	LASSERT(index < LLOG_BITMAP_SIZE(llh));
+	rec->lrh_index = index;
+	if (buf == NULL) {
+		lrt = (struct llog_rec_tail *)
+			((char *)rec + rec->lrh_len - sizeof(*lrt));
+		lrt->lrt_len = rec->lrh_len;
+		lrt->lrt_index = rec->lrh_index;
+	}
+	/* The caller must make sure only 1 process accesses lgh_last_idx,
+	 * otherwise it might hit the assert. */
+	LASSERT(index < LLOG_BITMAP_SIZE(llh));
+	spin_lock(&loghandle->lgh_hdr_lock);
+	if (ext2_set_bit(index, llh->llh_bitmap)) {
+		CERROR("argh, index %u already set in log bitmap?\n", index);
+		spin_unlock(&loghandle->lgh_hdr_lock);
+		LBUG(); /* should never happen */
+	}
+	llh->llh_count++;
+	spin_unlock(&loghandle->lgh_hdr_lock);
+	llh->llh_tail.lrt_index = index;
+	/* header first (offset 0), then the record at the current f_pos */
+	rc = llog_lvfs_write_blob(obd, file, &llh->llh_hdr, NULL, 0);
+	if (rc)
+		RETURN(rc);
+
+	rc = llog_lvfs_write_blob(obd, file, rec, buf, file->f_pos);
+	if (rc)
+		RETURN(rc);
+
+	CDEBUG(D_RPCTRACE, "added record "DOSTID": idx: %u, %u \n",
+	       POSTID(&loghandle->lgh_id.lgl_oi), index, rec->lrh_len);
+	if (rc == 0 && reccookie) {
+		reccookie->lgc_lgl = loghandle->lgh_id;
+		reccookie->lgc_index = index;
+		if ((rec->lrh_type == MDS_UNLINK_REC) ||
+		    (rec->lrh_type == MDS_SETATTR64_REC))
+			reccookie->lgc_subsys = LLOG_MDS_OST_ORIG_CTXT;
+		else if (rec->lrh_type == OST_SZ_REC)
+			reccookie->lgc_subsys = LLOG_SIZE_ORIG_CTXT;
+		else
+			reccookie->lgc_subsys = -1;
+		rc = 1;
+	}
+	if (rc == 0 && rec->lrh_type == LLOG_GEN_REC)
+		rc = 1;
+
+	RETURN(rc);
+}
+
+/* We can skip reading at least as many log blocks as the number of
+ * minimum sized log records we are skipping.  If it turns out
+ * that we are not far enough along the log (because the
+ * actual records are larger than minimum size) we just skip
+ * some more records. */
+
+static void llog_skip_over(__u64 *off, int curr, int goal)
+{
+	if (goal > curr) {
+		*off += (goal - curr - 1) * LLOG_MIN_REC_SIZE;
+		*off &= ~(LLOG_CHUNK_SIZE - 1);	/* round down to a chunk */
+	}
+}
+
+
+/* sets:
+ *  - cur_offset to the furthest point read in the log file
+ *  - cur_idx to the log index preceding cur_offset
+ * returns 0, or a negative value (-EIO/-EINVAL/-ENOENT, read error) on error
+ */
+static int llog_lvfs_next_block(const struct lu_env *env,
+				struct llog_handle *loghandle, int *cur_idx,
+				int next_idx, __u64 *cur_offset, void *buf,
+				int len)
+{
+	int rc;
+	ENTRY;
+	/* @len must be a non-zero multiple of LLOG_CHUNK_SIZE */
+	if (len == 0 || len & (LLOG_CHUNK_SIZE - 1))
+		RETURN(-EINVAL);
+
+	CDEBUG(D_OTHER, "looking for log index %u (cur idx %u off "LPU64")\n",
+	       next_idx, *cur_idx, *cur_offset);
+
+	while (*cur_offset < i_size_read(loghandle->lgh_file->f_dentry->d_inode)) {
+		struct llog_rec_hdr *rec, *last_rec;
+		struct llog_rec_tail *tail;
+		loff_t ppos;
+		int llen;
+
+		llog_skip_over(cur_offset, *cur_idx, next_idx);
+
+		/* read up to next LLOG_CHUNK_SIZE block */
+		ppos = *cur_offset;
+		llen = LLOG_CHUNK_SIZE - (*cur_offset & (LLOG_CHUNK_SIZE - 1));
+		rc = fsfilt_read_record(loghandle->lgh_ctxt->loc_exp->exp_obd,
+					loghandle->lgh_file, buf, llen,
+					cur_offset);
+		if (rc < 0) {
+			CERROR("Cant read llog block at log id "DOSTID
+			       "/%u offset "LPU64"\n",
+			       POSTID(&loghandle->lgh_id.lgl_oi),
+			       loghandle->lgh_id.lgl_ogen,
+			       *cur_offset);
+			RETURN(rc);
+		}
+
+		/* put number of bytes read into rc to make code simpler */
+		rc = *cur_offset - ppos;
+		if (rc < len) {
+			/* signal the end of the valid buffer to llog_process */
+			memset(buf + rc, 0, len - rc);
+		}
+
+		if (rc == 0) /* end of file, nothing to do */
+			RETURN(0);
+
+		if (rc < sizeof(*tail)) {
+			CERROR("Invalid llog block at log id "DOSTID"/%u offset"
+			       LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+			       loghandle->lgh_id.lgl_ogen, *cur_offset);
+			RETURN(-EINVAL);
+		}
+
+		rec = buf;
+		if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+			lustre_swab_llog_rec(rec);
+		/* the tail of the last record sits at the end of the data read */
+		tail = (struct llog_rec_tail *)(buf + rc -
+						sizeof(struct llog_rec_tail));
+
+		/* get the last record in block */
+		last_rec = (struct llog_rec_hdr *)(buf + rc -
+						   le32_to_cpu(tail->lrt_len));
+
+		if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec))
+			lustre_swab_llog_rec(last_rec);
+		LASSERT(last_rec->lrh_index == tail->lrt_index);
+
+		*cur_idx = tail->lrt_index;
+
+		/* this shouldn't happen */
+		if (tail->lrt_index == 0) {
+			CERROR("Invalid llog tail at log id "DOSTID"/%u offset "
+			       LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+			       loghandle->lgh_id.lgl_ogen, *cur_offset);
+			RETURN(-EINVAL);
+		}
+		if (tail->lrt_index < next_idx)
+			continue;	/* next_idx not reached: read on */
+
+		/* sanity check that the start of the new buffer is no farther
+		 * than the record that we wanted.  This shouldn't happen. */
+		if (rec->lrh_index > next_idx) {
+			CERROR("missed desired record? %u > %u\n",
+			       rec->lrh_index, next_idx);
+			RETURN(-ENOENT);
+		}
+		RETURN(0);
+	}
+	RETURN(-EIO);
+}
+
+static int llog_lvfs_prev_block(const struct lu_env *env,
+				struct llog_handle *loghandle,
+				int prev_idx, void *buf, int len)
+{
+	__u64 cur_offset;
+	int rc;
+	ENTRY;
+	/* Read forward from LLOG_CHUNK_SIZE until a block reaches @prev_idx. */
+	if (len == 0 || len & (LLOG_CHUNK_SIZE - 1))
+		RETURN(-EINVAL);
+
+	CDEBUG(D_OTHER, "looking for log index %u\n", prev_idx);
+
+	cur_offset = LLOG_CHUNK_SIZE;
+	llog_skip_over(&cur_offset, 0, prev_idx);
+
+	while (cur_offset < i_size_read(loghandle->lgh_file->f_dentry->d_inode)) {
+		struct llog_rec_hdr *rec, *last_rec;
+		struct llog_rec_tail *tail;
+		loff_t ppos = cur_offset;
+		/* NB: each pass asks for @len bytes into @buf */
+		rc = fsfilt_read_record(loghandle->lgh_ctxt->loc_exp->exp_obd,
+					loghandle->lgh_file, buf, len,
+					&cur_offset);
+		if (rc < 0) {
+			CERROR("Cant read llog block at log id "DOSTID
+			       "/%u offset "LPU64"\n",
+			       POSTID(&loghandle->lgh_id.lgl_oi),
+			       loghandle->lgh_id.lgl_ogen,
+			       cur_offset);
+			RETURN(rc);
+		}
+
+		/* put number of bytes read into rc to make code simpler */
+		rc = cur_offset - ppos;
+
+		if (rc == 0) /* end of file, nothing to do */
+			RETURN(0);
+
+		if (rc < sizeof(*tail)) {
+			CERROR("Invalid llog block at log id "DOSTID"/%u offset"
+			       LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+			       loghandle->lgh_id.lgl_ogen, cur_offset);
+			RETURN(-EINVAL);
+		}
+
+		rec = buf;
+		if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+			lustre_swab_llog_rec(rec);
+		/* the tail of the last record sits at the end of the data read */
+		tail = (struct llog_rec_tail *)(buf + rc -
+						sizeof(struct llog_rec_tail));
+
+		/* get the last record in block */
+		last_rec = (struct llog_rec_hdr *)(buf + rc -
+						   le32_to_cpu(tail->lrt_len));
+
+		if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec))
+			lustre_swab_llog_rec(last_rec);
+		LASSERT(last_rec->lrh_index == tail->lrt_index);
+
+		/* this shouldn't happen */
+		if (tail->lrt_index == 0) {
+			CERROR("Invalid llog tail at log id "DOSTID"/%u offset"
+			       LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+			       loghandle->lgh_id.lgl_ogen, cur_offset);
+			RETURN(-EINVAL);
+		}
+		if (tail->lrt_index < prev_idx)
+			continue;	/* prev_idx not reached: read on */
+
+		/* sanity check that the start of the new buffer is no farther
+		 * than the record that we wanted.  This shouldn't happen. */
+		if (rec->lrh_index > prev_idx) {
+			CERROR("missed desired record? %u > %u\n",
+			       rec->lrh_index, prev_idx);
+			RETURN(-ENOENT);
+		}
+		RETURN(0);
+	}
+	RETURN(-EIO);
+}
+
+static struct file *llog_filp_open(char *dir, char *name, int flags, int mode)
+{
+	char *logname;
+	struct file *filp;
+	int len;
+	/* Open "<dir>/<name>"; returns the file or an ERR_PTR(). */
+	OBD_ALLOC(logname, PATH_MAX);
+	if (logname == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	len = snprintf(logname, PATH_MAX, "%s/%s", dir, name);
+	if (len >= PATH_MAX - 1) {
+		filp = ERR_PTR(-ENAMETOOLONG);
+	} else {
+		filp = l_filp_open(logname, flags, mode);
+		if (IS_ERR(filp) && PTR_ERR(filp) != -ENOENT)	/* ENOENT: quiet */
+			CERROR("logfile creation %s: %ld\n", logname,
+			       PTR_ERR(filp));
+	}
+	OBD_FREE(logname, PATH_MAX);	/* the path buffer is freed on every path */
+	return filp;
+}
+
+static int llog_lvfs_open(const struct lu_env *env,  struct llog_handle *handle,
+			  struct llog_logid *logid, char *name,
+			  enum llog_open_param open_param)
+{
+	struct llog_ctxt	*ctxt = handle->lgh_ctxt;
+	struct l_dentry		*dchild = NULL;
+	struct obd_device	*obd;
+	int			 rc = 0;
+	/* Open by @logid, else by @name; neither means a new unnamed log. */
+	ENTRY;
+
+	LASSERT(ctxt);
+	LASSERT(ctxt->loc_exp);
+	LASSERT(ctxt->loc_exp->exp_obd);
+	obd = ctxt->loc_exp->exp_obd;
+
+	LASSERT(handle);	/* NB: @handle was already dereferenced above */
+	if (logid != NULL) {
+		dchild = obd_lvfs_fid2dentry(ctxt->loc_exp, &logid->lgl_oi,
+					     logid->lgl_ogen);
+		if (IS_ERR(dchild)) {
+			rc = PTR_ERR(dchild);
+			CERROR("%s: error looking up logfile #"DOSTID "#%08x:"
+			       " rc = %d\n", ctxt->loc_obd->obd_name,
+			       POSTID(&logid->lgl_oi), logid->lgl_ogen, rc);
+			GOTO(out, rc);
+		}
+		if (dchild->d_inode == NULL) {
+			l_dput(dchild);
+			rc = -ENOENT;
+			CERROR("%s: nonexistent llog #"DOSTID"#%08x:"
+			       "rc = %d\n", ctxt->loc_obd->obd_name,
+			       POSTID(&logid->lgl_oi), logid->lgl_ogen, rc);
+			GOTO(out, rc);
+		}
+		handle->lgh_file = l_dentry_open(&obd->obd_lvfs_ctxt, dchild,
+						 O_RDWR | O_LARGEFILE);
+		l_dput(dchild);
+		if (IS_ERR(handle->lgh_file)) {
+			rc = PTR_ERR(handle->lgh_file);
+			handle->lgh_file = NULL;
+			CERROR("%s: error opening llog #"DOSTID"#%08x:"
+			       "rc = %d\n", ctxt->loc_obd->obd_name,
+			       POSTID(&logid->lgl_oi), logid->lgl_ogen, rc);
+			GOTO(out, rc);
+		}
+		handle->lgh_id = *logid;
+	} else if (name) {
+		handle->lgh_file = llog_filp_open(MOUNT_CONFIGS_DIR, name,
+						  O_RDWR | O_LARGEFILE, 0644);
+		if (IS_ERR(handle->lgh_file)) {
+			rc = PTR_ERR(handle->lgh_file);
+			handle->lgh_file = NULL;
+			if (rc == -ENOENT && open_param == LLOG_OPEN_NEW) {
+				OBD_ALLOC(handle->lgh_name, strlen(name) + 1);
+				if (handle->lgh_name)
+					strcpy(handle->lgh_name, name);
+				else
+					GOTO(out, rc = -ENOMEM);
+				rc = 0;	/* missing file is fine for a new log */
+			} else {
+				GOTO(out, rc);
+			}
+		} else {
+			lustre_build_llog_lvfs_oid(&handle->lgh_id,
+			    handle->lgh_file->f_dentry->d_inode->i_ino,
+			    handle->lgh_file->f_dentry->d_inode->i_generation);
+		}
+	} else {
+		LASSERTF(open_param == LLOG_OPEN_NEW, "%#x\n", open_param);
+		handle->lgh_file = NULL;
+	}
+
+	/* No new llog is expected but doesn't exist */
+	if (open_param != LLOG_OPEN_NEW && handle->lgh_file == NULL)
+		GOTO(out_name, rc = -ENOENT);
+
+	RETURN(0);
+out_name:	/* NOTE(review): strlen(name) assumes @name != NULL - confirm */
+	if (handle->lgh_name != NULL)
+		OBD_FREE(handle->lgh_name, strlen(name) + 1);
+out:
+	RETURN(rc);
+}
+
+static int llog_lvfs_exist(struct llog_handle *handle)
+{
+	return handle->lgh_file != NULL;	/* 1 if a file is attached */
+}
+
+/* This is a callback from the llog_* functions.
+ * Assumes caller has already pushed us into the kernel context. */
+static int llog_lvfs_create(const struct lu_env *env,
+			    struct llog_handle *handle,
+			    struct thandle *th)
+{
+	struct llog_ctxt	*ctxt = handle->lgh_ctxt;
+	struct obd_device	*obd;
+	struct l_dentry		*dchild = NULL;
+	struct file		*file;
+	struct obdo		*oa = NULL;
+	int			 rc = 0;
+	int			 open_flags = O_RDWR | O_CREAT | O_LARGEFILE;
+	/* named log: file in MOUNT_CONFIGS_DIR; else obd_create() an object */
+	ENTRY;
+
+	LASSERT(ctxt);
+	LASSERT(ctxt->loc_exp);
+	obd = ctxt->loc_exp->exp_obd;
+	LASSERT(handle->lgh_file == NULL);
+
+	if (handle->lgh_name) {
+		file = llog_filp_open(MOUNT_CONFIGS_DIR, handle->lgh_name,
+				      open_flags, 0644);
+		if (IS_ERR(file))
+			RETURN(PTR_ERR(file));
+		/* the log id is built from the inode number and generation */
+		lustre_build_llog_lvfs_oid(&handle->lgh_id,
+				file->f_dentry->d_inode->i_ino,
+				file->f_dentry->d_inode->i_generation);
+		handle->lgh_file = file;
+	} else {
+		OBDO_ALLOC(oa);
+		if (oa == NULL)
+			RETURN(-ENOMEM);
+
+		ostid_set_seq_llog(&oa->o_oi);
+		oa->o_valid = OBD_MD_FLGENER | OBD_MD_FLGROUP;
+
+		rc = obd_create(NULL, ctxt->loc_exp, oa, NULL, NULL);
+		if (rc)
+			GOTO(out, rc);
+
+		/* FIXME: rationalize the misuse of o_generation in
+		 *	this API along with mds_obd_{create,destroy}.
+		 *	Hopefully it is only an internal API issue. */
+#define o_generation o_parent_oid
+		dchild = obd_lvfs_fid2dentry(ctxt->loc_exp, &oa->o_oi,
+					     oa->o_generation);
+		if (IS_ERR(dchild))
+			GOTO(out, rc = PTR_ERR(dchild));
+
+		file = l_dentry_open(&obd->obd_lvfs_ctxt, dchild, open_flags);
+		l_dput(dchild);
+		if (IS_ERR(file))
+			GOTO(out, rc = PTR_ERR(file));
+		handle->lgh_id.lgl_oi = oa->o_oi;
+		handle->lgh_id.lgl_ogen = oa->o_generation;
+		handle->lgh_file = file;
+out:
+		OBDO_FREE(oa);	/* reached on success and on every error above */
+	}
+	RETURN(rc);
+}
+
+/*
+ * Release everything llog_lvfs_open()/llog_lvfs_create() attached to
+ * \a handle: the backing file (if any) and the saved log name (if any).
+ *
+ * llog_lvfs_open() allocates handle->lgh_name for a named log that does not
+ * exist yet (LLOG_OPEN_NEW) while leaving handle->lgh_file NULL, so the name
+ * must be freed even when there is no file to close; otherwise it leaks when
+ * the handle is closed without ever being created.
+ *
+ * \a env is not used by this implementation.
+ *
+ * Returns the result of filp_close(), or 0 when no file was open.
+ */
+static int llog_lvfs_close(const struct lu_env *env,
+			   struct llog_handle *handle)
+{
+	int rc = 0;
+
+	ENTRY;
+
+	if (handle->lgh_file != NULL) {
+		rc = filp_close(handle->lgh_file, 0);
+		if (rc)
+			CERROR("%s: error closing llog #"DOSTID"#%08x: "
+			       "rc = %d\n", handle->lgh_ctxt->loc_obd->obd_name,
+			       POSTID(&handle->lgh_id.lgl_oi),
+			       handle->lgh_id.lgl_ogen, rc);
+		/* the file pointer is dropped even if the close reported an
+		 * error */
+		handle->lgh_file = NULL;
+	}
+	if (handle->lgh_name) {
+		OBD_FREE(handle->lgh_name, strlen(handle->lgh_name) + 1);
+		handle->lgh_name = NULL;
+	}
+	RETURN(rc);
+}
+
+/*
+ * Close the llog backing file and remove it from storage.
+ *
+ * The handle must have an open file (asserted). If the file's parent
+ * directory is named MOUNT_CONFIGS_DIR the file is unlinked through
+ * ll_vfs_unlink(); otherwise the object identified by the handle's log id is
+ * destroyed with obd_destroy() between fsfilt_start_log() and
+ * fsfilt_commit().
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int llog_lvfs_destroy(const struct lu_env *env,
+			     struct llog_handle *handle)
+{
+	struct dentry *fdentry;
+	struct obdo *oa;
+	struct obd_device *obd = handle->lgh_ctxt->loc_exp->exp_obd;
+	char *dir;
+	void *th;
+	struct inode *inode;
+	int rc, rc1;
+	ENTRY;
+
+	dir = MOUNT_CONFIGS_DIR;
+
+	LASSERT(handle->lgh_file);
+	fdentry = handle->lgh_file->f_dentry;
+	/* inode of the parent directory, not of the log file itself */
+	inode = fdentry->d_parent->d_inode;
+	if (strcmp(fdentry->d_parent->d_name.name, dir) == 0) {
+		struct lvfs_run_ctxt saved;
+		/* take our own mount and dentry references before
+		 * llog_lvfs_close() gives up the file */
+		struct vfsmount *mnt = mntget(handle->lgh_file->f_vfsmnt);
+
+		push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+		dget(fdentry);
+		rc = llog_lvfs_close(env, handle);
+		if (rc == 0) {
+			/* unlink only if the close succeeded, with the parent
+			 * directory mutex held */
+			mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+			rc = ll_vfs_unlink(inode, fdentry, mnt);
+			mutex_unlock(&inode->i_mutex);
+		}
+		mntput(mnt);
+
+		dput(fdentry);
+		pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+		RETURN(rc);
+	}
+
+	OBDO_ALLOC(oa);
+	if (oa == NULL)
+		RETURN(-ENOMEM);
+
+	/* copy the object id out of the handle before closing it */
+	oa->o_oi = handle->lgh_id.lgl_oi;
+	/* o_generation still expands to o_parent_oid here, via the #define
+	 * left active by llog_lvfs_create(); it is undefined right below */
+	oa->o_generation = handle->lgh_id.lgl_ogen;
+#undef o_generation
+	oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLGENER;
+
+	rc = llog_lvfs_close(env, handle);
+	if (rc)
+		GOTO(out, rc);
+
+	/* NOTE(review): 'inode' was read from the file's dentry before the
+	 * close above and no reference is taken on it in this path -- confirm
+	 * something else keeps the parent inode alive here. */
+	th = fsfilt_start_log(obd, inode, FSFILT_OP_UNLINK, NULL, 1);
+	if (IS_ERR(th)) {
+		CERROR("fsfilt_start failed: %ld\n", PTR_ERR(th));
+		GOTO(out, rc = PTR_ERR(th));
+	}
+
+	rc = obd_destroy(NULL, handle->lgh_ctxt->loc_exp, oa,
+			 NULL, NULL, NULL, NULL);
+
+	/* always commit; a commit error is reported only when the destroy
+	 * itself succeeded */
+	rc1 = fsfilt_commit(obd, inode, th, 0);
+	if (rc == 0 && rc1 != 0)
+		rc = rc1;
+ out:
+	OBDO_FREE(oa);
+	RETURN(rc);
+}
+
+/* No-op declare step for llog creation in the lvfs backend: nothing is
+ * reserved and 0 is always returned. */
+static int llog_lvfs_declare_create(const struct lu_env *env,
+				    struct llog_handle *res,
+				    struct thandle *th)
+{
+	return 0;
+}
+
+/* No-op declare step for record writes in the lvfs backend: all arguments
+ * are ignored and 0 is always returned. */
+static int llog_lvfs_declare_write_rec(const struct lu_env *env,
+				       struct llog_handle *loghandle,
+				       struct llog_rec_hdr *rec,
+				       int idx, struct thandle *th)
+{
+	return 0;
+}
+
+/* Operation table wiring the llog_lvfs_* implementations from this file into
+ * the generic llog layer; exported for use by other modules. Operations not
+ * listed here are left unset. */
+struct llog_operations llog_lvfs_ops = {
+	.lop_write_rec		= llog_lvfs_write_rec,
+	.lop_next_block		= llog_lvfs_next_block,
+	.lop_prev_block		= llog_lvfs_prev_block,
+	.lop_read_header	= llog_lvfs_read_header,
+	.lop_create		= llog_lvfs_create,
+	.lop_destroy		= llog_lvfs_destroy,
+	.lop_close		= llog_lvfs_close,
+	.lop_open		= llog_lvfs_open,
+	.lop_exist		= llog_lvfs_exist,
+	.lop_declare_create	= llog_lvfs_declare_create,
+	.lop_declare_write_rec	= llog_lvfs_declare_write_rec,
+};
+EXPORT_SYMBOL(llog_lvfs_ops);
+#else /* !__KERNEL__ */
+struct llog_operations llog_lvfs_ops = {};
+#endif
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_obd.c b/drivers/staging/lustre/lustre/obdclass/llog_obd.c
new file mode 100644
index 000000000000..7e2290796315
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_obd.c
@@ -0,0 +1,319 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include "llog_internal.h"
+
+/* helper functions for calling the llog obd methods */
+/*
+ * Allocate a new llog context owned by \a obd, holding one reference.
+ * Returns the context, or NULL if the allocation failed.
+ */
+static struct llog_ctxt *llog_new_ctxt(struct obd_device *obd)
+{
+	struct llog_ctxt *ctxt;
+
+	OBD_ALLOC_PTR(ctxt);
+	if (ctxt != NULL) {
+		ctxt->loc_obd = obd;
+		atomic_set(&ctxt->loc_refcount, 1);
+	}
+
+	return ctxt;
+}
+
+/*
+ * Drop the export and import references held by \a ctxt (when present) and
+ * free the context itself. The caller must not touch \a ctxt afterwards.
+ */
+static void llog_ctxt_destroy(struct llog_ctxt *ctxt)
+{
+	if (ctxt->loc_exp != NULL) {
+		class_export_put(ctxt->loc_exp);
+		ctxt->loc_exp = NULL;
+	}
+
+	if (ctxt->loc_imp != NULL) {
+		class_import_put(ctxt->loc_imp);
+		ctxt->loc_imp = NULL;
+	}
+
+	OBD_FREE_PTR(ctxt);
+}
+
+/*
+ * Drop one reference on \a ctxt and tear the context down when it was the
+ * last one.
+ *
+ * The decrement happens under the group's olg_lock. On the final put the
+ * context is removed from its slot in olg->olg_ctxts, the backend's cleanup
+ * operation is called if one is set, the context is freed, and waiters on
+ * olg->olg_waitq are woken.
+ *
+ * Returns 0 when references remain; otherwise the result of the cleanup
+ * operation (0 if there is none).
+ */
+int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt)
+{
+	struct obd_llog_group *olg = ctxt->loc_olg;
+	struct obd_device *obd;
+	int rc = 0;
+
+	spin_lock(&olg->olg_lock);
+	if (!atomic_dec_and_test(&ctxt->loc_refcount)) {
+		spin_unlock(&olg->olg_lock);
+		return rc;
+	}
+	/* last reference: unpublish the context while still holding the lock */
+	olg->olg_ctxts[ctxt->loc_idx] = NULL;
+	spin_unlock(&olg->olg_lock);
+
+	obd = ctxt->loc_obd;
+	/* empty critical section, used purely as a synchronization point */
+	spin_lock(&obd->obd_dev_lock);
+	/* sync with llog ctxt user thread */
+	spin_unlock(&obd->obd_dev_lock);
+
+	/* obd->obd_starting is needed for the case of cleanup
+	 * in error case while obd is starting up. */
+	LASSERTF(obd->obd_starting == 1 ||
+		 obd->obd_stopping == 1 || obd->obd_set_up == 0,
+		 "wrong obd state: %d/%d/%d\n", !!obd->obd_starting,
+		 !!obd->obd_stopping, !!obd->obd_set_up);
+
+	/* cleanup the llog ctxt here */
+	if (CTXTP(ctxt, cleanup))
+		rc = CTXTP(ctxt, cleanup)(env, ctxt);
+
+	llog_ctxt_destroy(ctxt);
+	/* olg was saved on entry; ctxt is gone at this point */
+	wake_up(&olg->olg_waitq);
+	return rc;
+}
+EXPORT_SYMBOL(__llog_ctxt_put);
+
+/*
+ * Release the caller's reference on \a ctxt together with the context's own
+ * initial reference, then wait until the context slot in its group is empty.
+ *
+ * The caller must hold a reference in addition to the initial one (the
+ * refcount is asserted to be greater than 1 on entry).
+ *
+ * Returns the result of __llog_ctxt_put().
+ */
+int llog_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt)
+{
+	struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+	struct obd_llog_group *olg;
+	int rc, idx;
+	ENTRY;
+
+	LASSERT(ctxt != NULL);
+	LASSERT(ctxt != LP_POISON);
+
+	olg = ctxt->loc_olg;
+	LASSERT(olg != NULL);
+	LASSERT(olg != LP_POISON);
+
+	/* remember the slot now; ctxt may be freed by the puts below */
+	idx = ctxt->loc_idx;
+
+	/*
+	 * Balance the ctxt get when calling llog_cleanup()
+	 */
+	LASSERT(atomic_read(&ctxt->loc_refcount) < LI_POISON);
+	LASSERT(atomic_read(&ctxt->loc_refcount) > 1);
+	llog_ctxt_put(ctxt);
+
+	/*
+	 * Try to free the ctxt.
+	 */
+	rc = __llog_ctxt_put(env, ctxt);
+	if (rc)
+		/* only the pointer value is printed; ctxt is not
+		 * dereferenced here */
+		CERROR("Error %d while cleaning up ctxt %p\n",
+		       rc, ctxt);
+
+	/* if other holders remain, block until the last one clears the slot */
+	l_wait_event(olg->olg_waitq,
+		     llog_group_ctxt_null(olg, idx), &lwi);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cleanup);
+
+/*
+ * Create llog context \a index in group \a olg on behalf of \a obd, using
+ * \a op as its operation table and \a disk_obd's self export as its export.
+ *
+ * If the slot is already occupied (-EEXIST from llog_group_set_ctxt()), the
+ * existing context is checked against the arguments and 0 is returned. After
+ * the context is installed, op->lop_setup is invoked when provided; if it
+ * fails the slot is cleared and the context destroyed.
+ *
+ * Returns 0 on success, -EINVAL for an out-of-range index, -ENOMEM when the
+ * context cannot be allocated, or the error from the group/backend setup.
+ */
+int llog_setup(const struct lu_env *env, struct obd_device *obd,
+	       struct obd_llog_group *olg, int index,
+	       struct obd_device *disk_obd, struct llog_operations *op)
+{
+	struct llog_ctxt *ctxt;
+	int rc = 0;
+	ENTRY;
+
+	if (!(index >= 0 && index < LLOG_MAX_CTXTS))
+		RETURN(-EINVAL);
+
+	LASSERT(olg != NULL);
+
+	ctxt = llog_new_ctxt(obd);
+	if (ctxt == NULL)
+		RETURN(-ENOMEM);
+
+	ctxt->loc_obd = obd;
+	ctxt->loc_olg = olg;
+	ctxt->loc_idx = index;
+	ctxt->loc_logops = op;
+	mutex_init(&ctxt->loc_mutex);
+	ctxt->loc_exp = class_export_get(disk_obd->obd_self_export);
+	ctxt->loc_flags = LLOG_CTXT_FLAG_UNINITIALIZED;
+
+	rc = llog_group_set_ctxt(olg, ctxt, index);
+	if (rc != 0) {
+		/* the freshly built context is not needed in any failure case */
+		llog_ctxt_destroy(ctxt);
+		if (rc != -EEXIST)
+			RETURN(rc);
+
+		ctxt = llog_group_get_ctxt(olg, index);
+		if (ctxt != NULL) {
+			/*
+			 * mds_lov_update_desc() might call here multiple
+			 * times. So if the llog is already set up then
+			 * there is no need to do it again.
+			 */
+			CDEBUG(D_CONFIG, "obd %s ctxt %d already set up\n",
+			       obd->obd_name, index);
+			LASSERT(ctxt->loc_olg == olg);
+			LASSERT(ctxt->loc_obd == obd);
+			LASSERT(ctxt->loc_exp == disk_obd->obd_self_export);
+			LASSERT(ctxt->loc_logops == op);
+			llog_ctxt_put(ctxt);
+		}
+		RETURN(0);
+	}
+
+	if (op->lop_setup != NULL) {
+		if (!OBD_FAIL_CHECK(OBD_FAIL_OBD_LLOG_SETUP))
+			rc = op->lop_setup(env, obd, olg, index, disk_obd);
+		else
+			rc = -EOPNOTSUPP;
+	}
+
+	if (rc == 0) {
+		CDEBUG(D_CONFIG, "obd %s ctxt %d is initialized\n",
+		       obd->obd_name, index);
+		ctxt->loc_flags &= ~LLOG_CTXT_FLAG_UNINITIALIZED;
+	} else {
+		CERROR("%s: ctxt %d lop_setup=%p failed: rc = %d\n",
+		       obd->obd_name, index, op->lop_setup, rc);
+		llog_group_clear_ctxt(olg, index);
+		llog_ctxt_destroy(ctxt);
+	}
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_setup);
+
+/*
+ * Invoke the context's sync operation, if it has one.
+ * Returns 0 when \a ctxt is NULL or has no sync operation; otherwise the
+ * operation's result.
+ */
+int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags)
+{
+	int rc;
+	ENTRY;
+
+	if (ctxt == NULL)
+		RETURN(0);
+
+	rc = 0;
+	if (CTXTP(ctxt, sync) != NULL)
+		rc = CTXTP(ctxt, sync)(ctxt, exp, flags);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_sync);
+
+/*
+ * Add a record through the context's obd_add operation.
+ *
+ * CFS_CAP_SYS_RESOURCE is raised around the call when it is not already
+ * raised, and lowered again afterwards in that case only.
+ *
+ * Returns -ENODEV when \a ctxt is NULL, -ENXIO when the context is still
+ * flagged uninitialized, otherwise the operation's result (the
+ * CTXT_CHECK_OP line is passed -EOPNOTSUPP for a missing operation).
+ */
+int llog_obd_add(const struct lu_env *env, struct llog_ctxt *ctxt,
+		 struct llog_rec_hdr *rec, struct lov_stripe_md *lsm,
+		 struct llog_cookie *logcookies, int numcookies)
+{
+	int need_raise;
+	int rc;
+	ENTRY;
+
+	if (ctxt == NULL) {
+		CERROR("No ctxt\n");
+		RETURN(-ENODEV);
+	}
+
+	if ((ctxt->loc_flags & LLOG_CTXT_FLAG_UNINITIALIZED) != 0)
+		RETURN(-ENXIO);
+
+	CTXT_CHECK_OP(ctxt, obd_add, -EOPNOTSUPP);
+
+	need_raise = !cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+	if (need_raise)
+		cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+
+	rc = CTXTP(ctxt, obd_add)(env, ctxt, rec, lsm, logcookies,
+				  numcookies);
+
+	if (need_raise)
+		cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_obd_add);
+
+/*
+ * Cancel records through the context's cancel operation.
+ * Returns -ENODEV when \a ctxt is NULL; otherwise the operation's result
+ * (the CTXT_CHECK_OP line is passed -EOPNOTSUPP for a missing operation).
+ */
+int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt,
+		struct lov_stripe_md *lsm, int count,
+		struct llog_cookie *cookies, int flags)
+{
+	int rc;
+	ENTRY;
+
+	if (ctxt == NULL) {
+		CERROR("No ctxt\n");
+		RETURN(-ENODEV);
+	}
+
+	CTXT_CHECK_OP(ctxt, cancel, -EOPNOTSUPP);
+
+	rc = CTXTP(ctxt, cancel)(env, ctxt, lsm, count, cookies, flags);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cancel);
+
+/*
+ * Dispatch to the llog_init method of \a obd and return its result.
+ * The OBD_CHECK_DT_OP line handles a missing method (it is passed 0 as the
+ * fallback value), and the call is counted via OBD_COUNTER_INCREMENT.
+ */
+int obd_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+		  struct obd_device *disk_obd, int *index)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(obd, llog_init, 0);
+	OBD_COUNTER_INCREMENT(obd, llog_init);
+
+	rc = OBP(obd, llog_init)(obd, olg, disk_obd, index);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(obd_llog_init);
+
+/*
+ * Dispatch to the llog_finish method of \a obd and return its result.
+ * The OBD_CHECK_DT_OP line handles a missing method (it is passed 0 as the
+ * fallback value), and the call is counted via OBD_COUNTER_INCREMENT.
+ */
+int obd_llog_finish(struct obd_device *obd, int count)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(obd, llog_finish, 0);
+	OBD_COUNTER_INCREMENT(obd, llog_finish);
+
+	rc = OBP(obd, llog_finish)(obd, count);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(obd_llog_finish);
+
+/* Per-thread llog scratch data (struct llog_thread_info) is attached to the
+ * lu context through llog_thread_key, which llog_info_init() registers and
+ * llog_info_fini() unregisters. */
+/* context key constructor/destructor: llog_key_init, llog_key_fini */
+LU_KEY_INIT_FINI(llog, struct llog_thread_info);
+/* context key: llog_thread_key */
+LU_CONTEXT_KEY_DEFINE(llog, LCT_MD_THREAD | LCT_MG_THREAD | LCT_LOCAL);
+/* generic initializer used by llog_info_init(): llog_key_init_generic */
+LU_KEY_INIT_GENERIC(llog);
+EXPORT_SYMBOL(llog_thread_key);
+
+/*
+ * Initialize and register the llog thread-info context key.
+ *
+ * Returns the result of lu_context_key_register(), so a failed registration
+ * is reported to the caller instead of being masked by an unconditional 0.
+ */
+int llog_info_init(void)
+{
+	llog_key_init_generic(&llog_thread_key, NULL);
+	return lu_context_key_register(&llog_thread_key);
+}
+
+/* Unregister the llog thread-info context key registered by
+ * llog_info_init(). */
+void llog_info_fini(void)
+{
+	lu_context_key_degister(&llog_thread_key);
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_osd.c b/drivers/staging/lustre/lustre/obdclass/llog_osd.c
new file mode 100644
index 000000000000..6dbd21a863c2
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_osd.c
@@ -0,0 +1,1323 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_osd.c - low level llog routines on top of OSD API
+ *
+ * Author: Alexey Zhuravlev <alexey.zhuravlev@intel.com>
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#ifndef EXPORT_SYMTAB
+#define EXPORT_SYMTAB
+#endif
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_fid.h>
+#include <dt_object.h>
+
+#include "llog_internal.h"
+#include "local_storage.h"
+
+/*
+ * - multi-chunks or big-declaration approach
+ * - use unique sequence instead of llog sb tracking unique ids
+ * - re-use existing environment
+ * - named llog support (can be used for testing only at the present)
+ * - llog_origin_connect() work with OSD API
+ */
+
+/*
+ * Declare the creation of a new llog object \a o in transaction \a th.
+ * The per-thread scratch attributes are set to describe a regular file with
+ * mode S_IRUGO | S_IWUSR and handed to local_object_declare_create(), whose
+ * result is returned.
+ */
+static int llog_osd_declare_new_object(const struct lu_env *env,
+				       struct local_oid_storage *los,
+				       struct dt_object *o,
+				       struct thandle *th)
+{
+	struct llog_thread_info *lgi = llog_info(env);
+
+	lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG);
+	lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+	lgi->lgi_attr.la_valid = LA_MODE;
+
+	return local_object_declare_create(env, los, o, &lgi->lgi_attr,
+					   &lgi->lgi_dof, th);
+}
+
+/*
+ * Create a new llog object \a o in transaction \a th.
+ * Uses the same attributes as llog_osd_declare_new_object(): a regular file
+ * with mode S_IRUGO | S_IWUSR. Returns the result of local_object_create().
+ */
+static int llog_osd_create_new_object(const struct lu_env *env,
+				      struct local_oid_storage *los,
+				      struct dt_object *o,
+				      struct thandle *th)
+{
+	struct llog_thread_info *lgi = llog_info(env);
+
+	lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG);
+	lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+	lgi->lgi_attr.la_valid = LA_MODE;
+
+	return local_object_create(env, los, o, &lgi->lgi_attr,
+				   &lgi->lgi_dof, th);
+}
+
+/*
+ * Write a padding record of \a len bytes with index \a index at \a *off.
+ *
+ * Only the record header and the record tail are written; the offset is
+ * advanced over the bytes in between without writing them. \a len must be at
+ * least LLOG_MIN_REC_SIZE and a multiple of 8 (asserted). Both writes happen
+ * under the object's write lock.
+ *
+ * Returns 0 on success or the error from dt_record_write().
+ */
+static int llog_osd_pad(const struct lu_env *env, struct dt_object *o,
+			loff_t *off, int len, int index, struct thandle *th)
+{
+	struct llog_thread_info	*lgi = llog_info(env);
+	int			 rc;
+
+	ENTRY;
+
+	LASSERT(th);
+	LASSERT(off);
+	LASSERT(len >= LLOG_MIN_REC_SIZE && (len & 0x7) == 0);
+
+	/* header and tail carry the same length and index */
+	lgi->lgi_tail.lrt_len = lgi->lgi_lrh.lrh_len = len;
+	lgi->lgi_tail.lrt_index = lgi->lgi_lrh.lrh_index = index;
+	lgi->lgi_lrh.lrh_type = LLOG_PAD_MAGIC;
+
+	lgi->lgi_buf.lb_buf = &lgi->lgi_lrh;
+	lgi->lgi_buf.lb_len = sizeof(lgi->lgi_lrh);
+	dt_write_lock(env, o, 0);
+	rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+	if (rc) {
+		CERROR("%s: error writing padding record: rc = %d\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name, rc);
+		GOTO(out, rc);
+	}
+
+	lgi->lgi_buf.lb_buf = &lgi->lgi_tail;
+	lgi->lgi_buf.lb_len = sizeof(lgi->lgi_tail);
+	/* skip the unwritten middle so the tail ends exactly at the end of
+	 * the len-byte record (assumes the header write above advanced *off
+	 * by the header size) */
+	*off += len - sizeof(lgi->lgi_lrh) - sizeof(lgi->lgi_tail);
+	rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+	if (rc)
+		CERROR("%s: error writing padding record: rc = %d\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name, rc);
+out:
+	dt_write_unlock(env, o);
+	RETURN(rc);
+}
+
+/*
+ * Write one llog record to object \a o at \a *off.
+ *
+ * With \a buf == NULL, \a rec is a complete record of rec->lrh_len bytes and
+ * is written in a single call, without taking the object write lock here.
+ *
+ * With \a buf != NULL, rec->lrh_len on entry is the payload length. The
+ * function rewrites rec->lrh_len to header + payload + tail and then writes
+ * the header, the payload from \a buf, and a tail built in the thread info,
+ * all under the object write lock.
+ *
+ * On any failure the object is punched back to the offset it had on entry
+ * and the saved size attribute is re-applied.
+ *
+ * Returns 0 on success or the error from dt_record_write().
+ */
+static int llog_osd_write_blob(const struct lu_env *env, struct dt_object *o,
+			       struct llog_rec_hdr *rec, void *buf,
+			       loff_t *off, struct thandle *th)
+{
+	struct llog_thread_info	*lgi = llog_info(env);
+	int			 buflen = rec->lrh_len;
+	int			 rc;
+
+	ENTRY;
+
+	LASSERT(env);
+	LASSERT(o);
+
+	if (buflen == 0)
+		CWARN("0-length record\n");
+
+	CDEBUG(D_OTHER, "write blob with type %x, buf %p/%u at off %llu\n",
+	       rec->lrh_type, buf, buflen, *off);
+
+	/* remember the starting offset for the rollback at 'out' */
+	lgi->lgi_attr.la_valid = LA_SIZE;
+	lgi->lgi_attr.la_size = *off;
+
+	if (!buf) {
+		lgi->lgi_buf.lb_len = buflen;
+		lgi->lgi_buf.lb_buf = rec;
+		rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+		if (rc)
+			CERROR("%s: error writing log record: rc = %d\n",
+			       o->do_lu.lo_dev->ld_obd->obd_name, rc);
+		GOTO(out, rc);
+	}
+
+	/* the buf case */
+	/* protect the following 3 writes from concurrent read */
+	dt_write_lock(env, o, 0);
+	/* caller-visible side effect: lrh_len now covers the whole record */
+	rec->lrh_len = sizeof(*rec) + buflen + sizeof(lgi->lgi_tail);
+	lgi->lgi_buf.lb_len = sizeof(*rec);
+	lgi->lgi_buf.lb_buf = rec;
+	rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+	if (rc) {
+		CERROR("%s: error writing log hdr: rc = %d\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name, rc);
+		GOTO(out_unlock, rc);
+	}
+
+	lgi->lgi_buf.lb_len = buflen;
+	lgi->lgi_buf.lb_buf = buf;
+	rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+	if (rc) {
+		CERROR("%s: error writing log buffer: rc = %d\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name,  rc);
+		GOTO(out_unlock, rc);
+	}
+
+	lgi->lgi_tail.lrt_len = rec->lrh_len;
+	lgi->lgi_tail.lrt_index = rec->lrh_index;
+	lgi->lgi_buf.lb_len = sizeof(lgi->lgi_tail);
+	lgi->lgi_buf.lb_buf = &lgi->lgi_tail;
+	rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+	if (rc)
+		CERROR("%s: error writing log tail: rc = %d\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name, rc);
+
+out_unlock:
+	dt_write_unlock(env, o);
+
+out:
+	/* cleanup the content written above */
+	if (rc) {
+		/* best effort: the results of the punch and attr_set are
+		 * not checked */
+		dt_punch(env, o, lgi->lgi_attr.la_size, OBD_OBJECT_EOF, th,
+			 BYPASS_CAPA);
+		dt_attr_set(env, o, &lgi->lgi_attr, th, BYPASS_CAPA);
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Read and validate the llog header of \a handle into handle->lgh_hdr.
+ *
+ * Returns LLOG_EEMPTY for a zero-size object, -EIO when the header magic or
+ * the header length is wrong, the underlying error when reading attributes
+ * or the header fails, and 0 on success. On success handle->lgh_last_idx is
+ * taken from the header tail.
+ */
+static int llog_osd_read_header(const struct lu_env *env,
+				struct llog_handle *handle)
+{
+	struct llog_thread_info	*lgi;
+	struct llog_rec_hdr	*llh_hdr;
+	struct dt_object	*o;
+	int			 rc;
+
+	ENTRY;
+
+	LASSERT(sizeof(*handle->lgh_hdr) == LLOG_CHUNK_SIZE);
+
+	o = handle->lgh_obj;
+	LASSERT(o);
+
+	lgi = llog_info(env);
+
+	rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL);
+	if (rc != 0)
+		RETURN(rc);
+
+	LASSERT(lgi->lgi_attr.la_valid & LA_SIZE);
+
+	if (lgi->lgi_attr.la_size == 0) {
+		CDEBUG(D_HA, "not reading header from 0-byte log\n");
+		RETURN(LLOG_EEMPTY);
+	}
+
+	/* the header occupies the first chunk of the object */
+	lgi->lgi_buf.lb_len = LLOG_CHUNK_SIZE;
+	lgi->lgi_buf.lb_buf = handle->lgh_hdr;
+	lgi->lgi_off = 0;
+
+	rc = dt_record_read(env, o, &lgi->lgi_buf, &lgi->lgi_off);
+	if (rc != 0) {
+		CERROR("%s: error reading log header from "DFID": rc = %d\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name,
+		       PFID(lu_object_fid(&o->do_lu)), rc);
+		RETURN(rc);
+	}
+
+	llh_hdr = &handle->lgh_hdr->llh_hdr;
+	if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr))
+		lustre_swab_llog_hdr(handle->lgh_hdr);
+
+	if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) {
+		CERROR("%s: bad log %s "DFID" header magic: %#x "
+		       "(expected %#x)\n", o->do_lu.lo_dev->ld_obd->obd_name,
+		       handle->lgh_name ? handle->lgh_name : "",
+		       PFID(lu_object_fid(&o->do_lu)),
+		       llh_hdr->lrh_type, LLOG_HDR_MAGIC);
+		RETURN(-EIO);
+	}
+
+	if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) {
+		CERROR("%s: incorrectly sized log %s "DFID" header: "
+		       "%#x (expected %#x)\n"
+		       "you may need to re-run lconf --write_conf.\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name,
+		       handle->lgh_name ? handle->lgh_name : "",
+		       PFID(lu_object_fid(&o->do_lu)),
+		       llh_hdr->lrh_len, LLOG_CHUNK_SIZE);
+		RETURN(-EIO);
+	}
+
+	handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index;
+
+	RETURN(0);
+}
+
+/*
+ * Declare the writes needed to add or update a record in \a loghandle.
+ *
+ * A header rewrite is always declared. For \a idx == 0 (header only) that is
+ * all. Otherwise, for an existing object, a punch from the current size to
+ * EOF is declared as well, followed by a fixed 32KB record write at the
+ * current end of the object (offset 0 if the object does not exist yet).
+ *
+ * \a rec is not used. Returns 0 on success or the first declaration error.
+ */
+static int llog_osd_declare_write_rec(const struct lu_env *env,
+				      struct llog_handle *loghandle,
+				      struct llog_rec_hdr *rec,
+				      int idx, struct thandle *th)
+{
+	struct llog_thread_info	*lgi = llog_info(env);
+	struct dt_object	*o;
+	int			 rc;
+
+	ENTRY;
+
+	LASSERT(env);
+	LASSERT(th);
+	LASSERT(loghandle);
+
+	o = loghandle->lgh_obj;
+	LASSERT(o);
+
+	/* each time we update header */
+	rc = dt_declare_record_write(env, o, sizeof(struct llog_log_hdr), 0,
+				     th);
+	if (rc != 0)
+		RETURN(rc);
+	/* just the header: nothing more to declare */
+	if (idx == 0)
+		RETURN(0);
+
+	if (!dt_object_exists(o)) {
+		lgi->lgi_off = 0;
+	} else {
+		rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+		lgi->lgi_off = lgi->lgi_attr.la_size;
+		LASSERT(ergo(rc == 0, lgi->lgi_attr.la_valid & LA_SIZE));
+		if (rc != 0)
+			RETURN(rc);
+
+		rc = dt_declare_punch(env, o, lgi->lgi_off, OBD_OBJECT_EOF, th);
+		if (rc != 0)
+			RETURN(rc);
+	}
+
+	/* XXX: implement declared window or multi-chunks approach */
+	rc = dt_declare_record_write(env, o, 32 * 1024, lgi->lgi_off, th);
+
+	RETURN(rc);
+}
+
+/* returns negative on error; 0 if success && reccookie == 0; 1 otherwise */
+/* appends if idx == -1, otherwise overwrites record idx. */
+/*
+ * Write record \a rec (with optional separate payload \a buf) to the llog
+ * behind \a loghandle inside transaction \a th.
+ *
+ * idx == -1: append. A padding record is inserted first when the record
+ *   would otherwise cross a LLOG_CHUNK_SIZE boundary, the next index is
+ *   allocated in the header bitmap, the header is rewritten and the record
+ *   is written at the end of the object. -ENOSPC is returned when the bitmap
+ *   is full.
+ * idx == 0: rewrite the header only.
+ * idx > 0: rewrite the header, then overwrite record idx in place.
+ *
+ * On success with \a reccookie set, the cookie is filled in with the log id
+ * and record index and 1 is returned. \a cookiecount is not used here.
+ */
+static int llog_osd_write_rec(const struct lu_env *env,
+			      struct llog_handle *loghandle,
+			      struct llog_rec_hdr *rec,
+			      struct llog_cookie *reccookie, int cookiecount,
+			      void *buf, int idx, struct thandle *th)
+{
+	struct llog_thread_info	*lgi = llog_info(env);
+	struct llog_log_hdr	*llh;
+	int			 reclen = rec->lrh_len;
+	int			 index, rc, old_tail_idx;
+	struct llog_rec_tail	*lrt;
+	struct dt_object	*o;
+	size_t			 left;
+
+	ENTRY;
+
+	LASSERT(env);
+	llh = loghandle->lgh_hdr;
+	LASSERT(llh);
+	o = loghandle->lgh_obj;
+	LASSERT(o);
+	LASSERT(th);
+
+	CDEBUG(D_OTHER, "new record %x to "DFID"\n",
+	       rec->lrh_type, PFID(lu_object_fid(&o->do_lu)));
+
+	/* record length should not bigger than LLOG_CHUNK_SIZE */
+	if (buf)
+		rc = (reclen > LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr) -
+		      sizeof(struct llog_rec_tail)) ? -E2BIG : 0;
+	else
+		rc = (reclen > LLOG_CHUNK_SIZE) ? -E2BIG : 0;
+	if (rc)
+		RETURN(rc);
+
+	rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL);
+	if (rc)
+		RETURN(rc);
+
+	if (buf)
+		/* write_blob adds header and tail to lrh_len. */
+		reclen = sizeof(*rec) + rec->lrh_len +
+			 sizeof(struct llog_rec_tail);
+
+	/* ---- overwrite path: an explicit index was given ---- */
+	if (idx != -1) {
+		/* no header: only allowed to insert record 1 */
+		if (idx != 1 && lgi->lgi_attr.la_size == 0)
+			LBUG();
+
+		/* fixed-size logs only accept records of the configured size */
+		if (idx && llh->llh_size && llh->llh_size != rec->lrh_len)
+			RETURN(-EINVAL);
+
+		/* the two checks below only log; they do not abort the write */
+		if (!ext2_test_bit(idx, llh->llh_bitmap))
+			CERROR("%s: modify unset record %u\n",
+			       o->do_lu.lo_dev->ld_obd->obd_name, idx);
+		if (idx != rec->lrh_index)
+			CERROR("%s: index mismatch %d %u\n",
+			       o->do_lu.lo_dev->ld_obd->obd_name, idx,
+			       rec->lrh_index);
+
+		lgi->lgi_off = 0;
+		rc = llog_osd_write_blob(env, o, &llh->llh_hdr, NULL,
+					 &lgi->lgi_off, th);
+		/* we are done if we only write the header or on error */
+		if (rc || idx == 0)
+			RETURN(rc);
+
+		if (buf) {
+			/* We assume that caller has set lgh_cur_* */
+			lgi->lgi_off = loghandle->lgh_cur_offset;
+			CDEBUG(D_OTHER,
+			       "modify record "DOSTID": idx:%d/%u/%d, len:%u "
+			       "offset %llu\n",
+			       POSTID(&loghandle->lgh_id.lgl_oi), idx,
+			       rec->lrh_index,
+			       loghandle->lgh_cur_idx, rec->lrh_len,
+			       (long long)(lgi->lgi_off - sizeof(*llh)));
+			if (rec->lrh_index != loghandle->lgh_cur_idx) {
+				CERROR("%s: modify idx mismatch %u/%d\n",
+				       o->do_lu.lo_dev->ld_obd->obd_name, idx,
+				       loghandle->lgh_cur_idx);
+				RETURN(-EFAULT);
+			}
+		} else {
+			/* Assumes constant lrh_len */
+			lgi->lgi_off = sizeof(*llh) + (idx - 1) * reclen;
+		}
+
+		rc = llog_osd_write_blob(env, o, rec, buf, &lgi->lgi_off, th);
+		if (rc == 0 && reccookie) {
+			reccookie->lgc_lgl = loghandle->lgh_id;
+			reccookie->lgc_index = idx;
+			rc = 1;
+		}
+		RETURN(rc);
+	}
+
+	/* ---- append path ---- */
+	/* Make sure that records don't cross a chunk boundary, so we can
+	 * process them page-at-a-time if needed.  If it will cross a chunk
+	 * boundary, write in a fake (but referenced) entry to pad the chunk.
+	 *
+	 * We know that llog_current_log() will return a loghandle that is
+	 * big enough to hold reclen, so all we care about is padding here.
+	 */
+	LASSERT(lgi->lgi_attr.la_valid & LA_SIZE);
+	lgi->lgi_off = lgi->lgi_attr.la_size;
+	/* bytes remaining in the current chunk */
+	left = LLOG_CHUNK_SIZE - (lgi->lgi_off & (LLOG_CHUNK_SIZE - 1));
+	/* NOTE: padding is a record, but no bit is set */
+	if (left != 0 && left != reclen &&
+	    left < (reclen + LLOG_MIN_REC_SIZE)) {
+		index = loghandle->lgh_last_idx + 1;
+		rc = llog_osd_pad(env, o, &lgi->lgi_off, left, index, th);
+		if (rc)
+			RETURN(rc);
+		loghandle->lgh_last_idx++; /*for pad rec*/
+	}
+	/* if it's the last idx in log file, then return -ENOSPC */
+	if (loghandle->lgh_last_idx >= LLOG_BITMAP_SIZE(llh) - 1)
+		RETURN(-ENOSPC);
+
+	loghandle->lgh_last_idx++;
+	index = loghandle->lgh_last_idx;
+	LASSERT(index < LLOG_BITMAP_SIZE(llh));
+	rec->lrh_index = index;
+	if (buf == NULL) {
+		/* rec is a complete record: fill in the tail stored at its
+		 * end */
+		lrt = (struct llog_rec_tail *)((char *)rec + rec->lrh_len -
+					       sizeof(*lrt));
+		lrt->lrt_len = rec->lrh_len;
+		lrt->lrt_index = rec->lrh_index;
+	}
+	/* The caller should make sure only 1 process access the lgh_last_idx,
+	 * Otherwise it might hit the assert.*/
+	LASSERT(index < LLOG_BITMAP_SIZE(llh));
+	spin_lock(&loghandle->lgh_hdr_lock);
+	if (ext2_set_bit(index, llh->llh_bitmap)) {
+		CERROR("%s: index %u already set in log bitmap\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name, index);
+		spin_unlock(&loghandle->lgh_hdr_lock);
+		LBUG(); /* should never happen */
+	}
+	llh->llh_count++;
+	spin_unlock(&loghandle->lgh_hdr_lock);
+	/* saved so the header can be restored if a write below fails */
+	old_tail_idx = llh->llh_tail.lrt_index;
+	llh->llh_tail.lrt_index = index;
+
+	lgi->lgi_off = 0;
+	rc = llog_osd_write_blob(env, o, &llh->llh_hdr, NULL, &lgi->lgi_off,
+				 th);
+	if (rc)
+		GOTO(out, rc);
+
+	/* re-read the size: a pad record may have been appended above */
+	rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL);
+	if (rc)
+		GOTO(out, rc);
+
+	LASSERT(lgi->lgi_attr.la_valid & LA_SIZE);
+	lgi->lgi_off = lgi->lgi_attr.la_size;
+
+	rc = llog_osd_write_blob(env, o, rec, buf, &lgi->lgi_off, th);
+
+out:
+	/* cleanup llog for error case */
+	if (rc) {
+		spin_lock(&loghandle->lgh_hdr_lock);
+		ext2_clear_bit(index, llh->llh_bitmap);
+		llh->llh_count--;
+		spin_unlock(&loghandle->lgh_hdr_lock);
+
+		/* restore the header */
+		/* only the record's own index is given back; an index taken
+		 * by a pad record written above stays consumed */
+		loghandle->lgh_last_idx--;
+		llh->llh_tail.lrt_index = old_tail_idx;
+		lgi->lgi_off = 0;
+		/* best effort: the result of this rewrite is not checked */
+		llog_osd_write_blob(env, o, &llh->llh_hdr, NULL,
+				    &lgi->lgi_off, th);
+	}
+
+	/* NOTE(review): this trace line is also emitted when rc != 0 */
+	CDEBUG(D_RPCTRACE, "added record "DOSTID": idx: %u, %u\n",
+	       POSTID(&loghandle->lgh_id.lgl_oi), index, rec->lrh_len);
+	if (rc == 0 && reccookie) {
+		reccookie->lgc_lgl = loghandle->lgh_id;
+		reccookie->lgc_index = index;
+		/* the cookie subsystem is derived from the record type */
+		if ((rec->lrh_type == MDS_UNLINK_REC) ||
+		    (rec->lrh_type == MDS_SETATTR64_REC))
+			reccookie->lgc_subsys = LLOG_MDS_OST_ORIG_CTXT;
+		else if (rec->lrh_type == OST_SZ_REC)
+			reccookie->lgc_subsys = LLOG_SIZE_ORIG_CTXT;
+		else
+			reccookie->lgc_subsys = -1;
+		rc = 1;
+	}
+	RETURN(rc);
+}
+
+/* Advance *off toward record index 'goal', given that the caller is currently
+ * at index 'curr'.
+ *
+ * Every record is at least LLOG_MIN_REC_SIZE bytes, so (goal - curr - 1)
+ * minimum-sized records can be skipped without reading them. The result is
+ * rounded down to a LLOG_CHUNK_SIZE boundary. If the real records are larger
+ * than the minimum we simply land short of the goal and the caller keeps
+ * scanning from there.
+ *
+ * Nothing is changed when goal <= curr.
+ */
+static void llog_skip_over(__u64 *off, int curr, int goal)
+{
+	if (goal > curr) {
+		__u64 target = *off + (goal - curr - 1) * LLOG_MIN_REC_SIZE;
+
+		*off = target & ~(LLOG_CHUNK_SIZE - 1);
+	}
+}
+
+/* sets:
+ *  - cur_offset to the furthest point read in the log file
+ *  - cur_idx to the log index preceding cur_offset
+ * returns -EIO/-EINVAL on error
+ *
+ * Reads forward from *cur_offset, one chunk-aligned block at a time, until a
+ * block whose last record index is >= next_idx is found. That block is left
+ * in \a buf, zero-filled after the valid data. \a len must be a non-zero
+ * multiple of LLOG_CHUNK_SIZE.
+ *
+ * Returns 0 when the block is found (or at end of file), -ENOENT if the
+ * block read starts beyond the wanted record, -EINVAL for a malformed block
+ * (too short, bad tail length, or zero tail index), -EIO if the end of the
+ * object is reached without finding the index, or the read error.
+ */
+static int llog_osd_next_block(const struct lu_env *env,
+			       struct llog_handle *loghandle, int *cur_idx,
+			       int next_idx, __u64 *cur_offset, void *buf,
+			       int len)
+{
+	struct llog_thread_info	*lgi = llog_info(env);
+	struct dt_object	*o;
+	struct dt_device	*dt;
+	int			 rc;
+
+	ENTRY;
+
+	LASSERT(env);
+	LASSERT(lgi);
+
+	if (len == 0 || len & (LLOG_CHUNK_SIZE - 1))
+		RETURN(-EINVAL);
+
+	CDEBUG(D_OTHER, "looking for log index %u (cur idx %u off "LPU64")\n",
+	       next_idx, *cur_idx, *cur_offset);
+
+	LASSERT(loghandle);
+	LASSERT(loghandle->lgh_ctxt);
+
+	o = loghandle->lgh_obj;
+	LASSERT(o);
+	LASSERT(dt_object_exists(o));
+	dt = lu2dt_dev(o->do_lu.lo_dev);
+	LASSERT(dt);
+
+	rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+	if (rc)
+		GOTO(out, rc);
+
+	while (*cur_offset < lgi->lgi_attr.la_size) {
+		struct llog_rec_hdr	*rec, *last_rec;
+		struct llog_rec_tail	*tail;
+		unsigned int		 tail_len;
+
+		llog_skip_over(cur_offset, *cur_idx, next_idx);
+
+		/* read up to next LLOG_CHUNK_SIZE block */
+		lgi->lgi_buf.lb_len = LLOG_CHUNK_SIZE -
+				      (*cur_offset & (LLOG_CHUNK_SIZE - 1));
+		lgi->lgi_buf.lb_buf = buf;
+
+		/* Note: read lock is not needed around la_size get above at
+		 * the time of dt_attr_get(). There are only two cases that
+		 * matter. Either la_size == cur_offset, in which case the
+		 * entire read is skipped, or la_size > cur_offset and the loop
+		 * is entered and this thread is blocked at dt_read_lock()
+		 * until the write is completed. When the write completes, then
+		 * the dt_read() will be done with the full length, and will
+		 * get the full data.
+		 */
+		dt_read_lock(env, o, 0);
+		rc = dt_read(env, o, &lgi->lgi_buf, cur_offset);
+		dt_read_unlock(env, o);
+		if (rc < 0) {
+			CERROR("%s: can't read llog block from log "DFID
+			       " offset "LPU64": rc = %d\n",
+			       o->do_lu.lo_dev->ld_obd->obd_name,
+			       PFID(lu_object_fid(&o->do_lu)), *cur_offset,
+			       rc);
+			GOTO(out, rc);
+		}
+
+		if (rc < len) {
+			/* signal the end of the valid buffer to
+			 * llog_process */
+			memset(buf + rc, 0, len - rc);
+		}
+
+		if (rc == 0) /* end of file, nothing to do */
+			GOTO(out, rc);
+
+		if (rc < sizeof(*tail)) {
+			CERROR("%s: invalid llog block at log id "DOSTID"/%u "
+			       "offset "LPU64"\n",
+			       o->do_lu.lo_dev->ld_obd->obd_name,
+			       POSTID(&loghandle->lgh_id.lgl_oi),
+			       loghandle->lgh_id.lgl_ogen, *cur_offset);
+			GOTO(out, rc = -EINVAL);
+		}
+
+		rec = buf;
+		if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+			lustre_swab_llog_rec(rec);
+
+		tail = (struct llog_rec_tail *)((char *)buf + rc -
+						sizeof(struct llog_rec_tail));
+
+		/* lrt_len comes straight from disk: refuse a length that
+		 * would place last_rec outside the bytes just read, rather
+		 * than dereferencing a pointer before the start of buf */
+		tail_len = le32_to_cpu(tail->lrt_len);
+		if (tail_len < LLOG_MIN_REC_SIZE ||
+		    tail_len > (unsigned int)rc) {
+			CERROR("%s: invalid llog tail length %u at log id "
+			       DOSTID"/%u offset "LPU64"\n",
+			       o->do_lu.lo_dev->ld_obd->obd_name, tail_len,
+			       POSTID(&loghandle->lgh_id.lgl_oi),
+			       loghandle->lgh_id.lgl_ogen, *cur_offset);
+			GOTO(out, rc = -EINVAL);
+		}
+
+		/* get the last record in block */
+		last_rec = (struct llog_rec_hdr *)((char *)buf + rc -
+						   tail_len);
+
+		if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec))
+			lustre_swab_llog_rec(last_rec);
+		LASSERT(last_rec->lrh_index == tail->lrt_index);
+
+		*cur_idx = tail->lrt_index;
+
+		/* this shouldn't happen */
+		if (tail->lrt_index == 0) {
+			CERROR("%s: invalid llog tail at log id "DOSTID"/%u "
+			       "offset "LPU64"\n",
+			       o->do_lu.lo_dev->ld_obd->obd_name,
+			       POSTID(&loghandle->lgh_id.lgl_oi),
+			       loghandle->lgh_id.lgl_ogen, *cur_offset);
+			GOTO(out, rc = -EINVAL);
+		}
+		/* wanted record is further on: read the next block */
+		if (tail->lrt_index < next_idx)
+			continue;
+
+		/* sanity check that the start of the new buffer is no farther
+		 * than the record that we wanted.  This shouldn't happen. */
+		if (rec->lrh_index > next_idx) {
+			CERROR("%s: missed desired record? %u > %u\n",
+			       o->do_lu.lo_dev->ld_obd->obd_name,
+			       rec->lrh_index, next_idx);
+			GOTO(out, rc = -ENOENT);
+		}
+		GOTO(out, rc = 0);
+	}
+	GOTO(out, rc = -EIO);
+out:
+	return rc;
+}
+
+/* Read the llog chunk that contains record @prev_idx into @buf.  @len must be
+ * non-zero with no bits set below LLOG_CHUNK_SIZE.  Returns 0 when the chunk
+ * was found or dt_read() hit end of file, otherwise a negative errno. */
+static int llog_osd_prev_block(const struct lu_env *env,
+			       struct llog_handle *loghandle,
+			       int prev_idx, void *buf, int len)
+{
+	struct llog_thread_info	*lgi = llog_info(env);
+	struct dt_object	*o;
+	struct dt_device	*dt;
+	loff_t			 cur_offset;
+	int			 rc;
+
+	ENTRY;
+	if (len == 0 || len & (LLOG_CHUNK_SIZE - 1))
+		RETURN(-EINVAL);
+
+	CDEBUG(D_OTHER, "looking for log index %u\n", prev_idx);
+	LASSERT(loghandle);
+	LASSERT(loghandle->lgh_ctxt);
+	o = loghandle->lgh_obj;
+	LASSERT(o);
+	LASSERT(dt_object_exists(o));
+	dt = lu2dt_dev(o->do_lu.lo_dev);
+	LASSERT(dt);
+
+	cur_offset = LLOG_CHUNK_SIZE;	/* scanning starts at the second chunk */
+	llog_skip_over(&cur_offset, 0, prev_idx);
+
+	rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+	if (rc)
+		GOTO(out, rc);
+
+	while (cur_offset < lgi->lgi_attr.la_size) {
+		struct llog_rec_hdr	*rec, *last_rec;
+		struct llog_rec_tail	*tail;
+
+		lgi->lgi_buf.lb_len = len;
+		lgi->lgi_buf.lb_buf = buf;
+		/* It is OK to have locking around dt_read() only, see
+		 * comment in llog_osd_next_block for details
+		 */
+		dt_read_lock(env, o, 0);
+		rc = dt_read(env, o, &lgi->lgi_buf, &cur_offset);
+		dt_read_unlock(env, o);
+		if (rc < 0) {
+			CERROR("%s: can't read llog block from log "DFID
+			       " offset "LPU64": rc = %d\n",
+			       o->do_lu.lo_dev->ld_obd->obd_name,
+			       PFID(lu_object_fid(&o->do_lu)), cur_offset, rc);
+			GOTO(out, rc);
+		}
+
+		if (rc == 0) /* end of file, nothing to do */
+			GOTO(out, rc);
+
+		if (rc < sizeof(*tail)) {
+			CERROR("%s: invalid llog block at log id "DOSTID"/%u "
+			       "offset "LPU64"\n",
+			       o->do_lu.lo_dev->ld_obd->obd_name,
+			       POSTID(&loghandle->lgh_id.lgl_oi),
+			       loghandle->lgh_id.lgl_ogen, cur_offset);
+			GOTO(out, rc = -EINVAL);
+		}
+
+		rec = buf;
+		if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+			lustre_swab_llog_rec(rec);
+
+		tail = (struct llog_rec_tail *)((char *)buf + rc -
+						sizeof(struct llog_rec_tail));
+		/* get the last record in block */
+		last_rec = (struct llog_rec_hdr *)((char *)buf + rc -
+						   le32_to_cpu(tail->lrt_len));
+
+		if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec))
+			lustre_swab_llog_rec(last_rec);
+		LASSERT(last_rec->lrh_index == tail->lrt_index);
+
+		/* this shouldn't happen */
+		if (tail->lrt_index == 0) {
+			CERROR("%s: invalid llog tail at log id "DOSTID"/%u "
+			       "offset "LPU64"\n",
+			       o->do_lu.lo_dev->ld_obd->obd_name,
+			       POSTID(&loghandle->lgh_id.lgl_oi),
+			       loghandle->lgh_id.lgl_ogen, cur_offset);
+			GOTO(out, rc = -EINVAL);
+		}
+		if (tail->lrt_index < prev_idx)
+			continue;	/* chunk ends before prev_idx: read on */
+
+		/* sanity check that the start of the new buffer is no farther
+		 * than the record that we wanted.  This shouldn't happen. */
+		if (rec->lrh_index > prev_idx) {
+			CERROR("%s: missed desired record? %u > %u\n",
+			       o->do_lu.lo_dev->ld_obd->obd_name,
+			       rec->lrh_index, prev_idx);
+			GOTO(out, rc = -ENOENT);
+		}
+		GOTO(out, rc = 0);
+	}
+	GOTO(out, rc = -EIO);	/* reached la_size without finding the chunk */
+out:
+	return rc;
+}
+
+/* Return the directory object for the named llogs of @ctxt: ctxt->loc_dir
+ * (after lu_object_get()) when set, else the object at the device root FID. */
+struct dt_object *llog_osd_dir_get(const struct lu_env *env,
+				   struct llog_ctxt *ctxt)
+{
+	struct dt_thread_info	*info = dt_info(env);
+	struct dt_device	*dev = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt;
+	int			 err;
+
+	/* the context has its own llog directory */
+	if (ctxt->loc_dir != NULL) {
+		lu_object_get(&ctxt->loc_dir->do_lu);
+		return ctxt->loc_dir;
+	}
+
+	/* otherwise use whatever dt_root_get() reports as the root */
+	err = dt_root_get(env, dev, &info->dti_fid);
+	if (err)
+		return ERR_PTR(err);
+	return dt_locate(env, dev, &info->dti_fid);
+}
+
+/* Open (or prepare to create) an llog object and attach it to @handle.  The
+ * FID comes from @logid if given, else from looking up @name in the llog
+ * directory, else it is generated.  Returns 0 or a negative errno. */
+static int llog_osd_open(const struct lu_env *env, struct llog_handle *handle,
+			 struct llog_logid *logid, char *name,
+			 enum llog_open_param open_param)
+{
+	struct llog_thread_info		*lgi = llog_info(env);
+	struct llog_ctxt		*ctxt;
+	struct dt_object		*o;
+	struct dt_device		*dt;
+	struct ls_device		*ls;
+	struct local_oid_storage	*los;
+	int				 rc = 0;
+
+	ENTRY;
+
+	LASSERT(env);
+	LASSERT(handle);
+	ctxt = handle->lgh_ctxt;
+	LASSERT(ctxt);
+	LASSERT(ctxt->loc_exp);
+	LASSERT(ctxt->loc_exp->exp_obd);
+	dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt;
+	LASSERT(dt);
+	ls = ls_device_get(dt);
+	if (IS_ERR(ls))
+		RETURN(PTR_ERR(ls));
+
+	mutex_lock(&ls->ls_los_mutex);
+	los = dt_los_find(ls, name != NULL ? FID_SEQ_LLOG_NAME : FID_SEQ_LLOG);
+	mutex_unlock(&ls->ls_los_mutex);
+	LASSERT(los);
+	/* NOTE(review): ls is still passed to ls_locate() below after this
+	 * put; presumably the los reference keeps it alive - confirm */
+	ls_device_put(env, ls);
+
+	if (logid != NULL) {
+		logid_to_fid(logid, &lgi->lgi_fid);
+	} else if (name) {
+		struct dt_object *llog_dir;
+
+		llog_dir = llog_osd_dir_get(env, ctxt);
+		if (IS_ERR(llog_dir))
+			GOTO(out, rc = PTR_ERR(llog_dir));
+		dt_read_lock(env, llog_dir, 0);
+		rc = dt_lookup_dir(env, llog_dir, name, &lgi->lgi_fid);
+		dt_read_unlock(env, llog_dir);
+		lu_object_put(env, &llog_dir->do_lu);
+		/* not found but a new llog is wanted: generate its fid */
+		if (rc == -ENOENT && open_param == LLOG_OPEN_NEW)
+			rc = local_object_fid_generate(env, los,
+						       &lgi->lgi_fid);
+		if (rc < 0)
+			GOTO(out, rc);
+		OBD_ALLOC(handle->lgh_name, strlen(name) + 1);
+		if (handle->lgh_name == NULL)
+			GOTO(out, rc = -ENOMEM);
+		strcpy(handle->lgh_name, name);
+	} else {
+		LASSERTF(open_param & LLOG_OPEN_NEW, "%#x\n", open_param);
+		rc = local_object_fid_generate(env, los, &lgi->lgi_fid);
+		if (rc < 0)
+			GOTO(out, rc);
+	}
+
+	o = ls_locate(env, ls, &lgi->lgi_fid);
+	if (IS_ERR(o))
+		GOTO(out_name, rc = PTR_ERR(o));
+	/* No new llog is expected but doesn't exist */
+	if (open_param != LLOG_OPEN_NEW && !dt_object_exists(o))
+		GOTO(out_put, rc = -ENOENT);
+
+	fid_to_logid(&lgi->lgi_fid, &handle->lgh_id);
+	handle->lgh_obj = o;
+	handle->private_data = los;
+	RETURN(rc);
+
+out_put:
+	lu_object_put(env, &o->do_lu);
+out_name:
+	if (handle->lgh_name != NULL) {
+		OBD_FREE(handle->lgh_name, strlen(handle->lgh_name) + 1);
+		handle->lgh_name = NULL;	/* no dangling pointer */
+	}
+out:
+	dt_los_put(los);
+	RETURN(rc);
+}
+
+static int llog_osd_exist(struct llog_handle *handle)
+{
+	struct dt_object *o = handle->lgh_obj;
+	LASSERT(o);	/* lgh_obj is set by llog_osd_open() */
+	return dt_object_exists(o) && !lu_object_is_dying(o->do_lu.lo_header);
+}
+
+/* Declare, in transaction @th, the updates needed to create the llog of
+ * @res: the new object, a LLOG_CHUNK_SIZE write at offset 0 and, for a named
+ * llog, the insertion of its name into the llog directory.  Declares nothing
+ * and returns 0 when the object already exists. */
+static int llog_osd_declare_create(const struct lu_env *env,
+				   struct llog_handle *res, struct thandle *th)
+{
+	struct llog_thread_info		*info = llog_info(env);
+	struct local_oid_storage	*los;
+	struct dt_object		*obj, *dir;
+	int				 rc;
+
+	ENTRY;
+
+	LASSERT(res->lgh_obj);
+	LASSERT(th);
+
+	obj = res->lgh_obj;
+	/* object can be created by another thread */
+	if (dt_object_exists(obj))
+		RETURN(0);
+
+	los = res->private_data;
+	LASSERT(los);
+
+	rc = llog_osd_declare_new_object(env, los, obj, th);
+	if (rc == 0)
+		rc = dt_declare_record_write(env, obj, LLOG_CHUNK_SIZE, 0, th);
+	if (rc != 0 || res->lgh_name == NULL)
+		RETURN(rc);
+
+	/* named llog: also declare its entry in the llog directory */
+	dir = llog_osd_dir_get(env, res->lgh_ctxt);
+	if (IS_ERR(dir))
+		RETURN(PTR_ERR(dir));
+
+	logid_to_fid(&res->lgh_id, &info->lgi_fid);
+	rc = dt_declare_insert(env, dir, (struct dt_rec *)&info->lgi_fid,
+			       (struct dt_key *)res->lgh_name, th);
+	lu_object_put(env, &dir->do_lu);
+	if (rc)
+		CERROR("%s: can't declare named llog %s: rc = %d\n",
+		       obj->do_lu.lo_dev->ld_obd->obd_name,
+		       res->lgh_name, rc);
+
+	RETURN(rc);
+}
+
+/* lop_create callback of llog_osd_ops: create the object of @res inside
+ * transaction @th and, for a named llog, insert the name into the llog
+ * directory.  Assumes the caller has already pushed us into the kernel
+ * context.  Returns -EEXIST when the object already exists. */
+static int llog_osd_create(const struct lu_env *env, struct llog_handle *res,
+			   struct thandle *th)
+{
+	struct llog_thread_info		*info = llog_info(env);
+	struct local_oid_storage	*los;
+	struct dt_object		*obj, *dir;
+	int				 rc;
+
+	ENTRY;
+
+	LASSERT(env);
+	obj = res->lgh_obj;
+	LASSERT(obj);
+
+	/* llog can be already created */
+	if (dt_object_exists(obj))
+		RETURN(-EEXIST);
+
+	los = res->private_data;
+	LASSERT(los);
+
+	/* check again under the write lock before actually creating */
+	dt_write_lock(env, obj, 0);
+	if (dt_object_exists(obj))
+		rc = -EEXIST;
+	else
+		rc = llog_osd_create_new_object(env, los, obj, th);
+	dt_write_unlock(env, obj);
+	if (rc)
+		RETURN(rc);
+	if (res->lgh_name == NULL)
+		RETURN(0);
+
+	/* named llog: enter the name into the llog directory */
+	dir = llog_osd_dir_get(env, res->lgh_ctxt);
+	if (IS_ERR(dir))
+		RETURN(PTR_ERR(dir));
+
+	logid_to_fid(&res->lgh_id, &info->lgi_fid);
+	dt_read_lock(env, dir, 0);
+	rc = dt_insert(env, dir, (struct dt_rec *)&info->lgi_fid,
+		       (struct dt_key *)res->lgh_name, th, BYPASS_CAPA, 1);
+	dt_read_unlock(env, dir);
+	lu_object_put(env, &dir->do_lu);
+	if (rc)
+		CERROR("%s: can't create named llog %s: rc = %d\n",
+		       obj->do_lu.lo_dev->ld_obd->obd_name,
+		       res->lgh_name, rc);
+
+	RETURN(rc);
+}
+
+/* Undo llog_osd_open(): drop the object and local-OID-storage references
+ * stored in @handle and free its name copy, if any.  Always returns 0. */
+static int llog_osd_close(const struct lu_env *env, struct llog_handle *handle)
+{
+	struct local_oid_storage *storage;
+
+	ENTRY;
+
+	LASSERT(handle->lgh_obj);
+	lu_object_put(env, &handle->lgh_obj->do_lu);
+
+	storage = handle->private_data;
+	LASSERT(storage);
+	dt_los_put(storage);
+
+	if (handle->lgh_name != NULL)
+		OBD_FREE(handle->lgh_name, strlen(handle->lgh_name) + 1);
+
+	RETURN(0);
+}
+
+/* Destroy the llog object of @loghandle in a local transaction; for a named
+ * llog the name is removed from the llog directory first.  Returns 0 or a
+ * negative errno (0 as well when the object no longer exists). */
+static int llog_osd_destroy(const struct lu_env *env,
+			    struct llog_handle *loghandle)
+{
+	struct llog_ctxt	*ctxt;
+	struct dt_object	*o, *llog_dir = NULL;
+	struct dt_device	*d;
+	struct thandle		*th;
+	char			*name = NULL;
+	int			 rc;
+
+	ENTRY;
+
+	ctxt = loghandle->lgh_ctxt;
+	LASSERT(ctxt);
+	o = loghandle->lgh_obj;
+	LASSERT(o);
+	d = lu2dt_dev(o->do_lu.lo_dev);
+	LASSERT(d);
+	LASSERT(d == ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt);
+
+	th = dt_trans_create(env, d);
+	if (IS_ERR(th))
+		RETURN(PTR_ERR(th));
+
+	if (loghandle->lgh_name) {
+		llog_dir = llog_osd_dir_get(env, ctxt);
+		if (IS_ERR(llog_dir))
+			GOTO(out_trans, rc = PTR_ERR(llog_dir));
+		name = loghandle->lgh_name;
+		rc = dt_declare_delete(env, llog_dir,
+				       (struct dt_key *)name, th);
+		if (rc)
+			GOTO(out_trans, rc);
+	}
+
+	dt_declare_ref_del(env, o, th);
+	rc = dt_declare_destroy(env, o, th);
+	if (rc)
+		GOTO(out_trans, rc);
+
+	rc = dt_trans_start_local(env, d, th);
+	if (rc)
+		GOTO(out_trans, rc);
+
+	dt_write_lock(env, o, 0);
+	if (dt_object_exists(o)) {
+		if (name) {
+			dt_read_lock(env, llog_dir, 0);
+			rc = dt_delete(env, llog_dir,
+				       (struct dt_key *) name,
+				       th, BYPASS_CAPA);
+			dt_read_unlock(env, llog_dir);
+			if (rc) {
+				CERROR("%s: can't remove llog %s: rc = %d\n",
+				       o->do_lu.lo_dev->ld_obd->obd_name,
+				       name, rc);
+				GOTO(out_unlock, rc);
+			}
+		}
+		dt_ref_del(env, o, th);
+		rc = dt_destroy(env, o, th);
+		if (rc)
+			GOTO(out_unlock, rc);
+	}
+out_unlock:
+	dt_write_unlock(env, o);
+out_trans:
+	dt_trans_stop(env, d, th);
+	/* llog_dir holds an ERR_PTR when llog_osd_dir_get() failed: skip it */
+	if (llog_dir != NULL && !IS_ERR(llog_dir))
+		lu_object_put(env, &llog_dir->do_lu);
+	RETURN(rc);
+}
+
+/* Initialize local OID storage for the FID_SEQ_LLOG and FID_SEQ_LLOG_NAME
+ * sequences on the device of @disk_obd; returns 0 or a negative errno. */
+static int llog_osd_setup(const struct lu_env *env, struct obd_device *obd,
+			  struct obd_llog_group *olg, int ctxt_idx,
+			  struct obd_device *disk_obd)
+{
+	struct local_oid_storage	*los;
+	struct llog_thread_info		*lgi = llog_info(env);
+	struct llog_ctxt		*ctxt;
+	int				 rc = 0;
+
+	ENTRY;
+
+	LASSERT(obd);
+	LASSERT(olg->olg_ctxts[ctxt_idx]);
+	ctxt = llog_ctxt_get(olg->olg_ctxts[ctxt_idx]);
+	LASSERT(ctxt);
+	/* seed fid for generating new fids, literally we need a sequence */
+	lgi->lgi_fid.f_seq = FID_SEQ_LLOG;
+	lgi->lgi_fid.f_oid = 1;
+	lgi->lgi_fid.f_ver = 0;
+	rc = local_oid_storage_init(env, disk_obd->obd_lvfs_ctxt.dt,
+				    &lgi->lgi_fid, &los);
+	if (rc < 0)
+		GOTO(out, rc);	/* must still drop the ctxt reference */
+
+	lgi->lgi_fid.f_seq = FID_SEQ_LLOG_NAME;
+	lgi->lgi_fid.f_oid = 1;
+	lgi->lgi_fid.f_ver = 0;
+	rc = local_oid_storage_init(env, disk_obd->obd_lvfs_ctxt.dt,
+				    &lgi->lgi_fid, &los);
+out:
+	llog_ctxt_put(ctxt);
+	RETURN(rc);
+}
+
+/* Put and finalize the two local OID storages (FID_SEQ_LLOG, FID_SEQ_LLOG_NAME)
+ * found on the device of @ctxt.  Returns 0, or the ls_device_get() error. */
+static int llog_osd_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt)
+{
+	struct ls_device		*ls;
+	struct local_oid_storage	*los, *nlos;
+
+	LASSERT(ctxt->loc_exp->exp_obd);
+	ls = ls_device_get(ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt);
+	if (IS_ERR(ls))
+		return PTR_ERR(ls);	/* plain return: no ENTRY here */
+
+	mutex_lock(&ls->ls_los_mutex);
+	los = dt_los_find(ls, FID_SEQ_LLOG);
+	nlos = dt_los_find(ls, FID_SEQ_LLOG_NAME);
+	mutex_unlock(&ls->ls_los_mutex);
+	if (los != NULL) {
+		dt_los_put(los);
+		local_oid_storage_fini(env, los);
+	}
+	if (nlos != NULL) {
+		dt_los_put(nlos);
+		local_oid_storage_fini(env, nlos);
+	}
+	ls_device_put(env, ls);
+	return 0;
+}
+
+struct llog_operations llog_osd_ops = {
+	.lop_setup		= llog_osd_setup,	/* context lifetime */
+	.lop_cleanup		= llog_osd_cleanup,
+	.lop_open		= llog_osd_open,	/* handle lifetime */
+	.lop_close		= llog_osd_close,
+	.lop_exist		= llog_osd_exist,
+	.lop_declare_create	= llog_osd_declare_create,
+	.lop_create		= llog_osd_create,
+	.lop_destroy		= llog_osd_destroy,
+	.lop_read_header	= llog_osd_read_header,	/* reading */
+	.lop_next_block		= llog_osd_next_block,
+	.lop_prev_block		= llog_osd_prev_block,
+	.lop_declare_write_rec	= llog_osd_declare_write_rec, /* writing */
+	.lop_write_rec		= llog_osd_write_rec,
+};
+EXPORT_SYMBOL(llog_osd_ops);
+
+/* Read @count catalog IDs starting at index @idx from the CATALOGS object of
+ * device @d into @idarray, creating the object first if it does not exist.
+ * Entries past the end of the file are returned zeroed.  If @idarray is NULL
+ * the number of stored IDs is returned instead.  Negative errno on failure. */
+int llog_osd_get_cat_list(const struct lu_env *env, struct dt_device *d,
+			  int idx, int count, struct llog_catid *idarray)
+{
+	struct llog_thread_info	*lgi = llog_info(env);
+	struct dt_object	*o = NULL;
+	struct thandle		*th;
+	int			 rc, size;
+
+	ENTRY;
+
+	LASSERT(d);
+	size = sizeof(*idarray) * count;
+	lgi->lgi_off = idx *  sizeof(*idarray);
+	lu_local_obj_fid(&lgi->lgi_fid, LLOG_CATALOGS_OID);
+
+	o = dt_locate(env, d, &lgi->lgi_fid);
+	if (IS_ERR(o))
+		RETURN(PTR_ERR(o));
+
+	if (!dt_object_exists(o)) {
+		th = dt_trans_create(env, d);
+		if (IS_ERR(th))
+			GOTO(out, rc = PTR_ERR(th));
+
+		lgi->lgi_attr.la_valid = LA_MODE;
+		lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+		lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG);
+		rc = dt_declare_create(env, o, &lgi->lgi_attr, NULL,
+				       &lgi->lgi_dof, th);
+		if (rc)
+			GOTO(out_trans, rc);
+
+		rc = dt_trans_start_local(env, d, th);
+		if (rc)
+			GOTO(out_trans, rc);
+
+		dt_write_lock(env, o, 0);
+		if (!dt_object_exists(o))
+			rc = dt_create(env, o, &lgi->lgi_attr, NULL,
+				       &lgi->lgi_dof, th);
+		dt_write_unlock(env, o);
+out_trans:
+		dt_trans_stop(env, d, th);
+		if (rc)
+			GOTO(out, rc);
+	}
+
+	rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+	if (rc)
+		GOTO(out, rc);
+
+	if (!S_ISREG(lgi->lgi_attr.la_mode)) {
+		CERROR("%s: CATALOGS is not a regular file!: mode = %o\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name,
+		       lgi->lgi_attr.la_mode);
+		GOTO(out, rc = -ENOENT);
+	}
+
+	CDEBUG(D_CONFIG, "cat list: disk size=%d, read=%d\n",
+	       (int)lgi->lgi_attr.la_size, size);
+
+	/* return just number of llogs */
+	if (idarray == NULL) {
+		rc = lgi->lgi_attr.la_size / sizeof(*idarray);
+		GOTO(out, rc);
+	}
+
+	/* read for new ost index or for empty file: nothing is stored at
+	 * or past lgi_off, so the zeroed array is the whole answer */
+	memset(idarray, 0, size);
+	if (lgi->lgi_attr.la_size <= lgi->lgi_off)
+		GOTO(out, rc = 0);
+	/* file ends inside the requested range: read only what is there */
+	if (lgi->lgi_attr.la_size < lgi->lgi_off + size)
+		size = lgi->lgi_attr.la_size - lgi->lgi_off;
+
+	lgi->lgi_buf.lb_buf = idarray;
+	lgi->lgi_buf.lb_len = size;
+	rc = dt_record_read(env, o, &lgi->lgi_buf, &lgi->lgi_off);
+	if (rc) {
+		CERROR("%s: error reading CATALOGS: rc = %d\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name,  rc);
+		GOTO(out, rc);
+	}
+out:
+	lu_object_put(env, &o->do_lu);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_osd_get_cat_list);
+
+/* Write @count catalog IDs from @idarray into the existing CATALOGS object of
+ * device @d, starting at index @idx.  Returns 0 or a negative errno. */
+int llog_osd_put_cat_list(const struct lu_env *env, struct dt_device *d,
+			  int idx, int count, struct llog_catid *idarray)
+{
+	struct llog_thread_info	*lgi = llog_info(env);
+	struct dt_object	*o = NULL;
+	struct thandle		*th;
+	int			 rc, size;
+
+	ENTRY;
+
+	if (!count)
+		RETURN(0);
+
+	LASSERT(d);
+	size = sizeof(*idarray) * count;
+	lgi->lgi_off = idx * sizeof(*idarray);
+	lu_local_obj_fid(&lgi->lgi_fid, LLOG_CATALOGS_OID);
+	o = dt_locate(env, d, &lgi->lgi_fid);
+	if (IS_ERR(o))
+		RETURN(PTR_ERR(o));
+
+	if (!dt_object_exists(o))
+		GOTO(out, rc = -ENOENT);
+
+	rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+	if (rc)
+		GOTO(out, rc);
+
+	if (!S_ISREG(lgi->lgi_attr.la_mode)) {
+		CERROR("%s: CATALOGS is not a regular file!: mode = %o\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name,
+		       lgi->lgi_attr.la_mode);
+		GOTO(out, rc = -ENOENT);
+	}
+
+	th = dt_trans_create(env, d);
+	if (IS_ERR(th))
+		GOTO(out, rc = PTR_ERR(th));
+
+	rc = dt_declare_record_write(env, o, size, lgi->lgi_off, th);
+	if (rc)
+		GOTO(out_trans, rc);	/* th exists and must be stopped */
+
+	rc = dt_trans_start_local(env, d, th);
+	if (rc)
+		GOTO(out_trans, rc);
+
+	lgi->lgi_buf.lb_buf = idarray;
+	lgi->lgi_buf.lb_len = size;
+	rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th);
+	if (rc)
+		CDEBUG(D_INODE, "error writeing CATALOGS: rc = %d\n", rc);
+out_trans:
+	dt_trans_stop(env, d, th);
+out:
+	lu_object_put(env, &o->do_lu);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_osd_put_cat_list);
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_swab.c b/drivers/staging/lustre/lustre/obdclass/llog_swab.c
new file mode 100644
index 000000000000..ea70b99706f6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_swab.c
@@ -0,0 +1,402 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_swab.c
+ *
+ * Swabbing of llog datatypes (from disk or over the wire).
+ *
+ * Author: jacob berkman  <jacob@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include <lustre_log.h>
+
+static void print_llogd_body(struct llogd_body *d)
+{	/* dump every field of @d at D_OTHER debug level */
+	CDEBUG(D_OTHER, "llogd body: %p\n", d);
+	CDEBUG(D_OTHER, "\tlgd_logid.lgl_oi: "DOSTID"\n",
+	       POSTID(&d->lgd_logid.lgl_oi));
+	CDEBUG(D_OTHER, "\tlgd_logid.lgl_ogen: %#x\n", d->lgd_logid.lgl_ogen);
+	CDEBUG(D_OTHER, "\tlgd_ctxt_idx: %#x\n", d->lgd_ctxt_idx);
+	CDEBUG(D_OTHER, "\tlgd_llh_flags: %#x\n", d->lgd_llh_flags);
+	CDEBUG(D_OTHER, "\tlgd_index: %#x\n", d->lgd_index);
+	CDEBUG(D_OTHER, "\tlgd_saved_index: %#x\n", d->lgd_saved_index);
+	CDEBUG(D_OTHER, "\tlgd_len: %#x\n", d->lgd_len);
+	CDEBUG(D_OTHER, "\tlgd_cur_offset: "LPX64"\n", d->lgd_cur_offset);
+}
+
+void lustre_swab_lu_fid(struct lu_fid *fid)
+{
+	__swab64s(&fid->f_seq);		/* one 64-bit field ... */
+	__swab32s(&fid->f_oid);		/* ... and two 32-bit fields, */
+	__swab32s(&fid->f_ver);		/* each swabbed in place */
+}
+EXPORT_SYMBOL(lustre_swab_lu_fid);
+
+void lustre_swab_ost_id(struct ost_id *oid)
+{
+	if (!fid_seq_is_mdt0(oid->oi.oi_seq)) {	/* tested before swabbing */
+		lustre_swab_lu_fid(&oid->oi_fid);	/* swab as a lu_fid */
+		return;
+	}
+	__swab64s(&oid->oi.oi_id);	/* otherwise two 64-bit fields */
+	__swab64s(&oid->oi.oi_seq);
+}
+EXPORT_SYMBOL(lustre_swab_ost_id);
+
+void lustre_swab_llogd_body(struct llogd_body *d)
+{
+	ENTRY;
+	print_llogd_body(d);			/* dump before swabbing */
+	lustre_swab_ost_id(&d->lgd_logid.lgl_oi);
+	__swab32s(&d->lgd_logid.lgl_ogen);
+	__swab32s(&d->lgd_ctxt_idx);
+	__swab32s(&d->lgd_llh_flags);
+	__swab32s(&d->lgd_index);
+	__swab32s(&d->lgd_saved_index);
+	__swab32s(&d->lgd_len);
+	__swab64s(&d->lgd_cur_offset);
+	print_llogd_body(d);			/* dump after swabbing */
+	EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_llogd_body);
+
+void lustre_swab_llogd_conn_body(struct llogd_conn_body *d)
+{
+	__swab64s(&d->lgdc_gen.mnt_cnt);	/* lgdc_gen: two 64-bit counts */
+	__swab64s(&d->lgdc_gen.conn_cnt);
+	lustre_swab_ost_id(&d->lgdc_logid.lgl_oi);
+	__swab32s(&d->lgdc_logid.lgl_ogen);
+	__swab32s(&d->lgdc_ctxt_idx);
+}
+EXPORT_SYMBOL(lustre_swab_llogd_conn_body);
+
+void lustre_swab_ll_fid(struct ll_fid *fid)
+{
+	__swab64s(&fid->id);		/* one 64-bit field ... */
+	__swab32s(&fid->generation);	/* ... and two 32-bit fields, */
+	__swab32s(&fid->f_type);	/* each swabbed in place */
+}
+EXPORT_SYMBOL(lustre_swab_ll_fid);
+
+void lustre_swab_lu_seq_range(struct lu_seq_range *range)
+{
+	__swab64s(&range->lsr_start);	/* 64-bit start/end pair */
+	__swab64s(&range->lsr_end);
+	__swab32s(&range->lsr_index);	/* 32-bit index and flags */
+	__swab32s(&range->lsr_flags);
+}
+EXPORT_SYMBOL(lustre_swab_lu_seq_range);
+
+/* Swab one llog record in place: common header, type-specific body, tail. */
+void lustre_swab_llog_rec(struct llog_rec_hdr *rec)
+{
+	struct llog_rec_tail *tail = NULL;
+
+	__swab32s(&rec->lrh_len);
+	__swab32s(&rec->lrh_index);
+	__swab32s(&rec->lrh_type);
+	__swab32s(&rec->lrh_id);
+
+	switch (rec->lrh_type) {	/* the type as swabbed just above */
+	case OST_SZ_REC:
+	{
+		struct llog_size_change_rec *lsc =
+			(struct llog_size_change_rec *)rec;
+
+		lustre_swab_ll_fid(&lsc->lsc_fid);
+		__swab32s(&lsc->lsc_ioepoch);
+		tail = &lsc->lsc_tail;
+		break;
+	}
+	case MDS_UNLINK_REC:
+	{
+		struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec;
+
+		__swab64s(&lur->lur_oid);
+		__swab32s(&lur->lur_oseq);
+		__swab32s(&lur->lur_count);
+		tail = &lur->lur_tail;
+		break;
+	}
+	case MDS_UNLINK64_REC:
+	{
+		struct llog_unlink64_rec *lur =
+			(struct llog_unlink64_rec *)rec;
+
+		lustre_swab_lu_fid(&lur->lur_fid);
+		__swab32s(&lur->lur_count);
+		tail = &lur->lur_tail;
+		break;
+	}
+	case CHANGELOG_REC:
+	{
+		struct llog_changelog_rec *cr = (struct llog_changelog_rec*)rec;
+
+		__swab16s(&cr->cr.cr_namelen);
+		__swab16s(&cr->cr.cr_flags);
+		__swab32s(&cr->cr.cr_type);
+		__swab64s(&cr->cr.cr_index);
+		__swab64s(&cr->cr.cr_prev);
+		__swab64s(&cr->cr.cr_time);
+		lustre_swab_lu_fid(&cr->cr.cr_tfid);
+		lustre_swab_lu_fid(&cr->cr.cr_pfid);
+		if (CHANGELOG_REC_EXTENDED(&cr->cr)) {
+			struct llog_changelog_ext_rec *ext =
+				(struct llog_changelog_ext_rec *)rec;
+
+			lustre_swab_lu_fid(&ext->cr.cr_sfid);
+			lustre_swab_lu_fid(&ext->cr.cr_spfid);
+			tail = &ext->cr_tail;
+		} else {
+			tail = &cr->cr_tail;
+		}
+		break;
+	}
+	case CHANGELOG_USER_REC:
+	{
+		struct llog_changelog_user_rec *cur =
+			(struct llog_changelog_user_rec*)rec;
+
+		__swab32s(&cur->cur_id);
+		__swab64s(&cur->cur_endrec);
+		tail = &cur->cur_tail;
+		break;
+	}
+	case MDS_SETATTR64_REC:
+	{
+		struct llog_setattr64_rec *lsr =
+			(struct llog_setattr64_rec *)rec;
+
+		lustre_swab_ost_id(&lsr->lsr_oi);
+		__swab32s(&lsr->lsr_uid);
+		__swab32s(&lsr->lsr_uid_h);
+		__swab32s(&lsr->lsr_gid);
+		__swab32s(&lsr->lsr_gid_h);
+		tail = &lsr->lsr_tail;
+		break;
+	}
+	case OBD_CFG_REC:
+		/* these are swabbed as they are consumed */
+		break;
+	case LLOG_HDR_MAGIC:
+	{
+		struct llog_log_hdr *llh = (struct llog_log_hdr *)rec;
+
+		__swab64s(&llh->llh_timestamp);
+		__swab32s(&llh->llh_count);
+		__swab32s(&llh->llh_bitmap_offset);
+		__swab32s(&llh->llh_flags);
+		__swab32s(&llh->llh_size);
+		__swab32s(&llh->llh_cat_idx);
+		tail = &llh->llh_tail;
+		break;
+	}
+	case LLOG_LOGID_MAGIC:
+	{
+		struct llog_logid_rec *lid = (struct llog_logid_rec *)rec;
+
+		lustre_swab_ost_id(&lid->lid_id.lgl_oi);
+		__swab32s(&lid->lid_id.lgl_ogen);
+		tail = &lid->lid_tail;
+		break;
+	}
+	case LLOG_GEN_REC:
+	{
+		struct llog_gen_rec *lgr = (struct llog_gen_rec *)rec;
+
+		__swab64s(&lgr->lgr_gen.mnt_cnt);
+		__swab64s(&lgr->lgr_gen.conn_cnt);
+		tail = &lgr->lgr_tail;
+		break;
+	}
+	case LLOG_PAD_MAGIC:	/* no body fields; tail stays NULL */
+		break;
+	default:		/* only the header was swabbed */
+		CERROR("Unknown llog rec type %#x swabbing rec %p\n",
+		       rec->lrh_type, rec);
+	}
+
+	if (tail) {	/* cases that left tail NULL skip the tail swab */
+		__swab32s(&tail->lrt_len);
+		__swab32s(&tail->lrt_index);
+	}
+}
+EXPORT_SYMBOL(lustre_swab_llog_rec);
+
+static void print_llog_hdr(struct llog_log_hdr *h)
+{	/* dump the header, its fields and its tail at D_OTHER debug level */
+	CDEBUG(D_OTHER, "llog header: %p\n", h);
+	CDEBUG(D_OTHER, "\tllh_hdr.lrh_index: %#x\n", h->llh_hdr.lrh_index);
+	CDEBUG(D_OTHER, "\tllh_hdr.lrh_len: %#x\n", h->llh_hdr.lrh_len);
+	CDEBUG(D_OTHER, "\tllh_hdr.lrh_type: %#x\n", h->llh_hdr.lrh_type);
+	CDEBUG(D_OTHER, "\tllh_timestamp: "LPX64"\n", h->llh_timestamp);
+	CDEBUG(D_OTHER, "\tllh_count: %#x\n", h->llh_count);
+	CDEBUG(D_OTHER, "\tllh_bitmap_offset: %#x\n", h->llh_bitmap_offset);
+	CDEBUG(D_OTHER, "\tllh_flags: %#x\n", h->llh_flags);
+	CDEBUG(D_OTHER, "\tllh_size: %#x\n", h->llh_size);
+	CDEBUG(D_OTHER, "\tllh_cat_idx: %#x\n", h->llh_cat_idx);
+	CDEBUG(D_OTHER, "\tllh_tail.lrt_index: %#x\n", h->llh_tail.lrt_index);
+	CDEBUG(D_OTHER, "\tllh_tail.lrt_len: %#x\n", h->llh_tail.lrt_len);
+}
+
+/* Swab a whole llog header in place through lustre_swab_llog_rec(), dumping
+ * it to the debug log before and after the conversion. */
+void lustre_swab_llog_hdr(struct llog_log_hdr *h)
+{
+	ENTRY;
+	print_llog_hdr(h);			/* before */
+	lustre_swab_llog_rec(&h->llh_hdr);	/* handles LLOG_HDR_MAGIC */
+	print_llog_hdr(h);			/* after */
+	EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_llog_hdr);
+
+/* Dump @lcfg at D_OTHER debug level; does nothing when D_OTHER is off. */
+static void print_lustre_cfg(struct lustre_cfg *lcfg)
+{
+	int i;
+
+	if (!(libcfs_debug & D_OTHER)) /* don't loop on nothing */
+		return;
+	ENTRY;	/* only after the early return, so it always pairs with EXIT */
+	CDEBUG(D_OTHER, "lustre_cfg: %p\n", lcfg);
+	CDEBUG(D_OTHER, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version);
+	CDEBUG(D_OTHER, "\tlcfg->lcfg_command: %#x\n", lcfg->lcfg_command);
+	CDEBUG(D_OTHER, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num);
+	CDEBUG(D_OTHER, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags);
+	CDEBUG(D_OTHER, "\tlcfg->lcfg_nid: %s\n", libcfs_nid2str(lcfg->lcfg_nid));
+
+	CDEBUG(D_OTHER, "\tlcfg->lcfg_bufcount: %d\n", lcfg->lcfg_bufcount);
+	if (lcfg->lcfg_bufcount < LUSTRE_CFG_MAX_BUFCOUNT)
+		for (i = 0; i < lcfg->lcfg_bufcount; i++)
+			CDEBUG(D_OTHER, "\tlcfg->lcfg_buflens[%d]: %d\n",
+			       i, lcfg->lcfg_buflens[i]);
+	EXIT;
+}
+
+/* Swab a lustre_cfg header in place.  The version goes first; the remaining
+ * fields are only swabbed when it then equals LUSTRE_CFG_VERSION. */
+void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg)
+{
+	int k;
+
+	ENTRY;
+	__swab32s(&lcfg->lcfg_version);
+	if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) {
+		CERROR("not swabbing lustre_cfg version %#x (expecting %#x)\n",
+		       lcfg->lcfg_version, LUSTRE_CFG_VERSION);
+		EXIT;
+		return;
+	}
+
+	__swab32s(&lcfg->lcfg_command);
+	__swab32s(&lcfg->lcfg_num);
+	__swab32s(&lcfg->lcfg_flags);
+	__swab64s(&lcfg->lcfg_nid);
+	__swab32s(&lcfg->lcfg_bufcount);
+	for (k = 0; k < LUSTRE_CFG_MAX_BUFCOUNT && k < lcfg->lcfg_bufcount; k++)
+		__swab32s(&lcfg->lcfg_buflens[k]);
+
+	print_lustre_cfg(lcfg);
+	EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_lustre_cfg);
+
+/* used only for compatibility with old on-disk cfg_marker data */
+struct cfg_marker32 {
+	__u32   cm_step;
+	__u32   cm_flags;
+	__u32   cm_vers;
+	__u32   padding;
+	__u32   cm_createtime;	/* 32-bit here; read back as __u32 on convert */
+	__u32   cm_canceltime;	/* 32-bit here; read back as __u32 on convert */
+	char    cm_tgtname[MTI_NAME_MAXLEN];
+	char    cm_comment[MTI_NAME_MAXLEN];
+};
+/* bytes of cm_comment moved when a cfg_marker32 is converted in place */
+#define MTI_NAMELEN32    (MTI_NAME_MAXLEN - \
+	(sizeof(struct cfg_marker) - sizeof(struct cfg_marker32)))
+
+/* Convert a cfg_marker to the current host layout, in place.  @swab requests
+ * byte swapping; @size == sizeof(struct cfg_marker32) selects conversion
+ * from the old layout with 32-bit timestamps. */
+void lustre_swab_cfg_marker(struct cfg_marker *marker, int swab, int size)
+{
+	struct cfg_marker32 *cm32 = (struct cfg_marker32*)marker;
+	ENTRY;
+
+	if (swab) {
+		__swab32s(&marker->cm_step);
+		__swab32s(&marker->cm_flags);
+		__swab32s(&marker->cm_vers);
+	}
+	if (size == sizeof(*cm32)) {
+		__u32 createtime, canceltime;
+		/* There was a problem with the original declaration of
+		 * cfg_marker on 32-bit systems because it used time_t as
+		 * a wire protocol structure, and didn't verify this in
+		 * wirecheck.  We now have to convert the offsets of the
+		 * later fields in order to work on 32- and 64-bit systems.
+		 * Fortunately, the cm_comment field has no functional use
+		 * so can be sacrificed when converting the timestamp size.
+		 * Overwrite fields from the end first, so they are not
+		 * clobbered, and use memmove() instead of memcpy() because
+		 * the source and target buffers overlap.  bug 16771 */
+		createtime = cm32->cm_createtime;
+		canceltime = cm32->cm_canceltime;
+		memmove(marker->cm_comment, cm32->cm_comment, MTI_NAMELEN32);
+		marker->cm_comment[MTI_NAMELEN32 - 1] = '\0';
+		memmove(marker->cm_tgtname, cm32->cm_tgtname,
+			sizeof(marker->cm_tgtname));
+		if (swab) {
+			__swab32s(&createtime);
+			__swab32s(&canceltime);
+		}
+		marker->cm_createtime = createtime;
+		marker->cm_canceltime = canceltime;
+		CDEBUG(D_CONFIG, "Find old cfg_marker(Srv32b,Clt64b) "
+		       "for target %s, converting\n",
+		       marker->cm_tgtname);
+	} else if (swab) {
+		__swab64s(&marker->cm_createtime);
+		__swab64s(&marker->cm_canceltime);
+	}
+	EXIT;
+	return;
+}
+EXPORT_SYMBOL(lustre_swab_cfg_marker);
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_test.c b/drivers/staging/lustre/lustre/obdclass/llog_test.c
new file mode 100644
index 000000000000..d397f781ec43
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_test.c
@@ -0,0 +1,1087 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_test.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <obd_class.h>
+#include <lustre_fid.h>
+#include <lustre_log.h>
+
+/* This is slightly more than the number of records that can fit into a
+ * single llog file, because the llog_log_header takes up some of the
+ * space in the first block that cannot be used for the bitmap. */
+#define LLOG_TEST_RECNUM  (LLOG_CHUNK_SIZE * 8)
+
+static int llog_test_rand; /* set from cfs_rand() in llog_test_setup(); printed in hex to name the test logs */
+static struct obd_uuid uuid = { .uuid = "test_uuid" }; /* passed to llog_init_handle() by the tests */
+static struct llog_logid cat_logid; /* id of the catalog created in test 4, re-opened in test 5 */
+
+struct llog_mini_rec { /* header plus tail only; tests 4 and 5 set both lengths to LLOG_MIN_REC_SIZE */
+	struct llog_rec_hdr     lmr_hdr;
+	struct llog_rec_tail    lmr_tail;
+} __attribute__((packed));
+
+/* Check @llh's bitmap and counters against @num_recs; 0 or -ERANGE. */
+static int verify_handle(char *test, struct llog_handle *llh, int num_recs)
+{
+	int last_idx = 0, active_recs = 0;
+	int i;
+
+	for (i = 0; i < LLOG_BITMAP_BYTES * 8; i++) { /* count set bits, keep the highest */
+		if (ext2_test_bit(i, llh->lgh_hdr->llh_bitmap)) {
+			last_idx = i;
+			active_recs++;
+		}
+	}
+
+	if (active_recs != num_recs) {
+		CERROR("%s: expected %d active recs after write, found %d\n",
+		       test, num_recs, active_recs);
+		return -ERANGE; /* plain return: this helper has no ENTRY to pair with RETURN() */
+	}
+
+	if (llh->lgh_hdr->llh_count != num_recs) {
+		CERROR("%s: handle->count is %d, expected %d after write\n",
+		       test, llh->lgh_hdr->llh_count, num_recs);
+		return -ERANGE;
+	}
+
+	if (llh->lgh_last_idx < last_idx) { /* last index may not lag the bitmap */
+		CERROR("%s: handle->last_idx is %d, expected %d after write\n",
+		       test, llh->lgh_last_idx, last_idx);
+		return -ERANGE;
+	}
+
+	return 0;
+}
+
+/* Test 1: create a named plain log, verify its fresh state, close it */
+static int llog_test_1(const struct lu_env *env,
+		       struct obd_device *obd, char *name)
+{
+	struct llog_handle	*llh;
+	struct llog_ctxt	*ctxt;
+	int rc;
+	int rc2;
+
+	ENTRY;
+
+	CWARN("1a: create a log with name: %s\n", name);
+	ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+	LASSERT(ctxt);
+
+	rc = llog_open_create(env, ctxt, &llh, NULL, name);
+	if (rc) {
+		CERROR("1a: llog_create with name %s failed: %d\n", name, rc);
+		GOTO(out, rc);
+	}
+	rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, &uuid);
+	if (rc) {
+		CERROR("1a: can't init llog handle: %d\n", rc);
+		GOTO(out_close, rc);
+	}
+
+	rc = verify_handle("1", llh, 1); /* expect exactly one active record */
+
+	CWARN("1b: close newly-created log\n");
+out_close:
+	rc2 = llog_close(env, llh);
+	if (rc2) {
+		CERROR("1b: close log %s failed: %d\n", name, rc2);
+		if (rc == 0) /* keep the first error seen */
+			rc = rc2;
+	}
+out:
+	llog_ctxt_put(ctxt); /* pairs with llog_get_context() above */
+	RETURN(rc);
+}
+
+/* Test 2: re-open the named log; on success *llh is left open for the caller */
+static int llog_test_2(const struct lu_env *env, struct obd_device *obd,
+		       char *name, struct llog_handle **llh)
+{
+	struct llog_ctxt	*ctxt;
+	struct llog_handle	*loghandle;
+	struct llog_logid	 logid;
+	int			 rc;
+
+	ENTRY;
+
+	CWARN("2a: re-open a log with name: %s\n", name);
+	ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+	LASSERT(ctxt);
+
+	rc = llog_open(env, ctxt, llh, NULL, name, LLOG_OPEN_EXISTS);
+	if (rc) {
+		CERROR("2a: re-open log with name %s failed: %d\n", name, rc);
+		GOTO(out_put, rc);
+	}
+
+	rc = llog_init_handle(env, *llh, LLOG_F_IS_PLAIN, &uuid);
+	if (rc) {
+		CERROR("2a: can't init llog handle: %d\n", rc);
+		GOTO(out_close_llh, rc);
+	}
+
+	rc = verify_handle("2", *llh, 1);
+	if (rc)
+		GOTO(out_close_llh, rc);
+
+	/* XXX: known issue with test 2b: MGS is not able to create an
+	 * anonymous llog, so exit now to let the following tests run; steps
+	 * 2b-2d below stay unreachable until llog over OSD code fixes it */
+	GOTO(out_put, rc);
+
+	CWARN("2b: create a log without specified NAME & LOGID\n");
+	rc = llog_open_create(env, ctxt, &loghandle, NULL, NULL);
+	if (rc) {
+		CERROR("2b: create log failed\n");
+		GOTO(out_close_llh, rc);
+	}
+	rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, &uuid);
+	if (rc) {
+		CERROR("2b: can't init llog handle: %d\n", rc);
+		GOTO(out_close, rc);
+	}
+
+	logid = loghandle->lgh_id; /* remember the id so 2c can re-open by it */
+	llog_close(env, loghandle);
+
+	CWARN("2c: re-open the log by LOGID\n");
+	rc = llog_open(env, ctxt, &loghandle, &logid, NULL, LLOG_OPEN_EXISTS);
+	if (rc) {
+		CERROR("2c: re-open log by LOGID failed\n");
+		GOTO(out_close_llh, rc);
+	}
+
+	rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, &uuid);
+	if (rc) {
+		CERROR("2c: can't init llog handle: %d\n", rc);
+		GOTO(out_close, rc);
+	}
+
+	CWARN("2d: destroy this log\n");
+	rc = llog_destroy(env, loghandle);
+	if (rc)
+		CERROR("2d: destroy log failed\n");
+out_close:
+	llog_close(env, loghandle);
+out_close_llh:
+	if (rc) /* only close *llh on failure; success hands it to the caller */
+		llog_close(env, *llh);
+out_put:
+	llog_ctxt_put(ctxt);
+
+	RETURN(rc);
+}
+
+/* Test 3: write records one by one and in bulk until the log reports -ENOSPC */
+static int llog_test_3(const struct lu_env *env, struct obd_device *obd,
+		       struct llog_handle *llh)
+{
+	struct llog_gen_rec	 lgr = { { 0 } }; /* no stack garbage in unset fields */
+	int			 rc, i;
+	int			 num_recs = 1; /* 1 for the header */
+
+	ENTRY;
+
+	lgr.lgr_hdr.lrh_len = lgr.lgr_tail.lrt_len = sizeof(lgr);
+	lgr.lgr_hdr.lrh_type = LLOG_GEN_REC;
+
+	CWARN("3a: write one create_rec\n");
+	rc = llog_write(env, llh,  &lgr.lgr_hdr, NULL, 0, NULL, -1);
+	num_recs++;
+	if (rc < 0) {
+		CERROR("3a: write one log record failed: %d\n", rc);
+		RETURN(rc);
+	}
+
+	rc = verify_handle("3a", llh, num_recs);
+	if (rc)
+		RETURN(rc);
+
+	CWARN("3b: write 10 cfg log records with 8 bytes bufs\n");
+	for (i = 0; i < 10; i++) {
+		struct llog_rec_hdr	hdr = { 0 };
+		char			buf[8];
+
+		hdr.lrh_len = 8;
+		hdr.lrh_type = OBD_CFG_REC;
+		memset(buf, 0, sizeof buf);
+		rc = llog_write(env, llh, &hdr, NULL, 0, buf, -1);
+		if (rc < 0) {
+			CERROR("3b: write 10 records failed at #%d: %d\n",
+			       i + 1, rc);
+			RETURN(rc);
+		}
+		num_recs++;
+	}
+
+	rc = verify_handle("3b", llh, num_recs);
+	if (rc)
+		RETURN(rc);
+
+	CWARN("3c: write 1000 more log records\n");
+	for (i = 0; i < 1000; i++) {
+		rc = llog_write(env, llh, &lgr.lgr_hdr, NULL, 0, NULL, -1);
+		if (rc < 0) {
+			CERROR("3c: write 1000 records failed at #%d: %d\n",
+			       i + 1, rc);
+			RETURN(rc);
+		}
+		num_recs++;
+	}
+
+	rc = verify_handle("3c", llh, num_recs);
+	if (rc)
+		RETURN(rc);
+
+	CWARN("3d: write log more than BITMAP_SIZE, return -ENOSPC\n");
+	for (i = 0; i < LLOG_BITMAP_SIZE(llh->lgh_hdr) + 1; i++) {
+		struct llog_rec_hdr	hdr = { 0 };
+		char			buf_even[24];
+		char			buf_odd[32];
+
+		memset(buf_odd, 0, sizeof buf_odd);
+		memset(buf_even, 0, sizeof buf_even);
+		if ((i % 2) == 0) { /* alternate the two record sizes */
+			hdr.lrh_len = 24;
+			hdr.lrh_type = OBD_CFG_REC;
+			rc = llog_write(env, llh, &hdr, NULL, 0, buf_even, -1);
+		} else {
+			hdr.lrh_len = 32;
+			hdr.lrh_type = OBD_CFG_REC;
+			rc = llog_write(env, llh, &hdr, NULL, 0, buf_odd, -1);
+		}
+		if (rc == -ENOSPC) { /* the expected way out of this loop */
+			break;
+		} else if (rc < 0) {
+			CERROR("3d: write recs failed at #%d: %d\n",
+			       i + 1, rc);
+			RETURN(rc);
+		}
+		num_recs++;
+	}
+	if (rc != -ENOSPC) {
+		CWARN("3d: write record more than BITMAP size!\n");
+		RETURN(-EINVAL);
+	}
+	CWARN("3d: wrote %d more records before end of llog is reached\n",
+	      i); /* i counts only the 3d writes; num_recs is the running total */
+
+	rc = verify_handle("3d", llh, num_recs);
+
+	RETURN(rc);
+}
+
+/* Test 4: create a catalog, add/cancel records, force a second plain log */
+static int llog_test_4(const struct lu_env *env, struct obd_device *obd)
+{
+	struct llog_handle	*cath;
+	char			 name[10];
+	int			 rc, rc2, i, buflen;
+	struct llog_mini_rec	 lmr = { { 0 } }; /* no stack garbage in unset fields */
+	struct llog_cookie	 cookie;
+	struct llog_ctxt	*ctxt;
+	int			 num_recs = 0;
+	char			*buf;
+	struct llog_rec_hdr	 rec = { 0 };
+
+	ENTRY;
+
+	ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+	LASSERT(ctxt);
+
+	lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE;
+	lmr.lmr_hdr.lrh_type = 0xf00f00;
+
+	sprintf(name, "%x", llog_test_rand + 1);
+	CWARN("4a: create a catalog log with name: %s\n", name);
+	rc = llog_open_create(env, ctxt, &cath, NULL, name);
+	if (rc) {
+		CERROR("4a: llog_create with name %s failed: %d\n", name, rc);
+		GOTO(ctxt_release, rc);
+	}
+	rc = llog_init_handle(env, cath, LLOG_F_IS_CAT, &uuid);
+	if (rc) {
+		CERROR("4a: can't init llog handle: %d\n", rc);
+		GOTO(out, rc);
+	}
+
+	num_recs++;
+	cat_logid = cath->lgh_id; /* test 5 re-opens the catalog by this id */
+
+	CWARN("4b: write 1 record into the catalog\n");
+	rc = llog_cat_add(env, cath, &lmr.lmr_hdr, &cookie, NULL);
+	if (rc != 1) {
+		CERROR("4b: write 1 catalog record failed at: %d\n", rc);
+		GOTO(out, rc = rc < 0 ? rc : -EINVAL); /* never report this failure as rc >= 0 */
+	}
+	num_recs++;
+	rc = verify_handle("4b", cath, 2);
+	if (rc)
+		GOTO(out, rc);
+
+	rc = verify_handle("4b", cath->u.chd.chd_current_log, num_recs);
+	if (rc)
+		GOTO(out, rc);
+
+	CWARN("4c: cancel 1 log record\n");
+	rc = llog_cat_cancel_records(env, cath, 1, &cookie);
+	if (rc) {
+		CERROR("4c: cancel 1 catalog based record failed: %d\n", rc);
+		GOTO(out, rc);
+	}
+	num_recs--;
+
+	rc = verify_handle("4c", cath->u.chd.chd_current_log, num_recs);
+	if (rc)
+		GOTO(out, rc);
+
+	CWARN("4d: write %d more log records\n", LLOG_TEST_RECNUM);
+	for (i = 0; i < LLOG_TEST_RECNUM; i++) {
+		rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL, NULL);
+		if (rc) {
+			CERROR("4d: write %d records failed at #%d: %d\n",
+			       LLOG_TEST_RECNUM, i + 1, rc);
+			GOTO(out, rc);
+		}
+		num_recs++;
+	}
+
+	/* make sure new plain llog appears */
+	rc = verify_handle("4d", cath, 3);
+	if (rc)
+		GOTO(out, rc);
+
+	CWARN("4e: add 5 large records, one record per block\n");
+	buflen = LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr) -
+		 sizeof(struct llog_rec_tail);
+	OBD_ALLOC(buf, buflen);
+	if (buf == NULL)
+		GOTO(out, rc = -ENOMEM);
+	for (i = 0; i < 5; i++) {
+		rec.lrh_len = buflen;
+		rec.lrh_type = OBD_CFG_REC;
+		rc = llog_cat_add(env, cath, &rec, NULL, buf);
+		if (rc) {
+			CERROR("4e: write 5 records failed at #%d: %d\n",
+			       i + 1, rc);
+			GOTO(out_free, rc);
+		}
+		num_recs++;
+	}
+out_free:
+	OBD_FREE(buf, buflen);
+out:
+	CWARN("4f: put newly-created catalog\n");
+	rc2 = llog_cat_close(env, cath);
+	if (rc2) {
+		CERROR("4: close log %s failed: %d\n", name, rc2);
+		if (rc == 0) /* keep the first error seen */
+			rc = rc2;
+	}
+ctxt_release:
+	llog_ctxt_put(ctxt);
+	RETURN(rc);
+}
+
+static int cat_counter; /* catalog records seen by cat_print_cb() */
+
+/* Processing callback: log and count one catalog (logid) record */
+static int cat_print_cb(const struct lu_env *env, struct llog_handle *llh,
+			struct llog_rec_hdr *rec, void *data)
+{
+	struct llog_logid_rec	*lir = (struct llog_logid_rec *)rec;
+	struct lu_fid		 fid = {0};
+
+	if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+		CERROR("invalid record in catalog\n");
+		return -EINVAL; /* plain return: no ENTRY here to pair with RETURN() */
+	}
+
+	logid_to_fid(&lir->lid_id, &fid);
+
+	CWARN("seeing record at index %d - "DFID" in log "DFID"\n",
+	      rec->lrh_index, PFID(&fid),
+	      PFID(lu_object_fid(&llh->lgh_obj->do_lu)));
+
+	cat_counter++;
+	return 0;
+}
+
+static int plain_counter; /* records seen by the plain-log callbacks */
+
+/* Processing callback: count one record; -EINVAL if @llh is not a plain log */
+static int plain_print_cb(const struct lu_env *env, struct llog_handle *llh,
+			  struct llog_rec_hdr *rec, void *data)
+{
+	struct lu_fid fid = {0};
+
+	if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) {
+		CERROR("log is not plain\n");
+		return -EINVAL; /* plain return: no ENTRY here to pair with RETURN() */
+	}
+
+	logid_to_fid(&llh->lgh_id, &fid);
+
+	CDEBUG(D_INFO, "seeing record at index %d in log "DFID"\n",
+	       rec->lrh_index, PFID(&fid));
+
+	plain_counter++;
+	return 0;
+}
+
+static int cancel_count; /* records cancelled so far by llog_cancel_rec_cb() */
+
+static int llog_cancel_rec_cb(const struct lu_env *env,
+			      struct llog_handle *llh,
+			      struct llog_rec_hdr *rec, void *data)
+{
+	struct llog_cookie cookie;
+	int rc;
+
+	if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) {
+		CERROR("log is not plain\n");
+		return -EINVAL;
+	}
+
+	cookie.lgc_lgl = llh->lgh_id;
+	cookie.lgc_index = rec->lrh_index;
+	rc = llog_cat_cancel_records(env, llh->u.phd.phd_cat_handle, 1,
+				     &cookie);
+	if (rc < 0) /* a failed cancel must fail the test, not be counted */
+		return rc;
+	/* end the walk with -LLOG_EEMPTY after LLOG_TEST_RECNUM cancels */
+	return ++cancel_count == LLOG_TEST_RECNUM ? -LLOG_EEMPTY : 0;
+}
+
+/* Test 5: re-open the test 4 catalog by id; process, cancel and re-count */
+static int llog_test_5(const struct lu_env *env, struct obd_device *obd)
+{
+	struct llog_handle	*llh = NULL;
+	char			 name[10];
+	int			 rc, rc2;
+	struct llog_mini_rec	 lmr = { { 0 } }; /* no stack garbage in unset fields */
+	struct llog_ctxt	*ctxt;
+
+	ENTRY;
+	ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+	LASSERT(ctxt);
+
+	sprintf(name, "%x", llog_test_rand + 1); /* test 4's catalog name, for the 5g message */
+	lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE;
+	lmr.lmr_hdr.lrh_type = 0xf00f00;
+
+	CWARN("5a: re-open catalog by id\n");
+	rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS);
+	if (rc) {
+		CERROR("5a: llog_create with logid failed: %d\n", rc);
+		GOTO(out_put, rc);
+	}
+
+	rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, &uuid);
+	if (rc) {
+		CERROR("5a: can't init llog handle: %d\n", rc);
+		GOTO(out, rc);
+	}
+
+	CWARN("5b: print the catalog entries.. we expect 2\n");
+	cat_counter = 0;
+	rc = llog_process(env, llh, cat_print_cb, "test 5", NULL);
+	if (rc) {
+		CERROR("5b: process with cat_print_cb failed: %d\n", rc);
+		GOTO(out, rc);
+	}
+	if (cat_counter != 2) {
+		CERROR("5b: %d entries in catalog\n", cat_counter);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	CWARN("5c: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM);
+	cancel_count = 0;
+	rc = llog_cat_process(env, llh, llog_cancel_rec_cb, "foobar", 0, 0);
+	if (rc != -LLOG_EEMPTY) { /* the callback ends the walk with -LLOG_EEMPTY */
+		CERROR("5c: process with cat_cancel_cb failed: %d\n", rc);
+		GOTO(out, rc);
+	}
+
+	CWARN("5c: print the catalog entries.. we expect 1\n");
+	cat_counter = 0;
+	rc = llog_process(env, llh, cat_print_cb, "test 5", NULL);
+	if (rc) {
+		CERROR("5c: process with cat_print_cb failed: %d\n", rc);
+		GOTO(out, rc);
+	}
+	if (cat_counter != 1) {
+		CERROR("5c: %d entries in catalog\n", cat_counter);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	CWARN("5d: add 1 record to the log with many canceled empty pages\n");
+	rc = llog_cat_add(env, llh, &lmr.lmr_hdr, NULL, NULL);
+	if (rc) {
+		CERROR("5d: add record to the log with many canceled empty "
+		       "pages failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("5e: print plain log entries.. expect 6\n");
+	plain_counter = 0;
+	rc = llog_cat_process(env, llh, plain_print_cb, "foobar", 0, 0);
+	if (rc) {
+		CERROR("5e: process with plain_print_cb failed: %d\n", rc);
+		GOTO(out, rc);
+	}
+	if (plain_counter != 6) {
+		CERROR("5e: found %d records\n", plain_counter);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	CWARN("5f: print plain log entries reversely.. expect 6\n");
+	plain_counter = 0;
+	rc = llog_cat_reverse_process(env, llh, plain_print_cb, "foobar");
+	if (rc) {
+		CERROR("5f: reversely process with plain_print_cb failed: "
+		       "%d\n", rc);
+		GOTO(out, rc);
+	}
+	if (plain_counter != 6) {
+		CERROR("5f: found %d records\n", plain_counter);
+		GOTO(out, rc = -EINVAL);
+	}
+
+out:
+	CWARN("5g: close re-opened catalog\n");
+	rc2 = llog_cat_close(env, llh);
+	if (rc2) {
+		CERROR("5g: close log %s failed: %d\n", name, rc2);
+		if (rc == 0) /* keep the first error seen */
+			rc = rc2;
+	}
+out_put:
+	llog_ctxt_put(ctxt);
+
+	RETURN(rc);
+}
+
+/* Test 6: open log @name through the MGC (client API), process it both ways */
+static int llog_test_6(const struct lu_env *env, struct obd_device *obd,
+		       char *name)
+{
+	struct obd_device	*mgc_obd;
+	struct llog_ctxt	*ctxt, *nctxt;
+	struct obd_uuid		*mgs_uuid;
+	struct obd_export	*exp;
+	struct obd_uuid		 conn_uuid = { "LLOG_TEST6_UUID" }; /* named so it does not shadow the file-scope uuid */
+	struct llog_handle	*llh = NULL;
+	int			 rc, rc2;
+
+	ENTRY;
+	ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+	LASSERT(ctxt);
+	mgs_uuid = &ctxt->loc_exp->exp_obd->obd_uuid;
+
+	CWARN("6a: re-open log %s using client API\n", name);
+	mgc_obd = class_find_client_obd(mgs_uuid, LUSTRE_MGC_NAME, NULL);
+	if (mgc_obd == NULL) {
+		CERROR("6a: no MGC devices connected to %s found.\n",
+		       mgs_uuid->uuid);
+		GOTO(ctxt_release, rc = -ENOENT);
+	}
+
+	rc = obd_connect(NULL, &exp, mgc_obd, &conn_uuid,
+			 NULL /* obd_connect_data */, NULL);
+	if (rc != -EALREADY) { /* the test requires an already-connected MGC */
+		CERROR("6a: connect on connected MGC (%s) failed to return"
+		       " -EALREADY\n", mgc_obd->obd_name);
+		if (rc == 0) /* undo the connection we just made */
+			obd_disconnect(exp);
+		GOTO(ctxt_release, rc = -EINVAL);
+	}
+
+	nctxt = llog_get_context(mgc_obd, LLOG_CONFIG_REPL_CTXT);
+	if (nctxt == NULL) /* nothing to open through and nothing to put */
+		GOTO(ctxt_release, rc = -ENOENT);
+	rc = llog_open(env, nctxt, &llh, NULL, name, LLOG_OPEN_EXISTS);
+	if (rc) {
+		CERROR("6a: llog_open failed %d\n", rc);
+		GOTO(nctxt_put, rc);
+	}
+
+	rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL);
+	if (rc) {
+		CERROR("6a: llog_init_handle failed %d\n", rc);
+		GOTO(parse_out, rc);
+	}
+
+	plain_counter = 1; /* llog header is first record */
+	CWARN("6b: process log %s using client API\n", name);
+	rc = llog_process(env, llh, plain_print_cb, NULL, NULL);
+	if (rc) /* NOTE(review): only logged; rc is overwritten by verify_handle() below */
+		CERROR("6b: llog_process failed %d\n", rc);
+	CWARN("6b: processed %d records\n", plain_counter);
+
+	rc = verify_handle("6b", llh, plain_counter);
+	if (rc)
+		GOTO(parse_out, rc);
+
+	plain_counter = 1; /* llog header is first record */
+	CWARN("6c: process log %s reversely using client API\n", name);
+	rc = llog_reverse_process(env, llh, plain_print_cb, NULL, NULL);
+	if (rc) /* NOTE(review): only logged, as in 6b */
+		CERROR("6c: llog_reverse_process failed %d\n", rc);
+	CWARN("6c: processed %d records\n", plain_counter);
+
+	rc = verify_handle("6c", llh, plain_counter); /* falls through with this rc */
+
+parse_out:
+	rc2 = llog_close(env, llh);
+	if (rc2) {
+		CERROR("6: llog_close failed: rc = %d\n", rc2);
+		if (rc == 0) /* keep the first error seen */
+			rc = rc2;
+	}
+nctxt_put:
+	llog_ctxt_put(nctxt);
+ctxt_release:
+	llog_ctxt_put(ctxt);
+	RETURN(rc);
+}
+
+static union {
+	struct llog_rec_hdr		lrh;   /* common header */
+	struct llog_logid_rec		llr;   /* LLOG_LOGID_MAGIC */
+	struct llog_unlink64_rec	lur;   /* MDS_UNLINK64_REC */
+	struct llog_setattr64_rec	lsr64; /* MDS_SETATTR64_REC */
+	struct llog_size_change_rec	lscr;  /* OST_SZ_REC */
+	struct llog_changelog_rec	lcr;   /* CHANGELOG_REC */
+	struct llog_changelog_user_rec	lcur;  /* CHANGELOG_USER_REC */
+	struct llog_gen_rec		lgr;   /* LLOG_GEN_REC */
+} llog_records; /* shared record buffer: llog_test_7() fills it, llog_test_7_sub() writes it via .lrh */
+
+static int test_7_print_cb(const struct lu_env *env, struct llog_handle *llh,
+			   struct llog_rec_hdr *rec, void *data)
+{
+	struct lu_fid fid = {0};
+
+	logid_to_fid(&llh->lgh_id, &fid); /* fid is only used for the debug message */
+
+	CDEBUG(D_OTHER, "record type %#x at index %d in log "DFID"\n",
+	       rec->lrh_type, rec->lrh_index, PFID(&fid));
+
+	plain_counter++; /* llog_test_7_sub() compares this count with the records written */
+	return 0;
+}
+
+static int test_7_cancel_cb(const struct lu_env *env, struct llog_handle *llh,
+			    struct llog_rec_hdr *rec, void *data)
+{
+	plain_counter++; /* counted so the reverse pass can be compared with the forward pass */
+	/* always answer LLOG_DEL_RECORD, to test that it is working */
+	return LLOG_DEL_RECORD;
+}
+
+/* Fill a plain llog with llog_records copies, walk it, then cancel all. */
+static int llog_test_7_sub(const struct lu_env *env, struct llog_ctxt *ctxt)
+{
+	struct llog_handle	*llh;
+	int			 rc = 0, i, process_count, num_recs = 0;
+
+	ENTRY;
+
+	rc = llog_open_create(env, ctxt, &llh, NULL, NULL);
+	if (rc) {
+		CERROR("7_sub: create log failed\n");
+		RETURN(rc);
+	}
+
+	rc = llog_init_handle(env, llh,
+			      LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY,
+			      &uuid);
+	if (rc) {
+		CERROR("7_sub: can't init llog handle: %d\n", rc);
+		GOTO(out_close, rc);
+	}
+	for (i = 0; i < LLOG_BITMAP_SIZE(llh->lgh_hdr); i++) {
+		rc = llog_write(env, llh, &llog_records.lrh, NULL, 0,
+				NULL, -1);
+		if (rc == -ENOSPC) { /* the expected way out of this loop */
+			break;
+		} else if (rc < 0) {
+			CERROR("7_sub: write recs failed at #%d: %d\n",
+			       i + 1, rc);
+			GOTO(out_close, rc);
+		}
+		num_recs++;
+	}
+	if (rc != -ENOSPC) {
+		CWARN("7_sub: write record more than BITMAP size!\n");
+		GOTO(out_close, rc = -EINVAL);
+	}
+
+	rc = verify_handle("7_sub", llh, num_recs + 1); /* one more than the records written */
+	if (rc) {
+		CERROR("7_sub: verify handle failed: %d\n", rc);
+		GOTO(out_close, rc);
+	}
+	if (num_recs < LLOG_BITMAP_SIZE(llh->lgh_hdr) - 1)
+		CWARN("7_sub: records are not aligned, written %d from %u\n",
+		      num_recs, LLOG_BITMAP_SIZE(llh->lgh_hdr) - 1);
+
+	plain_counter = 0;
+	rc = llog_process(env, llh, test_7_print_cb, "test 7", NULL);
+	if (rc) {
+		CERROR("7_sub: llog process failed: %d\n", rc);
+		GOTO(out_close, rc);
+	}
+	process_count = plain_counter;
+	if (process_count != num_recs) {
+		CERROR("7_sub: processed %d records from %d total\n",
+		       process_count, num_recs);
+		GOTO(out_close, rc = -EINVAL);
+	}
+
+	plain_counter = 0;
+	rc = llog_reverse_process(env, llh, test_7_cancel_cb, "test 7", NULL);
+	if (rc) {
+		CERROR("7_sub: reverse llog process failed: %d\n", rc);
+		GOTO(out_close, rc);
+	}
+	if (process_count != plain_counter) {
+		CERROR("7_sub: Reverse/direct processing found different "
+		       "number of records: %d/%d\n",
+		       plain_counter, process_count);
+		GOTO(out_close, rc = -EINVAL);
+	}
+	if (llog_exist(llh)) { /* the log was opened with LLOG_F_ZAP_WHEN_EMPTY */
+		CERROR("7_sub: llog exists but should be zapped\n");
+		GOTO(out_close, rc = -EEXIST);
+	}
+
+	rc = verify_handle("7_sub", llh, 1);
+out_close:
+	if (rc) /* on any failure, destroy the log before closing it */
+		llog_destroy(env, llh);
+	llog_close(env, llh);
+	RETURN(rc);
+}
+
+/* Test 7: run llog_test_7_sub() once per record type kept in llog_records */
+static int llog_test_7(const struct lu_env *env, struct obd_device *obd)
+{
+	struct llog_ctxt	*ctxt;
+	int			 rc;
+
+	ENTRY;
+	ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+	LASSERT(ctxt);
+
+	CWARN("7a: test llog_logid_rec\n");
+	llog_records.llr.lid_hdr.lrh_len = sizeof(llog_records.llr);
+	llog_records.llr.lid_tail.lrt_len = sizeof(llog_records.llr);
+	llog_records.llr.lid_hdr.lrh_type = LLOG_LOGID_MAGIC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7a: llog_logid_rec test failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("7b: test llog_unlink64_rec\n");
+	llog_records.lur.lur_hdr.lrh_len = sizeof(llog_records.lur);
+	llog_records.lur.lur_tail.lrt_len = sizeof(llog_records.lur);
+	llog_records.lur.lur_hdr.lrh_type = MDS_UNLINK64_REC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7b: llog_unlink64_rec test failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("7c: test llog_setattr64_rec\n");
+	llog_records.lsr64.lsr_hdr.lrh_len = sizeof(llog_records.lsr64);
+	llog_records.lsr64.lsr_tail.lrt_len = sizeof(llog_records.lsr64);
+	llog_records.lsr64.lsr_hdr.lrh_type = MDS_SETATTR64_REC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7c: llog_setattr64_rec test failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("7d: test llog_size_change_rec\n");
+	llog_records.lscr.lsc_hdr.lrh_len = sizeof(llog_records.lscr);
+	llog_records.lscr.lsc_tail.lrt_len = sizeof(llog_records.lscr);
+	llog_records.lscr.lsc_hdr.lrh_type = OST_SZ_REC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7d: llog_size_change_rec test failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("7e: test llog_changelog_rec\n");
+	llog_records.lcr.cr_hdr.lrh_len = sizeof(llog_records.lcr);
+	llog_records.lcr.cr_tail.lrt_len = sizeof(llog_records.lcr);
+	llog_records.lcr.cr_hdr.lrh_type = CHANGELOG_REC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7e: llog_changelog_rec test failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("7f: test llog_changelog_user_rec\n");
+	llog_records.lcur.cur_hdr.lrh_len = sizeof(llog_records.lcur);
+	llog_records.lcur.cur_tail.lrt_len = sizeof(llog_records.lcur);
+	llog_records.lcur.cur_hdr.lrh_type = CHANGELOG_USER_REC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7f: llog_changelog_user_rec test failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("7g: test llog_gen_rec\n");
+	llog_records.lgr.lgr_hdr.lrh_len = sizeof(llog_records.lgr);
+	llog_records.lgr.lgr_tail.lrt_len = sizeof(llog_records.lgr);
+	llog_records.lgr.lgr_hdr.lrh_type = LLOG_GEN_REC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7g: llog_gen_rec test failed\n");
+		GOTO(out, rc);
+	}
+out:
+	llog_ctxt_put(ctxt);
+	RETURN(rc);
+}
+
+/* -------------------------------------------------------------------------
+ * Tests above, boring obd functions below
+ * ------------------------------------------------------------------------- */
+/* Run tests 1-7 in order; stop at the first failure and return its rc. */
+static int llog_run_tests(const struct lu_env *env, struct obd_device *obd)
+{
+	struct llog_handle	*llh = NULL;
+	struct llog_ctxt	*ctxt;
+	int			 rc, err;
+	char			 name[10];
+
+	ENTRY;
+	ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+	LASSERT(ctxt);
+
+	sprintf(name, "%x", llog_test_rand); /* log name shared by tests 1, 2 and 6 */
+
+	rc = llog_test_1(env, obd, name);
+	if (rc)
+		GOTO(cleanup_ctxt, rc);
+
+	rc = llog_test_2(env, obd, name, &llh);
+	if (rc) /* test 2 closes llh itself when it fails */
+		GOTO(cleanup_ctxt, rc);
+
+	rc = llog_test_3(env, obd, llh);
+	if (rc)
+		GOTO(cleanup, rc);
+
+	rc = llog_test_4(env, obd);
+	if (rc)
+		GOTO(cleanup, rc);
+
+	rc = llog_test_5(env, obd);
+	if (rc)
+		GOTO(cleanup, rc);
+
+	rc = llog_test_6(env, obd, name);
+	if (rc)
+		GOTO(cleanup, rc);
+
+	rc = llog_test_7(env, obd);
+	if (rc)
+		GOTO(cleanup, rc);
+cleanup:
+	err = llog_destroy(env, llh); /* llh is the log left open by test 2 */
+	if (err)
+		CERROR("cleanup: llog_destroy failed: %d\n", err);
+	llog_close(env, llh);
+	if (rc == 0) /* a destroy failure only matters if the tests passed */
+		rc = err;
+cleanup_ctxt:
+	llog_ctxt_put(ctxt);
+	RETURN(rc); /* pairs with ENTRY above */
+}
+
+#ifdef LPROCFS
+static struct lprocfs_vars lprocfs_llog_test_obd_vars[] = { {0} }; /* single zeroed entry */
+static struct lprocfs_vars lprocfs_llog_test_module_vars[] = { {0} }; /* single zeroed entry */
+static void lprocfs_llog_test_init_vars(struct lprocfs_static_vars *lvars)
+{
+    lvars->module_vars  = lprocfs_llog_test_module_vars;
+    lvars->obd_vars     = lprocfs_llog_test_obd_vars;
+}
+#endif /* LPROCFS */
+
+static int llog_test_cleanup(struct obd_device *obd)
+{
+	struct obd_device	*tgt;
+	struct lu_env		 env;
+	int			 rc;
+
+	ENTRY;
+
+	rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD);
+	if (rc)
+		RETURN(rc);
+
+	tgt = obd->obd_lvfs_ctxt.dt->dd_lu_dev.ld_obd; /* reached through the dt device stored by llog_test_setup() */
+	rc = llog_cleanup(&env, llog_get_context(tgt, LLOG_TEST_ORIG_CTXT)); /* NOTE(review): context is not NULL-checked here */
+	if (rc)
+		CERROR("failed to llog_test_llog_finish: %d\n", rc);
+	lu_env_fini(&env);
+	RETURN(rc);
+}
+
+static int llog_test_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	struct obd_device	*tgt;
+	struct llog_ctxt	*ctxt;
+	struct dt_object	*o;
+	struct lu_env		 env;
+	struct lu_context	 test_session;
+	int			 rc;
+
+	ENTRY;
+
+	if (lcfg->lcfg_bufcount < 2) { /* config buffer 1 carries the target OBD name */
+		CERROR("requires a TARGET OBD name\n");
+		RETURN(-EINVAL);
+	}
+
+	if (lcfg->lcfg_buflens[1] < 1) {
+		CERROR("requires a TARGET OBD name\n");
+		RETURN(-EINVAL);
+	}
+
+	/* disk obd */
+	tgt = class_name2obd(lustre_cfg_string(lcfg, 1));
+	if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) {
+		CERROR("target device not attached or not set up (%s)\n",
+		       lustre_cfg_string(lcfg, 1));
+		RETURN(-EINVAL);
+	}
+
+	rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD);
+	if (rc)
+		RETURN(rc);
+
+	rc = lu_context_init(&test_session, LCT_SESSION);
+	if (rc)
+		GOTO(cleanup_env, rc);
+	test_session.lc_thread = (struct ptlrpc_thread *)current;
+	lu_context_enter(&test_session);
+	env.le_ses = &test_session; /* attach the session context to the env */
+
+	CWARN("Setup llog-test device over %s device\n",
+	      lustre_cfg_string(lcfg, 1));
+
+	OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt);
+	obd->obd_lvfs_ctxt.dt = lu2dt_dev(tgt->obd_lu_dev); /* llog_test_cleanup() finds tgt through this */
+
+	rc = llog_setup(&env, tgt, &tgt->obd_olg, LLOG_TEST_ORIG_CTXT, tgt,
+			&llog_osd_ops);
+	if (rc)
+		GOTO(cleanup_session, rc);
+
+	/* use MGS llog dir for tests */
+	ctxt = llog_get_context(tgt, LLOG_CONFIG_ORIG_CTXT);
+	LASSERT(ctxt);
+	o = ctxt->loc_dir;
+	llog_ctxt_put(ctxt);
+
+	ctxt = llog_get_context(tgt, LLOG_TEST_ORIG_CTXT);
+	LASSERT(ctxt);
+	ctxt->loc_dir = o; /* the test context now shares the config context's directory */
+	llog_ctxt_put(ctxt);
+
+	llog_test_rand = cfs_rand(); /* basis of the log names used by the tests */
+
+	rc = llog_run_tests(&env, tgt);
+	if (rc) /* tests failed: undo llog_setup() right away */
+		llog_test_cleanup(obd);
+cleanup_session:
+	lu_context_exit(&test_session);
+	lu_context_fini(&test_session);
+cleanup_env:
+	lu_env_fini(&env);
+	RETURN(rc);
+}
+
+static struct obd_ops llog_obd_ops = { /* registered under the type name "llog_test" by llog_test_init() */
+	.o_owner       = THIS_MODULE,
+	.o_setup       = llog_test_setup, /* the whole test suite runs from setup */
+	.o_cleanup     = llog_test_cleanup,
+};
+
+static int __init llog_test_init(void)
+{
+	struct lprocfs_static_vars lvars;
+
+	lprocfs_llog_test_init_vars(&lvars); /* NOTE(review): defined above only under #ifdef LPROCFS - confirm non-LPROCFS builds */
+	return class_register_type(&llog_obd_ops, NULL,
+				   lvars.module_vars, "llog_test", NULL);
+}
+
+static void __exit llog_test_exit(void)
+{
+	class_unregister_type("llog_test"); /* same type name llog_test_init() registered */
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("llog test module");
+MODULE_LICENSE("GPL");
+
+module_init(llog_test_init);
+module_exit(llog_test_exit);
diff --git a/drivers/staging/lustre/lustre/obdclass/local_storage.c b/drivers/staging/lustre/lustre/obdclass/local_storage.c
new file mode 100644
index 000000000000..b11ca6706448
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/local_storage.c
@@ -0,0 +1,855 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * lustre/obdclass/local_storage.c
+ *
+ * Local storage for file/objects with fid generation. Works on top of OSD.
+ *
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include "local_storage.h"
+
+/*
+ * All initialized local storages on this node are linked on ls_list_head
+ * through ls_device::ls_linkage.  ls_list_mutex is held by the code in this
+ * file whenever the list is searched or modified.
+ */
+static LIST_HEAD(ls_list_head);
+static DEFINE_MUTEX(ls_list_mutex);
+
+/*
+ * loo_object_init method of the local storage layer.
+ *
+ * Asks the lu_device of the underlying OSD (ls_osd) to allocate its object
+ * for the same header and attaches it to "o" with lu_object_add().
+ *
+ * Returns 0 on success, -ENOMEM if the OSD could not allocate its object.
+ */
+static int ls_object_init(const struct lu_env *env, struct lu_object *o,
+			  const struct lu_object_conf *unused)
+{
+	struct ls_device	*lsdev = container_of0(o->lo_dev,
+						       struct ls_device,
+						       ls_top_dev.dd_lu_dev);
+	struct lu_device	*osd_lu;
+	struct lu_object	*slice;
+	int			 rc = 0;
+
+	ENTRY;
+
+	osd_lu = &lsdev->ls_osd->dd_lu_dev;
+	slice = osd_lu->ld_ops->ldo_object_alloc(env, o->lo_header, osd_lu);
+	if (slice != NULL)
+		lu_object_add(o, slice);
+	else
+		rc = -ENOMEM;
+
+	RETURN(rc);
+}
+
+/*
+ * loo_object_free method: finalize the dt_object and the object header,
+ * then release the enclosing ls_object.
+ */
+static void ls_object_free(const struct lu_env *env, struct lu_object *o)
+{
+	/* fetch the header before the object itself is finalized */
+	struct lu_object_header	*hdr = o->lo_header;
+	struct ls_object	*lso = lu2ls_obj(o);
+
+	dt_object_fini(&lso->ls_obj);
+	lu_object_header_fini(hdr);
+	OBD_FREE_PTR(lso);
+}
+
+/* lu_object methods installed on objects made by ls_object_alloc(). */
+struct lu_object_operations ls_lu_obj_ops = {
+	.loo_object_free	= ls_object_free,
+	.loo_object_init	= ls_object_init,
+};
+
+/*
+ * ldo_object_alloc method of the local storage device.
+ *
+ * Allocates an ls_object, which embeds its own object header, initializes
+ * the header and the dt_object on device "d", makes the object the top of
+ * the header's layer list and installs ls_lu_obj_ops.  The caller must pass
+ * a NULL header (asserted).
+ *
+ * Returns the new lu_object, or NULL if the allocation failed.
+ */
+struct lu_object *ls_object_alloc(const struct lu_env *env,
+				  const struct lu_object_header *_h,
+				  struct lu_device *d)
+{
+	struct ls_object	*lso;
+	struct lu_object	*top;
+
+	LASSERT(_h == NULL);
+
+	OBD_ALLOC_PTR(lso);
+	if (lso == NULL)
+		return NULL;
+
+	top = &lso->ls_obj.do_lu;
+
+	lu_object_header_init(&lso->ls_header);
+	dt_object_init(&lso->ls_obj, &lso->ls_header, d);
+	lu_object_add_top(&lso->ls_header, top);
+
+	top->lo_ops = &ls_lu_obj_ops;
+
+	return top;
+}
+
+/* lu_device methods of the local storage device; only allocation is set. */
+static struct lu_device_operations ls_lu_dev_ops = {
+	.ldo_object_alloc =	ls_object_alloc
+};
+
+/*
+ * Search ls_list_head for the local storage built on top of OSD "dev".
+ * On a match the entry's refcount is incremented and the entry returned;
+ * NULL is returned when no entry matches.  The callers in this file hold
+ * ls_list_mutex around the call.
+ */
+static struct ls_device *__ls_find_dev(struct dt_device *dev)
+{
+	struct ls_device *cur;
+
+	list_for_each_entry(cur, &ls_list_head, ls_linkage) {
+		if (cur->ls_osd != dev)
+			continue;
+
+		atomic_inc(&cur->ls_refcount);
+		return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Locked wrapper around __ls_find_dev(): returns the referenced ls_device
+ * for OSD "dev", or NULL if none has been set up.
+ */
+struct ls_device *ls_find_dev(struct dt_device *dev)
+{
+	struct ls_device *found;
+
+	mutex_lock(&ls_list_mutex);
+	found = __ls_find_dev(dev);
+	mutex_unlock(&ls_list_mutex);
+
+	return found;
+}
+
+/* Device-type hooks of "local_storage"; neither start nor stop is set. */
+static struct lu_device_type_operations ls_device_type_ops = {
+	.ldto_stop	= NULL,
+	.ldto_start	= NULL,
+};
+
+/* Device type that ls_device_get() initializes each ls_device with. */
+static struct lu_device_type ls_lu_type = {
+	.ldt_ops	= &ls_device_type_ops,
+	.ldt_name	= "local_storage",
+};
+
+/*
+ * Get a referenced local storage device for OSD "dev".
+ *
+ * An existing entry on ls_list_head is reused (its refcount is bumped by
+ * __ls_find_dev()); otherwise a new ls_device is allocated with refcount 1,
+ * bound to the site of "dev" and added to the list.
+ *
+ * Returns the device, or ERR_PTR(-ENOMEM) if allocation failed.
+ */
+struct ls_device *ls_device_get(struct dt_device *dev)
+{
+	struct ls_device *ls;
+	struct lu_device *top;
+
+	ENTRY;
+
+	mutex_lock(&ls_list_mutex);
+	ls = __ls_find_dev(dev);
+	if (ls != NULL)
+		GOTO(out_ls, ls);
+
+	/* nothing registered for this OSD yet: build a fresh device */
+	OBD_ALLOC_PTR(ls);
+	if (ls == NULL)
+		GOTO(out_ls, ls = ERR_PTR(-ENOMEM));
+
+	ls->ls_osd = dev;
+	atomic_set(&ls->ls_refcount, 1);
+	mutex_init(&ls->ls_los_mutex);
+	INIT_LIST_HEAD(&ls->ls_los_list);
+
+	LASSERT(dev->dd_lu_dev.ld_site);
+	top = &ls->ls_top_dev.dd_lu_dev;
+	lu_device_init(top, &ls_lu_type);
+	top->ld_ops = &ls_lu_dev_ops;
+	top->ld_site = dev->dd_lu_dev.ld_site;
+
+	/* publish the device only once it is fully set up */
+	list_add(&ls->ls_linkage, &ls_list_head);
+out_ls:
+	mutex_unlock(&ls_list_mutex);
+	RETURN(ls);
+}
+
+/*
+ * Drop one reference on "ls".  When the count reaches zero the device is
+ * unlinked from ls_list_head, its site is purged, the lu_device is
+ * finalized and the ls_device is freed.
+ *
+ * The count is read again under ls_list_mutex before the teardown, so a
+ * reference taken through __ls_find_dev() after the decrement keeps the
+ * device alive.
+ *
+ * NOTE(review): if such a new holder also drops its reference and frees the
+ * device before this thread acquires ls_list_mutex, the atomic_read() below
+ * would touch freed memory -- confirm whether callers exclude this.
+ */
+void ls_device_put(const struct lu_env *env, struct ls_device *ls)
+{
+	LASSERT(env);
+	if (!atomic_dec_and_test(&ls->ls_refcount))
+		return;
+
+	mutex_lock(&ls_list_mutex);
+	if (atomic_read(&ls->ls_refcount) == 0) {
+		/* every OID storage must have been finalized by now */
+		LASSERT(list_empty(&ls->ls_los_list));
+		list_del(&ls->ls_linkage);
+		lu_site_purge(env, ls->ls_top_dev.dd_lu_dev.ld_site, ~0);
+		lu_device_fini(&ls->ls_top_dev.dd_lu_dev);
+		OBD_FREE_PTR(ls);
+	}
+	mutex_unlock(&ls_list_mutex);
+}
+
+/**
+ * Local file fid generation.
+ *
+ * Fills "fid" with the sequence of "los", the current value of the
+ * in-memory OID counter and version 0, then advances the counter.  The
+ * counter is protected by los_id_lock.  Always returns 0.
+ *
+ * To keep fids unique across reboots the latest counter value is stored on
+ * disk together with object creation, see local_object_create().
+ */
+int local_object_fid_generate(const struct lu_env *env,
+			      struct local_oid_storage *los,
+			      struct lu_fid *fid)
+{
+	LASSERT(los->los_dev);
+	LASSERT(los->los_obj);
+
+	mutex_lock(&los->los_id_lock);
+	fid->f_ver = 0;
+	fid->f_seq = los->los_seq;
+	/* hand out the current OID and move on to the next one */
+	fid->f_oid = los->los_last_oid;
+	los->los_last_oid++;
+	mutex_unlock(&los->los_id_lock);
+
+	return 0;
+}
+
+/*
+ * Declare, in transaction "th", everything local_object_create() is going
+ * to do for object "o": the write of the OID counter record (only when
+ * "los" is given), the creation itself and an XATTR_NAME_LMA xattr of
+ * sizeof(dti_lma) bytes.
+ *
+ * Returns 0 on success or the first non-zero declaration result.
+ */
+int local_object_declare_create(const struct lu_env *env,
+				struct local_oid_storage *los,
+				struct dt_object *o, struct lu_attr *attr,
+				struct dt_object_format *dof,
+				struct thandle *th)
+{
+	struct dt_thread_info	*dti = dt_info(env);
+	int			 rc;
+
+	ENTRY;
+
+	if (los != NULL) {
+		/* the fid generation file gets updated along with creation */
+		LASSERT(dt_object_exists(los->los_obj));
+		rc = dt_declare_record_write(env, los->los_obj,
+					     sizeof(struct los_ondisk), 0, th);
+		if (rc != 0)
+			RETURN(rc);
+	}
+
+	rc = dt_declare_create(env, o, attr, NULL, dof, th);
+	if (rc != 0)
+		RETURN(rc);
+
+	/* only the length matters for the declaration, no data buffer */
+	dti->dti_lb.lb_len = sizeof(dti->dti_lma);
+	dti->dti_lb.lb_buf = NULL;
+	rc = dt_declare_xattr_set(env, o, &dti->dti_lb, XATTR_NAME_LMA, 0, th);
+
+	RETURN(rc);
+}
+
+/*
+ * Create object "o" within transaction "th" and, when an OID storage is
+ * given, write the storage's current OID counter to its on-disk record in
+ * the same transaction.
+ *
+ * Returns the dt_create() result if that fails or if "los" is NULL,
+ * otherwise the result of writing the counter record.
+ */
+int local_object_create(const struct lu_env *env,
+			struct local_oid_storage *los,
+			struct dt_object *o, struct lu_attr *attr,
+			struct dt_object_format *dof, struct thandle *th)
+{
+	struct dt_thread_info	*dti = dt_info(env);
+	struct los_ondisk	 rec;
+	int			 rc;
+
+	ENTRY;
+
+	rc = dt_create(env, o, attr, NULL, dof, th);
+	if (rc != 0 || los == NULL)
+		RETURN(rc);
+
+	LASSERT(los->los_obj);
+	LASSERT(dt_object_exists(los->los_obj));
+
+	/* many threads can update this record; serialize them here to
+	 * avoid the race where one thread takes the value first, but
+	 * writes it last */
+	mutex_lock(&los->los_id_lock);
+
+	dti->dti_off = 0;
+	dti->dti_lb.lb_len = sizeof(rec);
+	dti->dti_lb.lb_buf = &rec;
+
+	/* store the counter on disk so that the last OID used is known
+	 * after reboot */
+	rec.lso_next_oid = cpu_to_le32(los->los_last_oid);
+	rec.lso_magic = cpu_to_le32(LOS_MAGIC);
+
+	rc = dt_record_write(env, los->los_obj, &dti->dti_lb, &dti->dti_off,
+			     th);
+	mutex_unlock(&los->los_id_lock);
+
+	RETURN(rc);
+}
+
+/*
+ * Create local named object (file, directory or index) in parent directory.
+ *
+ * The object for "fid" is located through "ls".  Unless it already exists,
+ * it is created and "name" -> "fid" is inserted into "parent" within one
+ * local transaction on ls->ls_osd.  For DFT_DIR objects "." and ".." are
+ * inserted too and references are added on the new object and on "parent".
+ * "los" is passed through to local_object_declare_create() and
+ * local_object_create(), both of which accept NULL.
+ *
+ * Returns the located object on success.  If the object is found to exist
+ * once its write lock is held, nothing is created and it is returned as is.
+ * Returns ERR_PTR(-EEXIST) if the object exists before the transaction is
+ * created, and ERR_PTR(rc) on any other failure; in the error cases the
+ * object reference is dropped.
+ *
+ * NOTE(review): the directory checks test dti->dti_dof.dof_type instead of
+ * dof->dof_type.  The callers visible in this file always pass
+ * &dti->dti_dof, so both agree there -- confirm for other callers.
+ * NOTE(review): RETURN() is used below although there is no ENTRY, and the
+ * results of dt_declare_ref_add()/dt_ref_add() are not checked.
+ */
+struct dt_object *__local_file_create(const struct lu_env *env,
+				      const struct lu_fid *fid,
+				      struct local_oid_storage *los,
+				      struct ls_device *ls,
+				      struct dt_object *parent,
+				      const char *name, struct lu_attr *attr,
+				      struct dt_object_format *dof)
+{
+	struct dt_thread_info	*dti = dt_info(env);
+	struct dt_object	*dto;
+	struct thandle		*th;
+	int			 rc;
+
+	dto = ls_locate(env, ls, fid);
+	if (unlikely(IS_ERR(dto)))
+		RETURN(dto);
+
+	LASSERT(dto != NULL);
+	if (dt_object_exists(dto))
+		GOTO(out, rc = -EEXIST);
+
+	th = dt_trans_create(env, ls->ls_osd);
+	if (IS_ERR(th))
+		GOTO(out, rc = PTR_ERR(th));
+
+	/* declaration phase: everything done below must be declared first */
+	rc = local_object_declare_create(env, los, dto, attr, dof, th);
+	if (rc)
+		GOTO(trans_stop, rc);
+
+	if (dti->dti_dof.dof_type == DFT_DIR) {
+		dt_declare_ref_add(env, dto, th);
+		dt_declare_ref_add(env, parent, th);
+	}
+
+	rc = dt_declare_insert(env, parent, (void *)fid, (void *)name, th);
+	if (rc)
+		GOTO(trans_stop, rc);
+
+	rc = dt_trans_start_local(env, ls->ls_osd, th);
+	if (rc)
+		GOTO(trans_stop, rc);
+
+	/* execution phase: re-check existence now that the lock is held */
+	dt_write_lock(env, dto, 0);
+	if (dt_object_exists(dto))
+		GOTO(unlock, rc = 0);
+
+	CDEBUG(D_OTHER, "create new object "DFID"\n",
+	       PFID(lu_object_fid(&dto->do_lu)));
+	rc = local_object_create(env, los, dto, attr, dof, th);
+	if (rc)
+		GOTO(unlock, rc);
+	LASSERT(dt_object_exists(dto));
+
+	if (dti->dti_dof.dof_type == DFT_DIR) {
+		if (!dt_try_as_dir(env, dto))
+			GOTO(destroy, rc = -ENOTDIR);
+		/* Add "." and ".." for newly created dir */
+		rc = dt_insert(env, dto, (void *)fid, (void *)".", th,
+			       BYPASS_CAPA, 1);
+		if (rc)
+			GOTO(destroy, rc);
+		dt_ref_add(env, dto, th);
+		rc = dt_insert(env, dto, (void *)lu_object_fid(&parent->do_lu),
+			       (void *)"..", th, BYPASS_CAPA, 1);
+		if (rc)
+			GOTO(destroy, rc);
+	}
+
+	/* link the new object into the parent under the parent's lock */
+	dt_write_lock(env, parent, 0);
+	rc = dt_insert(env, parent, (const struct dt_rec *)fid,
+		       (const struct dt_key *)name, th, BYPASS_CAPA, 1);
+	if (dti->dti_dof.dof_type == DFT_DIR)
+		dt_ref_add(env, parent, th);
+	dt_write_unlock(env, parent);
+	if (rc)
+		GOTO(destroy, rc);
+destroy:
+	/* reached on success as well; only a failure destroys the object */
+	if (rc)
+		dt_destroy(env, dto, th);
+unlock:
+	dt_write_unlock(env, dto);
+trans_stop:
+	dt_trans_stop(env, ls->ls_osd, th);
+out:
+	if (rc) {
+		lu_object_put_nocache(env, &dto->do_lu);
+		dto = ERR_PTR(rc);
+	}
+	RETURN(dto);
+}
+
+/*
+ * Look up a local named file or directory in the parent directory and
+ * create it if the name does not exist yet.
+ *
+ * An existing name is opened through the local storage device of "los".
+ * For a missing name a new fid is generated from "los" and the object is
+ * created with the given mode; its type is derived from the mode.
+ *
+ * Returns the object, or ERR_PTR() of the lookup, fid generation or
+ * creation error.
+ */
+struct dt_object *local_file_find_or_create(const struct lu_env *env,
+					    struct local_oid_storage *los,
+					    struct dt_object *parent,
+					    const char *name, __u32 mode)
+{
+	struct dt_thread_info	*dti = dt_info(env);
+	int			 rc;
+
+	LASSERT(parent);
+
+	rc = dt_lookup_dir(env, parent, name, &dti->dti_fid);
+	if (rc == 0)
+		/* name is found, get the object */
+		return ls_locate(env, dt2ls_dev(los->los_dev), &dti->dti_fid);
+
+	if (rc != -ENOENT)
+		return ERR_PTR(rc);
+
+	rc = local_object_fid_generate(env, los, &dti->dti_fid);
+	if (rc < 0)
+		return ERR_PTR(rc);
+
+	/* create the object */
+	dti->dti_dof.dof_type	= dt_mode_to_dft(mode & S_IFMT);
+	dti->dti_attr.la_mode	= mode;
+	dti->dti_attr.la_valid	= LA_MODE;
+
+	return __local_file_create(env, &dti->dti_fid, los,
+				   dt2ls_dev(los->los_dev), parent, name,
+				   &dti->dti_attr, &dti->dti_dof);
+}
+EXPORT_SYMBOL(local_file_find_or_create);
+
+/*
+ * Look up a local named file or directory in the parent directory and
+ * create it with the caller-supplied "fid" if the name does not exist.
+ *
+ * An existing name is opened on "dt" with whatever fid the lookup returned.
+ * A missing name is created through a temporary reference on the local
+ * storage device of "dt"; the created object is then released and opened
+ * again on "dt" itself.
+ *
+ * Returns the object or an ERR_PTR() value.
+ */
+struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env,
+						     struct dt_device *dt,
+						     const struct lu_fid *fid,
+						     struct dt_object *parent,
+						     const char *name,
+						     __u32 mode)
+{
+	struct dt_thread_info	*dti = dt_info(env);
+	struct dt_object	*dto;
+	struct ls_device	*ls;
+	int			 rc;
+
+	LASSERT(parent);
+
+	rc = dt_lookup_dir(env, parent, name, &dti->dti_fid);
+	if (rc == 0)
+		return dt_locate(env, dt, &dti->dti_fid);
+
+	if (rc != -ENOENT)
+		return ERR_PTR(rc);
+
+	ls = ls_device_get(dt);
+	if (IS_ERR(ls))
+		return ERR_PTR(PTR_ERR(ls));
+
+	/* create the object */
+	dti->dti_dof.dof_type	= dt_mode_to_dft(mode & S_IFMT);
+	dti->dti_attr.la_mode	= mode;
+	dti->dti_attr.la_valid	= LA_MODE;
+	dto = __local_file_create(env, fid, NULL, ls, parent, name,
+				  &dti->dti_attr, &dti->dti_dof);
+	if (!IS_ERR(dto)) {
+		/* ls_device_put() will finalize the ls device, we
+		 * have to open the object in other device stack */
+		dti->dti_fid = dto->do_lu.lo_header->loh_fid;
+		lu_object_put_nocache(env, &dto->do_lu);
+		dto = dt_locate(env, dt, &dti->dti_fid);
+	}
+	ls_device_put(env, ls);
+
+	return dto;
+}
+EXPORT_SYMBOL(local_file_find_or_create_with_fid);
+
+/*
+ * Look up a local named index file in the parent directory and create it
+ * if the name does not exist yet.
+ *
+ * An existing name is opened through the local storage device of "los".
+ * For a missing name a new fid is generated from "los" and a DFT_INDEX
+ * object with index features "ft" is created.
+ *
+ * Returns the object, or ERR_PTR() of the lookup, fid generation or
+ * creation error.
+ */
+struct dt_object *local_index_find_or_create(const struct lu_env *env,
+					     struct local_oid_storage *los,
+					     struct dt_object *parent,
+					     const char *name, __u32 mode,
+					     const struct dt_index_features *ft)
+{
+	struct dt_thread_info	*dti = dt_info(env);
+	int			 rc;
+
+	LASSERT(parent);
+
+	rc = dt_lookup_dir(env, parent, name, &dti->dti_fid);
+	if (rc == 0)
+		/* name is found, get the object */
+		return ls_locate(env, dt2ls_dev(los->los_dev), &dti->dti_fid);
+
+	if (rc != -ENOENT)
+		return ERR_PTR(rc);
+
+	rc = local_object_fid_generate(env, los, &dti->dti_fid);
+	if (rc < 0)
+		return ERR_PTR(rc);
+
+	/* create the object */
+	dti->dti_dof.u.dof_idx.di_feat	= ft;
+	dti->dti_dof.dof_type		= DFT_INDEX;
+	dti->dti_attr.la_mode		= mode;
+	dti->dti_attr.la_valid		= LA_MODE;
+
+	return __local_file_create(env, &dti->dti_fid, los,
+				   dt2ls_dev(los->los_dev), parent, name,
+				   &dti->dti_attr, &dti->dti_dof);
+}
+EXPORT_SYMBOL(local_index_find_or_create);
+
+/*
+ * Look up a local named index file in the parent directory and create it
+ * with the caller-supplied "fid" if the name does not exist.
+ *
+ * An existing name must map to exactly "fid", otherwise ERR_PTR(-EINVAL) is
+ * returned.  A missing name is created as a DFT_INDEX object with features
+ * "ft" through a temporary reference on the local storage device of "dt";
+ * the created object is then released and opened again on "dt" itself.
+ *
+ * Returns the object or an ERR_PTR() value.
+ */
+struct dt_object *
+local_index_find_or_create_with_fid(const struct lu_env *env,
+				    struct dt_device *dt,
+				    const struct lu_fid *fid,
+				    struct dt_object *parent,
+				    const char *name, __u32 mode,
+				    const struct dt_index_features *ft)
+{
+	struct dt_thread_info	*dti = dt_info(env);
+	struct dt_object	*dto;
+	struct ls_device	*ls;
+	int			 rc;
+
+	LASSERT(parent);
+
+	rc = dt_lookup_dir(env, parent, name, &dti->dti_fid);
+	if (rc == 0) {
+		/* name is found, get the object */
+		if (lu_fid_eq(fid, &dti->dti_fid))
+			return dt_locate(env, dt, fid);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (rc != -ENOENT)
+		return ERR_PTR(rc);
+
+	ls = ls_device_get(dt);
+	if (IS_ERR(ls))
+		return ERR_PTR(PTR_ERR(ls));
+
+	/* create the object */
+	dti->dti_dof.u.dof_idx.di_feat	= ft;
+	dti->dti_dof.dof_type		= DFT_INDEX;
+	dti->dti_attr.la_mode		= mode;
+	dti->dti_attr.la_valid		= LA_MODE;
+	dto = __local_file_create(env, fid, NULL, ls, parent, name,
+				  &dti->dti_attr, &dti->dti_dof);
+	if (!IS_ERR(dto)) {
+		/* ls_device_put() will finalize the ls device, we
+		 * have to open the object in other device stack */
+		dti->dti_fid = dto->do_lu.lo_header->loh_fid;
+		lu_object_put_nocache(env, &dto->do_lu);
+		dto = dt_locate(env, dt, &dti->dti_fid);
+	}
+	ls_device_put(env, ls);
+
+	return dto;
+}
+EXPORT_SYMBOL(local_index_find_or_create_with_fid);
+
+/*
+ * Declare, in transaction "th", the steps local_object_unlink() performs:
+ * deleting "name" from parent "p", dropping a reference on child "c" and
+ * destroying "c".
+ *
+ * Stops at the first negative result and returns it; otherwise returns the
+ * result of the destroy declaration.
+ */
+static int local_object_declare_unlink(const struct lu_env *env,
+				       struct dt_device *dt,
+				       struct dt_object *p,
+				       struct dt_object *c, const char *name,
+				       struct thandle *th)
+{
+	int rc;
+
+	rc = dt_declare_delete(env, p, (const struct dt_key *)name, th);
+	if (rc >= 0)
+		rc = dt_declare_ref_del(env, c, th);
+	if (rc >= 0)
+		rc = dt_declare_destroy(env, c, th);
+
+	return rc;
+}
+
+/*
+ * Remove the entry "name" from directory "parent" and destroy the object it
+ * refers to, all within one local transaction on "dt".
+ *
+ * Returns 0 on success and also when "name" does not exist.  Any other
+ * lookup, locate, declaration or execution error is returned as a negative
+ * value.
+ *
+ * If dropping the object's reference fails after the name was already
+ * deleted, the name is inserted again as a rollback.  The error of the
+ * failed reference drop is what gets returned in that case, whether or not
+ * the rollback succeeds, so that a failed unlink is never reported as
+ * success.
+ */
+int local_object_unlink(const struct lu_env *env, struct dt_device *dt,
+			struct dt_object *parent, const char *name)
+{
+	struct dt_thread_info	*dti = dt_info(env);
+	struct dt_object	*dto;
+	struct thandle		*th;
+	int			 rc;
+	int			 rc2;
+
+	ENTRY;
+
+	rc = dt_lookup_dir(env, parent, name, &dti->dti_fid);
+	if (rc == -ENOENT)
+		RETURN(0);
+	else if (rc < 0)
+		RETURN(rc);
+
+	dto = dt_locate(env, dt, &dti->dti_fid);
+	if (unlikely(IS_ERR(dto)))
+		RETURN(PTR_ERR(dto));
+
+	th = dt_trans_create(env, dt);
+	if (IS_ERR(th))
+		GOTO(out, rc = PTR_ERR(th));
+
+	rc = local_object_declare_unlink(env, dt, parent, dto, name, th);
+	if (rc < 0)
+		GOTO(stop, rc);
+
+	rc = dt_trans_start_local(env, dt, th);
+	if (rc < 0)
+		GOTO(stop, rc);
+
+	dt_write_lock(env, dto, 0);
+	rc = dt_delete(env, parent, (struct dt_key *)name, th, BYPASS_CAPA);
+	if (rc < 0)
+		GOTO(unlock, rc);
+
+	rc = dt_ref_del(env, dto, th);
+	if (rc < 0) {
+		/* put the name back, but keep reporting the original error */
+		rc2 = dt_insert(env, parent,
+				(const struct dt_rec *)&dti->dti_fid,
+				(const struct dt_key *)name, th, BYPASS_CAPA,
+				1);
+		if (rc2 < 0)
+			CERROR("cannot restore name '%s' after failed unlink: rc = %d, rollback rc = %d\n",
+			       name, rc, rc2);
+		GOTO(unlock, rc);
+	}
+
+	rc = dt_destroy(env, dto, th);
+unlock:
+	dt_write_unlock(env, dto);
+stop:
+	dt_trans_stop(env, dt, th);
+out:
+	lu_object_put_nocache(env, &dto->do_lu);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(local_object_unlink);
+
+/*
+ * Search the OID storages of "ls" for the one serving sequence "seq".
+ * On a match its refcount is incremented and it is returned; NULL is
+ * returned when the sequence has no storage.  local_oid_storage_init()
+ * calls this with ls_los_mutex held.
+ */
+struct local_oid_storage *dt_los_find(struct ls_device *ls, __u64 seq)
+{
+	struct local_oid_storage *cur;
+
+	list_for_each_entry(cur, &ls->ls_los_list, los_list) {
+		if (cur->los_seq != seq)
+			continue;
+
+		atomic_inc(&cur->los_refcount);
+		return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Drop a reference taken by dt_los_find().  This must never be the last
+ * reference: only local_oid_storage_fini() should drop the refcount to
+ * zero, so reaching zero here is treated as a bug.
+ */
+void dt_los_put(struct local_oid_storage *los)
+{
+	if (!atomic_dec_and_test(&los->los_refcount))
+		return;
+
+	/* should never happen */
+	LBUG();
+}
+
+/**
+ * Initialize local OID storage for required sequence.
+ * That may be needed for services that use local files and require
+ * dynamic OID allocation for them.
+ *
+ * Per each sequence we have an object with 'first_fid' identifier
+ * containing the counter for OIDs of locally created files with that
+ * sequence.
+ *
+ * It is used now by llog subsystem and MGS for NID tables
+ *
+ * Function gets first_fid to create counter object.
+ * All dynamic fids will be generated with the same sequence and incremented
+ * OIDs
+ *
+ * Returned local_oid_storage is in-memory representation of OID storage
+ *
+ * If a storage for the sequence is already registered on the local storage
+ * device of "dev", it is reused with its refcount incremented.  Otherwise a
+ * new one is allocated; its counter object is looked up in the root
+ * directory under the name "seq-<sequence>-lastid" and created with
+ * next OID = OID of first_fid + 1 if it does not exist yet.
+ *
+ * Returns 0 with *los set on success, a non-zero error with *los == NULL
+ * otherwise.
+ *
+ * NOTE(review): when the counter object is found to exist only after the
+ * write locks are taken (GOTO(out_lock, rc = 0) below), "losd" has not been
+ * filled in, yet los_last_oid is still taken from it -- confirm whether this
+ * path can be reached.
+ */
+int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev,
+			   const struct lu_fid *first_fid,
+			   struct local_oid_storage **los)
+{
+	struct dt_thread_info	*dti = dt_info(env);
+	struct ls_device	*ls;
+	struct los_ondisk	 losd;
+	struct dt_object	*root = NULL;
+	struct dt_object	*o = NULL;
+	struct thandle		*th;
+	int			 rc;
+
+	ENTRY;
+
+	ls = ls_device_get(dev);
+	if (IS_ERR(ls))
+		RETURN(PTR_ERR(ls));
+
+	mutex_lock(&ls->ls_los_mutex);
+	*los = dt_los_find(ls, fid_seq(first_fid));
+	if (*los != NULL)
+		GOTO(out, rc = 0);
+
+	/* not found, then create */
+	OBD_ALLOC_PTR(*los);
+	if (*los == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	atomic_set(&(*los)->los_refcount, 1);
+	mutex_init(&(*los)->los_id_lock);
+	(*los)->los_dev = &ls->ls_top_dev;
+	/* the storage keeps its own reference on the device; it is dropped
+	 * again below on failure */
+	atomic_inc(&ls->ls_refcount);
+	list_add(&(*los)->los_list, &ls->ls_los_list);
+
+	rc = dt_root_get(env, dev, &dti->dti_fid);
+	if (rc)
+		GOTO(out_los, rc);
+
+	root = ls_locate(env, ls, &dti->dti_fid);
+	if (IS_ERR(root))
+		GOTO(out_los, rc = PTR_ERR(root));
+
+	/* initialize data allowing to generate new fids,
+	 * literally we need a sequence */
+	snprintf(dti->dti_buf, sizeof(dti->dti_buf), "seq-%Lx-lastid",
+		 fid_seq(first_fid));
+	rc = dt_lookup_dir(env, root, dti->dti_buf, &dti->dti_fid);
+	if (rc == -ENOENT)
+		dti->dti_fid = *first_fid;
+	else if (rc < 0)
+		GOTO(out_los, rc);
+
+	o = ls_locate(env, ls, &dti->dti_fid);
+	if (IS_ERR(o))
+		GOTO(out_los, rc = PTR_ERR(o));
+	LASSERT(fid_seq(&dti->dti_fid) == fid_seq(first_fid));
+	if (!dt_object_exists(o)) {
+		/* no counter object yet: create it, write the initial
+		 * record and link it into the root directory */
+		LASSERT(rc == -ENOENT);
+
+		th = dt_trans_create(env, dev);
+		if (IS_ERR(th))
+			GOTO(out_los, rc = PTR_ERR(th));
+
+		dti->dti_attr.la_valid = LA_MODE | LA_TYPE;
+		dti->dti_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+		dti->dti_dof.dof_type = dt_mode_to_dft(S_IFREG);
+
+		rc = dt_declare_create(env, o, &dti->dti_attr, NULL,
+				       &dti->dti_dof, th);
+		if (rc)
+			GOTO(out_trans, rc);
+
+		rc = dt_declare_insert(env, root,
+				       (const struct dt_rec *)&dti->dti_fid,
+				       (const struct dt_key *)dti->dti_buf,
+				       th);
+		if (rc)
+			GOTO(out_trans, rc);
+
+		rc = dt_declare_record_write(env, o, sizeof(losd), 0, th);
+		if (rc)
+			GOTO(out_trans, rc);
+
+		rc = dt_trans_start_local(env, dev, th);
+		if (rc)
+			GOTO(out_trans, rc);
+
+		dt_write_lock(env, root, 0);
+		dt_write_lock(env, o, 0);
+		/* re-check existence now that the locks are held */
+		if (dt_object_exists(o))
+			GOTO(out_lock, rc = 0);
+
+		rc = dt_create(env, o, &dti->dti_attr, NULL, &dti->dti_dof,
+			       th);
+		if (rc)
+			GOTO(out_lock, rc);
+
+		losd.lso_magic = cpu_to_le32(LOS_MAGIC);
+		losd.lso_next_oid = cpu_to_le32(fid_oid(first_fid) + 1);
+
+		dti->dti_off = 0;
+		dti->dti_lb.lb_buf = &losd;
+		dti->dti_lb.lb_len = sizeof(losd);
+		rc = dt_record_write(env, o, &dti->dti_lb, &dti->dti_off, th);
+		if (rc)
+			GOTO(out_lock, rc);
+		rc = dt_insert(env, root,
+			       (const struct dt_rec *)&dti->dti_fid,
+			       (const struct dt_key *)dti->dti_buf,
+			       th, BYPASS_CAPA, 1);
+		if (rc)
+			GOTO(out_lock, rc);
+out_lock:
+		dt_write_unlock(env, o);
+		dt_write_unlock(env, root);
+out_trans:
+		dt_trans_stop(env, dev, th);
+	} else {
+		/* counter object exists: load and validate its record */
+		dti->dti_off = 0;
+		dti->dti_lb.lb_buf = &losd;
+		dti->dti_lb.lb_len = sizeof(losd);
+		dt_read_lock(env, o, 0);
+		rc = dt_record_read(env, o, &dti->dti_lb, &dti->dti_off);
+		dt_read_unlock(env, o);
+		if (rc == 0 && le32_to_cpu(losd.lso_magic) != LOS_MAGIC) {
+			CERROR("local storage file "DFID" is corrupted\n",
+			       PFID(first_fid));
+			rc = -EINVAL;
+		}
+	}
+out_los:
+	if (root != NULL && !IS_ERR(root))
+		lu_object_put_nocache(env, &root->do_lu);
+
+	if (rc != 0) {
+		/* undo the registration done above */
+		list_del(&(*los)->los_list);
+		atomic_dec(&ls->ls_refcount);
+		OBD_FREE_PTR(*los);
+		*los = NULL;
+		if (o != NULL && !IS_ERR(o))
+			lu_object_put_nocache(env, &o->do_lu);
+	} else {
+		/* the storage keeps the reference on the counter object */
+		(*los)->los_seq = fid_seq(first_fid);
+		(*los)->los_last_oid = le32_to_cpu(losd.lso_next_oid);
+		(*los)->los_obj = o;
+	}
+out:
+	mutex_unlock(&ls->ls_los_mutex);
+	/* drop the reference taken by ls_device_get() above */
+	ls_device_put(env, ls);
+	return rc;
+}
+EXPORT_SYMBOL(local_oid_storage_init);
+
+/*
+ * Drop one reference on "los".  When the count reaches zero the counter
+ * object is released, the storage is unlinked from the device's list and
+ * freed, and the device reference held by the storage is dropped.
+ *
+ * The count is read again under ls_los_mutex before the teardown, so a
+ * reference taken through dt_los_find() after the decrement keeps the
+ * storage alive.
+ *
+ * NOTE(review): ls_device_put() is called even when that re-check finds the
+ * storage in use again, although the storage then still relies on its
+ * device reference -- confirm whether callers exclude this interleaving.
+ */
+void local_oid_storage_fini(const struct lu_env *env,
+			    struct local_oid_storage *los)
+{
+	struct ls_device *ls;
+
+	if (!atomic_dec_and_test(&los->los_refcount))
+		return;
+
+	LASSERT(env);
+	LASSERT(los->los_dev);
+	ls = dt2ls_dev(los->los_dev);
+
+	mutex_lock(&ls->ls_los_mutex);
+	if (atomic_read(&los->los_refcount) == 0) {
+		if (los->los_obj)
+			lu_object_put_nocache(env, &los->los_obj->do_lu);
+		list_del(&los->los_list);
+		OBD_FREE_PTR(los);
+	}
+	mutex_unlock(&ls->ls_los_mutex);
+	ls_device_put(env, ls);
+}
+EXPORT_SYMBOL(local_oid_storage_fini);
diff --git a/drivers/staging/lustre/lustre/obdclass/local_storage.h b/drivers/staging/lustre/lustre/obdclass/local_storage.h
new file mode 100644
index 000000000000..7c5c0bc855bd
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/local_storage.h
@@ -0,0 +1,76 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * lustre/obdclass/local_storage.h
+ *
+ * Local storage for file/objects with fid generation. Works on top of OSD.
+ *
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#include <dt_object.h>
+#include <obd.h>
+#include <lustre_fid.h>
+#include <lustre_disk.h>
+
+/*
+ * Local storage device: a dt_device placed on top of an OSD device.  One
+ * instance exists per OSD; it is shared through ls_refcount and found by
+ * ls_device_get().
+ */
+struct ls_device {
+	/* top device; dt2ls_dev() maps it back to the ls_device */
+	struct dt_device	 ls_top_dev;
+	/* all initialized ls_devices on this node linked by this */
+	struct list_head		 ls_linkage;
+	/* how many handles reference this local storage */
+	atomic_t		 ls_refcount;
+	/* underlying OSD device */
+	struct dt_device	*ls_osd;
+	/* list of all local OID storages */
+	struct list_head		 ls_los_list;
+	/* held while ls_los_list is searched or modified */
+	struct mutex		 ls_los_mutex;
+};
+
+/* Map a pointer to the embedded ls_top_dev back to its ls_device. */
+static inline struct ls_device *dt2ls_dev(struct dt_device *d)
+{
+	return container_of0(d, struct ls_device, ls_top_dev);
+}
+
+/* Local storage object: a dt_object bundled with its own object header. */
+struct ls_object {
+	/* header the object is initialized with in ls_object_alloc() */
+	struct lu_object_header	 ls_header;
+	/* the dt_object; lu2ls_obj() maps its do_lu back to the ls_object */
+	struct dt_object	 ls_obj;
+};
+
+/* Map a pointer to the embedded ls_obj.do_lu back to its ls_object. */
+static inline struct ls_object *lu2ls_obj(struct lu_object *o)
+{
+	return container_of0(o, struct ls_object, ls_obj.do_lu);
+}
+
+/*
+ * Locate the object identified by "fid" via dt_locate_at(), passing the
+ * underlying OSD of "ls" and the lu_device of the ls top device.
+ * Callers in local_storage.c check the result with IS_ERR().
+ */
+static inline struct dt_object *ls_locate(const struct lu_env *env,
+					  struct ls_device *ls,
+					  const struct lu_fid *fid)
+{
+	return dt_locate_at(env, ls->ls_osd, fid, &ls->ls_top_dev.dd_lu_dev);
+}
+
+/* get (find or create) / put a referenced local storage device for an OSD */
+struct ls_device *ls_device_get(struct dt_device *dev);
+void ls_device_put(const struct lu_env *env, struct ls_device *ls);
+/* find the referenced OID storage of a sequence / drop that reference */
+struct local_oid_storage *dt_los_find(struct ls_device *ls, __u64 seq);
+void dt_los_put(struct local_oid_storage *los);
diff --git a/drivers/staging/lustre/lustre/obdclass/lprocfs_jobstats.c b/drivers/staging/lustre/lustre/obdclass/lprocfs_jobstats.c
new file mode 100644
index 000000000000..7afc2ad4c8d5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lprocfs_jobstats.c
@@ -0,0 +1,575 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ * Use is subject to license terms.
+ *
+ * Author: Niu Yawei <niu@whamcloud.com>
+ */
+/*
+ * lustre/obdclass/lprocfs_jobstats.c
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_CLASS
+
+
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+
+#if defined(LPROCFS)
+
+/*
+ * JobID formats & JobID environment variable names for supported
+ * job schedulers:
+ *
+ * SLURM:
+ *   JobID format:  32 bit integer.
+ *   JobID env var: SLURM_JOB_ID.
+ * SGE:
+ *   JobID format:  Decimal integer range to 99999.
+ *   JobID env var: JOB_ID.
+ * LSF:
+ *   JobID format:  6 digit integer by default (up to 999999), can be
+ *		  increased to 10 digit (up to 2147483646).
+ *   JobID env var: LSB_JOBID.
+ * Loadleveler:
+ *   JobID format:  String of machine_name.cluster_id.process_id, for
+ *		  example: fr2n02.32.0
+ *   JobID env var: LOADL_STEP_ID.
+ * PBS:
+ *   JobID format:  String of sequence_number[.server_name][@server].
+ *   JobID env var: PBS_JOBID.
+ * Maui/MOAB:
+ *   JobID format:  Same as PBS.
+ *   JobID env var: Same as PBS.
+ */
+
+/* Statistics kept for one job ID of an obd_job_stats instance. */
+struct job_stat {
+	/* linkage into the ojs_hash of the owner, keyed by js_jobid */
+	struct hlist_node      js_hash;
+	/* linkage into the ojs_list of the owner */
+	struct list_head	    js_list;
+	/* reference count; job_putref() frees the entry at zero */
+	atomic_t	  js_refcount;
+	/* job ID string, the hash key */
+	char		  js_jobid[JOBSTATS_JOBID_SIZE];
+	/* refreshed each time an event is logged for the job */
+	time_t		js_timestamp; /* seconds */
+	/* counters of the job */
+	struct lprocfs_stats *js_stats;
+	/* owning obd_job_stats */
+	struct obd_job_stats *js_jobstats;
+};
+
+/* hs_hash method: djb2 hash over the NUL-terminated job ID string "key". */
+static unsigned job_stat_hash(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+	return cfs_hash_djb2_hash(key, strlen(key), mask);
+}
+
+/* hs_key method: the key of a hashed entry is its job ID string. */
+static void *job_stat_key(struct hlist_node *hnode)
+{
+	struct job_stat *entry = hlist_entry(hnode, struct job_stat, js_hash);
+
+	return entry->js_jobid;
+}
+
+/*
+ * hs_keycmp method: returns non-zero when the job ID of the hashed entry
+ * equals the NUL-terminated string "key", 0 otherwise.
+ */
+static int job_stat_keycmp(const void *key, struct hlist_node *hnode)
+{
+	struct job_stat *job = hlist_entry(hnode, struct job_stat, js_hash);
+
+	/* equal length plus equal prefix is plain string equality */
+	return strcmp(job->js_jobid, key) == 0;
+}
+
+/* hs_object method: map a hash node back to its job_stat entry. */
+static void *job_stat_object(struct hlist_node *hnode)
+{
+	return hlist_entry(hnode, struct job_stat, js_hash);
+}
+
+/* hs_get method: take a reference on the entry behind "hnode". */
+static void job_stat_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct job_stat *entry = hlist_entry(hnode, struct job_stat, js_hash);
+
+	atomic_inc(&entry->js_refcount);
+}
+
+/*
+ * Release an entry whose refcount has dropped to zero: unlink it from the
+ * owner's list under ojs_lock, then free its counters and the entry.
+ */
+static void job_free(struct job_stat *job)
+{
+	struct obd_job_stats *owner;
+
+	LASSERT(atomic_read(&job->js_refcount) == 0);
+	LASSERT(job->js_jobstats);
+
+	owner = job->js_jobstats;
+	write_lock(&owner->ojs_lock);
+	list_del_init(&job->js_list);
+	write_unlock(&owner->ojs_lock);
+
+	lprocfs_free_stats(&job->js_stats);
+	OBD_FREE_PTR(job);
+}
+
+/* Drop one reference on "job"; the last reference frees the entry. */
+static void job_putref(struct job_stat *job)
+{
+	LASSERT(atomic_read(&job->js_refcount) > 0);
+
+	if (!atomic_dec_and_test(&job->js_refcount))
+		return;
+
+	job_free(job);
+}
+
+/* hs_put_locked method: drop a reference on the entry behind "hnode". */
+static void job_stat_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct job_stat *entry = hlist_entry(hnode, struct job_stat, js_hash);
+
+	job_putref(entry);
+}
+
+/*
+ * hs_exit method: installed in job_stats_hash_ops; it does nothing with the
+ * node and only logs an error, since no entry is expected to reach it.
+ */
+static void job_stat_exit(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	/* terminate the message so it is not merged with the next log line */
+	CERROR("Should not have any items!\n");
+}
+
+/* cfs_hash methods for the per-job statistics hash (keyed by job ID). */
+static cfs_hash_ops_t job_stats_hash_ops = {
+	.hs_hash	= job_stat_hash,
+	.hs_keycmp	= job_stat_keycmp,
+	.hs_key		= job_stat_key,
+	.hs_object	= job_stat_object,
+	.hs_get		= job_stat_get,
+	.hs_put_locked	= job_stat_put_locked,
+	.hs_exit	= job_stat_exit,
+};
+
+/*
+ * Hash iteration callback used for cleanup.  "data" points to a time_t
+ * cutoff: an entry is removed from the hash when the cutoff is 0 or when
+ * the entry's timestamp is older than the cutoff.  Always returns 0.
+ */
+static int job_iter_callback(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			     struct hlist_node *hnode, void *data)
+{
+	struct job_stat *entry = hlist_entry(hnode, struct job_stat, js_hash);
+	time_t cutoff = *(time_t *)data;
+
+	if (cutoff == 0 || entry->js_timestamp < cutoff)
+		cfs_hash_bd_del_locked(hs, bd, hnode);
+
+	return 0;
+}
+
+/*
+ * Remove the entries that have not been updated within the last
+ * ojs_cleanup_interval seconds.
+ *
+ * Nothing is done when the interval is 0.  Unless "force" is set, the scan
+ * is also skipped while less than one interval has passed since the last
+ * cleanup.
+ */
+static void lprocfs_job_cleanup(struct obd_job_stats *stats, bool force)
+{
+	time_t now;
+	time_t cutoff;
+
+	if (stats->ojs_cleanup_interval == 0)
+		return;
+
+	now = cfs_time_current_sec();
+	if (force ||
+	    now >= stats->ojs_last_cleanup + stats->ojs_cleanup_interval) {
+		cutoff = now - stats->ojs_cleanup_interval;
+		cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback,
+				       &cutoff);
+		stats->ojs_last_cleanup = cfs_time_current_sec();
+	}
+}
+
+/*
+ * Allocate and initialize a job_stat entry for "jobid" owned by "jobs".
+ *
+ * The counters are allocated with ojs_cntr_num slots and set up through
+ * ojs_cntr_init_fn.  The entry starts with refcount 1, the current time as
+ * timestamp and unlinked hash/list nodes.
+ *
+ * "jobid" must be a NUL-terminated string; it is copied up to its
+ * terminator (at most JOBSTATS_JOBID_SIZE - 1 characters) and the rest of
+ * js_jobid is zero filled, so the source is never read past its terminator.
+ *
+ * Returns the new entry, or NULL if an allocation failed.
+ */
+static struct job_stat *job_alloc(char *jobid, struct obd_job_stats *jobs)
+{
+	struct job_stat *job;
+
+	LASSERT(jobs->ojs_cntr_num && jobs->ojs_cntr_init_fn);
+
+	OBD_ALLOC_PTR(job);
+	if (job == NULL)
+		return NULL;
+
+	job->js_stats = lprocfs_alloc_stats(jobs->ojs_cntr_num, 0);
+	if (job->js_stats == NULL) {
+		OBD_FREE_PTR(job);
+		return NULL;
+	}
+
+	jobs->ojs_cntr_init_fn(job->js_stats);
+
+	/* strncpy() stops at the terminator and zero pads the remainder */
+	strncpy(job->js_jobid, jobid, sizeof(job->js_jobid) - 1);
+	job->js_jobid[sizeof(job->js_jobid) - 1] = '\0';
+	job->js_timestamp = cfs_time_current_sec();
+	job->js_jobstats = jobs;
+	INIT_HLIST_NODE(&job->js_hash);
+	INIT_LIST_HEAD(&job->js_list);
+	atomic_set(&job->js_refcount, 1);
+
+	return job;
+}
+
+/*
+ * Account "amount" for counter "event" of job "jobid" on device "obd".
+ *
+ * Runs the periodic cleanup first, then looks the job up in the hash and
+ * creates and inserts a new entry if there is none.  The entry's timestamp
+ * is refreshed on every call.
+ *
+ * Returns 0 on success, -EINVAL if "jobid" is NULL, empty or does not fit
+ * into JOBSTATS_JOBID_SIZE including its terminator, -ENOMEM if a new entry
+ * cannot be allocated.
+ */
+int lprocfs_job_stats_log(struct obd_device *obd, char *jobid,
+			  int event, long amount)
+{
+	struct obd_job_stats *stats = &obd->u.obt.obt_jobstats;
+	struct job_stat *job, *job2;
+	ENTRY;
+
+	LASSERT(stats && stats->ojs_hash);
+
+	lprocfs_job_cleanup(stats, false);
+
+	if (!jobid || !strlen(jobid))
+		RETURN(-EINVAL);
+
+	if (strlen(jobid) >= JOBSTATS_JOBID_SIZE) {
+		CERROR("Invalid jobid size (%lu), expect(%d)\n",
+		       (unsigned long)strlen(jobid) + 1, JOBSTATS_JOBID_SIZE);
+		RETURN(-EINVAL);
+	}
+
+	/* presumably returns the entry with a reference taken through
+	 * hs_get; it is dropped by job_putref() at the end -- TODO confirm */
+	job = cfs_hash_lookup(stats->ojs_hash, jobid);
+	if (job)
+		goto found;
+
+	job = job_alloc(jobid, stats);
+	if (job == NULL)
+		RETURN(-ENOMEM);
+
+	job2 = cfs_hash_findadd_unique(stats->ojs_hash, job->js_jobid,
+				       &job->js_hash);
+	if (job2 != job) {
+		/* an entry for this job ID is already hashed: discard ours
+		 * and continue with the existing one */
+		job_putref(job);
+		job = job2;
+		/* We cannot LASSERT(!list_empty(&job->js_list)) here,
+		 * since we just lost the race for inserting "job" into the
+		 * ojs_list, and some other thread is doing it _right_now_.
+		 * Instead, be content the other thread is doing this, since
+		 * "job2" was initialized in job_alloc() already. LU-2163 */
+	} else {
+		LASSERT(list_empty(&job->js_list));
+		write_lock(&stats->ojs_lock);
+		list_add_tail(&job->js_list, &stats->ojs_list);
+		write_unlock(&stats->ojs_lock);
+	}
+
+found:
+	LASSERT(stats == job->js_jobstats);
+	LASSERT(stats->ojs_cntr_num > event);
+	job->js_timestamp = cfs_time_current_sec();
+	lprocfs_counter_add(job->js_stats, event, amount);
+
+	job_putref(job);
+	RETURN(0);
+}
+EXPORT_SYMBOL(lprocfs_job_stats_log);
+
+void lprocfs_job_stats_fini(struct obd_device *obd)
+{
+	struct obd_job_stats *jobstats = &obd->u.obt.obt_jobstats;
+	time_t cutoff = 0;	/* same cutoff the "clear" write path uses */
+
+	if (!jobstats->ojs_hash)	/* no hash: nothing to tear down */
+		return;
+	cfs_hash_for_each_safe(jobstats->ojs_hash, job_iter_callback, &cutoff);
+	cfs_hash_putref(jobstats->ojs_hash);
+	jobstats->ojs_hash = NULL;
+	LASSERT(list_empty(&jobstats->ojs_list));
+}
+EXPORT_SYMBOL(lprocfs_job_stats_fini);
+
+/* seq_file start: take ojs_lock (released in ->stop) and seek to *pos. */
+static void *lprocfs_jobstats_seq_start(struct seq_file *p, loff_t *pos)
+{
+	struct obd_job_stats *jobstats = p->private;
+	struct job_stat *cur;
+	loff_t skip;
+
+	read_lock(&jobstats->ojs_lock);
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+	skip = *pos - 1;	/* position 0 is the header token */
+	list_for_each_entry(cur, &jobstats->ojs_list, js_list)
+		if (skip-- == 0)
+			return cur;
+	return NULL;
+}
+
+static void lprocfs_jobstats_seq_stop(struct seq_file *p, void *v)
+{
+	struct obd_job_stats *jobstats = p->private;
+
+	read_unlock(&jobstats->ojs_lock);	/* taken in ->start */
+}
+
+/* seq_file next: step from the header token or current job to the next job. */
+static void *lprocfs_jobstats_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+	struct obd_job_stats *jobstats = p->private;
+	struct list_head *prev;
+
+	++*pos;
+	/* the header token has no list node: its successor is the first job */
+	if (v == SEQ_START_TOKEN)
+		prev = &jobstats->ojs_list;
+	else
+		prev = &((struct job_stat *)v)->js_list;
+
+	if (prev->next == &jobstats->ojs_list)
+		return NULL;
+	return list_entry(prev->next, struct job_stat, js_list);
+}
+
+/*
+ * Example of output on MDT:
+ *
+ * job_stats:
+ * - job_id:	test_id.222.25844
+ *   snapshot_time: 1322494486
+ *   open:	  { samples:	       3, unit: reqs }
+ *   close:	 { samples:	       3, unit: reqs }
+ *   mknod:	 { samples:	       0, unit: reqs }
+ *   link:	  { samples:	       0, unit: reqs }
+ *   unlink:	{ samples:	       0, unit: reqs }
+ *   mkdir:	 { samples:	       0, unit: reqs }
+ *   rmdir:	 { samples:	       0, unit: reqs }
+ *   rename:	{ samples:	       1, unit: reqs }
+ *   getattr:       { samples:	       7, unit: reqs }
+ *   setattr:       { samples:	       0, unit: reqs }
+ *   getxattr:      { samples:	       0, unit: reqs }
+ *   setxattr:      { samples:	       0, unit: reqs }
+ *   statfs:	{ samples:	       0, unit: reqs }
+ *   sync:	  { samples:	       0, unit: reqs }
+ *
+ * Example of output on OST:
+ *
+ * job_stats:
+ * - job_id:	4854
+ *   snapshot_time: 1322494602
+ *   read:	  { samples:  0, unit: bytes, min:  0, max:  0, sum:  0 }
+ *   write:	 { samples:  1, unit: bytes, min: 10, max: 10, sum: 10 }
+ *   setattr:       { samples:  0, unit: reqs }
+ *   punch:	 { samples:  0, unit: reqs }
+ *   sync:	  { samples:  0, unit: reqs }
+ */
+
+static const char spaces[] = "                    "; /* blanks for "%.*s" */
+
+static inline int width(const char *str, int len)
+{
+	return len - min((int)strlen(str), 15);	/* pad left to reach len */
+}
+
+/* seq_file show: emit the "job_stats:" header or one job with its counters. */
+static int lprocfs_jobstats_seq_show(struct seq_file *p, void *v)
+{
+	struct job_stat			*job = v;
+	struct lprocfs_stats		*stats;
+	struct lprocfs_counter		agg;
+	struct lprocfs_counter		*cntr;
+	struct lprocfs_counter_header	*hdr;
+	int				idx;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(p, "job_stats:\n");
+		return 0;
+	}
+
+	seq_printf(p, "- %-16s %s\n", "job_id:", job->js_jobid);
+	seq_printf(p, "  %-16s %ld\n", "snapshot_time:", job->js_timestamp);
+
+	stats = job->js_stats;
+	for (idx = 0; idx < stats->ls_num; idx++) {
+		cntr = lprocfs_stats_counter_get(stats, 0, idx);
+		hdr = &stats->ls_cnt_header[idx];
+		lprocfs_stats_collect(stats, idx, &agg);
+		/* a counter without samples reports all-zero aggregates */
+		if (agg.lc_count == 0) {
+			agg.lc_min = 0;
+			agg.lc_max = 0;
+			agg.lc_sum = 0;
+			agg.lc_sumsquare = 0;
+		}
+		seq_printf(p, "  %s:%.*s { samples: %11"LPF64"u",
+			   hdr->lc_name, width(hdr->lc_name, 15), spaces,
+			   agg.lc_count);
+		if (hdr->lc_units[0] != '\0')
+			seq_printf(p, ", unit: %5s", hdr->lc_units);
+
+		if (hdr->lc_config & LPROCFS_CNTR_AVGMINMAX)
+			seq_printf(p, ", min:%8"LPF64"u, max:%8"LPF64"u,"
+				   " sum:%16"LPF64"u",
+				   agg.lc_min, agg.lc_max, agg.lc_sum);
+		if (hdr->lc_config & LPROCFS_CNTR_STDDEV)
+			seq_printf(p, ", sumsq: %18"LPF64"u", agg.lc_sumsquare);
+
+		seq_printf(p, " }\n");
+	}
+	return 0;
+}
+
+struct seq_operations lprocfs_jobstats_seq_sops = {
+	.start = lprocfs_jobstats_seq_start,	/* takes ojs_lock for reading */
+	.stop  = lprocfs_jobstats_seq_stop,	/* drops ojs_lock */
+	.next  = lprocfs_jobstats_seq_next,
+	.show  = lprocfs_jobstats_seq_show,
+};
+
+/* Open job_stats: start a seq_file and hand it the entry's private data. */
+static int lprocfs_jobstats_seq_open(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *pde = PDE(inode);
+	int err;
+
+	if (LPROCFS_ENTRY_AND_CHECK(pde))
+		return -ENOENT;
+
+	err = seq_open(file, &lprocfs_jobstats_seq_sops);
+	if (err != 0) {
+		LPROCFS_EXIT();
+		return err;
+	}
+	/* seq_open() stored the new seq_file in file->private_data */
+	((struct seq_file *)file->private_data)->private = pde->data;
+	return 0;
+}
+
+/*
+ * Write handler for job_stats: text starting with "clear" drops every job,
+ * anything else names the one job to drop.  @buf points to user space, so it
+ * is copied in before use.  Returns @len, -EINVAL or -EFAULT.
+ */
+static ssize_t lprocfs_jobstats_seq_write(struct file *file, const char *buf,
+					  size_t len, loff_t *off)
+{
+	struct seq_file *seq = file->private_data;
+	struct obd_job_stats *stats = seq->private;
+	char jobid[JOBSTATS_JOBID_SIZE] = "";
+	size_t copied = min_t(size_t, len, JOBSTATS_JOBID_SIZE - 1);
+	struct job_stat *job;
+
+	if (len == 0)
+		return -EINVAL;
+	if (copy_from_user(jobid, buf, copied))
+		return -EFAULT;
+
+	LASSERT(stats->ojs_hash);
+	if (!strncmp(jobid, "clear", strlen("clear"))) {
+		time_t oldest = 0;
+		cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback,
+				       &oldest);
+		return len;
+	}
+	if (len >= JOBSTATS_JOBID_SIZE)
+		return -EINVAL;
+	if (jobid[len - 1] == '\n')	/* Trim '\n' if any */
+		jobid[len - 1] = '\0';
+	if (!strlen(jobid))
+		return -EINVAL;
+
+	job = cfs_hash_lookup(stats->ojs_hash, jobid);
+	if (!job)
+		return -EINVAL;
+
+	cfs_hash_del_key(stats->ojs_hash, jobid);
+
+	job_putref(job);
+	return len;
+}
+
+struct file_operations lprocfs_jobstats_seq_fops = {
+	.owner   = THIS_MODULE,
+	.open    = lprocfs_jobstats_seq_open,
+	.read    = seq_read,
+	.write   = lprocfs_jobstats_seq_write,	/* "clear" or a job id to drop */
+	.llseek  = seq_lseek,
+	.release = lprocfs_seq_release,		/* LPROCFS_EXIT + seq_release */
+};
+
+/*
+ * Set up per-job statistics for an MDT or OST device: create the job hash
+ * and the "job_stats" proc file.  Returns 0, -EINVAL or -ENOMEM.
+ */
+int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num,
+			   cntr_init_callback init_fn)
+{
+	struct proc_dir_entry *entry;
+	struct obd_job_stats *stats = &obd->u.obt.obt_jobstats;
+	ENTRY;
+
+	LASSERT(obd->obd_proc_entry != NULL);
+	LASSERT(obd->obd_type->typ_name);
+
+	if (strcmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME) != 0 &&
+	    strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME) != 0) {
+		CERROR("Invalid obd device type.\n");
+		RETURN(-EINVAL);
+	}
+
+	LASSERT(stats->ojs_hash == NULL);
+	stats->ojs_hash = cfs_hash_create("JOB_STATS", HASH_JOB_STATS_CUR_BITS,
+					  HASH_JOB_STATS_MAX_BITS,
+					  HASH_JOB_STATS_BKT_BITS, 0,
+					  CFS_HASH_MIN_THETA,
+					  CFS_HASH_MAX_THETA,
+					  &job_stats_hash_ops, CFS_HASH_DEFAULT);
+	if (stats->ojs_hash == NULL)
+		RETURN(-ENOMEM);
+
+	INIT_LIST_HEAD(&stats->ojs_list);
+	rwlock_init(&stats->ojs_lock);
+	stats->ojs_cntr_num = cntr_num;
+	stats->ojs_cntr_init_fn = init_fn;
+	stats->ojs_cleanup_interval = 600; /* 10 mins by default */
+	stats->ojs_last_cleanup = cfs_time_current_sec();
+
+	LPROCFS_WRITE_ENTRY();
+	entry = create_proc_entry("job_stats", 0644, obd->obd_proc_entry);
+	LPROCFS_WRITE_EXIT();
+	if (entry == NULL) {
+		lprocfs_job_stats_fini(obd);
+		RETURN(-ENOMEM);
+	}
+	entry->proc_fops = &lprocfs_jobstats_seq_fops;
+	entry->data = stats;
+	RETURN(0);
+}
+EXPORT_SYMBOL(lprocfs_job_stats_init);
+
+/* proc read: print the job cleanup interval of the obd_device in @data. */
+int lprocfs_rd_job_interval(char *page, char **start, off_t off,
+			    int count, int *eof, void *data)
+{
+	struct obd_device *dev = data;
+
+	LASSERT(dev != NULL);
+	*eof = 1;
+	return snprintf(page, count, "%d\n",
+			dev->u.obt.obt_jobstats.ojs_cleanup_interval);
+}
+EXPORT_SYMBOL(lprocfs_rd_job_interval);
+
+/* proc write: store the parsed interval for the obd_device in @data and
+ * force a cleanup pass.  Returns @count or the parse error. */
+int lprocfs_wr_job_interval(struct file *file, const char *buffer,
+			    unsigned long count, void *data)
+{
+	struct obd_device *dev = data;
+	struct obd_job_stats *jobstats;
+	int interval, err;
+
+	LASSERT(dev != NULL);
+	jobstats = &dev->u.obt.obt_jobstats;
+
+	err = lprocfs_write_helper(buffer, count, &interval);
+	if (err != 0)
+		return err;
+
+	jobstats->ojs_cleanup_interval = interval;
+	lprocfs_job_cleanup(jobstats, true);
+	return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_job_interval);
+
+#endif /* LPROCFS*/
diff --git a/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c b/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c
new file mode 100644
index 000000000000..96e568f6757a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c
@@ -0,0 +1,2599 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lprocfs_status.c
+ *
+ * Author: Hariharan Thantry <thantry@users.sourceforge.net>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+
+#if defined(LPROCFS)
+
+static int lprocfs_no_percpu_stats = 0;
+CFS_MODULE_PARM(lprocfs_no_percpu_stats, "i", int, 0644,
+		"Do not alloc percpu data for lprocfs stats");
+
+#define MAX_STRING_SIZE 128	/* sizes the string buffers used in this file */
+
+/* for bug 10866, global variable (exported for use outside this file) */
+DECLARE_RWSEM(_lprocfs_lock);
+EXPORT_SYMBOL(_lprocfs_lock);
+
+int lprocfs_single_release(struct inode *inode, struct file *file)
+{
+	LPROCFS_EXIT();	/* NOTE(review): presumably pairs with an ENTRY at open */
+	return single_release(inode, file);
+}
+EXPORT_SYMBOL(lprocfs_single_release);
+
+int lprocfs_seq_release(struct inode *inode, struct file *file)
+{
+	LPROCFS_EXIT();	/* NOTE(review): presumably pairs with an ENTRY at open */
+	return seq_release(inode, file);
+}
+EXPORT_SYMBOL(lprocfs_seq_release);
+
+/* Return the child of @head called @name, or NULL if there is none.
+ * lprocfs_srch() wraps this in LPROCFS_SRCH_ENTRY()/LPROCFS_SRCH_EXIT(). */
+static struct proc_dir_entry *__lprocfs_srch(struct proc_dir_entry *head,
+					     const char *name)
+{
+	struct proc_dir_entry *child;
+
+	if (head == NULL)
+		return NULL;
+
+	/* children hang off ->subdir and are chained through ->next */
+	for (child = head->subdir; child != NULL; child = child->next)
+		if (strcmp(child->name, name) == 0)
+			break;
+
+	/* NULL here means the whole chain was walked without a match */
+	return child;
+}
+
+struct proc_dir_entry *lprocfs_srch(struct proc_dir_entry *head,
+				    const char *name)
+{
+	struct proc_dir_entry *temp;
+
+	LPROCFS_SRCH_ENTRY();	/* the lookup runs between SRCH entry/exit */
+	temp = __lprocfs_srch(head, name);	/* NULL when @name is not found */
+	LPROCFS_SRCH_EXIT();
+	return temp;
+}
+EXPORT_SYMBOL(lprocfs_srch);
+
+/* lprocfs API calls */
+
+/* snprintf()-like helper that also advances *page and adds to *len.  @end is
+   the total buffer size; nothing is written once *len has reached it.
+   vscnprintf() is used so that the count added is what was really stored,
+   keeping *page inside the buffer and *len below @end on truncation. */
+static int lprocfs_obd_snprintf(char **page, int end, int *len,
+				const char *format, ...)
+{
+	va_list args;
+	int written;
+
+	if (*len >= end)
+		return 0;
+
+	va_start(args, format);
+	written = vscnprintf(*page, end - *len, format, args);
+	va_end(args);
+	*page += written;
+	*len += written;
+	return written;
+}
+
+/* Create proc file @name under @root using the given callbacks, @data and
+   optional @fops.  Mode is 0644 with @fops, else 0444 if readable plus 0200
+   if writable.  Returns the entry or ERR_PTR(-EINVAL / -ENOMEM).  The error
+   message is newline-terminated so it cannot run into the next log line. */
+proc_dir_entry_t *lprocfs_add_simple(struct proc_dir_entry *root, char *name,
+				     read_proc_t *read_proc,
+				     write_proc_t *write_proc, void *data,
+				     struct file_operations *fops)
+{
+	proc_dir_entry_t *proc;
+	mode_t mode;
+
+	if (root == NULL || name == NULL)
+		return ERR_PTR(-EINVAL);
+	if (fops)
+		mode = 0644;
+	else
+		mode = (read_proc ? 0444 : 0) | (write_proc ? 0200 : 0);
+	LPROCFS_WRITE_ENTRY();
+	proc = create_proc_entry(name, mode, root);
+	if (!proc) {
+		CERROR("LprocFS: No memory to create /proc entry %s\n", name);
+		LPROCFS_WRITE_EXIT();
+		return ERR_PTR(-ENOMEM);
+	}
+	proc->read_proc = read_proc;
+	proc->write_proc = write_proc;
+	proc->data = data;
+	if (fops)
+		proc->proc_fops = fops;
+	LPROCFS_WRITE_EXIT();
+	return proc;
+}
+EXPORT_SYMBOL(lprocfs_add_simple);
+
+/* Create symlink @name under @parent pointing at the printf-style formatted
+ * target (at most MAX_STRING_SIZE - 1 characters).  Returns NULL on failure. */
+struct proc_dir_entry *lprocfs_add_symlink(const char *name,
+			struct proc_dir_entry *parent, const char *format, ...)
+{
+	struct proc_dir_entry *entry;
+	char *dest;
+	va_list ap;
+
+	if (parent == NULL || format == NULL)
+		return NULL;
+	OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1);
+	if (dest == NULL)
+		return NULL;
+
+	va_start(ap, format);
+	vsnprintf(dest, MAX_STRING_SIZE, format, ap);
+	va_end(ap);
+	entry = proc_symlink(name, parent, dest);
+	if (entry == NULL)
+		CERROR("LprocFS: Could not create symbolic link from %s to %s\n",
+			name, dest);
+
+	/* dest is freed here on both the success and the failure path */
+	OBD_FREE(dest, MAX_STRING_SIZE + 1);
+	return entry;
+}
+EXPORT_SYMBOL(lprocfs_add_symlink);
+
+/* read() for entries served by a legacy read_proc callback: the callback fills
+ * one page and up to @size bytes of it, starting at *ppos, go to user space.
+ * Returns the number of bytes copied, 0 at end of data, or a negative errno. */
+static ssize_t lprocfs_fops_read(struct file *f, char __user *buf,
+				 size_t size, loff_t *ppos)
+{
+	struct proc_dir_entry *dp = PDE(f->f_dentry->d_inode);
+	char *page, *start = NULL;
+	int rc = 0, eof = 1, count;
+
+	if (*ppos >= PAGE_CACHE_SIZE)
+		return 0;
+	page = (char *)__get_free_page(GFP_KERNEL);
+	if (page == NULL)
+		return -ENOMEM;
+
+	if (LPROCFS_ENTRY_AND_CHECK(dp)) {
+		rc = -ENOENT;
+		goto out;
+	}
+	OBD_FAIL_TIMEOUT(OBD_FAIL_LPROC_REMOVE, 10);
+	if (dp->read_proc)
+		rc = dp->read_proc(page, &start, *ppos, PAGE_CACHE_SIZE,
+				   &eof, dp->data);
+	LPROCFS_EXIT();
+	if (rc <= 0)
+		goto out;
+	/* for lustre proc read, the read count must be less than PAGE_SIZE */
+	LASSERT(eof == 1);
+
+	if (start == NULL) {
+		rc -= *ppos;
+		if (rc <= 0) {
+			rc = 0;
+			goto out;
+		}
+		start = page + *ppos;
+	} else if (start < page) {
+		start = page;
+	}
+
+	count = (rc < size) ? rc : size;
+	if (copy_to_user(buf, start, count)) {
+		rc = -EFAULT;
+	} else {
+		*ppos += count;
+		rc = count;	/* report what was copied, not what was available */
+	}
+out:
+	free_page((unsigned long)page);
+	return rc;
+}
+
+/* write() for legacy entries: forward to write_proc, -EIO when there is none. */
+static ssize_t lprocfs_fops_write(struct file *f, const char __user *buf,
+				  size_t size, loff_t *ppos)
+{
+	struct proc_dir_entry *pde = PDE(f->f_dentry->d_inode);
+	int ret;
+
+	if (LPROCFS_ENTRY_AND_CHECK(pde))
+		return -ENOENT;
+	ret = pde->write_proc ? pde->write_proc(f, buf, size, pde->data) : -EIO;
+	LPROCFS_EXIT();
+	return ret;
+}
+
+static struct file_operations lprocfs_generic_fops = {
+	.owner = THIS_MODULE,
+	.read = lprocfs_fops_read,	/* dispatches to the entry's read_proc */
+	.write = lprocfs_fops_write,	/* dispatches to the entry's write_proc */
+};
+
+/* open: count one more eviction in progress on the entry's obd_device. */
+int lprocfs_evict_client_open(struct inode *inode, struct file *f)
+{
+	struct obd_device *dev = PDE(f->f_dentry->d_inode)->data;
+
+	atomic_inc(&dev->obd_evict_inprogress);
+
+	return 0;
+}
+
+/* release: drop the in-progress count taken at open and wake any waiter. */
+int lprocfs_evict_client_release(struct inode *inode, struct file *f)
+{
+	struct obd_device *dev = PDE(f->f_dentry->d_inode)->data;
+
+	atomic_dec(&dev->obd_evict_inprogress);
+	wake_up(&dev->obd_evict_inprogress_waitq);
+
+	return 0;
+}
+
+struct file_operations lprocfs_evict_client_fops = {
+	.owner = THIS_MODULE,
+	.read = lprocfs_fops_read,
+	.write = lprocfs_fops_write,
+	.open = lprocfs_evict_client_open,	/* obd_evict_inprogress++ */
+	.release = lprocfs_evict_client_release, /* obd_evict_inprogress--, wake */
+};
+EXPORT_SYMBOL(lprocfs_evict_client_fops);
+
+/**
+ * Add /proc entries.
+ *
+ * \param root [in]  The parent proc entry on which new entry will be added.
+ * \param list [in]  Array of proc entries to be added, ended by a NULL name.
+ *		   Names may contain "/"; missing directories are created.
+ * \param data [in]  Passed to the read/write routines of every entry that
+ *		   does not carry its own data pointer.
+ *
+ * \retval 0 on success, -EINVAL on NULL arguments, -ENOMEM otherwise
+ */
+int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list,
+		     void *data)
+{
+	int rc = 0;
+
+	if (root == NULL || list == NULL)
+		return -EINVAL;
+
+	LPROCFS_WRITE_ENTRY();
+	while (list->name != NULL) {
+		struct proc_dir_entry *cur_root, *proc;
+		char *pathcopy, *cur, *next, pathbuf[64];
+		int pathsize = strlen(list->name) + 1;
+
+		proc = NULL;
+		cur_root = root;
+
+		/* need copy of path for strsep */
+		if (pathsize > (int)sizeof(pathbuf)) {
+			OBD_ALLOC(pathcopy, pathsize);
+			if (pathcopy == NULL)
+				GOTO(out, rc = -ENOMEM);
+		} else {
+			pathcopy = pathbuf;
+		}
+
+		next = pathcopy;
+		strcpy(pathcopy, list->name);
+
+		while (cur_root != NULL && (cur = strsep(&next, "/"))) {
+			if (*cur == '\0') /* skip double/trailing "/" */
+				continue;
+
+			proc = __lprocfs_srch(cur_root, cur);
+			CDEBUG(D_OTHER, "cur_root=%s, cur=%s, next=%s, (%s)\n",
+			       cur_root->name, cur, next,
+			       (proc ? "exists" : "new"));
+			if (next != NULL) {
+				cur_root = (proc ? proc :
+					    proc_mkdir(cur, cur_root));
+			} else if (proc == NULL) {
+				mode_t mode = list->proc_mode;
+
+				/* no explicit mode: derive it from callbacks */
+				if (mode == 0000) {
+					if (list->read_fptr)
+						mode = 0444;
+					if (list->write_fptr)
+						mode |= 0200;
+				}
+				proc = create_proc_entry(cur, mode, cur_root);
+			}
+		}
+
+		if (pathcopy != pathbuf)
+			OBD_FREE(pathcopy, pathsize);
+
+		if (cur_root == NULL || proc == NULL) {
+			CERROR("LprocFS: No memory to create /proc entry %s\n",
+			       list->name);
+			GOTO(out, rc = -ENOMEM);
+		}
+
+		if (list->fops)
+			proc->proc_fops = list->fops;
+		else
+			proc->proc_fops = &lprocfs_generic_fops;
+		proc->read_proc = list->read_fptr;
+		proc->write_proc = list->write_fptr;
+		proc->data = (list->data ? list->data : data);
+		list++;
+	}
+out:
+	LPROCFS_WRITE_EXIT();
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_add_vars);
+
+/* Remove *rooth and everything below it, leaf first; *rooth becomes NULL. */
+void lprocfs_remove_nolock(struct proc_dir_entry **rooth)
+{
+	struct proc_dir_entry *root = *rooth;
+	struct proc_dir_entry *cur = root;
+	struct proc_dir_entry *victim;
+	struct proc_dir_entry *stop;
+
+	if (root == NULL)
+		return;
+	*rooth = NULL;
+
+	stop = root->parent;
+	LASSERT(stop != NULL);
+
+	do {
+		/* descend along ->subdir until an entry without children */
+		while (cur->subdir != NULL)
+			cur = cur->subdir;
+
+		victim = cur;
+		cur = cur->parent;
+
+		/* Memory corruption once caused this to fail, and
+		   without this LASSERT we would loop here forever. */
+		LASSERTF(strlen(victim->name) == victim->namelen,
+			 "0x%p  %s/%s len %d\n", victim, cur->name,
+			 victim->name, (int)strlen(victim->name));
+
+		remove_proc_entry(victim->name, cur);
+	} while (cur != stop);
+}
+
+void lprocfs_remove(struct proc_dir_entry **rooth)
+{
+	LPROCFS_WRITE_ENTRY(); /* search vs remove race */
+	lprocfs_remove_nolock(rooth);	/* removes the subtree, NULLs *rooth */
+	LPROCFS_WRITE_EXIT();
+}
+EXPORT_SYMBOL(lprocfs_remove);
+
+void lprocfs_remove_proc_entry(const char *name, struct proc_dir_entry *parent)
+{
+	LASSERT(parent != NULL);	/* a parent entry is mandatory here */
+	remove_proc_entry(name, parent);
+}
+EXPORT_SYMBOL(lprocfs_remove_proc_entry);
+
+/*
+ * Remove the child @name of @parent together with its subtree, but only if
+ * every entry directly below it is called "num_refs".  A child holding
+ * anything else is left in place, and a missing child is silently ignored.
+ * The lookup and the removal both run under LPROCFS_WRITE_ENTRY().
+ */
+void lprocfs_try_remove_proc_entry(const char *name,
+				   struct proc_dir_entry *parent)
+{
+	struct proc_dir_entry	*target;
+	struct proc_dir_entry	*child;
+	int			 len, busy = 0;
+
+	LASSERT(parent != NULL);
+	len = strlen(name);
+
+	LPROCFS_WRITE_ENTRY();
+
+	/* lookup target name */
+	for (target = parent->subdir; target != NULL; target = target->next) {
+		if (target->namelen == len &&
+		    memcmp(name, target->name, len) == 0)
+			break;
+	}
+
+	/* verify it's empty: do not count "num_refs" */
+	child = NULL;
+	if (target != NULL)
+		child = target->subdir;
+	for (; child != NULL; child = child->next) {
+		/* anything not named "num_refs" counts as content */
+		if (child->namelen != strlen("num_refs") ||
+		    memcmp("num_refs", child->name, strlen("num_refs"))) {
+			busy = 1;
+			break;
+		}
+	}
+
+	/* a NULL target makes lprocfs_remove_nolock() a no-op */
+	if (busy == 0)
+		lprocfs_remove_nolock(&target);
+
+	LPROCFS_WRITE_EXIT();
+}
+EXPORT_SYMBOL(lprocfs_try_remove_proc_entry);
+
+/* Create directory @name under @parent and fill it from @list, if given.
+ * Returns the new entry (NULL if proc_mkdir() failed) or an ERR_PTR. */
+struct proc_dir_entry *lprocfs_register(const char *name,
+					struct proc_dir_entry *parent,
+					struct lprocfs_vars *list, void *data)
+{
+	struct proc_dir_entry *dir;
+	int rc;
+
+	if (lprocfs_srch(parent, name) != NULL) {
+		CERROR(" Lproc: Attempting to register %s more than once \n",
+		       name);
+		return ERR_PTR(-EALREADY);
+	}
+	dir = proc_mkdir(name, parent);
+	if (dir == NULL || list == NULL)
+		return dir;
+	rc = lprocfs_add_vars(dir, list, data);
+	if (rc == 0)
+		return dir;
+	lprocfs_remove(&dir);
+	return ERR_PTR(rc);
+}
+EXPORT_SYMBOL(lprocfs_register);
+
+/* Generic callbacks */
+int lprocfs_rd_uint(char *page, char **start, off_t off,
+		    int count, int *eof, void *data)
+{
+	/* @data points at the unsigned int to print; *eof is left untouched */
+	return snprintf(page, count, "%u\n", *(unsigned int *)data);
+}
+EXPORT_SYMBOL(lprocfs_rd_uint);
+
+/* proc write: parse a number into the unsigned int at @data.  Only the @count
+ * bytes the writer supplied (capped at MAX_STRING_SIZE) are copied in. */
+int lprocfs_wr_uint(struct file *file, const char *buffer,
+		    unsigned long count, void *data)
+{
+	unsigned *p = data;
+	char dummy[MAX_STRING_SIZE + 1], *end;
+	unsigned long tmp, len = min_t(unsigned long, count, MAX_STRING_SIZE);
+
+	if (copy_from_user(dummy, buffer, len))
+		return -EFAULT;
+	dummy[len] = '\0';
+	tmp = simple_strtoul(dummy, &end, 0);
+	if (dummy == end)
+		return -EINVAL;
+	*p = (unsigned int)tmp;
+	return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_uint);
+
+int lprocfs_rd_u64(char *page, char **start, off_t off,
+		   int count, int *eof, void *data)
+{
+	LASSERT(data != NULL);	/* @data is read as a __u64 below */
+	*eof = 1;
+	return snprintf(page, count, LPU64"\n", *(__u64 *)data);
+}
+EXPORT_SYMBOL(lprocfs_rd_u64);
+
+int lprocfs_rd_atomic(char *page, char **start, off_t off,
+		   int count, int *eof, void *data)
+{
+	atomic_t *atom = data;	/* @data is the atomic_t to print */
+	LASSERT(atom != NULL);
+	*eof = 1;
+	return snprintf(page, count, "%d\n", atomic_read(atom));
+}
+EXPORT_SYMBOL(lprocfs_rd_atomic);
+
+/* proc write: set the atomic_t at @data to the parsed value, which must be
+ * positive.  Returns @count, the parse error, or -ERANGE. */
+int lprocfs_wr_atomic(struct file *file, const char *buffer,
+		      unsigned long count, void *data)
+{
+	atomic_t *target = data;
+	int parsed = 0;
+	int err = lprocfs_write_helper(buffer, count, &parsed);
+
+	if (err < 0)
+		return err;
+	if (parsed <= 0)
+		return -ERANGE;
+
+	atomic_set(target, parsed);
+	return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_atomic);
+
+int lprocfs_rd_uuid(char *page, char **start, off_t off, int count,
+		    int *eof, void *data)
+{
+	struct obd_device *obd = data;	/* @data is the device to describe */
+
+	LASSERT(obd != NULL);
+	*eof = 1;
+	return snprintf(page, count, "%s\n", obd->obd_uuid.uuid);
+}
+EXPORT_SYMBOL(lprocfs_rd_uuid);
+
+int lprocfs_rd_name(char *page, char **start, off_t off, int count,
+		    int *eof, void *data)
+{
+	struct obd_device *dev = data;	/* @data is the device to describe */
+
+	LASSERT(dev != NULL);
+	*eof = 1;
+	return snprintf(page, count, "%s\n", dev->obd_name);
+}
+EXPORT_SYMBOL(lprocfs_rd_name);
+
+/* proc read: print the block size reported by obd_statfs() for @data. */
+int lprocfs_rd_blksize(char *page, char **start, off_t off, int count,
+		       int *eof, void *data)
+{
+	struct obd_device *dev = data;
+	struct obd_statfs  sfs;
+	int err = obd_statfs(NULL, dev->obd_self_export, &sfs,
+			     cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+			     OBD_STATFS_NODELAY);
+	if (err)
+		return err;
+	*eof = 1;
+	return snprintf(page, count, "%u\n", sfs.os_bsize);
+}
+EXPORT_SYMBOL(lprocfs_rd_blksize);
+
+/* proc read: print os_blocks from obd_statfs() scaled to kbytes. */
+int lprocfs_rd_kbytestotal(char *page, char **start, off_t off, int count,
+			   int *eof, void *data)
+{
+	struct obd_device *dev = data;
+	struct obd_statfs  sfs;
+	__u32 kb_per_blk;
+	__u64 kbytes;
+	int err = obd_statfs(NULL, dev->obd_self_export, &sfs,
+			     cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+			     OBD_STATFS_NODELAY);
+	if (err)
+		return err;
+	/* double the count once per doubling of (block size >> 10) */
+	kbytes = sfs.os_blocks;
+	for (kb_per_blk = sfs.os_bsize >> 10; kb_per_blk >>= 1; )
+		kbytes <<= 1;
+	*eof = 1;
+	return snprintf(page, count, LPU64"\n", kbytes);
+}
+EXPORT_SYMBOL(lprocfs_rd_kbytestotal);
+
+/* proc read: print os_bfree from obd_statfs() scaled to kbytes. */
+int lprocfs_rd_kbytesfree(char *page, char **start, off_t off, int count,
+			  int *eof, void *data)
+{
+	struct obd_device *dev = data;
+	struct obd_statfs  sfs;
+	__u32 kb_per_blk;
+	__u64 kbytes;
+	int err = obd_statfs(NULL, dev->obd_self_export, &sfs,
+			     cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+			     OBD_STATFS_NODELAY);
+	if (err)
+		return err;
+	/* double the count once per doubling of (block size >> 10) */
+	kbytes = sfs.os_bfree;
+	for (kb_per_blk = sfs.os_bsize >> 10; kb_per_blk >>= 1; )
+		kbytes <<= 1;
+	*eof = 1;
+	return snprintf(page, count, LPU64"\n", kbytes);
+}
+EXPORT_SYMBOL(lprocfs_rd_kbytesfree);
+
+/* proc read: print os_bavail from obd_statfs() scaled to kbytes. */
+int lprocfs_rd_kbytesavail(char *page, char **start, off_t off, int count,
+			   int *eof, void *data)
+{
+	struct obd_device *dev = data;
+	struct obd_statfs  sfs;
+	__u32 kb_per_blk;
+	__u64 kbytes;
+	int err = obd_statfs(NULL, dev->obd_self_export, &sfs,
+			     cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+			     OBD_STATFS_NODELAY);
+	if (err)
+		return err;
+	/* double the count once per doubling of (block size >> 10) */
+	kbytes = sfs.os_bavail;
+	for (kb_per_blk = sfs.os_bsize >> 10; kb_per_blk >>= 1; )
+		kbytes <<= 1;
+	*eof = 1;
+	return snprintf(page, count, LPU64"\n", kbytes);
+}
+EXPORT_SYMBOL(lprocfs_rd_kbytesavail);
+
+/* proc read: print os_files as reported by obd_statfs() for @data. */
+int lprocfs_rd_filestotal(char *page, char **start, off_t off, int count,
+			  int *eof, void *data)
+{
+	struct obd_device *dev = data;
+	struct obd_statfs  sfs;
+	int err = obd_statfs(NULL, dev->obd_self_export, &sfs,
+			     cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+			     OBD_STATFS_NODELAY);
+	if (err)
+		return err;
+	*eof = 1;
+
+	return snprintf(page, count, LPU64"\n", sfs.os_files);
+}
+EXPORT_SYMBOL(lprocfs_rd_filestotal);
+
+/* proc read: print os_ffree as reported by obd_statfs() for @data. */
+int lprocfs_rd_filesfree(char *page, char **start, off_t off, int count,
+			 int *eof, void *data)
+{
+	struct obd_device *dev = data;
+	struct obd_statfs  sfs;
+	int err = obd_statfs(NULL, dev->obd_self_export, &sfs,
+			     cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+			     OBD_STATFS_NODELAY);
+	if (err)
+		return err;
+	*eof = 1;
+	return snprintf(page, count, LPU64"\n", sfs.os_ffree);
+}
+EXPORT_SYMBOL(lprocfs_rd_filesfree);
+
+/* proc read: print "<target>\t<import state>" plus "\tDEACTIVATED" when the
+ * import is deactivated, for the client obd_device in @data. */
+int lprocfs_rd_server_uuid(char *page, char **start, off_t off, int count,
+			   int *eof, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_import *imp;
+	int len;
+
+	LASSERT(obd != NULL);
+	LPROCFS_CLIMP_CHECK(obd);
+	imp = obd->u.cli.cl_import;
+	*eof = 1;
+	len = snprintf(page, count, "%s\t%s%s\n", obd2cli_tgt(obd),
+		       ptlrpc_import_state_name(imp->imp_state),
+		       imp->imp_deactive ? "\tDEACTIVATED" : "");
+
+	LPROCFS_CLIMP_EXIT(obd);
+	return len;
+}
+EXPORT_SYMBOL(lprocfs_rd_server_uuid);
+
+/* proc read: remote UUID of the import's connection, "<none>" without one. */
+int lprocfs_rd_conn_uuid(char *page, char **start, off_t off, int count,
+			 int *eof,  void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_import *imp;
+	struct ptlrpc_connection *conn;
+	int rc = 0;
+
+	LASSERT(obd != NULL);
+	LPROCFS_CLIMP_CHECK(obd);
+	imp = obd->u.cli.cl_import;	/* checked before it is dereferenced */
+	conn = imp != NULL ? imp->imp_connection : NULL;
+	*eof = 1;
+	if (conn != NULL)
+		rc = snprintf(page, count, "%s\n", conn->c_remote_uuid.uuid);
+	else
+		rc = snprintf(page, count, "%s\n", "<none>");
+
+	LPROCFS_CLIMP_EXIT(obd);
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_conn_uuid);
+
+/** add up the per-cpu copies of counter \a idx into \a cnt; a NULL
+ *  \a stats gives a zeroed \a cnt with lc_count set to 1 */
+void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx,
+			   struct lprocfs_counter *cnt)
+{
+	unsigned int			num_entry;
+	struct lprocfs_counter		*src;
+	struct lprocfs_counter_header	*cntr_header;
+	int				cpu;
+	unsigned long			flags = 0;
+
+	memset(cnt, 0, sizeof(*cnt));
+
+	if (stats == NULL) {
+		/* set count to 1 to avoid divide-by-zero errs in callers */
+		cnt->lc_count = 1;
+		return;
+	}
+
+	cnt->lc_min = LC_MIN_INIT;
+	num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags);
+
+	for (cpu = 0; cpu < num_entry; cpu++) {
+		if (stats->ls_percpu[cpu] == NULL)
+			continue;
+		cntr_header = &stats->ls_cnt_header[idx];
+		src = lprocfs_stats_counter_get(stats, cpu, idx);
+
+		if (src->lc_min < cnt->lc_min)
+			cnt->lc_min = src->lc_min;
+		if (src->lc_max > cnt->lc_max)
+			cnt->lc_max = src->lc_max;
+		cnt->lc_count += src->lc_count;
+		cnt->lc_sum += src->lc_sum;
+		cnt->lc_sumsquare += src->lc_sumsquare;
+	}
+
+	lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags);
+}
+EXPORT_SYMBOL(lprocfs_stats_collect);
+
+/**
+ * Append a ", " separated list of the import flags that are set to str.
+ */
+#define flag2str(flag) \
+	if (imp->imp_##flag && max - len > 0) \
+	     len += snprintf(str + len, max - len, "%s" #flag, len ? ", " : "");
+static int obd_import_flags2str(struct obd_import *imp, char *str, int max)
+{
+	int len = 0;
+
+	if (imp->imp_obd->obd_no_recov)
+		len += snprintf(str, max - len, "no_recov");
+
+	flag2str(invalid);
+	flag2str(deactive);
+	flag2str(replayable);
+	flag2str(pingable);
+	return len;
+}
+#undef flag2str
+
+static const char *obd_connect_names[] = {	/* entry i names flag bit i */
+	"read_only",
+	"lov_index",
+	"unused",
+	"write_grant",
+	"server_lock",
+	"version",
+	"request_portal",
+	"acl",
+	"xattr",
+	"create_on_write",
+	"truncate_lock",
+	"initial_transno",
+	"inode_bit_locks",
+	"join_file(obsolete)",
+	"getattr_by_fid",
+	"no_oh_for_devices",
+	"remote_client",
+	"remote_client_by_force",
+	"max_byte_per_rpc",
+	"64bit_qdata",
+	"mds_capability",
+	"oss_capability",
+	"early_lock_cancel",
+	"som",
+	"adaptive_timeouts",
+	"lru_resize",
+	"mds_mds_connection",
+	"real_conn",
+	"change_qunit_size",
+	"alt_checksum_algorithm",
+	"fid_is_enabled",
+	"version_recovery",
+	"pools",
+	"grant_shrink",
+	"skip_orphan",
+	"large_ea",
+	"full20",
+	"layout_lock",
+	"64bithash",
+	"object_max_bytes",
+	"imp_recov",
+	"jobstats",
+	"umask",
+	"einprogress",
+	"grant_param",
+	"flock_owner",
+	"lvb_type",
+	"nanoseconds_times",
+	"lightweight_conn",
+	"short_io",
+	"pingless",
+	"unknown",
+	NULL	/* terminator: obd_connect_flags2str() stops here */
+};
+
+/* Print the names of the bits set in @flags into @page, joined by @sep. */
+int obd_connect_flags2str(char *page, int count, __u64 flags, char *sep)
+{
+	__u64 bit = 1;
+	int i, len = 0;
+
+	for (i = 0; obd_connect_names[i] != NULL; i++, bit <<= 1)
+		if (flags & bit)
+			len += snprintf(page + len, count - len, "%s%s",
+					len ? sep : "", obd_connect_names[i]);
+	flags &= ~(bit - 1);	/* bits beyond the name table */
+	if (flags)
+		len += snprintf(page + len, count - len,
+				"%sunknown flags "LPX64, len ? sep : "", flags);
+	return len;
+}
+EXPORT_SYMBOL(obd_connect_flags2str);
+
+/*
+ * Legacy procfs read handler: format an indented "key: value" status report
+ * for the client import of the obd_device passed in @data.
+ *
+ * The report is built by successive snprintf() calls into @page, each one
+ * given the remaining space (count - i).  @start and @off are not used and
+ * *@eof is set to 1.  Returns the accumulated snprintf() return values.
+ *
+ * NOTE(review): LPROCFS_CLIMP_CHECK()/LPROCFS_CLIMP_EXIT() are defined
+ * outside this view; presumably they protect obd->u.cli.cl_import for the
+ * duration of the call and may return early -- confirm at their definition.
+ */
+int lprocfs_rd_import(char *page, char **start, off_t off, int count,
+		      int *eof, void *data)
+{
+	struct lprocfs_counter		ret;
+	struct lprocfs_counter_header	*header;
+	struct obd_device		*obd	= (struct obd_device *)data;
+	struct obd_import		*imp;
+	struct obd_import_conn		*conn;
+	int				i;
+	int				j;
+	int				k;
+	int				rw	= 0;
+
+	LASSERT(obd != NULL);
+	LPROCFS_CLIMP_CHECK(obd);
+	imp = obd->u.cli.cl_import;
+	*eof = 1;
+
+	/* i tracks the running output length from here on */
+	i = snprintf(page, count,
+		     "import:\n"
+		     "    name: %s\n"
+		     "    target: %s\n"
+		     "    state: %s\n"
+		     "    instance: %u\n"
+		     "    connect_flags: [",
+		     obd->obd_name,
+		     obd2cli_tgt(obd),
+		     ptlrpc_import_state_name(imp->imp_state),
+		     imp->imp_connect_data.ocd_instance);
+	i += obd_connect_flags2str(page + i, count - i,
+				   imp->imp_connect_data.ocd_connect_flags,
+				   ", ");
+	i += snprintf(page + i, count - i,
+		      "]\n"
+		      "    import_flags: [");
+	i += obd_import_flags2str(imp, page + i, count - i);
+
+	i += snprintf(page + i, count - i,
+		      "]\n"
+		      "    connection:\n"
+		      "       failover_nids: [");
+	/* imp_lock is held while the connection list and fields are printed */
+	spin_lock(&imp->imp_lock);
+	j = 0;
+	list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
+		i += snprintf(page + i, count - i, "%s%s", j ? ", " : "",
+			      libcfs_nid2str(conn->oic_conn->c_peer.nid));
+		j++;
+	}
+	i += snprintf(page + i, count - i,
+		      "]\n"
+		      "       current_connection: %s\n"
+		      "       connection_attempts: %u\n"
+		      "       generation: %u\n"
+		      "       in-progress_invalidations: %u\n",
+		      imp->imp_connection == NULL ? "<none>" :
+			      libcfs_nid2str(imp->imp_connection->c_peer.nid),
+		      imp->imp_conn_cnt,
+		      imp->imp_generation,
+		      atomic_read(&imp->imp_inval_count));
+	spin_unlock(&imp->imp_lock);
+
+	/* every remaining section reads obd_svc_stats; without it, stop here */
+	if (obd->obd_svc_stats == NULL)
+		goto out_climp;
+
+	header = &obd->obd_svc_stats->ls_cnt_header[PTLRPC_REQWAIT_CNTR];
+	lprocfs_stats_collect(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, &ret);
+	/* turn the collected sum into an average (sum / count), 0 if empty */
+	if (ret.lc_count != 0) {
+		/* first argument to do_div MUST be __u64 */
+		__u64 sum = ret.lc_sum;
+		do_div(sum, ret.lc_count);
+		ret.lc_sum = sum;
+	} else
+		ret.lc_sum = 0;
+	i += snprintf(page + i, count - i,
+		      "    rpcs:\n"
+		      "       inflight: %u\n"
+		      "       unregistering: %u\n"
+		      "       timeouts: %u\n"
+		      "       avg_waittime: "LPU64" %s\n",
+		      atomic_read(&imp->imp_inflight),
+		      atomic_read(&imp->imp_unregistering),
+		      atomic_read(&imp->imp_timeouts),
+		      ret.lc_sum, header->lc_units);
+
+	/* k = largest service estimate; the scan ends at the first portal
+	 * slot whose value is 0 */
+	k = 0;
+	for(j = 0; j < IMP_AT_MAX_PORTALS; j++) {
+		if (imp->imp_at.iat_portal[j] == 0)
+			break;
+		k = max_t(unsigned int, k,
+			  at_get(&imp->imp_at.iat_service_estimate[j]));
+	}
+	i += snprintf(page + i, count - i,
+		      "    service_estimates:\n"
+		      "       services: %u sec\n"
+		      "       network: %u sec\n",
+		      k,
+		      at_get(&imp->imp_at.iat_net_latency));
+
+	i += snprintf(page + i, count - i,
+		      "    transactions:\n"
+		      "       last_replay: "LPU64"\n"
+		      "       peer_committed: "LPU64"\n"
+		      "       last_checked: "LPU64"\n",
+		      imp->imp_last_replay_transno,
+		      imp->imp_peer_committed_transno,
+		      imp->imp_last_transno_checked);
+
+	/* avg data rates */
+	/* rw == 0 is labelled "read", rw == 1 "write" */
+	for (rw = 0; rw <= 1; rw++) {
+		lprocfs_stats_collect(obd->obd_svc_stats,
+				      PTLRPC_LAST_CNTR + BRW_READ_BYTES + rw,
+				      &ret);
+		if (ret.lc_sum > 0 && ret.lc_count > 0) {
+			/* first argument to do_div MUST be __u64 */
+			__u64 sum = ret.lc_sum;
+			do_div(sum, ret.lc_count);
+			ret.lc_sum = sum;
+			i += snprintf(page + i, count - i,
+				      "    %s_data_averages:\n"
+				      "       bytes_per_rpc: "LPU64"\n",
+				      rw ? "write" : "read",
+				      ret.lc_sum);
+		}
+		/* k: bytes_per_rpc when the average above was computed,
+		 * otherwise the unmodified lc_sum */
+		k = (int)ret.lc_sum;
+		j = opcode_offset(OST_READ + rw) + EXTRA_MAX_OPCODES;
+		header = &obd->obd_svc_stats->ls_cnt_header[j];
+		lprocfs_stats_collect(obd->obd_svc_stats, j, &ret);
+		if (ret.lc_sum > 0 && ret.lc_count != 0) {
+			/* first argument to do_div MUST be __u64 */
+			__u64 sum = ret.lc_sum;
+			do_div(sum, ret.lc_count);
+			ret.lc_sum = sum;
+			i += snprintf(page + i, count - i,
+				      "       %s_per_rpc: "LPU64"\n",
+				      header->lc_units, ret.lc_sum);
+			/* j is reused as the per-RPC average just printed.
+			 * NOTE(review): k / j is labelled MB_per_sec, which
+			 * presumably assumes lc_units is usec -- confirm. */
+			j = (int)ret.lc_sum;
+			if (j > 0)
+				i += snprintf(page + i, count - i,
+					      "       MB_per_sec: %u.%.02u\n",
+					      k / j, (100 * k / j) % 100);
+		}
+	}
+
+out_climp:
+	LPROCFS_CLIMP_EXIT(obd);
+	return i;
+}
+EXPORT_SYMBOL(lprocfs_rd_import);
+
+/*
+ * procfs read handler: print the import's current state name followed by the
+ * entries of its state-history ring, starting at imp_state_hist_idx and
+ * skipping slots whose state is 0.
+ *
+ * @data is the obd_device.  *@eof is set to 1.  Returns the accumulated
+ * snprintf() return values.
+ */
+int lprocfs_rd_state(char *page, char **start, off_t off, int count,
+		      int *eof, void *data)
+{
+	struct obd_device *obd = (struct obd_device *)data;
+	struct obd_import *imp;
+	int len;
+	int slot;
+	int first;
+
+	LASSERT(obd != NULL);
+	LPROCFS_CLIMP_CHECK(obd);
+	imp = obd->u.cli.cl_import;
+	*eof = 1;
+
+	len = snprintf(page, count, "current_state: %s\n",
+		       ptlrpc_import_state_name(imp->imp_state));
+	len += snprintf(page + len, count - len,
+			"state_history:\n");
+
+	first = imp->imp_state_hist_idx;
+	for (slot = 0; slot < IMP_STATE_HIST_LEN; slot++) {
+		struct import_state_hist *ish;
+
+		ish = &imp->imp_state_hist[(first + slot) % IMP_STATE_HIST_LEN];
+		if (ish->ish_state != 0)
+			len += snprintf(page + len, count - len,
+					" - ["CFS_TIME_T", %s]\n",
+					ish->ish_time,
+					ptlrpc_import_state_name(ish->ish_state));
+	}
+
+	LPROCFS_CLIMP_EXIT(obd);
+	return len;
+}
+EXPORT_SYMBOL(lprocfs_rd_state);
+
+/*
+ * Append the AT_BINS entries of @at->at_hist to @page at offset @rc, each
+ * formatted as "%3u ", followed by a newline.  Returns the updated offset.
+ */
+int lprocfs_at_hist_helper(char *page, int count, int rc,
+			   struct adaptive_timeout *at)
+{
+	int bin = 0;
+
+	while (bin < AT_BINS) {
+		rc += snprintf(page + rc, count - rc, "%3u ", at->at_hist[bin]);
+		bin++;
+	}
+	rc += snprintf(page + rc, count - rc, "\n");
+
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_at_hist_helper);
+
+/* See also ptlrpc_lprocfs_rd_timeouts */
+/*
+ * procfs read handler: report the time of the last reply and, for the
+ * network latency estimate and each configured portal's service estimate,
+ * the current value, the worst value seen (with its timestamp and age) and
+ * the history bins printed by lprocfs_at_hist_helper().
+ *
+ * @data is the obd_device.  *@eof is set to 1.  Returns the accumulated
+ * output length.
+ */
+int lprocfs_rd_timeouts(char *page, char **start, off_t off, int count,
+			int *eof, void *data)
+{
+	struct obd_device *obd = (struct obd_device *)data;
+	struct obd_import *imp;
+	unsigned int cur, worst;
+	time_t now, worstt;
+	struct dhms ts;
+	int i, rc = 0;
+
+	LASSERT(obd != NULL);
+	LPROCFS_CLIMP_CHECK(obd);
+	imp = obd->u.cli.cl_import;
+	*eof = 1;
+
+	now = cfs_time_current_sec();
+
+	/* Some network health info for kicks */
+	s2dhms(&ts, now - imp->imp_last_reply_time);
+	rc += snprintf(page + rc, count - rc,
+		       "%-10s : %ld, "DHMS_FMT" ago\n",
+		       "last reply", imp->imp_last_reply_time, DHMS_VARS(&ts));
+
+	cur = at_get(&imp->imp_at.iat_net_latency);
+	worst = imp->imp_at.iat_net_latency.at_worst_ever;
+	worstt = imp->imp_at.iat_net_latency.at_worst_time;
+	s2dhms(&ts, now - worstt);
+	rc += snprintf(page + rc, count - rc,
+		       "%-10s : cur %3u  worst %3u (at %ld, "DHMS_FMT" ago) ",
+		       "network", cur, worst, worstt, DHMS_VARS(&ts));
+	/* the helper takes the running offset and returns the new one,
+	 * hence '=' rather than '+=' */
+	rc = lprocfs_at_hist_helper(page, count, rc,
+				    &imp->imp_at.iat_net_latency);
+
+	/* one line per portal; the scan ends at the first slot holding 0 */
+	for(i = 0; i < IMP_AT_MAX_PORTALS; i++) {
+		if (imp->imp_at.iat_portal[i] == 0)
+			break;
+		cur = at_get(&imp->imp_at.iat_service_estimate[i]);
+		worst = imp->imp_at.iat_service_estimate[i].at_worst_ever;
+		worstt = imp->imp_at.iat_service_estimate[i].at_worst_time;
+		s2dhms(&ts, now - worstt);
+		rc += snprintf(page + rc, count - rc,
+			       "portal %-2d  : cur %3u  worst %3u (at %ld, "
+			       DHMS_FMT" ago) ", imp->imp_at.iat_portal[i],
+			       cur, worst, worstt, DHMS_VARS(&ts));
+		rc = lprocfs_at_hist_helper(page, count, rc,
+					  &imp->imp_at.iat_service_estimate[i]);
+	}
+
+	LPROCFS_CLIMP_EXIT(obd);
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_timeouts);
+
+/*
+ * procfs read handler: print the import's connect flags as "flags=<hex>"
+ * followed by the name of every set flag, one per line.
+ *
+ * @data is the obd_device.  Returns the accumulated output length.
+ */
+int lprocfs_rd_connect_flags(char *page, char **start, off_t off,
+			     int count, int *eof, void *data)
+{
+	struct obd_device *obd = data;
+	__u64 flags;
+	int len;
+
+	LPROCFS_CLIMP_CHECK(obd);
+
+	flags = obd->u.cli.cl_import->imp_connect_data.ocd_connect_flags;
+
+	len = snprintf(page, count, "flags="LPX64"\n", flags);
+	len += obd_connect_flags2str(page + len, count - len, flags, "\n");
+	len += snprintf(page + len, count - len, "\n");
+
+	LPROCFS_CLIMP_EXIT(obd);
+
+	return len;
+}
+EXPORT_SYMBOL(lprocfs_rd_connect_flags);
+
+/*
+ * procfs read handler: print obd->obd_num_exports as an unsigned decimal
+ * followed by a newline.  @data is the obd_device; *@eof is set to 1.
+ */
+int lprocfs_rd_num_exports(char *page, char **start, off_t off, int count,
+			   int *eof,  void *data)
+{
+	struct obd_device *obd;
+	int len;
+
+	obd = data;
+	LASSERT(obd != NULL);
+
+	*eof = 1;
+	len = snprintf(page, count, "%u\n", obd->obd_num_exports);
+
+	return len;
+}
+EXPORT_SYMBOL(lprocfs_rd_num_exports);
+
+/*
+ * procfs read handler: print the obd_type's typ_refcnt as a decimal followed
+ * by a newline.  @data is the obd_type; *@eof is set to 1.
+ */
+int lprocfs_rd_numrefs(char *page, char **start, off_t off, int count,
+		       int *eof, void *data)
+{
+	struct obd_type *class;
+	int len;
+
+	class = (struct obd_type *)data;
+	LASSERT(class != NULL);
+
+	*eof = 1;
+	len = snprintf(page, count, "%d\n", class->typ_refcnt);
+
+	return len;
+}
+EXPORT_SYMBOL(lprocfs_rd_numrefs);
+
+/*
+ * Register a proc directory named obd->obd_name under the device type's
+ * typ_procroot, populated from @list, and store it in obd->obd_proc_entry.
+ *
+ * Returns 0 on success.  If lprocfs_register() returns an error pointer the
+ * error is logged, obd->obd_proc_entry is reset to NULL and the error code
+ * is returned.
+ */
+int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list)
+{
+	int rc;
+
+	LASSERT(obd != NULL);
+	LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+	LASSERT(obd->obd_type->typ_procroot != NULL);
+
+	obd->obd_proc_entry = lprocfs_register(obd->obd_name,
+					       obd->obd_type->typ_procroot,
+					       list, obd);
+	if (!IS_ERR(obd->obd_proc_entry))
+		return 0;
+
+	rc = PTR_ERR(obd->obd_proc_entry);
+	CERROR("error %d setting up lprocfs for %s\n",rc,obd->obd_name);
+	obd->obd_proc_entry = NULL;
+
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_obd_setup);
+
+/*
+ * Remove the device's proc entries: first the exports directory (which must
+ * have no sub-entries left), then the device directory itself.  Both
+ * pointers are left NULL.
+ *
+ * Returns -EINVAL if @obd is NULL, 0 otherwise.
+ */
+int lprocfs_obd_cleanup(struct obd_device *obd)
+{
+	if (obd == NULL)
+		return -EINVAL;
+
+	if (obd->obd_proc_exports_entry != NULL) {
+		/* Should be no exports left */
+		LASSERT(obd->obd_proc_exports_entry->subdir == NULL);
+		lprocfs_remove(&obd->obd_proc_exports_entry);
+		obd->obd_proc_exports_entry = NULL;
+	}
+
+	if (obd->obd_proc_entry != NULL) {
+		lprocfs_remove(&obd->obd_proc_entry);
+		obd->obd_proc_entry = NULL;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_obd_cleanup);
+
+/*
+ * Release one per-NID stats object: its proc directory, its two stats
+ * blocks (nid_stats, nid_ldlm_stats) and finally the object itself.
+ * Asserts that nid_exp_ref_count has dropped to 0.
+ */
+static void lprocfs_free_client_stats(struct nid_stat *client_stat)
+{
+	CDEBUG(D_CONFIG, "stat %p - data %p/%p\n", client_stat,
+	       client_stat->nid_proc, client_stat->nid_stats);
+
+	LASSERTF(atomic_read(&client_stat->nid_exp_ref_count) == 0,
+		 "nid %s:count %d\n", libcfs_nid2str(client_stat->nid),
+		 atomic_read(&client_stat->nid_exp_ref_count));
+
+	if (client_stat->nid_proc != NULL)
+		lprocfs_remove(&client_stat->nid_proc);
+	if (client_stat->nid_stats != NULL)
+		lprocfs_free_stats(&client_stat->nid_stats);
+	if (client_stat->nid_ldlm_stats != NULL)
+		lprocfs_free_stats(&client_stat->nid_ldlm_stats);
+
+	OBD_FREE_PTR(client_stat);
+}
+
+/*
+ * Drain obd->obd_nid_stats: unlink every nid_stat from the list, delete it
+ * from obd_nid_stats_hash and free it.
+ */
+void lprocfs_free_per_client_stats(struct obd_device *obd)
+{
+	cfs_hash_t *nid_hash = obd->obd_nid_stats_hash;
+	struct nid_stat *victim;
+	ENTRY;
+
+	/* A separate list is kept because hash_exit is called too early;
+	 * no locking is needed since all clients are gone by now. */
+	for (;;) {
+		if (list_empty(&obd->obd_nid_stats))
+			break;
+
+		victim = list_entry(obd->obd_nid_stats.next,
+				    struct nid_stat, nid_list);
+		list_del_init(&victim->nid_list);
+		cfs_hash_del(nid_hash, &victim->nid, &victim->nid_hash);
+		lprocfs_free_client_stats(victim);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(lprocfs_free_per_client_stats);
+
+/*
+ * Allocate a stats object holding @num counters.
+ *
+ * The object carries one counter-header array plus an array of per-CPU
+ * counter-block pointers.  With LPROCFS_STATS_FLAG_NOPERCPU (forced when
+ * lprocfs_no_percpu_stats is non-zero) there is a single block, allocated
+ * here.  With LPROCFS_STATS_FLAG_IRQ_SAFE every per-CPU block is allocated
+ * here.  Otherwise no counter block is allocated by this function.
+ *
+ * Returns the new object, or NULL if @num is 0 or an allocation fails.
+ */
+struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num,
+					  enum lprocfs_stats_flags flags)
+{
+	struct lprocfs_stats	*stats;
+	unsigned int		num_entry;
+	unsigned int		percpusize = 0;
+	int			i;
+
+	if (num == 0)
+		return NULL;
+
+	if (lprocfs_no_percpu_stats != 0)
+		flags |= LPROCFS_STATS_FLAG_NOPERCPU;
+
+	/* number of slots in the trailing ls_percpu[] pointer array */
+	if (flags & LPROCFS_STATS_FLAG_NOPERCPU)
+		num_entry = 1;
+	else
+		num_entry = num_possible_cpus();
+
+	/* alloc percpu pointers for all possible cpu slots */
+	LIBCFS_ALLOC(stats, offsetof(typeof(*stats), ls_percpu[num_entry]));
+	if (stats == NULL)
+		return NULL;
+
+	stats->ls_num = num;
+	stats->ls_flags = flags;
+	spin_lock_init(&stats->ls_lock);
+
+	/* alloc num of counter headers */
+	LIBCFS_ALLOC(stats->ls_cnt_header,
+		     stats->ls_num * sizeof(struct lprocfs_counter_header));
+	if (stats->ls_cnt_header == NULL)
+		goto fail;
+
+	if ((flags & LPROCFS_STATS_FLAG_NOPERCPU) != 0) {
+		/* contains only one set counters */
+		percpusize = lprocfs_stats_counter_size(stats);
+		LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[0], percpusize);
+		if (stats->ls_percpu[0] == NULL)
+			goto fail;
+		stats->ls_biggest_alloc_num = 1;
+	} else if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) {
+		/* alloc all percpu data, currently only obd_memory use this */
+		for (i = 0; i < num_entry; ++i)
+			if (lprocfs_stats_alloc_one(stats, i) < 0)
+				goto fail;
+	}
+
+	return stats;
+
+fail:
+	/* NOTE(review): relies on lprocfs_free_stats() coping with a
+	 * partially built object; presumably LIBCFS_ALLOC zero-fills so the
+	 * unallocated pointers read as NULL -- confirm. */
+	lprocfs_free_stats(&stats);
+	return NULL;
+}
+EXPORT_SYMBOL(lprocfs_alloc_stats);
+
+/*
+ * Free a stats object and set *@statsh to NULL.
+ *
+ * Does nothing (and leaves *@statsh untouched) if the object is NULL or has
+ * ls_num == 0.  Every non-NULL per-CPU counter block, the counter-header
+ * array and the object itself are released.
+ */
+void lprocfs_free_stats(struct lprocfs_stats **statsh)
+{
+	struct lprocfs_stats *stats = *statsh;
+	unsigned int slots;
+	unsigned int cntr_bytes;
+	unsigned int cpu;
+
+	if (stats == NULL || stats->ls_num == 0)
+		return;
+	*statsh = NULL;
+
+	slots = (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) ?
+		1 : num_possible_cpus();
+
+	cntr_bytes = lprocfs_stats_counter_size(stats);
+	for (cpu = 0; cpu < slots; cpu++) {
+		if (stats->ls_percpu[cpu] == NULL)
+			continue;
+		LIBCFS_FREE(stats->ls_percpu[cpu], cntr_bytes);
+	}
+
+	if (stats->ls_cnt_header != NULL)
+		LIBCFS_FREE(stats->ls_cnt_header, stats->ls_num *
+					sizeof(struct lprocfs_counter_header));
+
+	LIBCFS_FREE(stats, offsetof(typeof(*stats), ls_percpu[slots]));
+}
+EXPORT_SYMBOL(lprocfs_free_stats);
+
+/*
+ * Reset every counter of every allocated per-CPU block in @stats: count,
+ * max, sum and sumsquare go to 0, min goes to LC_MIN_INIT, and lc_sum_irq is
+ * cleared as well when the stats are LPROCFS_STATS_FLAG_IRQ_SAFE.
+ *
+ * The walk runs between lprocfs_stats_lock() and lprocfs_stats_unlock();
+ * the lock call also supplies the number of per-CPU slots to visit.
+ */
+void lprocfs_clear_stats(struct lprocfs_stats *stats)
+{
+	struct lprocfs_counter		*percpu_cntr;
+	unsigned int			i;
+	unsigned int			j;
+	unsigned int			num_entry;
+	unsigned long			flags = 0;
+
+	num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags);
+
+	for (i = 0; i < num_entry; i++) {
+		/* slots without an allocated counter block are skipped */
+		if (stats->ls_percpu[i] == NULL)
+			continue;
+		for (j = 0; j < stats->ls_num; j++) {
+			percpu_cntr = lprocfs_stats_counter_get(stats, i, j);
+			percpu_cntr->lc_count		= 0;
+			percpu_cntr->lc_min		= LC_MIN_INIT;
+			percpu_cntr->lc_max		= 0;
+			percpu_cntr->lc_sumsquare	= 0;
+			percpu_cntr->lc_sum		= 0;
+			if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+				percpu_cntr->lc_sum_irq	= 0;
+		}
+	}
+
+	lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags);
+}
+EXPORT_SYMBOL(lprocfs_clear_stats);
+
+/*
+ * Write handler for a stats file: any write clears all counters of the stats
+ * object attached to the seq_file.  The written data is ignored and @len is
+ * returned.
+ */
+static ssize_t lprocfs_stats_seq_write(struct file *file, const char *buf,
+				       size_t len, loff_t *off)
+{
+	struct seq_file *seq;
+
+	seq = file->private_data;
+	lprocfs_clear_stats(seq->private);
+
+	return len;
+}
+
+/*
+ * seq_file ->start: return slot 0's counter number *@pos, or NULL once
+ * *@pos reaches the number of counters.
+ */
+static void *lprocfs_stats_seq_start(struct seq_file *p, loff_t *pos)
+{
+	struct lprocfs_stats *stats = p->private;
+
+	if (*pos >= stats->ls_num)
+		return NULL;
+
+	/* return 1st cpu location */
+	return lprocfs_stats_counter_get(stats, 0, *pos);
+}
+
+/* seq_file ->stop callback; intentionally empty, this function does no work. */
+static void lprocfs_stats_seq_stop(struct seq_file *p, void *v)
+{
+}
+
+/*
+ * seq_file ->next: advance *@pos and return slot 0's counter at the new
+ * position, or NULL once the position reaches the number of counters.
+ */
+static void *lprocfs_stats_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+	struct lprocfs_stats *stats = p->private;
+
+	++*pos;
+	if (*pos >= stats->ls_num)
+		return NULL;
+
+	return lprocfs_stats_counter_get(stats, 0, *pos);
+}
+
+/* seq file export of one lprocfs counter */
+/* seq file export of one lprocfs counter */
+/*
+ * @v points at a counter inside slot 0's counter block (as handed out by the
+ * start/next callbacks).  Its index is recovered from the pointer, the
+ * counter is aggregated with lprocfs_stats_collect() and printed as one
+ * line.  A "snapshot_time" line is emitted before the first counter.
+ * Counters whose collected count is 0 print nothing.
+ *
+ * Returns 0, or the negative value returned by a failing seq_printf().
+ */
+static int lprocfs_stats_seq_show(struct seq_file *p, void *v)
+{
+	struct lprocfs_stats		*stats	= p->private;
+	struct lprocfs_counter		*cntr	= v;
+	struct lprocfs_counter		ret;
+	struct lprocfs_counter_header	*header;
+	int				entry_size;
+	int				idx;
+	int				rc	= 0;
+
+	/* first counter of the block: print the timestamp header line */
+	if (cntr == &(stats->ls_percpu[0])->lp_cntr[0]) {
+		struct timeval now;
+		do_gettimeofday(&now);
+		rc = seq_printf(p, "%-25s %lu.%lu secs.usecs\n",
+				"snapshot_time", now.tv_sec, now.tv_usec);
+		if (rc < 0)
+			return rc;
+	}
+	/* per-counter stride: IRQ-safe stats carry one extra __s64 each */
+	entry_size = sizeof(*cntr);
+	if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+		entry_size += sizeof(__s64);
+	/* counter index = byte offset from the first counter / stride */
+	idx = ((void *)cntr - (void *)&(stats->ls_percpu[0])->lp_cntr[0]) /
+		entry_size;
+
+	header = &stats->ls_cnt_header[idx];
+	lprocfs_stats_collect(stats, idx, &ret);
+
+	if (ret.lc_count == 0)
+		goto out;
+
+	rc = seq_printf(p, "%-25s "LPD64" samples [%s]", header->lc_name,
+			ret.lc_count, header->lc_units);
+
+	if (rc < 0)
+		goto out;
+
+	/* min/max/sum (and optionally sumsquare) only for counters
+	 * configured with LPROCFS_CNTR_AVGMINMAX */
+	if ((header->lc_config & LPROCFS_CNTR_AVGMINMAX) &&
+	    (ret.lc_count > 0)) {
+		rc = seq_printf(p, " "LPD64" "LPD64" "LPD64,
+				ret.lc_min, ret.lc_max, ret.lc_sum);
+		if (rc < 0)
+			goto out;
+		if (header->lc_config & LPROCFS_CNTR_STDDEV)
+			rc = seq_printf(p, " "LPD64, ret.lc_sumsquare);
+		if (rc < 0)
+			goto out;
+	}
+	rc = seq_printf(p, "\n");
+ out:
+	/* non-negative seq_printf() results are folded to 0 */
+	return (rc < 0) ? rc : 0;
+}
+
+/* seq_file iterator callbacks used by stats proc files. */
+struct seq_operations lprocfs_stats_seq_sops = {
+	.start	= lprocfs_stats_seq_start,
+	.stop	= lprocfs_stats_seq_stop,
+	.next	= lprocfs_stats_seq_next,
+	.show	= lprocfs_stats_seq_show,
+};
+
+/*
+ * Open handler for a stats proc file: open a seq_file over
+ * lprocfs_stats_seq_sops and attach the proc entry's data pointer as the
+ * seq_file's private data.
+ *
+ * Returns -ENOENT if LPROCFS_ENTRY_AND_CHECK() rejects the entry, the
+ * seq_open() error (after LPROCFS_EXIT()) if that fails, 0 otherwise.
+ */
+static int lprocfs_stats_seq_open(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *dp = PDE(inode);
+	struct seq_file *seq;
+	int rc;
+
+	if (LPROCFS_ENTRY_AND_CHECK(dp))
+		return -ENOENT;
+
+	rc = seq_open(file, &lprocfs_stats_seq_sops);
+	if (rc == 0) {
+		seq = file->private_data;
+		seq->private = dp->data;
+		return 0;
+	}
+
+	LPROCFS_EXIT();
+	return rc;
+}
+
+/*
+ * File operations installed on stats proc entries by
+ * lprocfs_register_stats(): reads go through seq_file, writes reset the
+ * counters via lprocfs_stats_seq_write().
+ */
+struct file_operations lprocfs_stats_seq_fops = {
+	.owner   = THIS_MODULE,
+	.open    = lprocfs_stats_seq_open,
+	.read    = seq_read,
+	.write   = lprocfs_stats_seq_write,
+	.llseek  = seq_lseek,
+	.release = lprocfs_seq_release,
+};
+
+/*
+ * Create a proc file @name (mode 0644) under @root that exposes @stats via
+ * lprocfs_stats_seq_fops.
+ *
+ * Returns 0 on success, -ENOMEM if the proc entry could not be created.
+ */
+int lprocfs_register_stats(struct proc_dir_entry *root, const char *name,
+			   struct lprocfs_stats *stats)
+{
+	struct proc_dir_entry *entry;
+
+	LASSERT(root != NULL);
+
+	LPROCFS_WRITE_ENTRY();
+	entry = create_proc_entry(name, 0644, root);
+	if (entry != NULL) {
+		entry->proc_fops = &lprocfs_stats_seq_fops;
+		entry->data = stats;
+	}
+	LPROCFS_WRITE_EXIT();
+
+	return entry != NULL ? 0 : -ENOMEM;
+}
+EXPORT_SYMBOL(lprocfs_register_stats);
+
+/*
+ * Configure counter @index of @stats: record @conf, @name and @units in its
+ * header and reset the counter in every allocated per-CPU block (count, max,
+ * sum, sumsquare to 0; min to LC_MIN_INIT; lc_sum_irq to 0 for IRQ-safe
+ * stats).
+ *
+ * @name and @units are stored by pointer, not copied, so the strings must
+ * stay valid for as long as the header is in use.
+ */
+void lprocfs_counter_init(struct lprocfs_stats *stats, int index,
+			  unsigned conf, const char *name, const char *units)
+{
+	struct lprocfs_counter_header	*header;
+	struct lprocfs_counter		*percpu_cntr;
+	unsigned long			flags = 0;
+	unsigned int			i;
+	unsigned int			num_cpu;
+
+	LASSERT(stats != NULL);
+
+	/* Check the header array itself: the address of element [index] is
+	 * non-NULL for any index > 0 even if the array was never allocated,
+	 * so asserting on that address would not catch the failure. */
+	LASSERTF(stats->ls_cnt_header != NULL,
+		 "Failed to allocate stats header:[%d]%s/%s\n",
+		 index, name, units);
+	header = &stats->ls_cnt_header[index];
+
+	header->lc_config = conf;
+	header->lc_name   = name;
+	header->lc_units  = units;
+
+	num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags);
+	for (i = 0; i < num_cpu; ++i) {
+		/* slots without an allocated counter block are skipped */
+		if (stats->ls_percpu[i] == NULL)
+			continue;
+		percpu_cntr = lprocfs_stats_counter_get(stats, i, index);
+		percpu_cntr->lc_count		= 0;
+		percpu_cntr->lc_min		= LC_MIN_INIT;
+		percpu_cntr->lc_max		= 0;
+		percpu_cntr->lc_sumsquare	= 0;
+		percpu_cntr->lc_sum		= 0;
+		if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+			percpu_cntr->lc_sum_irq	= 0;
+	}
+	lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags);
+}
+EXPORT_SYMBOL(lprocfs_counter_init);
+
+/*
+ * Initialise the counter for obd operation @op in @stats.  The counter index
+ * is @base + OBD_COUNTER_OFFSET(op), its name is the stringified @op and its
+ * unit is "reqs".  Asserts that the index is within stats->ls_num.
+ */
+#define LPROCFS_OBD_OP_INIT(base, stats, op)				\
+do {									\
+	unsigned int coffset = (base) + OBD_COUNTER_OFFSET(op);		\
+	LASSERT(coffset < (stats)->ls_num);				\
+	lprocfs_counter_init((stats), coffset, 0, #op, "reqs");		\
+} while (0)
+
+/*
+ * Initialise one "reqs" counter in @stats for every obd operation listed
+ * below, via LPROCFS_OBD_OP_INIT().  Each counter lands at index
+ * @num_private_stats + OBD_COUNTER_OFFSET(op) and is named after the
+ * operation.
+ */
+void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats)
+{
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, iocontrol);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_info);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_info_async);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, attach);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, detach);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, setup);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, precleanup);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, cleanup);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, process_config);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, postrecov);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, add_conn);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, del_conn);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, connect);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, reconnect);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, disconnect);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_init);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_fini);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_alloc);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs_async);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, packmd);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpackmd);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, preallocate);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, precreate);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, create);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, create_async);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr_async);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr_async);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, brw);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, merge_lvb);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, adjust_kms);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, punch);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, sync);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, migrate);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, copy);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, iterate);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, preprw);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, commitrw);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, enqueue);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, change_cbdata);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, find_cbdata);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel_unused);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, init_export);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy_export);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, extent_calc);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_init);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_connect);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_finish);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, pin);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpin);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, import_event);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, notify);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, health_check);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_uuid);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotacheck);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotactl);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, ping);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_new);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_rem);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_add);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_del);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, getref);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, putref);
+}
+EXPORT_SYMBOL(lprocfs_init_ops_stats);
+
+/*
+ * Allocate the per-device obd operation stats and publish them as the
+ * "stats" file under obd->obd_proc_entry.
+ *
+ * The stats hold @num_private_stats caller-owned counters followed by one
+ * counter per pointer-sized member of the device's typ_dt_ops structure
+ * (minus one for o_owner).  On success obd->obd_stats and obd->obd_cntr_base
+ * are set.
+ *
+ * Returns 0 on success, -ENOMEM if allocation fails, or the negative error
+ * from lprocfs_register_stats() (the stats are freed in that case).
+ */
+int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats)
+{
+	struct lprocfs_stats *stats;
+	unsigned int num_stats;
+	int rc, i;
+
+	LASSERT(obd->obd_stats == NULL);
+	LASSERT(obd->obd_proc_entry != NULL);
+	LASSERT(obd->obd_cntr_base == 0);
+
+	num_stats = ((int)sizeof(*obd->obd_type->typ_dt_ops) / sizeof(void *)) +
+		num_private_stats - 1 /* o_owner */;
+	stats = lprocfs_alloc_stats(num_stats, 0);
+	if (stats == NULL)
+		return -ENOMEM;
+
+	lprocfs_init_ops_stats(num_private_stats, stats);
+
+	/* every operation counter must have been named by the init above */
+	for (i = num_private_stats; i < num_stats; i++) {
+		/* If this LBUGs, it is likely that an obd
+		 * operation was added to struct obd_ops in
+		 * <obd.h>, and that the corresponding line item
+		 * LPROCFS_OBD_OP_INIT(.., .., opname)
+		 * is missing from the list above. */
+		LASSERTF(stats->ls_cnt_header[i].lc_name != NULL,
+			 "Missing obd_stat initializer obd_op "
+			 "operation at offset %d.\n", i - num_private_stats);
+	}
+	rc = lprocfs_register_stats(obd->obd_proc_entry, "stats", stats);
+	if (rc < 0) {
+		lprocfs_free_stats(&stats);
+	} else {
+		obd->obd_stats  = stats;
+		obd->obd_cntr_base = num_private_stats;
+	}
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_alloc_obd_stats);
+
+/*
+ * Free the device's obd operation stats, if any, and reset obd_cntr_base.
+ *
+ * lprocfs_alloc_obd_stats() asserts obd_cntr_base == 0 on entry, so the base
+ * is cleared here to allow the stats to be allocated again on the same
+ * device; this mirrors what lprocfs_free_md_stats() does for md_cntr_base.
+ */
+void lprocfs_free_obd_stats(struct obd_device *obd)
+{
+	if (obd->obd_stats) {
+		lprocfs_free_stats(&obd->obd_stats);
+		obd->obd_cntr_base = 0;
+	}
+}
+EXPORT_SYMBOL(lprocfs_free_obd_stats);
+
+/*
+ * Initialise the counter for metadata operation @op in @stats.  The counter
+ * index is @base + MD_COUNTER_OFFSET(op), its name is the stringified @op
+ * and its unit is "reqs".  Asserts that the index is within stats->ls_num.
+ */
+#define LPROCFS_MD_OP_INIT(base, stats, op)				\
+do {									\
+	unsigned int coffset = (base) + MD_COUNTER_OFFSET(op);		\
+	LASSERT(coffset < (stats)->ls_num);				\
+	lprocfs_counter_init((stats), coffset, 0, #op, "reqs");		\
+} while (0)
+
+/*
+ * Initialise one "reqs" counter in @stats for every metadata operation
+ * listed below, via LPROCFS_MD_OP_INIT().  Each counter lands at index
+ * @num_private_stats + MD_COUNTER_OFFSET(op) and is named after the
+ * operation.
+ */
+void lprocfs_init_mps_stats(int num_private_stats, struct lprocfs_stats *stats)
+{
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, getstatus);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, null_inode);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, find_cbdata);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, close);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, create);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, done_writing);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, enqueue);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr_name);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_lock);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, link);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, rename);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, is_subdir);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, setattr);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, sync);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, readpage);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, unlink);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, setxattr);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, getxattr);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, init_ea_size);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, get_lustre_md);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, free_lustre_md);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, set_open_replay_data);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, clear_open_replay_data);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, set_lock_data);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, lock_match);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, cancel_unused);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, renew_capa);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, unpack_capa);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, get_remote_perm);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_getattr_async);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, revalidate_lock);
+}
+EXPORT_SYMBOL(lprocfs_init_mps_stats);
+
+/*
+ * Allocate the per-device metadata operation stats and publish them as the
+ * "md_stats" file under obd->obd_proc_entry.
+ *
+ * The stats hold @num_private_stats caller-owned counters followed by one
+ * counter per metadata operation up to and including revalidate_lock.  On
+ * success obd->md_stats and obd->md_cntr_base are set.
+ *
+ * Returns 0 on success, -ENOMEM if allocation fails, or the negative error
+ * from lprocfs_register_stats() (the stats are freed in that case).
+ */
+int lprocfs_alloc_md_stats(struct obd_device *obd,
+			   unsigned num_private_stats)
+{
+	struct lprocfs_stats *stats;
+	unsigned int num_stats;
+	int rc, i;
+
+	LASSERT(obd->md_stats == NULL);
+	LASSERT(obd->obd_proc_entry != NULL);
+	LASSERT(obd->md_cntr_base == 0);
+
+	/* revalidate_lock is the last operation initialised below, so its
+	 * offset + 1 is the number of operation counters */
+	num_stats = 1 + MD_COUNTER_OFFSET(revalidate_lock) +
+		    num_private_stats;
+	stats = lprocfs_alloc_stats(num_stats, 0);
+	if (stats == NULL)
+		return -ENOMEM;
+
+	lprocfs_init_mps_stats(num_private_stats, stats);
+
+	/* every operation counter must have been named by the init above */
+	for (i = num_private_stats; i < num_stats; i++) {
+		if (stats->ls_cnt_header[i].lc_name == NULL) {
+			CERROR("Missing md_stat initializer md_op "
+			       "operation at offset %d. Aborting.\n",
+			       i - num_private_stats);
+			LBUG();
+		}
+	}
+	rc = lprocfs_register_stats(obd->obd_proc_entry, "md_stats", stats);
+	if (rc < 0) {
+		lprocfs_free_stats(&stats);
+	} else {
+		obd->md_stats  = stats;
+		obd->md_cntr_base = num_private_stats;
+	}
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_alloc_md_stats);
+
+/*
+ * Detach the device's metadata stats (md_stats set to NULL, md_cntr_base
+ * reset to 0) and then free them.  No-op if there are none.
+ */
+void lprocfs_free_md_stats(struct obd_device *obd)
+{
+	struct lprocfs_stats *stats = obd->md_stats;
+
+	if (stats == NULL)
+		return;
+
+	/* unhook from the device before freeing */
+	obd->md_stats = NULL;
+	obd->md_cntr_base = 0;
+	lprocfs_free_stats(&stats);
+}
+EXPORT_SYMBOL(lprocfs_free_md_stats);
+
+/*
+ * Initialise the six LDLM request counters in @ldlm_stats.  Each counter
+ * sits at index (opcode - LDLM_FIRST_OPC), has config 0 and unit "reqs".
+ */
+void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats)
+{
+	static const struct {
+		int		opc;
+		const char	*name;
+	} ldlm_ops[] = {
+		{ LDLM_ENQUEUE,		"ldlm_enqueue" },
+		{ LDLM_CONVERT,		"ldlm_convert" },
+		{ LDLM_CANCEL,		"ldlm_cancel" },
+		{ LDLM_BL_CALLBACK,	"ldlm_bl_callback" },
+		{ LDLM_CP_CALLBACK,	"ldlm_cp_callback" },
+		{ LDLM_GL_CALLBACK,	"ldlm_gl_callback" },
+	};
+	unsigned int n;
+
+	for (n = 0; n < sizeof(ldlm_ops) / sizeof(ldlm_ops[0]); n++)
+		lprocfs_counter_init(ldlm_stats,
+				     ldlm_ops[n].opc - LDLM_FIRST_OPC,
+				     0, ldlm_ops[n].name, "reqs");
+}
+EXPORT_SYMBOL(lprocfs_init_ldlm_stats);
+
+/*
+ * procfs read handler: print the NID string of the export passed in @data,
+ * followed by a newline.  *@eof is set to 1.
+ */
+int lprocfs_exp_rd_nid(char *page, char **start, off_t off, int count,
+			 int *eof,  void *data)
+{
+	struct obd_export *exp;
+	int len;
+
+	exp = data;
+	LASSERT(exp != NULL);
+
+	*eof = 1;
+	len = snprintf(page, count, "%s\n", obd_export_nid2str(exp));
+
+	return len;
+}
+
+/* Output cursor handed to the per-export hash iteration callbacks. */
+struct exp_uuid_cb_data {
+	char	*page;	/* output buffer */
+	int	count;	/* size passed in by the read handler */
+	int	*eof;	/* read handler's eof flag */
+	int	*len;	/* running output length */
+};
+
+/* Fill in all four fields of @cb_data from the given arguments. */
+static void
+lprocfs_exp_rd_cb_data_init(struct exp_uuid_cb_data *cb_data, char *page,
+			    int count, int *eof, int *len)
+{
+	*cb_data = (struct exp_uuid_cb_data) {
+		.page	= page,
+		.count	= count,
+		.eof	= eof,
+		.len	= len,
+	};
+}
+
+/*
+ * Hash iteration callback: for an export that has exp_nid_stats set, append
+ * its client UUID and a newline to the output described by @cb_data (a
+ * struct exp_uuid_cb_data).
+ *
+ * The write is bounded by the space left in the page (count minus what has
+ * already been produced), not by the full page size, so appending at a
+ * non-zero offset cannot run past the end of the buffer.  Nothing is written
+ * once no space remains.
+ *
+ * Always returns 0.
+ */
+int lprocfs_exp_print_uuid(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			   struct hlist_node *hnode, void *cb_data)
+{
+	struct obd_export *exp = cfs_hash_object(hs, hnode);
+	struct exp_uuid_cb_data *data = (struct exp_uuid_cb_data *)cb_data;
+	int room = data->count - *data->len;
+
+	if (exp->exp_nid_stats == NULL || room <= 0)
+		return 0;
+
+	*data->len += snprintf(data->page + *data->len, room, "%s\n",
+			       obd_uuid2str(&exp->exp_client_uuid));
+	return 0;
+}
+
+/*
+ * procfs read handler: run lprocfs_exp_print_uuid() over the entries of the
+ * device's obd_nid_hash keyed by this nid_stat's NID.  @data is the
+ * nid_stat.  *@eof is set to 1.  Returns the length accumulated by the
+ * callback.
+ */
+int lprocfs_exp_rd_uuid(char *page, char **start, off_t off, int count,
+			int *eof,  void *data)
+{
+	struct nid_stat *stats = data;
+	struct obd_device *obd = stats->nid_obd;
+	struct exp_uuid_cb_data cb_data;
+	int len = 0;
+
+	*eof = 1;
+	page[0] = '\0';
+
+	lprocfs_exp_rd_cb_data_init(&cb_data, page, count, eof, &len);
+	cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid,
+			      lprocfs_exp_print_uuid, &cb_data);
+
+	/* cb_data.len points at len */
+	return len;
+}
+
+/*
+ * Hash iteration callback: for an export that has a lock hash, append the
+ * hash debug line for @hs to the output described by @cb_data (a struct
+ * exp_uuid_cb_data).  The debug header is emitted first if nothing has been
+ * written yet.
+ *
+ * The debug line is bounded by the space left in the page (count minus what
+ * has already been produced) rather than the full page size, so appending
+ * after the header cannot run past the end of the buffer.
+ * NOTE(review): this assumes the last argument of cfs_hash_debug_str() is
+ * the size of the buffer it is given -- confirm against its definition.
+ *
+ * Always returns 0.
+ */
+int lprocfs_exp_print_hash(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			   struct hlist_node *hnode, void *cb_data)
+{
+	struct exp_uuid_cb_data *data = cb_data;
+	struct obd_export       *exp = cfs_hash_object(hs, hnode);
+	int			 room;
+
+	if (exp->exp_lock_hash == NULL)
+		return 0;
+
+	if (!*data->len)
+		*data->len += cfs_hash_debug_header(data->page, data->count);
+
+	room = data->count - *data->len;
+	if (room > 0)
+		*data->len += cfs_hash_debug_str(hs, data->page + *data->len,
+						 room);
+
+	return 0;
+}
+
+/*
+ * procfs read handler: run lprocfs_exp_print_hash() over the entries of the
+ * device's obd_nid_hash keyed by this nid_stat's NID.  @data is the
+ * nid_stat.  *@eof is set to 1.  Returns the length accumulated by the
+ * callback.
+ */
+int lprocfs_exp_rd_hash(char *page, char **start, off_t off, int count,
+			int *eof,  void *data)
+{
+	struct nid_stat *stats = data;
+	struct obd_device *obd = stats->nid_obd;
+	struct exp_uuid_cb_data cb_data;
+	int len = 0;
+
+	*eof = 1;
+	page[0] = '\0';
+
+	lprocfs_exp_rd_cb_data_init(&cb_data, page, count, eof, &len);
+	cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid,
+			      lprocfs_exp_print_hash, &cb_data);
+
+	/* cb_data.len points at len */
+	return len;
+}
+
+/*
+ * procfs read handler for the nid-stats "clear" file: prints a fixed usage
+ * hint.  *@eof is set to 1.
+ */
+int lprocfs_nid_stats_clear_read(char *page, char **start, off_t off,
+					int count, int *eof,  void *data)
+{
+	int len;
+
+	*eof = 1;
+	len = snprintf(page, count, "%s\n",
+		       "Write into this file to clear all nid stats and "
+		       "stale nid entries");
+	return len;
+}
+EXPORT_SYMBOL(lprocfs_nid_stats_clear_read);
+
+/*
+ * Callback for cfs_hash_cond_del(): @obj is a nid_stat, @data the list head
+ * collecting entries to free.
+ *
+ * If the entry's reference count is exactly 1 it is moved onto @data (under
+ * obd_nid_lock) and 1 is returned.  Otherwise only its nid_stats counters
+ * are cleared and 0 is returned.
+ */
+static int lprocfs_nid_stats_clear_write_cb(void *obj, void *data)
+{
+	struct nid_stat *stat = obj;
+	ENTRY;
+
+	CDEBUG(D_INFO,"refcnt %d\n", atomic_read(&stat->nid_exp_ref_count));
+
+	if (atomic_read(&stat->nid_exp_ref_count) != 1) {
+		/* still referenced elsewhere - only clear the data */
+		if (stat->nid_stats != NULL)
+			lprocfs_clear_stats(stat->nid_stats);
+		RETURN(0);
+	}
+
+	/* object has only hash references. */
+	spin_lock(&stat->nid_obd->obd_nid_lock);
+	list_move(&stat->nid_list, data);
+	spin_unlock(&stat->nid_obd->obd_nid_lock);
+
+	RETURN(1);
+}
+
+/*
+ * procfs write handler: run lprocfs_nid_stats_clear_write_cb() over the
+ * device's obd_nid_stats_hash, then free every nid_stat the callback moved
+ * onto the local list.  The written data is ignored.  @data is the
+ * obd_device.  Returns @count.
+ */
+int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
+				  unsigned long count, void *data)
+{
+	struct obd_device *obd = data;
+	struct nid_stat *victim;
+	LIST_HEAD(free_list);
+
+	cfs_hash_cond_del(obd->obd_nid_stats_hash,
+			  lprocfs_nid_stats_clear_write_cb, &free_list);
+
+	for (;;) {
+		if (list_empty(&free_list))
+			break;
+
+		victim = list_entry(free_list.next, struct nid_stat,
+				    nid_list);
+		list_del_init(&victim->nid_list);
+		lprocfs_free_client_stats(victim);
+	}
+
+	return count;
+}
+EXPORT_SYMBOL(lprocfs_nid_stats_clear_write);
+
+int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid)
+{
+	struct nid_stat *new_stat, *old_stat;
+	struct obd_device *obd = NULL;
+	proc_dir_entry_t *entry;
+	char *buffer = NULL;
+	int rc = 0;
+	ENTRY;
+	/* Bind per-NID stats to \a exp; *newnid is set to 1 if they were created. */
+	*newnid = 0;
+
+	if (!exp || !exp->exp_obd || !exp->exp_obd->obd_proc_exports_entry ||
+	    !exp->exp_obd->obd_nid_stats_hash)
+		RETURN(-EINVAL);
+
+	/* Do not test the nid against zero:
+	 * a nid may only be compared with another nid or with LNET_NID_ANY.
+	 * Anything else is nonsense. */
+	if (!nid || *nid == LNET_NID_ANY)
+		RETURN(0);
+
+	obd = exp->exp_obd;
+
+	CDEBUG(D_CONFIG, "using hash %p\n", obd->obd_nid_stats_hash);
+
+	OBD_ALLOC_PTR(new_stat);
+	if (new_stat == NULL)
+		RETURN(-ENOMEM);
+
+	new_stat->nid	       = *nid;
+	new_stat->nid_obd	   = exp->exp_obd;
+	/* the refcount starts at 1 to balance obd_disconnect */
+	atomic_set(&new_stat->nid_exp_ref_count, 1);
+
+	old_stat = cfs_hash_findadd_unique(obd->obd_nid_stats_hash,
+					   nid, &new_stat->nid_hash);
+	CDEBUG(D_INFO, "Found stats %p for nid %s - ref %d\n",
+	       old_stat, libcfs_nid2str(*nid),
+	       atomic_read(&new_stat->nid_exp_ref_count));
+
+	/* Release the stats previously attached to this export, because
+	 * lprocfs_exp_cleanup() has not been and never will be called. */
+	if (exp->exp_nid_stats) {
+		nidstat_putref(exp->exp_nid_stats);
+		exp->exp_nid_stats = NULL;
+	}
+
+	/* An entry for this nid already existed: attach it to the export and
+	 * return -EALREADY so the caller knows /proc was already populated */
+	if (old_stat != new_stat) {
+		exp->exp_nid_stats = old_stat;
+		GOTO(destroy_new, rc = -EALREADY);
+	}
+	/* not found - create the proc directory, named after the nid */
+	OBD_ALLOC(buffer, LNET_NIDSTR_SIZE);
+	if (buffer == NULL)
+		GOTO(destroy_new, rc = -ENOMEM);
+
+	memcpy(buffer, libcfs_nid2str(*nid), LNET_NIDSTR_SIZE);
+	new_stat->nid_proc = lprocfs_register(buffer,
+					      obd->obd_proc_exports_entry,
+					      NULL, NULL);
+	OBD_FREE(buffer, LNET_NIDSTR_SIZE);
+
+	if (new_stat->nid_proc == NULL) {
+		CERROR("Error making export directory for nid %s\n",
+		       libcfs_nid2str(*nid));
+		GOTO(destroy_new_ns, rc = -ENOMEM);
+	}
+
+	entry = lprocfs_add_simple(new_stat->nid_proc, "uuid",
+				   lprocfs_exp_rd_uuid, NULL, new_stat, NULL);
+	if (IS_ERR(entry)) {
+		CWARN("Error adding the NID stats file\n");
+		rc = PTR_ERR(entry);
+		GOTO(destroy_new_ns, rc);
+	}
+
+	entry = lprocfs_add_simple(new_stat->nid_proc, "hash",
+				   lprocfs_exp_rd_hash, NULL, new_stat, NULL);
+	if (IS_ERR(entry)) {
+		CWARN("Error adding the hash file\n");
+		rc = PTR_ERR(entry);
+		GOTO(destroy_new_ns, rc);
+	}
+
+	exp->exp_nid_stats = new_stat;
+	*newnid = 1;
+	/* serialize concurrent additions to the list; destroy needs no lock */
+	spin_lock(&obd->obd_nid_lock);
+	list_add(&new_stat->nid_list, &obd->obd_nid_stats);
+	spin_unlock(&obd->obd_nid_lock);
+
+	RETURN(rc);
+
+destroy_new_ns:	/* undo the proc directory and the hash insertion */
+	if (new_stat->nid_proc != NULL)
+		lprocfs_remove(&new_stat->nid_proc);
+	cfs_hash_del(obd->obd_nid_stats_hash, nid, &new_stat->nid_hash);
+
+destroy_new:	/* drop our reference on new_stat and free it */
+	nidstat_putref(new_stat);
+	OBD_FREE_PTR(new_stat);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(lprocfs_exp_setup);
+
+int lprocfs_exp_cleanup(struct obd_export *exp)
+{
+	struct nid_stat *stat = exp->exp_nid_stats;
+
+	/* Nothing to release; plain return, as no ENTRY pairs with a RETURN(). */
+	if (stat == NULL || exp->exp_obd == NULL)
+		return 0;
+	/* Drop this export's reference on its per-NID stats and detach them. */
+	nidstat_putref(stat);
+	exp->exp_nid_stats = NULL;
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_exp_cleanup);
+
+int lprocfs_write_helper(const char *buffer, unsigned long count, int *val)
+{
+	const int unscaled = 1;	/* multiplier of 1: no scaling */
+	return lprocfs_write_frac_helper(buffer, count, val, unscaled);
+}
+EXPORT_SYMBOL(lprocfs_write_helper);
+
+int lprocfs_write_frac_helper(const char *buffer, unsigned long count,
+			      int *val, int mult)
+{
+	char kernbuf[20], *end, *pbuf;
+	int whole;
+
+	/* Parse user text "[-]N[.F]" into *val, scaled by mult; 0 or -errno. */
+	if (count > (sizeof(kernbuf) - 1))
+		return -EINVAL;
+
+	if (copy_from_user(kernbuf, buffer, count))
+		return -EFAULT;
+
+	kernbuf[count] = '\0';
+	pbuf = kernbuf;
+	if (*pbuf == '-') {
+		mult = -mult;
+		pbuf++;
+	}
+
+	whole = (int)simple_strtoul(pbuf, &end, 10) * mult;
+	if (pbuf == end)
+		return -EINVAL;	/* no digits: *val is left untouched */
+	*val = whole;
+
+	if (*end == '.') {
+		int temp_val, pow = 1;
+		int i;
+
+		pbuf = end + 1;
+		if (strlen(pbuf) > 5)
+			pbuf[5] = '\0'; /* only allow 5 fractional digits */
+		temp_val = (int)simple_strtoul(pbuf, &end, 10) * mult;
+		if (pbuf < end) {
+			for (i = 0; i < (end - pbuf); i++)
+				pow *= 10;
+			*val += temp_val / pow;
+		}
+	}
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_write_frac_helper);
+
+int lprocfs_read_frac_helper(char *buffer, unsigned long count, long val,
+			     int mult)
+{
+	long decimal_val, frac_val;
+	int prtn;
+	/* Print val / mult into buffer as a decimal string ending in '\n'. */
+	if (count < 10)
+		return -EINVAL;
+
+	decimal_val = val / mult;	/* NOTE(review): assumes mult != 0 */
+	prtn = snprintf(buffer, count, "%ld", decimal_val);
+	frac_val = val % mult;
+
+	if (prtn < (count - 4) && frac_val > 0) {
+		long temp_frac;
+		int i, temp_mult = 1, frac_bits = 0;
+
+		temp_frac = frac_val * 10;
+		buffer[prtn++] = '.';
+		while (frac_bits < 2 && (temp_frac / mult) < 1 ) {
+			/* pad with '0' (at most 2) while the next digit is zero */
+			buffer[prtn++] ='0';
+			temp_frac *= 10;
+			frac_bits++;
+		}
+		/*
+		 * Cases to keep in mind:
+		 *      1. #echo x.00 > /proc/xxx       output result : x
+		 *      2. #echo x.0x > /proc/xxx       output result : x.0x
+		 *      3. #echo x.x0 > /proc/xxx       output result : x.x
+		 *      4. #echo x.xx > /proc/xxx       output result : x.xx
+		 *      Only 2 fraction digits are kept.
+		 */
+		for (i = 0; i < (5 - prtn); i++)
+			temp_mult *= 10;
+
+		frac_bits = min((int)count - prtn, 3 - frac_bits);
+		prtn += snprintf(buffer + prtn, frac_bits, "%ld",
+				 frac_val * temp_mult / mult);
+
+		prtn--;	/* walk back over output that does not end in '1'..'9' */
+		while(buffer[prtn] < '1' || buffer[prtn] > '9') {
+			prtn--;
+			if (buffer[prtn] == '.') {
+				prtn--;
+				break;
+			}
+		}
+		prtn++;
+	}
+	buffer[prtn++] ='\n';	/* NOTE(review): no NUL is stored after the '\n' */
+	return prtn;
+}
+EXPORT_SYMBOL(lprocfs_read_frac_helper);
+
+int lprocfs_write_u64_helper(const char *buffer, unsigned long count,__u64 *val)
+{
+	return lprocfs_write_frac_u64_helper(buffer, count, val, 1); /* mult = 1 */
+}
+EXPORT_SYMBOL(lprocfs_write_u64_helper);
+
+int lprocfs_write_frac_u64_helper(const char *buffer, unsigned long count,
+			      __u64 *val, int mult)
+{
+	char kernbuf[22], *end, *pbuf;
+	__u64 whole, frac = 0, units;
+	__s64 scale = mult;	/* 64-bit: 'T' and 'P' factors overflow an int */
+	unsigned frac_d = 1;
+
+	/* Parse user text "[-]N[.F][kKmMgGtTpP]" into *val; 0 or -errno. */
+	if (count > (sizeof(kernbuf) - 1))
+		return -EINVAL;
+
+	if (copy_from_user(kernbuf, buffer, count))
+		return -EFAULT;
+
+	kernbuf[count] = '\0';
+	pbuf = kernbuf;
+	if (*pbuf == '-') {
+		scale = -scale;
+		pbuf++;
+	}
+
+	whole = simple_strtoull(pbuf, &end, 10);
+	if (pbuf == end)
+		return -EINVAL;
+
+	if (*end == '.') {
+		int i;
+
+		pbuf = end + 1;
+		frac = simple_strtoull(pbuf, &end, 10);
+		/* keep at most 9 decimal places: 10^10 overflows 32-bit frac_d */
+		for (i = end - pbuf; i > 9; i--)
+			do_div(frac, 10);
+		for (; i > 0; i--)
+			frac_d *= 10;	/* one factor per decimal place */
+	}
+
+	units = 1;
+	switch(*end) {
+	case 'p': case 'P':
+		units <<= 10;	/* fall through */
+	case 't': case 'T':
+		units <<= 10;	/* fall through */
+	case 'g': case 'G':
+		units <<= 10;	/* fall through */
+	case 'm': case 'M':
+		units <<= 10;	/* fall through */
+	case 'k': case 'K':
+		units <<= 10;
+	}
+	/* Only an explicit unit suffix overrides the caller's multiplier */
+	if (units > 1)
+		scale = scale < 0 ? -(__s64)units : (__s64)units;
+
+	frac *= scale;
+	do_div(frac, frac_d);
+	*val = whole * scale + frac;
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_write_frac_u64_helper);
+
+static char *lprocfs_strnstr(const char *s1, const char *s2, size_t len)
+{
+	const size_t needle_len = strlen(s2);
+	const char *pos;
+
+	/* an empty needle matches at the very start */
+	if (needle_len == 0)
+		return (char *)s1;
+	/* slide a needle-sized window over the first len bytes of s1 */
+	for (pos = s1; len >= needle_len; pos++, len--) {
+		if (memcmp(pos, s2, needle_len) == 0)
+			return (char *)pos;
+	}
+	return NULL;
+}
+
+/**
+ * Locate \a name within the first \a *count bytes of \a buffer and return a
+ * pointer to the alphanumeric run that follows it (whitespace in between is
+ * skipped), storing that run's length in \a *count.
+ * If \a name does not occur, \a buffer is returned and \a *count is unchanged.
+ */
+char *lprocfs_find_named_value(const char *buffer, const char *name,
+				unsigned long *count)
+{
+	const char *limit = buffer + *count;
+	char *cur, *value;
+
+	/* there is no strnstr() in rhel5 and ubuntu kernels */
+	cur = lprocfs_strnstr(buffer, name, *count);
+	if (cur == NULL)
+		return (char *)buffer;
+
+	/* step over the name itself, then over the separating whitespace */
+	for (cur += strlen(name); cur < limit && isspace(*cur); cur++)
+		;
+	/* the value is the alphanumeric run that starts here */
+	for (value = cur; cur < limit && isalnum(*cur); cur++)
+		;
+
+	*count = cur - value;
+	return value;
+}
+EXPORT_SYMBOL(lprocfs_find_named_value);
+
+int lprocfs_seq_create(proc_dir_entry_t *parent, const char *name, mode_t mode,
+		       const struct file_operations *seq_fops, void *data)
+{
+	struct proc_dir_entry *pde;
+	int rc = -ENOMEM;
+	ENTRY;
+
+	/*
+	 * Disallow secretly (un)writable entries: a write handler must be
+	 * present exactly when the mode grants some write permission.
+	 */
+	LASSERT((seq_fops->write == NULL) == ((mode & 0222) == 0));
+
+	LPROCFS_WRITE_ENTRY();
+	pde = create_proc_entry(name, mode, parent);
+	if (pde != NULL) {
+		pde->proc_fops = seq_fops;
+		pde->data = data;
+		rc = 0;
+	}
+	LPROCFS_WRITE_EXIT();
+
+	/* still -ENOMEM when the proc entry could not be created */
+	RETURN(rc);
+}
+EXPORT_SYMBOL(lprocfs_seq_create);
+
+int lprocfs_obd_seq_create(struct obd_device *dev, const char *name,
+			   mode_t mode, const struct file_operations *seq_fops,
+			   void *data)
+{
+	proc_dir_entry_t *parent = dev->obd_proc_entry;
+
+	/* same as lprocfs_seq_create(), rooted at the device's proc entry */
+	return lprocfs_seq_create(parent, name, mode, seq_fops, data);
+}
+EXPORT_SYMBOL(lprocfs_obd_seq_create);
+
+void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value)
+{
+	/* out-of-range samples are counted in the last bucket */
+	unsigned int slot = value < OBD_HIST_MAX ? value : OBD_HIST_MAX - 1;
+
+	spin_lock(&oh->oh_lock);
+	oh->oh_buckets[slot]++;
+	spin_unlock(&oh->oh_lock);
+}
+EXPORT_SYMBOL(lprocfs_oh_tally);
+
+void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value)
+{
+	unsigned int val, rest = value ? value - 1 : 0;
+
+	/* ceil(log2(value)) = bit length of value - 1; no 1 << val overflow */
+	for (val = 0; rest != 0 && val <= OBD_HIST_MAX; rest >>= 1, val++)
+		;
+	lprocfs_oh_tally(oh, val);
+}
+EXPORT_SYMBOL(lprocfs_oh_tally_log2);
+
+unsigned long lprocfs_oh_sum(struct obd_histogram *oh)
+{
+	unsigned long total = 0;
+	int bucket = OBD_HIST_MAX;
+
+	while (bucket-- > 0)	/* add up every bucket; no lock is taken */
+		total += oh->oh_buckets[bucket];
+	return total;
+}
+EXPORT_SYMBOL(lprocfs_oh_sum);
+
+void lprocfs_oh_clear(struct obd_histogram *oh)
+{
+	spin_lock(&oh->oh_lock);	/* same lock lprocfs_oh_tally() takes */
+	memset(oh->oh_buckets, 0, sizeof(oh->oh_buckets)); /* zero all buckets */
+	spin_unlock(&oh->oh_lock);
+}
+EXPORT_SYMBOL(lprocfs_oh_clear);
+
+int lprocfs_obd_rd_hash(char *page, char **start, off_t off,
+			int count, int *eof, void *data)
+{
+	struct obd_device *obd = data;
+	int n;
+
+	if (obd == NULL)
+		return 0;
+
+	/* header first, then each hash's debug text appended after the last */
+	n = cfs_hash_debug_header(page, count);
+	n += cfs_hash_debug_str(obd->obd_uuid_hash, page + n, count - n);
+	n += cfs_hash_debug_str(obd->obd_nid_hash, page + n, count - n);
+	n += cfs_hash_debug_str(obd->obd_nid_stats_hash, page + n, count - n);
+	return n;
+}
+EXPORT_SYMBOL(lprocfs_obd_rd_hash);
+
+int lprocfs_obd_rd_recovery_status(char *page, char **start, off_t off,
+				   int count, int *eof, void *data)
+{
+	struct obd_device *obd = data;
+	int len = 0, size;
+	/* Format the recovery state of \a obd as "key: value" lines in \a page. */
+	LASSERT(obd != NULL);
+	LASSERT(count >= 0);
+
+	/* Point *start at page + off: the caller may
+	   have asked for far fewer bytes than this
+	   function produces, so the data it wants
+	   begins at that offset of the full text. */
+	*start = page + off;
+
+	/* We know we are allocated a page here.
+	   Also we know that this function will
+	   not need to write more than a page
+	   so we can truncate at PAGE_CACHE_SIZE.  */
+	size = min(count + (int)off + 1, (int)PAGE_CACHE_SIZE);
+
+	/* Zero the part of the page that may be used */
+	memset(page, 0, size);
+
+	if (lprocfs_obd_snprintf(&page, size, &len, "status: ") <= 0)
+		goto out;
+	if (obd->obd_max_recoverable_clients == 0) {
+		if (lprocfs_obd_snprintf(&page, size, &len, "INACTIVE\n") <= 0)
+			goto out;
+
+		goto fclose;
+	}
+
+	/* obd_recovering is sampled without holding a lock */
+	if (obd->obd_recovering == 0) {
+		if (lprocfs_obd_snprintf(&page, size, &len, "COMPLETE\n") <= 0)
+			goto out;
+		if (lprocfs_obd_snprintf(&page, size, &len,
+					 "recovery_start: %lu\n",
+					 obd->obd_recovery_start) <= 0)
+			goto out;
+		if (lprocfs_obd_snprintf(&page, size, &len,
+					 "recovery_duration: %lu\n",
+					 obd->obd_recovery_end -
+					 obd->obd_recovery_start) <= 0)
+			goto out;
+		/* Number of clients that have completed recovery */
+		if (lprocfs_obd_snprintf(&page, size, &len,
+					 "completed_clients: %d/%d\n",
+					 obd->obd_max_recoverable_clients -
+					 obd->obd_stale_clients,
+					 obd->obd_max_recoverable_clients) <= 0)
+			goto out;
+		if (lprocfs_obd_snprintf(&page, size, &len,
+					 "replayed_requests: %d\n",
+					 obd->obd_replayed_requests) <= 0)
+			goto out;
+		if (lprocfs_obd_snprintf(&page, size, &len,
+					 "last_transno: "LPD64"\n",
+					 obd->obd_next_recovery_transno - 1)<=0)
+			goto out;
+		if (lprocfs_obd_snprintf(&page, size, &len, "VBR: %s\n",
+					 obd->obd_version_recov ?
+					 "ENABLED" : "DISABLED") <=0)
+			goto out;
+		if (lprocfs_obd_snprintf(&page, size, &len, "IR: %s\n",
+					 obd->obd_no_ir ?
+					 "DISABLED" : "ENABLED") <= 0)
+			goto out;
+		goto fclose;
+	}
+
+	if (lprocfs_obd_snprintf(&page, size, &len, "RECOVERING\n") <= 0)
+		goto out;
+	if (lprocfs_obd_snprintf(&page, size, &len, "recovery_start: %lu\n",
+				 obd->obd_recovery_start) <= 0)
+		goto out;
+	if (lprocfs_obd_snprintf(&page, size, &len, "time_remaining: %lu\n",
+				 cfs_time_current_sec() >=
+				 obd->obd_recovery_start +
+				 obd->obd_recovery_timeout ? 0 :
+				 obd->obd_recovery_start +
+				 obd->obd_recovery_timeout -
+				 cfs_time_current_sec()) <= 0)
+		goto out;
+	if (lprocfs_obd_snprintf(&page, size, &len,"connected_clients: %d/%d\n",
+				 atomic_read(&obd->obd_connected_clients),
+				 obd->obd_max_recoverable_clients) <= 0)
+		goto out;
+	/* Number of clients still in the request replay stage */
+	if (lprocfs_obd_snprintf(&page, size, &len,"req_replay_clients: %d\n",
+				 atomic_read(&obd->obd_req_replay_clients))
+		<= 0)
+		goto out;
+	if (lprocfs_obd_snprintf(&page, size, &len,"lock_repay_clients: %d\n",
+				 atomic_read(&obd->obd_lock_replay_clients))
+		<=0)	/* NOTE(review): the key is spelled "repay" (sic) */
+		goto out;
+	if (lprocfs_obd_snprintf(&page, size, &len,"completed_clients: %d\n",
+				 atomic_read(&obd->obd_connected_clients) -
+				 atomic_read(&obd->obd_lock_replay_clients))
+		<=0)
+		goto out;
+	if (lprocfs_obd_snprintf(&page, size, &len,"evicted_clients: %d\n",
+				 obd->obd_stale_clients) <= 0)
+		goto out;
+	if (lprocfs_obd_snprintf(&page, size, &len,"replayed_requests: %d\n",
+				 obd->obd_replayed_requests) <= 0)
+		goto out;
+	if (lprocfs_obd_snprintf(&page, size, &len, "queued_requests: %d\n",
+				 obd->obd_requests_queued_for_recovery) <= 0)
+		goto out;
+
+	if (lprocfs_obd_snprintf(&page, size, &len, "next_transno: "LPD64"\n",
+				 obd->obd_next_recovery_transno) <= 0)
+		goto out;
+
+fclose:	/* everything was written: report end-of-file */
+	*eof = 1;
+out:	/* bytes available past \a off, capped at what was asked for */
+	return min(count, len - (int)off);
+}
+EXPORT_SYMBOL(lprocfs_obd_rd_recovery_status);
+
+int lprocfs_obd_rd_ir_factor(char *page, char **start, off_t off,
+			     int count, int *eof, void *data)
+{
+	struct obd_device *obd = data;
+
+	LASSERT(obd != NULL);
+	/* a single decimal line holding obd_recovery_ir_factor */
+	return snprintf(page, count, "%d\n", obd->obd_recovery_ir_factor);
+}
+EXPORT_SYMBOL(lprocfs_obd_rd_ir_factor);
+
+int lprocfs_obd_wr_ir_factor(struct file *file, const char *buffer,
+			     unsigned long count, void *data)
+{
+	struct obd_device *obd = data;
+	int factor, rc;
+
+	LASSERT(obd != NULL);
+	rc = lprocfs_write_helper(buffer, count, &factor);
+	if (rc != 0)
+		return rc;
+
+	/* accept only values inside [OBD_IR_FACTOR_MIN, OBD_IR_FACTOR_MAX] */
+	if (!(factor >= OBD_IR_FACTOR_MIN && factor <= OBD_IR_FACTOR_MAX))
+		return -EINVAL;
+	obd->obd_recovery_ir_factor = factor;
+	return count;
+}
+EXPORT_SYMBOL(lprocfs_obd_wr_ir_factor);
+
+int lprocfs_obd_rd_recovery_time_soft(char *page, char **start, off_t off,
+				      int count, int *eof, void *data)
+{
+	struct obd_device *obd = data;
+
+	LASSERT(obd != NULL);
+	/* a single decimal line holding obd_recovery_timeout */
+	return snprintf(page, count, "%d\n", obd->obd_recovery_timeout);
+}
+EXPORT_SYMBOL(lprocfs_obd_rd_recovery_time_soft);
+
+int lprocfs_obd_wr_recovery_time_soft(struct file *file, const char *buffer,
+				      unsigned long count, void *data)
+{
+	struct obd_device *obd = data;
+	int timeout, rc;
+
+	LASSERT(obd != NULL);
+	rc = lprocfs_write_helper(buffer, count, &timeout);
+	if (rc != 0)
+		return rc;
+	/* stored as parsed; no range check is applied here */
+	obd->obd_recovery_timeout = timeout;
+	return count;
+}
+EXPORT_SYMBOL(lprocfs_obd_wr_recovery_time_soft);
+
+int lprocfs_obd_rd_recovery_time_hard(char *page, char **start, off_t off,
+				      int count, int *eof, void *data)
+{
+	struct obd_device *dev = data;
+
+	LASSERT(dev != NULL);	/* the proc entry must carry its obd_device */
+	return snprintf(page, count, "%u\n", dev->obd_recovery_time_hard);
+}
+EXPORT_SYMBOL(lprocfs_obd_rd_recovery_time_hard);
+
+int lprocfs_obd_wr_recovery_time_hard(struct file *file, const char *buffer,
+				      unsigned long count, void *data)
+{
+	struct obd_device *dev = data;
+	int limit, rc;
+
+	LASSERT(dev != NULL);
+	rc = lprocfs_write_helper(buffer, count, &limit);
+	if (rc != 0)
+		return rc;
+	/* stored as parsed; no range check is applied here */
+	dev->obd_recovery_time_hard = limit;
+	return count;
+}
+EXPORT_SYMBOL(lprocfs_obd_wr_recovery_time_hard);
+
+int lprocfs_obd_rd_max_pages_per_rpc(char *page, char **start, off_t off,
+				     int count, int *eof, void *data)
+{
+	struct client_obd *cli = &((struct obd_device *)data)->u.cli;
+	int len;
+
+	/* format the value while holding cl_loi_list_lock */
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	len = snprintf(page, count, "%d\n", cli->cl_max_pages_per_rpc);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+	return len;
+}
+EXPORT_SYMBOL(lprocfs_obd_rd_max_pages_per_rpc);
+
+int lprocfs_target_rd_instance(char *page, char **start, off_t off,
+			       int count, int *eof, void *data)
+{
+	struct obd_device *obd = (struct obd_device *)data;
+
+	/* assert on obd before, not after, anything is derived from it */
+	LASSERT(obd != NULL);
+	LASSERT(obd->u.obt.obt_magic == OBT_MAGIC);
+	*eof = 1;
+	return snprintf(page, count, "%u\n", obd->u.obt.obt_instance);
+}
+EXPORT_SYMBOL(lprocfs_target_rd_instance);
+#endif /* LPROCFS*/
diff --git a/drivers/staging/lustre/lustre/obdclass/lu_object.c b/drivers/staging/lustre/lustre/obdclass/lu_object.c
new file mode 100644
index 000000000000..6c0de3f0a073
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lu_object.c
@@ -0,0 +1,2209 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lu_object.c
+ *
+ * Lustre Object.
+ * These are the only exported functions, they provide some generic
+ * infrastructure for managing object devices
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/libcfs/libcfs.h>
+
+# include <linux/module.h>
+
+/* hash_long() */
+#include <linux/libcfs/libcfs_hash.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_disk.h>
+#include <lustre_fid.h>
+#include <lu_object.h>
+#include <lu_ref.h>
+#include <linux/list.h>
+
+static void lu_object_free(const struct lu_env *env, struct lu_object *o);
+
+/**
+ * Drop one reference on object \a o. When the last reference goes away the
+ * object is put on its bucket's LRU list (cached), unless
+ * lu_object_is_dying() holds, in which case it is unhashed and freed at once.
+ */
+void lu_object_put(const struct lu_env *env, struct lu_object *o)
+{
+	struct lu_site_bkt_data *bkt;
+	struct lu_object_header *top;
+	struct lu_site	  *site;
+	struct lu_object	*orig;
+	cfs_hash_bd_t	    bd;
+	const struct lu_fid     *fid;
+
+	top  = o->lo_header;
+	site = o->lo_dev->ld_site;
+	orig = o;	/* \a o is reused as the layer iterator below */
+
+	/*
+	 * Until full fids-on-OST is implemented, anonymous objects
+	 * are possible in OSP. Such an object isn't listed in the site,
+	 * so we should not remove it from the site.
+	 */
+	fid = lu_object_fid(o);
+	if (fid_is_zero(fid)) {
+		LASSERT(top->loh_hash.next == NULL
+			&& top->loh_hash.pprev == NULL);
+		LASSERT(list_empty(&top->loh_lru));
+		if (!atomic_dec_and_test(&top->loh_ref))
+			return;
+		list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) {
+			if (o->lo_ops->loo_object_release != NULL)
+				o->lo_ops->loo_object_release(env, o);
+		}
+		lu_object_free(env, orig);
+		return;
+	}
+
+	cfs_hash_bd_get(site->ls_obj_hash, &top->loh_fid, &bd);
+	bkt = cfs_hash_bd_extra_get(site->ls_obj_hash, &bd);
+
+	if (!cfs_hash_bd_dec_and_lock(site->ls_obj_hash, &bd, &top->loh_ref)) {
+		if (lu_object_is_dying(top)) {
+
+			/*
+			 * somebody may be waiting for this, currently only
+			 * used for cl_object, see cl_object_put_last().
+			 */
+			wake_up_all(&bkt->lsb_marche_funebre);
+		}
+		return;
+	}
+
+	LASSERT(bkt->lsb_busy > 0);
+	bkt->lsb_busy--;
+	/*
+	 * When last reference is released, iterate over object
+	 * layers, and notify them that object is no longer busy.
+	 */
+	list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) {
+		if (o->lo_ops->loo_object_release != NULL)
+			o->lo_ops->loo_object_release(env, o);
+	}
+
+	if (!lu_object_is_dying(top)) {
+		LASSERT(list_empty(&top->loh_lru));
+		list_add_tail(&top->loh_lru, &bkt->lsb_lru);
+		cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1);
+		return;
+	}
+
+	/*
+	 * If object is dying (will not be cached), remove it
+	 * from hash table and LRU.
+	 *
+	 * This is done with hash table and LRU lists locked. As the only
+	 * way to acquire first reference to previously unreferenced
+	 * object is through hash-table lookup (lu_object_find()),
+	 * or LRU scanning (lu_site_purge()), that are done under hash-table
+	 * and LRU lock, no race with concurrent object lookup is possible
+	 * and we can safely destroy object below.
+	 */
+	if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags))
+		cfs_hash_bd_del_locked(site->ls_obj_hash, &bd, &top->loh_hash);
+	cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1);
+	/*
+	 * Object was already removed from hash and lru above, can
+	 * kill it.
+	 */
+	lu_object_free(env, orig);
+}
+EXPORT_SYMBOL(lu_object_put);
+
+/**
+ * Put object \a o and don't keep it in the cache: it is flagged
+ * LU_OBJECT_HEARD_BANSHEE before the reference is dropped.
+ */
+void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o)
+{
+	set_bit(LU_OBJECT_HEARD_BANSHEE, &o->lo_header->loh_flags);
+	lu_object_put(env, o);	/* void callee: no "return <expr>" here */
+}
+EXPORT_SYMBOL(lu_object_put_nocache);
+
+/**
+ * Mark the object as dying and remove it from the site hash and LRU list.
+ * Currently used by client code for layout change.
+ */
+void lu_object_unhash(const struct lu_env *env, struct lu_object *o)
+{
+	struct lu_object_header *hdr = o->lo_header;
+	cfs_hash_t *hash;
+	cfs_hash_bd_t bd;
+
+	set_bit(LU_OBJECT_HEARD_BANSHEE, &hdr->loh_flags);
+	/* whoever sets UNHASHED first does the removal; others have no work */
+	if (test_and_set_bit(LU_OBJECT_UNHASHED, &hdr->loh_flags))
+		return;
+	hash = o->lo_dev->ld_site->ls_obj_hash;
+	cfs_hash_bd_get_and_lock(hash, &hdr->loh_fid, &bd, 1);
+	list_del_init(&hdr->loh_lru);
+	cfs_hash_bd_del_locked(hash, &bd, &hdr->loh_hash);
+	cfs_hash_bd_unlock(hash, &bd, 1);
+}
+EXPORT_SYMBOL(lu_object_unhash);
+
+/**
+ * Allocate a new object with fid \a f on device \a dev.
+ *
+ * This follows the object creation protocol described in the comment within
+ * struct lu_device_operations; returns the top slice or an ERR_PTR() value.
+ */
+static struct lu_object *lu_object_alloc(const struct lu_env *env,
+					 struct lu_device *dev,
+					 const struct lu_fid *f,
+					 const struct lu_object_conf *conf)
+{
+	struct lu_object *scan;
+	struct lu_object *top;
+	struct list_head *layers;
+	int clean;
+	int result;
+	ENTRY;
+
+	/*
+	 * Create top-level object slice. This will also create
+	 * lu_object_header.
+	 */
+	top = dev->ld_ops->ldo_object_alloc(env, NULL, dev);
+	if (top == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+	if (IS_ERR(top))
+		RETURN(top);
+	/*
+	 * This is the only place where object fid is assigned. It's constant
+	 * after this point.
+	 */
+	top->lo_header->loh_fid = *f;
+	layers = &top->lo_header->loh_layers;
+	do {
+		/*
+		 * Call ->loo_object_init() repeatedly, until no more new
+		 * object slices are created.
+		 */
+		clean = 1;
+		list_for_each_entry(scan, layers, lo_linkage) {
+			if (scan->lo_flags & LU_OBJECT_ALLOCATED)
+				continue;	/* this slice is already set up */
+			clean = 0;
+			scan->lo_header = top->lo_header;
+			result = scan->lo_ops->loo_object_init(env, scan, conf);
+			if (result != 0) {
+				lu_object_free(env, top);
+				RETURN(ERR_PTR(result));
+			}
+			scan->lo_flags |= LU_OBJECT_ALLOCATED;
+		}
+	} while (!clean);
+
+	list_for_each_entry_reverse(scan, layers, lo_linkage) {
+		if (scan->lo_ops->loo_object_start != NULL) {
+			result = scan->lo_ops->loo_object_start(env, scan);
+			if (result != 0) {
+				lu_object_free(env, top);
+				RETURN(ERR_PTR(result));
+			}
+		}
+	}
+
+	lprocfs_counter_incr(dev->ld_site->ls_stats, LU_SS_CREATED);
+	RETURN(top);
+}
+
+/**
+ * Free an object: delete, then free, each of its layers, then wake waiters.
+ */
+static void lu_object_free(const struct lu_env *env, struct lu_object *o)
+{
+	struct lu_site_bkt_data *bkt;
+	struct lu_site	  *site;
+	struct lu_object	*scan;
+	struct list_head	      *layers;
+	struct list_head	       splice;
+
+	site   = o->lo_dev->ld_site;
+	layers = &o->lo_header->loh_layers;
+	bkt    = lu_site_bkt_from_fid(site, &o->lo_header->loh_fid);
+	/*
+	 * First call ->loo_object_delete() method to release all resources.
+	 */
+	list_for_each_entry_reverse(scan, layers, lo_linkage) {
+		if (scan->lo_ops->loo_object_delete != NULL)
+			scan->lo_ops->loo_object_delete(env, scan);
+	}
+
+	/*
+	 * Then, splice object layers into stand-alone list, and call
+	 * ->loo_object_free() on all layers to free memory. Splice is
+	 * necessary, because lu_object_header is freed together with the
+	 * top-level slice.
+	 */
+	INIT_LIST_HEAD(&splice);
+	list_splice_init(layers, &splice);
+	while (!list_empty(&splice)) {
+		/*
+		 * Free layers in bottom-to-top order, so that object header
+		 * lives as long as possible and ->loo_object_free() methods
+		 * can look at its contents.
+		 */
+		o = container_of0(splice.prev, struct lu_object, lo_linkage);
+		list_del_init(&o->lo_linkage);
+		LASSERT(o->lo_ops->loo_object_free != NULL);
+		o->lo_ops->loo_object_free(env, o);
+	}
+	/* bkt was looked up before the layers were freed */
+	if (waitqueue_active(&bkt->lsb_marche_funebre))
+		wake_up_all(&bkt->lsb_marche_funebre);
+}
+
+/**
+ * Free up to \a nr objects from the cold end of the site LRU lists (no limit
+ * when \a nr is ~0); returns the part of \a nr that was not used up. */
+int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr)
+{
+	struct lu_object_header *h;
+	struct lu_object_header *temp;
+	struct lu_site_bkt_data *bkt;
+	cfs_hash_bd_t	    bd;
+	cfs_hash_bd_t	    bd2;
+	struct list_head	       dispose;
+	int		      did_sth;
+	int		      start;
+	int		      count;
+	int		      bnr;
+	int		      i;
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_OBD_NO_LRU))
+		RETURN(0);	/* NOTE(review): RETURN without a matching ENTRY */
+
+	INIT_LIST_HEAD(&dispose);
+	/*
+	 * Under LRU list lock, scan LRU list and move unreferenced objects to
+	 * the dispose list, removing them from LRU and hash table.
+	 */
+	start = s->ls_purge_start;
+	bnr = (nr == ~0) ? -1 : nr / CFS_HASH_NBKT(s->ls_obj_hash) + 1;
+ again:
+	did_sth = 0;
+	cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) {
+		if (i < start)
+			continue;
+		count = bnr;	/* per-bucket quota; -1 means unlimited */
+		cfs_hash_bd_lock(s->ls_obj_hash, &bd, 1);
+		bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd);
+
+		list_for_each_entry_safe(h, temp, &bkt->lsb_lru, loh_lru) {
+			LASSERT(atomic_read(&h->loh_ref) == 0);
+
+			cfs_hash_bd_get(s->ls_obj_hash, &h->loh_fid, &bd2);
+			LASSERT(bd.bd_bucket == bd2.bd_bucket);
+
+			cfs_hash_bd_del_locked(s->ls_obj_hash,
+					       &bd2, &h->loh_hash);
+			list_move(&h->loh_lru, &dispose);
+			if (did_sth == 0)
+				did_sth = 1;
+
+			if (nr != ~0 && --nr == 0)
+				break;
+
+			if (count > 0 && --count == 0)
+				break;
+
+		}
+		cfs_hash_bd_unlock(s->ls_obj_hash, &bd, 1);
+		cond_resched();
+		/*
+		 * Free everything on the dispose list. This is safe against
+		 * races due to the reasons described in lu_object_put().
+		 */
+		while (!list_empty(&dispose)) {
+			h = container_of0(dispose.next,
+					  struct lu_object_header, loh_lru);
+			list_del_init(&h->loh_lru);
+			lu_object_free(env, lu_object_top(h));
+			lprocfs_counter_incr(s->ls_stats, LU_SS_LRU_PURGED);
+		}
+
+		if (nr == 0)
+			break;
+	}
+
+	if (nr != 0 && did_sth && start != 0) {
+		start = 0; /* restart from the first bucket */
+		goto again;
+	}
+	/* race on s->ls_purge_start, but nobody cares */
+	s->ls_purge_start = i % CFS_HASH_NBKT(s->ls_obj_hash);
+
+	return nr;
+}
+EXPORT_SYMBOL(lu_site_purge);
+
+/*
+ * Object printing.
+ *
+ * Code below has to jump through certain hoops to output object description
+ * into libcfs_debug_msg-based log. The problem is that lu_object_print()
+ * composes object description from strings that are parts of _lines_ of
+ * output (i.e., strings that are not terminated by newline). This doesn't fit
+ * very well into libcfs_debug_msg() interface that assumes that each message
+ * supplied to it is a self-contained output line.
+ *
+ * To work around this, strings are collected in a temporary buffer
+ * (implemented as a value of lu_cdebug_key key), until terminating newline
+ * character is detected.
+ *
+ */
+
+enum {
+	/**
+	 * Size in bytes of the buffer in which one debug line is assembled.
+	 *
+	 * XXX overflow is not handled correctly.
+	 */
+	LU_CDEBUG_LINE = 512
+};
+
+struct lu_cdebug_data {
+	/**
+	 * Temporary buffer holding the NUL-terminated partial line.
+	 */
+	char lck_area[LU_CDEBUG_LINE];
+};
+
+/* context key constructor/destructor: lu_global_key_init, lu_global_key_fini */
+LU_KEY_INIT_FINI(lu_global, struct lu_cdebug_data);
+
+/**
+ * Key whose value is the struct lu_cdebug_data buffer read back by
+ * lu_cdebug_printer(). Registered very early by lu_global_init().
+ */
+struct lu_context_key lu_global_key = {
+	.lct_tags = LCT_MD_THREAD | LCT_DT_THREAD |
+		    LCT_MG_THREAD | LCT_CL_THREAD,
+	.lct_init = lu_global_key_init,
+	.lct_fini = lu_global_key_fini
+};
+
+/**
+ * Printer function emitting messages through libcfs_debug_msg(). Output is
+ * accumulated in the per-context buffer until a format ends in a newline.
+ */
+int lu_cdebug_printer(const struct lu_env *env,
+		      void *cookie, const char *format, ...)
+{
+	struct libcfs_debug_msg_data *msgdata = cookie;
+	struct lu_cdebug_data	*key;
+	size_t fmt_len = strlen(format);
+	int used;
+	int complete;
+	va_list args;
+
+	va_start(args, format);
+	key = lu_context_key_get(&env->le_ctx, &lu_global_key);
+	LASSERT(key != NULL);
+
+	used = strlen(key->lck_area);
+	/* an empty format has no last character: never index format[-1] */
+	complete = fmt_len > 0 && format[fmt_len - 1] == '\n';
+	/* Append new chunk to the buffer. */
+	vsnprintf(key->lck_area + used,
+		  ARRAY_SIZE(key->lck_area) - used, format, args);
+	if (complete) {
+		if (cfs_cdebug_show(msgdata->msg_mask, msgdata->msg_subsys))
+			libcfs_debug_msg(msgdata, "%s", key->lck_area);
+		key->lck_area[0] = 0;
+	}
+	va_end(args);
+	return 0;
+}
+EXPORT_SYMBOL(lu_cdebug_printer);
+
+/**
+ * Print object header: address, flags, refcount, fid and state markers.
+ */
+void lu_object_header_print(const struct lu_env *env, void *cookie,
+			    lu_printer_t printer,
+			    const struct lu_object_header *hdr)
+{
+	const char *hashed = hlist_unhashed(&hdr->loh_hash) ? "" : " hash";
+	const char *on_lru = list_empty((struct list_head *)&hdr->loh_lru) ?
+			     "" : " lru";
+	const char *exists = hdr->loh_attr & LOHA_EXISTS ? " exist" : "";
+
+	(*printer)(env, cookie, "header@%p[%#lx, %d, "DFID"%s%s%s]",
+		   hdr, hdr->loh_flags, atomic_read(&hdr->loh_ref),
+		   PFID(&hdr->loh_fid), hashed, on_lru, exists);
+}
+EXPORT_SYMBOL(lu_object_header_print);
+
+/**
+ * Print the header of \a o and then one line per layer through \a printer.
+ */
+void lu_object_print(const struct lu_env *env, void *cookie,
+		     lu_printer_t printer, const struct lu_object *o)
+{
+	static const char ruler[] = "........................................";
+	struct lu_object_header *top = o->lo_header;
+	const struct lu_object *layer;
+
+	lu_object_header_print(env, cookie, printer, top);
+	printer(env, cookie, "{ \n");
+	list_for_each_entry(layer, &top->loh_layers, lo_linkage) {
+		int depth = layer->lo_depth + 4;
+
+		/*
+		 * `.' repeated \a depth times, then the type name and address
+		 */
+		printer(env, cookie, "%*.*s%s@%p", depth, depth, ruler,
+			layer->lo_dev->ld_type->ldt_name, layer);
+		if (layer->lo_ops->loo_object_print != NULL)
+			layer->lo_ops->loo_object_print(env, cookie, printer,
+							layer);
+		printer(env, cookie, "\n");
+	}
+	printer(env, cookie, "} header@%p\n", top);
+}
+EXPORT_SYMBOL(lu_object_print);
+
+/**
+ * Check object consistency.
+ *
+ * Runs the optional loo_object_invariant() method of every layer of the
+ * compound object that \a o belongs to.
+ *
+ * \retval 1 every layer implementing the check passed it
+ * \retval 0 some layer reported a broken invariant
+ */
+int lu_object_invariant(const struct lu_object *o)
+{
+	struct lu_object_header *hdr = o->lo_header;
+	const struct lu_object *layer;
+
+	list_for_each_entry(layer, &hdr->loh_layers, lo_linkage) {
+		if (layer->lo_ops->loo_object_invariant == NULL)
+			continue;
+		if (!layer->lo_ops->loo_object_invariant(layer))
+			return 0;
+	}
+	return 1;
+}
+EXPORT_SYMBOL(lu_object_invariant);
+
+/**
+ * Look up the object with fid \a f in bucket \a bd of the hash of site \a s.
+ *
+ * \a version holds the bucket version observed by the caller's previous
+ * lookup. If the bucket still has that version the search is skipped and
+ * NULL is returned; otherwise \a *version is updated to the current value.
+ * The callers in this file invoke this function with the bucket locked.
+ *
+ * \retval NULL             no object found, or bucket version unchanged
+ * \retval object           top layer of a cached object that is not dying;
+ *                          cfs_hash_get() has been called on it and its
+ *                          header has been unlinked from loh_lru
+ * \retval ERR_PTR(-EAGAIN) a dying object was found; \a waiter has been added
+ *                          to the bucket's lsb_marche_funebre wait queue and
+ *                          the task state set to TASK_UNINTERRUPTIBLE
+ */
+static struct lu_object *htable_lookup(struct lu_site *s,
+				       cfs_hash_bd_t *bd,
+				       const struct lu_fid *f,
+				       wait_queue_t *waiter,
+				       __u64 *version)
+{
+	struct lu_site_bkt_data *bkt;
+	struct lu_object_header *h;
+	struct hlist_node	*hnode;
+	__u64  ver = cfs_hash_bd_version_get(bd);
+
+	/* bucket has not changed since the caller last searched it */
+	if (*version == ver)
+		return NULL;
+
+	*version = ver;
+	bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, bd);
+	/* cfs_hash_bd_peek_locked is a somewhat "internal" function
+	 * of cfs_hash; it doesn't add a refcount on the object. */
+	hnode = cfs_hash_bd_peek_locked(s->ls_obj_hash, bd, (void *)f);
+	if (hnode == NULL) {
+		lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_MISS);
+		return NULL;
+	}
+
+	h = container_of0(hnode, struct lu_object_header, loh_hash);
+	if (likely(!lu_object_is_dying(h))) {
+		cfs_hash_get(s->ls_obj_hash, hnode);
+		lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_HIT);
+		list_del_init(&h->loh_lru);
+		return lu_object_top(h);
+	}
+
+	/*
+	 * Lookup found an object being destroyed; this object cannot be
+	 * returned (to assure that references to dying objects are eventually
+	 * drained), and moreover, lookup has to wait until object is freed.
+	 */
+
+	init_waitqueue_entry_current(waiter);
+	add_wait_queue(&bkt->lsb_marche_funebre, waiter);
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_DEATH_RACE);
+	return ERR_PTR(-EAGAIN);
+}
+
+/**
+ * Search cache for an object with the fid \a f. If such object is found,
+ * return it. Otherwise, create new object, insert it into cache and return
+ * it. In any case, additional reference is acquired on the returned object.
+ *
+ * The search is delegated to lu_object_find_at() using the top level device
+ * of the site that \a dev belongs to.
+ */
+struct lu_object *lu_object_find(const struct lu_env *env,
+				 struct lu_device *dev, const struct lu_fid *f,
+				 const struct lu_object_conf *conf)
+{
+	struct lu_device *top_dev = dev->ld_site->ls_top_dev;
+
+	return lu_object_find_at(env, top_dev, f, conf);
+}
+EXPORT_SYMBOL(lu_object_find);
+
+/**
+ * Allocate an object for fid \a f and insert it into the site hash without
+ * searching for an existing entry first.
+ *
+ * \retval object  newly allocated and hashed object
+ * \retval ERR_PTR error returned by lu_object_alloc()
+ */
+static struct lu_object *lu_object_new(const struct lu_env *env,
+				       struct lu_device *dev,
+				       const struct lu_fid *f,
+				       const struct lu_object_conf *conf)
+{
+	struct lu_site_bkt_data *bucket;
+	struct lu_object	*obj;
+	cfs_hash_t		*hash;
+	cfs_hash_bd_t		 bd;
+
+	obj = lu_object_alloc(env, dev, f, conf);
+	if (unlikely(IS_ERR(obj)))
+		return obj;
+
+	hash = dev->ld_site->ls_obj_hash;
+	cfs_hash_bd_get_and_lock(hash, (void *)f, &bd, 1);
+	bucket = cfs_hash_bd_extra_get(hash, &bd);
+	cfs_hash_bd_add_locked(hash, &bd, &obj->lo_header->loh_hash);
+	/* the new object counts as busy in its bucket */
+	bucket->lsb_busy++;
+	cfs_hash_bd_unlock(hash, &bd, 1);
+
+	return obj;
+}
+
+/**
+ * Core logic of lu_object_find*() functions.
+ *
+ * \param env    execution environment
+ * \param dev    device whose site hash is searched and which allocates the
+ *               object on a miss
+ * \param f      fid of the object
+ * \param conf   optional configuration; LOC_F_NEW in loc_flags skips the
+ *               lookup entirely
+ * \param waiter wait queue entry handed to htable_lookup()
+ *
+ * \retval object           found or newly inserted object
+ * \retval ERR_PTR(-EAGAIN) a dying object was found (see htable_lookup())
+ * \retval ERR_PTR          error returned by lu_object_alloc()
+ */
+static struct lu_object *lu_object_find_try(const struct lu_env *env,
+					    struct lu_device *dev,
+					    const struct lu_fid *f,
+					    const struct lu_object_conf *conf,
+					    wait_queue_t *waiter)
+{
+	struct lu_object      *o;
+	struct lu_object      *shadow;
+	struct lu_site	*s;
+	cfs_hash_t	    *hs;
+	cfs_hash_bd_t	  bd;
+	__u64		  version = 0;
+
+	/*
+	 * This uses standard index maintenance protocol:
+	 *
+	 *     - search index under lock, and return object if found;
+	 *     - otherwise, unlock index, allocate new object;
+	 *     - lock index and search again;
+	 *     - if nothing is found (usual case), insert newly created
+	 *       object into index;
+	 *     - otherwise (race: other thread inserted object), free
+	 *       object just allocated.
+	 *     - unlock index;
+	 *     - return object.
+	 *
+	 * For the "LOC_F_NEW" case, we are sure the object is newly
+	 * established. It is unnecessary to perform
+	 * lookup-alloc-lookup-insert; instead, just alloc and insert directly.
+	 *
+	 * If dying object is found during index search, add @waiter to the
+	 * site wait-queue and return ERR_PTR(-EAGAIN).
+	 */
+	if (conf != NULL && conf->loc_flags & LOC_F_NEW)
+		return lu_object_new(env, dev, f, conf);
+
+	s  = dev->ld_site;
+	hs = s->ls_obj_hash;
+	cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1);
+	o = htable_lookup(s, &bd, f, waiter, &version);
+	cfs_hash_bd_unlock(hs, &bd, 1);
+	/* either a cached object or ERR_PTR(-EAGAIN) */
+	if (o != NULL)
+		return o;
+
+	/*
+	 * Allocate new object. This may result in rather complicated
+	 * operations, including fld queries, inode loading, etc.
+	 */
+	o = lu_object_alloc(env, dev, f, conf);
+	if (unlikely(IS_ERR(o)))
+		return o;
+
+	LASSERT(lu_fid_eq(lu_object_fid(o), f));
+
+	cfs_hash_bd_lock(hs, &bd, 1);
+
+	/*
+	 * Second search; htable_lookup() returns NULL straight away when the
+	 * bucket version still equals the one recorded by the first search.
+	 */
+	shadow = htable_lookup(s, &bd, f, waiter, &version);
+	if (likely(shadow == NULL)) {
+		struct lu_site_bkt_data *bkt;
+
+		bkt = cfs_hash_bd_extra_get(hs, &bd);
+		cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash);
+		bkt->lsb_busy++;
+		cfs_hash_bd_unlock(hs, &bd, 1);
+		return o;
+	}
+
+	/* lost the race: drop our copy and return what the lookup produced */
+	lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_RACE);
+	cfs_hash_bd_unlock(hs, &bd, 1);
+	lu_object_free(env, o);
+	return shadow;
+}
+
+/**
+ * Much like lu_object_find(), but top level device of object is specifically
+ * \a dev rather than top level device of the site. This interface allows
+ * objects of different "stacking" to be created within the same site.
+ *
+ * lu_object_find_try() is retried for as long as it reports -EAGAIN; between
+ * attempts the caller sleeps on the wait queue entry that the failed attempt
+ * queued.
+ */
+struct lu_object *lu_object_find_at(const struct lu_env *env,
+				    struct lu_device *dev,
+				    const struct lu_fid *f,
+				    const struct lu_object_conf *conf)
+{
+	struct lu_site_bkt_data *bucket;
+	struct lu_object	*found;
+	wait_queue_t		 waiter;
+
+	for (;;) {
+		found = lu_object_find_try(env, dev, f, conf, &waiter);
+		if (found != ERR_PTR(-EAGAIN))
+			break;
+		/*
+		 * lu_object_find_try() already added waiter into the
+		 * wait queue.
+		 */
+		waitq_wait(&waiter, TASK_UNINTERRUPTIBLE);
+		bucket = lu_site_bkt_from_fid(dev->ld_site, (void *)f);
+		remove_wait_queue(&bucket->lsb_marche_funebre, &waiter);
+	}
+	return found;
+}
+EXPORT_SYMBOL(lu_object_find_at);
+
+/**
+ * Find object with given fid, and return its slice belonging to given device.
+ *
+ * \retval slice   layer of the object whose device type equals dev->ld_type
+ * \retval NULL    the object has no such layer; the reference obtained by
+ *                 the lookup has been dropped
+ * \retval ERR_PTR error returned by lu_object_find()
+ */
+struct lu_object *lu_object_find_slice(const struct lu_env *env,
+				       struct lu_device *dev,
+				       const struct lu_fid *f,
+				       const struct lu_object_conf *conf)
+{
+	struct lu_object *top;
+	struct lu_object *slice;
+
+	top = lu_object_find(env, dev, f, conf);
+	if (IS_ERR(top))
+		return top;
+
+	slice = lu_object_locate(top->lo_header, dev->ld_type);
+	if (slice == NULL)
+		lu_object_put(env, top);
+	return slice;
+}
+EXPORT_SYMBOL(lu_object_find_slice);
+
+/**
+ * Global list of all device types.
+ */
+static LIST_HEAD(lu_device_types);
+
+/**
+ * Initialize device type \a ldt: run its optional ldto_init() method and, on
+ * success, link it into lu_device_types.
+ *
+ * \retval 0      success (also when no ldto_init() is provided)
+ * \retval other  value returned by ldto_init(); the type is not linked
+ */
+int lu_device_type_init(struct lu_device_type *ldt)
+{
+	int rc = 0;
+
+	INIT_LIST_HEAD(&ldt->ldt_linkage);
+	if (ldt->ldt_ops->ldto_init)
+		rc = ldt->ldt_ops->ldto_init(ldt);
+	if (rc != 0)
+		return rc;
+
+	list_add(&ldt->ldt_linkage, &lu_device_types);
+	return 0;
+}
+EXPORT_SYMBOL(lu_device_type_init);
+
+/**
+ * Finalize device type \a ldt: unlink it from its list and run its optional
+ * ldto_fini() method.
+ */
+void lu_device_type_fini(struct lu_device_type *ldt)
+{
+	list_del_init(&ldt->ldt_linkage);
+	if (ldt->ldt_ops->ldto_fini)
+		ldt->ldt_ops->ldto_fini(ldt);
+}
+EXPORT_SYMBOL(lu_device_type_fini);
+
+/**
+ * Call the optional ldto_stop() method of every registered device type that
+ * currently has no devices (ldt_device_nr == 0).
+ */
+void lu_types_stop(void)
+{
+	struct lu_device_type *type;
+
+	list_for_each_entry(type, &lu_device_types, ldt_linkage) {
+		if (type->ldt_device_nr != 0)
+			continue;
+		if (type->ldt_ops->ldto_stop)
+			type->ldt_ops->ldto_stop(type);
+	}
+}
+EXPORT_SYMBOL(lu_types_stop);
+
+/**
+ * Global list of all sites on this node. Sites are linked through
+ * lu_site::ls_linkage; lu_site_init_finish() and lu_site_fini() modify the
+ * list while holding lu_sites_guard.
+ */
+static LIST_HEAD(lu_sites);
+static DEFINE_MUTEX(lu_sites_guard);
+
+/**
+ * Global environment used by site shrinker. Its context is refilled in
+ * lu_site_init_finish() and its key values are released in
+ * lu_context_key_degister().
+ */
+static struct lu_env lu_shrink_env;
+
+/**
+ * Argument bundle passed by lu_site_print() to lu_site_obj_print() through
+ * the hash iterator's opaque data pointer.
+ */
+struct lu_site_print_arg {
+	struct lu_env   *lsp_env;	/* environment forwarded to printer */
+	void	    *lsp_cookie;	/* opaque printer cookie */
+	lu_printer_t     lsp_printer;	/* printer callback */
+};
+
+/**
+ * Hash iteration callback of lu_site_print(): print one cached object.
+ *
+ * A header with layers is printed in full via lu_object_print(); a header
+ * without any layer gets only lu_object_header_print().
+ *
+ * \retval 0 always
+ */
+static int
+lu_site_obj_print(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+		  struct hlist_node *hnode, void *data)
+{
+	struct lu_site_print_arg *arg = data;
+	struct lu_object_header  *hdr;
+
+	hdr = hlist_entry(hnode, struct lu_object_header, loh_hash);
+	if (list_empty(&hdr->loh_layers)) {
+		lu_object_header_print(arg->lsp_env, arg->lsp_cookie,
+				       arg->lsp_printer, hdr);
+		return 0;
+	}
+
+	lu_object_print(arg->lsp_env, arg->lsp_cookie, arg->lsp_printer,
+			lu_object_top(hdr));
+	return 0;
+}
+
+/**
+ * Print all objects in \a s.
+ *
+ * Every entry of the site hash is passed to lu_site_obj_print() together
+ * with \a env, \a cookie and \a printer.
+ */
+void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie,
+		   lu_printer_t printer)
+{
+	struct lu_site_print_arg arg;
+
+	arg.lsp_env     = (struct lu_env *)env;
+	arg.lsp_cookie  = cookie;
+	arg.lsp_printer = printer;
+
+	cfs_hash_for_each(s->ls_obj_hash, lu_site_obj_print, &arg);
+}
+EXPORT_SYMBOL(lu_site_print);
+
+/**
+ * Bounds for the lu_cache_percent tunable; lu_htable_order() falls back to
+ * the default when the configured value is 0 or above the maximum.
+ */
+enum {
+	LU_CACHE_PERCENT_MAX     = 50,
+	LU_CACHE_PERCENT_DEFAULT = 20
+};
+
+/* NOTE(review): variable is unsigned int but the parameter type is "int". */
+static unsigned int lu_cache_percent = LU_CACHE_PERCENT_DEFAULT;
+CFS_MODULE_PARM(lu_cache_percent, "i", int, 0644,
+		"Percentage of memory to be used as lu_object cache");
+
+/**
+ * Return desired hash table order.
+ *
+ * The order is the smallest \c bits (at least 1) for which 2^bits is not
+ * below the estimated number of cached objects, derived from the number of
+ * physical pages and the lu_cache_percent tunable. An out-of-range
+ * lu_cache_percent is reset to LU_CACHE_PERCENT_DEFAULT with a warning.
+ */
+static int lu_htable_order(void)
+{
+	unsigned long cache_size;
+	int bits;
+
+	/*
+	 * Calculate hash table size, assuming that we want reasonable
+	 * performance when 20% of total memory is occupied by cache of
+	 * lu_objects.
+	 *
+	 * Size of lu_object is (arbitrary) taken as 1K (together with inode).
+	 */
+	cache_size = num_physpages;
+
+#if BITS_PER_LONG == 32
+	/*
+	 * limit hashtable size for lowmem systems to low RAM: 3/4 of the
+	 * pages in 1GB. The shift must be parenthesized, since `*' and `/'
+	 * bind tighter than `<<'.
+	 */
+	if (cache_size > 1 << (30 - PAGE_CACHE_SHIFT))
+		cache_size = (1 << (30 - PAGE_CACHE_SHIFT)) * 3 / 4;
+#endif
+
+	/* clear off unreasonable cache setting. */
+	if (lu_cache_percent == 0 || lu_cache_percent > LU_CACHE_PERCENT_MAX) {
+		CWARN("obdclass: invalid lu_cache_percent: %u, it must be in"
+		      " the range of (0, %u]. Will use default value: %u.\n",
+		      lu_cache_percent, LU_CACHE_PERCENT_MAX,
+		      LU_CACHE_PERCENT_DEFAULT);
+
+		lu_cache_percent = LU_CACHE_PERCENT_DEFAULT;
+	}
+	cache_size = cache_size / 100 * lu_cache_percent *
+		(PAGE_CACHE_SIZE / 1024);
+
+	/*
+	 * Shift an unsigned long so that the comparison with cache_size is
+	 * done in one type and no signed int shift can overflow.
+	 */
+	for (bits = 1; (1UL << bits) < cache_size; ++bits) {
+		;
+	}
+	return bits;
+}
+
+/**
+ * Hash function of the site object table (installed as hs_hash in
+ * lu_site_hash_ops). \a key points to a struct lu_fid; the result is masked
+ * with \a mask.
+ */
+static unsigned lu_obj_hop_hash(cfs_hash_t *hs,
+				const void *key, unsigned mask)
+{
+	struct lu_fid  *fid = (struct lu_fid *)key;
+	__u32	   hash;
+
+	hash = fid_flatten32(fid);
+	hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */
+	hash = cfs_hash_long(hash, hs->hs_bkt_bits);
+
+	/* give me another random factor */
+	hash -= cfs_hash_long((unsigned long)hs, fid_oid(fid) % 11 + 3);
+
+	/* upper bits come from the mixed hash, low bits from seq + oid */
+	hash <<= hs->hs_cur_bits - hs->hs_bkt_bits;
+	hash |= (fid_seq(fid) + fid_oid(fid)) & (CFS_HASH_NBKT(hs) - 1);
+
+	return hash & mask;
+}
+
+/**
+ * hs_object callback: map hash node \a hnode to its lu_object_header.
+ */
+static void *lu_obj_hop_object(struct hlist_node *hnode)
+{
+	struct lu_object_header *hdr;
+
+	hdr = hlist_entry(hnode, struct lu_object_header, loh_hash);
+	return hdr;
+}
+
+/**
+ * hs_key callback: the key of a hashed header is its fid (loh_fid).
+ */
+static void *lu_obj_hop_key(struct hlist_node *hnode)
+{
+	return &hlist_entry(hnode, struct lu_object_header, loh_hash)->loh_fid;
+}
+
+/**
+ * hs_keycmp callback: compare the fid \a key with the fid of the header that
+ * owns \a hnode. Returns the result of lu_fid_eq().
+ */
+static int lu_obj_hop_keycmp(const void *key, struct hlist_node *hnode)
+{
+	struct lu_object_header *hdr;
+	struct lu_fid		*fid = (struct lu_fid *)key;
+
+	hdr = hlist_entry(hnode, struct lu_object_header, loh_hash);
+	return lu_fid_eq(&hdr->loh_fid, fid);
+}
+
+/**
+ * hs_get callback: take a reference on the header owning \a hnode. When the
+ * reference count goes from 0 to 1 the busy counter of the object's bucket
+ * is incremented as well.
+ */
+static void lu_obj_hop_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct lu_site_bkt_data *bucket;
+	struct lu_object_header *hdr;
+	cfs_hash_bd_t		 bd;
+
+	hdr = hlist_entry(hnode, struct lu_object_header, loh_hash);
+	if (atomic_add_return(1, &hdr->loh_ref) != 1)
+		return;
+
+	/* first reference: the object becomes busy */
+	cfs_hash_bd_get(hs, &hdr->loh_fid, &bd);
+	bucket = cfs_hash_bd_extra_get(hs, &bd);
+	bucket->lsb_busy++;
+}
+
+/**
+ * hs_put_locked callback of lu_site_hash_ops. Reaching this function is
+ * treated as a bug.
+ */
+static void lu_obj_hop_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	LBUG(); /* this must never be called */
+}
+
+/**
+ * Operations of the per-site object hash table created in lu_site_init().
+ */
+cfs_hash_ops_t lu_site_hash_ops = {
+	.hs_hash       = lu_obj_hop_hash,
+	.hs_key        = lu_obj_hop_key,
+	.hs_keycmp     = lu_obj_hop_keycmp,
+	.hs_object     = lu_obj_hop_object,
+	.hs_get        = lu_obj_hop_get,
+	.hs_put_locked = lu_obj_hop_put_locked,
+};
+
+/**
+ * Link device \a d into the device list of site \a s (ls_ld_linkage) under
+ * ls_ld_lock. Nothing is done if \a d->ld_linkage is already on a list.
+ */
+void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d)
+{
+	spin_lock(&s->ls_ld_lock);
+	if (list_empty(&d->ld_linkage))
+		list_add(&d->ld_linkage, &s->ls_ld_linkage);
+	spin_unlock(&s->ls_ld_lock);
+}
+EXPORT_SYMBOL(lu_dev_add_linkage);
+
+/**
+ * Unlink device \a d from its device list while holding the ls_ld_lock of
+ * site \a s; \a d->ld_linkage is re-initialized as an empty list.
+ */
+void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d)
+{
+	spin_lock(&s->ls_ld_lock);
+	list_del_init(&d->ld_linkage);
+	spin_unlock(&s->ls_ld_lock);
+}
+EXPORT_SYMBOL(lu_dev_del_linkage);
+
+/* Lower and upper bound for the order of the site hash table. */
+#define LU_SITE_BITS_MIN    12
+#define LU_SITE_BITS_MAX    24
+/**
+ * total 256 buckets, we don't want too many buckets because:
+ * - consume too much memory
+ * - avoid unbalanced LRU list
+ */
+#define LU_SITE_BKT_BITS    8
+
+/**
+ * Initialize site \a s, with \a top as the top level device.
+ *
+ * The object hash is created with the order suggested by lu_htable_order(),
+ * clamped to [LU_SITE_BITS_MIN, LU_SITE_BITS_MAX]; if creation fails, smaller
+ * orders are tried down to LU_SITE_BITS_MIN. Per-bucket data and the
+ * statistics counters are set up, a reference is taken on \a top, and \a top
+ * is linked into the site's device list.
+ *
+ * \retval 0       success
+ * \retval -ENOMEM the hash table or the statistics could not be allocated
+ */
+int lu_site_init(struct lu_site *s, struct lu_device *top)
+{
+	struct lu_site_bkt_data *bkt;
+	cfs_hash_bd_t bd;
+	char name[16];
+	int bits;
+	int i;
+	ENTRY;
+
+	memset(s, 0, sizeof *s);
+	bits = lu_htable_order();
+	snprintf(name, sizeof(name), "lu_site_%s", top->ld_type->ldt_name);
+	for (bits = min(max(LU_SITE_BITS_MIN, bits), LU_SITE_BITS_MAX);
+	     bits >= LU_SITE_BITS_MIN; bits--) {
+		s->ls_obj_hash = cfs_hash_create(name, bits, bits,
+						 bits - LU_SITE_BKT_BITS,
+						 sizeof(*bkt), 0, 0,
+						 &lu_site_hash_ops,
+						 CFS_HASH_SPIN_BKTLOCK |
+						 CFS_HASH_NO_ITEMREF |
+						 CFS_HASH_DEPTH |
+						 CFS_HASH_ASSERT_EMPTY);
+		if (s->ls_obj_hash != NULL)
+			break;
+	}
+
+	if (s->ls_obj_hash == NULL) {
+		CERROR("failed to create lu_site hash with bits: %d\n", bits);
+		/* leave through RETURN() to pair with ENTRY above */
+		RETURN(-ENOMEM);
+	}
+
+	cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) {
+		bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd);
+		INIT_LIST_HEAD(&bkt->lsb_lru);
+		init_waitqueue_head(&bkt->lsb_marche_funebre);
+	}
+
+	s->ls_stats = lprocfs_alloc_stats(LU_SS_LAST_STAT, 0);
+	if (s->ls_stats == NULL) {
+		cfs_hash_putref(s->ls_obj_hash);
+		s->ls_obj_hash = NULL;
+		RETURN(-ENOMEM);
+	}
+
+	lprocfs_counter_init(s->ls_stats, LU_SS_CREATED,
+			     0, "created", "created");
+	lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_HIT,
+			     0, "cache_hit", "cache_hit");
+	lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_MISS,
+			     0, "cache_miss", "cache_miss");
+	lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_RACE,
+			     0, "cache_race", "cache_race");
+	lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_DEATH_RACE,
+			     0, "cache_death_race", "cache_death_race");
+	lprocfs_counter_init(s->ls_stats, LU_SS_LRU_PURGED,
+			     0, "lru_purged", "lru_purged");
+
+	INIT_LIST_HEAD(&s->ls_linkage);
+	s->ls_top_dev = top;
+	top->ld_site = s;
+	lu_device_get(top);
+	lu_ref_add(&top->ld_reference, "site-top", s);
+
+	INIT_LIST_HEAD(&s->ls_ld_linkage);
+	spin_lock_init(&s->ls_ld_lock);
+
+	lu_dev_add_linkage(s, top);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(lu_site_init);
+
+/**
+ * Finalize \a s and release its resources.
+ *
+ * The site is unlinked from the global site list, then its object hash, the
+ * reference on its top device and its statistics are released, each only if
+ * present.
+ */
+void lu_site_fini(struct lu_site *s)
+{
+	struct lu_device *top;
+
+	mutex_lock(&lu_sites_guard);
+	list_del_init(&s->ls_linkage);
+	mutex_unlock(&lu_sites_guard);
+
+	if (s->ls_obj_hash != NULL) {
+		cfs_hash_putref(s->ls_obj_hash);
+		s->ls_obj_hash = NULL;
+	}
+
+	top = s->ls_top_dev;
+	if (top != NULL) {
+		top->ld_site = NULL;
+		lu_ref_del(&top->ld_reference, "site-top", s);
+		lu_device_put(top);
+		s->ls_top_dev = NULL;
+	}
+
+	if (s->ls_stats != NULL)
+		lprocfs_free_stats(&s->ls_stats);
+}
+EXPORT_SYMBOL(lu_site_fini);
+
+/**
+ * Called when initialization of stack for this site is completed.
+ *
+ * Refills the context of the shrinker environment and, if that succeeds,
+ * adds \a s to the global list of sites. Both steps run under
+ * lu_sites_guard.
+ *
+ * \retval 0     success
+ * \retval other error returned by lu_context_refill()
+ */
+int lu_site_init_finish(struct lu_site *s)
+{
+	int rc;
+
+	mutex_lock(&lu_sites_guard);
+	rc = lu_context_refill(&lu_shrink_env.le_ctx);
+	if (rc == 0)
+		list_add(&s->ls_linkage, &lu_sites);
+	mutex_unlock(&lu_sites_guard);
+
+	return rc;
+}
+EXPORT_SYMBOL(lu_site_init_finish);
+
+/**
+ * Acquire additional reference on device \a d (atomically increments
+ * ld_ref).
+ */
+void lu_device_get(struct lu_device *d)
+{
+	atomic_inc(&d->ld_ref);
+}
+EXPORT_SYMBOL(lu_device_get);
+
+/**
+ * Release reference on device \a d.
+ *
+ * Asserts that a reference is held. The device is not freed here, whatever
+ * the resulting count.
+ */
+void lu_device_put(struct lu_device *d)
+{
+	LASSERT(atomic_read(&d->ld_ref) > 0);
+	atomic_dec(&d->ld_ref);
+}
+EXPORT_SYMBOL(lu_device_put);
+
+/**
+ * Initialize device \a d of type \a t.
+ *
+ * Increments the device count of \a t; for the first device of the type the
+ * optional ldto_start() method is called. \a d is zeroed and starts with a
+ * reference count of 0.
+ *
+ * \retval 0 always
+ */
+int lu_device_init(struct lu_device *d, struct lu_device_type *t)
+{
+	/* first device of this type: start the type */
+	if (t->ldt_device_nr++ == 0 && t->ldt_ops->ldto_start != NULL)
+		t->ldt_ops->ldto_start(t);
+	memset(d, 0, sizeof *d);
+	atomic_set(&d->ld_ref, 0);
+	d->ld_type = t;
+	lu_ref_init(&d->ld_reference);
+	INIT_LIST_HEAD(&d->ld_linkage);
+	return 0;
+}
+EXPORT_SYMBOL(lu_device_init);
+
+/**
+ * Finalize device \a d.
+ *
+ * Breaks the link with the obd device (if any), asserts that no references
+ * remain, and decrements the device count of the type; when the last device
+ * of the type goes away the optional ldto_stop() method is called.
+ */
+void lu_device_fini(struct lu_device *d)
+{
+	struct lu_device_type *type = d->ld_type;
+
+	if (d->ld_obd != NULL) {
+		d->ld_obd->obd_lu_dev = NULL;
+		d->ld_obd = NULL;
+	}
+
+	lu_ref_fini(&d->ld_reference);
+	LASSERTF(atomic_read(&d->ld_ref) == 0,
+		 "Refcount is %u\n", atomic_read(&d->ld_ref));
+	LASSERT(type->ldt_device_nr > 0);
+
+	/* last device of this type: stop the type */
+	if (--type->ldt_device_nr == 0 && type->ldt_ops->ldto_stop != NULL)
+		type->ldt_ops->ldto_stop(type);
+}
+EXPORT_SYMBOL(lu_device_fini);
+
+/**
+ * Initialize object \a o that is part of compound object \a h and was created
+ * by device \a d.
+ *
+ * \a o is zeroed, bound to \a h and \a d, and a reference on \a d is taken
+ * on behalf of the object.
+ *
+ * \retval 0 always
+ */
+int lu_object_init(struct lu_object *o,
+		   struct lu_object_header *h, struct lu_device *d)
+{
+	memset(o, 0, sizeof *o);
+	o->lo_header = h;
+	o->lo_dev    = d;
+	lu_device_get(d);
+	/* remember the lu_ref handle so lu_object_fini() can drop it */
+	o->lo_dev_ref = lu_ref_add(&d->ld_reference, "lu_object", o);
+	INIT_LIST_HEAD(&o->lo_linkage);
+	return 0;
+}
+EXPORT_SYMBOL(lu_object_init);
+
+/**
+ * Finalize object and release its resources.
+ *
+ * The object must already be unlinked from its layer list. If it is still
+ * bound to a device, the device reference taken in lu_object_init() is
+ * dropped and the binding cleared.
+ */
+void lu_object_fini(struct lu_object *o)
+{
+	struct lu_device *dev = o->lo_dev;
+
+	LASSERT(list_empty(&o->lo_linkage));
+
+	if (dev == NULL)
+		return;
+
+	lu_ref_del_at(&dev->ld_reference, o->lo_dev_ref, "lu_object", o);
+	lu_device_put(dev);
+	o->lo_dev = NULL;
+}
+EXPORT_SYMBOL(lu_object_fini);
+
+/**
+ * Add object \a o as first layer of compound object \a h
+ *
+ * This is typically called by the ->ldo_object_alloc() method of top-level
+ * device.
+ *
+ * list_move() is used, so \a o is first unlinked from whatever list it is
+ * currently on.
+ */
+void lu_object_add_top(struct lu_object_header *h, struct lu_object *o)
+{
+	list_move(&o->lo_linkage, &h->loh_layers);
+}
+EXPORT_SYMBOL(lu_object_add_top);
+
+/**
+ * Add object \a o as a layer of compound object, going after \a before.
+ *
+ * This is typically called by the ->ldo_object_alloc() method of \a
+ * before->lo_dev.
+ *
+ * list_move() is used, so \a o is first unlinked from whatever list it is
+ * currently on.
+ */
+void lu_object_add(struct lu_object *before, struct lu_object *o)
+{
+	list_move(&o->lo_linkage, &before->lo_linkage);
+}
+EXPORT_SYMBOL(lu_object_add);
+
+/**
+ * Initialize compound object.
+ *
+ * The header is zeroed, starts with a reference count of 1, is unhashed and
+ * has empty LRU and layer lists.
+ *
+ * \retval 0 always
+ */
+int lu_object_header_init(struct lu_object_header *h)
+{
+	memset(h, 0, sizeof *h);
+	atomic_set(&h->loh_ref, 1);
+	INIT_HLIST_NODE(&h->loh_hash);
+	INIT_LIST_HEAD(&h->loh_lru);
+	INIT_LIST_HEAD(&h->loh_layers);
+	lu_ref_init(&h->loh_reference);
+	return 0;
+}
+EXPORT_SYMBOL(lu_object_header_init);
+
+/**
+ * Finalize compound object.
+ *
+ * Asserts that the header has no layers left, is not on an LRU list and is
+ * not hashed.
+ */
+void lu_object_header_fini(struct lu_object_header *h)
+{
+	LASSERT(list_empty(&h->loh_layers));
+	LASSERT(list_empty(&h->loh_lru));
+	LASSERT(hlist_unhashed(&h->loh_hash));
+	lu_ref_fini(&h->loh_reference);
+}
+EXPORT_SYMBOL(lu_object_header_fini);
+
+/**
+ * Given a compound object, find its slice, corresponding to the device type
+ * \a dtype.
+ *
+ * \retval slice first layer whose device has type \a dtype
+ * \retval NULL  no layer of that type exists
+ */
+struct lu_object *lu_object_locate(struct lu_object_header *h,
+				   const struct lu_device_type *dtype)
+{
+	struct lu_object *layer;
+
+	list_for_each_entry(layer, &h->loh_layers, lo_linkage)
+		if (layer->lo_dev->ld_type == dtype)
+			return layer;
+
+	return NULL;
+}
+EXPORT_SYMBOL(lu_object_locate);
+
+
+
+/**
+ * Finalize and free devices in the device stack.
+ *
+ * Finalize device stack by purging object cache, and calling
+ * lu_device_type_operations::ldto_device_fini() and
+ * lu_device_type_operations::ldto_device_free() on all devices in the stack.
+ *
+ * Each of the two methods returns the next device to process; a NULL return
+ * ends the walk.
+ */
+void lu_stack_fini(const struct lu_env *env, struct lu_device *top)
+{
+	struct lu_site   *site = top->ld_site;
+	struct lu_device *cur;
+	struct lu_device *below;
+
+	lu_site_purge(env, site, ~0);
+
+	/* first pass: finalize every device and drop its stack reference */
+	cur = top;
+	while (cur != NULL) {
+		below = cur->ld_type->ldt_ops->ldto_device_fini(env, cur);
+		lu_ref_del(&cur->ld_reference, "lu-stack", &lu_site_init);
+		lu_device_put(cur);
+		cur = below;
+	}
+
+	/* purge again. */
+	lu_site_purge(env, site, ~0);
+
+	/* second pass: free every device and release its obd type */
+	cur = top;
+	while (cur != NULL) {
+		/* fetch the type first: cur is freed by ldto_device_free() */
+		const struct lu_device_type *ldt = cur->ld_type;
+		struct obd_type	     *type;
+
+		below = ldt->ldt_ops->ldto_device_free(env, cur);
+		type = ldt->ldt_obd_type;
+		if (type != NULL) {
+			type->typ_refcnt--;
+			class_put_type(type);
+		}
+		cur = below;
+	}
+}
+EXPORT_SYMBOL(lu_stack_fini);
+
+enum {
+	/**
+	 * Maximal number of tld slots.
+	 */
+	LU_CONTEXT_KEY_NR = 40
+};
+
+/**
+ * Table of registered context keys, indexed by lu_context_key::lct_index.
+ * A NULL slot is free.
+ */
+static struct lu_context_key *lu_keys[LU_CONTEXT_KEY_NR] = { NULL, };
+
+/**
+ * Held while lu_keys[] slots are assigned or cleared and while the list of
+ * remembered contexts is modified or walked.
+ */
+static DEFINE_SPINLOCK(lu_keys_guard);
+
+/**
+ * Global counter incremented whenever key is registered, unregistered,
+ * revived or quiesced. This is used to avoid unnecessary calls to
+ * lu_context_refill(). No locking is provided, as initialization and shutdown
+ * are supposed to be externally serialized.
+ */
+static unsigned key_set_version = 0;
+
+/**
+ * Register new key.
+ *
+ * The key is stored in the first free slot of lu_keys[]; its lct_index is
+ * set to that slot and its use count to 1.
+ *
+ * \retval 0       key registered
+ * \retval -ENFILE all LU_CONTEXT_KEY_NR slots are taken
+ */
+int lu_context_key_register(struct lu_context_key *key)
+{
+	int rc = -ENFILE;
+	int slot;
+
+	LASSERT(key->lct_init != NULL);
+	LASSERT(key->lct_fini != NULL);
+	LASSERT(key->lct_tags != 0);
+	LASSERT(key->lct_owner != NULL);
+
+	spin_lock(&lu_keys_guard);
+	for (slot = 0; slot < ARRAY_SIZE(lu_keys); ++slot) {
+		if (lu_keys[slot] != NULL)
+			continue;
+
+		/* free slot found: claim it */
+		key->lct_index = slot;
+		atomic_set(&key->lct_used, 1);
+		lu_keys[slot] = key;
+		lu_ref_init(&key->lct_reference);
+		rc = 0;
+		++key_set_version;
+		break;
+	}
+	spin_unlock(&lu_keys_guard);
+	return rc;
+}
+EXPORT_SYMBOL(lu_context_key_register);
+
+/**
+ * Destroy the value stored in slot \a index of context \a ctx, if any.
+ *
+ * The key's lct_fini() method is called on the value, the key's use count is
+ * decremented and, unless the context has LCT_NOREF set, the reference on
+ * the module owning the key is dropped.
+ */
+static void key_fini(struct lu_context *ctx, int index)
+{
+	struct lu_context_key *key;
+
+	if (ctx->lc_value == NULL || ctx->lc_value[index] == NULL)
+		return;
+
+	key = lu_keys[index];
+	LASSERT(key != NULL);
+	LASSERT(key->lct_fini != NULL);
+	LASSERT(atomic_read(&key->lct_used) > 1);
+
+	key->lct_fini(ctx, key, ctx->lc_value[index]);
+	lu_ref_del(&key->lct_reference, "ctx", ctx);
+	atomic_dec(&key->lct_used);
+
+	LASSERT(key->lct_owner != NULL);
+	if ((ctx->lc_tags & LCT_NOREF) == 0) {
+		LINVRNT(module_refcount(key->lct_owner) > 0);
+		module_put(key->lct_owner);
+	}
+	ctx->lc_value[index] = NULL;
+}
+
+/**
+ * Deregister key.
+ *
+ * The key is quiesced first (its values in all remembered contexts are
+ * destroyed), then its value in the shrinker environment is destroyed and
+ * its lu_keys[] slot is cleared. On return the key must have no remaining
+ * instances (use count back to 1), which is asserted.
+ */
+void lu_context_key_degister(struct lu_context_key *key)
+{
+	LASSERT(atomic_read(&key->lct_used) >= 1);
+	LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys));
+
+	lu_context_key_quiesce(key);
+
+	/* bumped outside lu_keys_guard; see the note on key_set_version */
+	++key_set_version;
+	spin_lock(&lu_keys_guard);
+	key_fini(&lu_shrink_env.le_ctx, key->lct_index);
+	if (lu_keys[key->lct_index]) {
+		lu_keys[key->lct_index] = NULL;
+		lu_ref_fini(&key->lct_reference);
+	}
+	spin_unlock(&lu_keys_guard);
+
+	LASSERTF(atomic_read(&key->lct_used) == 1,
+		 "key has instances: %d\n",
+		 atomic_read(&key->lct_used));
+}
+EXPORT_SYMBOL(lu_context_key_degister);
+
+/**
+ * Register a number of keys. This has to be called after all keys have been
+ * initialized by a call to LU_CONTEXT_KEY_INIT().
+ *
+ * The argument list must be terminated by a NULL pointer. If registration of
+ * some key fails, the keys registered by this call before it are
+ * deregistered again.
+ *
+ * \retval 0     all keys registered
+ * \retval other error returned by lu_context_key_register()
+ */
+int lu_context_key_register_many(struct lu_context_key *k, ...)
+{
+	struct lu_context_key *key = k;
+	va_list args;
+	int result;
+
+	va_start(args, k);
+	do {
+		result = lu_context_key_register(key);
+		if (result)
+			break;
+		key = va_arg(args, struct lu_context_key *);
+	} while (key != NULL);
+	va_end(args);
+
+	if (result != 0) {
+		/*
+		 * Roll back: walk the arguments again and deregister every
+		 * key that precedes the one that failed.
+		 */
+		va_start(args, k);
+		while (k != key) {
+			lu_context_key_degister(k);
+			k = va_arg(args, struct lu_context_key *);
+		}
+		va_end(args);
+	}
+
+	return result;
+}
+EXPORT_SYMBOL(lu_context_key_register_many);
+
+/**
+ * De-register a number of keys. This is a dual to
+ * lu_context_key_register_many().
+ *
+ * The argument list must be terminated by a NULL pointer.
+ */
+void lu_context_key_degister_many(struct lu_context_key *k, ...)
+{
+	va_list args;
+
+	va_start(args, k);
+	for (;;) {
+		lu_context_key_degister(k);
+		k = va_arg(args, struct lu_context_key *);
+		if (k == NULL)
+			break;
+	}
+	va_end(args);
+}
+EXPORT_SYMBOL(lu_context_key_degister_many);
+
+/**
+ * Revive a number of keys.
+ *
+ * The argument list must be terminated by a NULL pointer.
+ */
+void lu_context_key_revive_many(struct lu_context_key *k, ...)
+{
+	va_list args;
+
+	va_start(args, k);
+	for (;;) {
+		lu_context_key_revive(k);
+		k = va_arg(args, struct lu_context_key *);
+		if (k == NULL)
+			break;
+	}
+	va_end(args);
+}
+EXPORT_SYMBOL(lu_context_key_revive_many);
+
+/**
+ * Quiesce a number of keys.
+ *
+ * The argument list must be terminated by a NULL pointer.
+ */
+void lu_context_key_quiesce_many(struct lu_context_key *k, ...)
+{
+	va_list args;
+
+	va_start(args, k);
+	for (;;) {
+		lu_context_key_quiesce(k);
+		k = va_arg(args, struct lu_context_key *);
+		if (k == NULL)
+			break;
+	}
+	va_end(args);
+}
+EXPORT_SYMBOL(lu_context_key_quiesce_many);
+
+/**
+ * Return value associated with key \a key in context \a ctx.
+ *
+ * The context is expected to be in the LCS_ENTERED state and \a key must be
+ * the key currently registered at its lct_index. The returned pointer is
+ * whatever is stored in that slot of ctx->lc_value[].
+ */
+void *lu_context_key_get(const struct lu_context *ctx,
+			 const struct lu_context_key *key)
+{
+	LINVRNT(ctx->lc_state == LCS_ENTERED);
+	LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys));
+	LASSERT(lu_keys[key->lct_index] == key);
+	return ctx->lc_value[key->lct_index];
+}
+EXPORT_SYMBOL(lu_context_key_get);
+
+/**
+ * List of remembered contexts: contexts initialized with the LCT_REMEMBER
+ * tag are linked here through lu_context::lc_remember by lu_context_init()
+ * and unlinked by lu_context_fini(). lu_context_key_quiesce() walks the list
+ * to destroy a key's values. The list is accessed under lu_keys_guard.
+ */
+static LIST_HEAD(lu_context_remembered);
+
+/**
+ * Destroy \a key in all remembered contexts. This is used to destroy key
+ * values in "shared" contexts (like service threads), when a module owning
+ * the key is about to be unloaded.
+ *
+ * A key that is already marked LCT_QUIESCENT is left alone.
+ */
+void lu_context_key_quiesce(struct lu_context_key *key)
+{
+	struct lu_context *ctx;
+
+	if (key->lct_tags & LCT_QUIESCENT)
+		return;
+
+	/*
+	 * XXX layering violation.
+	 */
+	key->lct_tags |= LCT_QUIESCENT;
+	/*
+	 * XXX memory barrier has to go here.
+	 */
+	spin_lock(&lu_keys_guard);
+	list_for_each_entry(ctx, &lu_context_remembered, lc_remember)
+		key_fini(ctx, key->lct_index);
+	spin_unlock(&lu_keys_guard);
+	++key_set_version;
+}
+EXPORT_SYMBOL(lu_context_key_quiesce);
+
+/**
+ * Undo lu_context_key_quiesce(): clear LCT_QUIESCENT on \a key, so that
+ * keys_fill() creates values for it again, and bump key_set_version.
+ */
+void lu_context_key_revive(struct lu_context_key *key)
+{
+	key->lct_tags &= ~LCT_QUIESCENT;
+	++key_set_version;
+}
+EXPORT_SYMBOL(lu_context_key_revive);
+
+/**
+ * Destroy all key values of context \a ctx and free its value array.
+ * Does nothing if the array was never allocated.
+ */
+static void keys_fini(struct lu_context *ctx)
+{
+	int	i;
+
+	if (ctx->lc_value == NULL)
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(lu_keys); ++i)
+		key_fini(ctx, i);
+
+	OBD_FREE(ctx->lc_value, ARRAY_SIZE(lu_keys) * sizeof ctx->lc_value[0]);
+	ctx->lc_value = NULL;
+}
+
+/**
+ * Create values in \a ctx for every registered key that has no value yet,
+ * shares at least one tag with the context and is not quiescent.
+ *
+ * \retval 0        success; ctx->lc_version equals key_set_version
+ * \retval negative error extracted from the ERR_PTR returned by a key's
+ *                  lct_init() method; values created so far are kept
+ */
+static int keys_fill(struct lu_context *ctx)
+{
+	int i;
+
+	LINVRNT(ctx->lc_value != NULL);
+	for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) {
+		struct lu_context_key *key;
+
+		key = lu_keys[i];
+		if (ctx->lc_value[i] == NULL && key != NULL &&
+		    (key->lct_tags & ctx->lc_tags) &&
+		    /*
+		     * Don't create values for a LCT_QUIESCENT key, as this
+		     * will pin module owning a key.
+		     */
+		    !(key->lct_tags & LCT_QUIESCENT)) {
+			void *value;
+
+			LINVRNT(key->lct_init != NULL);
+			LINVRNT(key->lct_index == i);
+
+			value = key->lct_init(ctx, key);
+			if (unlikely(IS_ERR(value)))
+				return PTR_ERR(value);
+
+			LASSERT(key->lct_owner != NULL);
+			/* NOTE(review): result of try_module_get() is ignored */
+			if (!(ctx->lc_tags & LCT_NOREF))
+				try_module_get(key->lct_owner);
+			lu_ref_add_atomic(&key->lct_reference, "ctx", ctx);
+			atomic_inc(&key->lct_used);
+			/*
+			 * This is the only place in the code, where an
+			 * element of ctx->lc_value[] array is set to non-NULL
+			 * value.
+			 */
+			ctx->lc_value[i] = value;
+			if (key->lct_exit != NULL)
+				ctx->lc_tags |= LCT_HAS_EXIT;
+		}
+		/* re-assigned on every iteration */
+		ctx->lc_version = key_set_version;
+	}
+	return 0;
+}
+
+/**
+ * Allocate the value array of context \a ctx (one slot per possible key) and
+ * fill it via keys_fill().
+ *
+ * \retval -ENOMEM the array could not be allocated
+ * \retval other   result of keys_fill()
+ */
+static int keys_init(struct lu_context *ctx)
+{
+	OBD_ALLOC(ctx->lc_value, ARRAY_SIZE(lu_keys) * sizeof ctx->lc_value[0]);
+	if (likely(ctx->lc_value != NULL))
+		return keys_fill(ctx);
+
+	return -ENOMEM;
+}
+
+/**
+ * Initialize context data-structure. Create values for all keys.
+ *
+ * A context created with LCT_REMEMBER in \a tags is added to the list of
+ * remembered contexts. If key values cannot be created the context is
+ * finalized again before returning.
+ *
+ * \retval 0     success
+ * \retval other error returned by keys_init()
+ */
+int lu_context_init(struct lu_context *ctx, __u32 tags)
+{
+	int rc;
+
+	memset(ctx, 0, sizeof *ctx);
+	ctx->lc_state = LCS_INITIALIZED;
+	ctx->lc_tags = tags;
+
+	if (!(tags & LCT_REMEMBER)) {
+		INIT_LIST_HEAD(&ctx->lc_remember);
+	} else {
+		spin_lock(&lu_keys_guard);
+		list_add(&ctx->lc_remember, &lu_context_remembered);
+		spin_unlock(&lu_keys_guard);
+	}
+
+	rc = keys_init(ctx);
+	if (rc != 0)
+		lu_context_fini(ctx);
+
+	return rc;
+}
+EXPORT_SYMBOL(lu_context_init);
+
+/**
+ * Finalize context data-structure. Destroy key values.
+ *
+ * For a remembered context (LCT_REMEMBER) the values are destroyed and the
+ * context is unlinked under lu_keys_guard; other contexts are not on the
+ * remembered list and are torn down without the lock.
+ */
+void lu_context_fini(struct lu_context *ctx)
+{
+	LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT);
+	ctx->lc_state = LCS_FINALIZED;
+
+	if (ctx->lc_tags & LCT_REMEMBER) {
+		/* could race with key degister */
+		spin_lock(&lu_keys_guard);
+		keys_fini(ctx);
+		list_del_init(&ctx->lc_remember);
+		spin_unlock(&lu_keys_guard);
+	} else {
+		LASSERT(list_empty(&ctx->lc_remember));
+		keys_fini(ctx);
+	}
+}
+EXPORT_SYMBOL(lu_context_fini);
+
+/**
+ * Called before entering context.
+ *
+ * Moves \a ctx from the LCS_INITIALIZED or LCS_LEFT state to LCS_ENTERED.
+ */
+void lu_context_enter(struct lu_context *ctx)
+{
+	LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT);
+	ctx->lc_state = LCS_ENTERED;
+}
+EXPORT_SYMBOL(lu_context_enter);
+
+/**
+ * Called after exiting from \a ctx
+ *
+ * Moves the context to the LCS_LEFT state and, if any key with an
+ * lct_exit() method has a value in it (LCT_HAS_EXIT), calls that method for
+ * every such value.
+ */
+void lu_context_exit(struct lu_context *ctx)
+{
+	struct lu_context_key *key;
+	int slot;
+
+	LINVRNT(ctx->lc_state == LCS_ENTERED);
+	ctx->lc_state = LCS_LEFT;
+
+	if (!(ctx->lc_tags & LCT_HAS_EXIT) || ctx->lc_value == NULL)
+		return;
+
+	for (slot = 0; slot < ARRAY_SIZE(lu_keys); ++slot) {
+		if (ctx->lc_value[slot] == NULL)
+			continue;
+
+		key = lu_keys[slot];
+		LASSERT(key != NULL);
+		if (key->lct_exit != NULL)
+			key->lct_exit(ctx, key, ctx->lc_value[slot]);
+	}
+}
+EXPORT_SYMBOL(lu_context_exit);
+
+/**
+ * Allocate for context all missing keys that were registered after context
+ * creation. key_set_version is only changed in rare cases when modules
+ * are loaded and removed.
+ *
+ * \retval 0     the context is already up to date, or was refilled
+ * \retval other error returned by keys_fill()
+ */
+int lu_context_refill(struct lu_context *ctx)
+{
+	if (likely(ctx->lc_version == key_set_version))
+		return 0;
+
+	return keys_fill(ctx);
+}
+EXPORT_SYMBOL(lu_context_refill);
+
+/**
+ * lu_context_tags_default/lu_session_tags_default will be updated if new
+ * types of obd are added. Currently, this is only used on the client side,
+ * specifically for the echo device client; for other stacks (like ptlrpc
+ * threads), contexts are predefined when the lu_device types are registered,
+ * during the module probe phase.
+ */
+__u32 lu_context_tags_default = 0;
+__u32 lu_session_tags_default = 0;
+
+/**
+ * Add \a tags to lu_context_tags_default and bump key_set_version, both
+ * under lu_keys_guard.
+ */
+void lu_context_tags_update(__u32 tags)
+{
+	spin_lock(&lu_keys_guard);
+	lu_context_tags_default |= tags;
+	key_set_version++;
+	spin_unlock(&lu_keys_guard);
+}
+EXPORT_SYMBOL(lu_context_tags_update);
+
+/**
+ * Remove \a tags from lu_context_tags_default and bump key_set_version, both
+ * under lu_keys_guard.
+ */
+void lu_context_tags_clear(__u32 tags)
+{
+	spin_lock(&lu_keys_guard);
+	lu_context_tags_default &= ~tags;
+	key_set_version++;
+	spin_unlock(&lu_keys_guard);
+}
+EXPORT_SYMBOL(lu_context_tags_clear);
+
+/**
+ * Add \a tags to lu_session_tags_default and bump key_set_version, both
+ * under lu_keys_guard.
+ */
+void lu_session_tags_update(__u32 tags)
+{
+	spin_lock(&lu_keys_guard);
+	lu_session_tags_default |= tags;
+	key_set_version++;
+	spin_unlock(&lu_keys_guard);
+}
+EXPORT_SYMBOL(lu_session_tags_update);
+
+/**
+ * Remove \a tags from lu_session_tags_default and bump key_set_version, both
+ * under lu_keys_guard.
+ */
+void lu_session_tags_clear(__u32 tags)
+{
+	spin_lock(&lu_keys_guard);
+	lu_session_tags_default &= ~tags;
+	key_set_version++;
+	spin_unlock(&lu_keys_guard);
+}
+EXPORT_SYMBOL(lu_session_tags_clear);
+
+/**
+ * Initialize environment \a env: no session, and a context created with
+ * \a tags which is entered immediately on success.
+ *
+ * \retval 0     success
+ * \retval other error returned by lu_context_init()
+ */
+int lu_env_init(struct lu_env *env, __u32 tags)
+{
+	int rc;
+
+	env->le_ses = NULL;
+	rc = lu_context_init(&env->le_ctx, tags);
+	if (unlikely(rc != 0))
+		return rc;
+
+	lu_context_enter(&env->le_ctx);
+	return rc;
+}
+EXPORT_SYMBOL(lu_env_init);
+
+/**
+ * Finalize environment \a env: exit and finalize its context and clear the
+ * session pointer. The session context itself is not finalized here.
+ */
+void lu_env_fini(struct lu_env *env)
+{
+	lu_context_exit(&env->le_ctx);
+	lu_context_fini(&env->le_ctx);
+	env->le_ses = NULL;
+}
+EXPORT_SYMBOL(lu_env_fini);
+
+int lu_env_refill(struct lu_env *env)
+{
+	int rc = lu_context_refill(&env->le_ctx);
+
+	/* The session, when present, is refilled only after le_ctx succeeds. */
+	if (rc != 0 || env->le_ses == NULL)
+		return rc;
+	return lu_context_refill(env->le_ses);
+}
+EXPORT_SYMBOL(lu_env_refill);
+
+/**
+ * Currently, this API is only used by the echo client. The echo client and
+ * the normal lustre client share the same cl_env cache, so the echo client
+ * needs to refresh the env context after it gets one from the cache,
+ * especially when normal client and echo client co-exist in the same client.
+ * Tags missing from le_ctx / le_ses are added before lu_env_refill() is run.
+ */
+int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, __u32 stags)
+{
+	struct lu_context *ctx = &env->le_ctx;
+	struct lu_context *ses = env->le_ses;
+
+	/* lc_version is zeroed with each tag change, presumably so that
+	 * lu_context_refill() sees a version mismatch - TODO confirm. */
+	if ((ctx->lc_tags & ctags) != ctags) {
+		ctx->lc_version = 0;
+		ctx->lc_tags |= ctags;
+	}
+
+	if (ses != NULL && (ses->lc_tags & stags) != stags) {
+		ses->lc_version = 0;
+		ses->lc_tags |= stags;
+	}
+
+	return lu_env_refill(env);
+}
+EXPORT_SYMBOL(lu_env_refill_by_tags);
+
+static struct shrinker *lu_site_shrinker = NULL;
+
+typedef struct lu_site_stats{
+	unsigned	lss_populated;	/* number of non-empty hash chains */
+	unsigned	lss_max_search;	/* max cfs_hash_bd_depmax_get() seen */
+	unsigned	lss_total;	/* sum of cfs_hash_bd_count_get() */
+	unsigned	lss_busy;	/* sum of per-bucket lsb_busy */
+} lu_site_stats_t;
+
+static void lu_site_stats_get(cfs_hash_t *hs,
+			      lu_site_stats_t *stats, int populated)
+{	/* adds to *stats; both callers in this file zero it first */
+	cfs_hash_bd_t bd;
+	int	   i;
+
+	cfs_hash_for_each_bucket(hs, &bd, i) {
+		struct lu_site_bkt_data *bkt = cfs_hash_bd_extra_get(hs, &bd);
+		struct hlist_head	*hhead;
+
+		cfs_hash_bd_lock(hs, &bd, 1);	/* counters read under bd lock */
+		stats->lss_busy  += bkt->lsb_busy;
+		stats->lss_total += cfs_hash_bd_count_get(&bd);
+		stats->lss_max_search = max((int)stats->lss_max_search,
+					    cfs_hash_bd_depmax_get(&bd));
+		if (!populated) {	/* caller skips the per-chain scan */
+			cfs_hash_bd_unlock(hs, &bd, 1);
+			continue;
+		}
+
+		cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
+			if (!hlist_empty(hhead))
+				stats->lss_populated++;
+		}
+		cfs_hash_bd_unlock(hs, &bd, 1);
+	}
+}
+
+
+/*
+ * Shrinker callback for the lu_site object caches.
+ *
+ * There exists a potential lock inversion deadlock scenario when using
+ * Lustre on top of ZFS, between one of ZFS's buf_hash_table.ht_lock's and
+ * Lustre's lu_sites_guard lock: thread A takes lu_sites_guard and sleeps on
+ * the ht_lock while thread B takes the ht_lock and sleeps on lu_sites_guard,
+ * so neither thread ever wakes up to drop its lock.
+ *
+ * To prevent this we must ensure lu_sites_guard is not taken while down this
+ * code path. ZFS reliably does not set the __GFP_FS bit in its code paths,
+ * so this can be used to determine if it is safe to take lu_sites_guard.
+ *
+ * Ideally we should accurately return the remaining number of cached
+ * objects without taking the lu_sites_guard lock, but this is not possible
+ * in the current implementation.
+ * Returns -1 or 0 when __GFP_FS is clear, else the pressure-scaled idle count.
+ */
+static int lu_cache_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
+{
+	lu_site_stats_t stats;
+	struct lu_site *s;
+	struct lu_site *tmp;
+	int cached = 0;
+	int remain = shrink_param(sc, nr_to_scan);
+	LIST_HEAD(splice);
+
+	if (!(shrink_param(sc, gfp_mask) & __GFP_FS)) {
+		if (remain != 0)
+			return -1;
+		else
+			/* We must not take the lu_sites_guard lock when
+			 * __GFP_FS is *not* set because of the deadlock
+			 * possibility detailed above. Additionally,
+			 * since we cannot determine the number of
+			 * objects in the cache without taking this
+			 * lock, we're in a particularly tough spot. As
+			 * a result, we'll just lie and say our cache is
+			 * empty. This _should_ be ok, as we can't
+			 * reclaim objects when __GFP_FS is *not* set
+			 * anyways.
+			 */
+			return 0;
+	}
+
+	CDEBUG(D_INODE, "Shrink %d objects\n", remain);
+
+	mutex_lock(&lu_sites_guard);
+	list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) {
+		if (shrink_param(sc, nr_to_scan) != 0) {
+			remain = lu_site_purge(&lu_shrink_env, s, remain);
+			/*
+			 * Move just shrunk site to the tail of site list to
+			 * assure shrinking fairness.
+			 */
+			list_move_tail(&s->ls_linkage, &splice);
+		}
+
+		memset(&stats, 0, sizeof(stats));
+		lu_site_stats_get(s->ls_obj_hash, &stats, 0);
+		cached += stats.lss_total - stats.lss_busy;	/* non-busy objects */
+		if (shrink_param(sc, nr_to_scan) && remain <= 0)
+			break;	/* scan budget used up */
+	}
+	list_splice(&splice, lu_sites.prev);	/* shrunk sites rejoin at the tail */
+	mutex_unlock(&lu_sites_guard);
+
+	cached = (cached / 100) * sysctl_vfs_cache_pressure;
+	if (shrink_param(sc, nr_to_scan) == 0)
+		CDEBUG(D_INODE, "%d objects cached\n", cached);
+	return cached;
+}
+
+/*
+ * Debugging stuff.
+ */
+
+/**
+ * Environment for debugger use; lu_debugging_setup() gives it all tags (~0).
+ */
+struct lu_env lu_debugging_env;
+
+/**
+ * Debugging printer: vprintk() of \a format; \a env, \a unused are ignored.
+ */
+int lu_printk_printer(const struct lu_env *env,
+		      void *unused, const char *format, ...)
+{
+	va_list args;
+
+	va_start(args, format);
+	vprintk(format, args);
+	va_end(args);
+	return 0;	/* unconditionally 0 */
+}
+
+int lu_debugging_setup(void)
+{
+	return lu_env_init(&lu_debugging_env, ~0);	/* ~0: every tag bit */
+}
+
+void lu_context_keys_dump(void)
+{
+	int idx;
+
+	/* Report every occupied lu_keys[] slot through CERROR(). */
+	for (idx = 0; idx < ARRAY_SIZE(lu_keys); ++idx) {
+		struct lu_context_key *k = lu_keys[idx];
+
+		if (k == NULL)
+			continue;
+		CERROR("[%d]: %p %x (%p,%p,%p) %d %d \"%s\"@%p\n",
+		       idx, k, k->lct_tags,
+		       k->lct_init, k->lct_fini, k->lct_exit,
+		       k->lct_index, atomic_read(&k->lct_used),
+		       k->lct_owner ? k->lct_owner->name : "",
+		       k->lct_owner);
+		lu_ref_print(&k->lct_reference);
+	}
+}
+EXPORT_SYMBOL(lu_context_keys_dump);
+
+/**
+ * Init of global lu_* data. NOTE(review): failures below undo no earlier step.
+ */
+int lu_global_init(void)
+{
+	int result;
+
+	CDEBUG(D_INFO, "Lustre LU module (%p).\n", &lu_keys);
+
+	result = lu_ref_global_init();
+	if (result != 0)
+		return result;
+
+	LU_CONTEXT_KEY_INIT(&lu_global_key);
+	result = lu_context_key_register(&lu_global_key);
+	if (result != 0)
+		return result;	/* lu_ref_global_init() is not undone here */
+
+	/*
+	 * At this level, we don't know what tags are needed, so allocate them
+	 * conservatively. This should not be too bad, because this
+	 * environment is global.
+	 */
+	mutex_lock(&lu_sites_guard);
+	result = lu_env_init(&lu_shrink_env, LCT_SHRINKER);
+	mutex_unlock(&lu_sites_guard);
+	if (result != 0)
+		return result;	/* lu_global_key stays registered here */
+
+	/*
+	 * seeks estimation: 3 seeks to read a record from oi, one to read
+	 * inode, one for ea. Unfortunately setting this high value results in
+	 * lu_object/inode cache consuming all the memory.
+	 */
+	lu_site_shrinker = set_shrinker(DEFAULT_SEEKS, lu_cache_shrink);
+	if (lu_site_shrinker == NULL)
+		return -ENOMEM;	/* lu_shrink_env stays initialized here */
+
+	return result;
+}
+
+/**
+ * Dual to lu_global_init(): drops the shrinker, key, env and lu_ref state.
+ */
+void lu_global_fini(void)
+{
+	if (lu_site_shrinker != NULL) {	/* NULL if it was never installed */
+		remove_shrinker(lu_site_shrinker);
+		lu_site_shrinker = NULL;
+	}
+
+	lu_context_key_degister(&lu_global_key);
+
+	/*
+	 * Tear shrinker environment down _after_ de-registering
+	 * lu_global_key, because the latter has a value in the former.
+	 */
+	mutex_lock(&lu_sites_guard);
+	lu_env_fini(&lu_shrink_env);
+	mutex_unlock(&lu_sites_guard);
+
+	lu_ref_global_fini();
+}
+
+static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx)
+{
+#ifdef LPROCFS
+	struct lprocfs_counter ret;
+
+	lprocfs_stats_collect(stats, idx, &ret);
+	return (__u32)ret.lc_count;	/* lc_count is cut down to 32 bits */
+#else
+	return 0;	/* built without LPROCFS: every counter reads as 0 */
+#endif
+}
+
+/**
+ * Output site statistical counters into a buffer. Suitable for
+ * lprocfs_rd_*()-style functions. Unsigned counters are printed with %u.
+ */
+int lu_site_stats_print(const struct lu_site *s, char *page, int count)
+{
+	lu_site_stats_t stats;
+
+	memset(&stats, 0, sizeof(stats));
+	lu_site_stats_get(s->ls_obj_hash, &stats, 1);
+
+	return snprintf(page, count, "%u/%u %u/%d %u %u %u %u %u %u %u\n",
+			stats.lss_busy,
+			stats.lss_total,
+			stats.lss_populated,
+			CFS_HASH_NHLIST(s->ls_obj_hash),
+			stats.lss_max_search,
+			ls_stats_read(s->ls_stats, LU_SS_CREATED),
+			ls_stats_read(s->ls_stats, LU_SS_CACHE_HIT),
+			ls_stats_read(s->ls_stats, LU_SS_CACHE_MISS),
+			ls_stats_read(s->ls_stats, LU_SS_CACHE_RACE),
+			ls_stats_read(s->ls_stats, LU_SS_CACHE_DEATH_RACE),
+			ls_stats_read(s->ls_stats, LU_SS_LRU_PURGED));
+}
+EXPORT_SYMBOL(lu_site_stats_print);
+
+/**
+ * Create a kmem slab cache for every descriptor in \a caches, an array that
+ * ends at the first entry whose ckd_cache is NULL.
+ *
+ * \retval 0 or -ENOMEM (after lu_kmem_fini() has been run on \a caches)
+ */
+int lu_kmem_init(struct lu_kmem_descr *caches)
+{
+	struct lu_kmem_descr *d;
+
+	for (d = caches; d->ckd_cache != NULL; ++d) {
+		*d->ckd_cache = kmem_cache_create(d->ckd_name, d->ckd_size,
+						  0, 0, NULL);
+		if (*d->ckd_cache != NULL)
+			continue;
+		/* free all previously allocated caches */
+		lu_kmem_fini(caches);
+		return -ENOMEM;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(lu_kmem_init);
+
+/**
+ * Helper function to finalize a number of kmem slab caches at once. Dual to
+ * lu_kmem_init(); slots whose cache pointer is already NULL are skipped.
+ */
+void lu_kmem_fini(struct lu_kmem_descr *caches)
+{
+	for (; caches->ckd_cache != NULL; ++caches) {
+		if (*caches->ckd_cache == NULL)
+			continue;
+		kmem_cache_destroy(*caches->ckd_cache);
+		*caches->ckd_cache = NULL;
+	}
+}
+EXPORT_SYMBOL(lu_kmem_fini);
+
+/**
+ * Temporary solution to be able to assign fid in ->do_create()
+ * till we have fully-functional OST fids. \a o must still have a zero fid.
+ */
+void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o,
+			  const struct lu_fid *fid)
+{
+	struct lu_site		*s = o->lo_dev->ld_site;
+	struct lu_fid		*old = &o->lo_header->loh_fid;
+	struct lu_site_bkt_data	*bkt;
+	struct lu_object	*shadow;
+	wait_queue_t		 waiter;
+	cfs_hash_t		*hs;
+	cfs_hash_bd_t		 bd;
+	__u64			 version = 0;
+
+	LASSERT(fid_is_zero(old));
+
+	hs = s->ls_obj_hash;
+	cfs_hash_bd_get_and_lock(hs, (void *)fid, &bd, 1);
+	shadow = htable_lookup(s, &bd, fid, &waiter, &version);
+	/* supposed to be unique */
+	LASSERT(shadow == NULL);
+	*old = *fid;	/* the header takes a copy of \a fid */
+	bkt = cfs_hash_bd_extra_get(hs, &bd);
+	cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash);
+	bkt->lsb_busy++;	/* bumped while the bd lock is still held */
+	cfs_hash_bd_unlock(hs, &bd, 1);
+}
+EXPORT_SYMBOL(lu_object_assign_fid);
+
+/**
+ * Allocates an object with a zero (not yet assigned) fid through
+ * lu_object_alloc().
+ * XXX: temporary solution to be able to assign fid in ->do_create()
+ *      till we have fully-functional OST fids
+ */
+struct lu_object *lu_object_anon(const struct lu_env *env,
+				 struct lu_device *dev,
+				 const struct lu_object_conf *conf)
+{
+	struct lu_fid zero_fid;
+
+	/* the real fid can be set later through lu_object_assign_fid() */
+	fid_zero(&zero_fid);
+
+	return lu_object_alloc(env, dev, &zero_fid, conf);
+}
+EXPORT_SYMBOL(lu_object_anon);
+
+struct lu_buf LU_BUF_NULL = {	/* exported empty buffer */
+	.lb_buf = NULL,
+	.lb_len = 0
+};
+EXPORT_SYMBOL(LU_BUF_NULL);
+
+void lu_buf_free(struct lu_buf *buf)
+{
+	LASSERT(buf);
+	if (buf->lb_buf == NULL)
+		return;	/* nothing allocated; lb_len is left untouched */
+	LASSERT(buf->lb_len > 0);
+	OBD_FREE_LARGE(buf->lb_buf, buf->lb_len);
+	buf->lb_buf = NULL;
+	buf->lb_len = 0;
+}
+EXPORT_SYMBOL(lu_buf_free);
+
+void lu_buf_alloc(struct lu_buf *buf, int size)
+{
+	LASSERT(buf);
+	LASSERT(buf->lb_buf == NULL);	/* \a buf must be empty on entry */
+	LASSERT(buf->lb_len == 0);
+	OBD_ALLOC_LARGE(buf->lb_buf, size);
+	if (likely(buf->lb_buf))	/* no return value: callers check lb_buf */
+		buf->lb_len = size;
+}
+EXPORT_SYMBOL(lu_buf_alloc);
+
+void lu_buf_realloc(struct lu_buf *buf, int size)
+{
+	lu_buf_free(buf);	/* old contents are released, not copied */
+	lu_buf_alloc(buf, size);
+}
+EXPORT_SYMBOL(lu_buf_realloc);
+
+struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, int len)
+{
+	if (buf->lb_buf == NULL && buf->lb_len == 0)
+		lu_buf_alloc(buf, len);
+	else if (buf->lb_buf != NULL && buf->lb_len < len)
+		lu_buf_realloc(buf, len);
+
+	/* always \a buf itself; lb_buf is NULL if an allocation failed */
+	return buf;
+}
+EXPORT_SYMBOL(lu_buf_check_and_alloc);
+
+/**
+ * Make \a buf at least \a len bytes large.
+ * The old data is copied into the new storage.
+ * On allocation failure \a buf is left exactly as it was.
+ * \retval 0 or -ENOMEM
+ */
+int lu_buf_check_and_grow(struct lu_buf *buf, int len)
+{
+	char *grown;
+
+	if (buf->lb_len >= len)
+		return 0;	/* already large enough */
+
+	OBD_ALLOC_LARGE(grown, len);
+	if (grown == NULL)
+		return -ENOMEM;
+
+	/* Carry the old contents over, then release the old storage. */
+	if (buf->lb_buf != NULL) {
+		memcpy(grown, buf->lb_buf, buf->lb_len);
+		OBD_FREE_LARGE(buf->lb_buf, buf->lb_len);
+	}
+
+	buf->lb_len = len;
+	buf->lb_buf = grown;
+	return 0;
+}
+EXPORT_SYMBOL(lu_buf_check_and_grow);
diff --git a/drivers/staging/lustre/lustre/obdclass/lu_ref.c b/drivers/staging/lustre/lustre/obdclass/lu_ref.c
new file mode 100644
index 000000000000..23a76f158356
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lu_ref.c
@@ -0,0 +1,50 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lu_ref.c
+ *
+ * Lustre reference.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+# include <linux/libcfs/libcfs.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lu_ref.h>
diff --git a/drivers/staging/lustre/lustre/obdclass/lu_ucred.c b/drivers/staging/lustre/lustre/obdclass/lu_ucred.c
new file mode 100644
index 000000000000..229db6c39b78
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lu_ucred.c
@@ -0,0 +1,107 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lu_ucred.c
+ *
+ * Lustre lu_ucred context key.
+ * Registers the session context key for struct lu_ucred and exports the
+ * helpers that look it up in an environment's session
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/libcfs/libcfs.h>
+#include <obd_support.h>
+#include <lu_object.h>
+#include <md_object.h>
+
+/* context key constructor/destructor: lu_ucred_key_init, lu_ucred_key_fini */
+LU_KEY_INIT_FINI(lu_ucred, struct lu_ucred);
+
+static struct lu_context_key lu_ucred_key = {
+	.lct_tags = LCT_SESSION,	/* lu_ucred() looks it up in env->le_ses */
+	.lct_init = lu_ucred_key_init,
+	.lct_fini = lu_ucred_key_fini
+};
+
+/**
+ * Get the value stored under lu_ucred_key in the session of \a env.
+ * Returns NULL when \a env has no session (le_ses is NULL).
+ */
+struct lu_ucred *lu_ucred(const struct lu_env *env)
+{
+	struct lu_context *ses = env->le_ses;
+
+	return ses == NULL ? NULL : lu_context_key_get(ses, &lu_ucred_key);
+}
+EXPORT_SYMBOL(lu_ucred);
+
+/**
+ * Like lu_ucred(), but NULL unless uc_valid is UCRED_OLD or UCRED_NEW.
+ */
+struct lu_ucred *lu_ucred_check(const struct lu_env *env)
+{
+	struct lu_ucred *cred = lu_ucred(env);
+	if (cred == NULL || cred->uc_valid == UCRED_OLD ||
+	    cred->uc_valid == UCRED_NEW)
+		return cred;
+	return NULL;
+}
+EXPORT_SYMBOL(lu_ucred_check);
+
+/**
+ * Same lookup as lu_ucred_check(), but a NULL result trips LASSERT(): the
+ * ucred must exist and have uc_valid set to UCRED_OLD or UCRED_NEW.
+ */
+struct lu_ucred *lu_ucred_assert(const struct lu_env *env)
+{
+	struct lu_ucred *uc = lu_ucred_check(env);
+	LASSERT(uc != NULL);
+	return uc;
+}
+EXPORT_SYMBOL(lu_ucred_assert);
+
+int lu_ucred_global_init(void)
+{
+	LU_CONTEXT_KEY_INIT(&lu_ucred_key);
+	return lu_context_key_register(&lu_ucred_key);	/* result passed on */
+}
+
+void lu_ucred_global_fini(void)
+{
+	lu_context_key_degister(&lu_ucred_key);	/* dual to the register call */
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/lustre_handles.c b/drivers/staging/lustre/lustre/obdclass/lustre_handles.c
new file mode 100644
index 000000000000..69d6499ef731
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lustre_handles.c
@@ -0,0 +1,263 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lustre_handles.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_support.h>
+#include <lustre_handles.h>
+#include <lustre_lib.h>
+
+
+static __u64 handle_base;	/* last cookie handed out */
+#define HANDLE_INCR 7	/* step between consecutive cookies */
+static spinlock_t handle_base_lock;	/* held while handle_base advances */
+
+static struct handle_bucket {
+	spinlock_t	lock;	/* held around changes to head */
+	struct list_head	head;	/* handles chained through h_link */
+} *handle_hash;
+
+#define HANDLE_HASH_SIZE (1 << 16)
+#define HANDLE_HASH_MASK (HANDLE_HASH_SIZE - 1)
+
+/*
+ * Generate a unique 64bit cookie (hash) for a handle and insert it into
+ * global (per-node) hash-table. \a h must not be on any list (h_link empty).
+ */
+void class_handle_hash(struct portals_handle *h,
+		       struct portals_handle_ops *ops)
+{
+	struct handle_bucket *bucket;
+	ENTRY;
+
+	LASSERT(h != NULL);
+	LASSERT(list_empty(&h->h_link));
+
+	/*
+	 * This is fast, but simplistic cookie generation algorithm, it will
+	 * need a re-do at some point in the future for security.
+	 */
+	spin_lock(&handle_base_lock);
+	handle_base += HANDLE_INCR;
+
+	if (unlikely(handle_base == 0)) {
+		/*
+		 * Cookie of zero is "dangerous", because in many places it's
+		 * assumed that 0 means "unassigned" handle, not bound to any
+		 * object.
+		 */
+		CWARN("The universe has been exhausted: cookie wrap-around.\n");
+		handle_base += HANDLE_INCR;
+	}
+	h->h_cookie = handle_base;
+	spin_unlock(&handle_base_lock);
+
+	h->h_ops = ops;
+	spin_lock_init(&h->h_lock);
+
+	bucket = &handle_hash[h->h_cookie & HANDLE_HASH_MASK];
+	spin_lock(&bucket->lock);
+	list_add_rcu(&h->h_link, &bucket->head);
+	h->h_in = 1;	/* class_handle2object() only returns handles with h_in */
+	spin_unlock(&bucket->lock);
+
+	CDEBUG(D_INFO, "added object %p with handle "LPX64" to hash\n",
+	       h, h->h_cookie);
+	EXIT;
+}
+EXPORT_SYMBOL(class_handle_hash);
+
+static void class_handle_unhash_nolock(struct portals_handle *h)
+{	/* both callers in this file hold the handle's bucket lock */
+	if (list_empty(&h->h_link)) {
+		CERROR("removing an already-removed handle ("LPX64")\n",
+		       h->h_cookie);
+		return;
+	}
+
+	CDEBUG(D_INFO, "removing object %p with handle "LPX64" from hash\n",
+	       h, h->h_cookie);
+
+	spin_lock(&h->h_lock);
+	if (h->h_in == 0) {	/* already unhashed: nothing to delete */
+		spin_unlock(&h->h_lock);
+		return;
+	}
+	h->h_in = 0;	/* class_handle2object() stops returning \a h */
+	spin_unlock(&h->h_lock);
+	list_del_rcu(&h->h_link);
+}
+
+void class_handle_unhash(struct portals_handle *h)
+{
+	struct handle_bucket *b = &handle_hash[h->h_cookie & HANDLE_HASH_MASK];
+
+	/* class_handle_unhash_nolock() runs with the bucket lock held */
+	spin_lock(&b->lock);
+	class_handle_unhash_nolock(h);
+	spin_unlock(&b->lock);
+}
+EXPORT_SYMBOL(class_handle_unhash);
+
+void class_handle_hash_back(struct portals_handle *h)
+{
+	struct handle_bucket *bucket;
+	ENTRY;
+
+	bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK);	/* old cookie */
+
+	spin_lock(&bucket->lock);
+	list_add_rcu(&h->h_link, &bucket->head);
+	h->h_in = 1;	/* visible to class_handle2object() again */
+	spin_unlock(&bucket->lock);
+
+	EXIT;
+}
+EXPORT_SYMBOL(class_handle_hash_back);
+
+void *class_handle2object(__u64 cookie)
+{
+	struct handle_bucket *bucket;
+	struct portals_handle *h;
+	void *retval = NULL;
+	ENTRY;
+
+	LASSERT(handle_hash != NULL);
+
+	/* Be careful when you want to change this code - jxiong. NOTE(review):
+	 * this file defines no rcu_read_lock(); the old pointer to it is stale. */
+	bucket = handle_hash + (cookie & HANDLE_HASH_MASK);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(h, &bucket->head, h_link) {
+		if (h->h_cookie != cookie)
+			continue;
+
+		spin_lock(&h->h_lock);
+		if (likely(h->h_in != 0)) {
+			h->h_ops->hop_addref(h);	/* reference for the caller */
+			retval = h;
+		}
+		spin_unlock(&h->h_lock);
+		break;	/* first cookie match ends the search, h_in or not */
+	}
+	rcu_read_unlock();
+
+	RETURN(retval);
+}
+EXPORT_SYMBOL(class_handle2object);
+
+void class_handle_free_cb(cfs_rcu_head_t *rcu)
+{
+	struct portals_handle *h = RCU2HANDLE(rcu);
+	void *ptr = (void *)(unsigned long)h->h_cookie;	/* cookie as address */
+
+	if (h->h_ops->hop_free != NULL)	/* NOTE(review): who stores ptr there? */
+		h->h_ops->hop_free(ptr, h->h_size);
+	else
+		OBD_FREE(ptr, h->h_size);
+}
+EXPORT_SYMBOL(class_handle_free_cb);
+
+int class_handle_init(void)
+{
+	struct handle_bucket *bucket;
+	struct timeval tv;
+	int seed[2];
+
+	LASSERT(handle_hash == NULL);
+
+	OBD_ALLOC_LARGE(handle_hash, sizeof(*bucket) * HANDLE_HASH_SIZE);
+	if (handle_hash == NULL)
+		return -ENOMEM;
+
+	spin_lock_init(&handle_base_lock);
+	for (bucket = handle_hash; bucket < handle_hash + HANDLE_HASH_SIZE;
+	     bucket++) {	/* forward walk: never points below handle_hash */
+		INIT_LIST_HEAD(&bucket->head);
+		spin_lock_init(&bucket->lock);
+	}
+
+	/** bug 21430: add randomness to the initial base */
+	cfs_get_random_bytes(seed, sizeof(seed));
+	do_gettimeofday(&tv);
+	cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]);
+
+	cfs_get_random_bytes(&handle_base, sizeof(handle_base));
+	/* a zero base is harmless: class_handle_hash() steps it before use */
+
+	return 0;
+}
+
+static int cleanup_all_handles(void)
+{	/* returns how many handles were still found in the hash */
+	int rc;
+	int i;
+
+	for (rc = i = 0; i < HANDLE_HASH_SIZE; i++) {
+		struct portals_handle *h;
+
+		spin_lock(&handle_hash[i].lock);
+		list_for_each_entry_rcu(h, &(handle_hash[i].head), h_link) {
+			CERROR("force clean handle "LPX64" addr %p ops %p\n",
+			       h->h_cookie, h, h->h_ops);
+
+			class_handle_unhash_nolock(h);	/* unlinked, not freed */
+			rc++;
+		}
+		spin_unlock(&handle_hash[i].lock);
+	}
+
+	return rc;
+}
+
+void class_handle_cleanup(void)
+{
+	int count;
+	LASSERT(handle_hash != NULL);
+
+	count = cleanup_all_handles();	/* handles left over at cleanup */
+
+	OBD_FREE_LARGE(handle_hash, sizeof(*handle_hash) * HANDLE_HASH_SIZE);
+	handle_hash = NULL;	/* lets class_handle_init() pass its LASSERT again */
+
+	if (count != 0)
+		CERROR("handle_count at cleanup: %d\n", count);
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/lustre_peer.c b/drivers/staging/lustre/lustre/obdclass/lustre_peer.c
new file mode 100644
index 000000000000..2fa2589dc8eb
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lustre_peer.c
@@ -0,0 +1,218 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <obd.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+#include <lustre_ha.h>
+#include <lustre_net.h>
+#include <lprocfs_status.h>
+
+#define NIDS_MAX	32	/* capacity of uuid_nid_data::un_nids */
+
+struct uuid_nid_data {
+	struct list_head       un_list;	/* link in g_uuid_list */
+	struct obd_uuid  un_uuid;	/* uuid this entry describes */
+	int	      un_nid_count;	/* used slots in un_nids */
+	lnet_nid_t       un_nids[NIDS_MAX];
+};
+
+/* FIXME: This should probably become more elegant than a global linked list */
+static struct list_head	g_uuid_list;
+static spinlock_t	g_uuid_lock;	/* held for every g_uuid_list access */
+
+void class_init_uuidlist(void)
+{
+	INIT_LIST_HEAD(&g_uuid_list);	/* list starts out empty */
+	spin_lock_init(&g_uuid_lock);
+}
+
+void class_exit_uuidlist(void)
+{
+	/* delete all: a NULL uuid makes class_del_uuid() empty g_uuid_list */
+	class_del_uuid(NULL);
+}
+
+int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index)
+{
+	struct uuid_nid_data *data;
+	struct obd_uuid tmp;
+	int rc = -ENOENT;	/* unknown uuid or index out of range */
+
+	obd_str2uuid(&tmp, uuid);
+	spin_lock(&g_uuid_lock);
+	list_for_each_entry(data, &g_uuid_list, un_list) {
+		if (obd_uuid_equals(&data->un_uuid, &tmp)) {
+			/* un_nids[] is only valid for [0, un_nid_count) */
+			if (index < 0 || index >= data->un_nid_count)
+				break;
+			rc = 0;
+			*peer_nid = data->un_nids[index];
+			break;
+		}
+	}
+	spin_unlock(&g_uuid_lock);
+	return rc;
+}
+EXPORT_SYMBOL(lustre_uuid_to_peer);
+
+/* Add a nid to a niduuid.  Multiple nids can be added to a single uuid;
+   LNET will choose the best one. */
+int class_add_uuid(const char *uuid, __u64 nid)
+{
+	struct uuid_nid_data *data, *entry;
+	int nid_count = 0;	/* nids of the matching entry; 0: none matched */
+
+	LASSERT(nid != 0);  /* valid newconfig NID is never zero */
+
+	if (strlen(uuid) > UUID_MAX - 1)
+		return -EOVERFLOW;
+
+	OBD_ALLOC_PTR(data);
+	if (data == NULL)
+		return -ENOMEM;
+
+	obd_str2uuid(&data->un_uuid, uuid);
+	data->un_nids[0] = nid;
+	data->un_nid_count = 1;
+
+	spin_lock(&g_uuid_lock);
+	list_for_each_entry(entry, &g_uuid_list, un_list) {
+		if (obd_uuid_equals(&entry->un_uuid, &data->un_uuid)) {
+			int i;
+
+			for (i = 0; i < entry->un_nid_count; i++)
+				if (nid == entry->un_nids[i])
+					break;
+
+			if (i == entry->un_nid_count) {
+				LASSERT(entry->un_nid_count < NIDS_MAX);
+				entry->un_nids[entry->un_nid_count++] = nid;
+			}
+			nid_count = entry->un_nid_count; /* snapshot under lock */
+			break;
+		}
+	}
+	if (nid_count == 0)
+		list_add(&data->un_list, &g_uuid_list);
+	spin_unlock(&g_uuid_lock);
+
+	if (nid_count != 0) {
+		CDEBUG(D_INFO, "found uuid %s %s cnt=%d\n", uuid,
+		       libcfs_nid2str(nid), nid_count);
+		OBD_FREE(data, sizeof(*data));
+	} else {
+		CDEBUG(D_INFO, "add uuid %s %s\n", uuid, libcfs_nid2str(nid));
+	}
+	return 0;
+}
+EXPORT_SYMBOL(class_add_uuid);
+
+/* Delete the nids for one uuid if specified, otherwise delete all */
+int class_del_uuid(const char *uuid)
+{
+	LIST_HEAD(deathrow);
+	struct uuid_nid_data *data, *next;
+
+	/* Entries are unlinked under the lock but freed after dropping it. */
+	spin_lock(&g_uuid_lock);
+	if (uuid == NULL) {
+		list_splice_init(&g_uuid_list, &deathrow);
+	} else {
+		struct obd_uuid tmp;
+
+		obd_str2uuid(&tmp, uuid);
+		list_for_each_entry(data, &g_uuid_list, un_list) {
+			if (!obd_uuid_equals(&data->un_uuid, &tmp))
+				continue;
+			list_move(&data->un_list, &deathrow);
+			break;
+		}
+	}
+	spin_unlock(&g_uuid_lock);
+
+	if (uuid != NULL && list_empty(&deathrow)) {
+		CDEBUG(D_INFO, "Try to delete a non-existent uuid %s\n", uuid);
+		return -EINVAL;
+	}
+
+	list_for_each_entry_safe(data, next, &deathrow, un_list) {
+		list_del(&data->un_list);
+
+		CDEBUG(D_INFO, "del uuid %s %s/%d\n",
+		       obd_uuid2str(&data->un_uuid),
+		       libcfs_nid2str(data->un_nids[0]),
+		       data->un_nid_count);
+
+		OBD_FREE(data, sizeof(*data));
+	}
+
+	return 0;
+}
+
+/* check if @nid exists in nid list of @uuid; returns 1 if so, else 0 */
+int class_check_uuid(struct obd_uuid *uuid, __u64 nid)
+{
+	struct uuid_nid_data *entry;
+	int found = 0;
+	ENTRY;
+
+	CDEBUG(D_INFO, "check if uuid %s has %s.\n",
+	       obd_uuid2str(uuid), libcfs_nid2str(nid));
+
+	spin_lock(&g_uuid_lock);
+	list_for_each_entry(entry, &g_uuid_list, un_list) {
+		int i;
+
+		if (!obd_uuid_equals(&entry->un_uuid, uuid))
+			continue;
+
+		/* found the uuid, check if it has @nid */
+		for (i = 0; i < entry->un_nid_count; i++) {
+			if (entry->un_nids[i] == nid) {
+				found = 1;
+				break;
+			}
+		}
+		break;	/* only the first entry matching @uuid is examined */
+	}
+	spin_unlock(&g_uuid_lock);
+	RETURN(found);
+}
+EXPORT_SYMBOL(class_check_uuid);
diff --git a/drivers/staging/lustre/lustre/obdclass/md_attrs.c b/drivers/staging/lustre/lustre/obdclass/md_attrs.c
new file mode 100644
index 000000000000..b71344a04c7e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/md_attrs.c
@@ -0,0 +1,202 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ * Use is subject to license terms.
+ *
+ * Author: Johann Lombardi <johann.lombardi@intel.com>
+ */
+
+#include <lustre/lustre_idl.h>
+#include <obd.h>
+#include <md_object.h>
+
+/**
+ * Initialize new \a lma: clear compat, store \a incompat and \a fid.
+ *
+ * \param lma - is the new LMA structure to be initialized
+ * \param fid - is the FID of the object this LMA belongs to
+ * \param incompat - features that MDS must understand to access object
+ */
+void lustre_lma_init(struct lustre_mdt_attrs *lma, const struct lu_fid *fid,
+		     __u32 incompat)
+{
+	lma->lma_compat   = 0;
+	lma->lma_incompat = incompat;
+	lma->lma_self_fid = *fid;
+
+	/* If a field is added in struct lustre_mdt_attrs, zero it explicitly
+	 * and change the test below. */
+	LASSERT(sizeof(*lma) ==
+		(offsetof(struct lustre_mdt_attrs, lma_self_fid) +
+		 sizeof(lma->lma_self_fid)));
+}
+EXPORT_SYMBOL(lustre_lma_init);
+
+/**
+ * Swab, if needed, LMA structure which is stored on-disk in little-endian order.
+ *
+ * \param lma - is a pointer to the LMA structure to be swabbed (in place).
+ */
+void lustre_lma_swab(struct lustre_mdt_attrs *lma)
+{
+	/* The magic equals its le32 form on little-endian CPUs: no swab. */
+	if (LUSTRE_MSG_MAGIC == cpu_to_le32(LUSTRE_MSG_MAGIC))
+		return;
+	__swab32s(&lma->lma_compat);
+	__swab32s(&lma->lma_incompat);
+	lustre_swab_lu_fid(&lma->lma_self_fid);
+}
+EXPORT_SYMBOL(lustre_lma_swab);
+
+/**
+ * Swab, if needed, SOM structure which is stored on-disk in little-endian
+ * order.
+ *
+ * \param attrs - is a pointer to the SOM structure to be swabbed (in place).
+ */
+void lustre_som_swab(struct som_attrs *attrs)
+{
+	/* The magic equals its le32 form on little-endian CPUs: no swab. */
+	if (LUSTRE_MSG_MAGIC == cpu_to_le32(LUSTRE_MSG_MAGIC))
+		return;
+	__swab32s(&attrs->som_compat);
+	__swab32s(&attrs->som_incompat);
+	__swab64s(&attrs->som_ioepoch);
+	__swab64s(&attrs->som_size);
+	__swab64s(&attrs->som_blocks);
+	__swab64s(&attrs->som_mountid);
+}
+EXPORT_SYMBOL(lustre_som_swab);
+
+/*
+ * Swab and extract SOM attributes from on-disk xattr.
+ *
+ * \param buf - is a buffer containing the on-disk SOM extended attribute;
+ *		it is handed to lustre_som_swab() and may be swabbed in place.
+ * \param rc  - is the result of fetching the xattr into \a buf: 0 or
+ *		-ENODATA mean "no SOM xattr", other negative values are
+ *		fetch errors and are returned unchanged.
+ * \param msd - is the md_som_data structure where to extract SOM attributes.
+ *
+ * \retval 0        \a msd has been filled in
+ * \retval -ENODATA no SOM xattr, or unsupported incompat flags are set
+ *
+ * NOTE(review): a positive \a rc is never compared with sizeof(*som).
+ */
+int lustre_buf2som(void *buf, int rc, struct md_som_data *msd)
+{
+	struct som_attrs *som = buf;
+	ENTRY;
+
+	/* nothing usable was fetched: report "no SOM" or the fetch error */
+	if (rc <= 0)
+		RETURN(rc == 0 ? -ENODATA : rc);
+
+	/* som_incompat is tested before swabbing, hence the le32 mask */
+	if (som->som_incompat & ~cpu_to_le32(SOM_INCOMPAT_SUPP))
+		RETURN(-ENODATA);
+
+	lustre_som_swab(som);
+	msd->msd_compat   = som->som_compat;
+	msd->msd_incompat = som->som_incompat;
+	msd->msd_ioepoch  = som->som_ioepoch;
+	msd->msd_size     = som->som_size;
+	msd->msd_blocks   = som->som_blocks;
+	msd->msd_mountid  = som->som_mountid;
+	RETURN(0);
+}
+EXPORT_SYMBOL(lustre_buf2som);
+
+/**
+ * Swab, if needed, HSM structure which is stored on-disk in little-endian
+ * order.
+ *
+ * \param attrs - is a pointer to the HSM structure to be swabbed (in place).
+ */
+void lustre_hsm_swab(struct hsm_attrs *attrs)
+{
+	/* The magic equals its le32 form on little-endian CPUs: no swab. */
+	if (LUSTRE_MSG_MAGIC == cpu_to_le32(LUSTRE_MSG_MAGIC))
+		return;
+	__swab32s(&attrs->hsm_compat);
+	__swab32s(&attrs->hsm_flags);
+	__swab64s(&attrs->hsm_arch_id);
+	__swab64s(&attrs->hsm_arch_ver);
+}
+EXPORT_SYMBOL(lustre_hsm_swab);
+
+/*
+ * Swab and extract HSM attributes from on-disk xattr.
+ *
+ * \param buf - is a buffer containing the on-disk HSM extended attribute;
+ *		it is handed to lustre_hsm_swab() and may be swabbed in place.
+ * \param rc  - is the result of fetching the xattr into \a buf: 0 or
+ *		-ENODATA mean "no HSM xattr", other negative values are
+ *		fetch errors and are returned unchanged.
+ * \param mh  - is the md_hsm structure where to extract HSM attributes.
+ *
+ * \retval 0        \a mh has been filled in
+ * \retval -ENODATA no HSM xattr was fetched
+ *
+ * NOTE(review): a positive \a rc is never compared with sizeof(*hsm).
+ */
+int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh)
+{
+	struct hsm_attrs *hsm = buf;
+	ENTRY;
+
+	/* nothing usable was fetched: report "no HSM" or the fetch error */
+	if (rc <= 0)
+		RETURN(rc == 0 ? -ENODATA : rc);
+
+	lustre_hsm_swab(hsm);
+	mh->mh_compat   = hsm->hsm_compat;
+	mh->mh_flags    = hsm->hsm_flags;
+	mh->mh_arch_id  = hsm->hsm_arch_id;
+	mh->mh_arch_ver = hsm->hsm_arch_ver;
+	RETURN(0);
+}
+EXPORT_SYMBOL(lustre_buf2hsm);
+
+/*
+ * Pack HSM attributes.
+ *
+ * \param buf - is the output buffer; it is written as a struct hsm_attrs.
+ * \param mh  - is the md_hsm structure to pack.
+ */
+void lustre_hsm2buf(void *buf, struct md_hsm *mh)
+{
+	struct hsm_attrs *disk = buf;
+	ENTRY;
+
+	/* fill the buffer in CPU byte order first ... */
+	disk->hsm_arch_ver = mh->mh_arch_ver;
+	disk->hsm_arch_id  = mh->mh_arch_id;
+	disk->hsm_flags    = mh->mh_flags;
+	disk->hsm_compat   = mh->mh_compat;
+
+	/* ... then let lustre_hsm_swab() convert it where needed */
+	lustre_hsm_swab(disk);
+}
+EXPORT_SYMBOL(lustre_hsm2buf);
diff --git a/drivers/staging/lustre/lustre/obdclass/md_local_object.c b/drivers/staging/lustre/lustre/obdclass/md_local_object.c
new file mode 100644
index 000000000000..ac5f44f19715
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/md_local_object.c
@@ -0,0 +1,459 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/md_local_object.c
+ *
+ * Lustre Local Object create APIs
+ * 'create on first mount' facility. Files registered under llo module will
+ * be created on first mount.
+ *
+ * Author: Pravin Shelar  <pravin.shelar@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_support.h>
+#include <lustre_disk.h>
+#include <lustre_fid.h>
+#include <lu_object.h>
+#include <linux/list.h>
+#include <md_object.h>
+
+
+/** Registered local object descriptors, linked through llod_linkage. */
+static struct list_head llo_lobj_list;
+
+/** Taken around list walks and updates in register/unregister/setup. */
+static struct mutex	llo_lock;
+
+/**
+ * State carried across llo_find_entry() calls while a path is resolved.
+ * \see llo_find_entry, llo_store_resolve
+ */
+struct llo_find_hint {
+	struct lu_fid	*lfh_cfid;	/* receives each looked-up FID */
+	struct md_device     *lfh_md;	/* device objects are located on */
+	struct md_object     *lfh_pobj;	/* current parent, then the result */
+};
+
+/**
+ * Thread Local storage for this module, fetched with llo_env_info().
+ */
+struct llo_thread_info {
+	/** private copy of the path being resolved by llo_store_resolve() */
+	char		    lti_buf[DT_MAX_PATH];
+	/** receives the directory FID in llo_store_create_index(), unused */
+	struct lu_fid	   lti_fid;
+	/** FID of the object being created in llo_local_objects_setup() */
+	struct lu_fid	   lti_cfid;
+	struct llo_find_hint    lti_lfh;	/* path-walk state */
+	struct md_op_spec       lti_spc;	/* spec for lookup/create */
+	struct md_attr	  lti_ma;		/* attributes for create */
+	struct lu_name	  lti_lname;	/* name for lookup/create */
+};
+
+LU_KEY_INIT(llod_global, struct llo_thread_info);
+LU_KEY_FINI(llod_global, struct llo_thread_info);
+/* Key for this module's per-context llo_thread_info; see llo_env_info(). */
+static struct lu_context_key llod_key = {
+	.lct_tags = LCT_MD_THREAD,
+	.lct_init = llod_global_key_init,
+	.lct_fini = llod_global_key_fini
+};
+
+static inline struct llo_thread_info * llo_env_info(const struct lu_env *env)
+{
+	return lu_context_key_get(&env->le_ctx,  &llod_key);
+}
+
+/**
+ * Look up \a fid on \a md with lu_object_find() and return, as an md_object,
+ * what lu_object_locate() yields for \a md's device type.  A failed lookup
+ * is returned as the same ERR_PTR() value.
+ */
+static struct md_object *llo_locate(const struct lu_env *env,
+				    struct md_device *md,
+				    const struct lu_fid *fid)
+{
+	struct lu_object *lo;
+
+	lo = lu_object_find(env, &md->md_lu_dev, fid, NULL);
+	if (IS_ERR(lo))
+		return (struct md_object *)lo;
+
+	lo = lu_object_locate(lo->lo_header, md->md_lu_dev.ld_type);
+	LASSERT(lo != NULL);
+	return (struct md_object *)lo;
+}
+
+/**
+ * Resolve \a name inside directory \a pobj with mdo_lookup(); the FID found
+ * is stored in \a fid.  Returns whatever mdo_lookup() returns.
+ */
+static int llo_lookup(const struct lu_env  *env,
+		      struct md_object *pobj,
+		      const char *name,
+		      struct lu_fid *fid)
+{
+	struct llo_thread_info *lti = llo_env_info(env);
+
+	/* the lu_name and op spec live in per-thread scratch storage */
+	lti->lti_lname.ln_name = name;
+	lti->lti_lname.ln_namelen = strlen(name);
+
+	lti->lti_spc.sp_feat = NULL;
+	lti->lti_spc.sp_cr_flags = 0;
+	lti->lti_spc.sp_cr_lookup = 0;
+	lti->lti_spc.sp_cr_mode = 0;
+
+	return mdo_lookup(env, pobj, &lti->lti_lname, fid, &lti->lti_spc);
+}
+
+/**
+ * Path-component callback for dt_path_parser(); \see llo_store_resolve.
+ *
+ * Looks \a name up in the current parent (lfh->lfh_pobj), drops the
+ * reference on that parent and, on success, replaces lfh->lfh_pobj with
+ * the object found.
+ *
+ * \retval      rc returns error code for lookup or locate operation
+ *
+ * When the lookup fails lfh->lfh_pobj keeps pointing at the parent whose
+ * reference was just dropped; when locate fails it holds the ERR_PTR().
+ */
+static int llo_find_entry(const struct lu_env  *env,
+			  const char *name, void *data)
+{
+	struct llo_find_hint *hint = data;
+	struct md_object *parent = hint->lfh_pobj;
+	struct md_object *child;
+	int rc;
+
+	rc = llo_lookup(env, parent, name, hint->lfh_cfid);
+	/* the parent is released whether or not the lookup worked */
+	lu_object_put(env, &parent->mo_lu);
+	if (rc != 0)
+		return rc;
+
+	/* get md object for fid that we got in lookup */
+	child = llo_locate(env, hint->lfh_md, hint->lfh_cfid);
+	hint->lfh_pobj = child;
+	return IS_ERR(child) ? PTR_ERR(child) : 0;
+}
+
+/*
+ * Look \a name up in directory \a p and return the md object for the FID
+ * found (also stored in \a fid), or an ERR_PTR() on failure.
+ */
+static struct md_object *llo_reg_open(const struct lu_env *env,
+				      struct md_device *md,
+				      struct md_object *p,
+				      const char *name,
+				      struct lu_fid *fid)
+{
+	int rc = llo_lookup(env, p, name, fid);
+
+	if (rc != 0)
+		return ERR_PTR(rc);
+
+	return llo_locate(env, md, fid);
+}
+
+/**
+ * Resolve given \a path, on success function returns
+ * md object for last directory and \a fid points to
+ * its fid.
+ *
+ * The walk starts at the root reported by \a dt and advances one component
+ * at a time through llo_find_entry().  \a path is copied into per-thread
+ * scratch space first and silently cut to DT_MAX_PATH - 1 characters.
+ * Failures are returned as ERR_PTR().
+ */
+struct md_object *llo_store_resolve(const struct lu_env *env,
+				    struct md_device *md,
+				    struct dt_device *dt,
+				    const char *path,
+				    struct lu_fid *fid)
+{
+	struct llo_thread_info *lti = llo_env_info(env);
+	struct llo_find_hint *hint = &lti->lti_lfh;
+	struct md_object *root;
+	int rc;
+
+	strncpy(lti->lti_buf, path, DT_MAX_PATH);
+	lti->lti_buf[DT_MAX_PATH - 1] = '\0';
+
+	hint->lfh_md = md;
+	hint->lfh_cfid = fid;
+
+	/* start path resolution from backend fs root. */
+	rc = dt->dd_ops->dt_root_get(env, dt, fid);
+	if (rc != 0)
+		return ERR_PTR(rc);
+
+	root = llo_locate(env, md, fid);
+	if (IS_ERR(root))
+		return root;
+
+	/* the parser callback replaces lfh_pobj as it descends */
+	hint->lfh_pobj = root;
+	rc = dt_path_parser(env, lti->lti_buf, llo_find_entry, hint);
+	return rc != 0 ? ERR_PTR(rc) : hint->lfh_pobj;
+}
+EXPORT_SYMBOL(llo_store_resolve);
+
+/**
+ * Returns md object for \a objname in given \a dirname, or an ERR_PTR().
+ * \a fid is used for the directory walk and, on success, ends up holding
+ * the FID looked up for \a objname.
+ */
+struct md_object *llo_store_open(const struct lu_env *env,
+				 struct md_device *md,
+				 struct dt_device *dt,
+				 const char *dirname,
+				 const char *objname,
+				 struct lu_fid *fid)
+{
+	struct md_object *parent;
+	struct md_object *found;
+
+	parent = llo_store_resolve(env, md, dt, dirname, fid);
+	if (IS_ERR(parent))
+		return parent;
+
+	found = llo_reg_open(env, md, parent, objname, fid);
+	lu_object_put(env, &parent->mo_lu);	/* done with the directory */
+	return found;
+}
+EXPORT_SYMBOL(llo_store_open);
+
+/*
+ * Create \a objname in \a dir using the object located for \a fid: a
+ * directory when \a feat is &dt_directory_features, else a regular file.
+ * Returns the object, or ERR_PTR() if llo_locate() or mdo_create() fails.
+ */
+static struct md_object *llo_create_obj(const struct lu_env *env,
+					struct md_device *md,
+					struct md_object *dir,
+					const char *objname,
+					const struct lu_fid *fid,
+					const struct dt_index_features *feat)
+{
+	struct llo_thread_info *lti = llo_env_info(env);
+	struct md_op_spec *spec = &lti->lti_spc;
+	struct lu_name *lname = &lti->lti_lname;
+	struct md_attr *ma = &lti->lti_ma;
+	struct lu_attr *la = &ma->ma_attr;
+	struct md_object *child;
+	int rc;
+
+	child = llo_locate(env, md, fid);
+	if (IS_ERR(child))
+		return child;
+
+	lname->ln_name = objname;
+	lname->ln_namelen = strlen(objname);
+
+	spec->sp_feat = feat;
+	spec->sp_cr_flags = 0;
+	spec->sp_cr_lookup = 1;
+	spec->sp_cr_mode = 0;
+
+	/* rw-r--r-- plus the file type; directories also get x for all */
+	la->la_mode = (feat == &dt_directory_features) ?
+		      (S_IFDIR | S_IXUGO) : S_IFREG;
+	la->la_mode |= S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
+	la->la_uid = la->la_gid = 0;
+	la->la_valid = LA_MODE | LA_UID | LA_GID;
+
+	ma->ma_valid = 0;
+	ma->ma_need = 0;
+	rc = mdo_create(env, dir, lname, child, spec, ma);
+	if (rc == 0)
+		return child;
+
+	lu_object_put(env, &child->mo_lu);
+	return ERR_PTR(rc);
+}
+
+/**
+ * Create md object, object could be directory or
+ * special index defined by \a feat in \a directory.
+ *
+ *       \param  md       device
+ *       \param  dt       device asked for the root FID during path resolution
+ *       \param  dirname  parent directory
+ *       \param  objname  file name
+ *       \param  fid      object fid
+ *       \param  feat     index features required for directory create
+ *
+ * Returns the new object or an ERR_PTR().
+ */
+struct md_object *llo_store_create_index(const struct lu_env *env,
+					 struct md_device *md,
+					 struct dt_device *dt,
+					 const char *dirname,
+					 const char *objname,
+					 const struct lu_fid *fid,
+					 const struct dt_index_features *feat)
+{
+	struct llo_thread_info *lti = llo_env_info(env);
+	struct md_object *parent;
+	struct md_object *created;
+
+	/* the directory's own FID lands in scratch space and is not used */
+	parent = llo_store_resolve(env, md, dt, dirname, &lti->lti_fid);
+	if (IS_ERR(parent))
+		return parent;
+
+	created = llo_create_obj(env, md, parent, objname, fid, feat);
+	lu_object_put(env, &parent->mo_lu);
+	return created;
+}
+EXPORT_SYMBOL(llo_store_create_index);
+
+/**
+ * Create md object for regular file in \a directory.
+ * Same as llo_store_create_index() called without index features.
+ *
+ *       \param  md       device
+ *       \param  dt       device handed on to llo_store_create_index()
+ *       \param  dirname  parent directory
+ *       \param  objname  file name
+ *       \param  fid      object fid.
+ */
+struct md_object *llo_store_create(const struct lu_env *env,
+				   struct md_device *md,
+				   struct dt_device *dt,
+				   const char *dirname,
+				   const char *objname,
+				   const struct lu_fid *fid)
+{
+	return llo_store_create_index(env, md, dt, dirname, objname, fid,
+				      NULL /* feat */);
+}
+EXPORT_SYMBOL(llo_store_create);
+
+/**
+ * Register object for 'create on first mount' facility.
+ * objects are created in order of registration.
+ *
+ * \a llod itself is linked into llo_lobj_list; no copy is made.
+ */
+void llo_local_obj_register(struct lu_local_obj_desc *llod)
+{
+	mutex_lock(&llo_lock);
+	list_add_tail(&llod->llod_linkage, &llo_lobj_list);
+	mutex_unlock(&llo_lock);
+}
+EXPORT_SYMBOL(llo_local_obj_register);
+
+/* Unlink \a llod from the list built by llo_local_obj_register(). */
+void llo_local_obj_unregister(struct lu_local_obj_desc *llod)
+{
+	mutex_lock(&llo_lock);
+	list_del(&llod->llod_linkage);
+	mutex_unlock(&llo_lock);
+}
+EXPORT_SYMBOL(llo_local_obj_unregister);
+
+/**
+ * Create every object registered with llo_local_obj_register(), in
+ * registration order, holding llo_lock for the whole walk.  An object
+ * that already exists (-EEXIST) is skipped; any other failure is logged,
+ * stops the walk and is returned.  Returns 0 otherwise.
+ */
+int llo_local_objects_setup(const struct lu_env *env,
+			     struct md_device * md,
+			     struct dt_device *dt)
+{
+	struct llo_thread_info *lti = llo_env_info(env);
+	struct lu_fid *fid = &lti->lti_cfid;
+	struct lu_local_obj_desc *desc;
+	struct md_object *obj;
+	const char *parent;
+	int rc = 0;
+
+	mutex_lock(&llo_lock);
+	list_for_each_entry(desc, &llo_lobj_list, llod_linkage) {
+		lu_local_obj_fid(fid, desc->llod_oid);
+		/* descriptors without a directory are created under "" */
+		parent = desc->llod_dir ? desc->llod_dir : "";
+
+		if (desc->llod_is_index)
+			obj = llo_store_create_index(env, md, dt, parent,
+						     desc->llod_name, fid,
+						     desc->llod_feat);
+		else
+			obj = llo_store_create(env, md, dt, parent,
+					       desc->llod_name, fid);
+
+		if (!IS_ERR(obj)) {
+			/* only creation matters here, drop the object */
+			lu_object_put(env, &obj->mo_lu);
+			continue;
+		}
+		if (PTR_ERR(obj) == -EEXIST)
+			continue;
+
+		rc = PTR_ERR(obj);
+		CERROR("creating obj [%s] fid = "DFID" rc = %d\n",
+		       desc->llod_name, PFID(fid), rc);
+		break;
+	}
+	mutex_unlock(&llo_lock);
+
+	return rc;
+}
+
+EXPORT_SYMBOL(llo_local_objects_setup);
+
+/* Global setup: prepare the descriptor list and its lock, then register
+ * llod_key.  Returns the result of lu_context_key_register(). */
+int llo_global_init(void)
+{
+	mutex_init(&llo_lock);
+	INIT_LIST_HEAD(&llo_lobj_list);
+
+	LU_CONTEXT_KEY_INIT(&llod_key);
+	return lu_context_key_register(&llod_key);
+}
+
+/* Global teardown; every descriptor must have been unregistered by now. */
+void llo_global_fini(void)
+{
+	lu_context_key_degister(&llod_key);
+	LASSERT(list_empty(&llo_lobj_list));
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/mea.c b/drivers/staging/lustre/lustre/obdclass/mea.c
new file mode 100644
index 000000000000..c4f0dbc23611
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/mea.c
@@ -0,0 +1,112 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#include <obd_class.h>
+#include <linux/kmod.h>   /* for request_module() */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+
+/* Hash on the last character of \a name only, reduced modulo \a count. */
+static int mea_last_char_hash(int count, char *name, int namelen)
+{
+	unsigned int last = name[namelen - 1];
+
+	/* presumably a NUL here means namelen overshoots the real name */
+	if (last == 0)
+		CWARN("looks like wrong len is passed\n");
+	return last % count;
+}
+
+/* Sum of all \a namelen characters of \a name, reduced modulo \a count. */
+static int mea_all_chars_hash(int count, char *name, int namelen)
+{
+	unsigned int sum = 0;
+
+	for (; namelen > 0; namelen--)
+		sum += name[namelen - 1];
+	return sum % count;
+}
+
+/* Map \a name to an index below \a count: the index embedded in a volatile
+ * file name when it is in range, else the \a hashtype hash of the name
+ * (0 for unsupported hash types or when \a count <= 1). */
+int raw_name2idx(int hashtype, int count, const char *name, int namelen)
+{
+	unsigned int	c = 0;
+	int		idx;
+
+	LASSERT(namelen > 0);
+	if (filename_is_volatile(name, namelen, &idx) &&
+	    idx >= 0 && idx < count)
+		return idx;
+
+	/* volatile names with no usable index land here too: no "% 0" */
+	if (count <= 1)
+		return 0;
+
+	switch (hashtype) {
+	case MEA_MAGIC_LAST_CHAR:
+		c = mea_last_char_hash(count, (char *)name, namelen);
+		break;
+	case MEA_MAGIC_ALL_CHARS:
+		c = mea_all_chars_hash(count, (char *)name, namelen);
+		break;
+	case MEA_MAGIC_HASH_SEGMENT:
+		CERROR("Unsupported hash type MEA_MAGIC_HASH_SEGMENT\n");
+		break;
+	default:
+		CERROR("Unknown hash type 0x%x\n", hashtype);
+	}
+
+	LASSERT(c < count);
+	return c;
+}
+EXPORT_SYMBOL(raw_name2idx);
+
+/* Pick the index for \a name from the hash type (mea_magic) and count
+ * (mea_count) kept in \a mea, which must be non-NULL with a non-zero count. */
+int mea_name2idx(struct lmv_stripe_md *mea, const char *name, int namelen)
+{
+	unsigned int idx;
+
+	LASSERT(mea && mea->mea_count);
+	idx = raw_name2idx(mea->mea_magic, mea->mea_count, name, namelen);
+	LASSERT(idx < mea->mea_count);
+	return idx;
+}
+EXPORT_SYMBOL(mea_name2idx);
diff --git a/drivers/staging/lustre/lustre/obdclass/obd_config.c b/drivers/staging/lustre/lustre/obdclass/obd_config.c
new file mode 100644
index 000000000000..9636aa9efed0
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/obd_config.c
@@ -0,0 +1,1899 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/obd_config.c
+ *
+ * Config API
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#include <obd_class.h>
+#include <linux/string.h>
+#include <lustre_log.h>
+#include <lprocfs_status.h>
+#include <lustre_param.h>
+
+#include "llog_internal.h"
+
+static cfs_hash_ops_t uuid_hash_ops;
+static cfs_hash_ops_t nid_hash_ops;
+static cfs_hash_ops_t nid_stat_hash_ops;
+
+/*********** string parsing utils *********/
+
+/* Find the first occurrence of \a key anywhere in \a buf.
+ * Returns 0 and, when \a valp is non-NULL, points *valp just past the key;
+ * returns 1 when \a buf is NULL or does not contain \a key. */
+int class_find_param(char *buf, char *key, char **valp)
+{
+	char *hit = NULL;
+
+	if (buf != NULL)
+		hit = strstr(buf, key);
+	if (hit == NULL)
+		return 1;
+
+	if (valp != NULL)
+		*valp = hit + strlen(key);
+	return 0;
+}
+EXPORT_SYMBOL(class_find_param);
+
+/**
+ * Look \a param up in \a ptr, a table mapping old proc parameter names to
+ * new ones.  Only the name part of \a param (the text before the first '=',
+ * if any) is compared, and it has to equal an old_param exactly.
+ *
+ * \param param			proc parameter
+ * \param ptr			an array which contains the mapping from
+ *				old parameters to new ones; the entry with
+ *				a NULL old_param terminates it
+ *
+ * \retval valid-pointer	pointer to the cfg_interop_param structure
+ *				which contains the old and new parameters
+ * \retval NULL			\a param or \a ptr is NULL,
+ *				or \a param is not an old parameter
+ */
+struct cfg_interop_param *class_find_old_param(const char *param,
+					       struct cfg_interop_param *ptr)
+{
+	const char *eq;
+	int name_len;
+
+	if (param == NULL || ptr == NULL)
+		RETURN(NULL);
+
+	/* the name is everything up to the first '=' (or the whole string) */
+	eq = strchr(param, '=');
+	if (eq != NULL)
+		name_len = eq - param;
+	else
+		name_len = strlen(param);
+
+	for (; ptr->old_param != NULL; ptr++) {
+		if (strlen(ptr->old_param) == name_len &&
+		    strncmp(param, ptr->old_param, name_len) == 0)
+			RETURN(ptr);
+	}
+
+	RETURN(NULL);
+}
+EXPORT_SYMBOL(class_find_old_param);
+
+/**
+ * Finds a parameter in \a params and copies it to \a copy.
+ *
+ * Leading spaces are skipped. Next space or end of string is the
+ * parameter terminator with the exception that spaces inside single or double
+ * quotes get included into a parameter. The parameter is copied into \a copy
+ * which has to be allocated big enough by a caller, quotes are stripped in
+ * the copy and the copy is terminated by 0.
+ *
+ * On return \a params is set to next parameter or to NULL if last
+ * parameter is returned.  That NULL may be passed straight back in: the
+ * call then returns 1, so the function can drive a loop on its own.
+ *
+ * \retval 0 if parameter is returned in \a copy
+ * \retval 1 otherwise
+ * \retval -EINVAL if unbalanced quote is found; \a copy is then left
+ *	   unterminated and \a params is not changed
+ */
+int class_get_next_param(char **params, char *copy)
+{
+	char *q1, *q2, *str;
+	int len;
+
+	str = *params;
+	/* the previous call handed out the last parameter */
+	if (str == NULL)
+		return 1;
+
+	while (*str == ' ')
+		str++;
+
+	if (*str == '\0') {
+		*params = NULL;
+		return 1;
+	}
+
+	while (1) {
+		q1 = strpbrk(str, " '\"");
+		len = (q1 != NULL) ? (int)(q1 - str) : (int)strlen(str);
+		if (q1 == NULL || *q1 == ' ') {
+			/* parameter ends here; q1 is the resume point or NULL */
+			memcpy(copy, str, len);
+			copy[len] = '\0';
+			*params = q1;
+			return 0;
+		}
+
+		/* keep the text in front of the opening quote ... */
+		memcpy(copy, str, len);
+		copy += len;
+
+		/* ... then everything up to the matching closing quote */
+		str = q1 + 1;
+		q2 = strchr(str, *q1);
+		if (q2 == NULL) {
+			CERROR("Unbalanced quote in parameters: \"%s\"\n",
+			       *params);
+			return -EINVAL;
+		}
+		len = q2 - str;
+		memcpy(copy, str, len);
+		copy += len;
+		str = q2 + 1;
+	}
+}
+EXPORT_SYMBOL(class_get_next_param);
+
+/* returns 0 if \a buf starts with \a key, else 1 (also for a NULL \a buf).
+   On a match *valp, if wanted, points to first char after key. */
+int class_match_param(char *buf, char *key, char **valp)
+{
+	if (!buf)
+		return 1;
+
+	/* strncmp() stops at buf's terminator; memcmp() could run past it */
+	if (strncmp(buf, key, strlen(key)) != 0)
+		return 1;
+
+	if (valp)
+		*valp = buf + strlen(key);
+	return 0;
+}
+EXPORT_SYMBOL(class_match_param);
+
+/* Store the NID parsed from \a buf in \a value; LNET_NID_ANY is a failure. */
+static int parse_nid(char *buf, void *value, int quiet)
+{
+	lnet_nid_t parsed = libcfs_str2nid(buf);
+
+	*(lnet_nid_t *)value = parsed;
+	if (parsed != LNET_NID_ANY)
+		return 0;
+	if (!quiet)
+		LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", buf);
+	return -EINVAL;
+}
+
+/* Store the net parsed from \a buf in \a value (a __u32); always returns 0. */
+static int parse_net(char *buf, void *value)
+{
+	*(__u32 *)value = libcfs_str2net(buf);
+	CDEBUG(D_INFO, "Net %s\n", libcfs_net2str(*(__u32 *)value));
+
+	return 0;
+}
+
+enum {
+	CLASS_PARSE_NID = 1,	/* class_parse_value(): value is a lnet_nid_t * */
+	CLASS_PARSE_NET,	/* class_parse_value(): value is a __u32 * */
+};
+
+/*
+ * Parse the first NID or net (per \a opc) in \a buf into \a value.
+ * Leading ',' / ':' are skipped and the value ends at ',', ':', ' ', '/'
+ * or the end of the string; \a buf is cut there while parsing and restored.
+ * Returns 0 on success (*endh, if wanted, is set to that end), 1 if \a buf
+ * is NULL or no value starts here, < 0 if parse_nid() rejects the value.
+ */
+static int class_parse_value(char *buf, int opc, void *value, char **endh,
+			     int quiet)
+{
+	char *end;
+	char saved;
+	int rc;
+
+	if (buf == NULL)
+		return 1;
+
+	while (*buf == ',' || *buf == ':')
+		buf++;
+	if (*buf == '\0' || *buf == ' ' || *buf == '/')
+		return 1;
+
+	end = strpbrk(buf, ",: /");
+	if (end == NULL)
+		end = buf + strlen(buf);
+
+	/* cut the value out in place for the parser, then undo the cut */
+	saved = *end;
+	*end = '\0';
+	if (opc == CLASS_PARSE_NET) {
+		rc = parse_net(buf, value);
+	} else {
+		if (opc != CLASS_PARSE_NID)
+			LBUG();
+		rc = parse_nid(buf, value, quiet);
+	}
+	*end = saved;
+
+	if (rc == 0 && endh != NULL)
+		*endh = end;
+	return rc;
+}
+
+int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh)
+{
+	return class_parse_value(buf, CLASS_PARSE_NID, nid, endh, 0 /* loud */);
+}
+EXPORT_SYMBOL(class_parse_nid);
+
+int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh)
+{
+	return class_parse_value(buf, CLASS_PARSE_NID, nid, endh, 1);
+}
+EXPORT_SYMBOL(class_parse_nid_quiet);
+
+int class_parse_net(char *buf, __u32 *net, char **endh)
+{
+	return class_parse_value(buf, CLASS_PARSE_NET, net, endh, 0);
+}
+EXPORT_SYMBOL(class_parse_net);
+
+/* Look for \a nid among the NIDs following each \a key found in \a buf.
+ * 1 param contains key and match
+ * 0 param contains key and not match
+ * -1 param does not contain key
+ */
+int class_match_nid(char *buf, char *key, lnet_nid_t nid)
+{
+	lnet_nid_t cur;
+	int seen_key = 0;
+
+	while (class_find_param(buf, key, &buf) == 0) {
+		seen_key = 1;
+		/* walk the NIDs listed right after this occurrence of key */
+		while (class_parse_nid(buf, &cur, &buf) == 0) {
+			if (cur == nid)
+				return 1;
+		}
+	}
+	return seen_key ? 0 : -1;
+}
+EXPORT_SYMBOL(class_match_nid);
+
+/* Same as class_match_nid(), but compares parsed networks with \a net:
+ * 1 = key with matching net, 0 = key without match, -1 = no key. */
+int class_match_net(char *buf, char *key, __u32 net)
+{
+	__u32 cur;
+	int seen_key = 0;
+
+	while (class_find_param(buf, key, &buf) == 0) {
+		seen_key = 1;
+		while (class_parse_net(buf, &cur, &buf) == 0) {
+			if (cur == net)
+				return 1;
+		}
+	}
+	return seen_key ? 0 : -1;
+}
+EXPORT_SYMBOL(class_match_net);
+
+/********************** class fns **********************/
+
+/**
+ * Create a new obd device and set the type, name and uuid (\a lcfg buffers
+ * 1, 0 and 2).  If successful, it can be accessed by either name or uuid.
+ */
+int class_attach(struct lustre_cfg *lcfg)
+{
+	struct obd_device *obd = NULL;
+	char *typename, *name, *uuid;
+	int rc, len;
+	ENTRY;
+
+	if (!LUSTRE_CFG_BUFLEN(lcfg, 1)) {
+		CERROR("No type passed!\n");
+		RETURN(-EINVAL);
+	}
+	typename = lustre_cfg_string(lcfg, 1);
+
+	if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) {
+		CERROR("No name passed!\n");
+		RETURN(-EINVAL);
+	}
+	name = lustre_cfg_string(lcfg, 0);
+
+	if (!LUSTRE_CFG_BUFLEN(lcfg, 2)) {
+		CERROR("No UUID passed!\n");
+		RETURN(-EINVAL);
+	}
+	uuid = lustre_cfg_string(lcfg, 2);
+
+	CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n",
+	       MKSTR(typename), MKSTR(name), MKSTR(uuid));
+
+	obd = class_newdev(typename, name);
+	if (IS_ERR(obd)) {
+		/* Already exists or out of obds */
+		rc = PTR_ERR(obd);
+		obd = NULL;
+		CERROR("Cannot create device %s of type %s : %d\n",
+		       name, typename, rc);
+		GOTO(out, rc);
+	}
+	LASSERTF(obd != NULL, "Cannot get obd device %s of type %s\n",
+		 name, typename);
+	LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+		 "obd %p obd_magic %08X != %08X\n",
+		 obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+	LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0,
+		 "%p obd_name %s != %s\n", obd, obd->obd_name, name);
+
+	rwlock_init(&obd->obd_pool_lock);
+	obd->obd_pool_limit = 0;
+	obd->obd_pool_slv = 0;
+
+	INIT_LIST_HEAD(&obd->obd_exports);
+	INIT_LIST_HEAD(&obd->obd_unlinked_exports);
+	INIT_LIST_HEAD(&obd->obd_delayed_exports);
+	INIT_LIST_HEAD(&obd->obd_exports_timed);
+	INIT_LIST_HEAD(&obd->obd_nid_stats);
+	spin_lock_init(&obd->obd_nid_lock);
+	spin_lock_init(&obd->obd_dev_lock);
+	mutex_init(&obd->obd_dev_mutex);
+	spin_lock_init(&obd->obd_osfs_lock);
+	/* obd->obd_osfs_age must be set to a value in the distant
+	 * past to guarantee a fresh statfs is fetched on mount. */
+	obd->obd_osfs_age = cfs_time_shift_64(-1000);
+
+	/* XXX belongs in setup not attach  */
+	init_rwsem(&obd->obd_observer_link_sem);
+	/* recovery data */
+	cfs_init_timer(&obd->obd_recovery_timer);
+	spin_lock_init(&obd->obd_recovery_task_lock);
+	init_waitqueue_head(&obd->obd_next_transno_waitq);
+	init_waitqueue_head(&obd->obd_evict_inprogress_waitq);
+	INIT_LIST_HEAD(&obd->obd_req_replay_queue);
+	INIT_LIST_HEAD(&obd->obd_lock_replay_queue);
+	INIT_LIST_HEAD(&obd->obd_final_req_queue);
+	INIT_LIST_HEAD(&obd->obd_evict_list);
+
+	llog_group_init(&obd->obd_olg, FID_SEQ_LLOG);
+
+	obd->obd_conn_inprogress = 0;
+
+	len = strlen(uuid);
+	if (len >= sizeof(obd->obd_uuid)) {
+		CERROR("uuid must be < %d bytes long\n",
+		       (int)sizeof(obd->obd_uuid));
+		GOTO(out, rc = -EINVAL);
+	}
+	memcpy(obd->obd_uuid.uuid, uuid, len);	/* without the NUL */
+
+	/* do the type-specific attach; any failure is reported as -EINVAL */
+	if (OBP(obd, attach)) {
+		rc = OBP(obd,attach)(obd, sizeof *lcfg, lcfg);
+		if (rc)
+			GOTO(out, rc = -EINVAL);
+	}
+
+	/* Initial reference; detach drops this */
+	spin_lock(&obd->obd_dev_lock);
+	atomic_set(&obd->obd_refcount, 1);
+	spin_unlock(&obd->obd_dev_lock);
+	lu_ref_init(&obd->obd_reference);
+	lu_ref_add(&obd->obd_reference, "attach", obd);
+
+	obd->obd_attached = 1;
+	CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n",
+	       obd->obd_minor, typename, atomic_read(&obd->obd_refcount));
+	RETURN(0);
+ out:	/* error exit: release the device created above, if any */
+	if (obd != NULL) {
+		class_release_dev(obd);
+	}
+	return rc;
+}
+EXPORT_SYMBOL(class_attach);
+
+/** Create the uuid, nid and nid-stats hashes plus the self-export, then
+ * call the type-specific obd_setup(); this effectively starts the obd.
+ */
+int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	int err = 0;
+	struct obd_export *exp;
+	ENTRY;
+
+	LASSERT(obd != NULL);
+	LASSERTF(obd == class_num2obd(obd->obd_minor),
+		 "obd %p != obd_devs[%d] %p\n",
+		 obd, obd->obd_minor, class_num2obd(obd->obd_minor));
+	LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+		 "obd %p obd_magic %08x != %08x\n",
+		 obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+
+	/* have we attached a type to this device? */
+	if (!obd->obd_attached) {
+		CERROR("Device %d not attached\n", obd->obd_minor);
+		RETURN(-ENODEV);
+	}
+
+	if (obd->obd_set_up) {
+		CERROR("Device %d already setup (type %s)\n",
+		       obd->obd_minor, obd->obd_type->typ_name);
+		RETURN(-EEXIST);
+	}
+
+	/* is someone else setting us up right now? (attach inits spinlock) */
+	spin_lock(&obd->obd_dev_lock);
+	if (obd->obd_starting) {
+		spin_unlock(&obd->obd_dev_lock);
+		CERROR("Device %d setup in progress (type %s)\n",
+		       obd->obd_minor, obd->obd_type->typ_name);
+		RETURN(-EEXIST);
+	}
+	/* Mark setup in progress (cleared only on failure below).  obd_set_up
+	   can't be used: other fns check it, and we're not set up yet. */
+	obd->obd_starting = 1;
+	obd->obd_uuid_hash = NULL;
+	obd->obd_nid_hash = NULL;
+	obd->obd_nid_stats_hash = NULL;
+	spin_unlock(&obd->obd_dev_lock);
+
+	/* create an uuid-export lustre hash */
+	obd->obd_uuid_hash = cfs_hash_create("UUID_HASH",
+					     HASH_UUID_CUR_BITS,
+					     HASH_UUID_MAX_BITS,
+					     HASH_UUID_BKT_BITS, 0,
+					     CFS_HASH_MIN_THETA,
+					     CFS_HASH_MAX_THETA,
+					     &uuid_hash_ops, CFS_HASH_DEFAULT);
+	if (!obd->obd_uuid_hash)
+		GOTO(err_hash, err = -ENOMEM);
+
+	/* create a nid-export lustre hash */
+	obd->obd_nid_hash = cfs_hash_create("NID_HASH",
+					    HASH_NID_CUR_BITS,
+					    HASH_NID_MAX_BITS,
+					    HASH_NID_BKT_BITS, 0,
+					    CFS_HASH_MIN_THETA,
+					    CFS_HASH_MAX_THETA,
+					    &nid_hash_ops, CFS_HASH_DEFAULT);
+	if (!obd->obd_nid_hash)
+		GOTO(err_hash, err = -ENOMEM);
+
+	/* create a nid-stats lustre hash */
+	obd->obd_nid_stats_hash = cfs_hash_create("NID_STATS",
+						  HASH_NID_STATS_CUR_BITS,
+						  HASH_NID_STATS_MAX_BITS,
+						  HASH_NID_STATS_BKT_BITS, 0,
+						  CFS_HASH_MIN_THETA,
+						  CFS_HASH_MAX_THETA,
+						  &nid_stat_hash_ops, CFS_HASH_DEFAULT);
+	if (!obd->obd_nid_stats_hash)
+		GOTO(err_hash, err = -ENOMEM);
+
+	exp = class_new_export(obd, &obd->obd_uuid);
+	if (IS_ERR(exp))
+		GOTO(err_hash, err = PTR_ERR(exp));
+
+	obd->obd_self_export = exp;
+	list_del_init(&exp->exp_obd_chain_timed);
+	class_export_put(exp);
+
+	err = obd_setup(obd, lcfg);
+	if (err)
+		GOTO(err_exp, err);
+
+	obd->obd_set_up = 1;
+
+	spin_lock(&obd->obd_dev_lock);
+	/* class_cleanup() drops this "setup" reference */
+	class_incref(obd, "setup", obd);
+	spin_unlock(&obd->obd_dev_lock);
+
+	CDEBUG(D_IOCTL, "finished setup of obd %s (uuid %s)\n",
+	       obd->obd_name, obd->obd_uuid.uuid);
+
+	RETURN(0);
+err_exp:
+	if (obd->obd_self_export) {
+		class_unlink_export(obd->obd_self_export);
+		obd->obd_self_export = NULL;
+	}
+err_hash:
+	if (obd->obd_uuid_hash) {
+		cfs_hash_putref(obd->obd_uuid_hash);
+		obd->obd_uuid_hash = NULL;
+	}
+	if (obd->obd_nid_hash) {
+		cfs_hash_putref(obd->obd_nid_hash);
+		obd->obd_nid_hash = NULL;
+	}
+	if (obd->obd_nid_stats_hash) {
+		cfs_hash_putref(obd->obd_nid_stats_hash);
+		obd->obd_nid_stats_hash = NULL;
+	}
+	obd->obd_starting = 0;
+	CERROR("setup %s failed (%d)\n", obd->obd_name, err);
+	return err;
+}
+EXPORT_SYMBOL(class_setup);
+
+/** We have finished using this obd: clear obd_attached and drop the
+ * "attach" reference.  -EBUSY if still set up, -ENODEV if not attached.
+ */
+int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	ENTRY;
+
+	if (obd->obd_set_up) {
+		CERROR("OBD device %d still set up\n", obd->obd_minor);
+		RETURN(-EBUSY);
+	}
+
+	spin_lock(&obd->obd_dev_lock);
+	if (!obd->obd_attached) {
+		spin_unlock(&obd->obd_dev_lock);
+		CERROR("OBD device %d not attached\n", obd->obd_minor);
+		RETURN(-ENODEV);
+	}
+	obd->obd_attached = 0;
+	spin_unlock(&obd->obd_dev_lock);
+
+	CDEBUG(D_IOCTL, "detach on obd %s (uuid %s)\n",
+	       obd->obd_name, obd->obd_uuid.uuid);
+
+	class_decref(obd, "attach", obd);
+	RETURN(0);
+}
+EXPORT_SYMBOL(class_detach);
+
+/** Start shutting down the obd.  There may be in-progress ops when
+ * this is called.  We tell them to start shutting down with a call
+ * to class_disconnect_exports().
+ */
+int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	int err = 0;
+	char *flag;
+	ENTRY;
+
+	OBD_RACE(OBD_FAIL_LDLM_RECOV_CLIENTS);
+
+	if (!obd->obd_set_up) {
+		CERROR("Device %d not setup\n", obd->obd_minor);
+		RETURN(-ENODEV);
+	}
+
+	spin_lock(&obd->obd_dev_lock);
+	if (obd->obd_stopping) {
+		spin_unlock(&obd->obd_dev_lock);
+		CERROR("OBD %d already stopping\n", obd->obd_minor);
+		RETURN(-ENODEV);
+	}
+	/* Leave this on forever */
+	obd->obd_stopping = 1;
+
+	/* wait (dropping the lock each round) for in-flight connects to end */
+	while (obd->obd_conn_inprogress > 0) {
+		spin_unlock(&obd->obd_dev_lock);
+
+		cond_resched();
+
+		spin_lock(&obd->obd_dev_lock);
+	}
+	spin_unlock(&obd->obd_dev_lock);
+
+	if (lcfg->lcfg_bufcount >= 2 && LUSTRE_CFG_BUFLEN(lcfg, 1) > 0) {
+		for (flag = lustre_cfg_string(lcfg, 1); *flag != 0; flag++)
+			switch (*flag) {
+			case 'F':
+				obd->obd_force = 1;
+				break;
+			case 'A':
+				LCONSOLE_WARN("Failing over %s\n",
+					      obd->obd_name);
+				obd->obd_fail = 1;
+				obd->obd_no_transno = 1;
+				obd->obd_no_recov = 1;
+				if (OBP(obd, iocontrol)) {
+					obd_iocontrol(OBD_IOC_SYNC,
+						      obd->obd_self_export,
+						      0, NULL, NULL);
+				}
+				break;
+			default:
+				CERROR("Unrecognised flag '%c'\n", *flag);
+			}
+	}
+
+	LASSERT(obd->obd_self_export);
+
+	/* The three references that should be remaining are the
+	 * obd_self_export and the attach and setup references. */
+	if (atomic_read(&obd->obd_refcount) > 3) {
+		/* refcount - 3 might be the number of real exports
+		   (excluding self export). But class_incref is called
+		   by other things as well, so don't count on it. */
+		CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d\n",
+		       obd->obd_name, atomic_read(&obd->obd_refcount) - 3);
+		dump_exports(obd, 0);
+		class_disconnect_exports(obd);
+	}
+
+	/* Precleanup, we must make sure all exports get destroyed. */
+	err = obd_precleanup(obd, OBD_CLEANUP_EXPORTS);
+	if (err)
+		CERROR("Precleanup %s returned %d\n",
+		       obd->obd_name, err);
+
+	/* destroy an uuid-export hash body */
+	if (obd->obd_uuid_hash) {
+		cfs_hash_putref(obd->obd_uuid_hash);
+		obd->obd_uuid_hash = NULL;
+	}
+
+	/* destroy a nid-export hash body */
+	if (obd->obd_nid_hash) {
+		cfs_hash_putref(obd->obd_nid_hash);
+		obd->obd_nid_hash = NULL;
+	}
+
+	/* destroy a nid-stats hash body */
+	if (obd->obd_nid_stats_hash) {
+		cfs_hash_putref(obd->obd_nid_stats_hash);
+		obd->obd_nid_stats_hash = NULL;
+	}
+
+	class_decref(obd, "setup", obd);
+	obd->obd_set_up = 0;
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(class_cleanup);
+
+/** Take a reference on \a obd (tagged \a scope / \a source); returns it. */
+struct obd_device *class_incref(struct obd_device *obd,
+				const char *scope, const void *source)
+{
+	lu_ref_add_atomic(&obd->obd_reference, scope, source);
+	atomic_inc(&obd->obd_refcount);
+	CDEBUG(D_INFO, "incref %s (%p) now %d\n", obd->obd_name, obd,
+	       atomic_read(&obd->obd_refcount));
+	return obd;
+}
+EXPORT_SYMBOL(class_incref);
+
+/** Drop an obd reference; the last put cleans up and releases the device. */
+void class_decref(struct obd_device *obd, const char *scope, const void *source)
+{
+	int err;
+	int refs;
+
+	spin_lock(&obd->obd_dev_lock);
+	atomic_dec(&obd->obd_refcount);
+	refs = atomic_read(&obd->obd_refcount);
+	spin_unlock(&obd->obd_dev_lock);
+	lu_ref_del(&obd->obd_reference, scope, source);
+
+	CDEBUG(D_INFO, "Decref %s (%p) now %d\n", obd->obd_name, obd, refs);
+
+	if ((refs == 1) && obd->obd_stopping) {
+		/* All exports have been destroyed; there should
+		   be no more in-progress ops by this point.*/
+		spin_lock(&obd->obd_self_export->exp_lock);
+		obd->obd_self_export->exp_flags |= exp_flags_from_obd(obd);
+		spin_unlock(&obd->obd_self_export->exp_lock);
+
+		/* note that we'll recurse into class_decref again */
+		class_unlink_export(obd->obd_self_export);
+		return;
+	}
+
+	if (refs == 0) {
+		CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n",
+		       obd->obd_name, obd->obd_uuid.uuid);
+		LASSERT(!obd->obd_attached);
+		if (obd->obd_stopping) {
+			/* If we're not stopping, we were never set up */
+			err = obd_cleanup(obd);
+			if (err)
+				CERROR("Cleanup %s returned %d\n",
+				       obd->obd_name, err);
+		}
+		if (OBP(obd, detach)) {
+			err = OBP(obd, detach)(obd);
+			if (err)
+				CERROR("Detach returned %d\n", err);
+		}
+		class_release_dev(obd);
+	}
+}
+EXPORT_SYMBOL(class_decref);
+
+/** Add a failover nid location (uuid in cfg buffer 1) to a client import.
+ * Client obd types contact server obd types using this nid list.
+ */
+int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	struct obd_import *imp;
+	struct obd_uuid uuid;
+	int rc;
+	ENTRY;
+
+	if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 ||
+	    LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) {
+		CERROR("invalid conn_uuid\n");
+		RETURN(-EINVAL);
+	}
+	if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) &&
+	    strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) &&
+	    strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) &&
+	    strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) &&
+	    strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME)) {
+		CERROR("can't add connection on non-client dev\n");
+		RETURN(-EINVAL);
+	}
+
+	imp = obd->u.cli.cl_import;
+	if (!imp) {
+		CERROR("try to add conn on immature client dev\n");
+		RETURN(-EINVAL);
+	}
+
+	obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1));
+	rc = obd_add_conn(imp, &uuid, lcfg->lcfg_num);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(class_add_conn);
+
+/** Remove a failover nid location (accepted for MDC and OSC devices only).
+ */
+int class_del_conn(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	struct obd_import *imp;
+	struct obd_uuid uuid;
+	int rc;
+	ENTRY;
+
+	if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 ||
+	    LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) {
+		CERROR("invalid conn_uuid\n");
+		RETURN(-EINVAL);
+	}
+	if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) &&
+	    strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME)) {
+		CERROR("can't del connection on non-client dev\n");
+		RETURN(-EINVAL);
+	}
+
+	imp = obd->u.cli.cl_import;
+	if (!imp) {
+		CERROR("try to del conn on immature client dev\n");
+		RETURN(-EINVAL);
+	}
+
+	obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1));
+	rc = obd_del_conn(imp, &uuid);
+
+	RETURN(rc);
+}
+
+LIST_HEAD(lustre_profile_list);
+
+/** Find the profile called \a prof on lustre_profile_list.
+ * Returns the matching entry, or NULL if there is none. */
+struct lustre_profile *class_get_profile(const char * prof)
+{
+	struct lustre_profile *lprof;
+
+	ENTRY;
+	list_for_each_entry(lprof, &lustre_profile_list, lp_list)
+		if (strcmp(lprof->lp_profile, prof) == 0)
+			RETURN(lprof);
+	RETURN(NULL);
+}
+EXPORT_SYMBOL(class_get_profile);
+
+/** Create a named "profile" and link it onto lustre_profile_list.
+ * This defines the mdc and osc names to use for a client, and also the
+ * lov to be used by a mdt.  Returns 0, or -ENOMEM if an allocation fails.
+ */
+int class_add_profile(int proflen, char *prof, int osclen, char *osc,
+		      int mdclen, char *mdc)
+{
+	struct lustre_profile *lprof;
+	int err = 0;
+	ENTRY;
+
+	CDEBUG(D_CONFIG, "Add profile %s\n", prof);
+
+	OBD_ALLOC(lprof, sizeof(*lprof));
+	if (lprof == NULL)
+		RETURN(-ENOMEM);
+	INIT_LIST_HEAD(&lprof->lp_list);
+
+	LASSERT(proflen == (strlen(prof) + 1));
+	OBD_ALLOC(lprof->lp_profile, proflen);
+	if (lprof->lp_profile == NULL)
+		GOTO(out, err = -ENOMEM);
+	memcpy(lprof->lp_profile, prof, proflen);
+
+	LASSERT(osclen == (strlen(osc) + 1));
+	OBD_ALLOC(lprof->lp_dt, osclen);
+	if (lprof->lp_dt == NULL)
+		GOTO(out, err = -ENOMEM);
+	memcpy(lprof->lp_dt, osc, osclen);
+
+	if (mdclen > 0) {
+		LASSERT(mdclen == (strlen(mdc) + 1));
+		OBD_ALLOC(lprof->lp_md, mdclen);
+		if (lprof->lp_md == NULL)
+			GOTO(out, err = -ENOMEM);
+		memcpy(lprof->lp_md, mdc, mdclen);
+	}
+
+	list_add(&lprof->lp_list, &lustre_profile_list);
+	RETURN(err);
+
+out:
+	if (lprof->lp_md)
+		OBD_FREE(lprof->lp_md, mdclen);
+	if (lprof->lp_dt)
+		OBD_FREE(lprof->lp_dt, osclen);
+	if (lprof->lp_profile)
+		OBD_FREE(lprof->lp_profile, proflen);
+	OBD_FREE(lprof, sizeof(*lprof));
+	RETURN(err);
+}
+
+/** Unlink the profile named \a prof, if present, and free it. */
+void class_del_profile(const char *prof)
+{
+	struct lustre_profile *lprof;
+	ENTRY;
+
+	CDEBUG(D_CONFIG, "Del profile %s\n", prof);
+	lprof = class_get_profile(prof);
+	if (lprof) {
+		list_del(&lprof->lp_list);
+		OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1);
+		OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1);
+		if (lprof->lp_md)
+			OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1);
+		OBD_FREE(lprof, sizeof *lprof);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(class_del_profile);
+
+/* COMPAT_146: unlink and free every profile on lustre_profile_list */
+void class_del_profiles(void)
+{
+	struct lustre_profile *lprof, *n;
+	ENTRY;
+
+	list_for_each_entry_safe(lprof, n, &lustre_profile_list, lp_list) {
+		list_del(&lprof->lp_list);
+		OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1);
+		OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1);
+		if (lprof->lp_md)
+			OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1);
+		OBD_FREE(lprof, sizeof *lprof);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(class_del_profiles);
+
+/** Set the global tunable named by \a ptr; -EINVAL if it is unknown. */
+static int class_set_global(char *ptr, int val, struct lustre_cfg *lcfg)
+{
+	ENTRY;
+	if (class_match_param(ptr, PARAM_AT_MIN, NULL) == 0)
+		at_min = val;
+	else if (class_match_param(ptr, PARAM_AT_MAX, NULL) == 0)
+		at_max = val;
+	else if (class_match_param(ptr, PARAM_AT_EXTRA, NULL) == 0)
+		at_extra = val;
+	else if (class_match_param(ptr, PARAM_AT_EARLY_MARGIN, NULL) == 0)
+		at_early_margin = val;
+	else if (class_match_param(ptr, PARAM_AT_HISTORY, NULL) == 0)
+		at_history = val;
+	else if (class_match_param(ptr, PARAM_JOBID_VAR, NULL) == 0)
+		strlcpy(obd_jobid_var, lustre_cfg_string(lcfg, 2),
+			JOBSTATS_JOBID_VAR_MAX_LEN + 1);
+	else
+		RETURN(-EINVAL);
+
+	CDEBUG(D_IOCTL, "global %s = %d\n", ptr, val);
+	RETURN(0);
+}
+
+/* We can't call ll_process_config or lquota_process_config directly because
+ * they live in modules that must be loaded after this one. */
+static int (*client_process_config)(struct lustre_cfg *lcfg) = NULL;
+static int (*quota_process_config)(struct lustre_cfg *lcfg) = NULL;
+
+void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg))
+{
+	client_process_config = cpc;	/* used by class_process_config() */
+}
+EXPORT_SYMBOL(lustre_register_client_process_config);
+
+/**
+ * Rename the proc parameter in \a cfg with a new name \a new_name.
+ *
+ * \param cfg	   config structure which contains the proc parameter
+ * \param new_name new name of the proc parameter
+ *
+ * \retval valid-pointer    pointer to the newly-allocated config structure
+ *			    which contains the renamed proc parameter
+ * \retval ERR_PTR(-EINVAL) if \a cfg or \a new_name is NULL, or \a cfg does
+ *			    not contain a proc parameter
+ * \retval ERR_PTR(-ENOMEM) if memory allocation failure occurs
+ */
+struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg,
+				     const char *new_name)
+{
+	struct lustre_cfg_bufs	*bufs = NULL;
+	struct lustre_cfg	*new_cfg = NULL;
+	char			*param = NULL;
+	char			*new_param = NULL;
+	char			*value = NULL;
+	int			 name_len = 0;
+	int			 new_len = 0;
+	ENTRY;
+
+	if (cfg == NULL || new_name == NULL)
+		RETURN(ERR_PTR(-EINVAL));
+
+	param = lustre_cfg_string(cfg, 1);
+	if (param == NULL)
+		RETURN(ERR_PTR(-EINVAL));
+
+	value = strchr(param, '=');
+	if (value == NULL)
+		name_len = strlen(param);
+	else
+		name_len = value - param;
+
+	new_len = LUSTRE_CFG_BUFLEN(cfg, 1) + strlen(new_name) - name_len;
+
+	OBD_ALLOC(new_param, new_len);
+	if (new_param == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	strcpy(new_param, new_name);
+	if (value != NULL)
+		strcat(new_param, value);
+
+	OBD_ALLOC_PTR(bufs);
+	if (bufs == NULL) {
+		OBD_FREE(new_param, new_len);
+		RETURN(ERR_PTR(-ENOMEM));
+	}
+
+	lustre_cfg_bufs_reset(bufs, NULL);
+	lustre_cfg_bufs_init(bufs, cfg);
+	lustre_cfg_bufs_set_string(bufs, 1, new_param);
+
+	new_cfg = lustre_cfg_new(cfg->lcfg_command, bufs);
+	/* free the temporaries first; new_cfg may be NULL or an ERR_PTR */
+	OBD_FREE(new_param, new_len);
+	OBD_FREE_PTR(bufs);
+	if (IS_ERR_OR_NULL(new_cfg))
+		RETURN(new_cfg != NULL ? new_cfg : ERR_PTR(-ENOMEM));
+
+	new_cfg->lcfg_num = cfg->lcfg_num;
+	new_cfg->lcfg_flags = cfg->lcfg_flags;
+	new_cfg->lcfg_nid = cfg->lcfg_nid;
+	new_cfg->lcfg_nal = cfg->lcfg_nal;
+
+	RETURN(new_cfg);
+}
+EXPORT_SYMBOL(lustre_cfg_rename);
+
+void lustre_register_quota_process_config(int (*qpc)(struct lustre_cfg *lcfg))
+{
+	quota_process_config = qpc;	/* used by class_process_config() */
+}
+EXPORT_SYMBOL(lustre_register_quota_process_config);
+
+/** Process configuration commands given in lustre_cfg form.
+ * These may come from direct calls (e.g. class_manual_cleanup), the config
+ * llog, or an lctl ioctl.  Errors on commands lacking LCFG_REQUIRED are
+ * only warned about and 0 is returned. */
+int class_process_config(struct lustre_cfg *lcfg)
+{
+	struct obd_device *obd;
+	int err;
+
+	LASSERT(lcfg && !IS_ERR(lcfg));
+	CDEBUG(D_IOCTL, "processing cmd: %x\n", lcfg->lcfg_command);
+
+	/* Commands that don't need a device */
+	switch(lcfg->lcfg_command) {
+	case LCFG_ATTACH: {
+		err = class_attach(lcfg);
+		GOTO(out, err);
+	}
+	case LCFG_ADD_UUID: {
+		CDEBUG(D_IOCTL, "adding mapping from uuid %s to nid "LPX64
+		       " (%s)\n", lustre_cfg_string(lcfg, 1),
+		       lcfg->lcfg_nid, libcfs_nid2str(lcfg->lcfg_nid));
+
+		err = class_add_uuid(lustre_cfg_string(lcfg, 1), lcfg->lcfg_nid);
+		GOTO(out, err);
+	}
+	case LCFG_DEL_UUID: {
+		CDEBUG(D_IOCTL, "removing mappings for uuid %s\n",
+		       (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) == 0)
+		       ? "<all uuids>" : lustre_cfg_string(lcfg, 1));
+
+		err = class_del_uuid(lustre_cfg_string(lcfg, 1));
+		GOTO(out, err);
+	}
+	case LCFG_MOUNTOPT: {
+		CDEBUG(D_IOCTL, "mountopt: profile %s osc %s mdc %s\n",
+		       lustre_cfg_string(lcfg, 1),
+		       lustre_cfg_string(lcfg, 2),
+		       lustre_cfg_string(lcfg, 3));
+		/* set these mount options somewhere, so ll_fill_super
+		 * can find them. */
+		err = class_add_profile(LUSTRE_CFG_BUFLEN(lcfg, 1),
+					lustre_cfg_string(lcfg, 1),
+					LUSTRE_CFG_BUFLEN(lcfg, 2),
+					lustre_cfg_string(lcfg, 2),
+					LUSTRE_CFG_BUFLEN(lcfg, 3),
+					lustre_cfg_string(lcfg, 3));
+		GOTO(out, err);
+	}
+	case LCFG_DEL_MOUNTOPT: {
+		CDEBUG(D_IOCTL, "mountopt: profile %s\n",
+		       lustre_cfg_string(lcfg, 1));
+		class_del_profile(lustre_cfg_string(lcfg, 1));
+		GOTO(out, err = 0);
+	}
+	case LCFG_SET_TIMEOUT: {
+		CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n",
+		       obd_timeout, lcfg->lcfg_num);
+		obd_timeout = max(lcfg->lcfg_num, 1U);
+		obd_timeout_set = 1;
+		GOTO(out, err = 0);
+	}
+	case LCFG_SET_LDLM_TIMEOUT: {
+		CDEBUG(D_IOCTL, "changing lustre ldlm_timeout from %d to %d\n",
+		       ldlm_timeout, lcfg->lcfg_num);
+		ldlm_timeout = max(lcfg->lcfg_num, 1U);
+		if (ldlm_timeout >= obd_timeout)
+			ldlm_timeout = max(obd_timeout / 3, 1U);
+		ldlm_timeout_set = 1;
+		GOTO(out, err = 0);
+	}
+	case LCFG_SET_UPCALL: {
+		LCONSOLE_ERROR_MSG(0x15a, "recovery upcall is deprecated\n");
+		/* COMPAT_146 Don't fail on old configs */
+		GOTO(out, err = 0);
+	}
+	case LCFG_MARKER: {
+		struct cfg_marker *marker;
+		marker = lustre_cfg_buf(lcfg, 1);
+		CDEBUG(D_IOCTL, "marker %d (%#x) %.16s %s\n", marker->cm_step,
+		       marker->cm_flags, marker->cm_tgtname, marker->cm_comment);
+		GOTO(out, err = 0);
+	}
+	case LCFG_PARAM: {
+		char *tmp;
+		/* llite has no obd */
+		if ((class_match_param(lustre_cfg_string(lcfg, 1),
+				       PARAM_LLITE, 0) == 0) &&
+		    client_process_config) {
+			err = (*client_process_config)(lcfg);
+			GOTO(out, err);
+		} else if ((class_match_param(lustre_cfg_string(lcfg, 1),
+					      PARAM_SYS, &tmp) == 0)) {
+			/* Global param settings */
+			err = class_set_global(tmp, lcfg->lcfg_num, lcfg);
+			/*
+			 * Client or server should not fail to mount if
+			 * it hits an unknown configuration parameter.
+			 */
+			if (err != 0)
+				CWARN("Ignoring unknown param %s\n", tmp);
+
+			GOTO(out, err = 0);
+		} else if ((class_match_param(lustre_cfg_string(lcfg, 1),
+					      PARAM_QUOTA, &tmp) == 0) &&
+			   quota_process_config) {
+			err = (*quota_process_config)(lcfg);
+			GOTO(out, err);
+		}
+		/* Not handled above: continue with the device commands below */
+		break;
+	}
+	}
+
+	/* Commands that require a device */
+	obd = class_name2obd(lustre_cfg_string(lcfg, 0));
+	if (obd == NULL) {
+		if (!LUSTRE_CFG_BUFLEN(lcfg, 0))
+			CERROR("this lcfg command requires a device name\n");
+		else
+			CERROR("no device for: %s\n",
+			       lustre_cfg_string(lcfg, 0));
+
+		GOTO(out, err = -EINVAL);
+	}
+
+	switch(lcfg->lcfg_command) {
+	case LCFG_SETUP: {
+		err = class_setup(obd, lcfg);
+		GOTO(out, err);
+	}
+	case LCFG_DETACH: {
+		err = class_detach(obd, lcfg);
+		GOTO(out, err = 0);
+	}
+	case LCFG_CLEANUP: {
+		err = class_cleanup(obd, lcfg);
+		GOTO(out, err = 0);
+	}
+	case LCFG_ADD_CONN: {
+		err = class_add_conn(obd, lcfg);
+		GOTO(out, err = 0);
+	}
+	case LCFG_DEL_CONN: {
+		err = class_del_conn(obd, lcfg);
+		GOTO(out, err = 0);
+	}
+	case LCFG_POOL_NEW: {
+		err = obd_pool_new(obd, lustre_cfg_string(lcfg, 2));
+		GOTO(out, err = 0);
+		break;
+	}
+	case LCFG_POOL_ADD: {
+		err = obd_pool_add(obd, lustre_cfg_string(lcfg, 2),
+				   lustre_cfg_string(lcfg, 3));
+		GOTO(out, err = 0);
+		break;
+	}
+	case LCFG_POOL_REM: {
+		err = obd_pool_rem(obd, lustre_cfg_string(lcfg, 2),
+				   lustre_cfg_string(lcfg, 3));
+		GOTO(out, err = 0);
+		break;
+	}
+	case LCFG_POOL_DEL: {
+		err = obd_pool_del(obd, lustre_cfg_string(lcfg, 2));
+		GOTO(out, err = 0);
+		break;
+	}
+	default: {
+		err = obd_process_config(obd, sizeof(*lcfg), lcfg);
+		GOTO(out, err);
+
+	}
+	}
+out:
+	if ((err < 0) && !(lcfg->lcfg_command & LCFG_REQUIRED)) {
+		CWARN("Ignoring error %d on optional command %#x\n", err,
+		      lcfg->lcfg_command);
+		err = 0;
+	}
+	return err;
+}
+EXPORT_SYMBOL(class_process_config);
+
+/** Apply each "key=value" buffer of an LCFG_PARAM \a lcfg via \a lvars. */
+int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars,
+			     struct lustre_cfg *lcfg, void *data)
+{
+	struct lprocfs_vars *var;
+	char *key, *sval;
+	int i, keylen, vallen;
+	int matched = 0, j = 0;
+	int rc = 0;
+	int skip = 0;
+	ENTRY;
+
+	if (lcfg->lcfg_command != LCFG_PARAM) {
+		CERROR("Unknown command: %d\n", lcfg->lcfg_command);
+		RETURN(-EINVAL);
+	}
+	/* e.g. tunefs.lustre --param mdt.group_upcall=foo /r/tmp/lustre-mdt
+	   or   lctl conf_param lustre-MDT0000.mdt.group_upcall=bar
+	   or   lctl conf_param lustre-OST0000.osc.max_dirty_mb=36 */
+	for (i = 1; i < lcfg->lcfg_bufcount; i++) {
+		key = lustre_cfg_buf(lcfg, i);
+		/* Strip off prefix */
+		class_match_param(key, prefix, &key);
+		sval = strchr(key, '=');
+		if (!sval || (*(sval + 1) == 0)) {
+			CERROR("Can't parse param %s (missing '=')\n", key);
+			/* rc = -EINVAL;	continue parsing other params */
+			continue;
+		}
+		keylen = sval - key;
+		sval++;
+		vallen = strlen(sval);
+		matched = 0;
+		j = 0;
+		/* Search proc entries for one whose name equals the key */
+		while (lvars[j].name) {
+			var = &lvars[j];
+			if (class_match_param(key, (char *)var->name, 0) == 0 &&
+			    keylen == strlen(var->name)) {
+				matched++;
+				rc = -EROFS;
+				if (var->write_fptr) {
+					mm_segment_t oldfs;
+					oldfs = get_fs();
+					set_fs(KERNEL_DS);
+					rc = (var->write_fptr)(NULL, sval,
+							       vallen, data);
+					set_fs(oldfs);
+				}
+				break;
+			}
+			j++;
+		}
+		if (!matched) {
+			/* If the prefix doesn't match, return error so we
+			   can pass it down the stack */
+			if (strnchr(key, keylen, '.'))
+			    RETURN(-ENOSYS);
+			CERROR("%s: unknown param %s\n",
+			       (char *)lustre_cfg_string(lcfg, 0), key);
+			/* rc = -EINVAL;	continue parsing other params */
+			skip++;
+		} else if (rc < 0) {
+			CERROR("writing proc entry %s err %d\n",
+			       var->name, rc);
+			rc = 0;
+		} else {
+			CDEBUG(D_CONFIG, "%s.%.*s: Set parameter %.*s=%s\n",
+					 lustre_cfg_string(lcfg, 0),
+					 (int)strlen(prefix) - 1, prefix,
+					 (int)(sval - key - 1), key, sval);
+		}
+	}
+
+	if (rc > 0)
+		rc = 0;
+	if (!rc && skip)
+		rc = skip;	/* number of unknown params */
+	RETURN(rc);
+}
+EXPORT_SYMBOL(class_process_proc_param);
+
+extern int lustre_check_exclusion(struct super_block *sb, char *svname);
+
+/** llog callback: massage one config record (compatibility fixes, skipping
+ * obsolete records, changing uuids, etc) and hand the resulting lustre_cfg
+ * to class_process_config().  \a data is the config_llog_instance.
+ * NOTE(review): \a data is dereferenced unchecked although later tests
+ * allow for NULL - confirm callers never pass NULL. */
+int class_config_llog_handler(const struct lu_env *env,
+			      struct llog_handle *handle,
+			      struct llog_rec_hdr *rec, void *data)
+{
+	struct config_llog_instance *clli = data;
+	int cfg_len = rec->lrh_len;
+	char *cfg_buf = (char*) (rec + 1);
+	int rc = 0;
+	ENTRY;
+
+	switch (rec->lrh_type) {
+	case OBD_CFG_REC: {
+		struct lustre_cfg *lcfg, *lcfg_new;
+		struct lustre_cfg_bufs bufs;
+		char *inst_name = NULL;
+		int inst_len = 0;
+		int inst = 0, swab = 0;
+
+		lcfg = (struct lustre_cfg *)cfg_buf;
+		if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) {
+			lustre_swab_lustre_cfg(lcfg);
+			swab = 1;
+		}
+
+		rc = lustre_cfg_sanity_check(cfg_buf, cfg_len);
+		if (rc)
+			GOTO(out, rc);
+
+		/* Figure out config state info */
+		if (lcfg->lcfg_command == LCFG_MARKER) {
+			struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1);
+			lustre_swab_cfg_marker(marker, swab,
+					       LUSTRE_CFG_BUFLEN(lcfg, 1));
+			CDEBUG(D_CONFIG, "Marker, inst_flg=%#x mark_flg=%#x\n",
+			       clli->cfg_flags, marker->cm_flags);
+			if (marker->cm_flags & CM_START) {
+				/* all previous flags off */
+				clli->cfg_flags = CFG_F_MARKER;
+				if (marker->cm_flags & CM_SKIP) {
+					clli->cfg_flags |= CFG_F_SKIP;
+					CDEBUG(D_CONFIG, "SKIP #%d\n",
+					       marker->cm_step);
+				} else if ((marker->cm_flags & CM_EXCLUDE) ||
+					   (clli->cfg_sb &&
+					    lustre_check_exclusion(clli->cfg_sb,
+							 marker->cm_tgtname))) {
+					clli->cfg_flags |= CFG_F_EXCLUDE;
+					CDEBUG(D_CONFIG, "EXCLUDE %d\n",
+					       marker->cm_step);
+				}
+			} else if (marker->cm_flags & CM_END) {
+				clli->cfg_flags = 0;
+			}
+		}
+		/* A config command without a start marker before it is
+		   illegal (post 146) */
+		if (!(clli->cfg_flags & CFG_F_COMPAT146) &&
+		    !(clli->cfg_flags & CFG_F_MARKER) &&
+		    (lcfg->lcfg_command != LCFG_MARKER)) {
+			CWARN("Config not inside markers, ignoring! "
+			      "(inst: %p, uuid: %s, flags: %#x)\n",
+			      clli->cfg_instance,
+			      clli->cfg_uuid.uuid, clli->cfg_flags);
+			clli->cfg_flags |= CFG_F_SKIP;
+		}
+		if (clli->cfg_flags & CFG_F_SKIP) {
+			CDEBUG(D_CONFIG, "skipping %#x\n",
+			       clli->cfg_flags);
+			rc = 0;
+			/* No processing! */
+			break;
+		}
+
+		/* For interoperability between 1.8 and 2.0, rename the "mds"
+		 * obd device type to "mdt". */
+		{
+			char *typename = lustre_cfg_string(lcfg, 1);
+			char *index = lustre_cfg_string(lcfg, 2);
+
+			if ((lcfg->lcfg_command == LCFG_ATTACH && typename &&
+			     strcmp(typename, "mds") == 0)) {
+				CWARN("For 1.8 interoperability, rename obd "
+				       "type from mds to mdt\n");
+				typename[2] = 't';
+			}
+			if ((lcfg->lcfg_command == LCFG_SETUP && index &&
+			     strcmp(index, "type") == 0)) {
+				CDEBUG(D_INFO, "For 1.8 interoperability, "
+				       "set this index to '0'\n");
+				index[0] = '0';
+				index[1] = 0;
+			}
+		}
+
+		if ((clli->cfg_flags & CFG_F_EXCLUDE) &&
+		    (lcfg->lcfg_command == LCFG_LOV_ADD_OBD))
+			/* Add inactive instead */
+			lcfg->lcfg_command = LCFG_LOV_ADD_INA;
+
+		lustre_cfg_bufs_init(&bufs, lcfg);
+
+		if (clli && clli->cfg_instance &&
+		    LUSTRE_CFG_BUFLEN(lcfg, 0) > 0){
+			inst = 1;
+			inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) +
+				   sizeof(clli->cfg_instance) * 2 + 4;
+			OBD_ALLOC(inst_name, inst_len);
+			if (inst_name == NULL)
+				GOTO(out, rc = -ENOMEM);
+			sprintf(inst_name, "%s-%p",
+				lustre_cfg_string(lcfg, 0),
+				clli->cfg_instance);
+			lustre_cfg_bufs_set_string(&bufs, 0, inst_name);
+			CDEBUG(D_CONFIG, "cmd %x, instance name: %s\n",
+			       lcfg->lcfg_command, inst_name);
+		}
+
+		/* override the llog's uuid for clients so they are unique */
+		if (clli && clli->cfg_instance != NULL &&
+		    lcfg->lcfg_command == LCFG_ATTACH) {
+			lustre_cfg_bufs_set_string(&bufs, 2,
+						   clli->cfg_uuid.uuid);
+		}
+		/*
+		 * sptlrpc config record, we expect 2 data segments:
+		 *  [0]: fs_name/target_name,
+		 *  [1]: rule string
+		 * moving them to index [1] and [2], and insert MGC's
+		 * obdname at index [0].
+		 */
+		if (clli && clli->cfg_instance == NULL &&
+		    lcfg->lcfg_command == LCFG_SPTLRPC_CONF) {
+			lustre_cfg_bufs_set(&bufs, 2, bufs.lcfg_buf[1],
+					    bufs.lcfg_buflen[1]);
+			lustre_cfg_bufs_set(&bufs, 1, bufs.lcfg_buf[0],
+					    bufs.lcfg_buflen[0]);
+			lustre_cfg_bufs_set_string(&bufs, 0,
+						   clli->cfg_obdname);
+		}
+
+		lcfg_new = lustre_cfg_new(lcfg->lcfg_command, &bufs);
+		/* don't dereference a failed allocation (NULL or ERR_PTR) */
+		if (IS_ERR_OR_NULL(lcfg_new)) {
+			rc = lcfg_new != NULL ? PTR_ERR(lcfg_new) : -ENOMEM;
+			if (inst)
+				OBD_FREE(inst_name, inst_len);
+			GOTO(out, rc);
+		}
+
+		lcfg_new->lcfg_num   = lcfg->lcfg_num;
+		lcfg_new->lcfg_flags = lcfg->lcfg_flags;
+
+		/* XXX Hack: stay binary compatible with pre-newconfig logs */
+		if (lcfg->lcfg_nal != 0 &&      /* pre-newconfig log? */
+		    (lcfg->lcfg_nid >> 32) == 0) {
+			__u32 addr = (__u32)(lcfg->lcfg_nid & 0xffffffff);
+
+			lcfg_new->lcfg_nid =
+				LNET_MKNID(LNET_MKNET(lcfg->lcfg_nal, 0), addr);
+			CWARN("Converted pre-newconfig NAL %d NID %x to %s\n",
+			      lcfg->lcfg_nal, addr,
+			      libcfs_nid2str(lcfg_new->lcfg_nid));
+		} else {
+			lcfg_new->lcfg_nid = lcfg->lcfg_nid;
+		}
+
+		lcfg_new->lcfg_nal = 0; /* illegal value for obsolete field */
+
+		rc = class_process_config(lcfg_new);
+		lustre_cfg_free(lcfg_new);
+
+		if (inst)
+			OBD_FREE(inst_name, inst_len);
+		break;
+	}
+	default:
+		CERROR("Unknown llog record type %#x encountered\n",
+		       rec->lrh_type);
+		break;
+	}
+out:
+	if (rc) {
+		CERROR("%s: cfg command failed: rc = %d\n",
+		       handle->lgh_ctxt->loc_obd->obd_name, rc);
+		class_config_dump_handler(NULL, handle, rec, data);
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(class_config_llog_handler);
+
+/** Open llog \a name and process its records with the config callback. */
+int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
+			    char *name, struct config_llog_instance *cfg)
+{
+	struct llog_process_cat_data	 cd = {0, 0};
+	struct llog_handle		*llh;
+	llog_cb_t			 callback;
+	int				 rc;
+	ENTRY;
+
+	CDEBUG(D_INFO, "looking up llog %s\n", name);
+	rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS);
+	if (rc)
+		RETURN(rc);
+
+	rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL);
+	if (rc)
+		GOTO(parse_out, rc);
+
+	/* continue processing from where we last stopped to end-of-log */
+	if (cfg) {
+		cd.lpcd_first_idx = cfg->cfg_last_idx;
+		callback = cfg->cfg_callback;
+		LASSERT(callback != NULL);
+	} else {
+		callback = class_config_llog_handler;
+	}
+	cd.lpcd_last_idx = 0;
+
+	rc = llog_process(env, llh, callback, cfg, &cd);
+
+	CDEBUG(D_CONFIG, "Processed log %s gen %d-%d (rc=%d)\n", name,
+	       cd.lpcd_first_idx + 1, cd.lpcd_last_idx, rc);
+	if (cfg)
+		cfg->cfg_last_idx = cd.lpcd_last_idx;
+
+parse_out:
+	llog_close(env, llh);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(class_config_parse_llog);
+
+/**
+ * Format one OBD_CFG_REC record as text into \a buf, never storing more than
+ * \a size bytes.  Returns the number of characters stored, or the negative
+ * value from lustre_cfg_sanity_check().  Also used for ioctl needs.
+ */
+int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size)
+{
+	struct lustre_cfg	*lcfg = (struct lustre_cfg *)(rec + 1);
+	char			*ptr = buf;
+	char			*end = buf + size;
+	int			 rc = 0;
+
+	ENTRY;
+
+	LASSERT(rec->lrh_type == OBD_CFG_REC);
+	rc = lustre_cfg_sanity_check(lcfg, rec->lrh_len);
+	if (rc < 0)
+		RETURN(rc);
+	/* scnprintf() returns what was stored, so ptr can never pass end */
+	ptr += scnprintf(ptr, end-ptr, "cmd=%05x ", lcfg->lcfg_command);
+	if (lcfg->lcfg_flags)
+		ptr += scnprintf(ptr, end-ptr, "flags=%#08x ",
+				 lcfg->lcfg_flags);
+
+	if (lcfg->lcfg_num)
+		ptr += scnprintf(ptr, end-ptr, "num=%#08x ", lcfg->lcfg_num);
+
+	if (lcfg->lcfg_nid)
+		ptr += scnprintf(ptr, end-ptr, "nid=%s("LPX64")\n     ",
+				 libcfs_nid2str(lcfg->lcfg_nid),
+				 lcfg->lcfg_nid);
+
+	if (lcfg->lcfg_command == LCFG_MARKER) {
+		struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1);
+
+		ptr += scnprintf(ptr, end-ptr, "marker=%d(%#x)%s '%s'",
+				 marker->cm_step, marker->cm_flags,
+				 marker->cm_tgtname, marker->cm_comment);
+	} else {
+		int i;
+
+		for (i = 0; i <  lcfg->lcfg_bufcount; i++) {
+			ptr += scnprintf(ptr, end-ptr, "%d:%s  ", i,
+					 lustre_cfg_string(lcfg, i));
+		}
+	}
+	/* return consumed bytes */
+	rc = ptr - buf;
+	RETURN(rc);
+}
+
+int class_config_dump_handler(const struct lu_env *env,
+			      struct llog_handle *handle,
+			      struct llog_rec_hdr *rec, void *data)
+{
+	char	*outstr;
+	int	 rc = 0;
+
+	ENTRY;
+	/* scratch buffer for the textual form of one record */
+	OBD_ALLOC(outstr, 256);
+	if (outstr == NULL)
+		RETURN(-ENOMEM);
+
+	if (rec->lrh_type != OBD_CFG_REC) {
+		LCONSOLE(D_WARNING, "unhandled lrh_type: %#x\n", rec->lrh_type);
+		rc = -EINVAL;
+	} else {
+		class_config_parse_rec(rec, outstr, 256);
+		LCONSOLE(D_WARNING, "   %s\n", outstr);
+	}
+
+	OBD_FREE(outstr, 256);
+	RETURN(rc);
+}
+
+int class_config_dump_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
+			   char *name, struct config_llog_instance *cfg)
+{
+	struct llog_handle	*log;
+	int			 rc;
+
+	ENTRY;
+
+	LCONSOLE_INFO("Dumping config log %s\n", name);
+
+	rc = llog_open(env, ctxt, &log, NULL, name, LLOG_OPEN_EXISTS);
+	if (rc != 0)
+		RETURN(rc);
+
+	rc = llog_init_handle(env, log, LLOG_F_IS_PLAIN, NULL);
+	if (rc != 0)
+		GOTO(parse_out, rc);
+	/* every record is handed to the console dump callback */
+	rc = llog_process(env, log, class_config_dump_handler, cfg, NULL);
+parse_out:
+	llog_close(env, log);
+	/* the trailer is printed whether or not processing succeeded */
+	LCONSOLE_INFO("End config log %s\n", name);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(class_config_dump_llog);
+
+/** Run LCFG_CLEANUP and then LCFG_DETACH on \a obd through
+ * class_process_config(), faking the lcfg commands a user would send.
+ */
+int class_manual_cleanup(struct obd_device *obd)
+{
+	char		    flags[3] = "";
+	struct lustre_cfg      *lcfg;
+	struct lustre_cfg_bufs  bufs;
+	int		     rc;
+	ENTRY;
+
+	if (obd == NULL) {
+		CERROR("empty cleanup\n");
+		RETURN(-EALREADY);
+	}
+
+	/* at most "FA" plus the terminator, which is what flags[] holds */
+	if (obd->obd_force)
+		strcat(flags, "F");
+	if (obd->obd_fail)
+		strcat(flags, "A");
+	CDEBUG(D_CONFIG, "Manual cleanup of %s (flags='%s')\n",
+	       obd->obd_name, flags);
+
+	lustre_cfg_bufs_reset(&bufs, obd->obd_name);
+	lustre_cfg_bufs_set_string(&bufs, 1, flags);
+	lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs);
+	if (lcfg == NULL)
+		RETURN(-ENOMEM);
+
+	rc = class_process_config(lcfg);
+	if (rc != 0) {
+		CERROR("cleanup failed %d: %s\n", rc, obd->obd_name);
+		GOTO(out, rc);
+	}
+
+	/* same buffers, only the command changes for the detach step */
+	lcfg->lcfg_command = LCFG_DETACH;
+	rc = class_process_config(lcfg);
+	if (rc != 0)
+		CERROR("detach failed %d: %s\n", rc, obd->obd_name);
+out:
+	lustre_cfg_free(lcfg);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(class_manual_cleanup);
+
+/*
+ * uuid<->export lustre hash operations
+ */
+
+static unsigned uuid_hash(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+	const struct obd_uuid *id = key;
+
+	return cfs_hash_djb2_hash(id->uuid, sizeof(id->uuid), mask);
+}
+
+static void *uuid_key(struct hlist_node *hnode)
+{
+	/* the hash node is embedded in the export; recover its container */
+	struct obd_export *export =
+		hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+
+	/* the key is the client UUID stored in the export */
+	return &export->exp_client_uuid;
+}
+
+/*
+ * Match \a key against the client UUID of the export owning \a hnode.
+ * Exports marked failed never match, so they cannot be found this way.
+ */
+static int uuid_keycmp(const void *key, struct hlist_node *hnode)
+{
+	struct obd_export *export;
+
+	LASSERT(key);
+	export = hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+
+	/* the failed flag is only consulted once the UUIDs compare equal */
+	return obd_uuid_equals(key, &export->exp_client_uuid) &&
+	       !export->exp_failed;
+}
+
+static void *uuid_export_object(struct hlist_node *hnode)
+{
+	/* the object stored in the uuid hash is the export itself */
+	return hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+}
+
+static void uuid_export_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct obd_export *export =
+		hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+
+	/* hs_get hook: take a reference via class_export_get() */
+	class_export_get(export);
+}
+
+static void uuid_export_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct obd_export *export =
+		hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+
+	/* hs_put_locked hook: drop a reference via class_export_put() */
+	class_export_put(export);
+}
+
+static cfs_hash_ops_t uuid_hash_ops = {	/* uuid<->export callbacks */
+	.hs_hash	= uuid_hash,
+	.hs_key		= uuid_key,
+	.hs_keycmp	= uuid_keycmp,
+	.hs_object	= uuid_export_object,
+	.hs_get		= uuid_export_get,
+	.hs_put_locked	= uuid_export_put_locked,
+};
+
+
+/*
+ * nid<->export hash operations
+ */
+
+static unsigned nid_hash(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+	/* the key points at an lnet_nid_t; hash exactly that many bytes */
+	return cfs_hash_djb2_hash(key, sizeof(lnet_nid_t), mask);
+}
+
+static void *nid_key(struct hlist_node *hnode)
+{
+	struct obd_export *exp;
+
+	exp = hlist_entry(hnode, struct obd_export, exp_nid_hash);
+
+	/* plain return: this function has no ENTRY for RETURN() to pair with */
+	return &exp->exp_connection->c_peer.nid;
+}
+
+/*
+ * NOTE: It is impossible to find an export that is in failed
+ *       state with this function
+ */
+static int nid_kepcmp(const void *key, struct hlist_node *hnode)
+{
+	struct obd_export *exp;
+
+	LASSERT(key);
+	exp = hlist_entry(hnode, struct obd_export, exp_nid_hash);
+
+	/* plain return: there is no ENTRY here for RETURN() to pair with */
+	return exp->exp_connection->c_peer.nid == *(lnet_nid_t *)key &&
+	       !exp->exp_failed;
+}
+
+static void *nid_export_object(struct hlist_node *hnode)
+{
+	/* the object stored in the nid hash is the export itself */
+	return hlist_entry(hnode, struct obd_export, exp_nid_hash);
+}
+
+static void nid_export_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct obd_export *export =
+		hlist_entry(hnode, struct obd_export, exp_nid_hash);
+
+	/* hs_get hook: take a reference via class_export_get() */
+	class_export_get(export);
+}
+
+static void nid_export_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct obd_export *export =
+		hlist_entry(hnode, struct obd_export, exp_nid_hash);
+
+	/* hs_put_locked hook: drop a reference via class_export_put() */
+	class_export_put(export);
+}
+
+static cfs_hash_ops_t nid_hash_ops = {	/* nid<->export callbacks */
+	.hs_hash	= nid_hash,
+	.hs_key		= nid_key,
+	.hs_keycmp	= nid_kepcmp,
+	.hs_object	= nid_export_object,
+	.hs_get		= nid_export_get,
+	.hs_put_locked	= nid_export_put_locked,
+};
+
+
+/*
+ * nid<->nidstats hash operations
+ */
+
+static void *nidstats_key(struct hlist_node *hnode)
+{
+	/* the hash node is embedded in the nid_stat; recover its container */
+	struct nid_stat *stat =
+		hlist_entry(hnode, struct nid_stat, nid_hash);
+
+	/* the key is the nid recorded in that entry */
+	return &stat->nid;
+}
+
+static int nidstats_keycmp(const void *key, struct hlist_node *hnode)
+{
+	/* compare by value: both sides are read as lnet_nid_t */
+	return *(lnet_nid_t *)key == *(lnet_nid_t *)nidstats_key(hnode);
+}
+
+static void *nidstats_object(struct hlist_node *hnode)
+{
+	/* the object stored in this hash is the nid_stat itself */
+	return hlist_entry(hnode, struct nid_stat, nid_hash);
+}
+
+static void nidstats_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct nid_stat *stat =
+		hlist_entry(hnode, struct nid_stat, nid_hash);
+
+	/* hs_get hook: take a reference via nidstat_getref() */
+	nidstat_getref(stat);
+}
+
+static void nidstats_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct nid_stat *stat =
+		hlist_entry(hnode, struct nid_stat, nid_hash);
+
+	/* hs_put_locked hook: drop a reference via nidstat_putref() */
+	nidstat_putref(stat);
+}
+
+static cfs_hash_ops_t nid_stat_hash_ops = {	/* nid<->nidstats callbacks */
+	.hs_hash	= nid_hash,
+	.hs_key		= nidstats_key,
+	.hs_keycmp	= nidstats_keycmp,
+	.hs_object	= nidstats_object,
+	.hs_get		= nidstats_get,
+	.hs_put_locked	= nidstats_put_locked,
+};
diff --git a/drivers/staging/lustre/lustre/obdclass/obd_mount.c b/drivers/staging/lustre/lustre/obdclass/obd_mount.c
new file mode 100644
index 000000000000..99adad9793c5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/obd_mount.c
@@ -0,0 +1,1321 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/obd_mount.c
+ *
+ * Client mount routines
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#define D_MOUNT (D_SUPER|D_CONFIG/*|D_WARNING */)
+#define PRINT_CMD CDEBUG
+
+#include <obd.h>
+#include <lvfs.h>
+#include <lustre_fsfilt.h>
+#include <obd_class.h>
+#include <lustre/lustre_user.h>
+#include <linux/version.h>
+#include <lustre_log.h>
+#include <lustre_disk.h>
+#include <lustre_param.h>
+
+static int (*client_fill_super)(struct super_block *sb,
+				struct vfsmount *mnt);
+
+static void (*kill_super_cb)(struct super_block *sb);
+
+/**************** config llog ********************/
+
+/** Get a config log from the MGS and process it.
+ * This func is called for both clients and servers.  New statements appended
+ * to the logs (whenever the config lock is revoked) keep being processed
+ * until lustre_end_log is called.
+ * @param sb The superblock is used by the MGC to write to the local copy of
+ *   the config log
+ * @param logname The name of the llog to replicate from the MGS
+ * @param cfg Since the same mgc may be used to follow multiple config logs
+ *   (e.g. ost1, ost2, client), the config_llog_instance keeps the state for
+ *   this log, and is added to the mgc's list of logs to follow.
+ * @return 0 on success, -ENOMEM or the obd_process_config() error otherwise
+ */
+int lustre_process_log(struct super_block *sb, char *logname,
+		     struct config_llog_instance *cfg)
+{
+	struct lustre_cfg *lcfg;
+	struct lustre_cfg_bufs *bufs;
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct obd_device *mgc = lsi->lsi_mgc;
+	int rc = -ENOMEM;	/* reported if lustre_cfg_new() fails */
+	ENTRY;
+
+	LASSERT(mgc);
+	LASSERT(cfg);
+
+	OBD_ALLOC_PTR(bufs);
+	if (bufs == NULL)
+		RETURN(-ENOMEM);
+
+	/* mgc_process_config */
+	lustre_cfg_bufs_reset(bufs, mgc->obd_name);
+	lustre_cfg_bufs_set_string(bufs, 1, logname);
+	lustre_cfg_bufs_set(bufs, 2, cfg, sizeof(*cfg));
+	lustre_cfg_bufs_set(bufs, 3, &sb, sizeof(sb));
+	lcfg = lustre_cfg_new(LCFG_LOG_START, bufs);
+	if (!IS_ERR_OR_NULL(lcfg)) {
+		rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
+		lustre_cfg_free(lcfg);
+	}
+	OBD_FREE_PTR(bufs);
+
+	if (rc == -EINVAL)
+		LCONSOLE_ERROR_MSG(0x15b, "%s: The configuration from log '%s' "
+				   "failed from the MGS (%d).  Make sure this "
+				   "client and the MGS are running compatible "
+				   "versions of Lustre.\n",
+				   mgc->obd_name, logname, rc);
+
+	if (rc)
+		LCONSOLE_ERROR_MSG(0x15c, "%s: The configuration from log '%s' "
+				   "failed (%d). This may be the result of "
+				   "communication errors between this node and "
+				   "the MGS, a bad configuration, or other "
+				   "errors. See the syslog for more "
+				   "information.\n", mgc->obd_name, logname,
+				   rc);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(lustre_process_log);
+
+/* Stop watching this config log for updates; -ENOENT if no MGC is set */
+int lustre_end_log(struct super_block *sb, char *logname,
+		       struct config_llog_instance *cfg)
+{
+	struct lustre_cfg *lcfg;
+	struct lustre_cfg_bufs bufs;
+	struct obd_device *mgc = s2lsi(sb)->lsi_mgc;
+	int rc;
+	ENTRY;
+
+	if (!mgc)
+		RETURN(-ENOENT);
+	/* mgc_process_config */
+	lustre_cfg_bufs_reset(&bufs, mgc->obd_name);
+	lustre_cfg_bufs_set_string(&bufs, 1, logname);
+	if (cfg)
+		lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg));
+	lcfg = lustre_cfg_new(LCFG_LOG_END, &bufs);
+	if (IS_ERR_OR_NULL(lcfg))
+		RETURN(-ENOMEM);
+	rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
+	lustre_cfg_free(lcfg);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(lustre_end_log);
+
+/**************** obd start *******************/
+
+/** Build a lustre_cfg for \a cmd from up to four optional strings and run it
+ * through class_process_config().  Returns that result, or -ENOMEM.
+ */
+int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
+	    char *s1, char *s2, char *s3, char *s4)
+{
+	struct lustre_cfg_bufs bufs;
+	struct lustre_cfg *lcfg;
+	int rc;
+
+	CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname,
+	       cmd, s1, s2, s3, s4);
+	lustre_cfg_bufs_reset(&bufs, cfgname);
+	if (s1)
+		lustre_cfg_bufs_set_string(&bufs, 1, s1);
+	if (s2)
+		lustre_cfg_bufs_set_string(&bufs, 2, s2);
+	if (s3)
+		lustre_cfg_bufs_set_string(&bufs, 3, s3);
+	if (s4)
+		lustre_cfg_bufs_set_string(&bufs, 4, s4);
+	lcfg = lustre_cfg_new(cmd, &bufs);
+	if (IS_ERR_OR_NULL(lcfg))	/* dereferenced just below */
+		return -ENOMEM;
+	lcfg->lcfg_nid = nid;
+	rc = class_process_config(lcfg);
+	lustre_cfg_free(lcfg);
+	return rc;
+}
+EXPORT_SYMBOL(do_lcfg);
+
+/** Attach an obd of the given type and then set it up, via do_lcfg().
+ * If setup fails the obd is detached again before the error is returned.
+ */
+int lustre_start_simple(char *obdname, char *type, char *uuid,
+			char *s1, char *s2, char *s3, char *s4)
+{
+	int rc;
+
+	CDEBUG(D_MOUNT, "Starting obd %s (typ=%s)\n", obdname, type);
+	rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, NULL, NULL);
+	if (rc != 0) {
+		CERROR("%s attach error %d\n", obdname, rc);
+		return rc;
+	}
+	rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, s3, s4);
+	if (rc == 0)
+		return 0;
+	CERROR("%s setup error %d\n", obdname, rc);
+	do_lcfg(obdname, 0, LCFG_DETACH, NULL, NULL, NULL, NULL);
+	return rc;
+}
+
+DEFINE_MUTEX(mgc_start_lock);
+
+/** Set up a mgc obd to process startup logs, reusing a live MGC of the
+ * same name (and taking a reference on it) when one already exists.
+ *
+ * \param sb [in] super block of the mgc obd
+ * \retval 0 success, otherwise error code
+ */
+int lustre_start_mgc(struct super_block *sb)
+{
+	struct obd_connect_data *data = NULL;
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct obd_device *obd;
+	struct obd_export *exp;
+	struct obd_uuid *uuid;
+	class_uuid_t uuidc;
+	lnet_nid_t nid;
+	char *mgcname = NULL, *niduuid = NULL, *mgssec = NULL;
+	char *ptr;
+	int recov_bk;
+	int rc = 0, i = 0, j, len;
+	ENTRY;
+
+	LASSERT(lsi->lsi_lmd);
+
+	/* Find the first non-lo MGS nid for our MGC name */
+	if (IS_SERVER(lsi)) {
+		/* mount -o mgsnode=nid */
+		ptr = lsi->lsi_lmd->lmd_mgs;
+		if (lsi->lsi_lmd->lmd_mgs &&
+		    (class_parse_nid(lsi->lsi_lmd->lmd_mgs, &nid, &ptr) == 0)) {
+			i++;
+		} else if (IS_MGS(lsi)) {
+			lnet_process_id_t id;
+			while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
+				if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND)
+					continue;
+				nid = id.nid;
+				i++;
+				break;
+			}
+		}
+	} else { /* client */
+		/* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
+		ptr = lsi->lsi_lmd->lmd_dev;
+		if (class_parse_nid(ptr, &nid, &ptr) == 0)
+			i++;
+	}
+	if (i == 0) {
+		CERROR("No valid MGS nids found.\n");
+		RETURN(-EINVAL);
+	}
+
+	mutex_lock(&mgc_start_lock);
+
+	len = strlen(LUSTRE_MGC_OBDNAME) + strlen(libcfs_nid2str(nid)) + 1;
+	OBD_ALLOC(mgcname, len);
+	OBD_ALLOC(niduuid, len + 9);	/* mgcname + "_" + up to 8 hex digits */
+	if (!mgcname || !niduuid)
+		GOTO(out_free, rc = -ENOMEM);
+	sprintf(mgcname, "%s%s", LUSTRE_MGC_OBDNAME, libcfs_nid2str(nid));
+
+	mgssec = lsi->lsi_lmd->lmd_mgssec ? lsi->lsi_lmd->lmd_mgssec : "";
+
+	OBD_ALLOC_PTR(data);
+	if (data == NULL)
+		GOTO(out_free, rc = -ENOMEM);
+
+	obd = class_name2obd(mgcname);
+	if (obd && !obd->obd_stopping) {
+		rc = obd_set_info_async(NULL, obd->obd_self_export,
+					strlen(KEY_MGSSEC), KEY_MGSSEC,
+					strlen(mgssec), mgssec, NULL);
+		if (rc)
+			GOTO(out_free, rc);
+
+		/* Re-using an existing MGC */
+		atomic_inc(&obd->u.cli.cl_mgc_refcount);
+
+		/* IR compatibility check, only for clients */
+		if (lmd_is_client(lsi->lsi_lmd)) {
+			int has_ir;
+			int vallen = sizeof(*data);
+			__u32 *flags = &lsi->lsi_lmd->lmd_flags;
+
+			rc = obd_get_info(NULL, obd->obd_self_export,
+					  strlen(KEY_CONN_DATA), KEY_CONN_DATA,
+					  &vallen, data, NULL);
+			LASSERT(rc == 0);
+			has_ir = OCD_HAS_FLAG(data, IMP_RECOV);
+			if (has_ir ^ !(*flags & LMD_FLG_NOIR)) {
+				/* LMD_FLG_NOIR is for test purpose only */
+				LCONSOLE_WARN(
+				    "Trying to mount a client with IR setting "
+				    "not compatible with current mgc. "
+				    "Force to use current mgc setting that is "
+				    "IR %s.\n",
+				    has_ir ? "enabled" : "disabled");
+				if (has_ir)
+					*flags &= ~LMD_FLG_NOIR;
+				else
+					*flags |= LMD_FLG_NOIR;
+			}
+		}
+
+		recov_bk = 0;
+		/* If we are restarting the MGS, don't try to keep the MGC's
+		   old connection, or registration will fail. */
+		if (IS_MGS(lsi)) {
+			CDEBUG(D_MOUNT, "New MGS with live MGC\n");
+			recov_bk = 1;
+		}
+
+		/* Try all connections, but only once (again).
+		   We don't want to block another target from starting
+		   (using its local copy of the log), but we do want to connect
+		   if at all possible. */
+		recov_bk++;
+		CDEBUG(D_MOUNT, "%s: Set MGC reconnect %d\n", mgcname,recov_bk);
+		rc = obd_set_info_async(NULL, obd->obd_self_export,
+					sizeof(KEY_INIT_RECOV_BACKUP),
+					KEY_INIT_RECOV_BACKUP,
+					sizeof(recov_bk), &recov_bk, NULL);
+		GOTO(out, rc = 0);
+	}
+
+	CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname);
+
+	/* Add the primary nids for the MGS */
+	i = 0;
+	sprintf(niduuid, "%s_%x", mgcname, i);
+	if (IS_SERVER(lsi)) {
+		ptr = lsi->lsi_lmd->lmd_mgs;
+		if (IS_MGS(lsi)) {
+			/* Use local nids (including LO) */
+			lnet_process_id_t id;
+			while ((rc = LNetGetId(i++, &id)) != -ENOENT)
+				rc = do_lcfg(mgcname, id.nid,
+					     LCFG_ADD_UUID, niduuid, 0,0,0);
+		} else {
+			/* Use mgsnode= nids (mount -o mgsnode=nid) */
+			if (lsi->lsi_lmd->lmd_mgs) {
+				ptr = lsi->lsi_lmd->lmd_mgs;
+			} else if (class_find_param(ptr, PARAM_MGSNODE,
+						    &ptr) != 0) {
+				CERROR("No MGS nids given.\n");
+				GOTO(out_free, rc = -EINVAL);
+			}
+			while (class_parse_nid(ptr, &nid, &ptr) == 0) {
+				rc = do_lcfg(mgcname, nid,
+					     LCFG_ADD_UUID, niduuid, 0,0,0);
+				i++;
+			}
+		}
+	} else { /* client */
+		/* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
+		ptr = lsi->lsi_lmd->lmd_dev;
+		while (class_parse_nid(ptr, &nid, &ptr) == 0) {
+			rc = do_lcfg(mgcname, nid,
+				     LCFG_ADD_UUID, niduuid, 0,0,0);
+			i++;
+			/* Stop at the first failover nid */
+			if (*ptr == ':')
+				break;
+		}
+	}
+	if (i == 0) {
+		CERROR("No valid MGS nids found.\n");
+		GOTO(out_free, rc = -EINVAL);
+	}
+	lsi->lsi_lmd->lmd_mgs_failnodes = 1;
+
+	/* Random uuid for MGC allows easier reconnects */
+	OBD_ALLOC_PTR(uuid);
+	if (uuid == NULL)
+		GOTO(out_free, rc = -ENOMEM);
+	ll_generate_random_uuid(uuidc);
+	class_uuid_unparse(uuidc, uuid);
+
+	/* Start the MGC */
+	rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME,
+				 (char *)uuid->uuid, LUSTRE_MGS_OBDNAME,
+				 niduuid, 0, 0);
+	OBD_FREE_PTR(uuid);
+	if (rc)
+		GOTO(out_free, rc);
+
+	/* Add any failover MGS nids */
+	i = 1;
+	while (ptr && ((*ptr == ':' ||
+	       class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0))) {
+		/* New failover node */
+		sprintf(niduuid, "%s_%x", mgcname, i);
+		j = 0;
+		while (class_parse_nid_quiet(ptr, &nid, &ptr) == 0) {
+			j++;
+			rc = do_lcfg(mgcname, nid,
+				     LCFG_ADD_UUID, niduuid, 0,0,0);
+			if (*ptr == ':')
+				break;
+		}
+		if (j > 0) {
+			rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN,
+				     niduuid, 0, 0, 0);
+			i++;
+		} else {
+			/* at ":/fsname" */
+			break;
+		}
+	}
+	lsi->lsi_lmd->lmd_mgs_failnodes = i;
+
+	obd = class_name2obd(mgcname);
+	if (!obd) {
+		CERROR("Can't find mgcobd %s\n", mgcname);
+		GOTO(out_free, rc = -ENOTCONN);
+	}
+
+	rc = obd_set_info_async(NULL, obd->obd_self_export,
+				strlen(KEY_MGSSEC), KEY_MGSSEC,
+				strlen(mgssec), mgssec, NULL);
+	if (rc)
+		GOTO(out_free, rc);
+
+	/* Keep a refcount of servers/clients who started with "mount",
+	   so we know when we can get rid of the mgc. */
+	atomic_set(&obd->u.cli.cl_mgc_refcount, 1);
+
+	/* Try all connections, but only once. */
+	recov_bk = 1;
+	rc = obd_set_info_async(NULL, obd->obd_self_export,
+				sizeof(KEY_INIT_RECOV_BACKUP),
+				KEY_INIT_RECOV_BACKUP,
+				sizeof(recov_bk), &recov_bk, NULL);
+	if (rc)
+		/* nonfatal */
+		CWARN("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc);
+
+	/* We connect to the MGS at setup, and don't disconnect until cleanup */
+	data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT |
+				  OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV |
+				  OBD_CONNECT_LVB_TYPE;
+
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0)
+	data->ocd_connect_flags |= OBD_CONNECT_MNE_SWAB;
+#else
+#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab"
+#endif
+
+	if (lmd_is_client(lsi->lsi_lmd) &&
+	    lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR)
+		data->ocd_connect_flags &= ~OBD_CONNECT_IMP_RECOV;
+	data->ocd_version = LUSTRE_VERSION_CODE;
+	rc = obd_connect(NULL, &exp, obd, &(obd->obd_uuid), data, NULL);
+	if (rc) {
+		CERROR("connect failed %d\n", rc);
+		GOTO(out, rc);
+	}
+
+	obd->u.cli.cl_mgc_mgsexp = exp;
+
+out:
+	/* Keep the mgc info in the sb. Note that many lsi's can point
+	   to the same mgc.*/
+	lsi->lsi_mgc = obd;
+out_free:
+	mutex_unlock(&mgc_start_lock);
+
+	if (data)
+		OBD_FREE_PTR(data);
+	if (mgcname)
+		OBD_FREE(mgcname, len);
+	if (niduuid)
+		OBD_FREE(niduuid, len + 9);
+	RETURN(rc);
+}
+
+/* Drop this superblock's reference on its MGC; the last reference
+ * disconnects from the MGS, cleans up the obd and deletes its nid uuids. */
+static int lustre_stop_mgc(struct super_block *sb)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct obd_device *obd;
+	char *niduuid = NULL, *ptr = NULL;
+	int i, rc = 0, len = 0;
+	ENTRY;
+
+	if (!lsi || !lsi->lsi_mgc)
+		RETURN(-ENOENT);
+	obd = lsi->lsi_mgc;
+	lsi->lsi_mgc = NULL;
+
+	mutex_lock(&mgc_start_lock);
+	LASSERT(atomic_read(&obd->u.cli.cl_mgc_refcount) > 0);
+	if (!atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) {
+		/* This is not fatal, every client that stops
+		   will call in here. */
+		CDEBUG(D_MOUNT, "mgc still has %d references.\n",
+		       atomic_read(&obd->u.cli.cl_mgc_refcount));
+		GOTO(out, rc = -EBUSY);
+	}
+
+	/* The MGC has no recoverable data in any case.
+	 * force shutdown set in umount_begin */
+	obd->obd_no_recov = 1;
+
+	if (obd->u.cli.cl_mgc_mgsexp) {
+		/* An error is not fatal, if we are unable to send the
+		   disconnect mgs ping evictor cleans up the export */
+		rc = obd_disconnect(obd->u.cli.cl_mgc_mgsexp);
+		if (rc)
+			CDEBUG(D_MOUNT, "disconnect failed %d\n", rc);
+	}
+
+	/* Save the obdname for cleaning the nid uuids, which are
+	   obdname_XX */
+	len = strlen(obd->obd_name) + 6;
+	OBD_ALLOC(niduuid, len);
+	if (niduuid) {
+		strcpy(niduuid, obd->obd_name);
+		ptr = niduuid + strlen(niduuid);
+	}
+
+	rc = class_manual_cleanup(obd);
+	if (rc)
+		GOTO(out, rc);
+
+	/* Clean the nid uuids */
+	if (!niduuid)
+		GOTO(out, rc = -ENOMEM);
+
+	for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) {
+		sprintf(ptr, "_%x", i);
+		rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID,
+			     niduuid, 0, 0, 0);
+		if (rc)
+			CERROR("del MGC UUID %s failed: rc = %d\n",
+			       niduuid, rc);
+	}
+out:
+	if (niduuid)
+		OBD_FREE(niduuid, len);
+
+	/* class_import_put will get rid of the additional connections */
+	mutex_unlock(&mgc_start_lock);
+	RETURN(rc);
+}
+
+/***************** lustre superblock **************/
+
+struct lustre_sb_info *lustre_init_lsi(struct super_block *sb)
+{
+	struct lustre_sb_info *lsi;
+	ENTRY;
+
+	/* the lsi and its mount data are allocated as a pair, or not at all */
+	OBD_ALLOC_PTR(lsi);
+	if (lsi == NULL)
+		RETURN(NULL);
+	OBD_ALLOC_PTR(lsi->lsi_lmd);
+	if (lsi->lsi_lmd == NULL) {
+		OBD_FREE_PTR(lsi);
+		RETURN(NULL);
+	}
+	lsi->lsi_lmd->lmd_recovery_time_hard = 0;
+	lsi->lsi_lmd->lmd_recovery_time_soft = 0;
+	lsi->lsi_lmd->lmd_exclude_count = 0;
+	s2lsi_nocast(sb) = lsi;
+	/* start with one reference, held for the setup in progress */
+	atomic_set(&lsi->lsi_mounts, 1);
+
+	/* failover is the default umount style */
+	lsi->lsi_flags = LSI_UMOUNT_FAILOVER;
+
+	RETURN(lsi);
+}
+
+static int lustre_free_lsi(struct super_block *sb)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	ENTRY;	/* frees the lsi attached to sb together with its lmd */
+
+	LASSERT(lsi != NULL);
+	CDEBUG(D_MOUNT, "Freeing lsi %p\n", lsi);
+
+	/* a non-zero count means someone didn't call server_put_mount. */
+	LASSERT(atomic_read(&lsi->lsi_mounts) == 0);
+
+	if (lsi->lsi_lmd != NULL) {
+		if (lsi->lsi_lmd->lmd_dev != NULL)	/* strings: strlen + 1 */
+			OBD_FREE(lsi->lsi_lmd->lmd_dev,
+				 strlen(lsi->lsi_lmd->lmd_dev) + 1);
+		if (lsi->lsi_lmd->lmd_profile != NULL)
+			OBD_FREE(lsi->lsi_lmd->lmd_profile,
+				 strlen(lsi->lsi_lmd->lmd_profile) + 1);
+		if (lsi->lsi_lmd->lmd_mgssec != NULL)
+			OBD_FREE(lsi->lsi_lmd->lmd_mgssec,
+				 strlen(lsi->lsi_lmd->lmd_mgssec) + 1);
+		if (lsi->lsi_lmd->lmd_opts != NULL)
+			OBD_FREE(lsi->lsi_lmd->lmd_opts,
+				 strlen(lsi->lsi_lmd->lmd_opts) + 1);
+		if (lsi->lsi_lmd->lmd_exclude_count)	/* array, by count */
+			OBD_FREE(lsi->lsi_lmd->lmd_exclude,
+				 sizeof(lsi->lsi_lmd->lmd_exclude[0]) *
+				 lsi->lsi_lmd->lmd_exclude_count);
+		if (lsi->lsi_lmd->lmd_mgs != NULL)
+			OBD_FREE(lsi->lsi_lmd->lmd_mgs,
+				 strlen(lsi->lsi_lmd->lmd_mgs) + 1);
+		if (lsi->lsi_lmd->lmd_osd_type != NULL)
+			OBD_FREE(lsi->lsi_lmd->lmd_osd_type,
+				 strlen(lsi->lsi_lmd->lmd_osd_type) + 1);
+		if (lsi->lsi_lmd->lmd_params != NULL)	/* fixed-size buffer */
+			OBD_FREE(lsi->lsi_lmd->lmd_params, 4096);
+
+		OBD_FREE(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd));
+	}
+
+	LASSERT(lsi->lsi_llsbi == NULL);
+	OBD_FREE(lsi, sizeof(*lsi));
+	s2lsi_nocast(sb) = NULL;	/* sb no longer refers to the freed lsi */
+
+	RETURN(0);
+}
+
+/* The lsi has one reference for every server that is using the disk -
+   e.g. MDT, MGS, and potentially MGC.  Returns 1 if the lsi was freed. */
+int lustre_put_lsi(struct super_block *sb)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	ENTRY;
+
+	LASSERT(lsi != NULL);
+
+	CDEBUG(D_MOUNT, "put %p %d\n", sb, atomic_read(&lsi->lsi_mounts));
+	if (!atomic_dec_and_test(&lsi->lsi_mounts))
+		RETURN(0);
+
+	if (IS_SERVER(lsi) && lsi->lsi_osd_exp) {
+		obd_disconnect(lsi->lsi_osd_exp);
+		/* wait till OSD is gone */
+		obd_zombie_barrier();
+	}
+	lustre_free_lsi(sb);
+	RETURN(1);
+}
+
+/** Get the fsname ("lustre") from the server name ("lustre-OST003F").
+ * @param [in] svname server name including type and index
+ * @param [out] fsname Buffer to copy filesystem name prefix into.
+ *  May be NULL; otherwise must hold the prefix plus a terminating NUL.
+ * @param [out] endptr if endptr isn't NULL it is set to end of fsname
+ * rc < 0  on error (-EINVAL when no usable separator is found)
+ */
+int server_name2fsname(const char *svname, char *fsname, const char **endptr)
+{
+	const char *dash = strrchr(svname, '-');
+	if (!dash) {
+		dash = strrchr(svname, ':');
+		if (!dash)
+			return -EINVAL;
+	}
+
+	/* interpret <fsname>-MDTXXXXX-mdc as mdt, the better way is to pass
+	 * in the fsname, then determine the server index */
+	if (!strcmp(LUSTRE_MDC_NAME, dash + 1)) {
+		/* step back to the previous separator, never before svname */
+		while (dash > svname && *--dash != '-' && *dash != ':')
+			;
+		if (dash == svname)
+			return -EINVAL;
+	}
+
+	if (fsname != NULL) {
+		strncpy(fsname, svname, dash - svname);
+		fsname[dash - svname] = '\0';
+	}
+
+	if (endptr != NULL)
+		*endptr = dash;
+
+	return 0;
+}
+EXPORT_SYMBOL(server_name2fsname);
+
+/**
+ * Get service name (svname) from string
+ * rc < 0 on error
+ * NOTE(review): endptr is accepted but never written by this function.
+ */
+int server_name2svname(const char *label, char *svname, const char **endptr,
+		       size_t svsize)
+{
+	int rc;
+	const char *dash;
+
+	/* We use server_name2fsname() just for parsing */
+	rc = server_name2fsname(label, NULL, &dash);
+	if (rc != 0)
+		return rc;
+
+	if (*dash != '-')
+		return -EINVAL;
+
+	if (strlcpy(svname, dash + 1, svsize) >= svsize)
+		return -E2BIG;
+
+	return 0;
+}
+EXPORT_SYMBOL(server_name2svname);
+
+
+/* Get the index from the obd name.
+   rc = server type (with LDD_F_SV_ALL set for an "all" name), or
+   rc < 0  on error
+   if endptr isn't NULL it is set to end of name, also for "all" names */
+int server_name2index(const char *svname, __u32 *idx, const char **endptr)
+{
+	int rc;
+	const char *dash;
+
+	/* We use server_name2fsname() just for parsing */
+	rc = server_name2fsname(svname, NULL, &dash);
+	if (rc != 0)
+		return rc;
+
+	if (*dash != '-')
+		return -EINVAL;
+
+	dash++;
+	if (strncmp(dash, "MDT", 3) == 0)
+		rc = LDD_F_SV_TYPE_MDT;
+	else if (strncmp(dash, "OST", 3) == 0)
+		rc = LDD_F_SV_TYPE_OST;
+	else
+		return -EINVAL;
+
+	dash += 3;
+
+	if (strcmp(dash, "all") == 0) {
+		if (endptr != NULL)
+			*endptr = dash + 3;
+		return rc | LDD_F_SV_ALL;
+	}
+
+	*idx = simple_strtoul(dash, (char **)endptr, 16);
+
+	return rc;
+}
+EXPORT_SYMBOL(server_name2index);
+
+/*************** mount common between server and client ***************/
+
+/* Common umount */
+int lustre_common_put_super(struct super_block *sb)
+{
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_MOUNT, "dropping sb %p\n", sb);
+
+	/* Give up this superblock's MGC reference */
+	rc = lustre_stop_mgc(sb);
+	if (rc == -EBUSY) {
+		/* Another obd still needs the MGC and will clean it up;
+		   -EBUSY is still what gets returned below. */
+		CDEBUG(D_MOUNT, "MGC still in use\n");
+	} else if (rc != 0 && rc != -ENOENT) {
+		CERROR("Can't stop MGC: %d\n", rc);
+		RETURN(rc);
+	}
+
+	/* Release the lsi reference, then stop the lu types */
+	lustre_put_lsi(sb);
+	lu_types_stop();
+	RETURN(rc);
+}
+EXPORT_SYMBOL(lustre_common_put_super);
+
+static void lmd_print(struct lustre_mount_data *lmd)
+{
+	int n;
+
+	PRINT_CMD(D_MOUNT, "  mount data:\n");
+	if (lmd_is_client(lmd))
+		PRINT_CMD(D_MOUNT, "profile: %s\n", lmd->lmd_profile);
+	PRINT_CMD(D_MOUNT, "device:  %s\n", lmd->lmd_dev);
+	PRINT_CMD(D_MOUNT, "flags:   %x\n", lmd->lmd_flags);
+
+	if (lmd->lmd_opts != NULL)
+		PRINT_CMD(D_MOUNT, "options: %s\n", lmd->lmd_opts);
+
+	if (lmd->lmd_recovery_time_soft != 0)
+		PRINT_CMD(D_MOUNT, "recovery time soft: %d\n",
+			  lmd->lmd_recovery_time_soft);
+
+	if (lmd->lmd_recovery_time_hard != 0)
+		PRINT_CMD(D_MOUNT, "recovery time hard: %d\n",
+			  lmd->lmd_recovery_time_hard);
+
+	/* one line per excluded OST index */
+	for (n = 0; n < lmd->lmd_exclude_count; n++)
+		PRINT_CMD(D_MOUNT, "exclude %d:  OST%04x\n", n,
+			  lmd->lmd_exclude[n]);
+}
+
+/* Is this server on the exclusion list?  Returns 1 if so, 0 otherwise. */
+int lustre_check_exclusion(struct super_block *sb, char *svname)
+{
+	struct lustre_mount_data *lmd = s2lsi(sb)->lsi_lmd;
+	__u32 index;
+	int i;
+	ENTRY;
+
+	/* Only exclude OSTs: any other type, or a name that does not
+	 * parse, is reported as not excluded */
+	if (server_name2index(svname, &index, NULL) != LDD_F_SV_TYPE_OST)
+		RETURN(0);
+
+	CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname,
+	       index, lmd->lmd_exclude_count, lmd->lmd_dev);
+
+	for (i = 0; i < lmd->lmd_exclude_count; i++) {
+		if (lmd->lmd_exclude[i] != index)
+			continue;
+		CWARN("Excluding %s (on exclusion list)\n", svname);
+		RETURN(1);
+	}
+
+	RETURN(0);
+}
+
+/* mount -v  -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ... */
+static int lmd_make_exclusion(struct lustre_mount_data *lmd, const char *ptr)
+{
+	const char *s1 = ptr, *s2;
+	__u32 index, *exclude_list;
+	int rc = 0, devmax;
+	ENTRY;
+
+	/* The shortest an ost name can be is 8 chars: -OST0000.
+	   We don't actually know the fsname at this time, so in fact
+	   a user could specify any fsname.  devmax bounds the list size. */
+	devmax = strlen(ptr) / 8 + 1;
+
+	/* temp list; NOTE(review): assumes lmd_exclude_count == 0 on entry */
+	OBD_ALLOC(exclude_list, sizeof(index) * devmax);
+	if (!exclude_list)
+		RETURN(-ENOMEM);
+
+	/* we enter this fn pointing at the '=' */
+	while (*s1 && *s1 != ' ' && *s1 != ',') {
+		s1++;
+		rc = server_name2index(s1, &index, &s2);
+		if (rc < 0) {
+			CERROR("Can't parse server name '%s'\n", s1);
+			break;
+		}
+		if (rc == LDD_F_SV_TYPE_OST)
+			exclude_list[lmd->lmd_exclude_count++] = index;
+		else
+			CDEBUG(D_MOUNT, "ignoring exclude %.7s\n", s1);
+		s1 = s2;
+		/* s2 was set by server_name2index(): ':' (next exclude) or
+		   ',' (end of excludes). NOTE(review): confirm always set */
+		if (lmd->lmd_exclude_count >= devmax)
+			break;
+	}
+	if (rc >= 0) /* non-err */
+		rc = 0;
+
+	if (lmd->lmd_exclude_count) {
+		/* permanent, freed in lustre_free_lsi */
+		OBD_ALLOC(lmd->lmd_exclude, sizeof(index) *
+			  lmd->lmd_exclude_count);
+		if (lmd->lmd_exclude) {
+			memcpy(lmd->lmd_exclude, exclude_list,
+			       sizeof(index) * lmd->lmd_exclude_count);
+		} else {
+			rc = -ENOMEM;
+			lmd->lmd_exclude_count = 0;
+		}
+	}
+	OBD_FREE(exclude_list, sizeof(index) * devmax);
+	RETURN(rc);
+}
+
+/* Keep a private copy of the mgssec= value in lmd->lmd_mgssec.  The value is
+ * the text at @ptr up to the next ',' (or to the end of the string); a value
+ * stored by an earlier mgssec= is released first.  Returns 0 or -ENOMEM. */
+static int lmd_parse_mgssec(struct lustre_mount_data *lmd, char *ptr)
+{
+	char   *comma;
+	int     len;
+
+	if (lmd->lmd_mgssec != NULL) {
+		OBD_FREE(lmd->lmd_mgssec, strlen(lmd->lmd_mgssec) + 1);
+		lmd->lmd_mgssec = NULL;
+	}
+
+	comma = strchr(ptr, ',');
+	len = (comma != NULL) ? comma - ptr : strlen(ptr);
+
+	OBD_ALLOC(lmd->lmd_mgssec, len + 1);
+	if (lmd->lmd_mgssec == NULL)
+		return -ENOMEM;
+
+	memcpy(lmd->lmd_mgssec, ptr, len);
+	lmd->lmd_mgssec[len] = '\0';
+	return 0;
+}
+
+/* Replace the string held in *@handle with a fresh copy of the option value
+ * at @ptr, which ends at the first ',' or at the terminating NUL.
+ * Returns 0, -EINVAL for a NULL argument, or -ENOMEM. */
+static int lmd_parse_string(char **handle, char *ptr)
+{
+	char   *end;
+	int     len;
+
+	if (!handle || !ptr)
+		return -EINVAL;
+
+	/* forget any value stored by an earlier occurrence of the option */
+	if (*handle != NULL) {
+		OBD_FREE(*handle, strlen(*handle) + 1);
+		*handle = NULL;
+	}
+
+	end = strchr(ptr, ',');
+	len = end ? end - ptr : strlen(ptr);
+
+	OBD_ALLOC(*handle, len + 1);
+	if (*handle == NULL)
+		return -ENOMEM;
+
+	memcpy(*handle, ptr, len);
+	(*handle)[len] = '\0';
+	return 0;
+}
+
+/* Collect multiple values for mgsnid specifiers: append the NID list at *@ptr
+ * to lmd->lmd_mgs (':'-separated) and advance *@ptr past the parsed text. */
+static int lmd_parse_mgs(struct lustre_mount_data *lmd, char **ptr)
+{
+	lnet_nid_t nid;
+	char *end = *ptr;
+	char *joined;
+	int   newlen, prefix = 0;
+
+	/* Find end of nidlist */
+	while (class_parse_nid_quiet(end, &nid, &end) == 0)
+		;
+	newlen = end - *ptr;
+	if (newlen == 0) {
+		LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", *ptr);
+		return -EINVAL;
+	}
+
+	if (lmd->lmd_mgs != NULL)
+		prefix = strlen(lmd->lmd_mgs) + 1;
+
+	OBD_ALLOC(joined, prefix + newlen + 1);
+	if (joined == NULL)
+		return -ENOMEM;
+
+	if (prefix > 0) {
+		/* Multiple mgsnid= are taken to mean failover locations */
+		memcpy(joined, lmd->lmd_mgs, prefix);
+		joined[prefix - 1] = ':';
+		OBD_FREE(lmd->lmd_mgs, prefix);
+	}
+	memcpy(&joined[prefix], *ptr, newlen);
+	joined[prefix + newlen] = '\0';
+	lmd->lmd_mgs = joined;
+	*ptr = end;
+	return 0;
+}
+
+/** Parse mount line options into @lmd; options consumed here are cut out of
+ * @options in place.  Returns 0, -EINVAL or -ENOMEM.
+ * e.g. mount -v -t lustre -o abort_recov uml1:uml2:/lustre-client /mnt/lustre
+ * (dev is passed as device=uml1:/lustre by mount.lustre) */
+static int lmd_parse(char *options, struct lustre_mount_data *lmd)
+{
+	char *s1, *s2, *devname = NULL;
+	struct lustre_mount_data *raw = (struct lustre_mount_data *)options;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(lmd);
+	if (!options) {
+		LCONSOLE_ERROR_MSG(0x162, "Missing mount data: check that "
+				   "/sbin/mount.lustre is installed.\n");
+		RETURN(-EINVAL);
+	}
+
+	/* Options should be a string - try to detect old lmd data */
+	if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) {
+		LCONSOLE_ERROR_MSG(0x163, "You're using an old version of "
+				   "/sbin/mount.lustre.  Please install "
+				   "version %s\n", LUSTRE_VERSION_STRING);
+		RETURN(-EINVAL);
+	}
+	lmd->lmd_magic = LMD_MAGIC;
+
+	OBD_ALLOC(lmd->lmd_params, 4096);
+	if (lmd->lmd_params == NULL)
+		RETURN(-ENOMEM);
+	lmd->lmd_params[0] = '\0';
+
+	/* Set default flags here */
+
+	s1 = options;
+	while (*s1) {
+		int clear = 0;
+		int time_min = OBD_RECOVERY_TIME_MIN;
+
+		/* Skip whitespace and extra commas */
+		while (*s1 == ' ' || *s1 == ',')
+			s1++;
+
+		/* Client options are parsed in ll_options: eg. flock,
+		   user_xattr, acl */
+
+		/* Parse non-ldiskfs options here. Rather than modifying
+		   ldiskfs, we just zero these out here */
+		if (strncmp(s1, "abort_recov", 11) == 0) {
+			lmd->lmd_flags |= LMD_FLG_ABORT_RECOV;
+			clear++;
+		} else if (strncmp(s1, "recovery_time_soft=", 19) == 0) {
+			lmd->lmd_recovery_time_soft = max_t(int,
+				simple_strtoul(s1 + 19, NULL, 10), time_min);
+			clear++;
+		} else if (strncmp(s1, "recovery_time_hard=", 19) == 0) {
+			lmd->lmd_recovery_time_hard = max_t(int,
+				simple_strtoul(s1 + 19, NULL, 10), time_min);
+			clear++;
+		} else if (strncmp(s1, "noir", 4) == 0) {
+			lmd->lmd_flags |= LMD_FLG_NOIR; /* test purpose only. */
+			clear++;
+		} else if (strncmp(s1, "nosvc", 5) == 0) {
+			lmd->lmd_flags |= LMD_FLG_NOSVC;
+			clear++;
+		} else if (strncmp(s1, "nomgs", 5) == 0) {
+			lmd->lmd_flags |= LMD_FLG_NOMGS;
+			clear++;
+		} else if (strncmp(s1, "noscrub", 7) == 0) {
+			lmd->lmd_flags |= LMD_FLG_NOSCRUB;
+			clear++;
+		} else if (strncmp(s1, PARAM_MGSNODE,
+				   sizeof(PARAM_MGSNODE) - 1) == 0) {
+			s2 = s1 + sizeof(PARAM_MGSNODE) - 1;
+			/* Assume the next mount opt is the first
+			   invalid nid we get to. */
+			rc = lmd_parse_mgs(lmd, &s2);
+			if (rc)
+				goto invalid;
+			clear++;
+		} else if (strncmp(s1, "writeconf", 9) == 0) {
+			lmd->lmd_flags |= LMD_FLG_WRITECONF;
+			clear++;
+		} else if (strncmp(s1, "update", 6) == 0) {
+			lmd->lmd_flags |= LMD_FLG_UPDATE;
+			clear++;
+		} else if (strncmp(s1, "virgin", 6) == 0) {
+			lmd->lmd_flags |= LMD_FLG_VIRGIN;
+			clear++;
+		} else if (strncmp(s1, "noprimnode", 10) == 0) {
+			lmd->lmd_flags |= LMD_FLG_NO_PRIMNODE;
+			clear++;
+		} else if (strncmp(s1, "mgssec=", 7) == 0) {
+			rc = lmd_parse_mgssec(lmd, s1 + 7);
+			if (rc)
+				goto invalid;
+			clear++;
+		/* ost exclusion list */
+		} else if (strncmp(s1, "exclude=", 8) == 0) {
+			rc = lmd_make_exclusion(lmd, s1 + 7);
+			if (rc)
+				goto invalid;
+			clear++;
+		} else if (strncmp(s1, "mgs", 3) == 0) {
+			/* We are an MGS */
+			lmd->lmd_flags |= LMD_FLG_MGS;
+			clear++;
+		} else if (strncmp(s1, "svname=", 7) == 0) {
+			rc = lmd_parse_string(&lmd->lmd_profile, s1 + 7);
+			if (rc)
+				goto invalid;
+			clear++;
+		} else if (strncmp(s1, "param=", 6) == 0) {
+			char *tail = strchr(s1 + 6, ',');
+			size_t length = tail ? tail - (s1 + 6) : strlen(s1 + 6);
+			size_t used = strlen(lmd->lmd_params);
+
+			/* value + ' ' + NUL must fit in the 4096-byte buffer */
+			if (length + 2 > 4096 - used)
+				goto invalid;
+			strncat(lmd->lmd_params, s1 + 6, length);
+			strcat(lmd->lmd_params, " ");
+			clear++;
+		} else if (strncmp(s1, "osd=", 4) == 0) {
+			rc = lmd_parse_string(&lmd->lmd_osd_type, s1 + 4);
+			if (rc)
+				goto invalid;
+			clear++;
+		}
+		/* Linux 2.4 doesn't pass the device, so we stuck it at the
+		   end of the options. */
+		else if (strncmp(s1, "device=", 7) == 0) {
+			devname = s1 + 7;
+			/* terminate options right before device.  device
+			   must be the last one. */
+			*s1 = '\0';
+			break;
+		}
+
+		/* Find next opt */
+		s2 = strchr(s1, ',');
+		if (s2 == NULL) {
+			if (clear)
+				*s1 = '\0';
+			break;
+		}
+		s2++;
+		if (clear)
+			memmove(s1, s2, strlen(s2) + 1);
+		else
+			s1 = s2;
+	}
+
+	if (!devname) {
+		LCONSOLE_ERROR_MSG(0x164, "Can't find the device name "
+				   "(need mount option 'device=...')\n");
+		goto invalid;
+	}
+
+	s1 = strstr(devname, ":/");
+	if (s1) {
+		++s1;
+		lmd->lmd_flags |= LMD_FLG_CLIENT;
+		/* Remove leading /s from fsname */
+		while (*++s1 == '/') ;
+		/* Freed in lustre_free_lsi */
+		OBD_ALLOC(lmd->lmd_profile, strlen(s1) + 8);
+		if (!lmd->lmd_profile)
+			RETURN(-ENOMEM);
+		sprintf(lmd->lmd_profile, "%s-client", s1);
+	}
+
+	/* Freed in lustre_free_lsi */
+	OBD_ALLOC(lmd->lmd_dev, strlen(devname) + 1);
+	if (!lmd->lmd_dev)
+		RETURN(-ENOMEM);
+	strcpy(lmd->lmd_dev, devname);
+
+	/* Save mount options */
+	s1 = options + strlen(options) - 1;
+	while (s1 >= options && (*s1 == ',' || *s1 == ' '))
+		*s1-- = 0;
+	if (*options != 0) {
+		/* Freed in lustre_free_lsi */
+		OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1);
+		if (!lmd->lmd_opts)
+			RETURN(-ENOMEM);
+		strcpy(lmd->lmd_opts, options);
+	}
+
+	lmd_print(lmd);
+	lmd->lmd_magic = LMD_MAGIC;
+
+	RETURN(rc);
+
+invalid:
+	CERROR("Bad mount options %s\n", options);
+	RETURN(-EINVAL);
+}
+
+struct lustre_mount_data2 {
+	void *lmd2_data;		/* raw mount data; parsed by lmd_parse() */
+	struct vfsmount *lmd2_mnt;	/* handed to client_fill_super() */
+};
+
+/** This is the entry point for the mount call into Lustre.
+ * This is called when a server or client is mounted,
+ * and this is where we start setting things up.
+ * @param sb superblock being filled in
+ * @param data struct lustre_mount_data2 wrapping the raw mount options
+ *	       (e.g. -o flock,abort_recov)
+ * @param silent not used here
+ * @return 0 on success, negative errno on failure
+ */
+int lustre_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct lustre_mount_data2 *lmd2 = data;
+	struct lustre_mount_data *lmd;
+	struct lustre_sb_info *lsi;
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb);
+
+	lsi = lustre_init_lsi(sb);
+	if (!lsi)
+		RETURN(-ENOMEM);
+	lmd = lsi->lsi_lmd;
+
+	/*
+	 * Disable lockdep during mount, because mount locking patterns are
+	 * `special'.
+	 */
+	lockdep_off();
+
+	/*
+	 * LU-639: the obd cleanup of last mount may not finish yet, wait here.
+	 */
+	obd_zombie_barrier();
+
+	/* Figure out the lmd from the mount options */
+	if (lmd_parse((char *)(lmd2->lmd2_data), lmd)) {
+		lustre_put_lsi(sb);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	if (!lmd_is_client(lmd)) {
+		CERROR("This is client-side-only module, "
+		       "cannot handle server mount.\n");
+		GOTO(out, rc = -EINVAL);
+	}
+
+	CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile);
+	if (!client_fill_super) {
+		LCONSOLE_ERROR_MSG(0x165, "Nothing registered for "
+				   "client mount! Is the 'lustre' "
+				   "module loaded?\n");
+		lustre_put_lsi(sb);
+		GOTO(out, rc = -ENODEV);
+	}
+
+	rc = lustre_start_mgc(sb);
+	if (rc) {
+		lustre_put_lsi(sb);
+		GOTO(out, rc);
+	}
+
+	/* Connect and start (should always be ll_fill_super); c_f_s will call
+	 * lustre_common_put_super on failure, so if an error happens in that
+	 * call @lsi is killed there.  This is why we do not put it here. */
+	rc = (*client_fill_super)(sb, lmd2->lmd2_mnt);
+	GOTO(out, rc);
+out:
+	if (rc != 0)
+		CERROR("Unable to mount %s (%d)\n",
+		       s2lsi(sb) ? lmd->lmd_dev : "", rc);
+	else
+		CDEBUG(D_SUPER, "Mount %s complete\n", lmd->lmd_dev);
+	lockdep_on();
+	return rc;
+}
+
+
+/* Record the client fill_super hook that lustre_fill_super() calls.  We can't
+   call ll_fill_super by name: its module must be loaded after this one. */
+void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb,
+						  struct vfsmount *mnt))
+{
+	client_fill_super = cfs;
+}
+EXPORT_SYMBOL(lustre_register_client_fill_super);
+
+void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb))
+{
+	kill_super_cb = cfs;	/* run by lustre_kill_super() on non-server sbs */
+}
+EXPORT_SYMBOL(lustre_register_kill_super_cb);
+
+/***************** FS registration ******************/
+struct dentry *lustre_mount(struct file_system_type *fs_type, int flags,
+				const char *devname, void *data)
+{
+	struct lustre_mount_data2 lmd2 = { data, NULL };
+	/* lustre_fill_super() gets &lmd2 as its data; @devname is unused */
+	return mount_nodev(fs_type, flags, &lmd2, lustre_fill_super);
+}
+
+/* Run the client kill callback (non-server sb only), then kill_anon_super(). */
+void lustre_kill_super(struct super_block *sb)
+{
+	struct lustre_sb_info *sbi = s2lsi(sb);
+
+	if (sbi != NULL && !IS_SERVER(sbi) && kill_super_cb != NULL)
+		kill_super_cb(sb);
+	kill_anon_super(sb);
+}
+
+/** Filesystem type descriptor for "lustre"; registered and unregistered by
+ * lustre_register_fs() / lustre_unregister_fs(). */
+struct file_system_type lustre_fs_type = {
+	.owner	= THIS_MODULE,
+	.name	 = "lustre",
+	.mount	= lustre_mount,
+	.kill_sb      = lustre_kill_super,
+	.fs_flags     = FS_BINARY_MOUNTDATA | FS_REQUIRES_DEV |
+			FS_HAS_FIEMAP | FS_RENAME_DOES_D_MOVE,
+};
+
+int lustre_register_fs(void)
+{
+	return register_filesystem(&lustre_fs_type); /* result passed through */
+}
+
+int lustre_unregister_fs(void)
+{
+	return unregister_filesystem(&lustre_fs_type); /* result passed through */
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/obd_mount_server.c b/drivers/staging/lustre/lustre/obdclass/obd_mount_server.c
new file mode 100644
index 000000000000..a3a44091c433
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/obd_mount_server.c
@@ -0,0 +1,1783 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/obd_mount_server.c
+ *
+ * Server mount routines
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#define D_MOUNT (D_SUPER | D_CONFIG /* | D_WARNING */)
+#define PRINT_CMD CDEBUG
+#define PRINT_MASK (D_SUPER | D_CONFIG)
+
+#include <obd.h>
+#include <lvfs.h>
+#include <lustre_fsfilt.h>
+#include <obd_class.h>
+#include <lustre/lustre_user.h>
+#include <linux/version.h>
+#include <lustre_log.h>
+#include <lustre_disk.h>
+#include <lustre_param.h>
+
+/*********** mount lookup *********/
+
+DEFINE_MUTEX(lustre_mount_info_lock);	/* held around mount-list lookups */
+static LIST_HEAD(server_mount_info_list); /* of struct lustre_mount_info */
+
+/* Return the registered mount whose lmi_name equals @name, or NULL.  Does no
+ * locking itself. */
+static struct lustre_mount_info *server_find_mount(const char *name)
+{
+	struct lustre_mount_info *entry;
+	ENTRY;
+
+	list_for_each_entry(entry, &server_mount_info_list, lmi_list_chain) {
+		if (strcmp(name, entry->lmi_name) != 0)
+			continue;
+		RETURN(entry);
+	}
+	RETURN(NULL);
+}
+
+/* An obd must register its mount under the obd name before its setup routine
+ * runs: _setup calls lustre_get_mount to look the mnt struct up by obd_name,
+ * since the pointer can't be passed to setup directly. */
+static int server_register_mount(const char *name, struct super_block *sb,
+				 struct vfsmount *mnt)
+{
+	struct lustre_mount_info *info;
+	char *name_copy;
+	size_t name_size;
+	ENTRY;
+
+	LASSERT(sb);
+	name_size = strlen(name) + 1;
+
+	OBD_ALLOC(info, sizeof(*info));
+	if (!info)
+		RETURN(-ENOMEM);
+	OBD_ALLOC(name_copy, name_size);
+	if (!name_copy) {
+		OBD_FREE(info, sizeof(*info));
+		RETURN(-ENOMEM);
+	}
+	memcpy(name_copy, name, name_size);
+
+	mutex_lock(&lustre_mount_info_lock);
+	if (server_find_mount(name) != NULL) {
+		mutex_unlock(&lustre_mount_info_lock);
+		OBD_FREE(info, sizeof(*info));
+		OBD_FREE(name_copy, name_size);
+		CERROR("Already registered %s\n", name);
+		RETURN(-EEXIST);
+	}
+	info->lmi_name = name_copy;
+	info->lmi_sb = sb;
+	info->lmi_mnt = mnt;
+	list_add(&info->lmi_list_chain, &server_mount_info_list);
+	mutex_unlock(&lustre_mount_info_lock);
+
+	CDEBUG(D_MOUNT, "reg_mnt %p from %s\n", info->lmi_mnt, name);
+
+	RETURN(0);
+}
+
+/* when an obd no longer needs a mount: unlink and free the entry registered
+ * under @name; returns -ENOENT if there is none */
+static int server_deregister_mount(const char *name)
+{
+	struct lustre_mount_info *info;
+	ENTRY;
+
+	mutex_lock(&lustre_mount_info_lock);
+	info = server_find_mount(name);
+	if (info == NULL) {
+		mutex_unlock(&lustre_mount_info_lock);
+		CERROR("%s not registered\n", name);
+		RETURN(-ENOENT);
+	}
+
+	CDEBUG(D_MOUNT, "dereg_mnt %p from %s\n", info->lmi_mnt, name);
+	list_del(&info->lmi_list_chain);
+	OBD_FREE(info->lmi_name, strlen(info->lmi_name) + 1);
+	OBD_FREE(info, sizeof(*info));
+	mutex_unlock(&lustre_mount_info_lock);
+
+	RETURN(0);
+}
+
+/* Look up the mount an obd registered under its obdname and bump lsi_mounts
+ * on the owning lsi.  Meant for initial obd setup only; it should not be
+ * called every time you want to mntget. */
+struct lustre_mount_info *server_get_mount(const char *name)
+{
+	struct lustre_mount_info *found;
+	struct lustre_sb_info *sbi;
+	ENTRY;
+
+	mutex_lock(&lustre_mount_info_lock);
+	found = server_find_mount(name);
+	mutex_unlock(&lustre_mount_info_lock);
+
+	if (found == NULL) {
+		CERROR("Can't find mount for %s\n", name);
+		RETURN(NULL);
+	}
+
+	sbi = s2lsi(found->lmi_sb);
+	atomic_inc(&sbi->lsi_mounts);
+	CDEBUG(D_MOUNT, "get_mnt %p from %s, refs=%d\n", found->lmi_mnt,
+	       name, atomic_read(&sbi->lsi_mounts));
+
+	RETURN(found);
+}
+EXPORT_SYMBOL(server_get_mount);
+
+/*
+ * Used by mdt to get mount_info from obdname.
+ * Only looks the entry up; lsi_mounts is left untouched here.
+ * Do not use server_get_mount for this purpose.
+ */
+struct lustre_mount_info *server_get_mount_2(const char *name)
+{
+	struct lustre_mount_info *found;
+	ENTRY;
+
+	mutex_lock(&lustre_mount_info_lock);
+	found = server_find_mount(name);
+	mutex_unlock(&lustre_mount_info_lock);
+
+	if (found == NULL)
+		CERROR("Can't find mount for %s\n", name);
+	RETURN(found);
+}
+EXPORT_SYMBOL(server_get_mount_2);
+
+/* to be called from obd_cleanup methods: calls lustre_put_lsi() on the sb
+ * registered as @name and removes the registration (@mnt is not used) */
+int server_put_mount(const char *name, struct vfsmount *mnt)
+{
+	struct lustre_mount_info *found;
+	struct lustre_sb_info *sbi;
+	ENTRY;
+
+	mutex_lock(&lustre_mount_info_lock);
+	found = server_find_mount(name);
+	mutex_unlock(&lustre_mount_info_lock);
+	if (found == NULL) {
+		CERROR("Can't find mount for %s\n", name);
+		RETURN(-ENOENT);
+	}
+
+	sbi = s2lsi(found->lmi_sb);
+	CDEBUG(D_MOUNT, "put_mnt %p from %s, refs=%d\n",
+	       found->lmi_mnt, name, atomic_read(&sbi->lsi_mounts));
+
+	if (lustre_put_lsi(found->lmi_sb) != 0)
+		CDEBUG(D_MOUNT, "Last put of mnt %p from %s\n",
+		       found->lmi_mnt, name);
+
+	/* this obd should never need the mount again */
+	server_deregister_mount(name);
+	RETURN(0);
+}
+EXPORT_SYMBOL(server_put_mount);
+
+/* Corresponding to server_get_mount_2: nothing to release, returns 0 */
+int server_put_mount_2(const char *name, struct vfsmount *mnt)
+{
+	ENTRY;
+	RETURN(0);
+}
+EXPORT_SYMBOL(server_put_mount_2);
+
+/* Set up a MGS to serve startup logs: register @sb's server mount under
+ * LUSTRE_MGS_OBDNAME, then start the MGS obd.  Returns 0, -EALREADY when an
+ * MGS mount is registered already, or the error of the failing step. */
+static int server_start_mgs(struct super_block *sb)
+{
+	struct lustre_sb_info *sbi = s2lsi(sb);
+	struct vfsmount *mnt = sbi->lsi_srv_mnt;
+	int rc;
+	ENTRY;
+
+	/* It is impossible to have more than 1 MGS per node, since
+	   MGC wouldn't know which to connect to */
+	if (server_find_mount(LUSTRE_MGS_OBDNAME) != NULL) {
+		LCONSOLE_ERROR_MSG(0x15d, "The MGS service was already started"
+				   " from server\n");
+		RETURN(-EALREADY);
+	}
+
+	CDEBUG(D_CONFIG, "Start MGS service %s\n", LUSTRE_MGS_OBDNAME);
+
+	rc = server_register_mount(LUSTRE_MGS_OBDNAME, sb, mnt);
+
+	if (rc == 0) {
+		/* Do NOT call server_deregister_mount() if this fails. This
+		 * leads to inability cleanup cleanly and free lsi and other
+		 * stuff when mgs calls server_put_mount() in error handling
+		 * case. -umka */
+		rc = lustre_start_simple(LUSTRE_MGS_OBDNAME, LUSTRE_MGS_NAME,
+					 LUSTRE_MGS_OBDNAME, 0, 0,
+					 sbi->lsi_osd_obdname, 0);
+	}
+
+	if (rc != 0)
+		LCONSOLE_ERROR_MSG(0x15e, "Failed to start MGS '%s' (%d). "
+				   "Is the 'mgs' module loaded?\n",
+				   LUSTRE_MGS_OBDNAME, rc);
+	RETURN(rc);
+}
+
+/* Force-stop the MGS obd (there better be only one); -EALREADY if none runs */
+static int server_stop_mgs(struct super_block *sb)
+{
+	struct obd_device *mgs;
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_MOUNT, "Stop MGS service %s\n", LUSTRE_MGS_OBDNAME);
+
+	mgs = class_name2obd(LUSTRE_MGS_OBDNAME);
+	if (mgs == NULL) {
+		CDEBUG(D_CONFIG, "mgs %s not running\n", LUSTRE_MGS_OBDNAME);
+		RETURN(-EALREADY);
+	}
+
+	/* The MGS should always stop when we say so */
+	mgs->obd_force = 1;
+	rc = class_manual_cleanup(mgs);
+	RETURN(rc);
+}
+
+/* Since there's only one mgc per node, we have to change its fs to get
+   access to the right disk: hand @sb to the mgc under KEY_SET_FS. */
+static int server_mgc_set_fs(struct obd_device *mgc, struct super_block *sb)
+{
+	struct lustre_sb_info *sbi = s2lsi(sb);
+	int status;
+	ENTRY;
+
+	CDEBUG(D_MOUNT, "Set mgc disk for %s\n", sbi->lsi_lmd->lmd_dev);
+
+	/* cl_mgc_sem in mgc insures we sleep if the mgc_fs is busy */
+	status = obd_set_info_async(NULL, mgc->obd_self_export,
+				    sizeof(KEY_SET_FS), KEY_SET_FS,
+				    sizeof(*sb), sb, NULL);
+	if (status)
+		CERROR("can't set_fs %d\n", status);
+
+	RETURN(status);
+}
+
+/* Unassign the mgc disk: send KEY_CLEAR_FS with no value to the mgc. */
+static int server_mgc_clear_fs(struct obd_device *mgc)
+{
+	int status;
+	ENTRY;
+
+	CDEBUG(D_MOUNT, "Unassign mgc disk\n");
+	status = obd_set_info_async(NULL, mgc->obd_self_export,
+				    sizeof(KEY_CLEAR_FS), KEY_CLEAR_FS,
+				    0, NULL, NULL);
+	RETURN(status);
+}
+
+/* Return 1 if @devname ends in "-mdc", 0 otherwise. */
+static int is_mdc_device(const char *devname)
+{
+	const char *dash = strrchr(devname, '-');
+
+	if (dash == NULL)
+		return 0;
+
+	return strcmp(dash, "-mdc") == 0;
+}
+
+/* Return non-zero only when server_name2index() classifies @tgtname as an MDT
+ * and reports index 0 for it. */
+static inline int tgt_is_mdt0(const char *tgtname)
+{
+	__u32 index;
+
+	if (server_name2index(tgtname, &index, NULL) != LDD_F_SV_TYPE_MDT)
+		return 0;
+
+	return index == 0;
+}
+
+/* Return 1 if @devname is "<MDT index 0 target>-mdc".  NOTE: the string is cut
+ * at its last '-' and restored again, so it must live in writable memory. */
+static inline int is_mdc_for_mdt0(const char *devname)
+{
+	char   *dash;
+	int	is_mdt0;
+
+	if (!is_mdc_device(devname))
+		return 0;
+
+	dash = strrchr(devname, '-');
+	if (dash == NULL)
+		return 0;
+
+	*dash = '\0';
+	is_mdt0 = tgt_is_mdt0(devname);
+	*dash = '-';
+	return is_mdt0 ? 1 : 0;
+}
+
+/**
+ * Build the lwp name for target @svname into @lwpname:
+ * fsname-OSTxxxx (or -MDTxxxx) becomes fsname-MDT0000-lwp-OSTxxxx.
+ **/
+int tgt_name2lwpname(const char *svname, char *lwpname)
+{
+	const char	*suffix;
+	char		*fs;
+	int		 rc;
+	ENTRY;
+
+	OBD_ALLOC(fs, MTI_NAME_MAXLEN);
+	if (!fs)
+		RETURN(-ENOMEM);
+
+	rc = server_name2fsname(svname, fs, &suffix);
+	if (rc) {
+		CERROR("%s: failed to get fsname from svname. %d\n",
+		       svname, rc);
+		GOTO(cleanup, rc);
+	}
+
+	/* the target part must follow a '-' or ':' separator */
+	if (!(*suffix == '-' || *suffix == ':')) {
+		CERROR("%s: invalid svname name!\n", svname);
+		GOTO(cleanup, rc = -EINVAL);
+	}
+
+	suffix++;
+	if (strncmp(suffix, "OST", 3) && strncmp(suffix, "MDT", 3)) {
+		CERROR("%s is not an OST or MDT target!\n", svname);
+		GOTO(cleanup, rc = -EINVAL);
+	}
+	sprintf(lwpname, "%s-MDT0000-%s-%s", fs, LUSTRE_LWP_NAME, suffix);
+cleanup:
+	OBD_FREE(fs, MTI_NAME_MAXLEN);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(tgt_name2lwpname);
+
+static LIST_HEAD(lwp_register_list);	/* of struct lwp_register_item */
+DEFINE_MUTEX(lwp_register_list_lock);	/* held around lwp_register_list use */
+
+/* Queue @exp for the export of lwp device @lwpname.  If that device is set up
+ * already, look its export up now and run @cb_func(@cb_data) when found. */
+int lustre_register_lwp_item(const char *lwpname, struct obd_export **exp,
+			     register_lwp_cb cb_func, void *cb_data)
+{
+	struct lwp_register_item *item;
+	struct obd_uuid		 *key;
+	struct obd_device	 *dev;
+	size_t			  namelen;
+	ENTRY;
+
+	LASSERTF(strlen(lwpname) < MTI_NAME_MAXLEN, "lwpname is too long %s\n",
+		 lwpname);
+	LASSERT(exp != NULL && *exp == NULL);
+
+	OBD_ALLOC_PTR(item);
+	if (item == NULL)
+		RETURN(-ENOMEM);
+
+	namelen = strlen(lwpname);
+	memcpy(item->lri_name, lwpname, namelen);
+	item->lri_exp = exp;
+	item->lri_cb_func = cb_func;
+	item->lri_cb_data = cb_data;
+	INIT_LIST_HEAD(&item->lri_list);
+
+	mutex_lock(&lwp_register_list_lock);
+	dev = class_name2obd(lwpname);
+	if (dev != NULL && dev->obd_set_up == 1) {
+		OBD_ALLOC_PTR(key);
+		if (key == NULL) {
+			mutex_unlock(&lwp_register_list_lock);
+			OBD_FREE_PTR(item);
+			RETURN(-ENOMEM);
+		}
+		memcpy(key->uuid, lwpname, namelen);
+		*exp = cfs_hash_lookup(dev->obd_uuid_hash, key);
+		OBD_FREE_PTR(key);
+	}
+	list_add(&item->lri_list, &lwp_register_list);
+	if (*exp != NULL && cb_func != NULL)
+		cb_func(cb_data);
+	mutex_unlock(&lwp_register_list_lock);
+	RETURN(0);
+}
+EXPORT_SYMBOL(lustre_register_lwp_item);
+
+/* Drop the registration made for @exp, putting the export it holds, if any. */
+void lustre_deregister_lwp_item(struct obd_export **exp)
+{
+	struct lwp_register_item *item, *next;
+	mutex_lock(&lwp_register_list_lock);
+	list_for_each_entry_safe(item, next, &lwp_register_list, lri_list) {
+		if (item->lri_exp != exp)
+			continue;
+		if (*exp != NULL)
+			class_export_put(*exp);
+		list_del(&item->lri_list);
+		OBD_FREE_PTR(item);
+		break;
+	}
+	mutex_unlock(&lwp_register_list_lock);
+}
+EXPORT_SYMBOL(lustre_deregister_lwp_item);
+
+/* Fill every still-empty registration for @exp's obd and run its callback. */
+static void lustre_notify_lwp_list(struct obd_export *exp)
+{
+	struct lwp_register_item *item, *next;
+	LASSERT(exp != NULL);
+
+	mutex_lock(&lwp_register_list_lock);
+	list_for_each_entry_safe(item, next, &lwp_register_list, lri_list) {
+		if (strcmp(exp->exp_obd->obd_name, item->lri_name) != 0 ||
+		    *item->lri_exp != NULL)
+			continue;
+		*item->lri_exp = class_export_get(exp);
+		if (item->lri_cb_func != NULL)
+			item->lri_cb_func(item->lri_cb_data);
+	}
+	mutex_unlock(&lwp_register_list_lock);
+}
+
+/* Connect to @lwp using its obd name as the client uuid, then hand the new
+ * export to the registered lwp items.  Returns 0 or a negative error code. */
+static int lustre_lwp_connect(struct obd_device *lwp)
+{
+	struct lu_env		 env;
+	struct lu_context	 session_ctx;
+	struct obd_export	*exp;
+	struct obd_uuid		*uuid = NULL;
+	struct obd_connect_data	*data = NULL;
+	int			 rc;
+	ENTRY;
+
+	/* log has been fully processed, let clients connect */
+	rc = lu_env_init(&env, lwp->obd_lu_dev->ld_type->ldt_ctx_tags);
+	if (rc != 0)
+		RETURN(rc);
+
+	lu_context_init(&session_ctx, LCT_SESSION);
+	session_ctx.lc_thread = NULL;
+	lu_context_enter(&session_ctx);
+	env.le_ses = &session_ctx;
+
+	OBD_ALLOC_PTR(data);
+	if (data == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	data->ocd_version = LUSTRE_VERSION_CODE;
+	data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX |
+		OBD_CONNECT_MDS_MDS | OBD_CONNECT_FID | OBD_CONNECT_AT |
+		OBD_CONNECT_LRU_RESIZE | OBD_CONNECT_FULL20 |
+		OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LIGHTWEIGHT;
+	OBD_ALLOC_PTR(uuid);
+	if (uuid == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	/* ">=", not ">": the copied name must leave room for a trailing NUL */
+	if (strlen(lwp->obd_name) >= sizeof(uuid->uuid)) {
+		CERROR("%s: Too long lwp name %s, max_size is %d\n",
+		       lwp->obd_name, lwp->obd_name, (int)sizeof(uuid->uuid));
+		GOTO(out, rc = -EINVAL);
+	}
+
+	/* Use lwp name as the uuid, so we find the export by lwp name later */
+	memcpy(uuid->uuid, lwp->obd_name, strlen(lwp->obd_name));
+	rc = obd_connect(&env, &exp, lwp, uuid, data, NULL);
+	if (rc != 0)
+		CERROR("%s: connect failed: rc = %d\n", lwp->obd_name, rc);
+	else
+		lustre_notify_lwp_list(exp);
+out:
+	if (data != NULL)
+		OBD_FREE_PTR(data);
+	if (uuid != NULL)
+		OBD_FREE_PTR(uuid);
+
+	lu_env_fini(&env);
+	lu_context_exit(&session_ctx);
+	lu_context_fini(&session_ctx);
+	RETURN(rc);
+}
+
+/**
+ * lwp is used by slaves (Non-MDT0 targets) to manage the connection
+ * to MDT0: add the uuid from @lcfg, start the lwp obd and connect it.
+ **/
+static int lustre_lwp_setup(struct lustre_cfg *lcfg, struct lustre_sb_info *lsi)
+{
+	struct obd_device	*obd;
+	char			*lwpname = NULL;
+	char			*lwpuuid = NULL;
+	int			 rc;
+	ENTRY;
+
+	rc = class_add_uuid(lustre_cfg_string(lcfg, 1), lcfg->lcfg_nid);
+	if (rc) {
+		CERROR("%s: Can't add uuid: rc =%d\n", lsi->lsi_svname, rc);
+		GOTO(out, rc);
+	}
+
+	OBD_ALLOC(lwpname, MTI_NAME_MAXLEN);
+	if (lwpname == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	rc = tgt_name2lwpname(lsi->lsi_svname, lwpname);
+	if (rc != 0) {
+		CERROR("%s: failed to generate lwp name. %d\n",
+		       lsi->lsi_svname, rc);
+		GOTO(out, rc);
+	}
+
+	OBD_ALLOC(lwpuuid, MTI_NAME_MAXLEN);
+	if (lwpuuid == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	/* "_UUID" may not fit after a long lwp name: refuse, don't overflow */
+	if (snprintf(lwpuuid, MTI_NAME_MAXLEN, "%s_UUID", lwpname) >=
+	    MTI_NAME_MAXLEN) {
+		CERROR("%s: lwp uuid is too long\n", lwpname);
+		GOTO(out, rc = -EINVAL);
+	}
+	rc = lustre_start_simple(lwpname, LUSTRE_LWP_NAME, lwpuuid,
+				 lustre_cfg_string(lcfg, 1), 0, 0, 0);
+	if (rc) {
+		CERROR("%s: setup up failed: rc %d\n", lwpname, rc);
+		GOTO(out, rc);
+	}
+
+	obd = class_name2obd(lwpname);
+	LASSERT(obd != NULL);
+
+	rc = lustre_lwp_connect(obd);
+	if (rc != 0)
+		CERROR("%s: connect failed: rc = %d\n", lwpname, rc);
+out:
+	if (lwpname != NULL)
+		OBD_FREE(lwpname, MTI_NAME_MAXLEN);
+	if (lwpuuid != NULL)
+		OBD_FREE(lwpuuid, MTI_NAME_MAXLEN);
+
+	RETURN(rc);
+}
+
+/* Find the lwp obd for @lsi; the caller is responsible for memory free */
+static struct obd_device *lustre_find_lwp(struct lustre_sb_info *lsi,
+					  char **lwpname, char **logname)
+{
+	struct obd_device	*lwp;
+	int			 rc = 0;
+	ENTRY;
+	/* *lwpname (and *logname, when asked for) are MTI_NAME_MAXLEN buffers */
+	LASSERT(lwpname != NULL);
+	LASSERT(IS_OST(lsi) || IS_MDT(lsi));
+
+	OBD_ALLOC(*lwpname, MTI_NAME_MAXLEN);
+	if (*lwpname == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	if (logname != NULL) {
+		OBD_ALLOC(*logname, MTI_NAME_MAXLEN);
+		if (*logname == NULL)
+			GOTO(out, rc = -ENOMEM);
+		rc = server_name2fsname(lsi->lsi_svname, *lwpname, NULL);
+		if (rc != 0) {
+			CERROR("%s: failed to get fsname from svname. %d\n",
+			       lsi->lsi_svname, rc);
+			GOTO(out, rc = -EINVAL);
+		}
+		sprintf(*logname, "%s-client", *lwpname);
+	}
+	/* *lwpname held the bare fsname above; now it gets the real lwp name */
+	rc = tgt_name2lwpname(lsi->lsi_svname, *lwpname);
+	if (rc != 0) {
+		CERROR("%s: failed to generate lwp name. %d\n",
+		       lsi->lsi_svname, rc);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	lwp = class_name2obd(*lwpname);
+
+out:
+	if (rc != 0) {
+		if (*lwpname != NULL) {
+			OBD_FREE(*lwpname, MTI_NAME_MAXLEN);
+			*lwpname = NULL;
+		}
+		if (logname != NULL && *logname != NULL) {
+			OBD_FREE(*logname, MTI_NAME_MAXLEN);
+			*logname = NULL;
+		}
+		lwp = ERR_PTR(rc);
+	}
+	/* -ENOENT leaves the name buffers allocated for the caller to free */
+	RETURN(lwp != NULL ? lwp : ERR_PTR(-ENOENT));
+}
+
+/* Add the connection named by buffer 1 of @cfg to this target's lwp device.
+ * Returns 0 or a negative error code. */
+static int lustre_lwp_add_conn(struct lustre_cfg *cfg,
+			       struct lustre_sb_info *lsi)
+{
+	struct lustre_cfg_bufs *bufs = NULL;
+	struct lustre_cfg      *lcfg = NULL;
+	char		       *lwpname = NULL;
+	struct obd_device      *lwp;
+	int			rc;
+	ENTRY;
+
+	lwp = lustre_find_lwp(lsi, &lwpname, NULL);
+	if (IS_ERR(lwp)) {
+		CERROR("%s: can't find lwp device.\n", lsi->lsi_svname);
+		GOTO(out, rc = PTR_ERR(lwp));
+	}
+	LASSERT(lwpname != NULL);
+
+	OBD_ALLOC_PTR(bufs);
+	if (bufs == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	lustre_cfg_bufs_reset(bufs, lwpname);
+	lustre_cfg_bufs_set_string(bufs, 1, lustre_cfg_string(cfg, 1));
+	lcfg = lustre_cfg_new(LCFG_ADD_CONN, bufs);
+	if (lcfg == NULL || IS_ERR(lcfg)) /* failure may be NULL or ERR_PTR */
+		GOTO(out, rc = lcfg != NULL ? PTR_ERR(lcfg) : -ENOMEM);
+	rc = class_add_conn(lwp, lcfg);
+	if (rc)
+		CERROR("%s: can't add conn: rc = %d\n", lwpname, rc);
+out:
+	if (bufs != NULL)
+		OBD_FREE_PTR(bufs);
+	if (lcfg != NULL && !IS_ERR(lcfg))
+		lustre_cfg_free(lcfg);
+	if (lwpname != NULL)
+		OBD_FREE(lwpname, MTI_NAME_MAXLEN);
+	RETURN(rc);
+}
+
+/**
+ * Handle one config llog record: retrieve MDT0 nids from the client log, then
+ * start the lwp device.  Only two scenarios would include the mdt nid.
+ * 1.
+ * marker   5 (flags=0x01, v2.1.54.0) lustre-MDT0000  'add mdc' xxx-
+ * add_uuid  nid=192.168.122.162@tcp(0x20000c0a87aa2)  0:  1:192.168.122.162@tcp
+ * attach    0:lustre-MDT0000-mdc  1:mdc  2:lustre-clilmv_UUID
+ * setup     0:lustre-MDT0000-mdc  1:lustre-MDT0000_UUID  2:192.168.122.162@tcp
+ * add_uuid  nid=192.168.172.1@tcp(0x20000c0a8ac01)  0:  1:192.168.172.1@tcp
+ * add_conn  0:lustre-MDT0000-mdc  1:192.168.172.1@tcp
+ * modify_mdc_tgts add 0:lustre-clilmv  1:lustre-MDT0000_UUID xxxx
+ * marker   5 (flags=0x02, v2.1.54.0) lustre-MDT0000  'add mdc' xxxx-
+ * 2.
+ * marker   7 (flags=0x01, v2.1.54.0) lustre-MDT0000  'add failnid' xxxx-
+ * add_uuid  nid=192.168.122.2@tcp(0x20000c0a87a02)  0:  1:192.168.122.2@tcp
+ * add_conn  0:lustre-MDT0000-mdc  1:192.168.122.2@tcp
+ * marker   7 (flags=0x02, v2.1.54.0) lustre-MDT0000  'add failnid' xxxx-
+ **/
+static int client_lwp_config_process(const struct lu_env *env,
+				     struct llog_handle *handle,
+				     struct llog_rec_hdr *rec, void *data)
+{
+	struct config_llog_instance *clli = data;
+	int			     cfg_len = rec->lrh_len;
+	char			    *cfg_buf = (char *) (rec + 1);
+	struct lustre_cfg	    *lcfg = NULL;
+	struct lustre_sb_info	    *lsi;
+	int			     rc = 0, swab = 0;
+	ENTRY;
+
+	if (rec->lrh_type != OBD_CFG_REC) {
+		CERROR("Unknown llog record type %#x encountered\n",
+		       rec->lrh_type);
+		RETURN(-EINVAL);
+	}
+
+	LASSERT(clli->cfg_sb != NULL);
+	lsi = s2lsi(clli->cfg_sb);
+	/* the payload follows the record header; swab it if byte-reversed */
+	lcfg = (struct lustre_cfg *)cfg_buf;
+	if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) {
+		lustre_swab_lustre_cfg(lcfg);
+		swab = 1;
+	}
+
+	rc = lustre_cfg_sanity_check(cfg_buf, cfg_len);
+	if (rc)
+		GOTO(out, rc);
+
+	switch (lcfg->lcfg_command) {
+	case LCFG_MARKER: {
+		struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1);
+
+		lustre_swab_cfg_marker(marker, swab,
+				       LUSTRE_CFG_BUFLEN(lcfg, 1));
+		if (marker->cm_flags & CM_SKIP ||
+		    marker->cm_flags & CM_EXCLUDE)
+			GOTO(out, rc = 0);
+		/* only sections whose target is MDT index 0 are of interest */
+		if (!tgt_is_mdt0(marker->cm_tgtname))
+			GOTO(out, rc = 0);
+		/* cfg_flags remembers which marked section we are inside */
+		if (!strncmp(marker->cm_comment, "add mdc", 7) ||
+		    !strncmp(marker->cm_comment, "add failnid", 11)) {
+			if (marker->cm_flags & CM_START) {
+				clli->cfg_flags = CFG_F_MARKER;
+				/* This hack is to differentiate whether the
+				 * ADD_UUID comes from an "add mdc" record
+				 * or from an "add failnid" record. */
+				if (!strncmp(marker->cm_comment,
+					     "add failnid", 11))
+					clli->cfg_flags |= CFG_F_SKIP;
+			} else if (marker->cm_flags & CM_END) {
+				clli->cfg_flags = 0;
+			}
+		}
+		break;
+	}
+	case LCFG_ADD_UUID: {
+		if (clli->cfg_flags == CFG_F_MARKER) {
+			rc = lustre_lwp_setup(lcfg, lsi);
+			/* XXX: process only the first nid as
+			 * we don't need another instance of lwp */
+			clli->cfg_flags |= CFG_F_SKIP;
+		} else if (clli->cfg_flags == (CFG_F_MARKER | CFG_F_SKIP)) {
+			rc = class_add_uuid(lustre_cfg_string(lcfg, 1),
+					    lcfg->lcfg_nid);
+			if (rc)
+				CERROR("%s: Fail to add uuid, rc:%d\n",
+				       lsi->lsi_svname, rc);
+		}
+		break;
+	}
+	case LCFG_ADD_CONN: {
+		if (is_mdc_for_mdt0(lustre_cfg_string(lcfg, 0)))
+			rc = lustre_lwp_add_conn(lcfg, lsi);
+		break;
+	}
+	default:
+		break;
+	}
+out:
+	RETURN(rc);
+}
+
+/* End the lwp config log first, then send LCFG_CLEANUP to the lwp device. */
+static int lustre_disconnect_lwp(struct super_block *sb)
+{
+	struct lustre_sb_info		*lsi = s2lsi(sb);
+	struct config_llog_instance	*cfg = NULL;
+	struct lustre_cfg_bufs		*bufs = NULL;
+	struct lustre_cfg		*lcfg = NULL;
+	struct obd_device		*lwp;
+	char				*lwpname = NULL;
+	char				*logname = NULL;
+	int				 rc;
+	ENTRY;
+
+	lwp = lustre_find_lwp(lsi, &lwpname, &logname);
+	if (IS_ERR(lwp) && PTR_ERR(lwp) != -ENOENT)
+		GOTO(out, rc = PTR_ERR(lwp));
+
+	LASSERT(lwpname != NULL);
+	LASSERT(logname != NULL);
+
+	OBD_ALLOC_PTR(cfg);
+	if (!cfg)
+		GOTO(out, rc = -ENOMEM);
+
+	cfg->cfg_instance = sb;
+	rc = lustre_end_log(sb, logname, cfg);
+	if (rc != 0) {
+		CERROR("%s: Can't end config log %s.\n", lwpname, logname);
+		GOTO(out, rc);
+	}
+
+	if (IS_ERR(lwp)) {
+		CDEBUG(D_CONFIG, "%s: lwp device wasn't started.\n",
+		       lsi->lsi_svname);
+		GOTO(out, rc = 0);
+	}
+
+	OBD_ALLOC_PTR(bufs);
+	if (!bufs)
+		GOTO(out, rc = -ENOMEM);
+
+	lustre_cfg_bufs_reset(bufs, lwp->obd_name);
+	lustre_cfg_bufs_set_string(bufs, 1, NULL);
+	lcfg = lustre_cfg_new(LCFG_CLEANUP, bufs);
+	if (lcfg == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	/* Disconnect import first. NULL is passed for the '@env', since
+	 * it will not be used. */
+	rc = lwp->obd_lu_dev->ld_ops->ldo_process_config(NULL, lwp->obd_lu_dev,
+							 lcfg);
+out:
+	if (lcfg != NULL)
+		lustre_cfg_free(lcfg);
+	if (bufs != NULL)
+		OBD_FREE_PTR(bufs);
+	if (cfg != NULL)
+		OBD_FREE_PTR(cfg);
+	if (lwpname != NULL)
+		OBD_FREE(lwpname, MTI_NAME_MAXLEN);
+	if (logname != NULL)
+		OBD_FREE(logname, MTI_NAME_MAXLEN);
+	RETURN(rc);
+}
+
+/**
+ * Stop the lwp of an OST/MDT target by forcing a manual cleanup of it.
+ * Failing to find the device is only logged; 0 is returned in that case.
+ **/
+static int lustre_stop_lwp(struct super_block *sb)
+{
+	struct lustre_sb_info	*lsi = s2lsi(sb);
+	char			*lwpname = NULL;
+	struct obd_device	*lwp;
+	int			 rc = 0;
+	ENTRY;
+
+	lwp = lustre_find_lwp(lsi, &lwpname, NULL);
+	if (IS_ERR(lwp)) {
+		CDEBUG(PTR_ERR(lwp) == -ENOENT ? D_CONFIG : D_ERROR,
+		       "%s: lwp wasn't started.\n", lsi->lsi_svname);
+		GOTO(out, rc = 0);
+	}
+
+	lwp->obd_force = 1;
+	rc = class_manual_cleanup(lwp);
+out:
+	if (lwpname)
+		OBD_FREE(lwpname, MTI_NAME_MAXLEN);
+	RETURN(rc);
+}
+
+/**
+ * Start the lwp(fsname-MDT0000-lwp-OSTxxxx) for an OST or MDT target,
+ * which would be used to establish connection from OST to MDT0.
+ * Nothing is done when the device already exists; otherwise the log named
+ * by lustre_find_lwp() is processed with client_lwp_config_process().
+ **/
+static int lustre_start_lwp(struct super_block *sb)
+{
+	struct lustre_sb_info	    *lsi = s2lsi(sb);
+	struct config_llog_instance *cfg = NULL;
+	char			    *lwpname = NULL;
+	char			    *logname = NULL;
+	struct obd_device	    *lwp;
+	int			     rc;
+	ENTRY;
+
+	lwp = lustre_find_lwp(lsi, &lwpname, &logname);
+
+	/* the lwp device is already started */
+	if (lwp != NULL && !IS_ERR(lwp))
+		GOTO(out, rc = 0);
+
+	/* bail out on any lookup result other than -ENOENT */
+	if (PTR_ERR(lwp) != -ENOENT)
+		GOTO(out, rc = PTR_ERR(lwp));
+
+	LASSERT(lwpname != NULL);
+	LASSERT(logname != NULL);
+
+	OBD_ALLOC_PTR(cfg);
+	if (!cfg)
+		GOTO(out, rc = -ENOMEM);
+
+	cfg->cfg_instance = sb;
+	cfg->cfg_callback = client_lwp_config_process;
+
+	rc = lustre_process_log(sb, logname, cfg);
+out:
+	if (lwpname)
+		OBD_FREE(lwpname, MTI_NAME_MAXLEN);
+	if (logname)
+		OBD_FREE(logname, MTI_NAME_MAXLEN);
+	if (cfg)
+		OBD_FREE_PTR(cfg);
+	RETURN(rc);
+}
+
+DEFINE_MUTEX(server_start_lock);
+
+/*
+ * Stop MDS/OSS if nobody is using them.  \a lsiflags tells which kind of
+ * target (LDD_F_SV_TYPE_MDT/OST) is going away.  Returns 0 when nothing had
+ * to be stopped, otherwise the result of class_manual_cleanup().
+ */
+static int server_stop_servers(int lsiflags)
+{
+	struct obd_device *obd = NULL;
+	struct obd_type *type = NULL;
+	int rc = 0;
+	ENTRY;
+
+	mutex_lock(&server_start_lock);
+
+	/* Either an MDT or an OST or neither  */
+	/* if this was an MDT, and there are no more MDT's, clean up the MDS */
+	if (lsiflags & LDD_F_SV_TYPE_MDT) {
+		obd = class_name2obd(LUSTRE_MDS_OBDNAME);
+		if (obd != NULL)
+			type = class_search_type(LUSTRE_MDT_NAME);
+	}
+
+	/* if this was an OST, and there are no more OST's, clean up the OSS */
+	if (lsiflags & LDD_F_SV_TYPE_OST) {
+		obd = class_name2obd(LUSTRE_OSS_OBDNAME);
+		if (obd != NULL)
+			type = class_search_type(LUSTRE_OST_NAME);
+	}
+
+	if (obd != NULL && (type == NULL || type->typ_refcnt == 0)) {
+		obd->obd_force = 1;
+		/* obd_fail doesn't mean much on a server obd */
+		rc = class_manual_cleanup(obd);
+	}
+
+	mutex_unlock(&server_start_lock);
+
+	RETURN(rc);
+}
+
+/* Print the name, fs, uuid, config version and flags of \a mti, preceded by
+ * \a title, through PRINT_CMD().  Always returns 0. */
+int server_mti_print(const char *title, struct mgs_target_info *mti)
+{
+	PRINT_CMD(PRINT_MASK, "mti %s\n", title);
+	PRINT_CMD(PRINT_MASK, "server: %s\n", mti->mti_svname);
+	PRINT_CMD(PRINT_MASK, "fs:     %s\n", mti->mti_fsname);
+	PRINT_CMD(PRINT_MASK, "uuid:   %s\n", mti->mti_uuid);
+	PRINT_CMD(PRINT_MASK,
+		  "ver: %d  flags: %#x\n", mti->mti_config_ver, mti->mti_flags);
+	return 0;
+}
+EXPORT_SYMBOL(server_mti_print);
+
+/*
+ * Generate data for registration: fill \a mti from the mount info of \a lsi.
+ *
+ * At most MTI_NIDS_MAX local nids are recorded; LOLND nids and nids rejected
+ * by the failnode (with LMD_FLG_NO_PRIMNODE) or network parameters are
+ * skipped.  The fs name and target index are parsed from lsi_svname.
+ *
+ * Returns 0 on success, -EINVAL if \a lsi is not a server, -E2BIG if the
+ * server name or the mount parameters do not fit into \a mti, or the error
+ * from server_name2fsname()/server_name2index().
+ */
+static int server_lsi2mti(struct lustre_sb_info *lsi,
+			  struct mgs_target_info *mti)
+{
+	lnet_process_id_t id;
+	int rc, i = 0;
+	ENTRY;
+
+	if (!IS_SERVER(lsi))
+		RETURN(-EINVAL);
+
+	if (strlcpy(mti->mti_svname, lsi->lsi_svname, sizeof(mti->mti_svname))
+	    >= sizeof(mti->mti_svname))
+		RETURN(-E2BIG);
+
+	mti->mti_nid_count = 0;
+	while (LNetGetId(i++, &id) != -ENOENT) {
+		if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND)
+			continue;
+
+		/* server use --servicenode param, only allow specified
+		 * nids be registered */
+		if ((lsi->lsi_lmd->lmd_flags & LMD_FLG_NO_PRIMNODE) != 0 &&
+		    class_match_nid(lsi->lsi_lmd->lmd_params,
+				    PARAM_FAILNODE, id.nid) < 1)
+			continue;
+
+		/* match specified network */
+		if (!class_match_net(lsi->lsi_lmd->lmd_params,
+				     PARAM_NETWORK, LNET_NIDNET(id.nid)))
+			continue;
+
+		mti->mti_nids[mti->mti_nid_count] = id.nid;
+		mti->mti_nid_count++;
+		if (mti->mti_nid_count >= MTI_NIDS_MAX) {
+			CWARN("Only using first %d nids for %s\n",
+			      mti->mti_nid_count, mti->mti_svname);
+			break;
+		}
+	}
+
+	mti->mti_lustre_ver = LUSTRE_VERSION_CODE;
+	mti->mti_config_ver = 0;
+
+	/* every exit below uses RETURN() so that it pairs with ENTRY */
+	rc = server_name2fsname(lsi->lsi_svname, mti->mti_fsname, NULL);
+	if (rc != 0)
+		RETURN(rc);
+
+	rc = server_name2index(lsi->lsi_svname, &mti->mti_stripe_index, NULL);
+	if (rc < 0)
+		RETURN(rc);
+	/* Orion requires index to be set */
+	LASSERT(!(rc & LDD_F_NEED_INDEX));
+	/* keep only LDD flags */
+	mti->mti_flags = lsi->lsi_flags & LDD_F_MASK;
+	if (mti->mti_flags & (LDD_F_WRITECONF | LDD_F_VIRGIN))
+		mti->mti_flags |= LDD_F_UPDATE;
+
+	/* strlcpy() returns a size_t, compare it without going through int */
+	if (strlcpy(mti->mti_params, lsi->lsi_lmd->lmd_params,
+		    sizeof(mti->mti_params)) >= sizeof(mti->mti_params))
+		RETURN(-E2BIG);
+
+	RETURN(0);
+}
+
+/*
+ * Register an old or new target with the MGS. If needed MGS will construct
+ * startup logs and assign index.
+ *
+ * A failed registration is returned to the caller when mti_flags carries
+ * LDD_F_ERROR afterwards or when the target has LDD_F_NEED_INDEX or
+ * LDD_F_UPDATE set; any other failure is logged and turned into 0.
+ */
+static int server_register_target(struct lustre_sb_info *lsi)
+{
+	struct mgs_target_info *mti = NULL;
+	struct obd_device *mgc = lsi->lsi_mgc;
+	bool must_succeed;
+	int rc;
+	ENTRY;
+
+	LASSERT(mgc);
+
+	if (!IS_SERVER(lsi))
+		RETURN(-EINVAL);
+
+	OBD_ALLOC_PTR(mti);
+	if (mti == NULL)
+		RETURN(-ENOMEM);
+
+	rc = server_lsi2mti(lsi, mti);
+	if (rc != 0)
+		GOTO(out, rc);
+
+	CDEBUG(D_MOUNT, "Registration %s, fs=%s, %s, index=%04x, flags=%#x\n",
+	       mti->mti_svname, mti->mti_fsname,
+	       libcfs_nid2str(mti->mti_nids[0]), mti->mti_stripe_index,
+	       mti->mti_flags);
+
+	/* with one of these flags set the registration must succeed */
+	must_succeed = (lsi->lsi_flags &
+			(LDD_F_NEED_INDEX | LDD_F_UPDATE)) != 0;
+	mti->mti_flags |= LDD_F_OPC_REG;
+
+	/* Register the target */
+	/* FIXME use mgc_process_config instead */
+	rc = obd_set_info_async(NULL, mgc->u.cli.cl_mgc_mgsexp,
+				sizeof(KEY_REGISTER_TARGET),
+				KEY_REGISTER_TARGET,
+				sizeof(*mti), mti, NULL);
+	if (rc != 0) {
+		if (mti->mti_flags & LDD_F_ERROR) {
+			LCONSOLE_ERROR_MSG(0x160,
+				"%s: the MGS refuses to allow this server "
+				"to start: rc = %d. Please see messages on "
+				"the MGS.\n", lsi->lsi_svname, rc);
+		} else if (must_succeed) {
+			LCONSOLE_ERROR_MSG(0x15f,
+				"%s: cannot register this server with the MGS: "
+				"rc = %d. Is the MGS running?\n",
+				lsi->lsi_svname, rc);
+		} else {
+			CERROR("%s: error registering with the MGS: rc = %d "
+			       "(not fatal)\n", lsi->lsi_svname, rc);
+			/* reset the error code for non-fatal error. */
+			rc = 0;
+		}
+		GOTO(out, rc);
+	}
+
+out:
+	if (mti != NULL)
+		OBD_FREE_PTR(mti);
+	RETURN(rc);
+}
+
+/**
+ * Notify the MGS that this target is ready.
+ * Used by IR - if the MGS receives this message, it will notify clients.
+ * On success LDD_F_IR_CAPABLE is copied from the mti flags into lsi_flags,
+ * unless LDD_F_ERROR is set there.
+ */
+static int server_notify_target(struct super_block *sb, struct obd_device *obd)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct mgs_target_info *mti = NULL;
+	struct obd_device *mgc = lsi->lsi_mgc;
+	int rc;
+	ENTRY;
+
+	LASSERT(mgc);
+
+	if (!IS_SERVER(lsi))
+		RETURN(-EINVAL);
+
+	OBD_ALLOC_PTR(mti);
+	if (mti == NULL)
+		RETURN(-ENOMEM);
+
+	rc = server_lsi2mti(lsi, mti);
+	if (rc != 0)
+		GOTO(out, rc);
+
+	mti->mti_flags |= LDD_F_OPC_READY;
+	mti->mti_instance = obd->u.obt.obt_instance;
+
+	/* FIXME use mgc_process_config instead */
+	rc = obd_set_info_async(NULL, mgc->u.cli.cl_mgc_mgsexp,
+				sizeof(KEY_REGISTER_TARGET),
+				KEY_REGISTER_TARGET,
+				sizeof(*mti), mti, NULL);
+
+	/* Imperative recovery: if the mgs informs us to use IR? */
+	if (rc == 0 && !(mti->mti_flags & LDD_F_ERROR)) {
+		if (mti->mti_flags & LDD_F_IR_CAPABLE)
+			lsi->lsi_flags |= LDD_F_IR_CAPABLE;
+	}
+
+out:
+	if (mti != NULL)
+		OBD_FREE_PTR(mti);
+	RETURN(rc);
+}
+
+/** Start server targets: MDTs and OSTs.  Makes sure the MDS/OSS is running,
+ * registers with the MGS, processes the target's llog and starts the lwp. */
+static int server_start_targets(struct super_block *sb, struct vfsmount *mnt)
+{
+	struct obd_device *obd;
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct config_llog_instance cfg;
+	struct lu_env env;
+	struct lu_device *dev;
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_MOUNT, "starting target %s\n", lsi->lsi_svname);
+
+	if (IS_MDT(lsi)) {
+		/* make sure the MDS is started (under server_start_lock) */
+		mutex_lock(&server_start_lock);
+		obd = class_name2obd(LUSTRE_MDS_OBDNAME);
+		if (!obd) {
+			rc = lustre_start_simple(LUSTRE_MDS_OBDNAME,
+						 LUSTRE_MDS_NAME,
+						 LUSTRE_MDS_OBDNAME"_uuid",
+						 0, 0, 0, 0);
+			if (rc) {
+				mutex_unlock(&server_start_lock);
+				CERROR("failed to start MDS: %d\n", rc);
+				RETURN(rc);
+			}
+		}
+		mutex_unlock(&server_start_lock);
+	}
+
+	/* If we're an OST, make sure the global OSS is running */
+	if (IS_OST(lsi)) {
+		/* make sure OSS is started (under server_start_lock) */
+		mutex_lock(&server_start_lock);
+		obd = class_name2obd(LUSTRE_OSS_OBDNAME);
+		if (!obd) {
+			rc = lustre_start_simple(LUSTRE_OSS_OBDNAME,
+						 LUSTRE_OSS_NAME,
+						 LUSTRE_OSS_OBDNAME"_uuid",
+						 0, 0, 0, 0);
+			if (rc) {
+				mutex_unlock(&server_start_lock);
+				CERROR("failed to start OSS: %d\n", rc);
+				RETURN(rc);
+			}
+		}
+		mutex_unlock(&server_start_lock);
+	}
+
+	/* Set the mgc fs to our server disk.  This allows the MGC to
+	 * read and write configs locally, in case it can't talk to the MGS. */
+	if (lsi->lsi_srv_mnt) {
+		rc = server_mgc_set_fs(lsi->lsi_mgc, sb);
+		if (rc)
+			GOTO(out_stop_service, rc);
+	}
+
+	/* Register with MGS */
+	rc = server_register_target(lsi);
+	if (rc)
+		GOTO(out_mgc, rc);
+
+	/* Let the target look up the mount using the target's name
+	   (we can't pass the sb or mnt through class_process_config.) */
+	rc = server_register_mount(lsi->lsi_svname, sb, mnt);
+	if (rc)
+		GOTO(out_mgc, rc);
+
+	/* Start targets using the llog named for the target */
+	memset(&cfg, 0, sizeof(cfg));
+	cfg.cfg_callback = class_config_llog_handler;
+	rc = lustre_process_log(sb, lsi->lsi_svname, &cfg);
+	if (rc) {
+		CERROR("failed to start server %s: %d\n",
+		       lsi->lsi_svname, rc);
+		/* Do NOT call server_deregister_mount() here. This makes it
+		 * impossible to find mount later in cleanup time and leaves
+		 * @lsi and other stuff leaked. -umka */
+		GOTO(out_mgc, rc);
+	}
+
+	obd = class_name2obd(lsi->lsi_svname);
+	if (!obd) {
+		CERROR("no server named %s was started\n", lsi->lsi_svname);
+		GOTO(out_mgc, rc = -ENXIO);
+	}
+
+	if (IS_OST(lsi) || IS_MDT(lsi)) {
+		rc = lustre_start_lwp(sb);
+		if (rc) {
+			CERROR("%s: failed to start LWP: %d\n",
+			       lsi->lsi_svname, rc);
+			GOTO(out_mgc, rc);
+		}
+	}
+
+	server_notify_target(sb, obd);	/* return value is not checked */
+
+	/* calculate recovery timeout, do it after lustre_process_log */
+	server_calc_timeout(lsi, obd);
+
+	/* log has been fully processed */
+	obd_notify(obd, NULL, OBD_NOTIFY_CONFIG, (void *)CONFIG_LOG);
+
+	/* log has been fully processed, let clients connect */
+	dev = obd->obd_lu_dev;
+	if (dev && dev->ld_ops->ldo_prepare) {
+		rc = lu_env_init(&env, dev->ld_type->ldt_ctx_tags);
+		if (rc == 0) {
+			struct lu_context  session_ctx;
+
+			lu_context_init(&session_ctx, LCT_SESSION);
+			session_ctx.lc_thread = NULL;
+			lu_context_enter(&session_ctx);
+			env.le_ses = &session_ctx;
+
+			rc = dev->ld_ops->ldo_prepare(&env, NULL, dev);
+
+			lu_env_fini(&env);
+			lu_context_exit(&session_ctx);
+			lu_context_fini(&session_ctx);
+		}
+	}
+
+	/* abort recovery only on the complete stack:
+	 * many devices can be involved */
+	if ((lsi->lsi_lmd->lmd_flags & LMD_FLG_ABORT_RECOV) &&
+	    (OBP(obd, iocontrol))) {
+		obd_iocontrol(OBD_IOC_ABORT_RECOVERY, obd->obd_self_export, 0,
+			      NULL, NULL);
+	}
+
+out_mgc:
+	/* Release the mgc fs for others to use */
+	if (lsi->lsi_srv_mnt)
+		server_mgc_clear_fs(lsi->lsi_mgc);
+
+out_stop_service:
+	if (rc != 0)	/* on failure stop the MDS/OSS again if it is unused */
+		server_stop_servers(lsi->lsi_flags);
+
+	RETURN(rc);
+}
+
+/*
+ * Fill the server name, osd type, fs type and LDD flags of \a lsi from its
+ * mount data (lsi_lmd).
+ *
+ * Returns 0 on success, -EINVAL when no profile (server name) was given,
+ * -ENAMETOOLONG when a name does not fit its lsi buffer, or the error from
+ * server_name2index() unless the mount carries LMD_FLG_MGS (bare MGS).
+ */
+static int lsi_prepare(struct lustre_sb_info *lsi)
+{
+	__u32 index;
+	int rc;
+	ENTRY;
+
+	LASSERT(lsi);
+	LASSERT(lsi->lsi_lmd);
+
+	/* The server name is given as a mount line option */
+	if (lsi->lsi_lmd->lmd_profile == NULL) {
+		LCONSOLE_ERROR("Can't determine server name\n");
+		RETURN(-EINVAL);
+	}
+
+	if (strlen(lsi->lsi_lmd->lmd_profile) >= sizeof(lsi->lsi_svname))
+		RETURN(-ENAMETOOLONG);
+
+	strcpy(lsi->lsi_svname, lsi->lsi_lmd->lmd_profile);
+
+	/* Determine osd type */
+	if (lsi->lsi_lmd->lmd_osd_type != NULL) {
+		if (strlen(lsi->lsi_lmd->lmd_osd_type) >=
+		    sizeof(lsi->lsi_osd_type))
+			RETURN(-ENAMETOOLONG);
+
+		strcpy(lsi->lsi_osd_type, lsi->lsi_lmd->lmd_osd_type);
+	} else {
+		strcpy(lsi->lsi_osd_type, LUSTRE_OSD_LDISKFS_NAME);
+	}
+
+	/* XXX: a temp. solution for components using fsfilt
+	 *      to be removed in one of the subsequent patches.
+	 * Work on the lsi_osd_type resolved above: lmd_osd_type is NULL
+	 * when the mount line did not name an osd type. */
+	if (!strcmp(lsi->lsi_osd_type, "osd-ldiskfs"))
+		strcpy(lsi->lsi_fstype, "ldiskfs");
+	else
+		strcpy(lsi->lsi_fstype, lsi->lsi_osd_type);
+
+	/* Determine server type */
+	rc = server_name2index(lsi->lsi_svname, &index, NULL);
+	if (rc < 0) {
+		if (lsi->lsi_lmd->lmd_flags & LMD_FLG_MGS) {
+			/* Assume we're a bare MGS */
+			rc = 0;
+			lsi->lsi_lmd->lmd_flags |= LMD_FLG_NOSVC;
+		} else {
+			LCONSOLE_ERROR("Can't determine server type of '%s'\n",
+				       lsi->lsi_svname);
+			RETURN(rc);
+		}
+	}
+	/* a non-negative result carries the server type flags */
+	lsi->lsi_flags |= rc;
+
+	/* Add mount line flags that used to be in ldd:
+	 * writeconf, mgs, anything else?
+	 */
+	lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_WRITECONF) ?
+		LDD_F_WRITECONF : 0;
+	lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_VIRGIN) ?
+		LDD_F_VIRGIN : 0;
+	lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_UPDATE) ?
+		LDD_F_UPDATE : 0;
+	lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_MGS) ?
+		LDD_F_SV_TYPE_MGS : 0;
+	lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_NO_PRIMNODE) ?
+		LDD_F_NO_PRIMNODE : 0;
+
+	RETURN(0);
+}
+
+/*************** server mount ******************/
+
+/** Start the shutdown of servers at umount.
+ *
+ * Disconnects the lwp, stops the target obd, the MGS (unless mounted with
+ * LMD_FLG_NOMGS) and the lwp, releases the mgc and sb through
+ * lustre_common_put_super() and finally calls server_stop_servers().
+ */
+static void server_put_super(struct super_block *sb)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct obd_device     *obd;
+	char *tmpname, *extraname = NULL;
+	const char *name;
+	int tmpname_sz;
+	int lsiflags = lsi->lsi_flags;
+	ENTRY;
+
+	LASSERT(IS_SERVER(lsi));
+
+	/* Private copy of the server name for the messages below.  This
+	 * function cannot report failure, so when the allocation fails
+	 * only the name shown in those messages is lost. */
+	tmpname_sz = strlen(lsi->lsi_svname) + 1;
+	OBD_ALLOC(tmpname, tmpname_sz);
+	if (tmpname != NULL)
+		memcpy(tmpname, lsi->lsi_svname, tmpname_sz);
+	CDEBUG(D_MOUNT, "server put_super %s\n", lsi->lsi_svname);
+	if (tmpname != NULL &&
+	    IS_MDT(lsi) && (lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC))
+		snprintf(tmpname, tmpname_sz, "MGS");
+	name = tmpname != NULL ? tmpname : "(unknown)";
+
+	/* disconnect the lwp first to drain off the inflight request */
+	if (IS_OST(lsi) || IS_MDT(lsi)) {
+		int	rc;
+
+		rc = lustre_disconnect_lwp(sb);
+		/* error codes are negative: a timeout is -ETIMEDOUT */
+		if (rc && rc != -ETIMEDOUT)
+			CERROR("%s: failed to disconnect lwp. (rc=%d)\n",
+			       name, rc);
+	}
+
+	/* Stop the target */
+	if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) &&
+	    (IS_MDT(lsi) || IS_OST(lsi))) {
+		struct lustre_profile *lprof = NULL;
+
+		/* tell the mgc to drop the config log */
+		lustre_end_log(sb, lsi->lsi_svname, NULL);
+
+		/* COMPAT_146 - profile may get deleted in mgc_cleanup.
+		   If there are any setup/cleanup errors, save the lov
+		   name for safety cleanup later. */
+		lprof = class_get_profile(lsi->lsi_svname);
+		if (lprof && lprof->lp_dt) {
+			OBD_ALLOC(extraname, strlen(lprof->lp_dt) + 1);
+			/* without memory the orphan cleanup is skipped */
+			if (extraname != NULL)
+				strcpy(extraname, lprof->lp_dt);
+		}
+
+		obd = class_name2obd(lsi->lsi_svname);
+		if (obd) {
+			CDEBUG(D_MOUNT, "stopping %s\n", obd->obd_name);
+			if (lsiflags & LSI_UMOUNT_FAILOVER)
+				obd->obd_fail = 1;
+			/* We can't seem to give an error return code
+			 * to .put_super, so we better make sure we clean up! */
+			obd->obd_force = 1;
+			class_manual_cleanup(obd);
+		} else {
+			CERROR("no obd %s\n", lsi->lsi_svname);
+			server_deregister_mount(lsi->lsi_svname);
+		}
+	}
+
+	/* If they wanted the mgs to stop separately from the mdt, they
+	   should have put it on a different device. */
+	if (IS_MGS(lsi)) {
+		/* if MDS start with --nomgs, don't stop MGS then */
+		if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS))
+			server_stop_mgs(sb);
+	}
+
+	if (IS_OST(lsi) || IS_MDT(lsi)) {
+		if (lustre_stop_lwp(sb) < 0)
+			CERROR("%s: failed to stop lwp!\n", name);
+	}
+
+	/* Clean the mgc and sb */
+	lustre_common_put_super(sb);
+
+	/* wait till all in-progress cleanups are done
+	 * specifically we're interested in ofd cleanup
+	 * as it pins OSS */
+	obd_zombie_barrier();
+
+	/* Stop the servers (MDS, OSS) if no longer needed.  We must wait
+	   until the target is really gone so that our type refcount check
+	   is right. */
+	server_stop_servers(lsiflags);
+
+	/* In case of startup or cleanup err, stop related obds */
+	if (extraname) {
+		obd = class_name2obd(extraname);
+		if (obd) {
+			CWARN("Cleaning orphaned obd %s\n", extraname);
+			obd->obd_force = 1;
+			class_manual_cleanup(obd);
+		}
+		OBD_FREE(extraname, strlen(extraname) + 1);
+	}
+
+	LCONSOLE_WARN("server umount %s complete\n", name);
+	if (tmpname != NULL)
+		OBD_FREE(tmpname, tmpname_sz);
+	EXIT;
+}
+
+/** Called only for 'umount -f'
+ *
+ * Clears LSI_UMOUNT_FAILOVER in lsi_flags, so server_put_super() will not
+ * set obd_fail on the target.
+ */
+static void server_umount_begin(struct super_block *sb)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	ENTRY;
+
+	CDEBUG(D_MOUNT, "umount -f\n");
+	/* umount = failover
+	   umount -f = force
+	   no third way to do non-force, non-failover */
+	lsi->lsi_flags = lsi->lsi_flags & ~LSI_UMOUNT_FAILOVER;
+	EXIT;
+}
+
+/*
+ * statfs for a server mountpoint: report what dt_statfs() returns for the
+ * backing dt device; without a device, or when that call fails, report a
+ * one-block, one-file filesystem with nothing free.  Always returns 0.
+ */
+static int server_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct obd_statfs statfs;
+	ENTRY;
+
+	if (lsi->lsi_dt_dev != NULL &&
+	    dt_statfs(NULL, lsi->lsi_dt_dev, &statfs) == 0) {
+		statfs_unpack(buf, &statfs);
+		buf->f_type = sb->s_magic;
+		RETURN(0);
+	}
+
+	/* just return 0 */
+	buf->f_type = sb->s_magic;
+	buf->f_bsize = sb->s_blocksize;
+	buf->f_namelen = NAME_MAX;
+	buf->f_blocks = 1;
+	buf->f_files = 1;
+	buf->f_bfree = 0;
+	buf->f_bavail = 0;
+	buf->f_ffree = 0;
+	RETURN(0);
+}
+
+/** The operations we support directly on the superblock:
+ * mount, umount, and df.
+ * The table is never modified after build time, hence const.
+ */
+static const struct super_operations server_ops = {
+	.put_super	= server_put_super,
+	.umount_begin	= server_umount_begin, /* umount -f */
+	.statfs		= server_statfs,
+};
+
+#define log2(n) ffz(~(n))
+#define LUSTRE_SUPER_MAGIC 0x0BD00BD1
+
+/*
+ * Common superblock setup for a server mount: a read-only sb with 4096-byte
+ * blocks, s_maxbytes of 0 and a bare directory inode as root.
+ * Returns 0, or -EIO when the root inode or dentry cannot be created.
+ */
+static int server_fill_super_common(struct super_block *sb)
+{
+	struct inode *root;
+	ENTRY;
+
+	CDEBUG(D_MOUNT, "Server sb, dev=%d\n", (int)sb->s_dev);
+
+	sb->s_op = &server_ops;
+	sb->s_magic = LUSTRE_SUPER_MAGIC;
+	sb->s_flags |= MS_RDONLY;
+	sb->s_maxbytes = 0; /* we don't allow file IO on server mountpoints */
+	sb->s_blocksize = 4096;
+	sb->s_blocksize_bits = log2(sb->s_blocksize);
+
+	root = new_inode(sb);
+	if (root == NULL) {
+		CERROR("Can't make root inode\n");
+		RETURN(-EIO);
+	}
+
+	/* returns -EIO for every operation */
+	/* make_bad_inode(root); -- badness - can't umount */
+	/* apparently we need to be a directory for the mount to finish */
+	root->i_mode = S_IFDIR;
+
+	sb->s_root = d_make_root(root);
+	if (sb->s_root == NULL) {
+		CERROR("%s: can't make root dentry\n", sb->s_id);
+		RETURN(-EIO);
+	}
+
+	RETURN(0);
+}
+
+/*
+ * Start (if not yet running) and connect to the OSD device of this target.
+ *
+ * The obd is named "<svname>-osd".  \a mflags is handed to the OSD together
+ * with lmd_flags as a "mflags:lmd_flags" string.  On success lsi_osd_exp,
+ * lsi_dt_dev and lsi_srv_mnt are filled in.
+ *
+ * Returns 0 on success, else the error from lustre_start_simple() or
+ * obd_connect(); after a failed connect the obd has been cleaned up and
+ * lsi_dt_dev is NULL.
+ */
+static int osd_start(struct lustre_sb_info *lsi, unsigned long mflags)
+{
+	struct lustre_mount_data *lmd = lsi->lsi_lmd;
+	struct obd_device	 *obd;
+	struct dt_device_param    p;
+	/* two unsigned longs in decimal (20 digits each), ':' and the NUL */
+	char			  flagstr[2 * 20 + 2];
+	int			  rc;
+	ENTRY;
+
+	CDEBUG(D_MOUNT,
+	       "Attempting to start %s, type=%s, lsifl=%x, mountfl=%lx\n",
+	       lsi->lsi_svname, lsi->lsi_osd_type, lsi->lsi_flags, mflags);
+
+	sprintf(lsi->lsi_osd_obdname, "%s-osd", lsi->lsi_svname);
+	strcpy(lsi->lsi_osd_uuid, lsi->lsi_osd_obdname);
+	strcat(lsi->lsi_osd_uuid, "_UUID");
+	snprintf(flagstr, sizeof(flagstr), "%lu:%lu", mflags,
+		 (unsigned long) lmd->lmd_flags);
+
+	obd = class_name2obd(lsi->lsi_osd_obdname);
+	if (obd == NULL) {
+		rc = lustre_start_simple(lsi->lsi_osd_obdname,
+					 lsi->lsi_osd_type,
+					 lsi->lsi_osd_uuid, lmd->lmd_dev,
+					 flagstr, lsi->lsi_lmd->lmd_opts,
+					 lsi->lsi_svname);
+		if (rc)
+			GOTO(out, rc);
+		obd = class_name2obd(lsi->lsi_osd_obdname);
+		LASSERT(obd);
+	}
+
+	rc = obd_connect(NULL, &lsi->lsi_osd_exp,
+			 obd, &obd->obd_uuid, NULL, NULL);
+	if (rc) {
+		obd->obd_force = 1;
+		class_manual_cleanup(obd);
+		lsi->lsi_dt_dev = NULL;
+		/* the obd was just cleaned up: do not touch it below */
+		GOTO(out, rc);
+	}
+
+	/* XXX: to keep support old components relying on lsi_srv_mnt
+	 *	we get this info from OSD just started */
+	LASSERT(obd->obd_lu_dev);
+	lsi->lsi_dt_dev = lu2dt_dev(obd->obd_lu_dev);
+	LASSERT(lsi->lsi_dt_dev);
+
+	dt_conf_get(NULL, lsi->lsi_dt_dev, &p);
+
+	lsi->lsi_srv_mnt = p.ddp_mnt;
+
+out:
+	RETURN(rc);
+}
+
+/** Fill in the superblock info for a Lustre server.
+ * Mount the device with the correct options.
+ * Read the on-disk config file.
+ * Start the services.
+ *
+ * Returns 0 on success.  A failure before any service was started drops
+ * the lsi reference with lustre_put_lsi(); a later failure goes through
+ * server_put_super() for the full cleanup.
+ */
+int server_fill_super(struct super_block *sb)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	int rc;
+	ENTRY;
+
+	rc = lsi_prepare(lsi);
+	if (rc) {
+		/* release @lsi like the other early failures below do */
+		lustre_put_lsi(sb);
+		RETURN(rc);
+	}
+
+	/* Start low level OSD */
+	rc = osd_start(lsi, sb->s_flags);
+	if (rc) {
+		CERROR("Unable to start osd on %s: %d\n",
+		       lsi->lsi_lmd->lmd_dev, rc);
+		lustre_put_lsi(sb);
+		RETURN(rc);
+	}
+
+	CDEBUG(D_MOUNT, "Found service %s on device %s\n",
+	       lsi->lsi_svname, lsi->lsi_lmd->lmd_dev);
+
+	if (class_name2obd(lsi->lsi_svname)) {
+		LCONSOLE_ERROR_MSG(0x161, "The target named %s is already "
+				   "running. Double-mount may have compromised"
+				   " the disk journal.\n",
+				   lsi->lsi_svname);
+		lustre_put_lsi(sb);
+		RETURN(-EALREADY);
+	}
+
+	/* Start MGS before MGC */
+	if (IS_MGS(lsi) && !(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS)) {
+		rc = server_start_mgs(sb);
+		if (rc)
+			GOTO(out_mnt, rc);
+	}
+
+	/* Start MGC before servers */
+	rc = lustre_start_mgc(sb);
+	if (rc)
+		GOTO(out_mnt, rc);
+
+	/* Set up all obd devices for service */
+	if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) &&
+	    (IS_OST(lsi) || IS_MDT(lsi))) {
+		rc = server_start_targets(sb, lsi->lsi_srv_mnt);
+		if (rc < 0) {
+			CERROR("Unable to start targets: %d\n", rc);
+			GOTO(out_mnt, rc);
+		}
+		/* FIXME overmount client here, or can we just start a
+		 * client log and client_fill_super on this sb?  We
+		 * need to make sure server_put_super gets called too
+		 * - ll_put_super calls lustre_common_put_super; check
+		 * there for LSI_SERVER flag, call s_p_s if so.
+		 *
+		 * Probably should start client from new thread so we
+		 * can return.  Client will not finish until all
+		 * servers are connected.  Note - MGS-only server does
+		 * NOT get a client, since there is no lustre fs
+		 * associated - the MGS is for all lustre fs's */
+	}
+
+	rc = server_fill_super_common(sb);
+	if (rc)
+		GOTO(out_mnt, rc);
+
+	RETURN(0);
+out_mnt:
+	/* We jump here in case of failure while starting targets or MGS.
+	 * In this case we can't just put @mnt and have to do real cleanup
+	 * with stopping targets, etc. */
+	server_put_super(sb);
+	RETURN(rc);
+}
+
+/*
+ * Calculate timeout value for a target.
+ *
+ * Soft/hard recovery times come from the mount data when set (non-zero),
+ * else from OBD_RECOVERY_TIME_SOFT/HARD.  With imperative recovery in use
+ * both are scaled by factor / OBD_IR_FACTOR_MAX, where soft is kept at
+ * OBD_RECOVERY_TIME_MIN or more and hard at no less than soft.
+ */
+void server_calc_timeout(struct lustre_sb_info *lsi, struct obd_device *obd)
+{
+	struct lustre_mount_data *lmd;
+	bool has_ir = !!(lsi->lsi_flags & LDD_F_IR_CAPABLE);
+	int lower = OBD_RECOVERY_TIME_MIN;
+	int factor = 0;
+	int soft = 0;
+	int hard = 0;
+
+	LASSERT(IS_SERVER(lsi));
+
+	lmd = lsi->lsi_lmd;
+	if (lmd != NULL) {
+		soft = lmd->lmd_recovery_time_soft;
+		hard = lmd->lmd_recovery_time_hard;
+		/* the mount line may switch imperative recovery off */
+		if (lmd->lmd_flags & LMD_FLG_NOIR)
+			has_ir = false;
+		obd->obd_no_ir = !has_ir;
+	}
+
+	if (!soft)
+		soft = OBD_RECOVERY_TIME_SOFT;
+	if (!hard)
+		hard = OBD_RECOVERY_TIME_HARD;
+
+	/* target may have ir_factor configured. */
+	if (obd->obd_recovery_ir_factor)
+		factor = obd->obd_recovery_ir_factor;
+	else
+		factor = OBD_IR_FACTOR_DEFAULT;
+
+	if (has_ir) {
+		/* adjust timeout value by imperative recovery */
+		int ir_soft = (soft * factor) / OBD_IR_FACTOR_MAX;
+		int ir_hard = (hard * factor) / OBD_IR_FACTOR_MAX;
+
+		/* make sure the timeout is not too short */
+		if (ir_soft < lower)
+			ir_soft = lower;
+		if (ir_hard < ir_soft)
+			ir_hard = ir_soft;
+
+		LCONSOLE_INFO("%s: Imperative Recovery enabled, recovery "
+			      "window shrunk from %d-%d down to %d-%d\n",
+			      obd->obd_name, soft, hard, ir_soft, ir_hard);
+
+		soft = ir_soft;
+		hard = ir_hard;
+	}
+
+	/* we're done */
+	obd->obd_recovery_timeout   = max(obd->obd_recovery_timeout, soft);
+	obd->obd_recovery_time_hard = hard;
+	obd->obd_recovery_ir_factor = factor;
+}
+EXPORT_SYMBOL(server_calc_timeout);
diff --git a/drivers/staging/lustre/lustre/obdclass/obdo.c b/drivers/staging/lustre/lustre/obdclass/obdo.c
new file mode 100644
index 000000000000..01a0e1f83a68
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/obdo.c
@@ -0,0 +1,362 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/obdo.c
+ *
+ * Object Devices Class Driver
+ * These are the only exported functions, they provide some generic
+ * infrastructure for managing object devices
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_class.h>
+#include <lustre/lustre_idl.h>
+
+void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent)
+{
+	dst->o_parent_seq = fid_seq(parent);
+	dst->o_parent_oid = fid_oid(parent);
+	dst->o_parent_ver = fid_ver(parent);
+	dst->o_valid |= OBD_MD_FLFID | OBD_MD_FLGENER;	/* now both valid */
+}
+EXPORT_SYMBOL(obdo_set_parent_fid);
+
+/* WARNING: file systems must be careful not to touch attributes they do
+   not manage (such as blocks); only fields named in @valid are copied. */
+void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid)
+{
+	obd_flag filled = 0;
+
+	if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
+		CDEBUG(D_INODE, "valid %x, new time %lu/%lu\n",
+		       valid, LTIME_S(src->i_mtime),
+		       LTIME_S(src->i_ctime));
+
+	if (valid & OBD_MD_FLATIME) {
+		filled |= OBD_MD_FLATIME;
+		dst->o_atime = LTIME_S(src->i_atime);
+	}
+	if (valid & OBD_MD_FLMTIME) {
+		filled |= OBD_MD_FLMTIME;
+		dst->o_mtime = LTIME_S(src->i_mtime);
+	}
+	if (valid & OBD_MD_FLCTIME) {
+		filled |= OBD_MD_FLCTIME;
+		dst->o_ctime = LTIME_S(src->i_ctime);
+	}
+	if (valid & OBD_MD_FLSIZE) {
+		filled |= OBD_MD_FLSIZE;
+		dst->o_size = i_size_read(src);
+	}
+	if (valid & OBD_MD_FLBLOCKS) {  /* space allocated (x512 bytes) */
+		filled |= OBD_MD_FLBLOCKS;
+		dst->o_blocks = src->i_blocks;
+	}
+	if (valid & OBD_MD_FLBLKSZ) {   /* optimal block size */
+		filled |= OBD_MD_FLBLKSZ;
+		dst->o_blksize = ll_inode_blksize(src);
+	}
+	if (valid & OBD_MD_FLTYPE) {
+		filled |= OBD_MD_FLTYPE;
+		dst->o_mode = (src->i_mode & S_IFMT) |
+			      (dst->o_mode & S_IALLUGO);
+	}
+	if (valid & OBD_MD_FLMODE) {
+		filled |= OBD_MD_FLMODE;
+		dst->o_mode = (src->i_mode & S_IALLUGO) |
+			      (dst->o_mode & S_IFMT);
+	}
+	if (valid & OBD_MD_FLUID) {
+		filled |= OBD_MD_FLUID;
+		dst->o_uid = src->i_uid;
+	}
+	if (valid & OBD_MD_FLGID) {
+		filled |= OBD_MD_FLGID;
+		dst->o_gid = src->i_gid;
+	}
+	if (valid & OBD_MD_FLFLAGS) {
+		filled |= OBD_MD_FLFLAGS;
+		dst->o_flags = ll_inode_flags(src);
+	}
+	dst->o_valid |= filled;
+}
+EXPORT_SYMBOL(obdo_from_inode);
+
+/* Copy the fields selected by @valid from @src to @dst and mark them valid. */
+void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid)
+{
+	CDEBUG(D_INODE, "src obdo "DOSTID" valid "LPX64", dst obdo "DOSTID"\n",
+	       POSTID(&src->o_oi), src->o_valid, POSTID(&dst->o_oi));
+	if (valid & OBD_MD_FLSIZE)
+		dst->o_size = src->o_size;
+	if (valid & OBD_MD_FLBLOCKS) /* allocation of space */
+		dst->o_blocks = src->o_blocks;
+	if (valid & OBD_MD_FLBLKSZ)
+		dst->o_blksize = src->o_blksize;
+	if (valid & OBD_MD_FLATIME)
+		dst->o_atime = src->o_atime;
+	if (valid & OBD_MD_FLMTIME)
+		dst->o_mtime = src->o_mtime;
+	if (valid & OBD_MD_FLCTIME)
+		dst->o_ctime = src->o_ctime;
+	if (valid & OBD_MD_FLTYPE)
+		dst->o_mode = (src->o_mode & S_IFMT) | (dst->o_mode & ~S_IFMT);
+	if (valid & OBD_MD_FLMODE)
+		dst->o_mode = (src->o_mode & ~S_IFMT) | (dst->o_mode & S_IFMT);
+	if (valid & OBD_MD_FLUID)
+		dst->o_uid = src->o_uid;
+	if (valid & OBD_MD_FLGID)
+		dst->o_gid = src->o_gid;
+	if (valid & OBD_MD_FLFLAGS)
+		dst->o_flags = src->o_flags;
+	if (valid & OBD_MD_FLGENER)
+		dst->o_parent_oid = src->o_parent_oid;
+	if (valid & OBD_MD_FLFID) {
+		dst->o_parent_ver = src->o_parent_ver;
+		dst->o_parent_seq = src->o_parent_seq;
+	}
+	if (valid & OBD_MD_FLHANDLE)
+		dst->o_handle = src->o_handle;
+	if (valid & OBD_MD_FLCOOKIE)
+		dst->o_lcookie = src->o_lcookie;
+	dst->o_valid |= valid;
+}
+EXPORT_SYMBOL(obdo_cpy_md);
+
+/* Returns 0 if every field selected by @compare matches, 1 if any differs. */
+int obdo_cmp_md(struct obdo *dst, struct obdo *src, obd_flag compare)
+{
+	int differs = 0;
+
+	if (compare & OBD_MD_FLATIME)
+		differs |= (dst->o_atime != src->o_atime);
+	if (compare & OBD_MD_FLMTIME)
+		differs |= (dst->o_mtime != src->o_mtime);
+	if (compare & OBD_MD_FLCTIME)
+		differs |= (dst->o_ctime != src->o_ctime);
+	if (compare & OBD_MD_FLSIZE)
+		differs |= (dst->o_size != src->o_size);
+	if (compare & OBD_MD_FLBLOCKS) /* allocation of space */
+		differs |= (dst->o_blocks != src->o_blocks);
+	if (compare & OBD_MD_FLBLKSZ)
+		differs |= (dst->o_blksize != src->o_blksize);
+	if (compare & OBD_MD_FLTYPE)
+		differs |= (((dst->o_mode ^ src->o_mode) & S_IFMT) != 0);
+	if (compare & OBD_MD_FLMODE)
+		differs |= (((dst->o_mode ^ src->o_mode) & ~S_IFMT) != 0);
+	if (compare & OBD_MD_FLUID)
+		differs |= (dst->o_uid != src->o_uid);
+	if (compare & OBD_MD_FLGID)
+		differs |= (dst->o_gid != src->o_gid);
+	if (compare & OBD_MD_FLFLAGS)
+		differs |= (dst->o_flags != src->o_flags);
+	if (compare & OBD_MD_FLNLINK)
+		differs |= (dst->o_nlink != src->o_nlink);
+	if (compare & OBD_MD_FLFID) {
+		differs |= (dst->o_parent_seq != src->o_parent_seq);
+		differs |= (dst->o_parent_ver != src->o_parent_ver);
+	}
+	if (compare & OBD_MD_FLGENER)
+		differs |= (dst->o_parent_oid != src->o_parent_oid);
+	/* XXX It is unclear whether OBD_MD_FLINLINE (o_inline) should be
+	 * compared here as well; it was not previously, so it is still
+	 * left out.
+	 */
+	return differs;
+}
+EXPORT_SYMBOL(obdo_cmp_md);
+
+void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj)
+{
+	/* take the object id from @oa; without a group use the MDT0 seq */
+	ioobj->ioo_oid = oa->o_oi;
+	if (unlikely((oa->o_valid & OBD_MD_FLGROUP) == 0))
+		ostid_set_seq_mdt0(&ioobj->ioo_oid);
+	/* Since 2.4 this does not contain o_mode in the low 16 bits.
+	 * Instead, it holds (bd_md_max_brw - 1) for multi-bulk BRW RPCs */
+	ioobj->ioo_max_brw = 0;
+}
+EXPORT_SYMBOL(obdo_to_ioobj);
+
+void obdo_from_iattr(struct obdo *oa, struct iattr *attr, unsigned int ia_valid)
+{
+	if (ia_valid & ATTR_ATIME) {
+		oa->o_valid |= OBD_MD_FLATIME;
+		oa->o_atime = LTIME_S(attr->ia_atime);
+	}
+	if (ia_valid & ATTR_MTIME) {
+		oa->o_valid |= OBD_MD_FLMTIME;
+		oa->o_mtime = LTIME_S(attr->ia_mtime);
+	}
+	if (ia_valid & ATTR_CTIME) {
+		oa->o_valid |= OBD_MD_FLCTIME;
+		oa->o_ctime = LTIME_S(attr->ia_ctime);
+	}
+	if (ia_valid & ATTR_SIZE) {
+		oa->o_valid |= OBD_MD_FLSIZE;
+		oa->o_size = attr->ia_size;
+	}
+	if (ia_valid & ATTR_MODE) {
+		oa->o_valid |= OBD_MD_FLTYPE | OBD_MD_FLMODE;
+		oa->o_mode = attr->ia_mode;
+		if (!current_is_in_group(oa->o_gid) &&
+		    !cfs_capable(CFS_CAP_FSETID))
+			oa->o_mode &= ~S_ISGID;	/* setgid not permitted */
+	}
+	if (ia_valid & ATTR_UID) {
+		oa->o_valid |= OBD_MD_FLUID;
+		oa->o_uid = attr->ia_uid;
+	}
+	if (ia_valid & ATTR_GID) {
+		oa->o_valid |= OBD_MD_FLGID;
+		oa->o_gid = attr->ia_gid;
+	}
+}
+EXPORT_SYMBOL(obdo_from_iattr);
+
+/* Fill @attr from the fields of @oa that are both requested and valid. */
+void iattr_from_obdo(struct iattr *attr, struct obdo *oa, obd_flag valid)
+{
+	valid &= oa->o_valid;
+	if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
+		CDEBUG(D_INODE, "valid "LPX64", new time "LPU64"/"LPU64"\n",
+		       oa->o_valid, oa->o_mtime, oa->o_ctime);
+
+	attr->ia_valid = 0;
+	if (valid & OBD_MD_FLATIME) {
+		attr->ia_valid |= ATTR_ATIME;
+		LTIME_S(attr->ia_atime) = oa->o_atime;
+	}
+	if (valid & OBD_MD_FLMTIME) {
+		attr->ia_valid |= ATTR_MTIME;
+		LTIME_S(attr->ia_mtime) = oa->o_mtime;
+	}
+	if (valid & OBD_MD_FLCTIME) {
+		attr->ia_valid |= ATTR_CTIME;
+		LTIME_S(attr->ia_ctime) = oa->o_ctime;
+	}
+	if (valid & OBD_MD_FLSIZE) {
+		attr->ia_valid |= ATTR_SIZE;
+		attr->ia_size = oa->o_size;
+	}
+#if 0   /* you shouldn't be able to change a file's type with setattr */
+	if (valid & OBD_MD_FLTYPE) {
+		attr->ia_mode = (attr->ia_mode & ~S_IFMT)|(oa->o_mode & S_IFMT);
+		attr->ia_valid |= ATTR_MODE;
+	}
+#endif
+	if (valid & OBD_MD_FLMODE) {
+		attr->ia_valid |= ATTR_MODE;
+		attr->ia_mode = (oa->o_mode & ~S_IFMT)|(attr->ia_mode & S_IFMT);
+		if (!current_is_in_group(oa->o_gid) &&
+		    !cfs_capable(CFS_CAP_FSETID))
+			attr->ia_mode &= ~S_ISGID;
+	}
+	if (valid & OBD_MD_FLUID) {
+		attr->ia_valid |= ATTR_UID;
+		attr->ia_uid = oa->o_uid;
+	}
+	if (valid & OBD_MD_FLGID) {
+		attr->ia_valid |= ATTR_GID;
+		attr->ia_gid = oa->o_gid;
+	}
+}
+EXPORT_SYMBOL(iattr_from_obdo);
+
+void md_from_obdo(struct md_op_data *op_data, struct obdo *oa, obd_flag valid)
+{
+	iattr_from_obdo(&op_data->op_attr, oa, valid);	/* resets ia_valid */
+	if (valid & OBD_MD_FLBLOCKS) {
+		op_data->op_attr.ia_valid |= ATTR_BLOCKS;
+		op_data->op_attr_blocks = oa->o_blocks;
+	}
+	if (valid & OBD_MD_FLFLAGS) {
+		op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG;
+		((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
+			oa->o_flags;
+	}
+}
+EXPORT_SYMBOL(md_from_obdo);
+
+void obdo_from_md(struct obdo *oa, struct md_op_data *op_data,
+		  unsigned int valid)
+{
+	obdo_from_iattr(oa, &op_data->op_attr, valid);	/* plain iattr part */
+	if (valid & ATTR_BLOCKS) {
+		oa->o_valid |= OBD_MD_FLBLOCKS;
+		oa->o_blocks = op_data->op_attr_blocks;
+	}
+	if (valid & ATTR_ATTR_FLAG) {
+		oa->o_valid |= OBD_MD_FLFLAGS;
+		oa->o_flags =
+			((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags;
+	}
+}
+EXPORT_SYMBOL(obdo_from_md);
+
+void obdo_cpu_to_le(struct obdo *dobdo, struct obdo *sobdo)
+{
+	dobdo->o_valid = cpu_to_le64(sobdo->o_valid);	/* 64-bit fields */
+	dobdo->o_size = cpu_to_le64(sobdo->o_size);
+	dobdo->o_blocks = cpu_to_le64(sobdo->o_blocks);
+	dobdo->o_atime = cpu_to_le64(sobdo->o_atime);
+	dobdo->o_mtime = cpu_to_le64(sobdo->o_mtime);
+	dobdo->o_ctime = cpu_to_le64(sobdo->o_ctime);
+	dobdo->o_blksize = cpu_to_le32(sobdo->o_blksize); /* 32-bit fields */
+	dobdo->o_mode = cpu_to_le32(sobdo->o_mode);
+	dobdo->o_uid = cpu_to_le32(sobdo->o_uid);
+	dobdo->o_gid = cpu_to_le32(sobdo->o_gid);
+	dobdo->o_flags = cpu_to_le32(sobdo->o_flags);
+	dobdo->o_nlink = cpu_to_le32(sobdo->o_nlink);
+}
+EXPORT_SYMBOL(obdo_cpu_to_le);
+
+void obdo_le_to_cpu(struct obdo *dobdo, struct obdo *sobdo)
+{
+	dobdo->o_valid = le64_to_cpu(sobdo->o_valid);	/* 64-bit fields */
+	dobdo->o_size = le64_to_cpu(sobdo->o_size);
+	dobdo->o_blocks = le64_to_cpu(sobdo->o_blocks);
+	dobdo->o_atime = le64_to_cpu(sobdo->o_atime);
+	dobdo->o_mtime = le64_to_cpu(sobdo->o_mtime);
+	dobdo->o_ctime = le64_to_cpu(sobdo->o_ctime);
+	dobdo->o_blksize = le32_to_cpu(sobdo->o_blksize); /* 32-bit fields */
+	dobdo->o_mode = le32_to_cpu(sobdo->o_mode);
+	dobdo->o_uid = le32_to_cpu(sobdo->o_uid);
+	dobdo->o_gid = le32_to_cpu(sobdo->o_gid);
+	dobdo->o_flags = le32_to_cpu(sobdo->o_flags);
+	dobdo->o_nlink = le32_to_cpu(sobdo->o_nlink);
+}
+EXPORT_SYMBOL(obdo_le_to_cpu);
diff --git a/drivers/staging/lustre/lustre/obdclass/statfs_pack.c b/drivers/staging/lustre/lustre/obdclass/statfs_pack.c
new file mode 100644
index 000000000000..c3b7a78dba50
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/statfs_pack.c
@@ -0,0 +1,75 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/statfs_pack.c
+ *
+ * (Un)packing of OST/MDS requests
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+
+#include <lustre_export.h>
+#include <lustre_net.h>
+#include <obd_support.h>
+#include <obd_class.h>
+
+void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs)
+{
+	memset(osfs, 0, sizeof(*osfs));	/* fields not copied below stay 0 */
+	osfs->os_type = sfs->f_type;
+	osfs->os_bsize = sfs->f_bsize;
+	osfs->os_namelen = sfs->f_namelen;
+	osfs->os_blocks = sfs->f_blocks;
+	osfs->os_bfree = sfs->f_bfree;
+	osfs->os_bavail = sfs->f_bavail;
+	osfs->os_files = sfs->f_files;
+	osfs->os_ffree = sfs->f_ffree;
+}
+EXPORT_SYMBOL(statfs_pack);
+
+void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs)
+{
+	memset(sfs, 0, sizeof(*sfs));	/* fields not copied below stay 0 */
+	sfs->f_type = osfs->os_type;
+	sfs->f_bsize = osfs->os_bsize;
+	sfs->f_namelen = osfs->os_namelen;
+	sfs->f_blocks = osfs->os_blocks;
+	sfs->f_bfree = osfs->os_bfree;
+	sfs->f_bavail = osfs->os_bavail;
+	sfs->f_files = osfs->os_files;
+	sfs->f_ffree = osfs->os_ffree;
+}
+EXPORT_SYMBOL(statfs_unpack);
diff --git a/drivers/staging/lustre/lustre/obdclass/uuid.c b/drivers/staging/lustre/lustre/obdclass/uuid.c
new file mode 100644
index 000000000000..af5f27f82bc5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/uuid.c
@@ -0,0 +1,82 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/uuid.c
+ *
+ * Public include file for the UUID library
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+# include <linux/libcfs/libcfs.h>
+
+#include <obd_support.h>
+#include <obd_class.h>
+
+
+/* Pop @nob bytes from *@ptr, most significant byte first, advancing *@ptr. */
+static inline __u32 consume(int nob, __u8 **ptr)
+{
+	__u32 value = 0;
+
+	LASSERT(nob <= sizeof value);
+	while (nob-- > 0)
+		value = (value << 8) | *((*ptr)++);
+	return value;
+}
+
+#define CONSUME(val, ptr) (val) = consume(sizeof(val), (ptr))
+
+/* Read @nr 16-bit words from @in, filling @uu from its last slot down. */
+static void uuid_unpack(class_uuid_t in, __u16 *uu, int nr)
+{
+	__u8 *cursor = in;
+
+	LASSERT(nr * sizeof *uu == sizeof(class_uuid_t));
+	while (nr-- > 0)
+		CONSUME(uu[nr], &cursor);
+}
+
+/* Format @uu into @out->uuid as dash-separated groups of 4-digit hex words. */
+void class_uuid_unparse(class_uuid_t uu, struct obd_uuid *out)
+{
+	/* @uu viewed as 16-bit words */
+	__u16 words[sizeof(class_uuid_t) / sizeof(__u16)];
+
+	CLASSERT(ARRAY_SIZE(words) == 8);
+	uuid_unpack(uu, words, ARRAY_SIZE(words));
+	sprintf(out->uuid, "%04x%04x-%04x-%04x-%04x-%04x%04x%04x",
+		words[0], words[1], words[2], words[3],
+		words[4], words[5], words[6], words[7]);
+}
+EXPORT_SYMBOL(class_uuid_unparse);
diff --git a/drivers/staging/lustre/lustre/obdecho/Makefile b/drivers/staging/lustre/lustre/obdecho/Makefile
new file mode 100644
index 000000000000..4c48e2432f9b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdecho/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LUSTRE_FS) += obdecho.o
+obdecho-y := echo_client.o lproc_echo.o
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/obdecho/echo.c b/drivers/staging/lustre/lustre/obdecho/echo.c
new file mode 100644
index 000000000000..9e64939af9dc
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdecho/echo.c
@@ -0,0 +1,679 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdecho/echo.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_ECHO
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_debug.h>
+#include <lustre_dlm.h>
+#include <lprocfs_status.h>
+
+#include "echo_internal.h"
+
+/* The echo objid needs to be below 2^32, because regular FID numbers are
+ * limited to 2^32 objects in f_oid for the FID_SEQ_ECHO range. b=23335 */
+#define ECHO_INIT_OID	0x10000000ULL
+#define ECHO_HANDLE_MAGIC    0xabcd0123fedc9876ULL
+/* pages set up by echo_persistent_pages_init(), used for the persistent objid */
+#define ECHO_PERSISTENT_PAGES (ECHO_PERSISTENT_SIZE >> PAGE_CACHE_SHIFT)
+static struct page *echo_persistent_pages[ECHO_PERSISTENT_PAGES];
+/* indices of the obd_stats counters registered in echo_setup() */
+enum {
+	LPROC_ECHO_READ_BYTES = 1,
+	LPROC_ECHO_WRITE_BYTES = 2,
+	LPROC_ECHO_LAST = LPROC_ECHO_WRITE_BYTES +1
+};
+
+/* Trim connect flags, class_connect() to @obd, return the export in @exp. */
+static int echo_connect(const struct lu_env *env,
+			struct obd_export **exp, struct obd_device *obd,
+			struct obd_uuid *cluuid, struct obd_connect_data *data,
+			void *localdata)
+{
+	struct lustre_handle handle = { 0 };
+	int err;
+
+	data->ocd_connect_flags &= ECHO_CONNECT_SUPPORTED;
+	err = class_connect(&handle, obd, cluuid);
+	if (err != 0) {
+		CERROR("can't connect %d\n", err);
+		return err;
+	}
+	*exp = class_conn2export(&handle);
+	return 0;
+}
+
+/* Disconnect @exp (must be non-NULL) via server_disconnect_export(). */
+static int echo_disconnect(struct obd_export *exp)
+{
+	LASSERT(exp != NULL);
+	return server_disconnect_export(exp);
+}
+
+static int echo_init_export(struct obd_export *exp)
+{
+	return ldlm_init_export(exp);	/* only the ldlm part is set up here */
+}
+
+static int echo_destroy_export(struct obd_export *exp)
+{
+	ENTRY;
+	/* run the target and ldlm export destructors; always reports success */
+	target_destroy_export(exp);
+	ldlm_destroy_export(exp);
+
+	RETURN(0);
+}
+
+/* Bump eo_lastino under eo_lock and return the new value. */
+static __u64 echo_next_id(struct obd_device *obddev)
+{
+	obd_id next;
+
+	spin_lock(&obddev->u.echo.eo_lock);
+	next = ++obddev->u.echo.eo_lastino;
+	spin_unlock(&obddev->u.echo.eo_lock);
+	return next;
+}
+
+/* Validate @oa, then give it the next echo object id (o_valid = FLID only). */
+static int echo_create(const struct lu_env *env, struct obd_export *exp,
+		       struct obdo *oa, struct lov_stripe_md **ea,
+		       struct obd_trans_info *oti)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+
+	if (!obd) {
+		CERROR("invalid client cookie "LPX64"\n",
+		       exp->exp_handle.h_cookie);
+		return -EINVAL;
+	}
+	/* bitwise test: a logical && would accept any non-zero o_mode */
+	if (!(oa->o_mode & S_IFMT)) {
+		CERROR("echo obd: no type!\n");
+		return -ENOENT;
+	}
+
+	if (!(oa->o_valid & OBD_MD_FLTYPE)) {
+		CERROR("invalid o_valid "LPX64"\n", oa->o_valid);
+		return -EINVAL;
+	}
+
+	ostid_set_seq_echo(&oa->o_oi);
+	ostid_set_id(&oa->o_oi, echo_next_id(obd));
+	oa->o_valid = OBD_MD_FLID;
+	return 0;
+}
+
+static int echo_destroy(const struct lu_env *env, struct obd_export *exp,
+			struct obdo *oa, struct lov_stripe_md *ea,
+			struct obd_trans_info *oti, struct obd_export *md_exp,
+			void *capa)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+
+	ENTRY;
+	if (!obd) {
+		CERROR("invalid client cookie "LPX64"\n",
+		       exp->exp_handle.h_cookie);
+		RETURN(-EINVAL);
+	}
+	/* the object id is only looked at when OBD_MD_FLID is set */
+	if (!(oa->o_valid & OBD_MD_FLID)) {
+		CERROR("obdo missing FLID valid flag: "LPX64"\n", oa->o_valid);
+		RETURN(-EINVAL);
+	}
+	/* accept ids in [ECHO_INIT_OID, eo_lastino]; nothing is freed here */
+	if (ostid_id(&oa->o_oi) > obd->u.echo.eo_lastino ||
+	    ostid_id(&oa->o_oi) < ECHO_INIT_OID) {
+		CERROR("bad destroy objid: "DOSTID"\n", POSTID(&oa->o_oi));
+		RETURN(-EINVAL);
+	}
+
+	RETURN(0);
+}
+
+static int echo_getattr(const struct lu_env *env, struct obd_export *exp,
+			struct obd_info *oinfo)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+	obd_id id = ostid_id(&oinfo->oi_oa->o_oi);
+
+	ENTRY;
+	if (!obd) {
+		CERROR("invalid client cookie "LPX64"\n",
+		       exp->exp_handle.h_cookie);
+		RETURN(-EINVAL);
+	}
+
+	if (!(oinfo->oi_oa->o_valid & OBD_MD_FLID)) {
+		CERROR("obdo missing FLID valid flag: "LPX64"\n",
+		       oinfo->oi_oa->o_valid);
+		RETURN(-EINVAL);
+	}
+	/* serve from the device-wide eo_oa, then set the echo seq and saved id */
+	obdo_cpy_md(oinfo->oi_oa, &obd->u.echo.eo_oa, oinfo->oi_oa->o_valid);
+	ostid_set_seq_echo(&oinfo->oi_oa->o_oi);
+	ostid_set_id(&oinfo->oi_oa->o_oi, id);
+
+	RETURN(0);
+}
+
+static int echo_setattr(const struct lu_env *env, struct obd_export *exp,
+			struct obd_info *oinfo, struct obd_trans_info *oti)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+
+	ENTRY;
+	if (!obd) {
+		CERROR("invalid client cookie "LPX64"\n",
+		       exp->exp_handle.h_cookie);
+		RETURN(-EINVAL);
+	}
+
+	if (!(oinfo->oi_oa->o_valid & OBD_MD_FLID)) {
+		CERROR("obdo missing FLID valid flag: "LPX64"\n",
+		       oinfo->oi_oa->o_valid);
+		RETURN(-EINVAL);
+	}
+	/* keep one device-wide copy; echo_getattr() copies fields back from it */
+	memcpy(&obd->u.echo.eo_oa, oinfo->oi_oa, sizeof(*oinfo->oi_oa));
+	/* NOTE(review): oti is dereferenced below without a NULL check - confirm */
+	if (ostid_id(&oinfo->oi_oa->o_oi) & 4) {
+		/* Save lock to force ACKed reply */
+		ldlm_lock_addref (&obd->u.echo.eo_nl_lock, LCK_NL);
+		oti->oti_ack_locks[0].mode = LCK_NL;
+		oti->oti_ack_locks[0].lock = obd->u.echo.eo_nl_lock;
+	}
+
+	RETURN(0);
+}
+
+/* Stamp each OBD_ECHO_BLOCK_SIZE block of the page range with debug data. */
+static void
+echo_page_debug_setup(struct page *page, int rw, obd_id id,
+		      __u64 offset, int len)
+{
+	int   in_page = offset & ~CFS_PAGE_MASK;
+	char *cursor  = ((char *)kmap(page)) + in_page;
+
+	if (len % OBD_ECHO_BLOCK_SIZE != 0)
+		CERROR("Unexpected block size %d\n", len);
+
+	for (; len > 0; len -= OBD_ECHO_BLOCK_SIZE) {
+		/* reads carry offset/id, writes a fixed 0xecc0... pattern */
+		if (rw & OBD_BRW_READ)
+			block_debug_setup(cursor, OBD_ECHO_BLOCK_SIZE,
+					  offset, id);
+		else
+			block_debug_setup(cursor, OBD_ECHO_BLOCK_SIZE,
+					  0xecc0ecc0ecc0ecc0ULL,
+					  0xecc0ecc0ecc0ecc0ULL);
+		cursor += OBD_ECHO_BLOCK_SIZE;
+		offset += OBD_ECHO_BLOCK_SIZE;
+	}
+
+	kunmap(page);
+}
+
+/* Check every block of the page range; return the first failure, else 0. */
+static int
+echo_page_debug_check(struct page *page, obd_id id,
+		      __u64 offset, int len)
+{
+	int   in_page = offset & ~CFS_PAGE_MASK;
+	char *cursor  = ((char *)kmap(page)) + in_page;
+	int   first_err = 0;
+
+	if (len % OBD_ECHO_BLOCK_SIZE != 0)
+		CERROR("Unexpected block size %d\n", len);
+
+	for (; len > 0; len -= OBD_ECHO_BLOCK_SIZE) {
+		int err = block_debug_check("echo", cursor,
+					    OBD_ECHO_BLOCK_SIZE, offset, id);
+
+		/* keep checking, but remember only the first error */
+		if (first_err == 0 && err != 0)
+			first_err = err;
+
+		cursor += OBD_ECHO_BLOCK_SIZE;
+		offset += OBD_ECHO_BLOCK_SIZE;
+	}
+
+	kunmap(page);
+
+	return first_err;
+}
+
+/* oti_handle cookie: set by echo_preprw(), asserted by echo_commitrw() */
+#define DESC_PRIV 0x10293847
+
+static int echo_map_nb_to_lb(struct obdo *oa, struct obd_ioobj *obj,
+			     struct niobuf_remote *nb, int *pages,
+			     struct niobuf_local *lb, int cmd, int *left)
+{
+	int gfp_mask = (ostid_id(&obj->ioo_oid) & 1) ?
+			GFP_HIGHUSER : GFP_IOFS;
+	int ispersistent = ostid_id(&obj->ioo_oid) == ECHO_PERSISTENT_OBJID;
+	int debug_setup = (!ispersistent &&
+			   (oa->o_valid & OBD_MD_FLFLAGS) != 0 &&
+			   (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0);
+	struct niobuf_local *res = lb;
+	obd_off offset = nb->offset;
+	int len = nb->len;
+	/* split [offset, offset + len) at page boundaries, one @lb entry each */
+	while (len > 0) {
+		int plen = PAGE_CACHE_SIZE - (offset & (PAGE_CACHE_SIZE-1));
+		if (len < plen)
+			plen = len;
+
+		/* check for local buf overflow */
+		if (*left == 0)
+			return -EINVAL;
+
+		res->lnb_file_offset = offset;
+		res->len = plen;
+		LASSERT((res->lnb_file_offset & ~CFS_PAGE_MASK) + res->len <=
+			PAGE_CACHE_SIZE);
+		/* persistent objid within range: reuse the preallocated page */
+		if (ispersistent &&
+		    ((res->lnb_file_offset >> PAGE_CACHE_SHIFT) <
+		      ECHO_PERSISTENT_PAGES)) {
+			res->page =
+				echo_persistent_pages[res->lnb_file_offset >>
+						      PAGE_CACHE_SHIFT];
+			/* Take extra ref so __free_pages() can be called OK */
+			get_page (res->page);
+		} else {
+			OBD_PAGE_ALLOC(res->page, gfp_mask);
+			if (res->page == NULL) {
+				CERROR("can't get page for id " DOSTID"\n",
+				       POSTID(&obj->ioo_oid));
+				return -ENOMEM;
+			}
+		}
+
+		CDEBUG(D_PAGE, "$$$$ get page %p @ "LPU64" for %d\n",
+		       res->page, res->lnb_file_offset, res->len);
+		/* for reads the per-buffer result is preset to the full length */
+		if (cmd & OBD_BRW_READ)
+			res->rc = res->len;
+
+		if (debug_setup)
+			echo_page_debug_setup(res->page, cmd,
+					      ostid_id(&obj->ioo_oid),
+					      res->lnb_file_offset, res->len);
+
+		offset += plen;
+		len -= plen;
+		res++;
+		/* one slot consumed, one more page handed back to the caller */
+		(*left)--;
+		(*pages)++;
+	}
+
+	return 0;
+}
+
+static int echo_finalize_lb(struct obdo *oa, struct obd_ioobj *obj,
+			    struct niobuf_remote *rb, int *pgs,
+			    struct niobuf_local *lb, int verify)
+{
+	struct niobuf_local *res = lb;
+	obd_off start  = rb->offset >> PAGE_CACHE_SHIFT;
+	obd_off end    = (rb->offset + rb->len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	int     count  = (int)(end - start);
+	int     rc     = 0;
+	int     i;
+	/* walk one @lb entry per page spanned by @rb, freeing each page */
+	for (i = 0; i < count; i++, (*pgs) ++, res++) {
+		struct page *page = res->page;
+		void       *addr;
+
+		if (page == NULL) {
+			CERROR("null page objid "LPU64":%p, buf %d/%d\n",
+			       ostid_id(&obj->ioo_oid), page, i,
+			       obj->ioo_bufcnt);
+			return -EFAULT;
+		}
+
+		addr = kmap(page);
+
+		CDEBUG(D_PAGE, "$$$$ use page %p, addr %p@"LPU64"\n",
+		       res->page, addr, res->lnb_file_offset);
+
+		if (verify) {
+			int vrc = echo_page_debug_check(page,
+							ostid_id(&obj->ioo_oid),
+							res->lnb_file_offset,
+							res->len);
+			/* check all the pages always */
+			if (vrc != 0 && rc == 0)
+				rc = vrc;
+		}
+
+		kunmap(page);
+		/* NB see comment above regarding persistent pages */
+		OBD_PAGE_FREE(page);
+	}
+
+	return rc;
+}
+
+/* Set up one niobuf_local per page for all remote buffers of all objects. */
+static int echo_preprw(const struct lu_env *env, int cmd,
+		       struct obd_export *export, struct obdo *oa,
+		       int objcount, struct obd_ioobj *obj,
+		       struct niobuf_remote *nb, int *pages,
+		       struct niobuf_local *res, struct obd_trans_info *oti,
+		       struct lustre_capa *unused)
+{
+	struct obd_device *obd;
+	int tot_bytes = 0;
+	int rc = 0;
+	int i, left;
+	ENTRY;
+
+	obd = export->exp_obd;
+	if (obd == NULL)
+		RETURN(-EINVAL);
+
+	/* Temp fix to stop falling foul of osc_announce_cached() */
+	oa->o_valid &= ~(OBD_MD_FLBLOCKS | OBD_MD_FLGRANT);
+
+	memset(res, 0, sizeof(*res) * *pages);
+
+	CDEBUG(D_PAGE, "%s %d obdos with %d IOs\n",
+	       cmd == OBD_BRW_READ ? "reading" : "writing", objcount, *pages);
+
+	if (oti)
+		oti->oti_handle = (void *)DESC_PRIV;
+
+	left = *pages;
+	*pages = 0;
+
+	for (i = 0; i < objcount; i++, obj++) {
+		int j;
+
+		for (j = 0 ; j < obj->ioo_bufcnt ; j++, nb++) {
+			rc = echo_map_nb_to_lb(oa, obj, nb, pages,
+					       res + *pages, cmd, &left);
+			if (rc)
+				GOTO(preprw_cleanup, rc);
+
+			tot_bytes += nb->len;
+		}
+	}
+	/* pages are accounted in eo_prep only once all of them are set up */
+	atomic_add(*pages, &obd->u.echo.eo_prep);
+
+	if (cmd & OBD_BRW_READ)
+		lprocfs_counter_add(obd->obd_stats, LPROC_ECHO_READ_BYTES,
+				    tot_bytes);
+	else
+		lprocfs_counter_add(obd->obd_stats, LPROC_ECHO_WRITE_BYTES,
+				    tot_bytes);
+
+	CDEBUG(D_PAGE, "%d pages allocated after prep\n",
+	       atomic_read(&obd->u.echo.eo_prep));
+
+	RETURN(0);
+
+preprw_cleanup:
+	/* It is possible that we would rather handle errors by  allow
+	 * any already-set-up pages to complete, rather than tearing them
+	 * all down again.  I believe that this is what the in-kernel
+	 * prep/commit operations do.
+	 */
+	CERROR("cleaning up %u pages (%d obdos)\n", *pages, objcount);
+	for (i = 0; i < *pages; i++) {
+		/* echo_map_nb_to_lb() leaves no page kmap()ed, so there is
+		 * nothing to kunmap() here.  NB if this is a persistent page,
+		 * __free_pages will just lose the extra ref gained above */
+		OBD_PAGE_FREE(res[i].page);
+		res[i].page = NULL;
+		/* eo_prep was not bumped on this path: nothing to undo */
+	}
+
+	return rc;
+}
+
+static int echo_commitrw(const struct lu_env *env, int cmd,
+			 struct obd_export *export, struct obdo *oa,
+			 int objcount, struct obd_ioobj *obj,
+			 struct niobuf_remote *rb, int niocount,
+			 struct niobuf_local *res, struct obd_trans_info *oti,
+			 int rc)
+{
+	struct obd_device *obd;
+	int pgs = 0;
+	int i;
+	ENTRY;
+
+	obd = export->exp_obd;
+	if (obd == NULL)
+		RETURN(-EINVAL);
+	/* NOTE(review): cleanup reads res[] before the NULL check below runs */
+	if (rc)
+		GOTO(commitrw_cleanup, rc);
+
+	if ((cmd & OBD_BRW_RWMASK) == OBD_BRW_READ) {
+		CDEBUG(D_PAGE, "reading %d obdos with %d IOs\n",
+		       objcount, niocount);
+	} else {
+		CDEBUG(D_PAGE, "writing %d obdos with %d IOs\n",
+		       objcount, niocount);
+	}
+
+	if (niocount && res == NULL) {
+		CERROR("NULL res niobuf with niocount %d\n", niocount);
+		RETURN(-EINVAL);
+	}
+	/* the cookie stored by echo_preprw() must come back unchanged */
+	LASSERT(oti == NULL || oti->oti_handle == (void *)DESC_PRIV);
+
+	for (i = 0; i < objcount; i++, obj++) {
+		int verify = (rc == 0 &&
+			     ostid_id(&obj->ioo_oid) != ECHO_PERSISTENT_OBJID &&
+			      (oa->o_valid & OBD_MD_FLFLAGS) != 0 &&
+			      (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0);
+		int j;
+
+		for (j = 0 ; j < obj->ioo_bufcnt ; j++, rb++) {
+			int vrc = echo_finalize_lb(oa, obj, rb, &pgs, &res[pgs],
+						   verify);
+			if (vrc == 0)
+				continue;
+			/* a NULL page aborts; other errors are only recorded */
+			if (vrc == -EFAULT)
+				GOTO(commitrw_cleanup, rc = vrc);
+
+			if (rc == 0)
+				rc = vrc;
+		}
+
+	}
+
+	atomic_sub(pgs, &obd->u.echo.eo_prep);
+
+	CDEBUG(D_PAGE, "%d pages remain after commit\n",
+	       atomic_read(&obd->u.echo.eo_prep));
+	RETURN(rc);
+
+commitrw_cleanup:
+	atomic_sub(pgs, &obd->u.echo.eo_prep);
+
+	CERROR("cleaning up %d pages (%d obdos)\n",
+	       niocount - pgs - 1, objcount);
+	/* free the pages that echo_finalize_lb() did not get to */
+	while (pgs < niocount) {
+		struct page *page = res[pgs++].page;
+
+		if (page == NULL)
+			continue;
+
+		/* NB see comment above regarding persistent pages */
+		OBD_PAGE_FREE(page);
+		atomic_dec(&obd->u.echo.eo_prep);
+	}
+	return rc;
+}
+
+/* Set up the echo target: namespace, NL lock, lprocfs stats, ldlm client. */
+static int echo_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	struct lprocfs_static_vars lvars;
+	int rc;
+	__u64 lock_flags = 0;
+	struct ldlm_res_id res_id = {.name = {1}};
+	char ns_name[48];
+	ENTRY;
+	obd->u.echo.eo_obt.obt_magic = OBT_MAGIC;
+	spin_lock_init(&obd->u.echo.eo_lock);
+	obd->u.echo.eo_lastino = ECHO_INIT_OID;
+	/* bounded formatting: never write past ns_name, whatever the uuid */
+	snprintf(ns_name, sizeof(ns_name), "echotgt-%s", obd->obd_uuid.uuid);
+	obd->obd_namespace = ldlm_namespace_new(obd, ns_name,
+						LDLM_NAMESPACE_SERVER,
+						LDLM_NAMESPACE_MODEST,
+						LDLM_NS_TYPE_OST);
+	if (obd->obd_namespace == NULL) {
+		LBUG();
+		RETURN(-ENOMEM);
+	}
+
+	rc = ldlm_cli_enqueue_local(obd->obd_namespace, &res_id, LDLM_PLAIN,
+				    NULL, LCK_NL, &lock_flags, NULL,
+				    ldlm_completion_ast, NULL, NULL, 0,
+				    LVB_T_NONE, NULL, &obd->u.echo.eo_nl_lock);
+	LASSERT(rc == ELDLM_OK);
+
+	lprocfs_echo_init_vars(&lvars);
+	if (lprocfs_obd_setup(obd, lvars.obd_vars) == 0 &&
+	    lprocfs_alloc_obd_stats(obd, LPROC_ECHO_LAST) == 0) {
+		lprocfs_counter_init(obd->obd_stats, LPROC_ECHO_READ_BYTES,
+				     LPROCFS_CNTR_AVGMINMAX,
+				     "read_bytes", "bytes");
+		lprocfs_counter_init(obd->obd_stats, LPROC_ECHO_WRITE_BYTES,
+				     LPROCFS_CNTR_AVGMINMAX,
+				     "write_bytes", "bytes");
+	}
+
+	ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL,
+			   "echo_ldlm_cb_client", &obd->obd_ldlm_client);
+	RETURN(0);
+}
+
+static int echo_cleanup(struct obd_device *obd)
+{
+	int leaked;
+	ENTRY;
+
+	lprocfs_obd_cleanup(obd);
+	lprocfs_free_obd_stats(obd);
+	/* drop the reference on the NL lock enqueued in echo_setup() */
+	ldlm_lock_decref(&obd->u.echo.eo_nl_lock, LCK_NL);
+
+	/* XXX Bug 3413; wait for a bit to ensure the BL callback has
+	 * happened before calling ldlm_namespace_free() */
+	schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE, cfs_time_seconds(1));
+
+	ldlm_namespace_free(obd->obd_namespace, NULL, obd->obd_force);
+	obd->obd_namespace = NULL;
+	/* a non-zero eo_prep here is reported below as leaked pages */
+	leaked = atomic_read(&obd->u.echo.eo_prep);
+	if (leaked != 0)
+		CERROR("%d prep/commitrw pages leaked\n", leaked);
+
+	RETURN(0);
+}
+
+struct obd_ops echo_obd_ops = {
+	.o_owner		= THIS_MODULE,
+	.o_setup		= echo_setup,
+	.o_cleanup		= echo_cleanup,
+	.o_connect		= echo_connect,
+	.o_disconnect		= echo_disconnect,
+	.o_init_export		= echo_init_export,
+	.o_destroy_export	= echo_destroy_export,
+	.o_create		= echo_create,
+	.o_destroy		= echo_destroy,
+	.o_getattr		= echo_getattr,
+	.o_setattr		= echo_setattr,
+	.o_preprw		= echo_preprw,
+	.o_commitrw		= echo_commitrw,
+};
+
+void echo_persistent_pages_fini(void)
+{
+	int     i;
+	/* free what is still held; NULLed slots make repeat calls harmless */
+	for (i = 0; i < ECHO_PERSISTENT_PAGES; i++)
+		if (echo_persistent_pages[i] != NULL) {
+			OBD_PAGE_FREE(echo_persistent_pages[i]);
+			echo_persistent_pages[i] = NULL;
+		}
+}
+
+/* Allocate and zero the persistent pages; on failure free them all again. */
+int echo_persistent_pages_init(void)
+{
+	struct page *pg;
+	int idx;
+
+	for (idx = 0; idx < ECHO_PERSISTENT_PAGES; idx++) {
+		/* first half with GFP_IOFS, second half with GFP_HIGHUSER */
+		int gfp_mask = GFP_HIGHUSER;
+
+		if (idx < ECHO_PERSISTENT_PAGES / 2)
+			gfp_mask = GFP_IOFS;
+		OBD_PAGE_ALLOC(pg, gfp_mask);
+		if (pg == NULL) {
+			echo_persistent_pages_fini();
+			return -ENOMEM;
+		}
+		memset(kmap(pg), 0, PAGE_CACHE_SIZE);
+		kunmap(pg);
+		echo_persistent_pages[idx] = pg;
+	}
+	return 0;
+}
diff --git a/drivers/staging/lustre/lustre/obdecho/echo_client.c b/drivers/staging/lustre/lustre/obdecho/echo_client.c
new file mode 100644
index 000000000000..0545d1666841
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdecho/echo_client.c
@@ -0,0 +1,3217 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_ECHO
+#include <linux/libcfs/libcfs.h>
+
+#include <obd.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_debug.h>
+#include <lprocfs_status.h>
+#include <cl_object.h>
+#include <lustre_fid.h>
+#include <lustre_acl.h>
+#include <lustre_net.h>
+#include <obd_lov.h>
+
+#include "echo_internal.h"
+
+/** \defgroup echo_client Echo Client
+ * @{
+ */
+
+/* Echo client device: a cl_device plus what it is stacked upon. */
+struct echo_device {
+	/* embedded CLIO device, see cl2echo_dev()/echo_dev2cl() */
+	struct cl_device	 ed_cl;
+	/* echo_client_obd of the owning obd device */
+	struct echo_client_obd	*ed_ec;
+
+	/* site storage owned by this device */
+	struct cl_site		 ed_site_myself;
+	/* site in use; set to &ed_site_myself once initialized */
+	struct cl_site		*ed_site;
+	/* device below the echo client, NULL if there is none */
+	struct lu_device	*ed_next;
+	/* set when the target obd type is LUSTRE_LOV_NAME */
+	int			 ed_next_islov;
+	/* set when the target obd type is LUSTRE_MDT_NAME */
+	int			 ed_next_ismd;
+	/* client sequence, allocated by echo_fid_init() */
+	struct lu_client_seq	*ed_cl_seq;
+};
+
+/* Echo client object: a cl_object together with its own header. */
+struct echo_object {
+	/* embedded CLIO object, see cl2echo_obj()/echo_obj2cl() */
+	struct cl_object	 eo_cl;
+	/* object header, initialized in echo_object_alloc() */
+	struct cl_object_header	 eo_hdr;
+
+	/* owning echo device */
+	struct echo_device	*eo_dev;
+	/* linkage into echo_client_obd::ec_objects (under ec_lock) */
+	struct list_head	 eo_obj_chain;
+	/* stripe md taken over from the object conf; NULL for MD targets */
+	struct lov_stripe_md	*eo_lsm;
+	/* number of echo_page slices currently attached */
+	atomic_t		 eo_npages;
+	/* set by echo_device_free() when the device is going away */
+	int			 eo_deleted;
+};
+
+/* Object configuration handed to cl_object_find() by the echo client. */
+struct echo_object_conf {
+	/* generic CLIO conf, see cl2echo_conf() */
+	struct cl_object_conf	  eoc_cl;
+	/* stripe md slot; echo_object_init() takes *eoc_md and NULLs it */
+	struct lov_stripe_md	**eoc_md;
+};
+
+/* Echo layer slice of a cl_page. */
+struct echo_page {
+	/* embedded page slice, see cl2echo_page() */
+	struct cl_page_slice	 ep_cl;
+	/* taken in echo_page_own(), dropped in echo_page_disown() */
+	struct mutex		 ep_lock;
+	/* VM page backing this slice; a reference is held while attached */
+	struct page		*ep_vmpage;
+};
+
+/* Echo layer slice of a cl_lock. */
+struct echo_lock {
+	/* embedded lock slice, see cl2echo_lock() */
+	struct cl_lock_slice	 el_cl;
+	/* linkage into echo_client_obd::ec_locks (under ec_lock) */
+	struct list_head	 el_chain;
+	/* object this lock was created for */
+	struct echo_object	*el_object;
+	/* cookie returned to the enqueuer and matched on cancel */
+	__u64			 el_cookie;
+	/* number of enqueues not yet cancelled */
+	atomic_t		 el_refcount;
+};
+
+/* Echo layer slice of a cl_io; it carries no echo specific state. */
+struct echo_io {
+	struct cl_io_slice	ei_cl;
+};
+
+/* Compiled out: request slice for the echo layer, currently unused. */
+#if 0
+struct echo_req {
+	struct cl_req_slice er_cl;
+};
+#endif
+
+/*
+ * Forward declarations: both are called from the device alloc/free paths
+ * below, before their definitions appear later in the file.
+ */
+static int echo_client_setup(const struct lu_env *env,
+			     struct obd_device *obddev,
+			     struct lustre_cfg *lcfg);
+static int echo_client_cleanup(struct obd_device *obddev);
+
+
+/** \defgroup echo_helpers Helper functions
+ * @{
+ */
+/* Map a cl_device to the echo_device that embeds it as ed_cl. */
+static inline struct echo_device *cl2echo_dev(const struct cl_device *dev)
+{
+	struct echo_device *ed;
+
+	ed = container_of0(dev, struct echo_device, ed_cl);
+	return ed;
+}
+
+/* Return the cl_device embedded in echo device \a d. */
+static inline struct cl_device *echo_dev2cl(struct echo_device *d)
+{
+	struct cl_device *cld = &d->ed_cl;
+
+	return cld;
+}
+
+/* Resolve the echo_device behind an obd device through obd_lu_dev. */
+static inline struct echo_device *obd2echo_dev(const struct obd_device *obd)
+{
+	struct echo_device *ed;
+
+	ed = cl2echo_dev(lu2cl_dev(obd->obd_lu_dev));
+	return ed;
+}
+
+/* Return the cl_object embedded in echo object \a eco. */
+static inline struct cl_object *echo_obj2cl(struct echo_object *eco)
+{
+	struct cl_object *clo = &eco->eo_cl;
+
+	return clo;
+}
+
+/* Map a cl_object to the echo_object that embeds it as eo_cl. */
+static inline struct echo_object *cl2echo_obj(const struct cl_object *o)
+{
+	struct echo_object *eco;
+
+	eco = container_of(o, struct echo_object, eo_cl);
+	return eco;
+}
+
+/* Map a page slice to the echo_page that embeds it as ep_cl. */
+static inline struct echo_page *cl2echo_page(const struct cl_page_slice *s)
+{
+	struct echo_page *ep;
+
+	ep = container_of(s, struct echo_page, ep_cl);
+	return ep;
+}
+
+/* Map a lock slice to the echo_lock that embeds it as el_cl. */
+static inline struct echo_lock *cl2echo_lock(const struct cl_lock_slice *s)
+{
+	struct echo_lock *ecl;
+
+	ecl = container_of(s, struct echo_lock, el_cl);
+	return ecl;
+}
+
+/* Return the cl_lock that echo lock slice \a ecl belongs to. */
+static inline struct cl_lock *echo_lock2cl(const struct echo_lock *ecl)
+{
+	const struct cl_lock_slice *slice = &ecl->el_cl;
+
+	return slice->cls_lock;
+}
+
+/* Tentative definition; the initialized echo_thread_key follows below. */
+static struct lu_context_key echo_thread_key;
+
+/*
+ * Fetch the echo per-thread scratch area registered under echo_thread_key
+ * from the context of \a env.  Asserts that the value is present.
+ */
+static inline struct echo_thread_info *echo_env_info(const struct lu_env *env)
+{
+	struct echo_thread_info *eti;
+
+	eti = lu_context_key_get(&env->le_ctx, &echo_thread_key);
+	LASSERT(eti != NULL);
+
+	return eti;
+}
+
+/* Map a cl_object_conf to the echo_object_conf embedding it as eoc_cl. */
+static inline
+struct echo_object_conf *cl2echo_conf(const struct cl_object_conf *c)
+{
+	struct echo_object_conf *econf;
+
+	econf = container_of(c, struct echo_object_conf, eoc_cl);
+	return econf;
+}
+
+/** @} echo_helpers */
+
+/*
+ * Prototypes of the echo client entry points defined further down in this
+ * file (object lookup/release, lock enqueue/cancel and bulk read/write).
+ */
+static struct echo_object *cl_echo_object_find(struct echo_device *d,
+					       struct lov_stripe_md **lsm);
+static int cl_echo_object_put(struct echo_object *eco);
+static int cl_echo_enqueue   (struct echo_object *eco, obd_off start,
+			      obd_off end, int mode, __u64 *cookie);
+static int cl_echo_cancel    (struct echo_device *d, __u64 cookie);
+static int cl_echo_object_brw(struct echo_object *eco, int rw, obd_off offset,
+			      struct page **pages, int npages, int async);
+
+/* NOTE(review): repeats the declaration of the inline helper defined above. */
+static struct echo_thread_info *echo_env_info(const struct lu_env *env);
+
+/*
+ * Per-thread scratch area of the echo client, obtained with echo_env_info().
+ * Allocated from echo_thread_kmem by echo_thread_key_init().
+ */
+struct echo_thread_info {
+	/* object conf passed to cl_object_find() */
+	struct echo_object_conf	 eti_conf;
+	/* md wrapper used when the echo client sits on top of lov */
+	struct lustre_md	 eti_md;
+
+	/* page queues for bulk I/O */
+	struct cl_2queue	 eti_queue;
+	/* io used by the enqueue and brw paths */
+	struct cl_io		 eti_io;
+	/* lock description used by cl_echo_enqueue0() */
+	struct cl_lock_descr	 eti_descr;
+	/* fid built from the ostid in cl_echo_object_find() */
+	struct lu_fid		 eti_fid;
+	struct lu_fid		 eti_fid2;
+	struct md_op_spec	 eti_spec;
+	struct lov_mds_md_v3	 eti_lmm;
+	struct lov_user_md_v3	 eti_lum;
+	struct md_attr		 eti_ma;
+	struct lu_name		 eti_lname;
+	/* per-thread values, can be re-used */
+	void			*eti_big_lmm;
+	int			 eti_big_lmmsize;
+	char			 eti_name[20];
+	struct lu_buf		 eti_buf;
+	char			 eti_xattr_buf[LUSTRE_POSIX_ACL_MAX_SIZE];
+};
+
+/*
+ * Per-session data of the echo client.  No session state is used right
+ * now, so this only holds a placeholder member.
+ */
+struct echo_session_info {
+	unsigned long	dummy;
+};
+
+/* Slab caches of the echo client; each one is described in echo_caches[]. */
+static struct kmem_cache *echo_lock_kmem;
+static struct kmem_cache *echo_object_kmem;
+static struct kmem_cache *echo_thread_kmem;
+static struct kmem_cache *echo_session_kmem;
+/* disabled, like struct echo_req: static struct kmem_cache *echo_req_kmem; */
+
+/*
+ * Descriptors of the slab caches above.  The table ends with an entry whose
+ * ckd_cache is NULL.
+ */
+static struct lu_kmem_descr echo_caches[] = {
+	{
+		.ckd_cache = &echo_lock_kmem,
+		.ckd_name  = "echo_lock_kmem",
+		.ckd_size  = sizeof(struct echo_lock),
+	},
+	{
+		.ckd_cache = &echo_object_kmem,
+		.ckd_name  = "echo_object_kmem",
+		.ckd_size  = sizeof(struct echo_object),
+	},
+	{
+		.ckd_cache = &echo_thread_kmem,
+		.ckd_name  = "echo_thread_kmem",
+		.ckd_size  = sizeof(struct echo_thread_info),
+	},
+	{
+		.ckd_cache = &echo_session_kmem,
+		.ckd_name  = "echo_session_kmem",
+		.ckd_size  = sizeof(struct echo_session_info),
+	},
+#if 0
+	{
+		.ckd_cache = &echo_req_kmem,
+		.ckd_name  = "echo_req_kmem",
+		.ckd_size  = sizeof(struct echo_req),
+	},
+#endif
+	{
+		/* terminator */
+		.ckd_cache = NULL,
+	},
+};
+
+/** \defgroup echo_page Page operations
+ *
+ * Echo page operations.
+ *
+ * @{
+ */
+/* cpo_vmpage: return the VM page recorded in the echo page slice. */
+static struct page *echo_page_vmpage(const struct lu_env *env,
+				    const struct cl_page_slice *slice)
+{
+	struct echo_page *ep = cl2echo_page(slice);
+
+	return ep->ep_vmpage;
+}
+
+/*
+ * cpo_own: take ownership of the page by acquiring ep_lock.
+ *
+ * With \a nonblock set only a trylock is attempted and -EAGAIN is returned
+ * when the mutex is already held; otherwise the call sleeps on the mutex.
+ * Returns 0 once the mutex has been acquired.
+ */
+static int echo_page_own(const struct lu_env *env,
+			 const struct cl_page_slice *slice,
+			 struct cl_io *io, int nonblock)
+{
+	struct echo_page *ep = cl2echo_page(slice);
+
+	if (nonblock) {
+		if (!mutex_trylock(&ep->ep_lock))
+			return -EAGAIN;
+		return 0;
+	}
+
+	mutex_lock(&ep->ep_lock);
+	return 0;
+}
+
+/* cpo_disown: give up ownership by releasing ep_lock, which must be held. */
+static void echo_page_disown(const struct lu_env *env,
+			     const struct cl_page_slice *slice,
+			     struct cl_io *io)
+{
+	struct mutex *lock = &cl2echo_page(slice)->ep_lock;
+
+	LASSERT(mutex_is_locked(lock));
+	mutex_unlock(lock);
+}
+
+/* cpo_discard: discarding an echo page simply deletes the cl_page. */
+static void echo_page_discard(const struct lu_env *env,
+			      const struct cl_page_slice *slice,
+			      struct cl_io *unused)
+{
+	struct cl_page *page = slice->cpl_page;
+
+	cl_page_delete(env, page);
+}
+
+/*
+ * cpo_is_vmlocked: report -EBUSY while ep_lock is held and -ENODATA when
+ * it is not.
+ */
+static int echo_page_is_vmlocked(const struct lu_env *env,
+				 const struct cl_page_slice *slice)
+{
+	struct echo_page *ep = cl2echo_page(slice);
+
+	return mutex_is_locked(&ep->ep_lock) ? -EBUSY : -ENODATA;
+}
+
+/*
+ * cpo_completion: nothing to do for echo pages beyond asserting that the
+ * page has a sync io attached.
+ */
+static void echo_page_completion(const struct lu_env *env,
+				 const struct cl_page_slice *slice,
+				 int ioret)
+{
+	const struct cl_page *page = slice->cpl_page;
+
+	LASSERT(page->cp_sync_io != NULL);
+}
+
+/*
+ * cpo_fini: undo echo_page_init() -- decrement the object's page count and
+ * drop the reference held on the VM page.
+ */
+static void echo_page_fini(const struct lu_env *env,
+			   struct cl_page_slice *slice)
+{
+	struct echo_object *eco = cl2echo_obj(slice->cpl_obj);
+	struct echo_page *ep = cl2echo_page(slice);
+	ENTRY;
+
+	atomic_dec(&eco->eo_npages);
+	page_cache_release(ep->ep_vmpage);
+	EXIT;
+}
+
+/* cpo_prep (read and write): echo pages need no preparation; returns 0. */
+static int echo_page_prep(const struct lu_env *env,
+			  const struct cl_page_slice *slice,
+			  struct cl_io *unused)
+{
+	const int rc = 0;
+
+	return rc;
+}
+
+/*
+ * cpo_print: emit the slice address, whether ep_lock is currently held and
+ * the VM page address through \a printer.  Always returns 0.
+ */
+static int echo_page_print(const struct lu_env *env,
+			   const struct cl_page_slice *slice,
+			   void *cookie, lu_printer_t printer)
+{
+	struct echo_page *ep = cl2echo_page(slice);
+	int locked = mutex_is_locked(&ep->ep_lock);
+
+	printer(env, cookie, LUSTRE_ECHO_CLIENT_NAME"-page@%p %d vm@%p\n",
+		ep, locked, ep->ep_vmpage);
+	return 0;
+}
+
+/* Page slice methods of the echo layer; read and write share handlers. */
+static const struct cl_page_operations echo_page_ops = {
+	.cpo_own		= echo_page_own,
+	.cpo_disown		= echo_page_disown,
+	.cpo_discard		= echo_page_discard,
+	.cpo_vmpage		= echo_page_vmpage,
+	.cpo_fini		= echo_page_fini,
+	.cpo_print		= echo_page_print,
+	.cpo_is_vmlocked	= echo_page_is_vmlocked,
+	.io = {
+		[CRT_READ] = {
+			.cpo_prep	= echo_page_prep,
+			.cpo_completion	= echo_page_completion,
+		},
+		[CRT_WRITE] = {
+			.cpo_prep	= echo_page_prep,
+			.cpo_completion	= echo_page_completion,
+		},
+	},
+};
+/** @} echo_page */
+
+/** \defgroup echo_lock Locking
+ *
+ * echo lock operations
+ *
+ * @{
+ */
+/*
+ * clo_fini: return the echo lock slice to echo_lock_kmem.  The lock must no
+ * longer be linked through el_chain.
+ */
+static void echo_lock_fini(const struct lu_env *env,
+			   struct cl_lock_slice *slice)
+{
+	struct echo_lock *el = cl2echo_lock(slice);
+
+	LASSERT(list_empty(&el->el_chain));
+	OBD_SLAB_FREE_PTR(el, echo_lock_kmem);
+}
+
+/* clo_delete: only checks that the lock is already off el_chain. */
+static void echo_lock_delete(const struct lu_env *env,
+			     const struct cl_lock_slice *slice)
+{
+	struct echo_lock *el = cl2echo_lock(slice);
+
+	LASSERT(list_empty(&el->el_chain));
+}
+
+/* clo_fits_into: the echo layer accepts every lock; always returns 1. */
+static int echo_lock_fits_into(const struct lu_env *env,
+			       const struct cl_lock_slice *slice,
+			       const struct cl_lock_descr *need,
+			       const struct cl_io *unused)
+{
+	const int fits = 1;
+
+	return fits;
+}
+
+/* Lock slice methods of the echo layer. */
+static struct cl_lock_operations echo_lock_ops = {
+	.clo_fini	= echo_lock_fini,
+	.clo_delete	= echo_lock_delete,
+	.clo_fits_into	= echo_lock_fits_into,
+};
+
+/** @} echo_lock */
+
+/** \defgroup echo_cl_ops cl_object operations
+ *
+ * operations for cl_object
+ *
+ * @{
+ */
+/*
+ * coo_page_init: set up the echo slice of \a page.  Records \a vmpage and
+ * takes a reference on it, initializes ep_lock, attaches the slice with
+ * echo_page_ops and bumps the object's page count.  Always returns 0.
+ */
+static int echo_page_init(const struct lu_env *env, struct cl_object *obj,
+			struct cl_page *page, struct page *vmpage)
+{
+	struct echo_object *eco = cl2echo_obj(obj);
+	struct echo_page *slice = cl_object_page_slice(obj, page);
+	ENTRY;
+
+	slice->ep_vmpage = vmpage;
+	page_cache_get(vmpage);
+	mutex_init(&slice->ep_lock);
+	cl_page_slice_add(page, &slice->ep_cl, obj, &echo_page_ops);
+	atomic_inc(&eco->eo_npages);
+	RETURN(0);
+}
+
+/* coo_io_init: the echo layer adds no io slice; always returns 0. */
+static int echo_io_init(const struct lu_env *env, struct cl_object *obj,
+			struct cl_io *io)
+{
+	const int rc = 0;
+
+	return rc;
+}
+
+/*
+ * coo_lock_init: allocate an echo lock slice from echo_lock_kmem and attach
+ * it to \a lock.  The slice starts unlinked with a zero refcount.
+ *
+ * Returns 0 on success, -ENOMEM if the slab allocation fails.
+ */
+static int echo_lock_init(const struct lu_env *env,
+			  struct cl_object *obj, struct cl_lock *lock,
+			  const struct cl_io *unused)
+{
+	struct echo_lock *ecl;
+	ENTRY;
+
+	OBD_SLAB_ALLOC_PTR_GFP(ecl, echo_lock_kmem, __GFP_IO);
+	if (ecl == NULL)
+		RETURN(-ENOMEM);
+
+	cl_lock_slice_add(lock, &ecl->el_cl, obj, &echo_lock_ops);
+	ecl->el_object = cl2echo_obj(obj);
+	INIT_LIST_HEAD(&ecl->el_chain);
+	atomic_set(&ecl->el_refcount, 0);
+
+	RETURN(0);
+}
+
+/* coo_conf_set: configuration changes are ignored; always returns 0. */
+static int echo_conf_set(const struct lu_env *env, struct cl_object *obj,
+			 const struct cl_object_conf *conf)
+{
+	const int rc = 0;
+
+	return rc;
+}
+
+/* cl_object methods of echo objects. */
+static const struct cl_object_operations echo_cl_obj_ops = {
+	.coo_page_init	= echo_page_init,
+	.coo_lock_init	= echo_lock_init,
+	.coo_io_init	= echo_io_init,
+	.coo_conf_set	= echo_conf_set,
+};
+/** @} echo_cl_ops */
+
+/** \defgroup echo_lu_ops lu_object operations
+ *
+ * operations for echo lu object.
+ *
+ * @{
+ */
+/*
+ * loo_object_init: finish construction of an echo object.
+ *
+ * If the echo device has a lower device, a sub-object is allocated from it
+ * and added below \a obj.  For non-MD targets the stripe md referenced by
+ * the echo conf is taken over by the object.  Finally the object is linked
+ * into the ec_objects list of the echo client.
+ *
+ * Returns 0 on success, -ENOMEM if the lower object cannot be allocated.
+ */
+static int echo_object_init(const struct lu_env *env, struct lu_object *obj,
+			    const struct lu_object_conf *conf)
+{
+	struct echo_object     *eco = cl2echo_obj(lu2cl(obj));
+	struct echo_device     *ed  = cl2echo_dev(lu2cl_dev(obj->lo_dev));
+	struct echo_client_obd *ec  = ed->ed_ec;
+	ENTRY;
+
+	if (ed->ed_next != NULL) {
+		struct lu_device *under = ed->ed_next;
+		struct lu_object *below;
+
+		below = under->ld_ops->ldo_object_alloc(env, obj->lo_header,
+							under);
+		if (below == NULL)
+			RETURN(-ENOMEM);
+		lu_object_add(obj, below);
+	}
+
+	if (ed->ed_next_ismd) {
+		eco->eo_lsm = NULL;
+	} else {
+		struct echo_object_conf *econf;
+
+		econf = cl2echo_conf(lu2cl_conf(conf));
+		LASSERT(econf->eoc_md);
+		eco->eo_lsm = *econf->eoc_md;
+		/* ownership moved to the object: clear the caller's pointer
+		 * so that the lsm won't get freed. */
+		*econf->eoc_md = NULL;
+	}
+
+	eco->eo_dev = ed;
+	atomic_set(&eco->eo_npages, 0);
+	cl_object_page_init(lu2cl(obj), sizeof(struct echo_page));
+
+	spin_lock(&ec->ec_lock);
+	list_add_tail(&eco->eo_obj_chain, &ec->ec_objects);
+	spin_unlock(&ec->ec_lock);
+
+	RETURN(0);
+}
+
+/*
+ * Allocate an in-memory stripe md for the echo client (taken from
+ * osc_unpackmd()).
+ *
+ * When the echo device has a lower device the request is forwarded to
+ * obd_alloc_memmd() on the client export.  Otherwise a single-stripe md is
+ * built here: the lsm plus one lov_oinfo, with lsm_maxbytes set to
+ * LUSTRE_STRIPE_MAXBYTES and the echo sequence set on lsm_oi.
+ *
+ * \param ed    echo device
+ * \param lsmp  receives the new md; *lsmp must be NULL on entry for the
+ *              local path
+ *
+ * \retval size of the allocated lsm (local path), or whatever
+ *         obd_alloc_memmd() returns
+ * \retval -ENOMEM on allocation failure; *lsmp is NULL in that case
+ */
+static int echo_alloc_memmd(struct echo_device *ed,
+			    struct lov_stripe_md **lsmp)
+{
+	int lsm_size;
+
+	ENTRY;
+
+	/* If export is lov/osc then use their obd method */
+	if (ed->ed_next != NULL)
+		RETURN(obd_alloc_memmd(ed->ed_ec->ec_exp, lsmp));
+	/* OFD has no unpackmd method, do everything here */
+	lsm_size = lov_stripe_md_size(1);
+
+	LASSERT(*lsmp == NULL);
+	OBD_ALLOC(*lsmp, lsm_size);
+	if (*lsmp == NULL)
+		RETURN(-ENOMEM);
+
+	OBD_ALLOC((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo));
+	if ((*lsmp)->lsm_oinfo[0] == NULL) {
+		OBD_FREE(*lsmp, lsm_size);
+		/* do not hand a freed pointer back to the caller */
+		*lsmp = NULL;
+		RETURN(-ENOMEM);
+	}
+
+	loi_init((*lsmp)->lsm_oinfo[0]);
+	(*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES;
+	ostid_set_seq_echo(&(*lsmp)->lsm_oi);
+
+	RETURN(lsm_size);
+}
+
+/*
+ * Release a stripe md obtained from echo_alloc_memmd().
+ *
+ * When the echo device has a lower device the request is forwarded to
+ * obd_free_memmd() on the client export.  Otherwise the single lov_oinfo
+ * and the lsm itself are freed and *lsmp is reset to NULL.
+ *
+ * \retval 0 on the local path, or whatever obd_free_memmd() returns
+ */
+static int echo_free_memmd(struct echo_device *ed, struct lov_stripe_md **lsmp)
+{
+	int lsm_size;
+
+	ENTRY;
+
+	/* If export is lov/osc then use their obd method */
+	if (ed->ed_next != NULL)
+		RETURN(obd_free_memmd(ed->ed_ec->ec_exp, lsmp));
+	/* OFD has no unpackmd method, do everything here */
+	lsm_size = lov_stripe_md_size(1);
+
+	LASSERT(*lsmp != NULL);
+	OBD_FREE((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo));
+	OBD_FREE(*lsmp, lsm_size);
+	*lsmp = NULL;
+	RETURN(0);
+}
+
+/*
+ * loo_object_free: destroy an echo object.  No pages may be attached any
+ * more.  The object is unlinked from ec_objects, its lu_object and header
+ * are finalized, the stripe md (if any) is released and the memory goes
+ * back to echo_object_kmem.
+ */
+static void echo_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+	struct echo_object     *eco = cl2echo_obj(lu2cl(obj));
+	struct echo_device     *ed  = eco->eo_dev;
+	struct echo_client_obd *ec  = ed->ed_ec;
+	ENTRY;
+
+	LASSERT(atomic_read(&eco->eo_npages) == 0);
+
+	spin_lock(&ec->ec_lock);
+	list_del_init(&eco->eo_obj_chain);
+	spin_unlock(&ec->ec_lock);
+
+	lu_object_fini(obj);
+	lu_object_header_fini(obj->lo_header);
+
+	if (eco->eo_lsm != NULL)
+		echo_free_memmd(ed, &eco->eo_lsm);
+	OBD_SLAB_FREE_PTR(eco, echo_object_kmem);
+	EXIT;
+}
+
+/* loo_object_print: print the echo object address through \a p. */
+static int echo_object_print(const struct lu_env *env, void *cookie,
+			    lu_printer_t p, const struct lu_object *o)
+{
+	struct echo_object *eco = cl2echo_obj(lu2cl(o));
+	int rc;
+
+	rc = p(env, cookie, "echoclient-object@%p", eco);
+	return rc;
+}
+
+/* lu_object methods of echo objects; unused hooks are spelled out as NULL. */
+static const struct lu_object_operations echo_lu_obj_ops = {
+	.loo_object_init	= echo_object_init,
+	.loo_object_delete	= NULL,
+	.loo_object_release	= NULL,
+	.loo_object_free	= echo_object_free,
+	.loo_object_print	= echo_object_print,
+	.loo_object_invariant	= NULL,
+};
+/** @} echo_lu_ops */
+
+/** \defgroup echo_lu_dev_ops  lu_device operations
+ *
+ * Operations for echo lu device.
+ *
+ * @{
+ */
+/*
+ * ldo_object_alloc: allocate an echo object from echo_object_kmem.
+ *
+ * The echo device is the top device, so \a hdr must be NULL; the object
+ * provides its own header (eo_hdr) and is added as the top layer of it.
+ *
+ * Returns the embedded lu_object, or NULL if the allocation fails.
+ */
+static struct lu_object *echo_object_alloc(const struct lu_env *env,
+					   const struct lu_object_header *hdr,
+					   struct lu_device *dev)
+{
+	struct cl_object_header *coh;
+	struct echo_object *eco;
+	struct lu_object *lobj;
+	ENTRY;
+
+	/* we're the top dev. */
+	LASSERT(hdr == NULL);
+	OBD_SLAB_ALLOC_PTR_GFP(eco, echo_object_kmem, __GFP_IO);
+	if (eco == NULL)
+		RETURN(NULL);
+
+	coh  = &eco->eo_hdr;
+	lobj = &echo_obj2cl(eco)->co_lu;
+
+	cl_object_header_init(coh);
+	lu_object_init(lobj, &coh->coh_lu, dev);
+	lu_object_add_top(&coh->coh_lu, lobj);
+
+	eco->eo_cl.co_ops = &echo_cl_obj_ops;
+	lobj->lo_ops      = &echo_lu_obj_ops;
+
+	RETURN(lobj);
+}
+
+/* lu_device methods of the echo device: only object allocation is set. */
+static struct lu_device_operations echo_device_lu_ops = {
+	.ldo_object_alloc = echo_object_alloc,
+};
+
+/** @} echo_lu_dev_ops */
+
+/* cl_device methods of the echo device: intentionally left empty. */
+static struct cl_device_operations echo_device_cl_ops = {
+};
+
+/** \defgroup echo_init Setup and teardown
+ *
+ * Init and fini functions for echo client.
+ *
+ * @{
+ */
+/*
+ * Initialize the echo device's own site (ed_site_myself) and publish it
+ * through ed->ed_site.
+ *
+ * \retval 0 on success
+ * \retval error from cl_site_init() or lu_site_init_finish(); ed->ed_site
+ *         is left untouched and the site is torn down again
+ */
+static int echo_site_init(const struct lu_env *env, struct echo_device *ed)
+{
+	struct cl_site *site = &ed->ed_site_myself;
+	int rc;
+
+	/* initialize site */
+	rc = cl_site_init(site, &ed->ed_cl);
+	if (rc) {
+		CERROR("Cannot initilize site for echo client(%d)\n", rc);
+		return rc;
+	}
+
+	rc = lu_site_init_finish(&site->cs_lu);
+	if (rc) {
+		/* ed_site is not set yet, so echo_site_fini() would skip
+		 * this site; undo cl_site_init() here to avoid leaking it */
+		cl_site_fini(site);
+		return rc;
+	}
+
+	ed->ed_site = site;
+	return 0;
+}
+
+/*
+ * Detach the echo device from its site.  The site itself is finalized only
+ * when the device is not stacked on an MD target.  No-op if no site is set.
+ */
+static void echo_site_fini(const struct lu_env *env, struct echo_device *ed)
+{
+	if (ed->ed_site == NULL)
+		return;
+
+	if (!ed->ed_next_ismd)
+		cl_site_fini(ed->ed_site);
+	ed->ed_site = NULL;
+}
+
+/*
+ * lct_init of echo_thread_key: allocate the per-thread scratch area.
+ * Returns the new echo_thread_info or ERR_PTR(-ENOMEM).
+ */
+static void *echo_thread_key_init(const struct lu_context *ctx,
+			  struct lu_context_key *key)
+{
+	struct echo_thread_info *eti;
+
+	OBD_SLAB_ALLOC_PTR_GFP(eti, echo_thread_kmem, __GFP_IO);
+	if (eti != NULL)
+		return eti;
+
+	return ERR_PTR(-ENOMEM);
+}
+
+/* lct_fini of echo_thread_key: give \a data back to echo_thread_kmem. */
+static void echo_thread_key_fini(const struct lu_context *ctx,
+			 struct lu_context_key *key, void *data)
+{
+	struct echo_thread_info *eti = data;
+
+	OBD_SLAB_FREE_PTR(eti, echo_thread_kmem);
+}
+
+/* lct_exit of echo_thread_key: nothing to reset between uses. */
+static void echo_thread_key_exit(const struct lu_context *ctx,
+			 struct lu_context_key *key, void *data)
+{
+	return;
+}
+
+/* Context key for echo_thread_info, tagged LCT_CL_THREAD. */
+static struct lu_context_key echo_thread_key = {
+	.lct_tags	= LCT_CL_THREAD,
+	.lct_init	= echo_thread_key_init,
+	.lct_fini	= echo_thread_key_fini,
+	.lct_exit	= echo_thread_key_exit,
+};
+
+/*
+ * lct_init of echo_session_key: allocate the per-session data.
+ * Returns the new echo_session_info or ERR_PTR(-ENOMEM).
+ */
+static void *echo_session_key_init(const struct lu_context *ctx,
+				  struct lu_context_key *key)
+{
+	struct echo_session_info *esi;
+
+	OBD_SLAB_ALLOC_PTR_GFP(esi, echo_session_kmem, __GFP_IO);
+	if (esi != NULL)
+		return esi;
+
+	return ERR_PTR(-ENOMEM);
+}
+
+/* lct_fini of echo_session_key: give \a data back to echo_session_kmem. */
+static void echo_session_key_fini(const struct lu_context *ctx,
+				 struct lu_context_key *key, void *data)
+{
+	struct echo_session_info *esi = data;
+
+	OBD_SLAB_FREE_PTR(esi, echo_session_kmem);
+}
+
+/* lct_exit of echo_session_key: nothing to reset between uses. */
+static void echo_session_key_exit(const struct lu_context *ctx,
+				 struct lu_context_key *key, void *data)
+{
+	return;
+}
+
+/* Context key for echo_session_info, tagged LCT_SESSION. */
+static struct lu_context_key echo_session_key = {
+	.lct_tags	= LCT_SESSION,
+	.lct_init	= echo_session_key_init,
+	.lct_fini	= echo_session_key_fini,
+	.lct_exit	= echo_session_key_exit,
+};
+
+/*
+ * NOTE(review): presumably expands to the echo_type_init/fini/start/stop
+ * helpers that echo_device_type_ops refers to, covering both context keys
+ * listed here -- confirm against the macro definition.
+ */
+LU_TYPE_INIT_FINI(echo, &echo_thread_key, &echo_session_key);
+
+/* sequence width assigned to the echo client sequence */
+#define ECHO_SEQ_WIDTH 0xffffffff
+
+/*
+ * Allocate and initialize the client sequence of the echo device
+ * (ed->ed_cl_seq) against the server sequence of \a ss.  The sequence is
+ * named "srv-<obd_name>".
+ *
+ * Returns 0 on success.  On failure ed->ed_cl_seq is freed and reset to
+ * NULL, and -ENOMEM or the seq_client_init() error is returned.
+ */
+static int echo_fid_init(struct echo_device *ed, char *obd_name,
+			 struct seq_server_site *ss)
+{
+	const int prefix_size = MAX_OBD_NAME + 5;
+	char *seq_name;
+	int rc;
+	ENTRY;
+
+	OBD_ALLOC_PTR(ed->ed_cl_seq);
+	if (ed->ed_cl_seq == NULL)
+		RETURN(-ENOMEM);
+
+	OBD_ALLOC(seq_name, prefix_size);
+	if (seq_name == NULL)
+		GOTO(out_free_seq, rc = -ENOMEM);
+
+	snprintf(seq_name, prefix_size, "srv-%s", obd_name);
+
+	/* Init client side sequence-manager */
+	rc = seq_client_init(ed->ed_cl_seq, NULL, LUSTRE_SEQ_METADATA,
+			     seq_name, ss->ss_server_seq);
+	ed->ed_cl_seq->lcs_width = ECHO_SEQ_WIDTH;
+	/* the name buffer is only needed for the init call */
+	OBD_FREE(seq_name, prefix_size);
+	if (rc != 0)
+		GOTO(out_free_seq, rc);
+
+	RETURN(0);
+
+out_free_seq:
+	OBD_FREE_PTR(ed->ed_cl_seq);
+	ed->ed_cl_seq = NULL;
+	RETURN(rc);
+}
+
+/*
+ * Finalize and free the client sequence of the echo device behind
+ * \a obddev, if one was set up.  Always returns 0.
+ */
+static int echo_fid_fini(struct obd_device *obddev)
+{
+	struct echo_device *ed = obd2echo_dev(obddev);
+	ENTRY;
+
+	if (ed->ed_cl_seq == NULL)
+		RETURN(0);
+
+	seq_client_fini(ed->ed_cl_seq);
+	OBD_FREE_PTR(ed->ed_cl_seq);
+	ed->ed_cl_seq = NULL;
+
+	RETURN(0);
+}
+
+/*
+ * ldto_device_alloc: create an echo client device.
+ *
+ * The echo obd is looked up by config string 0 and the target obd by config
+ * string 1.  A target whose obd type is LUSTRE_MDT_NAME is treated as a
+ * metadata target (ed_next_ismd); for any other target the device sets up
+ * its own site.  After echo_client_setup() the device is stacked on top of
+ * the target's lu device, if it has a usable one.
+ *
+ * The local "cleanup" counter records how far construction got; on failure
+ * the switch at "out" unwinds from that stage downwards by falling through
+ * the lower cases.
+ *
+ * Returns the embedded lu_device on success, ERR_PTR(rc) on failure.
+ */
+static struct lu_device *echo_device_alloc(const struct lu_env *env,
+					   struct lu_device_type *t,
+					   struct lustre_cfg *cfg)
+{
+	struct lu_device   *next;
+	struct echo_device *ed;
+	struct cl_device   *cd;
+	struct obd_device  *obd = NULL; /* to keep compiler happy */
+	struct obd_device  *tgt;
+	const char *tgt_type_name;
+	int rc;
+	int cleanup = 0;
+	ENTRY;
+
+	OBD_ALLOC_PTR(ed);
+	if (ed == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	/* stage 1: ed allocated */
+	cleanup = 1;
+	cd = &ed->ed_cl;
+	rc = cl_device_init(cd, t);
+	if (rc)
+		GOTO(out, rc);
+
+	cd->cd_lu_dev.ld_ops = &echo_device_lu_ops;
+	cd->cd_ops = &echo_device_cl_ops;
+
+	/* stage 2: cl_device initialized */
+	cleanup = 2;
+	obd = class_name2obd(lustre_cfg_string(cfg, 0));
+	LASSERT(obd != NULL);
+	LASSERT(env != NULL);
+
+	tgt = class_name2obd(lustre_cfg_string(cfg, 1));
+	if (tgt == NULL) {
+		CERROR("Can not find tgt device %s\n",
+			lustre_cfg_string(cfg, 1));
+		GOTO(out, rc = -ENODEV);
+	}
+
+	next = tgt->obd_lu_dev;
+	if (!strcmp(tgt->obd_type->typ_name, LUSTRE_MDT_NAME)) {
+		ed->ed_next_ismd = 1;
+	} else {
+		ed->ed_next_ismd = 0;
+		rc = echo_site_init(env, ed);
+		if (rc)
+			GOTO(out, rc);
+	}
+	/* stage 3: site set up (non-MD targets only) */
+	cleanup = 3;
+
+	rc = echo_client_setup(env, obd, cfg);
+	if (rc)
+		GOTO(out, rc);
+
+	ed->ed_ec = &obd->u.echo_client;
+	/* stage 4: echo client obd set up */
+	cleanup = 4;
+
+	if (ed->ed_next_ismd) {
+		/* Supposed to connect to some metadata layer */
+		struct lu_site *ls;
+		struct lu_device *ld;
+		int    found = 0;
+
+		if (next == NULL) {
+			CERROR("%s is not lu device type!\n",
+			       lustre_cfg_string(cfg, 1));
+			GOTO(out, rc = -EINVAL);
+		}
+
+		tgt_type_name = lustre_cfg_string(cfg, 2);
+		if (!tgt_type_name) {
+			CERROR("%s no type name for echo %s setup\n",
+				lustre_cfg_string(cfg, 1),
+				tgt->obd_type->typ_name);
+			GOTO(out, rc = -EINVAL);
+		}
+
+		ls = next->ld_site;
+
+		/* search the target's site for a device of the requested
+		 * type (config string 2) */
+		spin_lock(&ls->ls_ld_lock);
+		list_for_each_entry(ld, &ls->ls_ld_linkage, ld_linkage) {
+			if (strcmp(ld->ld_type->ldt_name, tgt_type_name) == 0) {
+				found = 1;
+				break;
+			}
+		}
+		spin_unlock(&ls->ls_ld_lock);
+
+		if (found == 0) {
+			CERROR("%s is not lu device type!\n",
+			       lustre_cfg_string(cfg, 1));
+			GOTO(out, rc = -EINVAL);
+		}
+
+		next = ld;
+		/* For MD echo client, it will use the site in MDS stack */
+		ed->ed_site_myself.cs_lu = *ls;
+		ed->ed_site = &ed->ed_site_myself;
+		ed->ed_cl.cd_lu_dev.ld_site = &ed->ed_site_myself.cs_lu;
+		rc = echo_fid_init(ed, obd->obd_name, lu_site2seq(ls));
+		if (rc) {
+			CERROR("echo fid init error %d\n", rc);
+			GOTO(out, rc);
+		}
+	} else {
+		 /* if echo client is to be stacked upon ost device, the next is
+		  * NULL since ost is not a clio device so far */
+		if (next != NULL && !lu_device_is_cl(next))
+			next = NULL;
+
+		tgt_type_name = tgt->obd_type->typ_name;
+		if (next != NULL) {
+			LASSERT(next != NULL);
+			if (next->ld_site != NULL)
+				GOTO(out, rc = -EBUSY);
+
+			/* NOTE(review): if ldto_device_init() fails below,
+			 * next->ld_site keeps pointing at this device's site,
+			 * which the error path then tears down -- confirm
+			 * whether it should be reset to NULL on failure. */
+			next->ld_site = &ed->ed_site->cs_lu;
+			rc = next->ld_type->ldt_ops->ldto_device_init(env, next,
+						     next->ld_type->ldt_name,
+						     NULL);
+			if (rc)
+				GOTO(out, rc);
+
+			/* Tricky case, I have to determine the obd type since
+			 * CLIO uses the different parameters to initialize
+			 * objects for lov & osc. */
+			if (strcmp(tgt_type_name, LUSTRE_LOV_NAME) == 0)
+				ed->ed_next_islov = 1;
+			else
+				LASSERT(strcmp(tgt_type_name,
+					       LUSTRE_OSC_NAME) == 0);
+		} else
+			LASSERT(strcmp(tgt_type_name, LUSTRE_OST_NAME) == 0);
+	}
+
+	ed->ed_next = next;
+	RETURN(&cd->cd_lu_dev);
+out:
+	/* unwind from the stage reached; every case deliberately falls
+	 * through to the ones below it */
+	switch(cleanup) {
+	case 4: {
+		int rc2;
+		rc2 = echo_client_cleanup(obd);
+		if (rc2)
+			CERROR("Cleanup obd device %s error(%d)\n",
+			       obd->obd_name, rc2);
+	}
+		/* fall through */
+
+	case 3:
+		echo_site_fini(env, ed);
+		/* fall through */
+	case 2:
+		cl_device_fini(&ed->ed_cl);
+		/* fall through */
+	case 1:
+		OBD_FREE_PTR(ed);
+		/* fall through */
+	case 0:
+	default:
+		break;
+	}
+	return(ERR_PTR(rc));
+}
+
+/*
+ * ldto_device_init: must never be called for the echo device; reaching it
+ * triggers LBUG().
+ */
+static int echo_device_init(const struct lu_env *env, struct lu_device *d,
+			  const char *name, struct lu_device *next)
+{
+	const int rc = 0;
+
+	LBUG();
+	return rc;
+}
+
+/*
+ * ldto_device_fini: finalize the devices stacked below the echo device by
+ * calling each one's ldto_device_fini in turn.  Nothing is done when the
+ * echo client sits on an MD target.  Always returns NULL.
+ */
+static struct lu_device *echo_device_fini(const struct lu_env *env,
+					  struct lu_device *d)
+{
+	struct echo_device *ed = cl2echo_dev(lu2cl_dev(d));
+	struct lu_device *lower = ed->ed_next;
+
+	while (lower != NULL && !ed->ed_next_ismd) {
+		const struct lu_device_type_operations *ops;
+
+		ops = lower->ld_type->ldt_ops;
+		lower = ops->ldto_device_fini(env, lower);
+	}
+
+	return NULL;
+}
+
+/*
+ * Drop one "ec enqueue" hold on the cl_lock behind \a ecl.  When
+ * \a still_used is zero the lock is additionally cancelled and deleted
+ * under its mutex.  An extra lock reference is held across the sequence.
+ */
+static void echo_lock_release(const struct lu_env *env,
+			      struct echo_lock *ecl,
+			      int still_used)
+{
+	struct cl_lock *lock = echo_lock2cl(ecl);
+
+	/* pin the lock while it is being released */
+	cl_lock_get(lock);
+
+	cl_unuse(env, lock);
+	cl_lock_release(env, lock, "ec enqueue", ecl->el_object);
+
+	if (still_used == 0) {
+		cl_lock_mutex_get(env, lock);
+		cl_lock_cancel(env, lock);
+		cl_lock_delete(env, lock);
+		cl_lock_mutex_put(env, lock);
+	}
+
+	cl_lock_put(env, lock);
+}
+
+/*
+ * ldto_device_free: destroy an echo client device.
+ *
+ * Cached objects are purged from the site, all remaining objects are
+ * flagged as deleted, and the function then polls once a second (purging
+ * each time) until ec_objects is empty.  Afterwards the echo client obd
+ * and its sequence are cleaned up, lower devices are freed (non-MD targets
+ * only), and the site, the cl_device and the echo_device are released.
+ * Always returns NULL.
+ */
+static struct lu_device *echo_device_free(const struct lu_env *env,
+					  struct lu_device *d)
+{
+	struct echo_device     *ed    = cl2echo_dev(lu2cl_dev(d));
+	struct echo_client_obd *ec    = ed->ed_ec;
+	struct lu_device       *lower = ed->ed_next;
+	struct echo_object     *eco;
+
+	CDEBUG(D_INFO, "echo device:%p is going to be freed, next = %p\n",
+	       ed, lower);
+
+	lu_site_purge(env, &ed->ed_site->cs_lu, -1);
+
+	/* The purge above should have dropped every cached object, but the
+	 * device may still be accessed in parallel: mark whatever is left
+	 * as deleted.
+	 */
+	spin_lock(&ec->ec_lock);
+	list_for_each_entry(eco, &ec->ec_objects, eo_obj_chain)
+		eco->eo_deleted = 1;
+	spin_unlock(&ec->ec_lock);
+
+	/* purge again */
+	lu_site_purge(env, &ed->ed_site->cs_lu, -1);
+
+	CDEBUG(D_INFO,
+	       "Waiting for the reference of echo object to be dropped\n");
+
+	/* Wait for the last reference to be dropped. */
+	for (;;) {
+		int drained;
+
+		spin_lock(&ec->ec_lock);
+		drained = list_empty(&ec->ec_objects);
+		spin_unlock(&ec->ec_lock);
+		if (drained)
+			break;
+
+		CERROR("echo_client still has objects at cleanup time, "
+		       "wait for 1 second\n");
+		schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE,
+						   cfs_time_seconds(1));
+		lu_site_purge(env, &ed->ed_site->cs_lu, -1);
+	}
+
+	LASSERT(list_empty(&ec->ec_locks));
+
+	CDEBUG(D_INFO, "No object exists, exiting...\n");
+
+	echo_client_cleanup(d->ld_obd);
+	echo_fid_fini(d->ld_obd);
+
+	/* free the devices below us, unless we sit on an MD stack */
+	while (lower != NULL && !ed->ed_next_ismd)
+		lower = lower->ld_type->ldt_ops->ldto_device_free(env, lower);
+
+	LASSERT(ed->ed_site == lu2cl_site(d->ld_site));
+	echo_site_fini(env, ed);
+	cl_device_fini(&ed->ed_cl);
+	OBD_FREE_PTR(ed);
+
+	return NULL;
+}
+
+/* Device type methods of the echo client. */
+static const struct lu_device_type_operations echo_device_type_ops = {
+	/* type level hooks */
+	.ldto_init		= echo_type_init,
+	.ldto_fini		= echo_type_fini,
+	.ldto_start		= echo_type_start,
+	.ldto_stop		= echo_type_stop,
+
+	/* device level hooks */
+	.ldto_device_alloc	= echo_device_alloc,
+	.ldto_device_free	= echo_device_free,
+	.ldto_device_init	= echo_device_init,
+	.ldto_device_fini	= echo_device_fini,
+};
+
+/* The echo client device type: a CL device named LUSTRE_ECHO_CLIENT_NAME. */
+static struct lu_device_type echo_device_type = {
+	.ldt_tags	= LU_DEVICE_CL,
+	.ldt_name	= LUSTRE_ECHO_CLIENT_NAME,
+	.ldt_ops	= &echo_device_type_ops,
+	.ldt_ctx_tags	= LCT_CL_THREAD | LCT_MD_THREAD | LCT_DT_THREAD,
+};
+/** @} echo_init */
+
+/** \defgroup echo_exports Exported operations
+ *
+ * exporting functions to echo client
+ *
+ * @{
+ */
+
+/* Interfaces to echo client obd device */
+/*
+ * Look up (or create) the echo object described by the stripe md *lsmp.
+ *
+ * \param d     echo device
+ * \param lsmp  stripe md; must be non-NULL, carry a non-zero object id and
+ *              the FID_SEQ_ECHO sequence.  echo_object_init() takes
+ *              ownership of *lsmp and clears it when a new object is set up
+ *              for a non-MD target.
+ *
+ * \retval the echo object with a reference held on success
+ * \retval ERR_PTR(-ENODEV) if the obd is stopping
+ * \retval ERR_PTR(-EAGAIN) if the object found is marked deleted
+ * \retval another ERR_PTR from cl_env_get(), ostid_to_fid() or
+ *         cl_object_find()
+ */
+static struct echo_object *cl_echo_object_find(struct echo_device *d,
+					       struct lov_stripe_md **lsmp)
+{
+	struct echo_thread_info *info;
+	struct echo_object_conf *conf;
+	struct lov_stripe_md    *lsm;
+	struct echo_object      *eco;
+	struct cl_object        *clobj;
+	struct lu_fid           *fid;
+	struct lu_env           *env;
+	int refcheck;
+	int rc;
+	ENTRY;
+
+	LASSERT(lsmp);
+	lsm = *lsmp;
+	LASSERT(lsm);
+	LASSERT(ostid_id(&lsm->lsm_oi) != 0);
+	LASSERT(ostid_seq(&lsm->lsm_oi) == FID_SEQ_ECHO);
+
+	/* Never return an object if the obd is to be freed. */
+	if (echo_dev2cl(d)->cd_lu_dev.ld_obd->obd_stopping)
+		RETURN(ERR_PTR(-ENODEV));
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN((void *)env);
+
+	info = echo_env_info(env);
+	conf = &info->eti_conf;
+	if (d->ed_next != NULL) {
+		if (d->ed_next_islov) {
+			/* lov expects a lustre_md wrapping the lsm */
+			struct lustre_md *md = &info->eti_md;
+
+			memset(md, 0, sizeof *md);
+			md->lsm = lsm;
+			conf->eoc_cl.u.coc_md = md;
+		} else {
+			/* otherwise pass the single lov_oinfo directly */
+			struct lov_oinfo *oinfo = lsm->lsm_oinfo[0];
+
+			LASSERT(oinfo != NULL);
+			oinfo->loi_oi = lsm->lsm_oi;
+			conf->eoc_cl.u.coc_oinfo = oinfo;
+		}
+	}
+	conf->eoc_md = lsmp;
+
+	fid = &info->eti_fid;
+	rc = ostid_to_fid(fid, &lsm->lsm_oi, 0);
+	if (rc != 0)
+		GOTO(out, eco = ERR_PTR(rc));
+
+	/* In the function below, .hs_keycmp resolves to
+	 * lu_obj_hop_keycmp() */
+	/* coverity[overrun-buffer-val] */
+	clobj = cl_object_find(env, echo_dev2cl(d), fid, &conf->eoc_cl);
+	if (IS_ERR(clobj))
+		GOTO(out, eco = (void*)clobj);
+
+	eco = cl2echo_obj(clobj);
+	if (eco->eo_deleted) {
+		/* the device is being torn down: refuse the object */
+		cl_object_put(env, clobj);
+		eco = ERR_PTR(-EAGAIN);
+	}
+
+out:
+	cl_env_put(env, &refcheck);
+	RETURN(eco);
+}
+
+/*
+ * Drop a reference on an echo object.  If the object is marked deleted,
+ * LU_OBJECT_HEARD_BANSHEE is set on its header before the reference is
+ * dropped.
+ *
+ * Returns 0, or the error from cl_env_get() if no environment is available
+ * (the reference is not dropped in that case).
+ */
+static int cl_echo_object_put(struct echo_object *eco)
+{
+	struct cl_object *clobj = echo_obj2cl(eco);
+	struct lu_env *env;
+	int refcheck;
+	ENTRY;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	/* an external function to kill an object? */
+	if (eco->eo_deleted) {
+		struct lu_object_header *hdr = clobj->co_lu.lo_header;
+
+		LASSERT(&eco->eo_hdr == luh2coh(hdr));
+		set_bit(LU_OBJECT_HEARD_BANSHEE, &hdr->loh_flags);
+	}
+
+	cl_object_put(env, clobj);
+	cl_env_put(env, &refcheck);
+	RETURN(0);
+}
+
+/*
+ * Enqueue a lock on [start, end] of \a eco and wait for it.
+ *
+ * The caller provides \a env and must already have initialized the
+ * per-thread io (eti_io) for the object.  On success the echo lock slice is
+ * linked into ec_locks (receiving a fresh cookie if it was not linked yet),
+ * its refcount is bumped and the cookie is stored in \a *cookie for a later
+ * cl_echo_cancel().
+ *
+ * \param mode      LCK_PW requests a write lock, anything else a read lock
+ * \param enqflags  enqueue flags stored in the lock description
+ *
+ * \retval 0 on success
+ * \retval -ENOMEM if cl_lock_request() returns NULL
+ * \retval error encoded in the cl_lock_request() result, or returned by
+ *         cl_wait()
+ */
+static int cl_echo_enqueue0(struct lu_env *env, struct echo_object *eco,
+			    obd_off start, obd_off end, int mode,
+			    __u64 *cookie , __u32 enqflags)
+{
+	struct cl_io *io;
+	struct cl_lock *lck;
+	struct cl_object *obj;
+	struct cl_lock_descr *descr;
+	struct echo_thread_info *info;
+	struct echo_client_obd *ec;
+	struct echo_lock *el;
+	int rc;
+	ENTRY;
+
+	info = echo_env_info(env);
+	io = &info->eti_io;
+	descr = &info->eti_descr;
+	obj = echo_obj2cl(eco);
+
+	descr->cld_obj   = obj;
+	descr->cld_start = cl_index(obj, start);
+	descr->cld_end   = cl_index(obj, end);
+	descr->cld_mode  = mode == LCK_PW ? CLM_WRITE : CLM_READ;
+	descr->cld_enq_flags = enqflags;
+	io->ci_obj = obj;
+
+	lck = cl_lock_request(env, io, descr, "ec enqueue", eco);
+	/* a failed request may come back as an ERR_PTR rather than NULL;
+	 * such a value must not be passed on to cl_wait() */
+	if (IS_ERR(lck))
+		RETURN(PTR_ERR(lck));
+	if (lck == NULL)
+		RETURN(-ENOMEM);
+
+	rc = cl_wait(env, lck);
+	if (rc != 0) {
+		/* release with the same scope/source pair the hold was
+		 * taken with in cl_lock_request() above */
+		cl_lock_release(env, lck, "ec enqueue", eco);
+		RETURN(rc);
+	}
+
+	ec = eco->eo_dev->ed_ec;
+	el = cl2echo_lock(cl_lock_at(lck, &echo_device_type));
+
+	spin_lock(&ec->ec_lock);
+	if (list_empty(&el->el_chain)) {
+		/* first user of this lock: publish it under a new cookie */
+		list_add(&el->el_chain, &ec->ec_locks);
+		el->el_cookie = ++ec->ec_unique;
+	}
+	atomic_inc(&el->el_refcount);
+	*cookie = el->el_cookie;
+	spin_unlock(&ec->ec_lock);
+
+	RETURN(0);
+}
+
+/*
+ * Enqueue a lock on [start, end] of \a eco using a private environment and
+ * a CIT_MISC io that ignores layout.  The lock cookie is returned through
+ * \a cookie.
+ *
+ * Returns 0 on success, otherwise the error from cl_env_get(),
+ * cl_io_init() or cl_echo_enqueue0().
+ */
+static int cl_echo_enqueue(struct echo_object *eco, obd_off start, obd_off end,
+			   int mode, __u64 *cookie)
+{
+	struct lu_env *env;
+	struct cl_io *io;
+	int refcheck;
+	int rc;
+	ENTRY;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	io = &echo_env_info(env)->eti_io;
+	io->ci_ignore_layout = 1;
+
+	rc = cl_io_init(env, io, CIT_MISC, echo_obj2cl(eco));
+	if (rc < 0)
+		GOTO(out, rc);
+	LASSERT(rc == 0);
+
+	rc = cl_echo_enqueue0(env, eco, start, end, mode, cookie, 0);
+	cl_io_fini(env, io);
+
+	EXIT;
+out:
+	cl_env_put(env, &refcheck);
+	return rc;
+}
+
+/*
+ * Drop one enqueue reference on the echo lock identified by \a cookie.
+ *
+ * The lock is searched for in ec_locks under ec_lock.  When its refcount
+ * reaches zero it is unlinked from the list; the cl_lock is then released
+ * through echo_lock_release(), which also cancels it if nobody else uses
+ * it any more.
+ *
+ * Returns 0 on success, -ENOENT if no lock carries \a cookie.
+ */
+static int cl_echo_cancel0(struct lu_env *env, struct echo_device *ed,
+			   __u64 cookie)
+{
+	struct echo_client_obd *ec = ed->ed_ec;
+	struct echo_lock *ecl = NULL;
+	int still_used = 0;
+	int found = 0;
+	ENTRY;
+
+	LASSERT(ec != NULL);
+
+	spin_lock(&ec->ec_lock);
+	list_for_each_entry(ecl, &ec->ec_locks, el_chain) {
+		CDEBUG(D_INFO, "ecl: %p, cookie: "LPX64"\n", ecl, ecl->el_cookie);
+		found = (ecl->el_cookie == cookie);
+		if (!found)
+			continue;
+
+		if (atomic_dec_and_test(&ecl->el_refcount))
+			list_del_init(&ecl->el_chain);
+		else
+			still_used = 1;
+		break;
+	}
+	spin_unlock(&ec->ec_lock);
+
+	/* ecl is only meaningful when the loop stopped on a match */
+	if (!found)
+		RETURN(-ENOENT);
+
+	echo_lock_release(env, ecl, still_used);
+	RETURN(0);
+}
+
+/**
+ * Cancel the echo lock identified by \a cookie on device \a ed.
+ *
+ * Wrapper around cl_echo_cancel0() that supplies its own environment.
+ *
+ * \retval result of cl_echo_cancel0(), or a negative errno if no
+ *	   environment could be obtained
+ */
+static int cl_echo_cancel(struct echo_device *ed, __u64 cookie)
+{
+	struct lu_env *env;
+	int refcheck;
+	int result;
+	ENTRY;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	result = cl_echo_cancel0(env, ed, cookie);
+	cl_env_put(env, &refcheck);
+
+	RETURN(result);
+}
+
+/**
+ * Add every page on the input list of \a queue to the cache as a write.
+ *
+ * All pages are attempted even after a failure; the request type argument
+ * is ignored and CRT_WRITE is always used.
+ *
+ * \retval 0 if every page was added, otherwise the first non-zero value
+ *	   returned by cl_page_cache_add()
+ */
+static int cl_echo_async_brw(const struct lu_env *env, struct cl_io *io,
+			     enum cl_req_type unused, struct cl_2queue *queue)
+{
+	struct cl_page *page;
+	struct cl_page *next;
+	int first_err = 0;
+	ENTRY;
+
+	cl_page_list_for_each_safe(page, next, &queue->c2_qin) {
+		int rc = cl_page_cache_add(env, io, page, CRT_WRITE);
+
+		/* remember only the first failure, but keep going */
+		if (rc != 0 && first_err == 0)
+			first_err = rc;
+	}
+	RETURN(first_err);
+}
+
+/**
+ * Read or write \a npages pages of the echo object \a eco, starting at the
+ * page-aligned byte \a offset.
+ *
+ * A lock covering the range is enqueued (LCK_PR for READ, LCK_PW otherwise),
+ * every entry of \a pages is wrapped in a transient cl_page that is owned
+ * and queued, and the queue is then submitted either through
+ * cl_echo_async_brw() (only for writes with \a async set) or synchronously
+ * through cl_io_submit_sync(). The lock is cancelled and the queue and io
+ * are torn down before returning.
+ *
+ * \retval 0 on success, negative errno on failure
+ */
+static int cl_echo_object_brw(struct echo_object *eco, int rw, obd_off offset,
+			      struct page **pages, int npages, int async)
+{
+	struct lu_env	   *env;
+	struct echo_thread_info *info;
+	struct cl_object	*obj = echo_obj2cl(eco);
+	struct echo_device      *ed  = eco->eo_dev;
+	struct cl_2queue	*queue;
+	struct cl_io	    *io;
+	struct cl_page	  *clp;
+	struct lustre_handle    lh = { 0 };
+	int page_size = cl_page_size(obj);
+	int refcheck;
+	int rc;
+	int i;
+	ENTRY;
+
+	LASSERT((offset & ~CFS_PAGE_MASK) == 0);
+	LASSERT(ed->ed_next != NULL);
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	/* io and queue are scratch members of the echo_thread_info of env */
+	info    = echo_env_info(env);
+	io      = &info->eti_io;
+	queue   = &info->eti_queue;
+
+	cl_2queue_init(queue);
+
+	io->ci_ignore_layout = 1;
+	rc = cl_io_init(env, io, CIT_MISC, obj);
+	if (rc < 0)
+		GOTO(out, rc);
+	LASSERT(rc == 0);
+
+
+	/* lock the whole byte range; the cookie is kept in lh for the cancel */
+	rc = cl_echo_enqueue0(env, eco, offset,
+			      offset + npages * PAGE_CACHE_SIZE - 1,
+			      rw == READ ? LCK_PR : LCK_PW, &lh.cookie,
+			      CEF_NEVER);
+	if (rc < 0)
+		GOTO(error_lock, rc);
+
+	/* wrap each caller page in a transient cl_page, own it and queue it */
+	for (i = 0; i < npages; i++) {
+		LASSERT(pages[i]);
+		clp = cl_page_find(env, obj, cl_index(obj, offset),
+				   pages[i], CPT_TRANSIENT);
+		if (IS_ERR(clp)) {
+			rc = PTR_ERR(clp);
+			break;
+		}
+		LASSERT(clp->cp_type == CPT_TRANSIENT);
+
+		rc = cl_page_own(env, io, clp);
+		if (rc) {
+			LASSERT(clp->cp_state == CPS_FREEING);
+			cl_page_put(env, clp);
+			break;
+		}
+
+		cl_2queue_add(queue, clp);
+
+		/* drop the reference count for cl_page_find, so that the page
+		 * will be freed in cl_2queue_fini. */
+		cl_page_put(env, clp);
+		cl_page_clip(env, clp, 0, page_size);
+
+		offset += page_size;
+	}
+
+	if (rc == 0) {
+		enum cl_req_type typ = rw == READ ? CRT_READ : CRT_WRITE;
+
+		/* the async path is only taken for writes */
+		async = async && (typ == CRT_WRITE);
+		if (async)
+			rc = cl_echo_async_brw(env, io, typ, queue);
+		else
+			rc = cl_io_submit_sync(env, io, typ, queue, 0);
+		/* NOTE(review): the message says "write" for reads as well */
+		CDEBUG(D_INFO, "echo_client %s write returns %d\n",
+		       async ? "async" : "sync", rc);
+	}
+
+	/* the lock is cancelled whether or not the transfer succeeded */
+	cl_echo_cancel0(env, ed, lh.cookie);
+	EXIT;
+error_lock:
+	cl_2queue_discard(env, io, queue);
+	cl_2queue_disown(env, io, queue);
+	cl_2queue_fini(env, queue);
+	cl_io_fini(env, io);
+out:
+	cl_env_put(env, &refcheck);
+	return rc;
+}
+/** @} echo_exports */
+
+
+/* Counter pre-incremented by echo_create_object() to pick an object id when
+ * the stripe descriptor does not carry one. */
+static obd_id last_object_id;
+
+/**
+ * Copy the stripe descriptor \a lsm to the user buffer \a _ulsm, which is
+ * \a ulsm_nob bytes long.
+ *
+ * The header is copied first, followed by one lov_oinfo per stripe into the
+ * locations named by the lsm_oinfo[] entries of the user descriptor.
+ *
+ * \retval 0		success
+ * \retval -EINVAL	the user buffer is too small for lsm_stripe_count stripes
+ * \retval -EFAULT	a copy to user space failed
+ */
+static int
+echo_copyout_lsm (struct lov_stripe_md *lsm, void *_ulsm, int ulsm_nob)
+{
+	struct lov_stripe_md *ulsm = _ulsm;
+	int nob, i;
+
+	nob = offsetof (struct lov_stripe_md, lsm_oinfo[lsm->lsm_stripe_count]);
+	if (nob > ulsm_nob)
+		return (-EINVAL);
+
+	/* copy the whole descriptor header; sizeof(ulsm) would only be the
+	 * size of a pointer */
+	if (copy_to_user (ulsm, lsm, sizeof(*ulsm)))
+		return (-EFAULT);
+
+	/* NOTE(review): ulsm->lsm_oinfo[i] is read directly through the user
+	 * pointer here - confirm whether it should be fetched with
+	 * get_user()/copy_from_user() instead. */
+	for (i = 0; i < lsm->lsm_stripe_count; i++) {
+		if (copy_to_user (ulsm->lsm_oinfo[i], lsm->lsm_oinfo[i],
+				      sizeof(lsm->lsm_oinfo[0])))
+			return (-EFAULT);
+	}
+	return 0;
+}
+
+/**
+ * Fill the stripe descriptor \a lsm from the user buffer \a ulsm of
+ * \a ulsm_nob bytes.
+ *
+ * After the header is copied in it is rejected unless the stripe count does
+ * not exceed ec_nstripes, the magic is LOV_MAGIC, the stripe size is page
+ * aligned and stripe size times stripe count fits in an unsigned long. The
+ * per-stripe entries are then copied in one by one.
+ *
+ * \retval 0		success
+ * \retval -EINVAL	buffer too small or header validation failed
+ * \retval -EFAULT	a copy from user space failed
+ */
+static int
+echo_copyin_lsm (struct echo_device *ed, struct lov_stripe_md *lsm,
+		 void *ulsm, int ulsm_nob)
+{
+	struct echo_client_obd *ec = ed->ed_ec;
+	struct lov_stripe_md *user_lsm = ulsm;
+	int stripe;
+
+	if (ulsm_nob < sizeof (*lsm))
+		return (-EINVAL);
+
+	if (copy_from_user (lsm, ulsm, sizeof (*lsm)))
+		return (-EFAULT);
+
+	/* sanity checks on the header supplied by user space */
+	if (lsm->lsm_stripe_count > ec->ec_nstripes)
+		return (-EINVAL);
+	if (lsm->lsm_magic != LOV_MAGIC)
+		return (-EINVAL);
+	if ((lsm->lsm_stripe_size & (~CFS_PAGE_MASK)) != 0)
+		return (-EINVAL);
+	if ((__u64)lsm->lsm_stripe_size * lsm->lsm_stripe_count > ~0UL)
+		return (-EINVAL);
+
+	for (stripe = 0; stripe < lsm->lsm_stripe_count; stripe++) {
+		if (copy_from_user(lsm->lsm_oinfo[stripe],
+				       user_lsm->lsm_oinfo[stripe],
+				       sizeof(lsm->lsm_oinfo[0])))
+			return (-EFAULT);
+	}
+	return (0);
+}
+
+/**
+ * Format \a id as a decimal string into \a name and make \a lname refer to
+ * it. \a name must be large enough for the formatted id.
+ */
+static inline void echo_md_build_name(struct lu_name *lname, char *name,
+				      __u64 id)
+{
+	/* sprintf() returns the number of characters written, which is the
+	 * length of the resulting string */
+	lname->ln_namelen = sprintf(name, LPU64, id);
+	lname->ln_name = name;
+}
+
+/* similar to mdt_attr_get_complex */
+/**
+ * Fetch the LOV xattr of \a o into the eti_big_lmm buffer of the current
+ * echo_thread_info, growing that buffer when it is too small.
+ *
+ * On success \a ma is pointed at the buffer, ma_lmm_size is set to the
+ * xattr length and MA_LOV is added to ma_valid.
+ *
+ * \retval 0		success
+ * \retval -ENOMEM	the buffer could not be (re)allocated
+ * \retval negative	error returned by mo_xattr_get()
+ */
+static int echo_big_lmm_get(const struct lu_env *env, struct md_object *o,
+			    struct md_attr *ma)
+{
+	struct echo_thread_info *info = echo_env_info(env);
+	int len;
+
+	ENTRY;
+
+	LASSERT(ma->ma_lmm_size > 0);
+
+	/* first call with LU_BUF_NULL; its result is used as the size that
+	 * the buffer has to hold */
+	len = mo_xattr_get(env, o, &LU_BUF_NULL, XATTR_NAME_LOV);
+	if (len < 0)
+		RETURN(len);
+
+	if (len > info->eti_big_lmmsize) {
+		int new_size = size_roundup_power2(len);
+
+		/* release the old, too small buffer before reallocating */
+		if (info->eti_big_lmmsize > 0) {
+			LASSERT(info->eti_big_lmm);
+			OBD_FREE_LARGE(info->eti_big_lmm,
+				       info->eti_big_lmmsize);
+			info->eti_big_lmm = NULL;
+			info->eti_big_lmmsize = 0;
+		}
+
+		OBD_ALLOC_LARGE(info->eti_big_lmm, new_size);
+		if (info->eti_big_lmm == NULL)
+			RETURN(-ENOMEM);
+		info->eti_big_lmmsize = new_size;
+	}
+	LASSERT(info->eti_big_lmmsize >= len);
+
+	info->eti_buf.lb_buf = info->eti_big_lmm;
+	info->eti_buf.lb_len = info->eti_big_lmmsize;
+	len = mo_xattr_get(env, o, &info->eti_buf, XATTR_NAME_LOV);
+	if (len < 0)
+		RETURN(len);
+
+	ma->ma_lmm = info->eti_big_lmm;
+	ma->ma_lmm_size = len;
+	ma->ma_valid |= MA_LOV;
+
+	RETURN(0);
+}
+
+/**
+ * Collect the attributes requested in ma->ma_need for object \a next.
+ *
+ * Handles MA_INODE through mo_attr_get(), MA_LOV (regular files and
+ * directories only) through the LOV xattr, falling back to
+ * echo_big_lmm_get() on -ERANGE, and, with CONFIG_FS_POSIX_ACL, MA_ACL_DEF
+ * (directories only) through the default ACL xattr. ma_valid records what
+ * was obtained; ma_need is restored before returning.
+ *
+ * \retval 0 on success, otherwise the error of the failing getter
+ */
+int echo_attr_get_complex(const struct lu_env *env, struct md_object *next,
+			  struct md_attr *ma)
+{
+	struct echo_thread_info *info = echo_env_info(env);
+	struct lu_buf *xbuf = &info->eti_buf;
+	umode_t mode = lu_object_attr(&next->mo_lu);
+	int wanted = ma->ma_need;
+	int rc = 0;
+	int xrc;
+
+	ENTRY;
+
+	ma->ma_valid = 0;
+
+	if (wanted & MA_INODE) {
+		ma->ma_need = MA_INODE;
+		rc = mo_attr_get(env, next, ma);
+		if (rc)
+			GOTO(out, rc);
+		ma->ma_valid |= MA_INODE;
+	}
+
+	if ((wanted & MA_LOV) && (S_ISREG(mode) || S_ISDIR(mode))) {
+		LASSERT(ma->ma_lmm_size > 0);
+		xbuf->lb_buf = ma->ma_lmm;
+		xbuf->lb_len = ma->ma_lmm_size;
+		xrc = mo_xattr_get(env, next, xbuf, XATTR_NAME_LOV);
+		if (xrc > 0) {
+			ma->ma_lmm_size = xrc;
+			ma->ma_valid |= MA_LOV;
+		} else if (xrc == -ENODATA) {
+			/* no LOV EA */
+			ma->ma_lmm_size = 0;
+		} else if (xrc == -ERANGE) {
+			/* caller's buffer too small: use the big buffer */
+			xrc = echo_big_lmm_get(env, next, ma);
+			if (xrc < 0)
+				GOTO(out, rc = xrc);
+		} else {
+			GOTO(out, rc = xrc);
+		}
+	}
+
+#ifdef CONFIG_FS_POSIX_ACL
+	if ((wanted & MA_ACL_DEF) && S_ISDIR(mode)) {
+		xbuf->lb_buf = ma->ma_acl;
+		xbuf->lb_len = ma->ma_acl_size;
+		xrc = mo_xattr_get(env, next, xbuf, XATTR_NAME_ACL_DEFAULT);
+		if (xrc > 0) {
+			ma->ma_acl_size = xrc;
+			ma->ma_valid |= MA_ACL_DEF;
+		} else if (xrc == -ENODATA) {
+			/* no ACLs */
+			ma->ma_acl_size = 0;
+		} else {
+			GOTO(out, rc = xrc);
+		}
+	}
+#endif
+out:
+	/* hand the original request mask back to the caller */
+	ma->ma_need = wanted;
+	CDEBUG(D_INODE, "after getattr rc = %d, ma_valid = "LPX64" ma_lmm=%p\n",
+	       rc, ma->ma_valid, ma->ma_lmm);
+	RETURN(rc);
+}
+
+/**
+ * Create one metadata object named \a lname with FID \a fid under \a parent.
+ *
+ * The name is looked up first so that an existing entry is reported as
+ * -EEXIST; the create itself then runs with sp_cr_lookup cleared.
+ *
+ * \retval 0		the object was created
+ * \retval -EEXIST	the name already exists in \a parent
+ * \retval -EINVAL	no slice of the next device's type in the new object
+ * \retval negative	other error from lookup, object lookup or create
+ */
+static int
+echo_md_create_internal(const struct lu_env *env, struct echo_device *ed,
+			struct md_object *parent, struct lu_fid *fid,
+			struct lu_name *lname, struct md_op_spec *spec,
+			struct md_attr *ma)
+{
+	struct echo_thread_info *info = echo_env_info(env);
+	struct lu_object_conf conf = { .loc_flags = LOC_F_NEW };
+	struct lu_device *ld = ed->ed_next;
+	struct lu_fid *found_fid = &info->eti_fid2;
+	struct lu_object *ec_obj;
+	struct lu_object *md_obj;
+	int rc;
+
+	ENTRY;
+
+	/* only -ENOENT from the lookup lets the create proceed */
+	rc = mdo_lookup(env, parent, lname, found_fid, spec);
+	if (rc == 0)
+		return -EEXIST;
+	if (rc != -ENOENT)
+		return rc;
+
+	ec_obj = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev,
+				   fid, &conf);
+	if (IS_ERR(ec_obj)) {
+		CERROR("Can not find the child "DFID": rc = %ld\n", PFID(fid),
+			PTR_ERR(ec_obj));
+		RETURN(PTR_ERR(ec_obj));
+	}
+
+	md_obj = lu_object_locate(ec_obj->lo_header, ld->ld_type);
+	if (md_obj == NULL) {
+		CERROR("Can not locate the child "DFID"\n", PFID(fid));
+		GOTO(out_put, rc = -EINVAL);
+	}
+
+	CDEBUG(D_RPCTRACE, "Start creating object "DFID" %s %p\n",
+	       PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent);
+
+	/*
+	 * Do not perform lookup sanity check. We know that name does not exist.
+	 */
+	spec->sp_cr_lookup = 0;
+	rc = mdo_create(env, parent, lname, lu2md(md_obj), spec, ma);
+	if (rc) {
+		CERROR("Can not create child "DFID": rc = %d\n", PFID(fid), rc);
+		GOTO(out_put, rc);
+	}
+	CDEBUG(D_RPCTRACE, "End creating object "DFID" %s %p rc  = %d\n",
+	       PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent, rc);
+	EXIT;
+out_put:
+	lu_object_put(env, ec_obj);
+	return rc;
+}
+
+/**
+ * Point ma->ma_lmm / ma->ma_lmm_size at the LOV buffer to use for \a ld.
+ *
+ * When the device type is named LUSTRE_MDD_NAME the eti_big_lmm buffer
+ * (which must already be allocated) is used; any other device type gets the
+ * fixed eti_lmm member of the echo_thread_info.
+ *
+ * \retval 0 always
+ */
+static int echo_set_lmm_size(const struct lu_env *env, struct lu_device *ld,
+			     struct md_attr *ma)
+{
+	struct echo_thread_info *info = echo_env_info(env);
+
+	if (strcmp(ld->ld_type->ldt_name, LUSTRE_MDD_NAME) == 0) {
+		LASSERT(info->eti_big_lmmsize);
+		ma->ma_lmm = info->eti_big_lmm;
+		ma->ma_lmm_size = info->eti_big_lmmsize;
+	} else {
+		ma->ma_lmm = (void *)&info->eti_lmm;
+		ma->ma_lmm_size = sizeof(info->eti_lmm);
+	}
+
+	return 0;
+}
+
+/**
+ * Create metadata objects under \a ec_parent.
+ *
+ * With \a name set, a single object of that name is created with \a fid.
+ * Otherwise \a count objects are created, named after consecutive ids
+ * starting at \a id; fid->f_oid is advanced after each successful create.
+ * A non-zero \a stripe_count requests FMODE_WRITE, and any value other than
+ * -1 additionally passes a lov_user_md_v3 with \a stripe_count and
+ * \a stripe_offset as EA data.
+ *
+ * \retval 0		success
+ * \retval -1		\a ec_parent is NULL
+ * \retval -ENXIO	no slice of the next device's type in the parent
+ * \retval negative	error from echo_md_create_internal()
+ */
+static int echo_create_md_object(const struct lu_env *env,
+				 struct echo_device *ed,
+				 struct lu_object *ec_parent,
+				 struct lu_fid *fid,
+				 char *name, int namelen,
+				 __u64 id, __u32 mode, int count,
+				 int stripe_count, int stripe_offset)
+{
+	struct echo_thread_info *info = echo_env_info(env);
+	struct lu_device *ld = ed->ed_next;
+	struct md_op_spec *spec = &info->eti_spec;
+	struct lu_name *lname = &info->eti_lname;
+	struct md_attr *ma = &info->eti_ma;
+	struct lu_object *parent;
+	int created;
+	int rc = 0;
+
+	ENTRY;
+
+	if (ec_parent == NULL)
+		return -1;
+	parent = lu_object_locate(ec_parent->lo_header, ld->ld_type);
+	if (parent == NULL)
+		RETURN(-ENXIO);
+
+	memset(ma, 0, sizeof(*ma));
+	memset(spec, 0, sizeof(*spec));
+
+	if (stripe_count != 0) {
+		spec->sp_cr_flags |= FMODE_WRITE;
+		echo_set_lmm_size(env, ld, ma);
+	}
+	if (stripe_count != 0 && stripe_count != -1) {
+		struct lov_user_md_v3 *lum = &info->eti_lum;
+
+		lum->lmm_magic = LOV_USER_MAGIC_V3;
+		lum->lmm_stripe_count = stripe_count;
+		lum->lmm_stripe_offset = stripe_offset;
+		lum->lmm_pattern = 0;
+		spec->u.sp_ea.eadata = lum;
+		spec->u.sp_ea.eadatalen = sizeof(*lum);
+		spec->sp_cr_flags |= MDS_OPEN_HAS_EA;
+	}
+
+	ma->ma_attr.la_valid = LA_CTIME | LA_MODE;
+	ma->ma_attr.la_mode = mode;
+	ma->ma_attr.la_ctime = cfs_time_current_64();
+
+	if (name != NULL) {
+		/* If name is specified, only create one object by name */
+		lname->ln_name = name;
+		lname->ln_namelen = namelen;
+		rc = echo_md_create_internal(env, ed, lu2md(parent), fid, lname,
+					     spec, ma);
+		RETURN(rc);
+	}
+
+	/* Create multiple object sequenced by id; id and f_oid only advance
+	 * past objects that were created successfully */
+	for (created = 0; created < count; created++, id++, fid->f_oid++) {
+		echo_md_build_name(lname, info->eti_name, id);
+
+		rc = echo_md_create_internal(env, ed, lu2md(parent), fid, lname,
+					     spec, ma);
+		if (rc) {
+			CERROR("Can not create child %s: rc = %d\n",
+				info->eti_name, rc);
+			break;
+		}
+	}
+
+	RETURN(rc);
+}
+
+/**
+ * Look up \a lname in \a parent and return the echo-level object for the
+ * FID that was found.
+ *
+ * The FID is stored in the eti_fid scratch member of the current
+ * echo_thread_info. The returned object comes from lu_object_find_at() and
+ * callers in this file release it with lu_object_put().
+ *
+ * \retval object pointer on success
+ * \retval ERR_PTR(rc) when mdo_lookup() fails, or the error pointer
+ *	   returned by lu_object_find_at()
+ */
+static struct lu_object *echo_md_lookup(const struct lu_env *env,
+					struct echo_device *ed,
+					struct md_object *parent,
+					struct lu_name *lname)
+{
+	struct echo_thread_info *info = echo_env_info(env);
+	struct lu_fid	   *fid = &info->eti_fid;
+	struct lu_object	*child;
+	int    rc;
+	ENTRY;
+
+	/* report the FID of the parent itself; eti_fid is only the output
+	 * buffer of the lookup and may hold an unrelated value here */
+	CDEBUG(D_INFO, "lookup %s in parent "DFID" %p\n", lname->ln_name,
+	       PFID(lu_object_fid(&parent->mo_lu)), parent);
+	rc = mdo_lookup(env, parent, lname, fid, NULL);
+	if (rc) {
+		CERROR("lookup %s: rc = %d\n", lname->ln_name, rc);
+		RETURN(ERR_PTR(rc));
+	}
+
+	/* In the function below, .hs_keycmp resolves to
+	 * lu_obj_hop_keycmp() */
+	/* coverity[overrun-buffer-val] */
+	child = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, fid, NULL);
+
+	RETURN(child);
+}
+
+/**
+ * Set the "<XATTR_USER_PREFIX>.test1" xattr on \a count children of
+ * \a ec_parent, named after consecutive ids starting at \a id.
+ *
+ * The xattr value is the eti_xattr_buf scratch buffer. Processing stops at
+ * the first child that fails.
+ *
+ * \retval -1		\a ec_parent is NULL
+ * \retval -ENXIO	no slice of the next device's type in the parent
+ * \retval -EINVAL	no slice of the next device's type in a child
+ * \retval otherwise	error of the failing lookup, or the last result of
+ *			mo_xattr_set()
+ */
+static int echo_setattr_object(const struct lu_env *env,
+			       struct echo_device *ed,
+			       struct lu_object *ec_parent,
+			       __u64 id, int count)
+{
+	struct echo_thread_info *info = echo_env_info(env);
+	struct lu_device *ld = ed->ed_next;
+	struct lu_name *lname = &info->eti_lname;
+	struct lu_buf *value = &info->eti_buf;
+	char *name = info->eti_name;
+	struct lu_object *parent;
+	int rc = 0;
+	int n;
+
+	ENTRY;
+
+	if (ec_parent == NULL)
+		return -1;
+	parent = lu_object_locate(ec_parent->lo_header, ld->ld_type);
+	if (parent == NULL)
+		RETURN(-ENXIO);
+
+	for (n = 0; n < count; n++, id++) {
+		struct lu_object *ec_child;
+		struct lu_object *child;
+
+		echo_md_build_name(lname, name, id);
+
+		ec_child = echo_md_lookup(env, ed, lu2md(parent), lname);
+		if (IS_ERR(ec_child)) {
+			CERROR("Can't find child %s: rc = %ld\n",
+				lname->ln_name, PTR_ERR(ec_child));
+			RETURN(PTR_ERR(ec_child));
+		}
+
+		child = lu_object_locate(ec_child->lo_header, ld->ld_type);
+		if (child == NULL) {
+			CERROR("Can not locate the child %s\n", lname->ln_name);
+			rc = -EINVAL;
+		} else {
+			CDEBUG(D_RPCTRACE, "Start setattr object "DFID"\n",
+			       PFID(lu_object_fid(child)));
+
+			value->lb_buf = info->eti_xattr_buf;
+			value->lb_len = sizeof(info->eti_xattr_buf);
+
+			/* the name buffer is reused for the xattr name */
+			sprintf(name, "%s.test1", XATTR_USER_PREFIX);
+			rc = mo_xattr_set(env, lu2md(child), value, name,
+					  LU_XATTR_CREATE);
+			if (rc < 0)
+				CERROR("Can not setattr child "DFID": rc = %d\n",
+					PFID(lu_object_fid(child)), rc);
+			else
+				CDEBUG(D_RPCTRACE, "End setattr object "DFID"\n",
+				       PFID(lu_object_fid(child)));
+		}
+
+		/* single release point for the reference from the lookup */
+		lu_object_put(env, ec_child);
+		if (rc < 0)
+			break;
+	}
+	RETURN(rc);
+}
+
+/**
+ * Fetch the attributes of \a count children of \a ec_parent, named after
+ * consecutive ids starting at \a id.
+ *
+ * Each child is queried through echo_attr_get_complex() with MA_INODE,
+ * MA_LOV, MA_PFID, MA_HSM and MA_ACL_DEF requested; eti_xattr_buf serves as
+ * the ACL buffer. Processing stops at the first failure.
+ *
+ * \retval 0		success
+ * \retval -1		\a ec_parent is NULL
+ * \retval -ENXIO	no slice of the next device's type in the parent
+ * \retval -EINVAL	no slice of the next device's type in a child
+ * \retval otherwise	error of the failing lookup or getattr
+ */
+static int echo_getattr_object(const struct lu_env *env,
+			       struct echo_device *ed,
+			       struct lu_object *ec_parent,
+			       __u64 id, int count)
+{
+	struct echo_thread_info *info = echo_env_info(env);
+	struct lu_device *ld = ed->ed_next;
+	struct lu_name *lname = &info->eti_lname;
+	struct md_attr *ma = &info->eti_ma;
+	char *name = info->eti_name;
+	struct lu_object *parent;
+	int rc = 0;
+	int n;
+
+	ENTRY;
+
+	if (ec_parent == NULL)
+		return -1;
+	parent = lu_object_locate(ec_parent->lo_header, ld->ld_type);
+	if (parent == NULL)
+		RETURN(-ENXIO);
+
+	memset(ma, 0, sizeof(*ma));
+	ma->ma_acl = info->eti_xattr_buf;
+	ma->ma_acl_size = sizeof(info->eti_xattr_buf);
+	ma->ma_need |= MA_INODE | MA_LOV | MA_PFID | MA_HSM | MA_ACL_DEF;
+
+	for (n = 0; n < count; n++, id++) {
+		struct lu_object *ec_child;
+		struct lu_object *child;
+
+		/* reset the per-object output state */
+		ma->ma_valid = 0;
+		echo_md_build_name(lname, name, id);
+		echo_set_lmm_size(env, ld, ma);
+
+		ec_child = echo_md_lookup(env, ed, lu2md(parent), lname);
+		if (IS_ERR(ec_child)) {
+			CERROR("Can't find child %s: rc = %ld\n",
+			       lname->ln_name, PTR_ERR(ec_child));
+			RETURN(PTR_ERR(ec_child));
+		}
+
+		child = lu_object_locate(ec_child->lo_header, ld->ld_type);
+		if (child == NULL) {
+			CERROR("Can not locate the child %s\n", lname->ln_name);
+			lu_object_put(env, ec_child);
+			RETURN(-EINVAL);
+		}
+
+		CDEBUG(D_RPCTRACE, "Start getattr object "DFID"\n",
+		       PFID(lu_object_fid(child)));
+		rc = echo_attr_get_complex(env, lu2md(child), ma);
+		if (rc)
+			CERROR("Can not getattr child "DFID": rc = %d\n",
+				PFID(lu_object_fid(child)), rc);
+		else
+			CDEBUG(D_RPCTRACE, "End getattr object "DFID"\n",
+			       PFID(lu_object_fid(child)));
+
+		lu_object_put(env, ec_child);
+		if (rc)
+			break;
+	}
+
+	RETURN(rc);
+}
+
+/**
+ * Look up \a count names under \a ec_parent, derived from consecutive ids
+ * starting at \a id. Only mdo_lookup() is exercised; the resulting FID is
+ * left in the eti_fid scratch member and not used further here.
+ *
+ * \retval 0		every lookup succeeded
+ * \retval -1		\a ec_parent is NULL
+ * \retval -ENXIO	no slice of the next device's type in the parent
+ * \retval otherwise	result of the first failing mdo_lookup()
+ */
+static int echo_lookup_object(const struct lu_env *env,
+			      struct echo_device *ed,
+			      struct lu_object *ec_parent,
+			      __u64 id, int count)
+{
+	struct echo_thread_info *info = echo_env_info(env);
+	struct lu_device *ld = ed->ed_next;
+	struct lu_name *lname = &info->eti_lname;
+	struct lu_fid *fid = &info->eti_fid;
+	char *name = info->eti_name;
+	struct lu_object *parent;
+	int rc = 0;
+	int n;
+
+	if (ec_parent == NULL)
+		return -1;
+	parent = lu_object_locate(ec_parent->lo_header, ld->ld_type);
+	if (parent == NULL)
+		return -ENXIO;
+
+	/*prepare the requests*/
+	for (n = 0; n < count; n++, id++) {
+		echo_md_build_name(lname, name, id);
+
+		CDEBUG(D_RPCTRACE, "Start lookup object "DFID" %s %p\n",
+		       PFID(lu_object_fid(parent)), lname->ln_name, parent);
+
+		rc = mdo_lookup(env, lu2md(parent), lname, fid, NULL);
+		if (rc) {
+			CERROR("Can not lookup child %s: rc = %d\n", name, rc);
+			break;
+		}
+
+		CDEBUG(D_RPCTRACE, "End lookup object "DFID" %s %p\n",
+		       PFID(lu_object_fid(parent)), lname->ln_name, parent);
+	}
+	return rc;
+}
+
+/**
+ * Unlink the child named \a lname from \a parent.
+ *
+ * The child is found with echo_md_lookup(), unlinked through mdo_unlink()
+ * and the reference taken by the lookup is dropped again.
+ *
+ * \retval 0		success
+ * \retval -EINVAL	no slice of the next device's type in the child
+ * \retval negative	error from the lookup or from mdo_unlink()
+ */
+static int echo_md_destroy_internal(const struct lu_env *env,
+				    struct echo_device *ed,
+				    struct md_object *parent,
+				    struct lu_name *lname,
+				    struct md_attr *ma)
+{
+	struct lu_device *ld = ed->ed_next;
+	struct lu_object *ec_obj;
+	struct lu_object *md_obj;
+	int rc;
+
+	ENTRY;
+
+	ec_obj = echo_md_lookup(env, ed, parent, lname);
+	if (IS_ERR(ec_obj)) {
+		CERROR("Can't find child %s: rc = %ld\n", lname->ln_name,
+			PTR_ERR(ec_obj));
+		RETURN(PTR_ERR(ec_obj));
+	}
+
+	md_obj = lu_object_locate(ec_obj->lo_header, ld->ld_type);
+	if (md_obj == NULL) {
+		CERROR("Can not locate the child %s\n", lname->ln_name);
+		GOTO(out_put, rc = -EINVAL);
+	}
+
+	CDEBUG(D_RPCTRACE, "Start destroy object "DFID" %s %p\n",
+	       PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent);
+
+	rc = mdo_unlink(env, parent, lu2md(md_obj), lname, ma, 0);
+	if (rc) {
+		CERROR("Can not unlink child %s: rc = %d\n",
+			lname->ln_name, rc);
+		GOTO(out_put, rc);
+	}
+
+	CDEBUG(D_RPCTRACE, "End destroy object "DFID" %s %p\n",
+	       PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent);
+out_put:
+	lu_object_put(env, ec_obj);
+	return rc;
+}
+
+/**
+ * Unlink metadata objects from \a ec_parent.
+ *
+ * With \a name set, only the entry of that name is removed. Otherwise
+ * \a count entries are removed, named after consecutive ids starting at
+ * \a id; processing stops at the first failure.
+ *
+ * \retval 0		success
+ * \retval -EINVAL	no slice of the next device's type in the parent
+ * \retval negative	error from echo_md_destroy_internal()
+ */
+static int echo_destroy_object(const struct lu_env *env,
+			       struct echo_device *ed,
+			       struct lu_object *ec_parent,
+			       char *name, int namelen,
+			       __u64 id, __u32 mode,
+			       int count)
+{
+	struct echo_thread_info *info = echo_env_info(env);
+	struct lu_name	  *lname = &info->eti_lname;
+	struct md_attr	  *ma = &info->eti_ma;
+	struct lu_device	*ld = ed->ed_next;
+	struct lu_object	*parent;
+	int		      rc = 0;
+	int		      i;
+	ENTRY;
+
+	parent = lu_object_locate(ec_parent->lo_header, ld->ld_type);
+	if (parent == NULL)
+		RETURN(-EINVAL);
+
+	memset(ma, 0, sizeof(*ma));
+	ma->ma_attr.la_mode = mode;
+	ma->ma_attr.la_valid = LA_CTIME;
+	ma->ma_attr.la_ctime = cfs_time_current_64();
+	ma->ma_need = MA_INODE;
+	ma->ma_valid = 0;
+
+	if (name != NULL) {
+		lname->ln_name = name;
+		lname->ln_namelen = namelen;
+		rc = echo_md_destroy_internal(env, ed, lu2md(parent), lname,
+					      ma);
+		RETURN(rc);
+	}
+
+	/*prepare the requests*/
+	for (i = 0; i < count; i++) {
+		char *tmp_name = info->eti_name;
+
+		ma->ma_valid = 0;
+		echo_md_build_name(lname, tmp_name, id);
+
+		rc = echo_md_destroy_internal(env, ed, lu2md(parent), lname,
+					      ma);
+		if (rc) {
+			/* 'name' is always NULL on this path; report the
+			 * generated name instead */
+			CERROR("Can not unlink child %s: rc = %d\n", tmp_name,
+				rc);
+			break;
+		}
+		id++;
+	}
+
+	RETURN(rc);
+}
+
+/**
+ * Walk \a path, component by component, from the root of the metadata
+ * device below \a ed and return the echo-level object of the last component.
+ *
+ * \a path is split in place with strsep(), so the caller's buffer is
+ * modified. Empty components (repeated or trailing '/') are skipped.
+ * \a path_len is not used. The returned object carries a reference that
+ * callers in this file drop with lu_object_put().
+ *
+ * \retval object pointer on success
+ * \retval ERR_PTR(rc) on failure
+ */
+static struct lu_object *echo_resolve_path(const struct lu_env *env,
+					   struct echo_device *ed, char *path,
+					   int path_len)
+{
+	struct lu_device	*ld = ed->ed_next;
+	struct md_device	*md = lu2md_dev(ld);
+	struct echo_thread_info *info = echo_env_info(env);
+	struct lu_fid	   *fid = &info->eti_fid;
+	struct lu_name	  *lname = &info->eti_lname;
+	struct lu_object	*parent = NULL;
+	struct lu_object	*child = NULL;
+	int rc = 0;
+	ENTRY;
+
+	/*Only support MDD layer right now*/
+	rc = md->md_ops->mdo_root_get(env, md, fid);
+	if (rc) {
+		CERROR("get root error: rc = %d\n", rc);
+		RETURN(ERR_PTR(rc));
+	}
+
+	/* In the function below, .hs_keycmp resolves to
+	 * lu_obj_hop_keycmp() */
+	/* coverity[overrun-buffer-val] */
+	parent = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, fid, NULL);
+	if (IS_ERR(parent)) {
+		CERROR("Can not find the parent "DFID": rc = %ld\n",
+			PFID(fid), PTR_ERR(parent));
+		RETURN(parent);
+	}
+
+	while (1) {
+		struct lu_object *ld_parent;
+		char *e;
+
+		e = strsep(&path, "/");
+		if (e == NULL)
+			break;
+
+		/* empty component: stop at the end of the path, otherwise
+		 * skip it */
+		if (e[0] == 0) {
+			if (!path || path[0] == '\0')
+				break;
+			continue;
+		}
+
+		lname->ln_name = e;
+		lname->ln_namelen = strlen(e);
+
+		ld_parent = lu_object_locate(parent->lo_header, ld->ld_type);
+		if (ld_parent == NULL) {
+			lu_object_put(env, parent);
+			rc = -EINVAL;
+			break;
+		}
+
+		child = echo_md_lookup(env, ed, lu2md(ld_parent), lname);
+		if (IS_ERR(child)) {
+			rc = (int)PTR_ERR(child);
+			/* log while the parent reference is still held:
+			 * ld_parent belongs to the parent object */
+			CERROR("lookup %s under parent "DFID": rc = %d\n",
+				lname->ln_name, PFID(lu_object_fid(ld_parent)),
+				rc);
+			lu_object_put(env, parent);
+			break;
+		}
+		lu_object_put(env, parent);
+		parent = child;
+	}
+	if (rc)
+		RETURN(ERR_PTR(rc));
+
+	RETURN(parent);
+}
+
+/**
+ * Fill the lu_ucred of \a env from the credentials of the current task.
+ *
+ * The ucred is marked UCRED_INVALID while it is being filled in and
+ * UCRED_NEW once complete. The CFS_CAP_FS_MASK capabilities are cleared
+ * when the fsuid is non-zero.
+ */
+static void echo_ucred_init(struct lu_env *env)
+{
+	struct lu_ucred *uc = lu_ucred(env);
+
+	uc->uc_valid = UCRED_INVALID;
+
+	uc->uc_suppgids[0] = -1;
+	uc->uc_suppgids[1] = -1;
+
+	/* current and original ids both start out as the caller's ids */
+	uc->uc_o_uid = current_uid();
+	uc->uc_uid = uc->uc_o_uid;
+	uc->uc_o_gid = current_gid();
+	uc->uc_gid = uc->uc_o_gid;
+	uc->uc_o_fsuid = current_fsuid();
+	uc->uc_fsuid = uc->uc_o_fsuid;
+	uc->uc_o_fsgid = current_fsgid();
+	uc->uc_fsgid = uc->uc_o_fsgid;
+	uc->uc_cap = cfs_curproc_cap_pack();
+
+	/* remove fs privilege for non-root user. */
+	if (uc->uc_fsuid)
+		uc->uc_cap &= ~CFS_CAP_FS_MASK;
+
+	uc->uc_valid = UCRED_NEW;
+}
+
+/** Reset the lu_ucred of \a env to the UCRED_INIT state. */
+static void echo_ucred_fini(struct lu_env *env)
+{
+	lu_ucred(env)->uc_valid = UCRED_INIT;
+}
+
+/* Context and session tags passed to lu_env_refill_by_tags() by
+ * echo_md_handler(). */
+#define ECHO_MD_CTX_TAG (LCT_REMEMBER | LCT_MD_THREAD)
+#define ECHO_MD_SES_TAG (LCT_REMEMBER | LCT_SESSION)
+/**
+ * Execute one metadata test \a command below the directory named by \a path.
+ *
+ * Only a next device of type LUSTRE_MDD_NAME is supported. The handler
+ * obtains an environment, allocates the eti_big_lmm buffer, resolves
+ * \a path, optionally copies an object name of data->ioc_plen2 bytes from
+ * data->ioc_pbuf2, and then dispatches on \a command (create/mkdir,
+ * destroy/rmdir, lookup, getattr, setattr). \a id and \a count select the
+ * objects when no name is supplied.
+ *
+ * \retval 0 on success, negative errno on failure
+ */
+static int echo_md_handler(struct echo_device *ed, int command,
+			   char *path, int path_len, __u64 id, int count,
+			   struct obd_ioctl_data *data)
+{
+	struct echo_thread_info *info;
+	struct lu_device      *ld = ed->ed_next;
+	struct lu_env	 *env;
+	int		    refcheck;
+	struct lu_object      *parent;
+	char		  *name = NULL;
+	int		    namelen = data->ioc_plen2;
+	int		    rc = 0;
+	ENTRY;
+
+	if (ld == NULL) {
+		CERROR("MD echo client is not being initialized properly\n");
+		RETURN(-EINVAL);
+	}
+
+	if (strcmp(ld->ld_type->ldt_name, LUSTRE_MDD_NAME)) {
+		CERROR("Only support MDD layer right now!\n");
+		RETURN(-EINVAL);
+	}
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	rc = lu_env_refill_by_tags(env, ECHO_MD_CTX_TAG, ECHO_MD_SES_TAG);
+	if (rc != 0)
+		GOTO(out_env, rc);
+
+	/* init big_lmm buffer */
+	info = echo_env_info(env);
+	LASSERT(info->eti_big_lmm == NULL);
+	OBD_ALLOC_LARGE(info->eti_big_lmm, MIN_MD_SIZE);
+	if (info->eti_big_lmm == NULL)
+		GOTO(out_env, rc = -ENOMEM);
+	info->eti_big_lmmsize = MIN_MD_SIZE;
+
+	/* NOTE(review): echo_resolve_path() splits the path buffer in place
+	 * with strsep(), so the message below prints the modified buffer. */
+	parent = echo_resolve_path(env, ed, path, path_len);
+	if (IS_ERR(parent)) {
+		CERROR("Can not resolve the path %s: rc = %ld\n", path,
+			PTR_ERR(parent));
+		GOTO(out_free, rc = PTR_ERR(parent));
+	}
+
+	/* the optional object name comes from user space; one extra byte is
+	 * allocated beyond the namelen bytes that are copied in */
+	if (namelen > 0) {
+		OBD_ALLOC(name, namelen + 1);
+		if (name == NULL)
+			GOTO(out_put, rc = -ENOMEM);
+		if (copy_from_user(name, data->ioc_pbuf2, namelen))
+			GOTO(out_name, rc = -EFAULT);
+	}
+
+	echo_ucred_init(env);
+
+	switch (command) {
+	case ECHO_MD_CREATE:
+	case ECHO_MD_MKDIR: {
+		/* this 'info' shadows the outer one; both come from
+		 * echo_env_info(env) */
+		struct echo_thread_info *info = echo_env_info(env);
+		__u32 mode = data->ioc_obdo2.o_mode;
+		struct lu_fid *fid = &info->eti_fid;
+		int stripe_count = (int)data->ioc_obdo2.o_misc;
+		int stripe_index = (int)data->ioc_obdo2.o_stripe_idx;
+
+		rc = ostid_to_fid(fid, &data->ioc_obdo1.o_oi, 0);
+		if (rc != 0)
+			break;
+
+		/* In the function below, .hs_keycmp resolves to
+		 * lu_obj_hop_keycmp() */
+		/* coverity[overrun-buffer-val] */
+		rc = echo_create_md_object(env, ed, parent, fid, name, namelen,
+					   id, mode, count, stripe_count,
+					   stripe_index);
+		break;
+	}
+	case ECHO_MD_DESTROY:
+	case ECHO_MD_RMDIR: {
+		__u32 mode = data->ioc_obdo2.o_mode;
+
+		rc = echo_destroy_object(env, ed, parent, name, namelen,
+					 id, mode, count);
+		break;
+	}
+	case ECHO_MD_LOOKUP:
+		rc = echo_lookup_object(env, ed, parent, id, count);
+		break;
+	case ECHO_MD_GETATTR:
+		rc = echo_getattr_object(env, ed, parent, id, count);
+		break;
+	case ECHO_MD_SETATTR:
+		rc = echo_setattr_object(env, ed, parent, id, count);
+		break;
+	default:
+		CERROR("unknown command %d\n", command);
+		rc = -EINVAL;
+		break;
+	}
+	echo_ucred_fini(env);
+
+	/* cleanup labels undo the setup steps in reverse order */
+out_name:
+	if (name != NULL)
+		OBD_FREE(name, namelen + 1);
+out_put:
+	lu_object_put(env, parent);
+out_free:
+	LASSERT(info->eti_big_lmm);
+	OBD_FREE_LARGE(info->eti_big_lmm, info->eti_big_lmmsize);
+	info->eti_big_lmm = NULL;
+	info->eti_big_lmmsize = 0;
+out_env:
+	cl_env_put(env, &refcheck);
+	return rc;
+}
+
+/**
+ * Create an echo object described by \a oa and, optionally, by the user
+ * stripe descriptor \a ulsm of \a ulsm_nob bytes.
+ *
+ * A stripe descriptor is allocated and, if \a ulsm is given, filled from
+ * user space and completed with defaults. When \a on_target is set the
+ * object is also created on the export with obd_create(); it is destroyed
+ * again if a later step fails. On return \a oa carries the object id that
+ * was used.
+ *
+ * \retval 0 on success, negative errno on failure
+ */
+static int echo_create_object(const struct lu_env *env, struct echo_device *ed,
+			      int on_target, struct obdo *oa, void *ulsm,
+			      int ulsm_nob, struct obd_trans_info *oti)
+{
+	struct echo_object     *eco;
+	struct echo_client_obd *ec = ed->ed_ec;
+	struct lov_stripe_md   *lsm = NULL;
+	int		     rc;
+	int		     created = 0;
+	ENTRY;
+
+	if ((oa->o_valid & OBD_MD_FLID) == 0 && /* no obj id */
+	    (on_target ||		       /* set_stripe */
+	     ec->ec_nstripes != 0)) {	   /* LOV */
+		CERROR ("No valid oid\n");
+		RETURN(-EINVAL);
+	}
+
+	rc = echo_alloc_memmd(ed, &lsm);
+	if (rc < 0) {
+		CERROR("Cannot allocate md: rc = %d\n", rc);
+		GOTO(failed, rc);
+	}
+
+	if (ulsm != NULL) {
+		int i, idx;
+
+		rc = echo_copyin_lsm (ed, lsm, ulsm, ulsm_nob);
+		if (rc != 0)
+			GOTO(failed, rc);
+
+		/* fill in defaults for fields the user left at zero */
+		if (lsm->lsm_stripe_count == 0)
+			lsm->lsm_stripe_count = ec->ec_nstripes;
+
+		if (lsm->lsm_stripe_size == 0)
+			lsm->lsm_stripe_size = PAGE_CACHE_SIZE;
+
+		/* random starting point for the stripe index assignment */
+		idx = cfs_rand();
+
+		/* setup stripes: indices + default ids if required */
+		for (i = 0; i < lsm->lsm_stripe_count; i++) {
+			if (ostid_id(&lsm->lsm_oinfo[i]->loi_oi) == 0)
+				lsm->lsm_oinfo[i]->loi_oi = lsm->lsm_oi;
+
+			lsm->lsm_oinfo[i]->loi_ost_idx =
+				(idx + i) % ec->ec_nstripes;
+		}
+	}
+
+	/* setup object ID here for !on_target and LOV hint */
+	if (oa->o_valid & OBD_MD_FLID) {
+		LASSERT(oa->o_valid & OBD_MD_FLGROUP);
+		lsm->lsm_oi = oa->o_oi;
+	}
+
+	/* no id so far: take the next one from the file-wide counter */
+	if (ostid_id(&lsm->lsm_oi) == 0)
+		ostid_set_id(&lsm->lsm_oi, ++last_object_id);
+
+	rc = 0;
+	if (on_target) {
+		/* Only echo objects are allowed to be created */
+		LASSERT((oa->o_valid & OBD_MD_FLGROUP) &&
+			(ostid_seq(&oa->o_oi) == FID_SEQ_ECHO));
+		rc = obd_create(env, ec->ec_exp, oa, &lsm, oti);
+		if (rc != 0) {
+			CERROR("Cannot create objects: rc = %d\n", rc);
+			GOTO(failed, rc);
+		}
+		created = 1;
+	}
+
+	/* See what object ID we were given */
+	oa->o_oi = lsm->lsm_oi;
+	oa->o_valid |= OBD_MD_FLID;
+
+	/* NOTE(review): lsm is passed by address and re-checked for NULL at
+	 * 'failed', so cl_echo_object_find() presumably may take it over and
+	 * clear it - confirm. */
+	eco = cl_echo_object_find(ed, &lsm);
+	if (IS_ERR(eco))
+		GOTO(failed, rc = PTR_ERR(eco));
+	cl_echo_object_put(eco);
+
+	CDEBUG(D_INFO, "oa oid "DOSTID"\n", POSTID(&oa->o_oi));
+	EXIT;
+
+	/* reached on success as well; the destroy only runs when rc != 0 */
+ failed:
+	if (created && rc)
+		obd_destroy(env, ec->ec_exp, oa, lsm, oti, NULL, NULL);
+	if (lsm)
+		echo_free_memmd(ed, &lsm);
+	if (rc)
+		CERROR("create object failed with: rc = %d\n", rc);
+	return (rc);
+}
+
+/**
+ * Find the echo object identified by \a oa on device \a ed and store it in
+ * \a ecop. Object id 0 is rejected. When \a oa carries no group, the echo
+ * sequence is set on the id used for the search.
+ *
+ * \retval 0		success, *ecop is set
+ * \retval -EINVAL	\a oa has no valid object id
+ * \retval negative	error from echo_alloc_memmd() or cl_echo_object_find()
+ */
+static int echo_get_object(struct echo_object **ecop, struct echo_device *ed,
+			   struct obdo *oa)
+{
+	struct lov_stripe_md *lsm = NULL;
+	struct echo_object *found;
+	int rc;
+	ENTRY;
+
+	if ((oa->o_valid & OBD_MD_FLID) == 0 || ostid_id(&oa->o_oi) == 0) {
+		/* disallow use of object id 0 */
+		CERROR ("No valid oid\n");
+		RETURN(-EINVAL);
+	}
+
+	rc = echo_alloc_memmd(ed, &lsm);
+	if (rc < 0)
+		RETURN(rc);
+
+	lsm->lsm_oi = oa->o_oi;
+	if ((oa->o_valid & OBD_MD_FLGROUP) == 0)
+		ostid_set_seq_echo(&lsm->lsm_oi);
+
+	found = cl_echo_object_find(ed, &lsm);
+	if (IS_ERR(found)) {
+		rc = PTR_ERR(found);
+	} else {
+		*ecop = found;
+		rc = 0;
+	}
+
+	/* free the descriptor if it is still set after the find */
+	if (lsm)
+		echo_free_memmd(ed, &lsm);
+	RETURN(rc);
+}
+
+/** Release \a eco through cl_echo_object_put(), logging any failure. */
+static void echo_put_object(struct echo_object *eco)
+{
+	if (cl_echo_object_put(eco) != 0)
+		CERROR("echo client: drop an object failed");
+}
+
+/**
+ * Translate a file offset into the id of the stripe object that holds it
+ * and the offset inside that stripe object.
+ *
+ * \a offp and \a idp are updated in place; both are left untouched when
+ * \a lsm has at most one stripe.
+ */
+static void
+echo_get_stripe_off_id (struct lov_stripe_md *lsm, obd_off *offp, obd_id *idp)
+{
+	unsigned long nstripes;
+	unsigned long ssize;
+	unsigned long stripe_width;
+	unsigned long in_width;
+	obd_off nwidths;
+	int which;
+
+	if (lsm->lsm_stripe_count <= 1)
+		return;
+
+	ssize = lsm->lsm_stripe_size;
+	nstripes = lsm->lsm_stripe_count;
+
+	/* stripe_width = # bytes in all stripes */
+	stripe_width = ssize * nstripes;
+
+	/* do_div() leaves the whole number of widths in nwidths and returns
+	 * the offset within the last width */
+	nwidths = *offp;
+	in_width = do_div (nwidths, stripe_width);
+
+	which = in_width / ssize;
+
+	*idp = ostid_id(&lsm->lsm_oinfo[which]->loi_oi);
+	*offp = nwidths * ssize + in_width % ssize;
+}
+
+/**
+ * Stamp every OBD_ECHO_BLOCK_SIZE block of \a page with debug data.
+ *
+ * For writes each block is stamped with the stripe offset and stripe id
+ * derived from \a offset and \a id; for reads a fixed poison value is used
+ * for both. \a count must be PAGE_CACHE_SIZE.
+ */
+static void
+echo_client_page_debug_setup(struct lov_stripe_md *lsm,
+			     struct page *page, int rw, obd_id id,
+			     obd_off offset, obd_off count)
+{
+	char *kaddr;
+	int pos;
+
+	/* no partial pages on the client */
+	LASSERT(count == PAGE_CACHE_SIZE);
+
+	kaddr = kmap(page);
+
+	for (pos = 0; pos < PAGE_CACHE_SIZE; pos += OBD_ECHO_BLOCK_SIZE) {
+		obd_off stripe_off = 0xdeadbeef00c0ffeeULL;
+		obd_id stripe_id = 0xdeadbeef00c0ffeeULL;
+
+		/* only writes carry the real stripe location */
+		if (rw == OBD_BRW_WRITE) {
+			stripe_off = offset + pos;
+			stripe_id = id;
+			echo_get_stripe_off_id(lsm, &stripe_off, &stripe_id);
+		}
+		block_debug_setup(kaddr + pos, OBD_ECHO_BLOCK_SIZE,
+				  stripe_off, stripe_id);
+	}
+
+	kunmap(page);
+}
+
+/**
+ * Verify the debug stamp of every OBD_ECHO_BLOCK_SIZE block of \a page
+ * against the stripe offset and id expected for \a offset and \a id.
+ *
+ * All blocks are checked even after a mismatch. \a count must be
+ * PAGE_CACHE_SIZE.
+ *
+ * \retval 0 if every block matched, otherwise the result of the last
+ *	   failing block_debug_check()
+ */
+static int echo_client_page_debug_check(struct lov_stripe_md *lsm,
+					struct page *page, obd_id id,
+					obd_off offset, obd_off count)
+{
+	char *kaddr;
+	int result = 0;
+	int pos;
+
+	/* no partial pages on the client */
+	LASSERT(count == PAGE_CACHE_SIZE);
+
+	kaddr = kmap(page);
+
+	for (pos = 0; pos < PAGE_CACHE_SIZE; pos += OBD_ECHO_BLOCK_SIZE) {
+		obd_off stripe_off = offset + pos;
+		obd_id stripe_id = id;
+		int block_rc;
+
+		echo_get_stripe_off_id (lsm, &stripe_off, &stripe_id);
+
+		block_rc = block_debug_check("test_brw",
+					     kaddr + pos, OBD_ECHO_BLOCK_SIZE,
+					     stripe_off, stripe_id);
+		if (block_rc != 0) {
+			CERROR ("Error in echo object "LPX64"\n", id);
+			result = block_rc;
+		}
+	}
+
+	kunmap(page);
+	return result;
+}
+
+/**
+ * Read or write \a count bytes of \a eco at \a offset using freshly
+ * allocated kernel pages.
+ *
+ * \a count must be a non-zero multiple of the page size. One page per
+ * PAGE_CACHE_SIZE of \a count is allocated, optionally stamped with debug
+ * data, and the transfer is carried out by cl_echo_object_brw(). After a
+ * successful read the pages are optionally verified. All pages and arrays
+ * are freed before returning. \a oti is not used here.
+ *
+ * \retval 0		success
+ * \retval -EINVAL	\a count is zero or not page aligned
+ * \retval -ENOMEM	an allocation failed
+ * \retval otherwise	error from the transfer or from verification
+ */
+static int echo_client_kbrw(struct echo_device *ed, int rw, struct obdo *oa,
+			    struct echo_object *eco, obd_off offset,
+			    obd_size count, int async,
+			    struct obd_trans_info *oti)
+{
+	struct lov_stripe_md   *lsm = eco->eo_lsm;
+	obd_count	       npages;
+	struct brw_page	*pga;
+	struct brw_page	*pgp;
+	struct page	    **pages;
+	obd_off		 off;
+	int		     i;
+	int		     rc;
+	int		     verify;
+	int		     gfp_mask;
+	int		     brw_flags = 0;
+	ENTRY;
+
+	/* debug stamping/checking is only done when OBD_FL_DEBUG_CHECK is
+	 * set and the object is not ECHO_PERSISTENT_OBJID */
+	verify = (ostid_id(&oa->o_oi) != ECHO_PERSISTENT_OBJID &&
+		  (oa->o_valid & OBD_MD_FLFLAGS) != 0 &&
+		  (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0);
+
+	/* bit 1 of the object id selects the allocation mask */
+	gfp_mask = ((ostid_id(&oa->o_oi) & 2) == 0) ? GFP_IOFS : GFP_HIGHUSER;
+
+	LASSERT(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ);
+	LASSERT(lsm != NULL);
+	LASSERT(ostid_id(&lsm->lsm_oi) == ostid_id(&oa->o_oi));
+
+	if (count <= 0 ||
+	    (count & (~CFS_PAGE_MASK)) != 0)
+		RETURN(-EINVAL);
+
+	/* XXX think again with misaligned I/O */
+	npages = count >> PAGE_CACHE_SHIFT;
+
+	if (rw == OBD_BRW_WRITE)
+		brw_flags = OBD_BRW_ASYNC;
+
+	OBD_ALLOC(pga, npages * sizeof(*pga));
+	if (pga == NULL)
+		RETURN(-ENOMEM);
+
+	OBD_ALLOC(pages, npages * sizeof(*pages));
+	if (pages == NULL) {
+		OBD_FREE(pga, npages * sizeof(*pga));
+		RETURN(-ENOMEM);
+	}
+
+	/* allocate and describe one page per PAGE_CACHE_SIZE of the range */
+	for (i = 0, pgp = pga, off = offset;
+	     i < npages;
+	     i++, pgp++, off += PAGE_CACHE_SIZE) {
+
+		LASSERT (pgp->pg == NULL);      /* for cleanup */
+
+		/* preset so that the 'goto out' below reports -ENOMEM */
+		rc = -ENOMEM;
+		OBD_PAGE_ALLOC(pgp->pg, gfp_mask);
+		if (pgp->pg == NULL)
+			goto out;
+
+		pages[i] = pgp->pg;
+		pgp->count = PAGE_CACHE_SIZE;
+		pgp->off = off;
+		pgp->flag = brw_flags;
+
+		if (verify)
+			echo_client_page_debug_setup(lsm, pgp->pg, rw,
+						     ostid_id(&oa->o_oi), off,
+						     pgp->count);
+	}
+
+	/* brw mode can only be used at client */
+	LASSERT(ed->ed_next != NULL);
+	rc = cl_echo_object_brw(eco, rw, offset, pages, npages, async);
+
+ out:
+	/* only the data of a successful read is checked */
+	if (rc != 0 || rw != OBD_BRW_READ)
+		verify = 0;
+
+	for (i = 0, pgp = pga; i < npages; i++, pgp++) {
+		/* entries past a failed allocation have no page */
+		if (pgp->pg == NULL)
+			continue;
+
+		if (verify) {
+			int vrc;
+			vrc = echo_client_page_debug_check(lsm, pgp->pg,
+							   ostid_id(&oa->o_oi),
+							   pgp->off, pgp->count);
+			/* keep the first verification error */
+			if (vrc != 0 && rc == 0)
+				rc = vrc;
+		}
+		OBD_PAGE_FREE(pgp->pg);
+	}
+	OBD_FREE(pga, npages * sizeof(*pga));
+	OBD_FREE(pages, npages * sizeof(*pages));
+	RETURN(rc);
+}
+
+/*
+ * Move @count bytes at @offset through obd_preprw()/obd_commitrw() on @exp,
+ * @batch bytes per round.  -EINVAL if @count is not a non-zero multiple of
+ * the page size, @batch is below one page, or @lsm and @oa differ in id.
+ */
+static int echo_client_prep_commit(const struct lu_env *env,
+				   struct obd_export *exp, int rw,
+				   struct obdo *oa, struct echo_object *eco,
+				   obd_off offset, obd_size count,
+				   obd_size batch, struct obd_trans_info *oti,
+				   int async)
+{
+	struct lov_stripe_md *lsm = eco->eo_lsm;
+	struct obd_ioobj ioo;
+	struct niobuf_local *lnb;
+	struct niobuf_remote *rnb;
+	obd_off off = offset;
+	obd_size npages, tot_pages, apc;
+	int i, ret = 0, brw_flags = 0;
+	ENTRY;
+
+	if (count <= 0 || (count & (~CFS_PAGE_MASK)) != 0 ||
+	    (lsm != NULL && ostid_id(&lsm->lsm_oi) != ostid_id(&oa->o_oi)))
+		RETURN(-EINVAL);
+
+	/* apc keeps the allocated page count; npages may shrink in the loop */
+	apc = npages = batch >> PAGE_CACHE_SHIFT;
+	tot_pages = count >> PAGE_CACHE_SHIFT;
+	if (npages == 0)	/* tot_pages would never reach zero below */
+		RETURN(-EINVAL);
+
+	OBD_ALLOC(lnb, apc * sizeof(struct niobuf_local));
+	OBD_ALLOC(rnb, apc * sizeof(struct niobuf_remote));
+	if (lnb == NULL || rnb == NULL)
+		GOTO(out, ret = -ENOMEM);
+
+	if (rw == OBD_BRW_WRITE && async)
+		brw_flags |= OBD_BRW_ASYNC;
+
+	obdo_to_ioobj(oa, &ioo);
+	for (; tot_pages; tot_pages -= npages) {
+		int lpages;
+
+		if (tot_pages < npages)
+			npages = tot_pages;
+
+		for (i = 0; i < npages; i++, off += PAGE_CACHE_SIZE) {
+			rnb[i].offset = off;
+			rnb[i].len = PAGE_CACHE_SIZE;
+			rnb[i].flags = brw_flags;
+		}
+
+		ioo.ioo_bufcnt = npages;
+		oti->oti_transno = 0;
+		lpages = npages;
+		ret = obd_preprw(env, rw, exp, oa, 1, &ioo, rnb, &lpages,
+				 lnb, oti, NULL);
+		if (ret != 0)
+			GOTO(out, ret);
+		LASSERT(lpages == npages);
+
+		for (i = 0; i < lpages; i++) {
+			struct page *page = lnb[i].page;
+
+			/* read past eof? */
+			if (page == NULL && lnb[i].rc == 0)
+				continue;
+			if (async)
+				lnb[i].flags |= OBD_BRW_ASYNC;
+			if (ostid_id(&oa->o_oi) == ECHO_PERSISTENT_OBJID ||
+			    (oa->o_valid & OBD_MD_FLFLAGS) == 0 ||
+			    (oa->o_flags & OBD_FL_DEBUG_CHECK) == 0)
+				continue;
+
+			if (rw == OBD_BRW_WRITE)
+				echo_client_page_debug_setup(lsm, page, rw,
+							     ostid_id(&oa->o_oi),
+							     rnb[i].offset,
+							     rnb[i].len);
+			else
+				echo_client_page_debug_check(lsm, page,
+							     ostid_id(&oa->o_oi),
+							     rnb[i].offset,
+							     rnb[i].len);
+		}
+
+		ret = obd_commitrw(env, rw, exp, oa, 1, &ioo,
+				   rnb, npages, lnb, oti, ret);
+		if (ret != 0)
+			GOTO(out, ret);
+
+		/* Reset oti otherwise it would confuse ldiskfs. */
+		memset(oti, 0, sizeof(*oti));
+	}
+
+out:
+	if (lnb)
+		OBD_FREE(lnb, apc * sizeof(struct niobuf_local));
+	if (rnb)
+		OBD_FREE(rnb, apc * sizeof(struct niobuf_remote));
+	RETURN(ret);
+}
+
+/*
+ * Serve OBD_IOC_BRW_READ/WRITE.  The test mode comes in data->ioc_pbuf1:
+ * modes 1 and 2 use echo_client_kbrw() (mode 1 with async cleared), mode 3
+ * uses echo_client_prep_commit() with data->ioc_plen1 as batch size, and
+ * any other value yields -EINVAL.  Mode 3 is forced when ed_next is NULL.
+ */
+static int echo_client_brw_ioctl(const struct lu_env *env, int rw,
+				 struct obd_export *exp,
+				 struct obd_ioctl_data *data,
+				 struct obd_trans_info *dummy_oti)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+	struct echo_device *ed = obd2echo_dev(obd);
+	struct echo_client_obd *ec = ed->ed_ec;
+	struct obdo *oa = &data->ioc_obdo1;
+	struct echo_object *eco;
+	long test_mode;
+	int async;
+	int rc;
+	ENTRY;
+
+	LASSERT(oa->o_valid & OBD_MD_FLGROUP);
+
+	rc = echo_get_object(&eco, ed, oa);
+	if (rc)
+		RETURN(rc);
+
+	oa->o_valid &= ~OBD_MD_FLHANDLE;
+
+	test_mode = (long)data->ioc_pbuf1;
+	async = (test_mode != 1);
+
+	/* OFD/obdfilter works only via prep/commit */
+	if (ed->ed_next == NULL && test_mode != 3) {
+		test_mode = 3;
+		data->ioc_plen1 = data->ioc_count;
+	}
+
+	/* Truncate batch size to maximum */
+	if (data->ioc_plen1 > PTLRPC_MAX_BRW_SIZE)
+		data->ioc_plen1 = PTLRPC_MAX_BRW_SIZE;
+
+	/* modes 1 and 2 share the kbrw path and differ only in async */
+	if (test_mode == 1 || test_mode == 2) {
+		rc = echo_client_kbrw(ed, rw, oa, eco, data->ioc_offset,
+				      data->ioc_count, async, dummy_oti);
+	} else if (test_mode == 3) {
+		rc = echo_client_prep_commit(env, ec->ec_exp, rw, oa, eco,
+					     data->ioc_offset, data->ioc_count,
+					     data->ioc_plen1, dummy_oti, async);
+	} else {
+		rc = -EINVAL;
+	}
+
+	echo_put_object(eco);
+	RETURN(rc);
+}
+
+/*
+ * Lock the extent from @offset over @nob bytes (up to offset (obd_off)-1 if
+ * @nob is 0) in PR or PW mode; the cookie is returned in oa->o_handle.
+ */
+static int
+echo_client_enqueue(struct obd_export *exp, struct obdo *oa,
+		    int mode, obd_off offset, obd_size nob)
+{
+	struct echo_device *ed = obd2echo_dev(exp->exp_obd);
+	struct lustre_handle *ulh = &oa->o_handle;
+	struct echo_object *eco;
+	obd_off last;
+	int rc;
+	ENTRY;
+
+	if (ed->ed_next == NULL)
+		RETURN(-EOPNOTSUPP);
+	if (mode != LCK_PR && mode != LCK_PW)
+		RETURN(-EINVAL);
+	if ((offset & (~CFS_PAGE_MASK)) != 0 || (nob & (~CFS_PAGE_MASK)) != 0)
+		RETURN(-EINVAL);
+
+	rc = echo_get_object(&eco, ed, oa);
+	if (rc != 0)
+		RETURN(rc);
+	last = (nob != 0) ? (offset + nob - 1) : ((obd_off) -1);
+	rc = cl_echo_enqueue(eco, offset, last, mode, &ulh->cookie);
+	if (rc == 0) {
+		oa->o_valid |= OBD_MD_FLHANDLE;
+		CDEBUG(D_INFO, "Cookie is "LPX64"\n", ulh->cookie);
+	}
+	echo_put_object(eco);
+	RETURN(rc);
+}
+
+/* Cancel, via cl_echo_cancel(), the lock whose cookie is in oa->o_handle. */
+static int
+echo_client_cancel(struct obd_export *exp, struct obdo *oa)
+{
+	__u64 cookie = oa->o_handle.cookie;
+
+	if (!(oa->o_valid & OBD_MD_FLHANDLE))
+		return -EINVAL;
+
+	CDEBUG(D_INFO, "Cookie is "LPX64"\n", cookie);
+	return cl_echo_cancel(obd2echo_dev(exp->exp_obd), cookie);
+}
+
+/*
+ * ioctl handler of the echo client.  @karg is the obd_ioctl_data of the
+ * request and all object commands act on its ioc_obdo1.  Several commands
+ * need CFS_CAP_SYS_ADMIN (-EPERM otherwise); unknown ones give -ENOTTY.
+ * Once the lu_env exists every path leaves through "out" to release it.
+ */
+static int
+echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
+		      void *karg, void *uarg)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct echo_device *ed = obd2echo_dev(obd);
+	struct echo_client_obd *ec = ed->ed_ec;
+	struct echo_object *eco;
+	struct obd_ioctl_data *data = karg;
+	struct obdo *oa = &data->ioc_obdo1;
+	struct obd_trans_info dummy_oti;
+	struct lu_env *env;
+	struct oti_req_ack_lock *ack_lock;
+	struct lu_fid fid;
+	int rw = OBD_BRW_READ, rc = 0, i;
+	ENTRY;
+
+	memset(&dummy_oti, 0, sizeof(dummy_oti));
+
+	if (!(oa->o_valid & OBD_MD_FLGROUP)) {
+		oa->o_valid |= OBD_MD_FLGROUP;
+		ostid_set_seq_echo(&oa->o_oi);
+	}
+
+	/* This FID is unpacked just for validation at this point */
+	rc = ostid_to_fid(&fid, &oa->o_oi, 0);
+	if (rc < 0)
+		RETURN(rc);
+
+	OBD_ALLOC_PTR(env);
+	if (env == NULL)
+		RETURN(-ENOMEM);
+
+	rc = lu_env_init(env, LCT_DT_THREAD);
+	if (rc)
+		GOTO(out, rc = -ENOMEM);
+
+	switch (cmd) {
+	case OBD_IOC_CREATE:		    /* may create echo object */
+		if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+			GOTO (out, rc = -EPERM);
+
+		rc = echo_create_object(env, ed, 1, oa, data->ioc_pbuf1,
+					data->ioc_plen1, &dummy_oti);
+		GOTO(out, rc);
+
+	case OBD_IOC_ECHO_MD: {
+		char *dir = NULL;
+		int dirlen = data->ioc_plen1;
+		int count, md_cmd;
+		__u64 id;
+
+		if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+			GOTO(out, rc = -EPERM);
+
+		count = data->ioc_count;
+		md_cmd = data->ioc_command;
+		id = ostid_id(&data->ioc_obdo2.o_oi);
+
+		OBD_ALLOC(dir, dirlen + 1);
+		if (dir == NULL)
+			GOTO(out, rc = -ENOMEM);
+
+		if (copy_from_user(dir, data->ioc_pbuf1, dirlen)) {
+			OBD_FREE(dir, dirlen + 1);
+			GOTO(out, rc = -EFAULT);
+		}
+
+		rc = echo_md_handler(ed, md_cmd, dir, dirlen, id, count, data);
+		OBD_FREE(dir, dirlen + 1);
+		GOTO(out, rc);
+	}
+	case OBD_IOC_ECHO_ALLOC_SEQ: {
+		struct lu_env *cl_env;
+		int refcheck, max_count = LUSTRE_METADATA_SEQ_MAX_WIDTH;
+		__u64 seq;
+
+		if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+			GOTO(out, rc = -EPERM);
+
+		cl_env = cl_env_get(&refcheck);
+		if (IS_ERR(cl_env))
+			GOTO(out, rc = PTR_ERR(cl_env));
+
+		rc = lu_env_refill_by_tags(cl_env, ECHO_MD_CTX_TAG,
+					    ECHO_MD_SES_TAG);
+		if (rc != 0) {
+			cl_env_put(cl_env, &refcheck);
+			GOTO(out, rc);
+		}
+
+		rc = seq_client_get_seq(cl_env, ed->ed_cl_seq, &seq);
+		cl_env_put(cl_env, &refcheck);
+		if (rc < 0) {
+			CERROR("%s: Can not alloc seq: rc = %d\n",
+			       obd->obd_name, rc);
+			GOTO(out, rc);
+		}
+
+		/* bound both copies: a larger length would leak kernel stack */
+		if (data->ioc_plen1 > sizeof(seq) ||
+		    data->ioc_plen2 > sizeof(max_count))
+			GOTO(out, rc = -EINVAL);
+		/* go through "out" on failure so that env is not leaked */
+		if (copy_to_user(data->ioc_pbuf1, &seq, data->ioc_plen1) ||
+		    copy_to_user(data->ioc_pbuf2, &max_count, data->ioc_plen2))
+			GOTO(out, rc = -EFAULT);
+		GOTO(out, rc);
+	}
+	case OBD_IOC_DESTROY:
+		if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+			GOTO (out, rc = -EPERM);
+
+		rc = echo_get_object(&eco, ed, oa);
+		if (rc == 0) {
+			rc = obd_destroy(env, ec->ec_exp, oa, eco->eo_lsm,
+					 &dummy_oti, NULL, NULL);
+			if (rc == 0)
+				eco->eo_deleted = 1;
+			echo_put_object(eco);
+		}
+		GOTO(out, rc);
+
+	case OBD_IOC_GETATTR:
+		rc = echo_get_object(&eco, ed, oa);
+		if (rc == 0) {
+			struct obd_info oinfo = { { { 0 } } };
+			oinfo.oi_md = eco->eo_lsm;
+			oinfo.oi_oa = oa;
+			rc = obd_getattr(env, ec->ec_exp, &oinfo);
+			echo_put_object(eco);
+		}
+		GOTO(out, rc);
+
+	case OBD_IOC_SETATTR:
+		if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+			GOTO (out, rc = -EPERM);
+
+		rc = echo_get_object(&eco, ed, oa);
+		if (rc == 0) {
+			struct obd_info oinfo = { { { 0 } } };
+			oinfo.oi_oa = oa;
+			oinfo.oi_md = eco->eo_lsm;
+
+			rc = obd_setattr(env, ec->ec_exp, &oinfo, NULL);
+			echo_put_object(eco);
+		}
+		GOTO(out, rc);
+
+	case OBD_IOC_BRW_WRITE:
+		if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+			GOTO (out, rc = -EPERM);
+
+		rw = OBD_BRW_WRITE;
+		/* fall through */
+	case OBD_IOC_BRW_READ:
+		rc = echo_client_brw_ioctl(env, rw, exp, data, &dummy_oti);
+		GOTO(out, rc);
+
+	case ECHO_IOC_GET_STRIPE:
+		rc = echo_get_object(&eco, ed, oa);
+		if (rc == 0) {
+			rc = echo_copyout_lsm(eco->eo_lsm, data->ioc_pbuf1,
+					      data->ioc_plen1);
+			echo_put_object(eco);
+		}
+		GOTO(out, rc);
+
+	case ECHO_IOC_SET_STRIPE:
+		if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+			GOTO (out, rc = -EPERM);
+
+		if (data->ioc_pbuf1 == NULL) {  /* unset */
+			rc = echo_get_object(&eco, ed, oa);
+			if (rc == 0) {
+				eco->eo_deleted = 1;
+				echo_put_object(eco);
+			}
+		} else {
+			rc = echo_create_object(env, ed, 0, oa,
+						data->ioc_pbuf1,
+						data->ioc_plen1, &dummy_oti);
+		}
+		GOTO (out, rc);
+
+	case ECHO_IOC_ENQUEUE:
+		if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+			GOTO (out, rc = -EPERM);
+
+		rc = echo_client_enqueue(exp, oa,
+					 data->ioc_conn1, /* lock mode */
+					 data->ioc_offset,
+					 data->ioc_count);/*extent*/
+		GOTO (out, rc);
+
+	case ECHO_IOC_CANCEL:
+		rc = echo_client_cancel(exp, oa);
+		GOTO (out, rc);
+
+	default:
+		CERROR ("echo_ioctl(): unrecognised ioctl %#x\n", cmd);
+		GOTO (out, rc = -ENOTTY);
+	}
+
+	EXIT;
+out:
+	lu_env_fini(env);
+	OBD_FREE_PTR(env);
+
+	/* XXX this should be in a helper also called by target_send_reply */
+	for (ack_lock = dummy_oti.oti_ack_locks, i = 0; i < 4;
+	     i++, ack_lock++) {
+		if (!ack_lock->mode)
+			break;
+		ldlm_lock_decref(&ack_lock->lock, ack_lock->mode);
+	}
+
+	return rc;
+}
+
+/*
+ * Attach the echo client to the target OBD named by lcfg string 1.  An MDT
+ * target only gets the MD context/session tags updated; any other target
+ * is connected with obd_connect() and its export unlinked from the timed list.
+ */
+static int echo_client_setup(const struct lu_env *env,
+			     struct obd_device *obddev, struct lustre_cfg *lcfg)
+{
+	struct echo_client_obd *ec = &obddev->u.echo_client;
+	struct obd_uuid echo_uuid = { "ECHO_UUID" };
+	struct obd_connect_data *ocd = NULL;
+	struct obd_device *tgt;
+	int rc;
+	ENTRY;
+
+	if (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
+		CERROR("requires a TARGET OBD name\n");
+		RETURN(-EINVAL);
+	}
+
+	tgt = class_name2obd(lustre_cfg_string(lcfg, 1));
+	if (tgt == NULL || !tgt->obd_attached || !tgt->obd_set_up) {
+		CERROR("device not attached or not set up (%s)\n",
+		       lustre_cfg_string(lcfg, 1));
+		RETURN(-EINVAL);
+	}
+
+	ec->ec_unique = 0;
+	ec->ec_nstripes = 0;
+	spin_lock_init(&ec->ec_lock);
+	INIT_LIST_HEAD(&ec->ec_objects);
+	INIT_LIST_HEAD(&ec->ec_locks);
+
+	if (strcmp(tgt->obd_type->typ_name, LUSTRE_MDT_NAME) == 0) {
+		lu_context_tags_update(ECHO_MD_CTX_TAG);
+		lu_session_tags_update(ECHO_MD_SES_TAG);
+		RETURN(0);
+	}
+
+	OBD_ALLOC(ocd, sizeof(*ocd));
+	if (ocd == NULL) {
+		CERROR("Can't alloc ocd connecting to %s\n",
+		       lustre_cfg_string(lcfg, 1));
+		return -ENOMEM;
+	}
+
+	ocd->ocd_version = LUSTRE_VERSION_CODE;
+	ocd->ocd_group = FID_SEQ_ECHO;
+	ocd->ocd_brw_size = DT_MAX_BRW_SIZE;
+	ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_REQPORTAL |
+				 OBD_CONNECT_BRW_SIZE |
+				 OBD_CONNECT_GRANT | OBD_CONNECT_FULL20 |
+				 OBD_CONNECT_64BITHASH | OBD_CONNECT_LVB_TYPE;
+
+	rc = obd_connect(env, &ec->ec_exp, tgt, &echo_uuid, ocd, NULL);
+	OBD_FREE(ocd, sizeof(*ocd));
+	if (rc != 0) {
+		CERROR("fail to connect to device %s\n",
+		       lustre_cfg_string(lcfg, 1));
+		return (rc);
+	}
+	/* Turn off pinger because it connects to tgt obd directly. */
+	spin_lock(&tgt->obd_dev_lock);
+	list_del_init(&ec->ec_exp->exp_obd_chain_timed);
+	spin_unlock(&tgt->obd_dev_lock);
+	RETURN(rc);
+}
+
+/* Clear the MD tags or disconnect ec_exp; -EBUSY while exports remain. */
+static int echo_client_cleanup(struct obd_device *obddev)
+{
+	struct echo_client_obd *ec = &obddev->u.echo_client;
+	struct echo_device *ed = obd2echo_dev(obddev);
+	int rc;
+	ENTRY;
+
+	/* Do nothing for Metadata echo client */
+	if (!ed)
+		RETURN(0);
+
+	if (ed->ed_next_ismd) {
+		lu_context_tags_clear(ECHO_MD_CTX_TAG);
+		lu_session_tags_clear(ECHO_MD_SES_TAG);
+		RETURN(0);
+	}
+
+	if (!list_empty(&obddev->obd_exports)) {
+		CERROR("still has clients!\n");
+		RETURN(-EBUSY);
+	}
+
+	LASSERT(atomic_read(&ec->ec_exp->exp_refcount) > 0);
+	rc = obd_disconnect(ec->ec_exp);
+	if (rc)
+		CERROR("fail to disconnect device: %d\n", rc);
+	RETURN(rc);
+}
+
+/* Connect @src via class_connect() and hand back the new export in @exp. */
+static int echo_client_connect(const struct lu_env *env,
+			       struct obd_export **exp,
+			       struct obd_device *src, struct obd_uuid *cluuid,
+			       struct obd_connect_data *data, void *localdata)
+{
+	struct lustre_handle conn = { 0 };
+	int rc;
+	ENTRY;
+
+	rc = class_connect(&conn, src, cluuid);
+	if (rc != 0)
+		RETURN(rc);
+	*exp = class_conn2export(&conn);
+	RETURN(0);
+}
+
+static int echo_client_disconnect(struct obd_export *exp)
+{
+#if 0	/* locals used only by the compiled-out lock cleanup below */
+	struct obd_device      *obd;
+	struct echo_client_obd *ec;
+	struct ec_lock	 *ecl;
+#endif
+	int		     rc;
+	ENTRY;
+	/* a NULL export is rejected with -EINVAL */
+	if (exp == NULL)
+		GOTO(out, rc = -EINVAL);
+
+#if 0	/* compiled out: per-export lock cancellation on disconnect */
+	obd = exp->exp_obd;
+	ec = &obd->u.echo_client;
+
+	/* no more contention on export's lock list */
+	while (!list_empty (&exp->exp_ec_data.eced_locks)) {
+		ecl = list_entry (exp->exp_ec_data.eced_locks.next,
+				      struct ec_lock, ecl_exp_chain);
+		list_del (&ecl->ecl_exp_chain);
+
+		rc = obd_cancel(ec->ec_exp, ecl->ecl_object->eco_lsm,
+				 ecl->ecl_mode, &ecl->ecl_lock_handle);
+
+		CDEBUG (D_INFO, "Cancel lock on object "LPX64" on disconnect "
+			"(%d)\n", ecl->ecl_object->eco_id, rc);
+
+		echo_put_object (ecl->ecl_object);
+		OBD_FREE (ecl, sizeof (*ecl));
+	}
+#endif
+	/* the only live step: class_disconnect(), whose result is returned */
+	rc = class_disconnect(exp);
+	GOTO(out, rc);
+ out:
+	return rc;
+}
+
+static struct obd_ops echo_client_obd_ops = {
+	.o_owner       = THIS_MODULE,
+	/* echo_client_setup/cleanup are kept out of the method table */
+#if 0
+	.o_setup       = echo_client_setup,
+	.o_cleanup     = echo_client_cleanup,
+#endif
+	/* methods handed to class_register_type() by echo_client_init() */
+	.o_iocontrol   = echo_client_iocontrol,
+	.o_connect     = echo_client_connect,
+	.o_disconnect  = echo_client_disconnect
+};
+
+/* Init echo_caches, then register the echo client obd type. */
+int echo_client_init(void)
+{
+	struct lprocfs_static_vars lvars = { 0 };
+	int rc;
+
+	lprocfs_echo_init_vars(&lvars);
+	rc = lu_kmem_init(echo_caches);
+	if (rc != 0)
+		return rc;
+
+	rc = class_register_type(&echo_client_obd_ops, NULL, lvars.module_vars,
+				 LUSTRE_ECHO_CLIENT_NAME, &echo_device_type);
+	/* registration failed: give the caches back */
+	if (rc != 0)
+		lu_kmem_fini(echo_caches);
+	return rc;
+}
+
+void echo_client_exit(void)
+{
+	class_unregister_type(LUSTRE_ECHO_CLIENT_NAME);	/* registered last */
+	lu_kmem_fini(echo_caches);			/* created first */
+}
+
+/*
+ * Module entry point: announce the driver, check that a page holds a whole
+ * number of verification blocks and register the echo client.
+ */
+static int __init obdecho_init(void)
+{
+	int rc;
+
+	ENTRY;
+	LCONSOLE_INFO("Echo OBD driver; http://www.lustre.org/\n");
+
+	/* page debug checks step through pages in OBD_ECHO_BLOCK_SIZE units */
+	LASSERT(PAGE_CACHE_SIZE % OBD_ECHO_BLOCK_SIZE == 0);
+
+	rc = echo_client_init();
+	RETURN(rc);
+}
+
+static void /*__exit*/ obdecho_exit(void)
+{
+	echo_client_exit();
+	/* nothing else to undo: obdecho_init() only calls echo_client_init() */
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Testing Echo OBD driver");
+MODULE_LICENSE("GPL");
+
+cfs_module(obdecho, LUSTRE_VERSION_STRING, obdecho_init, obdecho_exit);
+
+/** @} echo_client */
diff --git a/drivers/staging/lustre/lustre/obdecho/echo_internal.h b/drivers/staging/lustre/lustre/obdecho/echo_internal.h
new file mode 100644
index 000000000000..8e9dbc2351e7
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdecho/echo_internal.h
@@ -0,0 +1,47 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Whamcloud, Inc.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdecho/echo_internal.h
+ */
+
+#ifndef _ECHO_INTERNAL_H
+#define _ECHO_INTERNAL_H
+
+/* The persistent object (i.e. actually stores stuff!); never debug-checked */
+#define ECHO_PERSISTENT_OBJID    1ULL
+#define ECHO_PERSISTENT_SIZE     ((__u64)(1<<20))	/* 1 MiB */
+
+/* block size to use for data verification */
+#define OBD_ECHO_BLOCK_SIZE	(4<<10)	/* 4 KiB */
+
+/* obdecho_init() asserts PAGE_CACHE_SIZE % OBD_ECHO_BLOCK_SIZE == 0 */
+#endif /* _ECHO_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/obdecho/lproc_echo.c b/drivers/staging/lustre/lustre/obdecho/lproc_echo.c
new file mode 100644
index 000000000000..e23ed32a4855
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdecho/lproc_echo.c
@@ -0,0 +1,55 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_ECHO
+
+#include <lprocfs_status.h>
+#include <obd_class.h>
+
+#ifdef LPROCFS
+static struct lprocfs_vars lprocfs_echo_obd_vars[] = {
+	{ "uuid",	 lprocfs_rd_uuid,	0, 0 },
+	{ 0 }	/* all-zero last entry; presumably the list terminator */
+};
+
+static struct lprocfs_vars lprocfs_echo_module_vars[] = {
+	{ "num_refs",     lprocfs_rd_numrefs,     0, 0 },
+	{ 0 }	/* all-zero last entry; presumably the list terminator */
+};
+
+void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars)
+{
+	lvars->obd_vars = lprocfs_echo_obd_vars;	/* "uuid" */
+	lvars->module_vars = lprocfs_echo_module_vars;	/* "num_refs" */
+}
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/osc/Makefile b/drivers/staging/lustre/lustre/osc/Makefile
new file mode 100644
index 000000000000..bbd2f7707e9f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/Makefile
@@ -0,0 +1,7 @@
+obj-$(CONFIG_LUSTRE_FS) += osc.o
+osc-y := osc_request.o lproc_osc.o osc_dev.o osc_object.o \
+	 osc_page.o osc_lock.o osc_io.o osc_quota.o osc_cache.o
+
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/osc/lproc_osc.c b/drivers/staging/lustre/lustre/osc/lproc_osc.c
new file mode 100644
index 000000000000..016ad02da297
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/lproc_osc.c
@@ -0,0 +1,715 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/version.h>
+#include <asm/statfs.h>
+#include <obd_cksum.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include <linux/seq_file.h>
+#include "osc_internal.h"
+
+#ifdef LPROCFS
+static int osc_rd_active(char *page, char **start, off_t off,
+			 int count, int *eof, void *data)
+{
+	struct obd_device *obd = data;
+	int n;
+	/* prints 1 while the import is active, 0 once imp_deactive is set */
+	LPROCFS_CLIMP_CHECK(obd);
+	n = snprintf(page, count, "%d\n", !obd->u.cli.cl_import->imp_deactive);
+	LPROCFS_CLIMP_EXIT(obd);
+	return n;
+}
+
+static int osc_wr_active(struct file *file, const char *buffer,
+			 unsigned long count, void *data)
+{
+	struct obd_device *obd = data;
+	int act, rc;
+
+	rc = lprocfs_write_helper(buffer, count, &act);
+	if (rc)
+		return rc;
+	if (act != 0 && act != 1)
+		return -ERANGE;
+
+	/* imp_deactive has the opposite sense of the value written */
+	if (obd->u.cli.cl_import->imp_deactive != act) {
+		CDEBUG(D_CONFIG, "activate %d: ignoring repeat request\n", act);
+		return count;
+	}
+	rc = ptlrpc_set_import_active(obd->u.cli.cl_import, act);
+	return count;
+}
+
+/* Print cl_max_rpcs_in_flight, sampled under cl_loi_list_lock. */
+static int osc_rd_max_rpcs_in_flight(char *page, char **start, off_t off,
+				     int count, int *eof, void *data)
+{
+	struct client_obd *cli = &((struct obd_device *)data)->u.cli;
+	int len;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	len = snprintf(page, count, "%u\n", cli->cl_max_rpcs_in_flight);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+	return len;
+}
+
+/* Set cl_max_rpcs_in_flight, populating the request pool when it grows. */
+static int osc_wr_max_rpcs_in_flight(struct file *file, const char *buffer,
+				     unsigned long count, void *data)
+{
+	struct obd_device *dev = data;
+	struct client_obd *cli = &dev->u.cli;
+	struct ptlrpc_request_pool *pool;
+	int val, rc;
+
+	rc = lprocfs_write_helper(buffer, count, &val);
+	if (rc)
+		return rc;
+	if (val < 1 || val > OSC_MAX_RIF_MAX)
+		return -ERANGE;
+
+	/* as in osc_rd_active(), use cl_import only inside the CLIMP check */
+	LPROCFS_CLIMP_CHECK(dev);
+	pool = cli->cl_import->imp_rq_pool;
+	if (pool && val > cli->cl_max_rpcs_in_flight)
+		pool->prp_populate(pool, val - cli->cl_max_rpcs_in_flight);
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	cli->cl_max_rpcs_in_flight = val;
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+	LPROCFS_CLIMP_EXIT(dev);
+	return count;
+}
+
+/* Print cl_dirty_max (bytes) in MB via lprocfs_read_frac_helper(). */
+static int osc_rd_max_dirty_mb(char *page, char **start, off_t off, int count,
+			       int *eof, void *data)
+{
+	struct obd_device *obd = data;
+	struct client_obd *cli = &obd->u.cli;
+	const int bytes_per_mb = 1 << 20;
+	long dirty_max;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	dirty_max = cli->cl_dirty_max;
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+	return lprocfs_read_frac_helper(page, count, dirty_max, bytes_per_mb);
+}
+
+/* Set cl_dirty_max from a value given in MB and wake cache waiters. */
+static int osc_wr_max_dirty_mb(struct file *file, const char *buffer,
+			       unsigned long count, void *data)
+{
+	struct client_obd *cli = &((struct obd_device *)data)->u.cli;
+	int pages_number, mult, rc;
+
+	mult = 1 << (20 - PAGE_CACHE_SHIFT);
+	rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+	if (rc)
+		return rc;
+
+	if (pages_number <= 0 ||
+	    pages_number > OSC_MAX_DIRTY_MB_MAX << (20 - PAGE_CACHE_SHIFT) ||
+	    pages_number > num_physpages / 4) /* 1/4 of RAM */
+		return -ERANGE;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	/* cast first so the shift is not evaluated in (signed) int */
+	cli->cl_dirty_max = (obd_count)pages_number << PAGE_CACHE_SHIFT;
+	osc_wake_cache_waiters(cli);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+	return count;
+}
+
+/* Print (cl_lru_in_list + cl_lru_busy) scaled to MB, and cl_lru_busy as is. */
+static int osc_rd_cached_mb(char *page, char **start, off_t off, int count,
+			    int *eof, void *data)
+{
+	struct client_obd *cli = &((struct obd_device *)data)->u.cli;
+	int shift = 20 - PAGE_CACHE_SHIFT;
+	int used;
+
+	used = atomic_read(&cli->cl_lru_in_list) +
+	       atomic_read(&cli->cl_lru_busy);
+
+	return snprintf(page, count,
+			"used_mb: %d\n"
+			"busy_cnt: %d\n",
+			used >> shift,
+			atomic_read(&cli->cl_lru_busy));
+}
+
+/*
+ * Shrink the LRU list to the size written (in MB, looked up under the
+ * "used_mb:" name); negative sizes give -ERANGE.
+ */
+static int osc_wr_cached_mb(struct file *file, const char *buffer,
+			    unsigned long count, void *data)
+{
+	struct client_obd *cli = &((struct obd_device *)data)->u.cli;
+	int target_pages, excess, rc;
+
+	buffer = lprocfs_find_named_value(buffer, "used_mb:", &count);
+	rc = lprocfs_write_frac_helper(buffer, count, &target_pages,
+				       1 << (20 - PAGE_CACHE_SHIFT));
+	if (rc)
+		return rc;
+	if (target_pages < 0)
+		return -ERANGE;
+
+	excess = atomic_read(&cli->cl_lru_in_list) - target_pages;
+	if (excess > 0)
+		(void)osc_lru_shrink(cli, excess);
+	return count;
+}
+
+/* Print cl_dirty, sampled under cl_loi_list_lock. */
+static int osc_rd_cur_dirty_bytes(char *page, char **start, off_t off,
+				  int count, int *eof, void *data)
+{
+	struct client_obd *cli = &((struct obd_device *)data)->u.cli;
+	int len;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	len = snprintf(page, count, "%lu\n", cli->cl_dirty);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+	return len;
+}
+
+/* Print cl_avail_grant, sampled under cl_loi_list_lock. */
+static int osc_rd_cur_grant_bytes(char *page, char **start, off_t off,
+				  int count, int *eof, void *data)
+{
+	struct client_obd *cli = &((struct obd_device *)data)->u.cli;
+	int len;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	len = snprintf(page, count, "%lu\n", cli->cl_avail_grant);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+	return len;
+}
+
+/* Shrink the grant to the value written if it is below cl_avail_grant. */
+static int osc_wr_cur_grant_bytes(struct file *file, const char *buffer,
+				  unsigned long count, void *data)
+{
+	struct obd_device *obd = data;
+	struct client_obd *cli = &obd->u.cli;
+	__u64 target;
+	int shrinkable, rc;
+
+	if (!obd)
+		return 0;
+
+	rc = lprocfs_write_u64_helper(buffer, count, &target);
+	if (rc)
+		return rc;
+
+	/* this is only for shrinking grant */
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	shrinkable = target < cli->cl_avail_grant;
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+	if (!shrinkable)
+		return 0;
+
+	LPROCFS_CLIMP_CHECK(obd);
+	if (cli->cl_import->imp_state == LUSTRE_IMP_FULL)
+		rc = osc_shrink_grant_to_target(cli, target);
+	LPROCFS_CLIMP_EXIT(obd);
+	if (rc)
+		return rc;
+	return count;
+}
+
+/* Print cl_lost_grant, sampled under cl_loi_list_lock. */
+static int osc_rd_cur_lost_grant_bytes(char *page, char **start, off_t off,
+				       int count, int *eof, void *data)
+{
+	struct client_obd *cli = &((struct obd_device *)data)->u.cli;
+	int len;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	len = snprintf(page, count, "%lu\n", cli->cl_lost_grant);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+	return len;
+}
+
+/* Print cl_grant_shrink_interval; returns 0 when @data is NULL. */
+static int osc_rd_grant_shrink_interval(char *page, char **start, off_t off,
+					int count, int *eof, void *data)
+{
+	struct obd_device *obd = data;
+
+	return obd == NULL ? 0 :
+	       snprintf(page, count, "%d\n",
+			obd->u.cli.cl_grant_shrink_interval);
+}
+
+/* Store a strictly positive cl_grant_shrink_interval; else -ERANGE. */
+static int osc_wr_grant_shrink_interval(struct file *file, const char *buffer,
+					unsigned long count, void *data)
+{
+	struct obd_device *obd = data;
+	int interval, rc;
+
+	if (!obd)
+		return 0;
+
+	rc = lprocfs_write_helper(buffer, count, &interval);
+	if (rc)
+		return rc;
+	if (interval < 1)
+		return -ERANGE;
+
+	obd->u.cli.cl_grant_shrink_interval = interval;
+
+	return count;
+}
+
+/* Print 1 if cl_checksum is non-zero, else 0. */
+static int osc_rd_checksum(char *page, char **start, off_t off, int count,
+			   int *eof, void *data)
+{
+	struct obd_device *obd = data;
+
+	if (!obd)
+		return 0;
+
+	return snprintf(page, count, "%d\n", !!obd->u.cli.cl_checksum);
+}
+
+/* Set cl_checksum to 1 for any non-zero value written, to 0 otherwise. */
+static int osc_wr_checksum(struct file *file, const char *buffer,
+			   unsigned long count, void *data)
+{
+	struct obd_device *obd = data;
+	int enable, rc;
+
+	if (!obd)
+		return 0;
+
+	rc = lprocfs_write_helper(buffer, count, &enable);
+	if (rc)
+		return rc;
+
+	obd->u.cli.cl_checksum = !!enable;
+	return count;
+}
+
+/* List the supported checksum types, the active one in brackets. */
+static int osc_rd_checksum_type(char *page, char **start, off_t off, int count,
+				int *eof, void *data)
+{
+	struct obd_device *obd = data;
+	int i, len = 0;
+	DECLARE_CKSUM_NAME;
+
+	if (obd == NULL)
+		return 0;
+	for (i = 0; i < ARRAY_SIZE(cksum_name) && len < count; i++) {
+		if (((1 << i) & obd->u.cli.cl_supp_cksum_types) == 0)
+			continue;
+		len += snprintf(page + len, count - len,
+				obd->u.cli.cl_cksum_type == (1 << i) ?
+				"[%s] " : "%s ", cksum_name[i]);
+	}
+	if (len < count)
+		len += snprintf(page + len, count - len, "\n");
+	/* snprintf() returns the untruncated length; report what was stored */
+	if (count > 0 && len >= count)
+		len = count - 1;
+	return len;
+}
+
+/* Select the supported checksum type named in @buffer; else -EINVAL. */
+static int osc_wd_checksum_type(struct file *file, const char *buffer,
+				unsigned long count, void *data)
+{
+	struct obd_device *obd = data;
+	char name[10];
+	int bit;
+	DECLARE_CKSUM_NAME;
+
+	if (!obd)
+		return 0;
+	if (count >= sizeof(name))
+		return -EINVAL;
+	if (copy_from_user(name, buffer, count))
+		return -EFAULT;
+	/* terminate the name, overwriting a trailing newline if present */
+	if (count > 0 && name[count - 1] == '\n')
+		name[count - 1] = '\0';
+	else
+		name[count] = '\0';
+
+	for (bit = 0; bit < ARRAY_SIZE(cksum_name); bit++) {
+		if (!((1 << bit) & obd->u.cli.cl_supp_cksum_types) ||
+		    strcmp(name, cksum_name[bit]) != 0)
+			continue;
+		obd->u.cli.cl_cksum_type = 1 << bit;
+		return count;
+	}
+	return -EINVAL;
+}
+
+/* Print the current value of the cl_resends counter. */
+static int osc_rd_resend_count(char *page, char **start, off_t off, int count,
+			       int *eof, void *data)
+{
+	struct client_obd *cli = &((struct obd_device *)data)->u.cli;
+
+	return snprintf(page, count, "%u\n", atomic_read(&cli->cl_resends));
+}
+
+/* proc write handler: store a non-negative value into cl_resends. */
+static int osc_wr_resend_count(struct file *file, const char *buffer,
+			       unsigned long count, void *data)
+{
+	struct obd_device *dev = data;
+	int resends;
+	int err = lprocfs_write_helper(buffer, count, &resends);
+
+	if (err != 0)
+		return err;
+	if (resends < 0)
+		return -EINVAL;
+
+	atomic_set(&dev->u.cli.cl_resends, resends);
+
+	return count;
+}
+
+/* proc read handler: print od_contention_time of the OSC device. */
+static int osc_rd_contention_seconds(char *page, char **start, off_t off,
+				     int count, int *eof, void *data)
+{
+	struct osc_device *osc = obd2osc_dev((struct obd_device *)data);
+
+	return snprintf(page, count, "%u\n", osc->od_contention_time);
+}
+
+/* proc write handler: let lprocfs_write_helper() fill od_contention_time. */
+static int osc_wr_contention_seconds(struct file *file, const char *buffer,
+				     unsigned long count, void *data)
+{
+	struct osc_device *od = obd2osc_dev((struct obd_device *)data);
+	int rc = lprocfs_write_helper(buffer, count, &od->od_contention_time);
+
+	return rc != 0 ? rc : count;
+}
+
+/* proc read handler: print od_lockless_truncate of the OSC device. */
+static int osc_rd_lockless_truncate(char *page, char **start, off_t off,
+				    int count, int *eof, void *data)
+{
+	struct osc_device *osc = obd2osc_dev((struct obd_device *)data);
+
+	return snprintf(page, count, "%u\n", osc->od_lockless_truncate);
+}
+
+/* proc write handler: let lprocfs_write_helper() fill od_lockless_truncate. */
+static int osc_wr_lockless_truncate(struct file *file, const char *buffer,
+				    unsigned long count, void *data)
+{
+	struct osc_device *od = obd2osc_dev((struct obd_device *)data);
+	int rc = lprocfs_write_helper(buffer, count, &od->od_lockless_truncate);
+
+	return rc != 0 ? rc : count;
+}
+
+static int osc_rd_destroys_in_flight(char *page, char **start, off_t off,
+				     int count, int *eof, void *data)
+{
+	struct obd_device *dev = data;
+	int nr = atomic_read(&dev->u.cli.cl_destroy_in_flight);
+	return snprintf(page, count, "%u\n", nr); /* the counter, one line */
+}
+
+/*
+ * proc write handler: values >= ONE_MB_BRW_SIZE are bytes and are converted
+ * to pages; the result is rounded up to a chunk multiple and range-checked.
+ */
+static int lprocfs_osc_wr_max_pages_per_rpc(struct file *file,
+	const char *buffer, unsigned long count, void *data)
+{
+	struct obd_device *dev = data;
+	struct client_obd *cli = &dev->u.cli;
+	struct obd_connect_data *ocd = &cli->cl_import->imp_connect_data;
+	__u64 pages;
+	int mask, ret;
+
+	ret = lprocfs_write_u64_helper(buffer, count, &pages);
+	if (ret != 0)
+		return ret;
+	if (pages >= ONE_MB_BRW_SIZE)
+		pages >>= PAGE_CACHE_SHIFT;
+	/* NOTE(review): ocd was taken from cl_import before this check */
+	LPROCFS_CLIMP_CHECK(dev);
+	mask = ~((1 << (cli->cl_chunkbits - PAGE_CACHE_SHIFT)) - 1);
+	pages = (pages + ~mask) & mask;
+	if (pages != 0 && pages <= ocd->ocd_brw_size >> PAGE_CACHE_SHIFT) {
+		client_obd_list_lock(&cli->cl_loi_list_lock);
+		cli->cl_max_pages_per_rpc = pages;
+		client_obd_list_unlock(&cli->cl_loi_list_lock);
+		ret = count;
+	} else {
+		ret = -ERANGE;
+	}
+	LPROCFS_CLIMP_EXIT(dev);
+	return ret;
+}
+
+static struct lprocfs_vars lprocfs_osc_obd_vars[] = {
+	{ "uuid",	    lprocfs_rd_uuid,	0, 0 },
+	{ "ping",	    0, lprocfs_wr_ping,     0, 0, 0222 },
+	{ "connect_flags",   lprocfs_rd_connect_flags, 0, 0 },
+	{ "blocksize",       lprocfs_rd_blksize,     0, 0 },
+	{ "kbytestotal",     lprocfs_rd_kbytestotal, 0, 0 },
+	{ "kbytesfree",      lprocfs_rd_kbytesfree,  0, 0 },
+	{ "kbytesavail",     lprocfs_rd_kbytesavail, 0, 0 },
+	{ "filestotal",      lprocfs_rd_filestotal,  0, 0 },
+	{ "filesfree",       lprocfs_rd_filesfree,   0, 0 },
+	/* disabled: { "filegroups", lprocfs_rd_filegroups, 0, 0 }, */
+	{ "ost_server_uuid", lprocfs_rd_server_uuid, 0, 0 },
+	{ "ost_conn_uuid",   lprocfs_rd_conn_uuid, 0, 0 },
+	{ "active",	  osc_rd_active,
+			     osc_wr_active, 0 },
+	{ "max_pages_per_rpc", lprocfs_obd_rd_max_pages_per_rpc,
+			       lprocfs_osc_wr_max_pages_per_rpc, 0 },
+	{ "max_rpcs_in_flight", osc_rd_max_rpcs_in_flight,
+				osc_wr_max_rpcs_in_flight, 0 },
+	{ "destroys_in_flight", osc_rd_destroys_in_flight, 0, 0 },
+	{ "max_dirty_mb",    osc_rd_max_dirty_mb, osc_wr_max_dirty_mb, 0 },
+	{ "osc_cached_mb",   osc_rd_cached_mb,     osc_wr_cached_mb, 0 },
+	{ "cur_dirty_bytes", osc_rd_cur_dirty_bytes, 0, 0 },
+	{ "cur_grant_bytes", osc_rd_cur_grant_bytes,
+			     osc_wr_cur_grant_bytes, 0 },
+	{ "cur_lost_grant_bytes", osc_rd_cur_lost_grant_bytes, 0, 0},
+	{ "grant_shrink_interval", osc_rd_grant_shrink_interval,
+				   osc_wr_grant_shrink_interval, 0 },
+	{ "checksums",       osc_rd_checksum, osc_wr_checksum, 0 },
+	{ "checksum_type",   osc_rd_checksum_type, osc_wd_checksum_type, 0 },
+	{ "resend_count",    osc_rd_resend_count, osc_wr_resend_count, 0},
+	{ "timeouts",	lprocfs_rd_timeouts,      0, 0 },
+	{ "contention_seconds", osc_rd_contention_seconds,
+				osc_wr_contention_seconds, 0 },
+	{ "lockless_truncate",  osc_rd_lockless_truncate,
+				osc_wr_lockless_truncate, 0 },
+	{ "import",	  lprocfs_rd_import,	lprocfs_wr_import, 0 },
+	{ "state",	   lprocfs_rd_state,	 0, 0 },
+	{ "pinger_recov",    lprocfs_rd_pinger_recov,
+			     lprocfs_wr_pinger_recov,  0, 0 },
+	{ 0 }	/* zeroed last entry; presumably the end-of-table marker */
+};
+
+static struct lprocfs_vars lprocfs_osc_module_vars[] = {
+	{ "num_refs",	lprocfs_rd_numrefs,     0, 0 },
+	{ 0 }	/* zeroed last entry; presumably the end-of-table marker */
+};
+
+#define pct(a, b) ((b) ? (a) * 100 / (b) : 0)	/* a as % of b; 0 if b == 0 */
+/* seq_file show: RPC counters plus three read/write histograms. */
+static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v)
+{
+	struct timeval now;
+	struct obd_device *dev = seq->private;
+	struct client_obd *cli = &dev->u.cli;
+	unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum;
+	int i;
+
+	do_gettimeofday(&now);
+	/* cl_loi_list_lock is held while counters and histograms are read */
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+
+	seq_printf(seq, "snapshot_time:	 %lu.%lu (secs.usecs)\n",
+		   now.tv_sec, now.tv_usec);
+	seq_printf(seq, "read RPCs in flight:  %d\n",
+		   cli->cl_r_in_flight);
+	seq_printf(seq, "write RPCs in flight: %d\n",
+		   cli->cl_w_in_flight);
+	seq_printf(seq, "pending write pages:  %d\n",
+		   atomic_read(&cli->cl_pending_w_pages));
+	seq_printf(seq, "pending read pages:   %d\n",
+		   atomic_read(&cli->cl_pending_r_pages));
+
+	seq_printf(seq, "\n\t\t\tread\t\t\twrite\n");
+	seq_printf(seq, "pages per rpc	 rpcs   %% cum %% |");
+	seq_printf(seq, "       rpcs   %% cum %%\n");
+	/* pages-per-RPC histogram: bucket i is labelled 1 << i */
+	read_tot = lprocfs_oh_sum(&cli->cl_read_page_hist);
+	write_tot = lprocfs_oh_sum(&cli->cl_write_page_hist);
+
+	read_cum = 0;
+	write_cum = 0;
+	for (i = 0; i < OBD_HIST_MAX; i++) {
+		unsigned long r = cli->cl_read_page_hist.oh_buckets[i];
+		unsigned long w = cli->cl_write_page_hist.oh_buckets[i];
+		read_cum += r;
+		write_cum += w;
+		seq_printf(seq, "%d:\t\t%10lu %3lu %3lu   | %10lu %3lu %3lu\n",
+				 1 << i, r, pct(r, read_tot),
+				 pct(read_cum, read_tot), w,
+				 pct(w, write_tot),
+				 pct(write_cum, write_tot));
+		if (read_cum == read_tot && write_cum == write_tot)
+			break;
+	}
+
+	seq_printf(seq, "\n\t\t\tread\t\t\twrite\n");
+	seq_printf(seq, "rpcs in flight	rpcs   %% cum %% |");
+	seq_printf(seq, "       rpcs   %% cum %%\n");
+	/* RPCs-in-flight histogram: bucket i is labelled i */
+	read_tot = lprocfs_oh_sum(&cli->cl_read_rpc_hist);
+	write_tot = lprocfs_oh_sum(&cli->cl_write_rpc_hist);
+
+	read_cum = 0;
+	write_cum = 0;
+	for (i = 0; i < OBD_HIST_MAX; i++) {
+		unsigned long r = cli->cl_read_rpc_hist.oh_buckets[i];
+		unsigned long w = cli->cl_write_rpc_hist.oh_buckets[i];
+		read_cum += r;
+		write_cum += w;
+		seq_printf(seq, "%d:\t\t%10lu %3lu %3lu   | %10lu %3lu %3lu\n",
+				 i, r, pct(r, read_tot),
+				 pct(read_cum, read_tot), w,
+				 pct(w, write_tot),
+				 pct(write_cum, write_tot));
+		if (read_cum == read_tot && write_cum == write_tot)
+			break;
+	}
+
+	seq_printf(seq, "\n\t\t\tread\t\t\twrite\n");
+	seq_printf(seq, "offset		rpcs   %% cum %% |");
+	seq_printf(seq, "       rpcs   %% cum %%\n");
+	/* offset histogram: bucket 0 is labelled 0, bucket i is 1 << (i - 1) */
+	read_tot = lprocfs_oh_sum(&cli->cl_read_offset_hist);
+	write_tot = lprocfs_oh_sum(&cli->cl_write_offset_hist);
+
+	read_cum = 0;
+	write_cum = 0;
+	for (i = 0; i < OBD_HIST_MAX; i++) {
+		unsigned long r = cli->cl_read_offset_hist.oh_buckets[i];
+		unsigned long w = cli->cl_write_offset_hist.oh_buckets[i];
+		read_cum += r;
+		write_cum += w;
+		seq_printf(seq, "%d:\t\t%10lu %3lu %3lu   | %10lu %3lu %3lu\n",
+			   (i == 0) ? 0 : 1 << (i - 1),
+			   r, pct(r, read_tot), pct(read_cum, read_tot),
+			   w, pct(w, write_tot), pct(write_cum, write_tot));
+		if (read_cum == read_tot && write_cum == write_tot)
+			break;
+	}
+
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+	return 0;
+}
+#undef pct
+
+/* Write handler: clear all six RPC histograms; the written data is unused. */
+static ssize_t osc_rpc_stats_seq_write(struct file *file, const char *buf,
+				       size_t len, loff_t *off)
+{
+	struct seq_file *m = file->private_data;
+	struct client_obd *client = &((struct obd_device *)m->private)->u.cli;
+
+	lprocfs_oh_clear(&client->cl_read_rpc_hist);
+	lprocfs_oh_clear(&client->cl_write_rpc_hist);
+	lprocfs_oh_clear(&client->cl_read_page_hist);
+	lprocfs_oh_clear(&client->cl_write_page_hist);
+	lprocfs_oh_clear(&client->cl_read_offset_hist);
+	lprocfs_oh_clear(&client->cl_write_offset_hist);
+
+	return len;
+}
+
+LPROC_SEQ_FOPS(osc_rpc_stats);
+
+/* seq_file show: snapshot time followed by the three lockless counters. */
+static int osc_stats_seq_show(struct seq_file *seq, void *v)
+{
+	struct obd_device *obd = seq->private;
+	struct osc_stats *st = &obd2osc_dev(obd)->od_stats;
+	struct timeval tv;
+
+	do_gettimeofday(&tv);
+	seq_printf(seq, "snapshot_time:	 %lu.%lu (secs.usecs)\n",
+		   tv.tv_sec, tv.tv_usec);
+	seq_printf(seq, "lockless_write_bytes\t\t"LPU64"\n",
+		   st->os_lockless_writes);
+	seq_printf(seq, "lockless_read_bytes\t\t"LPU64"\n",
+		   st->os_lockless_reads);
+	seq_printf(seq, "lockless_truncate\t\t"LPU64"\n",
+		   st->os_lockless_truncates);
+	return 0;
+}
+
+/* Write handler: zero the device's od_stats; the written data is unused. */
+static ssize_t osc_stats_seq_write(struct file *file, const char *buf,
+				   size_t len, loff_t *off)
+{
+	struct seq_file *m = file->private_data;
+	struct osc_device *osc = obd2osc_dev((struct obd_device *)m->private);
+
+	memset(&osc->od_stats, 0, sizeof(osc->od_stats));
+	return len;
+}
+
+LPROC_SEQ_FOPS(osc_stats);
+
+/* Create the "osc_stats" and "rpc_stats" seq files; stop at first error. */
+int lproc_osc_attach_seqstat(struct obd_device *dev)
+{
+	int rc = lprocfs_seq_create(dev->obd_proc_entry, "osc_stats", 0644,
+				    &osc_stats_fops, dev);
+
+	if (rc != 0)
+		return rc;
+
+	return lprocfs_obd_seq_create(dev, "rpc_stats", 0644,
+				      &osc_rpc_stats_fops, dev);
+}
+
+void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars)
+{
+	lvars->obd_vars = lprocfs_osc_obd_vars;		/* obd table */
+	lvars->module_vars = lprocfs_osc_module_vars;	/* module table */
+}
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c b/drivers/staging/lustre/lustre/osc/osc_cache.c
new file mode 100644
index 000000000000..206feadb7371
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_cache.c
@@ -0,0 +1,3002 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * osc cache management.
+ *
+ * Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include "osc_cl_internal.h"
+#include "osc_internal.h"
+
+static int extent_debug; /* non-zero enables the extra list/tree checks */
+
+static void osc_update_pending(struct osc_object *obj, int cmd, int delta);
+static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext,
+			   int state);
+static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli,
+			      struct osc_async_page *oap, int sent, int rc);
+static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap,
+			  int cmd);
+static int osc_refresh_count(const struct lu_env *env,
+			     struct osc_async_page *oap, int cmd);
+static int osc_io_unplug_async(const struct lu_env *env,
+			       struct client_obd *cli, struct osc_object *osc);
+static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages,
+			   unsigned int lost_grant);
+/* osc_extent_tree_dump() passes the call site's __func__/__LINE__. */
+static void osc_extent_tree_dump0(int level, struct osc_object *obj,
+				  const char *func, int line);
+#define osc_extent_tree_dump(lvl, obj) \
+	osc_extent_tree_dump0(lvl, obj, __func__, __LINE__)
+
+/** \addtogroup osc
+ *  @{
+ */
+
+/* ------------------ osc extent ------------------ */
+static inline char *ext_flags(struct osc_extent *ext, char *flags)
+{
+	int n = 0;	/* next free slot in @flags; at most 8 chars + NUL */
+	flags[n++] = ext->oe_rw ? 'r' : 'w';
+	if (ext->oe_intree)
+		flags[n++] = 'i';
+	if (ext->oe_srvlock)
+		flags[n++] = 's';
+	if (ext->oe_hp)
+		flags[n++] = 'h';
+	if (ext->oe_urgent)
+		flags[n++] = 'u';
+	if (ext->oe_memalloc)
+		flags[n++] = 'm';
+	if (ext->oe_trunc_pending)
+		flags[n++] = 't';
+	if (ext->oe_fsync_wait)
+		flags[n++] = 'Y';
+	flags[n] = '\0';
+	return flags;
+}
+
+static inline char list_empty_marker(struct list_head *list)
+{
+	return !list_empty(list) ? '+' : '-';	/* '+' means "has entries" */
+}
+
+#define EXTSTR       "[%lu -> %lu/%lu]"
+#define EXTPARA(ext) (ext)->oe_start, (ext)->oe_end, (ext)->oe_max_end
+/* Print @extent through CDEBUG at @lvl, then the caller's fmt/arguments. */
+#define OSC_EXTENT_DUMP(lvl, extent, fmt, ...) do {			      \
+	struct osc_extent *__ext = (extent);				      \
+	const char *__str[] = OES_STRINGS;				      \
+	char __buf[16];							      \
+									      \
+	CDEBUG(lvl,							      \
+		"extent %p@{" EXTSTR ", "				      \
+		"[%d|%d|%c|%s|%s|%p], [%d|%d|%c|%c|%p|%u|%p]} " fmt,	      \
+		/* part 0: pointer and start -> end/max_end */		      \
+		__ext, EXTPARA(__ext),					      \
+		/* part 1: refc|users|oe_link|state|flags|obj */	      \
+		atomic_read(&__ext->oe_refc),			      \
+		atomic_read(&__ext->oe_users),			      \
+		list_empty_marker(&__ext->oe_link),			      \
+		__str[__ext->oe_state], ext_flags(__ext, __buf),	      \
+		__ext->oe_obj,						      \
+		/* part 2: grants|pages|oe_pages|waitq|lock|mppr|owner */     \
+		__ext->oe_grants, __ext->oe_nr_pages,			      \
+		list_empty_marker(&__ext->oe_pages),			      \
+		waitqueue_active(&__ext->oe_waitq) ? '+' : '-',		      \
+		__ext->oe_osclock, __ext->oe_mppr, __ext->oe_owner,	      \
+		/* part 3: the caller's arguments for fmt */		      \
+		## __VA_ARGS__);					      \
+} while (0)
+/* If @expr is false: dump @ext and its object's tree, then LASSERT(expr). */
+#undef EASSERTF
+#define EASSERTF(expr, ext, fmt, args...) do {				\
+	if (!(expr)) {							\
+		OSC_EXTENT_DUMP(D_ERROR, (ext), fmt, ##args);		 \
+		osc_extent_tree_dump(D_ERROR, (ext)->oe_obj);		 \
+		LASSERT(expr);						\
+	}								     \
+} while (0)
+/* EASSERT: EASSERTF whose message is just a newline. */
+#undef EASSERT
+#define EASSERT(expr, ext) EASSERTF(expr, ext, "\n")
+
+/*
+ * Map an rbtree node to the osc_extent embedding it; NULL stays NULL.
+ */
+static inline struct osc_extent *rb_extent(struct rb_node *n)
+{
+	return n != NULL ? container_of(n, struct osc_extent, oe_node) : NULL;
+}
+
+/* Extent after @ext in the tree, via rb_next(); NULL in gives NULL out. */
+static inline struct osc_extent *next_extent(struct osc_extent *ext)
+{
+	if (!ext)
+		return NULL;
+	LASSERT(ext->oe_intree);
+	return rb_extent(rb_next(&ext->oe_node));
+}
+
+/* Extent before @ext in the tree, via rb_prev(); NULL in gives NULL out. */
+static inline struct osc_extent *prev_extent(struct osc_extent *ext)
+{
+	if (!ext)
+		return NULL;
+	LASSERT(ext->oe_intree);
+	return rb_extent(rb_prev(&ext->oe_node));
+}
+
+static inline struct osc_extent *first_extent(struct osc_object *obj)
+{
+	return rb_extent(rb_first(&obj->oo_root)); /* NULL for an empty tree */
+}
+
+/* Validate @ext; returns 0 or the id of the failed check. Object lock held. */
+static int osc_extent_sanity_check0(struct osc_extent *ext,
+				    const char *func, const int line)
+{
+	struct osc_object *obj = ext->oe_obj;
+	struct osc_async_page *oap;
+	int page_count;
+	int rc = 0;
+
+	if (!osc_object_is_locked(obj))
+		GOTO(out, rc = 9);
+
+	if (ext->oe_state >= OES_STATE_MAX)
+		GOTO(out, rc = 10);
+
+	if (atomic_read(&ext->oe_refc) <= 0)
+		GOTO(out, rc = 20);
+
+	if (atomic_read(&ext->oe_refc) < atomic_read(&ext->oe_users))
+		GOTO(out, rc = 30);
+
+	switch (ext->oe_state) {
+	case OES_INV:
+		if (ext->oe_nr_pages > 0 || !list_empty(&ext->oe_pages))
+			GOTO(out, rc = 35);
+		GOTO(out, rc = 0);
+		break;
+	case OES_ACTIVE:
+		if (atomic_read(&ext->oe_users) == 0)
+			GOTO(out, rc = 40);
+		if (ext->oe_hp)
+			GOTO(out, rc = 50);
+		if (ext->oe_fsync_wait && !ext->oe_urgent)
+			GOTO(out, rc = 55);
+		break;
+	case OES_CACHE:
+		if (ext->oe_grants == 0)
+			GOTO(out, rc = 60);
+		if (ext->oe_fsync_wait && !ext->oe_urgent && !ext->oe_hp)
+			GOTO(out, rc = 65);
+	default:	/* OES_CACHE falls through to this check as well */
+		if (atomic_read(&ext->oe_users) > 0)
+			GOTO(out, rc = 70);
+	}
+
+	if (ext->oe_max_end < ext->oe_end || ext->oe_end < ext->oe_start)
+		GOTO(out, rc = 80);
+
+	if (ext->oe_osclock == NULL && ext->oe_grants > 0)
+		GOTO(out, rc = 90);
+
+	if (ext->oe_osclock) {
+		struct cl_lock_descr *descr;
+		descr = &ext->oe_osclock->cll_descr;
+		if (!(descr->cld_start <= ext->oe_start &&
+		      descr->cld_end >= ext->oe_max_end))
+			GOTO(out, rc = 100);
+	}
+
+	if (ext->oe_nr_pages > ext->oe_mppr)
+		GOTO(out, rc = 105);
+
+	/* Do not verify page list if extent is in RPC. This is because an
+	 * in-RPC extent is supposed to be exclusively accessible w/o lock. */
+	if (ext->oe_state > OES_CACHE)
+		GOTO(out, rc = 0);
+
+	if (!extent_debug)
+		GOTO(out, rc = 0);
+	/* extent_debug only: walk the page list and cross-check it */
+	page_count = 0;
+	list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
+		pgoff_t index = oap2cl_page(oap)->cp_index;
+		++page_count;
+		if (index > ext->oe_end || index < ext->oe_start)
+			GOTO(out, rc = 110);
+	}
+	if (page_count != ext->oe_nr_pages)
+		GOTO(out, rc = 120);
+
+out:
+	if (rc != 0)
+		OSC_EXTENT_DUMP(D_ERROR, ext,
+				"%s:%d sanity check %p failed with rc = %d\n",
+				func, line, ext, rc);
+	return rc;
+}
+
+#define sanity_check_nolock(ext) \
+	osc_extent_sanity_check0(ext, __func__, __LINE__)
+/* As sanity_check_nolock(), but takes and drops the object lock itself. */
+#define sanity_check(ext) ({						   \
+	int __res;							     \
+	osc_object_lock((ext)->oe_obj);					\
+	__res = sanity_check_nolock(ext);				      \
+	osc_object_unlock((ext)->oe_obj);				      \
+	__res;								 \
+})
+
+
+/**
+ * Debug-only check (needs extent_debug): return 1 if any extent in @obj's
+ * tree other than @ext itself overlaps @ext's [oe_start, oe_end], else 0.
+ */
+static int osc_extent_is_overlapped(struct osc_object *obj,
+				    struct osc_extent *ext)
+{
+	struct osc_extent *it = NULL;
+
+	LASSERT(osc_object_is_locked(obj));
+	if (extent_debug)
+		it = first_extent(obj);
+
+	while (it != NULL) {
+		if (it != ext && it->oe_end >= ext->oe_start &&
+		    it->oe_start <= ext->oe_end)
+			return 1;
+		it = next_extent(it);
+	}
+
+	return 0;
+}
+
+static void osc_extent_state_set(struct osc_extent *ext, int state)
+{
+	LASSERT(osc_object_is_locked(ext->oe_obj));
+	LASSERT(state >= OES_INV && state < OES_STATE_MAX);
+
+	/* Deliberately no sanity_check_nolock() here: the extent is in the */
+	/* middle of a state change, so its invariants need not hold yet. */
+
+	/* TODO: validate the state machine */
+	ext->oe_state = state;
+	wake_up_all(&ext->oe_waitq);
+}
+
+/* Allocate an extent for @obj: one reference, no users, state OES_INV. */
+static struct osc_extent *osc_extent_alloc(struct osc_object *obj)
+{
+	struct osc_extent *ext;
+
+	OBD_SLAB_ALLOC_PTR_GFP(ext, osc_extent_kmem, GFP_IOFS);
+	if (!ext)
+		return NULL;
+
+	ext->oe_obj = obj;
+	ext->oe_state = OES_INV;
+	ext->oe_osclock = NULL;
+	atomic_set(&ext->oe_refc, 1);
+	atomic_set(&ext->oe_users, 0);
+	RB_CLEAR_NODE(&ext->oe_node);
+	INIT_LIST_HEAD(&ext->oe_link);
+	INIT_LIST_HEAD(&ext->oe_pages);
+	init_waitqueue_head(&ext->oe_waitq);
+	return ext;
+}
+
+static void osc_extent_free(struct osc_extent *ext)
+{
+	OBD_SLAB_FREE_PTR(ext, osc_extent_kmem); /* pairs with osc_extent_alloc() */
+}
+
+static struct osc_extent *osc_extent_get(struct osc_extent *ext)
+{
+	LASSERT(atomic_read(&ext->oe_refc) >= 0);
+	atomic_inc(&ext->oe_refc);	/* take one reference on @ext */
+	return ext;			/* returned for call chaining */
+}
+
+/* Drop a reference; the last one releases oe_osclock and frees @ext. */
+static void osc_extent_put(const struct lu_env *env, struct osc_extent *ext)
+{
+	LASSERT(atomic_read(&ext->oe_refc) > 0);
+	if (!atomic_dec_and_test(&ext->oe_refc))
+		return;
+	LASSERT(list_empty(&ext->oe_link));
+	LASSERT(atomic_read(&ext->oe_users) == 0);
+	LASSERT(ext->oe_state == OES_INV);
+	LASSERT(!ext->oe_intree);
+	if (ext->oe_osclock) {
+		cl_lock_put(env, ext->oe_osclock);
+		ext->oe_osclock = NULL;
+	}
+	osc_extent_free(ext);
+}
+
+/**
+ * Drop a reference that is known not to be the last one, so @ext is never
+ * freed here and no lu_env is needed (osc_extent_put() needs one for
+ * cl_lock_put()).  The caller must hold the object lock.
+ */
+static void osc_extent_put_trust(struct osc_extent *ext)
+{
+	LASSERT(atomic_read(&ext->oe_refc) > 1);
+	LASSERT(osc_object_is_locked(ext->oe_obj));
+	atomic_dec(&ext->oe_refc);
+}
+
+/**
+ * Look up @index in @obj's extent tree.  Returns the extent containing
+ * @index, else the greatest extent that ends before it, else NULL.
+ */
+static struct osc_extent *osc_extent_search(struct osc_object *obj,
+					    pgoff_t index)
+{
+	struct rb_node *node = obj->oo_root.rb_node;
+	struct osc_extent *cur, *below = NULL;
+
+	LASSERT(osc_object_is_locked(obj));
+	while (node != NULL) {
+		cur = rb_extent(node);
+		if (index < cur->oe_start) {
+			node = node->rb_left;
+			continue;
+		}
+		if (index <= cur->oe_end)
+			return cur;
+		below = cur;
+		node = node->rb_right;
+	}
+	return below;
+}
+
+/*
+ * Find the extent covering @index and return it with an extra reference,
+ * or NULL if no extent covers it.  Caller must hold the object lock.
+ */
+static struct osc_extent *osc_extent_lookup(struct osc_object *obj,
+					    pgoff_t index)
+{
+	struct osc_extent *hit = osc_extent_search(obj, index);
+
+	if (hit == NULL || index < hit->oe_start || index > hit->oe_end)
+		return NULL;
+
+	return osc_extent_get(hit);
+}
+
+/* Link @ext into @obj's tree (tree takes a ref). Caller holds object lock. */
+static void osc_extent_insert(struct osc_object *obj, struct osc_extent *ext)
+{
+	struct rb_node   **n      = &obj->oo_root.rb_node;
+	struct rb_node    *parent = NULL;
+	struct osc_extent *tmp;
+
+	LASSERT(ext->oe_intree == 0);
+	LASSERT(ext->oe_obj == obj);
+	LASSERT(osc_object_is_locked(obj));
+	while (*n != NULL) {
+		tmp = rb_extent(*n);
+		parent = *n;
+
+		if (ext->oe_end < tmp->oe_start)
+			n = &(*n)->rb_left;
+		else if (ext->oe_start > tmp->oe_end)
+			n = &(*n)->rb_right;
+		else
+			EASSERTF(0, tmp, EXTSTR, EXTPARA(ext)); /* overlap */
+	}
+	rb_link_node(&ext->oe_node, parent, n);
+	rb_insert_color(&ext->oe_node, &obj->oo_root);
+	osc_extent_get(ext);	/* reference owned by the tree */
+	ext->oe_intree = 1;
+}
+
+/* Unlink @ext from its object's tree, if linked. Caller holds object lock. */
+static void osc_extent_erase(struct osc_extent *ext)
+{
+	struct osc_object *obj = ext->oe_obj;
+
+	LASSERT(osc_object_is_locked(obj));
+	if (!ext->oe_intree)
+		return;
+	rb_erase(&ext->oe_node, &obj->oo_root);
+	ext->oe_intree = 0;
+	osc_extent_put_trust(ext);	/* drop the tree's reference */
+}
+
+static struct osc_extent *osc_extent_hold(struct osc_extent *ext)
+{
+	struct osc_object *obj = ext->oe_obj;
+	/* add a user and a ref; OES_CACHE extents turn OES_ACTIVE first */
+	LASSERT(osc_object_is_locked(obj));
+	LASSERT(ext->oe_state == OES_ACTIVE || ext->oe_state == OES_CACHE);
+	if (ext->oe_state == OES_CACHE) {
+		osc_extent_state_set(ext, OES_ACTIVE);
+		osc_update_pending(obj, OBD_BRW_WRITE, -ext->oe_nr_pages);
+	}
+	atomic_inc(&ext->oe_users);
+	list_del_init(&ext->oe_link);
+	return osc_extent_get(ext);
+}
+
+static void __osc_extent_remove(struct osc_extent *ext)
+{
+	LASSERT(osc_object_is_locked(ext->oe_obj));
+	LASSERT(list_empty(&ext->oe_pages));
+	osc_extent_erase(ext);		/* out of the tree, tree ref dropped */
+	list_del_init(&ext->oe_link);
+	osc_extent_state_set(ext, OES_INV);	/* also wakes oe_waitq */
+	OSC_EXTENT_DUMP(D_CACHE, ext, "destroyed.\n");
+}
+
+static void osc_extent_remove(struct osc_extent *ext)
+{
+	struct osc_object *obj = ext->oe_obj;
+	/* __osc_extent_remove() asserts the object lock: take it here */
+	osc_object_lock(obj);
+	__osc_extent_remove(ext);
+	osc_object_unlock(obj);
+}
+
+/**
+ * Merge @victim into @cur when both are cached, share oe_max_end and are
+ * adjacent at chunk level.  Returns 0, or -EINVAL/-EBUSY/-ERANGE if not.
+ */
+static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur,
+			    struct osc_extent *victim)
+{
+	struct osc_object *obj = cur->oe_obj;
+	pgoff_t chunk_start;
+	pgoff_t chunk_end;
+	int ppc_bits;
+
+	LASSERT(cur->oe_state == OES_CACHE);
+	LASSERT(osc_object_is_locked(obj));
+	if (victim == NULL)
+		return -EINVAL;
+
+	if (victim->oe_state != OES_CACHE || victim->oe_fsync_wait)
+		return -EBUSY;
+
+	if (cur->oe_max_end != victim->oe_max_end)
+		return -ERANGE;
+
+	LASSERT(cur->oe_osclock == victim->oe_osclock);
+	ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_CACHE_SHIFT;
+	chunk_start = cur->oe_start >> ppc_bits;
+	chunk_end   = cur->oe_end   >> ppc_bits;
+	if (chunk_start   != (victim->oe_end >> ppc_bits) + 1 &&
+	    chunk_end + 1 != victim->oe_start >> ppc_bits)
+		return -ERANGE;
+
+	OSC_EXTENT_DUMP(D_CACHE, victim, "will be merged by %p.\n", cur);
+
+	cur->oe_start     = min(cur->oe_start, victim->oe_start);
+	cur->oe_end       = max(cur->oe_end,   victim->oe_end);
+	cur->oe_grants   += victim->oe_grants;
+	cur->oe_nr_pages += victim->oe_nr_pages;
+	/* only the following bits are needed to merge */
+	cur->oe_urgent   |= victim->oe_urgent;
+	cur->oe_memalloc |= victim->oe_memalloc;
+	list_splice_init(&victim->oe_pages, &cur->oe_pages);
+	list_del_init(&victim->oe_link);
+	victim->oe_nr_pages = 0;
+	/* extra ref: erase uses put_trust(), which asserts refc > 1 */
+	osc_extent_get(victim);
+	__osc_extent_remove(victim);
+	osc_extent_put(env, victim);
+
+	OSC_EXTENT_DUMP(D_CACHE, cur, "after merging %p.\n", victim);
+	return 0;
+}
+
+/**
+ * Drop one user of @ext; when the last one goes, unplug IO asynchronously.
+ */
+int osc_extent_release(const struct lu_env *env, struct osc_extent *ext)
+{
+	struct osc_object *obj = ext->oe_obj;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(atomic_read(&ext->oe_users) > 0);
+	LASSERT(sanity_check(ext) == 0);
+	LASSERT(ext->oe_grants > 0);
+	/* last user: takes oo_lock, then moves @ext to OES_TRUNC or OES_CACHE */
+	if (atomic_dec_and_lock(&ext->oe_users, &obj->oo_lock)) {
+		LASSERT(ext->oe_state == OES_ACTIVE);
+		if (ext->oe_trunc_pending) {
+			/* a truncate process is waiting for this extent.
+			 * This may happen due to a race, check
+			 * osc_cache_truncate_start(). */
+			osc_extent_state_set(ext, OES_TRUNC);
+			ext->oe_trunc_pending = 0;
+		} else {
+			osc_extent_state_set(ext, OES_CACHE);
+			osc_update_pending(obj, OBD_BRW_WRITE,
+					   ext->oe_nr_pages);
+
+			/* try to merge the previous and next extent. */
+			osc_extent_merge(env, ext, prev_extent(ext));
+			osc_extent_merge(env, ext, next_extent(ext));
+
+			if (ext->oe_urgent)
+				list_move_tail(&ext->oe_link,
+						   &obj->oo_urgent_exts);
+		}
+		osc_object_unlock(obj);
+
+		osc_io_unplug_async(env, osc_cli(obj), obj);
+	}
+	osc_extent_put(env, ext);
+	RETURN(rc);
+}
+
+static inline int overlapped(struct osc_extent *ex1, struct osc_extent *ex2)
+{
+	return ex1->oe_end >= ex2->oe_start && ex2->oe_end >= ex1->oe_start;
+}
+
+/**
+ * Find, extend or create the extent covering @index and return it held, or
+ * an ERR_PTR.  *@grants is adjusted by the grant the extent takes.
+ */
+struct osc_extent *osc_extent_find(const struct lu_env *env,
+				   struct osc_object *obj, pgoff_t index,
+				   int *grants)
+
+{
+	struct client_obd *cli = osc_cli(obj);
+	struct cl_lock    *lock;
+	struct osc_extent *cur;
+	struct osc_extent *ext;
+	struct osc_extent *conflict = NULL;
+	struct osc_extent *found = NULL;
+	pgoff_t    chunk;
+	pgoff_t    max_end;
+	int	max_pages; /* max_pages_per_rpc */
+	int	chunksize;
+	int	ppc_bits; /* pages per chunk bits */
+	int	chunk_mask;
+	int	rc;
+	ENTRY;
+
+	cur = osc_extent_alloc(obj);
+	if (cur == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	lock = cl_lock_at_pgoff(env, osc2cl(obj), index, NULL, 1, 0);
+	LASSERT(lock != NULL);
+	LASSERT(lock->cll_descr.cld_mode >= CLM_WRITE);
+
+	LASSERT(cli->cl_chunkbits >= PAGE_CACHE_SHIFT);
+	ppc_bits   = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
+	chunk_mask = ~((1 << ppc_bits) - 1);
+	chunksize  = 1 << cli->cl_chunkbits;
+	chunk      = index >> ppc_bits;
+
+	/* align end to rpc edge, rpc size may not be a power 2 integer. */
+	max_pages = cli->cl_max_pages_per_rpc;
+	LASSERT((max_pages & ~chunk_mask) == 0);
+	max_end = index - (index % max_pages) + max_pages - 1;
+	max_end = min_t(pgoff_t, max_end, lock->cll_descr.cld_end);
+
+	/* initialize new extent by parameters so far */
+	cur->oe_max_end = max_end;
+	cur->oe_start   = index & chunk_mask;
+	cur->oe_end     = ((index + ~chunk_mask + 1) & chunk_mask) - 1;
+	if (cur->oe_start < lock->cll_descr.cld_start)
+		cur->oe_start = lock->cll_descr.cld_start;
+	if (cur->oe_end > max_end)
+		cur->oe_end = max_end;
+	cur->oe_osclock = lock;
+	cur->oe_grants  = 0;
+	cur->oe_mppr    = max_pages;
+
+	/* grants has been allocated by caller */
+	LASSERTF(*grants >= chunksize + cli->cl_extent_tax,
+		 "%u/%u/%u.\n", *grants, chunksize, cli->cl_extent_tax);
+	LASSERTF((max_end - cur->oe_start) < max_pages, EXTSTR, EXTPARA(cur));
+
+restart:
+	osc_object_lock(obj);
+	ext = osc_extent_search(obj, cur->oe_start);
+	if (ext == NULL)
+		ext = first_extent(obj);
+	while (ext != NULL) {
+		loff_t ext_chk_start = ext->oe_start >> ppc_bits;
+		loff_t ext_chk_end   = ext->oe_end   >> ppc_bits;
+
+		LASSERT(sanity_check_nolock(ext) == 0);
+		if (chunk > ext_chk_end + 1)
+			break;
+
+		/* if covering by different locks, no chance to match */
+		if (lock != ext->oe_osclock) {
+			EASSERTF(!overlapped(ext, cur), ext,
+				 EXTSTR, EXTPARA(cur));
+
+			ext = next_extent(ext);
+			continue;
+		}
+
+		/* discontiguous chunks? */
+		if (chunk + 1 < ext_chk_start) {
+			ext = next_extent(ext);
+			continue;
+		}
+
+		/* ok, from now on, ext and cur have these attrs:
+		 * 1. covered by the same lock
+		 * 2. contiguous at chunk level or overlapping. */
+
+		if (overlapped(ext, cur)) {
+			/* cur is the minimum unit, so overlapping means
+			 * full contain. */
+			EASSERTF((ext->oe_start <= cur->oe_start &&
+				  ext->oe_end >= cur->oe_end),
+				 ext, EXTSTR, EXTPARA(cur));
+
+			if (ext->oe_state > OES_CACHE || ext->oe_fsync_wait) {
+				/* for simplicity, we wait for this extent to
+				 * finish before going forward. */
+				conflict = osc_extent_get(ext);
+				break;
+			}
+
+			found = osc_extent_hold(ext);
+			break;
+		}
+
+		/* non-overlapped extent */
+		if (ext->oe_state != OES_CACHE || ext->oe_fsync_wait) {
+			/* we can't do anything for a non OES_CACHE extent, or
+			 * if there is someone waiting for this extent to be
+			 * flushed, try next one. */
+			ext = next_extent(ext);
+			continue;
+		}
+
+		/* check if they belong to the same rpc slot before trying to
+		 * merge. the extents are not overlapped and contiguous at
+		 * chunk level to get here. */
+		if (ext->oe_max_end != max_end) {
+			/* if they don't belong to the same RPC slot or
+			 * max_pages_per_rpc has ever changed, do not merge. */
+			ext = next_extent(ext);
+			continue;
+		}
+
+		/* it's required that an extent must be contiguous at chunk
+		 * level so that we know the whole extent is covered by grant
+		 * (the pages in the extent are NOT required to be contiguous).
+		 * Otherwise, it will be too much difficult to know which
+		 * chunks have grants allocated. */
+
+		/* try to do front merge - extend ext's start */
+		if (chunk + 1 == ext_chk_start) {
+			/* ext must be chunk size aligned */
+			EASSERT((ext->oe_start & ~chunk_mask) == 0, ext);
+
+			/* pull ext's start back to cover cur */
+			ext->oe_start   = cur->oe_start;
+			ext->oe_grants += chunksize;
+			*grants -= chunksize;
+
+			found = osc_extent_hold(ext);
+		} else if (chunk == ext_chk_end + 1) {
+			/* rear merge */
+			ext->oe_end     = cur->oe_end;
+			ext->oe_grants += chunksize;
+			*grants -= chunksize;
+
+			/* try to merge with the next one because we just fill
+			 * in a gap */
+			if (osc_extent_merge(env, ext, next_extent(ext)) == 0)
+				/* we can save extent tax from next extent */
+				*grants += cli->cl_extent_tax;
+
+			found = osc_extent_hold(ext);
+		}
+		if (found != NULL)
+			break;
+
+		ext = next_extent(ext);
+	}
+
+	osc_extent_tree_dump(D_CACHE, obj);
+	if (found != NULL) {
+		LASSERT(conflict == NULL);
+		if (!IS_ERR(found)) {
+			LASSERT(found->oe_osclock == cur->oe_osclock);
+			OSC_EXTENT_DUMP(D_CACHE, found,
+					"found caching ext for %lu.\n", index);
+		}
+	} else if (conflict == NULL) {
+		/* create a new extent */
+		EASSERT(osc_extent_is_overlapped(obj, cur) == 0, cur);
+		cur->oe_grants = chunksize + cli->cl_extent_tax;
+		*grants -= cur->oe_grants;
+		LASSERT(*grants >= 0);
+
+		cur->oe_state = OES_CACHE;
+		found = osc_extent_hold(cur);
+		osc_extent_insert(obj, cur);
+		OSC_EXTENT_DUMP(D_CACHE, cur, "add into tree %lu/%lu.\n",
+				index, lock->cll_descr.cld_end);
+	}
+	osc_object_unlock(obj);
+
+	if (conflict != NULL) {
+		LASSERT(found == NULL);
+
+		/* waiting for IO to finish. Please notice that it's impossible
+		 * to be an OES_TRUNC extent. */
+		rc = osc_extent_wait(env, conflict, OES_INV);
+		osc_extent_put(env, conflict);
+		conflict = NULL;
+		if (rc < 0)
+			GOTO(out, found = ERR_PTR(rc));
+
+		goto restart;
+	}
+	EXIT;
+
+out:
+	osc_extent_put(env, cur);
+	LASSERT(*grants >= 0);
+	return found;
+}
+
+/**
+ * Called when IO is finished to an extent.
+ *
+ * Completes every page still linked on the extent, hands the extent's grant
+ * accounting to osc_free_grant() (only if the extent holds any grant),
+ * removes the extent and drops the reference that was taken for the RPC.
+ *
+ * \param env   execution environment passed on to osc_ap_completion()
+ * \param ext   the extent whose IO has finished
+ * \param sent  zero if the RPC was never sent; in that case the whole of
+ *              ext->oe_grants is reported as lost grant
+ * \param rc    IO result; stored in ext->oe_rc (the page count is stored
+ *              instead when \a rc is 0)
+ *
+ * \retval 0 always
+ */
+int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
+		      int sent, int rc)
+{
+	struct client_obd *cli = osc_cli(ext->oe_obj);
+	struct osc_async_page *oap;
+	struct osc_async_page *tmp;
+	int nr_pages = ext->oe_nr_pages;
+	int lost_grant = 0;
+	int blocksize = cli->cl_import->imp_obd->obd_osfs.os_bsize ? : 4096;
+	__u64 last_off = 0;
+	int last_count = -1;
+	int last_page_off = 0;
+	ENTRY;
+
+	OSC_EXTENT_DUMP(D_CACHE, ext, "extent finished.\n");
+
+	ext->oe_rc = rc ?: ext->oe_nr_pages;
+	EASSERT(ergo(rc == 0, ext->oe_state == OES_RPC), ext);
+	list_for_each_entry_safe(oap, tmp, &ext->oe_pages,
+				     oap_pending_item) {
+		list_del_init(&oap->oap_rpc_item);
+		list_del_init(&oap->oap_pending_item);
+		/* Remember the geometry of the page with the highest object
+		 * offset now: once the loop is over @oap no longer refers to
+		 * a page, so it must not be dereferenced below. */
+		if (last_off <= oap->oap_obj_off) {
+			last_off = oap->oap_obj_off;
+			last_count = oap->oap_count;
+			last_page_off = oap->oap_page_off;
+		}
+
+		--ext->oe_nr_pages;
+		osc_ap_completion(env, cli, oap, sent, rc);
+	}
+	EASSERT(ext->oe_nr_pages == 0, ext);
+
+	if (!sent) {
+		lost_grant = ext->oe_grants;
+	} else if (blocksize < PAGE_CACHE_SIZE &&
+		   last_count != PAGE_CACHE_SIZE) {
+		/* For short writes we shouldn't count parts of pages that
+		 * span a whole chunk on the OST side, or our accounting goes
+		 * wrong.  Should match the code in filter_grant_check. */
+		int offset = last_page_off & ~CFS_PAGE_MASK;
+		int count = last_count + (offset & (blocksize - 1));
+		int end = (offset + last_count) & (blocksize - 1);
+		if (end)
+			count += blocksize - end;
+
+		lost_grant = PAGE_CACHE_SIZE - count;
+	}
+	if (ext->oe_grants > 0)
+		osc_free_grant(cli, nr_pages, lost_grant);
+
+	osc_extent_remove(ext);
+	/* put the refcount for RPC */
+	osc_extent_put(env, ext);
+	RETURN(0);
+}
+
+/* Wait condition used by osc_extent_wait(): samples ext->oe_state while
+ * holding the object lock and reports whether it equals @state. */
+static int extent_wait_cb(struct osc_extent *ext, int state)
+{
+	struct osc_object *obj = ext->oe_obj;
+	int reached;
+
+	osc_object_lock(obj);
+	reached = (ext->oe_state == state);
+	osc_object_unlock(obj);
+
+	return reached;
+}
+
+/**
+ * Wait for the extent's state to become @state.
+ *
+ * When the caller waits for OES_INV and the extent is neither urgent nor
+ * high priority yet, the extent is first marked urgent so that it gets
+ * written out. The first wait is bounded by a 600 second timeout; if that
+ * expires a message is logged and the wait is repeated without a timeout.
+ *
+ * Returns the negative result of the wait if it failed, otherwise
+ * ext->oe_rc when that is negative, otherwise 0.
+ */
+static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext,
+			   int state)
+{
+	struct osc_object *obj = ext->oe_obj;
+	struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(600), NULL,
+						  LWI_ON_SIGNAL_NOOP, NULL);
+	int rc = 0;
+	ENTRY;
+
+	osc_object_lock(obj);
+	LASSERT(sanity_check_nolock(ext) == 0);
+	/* `Kick' this extent only if the caller is waiting for it to be
+	 * written out. */
+	if (state == OES_INV && !ext->oe_urgent && !ext->oe_hp) {
+		if (ext->oe_state == OES_ACTIVE) {
+			ext->oe_urgent = 1;
+		} else if (ext->oe_state == OES_CACHE) {
+			ext->oe_urgent = 1;
+			osc_extent_hold(ext);
+			/* rc == 1 is a local marker: the hold taken above
+			 * has to be released once the lock is dropped. */
+			rc = 1;
+		}
+	}
+	osc_object_unlock(obj);
+	if (rc == 1)
+		osc_extent_release(env, ext);
+
+	/* wait for the extent until its state becomes @state */
+	rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), &lwi);
+	if (rc == -ETIMEDOUT) {
+		OSC_EXTENT_DUMP(D_ERROR, ext,
+			"%s: wait ext to %d timedout, recovery in progress?\n",
+			osc_export(obj)->exp_obd->obd_name, state);
+
+		/* second attempt: same condition, but no timeout this time */
+		lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+		rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state),
+				  &lwi);
+	}
+	/* the wait itself succeeded: report the IO result of the extent */
+	if (rc == 0 && ext->oe_rc < 0)
+		rc = ext->oe_rc;
+	RETURN(rc);
+}
+
+/**
+ * Discard the pages of @ext that lie beyond @trunc_index and give back the
+ * grant of the chunks that no longer hold any page.
+ *
+ * Pages with an index below @trunc_index are kept. The page at @trunc_index
+ * itself is kept only when @partial is set; otherwise it is discarded along
+ * with every page after it.
+ *
+ * The extent must be in OES_TRUNC state and must not be urgent.
+ *
+ * Returns 0 on success or the negative result of cl_io_init().
+ */
+static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,
+				bool partial)
+{
+	struct cl_env_nest     nest;
+	struct lu_env	 *env;
+	struct cl_io	  *io;
+	struct osc_object     *obj = ext->oe_obj;
+	struct client_obd     *cli = osc_cli(obj);
+	struct osc_async_page *oap;
+	struct osc_async_page *tmp;
+	int		    pages_in_chunk = 0;
+	int		    ppc_bits    = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
+	__u64		  trunc_chunk = trunc_index >> ppc_bits;
+	int		    grants   = 0;
+	int		    nr_pages = 0;
+	int		    rc       = 0;
+	ENTRY;
+
+	LASSERT(sanity_check(ext) == 0);
+	LASSERT(ext->oe_state == OES_TRUNC);
+	LASSERT(!ext->oe_urgent);
+
+	/* Request new lu_env.
+	 * We can't use that env from osc_cache_truncate_start() because
+	 * it's from lov_io_sub and not fully initialized. */
+	env = cl_env_nested_get(&nest);
+	io  = &osc_env_info(env)->oti_io;
+	io->ci_obj = cl_object_top(osc2cl(obj));
+	rc = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+	if (rc < 0)
+		GOTO(out, rc);
+
+	/* discard all pages with index greater than trunc_index */
+	list_for_each_entry_safe(oap, tmp, &ext->oe_pages,
+				     oap_pending_item) {
+		struct cl_page  *sub  = oap2cl_page(oap);
+		struct cl_page  *page = cl_page_top(sub);
+
+		LASSERT(list_empty(&oap->oap_rpc_item));
+
+		/* only discard the pages with their index greater than
+		 * trunc_index, and ... */
+		if (sub->cp_index < trunc_index ||
+		    (sub->cp_index == trunc_index && partial)) {
+			/* accounting how many pages remaining in the chunk
+			 * so that we can calculate grants correctly. */
+			if (sub->cp_index >> ppc_bits == trunc_chunk)
+				++pages_in_chunk;
+			continue;
+		}
+
+		list_del_init(&oap->oap_pending_item);
+
+		/* pin the top page while it is owned and discarded */
+		cl_page_get(page);
+		lu_ref_add(&page->cp_reference, "truncate", current);
+
+		if (cl_page_own(env, io, page) == 0) {
+			cl_page_unmap(env, io, page);
+			cl_page_discard(env, io, page);
+			cl_page_disown(env, io, page);
+		} else {
+			/* failing to own the page is treated as fatal */
+			LASSERT(page->cp_state == CPS_FREEING);
+			LASSERT(0);
+		}
+
+		lu_ref_del(&page->cp_reference, "truncate", current);
+		cl_page_put(env, page);
+
+		--ext->oe_nr_pages;
+		++nr_pages;
+	}
+	/* an extent that starts at or beyond the cut must now be empty */
+	EASSERTF(ergo(ext->oe_start >= trunc_index + !!partial,
+		      ext->oe_nr_pages == 0),
+		ext, "trunc_index %lu, partial %d\n", trunc_index, partial);
+
+	osc_object_lock(obj);
+	if (ext->oe_nr_pages == 0) {
+		/* nothing left: every grant held by the extent is freed */
+		LASSERT(pages_in_chunk == 0);
+		grants = ext->oe_grants;
+		ext->oe_grants = 0;
+	} else { /* calculate how many grants we can free */
+		int     chunks = (ext->oe_end >> ppc_bits) - trunc_chunk;
+		pgoff_t last_index;
+
+
+		/* if there is no pages in this chunk, we can also free grants
+		 * for the last chunk */
+		if (pages_in_chunk == 0) {
+			/* if this is the 1st chunk and no pages in this chunk,
+			 * ext->oe_nr_pages must be zero, so we should be in
+			 * the other if-clause. */
+			LASSERT(trunc_chunk > 0);
+			--trunc_chunk;
+			++chunks;
+		}
+
+		/* this is what we can free from this extent */
+		grants	  = chunks << cli->cl_chunkbits;
+		ext->oe_grants -= grants;
+		/* shrink the extent so that it ends with the last kept chunk */
+		last_index      = ((trunc_chunk + 1) << ppc_bits) - 1;
+		ext->oe_end     = min(last_index, ext->oe_max_end);
+		LASSERT(ext->oe_end >= ext->oe_start);
+		LASSERT(ext->oe_grants > 0);
+	}
+	osc_object_unlock(obj);
+
+	if (grants > 0 || nr_pages > 0)
+		osc_free_grant(cli, nr_pages, grants);
+
+out:
+	/* reached on the cl_io_init() failure path as well */
+	cl_io_fini(env, io);
+	cl_env_nested_put(&nest, env);
+	RETURN(rc);
+}
+
+/**
+ * This function is used to make the extent prepared for transfer.
+ * A race with flushing page - ll_writepage() has to be handled cautiously.
+ *
+ * Every page of the extent is made ready for a write, the byte counts of all
+ * pages are fixed, the extent is moved to OES_RPC state and an extra
+ * reference is taken on it for the RPC.
+ *
+ * The extent must be in OES_LOCKING state and must have an owner.
+ * Returns 0.
+ */
+static int osc_extent_make_ready(const struct lu_env *env,
+				 struct osc_extent *ext)
+{
+	struct osc_async_page *oap;
+	struct osc_async_page *last = NULL;
+	struct osc_object *obj = ext->oe_obj;
+	int page_count = 0;
+	int rc;
+	ENTRY;
+
+	/* we're going to grab page lock, so object lock must not be taken. */
+	LASSERT(sanity_check(ext) == 0);
+	/* in locking state, any process should not touch this extent. */
+	EASSERT(ext->oe_state == OES_LOCKING, ext);
+	EASSERT(ext->oe_owner != NULL, ext);
+
+	OSC_EXTENT_DUMP(D_CACHE, ext, "make ready\n");
+
+	list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
+		++page_count;
+		/* track the page with the highest object offset */
+		if (last == NULL || last->oap_obj_off < oap->oap_obj_off)
+			last = oap;
+
+		/* checking ASYNC_READY is race safe */
+		if ((oap->oap_async_flags & ASYNC_READY) != 0)
+			continue;
+
+		rc = osc_make_ready(env, oap, OBD_BRW_WRITE);
+		switch (rc) {
+		case 0:
+			spin_lock(&oap->oap_lock);
+			oap->oap_async_flags |= ASYNC_READY;
+			spin_unlock(&oap->oap_lock);
+			break;
+		case -EALREADY:
+			LASSERT((oap->oap_async_flags & ASYNC_READY) != 0);
+			break;
+		default:
+			/* any other result is treated as a fatal error */
+			LASSERTF(0, "unknown return code: %d\n", rc);
+		}
+	}
+
+	LASSERT(page_count == ext->oe_nr_pages);
+	LASSERT(last != NULL);
+	/* the last page is the only one we need to refresh its count by
+	 * the size of file. */
+	if (!(last->oap_async_flags & ASYNC_COUNT_STABLE)) {
+		last->oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE);
+		LASSERT(last->oap_count > 0);
+		LASSERT(last->oap_page_off + last->oap_count <= PAGE_CACHE_SIZE);
+		last->oap_async_flags |= ASYNC_COUNT_STABLE;
+	}
+
+	/* for the rest of pages, we don't need to call osc_refresh_count()
+	 * because it's known they are not the last page */
+	list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
+		if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) {
+			oap->oap_count = PAGE_CACHE_SIZE - oap->oap_page_off;
+			oap->oap_async_flags |= ASYNC_COUNT_STABLE;
+		}
+	}
+
+	osc_object_lock(obj);
+	osc_extent_state_set(ext, OES_RPC);
+	osc_object_unlock(obj);
+	/* get a refcount for RPC. */
+	osc_extent_get(ext);
+
+	RETURN(0);
+}
+
+/**
+ * Lightweight alternative to osc_extent_find() for growing @ext while the
+ * same IO keeps adding pages. It only succeeds when @index lies in the chunk
+ * that currently ends the extent, or in the chunk right after it.
+ *
+ * Returns 0 when @index is covered (possibly after growing the extent by one
+ * chunk, which moves one chunk worth of grant from *@grants into the
+ * extent), -ERANGE when @index is more than one chunk past the end, and
+ * -EAGAIN when growing would reach the next extent in the tree.
+ */
+static int osc_extent_expand(struct osc_extent *ext, pgoff_t index, int *grants)
+{
+	struct osc_object *obj = ext->oe_obj;
+	struct client_obd *cli = osc_cli(obj);
+	struct osc_extent *neighbour;
+	int shift = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
+	int chunk_bytes = 1 << cli->cl_chunkbits;
+	pgoff_t chunk = index >> shift;
+	pgoff_t end_chunk;
+	pgoff_t chunk_last;
+	pgoff_t new_end;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(ext->oe_max_end >= index && ext->oe_start <= index);
+	osc_object_lock(obj);
+	LASSERT(sanity_check_nolock(ext) == 0);
+	end_chunk = ext->oe_end >> shift;
+
+	/* too far away: more than one chunk beyond the current end */
+	if (chunk > end_chunk + 1)
+		GOTO(out, rc = -ERANGE);
+
+	/* already inside the extent: nothing to do */
+	if (end_chunk >= chunk)
+		GOTO(out, rc = 0);
+
+	LASSERT(end_chunk + 1 == chunk);
+
+	/* candidate new end: last page of the next chunk, capped by the
+	 * extent's maximum end */
+	chunk_last = ((chunk + 1) << shift) - 1;
+	new_end = min(ext->oe_max_end, chunk_last);
+
+	/* growing into the following extent is left to osc_extent_find() */
+	neighbour = next_extent(ext);
+	if (neighbour != NULL && neighbour->oe_start <= new_end)
+		GOTO(out, rc = -EAGAIN);
+
+	/* take one chunk of grant from the caller and extend the extent */
+	*grants -= chunk_bytes;
+	ext->oe_grants += chunk_bytes;
+	ext->oe_end = new_end;
+	LASSERT(*grants >= 0);
+	EASSERTF(osc_extent_is_overlapped(obj, ext) == 0, ext,
+		 "overlapped after expanding for %lu.\n", index);
+	EXIT;
+
+out:
+	osc_object_unlock(obj);
+	RETURN(rc);
+}
+
+/* Debug helper: logs at @level every extent of @obj found in the tree and
+ * on the hp, urgent and reading lists, tagging the header line with @func
+ * and @line. No object lock is taken inside this function. */
+static void osc_extent_tree_dump0(int level, struct osc_object *obj,
+				  const char *func, int line)
+{
+	struct osc_extent *ext;
+	int seq = 1;
+
+	CDEBUG(level, "Dump object %p extents at %s:%d, mppr: %u.\n",
+	       obj, func, line, osc_cli(obj)->cl_max_pages_per_rpc);
+
+	/* extents in the tree, numbered from 1 */
+	ext = first_extent(obj);
+	while (ext != NULL) {
+		OSC_EXTENT_DUMP(level, ext, "in tree %d.\n", seq++);
+		ext = next_extent(ext);
+	}
+
+	/* each list below restarts the numbering at 1 */
+	seq = 1;
+	list_for_each_entry(ext, &obj->oo_hp_exts, oe_link)
+		OSC_EXTENT_DUMP(level, ext, "hp %d.\n", seq++);
+
+	seq = 1;
+	list_for_each_entry(ext, &obj->oo_urgent_exts, oe_link)
+		OSC_EXTENT_DUMP(level, ext, "urgent %d.\n", seq++);
+
+	seq = 1;
+	list_for_each_entry(ext, &obj->oo_reading_exts, oe_link)
+		OSC_EXTENT_DUMP(level, ext, "reading %d.\n", seq++);
+}
+
+/* ------------------ osc extent end ------------------ */
+
+/* Returns non-zero when @osc is linked on either the ready list or the
+ * high-priority ready list. */
+static inline int osc_is_ready(struct osc_object *osc)
+{
+	if (!list_empty(&osc->oo_ready_item))
+		return 1;
+
+	return !list_empty(&osc->oo_hp_ready_item);
+}
+
+/* Emit a D_CACHE debug line describing the IO state of @OSC: whether it is
+ * ready plus the markers of its hp-ready and ready items, the pending write
+ * count plus the markers of the hp and urgent extent lists, and the pending
+ * read count plus the marker of the reading extent list. The caller's format
+ * @STR and its arguments are appended to that line. */
+#define OSC_IO_DEBUG(OSC, STR, args...)					       \
+	CDEBUG(D_CACHE, "obj %p ready %d|%c|%c wr %d|%c|%c rd %d|%c " STR,     \
+	       (OSC), osc_is_ready(OSC),				       \
+	       list_empty_marker(&(OSC)->oo_hp_ready_item),		       \
+	       list_empty_marker(&(OSC)->oo_ready_item),		       \
+	       atomic_read(&(OSC)->oo_nr_writes),			       \
+	       list_empty_marker(&(OSC)->oo_hp_exts),			       \
+	       list_empty_marker(&(OSC)->oo_urgent_exts),		       \
+	       atomic_read(&(OSC)->oo_nr_reads),			       \
+	       list_empty_marker(&(OSC)->oo_reading_exts),		       \
+	       ##args)
+
+/* Prepare the top-level cl_page behind @oap for a write transfer. Only
+ * OBD_BRW_WRITE is accepted as @cmd. On success the submit time of the osc
+ * page is stamped; the result of cl_page_make_ready() is returned. */
+static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap,
+			  int cmd)
+{
+	struct osc_page *opg  = oap2osc_page(oap);
+	struct cl_page  *top  = cl_page_top(oap2cl_page(oap));
+	int rc;
+
+	LASSERT(cmd == OBD_BRW_WRITE); /* no cached reads */
+
+	ENTRY;
+	rc = cl_page_make_ready(env, top, CRT_WRITE);
+	if (rc != 0)
+		RETURN(rc);
+
+	opg->ops_submit_time = cfs_time_current();
+	RETURN(rc);
+}
+
+/* Work out how many bytes of the page behind @oap are covered by the
+ * object's kms attribute. Must not be called for reads.
+ *
+ * Returns a negative error from cl_object_attr_get(), 0 when the page starts
+ * at or beyond kms, the remainder of kms within the page when kms falls
+ * inside it, and PAGE_CACHE_SIZE otherwise. */
+static int osc_refresh_count(const struct lu_env *env,
+			     struct osc_async_page *oap, int cmd)
+{
+	struct osc_page  *opg = oap2osc_page(oap);
+	struct cl_page   *page = oap2cl_page(oap);
+	struct cl_attr   *attr = &osc_env_info(env)->oti_attr;
+	struct cl_object *obj;
+	loff_t kms;
+	int rc;
+
+	/* readpage queues with _COUNT_STABLE, shouldn't get here. */
+	LASSERT(!(cmd & OBD_BRW_READ));
+	LASSERT(opg != NULL);
+	obj = opg->ops_cl.cpl_obj;
+
+	cl_object_attr_lock(obj);
+	rc = cl_object_attr_get(env, obj, attr);
+	cl_object_attr_unlock(obj);
+	if (rc < 0)
+		return rc;
+
+	kms = attr->cat_kms;
+
+	/* catch race with truncate */
+	if (cl_offset(obj, page->cp_index) >= kms)
+		return 0;
+
+	/* catch sub-page write at end of file */
+	if (cl_offset(obj, page->cp_index + 1) > kms)
+		return kms % PAGE_CACHE_SIZE;
+
+	return PAGE_CACHE_SIZE;
+}
+
+/* Finish the transfer of the page behind @oap: detach it from its cl_req if
+ * it has one, clear the transfer pin, take the page off the object's
+ * inflight list, update the lockless IO statistics and finally signal
+ * completion to the upper layers via cl_page_completion().
+ *
+ * @cmd is the transfer direction (OBD_BRW_NOQUOTA is masked off first);
+ * @rc is the transfer result handed to cl_page_completion().
+ * Returns 0. */
+static int osc_completion(const struct lu_env *env, struct osc_async_page *oap,
+			  int cmd, int rc)
+{
+	struct osc_page   *opg  = oap2osc_page(oap);
+	struct cl_page    *page = cl_page_top(oap2cl_page(oap));
+	struct osc_object *obj  = cl2osc(opg->ops_cl.cpl_obj);
+	enum cl_req_type   crt;
+	int srvlock;
+
+	ENTRY;
+
+	cmd &= ~OBD_BRW_NOQUOTA;
+	/* the page state has to match the transfer direction */
+	LASSERT(equi(page->cp_state == CPS_PAGEIN,  cmd == OBD_BRW_READ));
+	LASSERT(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE));
+	LASSERT(opg->ops_transfer_pinned);
+
+	/*
+	 * page->cp_req can be NULL if io submission failed before
+	 * cl_req was allocated.
+	 */
+	if (page->cp_req != NULL)
+		cl_req_page_done(env, page);
+	LASSERT(page->cp_req == NULL);
+
+	crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE;
+	/* Clear opg->ops_transfer_pinned before VM lock is released. */
+	opg->ops_transfer_pinned = 0;
+
+	/* unlink the page from the inflight list under oo_seatbelt */
+	spin_lock(&obj->oo_seatbelt);
+	LASSERT(opg->ops_submitter != NULL);
+	LASSERT(!list_empty(&opg->ops_inflight));
+	list_del_init(&opg->ops_inflight);
+	opg->ops_submitter = NULL;
+	spin_unlock(&obj->oo_seatbelt);
+
+	opg->ops_submit_time = 0;
+	srvlock = oap->oap_brw_flags & OBD_BRW_SRVLOCK;
+
+	/* statistic */
+	if (rc == 0 && srvlock) {
+		struct lu_device *ld    = opg->ops_cl.cpl_obj->co_lu.lo_dev;
+		struct osc_stats *stats = &lu2osc_dev(ld)->od_stats;
+		int bytes = oap->oap_count;
+
+		if (crt == CRT_READ)
+			stats->os_lockless_reads += bytes;
+		else
+			stats->os_lockless_writes += bytes;
+	}
+
+	/*
+	 * This has to be the last operation with the page, as locks are
+	 * released in cl_page_completion() and nothing except for the
+	 * reference counter protects page from concurrent reclaim.
+	 */
+	lu_ref_del(&page->cp_reference, "transfer", page);
+
+	cl_page_completion(env, page, crt, rc);
+
+	RETURN(0);
+}
+
+/* Emit a D_CACHE debug line with the grant and dirty-page accounting of
+ * @cli (dirty bytes, global dirty and unstable page counters, lost, available
+ * and reserved grant, writes in flight), followed by the caller's format
+ * @fmt and its arguments. @cli is evaluated exactly once.
+ * Note that the "unstable_pages" pair prints obd_max_dirty_pages as its
+ * second value. */
+#define OSC_DUMP_GRANT(cli, fmt, args...) do {				      \
+	struct client_obd *__tmp = (cli);				      \
+	CDEBUG(D_CACHE, "%s: { dirty: %ld/%ld dirty_pages: %d/%d "	      \
+	       "unstable_pages: %d/%d dropped: %ld avail: %ld, "	      \
+	       "reserved: %ld, flight: %d } " fmt,			      \
+	       __tmp->cl_import->imp_obd->obd_name,			      \
+	       __tmp->cl_dirty, __tmp->cl_dirty_max,			      \
+	       atomic_read(&obd_dirty_pages), obd_max_dirty_pages,	      \
+	       atomic_read(&obd_unstable_pages), obd_max_dirty_pages,     \
+	       __tmp->cl_lost_grant, __tmp->cl_avail_grant,		      \
+	       __tmp->cl_reserved_grant, __tmp->cl_w_in_flight, ##args);      \
+} while (0)
+
+/* Account one page of @pga as dirty and mark it as covered by grant.
+ * The caller must hold loi_list_lock, and @pga must not already carry
+ * OBD_BRW_FROM_GRANT. */
+static void osc_consume_write_grant(struct client_obd *cli,
+				    struct brw_page *pga)
+{
+	LASSERT(spin_is_locked(&cli->cl_loi_list_lock.lock));
+	LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT));
+
+	/* mark the page first, then bump both dirty counters */
+	pga->flag |= OBD_BRW_FROM_GRANT;
+	cli->cl_dirty += PAGE_CACHE_SIZE;
+	atomic_inc(&obd_dirty_pages);
+
+	CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n",
+	       PAGE_CACHE_SIZE, pga, pga->pg);
+	osc_update_next_shrink(cli);
+}
+
+/* Counterpart of osc_consume_write_grant(), used once a brw has completed:
+ * undoes the dirty accounting of @pga if it was covered by grant, and the
+ * transit accounting if it was flagged OBD_BRW_NOCACHE. Does nothing for a
+ * page without OBD_BRW_FROM_GRANT. Must be called with the loi lock held. */
+static void osc_release_write_grant(struct client_obd *cli,
+				    struct brw_page *pga)
+{
+	ENTRY;
+
+	LASSERT(spin_is_locked(&cli->cl_loi_list_lock.lock));
+	if (pga->flag & OBD_BRW_FROM_GRANT) {
+		pga->flag &= ~OBD_BRW_FROM_GRANT;
+		atomic_dec(&obd_dirty_pages);
+		cli->cl_dirty -= PAGE_CACHE_SIZE;
+
+		if (pga->flag & OBD_BRW_NOCACHE) {
+			pga->flag &= ~OBD_BRW_NOCACHE;
+			atomic_dec(&obd_dirty_transit_pages);
+			cli->cl_dirty_transit -= PAGE_CACHE_SIZE;
+		}
+	}
+	EXIT;
+}
+
+/**
+ * Move @bytes of grant from the available pool into the reserved pool, so
+ * that enough grant is set aside before a critical section is entered and
+ * no sleeping is needed while the object lock is held.
+ *
+ * client_obd_list_lock held by caller
+ *
+ * Returns 0 on success, -EDQUOT when less than @bytes is available (in which
+ * case nothing is changed).
+ */
+static int osc_reserve_grant(struct client_obd *cli, unsigned int bytes)
+{
+	if (cli->cl_avail_grant < bytes)
+		return -EDQUOT;
+
+	cli->cl_avail_grant    -= bytes;
+	cli->cl_reserved_grant += bytes;
+	return 0;
+}
+
+/* Drop a reservation of @reserved bytes and give @unused bytes back.
+ * At most @reserved bytes return to the available pool; anything beyond
+ * that is added to the lost grant. No locking is done here. */
+static void __osc_unreserve_grant(struct client_obd *cli,
+				  unsigned int reserved, unsigned int unused)
+{
+	unsigned int to_avail = unused;
+
+	cli->cl_reserved_grant -= reserved;
+
+	/* it's quite normal for us to get more grant than reserved.
+	 * Thinking about a case that two extents merged by adding a new
+	 * chunk, we can save one extent tax. If extent tax is greater than
+	 * one chunk, we can save more grant by adding a new chunk */
+	if (unused > reserved) {
+		cli->cl_lost_grant += unused - reserved;
+		to_avail = reserved;
+	}
+	cli->cl_avail_grant += to_avail;
+}
+
+/* Locked wrapper around __osc_unreserve_grant(): takes the loi list lock,
+ * releases the reservation and, when any grant was handed back, wakes the
+ * cache waiters. */
+void osc_unreserve_grant(struct client_obd *cli,
+			 unsigned int reserved, unsigned int unused)
+{
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+
+	__osc_unreserve_grant(cli, reserved, unused);
+	if (unused != 0)
+		osc_wake_cache_waiters(cli);
+
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+}
+
+/**
+ * Free grant after IO is finished or canceled.
+ *
+ * Removes @nr_pages from the dirty accounting and adds @lost_grant to the
+ * lost grant, then wakes the cache waiters.
+ *
+ * @lost_grant is the amount of grant that was allocated but not used and
+ * should be returned to the OST. Grant can be lost in two ways:
+ * 1. truncate;
+ * 2. the blocksize at the OST is less than PAGE_CACHE_SIZE and a partial
+ *    page was written. The OST may then need fewer chunks to serve the
+ *    write. OSTs don't know the client's page size, so the client has to
+ *    work out the lost grant from the OST blocksize.
+ *    See filter_grant_check() for details.
+ */
+static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages,
+			   unsigned int lost_grant)
+{
+	/* grant needed for one extent: one chunk plus the extent tax */
+	int one_extent = (1 << cli->cl_chunkbits) + cli->cl_extent_tax;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+
+	cli->cl_lost_grant += lost_grant;
+	cli->cl_dirty -= nr_pages << PAGE_CACHE_SHIFT;
+	atomic_sub(nr_pages, &obd_dirty_pages);
+
+	/* borrow some grant from truncate to avoid the case that
+	 * truncate uses up all avail grant */
+	if (cli->cl_lost_grant >= one_extent &&
+	    cli->cl_avail_grant < one_extent) {
+		cli->cl_avail_grant += one_extent;
+		cli->cl_lost_grant -= one_extent;
+	}
+
+	osc_wake_cache_waiters(cli);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+	CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu\n",
+	       lost_grant, cli->cl_lost_grant,
+	       cli->cl_avail_grant, cli->cl_dirty);
+}
+
+/**
+ * Counterpart of osc_enter_cache(): releases the write grant accounting of
+ * @oap under the loi list lock, for use when the page drops out of the
+ * dirty accounting because of an error.
+ */
+static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap)
+{
+	struct brw_page *pga = &oap->oap_brw_page;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	osc_release_write_grant(cli, pga);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+}
+
+/**
+ * Non-blocking version of osc_enter_cache(): reserves @bytes of grant and
+ * accounts @oap as dirty only if both the grant and room under the dirty
+ * limits are available right now.
+ *
+ * With @transient set, the page is additionally counted as a transit page
+ * and flagged OBD_BRW_NOCACHE.
+ *
+ * Returns 1 when the page entered the cache, 0 otherwise (any grant that
+ * was reserved is given back in that case).
+ */
+static int osc_enter_cache_try(struct client_obd *cli,
+			       struct osc_async_page *oap,
+			       int bytes, int transient)
+{
+	OSC_DUMP_GRANT(cli, "need:%d.\n", bytes);
+
+	if (osc_reserve_grant(cli, bytes) < 0)
+		return 0;
+
+	/* no room under the per-client or the global dirty limit */
+	if (cli->cl_dirty + PAGE_CACHE_SIZE > cli->cl_dirty_max ||
+	    atomic_read(&obd_unstable_pages) + 1 +
+	    atomic_read(&obd_dirty_pages) > obd_max_dirty_pages) {
+		__osc_unreserve_grant(cli, bytes, bytes);
+		return 0;
+	}
+
+	osc_consume_write_grant(cli, &oap->oap_brw_page);
+	if (transient) {
+		cli->cl_dirty_transit += PAGE_CACHE_SIZE;
+		atomic_inc(&obd_dirty_transit_pages);
+		oap->oap_brw_flags |= OBD_BRW_NOCACHE;
+	}
+	return 1;
+}
+
+/* Wait condition for a cache waiter: under the loi list lock, reports
+ * whether @ocw has been unlinked from the waiter list. */
+static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw)
+{
+	int unlinked;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	unlinked = list_empty(&ocw->ocw_entry);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+	return unlinked;
+}
+
+/**
+ * The main entry to reserve dirty page accounting. Usually the grant reserved
+ * in this function will be freed in bulk in osc_free_grant() unless it fails
+ * to add osc cache, in that case, it will be freed in osc_exit_cache().
+ *
+ * The process will be put into sleep if it's already run out of grant.
+ *
+ * Returns 0 when @oap has entered the cache, -EDQUOT when the caller should
+ * fall back to sync io, or another negative value when the wait fails or
+ * the waker reports an error.
+ */
+static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli,
+			   struct osc_async_page *oap, int bytes)
+{
+	struct osc_object *osc = oap->oap_obj;
+	struct lov_oinfo  *loi = osc->oo_oinfo;
+	struct osc_cache_waiter ocw;
+	struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+	int rc = -EDQUOT;
+	ENTRY;
+
+	OSC_DUMP_GRANT(cli, "need:%d.\n", bytes);
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+
+	/* force the caller to try sync io.  this can jump the list
+	 * of queued writes and create a discontiguous rpc stream */
+	if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_GRANT) ||
+	    cli->cl_dirty_max < PAGE_CACHE_SIZE     ||
+	    cli->cl_ar.ar_force_sync || loi->loi_ar.ar_force_sync)
+		GOTO(out, rc = -EDQUOT);
+
+	/* Hopefully normal case - cache space and write credits available */
+	if (osc_enter_cache_try(cli, oap, bytes, 0))
+		GOTO(out, rc = 0);
+
+	/* We can get here for two reasons: too many dirty pages in cache, or
+	 * run out of grants. In both cases we should write dirty pages out.
+	 * Adding a cache waiter will trigger urgent write-out no matter what
+	 * RPC size will be.
+	 * The exiting condition is no avail grants and no dirty pages caching,
+	 * that really means there is no space on the OST. */
+	init_waitqueue_head(&ocw.ocw_waitq);
+	ocw.ocw_oap   = oap;
+	ocw.ocw_grant = bytes;
+	while (cli->cl_dirty > 0 || cli->cl_w_in_flight > 0) {
+		list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters);
+		ocw.ocw_rc = 0;
+		/* the list lock is dropped while write-out is kicked and
+		 * while sleeping; it is retaken right after the wait */
+		client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+		osc_io_unplug_async(env, cli, NULL);
+
+		CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n",
+		       cli->cl_import->imp_obd->obd_name, &ocw, oap);
+
+		rc = l_wait_event(ocw.ocw_waitq, ocw_granted(cli, &ocw), &lwi);
+
+		client_obd_list_lock(&cli->cl_loi_list_lock);
+
+		/* l_wait_event is interrupted by signal */
+		if (rc < 0) {
+			/* take the on-stack waiter off the list before
+			 * leaving */
+			list_del_init(&ocw.ocw_entry);
+			GOTO(out, rc);
+		}
+
+		LASSERT(list_empty(&ocw.ocw_entry));
+		rc = ocw.ocw_rc;
+
+		/* anything but -EDQUOT is final; on -EDQUOT try once more
+		 * ourselves before going around the loop again */
+		if (rc != -EDQUOT)
+			GOTO(out, rc);
+		if (osc_enter_cache_try(cli, oap, bytes, 0))
+			GOTO(out, rc = 0);
+	}
+	EXIT;
+out:
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+	OSC_DUMP_GRANT(cli, "returned %d.\n", rc);
+	RETURN(rc);
+}
+
+/* Unlink and wake every waiter queued on cl_cache_waiters. For each waiter
+ * an attempt is made to enter the cache on its behalf; ocw_rc is set to 0
+ * when that worked and to -EDQUOT when there is no dirty room or the attempt
+ * failed. The caller must hold loi_list_lock. */
+void osc_wake_cache_waiters(struct client_obd *cli)
+{
+	struct osc_cache_waiter *ocw;
+	struct osc_cache_waiter *next;
+
+	ENTRY;
+	list_for_each_entry_safe(ocw, next, &cli->cl_cache_waiters,
+				 ocw_entry) {
+		list_del_init(&ocw->ocw_entry);
+
+		if (cli->cl_dirty + PAGE_CACHE_SIZE > cli->cl_dirty_max ||
+		    atomic_read(&obd_unstable_pages) + 1 +
+		    atomic_read(&obd_dirty_pages) > obd_max_dirty_pages) {
+			/* we can't dirty more */
+			ocw->ocw_rc = -EDQUOT;
+			CDEBUG(D_CACHE, "no dirty room: dirty: %ld "
+			       "osc max %ld, sys max %d\n", cli->cl_dirty,
+			       cli->cl_dirty_max, obd_max_dirty_pages);
+		} else if (osc_enter_cache_try(cli, ocw->ocw_oap,
+					       ocw->ocw_grant, 0)) {
+			ocw->ocw_rc = 0;
+		} else {
+			ocw->ocw_rc = -EDQUOT;
+		}
+
+		CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant %ld, %d\n",
+		       ocw, ocw->ocw_oap, cli->cl_avail_grant, ocw->ocw_rc);
+
+		wake_up(&ocw->ocw_waitq);
+	}
+
+	EXIT;
+}
+
+/* Returns non-zero when the number of RPCs in flight for @cli has reached
+ * cl_max_rpcs_in_flight, with one extra slot allowed when the object's
+ * high-priority extent list is empty.
+ * NOTE(review): granting the extra slot when oo_hp_exts is *empty* looks
+ * inverted for a high-priority allowance - confirm the intent. */
+static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc)
+{
+	int extra_slot = list_empty(&osc->oo_hp_exts) ? 1 : 0;
+
+	return rpcs_in_flight(cli) >= cli->cl_max_rpcs_in_flight + extra_slot;
+}
+
+/* Decide whether @osc currently has enough pending work in the direction
+ * given by @cmd to justify sending an RPC. Used by osc_list_maint() and,
+ * according to the original note, by osc_check_rpcs->osc_next_obj() to
+ * quickly find objects that are ready to send an RPC.
+ *
+ * Returns 1 when an RPC should be made, 0 otherwise. */
+static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc,
+			 int cmd)
+{
+	int invalid_import;
+	ENTRY;
+
+	/* if we have an invalid import we want to drain the queued pages
+	 * by forcing them through rpcs that immediately fail and complete
+	 * the pages.  recovery relies on this to empty the queued pages
+	 * before canceling the locks and evicting down the llite pages */
+	invalid_import = cli->cl_import == NULL ||
+			 cli->cl_import->imp_invalid;
+
+	if (!(cmd & OBD_BRW_WRITE)) {
+		/* read side */
+		if (atomic_read(&osc->oo_nr_reads) == 0)
+			RETURN(0);
+		if (invalid_import) {
+			CDEBUG(D_CACHE, "invalid import forcing RPC\n");
+			RETURN(1);
+		}
+		/* all read are urgent. */
+		if (!list_empty(&osc->oo_reading_exts))
+			RETURN(1);
+
+		RETURN(0);
+	}
+
+	/* write side */
+	if (atomic_read(&osc->oo_nr_writes) == 0)
+		RETURN(0);
+	if (invalid_import) {
+		CDEBUG(D_CACHE, "invalid import forcing RPC\n");
+		RETURN(1);
+	}
+	if (!list_empty(&osc->oo_hp_exts)) {
+		CDEBUG(D_CACHE, "high prio request forcing RPC\n");
+		RETURN(1);
+	}
+	if (!list_empty(&osc->oo_urgent_exts)) {
+		CDEBUG(D_CACHE, "urgent request forcing RPC\n");
+		RETURN(1);
+	}
+	/* trigger a write rpc stream as long as there are dirtiers
+	 * waiting for space.  as they're waiting, they're not going to
+	 * create more pages to coalesce with what's waiting.. */
+	if (!list_empty(&cli->cl_cache_waiters)) {
+		CDEBUG(D_CACHE, "cache waiters forcing RPC\n");
+		RETURN(1);
+	}
+	/* enough pages queued to fill a whole RPC */
+	if (atomic_read(&osc->oo_nr_writes) >=
+	    cli->cl_max_pages_per_rpc)
+		RETURN(1);
+
+	RETURN(0);
+}
+
+/* Add @delta to the pending page counters of both @obj and its client, on
+ * the write side when @cmd contains OBD_BRW_WRITE and on the read side
+ * otherwise. The per-object counter must not become negative. */
+static void osc_update_pending(struct osc_object *obj, int cmd, int delta)
+{
+	struct client_obd *cli = osc_cli(obj);
+	const int is_write = (cmd & OBD_BRW_WRITE) != 0;
+
+	if (!is_write) {
+		atomic_add(delta, &obj->oo_nr_reads);
+		atomic_add(delta, &cli->cl_pending_r_pages);
+		LASSERT(atomic_read(&obj->oo_nr_reads) >= 0);
+	} else {
+		atomic_add(delta, &obj->oo_nr_writes);
+		atomic_add(delta, &cli->cl_pending_w_pages);
+		LASSERT(atomic_read(&obj->oo_nr_writes) >= 0);
+	}
+
+	OSC_IO_DEBUG(obj, "update pending cmd %d delta %d.\n", cmd, delta);
+}
+
+/* Returns 1 when @obj has at least one extent on its high-priority list,
+ * 0 otherwise. */
+static int osc_makes_hprpc(struct osc_object *obj)
+{
+	if (list_empty(&obj->oo_hp_exts))
+		return 0;
+
+	return 1;
+}
+
+/* Make the membership of @item match @should_be_on: append it to the tail
+ * of @list if it is wanted but not linked, unlink it if it is linked but
+ * not wanted, and leave it alone otherwise. */
+static void on_list(struct list_head *item, struct list_head *list, int should_be_on)
+{
+	const int linked = !list_empty(item);
+
+	if (should_be_on) {
+		if (!linked)
+			list_add_tail(item, list);
+	} else if (linked) {
+		list_del_init(item);
+	}
+}
+
+/* Keep the membership of @osc on the client's ready, hp-ready, write and
+ * read lists in line with its current state, so that objects with pages to
+ * build into rpcs can be found quickly. No locking is done here.
+ *
+ * Returns the result of osc_is_ready() after the update. */
+static int __osc_list_maint(struct client_obd *cli, struct osc_object *osc)
+{
+	int has_writes;
+	int has_reads;
+
+	if (!osc_makes_hprpc(osc)) {
+		int wants_rpc;
+
+		on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 0);
+
+		wants_rpc = osc_makes_rpc(cli, osc, OBD_BRW_WRITE) ||
+			    osc_makes_rpc(cli, osc, OBD_BRW_READ);
+		on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list,
+			wants_rpc);
+	} else {
+		/* HP rpc */
+		on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, 0);
+		on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 1);
+	}
+
+	has_writes = atomic_read(&osc->oo_nr_writes) > 0;
+	on_list(&osc->oo_write_item, &cli->cl_loi_write_list, has_writes);
+
+	has_reads = atomic_read(&osc->oo_nr_reads) > 0;
+	on_list(&osc->oo_read_item, &cli->cl_loi_read_list, has_reads);
+
+	return osc_is_ready(osc);
+}
+
+/* Locked wrapper: runs __osc_list_maint() with the loi list lock held and
+ * returns its result. */
+static int osc_list_maint(struct client_obd *cli, struct osc_object *osc)
+{
+	int ready;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	ready = __osc_list_maint(cli, osc);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+	return ready;
+}
+
+/* This tries to propagate async writeback errors back up to the
+ * application. When an async write fails, the first error code is recorded
+ * for a later fsync. As long as errors persist, future rpcs are forced to be
+ * sync so that the app can get a sync error and break the cycle of queueing
+ * pages for which writeback will fail.
+ *
+ * On success (@rc == 0) the forced-sync state is cleared once @xid has
+ * reached the xid sampled at the time of the last failure. */
+static void osc_process_ar(struct osc_async_rc *ar, __u64 xid,
+			   int rc)
+{
+	if (rc == 0) {
+		if (ar->ar_force_sync && (xid >= ar->ar_min_xid))
+			ar->ar_force_sync = 0;
+		return;
+	}
+
+	/* failure: keep only the first error, force sync from now on */
+	if (ar->ar_rc == 0)
+		ar->ar_rc = rc;
+
+	ar->ar_force_sync = 1;
+	ar->ar_min_xid = ptlrpc_sample_next_xid();
+}
+
+/* Undo the "unstable" page accounting done by osc_inc_unstable_pages().
+ * It is installed as the request's commit callback, so it runs once the
+ * bulk RPC has been committed on the server and the pages of the bulk
+ * transfer are no longer considered unstable.
+ *
+ * Does nothing when the client has no cl_cache. Otherwise the per-zone,
+ * per-cache and global unstable counters are decreased by the number of
+ * pages in the bulk descriptor, the request is marked committed and the
+ * waiters on ccc_unstable_waitq are woken. */
+void osc_dec_unstable_pages(struct ptlrpc_request *req)
+{
+	struct client_obd       *cli  = &req->rq_import->imp_obd->u.cli;
+	struct ptlrpc_bulk_desc *bulk = req->rq_bulk;
+	obd_count		page_count = bulk->bd_iov_count;
+	int idx;
+
+	/* No unstable page tracking */
+	if (cli->cl_cache == NULL)
+		return;
+
+	LASSERT(page_count >= 0);
+
+	idx = 0;
+	while (idx < page_count) {
+		dec_zone_page_state(bulk->bd_iov[idx].kiov_page,
+				    NR_UNSTABLE_NFS);
+		idx++;
+	}
+
+	atomic_sub(page_count, &cli->cl_cache->ccc_unstable_nr);
+	LASSERT(atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0);
+
+	atomic_sub(page_count, &obd_unstable_pages);
+	LASSERT(atomic_read(&obd_unstable_pages) >= 0);
+
+	spin_lock(&req->rq_lock);
+	req->rq_unstable  = 0;
+	req->rq_committed = 1;
+	spin_unlock(&req->rq_lock);
+
+	wake_up_all(&cli->cl_cache->ccc_unstable_waitq);
+}
+
+/* "unstable" page accounting. See: osc_dec_unstable_pages.
+ *
+ * Increases the per-zone, per-cache and global unstable counters by the
+ * number of pages in the request's bulk descriptor. Does nothing when the
+ * client has no cl_cache. If the request is not committed yet,
+ * osc_dec_unstable_pages() is installed as its commit callback; if it
+ * already is committed, the increments are undone straight away. */
+void osc_inc_unstable_pages(struct ptlrpc_request *req)
+{
+	struct ptlrpc_bulk_desc *desc = req->rq_bulk;
+	struct client_obd       *cli  = &req->rq_import->imp_obd->u.cli;
+	obd_count		page_count = desc->bd_iov_count;
+	int i;
+
+	/* No unstable page tracking */
+	if (cli->cl_cache == NULL)
+		return;
+
+	LASSERT(page_count >= 0);
+
+	for (i = 0; i < page_count; i++)
+		inc_zone_page_state(desc->bd_iov[i].kiov_page, NR_UNSTABLE_NFS);
+
+	LASSERT(atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0);
+	atomic_add(page_count, &cli->cl_cache->ccc_unstable_nr);
+
+	LASSERT(atomic_read(&obd_unstable_pages) >= 0);
+	atomic_add(page_count, &obd_unstable_pages);
+
+	spin_lock(&req->rq_lock);
+
+	/* If the request has already been committed (i.e. brw_commit
+	 * called via rq_commit_cb), we need to undo the unstable page
+	 * increments we just performed because rq_commit_cb won't be
+	 * called again. Otherwise, just set the commit callback so the
+	 * unstable page accounting is properly updated when the request
+	 * is committed */
+	if (req->rq_committed) {
+		/* Drop lock before calling osc_dec_unstable_pages */
+		spin_unlock(&req->rq_lock);
+		osc_dec_unstable_pages(req);
+		/* retake it so that the common unlock below stays balanced */
+		spin_lock(&req->rq_lock);
+	} else {
+		req->rq_unstable  = 1;
+		req->rq_commit_cb = osc_dec_unstable_pages;
+	}
+
+	spin_unlock(&req->rq_lock);
+}
+
+/* Complete one async page: drop its request reference, clear its async
+ * state, feed the result into the async-error trackers of the client and of
+ * the object, and finally call osc_completion().
+ *
+ * NOTE(review): the historical comment here demanded that the caller hold
+ * the loi list lock, yet the body acquires cl_loi_list_lock itself around
+ * osc_process_ar(); confirm the intended locking contract against callers. */
+static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli,
+			      struct osc_async_page *oap, int sent, int rc)
+{
+	struct osc_object *osc = oap->oap_obj;
+	struct lov_oinfo  *loi = osc->oo_oinfo;
+	struct ptlrpc_request *req;
+	__u64 xid = 0;
+
+	ENTRY;
+	req = oap->oap_request;
+	if (req != NULL) {
+		/* only successfully transferred pages become unstable */
+		if (rc == 0)
+			osc_inc_unstable_pages(req);
+
+		/* sample the xid before the reference is released */
+		xid = ptlrpc_req_xid(req);
+		ptlrpc_req_finished(req);
+		oap->oap_request = NULL;
+	}
+
+	/* As the transfer for this page is being done, clear the flags */
+	spin_lock(&oap->oap_lock);
+	oap->oap_async_flags = 0;
+	spin_unlock(&oap->oap_lock);
+	oap->oap_interrupted = 0;
+
+	/* error tracking only applies to writes that went out in a request */
+	if (xid > 0 && (oap->oap_cmd & OBD_BRW_WRITE)) {
+		client_obd_list_lock(&cli->cl_loi_list_lock);
+		osc_process_ar(&cli->cl_ar, xid, rc);
+		osc_process_ar(&loi->loi_ar, xid, rc);
+		client_obd_list_unlock(&cli->cl_loi_list_lock);
+	}
+
+	rc = osc_completion(env, oap, oap->oap_cmd, rc);
+	if (rc != 0)
+		CERROR("completion on oap %p obj %p returns %d.\n",
+		       oap, osc, rc);
+
+	EXIT;
+}
+
+/**
+ * Try to move \a ext onto \a rpclist so that it is sent in the same RPC.
+ *
+ * The extent is refused (return 0) when
+ * - adding its pages would push \a *pc over \a *max_pages, or
+ * - its oe_srvlock / oe_grants settings differ from the extent already at
+ *   the head of \a rpclist.
+ * Only the first queued extent is compared; a strict check would have to
+ * walk the whole list.
+ *
+ * On success (return 1) \a *pc grows by the extent's page count, the extent
+ * is moved to the tail of \a rpclist and the current task becomes its owner.
+ * \a *max_pages is raised to the extent's oe_mppr if that is larger.
+ */
+static int try_to_add_extent_for_io(struct client_obd *cli,
+				    struct osc_extent *ext, struct list_head *rpclist,
+				    int *pc, unsigned int *max_pages)
+{
+	struct osc_extent *tmp;
+	ENTRY;
+
+	EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE),
+		ext);
+
+	*max_pages = max(ext->oe_mppr, *max_pages);
+	if (*pc + ext->oe_nr_pages > *max_pages)
+		RETURN(0);
+
+	if (!list_empty(rpclist)) {
+		tmp = list_entry(rpclist->next, struct osc_extent, oe_link);
+		EASSERT(tmp->oe_owner == current, tmp);
+
+		if (tmp->oe_srvlock != ext->oe_srvlock ||
+		    !tmp->oe_grants != !ext->oe_grants)
+			RETURN(0);
+	}
+
+	*pc += ext->oe_nr_pages;
+	list_move_tail(&ext->oe_link, rpclist);
+	ext->oe_owner = current;
+	RETURN(1);
+}
+
+/**
+ * In order to prevent multiple ptlrpcd from breaking contiguous extents,
+ * get_write_extents() takes all appropriate extents atomically. The object
+ * lock must be held by the caller (asserted below).
+ *
+ * The following policy is used to collect extents for IO:
+ * 1. Add as many HP extents as possible;
+ * 2. Add the first urgent extent in urgent extent list and take it out of
+ *    urgent list;
+ * 3. Add subsequent extents of this urgent extent;
+ * 4. If urgent list is not empty, goto 2;
+ * 5. Traverse the extent tree from the 1st extent;
+ * 6. Above steps exit if there is no space in this RPC.
+ *
+ * \param obj     object whose extents are collected
+ * \param rpclist list that receives the collected extents
+ *
+ * \return total number of pages in the extents moved onto \a rpclist
+ */
+static int get_write_extents(struct osc_object *obj, struct list_head *rpclist)
+{
+	struct client_obd *cli = osc_cli(obj);
+	struct osc_extent *ext;
+	int page_count = 0;
+	unsigned int max_pages = cli->cl_max_pages_per_rpc;
+
+	LASSERT(osc_object_is_locked(obj));
+	/* step 1: high priority extents. A successful add moves the extent
+	 * off oo_hp_exts, so the list shrinks on every iteration. */
+	while (!list_empty(&obj->oo_hp_exts)) {
+		ext = list_entry(obj->oo_hp_exts.next, struct osc_extent,
+				     oe_link);
+		LASSERT(ext->oe_state == OES_CACHE);
+		if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
+					      &max_pages))
+			return page_count;
+		EASSERT(ext->oe_nr_pages <= max_pages, ext);
+	}
+	if (page_count == max_pages)
+		return page_count;
+
+	/* steps 2-4: urgent extents, each followed by its successors in the
+	 * extent tree */
+	while (!list_empty(&obj->oo_urgent_exts)) {
+		ext = list_entry(obj->oo_urgent_exts.next,
+				     struct osc_extent, oe_link);
+		if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
+					      &max_pages))
+			return page_count;
+
+		/* an extent that is not in the tree has no successors */
+		if (!ext->oe_intree)
+			continue;
+
+		while ((ext = next_extent(ext)) != NULL) {
+			/* skip extents that are not cached or that are
+			 * already queued on some list with an owner */
+			if ((ext->oe_state != OES_CACHE) ||
+			    (!list_empty(&ext->oe_link) &&
+			     ext->oe_owner != NULL))
+				continue;
+
+			if (!try_to_add_extent_for_io(cli, ext, rpclist,
+						      &page_count, &max_pages))
+				return page_count;
+		}
+	}
+	if (page_count == max_pages)
+		return page_count;
+
+	/* step 5: fill the remaining space by walking the whole tree */
+	ext = first_extent(obj);
+	while (ext != NULL) {
+		if ((ext->oe_state != OES_CACHE) ||
+		    /* this extent may be already in current rpclist */
+		    (!list_empty(&ext->oe_link) && ext->oe_owner != NULL)) {
+			ext = next_extent(ext);
+			continue;
+		}
+
+		if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
+					      &max_pages))
+			return page_count;
+
+		ext = next_extent(ext);
+	}
+	return page_count;
+}
+
+/**
+ * Collect write extents of \a osc and send them in one RPC.
+ *
+ * Must be entered with the object lock held (asserted). The lock is dropped
+ * while pages are made ready and the RPC is built, and is held again when
+ * the function returns.
+ *
+ * \return 0 when no extent could be collected; otherwise the result of the
+ *	   last osc_extent_make_ready() / osc_build_rpc() call made here.
+ */
+static int
+osc_send_write_rpc(const struct lu_env *env, struct client_obd *cli,
+		   struct osc_object *osc, pdl_policy_t pol)
+{
+	LIST_HEAD(rpclist);
+	struct osc_extent *ext;
+	struct osc_extent *tmp;
+	struct osc_extent *first = NULL;
+	obd_count page_count = 0;
+	int srvlock = 0;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(osc_object_is_locked(osc));
+
+	page_count = get_write_extents(osc, &rpclist);
+	LASSERT(equi(page_count == 0, list_empty(&rpclist)));
+
+	if (list_empty(&rpclist))
+		RETURN(0);
+
+	/* the collected pages are no longer pending */
+	osc_update_pending(osc, OBD_BRW_WRITE, -page_count);
+
+	/* cached extents still need their pages made ready (OES_LOCKING);
+	 * OES_LOCK_DONE extents go straight to OES_RPC */
+	list_for_each_entry(ext, &rpclist, oe_link) {
+		LASSERT(ext->oe_state == OES_CACHE ||
+			ext->oe_state == OES_LOCK_DONE);
+		if (ext->oe_state == OES_CACHE)
+			osc_extent_state_set(ext, OES_LOCKING);
+		else
+			osc_extent_state_set(ext, OES_RPC);
+	}
+
+	/* we're going to grab page lock, so release object lock because
+	 * lock order is page lock -> object lock. */
+	osc_object_unlock(osc);
+
+	list_for_each_entry_safe(ext, tmp, &rpclist, oe_link) {
+		if (ext->oe_state == OES_LOCKING) {
+			rc = osc_extent_make_ready(env, ext);
+			if (unlikely(rc < 0)) {
+				/* drop the extent from this RPC and finish
+				 * it with the error */
+				list_del_init(&ext->oe_link);
+				osc_extent_finish(env, ext, 0, rc);
+				continue;
+			}
+		}
+		/* all extents of one RPC must agree on oe_srvlock */
+		if (first == NULL) {
+			first = ext;
+			srvlock = ext->oe_srvlock;
+		} else {
+			LASSERT(srvlock == ext->oe_srvlock);
+		}
+	}
+
+	if (!list_empty(&rpclist)) {
+		LASSERT(page_count > 0);
+		rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_WRITE, pol);
+		LASSERT(list_empty(&rpclist));
+	}
+
+	osc_object_lock(osc);
+	RETURN(rc);
+}
+
+/**
+ * Gather extents from the object's oo_reading_exts list and send them in one
+ * read RPC.
+ *
+ * Must be entered with the object lock held (asserted). When there is
+ * something to send the lock is dropped around osc_build_rpc() and taken
+ * again before returning.
+ *
+ * \param env environment passed through to osc_build_rpc()
+ * \param cli client the RPC is built for
+ * \param osc object whose reading extents are sent
+ * \param pol ptlrpcd policy passed through to osc_build_rpc()
+ *
+ * \return 0 if no extent was queued, otherwise the result of osc_build_rpc().
+ */
+static int
+osc_send_read_rpc(const struct lu_env *env, struct client_obd *cli,
+		  struct osc_object *osc, pdl_policy_t pol)
+{
+	LIST_HEAD(rpclist);
+	struct osc_extent *ext;
+	struct osc_extent *tmp;
+	unsigned int max_pages = cli->cl_max_pages_per_rpc;
+	int page_count = 0;
+	int rc;
+	ENTRY;
+
+	LASSERT(osc_object_is_locked(osc));
+	list_for_each_entry_safe(ext, tmp, &osc->oo_reading_exts, oe_link) {
+		EASSERT(ext->oe_state == OES_LOCK_DONE, ext);
+		/* stop at the first extent that does not fit this RPC */
+		if (!try_to_add_extent_for_io(cli, ext, &rpclist, &page_count,
+					      &max_pages))
+			break;
+		osc_extent_state_set(ext, OES_RPC);
+		EASSERT(ext->oe_nr_pages <= max_pages, ext);
+	}
+	LASSERT(page_count <= max_pages);
+
+	osc_update_pending(osc, OBD_BRW_READ, -page_count);
+
+	/* nothing collected, nothing to send */
+	if (list_empty(&rpclist))
+		RETURN(0);
+
+	osc_object_unlock(osc);
+
+	LASSERT(page_count > 0);
+	rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_READ, pol);
+	LASSERT(list_empty(&rpclist));
+
+	osc_object_lock(osc);
+	RETURN(rc);
+}
+
+/* Unlink the first entry of @list and evaluate to the struct osc_object that
+ * embeds it via its oo_<item> list head. The users in osc_next_obj() check
+ * list_empty() before invoking this. */
+#define list_to_obj(list, item) ({					      \
+	struct list_head *__first = (list)->next;			      \
+									      \
+	list_del_init(__first);						      \
+	list_entry(__first, struct osc_object, oo_##item);		      \
+})
+
+/* Pick the next object osc_check_rpcs() should work on and unlink it from
+ * the client list it was found on. Returns NULL when no object qualifies.
+ * These lists are maintained by osc_makes_rpc(). */
+static struct osc_object *osc_next_obj(struct client_obd *cli)
+{
+	ENTRY;
+
+	/* Objects with blocked locks come first so that they are flushed
+	 * quickly and other clients can get the lock ... */
+	if (!list_empty(&cli->cl_loi_hp_ready_list))
+		RETURN(list_to_obj(&cli->cl_loi_hp_ready_list, hp_ready_item));
+
+	/* ... followed by objects which have pages ready to be stuffed into
+	 * RPCs. */
+	if (!list_empty(&cli->cl_loi_ready_list))
+		RETURN(list_to_obj(&cli->cl_loi_ready_list, ready_item));
+
+	/* With cache waiters present, hand out every object that has queued
+	 * writes. This is especially important when many small files have
+	 * filled up the cache and not been fired into rpcs because they
+	 * don't pass the nr_pending/object threshold. */
+	if (!list_empty(&cli->cl_loi_write_list) &&
+	    !list_empty(&cli->cl_cache_waiters))
+		RETURN(list_to_obj(&cli->cl_loi_write_list, write_item));
+
+	/* A usable import means there is nothing else to hand out. */
+	if (cli->cl_import != NULL && !cli->cl_import->imp_invalid)
+		RETURN(NULL);
+
+	/* Invalid import: return all queued objects so that they get
+	 * flushed, writes before reads. */
+	if (!list_empty(&cli->cl_loi_write_list))
+		RETURN(list_to_obj(&cli->cl_loi_write_list, write_item));
+	if (!list_empty(&cli->cl_loi_read_list))
+		RETURN(list_to_obj(&cli->cl_loi_read_list, read_item));
+
+	RETURN(NULL);
+}
+
+/* Drain the objects returned by osc_next_obj(), sending write and read RPCs
+ * for each of them.
+ *
+ * called with the loi list lock held. The lock is released while an object
+ * is being processed and re-acquired before the next osc_next_obj() call,
+ * so it is held again on return. */
+static void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli,
+			   pdl_policy_t pol)
+{
+	struct osc_object *osc;
+	int rc = 0;
+	ENTRY;
+
+	while ((osc = osc_next_obj(cli)) != NULL) {
+		struct cl_object *obj = osc2cl(osc);
+		struct lu_ref_link *link;
+
+		OSC_IO_DEBUG(osc, "%lu in flight\n", rpcs_in_flight(cli));
+
+		/* too many RPCs in flight: redo list maintenance for this
+		 * object (osc_next_obj() unlinked it) and stop */
+		if (osc_max_rpc_in_flight(cli, osc)) {
+			__osc_list_maint(cli, osc);
+			break;
+		}
+
+		/* hold a reference on the object while the list lock is
+		 * dropped */
+		cl_object_get(obj);
+		client_obd_list_unlock(&cli->cl_loi_list_lock);
+		link = lu_object_ref_add(&obj->co_lu, "check", current);
+
+		/* attempt some read/write balancing by alternating between
+		 * reads and writes in an object.  The makes_rpc checks here
+		 * would be redundant if we were getting read/write work items
+		 * instead of objects.  we don't want send_oap_rpc to drain a
+		 * partial read pending queue when we're given this object to
+		 * do io on writes while there are cache waiters */
+		osc_object_lock(osc);
+		if (osc_makes_rpc(cli, osc, OBD_BRW_WRITE)) {
+			rc = osc_send_write_rpc(env, cli, osc, pol);
+			if (rc < 0) {
+				CERROR("Write request failed with %d\n", rc);
+
+				/* osc_send_write_rpc failed, mostly because of
+				 * memory pressure.
+				 *
+				 * It can't break here, because if:
+				 *  - a page was submitted by osc_io_submit, so
+				 *    page locked;
+				 *  - no request in flight
+				 *  - no subsequent request
+				 * The system will be in live-lock state,
+				 * because there is no chance to call
+				 * osc_io_unplug() and osc_check_rpcs() any
+				 * more. pdflush can't help in this case,
+				 * because it might be blocked at grabbing
+				 * the page lock as we mentioned.
+				 *
+				 * Anyway, continue to drain pages. */
+				/* break; */
+			}
+		}
+		if (osc_makes_rpc(cli, osc, OBD_BRW_READ)) {
+			rc = osc_send_read_rpc(env, cli, osc, pol);
+			if (rc < 0)
+				CERROR("Read request failed with %d\n", rc);
+		}
+		osc_object_unlock(osc);
+
+		osc_list_maint(cli, osc);
+		lu_object_ref_del_at(&obj->co_lu, link, "check", current);
+		cl_object_put(env, obj);
+
+		/* re-take the list lock for the next osc_next_obj() */
+		client_obd_list_lock(&cli->cl_loi_list_lock);
+	}
+}
+
+/* Kick off IO for \a cli under the loi list lock.
+ *
+ * When \a osc is given, __osc_list_maint() decides whether there is anything
+ * to do; with a NULL \a osc work is always attempted. In async mode the
+ * client's writeback work item is queued and the result of
+ * ptlrpcd_queue_work() is returned; otherwise osc_check_rpcs() runs inline
+ * and 0 is returned. */
+static int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli,
+			  struct osc_object *osc, pdl_policy_t pol, int async)
+{
+	int has_rpcs = 1;
+	int rc = 0;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	if (osc != NULL)
+		has_rpcs = __osc_list_maint(cli, osc);
+
+	if (has_rpcs && async) {
+		CDEBUG(D_CACHE, "Queue writeback work for client %p.\n",
+		       cli);
+		LASSERT(cli->cl_writeback_work != NULL);
+		rc = ptlrpcd_queue_work(cli->cl_writeback_work);
+	} else if (has_rpcs) {
+		/* disable osc_lru_shrink() temporarily to avoid
+		 * potential stack overrun problem. LU-2859 */
+		atomic_inc(&cli->cl_lru_shrinkers);
+		osc_check_rpcs(env, cli, pol);
+		atomic_dec(&cli->cl_lru_shrinkers);
+	}
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+	return rc;
+}
+
+/* Asynchronous unplug: forwards to osc_io_unplug0() with async set and
+ * returns its result. */
+static int osc_io_unplug_async(const struct lu_env *env,
+				struct client_obd *cli, struct osc_object *osc)
+{
+	int rc;
+
+	/* XXX: policy is no use actually. */
+	rc = osc_io_unplug0(env, cli, osc, PDL_POLICY_ROUND, 1);
+
+	return rc;
+}
+
+/* Synchronous unplug: forwards to osc_io_unplug0() with async cleared. */
+void osc_io_unplug(const struct lu_env *env, struct client_obd *cli,
+		   struct osc_object *osc, pdl_policy_t pol)
+{
+	/* the result is intentionally ignored; osc_io_unplug0() only sets
+	 * a non-zero rc on its async path */
+	osc_io_unplug0(env, cli, osc, pol, 0);
+}
+
+/**
+ * Initialise the osc_async_page embedded in \a ops, or report its size.
+ *
+ * \param osc    object the page belongs to
+ * \param ops    osc page whose ops_oap member is set up
+ * \param page   backing page; when NULL nothing is initialised and the
+ *		 rounded size of struct osc_async_page is returned instead
+ * \param offset offset of the page in the object; asserted to have no bits
+ *		 set outside CFS_PAGE_MASK
+ *
+ * \return cfs_size_round(sizeof(struct osc_async_page)) if \a page is NULL,
+ *	   0 otherwise.
+ */
+int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops,
+			struct page *page, loff_t offset)
+{
+	struct obd_export     *exp = osc_export(osc);
+	struct osc_async_page *oap = &ops->ops_oap;
+	ENTRY;
+
+	/* size query: leave through RETURN so that the ENTRY trace above is
+	 * balanced like on every other exit of this function */
+	if (!page)
+		RETURN(cfs_size_round(sizeof(*oap)));
+
+	oap->oap_magic = OAP_MAGIC;
+	oap->oap_cli = &exp->exp_obd->u.cli;
+	oap->oap_obj = osc;
+
+	oap->oap_page = page;
+	oap->oap_obj_off = offset;
+	LASSERT(!(offset & ~CFS_PAGE_MASK));
+
+	/* local clients with CFS_CAP_SYS_RESOURCE are exempt from quota */
+	if (!client_is_remote(exp) && cfs_capable(CFS_CAP_SYS_RESOURCE))
+		oap->oap_brw_flags = OBD_BRW_NOQUOTA;
+
+	INIT_LIST_HEAD(&oap->oap_pending_item);
+	INIT_LIST_HEAD(&oap->oap_rpc_item);
+
+	spin_lock_init(&oap->oap_lock);
+	CDEBUG(D_INFO, "oap %p page %p obj off "LPU64"\n",
+	       oap, page, oap->oap_obj_off);
+	RETURN(0);
+}
+
+/**
+ * Queue the page described by \a ops for asynchronous write by adding it to
+ * an extent: the IO's active extent when the page fits (possibly after
+ * expanding it), otherwise an extent obtained from osc_extent_find().
+ *
+ * \return 0 on success;
+ *	   -EINVAL if the async page has a bad magic;
+ *	   -EIO if the client import is missing or invalid;
+ *	   -EBUSY if the async page is already on a pending or rpc list;
+ *	   -EDQUOT if osc_quota_chkdq() reports NO_QUOTA;
+ *	   otherwise the error returned by cl_object_attr_get(),
+ *	   osc_enter_cache() or osc_extent_find().
+ */
+int osc_queue_async_io(const struct lu_env *env, struct cl_io *io,
+		       struct osc_page *ops)
+{
+	struct osc_io *oio = osc_env_io(env);
+	struct osc_extent     *ext = NULL;
+	struct osc_async_page *oap = &ops->ops_oap;
+	struct client_obd     *cli = oap->oap_cli;
+	struct osc_object     *osc = oap->oap_obj;
+	pgoff_t index;
+	int    grants = 0;
+	int    brw_flags = OBD_BRW_ASYNC;
+	int    cmd = OBD_BRW_WRITE;
+	int    need_release = 0;
+	int    rc = 0;
+	ENTRY;
+
+	if (oap->oap_magic != OAP_MAGIC)
+		RETURN(-EINVAL);
+
+	if (cli->cl_import == NULL || cli->cl_import->imp_invalid)
+		RETURN(-EIO);
+
+	/* the page must not be queued anywhere yet */
+	if (!list_empty(&oap->oap_pending_item) ||
+	    !list_empty(&oap->oap_rpc_item))
+		RETURN(-EBUSY);
+
+	/* Set the OBD_BRW_SRVLOCK before the page is queued. */
+	brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0;
+	if (!client_is_remote(osc_export(osc)) &&
+	    cfs_capable(CFS_CAP_SYS_RESOURCE)) {
+		brw_flags |= OBD_BRW_NOQUOTA;
+		cmd |= OBD_BRW_NOQUOTA;
+	}
+
+	/* check if the file's owner/group is over quota */
+	if (!(cmd & OBD_BRW_NOQUOTA)) {
+		struct cl_object *obj;
+		struct cl_attr   *attr;
+		unsigned int qid[MAXQUOTAS];
+
+		obj = cl_object_top(&osc->oo_cl);
+		attr = &osc_env_info(env)->oti_attr;
+
+		cl_object_attr_lock(obj);
+		rc = cl_object_attr_get(env, obj, attr);
+		cl_object_attr_unlock(obj);
+
+		/* qid is only consulted below when rc == 0 */
+		qid[USRQUOTA] = attr->cat_uid;
+		qid[GRPQUOTA] = attr->cat_gid;
+		if (rc == 0 && osc_quota_chkdq(cli, qid) == NO_QUOTA)
+			rc = -EDQUOT;
+		if (rc)
+			RETURN(rc);
+	}
+
+	oap->oap_cmd = cmd;
+	oap->oap_page_off = ops->ops_from;
+	oap->oap_count = ops->ops_to - ops->ops_from;
+	oap->oap_async_flags = 0;
+	oap->oap_brw_flags = brw_flags;
+
+	OSC_IO_DEBUG(osc, "oap %p page %p added for cmd %d\n",
+		     oap, oap->oap_page, oap->oap_cmd & OBD_BRW_RWMASK);
+
+	index = oap2cl_page(oap)->cp_index;
+
+	/* Add this page into extent by the following steps:
+	 * 1. if there exists an active extent for this IO, mostly this page
+	 *    can be added to the active extent and sometimes we need to
+	 *    expand extent to accommodate this page;
+	 * 2. otherwise, a new extent will be allocated. */
+
+	ext = oio->oi_active;
+	if (ext != NULL && ext->oe_start <= index && ext->oe_max_end >= index) {
+		/* one chunk plus extent overhead must be enough to write this
+		 * page */
+		grants = (1 << cli->cl_chunkbits) + cli->cl_extent_tax;
+		if (ext->oe_end >= index)
+			grants = 0;
+
+		/* it doesn't need any grant to dirty this page */
+		client_obd_list_lock(&cli->cl_loi_list_lock);
+		rc = osc_enter_cache_try(cli, oap, grants, 0);
+		client_obd_list_unlock(&cli->cl_loi_list_lock);
+		if (rc == 0) { /* try failed */
+			grants = 0;
+			need_release = 1;
+		} else if (ext->oe_end < index) {
+			int tmp = grants;
+			/* try to expand this extent */
+			rc = osc_extent_expand(ext, index, &tmp);
+			if (rc < 0) {
+				need_release = 1;
+				/* don't free reserved grant */
+			} else {
+				OSC_EXTENT_DUMP(D_CACHE, ext,
+						"expanded for %lu.\n", index);
+				osc_unreserve_grant(cli, grants, tmp);
+				grants = 0;
+			}
+		}
+		/* failures above are handled by falling back to a new
+		 * extent, so the error code is not propagated */
+		rc = 0;
+	} else if (ext != NULL) {
+		/* index is located outside of active extent */
+		need_release = 1;
+	}
+	if (need_release) {
+		osc_extent_release(env, ext);
+		oio->oi_active = NULL;
+		ext = NULL;
+	}
+
+	if (ext == NULL) {
+		int tmp = (1 << cli->cl_chunkbits) + cli->cl_extent_tax;
+
+		/* try to find new extent to cover this page */
+		LASSERT(oio->oi_active == NULL);
+		/* we may have allocated grant for this page if we failed
+		 * to expand the previous active extent. */
+		LASSERT(ergo(grants > 0, grants >= tmp));
+
+		rc = 0;
+		if (grants == 0) {
+			/* we haven't allocated grant for this page. */
+			rc = osc_enter_cache(env, cli, oap, tmp);
+			if (rc == 0)
+				grants = tmp;
+		}
+
+		tmp = grants;
+		if (rc == 0) {
+			ext = osc_extent_find(env, osc, index, &tmp);
+			if (IS_ERR(ext)) {
+				LASSERT(tmp == grants);
+				osc_exit_cache(cli, oap);
+				rc = PTR_ERR(ext);
+				ext = NULL;
+			} else {
+				oio->oi_active = ext;
+			}
+		}
+		if (grants > 0)
+			osc_unreserve_grant(cli, grants, tmp);
+	}
+
+	LASSERT(ergo(rc == 0, ext != NULL));
+	if (ext != NULL) {
+		EASSERTF(ext->oe_end >= index && ext->oe_start <= index,
+			 ext, "index = %lu.\n", index);
+		LASSERT((oap->oap_brw_flags & OBD_BRW_FROM_GRANT) != 0);
+
+		/* link the page into the extent under the object lock; the
+		 * first page decides the extent's srvlock mode */
+		osc_object_lock(osc);
+		if (ext->oe_nr_pages == 0)
+			ext->oe_srvlock = ops->ops_srvlock;
+		else
+			LASSERT(ext->oe_srvlock == ops->ops_srvlock);
+		++ext->oe_nr_pages;
+		list_add_tail(&oap->oap_pending_item, &ext->oe_pages);
+		osc_object_unlock(osc);
+	}
+	RETURN(rc);
+}
+
+/**
+ * Check whether the async page of \a ops may be torn down.
+ *
+ * \return -EBUSY if the page is on an rpc list, or if it is pending in an
+ *	   extent that is not in OES_TRUNC state; 0 otherwise.
+ */
+int osc_teardown_async_page(const struct lu_env *env,
+			    struct osc_object *obj, struct osc_page *ops)
+{
+	struct osc_async_page *oap = &ops->ops_oap;
+	struct osc_extent *ext = NULL;
+	pgoff_t index;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(oap->oap_magic == OAP_MAGIC);
+
+	index = oap2cl_page(oap)->cp_index;
+	CDEBUG(D_INFO, "teardown oap %p page %p at index %lu.\n",
+	       oap, ops, index);
+
+	osc_object_lock(obj);
+	if (!list_empty(&oap->oap_rpc_item)) {
+		CDEBUG(D_CACHE, "oap %p is not in cache.\n", oap);
+		rc = -EBUSY;
+		goto unlock;
+	}
+
+	/* a page that is not pending can always go */
+	if (list_empty(&oap->oap_pending_item))
+		goto unlock;
+
+	ext = osc_extent_lookup(obj, index);
+	/* only truncated pages are allowed to be taken out.
+	 * See osc_extent_truncate() and osc_cache_truncate_start()
+	 * for details. */
+	if (ext != NULL && ext->oe_state != OES_TRUNC) {
+		OSC_EXTENT_DUMP(D_ERROR, ext, "trunc at %lu.\n", index);
+		rc = -EBUSY;
+	}
+
+unlock:
+	osc_object_unlock(obj);
+	/* release the extent handed back by osc_extent_lookup() */
+	if (ext != NULL)
+		osc_extent_put(env, ext);
+	RETURN(rc);
+}
+
+/**
+ * This is called when a page is picked up by kernel to write out.
+ *
+ * We should find out the corresponding extent and add the whole extent
+ * into urgent list. The extent may be being truncated or used, handle it
+ * carefully.
+ *
+ * \return 0 on success;
+ *	   -EAGAIN if the covering extent is in OES_LOCKING or OES_TRUNC
+ *	   state;
+ *	   otherwise the non-zero result of cl_page_prep().
+ */
+int osc_flush_async_page(const struct lu_env *env, struct cl_io *io,
+			 struct osc_page *ops)
+{
+	struct osc_extent *ext   = NULL;
+	struct osc_object *obj   = cl2osc(ops->ops_cl.cpl_obj);
+	struct cl_page    *cp    = ops->ops_cl.cpl_page;
+	pgoff_t	    index = cp->cp_index;
+	struct osc_async_page *oap = &ops->ops_oap;
+	bool unplug = false;
+	int rc = 0;
+	ENTRY;
+
+	osc_object_lock(obj);
+	ext = osc_extent_lookup(obj, index);
+	/* a page being flushed must be covered by an extent */
+	if (ext == NULL) {
+		osc_extent_tree_dump(D_ERROR, obj);
+		LASSERTF(0, "page index %lu is NOT covered.\n", index);
+	}
+
+	switch (ext->oe_state) {
+	case OES_RPC:
+	case OES_LOCK_DONE:
+		CL_PAGE_DEBUG(D_ERROR, env, cl_page_top(cp),
+			      "flush an in-rpc page?\n");
+		LASSERT(0);
+		break;
+	case OES_LOCKING:
+		/* If we know this extent is being written out, we should abort
+		 * so that the writer can make this page ready. Otherwise, there
+		 * exists a deadlock problem because other process can wait for
+		 * page writeback bit holding page lock; and meanwhile in
+		 * vvp_page_make_ready(), we need to grab page lock before
+		 * really sending the RPC. */
+		/* fall through: handled the same way as OES_TRUNC */
+	case OES_TRUNC:
+		/* race with truncate, page will be redirtied */
+		GOTO(out, rc = -EAGAIN);
+	default:
+		break;
+	}
+
+	rc = cl_page_prep(env, io, cl_page_top(cp), CRT_WRITE);
+	if (rc)
+		GOTO(out, rc);
+
+	spin_lock(&oap->oap_lock);
+	oap->oap_async_flags |= ASYNC_READY|ASYNC_URGENT;
+	spin_unlock(&oap->oap_lock);
+
+	if (memory_pressure_get())
+		ext->oe_memalloc = 1;
+
+	ext->oe_urgent = 1;
+	if (ext->oe_state == OES_CACHE) {
+		OSC_EXTENT_DUMP(D_CACHE, ext,
+				"flush page %p make it urgent.\n", oap);
+		/* queue on the urgent list unless already on some list */
+		if (list_empty(&ext->oe_link))
+			list_add_tail(&ext->oe_link, &obj->oo_urgent_exts);
+		unplug = true;
+	}
+	rc = 0;
+	EXIT;
+
+out:
+	osc_object_unlock(obj);
+	/* drop the reference obtained from osc_extent_lookup() */
+	osc_extent_put(env, ext);
+	if (unplug)
+		osc_io_unplug_async(env, osc_cli(obj), obj);
+	return rc;
+}
+
+/**
+ * this is called when a sync waiter receives an interruption.  Its job is to
+ * get the caller woken as soon as possible.  If its page hasn't been put in an
+ * rpc yet it can dequeue immediately.  Otherwise it has to mark the rpc as
+ * desiring interruption which will forcefully complete the rpc once the rpc
+ * has timed out.
+ *
+ * \return 0 if the covering extent was still queued and has been finished
+ *	   with -EINTR; -EBUSY otherwise.
+ */
+int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops)
+{
+	struct osc_async_page *oap = &ops->ops_oap;
+	struct osc_object     *obj = oap->oap_obj;
+	struct client_obd     *cli = osc_cli(obj);
+	struct osc_extent     *ext;
+	struct osc_extent     *found = NULL;
+	struct list_head	    *plist;
+	pgoff_t index = oap2cl_page(oap)->cp_index;
+	int     rc = -EBUSY;
+	int     cmd;
+	ENTRY;
+
+	LASSERT(!oap->oap_interrupted);
+	oap->oap_interrupted = 1;
+
+	/* Find out the caching extent */
+	osc_object_lock(obj);
+	/* sync writes are searched on the urgent list, sync reads on the
+	 * reading list */
+	if (oap->oap_cmd & OBD_BRW_WRITE) {
+		plist = &obj->oo_urgent_exts;
+		cmd   = OBD_BRW_WRITE;
+	} else {
+		plist = &obj->oo_reading_exts;
+		cmd   = OBD_BRW_READ;
+	}
+	list_for_each_entry(ext, plist, oe_link) {
+		if (ext->oe_start <= index && ext->oe_end >= index) {
+			LASSERT(ext->oe_state == OES_LOCK_DONE);
+			/* For OES_LOCK_DONE state extent, it has already held
+			 * a refcount for RPC. */
+			found = osc_extent_get(ext);
+			break;
+		}
+	}
+	if (found != NULL) {
+		/* not sent yet: dequeue it and complete it as interrupted */
+		list_del_init(&found->oe_link);
+		osc_update_pending(obj, cmd, -found->oe_nr_pages);
+		osc_object_unlock(obj);
+
+		osc_extent_finish(env, found, 0, -EINTR);
+		osc_extent_put(env, found);
+		rc = 0;
+	} else {
+		osc_object_unlock(obj);
+		/* ok, it's been put in an rpc. only one oap gets a request
+		 * reference */
+		if (oap->oap_request != NULL) {
+			ptlrpc_mark_interrupted(oap->oap_request);
+			ptlrpcd_wake(oap->oap_request);
+			ptlrpc_req_finished(oap->oap_request);
+			oap->oap_request = NULL;
+		}
+	}
+
+	osc_list_maint(cli, obj);
+	RETURN(rc);
+}
+
+/**
+ * Wrap the async pages on \a list into one new extent, queue that extent
+ * for IO on \a obj and unplug the client.
+ *
+ * \param list      pages linked through oap_pending_item; spliced into the
+ *		    new extent on success, completed with -ENOMEM and
+ *		    unlinked on allocation failure
+ * \param cmd       OBD_BRW_WRITE queues on the urgent list, anything else
+ *		    on the reading list
+ * \param brw_flags OBD_BRW_SRVLOCK selects the extent's srvlock mode
+ *
+ * \return 0 on success, -ENOMEM if the extent cannot be allocated.
+ */
+int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj,
+			 struct list_head *list, int cmd, int brw_flags)
+{
+	struct client_obd     *cli = osc_cli(obj);
+	struct osc_extent     *ext;
+	struct osc_async_page *oap;
+	struct osc_async_page *tmp;
+	int     page_count = 0;
+	int     mppr       = cli->cl_max_pages_per_rpc;
+	pgoff_t start      = CL_PAGE_EOF;
+	pgoff_t end	= 0;
+	ENTRY;
+
+	/* compute the index range covered by the pages and their count */
+	list_for_each_entry(oap, list, oap_pending_item) {
+		struct cl_page *cp = oap2cl_page(oap);
+		if (cp->cp_index > end)
+			end = cp->cp_index;
+		if (cp->cp_index < start)
+			start = cp->cp_index;
+		++page_count;
+		/* double the per-RPC limit whenever the count exceeds it */
+		mppr <<= (page_count > mppr);
+	}
+
+	ext = osc_extent_alloc(obj);
+	if (ext == NULL) {
+		/* Entries are unlinked while walking, so the _safe iterator
+		 * is required: after list_del_init() the entry points to
+		 * itself and the plain iterator would never reach the list
+		 * head again. */
+		list_for_each_entry_safe(oap, tmp, list, oap_pending_item) {
+			list_del_init(&oap->oap_pending_item);
+			osc_ap_completion(env, cli, oap, 0, -ENOMEM);
+		}
+		RETURN(-ENOMEM);
+	}
+
+	ext->oe_rw = !!(cmd & OBD_BRW_READ);
+	ext->oe_urgent = 1;
+	ext->oe_start = start;
+	ext->oe_end = ext->oe_max_end = end;
+	ext->oe_obj = obj;
+	ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK);
+	ext->oe_nr_pages = page_count;
+	ext->oe_mppr = mppr;
+	list_splice_init(list, &ext->oe_pages);
+
+	osc_object_lock(obj);
+	/* Reuse the initial refcount for RPC, don't drop it */
+	osc_extent_state_set(ext, OES_LOCK_DONE);
+	if (cmd & OBD_BRW_WRITE) {
+		list_add_tail(&ext->oe_link, &obj->oo_urgent_exts);
+		osc_update_pending(obj, OBD_BRW_WRITE, page_count);
+	} else {
+		list_add_tail(&ext->oe_link, &obj->oo_reading_exts);
+		osc_update_pending(obj, OBD_BRW_READ, page_count);
+	}
+	osc_object_unlock(obj);
+
+	osc_io_unplug(env, cli, obj, PDL_POLICY_ROUND);
+	RETURN(0);
+}
+
+/**
+ * Called by osc_io_setattr_start() to freeze and destroy covering extents.
+ *
+ * Extents at or beyond the page index of \a size are collected under the
+ * object lock and then truncated one by one. When a busy extent (beyond
+ * OES_CACHE state, or urgent) is met, collection stops; after the collected
+ * extents have been processed the busy extent is waited for and the scan
+ * restarts. A partially truncated extent is kept in OES_TRUNC state and
+ * remembered in \a oio->oi_trunc.
+ *
+ * \return 0 on success, otherwise the first error returned by
+ *	   osc_extent_truncate().
+ */
+int osc_cache_truncate_start(const struct lu_env *env, struct osc_io *oio,
+			     struct osc_object *obj, __u64 size)
+{
+	struct client_obd *cli = osc_cli(obj);
+	struct osc_extent *ext;
+	struct osc_extent *waiting = NULL;
+	pgoff_t index;
+	LIST_HEAD(list);
+	int result = 0;
+	bool partial;
+	ENTRY;
+
+	/* pages with index greater or equal to index will be truncated. */
+	index = cl_index(osc2cl(obj), size);
+	partial = size > cl_offset(osc2cl(obj), index);
+
+again:
+	osc_object_lock(obj);
+	ext = osc_extent_search(obj, index);
+	if (ext == NULL)
+		ext = first_extent(obj);
+	else if (ext->oe_end < index)
+		ext = next_extent(ext);
+	while (ext != NULL) {
+		EASSERT(ext->oe_state != OES_TRUNC, ext);
+
+		if (ext->oe_state > OES_CACHE || ext->oe_urgent) {
+			/* if ext is in urgent state, it means there must exist
+			 * a page already having been flushed by write_page().
+			 * We have to wait for this extent because we can't
+			 * truncate that page. */
+			LASSERT(!ext->oe_hp);
+			OSC_EXTENT_DUMP(D_CACHE, ext,
+					"waiting for busy extent\n");
+			waiting = osc_extent_get(ext);
+			break;
+		}
+
+		OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:"LPU64".\n", size);
+
+		osc_extent_get(ext);
+		if (ext->oe_state == OES_ACTIVE) {
+			/* though we grab inode mutex for write path, but we
+			 * release it before releasing extent(in osc_io_end()),
+			 * so there is a race window that an extent is still
+			 * in OES_ACTIVE when truncate starts. */
+			LASSERT(!ext->oe_trunc_pending);
+			ext->oe_trunc_pending = 1;
+		} else {
+			EASSERT(ext->oe_state == OES_CACHE, ext);
+			osc_extent_state_set(ext, OES_TRUNC);
+			osc_update_pending(obj, OBD_BRW_WRITE,
+					   -ext->oe_nr_pages);
+		}
+		EASSERT(list_empty(&ext->oe_link), ext);
+		list_add_tail(&ext->oe_link, &list);
+
+		ext = next_extent(ext);
+	}
+	osc_object_unlock(obj);
+
+	osc_list_maint(cli, obj);
+
+	while (!list_empty(&list)) {
+		int rc;
+
+		ext = list_entry(list.next, struct osc_extent, oe_link);
+		list_del_init(&ext->oe_link);
+
+		/* extent may be in OES_ACTIVE state because inode mutex
+		 * is released before osc_io_end() in file write case */
+		if (ext->oe_state != OES_TRUNC)
+			osc_extent_wait(env, ext, OES_TRUNC);
+
+		rc = osc_extent_truncate(ext, index, partial);
+		if (rc < 0) {
+			if (result == 0)
+				result = rc;
+
+			OSC_EXTENT_DUMP(D_ERROR, ext,
+					"truncate error %d\n", rc);
+		} else if (ext->oe_nr_pages == 0) {
+			osc_extent_remove(ext);
+		} else {
+			/* this must be an overlapped extent which means only
+			 * part of pages in this extent have been truncated.
+			 */
+			EASSERTF(ext->oe_start <= index, ext,
+				 "trunc index = %lu/%d.\n", index, partial);
+			/* fix index to skip this partially truncated extent */
+			index = ext->oe_end + 1;
+			partial = false;
+
+			/* we need to hold this extent in OES_TRUNC state so
+			 * that no writeback will happen. This is to avoid
+			 * BUG 17397. */
+			LASSERT(oio->oi_trunc == NULL);
+			oio->oi_trunc = osc_extent_get(ext);
+			OSC_EXTENT_DUMP(D_CACHE, ext,
+					"trunc at "LPU64"\n", size);
+		}
+		/* drop the reference taken while collecting */
+		osc_extent_put(env, ext);
+	}
+	if (waiting != NULL) {
+		int rc;
+
+		/* ignore the result of osc_extent_wait the write initiator
+		 * should take care of it. */
+		rc = osc_extent_wait(env, waiting, OES_INV);
+		if (rc < 0)
+			/* dump the extent that was waited for; at this point
+			 * 'ext' is either NULL or an extent whose reference
+			 * has already been dropped above */
+			OSC_EXTENT_DUMP(D_CACHE, waiting,
+					"wait error: %d.\n", rc);
+
+		osc_extent_put(env, waiting);
+		waiting = NULL;
+		goto again;
+	}
+	RETURN(result);
+}
+
+/**
+ * Called after osc_io_setattr_end to add oio->oi_trunc back to cache.
+ *
+ * The extent remembered in oio->oi_trunc (if any) goes from OES_TRUNC back
+ * to OES_CACHE, its pages are counted as pending writes again and the
+ * reference held through oi_trunc is dropped. If an fsync is waiting on the
+ * extent it is made urgent and IO is unplugged asynchronously.
+ */
+void osc_cache_truncate_end(const struct lu_env *env, struct osc_io *oio,
+			    struct osc_object *obj)
+{
+	struct osc_extent *ext = oio->oi_trunc;
+	bool unplug = false;
+
+	oio->oi_trunc = NULL;
+	/* nothing was left half-truncated */
+	if (ext == NULL)
+		return;
+
+	EASSERT(ext->oe_nr_pages > 0, ext);
+	EASSERT(ext->oe_state == OES_TRUNC, ext);
+	EASSERT(!ext->oe_urgent, ext);
+
+	OSC_EXTENT_DUMP(D_CACHE, ext, "trunc -> cache.\n");
+
+	osc_object_lock(obj);
+	osc_extent_state_set(ext, OES_CACHE);
+	if (ext->oe_fsync_wait && !ext->oe_urgent) {
+		ext->oe_urgent = 1;
+		list_move_tail(&ext->oe_link, &obj->oo_urgent_exts);
+		unplug = true;
+	}
+	osc_update_pending(obj, OBD_BRW_WRITE, ext->oe_nr_pages);
+	osc_object_unlock(obj);
+
+	osc_extent_put(env, ext);
+
+	if (unplug)
+		osc_io_unplug_async(env, osc_cli(obj), obj);
+}
+
+/**
+ * Wait for extents in a specific range to be written out.
+ * The caller must have called osc_cache_writeback_range() to issue IO
+ * otherwise it will take a long time for this function to finish.
+ *
+ * Caller must hold inode_mutex , or cancel exclusive dlm lock so that
+ * nobody else can dirty this range of file while we're waiting for
+ * extents to be written.
+ *
+ * Only extents with oe_fsync_wait set are waited for.
+ *
+ * \return 0, or the first non-zero result of osc_extent_wait().
+ */
+int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj,
+			 pgoff_t start, pgoff_t end)
+{
+	struct osc_extent *ext;
+	pgoff_t index = start;
+	int     result = 0;
+	ENTRY;
+
+again:
+	osc_object_lock(obj);
+	/* locate the first extent at or after index */
+	ext = osc_extent_search(obj, index);
+	if (ext == NULL)
+		ext = first_extent(obj);
+	else if (ext->oe_end < index)
+		ext = next_extent(ext);
+	while (ext != NULL) {
+		int rc;
+
+		if (ext->oe_start > end)
+			break;
+
+		if (!ext->oe_fsync_wait) {
+			ext = next_extent(ext);
+			continue;
+		}
+
+		EASSERT(ergo(ext->oe_state == OES_CACHE,
+			     ext->oe_hp || ext->oe_urgent), ext);
+		EASSERT(ergo(ext->oe_state == OES_ACTIVE,
+			     !ext->oe_hp && ext->oe_urgent), ext);
+
+		/* remember where to resume, then wait without the object
+		 * lock; the reference keeps the extent alive meanwhile */
+		index = ext->oe_end + 1;
+		osc_extent_get(ext);
+		osc_object_unlock(obj);
+
+		rc = osc_extent_wait(env, ext, OES_INV);
+		if (result == 0)
+			result = rc;
+		osc_extent_put(env, ext);
+		/* the lock was dropped, so rescan from the new index */
+		goto again;
+	}
+	osc_object_unlock(obj);
+
+	OSC_IO_DEBUG(obj, "sync file range.\n");
+	RETURN(result);
+}
+
+/**
+ * Called to write out a range of osc object.
+ *
+ * @hp     : should be set this is caused by lock cancel;
+ * @discard: is set if dirty pages should be dropped - file will be deleted or
+ *	   truncated, this implies there is no partially discarding extents.
+ *
+ * When @hp or @discard is set the function also waits for the range through
+ * osc_cache_wait_range().
+ *
+ * Return how many pages will be issued, or error code if error occurred.
+ */
+int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj,
+			      pgoff_t start, pgoff_t end, int hp, int discard)
+{
+	struct osc_extent *ext;
+	LIST_HEAD(discard_list);
+	bool unplug = false;
+	int result = 0;
+	ENTRY;
+
+	osc_object_lock(obj);
+	/* locate the first extent at or after start */
+	ext = osc_extent_search(obj, start);
+	if (ext == NULL)
+		ext = first_extent(obj);
+	else if (ext->oe_end < start)
+		ext = next_extent(ext);
+	while (ext != NULL) {
+		if (ext->oe_start > end)
+			break;
+
+		/* mark the extent so that osc_cache_wait_range() waits for
+		 * it */
+		ext->oe_fsync_wait = 1;
+		switch (ext->oe_state) {
+		case OES_CACHE:
+			result += ext->oe_nr_pages;
+			if (!discard) {
+				struct list_head *list = NULL;
+				if (hp) {
+					EASSERT(!ext->oe_hp, ext);
+					ext->oe_hp = 1;
+					list = &obj->oo_hp_exts;
+				} else if (!ext->oe_urgent) {
+					ext->oe_urgent = 1;
+					list = &obj->oo_urgent_exts;
+				}
+				if (list != NULL)
+					list_move_tail(&ext->oe_link, list);
+				unplug = true;
+			} else {
+				/* the only discarder is lock cancelling, so
+				 * [start, end] must contain this extent */
+				EASSERT(ext->oe_start >= start &&
+					ext->oe_max_end <= end, ext);
+				osc_extent_state_set(ext, OES_LOCKING);
+				ext->oe_owner = current;
+				list_move_tail(&ext->oe_link,
+						   &discard_list);
+				osc_update_pending(obj, OBD_BRW_WRITE,
+						   -ext->oe_nr_pages);
+			}
+			break;
+		case OES_ACTIVE:
+			/* It's pretty bad to wait for ACTIVE extents, because
+			 * we don't know how long we will wait for it to be
+			 * flushed since it may be blocked at awaiting more
+			 * grants. We do this for the correctness of fsync. */
+			LASSERT(hp == 0 && discard == 0);
+			ext->oe_urgent = 1;
+			break;
+		case OES_TRUNC:
+			/* this extent is being truncated, can't do anything
+			 * for it now. it will be set to urgent after truncate
+			 * is finished in osc_cache_truncate_end(). */
+		default:
+			break;
+		}
+		ext = next_extent(ext);
+	}
+	osc_object_unlock(obj);
+
+	LASSERT(ergo(!discard, list_empty(&discard_list)));
+	if (!list_empty(&discard_list)) {
+		struct osc_extent *tmp;
+		int rc;
+
+		osc_list_maint(osc_cli(obj), obj);
+		list_for_each_entry_safe(ext, tmp, &discard_list, oe_link) {
+			list_del_init(&ext->oe_link);
+			EASSERT(ext->oe_state == OES_LOCKING, ext);
+
+			/* Discard caching pages. We don't actually write this
+			 * extent out but we complete it as if we did. */
+			rc = osc_extent_make_ready(env, ext);
+			if (unlikely(rc < 0)) {
+				OSC_EXTENT_DUMP(D_ERROR, ext,
+						"make_ready returned %d\n", rc);
+				/* keep the first error only */
+				if (result >= 0)
+					result = rc;
+			}
+
+			/* finish the extent as if the pages were sent */
+			osc_extent_finish(env, ext, 0, 0);
+		}
+	}
+
+	if (unplug)
+		osc_io_unplug(env, osc_cli(obj), obj, PDL_POLICY_ROUND);
+
+	if (hp || discard) {
+		int rc;
+		rc = osc_cache_wait_range(env, obj, start, end);
+		/* a wait error replaces the page count but not an earlier
+		 * error */
+		if (result >= 0 && rc < 0)
+			result = rc;
+	}
+
+	OSC_IO_DEBUG(obj, "cache page out.\n");
+	RETURN(result);
+}
+
+/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_cl_internal.h b/drivers/staging/lustre/lustre/osc/osc_cl_internal.h
new file mode 100644
index 000000000000..001a9c84ab8e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_cl_internal.h
@@ -0,0 +1,679 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Internal interfaces of OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#ifndef OSC_CL_INTERNAL_H
+#define OSC_CL_INTERNAL_H
+
+# include <linux/libcfs/libcfs.h>
+
+#include <obd.h>
+/* osc_build_res_name() */
+#include <obd_ost.h>
+#include <cl_object.h>
+#include <lclient.h>
+#include "osc_internal.h"
+
+/** \defgroup osc osc
+ *  @{
+ */
+
+struct osc_extent;
+
+/**
+ * State maintained by osc layer for each IO context.
+ */
+struct osc_io {
+	/** super class: the osc layer's slice of the cl_io */
+	struct cl_io_slice oi_cl;
+	/** true if this io is lockless; consulted by osc_io_srvlock(). */
+	int		oi_lockless;
+	/** active extent: the number of bytes to be written is known, so
+	 * holding an active extent prevents it from being fragmented */
+	struct osc_extent *oi_active;
+	/** partially truncated extent; this extent is held to prevent
+	 * page writeback from happening. */
+	struct osc_extent *oi_trunc;
+
+	struct obd_info    oi_info;
+	struct obdo	oi_oa;
+	struct osc_async_cbargs {
+		bool		  opc_rpc_sent;
+		int	       opc_rc;
+		struct completion	opc_sync;
+	} oi_cbarg;
+};
+
+/**
+ * Per-transfer state of the osc layer; holds only this layer's cl_req slice.
+ */
+struct osc_req {
+	struct cl_req_slice    or_cl;
+};
+
+/**
+ * State the osc layer keeps for one system call; see osc_env_io() for os_io.
+ */
+struct osc_session {
+	struct osc_io       os_io;
+};
+
+#define OTI_PVEC_SIZE 64	/* number of slots in osc_thread_info::oti_pvec */
+struct osc_thread_info {	/* fetched with osc_env_info() under osc_key */
+	struct ldlm_res_id      oti_resname;
+	ldlm_policy_data_t      oti_policy;
+	struct cl_lock_descr    oti_descr;
+	struct cl_attr	  oti_attr;
+	struct lustre_handle    oti_handle;
+	struct cl_page_list     oti_plist;
+	struct cl_io		oti_io;
+	struct cl_page	       *oti_pvec[OTI_PVEC_SIZE];
+};
+
+struct osc_object {
+	struct cl_object   oo_cl;
+	struct lov_oinfo  *oo_oinfo;
+	/**
+	 * True if locking against this stripe got -EUSERS.
+	 */
+	int		oo_contended;
+	cfs_time_t	 oo_contention_time;
+	/**
+	 * Lists of pages in transfer; an array of CRT_NR list heads.
+	 */
+	struct list_head	 oo_inflight[CRT_NR];
+	/**
+	 * Lock protecting the in-flight lists (the original comment names
+	 * ccc_object::cob_inflight; presumably oo_inflight is meant -- confirm).
+	 */
+	spinlock_t	   oo_seatbelt;
+
+	/**
+	 * Linkage used by the osc to track which objects to build into RPCs.
+	 * Protected by client_obd->cli_loi_list_lock.
+	 */
+	struct list_head	   oo_ready_item;
+	struct list_head	   oo_hp_ready_item;
+	struct list_head	   oo_write_item;
+	struct list_head	   oo_read_item;
+
+	/**
+	 * Root of the red-black tree of extents managing (async) dirty pages.
+	 */
+	struct rb_root       oo_root;
+	/**
+	 * Lists used to manage write (dirty) extents.
+	 */
+	struct list_head	   oo_hp_exts; /* list of hp extents */
+	struct list_head	   oo_urgent_exts; /* list of writeback extents */
+	struct list_head	   oo_rpc_exts;
+
+	struct list_head	   oo_reading_exts;
+
+	atomic_t	 oo_nr_reads;
+	atomic_t	 oo_nr_writes;
+
+	/** Protects the extent tree (see osc_object_lock()). Will be used to
+	 * protect oo_{read|write}_pages soon. */
+	spinlock_t	    oo_lock;
+};
+
+static inline void osc_object_lock(struct osc_object *obj)
+{
+	spin_lock(&obj->oo_lock);	/* oo_lock protects the extent tree */
+}
+
+static inline int osc_object_trylock(struct osc_object *obj)
+{
+	return spin_trylock(&obj->oo_lock);	/* nonzero iff lock was taken */
+}
+
+static inline void osc_object_unlock(struct osc_object *obj)
+{
+	spin_unlock(&obj->oo_lock);	/* releases osc_object::oo_lock */
+}
+
+static inline int osc_object_is_locked(struct osc_object *obj)
+{	/* NOTE(review): spin_is_locked() is constant 0 on UP w/o lock debug */
+	return spin_is_locked(&obj->oo_lock);
+}
+
+/*
+ * Lock "micro-states" for osc layer; kept in osc_lock::ols_state.
+ */
+enum osc_lock_state {
+	OLS_NEW,
+	OLS_ENQUEUED,
+	OLS_UPCALL_RECEIVED,
+	OLS_GRANTED,
+	OLS_RELEASED,
+	OLS_BLOCKED,
+	OLS_CANCELLED
+};
+
+/**
+ * osc-private state of cl_lock.
+ *
+ * Interaction with DLM.
+ *
+ * CLIO enqueues all DLM locks through ptlrpcd (that is, in "async" mode).
+ *
+ * Once receive upcall is invoked, osc_lock remembers a handle of DLM lock in
+ * osc_lock::ols_handle and a pointer to that lock in osc_lock::ols_lock.
+ *
+ * This pointer is protected through a reference, acquired by
+ * osc_lock_upcall0(). Also, an additional reference is acquired by
+ * ldlm_lock_addref() call protecting the lock from cancellation, until
+ * osc_lock_unuse() releases it.
+ *
+ * Below is a description of how lock references are acquired and released
+ * inside of DLM.
+ *
+ * - When new lock is created and enqueued to the server (ldlm_cli_enqueue())
+ *      - ldlm_lock_create()
+ *	  - ldlm_lock_new(): initializes a lock with 2 references. One for
+ *	    the caller (released when reply from the server is received, or on
+ *	    error), and another for the hash table.
+ *      - ldlm_lock_addref_internal(): protects the lock from cancellation.
+ *
+ * - When reply is received from the server (osc_enqueue_interpret())
+ *      - ldlm_cli_enqueue_fini()
+ *	  - LDLM_LOCK_PUT(): releases caller reference acquired by
+ *	    ldlm_lock_new().
+ *	  - if (rc != 0)
+ *		ldlm_lock_decref(): error case: matches ldlm_cli_enqueue().
+ *      - ldlm_lock_decref(): for async locks, matches ldlm_cli_enqueue().
+ *
+ * - When lock is being cancelled (ldlm_lock_cancel())
+ *      - ldlm_lock_destroy()
+ *	  - LDLM_LOCK_PUT(): releases hash-table reference acquired by
+ *	    ldlm_lock_new().
+ *
+ * osc_lock is detached from ldlm_lock by osc_lock_detach() that is called
+ * either when lock is cancelled (osc_lock_blocking()), or when the lock is
+ * deleted without cancellation (e.g., from cl_locks_prune()). In the latter
+ * case ldlm lock remains in memory, and can be re-attached to osc_lock in the
+ * future.
+ */
+struct osc_lock {
+	struct cl_lock_slice     ols_cl;
+	/** underlying DLM lock */
+	struct ldlm_lock	*ols_lock;
+	/** lock value block */
+	struct ost_lvb	   ols_lvb;
+	/** DLM flags with which osc_lock::ols_lock was enqueued */
+	__u64		    ols_flags;
+	/** osc_lock::ols_lock handle */
+	struct lustre_handle     ols_handle;
+	struct ldlm_enqueue_info ols_einfo;
+	enum osc_lock_state      ols_state;
+
+	/**
+	 * How many pages are using this lock for io, currently only used by
+	 * read-ahead. If non-zero, the underlying dlm lock won't be cancelled
+	 * during recovery to avoid deadlock. see bz16774.
+	 *
+	 * \see osc_page::ops_lock
+	 * \see osc_page_addref_lock(), osc_page_putref_lock()
+	 */
+	atomic_t	     ols_pageref;
+
+	/**
+	 * true, if ldlm_lock_addref() was called against
+	 * osc_lock::ols_lock. This is used for sanity checking.
+	 *
+	 * \see osc_lock::ols_has_ref
+	 */
+	unsigned		  ols_hold :1,
+	/**
+	 * this is much like osc_lock::ols_hold, except that this bit is
+	 * cleared _after_ reference is released in osc_lock_unuse(). This
+	 * fine distinction is needed because:
+	 *
+	 *     - if ldlm lock still has a reference, osc_ast_data_get() needs
+	 *       to return associated cl_lock (so that a flag is needed that is
+	 *       cleared after ldlm_lock_decref() returned), and
+	 *
+	 *     - ldlm_lock_decref() can invoke blocking ast (for a
+	 *       LDLM_FL_CBPENDING lock), and osc_lock functions like
+	 *       osc_lock_cancel() called from there need to know whether to
+	 *       release lock reference (so that a flag is needed that is
+	 *       cleared before ldlm_lock_decref() is called).
+	 */
+				 ols_has_ref:1,
+	/**
+	 * inherit the lockless attribute from top level cl_io.
+	 * If true, osc_lock_enqueue is able to tolerate the -EUSERS error.
+	 */
+				 ols_locklessable:1,
+	/**
+	 * set by osc_lock_use() to wait until blocking AST enters into
+	 * osc_ldlm_blocking_ast0(), so that cl_lock mutex can be used for
+	 * further synchronization.
+	 */
+				 ols_ast_wait:1,
+	/**
+	 * Set if the data of this lock has been flushed to server side.
+	 */
+				 ols_flush:1,
+	/**
+	 * if set, the osc_lock is a glimpse lock. For glimpse locks, we treat
+	 * the EVAVAIL error as tolerable, this will make upper logic happy
+	 * to wait for all glimpse locks to each OST to be completed.
+	 * Glimpse lock converts to normal lock if the server lock is
+	 * granted.
+	 * Glimpse lock should be destroyed immediately after use.
+	 */
+				 ols_glimpse:1,
+	/**
+	 * For async glimpse lock.
+	 */
+				 ols_agl:1;
+	/**
+	 * IO that owns this lock. This field is used for dead-lock
+	 * avoidance by osc_lock_enqueue_wait().
+	 *
+	 * XXX: unfortunately, the owner of an osc_lock is not unique,
+	 * the lock may have multiple users, if the lock is granted and
+	 * then matched.
+	 */
+	struct osc_io	   *ols_owner;
+};
+
+
+/**
+ * Page state private for osc layer.
+ */
+struct osc_page {
+	struct cl_page_slice  ops_cl;
+	/**
+	 * Page queues used by osc to detect when RPC can be formed.
+	 */
+	struct osc_async_page ops_oap;
+	/**
+	 * An offset within page from which next transfer starts. This is used
+	 * by cl_page_clip() to submit partial page transfers.
+	 */
+	int		   ops_from;
+	/**
+	 * An offset within page at which next transfer ends.
+	 *
+	 * \see osc_page::ops_from.
+	 */
+	int		   ops_to;
+	/**
+	 * Boolean, true iff page is under transfer. Used for sanity checking.
+	 */
+	unsigned	      ops_transfer_pinned:1,
+	/**
+	 * True for a `temporary page' created by read-ahead code, probably
+	 * outside of any DLM lock.
+	 */
+			      ops_temp:1,
+	/**
+	 * True while the page is linked into the LRU through ops_lru.
+	 */
+			      ops_in_lru:1,
+	/**
+	 * Set if the page must be transferred with OBD_BRW_SRVLOCK.
+	 */
+			      ops_srvlock:1;
+	union {
+		/**
+		 * lru page list. ops_inflight and ops_lru are mutually
+		 * exclusive, so they can share the same storage.
+		 */
+		struct list_head	      ops_lru;
+		/**
+		 * Linkage into a per-osc_object list of pages in flight. For
+		 * debugging.
+		 */
+		struct list_head	    ops_inflight;
+	};
+	/**
+	 * Thread that submitted this page for transfer. For debugging.
+	 */
+	task_t	   *ops_submitter;
+	/**
+	 * Submit time - the time when the page is starting RPC. For debugging.
+	 */
+	cfs_time_t	    ops_submit_time;
+
+	/**
+	 * A lock of which we hold a reference covers this page. Only used by
+	 * read-ahead: for a readahead page, we hold its covering lock to
+	 * prevent it from being canceled during recovery.
+	 *
+	 * \see osc_lock::ols_pageref
+	 * \see osc_page_addref_lock(), osc_page_putref_lock().
+	 */
+	struct cl_lock       *ops_lock;
+};
+
+extern struct kmem_cache *osc_lock_kmem;
+extern struct kmem_cache *osc_object_kmem;
+extern struct kmem_cache *osc_thread_kmem;
+extern struct kmem_cache *osc_session_kmem;
+extern struct kmem_cache *osc_req_kmem;
+extern struct kmem_cache *osc_extent_kmem;
+
+extern struct lu_device_type osc_device_type;
+extern struct lu_context_key osc_key;
+extern struct lu_context_key osc_session_key;
+
+#define OSC_FLAGS (ASYNC_URGENT|ASYNC_READY)
+
+/* Constructors of per-layer osc state for lock, io, req, object, page. */
+int osc_lock_init(const struct lu_env *env, struct cl_object *obj,
+		  struct cl_lock *lock, const struct cl_io *io);
+int osc_io_init(const struct lu_env *env, struct cl_object *obj,
+		struct cl_io *io);
+int osc_req_init(const struct lu_env *env, struct cl_device *dev,
+		 struct cl_req *req);
+struct lu_object *osc_object_alloc(const struct lu_env *env,
+				   const struct lu_object_header *hdr,
+				   struct lu_device *dev);
+int osc_page_init(const struct lu_env *env, struct cl_object *obj,
+		  struct cl_page *page, struct page *vmpage);
+
+void osc_index2policy(ldlm_policy_data_t *policy, const struct cl_object *obj,
+		      pgoff_t start, pgoff_t end);
+int osc_lvb_print(const struct lu_env *env, void *cookie,
+		  lu_printer_t p, const struct ost_lvb *lvb);
+
+void osc_page_submit(const struct lu_env *env, struct osc_page *opg,
+		     enum cl_req_type crt, int brw_flags);
+int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops);
+int osc_set_async_flags(struct osc_object *obj, struct osc_page *opg,
+			obd_flag async_flags);
+int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops,
+			struct page *page, loff_t offset);
+int osc_queue_async_io(const struct lu_env *env, struct cl_io *io,
+		       struct osc_page *ops);
+int osc_teardown_async_page(const struct lu_env *env, struct osc_object *obj,
+			    struct osc_page *ops);
+int osc_flush_async_page(const struct lu_env *env, struct cl_io *io,
+			 struct osc_page *ops);
+int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj,
+			 struct list_head *list, int cmd, int brw_flags);
+int osc_cache_truncate_start(const struct lu_env *env, struct osc_io *oio,
+			     struct osc_object *obj, __u64 size);
+void osc_cache_truncate_end(const struct lu_env *env, struct osc_io *oio,
+			    struct osc_object *obj);
+int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj,
+			      pgoff_t start, pgoff_t end, int hp, int discard);
+int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj,
+			 pgoff_t start, pgoff_t end);
+void osc_io_unplug(const struct lu_env *env, struct client_obd *cli,
+		   struct osc_object *osc, pdl_policy_t pol);
+
+void osc_object_set_contended(struct osc_object *obj);
+void osc_object_clear_contended(struct osc_object *obj);
+int osc_object_is_contended(struct osc_object *obj);
+
+int osc_lock_is_lockless(const struct osc_lock *olck);
+
+/*****************************************************************************
+ *
+ * Accessors.
+ *
+ */
+
+static inline struct osc_thread_info *osc_env_info(const struct lu_env *env)
+{
+	struct osc_thread_info *info = lu_context_key_get(&env->le_ctx,
+							  &osc_key);
+
+	LASSERT(info != NULL);	/* the osc_key value must exist in env */
+	return info;
+}
+
+static inline struct osc_session *osc_env_session(const struct lu_env *env)
+{
+	struct osc_session *ses = lu_context_key_get(env->le_ses,
+						     &osc_session_key);
+
+	LASSERT(ses != NULL);	/* the session value must exist in env */
+	return ses;
+}
+
+static inline struct osc_io *osc_env_io(const struct lu_env *env)
+{
+	return &osc_env_session(env)->os_io;	/* embedded in the session */
+}
+
+static inline int osc_is_object(const struct lu_object *obj)
+{	/* true iff obj's device has type osc_device_type */
+	return obj->lo_dev->ld_type == &osc_device_type;
+}
+
+static inline struct osc_device *lu2osc_dev(const struct lu_device *d)
+{
+	LINVRNT(d->ld_type == &osc_device_type);	/* d must be an osc device */
+	return container_of0(d, struct osc_device, od_cl.cd_lu_dev);
+}
+
+static inline struct obd_export *osc_export(const struct osc_object *obj)
+{	/* od_exp of the osc_device that obj's lu_object points at */
+	return lu2osc_dev(obj->oo_cl.co_lu.lo_dev)->od_exp;
+}
+
+static inline struct client_obd *osc_cli(const struct osc_object *obj)
+{	/* client_obd embedded in the obd_device behind obj's export */
+	return &osc_export(obj)->exp_obd->u.cli;
+}
+
+static inline struct osc_object *cl2osc(const struct cl_object *obj)
+{
+	LINVRNT(osc_is_object(&obj->co_lu));	/* obj must be an osc object */
+	return container_of0(obj, struct osc_object, oo_cl);
+}
+
+static inline struct cl_object *osc2cl(const struct osc_object *obj)
+{
+	return (struct cl_object *)&obj->oo_cl;	/* cast drops const */
+}
+
+/* Map a cl_lock mode onto the matching DLM lock mode. */
+static inline ldlm_mode_t osc_cl_lock2ldlm(enum cl_lock_mode mode)
+{
+	LASSERT(mode == CLM_READ || mode == CLM_WRITE || mode == CLM_GROUP);
+
+	if (mode == CLM_READ)
+		return LCK_PR;
+	/* the assertion above leaves only CLM_WRITE and CLM_GROUP */
+	return mode == CLM_WRITE ? LCK_PW : LCK_GROUP;
+}
+
+/* Map a DLM lock mode back onto the matching cl_lock mode. */
+static inline enum cl_lock_mode osc_ldlm2cl_lock(ldlm_mode_t mode)
+{
+	LASSERT(mode == LCK_PR || mode == LCK_PW || mode == LCK_GROUP);
+
+	if (mode == LCK_PR)
+		return CLM_READ;
+	/* the assertion above leaves only LCK_PW and LCK_GROUP */
+	return mode == LCK_PW ? CLM_WRITE : CLM_GROUP;
+}
+
+static inline struct osc_page *cl2osc_page(const struct cl_page_slice *slice)
+{	/* slice must belong to an osc object; returns the enclosing osc_page */
+	LINVRNT(osc_is_object(&slice->cpl_obj->co_lu));
+	return container_of0(slice, struct osc_page, ops_cl);
+}
+
+static inline struct osc_page *oap2osc(struct osc_async_page *oap)
+{	/* osc_page that embeds oap as its ops_oap member */
+	return container_of0(oap, struct osc_page, ops_oap);
+}
+
+static inline struct cl_page *oap2cl_page(struct osc_async_page *oap)
+{	/* cpl_page of the osc_page slice that embeds oap */
+	return oap2osc(oap)->ops_cl.cpl_page;
+}
+
+static inline struct osc_page *oap2osc_page(struct osc_async_page *oap)
+{	/* like oap2osc(), but built on plain container_of() */
+	return container_of(oap, struct osc_page, ops_oap);
+}
+
+static inline struct osc_lock *cl2osc_lock(const struct cl_lock_slice *slice)
+{	/* slice must belong to an osc object; returns the enclosing osc_lock */
+	LINVRNT(osc_is_object(&slice->cls_obj->co_lu));
+	return container_of0(slice, struct osc_lock, ols_cl);
+}
+
+static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock)
+{	/* osc_lock for the slice cl_lock_at() finds for osc_device_type */
+	return cl2osc_lock(cl_lock_at(lock, &osc_device_type));
+}
+
+static inline int osc_io_srvlock(struct osc_io *oio)
+{	/* lockless io whose cl_io has ci_no_srvlock clear */
+	return oio->oi_lockless && !oio->oi_cl.cis_io->ci_no_srvlock;
+}
+
+enum osc_extent_state {
+	OES_INV       = 0, /** extent is just initialized or destroyed */
+	OES_ACTIVE    = 1, /** process is using this extent */
+	OES_CACHE     = 2, /** extent is ready for IO */
+	OES_LOCKING   = 3, /** locking page to prepare IO */
+	OES_LOCK_DONE = 4, /** locking finished, ready to send */
+	OES_RPC       = 5, /** in RPC */
+	OES_TRUNC     = 6, /** being truncated */
+	OES_STATE_MAX	   /** number of states; keep OES_STRINGS in step */
+};
+#define OES_STRINGS { "inv", "active", "cache", "locking", "lockdone", "rpc", \
+		      "trunc", NULL }
+
+/**
+ * osc_extent data to manage dirty pages.
+ * osc_extent has the following attributes:
+ * 1. all pages in the same extent must be in one RPC in write back;
+ * 2. # of pages must be less than max_pages_per_rpc - implied by 1;
+ * 3. must be covered by only 1 osc_lock;
+ * 4. exclusive. It's impossible to have overlapped osc_extent.
+ *
+ * The lifetime of an extent is from when the 1st page is dirtied to when
+ * all pages inside it are written out.
+ *
+ * LOCKING ORDER
+ * =============
+ * page lock -> client_obd_list_lock -> object lock(osc_object::oo_lock)
+ */
+struct osc_extent {
+	/** red-black tree node; the tree is keyed by oe_start */
+	struct rb_node     oe_node;
+	/** osc_object of this extent */
+	struct osc_object *oe_obj;
+	/** refcount, removed from red-black tree if reaches zero. */
+	atomic_t       oe_refc;
+	/** busy if non-zero */
+	atomic_t       oe_users;
+	/** linkage into one of osc_object's extent lists (oo_*_exts). */
+	struct list_head	 oe_link;
+	/** state of this extent, an enum osc_extent_state value */
+	unsigned int       oe_state;
+	/** flags for this extent. */
+	unsigned int       oe_intree:1,
+	/** 0 is write, 1 is read */
+			   oe_rw:1,
+			   oe_srvlock:1,
+			   oe_memalloc:1,
+	/** an ACTIVE extent is going to be truncated, so when this extent
+	 * is released, it will turn into TRUNC state instead of CACHE. */
+			   oe_trunc_pending:1,
+	/** this extent should be written asap and someone may wait for the
+	 * write to finish. This bit is usually set along with urgent if
+	 * the extent was in CACHE state.
+	 * An fsync_wait extent can't be merged because the new extent region
+	 * may exceed the fsync range. */
+			   oe_fsync_wait:1,
+	/** covering lock is being canceled */
+			   oe_hp:1,
+	/** this extent should be written back asap. set if one of pages is
+	 * called by page WB daemon, or sync write or reading requests. */
+			   oe_urgent:1;
+	/** how many grants are allocated for this extent.
+	 *  There is no grant allocated for reading extents and sync write
+	 *  extents. */
+	unsigned int       oe_grants;
+	/** # of dirty pages in this extent */
+	unsigned int       oe_nr_pages;
+	/** list of pending oap pages. Pages in this list are NOT sorted. */
+	struct list_head	 oe_pages;
+	/** Since an extent has to be written out atomically, this is used to
+	 * remember the next page that needs to be locked to write it out.
+	 * Not used right now.
+	 */
+	struct osc_page   *oe_next_page;
+	/** start and end index of this extent, inclusive of both ends.
+	 * Page offset here is the page index of osc_pages.
+	 * oe_start is used as the key for the red-black tree. */
+	pgoff_t	    oe_start;
+	pgoff_t	    oe_end;
+	/** maximum ending index of this extent, this is limited by
+	 * max_pages_per_rpc, lock extent and chunk size. */
+	pgoff_t	    oe_max_end;
+	/** waitqueue - for those who want to be notified if this extent's
+	 * state has changed. */
+	wait_queue_head_t	oe_waitq;
+	/** lock covering this extent */
+	struct cl_lock    *oe_osclock;
+	/** terminator of this extent. Must be set if this extent is in IO. */
+	task_t	*oe_owner;
+	/** return value of writeback. If somebody is waiting for this extent,
+	 * this value can be known by outside world. */
+	int		oe_rc;
+	/** max pages per rpc when this extent was created */
+	unsigned int       oe_mppr;
+};
+
+int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
+		      int sent, int rc);
+int osc_extent_release(const struct lu_env *env, struct osc_extent *ext);
+
+/** @} osc */
+
+#endif /* OSC_CL_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/osc/osc_dev.c b/drivers/staging/lustre/lustre/osc/osc_dev.c
new file mode 100644
index 000000000000..4208ddfd73b3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_dev.c
@@ -0,0 +1,261 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_device, cl_req for OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+/* class_name2obd() */
+#include <obd_class.h>
+
+#include "osc_cl_internal.h"
+
+/** \addtogroup osc
+ * @{
+ */
+
+struct kmem_cache *osc_lock_kmem;	/* slab for struct osc_lock */
+struct kmem_cache *osc_object_kmem;	/* slab for struct osc_object */
+struct kmem_cache *osc_thread_kmem;	/* slab for struct osc_thread_info */
+struct kmem_cache *osc_session_kmem;	/* slab for struct osc_session */
+struct kmem_cache *osc_req_kmem;	/* slab for struct osc_req */
+struct kmem_cache *osc_extent_kmem;	/* slab for struct osc_extent */
+struct kmem_cache *osc_quota_kmem;	/* slab for struct osc_quota_info */
+
+/* Descriptors of the osc slab caches: cache pointer, name, object size.
+ * The trailing entry with a NULL ckd_cache presumably ends the table. */
+struct lu_kmem_descr osc_caches[] = {
+	{
+		.ckd_cache = &osc_lock_kmem,
+		.ckd_name  = "osc_lock_kmem",
+		.ckd_size  = sizeof(struct osc_lock)
+	},
+	{
+		.ckd_cache = &osc_object_kmem,
+		.ckd_name  = "osc_object_kmem",
+		.ckd_size  = sizeof(struct osc_object)
+	},
+	{
+		.ckd_cache = &osc_thread_kmem,
+		.ckd_name  = "osc_thread_kmem",
+		.ckd_size  = sizeof(struct osc_thread_info)
+	},
+	{
+		.ckd_cache = &osc_session_kmem,
+		.ckd_name  = "osc_session_kmem",
+		.ckd_size  = sizeof(struct osc_session)
+	},
+	{
+		.ckd_cache = &osc_req_kmem,
+		.ckd_name  = "osc_req_kmem",
+		.ckd_size  = sizeof(struct osc_req)
+	},
+	{
+		.ckd_cache = &osc_extent_kmem,
+		.ckd_name  = "osc_extent_kmem",
+		.ckd_size  = sizeof(struct osc_extent)
+	},
+	{
+		.ckd_cache = &osc_quota_kmem,
+		.ckd_name  = "osc_quota_kmem",
+		.ckd_size  = sizeof(struct osc_quota_info)
+	},
+	{ .ckd_cache = NULL }
+};
+
+struct lock_class_key osc_ast_guard_class;
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+static struct lu_device *osc2lu_dev(struct osc_device *osc)
+{	/* lu_device embedded in osc's cl_device; inverse of lu2osc_dev() */
+	return &osc->od_cl.cd_lu_dev;
+}
+
+/*****************************************************************************
+ *
+ * Osc device and device type functions.
+ *
+ */
+
+/* lct_init hook of osc_key: allocate a context's osc_thread_info. */
+static void *osc_key_init(const struct lu_context *ctx,
+			  struct lu_context_key *key)
+{
+	struct osc_thread_info *info;
+
+	/* GFP_NOFS: bare __GFP_IO lacks __GFP_WAIT and so cannot reclaim */
+	OBD_SLAB_ALLOC_PTR_GFP(info, osc_thread_kmem, GFP_NOFS);
+	return info != NULL ? info : ERR_PTR(-ENOMEM);
+}
+
+static void osc_key_fini(const struct lu_context *ctx,
+			 struct lu_context_key *key, void *data)
+{	/* lct_fini hook of osc_key: returns data to osc_thread_kmem */
+	struct osc_thread_info *info = data;
+	OBD_SLAB_FREE_PTR(info, osc_thread_kmem);
+}
+
+struct lu_context_key osc_key = {	/* key read by osc_env_info() */
+	.lct_tags = LCT_CL_THREAD,
+	.lct_init = osc_key_init,
+	.lct_fini = osc_key_fini
+};
+
+/* lct_init hook of osc_session_key: allocate a session's osc_session. */
+static void *osc_session_init(const struct lu_context *ctx,
+			      struct lu_context_key *key)
+{
+	struct osc_session *info;
+
+	/* GFP_NOFS: bare __GFP_IO lacks __GFP_WAIT and so cannot reclaim */
+	OBD_SLAB_ALLOC_PTR_GFP(info, osc_session_kmem, GFP_NOFS);
+	return info != NULL ? info : ERR_PTR(-ENOMEM);
+}
+
+static void osc_session_fini(const struct lu_context *ctx,
+			     struct lu_context_key *key, void *data)
+{	/* lct_fini hook of osc_session_key: returns data to osc_session_kmem */
+	struct osc_session *info = data;
+	OBD_SLAB_FREE_PTR(info, osc_session_kmem);
+}
+
+struct lu_context_key osc_session_key = { /* key read by osc_env_session() */
+	.lct_tags = LCT_SESSION,
+	.lct_init = osc_session_init,
+	.lct_fini = osc_session_fini
+};
+
+/* type constructor/destructor: osc_type_{init,fini,start,stop}(). */
+LU_TYPE_INIT_FINI(osc, &osc_key, &osc_session_key);
+
+static int osc_cl_process_config(const struct lu_env *env,
+				 struct lu_device *d, struct lustre_cfg *cfg)
+{	/* ldo_process_config hook: hands cfg to the obd behind d */
+	ENTRY;
+	RETURN(osc_process_config_base(d->ld_obd, cfg));
+}
+
+static const struct lu_device_operations osc_lu_ops = {
+	.ldo_object_alloc      = osc_object_alloc,
+	.ldo_process_config    = osc_cl_process_config,
+	.ldo_recovery_complete = NULL	/* no recovery-complete hook */
+};
+
+static const struct cl_device_operations osc_cl_ops = {
+	.cdo_req_init = osc_req_init	/* set as cd_ops in osc_device_alloc() */
+};
+
+static int osc_device_init(const struct lu_env *env, struct lu_device *d,
+			   const char *name, struct lu_device *next)
+{
+	return 0;	/* nothing to set up; plain return as ENTRY is not used */
+}
+
+static struct lu_device *osc_device_fini(const struct lu_env *env,
+					 struct lu_device *d)
+{
+	return NULL;	/* no-op; pointer return, so NULL rather than a bare 0 */
+}
+
+static struct lu_device *osc_device_free(const struct lu_env *env,
+					 struct lu_device *d)
+{
+	struct osc_device *od = lu2osc_dev(d);	/* enclosing osc_device */
+
+	cl_device_fini(lu2cl_dev(d));	/* finalize before the memory goes */
+	OBD_FREE_PTR(od);
+	return NULL;
+}
+
+static struct lu_device *osc_device_alloc(const struct lu_env *env,
+					  struct lu_device_type *t,
+					  struct lustre_cfg *cfg)
+{
+	struct lu_device *d;
+	struct osc_device *od;
+	struct obd_device *obd;
+	int rc;
+
+	OBD_ALLOC_PTR(od);
+	if (od == NULL)
+		return ERR_PTR(-ENOMEM);	/* plain return: no ENTRY here */
+
+	cl_device_init(&od->od_cl, t);
+	d = osc2lu_dev(od);
+	d->ld_ops = &osc_lu_ops;
+	od->od_cl.cd_ops = &osc_cl_ops;
+
+	/* Set up the OSC obd named by cfg string 0; free the device on error */
+	obd = class_name2obd(lustre_cfg_string(cfg, 0));
+	LASSERT(obd != NULL);
+	rc = osc_setup(obd, cfg);
+	if (rc) {
+		osc_device_free(env, d);
+		return ERR_PTR(rc);
+	}
+	od->od_exp = obd->obd_self_export;
+	return d;
+}
+
+static const struct lu_device_type_operations osc_device_type_ops = {
+	.ldto_init = osc_type_init,	/* osc_type_*(): LU_TYPE_INIT_FINI(osc) */
+	.ldto_fini = osc_type_fini,
+
+	.ldto_start = osc_type_start,
+	.ldto_stop  = osc_type_stop,
+
+	.ldto_device_alloc = osc_device_alloc,
+	.ldto_device_free  = osc_device_free,
+
+	.ldto_device_init    = osc_device_init,
+	.ldto_device_fini    = osc_device_fini
+};
+
+struct lu_device_type osc_device_type = {
+	.ldt_tags     = LU_DEVICE_CL,
+	.ldt_name     = LUSTRE_OSC_NAME,
+	.ldt_ops      = &osc_device_type_ops,
+	.ldt_ctx_tags = LCT_CL_THREAD	/* same tag as osc_key.lct_tags */
+};
+
+/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_internal.h b/drivers/staging/lustre/lustre/osc/osc_internal.h
new file mode 100644
index 000000000000..5343da2fa87a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_internal.h
@@ -0,0 +1,210 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef OSC_INTERNAL_H
+#define OSC_INTERNAL_H
+
+#define OAP_MAGIC 8675309
+
+struct lu_env;
+
+enum async_flags {
+	ASYNC_READY = 0x1, /* ap_make_ready will not be called before this
+			      page is added to an rpc */
+	ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */
+	ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called
+				     to give the caller a chance to update
+				     or cancel the size of the io */
+	ASYNC_HP = 0x10,
+};
+
+struct osc_async_page {
+	int		     oap_magic;
+	unsigned short	  oap_cmd;
+	unsigned short	  oap_interrupted:1;
+
+	struct list_head	      oap_pending_item;
+	struct list_head	      oap_rpc_item;
+
+	obd_off		 oap_obj_off;
+	unsigned		oap_page_off;
+	enum async_flags	oap_async_flags;
+
+	struct brw_page	 oap_brw_page;
+
+	struct ptlrpc_request   *oap_request;
+	struct client_obd       *oap_cli;
+	struct osc_object       *oap_obj;
+
+	struct ldlm_lock	*oap_ldlm_lock;
+	spinlock_t		 oap_lock;
+};
+
+#define oap_page	oap_brw_page.pg
+#define oap_count       oap_brw_page.count
+#define oap_brw_flags   oap_brw_page.flag
+
+struct osc_cache_waiter {
+	struct list_head	      ocw_entry;
+	wait_queue_head_t	     ocw_waitq;
+	struct osc_async_page  *ocw_oap;
+	int		     ocw_grant;
+	int		     ocw_rc;
+};
+
+int osc_create(const struct lu_env *env, struct obd_export *exp,
+	       struct obdo *oa, struct lov_stripe_md **ea,
+	       struct obd_trans_info *oti);
+int osc_real_create(struct obd_export *exp, struct obdo *oa,
+		    struct lov_stripe_md **ea, struct obd_trans_info *oti);
+void osc_wake_cache_waiters(struct client_obd *cli);
+int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes);
+void osc_update_next_shrink(struct client_obd *cli);
+
+/*
+ * cl integration.
+ */
+#include <cl_object.h>
+
+extern struct ptlrpc_request_set *PTLRPCD_SET;
+
+int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id,
+		     __u64 *flags, ldlm_policy_data_t *policy,
+		     struct ost_lvb *lvb, int kms_valid,
+		     obd_enqueue_update_f upcall,
+		     void *cookie, struct ldlm_enqueue_info *einfo,
+		     struct lustre_handle *lockh,
+		     struct ptlrpc_request_set *rqset, int async, int agl);
+int osc_cancel_base(struct lustre_handle *lockh, __u32 mode);
+
+int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id,
+		   __u32 type, ldlm_policy_data_t *policy, __u32 mode,
+		   int *flags, void *data, struct lustre_handle *lockh,
+		   int unref);
+
+int osc_setattr_async_base(struct obd_export *exp, struct obd_info *oinfo,
+			   struct obd_trans_info *oti,
+			   obd_enqueue_update_f upcall, void *cookie,
+			   struct ptlrpc_request_set *rqset);
+int osc_punch_base(struct obd_export *exp, struct obd_info *oinfo,
+		   obd_enqueue_update_f upcall, void *cookie,
+		   struct ptlrpc_request_set *rqset);
+int osc_sync_base(struct obd_export *exp, struct obd_info *oinfo,
+		  obd_enqueue_update_f upcall, void *cookie,
+		  struct ptlrpc_request_set *rqset);
+
+int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *cfg);
+int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
+		  struct list_head *ext_list, int cmd, pdl_policy_t p);
+int osc_lru_shrink(struct client_obd *cli, int target);
+
+extern spinlock_t osc_ast_guard;
+
+int osc_cleanup(struct obd_device *obd);
+int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
+
+#ifdef LPROCFS
+int lproc_osc_attach_seqstat(struct obd_device *dev);
+void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars);
+#else
+static inline int lproc_osc_attach_seqstat(struct obd_device *dev) {return 0;}
+static inline void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars)
+{
+	memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+
+extern struct lu_device_type osc_device_type;
+
+static inline int osc_recoverable_error(int rc)
+{
+	return (rc == -EIO || rc == -EROFS || rc == -ENOMEM ||
+		rc == -EAGAIN || rc == -EINPROGRESS);
+}
+
+static inline unsigned long rpcs_in_flight(struct client_obd *cli)
+{
+	return cli->cl_r_in_flight + cli->cl_w_in_flight;
+}
+
+#ifndef min_t
+#define min_t(type,x,y) \
+	({ type __x = (x); type __y = (y); __x < __y ? __x: __y; })
+#endif
+
+struct osc_device {
+	struct cl_device    od_cl;
+	struct obd_export  *od_exp;
+
+	/* Write stats is actually protected by client_obd's lock. */
+	struct osc_stats {
+		uint64_t     os_lockless_writes;	  /* by bytes */
+		uint64_t     os_lockless_reads;	   /* by bytes */
+		uint64_t     os_lockless_truncates;       /* by times */
+	} od_stats;
+
+	/* configuration item(s) */
+	int		 od_contention_time;
+	int		 od_lockless_truncate;
+};
+
+static inline struct osc_device *obd2osc_dev(const struct obd_device *d)
+{
+	return container_of0(d->obd_lu_dev, struct osc_device, od_cl.cd_lu_dev);
+}
+
+int osc_dlm_lock_pageref(struct ldlm_lock *dlm);
+
+extern struct kmem_cache *osc_quota_kmem;
+struct osc_quota_info {
+	/** linkage for quota hash table */
+	struct hlist_node oqi_hash;
+	obd_uid	  oqi_id;
+};
+int osc_quota_setup(struct obd_device *obd);
+int osc_quota_cleanup(struct obd_device *obd);
+int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[],
+		    obd_flag valid, obd_flag flags);
+int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[]);
+int osc_quotactl(struct obd_device *unused, struct obd_export *exp,
+		 struct obd_quotactl *oqctl);
+int osc_quotacheck(struct obd_device *unused, struct obd_export *exp,
+		   struct obd_quotactl *oqctl);
+int osc_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk);
+
+void osc_inc_unstable_pages(struct ptlrpc_request *req);
+void osc_dec_unstable_pages(struct ptlrpc_request *req);
+#endif /* OSC_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/osc/osc_io.c b/drivers/staging/lustre/lustre/osc/osc_io.c
new file mode 100644
index 000000000000..1b277045b3e4
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_io.c
@@ -0,0 +1,836 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_io for OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include "osc_cl_internal.h"
+
+/** \addtogroup osc
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+static struct osc_req *cl2osc_req(const struct cl_req_slice *slice)
+{
+	LINVRNT(slice->crs_dev->cd_lu_dev.ld_type == &osc_device_type);
+	return container_of0(slice, struct osc_req, or_cl);
+}
+
+static struct osc_io *cl2osc_io(const struct lu_env *env,
+				const struct cl_io_slice *slice)
+{
+	struct osc_io *oio = container_of0(slice, struct osc_io, oi_cl);
+	LINVRNT(oio == osc_env_io(env));
+	return oio;
+}
+
+static struct osc_page *osc_cl_page_osc(struct cl_page *page)
+{
+	const struct cl_page_slice *slice;
+
+	slice = cl_page_at(page, &osc_device_type);
+	LASSERT(slice != NULL);
+
+	return cl2osc_page(slice);
+}
+
+
+/*****************************************************************************
+ *
+ * io operations.
+ *
+ */
+
+static void osc_io_fini(const struct lu_env *env, const struct cl_io_slice *io)
+{
+}
+
+/**
+ * An implementation of cl_io_operations::req_op[]::cio_submit() method for
+ * osc layer. Iterates over pages in the in-queue, prepares each for io by
+ * calling cl_page_prep(), moves it to the out-queue, calls osc_page_submit()
+ * on it and passes the pages to osc_queue_sync_pages() in batches of
+ * cl_max_pages_per_rpc. Stops at a page already on an osc list (-EBUSY).
+ */
+static int osc_io_submit(const struct lu_env *env,
+			 const struct cl_io_slice *ios,
+			 enum cl_req_type crt, struct cl_2queue *queue)
+{
+	struct cl_page    *page;
+	struct cl_page    *tmp;
+	struct client_obd *cli  = NULL;
+	struct osc_object *osc  = NULL; /* to keep gcc happy */
+	struct osc_page   *opg;
+	struct cl_io      *io;
+	LIST_HEAD     (list);
+
+	struct cl_page_list *qin      = &queue->c2_qin;
+	struct cl_page_list *qout     = &queue->c2_qout;
+	int queued = 0;
+	int result = 0;
+	int cmd;
+	int brw_flags;
+	int max_pages;
+
+	LASSERT(qin->pl_nr > 0);
+
+	CDEBUG(D_CACHE, "%d %d\n", qin->pl_nr, crt);
+
+	osc = cl2osc(ios->cis_obj);
+	cli = osc_cli(osc);
+	max_pages = cli->cl_max_pages_per_rpc;
+
+	cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ;
+	brw_flags = osc_io_srvlock(cl2osc_io(env, ios)) ? OBD_BRW_SRVLOCK : 0;
+
+	/*
+	 * NOTE: here @page is a top-level page. This is done to avoid
+	 *       creation of sub-page-list.
+	 */
+	cl_page_list_for_each_safe(page, tmp, qin) {
+		struct osc_async_page *oap;
+
+		/* Top level IO. */
+		io = page->cp_owner;
+		LASSERT(io != NULL);
+
+		opg = osc_cl_page_osc(page);
+		oap = &opg->ops_oap;
+		LASSERT(osc == oap->oap_obj);
+
+		if (!list_empty(&oap->oap_pending_item) ||
+		    !list_empty(&oap->oap_rpc_item)) {
+			CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n",
+			       oap, opg);
+			result = -EBUSY;
+			break;
+		}
+
+		result = cl_page_prep(env, io, page, crt);
+		if (result != 0) {
+			LASSERT(result < 0);
+			if (result != -EALREADY)
+				break;
+			/*
+			 * Handle -EALREADY error: for read case, the page is
+			 * already in UPTODATE state; for write, the page
+			 * is not dirty.
+			 */
+			result = 0;
+			continue;
+		}
+
+		cl_page_list_move(qout, qin, page);
+		oap->oap_async_flags = ASYNC_URGENT|ASYNC_READY;
+		oap->oap_async_flags |= ASYNC_COUNT_STABLE;
+
+		osc_page_submit(env, opg, crt, brw_flags);
+		list_add_tail(&oap->oap_pending_item, &list);
+		if (++queued == max_pages) {
+			queued = 0;
+			result = osc_queue_sync_pages(env, osc, &list, cmd,
+						      brw_flags);
+			if (result < 0)
+				break;
+		}
+	}
+
+	if (queued > 0)
+		result = osc_queue_sync_pages(env, osc, &list, cmd, brw_flags);
+
+	CDEBUG(D_INFO, "%d/%d %d\n", qin->pl_nr, qout->pl_nr, result);
+	return qout->pl_nr > 0 ? 0 : result;
+}
+
+static void osc_page_touch_at(const struct lu_env *env,
+			      struct cl_object *obj, pgoff_t idx, unsigned to)
+{
+	struct lov_oinfo  *loi  = cl2osc(obj)->oo_oinfo;
+	struct cl_attr    *attr = &osc_env_info(env)->oti_attr;
+	int valid;
+	__u64 kms;
+
+	/* offset within stripe */
+	kms = cl_offset(obj, idx) + to;
+
+	cl_object_attr_lock(obj);
+	/*
+	 * XXX old code used
+	 *
+	 *	 ll_inode_size_lock(inode, 0); lov_stripe_lock(lsm);
+	 *
+	 * here
+	 */
+	CDEBUG(D_INODE, "stripe KMS %sincreasing "LPU64"->"LPU64" "LPU64"\n",
+	       kms > loi->loi_kms ? "" : "not ", loi->loi_kms, kms,
+	       loi->loi_lvb.lvb_size);
+
+	valid = 0;
+	if (kms > loi->loi_kms) {
+		attr->cat_kms = kms;
+		valid |= CAT_KMS;
+	}
+	if (kms > loi->loi_lvb.lvb_size) {
+		attr->cat_size = kms;
+		valid |= CAT_SIZE;
+	}
+	cl_object_attr_set(env, obj, attr, valid);
+	cl_object_attr_unlock(obj);
+}
+
+/**
+ * This is called when a page is accessed within file in a way that creates
+ * new page, if one were missing (i.e., if there were a hole at that place in
+ * the file, or accessed page is beyond the current file size). Examples:
+ * ->commit_write() and ->nopage() methods.
+ *
+ * Expand stripe KMS if necessary.
+ */
+static void osc_page_touch(const struct lu_env *env,
+			   struct osc_page *opage, unsigned to)
+{
+	struct cl_page    *page = opage->ops_cl.cpl_page;
+	struct cl_object  *obj  = opage->ops_cl.cpl_obj;
+
+	osc_page_touch_at(env, obj, page->cp_index, to);
+}
+
+/**
+ * Implements cl_io_operations::cio_prepare_write() method for osc layer.
+ *
+ * \retval -EIO transfer initiated against this osc will most likely fail
+ * \retval 0    transfer initiated against this osc will most likely succeed.
+ *
+ * The reason for this check is to immediately return an error to the caller
+ * in the case of a deactivated import. Note, that import can be deactivated
+ * later, while pages, dirtied by this IO, are still in the cache, but this is
+ * irrelevant, because that would still return an error to the application (if
+ * it does fsync), but many applications don't do fsync because of performance
+ * issues, and we wanted to return an -EIO at write time to notify the
+ * application.
+ */
+static int osc_io_prepare_write(const struct lu_env *env,
+				const struct cl_io_slice *ios,
+				const struct cl_page_slice *slice,
+				unsigned from, unsigned to)
+{
+	struct osc_device *dev = lu2osc_dev(slice->cpl_obj->co_lu.lo_dev);
+	struct obd_import *imp = class_exp2cliimp(dev->od_exp);
+	struct osc_io     *oio = cl2osc_io(env, ios);
+	int result = 0;
+	ENTRY;
+
+	/*
+	 * This implements OBD_BRW_CHECK logic from old client.
+	 */
+
+	if (imp == NULL || imp->imp_invalid)
+		result = -EIO;
+	if (result == 0 && oio->oi_lockless)
+		/* this page contains `invalid' data, but who cares?
+		 * nobody can access the invalid data.
+		 * in osc_io_commit_write(), we're going to write exact
+		 * [from, to) bytes of this page to OST. -jay */
+		cl_page_export(env, slice->cpl_page, 1);
+
+	RETURN(result);
+}
+
+static int osc_io_commit_write(const struct lu_env *env,
+			       const struct cl_io_slice *ios,
+			       const struct cl_page_slice *slice,
+			       unsigned from, unsigned to)
+{
+	struct osc_io	 *oio = cl2osc_io(env, ios);
+	struct osc_page       *opg = cl2osc_page(slice);
+	struct osc_object     *obj = cl2osc(opg->ops_cl.cpl_obj);
+	struct osc_async_page *oap = &opg->ops_oap;
+	ENTRY;
+
+	LASSERT(to > 0);
+	/*
+	 * XXX instead of calling osc_page_touch() here and in
+	 * osc_io_fault_start() it might be more logical to introduce
+	 * cl_page_touch() method, that generic cl_io_commit_write() and page
+	 * fault code calls.
+	 */
+	osc_page_touch(env, cl2osc_page(slice), to);
+	if (!client_is_remote(osc_export(obj)) &&
+	    cfs_capable(CFS_CAP_SYS_RESOURCE))
+		oap->oap_brw_flags |= OBD_BRW_NOQUOTA;
+
+	if (oio->oi_lockless)
+		/* see osc_io_prepare_write() for lockless io handling. */
+		cl_page_clip(env, slice->cpl_page, from, to);
+
+	RETURN(0);
+}
+
+static int osc_io_fault_start(const struct lu_env *env,
+			      const struct cl_io_slice *ios)
+{
+	struct cl_io       *io;
+	struct cl_fault_io *fio;
+
+	ENTRY;
+
+	io  = ios->cis_io;
+	fio = &io->u.ci_fault;
+	CDEBUG(D_INFO, "%lu %d %d\n",
+	       fio->ft_index, fio->ft_writable, fio->ft_nob);
+	/*
+	 * If mapping is writeable, adjust kms to cover this page,
+	 * but do not extend kms beyond actual file size.
+	 * See bug 10919.
+	 */
+	if (fio->ft_writable)
+		osc_page_touch_at(env, ios->cis_obj,
+				  fio->ft_index, fio->ft_nob);
+	RETURN(0);
+}
+
+static int osc_async_upcall(void *a, int rc)
+{
+	struct osc_async_cbargs *args = a;
+
+	args->opc_rc = rc;
+	complete(&args->opc_sync);
+	return 0;
+}
+
+/**
+ * Checks that there are no pages being written in the extent being truncated.
+ */
+static int trunc_check_cb(const struct lu_env *env, struct cl_io *io,
+			  struct cl_page *page, void *cbdata)
+{
+	const struct cl_page_slice *slice;
+	struct osc_page *ops;
+	struct osc_async_page *oap;
+	__u64 start = *(__u64 *)cbdata;
+
+	slice = cl_page_at(page, &osc_device_type);
+	LASSERT(slice != NULL);
+	ops = cl2osc_page(slice);
+	oap = &ops->ops_oap;
+
+	if (oap->oap_cmd & OBD_BRW_WRITE &&
+	    !list_empty(&oap->oap_pending_item))
+		CL_PAGE_DEBUG(D_ERROR, env, page, "exists " LPU64 "/%s.\n",
+				start, current->comm);
+
+	{
+		struct page *vmpage = cl_page_vmpage(env, page);
+		if (PageLocked(vmpage))
+			CDEBUG(D_CACHE, "page %p index %lu locked for %d.\n",
+			       ops, page->cp_index,
+			       (oap->oap_cmd & OBD_BRW_RWMASK));
+	}
+
+	return CLP_GANG_OKAY;
+}
+
+static void osc_trunc_check(const struct lu_env *env, struct cl_io *io,
+			    struct osc_io *oio, __u64 size)
+{
+	struct cl_object *clob;
+	int     partial;
+	pgoff_t start;
+
+	clob    = oio->oi_cl.cis_obj;
+	start   = cl_index(clob, size);
+	partial = cl_offset(clob, start) < size;
+
+	/*
+	 * Complain if there are pages in the truncated region.
+	 */
+	cl_page_gang_lookup(env, clob, io, start + partial, CL_PAGE_EOF,
+			    trunc_check_cb, (void *)&size);
+}
+
+static int osc_io_setattr_start(const struct lu_env *env,
+				const struct cl_io_slice *slice)
+{
+	struct cl_io	    *io     = slice->cis_io;
+	struct osc_io	   *oio    = cl2osc_io(env, slice);
+	struct cl_object	*obj    = slice->cis_obj;
+	struct lov_oinfo	*loi    = cl2osc(obj)->oo_oinfo;
+	struct cl_attr	  *attr   = &osc_env_info(env)->oti_attr;
+	struct obdo	     *oa     = &oio->oi_oa;
+	struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
+	__u64		    size   = io->u.ci_setattr.sa_attr.lvb_size;
+	unsigned int	     ia_valid = io->u.ci_setattr.sa_valid;
+	int		      result = 0;
+	struct obd_info	  oinfo = { { { 0 } } };
+
+	/* truncate cache dirty pages first */
+	if (cl_io_is_trunc(io))
+		result = osc_cache_truncate_start(env, oio, cl2osc(obj), size);
+
+	if (result == 0 && oio->oi_lockless == 0) {
+		cl_object_attr_lock(obj);
+		result = cl_object_attr_get(env, obj, attr);
+		if (result == 0) {
+			struct ost_lvb *lvb = &io->u.ci_setattr.sa_attr;
+			unsigned int cl_valid = 0;
+
+			if (ia_valid & ATTR_SIZE) {
+				attr->cat_size = attr->cat_kms = size;
+				cl_valid = (CAT_SIZE | CAT_KMS);
+			}
+			if (ia_valid & ATTR_MTIME_SET) {
+				attr->cat_mtime = lvb->lvb_mtime;
+				cl_valid |= CAT_MTIME;
+			}
+			if (ia_valid & ATTR_ATIME_SET) {
+				attr->cat_atime = lvb->lvb_atime;
+				cl_valid |= CAT_ATIME;
+			}
+			if (ia_valid & ATTR_CTIME_SET) {
+				attr->cat_ctime = lvb->lvb_ctime;
+				cl_valid |= CAT_CTIME;
+			}
+			result = cl_object_attr_set(env, obj, attr, cl_valid);
+		}
+		cl_object_attr_unlock(obj);
+	}
+	memset(oa, 0, sizeof(*oa));
+	if (result == 0) {
+		oa->o_oi = loi->loi_oi;
+		oa->o_mtime = attr->cat_mtime;
+		oa->o_atime = attr->cat_atime;
+		oa->o_ctime = attr->cat_ctime;
+		oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLATIME |
+			OBD_MD_FLCTIME | OBD_MD_FLMTIME;
+		if (ia_valid & ATTR_SIZE) {
+			oa->o_size = size;
+			oa->o_blocks = OBD_OBJECT_EOF;
+			oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+
+			if (oio->oi_lockless) {
+				oa->o_flags = OBD_FL_SRVLOCK;
+				oa->o_valid |= OBD_MD_FLFLAGS;
+			}
+		} else {
+			LASSERT(oio->oi_lockless == 0);
+		}
+
+		oinfo.oi_oa = oa;
+		oinfo.oi_capa = io->u.ci_setattr.sa_capa;
+		init_completion(&cbargs->opc_sync);
+
+		if (ia_valid & ATTR_SIZE)
+			result = osc_punch_base(osc_export(cl2osc(obj)),
+						&oinfo, osc_async_upcall,
+						cbargs, PTLRPCD_SET);
+		else
+			result = osc_setattr_async_base(osc_export(cl2osc(obj)),
+							&oinfo, NULL,
+							osc_async_upcall,
+							cbargs, PTLRPCD_SET);
+		cbargs->opc_rpc_sent = result == 0;
+	}
+	return result;
+}
+
+static void osc_io_setattr_end(const struct lu_env *env,
+			       const struct cl_io_slice *slice)
+{
+	struct cl_io     *io  = slice->cis_io;
+	struct osc_io    *oio = cl2osc_io(env, slice);
+	struct cl_object *obj = slice->cis_obj;
+	struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
+	int result = 0;
+
+	if (cbargs->opc_rpc_sent) {
+		wait_for_completion(&cbargs->opc_sync);
+		result = io->ci_result = cbargs->opc_rc;
+	}
+	if (result == 0) {
+		if (oio->oi_lockless) {
+			/* lockless truncate */
+			struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev);
+
+			LASSERT(cl_io_is_trunc(io));
+			/* XXX: Need a lock. */
+			osd->od_stats.os_lockless_truncates++;
+		}
+	}
+
+	if (cl_io_is_trunc(io)) {
+		__u64 size = io->u.ci_setattr.sa_attr.lvb_size;
+		osc_trunc_check(env, io, oio, size);
+		if (oio->oi_trunc != NULL) {
+			osc_cache_truncate_end(env, oio, cl2osc(obj));
+			oio->oi_trunc = NULL;
+		}
+	}
+}
+
+static int osc_io_read_start(const struct lu_env *env,
+			     const struct cl_io_slice *slice)
+{
+	struct osc_io    *oio   = cl2osc_io(env, slice);
+	struct cl_object *obj   = slice->cis_obj;
+	struct cl_attr   *attr  = &osc_env_info(env)->oti_attr;
+	int	      result = 0;
+	ENTRY;
+
+	if (oio->oi_lockless == 0) {
+		cl_object_attr_lock(obj);
+		result = cl_object_attr_get(env, obj, attr);
+		if (result == 0) {
+			attr->cat_atime = LTIME_S(CFS_CURRENT_TIME);
+			result = cl_object_attr_set(env, obj, attr,
+						    CAT_ATIME);
+		}
+		cl_object_attr_unlock(obj);
+	}
+	RETURN(result);
+}
+
+static int osc_io_write_start(const struct lu_env *env,
+			      const struct cl_io_slice *slice)
+{
+	struct osc_io    *oio   = cl2osc_io(env, slice);
+	struct cl_object *obj   = slice->cis_obj;
+	struct cl_attr   *attr  = &osc_env_info(env)->oti_attr;
+	int	      result = 0;
+	ENTRY;
+
+	if (oio->oi_lockless == 0) {
+		OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_SETTIME, 1);
+		cl_object_attr_lock(obj);
+		result = cl_object_attr_get(env, obj, attr);
+		if (result == 0) {
+			attr->cat_mtime = attr->cat_ctime =
+				LTIME_S(CFS_CURRENT_TIME);
+			result = cl_object_attr_set(env, obj, attr,
+						    CAT_MTIME | CAT_CTIME);
+		}
+		cl_object_attr_unlock(obj);
+	}
+	RETURN(result);
+}
+
+static int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj,
+			 struct cl_fsync_io *fio)
+{
+	struct osc_io    *oio   = osc_env_io(env);
+	struct obdo      *oa    = &oio->oi_oa;
+	struct obd_info  *oinfo = &oio->oi_info;
+	struct lov_oinfo *loi   = obj->oo_oinfo;
+	struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
+	int rc = 0;
+	ENTRY;
+
+	memset(oa, 0, sizeof(*oa));
+	oa->o_oi = loi->loi_oi;
+	oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
+
+	/* o_size and o_blocks carry the start and end of the sync range */
+	oa->o_size = fio->fi_start;
+	oa->o_blocks = fio->fi_end;
+	oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+
+	obdo_set_parent_fid(oa, fio->fi_fid);
+
+	memset(oinfo, 0, sizeof(*oinfo));
+	oinfo->oi_oa = oa;
+	oinfo->oi_capa = fio->fi_capa;
+	init_completion(&cbargs->opc_sync);
+
+	rc = osc_sync_base(osc_export(obj), oinfo, osc_async_upcall, cbargs,
+			   PTLRPCD_SET);
+	/* osc_io_fsync_end() waits on opc_sync only if the RPC was sent */
+	cbargs->opc_rpc_sent = rc == 0;
+	RETURN(rc);
+}
+
+static int osc_io_fsync_start(const struct lu_env *env,
+			      const struct cl_io_slice *slice)
+{
+	struct cl_io       *io  = slice->cis_io;
+	struct cl_fsync_io *fio = &io->u.ci_fsync;
+	struct cl_object   *obj = slice->cis_obj;
+	struct osc_object  *osc = cl2osc(obj);
+	pgoff_t start  = cl_index(obj, fio->fi_start);
+	pgoff_t end    = cl_index(obj, fio->fi_end);
+	int     result = 0;
+	ENTRY;
+
+	if (fio->fi_end == OBD_OBJECT_EOF)
+		end = CL_PAGE_EOF;
+
+	result = osc_cache_writeback_range(env, osc, start, end, 0,
+					   fio->fi_mode == CL_FSYNC_DISCARD);
+	if (result > 0) {
+		fio->fi_nr_written += result;
+		result = 0;
+	}
+	if (fio->fi_mode == CL_FSYNC_ALL) {
+		int rc;
+
+		/* we have to wait for writeback to finish before we can
+		 * send OST_SYNC RPC. This is bad because it causes extents
+		 * to be written osc by osc. However, we usually start
+		 * writeback before CL_FSYNC_ALL so this won't have any real
+		 * problem. */
+		rc = osc_cache_wait_range(env, osc, start, end);
+		if (result == 0)
+			result = rc;
+		rc = osc_fsync_ost(env, osc, fio);
+		if (result == 0)
+			result = rc;
+	}
+
+	RETURN(result);
+}
+
+static void osc_io_fsync_end(const struct lu_env *env,
+			     const struct cl_io_slice *slice)
+{
+	struct cl_fsync_io *fio = &slice->cis_io->u.ci_fsync;
+	struct cl_object   *obj = slice->cis_obj;
+	struct osc_async_cbargs *cbargs = &cl2osc_io(env, slice)->oi_cbarg;
+	pgoff_t start = cl_index(obj, fio->fi_start);
+	pgoff_t end   = cl_index(obj, fio->fi_end);
+	int result = 0;
+
+	if (fio->fi_mode == CL_FSYNC_LOCAL) {
+		result = osc_cache_wait_range(env, cl2osc(obj), start, end);
+	} else if (fio->fi_mode == CL_FSYNC_ALL && cbargs->opc_rpc_sent) {
+		/* no RPC sent means nobody would ever complete opc_sync */
+		wait_for_completion(&cbargs->opc_sync);
+		result = cbargs->opc_rc;
+	}
+	slice->cis_io->ci_result = result;
+}
+
+static void osc_io_end(const struct lu_env *env,
+		       const struct cl_io_slice *slice)
+{
+	struct osc_io *oio = cl2osc_io(env, slice);
+
+	if (oio->oi_active) {
+		osc_extent_release(env, oio->oi_active);
+		oio->oi_active = NULL;
+	}
+}
+
+static const struct cl_io_operations osc_io_ops = {
+	.op = {
+		[CIT_READ] = {
+			.cio_start  = osc_io_read_start,
+			.cio_fini   = osc_io_fini
+		},
+		[CIT_WRITE] = {
+			.cio_start  = osc_io_write_start,
+			.cio_end    = osc_io_end,
+			.cio_fini   = osc_io_fini
+		},
+		[CIT_SETATTR] = {
+			.cio_start  = osc_io_setattr_start,
+			.cio_end    = osc_io_setattr_end
+		},
+		[CIT_FAULT] = {
+			.cio_start  = osc_io_fault_start,
+			.cio_end    = osc_io_end,
+			.cio_fini   = osc_io_fini
+		},
+		[CIT_FSYNC] = {
+			.cio_start  = osc_io_fsync_start,
+			.cio_end    = osc_io_fsync_end,
+			.cio_fini   = osc_io_fini
+		},
+		[CIT_MISC] = {
+			.cio_fini   = osc_io_fini
+		}
+	},
+	.req_op = {
+		 [CRT_READ] = {
+			 .cio_submit    = osc_io_submit
+		 },
+		 [CRT_WRITE] = {
+			 .cio_submit    = osc_io_submit
+		 }
+	 },
+	.cio_prepare_write = osc_io_prepare_write,
+	.cio_commit_write  = osc_io_commit_write
+};
+
+/*****************************************************************************
+ *
+ * Transfer operations.
+ *
+ */
+
+static int osc_req_prep(const struct lu_env *env,
+			const struct cl_req_slice *slice)
+{
+	return 0;
+}
+
+static void osc_req_completion(const struct lu_env *env,
+			       const struct cl_req_slice *slice, int ioret)
+{
+	struct osc_req *or;
+
+	or = cl2osc_req(slice);
+	OBD_SLAB_FREE_PTR(or, osc_req_kmem);
+}
+
+/**
+ * Implementation of struct cl_req_operations::cro_attr_set() for osc
+ * layer. osc is responsible for struct obdo::o_id and struct obdo::o_seq
+ * fields.
+ */
+static void osc_req_attr_set(const struct lu_env *env,
+			     const struct cl_req_slice *slice,
+			     const struct cl_object *obj,
+			     struct cl_req_attr *attr, obd_valid flags)
+{
+	struct lov_oinfo *oinfo;
+	struct cl_req    *clerq;
+	struct cl_page   *apage; /* _some_ page in @clerq */
+	struct cl_lock   *lock;  /* _some_ lock protecting @apage */
+	struct osc_lock  *olck;
+	struct osc_page  *opg;
+	struct obdo      *oa;
+	struct ost_lvb   *lvb;
+
+	oinfo	= cl2osc(obj)->oo_oinfo;
+	lvb	= &oinfo->loi_lvb;
+	oa	= attr->cra_oa;
+
+	if ((flags & OBD_MD_FLMTIME) != 0) {
+		oa->o_mtime = lvb->lvb_mtime;
+		oa->o_valid |= OBD_MD_FLMTIME;
+	}
+	if ((flags & OBD_MD_FLATIME) != 0) {
+		oa->o_atime = lvb->lvb_atime;
+		oa->o_valid |= OBD_MD_FLATIME;
+	}
+	if ((flags & OBD_MD_FLCTIME) != 0) {
+		oa->o_ctime = lvb->lvb_ctime;
+		oa->o_valid |= OBD_MD_FLCTIME;
+	}
+	if (flags & OBD_MD_FLGROUP) {
+		ostid_set_seq(&oa->o_oi, ostid_seq(&oinfo->loi_oi));
+		oa->o_valid |= OBD_MD_FLGROUP;
+	}
+	if (flags & OBD_MD_FLID) {
+		ostid_set_id(&oa->o_oi, ostid_id(&oinfo->loi_oi));
+		oa->o_valid |= OBD_MD_FLID;
+	}
+	if (flags & OBD_MD_FLHANDLE) {
+		clerq = slice->crs_req;
+		LASSERT(!list_empty(&clerq->crq_pages));
+		apage = container_of(clerq->crq_pages.next,
+				     struct cl_page, cp_flight);
+		opg = osc_cl_page_osc(apage);
+		apage = opg->ops_cl.cpl_page; /* now apage is a sub-page */
+		lock = cl_lock_at_page(env, apage->cp_obj, apage, NULL, 1, 1);
+		if (lock == NULL) {
+			struct cl_object_header *head;
+			struct cl_lock	  *scan;
+
+			head = cl_object_header(apage->cp_obj);
+			list_for_each_entry(scan, &head->coh_locks,
+						cll_linkage)
+				CL_LOCK_DEBUG(D_ERROR, env, scan,
+					      "no cover page!\n");
+			CL_PAGE_DEBUG(D_ERROR, env, apage,
+				      "dump uncover page!\n");
+			libcfs_debug_dumpstack(NULL);
+			LBUG();
+		}
+
+		olck = osc_lock_at(lock);
+		LASSERT(olck != NULL);
+		LASSERT(ergo(opg->ops_srvlock, olck->ols_lock == NULL));
+		/* check for lockless io. */
+		if (olck->ols_lock != NULL) {
+			oa->o_handle = olck->ols_lock->l_remote_handle;
+			oa->o_valid |= OBD_MD_FLHANDLE;
+		}
+		cl_lock_put(env, lock);
+	}
+}
+
+static const struct cl_req_operations osc_req_ops = {
+	.cro_prep       = osc_req_prep,
+	.cro_attr_set   = osc_req_attr_set,
+	.cro_completion = osc_req_completion
+};
+
+
+int osc_io_init(const struct lu_env *env,
+		struct cl_object *obj, struct cl_io *io)
+{
+	struct osc_io *oio = osc_env_io(env);
+
+	CL_IO_SLICE_CLEAN(oio, oi_cl);
+	cl_io_slice_add(io, &oio->oi_cl, obj, &osc_io_ops);
+	return 0;
+}
+
+int osc_req_init(const struct lu_env *env, struct cl_device *dev,
+		 struct cl_req *req)
+{
+	struct osc_req *or;
+	int result;
+
+	OBD_SLAB_ALLOC_PTR_GFP(or, osc_req_kmem, __GFP_IO);
+	if (or != NULL) {
+		cl_req_slice_add(req, &or->or_cl, dev, &osc_req_ops);
+		result = 0;
+	} else
+		result = -ENOMEM;
+	return result;
+}
+
+/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_lock.c b/drivers/staging/lustre/lustre/osc/osc_lock.c
new file mode 100644
index 000000000000..640bc3d34709
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_lock.c
@@ -0,0 +1,1663 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_lock for OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+# include <linux/libcfs/libcfs.h>
+/* fid_build_reg_res_name() */
+#include <lustre_fid.h>
+
+#include "osc_cl_internal.h"
+
+/** \addtogroup osc
+ *  @{
+ */
+
+#define _PAGEREF_MAGIC  (-10000000)
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+static const struct cl_lock_operations osc_lock_ops;
+static const struct cl_lock_operations osc_lock_lockless_ops;
+static void osc_lock_to_lockless(const struct lu_env *env,
+				 struct osc_lock *ols, int force);
+static int osc_lock_has_pages(struct osc_lock *olck);
+
+int osc_lock_is_lockless(const struct osc_lock *olck)
+{
+	return &osc_lock_lockless_ops == olck->ols_cl.cls_ops; /* by ops */
+}
+
+/**
+ * Resolves \a handle to its ldlm lock and immediately drops the reference
+ * just taken, so the result is only a weak pointer: it may be compared but
+ * must not be dereferenced. Helper for osc_lock_invariant().
+ */
+static struct ldlm_lock *osc_handle_ptr(struct lustre_handle *handle)
+{
+	struct ldlm_lock *weak = ldlm_handle2lock(handle);
+
+	if (weak == NULL)
+		return NULL;
+	LDLM_LOCK_PUT(weak);
+	return weak;
+}
+
+/**
+ * Invariant over ->ols_lock, ->ols_handle, ->ols_state; used via LINVRNT().
+ */
+static int osc_lock_invariant(struct osc_lock *ols)
+{
+	struct ldlm_lock *lock	= osc_handle_ptr(&ols->ols_handle); /* weak */
+	struct ldlm_lock *olock       = ols->ols_lock;
+	int	       handle_used = lustre_handle_is_used(&ols->ols_handle);
+
+	return /* NOTE(review): 1st clause is OR-ed with the rest - intended? */
+		ergo(osc_lock_is_lockless(ols),
+		     ols->ols_locklessable && ols->ols_lock == NULL)  ||
+		(ergo(olock != NULL, handle_used) &&
+		 ergo(olock != NULL,
+		      olock->l_handle.h_cookie == ols->ols_handle.cookie) &&
+		 /*
+		  * Check that ->ols_handle and ->ols_lock are consistent, but
+		  * take into account that they are set at the different time.
+		  */
+		 ergo(handle_used,
+		      ergo(lock != NULL && olock != NULL, lock == olock) &&
+		      ergo(lock == NULL, olock == NULL)) &&
+		 ergo(ols->ols_state == OLS_CANCELLED,
+		      olock == NULL && !handle_used) &&
+		 /*
+		  * DLM lock is destroyed only after we have seen cancellation
+		  * ast.
+		  */
+		 ergo(olock != NULL && ols->ols_state < OLS_CANCELLED,
+		      !olock->l_destroyed) &&
+		 ergo(ols->ols_state == OLS_GRANTED, /* granted: bound + held */
+		      olock != NULL &&
+		      olock->l_req_mode == olock->l_granted_mode &&
+		      ols->ols_hold));
+}
+
+/*****************************************************************************
+ *
+ * Lock operations.
+ *
+ */
+
+/**
+ * Breaks the osc_lock <-> dlm_lock link, refreshes KMS, drops the dlm ref.
+ */
+static void osc_lock_detach(const struct lu_env *env, struct osc_lock *olck)
+{
+	struct ldlm_lock *dlmlock;
+
+	spin_lock(&osc_ast_guard);
+	dlmlock = olck->ols_lock;
+	if (dlmlock == NULL) { /* no dlm lock attached: nothing to do */
+		spin_unlock(&osc_ast_guard);
+		return;
+	}
+
+	olck->ols_lock = NULL;
+	/* wb(); --- for all who checks (ols->ols_lock != NULL) before
+	 * call to osc_lock_detach() */
+	dlmlock->l_ast_data = NULL;
+	olck->ols_handle.cookie = 0ULL;
+	spin_unlock(&osc_ast_guard);
+
+	lock_res_and_lock(dlmlock);
+	if (dlmlock->l_granted_mode == dlmlock->l_req_mode) { /* granted */
+		struct cl_object *obj = olck->ols_cl.cls_obj;
+		struct cl_attr *attr  = &osc_env_info(env)->oti_attr;
+		__u64 old_kms;
+
+		cl_object_attr_lock(obj);
+		/* Must get the value under the lock to avoid possible races. */
+		old_kms = cl2osc(obj)->oo_oinfo->loi_kms;
+		/* Update the kms. Need to loop all granted locks.
+		 * Not a problem for the client */
+		attr->cat_kms = ldlm_extent_shift_kms(dlmlock, old_kms);
+
+		cl_object_attr_set(env, obj, attr, CAT_KMS); /* KMS only */
+		cl_object_attr_unlock(obj);
+	}
+	unlock_res_and_lock(dlmlock);
+
+	/* release a reference taken in osc_lock_upcall0(). */
+	LASSERT(olck->ols_has_ref);
+	lu_ref_del(&dlmlock->l_reference, "osc_lock", olck);
+	LDLM_LOCK_RELEASE(dlmlock);
+	olck->ols_has_ref = 0;
+}
+
+static int osc_lock_unhold(struct osc_lock *ols)
+{
+	/* nothing to drop unless a hold is recorded */
+	if (!ols->ols_hold)
+		return 0;
+
+	/* clear the flag before calling into osc_cancel_base() */
+	ols->ols_hold = 0;
+
+	return osc_cancel_base(&ols->ols_handle, ols->ols_einfo.ei_mode);
+}
+
+static int osc_lock_unuse(const struct lu_env *env,
+			  const struct cl_lock_slice *slice)
+{
+	struct osc_lock *ols = cl2osc_lock(slice); /* osc slice being unused */
+
+	LINVRNT(osc_lock_invariant(ols));
+
+	switch (ols->ols_state) { /* action depends on the osc lock state */
+	case OLS_NEW:
+		LASSERT(!ols->ols_hold);
+		LASSERT(ols->ols_agl);
+		return 0;
+	case OLS_UPCALL_RECEIVED:
+		osc_lock_unhold(ols); /* fall through */
+	case OLS_ENQUEUED:
+		LASSERT(!ols->ols_hold);
+		osc_lock_detach(env, ols);
+		ols->ols_state = OLS_NEW; /* back to the initial state */
+		return 0;
+	case OLS_GRANTED:
+		LASSERT(!ols->ols_glimpse);
+		LASSERT(ols->ols_hold);
+		/*
+		 * Move lock into OLS_RELEASED state before calling
+		 * osc_cancel_base() so that possible synchronous cancellation
+		 * (that always happens e.g., for liblustre) sees that lock is
+		 * released.
+		 */
+		ols->ols_state = OLS_RELEASED;
+		return osc_lock_unhold(ols); /* result of osc_cancel_base() */
+	default:
+		CERROR("Impossible state: %d\n", ols->ols_state);
+		LBUG(); /* no return follows; LBUG() presumably never returns */
+	}
+}
+
+static void osc_lock_fini(const struct lu_env *env,
+			  struct cl_lock_slice *slice)
+{
+	struct osc_lock  *ols = cl2osc_lock(slice); /* slice to be freed */
+
+	LINVRNT(osc_lock_invariant(ols));
+	/*
+	 * ->ols_hold can still be true at this point if, for example, a
+	 * thread that requested a lock was killed (and released a reference
+	 * to the lock), before reply from a server was received. In this case
+	 * lock is destroyed immediately after upcall.
+	 */
+	osc_lock_unhold(ols);
+	LASSERT(ols->ols_lock == NULL); /* must already be detached */
+	LASSERT(atomic_read(&ols->ols_pageref) == 0 ||
+		atomic_read(&ols->ols_pageref) == _PAGEREF_MAGIC);
+
+	OBD_SLAB_FREE_PTR(ols, osc_lock_kmem); /* give the slice back */
+}
+
+static void osc_lock_build_policy(const struct lu_env *env,
+				  const struct cl_lock *lock,
+				  ldlm_policy_data_t *policy)
+{
+	/* extent comes from the cl_lock descriptor, group id is copied as is */
+	osc_index2policy(policy, lock->cll_descr.cld_obj,
+			 lock->cll_descr.cld_start, lock->cll_descr.cld_end);
+	policy->l_extent.gid = lock->cll_descr.cld_gid;
+}
+
+static __u64 osc_enq2ldlm_flags(__u32 enqflags)
+{
+	__u64 ldlm_flags;
+
+	LASSERT((enqflags & ~CEF_MASK) == 0);
+
+	/* translate each cl enqueue flag into its ldlm counterpart */
+	ldlm_flags  = (enqflags & CEF_NONBLOCK) ? LDLM_FL_BLOCK_NOWAIT : 0;
+	ldlm_flags |= (enqflags & CEF_ASYNC) ? LDLM_FL_HAS_INTENT : 0;
+	ldlm_flags |= (enqflags & CEF_DISCARD_DATA) ?
+		      LDLM_AST_DISCARD_DATA : 0;
+
+	return ldlm_flags;
+}
+
+/**
+ * Global spin-lock protecting consistency of ldlm_lock::l_ast_data pointers
+ * and of their osc_lock::ols_lock counterparts. Initialized in osc_init().
+ */
+spinlock_t osc_ast_guard;
+
+static struct osc_lock *osc_ast_data_get(struct ldlm_lock *dlm_lock)
+{
+	struct osc_lock *olck; /* result: ->l_ast_data with a cl_lock ref, or NULL */
+
+	lock_res_and_lock(dlm_lock);
+	spin_lock(&osc_ast_guard);
+	olck = dlm_lock->l_ast_data; /* NULL when no osc_lock is attached */
+	if (olck != NULL) {
+		struct cl_lock *lock = olck->ols_cl.cls_lock;
+		/*
+		 * If osc_lock holds a reference on ldlm lock, return it even
+		 * when cl_lock is in CLS_FREEING state. This way
+		 *
+		 *	 osc_ast_data_get(dlmlock) == NULL
+		 *
+		 * guarantees that all osc references on dlmlock were
+		 * released. osc_dlm_blocking_ast0() relies on that.
+		 */
+		if (lock->cll_state < CLS_FREEING || olck->ols_has_ref) {
+			cl_lock_get_trust(lock); /* dropped by osc_ast_data_put() */
+			lu_ref_add_atomic(&lock->cll_reference,
+					  "ast", current);
+		} else
+			olck = NULL; /* freeing cl_lock that holds no dlm ref */
+	}
+	spin_unlock(&osc_ast_guard);
+	unlock_res_and_lock(dlm_lock);
+	return olck;
+}
+
+static void osc_ast_data_put(const struct lu_env *env, struct osc_lock *olck)
+{
+	struct cl_lock *parent = olck->ols_cl.cls_lock;
+
+	/* undo the "ast" lu_ref and the reference from osc_ast_data_get() */
+	lu_ref_del(&parent->cll_reference, "ast", current);
+	cl_lock_put(env, parent);
+}
+
+/**
+ * Updates object attributes from a lock value block (lvb) received together
+ * with the DLM lock reply from the server. Copy of osc_update_enqueue()
+ * logic. No-op unless LDLM_FL_LVB_READY is set in ->ols_flags.
+ *
+ * This can be optimized to not update attributes when lock is a result of a
+ * local match. \a rc selects the update: 0 (acquired) may also raise KMS,
+ * -ENAVAIL on a glimpse lock leaves KMS alone, anything else sets nothing.
+ * Called under lock and resource spin-locks.
+ */
+static void osc_lock_lvb_update(const struct lu_env *env, struct osc_lock *olck,
+				int rc)
+{
+	struct ost_lvb    *lvb;
+	struct cl_object  *obj;
+	struct lov_oinfo  *oinfo;
+	struct cl_attr    *attr;
+	unsigned	   valid; /* CAT_* mask of attributes to set */
+
+	ENTRY;
+
+	if (!(olck->ols_flags & LDLM_FL_LVB_READY))
+		RETURN_EXIT;
+
+	lvb   = &olck->ols_lvb;
+	obj   = olck->ols_cl.cls_obj;
+	oinfo = cl2osc(obj)->oo_oinfo;
+	attr  = &osc_env_info(env)->oti_attr;
+	valid = CAT_BLOCKS | CAT_ATIME | CAT_CTIME | CAT_MTIME | CAT_SIZE;
+	cl_lvb2attr(attr, lvb);
+
+	cl_object_attr_lock(obj);
+	if (rc == 0) {
+		struct ldlm_lock  *dlmlock;
+		__u64 size; /* candidate KMS derived from the lvb size */
+
+		dlmlock = olck->ols_lock;
+		LASSERT(dlmlock != NULL);
+
+		/* re-grab LVB from a dlm lock under DLM spin-locks. */
+		*lvb = *(struct ost_lvb *)dlmlock->l_lvb_data;
+		size = lvb->lvb_size;
+		/* Extend KMS up to the end of this lock and no further
+		 * A lock on [x,y] means a KMS of up to y + 1 bytes! */
+		if (size > dlmlock->l_policy_data.l_extent.end)
+			size = dlmlock->l_policy_data.l_extent.end + 1;
+		if (size >= oinfo->loi_kms) { /* KMS is never lowered here */
+			LDLM_DEBUG(dlmlock, "lock acquired, setting rss="LPU64
+				   ", kms="LPU64, lvb->lvb_size, size);
+			valid |= CAT_KMS;
+			attr->cat_kms = size;
+		} else {
+			LDLM_DEBUG(dlmlock, "lock acquired, setting rss="
+				   LPU64"; leaving kms="LPU64", end="LPU64,
+				   lvb->lvb_size, oinfo->loi_kms,
+				   dlmlock->l_policy_data.l_extent.end);
+		}
+		ldlm_lock_allow_match_locked(dlmlock);
+	} else if (rc == -ENAVAIL && olck->ols_glimpse) {
+		CDEBUG(D_INODE, "glimpsed, setting rss="LPU64"; leaving"
+		       " kms="LPU64"\n", lvb->lvb_size, oinfo->loi_kms);
+	} else
+		valid = 0; /* any other error: leave attributes untouched */
+
+	if (valid != 0)
+		cl_object_attr_set(env, obj, attr, valid);
+
+	cl_object_attr_unlock(obj);
+
+	EXIT;
+}
+
+/**
+ * Called when a lock is granted, from an upcall (when server returned a
+ * granted lock), or from completion AST, when server returned a blocked lock.
+ * Does nothing if the osc lock already reached OLS_GRANTED or a later state.
+ *
+ * Called under lock and resource spin-locks, that are released temporarily
+ * here.
+ */
+static void osc_lock_granted(const struct lu_env *env, struct osc_lock *olck,
+			     struct ldlm_lock *dlmlock, int rc)
+{
+	struct ldlm_extent   *ext;
+	struct cl_lock       *lock;
+	struct cl_lock_descr *descr; /* scratch descriptor kept in the env */
+
+	LASSERT(dlmlock->l_granted_mode == dlmlock->l_req_mode);
+
+	ENTRY;
+	if (olck->ols_state < OLS_GRANTED) {
+		lock  = olck->ols_cl.cls_lock;
+		ext   = &dlmlock->l_policy_data.l_extent;
+		descr = &osc_env_info(env)->oti_descr;
+		descr->cld_obj = lock->cll_descr.cld_obj;
+
+		/* XXX check that ->l_granted_mode is valid. */
+		descr->cld_mode  = osc_ldlm2cl_lock(dlmlock->l_granted_mode);
+		descr->cld_start = cl_index(descr->cld_obj, ext->start);
+		descr->cld_end   = cl_index(descr->cld_obj, ext->end);
+		descr->cld_gid   = ext->gid;
+		/*
+		 * tell upper layers the extent of the lock that was actually
+		 * granted
+		 */
+		olck->ols_state = OLS_GRANTED;
+		osc_lock_lvb_update(env, olck, rc); /* still under dlm locks */
+
+		/* release DLM spin-locks to allow cl_lock_{modify,signal}()
+		 * to take a semaphore on a parent lock. This is safe, because
+		 * spin-locks are needed to protect consistency of
+		 * dlmlock->l_*_mode and LVB, and we have finished processing
+		 * them. */
+		unlock_res_and_lock(dlmlock);
+		cl_lock_modify(env, lock, descr);
+		cl_lock_signal(env, lock);
+		LINVRNT(osc_lock_invariant(olck));
+		lock_res_and_lock(dlmlock); /* re-take: caller unlocks */
+	}
+	EXIT;
+}
+
+static void osc_lock_upcall0(const struct lu_env *env, struct osc_lock *olck)
+
+{
+	struct ldlm_lock *dlmlock; /* dlm lock that gets bound to \a olck */
+
+	ENTRY; /* NOTE(review): no matching EXIT/RETURN at the end */
+
+	dlmlock = ldlm_handle2lock_long(&olck->ols_handle, 0);
+	LASSERT(dlmlock != NULL);
+
+	lock_res_and_lock(dlmlock);
+	spin_lock(&osc_ast_guard);
+	LASSERT(dlmlock->l_ast_data == olck);
+	LASSERT(olck->ols_lock == NULL);
+	olck->ols_lock = dlmlock; /* the binding itself */
+	spin_unlock(&osc_ast_guard);
+
+	/*
+	 * Lock might be not yet granted. In this case, completion ast
+	 * (osc_ldlm_completion_ast()) comes later and finishes lock
+	 * granting.
+	 */
+	if (dlmlock->l_granted_mode == dlmlock->l_req_mode)
+		osc_lock_granted(env, olck, dlmlock, 0);
+	unlock_res_and_lock(dlmlock);
+
+	/*
+	 * osc_enqueue_interpret() decrefs asynchronous locks, counter
+	 * this.
+	 */
+	ldlm_lock_addref(&olck->ols_handle, olck->ols_einfo.ei_mode);
+	olck->ols_hold = 1; /* cleared again by osc_lock_unhold() */
+
+	/* lock reference taken by ldlm_handle2lock_long() is owned by
+	 * osc_lock and released in osc_lock_detach() */
+	lu_ref_add(&dlmlock->l_reference, "osc_lock", olck);
+	olck->ols_has_ref = 1;
+}
+
+/**
+ * Lock upcall function that is executed either when a reply to ENQUEUE rpc is
+ * received from a server, or after osc_enqueue_base() matched a local DLM
+ * lock. \a cookie is the osc_lock; \a errcode is returned unchanged.
+ */
+static int osc_lock_upcall(void *cookie, int errcode)
+{
+	struct osc_lock	 *olck  = cookie;
+	struct cl_lock_slice    *slice = &olck->ols_cl;
+	struct cl_lock	  *lock  = slice->cls_lock;
+	struct lu_env	   *env;
+	struct cl_env_nest       nest;
+
+	ENTRY;
+	env = cl_env_nested_get(&nest);
+	if (!IS_ERR(env)) {
+		int rc; /* errno-style result driving the handling below */
+
+		cl_lock_mutex_get(env, lock);
+
+		LASSERT(lock->cll_state >= CLS_QUEUING);
+		if (olck->ols_state == OLS_ENQUEUED) {
+			olck->ols_state = OLS_UPCALL_RECEIVED;
+			rc = ldlm_error2errno(errcode);
+		} else if (olck->ols_state == OLS_CANCELLED) {
+			rc = -EIO; /* cancelled before the upcall arrived */
+		} else {
+			CERROR("Impossible state: %d\n", olck->ols_state);
+			LBUG(); /* rc stays unset; LBUG() presumably never returns */
+		}
+		if (rc) {
+			struct ldlm_lock *dlmlock;
+
+			/* failure: unhook the dlm lock from this osc_lock */
+			dlmlock = ldlm_handle2lock(&olck->ols_handle);
+			if (dlmlock != NULL) {
+				lock_res_and_lock(dlmlock);
+				spin_lock(&osc_ast_guard);
+				LASSERT(olck->ols_lock == NULL);
+				dlmlock->l_ast_data = NULL;
+				olck->ols_handle.cookie = 0ULL;
+				spin_unlock(&osc_ast_guard);
+				ldlm_lock_fail_match_locked(dlmlock);
+				unlock_res_and_lock(dlmlock);
+				LDLM_LOCK_PUT(dlmlock);
+			}
+		} else {
+			if (olck->ols_glimpse)
+				olck->ols_glimpse = 0;
+			osc_lock_upcall0(env, olck); /* bind olck and dlm lock */
+		}
+
+		/* Error handling, some errors are tolerable. */
+		if (olck->ols_locklessable && rc == -EUSERS) {
+			/* This is a tolerable error, turn this lock into
+			 * lockless lock.
+			 */
+			osc_object_set_contended(cl2osc(slice->cls_obj));
+			LASSERT(slice->cls_ops == &osc_lock_ops);
+
+			/* Change this lock to ldlmlock-less lock. */
+			osc_lock_to_lockless(env, olck, 1);
+			olck->ols_state = OLS_GRANTED;
+			rc = 0;
+		} else if (olck->ols_glimpse && rc == -ENAVAIL) {
+			osc_lock_lvb_update(env, olck, rc);
+			cl_lock_delete(env, lock);
+			/* Hide the error. */
+			rc = 0;
+		}
+
+		if (rc == 0) {
+			/* For AGL case, the RPC sponsor may exits the cl_lock
+			*  processing without wait() called before related OSC
+			*  lock upcall(). So update the lock status according
+			*  to the enqueue result inside AGL upcall(). */
+			if (olck->ols_agl) {
+				lock->cll_flags |= CLF_FROM_UPCALL;
+				cl_wait_try(env, lock);
+				lock->cll_flags &= ~CLF_FROM_UPCALL;
+				if (!olck->ols_glimpse)
+					olck->ols_agl = 0;
+			}
+			cl_lock_signal(env, lock);
+			/* del user for lock upcall cookie */
+			cl_unuse_try(env, lock);
+		} else {
+			/* del user for lock upcall cookie */
+			cl_lock_user_del(env, lock);
+			cl_lock_error(env, lock, rc);
+		}
+
+		/* release cookie reference, acquired by osc_lock_enqueue() */
+		cl_lock_hold_release(env, lock, "upcall", lock);
+		cl_lock_mutex_put(env, lock);
+
+		lu_ref_del(&lock->cll_reference, "upcall", lock);
+		/* This maybe the last reference, so must be called after
+		 * cl_lock_mutex_put(). */
+		cl_lock_put(env, lock);
+
+		cl_env_nested_put(&nest, env);
+	} else {
+		/* should never happen, similar to osc_ldlm_blocking_ast(). */
+		LBUG();
+	}
+	RETURN(errcode); /* the original code, not the local rc */
+}
+
+/**
+ * Core of osc_ldlm_blocking_ast() logic.
+ */
+static void osc_lock_blocking(const struct lu_env *env,
+			      struct ldlm_lock *dlmlock,
+			      struct osc_lock *olck, int blocking)
+{
+	struct cl_lock *lock = olck->ols_cl.cls_lock; /* lock to cancel */
+
+	LASSERT(olck->ols_lock == dlmlock);
+	CLASSERT(OLS_BLOCKED < OLS_CANCELLED);
+	LASSERT(!osc_lock_is_lockless(olck));
+
+	/*
+	 * Lock might be still addref-ed here, if e.g., blocking ast
+	 * is sent for a failed lock.
+	 */
+	osc_lock_unhold(olck);
+
+	if (blocking && olck->ols_state < OLS_BLOCKED)
+		/*
+		 * Move osc_lock into OLS_BLOCKED before canceling the lock,
+		 * because it recursively re-enters osc_lock_blocking(), with
+		 * the state set to OLS_CANCELLED.
+		 */
+		olck->ols_state = OLS_BLOCKED;
+	/*
+	 * cancel and destroy lock at least once no matter how blocking ast is
+	 * entered (see comment above osc_ldlm_blocking_ast() for use
+	 * cases). cl_lock_cancel() and cl_lock_delete() are idempotent.
+	 */
+	cl_lock_cancel(env, lock);
+	cl_lock_delete(env, lock);
+}
+
+/**
+ * Helper for osc_ldlm_blocking_ast() handling discrepancies between cl_lock
+ * and ldlm_lock caches. Returns ldlm_cli_cancel() result, or 0 if not called.
+ */
+static int osc_dlm_blocking_ast0(const struct lu_env *env,
+				 struct ldlm_lock *dlmlock,
+				 void *data, int flag)
+{
+	struct osc_lock *olck;
+	struct cl_lock  *lock;
+	int result;
+	int cancel; /* non-zero: cancel the dlm lock directly, see below */
+
+	LASSERT(flag == LDLM_CB_BLOCKING || flag == LDLM_CB_CANCELING);
+
+	cancel = 0;
+	olck = osc_ast_data_get(dlmlock); /* pins the cl_lock when non-NULL */
+	if (olck != NULL) {
+		lock = olck->ols_cl.cls_lock;
+		cl_lock_mutex_get(env, lock);
+		LINVRNT(osc_lock_invariant(olck));
+		if (olck->ols_ast_wait) {
+			/* wake up osc_lock_use() */
+			cl_lock_signal(env, lock);
+			olck->ols_ast_wait = 0;
+		}
+		/*
+		 * Lock might have been canceled while this thread was
+		 * sleeping for lock mutex, but olck is pinned in memory.
+		 */
+		if (olck == dlmlock->l_ast_data) {
+			/*
+			 * NOTE: DLM sends blocking AST's for failed locks
+			 *       (that are still in pre-OLS_GRANTED state)
+			 *       too, and they have to be canceled otherwise
+			 *       DLM lock is never destroyed and stuck in
+			 *       the memory.
+			 *
+			 *       Alternatively, ldlm_cli_cancel() can be
+			 *       called here directly for osc_locks with
+			 *       ols_state < OLS_GRANTED to maintain an
+			 *       invariant that ->clo_cancel() is only called
+			 *       for locks that were granted.
+			 */
+			LASSERT(data == olck);
+			osc_lock_blocking(env, dlmlock,
+					  olck, flag == LDLM_CB_BLOCKING);
+		} else
+			cancel = 1; /* olck no longer attached to dlmlock */
+		cl_lock_mutex_put(env, lock);
+		osc_ast_data_put(env, olck);
+	} else
+		/*
+		 * DLM lock exists, but there is no cl_lock attached to it.
+		 * This is a `normal' race. cl_object and its cl_lock's can be
+		 * removed by memory pressure, together with all pages.
+		 */
+		cancel = (flag == LDLM_CB_BLOCKING);
+
+	if (cancel) {
+		struct lustre_handle *lockh;
+
+		lockh = &osc_env_info(env)->oti_handle; /* scratch handle */
+		ldlm_lock2handle(dlmlock, lockh);
+		result = ldlm_cli_cancel(lockh, LCF_ASYNC);
+	} else
+		result = 0;
+	return result;
+}
+
+/**
+ * Blocking ast invoked by ldlm when dlm lock is either blocking progress of
+ * some other lock, or is canceled. This function is installed as a
+ * ldlm_lock::l_blocking_ast() for client extent locks.
+ *
+ * Control flow is tricky, because ldlm uses the same call-back
+ * (ldlm_lock::l_blocking_ast()) for both blocking and cancellation ast's.
+ *
+ * \param dlmlock lock for which ast occurred.
+ *
+ * \param new description of a conflicting lock in case of blocking ast.
+ *
+ * \param data value of dlmlock->l_ast_data
+ *
+ * \param flag LDLM_CB_BLOCKING or LDLM_CB_CANCELING. Used to distinguish
+ *	     cancellation and blocking ast's.
+ *
+ * Possible use cases:
+ *
+ *     - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) to cancel
+ *       lock due to lock lru pressure, or explicit user request to purge
+ *       locks.
+ *
+ *     - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_BLOCKING) to notify
+ *       us that dlmlock conflicts with another lock that some client is
+ *       enqueuing. Lock is canceled.
+ *
+ *	   - cl_lock_cancel() is called. osc_lock_cancel() calls
+ *	     ldlm_cli_cancel() that calls
+ *
+ *		  dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING)
+ *
+ *	     recursively entering osc_ldlm_blocking_ast().
+ *
+ *     - client cancels lock voluntarily (e.g., as a part of early
+ *       cancellation):
+ *	   cl_lock_cancel()->
+ *	     osc_lock_cancel()->
+ *	       ldlm_cli_cancel()->
+ *		 dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING)
+ *
+ */
+static int osc_ldlm_blocking_ast(struct ldlm_lock *dlmlock,
+				 struct ldlm_lock_desc *new, void *data,
+				 int flag)
+{
+	struct lu_env     *env;
+	struct cl_env_nest nest;
+	int		result;
+
+	/*
+	 * This can be called in the context of outer IO, e.g.,
+	 *
+	 *     cl_enqueue()->...
+	 *       ->osc_enqueue_base()->...
+	 *	 ->ldlm_prep_elc_req()->...
+	 *	   ->ldlm_cancel_callback()->...
+	 *	     ->osc_ldlm_blocking_ast()
+	 *
+	 * new environment has to be created to not corrupt outer context.
+	 */
+	env = cl_env_nested_get(&nest);
+	if (!IS_ERR(env)) {
+		result = osc_dlm_blocking_ast0(env, dlmlock, data, flag);
+		cl_env_nested_put(&nest, env);
+	} else {
+		result = PTR_ERR(env);
+		/*
+		 * XXX This should never happen, as cl_lock is
+		 * stuck. Pre-allocated environment a la vvp_inode_fini_env
+		 * should be used.
+		 */
+		LBUG();
+	}
+	if (result != 0) {
+		if (result == -ENODATA)
+			result = 0; /* -ENODATA is not reported as a failure */
+		else
+			CERROR("BAST failed: %d\n", result);
+	}
+	return result;
+}
+
+static int osc_ldlm_completion_ast(struct ldlm_lock *dlmlock,
+				   __u64 flags, void *data)
+{
+	struct cl_env_nest nest;
+	struct lu_env     *env;
+	struct osc_lock   *olck;
+	struct cl_lock    *lock;
+	int result; /* outcome of the cl_lock side of the work */
+	int dlmrc; /* outcome of ldlm_completion_ast_async() */
+
+	/* first, do dlm part of the work */
+	dlmrc = ldlm_completion_ast_async(dlmlock, flags, data);
+	/* then, notify cl_lock */
+	env = cl_env_nested_get(&nest);
+	if (!IS_ERR(env)) {
+		olck = osc_ast_data_get(dlmlock);
+		if (olck != NULL) {
+			lock = olck->ols_cl.cls_lock;
+			cl_lock_mutex_get(env, lock);
+			/*
+			 * ldlm_handle_cp_callback() copied LVB from request
+			 * to lock->l_lvb_data, store it in osc_lock.
+			 */
+			LASSERT(dlmlock->l_lvb_data != NULL);
+			lock_res_and_lock(dlmlock);
+			olck->ols_lvb = *(struct ost_lvb *)dlmlock->l_lvb_data;
+			if (olck->ols_lock == NULL) {
+				/*
+				 * upcall (osc_lock_upcall()) hasn't yet been
+				 * called. Do nothing now, upcall will bind
+				 * olck to dlmlock and signal the waiters.
+				 *
+				 * This maintains an invariant that osc_lock
+				 * and ldlm_lock are always bound when
+				 * osc_lock is in OLS_GRANTED state.
+				 */
+			} else if (dlmlock->l_granted_mode ==
+				   dlmlock->l_req_mode) {
+				osc_lock_granted(env, olck, dlmlock, dlmrc);
+			}
+			unlock_res_and_lock(dlmlock);
+
+			if (dlmrc != 0) { /* dlm failure goes to the cl_lock */
+				CL_LOCK_DEBUG(D_ERROR, env, lock,
+					      "dlmlock returned %d\n", dlmrc);
+				cl_lock_error(env, lock, dlmrc);
+			}
+			cl_lock_mutex_put(env, lock);
+			osc_ast_data_put(env, olck);
+			result = 0;
+		} else
+			result = -ELDLM_NO_LOCK_DATA; /* no osc_lock attached */
+		cl_env_nested_put(&nest, env);
+	} else
+		result = PTR_ERR(env);
+	return dlmrc ?: result; /* a non-zero dlmrc takes precedence */
+}
+
+static int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data)
+{
+	struct ptlrpc_request  *req  = data; /* the glimpse callback request */
+	struct osc_lock	*olck;
+	struct cl_lock	 *lock;
+	struct cl_object       *obj;
+	struct cl_env_nest      nest;
+	struct lu_env	  *env;
+	struct ost_lvb	 *lvb; /* reply buffer filled by cl_object_glimpse() */
+	struct req_capsule     *cap;
+	int		     result; /* also stored in req->rq_status */
+
+	LASSERT(lustre_msg_get_opc(req->rq_reqmsg) == LDLM_GL_CALLBACK);
+
+	env = cl_env_nested_get(&nest);
+	if (!IS_ERR(env)) {
+		/* osc_ast_data_get() has to go after environment is
+		 * allocated, because osc_ast_data_get() acquires a
+		 * reference to a lock, and it can only be released in
+		 * environment.
+		 */
+		olck = osc_ast_data_get(dlmlock);
+		if (olck != NULL) {
+			lock = olck->ols_cl.cls_lock;
+			/* Do not grab the mutex of cl_lock for glimpse.
+			 * See LU-1274 for details.
+			 * BTW, it's okay for cl_lock to be cancelled during
+			 * this period because server can handle this race.
+			 * See ldlm_server_glimpse_ast() for details.
+			 * cl_lock_mutex_get(env, lock); */
+			cap = &req->rq_pill;
+			req_capsule_extend(cap, &RQF_LDLM_GL_CALLBACK);
+			req_capsule_set_size(cap, &RMF_DLM_LVB, RCL_SERVER,
+					     sizeof *lvb);
+			result = req_capsule_server_pack(cap);
+			if (result == 0) {
+				lvb = req_capsule_server_get(cap, &RMF_DLM_LVB);
+				obj = lock->cll_descr.cld_obj;
+				result = cl_object_glimpse(env, obj, lvb);
+			}
+			/* shrink the LVB field to the v1 size if needed */
+			if (!exp_connect_lvb_type(req->rq_export))
+				req_capsule_shrink(&req->rq_pill,
+						   &RMF_DLM_LVB,
+						   sizeof(struct ost_lvb_v1),
+						   RCL_SERVER);
+			osc_ast_data_put(env, olck);
+		} else {
+			/*
+			 * These errors are normal races, so we don't want to
+			 * fill the console with messages by calling
+			 * ptlrpc_error()
+			 */
+			lustre_pack_reply(req, 1, NULL, NULL);
+			result = -ELDLM_NO_LOCK_DATA;
+		}
+		cl_env_nested_put(&nest, env);
+	} else
+		result = PTR_ERR(env);
+	req->rq_status = result;
+	return result;
+}
+
+static unsigned long osc_lock_weigh(const struct lu_env *env,
+				    const struct cl_lock_slice *slice)
+{
+	struct cl_object_header *hdr = cl_object_header(slice->cls_obj);
+
+	/* coh_page_guard is skipped on purpose: an approximate page count
+	 * is all that is needed here. */
+	return hdr->coh_pages;
+}
+
+/**
+ * Get the weight of dlm lock for early cancellation; 0 if no env or cl_lock.
+ *
+ * XXX: it should return the pages covered by this \a dlmlock.
+ */
+static unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock)
+{
+	struct cl_env_nest       nest;
+	struct lu_env	   *env;
+	struct osc_lock	 *lock;
+	struct cl_lock	  *cll;
+	unsigned long	    weight; /* result of cl_lock_weigh(), or 0 */
+	ENTRY;
+
+	might_sleep();
+	/*
+	 * osc_ldlm_weigh_ast has a complex context since it might be called
+	 * because of lock canceling, or from user's input. We have to make
+	 * a new environment for it. Probably it is implementation safe to use
+	 * the upper context because cl_lock_put don't modify environment
+	 * variables. But in case of ..
+	 */
+	env = cl_env_nested_get(&nest);
+	if (IS_ERR(env))
+		/* Mostly because lack of memory, tend to eliminate this lock*/
+		RETURN(0);
+
+	LASSERT(dlmlock->l_resource->lr_type == LDLM_EXTENT);
+	lock = osc_ast_data_get(dlmlock);
+	if (lock == NULL) {
+		/* cl_lock was destroyed because of memory pressure.
+		 * It is much reasonable to assign this type of lock
+		 * a lower cost.
+		 */
+		GOTO(out, weight = 0);
+	}
+
+	cll = lock->ols_cl.cls_lock;
+	cl_lock_mutex_get(env, cll); /* weigh under the cl_lock mutex */
+	weight = cl_lock_weigh(env, cll);
+	cl_lock_mutex_put(env, cll);
+	osc_ast_data_put(env, lock);
+	EXIT;
+
+out:
+	cl_env_nested_put(&nest, env); /* both paths release the env here */
+	return weight;
+}
+
+static void osc_lock_build_einfo(const struct lu_env *env,
+				 const struct cl_lock *clock,
+				 struct osc_lock *lock,
+				 struct ldlm_enqueue_info *einfo)
+{
+	enum cl_lock_mode cl_mode = clock->cll_descr.cld_mode;
+
+	/*
+	 * Glimpse (phantom) locks are enqueued in read mode for now. Later
+	 * the client might prefer an LCK_PW lock for glimpse on a file that
+	 * is opened for write.
+	 */
+	if (cl_mode == CLM_PHANTOM)
+		cl_mode = CLM_READ;
+
+	/* extent lock whose callbacks are the osc AST handlers */
+	einfo->ei_cbdata = lock; /* becomes ->l_ast_data of the dlm lock */
+	einfo->ei_cb_wg  = osc_ldlm_weigh_ast;
+	einfo->ei_cb_gl  = osc_ldlm_glimpse_ast;
+	einfo->ei_cb_cp  = osc_ldlm_completion_ast;
+	einfo->ei_cb_bl  = osc_ldlm_blocking_ast;
+	einfo->ei_mode   = osc_cl_lock2ldlm(cl_mode);
+	einfo->ei_type   = LDLM_EXTENT;
+}
+
+/**
+ * Determine if the lock should be converted into a lockless lock.
+ *
+ * Steps to check:
+ * - if the lock has an explicit requirement for a non-lockless lock;
+ * - check the io lock request type ci_lockreq;
+ * - send the enqueue rpc to ost to make the further decision;
+ * - special treatment for lockless truncate
+ *
+ *  Additional policy can be implemented here, e.g., never do lockless-io
+ *  for large extents. A non-zero \a force converts unconditionally.
+ */
+static void osc_lock_to_lockless(const struct lu_env *env,
+				 struct osc_lock *ols, int force)
+{
+	struct cl_lock_slice *slice = &ols->ols_cl;
+
+	LASSERT(ols->ols_state == OLS_NEW ||
+		ols->ols_state == OLS_UPCALL_RECEIVED);
+
+	if (force) { /* caller decided: switch the method table right away */
+		ols->ols_locklessable = 1;
+		slice->cls_ops = &osc_lock_lockless_ops;
+	} else {
+		struct osc_io *oio     = osc_env_io(env);
+		struct cl_io  *io      = oio->oi_cl.cis_io;
+		struct cl_object *obj  = slice->cls_obj;
+		struct osc_object *oob = cl2osc(obj);
+		const struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev);
+		struct obd_connect_data *ocd; /* connect flags of the import */
+
+		LASSERT(io->ci_lockreq == CILR_MANDATORY ||
+			io->ci_lockreq == CILR_MAYBE ||
+			io->ci_lockreq == CILR_NEVER);
+
+		ocd = &class_exp2cliimp(osc_export(oob))->imp_connect_data;
+		ols->ols_locklessable = (io->ci_type != CIT_SETATTR) &&
+				(io->ci_lockreq == CILR_MAYBE) &&
+				(ocd->ocd_connect_flags & OBD_CONNECT_SRVLOCK);
+		if (io->ci_lockreq == CILR_NEVER ||
+			/* lockless IO */
+		    (ols->ols_locklessable && osc_object_is_contended(oob)) ||
+			/* lockless truncate */
+		    (cl_io_is_trunc(io) &&
+		     (ocd->ocd_connect_flags & OBD_CONNECT_TRUNCLOCK) &&
+		      osd->od_lockless_truncate)) {
+			ols->ols_locklessable = 1;
+			slice->cls_ops = &osc_lock_lockless_ops;
+		}
+	}
+	LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols)));
+}
+
+static int osc_lock_compatible(const struct osc_lock *qing,
+			       const struct osc_lock *qed)
+{
+	int reading = qing->ols_cl.cls_lock->cll_descr.cld_mode == CLM_READ;
+
+	/* a queued glimpse lock is no obstacle once its upcall arrived, or
+	 * at any time when the enqueuing lock only reads */
+	if (qed->ols_glimpse)
+		if (reading || qed->ols_state >= OLS_UPCALL_RECEIVED)
+			return 1;
+
+	/* otherwise only read vs. read is compatible */
+	return reading && qed->ols_cl.cls_lock->cll_descr.cld_mode == CLM_READ;
+}
+
+/**
+ * Cancel all conflicting locks and wait for them to be destroyed.
+ *
+ * This function is used for two purposes:
+ *     - early cancel all conflicting locks before starting IO, and
+ *     - guarantee that pages added to the page cache by lockless IO are never
+ *       covered by locks other than lockless IO lock, and, hence, are not
+ *       visible to other threads.
+ *
+ * \retval 0 nothing to wait for; \retval CLO_WAIT conflict in ->cll_conflict
+ */
+static int osc_lock_enqueue_wait(const struct lu_env *env,
+				 const struct osc_lock *olck)
+{
+	struct cl_lock	  *lock    = olck->ols_cl.cls_lock;
+	struct cl_lock_descr    *descr   = &lock->cll_descr;
+	struct cl_object_header *hdr     = cl_object_header(descr->cld_obj);
+	struct cl_lock	  *scan;
+	struct cl_lock	  *conflict = NULL; /* first conflicting lock found */
+	int lockless		     = osc_lock_is_lockless(olck);
+	int rc			   = 0;
+	ENTRY;
+
+	LASSERT(cl_lock_is_mutexed(lock));
+
+	/* make it enqueue anyway for glimpse lock, because we actually
+	 * don't need to cancel any conflicting locks. */
+	if (olck->ols_glimpse)
+		RETURN(0); /* RETURN, not return: keeps ENTRY trace balanced */
+
+	spin_lock(&hdr->coh_lock_guard);
+	list_for_each_entry(scan, &hdr->coh_locks, cll_linkage) {
+		struct cl_lock_descr *cld = &scan->cll_descr;
+		const struct osc_lock *scan_ols;
+
+		if (scan == lock) /* only locks listed before ours matter */
+			break;
+
+		if (scan->cll_state < CLS_QUEUING ||
+		    scan->cll_state == CLS_FREEING ||
+		    cld->cld_start > descr->cld_end ||
+		    cld->cld_end < descr->cld_start)
+			continue;
+
+		/* overlapped and living locks. */
+
+		/* We're not supposed to give up group lock. */
+		if (scan->cll_descr.cld_mode == CLM_GROUP) {
+			LASSERT(descr->cld_mode != CLM_GROUP ||
+				descr->cld_gid != scan->cll_descr.cld_gid);
+			continue;
+		}
+
+		scan_ols = osc_lock_at(scan);
+
+		/* We need to cancel the compatible locks if we're enqueuing
+		 * a lockless lock, for example:
+		 * imagine that client has PR lock on [0, 1000], and thread T0
+		 * is doing lockless IO in [500, 1500] region. Concurrent
+		 * thread T1 can see lockless data in [500, 1000], which is
+		 * wrong, because these data are possibly stale. */
+		if (!lockless && osc_lock_compatible(olck, scan_ols))
+			continue;
+
+		cl_lock_get_trust(scan); /* pin the conflict past the unlock */
+		conflict = scan;
+		break;
+	}
+	spin_unlock(&hdr->coh_lock_guard);
+
+	if (conflict) {
+		if (lock->cll_descr.cld_mode == CLM_GROUP) {
+			/* we want a group lock but a previous lock request
+			 * conflicts, we do not wait but return 0 so the
+			 * request is send to the server
+			 */
+			CDEBUG(D_DLMTRACE, "group lock %p is conflicted "
+					   "with %p, no wait, send to server\n",
+			       lock, conflict);
+			cl_lock_put(env, conflict);
+			rc = 0;
+		} else {
+			CDEBUG(D_DLMTRACE, "lock %p is conflicted with %p, "
+					   "will wait\n",
+			       lock, conflict);
+			LASSERT(lock->cll_conflict == NULL);
+			lu_ref_add(&conflict->cll_reference, "cancel-wait",
+				   lock);
+			lock->cll_conflict = conflict; /* keeps the reference */
+			rc = CLO_WAIT;
+		}
+	}
+	RETURN(rc);
+}
+
+/**
+ * Implementation of cl_lock_operations::clo_enqueue() method for osc
+ * layer. This initiates ldlm enqueue:
+ *
+ *     - cancels conflicting locks early (osc_lock_enqueue_wait());
+ *     - calls osc_enqueue_base() to do actual enqueue.
+ *
+ * osc_enqueue_base() is supplied with an upcall function that is executed
+ * when lock is received either after a local cached ldlm lock is matched, or
+ * when a reply from the server is received. A lockless lock skips the
+ * enqueue and is marked OLS_GRANTED at once; -ECANCELED from the enqueue
+ * resets the state to OLS_NEW and is reported to the caller as success.
+ * This function does not wait for the network communication to complete.
+ */
+static int osc_lock_enqueue(const struct lu_env *env,
+			    const struct cl_lock_slice *slice,
+			    struct cl_io *unused, __u32 enqflags)
+{
+	struct osc_lock	  *ols     = cl2osc_lock(slice);
+	struct cl_lock	   *lock    = ols->ols_cl.cls_lock;
+	int result;
+	ENTRY;
+
+	LASSERT(cl_lock_is_mutexed(lock));
+	LASSERTF(ols->ols_state == OLS_NEW,
+		 "Impossible state: %d\n", ols->ols_state);
+
+	LASSERTF(ergo(ols->ols_glimpse, lock->cll_descr.cld_mode <= CLM_READ),
+		"lock = %p, ols = %p\n", lock, ols);
+
+	result = osc_lock_enqueue_wait(env, ols);
+	if (result == 0) {
+		if (!osc_lock_is_lockless(ols)) {
+			struct osc_object	*obj = cl2osc(slice->cls_obj);
+			struct osc_thread_info   *info = osc_env_info(env);
+			struct ldlm_res_id       *resname = &info->oti_resname;
+			ldlm_policy_data_t       *policy = &info->oti_policy;
+			struct ldlm_enqueue_info *einfo = &ols->ols_einfo;
+
+			/* lock will be passed as upcall cookie,
+			 * hold a reference so that it is not released. */
+			cl_lock_hold_add(env, lock, "upcall", lock);
+			/* a user for lock also */
+			cl_lock_user_add(env, lock);
+			ols->ols_state = OLS_ENQUEUED;
+
+			/*
+			 * XXX: this is a possible blocking point as
+			 * ldlm_lock_match(LDLM_FL_LVB_READY) waits for
+			 * LDLM_CP_CALLBACK.
+			 */
+			ostid_build_res_name(&obj->oo_oinfo->loi_oi, resname);
+			osc_lock_build_policy(env, lock, policy);
+			result = osc_enqueue_base(osc_export(obj), resname,
+					  &ols->ols_flags, policy,
+					  &ols->ols_lvb,
+					  obj->oo_oinfo->loi_kms_valid,
+					  osc_lock_upcall,
+					  ols, einfo, &ols->ols_handle,
+					  PTLRPCD_SET, 1, ols->ols_agl);
+			if (result != 0) {
+				cl_lock_user_del(env, lock);
+				cl_lock_unhold(env, lock, "upcall", lock);
+				if (unlikely(result == -ECANCELED)) {
+					ols->ols_state = OLS_NEW;
+					result = 0;
+				}
+			}
+		} else {
+			ols->ols_state = OLS_GRANTED;
+			ols->ols_owner = osc_env_io(env);
+		}
+	}
+	LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols)));
+	RETURN(result);
+}
+
+/*
+ * clo_wait() method of osc_lock_ops. Returns 0 once the lock is usable,
+ * CLO_WAIT while it is not granted yet, CLO_REENQUEUED after an AGL lock
+ * that went back to OLS_NEW has been enqueued again, or an error code.
+ */
+static int osc_lock_wait(const struct lu_env *env,
+			 const struct cl_lock_slice *slice)
+{
+	struct osc_lock *olck = cl2osc_lock(slice);
+	struct cl_lock  *lock = olck->ols_cl.cls_lock;
+
+	LINVRNT(osc_lock_invariant(olck));
+
+	if (olck->ols_glimpse && olck->ols_state >= OLS_UPCALL_RECEIVED) {
+		if (olck->ols_flags & LDLM_FL_LVB_READY)
+			return 0;
+		if (!olck->ols_agl) {
+			LASSERT(lock->cll_error);
+			return lock->cll_error;
+		}
+		/* It is from enqueue RPC reply upcall for
+		 * updating state. Do not re-enqueue. */
+		if (lock->cll_flags & CLF_FROM_UPCALL)
+			return -ENAVAIL;
+		olck->ols_state = OLS_NEW;
+	}
+
+	if (olck->ols_state == OLS_NEW) {
+		int rc;
+
+		LASSERT(olck->ols_agl);
+		olck->ols_agl = 0;
+		rc = osc_lock_enqueue(env, slice, NULL, CEF_ASYNC | CEF_MUST);
+		return rc != 0 ? rc : CLO_REENQUEUED;
+	}
+
+	LASSERT(equi(olck->ols_state >= OLS_UPCALL_RECEIVED &&
+		     lock->cll_error == 0, olck->ols_lock != NULL));
+
+	return lock->cll_error ?: olck->ols_state >= OLS_GRANTED ? 0 : CLO_WAIT;
+}
+
+/**
+ * An implementation of cl_lock_operations::clo_use() method that pins cached
+ * lock.
+ *
+ * \retval 0        ldlm reference taken, osc lock is OLS_GRANTED again
+ * \retval CLO_WAIT ldlm_lock_addref_try() failed, ols_ast_wait has been set
+ */
+static int osc_lock_use(const struct lu_env *env,
+			const struct cl_lock_slice *slice)
+{
+	struct osc_lock *olck = cl2osc_lock(slice);
+	struct cl_lock  *lock;
+
+	LASSERT(!olck->ols_hold);
+
+	/*
+	 * Atomically check for LDLM_FL_CBPENDING and addref a lock if this
+	 * flag is not set. This protects us from a concurrent blocking ast.
+	 */
+	if (ldlm_lock_addref_try(&olck->ols_handle,
+				 olck->ols_einfo.ei_mode) == 0) {
+		olck->ols_hold = 1;
+		olck->ols_state = OLS_GRANTED;
+		return 0;
+	}
+
+	/*
+	 * Lock is being cancelled somewhere within ldlm_handle_bl_callback():
+	 * LDLM_FL_CBPENDING is already set, but osc_ldlm_blocking_ast()
+	 * hasn't yet acquired cl_lock mutex.
+	 */
+	lock = slice->cls_lock;
+	LASSERT(lock->cll_state == CLS_INTRANSIT);
+	LASSERT(lock->cll_users > 0);
+
+	/* set a flag for osc_dlm_blocking_ast0() to signal the lock. */
+	olck->ols_ast_wait = 1;
+	return CLO_WAIT;
+}
+
+/* For CLM_WRITE and stronger modes run osc_cache_writeback_range() on the
+ * lock extent (handing \a discard through), then cl_lock_discard_pages(). */
+static int osc_lock_flush(struct osc_lock *ols, int discard)
+{
+	struct cl_lock       *lock  = ols->ols_cl.cls_lock;
+	struct cl_lock_descr *descr = &lock->cll_descr;
+	struct osc_object    *obj;
+	struct cl_env_nest    nest;
+	struct lu_env	     *env;
+	int rc, result = 0;
+	ENTRY;
+
+	env = cl_env_nested_get(&nest);
+	if (IS_ERR(env)) {
+		result = PTR_ERR(env);
+		RETURN(result);
+	}
+	obj = cl2osc(ols->ols_cl.cls_obj);
+	if (descr->cld_mode >= CLM_WRITE) {
+		result = osc_cache_writeback_range(env, obj, descr->cld_start,
+						   descr->cld_end, 1, discard);
+		LDLM_DEBUG(ols->ols_lock,
+			"lock %p: %d pages were %s.\n", lock, result,
+			discard ? "discarded" : "written");
+		if (result > 0)
+			result = 0;
+	}
+	rc = cl_lock_discard_pages(env, lock);
+	if (result == 0 && rc < 0)
+		result = rc;
+	cl_env_nested_put(&nest, env);
+
+	if (result == 0) {
+		ols->ols_flush = 1;
+		LINVRNT(!osc_lock_has_pages(ols));
+	}
+	RETURN(result);
+}
+
+/**
+ * Implements cl_lock_operations::clo_cancel() method for osc layer. This is
+ * called (as part of cl_lock_cancel()) when lock is canceled either
+ * voluntarily (LRU pressure, early cancellation, umount, etc.) or due to the
+ * conflict with some other lock somewhere in the cluster. This function:
+ *
+ *     - invalidates all pages protected by this lock (after sending dirty
+ *       ones to the server, as necessary);
+ *     - decref's underlying ldlm lock;
+ *     - cancels ldlm lock (ldlm_cli_cancel()).
+ * A flush error is logged even when the subsequent ldlm cancel succeeds.
+ */
+static void osc_lock_cancel(const struct lu_env *env,
+			    const struct cl_lock_slice *slice)
+{
+	struct cl_lock   *lock    = slice->cls_lock;
+	struct osc_lock  *olck    = cl2osc_lock(slice);
+	struct ldlm_lock *dlmlock = olck->ols_lock;
+	int		  result  = 0;
+
+	LASSERT(cl_lock_is_mutexed(lock));
+	LINVRNT(osc_lock_invariant(olck));
+
+	if (dlmlock != NULL) {
+		int discard = !!(dlmlock->l_flags & LDLM_FL_DISCARD_DATA);
+		int do_cancel;
+
+		if (olck->ols_state >= OLS_GRANTED)
+			result = osc_lock_flush(olck, discard);
+		osc_lock_unhold(olck);
+
+		lock_res_and_lock(dlmlock);
+		/* Now that we're the only user of dlm read/write reference,
+		 * ->l_readers + ->l_writers should mostly be zero; there is
+		 * a corner case, however. See bug 18829 for details. */
+		do_cancel = (dlmlock->l_readers == 0 &&
+			     dlmlock->l_writers == 0);
+		dlmlock->l_flags |= LDLM_FL_CBPENDING;
+		unlock_res_and_lock(dlmlock);
+		if (do_cancel) {
+			int rc = ldlm_cli_cancel(&olck->ols_handle, LCF_ASYNC);
+			/* do not let a clean cancel hide a flush error */
+			if (rc < 0 || result == 0)
+				result = rc;
+		}
+		if (result < 0)
+			CL_LOCK_DEBUG(D_ERROR, env, lock,
+				      "lock %p cancel failure with error(%d)\n",
+				      lock, result);
+	}
+	olck->ols_state = OLS_CANCELLED;
+	olck->ols_flags &= ~LDLM_FL_LVB_READY;
+	osc_lock_detach(env, olck);
+}
+
+static int osc_lock_has_pages(struct osc_lock *olck)
+{
+	return 0;	/* stub: unconditionally reports "no pages" */
+}
+
+/* clo_delete() method of osc_lock_ops: unhold and detach a non-glimpse lock. */
+static void osc_lock_delete(const struct lu_env *env,
+			    const struct cl_lock_slice *slice)
+{
+	struct osc_lock *olck = cl2osc_lock(slice);
+
+	if (!olck->ols_glimpse) {
+		LINVRNT(osc_lock_invariant(olck));
+		LINVRNT(!osc_lock_has_pages(olck));
+
+		osc_lock_unhold(olck);
+		osc_lock_detach(env, olck);
+		return;
+	}
+	/* a glimpse lock must hold neither a reference nor an ldlm lock */
+	LASSERT(!olck->ols_hold);
+	LASSERT(!olck->ols_lock);
+}
+
+/**
+ * Implements cl_lock_operations::clo_state() method for osc layer.
+ *
+ * Maintains osc_lock::ols_owner field: it is set to the current osc_io when
+ * the lock enters CLS_HELD and reset to NULL on any other state change.
+ *
+ * This assumes that lock always enters CLS_HELD (from some other state) in
+ * the same IO context as one that requested the lock. This should not be a
+ * problem, because context is by definition shared by all activity pertaining
+ * to the same high-level IO.
+ */
+static void osc_lock_state(const struct lu_env *env,
+			   const struct cl_lock_slice *slice,
+			   enum cl_lock_state state)
+{
+	struct osc_lock *lock = cl2osc_lock(slice);
+
+	/* XXX multiple io contexts can use the lock at the same time. */
+	LINVRNT(osc_lock_invariant(lock));
+	if (state != CLS_HELD) {
+		lock->ols_owner = NULL;
+	} else if (slice->cls_lock->cll_state != CLS_HELD) {
+		struct osc_io *oio = osc_env_io(env);
+
+		LASSERT(lock->ols_owner == NULL);
+		lock->ols_owner = oio;
+	}
+}
+
+/* clo_print() method: emit the ldlm lock pointer, flags, handle cookie,
+ * state and owner of the osc lock, then the LVB via osc_lvb_print(). */
+static int osc_lock_print(const struct lu_env *env, void *cookie,
+			  lu_printer_t p, const struct cl_lock_slice *slice)
+{
+	struct osc_lock *ols = cl2osc_lock(slice);
+
+	/* XXX print ldlm lock and einfo properly. */
+	p(env, cookie, "%p %#16llx "LPX64" %d %p ",
+	  ols->ols_lock, ols->ols_flags, ols->ols_handle.cookie,
+	  ols->ols_state, ols->ols_owner);
+	osc_lvb_print(env, cookie, p, &ols->ols_lvb);
+	return 0;
+}
+
+/* clo_fits_into() method of osc_lock_ops: returns 1 when this lock may be
+ * matched against the request described by \a need, 0 otherwise. */
+static int osc_lock_fits_into(const struct lu_env *env,
+			      const struct cl_lock_slice *slice,
+			      const struct cl_lock_descr *need,
+			      const struct cl_io *io)
+{
+	struct osc_lock *ols = cl2osc_lock(slice);
+
+	if (need->cld_enq_flags & CEF_NEVER)
+		return 0;
+
+	if (ols->ols_state >= OLS_CANCELLED)
+		return 0;
+
+	if (need->cld_mode != CLM_PHANTOM) {
+		/*
+		 * If the lock hasn't ever enqueued, it can't be matched
+		 * because enqueue process brings in many information
+		 * which can be used to determine things such as lockless,
+		 * CEF_MUST, etc.
+		 */
+		return !((need->cld_enq_flags & CEF_MUST) &&
+			 ols->ols_state < OLS_UPCALL_RECEIVED &&
+			 ols->ols_locklessable);
+	}
+
+	if (ols->ols_agl)
+		return ols->ols_state <= OLS_RELEASED;
+
+	/*
+	 * Note: the QUEUED lock can't be matched here, otherwise
+	 * it might cause the deadlocks.
+	 * In read_process,
+	 * P1: enqueued read lock, create sublock1
+	 * P2: enqueued write lock, create sublock2(conflicted
+	 *     with sublock1).
+	 * P1: Grant read lock.
+	 * P1: enqueued glimpse lock(with holding sublock1_read),
+	 *     matched with sublock2, waiting sublock2 to be granted.
+	 *     But sublock2 can not be granted, because P1
+	 *     will not release sublock1. Bang!
+	 */
+	return ols->ols_state >= OLS_GRANTED &&
+	       ols->ols_state <= OLS_RELEASED;
+}
+
+static const struct cl_lock_operations osc_lock_ops = {
+	.clo_fini    = osc_lock_fini,
+	.clo_enqueue = osc_lock_enqueue,
+	.clo_wait    = osc_lock_wait,
+	.clo_unuse   = osc_lock_unuse,
+	.clo_use     = osc_lock_use,
+	.clo_delete  = osc_lock_delete,
+	.clo_state   = osc_lock_state,
+	.clo_cancel  = osc_lock_cancel,
+	.clo_weigh   = osc_lock_weigh,
+	.clo_print   = osc_lock_print,
+	.clo_fits_into = osc_lock_fits_into,
+};	/* default methods; osc_lock_init() adds the slice with these */
+
+/* clo_unuse() for lockless locks: cancel and delete the lock right away. */
+static int osc_lock_lockless_unuse(const struct lu_env *env,
+				   const struct cl_lock_slice *slice)
+{
+	struct osc_lock *ols = cl2osc_lock(slice);
+
+	LASSERT(ols->ols_state == OLS_GRANTED);
+	LINVRNT(osc_lock_invariant(ols));
+
+	cl_lock_cancel(env, slice->cls_lock);
+	cl_lock_delete(env, slice->cls_lock);
+	return 0;
+}
+
+/* clo_cancel() for lockless locks: flush pages, then mark OLS_CANCELLED. */
+static void osc_lock_lockless_cancel(const struct lu_env *env,
+				     const struct cl_lock_slice *slice)
+{
+	struct osc_lock *ols = cl2osc_lock(slice);
+	int rc = osc_lock_flush(ols, 0);
+
+	if (rc != 0)
+		CERROR("Pages for lockless lock %p were not purged(%d)\n",
+		       ols, rc);
+	ols->ols_state = OLS_CANCELLED;
+}
+
+/* clo_wait() for lockless locks: report the error recorded on the cl_lock. */
+static int osc_lock_lockless_wait(const struct lu_env *env,
+				  const struct cl_lock_slice *slice)
+{
+	struct osc_lock *olck = cl2osc_lock(slice);
+
+	LINVRNT(osc_lock_invariant(olck));
+	LASSERT(olck->ols_state >= OLS_UPCALL_RECEIVED);
+
+	return olck->ols_cl.cls_lock->cll_error;
+}
+
+/* clo_state() for lockless locks: on CLS_HELD remember the owning osc_io. */
+static void osc_lock_lockless_state(const struct lu_env *env,
+				    const struct cl_lock_slice *slice,
+				    enum cl_lock_state state)
+{
+	struct osc_lock *lock = cl2osc_lock(slice);
+	struct osc_io   *oio;
+
+	LINVRNT(osc_lock_invariant(lock));
+	if (state != CLS_HELD)
+		return;
+
+	oio = osc_env_io(env);
+	LASSERT(ergo(lock->ols_owner, lock->ols_owner == oio));
+	lock->ols_owner = oio;
+	/* set the io to be lockless if this lock is for io's host object */
+	if (cl_object_same(oio->oi_cl.cis_obj, slice->cls_obj))
+		oio->oi_lockless = 1;
+}
+
+/* clo_fits_into() for lockless locks: only CEF_NEVER requests issued from
+ * the io that owns the lock can match. */
+static int osc_lock_lockless_fits_into(const struct lu_env *env,
+				       const struct cl_lock_slice *slice,
+				       const struct cl_lock_descr *need,
+				       const struct cl_io *io)
+{
+	struct osc_lock *lock = cl2osc_lock(slice);
+
+	/* lockless lock should only be used by its owning io. b22147 */
+	return (need->cld_enq_flags & CEF_NEVER) &&
+	       lock->ols_owner == osc_env_io(env);
+}
+
+static const struct cl_lock_operations osc_lock_lockless_ops = {
+	.clo_fini      = osc_lock_fini,
+	.clo_enqueue   = osc_lock_enqueue,
+	.clo_wait      = osc_lock_lockless_wait,
+	.clo_unuse     = osc_lock_lockless_unuse,
+	.clo_state     = osc_lock_lockless_state,
+	.clo_fits_into = osc_lock_lockless_fits_into,
+	.clo_cancel    = osc_lock_lockless_cancel,
+	.clo_print     = osc_lock_print
+};	/* lockless variant; shares fini, enqueue and print with osc_lock_ops */
+
+/* Allocate the osc slice for \a lock, derive its ldlm flags from the enqueue
+ * flags and add it to \a lock. Returns 0, or -ENOMEM if allocation fails. */
+int osc_lock_init(const struct lu_env *env,
+		  struct cl_object *obj, struct cl_lock *lock,
+		  const struct cl_io *unused)
+{
+	struct osc_lock *clk;
+	__u32 enqflags;
+
+	OBD_SLAB_ALLOC_PTR_GFP(clk, osc_lock_kmem, __GFP_IO);
+	if (clk == NULL)
+		return -ENOMEM;
+
+	enqflags = lock->cll_descr.cld_enq_flags;
+	osc_lock_build_einfo(env, lock, clk, &clk->ols_einfo);
+	atomic_set(&clk->ols_pageref, 0);
+	clk->ols_state = OLS_NEW;
+
+	clk->ols_flags = osc_enq2ldlm_flags(enqflags);
+	clk->ols_agl = !!(enqflags & CEF_AGL);
+	if (clk->ols_agl)
+		clk->ols_flags |= LDLM_FL_BLOCK_NOWAIT;
+	if (clk->ols_flags & LDLM_FL_HAS_INTENT)
+		clk->ols_glimpse = 1;
+
+	cl_lock_slice_add(lock, &clk->ols_cl, obj, &osc_lock_ops);
+
+	/* try to convert this lock to a lockless lock */
+	if (!(enqflags & CEF_MUST))
+		osc_lock_to_lockless(env, clk, (enqflags & CEF_NEVER));
+	if (clk->ols_locklessable && !(enqflags & CEF_DISCARD_DATA))
+		clk->ols_flags |= LDLM_FL_DENY_ON_CONTENTION;
+
+	LDLM_DEBUG_NOLOCK("lock %p, osc lock %p, flags %llx\n",
+			lock, clk, clk->ols_flags);
+
+	return 0;
+}
+
+/* Returns 1 when the osc lock attached to \a dlm has a non-zero ols_pageref. */
+int osc_dlm_lock_pageref(struct ldlm_lock *dlm)
+{
+	struct osc_lock *olock;
+	int busy = 0;
+
+	spin_lock(&osc_ast_guard);
+	olock = dlm->l_ast_data;
+	/* there's a very rare race with osc_page_addref_lock(), but that
+	 * doesn't matter because in the worst case we don't cancel a lock
+	 * which we actually can, that's no harm. */
+	if (olock != NULL) {
+		atomic_t *refs = &olock->ols_pageref;
+		if (atomic_add_return(_PAGEREF_MAGIC, refs) != _PAGEREF_MAGIC) {
+			atomic_sub(_PAGEREF_MAGIC, refs);
+			busy = 1;
+		}
+	}
+	spin_unlock(&osc_ast_guard);
+	return busy;
+}
+
+/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_object.c b/drivers/staging/lustre/lustre/osc/osc_object.c
new file mode 100644
index 000000000000..ca94e6331381
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_object.c
@@ -0,0 +1,275 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_object for OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include "osc_cl_internal.h"
+
+/** \addtogroup osc
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+static struct lu_object *osc2lu(struct osc_object *osc)
+{
+	return &osc->oo_cl.co_lu;	/* the lu_object embedded in \a osc */
+}
+
+static struct osc_object *lu2osc(const struct lu_object *obj)
+{
+	LINVRNT(osc_is_object(obj));	/* inverse of osc2lu() */
+	return container_of0(obj, struct osc_object, oo_cl.co_lu);
+}
+
+/*****************************************************************************
+ *
+ * Object operations.
+ *
+ */
+
+/* loo_object_init() method: take the lov_oinfo from \a conf and put all
+ * lists, counters, spinlocks and the rb-tree root into their empty state. */
+static int osc_object_init(const struct lu_env *env, struct lu_object *obj,
+			   const struct lu_object_conf *conf)
+{
+	struct osc_object *osc = lu2osc(obj);
+	int type;
+
+	osc->oo_oinfo = lu2cl_conf(conf)->u.coc_oinfo;
+	spin_lock_init(&osc->oo_seatbelt);
+	for (type = 0; type < CRT_NR; type++)
+		INIT_LIST_HEAD(&osc->oo_inflight[type]);
+
+	INIT_LIST_HEAD(&osc->oo_ready_item);
+	INIT_LIST_HEAD(&osc->oo_hp_ready_item);
+	INIT_LIST_HEAD(&osc->oo_write_item);
+	INIT_LIST_HEAD(&osc->oo_read_item);
+
+	osc->oo_root.rb_node = NULL;
+	INIT_LIST_HEAD(&osc->oo_hp_exts);
+	INIT_LIST_HEAD(&osc->oo_urgent_exts);
+	INIT_LIST_HEAD(&osc->oo_rpc_exts);
+	INIT_LIST_HEAD(&osc->oo_reading_exts);
+	atomic_set(&osc->oo_nr_reads, 0);
+	atomic_set(&osc->oo_nr_writes, 0);
+	spin_lock_init(&osc->oo_lock);
+
+	cl_object_page_init(lu2cl(obj), sizeof(struct osc_page));
+	return 0;
+}
+
+static void osc_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+	struct osc_object *osc = lu2osc(obj);
+	int i;
+
+	for (i = 0; i < CRT_NR; ++i)
+		LASSERT(list_empty(&osc->oo_inflight[i]));
+
+	LASSERT(list_empty(&osc->oo_ready_item));
+	LASSERT(list_empty(&osc->oo_hp_ready_item));
+	LASSERT(list_empty(&osc->oo_write_item));
+	LASSERT(list_empty(&osc->oo_read_item));
+
+	LASSERT(osc->oo_root.rb_node == NULL);
+	LASSERT(list_empty(&osc->oo_hp_exts));
+	LASSERT(list_empty(&osc->oo_urgent_exts));
+	LASSERT(list_empty(&osc->oo_rpc_exts));
+	LASSERT(list_empty(&osc->oo_reading_exts));
+	LASSERT(atomic_read(&osc->oo_nr_reads) == 0);
+	LASSERT(atomic_read(&osc->oo_nr_writes) == 0);
+	/* every list and counter set up by osc_object_init() is idle here */
+	lu_object_fini(obj);
+	OBD_SLAB_FREE_PTR(osc, osc_object_kmem);
+}
+
+/* Print the five fields of \a lvb through printer \a p; returns p's result. */
+int osc_lvb_print(const struct lu_env *env, void *cookie,
+		  lu_printer_t p, const struct ost_lvb *lvb)
+{
+	return p(env, cookie, "size: "LPU64" mtime: "LPU64" atime: "LPU64" "
+		 "ctime: "LPU64" blocks: "LPU64, lvb->lvb_size, lvb->lvb_mtime,
+		 lvb->lvb_atime, lvb->lvb_ctime, lvb->lvb_blocks);
+}
+
+/* loo_object_print() method: dump the object's lov_oinfo, then its LVB. */
+static int osc_object_print(const struct lu_env *env, void *cookie,
+			    lu_printer_t p, const struct lu_object *obj)
+{
+	struct lov_oinfo    *oinfo = lu2osc(obj)->oo_oinfo;
+	struct osc_async_rc *ar    = &oinfo->loi_ar;
+
+	p(env, cookie, "id: "DOSTID" "
+	  "idx: %d gen: %d kms_valid: %u kms "LPU64" "
+	  "rc: %d force_sync: %d min_xid: "LPU64" ",
+	  POSTID(&oinfo->loi_oi), oinfo->loi_ost_idx,
+	  oinfo->loi_ost_gen, oinfo->loi_kms_valid, oinfo->loi_kms,
+	  ar->ar_rc, ar->ar_force_sync, ar->ar_min_xid);
+	osc_lvb_print(env, cookie, p, &oinfo->loi_lvb);
+	return 0;
+}
+
+/* coo_attr_get() method: kms is reported as 0 unless loi_kms_valid is set. */
+static int osc_attr_get(const struct lu_env *env, struct cl_object *obj,
+			struct cl_attr *attr)
+{
+	struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
+
+	cl_lvb2attr(attr, &oinfo->loi_lvb);
+	attr->cat_kms = oinfo->loi_kms_valid ? oinfo->loi_kms : 0;
+	return 0;
+}
+
+/* Update the LVB fields and kms selected by the CAT_* bits in \a valid. */
+int osc_attr_set(const struct lu_env *env, struct cl_object *obj,
+		 const struct cl_attr *attr, unsigned valid)
+{
+	struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
+
+	if (valid & CAT_SIZE)
+		oinfo->loi_lvb.lvb_size = attr->cat_size;
+	if (valid & CAT_MTIME)
+		oinfo->loi_lvb.lvb_mtime = attr->cat_mtime;
+	if (valid & CAT_ATIME)
+		oinfo->loi_lvb.lvb_atime = attr->cat_atime;
+	if (valid & CAT_CTIME)
+		oinfo->loi_lvb.lvb_ctime = attr->cat_ctime;
+	if (valid & CAT_BLOCKS)
+		oinfo->loi_lvb.lvb_blocks = attr->cat_blocks;
+	if (valid & CAT_KMS) {
+		CDEBUG(D_CACHE, "set kms from "LPU64" to "LPU64"\n",
+		       oinfo->loi_kms, (__u64)attr->cat_kms);
+		loi_kms_set(oinfo, attr->cat_kms);
+	}
+	return 0;
+}
+
+/* coo_glimpse() method: size comes from loi_kms, blocks from the LVB. */
+static int osc_object_glimpse(const struct lu_env *env,
+			      const struct cl_object *obj, struct ost_lvb *lvb)
+{
+	struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
+
+	ENTRY;
+	lvb->lvb_size   = oinfo->loi_kms;
+	lvb->lvb_blocks = oinfo->loi_lvb.lvb_blocks;
+	RETURN(0);
+}
+
+void osc_object_set_contended(struct osc_object *obj)
+{
+	obj->oo_contention_time = cfs_time_current();
+	/* NOTE(review): mb() left disabled; the two stores are unordered */
+	obj->oo_contended = 1;
+}
+
+void osc_object_clear_contended(struct osc_object *obj)
+{
+	obj->oo_contended = 0;	/* oo_contention_time is left untouched */
+}
+
+/* Returns 1 while \a obj is marked contended and od_contention_time seconds
+ * have not yet passed since the mark was set; an expired mark is cleared. */
+int osc_object_is_contended(struct osc_object *obj)
+{
+	struct osc_device *dev = lu2osc_dev(obj->oo_cl.co_lu.lo_dev);
+	int window             = dev->od_contention_time;
+	cfs_time_t now         = cfs_time_current();
+	cfs_time_t deadline;
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_OSC_OBJECT_CONTENTION))
+		return 1;
+
+	if (!obj->oo_contended)
+		return 0;
+
+	/* the same expiry rule as in ll_file_is_contended() */
+	deadline = cfs_time_add(obj->oo_contention_time,
+				cfs_time_seconds(window));
+
+	if (!cfs_time_after(now, deadline))
+		return 1;
+
+	osc_object_clear_contended(obj);
+	return 0;
+}
+
+static const struct cl_object_operations osc_ops = {
+	.coo_page_init = osc_page_init,
+	.coo_lock_init = osc_lock_init,
+	.coo_io_init   = osc_io_init,
+	.coo_attr_get  = osc_attr_get,
+	.coo_attr_set  = osc_attr_set,
+	.coo_glimpse   = osc_object_glimpse
+};	/* installed as co_ops by osc_object_alloc() */
+
+/* lu_object methods of the osc layer, installed as lo_ops by
+ * osc_object_alloc(). */
+static const struct lu_object_operations osc_lu_obj_ops = {
+	/* the delete, release and invariant hooks stay NULL */
+	.loo_object_init  = osc_object_init,
+	.loo_object_free  = osc_object_free,
+	.loo_object_print = osc_object_print
+};
+
+/* Allocate an osc_object and return its lu_object, or NULL on failure. */
+struct lu_object *osc_object_alloc(const struct lu_env *env,
+				   const struct lu_object_header *unused,
+				   struct lu_device *dev)
+{
+	struct osc_object *osc;
+	struct lu_object  *obj;
+
+	OBD_SLAB_ALLOC_PTR_GFP(osc, osc_object_kmem, __GFP_IO);
+	if (osc == NULL)
+		return NULL;
+	obj = osc2lu(osc);
+	lu_object_init(obj, NULL, dev);
+	osc->oo_cl.co_ops = &osc_ops;
+	obj->lo_ops = &osc_lu_obj_ops;
+	return obj;
+}
+
+/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_page.c b/drivers/staging/lustre/lustre/osc/osc_page.c
new file mode 100644
index 000000000000..07d3702ac574
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_page.c
@@ -0,0 +1,926 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_page for OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include "osc_cl_internal.h"
+
+static void osc_lru_del(struct client_obd *cli, struct osc_page *opg, bool del);
+static void osc_lru_add(struct client_obd *cli, struct osc_page *opg);
+static int osc_lru_reserve(const struct lu_env *env, struct osc_object *obj,
+			   struct osc_page *opg);
+
+/** \addtogroup osc
+ *  @{
+ */
+
+/*
+ * Comment out osc_page_protected because it may sleep inside the
+ * client_obd_list_lock.
+ * client_obd_list_lock -> osc_ap_completion -> osc_completion ->
+ *   -> osc_page_protected -> osc_page_is_dlocked -> osc_match_base
+ *   -> ldlm_lock_match -> sptlrpc_import_check_ctx -> sleep.
+ */
+#if 0
+static int osc_page_is_dlocked(const struct lu_env *env,
+			       const struct osc_page *opg,
+			       enum cl_lock_mode mode, int pending, int unref)
+{
+	struct cl_page	 *page;
+	struct osc_object      *obj;
+	struct osc_thread_info *info;
+	struct ldlm_res_id     *resname;
+	struct lustre_handle   *lockh;
+	ldlm_policy_data_t     *policy;
+	ldlm_mode_t	     dlmmode;
+	int		     flags;
+
+	might_sleep();
+	/* scratch buffers come from the per-env osc_thread_info */
+	info = osc_env_info(env);
+	resname = &info->oti_resname;
+	policy = &info->oti_policy;
+	lockh = &info->oti_handle;
+	page = opg->ops_cl.cpl_page;
+	obj = cl2osc(opg->ops_cl.cpl_obj);
+
+	flags = LDLM_FL_TEST_LOCK | LDLM_FL_BLOCK_GRANTED;
+	if (pending)
+		flags |= LDLM_FL_CBPENDING;
+	/* match mask: the requested mode or'ed with LCK_PW */
+	dlmmode = osc_cl_lock2ldlm(mode) | LCK_PW;
+	osc_lock_build_res(env, obj, resname);
+	osc_index2policy(policy, page->cp_obj, page->cp_index, page->cp_index);
+	return osc_match_base(osc_export(obj), resname, LDLM_EXTENT, policy,
+			      dlmmode, &flags, NULL, lockh, unref);
+}
+
+/**
+ * Invariant check: a cached page is covered by a DLM lock, is owned by a
+ * CILR_NEVER io, or matches a lockless sub-lock. Returns non-zero if so.
+ */
+static int osc_page_protected(const struct lu_env *env,
+			      const struct osc_page *opg,
+			      enum cl_lock_mode mode, int unref)
+{
+	struct cl_object_header *hdr;
+	struct cl_lock	  *scan;
+	struct cl_page	  *page;
+	struct cl_lock_descr    *descr;
+	int result;
+
+	LINVRNT(!opg->ops_temp);
+
+	page = opg->ops_cl.cpl_page;
+	if (page->cp_owner != NULL &&
+	    cl_io_top(page->cp_owner)->ci_lockreq == CILR_NEVER)
+		/*
+		 * If IO is done without locks (liblustre, or lloop), lock is
+		 * not required.
+		 */
+		result = 1;
+	else
+		/* else body is the next line: check for a DLM lock */
+	result = osc_page_is_dlocked(env, opg, mode, 1, unref);
+	if (result == 0) {
+		/* maybe this page is a part of a lockless io? */
+		hdr = cl_object_header(opg->ops_cl.cpl_obj);
+		descr = &osc_env_info(env)->oti_descr;
+		descr->cld_mode = mode;
+		descr->cld_start = page->cp_index;
+		descr->cld_end   = page->cp_index;
+		spin_lock(&hdr->coh_lock_guard);
+		list_for_each_entry(scan, &hdr->coh_locks, cll_linkage) {
+			/*
+			 * Lock-less sub-lock has to be either in HELD state
+			 * (when io is actively going on), or in CACHED state,
+			 * when top-lock is being unlocked:
+			 * cl_io_unlock()->cl_unuse()->...->lov_lock_unuse().
+			 */
+			if ((scan->cll_state == CLS_HELD ||
+			     scan->cll_state == CLS_CACHED) &&
+			    cl_lock_ext_match(&scan->cll_descr, descr)) {
+				struct osc_lock *olck;
+
+				olck = osc_lock_at(scan);
+				result = osc_lock_is_lockless(olck);
+				break;
+			}
+		}
+		spin_unlock(&hdr->coh_lock_guard);
+	}
+	return result;
+}
+#else
+static int osc_page_protected(const struct lu_env *env,
+			      const struct osc_page *opg,
+			      enum cl_lock_mode mode, int unref)
+{
+	return 1;	/* stub: the lock-coverage check is compiled out */
+}
+#endif
+
+/*****************************************************************************
+ *
+ * Page operations.
+ *
+ */
+static void osc_page_fini(const struct lu_env *env,
+			  struct cl_page_slice *slice)
+{
+	struct osc_page *opg = cl2osc_page(slice);
+	CDEBUG(D_TRACE, "%p\n", opg);
+	LASSERT(opg->ops_lock == NULL);	/* no lock may still be pinned */
+}
+
+static void osc_page_transfer_get(struct osc_page *opg, const char *label)
+{
+	struct cl_page *page = cl_page_top(opg->ops_cl.cpl_page);
+
+	LASSERT(!opg->ops_transfer_pinned);	/* pinned at most once */
+	cl_page_get(page);
+	lu_ref_add_atomic(&page->cp_reference, label, page);
+	opg->ops_transfer_pinned = 1;	/* undone by osc_page_transfer_put() */
+}
+
+static void osc_page_transfer_put(const struct lu_env *env,
+				  struct osc_page *opg)
+{
+	struct cl_page *top = cl_page_top(opg->ops_cl.cpl_page);
+
+	if (!opg->ops_transfer_pinned)
+		return;	/* no transfer reference is held */
+	lu_ref_del(&top->cp_reference, "transfer", top);
+	opg->ops_transfer_pinned = 0;
+	cl_page_put(env, top);
+}
+
+/**
+ * Called once for every page when it is submitted for a transfer, either
+ * opportunistic (osc_page_cache_add()) or immediate (osc_page_submit()).
+ * Links the page on the object's oo_inflight[] list for request type \a crt.
+ */
+static void osc_page_transfer_add(const struct lu_env *env,
+				  struct osc_page *opg, enum cl_req_type crt)
+{
+	struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj);
+
+	/* ops_lru and ops_inflight share the same field, so take it from LRU
+	 * first and then use it as inflight. */
+	osc_lru_del(osc_cli(obj), opg, false);
+
+	spin_lock(&obj->oo_seatbelt);
+	list_add(&opg->ops_inflight, &obj->oo_inflight[crt]);
+	opg->ops_submitter = current;
+	spin_unlock(&obj->oo_seatbelt);
+}
+
+static int osc_page_cache_add(const struct lu_env *env,
+			      const struct cl_page_slice *slice,
+			      struct cl_io *io)
+{
+	struct osc_io   *oio = osc_env_io(env);
+	struct osc_page *opg = cl2osc_page(slice);
+	int result;
+	ENTRY;
+
+	LINVRNT(osc_page_protected(env, opg, CLM_WRITE, 0));
+	/* pin the page first; the pin is dropped again if queueing fails */
+	osc_page_transfer_get(opg, "transfer\0cache");
+	result = osc_queue_async_io(env, io, opg);
+	if (result != 0)
+		osc_page_transfer_put(env, opg);
+	else
+		osc_page_transfer_add(env, opg, CRT_WRITE);
+
+	/* for sync write, kernel will wait for this page to be flushed before
+	 * osc_io_end() is called, so release the active extent earlier.
+	 * for mkwrite(), it's known there are no further pages. */
+	if (cl_io_is_sync_write(io) || cl_io_is_mkwrite(io)) {
+		if (oio->oi_active != NULL) {
+			osc_extent_release(env, oio->oi_active);
+			oio->oi_active = NULL;
+		}
+	}
+
+	RETURN(result);
+}
+
+void osc_index2policy(ldlm_policy_data_t *policy, const struct cl_object *obj,
+		      pgoff_t start, pgoff_t end)
+{	/* inclusive extent: cl_offset(start) .. cl_offset(end + 1) - 1 */
+	memset(policy, 0, sizeof *policy);
+	policy->l_extent.start = cl_offset(obj, start);
+	policy->l_extent.end   = cl_offset(obj, end + 1) - 1;
+}
+
+static int osc_page_addref_lock(const struct lu_env *env,
+				struct osc_page *opg,
+				struct cl_lock *lock)
+{
+	struct osc_lock *ols;
+
+	LASSERT(opg->ops_lock == NULL);
+	ols = osc_lock_at(lock);
+
+	/* counter not positive even after our increment: undo and fail */
+	if (atomic_inc_return(&ols->ols_pageref) <= 0) {
+		atomic_dec(&ols->ols_pageref);
+		return -ENODATA;
+	}
+
+	/* remember the lock in the page and hold a reference on it */
+	cl_lock_get(lock);
+	opg->ops_lock = lock;
+	return 0;
+}
+
+static void osc_page_putref_lock(const struct lu_env *env,
+				 struct osc_page *opg)
+{
+	struct cl_lock  *lock = opg->ops_lock;
+	struct osc_lock *olock;
+
+	LASSERT(lock != NULL);
+	olock = osc_lock_at(lock);
+	/* undo osc_page_addref_lock(): page count, back pointer, lock ref */
+	atomic_dec(&olock->ols_pageref);
+	opg->ops_lock = NULL;
+
+	cl_lock_put(env, lock);
+}
+
+static int osc_page_is_under_lock(const struct lu_env *env,
+				  const struct cl_page_slice *slice,
+				  struct cl_io *unused)
+{
+	struct cl_lock *lock;
+	int	     result = -ENODATA;	/* no lock, or it was not pinned */
+
+	ENTRY;
+	lock = cl_lock_at_page(env, slice->cpl_obj, slice->cpl_page,
+			       NULL, 1, 0);
+	if (lock != NULL) {
+		if (osc_page_addref_lock(env, cl2osc_page(slice), lock) == 0)
+			result = -EBUSY;	/* lock pinned to the page */
+		cl_lock_put(env, lock);	/* the page holds its own ref */
+	}
+	RETURN(result);
+}
+
+static void osc_page_disown(const struct lu_env *env,
+			    const struct cl_page_slice *slice,
+			    struct cl_io *io)
+{
+	struct osc_page *opg = cl2osc_page(slice);
+
+	if (unlikely(opg->ops_lock))	/* a lock is still pinned: unpin */
+		osc_page_putref_lock(env, opg);
+}
+
+static void osc_page_completion_read(const struct lu_env *env,
+				     const struct cl_page_slice *slice,
+				     int ioret)
+{
+	struct osc_page   *opg = cl2osc_page(slice);
+	struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj);
+
+	if (likely(opg->ops_lock))	/* set by osc_page_addref_lock() */
+		osc_page_putref_lock(env, opg);
+	osc_lru_add(osc_cli(obj), opg);	/* no-op unless ops_in_lru */
+}
+
+static void osc_page_completion_write(const struct lu_env *env,
+				      const struct cl_page_slice *slice,
+				      int ioret)
+{
+	struct osc_page   *opg = cl2osc_page(slice);
+	struct osc_object *obj = cl2osc(slice->cpl_obj);
+
+	osc_lru_add(osc_cli(obj), opg);	/* no-op unless ops_in_lru */
+}
+
+static int osc_page_fail(const struct lu_env *env,
+			 const struct cl_page_slice *slice,
+			 struct cl_io *unused)
+{
+	/*
+	 * cpo_cache_add for CRT_READ (see osc_page_ops): must never run.
+	 */
+	LBUG();
+	return 0;
+}
+
+/* List state as text for osc_page_print(): "-" if empty, "+" otherwise. */
+static const char *osc_list(struct list_head *head)
+{
+	return list_empty(head) ? "-" : "+";
+}
+
+static inline cfs_time_t osc_submit_duration(struct osc_page *opg)
+{
+	/* time elapsed since ops_submit_time; 0 if that stamp is unset */
+	if (opg->ops_submit_time != 0)
+		return cfs_time_current() - opg->ops_submit_time;
+	return 0;
+}
+
+static int osc_page_print(const struct lu_env *env,
+			  const struct cl_page_slice *slice,
+			  void *cookie, lu_printer_t printer)
+{
+	struct osc_page       *opg = cl2osc_page(slice);
+	struct osc_async_page *oap = &opg->ops_oap;
+	struct osc_object     *obj = cl2osc(slice->cpl_obj);
+	struct client_obd     *cli = &osc_export(obj)->exp_obd->u.cli;
+
+	return (*printer)(env, cookie, LUSTRE_OSC_NAME"-page@%p: "
+			  "1< %#x %d %u %s %s > "
+			  "2< "LPU64" %u %u %#x %#x | %p %p %p > "
+			  "3< %s %p %d %lu %d > "
+			  "4< %d %d %d %lu %s | %s %s %s %s > "
+			  "5< %s %s %s %s | %d %s | %d %s %s>\n",
+			  opg,
+			  /* 1: osc_async_page state and list membership */
+			  oap->oap_magic, oap->oap_cmd,
+			  oap->oap_interrupted,
+			  osc_list(&oap->oap_pending_item),
+			  osc_list(&oap->oap_rpc_item),
+			  /* 2: osc_async_page extent, flags and owners */
+			  oap->oap_obj_off, oap->oap_page_off, oap->oap_count,
+			  oap->oap_async_flags, oap->oap_brw_flags,
+			  oap->oap_request, oap->oap_cli, obj,
+			  /* 3: osc_page transfer state */
+			  osc_list(&opg->ops_inflight),
+			  opg->ops_submitter, opg->ops_transfer_pinned,
+			  osc_submit_duration(opg), opg->ops_srvlock,
+			  /* 4: client_obd in-flight counters, grant, lists */
+			  cli->cl_r_in_flight, cli->cl_w_in_flight,
+			  cli->cl_max_rpcs_in_flight,
+			  cli->cl_avail_grant,
+			  osc_list(&cli->cl_cache_waiters),
+			  osc_list(&cli->cl_loi_ready_list),
+			  osc_list(&cli->cl_loi_hp_ready_list),
+			  osc_list(&cli->cl_loi_write_list),
+			  osc_list(&cli->cl_loi_read_list),
+			  /* 5: osc_object list membership and counters */
+			  osc_list(&obj->oo_ready_item),
+			  osc_list(&obj->oo_hp_ready_item),
+			  osc_list(&obj->oo_write_item),
+			  osc_list(&obj->oo_read_item),
+			  atomic_read(&obj->oo_nr_reads),
+			  osc_list(&obj->oo_reading_exts),
+			  atomic_read(&obj->oo_nr_writes),
+			  osc_list(&obj->oo_hp_exts),
+			  osc_list(&obj->oo_urgent_exts));
+}
+
+static void osc_page_delete(const struct lu_env *env,
+			    const struct cl_page_slice *slice)
+{
+	struct osc_page   *opg = cl2osc_page(slice);
+	struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj);
+	int rc;
+
+	LINVRNT(opg->ops_temp || osc_page_protected(env, opg, CLM_READ, 1));
+
+	ENTRY;
+	CDEBUG(D_TRACE, "%p\n", opg);
+	osc_page_transfer_put(env, opg);
+	rc = osc_teardown_async_page(env, obj, opg);
+	if (rc) {
+		CL_PAGE_DEBUG(D_ERROR, env, cl_page_top(slice->cpl_page),
+			      "Trying to teardown failed: %d\n", rc);
+		LASSERT(0);	/* a failed teardown is treated as fatal */
+	}
+	/* unlink from the in-flight list if osc_page_transfer_add() linked it */
+	spin_lock(&obj->oo_seatbelt);
+	if (opg->ops_submitter != NULL) {
+		LASSERT(!list_empty(&opg->ops_inflight));
+		list_del_init(&opg->ops_inflight);
+		opg->ops_submitter = NULL;
+	}
+	spin_unlock(&obj->oo_seatbelt);
+	/* del == true: osc_lru_del() also returns the slot to cl_lru_left */
+	osc_lru_del(osc_cli(obj), opg, true);
+	EXIT;
+}
+
+void osc_page_clip(const struct lu_env *env, const struct cl_page_slice *slice,
+		   int from, int to)
+{
+	struct osc_page       *opg = cl2osc_page(slice);
+	struct osc_async_page *oap = &opg->ops_oap;
+
+	LINVRNT(osc_page_protected(env, opg, CLM_READ, 0));
+	/* osc_page_submit() turns this range into oap_page_off/oap_count */
+	opg->ops_from = from;
+	opg->ops_to   = to;
+	spin_lock(&oap->oap_lock);
+	oap->oap_async_flags |= ASYNC_COUNT_STABLE;
+	spin_unlock(&oap->oap_lock);
+}
+
+static int osc_page_cancel(const struct lu_env *env,
+			   const struct cl_page_slice *slice)
+{
+	struct osc_page *opg = cl2osc_page(slice);
+	int rc = 0;
+
+	LINVRNT(osc_page_protected(env, opg, CLM_READ, 0));
+
+	/* Only a pinned page has a transfer to cancel; success (rc == 0)
+	 * implies the pin was dropped, which the LASSERT below checks. */
+	if (opg->ops_transfer_pinned)
+		/* FIXME: may not be interrupted.. */
+		rc = osc_cancel_async_page(env, opg);
+	LASSERT(ergo(rc == 0, opg->ops_transfer_pinned == 0));
+	return rc;
+}
+
+static int osc_page_flush(const struct lu_env *env,
+			  const struct cl_page_slice *slice,
+			  struct cl_io *io)
+{
+	int rc;
+	ENTRY;
+	/* hand the slice's osc_page to osc_flush_async_page() */
+	rc = osc_flush_async_page(env, io, cl2osc_page(slice));
+	RETURN(rc);
+}
+
+static const struct cl_page_operations osc_page_ops = {
+	.cpo_fini	  = osc_page_fini,
+	.cpo_print	 = osc_page_print,
+	.cpo_delete	= osc_page_delete,
+	.cpo_is_under_lock = osc_page_is_under_lock,
+	.cpo_disown	= osc_page_disown,
+	.io = {
+		[CRT_READ] = {
+			.cpo_cache_add  = osc_page_fail, /* LBUG()s if called */
+			.cpo_completion = osc_page_completion_read
+		},
+		[CRT_WRITE] = {
+			.cpo_cache_add  = osc_page_cache_add,
+			.cpo_completion = osc_page_completion_write
+		}
+	},
+	.cpo_clip	   = osc_page_clip,
+	.cpo_cancel	 = osc_page_cancel,
+	.cpo_flush	  = osc_page_flush
+}; /* registered for each page slice by osc_page_init() */
+
+int osc_page_init(const struct lu_env *env, struct cl_object *obj,
+		struct cl_page *page, struct page *vmpage)
+{
+	struct osc_object *osc = cl2osc(obj);
+	struct osc_page   *opg = cl_object_page_slice(obj, page);
+	int result;
+
+	opg->ops_from = 0;
+	opg->ops_to   = PAGE_CACHE_SIZE;
+	/* default range is the whole page; osc_page_clip() may change it */
+	result = osc_prep_async_page(osc, opg, vmpage,
+					cl_offset(obj, page->cp_index));
+	if (result == 0) {
+		struct osc_io *oio = osc_env_io(env);
+		opg->ops_srvlock = osc_io_srvlock(oio);
+		cl_page_slice_add(page, &opg->ops_cl, obj,
+				&osc_page_ops);
+	}
+	/*
+	 * Cannot assert osc_page_protected() here as read-ahead
+	 * creates temporary pages outside of a lock.
+	 */
+	/* ops_inflight and ops_lru are the same field, but it doesn't
+	 * hurt to initialize it twice :-) */
+	INIT_LIST_HEAD(&opg->ops_inflight);
+	INIT_LIST_HEAD(&opg->ops_lru);
+
+	/* reserve an LRU slot, but only for cacheable pages set up above */
+	if (page->cp_type == CPT_CACHEABLE && result == 0)
+		result = osc_lru_reserve(env, osc, opg);
+
+	return result;
+}
+
+/**
+ * Helper function called by osc_io_submit() for every page in an immediate
+ * transfer (i.e., transferred synchronously).
+ */
+void osc_page_submit(const struct lu_env *env, struct osc_page *opg,
+		     enum cl_req_type crt, int brw_flags)
+{
+	struct osc_async_page *oap = &opg->ops_oap;
+	struct osc_object     *obj = oap->oap_obj;
+
+	LINVRNT(osc_page_protected(env, opg,
+				   crt == CRT_WRITE ? CLM_WRITE : CLM_READ, 1));
+
+	LASSERTF(oap->oap_magic == OAP_MAGIC, "Bad oap magic: oap %p, "
+		 "magic 0x%x\n", oap, oap->oap_magic);
+	LASSERT(oap->oap_async_flags & ASYNC_READY);
+	LASSERT(oap->oap_async_flags & ASYNC_COUNT_STABLE);
+	/* fill in the async page from the ops_from/ops_to range */
+	oap->oap_cmd       = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ;
+	oap->oap_page_off  = opg->ops_from;
+	oap->oap_count     = opg->ops_to - opg->ops_from;
+	oap->oap_brw_flags = OBD_BRW_SYNC | brw_flags;
+
+	if (!client_is_remote(osc_export(obj)) &&
+			cfs_capable(CFS_CAP_SYS_RESOURCE)) {
+		oap->oap_brw_flags |= OBD_BRW_NOQUOTA;
+		oap->oap_cmd |= OBD_BRW_NOQUOTA;
+	}
+	/* timestamp and pin the page, then link it on the in-flight list */
+	opg->ops_submit_time = cfs_time_current();
+	osc_page_transfer_get(opg, "transfer\0imm");
+	osc_page_transfer_add(env, opg, crt);
+}
+
+/* --------------- LRU page management ------------------ */
+
+/* OSC is a natural place to manage LRU pages as applications are specialized
+ * to write OSC by OSC. Ideally, if one OSC is used more frequently it should
+ * occupy more LRU slots. On the other hand, we should avoid using up all LRU
+ * slots (client_obd::cl_lru_left) otherwise process has to be put into sleep
+ * for free LRU slots - this will be very bad so the algorithm requires each
+ * OSC to free slots voluntarily to maintain a reasonable number of free slots
+ * at any time.
+ */
+
+static CFS_DECL_WAITQ(osc_lru_waitq);
+static atomic_t osc_lru_waiters = ATOMIC_INIT(0);
+/* LRU pages are freed in batch mode. An OSC should free at least this
+ * number of pages to avoid running out of LRU budget, and... */
+static const int lru_shrink_min = 2 << (20 - PAGE_CACHE_SHIFT);  /* 2M */
+/* ...at most this number, otherwise it would take too long to finish. */
+static const int lru_shrink_max = 32 << (20 - PAGE_CACHE_SHIFT); /* 32M */
+
+/* Check if we can free LRU slots from this OSC. If there are LRU waiters,
+ * we should free slots aggressively. In this way, slots are freed in a steady
+ * step to maintain fairness among OSCs.
+ * Return how many LRU pages should be freed: 0, or at most half of
+ * cl_lru_in_list capped at lru_shrink_max. */
+static int osc_cache_too_much(struct client_obd *cli)
+{
+	struct cl_client_cache *cache = cli->cl_cache;
+	int pages = atomic_read(&cli->cl_lru_in_list) >> 1;
+
+	if (atomic_read(&osc_lru_waiters) > 0 &&
+	    atomic_read(cli->cl_lru_left) < lru_shrink_max)
+		/* drop lru pages aggressively */
+		return min(pages, lru_shrink_max);
+
+	/* if it's going to run out of LRU slots, we should free some, but
+	 * not too many, to maintain fairness among OSCs. */
+	if (atomic_read(cli->cl_lru_left) < cache->ccc_lru_max >> 4) {
+		unsigned long tmp;
+		/* tmp: an equal share of ccc_lru_max per cache user */
+		tmp = cache->ccc_lru_max / atomic_read(&cache->ccc_users);
+		if (pages > tmp)
+			return min(pages, lru_shrink_max);
+
+		return pages > lru_shrink_min ? lru_shrink_min : 0;
+	}
+
+	return 0;
+}
+
+/* Discard @max_index pages from @pvec; return how many were NOT discarded. */
+static int discard_pagevec(const struct lu_env *env, struct cl_io *io,
+			   struct cl_page **pvec, int max_index)
+{
+	int count;
+	int i;
+
+	for (count = 0, i = 0; i < max_index; i++) {
+		struct cl_page *page = pvec[i];
+		if (cl_page_own_try(env, io, page) == 0) {
+			/* free LRU page only if nobody is using it.
+			 * This check is necessary to avoid freeing the pages
+			 * having already been removed from LRU and pinned
+			 * for IO. */
+			if (!cl_page_in_use(page)) {
+				cl_page_unmap(env, io, page);
+				cl_page_discard(env, io, page);
+				++count;
+			}
+			cl_page_disown(env, io, page);
+		}
+		cl_page_put(env, page);	/* ref taken when stored in @pvec */
+		pvec[i] = NULL;
+	}
+	return max_index - count;
+}
+
+/**
+ * Discard up to @target LRU pages of @cli; returns how many, else 0 or error.
+ */
+int osc_lru_shrink(struct client_obd *cli, int target)
+{
+	struct cl_env_nest nest;
+	struct lu_env *env;
+	struct cl_io *io;
+	struct cl_object *clobj = NULL;
+	struct cl_page **pvec;
+	struct osc_page *opg;
+	int maxscan = 0;
+	int count = 0;
+	int index = 0;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(atomic_read(&cli->cl_lru_in_list) >= 0);
+	if (atomic_read(&cli->cl_lru_in_list) == 0 || target <= 0)
+		RETURN(0);
+
+	env = cl_env_nested_get(&nest);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	pvec = osc_env_info(env)->oti_pvec;
+	io = &osc_env_info(env)->oti_io;
+	/* cl_lru_shrinkers lets osc_lru_del() see that a shrink is running */
+	client_obd_list_lock(&cli->cl_lru_list_lock);
+	atomic_inc(&cli->cl_lru_shrinkers);
+	maxscan = min(target << 1, atomic_read(&cli->cl_lru_in_list));
+	while (!list_empty(&cli->cl_lru_list)) {
+		struct cl_page *page;
+
+		if (--maxscan < 0)
+			break;
+
+		opg = list_entry(cli->cl_lru_list.next, struct osc_page,
+				     ops_lru);
+		page = cl_page_top(opg->ops_cl.cpl_page);
+		if (cl_page_in_use_noref(page)) {
+			list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
+			continue;
+		}
+
+		LASSERT(page->cp_obj != NULL);
+		if (clobj != page->cp_obj) {
+			struct cl_object *tmp = page->cp_obj;
+
+			cl_object_get(tmp);
+			client_obd_list_unlock(&cli->cl_lru_list_lock);
+
+			if (clobj != NULL) {
+				count -= discard_pagevec(env, io, pvec, index);
+				index = 0;
+
+				cl_io_fini(env, io);
+				cl_object_put(env, clobj);
+				clobj = NULL;
+			}
+
+			clobj = tmp;
+			io->ci_obj = clobj;
+			io->ci_ignore_layout = 1;
+			rc = cl_io_init(env, io, CIT_MISC, clobj);
+
+			client_obd_list_lock(&cli->cl_lru_list_lock);
+
+			if (rc != 0)
+				break;
+
+			++maxscan;	/* this pass handled no page: uncount it */
+			continue;
+		}
+
+		/* move this page to the end of list as it will be discarded
+		 * soon. The page will be finally removed from LRU list in
+		 * osc_page_delete().  */
+		list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
+
+		/* it's okay to grab a refcount here w/o holding lock because
+		 * it has to grab cl_lru_list_lock to delete the page. */
+		cl_page_get(page);
+		pvec[index++] = page;
+		if (++count >= target)
+			break;
+
+		if (unlikely(index == OTI_PVEC_SIZE)) {
+			client_obd_list_unlock(&cli->cl_lru_list_lock);
+			count -= discard_pagevec(env, io, pvec, index);
+			index = 0;
+
+			client_obd_list_lock(&cli->cl_lru_list_lock);
+		}
+	}
+	client_obd_list_unlock(&cli->cl_lru_list_lock);
+
+	if (clobj != NULL) {
+		count -= discard_pagevec(env, io, pvec, index);
+
+		cl_io_fini(env, io);
+		cl_object_put(env, clobj);
+	}
+	cl_env_nested_put(&nest, env);
+
+	atomic_dec(&cli->cl_lru_shrinkers);
+	RETURN(count > 0 ? count : rc);
+}
+
+static void osc_lru_add(struct client_obd *cli, struct osc_page *opg)
+{
+	bool wakeup = false;
+
+	if (!opg->ops_in_lru)	/* only set by osc_lru_reserve() */
+		return;
+
+	atomic_dec(&cli->cl_lru_busy);
+	client_obd_list_lock(&cli->cl_lru_list_lock);
+	if (list_empty(&opg->ops_lru)) {
+		list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
+		atomic_inc_return(&cli->cl_lru_in_list);
+		wakeup = atomic_read(&osc_lru_waiters) > 0;
+	}
+	client_obd_list_unlock(&cli->cl_lru_list_lock);
+	/* somebody waits for LRU slots: shrink our list and wake them all */
+	if (wakeup) {
+		osc_lru_shrink(cli, osc_cache_too_much(cli));
+		wake_up_all(&osc_lru_waitq);
+	}
+}
+
+/* Delete page from the LRU list, for one of two reasons: redirtied, or
+ * deleted from page cache. @del is true only in osc_page_delete(). */
+static void osc_lru_del(struct client_obd *cli, struct osc_page *opg, bool del)
+{
+	if (opg->ops_in_lru) {
+		client_obd_list_lock(&cli->cl_lru_list_lock);
+		if (!list_empty(&opg->ops_lru)) {
+			LASSERT(atomic_read(&cli->cl_lru_in_list) > 0);
+			list_del_init(&opg->ops_lru);
+			atomic_dec(&cli->cl_lru_in_list);
+			if (!del)
+				atomic_inc(&cli->cl_lru_busy);
+		} else if (del) {
+			LASSERT(atomic_read(&cli->cl_lru_busy) > 0);
+			atomic_dec(&cli->cl_lru_busy);
+		}
+		client_obd_list_unlock(&cli->cl_lru_list_lock);
+		if (del) {
+			atomic_inc(cli->cl_lru_left);
+			/* this is a great place to release more LRU pages if
+			 * this osc occupies too many LRU pages and kernel is
+			 * stealing one of them.
+			 * cl_lru_shrinkers is to avoid recursive call in case
+			 * we're already in the context of osc_lru_shrink(). */
+			if (atomic_read(&cli->cl_lru_shrinkers) == 0)
+				osc_lru_shrink(cli, osc_cache_too_much(cli));
+			wake_up(&osc_lru_waitq);
+		}
+	} else {
+		LASSERT(list_empty(&opg->ops_lru));
+	}
+}
+
+static inline int max_to_shrink(struct client_obd *cli)
+{	/* half of this client's LRU list, capped at lru_shrink_max */
+	return min(atomic_read(&cli->cl_lru_in_list) >> 1, lru_shrink_max);
+}
+
+static int osc_lru_reclaim(struct client_obd *cli)
+{
+	struct cl_client_cache *cache = cli->cl_cache;
+	int max_scans;
+	int rc;
+
+	LASSERT(cache != NULL);
+	LASSERT(!list_empty(&cache->ccc_lru));
+	/* first try to free lru_shrink_min pages from our own LRU list */
+	rc = osc_lru_shrink(cli, lru_shrink_min);
+	if (rc != 0) {
+		CDEBUG(D_CACHE, "%s: Free %d pages from own LRU: %p.\n",
+			cli->cl_import->imp_obd->obd_name, rc, cli);
+		return rc;
+	}
+
+	CDEBUG(D_CACHE, "%s: cli %p no free slots, pages: %d, busy: %d.\n",
+		cli->cl_import->imp_obd->obd_name, cli,
+		atomic_read(&cli->cl_lru_in_list),
+		atomic_read(&cli->cl_lru_busy));
+
+	/* Reclaim LRU slots from other client_obd as it can't free enough
+	 * from its own. This should rarely happen. */
+	spin_lock(&cache->ccc_lru_lock);
+	cache->ccc_lru_shrinkers++;
+	list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru);
+	/* visit at most ccc_users - 1 clients, always taking the list head */
+	max_scans = atomic_read(&cache->ccc_users);
+	while (--max_scans > 0 && !list_empty(&cache->ccc_lru)) {
+		cli = list_entry(cache->ccc_lru.next, struct client_obd,
+					cl_lru_osc);
+
+		CDEBUG(D_CACHE, "%s: cli %p LRU pages: %d, busy: %d.\n",
+			cli->cl_import->imp_obd->obd_name, cli,
+			atomic_read(&cli->cl_lru_in_list),
+			atomic_read(&cli->cl_lru_busy));
+
+		list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru);
+		if (atomic_read(&cli->cl_lru_in_list) > 0) {
+			spin_unlock(&cache->ccc_lru_lock);
+
+			rc = osc_lru_shrink(cli, max_to_shrink(cli));
+			spin_lock(&cache->ccc_lru_lock);
+			if (rc != 0)
+				break;
+		}
+	}
+	spin_unlock(&cache->ccc_lru_lock);
+	/* NOTE: @cli may now point at the last client_obd scanned above */
+	CDEBUG(D_CACHE, "%s: cli %p freed %d pages.\n",
+		cli->cl_import->imp_obd->obd_name, cli, rc);
+	return rc;
+}
+
+static int osc_lru_reserve(const struct lu_env *env, struct osc_object *obj,
+			   struct osc_page *opg)
+{
+	struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+	struct client_obd *cli = osc_cli(obj);
+	int rc = 0;
+	ENTRY;
+
+	if (cli->cl_cache == NULL) /* shall not be in LRU */
+		RETURN(0);
+	/* take one slot from *cl_lru_left; loop while the counter is at 0 */
+	LASSERT(atomic_read(cli->cl_lru_left) >= 0);
+	while (!cfs_atomic_add_unless(cli->cl_lru_left, -1, 0)) {
+		int gen;
+
+		/* ran out of LRU slots, try to drop some by itself */
+		rc = osc_lru_reclaim(cli);
+		if (rc < 0)
+			break;
+		if (rc > 0)
+			continue;
+
+		cond_resched();
+
+		/* slowest case, all of the cached pages are busy; notify
+		 * other OSCs that we lack LRU slots. */
+		atomic_inc(&osc_lru_waiters);
+		/* sleep until a slot is free or our LRU list length changes */
+		gen = atomic_read(&cli->cl_lru_in_list);
+		rc = l_wait_event(osc_lru_waitq,
+				atomic_read(cli->cl_lru_left) > 0 ||
+				(atomic_read(&cli->cl_lru_in_list) > 0 &&
+				 gen != atomic_read(&cli->cl_lru_in_list)),
+				&lwi);
+
+		atomic_dec(&osc_lru_waiters);
+		if (rc < 0)
+			break;
+	}
+	/* slot reserved: count the page as busy and mark it LRU-managed */
+	if (rc >= 0) {
+		atomic_inc(&cli->cl_lru_busy);
+		opg->ops_in_lru = 1;
+		rc = 0;
+	}
+
+	RETURN(rc);
+}
+
+/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_quota.c b/drivers/staging/lustre/lustre/osc/osc_quota.c
new file mode 100644
index 000000000000..69caab76ced3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_quota.c
@@ -0,0 +1,332 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * Code originally extracted from quota directory
+ */
+
+#include <obd_ost.h>
+#include "osc_internal.h"
+
+static inline struct osc_quota_info *osc_oqi_alloc(obd_uid id)
+{
+	struct osc_quota_info *oqi;
+
+	OBD_SLAB_ALLOC_PTR(oqi, osc_quota_kmem);
+	if (oqi != NULL)
+		oqi->oqi_id = id;
+	/* NULL when the allocation from osc_quota_kmem failed */
+	return oqi;
+}
+
+int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[])
+{
+	int type;
+	ENTRY;
+	/* NO_QUOTA as soon as any id of @qid is in its per-type hash table */
+	for (type = 0; type < MAXQUOTAS; type++) {
+		struct osc_quota_info *oqi;
+
+		oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]);
+		if (oqi) {
+			obd_uid id = oqi->oqi_id;
+
+			LASSERTF(id == qid[type],
+				 "The ids don't match %u != %u\n",
+				 id, qid[type]);
+
+			/* the slot is busy, the user is about to run out of
+			 * quota space on this OST */
+			CDEBUG(D_QUOTA, "chkdq found noquota for %s %u\n",
+			       type == USRQUOTA ? "user" : "group", qid[type]);
+			RETURN(NO_QUOTA);
+		}
+	}
+
+	RETURN(QUOTA_OK);
+}
+/* Map a quota type to its bit in osc_quota_setdq()'s @valid / @flags. */
+#define MD_QUOTA_FLAG(type) (((type) == USRQUOTA) ? OBD_MD_FLUSRQUOTA \
+						  : OBD_MD_FLGRPQUOTA)
+#define FL_QUOTA_FLAG(type) (((type) == USRQUOTA) ? OBD_FL_NO_USRQUOTA \
+						  : OBD_FL_NO_GRPQUOTA)
+
+int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[],
+		    obd_flag valid, obd_flag flags)
+{
+	int type;
+	int rc = 0;
+	ENTRY;
+	/* nothing to do unless @valid carries one of the quota flags */
+	if ((valid & (OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA)) == 0)
+		RETURN(0);
+
+	for (type = 0; type < MAXQUOTAS; type++) {
+		struct osc_quota_info *oqi;
+
+		if ((valid & MD_QUOTA_FLAG(type)) == 0)
+			continue;
+
+		/* lookup the ID in the per-type hash table */
+		oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]);
+		if ((flags & FL_QUOTA_FLAG(type)) != 0) {
+			/* This ID is getting close to its quota limit, let's
+			 * switch to sync I/O */
+			if (oqi != NULL)
+				continue;
+
+			oqi = osc_oqi_alloc(qid[type]);
+			if (oqi == NULL) {
+				rc = -ENOMEM;
+				break;
+			}
+
+			rc = cfs_hash_add_unique(cli->cl_quota_hash[type],
+						 &qid[type], &oqi->oqi_hash);
+			/* race with others? */
+			if (rc == -EALREADY) {
+				rc = 0;
+				OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem);
+			}
+
+			CDEBUG(D_QUOTA, "%s: setdq to insert for %s %u (%d)\n",
+			       cli->cl_import->imp_obd->obd_name,
+			       type == USRQUOTA ? "user" : "group",
+			       qid[type], rc);
+		} else {
+			/* This ID is now off the hook, let's remove it from
+			 * the hash table */
+			if (oqi == NULL)
+				continue;
+
+			oqi = cfs_hash_del_key(cli->cl_quota_hash[type],
+					       &qid[type]);
+			if (oqi)
+				OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem);
+
+			CDEBUG(D_QUOTA, "%s: setdq to remove for %s %u (%p)\n",
+			       cli->cl_import->imp_obd->obd_name,
+			       type == USRQUOTA ? "user" : "group",
+			       qid[type], oqi);
+		}
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Hash operations for uid/gid <-> osc_quota_info
+ */
+static unsigned
+oqi_hashfn(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+	return cfs_hash_u32_hash(*(const __u32 *)key, mask); /* key: 32-bit id */
+}
+
+static int
+oqi_keycmp(const void *key, struct hlist_node *hnode)
+{
+	const struct osc_quota_info *entry;
+	const obd_uid *wanted = key;
+
+	/* non-zero when the entry on @hnode carries the id at @key */
+	LASSERT(key != NULL);
+	entry = hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+
+	return *wanted == entry->oqi_id;
+}
+
+static void *
+oqi_key(struct hlist_node *hnode)
+{
+	struct osc_quota_info *entry =
+		hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+	return &entry->oqi_id;	/* the id field is the hash key */
+}
+
+static void *
+oqi_object(struct hlist_node *hnode)
+{	/* hash node -> the osc_quota_info that embeds it */
+	return hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+}
+
+static void
+oqi_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{	/* no-op */
+}
+
+static void
+oqi_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{	/* no-op */
+}
+
+static void
+oqi_exit(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct osc_quota_info *oqi;
+
+	oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+	/* give the entry back to the slab osc_oqi_alloc() took it from */
+	OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem);
+}
+
+#define HASH_QUOTA_BKT_BITS 5	/* all three are passed to */
+#define HASH_QUOTA_CUR_BITS 5	/* cfs_hash_create() by */
+#define HASH_QUOTA_MAX_BITS 15	/* osc_quota_setup() */
+
+static cfs_hash_ops_t quota_hash_ops = {
+	.hs_hash	= oqi_hashfn,
+	.hs_keycmp	= oqi_keycmp,
+	.hs_key		= oqi_key,
+	.hs_object	= oqi_object,
+	.hs_get		= oqi_get,
+	.hs_put_locked	= oqi_put_locked,
+	.hs_exit	= oqi_exit,
+};
+
+int osc_quota_setup(struct obd_device *obd)
+{
+	struct client_obd *cli = &obd->u.cli;
+	int i, type;
+	ENTRY;
+
+	for (type = 0; type < MAXQUOTAS; type++) {
+		cli->cl_quota_hash[type] = cfs_hash_create("QUOTA_HASH",
+							   HASH_QUOTA_CUR_BITS,
+							   HASH_QUOTA_MAX_BITS,
+							   HASH_QUOTA_BKT_BITS,
+							   0,
+							   CFS_HASH_MIN_THETA,
+							   CFS_HASH_MAX_THETA,
+							   &quota_hash_ops,
+							   CFS_HASH_DEFAULT);
+		if (cli->cl_quota_hash[type] == NULL)
+			break;
+	}
+	/* all MAXQUOTAS tables were created */
+	if (type == MAXQUOTAS)
+		RETURN(0);
+	/* creation failed at index @type: release the tables made so far */
+	for (i = 0; i < type; i++)
+		cfs_hash_putref(cli->cl_quota_hash[i]);
+	/* NOTE(review): cl_quota_hash[] still holds the released pointers */
+	RETURN(-ENOMEM);
+}
+
+int osc_quota_cleanup(struct obd_device *obd)
+{
+	struct client_obd     *cli = &obd->u.cli;
+	int type;
+	ENTRY;
+	/* NOTE(review): assumes osc_quota_setup() created every table */
+	for (type = 0; type < MAXQUOTAS; type++)
+		cfs_hash_putref(cli->cl_quota_hash[type]);
+
+	RETURN(0);
+}
+
+int osc_quotactl(struct obd_device *unused, struct obd_export *exp,
+		 struct obd_quotactl *oqctl)
+{
+	struct ptlrpc_request *req;
+	struct obd_quotactl   *oqc;
+	int		    rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+					&RQF_OST_QUOTACTL, LUSTRE_OST_VERSION,
+					OST_QUOTACTL);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+	/* NOTE(review): oqc is dereferenced below without a NULL check */
+	oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+	*oqc = *oqctl;
+
+	ptlrpc_request_set_replen(req);
+	ptlrpc_at_set_req_timeout(req);
+	req->rq_no_resend = 1;
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc);
+	/* a reply is copied back even if rc != 0; -EPROTO only if rc was 0 */
+	if (req->rq_repmsg &&
+	    (oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL))) {
+		*oqctl = *oqc;
+	} else if (!rc) {
+		CERROR ("Can't unpack obd_quotactl\n");
+		rc = -EPROTO;
+	}
+	ptlrpc_req_finished(req);
+
+	RETURN(rc);
+}
+
+int osc_quotacheck(struct obd_device *unused, struct obd_export *exp,
+		   struct obd_quotactl *oqctl)
+{
+	struct client_obd       *cli = &exp->exp_obd->u.cli;
+	struct ptlrpc_request   *req;
+	struct obd_quotactl     *body;
+	int		      rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+					&RQF_OST_QUOTACHECK, LUSTRE_OST_VERSION,
+					OST_QUOTACHECK);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+	/* the request body is a copy of @oqctl */
+	body = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+	*body = *oqctl;
+
+	ptlrpc_request_set_replen(req);
+
+	/* the next poll will find -ENODATA, that means quotacheck is
+	 * going on */
+	cli->cl_qchk_stat = -ENODATA;
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		cli->cl_qchk_stat = rc;	/* osc_quota_poll_check() reads it */
+	ptlrpc_req_finished(req);
+	RETURN(rc);
+}
+
+int osc_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk)
+{
+	struct client_obd *cli = &exp->exp_obd->u.cli;
+	int rc;
+	ENTRY;
+
+	qchk->obd_uuid = cli->cl_target_uuid;
+	memcpy(qchk->obd_type, LUSTRE_OST_NAME, strlen(LUSTRE_OST_NAME));
+	/* NOTE(review): no NUL is copied; assumes obd_type was pre-zeroed */
+	rc = cli->cl_qchk_stat;
+	/* the client is not the previous one */
+	if (rc == CL_NOT_QUOTACHECKED)
+		rc = -EINTR;
+	RETURN(rc);
+}
diff --git a/drivers/staging/lustre/lustre/osc/osc_request.c b/drivers/staging/lustre/lustre/osc/osc_request.c
new file mode 100644
index 000000000000..3062e47de8f9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_request.c
@@ -0,0 +1,3668 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include <linux/libcfs/libcfs.h>
+
+
+#include <lustre_dlm.h>
+#include <lustre_net.h>
+#include <lustre/lustre_user.h>
+#include <obd_cksum.h>
+#include <obd_ost.h>
+#include <obd_lov.h>
+
+#ifdef  __CYGWIN__
+# include <ctype.h>
+#endif
+
+#include <lustre_ha.h>
+#include <lprocfs_status.h>
+#include <lustre_log.h>
+#include <lustre_debug.h>
+#include <lustre_param.h>
+#include <lustre_fid.h>
+#include "osc_internal.h"
+#include "osc_cl_internal.h"
+
+static void osc_release_ppga(struct brw_page **ppga, obd_count count);
+static int brw_interpret(const struct lu_env *env,
+			 struct ptlrpc_request *req, void *data, int rc);
+int osc_cleanup(struct obd_device *obd);
+
+/* Pack OSC object metadata for disk storage (LE byte order).
+ * NULL @lmmp: size query.  *@lmmp set, NULL @lsm: free the buffer, return 0.
+ * Else: -EBADF for a zero object id; allocate *@lmmp if unset; pack lsm_oi. */
+static int osc_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
+		      struct lov_stripe_md *lsm)
+{
+	int lmm_size = sizeof(**lmmp);
+	ENTRY;
+
+	if (!lmmp)
+		RETURN(lmm_size);
+
+	if (*lmmp && !lsm) {
+		OBD_FREE(*lmmp, lmm_size);
+		*lmmp = NULL;
+		RETURN(0);
+	}
+	if (unlikely(lsm && ostid_id(&lsm->lsm_oi) == 0))
+		RETURN(-EBADF);
+
+	if (!*lmmp) {
+		OBD_ALLOC(*lmmp, lmm_size);
+		if (!*lmmp)
+			RETURN(-ENOMEM);
+	}
+	if (lsm)
+		ostid_cpu_to_le(&lsm->lsm_oi, &(*lmmp)->lmm_oi);
+
+	RETURN(lmm_size);
+}
+
+/* Unpack OSC object metadata from disk storage (LE byte order).
+ * Returns the lsm size, 0 after freeing *@lsmp (NULL @lmm), or -errno. */
+static int osc_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
+			struct lov_mds_md *lmm, int lmm_bytes)
+{
+	int lsm_size;
+	struct obd_import *imp = class_exp2cliimp(exp);
+	ENTRY;
+
+	if (lmm != NULL) {	/* int compare: negative sizes fail too */
+		if (lmm_bytes < (int)sizeof(*lmm)) {
+			CERROR("%s: lov_mds_md too small: %d, need %d\n",
+			       exp->exp_obd->obd_name, lmm_bytes,
+			       (int)sizeof(*lmm));
+			RETURN(-EINVAL);
+		}
+		/* XXX LOV_MAGIC etc check? */
+		if (unlikely(ostid_id(&lmm->lmm_oi) == 0)) {
+			CERROR("%s: zero lmm_object_id: rc = %d\n",
+			       exp->exp_obd->obd_name, -EINVAL);
+			RETURN(-EINVAL);
+		}
+	}
+
+	lsm_size = lov_stripe_md_size(1);
+	if (lsmp == NULL)
+		RETURN(lsm_size);
+
+	if (*lsmp != NULL && lmm == NULL) {
+		OBD_FREE((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo));
+		OBD_FREE(*lsmp, lsm_size);
+		*lsmp = NULL;
+		RETURN(0);
+	}
+
+	if (*lsmp == NULL) {
+		OBD_ALLOC(*lsmp, lsm_size);
+		if (unlikely(*lsmp == NULL))
+			RETURN(-ENOMEM);
+		OBD_ALLOC((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo));
+		if (unlikely((*lsmp)->lsm_oinfo[0] == NULL)) {
+			OBD_FREE(*lsmp, lsm_size);
+			*lsmp = NULL;	/* don't return a freed pointer */
+			RETURN(-ENOMEM);
+		}
+		loi_init((*lsmp)->lsm_oinfo[0]);
+	} else if (unlikely(ostid_id(&(*lsmp)->lsm_oi) == 0)) {
+		RETURN(-EBADF);
+	}
+	if (lmm != NULL)
+		/* XXX zero *lsmp? */
+		ostid_le_to_cpu(&lmm->lmm_oi, &(*lsmp)->lsm_oi);
+
+	if (imp != NULL &&
+	    (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES))
+		(*lsmp)->lsm_maxbytes = imp->imp_connect_data.ocd_maxbytes;
+	else
+		(*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES;
+
+	RETURN(lsm_size);
+}
+
+/* Pack @capa, if given, into the RMF_CAPA1 field and flag it in @body. */
+static inline void osc_pack_capa(struct ptlrpc_request *req,
+				 struct ost_body *body, void *capa)
+{
+	struct lustre_capa *c;
+
+	if (capa == NULL)
+		return;
+
+	c = req_capsule_client_get(&req->rq_pill, &RMF_CAPA1);
+	LASSERT(c);
+	capa_cpy(c, (struct obd_capa *)capa);
+	body->oa.o_valid |= OBD_MD_FLOSSCAPA;
+	DEBUG_CAPA(D_SEC, c, "pack");
+}
+
+/* Fill the request's ost_body from @oinfo: wire obdo plus optional capa. */
+static inline void osc_pack_req_body(struct ptlrpc_request *req,
+				     struct obd_info *oinfo)
+{
+	struct ost_body *body = req_capsule_client_get(&req->rq_pill,
+						       &RMF_OST_BODY);
+
+	LASSERT(body);
+	lustre_set_wire_obdo(&body->oa, oinfo->oi_oa);
+	osc_pack_capa(req, body, oinfo->oi_capa);
+}
+
+/* Shrink @field to zero bytes in the client buffer when there is no capa. */
+static inline void osc_set_capa_size(struct ptlrpc_request *req,
+				     const struct req_msg_field *field,
+				     struct obd_capa *oc)
+{
+	/* with a capa the size is already calculated as sizeof struct
+	 * obd_capa, so only the "no capa" case needs adjusting */
+	if (!oc)
+		req_capsule_set_size(&req->rq_pill, field, RCL_CLIENT, 0);
+}
+
+/* GETATTR reply handler: unpack the obdo (or -EPROTO), then call oi_cb_up. */
+static int osc_getattr_interpret(const struct lu_env *env,
+				 struct ptlrpc_request *req,
+				 struct osc_async_args *aa, int rc)
+{
+	struct obd_info *oinfo = aa->aa_oi;
+	struct ost_body *body;
+	ENTRY;
+
+	if (rc != 0)
+		GOTO(out, rc);
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+	if (body == NULL) {
+		CDEBUG(D_INFO, "can't unpack ost_body\n");
+		rc = -EPROTO;
+		oinfo->oi_oa->o_valid = 0;
+	} else {
+		CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode);
+		lustre_get_wire_obdo(oinfo->oi_oa, &body->oa);
+		/* This should really be sent by the OST */
+		oinfo->oi_oa->o_blksize = DT_MAX_BRW_SIZE;
+		oinfo->oi_oa->o_valid |= OBD_MD_FLBLKSZ;
+	}
+out:
+	RETURN(oinfo->oi_cb_up(oinfo, rc));
+}
+
+/* Add an async OST_GETATTR for @oinfo to @set; returns 0 or -errno. */
+static int osc_getattr_async(struct obd_export *exp, struct obd_info *oinfo,
+			     struct ptlrpc_request_set *set)
+{
+	struct osc_async_args *args;
+	struct ptlrpc_request *req;
+	int rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR);
+	if (!req)
+		RETURN(-ENOMEM);
+
+	osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR);
+	if (rc != 0) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+	osc_pack_req_body(req, oinfo);
+	ptlrpc_request_set_replen(req);
+
+	/* stash @oinfo in the request for the interpret callback */
+	CLASSERT(sizeof(*args) <= sizeof(req->rq_async_args));
+	args = ptlrpc_req_async_args(req);
+	args->aa_oi = oinfo;
+	req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_getattr_interpret;
+
+	ptlrpc_set_add_req(set, req);
+	RETURN(0);
+}
+
+/* Synchronous OST_GETATTR: send the request, wait, and copy the returned
+ * attributes into oinfo->oi_oa.  Returns 0, -EPROTO or another -errno. */
+static int osc_getattr(const struct lu_env *env, struct obd_export *exp,
+		       struct obd_info *oinfo)
+{
+	struct ost_body *body;
+	struct ptlrpc_request *req;
+	int rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR);
+	if (!req)
+		RETURN(-ENOMEM);
+
+	osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR);
+	if (rc != 0) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+	osc_pack_req_body(req, oinfo);
+	ptlrpc_request_set_replen(req);
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc != 0)
+		GOTO(out, rc);
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+	if (!body)
+		GOTO(out, rc = -EPROTO);
+
+	CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode);
+	lustre_get_wire_obdo(oinfo->oi_oa, &body->oa);
+	/* o_blksize comes from cli_brw_size(), not from the reply */
+	oinfo->oi_oa->o_blksize = cli_brw_size(exp->exp_obd);
+	oinfo->oi_oa->o_valid |= OBD_MD_FLBLKSZ;
+
+	EXIT;
+out:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/* Synchronous OST_SETATTR: send oinfo->oi_oa (which must have OBD_MD_FLGROUP
+ * set in o_valid) and copy the attributes from the reply back into it. */
+static int osc_setattr(const struct lu_env *env, struct obd_export *exp,
+		       struct obd_info *oinfo, struct obd_trans_info *oti)
+{
+	struct ptlrpc_request *req;
+	struct ost_body       *body;
+	int		    rc;
+	ENTRY;
+
+	LASSERT(oinfo->oi_oa->o_valid & OBD_MD_FLGROUP);
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	osc_pack_req_body(req, oinfo);
+
+	ptlrpc_request_set_replen(req);
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		GOTO(out, rc);
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+	if (body == NULL)
+		GOTO(out, rc = -EPROTO);
+
+	lustre_get_wire_obdo(oinfo->oi_oa, &body->oa);
+out:
+	ptlrpc_req_finished(req);
+	RETURN(rc);
+}
+
+/* Setattr-style reply handler: unpack the obdo, then run sa_upcall. */
+static int osc_setattr_interpret(const struct lu_env *env,
+				 struct ptlrpc_request *req,
+				 struct osc_setattr_args *sa, int rc)
+{
+	struct ost_body *reply;
+	ENTRY;
+
+	if (rc)
+		GOTO(out, rc);
+
+	reply = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+	if (!reply)
+		GOTO(out, rc = -EPROTO);
+
+	lustre_get_wire_obdo(sa->sa_oa, &reply->oa);
+out:
+	RETURN(sa->sa_upcall(sa->sa_cookie, rc));
+}
+
+/* Send OST_SETATTR asynchronously.  NULL @rqset: fire and forget via ptlrpcd.
+ * Otherwise @upcall(@cookie, rc) runs on reply, via ptlrpcd or @rqset. */
+int osc_setattr_async_base(struct obd_export *exp, struct obd_info *oinfo,
+			   struct obd_trans_info *oti,
+			   obd_enqueue_update_f upcall, void *cookie,
+			   struct ptlrpc_request_set *rqset)
+{
+	struct osc_setattr_args *sa;
+	struct ptlrpc_request *req;
+	int rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR);
+	if (!req)
+		RETURN(-ENOMEM);
+
+	osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR);
+	if (rc != 0) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	if (oti != NULL && (oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE))
+		oinfo->oi_oa->o_lcookie = *oti->oti_logcookies;
+
+	osc_pack_req_body(req, oinfo);
+	ptlrpc_request_set_replen(req);
+
+	/* do mds to ost setattr asynchronously */
+	if (rqset == NULL) {
+		/* Do not wait for response. */
+		ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+		RETURN(0);
+	}
+
+	CLASSERT(sizeof(*sa) <= sizeof(req->rq_async_args));
+	sa = ptlrpc_req_async_args(req);
+	sa->sa_oa = oinfo->oi_oa;
+	sa->sa_upcall = upcall;
+	sa->sa_cookie = cookie;
+	req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_setattr_interpret;
+
+	if (rqset == PTLRPCD_SET)
+		ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+	else
+		ptlrpc_set_add_req(rqset, req);
+
+	RETURN(0);
+}
+
+static int osc_setattr_async(struct obd_export *exp, struct obd_info *oinfo,
+			     struct obd_trans_info *oti,
+			     struct ptlrpc_request_set *rqset)
+{	/* async setattr whose upcall is oinfo->oi_cb_up with oinfo as cookie */
+	return osc_setattr_async_base(exp, oinfo, oti,
+				      oinfo->oi_cb_up, oinfo, rqset);
+}
+
+/* Create an object on the OST: send OST_CREATE with @oa, wait for the reply
+ * and store the new object id in *@ea (allocated here if it was NULL). */
+int osc_real_create(struct obd_export *exp, struct obdo *oa,
+		    struct lov_stripe_md **ea, struct obd_trans_info *oti)
+{
+	struct lov_stripe_md *lsm;
+	struct ptlrpc_request *req;
+	struct ost_body *body;
+	int rc;
+	ENTRY;
+
+	LASSERT(oa);
+	LASSERT(ea);
+
+	lsm = *ea;
+	if (lsm == NULL) {
+		rc = obd_alloc_memmd(exp, &lsm);
+		if (rc < 0)
+			RETURN(rc);
+	}
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_CREATE);
+	if (!req)
+		GOTO(out, rc = -ENOMEM);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE);
+	if (rc != 0) {
+		ptlrpc_request_free(req);
+		GOTO(out, rc);
+	}
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+	LASSERT(body);
+	lustre_set_wire_obdo(&body->oa, oa);
+	ptlrpc_request_set_replen(req);
+
+	if ((oa->o_valid & OBD_MD_FLFLAGS) &&
+	    oa->o_flags == OBD_FL_DELORPHAN) {
+		DEBUG_REQ(D_HA, req, "delorphan from OST integration");
+		/* Don't resend the delorphan req */
+		req->rq_no_resend = req->rq_no_delay = 1;
+	}
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc != 0)
+		GOTO(out_req, rc);
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+	if (!body)
+		GOTO(out_req, rc = -EPROTO);
+
+	lustre_get_wire_obdo(oa, &body->oa);
+
+	oa->o_blksize = cli_brw_size(exp->exp_obd);
+	oa->o_valid |= OBD_MD_FLBLKSZ;
+
+	/* XXX LOV STACKING: the lsm that is passed to us from LOV does not
+	 * have valid lsm_oinfo data structs, so don't go touching that.
+	 * This needs to be fixed in a big way.
+	 */
+	lsm->lsm_oi = oa->o_oi;
+	*ea = lsm;
+
+	if (oti != NULL) {
+		oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg);
+
+		if (oa->o_valid & OBD_MD_FLCOOKIE) {
+			if (!oti->oti_logcookies)
+				oti_alloc_cookies(oti, 1);
+			*oti->oti_logcookies = oa->o_lcookie;
+		}
+	}
+
+	CDEBUG(D_HA, "transno: "LPD64"\n",
+	       lustre_msg_get_transno(req->rq_repmsg));
+out_req:
+	ptlrpc_req_finished(req);
+out:
+	if (rc && !*ea)
+		obd_free_memmd(exp, &lsm);
+	RETURN(rc);
+}
+
+/* Send OST_PUNCH for @oinfo asynchronously via ptlrpcd (@rqset ==
+ * PTLRPCD_SET) or @rqset; osc_setattr_interpret() later runs
+ * @upcall(@cookie, rc).  Returns 0 once queued, or -errno. */
+int osc_punch_base(struct obd_export *exp, struct obd_info *oinfo,
+		   obd_enqueue_update_f upcall, void *cookie,
+		   struct ptlrpc_request_set *rqset)
+{
+	struct osc_setattr_args *sa;
+	struct ptlrpc_request *req;
+	int rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_PUNCH);
+	if (!req)
+		RETURN(-ENOMEM);
+
+	osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH);
+	if (rc != 0) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+	req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
+	ptlrpc_at_set_req_timeout(req);
+
+	osc_pack_req_body(req, oinfo);
+	ptlrpc_request_set_replen(req);
+
+	/* hand the obdo and the upcall to the reply handler */
+	CLASSERT(sizeof(*sa) <= sizeof(req->rq_async_args));
+	sa = ptlrpc_req_async_args(req);
+	sa->sa_oa = oinfo->oi_oa;
+	sa->sa_upcall = upcall;
+	sa->sa_cookie = cookie;
+	req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_setattr_interpret;
+
+	if (rqset == PTLRPCD_SET)
+		ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+	else
+		ptlrpc_set_add_req(rqset, req);
+
+	RETURN(0);
+}
+
+/* Punch: the extent start/end travel in o_size/o_blocks of oinfo->oi_oa. */
+static int osc_punch(const struct lu_env *env, struct obd_export *exp,
+		     struct obd_info *oinfo, struct obd_trans_info *oti,
+		     struct ptlrpc_request_set *rqset)
+{
+	oinfo->oi_oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+	oinfo->oi_oa->o_size = oinfo->oi_policy.l_extent.start;
+	oinfo->oi_oa->o_blocks = oinfo->oi_policy.l_extent.end;
+	return osc_punch_base(exp, oinfo, oinfo->oi_cb_up, oinfo, rqset);
+}
+
+/* SYNC reply handler: copy the reply obdo to the caller, run fa_upcall. */
+static int osc_sync_interpret(const struct lu_env *env,
+			      struct ptlrpc_request *req,
+			      void *arg, int rc)
+{
+	struct osc_fsync_args *fa = arg;
+	struct ost_body *reply;
+	ENTRY;
+
+	if (rc != 0)
+		GOTO(out, rc);
+
+	reply = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+	if (!reply) {
+		CERROR("can't unpack ost_body\n");
+		GOTO(out, rc = -EPROTO);
+	}
+
+	*fa->fa_oi->oi_oa = reply->oa;
+out:
+	RETURN(fa->fa_upcall(fa->fa_cookie, rc));
+}
+
+/* Send OST_SYNC for @oinfo asynchronously via ptlrpcd (@rqset ==
+ * PTLRPCD_SET) or @rqset; osc_sync_interpret() later runs
+ * @upcall(@cookie, rc).  Returns 0 once queued, or -errno. */
+int osc_sync_base(struct obd_export *exp, struct obd_info *oinfo,
+		  obd_enqueue_update_f upcall, void *cookie,
+		  struct ptlrpc_request_set *rqset)
+{
+	struct osc_fsync_args *fa;
+	struct ptlrpc_request *req;
+	int rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SYNC);
+	if (!req)
+		RETURN(-ENOMEM);
+
+	osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SYNC);
+	if (rc != 0) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	/* the size and blocks fields in the oa are overloaded with
+	 * start/end */
+	osc_pack_req_body(req, oinfo);
+	ptlrpc_request_set_replen(req);
+
+	/* hand @oinfo and the upcall to the reply handler */
+	CLASSERT(sizeof(*fa) <= sizeof(req->rq_async_args));
+	fa = ptlrpc_req_async_args(req);
+	fa->fa_oi = oinfo;
+	fa->fa_upcall = upcall;
+	fa->fa_cookie = cookie;
+	req->rq_interpret_reply = osc_sync_interpret;
+
+	if (rqset == PTLRPCD_SET)
+		ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+	else
+		ptlrpc_set_add_req(rqset, req);
+
+	RETURN(0);
+}
+
+/* Sync request: @start/@end travel in o_size/o_blocks of oinfo->oi_oa. */
+static int osc_sync(const struct lu_env *env, struct obd_export *exp,
+		    struct obd_info *oinfo, obd_size start, obd_size end,
+		    struct ptlrpc_request_set *set)
+{
+	struct obdo *oa = oinfo->oi_oa;
+	ENTRY;
+
+	if (oa == NULL) {
+		CDEBUG(D_INFO, "oa NULL\n");
+		RETURN(-EINVAL);
+	}
+	oa->o_size = start;
+	oa->o_blocks = end;
+	oa->o_valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS);
+	RETURN(osc_sync_base(exp, oinfo, oinfo->oi_cb_up, oinfo, set));
+}
+
+/* Cancel locally the locks matching @mode on the resource named by @oa's
+ * object id, collecting them on the @cancels list.  Returns the number of
+ * locks added to @cancels. */
+static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa,
+				   struct list_head *cancels,
+				   ldlm_mode_t mode, int lock_flags)
+{
+	struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
+	struct ldlm_resource *res;
+	struct ldlm_res_id res_id;
+	int nr_cancelled;
+	ENTRY;
+
+	/* Return, i.e. cancel nothing, only if ELC is supported (flag in
+	 * export) but disabled through procfs (flag in NS).
+	 *
+	 * This distinguishes from a case when ELC is not supported originally,
+	 * when we still want to cancel locks in advance and just cancel them
+	 * locally, without sending any RPC. */
+	if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns))
+		RETURN(0);
+
+	ostid_build_res_name(&oa->o_oi, &res_id);
+	res = ldlm_resource_get(ns, NULL, &res_id, 0, 0);
+	if (!res)
+		RETURN(0);
+
+	LDLM_RESOURCE_ADDREF(res);
+	nr_cancelled = ldlm_cancel_resource_local(res, cancels, NULL, mode,
+						  lock_flags, 0, NULL);
+	LDLM_RESOURCE_DELREF(res);
+	ldlm_resource_putref(res);
+	RETURN(nr_cancelled);
+}
+
+/* DESTROY reply handler: drop the in-flight count and wake the waitq. */
+static int osc_destroy_interpret(const struct lu_env *env,
+				 struct ptlrpc_request *req, void *data, int rc)
+{
+	struct obd_device *obd = req->rq_import->imp_obd;
+
+	atomic_dec(&obd->u.cli.cl_destroy_in_flight);
+	wake_up(&obd->u.cli.cl_destroy_waitq);
+	return 0;
+}
+
+/* Try to account for one more destroy RPC.  Returns 1 with
+ * cl_destroy_in_flight raised when the limit allows it, otherwise undoes
+ * the increment and returns 0. */
+static int osc_can_send_destroy(struct client_obd *cli)
+{
+	if (atomic_inc_return(&cli->cl_destroy_in_flight) <=
+	    cli->cl_max_rpcs_in_flight)
+		return 1;	/* The destroy request can be sent */
+
+	/* Undo the increment.  If the counter then reads below the limit it
+	 * has been modified between the two atomic operations. */
+	if (atomic_dec_return(&cli->cl_destroy_in_flight) <
+	    cli->cl_max_rpcs_in_flight)
+		wake_up(&cli->cl_destroy_waitq);
+
+	return 0;
+}
+
+/* Create entry point.  Explicit object re-creation and objects whose
+ * sequence is not an MDT one are created through osc_real_create();
+ * any other request is a bug. */
+int osc_create(const struct lu_env *env, struct obd_export *exp,
+	       struct obdo *oa, struct lov_stripe_md **ea,
+	       struct obd_trans_info *oti)
+{
+	ENTRY;
+
+	LASSERT(oa);
+	LASSERT(ea);
+	LASSERT(oa->o_valid & OBD_MD_FLGROUP);
+
+	/* both cases take the same osc_real_create() path */
+	if (((oa->o_valid & OBD_MD_FLFLAGS) &&
+	     oa->o_flags == OBD_FL_RECREATE_OBJS) ||
+	    !fid_seq_is_mdt(ostid_seq(&oa->o_oi)))
+		RETURN(osc_real_create(exp, oa, ea, oti));
+
+	/* we should not get here anymore */
+	LBUG();
+
+	RETURN(0);
+}
+
+/* Destroy requests can always be async on the client, and we don't even
+ * really care about the return code since the client cannot do anything at
+ * all about a destroy failure.
+ * When the MDS is unlinking a filename, it saves the file objects into a
+ * recovery llog, and these object records are cancelled when the OST reports
+ * they were destroyed and sync'd to disk (i.e. transaction committed).
+ * If the client dies, or the OST is down when the object should be destroyed,
+ * the records are not cancelled, and when the OST reconnects to the MDS next,
+ * it will retrieve the llog unlink logs and then send the log cancellation
+ * cookies to the MDS after committing destroy transactions. */
+static int osc_destroy(const struct lu_env *env, struct obd_export *exp,
+		       struct obdo *oa, struct lov_stripe_md *ea,
+		       struct obd_trans_info *oti, struct obd_export *md_export,
+		       void *capa)
+{
+	struct client_obd     *cli = &exp->exp_obd->u.cli;
+	struct ptlrpc_request *req;
+	struct ost_body       *body;
+	LIST_HEAD(cancels);
+	int rc, count;
+	ENTRY;
+
+	if (!oa) {
+		CDEBUG(D_INFO, "oa NULL\n");
+		RETURN(-EINVAL);
+	}
+
+	count = osc_resource_get_unused(exp, oa, &cancels, LCK_PW,
+					LDLM_FL_DISCARD_DATA);
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_DESTROY);
+	if (req == NULL) {
+		ldlm_lock_list_put(&cancels, l_bl_ast, count);
+		RETURN(-ENOMEM);
+	}
+
+	osc_set_capa_size(req, &RMF_CAPA1, (struct obd_capa *)capa);
+	rc = ldlm_prep_elc_req(exp, req, LUSTRE_OST_VERSION, OST_DESTROY,
+			       0, &cancels, count);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
+	ptlrpc_at_set_req_timeout(req);
+
+	if (oti != NULL && oa->o_valid & OBD_MD_FLCOOKIE)
+		oa->o_lcookie = *oti->oti_logcookies;
+	body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+	LASSERT(body);
+	lustre_set_wire_obdo(&body->oa, oa);
+
+	osc_pack_capa(req, body, (struct obd_capa *)capa);
+	ptlrpc_request_set_replen(req);
+
+	/* If osc_destroy is for destroying the unlink orphan,
+	 * sent from MDT to OST, it should not be blocked here,
+	 * because the process might be triggered by ptlrpcd, and
+	 * it is not good to block the ptlrpcd thread (b=16006) */
+	if (!(oa->o_flags & OBD_FL_DELORPHAN)) {
+		req->rq_interpret_reply = osc_destroy_interpret;
+		if (!osc_can_send_destroy(cli)) {
+			struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP,
+							  NULL);
+
+			/*
+			 * Wait until the number of on-going destroy RPCs drops
+			 * under max_rpc_in_flight
+			 */
+			l_wait_event_exclusive(cli->cl_destroy_waitq,
+					       osc_can_send_destroy(cli), &lwi);
+		}
+	}
+
+	/* Do not wait for response */
+	ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+	RETURN(0);
+}
+
+/* Report @cli's dirty and grant state in @oa (@writing_bytes is unused). */
+static void osc_announce_cached(struct client_obd *cli, struct obdo *oa,
+				long writing_bytes)
+{
+	obd_flag bits = OBD_MD_FLBLOCKS|OBD_MD_FLGRANT;
+
+	LASSERT(!(oa->o_valid & bits));
+
+	oa->o_valid |= bits;
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	oa->o_dirty = cli->cl_dirty;
+	if (unlikely(cli->cl_dirty - cli->cl_dirty_transit >
+		     cli->cl_dirty_max)) {
+		CERROR("dirty %lu - %lu > dirty_max %lu\n",
+		       cli->cl_dirty, cli->cl_dirty_transit, cli->cl_dirty_max);
+		oa->o_undirty = 0;
+	} else if (unlikely(atomic_read(&obd_unstable_pages) +
+			    atomic_read(&obd_dirty_pages) -
+			    atomic_read(&obd_dirty_transit_pages) >
+			    (long)(obd_max_dirty_pages + 1))) {
+		/* The atomic_read()s above are not covered by a lock, so
+		 * concurrent atomic_inc()s may race with them and trip this
+		 * CERROR() unless we add in a small fudge factor (+1). */
+		CERROR("%s: dirty %d + %d - %d > system dirty_max %d\n",
+		       cli->cl_import->imp_obd->obd_name,
+		       atomic_read(&obd_unstable_pages),
+		       atomic_read(&obd_dirty_pages),
+		       atomic_read(&obd_dirty_transit_pages),
+		       obd_max_dirty_pages);
+		oa->o_undirty = 0;
+	} else if (unlikely(cli->cl_dirty_max - cli->cl_dirty > 0x7fffffff)) {
+		CERROR("dirty %lu - dirty_max %lu too big???\n",
+		       cli->cl_dirty, cli->cl_dirty_max);
+		oa->o_undirty = 0;
+	} else {
+		long max_in_flight = (cli->cl_max_pages_per_rpc <<
+				      PAGE_CACHE_SHIFT)*
+				     (cli->cl_max_rpcs_in_flight + 1);
+		oa->o_undirty = max(cli->cl_dirty_max, max_in_flight);
+	}
+	oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant;
+	oa->o_dropped = cli->cl_lost_grant;
+	cli->cl_lost_grant = 0;
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+	CDEBUG(D_CACHE,"dirty: "LPU64" undirty: %u dropped %u grant: "LPU64"\n",
+	       oa->o_dirty, oa->o_undirty, oa->o_dropped, oa->o_grant);
+}
+
+void osc_update_next_shrink(struct client_obd *cli)
+{	/* presumably "now + interval" via cfs_time_shift() -- TODO confirm */
+	cli->cl_next_shrink_grant =
+		cfs_time_shift(cli->cl_grant_shrink_interval);
+	CDEBUG(D_CACHE, "next time %ld to shrink grant \n",
+	       cli->cl_next_shrink_grant);
+}
+
+static void __osc_update_grant(struct client_obd *cli, obd_size grant)
+{	/* add @grant to cl_avail_grant under cl_loi_list_lock */
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	cli->cl_avail_grant += grant;
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+}
+
+static void osc_update_grant(struct client_obd *cli, struct ost_body *body)
+{	/* o_grant only counts when the reply sets OBD_MD_FLGRANT in o_valid */
+	if (body->oa.o_valid & OBD_MD_FLGRANT) {
+		CDEBUG(D_CACHE, "got "LPU64" extra grant\n", body->oa.o_grant);
+		__osc_update_grant(cli, body->oa.o_grant);
+	}
+}
+
+static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
+			      obd_count keylen, void *key, obd_count vallen,
+			      void *val, struct ptlrpc_request_set *set);
+
+/* Grant-shrink reply handler: give the grant back unless the OST confirmed. */
+static int osc_shrink_grant_interpret(const struct lu_env *env,
+				      struct ptlrpc_request *req,
+				      void *aa, int rc)
+{
+	struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
+	struct obdo *oa = ((struct osc_grant_args *)aa)->aa_oa;
+	struct ost_body *body = NULL;
+
+	if (rc == 0)
+		body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+	if (rc == 0 && body == NULL)
+		rc = -EPROTO;	/* short reply: fail the RPC, don't LBUG */
+	if (rc != 0)
+		__osc_update_grant(cli, oa->o_grant);
+	else
+		osc_update_grant(cli, body);
+	OBDO_FREE(oa);
+	return rc;
+}
+
+/* Move a quarter of the available grant into @oa and mark it as a shrink. */
+static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa)
+{
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	oa->o_grant = cli->cl_avail_grant / 4;
+	cli->cl_avail_grant -= oa->o_grant;
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+	if (!(oa->o_valid & OBD_MD_FLFLAGS))
+		oa->o_flags = 0;
+	oa->o_valid |= OBD_MD_FLFLAGS;
+	oa->o_flags |= OBD_FL_SHRINK_GRANT;
+	osc_update_next_shrink(cli);
+}
+
+/* Shrink the current grant, either from some large amount to enough for a
+ * full set of in-flight RPCs, or if we have already shrunk to that limit
+ * then to enough for a single RPC.  This avoids keeping more grant than
+ * needed and piecemeal shrinking; sizes are 64-bit to avoid int overflow. */
+static int osc_shrink_grant(struct client_obd *cli)
+{
+	__u64 rpc_bytes = (__u64)cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;
+	__u64 target_bytes = (cli->cl_max_rpcs_in_flight + 1) * rpc_bytes;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	if (cli->cl_avail_grant <= target_bytes)
+		target_bytes = rpc_bytes;
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+	return osc_shrink_grant_to_target(cli, target_bytes);
+}
+
+/* Give grant above @target_bytes back to the OST via KEY_GRANT_SHRINK. */
+int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes)
+{
+	int			rc = 0;
+	struct ost_body	*body;
+	ENTRY;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	/* Don't shrink if we are already at or below the desired limit.
+	 * We don't want to shrink below a single RPC, as that will negatively
+	 * impact block allocation and long-term performance. */
+	if (target_bytes < cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT)
+		target_bytes = cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;
+
+	if (target_bytes >= cli->cl_avail_grant) {
+		client_obd_list_unlock(&cli->cl_loi_list_lock);
+		RETURN(0);
+	}
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+	OBD_ALLOC_PTR(body);
+	if (!body)
+		RETURN(-ENOMEM);
+
+	osc_announce_cached(cli, &body->oa, 0);
+	/* NOTE(review): cl_avail_grant may have changed since the check above */
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	body->oa.o_grant = cli->cl_avail_grant - target_bytes;
+	cli->cl_avail_grant = target_bytes;
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+	if (!(body->oa.o_valid & OBD_MD_FLFLAGS)) {
+		body->oa.o_valid |= OBD_MD_FLFLAGS;
+		body->oa.o_flags = 0;
+	}
+	body->oa.o_flags |= OBD_FL_SHRINK_GRANT;
+	osc_update_next_shrink(cli);
+	rc = osc_set_info_async(NULL, cli->cl_import->imp_obd->obd_self_export,
+				sizeof(KEY_GRANT_SHRINK), KEY_GRANT_SHRINK,
+				sizeof(*body), body, NULL);
+	if (rc != 0)
+		__osc_update_grant(cli, body->oa.o_grant);
+	OBD_FREE_PTR(body);
+	RETURN(rc);
+}
+
+/* Return 1 if @client should give grant back now, 0 otherwise. */
+static int osc_should_shrink_grant(struct client_obd *client)
+{
+	cfs_time_t time = cfs_time_current();
+	cfs_time_t next_shrink = client->cl_next_shrink_grant;
+	int brw_size;
+
+	if ((client->cl_import->imp_connect_data.ocd_connect_flags &
+	     OBD_CONNECT_GRANT_SHRINK) == 0)
+		return 0;
+	if (!cfs_time_aftereq(time, next_shrink - 5 * CFS_TICK))
+		return 0;
+
+	/* Get the current RPC size directly, instead of going via:
+	 * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export)
+	 * Keep comment here so that it can be found by searching. */
+	brw_size = client->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;
+	if (client->cl_import->imp_state == LUSTRE_IMP_FULL &&
+	    client->cl_avail_grant > brw_size)
+		return 1;
+	osc_update_next_shrink(client);
+	return 0;
+}
+
+/* Timeout callback: shrink the grant of every listed client that is due. */
+static int osc_grant_shrink_grant_cb(struct timeout_item *item, void *data)
+{
+	struct client_obd *client;
+
+	list_for_each_entry(client, &item->ti_obd_list, cl_grant_shrink_list) {
+		if (osc_should_shrink_grant(client))
+			osc_shrink_grant(client);
+	}
+	return 0;
+}
+
+/* Register @client with the TIMEOUT_GRANT timer and schedule its shrink. */
+static int osc_add_shrink_grant(struct client_obd *client)
+{
+	int rc;
+
+	rc = ptlrpc_add_timeout_client(client->cl_grant_shrink_interval,
+				       TIMEOUT_GRANT, osc_grant_shrink_grant_cb,
+				       NULL, &client->cl_grant_shrink_list);
+	if (rc == 0) {
+		CDEBUG(D_CACHE, "add grant client %s \n",
+		       client->cl_import->imp_obd->obd_name);
+		osc_update_next_shrink(client);
+	} else {
+		CERROR("add grant client %s error %d\n",
+			client->cl_import->imp_obd->obd_name, rc);
+	}
+	return rc;
+}
+
+static int osc_del_shrink_grant(struct client_obd *client)
+{	/* counterpart of osc_add_shrink_grant(): drop the TIMEOUT_GRANT entry */
+	return ptlrpc_del_timeout_client(&client->cl_grant_shrink_list,
+					 TIMEOUT_GRANT);
+}
+
+static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd)
+{
+	/*
+	 * ocd_grant is the total grant amount we're expected to hold: if
+	 * we've been evicted, it's the new avail_grant amount, cl_dirty will
+	 * drop to 0 as inflight RPCs fail out; otherwise, it's
+	 * avail_grant + dirty.  A race is tolerable here: if we're evicted,
+	 * but imp_state already left EVICTED state, cl_dirty must be 0 already.
+	 */
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	if (cli->cl_import->imp_state == LUSTRE_IMP_EVICTED)
+		cli->cl_avail_grant = ocd->ocd_grant;
+	else
+		cli->cl_avail_grant = ocd->ocd_grant - cli->cl_dirty;
+
+	if (cli->cl_avail_grant < 0) {
+		CWARN("%s: available grant < 0: avail/ocd/dirty %ld/%u/%ld\n",
+		      cli->cl_import->imp_obd->obd_name, cli->cl_avail_grant,
+		      ocd->ocd_grant, cli->cl_dirty);
+		/* workaround for servers which do not have the patch from
+		 * LU-2679: fall back to the grant the server reported */
+		cli->cl_avail_grant = ocd->ocd_grant;
+	}
+
+	/* chunk size for osc_extent: never smaller than one page */
+	cli->cl_chunkbits = max_t(int, PAGE_CACHE_SHIFT, ocd->ocd_blocksize);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+	CDEBUG(D_CACHE, "%s, setting cl_avail_grant: %ld cl_lost_grant: %ld."
+		"chunk bits: %d.\n", cli->cl_import->imp_obd->obd_name,
+		cli->cl_avail_grant, cli->cl_lost_grant, cli->cl_chunkbits);
+
+	if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT_SHRINK &&
+	    list_empty(&cli->cl_grant_shrink_list))
+		osc_add_shrink_grant(cli);
+}
+
+/* Zero-fill everything in @pga past the first @nob_read bytes.  We assume
+ * the read was short because it went beyond the end of a stripe file: lustre
+ * is reading a sparse file via the LOV and _knows_ it is inside the file, but
+ * this stripe was never written at or beyond this stripe offset. */
+static void handle_short_read(int nob_read, obd_count page_count,
+			      struct brw_page **pga)
+{
+	char *ptr;
+	int i = 0;
+
+	/* skip the pages that were read completely */
+	while (nob_read > 0) {
+		LASSERT (page_count > 0);
+
+		if (pga[i]->count > nob_read) {
+			/* EOF inside this page: zero only its unread tail */
+			ptr = kmap(pga[i]->pg) +
+				(pga[i]->off & ~CFS_PAGE_MASK);
+			memset(ptr + nob_read, 0, pga[i]->count - nob_read);
+			kunmap(pga[i]->pg);
+			page_count--;
+			i++;
+			break;
+		}
+
+		nob_read -= pga[i]->count;
+		page_count--;
+		i++;
+	}
+
+	/* zero the used part of every remaining page */
+	while (page_count-- > 0) {
+		ptr = kmap(pga[i]->pg) + (pga[i]->off & ~CFS_PAGE_MASK);
+		memset(ptr, 0, pga[i]->count);
+		kunmap(pga[i]->pg);
+		i++;
+	}
+}
+
+/* Validate the reply to a BRW write: the reply must carry one return code
+ * per remote niobuf, every code must be zero, and the bulk transfer must
+ * have moved exactly @requested_nob bytes.
+ *
+ * Returns 0 on success, the first negative per-niobuf code if there is one,
+ * or -EPROTO when the reply is malformed or inconsistent. */
+static int check_write_rcs(struct ptlrpc_request *req,
+			   int requested_nob, int niocount,
+			   obd_count page_count, struct brw_page **pga)
+{
+	__u32 *rcs;
+	int idx;
+
+	rcs = req_capsule_server_sized_get(&req->rq_pill, &RMF_RCS,
+					   niocount * sizeof(*rcs));
+	if (rcs == NULL) {
+		CDEBUG(D_INFO, "Missing/short RC vector on BRW_WRITE reply\n");
+		return -EPROTO;
+	}
+
+	for (idx = 0; idx < niocount; idx++) {
+		/* a negative code is the server's errno for this niobuf */
+		if ((int)rcs[idx] < 0)
+			return rcs[idx];
+
+		/* anything else non-zero is not a valid reply */
+		if (rcs[idx] != 0) {
+			CDEBUG(D_INFO, "rc[%d] invalid (%d) req %p\n",
+				idx, rcs[idx], req);
+			return -EPROTO;
+		}
+	}
+
+	if (req->rq_bulk->bd_nob_transferred == requested_nob)
+		return 0;
+
+	CERROR("Unexpected # bytes transferred: %d (requested %d)\n",
+	       req->rq_bulk->bd_nob_transferred, requested_nob);
+	return -EPROTO;
+}
+
+/* Tell whether @p2 may share a remote niobuf with @p1: both pages must carry
+ * identical flags and @p2 must begin exactly where @p1 ends.  Pages whose
+ * flags differ are never merged; a warning is printed when they differ in
+ * a bit outside the set known to be harmless. */
+static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2)
+{
+	unsigned mask;
+
+	if (p1->flag == p2->flag)
+		return p1->off + p1->count == p2->off;
+
+	mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_NOCACHE | OBD_BRW_SYNC |
+		 OBD_BRW_ASYNC | OBD_BRW_NOQUOTA);
+
+	/* flags outside the mask are not known to be safe to combine */
+	if (unlikely((p1->flag & mask) != (p2->flag & mask)))
+		CWARN("Saw flags 0x%x and 0x%x in the same brw, please "
+		      "report this at http://bugs.whamcloud.com/\n",
+		      p1->flag, p2->flag);
+
+	return 0;
+}
+
+/*
+ * Checksum the first @nob bytes held in the pages of @pga with the hash
+ * algorithm selected by @cksum_type.
+ *
+ * @opc only drives fault injection: for OST_READ the first page may be
+ * corrupted before hashing (OBD_FAIL_OSC_CHECKSUM_RECEIVE), for OST_WRITE
+ * the result may be bumped by one (OBD_FAIL_OSC_CHECKSUM_SEND).
+ *
+ * Returns the 32-bit checksum.  If the hash cannot be set up, the PTR_ERR()
+ * value is returned through the same unsigned return type, as before.  If
+ * finalising the hash fails, the checksum is 0 instead of an indeterminate
+ * stack value.
+ */
+static obd_count osc_checksum_bulk(int nob, obd_count pg_count,
+				   struct brw_page **pga, int opc,
+				   cksum_type_t cksum_type)
+{
+	/* initialised so that a failing cfs_crypto_hash_final() cannot make
+	 * us return (and later send) uninitialised stack contents */
+	__u32				cksum = 0;
+	int				i = 0;
+	struct cfs_crypto_hash_desc	*hdesc;
+	unsigned int			bufsize;
+	int				err;
+	unsigned char			cfs_alg = cksum_obd2cfs(cksum_type);
+
+	LASSERT(pg_count > 0);
+
+	hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0);
+	if (IS_ERR(hdesc)) {
+		CERROR("Unable to initialize checksum hash %s\n",
+		       cfs_crypto_hash_name(cfs_alg));
+		return PTR_ERR(hdesc);
+	}
+
+	while (nob > 0 && pg_count > 0) {
+		/* the last page may only be partly covered by @nob */
+		int count = pga[i]->count > nob ? nob : pga[i]->count;
+
+		/* corrupt the data before we compute the checksum, to
+		 * simulate an OST->client data error */
+		if (i == 0 && opc == OST_READ &&
+		    OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) {
+			unsigned char *ptr = kmap(pga[i]->pg);
+			int off = pga[i]->off & ~CFS_PAGE_MASK;
+			memcpy(ptr + off, "bad1", min(4, nob));
+			kunmap(pga[i]->pg);
+		}
+		cfs_crypto_hash_update_page(hdesc, pga[i]->pg,
+				  pga[i]->off & ~CFS_PAGE_MASK,
+				  count);
+		LL_CDEBUG_PAGE(D_PAGE, pga[i]->pg, "off %d\n",
+			       (int)(pga[i]->off & ~CFS_PAGE_MASK));
+
+		nob -= pga[i]->count;
+		pg_count--;
+		i++;
+	}
+
+	bufsize = sizeof(cksum);
+	err = cfs_crypto_hash_final(hdesc, (unsigned char *)&cksum, &bufsize);
+
+	/* NOTE(review): the second call with NULL arguments presumably only
+	 * releases the descriptor after a failed finalisation - confirm
+	 * against cfs_crypto_hash_final() */
+	if (err)
+		cfs_crypto_hash_final(hdesc, NULL, NULL);
+
+	/* For sending we only compute the wrong checksum instead
+	 * of corrupting the data so it is still correct on a redo */
+	if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND))
+		cksum++;
+
+	return cksum;
+}
+/*
+ * Build, but do not send, a bulk read or write RPC covering @page_count
+ * pages.
+ *
+ * @cmd:        OBD_BRW_WRITE selects OST_WRITE, anything else OST_READ
+ * @cli:        client obd whose import the request is allocated on
+ * @oa:         obdo copied into the request body; for writes its checksum
+ *              fields are updated here; stored in the async args as aa_oa
+ * @lsm:        not referenced by this function
+ * @page_count: number of entries in @pga, asserted to be > 0
+ * @pga:        pages to transfer; asserted to be in strictly increasing
+ *              offset order and to leave no gap inside the page array
+ * @reqp:       receives the prepared request on success
+ * @ocapa:      capability packed into the request, may be NULL
+ * @reserve:    when non-zero and @ocapa is set, a capa_get() reference is
+ *              stored in aa_ocapa
+ * @resend:     when non-zero, OBD_FL_RECOV_RESEND is set in the body
+ *
+ * Returns 0 with *@reqp set, or a negative errno after releasing the
+ * request.
+ */
+static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa,
+				struct lov_stripe_md *lsm, obd_count page_count,
+				struct brw_page **pga,
+				struct ptlrpc_request **reqp,
+				struct obd_capa *ocapa, int reserve,
+				int resend)
+{
+	struct ptlrpc_request   *req;
+	struct ptlrpc_bulk_desc *desc;
+	struct ost_body	 *body;
+	struct obd_ioobj	*ioobj;
+	struct niobuf_remote    *niobuf;
+	int niocount, i, requested_nob, opc, rc;
+	struct osc_brw_async_args *aa;
+	struct req_capsule      *pill;
+	struct brw_page *pg_prev;
+
+	ENTRY;
+	if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ))
+		RETURN(-ENOMEM); /* Recoverable */
+	if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ2))
+		RETURN(-EINVAL); /* Fatal */
+
+	if ((cmd & OBD_BRW_WRITE) != 0) {
+		opc = OST_WRITE;
+		req = ptlrpc_request_alloc_pool(cli->cl_import,
+						cli->cl_import->imp_rq_pool,
+						&RQF_OST_BRW_WRITE);
+	} else {
+		opc = OST_READ;
+		req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_BRW_READ);
+	}
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	for (niocount = i = 1; i < page_count; i++) {
+		if (!can_merge_pages(pga[i - 1], pga[i]))
+			niocount++;	/* this page starts a new niobuf */
+	}
+
+	pill = &req->rq_pill;
+	req_capsule_set_size(pill, &RMF_OBD_IOOBJ, RCL_CLIENT,
+			     sizeof(*ioobj));
+	req_capsule_set_size(pill, &RMF_NIOBUF_REMOTE, RCL_CLIENT,
+			     niocount * sizeof(*niobuf));
+	osc_set_capa_size(req, &RMF_CAPA1, ocapa);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, opc);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+	req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
+	ptlrpc_at_set_req_timeout(req);
+	/* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own
+	 * retry logic */
+	req->rq_no_retry_einprogress = 1;
+
+	desc = ptlrpc_prep_bulk_imp(req, page_count,
+		cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS,
+		opc == OST_WRITE ? BULK_GET_SOURCE : BULK_PUT_SINK,
+		OST_BULK_PORTAL);
+
+	if (desc == NULL)
+		GOTO(out, rc = -ENOMEM);
+	/* NB request now owns desc and will free it when it gets freed */
+
+	body = req_capsule_client_get(pill, &RMF_OST_BODY);
+	ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ);
+	niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE);
+	LASSERT(body != NULL && ioobj != NULL && niobuf != NULL);
+
+	lustre_set_wire_obdo(&body->oa, oa);
+
+	obdo_to_ioobj(oa, ioobj);
+	ioobj->ioo_bufcnt = niocount;
+	/* The high bits of ioo_max_brw tells server _maximum_ number of bulks
+	 * that might be send for this request.  The actual number is decided
+	 * when the RPC is finally sent in ptlrpc_register_bulk(). It sends
+	 * "max - 1" for old client compatibility sending "0", and also so the
+	 * the actual maximum is a power-of-two number, not one less. LU-1431 */
+	ioobj_max_brw_set(ioobj, desc->bd_md_max_brw);
+	osc_pack_capa(req, body, ocapa);
+	LASSERT(page_count > 0);
+	pg_prev = pga[0];
+	for (requested_nob = i = 0; i < page_count; i++, niobuf++) {
+		struct brw_page *pg = pga[i];
+		int poff = pg->off & ~CFS_PAGE_MASK;	/* offset inside page */
+
+		LASSERT(pg->count > 0);
+		/* make sure there is no gap in the middle of page array */
+		LASSERTF(page_count == 1 ||
+			 (ergo(i == 0, poff + pg->count == PAGE_CACHE_SIZE) &&
+			  ergo(i > 0 && i < page_count - 1,
+			       poff == 0 && pg->count == PAGE_CACHE_SIZE)   &&
+			  ergo(i == page_count - 1, poff == 0)),
+			 "i: %d/%d pg: %p off: "LPU64", count: %u\n",
+			 i, page_count, pg, pg->off, pg->count);
+		LASSERTF(i == 0 || pg->off > pg_prev->off,
+			 "i %d p_c %u pg %p [pri %lu ind %lu] off "LPU64
+			 " prev_pg %p [pri %lu ind %lu] off "LPU64"\n",
+			 i, page_count,
+			 pg->pg, page_private(pg->pg), pg->pg->index, pg->off,
+			 pg_prev->pg, page_private(pg_prev->pg),
+			 pg_prev->pg->index, pg_prev->off);
+		LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) ==
+			(pg->flag & OBD_BRW_SRVLOCK));
+
+		ptlrpc_prep_bulk_page_pin(desc, pg->pg, poff, pg->count);
+		requested_nob += pg->count;
+
+		if (i > 0 && can_merge_pages(pg_prev, pg)) {
+			niobuf--;	/* step back: grow the previous niobuf */
+			niobuf->len += pg->count;
+		} else {
+			niobuf->offset = pg->off;
+			niobuf->len    = pg->count;
+			niobuf->flags  = pg->flag;
+		}
+		pg_prev = pg;
+	}
+
+	LASSERTF((void *)(niobuf - niocount) ==
+		req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE),
+		"want %p - real %p\n", req_capsule_client_get(&req->rq_pill,
+		&RMF_NIOBUF_REMOTE), (void *)(niobuf - niocount));
+
+	osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? requested_nob:0);
+	if (resend) {
+		if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
+			body->oa.o_valid |= OBD_MD_FLFLAGS;
+			body->oa.o_flags = 0;
+		}
+		body->oa.o_flags |= OBD_FL_RECOV_RESEND;
+	}
+
+	if (osc_should_shrink_grant(cli))
+		osc_shrink_grant_local(cli, &body->oa);
+
+	/* size[REQ_REC_OFF] still sizeof (*body) */
+	if (opc == OST_WRITE) {
+		if (cli->cl_checksum &&
+		    !sptlrpc_flavor_has_bulk(&req->rq_flvr)) {
+			/* store cl_cksum_type in a local variable since
+			 * it can be changed via lprocfs */
+			cksum_type_t cksum_type = cli->cl_cksum_type;
+
+			if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
+				oa->o_flags &= OBD_FL_LOCAL_MASK;
+				body->oa.o_flags = 0;
+			}
+			body->oa.o_flags |= cksum_type_pack(cksum_type);
+			body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
+			body->oa.o_cksum = osc_checksum_bulk(requested_nob,
+							     page_count, pga,
+							     OST_WRITE,
+							     cksum_type);
+			CDEBUG(D_PAGE, "checksum at write origin: %x\n",
+			       body->oa.o_cksum);
+			/* save this in 'oa', too, for later checking */
+			oa->o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
+			oa->o_flags |= cksum_type_pack(cksum_type);
+		} else {
+			/* clear out the checksum flag, in case this is a
+			 * resend but cl_checksum is no longer set. b=11238 */
+			oa->o_valid &= ~OBD_MD_FLCKSUM;
+		}
+		oa->o_cksum = body->oa.o_cksum;
+		/* 1 RC per niobuf */
+		req_capsule_set_size(pill, &RMF_RCS, RCL_SERVER,
+				     sizeof(__u32) * niocount);
+	} else {
+		if (cli->cl_checksum &&
+		    !sptlrpc_flavor_has_bulk(&req->rq_flvr)) {
+			if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0)
+				body->oa.o_flags = 0;
+			body->oa.o_flags |= cksum_type_pack(cli->cl_cksum_type);
+			body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
+		}
+	}
+	ptlrpc_request_set_replen(req);
+
+	CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+	aa = ptlrpc_req_async_args(req);
+	aa->aa_oa = oa;
+	aa->aa_requested_nob = requested_nob;
+	aa->aa_nio_count = niocount;
+	aa->aa_page_count = page_count;
+	aa->aa_resends = 0;
+	aa->aa_ppga = pga;
+	aa->aa_cli = cli;
+	INIT_LIST_HEAD(&aa->aa_oaps);
+	if (ocapa && reserve)
+		aa->aa_ocapa = capa_get(ocapa);
+
+	*reqp = req;
+	RETURN(0);
+
+ out:
+	ptlrpc_req_finished(req);
+	RETURN(rc);
+}
+
+/* Compare the checksum the client sent with a write against the one the
+ * server reported back.  On a mismatch the pages are checksummed again with
+ * the type found in the reply so the log can say where the data most likely
+ * changed.
+ *
+ * Returns 0 when the checksums agree and 1 when they do not. */
+static int check_write_checksum(struct obdo *oa, const lnet_process_id_t *peer,
+				__u32 client_cksum, __u32 server_cksum, int nob,
+				obd_count page_count, struct brw_page **pga,
+				cksum_type_t client_cksum_type)
+{
+	cksum_type_t reply_type;
+	__u32 recomputed;
+	char *reason;
+
+	if (client_cksum == server_cksum) {
+		CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum);
+		return 0;
+	}
+
+	/* checksum type carried by the reply obdo, if it has flags at all */
+	if (oa->o_valid & OBD_MD_FLFLAGS)
+		reply_type = cksum_type_unpack(oa->o_flags);
+	else
+		reply_type = cksum_type_unpack(0);
+
+	recomputed = osc_checksum_bulk(nob, page_count, pga, OST_WRITE,
+				       reply_type);
+
+	if (reply_type != client_cksum_type) {
+		reason = "the server did not use the checksum type specified in "
+			 "the original request - likely a protocol problem";
+	} else if (recomputed == server_cksum) {
+		reason = "changed on the client after we checksummed it - "
+			 "likely false positive due to mmap IO (bug 11742)";
+	} else if (recomputed == client_cksum) {
+		reason = "changed in transit before arrival at OST";
+	} else {
+		reason = "changed in transit AND doesn't match the original - "
+			 "likely false positive due to mmap IO (bug 11742)";
+	}
+
+	LCONSOLE_ERROR_MSG(0x132, "BAD WRITE CHECKSUM: %s: from %s inode "DFID
+			   " object "DOSTID" extent ["LPU64"-"LPU64"]\n",
+			   reason, libcfs_nid2str(peer->nid),
+			   oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0,
+			   oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0,
+			   oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0,
+			   POSTID(&oa->o_oi), pga[0]->off,
+			   pga[page_count-1]->off + pga[page_count-1]->count - 1);
+	CERROR("original client csum %x (type %x), server csum %x (type %x), "
+	       "client csum now %x\n", client_cksum, client_cksum_type,
+	       server_cksum, reply_type, recomputed);
+	return 1;
+}
+
+/*
+ * Post-process the reply of a bulk read/write request.
+ *
+ * Note rc enters this function as number of bytes transferred (or a
+ * negative errno from the RPC layer).
+ *
+ * For any reply that can be unpacked, quota flags and grant are refreshed
+ * from the reply body.  Writes are then checked for a checksum mismatch and
+ * for per-niobuf return codes; reads are checked for size, zero-filled after
+ * a short read and, if the server sent a checksum, verified against it.
+ *
+ * Returns a negative errno on failure; -EAGAIN asks the caller to retry
+ * (bulk unwrap failure or checksum mismatch).  On a non-negative result the
+ * reply obdo has been copied into aa->aa_oa.
+ */
+static int osc_brw_fini_request(struct ptlrpc_request *req, int rc)
+{
+	struct osc_brw_async_args *aa = (void *)&req->rq_async_args;
+	const lnet_process_id_t *peer =
+			&req->rq_import->imp_connection->c_peer;
+	struct client_obd *cli = aa->aa_cli;
+	struct ost_body *body;
+	__u32 client_cksum = 0;
+	ENTRY;
+
+	/* -EDQUOT still carries a reply whose quota/grant must be applied */
+	if (rc < 0 && rc != -EDQUOT) {
+		DEBUG_REQ(D_INFO, req, "Failed request with rc = %d\n", rc);
+		RETURN(rc);
+	}
+
+	LASSERTF(req->rq_repmsg != NULL, "rc = %d\n", rc);
+	body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+	if (body == NULL) {
+		DEBUG_REQ(D_INFO, req, "Can't unpack body\n");
+		RETURN(-EPROTO);
+	}
+
+	/* set/clear over quota flag for a uid/gid */
+	if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE &&
+	    body->oa.o_valid & (OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA)) {
+		unsigned int qid[MAXQUOTAS] = { body->oa.o_uid, body->oa.o_gid };
+
+		CDEBUG(D_QUOTA, "setdq for [%u %u] with valid "LPX64", flags %x\n",
+		       body->oa.o_uid, body->oa.o_gid, body->oa.o_valid,
+		       body->oa.o_flags);
+		osc_quota_setdq(cli, qid, body->oa.o_valid, body->oa.o_flags);
+	}
+
+	osc_update_grant(cli, body);
+
+	/* only -EDQUOT can still be pending here */
+	if (rc < 0)
+		RETURN(rc);
+
+	if (aa->aa_oa->o_valid & OBD_MD_FLCKSUM)
+		client_cksum = aa->aa_oa->o_cksum; /* save for later */
+
+	if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) {
+		if (rc > 0) {
+			CERROR("Unexpected +ve rc %d\n", rc);
+			RETURN(-EPROTO);
+		}
+		LASSERT(req->rq_bulk->bd_nob == aa->aa_requested_nob);
+
+		if (sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk))
+			RETURN(-EAGAIN);
+
+		if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum &&
+		    check_write_checksum(&body->oa, peer, client_cksum,
+					 body->oa.o_cksum, aa->aa_requested_nob,
+					 aa->aa_page_count, aa->aa_ppga,
+					 cksum_type_unpack(aa->aa_oa->o_flags)))
+			RETURN(-EAGAIN);
+
+		rc = check_write_rcs(req, aa->aa_requested_nob,aa->aa_nio_count,
+				     aa->aa_page_count, aa->aa_ppga);
+		GOTO(out, rc);
+	}
+
+	/* The rest of this function executes only for OST_READs */
+
+	/* if unwrap_bulk failed, return -EAGAIN to retry */
+	rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc);
+	if (rc < 0)
+		GOTO(out, rc = -EAGAIN);
+
+	if (rc > aa->aa_requested_nob) {
+		CERROR("Unexpected rc %d (%d requested)\n", rc,
+		       aa->aa_requested_nob);
+		RETURN(-EPROTO);
+	}
+
+	if (rc != req->rq_bulk->bd_nob_transferred) {
+		CERROR ("Unexpected rc %d (%d transferred)\n",
+			rc, req->rq_bulk->bd_nob_transferred);
+		/* leave through RETURN() like every other exit so the
+		 * ENTRY/RETURN trace nesting stays balanced */
+		RETURN(-EPROTO);
+	}
+
+	if (rc < aa->aa_requested_nob)
+		handle_short_read(rc, aa->aa_page_count, aa->aa_ppga);
+
+	if (body->oa.o_valid & OBD_MD_FLCKSUM) {
+		static int cksum_counter;
+		__u32      server_cksum = body->oa.o_cksum;
+		char      *via;
+		char      *router;
+		cksum_type_t cksum_type;
+
+		cksum_type = cksum_type_unpack(body->oa.o_valid &OBD_MD_FLFLAGS?
+					       body->oa.o_flags : 0);
+		client_cksum = osc_checksum_bulk(rc, aa->aa_page_count,
+						 aa->aa_ppga, OST_READ,
+						 cksum_type);
+
+		/* name the router only when the bulk did not come straight
+		 * from the peer */
+		if (peer->nid == req->rq_bulk->bd_sender) {
+			via = router = "";
+		} else {
+			via = " via ";
+			router = libcfs_nid2str(req->rq_bulk->bd_sender);
+		}
+
+		if (server_cksum == ~0 && rc > 0) {
+			CERROR("Protocol error: server %s set the 'checksum' "
+			       "bit, but didn't send a checksum.  Not fatal, "
+			       "but please notify on http://bugs.whamcloud.com/\n",
+			       libcfs_nid2str(peer->nid));
+		} else if (server_cksum != client_cksum) {
+			LCONSOLE_ERROR_MSG(0x133, "%s: BAD READ CHECKSUM: from "
+					   "%s%s%s inode "DFID" object "DOSTID
+					   " extent ["LPU64"-"LPU64"]\n",
+					   req->rq_import->imp_obd->obd_name,
+					   libcfs_nid2str(peer->nid),
+					   via, router,
+					   body->oa.o_valid & OBD_MD_FLFID ?
+						body->oa.o_parent_seq : (__u64)0,
+					   body->oa.o_valid & OBD_MD_FLFID ?
+						body->oa.o_parent_oid : 0,
+					   body->oa.o_valid & OBD_MD_FLFID ?
+						body->oa.o_parent_ver : 0,
+					   POSTID(&body->oa.o_oi),
+					   aa->aa_ppga[0]->off,
+					   aa->aa_ppga[aa->aa_page_count-1]->off +
+					   aa->aa_ppga[aa->aa_page_count-1]->count -
+									1);
+			CERROR("client %x, server %x, cksum_type %x\n",
+			       client_cksum, server_cksum, cksum_type);
+			cksum_counter = 0;
+			aa->aa_oa->o_cksum = client_cksum;
+			rc = -EAGAIN;
+		} else {
+			cksum_counter++;
+			CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum);
+			rc = 0;
+		}
+	} else if (unlikely(client_cksum)) {
+		static int cksum_missed;
+
+		/* log only when the miss count is a power of two
+		 * NOTE(review): rc keeps the positive byte count on this
+		 * path, unlike the sibling branches which set it to 0 -
+		 * confirm that callers expect this */
+		cksum_missed++;
+		if ((cksum_missed & (-cksum_missed)) == cksum_missed)
+			CERROR("Checksum %u requested from %s but not sent\n",
+			       cksum_missed, libcfs_nid2str(peer->nid));
+	} else {
+		rc = 0;
+	}
+out:
+	if (rc >= 0)
+		lustre_get_wire_obdo(aa->aa_oa, &body->oa);
+
+	RETURN(rc);
+}
+
+/*
+ * Synchronously transfer @page_count pages of @pga in a single bulk RPC,
+ * rebuilding and resending the request while the error is recoverable.
+ *
+ * A resend is abandoned when the import generation has changed since the
+ * first attempt, or when client_should_resend() refuses another try (the
+ * latter is skipped for -EINPROGRESS, which is always retried).  Before
+ * each retry the caller sleeps for one second per resend so far.
+ *
+ * Returns 0 on success or a negative errno; -EAGAIN and -EINPROGRESS are
+ * reported to the caller as -EIO.
+ */
+static int osc_brw_internal(int cmd, struct obd_export *exp, struct obdo *oa,
+			    struct lov_stripe_md *lsm,
+			    obd_count page_count, struct brw_page **pga,
+			    struct obd_capa *ocapa)
+{
+	struct ptlrpc_request *req;
+	int		    rc;
+	wait_queue_head_t	    waitq;
+	int		    generation, resends = 0;
+	struct l_wait_info     lwi;
+
+	ENTRY;
+
+	init_waitqueue_head(&waitq);
+	/* remember the generation so that a resend across an eviction can
+	 * be detected below */
+	generation = exp->exp_obd->u.cli.cl_import->imp_generation;
+
+restart_bulk:
+	rc = osc_brw_prep_request(cmd, &exp->exp_obd->u.cli, oa, lsm,
+				  page_count, pga, &req, ocapa, 0, resends);
+	/* leave through RETURN() so the ENTRY above stays balanced */
+	if (rc != 0)
+		RETURN(rc);
+
+	if (resends) {
+		req->rq_generation_set = 1;
+		req->rq_import_generation = generation;
+		req->rq_sent = cfs_time_current_sec() + resends;
+	}
+
+	rc = ptlrpc_queue_wait(req);
+
+	/* NOTE(review): this restart does not count as a resend, so it is
+	 * not bounded by client_should_resend() - confirm that is intended */
+	if (rc == -ETIMEDOUT && req->rq_resend) {
+		DEBUG_REQ(D_HA, req,  "BULK TIMEOUT");
+		ptlrpc_req_finished(req);
+		goto restart_bulk;
+	}
+
+	rc = osc_brw_fini_request(req, rc);
+
+	ptlrpc_req_finished(req);
+	/* When server return -EINPROGRESS, client should always retry
+	 * regardless of the number of times the bulk was resent already.*/
+	if (osc_recoverable_error(rc)) {
+		resends++;
+		if (rc != -EINPROGRESS &&
+		    !client_should_resend(resends, &exp->exp_obd->u.cli)) {
+			CERROR("%s: too many resend retries for object: "
+			       ""DOSTID", rc = %d.\n", exp->exp_obd->obd_name,
+			       POSTID(&oa->o_oi), rc);
+			goto out;
+		}
+		if (generation !=
+		    exp->exp_obd->u.cli.cl_import->imp_generation) {
+			CDEBUG(D_HA, "%s: resend cross eviction for object: "
+			       ""DOSTID", rc = %d.\n", exp->exp_obd->obd_name,
+			       POSTID(&oa->o_oi), rc);
+			goto out;
+		}
+
+		/* the condition is constant 0, so this just sleeps for
+		 * @resends seconds (interruptibly) before retrying */
+		lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(resends), NULL, NULL,
+				       NULL);
+		l_wait_event(waitq, 0, &lwi);
+
+		goto restart_bulk;
+	}
+out:
+	if (rc == -EAGAIN || rc == -EINPROGRESS)
+		rc = -EIO;
+	RETURN (rc);
+}
+/*
+ * Re-issue a bulk request that failed with the recoverable error @rc.
+ *
+ * A new request is prepared for the same pages and obdo with the resend
+ * flag set.  If any async page attached to the old request has been
+ * interrupted, the new request is dropped and -EINTR is returned.
+ * Otherwise the new request takes over the callbacks, async args, page and
+ * extent lists, capability and per-page request references of @request,
+ * and is handed to ptlrpcd.
+ *
+ * Returns 0 once the new request is queued, or a negative errno.
+ */
+static int osc_brw_redo_request(struct ptlrpc_request *request,
+				struct osc_brw_async_args *aa, int rc)
+{
+	struct ptlrpc_request *new_req;
+	struct osc_brw_async_args *new_aa;
+	struct osc_async_page *oap;
+	ENTRY;
+
+	DEBUG_REQ(rc == -EINPROGRESS ? D_RPCTRACE : D_ERROR, request,
+		  "redo for recoverable error %d", rc);
+
+	rc = osc_brw_prep_request(lustre_msg_get_opc(request->rq_reqmsg) ==
+					OST_WRITE ? OBD_BRW_WRITE :OBD_BRW_READ,
+				  aa->aa_cli, aa->aa_oa,
+				  NULL /* lsm unused by osc currently */,
+				  aa->aa_page_count, aa->aa_ppga,
+				  &new_req, aa->aa_ocapa, 0, 1);
+	if (rc)
+		RETURN(rc);
+
+	list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) {
+		if (oap->oap_request != NULL) {
+			LASSERTF(request == oap->oap_request,
+				 "request %p != oap_request %p\n",
+				 request, oap->oap_request);
+			if (oap->oap_interrupted) {
+				ptlrpc_req_finished(new_req);
+				RETURN(-EINTR);
+			}
+		}
+	}
+	/* New request takes over pga and oaps from old request.
+	 * Note that copying a list_head doesn't work, need to move it... */
+	aa->aa_resends++;
+	new_req->rq_interpret_reply = request->rq_interpret_reply;
+	new_req->rq_async_args = request->rq_async_args;
+	new_req->rq_commit_cb = request->rq_commit_cb;
+	/* cap resend delay to the current request timeout, this is similar to
+	 * what ptlrpc does (see after_reply()) */
+	if (aa->aa_resends > new_req->rq_timeout)
+		new_req->rq_sent = cfs_time_current_sec() + new_req->rq_timeout;
+	else
+		new_req->rq_sent = cfs_time_current_sec() + aa->aa_resends;
+	new_req->rq_generation_set = 1;
+	new_req->rq_import_generation = request->rq_import_generation;
+
+	new_aa = ptlrpc_req_async_args(new_req);
+
+	INIT_LIST_HEAD(&new_aa->aa_oaps);	/* copied heads are stale */
+	list_splice_init(&aa->aa_oaps, &new_aa->aa_oaps);
+	INIT_LIST_HEAD(&new_aa->aa_exts);
+	list_splice_init(&aa->aa_exts, &new_aa->aa_exts);
+	new_aa->aa_resends = aa->aa_resends;
+
+	list_for_each_entry(oap, &new_aa->aa_oaps, oap_rpc_item) {
+		if (oap->oap_request) {
+			ptlrpc_req_finished(oap->oap_request);
+			oap->oap_request = ptlrpc_request_addref(new_req);
+		}
+	}
+
+	new_aa->aa_ocapa = aa->aa_ocapa;
+	aa->aa_ocapa = NULL;	/* the old args no longer hold the capa */
+
+	/* XXX: This code will run into problem if we're going to support
+	 * to add a series of BRW RPCs into a self-defined ptlrpc_request_set
+	 * and wait for all of them to be finished. We should inherit request
+	 * set from old request. */
+	ptlrpcd_add_req(new_req, PDL_POLICY_SAME, -1);
+
+	DEBUG_REQ(D_INFO, new_req, "new request");
+	RETURN(0);
+}
+
+/*
+ * Sort @array by ascending page offset, because we want disk allocation on
+ * the target to happen in offset order.  Following Sedgewick's advice this
+ * is a plain Shell sort: it is good enough for our small page arrays and
+ * needs no allocation.  It is an insertion sort over elements that are a
+ * fixed gap apart, with the gap shrinking until it is 1 and the array is
+ * sorted.
+ */
+static void sort_brw_pages(struct brw_page **array, int num)
+{
+	struct brw_page *cur;
+	int gap = 1;
+	int pos, slot;
+
+	if (num == 1)
+		return;
+
+	/* grow the gap through 1, 4, 13, ... until it is at least num */
+	while (gap < num)
+		gap = (gap * 3) + 1;
+
+	do {
+		gap /= 3;
+		for (pos = gap; pos < num; pos++) {
+			cur = array[pos];
+			/* shift larger entries up by one gap to make room */
+			for (slot = pos;
+			     slot >= gap && array[slot - gap]->off > cur->off;
+			     slot -= gap)
+				array[slot] = array[slot - gap];
+			array[slot] = cur;
+		}
+	} while (gap > 1);
+}
+
+/* Count how many leading entries of @pg (at most @pages) can go into one
+ * transfer without a hole: every page except the last must run up to the
+ * page boundary, and every page except the first must start on one. */
+static obd_count max_unfragmented_pages(struct brw_page **pg, obd_count pages)
+{
+	int nr = 1;
+	int idx = 0;
+	int poff;
+
+	LASSERT (pages > 0);
+	poff = pg[idx]->off & ~CFS_PAGE_MASK;
+
+	while (--pages != 0) {
+		/* current page stops short of the page boundary */
+		if (poff + pg[idx]->count < PAGE_CACHE_SIZE)
+			break;
+
+		idx++;
+		poff = pg[idx]->off & ~CFS_PAGE_MASK;
+		/* next page does not begin on a page boundary */
+		if (poff != 0)
+			break;
+
+		nr++;
+	}
+
+	return nr;
+}
+
+/* Allocate an array of @count pointers, one for each element of the flat
+ * array @pga.  Returns NULL if the allocation fails; release the result
+ * with osc_release_ppga(). */
+static struct brw_page **osc_build_ppga(struct brw_page *pga, obd_count count)
+{
+	struct brw_page **vec;
+	int idx;
+
+	OBD_ALLOC(vec, sizeof(*vec) * count);
+	if (vec != NULL) {
+		for (idx = 0; idx < count; idx++)
+			vec[idx] = &pga[idx];
+	}
+
+	return vec;
+}
+
+/* Free a pointer array of @count entries; @ppga must not be NULL. */
+static void osc_release_ppga(struct brw_page **ppga, obd_count count)
+{
+	LASSERT(ppga != NULL);
+	OBD_FREE(ppga, count * sizeof(ppga[0]));
+}
+
+/* Synchronous bulk I/O entry point.  With OBD_BRW_CHECK in @cmd it only
+ * reports whether I/O could possibly succeed (-EIO if the import is
+ * invalid).  Otherwise the pages are sorted by offset and sent in chunks of
+ * at most cl_max_pages_per_rpc unfragmented pages, stopping at the first
+ * error.
+ *
+ * Returns 0 on success or a negative errno. */
+static int osc_brw(int cmd, struct obd_export *exp, struct obd_info *oinfo,
+		   obd_count page_count, struct brw_page *pga,
+		   struct obd_trans_info *oti)
+{
+	struct obd_import *imp = class_exp2cliimp(exp);
+	struct brw_page **ppga_base, **cursor;
+	struct obdo *oa_backup = NULL;
+	struct client_obd *cli;
+	int nr_total;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT((imp != NULL) && (imp->imp_obd != NULL));
+	cli = &imp->imp_obd->u.cli;
+
+	/* The caller just wants to know if there's a chance that this
+	 * I/O can succeed */
+	if (cmd & OBD_BRW_CHECK)
+		RETURN(imp->imp_invalid ? -EIO : 0);
+
+	/* test_brw with a failed create can trip this, maybe others. */
+	LASSERT(cli->cl_max_pages_per_rpc);
+
+	ppga_base = cursor = osc_build_ppga(pga, page_count);
+	if (cursor == NULL)
+		RETURN(-ENOMEM);
+	nr_total = page_count;
+
+	sort_brw_pages(cursor, page_count);
+	while (page_count) {
+		obd_count chunk;
+
+		chunk = page_count > cli->cl_max_pages_per_rpc ?
+			cli->cl_max_pages_per_rpc : page_count;
+		chunk = max_unfragmented_pages(cursor, chunk);
+
+		if (oa_backup != NULL) {
+			/* a previous brw clobbered the oa: put it back */
+			*oinfo->oi_oa = *oa_backup;
+		} else if (page_count > chunk) {
+			/* more chunks will follow, so keep a pristine copy
+			 * of the oa before brw clobbers it */
+			OBDO_ALLOC(oa_backup);
+			if (oa_backup == NULL)
+				GOTO(out, rc = -ENOMEM);
+			*oa_backup = *oinfo->oi_oa;
+		}
+
+		rc = osc_brw_internal(cmd, exp, oinfo->oi_oa, oinfo->oi_md,
+				      chunk, cursor, oinfo->oi_capa);
+		if (rc != 0)
+			break;
+
+		cursor += chunk;
+		page_count -= chunk;
+	}
+
+out:
+	osc_release_ppga(ppga_base, nr_total);
+
+	if (oa_backup != NULL)
+		OBDO_FREE(oa_backup);
+
+	RETURN(rc);
+}
+/*
+ * Reply-interpret callback for bulk RPCs whose async args are @data.
+ *
+ * The reply is first run through osc_brw_fini_request().  A recoverable
+ * error is re-issued through osc_brw_redo_request() unless the import
+ * generation changed or the resend limit is reached; when the redo is
+ * queued this returns 0 at once and skips the tear-down below.
+ *
+ * Otherwise the capability reference is dropped, every extent is finished
+ * with the final rc, on success the object attributes present in the reply
+ * obdo are applied, and the obdo, cl_req and page array are released.
+ * Finally the in-flight counter is decremented, cache waiters are woken and
+ * I/O is unplugged.
+ *
+ * Returns the final rc, with -EAGAIN/-EINPROGRESS reported as -EIO.
+ */
+static int brw_interpret(const struct lu_env *env,
+			 struct ptlrpc_request *req, void *data, int rc)
+{
+	struct osc_brw_async_args *aa = data;
+	struct osc_extent *ext;
+	struct osc_extent *tmp;
+	struct cl_object  *obj = NULL;
+	struct client_obd *cli = aa->aa_cli;
+	ENTRY;
+
+	rc = osc_brw_fini_request(req, rc);
+	CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc);
+	/* When server return -EINPROGRESS, client should always retry
+	 * regardless of the number of times the bulk was resent already. */
+	if (osc_recoverable_error(rc)) {
+		if (req->rq_import_generation !=
+		    req->rq_import->imp_generation) {
+			CDEBUG(D_HA, "%s: resend cross eviction for object: "
+			       ""DOSTID", rc = %d.\n",
+			       req->rq_import->imp_obd->obd_name,
+			       POSTID(&aa->aa_oa->o_oi), rc);
+		} else if (rc == -EINPROGRESS ||
+		    client_should_resend(aa->aa_resends, aa->aa_cli)) {
+			rc = osc_brw_redo_request(req, aa, rc);
+		} else {
+			CERROR("%s: too many resent retries for object: "
+			       ""LPU64":"LPU64", rc = %d.\n",
+			       req->rq_import->imp_obd->obd_name,
+			       POSTID(&aa->aa_oa->o_oi), rc);
+		}
+
+		if (rc == 0)
+			RETURN(0);	/* redo queued: tear-down is skipped */
+		else if (rc == -EAGAIN || rc == -EINPROGRESS)
+			rc = -EIO;
+	}
+
+	if (aa->aa_ocapa) {
+		capa_put(aa->aa_ocapa);
+		aa->aa_ocapa = NULL;
+	}
+
+	list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) {
+		if (obj == NULL && rc == 0) {
+			obj = osc2cl(ext->oe_obj);
+			cl_object_get(obj);	/* dropped by cl_object_put() */
+		}
+
+		list_del_init(&ext->oe_link);
+		osc_extent_finish(env, ext, 1, rc);
+	}
+	LASSERT(list_empty(&aa->aa_exts));
+	LASSERT(list_empty(&aa->aa_oaps));
+
+	if (obj != NULL) {
+		struct obdo *oa = aa->aa_oa;
+		struct cl_attr *attr  = &osc_env_info(env)->oti_attr;
+		unsigned long valid = 0;
+
+		LASSERT(rc == 0);
+		if (oa->o_valid & OBD_MD_FLBLOCKS) {
+			attr->cat_blocks = oa->o_blocks;
+			valid |= CAT_BLOCKS;
+		}
+		if (oa->o_valid & OBD_MD_FLMTIME) {
+			attr->cat_mtime = oa->o_mtime;
+			valid |= CAT_MTIME;
+		}
+		if (oa->o_valid & OBD_MD_FLATIME) {
+			attr->cat_atime = oa->o_atime;
+			valid |= CAT_ATIME;
+		}
+		if (oa->o_valid & OBD_MD_FLCTIME) {
+			attr->cat_ctime = oa->o_ctime;
+			valid |= CAT_CTIME;
+		}
+		if (valid != 0) {
+			cl_object_attr_lock(obj);
+			cl_object_attr_set(env, obj, attr, valid);
+			cl_object_attr_unlock(obj);
+		}
+		cl_object_put(env, obj);
+	}
+	OBDO_FREE(aa->aa_oa);
+
+	cl_req_completion(env, aa->aa_clerq, rc < 0 ? rc :
+			  req->rq_bulk->bd_nob_transferred);
+	osc_release_ppga(aa->aa_ppga, aa->aa_page_count);
+	ptlrpc_lprocfs_brw(req, req->rq_bulk->bd_nob_transferred);
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	/* We need to decrement before osc_ap_completion->osc_wake_cache_waiters
+	 * is called so we know whether to go to sync BRWs or wait for more
+	 * RPCs to complete */
+	if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE)
+		cli->cl_w_in_flight--;
+	else
+		cli->cl_r_in_flight--;
+	osc_wake_cache_waiters(cli);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+	osc_io_unplug(env, cli, NULL, PDL_POLICY_SAME);
+	RETURN(rc);
+}
+
+/* Commit callback for bulk write requests.  Under rq_lock, either release
+ * the unstable-page accounting already taken for @req, or just record that
+ * the request has been committed. */
+static void brw_commit(struct ptlrpc_request *req)
+{
+	spin_lock(&req->rq_lock);
+	/* osc_inc_unstable_pages (via osc_extent_finish) can race with this
+	 * function being called through rq_commit_cb.  If the pages are
+	 * already marked unstable, osc_dec_unstable_pages must still run
+	 * here, otherwise unstable pages may be leaked. */
+	if (!req->rq_unstable)
+		req->rq_committed = 1;
+	else
+		osc_dec_unstable_pages(req);
+	spin_unlock(&req->rq_lock);
+}
+
+/**
+ * Build an RPC by the list of extent @ext_list. The caller must ensure
+ * that the total pages in this list are NOT over max pages per RPC.
+ * Extents in the list must be in OES_RPC state.
+ */
+int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
+		  struct list_head *ext_list, int cmd, pdl_policy_t pol)
+{
+	struct ptlrpc_request *req = NULL;
+	struct osc_extent *ext;
+	LIST_HEAD(rpc_list);
+	struct brw_page **pga = NULL;
+	struct osc_brw_async_args *aa = NULL;
+	struct obdo *oa = NULL;
+	struct osc_async_page *oap;
+	struct osc_async_page *tmp;
+	struct cl_req *clerq = NULL;
+	enum cl_req_type crt = (cmd & OBD_BRW_WRITE) ? CRT_WRITE : CRT_READ;
+	struct ldlm_lock *lock = NULL;
+	struct cl_req_attr crattr;
+	obd_off starting_offset = OBD_OBJECT_EOF;
+	obd_off ending_offset = 0;
+	int i, rc, mpflag = 0, mem_tight = 0, page_count = 0;
+
+	ENTRY;
+	LASSERT(!list_empty(ext_list));
+
+	/* add pages into rpc_list to build BRW rpc */
+	list_for_each_entry(ext, ext_list, oe_link) {
+		LASSERT(ext->oe_state == OES_RPC);
+		mem_tight |= ext->oe_memalloc;
+		list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
+			++page_count;
+			list_add_tail(&oap->oap_rpc_item, &rpc_list);
+			if (starting_offset > oap->oap_obj_off)
+				starting_offset = oap->oap_obj_off;
+			else
+				LASSERT(oap->oap_page_off == 0);
+			if (ending_offset < oap->oap_obj_off + oap->oap_count)
+				ending_offset = oap->oap_obj_off +
+						oap->oap_count;
+			else
+				LASSERT(oap->oap_page_off + oap->oap_count ==
+					PAGE_CACHE_SIZE);
+		}
+	}
+
+	if (mem_tight)
+		mpflag = cfs_memory_pressure_get_and_set();
+
+	memset(&crattr, 0, sizeof crattr);
+	OBD_ALLOC(pga, sizeof(*pga) * page_count);
+	if (pga == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	OBDO_ALLOC(oa);
+	if (oa == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	i = 0;
+	list_for_each_entry(oap, &rpc_list, oap_rpc_item) {
+		struct cl_page *page = oap2cl_page(oap);
+		if (clerq == NULL) {
+			clerq = cl_req_alloc(env, page, crt,
+					     1 /* only 1-object rpcs for
+						* now */);
+			if (IS_ERR(clerq))
+				GOTO(out, rc = PTR_ERR(clerq));
+			lock = oap->oap_ldlm_lock;
+		}
+		if (mem_tight)
+			oap->oap_brw_flags |= OBD_BRW_MEMALLOC;
+		pga[i] = &oap->oap_brw_page;
+		pga[i]->off = oap->oap_obj_off + oap->oap_page_off;
+		CDEBUG(0, "put page %p index %lu oap %p flg %x to pga\n",
+		       pga[i]->pg, page_index(oap->oap_page), oap, pga[i]->flag);
+		i++;
+		cl_req_page_add(env, clerq, page);
+	}
+
+	/* always get the data for the obdo for the rpc */
+	LASSERT(clerq != NULL);
+	crattr.cra_oa = oa;
+	crattr.cra_capa = NULL;
+	memset(crattr.cra_jobid, 0, JOBSTATS_JOBID_SIZE);
+	cl_req_attr_set(env, clerq, &crattr, ~0ULL);
+	if (lock) {
+		oa->o_handle = lock->l_remote_handle;
+		oa->o_valid |= OBD_MD_FLHANDLE;
+	}
+
+	rc = cl_req_prep(env, clerq);
+	if (rc != 0) {
+		CERROR("cl_req_prep failed: %d\n", rc);
+		GOTO(out, rc);
+	}
+
+	sort_brw_pages(pga, page_count);
+	rc = osc_brw_prep_request(cmd, cli, oa, NULL, page_count,
+			pga, &req, crattr.cra_capa, 1, 0);
+	if (rc != 0) {
+		CERROR("prep_req failed: %d\n", rc);
+		GOTO(out, rc);
+	}
+
+	req->rq_commit_cb = brw_commit;
+	req->rq_interpret_reply = brw_interpret;
+
+	if (mem_tight != 0)
+		req->rq_memalloc = 1;
+
+	/* Need to update the timestamps after the request is built in case
+	 * we race with setattr (locally or in queue at OST).  If OST gets
+	 * later setattr before earlier BRW (as determined by the request xid),
+	 * the OST will not use BRW timestamps.  Sadly, there is no obvious
+	 * way to do this in a single call.  bug 10150 */
+	cl_req_attr_set(env, clerq, &crattr,
+			OBD_MD_FLMTIME|OBD_MD_FLCTIME|OBD_MD_FLATIME);
+
+	lustre_msg_set_jobid(req->rq_reqmsg, crattr.cra_jobid);
+
+	CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+	aa = ptlrpc_req_async_args(req);
+	INIT_LIST_HEAD(&aa->aa_oaps);
+	list_splice_init(&rpc_list, &aa->aa_oaps);
+	INIT_LIST_HEAD(&aa->aa_exts);
+	list_splice_init(ext_list, &aa->aa_exts);
+	aa->aa_clerq = clerq;
+
+	/* queued sync pages can be torn down while the pages
+	 * were between the pending list and the rpc */
+	tmp = NULL;
+	list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) {
+		/* only one oap gets a request reference */
+		if (tmp == NULL)
+			tmp = oap;
+		if (oap->oap_interrupted && !req->rq_intr) {
+			CDEBUG(D_INODE, "oap %p in req %p interrupted\n",
+					oap, req);
+			ptlrpc_mark_interrupted(req);
+		}
+	}
+	if (tmp != NULL)
+		tmp->oap_request = ptlrpc_request_addref(req);
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	starting_offset >>= PAGE_CACHE_SHIFT;
+	if (cmd == OBD_BRW_READ) {
+		cli->cl_r_in_flight++;
+		lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count);
+		lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight);
+		lprocfs_oh_tally_log2(&cli->cl_read_offset_hist,
+				      starting_offset + 1);
+	} else {
+		cli->cl_w_in_flight++;
+		lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count);
+		lprocfs_oh_tally(&cli->cl_write_rpc_hist, cli->cl_w_in_flight);
+		lprocfs_oh_tally_log2(&cli->cl_write_offset_hist,
+				      starting_offset + 1);
+	}
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+	DEBUG_REQ(D_INODE, req, "%d pages, aa %p. now %dr/%dw in flight",
+		  page_count, aa, cli->cl_r_in_flight,
+		  cli->cl_w_in_flight);
+
+	/* XXX: Maybe the caller can check the RPC bulk descriptor to
+	 * see which CPU/NUMA node the majority of pages were allocated
+	 * on, and try to assign the async RPC to the CPU core
+	 * (PDL_POLICY_PREFERRED) to reduce cross-CPU memory traffic.
+	 *
+	 * But on the other hand, we expect that multiple ptlrpcd
+	 * threads and the initial write sponsor can run in parallel,
+	 * especially when data checksum is enabled, which is CPU-bound
+	 * operation and single ptlrpcd thread cannot process in time.
+	 * So more ptlrpcd threads sharing BRW load
+	 * (with PDL_POLICY_ROUND) seems better.
+	 */
+	ptlrpcd_add_req(req, pol, -1);
+	rc = 0;
+	EXIT;
+
+out:
+	if (mem_tight != 0)
+		cfs_memory_pressure_restore(mpflag);
+
+	capa_put(crattr.cra_capa);
+	if (rc != 0) {
+		LASSERT(req == NULL);
+
+		if (oa)
+			OBDO_FREE(oa);
+		if (pga)
+			OBD_FREE(pga, sizeof(*pga) * page_count);
+		/* this should happen rarely and is pretty bad, it makes the
+		 * pending list not follow the dirty order */
+		while (!list_empty(ext_list)) {
+			ext = list_entry(ext_list->next, struct osc_extent,
+					     oe_link);
+			list_del_init(&ext->oe_link);
+			osc_extent_finish(env, ext, 0, rc);
+		}
+		if (clerq && !IS_ERR(clerq))
+			cl_req_completion(env, clerq, rc);
+	}
+	RETURN(rc);
+}
+
+/*
+ * Bind the enqueue callback data to an ldlm lock if nobody owns it yet.
+ *
+ * The lock's blocking/completion/glimpse ASTs and its resource type are
+ * asserted to match those recorded in einfo.  Between lock_res_and_lock()
+ * and spin_lock(&osc_ast_guard), l_ast_data is set to einfo->ei_cbdata when
+ * it is still NULL.
+ *
+ * Returns 1 when l_ast_data equals einfo->ei_cbdata afterwards, 0 otherwise.
+ */
+static int osc_set_lock_data_with_check(struct ldlm_lock *lock,
+					struct ldlm_enqueue_info *einfo)
+{
+	void *cbdata = einfo->ei_cbdata;
+	int owned;
+
+	LASSERT(lock != NULL);
+	LASSERT(lock->l_blocking_ast == einfo->ei_cb_bl);
+	LASSERT(lock->l_resource->lr_type == einfo->ei_type);
+	LASSERT(lock->l_completion_ast == einfo->ei_cb_cp);
+	LASSERT(lock->l_glimpse_ast == einfo->ei_cb_gl);
+
+	lock_res_and_lock(lock);
+	spin_lock(&osc_ast_guard);
+
+	/* claim an unowned lock, then report whether it is ours */
+	if (lock->l_ast_data == NULL)
+		lock->l_ast_data = cbdata;
+	owned = (lock->l_ast_data == cbdata);
+
+	spin_unlock(&osc_ast_guard);
+	unlock_res_and_lock(lock);
+
+	return owned;
+}
+
+/*
+ * Handle-based wrapper around osc_set_lock_data_with_check().
+ *
+ * Resolves lockh to a lock, runs the check, and drops the reference obtained
+ * from ldlm_handle2lock().  Returns 0 and logs an error when the handle no
+ * longer resolves to a lock; otherwise returns the result of the check.
+ */
+static int osc_set_data_with_check(struct lustre_handle *lockh,
+				   struct ldlm_enqueue_info *einfo)
+{
+	struct ldlm_lock *lock = ldlm_handle2lock(lockh);
+	int matched;
+
+	if (lock == NULL) {
+		CERROR("lockh %p, data %p - client evicted?\n",
+		       lockh, einfo->ei_cbdata);
+		return 0;
+	}
+
+	matched = osc_set_lock_data_with_check(lock, einfo);
+	LDLM_LOCK_PUT(lock);
+	return matched;
+}
+
+/*
+ * Run the iterator "replace" with argument "data" over the locks of the
+ * resource named after lsm->lsm_oi.  The iterator's result is discarded;
+ * this function always returns 0.
+ */
+static int osc_change_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm,
+			     ldlm_iterator_t replace, void *data)
+{
+	struct ldlm_res_id name;
+	struct obd_device *dev = class_exp2obd(exp);
+
+	ostid_build_res_name(&lsm->lsm_oi, &name);
+	ldlm_resource_iterate(dev->obd_namespace, &name, replace, data);
+
+	return 0;
+}
+
+/*
+ * Look for any ldlm lock of the inode in this osc by iterating the resource
+ * named after lsm->lsm_oi with the supplied iterator.
+ *
+ * Returns:
+ *	0	the iteration ran to completion (LDLM_ITER_CONTINUE), not found
+ *	1	the iterator stopped the walk (LDLM_ITER_STOP), found one
+ *	other	whatever else ldlm_resource_iterate() returned
+ */
+static int osc_find_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm,
+			   ldlm_iterator_t replace, void *data)
+{
+	struct ldlm_res_id name;
+	struct obd_device *dev = class_exp2obd(exp);
+	int iter_rc;
+
+	ostid_build_res_name(&lsm->lsm_oi, &name);
+	iter_rc = ldlm_resource_iterate(dev->obd_namespace, &name, replace,
+					data);
+
+	if (iter_rc == LDLM_ITER_STOP)
+		return 1;
+	if (iter_rc == LDLM_ITER_CONTINUE)
+		return 0;
+	return iter_rc;
+}
+
+/*
+ * Finish an enqueue on the osc side and notify the caller.
+ *
+ * For an intent enqueue that ended with ELDLM_LOCK_ABORTED, rc is replaced
+ * by the reply's lock_policy_res1 when that field is non-zero.  If the
+ * result is 0, or is still ELDLM_LOCK_ABORTED for a non-AGL intent request,
+ * LDLM_FL_LVB_READY is set in *flags.  Finally the upcall is invoked with
+ * the cookie and rc, and its return value is returned.
+ */
+static int osc_enqueue_fini(struct ptlrpc_request *req, struct ost_lvb *lvb,
+			    obd_enqueue_update_f upcall, void *cookie,
+			    __u64 *flags, int agl, int rc)
+{
+	int has_intent = *flags & LDLM_FL_HAS_INTENT;
+	ENTRY;
+
+	/* The request was created before ldlm_cli_enqueue call. */
+	if (has_intent && rc == ELDLM_LOCK_ABORTED) {
+		struct ldlm_reply *reply;
+
+		reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+		LASSERT(reply != NULL);
+		if (reply->lock_policy_res1)
+			rc = reply->lock_policy_res1;
+	}
+
+	if (rc == 0 ||
+	    (has_intent != 0 && agl == 0 && rc == ELDLM_LOCK_ABORTED)) {
+		*flags |= LDLM_FL_LVB_READY;
+		CDEBUG(D_INODE, "got kms "LPU64" blocks "LPU64" mtime "LPU64"\n",
+		       lvb->lvb_size, lvb->lvb_blocks, lvb->lvb_mtime);
+	}
+
+	/* Hand the final status to the caller's update callback. */
+	rc = upcall(cookie, rc);
+	RETURN(rc);
+}
+
+/*
+ * Reply interpreter for asynchronous enqueue requests; installed as
+ * rq_interpret_reply by osc_enqueue_base().
+ *
+ * Copies the lock handle and mode out of the async args, completes the ldlm
+ * side with ldlm_cli_enqueue_fini(), then the osc side (including the
+ * caller's upcall) with osc_enqueue_fini(), and finally drops the lock
+ * references taken for the duration of the call.
+ *
+ * Returns the value produced by osc_enqueue_fini().
+ */
+static int osc_enqueue_interpret(const struct lu_env *env,
+				 struct ptlrpc_request *req,
+				 struct osc_enqueue_args *aa, int rc)
+{
+	struct ldlm_lock *lock;
+	struct lustre_handle handle;
+	__u32 mode;
+	struct ost_lvb *lvb;
+	__u32 lvb_len;
+	__u64 *flags = aa->oa_flags;
+
+	/* Make a local copy of a lock handle and a mode, because aa->oa_*
+	 * might be freed anytime after lock upcall has been called. */
+	lustre_handle_copy(&handle, aa->oa_lockh);
+	mode = aa->oa_ei->ei_mode;
+
+	/* ldlm_cli_enqueue is holding a reference on the lock, so it must
+	 * be valid. */
+	/* NOTE(review): the result is only asserted non-NULL near the end of
+	 * this function, after the handle has already been used. */
+	lock = ldlm_handle2lock(&handle);
+
+	/* Take an additional reference so that a blocking AST that
+	 * ldlm_cli_enqueue_fini() might post for a failed lock, is guaranteed
+	 * to arrive after an upcall has been executed by
+	 * osc_enqueue_fini(). */
+	ldlm_lock_addref(&handle, mode);
+
+	/* Let CP AST to grant the lock first. */
+	OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1);
+
+	/* An aborted AGL enqueue passes no LVB buffer to the ldlm layer. */
+	if (aa->oa_agl && rc == ELDLM_LOCK_ABORTED) {
+		lvb = NULL;
+		lvb_len = 0;
+	} else {
+		lvb = aa->oa_lvb;
+		lvb_len = sizeof(*aa->oa_lvb);
+	}
+
+	/* Complete obtaining the lock procedure. */
+	rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, aa->oa_ei->ei_type, 1,
+				   mode, flags, lvb, lvb_len, &handle, rc);
+	/* Complete osc stuff. */
+	rc = osc_enqueue_fini(req, aa->oa_lvb, aa->oa_upcall, aa->oa_cookie,
+			      flags, aa->oa_agl, rc);
+
+	OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10);
+
+	/* Release the lock for async request. */
+	if (lustre_handle_is_used(&handle) && rc == ELDLM_OK)
+		/*
+		 * Releases a reference taken by ldlm_cli_enqueue(), if it is
+		 * not already released by
+		 * ldlm_cli_enqueue_fini()->failed_lock_cleanup()
+		 */
+		ldlm_lock_decref(&handle, mode);
+
+	LASSERTF(lock != NULL, "lockh %p, req %p, aa %p - client evicted?\n",
+		 aa->oa_lockh, req, aa);
+	/* Drop the extra mode reference taken above, then the reference
+	 * obtained from ldlm_handle2lock(). */
+	ldlm_lock_decref(&handle, mode);
+	LDLM_LOCK_PUT(lock);
+	return rc;
+}
+
+/*
+ * Update a stripe's cached LVB and known minimum size (KMS) after an
+ * enqueue finished with status rc.
+ *
+ * ELDLM_OK: copy *lvb into loi->loi_lvb, raise the KMS to the LVB size
+ * clamped to the end of the lock extent (never lowering it), and allow the
+ * lock to be matched.
+ * ELDLM_LOCK_ABORTED with LDLM_FL_HAS_INTENT set in flags (a glimpse): copy
+ * the LVB, leave the KMS alone, allow matching, and treat the result as OK.
+ * Any other status: if the handle still resolves to a lock, mark it as
+ * failed for matching.
+ */
+void osc_update_enqueue(struct lustre_handle *lov_lockhp,
+			struct lov_oinfo *loi, int flags,
+			struct ost_lvb *lvb, __u32 mode, int rc)
+{
+	struct ldlm_lock *lock = ldlm_handle2lock(lov_lockhp);
+
+	if (rc == ELDLM_OK) {
+		__u64 kms;
+
+		LASSERT(lock != NULL);
+		loi->loi_lvb = *lvb;
+		kms = loi->loi_lvb.lvb_size;
+
+		/* Extend KMS up to the end of this lock and no further
+		 * A lock on [x,y] means a KMS of up to y + 1 bytes! */
+		if (kms > lock->l_policy_data.l_extent.end)
+			kms = lock->l_policy_data.l_extent.end + 1;
+
+		if (kms < loi->loi_kms) {
+			LDLM_DEBUG(lock, "lock acquired, setting rss="
+				   LPU64"; leaving kms="LPU64", end="LPU64,
+				   loi->loi_lvb.lvb_size, loi->loi_kms,
+				   lock->l_policy_data.l_extent.end);
+		} else {
+			LDLM_DEBUG(lock, "lock acquired, setting rss="LPU64
+				   ", kms="LPU64, loi->loi_lvb.lvb_size, kms);
+			loi_kms_set(loi, kms);
+		}
+		ldlm_lock_allow_match(lock);
+	} else if (rc == ELDLM_LOCK_ABORTED && (flags & LDLM_FL_HAS_INTENT)) {
+		LASSERT(lock != NULL);
+		loi->loi_lvb = *lvb;
+		ldlm_lock_allow_match(lock);
+		CDEBUG(D_INODE, "glimpsed, setting rss="LPU64"; leaving"
+		       " kms="LPU64"\n", loi->loi_lvb.lvb_size, loi->loi_kms);
+		rc = ELDLM_OK;
+	}
+
+	if (lock == NULL)
+		return;
+
+	if (rc != ELDLM_OK)
+		ldlm_lock_fail_match(lock);
+
+	/* drop the reference obtained from ldlm_handle2lock() */
+	LDLM_LOCK_PUT(lock);
+}
+EXPORT_SYMBOL(osc_update_enqueue);
+
+/* Sentinel request-set pointer.  osc_enqueue_base() compares its rqset
+ * argument against this value and, on a match, queues the request with
+ * ptlrpcd_add_req() instead of ptlrpc_set_add_req().  The value (void *)1
+ * is only used for that comparison there, never dereferenced. */
+struct ptlrpc_request_set *PTLRPCD_SET = (void *)1;
+
+/* When enqueuing asynchronously, locks are not ordered, we can obtain a lock
+ * from the 2nd OSC before a lock from the 1st one. This does not deadlock with
+ * other synchronous requests, however keeping some locks and trying to obtain
+ * others may take a considerable amount of time in a case of ost failure; and
+ * when other sync requests do not get released lock from a client, the client
+ * is excluded from the cluster -- such scenarios make the life difficult, so
+ * release locks just after they are obtained.
+ *
+ * Overview of the flow below:
+ *  1. Round the requested extent out to page boundaries.
+ *  2. If kms_valid, try to match an already cached lock; on a usable match
+ *     the upcall is invoked with ELDLM_OK and no RPC is sent.
+ *  3. Otherwise enqueue via ldlm_cli_enqueue().  With a non-NULL rqset the
+ *     request is handed to ptlrpcd (rqset == PTLRPCD_SET) or added to rqset,
+ *     and osc_enqueue_interpret() completes it later; without one,
+ *     osc_enqueue_fini() runs inline.
+ *
+ * Returns -ECANCELED when an AGL request matches a lock whose LVB is not
+ * ready, -ENOMEM or the ldlm_prep_enqueue_req() error if the intent request
+ * cannot be built, ELDLM_OK on a cached match, and otherwise the result of
+ * ldlm_cli_enqueue() (async) or osc_enqueue_fini() (sync). */
+int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id,
+		     __u64 *flags, ldlm_policy_data_t *policy,
+		     struct ost_lvb *lvb, int kms_valid,
+		     obd_enqueue_update_f upcall, void *cookie,
+		     struct ldlm_enqueue_info *einfo,
+		     struct lustre_handle *lockh,
+		     struct ptlrpc_request_set *rqset, int async, int agl)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct ptlrpc_request *req = NULL;
+	int intent = *flags & LDLM_FL_HAS_INTENT;
+	/* AGL requests may match locks whose LVB is not ready yet */
+	int match_lvb = (agl != 0 ? 0 : LDLM_FL_LVB_READY);
+	ldlm_mode_t mode;
+	int rc;
+	ENTRY;
+
+	/* Filesystem lock extents are extended to page boundaries so that
+	 * dealing with the page cache is a little smoother.  */
+	policy->l_extent.start -= policy->l_extent.start & ~CFS_PAGE_MASK;
+	policy->l_extent.end |= ~CFS_PAGE_MASK;
+
+	/*
+	 * kms is not valid when either object is completely fresh (so that no
+	 * locks are cached), or object was evicted. In the latter case cached
+	 * lock cannot be used, because it would prime inode state with
+	 * potentially stale LVB.
+	 */
+	if (!kms_valid)
+		goto no_match;
+
+	/* Next, search for already existing extent locks that will cover us */
+	/* If we're trying to read, we also search for an existing PW lock.  The
+	 * VFS and page cache already protect us locally, so lots of readers/
+	 * writers can share a single PW lock.
+	 *
+	 * There are problems with conversion deadlocks, so instead of
+	 * converting a read lock to a write lock, we'll just enqueue a new
+	 * one.
+	 *
+	 * At some point we should cancel the read lock instead of making them
+	 * send us a blocking callback, but there are problems with canceling
+	 * locks out from other users right now, too. */
+	mode = einfo->ei_mode;
+	if (einfo->ei_mode == LCK_PR)
+		mode |= LCK_PW;
+	mode = ldlm_lock_match(obd->obd_namespace, *flags | match_lvb, res_id,
+			       einfo->ei_type, policy, mode, lockh, 0);
+	if (mode) {
+		/* NOTE(review): matched is dereferenced below without a NULL
+		 * check; confirm ldlm_handle2lock() cannot fail right after
+		 * a successful ldlm_lock_match(). */
+		struct ldlm_lock *matched = ldlm_handle2lock(lockh);
+
+		if ((agl != 0) && !(matched->l_flags & LDLM_FL_LVB_READY)) {
+			/* For AGL, if enqueue RPC is sent but the lock is not
+			 * granted, then skip to process this stripe.
+			 * Return -ECANCELED to tell the caller. */
+			ldlm_lock_decref(lockh, mode);
+			LDLM_LOCK_PUT(matched);
+			RETURN(-ECANCELED);
+		} else if (osc_set_lock_data_with_check(matched, einfo)) {
+			*flags |= LDLM_FL_LVB_READY;
+			/* addref the lock only if not async requests and PW
+			 * lock is matched whereas we asked for PR. */
+			if (!rqset && einfo->ei_mode != mode)
+				ldlm_lock_addref(lockh, LCK_PR);
+			if (intent) {
+				/* I would like to be able to ASSERT here that
+				 * rss <= kms, but I can't, for reasons which
+				 * are explained in lov_enqueue() */
+			}
+
+			/* We already have a lock, and it's referenced.
+			 *
+			 * At this point, the cl_lock::cll_state is CLS_QUEUING,
+			 * AGL upcall may change it to CLS_HELD directly. */
+			(*upcall)(cookie, ELDLM_OK);
+
+			if (einfo->ei_mode != mode)
+				ldlm_lock_decref(lockh, LCK_PW);
+			else if (rqset)
+				/* For async requests, decref the lock. */
+				ldlm_lock_decref(lockh, einfo->ei_mode);
+			LDLM_LOCK_PUT(matched);
+			RETURN(ELDLM_OK);
+		} else {
+			/* the matched lock carries someone else's callback
+			 * data: release it and fall through to a new enqueue */
+			ldlm_lock_decref(lockh, mode);
+			LDLM_LOCK_PUT(matched);
+		}
+	}
+
+ no_match:
+	if (intent) {
+		LIST_HEAD(cancels);
+		req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+					   &RQF_LDLM_ENQUEUE_LVB);
+		if (req == NULL)
+			RETURN(-ENOMEM);
+
+		rc = ldlm_prep_enqueue_req(exp, req, &cancels, 0);
+		if (rc) {
+			ptlrpc_request_free(req);
+			RETURN(rc);
+		}
+
+		req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
+				     sizeof *lvb);
+		ptlrpc_request_set_replen(req);
+	}
+
+	/* users of osc_enqueue() can pass this flag for ldlm_lock_match() */
+	*flags &= ~LDLM_FL_BLOCK_GRANTED;
+
+	rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, lvb,
+			      sizeof(*lvb), LVB_T_OST, lockh, async);
+	if (rqset) {
+		if (!rc) {
+			/* stash everything osc_enqueue_interpret() needs in
+			 * the request's async args */
+			struct osc_enqueue_args *aa;
+			CLASSERT (sizeof(*aa) <= sizeof(req->rq_async_args));
+			aa = ptlrpc_req_async_args(req);
+			aa->oa_ei = einfo;
+			aa->oa_exp = exp;
+			aa->oa_flags  = flags;
+			aa->oa_upcall = upcall;
+			aa->oa_cookie = cookie;
+			aa->oa_lvb    = lvb;
+			aa->oa_lockh  = lockh;
+			aa->oa_agl    = !!agl;
+
+			req->rq_interpret_reply =
+				(ptlrpc_interpterer_t)osc_enqueue_interpret;
+			if (rqset == PTLRPCD_SET)
+				ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+			else
+				ptlrpc_set_add_req(rqset, req);
+		} else if (intent) {
+			/* the intent request was allocated above */
+			ptlrpc_req_finished(req);
+		}
+		RETURN(rc);
+	}
+
+	/* synchronous path: complete the enqueue inline */
+	rc = osc_enqueue_fini(req, lvb, upcall, cookie, flags, agl, rc);
+	if (intent)
+		ptlrpc_req_finished(req);
+
+	RETURN(rc);
+}
+
+/*
+ * obd_info based front end to osc_enqueue_base().
+ *
+ * Builds the resource name from the stripe metadata in oinfo and enqueues a
+ * lock for the first stripe (lsm_oinfo[0]) using oinfo's flags, policy,
+ * upcall and lock handle.  The request is asynchronous exactly when rqset
+ * is non-NULL; AGL is never requested from here.
+ */
+static int osc_enqueue(struct obd_export *exp, struct obd_info *oinfo,
+		       struct ldlm_enqueue_info *einfo,
+		       struct ptlrpc_request_set *rqset)
+{
+	struct lov_stripe_md *lsm = oinfo->oi_md;
+	struct ldlm_res_id name;
+	int rc;
+	ENTRY;
+
+	ostid_build_res_name(&lsm->lsm_oi, &name);
+
+	rc = osc_enqueue_base(exp, &name, &oinfo->oi_flags, &oinfo->oi_policy,
+			      &lsm->lsm_oinfo[0]->loi_lvb,
+			      lsm->lsm_oinfo[0]->loi_kms_valid,
+			      oinfo->oi_cb_up, oinfo, einfo, oinfo->oi_lockh,
+			      rqset, rqset != NULL, 0);
+	RETURN(rc);
+}
+
+/*
+ * Try to match an already granted lock covering the given extent.
+ *
+ * The extent in *policy is rounded out to page boundaries in place.  A PR
+ * request is also allowed to match a PW lock.  When data is non-NULL, the
+ * matched lock must pass osc_set_data_with_check(); otherwise the match is
+ * dropped and 0 is returned.  When a PW lock satisfied a request for a
+ * different mode and LDLM_FL_TEST_LOCK is not set, the PW reference is
+ * exchanged for a PR one.
+ *
+ * Returns the matched mode (non-zero), 0 if nothing usable matched, or -EIO
+ * when the OBD_FAIL_OSC_MATCH fail point fires.
+ */
+int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id,
+		   __u32 type, ldlm_policy_data_t *policy, __u32 mode,
+		   int *flags, void *data, struct lustre_handle *lockh,
+		   int unref)
+{
+	struct obd_device *obd = exp->exp_obd;
+	int lflags = *flags;
+	ldlm_mode_t rc;
+	ENTRY;
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_OSC_MATCH))
+		RETURN(-EIO);
+
+	/* Filesystem lock extents are extended to page boundaries so that
+	 * dealing with the page cache is a little smoother */
+	policy->l_extent.start -= policy->l_extent.start & ~CFS_PAGE_MASK;
+	policy->l_extent.end |= ~CFS_PAGE_MASK;
+
+	/* Next, search for already existing extent locks that will cover us */
+	/* If we're trying to read, we also search for an existing PW lock.  The
+	 * VFS and page cache already protect us locally, so lots of readers/
+	 * writers can share a single PW lock. */
+	rc = mode;
+	if (mode == LCK_PR)
+		rc |= LCK_PW;
+	rc = ldlm_lock_match(obd->obd_namespace, lflags,
+			     res_id, type, policy, rc, lockh, unref);
+	if (rc) {
+		if (data != NULL) {
+			/* NOTE(review): data is a void * passed where
+			 * osc_set_data_with_check() expects a
+			 * struct ldlm_enqueue_info * and reads ->ei_cbdata
+			 * from it; confirm callers pass an enqueue info and
+			 * not the raw callback data. */
+			if (!osc_set_data_with_check(lockh, data)) {
+				if (!(lflags & LDLM_FL_TEST_LOCK))
+					ldlm_lock_decref(lockh, rc);
+				RETURN(0);
+			}
+		}
+		/* take the PR reference before dropping the PW one */
+		if (!(lflags & LDLM_FL_TEST_LOCK) && mode != rc) {
+			ldlm_lock_addref(lockh, LCK_PR);
+			ldlm_lock_decref(lockh, LCK_PW);
+		}
+		RETURN(rc);
+	}
+	RETURN(rc);
+}
+
+/*
+ * Drop one reference of the given mode on the lock behind lockh.  Group
+ * locks (LCK_GROUP) go through ldlm_lock_decref_and_cancel(); every other
+ * mode uses ldlm_lock_decref().  Always returns 0.
+ */
+int osc_cancel_base(struct lustre_handle *lockh, __u32 mode)
+{
+	ENTRY;
+
+	if (unlikely(mode == LCK_GROUP)) {
+		ldlm_lock_decref_and_cancel(lockh, mode);
+		RETURN(0);
+	}
+
+	ldlm_lock_decref(lockh, mode);
+	RETURN(0);
+}
+
+/*
+ * Thin wrapper around osc_cancel_base(); exp and md are not used.
+ */
+static int osc_cancel(struct obd_export *exp, struct lov_stripe_md *md,
+		      __u32 mode, struct lustre_handle *lockh)
+{
+	int rc;
+	ENTRY;
+
+	rc = osc_cancel_base(lockh, mode);
+	RETURN(rc);
+}
+
+/*
+ * Cancel unused locks in this osc's namespace.  With a non-NULL lsm only
+ * the resource named after lsm->lsm_oi is targeted; with a NULL lsm a NULL
+ * resource id is passed to ldlm_cli_cancel_unused().
+ */
+static int osc_cancel_unused(struct obd_export *exp,
+			     struct lov_stripe_md *lsm,
+			     ldlm_cancel_flags_t flags,
+			     void *opaque)
+{
+	struct obd_device *dev = class_exp2obd(exp);
+	struct ldlm_res_id name;
+
+	if (lsm == NULL)
+		return ldlm_cli_cancel_unused(dev->obd_namespace, NULL, flags,
+					      opaque);
+
+	ostid_build_res_name(&lsm->lsm_oi, &name);
+	return ldlm_cli_cancel_unused(dev->obd_namespace, &name, flags,
+				      opaque);
+}
+
+/*
+ * Reply interpreter for the asynchronous statfs request built by
+ * osc_statfs_async().
+ *
+ * -EBADR is returned untouched, without calling the upcall.  -ENOTCONN and
+ * -EAGAIN are turned into success when OBD_STATFS_NODELAY was requested.
+ * On success the OBD_STATFS reply buffer is copied into oi_osfs.  In every
+ * case other than -EBADR the oi_cb_up callback is invoked with the final
+ * status and its return value is returned.
+ */
+static int osc_statfs_interpret(const struct lu_env *env,
+				struct ptlrpc_request *req,
+				struct osc_async_args *aa, int rc)
+{
+	struct obd_statfs *reply;
+	ENTRY;
+
+	/* The request has in fact never been sent due to issues at a higher
+	 * level (LOV).  Exit immediately since the caller is aware of the
+	 * problem and takes care of the clean up */
+	if (rc == -EBADR)
+		RETURN(rc);
+
+	if (rc == -ENOTCONN || rc == -EAGAIN) {
+		if (aa->aa_oi->oi_flags & OBD_STATFS_NODELAY)
+			GOTO(out, rc = 0);
+	}
+
+	if (rc != 0)
+		GOTO(out, rc);
+
+	reply = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
+	if (reply == NULL)
+		GOTO(out, rc = -EPROTO);
+
+	*aa->aa_oi->oi_osfs = *reply;
+out:
+	rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc);
+	RETURN(rc);
+}
+
+/*
+ * Build an OST_STATFS request and add it to rqset; the reply is handled by
+ * osc_statfs_interpret() with oinfo as its context.
+ *
+ * With OBD_STATFS_NODELAY set in oinfo->oi_flags the request is marked
+ * no-resend/no-delay.  max_age is currently unused.
+ *
+ * Returns 0 once the request is queued, -ENOMEM if it cannot be allocated,
+ * or the error from ptlrpc_request_pack().
+ */
+static int osc_statfs_async(struct obd_export *exp,
+			    struct obd_info *oinfo, __u64 max_age,
+			    struct ptlrpc_request_set *rqset)
+{
+	struct obd_device     *obd = class_exp2obd(exp);
+	struct ptlrpc_request *req;
+	struct osc_async_args *aa;
+	int rc;
+	ENTRY;
+
+	/* max_age could be sent along (as an absolute timestamp or as
+	 * "seconds.usec ago") so that the target can skip extra calls into
+	 * the filesystem when they are not needed, which would help a bit
+	 * during mount.  Relative timestamps suffer when request processing
+	 * is slow; absolute ones need time synchronization. */
+	req = ptlrpc_request_alloc(obd->u.cli.cl_import, &RQF_OST_STATFS);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS);
+	if (rc != 0) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	ptlrpc_request_set_replen(req);
+	req->rq_request_portal = OST_CREATE_PORTAL;
+	ptlrpc_at_set_req_timeout(req);
+
+	/* procfs requests must not wait in statfs, to avoid a deadlock */
+	if (oinfo->oi_flags & OBD_STATFS_NODELAY) {
+		req->rq_no_resend = 1;
+		req->rq_no_delay = 1;
+	}
+
+	CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+	aa = ptlrpc_req_async_args(req);
+	aa->aa_oi = oinfo;
+	req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret;
+
+	ptlrpc_set_add_req(rqset, req);
+	RETURN(0);
+}
+
+/*
+ * Synchronous statfs: send OST_STATFS and copy the reply into *osfs.
+ *
+ * The import is looked up under cl_sem and pinned with class_import_get()
+ * only for the duration of the request allocation.  With
+ * OBD_STATFS_NODELAY in flags the request is marked no-resend/no-delay.
+ * max_age is currently unused.
+ *
+ * Returns 0 on success, -ENODEV when there is no import, -ENOMEM when the
+ * request cannot be allocated, -EPROTO when the reply has no statfs
+ * buffer, or the error from packing/sending the request.
+ */
+static int osc_statfs(const struct lu_env *env, struct obd_export *exp,
+		      struct obd_statfs *osfs, __u64 max_age, __u32 flags)
+{
+	struct obd_device     *obd = class_exp2obd(exp);
+	struct obd_statfs     *reply;
+	struct ptlrpc_request *req;
+	struct obd_import     *imp;
+	int rc;
+	ENTRY;
+
+	/* The request may also originate from lprocfs, so the import lookup
+	 * is synchronized with client_disconnect_export (Bug15684). */
+	down_read(&obd->u.cli.cl_sem);
+	imp = obd->u.cli.cl_import ?
+	      class_import_get(obd->u.cli.cl_import) : NULL;
+	up_read(&obd->u.cli.cl_sem);
+	if (imp == NULL)
+		RETURN(-ENODEV);
+
+	/* max_age could be sent along (as an absolute timestamp or as
+	 * "seconds.usec ago") so that the target can skip extra calls into
+	 * the filesystem when they are not needed, which would help a bit
+	 * during mount.  Relative timestamps suffer when request processing
+	 * is slow; absolute ones need time synchronization. */
+	req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS);
+
+	class_import_put(imp);
+
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS);
+	if (rc != 0) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	ptlrpc_request_set_replen(req);
+	req->rq_request_portal = OST_CREATE_PORTAL;
+	ptlrpc_at_set_req_timeout(req);
+
+	/* procfs requests must not wait in statfs, to avoid a deadlock */
+	if (flags & OBD_STATFS_NODELAY) {
+		req->rq_no_resend = 1;
+		req->rq_no_delay = 1;
+	}
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc != 0)
+		GOTO(out, rc);
+
+	reply = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
+	if (reply == NULL)
+		GOTO(out, rc = -EPROTO);
+
+	*osfs = *reply;
+
+	EXIT;
+ out:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/* Retrieve object striping information.
+ *
+ * @lsm is the in-core stripe metadata of the object; -ENODATA is returned
+ * when it is NULL.
+ * @lump is a pointer to a user-space struct with lmm_stripe_count indicating
+ * the maximum number of OST indices which will fit in the user buffer.
+ * lmm_magic must be LOV_USER_MAGIC_V1 or LOV_USER_MAGIC_V3 (we only use
+ * 1 slot here), otherwise -EINVAL is returned.
+ *
+ * Returns 0 on success, -EFAULT if user memory cannot be read or written,
+ * -ENOMEM if the reply buffer cannot be allocated.
+ */
+static int osc_getstripe(struct lov_stripe_md *lsm, struct lov_user_md *lump)
+{
+	/* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */
+	struct lov_user_md_v3 lum, *lumk;
+	struct lov_user_ost_data_v1 *lmm_objects;
+	int rc = 0, lum_size;
+	ENTRY;
+
+	if (!lsm)
+		RETURN(-ENODATA);
+
+	/* Only the v1-sized header is filled from user space below, yet for
+	 * a V3 request without stripes the whole v3-sized lum is copied back
+	 * out.  Zero it first so the tail beyond the v1 header never carries
+	 * uninitialized kernel stack contents to user space. */
+	memset(&lum, 0, sizeof(lum));
+
+	/* we only need the header part from user space to get lmm_magic and
+	 * lmm_stripe_count, (the header part is common to v1 and v3) */
+	lum_size = sizeof(struct lov_user_md_v1);
+	if (copy_from_user(&lum, lump, lum_size))
+		RETURN(-EFAULT);
+
+	if ((lum.lmm_magic != LOV_USER_MAGIC_V1) &&
+	    (lum.lmm_magic != LOV_USER_MAGIC_V3))
+		RETURN(-EINVAL);
+
+	/* lov_user_md_vX and lov_mds_md_vX must have the same size */
+	LASSERT(sizeof(struct lov_user_md_v1) == sizeof(struct lov_mds_md_v1));
+	LASSERT(sizeof(struct lov_user_md_v3) == sizeof(struct lov_mds_md_v3));
+	LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lumk->lmm_objects[0]));
+
+	/* we can use lov_mds_md_size() to compute lum_size
+	 * because lov_user_md_vX and lov_mds_md_vX have the same size */
+	if (lum.lmm_stripe_count > 0) {
+		lum_size = lov_mds_md_size(lum.lmm_stripe_count, lum.lmm_magic);
+		OBD_ALLOC(lumk, lum_size);
+		if (!lumk)
+			RETURN(-ENOMEM);
+
+		/* the object array sits at a different offset in v1 and v3 */
+		if (lum.lmm_magic == LOV_USER_MAGIC_V1)
+			lmm_objects =
+			    &(((struct lov_user_md_v1 *)lumk)->lmm_objects[0]);
+		else
+			lmm_objects = &(lumk->lmm_objects[0]);
+		lmm_objects->l_ost_oi = lsm->lsm_oi;
+	} else {
+		/* no room for object entries: answer with the header only,
+		 * reusing the on-stack copy */
+		lum_size = lov_mds_md_size(0, lum.lmm_magic);
+		lumk = &lum;
+	}
+
+	lumk->lmm_oi = lsm->lsm_oi;
+	lumk->lmm_stripe_count = 1;
+
+	if (copy_to_user(lump, lumk, lum_size))
+		rc = -EFAULT;
+
+	if (lumk != &lum)
+		OBD_FREE(lumk, lum_size);
+
+	RETURN(rc);
+}
+
+
+/*
+ * ioctl dispatcher for the osc device.
+ *
+ * A module reference is held for the duration of the call; -EINVAL is
+ * returned if it cannot be taken.  karg is the kernel copy of the ioctl
+ * argument and uarg the user-space pointer; which one a command uses is
+ * shown per case below.  Unknown commands yield -ENOTTY.
+ *
+ * Returns 0 or a negative errno; positive results from
+ * obd_alloc_memmd() and ptlrpc_recover_import() are mapped to 0.
+ */
+static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
+			 void *karg, void *uarg)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct obd_ioctl_data *data = karg;
+	int err = 0;
+	ENTRY;
+
+	if (!try_module_get(THIS_MODULE)) {
+		CERROR("Can't get module. Is it alive?\n");
+		return -EINVAL;
+	}
+	switch (cmd) {
+	case OBD_IOC_LOV_GET_CONFIG: {
+		char *buf;
+		struct lov_desc *desc;
+		struct obd_uuid uuid;
+
+		/* re-read the ioctl data from user space into a fresh
+		 * buffer; the incoming len is overwritten */
+		buf = NULL;
+		len = 0;
+		if (obd_ioctl_getdata(&buf, &len, (void *)uarg))
+			GOTO(out, err = -EINVAL);
+
+		data = (struct obd_ioctl_data *)buf;
+
+		/* both inline buffers must be large enough for what is
+		 * written into them below */
+		if (sizeof(*desc) > data->ioc_inllen1) {
+			obd_ioctl_freedata(buf, len);
+			GOTO(out, err = -EINVAL);
+		}
+
+		if (data->ioc_inllen2 < sizeof(uuid)) {
+			obd_ioctl_freedata(buf, len);
+			GOTO(out, err = -EINVAL);
+		}
+
+		/* describe this osc as a LOV with a single target */
+		desc = (struct lov_desc *)data->ioc_inlbuf1;
+		desc->ld_tgt_count = 1;
+		desc->ld_active_tgt_count = 1;
+		desc->ld_default_stripe_count = 1;
+		desc->ld_default_stripe_size = 0;
+		desc->ld_default_stripe_offset = 0;
+		desc->ld_pattern = 0;
+		memcpy(&desc->ld_uuid, &obd->obd_uuid, sizeof(uuid));
+
+		memcpy(data->ioc_inlbuf2, &obd->obd_uuid, sizeof(uuid));
+
+		err = copy_to_user((void *)uarg, buf, len);
+		if (err)
+			err = -EFAULT;
+		obd_ioctl_freedata(buf, len);
+		GOTO(out, err);
+	}
+	case LL_IOC_LOV_SETSTRIPE:
+		err = obd_alloc_memmd(exp, karg);
+		if (err > 0)
+			err = 0;
+		GOTO(out, err);
+	case LL_IOC_LOV_GETSTRIPE:
+		err = osc_getstripe(karg, uarg);
+		GOTO(out, err);
+	case OBD_IOC_CLIENT_RECOVER:
+		err = ptlrpc_recover_import(obd->u.cli.cl_import,
+					    data->ioc_inlbuf1, 0);
+		if (err > 0)
+			err = 0;
+		GOTO(out, err);
+	case IOC_OSC_SET_ACTIVE:
+		err = ptlrpc_set_import_active(obd->u.cli.cl_import,
+					       data->ioc_offset);
+		GOTO(out, err);
+	case OBD_IOC_POLL_QUOTACHECK:
+		err = osc_quota_poll_check(exp, (struct if_quotacheck *)karg);
+		GOTO(out, err);
+	case OBD_IOC_PING_TARGET:
+		err = ptlrpc_obd_ping(obd);
+		GOTO(out, err);
+	default:
+		CDEBUG(D_INODE, "unrecognised ioctl %#x by %s\n",
+		       cmd, current_comm());
+		GOTO(out, err = -ENOTTY);
+	}
+out:
+	module_put(THIS_MODULE);
+	return err;
+}
+
+/*
+ * Answer a get_info query identified by key/keylen.
+ *
+ * Handled keys:
+ *  KEY_LOCK_TO_STRIPE - answered locally: stores stripe number 0 in val and
+ *			 sets *vallen to its size.
+ *  KEY_LAST_ID	       - sends OST_GET_INFO (no delay, no resend) and copies
+ *			 the obd_id from the reply into val.
+ *  KEY_FIEMAP	       - sends OST_GET_INFO carrying val as the request
+ *			 payload and copies *vallen bytes of the reply back
+ *			 into val.
+ *
+ * Returns 0 on success, -EFAULT if vallen or val is NULL, -ENOMEM if a
+ * request cannot be allocated, -EPROTO if the reply lacks the expected
+ * buffer, the packing/RPC error otherwise, and -EINVAL for any other key.
+ *
+ * NOTE(review): KEY_IS() presumably tests the local key/keylen against the
+ * named constant; its definition is not visible here.
+ */
+static int osc_get_info(const struct lu_env *env, struct obd_export *exp,
+			obd_count keylen, void *key, __u32 *vallen, void *val,
+			struct lov_stripe_md *lsm)
+{
+	ENTRY;
+	if (!vallen || !val)
+		RETURN(-EFAULT);
+
+	if (KEY_IS(KEY_LOCK_TO_STRIPE)) {
+		__u32 *stripe = val;
+		*vallen = sizeof(*stripe);
+		*stripe = 0;
+		RETURN(0);
+	} else if (KEY_IS(KEY_LAST_ID)) {
+		struct ptlrpc_request *req;
+		obd_id		*reply;
+		char		  *tmp;
+		int		    rc;
+
+		req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+					   &RQF_OST_GET_INFO_LAST_ID);
+		if (req == NULL)
+			RETURN(-ENOMEM);
+
+		req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
+				     RCL_CLIENT, keylen);
+		rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO);
+		if (rc) {
+			ptlrpc_request_free(req);
+			RETURN(rc);
+		}
+
+		/* the key itself is the only request payload */
+		tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
+		memcpy(tmp, key, keylen);
+
+		req->rq_no_delay = req->rq_no_resend = 1;
+		ptlrpc_request_set_replen(req);
+		rc = ptlrpc_queue_wait(req);
+		if (rc)
+			GOTO(out, rc);
+
+		reply = req_capsule_server_get(&req->rq_pill, &RMF_OBD_ID);
+		if (reply == NULL)
+			GOTO(out, rc = -EPROTO);
+
+		/* NOTE(review): *vallen is not checked against
+		 * sizeof(obd_id) before this store; confirm callers always
+		 * supply a large enough buffer. */
+		*((obd_id *)val) = *reply;
+	out:
+		ptlrpc_req_finished(req);
+		RETURN(rc);
+	} else if (KEY_IS(KEY_FIEMAP)) {
+		struct ptlrpc_request *req;
+		struct ll_user_fiemap *reply;
+		char *tmp;
+		int rc;
+
+		req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+					   &RQF_OST_GET_INFO_FIEMAP);
+		if (req == NULL)
+			RETURN(-ENOMEM);
+
+		/* the value buffer is sized *vallen in both directions */
+		req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_KEY,
+				     RCL_CLIENT, keylen);
+		req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL,
+				     RCL_CLIENT, *vallen);
+		req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL,
+				     RCL_SERVER, *vallen);
+
+		rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO);
+		if (rc) {
+			ptlrpc_request_free(req);
+			RETURN(rc);
+		}
+
+		tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_KEY);
+		memcpy(tmp, key, keylen);
+		tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_VAL);
+		memcpy(tmp, val, *vallen);
+
+		ptlrpc_request_set_replen(req);
+		rc = ptlrpc_queue_wait(req);
+		if (rc)
+			GOTO(out1, rc);
+
+		reply = req_capsule_server_get(&req->rq_pill, &RMF_FIEMAP_VAL);
+		if (reply == NULL)
+			GOTO(out1, rc = -EPROTO);
+
+		memcpy(val, reply, *vallen);
+	out1:
+		ptlrpc_req_finished(req);
+
+		RETURN(rc);
+	}
+
+	RETURN(-EINVAL);
+}
+
+/*
+ * Apply a set_info request identified by key/keylen.
+ *
+ * Keys handled locally (no RPC, return 0 unless noted):
+ *  KEY_CHECKSUM	 - vallen must be sizeof(int) (-EINVAL otherwise);
+ *			   sets cl_checksum to 0 or 1.
+ *  KEY_SPTLRPC_CONF	 - calls sptlrpc_conf_client_adapt().
+ *  KEY_FLUSH_CTX	 - calls sptlrpc_import_flush_my_ctx().
+ *  KEY_CACHE_SET	 - attaches the client cache passed in val; asserted
+ *			   to happen only once.
+ *  KEY_CACHE_LRU_SHRINK - shrinks the LRU and subtracts the number
+ *			   returned by osc_lru_shrink() from *(int *)val.
+ *
+ * Every other key is packed into an OST_SET_INFO request.  KEY_GRANT_SHRINK
+ * is sent through ptlrpcd with osc_shrink_grant_interpret() as its reply
+ * handler; all other keys require a non-NULL set (-EINVAL otherwise) and
+ * are added to it.
+ *
+ * Returns 0 once handled or queued, -EINVAL as described above, -ENOMEM on
+ * allocation failure, or the error from ptlrpc_request_pack().
+ */
+static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
+			      obd_count keylen, void *key, obd_count vallen,
+			      void *val, struct ptlrpc_request_set *set)
+{
+	struct ptlrpc_request *req;
+	struct obd_device     *obd = exp->exp_obd;
+	struct obd_import     *imp = class_exp2cliimp(exp);
+	char		  *tmp;
+	int		    rc;
+	ENTRY;
+
+	OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SHUTDOWN, 10);
+
+	if (KEY_IS(KEY_CHECKSUM)) {
+		if (vallen != sizeof(int))
+			RETURN(-EINVAL);
+		exp->exp_obd->u.cli.cl_checksum = (*(int *)val) ? 1 : 0;
+		RETURN(0);
+	}
+
+	if (KEY_IS(KEY_SPTLRPC_CONF)) {
+		sptlrpc_conf_client_adapt(obd);
+		RETURN(0);
+	}
+
+	if (KEY_IS(KEY_FLUSH_CTX)) {
+		sptlrpc_import_flush_my_ctx(imp);
+		RETURN(0);
+	}
+
+	if (KEY_IS(KEY_CACHE_SET)) {
+		struct client_obd *cli = &obd->u.cli;
+
+		LASSERT(cli->cl_cache == NULL); /* only once */
+		cli->cl_cache = (struct cl_client_cache *)val;
+		atomic_inc(&cli->cl_cache->ccc_users);
+		cli->cl_lru_left = &cli->cl_cache->ccc_lru_left;
+
+		/* add this osc into entity list */
+		LASSERT(list_empty(&cli->cl_lru_osc));
+		spin_lock(&cli->cl_cache->ccc_lru_lock);
+		list_add(&cli->cl_lru_osc, &cli->cl_cache->ccc_lru);
+		spin_unlock(&cli->cl_cache->ccc_lru_lock);
+
+		RETURN(0);
+	}
+
+	if (KEY_IS(KEY_CACHE_LRU_SHRINK)) {
+		struct client_obd *cli = &obd->u.cli;
+		/* ask for at most half of the pages currently in the LRU */
+		int nr = atomic_read(&cli->cl_lru_in_list) >> 1;
+		int target = *(int *)val;
+
+		nr = osc_lru_shrink(cli, min(nr, target));
+		*(int *)val -= nr;
+		RETURN(0);
+	}
+
+	/* everything below needs a request set, except grant shrink which
+	 * goes through ptlrpcd */
+	if (!set && !KEY_IS(KEY_GRANT_SHRINK))
+		RETURN(-EINVAL);
+
+	/* We pass all other commands directly to OST. Since nobody calls osc
+	   methods directly and everybody is supposed to go through LOV, we
+	   assume lov checked invalid values for us.
+	   The only recognised values so far are evict_by_nid and mds_conn.
+	   Even if something bad goes through, we'd get a -EINVAL from OST
+	   anyway. */
+
+	req = ptlrpc_request_alloc(imp, KEY_IS(KEY_GRANT_SHRINK) ?
+						&RQF_OST_SET_GRANT_INFO :
+						&RQF_OBD_SET_INFO);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
+			     RCL_CLIENT, keylen);
+	if (!KEY_IS(KEY_GRANT_SHRINK))
+		req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL,
+				     RCL_CLIENT, vallen);
+	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SET_INFO);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	/* copy the key, then the value into the field matching the request
+	 * format chosen above */
+	tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
+	memcpy(tmp, key, keylen);
+	tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ?
+							&RMF_OST_BODY :
+							&RMF_SETINFO_VAL);
+	/* NOTE(review): for KEY_GRANT_SHRINK the RMF_OST_BODY size is not set
+	 * from vallen above; confirm vallen always matches that field. */
+	memcpy(tmp, val, vallen);
+
+	if (KEY_IS(KEY_GRANT_SHRINK)) {
+		struct osc_grant_args *aa;
+		struct obdo *oa;
+
+		CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+		aa = ptlrpc_req_async_args(req);
+		OBDO_ALLOC(oa);
+		if (!oa) {
+			ptlrpc_req_finished(req);
+			RETURN(-ENOMEM);
+		}
+		/* keep a private copy of the obdo for the reply handler */
+		*oa = ((struct ost_body *)val)->oa;
+		aa->aa_oa = oa;
+		req->rq_interpret_reply = osc_shrink_grant_interpret;
+	}
+
+	ptlrpc_request_set_replen(req);
+	if (!KEY_IS(KEY_GRANT_SHRINK)) {
+		LASSERT(set != NULL);
+		ptlrpc_set_add_req(set, req);
+		ptlrpc_check_set(NULL, set);
+	} else
+		ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+
+	RETURN(0);
+}
+
+
+/*
+ * llog initialization hook.  Not meant to be reached: the body triggers
+ * LBUG() unconditionally; the return statement only satisfies the
+ * signature.
+ */
+static int osc_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+			 struct obd_device *disk_obd, int *index)
+{
+	/* not supposed to be used with LOD/OSP; slated for removal */
+	LBUG();
+
+	return 0;
+}
+
+/*
+ * Tear down the llog contexts of this device.  The
+ * LLOG_MDS_OST_ORIG_CTXT context, if present, has its catalog handle
+ * closed before cleanup; the LLOG_SIZE_REPL_CTXT context is only cleaned
+ * up.  count is unused.  Always returns 0.
+ */
+static int osc_llog_finish(struct obd_device *obd, int count)
+{
+	struct llog_ctxt *orig_ctxt;
+	struct llog_ctxt *repl_ctxt;
+
+	ENTRY;
+
+	orig_ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT);
+	if (orig_ctxt != NULL) {
+		llog_cat_close(NULL, orig_ctxt->loc_handle);
+		llog_cleanup(NULL, orig_ctxt);
+	}
+
+	repl_ctxt = llog_get_context(obd, LLOG_SIZE_REPL_CTXT);
+	if (repl_ctxt != NULL)
+		llog_cleanup(NULL, repl_ctxt);
+
+	RETURN(0);
+}
+
+/*
+ * Prepare connect data for a reconnect.
+ *
+ * When data is supplied and OBD_CONNECT_GRANT is set in its connect flags,
+ * ocd_grant is filled in, under cl_loi_list_lock, with the sum of the
+ * available grant and the dirty amount, or with twice the BRW size when
+ * that sum is zero.  The lost-grant counter is reset in the same critical
+ * section and its previous value is only logged.  Always returns 0.
+ */
+static int osc_reconnect(const struct lu_env *env,
+			 struct obd_export *exp, struct obd_device *obd,
+			 struct obd_uuid *cluuid,
+			 struct obd_connect_data *data,
+			 void *localdata)
+{
+	struct client_obd *cli = &obd->u.cli;
+	long lost_grant;
+
+	/* nothing to do unless grants were negotiated */
+	if (data == NULL || !(data->ocd_connect_flags & OBD_CONNECT_GRANT))
+		RETURN(0);
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	data->ocd_grant = (cli->cl_avail_grant + cli->cl_dirty) ?:
+			2 * cli_brw_size(obd);
+	lost_grant = cli->cl_lost_grant;
+	cli->cl_lost_grant = 0;
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+	CDEBUG(D_RPCTRACE, "ocd_connect_flags: "LPX64" ocd_version: %d"
+	       " ocd_grant: %d, lost: %ld.\n", data->ocd_connect_flags,
+	       data->ocd_version, data->ocd_grant, lost_grant);
+
+	RETURN(0);
+}
+
+static int osc_disconnect(struct obd_export *exp)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+	struct client_obd *cli = &obd->u.cli;
+	struct llog_ctxt  *ctxt;
+	int rc;
+
+	ctxt = llog_get_context(obd, LLOG_SIZE_REPL_CTXT);
+	if (ctxt == NULL) {
+		CDEBUG(D_HA, "No LLOG_SIZE_REPL_CTXT found in obd %p\n",
+		       obd);
+	} else {
+		/* With a single connection left (cl_conn_count == 1), flush
+		 * any remaining cancel messages out to the target. */
+		if (cli->cl_conn_count == 1)
+			llog_sync(ctxt, exp, 0);
+		llog_ctxt_put(ctxt);
+	}
+
+	rc = client_disconnect_export(exp);
+	/*
+	 * Initially we put del_shrink_grant before disconnect_export, but it
+	 * causes the following problem if setup (connect) and cleanup
+	 * (disconnect) are tangled together.
+	 *      connect p1		     disconnect p2
+	 *   ptlrpc_connect_import
+	 *     ...............	       class_manual_cleanup
+	 *				     osc_disconnect
+	 *				     del_shrink_grant
+	 *   ptlrpc_connect_interrupt
+	 *     init_grant_shrink
+	 *   add this client to shrink list
+	 *				      cleanup_osc
+	 * Bang! The pinger triggers the shrink.
+	 * So the osc must be removed from the shrink list only after we
+	 * are sure the import has been destroyed. BUG18662
+	 */
+	if (cli->cl_import == NULL)
+		osc_del_shrink_grant(cli);
+	return rc;
+}
+
+static int osc_import_event(struct obd_device *obd,
+			    struct obd_import *imp,
+			    enum obd_import_event event)
+{
+	struct client_obd *cli;
+	int rc = 0;
+	/* Handle import \a event; rc reports the handler's result. */
+	ENTRY;
+	LASSERT(imp->imp_obd == obd);
+
+	switch (event) {
+	case IMP_EVENT_DISCON: {	/* zero the available and lost grant */
+		cli = &obd->u.cli;
+		client_obd_list_lock(&cli->cl_loi_list_lock);
+		cli->cl_avail_grant = 0;
+		cli->cl_lost_grant = 0;
+		client_obd_list_unlock(&cli->cl_loi_list_lock);
+		break;
+	}
+	case IMP_EVENT_INACTIVE: {
+		rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE, NULL);
+		break;
+	}
+	case IMP_EVENT_INVALIDATE: {
+		struct ldlm_namespace *ns = obd->obd_namespace;
+		struct lu_env	 *env;
+		int		    refcheck;
+
+		env = cl_env_get(&refcheck);
+		if (!IS_ERR(env)) {
+			/* unplug I/O, then clean up the namespace */
+			cli = &obd->u.cli;
+			/* all pages go to failing rpcs due to the invalid
+			 * import */
+			osc_io_unplug(env, cli, NULL, PDL_POLICY_ROUND);
+
+			ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
+			cl_env_put(env, &refcheck);
+		} else		/* no env could be obtained: report why */
+			rc = PTR_ERR(env);
+		break;
+	}
+	case IMP_EVENT_ACTIVE: {
+		rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL);
+		break;
+	}
+	case IMP_EVENT_OCD: {
+		struct obd_connect_data *ocd = &imp->imp_connect_data;
+
+		if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT)
+			osc_init_grant(&obd->u.cli, ocd);
+
+		/* See bug 7198 */
+		if (ocd->ocd_connect_flags & OBD_CONNECT_REQPORTAL)
+			imp->imp_client->cli_request_portal =OST_REQUEST_PORTAL;
+
+		rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD, NULL);
+		break;
+	}
+	case IMP_EVENT_DEACTIVATE: {
+		rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DEACTIVATE, NULL);
+		break;
+	}
+	case IMP_EVENT_ACTIVATE: {
+		rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVATE, NULL);
+		break;
+	}
+	default:
+		CERROR("Unknown import event %d\n", event);
+		LBUG();		/* any other event value is treated as fatal */
+	}
+	RETURN(rc);
+}
+
+/**
+ * Decide whether \a lock may be canceled instead of being replayed during
+ * recovery, see bug16774 for detailed information.
+ *
+ * \retval 0 the lock can't be canceled
+ * \retval 1 the lock is unused and ok to cancel
+ */
+static int osc_cancel_for_recovery(struct ldlm_lock *lock)
+{
+	check_res_locked(lock->l_resource);
+
+	/*
+	 * Only unused extent locks granted in LCK_PR or LCK_CR mode qualify.
+	 *
+	 * XXX as a future improvement, we can also cancel unused write lock
+	 * if it doesn't have dirty data and active mmaps.
+	 */
+	if (lock->l_resource->lr_type != LDLM_EXTENT)
+		RETURN(0);
+	if (lock->l_granted_mode != LCK_PR &&
+	    lock->l_granted_mode != LCK_CR)
+		RETURN(0);
+
+	RETURN(osc_dlm_lock_pageref(lock) == 0);
+}
+
+static int brw_queue_work(const struct lu_env *env, void *data)
+{
+	/* \a data is treated as the client_obd whose I/O is to be unplugged. */
+	struct client_obd *client = data;
+
+	CDEBUG(D_CACHE, "Run writeback work for client obd %p.\n", client);
+	osc_io_unplug(env, client, NULL, PDL_POLICY_SAME);
+	RETURN(0);
+}
+
+int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	struct lprocfs_static_vars lvars = { 0 };
+	struct client_obd	  *cli = &obd->u.cli;
+	void		       *handler;
+	int			rc;
+	ENTRY;
+	/* Acquire in order: ptlrpcd ref, client obd, writeback work, quota. */
+	rc = ptlrpcd_addref();
+	if (rc)
+		RETURN(rc);
+
+	rc = client_obd_setup(obd, lcfg);
+	if (rc)
+		GOTO(out_ptlrpcd, rc);
+
+	handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli);
+	if (IS_ERR(handler))
+		GOTO(out_client_setup, rc = PTR_ERR(handler));
+	cli->cl_writeback_work = handler;
+
+	rc = osc_quota_setup(obd);
+	if (rc)
+		GOTO(out_ptlrpcd_work, rc);
+	/* procfs registration below is best-effort: its failure is ignored */
+	cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL;
+	lprocfs_osc_init_vars(&lvars);
+	if (lprocfs_obd_setup(obd, lvars.obd_vars) == 0) {
+		lproc_osc_attach_seqstat(obd);
+		sptlrpc_lprocfs_cliobd_attach(obd);
+		ptlrpc_lprocfs_register_obd(obd);
+	}
+
+	/* We need to allocate a few requests more, because
+	 * brw_interpret tries to create new requests before freeing
+	 * previous ones. Ideally we want to have 2x max_rpcs_in_flight
+	 * reserved, but I'm afraid that might be too much wasted RAM
+	 * in fact, so 2 is just my guess and still should work. */
+	cli->cl_import->imp_rq_pool =
+		ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2,
+				    OST_MAXREQSIZE,
+				    ptlrpc_add_rqs_to_pool);
+
+	INIT_LIST_HEAD(&cli->cl_grant_shrink_list);
+	ns_register_cancel(obd->obd_namespace, osc_cancel_for_recovery);
+	RETURN(rc);
+	/* Error unwind, in reverse order of acquisition. */
+out_ptlrpcd_work:
+	ptlrpcd_destroy_work(handler);
+out_client_setup:
+	client_obd_cleanup(obd);
+out_ptlrpcd:
+	ptlrpcd_decref();
+	RETURN(rc);
+}
+
+static int osc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+	struct client_obd *cli = &obd->u.cli;
+	int rc = 0;
+	ENTRY;
+
+	/* Early stage: deactivate the import and mark it not pingable. */
+	if (stage == OBD_CLEANUP_EARLY) {
+		struct obd_import *imp = cli->cl_import;
+
+		CDEBUG(D_HA, "Deactivating import %s\n", obd->obd_name);
+		/* ptlrpc_abort_inflight to stop an mds_lov_synchronize */
+		ptlrpc_deactivate_import(imp);
+		spin_lock(&imp->imp_lock);
+		imp->imp_pingable = 0;
+		spin_unlock(&imp->imp_lock);
+	} else if (stage == OBD_CLEANUP_EXPORTS) {
+		/* LU-464
+		 * for echo client, export may be on zombie list, wait for
+		 * zombie thread to cull it, because cli.cl_import will be
+		 * cleared in client_disconnect_export():
+		 *   class_export_destroy() -> obd_cleanup() ->
+		 *   echo_device_free() -> echo_client_cleanup() ->
+		 *   obd_disconnect() -> osc_disconnect() ->
+		 *   client_disconnect_export()
+		 */
+		obd_zombie_barrier();
+
+		/* Then tear down writeback work, import, procfs and llogs. */
+		if (cli->cl_writeback_work != NULL) {
+			ptlrpcd_destroy_work(cli->cl_writeback_work);
+			cli->cl_writeback_work = NULL;
+		}
+		obd_cleanup_client_import(obd);
+		ptlrpc_lprocfs_unregister_obd(obd);
+		lprocfs_obd_cleanup(obd);
+
+		rc = obd_llog_finish(obd, 0);
+		if (rc != 0)
+			CERROR("failed to cleanup llogging subsystems\n");
+	}
+
+	RETURN(rc);
+}
+
+int osc_cleanup(struct obd_device *obd)
+{
+	struct client_obd *cli = &obd->u.cli;
+	int rc;
+
+	ENTRY;
+
+	/* lru cleanup: leave the cache's LRU list, drop our user count */
+	if (cli->cl_cache != NULL) {
+		LASSERT(atomic_read(&cli->cl_cache->ccc_users) > 0);
+		spin_lock(&cli->cl_cache->ccc_lru_lock);
+		list_del_init(&cli->cl_lru_osc);
+		spin_unlock(&cli->cl_cache->ccc_lru_lock);
+		cli->cl_lru_left = NULL;
+		atomic_dec(&cli->cl_cache->ccc_users);
+		cli->cl_cache = NULL;
+	}
+
+	/* free memory of osc quota cache */
+	osc_quota_cleanup(obd);
+
+	rc = client_obd_cleanup(obd);
+	/* pairs with the ptlrpcd_addref() done in osc_setup() */
+	ptlrpcd_decref();
+	RETURN(rc);
+}
+
+int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	struct lprocfs_static_vars lvars = { 0 };
+	int rc;
+
+	/*
+	 * Every configuration command is handled the same way: it is passed
+	 * to class_process_proc_param() together with the OSC obd variable
+	 * table, so no dispatch on lcfg->lcfg_command is needed here.
+	 */
+	lprocfs_osc_init_vars(&lvars);
+	rc = class_process_proc_param(PARAM_OSC, lvars.obd_vars, lcfg, obd);
+
+	/* Positive results become 0; anything else is returned unchanged. */
+	if (rc > 0)
+		rc = 0;
+	return rc;
+}
+
+static int osc_process_config(struct obd_device *obd, obd_count len, void *buf)
+{
+	return osc_process_config_base(obd, buf);	/* \a len is not used */
+}
+
+struct obd_ops osc_obd_ops = {	/* obd methods provided by the OSC */
+	.o_owner = THIS_MODULE,
+	.o_setup = osc_setup,
+	.o_precleanup = osc_precleanup,
+	.o_cleanup = osc_cleanup,
+	.o_add_conn = client_import_add_conn,
+	.o_del_conn = client_import_del_conn,
+	.o_connect = client_connect_import,
+	.o_reconnect = osc_reconnect,
+	.o_disconnect = osc_disconnect,
+	.o_statfs = osc_statfs,
+	.o_statfs_async = osc_statfs_async,
+	.o_packmd = osc_packmd,
+	.o_unpackmd = osc_unpackmd,
+	.o_create = osc_create,
+	.o_destroy = osc_destroy,
+	.o_getattr = osc_getattr,
+	.o_getattr_async = osc_getattr_async,
+	.o_setattr = osc_setattr,
+	.o_setattr_async = osc_setattr_async,
+	.o_brw = osc_brw,
+	.o_punch = osc_punch,
+	.o_sync = osc_sync,
+	.o_enqueue = osc_enqueue,
+	.o_change_cbdata = osc_change_cbdata,
+	.o_find_cbdata = osc_find_cbdata,
+	.o_cancel = osc_cancel,
+	.o_cancel_unused = osc_cancel_unused,
+	.o_iocontrol = osc_iocontrol,
+	.o_get_info = osc_get_info,
+	.o_set_info_async = osc_set_info_async,
+	.o_import_event = osc_import_event,
+	.o_llog_init = osc_llog_init,
+	.o_llog_finish = osc_llog_finish,
+	.o_process_config = osc_process_config,
+	.o_quotactl = osc_quotactl,
+	.o_quotacheck = osc_quotacheck,
+};
+
+extern struct lu_kmem_descr osc_caches[];
+extern spinlock_t osc_ast_guard;
+extern struct lock_class_key osc_ast_guard_class;
+
+int __init osc_init(void)
+{
+	struct lprocfs_static_vars lvars = { 0 };
+	int rc;
+	ENTRY;
+	/* Module init: set up osc_caches, then register the OSC obd type.
+	 * First print an address of _any_ initialized kernel symbol from this
+	 * module, to allow debugging with gdb that doesn't support data
+	 * symbols from modules.*/
+	CDEBUG(D_INFO, "Lustre OSC module (%p).\n", &osc_caches);
+
+	rc = lu_kmem_init(osc_caches);
+	if (rc)		/* don't register a type whose caches are missing */
+		RETURN(rc);
+
+	lprocfs_osc_init_vars(&lvars);
+	rc = class_register_type(&osc_obd_ops, NULL, lvars.module_vars,
+				 LUSTRE_OSC_NAME, &osc_device_type);
+	if (rc) {
+		lu_kmem_fini(osc_caches);
+		RETURN(rc);
+	}
+
+	spin_lock_init(&osc_ast_guard);
+	lockdep_set_class(&osc_ast_guard, &osc_ast_guard_class);
+	RETURN(rc);
+}
+
+static void /*__exit*/ osc_exit(void)
+{
+	class_unregister_type(LUSTRE_OSC_NAME);	/* undo class_register_type() */
+	lu_kmem_fini(osc_caches);		/* undo lu_kmem_init() */
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Object Storage Client (OSC)");
+MODULE_LICENSE("GPL");
+/* NOTE(review): cfs_module() presumably wires up init/exit; confirm. */
+cfs_module(osc, LUSTRE_VERSION_STRING, osc_init, osc_exit);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/Makefile b/drivers/staging/lustre/lustre/ptlrpc/Makefile
new file mode 100644
index 000000000000..983eb66a554d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/Makefile
@@ -0,0 +1,23 @@
+obj-$(CONFIG_LUSTRE_FS) += ptlrpc.o
+LDLM := ../../lustre/ldlm/
+
+ldlm_objs := $(LDLM)l_lock.o $(LDLM)ldlm_lock.o
+ldlm_objs += $(LDLM)ldlm_resource.o $(LDLM)ldlm_lib.o
+ldlm_objs += $(LDLM)ldlm_plain.o $(LDLM)ldlm_extent.o
+ldlm_objs += $(LDLM)ldlm_request.o $(LDLM)ldlm_lockd.o
+ldlm_objs += $(LDLM)ldlm_flock.o $(LDLM)ldlm_inodebits.o
+ldlm_objs += $(LDLM)ldlm_pool.o
+ldlm_objs += $(LDLM)interval_tree.o
+ptlrpc_objs := client.o recover.o connection.o niobuf.o pack_generic.o
+ptlrpc_objs += events.o ptlrpc_module.o service.o pinger.o
+ptlrpc_objs += llog_net.o llog_client.o llog_server.o import.o ptlrpcd.o
+ptlrpc_objs += pers.o lproc_ptlrpc.o wiretest.o layout.o
+ptlrpc_objs += sec.o sec_bulk.o sec_gc.o sec_config.o sec_lproc.o
+ptlrpc_objs += sec_null.o sec_plain.o nrs.o nrs_fifo.o
+# ptlrpc.o combines the ldlm objects (under $(LDLM)) with the ptlrpc objects.
+ptlrpc-y := $(ldlm_objs) $(ptlrpc_objs)
+
+obj-$(CONFIG_PTLRPC_GSS) += gss/
+
+# Headers are also searched in ../include relative to this directory.
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/ptlrpc/client.c b/drivers/staging/lustre/lustre/ptlrpc/client.c
new file mode 100644
index 000000000000..22f7e654c9d8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/client.c
@@ -0,0 +1,3059 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/** Implementation of client-side PortalRPC interfaces */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+#include <lustre_ha.h>
+#include <lustre_import.h>
+#include <lustre_req_layout.h>
+
+#include "ptlrpc_internal.h"
+
+static int ptlrpc_send_new_req(struct ptlrpc_request *req);
+
+/**
+ * Set portals and name (pointer stored, not copied) of client \a cl.
+ */
+void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
+			struct ptlrpc_client *cl)
+{
+	cl->cli_name = name;
+	cl->cli_reply_portal = rep_portal;
+	cl->cli_request_portal = req_portal;
+}
+EXPORT_SYMBOL(ptlrpc_init_client);
+
+/**
+ * Return PortalRPC connection for remote uuid \a uuid, or NULL if the
+ * uuid cannot be resolved to a peer or no connection can be obtained.
+ */
+struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid)
+{
+	struct ptlrpc_connection *conn;
+	lnet_process_id_t peer;
+	lnet_nid_t self;
+	int rc;
+
+	/* ptlrpc_uuid_to_peer() initializes its 2nd parameter
+	 * before accessing its values. */
+	/* coverity[uninit_use_in_call] */
+	rc = ptlrpc_uuid_to_peer(uuid, &peer, &self);
+	if (rc != 0) {
+		CNETERR("cannot find peer %s!\n", uuid->uuid);
+		return NULL;
+	}
+
+	conn = ptlrpc_connection_get(peer, self, uuid);
+	if (conn != NULL)
+		memcpy(conn->c_remote_uuid.uuid, uuid->uuid,
+		       sizeof(conn->c_remote_uuid.uuid));
+
+	CDEBUG(D_INFO, "%s -> %p\n", uuid->uuid, conn);
+
+	return conn;
+}
+EXPORT_SYMBOL(ptlrpc_uuid_to_connection);
+
+/**
+ * Allocate a bulk descriptor with room for \a npages pages and initialize
+ * it for the sender.  Returns the descriptor, or NULL if allocation fails.
+ */
+struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw,
+					 unsigned type, unsigned portal)
+{
+	struct ptlrpc_bulk_desc *bulk;
+	int md;
+
+	OBD_ALLOC(bulk, offsetof(struct ptlrpc_bulk_desc, bd_iov[npages]));
+	if (bulk == NULL)
+		return NULL;
+
+	spin_lock_init(&bulk->bd_lock);
+	init_waitqueue_head(&bulk->bd_waitq);
+	bulk->bd_type = type;
+	bulk->bd_portal = portal;
+	bulk->bd_max_iov = npages;
+	bulk->bd_iov_count = 0;
+	bulk->bd_md_count = 0;
+	LASSERT(max_brw > 0);
+	bulk->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT);
+	/* PTLRPC_BULK_OPS_COUNT is the compile-time transfer limit for this
+	 * node. Negotiated ocd_brw_size will always be <= this number. */
+	for (md = 0; md < PTLRPC_BULK_OPS_COUNT; md++)
+		LNetInvalidateHandle(&bulk->bd_mds[md]);
+
+	return bulk;
+}
+
+/**
+ * Prepare a bulk descriptor for outgoing request \a req with room for
+ * \a npages pages. \a type is the bulk type and \a portal is where the
+ * bulk is to be sent. Used on client-side.
+ * Returns pointer to the newly allocated, initialized bulk descriptor, or
+ * NULL on error.
+ */
+struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req,
+					      unsigned npages, unsigned max_brw,
+					      unsigned type, unsigned portal)
+{
+	struct ptlrpc_bulk_desc *bulk;
+	ENTRY;
+
+	LASSERT(type == BULK_PUT_SINK || type == BULK_GET_SOURCE);
+	bulk = ptlrpc_new_bulk(npages, max_brw, type, portal);
+	if (bulk == NULL)
+		RETURN(NULL);
+
+	/* Tie the descriptor to the request and its import. */
+	bulk->bd_req = req;
+	bulk->bd_import = class_import_get(req->rq_import);
+	bulk->bd_import_generation = req->rq_import_generation;
+
+	/* Callback id: client_bulk_callback() with the descriptor as arg. */
+	bulk->bd_cbid.cbid_arg = bulk;
+	bulk->bd_cbid.cbid_fn  = client_bulk_callback;
+
+	/* This makes req own desc, and free it when she frees herself */
+	req->rq_bulk = bulk;
+	return bulk;
+}
+EXPORT_SYMBOL(ptlrpc_prep_bulk_imp);
+
+/**
+ * Append \a len bytes of \a page, starting at \a pageoffset, to bulk
+ * descriptor \a desc.  When \a pin is non-zero, page_cache_get() is
+ * called on the page first.
+ */
+void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
+			     struct page *page, int pageoffset, int len, int pin)
+{
+	LASSERT(desc->bd_iov_count < desc->bd_max_iov);
+	LASSERT(page != NULL);
+	LASSERT(pageoffset >= 0);
+	LASSERT(len > 0);
+	LASSERT(pageoffset + len <= PAGE_CACHE_SIZE);
+
+	if (pin != 0)
+		page_cache_get(page);
+
+	/* Account for the bytes, then record the page in the descriptor. */
+	desc->bd_nob += len;
+	ptlrpc_add_bulk_page(desc, page, pageoffset, len);
+}
+EXPORT_SYMBOL(__ptlrpc_prep_bulk_page);
+
+/**
+ * Uninitialize and free bulk descriptor \a desc; with \a unpin set, also
+ * release every page recorded in it.
+ * Works on bulk descriptors both from server and client side.
+ */
+void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc, int unpin)
+{
+	int idx;
+	ENTRY;
+
+	LASSERT(desc != NULL);
+	LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */
+	LASSERT(desc->bd_md_count == 0);	 /* network hands off */
+	LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL));
+
+	sptlrpc_enc_pool_put_pages(desc);
+
+	/* Exactly one of bd_export/bd_import is set (asserted above). */
+	if (desc->bd_export != NULL)
+		class_export_put(desc->bd_export);
+	else
+		class_import_put(desc->bd_import);
+
+	for (idx = 0; unpin && idx < desc->bd_iov_count; idx++)
+		page_cache_release(desc->bd_iov[idx].kiov_page);
+
+	OBD_FREE(desc, offsetof(struct ptlrpc_bulk_desc,
+				bd_iov[desc->bd_max_iov]));
+	EXIT;
+}
+EXPORT_SYMBOL(__ptlrpc_free_bulk);
+
+/**
+ * Set server timelimit for this req, i.e. how long we are willing to wait
+ * for a reply before timing out this request.
+ */
+void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req)
+{
+	LASSERT(req->rq_import);
+
+	if (!AT_OFF) {
+		/* AT settings: derive the timeout from the service estimate
+		 * kept for this request's portal */
+		struct imp_at *at = &req->rq_import->imp_at;
+		int idx = import_at_get_index(req->rq_import,
+					      req->rq_request_portal);
+		__u32 serv_est = at_get(&at->iat_service_estimate[idx]);
+
+		req->rq_timeout = at_est2timeout(serv_est);
+	} else if (req->rq_import->imp_server_timeout) {
+		/*
+		 * non-AT settings: \a imp_server_timeout means this is
+		 * reverse import and we send (currently only) ASTs to the
+		 * client and cannot afford to wait too long for the reply,
+		 * otherwise the other client (because of which we are
+		 * sending this request) would timeout waiting for us
+		 */
+		req->rq_timeout = obd_timeout / 2;
+	} else {
+		req->rq_timeout = obd_timeout;
+	}
+
+	/* We could get even fancier here, using history to predict increased
+	   loading... */
+
+	/* Let the server know what this RPC timeout is by putting it in the
+	   reqmsg */
+	lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
+}
+EXPORT_SYMBOL(ptlrpc_at_set_req_timeout);
+
+/* Adjust max service estimate for req's portal using server value \a serv_est */
+static void ptlrpc_at_adj_service(struct ptlrpc_request *req,
+				  unsigned int serv_est)
+{
+	struct imp_at *at;
+	unsigned int prev;
+	int idx;
+
+	LASSERT(req->rq_import);
+	at = &req->rq_import->imp_at;
+	idx = import_at_get_index(req->rq_import, req->rq_request_portal);
+	/* max service estimates are tracked on the server side,
+	   so just keep minimal history here */
+	prev = at_measured(&at->iat_service_estimate[idx], serv_est);
+	if (prev == 0)
+		return;
+	CDEBUG(D_ADAPTTO, "The RPC service estimate for %s ptl %d "
+	       "has changed from %d to %d\n",
+	       req->rq_import->imp_obd->obd_name,req->rq_request_portal,
+	       prev, at_get(&at->iat_service_estimate[idx]));
+}
+
+/* Expected network latency per remote node (secs); always 0 when AT_OFF */
+int ptlrpc_at_get_net_latency(struct ptlrpc_request *req)
+{
+	return AT_OFF ? 0 : at_get(&req->rq_import->imp_at.iat_net_latency);
+}
+
+/* Adjust expected network latency using the server's \a service_time */
+static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req,
+				      unsigned int service_time)
+{
+	unsigned int nl, oldnl;
+	struct imp_at *at;
+	time_t now = cfs_time_current_sec();
+
+	LASSERT(req->rq_import);
+	at = &req->rq_import->imp_at;
+	/* Network latency is total time less server processing time,
+	 * never negative, plus 1 to allow for service time rounding */
+	nl = max_t(int, now - req->rq_sent - service_time, 0) +1/*st rounding*/;
+	if (service_time > now - req->rq_sent + 3 /* bz16408 */)
+		CWARN("Reported service time %u > total measured time "
+		      CFS_DURATION_T"\n", service_time,
+		      cfs_time_sub(now, req->rq_sent));
+
+	oldnl = at_measured(&at->iat_net_latency, nl);
+	if (oldnl != 0)
+		CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s) "
+		       "has changed from %d to %d\n",
+		       req->rq_import->imp_obd->obd_name,
+		       obd_uuid2str(
+			       &req->rq_import->imp_connection->c_remote_uuid),
+		       oldnl, at_get(&at->iat_net_latency));
+}
+
+static int unpack_reply(struct ptlrpc_request *req)
+{
+	int rc = 0;
+
+	/* Unpack the reply message (skipped for the null security policy),
+	 * then its ptlrpc body; any failure is reported as -EPROTO. */
+	if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL)
+		rc = ptlrpc_unpack_rep_msg(req, req->rq_replen);
+	if (rc != 0) {
+		DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc);
+		return -EPROTO;
+	}
+	rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
+	if (rc != 0) {
+		DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc);
+		return -EPROTO;
+	}
+	return 0;
+}
+
+/**
+ * Handle an early reply message; called and returns with rq_lock held.
+ * If anything goes wrong just ignore it - same as if it never happened.
+ */
+static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req)
+{
+	struct ptlrpc_request *early_req;
+	time_t		 olddl;
+	int		    rc;
+	ENTRY;
+	/* rq_lock is dropped while the early reply is unwrapped and unpacked */
+	req->rq_early = 0;
+	spin_unlock(&req->rq_lock);
+
+	rc = sptlrpc_cli_unwrap_early_reply(req, &early_req);
+	if (rc) {
+		spin_lock(&req->rq_lock);
+		RETURN(rc);
+	}
+
+	rc = unpack_reply(early_req);
+	if (rc == 0) {
+		/* Expecting to increase the service time estimate here */
+		ptlrpc_at_adj_service(req,
+			lustre_msg_get_timeout(early_req->rq_repmsg));
+		ptlrpc_at_adj_net_latency(req,
+			lustre_msg_get_service_time(early_req->rq_repmsg));
+	}
+
+	sptlrpc_cli_finish_early_reply(early_req);
+
+	if (rc != 0) {		/* unpack_reply() failed */
+		spin_lock(&req->rq_lock);
+		RETURN(rc);
+	}
+
+	/* Adjust the local timeout for this req */
+	ptlrpc_at_set_req_timeout(req);
+
+	spin_lock(&req->rq_lock);
+	olddl = req->rq_deadline;
+	/* server assumes it now has rq_timeout from when it sent the
+	 * early reply, so client should give it at least that long. */
+	req->rq_deadline = cfs_time_current_sec() + req->rq_timeout +
+			   ptlrpc_at_get_net_latency(req);
+
+	DEBUG_REQ(D_ADAPTTO, req,
+		  "Early reply #%d, new deadline in "CFS_DURATION_T"s "
+		  "("CFS_DURATION_T"s)", req->rq_early_count,
+		  cfs_time_sub(req->rq_deadline, cfs_time_current_sec()),
+		  cfs_time_sub(req->rq_deadline, olddl));
+
+	RETURN(rc);
+}
+
+/**
+ * Wind down request pool \a pool: free every request still on its list
+ * together with its request buffer, then free the pool itself.
+ */
+void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool)
+{
+	struct ptlrpc_request *req;
+
+	LASSERT(pool != NULL);
+
+	spin_lock(&pool->prp_lock);
+	while (!list_empty(&pool->prp_req_list)) {
+		req = list_entry(pool->prp_req_list.next,
+				 struct ptlrpc_request, rq_list);
+		list_del(&req->rq_list);
+		LASSERT(req->rq_reqbuf);
+		LASSERT(req->rq_reqbuf_len == pool->prp_rq_size);
+		OBD_FREE_LARGE(req->rq_reqbuf, pool->prp_rq_size);
+		OBD_FREE(req, sizeof(*req));
+	}
+	spin_unlock(&pool->prp_lock);
+	OBD_FREE(pool, sizeof(*pool));
+}
+EXPORT_SYMBOL(ptlrpc_free_rq_pool);
+
+/**
+ * Add up to \a num_rq new requests to \a pool; stops at the first failed alloc
+ */
+void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq)
+{
+	int i;
+	int size = 1;
+	/* round the pool's request size up to a power of two */
+	while (size < pool->prp_rq_size)
+		size <<= 1;
+
+	LASSERTF(list_empty(&pool->prp_req_list) ||
+		 size == pool->prp_rq_size,
+		 "Trying to change pool size with nonempty pool "
+		 "from %d to %d bytes\n", pool->prp_rq_size, size);
+
+	spin_lock(&pool->prp_lock);
+	pool->prp_rq_size = size;
+	for (i = 0; i < num_rq; i++) {
+		struct ptlrpc_request *req;
+		struct lustre_msg *msg;
+		/* the lock is dropped around the allocations below */
+		spin_unlock(&pool->prp_lock);
+		OBD_ALLOC(req, sizeof(struct ptlrpc_request));
+		if (!req)
+			return;		/* NB: the lock is not held here */
+		OBD_ALLOC_LARGE(msg, size);
+		if (!msg) {
+			OBD_FREE(req, sizeof(struct ptlrpc_request));
+			return;
+		}
+		req->rq_reqbuf = msg;
+		req->rq_reqbuf_len = size;
+		req->rq_pool = pool;
+		spin_lock(&pool->prp_lock);
+		list_add_tail(&req->rq_list, &pool->prp_req_list);
+	}
+	spin_unlock(&pool->prp_lock);
+	return;
+}
+EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool);
+
+/**
+ * Create and initialize new request pool with given attributes:
+ * \a num_rq - initial number of requests to create for the pool
+ * \a msgsize - maximum message size possible for requests in this pool
+ * \a populate_pool - function to be called when more requests need to be added
+ *		    to the pool
+ * Returns pointer to newly created pool or NULL on error.
+ */
+struct ptlrpc_request_pool *
+ptlrpc_init_rq_pool(int num_rq, int msgsize,
+		    void (*populate_pool)(struct ptlrpc_request_pool *, int))
+{
+	struct ptlrpc_request_pool *new_pool;
+
+	OBD_ALLOC(new_pool, sizeof(*new_pool));
+	if (new_pool == NULL)
+		return NULL;
+
+	spin_lock_init(&new_pool->prp_lock);
+	INIT_LIST_HEAD(&new_pool->prp_req_list);
+	new_pool->prp_populate = populate_pool;
+	/* NB: request the next power of two when allocating with this size,
+	   because internally kernel would do exactly this */
+	new_pool->prp_rq_size = msgsize + SPTLRPC_MAX_PAYLOAD;
+
+	/* Fill the pool through the caller's populate hook. */
+	populate_pool(new_pool, num_rq);
+
+	if (!list_empty(&new_pool->prp_req_list))
+		return new_pool;
+
+	/* have not allocated a single request for the pool */
+	OBD_FREE(new_pool, sizeof(*new_pool));
+	return NULL;
+}
+EXPORT_SYMBOL(ptlrpc_init_rq_pool);
+
+/**
+ * Take one request off pool \a pool, or return NULL if \a pool is NULL or
+ * empty.  The request is zeroed except for its buffer, length and pool.
+ */
+static struct ptlrpc_request *
+ptlrpc_prep_req_from_pool(struct ptlrpc_request_pool *pool)
+{
+	struct ptlrpc_request *request = NULL;
+	struct lustre_msg *saved_buf;
+
+	if (pool == NULL)
+		return NULL;
+
+	/* See if we have anything in a pool, and bail out if nothing,
+	 * in writeout path, where this matters, this is safe to do, because
+	 * nothing is lost in this case, and when some in-flight requests
+	 * complete, this code will be called again. */
+	spin_lock(&pool->prp_lock);
+	if (likely(!list_empty(&pool->prp_req_list))) {
+		request = list_entry(pool->prp_req_list.next,
+				     struct ptlrpc_request, rq_list);
+		list_del_init(&request->rq_list);
+	}
+	spin_unlock(&pool->prp_lock);
+	if (request == NULL)
+		return NULL;
+
+	LASSERT(request->rq_reqbuf);
+	LASSERT(request->rq_pool);
+
+	/* Wipe the request but keep its preallocated buffer. */
+	saved_buf = request->rq_reqbuf;
+	memset(request, 0, sizeof(*request));
+	request->rq_pool = pool;
+	request->rq_reqbuf = saved_buf;
+	request->rq_reqbuf_len = pool->prp_rq_size;
+
+	return request;
+}
+
+/**
+ * Put freed \a request back at the tail of the pool it came from (rq_pool).
+ */
+static void __ptlrpc_free_req_to_pool(struct ptlrpc_request *request)
+{
+	struct ptlrpc_request_pool *pool = request->rq_pool;
+
+	spin_lock(&pool->prp_lock);
+	LASSERT(list_empty(&request->rq_list));	/* must be off every rq_list */
+	LASSERT(!request->rq_receiving_reply);
+	list_add_tail(&request->rq_list, &pool->prp_req_list);
+	spin_unlock(&pool->prp_lock);
+}
+
+static int __ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
+				      __u32 version, int opcode,
+				      int count, __u32 *lengths, char **bufs,
+				      struct ptlrpc_cli_ctx *ctx)
+{
+	struct obd_import  *imp = request->rq_import;
+	int		 rc;
+	ENTRY;
+	/* Take the caller's security context if given, else look one up. */
+	if (unlikely(ctx))
+		request->rq_cli_ctx = sptlrpc_cli_ctx_get(ctx);
+	else {
+		rc = sptlrpc_req_get_ctx(request);
+		if (rc)
+			GOTO(out_free, rc);
+	}
+
+	sptlrpc_req_set_flavor(request, opcode);
+
+	rc = lustre_pack_request(request, imp->imp_msg_magic, count,
+				 lengths, bufs);
+	if (rc) {
+		LASSERT(!request->rq_pool);
+		GOTO(out_ctx, rc);
+	}
+	/* Packing succeeded: fill in the remaining request state. */
+	lustre_msg_add_version(request->rq_reqmsg, version);
+	request->rq_send_state = LUSTRE_IMP_FULL;
+	request->rq_type = PTL_RPC_MSG_REQUEST;
+	request->rq_export = NULL;
+
+	request->rq_req_cbid.cbid_fn  = request_out_callback;
+	request->rq_req_cbid.cbid_arg = request;
+
+	request->rq_reply_cbid.cbid_fn  = reply_in_callback;
+	request->rq_reply_cbid.cbid_arg = request;
+
+	request->rq_reply_deadline = 0;
+	request->rq_phase = RQ_PHASE_NEW;
+	request->rq_next_phase = RQ_PHASE_UNDEFINED;
+
+	request->rq_request_portal = imp->imp_client->cli_request_portal;
+	request->rq_reply_portal = imp->imp_client->cli_reply_portal;
+
+	ptlrpc_at_set_req_timeout(request);
+
+	spin_lock_init(&request->rq_lock);
+	INIT_LIST_HEAD(&request->rq_list);
+	INIT_LIST_HEAD(&request->rq_timed_list);
+	INIT_LIST_HEAD(&request->rq_replay_list);
+	INIT_LIST_HEAD(&request->rq_ctx_chain);
+	INIT_LIST_HEAD(&request->rq_set_chain);
+	INIT_LIST_HEAD(&request->rq_history_list);
+	INIT_LIST_HEAD(&request->rq_exp_list);
+	init_waitqueue_head(&request->rq_reply_waitq);
+	init_waitqueue_head(&request->rq_set_waitq);
+	request->rq_xid = ptlrpc_next_xid();
+	atomic_set(&request->rq_refcount, 1);
+
+	lustre_msg_set_opc(request->rq_reqmsg, opcode);
+
+	RETURN(0);
+out_ctx:
+	sptlrpc_cli_ctx_put(request->rq_cli_ctx, 1);
+out_free:
+	class_import_put(imp);	/* NOTE(review): drops rq_import's ref? confirm */
+	return rc;
+}
+
+int ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
+			     __u32 version, int opcode, char **bufs,
+			     struct ptlrpc_cli_ctx *ctx)
+{
+	/* Buffer count and lengths both come from the client capsule. */
+	int nr_bufs = req_capsule_filled_sizes(&request->rq_pill, RCL_CLIENT);
+
+	return __ptlrpc_request_bufs_pack(request, version, opcode, nr_bufs,
+					  request->rq_pill.rc_area[RCL_CLIENT],
+					  bufs, ctx);
+}
+EXPORT_SYMBOL(ptlrpc_request_bufs_pack);
+
+/**
+ * Pack request buffers for network transfer, performing encryption
+ * steps if necessary.  Returns 0 or the error from packing.
+ */
+int ptlrpc_request_pack(struct ptlrpc_request *request,
+			__u32 version, int opcode)
+{
+	int rc = ptlrpc_request_bufs_pack(request, version, opcode, NULL, NULL);
+
+	if (rc != 0)
+		return rc;
+
+	/* For some old 1.8 clients (< 1.8.7), they will LASSERT the size of
+	 * ptlrpc_body sent from server equal to local ptlrpc_body size, so we
+	 * have to send old ptlrpc_body to keep interoperability with these
+	 * clients.
+	 *
+	 * Only three kinds of server->client RPCs so far:
+	 *  - LDLM_BL_CALLBACK
+	 *  - LDLM_CP_CALLBACK
+	 *  - LDLM_GL_CALLBACK
+	 *
+	 * XXX This should be removed whenever we drop the interoperability
+	 *     with these old clients.
+	 */
+	if (opcode != LDLM_BL_CALLBACK && opcode != LDLM_CP_CALLBACK &&
+	    opcode != LDLM_GL_CALLBACK)
+		return 0;
+	req_capsule_shrink(&request->rq_pill, &RMF_PTLRPC_BODY,
+			   sizeof(struct ptlrpc_body_v2), RCL_CLIENT);
+	return 0;
+}
+EXPORT_SYMBOL(ptlrpc_request_pack);
+
+/**
+ * Helper function to allocate new request on import \a imp, taking it
+ * from pool \a pool when one is provided and yields a request.
+ * Returns allocated request structure with import field filled (set from
+ * class_import_get()) or NULL on error.
+ */
+static inline
+struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp,
+					      struct ptlrpc_request_pool *pool)
+{
+	struct ptlrpc_request *request = NULL;
+
+	if (pool != NULL)
+		request = ptlrpc_prep_req_from_pool(pool);
+	if (request == NULL)
+		OBD_ALLOC_PTR(request);
+	if (request == NULL) {
+		CERROR("request allocation out of memory\n");
+		return NULL;
+	}
+
+	/* Sanity-check the import before calling class_import_get() on it. */
+	LASSERTF((unsigned long)imp > 0x1000, "%p", imp);
+	LASSERT(imp != LP_POISON);
+	LASSERTF((unsigned long)imp->imp_client > 0x1000, "%p",
+		imp->imp_client);
+	LASSERT(imp->imp_client != LP_POISON);
+
+	request->rq_import = class_import_get(imp);
+
+	return request;
+}
+
+/**
+ * Helper function for creating a request.
+ * Calls __ptlrpc_request_alloc to allocate new request structure and inits
+ * its client-side capsule (rq_pill) with the template \a format.
+ * Returns allocated request structure pointer or NULL on error.
+ */
+static struct ptlrpc_request *
+ptlrpc_request_alloc_internal(struct obd_import *imp,
+			      struct ptlrpc_request_pool * pool,
+			      const struct req_format *format)
+{
+	struct ptlrpc_request *request;
+
+	request = __ptlrpc_request_alloc(imp, pool);
+	if (request == NULL)
+		return NULL;
+
+	req_capsule_init(&request->rq_pill, request, RCL_CLIENT);
+	req_capsule_set(&request->rq_pill, format);
+	return request;
+}
+
+/**
+ * Allocate new request structure for import \a imp, without a request pool,
+ * and set up its capsule from template \a format. Returns NULL on error.
+ */
+struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp,
+					    const struct req_format *format)
+{
+	return ptlrpc_request_alloc_internal(imp, NULL, format);
+}
+EXPORT_SYMBOL(ptlrpc_request_alloc);
+
+/**
+ * Allocate new request structure for import \a imp, trying pool \a pool first,
+ * and set up its capsule from template \a format. Returns NULL on error.
+ */
+struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp,
+					    struct ptlrpc_request_pool * pool,
+					    const struct req_format *format)
+{
+	return ptlrpc_request_alloc_internal(imp, pool, format);
+}
+EXPORT_SYMBOL(ptlrpc_request_alloc_pool);
+
+/**
+ * For requests not from pool, free memory of the request structure.
+ * For requests obtained from a pool earlier (rq_pool set), return them to it.
+ */
+void ptlrpc_request_free(struct ptlrpc_request *request)
+{
+	if (request->rq_pool)
+		__ptlrpc_free_req_to_pool(request);
+	else
+		OBD_FREE_PTR(request);
+}
+EXPORT_SYMBOL(ptlrpc_request_free);
+
+/**
+ * Allocate new request for operation \a opcode and immediately pack it for
+ * network transfer. Only used for simple requests like OBD_PING where the
+ * only important part of the request is operation itself.
+ * Returns allocated request or NULL on error.
+ */
+struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp,
+						const struct req_format *format,
+						__u32 version, int opcode)
+{
+	struct ptlrpc_request *request;
+
+	request = ptlrpc_request_alloc(imp, format);
+	if (request == NULL)
+		return NULL;
+
+	/* a request that cannot be packed is of no use: release it */
+	if (ptlrpc_request_pack(request, version, opcode) != 0) {
+		ptlrpc_request_free(request);
+		return NULL;
+	}
+	return request;
+}
+EXPORT_SYMBOL(ptlrpc_request_alloc_pack);
+
+/**
+ * Prepare request (fetched from pool \a pool if not NULL) on import \a imp
+ * for operation \a opcode. Request would contain \a count buffers.
+ * Sizes of buffers are described in array \a lengths and buffers themselves
+ * are provided by a pointer \a bufs.
+ * Returns prepared request structure pointer or NULL on error.
+ */
+struct ptlrpc_request *
+ptlrpc_prep_req_pool(struct obd_import *imp,
+		     __u32 version, int opcode,
+		     int count, __u32 *lengths, char **bufs,
+		     struct ptlrpc_request_pool *pool)
+{
+	struct ptlrpc_request *req;
+
+	req = __ptlrpc_request_alloc(imp, pool);
+	if (req == NULL)
+		return NULL;
+
+	/* packed with a NULL ctx; on failure the request is released */
+	if (__ptlrpc_request_bufs_pack(req, version, opcode, count,
+				       lengths, bufs, NULL) != 0) {
+		ptlrpc_request_free(req);
+		return NULL;
+	}
+
+	return req;
+}
+EXPORT_SYMBOL(ptlrpc_prep_req_pool);
+
+/**
+ * Same as ptlrpc_prep_req_pool, but without pool (\a pool is passed as NULL)
+ */
+struct ptlrpc_request *
+ptlrpc_prep_req(struct obd_import *imp, __u32 version, int opcode, int count,
+		__u32 *lengths, char **bufs)
+{
+	return ptlrpc_prep_req_pool(imp, version, opcode, count, lengths, bufs,
+				    NULL);
+}
+EXPORT_SYMBOL(ptlrpc_prep_req);
+
+/**
+ * Allocate and initialize new request set structure: refcount 1, empty lists,
+ * no producer, max_inflight UINT_MAX. Returns the new set or NULL on error.
+ */
+struct ptlrpc_request_set *ptlrpc_prep_set(void)
+{
+	struct ptlrpc_request_set *set;
+
+	ENTRY;
+	OBD_ALLOC(set, sizeof *set);
+	if (!set)
+		RETURN(NULL);
+	atomic_set(&set->set_refcount, 1);
+	INIT_LIST_HEAD(&set->set_requests);
+	init_waitqueue_head(&set->set_waitq);
+	atomic_set(&set->set_new_count, 0);
+	atomic_set(&set->set_remaining, 0);
+	spin_lock_init(&set->set_new_req_lock);
+	INIT_LIST_HEAD(&set->set_new_requests);
+	INIT_LIST_HEAD(&set->set_cblist);
+	set->set_max_inflight = UINT_MAX;
+	set->set_producer     = NULL;
+	set->set_producer_arg = NULL;
+	set->set_rc	   = 0;
+
+	RETURN(set);
+}
+EXPORT_SYMBOL(ptlrpc_prep_set);
+
+/**
+ * Allocate and initialize new request set structure with flow control
+ * extension. This extension allows to control the number of requests in-flight
+ * for the whole set. A callback function \a func (called with \a arg) to
+ * generate requests must be provided and the request set will keep the number
+ * of requests sent over the wire to \a max.
+ * Returns a pointer to the newly allocated set structure or NULL on error.
+ */
+struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func,
+					     void *arg)
+{
+	struct ptlrpc_request_set *set;
+
+	set = ptlrpc_prep_set();
+	if (!set)
+		return NULL;
+
+	/* override the defaults installed by ptlrpc_prep_set() */
+	set->set_max_inflight  = max;
+	set->set_producer      = func;
+	set->set_producer_arg  = arg;
+
+	return set;
+}
+EXPORT_SYMBOL(ptlrpc_prep_fcset);
+
+/**
+ * Wind down and free request set structure previously allocated with
+ * ptlrpc_prep_set.
+ * Asserts that the requests on the set are either all completed or all new,
+ * then removes every request from the set's request list.
+ * Requests still in RQ_PHASE_NEW are treated as if they got an error in
+ * flight: ptlrpc_req_interpret() is called on them with -EBADR.
+ */
+void ptlrpc_set_destroy(struct ptlrpc_request_set *set)
+{
+	struct list_head       *tmp;
+	struct list_head       *next;
+	int	       expected_phase;
+	int	       n = 0;
+	ENTRY;
+
+	/* Requests on the set should either all be completed, or all be new */
+	expected_phase = (atomic_read(&set->set_remaining) == 0) ?
+			 RQ_PHASE_COMPLETE : RQ_PHASE_NEW;
+	list_for_each (tmp, &set->set_requests) {
+		struct ptlrpc_request *req =
+			list_entry(tmp, struct ptlrpc_request,
+				       rq_set_chain);
+
+		LASSERT(req->rq_phase == expected_phase);
+		n++;
+	}
+
+	LASSERTF(atomic_read(&set->set_remaining) == 0 ||
+		 atomic_read(&set->set_remaining) == n, "%d / %d\n",
+		 atomic_read(&set->set_remaining), n);
+
+	list_for_each_safe(tmp, next, &set->set_requests) {
+		struct ptlrpc_request *req =
+			list_entry(tmp, struct ptlrpc_request,
+				       rq_set_chain);
+		list_del_init(&req->rq_set_chain);
+
+		LASSERT(req->rq_phase == expected_phase);
+
+		if (req->rq_phase == RQ_PHASE_NEW) {
+			ptlrpc_req_interpret(NULL, req, -EBADR);
+			atomic_dec(&set->set_remaining);
+		}
+
+		spin_lock(&req->rq_lock);
+		req->rq_set = NULL;
+		req->rq_invalid_rqset = 0;
+		spin_unlock(&req->rq_lock);
+
+		ptlrpc_req_finished (req);
+	}
+
+	LASSERT(atomic_read(&set->set_remaining) == 0);
+
+	ptlrpc_reqset_put(set);
+	EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_set_destroy);
+
+/**
+ * Append callback \a fn, with its \a data argument, to the callback list of
+ * \a set; it is meant to be called once all requests of the set completed.
+ * Returns 0 on success or -ENOMEM if the callback record cannot be allocated.
+ */
+int ptlrpc_set_add_cb(struct ptlrpc_request_set *set,
+		      set_interpreter_func fn, void *data)
+{
+	struct ptlrpc_set_cbdata *cbdata;
+
+	OBD_ALLOC_PTR(cbdata);
+	if (cbdata == NULL)
+		return -ENOMEM;
+
+	cbdata->psc_interpret = fn;
+	cbdata->psc_data = data;
+	list_add_tail(&cbdata->psc_item, &set->set_cblist);
+
+	return 0;
+}
+EXPORT_SYMBOL(ptlrpc_set_add_cb);
+
+/**
+ * Add a new request to the general purpose request set, taking over the
+ * caller's request reference. Sent right away if the set has a producer.
+ */
+void ptlrpc_set_add_req(struct ptlrpc_request_set *set,
+			struct ptlrpc_request *req)
+{
+	LASSERT(list_empty(&req->rq_set_chain));
+
+	/* The set takes over the caller's request reference */
+	list_add_tail(&req->rq_set_chain, &set->set_requests);
+	req->rq_set = set;
+	atomic_inc(&set->set_remaining);
+	req->rq_queued_time = cfs_time_current();
+
+	if (req->rq_reqmsg != NULL)
+		lustre_msg_set_jobid(req->rq_reqmsg, NULL);
+
+	if (set->set_producer != NULL)
+		/* If the request set has a producer callback, the RPC must be
+		 * sent straight away */
+		ptlrpc_send_new_req(req);
+}
+EXPORT_SYMBOL(ptlrpc_set_add_req);
+
+/**
+ * Add a request to a set with dedicated server thread (the set of \a pc)
+ * and wake the thread to make any necessary processing.
+ * Currently only used for ptlrpcd.
+ */
+void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc,
+			   struct ptlrpc_request *req)
+{
+	struct ptlrpc_request_set *set = pc->pc_set;
+	int count, i;
+
+	LASSERT(req->rq_set == NULL);
+	LASSERT(test_bit(LIOD_STOP, &pc->pc_flags) == 0);
+
+	spin_lock(&set->set_new_req_lock);
+	/*
+	 * The set takes over the caller's request reference.
+	 */
+	req->rq_set = set;
+	req->rq_queued_time = cfs_time_current();
+	list_add_tail(&req->rq_set_chain, &set->set_new_requests);
+	count = atomic_inc_return(&set->set_new_count);
+	spin_unlock(&set->set_new_req_lock);
+
+	/* Only need to call wakeup once for the first entry. */
+	if (count == 1) {
+		wake_up(&set->set_waitq);
+
+		/* XXX: It may be unnecessary to wakeup all the partners. But to
+		 *      guarantee the async RPC can be processed ASAP, we have
+		 *      no other better choice. It may be fixed in future. */
+		for (i = 0; i < pc->pc_npartners; i++)
+			wake_up(&pc->pc_partners[i]->pc_set->set_waitq);
+	}
+}
+EXPORT_SYMBOL(ptlrpc_set_add_new_req);
+
+/**
+ * Based on the current state of the import, determine if the request
+ * can be sent, is an error, or should be delayed.
+ *
+ * Returns true if this request should be delayed. If false, and
+ * *status is set, then the request can not be sent and *status is the
+ * error code.  If false and *status is 0, then request can be sent.
+ *
+ * The imp->imp_lock must be held.
+ */
+static int ptlrpc_import_delay_req(struct obd_import *imp,
+				   struct ptlrpc_request *req, int *status)
+{
+	int delay = 0;
+	ENTRY;
+
+	LASSERT (status != NULL);
+	*status = 0;
+
+	if (req->rq_ctx_init || req->rq_ctx_fini) {
+		/* always allow ctx init/fini rpc go through */
+	} else if (imp->imp_state == LUSTRE_IMP_NEW) {
+		DEBUG_REQ(D_ERROR, req, "Uninitialized import.");
+		*status = -EIO;
+	} else if (imp->imp_state == LUSTRE_IMP_CLOSED) {
+		/* pings may safely race with umount */
+		DEBUG_REQ(lustre_msg_get_opc(req->rq_reqmsg) == OBD_PING ?
+			  D_HA : D_ERROR, req, "IMP_CLOSED ");
+		*status = -EIO;
+	} else if (ptlrpc_send_limit_expired(req)) {
+		/* probably doesn't need to be a D_ERROR after initial testing */
+		DEBUG_REQ(D_ERROR, req, "send limit expired ");
+		*status = -EIO;
+	} else if (req->rq_send_state == LUSTRE_IMP_CONNECTING &&
+		   imp->imp_state == LUSTRE_IMP_CONNECTING) {
+		/* allow CONNECT even if import is invalid */ ;
+		if (atomic_read(&imp->imp_inval_count) != 0) {
+			DEBUG_REQ(D_ERROR, req, "invalidate in flight");
+			*status = -EIO;
+		}
+	} else if (imp->imp_invalid || imp->imp_obd->obd_no_recov) {
+		if (!imp->imp_deactive)
+			DEBUG_REQ(D_NET, req, "IMP_INVALID");
+		*status = -ESHUTDOWN; /* bz 12940 */
+	} else if (req->rq_import_generation != imp->imp_generation) {
+		DEBUG_REQ(D_ERROR, req, "req wrong generation:");
+		*status = -EIO;
+	} else if (req->rq_send_state != imp->imp_state) {
+		/* invalidate in progress - any request should be dropped */
+		if (atomic_read(&imp->imp_inval_count) != 0) {
+			DEBUG_REQ(D_ERROR, req, "invalidate in flight");
+			*status = -EIO;
+		} else if (imp->imp_dlm_fake || req->rq_no_delay) {
+			*status = -EWOULDBLOCK;
+		} else if (req->rq_allow_replay &&
+			  (imp->imp_state == LUSTRE_IMP_REPLAY ||
+			   imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS ||
+			   imp->imp_state == LUSTRE_IMP_REPLAY_WAIT ||
+			   imp->imp_state == LUSTRE_IMP_RECOVER)) {
+			DEBUG_REQ(D_HA, req, "allow during recovery.\n");
+		} else {
+			delay = 1;
+		}
+	}
+
+	RETURN(delay);
+}
+
+/**
+ * Decide if the error message regarding provided request \a req
+ * should be printed to the console or not.
+ * Makes its decision on request status and other properties.
+ * Returns 1 to print error on the system console or 0 if not.
+ */
+static int ptlrpc_console_allow(struct ptlrpc_request *req)
+{
+	__u32 opc;
+	int err;
+
+	LASSERT(req->rq_reqmsg != NULL);
+	opc = lustre_msg_get_opc(req->rq_reqmsg);
+
+	/* Suppress particular reconnect errors which are to be expected.  No
+	 * errors are suppressed for the initial connection on an import */
+	if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) &&
+	    (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) {
+
+		/* Suppress timed out reconnect requests */
+		if (req->rq_timedout)
+			return 0;
+
+		/* Suppress unavailable/again reconnect requests */
+		err = lustre_msg_get_status(req->rq_repmsg);
+		if (err == -ENODEV || err == -EAGAIN)
+			return 0;
+	}
+
+	return 1;
+}
+
+/**
+ * Check request processing status taken from the reply message.
+ * Returns the status; a PTL_RPC_MSG_ERR reply always yields a negative value.
+ */
+static int ptlrpc_check_status(struct ptlrpc_request *req)
+{
+	int err;
+	ENTRY;
+
+	err = lustre_msg_get_status(req->rq_repmsg);
+	if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) {
+		struct obd_import *imp = req->rq_import;
+		__u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
+		if (ptlrpc_console_allow(req))
+			LCONSOLE_ERROR_MSG(0x011, "%s: Communicating with %s,"
+					   " operation %s failed with %d.\n",
+					   imp->imp_obd->obd_name,
+					   libcfs_nid2str(
+					   imp->imp_connection->c_peer.nid),
+					   ll_opcode2str(opc), err);
+		RETURN(err < 0 ? err : -EINVAL);
+	}
+
+	if (err < 0) {
+		DEBUG_REQ(D_INFO, req, "status is %d", err);
+	} else if (err > 0) {
+		/* XXX: translate this error from net to host */
+		DEBUG_REQ(D_INFO, req, "status is %d", err);
+	}
+
+	RETURN(err);
+}
+
+/**
+ * Save pre-versions of objects into request for replay.
+ * Versions are obtained from server reply and copied into the request
+ * message; nothing is done for requests flagged MSG_REPLAY. Used for VBR.
+ */
+static void ptlrpc_save_versions(struct ptlrpc_request *req)
+{
+	struct lustre_msg *repmsg = req->rq_repmsg;
+	struct lustre_msg *reqmsg = req->rq_reqmsg;
+	__u64 *versions = lustre_msg_get_versions(repmsg);
+	ENTRY;
+
+	/* single exit path so that ENTRY is always paired with EXIT */
+	if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) {
+		LASSERT(versions);
+		lustre_msg_set_versions(reqmsg, versions);
+		CDEBUG(D_INFO, "Client save versions ["LPX64"/"LPX64"]\n",
+		       versions[0], versions[1]);
+	}
+
+	EXIT;
+}
+
+/**
+ * Callback function called when client receives RPC reply for \a req.
+ * Returns 0 on success or error code.
+ * The return value would be assigned to req->rq_status by the caller
+ * as request processing status.
+ * This function also decides if the request needs to be saved for later replay.
+ */
+static int after_reply(struct ptlrpc_request *req)
+{
+	struct obd_import *imp = req->rq_import;
+	struct obd_device *obd = req->rq_import->imp_obd;
+	int rc;
+	struct timeval work_start;
+	long timediff;
+	ENTRY;
+
+	LASSERT(obd != NULL);
+	/* repbuf must be unlinked */
+	LASSERT(!req->rq_receiving_reply && !req->rq_must_unlink);
+
+	if (req->rq_reply_truncate) {
+		if (ptlrpc_no_resend(req)) {
+			DEBUG_REQ(D_ERROR, req, "reply buffer overflow,"
+				  " expected: %d, actual size: %d",
+				  req->rq_nob_received, req->rq_repbuf_len);
+			RETURN(-EOVERFLOW);
+		}
+
+		sptlrpc_cli_free_repbuf(req);
+		/* Pass the required reply buffer size (include
+		 * space for early reply).
+		 * NB: no need to roundup because alloc_repbuf
+		 * will roundup it */
+		req->rq_replen       = req->rq_nob_received;
+		req->rq_nob_received = 0;
+		req->rq_resend       = 1;
+		RETURN(0);
+	}
+
+	/*
+	 * NB Until this point, the whole of the incoming message,
+	 * including buflens, status etc is in the sender's byte order.
+	 */
+	rc = sptlrpc_cli_unwrap_reply(req);
+	if (rc) {
+		DEBUG_REQ(D_ERROR, req, "unwrap reply failed (%d):", rc);
+		RETURN(rc);
+	}
+
+	/*
+	 * Security layer unwrap might ask resend this request.
+	 */
+	if (req->rq_resend)
+		RETURN(0);
+
+	rc = unpack_reply(req);
+	if (rc)
+		RETURN(rc);
+
+	/* retry indefinitely on EINPROGRESS */
+	if (lustre_msg_get_status(req->rq_repmsg) == -EINPROGRESS &&
+	    ptlrpc_no_resend(req) == 0 && !req->rq_no_retry_einprogress) {
+		time_t	now = cfs_time_current_sec();
+
+		DEBUG_REQ(D_RPCTRACE, req, "Resending request on EINPROGRESS");
+		req->rq_resend = 1;
+		req->rq_nr_resend++;
+
+		/* allocate new xid to avoid reply reconstruction */
+		if (!req->rq_bulk) {
+			/* new xid is already allocated for bulk in
+			 * ptlrpc_check_set() */
+			req->rq_xid = ptlrpc_next_xid();
+			DEBUG_REQ(D_RPCTRACE, req, "Allocating new xid for "
+				  "resend on EINPROGRESS");
+		}
+
+		/* Readjust the timeout for current conditions */
+		ptlrpc_at_set_req_timeout(req);
+		/* delay resend to give a chance to the server to get ready.
+		 * The delay is increased by 1s on every resend and is capped to
+		 * the current request timeout (i.e. obd_timeout if AT is off,
+		 * or AT service time x 125% + 5s, see at_est2timeout) */
+		if (req->rq_nr_resend > req->rq_timeout)
+			req->rq_sent = now + req->rq_timeout;
+		else
+			req->rq_sent = now + req->rq_nr_resend;
+
+		RETURN(0);
+	}
+
+	do_gettimeofday(&work_start);
+	timediff = cfs_timeval_sub(&work_start, &req->rq_arrival_time, NULL);
+	if (obd->obd_svc_stats != NULL) {
+		lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR,
+				    timediff);
+		ptlrpc_lprocfs_rpc_sent(req, timediff);
+	}
+
+	if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY &&
+	    lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) {
+		DEBUG_REQ(D_ERROR, req, "invalid packet received (type=%u)",
+			  lustre_msg_get_type(req->rq_repmsg));
+		RETURN(-EPROTO);
+	}
+
+	if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING)
+		CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, cfs_fail_val);
+	ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg));
+	ptlrpc_at_adj_net_latency(req,
+				  lustre_msg_get_service_time(req->rq_repmsg));
+
+	rc = ptlrpc_check_status(req);
+	imp->imp_connect_error = rc;
+
+	if (rc) {
+		/*
+		 * Either we've been evicted, or the server has failed for
+		 * some reason. Try to reconnect, and if that fails, punt to
+		 * the upcall.
+		 */
+		if (ll_rpc_recoverable_error(rc)) {
+			if (req->rq_send_state != LUSTRE_IMP_FULL ||
+			    imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) {
+				RETURN(rc);
+			}
+			ptlrpc_request_handle_notconn(req);
+			RETURN(rc);
+		}
+	} else {
+		/*
+		 * Let's look if server sent slv. Do it only for RPC with
+		 * rc == 0.
+		 */
+		ldlm_cli_update_pool(req);
+	}
+
+	/*
+	 * Store transno in reqmsg for replay.
+	 */
+	if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) {
+		req->rq_transno = lustre_msg_get_transno(req->rq_repmsg);
+		lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno);
+	}
+
+	if (imp->imp_replayable) {
+		spin_lock(&imp->imp_lock);
+		/*
+		 * No point in adding already-committed requests to the replay
+		 * list, we will just remove them immediately. b=9829
+		 */
+		if (req->rq_transno != 0 &&
+		    (req->rq_transno >
+		     lustre_msg_get_last_committed(req->rq_repmsg) ||
+		     req->rq_replay)) {
+			/** version recovery */
+			ptlrpc_save_versions(req);
+			ptlrpc_retain_replayable_request(req, imp);
+		} else if (req->rq_commit_cb != NULL) {
+			spin_unlock(&imp->imp_lock);
+			req->rq_commit_cb(req);
+			spin_lock(&imp->imp_lock);
+		}
+
+		/*
+		 * Replay-enabled imports return commit-status information.
+		 */
+		if (lustre_msg_get_last_committed(req->rq_repmsg)) {
+			imp->imp_peer_committed_transno =
+				lustre_msg_get_last_committed(req->rq_repmsg);
+		}
+
+		ptlrpc_free_committed(imp);
+
+		if (!list_empty(&imp->imp_replay_list)) {
+			struct ptlrpc_request *last;
+
+			last = list_entry(imp->imp_replay_list.prev,
+					      struct ptlrpc_request,
+					      rq_replay_list);
+			/*
+			 * Requests with rq_replay stay on the list even if no
+			 * commit is expected.
+			 */
+			if (last->rq_transno > imp->imp_peer_committed_transno)
+				ptlrpc_pinger_commit_expected(imp);
+		}
+
+		spin_unlock(&imp->imp_lock);
+	}
+
+	RETURN(rc);
+}
+
+/**
+ * Helper function to send request \a req over the network for the first time
+ * Also adjusts request phase.
+ * Returns 0 on success or if the send is postponed, nonzero on failure.
+ */
+static int ptlrpc_send_new_req(struct ptlrpc_request *req)
+{
+	struct obd_import     *imp = req->rq_import;
+	int rc;
+	ENTRY;
+
+	LASSERT(req->rq_phase == RQ_PHASE_NEW);
+	if (req->rq_sent && (req->rq_sent > cfs_time_current_sec()) &&
+	    (!req->rq_generation_set ||
+	     req->rq_import_generation == imp->imp_generation))
+		RETURN (0);
+
+	ptlrpc_rqphase_move(req, RQ_PHASE_RPC);
+
+	spin_lock(&imp->imp_lock);
+
+	if (!req->rq_generation_set)
+		req->rq_import_generation = imp->imp_generation;
+
+	if (ptlrpc_import_delay_req(imp, req, &rc)) {
+		spin_lock(&req->rq_lock);
+		req->rq_waiting = 1;
+		spin_unlock(&req->rq_lock);
+
+		DEBUG_REQ(D_HA, req, "req from PID %d waiting for recovery: "
+			  "(%s != %s)", lustre_msg_get_status(req->rq_reqmsg),
+			  ptlrpc_import_state_name(req->rq_send_state),
+			  ptlrpc_import_state_name(imp->imp_state));
+		LASSERT(list_empty(&req->rq_list));
+		list_add_tail(&req->rq_list, &imp->imp_delayed_list);
+		atomic_inc(&req->rq_import->imp_inflight);
+		spin_unlock(&imp->imp_lock);
+		RETURN(0);
+	}
+
+	if (rc != 0) {
+		spin_unlock(&imp->imp_lock);
+		req->rq_status = rc;
+		ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+		RETURN(rc);
+	}
+
+	LASSERT(list_empty(&req->rq_list));
+	list_add_tail(&req->rq_list, &imp->imp_sending_list);
+	atomic_inc(&req->rq_import->imp_inflight);
+	spin_unlock(&imp->imp_lock);
+
+	lustre_msg_set_status(req->rq_reqmsg, current_pid());
+
+	rc = sptlrpc_req_refresh_ctx(req, -1);
+	if (rc) {
+		if (req->rq_err) {
+			req->rq_status = rc;
+			RETURN(1);
+		} else {
+			req->rq_wait_ctx = 1;
+			RETURN(0);
+		}
+	}
+
+	CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:nid:opc"
+	       " %s:%s:%d:"LPU64":%s:%d\n", current_comm(),
+	       imp->imp_obd->obd_uuid.uuid,
+	       lustre_msg_get_status(req->rq_reqmsg), req->rq_xid,
+	       libcfs_nid2str(imp->imp_connection->c_peer.nid),
+	       lustre_msg_get_opc(req->rq_reqmsg));
+
+	rc = ptl_send_rpc(req, 0);
+	if (rc) {
+		DEBUG_REQ(D_HA, req, "send failed (%d); expect timeout", rc);
+		req->rq_net_err = 1;
+		RETURN(rc);
+	}
+	RETURN(0);
+}
+
+static inline int ptlrpc_set_producer(struct ptlrpc_request_set *set)
+{
+	int remaining, rc;
+	ENTRY;
+
+	LASSERT(set->set_producer != NULL);
+
+	remaining = atomic_read(&set->set_remaining);
+	/* NOTE(review): rc values other than -ENOENT are not acted upon */
+	/* populate the ->set_requests list with requests until we
+	 * reach the maximum number of RPCs in flight for this set */
+	while (atomic_read(&set->set_remaining) < set->set_max_inflight) {
+		rc = set->set_producer(set, set->set_producer_arg);
+		if (rc == -ENOENT) {
+			/* no more RPC to produce */
+			set->set_producer     = NULL;
+			set->set_producer_arg = NULL;
+			RETURN(0);
+		}
+	}
+	/* result: growth of set_remaining since this function was entered */
+	RETURN((atomic_read(&set->set_remaining) - remaining));
+}
+
+/**
+ * This sends any unsent RPCs in \a set and returns 1 if all are sent
+ * and no more replies are expected, or if a timer recalculation is needed.
+ * (it is possible to get fewer replies than requests sent e.g. due to timed
+ * out requests or requests that we had trouble to send out)
+ */
+int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
+{
+	struct list_head *tmp, *next;
+	int force_timer_recalc = 0;
+	ENTRY;
+
+	if (atomic_read(&set->set_remaining) == 0)
+		RETURN(1);
+
+	list_for_each_safe(tmp, next, &set->set_requests) {
+		struct ptlrpc_request *req =
+			list_entry(tmp, struct ptlrpc_request,
+				       rq_set_chain);
+		struct obd_import *imp = req->rq_import;
+		int unregistered = 0;
+		int rc = 0;
+
+		if (req->rq_phase == RQ_PHASE_NEW &&
+		    ptlrpc_send_new_req(req)) {
+			force_timer_recalc = 1;
+		}
+
+		/* delayed send - skip */
+		if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent)
+			continue;
+
+		/* delayed resend - skip */
+		if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend &&
+		    req->rq_sent > cfs_time_current_sec())
+			continue;
+
+		if (!(req->rq_phase == RQ_PHASE_RPC ||
+		      req->rq_phase == RQ_PHASE_BULK ||
+		      req->rq_phase == RQ_PHASE_INTERPRET ||
+		      req->rq_phase == RQ_PHASE_UNREGISTERING ||
+		      req->rq_phase == RQ_PHASE_COMPLETE)) {
+			DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase);
+			LBUG();
+		}
+
+		if (req->rq_phase == RQ_PHASE_UNREGISTERING) {
+			LASSERT(req->rq_next_phase != req->rq_phase);
+			LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED);
+
+			/*
+			 * Skip processing until reply is unlinked. We
+			 * can't return to pool before that and we can't
+			 * call interpret before that. We need to make
+			 * sure that all rdma transfers finished and will
+			 * not corrupt any data.
+			 */
+			if (ptlrpc_client_recv_or_unlink(req) ||
+			    ptlrpc_client_bulk_active(req))
+				continue;
+
+			/*
+			 * Turn fail_loc off to prevent it from looping
+			 * forever.
+			 */
+			if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) {
+				OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK,
+						     OBD_FAIL_ONCE);
+			}
+			if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) {
+				OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK,
+						     OBD_FAIL_ONCE);
+			}
+
+			/*
+			 * Move to next phase if reply was successfully
+			 * unlinked.
+			 */
+			ptlrpc_rqphase_move(req, req->rq_next_phase);
+		}
+
+		if (req->rq_phase == RQ_PHASE_COMPLETE)
+			continue;
+
+		if (req->rq_phase == RQ_PHASE_INTERPRET)
+			GOTO(interpret, req->rq_status);
+
+		/*
+		 * Note that this also will start async reply unlink.
+		 */
+		if (req->rq_net_err && !req->rq_timedout) {
+			ptlrpc_expire_one_request(req, 1);
+
+			/*
+			 * Check if we still need to wait for unlink.
+			 */
+			if (ptlrpc_client_recv_or_unlink(req) ||
+			    ptlrpc_client_bulk_active(req))
+				continue;
+			/* If there is no need to resend, fail it now. */
+			if (req->rq_no_resend) {
+				if (req->rq_status == 0)
+					req->rq_status = -EIO;
+				ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+				GOTO(interpret, req->rq_status);
+			} else {
+				continue;
+			}
+		}
+
+		if (req->rq_err) {
+			spin_lock(&req->rq_lock);
+			req->rq_replied = 0;
+			spin_unlock(&req->rq_lock);
+			if (req->rq_status == 0)
+				req->rq_status = -EIO;
+			ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+			GOTO(interpret, req->rq_status);
+		}
+
+		/* ptlrpc_set_wait->l_wait_event sets lwi_allow_intr
+		 * so it sets rq_intr regardless of individual rpc
+		 * timeouts. The synchronous IO waiting path sets
+		 * rq_intr irrespective of whether ptlrpcd
+		 * has seen a timeout.  Our policy is to only interpret
+		 * interrupted rpcs after they have timed out, so we
+		 * need to enforce that here.
+		 */
+
+		if (req->rq_intr && (req->rq_timedout || req->rq_waiting ||
+				     req->rq_wait_ctx)) {
+			req->rq_status = -EINTR;
+			ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+			GOTO(interpret, req->rq_status);
+		}
+
+		if (req->rq_phase == RQ_PHASE_RPC) {
+			if (req->rq_timedout || req->rq_resend ||
+			    req->rq_waiting || req->rq_wait_ctx) {
+				int status;
+
+				if (!ptlrpc_unregister_reply(req, 1))
+					continue;
+
+				spin_lock(&imp->imp_lock);
+				if (ptlrpc_import_delay_req(imp, req, &status)){
+					/* put on delay list - only if we wait
+					 * recovery finished - before send */
+					list_del_init(&req->rq_list);
+					list_add_tail(&req->rq_list,
+							  &imp->
+							  imp_delayed_list);
+					spin_unlock(&imp->imp_lock);
+					continue;
+				}
+
+				if (status != 0)  {
+					req->rq_status = status;
+					ptlrpc_rqphase_move(req,
+						RQ_PHASE_INTERPRET);
+					spin_unlock(&imp->imp_lock);
+					GOTO(interpret, req->rq_status);
+				}
+				if (ptlrpc_no_resend(req) &&
+				    !req->rq_wait_ctx) {
+					req->rq_status = -ENOTCONN;
+					ptlrpc_rqphase_move(req,
+							    RQ_PHASE_INTERPRET);
+					spin_unlock(&imp->imp_lock);
+					GOTO(interpret, req->rq_status);
+				}
+
+				list_del_init(&req->rq_list);
+				list_add_tail(&req->rq_list,
+						  &imp->imp_sending_list);
+
+				spin_unlock(&imp->imp_lock);
+
+				spin_lock(&req->rq_lock);
+				req->rq_waiting = 0;
+				spin_unlock(&req->rq_lock);
+
+				if (req->rq_timedout || req->rq_resend) {
+					/* This is re-sending anyways,
+					 * let's mark req as resend. */
+					spin_lock(&req->rq_lock);
+					req->rq_resend = 1;
+					spin_unlock(&req->rq_lock);
+					if (req->rq_bulk) {
+						__u64 old_xid;
+
+						if (!ptlrpc_unregister_bulk(req, 1))
+							continue;
+
+						/* ensure previous bulk fails */
+						old_xid = req->rq_xid;
+						req->rq_xid = ptlrpc_next_xid();
+						CDEBUG(D_HA, "resend bulk "
+						       "old x"LPU64
+						       " new x"LPU64"\n",
+						       old_xid, req->rq_xid);
+					}
+				}
+				/*
+				 * rq_wait_ctx is only touched by ptlrpcd,
+				 * so no lock is needed here.
+				 */
+				status = sptlrpc_req_refresh_ctx(req, -1);
+				if (status) {
+					if (req->rq_err) {
+						req->rq_status = status;
+						spin_lock(&req->rq_lock);
+						req->rq_wait_ctx = 0;
+						spin_unlock(&req->rq_lock);
+						force_timer_recalc = 1;
+					} else {
+						spin_lock(&req->rq_lock);
+						req->rq_wait_ctx = 1;
+						spin_unlock(&req->rq_lock);
+					}
+
+					continue;
+				} else {
+					spin_lock(&req->rq_lock);
+					req->rq_wait_ctx = 0;
+					spin_unlock(&req->rq_lock);
+				}
+
+				rc = ptl_send_rpc(req, 0);
+				if (rc) {
+					DEBUG_REQ(D_HA, req,
+						  "send failed: rc = %d", rc);
+					force_timer_recalc = 1;
+					spin_lock(&req->rq_lock);
+					req->rq_net_err = 1;
+					spin_unlock(&req->rq_lock);
+				}
+				/* need to reset the timeout */
+				force_timer_recalc = 1;
+			}
+
+			spin_lock(&req->rq_lock);
+
+			if (ptlrpc_client_early(req)) {
+				ptlrpc_at_recv_early_reply(req);
+				spin_unlock(&req->rq_lock);
+				continue;
+			}
+
+			/* Still waiting for a reply? */
+			if (ptlrpc_client_recv(req)) {
+				spin_unlock(&req->rq_lock);
+				continue;
+			}
+
+			/* Did we actually receive a reply? */
+			if (!ptlrpc_client_replied(req)) {
+				spin_unlock(&req->rq_lock);
+				continue;
+			}
+
+			spin_unlock(&req->rq_lock);
+
+			/* unlink from net because we are going to
+			 * swab in-place of reply buffer */
+			unregistered = ptlrpc_unregister_reply(req, 1);
+			if (!unregistered)
+				continue;
+
+			req->rq_status = after_reply(req);
+			if (req->rq_resend)
+				continue;
+
+			/* If there is no bulk associated with this request,
+			 * then we're done and should let the interpreter
+			 * process the reply. Similarly if the RPC returned
+			 * an error, and therefore the bulk will never arrive.
+			 */
+			if (req->rq_bulk == NULL || req->rq_status < 0) {
+				ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+				GOTO(interpret, req->rq_status);
+			}
+
+			ptlrpc_rqphase_move(req, RQ_PHASE_BULK);
+		}
+
+		LASSERT(req->rq_phase == RQ_PHASE_BULK);
+		if (ptlrpc_client_bulk_active(req))
+			continue;
+
+		if (req->rq_bulk->bd_failure) {
+			/* The RPC reply arrived OK, but the bulk screwed
+			 * up!  Dead weird since the server told us the RPC
+			 * was good after getting the REPLY for her GET or
+			 * the ACK for her PUT. */
+			DEBUG_REQ(D_ERROR, req, "bulk transfer failed");
+			req->rq_status = -EIO;
+		}
+
+		ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+
+	interpret:
+		LASSERT(req->rq_phase == RQ_PHASE_INTERPRET);
+
+		/* This moves to "unregistering" phase we need to wait for
+		 * reply unlink. */
+		if (!unregistered && !ptlrpc_unregister_reply(req, 1)) {
+			/* start async bulk unlink too */
+			ptlrpc_unregister_bulk(req, 1);
+			continue;
+		}
+
+		if (!ptlrpc_unregister_bulk(req, 1))
+			continue;
+
+		/* When calling interpret receiving already should be
+		 * finished. */
+		LASSERT(!req->rq_receiving_reply);
+
+		ptlrpc_req_interpret(env, req, req->rq_status);
+
+		ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE);
+
+		CDEBUG(req->rq_reqmsg != NULL ? D_RPCTRACE : 0,
+			"Completed RPC pname:cluuid:pid:xid:nid:"
+			"opc %s:%s:%d:"LPU64":%s:%d\n",
+			current_comm(), imp->imp_obd->obd_uuid.uuid,
+			lustre_msg_get_status(req->rq_reqmsg), req->rq_xid,
+			libcfs_nid2str(imp->imp_connection->c_peer.nid),
+			lustre_msg_get_opc(req->rq_reqmsg));
+
+		spin_lock(&imp->imp_lock);
+		/* Request already may be not on sending or delaying list. This
+		 * may happen in the case of marking it erroneous for the case
+		 * ptlrpc_import_delay_req(req, status) find it impossible to
+		 * allow sending this rpc and returns *status != 0. */
+		if (!list_empty(&req->rq_list)) {
+			list_del_init(&req->rq_list);
+			atomic_dec(&imp->imp_inflight);
+		}
+		spin_unlock(&imp->imp_lock);
+
+		atomic_dec(&set->set_remaining);
+		wake_up_all(&imp->imp_recovery_waitq);
+
+		if (set->set_producer) {
+			/* produce a new request if possible */
+			if (ptlrpc_set_producer(set) > 0)
+				force_timer_recalc = 1;
+
+			/* free the request that has just been completed
+			 * in order not to pollute set->set_requests */
+			list_del_init(&req->rq_set_chain);
+			spin_lock(&req->rq_lock);
+			req->rq_set = NULL;
+			req->rq_invalid_rqset = 0;
+			spin_unlock(&req->rq_lock);
+
+			/* record rq_status to compute the final status later */
+			if (req->rq_status != 0)
+				set->set_rc = req->rq_status;
+			ptlrpc_req_finished(req);
+		}
+	}
+
+	/* If we hit an error, we want to recover promptly. */
+	RETURN(atomic_read(&set->set_remaining) == 0 || force_timer_recalc);
+}
+EXPORT_SYMBOL(ptlrpc_check_set);
+
+/**
+ * Time out request \a req. If \a async_unlink is set, do not wait until
+ * LNet actually confirms that the network buffers have been unlinked.
+ * Return 1 if we should give up further retrying attempts or 0 otherwise.
+ */
+int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink)
+{
+	struct obd_import *imp = req->rq_import;
+	int rc = 0;
+	ENTRY;
+
+	spin_lock(&req->rq_lock);
+	req->rq_timedout = 1;
+	spin_unlock(&req->rq_lock);
+
+	DEBUG_REQ(D_WARNING, req, "Request sent has %s: [sent "CFS_DURATION_T
+		  "/real "CFS_DURATION_T"]",
+		  req->rq_net_err ? "failed due to network error" :
+		     ((req->rq_real_sent == 0 ||
+		       cfs_time_before(req->rq_real_sent, req->rq_sent) ||
+		       cfs_time_aftereq(req->rq_real_sent, req->rq_deadline)) ?
+		      "timed out for sent delay" : "timed out for slow reply"),
+		  req->rq_sent, req->rq_real_sent);
+
+	if (imp != NULL && obd_debug_peer_on_timeout)
+		LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer);
+
+	ptlrpc_unregister_reply(req, async_unlink);
+	ptlrpc_unregister_bulk(req, async_unlink);
+
+	if (obd_dump_on_timeout)
+		libcfs_debug_dumplog();
+
+	if (imp == NULL) {
+		DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?");
+		RETURN(1);
+	}
+
+	atomic_inc(&imp->imp_timeouts);
+
+	/* The DLM server doesn't want recovery run on its imports. */
+	if (imp->imp_dlm_fake)
+		RETURN(1);
+
+	/* If this request is for recovery or other primordial tasks,
+	 * then error it out here (-ETIMEDOUT) without failing the import. */
+	if (req->rq_ctx_init || req->rq_ctx_fini ||
+	    req->rq_send_state != LUSTRE_IMP_FULL ||
+	    imp->imp_obd->obd_no_recov) {
+		DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)",
+			  ptlrpc_import_state_name(req->rq_send_state),
+			  ptlrpc_import_state_name(imp->imp_state));
+		spin_lock(&req->rq_lock);
+		req->rq_status = -ETIMEDOUT;
+		req->rq_err = 1;
+		spin_unlock(&req->rq_lock);
+		RETURN(1);
+	}
+
+	/* If a request can't be resent we can't wait for an answer after
+	 * the timeout, so tell the caller to give up (rc = 1). */
+	if (ptlrpc_no_resend(req)) {
+		DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:");
+		rc = 1;
+	}
+
+	ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg));
+
+	RETURN(rc);
+}
+
+/**
+ * Expiry callback used when waiting on a set with l_wait_event: walks the
+ * set pointed to by \a data and times out every in-flight request whose
+ * deadline has passed. Always returns 1.
+ */
+int ptlrpc_expired_set(void *data)
+{
+	struct ptlrpc_request_set *rqset = data;
+	struct ptlrpc_request *req;
+	time_t cur = cfs_time_current_sec();
+	ENTRY;
+
+	LASSERT(rqset != NULL);
+
+	/* A timeout expired. See which reqs it applies to... */
+	list_for_each_entry(req, &rqset->set_requests, rq_set_chain) {
+		int in_flight;
+
+		/* don't expire request waiting for context */
+		if (req->rq_wait_ctx)
+			continue;
+
+		/*
+		 * Only an RPC that is neither waiting nor being resent, or
+		 * a request in its bulk phase, counts as in-flight.
+		 */
+		in_flight = (req->rq_phase == RQ_PHASE_RPC &&
+			     !req->rq_waiting && !req->rq_resend) ||
+			    req->rq_phase == RQ_PHASE_BULK;
+		if (!in_flight)
+			continue;
+
+		/* Already dealt with, or deadline not reached yet. */
+		if (req->rq_timedout || req->rq_deadline > cur)
+			continue;
+
+		/* Deal with this guy. Do it asynchronously to not block
+		 * ptlrpcd thread. */
+		ptlrpc_expire_one_request(req, 1);
+	}
+
+	/*
+	 * When waiting for a whole set, we always break out of the sleep so
+	 * we can recalculate the timeout, or enable interrupts if everyone's
+	 * timed out.
+	 */
+	RETURN(1);
+}
+EXPORT_SYMBOL(ptlrpc_expired_set);
+
+/**
+ * Sets the rq_intr flag in \a req while holding req->rq_lock.
+ */
+void ptlrpc_mark_interrupted(struct ptlrpc_request *req)
+{
+	spin_lock(&req->rq_lock);
+	req->rq_intr = 1;
+	spin_unlock(&req->rq_lock);
+}
+EXPORT_SYMBOL(ptlrpc_mark_interrupted);
+
+/**
+ * Flag as interrupted every request in set \a data that is in the RPC or
+ * UNREGISTERING phase. Callback for l_wait_event for interruptible waits.
+ */
+void ptlrpc_interrupted_set(void *data)
+{
+	struct ptlrpc_request_set *rqset = data;
+	struct ptlrpc_request *req;
+
+	LASSERT(rqset != NULL);
+	CDEBUG(D_RPCTRACE, "INTERRUPTED SET %p\n", rqset);
+
+	list_for_each_entry(req, &rqset->set_requests, rq_set_chain) {
+		switch (req->rq_phase) {
+		case RQ_PHASE_RPC:
+		case RQ_PHASE_UNREGISTERING:
+			ptlrpc_mark_interrupted(req);
+			break;
+		default:
+			/* requests in any other phase are skipped */
+			break;
+		}
+	}
+}
+EXPORT_SYMBOL(ptlrpc_interrupted_set);
+
+/**
+ * Get the smallest timeout in the set (0 if none); does NOT set a timeout.
+ */
+int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set)
+{
+	struct list_head	    *tmp;
+	time_t		 now = cfs_time_current_sec();
+	int		    timeout = 0;	/* seconds; 0 = nothing found */
+	struct ptlrpc_request *req;
+	time_t		 deadline;	/* same type as now: int would truncate */
+	ENTRY;
+
+	SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
+
+	list_for_each(tmp, &set->set_requests) {
+		req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);
+
+		/*
+		 * Only NEW, BULK and non-waiting RPC requests are considered.
+		 */
+		if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) ||
+		      (req->rq_phase == RQ_PHASE_BULK) ||
+		      (req->rq_phase == RQ_PHASE_NEW)))
+			continue;
+
+		/*
+		 * Already timed out.
+		 */
+		if (req->rq_timedout)
+			continue;
+
+		/*
+		 * Waiting for ctx.
+		 */
+		if (req->rq_wait_ctx)
+			continue;
+
+		if (req->rq_phase == RQ_PHASE_NEW)
+			deadline = req->rq_sent;
+		else if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend)
+			deadline = req->rq_sent;
+		else
+			deadline = req->rq_sent + req->rq_timeout;
+
+		if (deadline <= now)    /* actually expired already */
+			timeout = 1;    /* ASAP */
+		else if (timeout == 0 || timeout > deadline - now)
+			timeout = deadline - now;
+	}
+	RETURN(timeout);
+}
+EXPORT_SYMBOL(ptlrpc_set_next_timeout);
+
+/**
+ * Send all unsent requests from the set and then wait until all
+ * requests in the set complete (either get a reply, timeout, get an
+ * error or otherwise be interrupted).
+ * Returns 0 on success or error code otherwise.
+ */
+int ptlrpc_set_wait(struct ptlrpc_request_set *set)
+{
+	struct list_head	    *tmp;
+	struct ptlrpc_request *req;
+	struct l_wait_info     lwi;
+	int		    rc, timeout;
+	ENTRY;
+
+	if (set->set_producer)
+		(void)ptlrpc_set_producer(set);
+	else
+		list_for_each(tmp, &set->set_requests) {
+			req = list_entry(tmp, struct ptlrpc_request,
+					     rq_set_chain);
+			if (req->rq_phase == RQ_PHASE_NEW)
+				(void)ptlrpc_send_new_req(req);
+		}
+
+	if (list_empty(&set->set_requests))
+		RETURN(0);
+
+	do {
+		timeout = ptlrpc_set_next_timeout(set);
+
+		/* wait until all complete, interrupted, or an in-flight
+		 * req times out */
+		CDEBUG(D_RPCTRACE, "set %p going to sleep for %d seconds\n",
+		       set, timeout);
+
+		if (timeout == 0 && !cfs_signal_pending())
+			/*
+			 * No requests are in-flight (either timed out
+			 * or delayed), so we can allow interrupts.
+			 * We still want to block for a limited time,
+			 * so we allow interrupts during the timeout.
+			 */
+			lwi = LWI_TIMEOUT_INTR_ALL(cfs_time_seconds(1),
+						   ptlrpc_expired_set,
+						   ptlrpc_interrupted_set, set);
+		else
+			/*
+			 * At least one request is in flight, so no
+			 * interrupts are allowed. Wait until all
+			 * complete, or an in-flight req times out.
+			 */
+			lwi = LWI_TIMEOUT(cfs_time_seconds(timeout? timeout : 1),
+					  ptlrpc_expired_set, set);
+
+		rc = l_wait_event(set->set_waitq, ptlrpc_check_set(NULL, set), &lwi);
+
+		/* LU-769 - if we ignored the signal because it was already
+		 * pending when we started, we need to handle it now or we risk
+		 * it being ignored forever */
+		if (rc == -ETIMEDOUT && !lwi.lwi_allow_intr &&
+		    cfs_signal_pending()) {
+			sigset_t blocked_sigs =
+					   cfs_block_sigsinv(LUSTRE_FATAL_SIGS);
+
+			/* In fact we only interrupt for the "fatal" signals
+			 * like SIGINT or SIGKILL. We still ignore less
+			 * important signals since ptlrpc set is not easily
+			 * reentrant from userspace again */
+			if (cfs_signal_pending())
+				ptlrpc_interrupted_set(set);
+			cfs_restore_sigs(blocked_sigs);
+		}
+
+		LASSERT(rc == 0 || rc == -EINTR || rc == -ETIMEDOUT);
+
+		/* -EINTR => all requests have been flagged rq_intr so next
+		 * check completes.
+		 * -ETIMEDOUT => someone timed out.  When all reqs have
+		 * timed out, signals are enabled allowing completion with
+		 * EINTR.
+		 * I don't really care if we go once more round the loop in
+		 * the error cases -eeb. */
+		if (rc == 0 && atomic_read(&set->set_remaining) == 0) {
+			list_for_each(tmp, &set->set_requests) {
+				req = list_entry(tmp, struct ptlrpc_request,
+						     rq_set_chain);
+				spin_lock(&req->rq_lock);
+				req->rq_invalid_rqset = 1;
+				spin_unlock(&req->rq_lock);
+			}
+		}
+	} while (rc != 0 || atomic_read(&set->set_remaining) != 0);
+
+	LASSERT(atomic_read(&set->set_remaining) == 0);
+
+	rc = set->set_rc; /* rq_status of already freed requests if any */
+	list_for_each(tmp, &set->set_requests) {
+		req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);
+
+		LASSERT(req->rq_phase == RQ_PHASE_COMPLETE);
+		if (req->rq_status != 0)
+			rc = req->rq_status;
+	}
+
+	if (set->set_interpret != NULL) {
+		int (*interpreter)(struct ptlrpc_request_set *set,void *,int) =
+			set->set_interpret;
+		rc = interpreter (set, set->set_arg, rc);
+	} else {
+		struct ptlrpc_set_cbdata *cbdata, *n;
+		int err;
+
+		list_for_each_entry_safe(cbdata, n,
+					 &set->set_cblist, psc_item) {
+			list_del_init(&cbdata->psc_item);
+			err = cbdata->psc_interpret(set, cbdata->psc_data, rc);
+			if (err && !rc)
+				rc = err;
+			OBD_FREE_PTR(cbdata);
+		}
+	}
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_set_wait);
+
+/**
+ * Helper function for request freeing.
+ * Called when request count reached zero and request needs to be freed.
+ * Asserts the request is off the sending, set and export lists, removes it
+ * from the import replay list and frees reply/request buffers if present.
+ * If \a locked is set, that means caller is already holding import imp_lock
+ * and so we no longer need to reobtain it (for certain lists manipulations)
+ */
+static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
+{
+	ENTRY;
+	if (request == NULL) {
+		EXIT;
+		return;
+	}
+
+	LASSERTF(!request->rq_receiving_reply, "req %p\n", request);
+	LASSERTF(request->rq_rqbd == NULL, "req %p\n",request);/* client-side */
+	LASSERTF(list_empty(&request->rq_list), "req %p\n", request);
+	LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request);
+	LASSERTF(list_empty(&request->rq_exp_list), "req %p\n", request);
+	LASSERTF(!request->rq_replay, "req %p\n", request);
+
+	req_capsule_fini(&request->rq_pill);
+
+	/* We must take it off the imp_replay_list first.  Otherwise, we'll set
+	 * request->rq_reqmsg to NULL while osc_close is dereferencing it. */
+	if (request->rq_import != NULL) {
+		if (!locked)
+			spin_lock(&request->rq_import->imp_lock);
+		list_del_init(&request->rq_replay_list);
+		if (!locked)
+			spin_unlock(&request->rq_import->imp_lock);
+	}
+	LASSERTF(list_empty(&request->rq_replay_list), "req %p\n", request);
+
+	if (atomic_read(&request->rq_refcount) != 0) {
+		DEBUG_REQ(D_ERROR, request,
+			  "freeing request with nonzero refcount");
+		LBUG();
+	}
+
+	if (request->rq_repbuf != NULL)
+		sptlrpc_cli_free_repbuf(request);
+	if (request->rq_export != NULL) {
+		class_export_put(request->rq_export);
+		request->rq_export = NULL;
+	}
+	if (request->rq_import != NULL) {
+		class_import_put(request->rq_import);
+		request->rq_import = NULL;
+	}
+	if (request->rq_bulk != NULL)
+		ptlrpc_free_bulk_pin(request->rq_bulk);
+
+	if (request->rq_reqbuf != NULL || request->rq_clrbuf != NULL)
+		sptlrpc_cli_free_reqbuf(request);
+
+	if (request->rq_cli_ctx)
+		sptlrpc_req_put_ctx(request, !locked);
+
+	if (request->rq_pool)
+		__ptlrpc_free_req_to_pool(request);
+	else
+		OBD_FREE(request, sizeof(*request));
+	EXIT;
+}
+
+static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked);
+/**
+ * Drop one request reference; the request is freed when the count hits zero.
+ * Import imp_lock must be held (assert_spin_locked() also works on UP builds).
+ */
+void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request)
+{
+	assert_spin_locked(&request->rq_import->imp_lock);
+	(void)__ptlrpc_req_finished(request, 1);
+}
+EXPORT_SYMBOL(ptlrpc_req_finished_with_imp_lock);
+
+/**
+ * Helper function: drops one reference on \a request and frees it when the
+ * count reaches zero. \a locked set indicates that caller holds import
+ * imp_lock. Returns 1 if \a request is NULL or was freed by this call,
+ * 0 if other references remain.
+ */
+static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked)
+{
+	ENTRY;
+	if (request == NULL)
+		RETURN(1);
+
+	if (request == LP_POISON ||
+	    request->rq_reqmsg == LP_POISON) {
+		CERROR("dereferencing freed request (bug 575)\n");
+		LBUG();
+		RETURN(1);
+	}
+
+	DEBUG_REQ(D_INFO, request, "refcount now %u",
+		  atomic_read(&request->rq_refcount) - 1);
+
+	if (atomic_dec_and_test(&request->rq_refcount)) {
+		__ptlrpc_free_req(request, locked);
+		RETURN(1);
+	}
+
+	RETURN(0);
+}
+
+/**
+ * Drops one reference for \a request; for callers not holding imp_lock.
+ */
+void ptlrpc_req_finished(struct ptlrpc_request *request)
+{
+	__ptlrpc_req_finished(request, 0);
+}
+EXPORT_SYMBOL(ptlrpc_req_finished);
+
+/**
+ * Returns the xid (rq_xid) of \a request.
+ */
+__u64 ptlrpc_req_xid(struct ptlrpc_request *request)
+{
+	return request->rq_xid;
+}
+EXPORT_SYMBOL(ptlrpc_req_xid);
+
+/**
+ * Disengage the client's reply buffer from the network
+ * NB does _NOT_ unregister any client-side bulk.
+ * IDEMPOTENT, but _not_ safe against concurrent callers.
+ * The request owner (i.e. the thread doing the I/O) must call...
+ * Returns 1 once the reply is unlinked, or 0 if \a async and still pending.
+ */
+int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async)
+{
+	int		rc;
+	wait_queue_head_t       *wq;
+	struct l_wait_info lwi;
+
+	/*
+	 * Might sleep.
+	 */
+	LASSERT(!in_interrupt());
+
+	/*
+	 * Let's setup deadline for reply unlink.
+	 */
+	if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+	    async && request->rq_reply_deadline == 0)
+		request->rq_reply_deadline = cfs_time_current_sec()+LONG_UNLINK;
+
+	/*
+	 * Nothing left to do.
+	 */
+	if (!ptlrpc_client_recv_or_unlink(request))
+		RETURN(1);
+
+	LNetMDUnlink(request->rq_reply_md_h);
+
+	/*
+	 * Let's check it once again.
+	 */
+	if (!ptlrpc_client_recv_or_unlink(request))
+		RETURN(1);
+
+	/*
+	 * Move to "Unregistering" phase as reply was not unlinked yet.
+	 */
+	ptlrpc_rqphase_move(request, RQ_PHASE_UNREGISTERING);
+
+	/*
+	 * Do not wait for unlink to finish.
+	 */
+	if (async)
+		RETURN(0);
+
+	/*
+	 * We have to l_wait_event() whatever the result, to give liblustre
+	 * a chance to run reply_in_callback(), and to make sure we've
+	 * unlinked before returning a req to the pool.
+	 */
+	if (request->rq_set != NULL)
+		wq = &request->rq_set->set_waitq;
+	else
+		wq = &request->rq_reply_waitq;
+
+	for (;;) {
+		/* Network access will complete in finite time but the HUGE
+		 * timeout lets us CWARN for visibility of sluggish NALs */
+		lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK),
+					   cfs_time_seconds(1), NULL, NULL);
+		rc = l_wait_event(*wq, !ptlrpc_client_recv_or_unlink(request),
+				  &lwi);
+		if (rc == 0) {
+			ptlrpc_rqphase_move(request, request->rq_next_phase);
+			RETURN(1);
+		}
+
+		LASSERT(rc == -ETIMEDOUT);
+		DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout "
+			  "rvcng=%d unlnk=%d", request->rq_receiving_reply,
+			  request->rq_must_unlink);
+	}
+	RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_unregister_reply);
+
+/**
+ * Iterates through the import's replay list and drops the list's reference
+ * on every request that is from an older import generation, or that has
+ * rq_replay clear and a transno not above the peer's last committed one.
+ * Since requests are sorted in transno order, stops when meeting the first
+ * uncommitted transno. Skips the scan if nothing changed since last check.
+ * Caller must hold imp->imp_lock (asserted).
+ */
+void ptlrpc_free_committed(struct obd_import *imp)
+{
+	struct list_head *tmp, *saved;
+	struct ptlrpc_request *req;
+	struct ptlrpc_request *last_req = NULL; /* temporary fire escape */
+	ENTRY;
+
+	LASSERT(imp != NULL);
+
+	assert_spin_locked(&imp->imp_lock); /* spin_is_locked() is 0 on UP */
+
+
+	if (imp->imp_peer_committed_transno == imp->imp_last_transno_checked &&
+	    imp->imp_generation == imp->imp_last_generation_checked) {
+		CDEBUG(D_INFO, "%s: skip recheck: last_committed "LPU64"\n",
+		       imp->imp_obd->obd_name, imp->imp_peer_committed_transno);
+		EXIT;
+		return;
+	}
+	CDEBUG(D_RPCTRACE, "%s: committing for last_committed "LPU64" gen %d\n",
+	       imp->imp_obd->obd_name, imp->imp_peer_committed_transno,
+	       imp->imp_generation);
+	imp->imp_last_transno_checked = imp->imp_peer_committed_transno;
+	imp->imp_last_generation_checked = imp->imp_generation;
+
+	list_for_each_safe(tmp, saved, &imp->imp_replay_list) {
+		req = list_entry(tmp, struct ptlrpc_request,
+				     rq_replay_list);
+
+		/* XXX ok to remove when 1357 resolved - rread 05/29/03  */
+		LASSERT(req != last_req);
+		last_req = req;
+
+		if (req->rq_transno == 0) {
+			DEBUG_REQ(D_EMERG, req, "zero transno during replay");
+			LBUG();
+		}
+		if (req->rq_import_generation < imp->imp_generation) {
+			DEBUG_REQ(D_RPCTRACE, req, "free request with old gen");
+			GOTO(free_req, 0);
+		}
+
+		if (req->rq_replay) {
+			DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)");
+			continue;
+		}
+
+		/* not yet committed */
+		if (req->rq_transno > imp->imp_peer_committed_transno) {
+			DEBUG_REQ(D_RPCTRACE, req, "stopping search");
+			break;
+		}
+
+		DEBUG_REQ(D_INFO, req, "commit (last_committed "LPU64")",
+			  imp->imp_peer_committed_transno);
+free_req:
+		spin_lock(&req->rq_lock);
+		req->rq_replay = 0;
+		spin_unlock(&req->rq_lock);
+		if (req->rq_commit_cb != NULL)
+			req->rq_commit_cb(req);
+		list_del_init(&req->rq_replay_list);
+		__ptlrpc_req_finished(req, 1);
+	}
+
+	EXIT;
+	return;
+}
+
+void ptlrpc_cleanup_client(struct obd_import *imp)
+{
+	/* Nothing to do besides the ENTRY/EXIT macros; \a imp is unused. */
+	ENTRY;
+	EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_cleanup_client);
+
+/**
+ * Schedule previously sent request for resend: flag rq_resend, set status
+ * -EAGAIN and wake the request. For bulk requests we assign new xid (to
+ * avoid problems with lost replies and therefore several transfers landing
+ * into same buffer from different sending attempts).
+ */
+void ptlrpc_resend_req(struct ptlrpc_request *req)
+{
+	DEBUG_REQ(D_HA, req, "going to resend");
+	lustre_msg_set_handle(req->rq_reqmsg, &(struct lustre_handle){ 0 });
+	req->rq_status = -EAGAIN;
+
+	spin_lock(&req->rq_lock);
+	req->rq_resend = 1;
+	req->rq_net_err = 0;
+	req->rq_timedout = 0;
+	if (req->rq_bulk) {
+		__u64 old_xid = req->rq_xid;
+
+		/* ensure previous bulk fails */
+		req->rq_xid = ptlrpc_next_xid();
+		CDEBUG(D_HA, "resend bulk old x"LPU64" new x"LPU64"\n",
+		       old_xid, req->rq_xid);
+	}
+	ptlrpc_client_wake_req(req);
+	spin_unlock(&req->rq_lock);
+}
+EXPORT_SYMBOL(ptlrpc_resend_req);
+
+/* XXX: restart \a req (-ERESTARTSYS, rq_restart); noted as currently unused */
+void ptlrpc_restart_req(struct ptlrpc_request *req)
+{
+	DEBUG_REQ(D_HA, req, "restarting (possibly-)completed request");
+	req->rq_status = -ERESTARTSYS;
+
+	spin_lock(&req->rq_lock);
+	req->rq_restart = 1;
+	req->rq_timedout = 0;
+	ptlrpc_client_wake_req(req);
+	spin_unlock(&req->rq_lock);
+}
+EXPORT_SYMBOL(ptlrpc_restart_req);
+
+/**
+ * Grab an additional reference on \a req and return \a req.
+ */
+struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req)
+{
+	ENTRY;
+	atomic_inc(&req->rq_refcount);
+	RETURN(req);
+}
+EXPORT_SYMBOL(ptlrpc_request_addref);
+
+/**
+ * Add \a req to the replay list of \a imp, sorted by (transno, xid), taking
+ * a reference for the list. Must be called under imp_lock.
+ */
+void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
+				      struct obd_import *imp)
+{
+	struct list_head *tmp;
+
+	assert_spin_locked(&imp->imp_lock); /* spin_is_locked() is 0 on UP */
+
+	if (req->rq_transno == 0) {
+		DEBUG_REQ(D_EMERG, req, "saving request with zero transno");
+		LBUG();
+	}
+
+	/* clear this for new requests that were resent as well
+	   as resent replayed requests. */
+	lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT);
+
+	/* don't re-add requests that have been replayed */
+	if (!list_empty(&req->rq_replay_list))
+		return;
+
+	lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY);
+
+	LASSERT(imp->imp_replayable);
+	/* Balanced in ptlrpc_free_committed, usually. */
+	ptlrpc_request_addref(req);
+	list_for_each_prev(tmp, &imp->imp_replay_list) {
+		struct ptlrpc_request *iter =
+			list_entry(tmp, struct ptlrpc_request,
+				       rq_replay_list);
+
+		/* We may have duplicate transnos if we create and then
+		 * open a file, or for closes retained if to match creating
+		 * opens, so use req->rq_xid as a secondary key.
+		 * (See bugs 684, 685, and 428.)
+		 * XXX no longer needed, but all opens need transnos!
+		 */
+		if (iter->rq_transno > req->rq_transno)
+			continue;
+
+		if (iter->rq_transno == req->rq_transno) {
+			LASSERT(iter->rq_xid != req->rq_xid);
+			if (iter->rq_xid > req->rq_xid)
+				continue;
+		}
+
+		list_add(&req->rq_replay_list, &iter->rq_replay_list);
+		return;
+	}
+
+	list_add(&req->rq_replay_list, &imp->imp_replay_list);
+}
+EXPORT_SYMBOL(ptlrpc_retain_replayable_request);
+
+/**
+ * Send request and wait until it completes, using a temporary request set.
+ * Returns request processing status, or -ENOMEM if no set can be allocated.
+ */
+int ptlrpc_queue_wait(struct ptlrpc_request *req)
+{
+	struct ptlrpc_request_set *set;
+	int rc;
+	ENTRY;
+
+	LASSERT(req->rq_set == NULL);
+	LASSERT(!req->rq_receiving_reply);
+
+	set = ptlrpc_prep_set();
+	if (set == NULL) {
+		CERROR("Unable to allocate ptlrpc set.\n");
+		RETURN(-ENOMEM);
+	}
+
+	/* for distributed debugging */
+	lustre_msg_set_status(req->rq_reqmsg, current_pid());
+
+	/* add a ref for the set (see comment in ptlrpc_set_add_req) */
+	ptlrpc_request_addref(req);
+	ptlrpc_set_add_req(set, req);
+	rc = ptlrpc_set_wait(set);
+	ptlrpc_set_destroy(set);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_queue_wait);
+
+struct ptlrpc_replay_async_args {
+	int praa_old_state;	/* rq_send_state saved before replay */
+	int praa_old_status;	/* reply status saved before replay, else 0 */
+};
+
+/**
+ * Callback used for replayed requests reply processing.
+ * In case of successful reply calls registered request replay callback.
+ * In case of error restart replay process.
+ */
+static int ptlrpc_replay_interpret(const struct lu_env *env,
+				   struct ptlrpc_request *req,
+				   void * data, int rc)
+{
+	struct ptlrpc_replay_async_args *aa = data;
+	struct obd_import *imp = req->rq_import;
+
+	ENTRY;
+	atomic_dec(&imp->imp_replay_inflight);
+
+	if (!ptlrpc_client_replied(req)) {
+		CERROR("request replay timed out, restarting recovery\n");
+		GOTO(out, rc = -ETIMEDOUT);
+	}
+
+	if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR &&
+	    (lustre_msg_get_status(req->rq_repmsg) == -ENOTCONN ||
+	     lustre_msg_get_status(req->rq_repmsg) == -ENODEV))
+		GOTO(out, rc = lustre_msg_get_status(req->rq_repmsg));
+
+	/** VBR: check version failure */
+	if (lustre_msg_get_status(req->rq_repmsg) == -EOVERFLOW) {
+		/** replay failed due to version mismatch */
+		DEBUG_REQ(D_WARNING, req, "Version mismatch during replay\n");
+		spin_lock(&imp->imp_lock);
+		imp->imp_vbr_failed = 1;
+		imp->imp_no_lock_replay = 1;
+		spin_unlock(&imp->imp_lock);
+		lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status);
+	} else {
+		/** The transno had better not change over replay. */
+		LASSERTF(lustre_msg_get_transno(req->rq_reqmsg) ==
+			 lustre_msg_get_transno(req->rq_repmsg) ||
+			 lustre_msg_get_transno(req->rq_repmsg) == 0,
+			 LPX64"/"LPX64"\n",
+			 lustre_msg_get_transno(req->rq_reqmsg),
+			 lustre_msg_get_transno(req->rq_repmsg));
+	}
+
+	spin_lock(&imp->imp_lock);
+	/** if server replayed by version a gap occurred: don't trust locks */
+	if (lustre_msg_get_flags(req->rq_repmsg) & MSG_VERSION_REPLAY)
+		imp->imp_no_lock_replay = 1;
+	imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg);
+	spin_unlock(&imp->imp_lock);
+	LASSERT(imp->imp_last_replay_transno);
+
+	/* transaction number shouldn't be bigger than the latest replayed */
+	if (req->rq_transno > lustre_msg_get_transno(req->rq_reqmsg)) {
+		DEBUG_REQ(D_ERROR, req,
+			  "Reported transno "LPU64" is bigger than the "
+			  "replayed one: "LPU64, req->rq_transno,
+			  lustre_msg_get_transno(req->rq_reqmsg));
+		GOTO(out, rc = -EINVAL);
+	}
+
+	DEBUG_REQ(D_HA, req, "got rep");
+
+	/* let the callback do fixups, possibly including in the request */
+	if (req->rq_replay_cb)
+		req->rq_replay_cb(req);
+
+	if (ptlrpc_client_replied(req) &&
+	    lustre_msg_get_status(req->rq_repmsg) != aa->praa_old_status) {
+		DEBUG_REQ(D_ERROR, req, "status %d, old was %d",
+			  lustre_msg_get_status(req->rq_repmsg),
+			  aa->praa_old_status);
+	} else {
+		/* Put it back for re-replay. */
+		lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status);
+	}
+
+	/*
+	 * Errors while replay can set transno to 0, but
+	 * imp_last_replay_transno shouldn't be set to 0 anyway
+	 */
+	if (req->rq_transno == 0)
+		CERROR("Transno is 0 during replay!\n");
+
+	/* continue with recovery */
+	rc = ptlrpc_import_recovery_state_machine(imp);
+ out:
+	req->rq_send_state = aa->praa_old_state;
+
+	if (rc != 0)
+		/* this replay failed, so restart recovery */
+		ptlrpc_connect_import(imp);
+
+	RETURN(rc);
+}
+
+/**
+ * Prepares and queues request for replay; import must be in REPLAY state.
+ * Adds it to ptlrpcd queue for actual sending.
+ * Returns 0 on success.
+ */
+int ptlrpc_replay_req(struct ptlrpc_request *req)
+{
+	struct ptlrpc_replay_async_args *aa;
+	ENTRY;
+
+	LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY);
+
+	LASSERT (sizeof (*aa) <= sizeof (req->rq_async_args));
+	aa = ptlrpc_req_async_args(req);
+	memset(aa, 0, sizeof *aa);
+
+	/* Prepare request to be resent with ptlrpcd */
+	aa->praa_old_state = req->rq_send_state;
+	req->rq_send_state = LUSTRE_IMP_REPLAY;
+	req->rq_phase = RQ_PHASE_NEW;
+	req->rq_next_phase = RQ_PHASE_UNDEFINED;
+	if (req->rq_repmsg)
+		aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg);
+	req->rq_status = 0;
+	req->rq_interpret_reply = ptlrpc_replay_interpret;
+	/* Readjust the timeout for current conditions */
+	ptlrpc_at_set_req_timeout(req);
+
+	/* Tell server the net_latency, so the server can calculate how long
+	 * it should wait for next replay */
+	lustre_msg_set_service_time(req->rq_reqmsg,
+				    ptlrpc_at_get_net_latency(req));
+	DEBUG_REQ(D_HA, req, "REPLAY");
+
+	atomic_inc(&req->rq_import->imp_replay_inflight);
+	ptlrpc_request_addref(req); /* ptlrpcd needs a ref */
+
+	ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
+	RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_replay_req);
+
+/**
+ * Fail with -EIO old-generation requests on \a imp sending/delayed lists
+ */
+void ptlrpc_abort_inflight(struct obd_import *imp)
+{
+	struct list_head *tmp, *n;
+	ENTRY;
+
+	/* Make sure that no new requests get processed for this import.
+	 * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing
+	 * this flag and then putting requests on sending_list or delayed_list.
+	 */
+	spin_lock(&imp->imp_lock);
+
+	/* XXX locking?  Maybe we should remove each request with the list
+	 * locked?  Also, how do we know if the requests on the list are
+	 * being freed at this time?
+	 */
+	list_for_each_safe(tmp, n, &imp->imp_sending_list) {
+		struct ptlrpc_request *req =
+			list_entry(tmp, struct ptlrpc_request, rq_list);
+
+		DEBUG_REQ(D_RPCTRACE, req, "inflight");
+
+		spin_lock(&req->rq_lock);
+		if (req->rq_import_generation < imp->imp_generation) {
+			req->rq_err = 1;
+			req->rq_status = -EIO;
+			ptlrpc_client_wake_req(req);
+		}
+		spin_unlock(&req->rq_lock);
+	}
+
+	list_for_each_safe(tmp, n, &imp->imp_delayed_list) {
+		struct ptlrpc_request *req =
+			list_entry(tmp, struct ptlrpc_request, rq_list);
+
+		DEBUG_REQ(D_RPCTRACE, req, "aborting waiting req");
+
+		spin_lock(&req->rq_lock);
+		if (req->rq_import_generation < imp->imp_generation) {
+			req->rq_err = 1;
+			req->rq_status = -EIO;
+			ptlrpc_client_wake_req(req);
+		}
+		spin_unlock(&req->rq_lock);
+	}
+
+	/* Last chance to free reqs left on the replay list, but we
+	 * will still leak reqs that haven't committed.  */
+	if (imp->imp_replayable)
+		ptlrpc_free_committed(imp);
+
+	spin_unlock(&imp->imp_lock);
+
+	EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_abort_inflight);
+
+/**
+ * Abort all requests of set \a set that are still in the RPC phase;
+ * requests in any other phase are left untouched.
+ */
+void ptlrpc_abort_set(struct ptlrpc_request_set *set)
+{
+	struct ptlrpc_request *req, *next;
+
+	LASSERT(set != NULL);
+
+	list_for_each_entry_safe(req, next, &set->set_requests, rq_set_chain) {
+		int in_rpc_phase;
+
+		/* rq_phase is sampled and the flags are set under rq_lock */
+		spin_lock(&req->rq_lock);
+		in_rpc_phase = (req->rq_phase == RQ_PHASE_RPC);
+		if (in_rpc_phase) {
+			/* Mark the request failed with -EINTR, then hand it
+			 * to ptlrpc_client_wake_req(). */
+			req->rq_err = 1;
+			req->rq_status = -EINTR;
+			ptlrpc_client_wake_req(req);
+		}
+		spin_unlock(&req->rq_lock);
+	}
+}
+
+static __u64 ptlrpc_last_xid;
+static spinlock_t ptlrpc_last_xid_lock;
+
+/**
+ * Initialize the XID for the node.  This is common among all requests on
+ * this node, and only requires the property that it is monotonically
+ * increasing.  It does not need to be sequential.  Since this is also used
+ * as the RDMA match bits, it is important that a single client NOT have
+ * the same match bits for two different in-flight requests, hence we do
+ * NOT want to have an XID per target or similar.
+ *
+ * To avoid an unlikely collision between match bits after a client reboot
+ * (which would deliver old data into the wrong RDMA buffer) initialize
+ * the XID based on the current time, assuming a maximum RPC rate of 1M RPC/s.
+ * If the time is clearly incorrect, we instead use a 62-bit random number.
+ * In the worst case the random number will overflow 1M RPCs per second in
+ * 9133 years, or permutations thereof.
+ */
+#define YEAR_2004 (1ULL << 30)
+void ptlrpc_init_xid(void)
+{
+	time_t now = cfs_time_current_sec();
+
+	spin_lock_init(&ptlrpc_last_xid_lock);
+	if (now < YEAR_2004) {
+		cfs_get_random_bytes(&ptlrpc_last_xid, sizeof(ptlrpc_last_xid));
+		ptlrpc_last_xid >>= 2;
+		ptlrpc_last_xid |= (1ULL << 61);
+	} else {
+		ptlrpc_last_xid = (__u64)now << 20;
+	}
+
+	/* Need to always be aligned to a power-of-two for multi-bulk BRW */
+	CLASSERT((PTLRPC_BULK_OPS_COUNT & (PTLRPC_BULK_OPS_COUNT - 1)) == 0);
+	ptlrpc_last_xid &= PTLRPC_BULK_OPS_MASK;
+}
+
+/**
+ * Advance the global xid and return the resulting new value to the caller.
+ *
+ * Multi-bulk BRW RPCs consume multiple XIDs for each bulk transfer, starting
+ * at the returned xid, up to xid + PTLRPC_BULK_OPS_COUNT - 1. The BRW RPC
+ * itself uses the last bulk xid needed, so the server can determine the
+ * number of bulk transfers from the RPC XID and a bitmask.  The starting
+ * xid must align to a power-of-two value.
+ *
+ * The alignment holds because ptlrpc_init_xid() masks the initial value with
+ * PTLRPC_BULK_OPS_MASK and every step adds PTLRPC_BULK_OPS_COUNT. LU-1431
+ */
+__u64 ptlrpc_next_xid(void)
+{
+	__u64 next;
+
+	spin_lock(&ptlrpc_last_xid_lock);
+	next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;	/* step over one bulk range */
+	ptlrpc_last_xid = next;
+	spin_unlock(&ptlrpc_last_xid_lock);
+
+	return next;
+}
+EXPORT_SYMBOL(ptlrpc_next_xid);
+
+/**
+ * Peek at the xid that the next ptlrpc_next_xid() call would return, without
+ * consuming it.  The result is only a hint: another caller may take it first.
+ */
+__u64 ptlrpc_sample_next_xid(void)
+{
+#if BITS_PER_LONG == 32
+	/* need to avoid possible word tearing on 32-bit systems */
+	__u64 next;
+
+	spin_lock(&ptlrpc_last_xid_lock);
+	next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
+	spin_unlock(&ptlrpc_last_xid_lock);
+
+	return next;
+#else
+	/* No need to lock, since returned value is racy anyways */
+	return ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
+#endif
+}
+EXPORT_SYMBOL(ptlrpc_sample_next_xid);
+
+/**
+ * Functions for operating ptlrpc workers.
+ *
+ * A ptlrpc work is a function which will be running inside ptlrpc context.
+ * The callback shouldn't sleep otherwise it will block that ptlrpcd thread.
+ *
+ * 1. after a work is created, it can be used many times, that is:
+ *	 handler = ptlrpcd_alloc_work();
+ *	 ptlrpcd_queue_work();
+ *
+ *    queue it again when necessary:
+ *	 ptlrpcd_queue_work();
+ *	 ptlrpcd_destroy_work();
+ * 2. ptlrpcd_queue_work() can be called by multiple processes concurrently,
+ *    but the work is only queued once at any time. Also as its name implies,
+ *    there may be a delay before it is really run by a ptlrpcd thread.
+ */
+struct ptlrpc_work_async_args {
+	__u64   magic;	/* PTLRPC_WORK_MAGIC; asserted by work_interpreter() */
+	int   (*cb)(const struct lu_env *, void *);	/* the work function */
+	void   *cbdata;	/* opaque argument passed to cb */
+};
+
+#define PTLRPC_WORK_MAGIC 0x6655436b676f4f44ULL /* magic code */
+
+static int work_interpreter(const struct lu_env *env,
+			    struct ptlrpc_request *req, void *data, int rc)
+{
+	struct ptlrpc_work_async_args *arg = data;
+
+	LASSERT(arg->magic == PTLRPC_WORK_MAGIC); /* data must be a work's args */
+	LASSERT(arg->cb != NULL);
+
+	return arg->cb(env, arg->cbdata); /* req and rc are not used here */
+}
+
+/**
+ * Create a work for ptlrpc: a bare request whose interpreter runs \a cb with
+ * \a cbdata. Returns it as an opaque handle, or ERR_PTR(-EINVAL / -ENOMEM). */
+void *ptlrpcd_alloc_work(struct obd_import *imp,
+			 int (*cb)(const struct lu_env *, void *), void *cbdata)
+{
+	struct ptlrpc_request	 *req = NULL;
+	struct ptlrpc_work_async_args *args;
+	ENTRY;
+
+	might_sleep();
+
+	if (cb == NULL)
+		RETURN(ERR_PTR(-EINVAL));
+
+	/* copy some code from deprecated fakereq. */
+	OBD_ALLOC_PTR(req);
+	if (req == NULL) {
+		CERROR("ptlrpc: run out of memory!\n");
+		RETURN(ERR_PTR(-ENOMEM));
+	}
+
+	req->rq_send_state = LUSTRE_IMP_FULL;
+	req->rq_type = PTL_RPC_MSG_REQUEST;
+	req->rq_import = class_import_get(imp);
+	req->rq_export = NULL;
+	req->rq_interpret_reply = work_interpreter; /* dispatches to cb */
+	/* don't want reply */
+	req->rq_receiving_reply = 0;
+	req->rq_must_unlink = 0;
+	req->rq_no_delay = req->rq_no_resend = 1;
+
+	spin_lock_init(&req->rq_lock);
+	INIT_LIST_HEAD(&req->rq_list);
+	INIT_LIST_HEAD(&req->rq_replay_list);
+	INIT_LIST_HEAD(&req->rq_set_chain);
+	INIT_LIST_HEAD(&req->rq_history_list);
+	INIT_LIST_HEAD(&req->rq_exp_list);
+	init_waitqueue_head(&req->rq_reply_waitq);
+	init_waitqueue_head(&req->rq_set_waitq);
+	atomic_set(&req->rq_refcount, 1); /* ptlrpcd_queue_work() treats 1 as idle */
+
+	CLASSERT (sizeof(*args) <= sizeof(req->rq_async_args));
+	args = ptlrpc_req_async_args(req);
+	args->magic  = PTLRPC_WORK_MAGIC;
+	args->cb     = cb;
+	args->cbdata = cbdata;
+
+	RETURN(req);
+}
+EXPORT_SYMBOL(ptlrpcd_alloc_work);
+
+void ptlrpcd_destroy_work(void *handler)
+{
+	struct ptlrpc_request *req = handler; /* handle from ptlrpcd_alloc_work() */
+
+	if (req)	/* a NULL handle is accepted and ignored */
+		ptlrpc_req_finished(req);
+}
+EXPORT_SYMBOL(ptlrpcd_destroy_work);
+
+int ptlrpcd_queue_work(void *handler)
+{
+	struct ptlrpc_request *req = handler;
+
+	/*
+	 * Check if the req is already being queued.
+	 *
+	 * Trick: ptlrpc has no reliable way to tell whether a req is being
+	 * processed, so the refcount is used: 1 means idle, more than 1
+	 * means already queued.  This is okay because the caller should
+	 * use this req as opaque data. - Jinshan
+	 */
+	LASSERT(atomic_read(&req->rq_refcount) > 0);
+	if (atomic_read(&req->rq_refcount) > 1)
+		return -EBUSY;
+
+	if (atomic_inc_return(&req->rq_refcount) > 2) { /* race */
+		atomic_dec(&req->rq_refcount);	/* undo: someone else queued it */
+		return -EBUSY;
+	}
+
+	/* re-initialize the req */
+	req->rq_timeout	= obd_timeout;
+	req->rq_sent	   = cfs_time_current_sec();
+	req->rq_deadline       = req->rq_sent + req->rq_timeout;
+	req->rq_reply_deadline = req->rq_deadline;
+	req->rq_phase	  = RQ_PHASE_INTERPRET;
+	req->rq_next_phase     = RQ_PHASE_COMPLETE;
+	req->rq_xid	    = ptlrpc_next_xid();
+	req->rq_import_generation = req->rq_import->imp_generation;
+
+	ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+	return 0;
+}
+EXPORT_SYMBOL(ptlrpcd_queue_work);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/connection.c b/drivers/staging/lustre/lustre/ptlrpc/connection.c
new file mode 100644
index 000000000000..a0757f372be5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/connection.c
@@ -0,0 +1,248 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+
+#include "ptlrpc_internal.h"
+
+static cfs_hash_t *conn_hash;		/* peer id -> connection cache */
+static cfs_hash_ops_t conn_hash_ops;	/* tentative; initialised at end of file */
+
+struct ptlrpc_connection *
+ptlrpc_connection_get(lnet_process_id_t peer, lnet_nid_t self,
+		      struct obd_uuid *uuid)
+{
+	struct ptlrpc_connection *conn, *conn2;
+	ENTRY;
+
+	conn = cfs_hash_lookup(conn_hash, &peer);	/* already cached? */
+	if (conn)
+		GOTO(out, conn);
+
+	OBD_ALLOC_PTR(conn);
+	if (!conn)
+		RETURN(NULL);
+
+	conn->c_peer = peer;
+	conn->c_self = self;
+	INIT_HLIST_NODE(&conn->c_hash);
+	atomic_set(&conn->c_refcount, 1);
+	if (uuid)	/* uuid is optional */
+		obd_str2uuid(&conn->c_remote_uuid, uuid->uuid);
+
+	/*
+	 * Add the newly created conn to the hash, on key collision we
+	 * lost a racing addition and must destroy our newly allocated
+	 * connection.  The object which exists in the hash will be
+	 * returned and may be compared against our object.
+	 */
+	/* In the function below, .hs_keycmp resolves to
+	 * conn_keycmp() */
+	/* coverity[overrun-buffer-val] */
+	conn2 = cfs_hash_findadd_unique(conn_hash, &peer, &conn->c_hash);
+	if (conn != conn2) {
+		OBD_FREE_PTR(conn);
+		conn = conn2;
+	}
+	EXIT;
+out:
+	CDEBUG(D_INFO, "conn=%p refcount %d to %s\n",
+	       conn, atomic_read(&conn->c_refcount),
+	       libcfs_nid2str(conn->c_peer.nid));
+	return conn;
+}
+EXPORT_SYMBOL(ptlrpc_connection_get);
+
+int ptlrpc_connection_put(struct ptlrpc_connection *conn)
+{
+	int rc = 0;
+	ENTRY;
+
+	if (!conn)	/* NULL is accepted and reported as 0 */
+		RETURN(rc);
+
+	LASSERT(atomic_read(&conn->c_refcount) > 1);
+
+	/*
+	 * We do not remove connection from hashtable and
+	 * do not free it even if last caller released ref,
+	 * as we want to have it cached for the case it is
+	 * needed again.
+	 *
+	 * Deallocating it and later creating new connection
+	 * again would be wasteful. This way we also avoid
+	 * expensive locking to protect things from get/put
+	 * race when found cached connection is freed by
+	 * ptlrpc_connection_put().
+	 *
+	 * It will be freed later in module unload time,
+	 * when ptlrpc_connection_fini()->lh_exit->conn_exit()
+	 * path is called.
+	 */
+	if (atomic_dec_return(&conn->c_refcount) == 1)
+		rc = 1;	/* tell the caller the count fell to 1 */
+
+	CDEBUG(D_INFO, "PUT conn=%p refcount %d to %s\n",
+	       conn, atomic_read(&conn->c_refcount),
+	       libcfs_nid2str(conn->c_peer.nid));
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_connection_put);
+
+struct ptlrpc_connection *
+ptlrpc_connection_addref(struct ptlrpc_connection *conn)
+{
+	ENTRY;
+
+	atomic_inc(&conn->c_refcount);	/* conn is dereferenced: must not be NULL */
+	CDEBUG(D_INFO, "conn=%p refcount %d to %s\n",
+	       conn, atomic_read(&conn->c_refcount),
+	       libcfs_nid2str(conn->c_peer.nid));
+
+	RETURN(conn);	/* same pointer, returned for call chaining */
+}
+EXPORT_SYMBOL(ptlrpc_connection_addref);
+
+int ptlrpc_connection_init(void)
+{
+	ENTRY;
+
+	conn_hash = cfs_hash_create("CONN_HASH",
+				    HASH_CONN_CUR_BITS,
+				    HASH_CONN_MAX_BITS,
+				    HASH_CONN_BKT_BITS, 0,
+				    CFS_HASH_MIN_THETA,
+				    CFS_HASH_MAX_THETA,
+				    &conn_hash_ops, CFS_HASH_DEFAULT);
+	if (!conn_hash)	/* creation failure is reported as -ENOMEM */
+		RETURN(-ENOMEM);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_connection_init);
+
+void ptlrpc_connection_fini(void) {
+	ENTRY;
+	cfs_hash_putref(conn_hash);	/* drop the ref from ptlrpc_connection_init() */
+	EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_connection_fini);
+
+/*
+ * Hash operations for net_peer<->connection; the key is a lnet_process_id_t
+ */
+static unsigned
+conn_hashfn(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+	return cfs_hash_djb2_hash(key, sizeof(lnet_process_id_t), mask);
+}
+
+static int
+conn_keycmp(const void *key, struct hlist_node *hnode)
+{
+	struct ptlrpc_connection *conn;
+	const lnet_process_id_t *conn_key;
+
+	LASSERT(key != NULL);
+	conn_key = (lnet_process_id_t*)key;
+	conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+
+	return conn_key->nid == conn->c_peer.nid &&	/* non-zero on a match */
+	       conn_key->pid == conn->c_peer.pid;
+}
+
+static void *
+conn_key(struct hlist_node *hnode)
+{
+	struct ptlrpc_connection *conn;
+	conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+	return &conn->c_peer;	/* the key is the embedded peer id */
+}
+
+static void *
+conn_object(struct hlist_node *hnode)
+{
+	return hlist_entry(hnode, struct ptlrpc_connection, c_hash); /* node -> owner */
+}
+
+static void
+conn_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct ptlrpc_connection *conn;
+
+	conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+	atomic_inc(&conn->c_refcount);	/* hs is not used */
+}
+
+static void
+conn_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct ptlrpc_connection *conn;
+
+	conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+	atomic_dec(&conn->c_refcount);	/* only decrements; never frees here */
+}
+
+static void
+conn_exit(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct ptlrpc_connection *conn;
+
+	conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+	/*
+	 * Nothing should be left. Connection user put it and
+	 * connection also was deleted from table by this time
+	 * so we should have 0 refs.
+	 */
+	LASSERTF(atomic_read(&conn->c_refcount) == 0,
+		 "Busy connection with %d refs\n",
+		 atomic_read(&conn->c_refcount));
+	OBD_FREE_PTR(conn);	/* the only place in this file that frees a hashed conn */
+}
+
+static cfs_hash_ops_t conn_hash_ops = {	/* passed to cfs_hash_create() */
+	.hs_hash	= conn_hashfn,
+	.hs_keycmp      = conn_keycmp,
+	.hs_key	 = conn_key,
+	.hs_object      = conn_object,
+	.hs_get	 = conn_get,
+	.hs_put_locked  = conn_put_locked,
+	.hs_exit	= conn_exit,	/* frees the connection */
+};
diff --git a/drivers/staging/lustre/lustre/ptlrpc/events.c b/drivers/staging/lustre/lustre/ptlrpc/events.c
new file mode 100644
index 000000000000..0264c102cb3e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/events.c
@@ -0,0 +1,595 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+# include <linux/libcfs/libcfs.h>
+# ifdef __mips64__
+#  include <linux/kernel.h>
+# endif
+
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_sec.h>
+#include "ptlrpc_internal.h"
+
+lnet_handle_eq_t   ptlrpc_eq_h;	/* set by ptlrpc_ni_init(), freed by ptlrpc_ni_fini() */
+
+/*
+ *  Client's outgoing request callback; only SEND and UNLINK events expected
+ */
+void request_out_callback(lnet_event_t *ev)
+{
+	struct ptlrpc_cb_id   *cbid = ev->md.user_ptr;
+	struct ptlrpc_request *req = cbid->cbid_arg;
+	ENTRY;
+
+	LASSERT (ev->type == LNET_EVENT_SEND ||
+		 ev->type == LNET_EVENT_UNLINK);
+	LASSERT (ev->unlinked);
+
+	DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status);
+
+	sptlrpc_request_out_callback(req);
+	req->rq_real_sent = cfs_time_current_sec();
+
+	if (ev->type == LNET_EVENT_UNLINK || ev->status != 0) {
+
+		/* Failed send: make it seem like the reply timed out, just
+		 * like failing sends in client.c does currently...  */
+
+		spin_lock(&req->rq_lock);
+		req->rq_net_err = 1;
+		spin_unlock(&req->rq_lock);
+
+		ptlrpc_client_wake_req(req);	/* here called after rq_lock is dropped */
+	}
+
+	ptlrpc_req_finished(req); /* presumably drops the send's ref - confirm */
+
+	EXIT;
+}
+
+/*
+ * Client's incoming reply callback; handles PUT and UNLINK on the reply buffer
+ */
+void reply_in_callback(lnet_event_t *ev)
+{
+	struct ptlrpc_cb_id   *cbid = ev->md.user_ptr;
+	struct ptlrpc_request *req = cbid->cbid_arg;
+	ENTRY;
+
+	DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status);
+
+	LASSERT (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_UNLINK);
+	LASSERT (ev->md.start == req->rq_repbuf);
+	LASSERT (ev->offset + ev->mlength <= req->rq_repbuf_len);
+	/* We've set LNET_MD_MANAGE_REMOTE for all outgoing requests
+	   for adaptive timeouts' early reply. */
+	LASSERT((ev->md.options & LNET_MD_MANAGE_REMOTE) != 0);
+
+	spin_lock(&req->rq_lock);
+
+	req->rq_receiving_reply = 0;
+	req->rq_early = 0;
+	if (ev->unlinked)
+		req->rq_must_unlink = 0;
+
+	if (ev->status)	/* failed event: just wake the request */
+		goto out_wake;
+
+	if (ev->type == LNET_EVENT_UNLINK) {
+		LASSERT(ev->unlinked);
+		DEBUG_REQ(D_NET, req, "unlink");
+		goto out_wake;
+	}
+
+	if (ev->mlength < ev->rlength ) {	/* fewer bytes stored than were sent */
+		CDEBUG(D_RPCTRACE, "truncate req %p rpc %d - %d+%d\n", req,
+		       req->rq_replen, ev->rlength, ev->offset);
+		req->rq_reply_truncate = 1;
+		req->rq_replied = 1;
+		req->rq_status = -EOVERFLOW;
+		req->rq_nob_received = ev->rlength + ev->offset;
+		goto out_wake;
+	}
+
+	if ((ev->offset == 0) &&
+	    ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT))) {
+		/* Early reply */
+		DEBUG_REQ(D_ADAPTTO, req,
+			  "Early reply received: mlen=%u offset=%d replen=%d "
+			  "replied=%d unlinked=%d", ev->mlength, ev->offset,
+			  req->rq_replen, req->rq_replied, ev->unlinked);
+
+		req->rq_early_count++; /* number received, client side */
+
+		if (req->rq_replied)   /* already got the real reply */
+			goto out_wake;
+
+		req->rq_early = 1;
+		req->rq_reply_off = ev->offset;
+		req->rq_nob_received = ev->mlength;
+		/* And we're still receiving */
+		req->rq_receiving_reply = 1;
+	} else {
+		/* Real reply */
+		req->rq_rep_swab_mask = 0;
+		req->rq_replied = 1;
+		req->rq_reply_off = ev->offset;
+		req->rq_nob_received = ev->mlength;
+		/* LNetMDUnlink can't be called under the LNET_LOCK,
+		   so we must unlink in ptlrpc_unregister_reply */
+		DEBUG_REQ(D_INFO, req,
+			  "reply in flags=%x mlen=%u offset=%d replen=%d",
+			  lustre_msg_get_flags(req->rq_reqmsg),
+			  ev->mlength, ev->offset, req->rq_replen);
+	}
+
+	req->rq_import->imp_last_reply_time = cfs_time_current_sec();
+
+out_wake:
+	/* NB don't unlock till after wakeup; req can disappear under us
+	 * since we don't have our own ref */
+	ptlrpc_client_wake_req(req);
+	spin_unlock(&req->rq_lock);
+	EXIT;
+}
+
+/*
+ * Client's bulk has been written/read; called once per bulk MD that unlinks
+ */
+void client_bulk_callback (lnet_event_t *ev)
+{
+	struct ptlrpc_cb_id     *cbid = ev->md.user_ptr;
+	struct ptlrpc_bulk_desc *desc = cbid->cbid_arg;
+	struct ptlrpc_request   *req;
+	ENTRY;
+
+	LASSERT ((desc->bd_type == BULK_PUT_SINK &&
+		  ev->type == LNET_EVENT_PUT) ||
+		 (desc->bd_type == BULK_GET_SOURCE &&
+		  ev->type == LNET_EVENT_GET) ||
+		 ev->type == LNET_EVENT_UNLINK);
+	LASSERT (ev->unlinked);
+
+	if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB, CFS_FAIL_ONCE))
+		ev->status = -EIO;	/* fault injection: pretend the bulk failed */
+
+	if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2,CFS_FAIL_ONCE))
+		ev->status = -EIO;
+
+	CDEBUG((ev->status == 0) ? D_NET : D_ERROR,
+	       "event type %d, status %d, desc %p\n",
+	       ev->type, ev->status, desc);
+
+	spin_lock(&desc->bd_lock);
+	req = desc->bd_req;
+	LASSERT(desc->bd_md_count > 0);
+	desc->bd_md_count--;	/* one fewer MD outstanding for this descriptor */
+
+	if (ev->type != LNET_EVENT_UNLINK && ev->status == 0) {
+		desc->bd_nob_transferred += ev->mlength;
+		desc->bd_sender = ev->sender;
+	} else {
+		/* start reconnect and resend if network error hit */
+		spin_lock(&req->rq_lock);	/* nested inside bd_lock */
+		req->rq_net_err = 1;
+		spin_unlock(&req->rq_lock);
+	}
+
+	if (ev->status != 0)
+		desc->bd_failure = 1;
+
+	/* NB don't unlock till after wakeup; desc can disappear under us
+	 * otherwise */
+	if (desc->bd_md_count == 0)	/* wake only after the last MD */
+		ptlrpc_client_wake_req(desc->bd_req);
+
+	spin_unlock(&desc->bd_lock);
+	EXIT;
+}
+
+/*
+ * We will have a per-CPT request history list for ptlrpc services in upcoming
+ * patches because we don't want to be serialized by the current per-service
+ * history operations. So we require that the history ID can (somehow) show the
+ * arrival order w/o grabbing a global lock, so users can sort in userspace.
+ *
+ * This is how we generate history ID for ptlrpc_request:
+ * ----------------------------------------------------
+ * |  32 bits  |  16 bits  | (16 - X)bits  |  X bits  |
+ * ----------------------------------------------------
+ * |  seconds  | usec / 16 |   sequence    | CPT id   |
+ * ----------------------------------------------------
+ *
+ * it might not be precise but should be good enough.
+ */
+
+#define REQS_CPT_BITS(svcpt)	((svcpt)->scp_service->srv_cpt_bits)
+
+#define REQS_SEC_SHIFT		32	/* seconds go in bits 32..63 */
+#define REQS_USEC_SHIFT		16	/* usec / 16 goes in bits 16..31 */
+#define REQS_SEQ_SHIFT(svcpt)	REQS_CPT_BITS(svcpt) /* sequence sits above the CPT id */
+
+static void ptlrpc_req_add_history(struct ptlrpc_service_part *svcpt,
+				   struct ptlrpc_request *req)
+{
+	__u64	sec = req->rq_arrival_time.tv_sec;
+	__u32	usec = req->rq_arrival_time.tv_usec >> 4; /* usec / 16 */
+	__u64	new_seq;
+
+	/* set the sequence ID for the request and add it to the history
+	 * list; must be called with svcpt::scp_lock held */
+
+	new_seq = (sec << REQS_SEC_SHIFT) |
+		  (usec << REQS_USEC_SHIFT) |
+		  (svcpt->scp_cpt < 0 ? 0 : svcpt->scp_cpt); /* negative cpt -> 0 */
+
+	if (new_seq > svcpt->scp_hist_seq) {
+		/* This handles the initial case of scp_hist_seq == 0 or
+		 * we just jumped into a new time window */
+		svcpt->scp_hist_seq = new_seq;
+	} else {
+		LASSERT(REQS_SEQ_SHIFT(svcpt) < REQS_USEC_SHIFT);
+		/* NB: increase sequence number in current usec bucket,
+		 * however, it's possible that we used up all bits for
+		 * sequence and jumped into the next usec bucket (future time),
+		 * then we hope there will be less RPCs per bucket at some
+		 * point, and sequence will catch up again */
+		svcpt->scp_hist_seq += (1U << REQS_SEQ_SHIFT(svcpt));
+		new_seq = svcpt->scp_hist_seq;
+	}
+
+	req->rq_history_seq = new_seq;
+
+	list_add_tail(&req->rq_history_list, &svcpt->scp_hist_reqs);
+}
+
+/*
+ * Server's incoming request callback; queues a request on scp_req_incoming
+ */
+void request_in_callback(lnet_event_t *ev)
+{
+	struct ptlrpc_cb_id		  *cbid = ev->md.user_ptr;
+	struct ptlrpc_request_buffer_desc *rqbd = cbid->cbid_arg;
+	struct ptlrpc_service_part	  *svcpt = rqbd->rqbd_svcpt;
+	struct ptlrpc_service	     *service = svcpt->scp_service;
+	struct ptlrpc_request	     *req;
+	ENTRY;
+
+	LASSERT (ev->type == LNET_EVENT_PUT ||
+		 ev->type == LNET_EVENT_UNLINK);
+	LASSERT ((char *)ev->md.start >= rqbd->rqbd_buffer);
+	LASSERT ((char *)ev->md.start + ev->offset + ev->mlength <=
+		 rqbd->rqbd_buffer + service->srv_buf_size);
+
+	CDEBUG((ev->status == 0) ? D_NET : D_ERROR,
+	       "event type %d, status %d, service %s\n",
+	       ev->type, ev->status, service->srv_name);
+
+	if (ev->unlinked) {
+		/* If this is the last request message to fit in the
+		 * request buffer we can use the request object embedded in
+		 * rqbd.  Note that if we failed to allocate a request,
+		 * we'd have to re-post the rqbd, which we can't do in this
+		 * context. */
+		req = &rqbd->rqbd_req;
+		memset(req, 0, sizeof (*req));
+	} else {
+		LASSERT (ev->type == LNET_EVENT_PUT);
+		if (ev->status != 0) {
+			/* We moaned above already... */
+			return; /* NOTE(review): leaves without the EXIT pairing ENTRY */
+		}
+		OBD_ALLOC_GFP(req, sizeof(*req), ALLOC_ATOMIC_TRY);
+		if (req == NULL) {
+			CERROR("Can't allocate incoming request descriptor: "
+			       "Dropping %s RPC from %s\n",
+			       service->srv_name,
+			       libcfs_id2str(ev->initiator));
+			return; /* NOTE(review): also skips EXIT */
+		}
+	}
+
+	/* NB we ABSOLUTELY RELY on req being zeroed, so pointers are NULL,
+	 * flags are reset and scalars are zero.  We only set the message
+	 * size to non-zero if this was a successful receive. */
+	req->rq_xid = ev->match_bits;
+	req->rq_reqbuf = ev->md.start + ev->offset;
+	if (ev->type == LNET_EVENT_PUT && ev->status == 0)
+		req->rq_reqdata_len = ev->mlength;
+	do_gettimeofday(&req->rq_arrival_time);
+	req->rq_peer = ev->initiator;
+	req->rq_self = ev->target.nid;
+	req->rq_rqbd = rqbd;
+	req->rq_phase = RQ_PHASE_NEW;
+	spin_lock_init(&req->rq_lock);
+	INIT_LIST_HEAD(&req->rq_timed_list);
+	INIT_LIST_HEAD(&req->rq_exp_list);
+	atomic_set(&req->rq_refcount, 1);
+	if (ev->type == LNET_EVENT_PUT)
+		CDEBUG(D_INFO, "incoming req@%p x"LPU64" msgsize %u\n",
+		       req, req->rq_xid, ev->mlength);
+
+	CDEBUG(D_RPCTRACE, "peer: %s\n", libcfs_id2str(req->rq_peer));
+
+	spin_lock(&svcpt->scp_lock);
+
+	ptlrpc_req_add_history(svcpt, req);	/* needs scp_lock, taken above */
+
+	if (ev->unlinked) {
+		svcpt->scp_nrqbds_posted--;
+		CDEBUG(D_INFO, "Buffer complete: %d buffers still posted\n",
+		       svcpt->scp_nrqbds_posted);
+
+		/* Normally, don't complain about 0 buffers posted; LNET won't
+		 * drop incoming reqs since we set the portal lazy */
+		if (test_req_buffer_pressure &&
+		    ev->type != LNET_EVENT_UNLINK &&
+		    svcpt->scp_nrqbds_posted == 0)
+			CWARN("All %s request buffers busy\n",
+			      service->srv_name);
+
+		/* req takes over the network's ref on rqbd */
+	} else {
+		/* req takes a ref on rqbd */
+		rqbd->rqbd_refcount++;
+	}
+
+	list_add_tail(&req->rq_list, &svcpt->scp_req_incoming);
+	svcpt->scp_nreqs_incoming++;
+
+	/* NB everything can disappear under us once the request
+	 * has been queued and we unlock, so do the wake now... */
+	wake_up(&svcpt->scp_waitq);
+
+	spin_unlock(&svcpt->scp_lock);
+	EXIT;
+}
+
+/*
+ *  Server's outgoing reply callback; expects SEND, ACK or UNLINK events
+ */
+void reply_out_callback(lnet_event_t *ev)
+{
+	struct ptlrpc_cb_id	  *cbid = ev->md.user_ptr;
+	struct ptlrpc_reply_state *rs = cbid->cbid_arg;
+	struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
+	ENTRY;
+
+	LASSERT (ev->type == LNET_EVENT_SEND ||
+		 ev->type == LNET_EVENT_ACK ||
+		 ev->type == LNET_EVENT_UNLINK);
+
+	if (!rs->rs_difficult) {
+		/* 'Easy' replies have no further processing so I drop the
+		 * net's ref on 'rs' */
+		LASSERT (ev->unlinked);
+		ptlrpc_rs_decref(rs);
+		EXIT;
+		return;
+	}
+
+	LASSERT (rs->rs_on_net);
+
+	if (ev->unlinked) {
+		/* Last network callback. The net's ref on 'rs' stays put
+		 * until ptlrpc_handle_rs() is done with it */
+		spin_lock(&svcpt->scp_rep_lock);	/* lock order: scp_rep_lock... */
+		spin_lock(&rs->rs_lock);		/* ...then rs_lock */
+
+		rs->rs_on_net = 0;
+		if (!rs->rs_no_ack ||
+		    rs->rs_transno <=
+		    rs->rs_export->exp_obd->obd_last_committed)
+			ptlrpc_schedule_difficult_reply(rs);
+
+		spin_unlock(&rs->rs_lock);
+		spin_unlock(&svcpt->scp_rep_lock);
+	}
+	EXIT;
+}
+
+
+static void ptlrpc_master_callback(lnet_event_t *ev)
+{
+	struct ptlrpc_cb_id *cbid = ev->md.user_ptr;
+	void (*callback)(lnet_event_t *ev) = cbid->cbid_fn;
+
+	/* Honestly, it's best to find out early. */
+	LASSERT (cbid->cbid_arg != LP_POISON);
+	LASSERT (callback == request_out_callback ||	/* only the five callbacks */
+		 callback == reply_in_callback ||	/* defined in this file */
+		 callback == client_bulk_callback ||	/* may be dispatched */
+		 callback == request_in_callback ||
+		 callback == reply_out_callback
+		 );
+
+	callback (ev);
+}
+
+int ptlrpc_uuid_to_peer (struct obd_uuid *uuid,
+			 lnet_process_id_t *peer, lnet_nid_t *self)
+{
+	int	       best_dist = 0;
+	__u32	     best_order = 0;
+	int	       count = 0;
+	int	       rc = -ENOENT;	/* returned if no usable NID is found */
+	int	       portals_compatibility;
+	int	       dist;
+	__u32	     order;
+	lnet_nid_t	dst_nid;
+	lnet_nid_t	src_nid;
+
+	portals_compatibility = LNetCtl(IOC_LIBCFS_PORTALS_COMPATIBILITY, NULL);
+
+	peer->pid = LUSTRE_SRV_LNET_PID;
+	peer->nid = LNET_NID_ANY; /* defined value for the trace on -ENOENT */
+	/* Choose the matching UUID that's closest */
+	while (lustre_uuid_to_peer(uuid->uuid, &dst_nid, count++) == 0) {
+		dist = LNetDist(dst_nid, &src_nid, &order);
+		if (dist < 0)	/* negative result: skip this NID */
+			continue;
+
+		if (dist == 0) {		/* local! use loopback LND */
+			peer->nid = *self = LNET_MKNID(LNET_MKNET(LOLND, 0), 0);
+			rc = 0;
+			break;
+		}
+
+		if (rc < 0 ||	/* first candidate, or a better one than before */
+		    dist < best_dist ||
+		    (dist == best_dist && order < best_order)) {
+			best_dist = dist;
+			best_order = order;
+
+			if (portals_compatibility > 1) {
+				/* Strong portals compatibility: Zero the nid's
+				 * NET, so if I'm reading new config logs, or
+				 * getting configured by (new) lconf I can
+				 * still talk to old servers. */
+				dst_nid = LNET_MKNID(0, LNET_NIDADDR(dst_nid));
+				src_nid = LNET_MKNID(0, LNET_NIDADDR(src_nid));
+			}
+			peer->nid = dst_nid;
+			*self = src_nid;
+			rc = 0;
+		}
+	}
+
+	CDEBUG(D_NET,"%s->%s\n", uuid->uuid, libcfs_id2str(*peer));
+	return rc;
+}
+
+void ptlrpc_ni_fini(void)
+{
+	wait_queue_head_t	 waitq;
+	struct l_wait_info  lwi;
+	int		 rc;
+	int		 retries;
+
+	/* Wait for the event queue to become idle since there may still be
+	 * messages in flight with pending events (i.e. the fire-and-forget
+	 * messages == client requests and "non-difficult" server
+	 * replies) */
+
+	for (retries = 0;; retries++) {
+		rc = LNetEQFree(ptlrpc_eq_h);
+		switch (rc) {
+		default:
+			LBUG();	/* presumably never returns, else falls into case 0 */
+
+		case 0:
+			LNetNIFini();
+			return;
+
+		case -EBUSY:
+			if (retries != 0)	/* stay quiet on the first attempt */
+				CWARN("Event queue still busy\n");
+
+			/* Wait for a bit */
+			init_waitqueue_head(&waitq);
+			lwi = LWI_TIMEOUT(cfs_time_seconds(2), NULL, NULL);
+			l_wait_event(waitq, 0, &lwi); /* condition is 0: timeout only */
+			break;
+		}
+	}
+	/* notreached */
+}
+
+lnet_pid_t ptl_get_pid(void)
+{
+	lnet_pid_t	pid;
+
+	pid = LUSTRE_SRV_LNET_PID;	/* always this constant; no lookup is done */
+	return pid;
+}
+
+int ptlrpc_ni_init(void)
+{
+	int	      rc;
+	lnet_pid_t       pid;
+
+	pid = ptl_get_pid();
+	CDEBUG(D_NET, "My pid is: %x\n", pid);
+
+	/* We're not passing any limits yet... */
+	rc = LNetNIInit(pid);
+	if (rc < 0) {
+		CDEBUG (D_NET, "Can't init network interface: %d\n", rc);
+		return (-ENOENT);	/* the original rc is only logged */
+	}
+
+	/* CAVEAT EMPTOR: how we process portals events is _radically_
+	 * different depending on... */
+	/* kernel LNet calls our master callback when there are new events;
+	 * because we are guaranteed to get every event via callback,
+	 * we just set EQ size to 0 to avoid the overhead of serializing
+	 * enqueue/dequeue operations in LNet. */
+	rc = LNetEQAlloc(0, ptlrpc_master_callback, &ptlrpc_eq_h);
+	if (rc == 0)
+		return 0;
+
+	CERROR ("Failed to allocate event queue: %d\n", rc);
+	LNetNIFini();	/* undo LNetNIInit() above */
+
+	return (-ENOMEM);
+}
+
+
+int ptlrpc_init_portals(void)
+{
+	int   rc = ptlrpc_ni_init();
+
+	if (rc != 0) {
+		CERROR("network initialisation failed\n");
+		return -EIO;	/* any ptlrpc_ni_init() error is reported as -EIO */
+	}
+	rc = ptlrpcd_addref();
+	if (rc == 0)
+		return 0;
+
+	CERROR("rpcd initialisation failed\n");
+	ptlrpc_ni_fini();	/* undo ptlrpc_ni_init() */
+	return rc;
+}
+
+void ptlrpc_exit_portals(void)
+{
+	ptlrpcd_decref();	/* reverse order of ptlrpc_init_portals() */
+	ptlrpc_ni_fini();
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/Makefile b/drivers/staging/lustre/lustre/ptlrpc/gss/Makefile
new file mode 100644
index 000000000000..8cdfbeed64e6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/Makefile
@@ -0,0 +1,8 @@
+obj-$(CONFIG_LUSTRE_FS) := ptlrpc_gss.o
+# Objects that make up ptlrpc_gss.o:
+ptlrpc_gss-y := sec_gss.o gss_bulk.o gss_cli_upcall.o gss_svc_upcall.o	\
+		gss_rawobj.o lproc_gss.o gss_generic_token.o		\
+		gss_mech_switch.o gss_krb5_mech.o
+
+# Extra header search path, relative to this directory:
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_api.h b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_api.h
new file mode 100644
index 000000000000..feac60482c97
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_api.h
@@ -0,0 +1,179 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * Somewhat simplified version of the gss api.
+ *
+ * Dug Song <dugsong@monkey.org>
+ * Andy Adamson <andros@umich.edu>
+ * Bruce Fields <bfields@umich.edu>
+ * Copyright (c) 2000 The Regents of the University of Michigan
+ *
+ */
+
+#ifndef __PTLRPC_GSS_GSS_API_H_
+#define __PTLRPC_GSS_GSS_API_H_
+
+struct gss_api_mech;
+
+/* The mechanism-independent gss-api context: */
+struct gss_ctx {
+	struct gss_api_mech    *mech_type;	/* mechanism descriptor */
+	void		   *internal_ctx_id;	/* opaque; presumably per-mechanism */
+};
+
+#define GSS_C_NO_BUFFER	 ((rawobj_t) 0)
+#define GSS_C_NO_CONTEXT	((struct gss_ctx *) 0)
+#define GSS_C_NULL_OID	  ((rawobj_t) 0)
+
+/*
+ * gss-api prototypes; note that these are somewhat simplified versions of
+ * the prototypes specified in RFC 2744.
+ */
+__u32 lgss_import_sec_context(
+		rawobj_t		*input_token,
+		struct gss_api_mech     *mech,
+		struct gss_ctx	 **ctx);
+__u32 lgss_copy_reverse_context(
+		struct gss_ctx	  *ctx,
+		struct gss_ctx	 **ctx_new);
+__u32 lgss_inquire_context(
+		struct gss_ctx	  *ctx,
+		unsigned long	   *endtime);
+__u32 lgss_get_mic(
+		struct gss_ctx	  *ctx,
+		int		      msgcnt,
+		rawobj_t		*msgs,
+		int		      iovcnt,
+		lnet_kiov_t	     *iovs,
+		rawobj_t		*mic_token);
+__u32 lgss_verify_mic(
+		struct gss_ctx	  *ctx,
+		int		      msgcnt,
+		rawobj_t		*msgs,
+		int		      iovcnt,
+		lnet_kiov_t	     *iovs,
+		rawobj_t		*mic_token);
+__u32 lgss_wrap(
+		struct gss_ctx	  *ctx,
+		rawobj_t		*gsshdr,
+		rawobj_t		*msg,
+		int		      msg_buflen,
+		rawobj_t		*out_token);
+__u32 lgss_unwrap(
+		struct gss_ctx	  *ctx,
+		rawobj_t		*gsshdr,
+		rawobj_t		*token,
+		rawobj_t		*out_msg);
+__u32 lgss_prep_bulk(
+		struct gss_ctx	  *gctx,
+		struct ptlrpc_bulk_desc *desc);
+__u32 lgss_wrap_bulk(
+		struct gss_ctx	  *gctx,
+		struct ptlrpc_bulk_desc *desc,
+		rawobj_t		*token,
+		int		      adj_nob);
+__u32 lgss_unwrap_bulk(
+		struct gss_ctx	  *gctx,
+		struct ptlrpc_bulk_desc *desc,
+		rawobj_t		*token,
+		int		      adj_nob);
+__u32 lgss_delete_sec_context(
+		struct gss_ctx	 **ctx);
+int lgss_display(
+		struct gss_ctx	  *ctx,
+		char		    *buf,
+		int		      bufsize);
+
+struct subflavor_desc {
+	__u32	   sf_subflavor;	/* cf. lgss_subflavor_to_mech() */
+	__u32	   sf_qop;	/* presumably GSS quality of protection */
+	__u32	   sf_service;	/* presumably the service level; confirm */
+	char	   *sf_name;	/* name string of the subflavor */
+};
+
+/* Each mechanism is described by the following struct: */
+struct gss_api_mech {
+	struct list_head	      gm_list;	/* list linkage */
+	module_t	   *gm_owner;	/* module providing the mechanism */
+	char		   *gm_name;	/* cf. lgss_name_to_mech() */
+	rawobj_t		gm_oid;	/* cf. lgss_OID_to_mech() */
+	atomic_t	    gm_count;	/* presumably a use count; confirm */
+	struct gss_api_ops     *gm_ops;	/* operations table, see below */
+	int		     gm_sf_num;	/* presumably entries in gm_sfs */
+	struct subflavor_desc  *gm_sfs;	/* subflavor descriptors */
+};
+
+/* and must provide the following operations (referenced through gm_ops): */
+struct gss_api_ops {
+	__u32 (*gss_import_sec_context)(
+			rawobj_t	       *input_token,
+			struct gss_ctx	 *ctx);
+	__u32 (*gss_copy_reverse_context)(
+			struct gss_ctx	 *ctx,
+			struct gss_ctx	 *ctx_new);
+	__u32 (*gss_inquire_context)(
+			struct gss_ctx	 *ctx,
+			unsigned long	  *endtime);
+	__u32 (*gss_get_mic)(
+			struct gss_ctx	 *ctx,
+			int		     msgcnt,
+			rawobj_t	       *msgs,
+			int		     iovcnt,
+			lnet_kiov_t	    *iovs,
+			rawobj_t	       *mic_token);
+	__u32 (*gss_verify_mic)(
+			struct gss_ctx	 *ctx,
+			int		     msgcnt,
+			rawobj_t	       *msgs,
+			int		     iovcnt,
+			lnet_kiov_t	    *iovs,
+			rawobj_t	       *mic_token);
+	__u32 (*gss_wrap)(
+			struct gss_ctx	 *ctx,
+			rawobj_t	       *gsshdr,
+			rawobj_t	       *msg,
+			int		     msg_buflen,
+			rawobj_t	       *out_token);
+	__u32 (*gss_unwrap)(
+			struct gss_ctx	 *ctx,
+			rawobj_t	       *gsshdr,
+			rawobj_t	       *token,
+			rawobj_t	       *out_msg);
+	__u32 (*gss_prep_bulk)(
+			struct gss_ctx	 *gctx,
+			struct ptlrpc_bulk_desc *desc);
+	__u32 (*gss_wrap_bulk)(
+			struct gss_ctx	 *gctx,
+			struct ptlrpc_bulk_desc *desc,
+			rawobj_t	       *token,
+			int		     adj_nob);
+	__u32 (*gss_unwrap_bulk)(
+			struct gss_ctx	 *gctx,
+			struct ptlrpc_bulk_desc *desc,
+			rawobj_t	       *token,
+			int		     adj_nob);
+	void (*gss_delete_sec_context)(	/* untyped arg, unlike the others */
+			void		   *ctx);
+	int  (*gss_display)(
+			struct gss_ctx	 *ctx,
+			char		   *buf,
+			int		     bufsize);
+};
+
+int lgss_mech_register(struct gss_api_mech *mech);
+void lgss_mech_unregister(struct gss_api_mech *mech);
+/* Look up a mechanism by OID, name or subflavor. */
+struct gss_api_mech *lgss_OID_to_mech(rawobj_t *oid);
+struct gss_api_mech *lgss_name_to_mech(char *name);
+struct gss_api_mech *lgss_subflavor_to_mech(__u32 subflavor);
+/* lgss_mech_get()/lgss_mech_put(): presumably take/drop a reference. */
+struct gss_api_mech *lgss_mech_get(struct gss_api_mech *mech);
+void lgss_mech_put(struct gss_api_mech *mech);
+
+#endif /* __PTLRPC_GSS_GSS_API_H_ */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_asn1.h b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_asn1.h
new file mode 100644
index 000000000000..c70eb00796f9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_asn1.h
@@ -0,0 +1,84 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  minimal asn1 for generic encoding/decoding of gss tokens
+ *
+ *  Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h,
+ *  lib/gssapi/krb5/gssapiP_krb5.h, and others
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1995 by the Massachusetts Institute of Technology.
+ * All Rights Reserved.
+ *
+ * Export of this software from the United States of America may
+ *   require a specific license from the United States Government.
+ *   It is the responsibility of any person or organization contemplating
+ *   export to obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of M.I.T. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission.  Furthermore if you modify this software you must label
+ * your software as modified software and not distribute it in such a
+ * fashion that it might be confused with the original M.I.T. software.
+ * M.I.T. makes no representations about the suitability of
+ * this software for any purpose.  It is provided "as is" without express
+ * or implied warranty.
+ *
+ */
+
+#ifndef __PTLRPC_GSS_GSS_ASN1_H_
+#define __PTLRPC_GSS_GSS_ASN1_H_
+
+#define SIZEOF_INT 4
+
+/* from gssapi_err_generic.h */
+#define G_BAD_SERVICE_NAME		       (-2045022976L)
+#define G_BAD_STRING_UID			 (-2045022975L)
+#define G_NOUSER				 (-2045022974L)
+#define G_VALIDATE_FAILED			(-2045022973L)
+#define G_BUFFER_ALLOC			   (-2045022972L)
+#define G_BAD_MSG_CTX			    (-2045022971L)
+#define G_WRONG_SIZE			     (-2045022970L)
+#define G_BAD_USAGE			      (-2045022969L)
+#define G_UNKNOWN_QOP			    (-2045022968L)
+#define G_NO_HOSTNAME			    (-2045022967L)
+#define G_BAD_HOSTNAME			   (-2045022966L)
+#define G_WRONG_MECH			     (-2045022965L)
+#define G_BAD_TOK_HEADER			 (-2045022964L)
+#define G_BAD_DIRECTION			  (-2045022963L)
+#define G_TOK_TRUNC			      (-2045022962L)
+#define G_REFLECT				(-2045022961L)
+#define G_WRONG_TOKID			    (-2045022960L)
+
+#define g_OID_equal(o1,o2) \
+   (((o1)->len == (o2)->len) && \
+    (memcmp((o1)->data,(o2)->data,(int) (o1)->len) == 0))
+
+__u32 g_verify_token_header(rawobj_t *mech, int *body_size,
+			    unsigned char **buf_in, int toksize);
+
+__u32 g_get_mech_oid(rawobj_t *mech, rawobj_t *in_buf);
+
+int g_token_size(rawobj_t *mech, unsigned int body_size);
+
+void g_make_token_header(rawobj_t *mech, int body_size,
+			 unsigned char **buf);
+
+#endif /* __PTLRPC_GSS_GSS_ASN1_H_ */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_bulk.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_bulk.c
new file mode 100644
index 000000000000..ed95bbba95ca
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_bulk.c
@@ -0,0 +1,512 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/gss/gss_bulk.c
+ *
+ * Author: Eric Mei <eric.mei@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/crypto.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+/* Client side: fill in the request's bulk security descriptor and, for a
+ * bulk write, sign (integrity) or encrypt (privacy) the pages of desc.
+ * Returns 0 or a negative errno. */
+int gss_cli_ctx_wrap_bulk(struct ptlrpc_cli_ctx *ctx,
+			  struct ptlrpc_request *req,
+			  struct ptlrpc_bulk_desc *desc)
+{
+	struct gss_cli_ctx	      *gctx;
+	struct lustre_msg	       *msg;
+	struct ptlrpc_bulk_sec_desc     *bsd;
+	rawobj_t			 token;
+	__u32			    maj;
+	int			      offset;
+	int			      rc;
+	ENTRY;
+
+	LASSERT(req->rq_pack_bulk);
+	LASSERT(req->rq_bulk_read || req->rq_bulk_write);
+
+	gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
+	LASSERT(gctx->gc_mechctx);
+
+	/* pick the message and the segment that hold the descriptor */
+	switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
+	case SPTLRPC_SVC_NULL:
+		LASSERT(req->rq_reqbuf->lm_bufcount >= 3);
+		msg = req->rq_reqbuf;
+		offset = msg->lm_bufcount - 1;
+		break;
+	case SPTLRPC_SVC_AUTH:
+	case SPTLRPC_SVC_INTG:
+		LASSERT(req->rq_reqbuf->lm_bufcount >= 4);
+		msg = req->rq_reqbuf;
+		offset = msg->lm_bufcount - 2;
+		break;
+	case SPTLRPC_SVC_PRIV:
+		LASSERT(req->rq_clrbuf->lm_bufcount >= 2);
+		msg = req->rq_clrbuf;
+		offset = msg->lm_bufcount - 1;
+		break;
+	default:
+		LBUG();
+	}
+
+	bsd = lustre_msg_buf(msg, offset, sizeof(*bsd));
+	bsd->bsd_version = 0;
+	bsd->bsd_flags = 0;
+	bsd->bsd_type = SPTLRPC_BULK_DEFAULT;
+	bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc);
+
+	if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL)
+		RETURN(0);
+
+	LASSERT(bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG ||
+		bsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV);
+
+	if (req->rq_bulk_read) {
+		/* bulk read: prepare receiving pages only for privacy mode. */
+		if (bsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV)
+			RETURN(gss_cli_prep_bulk(req, desc));
+	} else {
+		/* bulk write: sign or encrypt bulk pages. */
+		bsd->bsd_nob = desc->bd_nob;
+
+		if (bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG) {
+			/* integrity mode */
+			token.data = bsd->bsd_data;
+			token.len = lustre_msg_buflen(msg, offset) -
+				    sizeof(*bsd);
+
+			maj = lgss_get_mic(gctx->gc_mechctx, 0, NULL,
+					   desc->bd_iov_count, desc->bd_iov,
+					   &token);
+			if (maj != GSS_S_COMPLETE) {
+				CWARN("failed to sign bulk data: %x\n", maj);
+				RETURN(-EACCES);
+			}
+		} else {
+			/* privacy mode */
+			if (desc->bd_iov_count == 0)
+				RETURN(0);
+
+			rc = sptlrpc_enc_pool_get_pages(desc);
+			if (rc) {
+				CERROR("bulk write: failed to allocate "
+				       "encryption pages: %d\n", rc);
+				RETURN(rc);
+			}
+
+			token.data = bsd->bsd_data;
+			token.len = lustre_msg_buflen(msg, offset) -
+				    sizeof(*bsd);
+
+			maj = lgss_wrap_bulk(gctx->gc_mechctx, desc, &token, 0);
+			if (maj != GSS_S_COMPLETE) {
+				CWARN("fail to encrypt bulk data: %x\n", maj);
+				RETURN(-EACCES);
+			}
+		}
+	}
+
+	RETURN(0);
+}
+
+/* Client side: match the reply's bulk security descriptor against the
+ * request's, then verify (integrity) or decrypt (privacy) bulk read data.
+ * Returns 0 or a negative errno. */
+int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx,
+			    struct ptlrpc_request *req,
+			    struct ptlrpc_bulk_desc *desc)
+{
+	struct gss_cli_ctx	      *gctx;
+	struct lustre_msg	       *rmsg, *vmsg;
+	struct ptlrpc_bulk_sec_desc     *bsdr, *bsdv;
+	rawobj_t			 token;
+	__u32			    maj;
+	int			      roff, voff;
+	ENTRY;
+
+	LASSERT(req->rq_pack_bulk);
+	LASSERT(req->rq_bulk_read || req->rq_bulk_write);
+
+	/* assert on each message before its lm_bufcount is dereferenced */
+	switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
+	case SPTLRPC_SVC_NULL:
+		vmsg = req->rq_repdata;
+		LASSERT(vmsg && vmsg->lm_bufcount >= 3);
+		voff = vmsg->lm_bufcount - 1;
+
+		rmsg = req->rq_reqbuf;
+		LASSERT(rmsg && rmsg->lm_bufcount >= 3);
+		roff = rmsg->lm_bufcount - 1; /* last segment */
+		break;
+	case SPTLRPC_SVC_AUTH:
+	case SPTLRPC_SVC_INTG:
+		vmsg = req->rq_repdata;
+		LASSERT(vmsg && vmsg->lm_bufcount >= 4);
+		voff = vmsg->lm_bufcount - 2;
+
+		rmsg = req->rq_reqbuf;
+		LASSERT(rmsg && rmsg->lm_bufcount >= 4);
+		roff = rmsg->lm_bufcount - 2; /* second last segment */
+		break;
+	case SPTLRPC_SVC_PRIV:
+		vmsg = req->rq_repdata;
+		LASSERT(vmsg && vmsg->lm_bufcount >= 2);
+		voff = vmsg->lm_bufcount - 1;
+
+		rmsg = req->rq_clrbuf;
+		LASSERT(rmsg && rmsg->lm_bufcount >= 2);
+		roff = rmsg->lm_bufcount - 1; /* last segment */
+		break;
+	default:
+		LBUG();
+	}
+
+	bsdr = lustre_msg_buf(rmsg, roff, sizeof(*bsdr));
+	bsdv = lustre_msg_buf(vmsg, voff, sizeof(*bsdv));
+	LASSERT(bsdr && bsdv);
+
+	if (bsdr->bsd_version != bsdv->bsd_version ||
+	    bsdr->bsd_type != bsdv->bsd_type ||
+	    bsdr->bsd_svc != bsdv->bsd_svc) {
+		CERROR("bulk security descriptor mismatch: "
+		       "(%u,%u,%u) != (%u,%u,%u)\n",
+		       bsdr->bsd_version, bsdr->bsd_type, bsdr->bsd_svc,
+		       bsdv->bsd_version, bsdv->bsd_type, bsdv->bsd_svc);
+		RETURN(-EPROTO);
+	}
+
+	LASSERT(bsdv->bsd_svc == SPTLRPC_BULK_SVC_NULL ||
+		bsdv->bsd_svc == SPTLRPC_BULK_SVC_INTG ||
+		bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV);
+
+	/* in privacy mode, on success make sure bd_nob_transferred is the
+	 * actual size of the clear text, otherwise the upper layer may be
+	 * surprised. */
+	if (req->rq_bulk_write) {
+		if (bsdv->bsd_flags & BSD_FL_ERR) {
+			CERROR("server reported bulk i/o failure\n");
+			RETURN(-EIO);
+		}
+
+		if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV)
+			desc->bd_nob_transferred = desc->bd_nob;
+	} else {
+		/* bulk read: upon successful return, bd_nob_transferred is
+		 * the size of plain text actually received. */
+		gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
+		LASSERT(gctx->gc_mechctx);
+
+		if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_INTG) {
+			int i, nob;
+
+			/* fix the actual data size */
+			for (i = 0, nob = 0; i < desc->bd_iov_count; i++) {
+				if (desc->bd_iov[i].kiov_len + nob >
+				    desc->bd_nob_transferred) {
+					desc->bd_iov[i].kiov_len =
+						desc->bd_nob_transferred - nob;
+				}
+				nob += desc->bd_iov[i].kiov_len;
+			}
+
+			token.data = bsdv->bsd_data;
+			token.len = lustre_msg_buflen(vmsg, voff) -
+				    sizeof(*bsdv);
+
+			maj = lgss_verify_mic(gctx->gc_mechctx, 0, NULL,
+					      desc->bd_iov_count, desc->bd_iov,
+					      &token);
+			if (maj != GSS_S_COMPLETE) {
+				CERROR("failed to verify bulk read: %x\n", maj);
+				RETURN(-EACCES);
+			}
+		} else if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV) {
+			desc->bd_nob = bsdv->bsd_nob;
+			if (desc->bd_nob == 0)
+				RETURN(0);
+
+			token.data = bsdv->bsd_data;
+			token.len = lustre_msg_buflen(vmsg, voff) -
+				    sizeof(*bsdv);
+
+			maj = lgss_unwrap_bulk(gctx->gc_mechctx, desc,
+					       &token, 1);
+			if (maj != GSS_S_COMPLETE) {
+				CERROR("failed to decrypt bulk read: %x\n",
+				       maj);
+				RETURN(-EACCES);
+			}
+
+			desc->bd_nob_transferred = desc->bd_nob;
+		}
+	}
+
+	RETURN(0);
+}
+
+/*
+ * Get encryption pool pages for desc, then let the mechanism prepare them.
+ */
+static int gss_prep_bulk(struct ptlrpc_bulk_desc *desc,
+			 struct gss_ctx *mechctx)
+{
+	int err;
+
+	if (!desc->bd_iov_count)
+		return 0;	/* no pages, nothing to prepare */
+
+	err = sptlrpc_enc_pool_get_pages(desc);
+	if (err != 0)
+		return err;
+
+	return lgss_prep_bulk(mechctx, desc) == GSS_S_COMPLETE ? 0 : -EACCES;
+}
+
+/* Client side: prepare receiving pages for a privacy-mode bulk read. */
+int gss_cli_prep_bulk(struct ptlrpc_request *req,
+		      struct ptlrpc_bulk_desc *desc)
+{
+	int err;
+	ENTRY;
+
+	LASSERT(req->rq_cli_ctx);
+	LASSERT(req->rq_pack_bulk);
+	LASSERT(req->rq_bulk_read);
+
+	if (SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_BULK_SVC_PRIV)
+		RETURN(0);
+
+	err = gss_prep_bulk(desc, ctx2gctx(req->rq_cli_ctx)->gc_mechctx);
+	if (err != 0)
+		CERROR("bulk read: failed to prepare encryption "
+		       "pages: %d\n", err);
+	RETURN(err);
+}
+
+/* Server side: prepare receiving pages for a privacy-mode bulk write. */
+int gss_svc_prep_bulk(struct ptlrpc_request *req,
+		      struct ptlrpc_bulk_desc *desc)
+{
+	struct gss_svc_reqctx *grctx;
+	int err;
+	ENTRY;
+
+	LASSERT(req->rq_svc_ctx);
+	LASSERT(req->rq_pack_bulk);
+	LASSERT(req->rq_bulk_write);
+
+	grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+	LASSERT(grctx->src_reqbsd);
+	LASSERT(grctx->src_repbsd);
+	LASSERT(grctx->src_ctx);
+	LASSERT(grctx->src_ctx->gsc_mechctx);
+
+	/* only privacy mode needs encryption pages */
+	if (grctx->src_reqbsd->bsd_svc != SPTLRPC_BULK_SVC_PRIV)
+		RETURN(0);
+
+	err = gss_prep_bulk(desc, grctx->src_ctx->gsc_mechctx);
+	if (err != 0)
+		CERROR("bulk write: failed to prepare encryption "
+		       "pages: %d\n", err);
+
+	RETURN(err);
+}
+
+/*
+ * Server side: verify (integrity) or decrypt (privacy) bulk write data,
+ * flagging BSD_FL_ERR in the reply descriptor on failure.
+ */
+int gss_svc_unwrap_bulk(struct ptlrpc_request *req,
+			struct ptlrpc_bulk_desc *desc)
+{
+	struct gss_svc_reqctx *grctx;
+	struct ptlrpc_bulk_sec_desc *req_bsd, *rep_bsd;
+	rawobj_t token;
+	__u32 major;
+	ENTRY;
+
+	LASSERT(req->rq_svc_ctx);
+	LASSERT(req->rq_pack_bulk);
+	LASSERT(req->rq_bulk_write);
+
+	grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+
+	LASSERT(grctx->src_reqbsd);
+	LASSERT(grctx->src_repbsd);
+	LASSERT(grctx->src_ctx);
+	LASSERT(grctx->src_ctx->gsc_mechctx);
+
+	req_bsd = grctx->src_reqbsd;
+	rep_bsd = grctx->src_repbsd;
+
+	/* the request descriptor has been sanity checked during unpacking */
+	rep_bsd->bsd_version = 0;
+	rep_bsd->bsd_type = SPTLRPC_BULK_DEFAULT;
+	rep_bsd->bsd_svc = req_bsd->bsd_svc;
+	rep_bsd->bsd_flags = 0;
+
+	token.data = req_bsd->bsd_data;
+	token.len = grctx->src_reqbsd_size - sizeof(*req_bsd);
+
+	switch (rep_bsd->bsd_svc) {
+	case SPTLRPC_BULK_SVC_INTG:
+		major = lgss_verify_mic(grctx->src_ctx->gsc_mechctx, 0, NULL,
+					desc->bd_iov_count, desc->bd_iov, &token);
+		if (major != GSS_S_COMPLETE) {
+			rep_bsd->bsd_flags |= BSD_FL_ERR;
+			CERROR("failed to verify bulk signature: %x\n", major);
+			RETURN(-EACCES);
+		}
+		break;
+	case SPTLRPC_BULK_SVC_PRIV:
+		if (req_bsd->bsd_nob != desc->bd_nob) {
+			rep_bsd->bsd_flags |= BSD_FL_ERR;
+			CERROR("prepared nob %d doesn't match the actual "
+			       "nob %d\n", desc->bd_nob, req_bsd->bsd_nob);
+			RETURN(-EPROTO);
+		}
+
+		if (desc->bd_iov_count == 0) {
+			LASSERT(desc->bd_nob == 0);
+			break;
+		}
+
+		major = lgss_unwrap_bulk(grctx->src_ctx->gsc_mechctx,
+					 desc, &token, 0);
+		if (major != GSS_S_COMPLETE) {
+			rep_bsd->bsd_flags |= BSD_FL_ERR;
+			CERROR("failed decrypt bulk data: %x\n", major);
+			RETURN(-EACCES);
+		}
+		break;
+	}
+
+	RETURN(0);
+}
+
+/*
+ * Server side: sign (integrity) or encrypt (privacy) bulk read data,
+ * flagging BSD_FL_ERR in the reply descriptor on failure.
+ */
+int gss_svc_wrap_bulk(struct ptlrpc_request *req,
+		      struct ptlrpc_bulk_desc *desc)
+{
+	struct gss_svc_reqctx *grctx;
+	struct ptlrpc_bulk_sec_desc *req_bsd, *rep_bsd;
+	rawobj_t token;
+	__u32 major;
+	int err;
+	ENTRY;
+
+	LASSERT(req->rq_svc_ctx);
+	LASSERT(req->rq_pack_bulk);
+	LASSERT(req->rq_bulk_read);
+
+	grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+
+	LASSERT(grctx->src_reqbsd);
+	LASSERT(grctx->src_repbsd);
+	LASSERT(grctx->src_ctx);
+	LASSERT(grctx->src_ctx->gsc_mechctx);
+
+	req_bsd = grctx->src_reqbsd;
+	rep_bsd = grctx->src_repbsd;
+
+	/* the request descriptor has been sanity checked during unpacking */
+	rep_bsd->bsd_version = 0;
+	rep_bsd->bsd_type = SPTLRPC_BULK_DEFAULT;
+	rep_bsd->bsd_svc = req_bsd->bsd_svc;
+	rep_bsd->bsd_flags = 0;
+
+	token.data = rep_bsd->bsd_data;
+	token.len = grctx->src_repbsd_size - sizeof(*rep_bsd);
+
+	switch (rep_bsd->bsd_svc) {
+	case SPTLRPC_BULK_SVC_INTG:
+		major = lgss_get_mic(grctx->src_ctx->gsc_mechctx, 0, NULL,
+				     desc->bd_iov_count, desc->bd_iov, &token);
+		if (major != GSS_S_COMPLETE) {
+			rep_bsd->bsd_flags |= BSD_FL_ERR;
+			CERROR("failed to sign bulk data: %x\n", major);
+			RETURN(-EACCES);
+		}
+		break;
+	case SPTLRPC_BULK_SVC_PRIV:
+		rep_bsd->bsd_nob = desc->bd_nob;
+
+		if (desc->bd_iov_count == 0) {
+			LASSERT(desc->bd_nob == 0);
+			break;
+		}
+
+		err = sptlrpc_enc_pool_get_pages(desc);
+		if (err) {
+			rep_bsd->bsd_flags |= BSD_FL_ERR;
+			CERROR("bulk read: failed to allocate encryption "
+			       "pages: %d\n", err);
+			RETURN(err);
+		}
+
+		major = lgss_wrap_bulk(grctx->src_ctx->gsc_mechctx,
+				       desc, &token, 1);
+		if (major != GSS_S_COMPLETE) {
+			rep_bsd->bsd_flags |= BSD_FL_ERR;
+			CERROR("failed to encrypt bulk data: %x\n", major);
+			RETURN(-EACCES);
+		}
+		break;
+	}
+
+	RETURN(0);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_cli_upcall.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_cli_upcall.c
new file mode 100644
index 000000000000..142c789b1bc6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_cli_upcall.c
@@ -0,0 +1,447 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/gss/gss_cli_upcall.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+/**********************************************
+ * gss context init/fini helper	       *
+ **********************************************/
+
+/* Pack the SEC_CTX_INIT request: gss header, user desc, security payload. */
+static int ctx_init_pack_request(struct obd_import *imp,
+			struct ptlrpc_request *req, int lustre_srv, uid_t uid,
+			gid_t gid, long token_size, char __user *token)
+{
+	struct lustre_msg       *msg = req->rq_reqbuf;
+	struct gss_sec	  *gsec;
+	struct gss_header       *ghdr;
+	struct ptlrpc_user_desc *pud;
+	__u32		   *p, size, offset = 2;
+	rawobj_t		 obj;
+
+	LASSERT(msg->lm_bufcount <= 4);
+	LASSERT(req->rq_cli_ctx);
+	LASSERT(req->rq_cli_ctx->cc_sec);
+
+	/* gss hdr */
+	ghdr = lustre_msg_buf(msg, 0, sizeof(*ghdr));
+	ghdr->gh_version = PTLRPC_GSS_VERSION;
+	ghdr->gh_sp = (__u8) imp->imp_sec->ps_part;
+	ghdr->gh_flags = 0;
+	ghdr->gh_proc = PTLRPC_GSS_PROC_INIT;
+	ghdr->gh_seq = 0;
+	ghdr->gh_svc = SPTLRPC_SVC_NULL;
+	ghdr->gh_handle.len = 0;
+
+	/* fix the user desc */
+	if (req->rq_pack_udesc) {
+		ghdr->gh_flags |= LUSTRE_GSS_PACK_USER;
+		pud = lustre_msg_buf(msg, offset, sizeof(*pud));
+		LASSERT(pud);
+		pud->pud_uid = pud->pud_fsuid = uid;
+		pud->pud_gid = pud->pud_fsgid = gid;
+		pud->pud_cap = 0;
+		pud->pud_ngroups = 0;
+		offset++;
+	}
+
+	/* security payload */
+	p = lustre_msg_buf(msg, offset, 0);
+	size = msg->lm_buflens[offset];
+	LASSERT(p);
+
+	/* 1. lustre svc type */
+	LASSERT(size > 4);
+	*p++ = cpu_to_le32(lustre_srv);
+	size -= 4;
+
+	/* 2. target uuid */
+	obj.len = strlen(imp->imp_obd->u.cli.cl_target_uuid.uuid) + 1;
+	obj.data = imp->imp_obd->u.cli.cl_target_uuid.uuid;
+	if (rawobj_serialize(&obj, &p, &size))
+		LBUG();
+
+	/* 3. reverse context handle. actually only needed by root user,
+	 *    but we send it anyway. */
+	gsec = sec2gsec(req->rq_cli_ctx->cc_sec);
+	obj.len = sizeof(gsec->gs_rvs_hdl);
+	obj.data = (__u8 *) &gsec->gs_rvs_hdl;
+	if (rawobj_serialize(&obj, &p, &size))
+		LBUG();
+
+	/* 4. now the token (size is user-supplied: check, don't assert) */
+	if (token_size < 0 || token_size > size ||
+	    size < sizeof(__u32) + cfs_size_round4(token_size)) {
+		CERROR("invalid token size %ld\n", token_size);
+		return -EINVAL;
+	}
+	*p++ = cpu_to_le32(((__u32) token_size));
+	if (copy_from_user(p, token, token_size)) {
+		CERROR("can't copy token\n");
+		return -EFAULT;
+	}
+	size -= sizeof(__u32) + cfs_size_round4(token_size);
+
+	req->rq_reqdata_len = lustre_shrink_msg(req->rq_reqbuf, offset,
+						msg->lm_buflens[offset] - size, 0);
+	return 0;
+}
+
+/*
+ * Copy status, gh_major, gh_minor and gh_seqwin (4 bytes each), then the
+ * length-prefixed, 4-byte padded context handle and reply token to outbuf.
+ * Returns the number of bytes written to outbuf or a negative errno.
+ */
+static int ctx_init_parse_reply(struct lustre_msg *msg, int swabbed,
+				char __user *outbuf, long outlen)
+{
+	struct gss_rep_header *ghdr;
+	__u32 len, padded, status = 0, copied = 0;
+
+	if (msg->lm_bufcount != 3) {
+		CERROR("unexpected bufcount %u\n", msg->lm_bufcount);
+		return -EPROTO;
+	}
+
+	ghdr = (struct gss_rep_header *) gss_swab_header(msg, 0, swabbed);
+	if (ghdr == NULL) {
+		CERROR("unable to extract gss reply header\n");
+		return -EPROTO;
+	}
+
+	if (ghdr->gh_version != PTLRPC_GSS_VERSION) {
+		CERROR("invalid gss version %u\n", ghdr->gh_version);
+		return -EPROTO;
+	}
+
+	if (outlen < (4 + 2) * 4 + cfs_size_round4(ghdr->gh_handle.len) +
+		     cfs_size_round4(msg->lm_buflens[2])) {
+		CERROR("output buffer size %ld too small\n", outlen);
+		return -EFAULT;
+	}
+
+	if (copy_to_user(outbuf, &status, 4))
+		return -EFAULT;
+	outbuf += 4;
+	if (copy_to_user(outbuf, &ghdr->gh_major, 4))
+		return -EFAULT;
+	outbuf += 4;
+	if (copy_to_user(outbuf, &ghdr->gh_minor, 4))
+		return -EFAULT;
+	outbuf += 4;
+	if (copy_to_user(outbuf, &ghdr->gh_seqwin, 4))
+		return -EFAULT;
+	outbuf += 4;
+	copied += 4 * 4;
+
+	/* handle */
+	len = ghdr->gh_handle.len;
+	padded = (len + 3) & ~3;
+	if (copy_to_user(outbuf, &len, 4))
+		return -EFAULT;
+	outbuf += 4;
+	if (copy_to_user(outbuf, (char *) ghdr->gh_handle.data, padded))
+		return -EFAULT;
+	outbuf += padded;
+	copied += 4 + padded;
+
+	/* out token */
+	len = msg->lm_buflens[2];
+	padded = (len + 3) & ~3;
+	if (copy_to_user(outbuf, &len, 4))
+		return -EFAULT;
+	outbuf += 4;
+	if (copy_to_user(outbuf, lustre_msg_buf(msg, 2, 0), padded))
+		return -EFAULT;
+	outbuf += padded;
+	copied += 4 + padded;
+
+	return copied;
+}
+
+/* XXX move to where lgssd could see; layout must match the userspace side */
+struct lgssd_ioctl_param {
+	int	     version;	/* in: must be GSSD_INTERFACE_VERSION */
+	int	     secid;	  /* in: checked against the sec's ps_id */
+	char	   *uuid;	   /* in: user pointer to the obd name */
+	int	     lustre_svc;     /* in: packed as lustre svc type */
+	uid_t	   uid;	    /* in: goes into the user desc */
+	gid_t	   gid;	    /* in: goes into the user desc */
+	long	    send_token_size;/* in: bytes at send_token */
+	char	   *send_token;     /* in: user pointer to the token */
+	long	    reply_buf_size; /* in: bytes at reply_buf */
+	char	   *reply_buf;      /* in: user buffer for the reply */
+	long	    status;	 /* out: 0 or negative errno */
+	long	    reply_length;   /* out: bytes put in reply_buf */
+};
+
+/*
+ * Handle a context-init request from userspace (lgssd/lgss_keyring): send
+ * the supplied token to the server in a SEC_CTX_INIT rpc and copy the
+ * parsed reply back.  The negotiation result is reported in param.status;
+ * the return value only tells whether the parameter block was handled.
+ */
+int gss_do_ctx_init_rpc(__user char *buffer, unsigned long count)
+{
+	struct obd_import	*imp;
+	struct ptlrpc_request    *req;
+	struct lgssd_ioctl_param  param;
+	struct obd_device	*obd;
+	char		      obdname[64];
+	long		      lsize;
+	int		       rc;
+	ENTRY;
+
+	if (count != sizeof(param)) {
+		CERROR("ioctl size %lu, expect %lu, please check lgss_keyring "
+		       "version\n", count, (unsigned long) sizeof(param));
+		RETURN(-EINVAL);
+	}
+	if (copy_from_user(&param, buffer, sizeof(param))) {
+		CERROR("failed copy data from lgssd\n");
+		RETURN(-EFAULT);
+	}
+
+	if (param.version != GSSD_INTERFACE_VERSION) {
+		CERROR("gssd interface version %d (expect %d)\n",
+			param.version, GSSD_INTERFACE_VERSION);
+		RETURN(-EINVAL);
+	}
+
+	/* take name; it must fit obdname including the terminating NUL */
+	lsize = strncpy_from_user(obdname, param.uuid, sizeof(obdname));
+	if (lsize <= 0 || lsize >= (long) sizeof(obdname)) {
+		CERROR("Invalid obdname pointer\n");
+		RETURN(-EFAULT);
+	}
+
+	obd = class_name2obd(obdname);
+	if (!obd) {
+		CERROR("no such obd %s\n", obdname);
+		RETURN(-EINVAL);
+	}
+
+	if (unlikely(!obd->obd_set_up)) {
+		CERROR("obd %s not setup\n", obdname);
+		RETURN(-EINVAL);
+	}
+
+	spin_lock(&obd->obd_dev_lock);
+	if (obd->obd_stopping) {
+		CERROR("obd %s has stopped\n", obdname);
+		spin_unlock(&obd->obd_dev_lock);
+		RETURN(-EINVAL);
+	}
+
+	if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) &&
+	    strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) &&
+	    strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME)) {
+		CERROR("obd %s is not a client device\n", obdname);
+		spin_unlock(&obd->obd_dev_lock);
+		RETURN(-EINVAL);
+	}
+	spin_unlock(&obd->obd_dev_lock);
+
+	down_read(&obd->u.cli.cl_sem);
+	if (obd->u.cli.cl_import == NULL) {
+		CERROR("obd %s: import has gone\n", obd->obd_name);
+		up_read(&obd->u.cli.cl_sem);
+		RETURN(-EINVAL);
+	}
+	imp = class_import_get(obd->u.cli.cl_import);
+	up_read(&obd->u.cli.cl_sem);
+
+	if (imp->imp_deactive) {
+		CERROR("import has been deactivated\n");
+		class_import_put(imp);
+		RETURN(-EINVAL);
+	}
+
+	req = ptlrpc_request_alloc_pack(imp, &RQF_SEC_CTX, LUSTRE_OBD_VERSION,
+					SEC_CTX_INIT);
+	if (req == NULL) {
+		param.status = -ENOMEM;
+		goto out_copy;
+	}
+
+	if (req->rq_cli_ctx->cc_sec->ps_id != param.secid) {
+		CWARN("original secid %d, now has changed to %d, "
+		      "cancel this negotiation\n", param.secid,
+		      req->rq_cli_ctx->cc_sec->ps_id);
+		param.status = -EINVAL;
+		goto out_copy;
+	}
+
+	/* get token */
+	rc = ctx_init_pack_request(imp, req, param.lustre_svc,
+				   param.uid, param.gid,
+				   param.send_token_size, param.send_token);
+	if (rc) {
+		param.status = rc;
+		goto out_copy;
+	}
+
+	ptlrpc_request_set_replen(req);
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc) {
+		/* If any _real_ denial be made, we expect server return
+		 * -EACCES reply or return success but indicate gss error
+		 * inside reply message. All other errors are treated as
+		 * timeout, caller might try the negotiation repeatedly,
+		 * leave recovery decisions to general ptlrpc layer.
+		 *
+		 * FIXME maybe some other error code shouldn't be treated
+		 * as timeout. */
+		param.status = (rc == -EACCES) ? rc : -ETIMEDOUT;
+		goto out_copy;
+	}
+
+	LASSERT(req->rq_repdata);
+	lsize = ctx_init_parse_reply(req->rq_repdata, ptlrpc_rep_need_swab(req),
+				     param.reply_buf, param.reply_buf_size);
+	if (lsize < 0) {
+		param.status = (int) lsize;
+		goto out_copy;
+	}
+
+	param.status = 0;
+	param.reply_length = lsize;
+
+out_copy:
+	rc = copy_to_user(buffer, &param, sizeof(param)) ? -EFAULT : 0;
+
+	class_import_put(imp);
+	ptlrpc_req_finished(req);
+	RETURN(rc);
+}
+
+/* Send a context-destroy (SEC_CTX_FINI) rpc for gctx if it is uptodate. */
+int gss_do_ctx_fini_rpc(struct gss_cli_ctx *gctx)
+{
+	struct ptlrpc_cli_ctx   *ctx = &gctx->gc_base;
+	struct obd_import       *imp = ctx->cc_sec->ps_import;
+	struct ptlrpc_request   *req;
+	struct ptlrpc_user_desc *pud;
+	int		      rc;
+	ENTRY;
+
+	LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+	if (cli_ctx_is_error(ctx) || !cli_ctx_is_uptodate(ctx)) {
+		CDEBUG(D_SEC, "ctx %p(%u->%s) not uptodate, "
+		       "don't send destroy rpc\n", ctx,
+		       ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+		RETURN(0);
+	}
+
+	might_sleep();
+	CWARN("%s ctx %p idx "LPX64" (%u->%s)\n",
+	      sec_is_reverse(ctx->cc_sec) ?
+	      "server finishing reverse" : "client finishing forward",
+	      ctx, gss_handle_to_u64(&gctx->gc_handle),
+	      ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+
+	gctx->gc_proc = PTLRPC_GSS_PROC_DESTROY;
+
+	req = ptlrpc_request_alloc(imp, &RQF_SEC_CTX);
+	if (req == NULL) {
+		CWARN("ctx %p(%u): fail to prepare rpc, destroy locally\n",
+		      ctx, ctx->cc_vcred.vc_uid);
+		GOTO(out, rc = -ENOMEM);
+	}
+
+	rc = ptlrpc_request_bufs_pack(req, LUSTRE_OBD_VERSION, SEC_CTX_FINI,
+				      NULL, ctx);
+	if (rc) {
+		/* req is freed here, so it must not reach out_ref */
+		ptlrpc_request_free(req);
+		GOTO(out, rc);
+	}
+
+	/* fix the user desc */
+	if (req->rq_pack_udesc) {
+		/* request is in AUTH mode, so user_desc is at offset 2 */
+		pud = lustre_msg_buf(req->rq_reqbuf, 2, sizeof(*pud));
+		LASSERT(pud);
+		pud->pud_uid = pud->pud_fsuid = ctx->cc_vcred.vc_uid;
+		pud->pud_gid = pud->pud_fsgid = ctx->cc_vcred.vc_gid;
+		pud->pud_cap = 0;
+		pud->pud_ngroups = 0;
+	}
+
+	req->rq_phase = RQ_PHASE_RPC;
+	rc = ptl_send_rpc(req, 1);
+	if (rc)
+		CWARN("ctx %p(%u->%s): rpc error %d, destroy locally\n", ctx,
+		      ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), rc);
+
+out_ref:
+	ptlrpc_req_finished(req);
+out:
+	RETURN(rc);
+}
+
+int __init gss_init_cli_upcall(void)
+{
+	return 0;	/* nothing to set up */
+}
+
+void __exit gss_exit_cli_upcall(void)
+{	/* nothing to tear down */
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_err.h b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_err.h
new file mode 100644
index 000000000000..13425796fa33
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_err.h
@@ -0,0 +1,193 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  Adapted from MIT Kerberos 5-1.2.1 include/gssapi/gssapi.h
+ *
+ *  Copyright (c) 2002 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose.  It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __PTLRPC_GSS_GSS_ERR_H_
+#define __PTLRPC_GSS_GSS_ERR_H_
+
+typedef unsigned int OM_uint32;
+
+/*
+ * Flag bits for context-level services.
+ */
+#define GSS_C_DELEG_FLAG	(1)
+#define GSS_C_MUTUAL_FLAG       (2)
+#define GSS_C_REPLAY_FLAG       (4)
+#define GSS_C_SEQUENCE_FLAG     (8)
+#define GSS_C_CONF_FLAG	 (16)
+#define GSS_C_INTEG_FLAG	(32)
+#define GSS_C_ANON_FLAG	 (64)
+#define GSS_C_PROT_READY_FLAG   (128)
+#define GSS_C_TRANS_FLAG	(256)
+
+/*
+ * Credential usage options
+ */
+#define GSS_C_BOTH	      (0)
+#define GSS_C_INITIATE	  (1)
+#define GSS_C_ACCEPT	    (2)
+
+/*
+ * Status code types for gss_display_status
+ */
+#define GSS_C_GSS_CODE	  (1)
+#define GSS_C_MECH_CODE	 (2)
+
+
+/*
+ * Define the default Quality of Protection for per-message services.  Note
+ * that an implementation that offers multiple levels of QOP may either reserve
+ * a value (for example zero, as assumed here) to mean "default protection", or
+ * alternatively may simply equate GSS_C_QOP_DEFAULT to a specific explicit
+ * QOP value.  However a value of 0 should always be interpreted by a GSSAPI
+ * implementation as a request for the default protection level.
+ */
+#define GSS_C_QOP_DEFAULT       (0)
+
+/*
+ * Expiration time of 2^32-1 seconds means infinite lifetime for a
+ * credential or security context
+ */
+#define GSS_C_INDEFINITE	((OM_uint32) 0xfffffffful)
+
+
+/* Major status codes */
+
+#define GSS_S_COMPLETE	  (0)
+
+/*
+ * Some "helper" definitions to make the status code macros obvious.
+ */
+#define GSS_C_CALLING_ERROR_OFFSET      (24)	/* calling error: starts at bit 24 */
+#define GSS_C_ROUTINE_ERROR_OFFSET      (16)	/* routine error: starts at bit 16 */
+#define GSS_C_SUPPLEMENTARY_OFFSET      (0)	/* supplementary info: starts at bit 0 */
+#define GSS_C_CALLING_ERROR_MASK	((OM_uint32) 0377ul)	/* octal for 0xff: 8-bit field */
+#define GSS_C_ROUTINE_ERROR_MASK	((OM_uint32) 0377ul)	/* octal for 0xff: 8-bit field */
+#define GSS_C_SUPPLEMENTARY_MASK	((OM_uint32) 0177777ul)	/* octal for 0xffff: 16-bit field */
+
+/*
+ * The macros that test status codes for error conditions.  Note that the
+ * GSS_ERROR() macro has changed slightly from the V1 GSSAPI so that it now
+ * evaluates its argument only once.
+ */
+#define GSS_CALLING_ERROR(x) \
+  ((x) & (GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET))
+#define GSS_ROUTINE_ERROR(x) \
+  ((x) & (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET))
+#define GSS_SUPPLEMENTARY_INFO(x) \
+  ((x) & (GSS_C_SUPPLEMENTARY_MASK << GSS_C_SUPPLEMENTARY_OFFSET))
+#define GSS_ERROR(x) \
+  ((x) & ((GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET) | \
+	  (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET)))
+
+/*
+ * Now the actual status code definitions
+ */
+
+/*
+ * Calling errors:
+ */
+#define GSS_S_CALL_INACCESSIBLE_READ \
+	(((OM_uint32) 1ul) << GSS_C_CALLING_ERROR_OFFSET)
+#define GSS_S_CALL_INACCESSIBLE_WRITE \
+	(((OM_uint32) 2ul) << GSS_C_CALLING_ERROR_OFFSET)
+#define GSS_S_CALL_BAD_STRUCTURE \
+	(((OM_uint32) 3ul) << GSS_C_CALLING_ERROR_OFFSET)
+
+/*
+ * Routine errors:
+ */
+#define GSS_S_BAD_MECH \
+	(((OM_uint32) 1ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_NAME \
+	(((OM_uint32) 2ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_NAMETYPE \
+	(((OM_uint32) 3ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_BINDINGS \
+	(((OM_uint32) 4ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_STATUS \
+	(((OM_uint32) 5ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_SIG \
+	(((OM_uint32) 6ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_NO_CRED \
+	(((OM_uint32) 7ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_NO_CONTEXT \
+	(((OM_uint32) 8ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_DEFECTIVE_TOKEN \
+	(((OM_uint32) 9ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_DEFECTIVE_CREDENTIAL \
+	(((OM_uint32) 10ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_CREDENTIALS_EXPIRED \
+	(((OM_uint32) 11ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_CONTEXT_EXPIRED \
+	(((OM_uint32) 12ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_FAILURE \
+	(((OM_uint32) 13ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_QOP \
+	(((OM_uint32) 14ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_UNAUTHORIZED \
+	(((OM_uint32) 15ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_UNAVAILABLE \
+	(((OM_uint32) 16ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_DUPLICATE_ELEMENT \
+	(((OM_uint32) 17ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_NAME_NOT_MN \
+	(((OM_uint32) 18ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+
+/*
+ * Supplementary info bits:
+ */
+#define GSS_S_CONTINUE_NEEDED   (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 0))
+#define GSS_S_DUPLICATE_TOKEN   (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 1))
+#define GSS_S_OLD_TOKEN	 (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 2))
+#define GSS_S_UNSEQ_TOKEN       (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 3))
+#define GSS_S_GAP_TOKEN	 (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 4))
+
+/* XXXX these are not part of the GSSAPI C bindings!  (but should be) */
+
+#define GSS_CALLING_ERROR_FIELD(x) \
+	(((x) >> GSS_C_CALLING_ERROR_OFFSET) & GSS_C_CALLING_ERROR_MASK)
+#define GSS_ROUTINE_ERROR_FIELD(x) \
+	(((x) >> GSS_C_ROUTINE_ERROR_OFFSET) & GSS_C_ROUTINE_ERROR_MASK)
+#define GSS_SUPPLEMENTARY_INFO_FIELD(x) \
+	(((x) >> GSS_C_SUPPLEMENTARY_OFFSET) & GSS_C_SUPPLEMENTARY_MASK)
+
+/* XXXX This is a necessary evil until the spec is fixed */
+#define GSS_S_CRED_UNAVAIL GSS_S_FAILURE
+
+#endif /* __PTLRPC_GSS_GSS_ERR_H_ */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_generic_token.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_generic_token.c
new file mode 100644
index 000000000000..20b1638e7255
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_generic_token.c
@@ -0,0 +1,285 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  linux/net/sunrpc/gss_generic_token.c
+ *
+ *  Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/generic/util_token.c
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose.  It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+#include "gss_krb5.h"
+#include "gss_asn1.h"
+
+
+/* TWRITE_STR from gssapiP_generic.h: copy len bytes to ptr, then advance ptr */
+#define TWRITE_STR(ptr, str, len) do {	/* one statement; ptr, len used twice */ \
+	memcpy((ptr), (char *) (str), (len)); \
+	(ptr) += (len); } while (0)
+
+/* XXXX this code currently makes the assumption that a mech oid will
+   never be longer than 127 bytes.  This assumption is not inherent in
+   the interfaces, so the code can be fixed if the OSI namespace
+   balloons unexpectedly. */
+
+/* Each token looks like this:
+
+0x60				tag for APPLICATION 0, SEQUENCE
+					(constructed, definite-length)
+	<length>		possible multiple bytes, need to parse/generate
+	0x06			tag for OBJECT IDENTIFIER
+		<moid_length>	compile-time constant string (assume 1 byte)
+		<moid_bytes>	compile-time constant string
+	<inner_bytes>		the ANY containing the application token
+					bytes 0,1 are the token type
+					bytes 2,n are the token data
+
+For the purposes of this abstraction, the token "header" consists of
+the sequence tag and length octets, the mech OID DER encoding, and the
+first two inner bytes, which indicate the token type.  The token
+"body" consists of everything else.
+
+*/
+
+/*
+ * Octets needed for the DER encoding of length (leading octet included).
+ */
+static int der_length_size(int length)
+{
+	if (length < 0x80)
+		return 1;
+	if (length < 0x100)
+		return 2;
+#if (SIZEOF_INT == 2)
+	return 3;
+#else
+	if (length < 0x10000)
+		return 3;
+	if (length < 0x1000000)
+		return 4;
+	return 5;
+#endif
+}
+
+/*
+ * Append the DER encoding of length at *buf and advance *buf past it.
+ */
+static void der_write_length(unsigned char **buf, int length)
+{
+	if (length >= 0x80) {	/* long form: 0x80 + count of octets that follow */
+		*(*buf)++ = (unsigned char) (der_length_size(length) + 127);
+#if (SIZEOF_INT > 2)
+		if (length >= 0x1000000)
+			*(*buf)++ = (unsigned char) (length >> 24);
+		if (length >= 0x10000)
+			*(*buf)++ = (unsigned char) (length >> 16);
+#endif
+		if (length >= 0x100)
+			*(*buf)++ = (unsigned char) (length >> 8);
+	}
+	*(*buf)++ = (unsigned char) length;	/* low octet, both forms */
+}
+
+/*
+ * Decode the DER length at *buf.  Returns the length, or < 0 on failure
+ * (input too short, more than SIZEOF_INT length octets, or a value that
+ * does not fit a non-negative int).  Advances *buf, decrements *bufsize.
+ */
+static
+int der_read_length(unsigned char **buf, int *bufsize)
+{
+	unsigned int len;
+	unsigned char sf;
+
+	if (*bufsize < 1)
+		return -1;
+	sf = *(*buf)++;
+	(*bufsize)--;
+	if (!(sf & 0x80))
+		return sf;	/* short form: the octet is the length */
+
+	/* long form: the low 7 bits count the length octets that follow */
+	sf &= 0x7f;
+	if (sf > ((*bufsize) - 1) || sf > SIZEOF_INT)
+		return -1;
+	/* accumulate unsigned so a large value cannot overflow a signed int */
+	for (len = 0; sf; sf--) {
+		len = (len << 8) + (*(*buf)++);
+		(*bufsize)--;
+	}
+
+	return len > (~0U >> 1) ? -1 : (int) len;
+}
+
+/* Total token length for mechanism OID mech and body_size body bytes. */
+int g_token_size(rawobj_t *mech, unsigned int body_size)
+{
+	/* sequence contents: 4 fixed octets + OID + body (NEED overflow check) */
+	unsigned int seq_len = body_size + (4 + (int) mech->len);
+
+	/* plus the leading tag octet and the DER length octets */
+	return 1 + der_length_size(seq_len) + seq_len;
+}
+
+/* Write the token header at *buf (assumed large enough), advancing *buf. */
+void g_make_token_header(rawobj_t *mech, int body_size, unsigned char **buf)
+{
+	unsigned char *out = *buf;
+
+	*out++ = 0x60;				/* APPLICATION 0, SEQUENCE */
+	der_write_length(&out, 4 + mech->len + body_size);
+	*out++ = 0x06;				/* OBJECT IDENTIFIER tag */
+	*out++ = (unsigned char) mech->len;	/* OID length, one octet */
+	memcpy(out, (char *) mech->data, (int) mech->len);
+	*buf = out + (int) mech->len;
+}
+
+/*
+ * Check the header of the token at *buf_in, which is toksize bytes long.
+ * Returns 0 on success, G_BAD_TOK_HEADER when the header is malformed or
+ * truncated, and G_WRONG_MECH when its OID does not match mech.  On
+ * success *buf_in is moved just past the mech OID and *body_size is set
+ * to the byte count left after the two token-type bytes that follow it;
+ * on error neither is modified.
+ */
+__u32 g_verify_token_header(rawobj_t *mech, int *body_size,
+			    unsigned char **buf_in, int toksize)
+{
+	unsigned char *cur = *buf_in;
+	rawobj_t tok_oid;
+	int left = toksize;
+	int mech_err = 0;
+	int seq_len;
+
+	/* outer tag: APPLICATION 0, SEQUENCE */
+	left -= 1;
+	if (left < 0 || *cur++ != 0x60)
+		return G_BAD_TOK_HEADER;
+
+	/* the encoded length must cover exactly the rest of the token */
+	seq_len = der_read_length(&cur, &left);
+	if (seq_len < 0 || seq_len != left)
+		return G_BAD_TOK_HEADER;
+
+	/* OBJECT IDENTIFIER tag */
+	left -= 1;
+	if (left < 0 || *cur++ != 0x06)
+		return G_BAD_TOK_HEADER;
+
+	/* single-octet OID length, then the OID octets themselves */
+	left -= 1;
+	if (left < 0)
+		return G_BAD_TOK_HEADER;
+	tok_oid.len = *cur++;
+	left -= tok_oid.len;
+	if (left < 0)
+		return G_BAD_TOK_HEADER;
+	tok_oid.data = cur;
+	cur += tok_oid.len;
+
+	if (!g_OID_equal(&tok_oid, mech))
+		mech_err = G_WRONG_MECH;
+
+	/*
+	 * A mechanism mismatch is reported only after the remaining check,
+	 * so that a header too short for the two token-type bytes is still
+	 * flagged as G_BAD_TOK_HEADER.
+	 */
+	left -= 2;
+	if (left < 0)
+		return G_BAD_TOK_HEADER;
+	if (mech_err)
+		return mech_err;
+
+	*buf_in = cur;
+	*body_size = left;
+	return 0;
+}
+
+/*
+ * Parse the header of the token in in_buf and store a copy of its mech
+ * OID, allocated with OBD_ALLOC_LARGE, in mech.  Returns 0 on success.
+ */
+__u32 g_get_mech_oid(rawobj_t *mech, rawobj_t *in_buf)
+{
+	unsigned char *cur = in_buf->data;
+	int left = in_buf->len;
+
+	/* outer tag: APPLICATION 0, SEQUENCE */
+	left -= 1;
+	if (left < 0 || *cur++ != 0x60)
+		return G_BAD_TOK_HEADER;
+	/* the sequence length only has to decode; its value is not used */
+	if (der_read_length(&cur, &left) < 0)
+		return G_BAD_TOK_HEADER;
+
+	/* OBJECT IDENTIFIER tag */
+	left -= 1;
+	if (left < 0 || *cur++ != 0x06)
+		return G_BAD_TOK_HEADER;
+
+	/* single-octet OID length; the OID octets must fit in what is left */
+	left -= 1;
+	if (left < 0)
+		return G_BAD_TOK_HEADER;
+	mech->len = *cur++;
+	left -= mech->len;
+	if (left < 0)
+		return G_BAD_TOK_HEADER;
+
+	OBD_ALLOC_LARGE(mech->data, mech->len);
+	if (mech->data == NULL)
+		return G_BUFFER_ALLOC;
+	memcpy(mech->data, cur, mech->len);
+	return 0;
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_internal.h b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_internal.h
new file mode 100644
index 000000000000..cbfc47cb3f7b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_internal.h
@@ -0,0 +1,526 @@
+/*
+ * Modified from NFSv4 project for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#ifndef __PTLRPC_GSS_GSS_INTERNAL_H_
+#define __PTLRPC_GSS_GSS_INTERNAL_H_
+
+#include <lustre_sec.h>
+
+/*
+ * rawobj stuff
+ */
+typedef struct netobj_s {
+	__u32	   len;
+	__u8	    data[0];
+} netobj_t;
+
+#define NETOBJ_EMPTY    ((netobj_t) { 0 })
+
+typedef struct rawobj_s {
+	__u32	   len;
+	__u8	   *data;
+} rawobj_t;
+
+#define RAWOBJ_EMPTY    ((rawobj_t) { 0, NULL })
+
+typedef struct rawobj_buf_s {
+	__u32	   dataoff;
+	__u32	   datalen;
+	__u32	   buflen;
+	__u8	   *buf;
+} rawobj_buf_t;
+
+int rawobj_empty(rawobj_t *obj);
+int rawobj_alloc(rawobj_t *obj, char *buf, int len);
+void rawobj_free(rawobj_t *obj);
+int rawobj_equal(rawobj_t *a, rawobj_t *b);
+int rawobj_dup(rawobj_t *dest, rawobj_t *src);
+int rawobj_serialize(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_extract_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_extract_local(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_extract_local_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_from_netobj(rawobj_t *rawobj, netobj_t *netobj);
+int rawobj_from_netobj_alloc(rawobj_t *obj, netobj_t *netobj);
+
+int buffer_extract_bytes(const void **buf, __u32 *buflen,
+			 void *res, __u32 reslen);
+
+/*
+ * Several timeout values. For the client refresh upcall timeout we use the
+ * default of the pipefs implementation.
+ */
+#define __TIMEOUT_DELTA		 (10)
+
+#define GSS_SECINIT_RPC_TIMEOUT					 \
+	(obd_timeout < __TIMEOUT_DELTA ?				\
+	 __TIMEOUT_DELTA : obd_timeout - __TIMEOUT_DELTA)
+
+#define GSS_SECFINI_RPC_TIMEOUT	 (__TIMEOUT_DELTA)
+#define GSS_SECSVC_UPCALL_TIMEOUT       (GSS_SECINIT_RPC_TIMEOUT)
+
+/*
+ * default gc interval
+ */
+#define GSS_GC_INTERVAL		 (60 * 60) /* 60 minutes */
+
+/* Shave __TIMEOUT_DELTA off expiry unless reverse or that lands in the past. */
+static inline
+unsigned long gss_round_ctx_expiry(unsigned long expiry,
+				   unsigned long sec_flags)
+{
+	/* PTLRPC_SEC_FL_REVERSE set: the expiry is returned untouched */
+	if (!(sec_flags & PTLRPC_SEC_FL_REVERSE) &&
+	    expiry >= get_seconds() + __TIMEOUT_DELTA)
+		expiry -= __TIMEOUT_DELTA;
+
+	return expiry;
+}
+
+/*
+ * Max encryption element in block cipher algorithms.
+ */
+#define GSS_MAX_CIPHER_BLOCK	       (16)
+
+/*
+ * XXX make it visible to both the kernel and lgssd/lsvcgssd
+ */
+#define GSSD_INTERFACE_VERSION	  (1)
+
+#define PTLRPC_GSS_VERSION	      (1)
+
+
+enum ptlrpc_gss_proc {
+	PTLRPC_GSS_PROC_DATA	    = 0,
+	PTLRPC_GSS_PROC_INIT	    = 1,
+	PTLRPC_GSS_PROC_CONTINUE_INIT   = 2,
+	PTLRPC_GSS_PROC_DESTROY	 = 3,
+	PTLRPC_GSS_PROC_ERR	     = 4,
+};
+
+enum ptlrpc_gss_tgt {
+	LUSTRE_GSS_TGT_MGS	      = 0,
+	LUSTRE_GSS_TGT_MDS	      = 1,
+	LUSTRE_GSS_TGT_OSS	      = 2,
+};
+
+enum ptlrpc_gss_header_flags {
+	LUSTRE_GSS_PACK_BULK	    = 1,
+	LUSTRE_GSS_PACK_USER	    = 2,
+};
+
+/* Translate the import's obd type name into a LUSTRE_GSS_TGT_* value. */
+static inline __u32 import_to_gss_svc(struct obd_import *imp)
+{
+	const char *type = imp->imp_obd->obd_type->typ_name;
+
+	if (strcmp(type, LUSTRE_MGC_NAME) == 0)
+		return LUSTRE_GSS_TGT_MGS;
+	if (strcmp(type, LUSTRE_MDC_NAME) == 0)
+		return LUSTRE_GSS_TGT_MDS;
+	if (strcmp(type, LUSTRE_OSC_NAME) == 0)
+		return LUSTRE_GSS_TGT_OSS;
+	LBUG();		/* reached only for a type name not matched above */
+	return 0;
+}
+
+/*
+ * The following 3 headers must have the same size and field offsets.
+ */
+struct gss_header {
+	__u8		    gh_version;     /* gss version */
+	__u8		    gh_sp;	  /* sec part */
+	__u16		   gh_pad0;
+	__u32		   gh_flags;       /* wrap flags */
+	__u32		   gh_proc;	/* proc */
+	__u32		   gh_seq;	 /* sequence */
+	__u32		   gh_svc;	 /* service */
+	__u32		   gh_pad1;
+	__u32		   gh_pad2;
+	__u32		   gh_pad3;
+	netobj_t		gh_handle;      /* context handle */
+};
+
+struct gss_rep_header {
+	__u8		    gh_version;
+	__u8		    gh_sp;
+	__u16		   gh_pad0;
+	__u32		   gh_flags;
+	__u32		   gh_proc;
+	__u32		   gh_major;
+	__u32		   gh_minor;
+	__u32		   gh_seqwin;
+	__u32		   gh_pad2;
+	__u32		   gh_pad3;
+	netobj_t		gh_handle;
+};
+
+struct gss_err_header {
+	__u8		    gh_version;
+	__u8		    gh_sp;
+	__u16		   gh_pad0;
+	__u32		   gh_flags;
+	__u32		   gh_proc;
+	__u32		   gh_major;
+	__u32		   gh_minor;
+	__u32		   gh_pad1;
+	__u32		   gh_pad2;
+	__u32		   gh_pad3;
+	netobj_t		gh_handle;
+};
+
+/*
+ * Part of the wire context information sent from the client, which is saved
+ * and used later by the server.
+ */
+struct gss_wire_ctx {
+	__u32		   gw_flags;
+	__u32		   gw_proc;
+	__u32		   gw_seq;
+	__u32		   gw_svc;
+	rawobj_t		gw_handle;
+};
+
+#define PTLRPC_GSS_MAX_HANDLE_SIZE      (8)
+#define PTLRPC_GSS_HEADER_SIZE	  (sizeof(struct gss_header) + \
+					 PTLRPC_GSS_MAX_HANDLE_SIZE)
+
+static inline __u64 gss_handle_to_u64(rawobj_t *handle)
+{
+	__u64 val = -1;	/* returned when the handle is not 8 bytes long */
+	if (handle->len == PTLRPC_GSS_MAX_HANDLE_SIZE)
+		memcpy(&val, handle->data, sizeof(val)); /* data may be unaligned */
+	return val;
+}
+
+#define GSS_SEQ_WIN		     (2048)
+#define GSS_SEQ_WIN_MAIN		GSS_SEQ_WIN
+#define GSS_SEQ_WIN_BACK		(128)
+#define GSS_SEQ_REPACK_THRESHOLD	(GSS_SEQ_WIN_MAIN / 2 + \
+					 GSS_SEQ_WIN_MAIN / 4)
+
+struct gss_svc_seq_data {
+	spinlock_t		ssd_lock;
+	/*
+	 * highest sequence number seen so far, for main and back window
+	 */
+	__u32		   ssd_max_main;
+	__u32		   ssd_max_back;
+	/*
+	 * main and back window
+	 * for i such that ssd_max - GSS_SEQ_WIN < i <= ssd_max, the i-th bit
+	 * of ssd_win is nonzero iff sequence number i has been seen already.
+	 */
+	unsigned long	   ssd_win_main[GSS_SEQ_WIN_MAIN/BITS_PER_LONG];
+	unsigned long	   ssd_win_back[GSS_SEQ_WIN_BACK/BITS_PER_LONG];
+};
+
+struct gss_svc_ctx {
+	struct gss_ctx	 *gsc_mechctx;
+	struct gss_svc_seq_data gsc_seqdata;
+	rawobj_t		gsc_rvs_hdl;
+	__u32		   gsc_rvs_seq;
+	uid_t		   gsc_uid;
+	gid_t		   gsc_gid;
+	uid_t		   gsc_mapped_uid;
+	unsigned int	    gsc_usr_root:1,
+				gsc_usr_mds:1,
+				gsc_usr_oss:1,
+				gsc_remote:1,
+				gsc_reverse:1;
+};
+
+struct gss_svc_reqctx {
+	struct ptlrpc_svc_ctx	   src_base;
+	/*
+	 * context
+	 */
+	struct gss_wire_ctx	     src_wirectx;
+	struct gss_svc_ctx	     *src_ctx;
+	/*
+	 * record place of bulk_sec_desc in request/reply buffer
+	 */
+	struct ptlrpc_bulk_sec_desc    *src_reqbsd;
+	int			     src_reqbsd_size;
+	struct ptlrpc_bulk_sec_desc    *src_repbsd;
+	int			     src_repbsd_size;
+	/*
+	 * flags
+	 */
+	unsigned int		    src_init:1,
+					src_init_continue:1,
+					src_err_notify:1;
+	int			     src_reserve_len;
+};
+
+struct gss_cli_ctx {
+	struct ptlrpc_cli_ctx   gc_base;
+	__u32		   gc_flavor;
+	__u32		   gc_proc;
+	__u32		   gc_win;
+	atomic_t	    gc_seq;
+	rawobj_t		gc_handle;
+	struct gss_ctx	 *gc_mechctx;
+	/* handle for the buddy svc ctx */
+	rawobj_t		gc_svc_handle;
+};
+
+struct gss_cli_ctx_keyring {
+	struct gss_cli_ctx      gck_base;
+	struct key	     *gck_key;
+	struct timer_list      *gck_timer;
+};
+
+struct gss_sec {
+	struct ptlrpc_sec	gs_base;
+	struct gss_api_mech	*gs_mech;
+	spinlock_t		gs_lock;
+	__u64			gs_rvs_hdl;
+};
+
+struct gss_sec_pipefs {
+	struct gss_sec	  gsp_base;
+	int		     gsp_chash_size;  /* must be 2^n */
+	struct hlist_head	gsp_chash[0];
+};
+
+/*
+ * FIXME cleanup the keyring upcall mutexes
+ */
+#define HAVE_KEYRING_UPCALL_SERIALIZED  1
+
+struct gss_sec_keyring {
+	struct gss_sec	  gsk_base;
+	/*
+	 * all contexts listed here. access is protected by sec spinlock.
+	 */
+	struct hlist_head	gsk_clist;
+	/*
+	 * specially point to root ctx (only one at a time). access is
+	 * protected by sec spinlock.
+	 */
+	struct ptlrpc_cli_ctx  *gsk_root_ctx;
+	/*
+	 * specially serialize upcalls for root context.
+	 */
+	struct mutex			gsk_root_uc_lock;
+
+#ifdef HAVE_KEYRING_UPCALL_SERIALIZED
+	struct mutex		gsk_uc_lock;	/* serialize upcalls */
+#endif
+};
+
+static inline struct gss_cli_ctx *ctx2gctx(struct ptlrpc_cli_ctx *ctx)
+{
+	return container_of(ctx, struct gss_cli_ctx, gc_base);
+}
+
+static inline
+struct gss_cli_ctx_keyring *ctx2gctx_keyring(struct ptlrpc_cli_ctx *ctx)
+{
+	return container_of(ctx2gctx(ctx),
+			    struct gss_cli_ctx_keyring, gck_base);
+}
+
+static inline struct gss_sec *sec2gsec(struct ptlrpc_sec *sec)
+{
+	return container_of(sec, struct gss_sec, gs_base);
+}
+
+static inline struct gss_sec_pipefs *sec2gsec_pipefs(struct ptlrpc_sec *sec)
+{
+	return container_of(sec2gsec(sec), struct gss_sec_pipefs, gsp_base);
+}
+
+static inline struct gss_sec_keyring *sec2gsec_keyring(struct ptlrpc_sec *sec)
+{
+	return container_of(sec2gsec(sec), struct gss_sec_keyring, gsk_base);
+}
+
+
+#define GSS_CTX_INIT_MAX_LEN	    (1024)
+
+/*
+ * This is only guaranteed to be enough for the current krb5 des-cbc-crc. We
+ * might adjust this when a new enc type or mech is added.
+ */
+#define GSS_PRIVBUF_PREFIX_LEN	 (32)
+#define GSS_PRIVBUF_SUFFIX_LEN	 (32)
+
+static inline
+struct gss_svc_reqctx *gss_svc_ctx2reqctx(struct ptlrpc_svc_ctx *ctx)
+{
+	LASSERT(ctx);
+	return container_of(ctx, struct gss_svc_reqctx, src_base);
+}
+
+static inline
+struct gss_svc_ctx *gss_svc_ctx2gssctx(struct ptlrpc_svc_ctx *ctx)
+{
+	LASSERT(ctx);
+	return gss_svc_ctx2reqctx(ctx)->src_ctx;
+}
+
+/* sec_gss.c */
+int gss_cli_ctx_match(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred);
+int gss_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize);
+int gss_cli_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req);
+int gss_cli_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req);
+int gss_cli_ctx_seal(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req);
+int gss_cli_ctx_unseal(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req);
+
+int  gss_sec_install_rctx(struct obd_import *imp, struct ptlrpc_sec *sec,
+			  struct ptlrpc_cli_ctx *ctx);
+int  gss_alloc_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req,
+		      int msgsize);
+void gss_free_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req);
+int  gss_alloc_repbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req,
+		      int msgsize);
+void gss_free_repbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req);
+int  gss_enlarge_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req,
+			int segment, int newsize);
+
+int  gss_svc_accept(struct ptlrpc_sec_policy *policy,
+		    struct ptlrpc_request *req);
+void gss_svc_invalidate_ctx(struct ptlrpc_svc_ctx *svc_ctx);
+int  gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen);
+int  gss_svc_authorize(struct ptlrpc_request *req);
+void gss_svc_free_rs(struct ptlrpc_reply_state *rs);
+void gss_svc_free_ctx(struct ptlrpc_svc_ctx *ctx);
+
+int cli_ctx_expire(struct ptlrpc_cli_ctx *ctx);
+int cli_ctx_check_death(struct ptlrpc_cli_ctx *ctx);
+
+int gss_copy_rvc_cli_ctx(struct ptlrpc_cli_ctx *cli_ctx,
+			 struct ptlrpc_svc_ctx *svc_ctx);
+
+struct gss_header *gss_swab_header(struct lustre_msg *msg, int segment,
+				   int swabbed);
+netobj_t *gss_swab_netobj(struct lustre_msg *msg, int segment);
+
+void gss_cli_ctx_uptodate(struct gss_cli_ctx *gctx);
+int gss_pack_err_notify(struct ptlrpc_request *req, __u32 major, __u32 minor);
+int gss_check_seq_num(struct gss_svc_seq_data *sd, __u32 seq_num, int set);
+
+int gss_sec_create_common(struct gss_sec *gsec,
+			  struct ptlrpc_sec_policy *policy,
+			  struct obd_import *imp,
+			  struct ptlrpc_svc_ctx *ctx,
+			  struct sptlrpc_flavor *sf);
+void gss_sec_destroy_common(struct gss_sec *gsec);
+void gss_sec_kill(struct ptlrpc_sec *sec);
+
+int gss_cli_ctx_init_common(struct ptlrpc_sec *sec,
+			    struct ptlrpc_cli_ctx *ctx,
+			    struct ptlrpc_ctx_ops *ctxops,
+			    struct vfs_cred *vcred);
+int gss_cli_ctx_fini_common(struct ptlrpc_sec *sec,
+			    struct ptlrpc_cli_ctx *ctx);
+
+void gss_cli_ctx_flags2str(unsigned long flags, char *buf, int bufsize);
+
+/* gss_keyring.c */
+int  __init gss_init_keyring(void);
+void __exit gss_exit_keyring(void);
+
+/* gss_pipefs.c */
+int  __init gss_init_pipefs(void);
+void __exit gss_exit_pipefs(void);
+
+/* gss_bulk.c */
+int gss_cli_prep_bulk(struct ptlrpc_request *req,
+		      struct ptlrpc_bulk_desc *desc);
+int gss_cli_ctx_wrap_bulk(struct ptlrpc_cli_ctx *ctx,
+			  struct ptlrpc_request *req,
+			  struct ptlrpc_bulk_desc *desc);
+int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx,
+			    struct ptlrpc_request *req,
+			    struct ptlrpc_bulk_desc *desc);
+int gss_svc_prep_bulk(struct ptlrpc_request *req,
+		      struct ptlrpc_bulk_desc *desc);
+int gss_svc_unwrap_bulk(struct ptlrpc_request *req,
+			struct ptlrpc_bulk_desc *desc);
+int gss_svc_wrap_bulk(struct ptlrpc_request *req,
+		      struct ptlrpc_bulk_desc *desc);
+
+/* gss_mech_switch.c */
+int init_kerberos_module(void);
+void cleanup_kerberos_module(void);
+
+/* gss_generic_token.c */
+int g_token_size(rawobj_t *mech, unsigned int body_size);
+void g_make_token_header(rawobj_t *mech, int body_size, unsigned char **buf);
+__u32 g_verify_token_header(rawobj_t *mech, int *body_size,
+			    unsigned char **buf_in, int toksize);
+
+
+/* gss_cli_upcall.c */
+int gss_do_ctx_init_rpc(char *buffer, unsigned long count);
+int gss_do_ctx_fini_rpc(struct gss_cli_ctx *gctx);
+
+int  __init gss_init_cli_upcall(void);
+void __exit gss_exit_cli_upcall(void);
+
+/* gss_svc_upcall.c */
+__u64 gss_get_next_ctx_index(void);
+int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp,
+				   struct gss_sec *gsec,
+				   struct gss_cli_ctx *gctx);
+int gss_svc_upcall_expire_rvs_ctx(rawobj_t *handle);
+int gss_svc_upcall_dup_handle(rawobj_t *handle, struct gss_svc_ctx *ctx);
+int gss_svc_upcall_update_sequence(rawobj_t *handle, __u32 seq);
+int gss_svc_upcall_handle_init(struct ptlrpc_request *req,
+			       struct gss_svc_reqctx *grctx,
+			       struct gss_wire_ctx *gw,
+			       struct obd_device *target,
+			       __u32 lustre_svc,
+			       rawobj_t *rvs_hdl,
+			       rawobj_t *in_token);
+struct gss_svc_ctx *gss_svc_upcall_get_ctx(struct ptlrpc_request *req,
+					   struct gss_wire_ctx *gw);
+void gss_svc_upcall_put_ctx(struct gss_svc_ctx *ctx);
+void gss_svc_upcall_destroy_ctx(struct gss_svc_ctx *ctx);
+
+int  __init gss_init_svc_upcall(void);
+void __exit gss_exit_svc_upcall(void);
+
+/* lproc_gss.c */
+void gss_stat_oos_record_cli(int behind);
+void gss_stat_oos_record_svc(int phase, int replay);
+
+int  __init gss_init_lproc(void);
+void __exit gss_exit_lproc(void);
+
+/* gss_krb5_mech.c */
+int __init init_kerberos_module(void);
+void __exit cleanup_kerberos_module(void);
+
+
+/* debug helper: print size bytes at ptr as a hex string via LCONSOLE_INFO */
+static inline
+void __dbg_memdump(char *name, void *ptr, int size)
+{
+	const __u8 *bytes = ptr;
+	int bufsize = size * 2 + 1, i;
+	char *hex;
+
+	OBD_ALLOC(hex, bufsize);
+	if (hex == NULL) {
+		CDEBUG(D_ERROR, "DUMP ERROR: can't alloc %d bytes\n", bufsize);
+		return;
+	}
+	for (i = 0; i < size; i++)	/* two hex digits per byte */
+		sprintf(hex + 2 * i, "%02x", bytes[i]);
+	hex[2 * size] = '\0';
+	LCONSOLE_INFO("DUMP %s@%p(%d): %s\n", name, ptr, size, hex);
+	OBD_FREE(hex, bufsize);
+}
+
+#endif /* __PTLRPC_GSS_GSS_INTERNAL_H_ */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_keyring.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_keyring.c
new file mode 100644
index 000000000000..bb571ae51054
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_keyring.c
@@ -0,0 +1,1424 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/gss/gss_keyring.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/crypto.h>
+#include <linux/key.h>
+#include <linux/keyctl.h>
+#include <linux/key-type.h>
+#include <linux/mutex.h>
+#include <asm/atomic.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_sec.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+static struct ptlrpc_sec_policy gss_policy_keyring;
+static struct ptlrpc_ctx_ops gss_keyring_ctxops;
+static struct key_type gss_key_type;
+
+static int sec_install_rctx_kr(struct ptlrpc_sec *sec,
+			       struct ptlrpc_svc_ctx *svc_ctx);
+
+/*
+ * the timeout only covers the case where the upcall child process dies
+ * abnormally. in all other cases it should eventually update the kernel key.
+ *
+ * FIXME we had better incorporate the client & server side upcall timeouts
+ * into the framework of Adaptive Timeouts, but we need to figure out how to
+ * make sure the kernel knows whether the upcall process is in progress or
+ * has died unexpectedly.
+ */
+#define KEYRING_UPCALL_TIMEOUT  (obd_timeout + obd_timeout)
+
+/****************************************
+ * internal helpers		     *
+ ****************************************/
+
+#define DUMP_PROCESS_KEYRINGS(tsk)					\
+{	/* log keyring serials of @tsk; @tsk is evaluated repeatedly */	\
+	CWARN("DUMP PK: %s[%u,%u/%u](<-%s[%u,%u/%u]): "			\
+	      "a %d, t %d, p %d, s %d, u %d, us %d, df %d\n",		\
+	      (tsk)->comm, (tsk)->pid, (tsk)->uid, (tsk)->fsuid,	\
+	      (tsk)->parent->comm, (tsk)->parent->pid,			\
+	      (tsk)->parent->uid, (tsk)->parent->fsuid,			\
+	      (tsk)->request_key_auth ?					\
+	      (tsk)->request_key_auth->serial : 0,			\
+	      key_cred(tsk)->thread_keyring ?				\
+	      key_cred(tsk)->thread_keyring->serial : 0,		\
+	      key_tgcred(tsk)->process_keyring ?			\
+	      key_tgcred(tsk)->process_keyring->serial : 0,		\
+	      key_tgcred(tsk)->session_keyring ?			\
+	      key_tgcred(tsk)->session_keyring->serial : 0,		\
+	      key_cred(tsk)->user->uid_keyring ?			\
+	      key_cred(tsk)->user->uid_keyring->serial : 0,		\
+	      key_cred(tsk)->user->session_keyring ?			\
+	      key_cred(tsk)->user->session_keyring->serial : 0,		\
+	      key_cred(tsk)->jit_keyring				\
+	     );								\
+}
+
+#define DUMP_KEY(key)							\
+{	/* log one key; @key is evaluated more than once */		\
+	CWARN("DUMP KEY: %p(%d) ref %d u%u/g%u desc %s\n",		\
+	      (key), (key)->serial, atomic_read(&(key)->usage),		\
+	      (key)->uid, (key)->gid,					\
+	      (key)->description ? (key)->description : "n/a"		\
+	     );								\
+}
+
+#define key_cred(tsk)   ((tsk)->cred)		/* cred pointer of task @tsk */
+#define key_tgcred(tsk) ((tsk)->cred->tgcred)	/* @tsk's cred->tgcred */
+
+static inline void keyring_upcall_lock(struct gss_sec_keyring *gsec_kr)
+{
+#ifdef HAVE_KEYRING_UPCALL_SERIALIZED
+	mutex_lock(&gsec_kr->gsk_uc_lock);	/* taken around request_key() */
+#endif /* without HAVE_KEYRING_UPCALL_SERIALIZED this is a no-op */
+}
+
+static inline void keyring_upcall_unlock(struct gss_sec_keyring *gsec_kr)
+{
+#ifdef HAVE_KEYRING_UPCALL_SERIALIZED
+	mutex_unlock(&gsec_kr->gsk_uc_lock);	/* pairs with keyring_upcall_lock() */
+#endif /* without HAVE_KEYRING_UPCALL_SERIALIZED this is a no-op */
+}
+
+static inline void key_revoke_locked(struct key *key)
+{
+	set_bit(KEY_FLAG_REVOKED, &key->flags);	/* only sets the revoked flag */
+}
+
+/* gck_timer handler: the upcall timed out, expire the ctx and revoke its key */
+static void ctx_upcall_timeout_kr(unsigned long data)
+{
+	struct ptlrpc_cli_ctx *ctx = (struct ptlrpc_cli_ctx *) data;
+	struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx);
+	struct key *key = gctx_kr->gck_key;
+
+	CWARN("ctx %p, key %p\n", ctx, key);
+	LASSERT(key);
+	cli_ctx_expire(ctx);
+	key_revoke_locked(key);
+}
+
+/*
+ * arm the context's upcall timer to fire @timeout seconds from now; on
+ * expiry ctx_upcall_timeout_kr() is called with @ctx as its argument.
+ */
+static void ctx_start_timer_kr(struct ptlrpc_cli_ctx *ctx, long timeout)
+{
+	struct timer_list *timer = ctx2gctx_keyring(ctx)->gck_timer;
+
+	LASSERT(timer);
+	CDEBUG(D_SEC, "ctx %p: start timer %lds\n", ctx, timeout);
+
+	init_timer(timer);
+	timer->function = ctx_upcall_timeout_kr;
+	timer->data = (unsigned long) ctx;
+	/* turn the relative timeout in seconds into an absolute tick count */
+	timer->expires = timeout * HZ + cfs_time_current();
+	add_timer(timer);
+}
+
+/*
+ * stop the ctx's upcall timer (if one is still attached) and free it.
+ *
+ * caller should make sure no race with other threads
+ */
+static void ctx_clear_timer_kr(struct ptlrpc_cli_ctx *ctx)
+{
+	struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx);
+	struct timer_list	   *timer = gctx_kr->gck_timer;
+
+	if (timer != NULL) {
+		CDEBUG(D_SEC, "ctx %p, key %p\n", ctx, gctx_kr->gck_key);
+
+		/* detach from the ctx before deleting and freeing it */
+		gctx_kr->gck_timer = NULL;
+		del_singleshot_timer_sync(timer);
+
+		OBD_FREE_PTR(timer);
+	}
+}
+
+/* allocate a keyring cli ctx plus its upcall timer and run the common gss
+ * ctx init; returns the ctx with a ref taken for the caller, or NULL. */
+static struct ptlrpc_cli_ctx *ctx_create_kr(struct ptlrpc_sec *sec,
+					    struct vfs_cred *vcred)
+{
+	struct gss_cli_ctx_keyring *gctx_kr;
+	struct ptlrpc_cli_ctx      *ctx;
+
+	OBD_ALLOC_PTR(gctx_kr);
+	if (gctx_kr == NULL)
+		return NULL;
+
+	OBD_ALLOC_PTR(gctx_kr->gck_timer);
+	if (gctx_kr->gck_timer == NULL)
+		goto err_ctx;
+	init_timer(gctx_kr->gck_timer);
+
+	ctx = &gctx_kr->gck_base.gc_base;
+	if (gss_cli_ctx_init_common(sec, ctx, &gss_keyring_ctxops, vcred))
+		goto err_timer;
+
+	ctx->cc_expire = cfs_time_current_sec() + KEYRING_UPCALL_TIMEOUT;
+	clear_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags);
+	atomic_inc(&ctx->cc_refcount); /* for the caller */
+	return ctx;
+
+err_timer:
+	OBD_FREE_PTR(gctx_kr->gck_timer);
+err_ctx:
+	OBD_FREE_PTR(gctx_kr);
+	return NULL;
+}
+
+static void ctx_destroy_kr(struct ptlrpc_cli_ctx *ctx)
+{
+	struct ptlrpc_sec	  *sec = ctx->cc_sec;
+	struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx);
+
+	CDEBUG(D_SEC, "destroying ctx %p\n", ctx);
+
+	/* at this time the association with key has been broken. */
+	LASSERT(sec);
+	LASSERT(atomic_read(&sec->ps_refcount) > 0);
+	LASSERT(atomic_read(&sec->ps_nctx) > 0);
+	LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0);
+	LASSERT(gctx_kr->gck_key == NULL);
+
+	ctx_clear_timer_kr(ctx);	/* frees the timer, NULLs gck_timer */
+	LASSERT(gctx_kr->gck_timer == NULL);
+
+	if (gss_cli_ctx_fini_common(sec, ctx))
+		return;			/* non-zero: skip the free below */
+
+	OBD_FREE_PTR(gctx_kr);
+
+	atomic_dec(&sec->ps_nctx);	/* one ctx less counted on the sec */
+	sptlrpc_sec_put(sec);
+}
+
+static void ctx_release_kr(struct ptlrpc_cli_ctx *ctx, int sync)
+{
+	if (sync) {
+		ctx_destroy_kr(ctx);
+		return;
+	}
+	atomic_inc(&ctx->cc_refcount);	/* re-take a ref before queueing */
+	sptlrpc_gc_add_ctx(ctx);
+}
+
+static void ctx_put_kr(struct ptlrpc_cli_ctx *ctx, int sync)
+{
+	LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+	if (!atomic_dec_and_test(&ctx->cc_refcount))
+		return;		/* not the last reference */
+	ctx_release_kr(ctx, sync);
+}
+
+/*
+ * key <-> ctx association and rules:
+ * - a ctx might not be bound to any key
+ * - key/ctx binding is protected by the key semaphore (if the key is present)
+ * - key and ctx each take a reference on the other
+ * - ctx enlist/unlist is protected by the ctx spinlock
+ * - never enlist a ctx after it has been unlisted
+ * - whoever does the enlist should also do the bind, locking the key first:
+ *   - lock key -> lock ctx -> enlist -> unlock ctx -> bind -> unlock key
+ * - whoever does the unlist should also do the unbind, using one of:
+ *   - lock key -> lock ctx -> unlist -> unlock ctx -> unbind -> unlock key
+ *   - lock ctx -> unlist -> unlock ctx -> lock key -> unbind -> unlock key
+ */
+
+static inline void spin_lock_if(spinlock_t *lock, int condition)
+{
+	if (condition)		/* take @lock only when @condition != 0 */
+		spin_lock(lock);
+}
+
+static inline void spin_unlock_if(spinlock_t *lock, int condition)
+{
+	if (condition)		/* release @lock only when @condition != 0 */
+		spin_unlock(lock);
+}
+
+static void ctx_enlist_kr(struct ptlrpc_cli_ctx *ctx, int is_root, int locked)
+{
+	struct ptlrpc_sec      *sec = ctx->cc_sec;
+	struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec);
+
+	LASSERT(!test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags));
+	LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+	spin_lock_if(&sec->ps_lock, !locked);	/* skipped when @locked != 0 */
+
+	atomic_inc(&ctx->cc_refcount);		/* ref for being on the list */
+	set_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags);
+	hlist_add_head(&ctx->cc_cache, &gsec_kr->gsk_clist);
+	if (is_root)
+		gsec_kr->gsk_root_ctx = ctx;	/* also record as root ctx */
+
+	spin_unlock_if(&sec->ps_lock, !locked);
+}
+
+/*
+ * remove @ctx from the sec's ctx list and drop the reference taken for it.
+ * Note after this get called, caller should not access ctx again because
+ * it might have been freed, unless caller hold at least one refcount of
+ * the ctx. sec->ps_lock is taken here only when @locked is zero.
+ * return non-zero if we indeed unlist this ctx.
+ */
+static int ctx_unlist_kr(struct ptlrpc_cli_ctx *ctx, int locked)
+{
+	struct ptlrpc_sec       *sec = ctx->cc_sec;
+	struct gss_sec_keyring  *gsec_kr = sec2gsec_keyring(sec);
+
+	/* if hashed bit has gone, leave the job to somebody who is doing it */
+	if (test_and_clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0)
+		return 0;
+
+	/* drop ref inside spin lock to prevent race with other operations */
+	spin_lock_if(&sec->ps_lock, !locked);
+
+	if (gsec_kr->gsk_root_ctx == ctx)
+		gsec_kr->gsk_root_ctx = NULL;	/* no longer the root ctx */
+	hlist_del_init(&ctx->cc_cache);
+	atomic_dec(&ctx->cc_refcount);		/* the in-list reference */
+
+	spin_unlock_if(&sec->ps_lock, !locked);
+
+	return 1;
+}
+
+/*
+ * bind a key with a ctx together: each side takes a reference on the other.
+ * caller must hold write lock of the key, as well as ref on key & ctx.
+ */
+static void bind_key_ctx(struct key *key, struct ptlrpc_cli_ctx *ctx)
+{
+	LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+	LASSERT(atomic_read(&key->usage) > 0);
+	LASSERT(ctx2gctx_keyring(ctx)->gck_key == NULL);
+	LASSERT(key->payload.data == NULL);
+
+	/* at this time context may or may not in list. */
+	key_get(key);				/* key ref held via gck_key */
+	atomic_inc(&ctx->cc_refcount);		/* ctx ref held via payload */
+	ctx2gctx_keyring(ctx)->gck_key = key;
+	key->payload.data = ctx;
+}
+
+/*
+ * unbind a key and a ctx: revoke the key, clear both pointers, drop both refs.
+ * caller must hold write lock, as well as a ref of the key.
+ */
+static void unbind_key_ctx(struct key *key, struct ptlrpc_cli_ctx *ctx)
+{
+	LASSERT(key->payload.data == ctx);
+	LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0);
+
+	/* must revoke the key, or others may treat it as newly created */
+	key_revoke_locked(key);
+
+	key->payload.data = NULL;
+	ctx2gctx_keyring(ctx)->gck_key = NULL;
+
+	/* once ctx get split from key, the timer is meaningless */
+	ctx_clear_timer_kr(ctx);
+
+	ctx_put_kr(ctx, 1);	/* ctx ref taken in bind_key_ctx(), sync release */
+	key_put(key);		/* key ref taken in bind_key_ctx() */
+}
+
+/*
+ * given a ctx, unbind it from its coupled key, if it has one.
+ * unbind could only be called once, so we don't worry the key be released
+ * by someone else.
+ */
+static void unbind_ctx_kr(struct ptlrpc_cli_ctx *ctx)
+{
+	struct key *key = ctx2gctx_keyring(ctx)->gck_key;
+
+	if (key == NULL)
+		return;
+	LASSERT(key->payload.data == ctx);
+
+	key_get(key);	/* own ref, dropped again at the end */
+	down_write(&key->sem);
+	unbind_key_ctx(key, ctx);
+	up_write(&key->sem);
+	key_put(key);
+}
+
+/*
+ * unbind @key from the ctx its payload points at; no-op when there is none.
+ * caller must hold write lock, as well as a ref of the key.
+ */
+static void unbind_key_locked(struct key *key)
+{
+	struct ptlrpc_cli_ctx *bound = key->payload.data;
+
+	if (bound != NULL)
+		unbind_key_ctx(key, bound);
+}
+
+/* unlist a ctx and, only if this call did the unlisting, unbind it from
+ * its coupled key as well */
+static void kill_ctx_kr(struct ptlrpc_cli_ctx *ctx)
+{
+	if (!ctx_unlist_kr(ctx, 0))
+		return;
+	unbind_ctx_kr(ctx);
+}
+
+/* given a key, unlist and unbind with the coupled ctx (if any).
+ * caller must hold write lock, as well as a ref of the key.
+ */
+static void kill_key_locked(struct key *key)
+{
+	struct ptlrpc_cli_ctx *ctx = key->payload.data;
+
+	if (ctx == NULL || !ctx_unlist_kr(ctx, 0))
+		return;
+	unbind_key_locked(key);
+}
+
+/*
+ * tear down every ctx on @freelist; caller should hold one ref on each.
+ */
+static void dispose_ctx_list_kr(struct hlist_head *freelist)
+{
+	struct hlist_node      *next;
+	struct ptlrpc_cli_ctx  *ctx;
+	struct gss_cli_ctx     *gctx;
+
+	hlist_for_each_entry_safe(ctx, next, freelist, cc_cache) {
+		hlist_del_init(&ctx->cc_cache);
+
+		/* reverse ctx: update current seq to buddy svcctx if exist.
+		 * ideally this should be done at gss_cli_ctx_finalize(), but
+		 * the ctx destroy could be delayed by:
+		 *  1) ctx still has reference;
+		 *  2) ctx destroy is asynchronous;
+		 * and reverse import call inval_all_ctx() require this be done
+		 *_immediately_ otherwise newly created reverse ctx might copy
+		 * the very old sequence number from svcctx. */
+		gctx = ctx2gctx(ctx);
+		if (!rawobj_empty(&gctx->gc_svc_handle) &&
+		    sec_is_reverse(gctx->gc_base.cc_sec)) {
+			gss_svc_upcall_update_sequence(&gctx->gc_svc_handle,
+					(__u32) atomic_read(&gctx->gc_seq));
+		}
+
+		/* we need to wakeup waiting reqs here. the context might
+		 * be forced released before upcall finished, then the
+		 * late-arrived downcall can't find the ctx even. */
+		sptlrpc_cli_ctx_wakeup(ctx);
+
+		unbind_ctx_kr(ctx);	/* split from its key, if bound */
+		ctx_put_kr(ctx, 0);	/* drop the freelist ref, sync == 0 */
+	}
+}
+
+/*
+ * lookup a root context directly in a sec, return root ctx with a
+ * reference taken or NULL. done entirely under sec->ps_lock.
+ */
+static
+struct ptlrpc_cli_ctx * sec_lookup_root_ctx_kr(struct ptlrpc_sec *sec)
+{
+	struct gss_sec_keyring  *gsec_kr = sec2gsec_keyring(sec);
+	struct ptlrpc_cli_ctx   *ctx = NULL;
+
+	spin_lock(&sec->ps_lock);
+
+	ctx = gsec_kr->gsk_root_ctx;
+
+	if (ctx == NULL && unlikely(sec_is_reverse(sec))) {
+		struct ptlrpc_cli_ctx  *tmp;
+
+		/* reverse ctx, search root ctx in list, choose the one
+		 * with shortest expire time, which is most possibly have
+		 * an established peer ctx at client side. */
+		hlist_for_each_entry(tmp, &gsec_kr->gsk_clist, cc_cache) {
+			if (ctx == NULL || ctx->cc_expire == 0 ||
+			    ctx->cc_expire > tmp->cc_expire) {
+				ctx = tmp;
+				/* promote to be root_ctx */
+				gsec_kr->gsk_root_ctx = ctx;
+			}
+		}
+	}
+
+	if (ctx) {
+		LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+		LASSERT(!hlist_empty(&gsec_kr->gsk_clist));
+		atomic_inc(&ctx->cc_refcount);	/* reference for the caller */
+	}
+
+	spin_unlock(&sec->ps_lock);
+
+	return ctx;
+}
+
+#define RVS_CTX_EXPIRE_NICE    (10)	/* seconds, added to "now" below */
+
+/* install @new_ctx on a reverse sec; existing ctxs get a short expiry and
+ * @new_ctx becomes root ctx only if none is set. runs under sec->ps_lock. */
+static void rvs_sec_install_root_ctx_kr(struct ptlrpc_sec *sec,
+					struct ptlrpc_cli_ctx *new_ctx,
+					struct key *key)
+{
+	struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec);
+	struct ptlrpc_cli_ctx  *ctx;
+	cfs_time_t	      now;
+	ENTRY;
+
+	LASSERT(sec_is_reverse(sec));
+
+	spin_lock(&sec->ps_lock);
+	now = cfs_time_current_sec();
+
+	/* set all existing ctxs short expiry */
+	hlist_for_each_entry(ctx, &gsec_kr->gsk_clist, cc_cache) {
+		if (ctx->cc_expire > now + RVS_CTX_EXPIRE_NICE) {
+			ctx->cc_early_expire = 1;
+			ctx->cc_expire = now + RVS_CTX_EXPIRE_NICE;
+		}
+	}
+
+	/* if there's root_ctx there, instead obsolete the current
+	 * immediately, we leave it continue operating for a little while.
+	 * hopefully when the first backward rpc with newest ctx send out,
+	 * the client side already have the peer ctx well established. */
+	ctx_enlist_kr(new_ctx, gsec_kr->gsk_root_ctx ? 0 : 1, 1);
+	if (key)
+		bind_key_ctx(key, new_ctx);
+	spin_unlock(&sec->ps_lock);
+
+	EXIT;
+}
+
+static void construct_key_desc(void *buf, int bufsize,
+			       struct ptlrpc_sec *sec, uid_t uid)
+{
+	snprintf(buf, bufsize, "%d@%x", uid, sec->ps_id); /* "<uid>@<sec id>" */
+	((char *)buf)[bufsize - 1] = '\0';	/* force NUL termination */
+}
+
+/****************************************
+ * sec apis			     *
+ ****************************************/
+
+static struct ptlrpc_sec *gss_sec_create_kr(struct obd_import *imp,
+					    struct ptlrpc_svc_ctx *svcctx,
+					    struct sptlrpc_flavor *sf)
+{
+	struct gss_sec_keyring  *gsec_kr;
+	ENTRY;
+
+	OBD_ALLOC(gsec_kr, sizeof(*gsec_kr));
+	if (gsec_kr == NULL)
+		RETURN(NULL);
+
+	INIT_HLIST_HEAD(&gsec_kr->gsk_clist);
+	gsec_kr->gsk_root_ctx = NULL;
+	mutex_init(&gsec_kr->gsk_root_uc_lock);
+#ifdef HAVE_KEYRING_UPCALL_SERIALIZED
+	mutex_init(&gsec_kr->gsk_uc_lock);
+#endif
+
+	if (gss_sec_create_common(&gsec_kr->gsk_base, &gss_policy_keyring,
+				  imp, svcctx, sf))
+		goto err_free;
+
+	/* a service ctx was handed in: install its reverse ctx right away */
+	if (svcctx != NULL &&
+	    sec_install_rctx_kr(&gsec_kr->gsk_base.gs_base, svcctx))
+		goto err_destroy;
+
+	RETURN(&gsec_kr->gsk_base.gs_base);
+
+err_destroy:
+	gss_sec_destroy_common(&gsec_kr->gsk_base);
+err_free:
+	OBD_FREE(gsec_kr, sizeof(*gsec_kr));
+	RETURN(NULL);
+}
+
+static void gss_sec_destroy_kr(struct ptlrpc_sec *sec)
+{
+	struct gss_sec_keyring  *gsec_kr = sec2gsec_keyring(sec);
+
+	CDEBUG(D_SEC, "destroy %s@%p\n", sec->ps_policy->sp_name, sec);
+
+	/* the ctx list must be empty and no root ctx left by now */
+	LASSERT(hlist_empty(&gsec_kr->gsk_clist));
+	LASSERT(gsec_kr->gsk_root_ctx == NULL);
+
+	gss_sec_destroy_common(sec2gsec(sec));
+
+	/* common part is torn down, now free the keyring sec itself */
+	OBD_FREE(gsec_kr, sizeof(*gsec_kr));
+}
+
+/* a user counts as root when the sec is ROOTONLY or the real uid is 0;
+ * euid/fsuid being 0 are handled as setuid scenarios instead.
+ */
+static inline int user_is_root(struct ptlrpc_sec *sec, struct vfs_cred *vcred)
+{
+	if (!sec_is_rootonly(sec) && vcred->vc_uid != 0)
+		return 0;
+	return 1;
+}
+
+/*
+ * unlink request key from its ring, which is linked during request_key().
+ * sadly, we have to 'guess' which keyring it's linked to.
+ *
+ * FIXME this code is fragile, depend on how request_key_link() is implemented.
+ */
+static void request_key_unlink(struct key *key)
+{
+	struct task_struct *tsk = current;
+	struct key *ring;
+
+	switch (key_cred(tsk)->jit_keyring) {
+	case KEY_REQKEY_DEFL_DEFAULT:
+	case KEY_REQKEY_DEFL_THREAD_KEYRING:
+		ring = key_get(key_cred(tsk)->thread_keyring);
+		if (ring)
+			break;	/* no ring: fall through to the next one */
+	case KEY_REQKEY_DEFL_PROCESS_KEYRING:
+		ring = key_get(key_tgcred(tsk)->process_keyring);
+		if (ring)
+			break;	/* no ring: fall through to the next one */
+	case KEY_REQKEY_DEFL_SESSION_KEYRING:
+		rcu_read_lock();
+		ring = key_get(rcu_dereference(key_tgcred(tsk)
+					       ->session_keyring));
+		rcu_read_unlock();
+		if (ring)
+			break;	/* no ring: fall through to the next one */
+	case KEY_REQKEY_DEFL_USER_SESSION_KEYRING:
+		ring = key_get(key_cred(tsk)->user->session_keyring);
+		break;
+	case KEY_REQKEY_DEFL_USER_KEYRING:
+		ring = key_get(key_cred(tsk)->user->uid_keyring);
+		break;
+	case KEY_REQKEY_DEFL_GROUP_KEYRING:
+	default:
+		LBUG();
+	}
+
+	LASSERT(ring);
+	key_unlink(ring, key);
+	key_put(ring);		/* drop the ref taken by key_get() above */
+}
+
+static
+struct ptlrpc_cli_ctx * gss_sec_lookup_ctx_kr(struct ptlrpc_sec *sec,
+					      struct vfs_cred *vcred,
+					      int create, int remove_dead)
+{
+	struct obd_import       *imp = sec->ps_import;
+	struct gss_sec_keyring  *gsec_kr = sec2gsec_keyring(sec);
+	struct ptlrpc_cli_ctx   *ctx = NULL;
+	unsigned int	     is_root = 0, create_new = 0;
+	struct key	      *key;
+	char		     desc[24];
+	char		    *coinfo;
+	int		      coinfo_size;
+	char		    *co_flags = "";
+	ENTRY;
+
+	LASSERT(imp != NULL);
+
+	is_root = user_is_root(sec, vcred);
+
+	/* a little bit optimization for root context */
+	if (is_root) {
+		ctx = sec_lookup_root_ctx_kr(sec);
+		/*
+		 * Only lookup directly for REVERSE sec, which should
+		 * always succeed.
+		 */
+		if (ctx || sec_is_reverse(sec))
+			RETURN(ctx);
+	}
+
+	LASSERT(create != 0);	/* past the shortcut above we always create */
+
+	/* for root context, obtain lock and check again, this time hold
+	 * the root upcall lock, make sure nobody else populated new root
+	 * context after last check. */
+	if (is_root) {
+		mutex_lock(&gsec_kr->gsk_root_uc_lock);
+
+		ctx = sec_lookup_root_ctx_kr(sec);
+		if (ctx)
+			goto out;
+
+		/* update reverse handle for root user */
+		sec2gsec(sec)->gs_rvs_hdl = gss_get_next_ctx_index();
+
+		switch (sec->ps_part) {
+		case LUSTRE_SP_MDT:
+			co_flags = "m";
+			break;
+		case LUSTRE_SP_OST:
+			co_flags = "o";
+			break;
+		case LUSTRE_SP_MGC:
+			co_flags = "rmo";
+			break;
+		case LUSTRE_SP_CLI:
+			co_flags = "r";
+			break;
+		case LUSTRE_SP_MGS:
+		default:
+			LBUG();
+		}
+	}
+
+	/* in case of setuid, key will be constructed as owner of fsuid/fsgid,
+	 * but we do authentication based on real uid/gid. the key permission
+	 * bits will be exactly as POS_ALL, so only processes who subscribed
+	 * this key could have the access, although the quota might be counted
+	 * on others (fsuid/fsgid).
+	 *
+	 * keyring will use fsuid/fsgid as upcall parameters, so we have to
+	 * encode real uid/gid into callout info.
+	 */
+
+	construct_key_desc(desc, sizeof(desc), sec, vcred->vc_uid);
+
+	/* callout info format:
+	 * secid:mech:uid:gid:flags:svc_type:peer_nid:target_uuid
+	 */
+	coinfo_size = sizeof(struct obd_uuid) + MAX_OBD_NAME + 64;
+	OBD_ALLOC(coinfo, coinfo_size);
+	if (coinfo == NULL)
+		goto out;	/* returns NULL ctx */
+
+	snprintf(coinfo, coinfo_size, "%d:%s:%u:%u:%s:%d:"LPX64":%s",
+		 sec->ps_id, sec2gsec(sec)->gs_mech->gm_name,
+		 vcred->vc_uid, vcred->vc_gid,
+		 co_flags, import_to_gss_svc(imp),
+		 imp->imp_connection->c_peer.nid, imp->imp_obd->obd_name);
+
+	CDEBUG(D_SEC, "requesting key for %s\n", desc);
+
+	keyring_upcall_lock(gsec_kr);
+	key = request_key(&gss_key_type, desc, coinfo);
+	keyring_upcall_unlock(gsec_kr);
+
+	OBD_FREE(coinfo, coinfo_size);
+
+	if (IS_ERR(key)) {
+		CERROR("failed request key: %ld\n", PTR_ERR(key));
+		goto out;	/* returns NULL ctx */
+	}
+	CDEBUG(D_SEC, "obtained key %08x for %s\n", key->serial, desc);
+
+	/* once payload.data was pointed to a ctx, it never changes until
+	 * we de-associate them; but parallel request_key() may return
+	 * a key with payload.data == NULL at the same time. so we still
+	 * need the write lock of key->sem to serialize them. */
+	down_write(&key->sem);
+
+	if (likely(key->payload.data != NULL)) {
+		ctx = key->payload.data;
+
+		LASSERT(atomic_read(&ctx->cc_refcount) >= 1);
+		LASSERT(ctx2gctx_keyring(ctx)->gck_key == key);
+		LASSERT(atomic_read(&key->usage) >= 2);
+
+		/* simply take a ref and return. it's upper layer's
+		 * responsibility to detect & replace dead ctx. */
+		atomic_inc(&ctx->cc_refcount);
+	} else {
+		/* pre initialization with a cli_ctx. this can't be done in
+		 * key_instantiate() because we don't have enough information
+		 * there. */
+		ctx = ctx_create_kr(sec, vcred);
+		if (ctx != NULL) {
+			ctx_enlist_kr(ctx, is_root, 0);
+			bind_key_ctx(key, ctx);
+
+			ctx_start_timer_kr(ctx, KEYRING_UPCALL_TIMEOUT);
+
+			CDEBUG(D_SEC, "installed key %p <-> ctx %p (sec %p)\n",
+			       key, ctx, sec);
+		} else {
+			/* we'd prefer to call key_revoke(), but we more like
+			 * to revoke it within this key->sem locked period. */
+			key_revoke_locked(key);
+		}
+
+		create_new = 1;	/* set even when ctx_create_kr() failed */
+	}
+
+	up_write(&key->sem);
+
+	if (is_root && create_new)
+		request_key_unlink(key);
+
+	key_put(key);
+out:
+	if (is_root)
+		mutex_unlock(&gsec_kr->gsk_root_uc_lock);
+	RETURN(ctx);
+}
+
+/* asserts that @ctx has no references left, then releases it (see @sync) */
+static void gss_sec_release_ctx_kr(struct ptlrpc_sec *sec,
+				   struct ptlrpc_cli_ctx *ctx, int sync)
+{
+	LASSERT(atomic_read(&sec->ps_refcount) > 0);
+	LASSERT(atomic_read(&ctx->cc_refcount) == 0);
+
+	ctx_release_kr(ctx, sync);
+}
+
+/*
+ * flush context of normal user, we must resort to keyring itself to find out
+ * contexts which belong to me. @grace and @force are not used here.
+ * NOTE(review): @uid is still encoded into the requested key description.
+ * Note here we suppose only to flush _my_ context, the "uid" will
+ * be ignored in the search.
+ */
+static
+void flush_user_ctx_cache_kr(struct ptlrpc_sec *sec,
+			     uid_t uid,
+			     int grace, int force)
+{
+	struct key	      *key;
+	char		     desc[24];
+
+	/* nothing to do for reverse or rootonly sec */
+	if (sec_is_reverse(sec) || sec_is_rootonly(sec))
+		return;
+
+	construct_key_desc(desc, sizeof(desc), sec, uid);
+
+	/* there should be only one valid key, but we put it in the
+	 * loop in case of any weird cases */
+	for (;;) {
+		key = request_key(&gss_key_type, desc, NULL);
+		if (IS_ERR(key)) {
+			CDEBUG(D_SEC, "No more key found for current user\n");
+			break;	/* the only way out of the loop */
+		}
+
+		down_write(&key->sem);
+
+		kill_key_locked(key);
+
+		/* kill_key_locked() should usually revoke the key, but we
+		 * revoke it again to make sure, e.g. some case the key may
+		 * not well coupled with a context. */
+		key_revoke_locked(key);
+
+		up_write(&key->sem);
+
+		key_put(key);
+	}
+}
+
+/*
+ * flush context of root or all (@uid == -1), we iterate through the list.
+ */
+static
+void flush_spec_ctx_cache_kr(struct ptlrpc_sec *sec,
+			     uid_t uid,
+			     int grace, int force)
+{
+	struct gss_sec_keyring *gsec_kr;
+	struct hlist_head	freelist = HLIST_HEAD_INIT;
+	struct hlist_node      *next;
+	struct ptlrpc_cli_ctx  *ctx;
+	ENTRY;
+
+	gsec_kr = sec2gsec_keyring(sec);
+
+	spin_lock(&sec->ps_lock);
+	hlist_for_each_entry_safe(ctx, next,
+				      &gsec_kr->gsk_clist, cc_cache) {
+		LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+		if (uid != -1 && uid != ctx->cc_vcred.vc_uid)
+			continue;	/* not the uid being flushed */
+
+		/* at this moment there's at least 2 base reference:
+		 * key association and in-list. */
+		if (atomic_read(&ctx->cc_refcount) > 2) {
+			if (!force)
+				continue;	/* busy ctx kept unless forced */
+			CWARN("flush busy ctx %p(%u->%s, extra ref %d)\n",
+			      ctx, ctx->cc_vcred.vc_uid,
+			      sec2target_str(ctx->cc_sec),
+			      atomic_read(&ctx->cc_refcount) - 2);
+		}
+
+		set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags);
+		if (!grace)
+			clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags);
+
+		atomic_inc(&ctx->cc_refcount);	/* ref for the freelist */
+
+		if (ctx_unlist_kr(ctx, 1)) {
+			hlist_add_head(&ctx->cc_cache, &freelist);
+		} else {
+			LASSERT(atomic_read(&ctx->cc_refcount) >= 2);
+			atomic_dec(&ctx->cc_refcount);	/* undo the ref above */
+		}
+	}
+	spin_unlock(&sec->ps_lock);
+
+	dispose_ctx_list_kr(&freelist);	/* done outside the spinlock */
+	EXIT;
+}
+
+/* flush entry point: uid -1 (all) and uid 0 walk the ctx list, any other
+ * uid goes through the keyring search. always returns 0. */
+static int gss_sec_flush_ctx_cache_kr(struct ptlrpc_sec *sec,
+				      uid_t uid, int grace, int force)
+{
+	ENTRY;
+
+	CDEBUG(D_SEC, "sec %p(%d, nctx %d), uid %d, grace %d, force %d\n",
+	       sec, atomic_read(&sec->ps_refcount), atomic_read(&sec->ps_nctx),
+	       uid, grace, force);
+
+	if (uid == -1 || uid == 0)
+		flush_spec_ctx_cache_kr(sec, uid, grace, force);
+	else
+		flush_user_ctx_cache_kr(sec, uid, grace, force);
+
+	RETURN(0);
+}
+
+static
+void gss_sec_gc_ctx_kr(struct ptlrpc_sec *sec)
+{
+	struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec);
+	struct hlist_head	freelist = HLIST_HEAD_INIT;
+	struct hlist_node      *next;
+	struct ptlrpc_cli_ctx  *ctx;
+	ENTRY;
+
+	CWARN("running gc\n");
+
+	spin_lock(&sec->ps_lock);
+	hlist_for_each_entry_safe(ctx, next,
+				      &gsec_kr->gsk_clist, cc_cache) {
+		LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+		atomic_inc(&ctx->cc_refcount);	/* ref for the freelist */
+
+		if (cli_ctx_check_death(ctx) && ctx_unlist_kr(ctx, 1)) {
+			hlist_add_head(&ctx->cc_cache, &freelist);
+			CWARN("unhashed ctx %p\n", ctx);
+		} else {
+			LASSERT(atomic_read(&ctx->cc_refcount) >= 2);
+			atomic_dec(&ctx->cc_refcount);	/* undo the ref above */
+		}
+	}
+	spin_unlock(&sec->ps_lock);
+
+	dispose_ctx_list_kr(&freelist);	/* done outside the spinlock */
+	EXIT;
+	return;
+}
+
+static
+int gss_sec_display_kr(struct ptlrpc_sec *sec, struct seq_file *seq)
+{
+	struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec);
+	struct hlist_node      *next;
+	struct ptlrpc_cli_ctx  *ctx;
+	struct gss_cli_ctx     *gctx;
+	time_t		  now = cfs_time_current_sec();
+	ENTRY;
+
+	spin_lock(&sec->ps_lock);	/* held across the whole listing */
+	hlist_for_each_entry_safe(ctx, next,
+				  &gsec_kr->gsk_clist, cc_cache) {
+		struct key	     *key;
+		char		    flags_str[40];
+		char		    mech[40];
+
+		gctx = ctx2gctx(ctx);
+		key = ctx2gctx_keyring(ctx)->gck_key;	/* NULL when unbound */
+
+		gss_cli_ctx_flags2str(ctx->cc_flags,
+				      flags_str, sizeof(flags_str));
+
+		if (gctx->gc_mechctx)
+			lgss_display(gctx->gc_mechctx, mech, sizeof(mech));
+		else
+			snprintf(mech, sizeof(mech), "N/A");
+		mech[sizeof(mech) - 1] = '\0';
+
+		seq_printf(seq, "%p: uid %u, ref %d, expire %ld(%+ld), fl %s, "
+			   "seq %d, win %u, key %08x(ref %d), "
+			   "hdl "LPX64":"LPX64", mech: %s\n",
+			   ctx, ctx->cc_vcred.vc_uid,
+			   atomic_read(&ctx->cc_refcount),
+			   ctx->cc_expire,
+			   ctx->cc_expire ?  ctx->cc_expire - now : 0,
+			   flags_str,
+			   atomic_read(&gctx->gc_seq),
+			   gctx->gc_win,
+			   key ? key->serial : 0,
+			   key ? atomic_read(&key->usage) : 0,
+			   gss_handle_to_u64(&gctx->gc_handle),
+			   gss_handle_to_u64(&gctx->gc_svc_handle),
+			   mech);
+	}
+	spin_unlock(&sec->ps_lock);
+
+	RETURN(0);	/* always reports success */
+}
+
+/****************************************
+ * cli_ctx apis			 *
+ ****************************************/
+
+static
+int gss_cli_ctx_refresh_kr(struct ptlrpc_cli_ctx *ctx)
+{
+	/* nothing to do here: upcall is already on the way */
+	return 0;
+}
+
+/* returns 0 when @ctx is ready for use, 1 otherwise; a ctx found dead is
+ * also unlisted and unbound from its key on the way out. */
+static int gss_cli_ctx_validate_kr(struct ptlrpc_cli_ctx *ctx)
+{
+	LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+	LASSERT(ctx->cc_sec);
+
+	if (cli_ctx_check_death(ctx)) {
+		kill_ctx_kr(ctx);
+		return 1;
+	}
+
+	/* not dead: valid only if it is ready */
+	return cli_ctx_is_ready(ctx) ? 0 : 1;
+}
+
+/* expire @ctx, then unlist it and unbind its key; @grace is not used */
+static void gss_cli_ctx_die_kr(struct ptlrpc_cli_ctx *ctx, int grace)
+{
+	LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+	LASSERT(ctx->cc_sec);
+
+	cli_ctx_expire(ctx);
+	kill_ctx_kr(ctx);
+}
+
+/****************************************
+ * (reverse) service		    *
+ ****************************************/
+
+/*
+ * reverse context could have nothing to do with keyrings. here we still keep
+ * the version which bind to a key, for future reference.
+ */
+#define HAVE_REVERSE_CTX_NOKEY
+
+
+/*
+ * build a reverse client ctx from @svc_ctx (with a zeroed vfs_cred) and
+ * install it on @sec. returns 0 on success, -ENOMEM if the ctx can't be
+ * created, or the error from gss_copy_rvc_cli_ctx().
+ */
+static int sec_install_rctx_kr(struct ptlrpc_sec *sec,
+			       struct ptlrpc_svc_ctx *svc_ctx)
+{
+	struct vfs_cred	       vcred = { 0, 0 };
+	struct ptlrpc_cli_ctx *cli_ctx;
+	int		       rc;
+
+	LASSERT(sec);
+	LASSERT(svc_ctx);
+
+	cli_ctx = ctx_create_kr(sec, &vcred);
+	if (cli_ctx == NULL)
+		return -ENOMEM;
+
+	rc = gss_copy_rvc_cli_ctx(cli_ctx, svc_ctx);
+	if (rc == 0)
+		rvs_sec_install_root_ctx_kr(sec, cli_ctx, NULL);
+	else
+		CERROR("failed copy reverse cli ctx: %d\n", rc);
+
+	/* drop the ref ctx_create_kr() took for us, on either path */
+	ctx_put_kr(cli_ctx, 1);
+	return rc;
+}
+
+
+/****************************************
+ * service apis			 *
+ ****************************************/
+
+static
+int gss_svc_accept_kr(struct ptlrpc_request *req)
+{
+	return gss_svc_accept(&gss_policy_keyring, req); /* keyring policy */
+}
+
+/* install a reverse ctx built from @svc_ctx on the sec referenced by @imp */
+static int gss_svc_install_rctx_kr(struct obd_import *imp,
+				   struct ptlrpc_svc_ctx *svc_ctx)
+{
+	struct ptlrpc_sec *sec = sptlrpc_import_sec_ref(imp);
+	int		   rc;
+
+	LASSERT(sec);
+
+	/* the sec ref is held only for the duration of the install */
+	rc = sec_install_rctx_kr(sec, svc_ctx);
+	sptlrpc_sec_put(sec);
+
+	return rc;
+}
+
+/****************************************
+ * key apis			     *
+ ****************************************/
+
+/* instantiate a gss key: payload must be empty; link key to session keyring */
+static int gss_kt_instantiate(struct key *key, const void *data, size_t datalen)
+{
+	int	     rc;
+	ENTRY;
+
+	if (data != NULL || datalen != 0) {
+		CERROR("invalid: data %p, len %zu\n", data, datalen);
+		RETURN(-EINVAL);
+	}
+
+	if (key->payload.data != NULL) {
+		CERROR("key already have payload\n");
+		RETURN(-EINVAL);
+	}
+
+	/* link the key to session keyring, so following context negotiation
+	 * rpc fired from user space could find this key. This will be unlinked
+	 * automatically when upcall processes die.
+	 *
+	 * we can't do this through keyctl from userspace, because the upcall
+	 * might be neither possessor nor owner of the key (setuid).
+	 *
+	 * the session keyring is created upon upcall, and don't change all
+	 * the way until upcall finished, so rcu lock is not needed here.
+	 */
+	LASSERT(key_tgcred(current)->session_keyring);
+
+	lockdep_off();
+	rc = key_link(key_tgcred(current)->session_keyring, key);
+	lockdep_on();
+	if (unlikely(rc)) {
+		CERROR("failed to link key %08x to keyring %08x: %d\n",
+		       key->serial,
+		       key_tgcred(current)->session_keyring->serial, rc);
+		RETURN(rc);
+	}
+
+	CDEBUG(D_SEC, "key %p instantiated, ctx %p\n", key, key->payload.data);
+	RETURN(0);
+}
+
+/*
+ * called with key semaphore write locked. it means we can operate
+ * on the context without fear of losing refcount.
+ */
+static
+int gss_kt_update(struct key *key, const void *data, size_t datalen)
+{
+	struct ptlrpc_cli_ctx   *ctx = key->payload.data;
+	struct gss_cli_ctx      *gctx;
+	rawobj_t		 tmpobj = RAWOBJ_EMPTY;
+	__u32		    datalen32 = (__u32) datalen; /* NOTE(review): narrows size_t - confirm datalen is bounded */
+	int		      rc;
+	ENTRY;
+
+	if (data == NULL || datalen == 0) {
+		CWARN("invalid: data %p, len %lu\n", data, (long)datalen);
+		RETURN(-EINVAL);
+	}
+
+	/* if upcall finished negotiation too fast (most likely because
+	 * a local error happened) and called kt_update(), the ctx
+	 * might still be NULL. but the key will finally be associated
+	 * with a context, or be revoked. if key status is fine, return
+	 * -EAGAIN to allow userspace to sleep a while and call again. */
+	if (ctx == NULL) {
+		CDEBUG(D_SEC, "update too soon: key %p(%x) flags %lx\n",
+		      key, key->serial, key->flags);
+
+		rc = key_validate(key);
+		if (rc == 0)
+			RETURN(-EAGAIN);
+		else
+			RETURN(rc);
+	}
+
+	LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+	LASSERT(ctx->cc_sec);
+
+	ctx_clear_timer_kr(ctx);
+
+	/* don't proceed if already refreshed */
+	if (cli_ctx_is_refreshed(ctx)) {
+		CWARN("ctx already done refresh\n");
+		RETURN(0);
+	}
+
+	sptlrpc_cli_ctx_get(ctx);	/* dropped again just before the final RETURN */
+	gctx = ctx2gctx(ctx);
+
+	rc = buffer_extract_bytes(&data, &datalen32, &gctx->gc_win,
+				  sizeof(gctx->gc_win));
+	if (rc) {
+		CERROR("failed extract seq_win\n");
+		goto out;
+	}
+
+	if (gctx->gc_win == 0) {	/* zero window: two error codes follow in the data */
+		__u32   nego_rpc_err, nego_gss_err;
+
+		rc = buffer_extract_bytes(&data, &datalen32, &nego_rpc_err,
+					  sizeof(nego_rpc_err));
+		if (rc) {
+			CERROR("failed to extrace rpc rc\n");
+			goto out;
+		}
+
+		rc = buffer_extract_bytes(&data, &datalen32, &nego_gss_err,
+					  sizeof(nego_gss_err));
+		if (rc) {
+			CERROR("failed to extrace gss rc\n");
+			goto out;
+		}
+
+		CERROR("negotiation: rpc err %d, gss err %x\n",
+		       nego_rpc_err, nego_gss_err);
+
+		rc = nego_rpc_err ? nego_rpc_err : -EACCES;
+	} else {
+		rc = rawobj_extract_local_alloc(&gctx->gc_handle,
+						(__u32 **) &data, &datalen32);
+		if (rc) {
+			CERROR("failed extract handle\n");
+			goto out;
+		}
+
+		rc = rawobj_extract_local(&tmpobj, (__u32 **) &data,&datalen32);
+		if (rc) {
+			CERROR("failed extract mech\n");
+			goto out;
+		}
+
+		rc = lgss_import_sec_context(&tmpobj,
+					     sec2gsec(ctx->cc_sec)->gs_mech,
+					     &gctx->gc_mechctx);
+		if (rc != GSS_S_COMPLETE)
+			CERROR("failed import context\n");
+		else
+			rc = 0;
+	}
+out:
+	/* we don't care about the current status of this ctx, even if someone
+	 * else is operating on the ctx at the same time. we just add up our
+	 * own opinions here. */
+	if (rc == 0) {
+		gss_cli_ctx_uptodate(gctx);
+	} else {
+		/* this will also revoke the key. has to be done before
+		 * waking up waiters, otherwise they can find the stale key */
+		kill_key_locked(key);
+
+		cli_ctx_expire(ctx);
+
+		if (rc != -ERESTART)
+			set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags);
+	}
+
+	/* let user space think it's a success */
+	sptlrpc_cli_ctx_put(ctx, 1);
+	RETURN(0);
+}
+
+/* Match hook: non-zero iff @desc is string-equal to the key's description. */
+static int gss_kt_match(const struct key *key, const void *desc)
+{
+	return !strcmp(key->description, (const char *)desc);
+}
+
+/* Destroy hook: asserts that no payload is attached any more, then only logs. */
+static void gss_kt_destroy(struct key *key)
+{
+	ENTRY;
+	LASSERT(key->payload.data == NULL);
+	CDEBUG(D_SEC, "destroy key %p\n", key);
+	EXIT;
+}
+
+/* Describe hook: emit the key description into @s, or "[null]" when the
+ * key has none. */
+static void gss_kt_describe(const struct key *key, struct seq_file *s)
+{
+	const char *text = key->description ? key->description : "[null]";
+
+	seq_puts(s, text);
+}
+
+/* The "lgssc" key type, wired to the gss_kt_*() hooks in this file. */
+static struct key_type gss_key_type = {
+	.name		= "lgssc",
+	.def_datalen	= 0,
+	.instantiate	= gss_kt_instantiate,
+	.update		= gss_kt_update,
+	.match		= gss_kt_match,
+	.destroy	= gss_kt_destroy,
+	.describe	= gss_kt_describe,
+};
+
+/****************************************
+ * lustre gss keyring policy	    *
+ ****************************************/
+
+static struct ptlrpc_ctx_ops gss_keyring_ctxops = {
+	.match		= gss_cli_ctx_match,
+	.refresh	= gss_cli_ctx_refresh_kr,
+	.validate	= gss_cli_ctx_validate_kr,
+	.die		= gss_cli_ctx_die_kr,
+	.sign		= gss_cli_ctx_sign,
+	.verify		= gss_cli_ctx_verify,
+	.seal		= gss_cli_ctx_seal,
+	.unseal		= gss_cli_ctx_unseal,
+	.wrap_bulk	= gss_cli_ctx_wrap_bulk,
+	.unwrap_bulk	= gss_cli_ctx_unwrap_bulk,
+};
+
+static struct ptlrpc_sec_cops gss_sec_keyring_cops = {	/* gss_policy_keyring.sp_cops */
+	.create_sec		= gss_sec_create_kr,
+	.destroy_sec		= gss_sec_destroy_kr,
+	.kill_sec		= gss_sec_kill,
+	.lookup_ctx		= gss_sec_lookup_ctx_kr,
+	.release_ctx		= gss_sec_release_ctx_kr,
+	.flush_ctx_cache	= gss_sec_flush_ctx_cache_kr,
+	.gc_ctx			= gss_sec_gc_ctx_kr,
+	.install_rctx		= gss_sec_install_rctx,
+	.alloc_reqbuf		= gss_alloc_reqbuf,
+	.free_reqbuf		= gss_free_reqbuf,
+	.alloc_repbuf		= gss_alloc_repbuf,
+	.free_repbuf		= gss_free_repbuf,
+	.enlarge_reqbuf		= gss_enlarge_reqbuf,
+	.display		= gss_sec_display_kr,
+};
+
+static struct ptlrpc_sec_sops gss_sec_keyring_sops = {	/* gss_policy_keyring.sp_sops */
+	.accept		= gss_svc_accept_kr,
+	.invalidate_ctx	= gss_svc_invalidate_ctx,
+	.alloc_rs	= gss_svc_alloc_rs,
+	.authorize	= gss_svc_authorize,
+	.free_rs	= gss_svc_free_rs,
+	.free_ctx	= gss_svc_free_ctx,
+	.prep_bulk	= gss_svc_prep_bulk,
+	.unwrap_bulk	= gss_svc_unwrap_bulk,
+	.wrap_bulk	= gss_svc_wrap_bulk,
+	.install_rctx	= gss_svc_install_rctx_kr,
+};
+
+static struct ptlrpc_sec_policy gss_policy_keyring = {	/* "gss.keyring" */
+	.sp_owner	= THIS_MODULE,
+	.sp_name	= "gss.keyring",
+	.sp_policy	= SPTLRPC_POLICY_GSS,
+	.sp_cops	= &gss_sec_keyring_cops,
+	.sp_sops	= &gss_sec_keyring_sops,
+};
+
+
+/* Register the lgssc key type first and the gss.keyring policy second.
+ * If the policy cannot be registered the key type is unregistered again.
+ * Returns 0 or the error of whichever registration failed. */
+int __init gss_init_keyring(void)
+{
+	int rc = register_key_type(&gss_key_type);
+
+	if (rc) {
+		CERROR("failed to register keyring type: %d\n", rc);
+		return rc;
+	}
+
+	rc = sptlrpc_register_policy(&gss_policy_keyring);
+	if (rc)
+		unregister_key_type(&gss_key_type);
+
+	return rc;
+}
+
+void __exit gss_exit_keyring(void)	/* undoes both registrations of gss_init_keyring() */
+{
+	unregister_key_type(&gss_key_type);	/* NOTE(review): same order as init, not reversed - confirm intended */
+	sptlrpc_unregister_policy(&gss_policy_keyring);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5.h b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5.h
new file mode 100644
index 000000000000..676d4b96311a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5.h
@@ -0,0 +1,163 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  linux/include/linux/sunrpc/gss_krb5_types.h
+ *
+ *  Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h,
+ *  lib/gssapi/krb5/gssapiP_krb5.h, and others
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ *  Bruce Fields   <bfields@umich.edu>
+ */
+
+/*
+ * Copyright 1995 by the Massachusetts Institute of Technology.
+ * All Rights Reserved.
+ *
+ * Export of this software from the United States of America may
+ *   require a specific license from the United States Government.
+ *   It is the responsibility of any person or organization contemplating
+ *   export to obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of M.I.T. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission.  Furthermore if you modify this software you must label
+ * your software as modified software and not distribute it in such a
+ * fashion that it might be confused with the original M.I.T. software.
+ * M.I.T. makes no representations about the suitability of
+ * this software for any purpose.  It is provided "as is" without express
+ * or implied warranty.
+ *
+ */
+
+#ifndef PTLRPC_GSS_KRB5_H
+#define PTLRPC_GSS_KRB5_H
+
+/*
+ * RFC 4121
+ */
+
+#define KG_USAGE_ACCEPTOR_SEAL		22
+#define KG_USAGE_ACCEPTOR_SIGN		23
+#define KG_USAGE_INITIATOR_SEAL		24
+#define KG_USAGE_INITIATOR_SIGN		25
+
+#define KG_TOK_MIC_MSG			0x0404
+#define KG_TOK_WRAP_MSG			0x0504
+
+#define FLAG_SENDER_IS_ACCEPTOR		0x01
+#define FLAG_WRAP_CONFIDENTIAL		0x02
+#define FLAG_ACCEPTOR_SUBKEY		0x04
+
+struct krb5_header {
+	__u16	kh_tok_id;	/* token id (KG_TOK_*), big endian */
+	__u8	kh_flags;	/* FLAG_* bits */
+	__u8	kh_filler;	/* 0xff */
+	__u16	kh_ec;		/* extra count */
+	__u16	kh_rrc;		/* right rotation count */
+	__u64	kh_seq;		/* sequence number */
+	__u8	kh_cksum[];	/* checksum; flexible array, not in sizeof */
+};
+
+struct krb5_keyblock {
+	rawobj_t		 kb_key;	/* raw key bytes */
+	struct ll_crypto_cipher	*kb_tfm;	/* cipher set up by keyblock_init() */
+};
+
+struct krb5_ctx {
+	unsigned int		kc_initiate:1,
+				kc_cfx:1,
+				kc_seed_init:1,
+				kc_have_acceptor_subkey:1;
+	__s32			kc_endtime;
+	__u8			kc_seed[16];
+	__u64			kc_seq_send;	/* next sequence number to send */
+	__u64			kc_seq_recv;
+	__u32			kc_enctype;	/* index into enctypes[] */
+	struct krb5_keyblock	kc_keye;	/* encryption */
+	struct krb5_keyblock	kc_keyi;	/* integrity */
+	struct krb5_keyblock	kc_keyc;	/* checksum */
+	rawobj_t		kc_mech_used;
+};
+
+enum sgn_alg {
+	SGN_ALG_DES_MAC_MD5		= 0x0000,
+	SGN_ALG_MD2_5			= 0x0001,
+	SGN_ALG_DES_MAC			= 0x0002,
+	SGN_ALG_3			= 0x0003, /* not published */
+	SGN_ALG_HMAC_MD5		= 0x0011, /* microsoft w2k; no support */
+	SGN_ALG_HMAC_SHA1_DES3_KD	= 0x0004
+};
+
+enum seal_alg {
+	SEAL_ALG_NONE			= 0xffff,
+	SEAL_ALG_DES			= 0x0000,
+	SEAL_ALG_1			= 0x0001, /* not published */
+	SEAL_ALG_MICROSOFT_RC4		= 0x0010, /* microsoft w2k; no support */
+	SEAL_ALG_DES3KD			= 0x0002
+};
+
+#define CKSUMTYPE_CRC32			0x0001
+#define CKSUMTYPE_RSA_MD4		0x0002
+#define CKSUMTYPE_RSA_MD4_DES		0x0003
+#define CKSUMTYPE_DESCBC		0x0004
+/* des-mac-k */
+/* rsa-md4-des-k */
+#define CKSUMTYPE_RSA_MD5		0x0007
+#define CKSUMTYPE_RSA_MD5_DES		0x0008
+#define CKSUMTYPE_NIST_SHA		0x0009
+#define CKSUMTYPE_HMAC_SHA1_DES3	0x000c
+#define CKSUMTYPE_HMAC_SHA1_96_AES128	0x000f
+#define CKSUMTYPE_HMAC_SHA1_96_AES256	0x0010
+#define CKSUMTYPE_HMAC_MD5_ARCFOUR	(-138)	/* parenthesised: negative value */
+
+/* from gssapi_err_krb5.h */
+#define KG_CCACHE_NOMATCH		(39756032L)
+#define KG_KEYTAB_NOMATCH		(39756033L)
+#define KG_TGT_MISSING			(39756034L)
+#define KG_NO_SUBKEY			(39756035L)
+#define KG_CONTEXT_ESTABLISHED		(39756036L)
+#define KG_BAD_SIGN_TYPE		(39756037L)
+#define KG_BAD_LENGTH			(39756038L)
+#define KG_CTX_INCOMPLETE		(39756039L)
+#define KG_CONTEXT			(39756040L)
+#define KG_CRED				(39756041L)
+#define KG_ENC_DESC			(39756042L)
+#define KG_BAD_SEQ			(39756043L)
+#define KG_EMPTY_CCACHE			(39756044L)
+#define KG_NO_CTYPES			(39756045L)
+
+/* crypto types from the wire, per the Kerberos v5 protocol spec.
+ * these get mapped to linux kernel crypto routines.
+ */
+#define ENCTYPE_NULL			0x0000
+#define ENCTYPE_DES_CBC_CRC		0x0001	/* DES cbc mode with CRC-32 */
+#define ENCTYPE_DES_CBC_MD4		0x0002	/* DES cbc mode with RSA-MD4 */
+#define ENCTYPE_DES_CBC_MD5		0x0003	/* DES cbc mode with RSA-MD5 */
+#define ENCTYPE_DES_CBC_RAW		0x0004	/* DES cbc mode raw */
+/* XXX deprecated? */
+#define ENCTYPE_DES3_CBC_SHA		0x0005	/* DES-3 cbc mode with NIST-SHA */
+#define ENCTYPE_DES3_CBC_RAW		0x0006	/* DES-3 cbc mode raw */
+#define ENCTYPE_DES_HMAC_SHA1		0x0008
+#define ENCTYPE_DES3_CBC_SHA1		0x0010
+#define ENCTYPE_AES128_CTS_HMAC_SHA1_96	0x0011
+#define ENCTYPE_AES256_CTS_HMAC_SHA1_96	0x0012
+#define ENCTYPE_ARCFOUR_HMAC		0x0017
+#define ENCTYPE_ARCFOUR_HMAC_EXP	0x0018
+#define ENCTYPE_UNKNOWN			0x01ff
+
+#endif /* PTLRPC_GSS_KRB5_H */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5_mech.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5_mech.c
new file mode 100644
index 000000000000..4b28931bbc96
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5_mech.c
@@ -0,0 +1,1786 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  linux/net/sunrpc/gss_krb5_mech.c
+ *  linux/net/sunrpc/gss_krb5_crypto.c
+ *  linux/net/sunrpc/gss_krb5_seal.c
+ *  linux/net/sunrpc/gss_krb5_seqnum.c
+ *  linux/net/sunrpc/gss_krb5_unseal.c
+ *
+ *  Copyright (c) 2001 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson <andros@umich.edu>
+ *  J. Bruce Fields <bfields@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#include <linux/mutex.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+#include "gss_asn1.h"
+#include "gss_krb5.h"
+
+static spinlock_t krb5_seq_lock;	/* held around kc_seq_send++ in fill_krb5_header() */
+
+struct krb5_enctype {
+	char		*ke_dispname;		/* returned by enctype2str() */
+	char		*ke_enc_name;		/* linux tfm name */
+	char		*ke_hash_name;		/* linux tfm name */
+	int		 ke_enc_mode;		/* linux tfm mode */
+	int		 ke_hash_size;		/* checksum size */
+	int		 ke_conf_size;		/* confounder size */
+	unsigned int	 ke_hash_hmac:1;	/* is hmac? */
+};
+
+/*
+ * NOTE: for aes128-cts and aes256-cts, the MIT implementation uses CTS
+ * encryption, but currently we simply use CBC with padding, because linux
+ * doesn't support CTS yet. this needs to be fixed in the future.
+ */
+static struct krb5_enctype enctypes[] = {
+	[ENCTYPE_DES_CBC_RAW] = {		/* des-cbc-md5 */
+		.ke_dispname	= "des-cbc-md5",
+		.ke_enc_name	= "cbc(des)",
+		.ke_hash_name	= "md5",
+		.ke_enc_mode	= 0,
+		.ke_hash_size	= 16,
+		.ke_conf_size	= 8,
+		.ke_hash_hmac	= 0,
+	},
+	[ENCTYPE_DES3_CBC_RAW] = {		/* des3-hmac-sha1 */
+		.ke_dispname	= "des3-hmac-sha1",
+		.ke_enc_name	= "cbc(des3_ede)",
+		.ke_hash_name	= "hmac(sha1)",
+		.ke_enc_mode	= 0,
+		.ke_hash_size	= 20,
+		.ke_conf_size	= 8,
+		.ke_hash_hmac	= 1,
+	},
+	[ENCTYPE_AES128_CTS_HMAC_SHA1_96] = {	/* aes128-cts */
+		.ke_dispname	= "aes128-cts-hmac-sha1-96",
+		.ke_enc_name	= "cbc(aes)",
+		.ke_hash_name	= "hmac(sha1)",
+		.ke_enc_mode	= 0,
+		.ke_hash_size	= 12,
+		.ke_conf_size	= 16,
+		.ke_hash_hmac	= 1,
+	},
+	[ENCTYPE_AES256_CTS_HMAC_SHA1_96] = {	/* aes256-cts */
+		.ke_dispname	= "aes256-cts-hmac-sha1-96",
+		.ke_enc_name	= "cbc(aes)",
+		.ke_hash_name	= "hmac(sha1)",
+		.ke_enc_mode	= 0,
+		.ke_hash_size	= 12,
+		.ke_conf_size	= 16,
+		.ke_hash_hmac	= 1,
+	},
+	[ENCTYPE_ARCFOUR_HMAC] = {		/* arcfour-hmac-md5 */
+		.ke_dispname	= "arcfour-hmac-md5",
+		.ke_enc_name	= "ecb(arc4)",
+		.ke_hash_name	= "hmac(md5)",
+		.ke_enc_mode	= 0,
+		.ke_hash_size	= 16,
+		.ke_conf_size	= 8,
+		.ke_hash_hmac	= 1,
+	},
+};
+
+#define MAX_ENCTYPES    (sizeof(enctypes) / sizeof(enctypes[0]))	/* slots, holes included */
+
+/* Display name for @enctype, or "unknown" when the table has no name for it. */
+static const char *enctype2str(__u32 enctype)
+{
+	if (enctype >= MAX_ENCTYPES || !enctypes[enctype].ke_dispname)
+		return "unknown";
+	return enctypes[enctype].ke_dispname;
+}
+
+/* Allocate kb->kb_tfm for @alg_name and load kb->kb_key into it; 0 or -1. */
+static int keyblock_init(struct krb5_keyblock *kb, char *alg_name, int alg_mode)
+{
+	kb->kb_tfm = ll_crypto_alloc_blkcipher(alg_name, alg_mode, 0);
+	if (IS_ERR(kb->kb_tfm)) {
+		CERROR("failed to alloc tfm: %s, mode %d\n",
+		       alg_name, alg_mode);
+		kb->kb_tfm = NULL;	/* keyblock_free() must not free an ERR_PTR */
+		return -1;
+	}
+
+	if (ll_crypto_blkcipher_setkey(kb->kb_tfm, kb->kb_key.data, kb->kb_key.len)) {
+		CERROR("failed to set %s key, len %d\n",
+		       alg_name, kb->kb_key.len);
+		return -1;
+	}
+	return 0;
+}
+
+/* Set up the cipher tfms that the enctype of @kctx keeps around; 0 or -1. */
+static int krb5_init_keys(struct krb5_ctx *kctx)
+{
+	__u32 etype = kctx->kc_enctype;
+	struct krb5_enctype *ke;
+
+	if (etype >= MAX_ENCTYPES || enctypes[etype].ke_hash_size == 0) {
+		CERROR("unsupported enctype %x\n", kctx->kc_enctype);
+		return -1;
+	}
+	ke = &enctypes[etype];
+
+	/* tfm arc4 is stateful, user should alloc-use-free by his own */
+	if (etype != ENCTYPE_ARCFOUR_HMAC &&
+	    keyblock_init(&kctx->kc_keye, ke->ke_enc_name, ke->ke_enc_mode))
+		return -1;
+
+	/* tfm hmac is stateful, user should alloc-use-free by his own */
+	if (ke->ke_hash_hmac)
+		return 0;
+
+	/* the non-hmac case keeps cipher tfms for keyi and keyc as well */
+	if (keyblock_init(&kctx->kc_keyi, ke->ke_enc_name, ke->ke_enc_mode) ||
+	    keyblock_init(&kctx->kc_keyc, ke->ke_enc_name, ke->ke_enc_mode))
+		return -1;
+
+	return 0;
+}
+
+/* Release the raw key bytes and, when one is set, the cipher tfm of @kb. */
+static void keyblock_free(struct krb5_keyblock *kb)
+{
+	rawobj_free(&kb->kb_key);
+	if (kb->kb_tfm != NULL)
+		ll_crypto_free_blkcipher(kb->kb_tfm);
+}
+
+/* Duplicate only the raw key bytes of @kb into @new; new->kb_tfm is not touched. */
+static int keyblock_dup(struct krb5_keyblock *new, struct krb5_keyblock *kb)
+{
+	return rawobj_dup(&new->kb_key, &kb->kb_key);
+}
+
+/* Copy @len bytes at *@ptr into @res and advance *@ptr; -1 if [*ptr, end) is too short. */
+static int get_bytes(char **ptr, const char *end, void *res, int len)
+{
+	char *p = *ptr;
+
+	/* compare lengths instead of forming p + len, which may overflow */
+	if (len < 0 || p > end || end - p < len)
+		return -1;
+	memcpy(res, p, len);
+	*ptr = p + len;
+	return 0;
+}
+
+/* Read a __u32 length followed by that many bytes into a newly allocated @res. */
+static int get_rawobj(char **ptr, const char *end, rawobj_t *res)
+{
+	char *p = *ptr;
+	__u32 len;
+
+	if (get_bytes(&p, end, &len, sizeof(len)))
+		return -1;
+
+	/* p <= end holds after get_bytes(); compare sizes, never form p + len */
+	if (len > (size_t)(end - p))
+		return -1;
+
+	OBD_ALLOC_LARGE(res->data, len);
+	if (!res->data)
+		return -1;
+
+	res->len = len;
+	memcpy(res->data, p, len);
+	/* the caller's cursor only moves once everything succeeded */
+	*ptr = p + len;
+	return 0;
+}
+
+/* Read @keysize raw key bytes at *@ptr into a newly allocated kb->kb_key. */
+static int get_keyblock(char **ptr, const char *end,
+			struct krb5_keyblock *kb, __u32 keysize)
+{
+	char *buf;
+
+	/* @keysize is parsed from the buffer: bound it before allocating */
+	if (*ptr > end || keysize > (size_t)(end - *ptr))
+		return -1;
+	OBD_ALLOC_LARGE(buf, keysize);
+	if (buf == NULL)
+		return -1;
+
+	memcpy(buf, *ptr, keysize);
+	*ptr += keysize;
+	kb->kb_key.len = keysize;
+	kb->kb_key.data = buf;
+	return 0;
+}
+
+/* Free what @kctx owns (mech oid and the three keyblocks), but not @kctx itself. */
+static void delete_context_kerberos(struct krb5_ctx *kctx)
+{
+	rawobj_free(&kctx->kc_mech_used);
+
+	keyblock_free(&kctx->kc_keye);
+	keyblock_free(&kctx->kc_keyi);
+	keyblock_free(&kctx->kc_keyc);
+}
+
+/*
+ * Parse an old style (rfc1964) context from [p, end) into @kctx.  On failure
+ * whatever was already allocated stays in @kctx for the caller to release.
+ */
+static __u32 import_context_rfc1964(struct krb5_ctx *kctx, char *p, char *end)
+{
+	unsigned int    word, klen;
+
+	/* seed_init flag */
+	if (get_bytes(&p, end, &word, sizeof(word)))
+		return GSS_S_FAILURE;
+	kctx->kc_seed_init = (word != 0);
+
+	/* seed */
+	if (get_bytes(&p, end, kctx->kc_seed, sizeof(kctx->kc_seed)))
+		return GSS_S_FAILURE;
+
+	/* sign/seal algorithm, not really used now */
+	if (get_bytes(&p, end, &word, sizeof(word)) ||
+	    get_bytes(&p, end, &word, sizeof(word)))
+		return GSS_S_FAILURE;
+
+	/* end time */
+	if (get_bytes(&p, end, &kctx->kc_endtime, sizeof(kctx->kc_endtime)))
+		return GSS_S_FAILURE;
+
+	/* seq send */
+	if (get_bytes(&p, end, &word, sizeof(word)))
+		return GSS_S_FAILURE;
+	kctx->kc_seq_send = word;
+
+	/* mech oid */
+	if (get_rawobj(&p, end, &kctx->kc_mech_used))
+		return GSS_S_FAILURE;
+
+	/* old style enc/seq keys in format:
+	 *   - enctype (u32)
+	 *   - keysize (u32)
+	 *   - keydata
+	 * we decompose them to fit into the new context
+	 */
+
+	/* enc key */
+	if (get_bytes(&p, end, &kctx->kc_enctype, sizeof(kctx->kc_enctype)))
+		return GSS_S_FAILURE;
+
+	if (get_bytes(&p, end, &klen, sizeof(klen)) ||
+	    get_keyblock(&p, end, &kctx->kc_keye, klen))
+		return GSS_S_FAILURE;
+
+	/* seq key: must repeat the enctype and key size of the enc key */
+	if (get_bytes(&p, end, &word, sizeof(word)) ||
+	    word != kctx->kc_enctype)
+		return GSS_S_FAILURE;
+
+	if (get_bytes(&p, end, &word, sizeof(word)) ||
+	    word != klen)
+		return GSS_S_FAILURE;
+
+	if (get_keyblock(&p, end, &kctx->kc_keyc, klen))
+		return GSS_S_FAILURE;
+
+	/* old style fallback: the integrity key is a copy of the seq key */
+	if (keyblock_dup(&kctx->kc_keyi, &kctx->kc_keyc))
+		return GSS_S_FAILURE;
+
+	/* the blob must have been consumed exactly */
+	if (p != end)
+		return GSS_S_FAILURE;
+
+	CDEBUG(D_SEC, "succesfully imported rfc1964 context\n");
+	return 0;
+}
+
+/* Flag bits found in the flags word of a version 2 context */
+#define KRB5_CTX_FLAG_INITIATOR		0x00000001
+#define KRB5_CTX_FLAG_CFX		0x00000002
+#define KRB5_CTX_FLAG_ACCEPTOR_SUBKEY	0x00000004
+
+/*
+ * Parse a version 2 context from [p, end); the caller frees keys on failure.
+ */
+static __u32 import_context_rfc4121(struct krb5_ctx *kctx, char *p, char *end)
+{
+	unsigned int    word, klen;
+
+	/* end time */
+	if (get_bytes(&p, end, &kctx->kc_endtime, sizeof(kctx->kc_endtime)))
+		return GSS_S_FAILURE;
+
+	/* flags: bits are only ever set here, never cleared */
+	if (get_bytes(&p, end, &word, sizeof(word)))
+		return GSS_S_FAILURE;
+
+	if (word & KRB5_CTX_FLAG_INITIATOR)
+		kctx->kc_initiate = 1;
+	if (word & KRB5_CTX_FLAG_CFX)
+		kctx->kc_cfx = 1;
+	if (word & KRB5_CTX_FLAG_ACCEPTOR_SUBKEY)
+		kctx->kc_have_acceptor_subkey = 1;
+
+	/* seq send */
+	if (get_bytes(&p, end, &kctx->kc_seq_send, sizeof(kctx->kc_seq_send)))
+		return GSS_S_FAILURE;
+
+	/* enctype */
+	if (get_bytes(&p, end, &kctx->kc_enctype, sizeof(kctx->kc_enctype)))
+		return GSS_S_FAILURE;
+
+	/* size of each key */
+	if (get_bytes(&p, end, &klen, sizeof(klen)))
+		return GSS_S_FAILURE;
+
+	/* number of keys - should always be 3 */
+	if (get_bytes(&p, end, &word, sizeof(word)))
+		return GSS_S_FAILURE;
+
+	if (word != 3) {
+		CERROR("Invalid number of keys: %u\n", word);
+		return GSS_S_FAILURE;
+	}
+
+	/* ke */
+	if (get_keyblock(&p, end, &kctx->kc_keye, klen))
+		return GSS_S_FAILURE;
+	/* ki */
+	if (get_keyblock(&p, end, &kctx->kc_keyi, klen))
+		return GSS_S_FAILURE;
+	/* kc */
+	if (get_keyblock(&p, end, &kctx->kc_keyc, klen))
+		return GSS_S_FAILURE;
+
+	CDEBUG(D_SEC, "succesfully imported v2 context\n");
+	return 0;
+}
+
+/*
+ * The whole purpose here is to keep the user level gss context parsing
+ * from nfs-utils as unchanged as we can: it is not quite mature yet,
+ * and many things are still unclear, like heimdal etc.
+ */
+/* Import a serialized krb5 context from @inbuf and attach it to @gctx. */
+static __u32 gss_import_sec_context_kerberos(rawobj_t *inbuf,
+					     struct gss_ctx *gctx)
+{
+	char *cur = (char *)inbuf->data;
+	char *end = (char *)(inbuf->data + inbuf->len);
+	struct krb5_ctx *kctx;
+	unsigned int version, rc;
+
+	if (get_bytes(&cur, end, &version, sizeof(version))) {
+		CERROR("Fail to read version\n");
+		return GSS_S_FAILURE;
+	}
+
+	/* 0 and 1 select the rfc1964 layout, 2 the rfc4121 one */
+	if (version > 2) {
+		CERROR("Invalid version %u\n", version);
+		return GSS_S_FAILURE;
+	}
+
+	OBD_ALLOC_PTR(kctx);
+	if (!kctx)
+		return GSS_S_FAILURE;
+
+	if (version == 2) {
+		rc = import_context_rfc4121(kctx, cur, end);
+	} else {
+		/* in the old layout the leading word is the initiator flag */
+		kctx->kc_initiate = version;
+		rc = import_context_rfc1964(kctx, cur, end);
+	}
+
+	if (rc == 0)
+		rc = krb5_init_keys(kctx);
+
+	if (rc == 0) {
+		gctx->internal_ctx_id = kctx;
+		return GSS_S_COMPLETE;
+	}
+
+	delete_context_kerberos(kctx);
+	OBD_FREE_PTR(kctx);
+	return GSS_S_FAILURE;
+}
+
+/*
+ * Build in @gctx_new a copy of @gctx as seen from the other end: the
+ * initiator flag is inverted and the send/receive sequence numbers swap.
+ */
+static __u32 gss_copy_reverse_context_kerberos(struct gss_ctx *gctx,
+					       struct gss_ctx *gctx_new)
+{
+	struct krb5_ctx *src = gctx->internal_ctx_id;
+	struct krb5_ctx *dst;
+
+	OBD_ALLOC_PTR(dst);
+	if (!dst)
+		return GSS_S_FAILURE;
+
+	/* plain fields carry over, except that the role is flipped */
+	dst->kc_initiate = !src->kc_initiate;
+	dst->kc_cfx = src->kc_cfx;
+	dst->kc_seed_init = src->kc_seed_init;
+	dst->kc_have_acceptor_subkey = src->kc_have_acceptor_subkey;
+	dst->kc_endtime = src->kc_endtime;
+	dst->kc_enctype = src->kc_enctype;
+	memcpy(dst->kc_seed, src->kc_seed, sizeof(src->kc_seed));
+
+	/* what the original sends is what the copy receives */
+	dst->kc_seq_send = src->kc_seq_recv;
+	dst->kc_seq_recv = src->kc_seq_send;
+
+	/* duplicate the owned buffers, then set up the tfms for the copy;
+	 * the first step that fails undoes everything done so far */
+	if (rawobj_dup(&dst->kc_mech_used, &src->kc_mech_used) ||
+	    keyblock_dup(&dst->kc_keye, &src->kc_keye) ||
+	    keyblock_dup(&dst->kc_keyi, &src->kc_keyi) ||
+	    keyblock_dup(&dst->kc_keyc, &src->kc_keyc) ||
+	    krb5_init_keys(dst)) {
+		delete_context_kerberos(dst);
+		OBD_FREE_PTR(dst);
+		return GSS_S_FAILURE;
+	}
+
+	gctx_new->internal_ctx_id = dst;
+	CDEBUG(D_SEC, "succesfully copied reverse context\n");
+	return GSS_S_COMPLETE;
+}
+
+/* Store the context end time, read as an unsigned 32-bit value, in @endtime. */
+static __u32 gss_inquire_context_kerberos(struct gss_ctx *gctx,
+					  unsigned long *endtime)
+{
+	struct krb5_ctx *kctx = gctx->internal_ctx_id;
+
+	*endtime = (unsigned long)(__u32)kctx->kc_endtime;
+	return GSS_S_COMPLETE;
+}
+
+/* Delete hook: release everything the krb5 context owns, then the context. */
+static void gss_delete_sec_context_kerberos(void *internal_ctx)
+{
+	struct krb5_ctx *kctx = internal_ctx;
+
+	delete_context_kerberos(kctx);
+	OBD_FREE_PTR(kctx);
+}
+
+/* Make @sg a one-entry table for @ptr/@len; callers pass uninitialised stack entries. */
+static void buf_to_sg(struct scatterlist *sg, void *ptr, int len)
+{
+	sg_init_one(sg, ptr, len);
+}
+
+/*
+ * Run @tfm over @length bytes: @in is copied to @out (they may be the same
+ * buffer) and transformed there in place.  A NULL @iv means an all-zero iv.
+ */
+static __u32 krb5_encrypt(struct ll_crypto_cipher *tfm, int decrypt,
+			  void *iv, void *in, void *out, int length)
+{
+	struct blkcipher_desc desc;
+	struct scatterlist    sg;
+	__u8 local_iv[16] = {0};
+	__u32 ret = -EINVAL;
+
+	LASSERT(tfm);
+	desc.tfm  = tfm;
+	desc.info = local_iv;
+	desc.flags= 0;
+
+	if (length % ll_crypto_blkcipher_blocksize(tfm) != 0) {
+		CERROR("output length %d mismatch blocksize %d\n",
+		       length, ll_crypto_blkcipher_blocksize(tfm));
+		goto out;
+	}
+
+	if (ll_crypto_blkcipher_ivsize(tfm) > 16) {
+		CERROR("iv size too large %d\n", ll_crypto_blkcipher_ivsize(tfm));
+		goto out;
+	}
+
+	if (iv)
+		memcpy(local_iv, iv, ll_crypto_blkcipher_ivsize(tfm));
+
+	/* krb5_digest_norm() passes in == out; memcpy() must not see that */
+	if (in != out)
+		memcpy(out, in, length);
+	buf_to_sg(&sg, out, length);
+
+	if (decrypt)
+		ret = ll_crypto_blkcipher_decrypt_iv(&desc, &sg, &sg, length);
+	else
+		ret = ll_crypto_blkcipher_encrypt_iv(&desc, &sg, &sg, length);
+out:
+	return ret;
+}
+
+
+/* Keyed variant: set @key on @tfm, then digest msgs, iovs and @khdr into cksum->data. */
+static inline int krb5_digest_hmac(struct ll_crypto_hash *tfm,
+				   rawobj_t *key,
+				   struct krb5_header *khdr,
+				   int msgcnt, rawobj_t *msgs,
+				   int iovcnt, lnet_kiov_t *iovs,
+				   rawobj_t *cksum)
+{
+	struct hash_desc   desc;
+	struct scatterlist sg[1];
+	int		i;
+
+	ll_crypto_hash_setkey(tfm, key->data, key->len);
+	desc.tfm  = tfm;
+	desc.flags= 0;
+	ll_crypto_hash_init(&desc);
+
+	for (i = 0; i < msgcnt; i++) {
+		if (msgs[i].len == 0)
+			continue;
+		buf_to_sg(sg, (char *) msgs[i].data, msgs[i].len);
+		ll_crypto_hash_update(&desc, sg, msgs[i].len);
+	}
+
+	for (i = 0; i < iovcnt; i++) {
+		if (iovs[i].kiov_len == 0)
+			continue;
+		/* sg lives on the stack: initialise it before sg_set_page() */
+		sg_init_table(sg, 1);
+		sg_set_page(&sg[0], iovs[i].kiov_page, iovs[i].kiov_len,
+			    iovs[i].kiov_offset);
+		ll_crypto_hash_update(&desc, sg, iovs[i].kiov_len);
+	}
+
+	if (khdr) {
+		buf_to_sg(sg, (char *) khdr, sizeof(*khdr));
+		ll_crypto_hash_update(&desc, sg, sizeof(*khdr));
+	}
+
+	return ll_crypto_hash_final(&desc, cksum->data);
+}
+
+
+/* Keyless variant: digest msgs, iovs and @khdr, then encrypt the digest with kb->kb_tfm. */
+static inline int krb5_digest_norm(struct ll_crypto_hash *tfm,
+				   struct krb5_keyblock *kb,
+				   struct krb5_header *khdr,
+				   int msgcnt, rawobj_t *msgs,
+				   int iovcnt, lnet_kiov_t *iovs,
+				   rawobj_t *cksum)
+{
+	struct hash_desc   desc;
+	struct scatterlist sg[1];
+	int		i;
+
+	LASSERT(kb->kb_tfm);
+	desc.tfm  = tfm;
+	desc.flags= 0;
+	ll_crypto_hash_init(&desc);
+
+	for (i = 0; i < msgcnt; i++) {
+		if (msgs[i].len == 0)
+			continue;
+		buf_to_sg(sg, (char *) msgs[i].data, msgs[i].len);
+		ll_crypto_hash_update(&desc, sg, msgs[i].len);
+	}
+
+	for (i = 0; i < iovcnt; i++) {
+		if (iovs[i].kiov_len == 0)
+			continue;
+		/* sg lives on the stack: initialise it before sg_set_page() */
+		sg_init_table(sg, 1);
+		sg_set_page(&sg[0], iovs[i].kiov_page, iovs[i].kiov_len,
+			    iovs[i].kiov_offset);
+		ll_crypto_hash_update(&desc, sg, iovs[i].kiov_len);
+	}
+
+	if (khdr) {
+		buf_to_sg(sg, (char *) khdr, sizeof(*khdr));
+		ll_crypto_hash_update(&desc, sg, sizeof(*khdr));
+	}
+
+	ll_crypto_hash_final(&desc, cksum->data);
+
+	return krb5_encrypt(kb->kb_tfm, 0, NULL, cksum->data,
+			    cksum->data, cksum->len);
+}
+
+/*
+ * compute a (keyed/keyless) checksum over the plain text with the krb5
+ * wire token header appended to it.
+ */
+/* Allocate cksum->data and fill it with the enctype's digest of msgs, iovs and @khdr. */
+static __s32 krb5_make_checksum(__u32 enctype, struct krb5_keyblock *kb,
+				struct krb5_header *khdr,
+				int msgcnt, rawobj_t *msgs,
+				int iovcnt, lnet_kiov_t *iovs, rawobj_t *cksum)
+{
+	struct krb5_enctype   *ke = &enctypes[enctype];
+	struct ll_crypto_hash *tfm;
+	__u32		  code = GSS_S_FAILURE;
+	int		    rc;
+
+	/* catch both a NULL and an ERR_PTR-encoded allocation failure */
+	tfm = ll_crypto_alloc_hash(ke->ke_hash_name, 0, 0);
+	if (IS_ERR_OR_NULL(tfm)) {
+		CERROR("failed to alloc TFM: %s\n", ke->ke_hash_name);
+		return GSS_S_FAILURE;
+	}
+
+	cksum->len = ll_crypto_hash_digestsize(tfm);
+	OBD_ALLOC_LARGE(cksum->data, cksum->len);
+	if (!cksum->data) {
+		cksum->len = 0;
+		goto out_tfm;
+	}
+
+	if (ke->ke_hash_hmac)
+		rc = krb5_digest_hmac(tfm, &kb->kb_key,
+				      khdr, msgcnt, msgs, iovcnt, iovs, cksum);
+	else
+		rc = krb5_digest_norm(tfm, kb,
+				      khdr, msgcnt, msgs, iovcnt, iovs, cksum);
+
+	if (rc == 0)
+		code = GSS_S_COMPLETE;
+out_tfm:
+	ll_crypto_free_hash(tfm);
+	return code;
+}
+
+/* Fill the wire header of an outgoing token, taking the next send sequence number. */
+static void fill_krb5_header(struct krb5_ctx *kctx,
+			     struct krb5_header *khdr, int privacy)
+{
+	__u8 flags = kctx->kc_initiate ? 0 : FLAG_SENDER_IS_ACCEPTOR;
+	__u16 tok_id = KG_TOK_MIC_MSG;
+	__u16 ec_rrc = 0xffff;
+
+	if (privacy) {
+		tok_id = KG_TOK_WRAP_MSG;
+		flags |= FLAG_WRAP_CONFIDENTIAL;
+		ec_rrc = 0;
+	}
+
+	khdr->kh_tok_id = cpu_to_be16(tok_id);
+	khdr->kh_flags = flags;
+	khdr->kh_ec = cpu_to_be16(ec_rrc);
+	khdr->kh_rrc = cpu_to_be16(ec_rrc);
+	khdr->kh_filler = 0xff;
+
+	/* the increment of kc_seq_send is done under krb5_seq_lock */
+	spin_lock(&krb5_seq_lock);
+	khdr->kh_seq = cpu_to_be64(kctx->kc_seq_send++);
+	spin_unlock(&krb5_seq_lock);
+}
+
+/*
+ * Sanity-check the wire header of a received token against what this side
+ * of the context expects; returns a GSS major status.
+ */
+static __u32 verify_krb5_header(struct krb5_ctx *kctx,
+				struct krb5_header *khdr,
+				int privacy)
+{
+	/* an initiator expects the acceptor flag on what it receives */
+	__u8 want_dir = kctx->kc_initiate ? FLAG_SENDER_IS_ACCEPTOR : 0;
+	__u16 want_tok = privacy ? KG_TOK_WRAP_MSG : KG_TOK_MIC_MSG;
+	__u16 want_ec_rrc = privacy ? 0x0 : 0xffff;
+
+	if (be16_to_cpu(khdr->kh_tok_id) != want_tok) {
+		CERROR("bad token id\n");
+		return GSS_S_DEFECTIVE_TOKEN;
+	}
+
+	if ((khdr->kh_flags & FLAG_SENDER_IS_ACCEPTOR) != want_dir) {
+		CERROR("bad direction flag\n");
+		return GSS_S_BAD_SIG;
+	}
+
+	if (privacy && !(khdr->kh_flags & FLAG_WRAP_CONFIDENTIAL)) {
+		CERROR("missing confidential flag\n");
+		return GSS_S_BAD_SIG;
+	}
+
+	if (khdr->kh_filler != 0xff) {
+		CERROR("bad filler\n");
+		return GSS_S_DEFECTIVE_TOKEN;
+	}
+
+	if (be16_to_cpu(khdr->kh_ec) != want_ec_rrc ||
+	    be16_to_cpu(khdr->kh_rrc) != want_ec_rrc) {
+		CERROR("bad EC or RRC\n");
+		return GSS_S_DEFECTIVE_TOKEN;
+	}
+
+	return GSS_S_COMPLETE;
+}
+
+/* Write a krb5 header plus the truncated checksum of @msgs/@iovs into @token. */
+static __u32 gss_get_mic_kerberos(struct gss_ctx *gctx,
+				  int msgcnt, rawobj_t *msgs,
+				  int iovcnt, lnet_kiov_t *iovs,
+				  rawobj_t *token)
+{
+	struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+	struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+	struct krb5_header  *khdr;
+	rawobj_t	     cksum = RAWOBJ_EMPTY;
+
+	/* fill krb5 header */
+	LASSERT(token->len >= sizeof(*khdr));
+	khdr = (struct krb5_header *) token->data;
+	fill_krb5_header(kctx, khdr, 0);
+
+	/* checksum */
+	if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc,
+			       khdr, msgcnt, msgs, iovcnt, iovs, &cksum)) {
+		/* the digest may have failed after cksum.data was allocated */
+		rawobj_free(&cksum);
+		return GSS_S_FAILURE;
+	}
+
+	LASSERT(cksum.len >= ke->ke_hash_size);
+	LASSERT(token->len >= sizeof(*khdr) + ke->ke_hash_size);
+	memcpy(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size,
+	       ke->ke_hash_size);
+	token->len = sizeof(*khdr) + ke->ke_hash_size;
+	rawobj_free(&cksum);
+	return GSS_S_COMPLETE;
+}
+
+/*
+ * Verify a MIC token against the given message buffers and pages.
+ *
+ * The token must hold a valid krb5 MIC header followed by at least
+ * ke_hash_size bytes of checksum.  The checksum is recomputed with
+ * krb5_make_checksum() and its trailing ke_hash_size bytes are compared
+ * with the bytes that follow the header in the token.
+ *
+ * Returns GSS_S_COMPLETE on a match, GSS_S_BAD_SIG on a mismatch,
+ * GSS_S_DEFECTIVE_TOKEN if the token is shorter than a header, the status
+ * of verify_krb5_header() if the header is rejected, and GSS_S_FAILURE if
+ * the checksum part is truncated or the checksum can't be recomputed.
+ */
+static
+__u32 gss_verify_mic_kerberos(struct gss_ctx *gctx,
+			      int msgcnt,
+			      rawobj_t *msgs,
+			      int iovcnt,
+			      lnet_kiov_t *iovs,
+			      rawobj_t *token)
+{
+	struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+	struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+	struct krb5_header  *hdr;
+	rawobj_t	     digest = RAWOBJ_EMPTY;
+	__u32		     status;
+	int		     differs;
+
+	if (token->len < sizeof(*hdr)) {
+		CERROR("short signature: %u\n", token->len);
+		return GSS_S_DEFECTIVE_TOKEN;
+	}
+
+	hdr = (struct krb5_header *) token->data;
+
+	status = verify_krb5_header(kctx, hdr, 0);
+	if (status != GSS_S_COMPLETE) {
+		CERROR("bad krb5 header\n");
+		return status;
+	}
+
+	/* the checksum bytes must be present after the header */
+	if (token->len < sizeof(*hdr) + ke->ke_hash_size) {
+		CERROR("short signature: %u, require %d\n",
+		       token->len, (int) sizeof(*hdr) + ke->ke_hash_size);
+		return GSS_S_FAILURE;
+	}
+
+	if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc, hdr,
+			       msgcnt, msgs, iovcnt, iovs, &digest) != 0) {
+		CERROR("failed to make checksum\n");
+		return GSS_S_FAILURE;
+	}
+
+	LASSERT(digest.len >= ke->ke_hash_size);
+
+	/* compare against the tail of the freshly computed checksum */
+	differs = memcmp(hdr + 1,
+			 digest.data + digest.len - ke->ke_hash_size,
+			 ke->ke_hash_size);
+	if (differs)
+		CERROR("checksum mismatch\n");
+
+	rawobj_free(&digest);
+	return differs ? GSS_S_BAD_SIG : GSS_S_COMPLETE;
+}
+
+/*
+ * Pad msg in place so that msg->len becomes a multiple of blocksize.
+ * Every padding byte holds the number of bytes added.  The mask
+ * arithmetic assumes blocksize is a power of two.
+ *
+ * msg_buflen is the capacity of msg->data.
+ *
+ * Returns 0 on success (including when no padding is needed), or -EINVAL
+ * if the padded message would not fit in msg_buflen.
+ */
+static
+int add_padding(rawobj_t *msg, int msg_buflen, int blocksize)
+{
+	const int mask = blocksize - 1;
+	int	  pad;
+
+	pad = (blocksize - (msg->len & mask)) & mask;
+	if (pad != 0) {
+		if (msg->len + pad > msg_buflen) {
+			CERROR("bufsize %u too small: datalen %u, padding %u\n",
+				msg_buflen, msg->len, pad);
+			return -EINVAL;
+		}
+
+		memset(msg->data + msg->len, pad, pad);
+		msg->len += pad;
+	}
+
+	return 0;
+}
+
+/*
+ * Run inobj_cnt input buffers through tfm, one cipher call per buffer,
+ * storing the results back to back in outobj->data.
+ *
+ * mode_ecb != 0 uses the plain encrypt/decrypt calls; otherwise the _iv
+ * variants are used.  enc != 0 encrypts, enc == 0 decrypts.  All calls
+ * share one descriptor whose info field points at a zero-initialised
+ * 16-byte local IV.
+ *
+ * On entry outobj->len is the capacity of outobj->data (asserted for each
+ * input); on success it is set to the total number of bytes produced.
+ *
+ * Returns 0 on success or the error code of the failing cipher call.
+ */
+static
+int krb5_encrypt_rawobjs(struct ll_crypto_cipher *tfm,
+			 int mode_ecb,
+			 int inobj_cnt,
+			 rawobj_t *inobjs,
+			 rawobj_t *outobj,
+			 int enc)
+{
+	struct blkcipher_desc ciph;
+	struct scatterlist    sg_in, sg_out;
+	__u8		      iv[16] = {0};
+	__u8		     *wpos;
+	__u32		      total = 0;
+	int		      idx, rc;
+	ENTRY;
+
+	ciph.tfm   = tfm;
+	ciph.info  = iv;
+	ciph.flags = 0;
+
+	wpos = outobj->data;
+
+	for (idx = 0; idx < inobj_cnt; idx++) {
+		rawobj_t *in = &inobjs[idx];
+
+		/* the output of this object must still fit */
+		LASSERT(wpos + in->len <= outobj->data + outobj->len);
+
+		buf_to_sg(&sg_in, in->data, in->len);
+		buf_to_sg(&sg_out, wpos, outobj->len - total);
+
+		if (mode_ecb && enc)
+			rc = ll_crypto_blkcipher_encrypt(
+				&ciph, &sg_out, &sg_in, sg_in.length);
+		else if (mode_ecb)
+			rc = ll_crypto_blkcipher_decrypt(
+				&ciph, &sg_out, &sg_in, sg_in.length);
+		else if (enc)
+			rc = ll_crypto_blkcipher_encrypt_iv(
+				&ciph, &sg_out, &sg_in, sg_in.length);
+		else
+			rc = ll_crypto_blkcipher_decrypt_iv(
+				&ciph, &sg_out, &sg_in, sg_in.length);
+
+		if (rc != 0) {
+			CERROR("encrypt error %d\n", rc);
+			RETURN(rc);
+		}
+
+		wpos  += in->len;
+		total += in->len;
+	}
+
+	outobj->len = total;
+	RETURN(0);
+}
+
+/*
+ * Encrypt a bulk transfer: the confounder, then every clear-text page of
+ * desc, then the krb5 header, each with ll_crypto_blkcipher_encrypt_iv()
+ * on one shared cipher descriptor (tfm plus a zero-initialised local IV).
+ *
+ * - the encrypted confounder is written to cipher->data[0 .. blocksize)
+ * - page i of desc->bd_iov is encrypted into page i of desc->bd_enc_iov
+ *   with its length rounded up to a multiple of blocksize; the matching
+ *   bd_enc_iov[] kiov_offset/kiov_len are overwritten accordingly
+ * - the encrypted header is written to cipher->data + blocksize
+ *
+ * cipher->len must equal blocksize + sizeof(*khdr) (asserted).
+ *
+ * if adj_nob != 0, we adjust desc->bd_nob to the actual cipher text size.
+ *
+ * Returns 0 on success, or the error returned by the failing cipher call.
+ */
+static
+int krb5_encrypt_bulk(struct ll_crypto_cipher *tfm,
+		      struct krb5_header *khdr,
+		      char *confounder,
+		      struct ptlrpc_bulk_desc *desc,
+		      rawobj_t *cipher,
+		      int adj_nob)
+{
+	struct blkcipher_desc   ciph_desc;
+	__u8		    local_iv[16] = {0};
+	struct scatterlist      src, dst;
+	int		     blocksize, i, rc, nob = 0;
+
+	LASSERT(desc->bd_iov_count);
+	LASSERT(desc->bd_enc_iov);
+
+	blocksize = ll_crypto_blkcipher_blocksize(tfm);
+	LASSERT(blocksize > 1);
+	LASSERT(cipher->len == blocksize + sizeof(*khdr));
+
+	ciph_desc.tfm  = tfm;
+	ciph_desc.info = local_iv;
+	ciph_desc.flags = 0;
+
+	/* encrypt confounder */
+	buf_to_sg(&src, confounder, blocksize);
+	buf_to_sg(&dst, cipher->data, blocksize);
+
+	rc = ll_crypto_blkcipher_encrypt_iv(&ciph_desc, &dst, &src, blocksize);
+	if (rc) {
+		CERROR("error to encrypt confounder: %d\n", rc);
+		return rc;
+	}
+
+	/* encrypt clear pages */
+	for (i = 0; i < desc->bd_iov_count; i++) {
+		/* source length is the page length rounded up to blocksize */
+		sg_set_page(&src, desc->bd_iov[i].kiov_page,
+			    (desc->bd_iov[i].kiov_len + blocksize - 1) &
+			    (~(blocksize - 1)),
+			    desc->bd_iov[i].kiov_offset);
+		if (adj_nob)
+			nob += src.length;
+		/* the cipher page takes the same (rounded) length and offset */
+		sg_set_page(&dst, desc->bd_enc_iov[i].kiov_page, src.length,
+			    src.offset);
+
+		/* record the cipher-text geometry in the descriptor */
+		desc->bd_enc_iov[i].kiov_offset = dst.offset;
+		desc->bd_enc_iov[i].kiov_len = dst.length;
+
+		rc = ll_crypto_blkcipher_encrypt_iv(&ciph_desc, &dst, &src,
+						    src.length);
+		if (rc) {
+			CERROR("error to encrypt page: %d\n", rc);
+			return rc;
+		}
+	}
+
+	/* encrypt krb5 header */
+	buf_to_sg(&src, khdr, sizeof(*khdr));
+	buf_to_sg(&dst, cipher->data + blocksize, sizeof(*khdr));
+
+	rc = ll_crypto_blkcipher_encrypt_iv(&ciph_desc,
+					    &dst, &src, sizeof(*khdr));
+	if (rc) {
+		CERROR("error to encrypt krb5 header: %d\n", rc);
+		return rc;
+	}
+
+	/* nob is the sum of the rounded-up page lengths */
+	if (adj_nob)
+		desc->bd_nob = nob;
+
+	return 0;
+}
+
+/*
+ * Decrypt a bulk transfer: the confounder (cipher->data[0 .. blocksize)
+ * into plain->data), then every cipher page of desc->bd_enc_iov, then the
+ * encrypted krb5 header at cipher->data + blocksize (in place), which is
+ * finally compared with the clear header khdr.  All cipher calls share
+ * one descriptor with a zero-initialised local IV.
+ *
+ * desc->bd_nob_transferred is the size of cipher text received.
+ * desc->bd_nob is the size the plain text is supposed to have.
+ *
+ * if adj_nob != 0, we adjust each page's kiov_len to the actual
+ * plain text size.
+ * - for client read: we don't know the data size for each page, so
+ *   bd_iov[]->kiov_len is set to PAGE_SIZE, but the data actually received
+ *   might be smaller, so we need to adjust it according to
+ *   bd_enc_iov[]->kiov_len.  this means we DO NOT support the situation
+ *   where the server sends an odd-sized piece of data in a page which is
+ *   not the last one.
+ * - for server write: we know exactly the data size expected for each
+ *   page, thus kiov_len is accurate already and we should not adjust it at
+ *   all.  bd_enc_iov[]->kiov_len should be round_up(bd_iov[]->kiov_len),
+ *   which should have been done by prep_bulk().
+ *
+ * Returns 0 on success; -EPROTO if bd_nob_transferred is not a multiple of
+ * blocksize; -EFAULT for a misaligned cipher page or a byte-count
+ * mismatch; -EACCES if the decrypted header differs from khdr; otherwise
+ * the error returned by the failing cipher call.
+ */
+static
+int krb5_decrypt_bulk(struct ll_crypto_cipher *tfm,
+		      struct krb5_header *khdr,
+		      struct ptlrpc_bulk_desc *desc,
+		      rawobj_t *cipher,
+		      rawobj_t *plain,
+		      int adj_nob)
+{
+	struct blkcipher_desc   ciph_desc;
+	__u8		    local_iv[16] = {0};
+	struct scatterlist      src, dst;
+	/* running totals of cipher text consumed / plain text produced */
+	int		     ct_nob = 0, pt_nob = 0;
+	int		     blocksize, i, rc;
+
+	LASSERT(desc->bd_iov_count);
+	LASSERT(desc->bd_enc_iov);
+	LASSERT(desc->bd_nob_transferred);
+
+	blocksize = ll_crypto_blkcipher_blocksize(tfm);
+	LASSERT(blocksize > 1);
+	LASSERT(cipher->len == blocksize + sizeof(*khdr));
+
+	ciph_desc.tfm  = tfm;
+	ciph_desc.info = local_iv;
+	ciph_desc.flags = 0;
+
+	/* the cipher text received must be a whole number of blocks */
+	if (desc->bd_nob_transferred % blocksize) {
+		CERROR("odd transferred nob: %d\n", desc->bd_nob_transferred);
+		return -EPROTO;
+	}
+
+	/* decrypt head (confounder) */
+	buf_to_sg(&src, cipher->data, blocksize);
+	buf_to_sg(&dst, plain->data, blocksize);
+
+	rc = ll_crypto_blkcipher_decrypt_iv(&ciph_desc, &dst, &src, blocksize);
+	if (rc) {
+		CERROR("error to decrypt confounder: %d\n", rc);
+		return rc;
+	}
+
+	/* walk the pages until all received cipher text has been consumed */
+	for (i = 0; i < desc->bd_iov_count && ct_nob < desc->bd_nob_transferred;
+	     i++) {
+		/* cipher pages must be block aligned in offset and length */
+		if (desc->bd_enc_iov[i].kiov_offset % blocksize != 0 ||
+		    desc->bd_enc_iov[i].kiov_len % blocksize != 0) {
+			CERROR("page %d: odd offset %u len %u, blocksize %d\n",
+			       i, desc->bd_enc_iov[i].kiov_offset,
+			       desc->bd_enc_iov[i].kiov_len, blocksize);
+			return -EFAULT;
+		}
+
+		if (adj_nob) {
+			/* clamp the cipher length to what is left to consume */
+			if (ct_nob + desc->bd_enc_iov[i].kiov_len >
+			    desc->bd_nob_transferred)
+				desc->bd_enc_iov[i].kiov_len =
+					desc->bd_nob_transferred - ct_nob;
+
+			/*
+			 * plain length follows the cipher length, clamped so
+			 * the plain total never exceeds bd_nob
+			 */
+			desc->bd_iov[i].kiov_len = desc->bd_enc_iov[i].kiov_len;
+			if (pt_nob + desc->bd_enc_iov[i].kiov_len >desc->bd_nob)
+				desc->bd_iov[i].kiov_len = desc->bd_nob -pt_nob;
+		} else {
+			/* this should be guaranteed by LNET */
+			LASSERT(ct_nob + desc->bd_enc_iov[i].kiov_len <=
+				desc->bd_nob_transferred);
+			LASSERT(desc->bd_iov[i].kiov_len <=
+				desc->bd_enc_iov[i].kiov_len);
+		}
+
+		/* nothing to decrypt for an empty cipher page */
+		if (desc->bd_enc_iov[i].kiov_len == 0)
+			continue;
+
+		sg_set_page(&src, desc->bd_enc_iov[i].kiov_page,
+			    desc->bd_enc_iov[i].kiov_len,
+			    desc->bd_enc_iov[i].kiov_offset);
+		/*
+		 * by default decrypt in place in the cipher page; only when
+		 * the plain length is a whole number of blocks is the output
+		 * pointed straight at the plain page.
+		 */
+		dst = src;
+		if (desc->bd_iov[i].kiov_len % blocksize == 0)
+			sg_assign_page(&dst, desc->bd_iov[i].kiov_page);
+
+		rc = ll_crypto_blkcipher_decrypt_iv(&ciph_desc, &dst, &src,
+						    src.length);
+		if (rc) {
+			CERROR("error to decrypt page: %d\n", rc);
+			return rc;
+		}
+
+		/*
+		 * in-place case: copy just kiov_len plain bytes over to the
+		 * plain page.  Both addresses are formed with
+		 * bd_iov[i].kiov_offset.
+		 */
+		if (desc->bd_iov[i].kiov_len % blocksize != 0) {
+			memcpy(page_address(desc->bd_iov[i].kiov_page) +
+			       desc->bd_iov[i].kiov_offset,
+			       page_address(desc->bd_enc_iov[i].kiov_page) +
+			       desc->bd_iov[i].kiov_offset,
+			       desc->bd_iov[i].kiov_len);
+		}
+
+		ct_nob += desc->bd_enc_iov[i].kiov_len;
+		pt_nob += desc->bd_iov[i].kiov_len;
+	}
+
+	/* every received cipher byte must have been decrypted */
+	if (unlikely(ct_nob != desc->bd_nob_transferred)) {
+		CERROR("%d cipher text transferred but only %d decrypted\n",
+		       desc->bd_nob_transferred, ct_nob);
+		return -EFAULT;
+	}
+
+	/* without adjustment the plain total must match bd_nob exactly */
+	if (unlikely(!adj_nob && pt_nob != desc->bd_nob)) {
+		CERROR("%d plain text expected but only %d received\n",
+		       desc->bd_nob, pt_nob);
+		return -EFAULT;
+	}
+
+	/* if needed, clear up the rest unused iovs */
+	if (adj_nob)
+		while (i < desc->bd_iov_count)
+			desc->bd_iov[i++].kiov_len = 0;
+
+	/* decrypt tail (krb5 header) */
+	buf_to_sg(&src, cipher->data + blocksize, sizeof(*khdr));
+	buf_to_sg(&dst, cipher->data + blocksize, sizeof(*khdr));
+
+	rc = ll_crypto_blkcipher_decrypt_iv(&ciph_desc,
+					    &dst, &src, sizeof(*khdr));
+	if (rc) {
+		CERROR("error to decrypt tail: %d\n", rc);
+		return rc;
+	}
+
+	/* the decrypted copy must equal the header sent in the clear */
+	if (memcmp(cipher->data + blocksize, khdr, sizeof(*khdr))) {
+		CERROR("krb5 header doesn't match\n");
+		return -EACCES;
+	}
+
+	return 0;
+}
+
+/*
+ * Build a privacy (wrap) token for msg.
+ *
+ * msg is first padded in place to the cipher block size (msg_buflen is
+ * the capacity of msg->data).  A checksum is made by krb5_make_checksum()
+ * from the krb5 header plus confounder | gsshdr | padded msg, then
+ * confounder | padded msg | krb5 header is encrypted directly into the
+ * token right after the clear krb5 header.  The trailing ke_hash_size
+ * bytes of the checksum are appended after the cipher text.
+ *
+ * For ENCTYPE_ARCFOUR_HMAC a temporary "ecb(arc4)" tfm is allocated, keyed
+ * with the output of krb5_make_checksum() run over the checksum, and
+ * freed again before returning.
+ *
+ * On entry token->len is the capacity of token->data (asserted); on
+ * success it is set to the final token length.
+ *
+ * Returns GSS_S_COMPLETE on success, GSS_S_FAILURE otherwise.
+ */
+static
+__u32 gss_wrap_kerberos(struct gss_ctx *gctx,
+			rawobj_t *gsshdr,
+			rawobj_t *msg,
+			int msg_buflen,
+			rawobj_t *token)
+{
+	struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+	struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+	struct krb5_header  *khdr;
+	int		  blocksize;
+	rawobj_t	     cksum = RAWOBJ_EMPTY;
+	rawobj_t	     data_desc[3], cipher;
+	__u8		 conf[GSS_MAX_CIPHER_BLOCK];
+	int		  rc = 0;
+
+	LASSERT(ke);
+	LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK);
+	LASSERT(kctx->kc_keye.kb_tfm == NULL ||
+		ke->ke_conf_size >=
+		ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm));
+
+	/*
+	 * final token format:
+	 * ---------------------------------------------------
+	 * | krb5 header | cipher text | checksum (16 bytes) |
+	 * ---------------------------------------------------
+	 */
+
+	/* fill krb5 header */
+	LASSERT(token->len >= sizeof(*khdr));
+	khdr = (struct krb5_header *) token->data;
+	fill_krb5_header(kctx, khdr, 1);
+
+	/* generate confounder */
+	cfs_get_random_bytes(conf, ke->ke_conf_size);
+
+	/* get encryption blocksize. note kc_keye might not associated with
+	 * a tfm, currently only for arcfour-hmac */
+	if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+		LASSERT(kctx->kc_keye.kb_tfm == NULL);
+		blocksize = 1;
+	} else {
+		LASSERT(kctx->kc_keye.kb_tfm);
+		blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm);
+	}
+	LASSERT(blocksize <= ke->ke_conf_size);
+
+	/* padding the message */
+	if (add_padding(msg, msg_buflen, blocksize))
+		return GSS_S_FAILURE;
+
+	/*
+	 * clear text layout for checksum:
+	 * ------------------------------------------------------
+	 * | confounder | gss header | clear msgs | krb5 header |
+	 * ------------------------------------------------------
+	 */
+	data_desc[0].data = conf;
+	data_desc[0].len = ke->ke_conf_size;
+	data_desc[1].data = gsshdr->data;
+	data_desc[1].len = gsshdr->len;
+	data_desc[2].data = msg->data;
+	data_desc[2].len = msg->len;
+
+	/* compute checksum */
+	if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi,
+			       khdr, 3, data_desc, 0, NULL, &cksum))
+		return GSS_S_FAILURE;
+	LASSERT(cksum.len >= ke->ke_hash_size);
+
+	/*
+	 * clear text layout for encryption:
+	 * -----------------------------------------
+	 * | confounder | clear msgs | krb5 header |
+	 * -----------------------------------------
+	 */
+	data_desc[0].data = conf;
+	data_desc[0].len = ke->ke_conf_size;
+	data_desc[1].data = msg->data;
+	data_desc[1].len = msg->len;
+	data_desc[2].data = (__u8 *) khdr;
+	data_desc[2].len = sizeof(*khdr);
+
+	/* cipher text will be directly inplace */
+	cipher.data = (__u8 *) (khdr + 1);
+	cipher.len = token->len - sizeof(*khdr);
+	LASSERT(cipher.len >= ke->ke_conf_size + msg->len + sizeof(*khdr));
+
+	if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+		rawobj_t		 arc4_keye;
+		struct ll_crypto_cipher *arc4_tfm;
+
+		/* the arc4 key is produced by checksumming the checksum */
+		if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi,
+				       NULL, 1, &cksum, 0, NULL, &arc4_keye)) {
+			CERROR("failed to obtain arc4 enc key\n");
+			GOTO(arc4_out, rc = -EACCES);
+		}
+
+		arc4_tfm = ll_crypto_alloc_blkcipher("ecb(arc4)", 0, 0);
+		if (IS_ERR(arc4_tfm)) {
+			CERROR("failed to alloc tfm arc4 in ECB mode\n");
+			GOTO(arc4_out_key, rc = -EACCES);
+		}
+
+		if (ll_crypto_blkcipher_setkey(arc4_tfm, arc4_keye.data,
+					       arc4_keye.len)) {
+			CERROR("failed to set arc4 key, len %d\n",
+			       arc4_keye.len);
+			GOTO(arc4_out_tfm, rc = -EACCES);
+		}
+
+		rc = krb5_encrypt_rawobjs(arc4_tfm, 1,
+					  3, data_desc, &cipher, 1);
+		/* unwind in reverse order of acquisition */
+arc4_out_tfm:
+		ll_crypto_free_blkcipher(arc4_tfm);
+arc4_out_key:
+		rawobj_free(&arc4_keye);
+arc4_out:
+		do {} while(0); /* just to avoid compile warning */
+	} else {
+		rc = krb5_encrypt_rawobjs(kctx->kc_keye.kb_tfm, 0,
+					  3, data_desc, &cipher, 1);
+	}
+
+	if (rc != 0) {
+		rawobj_free(&cksum);
+		return GSS_S_FAILURE;
+	}
+
+	/* fill in checksum */
+	/* cipher.len now holds the cipher text size set by the encrypt call */
+	LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size);
+	memcpy((char *)(khdr + 1) + cipher.len,
+	       cksum.data + cksum.len - ke->ke_hash_size,
+	       ke->ke_hash_size);
+	rawobj_free(&cksum);
+
+	/* final token length */
+	token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size;
+	return GSS_S_COMPLETE;
+}
+
+/*
+ * Prepare the cipher-text kiov array of a bulk descriptor: every
+ * bd_enc_iov[] entry takes the offset of the matching bd_iov[] entry and
+ * that entry's length rounded up to the cipher block size.
+ *
+ * Returns GSS_S_COMPLETE on success, or GSS_S_FAILURE if a clear-text
+ * page offset is not a multiple of the cipher block size.
+ */
+static
+__u32 gss_prep_bulk_kerberos(struct gss_ctx *gctx,
+			     struct ptlrpc_bulk_desc *desc)
+{
+	struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+	int		  blocksize, i;
+
+	LASSERT(desc->bd_iov_count);
+	LASSERT(desc->bd_enc_iov);
+	LASSERT(kctx->kc_keye.kb_tfm);
+
+	blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm);
+
+	for (i = 0; i < desc->bd_iov_count; i++) {
+		LASSERT(desc->bd_enc_iov[i].kiov_page);
+		/*
+		 * offset should always start at page boundary of either
+		 * client or server side.
+		 *
+		 * The test uses the (blocksize - 1) mask: "& blocksize" only
+		 * looks at a single bit, which lets misaligned offsets
+		 * through and rejects aligned ones such as an offset equal
+		 * to blocksize.
+		 */
+		if (desc->bd_iov[i].kiov_offset & (blocksize - 1)) {
+			CERROR("odd offset %d in page %d\n",
+			       desc->bd_iov[i].kiov_offset, i);
+			return GSS_S_FAILURE;
+		}
+
+		/* same offset, length rounded up to a whole number of blocks */
+		desc->bd_enc_iov[i].kiov_offset = desc->bd_iov[i].kiov_offset;
+		desc->bd_enc_iov[i].kiov_len = (desc->bd_iov[i].kiov_len +
+						blocksize - 1) & (~(blocksize - 1));
+	}
+
+	return GSS_S_COMPLETE;
+}
+
+/*
+ * Build the privacy token for a bulk transfer and encrypt the bulk pages.
+ *
+ * A checksum is made by krb5_make_checksum() from the krb5 header, the
+ * confounder and the clear pages of desc->bd_iov.  krb5_encrypt_bulk()
+ * then encrypts the confounder and the krb5 header into the token (right
+ * after the clear header) and the pages into desc->bd_enc_iov.  The
+ * trailing ke_hash_size bytes of the checksum are appended last.
+ *
+ * adj_nob is handed through to krb5_encrypt_bulk().
+ * ENCTYPE_ARCFOUR_HMAC is not handled here: that branch hits LBUG().
+ *
+ * On entry token->len is the capacity of token->data (asserted); on
+ * success it is set to the final token length.
+ *
+ * Returns GSS_S_COMPLETE on success, GSS_S_FAILURE otherwise.
+ */
+static
+__u32 gss_wrap_bulk_kerberos(struct gss_ctx *gctx,
+			     struct ptlrpc_bulk_desc *desc,
+			     rawobj_t *token, int adj_nob)
+{
+	struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+	struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+	struct krb5_header  *khdr;
+	int		  blocksize;
+	rawobj_t	     cksum = RAWOBJ_EMPTY;
+	rawobj_t	     data_desc[1], cipher;
+	__u8		 conf[GSS_MAX_CIPHER_BLOCK];
+	int		  rc = 0;
+
+	LASSERT(ke);
+	LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK);
+
+	/*
+	 * final token format:
+	 * --------------------------------------------------
+	 * | krb5 header | head/tail cipher text | checksum |
+	 * --------------------------------------------------
+	 */
+
+	/* fill krb5 header */
+	LASSERT(token->len >= sizeof(*khdr));
+	khdr = (struct krb5_header *) token->data;
+	fill_krb5_header(kctx, khdr, 1);
+
+	/* generate confounder */
+	cfs_get_random_bytes(conf, ke->ke_conf_size);
+
+	/* get encryption blocksize. note kc_keye might not associated with
+	 * a tfm, currently only for arcfour-hmac */
+	if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+		LASSERT(kctx->kc_keye.kb_tfm == NULL);
+		blocksize = 1;
+	} else {
+		LASSERT(kctx->kc_keye.kb_tfm);
+		blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm);
+	}
+
+	/*
+	 * we assume the size of krb5_header (16 bytes) must be n * blocksize.
+	 * the bulk token size would be exactly (sizeof(krb5_header) +
+	 * blocksize + sizeof(krb5_header) + hashsize)
+	 */
+	LASSERT(blocksize <= ke->ke_conf_size);
+	LASSERT(sizeof(*khdr) >= blocksize && sizeof(*khdr) % blocksize == 0);
+	/*
+	 * NOTE(review): this assertion hard-codes 16 for the hash size while
+	 * the one before the final memcpy uses ke->ke_hash_size - confirm
+	 * whether the constant is intentional.
+	 */
+	LASSERT(token->len >= sizeof(*khdr) + blocksize + sizeof(*khdr) + 16);
+
+	/*
+	 * clear text layout for checksum:
+	 * ------------------------------------------
+	 * | confounder | clear pages | krb5 header |
+	 * ------------------------------------------
+	 */
+	data_desc[0].data = conf;
+	data_desc[0].len = ke->ke_conf_size;
+
+	/* compute checksum */
+	if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi,
+			       khdr, 1, data_desc,
+			       desc->bd_iov_count, desc->bd_iov,
+			       &cksum))
+		return GSS_S_FAILURE;
+	LASSERT(cksum.len >= ke->ke_hash_size);
+
+	/*
+	 * clear text layout for encryption:
+	 * ------------------------------------------
+	 * | confounder | clear pages | krb5 header |
+	 * ------------------------------------------
+	 *	|	      |	     |
+	 *	----------  (cipher pages)   |
+	 * result token:   |		   |
+	 * -------------------------------------------
+	 * | krb5 header | cipher text | cipher text |
+	 * -------------------------------------------
+	 */
+	/* data_desc is not read again below; conf is passed directly */
+	data_desc[0].data = conf;
+	data_desc[0].len = ke->ke_conf_size;
+
+	/* token space for the encrypted confounder and encrypted header */
+	cipher.data = (__u8 *) (khdr + 1);
+	cipher.len = blocksize + sizeof(*khdr);
+
+	if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+		LBUG();
+		rc = 0;
+	} else {
+		rc = krb5_encrypt_bulk(kctx->kc_keye.kb_tfm, khdr,
+				       conf, desc, &cipher, adj_nob);
+	}
+
+	if (rc != 0) {
+		rawobj_free(&cksum);
+		return GSS_S_FAILURE;
+	}
+
+	/* fill in checksum */
+	LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size);
+	memcpy((char *)(khdr + 1) + cipher.len,
+	       cksum.data + cksum.len - ke->ke_hash_size,
+	       ke->ke_hash_size);
+	rawobj_free(&cksum);
+
+	/* final token length */
+	token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size;
+	return GSS_S_COMPLETE;
+}
+
+/*
+ * Unwrap a privacy token into msg.
+ *
+ * The cipher text is decrypted into a temporary buffer.  The decrypted
+ * copy of the krb5 header is compared with the clear one at the start of
+ * the token, the checksum is recomputed from the krb5 header plus
+ * confounder | gsshdr | clear msgs and compared with the one carried at
+ * the end of the token, and only then is the clear message copied to
+ * msg->data and msg->len set to its length.
+ *
+ * On entry msg->len is the capacity of msg->data.
+ *
+ * Returns GSS_S_COMPLETE on success; GSS_S_DEFECTIVE_TOKEN for a token
+ * that is too short or whose body size is inconsistent; the status of
+ * verify_krb5_header() if the header is rejected; GSS_S_FAILURE for a too
+ * small msg buffer, allocation or decryption failure, or a header or
+ * checksum mismatch.
+ */
+static
+__u32 gss_unwrap_kerberos(struct gss_ctx  *gctx,
+			  rawobj_t	*gsshdr,
+			  rawobj_t	*token,
+			  rawobj_t	*msg)
+{
+	struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+	struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+	struct krb5_header  *khdr;
+	unsigned char       *tmpbuf;
+	int		  blocksize, bodysize;
+	rawobj_t	     cksum = RAWOBJ_EMPTY;
+	rawobj_t	     cipher_in, plain_out;
+	rawobj_t	     hash_objs[3];
+	int		  rc = 0;
+	__u32		major;
+
+	LASSERT(ke);
+
+	if (token->len < sizeof(*khdr)) {
+		CERROR("short signature: %u\n", token->len);
+		return GSS_S_DEFECTIVE_TOKEN;
+	}
+
+	khdr = (struct krb5_header *) token->data;
+
+	major = verify_krb5_header(kctx, khdr, 1);
+	if (major != GSS_S_COMPLETE) {
+		CERROR("bad krb5 header\n");
+		return major;
+	}
+
+	/* block size */
+	if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+		LASSERT(kctx->kc_keye.kb_tfm == NULL);
+		blocksize = 1;
+	} else {
+		LASSERT(kctx->kc_keye.kb_tfm);
+		blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm);
+	}
+
+	/* expected token layout:
+	 * ----------------------------------------
+	 * | krb5 header | cipher text | checksum |
+	 * ----------------------------------------
+	 */
+
+	/*
+	 * make sure header and checksum fit before subtracting their sizes,
+	 * otherwise the unsigned subtraction below would wrap around.
+	 */
+	if (token->len < sizeof(*khdr) + ke->ke_hash_size) {
+		CERROR("short token size: %u\n", token->len);
+		return GSS_S_DEFECTIVE_TOKEN;
+	}
+
+	bodysize = token->len - sizeof(*khdr) - ke->ke_hash_size;
+
+	if (bodysize % blocksize) {
+		CERROR("odd bodysize %d\n", bodysize);
+		return GSS_S_DEFECTIVE_TOKEN;
+	}
+
+	/* the body must hold more than just a confounder and a header */
+	if (bodysize <= ke->ke_conf_size + sizeof(*khdr)) {
+		CERROR("incomplete token: bodysize %d\n", bodysize);
+		return GSS_S_DEFECTIVE_TOKEN;
+	}
+
+	/* report the size that is really required of the msg buffer */
+	if (msg->len < bodysize - ke->ke_conf_size - sizeof(*khdr)) {
+		CERROR("buffer too small: %u, require %d\n",
+		       msg->len,
+		       (int) (bodysize - ke->ke_conf_size - sizeof(*khdr)));
+		return GSS_S_FAILURE;
+	}
+
+	/* decrypting */
+	OBD_ALLOC_LARGE(tmpbuf, bodysize);
+	if (!tmpbuf)
+		return GSS_S_FAILURE;
+
+	/* every failure from here on leaves through out_free */
+	major = GSS_S_FAILURE;
+
+	cipher_in.data = (__u8 *) (khdr + 1);
+	cipher_in.len = bodysize;
+	plain_out.data = tmpbuf;
+	plain_out.len = bodysize;
+
+	if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+		rawobj_t		 arc4_keye;
+		struct ll_crypto_cipher *arc4_tfm;
+
+		/*
+		 * point cksum at the checksum carried in the token; it only
+		 * borrows token memory here and is reset at arc4_out so that
+		 * the final rawobj_free() does not touch the token.
+		 */
+		cksum.data = token->data + token->len - ke->ke_hash_size;
+		cksum.len = ke->ke_hash_size;
+
+		if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi,
+				       NULL, 1, &cksum, 0, NULL, &arc4_keye)) {
+			CERROR("failed to obtain arc4 enc key\n");
+			GOTO(arc4_out, rc = -EACCES);
+		}
+
+		arc4_tfm = ll_crypto_alloc_blkcipher("ecb(arc4)", 0, 0);
+		if (IS_ERR(arc4_tfm)) {
+			CERROR("failed to alloc tfm arc4 in ECB mode\n");
+			GOTO(arc4_out_key, rc = -EACCES);
+		}
+
+		if (ll_crypto_blkcipher_setkey(arc4_tfm,
+					 arc4_keye.data, arc4_keye.len)) {
+			CERROR("failed to set arc4 key, len %d\n",
+			       arc4_keye.len);
+			GOTO(arc4_out_tfm, rc = -EACCES);
+		}
+
+		rc = krb5_encrypt_rawobjs(arc4_tfm, 1,
+					  1, &cipher_in, &plain_out, 0);
+arc4_out_tfm:
+		ll_crypto_free_blkcipher(arc4_tfm);
+arc4_out_key:
+		rawobj_free(&arc4_keye);
+arc4_out:
+		cksum = RAWOBJ_EMPTY;
+	} else {
+		rc = krb5_encrypt_rawobjs(kctx->kc_keye.kb_tfm, 0,
+					  1, &cipher_in, &plain_out, 0);
+	}
+
+	if (rc != 0) {
+		CERROR("error decrypt\n");
+		goto out_free;
+	}
+	LASSERT(plain_out.len == bodysize);
+
+	/* expected clear text layout:
+	 * -----------------------------------------
+	 * | confounder | clear msgs | krb5 header |
+	 * -----------------------------------------
+	 */
+
+	/* verify krb5 header in token is not modified */
+	if (memcmp(khdr, plain_out.data + plain_out.len - sizeof(*khdr),
+		   sizeof(*khdr))) {
+		CERROR("decrypted krb5 header mismatch\n");
+		goto out_free;
+	}
+
+	/* verify checksum, compose clear text as layout:
+	 * ------------------------------------------------------
+	 * | confounder | gss header | clear msgs | krb5 header |
+	 * ------------------------------------------------------
+	 */
+	hash_objs[0].len = ke->ke_conf_size;
+	hash_objs[0].data = plain_out.data;
+	hash_objs[1].len = gsshdr->len;
+	hash_objs[1].data = gsshdr->data;
+	hash_objs[2].len = plain_out.len - ke->ke_conf_size - sizeof(*khdr);
+	hash_objs[2].data = plain_out.data + ke->ke_conf_size;
+	if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi,
+			       khdr, 3, hash_objs, 0, NULL, &cksum))
+		goto out_free;
+
+	LASSERT(cksum.len >= ke->ke_hash_size);
+	if (memcmp((char *)(khdr + 1) + bodysize,
+		   cksum.data + cksum.len - ke->ke_hash_size,
+		   ke->ke_hash_size)) {
+		CERROR("checksum mismatch\n");
+		goto out_free;
+	}
+
+	/* hand the clear message (without confounder and header) back */
+	msg->len =  bodysize - ke->ke_conf_size - sizeof(*khdr);
+	memcpy(msg->data, tmpbuf + ke->ke_conf_size, msg->len);
+
+	major = GSS_S_COMPLETE;
+out_free:
+	OBD_FREE_LARGE(tmpbuf, bodysize);
+	rawobj_free(&cksum);
+	return major;
+}
+
+/*
+ * Verify the privacy token of a bulk transfer and decrypt the bulk pages.
+ *
+ * token format is expected as:
+ * -----------------------------------------------
+ * | krb5 header | head/tail cipher text | cksum |
+ * -----------------------------------------------
+ *
+ * krb5_decrypt_bulk() decrypts the head/tail part in place together with
+ * the pages of desc (adj_nob is handed through).  The checksum is then
+ * recomputed from the krb5 header, the decrypted confounder and the pages
+ * of desc->bd_iov, and compared with the one carried in the token.
+ * ENCTYPE_ARCFOUR_HMAC is not handled here: that branch hits LBUG().
+ *
+ * Returns GSS_S_COMPLETE on success; GSS_S_DEFECTIVE_TOKEN for a short
+ * token or a decryption failure; the status of verify_krb5_header() if
+ * the header is rejected; GSS_S_FAILURE if the checksum can't be made;
+ * GSS_S_BAD_SIG on a checksum mismatch.
+ */
+static
+__u32 gss_unwrap_bulk_kerberos(struct gss_ctx *gctx,
+			       struct ptlrpc_bulk_desc *desc,
+			       rawobj_t *token, int adj_nob)
+{
+	struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+	struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+	struct krb5_header  *hdr;
+	rawobj_t	     digest = RAWOBJ_EMPTY;
+	rawobj_t	     enc, clear;
+	rawobj_t	     conf_obj;
+	int		     bsize;
+	int		     differs;
+	__u32		     status;
+
+	LASSERT(ke);
+
+	if (token->len < sizeof(*hdr)) {
+		CERROR("short signature: %u\n", token->len);
+		return GSS_S_DEFECTIVE_TOKEN;
+	}
+
+	hdr = (struct krb5_header *) token->data;
+
+	status = verify_krb5_header(kctx, hdr, 1);
+	if (status != GSS_S_COMPLETE) {
+		CERROR("bad krb5 header\n");
+		return status;
+	}
+
+	/* cipher block size */
+	if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+		LASSERT(kctx->kc_keye.kb_tfm == NULL);
+		bsize = 1;
+		LBUG();
+	} else {
+		LASSERT(kctx->kc_keye.kb_tfm);
+		bsize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm);
+	}
+	LASSERT(sizeof(*hdr) >= bsize && sizeof(*hdr) % bsize == 0);
+
+	/* header + encrypted confounder + encrypted header + checksum */
+	if (token->len < sizeof(*hdr) + bsize + sizeof(*hdr) +
+			 ke->ke_hash_size) {
+		CERROR("short token size: %u\n", token->len);
+		return GSS_S_DEFECTIVE_TOKEN;
+	}
+
+	/* head/tail cipher text is decrypted in place */
+	enc.data = (__u8 *) (hdr + 1);
+	enc.len = bsize + sizeof(*hdr);
+	clear = enc;
+
+	if (krb5_decrypt_bulk(kctx->kc_keye.kb_tfm, hdr,
+			      desc, &enc, &clear, adj_nob) != 0)
+		return GSS_S_DEFECTIVE_TOKEN;
+
+	/*
+	 * verify checksum, compose clear text as layout:
+	 * ------------------------------------------
+	 * | confounder | clear pages | krb5 header |
+	 * ------------------------------------------
+	 */
+	conf_obj.data = clear.data;
+	conf_obj.len = bsize;
+
+	if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi,
+			       hdr, 1, &conf_obj,
+			       desc->bd_iov_count, desc->bd_iov,
+			       &digest) != 0)
+		return GSS_S_FAILURE;
+	LASSERT(digest.len >= ke->ke_hash_size);
+
+	/* the transmitted checksum follows the head/tail part */
+	differs = memcmp(clear.data + bsize + sizeof(*hdr),
+			 digest.data + digest.len - ke->ke_hash_size,
+			 ke->ke_hash_size);
+	if (differs)
+		CERROR("checksum mismatch\n");
+
+	rawobj_free(&digest);
+	return differs ? GSS_S_BAD_SIG : GSS_S_COMPLETE;
+}
+
+/*
+ * Format a short description of the context, "krb5 (<enctype name>)",
+ * into buf (at most bufsize bytes).  Returns whatever snprintf() returns.
+ */
+int gss_display_kerberos(struct gss_ctx	*ctx,
+			 char		  *buf,
+			 int		    bufsize)
+{
+	struct krb5_ctx *kctx = ctx->internal_ctx_id;
+
+	return snprintf(buf, bufsize, "krb5 (%s)",
+			enctype2str(kctx->kc_enctype));
+}
+
+/*
+ * krb5 implementation of the gss_api_ops callbacks; referenced from
+ * gss_kerberos_mech.gm_ops.
+ */
+static struct gss_api_ops gss_kerberos_ops = {
+	.gss_import_sec_context     = gss_import_sec_context_kerberos,
+	.gss_copy_reverse_context   = gss_copy_reverse_context_kerberos,
+	.gss_inquire_context	= gss_inquire_context_kerberos,
+	.gss_get_mic		= gss_get_mic_kerberos,
+	.gss_verify_mic	     = gss_verify_mic_kerberos,
+	.gss_wrap		   = gss_wrap_kerberos,
+	.gss_unwrap		 = gss_unwrap_kerberos,
+	.gss_prep_bulk	      = gss_prep_bulk_kerberos,
+	.gss_wrap_bulk	      = gss_wrap_bulk_kerberos,
+	.gss_unwrap_bulk	    = gss_unwrap_bulk_kerberos,
+	.gss_delete_sec_context     = gss_delete_sec_context_kerberos,
+	.gss_display		= gss_display_kerberos,
+};
+
+/*
+ * Sub-flavors offered by the krb5 mechanism, one per sptlrpc service
+ * level: krb5n (SVC_NULL), krb5a (SVC_AUTH), krb5i (SVC_INTG) and
+ * krb5p (SVC_PRIV).  Referenced from gss_kerberos_mech.gm_sfs.
+ */
+static struct subflavor_desc gss_kerberos_sfs[] = {
+	{
+		.sf_subflavor   = SPTLRPC_SUBFLVR_KRB5N,
+		.sf_qop	 = 0,
+		.sf_service     = SPTLRPC_SVC_NULL,
+		.sf_name	= "krb5n"
+	},
+	{
+		.sf_subflavor   = SPTLRPC_SUBFLVR_KRB5A,
+		.sf_qop	 = 0,
+		.sf_service     = SPTLRPC_SVC_AUTH,
+		.sf_name	= "krb5a"
+	},
+	{
+		.sf_subflavor   = SPTLRPC_SUBFLVR_KRB5I,
+		.sf_qop	 = 0,
+		.sf_service     = SPTLRPC_SVC_INTG,
+		.sf_name	= "krb5i"
+	},
+	{
+		.sf_subflavor   = SPTLRPC_SUBFLVR_KRB5P,
+		.sf_qop	 = 0,
+		.sf_service     = SPTLRPC_SVC_PRIV,
+		.sf_name	= "krb5p"
+	},
+};
+
+/*
+ * Descriptor of the krb5 mechanism handed to lgss_mech_register().
+ *
+ * gm_oid holds the 9 DER-encoded bytes of OID 1.2.840.113554.1.2.2;
+ * gm_sf_num is the number of entries in gss_kerberos_sfs and has to be
+ * kept in sync with that array by hand.
+ *
+ * currently we leave module owner NULL
+ */
+static struct gss_api_mech gss_kerberos_mech = {
+	.gm_owner       = NULL, /*THIS_MODULE, */
+	.gm_name	= "krb5",
+	.gm_oid	 = (rawobj_t)
+				{9, "\052\206\110\206\367\022\001\002\002"},
+	.gm_ops	 = &gss_kerberos_ops,
+	.gm_sf_num      = 4,
+	.gm_sfs	 = gss_kerberos_sfs,
+};
+
+/*
+ * Module init: set up krb5_seq_lock and register the krb5 mechanism.
+ * Returns the result of lgss_mech_register().
+ */
+int __init init_kerberos_module(void)
+{
+	int rc;
+
+	spin_lock_init(&krb5_seq_lock);
+
+	rc = lgss_mech_register(&gss_kerberos_mech);
+	if (rc != 0)
+		CERROR("Failed to register kerberos gss mechanism!\n");
+
+	return rc;
+}
+
+/* Module exit: unregister the krb5 mechanism registered at init time. */
+void __exit cleanup_kerberos_module(void)
+{
+	lgss_mech_unregister(&gss_kerberos_mech);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_mech_switch.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_mech_switch.c
new file mode 100644
index 000000000000..8cdad800382d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_mech_switch.c
@@ -0,0 +1,359 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  linux/net/sunrpc/gss_mech_switch.c
+ *
+ *  Copyright (c) 2001 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  J. Bruce Fields   <bfields@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+static LIST_HEAD(registered_mechs);
+static DEFINE_SPINLOCK(registered_mechs_lock);
+
+int lgss_mech_register(struct gss_api_mech *gm)
+{
+	spin_lock(&registered_mechs_lock);
+	list_add(&gm->gm_list, &registered_mechs);
+	spin_unlock(&registered_mechs_lock);
+	CWARN("Register %s mechanism\n", gm->gm_name);
+	return 0;
+}
+
+void lgss_mech_unregister(struct gss_api_mech *gm)
+{
+	spin_lock(&registered_mechs_lock);
+	list_del(&gm->gm_list);
+	spin_unlock(&registered_mechs_lock);
+	CWARN("Unregister %s mechanism\n", gm->gm_name);
+}
+
+
+struct gss_api_mech *lgss_mech_get(struct gss_api_mech *gm)
+{
+	__module_get(gm->gm_owner);
+	return gm;
+}
+
+/* Look up a registered mechanism by name, taking a module reference. */
+struct gss_api_mech *lgss_name_to_mech(char *name)
+{
+	struct gss_api_mech *mech, *found = NULL;
+
+	spin_lock(&registered_mechs_lock);
+	list_for_each_entry(mech, &registered_mechs, gm_list) {
+		if (strcmp(name, mech->gm_name) != 0)
+			continue;
+		if (try_module_get(mech->gm_owner)) {
+			found = mech;
+			break;
+		}
+	}
+	spin_unlock(&registered_mechs_lock);
+	return found;
+}
+
+/* Return 1 if @subflavor appears in @gm's subflavor table, 0 otherwise. */
+static inline
+int mech_supports_subflavor(struct gss_api_mech *gm, __u32 subflavor)
+{
+	int idx = 0;
+
+	while (idx < gm->gm_sf_num &&
+	       gm->gm_sfs[idx].sf_subflavor != subflavor)
+		idx++;
+	return idx < gm->gm_sf_num;
+}
+
+struct gss_api_mech *lgss_subflavor_to_mech(__u32 subflavor)
+{
+	struct gss_api_mech *pos, *gm = NULL;
+
+	spin_lock(&registered_mechs_lock);
+	list_for_each_entry(pos, &registered_mechs, gm_list) {
+		if (!try_module_get(pos->gm_owner))
+			continue;
+		if (!mech_supports_subflavor(pos, subflavor)) {
+			module_put(pos->gm_owner);
+			continue;
+		}
+		gm = pos;
+		break;
+	}
+	spin_unlock(&registered_mechs_lock);
+	return gm;
+}
+
+void lgss_mech_put(struct gss_api_mech *gm)
+{
+	module_put(gm->gm_owner);
+}
+
+/* Allocate *ctx_id, bind it to @mech (taking a reference) and let the mech
+ * import @input_token; the mech is passed in, not derived from the token. */
+__u32 lgss_import_sec_context(rawobj_t *input_token,
+			      struct gss_api_mech *mech,
+			      struct gss_ctx **ctx_id)
+{
+	/* validate before lgss_mech_get() dereferences mech */
+	LASSERT(mech);
+	LASSERT(mech->gm_ops);
+	LASSERT(mech->gm_ops->gss_import_sec_context);
+
+	OBD_ALLOC_PTR(*ctx_id);
+	if (*ctx_id == NULL)
+		return GSS_S_FAILURE;
+	(*ctx_id)->mech_type = lgss_mech_get(mech);
+	return mech->gm_ops->gss_import_sec_context(input_token, *ctx_id);
+}
+
+/* Create *ctx_id_new from @ctx_id via the mech's copy-reverse op; on failure
+ * the copy is freed, *ctx_id_new is set to NULL and the status returned. */
+__u32 lgss_copy_reverse_context(struct gss_ctx *ctx_id,
+				struct gss_ctx **ctx_id_new)
+{
+	struct gss_api_mech *mech = ctx_id->mech_type;
+	struct gss_ctx *copy;
+	__u32 status;
+
+	LASSERT(mech);
+	OBD_ALLOC_PTR(copy);
+	*ctx_id_new = copy;
+	if (copy == NULL)
+		return GSS_S_FAILURE;
+	copy->mech_type = lgss_mech_get(mech);
+	LASSERT(mech->gm_ops);
+	LASSERT(mech->gm_ops->gss_copy_reverse_context);
+
+	status = mech->gm_ops->gss_copy_reverse_context(ctx_id, copy);
+	if (status == GSS_S_COMPLETE)
+		return status;
+	lgss_mech_put(mech);
+	OBD_FREE_PTR(copy);
+	*ctx_id_new = NULL;
+	return status;
+}
+
+/*
+ * this interface is much simplified, currently we only need endtime.
+ */
+__u32 lgss_inquire_context(struct gss_ctx *context_handle,
+			   unsigned long  *endtime)
+{
+	LASSERT(context_handle);
+	LASSERT(context_handle->mech_type);
+	LASSERT(context_handle->mech_type->gm_ops);
+	LASSERT(context_handle->mech_type->gm_ops->gss_inquire_context);
+
+	return context_handle->mech_type->gm_ops
+		->gss_inquire_context(context_handle,
+				      endtime);
+}
+
+/* gss_get_mic: compute a mic over message and return mic_token. */
+__u32 lgss_get_mic(struct gss_ctx *context_handle,
+		   int msgcnt,
+		   rawobj_t *msg,
+		   int iovcnt,
+		   lnet_kiov_t *iovs,
+		   rawobj_t *mic_token)
+{
+	LASSERT(context_handle);
+	LASSERT(context_handle->mech_type);
+	LASSERT(context_handle->mech_type->gm_ops);
+	LASSERT(context_handle->mech_type->gm_ops->gss_get_mic);
+
+	return context_handle->mech_type->gm_ops
+		->gss_get_mic(context_handle,
+			      msgcnt,
+			      msg,
+			      iovcnt,
+			      iovs,
+			      mic_token);
+}
+
+/* gss_verify_mic: check whether the provided mic_token verifies message. */
+__u32 lgss_verify_mic(struct gss_ctx *context_handle,
+		      int msgcnt,
+		      rawobj_t *msg,
+		      int iovcnt,
+		      lnet_kiov_t *iovs,
+		      rawobj_t *mic_token)
+{
+	LASSERT(context_handle);
+	LASSERT(context_handle->mech_type);
+	LASSERT(context_handle->mech_type->gm_ops);
+	LASSERT(context_handle->mech_type->gm_ops->gss_verify_mic);
+
+	return context_handle->mech_type->gm_ops
+		->gss_verify_mic(context_handle,
+				 msgcnt,
+				 msg,
+				 iovcnt,
+				 iovs,
+				 mic_token);
+}
+
+__u32 lgss_wrap(struct gss_ctx *context_handle,
+		rawobj_t *gsshdr,
+		rawobj_t *msg,
+		int msg_buflen,
+		rawobj_t *out_token)
+{
+	LASSERT(context_handle);
+	LASSERT(context_handle->mech_type);
+	LASSERT(context_handle->mech_type->gm_ops);
+	LASSERT(context_handle->mech_type->gm_ops->gss_wrap);
+
+	return context_handle->mech_type->gm_ops
+		->gss_wrap(context_handle, gsshdr, msg, msg_buflen, out_token);
+}
+
+__u32 lgss_unwrap(struct gss_ctx *context_handle,
+		  rawobj_t *gsshdr,
+		  rawobj_t *token,
+		  rawobj_t *out_msg)
+{
+	LASSERT(context_handle);
+	LASSERT(context_handle->mech_type);
+	LASSERT(context_handle->mech_type->gm_ops);
+	LASSERT(context_handle->mech_type->gm_ops->gss_unwrap);
+
+	return context_handle->mech_type->gm_ops
+		->gss_unwrap(context_handle, gsshdr, token, out_msg);
+}
+
+
+__u32 lgss_prep_bulk(struct gss_ctx *context_handle,
+		     struct ptlrpc_bulk_desc *desc)
+{
+	LASSERT(context_handle);
+	LASSERT(context_handle->mech_type);
+	LASSERT(context_handle->mech_type->gm_ops);
+	LASSERT(context_handle->mech_type->gm_ops->gss_prep_bulk);
+
+	return context_handle->mech_type->gm_ops
+		->gss_prep_bulk(context_handle, desc);
+}
+
+__u32 lgss_wrap_bulk(struct gss_ctx *context_handle,
+		     struct ptlrpc_bulk_desc *desc,
+		     rawobj_t *token,
+		     int adj_nob)
+{
+	LASSERT(context_handle);
+	LASSERT(context_handle->mech_type);
+	LASSERT(context_handle->mech_type->gm_ops);
+	LASSERT(context_handle->mech_type->gm_ops->gss_wrap_bulk);
+
+	return context_handle->mech_type->gm_ops
+		->gss_wrap_bulk(context_handle, desc, token, adj_nob);
+}
+
+__u32 lgss_unwrap_bulk(struct gss_ctx *context_handle,
+		       struct ptlrpc_bulk_desc *desc,
+		       rawobj_t *token,
+		       int adj_nob)
+{
+	LASSERT(context_handle);
+	LASSERT(context_handle->mech_type);
+	LASSERT(context_handle->mech_type->gm_ops);
+	LASSERT(context_handle->mech_type->gm_ops->gss_unwrap_bulk);
+
+	return context_handle->mech_type->gm_ops
+		->gss_unwrap_bulk(context_handle, desc, token, adj_nob);
+}
+
+/* lgss_delete_sec_context: release everything owned by *context_handle and
+ * reset the handle to NULL. Unlike the RFC 2744 prototype no output token is
+ * returned, since it would never be used. Returns GSS_S_NO_CONTEXT when the
+ * handle is already NULL, GSS_S_COMPLETE otherwise. */
+__u32 lgss_delete_sec_context(struct gss_ctx **context_handle)
+{
+	struct gss_ctx *ctx = *context_handle;
+	struct gss_api_mech *mech;
+
+	CDEBUG(D_SEC, "deleting %p\n", ctx);
+	if (ctx == NULL)
+		return GSS_S_NO_CONTEXT;
+
+	mech = ctx->mech_type;
+	/* call the mech's destructor only when mech-private state is set */
+	if (ctx->internal_ctx_id) {
+		LASSERT(mech);
+		LASSERT(mech->gm_ops);
+		LASSERT(mech->gm_ops->gss_delete_sec_context);
+		mech->gm_ops->gss_delete_sec_context(ctx->internal_ctx_id);
+	}
+	if (mech != NULL)
+		lgss_mech_put(mech);
+
+	OBD_FREE_PTR(ctx);
+	*context_handle = NULL;
+	return GSS_S_COMPLETE;
+}
+
+int lgss_display(struct gss_ctx *ctx,
+		 char	   *buf,
+		 int	     bufsize)
+{
+	LASSERT(ctx);
+	LASSERT(ctx->mech_type);
+	LASSERT(ctx->mech_type->gm_ops);
+	LASSERT(ctx->mech_type->gm_ops->gss_display);
+
+	return ctx->mech_type->gm_ops->gss_display(ctx, buf, bufsize);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_pipefs.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_pipefs.c
new file mode 100644
index 000000000000..3df7257b7fa0
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_pipefs.c
@@ -0,0 +1,1252 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * linux/net/sunrpc/auth_gss.c
+ *
+ * RPCSEC_GSS client authentication.
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Dug Song       <dugsong@monkey.org>
+ *  Andy Adamson   <andros@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/crypto.h>
+#include <asm/atomic.h>
+struct rpc_clnt; /* for rpc_pipefs */
+#include <linux/sunrpc/rpc_pipe_fs.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_sec.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+static struct ptlrpc_sec_policy gss_policy_pipefs;
+static struct ptlrpc_ctx_ops gss_pipefs_ctxops;
+
+static int gss_cli_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx);
+
+static int gss_sec_pipe_upcall_init(struct gss_sec *gsec)
+{
+	return 0;
+}
+
+static void gss_sec_pipe_upcall_fini(struct gss_sec *gsec)
+{
+}
+
+/****************************************
+ * internal context helpers	     *
+ ****************************************/
+
+static
+struct ptlrpc_cli_ctx *ctx_create_pf(struct ptlrpc_sec *sec,
+				     struct vfs_cred *vcred)
+{
+	struct gss_cli_ctx *gctx;
+	int		 rc;
+
+	OBD_ALLOC_PTR(gctx);
+	if (gctx == NULL)
+		return NULL;
+
+	rc = gss_cli_ctx_init_common(sec, &gctx->gc_base,
+				     &gss_pipefs_ctxops, vcred);
+	if (rc) {
+		OBD_FREE_PTR(gctx);
+		return NULL;
+	}
+
+	return &gctx->gc_base;
+}
+
+static
+void ctx_destroy_pf(struct ptlrpc_sec *sec, struct ptlrpc_cli_ctx *ctx)
+{
+	struct gss_cli_ctx *gctx = ctx2gctx(ctx);
+
+	if (gss_cli_ctx_fini_common(sec, ctx))
+		return;
+
+	OBD_FREE_PTR(gctx);
+
+	atomic_dec(&sec->ps_nctx);
+	sptlrpc_sec_put(sec);
+}
+
+static
+void ctx_enhash_pf(struct ptlrpc_cli_ctx *ctx, struct hlist_head *hash)
+{
+	set_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags);
+	atomic_inc(&ctx->cc_refcount);
+	hlist_add_head(&ctx->cc_cache, hash);
+}
+
+/*
+ * Unhash @ctx, moving it to @freelist on last ref; caller must hold ps_lock.
+ */
+static
+void ctx_unhash_pf(struct ptlrpc_cli_ctx *ctx, struct hlist_head *freelist)
+{
+	/* spin_is_locked() is constant 0 on non-debug UP; don't LASSERT on it */
+	assert_spin_locked(&ctx->cc_sec->ps_lock);
+	LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+	LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags));
+	LASSERT(!hlist_unhashed(&ctx->cc_cache));
+
+	clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags);
+	if (atomic_dec_and_test(&ctx->cc_refcount)) {
+		__hlist_del(&ctx->cc_cache);
+		hlist_add_head(&ctx->cc_cache, freelist);
+	} else {
+		hlist_del_init(&ctx->cc_cache);
+	}
+}
+
+/*
+ * return 1 if the context is dead.
+ */
+static
+int ctx_check_death_pf(struct ptlrpc_cli_ctx *ctx,
+		       struct hlist_head *freelist)
+{
+	if (cli_ctx_check_death(ctx)) {
+		if (freelist)
+			ctx_unhash_pf(ctx, freelist);
+		return 1;
+	}
+
+	return 0;
+}
+
+static inline
+int ctx_check_death_locked_pf(struct ptlrpc_cli_ctx *ctx,
+			      struct hlist_head *freelist)
+{
+	LASSERT(ctx->cc_sec);
+	LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+	LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags));
+
+	return ctx_check_death_pf(ctx, freelist);
+}
+
+static inline
+int ctx_match_pf(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred)
+{
+	/* a little bit optimization for null policy */
+	if (!ctx->cc_ops->match)
+		return 1;
+
+	return ctx->cc_ops->match(ctx, vcred);
+}
+
+static
+void ctx_list_destroy_pf(struct hlist_head *head)
+{
+	struct ptlrpc_cli_ctx *ctx;
+
+	while (!hlist_empty(head)) {
+		ctx = hlist_entry(head->first, struct ptlrpc_cli_ctx,
+				      cc_cache);
+
+		LASSERT(atomic_read(&ctx->cc_refcount) == 0);
+		LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT,
+				     &ctx->cc_flags) == 0);
+
+		hlist_del_init(&ctx->cc_cache);
+		ctx_destroy_pf(ctx->cc_sec, ctx);
+	}
+}
+
+/****************************************
+ * context apis			 *
+ ****************************************/
+
+static
+int gss_cli_ctx_validate_pf(struct ptlrpc_cli_ctx *ctx)
+{
+	if (ctx_check_death_pf(ctx, NULL))
+		return 1;
+	if (cli_ctx_is_ready(ctx))
+		return 0;
+	return 1;
+}
+
+static
+void gss_cli_ctx_die_pf(struct ptlrpc_cli_ctx *ctx, int grace)
+{
+	LASSERT(ctx->cc_sec);
+	LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+	cli_ctx_expire(ctx);
+
+	spin_lock(&ctx->cc_sec->ps_lock);
+
+	if (test_and_clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags)) {
+		LASSERT(!hlist_unhashed(&ctx->cc_cache));
+		LASSERT(atomic_read(&ctx->cc_refcount) > 1);
+
+		hlist_del_init(&ctx->cc_cache);
+		if (atomic_dec_and_test(&ctx->cc_refcount))
+			LBUG();
+	}
+
+	spin_unlock(&ctx->cc_sec->ps_lock);
+}
+
+/****************************************
+ * reverse context installation	 *
+ ****************************************/
+
+static inline
+unsigned int ctx_hash_index(int hashsize, __u64 key)
+{
+	return (unsigned int) (key & ((__u64) hashsize - 1));
+}
+
+static
+void gss_sec_ctx_replace_pf(struct gss_sec *gsec,
+			    struct ptlrpc_cli_ctx *new)
+{
+	struct gss_sec_pipefs *gsec_pf;
+	struct ptlrpc_cli_ctx *ctx;
+	struct hlist_node     *next;
+	HLIST_HEAD(freelist);
+	unsigned int hash;
+	ENTRY;
+
+	gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base);
+
+	hash = ctx_hash_index(gsec_pf->gsp_chash_size,
+			      (__u64) new->cc_vcred.vc_uid);
+	LASSERT(hash < gsec_pf->gsp_chash_size);
+
+	spin_lock(&gsec->gs_base.ps_lock);
+
+	hlist_for_each_entry_safe(ctx, next,
+				      &gsec_pf->gsp_chash[hash], cc_cache) {
+		if (!ctx_match_pf(ctx, &new->cc_vcred))
+			continue;
+
+		cli_ctx_expire(ctx);
+		ctx_unhash_pf(ctx, &freelist);
+		break;
+	}
+
+	ctx_enhash_pf(new, &gsec_pf->gsp_chash[hash]);
+
+	spin_unlock(&gsec->gs_base.ps_lock);
+
+	ctx_list_destroy_pf(&freelist);
+	EXIT;
+}
+
+static
+int gss_install_rvs_cli_ctx_pf(struct gss_sec *gsec,
+			       struct ptlrpc_svc_ctx *svc_ctx)
+{
+	struct vfs_cred	  vcred;
+	struct ptlrpc_cli_ctx   *cli_ctx;
+	int		      rc;
+	ENTRY;
+
+	vcred.vc_uid = 0;
+	vcred.vc_gid = 0;
+
+	cli_ctx = ctx_create_pf(&gsec->gs_base, &vcred);
+	if (!cli_ctx)
+		RETURN(-ENOMEM);
+
+	rc = gss_copy_rvc_cli_ctx(cli_ctx, svc_ctx);
+	if (rc) {
+		ctx_destroy_pf(cli_ctx->cc_sec, cli_ctx);
+		RETURN(rc);
+	}
+
+	gss_sec_ctx_replace_pf(gsec, cli_ctx);
+	RETURN(0);
+}
+
+static
+void gss_ctx_cache_gc_pf(struct gss_sec_pipefs *gsec_pf,
+			 struct hlist_head *freelist)
+{
+	struct ptlrpc_sec       *sec;
+	struct ptlrpc_cli_ctx   *ctx;
+	struct hlist_node       *next;
+	int i;
+	ENTRY;
+
+	sec = &gsec_pf->gsp_base.gs_base;
+
+	CDEBUG(D_SEC, "do gc on sec %s@%p\n", sec->ps_policy->sp_name, sec);
+
+	for (i = 0; i < gsec_pf->gsp_chash_size; i++) {
+		hlist_for_each_entry_safe(ctx, next,
+					      &gsec_pf->gsp_chash[i], cc_cache)
+			ctx_check_death_locked_pf(ctx, freelist);
+	}
+
+	sec->ps_gc_next = cfs_time_current_sec() + sec->ps_gc_interval;
+	EXIT;
+}
+
+static
+struct ptlrpc_sec* gss_sec_create_pf(struct obd_import *imp,
+				     struct ptlrpc_svc_ctx *ctx,
+				     struct sptlrpc_flavor *sf)
+{
+	struct gss_sec_pipefs   *gsec_pf;
+	int		      alloc_size, hash_size, i;
+	ENTRY;
+
+#define GSS_SEC_PIPEFS_CTX_HASH_SIZE    (32)
+
+	if (ctx ||
+	    sf->sf_flags & (PTLRPC_SEC_FL_ROOTONLY | PTLRPC_SEC_FL_REVERSE))
+		hash_size = 1;
+	else
+		hash_size = GSS_SEC_PIPEFS_CTX_HASH_SIZE;
+
+	alloc_size = sizeof(*gsec_pf) +
+		     sizeof(struct hlist_head) * hash_size;
+
+	OBD_ALLOC(gsec_pf, alloc_size);
+	if (!gsec_pf)
+		RETURN(NULL);
+
+	gsec_pf->gsp_chash_size = hash_size;
+	for (i = 0; i < hash_size; i++)
+		INIT_HLIST_HEAD(&gsec_pf->gsp_chash[i]);
+
+	if (gss_sec_create_common(&gsec_pf->gsp_base, &gss_policy_pipefs,
+				  imp, ctx, sf))
+		goto err_free;
+
+	if (ctx == NULL) {
+		if (gss_sec_pipe_upcall_init(&gsec_pf->gsp_base))
+			goto err_destroy;
+	} else {
+		if (gss_install_rvs_cli_ctx_pf(&gsec_pf->gsp_base, ctx))
+			goto err_destroy;
+	}
+
+	RETURN(&gsec_pf->gsp_base.gs_base);
+
+err_destroy:
+	gss_sec_destroy_common(&gsec_pf->gsp_base);
+err_free:
+	OBD_FREE(gsec_pf, alloc_size);
+	RETURN(NULL);
+}
+
+/* Destroy a pipefs sec and free the allocation made by gss_sec_create_pf(). */
+static
+void gss_sec_destroy_pf(struct ptlrpc_sec *sec)
+{
+	struct gss_sec_pipefs   *gsec_pf;
+	struct gss_sec	  *gsec;
+	int		      alloc_size;
+
+	CWARN("destroy %s@%p\n", sec->ps_policy->sp_name, sec);
+	gsec = container_of(sec, struct gss_sec, gs_base);
+	gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base);
+	LASSERT(gsec_pf->gsp_chash);
+	LASSERT(gsec_pf->gsp_chash_size);
+	alloc_size = sizeof(*gsec_pf) +
+		     sizeof(struct hlist_head) * gsec_pf->gsp_chash_size;
+
+	gss_sec_pipe_upcall_fini(gsec);
+	gss_sec_destroy_common(gsec);
+	/* free what was allocated: the container itself, not its gsp_base */
+	OBD_FREE(gsec_pf, alloc_size);
+}
+
+/* Return (with a reference held) the cached context matching @vcred, creating
+ * one when @create is set and none matches; with @remove_dead, dead contexts
+ * met on the way are unhashed and destroyed. Returns NULL if none. */
+static
+struct ptlrpc_cli_ctx * gss_sec_lookup_ctx_pf(struct ptlrpc_sec *sec,
+					      struct vfs_cred *vcred,
+					      int create, int remove_dead)
+{
+	struct gss_sec	 *gsec;
+	struct gss_sec_pipefs  *gsec_pf;
+	struct ptlrpc_cli_ctx  *ctx = NULL, *new = NULL;
+	struct hlist_head       *hash_head;
+	struct hlist_node       *next;
+	HLIST_HEAD(freelist);
+	unsigned int	    hash, gc = 0, found = 0;
+	ENTRY;
+
+	might_sleep();
+	gsec = container_of(sec, struct gss_sec, gs_base);
+	gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base);
+	hash = ctx_hash_index(gsec_pf->gsp_chash_size,
+			      (__u64) vcred->vc_uid);
+	LASSERT(hash < gsec_pf->gsp_chash_size);
+	hash_head = &gsec_pf->gsp_chash[hash];
+
+retry:
+	spin_lock(&sec->ps_lock);
+	/* gc_next == 0 means never do gc */
+	if (remove_dead && sec->ps_gc_next &&
+	    cfs_time_after(cfs_time_current_sec(), sec->ps_gc_next)) {
+		gss_ctx_cache_gc_pf(gsec_pf, &freelist);
+		gc = 1;
+	}
+
+	hlist_for_each_entry_safe(ctx, next, hash_head, cc_cache) {
+		if (gc == 0 &&
+		    ctx_check_death_locked_pf(ctx,
+					      remove_dead ? &freelist : NULL))
+			continue;
+		if (ctx_match_pf(ctx, vcred)) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (found) {
+		if (new && new != ctx) {
+			/* lost the race, just free it */
+			hlist_add_head(&new->cc_cache, &freelist);
+			new = NULL;
+		}
+		/* hot node, move to head */
+		if (hash_head->first != &ctx->cc_cache) {
+			__hlist_del(&ctx->cc_cache);
+			hlist_add_head(&ctx->cc_cache, hash_head);
+		}
+	} else {
+		/* don't allocate for reverse sec */
+		if (sec_is_reverse(sec)) {
+			spin_unlock(&sec->ps_lock);
+			ctx = NULL;
+			goto out;
+		}
+		if (new) {
+			ctx_enhash_pf(new, hash_head);
+			ctx = new;
+		} else if (create) {
+			spin_unlock(&sec->ps_lock);
+			new = ctx_create_pf(sec, vcred);
+			if (new) {
+				clear_bit(PTLRPC_CTX_NEW_BIT, &new->cc_flags);
+				goto retry;
+			}
+			/* allocation failed; ps_lock is already dropped */
+			ctx = NULL;
+			goto out;
+		} else {
+			ctx = NULL;
+		}
+	}
+
+	/* hold a ref */
+	if (ctx)
+		atomic_inc(&ctx->cc_refcount);
+	spin_unlock(&sec->ps_lock);
+	/* the allocator of the context must give the first push to refresh */
+	if (new) {
+		LASSERT(new == ctx);
+		gss_cli_ctx_refresh_pf(new);
+	}
+out:
+	/* destroy whatever was unhashed above, on every exit path */
+	ctx_list_destroy_pf(&freelist);
+	RETURN(ctx);
+}
+
+static
+void gss_sec_release_ctx_pf(struct ptlrpc_sec *sec,
+			    struct ptlrpc_cli_ctx *ctx,
+			    int sync)
+{
+	LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0);
+	LASSERT(hlist_unhashed(&ctx->cc_cache));
+
+	/* if required async, we must clear the UPTODATE bit to prevent extra
+	 * rpcs during destroy procedure. */
+	if (!sync)
+		clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags);
+
+	/* destroy this context */
+	ctx_destroy_pf(sec, ctx);
+}
+
+/*
+ * @uid: which user. "-1" means flush all.
+ * @grace: mark context DEAD, allow graceful destroy like notify
+ *	 server side, etc.
+ * @force: also flush busy entries.
+ *
+ * return the number of busy context encountered.
+ *
+ * In any cases, never touch "eternal" contexts.
+ */
+static
+int gss_sec_flush_ctx_cache_pf(struct ptlrpc_sec *sec,
+			       uid_t uid,
+			       int grace, int force)
+{
+	struct gss_sec	  *gsec;
+	struct gss_sec_pipefs   *gsec_pf;
+	struct ptlrpc_cli_ctx   *ctx;
+	struct hlist_node       *next;
+	HLIST_HEAD(freelist);
+	int i, busy = 0;
+	ENTRY;
+
+	might_sleep_if(grace);
+
+	gsec = container_of(sec, struct gss_sec, gs_base);
+	gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base);
+
+	spin_lock(&sec->ps_lock);
+	for (i = 0; i < gsec_pf->gsp_chash_size; i++) {
+		hlist_for_each_entry_safe(ctx, next,
+					      &gsec_pf->gsp_chash[i],
+					      cc_cache) {
+			LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+			if (uid != -1 && uid != ctx->cc_vcred.vc_uid)
+				continue;
+
+			if (atomic_read(&ctx->cc_refcount) > 1) {
+				busy++;
+				if (!force)
+					continue;
+
+				CWARN("flush busy(%d) ctx %p(%u->%s) by force, "
+				      "grace %d\n",
+				      atomic_read(&ctx->cc_refcount),
+				      ctx, ctx->cc_vcred.vc_uid,
+				      sec2target_str(ctx->cc_sec), grace);
+			}
+			ctx_unhash_pf(ctx, &freelist);
+
+			set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags);
+			if (!grace)
+				clear_bit(PTLRPC_CTX_UPTODATE_BIT,
+					  &ctx->cc_flags);
+		}
+	}
+	spin_unlock(&sec->ps_lock);
+
+	ctx_list_destroy_pf(&freelist);
+	RETURN(busy);
+}
+
+/****************************************
+ * service apis			 *
+ ****************************************/
+
+static
+int gss_svc_accept_pf(struct ptlrpc_request *req)
+{
+	return gss_svc_accept(&gss_policy_pipefs, req);
+}
+
+static
+int gss_svc_install_rctx_pf(struct obd_import *imp,
+			    struct ptlrpc_svc_ctx *ctx)
+{
+	struct ptlrpc_sec *sec;
+	int		rc;
+
+	sec = sptlrpc_import_sec_ref(imp);
+	LASSERT(sec);
+	rc = gss_install_rvs_cli_ctx_pf(sec2gsec(sec), ctx);
+
+	sptlrpc_sec_put(sec);
+	return rc;
+}
+
+/****************************************
+ * rpc_pipefs definitions	       *
+ ****************************************/
+
+#define LUSTRE_PIPE_ROOT	"/lustre"
+#define LUSTRE_PIPE_KRB5	LUSTRE_PIPE_ROOT"/krb5"
+
+struct gss_upcall_msg_data {
+	__u32			   gum_seq;
+	__u32			   gum_uid;
+	__u32			   gum_gid;
+	__u32			   gum_svc;	/* MDS/OSS... */
+	__u64			   gum_nid;	/* peer NID */
+	__u8			    gum_obd[64];    /* client obd name */
+};
+
+struct gss_upcall_msg {
+	struct rpc_pipe_msg	     gum_base;
+	atomic_t		    gum_refcount;
+	struct list_head		      gum_list;
+	__u32			   gum_mechidx;
+	struct gss_sec		 *gum_gsec;
+	struct gss_cli_ctx	     *gum_gctx;
+	struct gss_upcall_msg_data      gum_data;
+};
+
+static atomic_t upcall_seq = ATOMIC_INIT(0);
+
+static inline
+__u32 upcall_get_sequence(void)
+{
+	return (__u32) atomic_inc_return(&upcall_seq);
+}
+
+enum mech_idx_t {
+	MECH_KRB5   = 0,
+	MECH_MAX
+};
+
+static inline
+__u32 mech_name2idx(const char *name)
+{
+	LASSERT(!strcmp(name, "krb5"));
+	return MECH_KRB5;
+}
+
+/* pipefs dentries for each mechanisms */
+static struct dentry *de_pipes[MECH_MAX] = { NULL, };
+/* all upcall messages are linked here */
+static struct list_head upcall_lists[MECH_MAX];
+/* and protected by this */
+static spinlock_t upcall_locks[MECH_MAX];
+
+static inline
+void upcall_list_lock(int idx)
+{
+	spin_lock(&upcall_locks[idx]);
+}
+
+static inline
+void upcall_list_unlock(int idx)
+{
+	spin_unlock(&upcall_locks[idx]);
+}
+
+static
+void upcall_msg_enlist(struct gss_upcall_msg *msg)
+{
+	__u32 idx = msg->gum_mechidx;
+
+	upcall_list_lock(idx);
+	list_add(&msg->gum_list, &upcall_lists[idx]);
+	upcall_list_unlock(idx);
+}
+
+static
+void upcall_msg_delist(struct gss_upcall_msg *msg)
+{
+	__u32 idx = msg->gum_mechidx;
+
+	upcall_list_lock(idx);
+	list_del_init(&msg->gum_list);
+	upcall_list_unlock(idx);
+}
+
+/****************************************
+ * rpc_pipefs upcall helpers	    *
+ ****************************************/
+
+static
+void gss_release_msg(struct gss_upcall_msg *gmsg)
+{
+	ENTRY;
+	LASSERT(atomic_read(&gmsg->gum_refcount) > 0);
+
+	if (!atomic_dec_and_test(&gmsg->gum_refcount)) {
+		EXIT;
+		return;
+	}
+
+	if (gmsg->gum_gctx) {
+		sptlrpc_cli_ctx_wakeup(&gmsg->gum_gctx->gc_base);
+		sptlrpc_cli_ctx_put(&gmsg->gum_gctx->gc_base, 1);
+		gmsg->gum_gctx = NULL;
+	}
+
+	LASSERT(list_empty(&gmsg->gum_list));
+	LASSERT(list_empty(&gmsg->gum_base.list));
+	OBD_FREE_PTR(gmsg);
+	EXIT;
+}
+
+/* Unlink @gmsg from its upcall list and drop one ref; list lock is held. */
+static
+void gss_unhash_msg_nolock(struct gss_upcall_msg *gmsg)
+{
+	__u32 idx = gmsg->gum_mechidx;
+
+	LASSERT(idx < MECH_MAX);
+	/* spin_is_locked() is constant 0 on non-debug UP; don't LASSERT on it */
+	assert_spin_locked(&upcall_locks[idx]);
+	if (list_empty(&gmsg->gum_list))
+		return;
+	list_del_init(&gmsg->gum_list);
+	LASSERT(atomic_read(&gmsg->gum_refcount) > 1);
+	atomic_dec(&gmsg->gum_refcount);
+}
+
+static
+void gss_unhash_msg(struct gss_upcall_msg *gmsg)
+{
+	__u32 idx = gmsg->gum_mechidx;
+
+	LASSERT(idx < MECH_MAX);
+	upcall_list_lock(idx);
+	gss_unhash_msg_nolock(gmsg);
+	upcall_list_unlock(idx);
+}
+
+static
+void gss_msg_fail_ctx(struct gss_upcall_msg *gmsg)
+{
+	if (gmsg->gum_gctx) {
+		struct ptlrpc_cli_ctx *ctx = &gmsg->gum_gctx->gc_base;
+
+		LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+		sptlrpc_cli_ctx_expire(ctx);
+		set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags);
+	}
+}
+
+static
+struct gss_upcall_msg * gss_find_upcall(__u32 mechidx, __u32 seq)
+{
+	struct gss_upcall_msg *gmsg;
+
+	upcall_list_lock(mechidx);
+	list_for_each_entry(gmsg, &upcall_lists[mechidx], gum_list) {
+		if (gmsg->gum_data.gum_seq != seq)
+			continue;
+
+		LASSERT(atomic_read(&gmsg->gum_refcount) > 0);
+		LASSERT(gmsg->gum_mechidx == mechidx);
+
+		atomic_inc(&gmsg->gum_refcount);
+		upcall_list_unlock(mechidx);
+		return gmsg;
+	}
+	upcall_list_unlock(mechidx);
+	return NULL;
+}
+
+static
+int simple_get_bytes(char **buf, __u32 *buflen, void *res, __u32 reslen)
+{
+	if (*buflen < reslen) {
+		CERROR("buflen %u < %u\n", *buflen, reslen);
+		return -EINVAL;
+	}
+
+	memcpy(res, *buf, reslen);
+	*buf += reslen;
+	*buflen -= reslen;
+	return 0;
+}
+
+/****************************************
+ * rpc_pipefs apis		      *
+ ****************************************/
+
+/* Copy the unread tail of @msg (from msg->copied on) to user buffer @dst, at
+ * most @buflen bytes; return bytes copied, or -EFAULT if none could be. */
+static ssize_t gss_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
+			       char *dst, size_t buflen)
+{
+	char *data = (char *)msg->data + msg->copied;
+	size_t mlen = min(msg->len - msg->copied, buflen);
+	unsigned long left;
+	ENTRY;
+
+	/* copy_to_user() returns the byte count NOT copied, never < 0 */
+	left = copy_to_user(dst, data, mlen);
+	if (left != 0 && left == mlen) {
+		msg->errno = -EFAULT;
+		RETURN(-EFAULT);
+	}
+	mlen -= left;
+	msg->copied += mlen;
+	msg->errno = 0;
+	RETURN(mlen);
+}
+
+/* Handle lgssd's reply to an upcall: parse it, match it to the pending
+ * upcall by sequence number and update the context accordingly. */
+static
+ssize_t gss_pipe_downcall(struct file *filp, const char *src, size_t mlen)
+{
+	struct rpc_inode	*rpci = RPC_I(filp->f_dentry->d_inode);
+	struct gss_upcall_msg   *gss_msg;
+	struct ptlrpc_cli_ctx   *ctx;
+	struct gss_cli_ctx      *gctx = NULL;
+	char		    *buf, *data;
+	__u32		    datalen;	/* simple_get_bytes() takes a __u32 * */
+	int		      timeout, rc;
+	__u32		    mechidx, seq, gss_err;
+	ENTRY;
+
+	mechidx = (__u32) (long) rpci->private;
+	LASSERT(mechidx < MECH_MAX);
+
+	OBD_ALLOC(buf, mlen);
+	if (!buf)
+		RETURN(-ENOMEM);
+
+	if (copy_from_user(buf, src, mlen)) {
+		CERROR("failed copy user space data\n");
+		GOTO(out_free, rc = -EFAULT);
+	}
+	data = buf;
+	datalen = mlen;
+
+	/* data passed down format:
+	 *  - seq
+	 *  - timeout
+	 *  - gc_win / error
+	 *  - wire_ctx (rawobj)
+	 *  - mech_ctx (rawobj)
+	 */
+	if (simple_get_bytes(&data, &datalen, &seq, sizeof(seq))) {
+		CERROR("fail to get seq\n");
+		GOTO(out_free, rc = -EFAULT);
+	}
+
+	gss_msg = gss_find_upcall(mechidx, seq);
+	if (!gss_msg) {
+		CERROR("upcall %u has aborted earlier\n", seq);
+		GOTO(out_free, rc = -EINVAL);
+	}
+
+	gss_unhash_msg(gss_msg);
+	gctx = gss_msg->gum_gctx;
+	LASSERT(gctx);
+	LASSERT(atomic_read(&gctx->gc_base.cc_refcount) > 0);
+
+	/* timeout is not in use for now */
+	if (simple_get_bytes(&data, &datalen, &timeout, sizeof(timeout)))
+		GOTO(out_msg, rc = -EFAULT);
+
+	/* lgssd signal an error by gc_win == 0 */
+	if (simple_get_bytes(&data, &datalen, &gctx->gc_win,
+			     sizeof(gctx->gc_win)))
+		GOTO(out_msg, rc = -EFAULT);
+
+	if (gctx->gc_win == 0) {
+		/* followed by:
+		 * - rpc error
+		 * - gss error
+		 */
+		if (simple_get_bytes(&data, &datalen, &rc, sizeof(rc)))
+			GOTO(out_msg, rc = -EFAULT);
+		if (simple_get_bytes(&data, &datalen, &gss_err,sizeof(gss_err)))
+			GOTO(out_msg, rc = -EFAULT);
+
+		if (rc == 0 && gss_err == GSS_S_COMPLETE) {
+			CWARN("both rpc & gss error code not set\n");
+			rc = -EPERM;
+		}
+	} else {
+		rawobj_t tmpobj;
+
+		/* handle */
+		if (rawobj_extract_local(&tmpobj, (__u32 **) &data, &datalen))
+			GOTO(out_msg, rc = -EFAULT);
+		if (rawobj_dup(&gctx->gc_handle, &tmpobj))
+			GOTO(out_msg, rc = -ENOMEM);
+
+		/* mechctx */
+		if (rawobj_extract_local(&tmpobj, (__u32 **) &data, &datalen))
+			GOTO(out_msg, rc = -EFAULT);
+		gss_err = lgss_import_sec_context(&tmpobj,
+						  gss_msg->gum_gsec->gs_mech,
+						  &gctx->gc_mechctx);
+		rc = 0;
+	}
+
+	if (likely(rc == 0 && gss_err == GSS_S_COMPLETE)) {
+		gss_cli_ctx_uptodate(gctx);
+	} else {
+		ctx = &gctx->gc_base;
+		sptlrpc_cli_ctx_expire(ctx);
+		if (rc != -ERESTART || gss_err != GSS_S_COMPLETE)
+			set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags);
+
+		CERROR("refresh ctx %p(uid %d) failed: %d/0x%08x: %s\n",
+		       ctx, ctx->cc_vcred.vc_uid, rc, gss_err,
+		       test_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags) ?
+		       "fatal error" : "non-fatal");
+	}
+
+out_msg:
+	gss_release_msg(gss_msg);
+
+out_free:
+	OBD_FREE(buf, mlen);
+	/* FIXME
+	 * hack pipefs: always return asked length unless all following
+	 * downcalls might be messed up. */
+	rc = mlen;
+	RETURN(rc);
+}
+
+/*
+ * destroy_msg callback for the upcall pipe.
+ *
+ * Does nothing when msg->errno is non-negative. For a negative errno the
+ * enclosing gss_upcall_msg is logged, removed from the hash, its context is
+ * failed through gss_msg_fail_ctx(), and the reference taken here is dropped.
+ */
+static
+void gss_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+	struct gss_upcall_msg	  *gmsg;
+	struct gss_upcall_msg_data     *gumd;
+	/* earliest time (seconds) at which the next timeout warning may print */
+	static cfs_time_t	       ratelimit = 0;
+	ENTRY;
+
+	LASSERT(list_empty(&msg->list));
+
+	/* normally errno is >= 0 */
+	if (msg->errno >= 0) {
+		EXIT;
+		return;
+	}
+
+	gmsg = container_of(msg, struct gss_upcall_msg, gum_base);
+	gumd = &gmsg->gum_data;
+	LASSERT(atomic_read(&gmsg->gum_refcount) > 0);
+
+	CERROR("failed msg %p (seq %u, uid %u, svc %u, nid "LPX64", obd %.*s): "
+	       "errno %d\n", msg, gumd->gum_seq, gumd->gum_uid, gumd->gum_svc,
+	       gumd->gum_nid, (int) sizeof(gumd->gum_obd),
+	       gumd->gum_obd, msg->errno);
+
+	/* hold our own reference across the unhash; dropped at the end */
+	atomic_inc(&gmsg->gum_refcount);
+	gss_unhash_msg(gmsg);
+	if (msg->errno == -ETIMEDOUT || msg->errno == -EPIPE) {
+		cfs_time_t now = cfs_time_current_sec();
+
+		/* warn at most once every 15 seconds */
+		if (cfs_time_after(now, ratelimit)) {
+			CWARN("upcall timed out, is lgssd running?\n");
+			ratelimit = now + 15;
+		}
+	}
+	gss_msg_fail_ctx(gmsg);
+	gss_release_msg(gmsg);
+	EXIT;
+}
+
+/*
+ * release_pipe callback: fail every upcall message still queued on the
+ * list of the mechanism this pipe inode belongs to.
+ *
+ * Each remaining message gets errno -EPIPE, is unhashed, has its context
+ * failed, and is then released. The list lock is dropped around
+ * gss_release_msg() and retaken before the list is examined again.
+ */
+static
+void gss_pipe_release(struct inode *inode)
+{
+	struct rpc_inode *rpci = RPC_I(inode);
+	__u32	     idx;
+	ENTRY;
+
+	/* the pipe's private pointer carries the mechanism index */
+	idx = (__u32) (long) rpci->private;
+	LASSERT(idx < MECH_MAX);
+
+	upcall_list_lock(idx);
+	while (!list_empty(&upcall_lists[idx])) {
+		struct gss_upcall_msg      *gmsg;
+		struct gss_upcall_msg_data *gumd;
+
+		/* always take the current head; it is unhashed below */
+		gmsg = list_entry(upcall_lists[idx].next,
+				      struct gss_upcall_msg, gum_list);
+		gumd = &gmsg->gum_data;
+		LASSERT(list_empty(&gmsg->gum_base.list));
+
+		CERROR("failing remaining msg %p:seq %u, uid %u, svc %u, "
+		       "nid "LPX64", obd %.*s\n", gmsg,
+		       gumd->gum_seq, gumd->gum_uid, gumd->gum_svc,
+		       gumd->gum_nid, (int) sizeof(gumd->gum_obd),
+		       gumd->gum_obd);
+
+		gmsg->gum_base.errno = -EPIPE;
+		atomic_inc(&gmsg->gum_refcount);
+		/* list lock is already held, hence the _nolock variant */
+		gss_unhash_msg_nolock(gmsg);
+
+		gss_msg_fail_ctx(gmsg);
+
+		/* release the message without holding the list lock */
+		upcall_list_unlock(idx);
+		gss_release_msg(gmsg);
+		upcall_list_lock(idx);
+	}
+	upcall_list_unlock(idx);
+	EXIT;
+}
+
+/* rpc_pipefs callbacks used by the lustre gss upcall pipe. */
+static struct rpc_pipe_ops gss_upcall_ops = {
+	.upcall		= gss_pipe_upcall,
+	.downcall	= gss_pipe_downcall,
+	.destroy_msg	= gss_pipe_destroy_msg,
+	.release_pipe	= gss_pipe_release,
+};
+
+/****************************************
+ * upcall helper functions	      *
+ ****************************************/
+
+/*
+ * Queue a context-refresh upcall for @ctx on the pipe of its mechanism.
+ *
+ * A gss_upcall_msg is allocated, filled from the context's security and
+ * import, put on the upcall list and handed to rpc_queue_upcall().
+ *
+ * Returns 0 once the upcall is queued, -EINVAL when the import has no
+ * connection, -ENOMEM on allocation failure, -EIO when the context already
+ * carries a status flag, or the error from rpc_queue_upcall().
+ */
+static
+int gss_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx)
+{
+	struct obd_import	  *imp;
+	struct gss_sec	     *gsec;
+	struct gss_upcall_msg      *gmsg;
+	int			 rc = 0;
+	ENTRY;
+
+	might_sleep();
+
+	LASSERT(ctx->cc_sec);
+	LASSERT(ctx->cc_sec->ps_import);
+	LASSERT(ctx->cc_sec->ps_import->imp_obd);
+
+	imp = ctx->cc_sec->ps_import;
+	if (!imp->imp_connection) {
+		CERROR("import has no connection set\n");
+		RETURN(-EINVAL);
+	}
+
+	gsec = container_of(ctx->cc_sec, struct gss_sec, gs_base);
+
+	OBD_ALLOC_PTR(gmsg);
+	if (!gmsg)
+		RETURN(-ENOMEM);
+
+	/* initialize pipefs base msg */
+	INIT_LIST_HEAD(&gmsg->gum_base.list);
+	gmsg->gum_base.data = &gmsg->gum_data;
+	gmsg->gum_base.len = sizeof(gmsg->gum_data);
+	gmsg->gum_base.copied = 0;
+	gmsg->gum_base.errno = 0;
+
+	/* init upcall msg */
+	atomic_set(&gmsg->gum_refcount, 1);
+	gmsg->gum_mechidx = mech_name2idx(gsec->gs_mech->gm_name);
+	gmsg->gum_gsec = gsec;
+	/* the message holds its own reference on the context */
+	gmsg->gum_gctx = container_of(sptlrpc_cli_ctx_get(ctx),
+				      struct gss_cli_ctx, gc_base);
+	gmsg->gum_data.gum_seq = upcall_get_sequence();
+	gmsg->gum_data.gum_uid = ctx->cc_vcred.vc_uid;
+	gmsg->gum_data.gum_gid = 0; /* not used for now */
+	gmsg->gum_data.gum_svc = import_to_gss_svc(imp);
+	gmsg->gum_data.gum_nid = imp->imp_connection->c_peer.nid;
+	strncpy(gmsg->gum_data.gum_obd, imp->imp_obd->obd_name,
+		sizeof(gmsg->gum_data.gum_obd));
+	/* strncpy() leaves the buffer unterminated when the name is too
+	 * long; gum_data is handed to user space, so always terminate it. */
+	gmsg->gum_data.gum_obd[sizeof(gmsg->gum_data.gum_obd) - 1] = '\0';
+
+	/* This only could happen when sysadmin set it dead/expired
+	 * using lctl by force. */
+	if (ctx->cc_flags & PTLRPC_CTX_STATUS_MASK) {
+		CWARN("ctx %p(%u->%s) was set flags %lx unexpectedly\n",
+		      ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec),
+		      ctx->cc_flags);
+
+		LASSERT(!(ctx->cc_flags & PTLRPC_CTX_UPTODATE));
+		ctx->cc_flags |= PTLRPC_CTX_DEAD | PTLRPC_CTX_ERROR;
+
+		rc = -EIO;
+		goto err_free;
+	}
+
+	upcall_msg_enlist(gmsg);
+
+	rc = rpc_queue_upcall(de_pipes[gmsg->gum_mechidx]->d_inode,
+			      &gmsg->gum_base);
+	if (rc) {
+		CERROR("rpc_queue_upcall failed: %d\n", rc);
+
+		upcall_msg_delist(gmsg);
+		goto err_free;
+	}
+
+	RETURN(0);
+err_free:
+	/* NOTE(review): the reference taken by sptlrpc_cli_ctx_get() above
+	 * does not appear to be dropped on this path - confirm whether a
+	 * matching put is needed before freeing the message. */
+	OBD_FREE_PTR(gmsg);
+	RETURN(rc);
+}
+
+/*
+ * Context refresh entry point for the pipefs policy. A refresh for uid 0
+ * first assigns a new reverse handle index to the owning gss_sec, then the
+ * common refresh upcall is issued.
+ */
+static
+int gss_cli_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx)
+{
+	struct gss_sec *root_sec;
+
+	if (ctx->cc_vcred.vc_uid != 0)
+		return gss_ctx_refresh_pf(ctx);
+
+	/* if we are refreshing for root, also update the reverse
+	 * handle index, do not confuse reverse contexts. */
+	root_sec = container_of(ctx->cc_sec, struct gss_sec, gs_base);
+	root_sec->gs_rvs_hdl = gss_get_next_ctx_index();
+
+	return gss_ctx_refresh_pf(ctx);
+}
+
+/****************************************
+ * lustre gss pipefs policy	     *
+ ****************************************/
+
+/* Client context operations of the gss pipefs policy. */
+static struct ptlrpc_ctx_ops gss_pipefs_ctxops = {
+	.match		= gss_cli_ctx_match,
+	.refresh	= gss_cli_ctx_refresh_pf,
+	.validate	= gss_cli_ctx_validate_pf,
+	.die		= gss_cli_ctx_die_pf,
+	.sign		= gss_cli_ctx_sign,
+	.verify		= gss_cli_ctx_verify,
+	.seal		= gss_cli_ctx_seal,
+	.unseal		= gss_cli_ctx_unseal,
+	.wrap_bulk	= gss_cli_ctx_wrap_bulk,
+	.unwrap_bulk	= gss_cli_ctx_unwrap_bulk,
+};
+
+/* Client-side security operations of the gss pipefs policy. */
+static struct ptlrpc_sec_cops gss_sec_pipefs_cops = {
+	.create_sec		= gss_sec_create_pf,
+	.destroy_sec		= gss_sec_destroy_pf,
+	.kill_sec		= gss_sec_kill,
+	.lookup_ctx		= gss_sec_lookup_ctx_pf,
+	.release_ctx		= gss_sec_release_ctx_pf,
+	.flush_ctx_cache	= gss_sec_flush_ctx_cache_pf,
+	.install_rctx		= gss_sec_install_rctx,
+	.alloc_reqbuf		= gss_alloc_reqbuf,
+	.free_reqbuf		= gss_free_reqbuf,
+	.alloc_repbuf		= gss_alloc_repbuf,
+	.free_repbuf		= gss_free_repbuf,
+	.enlarge_reqbuf		= gss_enlarge_reqbuf,
+};
+
+/* Server-side security operations of the gss pipefs policy. */
+static struct ptlrpc_sec_sops gss_sec_pipefs_sops = {
+	.accept			= gss_svc_accept_pf,
+	.invalidate_ctx		= gss_svc_invalidate_ctx,
+	.alloc_rs		= gss_svc_alloc_rs,
+	.authorize		= gss_svc_authorize,
+	.free_rs		= gss_svc_free_rs,
+	.free_ctx		= gss_svc_free_ctx,
+	.unwrap_bulk		= gss_svc_unwrap_bulk,
+	.wrap_bulk		= gss_svc_wrap_bulk,
+	.install_rctx		= gss_svc_install_rctx_pf,
+};
+
+/* Policy descriptor handed to sptlrpc_register_policy(). */
+static struct ptlrpc_sec_policy gss_policy_pipefs = {
+	.sp_owner	= THIS_MODULE,
+	.sp_name	= "gss.pipefs",
+	.sp_policy	= SPTLRPC_POLICY_GSS_PIPEFS,
+	.sp_cops	= &gss_sec_pipefs_cops,
+	.sp_sops	= &gss_sec_pipefs_sops,
+};
+
+/*
+ * Create the lustre pipe directory and the krb5 upcall pipe, then set up
+ * the krb5 upcall list and its lock.
+ *
+ * Returns 0 on success or a negative errno. An already existing pipe
+ * directory (-EEXIST) is not treated as an error.
+ */
+static
+int __init gss_init_pipefs_upcall(void)
+{
+	struct dentry   *de;
+	long		 err;
+
+	/* pipe dir */
+	de = rpc_mkdir(LUSTRE_PIPE_ROOT, NULL);
+	if (IS_ERR(de) && PTR_ERR(de) != -EEXIST) {
+		CERROR("Failed to create gss pipe dir: %ld\n", PTR_ERR(de));
+		return PTR_ERR(de);
+	}
+
+	/* FIXME hack pipefs: dput will sometimes cause oops during module
+	 * unload and lgssd close the pipe fds. */
+
+	/* krb5 mechanism */
+	de = rpc_mkpipe(LUSTRE_PIPE_KRB5, (void *) MECH_KRB5, &gss_upcall_ops,
+			RPC_PIPE_WAIT_FOR_OPEN);
+	if (!de || IS_ERR(de)) {
+		/* PTR_ERR(NULL) is 0, which would be reported as success;
+		 * map a NULL dentry to -ENOMEM instead. */
+		err = de ? PTR_ERR(de) : -ENOMEM;
+		CERROR("failed to make rpc_pipe %s: %ld\n",
+		       LUSTRE_PIPE_KRB5, err);
+		rpc_rmdir(LUSTRE_PIPE_ROOT);
+		return err;
+	}
+
+	de_pipes[MECH_KRB5] = de;
+	INIT_LIST_HEAD(&upcall_lists[MECH_KRB5]);
+	spin_lock_init(&upcall_locks[MECH_KRB5]);
+
+	return 0;
+}
+
+/*
+ * Tear down the upcall pipes: assert that no upcall is still listed for
+ * any mechanism, forget the cached pipe dentries, then remove the krb5
+ * pipe and the pipe directory.
+ *
+ * Deliberately not marked __exit: gss_init_pipefs() is __init and calls
+ * this on its error path, so the function must not live in the exit
+ * section.
+ */
+static
+void gss_exit_pipefs_upcall(void)
+{
+	__u32   i;
+
+	for (i = 0; i < MECH_MAX; i++) {
+		LASSERT(list_empty(&upcall_lists[i]));
+
+		/* dput pipe dentry here might cause lgssd oops. */
+		de_pipes[i] = NULL;
+	}
+
+	rpc_unlink(LUSTRE_PIPE_KRB5);
+	rpc_rmdir(LUSTRE_PIPE_ROOT);
+}
+
+/*
+ * Set up the upcall pipes and register the gss pipefs policy. When the
+ * registration fails the pipes are torn down again.
+ * Returns 0 on success or the error of the failing step.
+ */
+int __init gss_init_pipefs(void)
+{
+	int rc = gss_init_pipefs_upcall();
+
+	if (rc == 0) {
+		rc = sptlrpc_register_policy(&gss_policy_pipefs);
+		if (rc != 0)
+			gss_exit_pipefs_upcall();
+	}
+
+	return rc;
+}
+
+/* Module exit hook: remove the upcall pipes, then unregister the policy. */
+void __exit gss_exit_pipefs(void)
+{
+	gss_exit_pipefs_upcall();
+	sptlrpc_unregister_policy(&gss_policy_pipefs);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_rawobj.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_rawobj.c
new file mode 100644
index 000000000000..474ecf805307
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_rawobj.c
@@ -0,0 +1,242 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/gss/gss_rawobj.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/mutex.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_sec.h>
+
+#include "gss_internal.h"
+
+/*
+ * Return non-zero when @obj has zero length. Asserts that the length and
+ * the data pointer agree (both set or both unset).
+ */
+int rawobj_empty(rawobj_t *obj)
+{
+	LASSERT(equi(obj->len, obj->data));
+
+	return !obj->len;
+}
+
+/*
+ * Initialise @obj with a private copy of the @len bytes at @buf.
+ *
+ * A zero @len yields an empty object (NULL data). Returns 0 on success or
+ * -ENOMEM when the copy cannot be allocated, in which case @obj is left
+ * with length 0.
+ */
+int rawobj_alloc(rawobj_t *obj, char *buf, int len)
+{
+	LASSERT(obj);
+	LASSERT(len >= 0);
+
+	obj->len = len;
+	if (len) {
+		OBD_ALLOC_LARGE(obj->data, len);
+		if (!obj->data) {
+			obj->len = 0;
+			/* plain return: this function has no ENTRY to pair
+			 * a RETURN() with (same as rawobj_dup()) */
+			return -ENOMEM;
+		}
+		memcpy(obj->data, buf, len);
+	} else
+		obj->data = NULL;
+	return 0;
+}
+
+/*
+ * Release the data held by @obj and reset it to the empty state. An
+ * already empty object is only checked for consistency.
+ */
+void rawobj_free(rawobj_t *obj)
+{
+	LASSERT(obj);
+
+	if (!obj->len) {
+		/* empty objects must not carry a data pointer */
+		LASSERT(!obj->data);
+		return;
+	}
+
+	LASSERT(obj->data);
+	OBD_FREE_LARGE(obj->data, obj->len);
+	obj->len = 0;
+	obj->data = NULL;
+}
+
+/*
+ * Return 1 when @a and @b have the same length and identical contents,
+ * 0 otherwise. Two empty objects compare equal.
+ */
+int rawobj_equal(rawobj_t *a, rawobj_t *b)
+{
+	LASSERT(a && b);
+
+	if (a->len != b->len)
+		return 0;
+	if (!a->len)
+		return 1;
+
+	return memcmp(a->data, b->data, a->len) == 0;
+}
+
+/*
+ * Make @dest a deep copy of @src. Returns 0 on success, or -ENOMEM when
+ * the copy cannot be allocated (@dest is then left with length 0).
+ */
+int rawobj_dup(rawobj_t *dest, rawobj_t *src)
+{
+	LASSERT(src && dest);
+
+	dest->len = src->len;
+	if (!dest->len) {
+		/* nothing to copy */
+		dest->data = NULL;
+		return 0;
+	}
+
+	OBD_ALLOC_LARGE(dest->data, dest->len);
+	if (!dest->data) {
+		dest->len = 0;
+		return -ENOMEM;
+	}
+
+	memcpy(dest->data, src->data, dest->len);
+	return 0;
+}
+
+/*
+ * Append @obj to the buffer at *@buf: a little-endian 32-bit length word
+ * followed by the data, with the data area rounded up to a multiple of
+ * four bytes. *@buf and *@buflen are advanced past what was written.
+ *
+ * Returns 0 on success or -EINVAL when the buffer is too small.
+ */
+int rawobj_serialize(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+	__u32 padded;
+	__u32 needed;
+
+	LASSERT(obj);
+	LASSERT(buf);
+	LASSERT(buflen);
+
+	padded = cfs_size_round4(obj->len);
+	needed = 4 + padded;
+
+	if (*buflen < needed) {
+		CERROR("buflen %u <  %u\n", *buflen, needed);
+		return -EINVAL;
+	}
+
+	/* length word first, then the payload */
+	**buf = cpu_to_le32(obj->len);
+	(*buf)++;
+	memcpy(*buf, obj->data, obj->len);
+
+	/* *buf counts in 32-bit words */
+	*buf += (padded >> 2);
+	*buflen -= needed;
+
+	return 0;
+}
+
+/*
+ * Pull one rawobj out of the buffer at *@buf and advance *@buf / *@buflen
+ * past it.
+ *
+ * @alloc: non-zero copies the data into a new allocation; zero makes
+ *         obj->data point into the caller's buffer.
+ * @local: non-zero means the length word is in host byte order and the
+ *         data is not padded; zero means a little-endian length word and
+ *         data padded to a multiple of four bytes.
+ *
+ * Returns 0 on success, -EINVAL when the buffer is too short for the
+ * encoded object, or -ENOMEM when the copy cannot be allocated.
+ */
+static int __rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen,
+			    int alloc, int local)
+{
+	__u32 len;
+
+	if (*buflen < sizeof(__u32)) {
+		CERROR("buflen %u\n", *buflen);
+		return -EINVAL;
+	}
+
+	obj->len = *(*buf)++;
+	if (!local)
+		obj->len = le32_to_cpu(obj->len);
+	*buflen -= sizeof(__u32);
+
+	if (!obj->len) {
+		obj->data = NULL;
+		return 0;
+	}
+
+	len = local ? obj->len : cfs_size_round4(obj->len);
+	/* Rounding a length close to 2^32 up to a multiple of four wraps
+	 * around, which would let a bogus length slip past the bounds check
+	 * below. */
+	if (len < obj->len) {
+		CERROR("invalid rawobj length %u\n", obj->len);
+		obj->len = 0;
+		return -EINVAL;
+	}
+	if (*buflen < len) {
+		CERROR("buflen %u < %u\n", *buflen, len);
+		obj->len = 0;
+		return -EINVAL;
+	}
+
+	if (!alloc)
+		obj->data = (__u8 *) *buf;
+	else {
+		OBD_ALLOC_LARGE(obj->data, obj->len);
+		if (!obj->data) {
+			CERROR("fail to alloc %u bytes\n", obj->len);
+			obj->len = 0;
+			return -ENOMEM;
+		}
+		memcpy(obj->data, *buf, obj->len);
+	}
+
+	/* advance in bytes: len is not a multiple of four in local mode */
+	*((char **)buf) += len;
+	*buflen -= len;
+
+	return 0;
+}
+
+/* Extract in wire format (alloc = 0, local = 0): obj->data points into *buf. */
+int rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+	return __rawobj_extract(obj, buf, buflen, 0, 0);
+}
+
+/* Extract in wire format (alloc = 1, local = 0): obj->data is a new copy. */
+int rawobj_extract_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+	return __rawobj_extract(obj, buf, buflen, 1, 0);
+}
+
+/* Extract in local format (alloc = 0, local = 1): obj->data points into *buf. */
+int rawobj_extract_local(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+	return __rawobj_extract(obj, buf, buflen, 0, 1);
+}
+
+/* Extract in local format (alloc = 1, local = 1): obj->data is a new copy. */
+int rawobj_extract_local_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+	return __rawobj_extract(obj, buf, buflen, 1, 1);
+}
+
+/*
+ * Make @rawobj refer to the same bytes as @netobj. Nothing is copied, the
+ * data pointer is shared. Always returns 0.
+ */
+int rawobj_from_netobj(rawobj_t *rawobj, netobj_t *netobj)
+{
+	rawobj->data = netobj->data;
+	rawobj->len = netobj->len;
+
+	return 0;
+}
+
+/*
+ * Fill @rawobj with a private copy of the data in @netobj. An empty
+ * @netobj gives an empty @rawobj. Returns 0 on success or -ENOMEM, in
+ * which case @rawobj stays empty.
+ */
+int rawobj_from_netobj_alloc(rawobj_t *rawobj, netobj_t *netobj)
+{
+	rawobj->len = 0;
+	rawobj->data = NULL;
+
+	if (netobj->len != 0) {
+		OBD_ALLOC_LARGE(rawobj->data, netobj->len);
+		if (!rawobj->data)
+			return -ENOMEM;
+
+		rawobj->len = netobj->len;
+		memcpy(rawobj->data, netobj->data, netobj->len);
+	}
+
+	return 0;
+}
+
+/****************************************
+ * misc more			    *
+ ****************************************/
+
+/*
+ * Copy @reslen bytes from the cursor *@buf into @res and advance the
+ * cursor; *@buflen is reduced by the same amount.
+ * Returns 0 on success or -EINVAL when fewer than @reslen bytes remain.
+ */
+int buffer_extract_bytes(const void **buf, __u32 *buflen,
+			 void *res, __u32 reslen)
+{
+	if (reslen > *buflen) {
+		CERROR("buflen %u < %u\n", *buflen, reslen);
+		return -EINVAL;
+	}
+
+	memcpy(res, *buf, reslen);
+
+	*buflen -= reslen;
+	*buf += reslen;
+
+	return 0;
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_svc_upcall.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_svc_upcall.c
new file mode 100644
index 000000000000..31b50ea19c25
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_svc_upcall.c
@@ -0,0 +1,1099 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * Neil Brown <neilb@cse.unsw.edu.au>
+ * J. Bruce Fields <bfields@umich.edu>
+ * Andy Adamson <andros@umich.edu>
+ * Dug Song <dugsong@monkey.org>
+ *
+ * RPCSEC_GSS server authentication.
+ * This implements RPCSEC_GSS as defined in rfc2203 (rpcsec_gss) and rfc2078
+ * (gssapi)
+ *
+ * The RPCSEC_GSS involves three stages:
+ *  1/ context creation
+ *  2/ data exchange
+ *  3/ context destruction
+ *
+ * Context creation is handled largely by upcalls to user-space.
+ *  In particular, GSS_Accept_sec_context is handled by an upcall
+ * Data exchange is handled entirely within the kernel
+ *  In particular, GSS_GetMIC, GSS_VerifyMIC, GSS_Seal, GSS_Unseal are in-kernel.
+ * Context destruction is handled in-kernel
+ *  GSS_Delete_sec_context is in-kernel
+ *
+ * Context creation is initiated by a RPCSEC_GSS_INIT request arriving.
+ * The context handle and gss_token are used as a key into the rpcsec_init cache.
+ * The content of this cache includes some of the outputs of GSS_Accept_sec_context,
+ * being major_status, minor_status, context_handle, reply_token.
+ * These are sent back to the client.
+ * Sequence window management is handled by the kernel.  The window size is currently
+ * a compile time constant.
+ *
+ * When user-space is happy that a context is established, it places an entry
+ * in the rpcsec_context cache. The key for this cache is the context_handle.
+ * The content includes:
+ *   uid/gidlist - for determining access rights
+ *   mechanism type
+ *   mechanism specific information, such as a key
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+#include <linux/mutex.h>
+#include <linux/sunrpc/cache.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+#define GSS_SVC_UPCALL_TIMEOUT  (20)
+
+static spinlock_t __ctx_index_lock;
+static __u64 __ctx_index;
+
+/*
+ * Hand out the next context index. The counter is read and incremented
+ * under __ctx_index_lock; the value before the increment is returned.
+ */
+__u64 gss_get_next_ctx_index(void)
+{
+	__u64 next;
+
+	spin_lock(&__ctx_index_lock);
+	next = __ctx_index;
+	__ctx_index++;
+	spin_unlock(&__ctx_index_lock);
+
+	return next;
+}
+
+/*
+ * Hash the @length bytes at @buf down to a @bits wide value.
+ *
+ * Bytes are shifted into a long; each time a full long's worth has been
+ * consumed it is folded into the running hash with cfs_hash_long(). After
+ * the last data byte one extra byte holding (char) length is fed in, which
+ * also terminates the loop. The top @bits bits of the hash are returned.
+ */
+static inline unsigned long hash_mem(char *buf, int length, int bits)
+{
+	unsigned long hash = 0;
+	unsigned long l = 0;
+	int len = 0;
+	unsigned char c;
+
+	do {
+		if (len == length) {
+			/* end of data: feed the length byte; len becomes 0
+			 * after the increment below, ending the loop */
+			c = (char) len;
+			len = -1;
+		} else
+			c = *buf++;
+
+		l = (l << 8) | c;
+		len++;
+
+		/* fold once per sizeof(long) bytes, and on the final byte */
+		if ((len & (BITS_PER_LONG/8-1)) == 0)
+			hash = cfs_hash_long(hash^l, BITS_PER_LONG);
+	} while (len);
+
+	return hash >> (BITS_PER_LONG - bits);
+}
+
+/****************************************
+ * rsi cache			    *
+ ****************************************/
+
+/* rsi hash table geometry: 2^RSI_HASHBITS buckets */
+#define RSI_HASHBITS    (6)
+#define RSI_HASHMAX     (1 << RSI_HASHBITS)
+#define RSI_HASHMASK    (RSI_HASHMAX - 1)
+
+/*
+ * One entry of the rsi cache. in_handle and in_token form the lookup key
+ * (see rsi_hash() and __rsi_match()); the out_* objects and the two status
+ * codes are filled in by __rsi_update().
+ */
+struct rsi {
+	struct cache_head       h;
+	__u32		   lustre_svc;
+	__u64		   nid;
+	/* woken by rsi_parse() once a downcall has been processed */
+	wait_queue_head_t	     waitq;
+	rawobj_t		in_handle, in_token;
+	rawobj_t		out_handle, out_token;
+	int		     major_status, minor_status;
+};
+
+static struct cache_head *rsi_table[RSI_HASHMAX];
+static struct cache_detail rsi_cache;
+static struct rsi *rsi_update(struct rsi *new, struct rsi *old);
+static struct rsi *rsi_lookup(struct rsi *item);
+
+/* Bucket index of @item: XOR of the hashes of its in_handle and in_token. */
+static inline int rsi_hash(struct rsi *item)
+{
+	unsigned long handle_hash;
+	unsigned long token_hash;
+
+	handle_hash = hash_mem((char *)item->in_handle.data,
+			       item->in_handle.len, RSI_HASHBITS);
+	token_hash = hash_mem((char *)item->in_token.data,
+			      item->in_token.len, RSI_HASHBITS);
+
+	return handle_hash ^ token_hash;
+}
+
+/* Two rsi entries match when both in_handle and in_token are equal. */
+static inline int __rsi_match(struct rsi *item, struct rsi *tmp)
+{
+	if (!rawobj_equal(&item->in_handle, &tmp->in_handle))
+		return 0;
+
+	return !!rawobj_equal(&item->in_token, &tmp->in_token);
+}
+
+/* Free the four rawobj members of @rsi; the struct itself is not freed. */
+static void rsi_free(struct rsi *rsi)
+{
+	rawobj_free(&rsi->in_handle);
+	rawobj_free(&rsi->in_token);
+	rawobj_free(&rsi->out_handle);
+	rawobj_free(&rsi->out_token);
+}
+
+/*
+ * Format the upcall request line for an rsi entry into *@bpp.
+ *
+ * Five hex-encoded words are written in this order: lustre_svc, nid, a
+ * context index, in_handle, in_token. The index is taken from
+ * gss_get_next_ctx_index() only when in_handle is empty and is 0
+ * otherwise. The last character written is then replaced by a newline.
+ */
+static void rsi_request(struct cache_detail *cd,
+			struct cache_head *h,
+			char **bpp, int *blen)
+{
+	struct rsi *rsi = container_of(h, struct rsi, h);
+	__u64 index = 0;
+
+	/* if in_handle is null, provide kernel suggestion */
+	if (rsi->in_handle.len == 0)
+		index = gss_get_next_ctx_index();
+
+	qword_addhex(bpp, blen, (char *) &rsi->lustre_svc,
+		     sizeof(rsi->lustre_svc));
+	qword_addhex(bpp, blen, (char *) &rsi->nid, sizeof(rsi->nid));
+	qword_addhex(bpp, blen, (char *) &index, sizeof(index));
+	qword_addhex(bpp, blen, rsi->in_handle.data, rsi->in_handle.len);
+	qword_addhex(bpp, blen, rsi->in_token.data, rsi->in_token.len);
+	/* overwrite the character before the cursor to end the line */
+	(*bpp)[-1] = '\n';
+}
+
+/* cache_upcall hook: issue the pipe upcall, formatted by rsi_request(). */
+static int rsi_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+	return sunrpc_cache_pipe_upcall(cd, h, rsi_request);
+}
+
+/*
+ * Initialise the key part of @new from @item. The in_handle and in_token
+ * buffers are moved, not copied: @item is left with empty objects. The
+ * output objects of @new start out empty.
+ */
+static inline void __rsi_init(struct rsi *new, struct rsi *item)
+{
+	/* no result yet */
+	new->out_handle = RAWOBJ_EMPTY;
+	new->out_token = RAWOBJ_EMPTY;
+
+	/* take over the key buffers */
+	new->in_handle = item->in_handle;
+	new->in_token = item->in_token;
+	item->in_handle = RAWOBJ_EMPTY;
+	item->in_token = RAWOBJ_EMPTY;
+
+	new->nid = item->nid;
+	new->lustre_svc = item->lustre_svc;
+	init_waitqueue_head(&new->waitq);
+}
+
+/*
+ * Move the result part (out_handle, out_token, status codes) from @item
+ * into @new. @new must not hold output objects yet; @item is left with
+ * empty output objects.
+ */
+static inline void __rsi_update(struct rsi *new, struct rsi *item)
+{
+	LASSERT(new->out_handle.len == 0);
+	LASSERT(new->out_token.len == 0);
+
+	/* take over the output buffers */
+	new->out_handle = item->out_handle;
+	new->out_token = item->out_token;
+	item->out_handle = RAWOBJ_EMPTY;
+	item->out_token = RAWOBJ_EMPTY;
+
+	new->minor_status = item->minor_status;
+	new->major_status = item->major_status;
+}
+
+/*
+ * kref release hook of the rsi cache: free the entry's buffers and the
+ * entry itself. The entry must already be unlinked from its hash chain.
+ */
+static void rsi_put(struct kref *ref)
+{
+	struct rsi *victim = container_of(ref, struct rsi, h.ref);
+
+	LASSERT(victim->h.next == NULL);
+
+	rsi_free(victim);
+	OBD_FREE_PTR(victim);
+}
+
+/* cache_detail match hook: compare the rsi entries embedding @a and @b. */
+static int rsi_match(struct cache_head *a, struct cache_head *b)
+{
+	return __rsi_match(container_of(a, struct rsi, h),
+			   container_of(b, struct rsi, h));
+}
+
+/* cache_detail init hook: set up the key of @cnew from @citem. */
+static void rsi_init(struct cache_head *cnew, struct cache_head *citem)
+{
+	__rsi_init(container_of(cnew, struct rsi, h),
+		   container_of(citem, struct rsi, h));
+}
+
+/* cache_detail update hook: move the result of @citem into @cnew. */
+static void update_rsi(struct cache_head *cnew, struct cache_head *citem)
+{
+	__rsi_update(container_of(cnew, struct rsi, h),
+		     container_of(citem, struct rsi, h));
+}
+
+/*
+ * cache_detail alloc hook: allocate an rsi entry and return its embedded
+ * cache_head, or NULL when the allocation fails.
+ */
+static struct cache_head *rsi_alloc(void)
+{
+	struct rsi *entry;
+
+	OBD_ALLOC_PTR(entry);
+
+	return entry ? &entry->h : NULL;
+}
+
+/*
+ * cache_parse hook of the rsi cache: parse one downcall line.
+ *
+ * Fields are read from @mesg in this order: in_handle, in_token, expiry,
+ * major status, minor status, out_handle, out_token. The entry matching
+ * the handle/token pair is looked up and, when all fields parse, updated
+ * with the result.
+ *
+ * Returns 0 on success or a negative errno. When no cache entry is held
+ * at exit the status is forced to -ENOMEM, whatever it was before.
+ */
+static int rsi_parse(struct cache_detail *cd, char *mesg, int mlen)
+{
+	char	   *buf = mesg;
+	char	   *ep;
+	int	     len;
+	struct rsi      rsii, *rsip = NULL;
+	time_t	  expiry;
+	int	     status = -EINVAL;
+	ENTRY;
+
+
+	/* rsii is an on-stack scratch entry; its buffers are freed at out */
+	memset(&rsii, 0, sizeof(rsii));
+
+	/* handle */
+	len = qword_get(&mesg, buf, mlen);
+	if (len < 0)
+		goto out;
+	if (rawobj_alloc(&rsii.in_handle, buf, len)) {
+		status = -ENOMEM;
+		goto out;
+	}
+
+	/* token */
+	len = qword_get(&mesg, buf, mlen);
+	if (len < 0)
+		goto out;
+	if (rawobj_alloc(&rsii.in_token, buf, len)) {
+		status = -ENOMEM;
+		goto out;
+	}
+
+	rsip = rsi_lookup(&rsii);
+	if (!rsip)
+		goto out;
+
+	rsii.h.flags = 0;
+	/* expiry */
+	expiry = get_expiry(&mesg);
+	if (expiry == 0)
+		goto out;
+
+	len = qword_get(&mesg, buf, mlen);
+	if (len <= 0)
+		goto out;
+
+	/* major */
+	rsii.major_status = simple_strtol(buf, &ep, 10);
+	/* trailing characters after the number make the field invalid */
+	if (*ep)
+		goto out;
+
+	/* minor */
+	len = qword_get(&mesg, buf, mlen);
+	if (len <= 0)
+		goto out;
+	rsii.minor_status = simple_strtol(buf, &ep, 10);
+	if (*ep)
+		goto out;
+
+	/* out_handle */
+	len = qword_get(&mesg, buf, mlen);
+	if (len < 0)
+		goto out;
+	if (rawobj_alloc(&rsii.out_handle, buf, len)) {
+		status = -ENOMEM;
+		goto out;
+	}
+
+	/* out_token */
+	len = qword_get(&mesg, buf, mlen);
+	if (len < 0)
+		goto out;
+	if (rawobj_alloc(&rsii.out_token, buf, len)) {
+		status = -ENOMEM;
+		goto out;
+	}
+
+	rsii.h.expiry_time = expiry;
+	rsip = rsi_update(&rsii, rsip);
+	status = 0;
+out:
+	rsi_free(&rsii);
+	if (rsip) {
+		/* waiters are woken on failure as well as on success */
+		wake_up_all(&rsip->waitq);
+		cache_put(&rsip->h, &rsi_cache);
+	} else {
+		status = -ENOMEM;
+	}
+
+	if (status)
+		CERROR("rsi parse error %d\n", status);
+	RETURN(status);
+}
+
+/* sunrpc cache descriptor of the rsi cache. */
+static struct cache_detail rsi_cache = {
+	.hash_size	= RSI_HASHMAX,
+	.hash_table	= rsi_table,
+	.name		= "auth.sptlrpc.init",
+	.cache_put	= rsi_put,
+	.cache_upcall	= rsi_upcall,
+	.cache_parse	= rsi_parse,
+	.match		= rsi_match,
+	.init		= rsi_init,
+	.update		= update_rsi,
+	.alloc		= rsi_alloc,
+};
+
+/*
+ * Look @item up in the rsi cache through sunrpc_cache_lookup(). Returns
+ * the entry it yields, or NULL when it yields nothing.
+ */
+static struct rsi *rsi_lookup(struct rsi *item)
+{
+	int bucket = rsi_hash(item);
+	struct cache_head *head;
+
+	head = sunrpc_cache_lookup(&rsi_cache, &item->h, bucket);
+
+	return head ? container_of(head, struct rsi, h) : NULL;
+}
+
+/*
+ * Update the cached entry @old with the contents of @new through
+ * sunrpc_cache_update(). Returns the resulting entry, or NULL when the
+ * update yields nothing.
+ */
+static struct rsi *rsi_update(struct rsi *new, struct rsi *old)
+{
+	int bucket = rsi_hash(new);
+	struct cache_head *head;
+
+	head = sunrpc_cache_update(&rsi_cache, &new->h, &old->h, bucket);
+
+	return head ? container_of(head, struct rsi, h) : NULL;
+}
+
+/****************************************
+ * rsc cache			    *
+ ****************************************/
+
+/* rsc hash table geometry: 2^RSC_HASHBITS buckets */
+#define RSC_HASHBITS    (10)
+#define RSC_HASHMAX     (1 << RSC_HASHBITS)
+#define RSC_HASHMASK    (RSC_HASHMAX - 1)
+
+/*
+ * One entry of the rsc cache. The entry is keyed by handle (see
+ * rsc_hash() and __rsc_match()); ctx is filled in by __rsc_update().
+ */
+struct rsc {
+	struct cache_head       h;
+	/* compared by match_target() when flushing per target */
+	struct obd_device      *target;
+	rawobj_t		handle;
+	struct gss_svc_ctx      ctx;
+};
+
+static struct cache_head *rsc_table[RSC_HASHMAX];
+static struct cache_detail rsc_cache;
+static struct rsc *rsc_update(struct rsc *new, struct rsc *old);
+static struct rsc *rsc_lookup(struct rsc *item);
+
+/*
+ * Release what @rsci owns: its handle, the reverse handle of its context
+ * and the mechanism context. The struct itself is not freed.
+ */
+static void rsc_free(struct rsc *rsci)
+{
+	rawobj_free(&rsci->handle);
+	rawobj_free(&rsci->ctx.gsc_rvs_hdl);
+	lgss_delete_sec_context(&rsci->ctx.gsc_mechctx);
+}
+
+/* Bucket index of @rsci: hash of its context handle. */
+static inline int rsc_hash(struct rsc *rsci)
+{
+	rawobj_t *key = &rsci->handle;
+
+	return hash_mem((char *)key->data, key->len, RSC_HASHBITS);
+}
+
+/* Two rsc entries match when their handles are equal. */
+static inline int __rsc_match(struct rsc *new, struct rsc *tmp)
+{
+	return rawobj_equal(&new->handle, &tmp->handle);
+}
+
+/*
+ * Initialise the key part of @new from @tmp. The handle buffer is moved
+ * (@tmp is left with an empty handle); target and context start cleared.
+ */
+static inline void __rsc_init(struct rsc *new, struct rsc *tmp)
+{
+	new->target = NULL;
+
+	/* take over the handle buffer */
+	new->handle = tmp->handle;
+	tmp->handle = RAWOBJ_EMPTY;
+
+	/* clear the whole context first, then set the empty reverse handle */
+	memset(&new->ctx, 0, sizeof(new->ctx));
+	new->ctx.gsc_rvs_hdl = RAWOBJ_EMPTY;
+}
+
+/*
+ * Move the context of @tmp into @new. The struct is copied as a whole,
+ * then @tmp's reverse handle and mechanism context are cleared so that
+ * only @new refers to them. The sequence data of @new is zeroed and its
+ * lock initialised instead of being inherited from @tmp.
+ */
+static inline void __rsc_update(struct rsc *new, struct rsc *tmp)
+{
+	new->ctx = tmp->ctx;
+	tmp->ctx.gsc_rvs_hdl = RAWOBJ_EMPTY;
+	tmp->ctx.gsc_mechctx = NULL;
+
+	memset(&new->ctx.gsc_seqdata, 0, sizeof(new->ctx.gsc_seqdata));
+	spin_lock_init(&new->ctx.gsc_seqdata.ssd_lock);
+}
+
+/*
+ * kref release hook of the rsc cache: free what the entry owns and the
+ * entry itself. The entry must already be unlinked from its hash chain.
+ */
+static void rsc_put(struct kref *ref)
+{
+	struct rsc *victim = container_of(ref, struct rsc, h.ref);
+
+	LASSERT(victim->h.next == NULL);
+
+	rsc_free(victim);
+	OBD_FREE_PTR(victim);
+}
+
+/* cache_detail match hook: compare the rsc entries embedding @a and @b. */
+static int rsc_match(struct cache_head *a, struct cache_head *b)
+{
+	return __rsc_match(container_of(a, struct rsc, h),
+			   container_of(b, struct rsc, h));
+}
+
+/* cache_detail init hook: set up the key of @cnew from @ctmp. */
+static void rsc_init(struct cache_head *cnew, struct cache_head *ctmp)
+{
+	__rsc_init(container_of(cnew, struct rsc, h),
+		   container_of(ctmp, struct rsc, h));
+}
+
+/* cache_detail update hook: move the context of @ctmp into @cnew. */
+static void update_rsc(struct cache_head *cnew, struct cache_head *ctmp)
+{
+	__rsc_update(container_of(cnew, struct rsc, h),
+		     container_of(ctmp, struct rsc, h));
+}
+
+/*
+ * cache_detail alloc hook: allocate an rsc entry and return its embedded
+ * cache_head, or NULL when the allocation fails.
+ */
+static struct cache_head * rsc_alloc(void)
+{
+	struct rsc *entry;
+
+	OBD_ALLOC_PTR(entry);
+
+	return entry ? &entry->h : NULL;
+}
+
+/*
+ * cache_parse hook of the rsc cache: parse one downcall line.
+ *
+ * Fields are read from @mesg in this order: context handle, expiry,
+ * remote flag, root user flag, mds user flag, oss user flag, mapped uid,
+ * uid. When the uid field is absent (-ENOENT) the entry is marked
+ * negative; otherwise gid, mechanism name and mechanism-specific context
+ * data follow, and the expiry is replaced by the one reported by the
+ * imported mechanism context.
+ *
+ * Returns 0 on success or a negative errno. When no cache entry is held
+ * at exit the status is forced to -ENOMEM, whatever it was before.
+ */
+static int rsc_parse(struct cache_detail *cd, char *mesg, int mlen)
+{
+	char		*buf = mesg;
+	int		  len, rv, tmp_int;
+	struct rsc	   rsci, *rscp = NULL;
+	time_t	       expiry;
+	int		  status = -EINVAL;
+	struct gss_api_mech *gm = NULL;
+
+	/* rsci is an on-stack scratch entry; it is released at out */
+	memset(&rsci, 0, sizeof(rsci));
+
+	/* context handle */
+	len = qword_get(&mesg, buf, mlen);
+	if (len < 0)
+		goto out;
+	status = -ENOMEM;
+	if (rawobj_alloc(&rsci.handle, buf, len))
+		goto out;
+
+	rsci.h.flags = 0;
+	/* expiry */
+	expiry = get_expiry(&mesg);
+	status = -EINVAL;
+	if (expiry == 0)
+		goto out;
+
+	/* remote flag */
+	rv = get_int(&mesg, &tmp_int);
+	if (rv) {
+		CERROR("fail to get remote flag\n");
+		goto out;
+	}
+	rsci.ctx.gsc_remote = (tmp_int != 0);
+
+	/* root user flag */
+	rv = get_int(&mesg, &tmp_int);
+	if (rv) {
+		CERROR("fail to get root user flag\n");
+		goto out;
+	}
+	rsci.ctx.gsc_usr_root = (tmp_int != 0);
+
+	/* mds user flag */
+	rv = get_int(&mesg, &tmp_int);
+	if (rv) {
+		CERROR("fail to get mds user flag\n");
+		goto out;
+	}
+	rsci.ctx.gsc_usr_mds = (tmp_int != 0);
+
+	/* oss user flag */
+	rv = get_int(&mesg, &tmp_int);
+	if (rv) {
+		CERROR("fail to get oss user flag\n");
+		goto out;
+	}
+	rsci.ctx.gsc_usr_oss = (tmp_int != 0);
+
+	/* mapped uid */
+	rv = get_int(&mesg, (int *) &rsci.ctx.gsc_mapped_uid);
+	if (rv) {
+		CERROR("fail to get mapped uid\n");
+		goto out;
+	}
+
+	rscp = rsc_lookup(&rsci);
+	if (!rscp)
+		goto out;
+
+	/* uid, or NEGATIVE */
+	rv = get_int(&mesg, (int *) &rsci.ctx.gsc_uid);
+	if (rv == -EINVAL)
+		goto out;
+	if (rv == -ENOENT) {
+		CERROR("NOENT? set rsc entry negative\n");
+		set_bit(CACHE_NEGATIVE, &rsci.h.flags);
+	} else {
+		rawobj_t tmp_buf;
+		unsigned long ctx_expiry;
+
+		/* gid */
+		if (get_int(&mesg, (int *) &rsci.ctx.gsc_gid))
+			goto out;
+
+		/* mech name */
+		len = qword_get(&mesg, buf, mlen);
+		if (len < 0)
+			goto out;
+		gm = lgss_name_to_mech(buf);
+		status = -EOPNOTSUPP;
+		if (!gm)
+			goto out;
+
+		status = -EINVAL;
+		/* mech-specific data: */
+		len = qword_get(&mesg, buf, mlen);
+		if (len < 0)
+			goto out;
+
+		/* tmp_buf only borrows buf; nothing to free afterwards */
+		tmp_buf.len = len;
+		tmp_buf.data = (unsigned char *)buf;
+		if (lgss_import_sec_context(&tmp_buf, gm,
+					    &rsci.ctx.gsc_mechctx))
+			goto out;
+
+		/* currently the expiry time passed down from user-space
+		 * is invalid, here we retrieve it from mech. */
+		if (lgss_inquire_context(rsci.ctx.gsc_mechctx, &ctx_expiry)) {
+			CERROR("unable to get expire time, drop it\n");
+			goto out;
+		}
+		expiry = (time_t) ctx_expiry;
+	}
+
+	rsci.h.expiry_time = expiry;
+	rscp = rsc_update(&rsci, rscp);
+	status = 0;
+out:
+	if (gm)
+		lgss_mech_put(gm);
+	rsc_free(&rsci);
+	if (rscp)
+		cache_put(&rscp->h, &rsc_cache);
+	else
+		status = -ENOMEM;
+
+	if (status)
+		CERROR("parse rsc error %d\n", status);
+	return status;
+}
+
+/* sunrpc cache descriptor of the rsc cache (no cache_upcall hook). */
+static struct cache_detail rsc_cache = {
+	.hash_size	= RSC_HASHMAX,
+	.hash_table	= rsc_table,
+	.name		= "auth.sptlrpc.context",
+	.cache_put	= rsc_put,
+	.cache_parse	= rsc_parse,
+	.match		= rsc_match,
+	.init		= rsc_init,
+	.update		= update_rsc,
+	.alloc		= rsc_alloc,
+};
+
+/*
+ * Look @item up in the rsc cache through sunrpc_cache_lookup(). Returns
+ * the entry it yields, or NULL when it yields nothing.
+ */
+static struct rsc *rsc_lookup(struct rsc *item)
+{
+	int bucket = rsc_hash(item);
+	struct cache_head *head;
+
+	head = sunrpc_cache_lookup(&rsc_cache, &item->h, bucket);
+
+	return head ? container_of(head, struct rsc, h) : NULL;
+}
+
+/*
+ * Update the cached entry @old with the contents of @new through
+ * sunrpc_cache_update(). Returns the resulting entry, or NULL when the
+ * update yields nothing.
+ */
+static struct rsc *rsc_update(struct rsc *new, struct rsc *old)
+{
+	int bucket = rsc_hash(new);
+	struct cache_head *head;
+
+	head = sunrpc_cache_update(&rsc_cache, &new->h, &old->h, bucket);
+
+	return head ? container_of(head, struct rsc, h) : NULL;
+}
+
+/* Drop a reference on a cache entry; plain alias for cache_put(). */
+#define COMPAT_RSC_PUT(item, cd)	cache_put((item), (cd))
+
+/****************************************
+ * rsc cache flush		      *
+ ****************************************/
+
+/* Predicate used by rsc_flush(): non-zero selects the entry for removal. */
+typedef int rsc_entry_match(struct rsc *rscp, long data);
+
+/*
+ * Remove every rsc cache entry for which @match(entry, @data) is non-zero.
+ *
+ * All hash chains are walked with the cache's hash_lock held for writing.
+ * A matching entry is unlinked from its chain, flagged CACHE_NEGATIVE and
+ * the cache's entry count is decremented.
+ */
+static void rsc_flush(rsc_entry_match *match, long data)
+{
+	struct cache_head **ch;
+	struct rsc *rscp;
+	int n;
+	ENTRY;
+
+	write_lock(&rsc_cache.hash_lock);
+	for (n = 0; n < RSC_HASHMAX; n++) {
+		/* ch points at the link that refers to the current entry */
+		for (ch = &rsc_cache.hash_table[n]; *ch;) {
+			rscp = container_of(*ch, struct rsc, h);
+
+			if (!match(rscp, data)) {
+				ch = &((*ch)->next);
+				continue;
+			}
+
+			/* it seems simply set NEGATIVE doesn't work */
+			/* unlink: ch stays put and now refers to the
+			 * following entry */
+			*ch = (*ch)->next;
+			rscp->h.next = NULL;
+			/* NOTE(review): this get/put pair leaves the
+			 * reference count unchanged - confirm where the
+			 * reference held for the hash chain is dropped. */
+			cache_get(&rscp->h);
+			set_bit(CACHE_NEGATIVE, &rscp->h.flags);
+			COMPAT_RSC_PUT(&rscp->h, &rsc_cache);
+			rsc_cache.entries--;
+		}
+	}
+	write_unlock(&rsc_cache.hash_lock);
+	EXIT;
+}
+
+/*
+ * rsc_flush() predicate: select entries whose context uid equals @uid.
+ * A @uid of -1 selects every entry.
+ */
+static int match_uid(struct rsc *rscp, long uid)
+{
+	int wanted = (int) uid;
+
+	if (wanted == -1)
+		return 1;
+
+	return ((int) rscp->ctx.gsc_uid == wanted);
+}
+
+/*
+ * rsc_flush() predicate: select entries bound to the obd device whose
+ * address is passed in @target.
+ */
+static int match_target(struct rsc *rscp, long target)
+{
+	struct obd_device *wanted = (struct obd_device *) target;
+
+	return (rscp->target == wanted);
+}
+
+/*
+ * Flush the rsc entries of one uid. A uid of -1 flushes all entries and
+ * logs a warning first.
+ */
+static inline void rsc_flush_uid(int uid)
+{
+	if (uid == -1)
+		CWARN("flush all gss contexts...\n");
+
+	rsc_flush(match_uid, (long) uid);
+}
+
+static inline void rsc_flush_target(struct obd_device *target)
+{	/* flush the entries whose ->target equals @target (match_target) */
+	rsc_flush(match_target, (long) target);
+}
+
+void gss_secsvc_flush(struct obd_device *target)
+{	/* exported wrapper around rsc_flush_target() */
+	rsc_flush_target(target);
+}
+EXPORT_SYMBOL(gss_secsvc_flush);
+
+/* Find the rsc for @handle; NULL unless cache_check() on it returns 0 */
+static struct rsc *gss_svc_searchbyctx(rawobj_t *handle)
+{
+	struct rsc  key;
+	struct rsc *rscp;
+
+	memset(&key, 0, sizeof(key));
+	if (rawobj_dup(&key.handle, handle) != 0)
+		return NULL;
+
+	/* the key is only needed for the lookup itself */
+	rscp = rsc_lookup(&key);
+	rsc_free(&key);
+	if (rscp == NULL || cache_check(&rsc_cache, &rscp->h, NULL) != 0)
+		return NULL;
+	return rscp;
+}
+
+int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp,
+				   struct gss_sec *gsec,
+				   struct gss_cli_ctx *gctx)
+{
+	struct rsc      rsci, *rscp = NULL;
+	unsigned long   ctx_expiry;
+	__u32	   major;
+	int	     rc;
+	ENTRY;
+	/* rsci is an on-stack template keyed by the reverse handle gs_rvs_hdl */
+	memset(&rsci, 0, sizeof(rsci));
+
+	if (rawobj_alloc(&rsci.handle, (char *) &gsec->gs_rvs_hdl,
+			 sizeof(gsec->gs_rvs_hdl)))
+		GOTO(out, rc = -ENOMEM);
+
+	rscp = rsc_lookup(&rsci);
+	if (rscp == NULL)
+		GOTO(out, rc = -ENOMEM);
+	/* give the template the copy made by lgss_copy_reverse_context() */
+	major = lgss_copy_reverse_context(gctx->gc_mechctx,
+					  &rsci.ctx.gsc_mechctx);
+	if (major != GSS_S_COMPLETE)
+		GOTO(out, rc = -ENOMEM);
+
+	if (lgss_inquire_context(rsci.ctx.gsc_mechctx, &ctx_expiry)) {
+		CERROR("unable to get expire time, drop it\n");
+		GOTO(out, rc = -EINVAL);
+	}
+	rsci.h.expiry_time = (time_t) ctx_expiry;
+	/* flag the context according to the type name of the importing obd */
+	if (strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0)
+		rsci.ctx.gsc_usr_mds = 1;
+	else if (strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_OSC_NAME) == 0)
+		rsci.ctx.gsc_usr_oss = 1;
+	else
+		rsci.ctx.gsc_usr_root = 1;
+	/* feed the filled-in template to the cache entry looked up above */
+	rscp = rsc_update(&rsci, rscp);
+	if (rscp == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	rscp->target = imp->imp_obd;
+	rawobj_dup(&gctx->gc_svc_handle, &rscp->handle);
+	/* NOTE(review): the result of rawobj_dup() above is not checked */
+	CWARN("create reverse svc ctx %p to %s: idx "LPX64"\n",
+	      &rscp->ctx, obd2cli_tgt(imp->imp_obd), gsec->gs_rvs_hdl);
+	rc = 0;
+out:
+	if (rscp)
+		cache_put(&rscp->h, &rsc_cache);
+	rsc_free(&rsci);
+
+	if (rc)
+		CERROR("create reverse svc ctx: idx "LPX64", rc %d\n",
+		       gsec->gs_rvs_hdl, rc);
+	RETURN(rc);
+}
+
+/* Set the rsc found for @handle to expire 20 past now; always returns 0 */
+int gss_svc_upcall_expire_rvs_ctx(rawobj_t *handle)
+{
+	const cfs_time_t	expire = 20;
+	struct rsc	     *rscp = gss_svc_searchbyctx(handle);
+
+	if (rscp == NULL)
+		return 0;
+
+	CDEBUG(D_SEC, "reverse svcctx %p (rsc %p) expire soon\n",
+	       &rscp->ctx, rscp);
+	rscp->h.expiry_time = cfs_time_current_sec() + expire;
+	COMPAT_RSC_PUT(&rscp->h, &rsc_cache);
+	return 0;
+}
+
+/* Duplicate into @handle the handle of the rsc that embeds @ctx;
+ * the result of rawobj_dup() is passed straight back to the caller */
+int gss_svc_upcall_dup_handle(rawobj_t *handle, struct gss_svc_ctx *ctx)
+{
+	return rawobj_dup(handle, &container_of(ctx, struct rsc, ctx)->handle);
+}
+
+/* Store @seq + 1 as gsc_rvs_seq of the rsc found for @handle; returns 0 */
+int gss_svc_upcall_update_sequence(rawobj_t *handle, __u32 seq)
+{
+	struct rsc	     *rscp = gss_svc_searchbyctx(handle);
+
+	if (rscp == NULL)
+		return 0;
+
+	CDEBUG(D_SEC, "reverse svcctx %p (rsc %p) update seq to %u\n",
+	       &rscp->ctx, rscp, seq + 1);
+	rscp->ctx.gsc_rvs_seq = seq + 1;
+	COMPAT_RSC_PUT(&rscp->h, &rsc_cache);
+	return 0;
+}
+
+static struct cache_deferred_req* cache_upcall_defer(struct cache_req *req)
+{
+	return NULL;	/* never hands back a deferred request */
+}
+static struct cache_req cache_upcall_chandle = { cache_upcall_defer };
+
+int gss_svc_upcall_handle_init(struct ptlrpc_request *req,
+			       struct gss_svc_reqctx *grctx,
+			       struct gss_wire_ctx *gw,
+			       struct obd_device *target,
+			       __u32 lustre_svc,
+			       rawobj_t *rvs_hdl,
+			       rawobj_t *in_token)
+{
+	struct ptlrpc_reply_state *rs;
+	struct rsc		*rsci = NULL;
+	struct rsi		*rsip = NULL, rsikey;
+	wait_queue_t	     wait;
+	int			replen = sizeof(struct ptlrpc_body);
+	struct gss_rep_header     *rephdr;
+	int			first_check = 1;
+	int			rc = SECSVC_DROP;
+	ENTRY;
+	/* build the rsi lookup key: service, peer nid, handle and token */
+	memset(&rsikey, 0, sizeof(rsikey));
+	rsikey.lustre_svc = lustre_svc;
+	rsikey.nid = (__u64) req->rq_peer.nid;
+
+	/* duplicate the context handle. for INIT it is always 0 */
+	if (rawobj_dup(&rsikey.in_handle, &gw->gw_handle)) {
+		CERROR("fail to dup context handle\n");
+		GOTO(out, rc);
+	}
+
+	if (rawobj_dup(&rsikey.in_token, in_token)) {
+		CERROR("can't duplicate token\n");
+		rawobj_free(&rsikey.in_handle);
+		GOTO(out, rc);
+	}
+
+	rsip = rsi_lookup(&rsikey);
+	rsi_free(&rsikey);
+	if (!rsip) {
+		CERROR("error in rsi_lookup.\n");
+
+		if (!gss_pack_err_notify(req, GSS_S_FAILURE, 0))
+			rc = SECSVC_COMPLETE;
+
+		GOTO(out, rc);
+	}
+
+	cache_get(&rsip->h); /* take an extra ref */
+	init_waitqueue_head(&rsip->waitq);
+	init_waitqueue_entry_current(&wait);
+	add_wait_queue(&rsip->waitq, &wait);
+
+cache_check:
+	/* Note that cache_check() drops a reference each time it returns
+	 * non-zero. We hold an extra reference on the initial rsip, but must
+	 * take one ourselves before every further call. */
+	rc = cache_check(&rsi_cache, &rsip->h, &cache_upcall_chandle);
+	switch (rc) {
+	case -EAGAIN: {
+		int valid;
+
+		if (first_check) {
+			first_check = 0;
+
+			read_lock(&rsi_cache.hash_lock);
+			valid = test_bit(CACHE_VALID, &rsip->h.flags);
+			if (valid == 0)
+				set_current_state(TASK_INTERRUPTIBLE);
+			read_unlock(&rsi_cache.hash_lock);
+
+			if (valid == 0)
+				schedule_timeout(GSS_SVC_UPCALL_TIMEOUT *
+						     HZ);
+
+			cache_get(&rsip->h);
+			goto cache_check;
+		}
+		CWARN("waited %ds timeout, drop\n", GSS_SVC_UPCALL_TIMEOUT);
+		break;
+	}
+	case -ENOENT:
+		CWARN("cache_check return ENOENT, drop\n");
+		break;
+	case 0:
+		/* if not the first check, we have to release the extra
+		 * reference we just added on it. */
+		if (!first_check)
+			cache_put(&rsip->h, &rsi_cache);
+		CDEBUG(D_SEC, "cache_check is good\n");
+		break;
+	}
+	/* done checking: leave the waitq and put one rsip reference */
+	remove_wait_queue(&rsip->waitq, &wait);
+	cache_put(&rsip->h, &rsi_cache);
+
+	if (rc)
+		GOTO(out, rc = SECSVC_DROP);
+
+	rc = SECSVC_DROP;
+	rsci = gss_svc_searchbyctx(&rsip->out_handle);
+	if (!rsci) {
+		CERROR("authentication failed\n");
+
+		if (!gss_pack_err_notify(req, GSS_S_FAILURE, 0))
+			rc = SECSVC_COMPLETE;
+
+		GOTO(out, rc);
+	} else {
+		cache_get(&rsci->h);
+		grctx->src_ctx = &rsci->ctx;
+	}
+
+	if (rawobj_dup(&rsci->ctx.gsc_rvs_hdl, rvs_hdl)) {
+		CERROR("failed duplicate reverse handle\n");
+		GOTO(out, rc);
+	}
+
+	rsci->target = target;
+
+	CDEBUG(D_SEC, "server create rsc %p(%u->%s)\n",
+	       rsci, rsci->ctx.gsc_uid, libcfs_nid2str(req->rq_peer.nid));
+
+	if (rsip->out_handle.len > PTLRPC_GSS_MAX_HANDLE_SIZE) {
+		CERROR("handle size %u too large\n", rsip->out_handle.len);
+		GOTO(out, rc = SECSVC_DROP);
+	}
+
+	grctx->src_init = 1;
+	grctx->src_reserve_len = cfs_size_round4(rsip->out_token.len);
+
+	rc = lustre_pack_reply_v2(req, 1, &replen, NULL, 0);
+	if (rc) {
+		CERROR("failed to pack reply: %d\n", rc);
+		GOTO(out, rc = SECSVC_DROP);
+	}
+
+	rs = req->rq_reply_state;
+	LASSERT(rs->rs_repbuf->lm_bufcount == 3);
+	LASSERT(rs->rs_repbuf->lm_buflens[0] >=
+		sizeof(*rephdr) + rsip->out_handle.len);
+	LASSERT(rs->rs_repbuf->lm_buflens[2] >= rsip->out_token.len);
+	/* segment 0 gets the reply header + handle, segment 2 the out token */
+	rephdr = lustre_msg_buf(rs->rs_repbuf, 0, 0);
+	rephdr->gh_version = PTLRPC_GSS_VERSION;
+	rephdr->gh_flags = 0;
+	rephdr->gh_proc = PTLRPC_GSS_PROC_ERR;
+	rephdr->gh_major = rsip->major_status;
+	rephdr->gh_minor = rsip->minor_status;
+	rephdr->gh_seqwin = GSS_SEQ_WIN;
+	rephdr->gh_handle.len = rsip->out_handle.len;
+	memcpy(rephdr->gh_handle.data, rsip->out_handle.data,
+	       rsip->out_handle.len);
+
+	memcpy(lustre_msg_buf(rs->rs_repbuf, 2, 0), rsip->out_token.data,
+	       rsip->out_token.len);
+
+	rs->rs_repdata_len = lustre_shrink_msg(rs->rs_repbuf, 2,
+					       rsip->out_token.len, 0);
+
+	rc = SECSVC_OK;
+
+out:
+	/* it looks like we should also put rsip here, but that messes up
+	 * the NFS cache management code... FIXME */
+#if 0
+	if (rsip)
+		rsi_put(&rsip->h, &rsi_cache);
+#endif
+
+	if (rsci) {
+		/* if anything went wrong, we don't keep the context too */
+		if (rc != SECSVC_OK)
+			set_bit(CACHE_NEGATIVE, &rsci->h.flags);
+		else
+			CDEBUG(D_SEC, "create rsc with idx "LPX64"\n",
+			       gss_handle_to_u64(&rsci->handle));
+
+		COMPAT_RSC_PUT(&rsci->h, &rsc_cache);
+	}
+	RETURN(rc);
+}
+
+/* Return the svc ctx of the rsc matching @gw's handle, or NULL (after a
+ * warning naming the peer) when gss_svc_searchbyctx() comes back empty */
+struct gss_svc_ctx *gss_svc_upcall_get_ctx(struct ptlrpc_request *req,
+					   struct gss_wire_ctx *gw)
+{
+	struct rsc *found = gss_svc_searchbyctx(&gw->gw_handle);
+
+	if (found != NULL)
+		return &found->ctx;
+
+	CWARN("Invalid gss ctx idx "LPX64" from %s\n",
+	      gss_handle_to_u64(&gw->gw_handle),
+	      libcfs_nid2str(req->rq_peer.nid));
+	return NULL;
+}
+
+/* Put (via COMPAT_RSC_PUT) one rsc_cache reference on the rsc in which
+ * @ctx is embedded */
+void gss_svc_upcall_put_ctx(struct gss_svc_ctx *ctx)
+{
+	COMPAT_RSC_PUT(&container_of(ctx, struct rsc, ctx)->h, &rsc_cache);
+}
+
+/* Retire the rsc that embeds @ctx: flag it NEGATIVE and set its expiry
+ * time to 1; no reference is dropped here */
+void gss_svc_upcall_destroy_ctx(struct gss_svc_ctx *ctx)
+{
+	struct rsc *owner = container_of(ctx, struct rsc, ctx);
+
+	set_bit(CACHE_NEGATIVE, &owner->h.flags);	/* can't be found */
+	owner->h.expiry_time = 1;		/* to be removed at next scan */
+}
+
+/* Seed the context index, register the rsi and rsc caches, then give lsvcgssd
+ * up to 1.5 seconds to open the init upcall channel. Always returns 0. */
+int __init gss_init_svc_upcall(void)
+{
+	int     tries;
+
+	spin_lock_init(&__ctx_index_lock);
+	/*
+	 * this helps reducing context index confliction. after server reboot,
+	 * conflicting request from clients might be filtered out by initial
+	 * sequence number checking, thus no chance to sent error notification
+	 * back to clients.
+	 */
+	cfs_get_random_bytes(&__ctx_index, sizeof(__ctx_index));
+
+	cache_register(&rsi_cache);
+	cache_register(&rsc_cache);
+
+	/* FIXME this looks stupid. we intend to give lsvcgssd a chance to open
+	 * the init upcall channel, otherwise there's a big chance that the
+	 * first upcall is issued before the channel is opened, so the nfsv4
+	 * cache code drops the request directly, which leads to unnecessary
+	 * recovery time. here we wait at maximum 1.5 seconds. */
+	for (tries = 6; tries > 0; tries--) {
+		if (atomic_read(&rsi_cache.readers) > 0)
+			break;
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		LASSERT(HZ >= 4);
+		schedule_timeout(HZ / 4);
+	}
+
+	if (atomic_read(&rsi_cache.readers) == 0)
+		CWARN("Init channel is not opened by lsvcgssd, following "
+		      "request might be dropped until lsvcgssd is active\n");
+	return 0;
+}
+
+void __exit gss_exit_svc_upcall(void)
+{
+	cache_purge(&rsi_cache);	/* purge each cache, then unregister */
+	cache_unregister(&rsi_cache);
+	/* same two steps for the rsc cache */
+	cache_purge(&rsc_cache);
+	cache_unregister(&rsc_cache);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/lproc_gss.c b/drivers/staging/lustre/lustre/ptlrpc/gss/lproc_gss.c
new file mode 100644
index 000000000000..2522e0517282
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/lproc_gss.c
@@ -0,0 +1,219 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lprocfs_status.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+static struct proc_dir_entry *gss_proc_root = NULL; /* "gss" dir */
+static struct proc_dir_entry *gss_proc_lk = NULL;   /* "lgss_keyring" dir */
+
+/*
+ * statistic of "out-of-sequence-window"
+ */
+static struct {
+	spinlock_t  oos_lock;	     /* held while updating oos_cli_behind */
+	atomic_t    oos_cli_count;       /* client occurrence */
+	int	     oos_cli_behind;      /* client max seqs behind */
+	atomic_t    oos_svc_replay[3];   /* server replay detected */
+	atomic_t    oos_svc_pass[3];     /* server verified ok */
+} gss_stat_oos = {
+	.oos_cli_count  = ATOMIC_INIT(0),
+	.oos_cli_behind = 0,
+	.oos_svc_replay = { ATOMIC_INIT(0), },
+	.oos_svc_pass   = { ATOMIC_INIT(0), },
+};
+
+/* Count one client-side occurrence and remember the largest @behind seen */
+void gss_stat_oos_record_cli(int behind)
+{
+	atomic_inc(&gss_stat_oos.oos_cli_count);
+	spin_lock(&gss_stat_oos.oos_lock);
+	if (gss_stat_oos.oos_cli_behind < behind)
+		gss_stat_oos.oos_cli_behind = behind;
+	spin_unlock(&gss_stat_oos.oos_lock);
+}
+
+/* Bump the replay (@replay != 0) or the pass counter of @phase (0, 1 or 2) */
+void gss_stat_oos_record_svc(int phase, int replay)
+{
+	atomic_t *counters = replay ? gss_stat_oos.oos_svc_replay :
+				      gss_stat_oos.oos_svc_pass;
+
+	LASSERT(phase >= 0 && phase <= 2);
+	atomic_inc(&counters[phase]);
+}
+
+/* Read handler of the "replays" entry: print the two window sizes and the
+ * out-of-sequence-window counters into @page, limited to @count bytes.
+ * @start, @off, @eof and @data are not used. Returns what snprintf()
+ * returns. */
+static int gss_proc_read_oos(char *page, char **start, off_t off, int count,
+			     int *eof, void *data)
+{
+	return snprintf(page, count,
+			"seqwin:		%u\n"
+			"backwin:	       %u\n"
+			"client fall behind seqwin\n"
+			"  occurrence:	  %d\n"
+			"  max seq behind:      %d\n"
+			"server replay detected:\n"
+			"  phase 0:	     %d\n"
+			"  phase 1:	     %d\n"
+			"  phase 2:	     %d\n"
+			"server verify ok:\n"
+			"  phase 2:	     %d\n",
+			GSS_SEQ_WIN_MAIN,
+			GSS_SEQ_WIN_BACK,
+			atomic_read(&gss_stat_oos.oos_cli_count),
+			gss_stat_oos.oos_cli_behind,
+			atomic_read(&gss_stat_oos.oos_svc_replay[0]),
+			atomic_read(&gss_stat_oos.oos_svc_replay[1]),
+			atomic_read(&gss_stat_oos.oos_svc_replay[2]),
+			atomic_read(&gss_stat_oos.oos_svc_pass[2]));
+}
+
+/* Write handler of "init_channel": hand @buffer/@count to
+ * gss_do_ctx_init_rpc(); return its (negative) error, else @count as int */
+static int gss_proc_write_secinit(struct file *file, const char *buffer,
+				  unsigned long count, void *data)
+{
+	int rc = gss_do_ctx_init_rpc((char *) buffer, count);
+
+	if (rc == 0)
+		return ((int) count);
+
+	LASSERT(rc < 0);
+	return rc;
+}
+
+static struct lprocfs_vars gss_lprocfs_vars[] = {
+	{ "replays", gss_proc_read_oos, NULL },	/* read handler only */
+	{ "init_channel", NULL, gss_proc_write_secinit, NULL, NULL, 0222 },
+	{ NULL }	/* end of table */
+};
+
+/*
+ * for userspace helper lgss_keyring.
+ *
+ * debug_level: [0, 4], defined in utils/gss/lgss_utils.h
+ */
+static int gss_lk_debug_level = 1;	/* writes outside [0, 4] are refused */
+
+static int gss_lk_proc_read_dl(char *page, char **start, off_t off,
+			       int count, int *eof, void *data)
+{	/* gss_lk_debug_level is a plain (signed) int: print it with %d */
+	return snprintf(page, count, "%d\n", gss_lk_debug_level);
+}
+
+/* Parse an integer from @buffer and store it as the lgss_keyring debug level;
+ * returns @count, the helper's error, or -ERANGE for values outside [0, 4] */
+static int gss_lk_proc_write_dl(struct file *file, const char *buffer,
+				unsigned long count, void *data)
+{
+	int     level;
+	int     rc = lprocfs_write_helper(buffer, count, &level);
+
+	if (rc < 0)
+		return rc;
+	if (level < 0 || level > 4)
+		return -ERANGE;
+	gss_lk_debug_level = level;
+	return count;
+}
+
+static struct lprocfs_vars gss_lk_lprocfs_vars[] = {	/* lgss_keyring dir */
+	{ "debug_level", gss_lk_proc_read_dl, gss_lk_proc_write_dl, NULL },
+	{ NULL }	/* end of table */
+};
+
+/* Remove the lgss_keyring dir, then the gss dir; skips what is still NULL */
+void gss_exit_lproc(void)
+{
+	if (gss_proc_lk != NULL) {
+		lprocfs_remove(&gss_proc_lk);
+		gss_proc_lk = NULL;
+	}
+	if (gss_proc_root != NULL) {
+		lprocfs_remove(&gss_proc_root);
+		gss_proc_root = NULL;
+	}
+}
+
+/* Register the "gss" and "gss/lgss_keyring" lprocfs dirs; 0 or -errno */
+int gss_init_lproc(void)
+{
+	int     rc;
+
+	spin_lock_init(&gss_stat_oos.oos_lock);
+	gss_proc_root = lprocfs_register("gss", sptlrpc_proc_root,
+					 gss_lprocfs_vars, NULL);
+	if (IS_ERR(gss_proc_root)) {
+		/* read the error out before the pointer is reset to NULL */
+		rc = PTR_ERR(gss_proc_root);
+		gss_proc_root = NULL;
+		GOTO(err_out, rc);
+	}
+	gss_proc_lk = lprocfs_register("lgss_keyring", gss_proc_root,
+				       gss_lk_lprocfs_vars, NULL);
+	if (IS_ERR(gss_proc_lk)) {
+		rc = PTR_ERR(gss_proc_lk);
+		gss_proc_lk = NULL;
+		GOTO(err_out, rc);
+	}
+	return 0;
+err_out:
+	CERROR("failed to initialize gss lproc entries: %d\n", rc);
+	gss_exit_lproc();
+	return rc;
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/sec_gss.c b/drivers/staging/lustre/lustre/ptlrpc/gss/sec_gss.c
new file mode 100644
index 000000000000..ebca858ca183
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/sec_gss.c
@@ -0,0 +1,2916 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * linux/net/sunrpc/auth_gss.c
+ *
+ * RPCSEC_GSS client authentication.
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Dug Song       <dugsong@monkey.org>
+ *  Andy Adamson   <andros@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <asm/atomic.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <obd_cksum.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+#include <linux/crypto.h>
+#include <linux/crc32.h>
+
+/*
+ * early replies have a fixed size, in privacy and in integrity mode
+ * respectively, so we calculate them only once.
+ */
+static int gss_at_reply_off_integ;
+static int gss_at_reply_off_priv;
+
+
+static inline int msg_last_segidx(struct lustre_msg *msg)
+{	/* index of the last buffer; asserts that there is at least one */
+	LASSERT(msg->lm_bufcount > 0);
+	return msg->lm_bufcount - 1;
+}
+static inline int msg_last_seglen(struct lustre_msg *msg)
+{	/* length recorded in lm_buflens[] for the last buffer of @msg */
+	return msg->lm_buflens[msg_last_segidx(msg)];
+}
+
+/********************************************
+ * wire data swabber			*
+ ********************************************/
+
+/* Byte-swap in place the 32-bit header fields, handle length included */
+static void gss_header_swabber(struct gss_header *ghdr)
+{
+	__swab32s(&ghdr->gh_flags);
+	__swab32s(&ghdr->gh_proc);
+	__swab32s(&ghdr->gh_seq);
+	__swab32s(&ghdr->gh_svc);
+	__swab32s(&ghdr->gh_pad1);
+	__swab32s(&ghdr->gh_handle.len);
+}
+
+/* Get the (swabbed if @swabbed) gss header in @segment; NULL if it can't fit */
+struct gss_header *gss_swab_header(struct lustre_msg *msg, int segment,
+				   int swabbed)
+{
+	struct gss_header *ghdr;
+
+	ghdr = lustre_msg_buf(msg, segment, sizeof(*ghdr));
+	if (ghdr == NULL)
+		return NULL;
+	if (swabbed)
+		gss_header_swabber(ghdr);
+	/* subtract, don't add: sizeof + wire length can wrap on 32 bit */
+	if (msg->lm_buflens[segment] < sizeof(*ghdr) ||
+	    ghdr->gh_handle.len > msg->lm_buflens[segment] - sizeof(*ghdr)) {
+		CERROR("gss header has length %d, now %u received\n",
+		       (int) (sizeof(*ghdr) + ghdr->gh_handle.len),
+		       msg->lm_buflens[segment]);
+		return NULL;
+	}
+	return ghdr;
+}
+
+#if 0 /* compiled out: netobj swab helpers */
+static
+void gss_netobj_swabber(netobj_t *obj)
+{
+	__swab32s(&obj->len);
+}
+
+netobj_t *gss_swab_netobj(struct lustre_msg *msg, int segment)
+{
+	netobj_t  *obj;
+
+	obj = lustre_swab_buf(msg, segment, sizeof(*obj), gss_netobj_swabber);
+	if (obj && sizeof(*obj) + obj->len > msg->lm_buflens[segment]) {
+		CERROR("netobj require length %u but only %u received\n",
+		       (unsigned int) sizeof(*obj) + obj->len,
+		       msg->lm_buflens[segment]);
+		return NULL;
+	}
+
+	return obj;
+}
+#endif /* 0 */
+
+/*
+ * the payload size should be obtained from the mechanism, but since we
+ * currently support only kerberos we can simply use a fixed value.
+ * krb5 "meta" data:
+ *  - krb5 header:      16
+ *  - krb5 checksum:    20
+ *
+ * for privacy mode, the payload also includes the cipher text, which has the
+ * same size as the plain text, plus a possible confounder and padding, each
+ * at most one cipher block in size.
+ */
+#define GSS_KRB5_INTEG_MAX_PAYLOAD      (40)
+
+/* Fixed krb5 payload size (@mechctx is not consulted): the integrity maximum,
+ * plus @msgsize and three 16-byte allowances when @privacy is set */
+static inline
+int gss_mech_payload(struct gss_ctx *mechctx, int msgsize, int privacy)
+{
+	int extra = privacy ? 16 + 16 + 16 + msgsize : 0;
+	return GSS_KRB5_INTEG_MAX_PAYLOAD + extra;
+}
+
+/* fill in the gss header in segment 0, then put a MIC over the text segments
+ * into the last segment. return the value of lustre_msg_size_v2() (null
+ * mode) or lustre_shrink_msg(), or -EPERM if the MIC cannot be generated */
+static int gss_sign_msg(struct lustre_msg *msg,
+			struct gss_ctx *mechctx,
+			enum lustre_sec_part sp,
+			__u32 flags, __u32 proc, __u32 seq, __u32 svc,
+			rawobj_t *handle)
+{
+	struct gss_header      *ghdr;
+	rawobj_t		text[4], mic;
+	int		     textcnt, max_textcnt, mic_idx;
+	__u32		   major;
+
+	LASSERT(msg->lm_bufcount >= 2);
+
+	/* gss hdr */
+	LASSERT(msg->lm_buflens[0] >=
+		sizeof(*ghdr) + (handle ? handle->len : 0));
+	ghdr = lustre_msg_buf(msg, 0, 0);
+
+	ghdr->gh_version = PTLRPC_GSS_VERSION;
+	ghdr->gh_sp = (__u8) sp;
+	ghdr->gh_flags = flags;
+	ghdr->gh_proc = proc;
+	ghdr->gh_seq = seq;
+	ghdr->gh_svc = svc;
+	if (!handle) {
+		/* fill in a fake one */
+		ghdr->gh_handle.len = 0;
+	} else {
+		ghdr->gh_handle.len = handle->len;
+		memcpy(ghdr->gh_handle.data, handle->data, handle->len);
+	}
+
+	/* no actual signature for null mode */
+	if (svc == SPTLRPC_SVC_NULL)
+		return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+
+	/* MIC: over segment 0 only for SVC_AUTH, else over all but the last */
+	mic_idx = msg_last_segidx(msg);
+	max_textcnt = (svc == SPTLRPC_SVC_AUTH) ? 1 : mic_idx;
+	/* NOTE(review): text[] has 4 slots; max_textcnt is not checked here */
+	for (textcnt = 0; textcnt < max_textcnt; textcnt++) {
+		text[textcnt].len = msg->lm_buflens[textcnt];
+		text[textcnt].data = lustre_msg_buf(msg, textcnt, 0);
+	}
+	/* the last segment receives the MIC */
+	mic.len = msg->lm_buflens[mic_idx];
+	mic.data = lustre_msg_buf(msg, mic_idx, 0);
+
+	major = lgss_get_mic(mechctx, textcnt, text, 0, NULL, &mic);
+	if (major != GSS_S_COMPLETE) {
+		CERROR("fail to generate MIC: %08x\n", major);
+		return -EPERM;
+	}
+	LASSERT(mic.len <= msg->lm_buflens[mic_idx]);
+
+	return lustre_shrink_msg(msg, mic_idx, mic.len, 0);
+}
+
+/* Verify the MIC in the last segment of @msg over the leading segments
+ * (only segment 0 for SPTLRPC_SVC_AUTH); return a gss error code */
+static __u32 gss_verify_msg(struct lustre_msg *msg,
+			    struct gss_ctx *mechctx,
+			    __u32 svc)
+{
+	rawobj_t	text[4], mic;
+	int	     textcnt, max_textcnt, mic_idx;
+	__u32	   major;
+
+	LASSERT(msg->lm_bufcount >= 2);
+
+	if (svc == SPTLRPC_SVC_NULL)
+		return GSS_S_COMPLETE;
+
+	mic_idx = msg_last_segidx(msg);
+	max_textcnt = (svc == SPTLRPC_SVC_AUTH) ? 1 : mic_idx;
+	/* text[] has room for 4 segments only: refuse anything larger */
+	if (max_textcnt > (int) (sizeof(text) / sizeof(text[0]))) {
+		CERROR("too many text segments: %d\n", max_textcnt);
+		return GSS_S_FAILURE;
+	}
+
+	for (textcnt = 0; textcnt < max_textcnt; textcnt++) {
+		text[textcnt].len = msg->lm_buflens[textcnt];
+		text[textcnt].data = lustre_msg_buf(msg, textcnt, 0);
+	}
+	mic.len = msg->lm_buflens[mic_idx];
+	mic.data = lustre_msg_buf(msg, mic_idx, 0);
+
+	major = lgss_verify_mic(mechctx, textcnt, text, 0, NULL, &mic);
+	if (major != GSS_S_COMPLETE)
+		CERROR("mic verify error: %08x\n", major);
+	return major;
+}
+
+/* unwrap the token in segment 1 of @msgbuf and overwrite @msgbuf with the
+ * clear text, storing its length in *@msg_len. return GSS_S_COMPLETE or
+ * GSS_S_FAILURE. */
+static
+__u32 gss_unseal_msg(struct gss_ctx *mechctx,
+		   struct lustre_msg *msgbuf,
+		   int *msg_len, int msgbuf_len)
+{
+	rawobj_t		 clear_obj, hdrobj, token;
+	__u8		    *clear_buf;
+	int		      clear_buflen;
+	__u32		    major;
+	ENTRY;
+
+	if (msgbuf->lm_bufcount != 2) {
+		CERROR("invalid bufcount %d\n", msgbuf->lm_bufcount);
+		RETURN(GSS_S_FAILURE);
+	}
+
+	/* allocate a temporary clear text buffer, same sized as token,
+	 * we assume the final clear text size <= token size */
+	clear_buflen = lustre_msg_buflen(msgbuf, 1);
+	OBD_ALLOC_LARGE(clear_buf, clear_buflen);
+	if (!clear_buf)
+		RETURN(GSS_S_FAILURE);
+
+	/* buffer objects */
+	hdrobj.len = lustre_msg_buflen(msgbuf, 0);
+	hdrobj.data = lustre_msg_buf(msgbuf, 0, 0);
+	token.len = lustre_msg_buflen(msgbuf, 1);
+	token.data = lustre_msg_buf(msgbuf, 1, 0);
+	clear_obj.len = clear_buflen;
+	clear_obj.data = clear_buf;
+
+	major = lgss_unwrap(mechctx, &hdrobj, &token, &clear_obj);
+	if (major != GSS_S_COMPLETE) {
+		CERROR("unwrap message error: %08x\n", major);
+		GOTO(out_free, major = GSS_S_FAILURE);
+	}
+	LASSERT(clear_obj.len <= clear_buflen);
+	LASSERT(clear_obj.len <= msgbuf_len);
+
+	/* now the decrypted message replaces the wrapped one in @msgbuf */
+	memcpy(msgbuf, clear_obj.data, clear_obj.len);
+	*msg_len = clear_obj.len;
+
+	major = GSS_S_COMPLETE;
+out_free:
+	OBD_FREE_LARGE(clear_buf, clear_buflen);
+	RETURN(major);
+}
+
+/********************************************
+ * gss client context manipulation helpers  *
+ ********************************************/
+
+/* Set the DEAD bit on @ctx. Only the caller that finds it clear does the
+ * rest (log, wake up waiters) and gets 1; everyone else gets 0 */
+int cli_ctx_expire(struct ptlrpc_cli_ctx *ctx)
+{
+	LASSERT(atomic_read(&ctx->cc_refcount));
+
+	if (test_and_set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags))
+		return 0;
+
+	if (!ctx->cc_early_expire)
+		clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags);
+
+	CWARN("ctx %p(%u->%s) get expired: %lu(%+lds)\n",
+	      ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec),
+	      ctx->cc_expire,
+	      ctx->cc_expire == 0 ? 0 :
+	      cfs_time_sub(ctx->cc_expire, cfs_time_current_sec()));
+	sptlrpc_cli_ctx_wakeup(ctx);
+	return 1;
+}
+
+/*
+ * return 1 if the context is dead (expiring it here when its time is up),
+ * otherwise 0.
+ */
+int cli_ctx_check_death(struct ptlrpc_cli_ctx *ctx)
+{
+	if (unlikely(cli_ctx_is_dead(ctx)))
+		return 1;
+
+	/* expire is 0 means never expire. a newly created gss context
+	 * which is still in upcall may have 0 expiration. otherwise the
+	 * context is alive while cc_expire is after the current time */
+	if (ctx->cc_expire == 0 ||
+	    cfs_time_after(ctx->cc_expire, cfs_time_current_sec()))
+		return 0;
+
+	/* expiry time reached */
+	cli_ctx_expire(ctx);
+	return 1;
+}
+
+void gss_cli_ctx_uptodate(struct gss_cli_ctx *gctx)
+{
+	struct ptlrpc_cli_ctx  *ctx = &gctx->gc_base;
+	unsigned long	   ctx_expiry;
+	/* set the expiry from the mech context, flag UPTODATE, wake waiters */
+	if (lgss_inquire_context(gctx->gc_mechctx, &ctx_expiry)) {
+		CERROR("ctx %p(%u): unable to inquire, expire it now\n",
+		       gctx, ctx->cc_vcred.vc_uid);
+		ctx_expiry = 1; /* make it expired now */
+	}
+
+	ctx->cc_expire = gss_round_ctx_expiry(ctx_expiry,
+					      ctx->cc_sec->ps_flvr.sf_flags);
+
+	/* At this point this ctx might have been marked as dead by
+	 * someone else, in which case nobody will make further use
+	 * of it. We don't care, and marking it UPTODATE will help
+	 * destroy the server side context when this one is destroyed. */
+	set_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags);
+
+	if (sec_is_reverse(ctx->cc_sec)) {
+		CWARN("server installed reverse ctx %p idx "LPX64", "
+		      "expiry %lu(%+lds)\n", ctx,
+		      gss_handle_to_u64(&gctx->gc_handle),
+		      ctx->cc_expire, ctx->cc_expire - cfs_time_current_sec());
+	} else {
+		CWARN("client refreshed ctx %p idx "LPX64" (%u->%s), "
+		      "expiry %lu(%+lds)\n", ctx,
+		      gss_handle_to_u64(&gctx->gc_handle),
+		      ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec),
+		      ctx->cc_expire, ctx->cc_expire - cfs_time_current_sec());
+
+		/* install reverse svc ctx for root context */
+		if (ctx->cc_vcred.vc_uid == 0)
+			gss_sec_install_rctx(ctx->cc_sec->ps_import,
+					     ctx->cc_sec, ctx);
+	}
+
+	sptlrpc_cli_ctx_wakeup(ctx);
+}
+
+/* Release what @gctx holds: the mech context and both handle copies. For a
+ * forward (non-reverse) ctx the buddy reverse svc ctx is first told to
+ * expire soon. */
+static void gss_cli_ctx_finalize(struct gss_cli_ctx *gctx)
+{
+	LASSERT(gctx->gc_base.cc_sec);
+
+	if (gctx->gc_mechctx) {
+		lgss_delete_sec_context(&gctx->gc_mechctx);
+		gctx->gc_mechctx = NULL;
+	}
+
+	if (!rawobj_empty(&gctx->gc_svc_handle)) {
+		/* forward ctx: mark buddy reverse svcctx soon-expire. */
+		if (!sec_is_reverse(gctx->gc_base.cc_sec))
+			gss_svc_upcall_expire_rvs_ctx(&gctx->gc_svc_handle);
+		rawobj_free(&gctx->gc_svc_handle);
+	}
+	rawobj_free(&gctx->gc_handle);
+}
+
+/*
+ * Based on sequence number algorithm as specified in RFC 2203.
+ *
+ * modified for our own problem: arriving request has valid sequence number,
+ * but unwrapping request might cost a long time, after that its sequence
+ * are not valid anymore (fall behind the window). It rarely happen, mostly
+ * under extreme load.
+ *
+ * note we should not check sequence before verify the integrity of incoming
+ * request, because just one attacking request with high sequence number might
+ * cause all following request be dropped.
+ *
+ * so here we use a multi-phase approach: prepare 2 sequence windows,
+ * "main window" for normal sequence and "back window" for fall behind sequence.
+ * and 3-phase checking mechanism:
+ *  0 - before integrity verification, perform a initial sequence checking in
+ *      main window, which only try and don't actually set any bits. if the
+ *      sequence is high above the window or fit in the window and the bit
+ *      is 0, then accept and proceed to integrity verification. otherwise
+ *      reject this sequence.
+ *  1 - after integrity verification, check in main window again. if this
+ *      sequence is high above the window or fit in the window and the bit
+ *      is 0, then set the bit and accept; if it fit in the window but bit
+ *      already set, then reject; if it fall behind the window, then proceed
+ *      to phase 2.
+ *  2 - check in back window. if it is high above the window or fit in the
+ *      window and the bit is 0, then set the bit and accept. otherwise reject.
+ *
+ * return value:
+ *   1: looks like a replay
+ *   0: is ok
+ *  -1: is a replay
+ *
+ * note phase 0 is necessary, because otherwise replay attacking request of
+ * sequence which between the 2 windows can't be detected.
+ *
+ * this mechanism can't totally solve the problem, but could help much less
+ * number of valid requests be dropped.
+ */
+/* One phase of the window check described above. Returns 0 to accept,
+ * 1 to send the caller on to the backup window, -1 for a replay. Phase 0
+ * only tests; phases 1 and 2 also record an accepted @seq_num in @window. */
+static
+int gss_do_check_seq(unsigned long *window, __u32 win_size, __u32 *max_seq,
+		     __u32 seq_num, int phase)
+{
+	LASSERT(phase >= 0 && phase <= 2);
+
+	/* every distance below is a subtraction in the direction known to be
+	 * non-negative, so it cannot wrap around near the top of the 32-bit
+	 * sequence space the way a "value + win_size" sum can */
+	if (seq_num > *max_seq) {
+		/* 1. high above the window */
+		if (phase == 0)
+			return 0;
+
+		if (seq_num - *max_seq >= win_size) {
+			memset(window, 0, win_size / 8);
+			*max_seq = seq_num;
+		} else {
+			while(*max_seq < seq_num) {
+				(*max_seq)++;
+				__clear_bit((*max_seq) % win_size, window);
+			}
+		}
+		__set_bit(seq_num % win_size, window);
+	} else if (*max_seq - seq_num >= win_size) {
+		/* 2. low behind the window */
+		if (phase == 0 || phase == 2)
+			goto replay;
+
+		CWARN("seq %u is %u behind (size %d), check backup window\n",
+		      seq_num, *max_seq - win_size - seq_num, win_size);
+		return 1;
+	} else {
+		/* 3. fit into the window */
+		switch (phase) {
+		case 0:
+			if (test_bit(seq_num % win_size, window))
+				goto replay;
+			break;
+		case 1:
+		case 2:
+			if (__test_and_set_bit(seq_num % win_size, window))
+				goto replay;
+			break;
+		}
+	}
+
+	return 0;
+
+replay:
+	CERROR("seq %u (%s %s window) is a replay: max %u, winsize %d\n",
+	       seq_num,
+	       *max_seq - seq_num < win_size ? "in" : "behind",
+	       phase == 2 ? "backup " : "main",
+	       *max_seq, win_size);
+	return -1;
+}
+
+/*
+ * Based on sequence number algorithm as specified in RFC 2203.
+ * Runs under ssd->ssd_lock and returns the last gss_do_check_seq() result.
+ * if @set == 0: initial check, don't set any bit in window
+ * if @set != 0: final check, set bit in window
+ */
+int gss_check_seq_num(struct gss_svc_seq_data *ssd, __u32 seq_num, int set)
+{
+	int rc = 0;
+
+	spin_lock(&ssd->ssd_lock);
+
+	if (set == 0) {
+		/*
+		 * phase 0 testing (main window only, nothing is marked)
+		 */
+		rc = gss_do_check_seq(ssd->ssd_win_main, GSS_SEQ_WIN_MAIN,
+				      &ssd->ssd_max_main, seq_num, 0);
+		if (unlikely(rc))
+			gss_stat_oos_record_svc(0, 1);
+	} else {
+		/*
+		 * phase 1 checking main window
+		 */
+		rc = gss_do_check_seq(ssd->ssd_win_main, GSS_SEQ_WIN_MAIN,
+				      &ssd->ssd_max_main, seq_num, 1);
+		switch (rc) {
+		case -1:
+			gss_stat_oos_record_svc(1, 1);
+			/* fall through */
+		case 0:
+			goto exit;
+		}
+		/*
+		 * phase 2 checking back window (main window returned 1)
+		 */
+		rc = gss_do_check_seq(ssd->ssd_win_back, GSS_SEQ_WIN_BACK,
+				      &ssd->ssd_max_back, seq_num, 2);
+		if (rc)
+			gss_stat_oos_record_svc(2, 1);
+		else
+			gss_stat_oos_record_svc(2, 0);
+	}
+exit:
+	spin_unlock(&ssd->ssd_lock);
+	return rc;
+}
+
+/***************************************
+ * cred APIs			   *
+ ***************************************/
+
+static inline int gss_cli_payload(struct ptlrpc_cli_ctx *ctx,
+				  int msgsize, int privacy)
+{
+	return gss_mech_payload(NULL, msgsize, privacy); /* @ctx is unused */
+}
+
+static int gss_cli_bulk_payload(struct ptlrpc_cli_ctx *ctx,
+				struct sptlrpc_flavor *flvr,
+				int reply, int read)
+{
+	int     payload = sizeof(struct ptlrpc_bulk_sec_desc);
+
+	LASSERT(SPTLRPC_FLVR_BULK_TYPE(flvr->sf_rpc) == SPTLRPC_BULK_DEFAULT);
+
+	if ((!reply && !read) || (reply && read)) {
+		switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) {
+		case SPTLRPC_BULK_SVC_NULL:
+			break;	/* descriptor only, no mech payload */
+		case SPTLRPC_BULK_SVC_INTG:
+			payload += gss_cli_payload(ctx, 0, 0);
+			break;
+		case SPTLRPC_BULK_SVC_PRIV:
+			payload += gss_cli_payload(ctx, 0, 1);
+			break;
+		case SPTLRPC_BULK_SVC_AUTH:
+		default:
+			LBUG();	/* AUTH and unknown bulk services end up here */
+		}
+	}
+
+	return payload;	/* descriptor size plus any mech payload added above */
+}
+
+int gss_cli_ctx_match(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred)
+{
+	return (ctx->cc_vcred.vc_uid == vcred->vc_uid); /* uid is the only key */
+}
+
+void gss_cli_ctx_flags2str(unsigned long flags, char *buf, int bufsize)
+{
+	buf[0] = '\0';
+	/* comma-joined flag names; strlcat() caps total length at @bufsize */
+	if (flags & PTLRPC_CTX_NEW)
+		strlcat(buf, "new,", bufsize);
+	if (flags & PTLRPC_CTX_UPTODATE)
+		strlcat(buf, "uptodate,", bufsize);
+	if (flags & PTLRPC_CTX_DEAD)
+		strlcat(buf, "dead,", bufsize);
+	if (flags & PTLRPC_CTX_ERROR)
+		strlcat(buf, "error,", bufsize);
+	if (flags & PTLRPC_CTX_CACHED)
+		strlcat(buf, "cached,", bufsize);
+	if (flags & PTLRPC_CTX_ETERNAL)
+		strlcat(buf, "eternal,", bufsize);
+	if (buf[0] == '\0')
+		strlcat(buf, "-,", bufsize);
+	if (buf[0] != '\0')	/* drop trailing ',' (last char if truncated) */
+		buf[strlen(buf) - 1] = '\0';
+}
+
+int gss_cli_ctx_sign(struct ptlrpc_cli_ctx *ctx,
+		     struct ptlrpc_request *req)
+{
+	struct gss_cli_ctx      *gctx = ctx2gctx(ctx);
+	__u32		    flags = 0, seq, svc;
+	int		      rc;
+	ENTRY;
+
+	LASSERT(req->rq_reqbuf);
+	LASSERT(req->rq_reqbuf->lm_bufcount >= 2);
+	LASSERT(req->rq_cli_ctx == ctx);
+
+	/* nothing to do for context negotiation RPCs */
+	if (req->rq_ctx_init)
+		RETURN(0);
+
+	svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+	if (req->rq_pack_bulk)
+		flags |= LUSTRE_GSS_PACK_BULK;
+	if (req->rq_pack_udesc)
+		flags |= LUSTRE_GSS_PACK_USER;
+
+redo:
+	seq = atomic_inc_return(&gctx->gc_seq);	/* fresh number per attempt */
+
+	rc = gss_sign_msg(req->rq_reqbuf, gctx->gc_mechctx,
+			  ctx->cc_sec->ps_part,
+			  flags, gctx->gc_proc, seq, svc,
+			  &gctx->gc_handle);
+	if (rc < 0)
+		RETURN(rc);
+
+	/* gss_sign_msg() might take a long time to finish, during which
+	 * more rpcs could be wrapped up and sent out. if we find too many
+	 * of them we should repack this rpc, because sending it too late might
+	 * make its sequence number fall behind the window on the server and
+	 * get it dropped. also applies to gss_cli_ctx_seal().
+	 *
+	 * Note: null mode doesn't check sequence number. */
+	if (svc != SPTLRPC_SVC_NULL &&
+	    atomic_read(&gctx->gc_seq) - seq > GSS_SEQ_REPACK_THRESHOLD) {
+		int behind = atomic_read(&gctx->gc_seq) - seq;
+
+		gss_stat_oos_record_cli(behind);
+		CWARN("req %p: %u behind, retry signing\n", req, behind);
+		goto redo;
+	}
+
+	req->rq_reqdata_len = rc;	/* non-negative gss_sign_msg() result */
+	RETURN(0);
+}
+
+static
+int gss_cli_ctx_handle_err_notify(struct ptlrpc_cli_ctx *ctx,
+				  struct ptlrpc_request *req,
+				  struct gss_header *ghdr)
+{
+	struct gss_err_header *errhdr;
+	int rc;
+
+	LASSERT(ghdr->gh_proc == PTLRPC_GSS_PROC_ERR);
+
+	errhdr = (struct gss_err_header *) ghdr; /* same buffer, error view */
+
+	CWARN("req x"LPU64"/t"LPU64", ctx %p idx "LPX64"(%u->%s): "
+	      "%sserver respond (%08x/%08x)\n",
+	      req->rq_xid, req->rq_transno, ctx,
+	      gss_handle_to_u64(&ctx2gctx(ctx)->gc_handle),
+	      ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec),
+	      sec_is_reverse(ctx->cc_sec) ? "reverse" : "",
+	      errhdr->gh_major, errhdr->gh_minor);
+
+	/* context fini rpc, let it fail */
+	if (req->rq_ctx_fini) {
+		CWARN("context fini rpc failed\n");
+		return -EINVAL;
+	}
+
+	/* reverse sec, just return error, don't expire this ctx because it's
+	 * crucial to callback rpcs. note if the callback rpc failed because
+	 * of a bit flip during network transfer, the client will be evicted
+	 * directly. to be more graceful we probably want to let it retry a
+	 * number of times. */
+	if (sec_is_reverse(ctx->cc_sec))
+		return -EINVAL;
+
+	if (errhdr->gh_major != GSS_S_NO_CONTEXT &&
+	    errhdr->gh_major != GSS_S_BAD_SIG)
+		return -EACCES;
+
+	/* a NO_CONTEXT reply from the server might be caused by context
+	 * expiry or server reboot/failover. we try to refresh to a new ctx,
+	 * which is transparent to the upper layer.
+	 *
+	 * In some cases, our gss handle may incidentally be
+	 * identical to another handle since the handle itself is not
+	 * fully random. In the krb5 case, GSS_S_BAD_SIG will be
+	 * returned, maybe another gss error for other mechanisms.
+	 *
+	 * if we add a new mechanism, make sure the correct error is
+	 * returned in this case. */
+	CWARN("%s: server might lost the context, retrying\n",
+	      errhdr->gh_major == GSS_S_NO_CONTEXT ?  "NO_CONTEXT" : "BAD_SIG");
+
+	sptlrpc_cli_ctx_expire(ctx);
+
+	/* we need to replace the ctx right here, otherwise during
+	 * resend we'll hit the logic in sptlrpc_req_refresh_ctx()
+	 * which keeps the ctx with RESEND flag, thus we'll never
+	 * get rid of this ctx. */
+	rc = sptlrpc_req_replace_dead_ctx(req);
+	if (rc == 0)
+		req->rq_resend = 1;
+
+	return rc;
+}
+
+int gss_cli_ctx_verify(struct ptlrpc_cli_ctx *ctx,
+		       struct ptlrpc_request *req)
+{
+	struct gss_cli_ctx     *gctx;
+	struct gss_header      *ghdr, *reqhdr;
+	struct lustre_msg      *msg = req->rq_repdata;
+	__u32		   major;
+	int		     pack_bulk, swabbed, rc = 0;
+	ENTRY;
+
+	LASSERT(req->rq_cli_ctx == ctx);
+	LASSERT(msg);
+
+	gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
+
+	/* special case for context negotiation, rq_repmsg/rq_replen actually
+	 * are not used currently. but an early reply is always handled below */
+	if (req->rq_ctx_init && !req->rq_early) {
+		req->rq_repmsg = lustre_msg_buf(msg, 1, 0);
+		req->rq_replen = msg->lm_buflens[1];
+		RETURN(0);
+	}
+
+	if (msg->lm_bufcount < 2 || msg->lm_bufcount > 4) {
+		CERROR("unexpected bufcount %u\n", msg->lm_bufcount);
+		RETURN(-EPROTO);
+	}
+
+	swabbed = ptlrpc_rep_need_swab(req);
+
+	ghdr = gss_swab_header(msg, 0, swabbed);
+	if (ghdr == NULL) {
+		CERROR("can't decode gss header\n");
+		RETURN(-EPROTO);
+	}
+
+	/* sanity checks; NOTE(review): reqhdr is read from the reply (msg) */
+	reqhdr = lustre_msg_buf(msg, 0, sizeof(*reqhdr));
+	LASSERT(reqhdr);
+
+	if (ghdr->gh_version != reqhdr->gh_version) {
+		CERROR("gss version %u mismatch, expect %u\n",
+		       ghdr->gh_version, reqhdr->gh_version);
+		RETURN(-EPROTO);
+	}
+
+	switch (ghdr->gh_proc) {
+	case PTLRPC_GSS_PROC_DATA:
+		pack_bulk = ghdr->gh_flags & LUSTRE_GSS_PACK_BULK;
+
+		if (!req->rq_early && !equi(req->rq_pack_bulk == 1, pack_bulk)){
+			CERROR("%s bulk flag in reply\n",
+			       req->rq_pack_bulk ? "missing" : "unexpected");
+			RETURN(-EPROTO);
+		}
+
+		if (ghdr->gh_seq != reqhdr->gh_seq) {
+			CERROR("seqnum %u mismatch, expect %u\n",
+			       ghdr->gh_seq, reqhdr->gh_seq);
+			RETURN(-EPROTO);
+		}
+
+		if (ghdr->gh_svc != reqhdr->gh_svc) {
+			CERROR("svc %u mismatch, expect %u\n",
+			       ghdr->gh_svc, reqhdr->gh_svc);
+			RETURN(-EPROTO);
+		}
+
+		if (swabbed)
+			gss_header_swabber(ghdr);
+
+		major = gss_verify_msg(msg, gctx->gc_mechctx, reqhdr->gh_svc);
+		if (major != GSS_S_COMPLETE) {
+			CERROR("failed to verify reply: %x\n", major);
+			RETURN(-EPERM);
+		}
+
+		if (req->rq_early && reqhdr->gh_svc == SPTLRPC_SVC_NULL) {
+			__u32 cksum;
+
+			cksum = crc32_le(!(__u32) 0,	/* NB: !0 is 1, not ~0 */
+					 lustre_msg_buf(msg, 1, 0),
+					 lustre_msg_buflen(msg, 1));
+			if (cksum != msg->lm_cksum) {
+				CWARN("early reply checksum mismatch: "
+				      "%08x != %08x\n", cksum, msg->lm_cksum);
+				RETURN(-EPROTO);
+			}
+		}
+
+		if (pack_bulk) {
+			/* bulk checksum is right after the lustre msg */
+			if (msg->lm_bufcount < 3) {
+				CERROR("Invalid reply bufcount %u\n",
+				       msg->lm_bufcount);
+				RETURN(-EPROTO);
+			}
+
+			rc = bulk_sec_desc_unpack(msg, 2, swabbed);
+			if (rc) {
+				CERROR("unpack bulk desc: %d\n", rc);
+				RETURN(rc);
+			}
+		}
+
+		req->rq_repmsg = lustre_msg_buf(msg, 1, 0);
+		req->rq_replen = msg->lm_buflens[1];
+		break;
+	case PTLRPC_GSS_PROC_ERR:
+		if (req->rq_early) {
+			CERROR("server return error with early reply\n");
+			rc = -EPROTO;
+		} else {
+			rc = gss_cli_ctx_handle_err_notify(ctx, req, ghdr);
+		}
+		break;
+	default:
+		CERROR("unknown gss proc %d\n", ghdr->gh_proc);
+		rc = -EPROTO;
+	}
+
+	RETURN(rc);
+}
+
+int gss_cli_ctx_seal(struct ptlrpc_cli_ctx *ctx,
+		     struct ptlrpc_request *req)
+{	/* wrap the clear buffer rq_clrbuf into the wire buffer rq_reqbuf */
+	struct gss_cli_ctx      *gctx;
+	rawobj_t		 hdrobj, msgobj, token;
+	struct gss_header       *ghdr;
+	__u32		    buflens[2], major;
+	int		      wiresize, rc;
+	ENTRY;
+
+	LASSERT(req->rq_clrbuf);
+	LASSERT(req->rq_cli_ctx == ctx);
+	LASSERT(req->rq_reqlen);
+
+	gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
+
+	/* final clear data length */
+	req->rq_clrdata_len = lustre_msg_size_v2(req->rq_clrbuf->lm_bufcount,
+						 req->rq_clrbuf->lm_buflens);
+
+	/* calculate wire data length */
+	buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+	buflens[1] = gss_cli_payload(&gctx->gc_base, req->rq_clrdata_len, 1);
+	wiresize = lustre_msg_size_v2(2, buflens);
+
+	/* allocate wire buffer */
+	if (req->rq_pool) {
+		/* pre-allocated */
+		LASSERT(req->rq_reqbuf);
+		LASSERT(req->rq_reqbuf != req->rq_clrbuf);
+		LASSERT(req->rq_reqbuf_len >= wiresize);
+	} else {
+		OBD_ALLOC_LARGE(req->rq_reqbuf, wiresize);
+		if (!req->rq_reqbuf)
+			RETURN(-ENOMEM);
+		req->rq_reqbuf_len = wiresize;
+	}
+
+	lustre_init_msg_v2(req->rq_reqbuf, 2, buflens, NULL);
+	req->rq_reqbuf->lm_secflvr = req->rq_flvr.sf_rpc;
+
+	/* gss header */
+	ghdr = lustre_msg_buf(req->rq_reqbuf, 0, 0);
+	ghdr->gh_version = PTLRPC_GSS_VERSION;
+	ghdr->gh_sp = (__u8) ctx->cc_sec->ps_part;
+	ghdr->gh_flags = 0;
+	ghdr->gh_proc = gctx->gc_proc;
+	ghdr->gh_svc = SPTLRPC_SVC_PRIV;
+	ghdr->gh_handle.len = gctx->gc_handle.len;
+	memcpy(ghdr->gh_handle.data, gctx->gc_handle.data, gctx->gc_handle.len);
+	if (req->rq_pack_bulk)
+		ghdr->gh_flags |= LUSTRE_GSS_PACK_BULK;
+	if (req->rq_pack_udesc)
+		ghdr->gh_flags |= LUSTRE_GSS_PACK_USER;
+
+redo:
+	ghdr->gh_seq = atomic_inc_return(&gctx->gc_seq);
+
+	/* buffer objects */
+	hdrobj.len = PTLRPC_GSS_HEADER_SIZE;
+	hdrobj.data = (__u8 *) ghdr;
+	msgobj.len = req->rq_clrdata_len;
+	msgobj.data = (__u8 *) req->rq_clrbuf;
+	token.len = lustre_msg_buflen(req->rq_reqbuf, 1);
+	token.data = lustre_msg_buf(req->rq_reqbuf, 1, 0);
+
+	major = lgss_wrap(gctx->gc_mechctx, &hdrobj, &msgobj,
+			  req->rq_clrbuf_len, &token);
+	if (major != GSS_S_COMPLETE) {
+		CERROR("priv: wrap message error: %08x\n", major);
+		GOTO(err_free, rc = -EPERM);
+	}
+	LASSERT(token.len <= buflens[1]);
+
+	/* see explanation in gss_cli_ctx_sign() */
+	if (unlikely(atomic_read(&gctx->gc_seq) - ghdr->gh_seq >
+		     GSS_SEQ_REPACK_THRESHOLD)) {
+		int behind = atomic_read(&gctx->gc_seq) - ghdr->gh_seq;
+
+		gss_stat_oos_record_cli(behind);
+		CWARN("req %p: %u behind, retry sealing\n", req, behind);
+
+		/* redo: takes the new sequence number; don't burn one here */
+		goto redo;
+	}
+
+	/* now set the final wire data length */
+	req->rq_reqdata_len = lustre_shrink_msg(req->rq_reqbuf, 1, token.len,0);
+	RETURN(0);
+
+err_free:
+	if (!req->rq_pool) {	/* only free what was allocated above */
+		OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+		req->rq_reqbuf = NULL;
+		req->rq_reqbuf_len = 0;
+	}
+	RETURN(rc);
+}
+
+int gss_cli_ctx_unseal(struct ptlrpc_cli_ctx *ctx,
+		       struct ptlrpc_request *req)
+{
+	struct gss_cli_ctx      *gctx;
+	struct gss_header       *ghdr;
+	struct lustre_msg       *msg = req->rq_repdata;
+	int		      msglen, pack_bulk, swabbed, rc;
+	__u32		    major;
+	ENTRY;
+
+	LASSERT(req->rq_cli_ctx == ctx);
+	LASSERT(req->rq_ctx_init == 0);
+	LASSERT(msg);
+
+	gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
+	swabbed = ptlrpc_rep_need_swab(req);
+
+	ghdr = gss_swab_header(msg, 0, swabbed);
+	if (ghdr == NULL) {
+		CERROR("can't decode gss header\n");
+		RETURN(-EPROTO);
+	}
+
+	/* sanity checks */
+	if (ghdr->gh_version != PTLRPC_GSS_VERSION) {
+		CERROR("gss version %u mismatch, expect %u\n",
+		       ghdr->gh_version, PTLRPC_GSS_VERSION);
+		RETURN(-EPROTO);
+	}
+
+	switch (ghdr->gh_proc) {
+	case PTLRPC_GSS_PROC_DATA:
+		pack_bulk = ghdr->gh_flags & LUSTRE_GSS_PACK_BULK;
+
+		if (!req->rq_early && !equi(req->rq_pack_bulk == 1, pack_bulk)){
+			CERROR("%s bulk flag in reply\n",
+			       req->rq_pack_bulk ? "missing" : "unexpected");
+			RETURN(-EPROTO);
+		}
+
+		if (swabbed)
+			gss_header_swabber(ghdr);
+
+		/* use rq_repdata_len as buffer size, which assumes unseal
+		 * doesn't need extra memory space. for precise control, we'd
+		 * better calculate the actual buffer size as
+		 * (repbuf_len - offset - repdata_len) */
+		major = gss_unseal_msg(gctx->gc_mechctx, msg,
+				       &msglen, req->rq_repdata_len);
+		if (major != GSS_S_COMPLETE) {
+			CERROR("failed to unwrap reply: %x\n", major);
+			rc = -EPERM;
+			break;
+		}
+
+		swabbed = __lustre_unpack_msg(msg, msglen); /* <0 on failure */
+		if (swabbed < 0) {
+			CERROR("Failed to unpack after decryption\n");
+			RETURN(-EPROTO);
+		}
+
+		if (msg->lm_bufcount < 1) {
+			CERROR("Invalid reply buffer: empty\n");
+			RETURN(-EPROTO);
+		}
+
+		if (pack_bulk) {
+			if (msg->lm_bufcount < 2) {
+				CERROR("bufcount %u: missing bulk sec desc\n",
+				       msg->lm_bufcount);
+				RETURN(-EPROTO);
+			}
+
+			/* bulk checksum is the last segment */
+			if (bulk_sec_desc_unpack(msg, msg->lm_bufcount - 1,
+						 swabbed))
+				RETURN(-EPROTO);
+		}
+
+		req->rq_repmsg = lustre_msg_buf(msg, 0, 0);
+		req->rq_replen = msg->lm_buflens[0];
+
+		rc = 0;
+		break;
+	case PTLRPC_GSS_PROC_ERR:
+		if (req->rq_early) {
+			CERROR("server return error with early reply\n");
+			rc = -EPROTO;
+		} else {
+			rc = gss_cli_ctx_handle_err_notify(ctx, req, ghdr);
+		}
+		break;
+	default:
+		CERROR("unexpected proc %d\n", ghdr->gh_proc);
+		rc = -EPERM;
+	}
+
+	RETURN(rc);
+}
+
+/*********************************************
+ * reverse context installation	      *
+ *********************************************/
+
+static inline
+int gss_install_rvs_svc_ctx(struct obd_import *imp,
+			    struct gss_sec *gsec,
+			    struct gss_cli_ctx *gctx)
+{	/* plain pass-through to gss_svc_upcall_install_rvs_ctx() */
+	return gss_svc_upcall_install_rvs_ctx(imp, gsec, gctx);
+}
+
+/*********************************************
+ * GSS security APIs			 *
+ *********************************************/
+int gss_sec_create_common(struct gss_sec *gsec,
+			  struct ptlrpc_sec_policy *policy,
+			  struct obd_import *imp,
+			  struct ptlrpc_svc_ctx *svcctx,
+			  struct sptlrpc_flavor *sf)
+{
+	struct ptlrpc_sec   *sec;
+
+	LASSERT(imp);
+	LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_GSS);
+
+	gsec->gs_mech = lgss_subflavor_to_mech(
+				SPTLRPC_FLVR_BASE_SUB(sf->sf_rpc));
+	if (!gsec->gs_mech) {
+		CERROR("gss backend 0x%x not found\n",
+		       SPTLRPC_FLVR_BASE_SUB(sf->sf_rpc));
+		return -EOPNOTSUPP;
+	}
+
+	spin_lock_init(&gsec->gs_lock);
+	gsec->gs_rvs_hdl = 0ULL;
+
+	/* initialize upper ptlrpc_sec */
+	sec = &gsec->gs_base;
+	sec->ps_policy = policy;
+	atomic_set(&sec->ps_refcount, 0);
+	atomic_set(&sec->ps_nctx, 0);
+	sec->ps_id = sptlrpc_get_next_secid();
+	sec->ps_flvr = *sf;
+	sec->ps_import = class_import_get(imp);	/* put on destroy */
+	spin_lock_init(&sec->ps_lock);
+	INIT_LIST_HEAD(&sec->ps_gc_list);
+
+	if (!svcctx) {
+		sec->ps_gc_interval = GSS_GC_INTERVAL;
+	} else {
+		LASSERT(sec_is_reverse(sec));
+
+		/* never do gc on reverse sec */
+		sec->ps_gc_interval = 0;
+	}
+
+	/* paired with sptlrpc_enc_pool_del_user() in gss_sec_destroy_common() */
+	if (SPTLRPC_FLVR_BULK_SVC(sec->ps_flvr.sf_rpc) == SPTLRPC_BULK_SVC_PRIV)
+		sptlrpc_enc_pool_add_user();
+
+	CDEBUG(D_SEC, "create %s%s@%p\n", (svcctx ? "reverse " : ""),
+	       policy->sp_name, gsec);
+	return 0;
+}
+
+void gss_sec_destroy_common(struct gss_sec *gsec)
+{
+	struct ptlrpc_sec      *sec = &gsec->gs_base;
+	ENTRY;
+
+	LASSERT(sec->ps_import);
+	LASSERT(atomic_read(&sec->ps_refcount) == 0);
+	LASSERT(atomic_read(&sec->ps_nctx) == 0);
+
+	if (gsec->gs_mech) {
+		lgss_mech_put(gsec->gs_mech);
+		gsec->gs_mech = NULL;
+	}
+
+	class_import_put(sec->ps_import); /* ref from gss_sec_create_common() */
+
+	if (SPTLRPC_FLVR_BULK_SVC(sec->ps_flvr.sf_rpc) == SPTLRPC_BULK_SVC_PRIV)
+		sptlrpc_enc_pool_del_user();
+
+	EXIT;
+}
+
+void gss_sec_kill(struct ptlrpc_sec *sec)
+{
+	sec->ps_dying = 1;	/* only sets the flag; nothing is torn down here */
+}
+
+int gss_cli_ctx_init_common(struct ptlrpc_sec *sec,
+			    struct ptlrpc_cli_ctx *ctx,
+			    struct ptlrpc_ctx_ops *ctxops,
+			    struct vfs_cred *vcred)
+{
+	struct gss_cli_ctx    *gctx = ctx2gctx(ctx);
+
+	gctx->gc_win = 0;
+	atomic_set(&gctx->gc_seq, 0);	/* sequence numbers start from 0 */
+
+	INIT_HLIST_NODE(&ctx->cc_cache);
+	atomic_set(&ctx->cc_refcount, 0);
+	ctx->cc_sec = sec;
+	ctx->cc_ops = ctxops;
+	ctx->cc_expire = 0;
+	ctx->cc_flags = PTLRPC_CTX_NEW;
+	ctx->cc_vcred = *vcred;		/* copied by value */
+	spin_lock_init(&ctx->cc_lock);
+	INIT_LIST_HEAD(&ctx->cc_req_list);
+	INIT_LIST_HEAD(&ctx->cc_gc_chain);
+
+	/* take a ref on belonging sec, balanced in ctx destroying */
+	atomic_inc(&sec->ps_refcount);
+	/* statistic only */
+	atomic_inc(&sec->ps_nctx);
+
+	CDEBUG(D_SEC, "%s@%p: create ctx %p(%u->%s)\n",
+	       sec->ps_policy->sp_name, ctx->cc_sec,
+	       ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+	return 0;	/* always succeeds */
+}
+
+/*
+ * return value:
+ *   1: the context has been taken care of by someone else
+ *   0: proceed to really destroy the context locally
+ */
+int gss_cli_ctx_fini_common(struct ptlrpc_sec *sec,
+			    struct ptlrpc_cli_ctx *ctx)
+{
+	struct gss_cli_ctx *gctx = ctx2gctx(ctx);
+
+	LASSERT(atomic_read(&sec->ps_nctx) > 0);
+	LASSERT(atomic_read(&ctx->cc_refcount) == 0);
+	LASSERT(ctx->cc_sec == sec);
+
+	/*
+	 * remove UPTODATE flag of reverse ctx thus we won't send fini rpc,
+	 * this is to avoid potential problems of client side reverse svc ctx
+	 * being mis-destroyed in various recovery scenarios. anyway client can
+	 * manage its reverse ctx well by associating it with its buddy ctx.
+	 */
+	if (sec_is_reverse(sec))
+		ctx->cc_flags &= ~PTLRPC_CTX_UPTODATE;
+
+	if (gctx->gc_mechctx) {
+		/* the final context fini rpc will use this ctx too, and it's
+		 * asynchronous, finished by request_out_callback(). so we
+		 * take a refcount; whoever finally drops the refcount to
+		 * 0 is responsible for the rest of the destroy. */
+		atomic_inc(&ctx->cc_refcount);
+
+		gss_do_ctx_fini_rpc(gctx);
+		gss_cli_ctx_finalize(gctx);
+
+		if (!atomic_dec_and_test(&ctx->cc_refcount))
+			return 1;
+	}
+
+	if (sec_is_reverse(sec))
+		CWARN("reverse sec %p: destroy ctx %p\n",
+		      ctx->cc_sec, ctx);
+	else
+		CWARN("%s@%p: destroy ctx %p(%u->%s)\n",
+		      sec->ps_policy->sp_name, ctx->cc_sec,
+		      ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+
+	return 0;
+}
+
+static
+int gss_alloc_reqbuf_intg(struct ptlrpc_sec *sec,
+			  struct ptlrpc_request *req,
+			  int svc, int msgsize)
+{
+	int		       bufsize, txtsize;
+	int		       bufcnt = 2;	/* gss header + lustre msg */
+	__u32		     buflens[5];
+	ENTRY;
+
+	/*
+	 * on-wire data layout:
+	 * - gss header
+	 * - lustre message
+	 * - user descriptor (optional)
+	 * - bulk sec descriptor (optional)
+	 * - signature (optional)
+	 *   - svc == NULL: NULL
+	 *   - svc == AUTH: signature of gss header
+	 *   - svc == INTG: signature of all above
+	 *
+	 * if this is context negotiation, reserve fixed space
+	 * at the last (signature) segment regardless of svc mode.
+	 */
+
+	buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+	txtsize = buflens[0];
+
+	buflens[1] = msgsize;
+	if (svc == SPTLRPC_SVC_INTG)
+		txtsize += buflens[1];
+
+	if (req->rq_pack_udesc) {
+		buflens[bufcnt] = sptlrpc_current_user_desc_size();
+		if (svc == SPTLRPC_SVC_INTG)
+			txtsize += buflens[bufcnt];
+		bufcnt++;
+	}
+
+	if (req->rq_pack_bulk) {
+		buflens[bufcnt] = gss_cli_bulk_payload(req->rq_cli_ctx,
+						       &req->rq_flvr,
+						       0, req->rq_bulk_read);
+		if (svc == SPTLRPC_SVC_INTG)
+			txtsize += buflens[bufcnt];
+		bufcnt++;
+	}
+
+	if (req->rq_ctx_init)
+		buflens[bufcnt++] = GSS_CTX_INIT_MAX_LEN;
+	else if (svc != SPTLRPC_SVC_NULL)
+		buflens[bufcnt++] = gss_cli_payload(req->rq_cli_ctx, txtsize,0);
+
+	bufsize = lustre_msg_size_v2(bufcnt, buflens);
+
+	if (!req->rq_reqbuf) {
+		bufsize = size_roundup_power2(bufsize);
+
+		OBD_ALLOC_LARGE(req->rq_reqbuf, bufsize);
+		if (!req->rq_reqbuf)
+			RETURN(-ENOMEM);
+
+		req->rq_reqbuf_len = bufsize;
+	} else {
+		LASSERT(req->rq_pool);	/* an existing buffer must be pooled */
+		LASSERT(req->rq_reqbuf_len >= bufsize);
+		memset(req->rq_reqbuf, 0, bufsize);
+	}
+
+	lustre_init_msg_v2(req->rq_reqbuf, bufcnt, buflens, NULL);
+	req->rq_reqbuf->lm_secflvr = req->rq_flvr.sf_rpc;
+
+	req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 1, msgsize);
+	LASSERT(req->rq_reqmsg);
+
+	/* pack user desc here, later we might leave current user's process */
+	if (req->rq_pack_udesc)
+		sptlrpc_pack_user_desc(req->rq_reqbuf, 2);
+
+	RETURN(0);
+}
+
+static
+int gss_alloc_reqbuf_priv(struct ptlrpc_sec *sec,
+			  struct ptlrpc_request *req,
+			  int msgsize)
+{
+	__u32		     ibuflens[3], wbuflens[2];
+	int		       ibufcnt;
+	int		       clearsize, wiresize;
+	ENTRY;
+
+	LASSERT(req->rq_clrbuf == NULL);
+	LASSERT(req->rq_clrbuf_len == 0);
+
+	/* Inner (clear) buffers
+	 *  - lustre message
+	 *  - user descriptor (optional)
+	 *  - bulk checksum (optional)
+	 */
+	ibufcnt = 1;
+	ibuflens[0] = msgsize;
+
+	if (req->rq_pack_udesc)
+		ibuflens[ibufcnt++] = sptlrpc_current_user_desc_size();
+	if (req->rq_pack_bulk)
+		ibuflens[ibufcnt++] = gss_cli_bulk_payload(req->rq_cli_ctx,
+							   &req->rq_flvr, 0,
+							   req->rq_bulk_read);
+
+	clearsize = lustre_msg_size_v2(ibufcnt, ibuflens);
+	/* to allow append padding during encryption */
+	clearsize += GSS_MAX_CIPHER_BLOCK;
+
+	/* Wrapper (wire) buffers
+	 *  - gss header
+	 *  - cipher text
+	 */
+	wbuflens[0] = PTLRPC_GSS_HEADER_SIZE;
+	wbuflens[1] = gss_cli_payload(req->rq_cli_ctx, clearsize, 1);
+	wiresize = lustre_msg_size_v2(2, wbuflens);
+
+	if (req->rq_pool) {
+		/* rq_reqbuf is preallocated */
+		LASSERT(req->rq_reqbuf);
+		LASSERT(req->rq_reqbuf_len >= wiresize);
+
+		memset(req->rq_reqbuf, 0, req->rq_reqbuf_len);
+
+		/* if the pre-allocated buffer is big enough, we just pack
+		 * both clear buf & request buf in it, to avoid more alloc. */
+		if (clearsize + wiresize <= req->rq_reqbuf_len) {
+			req->rq_clrbuf =
+				(void *) (((char *) req->rq_reqbuf) + wiresize);
+		} else {
+			CWARN("pre-allocated buf size %d is not enough for "
+			      "both clear (%d) and cipher (%d) text, proceed "
+			      "with extra allocation\n", req->rq_reqbuf_len,
+			      clearsize, wiresize);
+		}
+	}
+
+	if (!req->rq_clrbuf) {	/* not carved out of a pooled rq_reqbuf */
+		clearsize = size_roundup_power2(clearsize);
+
+		OBD_ALLOC_LARGE(req->rq_clrbuf, clearsize);
+		if (!req->rq_clrbuf)
+			RETURN(-ENOMEM);
+	}
+	req->rq_clrbuf_len = clearsize;
+
+	lustre_init_msg_v2(req->rq_clrbuf, ibufcnt, ibuflens, NULL);
+	req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, msgsize);
+
+	if (req->rq_pack_udesc)
+		sptlrpc_pack_user_desc(req->rq_clrbuf, 1);
+
+	RETURN(0);
+}
+
+/*
+ * NOTE: any change of request buffer allocation should also consider
+ * changing the enlarge_reqbuf() series of functions.
+ */
+int gss_alloc_reqbuf(struct ptlrpc_sec *sec,
+		     struct ptlrpc_request *req,
+		     int msgsize)
+{
+	int     svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+
+	LASSERT(!req->rq_pack_bulk ||
+		(req->rq_bulk_read || req->rq_bulk_write));
+
+	switch (svc) {	/* only privacy mode has its own allocator */
+	case SPTLRPC_SVC_NULL:
+	case SPTLRPC_SVC_AUTH:
+	case SPTLRPC_SVC_INTG:
+		return gss_alloc_reqbuf_intg(sec, req, svc, msgsize);
+	case SPTLRPC_SVC_PRIV:
+		return gss_alloc_reqbuf_priv(sec, req, msgsize);
+	default:
+		LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc);
+		return 0;	/* only reached if LASSERTF() returns */
+	}
+}
+
+void gss_free_reqbuf(struct ptlrpc_sec *sec,
+		     struct ptlrpc_request *req)
+{
+	int     privacy;
+	ENTRY;
+
+	LASSERT(!req->rq_pool || req->rq_reqbuf);
+	privacy = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) == SPTLRPC_SVC_PRIV;
+
+	if (!req->rq_clrbuf)
+		goto release_reqbuf;
+
+	/* release clear buffer unless it lies inside the pooled rq_reqbuf */
+	LASSERT(privacy);
+	LASSERT(req->rq_clrbuf_len);
+
+	if (req->rq_pool == NULL ||
+	    req->rq_clrbuf < req->rq_reqbuf ||
+	    (char *) req->rq_clrbuf >=
+	    (char *) req->rq_reqbuf + req->rq_reqbuf_len)
+		OBD_FREE_LARGE(req->rq_clrbuf, req->rq_clrbuf_len);
+
+	req->rq_clrbuf = NULL;
+	req->rq_clrbuf_len = 0;
+
+release_reqbuf:
+	if (!req->rq_pool && req->rq_reqbuf) {	/* pooled buffers are kept */
+		LASSERT(req->rq_reqbuf_len);
+
+		OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+		req->rq_reqbuf = NULL;
+		req->rq_reqbuf_len = 0;
+	}
+
+	EXIT;
+}
+
+static int do_alloc_repbuf(struct ptlrpc_request *req, int bufsize)
+{
+	bufsize = size_roundup_power2(bufsize);	/* rounded size is recorded */
+
+	OBD_ALLOC_LARGE(req->rq_repbuf, bufsize);
+	if (!req->rq_repbuf)
+		return -ENOMEM;
+
+	req->rq_repbuf_len = bufsize;
+	return 0;
+}
+
+static
+int gss_alloc_repbuf_intg(struct ptlrpc_sec *sec,
+			  struct ptlrpc_request *req,
+			  int svc, int msgsize)
+{
+	int	     txtsize;
+	__u32	   buflens[4];
+	int	     bufcnt = 2;	/* gss header + lustre msg */
+	int	     alloc_size;
+
+	/*
+	 * on-wire data layout:
+	 * - gss header
+	 * - lustre message
+	 * - bulk sec descriptor (optional)
+	 * - signature (optional)
+	 *   - svc == NULL: NULL
+	 *   - svc == AUTH: signature of gss header
+	 *   - svc == INTG: signature of all above
+	 *
+	 * if this is context negotiation, reserve fixed space
+	 * at the last (signature) segment regardless of svc mode.
+	 */
+
+	buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+	txtsize = buflens[0];
+
+	buflens[1] = msgsize;
+	if (svc == SPTLRPC_SVC_INTG)
+		txtsize += buflens[1];
+
+	if (req->rq_pack_bulk) {
+		buflens[bufcnt] = gss_cli_bulk_payload(req->rq_cli_ctx,
+						       &req->rq_flvr,
+						       1, req->rq_bulk_read);
+		if (svc == SPTLRPC_SVC_INTG)
+			txtsize += buflens[bufcnt];
+		bufcnt++;
+	}
+
+	if (req->rq_ctx_init)
+		buflens[bufcnt++] = GSS_CTX_INIT_MAX_LEN;
+	else if (svc != SPTLRPC_SVC_NULL)
+		buflens[bufcnt++] = gss_cli_payload(req->rq_cli_ctx, txtsize,0);
+
+	alloc_size = lustre_msg_size_v2(bufcnt, buflens);
+
+	/* add space for early reply */
+	alloc_size += gss_at_reply_off_integ;
+
+	return do_alloc_repbuf(req, alloc_size);
+}
+
+static
+int gss_alloc_repbuf_priv(struct ptlrpc_sec *sec,
+			  struct ptlrpc_request *req,
+			  int msgsize)
+{
+	int	     txtsize;
+	__u32	   buflens[2];
+	int	     bufcnt;
+	int	     alloc_size;
+
+	/* inner buffers; buflens[] is reused for the wrapper further down */
+	bufcnt = 1;
+	buflens[0] = msgsize;
+
+	if (req->rq_pack_bulk)
+		buflens[bufcnt++] = gss_cli_bulk_payload(req->rq_cli_ctx,
+							 &req->rq_flvr,
+							 1, req->rq_bulk_read);
+	txtsize = lustre_msg_size_v2(bufcnt, buflens);
+	txtsize += GSS_MAX_CIPHER_BLOCK;
+
+	/* wrapper buffers: gss header + cipher text sized from txtsize */
+	bufcnt = 2;
+	buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+	buflens[1] = gss_cli_payload(req->rq_cli_ctx, txtsize, 1);
+
+	alloc_size = lustre_msg_size_v2(bufcnt, buflens);
+	/* add space for early reply */
+	alloc_size += gss_at_reply_off_priv;
+
+	return do_alloc_repbuf(req, alloc_size);
+}
+
+int gss_alloc_repbuf(struct ptlrpc_sec *sec,
+		     struct ptlrpc_request *req,
+		     int msgsize)
+{
+	int     svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+	ENTRY;
+
+	LASSERT(!req->rq_pack_bulk ||
+		(req->rq_bulk_read || req->rq_bulk_write));
+
+	switch (svc) {	/* only privacy mode has its own allocator */
+	case SPTLRPC_SVC_NULL:
+	case SPTLRPC_SVC_AUTH:
+	case SPTLRPC_SVC_INTG:
+		return gss_alloc_repbuf_intg(sec, req, svc, msgsize);
+	case SPTLRPC_SVC_PRIV:
+		return gss_alloc_repbuf_priv(sec, req, msgsize);
+	default:
+		LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc);
+		return 0;	/* only reached if LASSERTF() returns */
+	}
+}
+
+void gss_free_repbuf(struct ptlrpc_sec *sec,
+		     struct ptlrpc_request *req)
+{
+	OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len);
+	req->rq_repbuf = NULL;
+	req->rq_repbuf_len = 0;
+	req->rq_repdata = NULL;		/* reply data fields are reset too */
+	req->rq_repdata_len = 0;
+}
+
+static int get_enlarged_msgsize(struct lustre_msg *msg,
+				int segment, int newsize)
+{
+	int save, newmsg_size;
+
+	LASSERT(newsize >= msg->lm_buflens[segment]);
+
+	save = msg->lm_buflens[segment];
+	msg->lm_buflens[segment] = newsize;	/* temporary, restored below */
+	newmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+	msg->lm_buflens[segment] = save;
+
+	return newmsg_size;	/* size @msg would have with the new length */
+}
+
+static int get_enlarged_msgsize2(struct lustre_msg *msg,
+				 int segment1, int newsize1,
+				 int segment2, int newsize2)
+{
+	int save1, save2, newmsg_size;
+
+	LASSERT(newsize1 >= msg->lm_buflens[segment1]);
+	LASSERT(newsize2 >= msg->lm_buflens[segment2]);
+
+	save1 = msg->lm_buflens[segment1];
+	save2 = msg->lm_buflens[segment2];
+	msg->lm_buflens[segment1] = newsize1;	/* temporary, restored below */
+	msg->lm_buflens[segment2] = newsize2;
+	newmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+	msg->lm_buflens[segment1] = save1;
+	msg->lm_buflens[segment2] = save2;
+
+	return newmsg_size;	/* size @msg would have with both new lengths */
+}
+
+static
+int gss_enlarge_reqbuf_intg(struct ptlrpc_sec *sec,
+			    struct ptlrpc_request *req,
+			    int svc,
+			    int segment, int newsize)
+{
+	struct lustre_msg      *newbuf;
+	int		     txtsize, sigsize = 0, i;
+	int		     newmsg_size, newbuf_size;
+
+	/*
+	 * gss header is at seg 0;
+	 * embedded msg is at seg 1;
+	 * signature (if any) is at the last seg
+	 */
+	LASSERT(req->rq_reqbuf);
+	LASSERT(req->rq_reqbuf_len > req->rq_reqlen);
+	LASSERT(req->rq_reqbuf->lm_bufcount >= 2);
+	LASSERT(lustre_msg_buf(req->rq_reqbuf, 1, 0) == req->rq_reqmsg);
+
+	/* 1. compute new embedded msg size */
+	newmsg_size = get_enlarged_msgsize(req->rq_reqmsg, segment, newsize);
+	LASSERT(newmsg_size >= req->rq_reqbuf->lm_buflens[1]);
+
+	/* 2. compute new wrapper msg size */
+	if (svc == SPTLRPC_SVC_NULL) {
+		/* no signature, get size directly */
+		newbuf_size = get_enlarged_msgsize(req->rq_reqbuf,
+						   1, newmsg_size);
+	} else {
+		txtsize = req->rq_reqbuf->lm_buflens[0];
+
+		if (svc == SPTLRPC_SVC_INTG) {
+			for (i = 1; i < req->rq_reqbuf->lm_bufcount; i++)
+				txtsize += req->rq_reqbuf->lm_buflens[i];
+			txtsize += newmsg_size - req->rq_reqbuf->lm_buflens[1];
+		}
+
+		sigsize = gss_cli_payload(req->rq_cli_ctx, txtsize, 0);
+		LASSERT(sigsize >= msg_last_seglen(req->rq_reqbuf));
+
+		newbuf_size = get_enlarged_msgsize2(
+					req->rq_reqbuf,
+					1, newmsg_size,
+					msg_last_segidx(req->rq_reqbuf),
+					sigsize);
+	}
+
+	/* request from pool should always have enough buffer */
+	LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newbuf_size);
+
+	if (req->rq_reqbuf_len < newbuf_size) {
+		newbuf_size = size_roundup_power2(newbuf_size);
+
+		OBD_ALLOC_LARGE(newbuf, newbuf_size);
+		if (newbuf == NULL)
+			RETURN(-ENOMEM);
+
+		memcpy(newbuf, req->rq_reqbuf, req->rq_reqbuf_len);
+
+		OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+		req->rq_reqbuf = newbuf;
+		req->rq_reqbuf_len = newbuf_size;
+		req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 1, 0);
+	}
+
+	/* do enlargement, from wrapper to embedded, from end to beginning */
+	if (svc != SPTLRPC_SVC_NULL)
+		_sptlrpc_enlarge_msg_inplace(req->rq_reqbuf,
+					     msg_last_segidx(req->rq_reqbuf),
+					     sigsize);
+
+	_sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, 1, newmsg_size);
+	_sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize);
+
+	req->rq_reqlen = newmsg_size;
+	RETURN(0);
+}
+
+/*
+ * Enlarge segment "segment" of the embedded request message to "newsize"
+ * for a request that uses the privacy service.
+ *
+ * The embedded message lives in segment 0 of the clear-text buffer
+ * (req->rq_clrbuf).  The function computes the sizes the clear buffer and
+ * the cipher buffer would need, then:
+ *  - for a pooled request whose clear buffer sits inside req->rq_reqbuf,
+ *    slides the clear text further back in that buffer if both still fit;
+ *  - otherwise, if the clear buffer is too small, allocates a new one
+ *    (size rounded up to a power of two), copies the old content and frees
+ *    the old buffer unless it lies inside req->rq_reqbuf;
+ *  - finally grows the outer and then the inner message in place.
+ *
+ * "sec" is not used by this function.
+ *
+ * Returns 0 on success or -ENOMEM if the new clear buffer cannot be
+ * allocated.
+ */
+static
+int gss_enlarge_reqbuf_priv(struct ptlrpc_sec *sec,
+			    struct ptlrpc_request *req,
+			    int segment, int newsize)
+{
+	struct lustre_msg      *newclrbuf;
+	int		     newmsg_size, newclrbuf_size, newcipbuf_size;
+	__u32		   buflens[3];
+
+	/*
+	 * embedded msg is at seg 0 of clear buffer;
+	 * cipher text is at seg 2 of cipher buffer;
+	 */
+	LASSERT(req->rq_pool ||
+		(req->rq_reqbuf == NULL && req->rq_reqbuf_len == 0));
+	LASSERT(req->rq_reqbuf == NULL ||
+		(req->rq_pool && req->rq_reqbuf->lm_bufcount == 3));
+	LASSERT(req->rq_clrbuf);
+	LASSERT(req->rq_clrbuf_len > req->rq_reqlen);
+	LASSERT(lustre_msg_buf(req->rq_clrbuf, 0, 0) == req->rq_reqmsg);
+
+	/* compute new embedded msg size */
+	newmsg_size = get_enlarged_msgsize(req->rq_reqmsg, segment, newsize);
+
+	/* compute new clear buffer size */
+	newclrbuf_size = get_enlarged_msgsize(req->rq_clrbuf, 0, newmsg_size);
+	newclrbuf_size += GSS_MAX_CIPHER_BLOCK;
+
+	/* compute new cipher buffer size */
+	buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+	buflens[1] = gss_cli_payload(req->rq_cli_ctx, buflens[0], 0);
+	buflens[2] = gss_cli_payload(req->rq_cli_ctx, newclrbuf_size, 1);
+	newcipbuf_size = lustre_msg_size_v2(3, buflens);
+
+	/* handle the case that we put both clear buf and cipher buf into
+	 * pre-allocated single buffer. */
+	if (unlikely(req->rq_pool) &&
+	    req->rq_clrbuf >= req->rq_reqbuf &&
+	    (char *) req->rq_clrbuf <
+	    (char *) req->rq_reqbuf + req->rq_reqbuf_len) {
+		/* it couldn't be better we still fit into the
+		 * pre-allocated buffer. */
+		if (newclrbuf_size + newcipbuf_size <= req->rq_reqbuf_len) {
+			void *src, *dst;
+
+			/* move clear text backward. */
+			src = req->rq_clrbuf;
+			dst = (char *) req->rq_reqbuf + newcipbuf_size;
+
+			/* source and destination may overlap, hence memmove */
+			memmove(dst, src, req->rq_clrbuf_len);
+
+			req->rq_clrbuf = (struct lustre_msg *) dst;
+			req->rq_clrbuf_len = newclrbuf_size;
+			req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, 0);
+		} else {
+			/* sadly we have to split out the clear buffer */
+			LASSERT(req->rq_reqbuf_len >= newcipbuf_size);
+			LASSERT(req->rq_clrbuf_len < newclrbuf_size);
+		}
+	}
+
+	/* clear buffer still too small: replace it with a larger allocation */
+	if (req->rq_clrbuf_len < newclrbuf_size) {
+		newclrbuf_size = size_roundup_power2(newclrbuf_size);
+
+		OBD_ALLOC_LARGE(newclrbuf, newclrbuf_size);
+		if (newclrbuf == NULL)
+			RETURN(-ENOMEM);
+
+		memcpy(newclrbuf, req->rq_clrbuf, req->rq_clrbuf_len);
+
+		/* only free the old clear buffer when it is a separate
+		 * allocation, i.e. not located inside rq_reqbuf */
+		if (req->rq_reqbuf == NULL ||
+		    req->rq_clrbuf < req->rq_reqbuf ||
+		    (char *) req->rq_clrbuf >=
+		    (char *) req->rq_reqbuf + req->rq_reqbuf_len) {
+			OBD_FREE_LARGE(req->rq_clrbuf, req->rq_clrbuf_len);
+		}
+
+		req->rq_clrbuf = newclrbuf;
+		req->rq_clrbuf_len = newclrbuf_size;
+		req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, 0);
+	}
+
+	/* grow the wrapper first, then the embedded message */
+	_sptlrpc_enlarge_msg_inplace(req->rq_clrbuf, 0, newmsg_size);
+	_sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize);
+	req->rq_reqlen = newmsg_size;
+
+	RETURN(0);
+}
+
+/*
+ * Enlarge one segment of a request message, picking the helper that
+ * matches the rpc service part of the request flavor: the privacy
+ * service uses gss_enlarge_reqbuf_priv(), the null/auth/integrity
+ * services use gss_enlarge_reqbuf_intg().
+ *
+ * Must not be called on context init/fini requests (asserted).
+ * Returns whatever the selected helper returns; an unknown service trips
+ * an assertion and yields 0.
+ */
+int gss_enlarge_reqbuf(struct ptlrpc_sec *sec,
+		       struct ptlrpc_request *req,
+		       int segment, int newsize)
+{
+	int service;
+	int rc = 0;
+
+	LASSERT(!req->rq_ctx_init && !req->rq_ctx_fini);
+
+	service = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+
+	if (service == SPTLRPC_SVC_PRIV)
+		rc = gss_enlarge_reqbuf_priv(sec, req, segment, newsize);
+	else if (service == SPTLRPC_SVC_NULL ||
+		 service == SPTLRPC_SVC_AUTH ||
+		 service == SPTLRPC_SVC_INTG)
+		rc = gss_enlarge_reqbuf_intg(sec, req, service, segment,
+					     newsize);
+	else
+		LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc);
+
+	return rc;
+}
+
+/*
+ * Install a reverse service context for import "imp": recover the gss
+ * wrappers that embed "sec" and "ctx" and hand them to
+ * gss_install_rvs_svc_ctx(), whose return value is passed back unchanged.
+ */
+int gss_sec_install_rctx(struct obd_import *imp,
+			 struct ptlrpc_sec *sec,
+			 struct ptlrpc_cli_ctx *ctx)
+{
+	struct gss_cli_ctx *gctx = container_of(ctx, struct gss_cli_ctx,
+						gc_base);
+	struct gss_sec *gsec = container_of(sec, struct gss_sec, gs_base);
+
+	return gss_install_rvs_svc_ctx(imp, gsec, gctx);
+}
+
+/********************************************
+ * server side API			  *
+ ********************************************/
+
+/*
+ * Tell whether a request context is flagged as context-init,
+ * context-init-continue or error-notify.  Returns 1 if any of the three
+ * flags is set, 0 otherwise.
+ */
+static inline
+int gss_svc_reqctx_is_special(struct gss_svc_reqctx *grctx)
+{
+	LASSERT(grctx);
+
+	if (grctx->src_init)
+		return 1;
+	if (grctx->src_init_continue)
+		return 1;
+	if (grctx->src_err_notify)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Release a request context: drop the attached service context (if any),
+ * drop the policy reference held through src_base, then free the
+ * structure itself.
+ */
+static
+void gss_svc_reqctx_free(struct gss_svc_reqctx *grctx)
+{
+	struct gss_svc_ctx *svc_ctx = grctx->src_ctx;
+
+	if (svc_ctx != NULL)
+		gss_svc_upcall_put_ctx(svc_ctx);
+
+	sptlrpc_policy_put(grctx->src_base.sc_policy);
+	OBD_FREE_PTR(grctx);
+}
+
+/*
+ * Take one more reference on a request context that is already
+ * referenced (asserted).
+ */
+static inline
+void gss_svc_reqctx_addref(struct gss_svc_reqctx *grctx)
+{
+	atomic_t *refs = &grctx->src_base.sc_refcount;
+
+	LASSERT(atomic_read(refs) > 0);
+	atomic_inc(refs);
+}
+
+/*
+ * Drop one reference on a request context and free it when the count
+ * reaches zero.
+ */
+static inline
+void gss_svc_reqctx_decref(struct gss_svc_reqctx *grctx)
+{
+	atomic_t *refs = &grctx->src_base.sc_refcount;
+
+	LASSERT(atomic_read(refs) > 0);
+
+	if (!atomic_dec_and_test(refs))
+		return;
+
+	gss_svc_reqctx_free(grctx);
+}
+
+/*
+ * Sign the reply held in "rs" for the null/auth/integrity services.
+ *
+ * The embedded reply (segment 1 of the reply buffer) is first shrunk to
+ * req->rq_replen if the two differ, then gss_sign_msg() is run over the
+ * reply buffer and its result becomes the reply data length.  Finally
+ * the reply offset is chosen; for an early reply with the null service
+ * a crc32 of the embedded message is stored in lm_cksum.
+ *
+ * Returns 0 on success or the negative value returned by gss_sign_msg().
+ */
+static
+int gss_svc_sign(struct ptlrpc_request *req,
+		 struct ptlrpc_reply_state *rs,
+		 struct gss_svc_reqctx *grctx,
+		 __u32 svc)
+{
+	__u32   pack_flags;
+	int     signed_len;
+	ENTRY;
+
+	LASSERT(lustre_msg_buf(rs->rs_repbuf, 1, 0) == rs->rs_msg);
+
+	/* the embedded lustre_msg might have been shrunk in the meantime */
+	if (rs->rs_repbuf->lm_buflens[1] != req->rq_replen)
+		lustre_shrink_msg(rs->rs_repbuf, 1, req->rq_replen, 1);
+
+	pack_flags = req->rq_pack_bulk ? LUSTRE_GSS_PACK_BULK : 0;
+
+	signed_len = gss_sign_msg(rs->rs_repbuf, grctx->src_ctx->gsc_mechctx,
+				  LUSTRE_SP_ANY, pack_flags,
+				  PTLRPC_GSS_PROC_DATA,
+				  grctx->src_wirectx.gw_seq, svc, NULL);
+	if (signed_len < 0)
+		RETURN(signed_len);
+
+	rs->rs_repdata_len = signed_len;
+
+	if (unlikely(!req->rq_packed_final)) {
+		/*
+		 * Reply was not packed as final.
+		 * NOTE(review): the crc seed "!(__u32) 0" evaluates to 1,
+		 * not to all-ones; it is kept unchanged here because the
+		 * value is visible on the wire - confirm against the
+		 * verifying side before touching it.
+		 */
+		if (svc == SPTLRPC_SVC_NULL)
+			rs->rs_repbuf->lm_cksum = crc32_le(!(__u32) 0,
+					lustre_msg_buf(rs->rs_repbuf, 1, 0),
+					lustre_msg_buflen(rs->rs_repbuf, 1));
+		req->rq_reply_off = 0;
+		RETURN(0);
+	}
+
+	if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)
+		req->rq_reply_off = gss_at_reply_off_integ;
+	else
+		req->rq_reply_off = 0;
+
+	RETURN(0);
+}
+
+/*
+ * Build a gss error-notification reply carrying "major"/"minor" for
+ * request "req".
+ *
+ * The request context is marked as error-notify with no reserved
+ * payload, a single-buffer reply of sizeof(struct ptlrpc_body) is packed,
+ * and segment 0 of the reply buffer is filled in as a gss error header
+ * with an empty context handle.
+ *
+ * Returns 0 on success or the error from lustre_pack_reply_v2().
+ */
+int gss_pack_err_notify(struct ptlrpc_request *req, __u32 major, __u32 minor)
+{
+	struct gss_svc_reqctx     *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+	struct ptlrpc_reply_state *rs;
+	struct gss_err_header     *errhdr;
+	int			body_len = sizeof(struct ptlrpc_body);
+	int			rc;
+	ENTRY;
+
+	/* the OBD_FAIL_SVCGSS_ERR_NOTIFY fault-injection check is disabled */
+
+	/* must be set before packing the reply */
+	grctx->src_reserve_len = 0;
+	grctx->src_err_notify = 1;
+
+	rc = lustre_pack_reply_v2(req, 1, &body_len, NULL, 0);
+	if (rc != 0) {
+		CERROR("could not pack reply, err %d\n", rc);
+		RETURN(rc);
+	}
+
+	rs = req->rq_reply_state;
+
+	/*
+	 * NOTE(review): the size check looks at segment 1 while the header
+	 * is written into segment 0 - confirm which one is intended.
+	 */
+	LASSERT(rs->rs_repbuf->lm_buflens[1] >= sizeof(*errhdr));
+
+	/* gss error header */
+	errhdr = lustre_msg_buf(rs->rs_repbuf, 0, 0);
+	errhdr->gh_version = PTLRPC_GSS_VERSION;
+	errhdr->gh_flags = 0;
+	errhdr->gh_proc = PTLRPC_GSS_PROC_ERR;
+	errhdr->gh_major = major;
+	errhdr->gh_minor = minor;
+	errhdr->gh_handle.len = 0; /* fake context handle */
+
+	rs->rs_repdata_len = lustre_msg_size_v2(rs->rs_repbuf->lm_bufcount,
+						rs->rs_repbuf->lm_buflens);
+
+	CDEBUG(D_SEC, "prepare gss error notify(0x%x/0x%x) to %s\n",
+	       major, minor, libcfs_nid2str(req->rq_peer.nid));
+	RETURN(0);
+}
+
+/*
+ * Handle a context initiation request (gss proc INIT or CONTINUE_INIT).
+ *
+ * The last segment of the request buffer carries, in this order: a
+ * 32-bit little-endian lustre service type, the target uuid, the reverse
+ * context handle and the input token.  The last three are pulled out
+ * with rawobj_extract().  Once the target device is found the request
+ * is passed to gss_svc_upcall_handle_init(); on success the optional
+ * user descriptor (segment 2) is unpacked and rq_reqmsg/rq_reqlen are
+ * pointed at segment 1.
+ *
+ * All payload fields come from the network and are treated as
+ * untrusted; in particular an empty target uuid is rejected before the
+ * terminating NUL is forced into it.
+ *
+ * Returns SECSVC_DROP for any malformed request, otherwise the value
+ * returned by gss_svc_upcall_handle_init().
+ */
+static
+int gss_svc_handle_init(struct ptlrpc_request *req,
+			struct gss_wire_ctx *gw)
+{
+	struct gss_svc_reqctx     *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+	struct lustre_msg	 *reqbuf = req->rq_reqbuf;
+	struct obd_uuid	   *uuid;
+	struct obd_device	 *target;
+	rawobj_t		   uuid_obj, rvs_hdl, in_token;
+	__u32		      lustre_svc;
+	__u32		     *secdata, seclen;
+	int			swabbed, rc;
+	ENTRY;
+
+	CDEBUG(D_SEC, "processing gss init(%d) request from %s\n", gw->gw_proc,
+	       libcfs_nid2str(req->rq_peer.nid));
+
+	req->rq_ctx_init = 1;
+
+	if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) {
+		CERROR("unexpected bulk flag\n");
+		RETURN(SECSVC_DROP);
+	}
+
+	if (gw->gw_proc == PTLRPC_GSS_PROC_INIT && gw->gw_handle.len != 0) {
+		CERROR("proc %u: invalid handle length %u\n",
+		       gw->gw_proc, gw->gw_handle.len);
+		RETURN(SECSVC_DROP);
+	}
+
+	if (reqbuf->lm_bufcount < 3 || reqbuf->lm_bufcount > 4) {
+		CERROR("Invalid bufcount %d\n", reqbuf->lm_bufcount);
+		RETURN(SECSVC_DROP);
+	}
+
+	swabbed = ptlrpc_req_need_swab(req);
+
+	/* ctx initiate payload is in last segment */
+	secdata = lustre_msg_buf(reqbuf, reqbuf->lm_bufcount - 1, 0);
+	seclen = reqbuf->lm_buflens[reqbuf->lm_bufcount - 1];
+
+	/* need at least the service type plus one length word */
+	if (seclen < 4 + 4) {
+		CERROR("sec size %d too small\n", seclen);
+		RETURN(SECSVC_DROP);
+	}
+
+	/* lustre svc type */
+	lustre_svc = le32_to_cpu(*secdata++);
+	seclen -= 4;
+
+	/* extract target uuid, note this code is somewhat fragile
+	 * because it touches the internal structure of obd_uuid */
+	if (rawobj_extract(&uuid_obj, &secdata, &seclen)) {
+		CERROR("failed to extract target uuid\n");
+		RETURN(SECSVC_DROP);
+	}
+
+	/* a zero-length uuid would make the index below wrap around (len is
+	 * unsigned) and write far outside the request buffer */
+	if (uuid_obj.len == 0 || uuid_obj.data == NULL) {
+		CERROR("empty target uuid\n");
+		RETURN(SECSVC_DROP);
+	}
+	uuid_obj.data[uuid_obj.len - 1] = '\0';
+
+	uuid = (struct obd_uuid *) uuid_obj.data;
+	target = class_uuid2obd(uuid);
+	if (!target || target->obd_stopping || !target->obd_set_up) {
+		CERROR("target '%s' is not available for context init (%s)\n",
+		       uuid->uuid, target == NULL ? "no target" :
+		       (target->obd_stopping ? "stopping" : "not set up"));
+		RETURN(SECSVC_DROP);
+	}
+
+	/* extract reverse handle */
+	if (rawobj_extract(&rvs_hdl, &secdata, &seclen)) {
+		CERROR("failed extract reverse handle\n");
+		RETURN(SECSVC_DROP);
+	}
+
+	/* extract token */
+	if (rawobj_extract(&in_token, &secdata, &seclen)) {
+		CERROR("can't extract token\n");
+		RETURN(SECSVC_DROP);
+	}
+
+	rc = gss_svc_upcall_handle_init(req, grctx, gw, target, lustre_svc,
+					&rvs_hdl, &in_token);
+	if (rc != SECSVC_OK)
+		RETURN(rc);
+
+	if (grctx->src_ctx->gsc_usr_mds || grctx->src_ctx->gsc_usr_oss ||
+	    grctx->src_ctx->gsc_usr_root)
+		CWARN("create svc ctx %p: user from %s authenticated as %s\n",
+		      grctx->src_ctx, libcfs_nid2str(req->rq_peer.nid),
+		      grctx->src_ctx->gsc_usr_mds ? "mds" :
+			(grctx->src_ctx->gsc_usr_oss ? "oss" : "root"));
+	else
+		CWARN("create svc ctx %p: accept user %u from %s\n",
+		      grctx->src_ctx, grctx->src_ctx->gsc_uid,
+		      libcfs_nid2str(req->rq_peer.nid));
+
+	/* optional user descriptor sits in segment 2, which requires the
+	 * four-segment layout */
+	if (gw->gw_flags & LUSTRE_GSS_PACK_USER) {
+		if (reqbuf->lm_bufcount < 4) {
+			CERROR("missing user descriptor\n");
+			RETURN(SECSVC_DROP);
+		}
+		if (sptlrpc_unpack_user_desc(reqbuf, 2, swabbed)) {
+			CERROR("Mal-formed user descriptor\n");
+			RETURN(SECSVC_DROP);
+		}
+
+		req->rq_pack_udesc = 1;
+		req->rq_user_desc = lustre_msg_buf(reqbuf, 2, 0);
+	}
+
+	req->rq_reqmsg = lustre_msg_buf(reqbuf, 1, 0);
+	req->rq_reqlen = lustre_msg_buflen(reqbuf, 1);
+
+	RETURN(rc);
+}
+
+/*
+ * Verify a request sent with the null, auth or integrity service and
+ * locate its optional descriptors.  The last segment must be the gss
+ * signature.
+ *
+ * For any service other than null the sequence number is checked, the
+ * message is verified with gss_verify_msg(), and, for non-reverse
+ * contexts, the sequence number is checked a second time.  Afterwards
+ * the user descriptor and bulk security descriptor are unpacked from the
+ * segments following segment 1 when the wire flags announce them.
+ *
+ * "*major" receives the gss major status.  Returns 0 on success,
+ * -EINVAL for a malformed message, -EACCES for a replayed or
+ * unverifiable one.
+ */
+static
+int gss_svc_verify_request(struct ptlrpc_request *req,
+			   struct gss_svc_reqctx *grctx,
+			   struct gss_wire_ctx *gw,
+			   __u32 *major)
+{
+	struct gss_svc_ctx *gctx = grctx->src_ctx;
+	struct lustre_msg  *msg = req->rq_reqbuf;
+	int		 seg = 2;	/* next optional segment */
+	int		 swabbed;
+	ENTRY;
+
+	*major = GSS_S_COMPLETE;
+
+	if (msg->lm_bufcount < 2) {
+		CERROR("Too few segments (%u) in request\n", msg->lm_bufcount);
+		RETURN(-EINVAL);
+	}
+
+	/* the null service carries nothing to verify */
+	if (gw->gw_svc != SPTLRPC_SVC_NULL) {
+		if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 0)) {
+			CERROR("phase 0: discard replayed req: seq %u\n",
+			       gw->gw_seq);
+			*major = GSS_S_DUPLICATE_TOKEN;
+			RETURN(-EACCES);
+		}
+
+		*major = gss_verify_msg(msg, gctx->gsc_mechctx, gw->gw_svc);
+		if (*major != GSS_S_COMPLETE) {
+			CERROR("failed to verify request: %x\n", *major);
+			RETURN(-EACCES);
+		}
+
+		if (gctx->gsc_reverse == 0 &&
+		    gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 1)) {
+			CERROR("phase 1+: discard replayed req: seq %u\n",
+			       gw->gw_seq);
+			*major = GSS_S_DUPLICATE_TOKEN;
+			RETURN(-EACCES);
+		}
+	}
+
+	swabbed = ptlrpc_req_need_swab(req);
+
+	/* user descriptor */
+	if (gw->gw_flags & LUSTRE_GSS_PACK_USER) {
+		if (msg->lm_bufcount < (seg + 1)) {
+			CERROR("no user desc included\n");
+			RETURN(-EINVAL);
+		}
+
+		if (sptlrpc_unpack_user_desc(msg, seg, swabbed)) {
+			CERROR("Mal-formed user descriptor\n");
+			RETURN(-EINVAL);
+		}
+
+		req->rq_pack_udesc = 1;
+		req->rq_user_desc = lustre_msg_buf(msg, seg, 0);
+		seg++;
+	}
+
+	/* bulk security descriptor */
+	if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) {
+		if (msg->lm_bufcount < (seg + 1)) {
+			CERROR("missing bulk sec descriptor\n");
+			RETURN(-EINVAL);
+		}
+
+		if (bulk_sec_desc_unpack(msg, seg, swabbed))
+			RETURN(-EINVAL);
+
+		req->rq_pack_bulk = 1;
+		grctx->src_reqbsd = lustre_msg_buf(msg, seg, 0);
+		grctx->src_reqbsd_size = lustre_msg_buflen(msg, seg);
+	}
+
+	/* the embedded request is always segment 1 */
+	req->rq_reqmsg = lustre_msg_buf(msg, 1, 0);
+	req->rq_reqlen = msg->lm_buflens[1];
+	RETURN(0);
+}
+
+/*
+ * Decrypt a privacy-service request in place and locate its optional
+ * descriptors.
+ *
+ * The sequence number is checked before and after gss_unseal_msg(); the
+ * decrypted message is then unpacked, and the user descriptor and bulk
+ * security descriptor are taken from the segments following segment 0
+ * when the wire flags announce them.
+ *
+ * "*major" receives the gss major status.  Returns 0 on success,
+ * -EINVAL for a malformed message, -EACCES for a replayed or
+ * undecryptable one.
+ */
+static
+int gss_svc_unseal_request(struct ptlrpc_request *req,
+			   struct gss_svc_reqctx *grctx,
+			   struct gss_wire_ctx *gw,
+			   __u32 *major)
+{
+	struct gss_svc_ctx *gctx = grctx->src_ctx;
+	struct lustre_msg  *msg = req->rq_reqbuf;
+	int		 seg = 1;	/* next optional segment */
+	int		 clear_len;
+	int		 swabbed;
+	ENTRY;
+
+	if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 0)) {
+		CERROR("phase 0: discard replayed req: seq %u\n", gw->gw_seq);
+		*major = GSS_S_DUPLICATE_TOKEN;
+		RETURN(-EACCES);
+	}
+
+	*major = gss_unseal_msg(gctx->gsc_mechctx, msg,
+				&clear_len, req->rq_reqdata_len);
+	if (*major != GSS_S_COMPLETE) {
+		CERROR("failed to unwrap request: %x\n", *major);
+		RETURN(-EACCES);
+	}
+
+	if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 1)) {
+		CERROR("phase 1+: discard replayed req: seq %u\n", gw->gw_seq);
+		*major = GSS_S_DUPLICATE_TOKEN;
+		RETURN(-EACCES);
+	}
+
+	/* the decrypted content is a lustre_msg of its own */
+	swabbed = __lustre_unpack_msg(msg, clear_len);
+	if (swabbed < 0) {
+		CERROR("Failed to unpack after decryption\n");
+		RETURN(-EINVAL);
+	}
+	req->rq_reqdata_len = clear_len;
+
+	if (msg->lm_bufcount < 1) {
+		CERROR("Invalid buffer: is empty\n");
+		RETURN(-EINVAL);
+	}
+
+	/* user descriptor */
+	if (gw->gw_flags & LUSTRE_GSS_PACK_USER) {
+		if (msg->lm_bufcount < seg + 1) {
+			CERROR("no user descriptor included\n");
+			RETURN(-EINVAL);
+		}
+
+		if (sptlrpc_unpack_user_desc(msg, seg, swabbed)) {
+			CERROR("Mal-formed user descriptor\n");
+			RETURN(-EINVAL);
+		}
+
+		req->rq_pack_udesc = 1;
+		req->rq_user_desc = lustre_msg_buf(msg, seg, 0);
+		seg++;
+	}
+
+	/* bulk security descriptor */
+	if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) {
+		if (msg->lm_bufcount < seg + 1) {
+			CERROR("no bulk checksum included\n");
+			RETURN(-EINVAL);
+		}
+
+		if (bulk_sec_desc_unpack(msg, seg, swabbed))
+			RETURN(-EINVAL);
+
+		req->rq_pack_bulk = 1;
+		grctx->src_reqbsd = lustre_msg_buf(msg, seg, 0);
+		grctx->src_reqbsd_size = lustre_msg_buflen(msg, seg);
+	}
+
+	/* the embedded request is segment 0 of the decrypted message */
+	req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 0, 0);
+	req->rq_reqlen = req->rq_reqbuf->lm_buflens[0];
+	RETURN(0);
+}
+
+/*
+ * Handle a normal data request: look up the service context named by
+ * the wire handle, then verify (null/auth/integrity) or unseal (privacy)
+ * the request.
+ *
+ * Returns SECSVC_OK on success.  On failure an error notification is
+ * sent back only for GSS_S_NO_CONTEXT and GSS_S_BAD_SIG, in which case
+ * SECSVC_COMPLETE is returned; everything else yields SECSVC_DROP.
+ */
+static
+int gss_svc_handle_data(struct ptlrpc_request *req,
+			struct gss_wire_ctx *gw)
+{
+	struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+	__u32		  major = 0;
+	int		    rc = 0;
+	ENTRY;
+
+	grctx->src_ctx = gss_svc_upcall_get_ctx(req, gw);
+	if (grctx->src_ctx == NULL) {
+		major = GSS_S_NO_CONTEXT;
+		goto notify;
+	}
+
+	if (gw->gw_svc == SPTLRPC_SVC_PRIV) {
+		rc = gss_svc_unseal_request(req, grctx, gw, &major);
+	} else if (gw->gw_svc == SPTLRPC_SVC_NULL ||
+		   gw->gw_svc == SPTLRPC_SVC_AUTH ||
+		   gw->gw_svc == SPTLRPC_SVC_INTG) {
+		rc = gss_svc_verify_request(req, grctx, gw, &major);
+	} else {
+		CERROR("unsupported gss service %d\n", gw->gw_svc);
+		rc = -EINVAL;
+	}
+
+	if (rc == 0)
+		RETURN(SECSVC_OK);
+
+	CERROR("svc %u failed: major 0x%08x: req xid "LPU64" ctx %p idx "
+	       LPX64"(%u->%s)\n", gw->gw_svc, major, req->rq_xid,
+	       grctx->src_ctx, gss_handle_to_u64(&gw->gw_handle),
+	       grctx->src_ctx->gsc_uid, libcfs_nid2str(req->rq_peer.nid));
+
+notify:
+	/* the client is only told about NO_CONTEXT/BAD_SIG, which might
+	 * happen after a server reboot, so that it can recover */
+	if ((major == GSS_S_NO_CONTEXT || major == GSS_S_BAD_SIG) &&
+	    gss_pack_err_notify(req, major, 0) == 0)
+		RETURN(SECSVC_COMPLETE);
+
+	RETURN(SECSVC_DROP);
+}
+
+/*
+ * Handle a context destroy request.  No reply is sent for it.
+ *
+ * The context named by the wire handle is looked up, the request must
+ * use the integrity service and pass gss_svc_verify_request(); the
+ * context is then destroyed.  A missing or malformed user descriptor is
+ * logged and ignored at that point.
+ *
+ * Returns SECSVC_OK once the context has been destroyed, SECSVC_DROP
+ * otherwise.
+ */
+static
+int gss_svc_handle_destroy(struct ptlrpc_request *req,
+			   struct gss_wire_ctx *gw)
+{
+	struct gss_svc_reqctx  *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+	__u32		   major;
+	ENTRY;
+
+	req->rq_no_reply = 1;
+	req->rq_ctx_fini = 1;
+
+	grctx->src_ctx = gss_svc_upcall_get_ctx(req, gw);
+	if (grctx->src_ctx == NULL) {
+		CDEBUG(D_SEC, "invalid gss context handle for destroy.\n");
+		RETURN(SECSVC_DROP);
+	}
+
+	if (gw->gw_svc != SPTLRPC_SVC_INTG) {
+		CERROR("svc %u is not supported in destroy.\n", gw->gw_svc);
+		RETURN(SECSVC_DROP);
+	}
+
+	if (gss_svc_verify_request(req, grctx, gw, &major) != 0)
+		RETURN(SECSVC_DROP);
+
+	CWARN("destroy svc ctx %p idx "LPX64" (%u->%s)\n",
+	      grctx->src_ctx, gss_handle_to_u64(&gw->gw_handle),
+	      grctx->src_ctx->gsc_uid, libcfs_nid2str(req->rq_peer.nid));
+
+	gss_svc_upcall_destroy_ctx(grctx->src_ctx);
+
+	/* nothing more to do without a user descriptor */
+	if (!(gw->gw_flags & LUSTRE_GSS_PACK_USER))
+		RETURN(SECSVC_OK);
+
+	if (req->rq_reqbuf->lm_bufcount < 4) {
+		CERROR("missing user descriptor, ignore it\n");
+		RETURN(SECSVC_OK);
+	}
+
+	if (sptlrpc_unpack_user_desc(req->rq_reqbuf, 2,
+				     ptlrpc_req_need_swab(req))) {
+		CERROR("Mal-formed user descriptor, ignore it\n");
+		RETURN(SECSVC_OK);
+	}
+
+	req->rq_pack_udesc = 1;
+	req->rq_user_desc = lustre_msg_buf(req->rq_reqbuf, 2, 0);
+
+	RETURN(SECSVC_OK);
+}
+
+/*
+ * Server-side entry point for an incoming gss request.
+ *
+ * Decodes the gss header in segment 0, allocates the per-request
+ * context, saves the wire fields in host byte order and dispatches on
+ * the gss procedure (init / continue-init / data / destroy).
+ *
+ * On SECSVC_OK the authentication results of the service context are
+ * copied into the request.  On SECSVC_DROP the per-request context is
+ * freed and detached from the request.  SECSVC_COMPLETE leaves the
+ * context attached.
+ *
+ * Returns SECSVC_OK, SECSVC_COMPLETE or SECSVC_DROP.
+ */
+int gss_svc_accept(struct ptlrpc_sec_policy *policy, struct ptlrpc_request *req)
+{
+	struct gss_header      *ghdr;
+	struct gss_svc_reqctx  *grctx;
+	struct gss_wire_ctx    *gw;
+	int		     swabbed, rc;
+	ENTRY;
+
+	LASSERT(req->rq_reqbuf);
+	LASSERT(req->rq_svc_ctx == NULL);
+
+	if (req->rq_reqbuf->lm_bufcount < 2) {
+		CERROR("buf count only %d\n", req->rq_reqbuf->lm_bufcount);
+		RETURN(SECSVC_DROP);
+	}
+
+	swabbed = ptlrpc_req_need_swab(req);
+
+	ghdr = gss_swab_header(req->rq_reqbuf, 0, swabbed);
+	if (ghdr == NULL) {
+		CERROR("can't decode gss header\n");
+		RETURN(SECSVC_DROP);
+	}
+
+	/* sanity checks */
+	if (ghdr->gh_version != PTLRPC_GSS_VERSION) {
+		CERROR("gss version %u, expect %u\n", ghdr->gh_version,
+		       PTLRPC_GSS_VERSION);
+		RETURN(SECSVC_DROP);
+	}
+
+	req->rq_sp_from = ghdr->gh_sp;
+
+	/* alloc grctx data */
+	OBD_ALLOC_PTR(grctx);
+	if (!grctx)
+		RETURN(SECSVC_DROP);
+
+	grctx->src_base.sc_policy = sptlrpc_policy_get(policy);
+	atomic_set(&grctx->src_base.sc_refcount, 1);
+	req->rq_svc_ctx = &grctx->src_base;
+	gw = &grctx->src_wirectx;
+
+	/* save wire context while the header is still in host byte order */
+	gw->gw_flags = ghdr->gh_flags;
+	gw->gw_proc = ghdr->gh_proc;
+	gw->gw_seq = ghdr->gh_seq;
+	gw->gw_svc = ghdr->gh_svc;
+	rawobj_from_netobj(&gw->gw_handle, &ghdr->gh_handle);
+
+	/* keep original wire header which is subject to checksum
+	 * verification: swab it back to wire byte order */
+	if (swabbed)
+		gss_header_swabber(ghdr);
+
+	/* dispatch on the saved host-order value: from here on the fields
+	 * of ghdr may be in wire byte order and must not be interpreted */
+	switch (gw->gw_proc) {
+	case PTLRPC_GSS_PROC_INIT:
+	case PTLRPC_GSS_PROC_CONTINUE_INIT:
+		rc = gss_svc_handle_init(req, gw);
+		break;
+	case PTLRPC_GSS_PROC_DATA:
+		rc = gss_svc_handle_data(req, gw);
+		break;
+	case PTLRPC_GSS_PROC_DESTROY:
+		rc = gss_svc_handle_destroy(req, gw);
+		break;
+	default:
+		CERROR("unknown proc %u\n", gw->gw_proc);
+		rc = SECSVC_DROP;
+		break;
+	}
+
+	switch (rc) {
+	case SECSVC_OK:
+		LASSERT (grctx->src_ctx);
+
+		/* publish the authentication result on the request */
+		req->rq_auth_gss = 1;
+		req->rq_auth_remote = grctx->src_ctx->gsc_remote;
+		req->rq_auth_usr_mdt = grctx->src_ctx->gsc_usr_mds;
+		req->rq_auth_usr_ost = grctx->src_ctx->gsc_usr_oss;
+		req->rq_auth_usr_root = grctx->src_ctx->gsc_usr_root;
+		req->rq_auth_uid = grctx->src_ctx->gsc_uid;
+		req->rq_auth_mapped_uid = grctx->src_ctx->gsc_mapped_uid;
+		break;
+	case SECSVC_COMPLETE:
+		break;
+	case SECSVC_DROP:
+		gss_svc_reqctx_free(grctx);
+		req->rq_svc_ctx = NULL;
+		break;
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Destroy the gss service context attached to "svc_ctx".  A NULL
+ * "svc_ctx" is a no-op.
+ *
+ * NOTE(review): the attached src_ctx is dereferenced without a NULL
+ * check - confirm that callers only pass fully set-up contexts.
+ */
+void gss_svc_invalidate_ctx(struct ptlrpc_svc_ctx *svc_ctx)
+{
+	struct gss_svc_reqctx  *grctx;
+	ENTRY;
+
+	if (svc_ctx != NULL) {
+		grctx = gss_svc_ctx2reqctx(svc_ctx);
+
+		CWARN("gss svc invalidate ctx %p(%u)\n",
+		      grctx->src_ctx, grctx->src_ctx->gsc_uid);
+		gss_svc_upcall_destroy_ctx(grctx->src_ctx);
+	}
+
+	EXIT;
+}
+
+/*
+ * Size of the gss payload to reserve in a reply.
+ *
+ * For a non-early reply on a special (init / init-continue / err-notify)
+ * context this is the length recorded in src_reserve_len.  An early
+ * reply shares the context of the original request, so the special
+ * flags are ignored for it and, like every ordinary reply, it gets the
+ * size reported by gss_mech_payload().
+ */
+static inline
+int gss_svc_payload(struct gss_svc_reqctx *grctx, int early,
+		    int msgsize, int privacy)
+{
+	int use_reserved = (early == 0 && gss_svc_reqctx_is_special(grctx));
+
+	if (!use_reserved)
+		return gss_mech_payload(NULL, msgsize, privacy);
+
+	return grctx->src_reserve_len;
+}
+
+/*
+ * Size of the bulk security descriptor segment of a reply: always
+ * sizeof(struct ptlrpc_bulk_sec_desc), plus, for bulk reads with the
+ * integrity or privacy bulk service, the corresponding mechanism
+ * payload.  Any other bulk service on a read is a bug.
+ *
+ * "gctx" is not used by this function.
+ */
+static int gss_svc_bulk_payload(struct gss_svc_ctx *gctx,
+				struct sptlrpc_flavor *flvr,
+				int read)
+{
+	int     base = sizeof(struct ptlrpc_bulk_sec_desc);
+	int     extra = 0;
+
+	if (!read)
+		return base;
+
+	switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) {
+	case SPTLRPC_BULK_SVC_NULL:
+		break;
+	case SPTLRPC_BULK_SVC_INTG:
+		extra = gss_mech_payload(NULL, 0, 0);
+		break;
+	case SPTLRPC_BULK_SVC_PRIV:
+		extra = gss_mech_payload(NULL, 0, 1);
+		break;
+	case SPTLRPC_BULK_SVC_AUTH:
+	default:
+		LBUG();
+	}
+
+	return base + extra;
+}
+
+/*
+ * Allocate (or reuse a pre-allocated) reply state for "req" and lay out
+ * its reply buffer for an embedded reply message of "msglen" bytes.
+ *
+ * Two layouts are computed:
+ *  - privacy: the buffer is sized for the wrapper (gss header + cipher
+ *    payload) but initialized with the inner clear-text layout
+ *    (reply message, optional bulk security descriptor);
+ *  - otherwise: gss header, reply message, optional bulk security
+ *    descriptor and, when needed, a trailing gss payload segment.
+ *
+ * A non-early reply on a special context never uses the privacy layout.
+ * On success the reply state holds a new reference on the request's
+ * service context and is attached to req->rq_reply_state.
+ *
+ * Returns 0 on success, -EPROTO if bulk security was requested on a
+ * non-bulk rpc, -ENOMEM if the reply state cannot be allocated.
+ */
+int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen)
+{
+	struct gss_svc_reqctx       *grctx;
+	struct ptlrpc_reply_state   *rs;
+	int			  early, privacy, svc, bsd_off = 0;
+	__u32			ibuflens[2], buflens[4];
+	int			  ibufcnt = 0, bufcnt;
+	int			  txtsize, wmsg_size, rs_size;
+	ENTRY;
+
+	LASSERT(msglen % 8 == 0);
+
+	if (req->rq_pack_bulk && !req->rq_bulk_read && !req->rq_bulk_write) {
+		CERROR("client request bulk sec on non-bulk rpc\n");
+		RETURN(-EPROTO);
+	}
+
+	svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+	/* a reply that is not packed as final is an early reply */
+	early = (req->rq_packed_final == 0);
+
+	grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+	if (!early && gss_svc_reqctx_is_special(grctx))
+		privacy = 0;
+	else
+		privacy = (svc == SPTLRPC_SVC_PRIV);
+
+	if (privacy) {
+		/* inner clear buffers */
+		ibufcnt = 1;
+		ibuflens[0] = msglen;
+
+		if (req->rq_pack_bulk) {
+			LASSERT(grctx->src_reqbsd);
+
+			bsd_off = ibufcnt;
+			ibuflens[ibufcnt++] = gss_svc_bulk_payload(
+							grctx->src_ctx,
+							&req->rq_flvr,
+							req->rq_bulk_read);
+		}
+
+		txtsize = lustre_msg_size_v2(ibufcnt, ibuflens);
+		txtsize += GSS_MAX_CIPHER_BLOCK;
+
+		/* wrapper buffer */
+		bufcnt = 2;
+		buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+		buflens[1] = gss_svc_payload(grctx, early, txtsize, 1);
+	} else {
+		bufcnt = 2;
+		buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+		buflens[1] = msglen;
+
+		/* txtsize accumulates the bytes the signature will cover */
+		txtsize = buflens[0];
+		if (svc == SPTLRPC_SVC_INTG)
+			txtsize += buflens[1];
+
+		if (req->rq_pack_bulk) {
+			LASSERT(grctx->src_reqbsd);
+
+			bsd_off = bufcnt;
+			buflens[bufcnt] = gss_svc_bulk_payload(
+							grctx->src_ctx,
+							&req->rq_flvr,
+							req->rq_bulk_read);
+			if (svc == SPTLRPC_SVC_INTG)
+				txtsize += buflens[bufcnt];
+			bufcnt++;
+		}
+
+		/* trailing gss payload segment, not needed for an ordinary
+		 * reply with the null service */
+		if ((!early && gss_svc_reqctx_is_special(grctx)) ||
+		    svc != SPTLRPC_SVC_NULL)
+			buflens[bufcnt++] = gss_svc_payload(grctx, early,
+							    txtsize, 0);
+	}
+
+	wmsg_size = lustre_msg_size_v2(bufcnt, buflens);
+
+	/* the reply buffer is placed right behind the reply state */
+	rs_size = sizeof(*rs) + wmsg_size;
+	rs = req->rq_reply_state;
+
+	if (rs) {
+		/* pre-allocated */
+		LASSERT(rs->rs_size >= rs_size);
+	} else {
+		OBD_ALLOC_LARGE(rs, rs_size);
+		if (rs == NULL)
+			RETURN(-ENOMEM);
+
+		rs->rs_size = rs_size;
+	}
+
+	rs->rs_repbuf = (struct lustre_msg *) (rs + 1);
+	rs->rs_repbuf_len = wmsg_size;
+
+	/* initialize the buffer */
+	if (privacy) {
+		lustre_init_msg_v2(rs->rs_repbuf, ibufcnt, ibuflens, NULL);
+		rs->rs_msg = lustre_msg_buf(rs->rs_repbuf, 0, msglen);
+	} else {
+		lustre_init_msg_v2(rs->rs_repbuf, bufcnt, buflens, NULL);
+		rs->rs_repbuf->lm_secflvr = req->rq_flvr.sf_rpc;
+
+		rs->rs_msg = lustre_msg_buf(rs->rs_repbuf, 1, 0);
+	}
+
+	/* bsd_off is non-zero only when a bulk descriptor was laid out */
+	if (bsd_off) {
+		grctx->src_repbsd = lustre_msg_buf(rs->rs_repbuf, bsd_off, 0);
+		grctx->src_repbsd_size = lustre_msg_buflen(rs->rs_repbuf,
+							   bsd_off);
+	}
+
+	/* the reply state keeps its own reference, dropped in
+	 * gss_svc_free_rs() */
+	gss_svc_reqctx_addref(grctx);
+	rs->rs_svc_ctx = req->rq_svc_ctx;
+
+	LASSERT(rs->rs_msg);
+	req->rq_reply_state = rs;
+	RETURN(0);
+}
+
+/*
+ * Encrypt the reply held in "rs" for the privacy service.
+ *
+ * The clear-text reply currently occupies rs->rs_repbuf.  A gss header
+ * is built in the tail of that buffer, the clear text is wrapped into a
+ * temporary token buffer with lgss_wrap(), and the reply buffer is then
+ * re-initialized as a two-segment message (gss header, token) and filled
+ * from the temporaries.  Afterwards the pointers to the clear-text reply
+ * are reset so that later accesses are caught.
+ *
+ * Returns 0 on success, -ENOMEM if the token buffer cannot be allocated,
+ * -EPERM if wrapping fails.
+ */
+static int gss_svc_seal(struct ptlrpc_request *req,
+			struct ptlrpc_reply_state *rs,
+			struct gss_svc_reqctx *grctx)
+{
+	struct gss_svc_ctx      *gctx = grctx->src_ctx;
+	rawobj_t		 hdrobj, msgobj, token;
+	struct gss_header       *ghdr;
+	__u8		    *token_buf;
+	int		      token_buflen;
+	__u32		    buflens[2], major;
+	int		      msglen, rc;
+	ENTRY;
+
+	/* get clear data length. note embedded lustre_msg might
+	 * have been shrunk */
+	if (req->rq_replen != lustre_msg_buflen(rs->rs_repbuf, 0))
+		msglen = lustre_shrink_msg(rs->rs_repbuf, 0, req->rq_replen, 1);
+	else
+		msglen = lustre_msg_size_v2(rs->rs_repbuf->lm_bufcount,
+					    rs->rs_repbuf->lm_buflens);
+
+	/* temporarily use tail of buffer to hold gss header data */
+	LASSERT(msglen + PTLRPC_GSS_HEADER_SIZE <= rs->rs_repbuf_len);
+	ghdr = (struct gss_header *) ((char *) rs->rs_repbuf +
+				rs->rs_repbuf_len - PTLRPC_GSS_HEADER_SIZE);
+	ghdr->gh_version = PTLRPC_GSS_VERSION;
+	ghdr->gh_sp = LUSTRE_SP_ANY;
+	ghdr->gh_flags = 0;
+	ghdr->gh_proc = PTLRPC_GSS_PROC_DATA;
+	ghdr->gh_seq = grctx->src_wirectx.gw_seq;
+	ghdr->gh_svc = SPTLRPC_SVC_PRIV;
+	ghdr->gh_handle.len = 0;
+	if (req->rq_pack_bulk)
+		ghdr->gh_flags |= LUSTRE_GSS_PACK_BULK;
+
+	/* allocate temporary cipher buffer */
+	token_buflen = gss_mech_payload(gctx->gsc_mechctx, msglen, 1);
+	OBD_ALLOC_LARGE(token_buf, token_buflen);
+	if (token_buf == NULL)
+		RETURN(-ENOMEM);
+
+	hdrobj.len = PTLRPC_GSS_HEADER_SIZE;
+	hdrobj.data = (__u8 *) ghdr;
+	msgobj.len = msglen;
+	msgobj.data = (__u8 *) rs->rs_repbuf;
+	token.len = token_buflen;
+	token.data = token_buf;
+
+	major = lgss_wrap(gctx->gsc_mechctx, &hdrobj, &msgobj,
+			  rs->rs_repbuf_len - PTLRPC_GSS_HEADER_SIZE, &token);
+	if (major != GSS_S_COMPLETE) {
+		CERROR("wrap message error: %08x\n", major);
+		GOTO(out_free, rc = -EPERM);
+	}
+	LASSERT(token.len <= token_buflen);
+
+	/* we are about to overwrite the data at rs->rs_repbuf; nullify the
+	 * pointers into it to catch further illegal usage. */
+	if (req->rq_pack_bulk) {
+		grctx->src_repbsd = NULL;
+		grctx->src_repbsd_size = 0;
+	}
+
+	/* now fill the actual wire data
+	 * - gss header
+	 * - gss token
+	 */
+	buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+	buflens[1] = token.len;
+
+	rs->rs_repdata_len = lustre_msg_size_v2(2, buflens);
+	LASSERT(rs->rs_repdata_len <= rs->rs_repbuf_len);
+
+	lustre_init_msg_v2(rs->rs_repbuf, 2, buflens, NULL);
+	rs->rs_repbuf->lm_secflvr = req->rq_flvr.sf_rpc;
+
+	/* ghdr still points at the copy parked in the tail of the buffer */
+	memcpy(lustre_msg_buf(rs->rs_repbuf, 0, 0), ghdr,
+	       PTLRPC_GSS_HEADER_SIZE);
+	memcpy(lustre_msg_buf(rs->rs_repbuf, 1, 0), token.data, token.len);
+
+	/* reply offset */
+	if (req->rq_packed_final &&
+	    (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT))
+		req->rq_reply_off = gss_at_reply_off_priv;
+	else
+		req->rq_reply_off = 0;
+
+	/* to catch upper layer's further access */
+	rs->rs_msg = NULL;
+	req->rq_repmsg = NULL;
+	req->rq_replen = 0;
+
+	rc = 0;
+out_free:
+	/* the token buffer is only a staging area and is always released */
+	OBD_FREE_LARGE(token_buf, token_buflen);
+	RETURN(rc);
+}
+
+/*
+ * Protect the reply of "req" before it is sent: sign it for the
+ * null/auth/integrity services or seal it for the privacy service.
+ *
+ * A non-early reply on a special (init / init-continue / err-notify)
+ * context has already been prepared, so only its reply offset is set.
+ *
+ * Returns 0 on success, -EINVAL for an unsupported gss procedure or an
+ * unknown service, or the error reported by gss_svc_sign() /
+ * gss_svc_seal().  Failures of the signing/sealing step are propagated
+ * so that a reply which could not be protected is not treated as ready.
+ */
+int gss_svc_authorize(struct ptlrpc_request *req)
+{
+	struct ptlrpc_reply_state *rs = req->rq_reply_state;
+	struct gss_svc_reqctx     *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+	struct gss_wire_ctx       *gw = &grctx->src_wirectx;
+	int			early, rc;
+	ENTRY;
+
+	/* a reply that is not packed as final is an early reply */
+	early = (req->rq_packed_final == 0);
+
+	if (!early && gss_svc_reqctx_is_special(grctx)) {
+		LASSERT(rs->rs_repdata_len != 0);
+
+		req->rq_reply_off = gss_at_reply_off_integ;
+		RETURN(0);
+	}
+
+	/* early reply could happen in many cases */
+	if (!early &&
+	    gw->gw_proc != PTLRPC_GSS_PROC_DATA &&
+	    gw->gw_proc != PTLRPC_GSS_PROC_DESTROY) {
+		CERROR("proc %d not support\n", gw->gw_proc);
+		RETURN(-EINVAL);
+	}
+
+	LASSERT(grctx->src_ctx);
+
+	switch (gw->gw_svc) {
+	case SPTLRPC_SVC_NULL:
+	case SPTLRPC_SVC_AUTH:
+	case SPTLRPC_SVC_INTG:
+		rc = gss_svc_sign(req, rs, grctx, gw->gw_svc);
+		break;
+	case SPTLRPC_SVC_PRIV:
+		rc = gss_svc_seal(req, rs, grctx);
+		break;
+	default:
+		CERROR("Unknown service %d\n", gw->gw_svc);
+		rc = -EINVAL;
+		break;
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Release a reply state: drop the reference it holds on the request
+ * context, detach the context, and free the reply state itself unless
+ * it was pre-allocated.
+ */
+void gss_svc_free_rs(struct ptlrpc_reply_state *rs)
+{
+	LASSERT(rs->rs_svc_ctx);
+
+	gss_svc_reqctx_decref(container_of(rs->rs_svc_ctx,
+					   struct gss_svc_reqctx, src_base));
+	rs->rs_svc_ctx = NULL;
+
+	if (rs->rs_prealloc)
+		return;
+
+	OBD_FREE_LARGE(rs, rs->rs_size);
+}
+
+/*
+ * Free the gss request context embedding "ctx".  The context must no
+ * longer be referenced (asserted).
+ */
+void gss_svc_free_ctx(struct ptlrpc_svc_ctx *ctx)
+{
+	struct gss_svc_reqctx *grctx;
+
+	LASSERT(atomic_read(&ctx->sc_refcount) == 0);
+
+	grctx = gss_svc_ctx2reqctx(ctx);
+	gss_svc_reqctx_free(grctx);
+}
+
+/*
+ * Set up the client context "cli_ctx" as the reverse of the service
+ * context "svc_ctx": duplicate the service handle, copy the mechanism
+ * context in reverse direction, duplicate the reverse handle and mark
+ * the client context up to date.
+ *
+ * Returns 0 on success.  Any failure undoes the steps already taken and
+ * returns -ENOMEM.
+ */
+int gss_copy_rvc_cli_ctx(struct ptlrpc_cli_ctx *cli_ctx,
+			 struct ptlrpc_svc_ctx *svc_ctx)
+{
+	struct gss_cli_ctx     *cli_gctx = ctx2gctx(cli_ctx);
+	struct gss_svc_ctx     *svc_gctx = gss_svc_ctx2gssctx(svc_ctx);
+	struct gss_ctx	 *rvs_mechctx = NULL;
+
+	LASSERT(cli_gctx);
+	LASSERT(svc_gctx && svc_gctx->gsc_mechctx);
+
+	cli_gctx->gc_win = GSS_SEQ_WIN;
+	cli_gctx->gc_proc = PTLRPC_GSS_PROC_DATA;
+
+	/*
+	 * A reverse ctx may get lost in some recovery situations and be
+	 * re-created from the same svc_ctx.  If callbacks were already sent
+	 * out before that, a new reverse ctx starting at sequence 0 would
+	 * make later callback rpcs look like replays.
+	 *
+	 * Each reverse root ctx records its latest sequence number on its
+	 * buddy svc ctx before being destroyed, so continue from there.
+	 */
+	atomic_set(&cli_gctx->gc_seq, svc_gctx->gsc_rvs_seq);
+
+	if (gss_svc_upcall_dup_handle(&cli_gctx->gc_svc_handle, svc_gctx)) {
+		CERROR("failed to dup svc handle\n");
+		return -ENOMEM;
+	}
+
+	if (lgss_copy_reverse_context(svc_gctx->gsc_mechctx, &rvs_mechctx) !=
+	    GSS_S_COMPLETE) {
+		CERROR("failed to copy mech context\n");
+		goto undo_svc_handle;
+	}
+
+	if (rawobj_dup(&cli_gctx->gc_handle, &svc_gctx->gsc_rvs_hdl)) {
+		CERROR("failed to dup reverse handle\n");
+		goto undo_mechctx;
+	}
+
+	cli_gctx->gc_mechctx = rvs_mechctx;
+	gss_cli_ctx_uptodate(cli_gctx);
+
+	return 0;
+
+undo_mechctx:
+	lgss_delete_sec_context(&rvs_mechctx);
+undo_svc_handle:
+	rawobj_free(&cli_gctx->gc_svc_handle);
+	return -ENOMEM;
+}
+
+/*
+ * Pre-compute the two reply offsets stored in gss_at_reply_off_integ and
+ * gss_at_reply_off_priv.  Each is the size of a three-segment message
+ * built around an early-reply sized lustre message.
+ */
+static void gss_init_at_reply_offset(void)
+{
+	__u32 integ_lens[3];
+	__u32 clear_lens[1];
+	__u32 priv_lens[3];
+	int clear_len;
+
+	/* integrity layout: gss header, early reply, signature */
+	integ_lens[0] = PTLRPC_GSS_HEADER_SIZE;
+	integ_lens[1] = lustre_msg_early_size();
+	integ_lens[2] = gss_cli_payload(NULL, integ_lens[1], 0);
+	gss_at_reply_off_integ = lustre_msg_size_v2(3, integ_lens);
+
+	/* privacy layout: sized from a clear message that wraps the early
+	 * reply in a single segment */
+	clear_lens[0] = lustre_msg_early_size();
+	clear_len = lustre_msg_size_v2(1, clear_lens);
+
+	priv_lens[0] = PTLRPC_GSS_HEADER_SIZE;
+	priv_lens[1] = gss_cli_payload(NULL, clear_len, 0);
+	priv_lens[2] = gss_cli_payload(NULL, clear_len, 1);
+	gss_at_reply_off_priv = lustre_msg_size_v2(3, priv_lens);
+}
+
+/*
+ * Module initialization: bring up lproc, the client and server upcalls,
+ * the kerberos module, the keyring policy and (when built with
+ * HAVE_GSS_PIPEFS) the pipefs policy, then pre-compute the reply
+ * offsets.
+ *
+ * Returns 0 on success.  If a step fails, the steps already completed
+ * are torn down in reverse order and the error of the failing step is
+ * returned.
+ */
+int __init sptlrpc_gss_init(void)
+{
+	int rc;
+
+	rc = gss_init_lproc();
+	if (rc != 0)
+		return rc;
+
+	rc = gss_init_cli_upcall();
+	if (rc != 0)
+		goto undo_lproc;
+
+	rc = gss_init_svc_upcall();
+	if (rc != 0)
+		goto undo_cli_upcall;
+
+	rc = init_kerberos_module();
+	if (rc != 0)
+		goto undo_svc_upcall;
+
+	/* policies are registered last, after everything else has been
+	 * initialized, because they might be used immediately after
+	 * registration. */
+
+	rc = gss_init_keyring();
+	if (rc != 0)
+		goto undo_kerberos;
+
+#ifdef HAVE_GSS_PIPEFS
+	rc = gss_init_pipefs();
+	if (rc != 0)
+		goto undo_keyring;
+#endif
+
+	gss_init_at_reply_offset();
+
+	return 0;
+
+#ifdef HAVE_GSS_PIPEFS
+undo_keyring:
+	gss_exit_keyring();
+#endif
+
+undo_kerberos:
+	cleanup_kerberos_module();
+undo_svc_upcall:
+	gss_exit_svc_upcall();
+undo_cli_upcall:
+	gss_exit_cli_upcall();
+undo_lproc:
+	gss_exit_lproc();
+	return rc;
+}
+
+/*
+ * Module teardown: shut down everything sptlrpc_gss_init() brought up.
+ *
+ * NOTE(review): the keyring is torn down before pipefs here, whereas
+ * init sets the keyring up first; a strict reverse order would exit
+ * pipefs first - confirm whether the order matters.
+ */
+static void __exit sptlrpc_gss_exit(void)
+{
+	gss_exit_keyring();
+#ifdef HAVE_GSS_PIPEFS
+	gss_exit_pipefs();
+#endif
+	cleanup_kerberos_module();
+	gss_exit_svc_upcall();
+	gss_exit_cli_upcall();
+	gss_exit_lproc();
+}
+
+/* module metadata */
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("GSS security policy for Lustre");
+MODULE_LICENSE("GPL");
+
+/* register the module's load and unload entry points */
+module_init(sptlrpc_gss_init);
+module_exit(sptlrpc_gss_exit);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/import.c b/drivers/staging/lustre/lustre/ptlrpc/import.c
new file mode 100644
index 000000000000..47a3c0512739
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/import.c
@@ -0,0 +1,1613 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/import.c
+ *
+ * Author: Mike Shaver <shaver@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <obd_support.h>
+#include <lustre_ha.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_export.h>
+#include <obd.h>
+#include <obd_cksum.h>
+#include <obd_class.h>
+
+#include "ptlrpc_internal.h"
+
+struct ptlrpc_connect_async_args {	/* kept in a connect RPC's rq_async_args */
+	 __u64 pcaa_peer_committed;	/* imp_peer_committed_transno saved before a reconnect; 0 otherwise */
+	int pcaa_initial_connect;	/* 1 when imp_remote_handle was unused at connect time */
+};
+
+/**
+ * Record \a state as the current state of import \a imp and log it, with a
+ * timestamp, in the import's circular state history. Caller holds imp_lock.
+ */
+static void __import_set_state(struct obd_import *imp,
+			       enum lustre_imp_state state)
+{
+	int slot = imp->imp_state_hist_idx;
+
+	imp->imp_state = state;
+	imp->imp_state_hist[slot].ish_state = state;
+	imp->imp_state_hist[slot].ish_time = cfs_time_current_sec();
+	imp->imp_state_hist_idx = (slot + 1) % IMP_STATE_HIST_LEN;
+}
+
+/* Caller holds imp_lock.  A CLOSED import should remain so: no update then. */
+#define IMPORT_SET_STATE_NOLOCK(imp, state)				\
+do {									\
+	if ((imp)->imp_state != LUSTRE_IMP_CLOSED) {			\
+		CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n", \
+		       (imp), obd2cli_tgt((imp)->imp_obd),		\
+		       ptlrpc_import_state_name((imp)->imp_state),	\
+		       ptlrpc_import_state_name(state));		\
+		__import_set_state((imp), (state));			\
+	}								\
+} while (0)
+
+#define IMPORT_SET_STATE(imp, state)					\
+do {	/* IMPORT_SET_STATE_NOLOCK() performed under imp_lock */	\
+	spin_lock(&(imp)->imp_lock);					\
+	IMPORT_SET_STATE_NOLOCK((imp), state);				\
+	spin_unlock(&(imp)->imp_lock);					\
+} while (0)
+
+
+static int ptlrpc_connect_interpret(const struct lu_env *env,
+				    struct ptlrpc_request *request,
+				    void * data, int rc);
+int ptlrpc_import_recovery_state_machine(struct obd_import *imp);
+
+/* Reset an import to LUSTRE_IMP_NEW and start a new generation; returns 0.
+ * This is the only place allowed to move an import out of CLOSED, so the
+ * state is assigned directly rather than through IMPORT_SET_STATE().
+ * Ideally imports would be refcounted and freed after disconnection, as
+ * exports are, but client_obd would then have to keep the peer info itself. */
+int ptlrpc_init_import(struct obd_import *imp)
+{
+	spin_lock(&imp->imp_lock);
+	/* new generation first, then restart from the NEW state */
+	imp->imp_generation += 1;
+	imp->imp_state = LUSTRE_IMP_NEW;
+
+	spin_unlock(&imp->imp_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(ptlrpc_init_import);
+
+#define UUID_STR "_UUID"
+void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len)
+{
+	const size_t sfx = strlen(UUID_STR);	/* length of the "_UUID" tail */
+	size_t skip = prefix ? strlen(prefix) : 0;
+	size_t len;
+
+	if (skip && strncmp(uuid, prefix, skip))
+		skip = 0;	/* uuid does not start with prefix: keep it all */
+	*uuid_start = uuid + skip;
+	len = strlen(*uuid_start);
+	if (len >= sfx && !strncmp(*uuid_start + len - sfx, UUID_STR, sfx))
+		len -= sfx;	/* report the length without the suffix */
+	*uuid_len = len;
+}
+EXPORT_SYMBOL(deuuidify);
+
+/**
+ * Mark a FULL import DISCON and send IMP_EVENT_DISCON; returns 1 if that was
+ * done, 0 if the import was not FULL or \a conn_cnt is not the current epoch.
+ * @imp - import to be disconnected
+ * @conn_cnt - connection count (epoch) of the request that timed out
+ *	     and caused the disconnection.  In some cases, multiple
+ *	     inflight requests can fail to a single target (e.g. OST
+ *	     bulk requests) and if one has already caused a reconnection
+ *	     (increasing the import->conn_cnt) the older failure should
+ *	     not also cause a reconnection.  If zero it forces a reconnect.
+ */
+int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt)
+{
+	int rc = 0;
+
+	spin_lock(&imp->imp_lock);
+
+	if (imp->imp_state == LUSTRE_IMP_FULL &&
+	    (conn_cnt == 0 || conn_cnt == imp->imp_conn_cnt)) {
+		char *target_start;
+		int   target_len;
+
+		deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
+			  &target_start, &target_len);	/* name without "_UUID" */
+
+		if (imp->imp_replayable) {
+			LCONSOLE_WARN("%s: Connection to %.*s (at %s) was "
+			       "lost; in progress operations using this "
+			       "service will wait for recovery to complete\n",
+			       imp->imp_obd->obd_name, target_len, target_start,
+			       libcfs_nid2str(imp->imp_connection->c_peer.nid));
+		} else {
+			LCONSOLE_ERROR_MSG(0x166, "%s: Connection to "
+			       "%.*s (at %s) was lost; in progress "
+			       "operations using this service will fail\n",
+			       imp->imp_obd->obd_name,
+			       target_len, target_start,
+			       libcfs_nid2str(imp->imp_connection->c_peer.nid));
+		}
+		ptlrpc_deactivate_timeouts(imp);
+		IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
+		spin_unlock(&imp->imp_lock);
+
+		if (obd_dump_on_timeout)
+			libcfs_debug_dumplog();
+
+		obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON);
+		rc = 1;	/* tell the caller the import was moved to DISCON */
+	} else {
+		spin_unlock(&imp->imp_lock);	/* the message below reads imp unlocked */
+		CDEBUG(D_HA, "%s: import %p already %s (conn %u, was %u): %s\n",
+		       imp->imp_client->cli_name, imp,
+		       (imp->imp_state == LUSTRE_IMP_FULL &&
+			imp->imp_conn_cnt > conn_cnt) ?
+		       "reconnected" : "not connected", imp->imp_conn_cnt,
+		       conn_cnt, ptlrpc_import_state_name(imp->imp_state));
+	}
+
+	return rc;
+}
+
+/* Invalidate \a imp.  Must be called with imp_lock held; returns unlocked. */
+static void ptlrpc_deactivate_and_unlock_import(struct obd_import *imp)
+{
+	ENTRY;
+	/* spin_is_locked() is always 0 on UP kernels without spinlock debugging
+	 * and would trip a LASSERT there; assert_spin_locked() copes with it. */
+	assert_spin_locked(&imp->imp_lock);
+
+	CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd));
+	imp->imp_invalid = 1;
+	imp->imp_generation++;
+	spin_unlock(&imp->imp_lock);	/* abort and notify without the lock */
+	ptlrpc_abort_inflight(imp);
+	obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE);
+	EXIT;
+}
+
+/*
+ * Lock \a imp and deactivate it (the helper drops imp_lock).  Acts as a barrier:
+ * existing requests are rejected, none are accepted until the import is valid.
+ */
+void ptlrpc_deactivate_import(struct obd_import *imp)
+{
+	spin_lock(&imp->imp_lock);
+	ptlrpc_deactivate_and_unlock_import(imp);	/* releases imp_lock */
+}
+EXPORT_SYMBOL(ptlrpc_deactivate_import);
+
+/* Time from \a now until \a req is due (rq_sent for NEW requests, otherwise
+ * rq_deadline); 0 for timed-out, overdue, waiting or other-phase requests. */
+static unsigned int
+ptlrpc_inflight_deadline(struct ptlrpc_request *req, time_t now)
+{
+	long due;
+
+	switch (req->rq_phase) {
+	case RQ_PHASE_NEW:
+		due = req->rq_sent;
+		break;
+	case RQ_PHASE_RPC:
+		if (req->rq_waiting)
+			return 0;
+		/* fall through */
+	case RQ_PHASE_BULK:
+		due = req->rq_deadline;
+		break;
+	default:
+		return 0;
+	}
+	return (req->rq_timedout || due <= now) ? 0 : due - now;
+}
+
+/* Largest ptlrpc_inflight_deadline() value among the requests currently on
+ * imp_sending_list (0 when there is none); the list is walked under imp_lock. */
+static unsigned int ptlrpc_inflight_timeout(struct obd_import *imp)
+{
+	unsigned int longest = 0;
+	struct ptlrpc_request *req;
+	time_t now = cfs_time_current_sec();
+
+	spin_lock(&imp->imp_lock);
+	list_for_each_entry(req, &imp->imp_sending_list, rq_list)
+		longest = max(longest, ptlrpc_inflight_deadline(req, now));
+	spin_unlock(&imp->imp_lock);
+
+	return longest;
+}
+
+/**
+ * Invalidate import \a imp: deactivate it if necessary, block until all
+ * of its inflight RPCs have completed, then notify the obd to invalidate
+ * its state (ie cancel locks, clear pending requests, etc).  While it
+ * runs imp_inval_count is raised; imp_recovery_waitq is woken at the end.
+ */
+void ptlrpc_invalidate_import(struct obd_import *imp)
+{
+	struct list_head *tmp, *n;
+	struct ptlrpc_request *req;
+	struct l_wait_info lwi;
+	unsigned int timeout;
+	int rc;
+
+	atomic_inc(&imp->imp_inval_count);	/* dropped again before returning */
+
+	if (!imp->imp_invalid || imp->imp_obd->obd_no_recov)
+		ptlrpc_deactivate_import(imp);
+
+	LASSERT(imp->imp_invalid);
+
+	/* Wait forever until inflight == 0. We really can't do it another
+	 * way because in some cases we need to wait for very long reply
+	 * unlink. We can't do anything before that because there is really
+	 * no guarantee that some rdma transfer is not in progress right now. */
+	do {
+		/* Calculate max timeout for waiting on rpcs to error
+		 * out. Use obd_timeout if calculated value is smaller
+		 * than it. */
+		if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) {
+			timeout = ptlrpc_inflight_timeout(imp);
+			timeout += timeout / 3;
+
+			if (timeout == 0)
+				timeout = obd_timeout;
+		} else {
+			/* shorten the interval to make the race more likely */
+			timeout = 1;
+		}
+
+		CDEBUG(D_RPCTRACE,"Sleeping %d sec for inflight to error out\n",
+		       timeout);
+
+		/* Wait for all requests to error out and call completion
+		 * callbacks. Cap it at obd_timeout -- these should all
+		 * have been locally cancelled by ptlrpc_abort_inflight. */
+		lwi = LWI_TIMEOUT_INTERVAL(
+			cfs_timeout_cap(cfs_time_seconds(timeout)),
+			(timeout > 1)?cfs_time_seconds(1):cfs_time_seconds(1)/2,
+			NULL, NULL);
+		rc = l_wait_event(imp->imp_recovery_waitq,
+				  (atomic_read(&imp->imp_inflight) == 0),
+				  &lwi);
+		if (rc) {
+			const char *cli_tgt = obd2cli_tgt(imp->imp_obd);
+
+			CERROR("%s: rc = %d waiting for callback (%d != 0)\n",
+			       cli_tgt, rc,
+			       atomic_read(&imp->imp_inflight));
+
+			spin_lock(&imp->imp_lock);
+			if (atomic_read(&imp->imp_inflight) == 0) {
+				int count = atomic_read(&imp->imp_unregistering);
+
+				/* We know that "unregistering" rpcs only can
+				 * survive in sending or delaying lists (they
+				 * maybe waiting for long reply unlink in
+				 * sluggish nets). Let's check this. If there
+				 * is no inflight and unregistering != 0, this
+				 * is bug. */
+				LASSERTF(count == 0, "Some RPCs are still "
+					 "unregistering: %d\n", count);
+
+				/* Let's save one loop as soon as inflight have
+				 * dropped to zero. No new inflights possible at
+				 * this point. */
+				rc = 0;
+			} else {
+				list_for_each_safe(tmp, n,
+						       &imp->imp_sending_list) {
+					req = list_entry(tmp,
+							     struct ptlrpc_request,
+							     rq_list);
+					DEBUG_REQ(D_ERROR, req,
+						  "still on sending list");
+				}
+				list_for_each_safe(tmp, n,
+						       &imp->imp_delayed_list) {
+					req = list_entry(tmp,
+							     struct ptlrpc_request,
+							     rq_list);
+					DEBUG_REQ(D_ERROR, req,
+						  "still on delayed list");
+				}
+
+				CERROR("%s: RPCs in \"%s\" phase found (%d). "
+				       "Network is sluggish? Waiting them "
+				       "to error out.\n", cli_tgt,
+				       ptlrpc_phase2str(RQ_PHASE_UNREGISTERING),
+				       atomic_read(&imp->
+						       imp_unregistering));
+			}
+			spin_unlock(&imp->imp_lock);
+		  }
+	} while (rc != 0);	/* rc != 0: the wait ended with RPCs still inflight */
+
+	/*
+	 * Let's additionally check that no new rpcs added to import in
+	 * "invalidate" state.
+	 */
+	LASSERT(atomic_read(&imp->imp_inflight) == 0);
+	obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE);
+	sptlrpc_import_flush_all_ctx(imp);
+
+	atomic_dec(&imp->imp_inval_count);
+	wake_up_all(&imp->imp_recovery_waitq);	/* e.g. ptlrpc_reconnect_import() waits for imp_inval_count == 0 */
+}
+EXPORT_SYMBOL(ptlrpc_invalidate_import);
+
+/* Clear imp_invalid under imp_lock, then signal IMP_EVENT_ACTIVE to the obd. */
+void ptlrpc_activate_import(struct obd_import *imp)
+{
+	struct obd_device *owner = imp->imp_obd;
+
+	spin_lock(&imp->imp_lock);
+	imp->imp_invalid = 0;
+	ptlrpc_activate_timeouts(imp);
+	spin_unlock(&imp->imp_lock);
+	obd_import_event(owner, imp, IMP_EVENT_ACTIVE);
+}
+EXPORT_SYMBOL(ptlrpc_activate_import);
+
+/* Handle a failed connection: mark \a imp disconnected, deactivate it when it
+ * is not replayable, and wake the pinger with imp_force_verify set. */
+void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt)
+{
+	ENTRY;
+
+	LASSERT(!imp->imp_dlm_fake);
+
+	if (!ptlrpc_set_import_discon(imp, conn_cnt))
+		goto done;	/* import was not moved to DISCON: nothing to do */
+
+	if (!imp->imp_replayable) {
+		CDEBUG(D_HA, "import %s@%s for %s not replayable, "
+		       "auto-deactivating\n", obd2cli_tgt(imp->imp_obd),
+		       imp->imp_connection->c_remote_uuid.uuid,
+		       imp->imp_obd->obd_name);
+		ptlrpc_deactivate_import(imp);
+	}
+
+	CDEBUG(D_HA, "%s: waking up pinger\n", obd2cli_tgt(imp->imp_obd));
+	spin_lock(&imp->imp_lock);
+	imp->imp_force_verify = 1;
+	spin_unlock(&imp->imp_lock);
+	ptlrpc_pinger_wake_up();
+done:
+	EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_fail_import);
+
+int ptlrpc_reconnect_import(struct obd_import *imp)
+{
+	ptlrpc_set_import_discon(imp, 0);	/* conn_cnt 0 skips the epoch check */
+	/* Force a new connect attempt */
+	ptlrpc_invalidate_import(imp);
+	/* Do a fresh connect next time by zeroing the handle */
+	ptlrpc_disconnect_import(imp, 1);
+	/* Wait for all invalidate calls to finish */
+	if (atomic_read(&imp->imp_inval_count) > 0) {
+		int rc;
+		struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+		rc = l_wait_event(imp->imp_recovery_waitq,
+				  (atomic_read(&imp->imp_inval_count) == 0),
+				  &lwi);
+		if (rc)	/* wait did not complete: logged, then carry on */
+			CERROR("Interrupted, inval=%d\n",
+			       atomic_read(&imp->imp_inval_count));
+	}
+
+	/* Allow reconnect attempts */
+	imp->imp_obd->obd_no_recov = 0;
+	/* Remove 'invalid' flag */
+	ptlrpc_activate_import(imp);
+	/* Attempt a new connect */
+	ptlrpc_recover_import(imp, NULL, 0);	/* result is not checked */
+	return 0;	/* unconditionally reports success */
+}
+EXPORT_SYMBOL(ptlrpc_reconnect_import);
+
+/**
+ * Pick the connection to use for the next connect attempt on import \a imp
+ * (if more than one is present), preferring one not tried since the last
+ * success, else the least recently tried.  Returns 0, or -EINVAL if none exist.
+ */
+static int import_select_connection(struct obd_import *imp)
+{
+	struct obd_import_conn *imp_conn = NULL, *conn;
+	struct obd_export *dlmexp;
+	char *target_start;
+	int target_len, tried_all = 1;
+	ENTRY;
+
+	spin_lock(&imp->imp_lock);
+
+	if (list_empty(&imp->imp_conn_list)) {
+		CERROR("%s: no connections available\n",
+		       imp->imp_obd->obd_name);
+		spin_unlock(&imp->imp_lock);
+		RETURN(-EINVAL);
+	}
+
+	list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
+		CDEBUG(D_HA, "%s: connect to NID %s last attempt "LPU64"\n",
+		       imp->imp_obd->obd_name,
+		       libcfs_nid2str(conn->oic_conn->c_peer.nid),
+		       conn->oic_last_attempt);
+
+		/* If we have not tried this connection since
+		   the last successful attempt, go with this one */
+		if ((conn->oic_last_attempt == 0) ||
+		    cfs_time_beforeq_64(conn->oic_last_attempt,
+				       imp->imp_last_success_conn)) {
+			imp_conn = conn;
+			tried_all = 0;
+			break;
+		}
+
+		/* If all of the connections have already been tried
+		   since the last successful connection; just choose the
+		   least recently used */
+		if (!imp_conn)
+			imp_conn = conn;
+		else if (cfs_time_before_64(conn->oic_last_attempt,
+					    imp_conn->oic_last_attempt))
+			imp_conn = conn;
+	}
+
+	/* if not found, simply choose the current one */
+	if (!imp_conn || imp->imp_force_reconnect) {
+		LASSERT(imp->imp_conn_current);
+		imp_conn = imp->imp_conn_current;
+		tried_all = 0;
+	}
+	LASSERT(imp_conn->oic_conn);
+
+	/* If we've tried everything, and we're back to the beginning of the
+	   list, increase our timeout and try again. It will be reset when
+	   we do finally connect. (FIXME: really we should wait for all network
+	   state associated with the last connection attempt to drain before
+	   trying to reconnect on it.) */
+	if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) {
+		struct adaptive_timeout *at = &imp->imp_at.iat_net_latency;
+		if (at_get(at) < CONNECTION_SWITCH_MAX) {
+			at_measured(at, at_get(at) + CONNECTION_SWITCH_INC);
+			if (at_get(at) > CONNECTION_SWITCH_MAX)
+				at_reset(at, CONNECTION_SWITCH_MAX);
+		}
+		LASSERT(imp_conn->oic_last_attempt);
+		CDEBUG(D_HA, "%s: tried all connections, increasing latency "
+			"to %ds\n", imp->imp_obd->obd_name, at_get(at));
+	}
+
+	imp_conn->oic_last_attempt = cfs_time_current_64();	/* stamp this attempt */
+
+	/* switch connection, don't mind if it's same as the current one */
+	if (imp->imp_connection)
+		ptlrpc_connection_put(imp->imp_connection);
+	imp->imp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
+
+	dlmexp =  class_conn2export(&imp->imp_dlm_handle);
+	LASSERT(dlmexp != NULL);
+	if (dlmexp->exp_connection)
+		ptlrpc_connection_put(dlmexp->exp_connection);
+	dlmexp->exp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
+	class_export_put(dlmexp);	/* presumably balances class_conn2export() - confirm */
+
+	if (imp->imp_conn_current != imp_conn) {
+		if (imp->imp_conn_current) {
+			deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
+				  &target_start, &target_len);
+
+			CDEBUG(D_HA, "%s: Connection changing to"
+			       " %.*s (at %s)\n",
+			       imp->imp_obd->obd_name,
+			       target_len, target_start,
+			       libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
+		}
+
+		imp->imp_conn_current = imp_conn;
+	}
+
+	CDEBUG(D_HA, "%s: import %p using connection %s/%s\n",
+	       imp->imp_obd->obd_name, imp, imp_conn->oic_uuid.uuid,
+	       libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
+
+	spin_unlock(&imp->imp_lock);
+
+	RETURN(0);
+}
+
+/*
+ * Store the transno of the first request on imp_replay_list in \a transno.
+ * Returns 1 if one was stored, 0 if the list is empty.  Needs imp_lock held.
+ */
+static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno)
+{
+	struct list_head *replay = &imp->imp_replay_list;
+	struct ptlrpc_request *head;
+
+	if (list_empty(replay))
+		return 0;
+	head = list_entry(replay->next, struct ptlrpc_request, rq_replay_list);
+	*transno = head->rq_transno;
+	if (head->rq_transno != 0)
+		return 1;
+	/* a request queued for replay must carry a transno */
+	DEBUG_REQ(D_ERROR, head, "zero transno in replay");
+	LBUG();
+	return 1;
+}
+
+/**
+ * Attempt to (re)connect import \a imp: prepare a CONNECT RPC and hand it to
+ * ptlrpcd for sending; the reply is handled by ptlrpc_connect_interpret().
+ * Returns 0 if the request was queued or the import is already FULL, -EINVAL
+ * for a CLOSED import, -EALREADY while CONNECTING, or another error code.
+ */
+int ptlrpc_connect_import(struct obd_import *imp)
+{
+	struct obd_device *obd = imp->imp_obd;
+	int initial_connect = 0;
+	int set_transno = 0;
+	__u64 committed_before_reconnect = 0;
+	struct ptlrpc_request *request;
+	char *bufs[] = { NULL,	/* request buffers; the first slot is left NULL */
+			 obd2cli_tgt(imp->imp_obd),
+			 obd->obd_uuid.uuid,
+			 (char *)&imp->imp_dlm_handle,
+			 (char *)&imp->imp_connect_data };
+	struct ptlrpc_connect_async_args *aa;
+	int rc;
+	ENTRY;
+
+	spin_lock(&imp->imp_lock);
+	if (imp->imp_state == LUSTRE_IMP_CLOSED) {
+		spin_unlock(&imp->imp_lock);
+		CERROR("can't connect to a closed import\n");
+		RETURN(-EINVAL);
+	} else if (imp->imp_state == LUSTRE_IMP_FULL) {
+		spin_unlock(&imp->imp_lock);
+		CERROR("already connected\n");
+		RETURN(0);
+	} else if (imp->imp_state == LUSTRE_IMP_CONNECTING) {
+		spin_unlock(&imp->imp_lock);
+		CERROR("already connecting\n");
+		RETURN(-EALREADY);
+	}
+
+	IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CONNECTING);
+
+	imp->imp_conn_cnt++;
+	imp->imp_resend_replay = 0;
+
+	if (!lustre_handle_is_used(&imp->imp_remote_handle))
+		initial_connect = 1;
+	else
+		committed_before_reconnect = imp->imp_peer_committed_transno;
+
+	set_transno = ptlrpc_first_transno(imp,
+					   &imp->imp_connect_data.ocd_transno);
+	spin_unlock(&imp->imp_lock);
+
+	rc = import_select_connection(imp);
+	if (rc)
+		GOTO(out, rc);
+
+	rc = sptlrpc_import_sec_adapt(imp, NULL, 0);
+	if (rc)
+		GOTO(out, rc);
+
+	/* Reset connect flags to the originally requested flags, in case
+	 * the server is updated on-the-fly we will get the new features. */
+	imp->imp_connect_data.ocd_connect_flags = imp->imp_connect_flags_orig;
+	/* Reset ocd_version each time so the server knows the exact versions */
+	imp->imp_connect_data.ocd_version = LUSTRE_VERSION_CODE;
+	imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
+	imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18;
+
+	rc = obd_reconnect(NULL, imp->imp_obd->obd_self_export, obd,
+			   &obd->obd_uuid, &imp->imp_connect_data, NULL);
+	if (rc)
+		GOTO(out, rc);
+
+	request = ptlrpc_request_alloc(imp, &RQF_MDS_CONNECT);
+	if (request == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	rc = ptlrpc_request_bufs_pack(request, LUSTRE_OBD_VERSION,
+				      imp->imp_connect_op, bufs, NULL);
+	if (rc) {
+		ptlrpc_request_free(request);
+		GOTO(out, rc);
+	}
+
+	/* Report the rpc service time to the server so that it knows how long
+	 * to wait for clients to join recovery */
+	lustre_msg_set_service_time(request->rq_reqmsg,
+				    at_timeout2est(request->rq_timeout));
+
+	/* The amount of time we give the server to process the connect req.
+	 * import_select_connection will increase the net latency on
+	 * repeated reconnect attempts to cover slow networks.
+	 * We override/ignore the server rpc completion estimate here,
+	 * which may be large if this is a reconnect attempt */
+	request->rq_timeout = INITIAL_CONNECT_TIMEOUT;
+	lustre_msg_set_timeout(request->rq_reqmsg, request->rq_timeout);
+
+	lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_NEXT_VER);
+
+	request->rq_no_resend = request->rq_no_delay = 1;
+	request->rq_send_state = LUSTRE_IMP_CONNECTING;
+	/* Allow a slightly larger reply for future growth compatibility */
+	req_capsule_set_size(&request->rq_pill, &RMF_CONNECT_DATA, RCL_SERVER,
+			     sizeof(struct obd_connect_data)+16*sizeof(__u64));
+	ptlrpc_request_set_replen(request);
+	request->rq_interpret_reply = ptlrpc_connect_interpret;
+
+	CLASSERT(sizeof (*aa) <= sizeof (request->rq_async_args));
+	aa = ptlrpc_req_async_args(request);
+	memset(aa, 0, sizeof *aa);
+
+	aa->pcaa_peer_committed = committed_before_reconnect;
+	aa->pcaa_initial_connect = initial_connect;
+
+	if (aa->pcaa_initial_connect) {
+		spin_lock(&imp->imp_lock);
+		imp->imp_replayable = 1;
+		spin_unlock(&imp->imp_lock);
+		lustre_msg_add_op_flags(request->rq_reqmsg,
+					MSG_CONNECT_INITIAL);
+	}
+
+	if (set_transno)	/* ocd_transno was filled from the replay list above */
+		lustre_msg_add_op_flags(request->rq_reqmsg,
+					MSG_CONNECT_TRANSNO);
+
+	DEBUG_REQ(D_RPCTRACE, request, "(re)connect request (timeout %d)",
+		  request->rq_timeout);
+	ptlrpcd_add_req(request, PDL_POLICY_ROUND, -1);
+	rc = 0;
+out:
+	if (rc != 0) {
+		IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);	/* failed: leave CONNECTING */
+	}
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_connect_import);
+
+/* Wake the pinger if imp_force_verify is set on \a imp (sampled under imp_lock). */
+static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp)
+{
+	int wanted;
+
+	spin_lock(&imp->imp_lock);
+	wanted = !!imp->imp_force_verify;
+	spin_unlock(&imp->imp_lock);
+	if (wanted)
+		ptlrpc_pinger_wake_up();
+}
+
+static int ptlrpc_busy_reconnect(int rc)
+{
+	return !(rc != -EBUSY && rc != -EAGAIN);	/* 1 for -EBUSY/-EAGAIN, else 0 */
+}
+
+/**
+ * interpret_reply callback for connect RPCs.
+ * Looks into returned status of connect operation and decides
+ * what to do with the import - i.e. enter recovery, promote it to
+ * full state for normal operations, or disconnect it due to an error.
+ */
+static int ptlrpc_connect_interpret(const struct lu_env *env,
+				    struct ptlrpc_request *request,
+				    void *data, int rc)
+{
+	struct ptlrpc_connect_async_args *aa = data;
+	struct obd_import *imp = request->rq_import;
+	struct client_obd *cli = &imp->imp_obd->u.cli;
+	struct lustre_handle old_hdl;
+	__u64 old_connect_flags;
+	int msg_flags;
+	struct obd_connect_data *ocd;
+	struct obd_export *exp;
+	int ret;
+	ENTRY;
+
+	spin_lock(&imp->imp_lock);
+	if (imp->imp_state == LUSTRE_IMP_CLOSED) {
+		imp->imp_connect_tried = 1;
+		spin_unlock(&imp->imp_lock);
+		RETURN(0);
+	}
+
+	if (rc) {
+		/* if this reconnect to busy export - not need select new target
+		 * for connecting*/
+		imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc);
+		spin_unlock(&imp->imp_lock);
+		ptlrpc_maybe_ping_import_soon(imp);
+		GOTO(out, rc);
+	}
+	spin_unlock(&imp->imp_lock);
+
+	LASSERT(imp->imp_conn_current);
+
+	msg_flags = lustre_msg_get_op_flags(request->rq_repmsg);
+
+	ret = req_capsule_get_size(&request->rq_pill, &RMF_CONNECT_DATA,
+				   RCL_SERVER);
+	/* server replied obd_connect_data is always bigger */
+	ocd = req_capsule_server_sized_get(&request->rq_pill,
+					   &RMF_CONNECT_DATA, ret);
+
+	if (ocd == NULL) {
+		CERROR("%s: no connect data from server\n",
+		       imp->imp_obd->obd_name);
+		rc = -EPROTO;
+		GOTO(out, rc);
+	}
+
+	spin_lock(&imp->imp_lock);
+
+	/* All imports are pingable */
+	imp->imp_pingable = 1;
+	imp->imp_force_reconnect = 0;
+	imp->imp_force_verify = 0;
+
+	imp->imp_connect_data = *ocd;
+
+	CDEBUG(D_HA, "%s: connect to target with instance %u\n",
+	       imp->imp_obd->obd_name, ocd->ocd_instance);
+	exp = class_conn2export(&imp->imp_dlm_handle);
+
+	spin_unlock(&imp->imp_lock);
+
+	/* check that server granted subset of flags we asked for. */
+	if ((ocd->ocd_connect_flags & imp->imp_connect_flags_orig) !=
+	    ocd->ocd_connect_flags) {
+		CERROR("%s: Server didn't granted asked subset of flags: "
+		       "asked="LPX64" grranted="LPX64"\n",
+		       imp->imp_obd->obd_name,imp->imp_connect_flags_orig,
+		       ocd->ocd_connect_flags);
+		GOTO(out, rc = -EPROTO);
+	}
+
+	if (!exp) {
+		/* This could happen if export is cleaned during the
+		   connect attempt */
+		CERROR("%s: missing export after connect\n",
+		       imp->imp_obd->obd_name);
+		GOTO(out, rc = -ENODEV);
+	}
+	old_connect_flags = exp_connect_flags(exp);
+	exp->exp_connect_data = *ocd;
+	imp->imp_obd->obd_self_export->exp_connect_data = *ocd;
+	class_export_put(exp);
+
+	obd_import_event(imp->imp_obd, imp, IMP_EVENT_OCD);
+
+	if (aa->pcaa_initial_connect) {
+		spin_lock(&imp->imp_lock);
+		if (msg_flags & MSG_CONNECT_REPLAYABLE) {
+			imp->imp_replayable = 1;
+			spin_unlock(&imp->imp_lock);
+			CDEBUG(D_HA, "connected to replayable target: %s\n",
+			       obd2cli_tgt(imp->imp_obd));
+		} else {
+			imp->imp_replayable = 0;
+			spin_unlock(&imp->imp_lock);
+		}
+
+		/* if applies, adjust the imp->imp_msg_magic here
+		 * according to reply flags */
+
+		imp->imp_remote_handle =
+				*lustre_msg_get_handle(request->rq_repmsg);
+
+		/* Initial connects are allowed for clients with non-random
+		 * uuids when servers are in recovery.  Simply signal the
+		 * servers replay is complete and wait in REPLAY_WAIT. */
+		if (msg_flags & MSG_CONNECT_RECOVERING) {
+			CDEBUG(D_HA, "connect to %s during recovery\n",
+			       obd2cli_tgt(imp->imp_obd));
+			IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS);
+		} else {
+			IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
+			ptlrpc_activate_import(imp);
+		}
+
+		GOTO(finish, rc = 0);
+	}
+
+	/* Determine what recovery state to move the import to. */
+	if (MSG_CONNECT_RECONNECT & msg_flags) {
+		memset(&old_hdl, 0, sizeof(old_hdl));
+		if (!memcmp(&old_hdl, lustre_msg_get_handle(request->rq_repmsg),
+			    sizeof (old_hdl))) {
+			LCONSOLE_WARN("Reconnect to %s (at @%s) failed due "
+				      "bad handle "LPX64"\n",
+				      obd2cli_tgt(imp->imp_obd),
+				      imp->imp_connection->c_remote_uuid.uuid,
+				      imp->imp_dlm_handle.cookie);
+			GOTO(out, rc = -ENOTCONN);
+		}
+
+		if (memcmp(&imp->imp_remote_handle,
+			   lustre_msg_get_handle(request->rq_repmsg),
+			   sizeof(imp->imp_remote_handle))) {
+			int level = msg_flags & MSG_CONNECT_RECOVERING ?
+				D_HA : D_WARNING;
+
+			/* Bug 16611/14775: if server handle have changed,
+			 * that means some sort of disconnection happened.
+			 * If the server is not in recovery, that also means it
+			 * already erased all of our state because of previous
+			 * eviction. If it is in recovery - we are safe to
+			 * participate since we can reestablish all of our state
+			 * with server again */
+			if ((MSG_CONNECT_RECOVERING & msg_flags)) {
+				CDEBUG(level,"%s@%s changed server handle from "
+				       LPX64" to "LPX64
+				       " but is still in recovery\n",
+				       obd2cli_tgt(imp->imp_obd),
+				       imp->imp_connection->c_remote_uuid.uuid,
+				       imp->imp_remote_handle.cookie,
+				       lustre_msg_get_handle(
+				       request->rq_repmsg)->cookie);
+			} else {
+				LCONSOLE_WARN("Evicted from %s (at %s) "
+					      "after server handle changed from "
+					      LPX64" to "LPX64"\n",
+					      obd2cli_tgt(imp->imp_obd),
+					      imp->imp_connection-> \
+					      c_remote_uuid.uuid,
+					      imp->imp_remote_handle.cookie,
+					      lustre_msg_get_handle(
+					      request->rq_repmsg)->cookie);
+			}
+
+
+			imp->imp_remote_handle =
+				     *lustre_msg_get_handle(request->rq_repmsg);
+
+			if (!(MSG_CONNECT_RECOVERING & msg_flags)) {
+				IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
+				GOTO(finish, rc = 0);
+			}
+
+		} else {
+			CDEBUG(D_HA, "reconnected to %s@%s after partition\n",
+			       obd2cli_tgt(imp->imp_obd),
+			       imp->imp_connection->c_remote_uuid.uuid);
+		}
+
+		if (imp->imp_invalid) {
+			CDEBUG(D_HA, "%s: reconnected but import is invalid; "
+			       "marking evicted\n", imp->imp_obd->obd_name);
+			IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
+		} else if (MSG_CONNECT_RECOVERING & msg_flags) {
+			CDEBUG(D_HA, "%s: reconnected to %s during replay\n",
+			       imp->imp_obd->obd_name,
+			       obd2cli_tgt(imp->imp_obd));
+
+			spin_lock(&imp->imp_lock);
+			imp->imp_resend_replay = 1;
+			spin_unlock(&imp->imp_lock);
+
+			IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
+		} else {
+			IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
+		}
+	} else if ((MSG_CONNECT_RECOVERING & msg_flags) && !imp->imp_invalid) {
+		LASSERT(imp->imp_replayable);
+		imp->imp_remote_handle =
+				*lustre_msg_get_handle(request->rq_repmsg);
+		imp->imp_last_replay_transno = 0;
+		IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
+	} else {
+		DEBUG_REQ(D_HA, request, "%s: evicting (reconnect/recover flags"
+			  " not set: %x)", imp->imp_obd->obd_name, msg_flags);
+		imp->imp_remote_handle =
+				*lustre_msg_get_handle(request->rq_repmsg);
+		IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
+	}
+
+	/* Sanity checks for a reconnected import. */
+	if (!(imp->imp_replayable) != !(msg_flags & MSG_CONNECT_REPLAYABLE)) {
+		CERROR("imp_replayable flag does not match server "
+		       "after reconnect. We should LBUG right here.\n");
+	}
+
+	if (lustre_msg_get_last_committed(request->rq_repmsg) > 0 &&
+	    lustre_msg_get_last_committed(request->rq_repmsg) <
+	    aa->pcaa_peer_committed) {
+		CERROR("%s went back in time (transno "LPD64
+		       " was previously committed, server now claims "LPD64
+		       ")!  See https://bugzilla.lustre.org/show_bug.cgi?"
+		       "id=9646\n",
+		       obd2cli_tgt(imp->imp_obd), aa->pcaa_peer_committed,
+		       lustre_msg_get_last_committed(request->rq_repmsg));
+	}
+
+finish:
+	rc = ptlrpc_import_recovery_state_machine(imp);
+	if (rc != 0) {
+		if (rc == -ENOTCONN) {
+			CDEBUG(D_HA, "evicted/aborted by %s@%s during recovery;"
+			       "invalidating and reconnecting\n",
+			       obd2cli_tgt(imp->imp_obd),
+			       imp->imp_connection->c_remote_uuid.uuid);
+			ptlrpc_connect_import(imp);
+			imp->imp_connect_tried = 1;
+			RETURN(0);
+		}
+	} else {
+
+		spin_lock(&imp->imp_lock);
+		list_del(&imp->imp_conn_current->oic_item);
+		list_add(&imp->imp_conn_current->oic_item,
+			     &imp->imp_conn_list);
+		imp->imp_last_success_conn =
+			imp->imp_conn_current->oic_last_attempt;
+
+		spin_unlock(&imp->imp_lock);
+
+		if (!ocd->ocd_ibits_known &&
+		    ocd->ocd_connect_flags & OBD_CONNECT_IBITS)
+			CERROR("Inodebits aware server returned zero compatible"
+			       " bits?\n");
+
+		if ((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
+		    (ocd->ocd_version > LUSTRE_VERSION_CODE +
+					LUSTRE_VERSION_OFFSET_WARN ||
+		     ocd->ocd_version < LUSTRE_VERSION_CODE -
+					LUSTRE_VERSION_OFFSET_WARN)) {
+			/* Sigh, some compilers do not like #ifdef in the middle
+			   of macro arguments */
+			const char *older = "older. Consider upgrading server "
+					    "or downgrading client";
+			const char *newer = "newer than client version. "
+					    "Consider upgrading client";
+
+			LCONSOLE_WARN("Server %s version (%d.%d.%d.%d) "
+				      "is much %s (%s)\n",
+				      obd2cli_tgt(imp->imp_obd),
+				      OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
+				      OBD_OCD_VERSION_MINOR(ocd->ocd_version),
+				      OBD_OCD_VERSION_PATCH(ocd->ocd_version),
+				      OBD_OCD_VERSION_FIX(ocd->ocd_version),
+				      ocd->ocd_version > LUSTRE_VERSION_CODE ?
+				      newer : older, LUSTRE_VERSION_STRING);
+		}
+
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0)
+		/* Check if server has LU-1252 fix applied to not always swab
+		 * the IR MNE entries. Do this only once per connection.  This
+		 * fixup is version-limited, because we don't want to carry the
+		 * OBD_CONNECT_MNE_SWAB flag around forever, just so long as we
+		 * need interop with unpatched 2.2 servers.  For newer servers,
+		 * the client will do MNE swabbing only as needed.  LU-1644 */
+		if (unlikely((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
+			     !(ocd->ocd_connect_flags & OBD_CONNECT_MNE_SWAB) &&
+			     OBD_OCD_VERSION_MAJOR(ocd->ocd_version) == 2 &&
+			     OBD_OCD_VERSION_MINOR(ocd->ocd_version) == 2 &&
+			     OBD_OCD_VERSION_PATCH(ocd->ocd_version) < 55 &&
+			     strcmp(imp->imp_obd->obd_type->typ_name,
+				    LUSTRE_MGC_NAME) == 0))
+			imp->imp_need_mne_swab = 1;
+		else /* clear if server was upgraded since last connect */
+			imp->imp_need_mne_swab = 0;
+#else
+#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab"
+#endif
+
+		if (ocd->ocd_connect_flags & OBD_CONNECT_CKSUM) {
+			/* We sent to the server ocd_cksum_types with bits set
+			 * for algorithms we understand. The server masked off
+			 * the checksum types it doesn't support */
+			if ((ocd->ocd_cksum_types &
+			     cksum_types_supported_client()) == 0) {
+				LCONSOLE_WARN("The negotiation of the checksum "
+					      "alogrithm to use with server %s "
+					      "failed (%x/%x), disabling "
+					      "checksums\n",
+					      obd2cli_tgt(imp->imp_obd),
+					      ocd->ocd_cksum_types,
+					      cksum_types_supported_client());
+				cli->cl_checksum = 0;
+				cli->cl_supp_cksum_types = OBD_CKSUM_ADLER;
+			} else {
+				cli->cl_supp_cksum_types = ocd->ocd_cksum_types;
+			}
+		} else {
+			/* The server does not support OBD_CONNECT_CKSUM.
+			 * Enforce ADLER for backward compatibility*/
+			cli->cl_supp_cksum_types = OBD_CKSUM_ADLER;
+		}
+		cli->cl_cksum_type =cksum_type_select(cli->cl_supp_cksum_types);
+
+		if (ocd->ocd_connect_flags & OBD_CONNECT_BRW_SIZE)
+			cli->cl_max_pages_per_rpc =
+				min(ocd->ocd_brw_size >> PAGE_CACHE_SHIFT,
+				    cli->cl_max_pages_per_rpc);
+		else if (imp->imp_connect_op == MDS_CONNECT ||
+			 imp->imp_connect_op == MGS_CONNECT)
+			cli->cl_max_pages_per_rpc = 1;
+
+		/* Reset ns_connect_flags only for initial connect. It might be
+		 * changed in while using FS and if we reset it in reconnect
+		 * this leads to losing user settings done before such as
+		 * disable lru_resize, etc. */
+		if (old_connect_flags != exp_connect_flags(exp) ||
+		    aa->pcaa_initial_connect) {
+			CDEBUG(D_HA, "%s: Resetting ns_connect_flags to server "
+			       "flags: "LPX64"\n", imp->imp_obd->obd_name,
+			      ocd->ocd_connect_flags);
+			imp->imp_obd->obd_namespace->ns_connect_flags =
+				ocd->ocd_connect_flags;
+			imp->imp_obd->obd_namespace->ns_orig_connect_flags =
+				ocd->ocd_connect_flags;
+		}
+
+		if ((ocd->ocd_connect_flags & OBD_CONNECT_AT) &&
+		    (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2))
+			/* We need a per-message support flag, because
+			   a. we don't know if the incoming connect reply
+			      supports AT or not (in reply_in_callback)
+			      until we unpack it.
+			   b. failovered server means export and flags are gone
+			      (in ptlrpc_send_reply).
+			   Can only be set when we know AT is supported at
+			   both ends */
+			imp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT;
+		else
+			imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
+
+		if ((ocd->ocd_connect_flags & OBD_CONNECT_FULL20) &&
+		    (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2))
+			imp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18;
+		else
+			imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18;
+
+		LASSERT((cli->cl_max_pages_per_rpc <= PTLRPC_MAX_BRW_PAGES) &&
+			(cli->cl_max_pages_per_rpc > 0));
+	}
+
+out:
+	imp->imp_connect_tried = 1;
+
+	if (rc != 0) {
+		IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);
+		if (rc == -EACCES) {
+			/*
+			 * Give up trying to reconnect
+			 * EACCES means client has no permission for connection
+			 */
+			imp->imp_obd->obd_no_recov = 1;
+			ptlrpc_deactivate_import(imp);
+		}
+
+		if (rc == -EPROTO) {
+			struct obd_connect_data *ocd;
+
+			/* reply message might not be ready */
+			if (request->rq_repmsg == NULL)
+				RETURN(-EPROTO);
+
+			ocd = req_capsule_server_get(&request->rq_pill,
+						     &RMF_CONNECT_DATA);
+			if (ocd &&
+			    (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
+			    (ocd->ocd_version != LUSTRE_VERSION_CODE)) {
+			   /* Actually servers are only supposed to refuse
+			      connection from liblustre clients, so we should
+			      never see this from VFS context */
+				LCONSOLE_ERROR_MSG(0x16a, "Server %s version "
+					"(%d.%d.%d.%d)"
+					" refused connection from this client "
+					"with an incompatible version (%s).  "
+					"Client must be recompiled\n",
+					obd2cli_tgt(imp->imp_obd),
+					OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
+					OBD_OCD_VERSION_MINOR(ocd->ocd_version),
+					OBD_OCD_VERSION_PATCH(ocd->ocd_version),
+					OBD_OCD_VERSION_FIX(ocd->ocd_version),
+					LUSTRE_VERSION_STRING);
+				ptlrpc_deactivate_import(imp);
+				IMPORT_SET_STATE(imp, LUSTRE_IMP_CLOSED);
+			}
+			RETURN(-EPROTO);
+		}
+
+		ptlrpc_maybe_ping_import_soon(imp);
+
+		CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n",
+		       obd2cli_tgt(imp->imp_obd),
+		       (char *)imp->imp_connection->c_remote_uuid.uuid, rc);
+	}
+
+	wake_up_all(&imp->imp_recovery_waitq);
+	RETURN(rc);
+}
+
+/**
+ * Interpret callback for the "completed replay" RPC.
+ *
+ * Drops the imp_replay_inflight count on the request's import.  When the
+ * request finished with rq_status == 0 and the import is not flagged
+ * imp_vbr_failed, the recovery state machine is run; in every other case a
+ * debug message is logged and a new connect is started on the import.
+ *
+ * \param env   not used by this callback
+ * \param req   the completed request
+ * \param data  not used by this callback
+ * \param rc    not used by this callback; req->rq_status is examined instead
+ *
+ * \retval 0 always
+ * \see signal_completed_replay
+ */
+static int completed_replay_interpret(const struct lu_env *env,
+				      struct ptlrpc_request *req,
+				      void * data, int rc)
+{
+	struct obd_import *imp = req->rq_import;
+
+	ENTRY;
+	atomic_dec(&imp->imp_replay_inflight);
+
+	if (req->rq_status == 0 && !imp->imp_vbr_failed) {
+		/* clean completion: let recovery move on */
+		ptlrpc_import_recovery_state_machine(imp);
+		RETURN(0);
+	}
+
+	/* failure of either kind ends in a reconnect; only the log differs */
+	if (imp->imp_vbr_failed) {
+		CDEBUG(D_WARNING,
+		       "%s: version recovery fails, reconnecting\n",
+		       imp->imp_obd->obd_name);
+	} else {
+		CDEBUG(D_HA, "%s: LAST_REPLAY message error: %d, "
+			     "reconnecting\n",
+		       imp->imp_obd->obd_name, req->rq_status);
+	}
+	ptlrpc_connect_import(imp);
+
+	RETURN(0);
+}
+
+/**
+ * Let server know that we have no requests to replay anymore.
+ * Achieved by just sending a PING request carrying the
+ * MSG_LOCK_REPLAY_DONE and MSG_REQ_REPLAY_DONE flags.
+ *
+ * The import's imp_replay_inflight counter is raised for the duration of
+ * the RPC; completed_replay_interpret() lowers it again.
+ *
+ * \param imp  import to send the notification on
+ *
+ * \retval 0        request queued, or the OBD_FAIL_PTLRPC_FINISH_REPLAY
+ *                  fail check fired and nothing was sent
+ * \retval -ENOMEM  the request could not be allocated
+ */
+static int signal_completed_replay(struct obd_import *imp)
+{
+	struct ptlrpc_request *ping;
+	ENTRY;
+
+	if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_FINISH_REPLAY)))
+		RETURN(0);
+
+	LASSERT(atomic_read(&imp->imp_replay_inflight) == 0);
+	atomic_inc(&imp->imp_replay_inflight);
+
+	ping = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING,
+					 LUSTRE_OBD_VERSION, OBD_PING);
+	if (!ping) {
+		/* nothing was sent: undo the in-flight accounting */
+		atomic_dec(&imp->imp_replay_inflight);
+		RETURN(-ENOMEM);
+	}
+
+	ptlrpc_request_set_replen(ping);
+	ping->rq_send_state = LUSTRE_IMP_REPLAY_WAIT;
+	lustre_msg_add_flags(ping->rq_reqmsg,
+			     MSG_LOCK_REPLAY_DONE | MSG_REQ_REPLAY_DONE);
+	if (AT_OFF)
+		ping->rq_timeout = ping->rq_timeout * 3;
+	ping->rq_interpret_reply = completed_replay_interpret;
+
+	ptlrpcd_add_req(ping, PDL_POLICY_ROUND, -1);
+	RETURN(0);
+}
+
+/**
+ * Thread body that invalidates an import and then resumes recovery.
+ *
+ * In kernel code all import invalidation happens in its own
+ * separate thread, so that whatever application happened to encounter
+ * a problem could still be killed or otherwise continue
+ *
+ * \param data  the struct obd_import to invalidate; one reference on it is
+ *              dropped with class_import_put() before the thread returns
+ *
+ * \retval 0 always
+ */
+static int ptlrpc_invalidate_import_thread(void *data)
+{
+	struct obd_import *imp = data;
+
+	ENTRY;
+
+	unshare_fs_struct();
+
+	CDEBUG(D_HA, "thread invalidate import %s to %s@%s\n",
+	       imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
+	       imp->imp_connection->c_remote_uuid.uuid);
+
+	ptlrpc_invalidate_import(imp);
+
+	/* optional debug-log dump, controlled by obd_dump_on_eviction */
+	if (obd_dump_on_eviction) {
+		CERROR("dump the log upon eviction\n");
+		libcfs_debug_dumplog();
+	}
+
+	/* invalidation done: re-enter the state machine in RECOVER state */
+	IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
+	ptlrpc_import_recovery_state_machine(imp);
+
+	/* release the reference that accompanied \a data */
+	class_import_put(imp);
+	RETURN(0);
+}
+
+/**
+ * This is the state machine for client-side recovery on import.
+ *
+ * Typically we have two possible paths. If we came to server and it is not
+ * in recovery, we just enter IMP_EVICTED state, invalidate our import
+ * state and reconnect from scratch.
+ * If we came to server that is in recovery, we enter IMP_REPLAY import state.
+ * We go through our list of requests to replay and send them to server one by
+ * one.
+ * After sending all request from the list we change import state to
+ * IMP_REPLAY_LOCKS and re-request all the locks we believe we have from server
+ * and also all the locks we don't yet have and wait for server to grant us.
+ * After that we send a special "replay completed" request and change import
+ * state to IMP_REPLAY_WAIT.
+ * Upon receiving reply to that "replay completed" RPC we enter IMP_RECOVER
+ * state and resend all requests from sending list.
+ * After that we promote import to FULL state and send all delayed requests
+ * and import is fully operational after that.
+ *
+ * The states are tested one after another within a single call, so an
+ * import may advance through several of them before this function returns.
+ *
+ * \param imp  import being recovered
+ *
+ * \retval 0      no step reported an error
+ * \retval other  PTR_ERR() of a failed kthread_run(), or the non-zero
+ *                result of ldlm_replay_locks(), signal_completed_replay()
+ *                or ptlrpc_resend()
+ */
+int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
+{
+	int rc = 0;
+	int inflight;
+	char *target_start;
+	int target_len;
+
+	ENTRY;
+	if (imp->imp_state == LUSTRE_IMP_EVICTED) {
+		task_t *task;
+
+		deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
+			  &target_start, &target_len);
+		/* Don't care about MGC eviction */
+		if (strcmp(imp->imp_obd->obd_type->typ_name,
+			   LUSTRE_MGC_NAME) != 0) {
+			LCONSOLE_ERROR_MSG(0x167, "%s: This client was evicted "
+					   "by %.*s; in progress operations "
+					   "using this service will fail.\n",
+					   imp->imp_obd->obd_name, target_len,
+					   target_start);
+		}
+		CDEBUG(D_HA, "evicted from %s@%s; invalidating\n",
+		       obd2cli_tgt(imp->imp_obd),
+		       imp->imp_connection->c_remote_uuid.uuid);
+		/* reset vbr_failed flag upon eviction */
+		spin_lock(&imp->imp_lock);
+		imp->imp_vbr_failed = 0;
+		spin_unlock(&imp->imp_lock);
+
+		/* bug 17802:  XXX client_disconnect_export vs connect request
+		 * race. if client will evicted at this time, we start
+		 * invalidate thread without reference to import and import can
+		 * be freed at same time. */
+		class_import_get(imp);
+		task = kthread_run(ptlrpc_invalidate_import_thread, imp,
+				   "ll_imp_inval");
+		if (IS_ERR(task)) {
+			/* fetch the error before logging so the message
+			 * reports the real failure code */
+			rc = PTR_ERR(task);
+			/* the thread never ran, so drop its reference here */
+			class_import_put(imp);
+			CERROR("error starting invalidate thread: %d\n", rc);
+		} else {
+			rc = 0;
+		}
+		RETURN(rc);
+	}
+
+	if (imp->imp_state == LUSTRE_IMP_REPLAY) {
+		CDEBUG(D_HA, "replay requested by %s\n",
+		       obd2cli_tgt(imp->imp_obd));
+		rc = ptlrpc_replay_next(imp, &inflight);
+		if (inflight == 0 &&
+		    atomic_read(&imp->imp_replay_inflight) == 0) {
+			IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS);
+			rc = ldlm_replay_locks(imp);
+			if (rc)
+				GOTO(out, rc);
+		}
+		/* the result of ptlrpc_replay_next() is not propagated */
+		rc = 0;
+	}
+
+	if (imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS) {
+		if (atomic_read(&imp->imp_replay_inflight) == 0) {
+			IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_WAIT);
+			rc = signal_completed_replay(imp);
+			if (rc)
+				GOTO(out, rc);
+		}
+
+	}
+
+	if (imp->imp_state == LUSTRE_IMP_REPLAY_WAIT) {
+		if (atomic_read(&imp->imp_replay_inflight) == 0) {
+			IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
+		}
+	}
+
+	if (imp->imp_state == LUSTRE_IMP_RECOVER) {
+		CDEBUG(D_HA, "reconnected to %s@%s\n",
+		       obd2cli_tgt(imp->imp_obd),
+		       imp->imp_connection->c_remote_uuid.uuid);
+
+		rc = ptlrpc_resend(imp);
+		if (rc)
+			GOTO(out, rc);
+		IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
+		ptlrpc_activate_import(imp);
+
+		deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
+			  &target_start, &target_len);
+		LCONSOLE_INFO("%s: Connection restored to %.*s (at %s)\n",
+			      imp->imp_obd->obd_name,
+			      target_len, target_start,
+			      libcfs_nid2str(imp->imp_connection->c_peer.nid));
+	}
+
+	if (imp->imp_state == LUSTRE_IMP_FULL) {
+		/* wake recovery waiters first, then the delayed requests */
+		wake_up_all(&imp->imp_recovery_waitq);
+		ptlrpc_wake_delayed(imp);
+	}
+
+out:
+	RETURN(rc);
+}
+
+/**
+ * Disconnect an import from its target and move it to a terminal state.
+ *
+ * Unless the owning obd has obd_force set, this waits (bounded by a timeout)
+ * for any recovery in progress and, if the import is then in
+ * LUSTRE_IMP_FULL state, sends a DISCONNECT RPC and waits for it.  In all
+ * non-error paths the import finally has its state set to
+ * LUSTRE_IMP_DISCON or LUSTRE_IMP_CLOSED and imp_remote_handle is zeroed.
+ *
+ * \param imp      import to disconnect
+ * \param noclose  non-zero: final state is LUSTRE_IMP_DISCON;
+ *                 zero: final state is LUSTRE_IMP_CLOSED
+ *
+ * \retval -EINVAL  imp_connect_op is not OST/MDS/MGS_CONNECT (the import
+ *                  state is left untouched in this case)
+ * \retval other    value of rc: initially 0, overwritten by l_wait_event()
+ *                  and then by ptlrpc_queue_wait() when those are reached
+ */
+int ptlrpc_disconnect_import(struct obd_import *imp, int noclose)
+{
+	struct ptlrpc_request *req;
+	int rq_opc, rc = 0;
+	int nowait = imp->imp_obd->obd_force;
+	ENTRY;
+
+	/* forced: skip the wait and the RPC, only update the local state */
+	if (nowait)
+		GOTO(set_state, rc);
+
+	/* pick the DISCONNECT opcode matching the opcode used to connect */
+	switch (imp->imp_connect_op) {
+	case OST_CONNECT: rq_opc = OST_DISCONNECT; break;
+	case MDS_CONNECT: rq_opc = MDS_DISCONNECT; break;
+	case MGS_CONNECT: rq_opc = MGS_DISCONNECT; break;
+	default:
+		CERROR("don't know how to disconnect from %s (connect_op %d)\n",
+		       obd2cli_tgt(imp->imp_obd), imp->imp_connect_op);
+		RETURN(-EINVAL);
+	}
+
+	if (ptlrpc_import_in_recovery(imp)) {
+		struct l_wait_info lwi;
+		cfs_duration_t timeout;
+
+
+		/* Timeout source: obd_timeout (halved when
+		 * imp_server_timeout is set) if adaptive timeouts are off,
+		 * otherwise the service estimate for the request portal. */
+		if (AT_OFF) {
+			if (imp->imp_server_timeout)
+				timeout = cfs_time_seconds(obd_timeout / 2);
+			else
+				timeout = cfs_time_seconds(obd_timeout);
+		} else {
+			int idx = import_at_get_index(imp,
+				imp->imp_client->cli_request_portal);
+			timeout = cfs_time_seconds(
+				at_get(&imp->imp_at.iat_service_estimate[idx]));
+		}
+
+		/* wait until recovery is over or the timeout elapses */
+		lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout),
+				       back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL);
+		rc = l_wait_event(imp->imp_recovery_waitq,
+				  !ptlrpc_import_in_recovery(imp), &lwi);
+
+	}
+
+	spin_lock(&imp->imp_lock);
+	/* Not fully connected: no RPC is sent.  The jump lands on "out"
+	 * with imp_lock still held, which is what that label expects.
+	 * NOTE(review): GOTO's second argument appears to be logged only,
+	 * so rc would still hold the l_wait_event() result -- confirm. */
+	if (imp->imp_state != LUSTRE_IMP_FULL)
+		GOTO(out, 0);
+
+	spin_unlock(&imp->imp_lock);
+
+	/* the RQF_MDS_DISCONNECT format is used whatever rq_opc is */
+	req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_DISCONNECT,
+					LUSTRE_OBD_VERSION, rq_opc);
+	/* on allocation failure no RPC is sent and rc is left unchanged */
+	if (req) {
+		/* We are disconnecting, do not retry a failed DISCONNECT rpc if
+		 * it fails.  We can get through the above with a down server
+		 * if the client doesn't know the server is gone yet. */
+		req->rq_no_resend = 1;
+
+		/* We want client umounts to happen quickly, no matter the
+		   server state... */
+		req->rq_timeout = min_t(int, req->rq_timeout,
+					INITIAL_CONNECT_TIMEOUT);
+
+		IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING);
+		req->rq_send_state =  LUSTRE_IMP_CONNECTING;
+		ptlrpc_request_set_replen(req);
+		rc = ptlrpc_queue_wait(req);
+		ptlrpc_req_finished(req);
+	}
+
+set_state:
+	spin_lock(&imp->imp_lock);
+out:
+	/* imp_lock is held here on every path */
+	if (noclose)
+		IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
+	else
+		IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED);
+	memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle));
+	spin_unlock(&imp->imp_lock);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_disconnect_import);
+
+/**
+ * Mark an import closed and abort its in-flight requests.
+ *
+ * With imp_lock held the state is set to LUSTRE_IMP_CLOSED (through the
+ * _NOLOCK variant, as the lock is already taken) and imp_generation is
+ * incremented; ptlrpc_abort_inflight() is called after the lock is dropped.
+ *
+ * \param imp  import to clean up
+ */
+void ptlrpc_cleanup_imp(struct obd_import *imp)
+{
+	ENTRY;
+
+	spin_lock(&imp->imp_lock);
+	IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED);
+	imp->imp_generation++;
+	spin_unlock(&imp->imp_lock);
+	ptlrpc_abort_inflight(imp);
+
+	EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_cleanup_imp);
+
+/* Adaptive Timeout utils */
+extern unsigned int at_min, at_max, at_history;
+
+/**
+ * Feed a new measurement into an adaptive timeout.
+ *
+ * Bin into timeslices using AT_BINS bins.
+ * This gives us a max of the last binlimit*AT_BINS secs without the storage,
+ * but still smoothing out a return to normalcy from a slow response.
+ * (E.g. remember the maximum latency in each minute of the last 4 minutes.)
+ *
+ * The resulting at_current is capped by at_max (when at_max > 0) and raised
+ * to at least at_min.
+ *
+ * \param at   adaptive timeout to update; must not be NULL
+ * \param val  new measurement; a value of 0 is ignored
+ *
+ * \retval 0      \a val was 0, or at_current did not change
+ * \retval other  the previous at_current, when the update changed it
+ */
+int at_measured(struct adaptive_timeout *at, unsigned int val)
+{
+	unsigned int old;
+	time_t now = cfs_time_current_sec();
+	time_t binlimit = max_t(time_t, at_history / AT_BINS, 1);
+
+	/* check the pointer before it is dereferenced for the first time */
+	LASSERT(at);
+	old = at->at_current;
+	CDEBUG(D_OTHER, "add %u to %p time=%lu v=%u (%u %u %u %u)\n",
+	       val, at, now - at->at_binstart, at->at_current,
+	       at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]);
+
+	if (val == 0)
+		/* 0's don't count, because we never want our timeout to
+		   drop to 0, and because 0 could mean an error */
+		return 0;
+
+	spin_lock(&at->at_lock);
+
+	if (unlikely(at->at_binstart == 0)) {
+		/* Special case to remove default from history */
+		at->at_current = val;
+		at->at_worst_ever = val;
+		at->at_worst_time = now;
+		at->at_hist[0] = val;
+		at->at_binstart = now;
+	} else if (now - at->at_binstart < binlimit) {
+		/* in bin 0 */
+		at->at_hist[0] = max(val, at->at_hist[0]);
+		at->at_current = max(val, at->at_current);
+	} else {
+		int i, shift;
+		unsigned int maxv = val;
+		/* move bins over */
+		shift = (now - at->at_binstart) / binlimit;
+		LASSERT(shift > 0);
+		/* walk from the oldest bin down so that sources are read
+		 * before they are overwritten; bins with no source are
+		 * cleared */
+		for (i = AT_BINS - 1; i >= 0; i--) {
+			if (i >= shift) {
+				at->at_hist[i] = at->at_hist[i - shift];
+				maxv = max(maxv, at->at_hist[i]);
+			} else {
+				at->at_hist[i] = 0;
+			}
+		}
+		at->at_hist[0] = val;
+		at->at_current = maxv;
+		at->at_binstart += shift * binlimit;
+	}
+
+	if (at->at_current > at->at_worst_ever) {
+		at->at_worst_ever = at->at_current;
+		at->at_worst_time = now;
+	}
+
+	if (at->at_flags & AT_FLG_NOHIST)
+		/* Only keep last reported val; keeping the rest of the history
+		   for proc only */
+		at->at_current = val;
+
+	/* clamp into [at_min, at_max]; at_max == 0 means no upper bound */
+	if (at_max > 0)
+		at->at_current = min(at->at_current, at_max);
+	at->at_current = max(at->at_current, at_min);
+
+	if (at->at_current != old)
+		CDEBUG(D_OTHER, "AT %p change: old=%u new=%u delta=%d "
+		       "(val=%u) hist %u %u %u %u\n", at,
+		       old, at->at_current, at->at_current - old, val,
+		       at->at_hist[0], at->at_hist[1], at->at_hist[2],
+		       at->at_hist[3]);
+
+	/* if we changed, report the old value */
+	old = (at->at_current != old) ? old : 0;
+
+	spin_unlock(&at->at_lock);
+	return old;
+}
+
+/**
+ * Find the imp_at index for a given portal; assign if space available.
+ *
+ * The table is first scanned without the lock.  On a miss, the scan
+ * resumes under imp_lock from where it stopped, and the first unused
+ * (zero) slot is claimed for \a portal unless a matching entry is found
+ * on the way.
+ *
+ * \param imp     import whose imp_at table is searched
+ * \param portal  portal number to look up
+ *
+ * \retval index of the slot holding \a portal; asserts if the table is full
+ */
+int import_at_get_index(struct obd_import *imp, int portal)
+{
+	struct imp_at *iat = &imp->imp_at;
+	int idx = 0;
+
+	/* lockless pass: stop at a hit or at the first unused slot */
+	while (idx < IMP_AT_MAX_PORTALS) {
+		if (iat->iat_portal[idx] == portal)
+			return idx;
+		if (iat->iat_portal[idx] == 0)
+			break;
+		idx++;
+	}
+
+	/* Not found in list, add it under a lock */
+	spin_lock(&imp->imp_lock);
+
+	/* Check unused under lock */
+	while (idx < IMP_AT_MAX_PORTALS &&
+	       iat->iat_portal[idx] != portal &&
+	       iat->iat_portal[idx] != 0)
+		idx++;
+
+	if (idx >= IMP_AT_MAX_PORTALS || iat->iat_portal[idx] != portal) {
+		/* Not enough portals? */
+		LASSERT(idx < IMP_AT_MAX_PORTALS);
+
+		iat->iat_portal[idx] = portal;
+	}
+
+	spin_unlock(&imp->imp_lock);
+	return idx;
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/layout.c b/drivers/staging/lustre/lustre/ptlrpc/layout.c
new file mode 100644
index 000000000000..2f55ce26ccba
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/layout.c
@@ -0,0 +1,2396 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/layout.c
+ *
+ * Lustre Metadata Target (mdt) request handler
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+/*
+ * This file contains the "capsule/pill" abstraction layered above PTLRPC.
+ *
+ * Every struct ptlrpc_request contains a "pill", which points to a description
+ * of the format that the request conforms to.
+ */
+
+#if !defined(__REQ_LAYOUT_USER__)
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <linux/module.h>
+
+/* LUSTRE_VERSION_CODE */
+#include <lustre_ver.h>
+
+#include <obd_support.h>
+/* lustre_swab_mdt_body */
+#include <lustre/lustre_idl.h>
+/* obd2cli_tgt() (required by DEBUG_REQ()) */
+#include <obd.h>
+
+/* __REQ_LAYOUT_USER__ */
+#endif
+/* struct ptlrpc_request, lustre_msg* */
+#include <lustre_req_layout.h>
+#include <lustre_update.h>
+#include <lustre_acl.h>
+#include <lustre_debug.h>
+
+/*
+ * RQFs (see below) refer to two struct req_msg_field arrays describing the
+ * client request and server reply, respectively.
+ */
+/* empty set of fields... for suitable definition of emptiness. */
+static const struct req_msg_field *empty[] = {
+	/* only the ptlrpc body, which also leads every other list here */
+	&RMF_PTLRPC_BODY
+};
+
+/*
+ * MGS field lists.  The _client/_server suffixes presumably mark the
+ * request and reply halves of one format -- confirm against the RQF_*
+ * definitions that reference these arrays.
+ */
+static const struct req_msg_field *mgs_target_info_only[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MGS_TARGET_INFO
+};
+
+static const struct req_msg_field *mgs_set_info[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MGS_SEND_PARAM
+};
+
+static const struct req_msg_field *mgs_config_read_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MGS_CONFIG_BODY
+};
+
+static const struct req_msg_field *mgs_config_read_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MGS_CONFIG_RES
+};
+
+/* ptlrpc body followed by the log cookies field */
+static const struct req_msg_field *log_cancel_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_LOGCOOKIES
+};
+
+/*
+ * Field lists built around a single body field (MDT body, quota, statfs,
+ * sequence and FLD queries).  Field order within each array is significant
+ * to whatever consumes these lists -- NOTE(review): presumably it fixes the
+ * buffer order in the message; confirm in the capsule code.
+ */
+static const struct req_msg_field *mdt_body_only[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY
+};
+
+/* mdt_body_only[] plus RMF_CAPA1 */
+static const struct req_msg_field *mdt_body_capa[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_CAPA1
+};
+
+static const struct req_msg_field *quotactl_only[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_OBD_QUOTACTL
+};
+
+static const struct req_msg_field *quota_body_only[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_QUOTA_BODY
+};
+
+/* both quota intent lists end with RMF_QUOTA_BODY */
+static const struct req_msg_field *ldlm_intent_quota_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REQ,
+	&RMF_LDLM_INTENT,
+	&RMF_QUOTA_BODY
+};
+
+static const struct req_msg_field *ldlm_intent_quota_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REP,
+	&RMF_DLM_LVB,
+	&RMF_QUOTA_BODY
+};
+
+static const struct req_msg_field *mdt_close_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_EPOCH,
+	&RMF_REC_REINT,
+	&RMF_CAPA1
+};
+
+static const struct req_msg_field *obd_statfs_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_OBD_STATFS
+};
+
+/* seq/fld: the _server list is the _client list without the *_OPC field */
+static const struct req_msg_field *seq_query_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_SEQ_OPC,
+	&RMF_SEQ_RANGE
+};
+
+static const struct req_msg_field *seq_query_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_SEQ_RANGE
+};
+
+static const struct req_msg_field *fld_query_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_FLD_OPC,
+	&RMF_FLD_MDFLD
+};
+
+static const struct req_msg_field *fld_query_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_FLD_MDFLD
+};
+
+/* mdt_body_capa[] plus RMF_NAME */
+static const struct req_msg_field *mds_getattr_name_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_CAPA1,
+	&RMF_NAME
+};
+
+/*
+ * MDS reint field lists.  Every *_client list here starts with
+ * RMF_PTLRPC_BODY followed by RMF_REC_REINT; the remaining fields differ
+ * per list.
+ */
+static const struct req_msg_field *mds_reint_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_REC_REINT
+};
+
+static const struct req_msg_field *mds_reint_create_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_REC_REINT,
+	&RMF_CAPA1,
+	&RMF_NAME
+};
+
+/* same fields, in the same order, as mds_reint_create_rmt_acl_client[] */
+static const struct req_msg_field *mds_reint_create_slave_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_REC_REINT,
+	&RMF_CAPA1,
+	&RMF_NAME,
+	&RMF_EADATA,
+	&RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_reint_create_rmt_acl_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_REC_REINT,
+	&RMF_CAPA1,
+	&RMF_NAME,
+	&RMF_EADATA,
+	&RMF_DLM_REQ
+};
+
+/* as the two lists above, with RMF_SYMTGT in place of RMF_EADATA */
+static const struct req_msg_field *mds_reint_create_sym_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_REC_REINT,
+	&RMF_CAPA1,
+	&RMF_NAME,
+	&RMF_SYMTGT,
+	&RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_reint_open_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_REC_REINT,
+	&RMF_CAPA1,
+	&RMF_CAPA2,
+	&RMF_NAME,
+	&RMF_EADATA
+};
+
+static const struct req_msg_field *mds_reint_open_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_MDT_MD,
+	&RMF_ACL,
+	&RMF_CAPA1,
+	&RMF_CAPA2
+};
+
+static const struct req_msg_field *mds_reint_unlink_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_REC_REINT,
+	&RMF_CAPA1,
+	&RMF_NAME,
+	&RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_reint_link_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_REC_REINT,
+	&RMF_CAPA1,
+	&RMF_CAPA2,
+	&RMF_NAME,
+	&RMF_DLM_REQ
+};
+
+/* mds_reint_link_client[] with RMF_SYMTGT inserted after RMF_NAME */
+static const struct req_msg_field *mds_reint_rename_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_REC_REINT,
+	&RMF_CAPA1,
+	&RMF_CAPA2,
+	&RMF_NAME,
+	&RMF_SYMTGT,
+	&RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_last_unlink_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_MDT_MD,
+	&RMF_LOGCOOKIES,
+	&RMF_CAPA1,
+	&RMF_CAPA2
+};
+
+static const struct req_msg_field *mds_reint_setattr_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_REC_REINT,
+	&RMF_CAPA1,
+	&RMF_MDT_EPOCH,
+	&RMF_EADATA,
+	&RMF_LOGCOOKIES,
+	&RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_reint_setxattr_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_REC_REINT,
+	&RMF_CAPA1,
+	&RMF_NAME,
+	&RMF_EADATA
+};
+
+/* unlike the reint lists above, this one carries RMF_MDT_BODY */
+static const struct req_msg_field *mdt_swap_layouts[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_SWAP_LAYOUTS,
+	&RMF_CAPA1,
+	&RMF_CAPA2,
+	&RMF_DLM_REQ
+};
+
+/*
+ * Connect and set_info/get_info field lists.  RMF_CONNECT_DATA is the last
+ * field of both connect lists.
+ */
+static const struct req_msg_field *obd_connect_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_TGTUUID,
+	&RMF_CLUUID,
+	&RMF_CONN,
+	&RMF_CONNECT_DATA
+};
+
+static const struct req_msg_field *obd_connect_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_CONNECT_DATA
+};
+
+static const struct req_msg_field *obd_set_info_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_SETINFO_KEY,
+	&RMF_SETINFO_VAL
+};
+
+/* obd_set_info_client[] with RMF_OST_BODY in place of RMF_SETINFO_VAL */
+static const struct req_msg_field *ost_grant_shrink_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_SETINFO_KEY,
+	&RMF_OST_BODY
+};
+
+static const struct req_msg_field *mds_getinfo_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_GETINFO_KEY,
+	&RMF_GETINFO_VALLEN
+};
+
+static const struct req_msg_field *mds_getinfo_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_GETINFO_VAL,
+};
+
+/*
+ * LDLM field lists.  In this group the *_client lists carry RMF_DLM_REQ as
+ * their second field, and the *_server lists carry RMF_DLM_REP there,
+ * except ldlm_gl_callback_server[], which has only RMF_DLM_LVB.
+ */
+static const struct req_msg_field *ldlm_enqueue_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REQ
+};
+
+static const struct req_msg_field *ldlm_enqueue_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REP
+};
+
+/* ldlm_enqueue_server[] plus RMF_DLM_LVB */
+static const struct req_msg_field *ldlm_enqueue_lvb_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REP,
+	&RMF_DLM_LVB
+};
+
+static const struct req_msg_field *ldlm_cp_callback_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REQ,
+	&RMF_DLM_LVB
+};
+
+static const struct req_msg_field *ldlm_gl_callback_desc_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REQ,
+	&RMF_DLM_GL_DESC
+};
+
+static const struct req_msg_field *ldlm_gl_callback_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_LVB
+};
+
+/*
+ * Intent lists: each *_client list below is ldlm_intent_basic_client[]
+ * (RMF_DLM_REQ, RMF_LDLM_INTENT) followed by operation-specific fields.
+ */
+static const struct req_msg_field *ldlm_intent_basic_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REQ,
+	&RMF_LDLM_INTENT,
+};
+
+static const struct req_msg_field *ldlm_intent_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REQ,
+	&RMF_LDLM_INTENT,
+	&RMF_REC_REINT
+};
+
+static const struct req_msg_field *ldlm_intent_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REP,
+	&RMF_MDT_BODY,
+	&RMF_MDT_MD,
+	&RMF_ACL
+};
+
+static const struct req_msg_field *ldlm_intent_layout_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REQ,
+	&RMF_LDLM_INTENT,
+	&RMF_LAYOUT_INTENT,
+	&RMF_EADATA /* for new layout to be set up */
+};
+/* ldlm_intent_server[] plus RMF_CAPA1 and RMF_CAPA2 */
+static const struct req_msg_field *ldlm_intent_open_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REP,
+	&RMF_MDT_BODY,
+	&RMF_MDT_MD,
+	&RMF_ACL,
+	&RMF_CAPA1,
+	&RMF_CAPA2
+};
+
+static const struct req_msg_field *ldlm_intent_getattr_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REQ,
+	&RMF_LDLM_INTENT,
+	&RMF_MDT_BODY,     /* coincides with mds_getattr_name_client[] */
+	&RMF_CAPA1,
+	&RMF_NAME
+};
+
+/* ldlm_intent_server[] plus RMF_CAPA1 */
+static const struct req_msg_field *ldlm_intent_getattr_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REP,
+	&RMF_MDT_BODY,
+	&RMF_MDT_MD,
+	&RMF_ACL,
+	&RMF_CAPA1
+};
+
+/* tail matches mds_reint_create_client[] with RMF_EADATA appended */
+static const struct req_msg_field *ldlm_intent_create_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REQ,
+	&RMF_LDLM_INTENT,
+	&RMF_REC_REINT,    /* coincides with mds_reint_create_client[] */
+	&RMF_CAPA1,
+	&RMF_NAME,
+	&RMF_EADATA
+};
+
+static const struct req_msg_field *ldlm_intent_open_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REQ,
+	&RMF_LDLM_INTENT,
+	&RMF_REC_REINT,    /* coincides with mds_reint_open_client[] */
+	&RMF_CAPA1,
+	&RMF_CAPA2,
+	&RMF_NAME,
+	&RMF_EADATA
+};
+
+/* tail matches mds_reint_unlink_client[] minus its final RMF_DLM_REQ */
+static const struct req_msg_field *ldlm_intent_unlink_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REQ,
+	&RMF_LDLM_INTENT,
+	&RMF_REC_REINT,    /* coincides with mds_reint_unlink_client[] */
+	&RMF_CAPA1,
+	&RMF_NAME
+};
+
+/* MDS xattr, getattr/setattr and update field lists. */
+static const struct req_msg_field *mds_getxattr_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_CAPA1,
+	&RMF_NAME,
+	&RMF_EADATA
+};
+
+static const struct req_msg_field *mds_getxattr_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_EADATA
+};
+
+/* same fields, in the same order, as mds_setattr_server[] */
+static const struct req_msg_field *mds_getattr_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_MDT_MD,
+	&RMF_ACL,
+	&RMF_CAPA1,
+	&RMF_CAPA2
+};
+
+static const struct req_msg_field *mds_setattr_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_MDT_MD,
+	&RMF_ACL,
+	&RMF_CAPA1,
+	&RMF_CAPA2
+};
+
+static const struct req_msg_field *mds_update_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_UPDATE,
+};
+
+static const struct req_msg_field *mds_update_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_UPDATE_REPLY,
+};
+
+/* llog and index-read field lists. */
+static const struct req_msg_field *llog_origin_handle_create_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_LLOGD_BODY,
+	&RMF_NAME
+};
+
+static const struct req_msg_field *llogd_body_only[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_LLOGD_BODY
+};
+
+static const struct req_msg_field *llog_log_hdr_only[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_LLOG_LOG_HDR
+};
+
+static const struct req_msg_field *llogd_conn_body_only[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_LLOGD_CONN_BODY
+};
+
+/* llogd_body_only[] plus RMF_EADATA */
+static const struct req_msg_field *llog_origin_handle_next_block_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_LLOGD_BODY,
+	&RMF_EADATA
+};
+
+/* the two idx_read lists below hold identical fields */
+static const struct req_msg_field *obd_idx_read_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_IDX_INFO
+};
+
+static const struct req_msg_field *obd_idx_read_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_IDX_INFO
+};
+
+/*
+ * OST field lists.  Apart from the get_info/fiemap lists at the end, each
+ * list here carries RMF_OST_BODY right after the ptlrpc body.
+ */
+static const struct req_msg_field *ost_body_only[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_OST_BODY
+};
+
+/* ost_body_only[] plus RMF_CAPA1 */
+static const struct req_msg_field *ost_body_capa[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_OST_BODY,
+	&RMF_CAPA1
+};
+
+static const struct req_msg_field *ost_destroy_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_OST_BODY,
+	&RMF_DLM_REQ,
+	&RMF_CAPA1
+};
+
+
+static const struct req_msg_field *ost_brw_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_OST_BODY,
+	&RMF_OBD_IOOBJ,
+	&RMF_NIOBUF_REMOTE,
+	&RMF_CAPA1
+};
+
+/* same fields as ost_body_only[] */
+static const struct req_msg_field *ost_brw_read_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_OST_BODY
+};
+
+/* ost_brw_read_server[] plus RMF_RCS */
+static const struct req_msg_field *ost_brw_write_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_OST_BODY,
+	&RMF_RCS
+};
+
+static const struct req_msg_field *ost_get_info_generic_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_GENERIC_DATA,
+};
+
+static const struct req_msg_field *ost_get_info_generic_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_SETINFO_KEY
+};
+
+static const struct req_msg_field *ost_get_last_id_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_OBD_ID
+};
+
+static const struct req_msg_field *ost_get_last_fid_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_FID,
+};
+
+/* RMF_FIEMAP_VAL appears in both fiemap lists */
+static const struct req_msg_field *ost_get_fiemap_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_FIEMAP_KEY,
+	&RMF_FIEMAP_VAL
+};
+
+static const struct req_msg_field *ost_get_fiemap_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_FIEMAP_VAL
+};
+
+/*
+ * HSM field lists.  Every list in this group is RMF_PTLRPC_BODY and
+ * RMF_MDT_BODY followed by zero or more HSM-specific fields.
+ */
+static const struct req_msg_field *mdt_hsm_progress[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_MDS_HSM_PROGRESS,
+};
+
+static const struct req_msg_field *mdt_hsm_ct_register[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_MDS_HSM_ARCHIVE,
+};
+
+/* no HSM-specific field: just the two bodies */
+static const struct req_msg_field *mdt_hsm_ct_unregister[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+};
+
+static const struct req_msg_field *mdt_hsm_action_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_MDS_HSM_CURRENT_ACTION,
+};
+
+static const struct req_msg_field *mdt_hsm_state_get_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_HSM_USER_STATE,
+};
+
+/* the only HSM list that also carries RMF_CAPA1 */
+static const struct req_msg_field *mdt_hsm_state_set[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_CAPA1,
+	&RMF_HSM_STATE_SET,
+};
+
+static const struct req_msg_field *mdt_hsm_request[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_MDS_HSM_REQUEST,
+	&RMF_MDS_HSM_USER_ITEM,
+	&RMF_GENERIC_DATA,
+};
+
+static struct req_format *req_formats[] = { /* table of all known formats */
+	&RQF_OBD_PING,
+	&RQF_OBD_SET_INFO,
+	&RQF_OBD_IDX_READ,
+	&RQF_SEC_CTX,
+	&RQF_MGS_TARGET_REG,
+	&RQF_MGS_SET_INFO,
+	&RQF_MGS_CONFIG_READ,
+	&RQF_SEQ_QUERY,
+	&RQF_FLD_QUERY,
+	&RQF_MDS_CONNECT,
+	&RQF_MDS_DISCONNECT,
+	&RQF_MDS_GET_INFO,
+	&RQF_MDS_GETSTATUS,
+	&RQF_MDS_STATFS,
+	&RQF_MDS_GETATTR,
+	&RQF_MDS_GETATTR_NAME,
+	&RQF_MDS_GETXATTR,
+	&RQF_MDS_SYNC,
+	&RQF_MDS_CLOSE,
+	&RQF_MDS_PIN,
+	&RQF_MDS_UNPIN,
+	&RQF_MDS_READPAGE,
+	&RQF_MDS_WRITEPAGE,
+	&RQF_MDS_IS_SUBDIR,
+	&RQF_MDS_DONE_WRITING,
+	&RQF_MDS_REINT,
+	&RQF_MDS_REINT_CREATE,
+	&RQF_MDS_REINT_CREATE_RMT_ACL,
+	&RQF_MDS_REINT_CREATE_SLAVE,
+	&RQF_MDS_REINT_CREATE_SYM,
+	&RQF_MDS_REINT_OPEN,
+	&RQF_MDS_REINT_UNLINK,
+	&RQF_MDS_REINT_LINK,
+	&RQF_MDS_REINT_RENAME,
+	&RQF_MDS_REINT_SETATTR,
+	&RQF_MDS_REINT_SETXATTR,
+	&RQF_MDS_QUOTACHECK,
+	&RQF_MDS_QUOTACTL,
+	&RQF_MDS_HSM_PROGRESS,
+	&RQF_MDS_HSM_CT_REGISTER,
+	&RQF_MDS_HSM_CT_UNREGISTER,
+	&RQF_MDS_HSM_STATE_GET,
+	&RQF_MDS_HSM_STATE_SET,
+	&RQF_MDS_HSM_ACTION,
+	&RQF_MDS_HSM_REQUEST,
+	&RQF_MDS_SWAP_LAYOUTS,
+	&RQF_UPDATE_OBJ,
+	&RQF_QC_CALLBACK,
+	&RQF_OST_CONNECT,
+	&RQF_OST_DISCONNECT,
+	&RQF_OST_QUOTACHECK,
+	&RQF_OST_QUOTACTL,
+	&RQF_OST_GETATTR,
+	&RQF_OST_SETATTR,
+	&RQF_OST_CREATE,
+	&RQF_OST_PUNCH,
+	&RQF_OST_SYNC,
+	&RQF_OST_DESTROY,
+	&RQF_OST_BRW_READ,
+	&RQF_OST_BRW_WRITE,
+	&RQF_OST_STATFS,
+	&RQF_OST_SET_GRANT_INFO,
+	&RQF_OST_GET_INFO_GENERIC,
+	&RQF_OST_GET_INFO_LAST_ID,
+	&RQF_OST_GET_INFO_LAST_FID,
+	&RQF_OST_SET_INFO_LAST_FID,
+	&RQF_OST_GET_INFO_FIEMAP,
+	&RQF_LDLM_ENQUEUE,
+	&RQF_LDLM_ENQUEUE_LVB,
+	&RQF_LDLM_CONVERT,
+	&RQF_LDLM_CANCEL,
+	&RQF_LDLM_CALLBACK,
+	&RQF_LDLM_CP_CALLBACK,
+	&RQF_LDLM_BL_CALLBACK,
+	&RQF_LDLM_GL_CALLBACK,
+	&RQF_LDLM_GL_DESC_CALLBACK,
+	&RQF_LDLM_INTENT,
+	&RQF_LDLM_INTENT_BASIC,
+	&RQF_LDLM_INTENT_LAYOUT,
+	&RQF_LDLM_INTENT_GETATTR,
+	&RQF_LDLM_INTENT_OPEN,
+	&RQF_LDLM_INTENT_CREATE,
+	&RQF_LDLM_INTENT_UNLINK,
+	&RQF_LDLM_INTENT_QUOTA,
+	&RQF_QUOTA_DQACQ,
+	&RQF_LOG_CANCEL,
+	&RQF_LLOG_ORIGIN_HANDLE_CREATE,
+	&RQF_LLOG_ORIGIN_HANDLE_DESTROY,
+	&RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK,
+	&RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK,
+	&RQF_LLOG_ORIGIN_HANDLE_READ_HEADER,
+	&RQF_LLOG_ORIGIN_CONNECT
+}; /* req_layout_init() stores each entry's slot number in its rf_idx */
+
+struct req_msg_field { /* descriptor of one message field (the RMF_* objects) */
+	const __u32 rmf_flags;	/* bitmask of enum rmf_flags values */
+	const char  *rmf_name;	/* name string passed to DEFINE_MSGF() */
+	/**
+	 * Field length. (-1) means "variable length".  If the
+	 * \a RMF_F_STRUCT_ARRAY flag is set the field is also variable-length,
+	 * but the actual size must be a whole multiple of \a rmf_size.
+	 */
+	const int   rmf_size;
+	void	(*rmf_swabber)(void *);	/* NULL when DEFINE_MSGF() got none */
+	void	(*rmf_dumper)(void *);	/* NULL when DEFINE_MSGF() got none */
+	int	 rmf_offset[ARRAY_SIZE(req_formats)][RCL_NR]; /* list slot + 1; 0 = unused */
+};
+
+enum rmf_flags { /* bit values OR-ed into req_msg_field::rmf_flags */
+	/**
+	 * The field is a string, must be NUL-terminated.
+	 */
+	RMF_F_STRING = 1 << 0,
+	/**
+	 * The field's buffer size need not match the declared \a rmf_size.
+	 */
+	RMF_F_NO_SIZE_CHECK = 1 << 1,
+	/**
+	 * The field's buffer size must be a whole multiple of the declared \a
+	 * rmf_size and the \a rmf_swabber function must work on the declared \a
+	 * rmf_size worth of bytes.
+	 */
+	RMF_F_STRUCT_ARRAY = 1 << 2
+};
+
+struct req_capsule;
+
+/*
+ * Request fields.  DEFINE_MSGF() expands to a struct req_msg_field initializer.
+ */
+#define DEFINE_MSGF(name, flags, size, swabber, dumper) {       \
+	.rmf_name    = (name),				  \
+	.rmf_flags   = (flags),				 \
+	.rmf_size    = (size),				  \
+	.rmf_swabber = (void (*)(void*))(swabber),	      \
+	.rmf_dumper  = (void (*)(void*))(dumper)		\
+} /* rmf_offset is not named here, so it starts out all zero */
+
+struct req_msg_field RMF_GENERIC_DATA =
+	DEFINE_MSGF("generic_data", 0,
+		    -1, NULL, NULL);	/* size -1: variable length */
+EXPORT_SYMBOL(RMF_GENERIC_DATA);
+
+struct req_msg_field RMF_MGS_TARGET_INFO =
+	DEFINE_MSGF("mgs_target_info", 0,
+		    sizeof(struct mgs_target_info),
+		    lustre_swab_mgs_target_info, NULL);
+EXPORT_SYMBOL(RMF_MGS_TARGET_INFO);
+
+struct req_msg_field RMF_MGS_SEND_PARAM =
+	DEFINE_MSGF("mgs_send_param", 0,
+		    sizeof(struct mgs_send_param),
+		    NULL, NULL);	/* fixed size, no swabber */
+EXPORT_SYMBOL(RMF_MGS_SEND_PARAM);
+
+struct req_msg_field RMF_MGS_CONFIG_BODY =
+	DEFINE_MSGF("mgs_config_read request", 0,
+		    sizeof(struct mgs_config_body),
+		    lustre_swab_mgs_config_body, NULL);
+EXPORT_SYMBOL(RMF_MGS_CONFIG_BODY);
+
+struct req_msg_field RMF_MGS_CONFIG_RES =
+	DEFINE_MSGF("mgs_config_read reply ", 0, /* NOTE(review): name ends in a space */
+		    sizeof(struct mgs_config_res),
+		    lustre_swab_mgs_config_res, NULL);
+EXPORT_SYMBOL(RMF_MGS_CONFIG_RES);
+
+struct req_msg_field RMF_U32 =
+	DEFINE_MSGF("generic u32", 0,
+		    sizeof(__u32), lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_U32);
+
+struct req_msg_field RMF_SETINFO_VAL =
+	DEFINE_MSGF("setinfo_val", 0, -1, NULL, NULL);	/* variable length */
+EXPORT_SYMBOL(RMF_SETINFO_VAL);
+
+struct req_msg_field RMF_GETINFO_KEY =
+	DEFINE_MSGF("getinfo_key", 0, -1, NULL, NULL);	/* variable length */
+EXPORT_SYMBOL(RMF_GETINFO_KEY);
+
+struct req_msg_field RMF_GETINFO_VALLEN =
+	DEFINE_MSGF("getinfo_vallen", 0,
+		    sizeof(__u32), lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_GETINFO_VALLEN);
+
+struct req_msg_field RMF_GETINFO_VAL =
+	DEFINE_MSGF("getinfo_val", 0, -1, NULL, NULL);	/* variable length */
+EXPORT_SYMBOL(RMF_GETINFO_VAL);
+
+struct req_msg_field RMF_SEQ_OPC =
+	DEFINE_MSGF("seq_query_opc", 0,
+		    sizeof(__u32), lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_SEQ_OPC);
+
+struct req_msg_field RMF_SEQ_RANGE =
+	DEFINE_MSGF("seq_query_range", 0,
+		    sizeof(struct lu_seq_range),
+		    lustre_swab_lu_seq_range, NULL);
+EXPORT_SYMBOL(RMF_SEQ_RANGE);
+
+struct req_msg_field RMF_FLD_OPC =
+	DEFINE_MSGF("fld_query_opc", 0,
+		    sizeof(__u32), lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_FLD_OPC);
+
+struct req_msg_field RMF_FLD_MDFLD =
+	DEFINE_MSGF("fld_query_mdfld", 0,
+		    sizeof(struct lu_seq_range), /* same type and swabber as RMF_SEQ_RANGE */
+		    lustre_swab_lu_seq_range, NULL);
+EXPORT_SYMBOL(RMF_FLD_MDFLD);
+
+struct req_msg_field RMF_MDT_BODY =
+	DEFINE_MSGF("mdt_body", 0,
+		    sizeof(struct mdt_body), lustre_swab_mdt_body, NULL);
+EXPORT_SYMBOL(RMF_MDT_BODY);
+
+struct req_msg_field RMF_OBD_QUOTACTL =
+	DEFINE_MSGF("obd_quotactl", 0,
+		    sizeof(struct obd_quotactl),
+		    lustre_swab_obd_quotactl, NULL);
+EXPORT_SYMBOL(RMF_OBD_QUOTACTL);
+
+struct req_msg_field RMF_QUOTA_BODY =
+	DEFINE_MSGF("quota_body", 0,
+		    sizeof(struct quota_body), lustre_swab_quota_body, NULL);
+EXPORT_SYMBOL(RMF_QUOTA_BODY);
+
+struct req_msg_field RMF_MDT_EPOCH =
+	DEFINE_MSGF("mdt_ioepoch", 0,
+		    sizeof(struct mdt_ioepoch), lustre_swab_mdt_ioepoch, NULL);
+EXPORT_SYMBOL(RMF_MDT_EPOCH);
+
+struct req_msg_field RMF_PTLRPC_BODY =
+	DEFINE_MSGF("ptlrpc_body", 0,
+		    sizeof(struct ptlrpc_body), lustre_swab_ptlrpc_body, NULL);
+EXPORT_SYMBOL(RMF_PTLRPC_BODY);
+
+struct req_msg_field RMF_OBD_STATFS =
+	DEFINE_MSGF("obd_statfs", 0,
+		    sizeof(struct obd_statfs), lustre_swab_obd_statfs, NULL);
+EXPORT_SYMBOL(RMF_OBD_STATFS);
+
+struct req_msg_field RMF_SETINFO_KEY =
+	DEFINE_MSGF("setinfo_key", 0, -1, NULL, NULL);	/* variable length */
+EXPORT_SYMBOL(RMF_SETINFO_KEY);
+
+struct req_msg_field RMF_NAME =
+	DEFINE_MSGF("name", RMF_F_STRING, -1, NULL, NULL); /* variable-length string */
+EXPORT_SYMBOL(RMF_NAME);
+
+struct req_msg_field RMF_SYMTGT =
+	DEFINE_MSGF("symtgt", RMF_F_STRING, -1, NULL, NULL); /* variable-length string */
+EXPORT_SYMBOL(RMF_SYMTGT);
+
+struct req_msg_field RMF_TGTUUID =
+	DEFINE_MSGF("tgtuuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL,
+	NULL);	/* NOTE(review): the "- 1" presumably leaves out the NUL - confirm */
+EXPORT_SYMBOL(RMF_TGTUUID);
+
+struct req_msg_field RMF_CLUUID =
+	DEFINE_MSGF("cluuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL,
+	NULL);	/* same size expression as RMF_TGTUUID */
+EXPORT_SYMBOL(RMF_CLUUID);
+
+struct req_msg_field RMF_STRING =
+	DEFINE_MSGF("string", RMF_F_STRING, -1, NULL, NULL); /* variable-length string */
+EXPORT_SYMBOL(RMF_STRING);
+
+struct req_msg_field RMF_LLOGD_BODY =
+	DEFINE_MSGF("llogd_body", 0,
+		    sizeof(struct llogd_body), lustre_swab_llogd_body, NULL);
+EXPORT_SYMBOL(RMF_LLOGD_BODY);
+
+struct req_msg_field RMF_LLOG_LOG_HDR =
+	DEFINE_MSGF("llog_log_hdr", 0,
+		    sizeof(struct llog_log_hdr), lustre_swab_llog_hdr, NULL);
+EXPORT_SYMBOL(RMF_LLOG_LOG_HDR);
+
+struct req_msg_field RMF_LLOGD_CONN_BODY =
+	DEFINE_MSGF("llogd_conn_body", 0,
+		    sizeof(struct llogd_conn_body),
+		    lustre_swab_llogd_conn_body, NULL);
+EXPORT_SYMBOL(RMF_LLOGD_CONN_BODY);
+
+/*
+ * Connection handle received in an MDS_CONNECT request.
+ *
+ * No swabbing needed because struct lustre_handle contains only a 64-bit cookie
+ * that the client does not interpret at all.
+ */
+struct req_msg_field RMF_CONN =
+	DEFINE_MSGF("conn", 0, sizeof(struct lustre_handle), NULL, NULL);
+EXPORT_SYMBOL(RMF_CONN);
+
+struct req_msg_field RMF_CONNECT_DATA =
+	DEFINE_MSGF("cdata",
+		    RMF_F_NO_SIZE_CHECK /* we allow extra space for interop */,
+#if LUSTRE_VERSION_CODE > OBD_OCD_VERSION(2, 7, 50, 0)
+		    sizeof(struct obd_connect_data),
+#else
+/* For interoperability with 1.8 and 2.0 clients/servers.
+ * The RPC verification code allows larger RPC buffers, but not
+ * smaller buffers.  Until we no longer need to keep compatibility
+ * with older servers/clients we can only check that the buffer
+ * size is at least as large as obd_connect_data_v1.  That is not
+ * in itself harmful, since the chance of just corrupting this
+ * field is low.  See JIRA LU-16 for details. */
+		    sizeof(struct obd_connect_data_v1),
+#endif
+		    lustre_swab_connect, NULL);
+EXPORT_SYMBOL(RMF_CONNECT_DATA);
+
+struct req_msg_field RMF_DLM_REQ =
+	DEFINE_MSGF("dlm_req", RMF_F_NO_SIZE_CHECK /* ldlm_request_bufsize */,
+		    sizeof(struct ldlm_request),
+		    lustre_swab_ldlm_request, NULL);
+EXPORT_SYMBOL(RMF_DLM_REQ);
+
+struct req_msg_field RMF_DLM_REP =
+	DEFINE_MSGF("dlm_rep", 0,
+		    sizeof(struct ldlm_reply), lustre_swab_ldlm_reply, NULL);
+EXPORT_SYMBOL(RMF_DLM_REP);
+
+struct req_msg_field RMF_LDLM_INTENT =
+	DEFINE_MSGF("ldlm_intent", 0,
+		    sizeof(struct ldlm_intent), lustre_swab_ldlm_intent, NULL);
+EXPORT_SYMBOL(RMF_LDLM_INTENT);
+
+struct req_msg_field RMF_DLM_LVB =
+	DEFINE_MSGF("dlm_lvb", 0, -1, NULL, NULL);	/* variable length */
+EXPORT_SYMBOL(RMF_DLM_LVB);
+
+struct req_msg_field RMF_DLM_GL_DESC =
+	DEFINE_MSGF("dlm_gl_desc", 0, sizeof(union ldlm_gl_desc),
+		    lustre_swab_gl_desc, NULL);
+EXPORT_SYMBOL(RMF_DLM_GL_DESC);
+
+struct req_msg_field RMF_MDT_MD =
+	DEFINE_MSGF("mdt_md", RMF_F_NO_SIZE_CHECK, MIN_MD_SIZE, NULL, NULL);
+EXPORT_SYMBOL(RMF_MDT_MD);
+
+struct req_msg_field RMF_REC_REINT =
+	DEFINE_MSGF("rec_reint", 0, sizeof(struct mdt_rec_reint),
+		    lustre_swab_mdt_rec_reint, NULL);
+EXPORT_SYMBOL(RMF_REC_REINT);
+
+/* FIXME: this length should be defined as a macro (currently -1, i.e. variable) */
+struct req_msg_field RMF_EADATA = DEFINE_MSGF("eadata", 0, -1,
+						    NULL, NULL);
+EXPORT_SYMBOL(RMF_EADATA);
+
+struct req_msg_field RMF_ACL =
+	DEFINE_MSGF("acl", RMF_F_NO_SIZE_CHECK,
+		    LUSTRE_POSIX_ACL_MAX_SIZE, NULL, NULL);
+EXPORT_SYMBOL(RMF_ACL);
+
+/* FIXME: this should be made to use RMF_F_STRUCT_ARRAY */
+struct req_msg_field RMF_LOGCOOKIES =
+	DEFINE_MSGF("logcookies", RMF_F_NO_SIZE_CHECK /* multiple cookies */,
+		    sizeof(struct llog_cookie), NULL, NULL);
+EXPORT_SYMBOL(RMF_LOGCOOKIES);
+
+struct req_msg_field RMF_CAPA1 =
+	DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa),
+		    lustre_swab_lustre_capa, NULL);
+EXPORT_SYMBOL(RMF_CAPA1);
+
+struct req_msg_field RMF_CAPA2 =
+	DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa), /* same name string as RMF_CAPA1 */
+		    lustre_swab_lustre_capa, NULL);
+EXPORT_SYMBOL(RMF_CAPA2);
+
+struct req_msg_field RMF_LAYOUT_INTENT =
+	DEFINE_MSGF("layout_intent", 0,
+		    sizeof(struct layout_intent), lustre_swab_layout_intent,
+		    NULL);
+EXPORT_SYMBOL(RMF_LAYOUT_INTENT);
+
+/*
+ * OST request fields.  These definitions also pass a dumper to DEFINE_MSGF().
+ */
+struct req_msg_field RMF_OST_BODY =
+	DEFINE_MSGF("ost_body", 0,
+		    sizeof(struct ost_body), lustre_swab_ost_body, dump_ost_body);
+EXPORT_SYMBOL(RMF_OST_BODY);
+
+struct req_msg_field RMF_OBD_IOOBJ =
+	DEFINE_MSGF("obd_ioobj", RMF_F_STRUCT_ARRAY, /* array of struct obd_ioobj */
+		    sizeof(struct obd_ioobj), lustre_swab_obd_ioobj, dump_ioo);
+EXPORT_SYMBOL(RMF_OBD_IOOBJ);
+
+struct req_msg_field RMF_NIOBUF_REMOTE =
+	DEFINE_MSGF("niobuf_remote", RMF_F_STRUCT_ARRAY, /* array of struct niobuf_remote */
+		    sizeof(struct niobuf_remote), lustre_swab_niobuf_remote,
+		    dump_rniobuf);
+EXPORT_SYMBOL(RMF_NIOBUF_REMOTE);
+
+struct req_msg_field RMF_RCS = /* array of __u32 (RMF_F_STRUCT_ARRAY) */
+	DEFINE_MSGF("niobuf_rcs", RMF_F_STRUCT_ARRAY, sizeof(__u32),
+		    lustre_swab_generic_32s, dump_rcs); /* name differs from RMF_NIOBUF_REMOTE's */
+EXPORT_SYMBOL(RMF_RCS);
+
+struct req_msg_field RMF_OBD_ID =
+	DEFINE_MSGF("obd_id", 0,
+		    sizeof(obd_id), lustre_swab_ost_last_id, NULL);
+EXPORT_SYMBOL(RMF_OBD_ID);
+
+struct req_msg_field RMF_FID =
+	DEFINE_MSGF("fid", 0,
+		    sizeof(struct lu_fid), lustre_swab_lu_fid, NULL);
+EXPORT_SYMBOL(RMF_FID);
+
+struct req_msg_field RMF_OST_ID =
+	DEFINE_MSGF("ost_id", 0,
+		    sizeof(struct ost_id), lustre_swab_ost_id, NULL);
+EXPORT_SYMBOL(RMF_OST_ID);
+
+struct req_msg_field RMF_FIEMAP_KEY =
+	DEFINE_MSGF("fiemap", 0, sizeof(struct ll_fiemap_info_key),
+		    lustre_swab_fiemap, NULL);
+EXPORT_SYMBOL(RMF_FIEMAP_KEY);
+
+struct req_msg_field RMF_FIEMAP_VAL = /* variable length; name string equals RMF_FIEMAP_KEY's */
+	DEFINE_MSGF("fiemap", 0, -1, lustre_swab_fiemap, NULL);
+EXPORT_SYMBOL(RMF_FIEMAP_VAL);
+
+struct req_msg_field RMF_IDX_INFO =
+	DEFINE_MSGF("idx_info", 0, sizeof(struct idx_info),
+		    lustre_swab_idx_info, NULL);
+EXPORT_SYMBOL(RMF_IDX_INFO);
+struct req_msg_field RMF_HSM_USER_STATE =
+	DEFINE_MSGF("hsm_user_state", 0, sizeof(struct hsm_user_state),
+		    lustre_swab_hsm_user_state, NULL);
+EXPORT_SYMBOL(RMF_HSM_USER_STATE);
+
+struct req_msg_field RMF_HSM_STATE_SET =
+	DEFINE_MSGF("hsm_state_set", 0, sizeof(struct hsm_state_set),
+		    lustre_swab_hsm_state_set, NULL);
+EXPORT_SYMBOL(RMF_HSM_STATE_SET);
+
+struct req_msg_field RMF_MDS_HSM_PROGRESS =
+	DEFINE_MSGF("hsm_progress", 0, sizeof(struct hsm_progress_kernel),
+		    lustre_swab_hsm_progress_kernel, NULL);
+EXPORT_SYMBOL(RMF_MDS_HSM_PROGRESS);
+
+struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION =
+	DEFINE_MSGF("hsm_current_action", 0, sizeof(struct hsm_current_action),
+		    lustre_swab_hsm_current_action, NULL);
+EXPORT_SYMBOL(RMF_MDS_HSM_CURRENT_ACTION);
+
+struct req_msg_field RMF_MDS_HSM_USER_ITEM =
+	DEFINE_MSGF("hsm_user_item", RMF_F_STRUCT_ARRAY, /* array of struct hsm_user_item */
+		    sizeof(struct hsm_user_item), lustre_swab_hsm_user_item,
+		    NULL);
+EXPORT_SYMBOL(RMF_MDS_HSM_USER_ITEM);
+
+struct req_msg_field RMF_MDS_HSM_ARCHIVE =
+	DEFINE_MSGF("hsm_archive", 0,
+		    sizeof(__u32), lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_MDS_HSM_ARCHIVE);
+
+struct req_msg_field RMF_MDS_HSM_REQUEST =
+	DEFINE_MSGF("hsm_request", 0, sizeof(struct hsm_request),
+		    lustre_swab_hsm_request, NULL);
+EXPORT_SYMBOL(RMF_MDS_HSM_REQUEST);
+
+struct req_msg_field RMF_UPDATE = DEFINE_MSGF("update", 0, -1,
+					      lustre_swab_update_buf, NULL); /* variable length */
+EXPORT_SYMBOL(RMF_UPDATE);
+
+struct req_msg_field RMF_UPDATE_REPLY = DEFINE_MSGF("update_reply", 0, -1,
+						lustre_swab_update_reply_buf,
+						    NULL); /* variable length */
+EXPORT_SYMBOL(RMF_UPDATE_REPLY);
+
+struct req_msg_field RMF_SWAP_LAYOUTS =
+	DEFINE_MSGF("swap_layouts", 0, sizeof(struct  mdc_swap_layouts),
+		    lustre_swab_swap_layouts, NULL);
+EXPORT_SYMBOL(RMF_SWAP_LAYOUTS);
+/*
+ * Request formats: one field list per location (RCL_CLIENT, RCL_SERVER).
+ */
+
+struct req_format {
+	const char *rf_name;	/* name string passed to DEFINE_REQ_FMT() */
+	int	 rf_idx;	/* slot in req_formats[], set by req_layout_init() */
+	struct {
+		int			  nr;	/* number of entries in d[] */
+		const struct req_msg_field **d;	/* the field descriptors */
+	} rf_fields[RCL_NR];	/* indexed by location */
+};
+
+#define DEFINE_REQ_FMT(name, client, client_nr, server, server_nr) {    \
+	.rf_name   = name,					      \
+	.rf_fields = {						  \
+		[RCL_CLIENT] = {					\
+			.nr = client_nr,				\
+			.d  = client				    \
+		},						      \
+		[RCL_SERVER] = {					\
+			.nr = server_nr,				\
+			.d  = server				    \
+		}						       \
+	}							       \
+} /* .rf_idx is not named here: it stays 0 until req_layout_init() sets it */
+
+#define DEFINE_REQ_FMT0(name, client, server) /* both must be real arrays */ \
+DEFINE_REQ_FMT(name, client, ARRAY_SIZE(client), server, ARRAY_SIZE(server))
+
+struct req_format RQF_OBD_PING =
+	DEFINE_REQ_FMT0("OBD_PING", empty, empty);
+EXPORT_SYMBOL(RQF_OBD_PING);
+
+struct req_format RQF_OBD_SET_INFO =
+	DEFINE_REQ_FMT0("OBD_SET_INFO", obd_set_info_client, empty);
+EXPORT_SYMBOL(RQF_OBD_SET_INFO);
+
+/* Read index file through the network */
+struct req_format RQF_OBD_IDX_READ =
+	DEFINE_REQ_FMT0("OBD_IDX_READ",
+			obd_idx_read_client, obd_idx_read_server);
+EXPORT_SYMBOL(RQF_OBD_IDX_READ);
+
+struct req_format RQF_SEC_CTX =
+	DEFINE_REQ_FMT0("SEC_CTX", empty, empty);
+EXPORT_SYMBOL(RQF_SEC_CTX);
+
+struct req_format RQF_MGS_TARGET_REG =
+	DEFINE_REQ_FMT0("MGS_TARGET_REG", mgs_target_info_only,
+			 mgs_target_info_only);	/* same list on both sides */
+EXPORT_SYMBOL(RQF_MGS_TARGET_REG);
+
+struct req_format RQF_MGS_SET_INFO =
+	DEFINE_REQ_FMT0("MGS_SET_INFO", mgs_set_info,
+			 mgs_set_info);	/* same list on both sides */
+EXPORT_SYMBOL(RQF_MGS_SET_INFO);
+
+struct req_format RQF_MGS_CONFIG_READ =
+	DEFINE_REQ_FMT0("MGS_CONFIG_READ", mgs_config_read_client,
+			 mgs_config_read_server);
+EXPORT_SYMBOL(RQF_MGS_CONFIG_READ);
+
+struct req_format RQF_SEQ_QUERY =
+	DEFINE_REQ_FMT0("SEQ_QUERY", seq_query_client, seq_query_server);
+EXPORT_SYMBOL(RQF_SEQ_QUERY);
+
+struct req_format RQF_FLD_QUERY =
+	DEFINE_REQ_FMT0("FLD_QUERY", fld_query_client, fld_query_server);
+EXPORT_SYMBOL(RQF_FLD_QUERY);
+
+struct req_format RQF_LOG_CANCEL = /* name string carries an extra "OBD_" prefix */
+	DEFINE_REQ_FMT0("OBD_LOG_CANCEL", log_cancel_client, empty);
+EXPORT_SYMBOL(RQF_LOG_CANCEL);
+
+struct req_format RQF_MDS_QUOTACHECK =
+	DEFINE_REQ_FMT0("MDS_QUOTACHECK", quotactl_only, empty);
+EXPORT_SYMBOL(RQF_MDS_QUOTACHECK);
+
+struct req_format RQF_OST_QUOTACHECK =
+	DEFINE_REQ_FMT0("OST_QUOTACHECK", quotactl_only, empty);
+EXPORT_SYMBOL(RQF_OST_QUOTACHECK);
+
+struct req_format RQF_MDS_QUOTACTL =
+	DEFINE_REQ_FMT0("MDS_QUOTACTL", quotactl_only, quotactl_only);
+EXPORT_SYMBOL(RQF_MDS_QUOTACTL);
+
+struct req_format RQF_OST_QUOTACTL =
+	DEFINE_REQ_FMT0("OST_QUOTACTL", quotactl_only, quotactl_only);
+EXPORT_SYMBOL(RQF_OST_QUOTACTL);
+
+struct req_format RQF_QC_CALLBACK =
+	DEFINE_REQ_FMT0("QC_CALLBACK", quotactl_only, empty);
+EXPORT_SYMBOL(RQF_QC_CALLBACK);
+
+struct req_format RQF_QUOTA_DQACQ =
+	DEFINE_REQ_FMT0("QUOTA_DQACQ", quota_body_only, quota_body_only);
+EXPORT_SYMBOL(RQF_QUOTA_DQACQ);
+
+struct req_format RQF_LDLM_INTENT_QUOTA =
+	DEFINE_REQ_FMT0("LDLM_INTENT_QUOTA",
+			ldlm_intent_quota_client,
+			ldlm_intent_quota_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_QUOTA);
+
+struct req_format RQF_MDS_GETSTATUS =
+	DEFINE_REQ_FMT0("MDS_GETSTATUS", mdt_body_only, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_GETSTATUS);
+
+struct req_format RQF_MDS_STATFS =
+	DEFINE_REQ_FMT0("MDS_STATFS", empty, obd_statfs_server);
+EXPORT_SYMBOL(RQF_MDS_STATFS);
+
+struct req_format RQF_MDS_SYNC =
+	DEFINE_REQ_FMT0("MDS_SYNC", mdt_body_capa, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_SYNC);
+
+struct req_format RQF_MDS_GETATTR =
+	DEFINE_REQ_FMT0("MDS_GETATTR", mdt_body_capa, mds_getattr_server);
+EXPORT_SYMBOL(RQF_MDS_GETATTR);
+
+struct req_format RQF_MDS_GETXATTR =
+	DEFINE_REQ_FMT0("MDS_GETXATTR",
+			mds_getxattr_client, mds_getxattr_server);
+EXPORT_SYMBOL(RQF_MDS_GETXATTR);
+
+struct req_format RQF_MDS_GETATTR_NAME = /* server list shared with RQF_MDS_GETATTR */
+	DEFINE_REQ_FMT0("MDS_GETATTR_NAME",
+			mds_getattr_name_client, mds_getattr_server);
+EXPORT_SYMBOL(RQF_MDS_GETATTR_NAME);
+
+struct req_format RQF_MDS_REINT =
+	DEFINE_REQ_FMT0("MDS_REINT", mds_reint_client, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_REINT);
+
+struct req_format RQF_MDS_REINT_CREATE =
+	DEFINE_REQ_FMT0("MDS_REINT_CREATE",
+			mds_reint_create_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE);
+
+struct req_format RQF_MDS_REINT_CREATE_RMT_ACL =
+	DEFINE_REQ_FMT0("MDS_REINT_CREATE_RMT_ACL",
+			mds_reint_create_rmt_acl_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_RMT_ACL);
+
+struct req_format RQF_MDS_REINT_CREATE_SLAVE = /* name string says _EA, not _SLAVE */
+	DEFINE_REQ_FMT0("MDS_REINT_CREATE_EA",
+			mds_reint_create_slave_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SLAVE);
+
+struct req_format RQF_MDS_REINT_CREATE_SYM =
+	DEFINE_REQ_FMT0("MDS_REINT_CREATE_SYM",
+			mds_reint_create_sym_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SYM);
+
+struct req_format RQF_MDS_REINT_OPEN =
+	DEFINE_REQ_FMT0("MDS_REINT_OPEN",
+			mds_reint_open_client, mds_reint_open_server);
+EXPORT_SYMBOL(RQF_MDS_REINT_OPEN);
+
+struct req_format RQF_MDS_REINT_UNLINK =
+	DEFINE_REQ_FMT0("MDS_REINT_UNLINK", mds_reint_unlink_client,
+			mds_last_unlink_server);
+EXPORT_SYMBOL(RQF_MDS_REINT_UNLINK);
+
+struct req_format RQF_MDS_REINT_LINK =
+	DEFINE_REQ_FMT0("MDS_REINT_LINK",
+			mds_reint_link_client, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_REINT_LINK);
+
+struct req_format RQF_MDS_REINT_RENAME = /* server list shared with RQF_MDS_REINT_UNLINK */
+	DEFINE_REQ_FMT0("MDS_REINT_RENAME", mds_reint_rename_client,
+			mds_last_unlink_server);
+EXPORT_SYMBOL(RQF_MDS_REINT_RENAME);
+
+struct req_format RQF_MDS_REINT_SETATTR =
+	DEFINE_REQ_FMT0("MDS_REINT_SETATTR",
+			mds_reint_setattr_client, mds_setattr_server);
+EXPORT_SYMBOL(RQF_MDS_REINT_SETATTR);
+
+struct req_format RQF_MDS_REINT_SETXATTR =
+	DEFINE_REQ_FMT0("MDS_REINT_SETXATTR",
+			mds_reint_setxattr_client, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_REINT_SETXATTR);
+
+struct req_format RQF_MDS_CONNECT =
+	DEFINE_REQ_FMT0("MDS_CONNECT",
+			obd_connect_client, obd_connect_server);
+EXPORT_SYMBOL(RQF_MDS_CONNECT);
+
+struct req_format RQF_MDS_DISCONNECT =
+	DEFINE_REQ_FMT0("MDS_DISCONNECT", empty, empty);
+EXPORT_SYMBOL(RQF_MDS_DISCONNECT);
+
+struct req_format RQF_MDS_GET_INFO =
+	DEFINE_REQ_FMT0("MDS_GET_INFO", mds_getinfo_client,
+			mds_getinfo_server);
+EXPORT_SYMBOL(RQF_MDS_GET_INFO);
+
+struct req_format RQF_UPDATE_OBJ = /* name string is "OBJECT_UPDATE_OBJ" */
+	DEFINE_REQ_FMT0("OBJECT_UPDATE_OBJ", mds_update_client,
+			mds_update_server);
+EXPORT_SYMBOL(RQF_UPDATE_OBJ);
+
+struct req_format RQF_LDLM_ENQUEUE = /* NOTE(review): same lists as RQF_LDLM_ENQUEUE_LVB - confirm intended */
+	DEFINE_REQ_FMT0("LDLM_ENQUEUE",
+			ldlm_enqueue_client, ldlm_enqueue_lvb_server);
+EXPORT_SYMBOL(RQF_LDLM_ENQUEUE);
+
+struct req_format RQF_LDLM_ENQUEUE_LVB =
+	DEFINE_REQ_FMT0("LDLM_ENQUEUE_LVB",
+			ldlm_enqueue_client, ldlm_enqueue_lvb_server);
+EXPORT_SYMBOL(RQF_LDLM_ENQUEUE_LVB);
+
+struct req_format RQF_LDLM_CONVERT =
+	DEFINE_REQ_FMT0("LDLM_CONVERT",
+			ldlm_enqueue_client, ldlm_enqueue_server);
+EXPORT_SYMBOL(RQF_LDLM_CONVERT);
+
+struct req_format RQF_LDLM_CANCEL =
+	DEFINE_REQ_FMT0("LDLM_CANCEL", ldlm_enqueue_client, empty);
+EXPORT_SYMBOL(RQF_LDLM_CANCEL);
+
+struct req_format RQF_LDLM_CALLBACK =
+	DEFINE_REQ_FMT0("LDLM_CALLBACK", ldlm_enqueue_client, empty);
+EXPORT_SYMBOL(RQF_LDLM_CALLBACK);
+
+struct req_format RQF_LDLM_CP_CALLBACK =
+	DEFINE_REQ_FMT0("LDLM_CP_CALLBACK", ldlm_cp_callback_client, empty);
+EXPORT_SYMBOL(RQF_LDLM_CP_CALLBACK);
+
+struct req_format RQF_LDLM_BL_CALLBACK =
+	DEFINE_REQ_FMT0("LDLM_BL_CALLBACK", ldlm_enqueue_client, empty);
+EXPORT_SYMBOL(RQF_LDLM_BL_CALLBACK);
+
+struct req_format RQF_LDLM_GL_CALLBACK =
+	DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_enqueue_client,
+			ldlm_gl_callback_server);
+EXPORT_SYMBOL(RQF_LDLM_GL_CALLBACK);
+
+struct req_format RQF_LDLM_GL_DESC_CALLBACK = /* client list differs from RQF_LDLM_GL_CALLBACK */
+	DEFINE_REQ_FMT0("LDLM_GL_DESC_CALLBACK", ldlm_gl_callback_desc_client,
+			ldlm_gl_callback_server); /* rf_name kept distinct from "LDLM_GL_CALLBACK" */
+EXPORT_SYMBOL(RQF_LDLM_GL_DESC_CALLBACK);
+
+struct req_format RQF_LDLM_INTENT_BASIC =
+	DEFINE_REQ_FMT0("LDLM_INTENT_BASIC",
+			ldlm_intent_basic_client, ldlm_enqueue_lvb_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_BASIC);
+
+struct req_format RQF_LDLM_INTENT =
+	DEFINE_REQ_FMT0("LDLM_INTENT",
+			ldlm_intent_client, ldlm_intent_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT);
+
+struct req_format RQF_LDLM_INTENT_LAYOUT =
+	DEFINE_REQ_FMT0("LDLM_INTENT_LAYOUT ", /* NOTE(review): name ends in a space */
+			ldlm_intent_layout_client, ldlm_enqueue_lvb_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_LAYOUT);
+
+struct req_format RQF_LDLM_INTENT_GETATTR =
+	DEFINE_REQ_FMT0("LDLM_INTENT_GETATTR",
+			ldlm_intent_getattr_client, ldlm_intent_getattr_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_GETATTR);
+
+struct req_format RQF_LDLM_INTENT_OPEN =
+	DEFINE_REQ_FMT0("LDLM_INTENT_OPEN",
+			ldlm_intent_open_client, ldlm_intent_open_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_OPEN);
+
+struct req_format RQF_LDLM_INTENT_CREATE = /* server list shared with RQF_LDLM_INTENT_GETATTR */
+	DEFINE_REQ_FMT0("LDLM_INTENT_CREATE",
+			ldlm_intent_create_client, ldlm_intent_getattr_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_CREATE);
+
+struct req_format RQF_LDLM_INTENT_UNLINK = /* server list shared with RQF_LDLM_INTENT */
+	DEFINE_REQ_FMT0("LDLM_INTENT_UNLINK",
+			ldlm_intent_unlink_client, ldlm_intent_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_UNLINK);
+
+struct req_format RQF_MDS_CLOSE =
+	DEFINE_REQ_FMT0("MDS_CLOSE",
+			mdt_close_client, mds_last_unlink_server);
+EXPORT_SYMBOL(RQF_MDS_CLOSE);
+
+struct req_format RQF_MDS_PIN =
+	DEFINE_REQ_FMT0("MDS_PIN",
+			mdt_body_capa, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_PIN);
+
+struct req_format RQF_MDS_UNPIN =
+	DEFINE_REQ_FMT0("MDS_UNPIN", mdt_body_only, empty);
+EXPORT_SYMBOL(RQF_MDS_UNPIN);
+
+struct req_format RQF_MDS_DONE_WRITING = /* client list shared with RQF_MDS_CLOSE */
+	DEFINE_REQ_FMT0("MDS_DONE_WRITING",
+			mdt_close_client, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_DONE_WRITING);
+
+struct req_format RQF_MDS_READPAGE =
+	DEFINE_REQ_FMT0("MDS_READPAGE",
+			mdt_body_capa, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_READPAGE);
+
+struct req_format RQF_MDS_HSM_ACTION =
+	DEFINE_REQ_FMT0("MDS_HSM_ACTION", mdt_body_capa, mdt_hsm_action_server);
+EXPORT_SYMBOL(RQF_MDS_HSM_ACTION);
+
+struct req_format RQF_MDS_HSM_PROGRESS =
+	DEFINE_REQ_FMT0("MDS_HSM_PROGRESS", mdt_hsm_progress, empty);
+EXPORT_SYMBOL(RQF_MDS_HSM_PROGRESS);
+
+struct req_format RQF_MDS_HSM_CT_REGISTER =
+	DEFINE_REQ_FMT0("MDS_HSM_CT_REGISTER", mdt_hsm_ct_register, empty);
+EXPORT_SYMBOL(RQF_MDS_HSM_CT_REGISTER);
+
+struct req_format RQF_MDS_HSM_CT_UNREGISTER =
+	DEFINE_REQ_FMT0("MDS_HSM_CT_UNREGISTER", mdt_hsm_ct_unregister, empty);
+EXPORT_SYMBOL(RQF_MDS_HSM_CT_UNREGISTER);
+
+struct req_format RQF_MDS_HSM_STATE_GET =
+	DEFINE_REQ_FMT0("MDS_HSM_STATE_GET",
+			mdt_body_capa, mdt_hsm_state_get_server);
+EXPORT_SYMBOL(RQF_MDS_HSM_STATE_GET);
+
+struct req_format RQF_MDS_HSM_STATE_SET =
+	DEFINE_REQ_FMT0("MDS_HSM_STATE_SET", mdt_hsm_state_set, empty);
+EXPORT_SYMBOL(RQF_MDS_HSM_STATE_SET);
+
+struct req_format RQF_MDS_HSM_REQUEST =
+	DEFINE_REQ_FMT0("MDS_HSM_REQUEST", mdt_hsm_request, empty);
+EXPORT_SYMBOL(RQF_MDS_HSM_REQUEST);
+
+struct req_format RQF_MDS_SWAP_LAYOUTS =
+	DEFINE_REQ_FMT0("MDS_SWAP_LAYOUTS",
+			mdt_swap_layouts, empty);
+EXPORT_SYMBOL(RQF_MDS_SWAP_LAYOUTS);
+
+/* This is for split; it uses the same two field lists as RQF_MDS_READPAGE. */
+struct req_format RQF_MDS_WRITEPAGE =
+	DEFINE_REQ_FMT0("MDS_WRITEPAGE",
+			mdt_body_capa, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_WRITEPAGE);
+
+struct req_format RQF_MDS_IS_SUBDIR =
+	DEFINE_REQ_FMT0("MDS_IS_SUBDIR",
+			mdt_body_only, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_IS_SUBDIR);
+
+struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE =
+	DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_CREATE",
+			llog_origin_handle_create_client, llogd_body_only);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_CREATE);
+
+struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY =
+	DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_DESTROY",
+			llogd_body_only, llogd_body_only);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_DESTROY);
+
+struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK =
+	DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_NEXT_BLOCK",
+			llogd_body_only, llog_origin_handle_next_block_server);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK);
+
+struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK = /* reuses the NEXT_BLOCK lists */
+	DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_PREV_BLOCK",
+			llogd_body_only, llog_origin_handle_next_block_server);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK);
+
+struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER =
+	DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_READ_HEADER",
+			llogd_body_only, llog_log_hdr_only);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_READ_HEADER);
+
+struct req_format RQF_LLOG_ORIGIN_CONNECT =
+	DEFINE_REQ_FMT0("LLOG_ORIGIN_CONNECT", llogd_conn_body_only, empty);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_CONNECT);
+
+struct req_format RQF_OST_CONNECT = /* same two lists as RQF_MDS_CONNECT */
+	DEFINE_REQ_FMT0("OST_CONNECT",
+			obd_connect_client, obd_connect_server);
+EXPORT_SYMBOL(RQF_OST_CONNECT);
+
+struct req_format RQF_OST_DISCONNECT =
+	DEFINE_REQ_FMT0("OST_DISCONNECT", empty, empty);
+EXPORT_SYMBOL(RQF_OST_DISCONNECT);
+
+struct req_format RQF_OST_GETATTR =
+	DEFINE_REQ_FMT0("OST_GETATTR", ost_body_capa, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_GETATTR);
+
+struct req_format RQF_OST_SETATTR =
+	DEFINE_REQ_FMT0("OST_SETATTR", ost_body_capa, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_SETATTR);
+
+struct req_format RQF_OST_CREATE =
+	DEFINE_REQ_FMT0("OST_CREATE", ost_body_only, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_CREATE);
+
+struct req_format RQF_OST_PUNCH =
+	DEFINE_REQ_FMT0("OST_PUNCH", ost_body_capa, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_PUNCH);
+
+struct req_format RQF_OST_SYNC =
+	DEFINE_REQ_FMT0("OST_SYNC", ost_body_capa, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_SYNC);
+
+struct req_format RQF_OST_DESTROY =
+	DEFINE_REQ_FMT0("OST_DESTROY", ost_destroy_client, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_DESTROY);
+
+struct req_format RQF_OST_BRW_READ =
+	DEFINE_REQ_FMT0("OST_BRW_READ", ost_brw_client, ost_brw_read_server);
+EXPORT_SYMBOL(RQF_OST_BRW_READ);
+
+struct req_format RQF_OST_BRW_WRITE =
+	DEFINE_REQ_FMT0("OST_BRW_WRITE", ost_brw_client, ost_brw_write_server);
+EXPORT_SYMBOL(RQF_OST_BRW_WRITE);
+
+struct req_format RQF_OST_STATFS =
+	DEFINE_REQ_FMT0("OST_STATFS", empty, obd_statfs_server);
+EXPORT_SYMBOL(RQF_OST_STATFS);
+
+struct req_format RQF_OST_SET_GRANT_INFO =
+	DEFINE_REQ_FMT0("OST_SET_GRANT_INFO", ost_grant_shrink_client,
+			 ost_body_only);
+EXPORT_SYMBOL(RQF_OST_SET_GRANT_INFO);
+
+struct req_format RQF_OST_GET_INFO_GENERIC = /* name string lacks "_GENERIC" */
+	DEFINE_REQ_FMT0("OST_GET_INFO", ost_get_info_generic_client,
+					ost_get_info_generic_server);
+EXPORT_SYMBOL(RQF_OST_GET_INFO_GENERIC);
+
+struct req_format RQF_OST_GET_INFO_LAST_ID =
+	DEFINE_REQ_FMT0("OST_GET_INFO_LAST_ID", ost_get_info_generic_client,
+						ost_get_last_id_server);
+EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_ID);
+
+struct req_format RQF_OST_GET_INFO_LAST_FID = /* NOTE(review): a GET using the set_info client list - confirm */
+	DEFINE_REQ_FMT0("OST_GET_INFO_LAST_FID", obd_set_info_client,
+						 ost_get_last_fid_server);
+EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_FID);
+
+struct req_format RQF_OST_SET_INFO_LAST_FID =
+	DEFINE_REQ_FMT0("OST_SET_INFO_LAST_FID", obd_set_info_client,
+						 empty);
+EXPORT_SYMBOL(RQF_OST_SET_INFO_LAST_FID);
+
+struct req_format RQF_OST_GET_INFO_FIEMAP =
+	DEFINE_REQ_FMT0("OST_GET_INFO_FIEMAP", ost_get_fiemap_client,
+					       ost_get_fiemap_server);
+EXPORT_SYMBOL(RQF_OST_GET_INFO_FIEMAP);
+
+#if !defined(__REQ_LAYOUT_USER__)
+
+/* Convenience macro: entry j of the field list that fmt keeps for location i */
+#define FMT_FIELD(fmt, i, j) (fmt)->rf_fields[(i)].d[(j)]
+
+/**
+ * Initializes the capsule abstraction: stores every req_formats[] slot number
+ * in \a rf_idx and each field's position + 1 in \a rmf_offset.  Returns 0.
+ */
+int req_layout_init(void)
+{
+	int i;	/* slot in req_formats[] */
+	int j;	/* location, 0 .. RCL_NR - 1 */
+	int k;	/* position inside the field list */
+	struct req_format *rf = NULL;
+
+	for (i = 0; i < ARRAY_SIZE(req_formats); ++i) {
+		rf = req_formats[i];
+		rf->rf_idx = i;
+		for (j = 0; j < RCL_NR; ++j) {
+			LASSERT(rf->rf_fields[j].nr <= REQ_MAX_FIELD_NR);
+			for (k = 0; k < rf->rf_fields[j].nr; ++k) {
+				struct req_msg_field *field; /* non-const view */
+
+				field = (typeof(field))rf->rf_fields[j].d[k];
+				LASSERT(!(field->rmf_flags & RMF_F_STRUCT_ARRAY)
+					|| field->rmf_size > 0);
+				LASSERT(field->rmf_offset[i][j] == 0);
+				/*
+				 * Store k + 1 rather than k, so that 0 still
+				 * marks an unused format/field combination.
+				 */
+				field->rmf_offset[i][j] = k + 1;
+			}
+		}
+	}
+	return 0;
+}
+EXPORT_SYMBOL(req_layout_init);
+
+void req_layout_fini(void)
+{
+} /* empty body: this function does nothing */
+EXPORT_SYMBOL(req_layout_fini);
+
+/**
+ * Initializes the expected sizes of each RMF in a \a pill (\a rc_area) to -1.
+ *
+ * Actual/expected field sizes are set elsewhere in functions in this file:
+ * req_capsule_init(), req_capsule_server_pack(), req_capsule_set_size() and
+ * req_capsule_msg_size().  The \a rc_area information is used by
+ * ptlrpc_request_set_replen().
+ */
+void req_capsule_init_area(struct req_capsule *pill)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(pill->rc_area[RCL_CLIENT]); i++) {
+		pill->rc_area[RCL_CLIENT][i] = -1;
+		pill->rc_area[RCL_SERVER][i] = -1;
+	}
+}
+EXPORT_SYMBOL(req_capsule_init_area);
+
+/**
+ * Initialize a pill.
+ *
+ * The \a location indicates whether the caller is executing on the client side
+ * (RCL_CLIENT) or server side (RCL_SERVER).
+ */
+void req_capsule_init(struct req_capsule *pill,
+		      struct ptlrpc_request *req,
+		      enum req_location location)
+{
+	LASSERT(location == RCL_SERVER || location == RCL_CLIENT);
+
+	/*
+	 * Today all capsules are embedded in ptlrpc_request structs,
+	 * but just in case that ever isn't the case, we don't reach
+	 * into req unless req != NULL and pill is the one embedded in
+	 * the req.
+	 *
+	 * The req->rq_pill_init flag makes it safe to initialize a pill
+	 * twice, which might happen in the OST paths as a result of the
+	 * high-priority RPC queue getting peeked at before ost_handle()
+	 * handles an OST RPC.
+	 */
+	if (req != NULL && pill == &req->rq_pill && req->rq_pill_init)
+		return;
+
+	memset(pill, 0, sizeof *pill);
+	pill->rc_req = req;
+	pill->rc_loc = location;
+	req_capsule_init_area(pill);
+
+	if (req != NULL && pill == &req->rq_pill)
+		req->rq_pill_init = 1;
+}
+EXPORT_SYMBOL(req_capsule_init);
+
+void req_capsule_fini(struct req_capsule *pill)
+{
+}
+EXPORT_SYMBOL(req_capsule_fini);
+
+static int __req_format_is_sane(const struct req_format *fmt)
+{
+	return
+		0 <= fmt->rf_idx && fmt->rf_idx < ARRAY_SIZE(req_formats) &&
+		req_formats[fmt->rf_idx] == fmt;
+}
+
+static struct lustre_msg *__req_msg(const struct req_capsule *pill,
+				    enum req_location loc)
+{
+	struct ptlrpc_request *req;
+
+	req = pill->rc_req;
+	return loc == RCL_CLIENT ? req->rq_reqmsg : req->rq_repmsg;
+}
+
+/**
+ * Set the format (\a fmt) of a \a pill; format changes are not allowed here
+ * (see req_capsule_extend()).
+ */
+void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt)
+{
+	LASSERT(pill->rc_fmt == NULL || pill->rc_fmt == fmt);
+	LASSERT(__req_format_is_sane(fmt));
+
+	pill->rc_fmt = fmt;
+}
+EXPORT_SYMBOL(req_capsule_set);
+
+/**
+ * Fills in any parts of the \a rc_area of a \a pill that haven't been filled in
+ * yet.
+ *
+ * \a rc_area is an array of REQ_MAX_FIELD_NR elements, used to store sizes of
+ * variable-sized fields.  The field sizes come from the declared \a rmf_size
+ * field of a \a pill's \a rc_fmt's RMF's.
+ */
+int req_capsule_filled_sizes(struct req_capsule *pill,
+			   enum req_location loc)
+{
+	const struct req_format *fmt = pill->rc_fmt;
+	int		      i;
+
+	LASSERT(fmt != NULL);
+
+	for (i = 0; i < fmt->rf_fields[loc].nr; ++i) {
+		if (pill->rc_area[loc][i] == -1) {
+			pill->rc_area[loc][i] =
+					    fmt->rf_fields[loc].d[i]->rmf_size;
+			if (pill->rc_area[loc][i] == -1) {
+				/*
+				 * Skip the following fields.
+				 *
+				 * If this LASSERT() trips then you're missing a
+				 * call to req_capsule_set_size().
+				 */
+				LASSERT(loc != RCL_SERVER);
+				break;
+			}
+		}
+	}
+	return i;
+}
+EXPORT_SYMBOL(req_capsule_filled_sizes);
+
+/**
+ * Capsule equivalent of lustre_pack_request() and lustre_pack_reply().
+ *
+ * This function uses the \a pill's \a rc_area as filled in by
+ * req_capsule_set_size() or req_capsule_filled_sizes() (the latter is called by
+ * this function).
+ */
+int req_capsule_server_pack(struct req_capsule *pill)
+{
+	const struct req_format *fmt;
+	int		      count;
+	int		      rc;
+
+	LASSERT(pill->rc_loc == RCL_SERVER);
+	fmt = pill->rc_fmt;
+	LASSERT(fmt != NULL);
+
+	count = req_capsule_filled_sizes(pill, RCL_SERVER);
+	rc = lustre_pack_reply(pill->rc_req, count,
+			       pill->rc_area[RCL_SERVER], NULL);
+	if (rc != 0) {
+		DEBUG_REQ(D_ERROR, pill->rc_req,
+		       "Cannot pack %d fields in format `%s': ",
+		       count, fmt->rf_name);
+	}
+	return rc;
+}
+EXPORT_SYMBOL(req_capsule_server_pack);
+
+/**
+ * Returns the PTLRPC request or reply (\a loc) buffer offset of a \a pill
+ * corresponding to the given RMF (\a field).
+ */
+static int __req_capsule_offset(const struct req_capsule *pill,
+				const struct req_msg_field *field,
+				enum req_location loc)
+{
+	int offset;
+
+	offset = field->rmf_offset[pill->rc_fmt->rf_idx][loc];
+	LASSERTF(offset > 0, "%s:%s, off=%d, loc=%d\n",
+			    pill->rc_fmt->rf_name,
+			    field->rmf_name, offset, loc);
+	offset --;
+
+	LASSERT(0 <= offset && offset < REQ_MAX_FIELD_NR);
+	return offset;
+}
+
+/**
+ * Helper for __req_capsule_get(); swabs value / array of values and/or dumps
+ * them if desired.
+ *
+ * A non-NULL \a swabber overrides the field's \a rmf_swabber.  If \a dump is
+ * set and the field has an \a rmf_dumper, the data is dumped before swabbing
+ * and, when a swab actually happened, once more afterwards.
+ */
+static void
+swabber_dumper_helper(struct req_capsule *pill,
+		      const struct req_msg_field *field,
+		      enum req_location loc,
+		      int offset,
+		      void *value, int len, int dump, void (*swabber)( void *))
+{
+	void    *p;
+	int     i;
+	int     n;
+	int     do_swab;
+	int     inout = loc == RCL_CLIENT;
+
+	swabber = swabber ?: field->rmf_swabber;
+
+	do_swab = ptlrpc_buf_need_swab(pill->rc_req, inout, offset) &&
+		  swabber != NULL && value != NULL;
+
+	if (!(field->rmf_flags & RMF_F_STRUCT_ARRAY)) {
+		if (dump && field->rmf_dumper) {
+			CDEBUG(D_RPCTRACE, "Dump of %sfield %s follows\n",
+			       do_swab ? "unswabbed " : "", field->rmf_name);
+			field->rmf_dumper(value);
+		}
+		if (!do_swab)
+			return;
+		swabber(value);
+		ptlrpc_buf_set_swabbed(pill->rc_req, inout, offset);
+		/* rmf_dumper is optional: test it here as well */
+		if (dump && field->rmf_dumper) {
+			CDEBUG(D_RPCTRACE, "Dump of swabbed field %s "
+			       "follows\n", field->rmf_name);
+			field->rmf_dumper(value);
+		}
+		return;
+	}
+
+	/*
+	 * We're swabbing an array; swabber() swabs a single array element, so
+	 * swab every element.
+	 */
+	LASSERT((len % field->rmf_size) == 0);
+	for (p = value, i = 0, n = len / field->rmf_size;
+	     i < n; i++, p += field->rmf_size) {
+		if (dump && field->rmf_dumper) {
+			CDEBUG(D_RPCTRACE, "Dump of %sarray field %s, "
+			       "element %d follows\n",
+			       do_swab ? "unswabbed " : "", field->rmf_name, i);
+			field->rmf_dumper(p);
+		}
+		if (!do_swab)
+			continue;
+		swabber(p);
+		if (dump && field->rmf_dumper) {
+			CDEBUG(D_RPCTRACE, "Dump of swabbed array field %s, "
+			       "element %d follows\n", field->rmf_name, i);
+			/* dump the element just swabbed, not the array head */
+			field->rmf_dumper(p);
+		}
+	}
+	if (do_swab)
+		ptlrpc_buf_set_swabbed(pill->rc_req, inout, offset);
+}
+
+/**
+ * Returns the pointer to a PTLRPC request or reply (\a loc) buffer of a \a pill
+ * corresponding to the given RMF (\a field).
+ *
+ * The buffer will be swabbed using the given \a swabber.  If \a swabber == NULL
+ * then the \a rmf_swabber from the RMF will be used.  Soon there will be no
+ * calls to __req_capsule_get() with a non-NULL \a swabber; \a swabber will then
+ * be removed.  Fields with the \a RMF_F_STRUCT_ARRAY flag set will have each
+ * element of the array swabbed.
+ */
+static void *__req_capsule_get(struct req_capsule *pill,
+			       const struct req_msg_field *field,
+			       enum req_location loc,
+			       void (*swabber)( void *),
+			       int dump)
+{
+	const struct req_format *fmt;
+	struct lustre_msg       *msg;
+	void		    *value;
+	int		      len;
+	int		      offset;
+
+	void *(*getter)(struct lustre_msg *m, int n, int minlen);
+
+	static const char *rcl_names[RCL_NR] = {
+		[RCL_CLIENT] = "client",
+		[RCL_SERVER] = "server"
+	};
+
+	LASSERT(pill != NULL);
+	LASSERT(pill != LP_POISON);
+	fmt = pill->rc_fmt;
+	LASSERT(fmt != NULL);
+	LASSERT(fmt != LP_POISON);
+	LASSERT(__req_format_is_sane(fmt));
+
+	offset = __req_capsule_offset(pill, field, loc);
+
+	msg = __req_msg(pill, loc);
+	LASSERT(msg != NULL);
+
+	getter = (field->rmf_flags & RMF_F_STRING) ?
+		(typeof(getter))lustre_msg_string : lustre_msg_buf;
+
+	if (field->rmf_flags & RMF_F_STRUCT_ARRAY) {
+		/*
+		 * We've already asserted that field->rmf_size > 0 in
+		 * req_layout_init().
+		 */
+		len = lustre_msg_buflen(msg, offset);
+		if ((len % field->rmf_size) != 0) {
+			CERROR("%s: array field size mismatch "
+			       "%d modulo %d != 0 (%d)\n",
+			       field->rmf_name, len, field->rmf_size, loc);
+			return NULL;
+		}
+	} else if (pill->rc_area[loc][offset] != -1) {
+		len = pill->rc_area[loc][offset];
+	} else {
+		len = max(field->rmf_size, 0);
+	}
+	value = getter(msg, offset, len);
+
+	if (value == NULL) {
+		DEBUG_REQ(D_ERROR, pill->rc_req,
+			  "Wrong buffer for field `%s' (%d of %d) "
+			  "in format `%s': %d vs. %d (%s)\n",
+			  field->rmf_name, offset, lustre_msg_bufcount(msg),
+			  fmt->rf_name, lustre_msg_buflen(msg, offset), len,
+			  rcl_names[loc]);
+	} else {
+		swabber_dumper_helper(pill, field, loc, offset, value, len,
+				      dump, swabber);
+	}
+
+	return value;
+}
+
+/**
+ * Dump a request and/or reply: each field of \a pill's format at \a loc that
+ * has an \a rmf_dumper is dumped; for the others only the size is logged.
+ */
+void __req_capsule_dump(struct req_capsule *pill, enum req_location loc)
+{
+	const struct    req_format *fmt;
+	const struct    req_msg_field *field;
+	int	     len, i;
+
+	fmt = pill->rc_fmt;
+
+	DEBUG_REQ(D_RPCTRACE, pill->rc_req, "BEGIN REQ CAPSULE DUMP\n");
+	for (i = 0; i < fmt->rf_fields[loc].nr; ++i) {
+		field = FMT_FIELD(fmt, loc, i);
+		if (field->rmf_dumper == NULL) {
+			/*
+			 * FIXME Add a default hex dumper for fields that don't
+			 * have a specific dumper
+			 */
+			len = req_capsule_get_size(pill, field, loc);
+			CDEBUG(D_RPCTRACE, "Field %s has no dumper function; "
+			       "field size is %d\n", field->rmf_name, len);
+		} else {
+			/* It's the dumping side-effect that we're interested in */
+			(void) __req_capsule_get(pill, field, loc, NULL, 1);
+		}
+	}
+	CDEBUG(D_RPCTRACE, "END REQ CAPSULE DUMP\n");
+}
+
+/**
+ * Dump a request.
+ */
+void req_capsule_client_dump(struct req_capsule *pill)
+{
+	__req_capsule_dump(pill, RCL_CLIENT);
+}
+EXPORT_SYMBOL(req_capsule_client_dump);
+
+/**
+ * Dump a reply
+ */
+void req_capsule_server_dump(struct req_capsule *pill)
+{
+	__req_capsule_dump(pill, RCL_SERVER);
+}
+EXPORT_SYMBOL(req_capsule_server_dump);
+
+/**
+ * Trivial wrapper around __req_capsule_get(), that returns the PTLRPC request
+ * buffer corresponding to the given RMF (\a field) of a \a pill.
+ */
+void *req_capsule_client_get(struct req_capsule *pill,
+			     const struct req_msg_field *field)
+{
+	return __req_capsule_get(pill, field, RCL_CLIENT, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_client_get);
+
+/**
+ * Same as req_capsule_client_get(), but with a \a swabber argument.
+ *
+ * Currently unused; will be removed when req_capsule_server_swab_get() is
+ * unused too.
+ */
+void *req_capsule_client_swab_get(struct req_capsule *pill,
+				  const struct req_msg_field *field,
+				  void *swabber)
+{
+	return __req_capsule_get(pill, field, RCL_CLIENT, swabber, 0);
+}
+EXPORT_SYMBOL(req_capsule_client_swab_get);
+
+/**
+ * Utility that combines req_capsule_set_size() and req_capsule_client_get().
+ *
+ * First the \a pill's request \a field's size is set (\a rc_area) using
+ * req_capsule_set_size() with the given \a len.  Then the actual buffer is
+ * returned.
+ */
+void *req_capsule_client_sized_get(struct req_capsule *pill,
+				   const struct req_msg_field *field,
+				   int len)
+{
+	req_capsule_set_size(pill, field, RCL_CLIENT, len);
+	return __req_capsule_get(pill, field, RCL_CLIENT, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_client_sized_get);
+
+/**
+ * Trivial wrapper around __req_capsule_get(), that returns the PTLRPC reply
+ * buffer corresponding to the given RMF (\a field) of a \a pill.
+ */
+void *req_capsule_server_get(struct req_capsule *pill,
+			     const struct req_msg_field *field)
+{
+	return __req_capsule_get(pill, field, RCL_SERVER, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_server_get);
+
+/**
+ * Same as req_capsule_server_get(), but with a \a swabber argument.
+ *
+ * Ideally all swabbing should be done pursuant to RMF definitions, with no
+ * swabbing done outside this capsule abstraction.
+ */
+void *req_capsule_server_swab_get(struct req_capsule *pill,
+				  const struct req_msg_field *field,
+				  void *swabber)
+{
+	return __req_capsule_get(pill, field, RCL_SERVER, swabber, 0);
+}
+EXPORT_SYMBOL(req_capsule_server_swab_get);
+
+/**
+ * Utility that combines req_capsule_set_size() and req_capsule_server_get().
+ *
+ * First the \a pill's request \a field's size is set (\a rc_area) using
+ * req_capsule_set_size() with the given \a len.  Then the actual buffer is
+ * returned.
+ */
+void *req_capsule_server_sized_get(struct req_capsule *pill,
+				   const struct req_msg_field *field,
+				   int len)
+{
+	req_capsule_set_size(pill, field, RCL_SERVER, len);
+	return __req_capsule_get(pill, field, RCL_SERVER, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_server_sized_get);
+
+void *req_capsule_server_sized_swab_get(struct req_capsule *pill,
+					const struct req_msg_field *field,
+					int len, void *swabber)
+{
+	req_capsule_set_size(pill, field, RCL_SERVER, len);
+	return __req_capsule_get(pill, field, RCL_SERVER, swabber, 0);
+}
+EXPORT_SYMBOL(req_capsule_server_sized_swab_get);
+
+/**
+ * Returns the buffer of a \a pill corresponding to the given \a field from the
+ * request (if the caller is executing on the server-side) or reply (if the
+ * caller is executing on the client-side).
+ *
+ * This function is convenient for use in code that could be executed on the
+ * client and server alike.
+ */
+const void *req_capsule_other_get(struct req_capsule *pill,
+				  const struct req_msg_field *field)
+{
+	return __req_capsule_get(pill, field, pill->rc_loc ^ 1, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_other_get);
+
+/**
+ * Set the size of the PTLRPC request/reply (\a loc) buffer for the given \a
+ * field of the given \a pill.
+ *
+ * This function must be used when constructing variable sized fields of a
+ * request or reply.
+ */
+void req_capsule_set_size(struct req_capsule *pill,
+			  const struct req_msg_field *field,
+			  enum req_location loc, int size)
+{
+	LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+
+	if ((size != field->rmf_size) &&
+	    (field->rmf_size != -1) &&
+	    !(field->rmf_flags & RMF_F_NO_SIZE_CHECK) &&
+	    (size > 0)) {
+		if ((field->rmf_flags & RMF_F_STRUCT_ARRAY) &&
+		    (size % field->rmf_size != 0)) {
+			CERROR("%s: array field size mismatch "
+			       "%d %% %d != 0 (%d)\n",
+			       field->rmf_name, size, field->rmf_size, loc);
+			LBUG();
+		} else if (!(field->rmf_flags & RMF_F_STRUCT_ARRAY) &&
+		    size < field->rmf_size) {
+			CERROR("%s: field size mismatch %d != %d (%d)\n",
+			       field->rmf_name, size, field->rmf_size, loc);
+			LBUG();
+		}
+	}
+
+	pill->rc_area[loc][__req_capsule_offset(pill, field, loc)] = size;
+}
+EXPORT_SYMBOL(req_capsule_set_size);
+
+/**
+ * Return the actual PTLRPC buffer length of a request or reply (\a loc)
+ * for the given \a pill's given \a field.
+ *
+ * NB: this function doesn't correspond with req_capsule_set_size(), which
+ * actually sets the size in pill.rc_area[loc][offset], but this function
+ * returns the message buflen[offset], maybe we should use another name.
+ */
+int req_capsule_get_size(const struct req_capsule *pill,
+			 const struct req_msg_field *field,
+			 enum req_location loc)
+{
+	LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+
+	return lustre_msg_buflen(__req_msg(pill, loc),
+				 __req_capsule_offset(pill, field, loc));
+}
+EXPORT_SYMBOL(req_capsule_get_size);
+
+/**
+ * Wrapper around lustre_msg_size() that returns the PTLRPC size needed for the
+ * given \a pill's request or reply (\a loc) given the field size recorded in
+ * the \a pill's rc_area.
+ *
+ * See also req_capsule_set_size().
+ */
+int req_capsule_msg_size(struct req_capsule *pill, enum req_location loc)
+{
+	return lustre_msg_size(pill->rc_req->rq_import->imp_msg_magic,
+			       pill->rc_fmt->rf_fields[loc].nr,
+			       pill->rc_area[loc]);
+}
+
+/**
+ * While req_capsule_msg_size() computes the size of a PTLRPC request or reply
+ * (\a loc) given a \a pill's \a rc_area, this function computes the size of a
+ * PTLRPC request or reply given only an RQF (\a fmt).
+ *
+ * This function should not be used for formats which contain variable size
+ * fields.
+ */
+int req_capsule_fmt_size(__u32 magic, const struct req_format *fmt,
+			 enum req_location loc)
+{
+	int size, i = 0;
+
+	/*
+	 * This function should probably LASSERT() that fmt has no fields with
+	 * RMF_F_STRUCT_ARRAY in rmf_flags, since we can't know here how many
+	 * elements in the array there will ultimately be, but then, we could
+	 * assume that there will be at least one element, and that's just what
+	 * we do.
+	 */
+	size = lustre_msg_hdr_size(magic, fmt->rf_fields[loc].nr);
+	if (size < 0)
+		return size;
+
+	for (; i < fmt->rf_fields[loc].nr; ++i)
+		if (fmt->rf_fields[loc].d[i]->rmf_size != -1)
+			size += cfs_size_round(fmt->rf_fields[loc].d[i]->
+					       rmf_size);
+	return size;
+}
+
+/**
+ * Changes the format of an RPC.
+ *
+ * The pill must already have been initialized, which means that it already has
+ * a request format.  The new format \a fmt must be an extension of the pill's
+ * old format.  Specifically: the new format must have as many request and reply
+ * fields as the old one, and all fields shared by the old and new format must
+ * be at least as large in the new format.
+ *
+ * The new format's fields may be of different "type" than the old format, but
+ * only for fields that are "opaque" blobs: fields which a) have no
+ * \a rmf_swabber, b) \a rmf_flags == 0 or RMF_F_NO_SIZE_CHECK, and c) \a
+ * rmf_size == -1 or \a rmf_flags == RMF_F_NO_SIZE_CHECK.  For example,
+ * OBD_SET_INFO has a key field and an opaque value field that gets interpreted
+ * according to the key field.  When the value, according to the key, contains a
+ * structure (or array thereof) to be swabbed, the format should be changed to
+ * one where the value field has \a rmf_size/rmf_flags/rmf_swabber set
+ * accordingly.
+ */
+void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt)
+{
+	int i;
+	int j;
+
+	const struct req_format *old;
+
+	LASSERT(pill->rc_fmt != NULL);
+	LASSERT(__req_format_is_sane(fmt));
+
+	old = pill->rc_fmt;
+	/*
+	 * Sanity checking...
+	 */
+	for (i = 0; i < RCL_NR; ++i) {
+		LASSERT(fmt->rf_fields[i].nr >= old->rf_fields[i].nr);
+		for (j = 0; j < old->rf_fields[i].nr - 1; ++j) {
+			const struct req_msg_field *ofield = FMT_FIELD(old, i, j);
+
+			/* "opaque" fields can be transmogrified */
+			if (ofield->rmf_swabber == NULL &&
+			    (ofield->rmf_flags & ~RMF_F_NO_SIZE_CHECK) == 0 &&
+			    (ofield->rmf_size == -1 ||
+			    ofield->rmf_flags == RMF_F_NO_SIZE_CHECK))
+				continue;
+			LASSERT(FMT_FIELD(fmt, i, j) == FMT_FIELD(old, i, j));
+		}
+		/*
+		 * Last field in old format can be shorter than in new.
+		 */
+		LASSERT(FMT_FIELD(fmt, i, j)->rmf_size >=
+			FMT_FIELD(old, i, j)->rmf_size);
+	}
+
+	pill->rc_fmt = fmt;
+}
+EXPORT_SYMBOL(req_capsule_extend);
+
+/**
+ * This function returns a non-zero value if the given \a field is present in
+ * the format (\a rc_fmt) of \a pill's PTLRPC request or reply (\a loc), else it
+ * returns 0.
+ */
+int req_capsule_has_field(const struct req_capsule *pill,
+			  const struct req_msg_field *field,
+			  enum req_location loc)
+{
+	LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+
+	return field->rmf_offset[pill->rc_fmt->rf_idx][loc];
+}
+EXPORT_SYMBOL(req_capsule_has_field);
+
+/**
+ * Returns a non-zero value if the given \a field is present in the given \a
+ * pill's PTLRPC request or reply (\a loc), else it returns 0.
+ */
+int req_capsule_field_present(const struct req_capsule *pill,
+			      const struct req_msg_field *field,
+			      enum req_location loc)
+{
+	int offset;
+
+	LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+	LASSERT(req_capsule_has_field(pill, field, loc));
+
+	offset = __req_capsule_offset(pill, field, loc);
+	return lustre_msg_bufcount(__req_msg(pill, loc)) > offset;
+}
+EXPORT_SYMBOL(req_capsule_field_present);
+
+/**
+ * This function shrinks the size of the _buffer_ of the \a pill's PTLRPC
+ * request or reply (\a loc).
+ *
+ * This is not the opposite of req_capsule_extend().
+ */
+void req_capsule_shrink(struct req_capsule *pill,
+			const struct req_msg_field *field,
+			unsigned int newlen,
+			enum req_location loc)
+{
+	const struct req_format *fmt;
+	struct lustre_msg       *msg;
+	int		      len;
+	int		      offset;
+
+	fmt = pill->rc_fmt;
+	LASSERT(fmt != NULL);
+	LASSERT(__req_format_is_sane(fmt));
+	LASSERT(req_capsule_has_field(pill, field, loc));
+	LASSERT(req_capsule_field_present(pill, field, loc));
+
+	offset = __req_capsule_offset(pill, field, loc);
+
+	msg = __req_msg(pill, loc);
+	len = lustre_msg_buflen(msg, offset);
+	LASSERTF(newlen <= len, "%s:%s, oldlen=%d, newlen=%d\n",
+				fmt->rf_name, field->rmf_name, len, newlen);
+
+	if (loc == RCL_CLIENT)
+		pill->rc_req->rq_reqlen = lustre_shrink_msg(msg, offset, newlen,
+							    1);
+	else
+		pill->rc_req->rq_replen = lustre_shrink_msg(msg, offset, newlen,
+							    1);
+}
+EXPORT_SYMBOL(req_capsule_shrink);
+
+int req_capsule_server_grow(struct req_capsule *pill,
+			    const struct req_msg_field *field,
+			    unsigned int newlen)
+{
+	struct ptlrpc_reply_state *rs = pill->rc_req->rq_reply_state, *nrs;
+	char *from, *to;
+	int offset, len, rc;
+
+	LASSERT(pill->rc_fmt != NULL);
+	LASSERT(__req_format_is_sane(pill->rc_fmt));
+	LASSERT(req_capsule_has_field(pill, field, RCL_SERVER));
+	LASSERT(req_capsule_field_present(pill, field, RCL_SERVER));
+
+	len = req_capsule_get_size(pill, field, RCL_SERVER);
+	offset = __req_capsule_offset(pill, field, RCL_SERVER);
+	if (pill->rc_req->rq_repbuf_len >=
+	    lustre_packed_msg_size(pill->rc_req->rq_repmsg) - len + newlen)
+		CERROR("Inplace repack might be done\n");
+
+	pill->rc_req->rq_reply_state = NULL;
+	req_capsule_set_size(pill, field, RCL_SERVER, newlen);
+	rc = req_capsule_server_pack(pill);
+	if (rc) {
+		/* put old rs back, the caller will decide what to do */
+		pill->rc_req->rq_reply_state = rs;
+		return rc;
+	}
+	nrs = pill->rc_req->rq_reply_state;
+	/* Now we need only buffers, copy first chunk */
+	to = lustre_msg_buf(nrs->rs_msg, 0, 0);
+	from = lustre_msg_buf(rs->rs_msg, 0, 0);
+	len = (char *)lustre_msg_buf(rs->rs_msg, offset, 0) - from;
+	memcpy(to, from, len);
+	/* check if we have tail and copy it too */
+	if (rs->rs_msg->lm_bufcount > offset + 1) {
+		to = lustre_msg_buf(nrs->rs_msg, offset + 1, 0);
+		from = lustre_msg_buf(rs->rs_msg, offset + 1, 0);
+		offset = rs->rs_msg->lm_bufcount - 1;
+		len = (char *)lustre_msg_buf(rs->rs_msg, offset, 0) +
+		      cfs_size_round(rs->rs_msg->lm_buflens[offset]) - from;
+		memcpy(to, from, len);
+	}
+	/* drop old reply if everything is fine */
+	if (rs->rs_difficult) {
+		/* copy rs data */
+		int i;
+
+		nrs->rs_difficult = 1;
+		nrs->rs_no_ack = rs->rs_no_ack;
+		for (i = 0; i < rs->rs_nlocks; i++) {
+			nrs->rs_locks[i] = rs->rs_locks[i];
+			nrs->rs_modes[i] = rs->rs_modes[i];
+			nrs->rs_nlocks++;
+		}
+		rs->rs_nlocks = 0;
+		rs->rs_difficult = 0;
+		rs->rs_no_ack = 0;
+	}
+	ptlrpc_rs_decref(rs);
+	return 0;
+}
+EXPORT_SYMBOL(req_capsule_server_grow);
+/* __REQ_LAYOUT_USER__ */
+#endif
diff --git a/drivers/staging/lustre/lustre/ptlrpc/llog_client.c b/drivers/staging/lustre/lustre/ptlrpc/llog_client.c
new file mode 100644
index 000000000000..367ca8ef7d60
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/llog_client.c
@@ -0,0 +1,354 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/llog_client.c
+ *
+ * remote api for llog - client side
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include <lustre_net.h>
+#include <linux/list.h>
+
+#define LLOG_CLIENT_ENTRY(ctxt, imp) do {			     \
+	mutex_lock(&ctxt->loc_mutex);			     \
+	if (ctxt->loc_imp) {					  \
+		imp = class_import_get(ctxt->loc_imp);		\
+	} else {						      \
+		CERROR("ctxt->loc_imp == NULL for context idx %d. "   \
+		       "Unable to complete MDS/OSS recovery, "	 \
+		       "but I'll try again next time.  Not fatal.\n", \
+		       ctxt->loc_idx);				\
+		imp = NULL;					   \
+		mutex_unlock(&ctxt->loc_mutex);		   \
+		return (-EINVAL); /* NB: leaves the calling function */ \
+	}							     \
+	mutex_unlock(&ctxt->loc_mutex);			   \
+} while(0)
+
+#define LLOG_CLIENT_EXIT(ctxt, imp) do {			      \
+	mutex_lock(&ctxt->loc_mutex);			     \
+	if (ctxt->loc_imp != imp)				     \
+		CWARN("loc_imp has changed from %p to %p\n",	  \
+		       ctxt->loc_imp, imp);			   \
+	class_import_put(imp);					\
+	mutex_unlock(&ctxt->loc_mutex);			   \
+} while(0)
+
+/* This is a callback from the llog_* functions.
+ * Assumes caller has already pushed us into the kernel context. */
+static int llog_client_open(const struct lu_env *env,
+			    struct llog_handle *lgh, struct llog_logid *logid,
+			    char *name, enum llog_open_param open_param)
+{
+	struct obd_import     *imp;
+	struct llogd_body     *body;
+	struct llog_ctxt      *ctxt;
+	struct ptlrpc_request *req = NULL;
+	int		    rc;
+	ENTRY;
+
+	LASSERT(lgh);	/* must hold before lgh is dereferenced below */
+	ctxt = lgh->lgh_ctxt;
+	LLOG_CLIENT_ENTRY(ctxt, imp);
+	/* client cannot create llog */
+	LASSERTF(open_param != LLOG_OPEN_NEW, "%#x\n", open_param);
+
+	req = ptlrpc_request_alloc(imp, &RQF_LLOG_ORIGIN_HANDLE_CREATE);
+	if (req == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	if (name)
+		req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+				     strlen(name) + 1);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_LOG_VERSION,
+				 LLOG_ORIGIN_HANDLE_CREATE);
+	if (rc) {
+		ptlrpc_request_free(req);
+		req = NULL;
+		GOTO(out, rc);
+	}
+	ptlrpc_request_set_replen(req);
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	if (logid)
+		body->lgd_logid = *logid;
+	body->lgd_ctxt_idx = ctxt->loc_idx - 1;
+
+	if (name) {
+		char *tmp;
+		tmp = req_capsule_client_sized_get(&req->rq_pill, &RMF_NAME,
+						   strlen(name) + 1);
+		LASSERT(tmp);
+		strcpy(tmp, name);
+	}
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		GOTO(out, rc);
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	if (body == NULL)
+		GOTO(out, rc = -EFAULT);
+
+	lgh->lgh_id = body->lgd_logid;
+	lgh->lgh_ctxt = ctxt;
+	EXIT;
+out:
+	LLOG_CLIENT_EXIT(ctxt, imp);
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+static int llog_client_destroy(const struct lu_env *env,
+			       struct llog_handle *loghandle)
+{
+	struct obd_import     *imp;
+	struct ptlrpc_request *req = NULL;
+	struct llogd_body     *body;
+	int		    rc;
+	ENTRY;
+
+	LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp);
+	req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_DESTROY,
+					LUSTRE_LOG_VERSION,
+					LLOG_ORIGIN_HANDLE_DESTROY);
+	if (req == NULL)
+		GOTO(err_exit, rc =-ENOMEM);
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	body->lgd_logid = loghandle->lgh_id;
+	body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags;
+
+	if (!(body->lgd_llh_flags & LLOG_F_IS_PLAIN))
+		CERROR("%s: wrong llog flags %x\n", imp->imp_obd->obd_name,
+		       body->lgd_llh_flags);
+
+	ptlrpc_request_set_replen(req);
+	rc = ptlrpc_queue_wait(req);
+
+	ptlrpc_req_finished(req);
+err_exit:
+	LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp);
+	RETURN(rc);
+}
+
+
+static int llog_client_next_block(const struct lu_env *env,
+				  struct llog_handle *loghandle,
+				  int *cur_idx, int next_idx,
+				  __u64 *cur_offset, void *buf, int len)
+{
+	struct obd_import     *imp;
+	struct ptlrpc_request *req = NULL;
+	struct llogd_body     *body;
+	void		  *ptr;
+	int		    rc;
+	ENTRY;
+
+	LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp);
+	req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK,
+					LUSTRE_LOG_VERSION,
+					LLOG_ORIGIN_HANDLE_NEXT_BLOCK);
+	if (req == NULL)
+		GOTO(err_exit, rc =-ENOMEM);
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	body->lgd_logid = loghandle->lgh_id;
+	body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1;
+	body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags;
+	body->lgd_index = next_idx;
+	body->lgd_saved_index = *cur_idx;
+	body->lgd_len = len;
+	body->lgd_cur_offset = *cur_offset;
+
+	req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len);
+	ptlrpc_request_set_replen(req);
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		GOTO(out, rc);
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	if (body == NULL)
+		GOTO(out, rc =-EFAULT);
+
+	/* The log records are swabbed as they are processed */
+	ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA);
+	if (ptr == NULL)
+		GOTO(out, rc =-EFAULT);
+
+	*cur_idx = body->lgd_saved_index;
+	*cur_offset = body->lgd_cur_offset;
+
+	memcpy(buf, ptr, len);
+	EXIT;
+out:
+	ptlrpc_req_finished(req);
+err_exit:
+	LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp);
+	return rc;
+}
+
+static int llog_client_prev_block(const struct lu_env *env,
+				  struct llog_handle *loghandle,
+				  int prev_idx, void *buf, int len)
+{
+	struct obd_import     *imp;
+	struct ptlrpc_request *req = NULL;
+	struct llogd_body     *body;
+	void		  *ptr;
+	int		    rc;
+	ENTRY;
+
+	LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp);
+	req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK,
+					LUSTRE_LOG_VERSION,
+					LLOG_ORIGIN_HANDLE_PREV_BLOCK);
+	if (req == NULL)
+		GOTO(err_exit, rc = -ENOMEM);
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	body->lgd_logid = loghandle->lgh_id;
+	body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1;
+	body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags;
+	body->lgd_index = prev_idx;
+	body->lgd_len = len;
+
+	req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len);
+	ptlrpc_request_set_replen(req);
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		GOTO(out, rc);
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	if (body == NULL)
+		GOTO(out, rc =-EFAULT);
+
+	ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA);
+	if (ptr == NULL)
+		GOTO(out, rc =-EFAULT);
+
+	memcpy(buf, ptr, len);
+	EXIT;
+out:
+	ptlrpc_req_finished(req);
+err_exit:
+	LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp);
+	return rc;
+}
+
+static int llog_client_read_header(const struct lu_env *env,
+				   struct llog_handle *handle)
+{
+	struct obd_import     *imp;
+	struct ptlrpc_request *req = NULL;
+	struct llogd_body     *body;
+	struct llog_log_hdr   *hdr;
+	struct llog_rec_hdr   *llh_hdr;
+	int		    rc;
+	ENTRY;
+
+	LLOG_CLIENT_ENTRY(handle->lgh_ctxt, imp);
+	req = ptlrpc_request_alloc_pack(imp,&RQF_LLOG_ORIGIN_HANDLE_READ_HEADER,
+					LUSTRE_LOG_VERSION,
+					LLOG_ORIGIN_HANDLE_READ_HEADER);
+	if (req == NULL)
+		GOTO(err_exit, rc = -ENOMEM);
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	body->lgd_logid = handle->lgh_id;
+	body->lgd_ctxt_idx = handle->lgh_ctxt->loc_idx - 1;
+	body->lgd_llh_flags = handle->lgh_hdr->llh_flags;
+
+	ptlrpc_request_set_replen(req);
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		GOTO(out, rc);
+
+	hdr = req_capsule_server_get(&req->rq_pill, &RMF_LLOG_LOG_HDR);
+	if (hdr == NULL)
+		GOTO(out, rc =-EFAULT);
+
+	memcpy(handle->lgh_hdr, hdr, sizeof (*hdr));
+	handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index;
+
+	/* sanity checks */
+	llh_hdr = &handle->lgh_hdr->llh_hdr;
+	if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) {
+		CERROR("bad log header magic: %#x (expecting %#x)\n",
+		       llh_hdr->lrh_type, LLOG_HDR_MAGIC);
+		rc = -EIO;
+	} else if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) {
+		CERROR("incorrectly sized log header: %#x "
+		       "(expecting %#x)\n",
+		       llh_hdr->lrh_len, LLOG_CHUNK_SIZE);
+		CERROR("you may need to re-run lconf --write_conf.\n");
+		rc = -EIO;
+	}
+	EXIT;
+out:
+	ptlrpc_req_finished(req);
+err_exit:
+	LLOG_CLIENT_EXIT(handle->lgh_ctxt, imp);
+	return rc;
+}
+
+static int llog_client_close(const struct lu_env *env,
+			     struct llog_handle *handle)
+{
+	/* this doesn't call LLOG_ORIGIN_HANDLE_CLOSE because
+	   the servers all close the file at the end of every
+	   other LLOG_ RPC. */
+	return(0);
+}
+
+struct llog_operations llog_client_ops = {
+	.lop_next_block		= llog_client_next_block,
+	.lop_prev_block		= llog_client_prev_block,
+	.lop_read_header	= llog_client_read_header,
+	.lop_open		= llog_client_open,
+	.lop_destroy		= llog_client_destroy,
+	.lop_close		= llog_client_close,
+};
+EXPORT_SYMBOL(llog_client_ops);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/llog_net.c b/drivers/staging/lustre/lustre/ptlrpc/llog_net.c
new file mode 100644
index 000000000000..a81f557d7794
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/llog_net.c
@@ -0,0 +1,75 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/llog_net.c
+ *
+ * OST<->MDS recovery logging infrastructure.
+ *
+ * Invariants in implementation:
+ * - we do not share logs among different OST<->MDS connections, so that
+ *   if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include <linux/list.h>
+#include <lvfs.h>
+#include <lustre_fsfilt.h>
+
+/*
+ * Make ctxt->loc_imp refer to the import of the context's client obd.
+ *
+ * The import is obtained through class_import_get(); an import that was
+ * already stored in the context is released with class_import_put() first.
+ * Always returns 0.
+ */
+int llog_initiator_connect(struct llog_ctxt *ctxt)
+{
+	struct obd_import *imp;
+	ENTRY;
+
+	LASSERT(ctxt);
+	imp = ctxt->loc_obd->u.cli.cl_import;
+	/* the context must be unbound or already bound to this very import */
+	LASSERTF(ctxt->loc_imp == NULL || ctxt->loc_imp == imp,
+		 "%p - %p\n", ctxt->loc_imp, imp);
+
+	mutex_lock(&ctxt->loc_mutex);
+	if (ctxt->loc_imp != imp) {
+		if (ctxt->loc_imp != NULL)
+			class_import_put(ctxt->loc_imp);
+		ctxt->loc_imp = class_import_get(imp);
+	}
+	mutex_unlock(&ctxt->loc_mutex);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(llog_initiator_connect);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/llog_server.c b/drivers/staging/lustre/lustre/ptlrpc/llog_server.c
new file mode 100644
index 000000000000..bc1fcd8c7e73
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/llog_server.c
@@ -0,0 +1,466 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/llog_server.c
+ *
+ * remote api for llog - server side
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include <lustre_net.h>
+#include <lustre_fsfilt.h>
+
+#if  defined(LUSTRE_LOG_SERVER)
+/*
+ * Close @lgh: handles whose header carries LLOG_F_IS_CAT go through
+ * llog_cat_close(), everything else (including a handle with no header)
+ * through llog_close().
+ */
+static int llog_origin_close(const struct lu_env *env, struct llog_handle *lgh)
+{
+	if (lgh->lgh_hdr == NULL || !(lgh->lgh_hdr->llh_flags & LLOG_F_IS_CAT))
+		return llog_close(env, lgh);
+
+	return llog_cat_close(env, lgh);
+}
+
+/*
+ * Server-side handler for a remote llog open request.
+ *
+ * Only open is supported, no new llog can be created remotely.  The log is
+ * looked up by the logid and/or name carried in the request, and the id of
+ * the opened log is written into the reply body.  The handle is closed again
+ * before returning, on success as well as on failure.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int llog_origin_handle_open(struct ptlrpc_request *req)
+{
+	struct obd_export	*exp = req->rq_export;
+	struct obd_device	*obd = exp->exp_obd;
+	struct obd_device	*disk_obd;
+	struct lvfs_run_ctxt	 saved;
+	struct llog_handle	*loghandle;
+	struct llogd_body	*body;
+	struct llog_logid	*logid = NULL;
+	struct llog_ctxt	*ctxt;
+	char			*name = NULL;
+	int			 rc;
+
+	ENTRY;
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	if (body == NULL)
+		RETURN(-EFAULT);
+	/* a logid is only handed to llog_open() when its object id is positive */
+	if (ostid_id(&body->lgd_logid.lgl_oi) > 0)
+		logid = &body->lgd_logid;
+
+	if (req_capsule_field_present(&req->rq_pill, &RMF_NAME, RCL_CLIENT)) {
+		name = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+		if (name == NULL)
+			RETURN(-EFAULT);
+		CDEBUG(D_INFO, "%s: opening log %s\n", obd->obd_name, name);
+	}
+	/* the llog context is selected by the index the client put in the body */
+	ctxt = llog_get_context(obd, body->lgd_ctxt_idx);
+	if (ctxt == NULL) {
+		CDEBUG(D_WARNING, "%s: no ctxt. group=%p idx=%d name=%s\n",
+		       obd->obd_name, &obd->obd_olg, body->lgd_ctxt_idx, name);
+		RETURN(-ENODEV);
+	}
+	disk_obd = ctxt->loc_exp->exp_obd;
+	push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+
+	rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle, logid,
+		       name, LLOG_OPEN_EXISTS);
+	if (rc)
+		GOTO(out_pop, rc);
+
+	rc = req_capsule_server_pack(&req->rq_pill);
+	if (rc)
+		GOTO(out_close, rc = -ENOMEM); /* any pack error becomes -ENOMEM */
+	/* NOTE(review): reply body is not NULL-checked; confirm pack guarantees it */
+	body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	body->lgd_logid = loghandle->lgh_id;
+	/* success falls through both labels, so the handle is always closed */
+	EXIT;
+out_close:
+	llog_origin_close(req->rq_svc_thread->t_env, loghandle);
+out_pop:
+	pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+	llog_ctxt_put(ctxt);
+	return rc;
+}
+EXPORT_SYMBOL(llog_origin_handle_open);
+
+/*
+ * Server-side handler for a remote llog destroy request: erase the log
+ * identified by the request body through llog_erase().
+ *
+ * Returns 0 on success, -EFAULT if the request body is missing, -ENODEV if
+ * no llog context matches the requested index, or the error reported by
+ * req_capsule_server_pack() / llog_erase().
+ */
+int llog_origin_handle_destroy(struct ptlrpc_request *req)
+{
+	struct llog_logid	*id = NULL;
+	struct lvfs_run_ctxt	 saved;
+	struct obd_device	*obd;
+	struct obd_device	*disk_obd;
+	struct llogd_body	*body;
+	struct llog_ctxt	*ctxt;
+	int			 rc;
+
+	ENTRY;
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	if (body == NULL)
+		RETURN(-EFAULT);
+	obd = req->rq_export->exp_obd;
+
+	/* a log id is handed to llog_erase() only if its object id is positive */
+	if (ostid_id(&body->lgd_logid.lgl_oi) > 0)
+		id = &body->lgd_logid;
+
+	/* unexpected flags are only reported; the request still proceeds */
+	if ((body->lgd_llh_flags & LLOG_F_IS_PLAIN) == 0)
+		CERROR("%s: wrong llog flags %x\n",
+		       obd->obd_name, body->lgd_llh_flags);
+
+	ctxt = llog_get_context(obd, body->lgd_ctxt_idx);
+	if (ctxt == NULL)
+		RETURN(-ENODEV);
+
+	disk_obd = ctxt->loc_exp->exp_obd;
+	push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+
+	/* erase only once the reply buffer has been packed successfully */
+	rc = req_capsule_server_pack(&req->rq_pill);
+	if (rc == 0)
+		rc = llog_erase(req->rq_svc_thread->t_env, ctxt, id, NULL);
+
+	pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+	llog_ctxt_put(ctxt);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_origin_handle_destroy);
+
+int llog_origin_handle_next_block(struct ptlrpc_request *req)
+{
+	struct obd_device   *disk_obd;
+	struct llog_handle  *loghandle;
+	struct llogd_body   *body;
+	struct llogd_body   *repbody;
+	struct lvfs_run_ctxt saved;
+	struct llog_ctxt    *ctxt;
+	__u32		flags;
+	void		*ptr;
+	int		  rc;
+	/*
+	 * Server-side handler: open the log named in the request, read its
+	 * next block into the reply's RMF_EADATA buffer and close the log
+	 * again.  Returns 0 on success or a negative errno.
+	 */
+	ENTRY;
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	if (body == NULL)
+		RETURN(-EFAULT);
+
+	ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx);
+	if (ctxt == NULL)
+		RETURN(-ENODEV);
+
+	disk_obd = ctxt->loc_exp->exp_obd;
+	push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+
+	rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle,
+		       &body->lgd_logid, NULL, LLOG_OPEN_EXISTS);
+	if (rc)
+		GOTO(out_pop, rc);
+
+	flags = body->lgd_llh_flags;
+	rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle, flags,
+			      NULL);
+	if (rc)
+		GOTO(out_close, rc);
+	/* size the reply's data field to LLOG_CHUNK_SIZE before packing */
+	req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER,
+			     LLOG_CHUNK_SIZE);
+	rc = req_capsule_server_pack(&req->rq_pill);
+	if (rc)
+		GOTO(out_close, rc = -ENOMEM); /* any pack error becomes -ENOMEM */
+	/* the reply body starts out as a copy of the request body */
+	repbody = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	*repbody = *body;
+	/* llog_next_block() is given the reply's saved-index and offset fields */
+	ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA);
+	rc = llog_next_block(req->rq_svc_thread->t_env, loghandle,
+			     &repbody->lgd_saved_index, repbody->lgd_index,
+			     &repbody->lgd_cur_offset, ptr, LLOG_CHUNK_SIZE);
+	if (rc)
+		GOTO(out_close, rc);
+	EXIT;
+out_close:
+	llog_origin_close(req->rq_svc_thread->t_env, loghandle);
+out_pop:
+	pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+	llog_ctxt_put(ctxt);
+	return rc;
+}
+EXPORT_SYMBOL(llog_origin_handle_next_block);
+
+int llog_origin_handle_prev_block(struct ptlrpc_request *req)
+{
+	struct llog_handle   *loghandle;
+	struct llogd_body    *body;
+	struct llogd_body    *repbody;
+	struct obd_device    *disk_obd;
+	struct lvfs_run_ctxt  saved;
+	struct llog_ctxt     *ctxt;
+	__u32		 flags;
+	void		 *ptr;
+	int		   rc;
+	/*
+	 * Server-side handler: open the log named in the request, read the
+	 * block selected by lgd_index through llog_prev_block() into the
+	 * reply's RMF_EADATA buffer and close the log again.  Returns 0 on
+	 * success or a negative errno.
+	 */
+	ENTRY;
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	if (body == NULL)
+		RETURN(-EFAULT);
+
+	ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx);
+	if (ctxt == NULL)
+		RETURN(-ENODEV);
+
+	disk_obd = ctxt->loc_exp->exp_obd;
+	push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+
+	rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle,
+			 &body->lgd_logid, NULL, LLOG_OPEN_EXISTS);
+	if (rc)
+		GOTO(out_pop, rc);
+
+	flags = body->lgd_llh_flags;
+	rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle, flags,
+			      NULL);
+	if (rc)
+		GOTO(out_close, rc);
+	/* size the reply's data field to LLOG_CHUNK_SIZE before packing */
+	req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER,
+			     LLOG_CHUNK_SIZE);
+	rc = req_capsule_server_pack(&req->rq_pill);
+	if (rc)
+		GOTO(out_close, rc = -ENOMEM); /* any pack error becomes -ENOMEM */
+	/* the reply body is a plain copy of the request body */
+	repbody = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	*repbody = *body;
+
+	ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA);
+	rc = llog_prev_block(req->rq_svc_thread->t_env, loghandle,
+			     body->lgd_index, ptr, LLOG_CHUNK_SIZE);
+	if (rc)
+		GOTO(out_close, rc);
+	/* success falls through both labels, so the handle is always closed */
+	EXIT;
+out_close:
+	llog_origin_close(req->rq_svc_thread->t_env, loghandle);
+out_pop:
+	pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+	llog_ctxt_put(ctxt);
+	return rc;
+}
+EXPORT_SYMBOL(llog_origin_handle_prev_block);
+
+/*
+ * Server-side llog read-header handler: open the log named in the request,
+ * load its header and copy it into the reply.
+ *
+ * Returns 0 on success, -EFAULT if the request body is missing, -ENODEV if
+ * no llog context matches the requested index, -ENOMEM if the reply cannot
+ * be packed, or the error from llog_open() / llog_init_handle().
+ */
+int llog_origin_handle_read_header(struct ptlrpc_request *req)
+{
+	struct obd_device    *disk_obd;
+	struct llog_handle   *loghandle;
+	struct llogd_body    *body;
+	struct llog_log_hdr  *hdr;
+	struct lvfs_run_ctxt  saved;
+	struct llog_ctxt     *ctxt;
+	int		   rc;
+
+	ENTRY;
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	if (body == NULL)
+		RETURN(-EFAULT);
+
+	ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx);
+	if (ctxt == NULL)
+		RETURN(-ENODEV);
+
+	disk_obd = ctxt->loc_exp->exp_obd;
+	push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+
+	rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle,
+		       &body->lgd_logid, NULL, LLOG_OPEN_EXISTS);
+	if (rc)
+		GOTO(out_pop, rc);
+
+	/*
+	 * llog_init_handle() reads the llog header; the flags come straight
+	 * from the request body.
+	 */
+	rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle,
+			      body->lgd_llh_flags, NULL);
+	if (rc)
+		GOTO(out_close, rc);
+
+	rc = req_capsule_server_pack(&req->rq_pill);
+	if (rc)
+		GOTO(out_close, rc = -ENOMEM);
+
+	hdr = req_capsule_server_get(&req->rq_pill, &RMF_LLOG_LOG_HDR);
+	*hdr = *loghandle->lgh_hdr;
+
+	/* success falls through both labels, so the handle is always closed */
+	EXIT;
+out_close:
+	llog_origin_close(req->rq_svc_thread->t_env, loghandle);
+out_pop:
+	pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+	llog_ctxt_put(ctxt);
+	return rc;
+}
+EXPORT_SYMBOL(llog_origin_handle_read_header);
+
+/* Server-side llog close handler: there is nothing to do, so it returns 0. */
+int llog_origin_handle_close(struct ptlrpc_request *req)
+{
+	ENTRY;
+	RETURN(0);
+}
+EXPORT_SYMBOL(llog_origin_handle_close);
+
+int llog_origin_handle_cancel(struct ptlrpc_request *req)
+{
+	int num_cookies, rc = 0, err, i, failed = 0;
+	struct obd_device *disk_obd;
+	struct llog_cookie *logcookies;
+	struct llog_ctxt *ctxt = NULL;
+	struct lvfs_run_ctxt saved;
+	struct llog_handle *cathandle;
+	struct inode *inode;
+	void *handle;
+	ENTRY;
+	/*
+	 * Server-side handler: cancel every llog cookie carried in the
+	 * request, each one between its own fsfilt_start_log() and
+	 * fsfilt_commit().  Returns 0 or a negative errno.
+	 */
+	logcookies = req_capsule_client_get(&req->rq_pill, &RMF_LOGCOOKIES);
+	num_cookies = req_capsule_get_size(&req->rq_pill, &RMF_LOGCOOKIES,
+					   RCL_CLIENT) / sizeof(*logcookies);
+	if (logcookies == NULL || num_cookies == 0) {
+		DEBUG_REQ(D_HA, req, "No llog cookies sent");
+		RETURN(-EFAULT);
+	}
+	/* the context is chosen from the first cookie's lgc_subsys only */
+	ctxt = llog_get_context(req->rq_export->exp_obd,
+				logcookies->lgc_subsys);
+	if (ctxt == NULL)
+		RETURN(-ENODEV);
+
+	disk_obd = ctxt->loc_exp->exp_obd;
+	push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+	for (i = 0; i < num_cookies; i++, logcookies++) {
+		cathandle = ctxt->loc_handle;
+		LASSERT(cathandle != NULL);
+		inode = cathandle->lgh_file->f_dentry->d_inode;
+
+		handle = fsfilt_start_log(disk_obd, inode,
+					  FSFILT_OP_CANCEL_UNLINK, NULL, 1);
+		if (IS_ERR(handle)) {
+			CERROR("fsfilt_start_log() failed: %ld\n",
+			       PTR_ERR(handle));
+			GOTO(pop_ctxt, rc = PTR_ERR(handle));
+		}
+		/* cookies are cancelled one at a time (count argument of 1) */
+		rc = llog_cat_cancel_records(req->rq_svc_thread->t_env,
+					     cathandle, 1, logcookies);
+
+		/*
+		 * Do not raise -ENOENT errors for resent rpcs. This rec already
+		 * might be killed.
+		 */
+		if (rc == -ENOENT &&
+		    (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT)) {
+			/*
+			 * Do not change this message, reply-single.sh test_59b
+			 * expects to find this in log.
+			 */
+			CDEBUG(D_RPCTRACE, "RESENT cancel req %p - ignored\n",
+			       req);
+			rc = 0;
+		} else if (rc == 0) {
+			CDEBUG(D_RPCTRACE, "Canceled %d llog-records\n",
+			       num_cookies);
+		}
+		/* NOTE(review): the message above logs num_cookies on every pass */
+		err = fsfilt_commit(disk_obd, inode, handle, 0);
+		if (err) {
+			CERROR("Error committing transaction: %d\n", err);
+			if (!rc)
+				rc = err;
+			failed++;
+			GOTO(pop_ctxt, rc);
+		} else if (rc)
+			failed++;
+	}
+	GOTO(pop_ctxt, rc);
+pop_ctxt:
+	pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+	if (rc)
+		CERROR("Cancel %d of %d llog-records failed: %d\n",
+		       failed, num_cookies, rc);
+	/* NOTE(review): a failed cancel followed by a good one leaves rc == 0 */
+	llog_ctxt_put(ctxt);
+	return rc;
+}
+EXPORT_SYMBOL(llog_origin_handle_cancel);
+
+#else /* !LUSTRE_LOG_SERVER */
+int llog_origin_handle_open(struct ptlrpc_request *req)
+{
+	LBUG();		/* stub compiled when LUSTRE_LOG_SERVER is not defined */
+	return 0;
+}
+
+int llog_origin_handle_destroy(struct ptlrpc_request *req)
+{
+	LBUG();		/* stub compiled when LUSTRE_LOG_SERVER is not defined */
+	return 0;
+}
+
+int llog_origin_handle_next_block(struct ptlrpc_request *req)
+{
+	LBUG();		/* stub compiled when LUSTRE_LOG_SERVER is not defined */
+	return 0;
+}
+int llog_origin_handle_prev_block(struct ptlrpc_request *req)
+{
+	LBUG();		/* stub compiled when LUSTRE_LOG_SERVER is not defined */
+	return 0;
+}
+int llog_origin_handle_read_header(struct ptlrpc_request *req)
+{
+	LBUG();		/* stub compiled when LUSTRE_LOG_SERVER is not defined */
+	return 0;
+}
+int llog_origin_handle_close(struct ptlrpc_request *req)
+{
+	LBUG();		/* stub compiled when LUSTRE_LOG_SERVER is not defined */
+	return 0;
+}
+int llog_origin_handle_cancel(struct ptlrpc_request *req)
+{
+	LBUG();		/* stub compiled when LUSTRE_LOG_SERVER is not defined */
+	return 0;
+}
+#endif
diff --git a/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c b/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c
new file mode 100644
index 000000000000..031c0f9abb82
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c
@@ -0,0 +1,1401 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+
+#include <obd_support.h>
+#include <obd.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <obd_class.h>
+#include "ptlrpc_internal.h"
+
+
+struct ll_rpc_opcode {	/* one row per RPC opcode: number and printable name */
+     __u32       opcode;
+     const char *opname;
+} ll_rpc_opcode_table[LUSTRE_MAX_OPCODES] = { /* row order must match opcode_offset(), see ll_opcode2str() */
+	{ OST_REPLY,	"ost_reply" },
+	{ OST_GETATTR,      "ost_getattr" },
+	{ OST_SETATTR,      "ost_setattr" },
+	{ OST_READ,	 "ost_read" },
+	{ OST_WRITE,	"ost_write" },
+	{ OST_CREATE ,      "ost_create" },
+	{ OST_DESTROY,      "ost_destroy" },
+	{ OST_GET_INFO,     "ost_get_info" },
+	{ OST_CONNECT,      "ost_connect" },
+	{ OST_DISCONNECT,   "ost_disconnect" },
+	{ OST_PUNCH,	"ost_punch" },
+	{ OST_OPEN,	 "ost_open" },
+	{ OST_CLOSE,	"ost_close" },
+	{ OST_STATFS,       "ost_statfs" },
+	{ 14,		NULL },    /* formerly OST_SAN_READ */
+	{ 15,		NULL },    /* formerly OST_SAN_WRITE */
+	{ OST_SYNC,	 "ost_sync" },
+	{ OST_SET_INFO,     "ost_set_info" },
+	{ OST_QUOTACHECK,   "ost_quotacheck" },
+	{ OST_QUOTACTL,     "ost_quotactl" },
+	{ OST_QUOTA_ADJUST_QUNIT, "ost_quota_adjust_qunit" },
+	{ MDS_GETATTR,      "mds_getattr" },
+	{ MDS_GETATTR_NAME, "mds_getattr_lock" },
+	{ MDS_CLOSE,	"mds_close" },
+	{ MDS_REINT,	"mds_reint" },
+	{ MDS_READPAGE,     "mds_readpage" },
+	{ MDS_CONNECT,      "mds_connect" },
+	{ MDS_DISCONNECT,   "mds_disconnect" },
+	{ MDS_GETSTATUS,    "mds_getstatus" },
+	{ MDS_STATFS,       "mds_statfs" },
+	{ MDS_PIN,	  "mds_pin" },
+	{ MDS_UNPIN,	"mds_unpin" },
+	{ MDS_SYNC,	 "mds_sync" },
+	{ MDS_DONE_WRITING, "mds_done_writing" },
+	{ MDS_SET_INFO,     "mds_set_info" },
+	{ MDS_QUOTACHECK,   "mds_quotacheck" },
+	{ MDS_QUOTACTL,     "mds_quotactl" },
+	{ MDS_GETXATTR,     "mds_getxattr" },
+	{ MDS_SETXATTR,     "mds_setxattr" },
+	{ MDS_WRITEPAGE,    "mds_writepage" },
+	{ MDS_IS_SUBDIR,    "mds_is_subdir" },
+	{ MDS_GET_INFO,     "mds_get_info" },
+	{ MDS_HSM_STATE_GET, "mds_hsm_state_get" },
+	{ MDS_HSM_STATE_SET, "mds_hsm_state_set" },
+	{ MDS_HSM_ACTION,   "mds_hsm_action" },
+	{ MDS_HSM_PROGRESS, "mds_hsm_progress" },
+	{ MDS_HSM_REQUEST,  "mds_hsm_request" },
+	{ MDS_HSM_CT_REGISTER, "mds_hsm_ct_register" },
+	{ MDS_HSM_CT_UNREGISTER, "mds_hsm_ct_unregister" },
+	{ MDS_SWAP_LAYOUTS,	"mds_swap_layouts" },
+	{ LDLM_ENQUEUE,     "ldlm_enqueue" },
+	{ LDLM_CONVERT,     "ldlm_convert" },
+	{ LDLM_CANCEL,      "ldlm_cancel" },
+	{ LDLM_BL_CALLBACK, "ldlm_bl_callback" },
+	{ LDLM_CP_CALLBACK, "ldlm_cp_callback" },
+	{ LDLM_GL_CALLBACK, "ldlm_gl_callback" },
+	{ LDLM_SET_INFO,    "ldlm_set_info" },
+	{ MGS_CONNECT,      "mgs_connect" },
+	{ MGS_DISCONNECT,   "mgs_disconnect" },
+	{ MGS_EXCEPTION,    "mgs_exception" },
+	{ MGS_TARGET_REG,   "mgs_target_reg" },
+	{ MGS_TARGET_DEL,   "mgs_target_del" },
+	{ MGS_SET_INFO,     "mgs_set_info" },
+	{ MGS_CONFIG_READ,  "mgs_config_read" },
+	{ OBD_PING,	 "obd_ping" },
+	{ OBD_LOG_CANCEL,   "llog_origin_handle_cancel" },
+	{ OBD_QC_CALLBACK,  "obd_quota_callback" },
+	{ OBD_IDX_READ,	    "dt_index_read" },
+	{ LLOG_ORIGIN_HANDLE_CREATE,     "llog_origin_handle_create" },
+	{ LLOG_ORIGIN_HANDLE_NEXT_BLOCK, "llog_origin_handle_next_block" },
+	{ LLOG_ORIGIN_HANDLE_READ_HEADER,"llog_origin_handle_read_header" },
+	{ LLOG_ORIGIN_HANDLE_WRITE_REC,  "llog_origin_handle_write_rec" },
+	{ LLOG_ORIGIN_HANDLE_CLOSE,      "llog_origin_handle_close" },
+	{ LLOG_ORIGIN_CONNECT,	   "llog_origin_connect" },
+	{ LLOG_CATINFO,		  "llog_catinfo" },
+	{ LLOG_ORIGIN_HANDLE_PREV_BLOCK, "llog_origin_handle_prev_block" },
+	{ LLOG_ORIGIN_HANDLE_DESTROY,    "llog_origin_handle_destroy" },
+	{ QUOTA_DQACQ,      "quota_acquire" },
+	{ QUOTA_DQREL,      "quota_release" },
+	{ SEQ_QUERY,	"seq_query" },
+	{ SEC_CTX_INIT,     "sec_ctx_init" },
+	{ SEC_CTX_INIT_CONT,"sec_ctx_init_cont" },
+	{ SEC_CTX_FINI,     "sec_ctx_fini" },
+	{ FLD_QUERY,	"fld_query" },
+	{ UPDATE_OBJ,	    "update_obj" },
+};
+
+struct ll_eopcode {	/* one row per "extra" opcode: number and printable name */
+     __u32       opcode;
+     const char *opname;
+} ll_eopcode_table[EXTRA_LAST_OPC] = { /* indexed directly by opcode, see ll_eopcode2str() */
+	{ LDLM_GLIMPSE_ENQUEUE, "ldlm_glimpse_enqueue" },
+	{ LDLM_PLAIN_ENQUEUE,   "ldlm_plain_enqueue" },
+	{ LDLM_EXTENT_ENQUEUE,  "ldlm_extent_enqueue" },
+	{ LDLM_FLOCK_ENQUEUE,   "ldlm_flock_enqueue" },
+	{ LDLM_IBITS_ENQUEUE,   "ldlm_ibits_enqueue" },
+	{ MDS_REINT_SETATTR,    "mds_reint_setattr" },
+	{ MDS_REINT_CREATE,     "mds_reint_create" },
+	{ MDS_REINT_LINK,       "mds_reint_link" },
+	{ MDS_REINT_UNLINK,     "mds_reint_unlink" },
+	{ MDS_REINT_RENAME,     "mds_reint_rename" },
+	{ MDS_REINT_OPEN,       "mds_reint_open" },
+	{ MDS_REINT_SETXATTR,   "mds_reint_setxattr" },
+	{ BRW_READ_BYTES,       "read_bytes" },
+	{ BRW_WRITE_BYTES,      "write_bytes" },
+};
+
+/**
+ * Maps an RPC opcode to its printable name using ll_rpc_opcode_table.
+ *
+ * \param[in] opcode The RPC opcode
+ *
+ * \retval the opname stored in the table row selected by opcode_offset()
+ */
+const char *ll_opcode2str(__u32 opcode)
+{
+	__u32 offset = opcode_offset(opcode);
+	const struct ll_rpc_opcode *entry;
+
+	/* A failed assertion below usually means that a new opcode in
+	 * include/lustre/lustre_idl.h is missing from the table above, or
+	 * that the opcode space was renumbered or rearranged and the
+	 * opcode_offset() function in ptlrpc_internal.h needs to be modified.
+	 */
+	LASSERTF(offset < LUSTRE_MAX_OPCODES,
+		 "offset %u >= LUSTRE_MAX_OPCODES %u\n",
+		 offset, LUSTRE_MAX_OPCODES);
+
+	entry = &ll_rpc_opcode_table[offset];
+	LASSERTF(entry->opcode == opcode,
+		 "ll_rpc_opcode_table[%u].opcode %u != opcode %u\n",
+		 offset, entry->opcode, opcode);
+
+	return entry->opname;
+}
+
+/**
+ * Maps an "extra" opcode (a direct index into ll_eopcode_table) to its name.
+ *
+ * \param[in] opcode The extra opcode; must be below EXTRA_LAST_OPC
+ *
+ * \retval the opname stored in the table row for \a opcode
+ */
+const char *ll_eopcode2str(__u32 opcode)
+{
+	/* Bound the index before touching the table: without this check an
+	 * out-of-range opcode would be read from beyond the end of the array.
+	 */
+	LASSERT(opcode < EXTRA_LAST_OPC);
+	LASSERT(ll_eopcode_table[opcode].opcode == opcode);
+	return ll_eopcode_table[opcode].opname;
+}
+#ifdef LPROCFS
+/**
+ * Allocates the statistics counters of a service and registers them in procfs.
+ *
+ * \param[in]  root	     parent proc directory
+ * \param[in]  dir	     optional subdirectory to create under \a root;
+ *			     when NULL the stats are registered in \a root
+ * \param[in]  name	     name of the stats entry
+ * \param[out] procroot_ret  receives the directory created for \a dir
+ * \param[out] stats_ret     receives the allocated stats
+ *
+ * Both output pointers must be NULL on entry and are only written when every
+ * step succeeded; on any failure the function returns silently and whatever
+ * it had allocated or registered so far is released again.
+ */
+void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir,
+			     char *name, struct proc_dir_entry **procroot_ret,
+			     struct lprocfs_stats **stats_ret)
+{
+	const unsigned int cfg = LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV;
+	struct proc_dir_entry *procroot = root;
+	struct lprocfs_stats *stats;
+	int idx;
+
+	LASSERT(*procroot_ret == NULL);
+	LASSERT(*stats_ret == NULL);
+
+	stats = lprocfs_alloc_stats(EXTRA_MAX_OPCODES + LUSTRE_MAX_OPCODES, 0);
+	if (stats == NULL)
+		return;
+
+	if (dir != NULL) {
+		procroot = lprocfs_register(dir, root, NULL, NULL);
+		if (IS_ERR(procroot)) {
+			lprocfs_free_stats(&stats);
+			return;
+		}
+	}
+
+	/* fixed per-service counters */
+	lprocfs_counter_init(stats, PTLRPC_REQWAIT_CNTR, cfg,
+			     "req_waittime", "usec");
+	lprocfs_counter_init(stats, PTLRPC_REQQDEPTH_CNTR, cfg,
+			     "req_qdepth", "reqs");
+	lprocfs_counter_init(stats, PTLRPC_REQACTIVE_CNTR, cfg,
+			     "req_active", "reqs");
+	lprocfs_counter_init(stats, PTLRPC_TIMEOUT, cfg,
+			     "req_timeout", "sec");
+	lprocfs_counter_init(stats, PTLRPC_REQBUF_AVAIL_CNTR, cfg,
+			     "reqbuf_avail", "bufs");
+
+	/* "extra" opcodes: the two BRW counters are in bytes, the rest in reqs */
+	for (idx = 0; idx < EXTRA_LAST_OPC; idx++) {
+		char *units = "reqs";
+
+		if (idx == BRW_WRITE_BYTES || idx == BRW_READ_BYTES)
+			units = "bytes";
+
+		lprocfs_counter_init(stats, PTLRPC_LAST_CNTR + idx, cfg,
+				     ll_eopcode2str(idx), units);
+	}
+
+	/* one counter per row of the RPC opcode table */
+	for (idx = 0; idx < LUSTRE_MAX_OPCODES; idx++)
+		lprocfs_counter_init(stats, EXTRA_MAX_OPCODES + idx, cfg,
+				     ll_opcode2str(ll_rpc_opcode_table[idx].opcode),
+				     "usec");
+
+	if (lprocfs_register_stats(procroot, name, stats) < 0) {
+		if (dir != NULL)
+			lprocfs_remove(&procroot);
+		lprocfs_free_stats(&stats);
+		return;
+	}
+
+	if (dir != NULL)
+		*procroot_ret = procroot;
+	*stats_ret = stats;
+}
+
+/**
+ * procfs read handler: prints the sum of scp_hist_nrqbds over all partitions
+ * of the service passed in \a data and flags end-of-file.
+ *
+ * \retval the value returned by snprintf()
+ */
+static int
+ptlrpc_lprocfs_read_req_history_len(char *page, char **start, off_t off,
+				    int count, int *eof, void *data)
+{
+	struct ptlrpc_service_part *part;
+	struct ptlrpc_service *svc = data;
+	int	cpt;
+	int	sum = 0;
+
+	ptlrpc_service_for_each_part(part, cpt, svc)
+		sum += part->scp_hist_nrqbds;
+
+	*eof = 1;
+	return snprintf(page, count, "%d\n", sum);
+}
+
+/**
+ * procfs read handler: prints the service's srv_hist_nrqbds_cpt_max added up
+ * once for every service partition, and flags end-of-file.
+ *
+ * \retval the value returned by snprintf()
+ */
+static int
+ptlrpc_lprocfs_read_req_history_max(char *page, char **start, off_t off,
+				    int count, int *eof, void *data)
+{
+	struct ptlrpc_service_part *part;
+	struct ptlrpc_service *svc = data;
+	int	cpt;
+	int	sum = 0;
+
+	ptlrpc_service_for_each_part(part, cpt, svc)
+		sum += svc->srv_hist_nrqbds_cpt_max;
+
+	*eof = 1;
+	return snprintf(page, count, "%d\n", sum);
+}
+
+/**
+ * procfs write handler: sets the per-partition request history limit.
+ *
+ * The written integer is divided by the number of partitions (but never
+ * rounded below 1); writing 0 stores 0.
+ *
+ * \retval \a count on success
+ * \retval -ERANGE if the value is negative or exceeds the memory-based bound
+ * \retval the error returned by lprocfs_write_helper()
+ */
+static int
+ptlrpc_lprocfs_write_req_history_max(struct file *file, const char *buffer,
+				     unsigned long count, void *data)
+{
+	struct ptlrpc_service *svc = data;
+	int pages_per_buf;
+	int limit;
+	int rc;
+
+	rc = lprocfs_write_helper(buffer, count, &limit);
+	if (rc < 0)
+		return rc;
+	if (limit < 0)
+		return -ERANGE;
+
+	/* This sanity check is more of an insanity check; we can still
+	 * hose a kernel by allowing the request history to grow too
+	 * far. */
+	pages_per_buf = (svc->srv_buf_size + PAGE_CACHE_SIZE - 1) >>
+			PAGE_CACHE_SHIFT;
+	if (limit > num_physpages/(2 * pages_per_buf))
+		return -ERANGE;
+
+	spin_lock(&svc->srv_lock);
+	if (limit != 0)
+		svc->srv_hist_nrqbds_cpt_max = max(1, (limit / svc->srv_ncpts));
+	else
+		svc->srv_hist_nrqbds_cpt_max = 0;
+	spin_unlock(&svc->srv_lock);
+
+	return count;
+}
+
+/**
+ * procfs read handler: prints srv_nthrs_cpt_init multiplied by the number of
+ * partitions of the service passed in \a data.
+ */
+static int
+ptlrpc_lprocfs_rd_threads_min(char *page, char **start, off_t off,
+			      int count, int *eof, void *data)
+{
+	struct ptlrpc_service *svc = data;
+	int nthrs = svc->srv_nthrs_cpt_init * svc->srv_ncpts;
+
+	return snprintf(page, count, "%d\n", nthrs);
+}
+
+/**
+ * procfs write handler: sets srv_nthrs_cpt_init from a service-wide total.
+ *
+ * \retval \a count on success
+ * \retval -ERANGE if the per-partition share is below PTLRPC_NTHRS_INIT or
+ *	   the total exceeds the current service-wide limit
+ * \retval the error returned by lprocfs_write_helper()
+ */
+static int
+ptlrpc_lprocfs_wr_threads_min(struct file *file, const char *buffer,
+			      unsigned long count, void *data)
+{
+	struct ptlrpc_service *svc = data;
+	int	nthrs;
+	int	rc;
+
+	rc = lprocfs_write_helper(buffer, count, &nthrs);
+	if (rc < 0)
+		return rc;
+	if (nthrs / svc->srv_ncpts < PTLRPC_NTHRS_INIT)
+		return -ERANGE;
+
+	/* the upper bound is compared and the new value stored under srv_lock */
+	rc = count;
+	spin_lock(&svc->srv_lock);
+	if (nthrs > svc->srv_nthrs_cpt_limit * svc->srv_ncpts)
+		rc = -ERANGE;
+	else
+		svc->srv_nthrs_cpt_init = nthrs / svc->srv_ncpts;
+	spin_unlock(&svc->srv_lock);
+
+	return rc;
+}
+
+/**
+ * procfs read handler: prints the sum of scp_nthrs_running over all
+ * partitions of the service passed in \a data.
+ */
+static int
+ptlrpc_lprocfs_rd_threads_started(char *page, char **start, off_t off,
+				  int count, int *eof, void *data)
+{
+	struct ptlrpc_service_part *part;
+	struct ptlrpc_service *svc = data;
+	int	cpt;
+	int	running = 0;
+
+	ptlrpc_service_for_each_part(part, cpt, svc)
+		running += part->scp_nthrs_running;
+
+	return snprintf(page, count, "%d\n", running);
+}
+
+/**
+ * procfs read handler: prints srv_nthrs_cpt_limit multiplied by the number
+ * of partitions of the service passed in \a data.
+ */
+static int
+ptlrpc_lprocfs_rd_threads_max(char *page, char **start, off_t off,
+			      int count, int *eof, void *data)
+{
+	struct ptlrpc_service *svc = data;
+	int nthrs = svc->srv_nthrs_cpt_limit * svc->srv_ncpts;
+
+	return snprintf(page, count, "%d\n", nthrs);
+}
+
+/**
+ * procfs write handler: sets srv_nthrs_cpt_limit from a service-wide total.
+ *
+ * \retval \a count on success
+ * \retval -ERANGE if the per-partition share is below PTLRPC_NTHRS_INIT or
+ *	   the total is below the current service-wide initial thread count
+ * \retval the error returned by lprocfs_write_helper()
+ */
+static int
+ptlrpc_lprocfs_wr_threads_max(struct file *file, const char *buffer,
+			      unsigned long count, void *data)
+{
+	struct ptlrpc_service *svc = data;
+	int	nthrs;
+	int	rc;
+
+	rc = lprocfs_write_helper(buffer, count, &nthrs);
+	if (rc < 0)
+		return rc;
+	if (nthrs / svc->srv_ncpts < PTLRPC_NTHRS_INIT)
+		return -ERANGE;
+
+	/* the lower bound is compared and the new value stored under srv_lock */
+	rc = count;
+	spin_lock(&svc->srv_lock);
+	if (nthrs < svc->srv_nthrs_cpt_init * svc->srv_ncpts)
+		rc = -ERANGE;
+	else
+		svc->srv_nthrs_cpt_limit = nthrs / svc->srv_ncpts;
+	spin_unlock(&svc->srv_lock);
+
+	return rc;
+}
+
+/**
+ * \addtogroup nrs
+ * @{
+ */
+extern struct nrs_core nrs_core;
+
+/**
+ * Returns the human-readable name of a \e ptlrpc_nrs_pol_state value.
+ *
+ * A value that matches none of the known states trips LBUG(); should LBUG()
+ * ever return, the result is the same as for NRS_POL_STATE_INVALID.
+ *
+ * \param[in] state The policy state
+ */
+static const char *nrs_state2str(enum ptlrpc_nrs_pol_state state)
+{
+	switch (state) {
+	case NRS_POL_STATE_STARTED:
+		return "started";
+	case NRS_POL_STATE_STARTING:
+		return "starting";
+	case NRS_POL_STATE_STOPPING:
+		return "stopping";
+	case NRS_POL_STATE_STOPPED:
+		return "stopped";
+	default:
+		LBUG();
+		/* fall through */
+	case NRS_POL_STATE_INVALID:
+		return "invalid";
+	}
+}
+
+/**
+ * Fills \a info with a snapshot of the status of \a policy.
+ *
+ * The caller must hold the nrs_lock of the NRS head the policy belongs to;
+ * this is asserted.
+ *
+ * \param[in] policy The policy
+ * \param[out] info  Holds returned status information
+ */
+void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy,
+				struct ptlrpc_nrs_pol_info *info)
+{
+	LASSERT(policy != NULL);
+	LASSERT(info != NULL);
+	LASSERT(spin_is_locked(&policy->pol_nrs->nrs_lock));
+
+	info->pi_state	  = policy->pol_state;
+	info->pi_fallback = (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) != 0;
+	memcpy(info->pi_name, policy->pol_desc->pd_name, NRS_POL_NAME_MAX);
+
+	/**
+	 * XXX: The two request counters below are read without holding
+	 * ptlrpc_service_part::scp_req_lock.
+	 */
+	info->pi_req_started = policy->pol_req_started;
+	info->pi_req_queued  = policy->pol_req_queued;
+}
+
+/**
+ * Reads and prints policy status information for all policies of a PTLRPC
+ * service.
+ *
+ * \param[out] page   buffer the YAML text is written to
+ * \param[in]  start  not used
+ * \param[in]  off    not used
+ * \param[in]  count  size of \a page in bytes
+ * \param[out] eof    set to 1 once the whole report has been produced
+ * \param[in]  data   the ptlrpc_service to report on
+ *
+ * \retval number of bytes written to \a page on success
+ * \retval -ENOMEM if the scratch array of policy infos cannot be allocated
+ * \retval -EFBIG  if the report does not fit into \a count bytes
+ */
+static int ptlrpc_lprocfs_rd_nrs(char *page, char **start, off_t off,
+				 int count, int *eof, void *data)
+{
+	struct ptlrpc_service	       *svc = data;
+	struct ptlrpc_service_part     *svcpt;
+	struct ptlrpc_nrs	       *nrs;
+	struct ptlrpc_nrs_policy       *policy;
+	struct ptlrpc_nrs_pol_info     *infos;
+	struct ptlrpc_nrs_pol_info	tmp;
+	unsigned			num_pols;
+	unsigned			pol_idx = 0;
+	bool				hp = false;
+	int				i;
+	int				rc = 0;
+	int				rc2 = 0;
+	ENTRY;
+
+	/**
+	 * Serialize NRS core lprocfs operations with policy registration/
+	 * unregistration.
+	 */
+	mutex_lock(&nrs_core.nrs_mutex);
+
+	/**
+	 * Use the first service partition's regular NRS head in order to obtain
+	 * the number of policies registered with NRS heads of this service. All
+	 * service partitions will have the same number of policies.
+	 */
+	nrs = nrs_svcpt2nrs(svc->srv_parts[0], false);
+
+	spin_lock(&nrs->nrs_lock);
+	num_pols = svc->srv_parts[0]->scp_nrs_reg.nrs_num_pols;
+	spin_unlock(&nrs->nrs_lock);
+
+	OBD_ALLOC(infos, num_pols * sizeof(*infos));
+	if (infos == NULL)
+		GOTO(out, rc = -ENOMEM);
+again:
+	/* one pass per kind of NRS head: regular first, then HP if the service has one */
+	ptlrpc_service_for_each_part(svcpt, i, svc) {
+		nrs = nrs_svcpt2nrs(svcpt, hp);
+		spin_lock(&nrs->nrs_lock);
+
+		pol_idx = 0;
+
+		list_for_each_entry(policy, &nrs->nrs_policy_list,
+					pol_list) {
+			LASSERT(pol_idx < num_pols);
+
+			nrs_policy_get_info_locked(policy, &tmp);
+			/**
+			 * Copy values when handling the first service
+			 * partition.
+			 */
+			if (i == 0) {
+				memcpy(infos[pol_idx].pi_name, tmp.pi_name,
+				       NRS_POL_NAME_MAX);
+				memcpy(&infos[pol_idx].pi_state, &tmp.pi_state,
+				       sizeof(tmp.pi_state));
+				infos[pol_idx].pi_fallback = tmp.pi_fallback;
+				/**
+				 * For the rest of the service partitions
+				 * sanity-check the values we get.
+				 */
+			} else {
+				LASSERT(strncmp(infos[pol_idx].pi_name,
+						tmp.pi_name,
+						NRS_POL_NAME_MAX) == 0);
+				/**
+				 * Not asserting ptlrpc_nrs_pol_info::pi_state,
+				 * because it may be different between
+				 * instances of the same policy in different
+				 * service partitions.
+				 */
+				LASSERT(infos[pol_idx].pi_fallback ==
+					tmp.pi_fallback);
+			}
+			/* the request counters are summed over all partitions */
+			infos[pol_idx].pi_req_queued += tmp.pi_req_queued;
+			infos[pol_idx].pi_req_started += tmp.pi_req_started;
+
+			pol_idx++;
+		}
+		spin_unlock(&nrs->nrs_lock);
+	}
+
+	/**
+	 * Policy status information output is in YAML format.
+	 * For example:
+	 *
+	 *	regular_requests:
+	 *	  - name: fifo
+	 *	    state: started
+	 *	    fallback: yes
+	 *	    queued: 0
+	 *	    active: 0
+	 *
+	 *	  - name: crrn
+	 *	    state: started
+	 *	    fallback: no
+	 *	    queued: 2015
+	 *	    active: 384
+	 *
+	 *	high_priority_requests:
+	 *	  - name: fifo
+	 *	    state: started
+	 *	    fallback: yes
+	 *	    queued: 0
+	 *	    active: 2
+	 *
+	 *	  - name: crrn
+	 *	    state: stopped
+	 *	    fallback: no
+	 *	    queued: 0
+	 *	    active: 0
+	 */
+	rc2 = snprintf(page + rc, count - rc,
+		       "%s\n", !hp ?
+		       "\nregular_requests:" :
+		       "high_priority_requests:");
+
+	if (rc2 >= count - rc) {
+		/** Output was truncated */
+		GOTO(out, rc = -EFBIG);
+	}
+
+	rc += rc2;
+
+	for (pol_idx = 0; pol_idx < num_pols; pol_idx++) {
+		rc2 = snprintf(page + rc, count - rc,
+			       "  - name: %s\n"
+			       "    state: %s\n"
+			       "    fallback: %s\n"
+			       "    queued: %-20d\n"
+			       "    active: %-20d\n\n",
+			       infos[pol_idx].pi_name,
+			       nrs_state2str(infos[pol_idx].pi_state),
+			       infos[pol_idx].pi_fallback ? "yes" : "no",
+			       (int)infos[pol_idx].pi_req_queued,
+			       (int)infos[pol_idx].pi_req_started);
+
+
+		if (rc2 >= count - rc) {
+			/** Output was truncated */
+			GOTO(out, rc = -EFBIG);
+		}
+
+		rc += rc2;
+	}
+
+	if (!hp && nrs_svc_has_hp(svc)) {
+		memset(infos, 0, num_pols * sizeof(*infos));
+
+		/**
+		 * Redo the processing for the service's HP NRS heads' policies.
+		 */
+		hp = true;
+		goto again;
+	}
+
+	*eof = 1;
+
+out:
+	if (infos)
+		OBD_FREE(infos, num_pols * sizeof(*infos));
+
+	mutex_unlock(&nrs_core.nrs_mutex);
+
+	RETURN(rc);
+}
+
+/**
+ * The longest valid command string is the maximum policy name size, plus the
+ * length of the " reg" suffix (sizeof counts its NUL, hence the "- 1").
+ */
+#define LPROCFS_NRS_WR_MAX_CMD	(NRS_POL_NAME_MAX + sizeof(" reg") - 1)
+
+/**
+ * Starts and stops a given policy on a PTLRPC service.
+ *
+ * Commands consist of the policy name, followed by an optional [reg|hp] token;
+ * if the optional token is omitted, the operation is performed on both the
+ * regular and high-priority (if the service has one) NRS head.
+ */
+static int ptlrpc_lprocfs_wr_nrs(struct file *file, const char *buffer,
+				 unsigned long count, void *data)
+{
+	enum ptlrpc_nrs_queue_type	 which = PTLRPC_NRS_QUEUE_BOTH;
+	struct ptlrpc_service		*svc = data;
+	char				*kbuf = NULL;	/* start of allocation */
+	char				*rest;		/* strsep() cursor */
+	char				*pol_name;
+	int				 rc = 0;
+	ENTRY;
+
+	/* Reject anything that cannot fit in kbuf together with its NUL. */
+	if (count >= LPROCFS_NRS_WR_MAX_CMD)
+		GOTO(out, rc = -EINVAL);
+
+	OBD_ALLOC(kbuf, LPROCFS_NRS_WR_MAX_CMD);
+	if (kbuf == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	if (copy_from_user(kbuf, buffer, count))
+		GOTO(out, rc = -EFAULT);
+
+	kbuf[count] = '\0';
+
+	/**
+	 * strsep() advances its argument, so tokenize through a separate
+	 * cursor and keep kbuf pointing at the allocation for OBD_FREE().
+	 */
+	rest = kbuf;
+	pol_name = strsep(&rest, " ");
+
+	if (strlen(pol_name) > NRS_POL_NAME_MAX - 1)
+		GOTO(out, rc = -EINVAL);
+
+	/**
+	 * A non-NULL remainder must be exactly "reg" or "hp"; with no
+	 * remainder the default of both queues is kept.
+	 */
+	if (rest != NULL) {
+		if (strcmp(rest, "reg") == 0)
+			which = PTLRPC_NRS_QUEUE_REG;
+		else if (strcmp(rest, "hp") == 0)
+			which = PTLRPC_NRS_QUEUE_HP;
+		else
+			GOTO(out, rc = -EINVAL);
+	}
+
+	if (which == PTLRPC_NRS_QUEUE_HP && !nrs_svc_has_hp(svc))
+		GOTO(out, rc = -ENODEV);
+
+	/* Without an HP head, "both" degenerates to the regular head only. */
+	if (which == PTLRPC_NRS_QUEUE_BOTH && !nrs_svc_has_hp(svc))
+		which = PTLRPC_NRS_QUEUE_REG;
+
+	/**
+	 * Serialize NRS core lprocfs operations with policy registration/
+	 * unregistration.
+	 */
+	mutex_lock(&nrs_core.nrs_mutex);
+	rc = ptlrpc_nrs_policy_control(svc, which, pol_name,
+				       PTLRPC_NRS_CTL_START, false, NULL);
+	mutex_unlock(&nrs_core.nrs_mutex);
+
+out:
+	if (kbuf != NULL)
+		OBD_FREE(kbuf, LPROCFS_NRS_WR_MAX_CMD);
+
+	if (rc < 0)
+		RETURN(rc);
+	RETURN(count);
+}
+
+/** @} nrs */
+
+struct ptlrpc_srh_iterator {	/* seq_file cursor over request history */
+	int			srhi_idx;	/* index into srv_parts[] */
+	__u64			srhi_seq;	/* history seq of srhi_req */
+	struct ptlrpc_request	*srhi_req;	/* current request, or NULL */
+};
+
+int
+ptlrpc_lprocfs_svc_req_history_seek(struct ptlrpc_service_part *svcpt,
+				    struct ptlrpc_srh_iterator *srhi,
+				    __u64 seq)
+{	/* 0 with srhi updated, else -ENOENT; callers here hold scp_lock */
+	struct list_head		*e;
+	struct ptlrpc_request	*req;
+
+	if (srhi->srhi_req != NULL &&
+	    srhi->srhi_seq > svcpt->scp_hist_seq_culled &&
+	    srhi->srhi_seq <= seq) {
+		/* If srhi_req was set previously, hasn't been culled and
+		 * we're searching for a seq on or after it (i.e. more
+		 * recent), search from it onwards.
+		 * Since the service history is LRU (i.e. culled reqs will
+		 * be near the head), we shouldn't have to do long
+		 * re-scans */
+		LASSERTF(srhi->srhi_seq == srhi->srhi_req->rq_history_seq,
+			 "%s:%d: seek seq "LPU64", request seq "LPU64"\n",
+			 svcpt->scp_service->srv_name, svcpt->scp_cpt,
+			 srhi->srhi_seq, srhi->srhi_req->rq_history_seq);
+		LASSERTF(!list_empty(&svcpt->scp_hist_reqs),
+			 "%s:%d: seek offset "LPU64", request seq "LPU64", "
+			 "last culled "LPU64"\n",
+			 svcpt->scp_service->srv_name, svcpt->scp_cpt,
+			 seq, srhi->srhi_seq, svcpt->scp_hist_seq_culled);
+		e = &srhi->srhi_req->rq_history_list;
+	} else {
+		/* search from start */
+		e = svcpt->scp_hist_reqs.next;
+	}
+
+	while (e != &svcpt->scp_hist_reqs) {	/* until back at list head */
+		req = list_entry(e, struct ptlrpc_request, rq_history_list);
+
+		if (req->rq_history_seq >= seq) { /* first entry at/after seq */
+			srhi->srhi_seq = req->rq_history_seq;
+			srhi->srhi_req = req;
+			return 0;
+		}
+		e = e->next;
+	}
+
+	return -ENOENT;	/* no request at or after seq on this partition */
+}
+
+/*
+ * The ptlrpc history sequence is used as the "position" of the seq_file.
+ * In some cases seq_read() increments the position to move to the next
+ * element. However, the low bits of a history sequence are reserved for
+ * the CPT id (see the comments before ptlrpc_req_add_history), so such an
+ * increment could change the CPT id and the requests of a CPT might never
+ * be read to the end. To avoid this, the position keeps the CPT id in the
+ * high bits and the timestamp in the low bits, so that seq_read() only
+ * ever increases the timestamp, which correctly indicates the next position.
+ */
+
+/* convert seq_file pos to cpt */
+#define PTLRPC_REQ_POS2CPT(svc, pos)			\
+	((svc)->srv_cpt_bits == 0 ? 0 :			\
+	 (__u64)(pos) >> (64 - (svc)->srv_cpt_bits))
+
+/* make up seq_file pos from cpt; cpt is widened so the shift is 64-bit */
+#define PTLRPC_REQ_CPT2POS(svc, cpt)			\
+	((svc)->srv_cpt_bits == 0 ? 0 :			\
+	 (__u64)(cpt) << (64 - (svc)->srv_cpt_bits))
+
+/* convert sequence to position: rotate right by srv_cpt_bits */
+#define PTLRPC_REQ_SEQ2POS(svc, seq)			\
+	((svc)->srv_cpt_bits == 0 ? (seq) :		\
+	 ((__u64)(seq) >> (svc)->srv_cpt_bits) |	\
+	 ((__u64)(seq) << (64 - (svc)->srv_cpt_bits)))
+
+/* convert position to sequence: rotate left by srv_cpt_bits */
+#define PTLRPC_REQ_POS2SEQ(svc, pos)			\
+	((svc)->srv_cpt_bits == 0 ? (pos) :		\
+	 ((__u64)(pos) << (svc)->srv_cpt_bits) |	\
+	 ((__u64)(pos) >> (64 - (svc)->srv_cpt_bits)))
+
+static void *
+ptlrpc_lprocfs_svc_req_history_start(struct seq_file *s, loff_t *pos)
+{	/* seq_file ->start(): allocate a cursor and seek it to *pos */
+	struct ptlrpc_service		*svc = s->private;
+	struct ptlrpc_service_part	*svcpt;
+	struct ptlrpc_srh_iterator	*srhi;
+	unsigned int			cpt;
+	int				rc;
+	int				i;
+
+	if (sizeof(loff_t) != sizeof(__u64)) { /* can't support */
+		CWARN("Failed to read request history because size of loff_t "
+		      "%d can't match size of u64\n", (int)sizeof(loff_t));
+		return NULL;
+	}
+
+	OBD_ALLOC(srhi, sizeof(*srhi));
+	if (srhi == NULL)
+		return NULL;
+
+	srhi->srhi_seq = 0;
+	srhi->srhi_req = NULL;
+
+	cpt = PTLRPC_REQ_POS2CPT(svc, *pos);	/* partition encoded in *pos */
+
+	ptlrpc_service_for_each_part(svcpt, i, svc) {
+		if (i < cpt) /* skip */
+			continue;
+		if (i > cpt) /* make up the lowest position for this CPT */
+			*pos = PTLRPC_REQ_CPT2POS(svc, i);
+
+		spin_lock(&svcpt->scp_lock);
+		rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi,
+				PTLRPC_REQ_POS2SEQ(svc, *pos));
+		spin_unlock(&svcpt->scp_lock);
+		if (rc == 0) {	/* found: report its position and partition */
+			*pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq);
+			srhi->srhi_idx = i;
+			return srhi;
+		}
+	}
+
+	OBD_FREE(srhi, sizeof(*srhi));	/* nothing at or after *pos */
+	return NULL;
+}
+
+static void ptlrpc_lprocfs_svc_req_history_stop(struct seq_file *s, void *iter)
+{
+	struct ptlrpc_srh_iterator *cursor = iter;
+
+	if (cursor == NULL)	/* nothing to free */
+		return;
+	OBD_FREE(cursor, sizeof(*cursor));
+}
+
+static void *
+ptlrpc_lprocfs_svc_req_history_next(struct seq_file *s,
+				    void *iter, loff_t *pos)
+{	/* seq_file ->next(): move the cursor to the following request */
+	struct ptlrpc_service		*svc = s->private;
+	struct ptlrpc_srh_iterator	*srhi = iter;
+	struct ptlrpc_service_part	*svcpt;
+	__u64				seq;
+	int				rc;
+	int				i;
+
+	for (i = srhi->srhi_idx; i < svc->srv_ncpts; i++) {
+		svcpt = svc->srv_parts[i];
+
+		if (i > srhi->srhi_idx) { /* reset iterator for a new CPT */
+			srhi->srhi_req = NULL;
+			seq = srhi->srhi_seq = 0;
+		} else { /* the next sequence */
+			seq = srhi->srhi_seq + (1 << svc->srv_cpt_bits);
+		}
+
+		spin_lock(&svcpt->scp_lock);
+		rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, seq);
+		spin_unlock(&svcpt->scp_lock);
+		if (rc == 0) {	/* found: report its position and partition */
+			*pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq);
+			srhi->srhi_idx = i;
+			return srhi;
+		}
+	}
+
+	OBD_FREE(srhi, sizeof(*srhi));	/* no more requests: cursor freed */
+	return NULL;
+}
+
+/* common ost/mdt so_req_printer */
+void target_print_req(void *seq_file, struct ptlrpc_request *req)
+{
+	struct seq_file *m = seq_file;
+
+	/* Per the original authors this runs with srv_lock held and irqs
+	 * disabled. It prints request-specific contents and a newline.
+	 * The request may contain arbitrary data, so check what is valid
+	 * for its phase before printing, as carefully as the service's
+	 * own request parser would. */
+	switch (req->rq_phase) {
+	case RQ_PHASE_INTERPRET:
+	case RQ_PHASE_COMPLETE:
+		/* INTERPRET: being handled, basic msg swabbed and opc valid,
+		 * but racing with mds_handle(). COMPLETE: handled already,
+		 * reply state possibly still volatile. */
+		seq_printf(m, "opc %d\n", lustre_msg_get_opc(req->rq_reqmsg));
+		break;
+	case RQ_PHASE_NEW:
+		/* still waiting for a service thread, or rejected because
+		 * the generic request message didn't unpack */
+		seq_printf(m, "<not swabbed>\n");
+		break;
+	default:
+		DEBUG_REQ(D_ERROR, req, "bad phase %d", req->rq_phase);
+		break;
+	}
+}
+EXPORT_SYMBOL(target_print_req);
+
+static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter)
+{	/* seq_file ->show(): re-seek under scp_lock, then print one request */
+	struct ptlrpc_service		*svc = s->private;
+	struct ptlrpc_srh_iterator	*srhi = iter;
+	struct ptlrpc_service_part	*svcpt;
+	struct ptlrpc_request		*req;
+	int				rc;
+
+	LASSERT(srhi->srhi_idx < svc->srv_ncpts);
+
+	svcpt = svc->srv_parts[srhi->srhi_idx];
+
+	spin_lock(&svcpt->scp_lock);
+
+	rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, srhi->srhi_seq);
+
+	if (rc == 0) {
+		req = srhi->srhi_req;
+
+		/* Print common req fields.
+		 * CAVEAT EMPTOR: we're racing with the service handler
+		 * here.  The request could contain any old crap, so you
+		 * must be just as careful as the service's request
+		 * parser. Currently I only print stuff here I know is OK
+		 * to look at coz it was set up in request_in_callback()!!! */
+		seq_printf(s, LPD64":%s:%s:x"LPU64":%d:%s:%ld:%lds(%+lds) ",
+			   req->rq_history_seq, libcfs_nid2str(req->rq_self),
+			   libcfs_id2str(req->rq_peer), req->rq_xid,
+			   req->rq_reqlen, ptlrpc_rqphase2str(req),
+			   req->rq_arrival_time.tv_sec,
+			   req->rq_sent - req->rq_arrival_time.tv_sec,
+			   req->rq_sent - req->rq_deadline);
+		if (svc->srv_ops.so_req_printer == NULL)
+			seq_printf(s, "\n");	/* no printer: just end the line */
+		else
+			svc->srv_ops.so_req_printer(s, srhi->srhi_req);
+	}
+
+	spin_unlock(&svcpt->scp_lock);
+	return rc;	/* 0, or -ENOENT when the seek found nothing */
+}
+
+static int
+ptlrpc_lprocfs_svc_req_history_open(struct inode *inode, struct file *file)
+{
+	/* seq_file iterator callbacks for the request history */
+	static struct seq_operations history_sops = {
+		.start = ptlrpc_lprocfs_svc_req_history_start,
+		.next  = ptlrpc_lprocfs_svc_req_history_next,
+		.show  = ptlrpc_lprocfs_svc_req_history_show,
+		.stop  = ptlrpc_lprocfs_svc_req_history_stop,
+	};
+	struct proc_dir_entry *pde = PDE(inode);
+	int rc;
+
+	LPROCFS_ENTRY_AND_CHECK(pde);
+	rc = seq_open(file, &history_sops);
+	if (rc != 0) {
+		LPROCFS_EXIT();
+		return rc;
+	}
+
+	/* hand the entry's data (the service) to the seq_file callbacks */
+	((struct seq_file *)file->private_data)->private = pde->data;
+	return 0;
+}
+
+/* See also lprocfs_rd_timeouts */
+static int ptlrpc_lprocfs_rd_timeouts(char *page, char **start, off_t off,
+				      int count, int *eof, void *data)
+{	/* proc read: one adaptive-timeout line per service partition */
+	struct ptlrpc_service		*svc = data;
+	struct ptlrpc_service_part	*svcpt;
+	struct dhms			ts;
+	time_t				worstt;
+	unsigned int			cur;
+	unsigned int			worst;
+	int				nob = 0;
+	int				rc = 0;
+	int				i;
+
+	if (AT_OFF) {	/* adaptive timeouts off: only report obd_timeout */
+		rc += snprintf(page + rc, count - rc,
+			       "adaptive timeouts off, using obd_timeout %u\n",
+			       obd_timeout);
+		return rc;
+	}
+
+	ptlrpc_service_for_each_part(svcpt, i, svc) {
+		cur	= at_get(&svcpt->scp_at_estimate);
+		worst	= svcpt->scp_at_estimate.at_worst_ever;
+		worstt	= svcpt->scp_at_estimate.at_worst_time;
+		s2dhms(&ts, cfs_time_current_sec() - worstt);
+
+		nob = snprintf(page, count,
+			       "%10s : cur %3u  worst %3u (at %ld, "
+			       DHMS_FMT" ago) ", "service",
+			       cur, worst, worstt, DHMS_VARS(&ts));
+
+		nob = lprocfs_at_hist_helper(page, count, nob,
+					     &svcpt->scp_at_estimate);
+		rc += nob;	/* total bytes produced so far */
+		page += nob;	/* next partition appends after this one */
+		count -= nob;
+
+		/*
+		 * NB: for lustre proc read, the read count must be less
+		 * than PAGE_SIZE; see lprocfs_fops_read for details.
+		 * It's unlikely that we exceed PAGE_SIZE here because
+		 * that would mean the service has more than 50 partitions.
+		 */
+		if (count <= 0) {
+			CWARN("Can't fit AT information of %s in one page, "
+			      "please contact with developer to fix this.\n",
+			      svc->srv_name);
+			break;
+		}
+	}
+
+	return rc;
+}
+
+static int ptlrpc_lprocfs_rd_hp_ratio(char *page, char **start, off_t off,
+				      int count, int *eof, void *data)
+{
+	const struct ptlrpc_service *service = data;
+	/* Format srv_hpreq_ratio into page; no trailing newline is added. */
+	return snprintf(page, count, "%d", service->srv_hpreq_ratio);
+}
+
+static int ptlrpc_lprocfs_wr_hp_ratio(struct file *file, const char *buffer,
+				      unsigned long count, void *data)
+{
+	struct ptlrpc_service	*service = data;
+	int			 ratio;
+	int			 rc;
+
+	/* Parse an integer from the user buffer; negative ratios are invalid */
+	rc = lprocfs_write_helper(buffer, count, &ratio);
+	if (rc < 0)
+		return rc;
+	if (ratio < 0)
+		return -ERANGE;
+
+	spin_lock(&service->srv_lock);
+	service->srv_hpreq_ratio = ratio;
+	spin_unlock(&service->srv_lock);
+
+	return count;
+}
+
+void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry,
+				     struct ptlrpc_service *svc)
+{	/* registers "stats", the variables below and "req_history" for svc */
+	struct lprocfs_vars lproc_vars[] = {	/* every entry carries svc */
+		{.name       = "high_priority_ratio",
+		 .read_fptr  = ptlrpc_lprocfs_rd_hp_ratio,
+		 .write_fptr = ptlrpc_lprocfs_wr_hp_ratio,
+		 .data       = svc},
+		{.name       = "req_buffer_history_len",
+		 .read_fptr  = ptlrpc_lprocfs_read_req_history_len,
+		 .data       = svc},
+		{.name       = "req_buffer_history_max",
+		 .write_fptr = ptlrpc_lprocfs_write_req_history_max,
+		 .read_fptr  = ptlrpc_lprocfs_read_req_history_max,
+		 .data       = svc},
+		{.name       = "threads_min",
+		 .read_fptr  = ptlrpc_lprocfs_rd_threads_min,
+		 .write_fptr = ptlrpc_lprocfs_wr_threads_min,
+		 .data       = svc},
+		{.name       = "threads_max",
+		 .read_fptr  = ptlrpc_lprocfs_rd_threads_max,
+		 .write_fptr = ptlrpc_lprocfs_wr_threads_max,
+		 .data       = svc},
+		{.name       = "threads_started",
+		 .read_fptr  = ptlrpc_lprocfs_rd_threads_started,
+		 .data       = svc},
+		{.name       = "timeouts",
+		 .read_fptr  = ptlrpc_lprocfs_rd_timeouts,
+		 .data       = svc},
+		{.name       = "nrs_policies",
+		 .read_fptr  = ptlrpc_lprocfs_rd_nrs,
+		 .write_fptr = ptlrpc_lprocfs_wr_nrs,
+		 .data	     = svc},
+		{NULL}	/* last entry has a NULL name */
+	};
+	static struct file_operations req_history_fops = {
+		.owner       = THIS_MODULE,
+		.open	= ptlrpc_lprocfs_svc_req_history_open,
+		.read	= seq_read,
+		.llseek      = seq_lseek,
+		.release     = lprocfs_seq_release,
+	};
+
+	int rc;
+
+	ptlrpc_lprocfs_register(entry, svc->srv_name,
+				"stats", &svc->srv_procroot,
+				&svc->srv_stats);
+
+	if (svc->srv_procroot == NULL)	/* no proc root to attach to */
+		return;
+
+	lprocfs_add_vars(svc->srv_procroot, lproc_vars, NULL);
+
+	rc = lprocfs_seq_create(svc->srv_procroot, "req_history",
+				0400, &req_history_fops, svc);
+	if (rc)	/* only warned about; the other entries stay registered */
+		CWARN("Error adding the req_history file\n");
+}
+
+void ptlrpc_lprocfs_register_obd(struct obd_device *obddev)
+{
+	/* Register "stats" under the device's proc entry; no dir name given */
+	ptlrpc_lprocfs_register(obddev->obd_proc_entry, NULL, "stats",
+			&obddev->obd_svc_procroot, &obddev->obd_svc_stats);
+}
+EXPORT_SYMBOL(ptlrpc_lprocfs_register_obd);
+
+void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount)
+{
+	__u32 opcode = lustre_msg_get_opc(req->rq_reqmsg);
+	int slot = opcode_offset(opcode);
+	struct lprocfs_stats *stats = req->rq_import->imp_obd->obd_svc_stats;
+
+	if (stats == NULL || slot <= 0)
+		return;
+	LASSERT(slot < LUSTRE_MAX_OPCODES);
+	if (opcode == LDLM_ENQUEUE || opcode == MDS_REINT)
+		return;	/* these two opcodes are not counted here */
+	lprocfs_counter_add(stats, slot + EXTRA_MAX_OPCODES, amount);
+}
+
+void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes)
+{
+	struct lprocfs_stats *stats;
+	int counter;
+
+	/* Requests without an import, or devices without stats, are skipped */
+	if (req->rq_import == NULL)
+		return;
+
+	stats = req->rq_import->imp_obd->obd_svc_stats;
+	if (stats == NULL)
+		return;
+
+	/* counter first holds the opcode, then the byte-counter index */
+	counter = lustre_msg_get_opc(req->rq_reqmsg);
+	if (counter == OST_READ) {
+		counter = BRW_READ_BYTES + PTLRPC_LAST_CNTR;
+	} else if (counter == OST_WRITE) {
+		counter = BRW_WRITE_BYTES + PTLRPC_LAST_CNTR;
+	} else {
+		LASSERTF(0, "unsupported opcode %u\n", counter);
+	}
+
+	lprocfs_counter_add(stats, counter, bytes);
+}
+
+EXPORT_SYMBOL(ptlrpc_lprocfs_brw);
+
+void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc)
+{
+	if (svc->srv_procroot)	/* proc root, if one was registered */
+		lprocfs_remove(&svc->srv_procroot);
+
+	if (svc->srv_stats != NULL)	/* stats, if they were allocated */
+		lprocfs_free_stats(&svc->srv_stats);
+}
+
+void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd)
+{
+	if (obd->obd_svc_procroot != NULL)	/* proc root, if registered */
+		lprocfs_remove(&obd->obd_svc_procroot);
+
+	if (obd->obd_svc_stats != NULL)		/* stats, if allocated */
+		lprocfs_free_stats(&obd->obd_svc_stats);
+}
+EXPORT_SYMBOL(ptlrpc_lprocfs_unregister_obd);
+
+
+#define BUFLEN (UUID_MAX + 5)	/* presumably room for a "uuid:" prefix */
+
+int lprocfs_wr_evict_client(struct file *file, const char *buffer,
+			    unsigned long count, void *data)
+{	/* evicts the export named by "nid:<nid>", "uuid:<uuid>" or "<uuid>" */
+	struct obd_device *obd = data;
+	char	      *kbuf;
+	char	      *tmpbuf;
+
+	OBD_ALLOC(kbuf, BUFLEN);
+	if (kbuf == NULL)
+		return -ENOMEM;
+
+	/*
+	 * OBD_ALLOC() will zero kbuf, but we only copy BUFLEN - 1
+	 * bytes into kbuf, to ensure that the string is NUL-terminated.
+	 * UUID_MAX should include a trailing NUL already.
+	 */
+	if (copy_from_user(kbuf, buffer,
+			       min_t(unsigned long, BUFLEN - 1, count))) {
+		count = -EFAULT;	/* becomes the int return value */
+		goto out;
+	}
+	tmpbuf = cfs_firststr(kbuf, min_t(unsigned long, BUFLEN - 1, count));
+	/* Kludge code (deadlock situation): the lprocfs lock has been held
+	 * since the client is evicted by writing the client's
+	 * uuid/nid to the procfs "evict_client" entry. However,
+	 * obd_export_evict_by_uuid() will call lprocfs_remove() to destroy
+	 * the proc entries under the export{} being destroyed, so I have
+	 * to drop the lock first here.
+	 * - jay, jxiong@clusterfs.com */
+	LPROCFS_EXIT();
+	class_incref(obd, __FUNCTION__, current);
+
+	if (strncmp(tmpbuf, "nid:", 4) == 0)
+		obd_export_evict_by_nid(obd, tmpbuf + 4);
+	else if (strncmp(tmpbuf, "uuid:", 5) == 0)
+		obd_export_evict_by_uuid(obd, tmpbuf + 5);
+	else	/* no prefix: the whole string is taken as a uuid */
+		obd_export_evict_by_uuid(obd, tmpbuf);
+
+	class_decref(obd, __FUNCTION__, current);
+	LPROCFS_ENTRY();	/* pairs with the LPROCFS_EXIT() above */
+
+out:
+	OBD_FREE(kbuf, BUFLEN);
+	return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_evict_client);
+
+#undef BUFLEN
+
+int lprocfs_wr_ping(struct file *file, const char *buffer,
+		    unsigned long count, void *data)
+{
+	struct obd_device	*obd = data;
+	struct ptlrpc_request	*ping;
+	int			 status;
+	ENTRY;
+
+	/* Prepare the ping inside the CLIMP_CHECK/CLIMP_EXIT bracket */
+	LPROCFS_CLIMP_CHECK(obd);
+	ping = ptlrpc_prep_ping(obd->u.cli.cl_import);
+	LPROCFS_CLIMP_EXIT(obd);
+	if (ping == NULL)
+		RETURN(-ENOMEM);
+
+	/* Queue it and wait, then finish the request whatever the result */
+	ping->rq_send_state = LUSTRE_IMP_FULL;
+	status = ptlrpc_queue_wait(ping);
+	ptlrpc_req_finished(ping);
+	if (status < 0)
+		RETURN(status);
+	RETURN(count);
+}
+EXPORT_SYMBOL(lprocfs_wr_ping);
+
+/* Write "connection=<uuid>[::<instance>]" to this file to try to connect to
+ * that node; the connection UUID is a node's primary NID. For example,
+ * "echo connection=192.168.0.1@tcp0::instance > .../import". Returns count,
+ * or -errno. A given instance matching the connected one skips recovery. */
+int lprocfs_wr_import(struct file *file, const char *buffer,
+		      unsigned long count, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_import *imp = obd->u.cli.cl_import;
+	const unsigned long kbuf_size = count + 1; /* fixed alloc/free size */
+	char *kbuf = NULL;
+	char *uuid, *ptr;
+	int do_reconn = 1;
+	const char prefix[] = "connection=";
+	const int prefix_len = sizeof(prefix) - 1;
+
+	if (count > PAGE_CACHE_SIZE - 1 || count <= prefix_len)
+		return -EINVAL;
+
+	OBD_ALLOC(kbuf, kbuf_size);
+	if (kbuf == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(kbuf, buffer, count))
+		GOTO(out, count = -EFAULT);
+
+	kbuf[count] = 0;
+
+	/* only support connection=uuid::instance now */
+	if (strncmp(prefix, kbuf, prefix_len) != 0)
+		GOTO(out, count = -EINVAL);
+
+	uuid = kbuf + prefix_len;
+	ptr = strstr(uuid, "::");
+	if (ptr) {
+		__u32 inst;
+		char *endptr;
+
+		*ptr = 0;	/* terminate the uuid at the "::" */
+		do_reconn = 0;	/* re-enabled below if the instance differs */
+		ptr += strlen("::");
+		inst = simple_strtol(ptr, &endptr, 10);
+		if (*endptr) {
+			CERROR("config: wrong instance # %s\n", ptr);
+		} else if (inst != imp->imp_connect_data.ocd_instance) {
+			CDEBUG(D_INFO, "IR: %s is connecting to an obsoleted "
+			       "target(%u/%u), reconnecting...\n",
+			       imp->imp_obd->obd_name,
+			       imp->imp_connect_data.ocd_instance, inst);
+			do_reconn = 1;
+		} else {
+			CDEBUG(D_INFO, "IR: %s has already been connecting to "
+			       "new target(%u)\n",
+			       imp->imp_obd->obd_name, inst);
+		}
+	}
+
+	if (do_reconn)
+		ptlrpc_recover_import(imp, uuid, 1);
+
+out:
+	OBD_FREE(kbuf, kbuf_size);	/* count may hold -errno here */
+	return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_import);
+
+int lprocfs_rd_pinger_recov(char *page, char **start, off_t off,
+			    int count, int *eof, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_import *import = obd->u.cli.cl_import;
+	int len;
+
+	/* The file shows the negation of imp_no_pinger_recover as 0 or 1 */
+	LPROCFS_CLIMP_CHECK(obd);
+	len = snprintf(page, count, "%d\n", !import->imp_no_pinger_recover);
+	LPROCFS_CLIMP_EXIT(obd);
+	return len;
+}
+EXPORT_SYMBOL(lprocfs_rd_pinger_recov);
+
+int lprocfs_wr_pinger_recov(struct file *file, const char *buffer,
+		      unsigned long count, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_import *import = obd->u.cli.cl_import;
+	int enable;
+	int rc;
+
+	/* Accept only the integers 0 and 1 */
+	rc = lprocfs_write_helper(buffer, count, &enable);
+	if (rc < 0)
+		return rc;
+	if (enable < 0 || enable > 1)
+		return -ERANGE;
+
+	/* The stored flag is the negation of the value written */
+	LPROCFS_CLIMP_CHECK(obd);
+	spin_lock(&import->imp_lock);
+	import->imp_no_pinger_recover = !enable;
+	spin_unlock(&import->imp_lock);
+	LPROCFS_CLIMP_EXIT(obd);
+
+	return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_pinger_recov);
+
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/niobuf.c b/drivers/staging/lustre/lustre/ptlrpc/niobuf.c
new file mode 100644
index 000000000000..de3f0db0ba47
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/niobuf.c
@@ -0,0 +1,728 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <obd_support.h>
+#include <lustre_net.h>
+#include <lustre_lib.h>
+#include <obd.h>
+#include <obd_class.h>
+#include "ptlrpc_internal.h"
+
+/**
+ * Helper function. Binds an MD over \a len bytes at \a base and puts it to
+ * portal \a portal of \a conn's peer, passing \a xid and \a offset to LNetPut.
+ * Returns -ENOMEM if the MD cannot be bound; 0 otherwise, even if LNetPut
+ * fails (the unlink event then completes the send like any failed one). */
+static int ptl_send_buf (lnet_handle_md_t *mdh, void *base, int len,
+			 lnet_ack_req_t ack, struct ptlrpc_cb_id *cbid,
+			 struct ptlrpc_connection *conn, int portal, __u64 xid,
+			 unsigned int offset)
+{
+	int	      rc;
+	lnet_md_t	 md;
+	ENTRY;
+
+	LASSERT (portal != 0);
+	LASSERT (conn != NULL);
+	CDEBUG (D_INFO, "conn=%p id %s\n", conn, libcfs_id2str(conn->c_peer));
+	md.start     = base;
+	md.length    = len;
+	md.threshold = (ack == LNET_ACK_REQ) ? 2 : 1; /* presumably SEND+ACK */
+	md.options   = PTLRPC_MD_OPTIONS;
+	md.user_ptr  = cbid;
+	md.eq_handle = ptlrpc_eq_h;
+
+	if (unlikely(ack == LNET_ACK_REQ &&
+		     OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_ACK, OBD_FAIL_ONCE))){
+		/* don't ask for the ack to simulate failing client */
+		ack = LNET_NOACK_REQ;
+	}
+
+	rc = LNetMDBind (md, LNET_UNLINK, mdh);
+	if (unlikely(rc != 0)) {
+		CERROR ("LNetMDBind failed: %d\n", rc);
+		LASSERT (rc == -ENOMEM);	/* the only failure expected */
+		RETURN (-ENOMEM);
+	}
+
+	CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64", offset %u\n",
+	       len, portal, xid, offset);
+
+	rc = LNetPut (conn->c_self, *mdh, ack,
+		      conn->c_peer, portal, xid, offset, 0);
+	if (unlikely(rc != 0)) {
+		int rc2;
+		/* We're going to get an UNLINK event when I unlink below,
+		 * which will complete just like any other failed send, so
+		 * I fall through and return success here! */
+		CERROR("LNetPut(%s, %d, "LPD64") failed: %d\n",
+		       libcfs_id2str(conn->c_peer), portal, xid, rc);
+		rc2 = LNetMDUnlink(*mdh);
+		LASSERTF(rc2 == 0, "rc2 = %d\n", rc2);
+	}
+
+	RETURN (0);
+}
+
+static void mdunlink_iterate_helper(lnet_handle_md_t *bd_mds, int count)
+{
+	lnet_handle_md_t *mdh = bd_mds;
+
+	while (count-- > 0)	/* unlink each MD; results are ignored */
+		LNetMDUnlink(*mdh++);
+}
+
+
+/**
+ * Register bulk at the sender for later transfer: one ME/MD per bulk chunk.
+ * Returns 0 on success, or -ENOMEM (also stored in rq_status) on failure.
+ */
+int ptlrpc_register_bulk(struct ptlrpc_request *req)
+{
+	struct ptlrpc_bulk_desc *desc = req->rq_bulk;
+	lnet_process_id_t peer;
+	int rc = 0;
+	int rc2;
+	int posted_md;
+	int total_md;
+	__u64 xid;
+	lnet_handle_me_t  me_h;
+	lnet_md_t	 md;
+	ENTRY;
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_GET_NET))
+		RETURN(0);
+
+	/* NB no locking required until desc is on the network */
+	LASSERT(desc->bd_nob > 0);
+	LASSERT(desc->bd_md_count == 0);
+	LASSERT(desc->bd_md_max_brw <= PTLRPC_BULK_OPS_COUNT);
+	LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES);
+	LASSERT(desc->bd_req != NULL);
+	LASSERT(desc->bd_type == BULK_PUT_SINK ||
+		desc->bd_type == BULK_GET_SOURCE);
+
+	/* cleanup the state of the bulk for it will be reused */
+	if (req->rq_resend || req->rq_send_state == LUSTRE_IMP_REPLAY)
+		desc->bd_nob_transferred = 0;
+	else
+		LASSERT(desc->bd_nob_transferred == 0);
+
+	desc->bd_failure = 0;
+	/* This path relies on bd_import (not bd_export); messages do too. */
+	peer = desc->bd_import->imp_connection->c_peer;
+
+	LASSERT(desc->bd_cbid.cbid_fn == client_bulk_callback);
+	LASSERT(desc->bd_cbid.cbid_arg == desc);
+
+	/* An XID is only used for a single request from the client.
+	 * For retried bulk transfers, a new XID will be allocated in
+	 * in ptlrpc_check_set() if it needs to be resent, so it is not
+	 * using the same RDMA match bits after an error.
+	 *
+	 * For multi-bulk RPCs, rq_xid is the last XID needed for bulks. The
+	 * first bulk XID is power-of-two aligned before rq_xid. LU-1431 */
+	xid = req->rq_xid & ~((__u64)desc->bd_md_max_brw - 1);
+	LASSERTF(!(desc->bd_registered &&
+		   req->rq_send_state != LUSTRE_IMP_REPLAY) ||
+		 xid != desc->bd_last_xid,
+		 "registered: %d  rq_xid: "LPU64" bd_last_xid: "LPU64"\n",
+		 desc->bd_registered, xid, desc->bd_last_xid);
+
+	total_md = (desc->bd_iov_count + LNET_MAX_IOV - 1) / LNET_MAX_IOV;
+	desc->bd_registered = 1;
+	desc->bd_last_xid = xid;
+	desc->bd_md_count = total_md;
+	md.user_ptr = &desc->bd_cbid;
+	md.eq_handle = ptlrpc_eq_h;
+	md.threshold = 1;		       /* PUT or GET */
+
+	for (posted_md = 0; posted_md < total_md; posted_md++, xid++) {
+		md.options = PTLRPC_MD_OPTIONS |
+			     ((desc->bd_type == BULK_GET_SOURCE) ?
+			      LNET_MD_OP_GET : LNET_MD_OP_PUT);
+		ptlrpc_fill_bulk_md(&md, desc, posted_md);
+
+		rc = LNetMEAttach(desc->bd_portal, peer, xid, 0,
+				  LNET_UNLINK, LNET_INS_AFTER, &me_h);
+		if (rc != 0) {
+			CERROR("%s: LNetMEAttach failed x"LPU64"/%d: rc = %d\n",
+			       desc->bd_import->imp_obd->obd_name, xid,
+			       posted_md, rc);
+			break;
+		}
+
+		/* About to let the network at it... */
+		rc = LNetMDAttach(me_h, md, LNET_UNLINK,
+				  &desc->bd_mds[posted_md]);
+		if (rc != 0) {
+			CERROR("%s: LNetMDAttach failed x"LPU64"/%d: rc = %d\n",
+			       desc->bd_import->imp_obd->obd_name, xid,
+			       posted_md, rc);
+			rc2 = LNetMEUnlink(me_h);
+			LASSERT(rc2 == 0);
+			break;
+		}
+	}
+
+	if (rc != 0) {
+		LASSERT(rc == -ENOMEM);
+		spin_lock(&desc->bd_lock);
+		desc->bd_md_count -= total_md - posted_md; /* never posted */
+		spin_unlock(&desc->bd_lock);
+		LASSERT(desc->bd_md_count >= 0);
+		mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw);
+		req->rq_status = -ENOMEM;
+		RETURN(-ENOMEM);
+	}
+
+	/* Set rq_xid to matchbits of the final bulk so that server can
+	 * infer the number of bulks that were prepared */
+	req->rq_xid = --xid;
+	LASSERTF(desc->bd_last_xid == (req->rq_xid & PTLRPC_BULK_OPS_MASK),
+		 "bd_last_xid = x"LPU64", rq_xid = x"LPU64"\n",
+		 desc->bd_last_xid, req->rq_xid);
+
+	spin_lock(&desc->bd_lock);
+	/* Holler if peer manages to touch buffers before he knows the xid */
+	if (desc->bd_md_count != total_md)
+		CWARN("%s: Peer %s touched %d buffers while I registered\n",
+		      desc->bd_import->imp_obd->obd_name, libcfs_id2str(peer),
+		      total_md - desc->bd_md_count);
+	spin_unlock(&desc->bd_lock);
+
+	CDEBUG(D_NET, "Setup %u bulk %s buffers: %u pages %u bytes, "
+	       "xid x"LPX64"-"LPX64", portal %u\n", desc->bd_md_count,
+	       desc->bd_type == BULK_GET_SOURCE ? "get-source" : "put-sink",
+	       desc->bd_iov_count, desc->bd_nob,
+	       desc->bd_last_xid, req->rq_xid, desc->bd_portal);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_register_bulk);
+
+/**
+ * Disconnect a bulk desc from the network. Idempotent. Not
+ * thread-safe (i.e. only interlocks with completion callback).
+ * Returns 1 once the bulk is no longer active. Returns 0 only when \a async
+ * is set and the unlink has not completed yet; otherwise waits for it.
+ */
+int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async)
+{
+	struct ptlrpc_bulk_desc *desc = req->rq_bulk;
+	wait_queue_head_t	     *wq;
+	struct l_wait_info       lwi;
+	int		      rc;
+	ENTRY;
+
+	LASSERT(!in_interrupt());     /* might sleep */
+
+	/* Let's setup deadline for reply unlink. */
+	if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) &&
+	    async && req->rq_bulk_deadline == 0)
+		req->rq_bulk_deadline = cfs_time_current_sec() + LONG_UNLINK;
+
+	if (ptlrpc_client_bulk_active(req) == 0)	/* completed or */
+		RETURN(1);				/* never registered */
+
+	LASSERT(desc->bd_req == req);  /* bd_req NULL until registered */
+
+	/* the unlink ensures the callback happens ASAP and is the last
+	 * one.  If it fails, it must be because completion just happened,
+	 * but we must still l_wait_event() in this case to give liblustre
+	 * a chance to run client_bulk_callback() */
+	mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw);
+
+	if (ptlrpc_client_bulk_active(req) == 0)	/* completed or */
+		RETURN(1);				/* never registered */
+
+	/* Move to "Unregistering" phase as bulk was not unlinked yet. */
+	ptlrpc_rqphase_move(req, RQ_PHASE_UNREGISTERING);
+
+	/* Do not wait for unlink to finish. */
+	if (async)
+		RETURN(0);
+
+	if (req->rq_set != NULL)	/* wait on the set's queue if in a set */
+		wq = &req->rq_set->set_waitq;
+	else
+		wq = &req->rq_reply_waitq;
+
+	for (;;) {
+		/* Network access will complete in finite time but the HUGE
+		 * timeout lets us CWARN for visibility of sluggish NALs */
+		lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK),
+					   cfs_time_seconds(1), NULL, NULL);
+		rc = l_wait_event(*wq, !ptlrpc_client_bulk_active(req), &lwi);
+		if (rc == 0) {
+			ptlrpc_rqphase_move(req, req->rq_next_phase);
+			RETURN(1);
+		}
+
+		LASSERT(rc == -ETIMEDOUT);
+		DEBUG_REQ(D_WARNING, req, "Unexpectedly long timeout: desc %p",
+			  desc);
+	}
+	RETURN(0);	/* not reached: the loop only exits via RETURN(1) */
+}
+EXPORT_SYMBOL(ptlrpc_unregister_bulk);
+
+/* Fill in the adaptive-timeout (AT) fields of the reply to \a req. */
+static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags)
+{
+	struct ptlrpc_service_part *part = req->rq_rqbd->rqbd_svcpt;
+	struct ptlrpc_service *service = part->scp_service;
+	int elapsed = max_t(int, cfs_time_current_sec() -
+			    req->rq_arrival_time.tv_sec, 1);
+	int recovery_flags = MSG_RESENT | MSG_REPLAY |
+			     MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE;
+
+	/* Early replies, errors and recovery requests are kept out of the
+	 * service time estimate. */
+	if (!(flags & PTLRPC_REPLY_EARLY) && req->rq_type != PTL_RPC_MSG_ERR &&
+	    req->rq_reqmsg != NULL &&
+	    !(lustre_msg_get_flags(req->rq_reqmsg) & recovery_flags)) {
+		int previous = at_measured(&part->scp_at_estimate, elapsed);
+
+		if (previous != 0) {
+			DEBUG_REQ(D_ADAPTTO, req,
+				  "svc %s changed estimate from %d to %d",
+				  service->srv_name, previous,
+				  at_get(&part->scp_at_estimate));
+		}
+	}
+	/* The client uses the actual service time for its latency calc. */
+	lustre_msg_set_service_time(req->rq_repmsg, elapsed);
+
+	/* Advertise the estimate for future requests; an error reply sent
+	 * during recovery carries 0, to be ignored by the client (bz15815). */
+	if (req->rq_type != PTL_RPC_MSG_ERR || (req->rq_export != NULL &&
+	    !req->rq_export->exp_obd->obd_recovering))
+		lustre_msg_set_timeout(req->rq_repmsg,
+				       at_get(&part->scp_at_estimate));
+	else
+		lustre_msg_set_timeout(req->rq_repmsg, 0);
+
+	if (req->rq_reqmsg != NULL &&
+	    !(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) {
+		CDEBUG(D_ADAPTTO, "No early reply support: flags=%#x "
+		       "req_flags=%#x magic=%d:%x/%x len=%d\n",
+		       flags, lustre_msg_get_flags(req->rq_reqmsg),
+		       lustre_msg_is_v1(req->rq_reqmsg),
+		       lustre_msg_get_magic(req->rq_reqmsg),
+		       lustre_msg_get_magic(req->rq_repmsg), req->rq_replen);
+	}
+}
+
+/**
+ * Send the reply held in the reply state of request \a req to its peer.
+ * \a flags defines reply types (see the PTLRPC_REPLY_* checks below).
+ * Returns 0 on success or error code (-ENOTCONN without a connection).
+ */
+int ptlrpc_send_reply(struct ptlrpc_request *req, int flags)
+{
+	struct ptlrpc_reply_state *rs = req->rq_reply_state;
+	struct ptlrpc_connection  *conn;
+	int			rc;
+
+	/* We must already have a reply buffer (only ptlrpc_error() may be
+	 * called without one). The reply generated by sptlrpc layer (e.g.
+	 * error notify, etc.) might have NULL rq->reqmsg; otherwise we must
+	 * have a request buffer which is either the actual (swabbed) incoming
+	 * request, or a saved copy if this is a req saved in
+	 * target_queue_final_reply().
+	 */
+	LASSERT (req->rq_no_reply == 0);
+	LASSERT (req->rq_reqbuf != NULL);
+	LASSERT (rs != NULL);
+	LASSERT ((flags & PTLRPC_REPLY_MAYBE_DIFFICULT) || !rs->rs_difficult);
+	LASSERT (req->rq_repmsg != NULL);
+	LASSERT (req->rq_repmsg == rs->rs_msg);
+	LASSERT (rs->rs_cb_id.cbid_fn == reply_out_callback);
+	LASSERT (rs->rs_cb_id.cbid_arg == rs);
+
+	/* There may be no rq_export during failover */
+
+	if (unlikely(req->rq_export && req->rq_export->exp_obd &&
+		     req->rq_export->exp_obd->obd_fail)) {
+		/* Failed obd's only send ENODEV */
+		req->rq_type = PTL_RPC_MSG_ERR;
+		req->rq_status = -ENODEV;
+		CDEBUG(D_HA, "sending ENODEV from failed obd %d\n",
+		       req->rq_export->exp_obd->obd_minor);
+	}
+
+	/* In order to keep interoperability with clients (< 2.3) which
+	 * don't have pb_jobid in ptlrpc_body, we have to shrink the
+	 * ptlrpc_body in the reply buffer to ptlrpc_body_v2; otherwise the
+	 * reply buffer on the client would overflow.
+	 *
+	 * XXX Remove this whenever we drop interoperability with such clients.
+	 */
+	req->rq_replen = lustre_shrink_msg(req->rq_repmsg, 0,
+					   sizeof(struct ptlrpc_body_v2), 1);
+
+	if (req->rq_type != PTL_RPC_MSG_ERR)
+		req->rq_type = PTL_RPC_MSG_REPLY;
+
+	lustre_msg_set_type(req->rq_repmsg, req->rq_type);
+	lustre_msg_set_status(req->rq_repmsg, req->rq_status);
+	lustre_msg_set_opc(req->rq_repmsg,
+		req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : 0);
+
+	target_pack_pool_reply(req);
+
+	ptlrpc_at_set_reply(req, flags);
+
+	if (req->rq_export == NULL || req->rq_export->exp_connection == NULL)
+		conn = ptlrpc_connection_get(req->rq_peer, req->rq_self, NULL);
+	else
+		conn = ptlrpc_connection_addref(req->rq_export->exp_connection);
+
+	if (unlikely(conn == NULL)) {
+		CERROR("not replying on NULL connection\n"); /* bug 9635 */
+		return -ENOTCONN;
+	}
+	ptlrpc_rs_addref(rs);		   /* +1 ref for the network */
+
+	rc = sptlrpc_svc_wrap_reply(req);
+	if (unlikely(rc))
+		goto out;
+
+	req->rq_sent = cfs_time_current_sec();
+
+	rc = ptl_send_buf (&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len,
+			   (rs->rs_difficult && !rs->rs_no_ack) ?
+			   LNET_ACK_REQ : LNET_NOACK_REQ,
+			   &rs->rs_cb_id, conn,
+			   ptlrpc_req2svc(req)->srv_rep_portal,
+			   req->rq_xid, req->rq_reply_off);
+out:
+	if (unlikely(rc != 0))
+		ptlrpc_req_drop_rs(req);
+	ptlrpc_connection_put(conn);
+	return rc;
+}
+EXPORT_SYMBOL(ptlrpc_send_reply);
+
+/**
+ * Reply to \a req via ptlrpc_send_reply(), or return 0 if rq_no_reply is set.
+ */
+int ptlrpc_reply(struct ptlrpc_request *req)
+{
+	return req->rq_no_reply ? 0 : ptlrpc_send_reply(req, 0);
+}
+EXPORT_SYMBOL(ptlrpc_reply);
+
+/**
+ * Send an error reply for request \a req, packing an empty reply buffer
+ * first if the request has none yet. Nothing is sent if rq_no_reply is set.
+ */
+int ptlrpc_send_error(struct ptlrpc_request *req, int may_be_difficult)
+{
+	int status, rc;
+	ENTRY;
+
+	if (req->rq_no_reply)
+		RETURN(0);
+
+	if (req->rq_repmsg == NULL) {
+		rc = lustre_pack_reply(req, 1, NULL, NULL);
+		if (rc != 0)
+			RETURN(rc);
+	}
+
+	/* Statuses listed here are sent with the reply type left as it is. */
+	status = req->rq_status;
+	if (status != -ENOSPC && status != -EACCES && status != -EPERM &&
+	    status != -ENOENT && status != -EINPROGRESS && status != -EDQUOT)
+		req->rq_type = PTL_RPC_MSG_ERR;
+
+	RETURN(ptlrpc_send_reply(req, may_be_difficult));
+}
+EXPORT_SYMBOL(ptlrpc_send_error);
+
+int ptlrpc_error(struct ptlrpc_request *req)
+{
+	return ptlrpc_send_error(req, 0);	/* 0: may_be_difficult unset */
+}
+EXPORT_SYMBOL(ptlrpc_error);
+
+/**
+ * Send request \a request over the connection of its import.
+ * If \a noreply is set, don't expect any reply back and don't set up
+ * reply buffers. Returns 0 on success or error code; -ENODEV if the
+ * import's obd is marked obd_fail.
+ */
+int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
+{
+	int rc;
+	int rc2;
+	int mpflag = 0;
+	struct ptlrpc_connection *connection;
+	lnet_handle_me_t  reply_me_h;
+	lnet_md_t	 reply_md;
+	struct obd_device *obd = request->rq_import->imp_obd;
+	ENTRY;
+	/* fail-injection check (going by its name): drop the RPC, report 0 */
+	if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC))
+		RETURN(0);
+
+	LASSERT(request->rq_type == PTL_RPC_MSG_REQUEST);
+	LASSERT(request->rq_wait_ctx == 0);
+
+	/* If this is a re-transmit, we're required to have disengaged
+	 * cleanly from the previous attempt */
+	LASSERT(!request->rq_receiving_reply);
+
+	if (request->rq_import->imp_obd &&
+	    request->rq_import->imp_obd->obd_fail) {
+		CDEBUG(D_HA, "muting rpc for failed imp obd %s\n",
+		       request->rq_import->imp_obd->obd_name);
+		/* this prevents us from waiting in ptlrpc_queue_wait */
+		request->rq_err = 1;
+		request->rq_status = -ENODEV;
+		RETURN(-ENODEV);
+	}
+
+	connection = request->rq_import->imp_connection;
+
+	lustre_msg_set_handle(request->rq_reqmsg,
+			      &request->rq_import->imp_remote_handle);
+	lustre_msg_set_type(request->rq_reqmsg, PTL_RPC_MSG_REQUEST);
+	lustre_msg_set_conn_cnt(request->rq_reqmsg,
+				request->rq_import->imp_conn_cnt);
+	lustre_msghdr_set_flags(request->rq_reqmsg,
+				request->rq_import->imp_msghdr_flags);
+
+	if (request->rq_resend)
+		lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT);
+
+	if (request->rq_memalloc)
+		mpflag = cfs_memory_pressure_get_and_set();
+
+	rc = sptlrpc_cli_wrap_request(request);
+	if (rc)
+		GOTO(out, rc);
+
+	/* bulk register should be done after wrap_request() */
+	if (request->rq_bulk != NULL) {
+		rc = ptlrpc_register_bulk (request);
+		if (rc != 0)
+			GOTO(out, rc);
+	}
+
+	if (!noreply) {
+		LASSERT (request->rq_replen != 0);
+		if (request->rq_repbuf == NULL) {
+			LASSERT(request->rq_repdata == NULL);
+			LASSERT(request->rq_repmsg == NULL);
+			rc = sptlrpc_cli_alloc_repbuf(request,
+						      request->rq_replen);
+			if (rc) {
+				/* this prevents us from looping in
+				 * ptlrpc_queue_wait */
+				request->rq_err = 1;
+				request->rq_status = rc;
+				GOTO(cleanup_bulk, rc);
+			}
+		} else {
+			request->rq_repdata = NULL;
+			request->rq_repmsg = NULL;
+		}
+
+		rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/
+				  connection->c_peer, request->rq_xid, 0,
+				  LNET_UNLINK, LNET_INS_AFTER, &reply_me_h);
+		if (rc != 0) {
+			CERROR("LNetMEAttach failed: %d\n", rc);
+			LASSERT (rc == -ENOMEM);
+			GOTO(cleanup_bulk, rc = -ENOMEM);
+		}
+	}
+
+	spin_lock(&request->rq_lock);
+	/* If the MD attach succeeds, there _will_ be a reply_in callback */
+	request->rq_receiving_reply = !noreply;
+	/* We are responsible for unlinking the reply buffer */
+	request->rq_must_unlink = !noreply;
+	/* Clear any flags that may be present from previous sends. */
+	request->rq_replied = 0;
+	request->rq_err = 0;
+	request->rq_timedout = 0;
+	request->rq_net_err = 0;
+	request->rq_resend = 0;
+	request->rq_restart = 0;
+	request->rq_reply_truncate = 0;
+	spin_unlock(&request->rq_lock);
+
+	if (!noreply) {
+		reply_md.start     = request->rq_repbuf;
+		reply_md.length    = request->rq_repbuf_len;
+		/* Allow multiple early replies */
+		reply_md.threshold = LNET_MD_THRESH_INF;
+		/* Manage remote for early replies */
+		reply_md.options   = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT |
+			LNET_MD_MANAGE_REMOTE |
+			LNET_MD_TRUNCATE; /* allow to make EOVERFLOW error */;
+		reply_md.user_ptr  = &request->rq_reply_cbid;
+		reply_md.eq_handle = ptlrpc_eq_h;
+
+		/* We must see the unlink callback to unset rq_must_unlink,
+		   so we can't auto-unlink */
+		rc = LNetMDAttach(reply_me_h, reply_md, LNET_RETAIN,
+				  &request->rq_reply_md_h);
+		if (rc != 0) {
+			CERROR("LNetMDAttach failed: %d\n", rc);
+			LASSERT (rc == -ENOMEM);
+			spin_lock(&request->rq_lock);
+			/* ...but the MD attach didn't succeed... */
+			request->rq_receiving_reply = 0;
+			spin_unlock(&request->rq_lock);
+			GOTO(cleanup_me, rc = -ENOMEM);
+		}
+
+		CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid "LPU64
+		       ", portal %u\n",
+		       request->rq_repbuf_len, request->rq_xid,
+		       request->rq_reply_portal);
+	}
+
+	/* add references on request for request_out_callback */
+	ptlrpc_request_addref(request);
+	if (obd->obd_svc_stats != NULL)
+		lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQACTIVE_CNTR,
+			atomic_read(&request->rq_import->imp_inflight));
+
+	OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5);
+
+	do_gettimeofday(&request->rq_arrival_time);
+	request->rq_sent = cfs_time_current_sec();
+	/* We give the server rq_timeout secs to process the req, and
+	   add the network latency for our local timeout. */
+	request->rq_deadline = request->rq_sent + request->rq_timeout +
+		ptlrpc_at_get_net_latency(request);
+
+	ptlrpc_pinger_sending_on_import(request->rq_import);
+
+	DEBUG_REQ(D_INFO, request, "send flg=%x",
+		  lustre_msg_get_flags(request->rq_reqmsg));
+	rc = ptl_send_buf(&request->rq_req_md_h,
+			  request->rq_reqbuf, request->rq_reqdata_len,
+			  LNET_NOACK_REQ, &request->rq_req_cbid,
+			  connection,
+			  request->rq_request_portal,
+			  request->rq_xid, 0);
+	if (rc == 0)
+		GOTO(out, rc);
+
+	ptlrpc_req_finished(request);
+	if (noreply)
+		GOTO(out, rc);
+
+ cleanup_me:
+	/* MEUnlink is safe; the PUT didn't even get off the ground, and
+	 * nobody apart from the PUT's target has the right nid+XID to
+	 * access the reply buffer. */
+	rc2 = LNetMEUnlink(reply_me_h);
+	LASSERT (rc2 == 0);
+	/* UNLINKED callback called synchronously */
+	LASSERT(!request->rq_receiving_reply);
+
+ cleanup_bulk:
+	/* A synchronous unlink is fine here: no real transfer took place,
+	 * so the chance of a long unlink on a sluggish net is smaller. */
+	ptlrpc_unregister_bulk(request, 0);
+ out:
+	if (request->rq_memalloc)
+		cfs_memory_pressure_restore(mpflag);
+	return rc;
+}
+EXPORT_SYMBOL(ptl_send_rpc);
+
+/**
+ * Post request buffer descriptor \a rqbd on its service's request portal for
+ * request receiving. Returns 0 on success and -ENOMEM on any failure.
+ */
+int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd)
+{
+	static lnet_process_id_t any_peer = {LNET_NID_ANY, LNET_PID_ANY};
+	struct ptlrpc_service *svc = rqbd->rqbd_svcpt->scp_service;
+	lnet_handle_me_t me_handle;
+	lnet_md_t req_md;
+	int rc;
+
+	CDEBUG(D_NET, "LNetMEAttach: portal %d\n", svc->srv_req_portal);
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_RQBD))
+		return -ENOMEM;
+
+	/* NB: a CPT affinity service should use the new LNet flag
+	 * LNET_INS_LOCAL, which means the buffer can only be attached on the
+	 * local CPT, and LND threads can find it by grabbing a local lock */
+	rc = LNetMEAttach(svc->srv_req_portal, any_peer, 0, ~0, LNET_UNLINK,
+			  rqbd->rqbd_svcpt->scp_cpt >= 0 ?
+			  LNET_INS_LOCAL : LNET_INS_AFTER, &me_handle);
+	if (rc != 0) {
+		CERROR("LNetMEAttach failed: %d\n", rc);
+		return -ENOMEM;
+	}
+
+	LASSERT(rqbd->rqbd_refcount == 0);
+	rqbd->rqbd_refcount = 1;
+
+	req_md.start = rqbd->rqbd_buffer;
+	req_md.length = svc->srv_buf_size;
+	req_md.max_size = svc->srv_max_req_size;
+	req_md.threshold = LNET_MD_THRESH_INF;
+	req_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | LNET_MD_MAX_SIZE;
+	req_md.user_ptr = &rqbd->rqbd_cbid;
+	req_md.eq_handle = ptlrpc_eq_h;
+
+	rc = LNetMDAttach(me_handle, req_md, LNET_UNLINK, &rqbd->rqbd_md_h);
+	if (rc != 0) {
+		/* Undo the ME attach and the reference count set above. */
+		CERROR("LNetMDAttach failed: %d; \n", rc);
+		LASSERT(rc == -ENOMEM);
+		rc = LNetMEUnlink(me_handle);
+		LASSERT(rc == 0);
+		rqbd->rqbd_refcount = 0;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/nrs.c b/drivers/staging/lustre/lustre/ptlrpc/nrs.c
new file mode 100644
index 000000000000..1996431e35ff
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/nrs.c
@@ -0,0 +1,1790 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ *
+ * Copyright 2012 Xyratex Technology Limited
+ */
+/*
+ * lustre/ptlrpc/nrs.c
+ *
+ * Network Request Scheduler (NRS)
+ *
+ * Allows the handling of RPCs at servers to be reordered.
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ * Author: Nikitas Angelinas <nikitas_angelinas@xyratex.com>
+ */
+/**
+ * \addtogroup nrs
+ * @{
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lprocfs_status.h>
+#include <linux/libcfs/libcfs.h>
+#include "ptlrpc_internal.h"
+
+/* XXX: This is just for liblustre. Remove the #if defined directive when the
+ * "cfs_" prefix is dropped from cfs_list_head. */
+extern struct list_head ptlrpc_all_services;
+
+/**
+ * NRS core object.
+ */
+struct nrs_core nrs_core;
+
+static int nrs_policy_init(struct ptlrpc_nrs_policy *policy)
+{
+	const struct ptlrpc_nrs_pol_ops *ops = policy->pol_desc->pd_ops;
+	return ops->op_policy_init == NULL ? 0 : ops->op_policy_init(policy);
+}
+
+static void nrs_policy_fini(struct ptlrpc_nrs_policy *policy)
+{
+	LASSERT(policy->pol_ref == 0);
+	LASSERT(policy->pol_req_queued == 0);
+	/* op_policy_fini is an optional hook; skipped when not provided */
+	if (policy->pol_desc->pd_ops->op_policy_fini != NULL)
+		policy->pol_desc->pd_ops->op_policy_fini(policy);
+}
+
+/* Pass ctl operation \a opc on to the policy's op_policy_ctl() hook. */
+static int nrs_policy_ctl_locked(struct ptlrpc_nrs_policy *policy,
+				 enum ptlrpc_nrs_ctl opc, void *arg)
+{
+	/**
+	 * A stopped policy keeps its lprocfs files and ptlrpc_nrs_policy
+	 * instance until unregistration time, but its policy->pol_private is
+	 * NULL, so the ctl operation must not be performed on it.
+	 */
+	if (policy->pol_state == NRS_POL_STATE_STOPPED)
+		RETURN(-ENODEV);
+
+	if (policy->pol_desc->pd_ops->op_policy_ctl == NULL)
+		RETURN(-ENOSYS);
+	RETURN(policy->pol_desc->pd_ops->op_policy_ctl(policy, opc, arg));
+}
+
+static void nrs_policy_stop0(struct ptlrpc_nrs_policy *policy)
+{
+	struct ptlrpc_nrs *nrs = policy->pol_nrs;
+	ENTRY;
+	/* the optional op_policy_stop hook is run with nrs_lock dropped */
+	if (policy->pol_desc->pd_ops->op_policy_stop != NULL) {
+		spin_unlock(&nrs->nrs_lock);
+
+		policy->pol_desc->pd_ops->op_policy_stop(policy);
+
+		spin_lock(&nrs->nrs_lock);
+	}
+	/* a policy being stopped must have no queued or started requests */
+	LASSERT(list_empty(&policy->pol_list_queued));
+	LASSERT(policy->pol_req_queued == 0 &&
+		policy->pol_req_started == 0);
+
+	policy->pol_private = NULL;
+
+	policy->pol_state = NRS_POL_STATE_STOPPED;
+	/* drop one pd_refs count; at zero the owner module is released */
+	if (atomic_dec_and_test(&policy->pol_desc->pd_refs))
+		module_put(policy->pol_desc->pd_owner);
+
+	EXIT;
+}
+
+/* Start stopping \a policy; complete the stop if only the caller uses it. */
+static int nrs_policy_stop_locked(struct ptlrpc_nrs_policy *policy)
+{
+	struct ptlrpc_nrs *nrs = policy->pol_nrs;
+	ENTRY;
+
+	if (!nrs->nrs_stopping && nrs->nrs_policy_fallback == policy)
+		RETURN(-EPERM);
+
+	switch (policy->pol_state) {
+	case NRS_POL_STATE_STARTING:
+		RETURN(-EAGAIN);
+	case NRS_POL_STATE_STARTED:
+		break;
+	default:	/* In progress or already stopped */
+		RETURN(0);
+	}
+	policy->pol_state = NRS_POL_STATE_STOPPING;
+
+	/* Immediately make it invisible */
+	if (nrs->nrs_policy_primary != policy) {
+		LASSERT(nrs->nrs_policy_fallback == policy);
+		nrs->nrs_policy_fallback = NULL;
+	} else {
+		nrs->nrs_policy_primary = NULL;
+	}
+	/* I have the only refcount */
+	if (policy->pol_ref == 1)
+		nrs_policy_stop0(policy);
+
+	RETURN(0);
+}
+
+/**
+ * Detaches the primary policy (if any) from the \a nrs NRS head and moves it
+ * to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING; a policy with no pending
+ * usage references is stopped at once, which leaves it in
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED.
+ *
+ * \param[in] nrs the NRS head to carry out this operation on
+ */
+static void nrs_policy_stop_primary(struct ptlrpc_nrs *nrs)
+{
+	struct ptlrpc_nrs_policy *tmp = nrs->nrs_policy_primary;
+	ENTRY;
+
+	/* Nothing to do for an NRS head that has no primary policy. */
+	if (tmp != NULL) {
+		/* Hide the policy from the NRS head before stopping it. */
+		nrs->nrs_policy_primary = NULL;
+
+		LASSERT(tmp->pol_state == NRS_POL_STATE_STARTED);
+		tmp->pol_state = NRS_POL_STATE_STOPPING;
+
+		/**
+		 * With references still pending, the final stop is left to
+		 * nrs_policy_put_locked().
+		 */
+		if (tmp->pol_ref == 0)
+			nrs_policy_stop0(tmp);
+	}
+
+	EXIT;
+}
+
+/**
+ * Transitions a policy across the ptlrpc_nrs_pol_state range of values, in
+ * response to an lprocfs command to start a policy.
+ *
+ * If a primary policy different to the current one is specified, this function
+ * will transition the new policy to the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTING and then to
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED, and will then transition
+ * the old primary policy (if there is one) to
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding
+ * references on the policy to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED.
+ *
+ * If the fallback policy is specified, this is taken to indicate an instruction
+ * to stop the current primary policy, without substituting it with another
+ * primary policy, so the primary policy (if any) is transitioned to
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding
+ * references on the policy to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED. In
+ * this case, only the fallback policy is left active in the NRS head.
+ */
+static int nrs_policy_start_locked(struct ptlrpc_nrs_policy *policy)
+{
+	struct ptlrpc_nrs      *nrs = policy->pol_nrs;
+	int			rc = 0;
+	ENTRY;
+
+	/**
+	 * Don't allow several policies to be starting at once; that is too
+	 * complex, and has no real benefit.
+	 */
+	if (nrs->nrs_policy_starting)
+		RETURN(-EAGAIN);
+
+	LASSERT(policy->pol_state != NRS_POL_STATE_STARTING);
+
+	if (policy->pol_state == NRS_POL_STATE_STOPPING)
+		RETURN(-EAGAIN);
+
+	if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) {
+		/**
+		 * This is for cases in which the user sets the policy to the
+		 * fallback policy (currently fifo for all services); i.e. the
+		 * user is resetting the policy to the default; so we stop the
+		 * primary policy, if any.
+		 */
+		if (policy == nrs->nrs_policy_fallback) {
+			nrs_policy_stop_primary(nrs);
+			RETURN(0);
+		}
+
+		/**
+		 * If we reach here, we must be setting up the fallback policy
+		 * at service startup time, and only a single policy with the
+		 * nrs_policy_flags::PTLRPC_NRS_FL_FALLBACK flag set can
+		 * register with NRS core.
+		 */
+		LASSERT(nrs->nrs_policy_fallback == NULL);
+	} else {
+		/**
+		 * Shouldn't start a primary policy without a fallback policy.
+		 */
+		if (nrs->nrs_policy_fallback == NULL)
+			RETURN(-EPERM);
+
+		if (policy->pol_state == NRS_POL_STATE_STARTED)
+			RETURN(0);
+	}
+
+	/**
+	 * Increase the module usage count for policies registering from other
+	 * modules.
+	 */
+	if (atomic_inc_return(&policy->pol_desc->pd_refs) == 1 &&
+	    !try_module_get(policy->pol_desc->pd_owner)) {
+		atomic_dec(&policy->pol_desc->pd_refs);
+		CERROR("NRS: cannot get module for policy %s; is it alive?\n",
+		       policy->pol_desc->pd_name);
+		RETURN(-ENODEV);
+	}
+
+	/**
+	 * Serialize policy starting across the NRS head
+	 */
+	nrs->nrs_policy_starting = 1;
+
+	policy->pol_state = NRS_POL_STATE_STARTING;
+
+	if (policy->pol_desc->pd_ops->op_policy_start) {
+		spin_unlock(&nrs->nrs_lock);
+
+		rc = policy->pol_desc->pd_ops->op_policy_start(policy);
+
+		spin_lock(&nrs->nrs_lock);
+		if (rc != 0) {
+			if (atomic_dec_and_test(&policy->pol_desc->pd_refs))
+				module_put(policy->pol_desc->pd_owner);
+
+			policy->pol_state = NRS_POL_STATE_STOPPED;
+			GOTO(out, rc);
+		}
+	}
+
+	policy->pol_state = NRS_POL_STATE_STARTED;
+
+	if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) {
+		/**
+		 * This path is only used at PTLRPC service setup time.
+		 */
+		nrs->nrs_policy_fallback = policy;
+	} else {
+		/*
+		 * Try to stop the current primary policy if there is one.
+		 */
+		nrs_policy_stop_primary(nrs);
+
+		/**
+		 * And set the newly-started policy as the primary one.
+		 */
+		nrs->nrs_policy_primary = policy;
+	}
+
+out:
+	nrs->nrs_policy_starting = 0;
+
+	RETURN(rc);
+}
+
+/**
+ * Takes one more usage reference on \a policy (a plain pol_ref increment).
+ */
+static inline void nrs_policy_get_locked(struct ptlrpc_nrs_policy *policy)
+{
+	policy->pol_ref++;
+}
+
+/**
+ * Drops one usage reference on \a policy. A policy that is already in
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING is stopped for good once its
+ * last reference goes away, as this indicates it has no more queued or
+ * started requests.
+ */
+static void nrs_policy_put_locked(struct ptlrpc_nrs_policy *policy)
+{
+	LASSERT(policy->pol_ref > 0);
+
+	if (--policy->pol_ref != 0)
+		return;
+	if (unlikely(policy->pol_state == NRS_POL_STATE_STOPPING))
+		nrs_policy_stop0(policy);
+}
+
+static void nrs_policy_put(struct ptlrpc_nrs_policy *policy)
+{
+	spin_lock(&policy->pol_nrs->nrs_lock);
+	nrs_policy_put_locked(policy);	/* may stop a STOPPING policy */
+	spin_unlock(&policy->pol_nrs->nrs_lock);
+}
+
+/**
+ * Returns the policy named \a name with a usage reference taken, or NULL.
+ */
+static struct ptlrpc_nrs_policy *nrs_policy_find_locked(struct ptlrpc_nrs *nrs,
+							char *name)
+{
+	struct ptlrpc_nrs_policy *pol;
+
+	list_for_each_entry(pol, &nrs->nrs_policy_list, pol_list) {
+		if (strncmp(pol->pol_desc->pd_name, name,
+			    NRS_POL_NAME_MAX) != 0)
+			continue;
+		nrs_policy_get_locked(pol);
+		return pol;
+	}
+	return NULL;
+}
+
+/**
+ * Walks the resource hierarchy from \a res up through the res_parent links,
+ * handing every level to the policy's optional op_res_put() operation.
+ */
+static void nrs_resource_put(struct ptlrpc_nrs_resource *res)
+{
+	struct ptlrpc_nrs_policy *policy = res->res_policy;
+	struct ptlrpc_nrs_resource *next;
+
+	if (policy->pol_desc->pd_ops->op_res_put == NULL)
+		return;
+
+	for (; res != NULL; res = next) {
+		next = res->res_parent;
+		policy->pol_desc->pd_ops->op_res_put(policy, res);
+	}
+}
+
+/**
+ * Takes a reference on every level of the resource hierarchy that \a policy
+ * would use to handle request \a nrq, walking from the top level downwards.
+ *
+ * \param[in] policy	  the policy
+ * \param[in] nrq	  the request
+ * \param[in] moving_req  denotes whether this is a call to the function by
+ *			  ldlm_lock_reorder_req(), in order to move \a nrq to
+ *			  the high-priority NRS head; we should not sleep when
+ *			  set.
+ *
+ * \retval NULL		  resource hierarchy references not obtained
+ * \retval valid-pointer  the bottom level of the resource hierarchy
+ *
+ * \see ptlrpc_nrs_pol_ops::op_res_get()
+ */
+static
+struct ptlrpc_nrs_resource * nrs_resource_get(struct ptlrpc_nrs_policy *policy,
+					      struct ptlrpc_nrs_request *nrq,
+					      bool moving_req)
+{
+	/**
+	 * The level obtained last; NULL asks op_res_get() for the top level.
+	 */
+	struct ptlrpc_nrs_resource *res = NULL;
+	struct ptlrpc_nrs_resource *tmp = NULL;
+	int			    rc;
+
+	do {
+		rc = policy->pol_desc->pd_ops->op_res_get(policy, nrq, res,
+							  &tmp, moving_req);
+		if (rc < 0) {
+			if (res != NULL)
+				nrs_resource_put(res);
+			return NULL;
+		}
+
+		LASSERT(tmp != NULL);
+		tmp->res_parent = res;
+		tmp->res_policy = policy;
+		res = tmp;
+		tmp = NULL;
+		/**
+		 * A positive return value marks the bottom level of the
+		 * resource hierarchy, which ends the walk.
+		 */
+	} while (rc == 0);
+
+	return res;
+}
+
+/**
+ * Takes policy references and resource hierarchy references for the fallback
+ * policy and the current primary policy (if any) of \a nrs; these are the
+ * policies that may later be used to handle request \a nrq.
+ *
+ * \param[in]  nrs  the NRS head instance that will be handling request \a nrq.
+ * \param[in]  nrq  the request that is being handled.
+ * \param[out] resp the array where references to the resource hierarchy are
+ *		    stored.
+ * \param[in]  moving_req  is set when obtaining resources while moving a
+ *			   request from a policy on the regular NRS head to a
+ *			   policy on the HP NRS head (via
+ *			   ldlm_lock_reorder_req()). It signifies that
+ *			   allocations to get resources should be atomic; for
+ *			   a full explanation, see comment in
+ *			   ptlrpc_nrs_pol_ops::op_res_get().
+ */
+static void nrs_resource_get_safe(struct ptlrpc_nrs *nrs,
+				  struct ptlrpc_nrs_request *nrq,
+				  struct ptlrpc_nrs_resource **resp,
+				  bool moving_req)
+{
+	struct ptlrpc_nrs_policy *fallback;
+	struct ptlrpc_nrs_policy *primary;
+
+	memset(resp, 0, NRS_RES_MAX * sizeof(*resp));
+
+	/**
+	 * Pin the fallback policy and, if there is one, the primary policy.
+	 */
+	spin_lock(&nrs->nrs_lock);
+
+	fallback = nrs->nrs_policy_fallback;
+	primary = nrs->nrs_policy_primary;
+	nrs_policy_get_locked(fallback);
+	if (primary != NULL)
+		nrs_policy_get_locked(primary);
+
+	spin_unlock(&nrs->nrs_lock);
+
+	/**
+	 * The fallback policy has to come up with a resource hierarchy.
+	 */
+	resp[NRS_RES_FALLBACK] = nrs_resource_get(fallback, nrq, moving_req);
+	LASSERT(resp[NRS_RES_FALLBACK] != NULL);
+
+	if (primary == NULL)
+		return;
+
+	resp[NRS_RES_PRIMARY] = nrs_resource_get(primary, nrq, moving_req);
+
+	/**
+	 * A primary policy may exist which may not wish to serve a
+	 * particular request for different reasons; it will not be used for
+	 * this request then, so give back the policy reference taken
+	 * above.
+	 */
+	if (resp[NRS_RES_PRIMARY] == NULL)
+		nrs_policy_put(primary);
+}
+
+/**
+ * Gives back the resource hierarchy references stored in \a resp, and then
+ * the references on the policies they belong to; used when request handling
+ * has been completed, or the request is moving to the high priority NRS head.
+ *
+ * \param resp	the resource hierarchy that is being released
+ *
+ * \see ptlrpc_nrs_req_hp_move()
+ * \see ptlrpc_nrs_req_finalize()
+ */
+static void nrs_resource_put_safe(struct ptlrpc_nrs_resource **resp)
+{
+	struct ptlrpc_nrs_policy *pols[NRS_RES_MAX];
+	struct ptlrpc_nrs	 *nrs = NULL;
+	int			  i;
+
+	/* Pass 1: release the resource hierarchies, noting their policies. */
+	for (i = 0; i < NRS_RES_MAX; i++) {
+		pols[i] = NULL;
+		if (resp[i] == NULL)
+			continue;
+		pols[i] = resp[i]->res_policy;
+		nrs_resource_put(resp[i]);
+		resp[i] = NULL;
+	}
+
+	/* Pass 2: drop the policy references, taking nrs_lock at most once. */
+	for (i = 0; i < NRS_RES_MAX; i++) {
+		if (pols[i] != NULL) {
+			if (nrs == NULL) {
+				nrs = pols[i]->pol_nrs;
+				spin_lock(&nrs->nrs_lock);
+			}
+			nrs_policy_put_locked(pols[i]);
+		}
+	}
+
+	if (nrs != NULL)
+		spin_unlock(&nrs->nrs_lock);
+}
+
+/**
+ * Obtains an NRS request from \a policy for handling or examination; the
+ * request should be removed in the 'handling' case.
+ *
+ * Calling into this function implies we already know the policy has a request
+ * waiting to be handled.
+ *
+ * \param[in] policy the policy from which a request is obtained
+ * \param[in] peek   when set, signifies that we just want to examine the
+ *		     request, and not handle it, so the request is not removed
+ *		     from the policy.
+ * \param[in] force  when set, it will force a policy to return a request if it
+ *		     has one pending
+ *
+ * \retval the NRS request to be handled
+ */
+static inline
+struct ptlrpc_nrs_request * nrs_request_get(struct ptlrpc_nrs_policy *policy,
+					    bool peek, bool force)
+{
+	struct ptlrpc_nrs_request *nrq;
+
+	LASSERT(policy->pol_req_queued > 0);
+
+	nrq = policy->pol_desc->pd_ops->op_req_get(policy, peek, force);
+	/* the policy is only checked when nrq is non-NULL (see ergo() below) */
+	LASSERT(ergo(nrq != NULL, nrs_request_policy(nrq) == policy));
+
+	return nrq;
+}
+
+/**
+ * Enqueues request \a nrq for later handling, via one of the policies for
+ * which resources were earlier obtained via nrs_resource_get_safe(). The
+ * function attempts to enqueue the request first on the primary policy
+ * (if any), since this is the preferred choice.
+ *
+ * \param nrq the request being enqueued
+ *
+ * \see nrs_resource_get_safe()
+ */
+static inline void nrs_request_enqueue(struct ptlrpc_nrs_request *nrq)
+{
+	struct ptlrpc_nrs_policy *policy;
+	int			  idx = NRS_RES_MAX;
+
+	/**
+	 * Walk the resource slots from the highest index down, so that the
+	 * primary policy (if any) gets the first chance to take the request.
+	 */
+	while (idx-- > 0) {
+		if (nrq->nr_res_ptrs[idx] == NULL)
+			continue;
+
+		nrq->nr_res_idx = idx;
+		policy = nrq->nr_res_ptrs[idx]->res_policy;
+
+		if (policy->pol_desc->pd_ops->op_req_enqueue(policy, nrq) != 0)
+			continue;
+
+		policy->pol_nrs->nrs_req_queued++;
+		policy->pol_req_queued++;
+		return;
+	}
+
+	/**
+	 * Should never get here, as the
+	 * ptlrpc_nrs_pol_ops::op_req_enqueue() implementation of at least
+	 * one of the policies is expected to always succeed.
+	 */
+	LBUG();
+}
+
+/**
+ * Called when a request has been handled
+ *
+ * \param[in] nrq the request that has been handled; can be used for
+ *		  job/resource control.
+ *
+ * \see ptlrpc_nrs_req_stop_nolock()
+ */
+static inline void nrs_request_stop(struct ptlrpc_nrs_request *nrq)
+{
+	struct ptlrpc_nrs_policy *policy = nrs_request_policy(nrq);
+
+	if (policy->pol_desc->pd_ops->op_req_stop)
+		policy->pol_desc->pd_ops->op_req_stop(policy, nrq);
+
+	LASSERT(policy->pol_nrs->nrs_req_started > 0);
+	LASSERT(policy->pol_req_started > 0);
+	/* one started request fewer, on the NRS head and on the policy */
+	policy->pol_nrs->nrs_req_started--;
+	policy->pol_req_started--;
+}
+
+/**
+ * Handler for operations that can be carried out on policies.
+ *
+ * Handles opcodes that are common to all policy types within NRS core, and
+ * passes any unknown opcodes to the policy-specific control function.
+ *
+ * \param[in]	  nrs  the NRS head this policy belongs to.
+ * \param[in]	  name the human-readable policy name; should be the same as
+ *		       ptlrpc_nrs_pol_desc::pd_name.
+ * \param[in]	  opc  the opcode of the operation being carried out.
+ * \param[in,out] arg  can be used to pass information in and out between when
+ *		       carrying an operation; usually data that is private to
+ *		       the policy at some level, or generic policy status
+ *		       information.
+ *
+ * \retval -ve error condition
+ * \retval   0 operation was carried out successfully
+ */
+static int nrs_policy_ctl(struct ptlrpc_nrs *nrs, char *name,
+			  enum ptlrpc_nrs_ctl opc, void *arg)
+{
+	struct ptlrpc_nrs_policy       *policy;
+	int				rc = 0;
+	ENTRY;
+
+	spin_lock(&nrs->nrs_lock);
+
+	policy = nrs_policy_find_locked(nrs, name);
+	if (policy == NULL)
+		GOTO(out, rc = -ENOENT);
+
+	switch (opc) {
+		/**
+		 * Unknown opcode, pass it down to the policy-specific control
+		 * function for handling.
+		 */
+	default:
+		rc = nrs_policy_ctl_locked(policy, opc, arg);
+		break;
+
+		/**
+		 * Start \e policy
+		 */
+	case PTLRPC_NRS_CTL_START:
+		rc = nrs_policy_start_locked(policy);
+		break;
+	}
+out:
+	if (policy != NULL)
+		nrs_policy_put_locked(policy);
+
+	spin_unlock(&nrs->nrs_lock);
+
+	RETURN(rc);
+}
+
+/**
+ * Unregisters a policy by name.
+ *
+ * \param[in] nrs  the NRS head this policy belongs to.
+ * \param[in] name the human-readable policy name; should be the same as
+ *		   ptlrpc_nrs_pol_desc::pd_name
+ *
+ * \retval -ve error
+ * \retval   0 success
+ */
+static int nrs_policy_unregister(struct ptlrpc_nrs *nrs, char *name)
+{
+	struct ptlrpc_nrs_policy *policy = NULL;
+	ENTRY;
+
+	spin_lock(&nrs->nrs_lock);
+
+	policy = nrs_policy_find_locked(nrs, name);
+	if (policy == NULL) {
+		spin_unlock(&nrs->nrs_lock);
+
+		CERROR("Can't find NRS policy %s\n", name);
+		RETURN(-ENOENT);
+	}
+
+	if (policy->pol_ref > 1) {
+		CERROR("Policy %s is busy with %d references\n", name,
+		       (int)policy->pol_ref);
+		nrs_policy_put_locked(policy);
+
+		spin_unlock(&nrs->nrs_lock);
+		RETURN(-EBUSY);
+	}
+
+	LASSERT(policy->pol_req_queued == 0);
+	LASSERT(policy->pol_req_started == 0);
+
+	if (policy->pol_state != NRS_POL_STATE_STOPPED) {
+		nrs_policy_stop_locked(policy);
+		LASSERT(policy->pol_state == NRS_POL_STATE_STOPPED);
+	}
+
+	list_del(&policy->pol_list);
+	nrs->nrs_num_pols--;
+
+	nrs_policy_put_locked(policy);
+
+	spin_unlock(&nrs->nrs_lock);
+
+	nrs_policy_fini(policy);
+
+	LASSERT(policy->pol_private == NULL);
+	OBD_FREE_PTR(policy);
+
+	RETURN(0);
+}
+
+/**
+ * Register a policy from policy descriptor \a desc with NRS head \a nrs.
+ *
+ * \param[in] nrs   the NRS head on which the policy will be registered.
+ * \param[in] desc  the policy descriptor from which the information will be
+ *		    obtained to register the policy.
+ *
+ * \retval -ve error
+ * \retval   0 success
+ */
+static int nrs_policy_register(struct ptlrpc_nrs *nrs,
+			       struct ptlrpc_nrs_pol_desc *desc)
+{
+	struct ptlrpc_nrs_policy       *policy;
+	struct ptlrpc_nrs_policy       *tmp;
+	struct ptlrpc_service_part     *svcpt = nrs->nrs_svcpt;
+	int				rc;
+	ENTRY;
+
+	LASSERT(svcpt != NULL);
+	LASSERT(desc->pd_ops != NULL);
+	LASSERT(desc->pd_ops->op_res_get != NULL);
+	LASSERT(desc->pd_ops->op_req_get != NULL);
+	LASSERT(desc->pd_ops->op_req_enqueue != NULL);
+	LASSERT(desc->pd_ops->op_req_dequeue != NULL);
+	LASSERT(desc->pd_compat != NULL);
+
+	OBD_CPT_ALLOC_GFP(policy, svcpt->scp_service->srv_cptable,
+			  svcpt->scp_cpt, sizeof(*policy), __GFP_IO);
+	if (policy == NULL)
+		RETURN(-ENOMEM);
+
+	policy->pol_nrs     = nrs;
+	policy->pol_desc    = desc;
+	policy->pol_state   = NRS_POL_STATE_STOPPED;
+	policy->pol_flags   = desc->pd_flags;
+
+	INIT_LIST_HEAD(&policy->pol_list);
+	INIT_LIST_HEAD(&policy->pol_list_queued);
+
+	rc = nrs_policy_init(policy);
+	if (rc != 0) {
+		OBD_FREE_PTR(policy);
+		RETURN(rc);
+	}
+
+	spin_lock(&nrs->nrs_lock);
+
+	tmp = nrs_policy_find_locked(nrs, policy->pol_desc->pd_name);
+	if (tmp != NULL) {
+		CERROR("NRS policy %s has been registered, can't register it "
+		       "for %s\n", policy->pol_desc->pd_name,
+		       svcpt->scp_service->srv_name);
+		nrs_policy_put_locked(tmp);
+
+		spin_unlock(&nrs->nrs_lock);
+		nrs_policy_fini(policy);
+		OBD_FREE_PTR(policy);
+
+		RETURN(-EEXIST);
+	}
+
+	list_add_tail(&policy->pol_list, &nrs->nrs_policy_list);
+	nrs->nrs_num_pols++;
+
+	if (policy->pol_flags & PTLRPC_NRS_FL_REG_START)
+		rc = nrs_policy_start_locked(policy);
+
+	spin_unlock(&nrs->nrs_lock);
+
+	if (rc != 0)
+		(void) nrs_policy_unregister(nrs, policy->pol_desc->pd_name);
+
+	RETURN(rc);
+}
+
+/**
+ * Enqueue request \a req using one of the policies its resources are referring
+ * to.
+ *
+ * \param[in] req the request to enqueue.
+ */
+static void ptlrpc_nrs_req_add_nolock(struct ptlrpc_request *req)
+{
+	struct ptlrpc_nrs_policy       *policy;
+
+	LASSERT(req->rq_nrq.nr_initialized);
+	LASSERT(!req->rq_nrq.nr_enqueued);
+
+	nrs_request_enqueue(&req->rq_nrq);
+	req->rq_nrq.nr_enqueued = 1;
+
+	policy = nrs_request_policy(&req->rq_nrq);
+	/**
+	 * Add the policy to the NRS head's list of policies with enqueued
+	 * requests, if it has not been added there.
+	 */
+	if (unlikely(list_empty(&policy->pol_list_queued)))
+		list_add_tail(&policy->pol_list_queued,
+				  &policy->pol_nrs->nrs_policy_queued);
+}
+
+/**
+ * Enqueue a request on the high priority NRS head.
+ *
+ * \param req the request to enqueue.
+ */
+static void ptlrpc_nrs_hpreq_add_nolock(struct ptlrpc_request *req)
+{
+	int	opc = lustre_msg_get_opc(req->rq_reqmsg);
+	ENTRY;
+
+	spin_lock(&req->rq_lock);
+	req->rq_hp = 1;
+	ptlrpc_nrs_req_add_nolock(req);
+	if (opc != OBD_PING)
+		DEBUG_REQ(D_NET, req, "high priority req");
+	spin_unlock(&req->rq_lock);
+	EXIT;
+}
+
+/**
+ * Returns a boolean predicate indicating whether the policy described by
+ * \a desc is adequate for use with service \a svc.
+ *
+ * \param[in] svc  the service
+ * \param[in] desc the policy descriptor
+ *
+ * \retval false the policy is not compatible with the service
+ * \retval true	 the policy is compatible with the service
+ */
+static inline bool nrs_policy_compatible(const struct ptlrpc_service *svc,
+					 const struct ptlrpc_nrs_pol_desc *desc)
+{
+	return desc->pd_compat(svc, desc);
+}
+
+/**
+ * Registers all compatible policies in nrs_core.nrs_policies, for NRS head
+ * \a nrs.
+ *
+ * \param[in] nrs the NRS head
+ *
+ * \retval -ve error
+ * \retval   0 success
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ *
+ * \see ptlrpc_service_nrs_setup()
+ */
+static int nrs_register_policies_locked(struct ptlrpc_nrs *nrs)
+{
+	struct ptlrpc_nrs_pol_desc *desc;
+	/* for convenience */
+	struct ptlrpc_service_part	 *svcpt = nrs->nrs_svcpt;
+	struct ptlrpc_service		 *svc = svcpt->scp_service;
+	int				  rc = -EINVAL;
+	ENTRY;
+
+	LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+
+	list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) {
+		if (nrs_policy_compatible(svc, desc)) {
+			rc = nrs_policy_register(nrs, desc);
+			if (rc != 0) {
+				CERROR("Failed to register NRS policy %s for "
+				       "partition %d of service %s: %d\n",
+				       desc->pd_name, svcpt->scp_cpt,
+				       svc->srv_name, rc);
+				/**
+				 * Fail registration if any of the policies'
+				 * registration fails.
+				 */
+				break;
+			}
+		}
+	}
+
+	RETURN(rc);
+}
+
+/**
+ * Initializes NRS head \a nrs of service partition \a svcpt, and registers all
+ * compatible policies in NRS core, with the NRS head.
+ *
+ * \param[in] nrs   the NRS head
+ * \param[in] svcpt the PTLRPC service partition to setup
+ *
+ * \retval -ve error
+ * \retval   0 success
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ */
+static int nrs_svcpt_setup_locked0(struct ptlrpc_nrs *nrs,
+				   struct ptlrpc_service_part *svcpt)
+{
+	int				rc;
+	enum ptlrpc_nrs_queue_type	queue;
+	ENTRY;
+
+	LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+
+	if (nrs == &svcpt->scp_nrs_reg)
+		queue = PTLRPC_NRS_QUEUE_REG;
+	else if (nrs == svcpt->scp_nrs_hp)
+		queue = PTLRPC_NRS_QUEUE_HP;
+	else
+		LBUG();
+
+	nrs->nrs_svcpt = svcpt;
+	nrs->nrs_queue_type = queue;
+	spin_lock_init(&nrs->nrs_lock);
+	INIT_LIST_HEAD(&nrs->nrs_policy_list);
+	INIT_LIST_HEAD(&nrs->nrs_policy_queued);
+
+	rc = nrs_register_policies_locked(nrs);
+	RETURN(rc);
+}
+
+/**
+ * Allocates a regular and optionally a high-priority NRS head (if the service
+ * handles high-priority RPCs), and then registers all available compatible
+ * policies on those NRS heads.
+ *
+ * \param[in,out] svcpt the PTLRPC service partition to setup
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ */
+static int nrs_svcpt_setup_locked(struct ptlrpc_service_part *svcpt)
+{
+	struct ptlrpc_nrs	       *nrs;
+	int				rc;
+	ENTRY;
+
+	LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+
+	/**
+	 * Initialize the regular NRS head.
+	 */
+	nrs = nrs_svcpt2nrs(svcpt, false);
+	rc = nrs_svcpt_setup_locked0(nrs, svcpt);
+	if (rc < 0)
+		GOTO(out, rc);
+
+	/**
+	 * Optionally allocate a high-priority NRS head.
+	 */
+	if (svcpt->scp_service->srv_ops.so_hpreq_handler == NULL)
+		GOTO(out, rc);
+
+	OBD_CPT_ALLOC_PTR(svcpt->scp_nrs_hp,
+			  svcpt->scp_service->srv_cptable,
+			  svcpt->scp_cpt);
+	if (svcpt->scp_nrs_hp == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	nrs = nrs_svcpt2nrs(svcpt, true);
+	rc = nrs_svcpt_setup_locked0(nrs, svcpt);
+
+out:
+	RETURN(rc);
+}
+
+/**
+ * Unregisters all policies on all available NRS heads in a service partition;
+ * called at PTLRPC service unregistration time.
+ *
+ * \param[in] svcpt the PTLRPC service partition
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ */
+static void nrs_svcpt_cleanup_locked(struct ptlrpc_service_part *svcpt)
+{
+	struct ptlrpc_nrs	       *nrs;
+	struct ptlrpc_nrs_policy       *policy;
+	struct ptlrpc_nrs_policy       *tmp;
+	int				rc;
+	bool				hp = false;
+	ENTRY;
+
+	LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+
+again:
+	nrs = nrs_svcpt2nrs(svcpt, hp);
+	nrs->nrs_stopping = 1;
+
+	list_for_each_entry_safe(policy, tmp, &nrs->nrs_policy_list,
+				     pol_list) {
+		rc = nrs_policy_unregister(nrs, policy->pol_desc->pd_name);
+		LASSERT(rc == 0);
+	}
+
+	/**
+	 * If the service partition has an HP NRS head, clean that up as well.
+	 */
+	if (!hp && nrs_svcpt_has_hp(svcpt)) {
+		hp = true;
+		goto again;
+	}
+
+	if (hp)
+		OBD_FREE_PTR(nrs);
+
+	EXIT;
+}
+
+/**
+ * Returns the descriptor for a policy as identified by \a name.
+ *
+ * \param[in] name the policy name
+ *
+ * \retval the policy descriptor
+ * \retval NULL
+ */
+static struct ptlrpc_nrs_pol_desc *nrs_policy_find_desc_locked(const char *name)
+{
+	struct ptlrpc_nrs_pol_desc     *tmp;
+	ENTRY;
+
+	list_for_each_entry(tmp, &nrs_core.nrs_policies, pd_list) {
+		if (strncmp(tmp->pd_name, name, NRS_POL_NAME_MAX) == 0)
+			RETURN(tmp);
+	}
+	RETURN(NULL);
+}
+
+/**
+ * Removes the policy from all supported NRS heads of all partitions of all
+ * PTLRPC services.
+ *
+ * \param[in] desc the policy descriptor to unregister
+ *
+ * \retval -ve error
+ * \retval  0  successfully unregistered policy on all supported NRS heads
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ * \pre mutex_is_locked(&ptlrpc_all_services_mutex)
+ */
+static int nrs_policy_unregister_locked(struct ptlrpc_nrs_pol_desc *desc)
+{
+	struct ptlrpc_nrs	       *nrs;
+	struct ptlrpc_service	       *svc;
+	struct ptlrpc_service_part     *svcpt;
+	int				i;
+	int				rc = 0;
+	ENTRY;
+
+	LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+	LASSERT(mutex_is_locked(&ptlrpc_all_services_mutex));
+
+	list_for_each_entry(svc, &ptlrpc_all_services, srv_list) {
+
+		if (!nrs_policy_compatible(svc, desc) ||
+		    unlikely(svc->srv_is_stopping))
+			continue;
+
+		ptlrpc_service_for_each_part(svcpt, i, svc) {
+			bool hp = false;
+
+again:
+			nrs = nrs_svcpt2nrs(svcpt, hp);
+			rc = nrs_policy_unregister(nrs, desc->pd_name);
+			/**
+			 * Ignore -ENOENT as the policy may not have registered
+			 * successfully on all service partitions.
+			 */
+			if (rc == -ENOENT) {
+				rc = 0;
+			} else if (rc != 0) {
+				CERROR("Failed to unregister NRS policy %s for "
+				       "partition %d of service %s: %d\n",
+				       desc->pd_name, svcpt->scp_cpt,
+				       svcpt->scp_service->srv_name, rc);
+				RETURN(rc);
+			}
+
+			if (!hp && nrs_svc_has_hp(svc)) {
+				hp = true;
+				goto again;
+			}
+		}
+
+		if (desc->pd_ops->op_lprocfs_fini != NULL)
+			desc->pd_ops->op_lprocfs_fini(svc);
+	}
+
+	RETURN(rc);
+}
+
+/**
+ * Registers a new policy with NRS core.
+ *
+ * The function will only succeed if policy registration with all compatible
+ * service partitions (if any) is successful.
+ *
+ * N.B. This function should be called either at ptlrpc module initialization
+ *	time when registering a policy that ships with NRS core, or in a
+ *	module's init() function for policies registering from other modules.
+ *
+ * \param[in] conf configuration information for the new policy to register
+ *
+ * \retval -ve error
+ * \retval   0 success
+ */
+int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_conf *conf)
+{
+	struct ptlrpc_service	       *svc;
+	struct ptlrpc_nrs_pol_desc     *desc;
+	int				rc = 0;
+	ENTRY;
+
+	LASSERT(conf != NULL);
+	LASSERT(conf->nc_ops != NULL);
+	LASSERT(conf->nc_compat != NULL);
+	LASSERT(ergo(conf->nc_compat == nrs_policy_compat_one,
+		conf->nc_compat_svc_name != NULL));
+	LASSERT(ergo((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0,
+		     conf->nc_owner != NULL));
+
+	conf->nc_name[NRS_POL_NAME_MAX - 1] = '\0';
+
+	/**
+	 * External policies are not allowed to start immediately upon
+	 * registration, as there is a relatively higher chance that their
+	 * registration might fail. In such a case, some policy instances may
+	 * already have requests queued when unregistration needs to happen as
+	 * part of cleanup; since there is currently no way to drain requests
+	 * from a policy unless the service is unregistering, we just disallow
+	 * this.
+	 */
+	if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) &&
+	    (conf->nc_flags & (PTLRPC_NRS_FL_FALLBACK |
+			       PTLRPC_NRS_FL_REG_START))) {
+		CERROR("NRS: failing to register policy %s. Please check "
+		       "policy flags; external policies cannot act as fallback "
+		       "policies, or be started immediately upon registration "
+		       "without interaction with lprocfs\n", conf->nc_name);
+		RETURN(-EINVAL);
+	}
+
+	mutex_lock(&nrs_core.nrs_mutex);
+
+	if (nrs_policy_find_desc_locked(conf->nc_name) != NULL) {
+		CERROR("NRS: failing to register policy %s which has already "
+		       "been registered with NRS core!\n",
+		       conf->nc_name);
+		GOTO(fail, rc = -EEXIST);
+	}
+
+	OBD_ALLOC_PTR(desc);
+	if (desc == NULL)
+		GOTO(fail, rc = -ENOMEM);
+
+	strncpy(desc->pd_name, conf->nc_name, NRS_POL_NAME_MAX);
+	desc->pd_ops		 = conf->nc_ops;
+	desc->pd_compat		 = conf->nc_compat;
+	desc->pd_compat_svc_name = conf->nc_compat_svc_name;
+	if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0)
+		desc->pd_owner	 = conf->nc_owner;
+	desc->pd_flags		 = conf->nc_flags;
+	atomic_set(&desc->pd_refs, 0);
+
+	/**
+	 * For policies that are held in the same module as NRS (currently
+	 * ptlrpc), do not register the policy with all compatible services,
+	 * as the services will not have started at this point, since we are
+	 * calling from ptlrpc module initialization code. In such cases each
+	 * service will register all compatible policies later, via
+	 * ptlrpc_service_nrs_setup().
+	 */
+	if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) == 0)
+		goto internal;
+
+	/**
+	 * Register the new policy on all compatible services
+	 */
+	mutex_lock(&ptlrpc_all_services_mutex);
+
+	list_for_each_entry(svc, &ptlrpc_all_services, srv_list) {
+		struct ptlrpc_service_part     *svcpt;
+		int				i;
+		int				rc2;
+
+		if (!nrs_policy_compatible(svc, desc) ||
+		    unlikely(svc->srv_is_stopping))
+			continue;
+
+		ptlrpc_service_for_each_part(svcpt, i, svc) {
+			struct ptlrpc_nrs      *nrs;
+			bool			hp = false;
+again:
+			nrs = nrs_svcpt2nrs(svcpt, hp);
+			rc = nrs_policy_register(nrs, desc);
+			if (rc != 0) {
+				CERROR("Failed to register NRS policy %s for "
+				       "partition %d of service %s: %d\n",
+				       desc->pd_name, svcpt->scp_cpt,
+				       svcpt->scp_service->srv_name, rc);
+
+				rc2 = nrs_policy_unregister_locked(desc);
+				/**
+				 * Should not fail at this point
+				 */
+				LASSERT(rc2 == 0);
+				mutex_unlock(&ptlrpc_all_services_mutex);
+				OBD_FREE_PTR(desc);
+				GOTO(fail, rc);
+			}
+
+			if (!hp && nrs_svc_has_hp(svc)) {
+				hp = true;
+				goto again;
+			}
+		}
+
+		/**
+		 * No need to take a reference to other modules here, as we
+		 * will be calling from the module's init() function.
+		 */
+		if (desc->pd_ops->op_lprocfs_init != NULL) {
+			rc = desc->pd_ops->op_lprocfs_init(svc);
+			if (rc != 0) {
+				rc2 = nrs_policy_unregister_locked(desc);
+				/**
+				 * Should not fail at this point
+				 */
+				LASSERT(rc2 == 0);
+				mutex_unlock(&ptlrpc_all_services_mutex);
+				OBD_FREE_PTR(desc);
+				GOTO(fail, rc);
+			}
+		}
+	}
+
+	mutex_unlock(&ptlrpc_all_services_mutex);
+internal:
+	list_add_tail(&desc->pd_list, &nrs_core.nrs_policies);
+fail:
+	mutex_unlock(&nrs_core.nrs_mutex);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_nrs_policy_register);
+
+/**
+ * Unregisters a previously registered policy with NRS core. All instances of
+ * the policy on all NRS heads of all supported services are removed.
+ *
+ * N.B. This function should only be called from a module's exit() function.
+ *	Although it can be used for policies that ship alongside NRS core, the
+ *	function is primarily intended for policies that register externally,
+ *	from other modules.
+ *
+ * \param[in] conf configuration information for the policy to unregister
+ *
+ * \retval -ve error
+ * \retval   0 success
+ */
+int ptlrpc_nrs_policy_unregister(struct ptlrpc_nrs_pol_conf *conf)
+{
+	struct ptlrpc_nrs_pol_desc	*desc;
+	int				 rc;
+	ENTRY;
+
+	LASSERT(conf != NULL);
+
+	if (conf->nc_flags & PTLRPC_NRS_FL_FALLBACK) {
+		CERROR("Unable to unregister a fallback policy, unless the "
+		       "PTLRPC service is stopping.\n");
+		RETURN(-EPERM);
+	}
+
+	conf->nc_name[NRS_POL_NAME_MAX - 1] = '\0';
+
+	mutex_lock(&nrs_core.nrs_mutex);
+
+	desc = nrs_policy_find_desc_locked(conf->nc_name);
+	if (desc == NULL) {
+		CERROR("Failing to unregister NRS policy %s which has "
+		       "not been registered with NRS core!\n",
+		       conf->nc_name);
+		GOTO(not_exist, rc = -ENOENT);
+	}
+
+	mutex_lock(&ptlrpc_all_services_mutex);
+
+	rc = nrs_policy_unregister_locked(desc);
+	if (rc < 0) {
+		if (rc == -EBUSY)
+			CERROR("Please first stop policy %s on all service "
+			       "partitions and then retry to unregister the "
+			       "policy.\n", conf->nc_name);
+		GOTO(fail, rc);
+	}
+
+	CDEBUG(D_INFO, "Unregistering policy %s from NRS core.\n",
+	       conf->nc_name);
+
+	list_del(&desc->pd_list);
+	OBD_FREE_PTR(desc);
+
+fail:
+	mutex_unlock(&ptlrpc_all_services_mutex);
+
+not_exist:
+	mutex_unlock(&nrs_core.nrs_mutex);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_nrs_policy_unregister);
+
+/**
+ * Setup NRS heads on all service partitions of service \a svc, and register
+ * all compatible policies on those NRS heads.
+ *
+ * To be called from within ptlrpc, presumably ptlrpc_register_service().
+ * \param[in] svc the service to setup
+ *
+ * \retval -ve error, the calling logic should eventually call
+ *		      ptlrpc_service_nrs_cleanup() to undo any work performed
+ *		      by this function.
+ *
+ * \see ptlrpc_register_service()
+ * \see ptlrpc_service_nrs_cleanup()
+ */
+int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc)
+{
+	struct ptlrpc_service_part	       *svcpt;
+	const struct ptlrpc_nrs_pol_desc       *desc;
+	int					i;
+	int					rc = 0;
+	ENTRY;
+
+	mutex_lock(&nrs_core.nrs_mutex);
+
+	/**
+	 * Initialize NRS heads on all service CPTs.
+	 */
+	ptlrpc_service_for_each_part(svcpt, i, svc) {
+		rc = nrs_svcpt_setup_locked(svcpt);
+		if (rc != 0)
+			GOTO(failed, rc);
+	}
+
+	/**
+	 * Set up lprocfs interfaces for all supported policies for the
+	 * service.
+	 */
+	list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) {
+		if (!nrs_policy_compatible(svc, desc))
+			continue;
+
+		if (desc->pd_ops->op_lprocfs_init != NULL) {
+			rc = desc->pd_ops->op_lprocfs_init(svc);
+			if (rc != 0)
+				GOTO(failed, rc);
+		}
+	}
+
+failed:
+	mutex_unlock(&nrs_core.nrs_mutex);
+
+	RETURN(rc);
+}
+
+/**
+ * Unregisters all policies on all service partitions of service \a svc.
+ *
+ * \param[in] svc the PTLRPC service to unregister
+ */
+void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc)
+{
+	struct ptlrpc_service_part	     *svcpt;
+	const struct ptlrpc_nrs_pol_desc     *desc;
+	int				      i;
+
+	mutex_lock(&nrs_core.nrs_mutex);
+
+	/**
+	 * Clean up NRS heads on all service partitions
+	 */
+	ptlrpc_service_for_each_part(svcpt, i, svc)
+		nrs_svcpt_cleanup_locked(svcpt);
+
+	/**
+	 * Clean up lprocfs interfaces for all supported policies for the
+	 * service.
+	 */
+	list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) {
+		if (!nrs_policy_compatible(svc, desc))
+			continue;
+
+		if (desc->pd_ops->op_lprocfs_fini != NULL)
+			desc->pd_ops->op_lprocfs_fini(svc);
+	}
+
+	mutex_unlock(&nrs_core.nrs_mutex);
+}
+
+/**
+ * Obtains NRS head resources for request \a req.
+ *
+ * These could be either on the regular or HP NRS head of \a svcpt; resources
+ * taken on the regular head can later be swapped for HP head resources by
+ * ldlm_lock_reorder_req().
+ *
+ * \param[in] svcpt the service partition
+ * \param[in] req   the request
+ * \param[in] hp    which NRS head of \a svcpt to use
+ */
+void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt,
+			       struct ptlrpc_request *req, bool hp)
+{
+	struct ptlrpc_nrs	*nrs = nrs_svcpt2nrs(svcpt, hp);
+
+	memset(&req->rq_nrq, 0, sizeof(req->rq_nrq));
+	nrs_resource_get_safe(nrs, &req->rq_nrq, req->rq_nrq.nr_res_ptrs,
+			      false);
+
+	/**
+	 * It is fine to access \e nr_initialized without locking as there is
+	 * no contention at this early stage.
+	 */
+	req->rq_nrq.nr_initialized = 1;
+}
+
+/**
+ * Releases resources for a request; is called after the request has been
+ * handled.
+ *
+ * \param[in] req the request
+ *
+ * \see ptlrpc_server_finish_request()
+ */
+void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req)
+{
+	if (req->rq_nrq.nr_initialized) {
+		nrs_resource_put_safe(req->rq_nrq.nr_res_ptrs);
+		/* no protection on bit nr_finalized because no
+		 * contention at this late stage */
+		req->rq_nrq.nr_finalized = 1;
+	}
+}
+
+void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req)
+{
+	if (req->rq_nrq.nr_started)
+		nrs_request_stop(&req->rq_nrq);
+}
+
+/**
+ * Enqueues request \a req on either the regular or high-priority NRS head
+ * of service partition \a svcpt.
+ *
+ * \param[in] svcpt the service partition
+ * \param[in] req   the request to be enqueued
+ * \param[in] hp    whether to enqueue the request on the regular or
+ *		    high-priority NRS head.
+ */
+void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt,
+			struct ptlrpc_request *req, bool hp)
+{
+	spin_lock(&svcpt->scp_req_lock);
+
+	if (hp)
+		ptlrpc_nrs_hpreq_add_nolock(req);
+	else
+		ptlrpc_nrs_req_add_nolock(req);
+
+	spin_unlock(&svcpt->scp_req_lock);
+}
+
+static void nrs_request_removed(struct ptlrpc_nrs_policy *policy)
+{
+	LASSERT(policy->pol_nrs->nrs_req_queued > 0);
+	LASSERT(policy->pol_req_queued > 0);
+
+	policy->pol_nrs->nrs_req_queued--;
+	policy->pol_req_queued--;
+
+	/**
+	 * If the policy has no more requests queued, remove it from
+	 * ptlrpc_nrs::nrs_policy_queued.
+	 */
+	if (unlikely(policy->pol_req_queued == 0)) {
+		list_del_init(&policy->pol_list_queued);
+
+		/**
+		 * If there are other policies with queued requests, move the
+		 * current policy to the end so that we can round robin over
+		 * all policies and drain the requests.
+		 */
+	} else if (policy->pol_req_queued != policy->pol_nrs->nrs_req_queued) {
+		LASSERT(policy->pol_req_queued <
+			policy->pol_nrs->nrs_req_queued);
+
+		list_move_tail(&policy->pol_list_queued,
+				   &policy->pol_nrs->nrs_policy_queued);
+	}
+}
+
+/**
+ * Obtains a request for handling from an NRS head of service partition
+ * \a svcpt.
+ *
+ * \param[in] svcpt the service partition
+ * \param[in] hp    whether to obtain a request from the regular or
+ *		    high-priority NRS head.
+ * \param[in] peek  when set, signifies that we just want to examine the
+ *		    request, and not handle it, so the request is not removed
+ *		    from the policy.
+ * \param[in] force when set, it will force a policy to return a request if it
+ *		    has one pending
+ *
+ * \retval the	request to be handled
+ * \retval NULL the head has no requests to serve
+ */
+struct ptlrpc_request *
+ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp,
+			   bool peek, bool force)
+{
+	struct ptlrpc_nrs	  *nrs = nrs_svcpt2nrs(svcpt, hp);
+	struct ptlrpc_nrs_policy  *policy;
+	struct ptlrpc_nrs_request *nrq;
+
+	/**
+	 * Always try to drain requests from all NRS policies even if they are
+	 * inactive, because the user can change policy status at runtime.
+	 */
+	list_for_each_entry(policy, &nrs->nrs_policy_queued,
+				pol_list_queued) {
+		nrq = nrs_request_get(policy, peek, force);
+		if (nrq != NULL) {
+			if (likely(!peek)) {
+				nrq->nr_started = 1;
+
+				policy->pol_req_started++;
+				policy->pol_nrs->nrs_req_started++;
+
+				nrs_request_removed(policy);
+			}
+
+			return container_of(nrq, struct ptlrpc_request, rq_nrq);
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * Dequeues request \a req from the policy it has been enqueued on.
+ *
+ * \param[in] req the request
+ */
+void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req)
+{
+	struct ptlrpc_nrs_policy *policy = nrs_request_policy(&req->rq_nrq);
+
+	policy->pol_desc->pd_ops->op_req_dequeue(policy, &req->rq_nrq);
+
+	req->rq_nrq.nr_enqueued = 0;
+
+	nrs_request_removed(policy);
+}
+
+/**
+ * Returns whether there are any requests currently enqueued on any of the
+ * policies of service partition's \a svcpt NRS head specified by \a hp. Should
+ * be called while holding ptlrpc_service_part::scp_req_lock to get a reliable
+ * result.
+ *
+ * \param[in] svcpt the service partition to enquire.
+ * \param[in] hp    whether the regular or high-priority NRS head is to be
+ *		    enquired.
+ *
+ * \retval false the indicated NRS head has no enqueued requests.
+ * \retval true	 the indicated NRS head has some enqueued requests.
+ */
+bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp)
+{
+	struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp);
+
+	return nrs->nrs_req_queued > 0;
+}
+
+/**
+ * Moves request \a req from the regular to the high-priority NRS head.
+ *
+ * \param[in] req the request to move
+ */
+void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req)
+{
+	struct ptlrpc_service_part	*svcpt = req->rq_rqbd->rqbd_svcpt;
+	struct ptlrpc_nrs_request	*nrq = &req->rq_nrq;
+	struct ptlrpc_nrs_resource	*res1[NRS_RES_MAX];
+	struct ptlrpc_nrs_resource	*res2[NRS_RES_MAX];
+	ENTRY;
+
+	/**
+	 * Obtain the high-priority NRS head resources.
+	 */
+	nrs_resource_get_safe(nrs_svcpt2nrs(svcpt, true), nrq, res1, true);
+
+	spin_lock(&svcpt->scp_req_lock);
+
+	if (!ptlrpc_nrs_req_can_move(req))
+		goto out;
+
+	ptlrpc_nrs_req_del_nolock(req);
+
+	memcpy(res2, nrq->nr_res_ptrs, NRS_RES_MAX * sizeof(res2[0]));
+	memcpy(nrq->nr_res_ptrs, res1, NRS_RES_MAX * sizeof(res1[0]));
+
+	ptlrpc_nrs_hpreq_add_nolock(req);
+
+	memcpy(res1, res2, NRS_RES_MAX * sizeof(res1[0]));
+out:
+	spin_unlock(&svcpt->scp_req_lock);
+
+	/**
+	 * Release either the regular NRS head resources if we moved the
+	 * request, or the high-priority NRS head resources if we took a
+	 * reference earlier in this function and ptlrpc_nrs_req_can_move()
+	 * returned false.
+	 */
+	nrs_resource_put_safe(res1);
+	EXIT;
+}
+
+/**
+ * Carries out a control operation \a opc on the policy identified by the
+ * human-readable \a name, on either all partitions, or only on the first
+ * partition of service \a svc.
+ *
+ * \param[in]	  svc	 the service the policy belongs to.
+ * \param[in]	  queue  whether to carry out the command on the policy which
+ *			 belongs to the regular, high-priority, or both NRS
+ *			 heads of service partitions of \a svc.
+ * \param[in]	  name   the policy to act upon, by human-readable name
+ * \param[in]	  opc	 the opcode of the operation to carry out
+ * \param[in]	  single when set, the operation will only be carried out on the
+ *			 NRS heads of the first service partition of \a svc.
+ *			 This is useful for some policies which e.g. share
+ *			 identical values on the same parameters of different
+ *			 service partitions; when reading these parameters via
+ *			 lprocfs, these policies may just want to obtain and
+ *			 print out the values from the first service partition.
+ *			 Storing these values centrally elsewhere then could be
+ *			 another solution for this.
+ * \param[in,out] arg	 can be used as a generic in/out buffer between control
+ *			 operations and the user environment.
+ *
+ *\retval -ve error condition
+ *\retval   0 operation was carried out successfully
+ */
+int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc,
+			      enum ptlrpc_nrs_queue_type queue, char *name,
+			      enum ptlrpc_nrs_ctl opc, bool single, void *arg)
+{
+	struct ptlrpc_service_part     *svcpt;
+	int				i;
+	int				rc = 0;
+	ENTRY;
+
+	LASSERT(opc != PTLRPC_NRS_CTL_INVALID);
+
+	if ((queue & PTLRPC_NRS_QUEUE_BOTH) == 0)
+		RETURN(-EINVAL);
+
+	ptlrpc_service_for_each_part(svcpt, i, svc) {
+		if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) {
+			rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, false), name,
+					    opc, arg);
+			if (rc != 0 || (queue == PTLRPC_NRS_QUEUE_REG &&
+					single))
+				GOTO(out, rc);
+		}
+
+		if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) {
+			/**
+			 * XXX: We could optionally check for
+			 * nrs_svc_has_hp(svc) here, and return an error if it
+			 * is false. Right now we rely on the policies' lprocfs
+			 * handlers that call the present function to make this
+			 * check; if they fail to do so, they might hit the
+			 * assertion inside nrs_svcpt2nrs() below.
+			 */
+			rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, true), name,
+					    opc, arg);
+			if (rc != 0 || single)
+				GOTO(out, rc);
+		}
+	}
+out:
+	RETURN(rc);
+}
+
+
+/* ptlrpc/nrs_fifo.c */
+extern struct ptlrpc_nrs_pol_conf nrs_conf_fifo;
+
+/**
+ * Adds all policies that ship with the ptlrpc module, to NRS core's list of
+ * policies \e nrs_core.nrs_policies.
+ *
+ * \retval 0 all policies have been registered successfully
+ * \retval -ve error
+ */
+int ptlrpc_nrs_init(void)
+{
+	int	rc;
+	ENTRY;
+
+	mutex_init(&nrs_core.nrs_mutex);
+	INIT_LIST_HEAD(&nrs_core.nrs_policies);
+
+	rc = ptlrpc_nrs_policy_register(&nrs_conf_fifo);
+	if (rc != 0)
+		GOTO(fail, rc);
+
+
+	RETURN(rc);
+fail:
+	/**
+	 * Since no PTLRPC services have been started at this point, all we need
+	 * to do for cleanup is to free the descriptors.
+	 */
+	ptlrpc_nrs_fini();
+
+	RETURN(rc);
+}
+
+/**
+ * Removes all policy descriptors from nrs_core::nrs_policies, and frees the
+ * policy descriptors.
+ *
+ * Since all PTLRPC services are stopped at this point, there are no more
+ * instances of any policies, because each service will have stopped its policy
+ * instances in ptlrpc_service_nrs_cleanup(), so we just need to free the
+ * descriptors here.
+ */
+void ptlrpc_nrs_fini(void)
+{
+	struct ptlrpc_nrs_pol_desc *desc;
+	struct ptlrpc_nrs_pol_desc *tmp;
+
+	list_for_each_entry_safe(desc, tmp, &nrs_core.nrs_policies,
+				     pd_list) {
+		list_del_init(&desc->pd_list);
+		OBD_FREE_PTR(desc);
+	}
+}
+
+/** @} nrs */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/nrs_crr.c b/drivers/staging/lustre/lustre/ptlrpc/nrs_crr.c
new file mode 100644
index 000000000000..ddfb5102d822
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/nrs_crr.c
@@ -0,0 +1,40 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ *
+ * Copyright 2012 Xyratex Technology Limited
+ */
+/*
+ * lustre/ptlrpc/nrs_crr.c
+ *
+ * Network Request Scheduler (NRS) CRR-N policy
+ *
+ * Request ordering in a batched Round-Robin manner over client NIDs
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ * Author: Nikitas Angelinas <nikitas_angelinas@xyratex.com>
+ */
+/**
+ * \addtogroup nrs
+ * @{
+ */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c b/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c
new file mode 100644
index 000000000000..7d3ee9706c9b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c
@@ -0,0 +1,270 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ *
+ * Copyright 2012 Xyratex Technology Limited
+ */
+/*
+ * lustre/ptlrpc/nrs_fifo.c
+ *
+ * Network Request Scheduler (NRS) FIFO policy
+ *
+ * Handles RPCs in a FIFO manner, as received from the network. This policy is
+ * a logical wrapper around previous, non-NRS functionality. It is used as the
+ * default and fallback policy for all types of RPCs on all PTLRPC service
+ * partitions, for both regular and high-priority NRS heads. Default here means
+ * the policy is the one enabled at PTLRPC service partition startup time, and
+ * fallback means the policy is used to handle RPCs that are not handled
+ * successfully or are not handled at all by any primary policy that may be
+ * enabled on a given NRS head.
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ * Author: Nikitas Angelinas <nikitas_angelinas@xyratex.com>
+ */
+/**
+ * \addtogroup nrs
+ * @{
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <obd_support.h>
+#include <obd_class.h>
+#include <linux/libcfs/libcfs.h>
+#include "ptlrpc_internal.h"
+
+/**
+ * \name fifo
+ *
+ * The FIFO policy is a logical wrapper around previous, non-NRS functionality.
+ * It schedules RPCs in the same order as they are queued from LNet.
+ *
+ * @{
+ */
+
+#define NRS_POL_NAME_FIFO	"fifo"
+
+/**
+ * Runs before the policy moves into
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED. Allocates the FIFO head that
+ * serves as the policy's private data, and initializes its request list.
+ *
+ * \param[in] policy The policy being started
+ *
+ * \retval -ENOMEM the FIFO head could not be allocated
+ * \retval  0	   success
+ *
+ * \see nrs_policy_register()
+ * \see nrs_policy_ctl()
+ */
+static int nrs_fifo_start(struct ptlrpc_nrs_policy *policy)
+{
+	struct nrs_fifo_head *head;
+
+	OBD_CPT_ALLOC_PTR(head, nrs_pol2cptab(policy), nrs_pol2cptid(policy));
+	if (head == NULL)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&head->fh_list);
+	policy->pol_private = head;
+	return 0;
+}
+
+/**
+ * Runs before the policy moves into
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED. Frees the FIFO head kept in
+ * policy->pol_private, asserting first that no requests are queued on it.
+ *
+ * \param[in] policy The policy being stopped
+ *
+ * \see nrs_policy_stop0()
+ */
+static void nrs_fifo_stop(struct ptlrpc_nrs_policy *policy)
+{
+	struct nrs_fifo_head *head = policy->pol_private;
+
+	LASSERT(head != NULL);
+	LASSERT(list_empty(&head->fh_list));
+
+	OBD_FREE_PTR(head);
+}
+
+/**
+ * Hands out the FIFO policy's single resource for a request.
+ *
+ * \param[in]  policy	  The policy whose resource is being requested
+ * \param[in]  nrq	  The request the resource is taken for; unused here
+ * \param[in]  parent	  Parent resource; unused in this policy
+ * \param[out] resp	  Receives a reference to the policy's resource
+ * \param[in]  moving_req Signifies limited caller context; unused in this
+ *			  policy
+ *
+ * \retval 1 Always. The resource hierarchy of the FIFO policy is one level
+ *	     deep: request priority follows from arrival order alone, so the
+ *	     policy keeps no further set of resources from which a request's
+ *	     priority would have to be calculated, and the lookup ends with
+ *	     the resource returned here.
+ *
+ * \see nrs_resource_get_safe()
+ */
+static int nrs_fifo_res_get(struct ptlrpc_nrs_policy *policy,
+			    struct ptlrpc_nrs_request *nrq,
+			    const struct ptlrpc_nrs_resource *parent,
+			    struct ptlrpc_nrs_resource **resp, bool moving_req)
+{
+	struct nrs_fifo_head *fifo = policy->pol_private;
+
+	/* The only resource is the one embedded in the FIFO head; hand it
+	 * out, and return 1 to end this resource hierarchy request. */
+	*resp = &fifo->fh_res;
+	return 1;
+}
+
+/**
+ * Returns the request at the head of the FIFO queue, either to be handled or
+ * merely to be looked at; a request that is to be handled is also unlinked.
+ *
+ * \param[in] policy The policy
+ * \param[in] peek   When set, the caller only wants to examine the request,
+ *		     not handle it, so the request stays queued on the
+ *		     policy.
+ * \param[in] force  Force the policy to return a request; unused in this
+ *		     policy
+ *
+ * \retval The next request in the FIFO queue, i.e. the one queued earliest;
+ *	   NULL when the queue is empty
+ *
+ * \see ptlrpc_nrs_req_get_nolock()
+ * \see nrs_request_get()
+ */
+static
+struct ptlrpc_nrs_request * nrs_fifo_req_get(struct ptlrpc_nrs_policy *policy,
+					     bool peek, bool force)
+{
+	struct nrs_fifo_head	  *fifo = policy->pol_private;
+	struct ptlrpc_nrs_request *first;
+	struct ptlrpc_request	  *req;
+
+	/* An empty queue has nothing to return, peeking or not */
+	if (unlikely(list_empty(&fifo->fh_list)))
+		return NULL;
+
+	first = list_entry(fifo->fh_list.next, struct ptlrpc_nrs_request,
+			   nr_u.fifo.fr_list);
+	if (peek)
+		return first;
+
+	/* The request is going to be handled: take it off the queue */
+	req = container_of(first, struct ptlrpc_request, rq_nrq);
+	list_del_init(&first->nr_u.fifo.fr_list);
+	CDEBUG(D_RPCTRACE, "NRS start %s request from %s, seq: "LPU64
+	       "\n", policy->pol_desc->pd_name,
+	       libcfs_id2str(req->rq_peer), first->nr_u.fifo.fr_sequence);
+	return first;
+}
+
+/**
+ * Queues request \a nrq at the tail of \a policy's list of requests
+ *
+ * \param[in] policy The policy
+ * \param[in] nrq    The request to add
+ *
+ * \retval 0 always; nrs_request_enqueue() assumes this function cannot
+ *	     fail
+ */
+static int nrs_fifo_req_add(struct ptlrpc_nrs_policy *policy,
+			    struct ptlrpc_nrs_request *nrq)
+{
+	struct nrs_fifo_head *fifo = container_of(nrs_request_resource(nrq),
+						  struct nrs_fifo_head,
+						  fh_res);
+
+	/* Stamp the request with a sequence number; only used for debugging */
+	nrq->nr_u.fifo.fr_sequence = fifo->fh_sequence;
+	fifo->fh_sequence++;
+
+	/* The newest request goes to the tail of the queue */
+	list_add_tail(&nrq->nr_u.fifo.fr_list, &fifo->fh_list);
+	return 0;
+}
+
+/**
+ * Unlinks \a nrq from the request list; asserts that it is queued on one.
+ *
+ * \param[in] policy The policy; not referenced by this function
+ * \param[in] nrq    The request to remove
+ */
+static void nrs_fifo_req_del(struct ptlrpc_nrs_policy *policy,
+			     struct ptlrpc_nrs_request *nrq)
+{
+	LASSERT(!list_empty(&nrq->nr_u.fifo.fr_list));
+	list_del_init(&nrq->nr_u.fifo.fr_list);
+}
+
+/**
+ * Logs a debug message for request \a nrq just before handling of it
+ * stops; nothing else is done here.
+ *
+ * \param[in] policy The policy handling the request
+ * \param[in] nrq    The request being handled
+ *
+ * \see ptlrpc_server_finish_request()
+ * \see ptlrpc_nrs_req_stop_nolock()
+ */
+static void nrs_fifo_req_stop(struct ptlrpc_nrs_policy *policy,
+			      struct ptlrpc_nrs_request *nrq)
+{
+	struct ptlrpc_request *rpc;
+
+	rpc = container_of(nrq, struct ptlrpc_request, rq_nrq);
+	CDEBUG(D_RPCTRACE, "NRS stop %s request from %s, seq: "LPU64"\n",
+	       policy->pol_desc->pd_name, libcfs_id2str(rpc->rq_peer),
+	       nrq->nr_u.fifo.fr_sequence);
+}
+
+/**
+ * FIFO policy operations table; referenced by nrs_conf_fifo below
+ */
+static const struct ptlrpc_nrs_pol_ops nrs_fifo_ops = {
+	.op_policy_start	= nrs_fifo_start,
+	.op_policy_stop		= nrs_fifo_stop,
+	.op_res_get		= nrs_fifo_res_get,
+	.op_req_get		= nrs_fifo_req_get,
+	.op_req_enqueue		= nrs_fifo_req_add,
+	.op_req_dequeue		= nrs_fifo_req_del,
+	.op_req_stop		= nrs_fifo_req_stop,
+};
+
+/**
+ * FIFO policy configuration: its name, operations, compat check and flags
+ */
+struct ptlrpc_nrs_pol_conf nrs_conf_fifo = {
+	.nc_name		= NRS_POL_NAME_FIFO,
+	.nc_ops			= &nrs_fifo_ops,
+	.nc_compat		= nrs_policy_compat_all,
+	.nc_flags		= PTLRPC_NRS_FL_FALLBACK |
+				  PTLRPC_NRS_FL_REG_START
+};
+
+/** @} fifo */
+
+/** @} nrs */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/nrs_orr.c b/drivers/staging/lustre/lustre/ptlrpc/nrs_orr.c
new file mode 100644
index 000000000000..a88c51993df6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/nrs_orr.c
@@ -0,0 +1,37 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ *
+ * Copyright 2012 Xyratex Technology Limited
+ */
+/*
+ * lustre/ptlrpc/nrs_orr.c
+ *
+ * Network Request Scheduler (NRS) ORR and TRR policies
+ *
+ * Request scheduling in a Round-Robin manner over backend-fs objects and OSTs
+ * respectively
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ * Author: Nikitas Angelinas <nikitas_angelinas@xyratex.com>
+ */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c b/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c
new file mode 100644
index 000000000000..1437636dfe28
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c
@@ -0,0 +1,2575 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/pack_generic.c
+ *
+ * (Un)packing of OST requests
+ *
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Eric Barton <eeb@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <obd_cksum.h>
+#include <lustre/ll_fiemap.h>
+
+static inline int lustre_msg_hdr_size_v2(int count)
+{
+	size_t hdr = offsetof(struct lustre_msg_v2, lm_buflens[count]);
+	return cfs_size_round(hdr);	/* header incl. count buffer lengths */
+}
+
+int lustre_msg_hdr_size(__u32 magic, int count)
+{
+	/* Header size of a message with \a count buffers; V2 is the only
+	 * wire format whose header can be sized */
+	if (magic == LUSTRE_MSG_MAGIC_V2)
+		return lustre_msg_hdr_size_v2(count);
+
+	LASSERTF(0, "incorrect message magic: %08x\n", magic);
+	return -EINVAL;
+}
+EXPORT_SYMBOL(lustre_msg_hdr_size);
+
+void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout,
+			    int index)
+{
+	if (!inout)		/* zero inout: the reply message */
+		lustre_set_rep_swabbed(req, index);
+	else			/* non-zero inout: the request message */
+		lustre_set_req_swabbed(req, index);
+}
+EXPORT_SYMBOL(ptlrpc_buf_set_swabbed);
+
+int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout,
+			 int index)
+{
+	/* non-zero inout looks at the request message, zero at the reply; a
+	 * buffer needs swabbing if its message does and it isn't done yet */
+	if (inout)
+		return ptlrpc_req_need_swab(req) &&
+		       !lustre_req_swabbed(req, index);
+	return ptlrpc_rep_need_swab(req) && !lustre_rep_swabbed(req, index);
+}
+EXPORT_SYMBOL(ptlrpc_buf_need_swab);
+
+static inline int lustre_msg_check_version_v2(struct lustre_msg_v2 *msg,
+					      __u32 version)
+{
+	/* non-zero when the masked message version differs from version */
+	return (lustre_msg_get_version(msg) & LUSTRE_VERSION_MASK) != version;
+}
+
+/* 0: \a msg carries \a version; non-zero: version mismatch or unusable magic */
+int lustre_msg_check_version(struct lustre_msg *msg, __u32 version)
+{
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V2)
+		return lustre_msg_check_version_v2(msg, version);
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V1) {
+		CERROR("msg v1 not supported - please upgrade you system\n");
+	} else {
+		/* fail closed: returning 0 would report a version match */
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+	}
+	return -EINVAL;
+}
+EXPORT_SYMBOL(lustre_msg_check_version);
+
+/* Size of an early reply message; computed on first use, then cached */
+int lustre_msg_early_size(void)
+{
+	static int size;
+	if (!size) {
+		/* Always reply with the old ptlrpc_body_v2 to keep
+		 * interoperability with old clients (< 2.3), which don't have
+		 * pb_jobid in the ptlrpc_body.
+		 *
+		 * XXX Remove this whenever we drop interoperability with such
+		 *     clients.
+		 */
+		__u32 pblen = sizeof(struct ptlrpc_body_v2);
+		size = lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, &pblen);
+	}
+	return size;
+}
+EXPORT_SYMBOL(lustre_msg_early_size);
+
+int lustre_msg_size_v2(int count, __u32 *lengths)
+{
+	int total = lustre_msg_hdr_size_v2(count);
+	int idx;
+
+	/* the header, then every buffer padded by cfs_size_round() */
+	for (idx = 0; idx < count; idx++)
+		total += cfs_size_round(lengths[idx]);
+
+	return total;
+}
+EXPORT_SYMBOL(lustre_msg_size_v2);
+
+/* Computes how large a buffer must be to hold a lustre_msg whose sub-buffers
+ * have the given lengths; a NULL \a lens stands for a lone ptlrpc_body.
+ * NOTE: this should only be used for NEW requests, and should always be
+ *       in the form of a v2 request.  If this is a connection to a v1
+ *       target then the first buffer will be stripped because the ptlrpc
+ *       data is part of the lustre_msg_v1 header. b=14043 */
+int lustre_msg_size(__u32 magic, int count, __u32 *lens)
+{
+	__u32 body_only[] = { sizeof(struct ptlrpc_body) };
+
+	/* No lengths given: size a message holding just a ptlrpc_body */
+	if (lens == NULL) {
+		LASSERT(count == 1);
+		lens = body_only;
+	}
+
+	LASSERT(count > 0);
+	LASSERT(lens[MSG_PTLRPC_BODY_OFF] >= sizeof(struct ptlrpc_body_v2));
+
+	/* V2 is the only format that can be sized */
+	if (magic == LUSTRE_MSG_MAGIC_V2)
+		return lustre_msg_size_v2(count, lens);
+
+	LASSERTF(0, "incorrect message magic: %08x\n", magic);
+	return -EINVAL;
+}
+EXPORT_SYMBOL(lustre_msg_size);
+
+/* Works out the size of a message that has already been packed, from the
+ * buffer count and buffer lengths recorded in its header. A message with an
+ * unrecognised magic is logged and reported as having size 0. */
+int lustre_packed_msg_size(struct lustre_msg *msg)
+{
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return 0;
+	}
+
+	return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+}
+EXPORT_SYMBOL(lustre_packed_msg_size);
+
+void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens,
+			char **bufs)
+{
+	char *ptr;
+	int i;
+
+	msg->lm_bufcount = count;
+	/* XXX: lm_secflvr uninitialized here */
+	msg->lm_magic = LUSTRE_MSG_MAGIC_V2;
+	/* record the length of every buffer in the header */
+	for (i = 0; i < count; i++)
+		msg->lm_buflens[i] = lens[i];
+	/* with no source buffers given, only the header is set up */
+	if (bufs == NULL)
+		return;
+	/* buffer data starts right after the header; LOGL() is handed ptr */
+	ptr = (char *)msg + lustre_msg_hdr_size_v2(count);
+	for (i = 0; i < count; i++) {
+		char *tmp = bufs[i];
+		LOGL(tmp, lens[i], ptr);
+	}
+}
+EXPORT_SYMBOL(lustre_init_msg_v2);
+
+static int lustre_pack_request_v2(struct ptlrpc_request *req,
+				  int count, __u32 *lens, char **bufs)
+{
+	int msg_size = lustre_msg_size_v2(count, lens);
+	int rc;
+
+	/* get a request buffer sized for the whole packed message */
+	rc = sptlrpc_cli_alloc_reqbuf(req, msg_size);
+	if (rc != 0)
+		return rc;
+
+	req->rq_reqlen = msg_size;
+	/* set up header and buffers, then add PTLRPC_MSG_VERSION */
+	lustre_init_msg_v2(req->rq_reqmsg, count, lens, bufs);
+	lustre_msg_add_version(req->rq_reqmsg, PTLRPC_MSG_VERSION);
+	return 0;
+}
+
+int lustre_pack_request(struct ptlrpc_request *req, __u32 magic, int count,
+			__u32 *lens, char **bufs)
+{
+	__u32 body_only[] = { sizeof(struct ptlrpc_body) };
+
+	/* No lengths given: pack a request holding just a ptlrpc_body */
+	if (lens == NULL) {
+		LASSERT(count == 1);
+		lens = body_only;
+	}
+
+	LASSERT(count > 0);
+	LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body));
+
+	/* The caller's magic is overridden: only the new format is used,
+	 * we don't need to be compatible with 1.4 */
+	magic = LUSTRE_MSG_MAGIC_V2;
+
+	if (magic == LUSTRE_MSG_MAGIC_V2)
+		return lustre_pack_request_v2(req, count, lens, bufs);
+
+	LASSERTF(0, "incorrect message magic: %08x\n", magic);
+	return -EINVAL;
+}
+EXPORT_SYMBOL(lustre_pack_request);
+
+#if RS_DEBUG
+LIST_HEAD(ptlrpc_rs_debug_lru);
+spinlock_t ptlrpc_rs_debug_lock;	/* held while the list above changes */
+
+#define PTLRPC_RS_DEBUG_LRU_ADD(rs)					\
+do {									\
+	spin_lock(&ptlrpc_rs_debug_lock);				\
+	list_add_tail(&(rs)->rs_debug_list, &ptlrpc_rs_debug_lru);	\
+	spin_unlock(&ptlrpc_rs_debug_lock);				\
+} while (0)
+
+#define PTLRPC_RS_DEBUG_LRU_DEL(rs)					\
+do {									\
+	spin_lock(&ptlrpc_rs_debug_lock);				\
+	list_del(&(rs)->rs_debug_list);				\
+	spin_unlock(&ptlrpc_rs_debug_lock);				\
+} while (0)
+#else /* !RS_DEBUG: both macros expand to nothing */
+# define PTLRPC_RS_DEBUG_LRU_ADD(rs) do {} while(0)
+# define PTLRPC_RS_DEBUG_LRU_DEL(rs) do {} while(0)
+#endif /* RS_DEBUG */
+
+struct ptlrpc_reply_state *
+lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt)
+{
+	struct ptlrpc_reply_state *rs = NULL;
+
+	spin_lock(&svcpt->scp_rep_lock);
+	/* Take a reply state off svcpt's idle list, waiting for one to show
+	 * up while the list is empty; a failed wait returns NULL instead */
+	while (list_empty(&svcpt->scp_rep_idle)) {
+		struct l_wait_info	lwi;
+		int			rc;
+
+		spin_unlock(&svcpt->scp_rep_lock);
+		/* If we cannot get anything for some long time, we better
+		 * bail out instead of waiting infinitely */
+		lwi = LWI_TIMEOUT(cfs_time_seconds(10), NULL, NULL);
+		rc = l_wait_event(svcpt->scp_rep_waitq,
+				  !list_empty(&svcpt->scp_rep_idle), &lwi);
+		if (rc != 0)
+			goto out;
+		spin_lock(&svcpt->scp_rep_lock);
+	}
+	/* scp_rep_lock is held here and the idle list is not empty */
+	rs = list_entry(svcpt->scp_rep_idle.next,
+			    struct ptlrpc_reply_state, rs_list);
+	list_del(&rs->rs_list);
+
+	spin_unlock(&svcpt->scp_rep_lock);
+	/* zero srv_max_reply_size bytes of it, then mark it preallocated */
+	memset(rs, 0, svcpt->scp_service->srv_max_reply_size);
+	rs->rs_svcpt = svcpt;
+	rs->rs_prealloc = 1;
+out:
+	return rs;
+}
+
+void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs)
+{
+	struct ptlrpc_service_part *part = rs->rs_svcpt;
+	/* rejoin the idle list under scp_rep_lock, then wake any waiter */
+	spin_lock(&part->scp_rep_lock);
+	list_add(&rs->rs_list, &part->scp_rep_idle);
+	spin_unlock(&part->scp_rep_lock);
+	wake_up(&part->scp_rep_waitq);
+}
+
+int lustre_pack_reply_v2(struct ptlrpc_request *req, int count,
+			 __u32 *lens, char **bufs, int flags)
+{
+	struct ptlrpc_reply_state *rs;
+	int			msg_len, rc;
+	ENTRY;
+
+	LASSERT(req->rq_reply_state == NULL);
+	/* anything but an early reply marks the request as packed final */
+	if ((flags & LPRFL_EARLY_REPLY) == 0) {
+		spin_lock(&req->rq_lock);
+		req->rq_packed_final = 1;
+		spin_unlock(&req->rq_lock);
+	}
+	/* sptlrpc_svc_alloc_rs() is relied on to set req->rq_reply_state */
+	msg_len = lustre_msg_size_v2(count, lens);
+	rc = sptlrpc_svc_alloc_rs(req, msg_len);
+	if (rc)
+		RETURN(rc);
+
+	rs = req->rq_reply_state;
+	atomic_set(&rs->rs_refcount, 1);    /* 1 ref for rq_reply_state */
+	rs->rs_cb_id.cbid_fn = reply_out_callback;
+	rs->rs_cb_id.cbid_arg = rs;
+	rs->rs_svcpt = req->rq_rqbd->rqbd_svcpt;
+	INIT_LIST_HEAD(&rs->rs_exp_list);
+	INIT_LIST_HEAD(&rs->rs_obd_list);
+	INIT_LIST_HEAD(&rs->rs_list);
+	spin_lock_init(&rs->rs_lock);
+	/* point the request at its reply state and reply message */
+	req->rq_replen = msg_len;
+	req->rq_reply_state = rs;
+	req->rq_repmsg = rs->rs_msg;
+
+	lustre_init_msg_v2(rs->rs_msg, count, lens, bufs);
+	lustre_msg_add_version(rs->rs_msg, PTLRPC_MSG_VERSION);
+
+	PTLRPC_RS_DEBUG_LRU_ADD(rs);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(lustre_pack_reply_v2);
+
+int lustre_pack_reply_flags(struct ptlrpc_request *req, int count, __u32 *lens,
+			    char **bufs, int flags)
+{
+	__u32 body_only[] = { sizeof(struct ptlrpc_body) };
+	__u32 magic;
+	int rc;
+
+	/* No lengths given: pack a reply holding just a ptlrpc_body */
+	if (lens == NULL) {
+		LASSERT(count == 1);
+		lens = body_only;
+	}
+
+	LASSERT(count > 0);
+	LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body));
+
+	magic = req->rq_reqmsg->lm_magic;
+	if (magic == LUSTRE_MSG_MAGIC_V2) {
+		rc = lustre_pack_reply_v2(req, count, lens, bufs, flags);
+	} else {
+		LASSERTF(0, "incorrect message magic: %08x\n", magic);
+		rc = -EINVAL;
+	}
+	if (rc != 0)
+		CERROR("lustre_pack_reply failed: rc=%d size=%d\n", rc,
+		       lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens));
+	return rc;
+}
+EXPORT_SYMBOL(lustre_pack_reply_flags);
+
+int lustre_pack_reply(struct ptlrpc_request *req, int count, __u32 *lens,
+		      char **bufs)
+{	/* shorthand for lustre_pack_reply_flags() with no flags set */
+	return lustre_pack_reply_flags(req, count, lens, bufs, 0);
+}
+EXPORT_SYMBOL(lustre_pack_reply);
+
+void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size)
+{
+	int idx, pos, len, nbufs;
+
+	LASSERT(m != NULL);
+	LASSERT(n >= 0);
+	/* buffer n has to be present in the message ... */
+	nbufs = m->lm_bufcount;
+	if (unlikely(n >= nbufs)) {
+		CDEBUG(D_INFO, "msg %p buffer[%d] not present (count %d)\n",
+		       m, n, nbufs);
+		return NULL;
+	}
+	/* ... and has to hold at least min_size bytes */
+	len = m->lm_buflens[n];
+	if (unlikely(len < min_size)) {
+		CERROR("msg %p buffer[%d] size %d too small "
+		       "(required %d, opc=%d)\n", m, n, len, min_size,
+		       n == MSG_PTLRPC_BODY_OFF ? -1 : lustre_msg_get_opc(m));
+		return NULL;
+	}
+	/* it starts after the header and the padded buffers before it */
+	pos = lustre_msg_hdr_size_v2(nbufs);
+	for (idx = 0; idx < n; idx++)
+		pos += cfs_size_round(m->lm_buflens[idx]);
+
+	return (char *)m + pos;
+}
+
+void *lustre_msg_buf(struct lustre_msg *m, int n, int min_size)
+{
+	/* buffers can only be looked up in V2 messages */
+	if (m->lm_magic == LUSTRE_MSG_MAGIC_V2)
+		return lustre_msg_buf_v2(m, n, min_size);
+
+	LASSERTF(0, "incorrect message magic: %08x(msg:%p)\n",
+		 m->lm_magic, m);
+	return NULL;
+}
+EXPORT_SYMBOL(lustre_msg_buf);
+
+int lustre_shrink_msg_v2(struct lustre_msg_v2 *msg, int segment,
+			 unsigned int newlen, int move_data)
+{
+	char   *tail = NULL, *newpos;
+	int     tail_len = 0, n;
+
+	LASSERT(msg);
+	LASSERT(msg->lm_bufcount > segment);
+	LASSERT(msg->lm_buflens[segment] >= newlen);
+
+	if (msg->lm_buflens[segment] == newlen)
+		goto out;
+	/* note where the data after @segment is, and how long it is */
+	if (move_data && msg->lm_bufcount > segment + 1) {
+		tail = lustre_msg_buf_v2(msg, segment + 1, 0);
+		for (n = segment + 1; n < msg->lm_bufcount; n++)
+			tail_len += cfs_size_round(msg->lm_buflens[n]);
+	}
+
+	msg->lm_buflens[segment] = newlen;
+	/* the tail now starts at a lower offset: move the data down */
+	if (tail && tail_len) {
+		newpos = lustre_msg_buf_v2(msg, segment + 1, 0);
+		LASSERT(newpos <= tail);
+		if (newpos != tail)
+			memmove(newpos, tail, tail_len);
+	}
+out:
+	return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+}
+
+/*
+ * for @msg, shrink @segment to size @newlen. if @move_data is non-zero,
+ * we also move data forward from @segment + 1.
+ * if @newlen == 0, we remove the segment completely, but we still keep the
+ * total bufcount the same to save possible data moving. this will leave an
+ * unused segment with size 0 at the tail, but that's ok.
+ *
+ * return new msg size after shrinking; an unknown magic asserts and returns
+ * -EINVAL rather than falling off the end of a non-void function.
+ * CAUTION:
+ * + if any buffers higher than @segment have been filled in, must call shrink
+ *   with non-zero @move_data.
+ * + caller should NOT keep pointers to msg buffers higher than @segment
+ *   after calling shrink.
+ */
+int lustre_shrink_msg(struct lustre_msg *msg, int segment,
+		      unsigned int newlen, int move_data)
+{
+	switch (msg->lm_magic) {
+	case LUSTRE_MSG_MAGIC_V2:
+		return lustre_shrink_msg_v2(msg, segment, newlen, move_data);
+	default:
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL(lustre_shrink_msg);
+
+void lustre_free_reply_state(struct ptlrpc_reply_state *rs)
+{
+	PTLRPC_RS_DEBUG_LRU_DEL(rs);
+	/* the reply state must be unreferenced and idle before it is freed */
+	LASSERT(atomic_read(&rs->rs_refcount) == 0);
+	LASSERT(!rs->rs_difficult || rs->rs_handled);
+	LASSERT(!rs->rs_on_net);
+	LASSERT(!rs->rs_scheduled);
+	LASSERT(rs->rs_export == NULL);
+	LASSERT(rs->rs_nlocks == 0);
+	LASSERT(list_empty(&rs->rs_exp_list));
+	LASSERT(list_empty(&rs->rs_obd_list));
+
+	sptlrpc_svc_free_rs(rs);
+}
+EXPORT_SYMBOL(lustre_free_reply_state);
+
+/* Validates, and swabs if needed, the V2 header of a \a len byte message.
+ * Returns 1 if the header was swabbed, 0 if not, -EINVAL if malformed. */
+static int lustre_unpack_msg_v2(struct lustre_msg_v2 *m, int len)
+{
+	int swabbed, required_len, i;
+
+	required_len = lustre_msg_hdr_size_v2(0);
+	if (len < required_len) {
+		CERROR("message length %d too small for lustre_msg\n", len);
+		return -EINVAL;
+	}
+
+	swabbed = (m->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED);
+	if (swabbed) {
+		__swab32s(&m->lm_magic);
+		__swab32s(&m->lm_bufcount);
+		__swab32s(&m->lm_secflvr);
+		__swab32s(&m->lm_repsize);
+		__swab32s(&m->lm_cksum);
+		__swab32s(&m->lm_flags);
+		CLASSERT(offsetof(typeof(*m), lm_padding_2) != 0);
+		CLASSERT(offsetof(typeof(*m), lm_padding_3) != 0);
+	}
+
+	/* an lm_bufcount that cannot fit in len would wrap once made an int */
+	if (m->lm_bufcount > (len - required_len) / sizeof(m->lm_buflens[0]) ||
+	    len < lustre_msg_hdr_size_v2(m->lm_bufcount)) {
+		CERROR("message length %d too small for %d buflens\n",
+		       len, m->lm_bufcount);
+		return -EINVAL;
+	}
+	required_len = lustre_msg_hdr_size_v2(m->lm_bufcount);
+	for (i = 0; i < m->lm_bufcount; i++) {
+		if (swabbed)
+			__swab32s(&m->lm_buflens[i]);
+		/* clamp each term, and stop once past len: the sum can't wrap */
+		if (required_len <= len)
+			required_len += cfs_size_round(min_t(__u32,
+						m->lm_buflens[i], len));
+	}
+	if (len < required_len) {
+		CERROR("len: %d, required_len %d\n", len, required_len);
+		CERROR("bufcount: %d\n", m->lm_bufcount);
+		for (i = 0; i < m->lm_bufcount; i++)
+			CERROR("buffer %d length %d\n", i, m->lm_buflens[i]);
+		return -EINVAL;
+	}
+	return swabbed;
+}
+
+int __lustre_unpack_msg(struct lustre_msg *m, int len)
+{
+	int magic_end, rc;
+	ENTRY;
+
+	/* Make sure the magic itself was received before anything reads it.
+	 * Checking this first allows a slightly better error log: in the
+	 * future struct lustre_msg may grow, and a version mismatch is then
+	 * a more useful thing to log than a short message.
+	 *
+	 */
+	magic_end = offsetof(struct lustre_msg, lm_magic) +
+		    sizeof(m->lm_magic);
+	if (magic_end > len) {
+		/* can't even look inside the message */
+		CERROR("message length %d too small for magic/version check\n",
+		       len);
+		RETURN(-EINVAL);
+	}
+
+	/* V2 is the only format that gets unpacked */
+	rc = lustre_unpack_msg_v2(m, len);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(__lustre_unpack_msg);
+
+int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len)
+{
+	int swabbed = __lustre_unpack_msg(req->rq_reqmsg, len);
+
+	/* 1 means the header was byte-swapped: note it and report success */
+	if (swabbed != 1)
+		return swabbed;
+	lustre_set_req_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+	return 0;
+}
+EXPORT_SYMBOL(ptlrpc_unpack_req_msg);
+
+int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len)
+{
+	int swabbed = __lustre_unpack_msg(req->rq_repmsg, len);
+
+	/* 1 means the header was byte-swapped: note it and report success */
+	if (swabbed != 1)
+		return swabbed;
+	lustre_set_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+	return 0;
+}
+EXPORT_SYMBOL(ptlrpc_unpack_rep_msg);
+
+static inline int lustre_unpack_ptlrpc_body_v2(struct ptlrpc_request *req,
+					       const int inout, int offset)
+{
+	struct lustre_msg_v2 *msg = inout ? req->rq_reqmsg : req->rq_repmsg;
+	struct ptlrpc_body *body;
+
+	body = lustre_msg_buf_v2(msg, offset, sizeof(struct ptlrpc_body_v2));
+	if (body == NULL) {
+		CERROR("error unpacking ptlrpc body\n");
+		return -EFAULT;
+	}
+	/* swab the body only if still needed, and record that it was done */
+	if (ptlrpc_buf_need_swab(req, inout, offset)) {
+		lustre_swab_ptlrpc_body(body);
+		ptlrpc_buf_set_swabbed(req, inout, offset);
+	}
+
+	if ((body->pb_version & ~LUSTRE_VERSION_MASK) == PTLRPC_MSG_VERSION)
+		return 0;
+
+	CERROR("wrong lustre_msg version %08x\n", body->pb_version);
+	return -EINVAL;
+}
+
+int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset)
+{
+	__u32 magic = req->rq_reqmsg->lm_magic;
+
+	/* inout == 1 makes the helper work on the request message */
+	if (magic == LUSTRE_MSG_MAGIC_V2)
+		return lustre_unpack_ptlrpc_body_v2(req, 1, offset);
+
+	CERROR("bad lustre msg magic: %08x\n", magic);
+	return -EINVAL;
+}
+
+int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset)
+{
+	__u32 magic = req->rq_repmsg->lm_magic;
+
+	/* inout == 0 makes the helper work on the reply message */
+	if (magic == LUSTRE_MSG_MAGIC_V2)
+		return lustre_unpack_ptlrpc_body_v2(req, 0, offset);
+
+	CERROR("bad lustre msg magic: %08x\n", magic);
+	return -EINVAL;
+}
+
+static inline int lustre_msg_buflen_v2(struct lustre_msg_v2 *m, int n)
+{
+	/* a buffer at or beyond lm_bufcount is reported as zero length */
+	if (n < m->lm_bufcount)
+		return m->lm_buflens[n];
+	return 0;
+}
+
+/**
+ * lustre_msg_buflen - look up how long buffer \a n of message \a m is
+ * \param m lustre_msg (request or reply) to look at
+ * \param n message index (base 0)
+ *
+ * returns zero for non-existent message indices, -EINVAL for a bad magic
+ */
+int lustre_msg_buflen(struct lustre_msg *m, int n)
+{
+	/* only V2 messages have buffer lengths that can be read */
+	if (m->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", m->lm_magic);
+		return -EINVAL;
+	}
+
+	return lustre_msg_buflen_v2(m, n);
+}
+EXPORT_SYMBOL(lustre_msg_buflen);
+
+static inline void
+lustre_msg_set_buflen_v2(struct lustre_msg_v2 *m, int n, int len)
+{
+	if (n >= m->lm_bufcount)
+		LBUG();	/* buffer n is not part of this message */
+
+	m->lm_buflens[n] = len;
+}
+
+void lustre_msg_set_buflen(struct lustre_msg *m, int n, int len)
+{
+	/* buffer lengths can only be changed in V2 messages */
+	if (m->lm_magic == LUSTRE_MSG_MAGIC_V2) {
+		lustre_msg_set_buflen_v2(m, n, len);
+		return;
+	}
+
+	LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic);
+}
+
+EXPORT_SYMBOL(lustre_msg_set_buflen);
+
+/* NB return the bufcount for lustre_msg_v2 format, so if message is packed
+ * in V1 format, the result is one bigger. (add struct ptlrpc_body). */
+int lustre_msg_bufcount(struct lustre_msg *m)
+{
+	/* any magic other than V2 is logged and reported as -EINVAL */
+	if (m->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", m->lm_magic);
+		return -EINVAL;
+	}
+
+	return m->lm_bufcount;
+}
+EXPORT_SYMBOL(lustre_msg_bufcount);
+
+/* Returns the NUL-terminated string in buffer \a index of \a m, or NULL if it
+ * is missing or malformed; max_len == 0 means it must fill the buffer. */
+char *lustre_msg_string(struct lustre_msg *m, int index, int max_len)
+{
+	char *str;
+	int slen, blen;
+
+	switch (m->lm_magic) {
+	case LUSTRE_MSG_MAGIC_V2:
+		str = lustre_msg_buf_v2(m, index, 0);
+		blen = lustre_msg_buflen_v2(m, index);
+		break;
+	default:
+		LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic);
+		return NULL;	/* str and blen were never set: don't use them */
+	}
+	if (str == NULL) {
+		CERROR("can't unpack string in msg %p buffer[%d]\n", m, index);
+		return NULL;
+	}
+
+	slen = strnlen(str, blen);
+	if (slen == blen) {		     /* not NULL terminated */
+		CERROR("can't unpack non-NULL terminated string in "
+			"msg %p buffer[%d] len %d\n", m, index, blen);
+		return NULL;
+	}
+
+	if (max_len == 0) {
+		if (slen != blen - 1) {
+			CERROR("can't unpack short string in msg %p "
+			       "buffer[%d] len %d: strlen %d\n",
+			       m, index, blen, slen);
+			return NULL;
+		}
+	} else if (slen > max_len) {
+		CERROR("can't unpack oversized string in msg %p "
+		       "buffer[%d] len %d strlen %d: max %d expected\n",
+		       m, index, blen, slen, max_len);
+		return NULL;
+	}
+
+	return str;
+}
+EXPORT_SYMBOL(lustre_msg_string);
+
+/* Look up buffer \a index of \a msg via lustre_msg_buf_v2() and, if both the
+ * buffer and a \a swabber exist, run the swabber over it before returning. */
+static inline void *__lustre_swab_buf(struct lustre_msg *msg, int index,
+				      int min_size, void *swabber)
+{
+	void (*swab_fn)(void *) = (void (*)(void *))swabber;
+	void *buf;
+
+	LASSERT(msg != NULL);
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return NULL;
+	}
+
+	buf = lustre_msg_buf_v2(msg, index, min_size);
+	if (buf != NULL && swab_fn != NULL)
+		swab_fn(buf);
+
+	return buf;
+}
+
+static inline struct ptlrpc_body *lustre_msg_ptlrpc_body(struct lustre_msg *msg)
+{
+	const int min_size = sizeof(struct ptlrpc_body_v2);
+	return lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, min_size);
+}
+
+__u32 lustre_msghdr_get_flags(struct lustre_msg *msg)
+{
+	/* lm_flags of a V2 header is already in host endian */
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V2)
+		return msg->lm_flags;
+
+	/* V1 and V1_SWABBED magics report no flags; any other magic
+	 * trips the assertion before 0 is returned */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V1 &&
+	    msg->lm_magic != LUSTRE_MSG_MAGIC_V1_SWABBED)
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+
+	return 0;
+}
+EXPORT_SYMBOL(lustre_msghdr_get_flags);
+
+void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags)
+{
+	/* A V1 header is left untouched; only V2 stores lm_flags. */
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V1)
+		return;
+
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V2) {
+		msg->lm_flags = flags;
+		return;
+	}
+	LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+}
+
+__u32 lustre_msg_get_flags(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *pb;
+
+	/* flags might be printed in debug code while message
+	 * uninitialized, so a non-V2 magic quietly yields 0 */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2)
+		return 0;
+
+	pb = lustre_msg_ptlrpc_body(msg);
+	if (pb == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return 0;
+	}
+
+	return pb->pb_flags;
+}
+EXPORT_SYMBOL(lustre_msg_get_flags);
+
+void lustre_msg_add_flags(struct lustre_msg *msg, int flags)
+{
+	struct ptlrpc_body *pb;
+
+	/* OR \a flags into pb_flags; a non-V2 magic trips LASSERTF. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_flags |= flags;
+}
+EXPORT_SYMBOL(lustre_msg_add_flags);
+
+void lustre_msg_set_flags(struct lustre_msg *msg, int flags)
+{
+	struct ptlrpc_body *pb;
+
+	/* Overwrite pb_flags with \a flags; non-V2 trips LASSERTF. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_flags = flags;
+}
+EXPORT_SYMBOL(lustre_msg_set_flags);
+
+void lustre_msg_clear_flags(struct lustre_msg *msg, int flags)
+{
+	struct ptlrpc_body *pb;
+
+	/* Clear only the bits of \a flags inside MSG_GEN_FLAG_MASK. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_flags &= ~(MSG_GEN_FLAG_MASK & flags);
+}
+EXPORT_SYMBOL(lustre_msg_clear_flags);
+
+__u32 lustre_msg_get_op_flags(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *pb;
+
+	/* A non-V2 magic yields 0 without any log message. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2)
+		return 0;
+
+	pb = lustre_msg_ptlrpc_body(msg);
+	if (pb == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return 0;
+	}
+	return pb->pb_op_flags;
+}
+EXPORT_SYMBOL(lustre_msg_get_op_flags);
+
+void lustre_msg_add_op_flags(struct lustre_msg *msg, int flags)
+{
+	struct ptlrpc_body *pb;
+
+	/* OR \a flags into pb_op_flags; non-V2 trips LASSERTF. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_op_flags |= flags;
+}
+EXPORT_SYMBOL(lustre_msg_add_op_flags);
+
+void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags)
+{
+	struct ptlrpc_body *pb;
+
+	/* Overwrite pb_op_flags; lustre_msg_add_op_flags() is the OR form. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_op_flags = flags;
+}
+EXPORT_SYMBOL(lustre_msg_set_op_flags);
+
+struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *pb;
+
+	/* Points into the message's ptlrpc_body; NULL on any failure. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return NULL;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	if (pb == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return NULL;
+	}
+	return &pb->pb_handle;
+}
+EXPORT_SYMBOL(lustre_msg_get_handle);
+
+__u32 lustre_msg_get_type(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *pb;
+
+	/* Every failure path logs and reports PTL_RPC_MSG_ERR. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return PTL_RPC_MSG_ERR;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	if (pb == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return PTL_RPC_MSG_ERR;
+	}
+	return pb->pb_type;
+}
+EXPORT_SYMBOL(lustre_msg_get_type);
+
+__u32 lustre_msg_get_version(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *pb;
+
+	/* pb_version of a V2 message; 0 (after a CERROR) otherwise. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return 0;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	if (pb == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return 0;
+	}
+	return pb->pb_version;
+}
+EXPORT_SYMBOL(lustre_msg_get_version);
+
+void lustre_msg_add_version(struct lustre_msg *msg, int version)
+{
+	struct ptlrpc_body *pb;
+
+	/* OR \a version into pb_version; non-V2 trips LASSERTF. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_version |= version;
+}
+EXPORT_SYMBOL(lustre_msg_add_version);
+
+__u32 lustre_msg_get_opc(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *pb;
+
+	/* Unlike the sibling getters, a non-V2 magic also hits LBUG(). */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x(msg:%p)\n", msg->lm_magic, msg);
+		LBUG();
+		return 0;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	if (pb == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return 0;
+	}
+	return pb->pb_opc;
+}
+EXPORT_SYMBOL(lustre_msg_get_opc);
+
+__u64 lustre_msg_get_last_xid(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *pb;
+
+	/* pb_last_xid of a V2 message; 0 (after a CERROR) otherwise. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return 0;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	if (pb == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return 0;
+	}
+	return pb->pb_last_xid;
+}
+EXPORT_SYMBOL(lustre_msg_get_last_xid);
+
+__u64 lustre_msg_get_last_committed(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *pb;
+
+	/* pb_last_committed of a V2 message; 0 (after a CERROR) otherwise. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return 0;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	if (pb == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return 0;
+	}
+	return pb->pb_last_committed;
+}
+EXPORT_SYMBOL(lustre_msg_get_last_committed);
+
+__u64 *lustre_msg_get_versions(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *pb;
+
+	/* A V1 magic yields NULL silently; other non-V2 magics log first. */
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V1)
+		return NULL;
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return NULL;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	if (pb == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return NULL;
+	}
+	return pb->pb_pre_versions;
+}
+EXPORT_SYMBOL(lustre_msg_get_versions);
+
+__u64 lustre_msg_get_transno(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *pb;
+
+	/* pb_transno of a V2 message; 0 (after a CERROR) otherwise. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return 0;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	if (pb == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return 0;
+	}
+	return pb->pb_transno;
+}
+EXPORT_SYMBOL(lustre_msg_get_transno);
+
+int lustre_msg_get_status(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *pb;
+
+	/* status might be printed in debug code while message
+	 * uninitialized, so a non-V2 magic is -EINVAL without a log */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2)
+		return -EINVAL;
+
+	pb = lustre_msg_ptlrpc_body(msg);
+	if (pb == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return -EINVAL;
+	}
+
+	return pb->pb_status;
+}
+EXPORT_SYMBOL(lustre_msg_get_status);
+
+__u64 lustre_msg_get_slv(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *pb;
+
+	/* NB failures return -EINVAL through the unsigned __u64 result. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("invalid msg magic %08x\n", msg->lm_magic);
+		return -EINVAL;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	if (pb == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return -EINVAL;
+	}
+	return pb->pb_slv;
+}
+EXPORT_SYMBOL(lustre_msg_get_slv);
+
+
+void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv)
+{
+	struct ptlrpc_body *pb;
+
+	/* Failures are only logged here; nothing asserts. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("invalid msg magic %x\n", msg->lm_magic);
+		return;
+	}
+
+	pb = lustre_msg_ptlrpc_body(msg);
+	if (pb == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return;
+	}
+	pb->pb_slv = slv;
+}
+EXPORT_SYMBOL(lustre_msg_set_slv);
+
+__u32 lustre_msg_get_limit(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *pb;
+
+	/* NB failures return -EINVAL through the unsigned __u32 result. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("invalid msg magic %x\n", msg->lm_magic);
+		return -EINVAL;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	if (pb == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return -EINVAL;
+	}
+	return pb->pb_limit;
+}
+EXPORT_SYMBOL(lustre_msg_get_limit);
+
+
+void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit)
+{
+	struct ptlrpc_body *pb;
+
+	/* NB \a limit is __u64 but pb_limit is swabbed as a 32-bit field. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("invalid msg magic %08x\n", msg->lm_magic);
+		return;
+	}
+
+	pb = lustre_msg_ptlrpc_body(msg);
+	if (pb == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return;
+	}
+	pb->pb_limit = limit;
+}
+EXPORT_SYMBOL(lustre_msg_set_limit);
+
+__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *pb;
+
+	/* pb_conn_cnt of a V2 message; 0 (after a CERROR) otherwise. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return 0;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	if (pb == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return 0;
+	}
+	return pb->pb_conn_cnt;
+}
+EXPORT_SYMBOL(lustre_msg_get_conn_cnt);
+
+int lustre_msg_is_v1(struct lustre_msg *msg)
+{
+	/* 1 for a V1 or V1_SWABBED magic, 0 for anything else */
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V1)
+		return 1;
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V1_SWABBED)
+		return 1;
+
+	return 0;
+}
+EXPORT_SYMBOL(lustre_msg_is_v1);
+
+__u32 lustre_msg_get_magic(struct lustre_msg *msg)
+{
+	/* The magic is echoed back only when it is V2; else log, return 0 */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return 0;
+	}
+
+	return msg->lm_magic;
+}
+EXPORT_SYMBOL(lustre_msg_get_magic);
+
+__u32 lustre_msg_get_timeout(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *pb;
+
+	/* V1 and V1_SWABBED magics quietly report 0 */
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V1 ||
+	    msg->lm_magic == LUSTRE_MSG_MAGIC_V1_SWABBED)
+		return 0;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return 0;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	if (pb == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return 0;
+	}
+	return pb->pb_timeout;
+}
+
+__u32 lustre_msg_get_service_time(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *pb;
+
+	/* V1 and V1_SWABBED magics quietly report 0 */
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V1 ||
+	    msg->lm_magic == LUSTRE_MSG_MAGIC_V1_SWABBED)
+		return 0;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return 0;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	if (pb == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return 0;
+	}
+	return pb->pb_service_time;
+}
+
+char *lustre_msg_get_jobid(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *pb;
+
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V1 ||
+	    msg->lm_magic == LUSTRE_MSG_MAGIC_V1_SWABBED)
+		return NULL;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return NULL;
+	}
+
+	/* sized by struct ptlrpc_body, unlike lustre_msg_ptlrpc_body() */
+	pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF,
+			       sizeof(struct ptlrpc_body));
+	if (pb == NULL)
+		return NULL;
+	return pb->pb_jobid;
+}
+EXPORT_SYMBOL(lustre_msg_get_jobid);
+
+__u32 lustre_msg_get_cksum(struct lustre_msg *msg)
+{
+	/* lm_cksum lives in the V2 header; other magics log and give 0 */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return 0;
+	}
+
+	return msg->lm_cksum;
+}
+
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+/*
+ * In 1.6 and 1.8 the checksum was computed only on struct ptlrpc_body as
+ * it was in 1.6 (88 bytes, smaller than the full size in 1.8).  It makes
+ * more sense to compute the checksum on the full ptlrpc_body, regardless
+ * of what size it is, but in order to keep interoperability with 1.8 we
+ * can optionally also checksum only the first 88 bytes (caller decides). */
+# define ptlrpc_body_cksum_size_compat18	 88
+/* CRC32 of the ptlrpc_body of \a msg; nonzero \a compat18 hashes 88 bytes */
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg, int compat18)
+#else /* LUSTRE_VERSION_CODE >= OBD_OCD_VERSION(2, 7, 50, 0) */
+# warning "remove checksum compatibility support for b1_8"
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg)
+#endif /* LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0) */
+{
+	switch (msg->lm_magic) {
+	case LUSTRE_MSG_MAGIC_V2: {
+		struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+		__u32 crc;
+		unsigned int hsize = 4;	/* crc is a __u32, i.e. 4 bytes */
+		__u32 len = compat18 ? ptlrpc_body_cksum_size_compat18 :
+			    lustre_msg_buflen(msg, MSG_PTLRPC_BODY_OFF);
+		LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+		cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, (unsigned char *)pb,
+				       len, NULL, 0, (unsigned char *)&crc,
+				       &hsize);
+		return crc;
+#else /* no b1_8 compatibility: always hash the whole body buffer */
+# warning "remove checksum compatibility support for b1_8"
+		__u32 crc;	/* NOTE(review): pb is not asserted non-NULL here */
+		unsigned int hsize = 4;	/* crc is a __u32, i.e. 4 bytes */
+		cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, (unsigned char *)pb,
+				   lustre_msg_buflen(msg, MSG_PTLRPC_BODY_OFF),
+				   NULL, 0, (unsigned char *)&crc, &hsize);
+		return crc;
+#endif /* LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0) */
+	}
+	default:
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return 0;	/* non-V2: logged above, checksum reported as 0 */
+	}
+}
+
+void lustre_msg_set_handle(struct lustre_msg *msg, struct lustre_handle *handle)
+{
+	struct ptlrpc_body *pb;
+
+	/* Copy *\a handle into pb_handle; non-V2 trips LASSERTF. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_handle = *handle;
+}
+EXPORT_SYMBOL(lustre_msg_set_handle);
+
+void lustre_msg_set_type(struct lustre_msg *msg, __u32 type)
+{
+	struct ptlrpc_body *pb;
+
+	/* Store \a type in pb_type; non-V2 trips LASSERTF. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_type = type;
+}
+EXPORT_SYMBOL(lustre_msg_set_type);
+
+void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc)
+{
+	struct ptlrpc_body *pb;
+
+	/* Store \a opc in pb_opc; non-V2 trips LASSERTF. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_opc = opc;
+}
+EXPORT_SYMBOL(lustre_msg_set_opc);
+
+void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid)
+{
+	struct ptlrpc_body *pb;
+
+	/* Store \a last_xid in pb_last_xid; non-V2 trips LASSERTF. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_last_xid = last_xid;
+}
+EXPORT_SYMBOL(lustre_msg_set_last_xid);
+
+void lustre_msg_set_last_committed(struct lustre_msg *msg, __u64 last_committed)
+{
+	struct ptlrpc_body *pb;
+
+	/* Store the value in pb_last_committed; non-V2 trips LASSERTF. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_last_committed = last_committed;
+}
+EXPORT_SYMBOL(lustre_msg_set_last_committed);
+
+void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions)
+{
+	struct ptlrpc_body *pb;
+	int i;
+
+	/* V1 is a silent no-op; any magic besides V1/V2 trips LASSERTF */
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V1)
+		return;
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	for (i = 0; i < 4; i++)	/* copy the four pre-version slots */
+		pb->pb_pre_versions[i] = versions[i];
+}
+EXPORT_SYMBOL(lustre_msg_set_versions);
+
+void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno)
+{
+	struct ptlrpc_body *pb;
+
+	/* Store \a transno in pb_transno; non-V2 trips LASSERTF. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_transno = transno;
+}
+EXPORT_SYMBOL(lustre_msg_set_transno);
+
+void lustre_msg_set_status(struct lustre_msg *msg, __u32 status)
+{
+	struct ptlrpc_body *pb;
+
+	/* Store \a status in pb_status; non-V2 trips LASSERTF. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_status = status;
+}
+EXPORT_SYMBOL(lustre_msg_set_status);
+
+void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt)
+{
+	struct ptlrpc_body *pb;
+
+	/* Store \a conn_cnt in pb_conn_cnt; non-V2 trips LASSERTF. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_conn_cnt = conn_cnt;
+}
+EXPORT_SYMBOL(lustre_msg_set_conn_cnt);
+
+void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout)
+{
+	struct ptlrpc_body *pb;
+
+	/* V1 is a silent no-op; any magic besides V1/V2 trips LASSERTF */
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V1)
+		return;
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_timeout = timeout;
+}
+
+void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time)
+{
+	struct ptlrpc_body *pb;
+
+	/* V1 is a silent no-op; any magic besides V1/V2 trips LASSERTF */
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V1)
+		return;
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_service_time = service_time;
+}
+
+void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid)
+{
+	struct ptlrpc_body *pb;
+	__u32 opc;
+
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V1)
+		return;
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+
+	/* Don't set jobid for ldlm ast RPCs, they've been shrunk.
+	 * See the comment in ptlrpc_request_pack(). */
+	opc = lustre_msg_get_opc(msg);
+	if (opc == 0 || opc == LDLM_BL_CALLBACK ||
+	    opc == LDLM_CP_CALLBACK || opc == LDLM_GL_CALLBACK)
+		return;
+
+	pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF,
+			       sizeof(struct ptlrpc_body));
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+
+	/* a caller-supplied jobid always wins; else fill only an empty slot */
+	if (jobid != NULL)
+		memcpy(pb->pb_jobid, jobid, JOBSTATS_JOBID_SIZE);
+	else if (pb->pb_jobid[0] == '\0')
+		lustre_get_jobid(pb->pb_jobid);
+}
+EXPORT_SYMBOL(lustre_msg_set_jobid);
+
+void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum)
+{
+	/* A V1 header is left untouched; only V2 stores lm_cksum. */
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V1)
+		return;
+
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V2) {
+		msg->lm_cksum = cksum;
+		return;
+	}
+	LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+}
+
+
+void ptlrpc_request_set_replen(struct ptlrpc_request *req)
+{
+	int count = req_capsule_filled_sizes(&req->rq_pill, RCL_SERVER);
+	/* rq_replen comes from the RCL_SERVER field sizes of the capsule */
+	req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count,
+					 req->rq_pill.rc_area[RCL_SERVER]);
+	if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2)
+		req->rq_reqmsg->lm_repsize = req->rq_replen;	/* V2 only */
+}
+EXPORT_SYMBOL(ptlrpc_request_set_replen);
+
+void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *lens)
+{	/* size the reply from \a count caller-supplied lengths in \a lens */
+	req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens);
+	if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2)
+		req->rq_reqmsg->lm_repsize = req->rq_replen;	/* V2 only */
+}
+EXPORT_SYMBOL(ptlrpc_req_set_repsize);
+
+/**
+ * Send a remote set_info_async carrying \a key and \a val over \a imp.
+ * The request is added to \a set if one is given, else ptlrpc_queue_wait()ed.
+ * This may go from client to server or server to client.
+ */
+int do_set_info_async(struct obd_import *imp,
+		      int opcode, int version,
+		      obd_count keylen, void *key,
+		      obd_count vallen, void *val,
+		      struct ptlrpc_request_set *set)
+{
+	struct ptlrpc_request *req;
+	char *keybuf, *valbuf;
+	int rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(imp, &RQF_OBD_SET_INFO);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	/* size the key and value fields, then pack the request */
+	req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
+			     RCL_CLIENT, keylen);
+	req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL,
+			     RCL_CLIENT, vallen);
+	rc = ptlrpc_request_pack(req, version, opcode);
+	if (rc != 0) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	keybuf = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
+	memcpy(keybuf, key, keylen);
+	valbuf = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL);
+	memcpy(valbuf, val, vallen);
+
+	ptlrpc_request_set_replen(req);
+	if (set == NULL) {
+		rc = ptlrpc_queue_wait(req);
+		ptlrpc_req_finished(req);
+	} else {
+		ptlrpc_set_add_req(set, req);
+		ptlrpc_check_set(NULL, set);
+	}
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(do_set_info_async);
+
+/* byte flipping routines for all wire types declared in
+ * lustre_idl.h implemented here.
+ */
+void lustre_swab_ptlrpc_body(struct ptlrpc_body *b)
+{
+	int i;
+
+	__swab32s(&b->pb_type);
+	__swab32s(&b->pb_version);
+	__swab32s(&b->pb_opc);
+	__swab32s(&b->pb_status);
+	__swab64s(&b->pb_last_xid);
+	__swab64s(&b->pb_last_seen);
+	__swab64s(&b->pb_last_committed);
+	__swab64s(&b->pb_transno);
+	__swab32s(&b->pb_flags);
+	__swab32s(&b->pb_op_flags);
+	__swab32s(&b->pb_conn_cnt);
+	__swab32s(&b->pb_timeout);
+	__swab32s(&b->pb_service_time);
+	__swab32s(&b->pb_limit);
+	__swab64s(&b->pb_slv);
+	for (i = 0; i < 4; i++)
+		__swab64s(&b->pb_pre_versions[i]);
+	CLASSERT(offsetof(typeof(*b), pb_padding) != 0);
+	/* This one swab function serves both ptlrpc_body and
+	 * ptlrpc_body_v2.  While compatibility with clients and
+	 * servers lacking ptlrpc_body_v2 (< 2.3) must be kept,
+	 * do not swab any fields beyond pb_jobid (pb_jobid
+	 * itself is not swabbed here either). */
+	CLASSERT(offsetof(typeof(*b), pb_jobid) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_ptlrpc_body);
+
+void lustre_swab_connect(struct obd_connect_data *ocd)
+{
+	__swab64s(&ocd->ocd_connect_flags);
+	__swab32s(&ocd->ocd_version);
+	__swab32s(&ocd->ocd_grant);
+	__swab64s(&ocd->ocd_ibits_known);
+	__swab32s(&ocd->ocd_index);
+	__swab32s(&ocd->ocd_brw_size);
+	/* ocd_blocksize and ocd_inodespace don't need to be swabbed because
+	 * they are single-byte (8-bit) values */
+	__swab16s(&ocd->ocd_grant_extent);
+	__swab32s(&ocd->ocd_unused);
+	__swab64s(&ocd->ocd_transno);
+	__swab32s(&ocd->ocd_group);
+	__swab32s(&ocd->ocd_cksum_types);
+	__swab32s(&ocd->ocd_instance);
+	/* Fields after ocd_cksum_types are only accessible by the receiver
+	 * if the corresponding flag in ocd_connect_flags is set. Accessing
+	 * any field after ocd_maxbytes on the receiver without a valid flag
+	 * may result in out-of-bound memory access and kernel oops.
+	 * (ocd_connect_flags was swabbed above, so it is tested as swabbed.) */
+	if (ocd->ocd_connect_flags & OBD_CONNECT_MAX_EASIZE)
+		__swab32s(&ocd->ocd_max_easize);
+	if (ocd->ocd_connect_flags & OBD_CONNECT_MAXBYTES)
+		__swab64s(&ocd->ocd_maxbytes);
+	CLASSERT(offsetof(typeof(*ocd), padding1) != 0);
+	CLASSERT(offsetof(typeof(*ocd), padding2) != 0);
+	CLASSERT(offsetof(typeof(*ocd), padding3) != 0);
+	CLASSERT(offsetof(typeof(*ocd), padding4) != 0);
+	CLASSERT(offsetof(typeof(*ocd), padding5) != 0);
+	CLASSERT(offsetof(typeof(*ocd), padding6) != 0);
+	CLASSERT(offsetof(typeof(*ocd), padding7) != 0);
+	CLASSERT(offsetof(typeof(*ocd), padding8) != 0);
+	CLASSERT(offsetof(typeof(*ocd), padding9) != 0);
+	CLASSERT(offsetof(typeof(*ocd), paddingA) != 0);
+	CLASSERT(offsetof(typeof(*ocd), paddingB) != 0);
+	CLASSERT(offsetof(typeof(*ocd), paddingC) != 0);
+	CLASSERT(offsetof(typeof(*ocd), paddingD) != 0);
+	CLASSERT(offsetof(typeof(*ocd), paddingE) != 0);
+	CLASSERT(offsetof(typeof(*ocd), paddingF) != 0);
+}
+
+void lustre_swab_obdo(struct obdo *o)
+{
+	__swab64s(&o->o_valid);
+	lustre_swab_ost_id(&o->o_oi);
+	__swab64s(&o->o_parent_seq);
+	__swab64s(&o->o_size);
+	__swab64s(&o->o_mtime);
+	__swab64s(&o->o_atime);
+	__swab64s(&o->o_ctime);
+	__swab64s(&o->o_blocks);
+	__swab64s(&o->o_grant);
+	__swab32s(&o->o_blksize);
+	__swab32s(&o->o_mode);
+	__swab32s(&o->o_uid);
+	__swab32s(&o->o_gid);
+	__swab32s(&o->o_flags);
+	__swab32s(&o->o_nlink);
+	__swab32s(&o->o_parent_oid);
+	__swab32s(&o->o_misc);
+	__swab64s(&o->o_ioepoch);
+	__swab32s(&o->o_stripe_idx);
+	__swab32s(&o->o_parent_ver);
+	/* o_handle is opaque */
+	/* o_lcookie is swabbed elsewhere */
+	__swab32s(&o->o_uid_h);
+	__swab32s(&o->o_gid_h);
+	__swab64s(&o->o_data_version);
+	/* the padding fields are only named in CLASSERTs, never swabbed */
+	CLASSERT(offsetof(typeof(*o), o_padding_4) != 0);
+	CLASSERT(offsetof(typeof(*o), o_padding_5) != 0);
+	CLASSERT(offsetof(typeof(*o), o_padding_6) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_obdo);
+
+void lustre_swab_obd_statfs(struct obd_statfs *os)
+{
+	__swab64s(&os->os_type);
+	__swab64s(&os->os_blocks);
+	__swab64s(&os->os_bfree);
+	__swab64s(&os->os_bavail);
+	__swab64s(&os->os_files);
+	__swab64s(&os->os_ffree);
+	/* os_fsid is deliberately left unswabbed */
+	__swab32s(&os->os_bsize);
+	__swab32s(&os->os_namelen);
+	__swab64s(&os->os_maxbytes);
+	__swab32s(&os->os_state);
+	CLASSERT(offsetof(typeof(*os), os_fprecreated) != 0);
+	CLASSERT(offsetof(typeof(*os), os_spare2) != 0);
+	CLASSERT(offsetof(typeof(*os), os_spare3) != 0);
+	CLASSERT(offsetof(typeof(*os), os_spare4) != 0);
+	CLASSERT(offsetof(typeof(*os), os_spare5) != 0);
+	CLASSERT(offsetof(typeof(*os), os_spare6) != 0);
+	CLASSERT(offsetof(typeof(*os), os_spare7) != 0);
+	CLASSERT(offsetof(typeof(*os), os_spare8) != 0);
+	CLASSERT(offsetof(typeof(*os), os_spare9) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_obd_statfs);
+
+void lustre_swab_obd_ioobj(struct obd_ioobj *ioo)
+{	/* byte-flip \a ioo in place */
+	lustre_swab_ost_id(&ioo->ioo_oid);	/* delegated to the ost_id swabber */
+	__swab32s(&ioo->ioo_max_brw);
+	__swab32s(&ioo->ioo_bufcnt);
+}
+EXPORT_SYMBOL(lustre_swab_obd_ioobj);
+
+void lustre_swab_niobuf_remote(struct niobuf_remote *nbr)
+{
+	__swab64s(&nbr->offset);
+	__swab32s(&nbr->len);
+	__swab32s(&nbr->flags);
+}
+EXPORT_SYMBOL(lustre_swab_niobuf_remote);
+
+void lustre_swab_ost_body(struct ost_body *b)
+{
+	lustre_swab_obdo(&b->oa);	/* only the embedded obdo is swabbed */
+}
+EXPORT_SYMBOL(lustre_swab_ost_body);
+
+void lustre_swab_ost_last_id(obd_id *id)
+{	/* byte-flip the value at \a id in place, as a 64-bit quantity */
+	__swab64s(id);
+}
+EXPORT_SYMBOL(lustre_swab_ost_last_id);
+
+void lustre_swab_generic_32s(__u32 *val)
+{	/* byte-flip the 32-bit value at \a val in place */
+	__swab32s(val);
+}
+EXPORT_SYMBOL(lustre_swab_generic_32s);
+
+void lustre_swab_gl_desc(union ldlm_gl_desc *desc)
+{	/* only the lquota_desc member of the union is swabbed here */
+	lustre_swab_lu_fid(&desc->lquota_desc.gl_id.qid_fid);
+	__swab64s(&desc->lquota_desc.gl_flags);
+	__swab64s(&desc->lquota_desc.gl_ver);
+	__swab64s(&desc->lquota_desc.gl_hardlimit);
+	__swab64s(&desc->lquota_desc.gl_softlimit);
+	__swab64s(&desc->lquota_desc.gl_time);
+	CLASSERT(offsetof(typeof(desc->lquota_desc), gl_pad2) != 0);
+}
+
+void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb)
+{	/* byte-flip the five 64-bit fields of \a lvb in place */
+	__swab64s(&lvb->lvb_size);
+	__swab64s(&lvb->lvb_mtime);
+	__swab64s(&lvb->lvb_atime);
+	__swab64s(&lvb->lvb_ctime);
+	__swab64s(&lvb->lvb_blocks);
+}
+EXPORT_SYMBOL(lustre_swab_ost_lvb_v1);
+
+void lustre_swab_ost_lvb(struct ost_lvb *lvb)
+{	/* same 64-bit fields as lustre_swab_ost_lvb_v1(), plus the 32-bit ones */
+	__swab64s(&lvb->lvb_size);
+	__swab64s(&lvb->lvb_mtime);
+	__swab64s(&lvb->lvb_atime);
+	__swab64s(&lvb->lvb_ctime);
+	__swab64s(&lvb->lvb_blocks);
+	__swab32s(&lvb->lvb_mtime_ns);
+	__swab32s(&lvb->lvb_atime_ns);
+	__swab32s(&lvb->lvb_ctime_ns);
+	__swab32s(&lvb->lvb_padding);	/* the padding word is swabbed too */
+}
+EXPORT_SYMBOL(lustre_swab_ost_lvb);
+
+void lustre_swab_lquota_lvb(struct lquota_lvb *lvb)
+{	/* byte-flip the five 64-bit fields of \a lvb in place */
+	__swab64s(&lvb->lvb_flags);
+	__swab64s(&lvb->lvb_id_may_rel);
+	__swab64s(&lvb->lvb_id_rel);
+	__swab64s(&lvb->lvb_id_qunit);
+	__swab64s(&lvb->lvb_pad1);	/* the pad field is swabbed too */
+}
+EXPORT_SYMBOL(lustre_swab_lquota_lvb);
+
+void lustre_swab_mdt_body(struct mdt_body *b)
+{
+	lustre_swab_lu_fid(&b->fid1);
+	lustre_swab_lu_fid(&b->fid2);
+	/* the handle is opaque and left as is */
+	__swab64s(&b->valid);
+	__swab64s(&b->size);
+	__swab64s(&b->mtime);
+	__swab64s(&b->atime);
+	__swab64s(&b->ctime);
+	__swab64s(&b->blocks);
+	__swab64s(&b->ioepoch);
+	CLASSERT(offsetof(typeof(*b), unused1) != 0);
+	__swab32s(&b->fsuid);
+	__swab32s(&b->fsgid);
+	__swab32s(&b->capability);
+	__swab32s(&b->mode);
+	__swab32s(&b->uid);
+	__swab32s(&b->gid);
+	__swab32s(&b->flags);
+	__swab32s(&b->rdev);
+	__swab32s(&b->nlink);
+	CLASSERT(offsetof(typeof(*b), unused2) != 0);
+	__swab32s(&b->suppgid);
+	__swab32s(&b->eadatasize);
+	__swab32s(&b->aclsize);
+	__swab32s(&b->max_mdsize);
+	__swab32s(&b->max_cookiesize);
+	__swab32s(&b->uid_h);
+	__swab32s(&b->gid_h);
+	CLASSERT(offsetof(typeof(*b), padding_5) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_mdt_body);
+
+void lustre_swab_mdt_ioepoch(struct mdt_ioepoch *b)
+{
+	/* the handle is opaque and left as is */
+	__swab64s(&b->ioepoch);
+	__swab32s(&b->flags);
+	CLASSERT(offsetof(typeof(*b), padding) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_mdt_ioepoch);
+
+void lustre_swab_mgs_target_info(struct mgs_target_info *mti)
+{	/* byte-flip the 32-bit header fields, then the nid array */
+	int i;
+	__swab32s(&mti->mti_lustre_ver);
+	__swab32s(&mti->mti_stripe_index);
+	__swab32s(&mti->mti_config_ver);
+	__swab32s(&mti->mti_flags);
+	__swab32s(&mti->mti_instance);
+	__swab32s(&mti->mti_nid_count);
+	CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64));
+	for (i = 0; i < MTI_NIDS_MAX; i++)	/* every slot, not just nid_count */
+		__swab64s(&mti->mti_nids[i]);
+}
+EXPORT_SYMBOL(lustre_swab_mgs_target_info);
+
+void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *entry)
+{	/* byte-flip the fixed fields, then mne_nid_count 64-bit nids */
+	int i;
+
+	__swab64s(&entry->mne_version);
+	__swab32s(&entry->mne_instance);
+	__swab32s(&entry->mne_index);
+	__swab32s(&entry->mne_length);
+
+	/* mne_nid_(count|type) must be one byte size because we're gonna
+	 * access it w/o swapping. */
+	CLASSERT(sizeof(entry->mne_nid_count) == sizeof(__u8));
+	CLASSERT(sizeof(entry->mne_nid_type) == sizeof(__u8));
+
+	/* remove this assertion if ipv6 is supported. */
+	LASSERT(entry->mne_nid_type == 0);
+	for (i = 0; i < entry->mne_nid_count; i++) {	/* count used unswabbed */
+		CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64));
+		__swab64s(&entry->u.nids[i]);
+	}
+}
+EXPORT_SYMBOL(lustre_swab_mgs_nidtbl_entry);
+
+void lustre_swab_mgs_config_body(struct mgs_config_body *body)
+{	/* byte-flip mcb_offset, mcb_units and mcb_type in place */
+	__swab64s(&body->mcb_offset);
+	__swab32s(&body->mcb_units);
+	__swab16s(&body->mcb_type);	/* 16-bit field */
+}
+EXPORT_SYMBOL(lustre_swab_mgs_config_body);
+
+void lustre_swab_mgs_config_res(struct mgs_config_res *body)
+{	/* in-place byte swap of the two 64-bit result fields */
+	__swab64s(&body->mcr_offset);
+	__swab64s(&body->mcr_size);
+}
+EXPORT_SYMBOL(lustre_swab_mgs_config_res);
+
+static void lustre_swab_obd_dqinfo(struct obd_dqinfo *i)
+{	/* in-place swab: two 64-bit grace values, then two 32-bit words */
+	__swab64s(&i->dqi_bgrace);
+	__swab64s(&i->dqi_igrace);
+	__swab32s(&i->dqi_flags);
+	__swab32s(&i->dqi_valid);
+}
+
+static void lustre_swab_obd_dqblk(struct obd_dqblk *b)
+{	/* in-place swab of the quota block; dqb_padding is left untouched */
+	__swab64s(&b->dqb_ihardlimit);
+	__swab64s(&b->dqb_isoftlimit);
+	__swab64s(&b->dqb_curinodes);
+	__swab64s(&b->dqb_bhardlimit);
+	__swab64s(&b->dqb_bsoftlimit);
+	__swab64s(&b->dqb_curspace);
+	__swab64s(&b->dqb_btime);
+	__swab64s(&b->dqb_itime);
+	__swab32s(&b->dqb_valid);
+	CLASSERT(offsetof(typeof(*b), dqb_padding) != 0);
+}
+
+void lustre_swab_obd_quotactl(struct obd_quotactl *q)
+{	/* swab the four header words, then the embedded dqinfo and dqblk */
+	__swab32s(&q->qc_cmd);
+	__swab32s(&q->qc_type);
+	__swab32s(&q->qc_id);
+	__swab32s(&q->qc_stat);
+	lustre_swab_obd_dqinfo(&q->qc_dqinfo);
+	lustre_swab_obd_dqblk(&q->qc_dqblk);
+}
+EXPORT_SYMBOL(lustre_swab_obd_quotactl);
+
+void lustre_swab_mdt_remote_perm(struct mdt_remote_perm *p)
+{	/* in-place swab of each 32-bit field listed, rp_padding included */
+	__swab32s(&p->rp_uid);
+	__swab32s(&p->rp_gid);
+	__swab32s(&p->rp_fsuid);
+	__swab32s(&p->rp_fsuid_h);
+	__swab32s(&p->rp_fsgid);
+	__swab32s(&p->rp_fsgid_h);
+	__swab32s(&p->rp_access_perm);
+	__swab32s(&p->rp_padding);
+}
+EXPORT_SYMBOL(lustre_swab_mdt_remote_perm);
+
+void lustre_swab_fid2path(struct getinfo_fid2path *gf)
+{	/* swab the FID via its helper, then recno, linkno and pathlen */
+	lustre_swab_lu_fid(&gf->gf_fid);
+	__swab64s(&gf->gf_recno);
+	__swab32s(&gf->gf_linkno);
+	__swab32s(&gf->gf_pathlen);
+}
+EXPORT_SYMBOL(lustre_swab_fid2path);
+
+void lustre_swab_fiemap_extent(struct ll_fiemap_extent *fm_extent)
+{	/* in-place swab of one extent record; used by lustre_swab_fiemap() */
+	__swab64s(&fm_extent->fe_logical);
+	__swab64s(&fm_extent->fe_physical);
+	__swab64s(&fm_extent->fe_length);
+	__swab32s(&fm_extent->fe_flags);
+	__swab32s(&fm_extent->fe_device);
+}
+
+void lustre_swab_fiemap(struct ll_user_fiemap *fiemap)
+{
+	__u32 i;	/* unsigned, matching the 32-bit fm_mapped_extents */
+
+	__swab64s(&fiemap->fm_start);
+	__swab64s(&fiemap->fm_length);
+	__swab32s(&fiemap->fm_flags);
+	__swab32s(&fiemap->fm_mapped_extents);
+	__swab32s(&fiemap->fm_extent_count);
+	__swab32s(&fiemap->fm_reserved);
+	/* fm_mapped_extents was swabbed above, so it is host order here */
+	for (i = 0; i < fiemap->fm_mapped_extents; i++)
+		lustre_swab_fiemap_extent(&fiemap->fm_extents[i]);
+}
+EXPORT_SYMBOL(lustre_swab_fiemap);
+
+void lustre_swab_idx_info(struct idx_info *ii)
+{	/* in-place swab of the index-info fields, each at its own width */
+	__swab32s(&ii->ii_magic);
+	__swab32s(&ii->ii_flags);
+	__swab16s(&ii->ii_count);
+	__swab32s(&ii->ii_attrs);
+	lustre_swab_lu_fid(&ii->ii_fid);
+	__swab64s(&ii->ii_version);
+	__swab64s(&ii->ii_hash_start);
+	__swab64s(&ii->ii_hash_end);
+	__swab16s(&ii->ii_keysize);
+	__swab16s(&ii->ii_recsize);
+}
+
+void lustre_swab_lip_header(struct lu_idxpage *lip)
+{
+	/* swab the three header fields only; nothing else is touched here */
+	__swab32s(&lip->lip_magic);
+	__swab16s(&lip->lip_flags);
+	__swab16s(&lip->lip_nr);
+}
+EXPORT_SYMBOL(lustre_swab_lip_header);
+
+void lustre_swab_mdt_rec_reint(struct mdt_rec_reint *rr)
+{	/* in-place swab of a reint record; the unused *_h words are skipped */
+	__swab32s(&rr->rr_opcode);
+	__swab32s(&rr->rr_cap);
+	__swab32s(&rr->rr_fsuid);
+	/* rr_fsuid_h is unused */
+	__swab32s(&rr->rr_fsgid);
+	/* rr_fsgid_h is unused */
+	__swab32s(&rr->rr_suppgid1);
+	/* rr_suppgid1_h is unused */
+	__swab32s(&rr->rr_suppgid2);
+	/* rr_suppgid2_h is unused */
+	lustre_swab_lu_fid(&rr->rr_fid1);
+	lustre_swab_lu_fid(&rr->rr_fid2);
+	__swab64s(&rr->rr_mtime);
+	__swab64s(&rr->rr_atime);
+	__swab64s(&rr->rr_ctime);
+	__swab64s(&rr->rr_size);
+	__swab64s(&rr->rr_blocks);
+	__swab32s(&rr->rr_bias);
+	__swab32s(&rr->rr_mode);
+	__swab32s(&rr->rr_flags);
+	__swab32s(&rr->rr_flags_h);
+	__swab32s(&rr->rr_umask);
+
+	CLASSERT(offsetof(typeof(*rr), rr_padding_4) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_mdt_rec_reint);
+
+void lustre_swab_lov_desc(struct lov_desc *ld)
+{	/* in-place swab of the numeric LOV descriptor fields */
+	__swab32s(&ld->ld_tgt_count);
+	__swab32s(&ld->ld_active_tgt_count);
+	__swab32s(&ld->ld_default_stripe_count);
+	__swab32s(&ld->ld_pattern);
+	__swab64s(&ld->ld_default_stripe_size);
+	__swab64s(&ld->ld_default_stripe_offset);
+	__swab32s(&ld->ld_qos_maxage);
+	/* uuid endian insensitive */
+}
+EXPORT_SYMBOL(lustre_swab_lov_desc);
+
+void lustre_swab_lmv_desc(struct lmv_desc *ld)
+{	/* in-place swab of the numeric LMV descriptor fields */
+	__swab32s(&ld->ld_tgt_count);
+	__swab32s(&ld->ld_active_tgt_count);
+	__swab32s(&ld->ld_default_stripe_count);
+	__swab32s(&ld->ld_pattern);
+	__swab64s(&ld->ld_default_hash_size);
+	__swab32s(&ld->ld_qos_maxage);
+	/* uuid endian insensitive */
+}
+
+void lustre_swab_lmv_stripe_md(struct lmv_stripe_md *mea)
+{	/* swab magic, count and master; mea_padding is left untouched */
+	__swab32s(&mea->mea_magic);
+	__swab32s(&mea->mea_count);
+	__swab32s(&mea->mea_master);
+	CLASSERT(offsetof(typeof(*mea), mea_padding) != 0);
+}
+
+void lustre_swab_lmv_user_md(struct lmv_user_md *lum)
+{
+	__u32 i;	/* unsigned, matching the 32-bit lum_stripe_count */
+
+	__swab32s(&lum->lum_magic);
+	__swab32s(&lum->lum_stripe_count);
+	__swab32s(&lum->lum_stripe_offset);
+	__swab32s(&lum->lum_hash_type);
+	__swab32s(&lum->lum_type);
+	CLASSERT(offsetof(typeof(*lum), lum_padding1) != 0);
+	CLASSERT(offsetof(typeof(*lum), lum_padding2) != 0);
+	CLASSERT(offsetof(typeof(*lum), lum_padding3) != 0);
+
+	/* lum_stripe_count was swabbed above, so it is host order here */
+	for (i = 0; i < lum->lum_stripe_count; i++) {
+		__swab32s(&lum->lum_objects[i].lum_mds);
+		lustre_swab_lu_fid(&lum->lum_objects[i].lum_fid);
+	}
+}
+EXPORT_SYMBOL(lustre_swab_lmv_user_md);
+
+static void print_lum(struct lov_user_md *lum)
+{	/* trace each lov_user_md header field at D_OTHER level */
+	CDEBUG(D_OTHER, "lov_user_md %p:\n", lum);
+	CDEBUG(D_OTHER, "\tlmm_magic: %#x\n", lum->lmm_magic);
+	CDEBUG(D_OTHER, "\tlmm_pattern: %#x\n", lum->lmm_pattern);
+	CDEBUG(D_OTHER, "\tlmm_object_id: "LPU64"\n", lmm_oi_id(&lum->lmm_oi));
+	CDEBUG(D_OTHER, "\tlmm_object_gr: "LPU64"\n", lmm_oi_seq(&lum->lmm_oi));
+	CDEBUG(D_OTHER, "\tlmm_stripe_size: %#x\n", lum->lmm_stripe_size);
+	CDEBUG(D_OTHER, "\tlmm_stripe_count: %#x\n", lum->lmm_stripe_count);
+	CDEBUG(D_OTHER, "\tlmm_stripe_offset/lmm_layout_gen: %#x\n",
+	       lum->lmm_stripe_offset);
+}
+
+static void lustre_swab_lmm_oi(struct ost_id *oi)
+{	/* in-place swab of the 64-bit id and sequence of an ost_id */
+	__swab64s(&oi->oi.oi_id);
+	__swab64s(&oi->oi.oi_seq);
+}
+
+static void lustre_swab_lov_user_md_common(struct lov_user_md_v1 *lum)
+{	/* shared by the v1 and v3 swabbers: swab the header, then trace it */
+	ENTRY;
+	__swab32s(&lum->lmm_magic);
+	__swab32s(&lum->lmm_pattern);
+	lustre_swab_lmm_oi(&lum->lmm_oi);
+	__swab32s(&lum->lmm_stripe_size);
+	__swab16s(&lum->lmm_stripe_count);
+	__swab16s(&lum->lmm_stripe_offset);
+	print_lum(lum);	/* dumps the already-swabbed values */
+	EXIT;
+}
+
+void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum)
+{	/* v1 has nothing beyond the common header handled by the helper */
+	ENTRY;
+	CDEBUG(D_IOCTL, "swabbing lov_user_md v1\n");
+	lustre_swab_lov_user_md_common(lum);
+	EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_lov_user_md_v1);
+
+void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum)
+{	/* swab the leading fields through the v1 helper via a pointer cast */
+	ENTRY;
+	CDEBUG(D_IOCTL, "swabbing lov_user_md v3\n");
+	lustre_swab_lov_user_md_common((struct lov_user_md_v1 *)lum);
+	/* lmm_pool_name is a char array, so it needs no byte swapping */
+	EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_lov_user_md_v3);
+
+void lustre_swab_lov_mds_md(struct lov_mds_md *lmm)
+{	/* in-place swab of the lov_mds_md header fields listed below */
+	ENTRY;
+	CDEBUG(D_IOCTL, "swabbing lov_mds_md\n");
+	__swab32s(&lmm->lmm_magic);
+	__swab32s(&lmm->lmm_pattern);
+	lustre_swab_lmm_oi(&lmm->lmm_oi);
+	__swab32s(&lmm->lmm_stripe_size);
+	__swab16s(&lmm->lmm_stripe_count);
+	__swab16s(&lmm->lmm_layout_gen);
+	EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_lov_mds_md);
+
+void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod,
+				     int stripe_count)
+{
+	int stripe;	/* index into the stripe_count entries of lod[] */
+	ENTRY;
+	for (stripe = 0; stripe < stripe_count; stripe++) {
+		lustre_swab_ost_id(&lod[stripe].l_ost_oi);
+		__swab32s(&lod[stripe].l_ost_gen);
+		__swab32s(&lod[stripe].l_ost_idx);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_lov_user_md_objects);
+
+void lustre_swab_ldlm_res_id(struct ldlm_res_id *id)
+{
+	int slot;
+	/* every one of the RES_NAME_SIZE name words is a 64-bit value */
+	for (slot = 0; slot < RES_NAME_SIZE; slot++)
+		__swab64s(&id->name[slot]);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_res_id);
+
+void lustre_swab_ldlm_policy_data (ldlm_wire_policy_data_t *d)
+{
+	/* the lock data is a union and the first two fields are always an
+	 * extent so it's ok to process an LDLM_EXTENT and LDLM_FLOCK lock
+	 * data the same way. */
+	__swab64s(&d->l_extent.start);
+	__swab64s(&d->l_extent.end);
+	__swab64s(&d->l_extent.gid);
+	__swab64s(&d->l_flock.lfw_owner); /* NOTE(review): overlays gid? */
+	__swab32s(&d->l_flock.lfw_pid);	/* if so gid is swabbed twice */
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_policy_data);
+
+void lustre_swab_ldlm_intent(struct ldlm_intent *i)
+{	/* the opcode is the only field swabbed here */
+	__swab64s(&i->opc);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_intent);
+
+void lustre_swab_ldlm_resource_desc(struct ldlm_resource_desc *r)
+{	/* swab the type word and the resource name; lr_padding untouched */
+	__swab32s(&r->lr_type);
+	CLASSERT(offsetof(typeof(*r), lr_padding) != 0);
+	lustre_swab_ldlm_res_id(&r->lr_name);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_resource_desc);
+
+void lustre_swab_ldlm_lock_desc(struct ldlm_lock_desc *l)
+{	/* resource descriptor, the two mode words, then the policy data */
+	lustre_swab_ldlm_resource_desc(&l->l_resource);
+	__swab32s(&l->l_req_mode);
+	__swab32s(&l->l_granted_mode);
+	lustre_swab_ldlm_policy_data(&l->l_policy_data);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_lock_desc);
+
+void lustre_swab_ldlm_request(struct ldlm_request *rq)
+{	/* swab flags, the lock descriptor and the lock count in place */
+	__swab32s(&rq->lock_flags);
+	lustre_swab_ldlm_lock_desc(&rq->lock_desc);
+	__swab32s(&rq->lock_count);
+	/* lock_handle[] opaque */
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_request);
+
+void lustre_swab_ldlm_reply(struct ldlm_reply *r)
+{	/* swab flags, lock descriptor and both policy results in place */
+	__swab32s(&r->lock_flags);
+	CLASSERT(offsetof(typeof(*r), lock_padding) != 0);
+	lustre_swab_ldlm_lock_desc(&r->lock_desc);
+	/* lock_handle opaque */
+	__swab64s(&r->lock_policy_res1);
+	__swab64s(&r->lock_policy_res2);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_reply);
+
+void lustre_swab_quota_body(struct quota_body *b)
+{	/* in-place swab of a quota body */
+	lustre_swab_lu_fid(&b->qb_fid);
+	lustre_swab_lu_fid((struct lu_fid *)&b->qb_id); /* NOTE(review): cast */
+	__swab32s(&b->qb_flags);	/* assumes qb_id is lu_fid-shaped */
+	__swab64s(&b->qb_count);
+	__swab64s(&b->qb_usage);
+	__swab64s(&b->qb_slv_ver);
+}
+
+/* Dump functions */
+void dump_ioo(struct obd_ioobj *ioo)
+{	/* trace the object id, max_brw and buffer count at D_RPCTRACE */
+	CDEBUG(D_RPCTRACE,
+	       "obd_ioobj: ioo_oid="DOSTID", ioo_max_brw=%#x, "
+	       "ioo_bufct=%d\n", POSTID(&ioo->ioo_oid), ioo->ioo_max_brw,
+	       ioo->ioo_bufcnt);
+}
+EXPORT_SYMBOL(dump_ioo);
+
+void dump_rniobuf(struct niobuf_remote *nb)
+{	/* trace offset, length and flags of one remote niobuf */
+	CDEBUG(D_RPCTRACE, "niobuf_remote: offset="LPU64", len=%d, flags=%x\n",
+	       nb->offset, nb->len, nb->flags);
+}
+EXPORT_SYMBOL(dump_rniobuf);
+
+void dump_obdo(struct obdo *oa)
+{	/* trace each obdo field whose OBD_MD_* bit is set in o_valid */
+	__u32 valid = oa->o_valid; /* NOTE(review): truncates if 64-bit */
+
+	CDEBUG(D_RPCTRACE, "obdo: o_valid = %08x\n", valid);
+	if (valid & OBD_MD_FLID)
+		CDEBUG(D_RPCTRACE, "obdo: id = "DOSTID"\n", POSTID(&oa->o_oi));
+	if (valid & OBD_MD_FLFID)
+		CDEBUG(D_RPCTRACE, "obdo: o_parent_seq = "LPX64"\n",
+		       oa->o_parent_seq);
+	if (valid & OBD_MD_FLSIZE)
+		CDEBUG(D_RPCTRACE, "obdo: o_size = "LPD64"\n", oa->o_size);
+	if (valid & OBD_MD_FLMTIME)
+		CDEBUG(D_RPCTRACE, "obdo: o_mtime = "LPD64"\n", oa->o_mtime);
+	if (valid & OBD_MD_FLATIME)
+		CDEBUG(D_RPCTRACE, "obdo: o_atime = "LPD64"\n", oa->o_atime);
+	if (valid & OBD_MD_FLCTIME)
+		CDEBUG(D_RPCTRACE, "obdo: o_ctime = "LPD64"\n", oa->o_ctime);
+	if (valid & OBD_MD_FLBLOCKS)   /* allocation of space */
+		CDEBUG(D_RPCTRACE, "obdo: o_blocks = "LPD64"\n", oa->o_blocks);
+	if (valid & OBD_MD_FLGRANT)
+		CDEBUG(D_RPCTRACE, "obdo: o_grant = "LPD64"\n", oa->o_grant);
+	if (valid & OBD_MD_FLBLKSZ)
+		CDEBUG(D_RPCTRACE, "obdo: o_blksize = %d\n", oa->o_blksize);
+	if (valid & (OBD_MD_FLTYPE | OBD_MD_FLMODE)) /* mask by valid part */
+		CDEBUG(D_RPCTRACE, "obdo: o_mode = %o\n",
+		       oa->o_mode & ((valid & OBD_MD_FLTYPE ?  S_IFMT : 0) |
+				     (valid & OBD_MD_FLMODE ? ~S_IFMT : 0)));
+	if (valid & OBD_MD_FLUID)
+		CDEBUG(D_RPCTRACE, "obdo: o_uid = %u\n", oa->o_uid);
+	if (valid & OBD_MD_FLUID)	/* same bit also gates o_uid_h */
+		CDEBUG(D_RPCTRACE, "obdo: o_uid_h = %u\n", oa->o_uid_h);
+	if (valid & OBD_MD_FLGID)
+		CDEBUG(D_RPCTRACE, "obdo: o_gid = %u\n", oa->o_gid);
+	if (valid & OBD_MD_FLGID)	/* same bit also gates o_gid_h */
+		CDEBUG(D_RPCTRACE, "obdo: o_gid_h = %u\n", oa->o_gid_h);
+	if (valid & OBD_MD_FLFLAGS)
+		CDEBUG(D_RPCTRACE, "obdo: o_flags = %x\n", oa->o_flags);
+	if (valid & OBD_MD_FLNLINK)
+		CDEBUG(D_RPCTRACE, "obdo: o_nlink = %u\n", oa->o_nlink);
+	else if (valid & OBD_MD_FLCKSUM) /* o_nlink printed as checksum */
+		CDEBUG(D_RPCTRACE, "obdo: o_checksum (o_nlink) = %u\n",
+		       oa->o_nlink);
+	if (valid & OBD_MD_FLGENER)
+		CDEBUG(D_RPCTRACE, "obdo: o_parent_oid = %x\n",
+		       oa->o_parent_oid);
+	if (valid & OBD_MD_FLEPOCH)
+		CDEBUG(D_RPCTRACE, "obdo: o_ioepoch = "LPD64"\n",
+		       oa->o_ioepoch);
+	if (valid & OBD_MD_FLFID) {
+		CDEBUG(D_RPCTRACE, "obdo: o_stripe_idx = %u\n",
+		       oa->o_stripe_idx);
+		CDEBUG(D_RPCTRACE, "obdo: o_parent_ver = %x\n",
+		       oa->o_parent_ver);
+	}
+	if (valid & OBD_MD_FLHANDLE)
+		CDEBUG(D_RPCTRACE, "obdo: o_handle = "LPD64"\n",
+		       oa->o_handle.cookie);
+	if (valid & OBD_MD_FLCOOKIE)
+		CDEBUG(D_RPCTRACE, "obdo: o_lcookie = "
+		       "(llog_cookie dumping not yet implemented)\n");
+}
+EXPORT_SYMBOL(dump_obdo);
+
+void dump_ost_body(struct ost_body *ob)
+{	/* an ost_body is dumped by dumping its embedded obdo */
+	dump_obdo(&ob->oa);
+}
+EXPORT_SYMBOL(dump_ost_body);
+
+void dump_rcs(__u32 *rc)
+{	/* trace one return code; note it is printed with %d, i.e. signed */
+	CDEBUG(D_RPCTRACE, "rmf_rcs: %d\n", *rc);
+}
+EXPORT_SYMBOL(dump_rcs);
+
+static inline int req_ptlrpc_body_swabbed(struct ptlrpc_request *req)
+{
+	LASSERT(req->rq_reqmsg);
+
+	/* V2 is the only message magic handled here */
+	if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2)
+		return lustre_req_swabbed(req, MSG_PTLRPC_BODY_OFF);
+
+	/* any other magic is reported and treated as "not swabbed" */
+	CERROR("bad lustre msg magic: %#08X\n",
+	       req->rq_reqmsg->lm_magic);
+	return 0;
+}
+
+static inline int rep_ptlrpc_body_swabbed(struct ptlrpc_request *req)
+{
+	LASSERT(req->rq_repmsg);
+
+	/*
+	 * Any magic other than V2 is taken to mean the reply buffer is
+	 * uninitialized yet, so it is silently reported as not swabbed.
+	 */
+	if (req->rq_repmsg->lm_magic != LUSTRE_MSG_MAGIC_V2)
+		return 0;
+	return lustre_rep_swabbed(req, MSG_PTLRPC_BODY_OFF);
+}
+
+void _debug_req(struct ptlrpc_request *req,
+		struct libcfs_debug_msg_data *msgdata, const char *fmt, ...)
+{
+	int req_ok = req->rq_reqmsg != NULL;
+	int rep_ok = req->rq_repmsg != NULL;
+	lnet_nid_t nid = LNET_NID_ANY;
+	va_list args;
+	/* when swabbing is needed, only decode bodies already swabbed */
+	if (ptlrpc_req_need_swab(req)) {
+		req_ok = req_ok && req_ptlrpc_body_swabbed(req);
+		rep_ok = rep_ok && rep_ptlrpc_body_swabbed(req);
+	}
+	/* peer NID comes from the import if present, else the export */
+	if (req->rq_import && req->rq_import->imp_connection)
+		nid = req->rq_import->imp_connection->c_peer.nid;
+	else if (req->rq_export && req->rq_export->exp_connection)
+		nid = req->rq_export->exp_connection->c_peer.nid;
+	/* caller's message first, then the fixed request summary */
+	va_start(args, fmt);
+	libcfs_debug_vmsg2(msgdata, fmt, args,
+			   " req@%p x"LPU64"/t"LPD64"("LPD64") o%d->%s@%s:%d/%d"
+			   " lens %d/%d e %d to %d dl "CFS_TIME_T" ref %d "
+			   "fl "REQ_FLAGS_FMT"/%x/%x rc %d/%d\n",
+			   req, req->rq_xid, req->rq_transno,
+			   req_ok ? lustre_msg_get_transno(req->rq_reqmsg) : 0,
+			   req_ok ? lustre_msg_get_opc(req->rq_reqmsg) : -1,
+			   req->rq_import ?
+				req->rq_import->imp_obd->obd_name :
+				req->rq_export ?
+				     req->rq_export->exp_client_uuid.uuid :
+				     "<?>",
+			   libcfs_nid2str(nid),
+			   req->rq_request_portal, req->rq_reply_portal,
+			   req->rq_reqlen, req->rq_replen,
+			   req->rq_early_count, req->rq_timedout,
+			   req->rq_deadline,
+			   atomic_read(&req->rq_refcount),
+			   DEBUG_REQ_FLAGS(req),
+			   req_ok ? lustre_msg_get_flags(req->rq_reqmsg) : -1,
+			   rep_ok ? lustre_msg_get_flags(req->rq_repmsg) : -1,
+			   req->rq_status,
+			   rep_ok ? lustre_msg_get_status(req->rq_repmsg) : -1);
+	va_end(args);	/* every va_start() needs a matching va_end() */
+}
+EXPORT_SYMBOL(_debug_req);
+
+void lustre_swab_lustre_capa(struct lustre_capa *c)
+{	/* in-place swab of the FID and the numeric capability fields */
+	lustre_swab_lu_fid(&c->lc_fid);
+	__swab64s(&c->lc_opc);
+	__swab64s(&c->lc_uid);
+	__swab64s(&c->lc_gid);
+	__swab32s(&c->lc_flags);
+	__swab32s(&c->lc_keyid);
+	__swab32s(&c->lc_timeout);
+	__swab32s(&c->lc_expiry);
+}
+EXPORT_SYMBOL(lustre_swab_lustre_capa);
+
+void lustre_swab_lustre_capa_key(struct lustre_capa_key *k)
+{	/* swab sequence and key id; lk_padding is left untouched */
+	__swab64s(&k->lk_seq);
+	__swab32s(&k->lk_keyid);
+	CLASSERT(offsetof(typeof(*k), lk_padding) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_lustre_capa_key);
+
+void lustre_swab_hsm_user_state(struct hsm_user_state *state)
+{	/* in-place swab of the states word and the archive id */
+	__swab32s(&state->hus_states);
+	__swab32s(&state->hus_archive_id);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_user_state);
+
+void lustre_swab_hsm_state_set(struct hsm_state_set *hss)
+{	/* in-place swab of valid, the set/clear masks and the archive id */
+	__swab32s(&hss->hss_valid);
+	__swab64s(&hss->hss_setmask);
+	__swab64s(&hss->hss_clearmask);
+	__swab32s(&hss->hss_archive_id);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_state_set);
+
+void lustre_swab_hsm_extent(struct hsm_extent *extent)
+{	/* in-place swab of the 64-bit offset and length of one extent */
+	__swab64s(&extent->offset);
+	__swab64s(&extent->length);
+}
+
+void lustre_swab_hsm_current_action(struct hsm_current_action *action)
+{	/* swab state and action words, then the embedded extent */
+	__swab32s(&action->hca_state);
+	__swab32s(&action->hca_action);
+	lustre_swab_hsm_extent(&action->hca_location);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_current_action);
+
+void lustre_swab_hsm_user_item(struct hsm_user_item *hui)
+{	/* swab the item's FID and extent through their helpers */
+	lustre_swab_lu_fid(&hui->hui_fid);
+	lustre_swab_hsm_extent(&hui->hui_extent);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_user_item);
+
+void lustre_swab_layout_intent(struct layout_intent *li)
+{	/* in-place swab of opcode, flags and the start/end range */
+	__swab32s(&li->li_opc);
+	__swab32s(&li->li_flags);
+	__swab64s(&li->li_start);
+	__swab64s(&li->li_end);
+}
+EXPORT_SYMBOL(lustre_swab_layout_intent);
+
+void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk)
+{	/* in-place swab of the progress record */
+	lustre_swab_lu_fid(&hpk->hpk_fid);
+	__swab64s(&hpk->hpk_cookie);
+	__swab64s(&hpk->hpk_extent.offset);	/* extent swabbed inline */
+	__swab64s(&hpk->hpk_extent.length);
+	__swab16s(&hpk->hpk_flags);
+	__swab16s(&hpk->hpk_errval);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_progress_kernel);
+
+void lustre_swab_hsm_request(struct hsm_request *hr)
+{	/* in-place swab of the request header fields listed below */
+	__swab32s(&hr->hr_action);
+	__swab32s(&hr->hr_archive_id);
+	__swab64s(&hr->hr_flags);
+	__swab32s(&hr->hr_itemcount);
+	__swab32s(&hr->hr_data_len);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_request);
+
+void lustre_swab_update_buf(struct update_buf *ub)
+{	/* only the magic and count words are swabbed here */
+	__swab32s(&ub->ub_magic);
+	__swab32s(&ub->ub_count);
+}
+EXPORT_SYMBOL(lustre_swab_update_buf);
+
+void lustre_swab_update_reply_buf(struct update_reply *ur)
+{
+	__u32 i;	/* unsigned, matching the 32-bit ur_count */
+
+	__swab32s(&ur->ur_version);
+	__swab32s(&ur->ur_count);	/* host order from here on */
+	for (i = 0; i < ur->ur_count; i++)
+		__swab32s(&ur->ur_lens[i]);
+}
+EXPORT_SYMBOL(lustre_swab_update_reply_buf);
+
+void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl)
+{	/* the 64-bit flags word is the only field swabbed */
+	__swab64s(&msl->msl_flags);
+}
+EXPORT_SYMBOL(lustre_swab_swap_layouts);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/pers.c b/drivers/staging/lustre/lustre/ptlrpc/pers.c
new file mode 100644
index 000000000000..d926d2b36fb4
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/pers.c
@@ -0,0 +1,75 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+#include <lustre_ha.h>
+#include <lustre_import.h>
+
+#include "ptlrpc_internal.h"
+
+
+void ptlrpc_fill_bulk_md(lnet_md_t *md, struct ptlrpc_bulk_desc *desc,
+			 int mdidx)
+{
+	CLASSERT(PTLRPC_MAX_BRW_PAGES < LI_POISON);
+	LASSERT(mdidx < desc->bd_md_max_brw);
+	LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES);
+	LASSERT(!(md->options & (LNET_MD_IOVEC | LNET_MD_KIOV |
+				 LNET_MD_PHYS)));
+
+	/* MD number mdidx covers up to LNET_MAX_IOV entries of the vector */
+	md->options |= LNET_MD_KIOV;
+	md->length = max(0, desc->bd_iov_count - mdidx * LNET_MAX_IOV);
+	md->length = min_t(unsigned int, LNET_MAX_IOV, md->length);
+	if (!desc->bd_enc_iov)
+		md->start = &desc->bd_iov[mdidx * LNET_MAX_IOV];
+	else
+		md->start = &desc->bd_enc_iov[mdidx * LNET_MAX_IOV];
+}
+
+void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page,
+			  int pageoffset, int len)
+{
+	/* fill the slot at index bd_iov_count, then bump the count */
+	lnet_kiov_t *slot = desc->bd_iov + desc->bd_iov_count;
+
+	slot->kiov_len = len;
+	slot->kiov_offset = pageoffset;
+	slot->kiov_page = page;
+	desc->bd_iov_count++;
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/pinger.c b/drivers/staging/lustre/lustre/ptlrpc/pinger.c
new file mode 100644
index 000000000000..ef5269aee0de
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/pinger.c
@@ -0,0 +1,763 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/pinger.c
+ *
+ * Portal-RPC reconnection and replay operations, for use in recovery.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include "ptlrpc_internal.h"
+
+static int suppress_pings;	/* module parameter, see below */
+CFS_MODULE_PARM(suppress_pings, "i", int, 0644, "Suppress pings");
+
+struct mutex pinger_mutex;	/* held while walking the two lists below */
+static LIST_HEAD(pinger_imports);
+static LIST_HEAD(timeout_list);
+
+int ptlrpc_pinger_suppress_pings(void)
+{	/* returns the current value of the suppress_pings module parameter */
+	return suppress_pings;
+}
+EXPORT_SYMBOL(ptlrpc_pinger_suppress_pings);
+
+struct ptlrpc_request *
+ptlrpc_prep_ping(struct obd_import *imp)
+{
+	struct ptlrpc_request *ping;
+	/* build an OBD_PING request; NULL is passed through on failure */
+	ping = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING,
+					 LUSTRE_OBD_VERSION, OBD_PING);
+	if (ping == NULL)
+		return NULL;
+	ptlrpc_request_set_replen(ping);
+	ping->rq_no_resend = ping->rq_no_delay = 1;
+	return ping;
+}
+
+int ptlrpc_obd_ping(struct obd_device *obd)
+{	/* synchronous ping on the device's client import; returns its rc */
+	int rc;
+	struct ptlrpc_request *req;
+	ENTRY;
+
+	req = ptlrpc_prep_ping(obd->u.cli.cl_import);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	req->rq_send_state = LUSTRE_IMP_FULL;
+
+	rc = ptlrpc_queue_wait(req);	/* result of the wait is returned */
+
+	ptlrpc_req_finished(req);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_obd_ping);
+
+int ptlrpc_ping(struct obd_import *imp)
+{	/* hand a ping to ptlrpcd; returns 0, or -ENOMEM if none was built */
+	struct ptlrpc_request *req;
+	ENTRY;
+
+	req = ptlrpc_prep_ping(imp);
+	if (req == NULL) {
+		CERROR("OOM trying to ping %s->%s\n",
+		       imp->imp_obd->obd_uuid.uuid,
+		       obd2cli_tgt(imp->imp_obd));
+		RETURN(-ENOMEM);
+	}
+
+	DEBUG_REQ(D_INFO, req, "pinging %s->%s",
+		  imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
+	ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); /* result not checked */
+
+	RETURN(0);
+}
+
+void ptlrpc_update_next_ping(struct obd_import *imp, int soon)
+{
+	int delay = soon ? PING_INTERVAL_SHORT : PING_INTERVAL;
+
+	if (imp->imp_state == LUSTRE_IMP_DISCON) {
+		int latency = AT_OFF ? 0 : at_get(&imp->imp_at.iat_net_latency);
+		int limit = max_t(int, CONNECTION_SWITCH_MIN, latency);
+		delay = min(delay, limit);
+	}
+	imp->imp_next_ping = cfs_time_shift(delay);
+}
+
+void ptlrpc_ping_import_soon(struct obd_import *imp)
+{	/* set the next-ping time to the current time */
+	imp->imp_next_ping = cfs_time_current();
+}
+
+static inline int imp_is_deactive(struct obd_import *imp)
+{	/* true if the import is flagged deactive or the fail-check fires */
+	return (imp->imp_deactive ||
+		OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_IMP_DEACTIVE));
+}
+
+static inline cfs_time_t ptlrpc_next_reconnect(struct obd_import *imp)
+{	/* returned as cfs_time_t: an int would truncate the timestamp */
+	if (imp->imp_server_timeout)
+		return cfs_time_shift(obd_timeout / 2);
+	else
+		return cfs_time_shift(obd_timeout);
+}
+
+static atomic_t suspend_timeouts = ATOMIC_INIT(0); /* nonzero: suspended */
+static cfs_time_t suspend_wakeup_time;	/* statics start out as zero */
+
+cfs_duration_t pinger_check_timeout(cfs_time_t time)
+{	/* returns (time + shortest timeout in seconds) - current time */
+	struct timeout_item *item;
+	cfs_time_t timeout = PING_INTERVAL;
+
+	/* The timeout list is sorted in increasing order, so only its
+	 * first item has to be compared against PING_INTERVAL. */
+	mutex_lock(&pinger_mutex);
+	list_for_each_entry(item, &timeout_list, ti_chain) {
+		int ti_timeout = item->ti_timeout;
+		if (timeout > ti_timeout)
+			timeout = ti_timeout;
+		break;	/* unconditional: the loop body runs at most once */
+	}
+	mutex_unlock(&pinger_mutex);
+
+	return cfs_time_sub(cfs_time_add(time, cfs_time_seconds(timeout)),
+					 cfs_time_current());
+}
+
+static wait_queue_head_t suspend_timeouts_waitq; /* init in start_pinger */
+
+cfs_time_t ptlrpc_suspend_wakeup_time(void)
+{	/* plain read accessor for the file-local suspend_wakeup_time */
+	return suspend_wakeup_time;
+}
+
+void ptlrpc_deactivate_timeouts(struct obd_import *imp)
+{	/* currently a no-op: the whole body is compiled out by #if 0 */
+	/*XXX: disabled for now, will be replaced by adaptive timeouts */
+#if 0
+	if (imp->imp_no_timeout)
+		return;
+	imp->imp_no_timeout = 1;
+	atomic_inc(&suspend_timeouts);
+	CDEBUG(D_HA|D_WARNING, "deactivate timeouts %u\n",
+	       atomic_read(&suspend_timeouts));
+#endif
+}
+
+void ptlrpc_activate_timeouts(struct obd_import *imp)
+{	/* currently a no-op: the whole body is compiled out by #if 0 */
+	/*XXX: disabled for now, will be replaced by adaptive timeouts */
+#if 0
+	if (!imp->imp_no_timeout)
+		return;
+	imp->imp_no_timeout = 0;
+	LASSERT(atomic_read(&suspend_timeouts) > 0);
+	if (atomic_dec_and_test(&suspend_timeouts)) {
+		suspend_wakeup_time = cfs_time_current();
+		wake_up(&suspend_timeouts_waitq);
+	}
+	CDEBUG(D_HA|D_WARNING, "activate timeouts %u\n",
+	       atomic_read(&suspend_timeouts));
+#endif
+}
+
+int ptlrpc_check_suspend(void)
+{
+	/* 1 while the suspend_timeouts counter is nonzero, 0 otherwise;
+	 * this only samples the counter and never waits. */
+	return atomic_read(&suspend_timeouts) != 0;
+}
+
+int ptlrpc_check_and_wait_suspend(struct ptlrpc_request *req)
+{
+	struct l_wait_info lwi;
+
+	if (atomic_read(&suspend_timeouts) == 0)
+		return 0;
+
+	DEBUG_REQ(D_NET, req, "-- suspend %d regular timeout",
+		  atomic_read(&suspend_timeouts));
+	lwi = LWI_INTR(NULL, NULL);
+	l_wait_event(suspend_timeouts_waitq,
+		     atomic_read(&suspend_timeouts) == 0, &lwi);
+	DEBUG_REQ(D_NET, req, "-- recharge regular timeout");
+	return 1;
+}
+
+
+static bool ir_up;	/* toggled by ptlrpc_pinger_ir_up()/_ir_down() */
+
+void ptlrpc_pinger_ir_up(void)
+{	/* log the transition and set the file-local ir_up flag */
+	CDEBUG(D_HA, "IR up\n");
+	ir_up = true;
+}
+EXPORT_SYMBOL(ptlrpc_pinger_ir_up);
+
+void ptlrpc_pinger_ir_down(void)
+{	/* log the transition and clear the file-local ir_up flag */
+	CDEBUG(D_HA, "IR down\n");
+	ir_up = false;
+}
+EXPORT_SYMBOL(ptlrpc_pinger_ir_down);
+
+static void ptlrpc_pinger_process_import(struct obd_import *imp,
+					 unsigned long this_ping)
+{	/* decide for one import: start recovery, skip, or send a ping */
+	int level;
+	int force;
+	int force_next;
+	int suppress;
+
+	spin_lock(&imp->imp_lock);	/* snapshot state under imp_lock */
+
+	level = imp->imp_state;
+	force = imp->imp_force_verify;
+	force_next = imp->imp_force_next_verify;
+	/*
+	 * This will be used below only if the import is "FULL".
+	 */
+	suppress = ir_up && OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS);
+
+	imp->imp_force_verify = 0;	/* cleared even on the early return */
+
+	if (cfs_time_aftereq(imp->imp_next_ping - 5 * CFS_TICK, this_ping) &&
+	    !force) {
+		spin_unlock(&imp->imp_lock);
+		return;	/* not due yet and no forced verify requested */
+	}
+
+	imp->imp_force_next_verify = 0;
+
+	spin_unlock(&imp->imp_lock);	/* the rest runs without imp_lock */
+
+	CDEBUG(level == LUSTRE_IMP_FULL ? D_INFO : D_HA, "%s->%s: level %s/%u "
+	       "force %u force_next %u deactive %u pingable %u suppress %u\n",
+	       imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd),
+	       ptlrpc_import_state_name(level), level, force, force_next,
+	       imp->imp_deactive, imp->imp_pingable, suppress);
+
+	if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) {
+		/* wait for a while before trying recovery again */
+		imp->imp_next_ping = ptlrpc_next_reconnect(imp);
+		if (!imp->imp_no_pinger_recover)
+			ptlrpc_initiate_recovery(imp);
+	} else if (level != LUSTRE_IMP_FULL ||
+		   imp->imp_obd->obd_no_recov ||
+		   imp_is_deactive(imp)) {
+		CDEBUG(D_HA, "%s->%s: not pinging (in recovery "
+		       "or recovery disabled: %s)\n",
+		       imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd),
+		       ptlrpc_import_state_name(level));
+	} else if ((imp->imp_pingable && !suppress) || force_next || force) {
+		ptlrpc_ping(imp);	/* return value is not checked */
+	}
+}
+
+static int ptlrpc_pinger_main(void *arg)
+{	/* pinger thread body; arg is the struct ptlrpc_thread for it */
+	struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg;
+	ENTRY;
+
+	/* Record that the thread is running */
+	thread_set_flags(thread, SVC_RUNNING);
+	wake_up(&thread->t_ctl_waitq);	/* the starter waits on this queue */
+
+	/* And now, loop forever, pinging as needed. */
+	while (1) {
+		cfs_time_t this_ping = cfs_time_current();
+		struct l_wait_info lwi;
+		cfs_duration_t time_to_next_wake;
+		struct timeout_item *item;
+		struct list_head *iter;
+
+		mutex_lock(&pinger_mutex);	/* covers both list walks */
+		list_for_each_entry(item, &timeout_list, ti_chain) {
+			item->ti_cb(item, item->ti_cb_data);
+		}
+		list_for_each(iter, &pinger_imports) {
+			struct obd_import *imp =
+				list_entry(iter, struct obd_import,
+					       imp_pinger_chain);
+
+			ptlrpc_pinger_process_import(imp, this_ping);
+			/* obd_timeout might have changed */
+			if (imp->imp_pingable && imp->imp_next_ping &&
+			    cfs_time_after(imp->imp_next_ping,
+					   cfs_time_add(this_ping,
+							cfs_time_seconds(PING_INTERVAL))))
+				ptlrpc_update_next_ping(imp, 0);
+		}
+		mutex_unlock(&pinger_mutex);
+		/* update memory usage info */
+		obd_update_maxusage();
+
+		/* Wait until the next ping time, or until we're stopped. */
+		time_to_next_wake = pinger_check_timeout(this_ping);
+		/* The ping sent by ptlrpc_send_rpc may get sent out
+		   say .01 second after this.
+		   ptlrpc_pinger_sending_on_import will then set the
+		   next ping time to next_ping + .01 sec, which means
+		   we will SKIP the next ping at next_ping, and the
+		   ping will get sent 2 timeouts from now!  Beware. */
+		CDEBUG(D_INFO, "next wakeup in "CFS_DURATION_T" ("
+		       CFS_TIME_T")\n", time_to_next_wake,
+		       cfs_time_add(this_ping,cfs_time_seconds(PING_INTERVAL)));
+		if (time_to_next_wake > 0) { /* stop flag only checked here */
+			lwi = LWI_TIMEOUT(max_t(cfs_duration_t,
+						time_to_next_wake,
+						cfs_time_seconds(1)),
+					  NULL, NULL);
+			l_wait_event(thread->t_ctl_waitq,
+				     thread_is_stopping(thread) ||
+				     thread_is_event(thread),
+				     &lwi);
+			if (thread_test_and_clear_flags(thread, SVC_STOPPING)) {
+				EXIT;
+				break;
+			} else {
+				/* woken after adding import to reset timer */
+				thread_test_and_clear_flags(thread, SVC_EVENT);
+			}
+		}
+	}
+
+	thread_set_flags(thread, SVC_STOPPED);
+	wake_up(&thread->t_ctl_waitq);	/* the stopper waits on this queue */
+
+	CDEBUG(D_NET, "pinger thread exiting, process %d\n", current_pid());
+	return 0;
+}
+
+static struct ptlrpc_thread *pinger_thread = NULL; /* set by start/stop */
+
+int ptlrpc_start_pinger(void)
+{
+	struct l_wait_info lwi = { 0 };
+	struct task_struct *task;
+	int rc;
+	ENTRY;
+
+	if (pinger_thread != NULL)
+		RETURN(-EALREADY);
+
+	OBD_ALLOC_PTR(pinger_thread);
+	if (pinger_thread == NULL)
+		RETURN(-ENOMEM);
+	init_waitqueue_head(&pinger_thread->t_ctl_waitq);
+	init_waitqueue_head(&suspend_timeouts_waitq);
+
+	strcpy(pinger_thread->t_name, "ll_ping");
+	/* test the pointer itself; an int copy of it can mimic an errno */
+	task = kthread_run(ptlrpc_pinger_main,
+			   pinger_thread, pinger_thread->t_name);
+	if (IS_ERR(task)) {
+		rc = PTR_ERR(task);
+		CERROR("cannot start thread: %d\n", rc);
+		OBD_FREE(pinger_thread, sizeof(*pinger_thread));
+		pinger_thread = NULL;
+		RETURN(rc);
+	}
+	l_wait_event(pinger_thread->t_ctl_waitq,
+		     thread_is_running(pinger_thread), &lwi);
+
+	if (suppress_pings)
+		CWARN("Pings will be suppressed at the request of the "
+		      "administrator.  The configuration shall meet the "
+		      "additional requirements described in the manual.  "
+		      "(Search for the \"suppress_pings\" kernel module "
+		      "parameter.)\n");
+
+	RETURN(0);
+}
+
+int ptlrpc_pinger_remove_timeouts(void);
+
+int ptlrpc_stop_pinger(void)
+{
+	struct l_wait_info lwi = { 0 };
+	ENTRY;
+
+	if (pinger_thread == NULL)
+		RETURN(-EALREADY);
+
+	ptlrpc_pinger_remove_timeouts();
+	mutex_lock(&pinger_mutex);
+	thread_set_flags(pinger_thread, SVC_STOPPING);
+	wake_up(&pinger_thread->t_ctl_waitq);
+	mutex_unlock(&pinger_mutex);
+
+	/* the pinger thread sets SVC_STOPPED right before it returns */
+	l_wait_event(pinger_thread->t_ctl_waitq,
+		     thread_is_stopped(pinger_thread), &lwi);
+
+	OBD_FREE_PTR(pinger_thread);
+	pinger_thread = NULL;
+	RETURN(0);
+}
+
+void ptlrpc_pinger_sending_on_import(struct obd_import *imp)
+{
+	ptlrpc_update_next_ping(imp, 0);
+}
+EXPORT_SYMBOL(ptlrpc_pinger_sending_on_import);
+
+/* Caller must hold imp->imp_lock (asserted below). */
+void ptlrpc_pinger_commit_expected(struct obd_import *imp)
+{
+	ptlrpc_update_next_ping(imp, 1);
+	/* spin_is_locked() is always 0 on UP builds, so don't LASSERT on it */
+	assert_spin_locked(&imp->imp_lock);
+	/* Avoid reading stale imp_connect_data.  When not sure if pings are
+	 * expected or not on next connection, we assume they are not and force
+	 * one anyway to guarantee the chance of updating
+	 * imp_peer_committed_transno. */
+	if (imp->imp_state != LUSTRE_IMP_FULL ||
+	    OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS))
+		imp->imp_force_next_verify = 1;
+}
+
+/* Add \a imp to the pinger list; -EALREADY if it is already linked. */
+int ptlrpc_pinger_add_import(struct obd_import *imp)
+{
+	ENTRY;
+	if (!list_empty(&imp->imp_pinger_chain))
+		RETURN(-EALREADY);
+
+	mutex_lock(&pinger_mutex);
+	CDEBUG(D_HA, "adding pingable import %s->%s\n",
+	       imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
+	/* if we add to pinger we want recovery on this import */
+	imp->imp_obd->obd_no_recov = 0;
+	ptlrpc_update_next_ping(imp, 0);
+	/* XXX: imports are kept in insertion order; sorting is still TODO */
+	list_add_tail(&imp->imp_pinger_chain, &pinger_imports);
+	class_import_get(imp);	/* dropped in ptlrpc_pinger_del_import() */
+
+	ptlrpc_pinger_wake_up();
+	mutex_unlock(&pinger_mutex);
+	RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_pinger_add_import);
+
+/* Unlink \a imp from the pinger list; -ENOENT if it was not linked. */
+int ptlrpc_pinger_del_import(struct obd_import *imp)
+{
+	ENTRY;	/* NOTE(review): the list_empty() test below runs unlocked */
+	if (list_empty(&imp->imp_pinger_chain))
+		RETURN(-ENOENT);
+	mutex_lock(&pinger_mutex);
+	list_del_init(&imp->imp_pinger_chain);
+	CDEBUG(D_HA, "removing pingable import %s->%s\n",
+	       imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
+	/* if we remove from pinger we don't want recovery on this import */
+	imp->imp_obd->obd_no_recov = 1;
+	class_import_put(imp);	/* ref taken by ptlrpc_pinger_add_import() */
+	mutex_unlock(&pinger_mutex);
+	RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_pinger_del_import);
+
+/**
+ * Allocate a timeout item and fill it in from the arguments.  The item is
+ * not linked into any list here.  Returns NULL if the allocation fails.
+ */
+struct timeout_item* ptlrpc_new_timeout(int time, enum timeout_event event,
+					timeout_cb_t cb, void *data)
+{
+	struct timeout_item *item;
+
+	OBD_ALLOC_PTR(item);
+	if (item == NULL)
+		return NULL;
+
+	item->ti_timeout = time;
+	item->ti_event = event;
+	item->ti_cb = cb;
+	item->ti_cb_data = data;
+	INIT_LIST_HEAD(&item->ti_chain);
+	INIT_LIST_HEAD(&item->ti_obd_list);
+
+	return item;
+}
+
+/**
+ * Register a timeout event with the pinger thread; caller holds pinger_mutex.
+ * Note: the timeout list is kept sorted by increasing timeout value.
+ */
+static struct timeout_item*
+ptlrpc_pinger_register_timeout(int time, enum timeout_event event,
+			       timeout_cb_t cb, void *data)
+{
+	struct timeout_item *item, *tmp;
+
+	LASSERT(mutex_is_locked(&pinger_mutex));
+	/* an item already registered for this event is returned as-is */
+	list_for_each_entry(item, &timeout_list, ti_chain)
+		if (item->ti_event == event)
+			goto out;
+
+	item = ptlrpc_new_timeout(time, event, cb, data);
+	if (item) {	/* NULL (allocation failure) is passed back as-is */
+		list_for_each_entry_reverse(tmp, &timeout_list, ti_chain) {
+			if (tmp->ti_timeout < time) {
+				list_add(&item->ti_chain, &tmp->ti_chain);
+				goto out;
+			}
+		}
+		list_add(&item->ti_chain, &timeout_list);	/* no smaller timeout: goes first */
+	}
+out:
+	return item;
+}
+
+/* Attach @obd_list to the timeout item for @event, registering the item
+ * first if needed.  Returns 0, or -EINVAL if no item could be obtained.
+ */
+int ptlrpc_add_timeout_client(int time, enum timeout_event event,
+			      timeout_cb_t cb, void *data,
+			      struct list_head *obd_list)
+{
+	struct timeout_item *item;
+	int rc = -EINVAL;
+
+	mutex_lock(&pinger_mutex);
+	item = ptlrpc_pinger_register_timeout(time, event, cb, data);
+	if (item != NULL) {
+		list_add(obd_list, &item->ti_obd_list);
+		rc = 0;
+	}
+	mutex_unlock(&pinger_mutex);
+	return rc;
+}
+EXPORT_SYMBOL(ptlrpc_add_timeout_client);
+
+/* Detach @obd_list from its timeout item; frees the item once it is unused. */
+int ptlrpc_del_timeout_client(struct list_head *obd_list,
+			      enum timeout_event event)
+{
+	struct timeout_item *ti = NULL, *item;
+
+	if (list_empty(obd_list))
+		return 0;
+	mutex_lock(&pinger_mutex);
+	list_del_init(obd_list);
+	/* If no obd is attached to this event's timeout item any more,
+	 * remove the item from the pinger and free it.
+	 */
+	list_for_each_entry(item, &timeout_list, ti_chain) {
+		if (item->ti_event == event) {
+			ti = item;
+			break;
+		}
+	}
+	LASSERTF(ti != NULL, "ti is NULL ! \n");
+	if (list_empty(&ti->ti_obd_list)) {
+		list_del(&ti->ti_chain);
+		OBD_FREE_PTR(ti);
+	}
+	mutex_unlock(&pinger_mutex);
+	return 0;
+}
+EXPORT_SYMBOL(ptlrpc_del_timeout_client);
+
+int ptlrpc_pinger_remove_timeouts(void)
+{
+	struct timeout_item *item, *tmp;
+
+	mutex_lock(&pinger_mutex);
+	list_for_each_entry_safe(item, tmp, &timeout_list, ti_chain) {
+		LASSERT(list_empty(&item->ti_obd_list));
+		list_del(&item->ti_chain);
+		OBD_FREE_PTR(item);
+	}
+	mutex_unlock(&pinger_mutex);
+	return 0;
+}
+
+void ptlrpc_pinger_wake_up(void)	/* (void): match the header prototype */
+{
+	thread_add_flags(pinger_thread, SVC_EVENT);	/* mark an event pending */
+	wake_up(&pinger_thread->t_ctl_waitq);		/* then kick the waiter */
+}
+
+/* Ping evictor thread */
+#define PET_READY     1
+#define PET_TERMINATE 2
+
+static int	       pet_refcount = 0;
+static int	       pet_state;
+static wait_queue_head_t       pet_waitq;
+LIST_HEAD(pet_list);
+static DEFINE_SPINLOCK(pet_lock);
+
+int ping_evictor_wake(struct obd_export *exp)
+{
+	struct obd_device *obd;
+
+	spin_lock(&pet_lock);
+	if (pet_state != PET_READY) {
+		/* eventually the new obd will call here again. */
+		spin_unlock(&pet_lock);
+		return 1;
+	}
+
+	obd = class_exp2obd(exp);
+	if (list_empty(&obd->obd_evict_list)) {
+		class_incref(obd, "evictor", obd);
+		list_add(&obd->obd_evict_list, &pet_list);
+	}
+	spin_unlock(&pet_lock);
+
+	wake_up(&pet_waitq);
+	return 0;
+}
+
+/* Thread body: evict stale exports of every obd queued on pet_list. */
+static int ping_evictor_main(void *arg)
+{
+	struct obd_device *obd;
+	struct obd_export *exp;
+	struct l_wait_info lwi = { 0 };
+	time_t expire_time;
+	ENTRY;
+
+	unshare_fs_struct();
+	CDEBUG(D_HA, "Starting Ping Evictor\n");
+	pet_state = PET_READY;
+	while (1) {
+		l_wait_event(pet_waitq, (!list_empty(&pet_list)) ||
+			     (pet_state == PET_TERMINATE), &lwi);
+
+		/* keep looping until every queued obd has been processed */
+		if ((pet_state == PET_TERMINATE) && list_empty(&pet_list))
+			break;
+
+		/* we only get here if pet_list is non-empty, and the end of
+		 * this loop is the only place which removes entries from it,
+		 * so the lock is not strictly necessary. */
+		spin_lock(&pet_lock);
+		obd = list_entry(pet_list.next, struct obd_device,
+				     obd_evict_list);
+		spin_unlock(&pet_lock);
+
+		expire_time = cfs_time_current_sec() - PING_EVICT_TIMEOUT;
+
+		CDEBUG(D_HA, "evicting all exports of obd %s older than %ld\n",
+		       obd->obd_name, expire_time);
+
+		/* Exports can't be deleted out of the list while we hold
+		 * the obd lock (class_unlink_export), which means we can't
+		 * lose the last ref on the export.  If they've already been
+		 * removed from the list, we won't find them here. */
+		spin_lock(&obd->obd_dev_lock);
+		while (!list_empty(&obd->obd_exports_timed)) {
+			exp = list_entry(obd->obd_exports_timed.next,
+					     struct obd_export,
+					     exp_obd_chain_timed);
+			if (expire_time > exp->exp_last_request_time) {
+				class_export_get(exp);
+				spin_unlock(&obd->obd_dev_lock);
+				LCONSOLE_WARN("%s: haven't heard from client %s"
+					      " (at %s) in %ld seconds. I think"
+					      " it's dead, and I am evicting"
+					      " it. exp %p, cur %ld expire %ld"
+					      " last %ld\n",
+					      obd->obd_name,
+					      obd_uuid2str(&exp->exp_client_uuid),
+					      obd_export_nid2str(exp),
+					      (long)(cfs_time_current_sec() -
+						     exp->exp_last_request_time),
+					      exp, (long)cfs_time_current_sec(),
+					      (long)expire_time,
+					      (long)exp->exp_last_request_time);
+				CDEBUG(D_HA, "Last request was at %ld\n",
+				       exp->exp_last_request_time);
+				class_fail_export(exp);
+				class_export_put(exp);
+				spin_lock(&obd->obd_dev_lock);
+			} else {
+				/* List is sorted, so everyone below is ok */
+				break;
+			}
+		}
+		spin_unlock(&obd->obd_dev_lock);
+
+		spin_lock(&pet_lock);
+		list_del_init(&obd->obd_evict_list);
+		spin_unlock(&pet_lock);
+
+		class_decref(obd, "evictor", obd);
+	}
+	CDEBUG(D_HA, "Exiting Ping Evictor\n");
+
+	RETURN(0);
+}
+
+void ping_evictor_start(void)
+{
+	task_t *task;
+
+	if (++pet_refcount > 1)
+		return;
+
+	init_waitqueue_head(&pet_waitq);
+
+	task = kthread_run(ping_evictor_main, NULL, "ll_evictor");
+	if (IS_ERR(task)) {
+		pet_refcount--;
+		CERROR("Cannot start ping evictor thread: %ld\n",
+			PTR_ERR(task));
+	}
+}
+EXPORT_SYMBOL(ping_evictor_start);
+
+void ping_evictor_stop(void)
+{
+	if (--pet_refcount > 0)
+		return;
+
+	pet_state = PET_TERMINATE;
+	wake_up(&pet_waitq);
+}
+EXPORT_SYMBOL(ping_evictor_stop);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h b/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h
new file mode 100644
index 000000000000..9ba760089b9d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h
@@ -0,0 +1,303 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/* Intramodule declarations for ptlrpc. */
+
+#ifndef PTLRPC_INTERNAL_H
+#define PTLRPC_INTERNAL_H
+
+#include "../ldlm/ldlm_internal.h"
+
+struct ldlm_namespace;
+struct obd_import;
+struct ldlm_res_id;
+struct ptlrpc_request_set;
+extern int test_req_buffer_pressure;
+extern struct mutex ptlrpc_all_services_mutex;
+
+int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait);
+/* ptlrpcd.c */
+int ptlrpcd_start(int index, int max, const char *name, struct ptlrpcd_ctl *pc);
+
+/* client.c */
+struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw,
+					 unsigned type, unsigned portal);
+void ptlrpc_init_xid(void);
+
+/* events.c */
+int ptlrpc_init_portals(void);
+void ptlrpc_exit_portals(void);
+
+void ptlrpc_request_handle_notconn(struct ptlrpc_request *);
+void lustre_assert_wire_constants(void);
+int ptlrpc_import_in_recovery(struct obd_import *imp);
+int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt);
+void ptlrpc_handle_failed_import(struct obd_import *imp);
+int ptlrpc_replay_next(struct obd_import *imp, int *inflight);
+void ptlrpc_initiate_recovery(struct obd_import *imp);
+
+int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset);
+int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset);
+
+#ifdef LPROCFS
+void ptlrpc_lprocfs_register_service(struct proc_dir_entry *proc_entry,
+				     struct ptlrpc_service *svc);
+void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc);
+void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount);
+void ptlrpc_lprocfs_do_request_stat (struct ptlrpc_request *req,
+				     long q_usec, long work_usec);
+#else
+#define ptlrpc_lprocfs_register_service(params...) do{}while(0)
+#define ptlrpc_lprocfs_unregister_service(params...) do{}while(0)
+#define ptlrpc_lprocfs_rpc_sent(params...) do{}while(0)
+#define ptlrpc_lprocfs_do_request_stat(params...) do{}while(0)
+#endif /* LPROCFS */
+
+/* NRS */
+
+/**
+ * NRS core object.
+ *
+ * Holds NRS core fields.
+ */
+struct nrs_core {
+	/**
+	 * Protects nrs_core::nrs_policies, serializes external policy
+	 * registration/unregistration, and NRS core lprocfs operations.
+	 */
+	struct mutex nrs_mutex;
+	/* XXX: This is just for liblustre. Remove the #if defined directive
+	 * when the * "cfs_" prefix is dropped from cfs_list_head. */
+	/**
+	 * List of all policy descriptors registered with NRS core; protected
+	 * by nrs_core::nrs_mutex.
+	 */
+	struct list_head nrs_policies;
+
+};
+
+int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc);
+void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc);
+
+void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt,
+			       struct ptlrpc_request *req, bool hp);
+void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req);
+void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req);
+void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt,
+			struct ptlrpc_request *req, bool hp);
+
+struct ptlrpc_request *
+ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp,
+			   bool peek, bool force);
+
+static inline struct ptlrpc_request *
+ptlrpc_nrs_req_get_nolock(struct ptlrpc_service_part *svcpt, bool hp,
+			  bool force)
+{
+	return ptlrpc_nrs_req_get_nolock0(svcpt, hp, false, force);
+}
+
+static inline struct ptlrpc_request *
+ptlrpc_nrs_req_peek_nolock(struct ptlrpc_service_part *svcpt, bool hp)
+{
+	return ptlrpc_nrs_req_get_nolock0(svcpt, hp, true, false);
+}
+
+void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req);
+bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp);
+
+int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc,
+			      enum ptlrpc_nrs_queue_type queue, char *name,
+			      enum ptlrpc_nrs_ctl opc, bool single, void *arg);
+
+int ptlrpc_nrs_init(void);
+void ptlrpc_nrs_fini(void);
+
+static inline bool nrs_svcpt_has_hp(const struct ptlrpc_service_part *svcpt)
+{
+	return svcpt->scp_nrs_hp != NULL;
+}
+
+static inline bool nrs_svc_has_hp(const struct ptlrpc_service *svc)
+{
+	/**
+	 * If the first service partition has an HP NRS head, all service
+	 * partitions will.
+	 */
+	return nrs_svcpt_has_hp(svc->srv_parts[0]);
+}
+
+static inline
+struct ptlrpc_nrs *nrs_svcpt2nrs(struct ptlrpc_service_part *svcpt, bool hp)
+{
+	LASSERT(ergo(hp, nrs_svcpt_has_hp(svcpt)));
+	return hp ? svcpt->scp_nrs_hp : &svcpt->scp_nrs_reg;
+}
+
+static inline int nrs_pol2cptid(const struct ptlrpc_nrs_policy *policy)
+{
+	return policy->pol_nrs->nrs_svcpt->scp_cpt;
+}
+
+static inline
+struct ptlrpc_service *nrs_pol2svc(struct ptlrpc_nrs_policy *policy)
+{
+	return policy->pol_nrs->nrs_svcpt->scp_service;
+}
+
+static inline
+struct ptlrpc_service_part *nrs_pol2svcpt(struct ptlrpc_nrs_policy *policy)
+{
+	return policy->pol_nrs->nrs_svcpt;
+}
+
+static inline
+struct cfs_cpt_table *nrs_pol2cptab(struct ptlrpc_nrs_policy *policy)
+{
+	return nrs_pol2svc(policy)->srv_cptable;
+}
+
+static inline struct ptlrpc_nrs_resource *
+nrs_request_resource(struct ptlrpc_nrs_request *nrq)
+{
+	LASSERT(nrq->nr_initialized);
+	LASSERT(!nrq->nr_finalized);
+
+	return nrq->nr_res_ptrs[nrq->nr_res_idx];
+}
+
+static inline
+struct ptlrpc_nrs_policy *nrs_request_policy(struct ptlrpc_nrs_request *nrq)
+{
+	return nrs_request_resource(nrq)->res_policy;
+}
+
+#define NRS_LPROCFS_QUANTUM_NAME_REG	"reg_quantum:"
+#define NRS_LPROCFS_QUANTUM_NAME_HP	"hp_quantum:"
+
+/**
+ * the maximum size of nrs_crrn_client::cc_quantum and nrs_orr_data::od_quantum.
+ */
+#define LPROCFS_NRS_QUANTUM_MAX		65535
+
+/**
+ * Max valid command string is the size of the labels, plus "65535" twice, plus
+ * a separating space character.
+ */
+#define LPROCFS_NRS_WR_QUANTUM_MAX_CMD					       \
+ sizeof(NRS_LPROCFS_QUANTUM_NAME_REG __stringify(LPROCFS_NRS_QUANTUM_MAX) " "  \
+	NRS_LPROCFS_QUANTUM_NAME_HP __stringify(LPROCFS_NRS_QUANTUM_MAX))
+
+/* recovd_thread.c */
+
+int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink);
+
+/* pers.c */
+void ptlrpc_fill_bulk_md(lnet_md_t *md, struct ptlrpc_bulk_desc *desc,
+			 int mdcnt);
+void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page,
+			  int pageoffset, int len);
+
+/* pack_generic.c */
+struct ptlrpc_reply_state *
+lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt);
+void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs);
+
+/* pinger.c */
+int ptlrpc_start_pinger(void);
+int ptlrpc_stop_pinger(void);
+void ptlrpc_pinger_sending_on_import(struct obd_import *imp);
+void ptlrpc_pinger_commit_expected(struct obd_import *imp);
+void ptlrpc_pinger_wake_up(void);
+void ptlrpc_ping_import_soon(struct obd_import *imp);
+int ping_evictor_wake(struct obd_export *exp);
+
+/* sec_null.c */
+int  sptlrpc_null_init(void);
+void sptlrpc_null_fini(void);
+
+/* sec_plain.c */
+int  sptlrpc_plain_init(void);
+void sptlrpc_plain_fini(void);
+
+/* sec_bulk.c */
+int  sptlrpc_enc_pool_init(void);
+void sptlrpc_enc_pool_fini(void);
+int sptlrpc_proc_read_enc_pool(char *page, char **start, off_t off, int count,
+			       int *eof, void *data);
+
+/* sec_lproc.c */
+int  sptlrpc_lproc_init(void);
+void sptlrpc_lproc_fini(void);
+
+/* sec_gc.c */
+int sptlrpc_gc_init(void);
+void sptlrpc_gc_fini(void);
+
+/* sec_config.c */
+void sptlrpc_conf_choose_flavor(enum lustre_sec_part from,
+				enum lustre_sec_part to,
+				struct obd_uuid *target,
+				lnet_nid_t nid,
+				struct sptlrpc_flavor *sf);
+int  sptlrpc_conf_init(void);
+void sptlrpc_conf_fini(void);
+
+/* sec.c */
+int  sptlrpc_init(void);
+void sptlrpc_fini(void);
+
+static inline int ll_rpc_recoverable_error(int rc)
+{
+	return (rc == -ENOTCONN || rc == -ENODEV);
+}
+
+static inline int tgt_mod_init(void)
+{
+	return 0;
+}
+
+static inline void tgt_mod_exit(void)
+{
+	return;
+}
+
+static inline void ptlrpc_reqset_put(struct ptlrpc_request_set *set)
+{
+	if (atomic_dec_and_test(&set->set_refcount))
+		OBD_FREE_PTR(set);
+}
+#endif /* PTLRPC_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c b/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c
new file mode 100644
index 000000000000..f6ea80f0b105
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c
@@ -0,0 +1,154 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_req_layout.h>
+
+#include "ptlrpc_internal.h"
+
+extern spinlock_t ptlrpc_last_xid_lock;
+#if RS_DEBUG
+extern spinlock_t ptlrpc_rs_debug_lock;
+#endif
+extern struct mutex pinger_mutex;
+extern struct mutex ptlrpcd_mutex;
+
+__init int ptlrpc_init(void)
+{
+	int rc, cleanup_phase = 0;
+	ENTRY;
+
+	lustre_assert_wire_constants();
+#if RS_DEBUG
+	spin_lock_init(&ptlrpc_rs_debug_lock);
+#endif
+	mutex_init(&ptlrpc_all_services_mutex);
+	mutex_init(&pinger_mutex);
+	mutex_init(&ptlrpcd_mutex);
+	ptlrpc_init_xid();
+
+	rc = req_layout_init();
+	if (rc)
+		RETURN(rc);
+
+	rc = ptlrpc_hr_init();
+	if (rc)
+		RETURN(rc);
+
+	cleanup_phase = 1;
+
+	rc = ptlrpc_init_portals();
+	if (rc)
+		GOTO(cleanup, rc);
+	cleanup_phase = 2;
+
+	rc = ptlrpc_connection_init();
+	if (rc)
+		GOTO(cleanup, rc);
+	cleanup_phase = 3;
+
+	ptlrpc_put_connection_superhack = ptlrpc_connection_put;
+
+	rc = ptlrpc_start_pinger();
+	if (rc)
+		GOTO(cleanup, rc);
+	cleanup_phase = 4;
+
+	rc = ldlm_init();
+	if (rc)
+		GOTO(cleanup, rc);
+	cleanup_phase = 5;
+
+	rc = sptlrpc_init();
+	if (rc)
+		GOTO(cleanup, rc);
+
+	cleanup_phase = 7;
+	rc = ptlrpc_nrs_init();
+	if (rc)
+		GOTO(cleanup, rc);
+
+	cleanup_phase = 8;
+	rc = tgt_mod_init();
+	if (rc)
+		GOTO(cleanup, rc);
+	RETURN(0);
+
+cleanup:
+	switch(cleanup_phase) {
+	case 8:
+		ptlrpc_nrs_fini();
+	case 7:
+		sptlrpc_fini();
+	case 5:
+		ldlm_exit();
+	case 4:
+		ptlrpc_stop_pinger();
+	case 3:
+		ptlrpc_connection_fini();
+	case 2:
+		ptlrpc_exit_portals();
+	case 1:
+		ptlrpc_hr_fini();
+		req_layout_fini();
+	default: ;
+	}
+
+	return rc;
+}
+
+static void __exit ptlrpc_exit(void)	/* module unload: tear everything down */
+{
+	tgt_mod_exit();
+	ptlrpc_nrs_fini();
+	sptlrpc_fini();
+	ldlm_exit();
+	ptlrpc_stop_pinger();
+	ptlrpc_exit_portals();
+	ptlrpc_hr_fini();
+	ptlrpc_connection_fini();	/* NOTE(review): ptlrpc_init()'s error path runs this before ptlrpc_exit_portals() */
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Request Processor and Lock Management");
+MODULE_LICENSE("GPL");
+
+cfs_module(ptlrpc, "1.0.0", ptlrpc_init, ptlrpc_exit);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c b/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c
new file mode 100644
index 000000000000..185841fe8d00
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c
@@ -0,0 +1,827 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/ptlrpcd.c
+ */
+
+/** \defgroup ptlrpcd PortalRPC daemon
+ *
+ * ptlrpcd is a special thread with its own set where other user might add
+ * requests when they don't want to wait for their completion.
+ * PtlRPCD will take care of sending such requests and then processing their
+ * replies and calling completion callbacks as necessary.
+ * The callbacks are called directly from ptlrpcd context.
+ * It is important to never significantly block (esp. on RPCs!) within such
+ * completion handler or a deadlock might occur where ptlrpcd enters some
+ * callback that attempts to send another RPC and wait for it to return,
+ * during which time ptlrpcd is completely blocked, so e.g. if import
+ * fails, recovery cannot progress because connection requests are also
+ * sent by ptlrpcd.
+ *
+ * @{
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+# include <linux/libcfs/libcfs.h>
+
+#include <lustre_net.h>
+# include <lustre_lib.h>
+
+#include <lustre_ha.h>
+#include <obd_class.h>   /* for obd_zombie */
+#include <obd_support.h> /* for OBD_FAIL_CHECK */
+#include <cl_object.h> /* cl_env_{get,put}() */
+#include <lprocfs_status.h>
+
+#include "ptlrpc_internal.h"
+
+struct ptlrpcd {
+	int		pd_size;
+	int		pd_index;
+	int		pd_nthreads;
+	struct ptlrpcd_ctl pd_thread_rcv;
+	struct ptlrpcd_ctl pd_threads[0];
+};
+
+static int max_ptlrpcds;
+CFS_MODULE_PARM(max_ptlrpcds, "i", int, 0644,
+		"Max ptlrpcd thread count to be started.");
+
+static int ptlrpcd_bind_policy = PDB_POLICY_PAIR;
+CFS_MODULE_PARM(ptlrpcd_bind_policy, "i", int, 0644,
+		"Ptlrpcd threads binding mode.");
+static struct ptlrpcd *ptlrpcds;
+
+struct mutex ptlrpcd_mutex;
+static int ptlrpcd_users = 0;
+
+void ptlrpcd_wake(struct ptlrpc_request *req)
+{
+	struct ptlrpc_request_set *rq_set = req->rq_set;
+
+	LASSERT(rq_set != NULL);
+
+	wake_up(&rq_set->set_waitq);
+}
+EXPORT_SYMBOL(ptlrpcd_wake);
+
+/* Pick the ptlrpcd control structure that will handle \a req under \a policy. */
+static struct ptlrpcd_ctl *
+ptlrpcd_select_pc(struct ptlrpc_request *req, pdl_policy_t policy, int index)
+{
+	int idx = 0;
+
+	if (req != NULL && req->rq_send_state != LUSTRE_IMP_FULL)
+		return &ptlrpcds->pd_thread_rcv;
+	switch (policy) {
+	case PDL_POLICY_SAME:
+		idx = smp_processor_id() % ptlrpcds->pd_nthreads;
+		break;
+	case PDL_POLICY_LOCAL:
+		/* Until the CPU partition patches are available, handle this
+		 * the same as "PDL_POLICY_ROUND". */
+# ifdef CFS_CPU_MODE_NUMA
+# warning "fix this code to use new CPU partition APIs"
+# endif
+		/* Fall through to PDL_POLICY_ROUND until the
+		 * CPU partition patches are available. */
+		index = -1;
+	case PDL_POLICY_PREFERRED:
+		if (index >= 0 && index < num_online_cpus()) {
+			idx = index % ptlrpcds->pd_nthreads;
+			break;
+		}
+		/* Fall through to PDL_POLICY_ROUND for bad index. */
+	default:
+		/* Fall through to PDL_POLICY_ROUND for unknown policy. */
+	case PDL_POLICY_ROUND:
+		/* We do not care whether it is strict load balance. */
+		idx = ptlrpcds->pd_index + 1;
+		if (idx == smp_processor_id())	/* skip the slot matching this CPU */
+			idx++;
+		idx %= ptlrpcds->pd_nthreads;
+		ptlrpcds->pd_index = idx;
+		break;
+	}
+
+	return &ptlrpcds->pd_threads[idx];
+}
+
+/**
+ * Move all request from an existing request set to the ptlrpcd queue.
+ * All requests from the set must be in phase RQ_PHASE_NEW.
+ */
+void ptlrpcd_add_rqset(struct ptlrpc_request_set *set)
+{
+	struct list_head *tmp, *pos;
+	struct ptlrpcd_ctl *pc;
+	struct ptlrpc_request_set *new;
+	int count, i;
+
+	pc = ptlrpcd_select_pc(NULL, PDL_POLICY_LOCAL, -1);
+	new = pc->pc_set;
+
+	list_for_each_safe(pos, tmp, &set->set_requests) {
+		struct ptlrpc_request *req =
+			list_entry(pos, struct ptlrpc_request,
+				       rq_set_chain);
+
+		LASSERT(req->rq_phase == RQ_PHASE_NEW);
+		req->rq_set = new;
+		req->rq_queued_time = cfs_time_current();
+	}
+
+	spin_lock(&new->set_new_req_lock);
+	list_splice_init(&set->set_requests, &new->set_new_requests);
+	i = atomic_read(&set->set_remaining);
+	count = atomic_add_return(i, &new->set_new_count);
+	atomic_set(&set->set_remaining, 0);
+	spin_unlock(&new->set_new_req_lock);
+	if (count == i) {
+		wake_up(&new->set_waitq);
+
+		/* XXX: It maybe unnecessary to wakeup all the partners. But to
+		 *      guarantee the async RPC can be processed ASAP, we have
+		 *      no other better choice. It maybe fixed in future. */
+		for (i = 0; i < pc->pc_npartners; i++)
+			wake_up(&pc->pc_partners[i]->pc_set->set_waitq);
+	}
+}
+EXPORT_SYMBOL(ptlrpcd_add_rqset);
+
+/**
+ * Return transferred RPCs count.
+ */
+static int ptlrpcd_steal_rqset(struct ptlrpc_request_set *des,
+			       struct ptlrpc_request_set *src)
+{
+	struct list_head *tmp, *pos;
+	struct ptlrpc_request *req;
+	int rc = 0;
+
+	spin_lock(&src->set_new_req_lock);
+	if (likely(!list_empty(&src->set_new_requests))) {
+		list_for_each_safe(pos, tmp, &src->set_new_requests) {
+			req = list_entry(pos, struct ptlrpc_request,
+					     rq_set_chain);
+			req->rq_set = des;
+		}
+		list_splice_init(&src->set_new_requests,
+				     &des->set_requests);
+		rc = atomic_read(&src->set_new_count);
+		atomic_add(rc, &des->set_remaining);
+		atomic_set(&src->set_new_count, 0);
+	}
+	spin_unlock(&src->set_new_req_lock);
+	return rc;
+}
+
+/**
+ * Requests that are added to the ptlrpcd queue are sent via
+ * ptlrpcd_check->ptlrpc_check_set().
+ */
+void ptlrpcd_add_req(struct ptlrpc_request *req, pdl_policy_t policy, int idx)
+{
+	struct ptlrpcd_ctl *pc;
+
+	if (req->rq_reqmsg)
+		lustre_msg_set_jobid(req->rq_reqmsg, NULL);
+
+	spin_lock(&req->rq_lock);
+	if (req->rq_invalid_rqset) {
+		struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(5),
+						     back_to_sleep, NULL);
+
+		req->rq_invalid_rqset = 0;
+		spin_unlock(&req->rq_lock);
+		l_wait_event(req->rq_set_waitq, (req->rq_set == NULL), &lwi);
+	} else if (req->rq_set) {
+		/* If we have a valid "rq_set", just reuse it to avoid linking
+		 * the request twice. */
+		LASSERT(req->rq_phase == RQ_PHASE_NEW);
+		LASSERT(req->rq_send_state == LUSTRE_IMP_REPLAY);
+
+		/* ptlrpc_check_set will decrease the count */
+		atomic_inc(&req->rq_set->set_remaining);
+		spin_unlock(&req->rq_lock);
+		wake_up(&req->rq_set->set_waitq);
+		return;
+	} else {
+		spin_unlock(&req->rq_lock);
+	}
+
+	pc = ptlrpcd_select_pc(req, policy, idx);
+
+	DEBUG_REQ(D_INFO, req, "add req [%p] to pc [%s:%d]",
+		  req, pc->pc_name, pc->pc_index);
+
+	ptlrpc_set_add_new_req(pc, req);	/* queue req on the chosen thread's set */
+}
+EXPORT_SYMBOL(ptlrpcd_add_req);
+
+static inline void ptlrpc_reqset_get(struct ptlrpc_request_set *set)
+{
+	atomic_inc(&set->set_refcount);
+}
+
+/**
+ * Check if there is more work to do on ptlrpcd set (own or partners').
+ * Returns non-zero if yes; the value may be a request count, not just 1.
+ */
+static int ptlrpcd_check(struct lu_env *env, struct ptlrpcd_ctl *pc)
+{
+	struct list_head *tmp, *pos;
+	struct ptlrpc_request *req;
+	struct ptlrpc_request_set *set = pc->pc_set;
+	int rc = 0;
+	int rc2;
+	ENTRY;
+	/* move freshly queued requests from set_new_requests into the set */
+	if (atomic_read(&set->set_new_count)) {
+		spin_lock(&set->set_new_req_lock);
+		if (likely(!list_empty(&set->set_new_requests))) {
+			list_splice_init(&set->set_new_requests,
+					     &set->set_requests);
+			atomic_add(atomic_read(&set->set_new_count),
+				       &set->set_remaining);
+			atomic_set(&set->set_new_count, 0);
+			/*
+			 * Need to calculate its timeout.
+			 */
+			rc = 1;
+		}
+		spin_unlock(&set->set_new_req_lock);
+	}
+
+	/* We should call lu_env_refill() before handling new requests to make
+	 * sure that the env keys the requests depend on really exist.
+	 */
+	rc2 = lu_env_refill(env);
+	if (rc2 != 0) {
+		/*
+		 * XXX This is very awkward situation, because
+		 * execution can neither continue (request
+		 * interpreters assume that env is set up), nor repeat
+		 * the loop (as this potentially results in a tight
+		 * loop of -ENOMEM's).
+		 *
+		 * Fortunately, refill only ever does something when
+		 * new modules are loaded, i.e., early during boot up.
+		 */
+		CERROR("Failure to refill session: %d\n", rc2);
+		RETURN(rc);
+	}
+
+	if (atomic_read(&set->set_remaining))
+		rc |= ptlrpc_check_set(env, set);
+
+	if (!list_empty(&set->set_requests)) {
+		/*
+		 * XXX: our set never completes, so we prune the completed
+		 * reqs after each iteration. boy could this be smarter.
+		 */
+		list_for_each_safe(pos, tmp, &set->set_requests) {
+			req = list_entry(pos, struct ptlrpc_request,
+					     rq_set_chain);
+			if (req->rq_phase != RQ_PHASE_COMPLETE)
+				continue;
+
+			list_del_init(&req->rq_set_chain);
+			req->rq_set = NULL;
+			ptlrpc_req_finished(req);
+		}
+	}
+
+	if (rc == 0) {
+		/*
+		 * If new requests have been added, make sure to wake up.
+		 */
+		rc = atomic_read(&set->set_new_count);
+
+		/* If we have nothing to do, check whether we can take some
+		 * work from our partner threads. */
+		if (rc == 0 && pc->pc_npartners > 0) {
+			struct ptlrpcd_ctl *partner;
+			struct ptlrpc_request_set *ps;
+			int first = pc->pc_cursor;
+
+			do {
+				partner = pc->pc_partners[pc->pc_cursor++];
+				if (pc->pc_cursor >= pc->pc_npartners)
+					pc->pc_cursor = 0;
+				if (partner == NULL)
+					continue;
+
+				spin_lock(&partner->pc_lock);
+				ps = partner->pc_set;
+				if (ps == NULL) {
+					spin_unlock(&partner->pc_lock);
+					continue;
+				}
+
+				ptlrpc_reqset_get(ps);
+				spin_unlock(&partner->pc_lock);
+
+				if (atomic_read(&ps->set_new_count)) {
+					rc = ptlrpcd_steal_rqset(set, ps);
+					if (rc > 0)
+						CDEBUG(D_RPCTRACE, "transfer %d"
+						       " async RPCs [%d->%d]\n",
+							rc, partner->pc_index,
+							pc->pc_index);
+				}
+				ptlrpc_reqset_put(ps);
+			} while (rc == 0 && pc->pc_cursor != first);
+		}
+	}
+
+	RETURN(rc);
+}
+
+/**
+ * Main ptlrpcd thread.
+ * ptlrpc's code paths like to execute in process context, so we have this
+ * thread which spins on a set which contains the rpcs and sends them.
+ * \a arg is the struct ptlrpcd_ctl of this thread; it runs until LIOD_STOP.
+ */
+static int ptlrpcd(void *arg)
+{
+	struct ptlrpcd_ctl *pc = arg;
+	struct ptlrpc_request_set *set = pc->pc_set;
+	struct lu_env env = { .le_ses = NULL };
+	int rc, exit = 0;
+	ENTRY;
+
+	unshare_fs_struct();
+#if defined(CONFIG_SMP)
+	if (test_bit(LIOD_BIND, &pc->pc_flags)) {
+		int index = pc->pc_index;
+
+		if (index >= 0 && index < num_possible_cpus()) {
+			while (!cpu_online(index)) {
+				if (++index >= num_possible_cpus())
+					index = 0;
+			}
+			cfs_set_cpus_allowed(current,
+				     *cpumask_of_node(cpu_to_node(index)));
+		}
+	}
+#endif
+	/*
+	 * XXX So far only "client" ptlrpcd uses an environment. In
+	 * the future, ptlrpcd thread (or a thread-set) has to be given
+	 * an argument, describing its "scope".
+	 */
+	rc = lu_context_init(&env.le_ctx,
+			     LCT_CL_THREAD|LCT_REMEMBER|LCT_NOREF);
+	complete(&pc->pc_starting);
+	/* NOTE(review): on error pc_finishing is never completed */
+	if (rc != 0)
+		RETURN(rc);
+
+	/*
+	 * This mainloop strongly resembles ptlrpc_set_wait() except that our
+	 * set never completes.  ptlrpcd_check() calls ptlrpc_check_set() when
+	 * there are requests in the set. New requests come in on the set's
+	 * new_req_list and ptlrpcd_check() moves them into the set.
+	 */
+	do {
+		struct l_wait_info lwi;
+		int timeout;
+
+		timeout = ptlrpc_set_next_timeout(set);
+		lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1),
+				  ptlrpc_expired_set, set);
+
+		lu_context_enter(&env.le_ctx);
+		l_wait_event(set->set_waitq,
+			     ptlrpcd_check(&env, pc), &lwi);
+		lu_context_exit(&env.le_ctx);
+
+		/*
+		 * Abort inflight rpcs for forced stop case.
+		 */
+		if (test_bit(LIOD_STOP, &pc->pc_flags)) {
+			if (test_bit(LIOD_FORCE, &pc->pc_flags))
+				ptlrpc_abort_set(set);
+			exit++;
+		}
+
+		/*
+		 * Let's make one more loop to make sure that ptlrpcd_check()
+		 * copied all raced new rpcs into the set so we can kill them.
+		 */
+	} while (exit < 2);
+
+	/*
+	 * Wait for inflight requests to drain.
+	 */
+	if (!list_empty(&set->set_requests))
+		ptlrpc_set_wait(set);
+	lu_context_fini(&env.le_ctx);
+
+	complete(&pc->pc_finishing);
+
+	return 0;
+}
+
+/* XXX: We want multiple CPU cores to share the async RPC load, so we start many
+ *      ptlrpcd threads. We also want to reduce the ptlrpcd overhead caused by
+ *      data transfer across CPU cores, so we bind each ptlrpcd thread to a
+ *      specific CPU core. But binding all ptlrpcd threads may cause response
+ *      delays when some CPU core(s) are busy with other loads.
+ *
+ *      For example, with "ls -l" some async RPCs for statahead are assigned to
+ *      ptlrpcd_0, and ptlrpcd_0 is bound to CPU_0, but CPU_0 may be quite busy
+ *      with non-ptlrpcd work such as "ls -l" itself (we want the "ls -l"
+ *      thread, statahead thread, and ptlrpcd thread to run in parallel). In
+ *      that case the statahead async RPCs cannot be processed in time, which
+ *      is unexpected. If ptlrpcd_0 could be rescheduled on another CPU core it
+ *      might be better, but that breaks the former data transfer policy.
+ *
+ *      So we shouldn't blindly avoid the data transfer. We make a compromise:
+ *      divide the ptlrpcd thread pool into two parts. One part is for bound
+ *      mode: each ptlrpcd thread in this part is bound to some CPU core. The
+ *      other part is for free mode: all the ptlrpcd threads in that part can
+ *      be scheduled on any CPU core. We specify some partnership between
+ *      bound mode ptlrpcd thread(s) and free mode ptlrpcd thread(s), and the
+ *      async RPC load is shared among the partners.
+ *
+ *      It can partly avoid data transfer cross-CPU (if the bound mode ptlrpcd
+ *      thread can be scheduled in time), and try to guarantee the async RPC
+ *      processed ASAP (as long as the free mode ptlrpcd thread can be scheduled
+ *      on any CPU core).
+ *
+ *      As for how to specify the partnership between bound mode ptlrpcd
+ *      thread(s) and free mode ptlrpcd thread(s), the simplest way is to use
+ *      <free bound> pair. In future, we can specify some more complex
+ *      partnership based on the patches for CPU partition. But before such
+ *      patches are available, we prefer to use the simplest one.
+ */
+# ifdef CFS_CPU_MODE_NUMA
+# warning "fix ptlrpcd_bind() to use new CPU partition APIs"
+# endif
+static int ptlrpcd_bind(int index, int max)
+{
+	struct ptlrpcd_ctl *pc;
+	int rc = 0;
+#if defined(CONFIG_NUMA)
+	cpumask_t mask;
+#endif
+	ENTRY;
+	/* step 1: the bind policy decides partner count and LIOD_BIND */
+	LASSERT(index <= max - 1);
+	pc = &ptlrpcds->pd_threads[index];
+	switch (ptlrpcd_bind_policy) {
+	case PDB_POLICY_NONE:
+		pc->pc_npartners = -1;
+		break;
+	case PDB_POLICY_FULL:
+		pc->pc_npartners = 0;
+		set_bit(LIOD_BIND, &pc->pc_flags);
+		break;
+	case PDB_POLICY_PAIR:
+		LASSERT(max % 2 == 0);
+		pc->pc_npartners = 1;
+		break;
+	case PDB_POLICY_NEIGHBOR:
+#if defined(CONFIG_NUMA)
+	{
+		int i;
+		mask = *cpumask_of_node(cpu_to_node(index));
+		for (i = max; i < num_online_cpus(); i++)
+			cpu_clear(i, mask);
+		pc->pc_npartners = cpus_weight(mask) - 1;
+		set_bit(LIOD_BIND, &pc->pc_flags);
+	}
+#else
+		LASSERT(max >= 3);
+		pc->pc_npartners = 2;
+#endif
+		break;
+	default:
+		CERROR("unknown ptlrpcd bind policy %d\n", ptlrpcd_bind_policy);
+		rc = -EINVAL;
+	}
+	/* step 2: allocate the partner array and cross-link the partners */
+	if (rc == 0 && pc->pc_npartners > 0) {
+		OBD_ALLOC(pc->pc_partners,
+			  sizeof(struct ptlrpcd_ctl *) * pc->pc_npartners);
+		if (pc->pc_partners == NULL) {
+			pc->pc_npartners = 0;
+			rc = -ENOMEM;
+		} else {
+			switch (ptlrpcd_bind_policy) {
+			case PDB_POLICY_PAIR:
+				if (index & 0x1) {
+					set_bit(LIOD_BIND, &pc->pc_flags);
+					pc->pc_partners[0] = &ptlrpcds->
+						pd_threads[index - 1];
+					ptlrpcds->pd_threads[index - 1].
+						pc_partners[0] = pc;
+				}
+				break;
+			case PDB_POLICY_NEIGHBOR:
+#if defined(CONFIG_NUMA)
+			{
+				struct ptlrpcd_ctl *ppc;
+				int i, pidx;
+				/* partners are cores in the same NUMA node.
+				 * setup partnership only with ptlrpcd threads
+				 * that are already initialized
+				 */
+				for (pidx = 0, i = 0; i < index; i++) {
+					if (cpu_isset(i, mask)) {
+						ppc = &ptlrpcds->pd_threads[i];
+						pc->pc_partners[pidx++] = ppc;
+						ppc->pc_partners[ppc->
+							  pc_npartners++] = pc;
+					}
+				}
+				/* adjust number of partners to the number
+				 * of partnership really setup */
+				pc->pc_npartners = pidx;
+			}
+#else
+				if (index & 0x1)
+					set_bit(LIOD_BIND, &pc->pc_flags);
+				if (index > 0) {
+					pc->pc_partners[0] = &ptlrpcds->
+						pd_threads[index - 1];
+					ptlrpcds->pd_threads[index - 1].
+						pc_partners[1] = pc;
+					if (index == max - 1) {
+						pc->pc_partners[1] =
+						&ptlrpcds->pd_threads[0];
+						ptlrpcds->pd_threads[0].
+						pc_partners[0] = pc;
+					}
+				}
+#endif
+				break;
+			}
+		}
+	}
+
+	RETURN(rc);
+}
+
+
+int ptlrpcd_start(int index, int max, const char *name, struct ptlrpcd_ctl *pc)
+{
+	int rc;
+	int env = 0;
+	ENTRY;
+	/*
+	 * Set up \a pc (name, request set, env, partners) and spawn its thread.
+	 * LIOD_START guards against starting a second thread for one pc.
+	 */
+	if (test_and_set_bit(LIOD_START, &pc->pc_flags)) {
+		CWARN("Starting second thread (%s) for same pc %p\n",
+		      name, pc);
+		RETURN(0);
+	}
+
+	pc->pc_index = index;
+	init_completion(&pc->pc_starting);
+	init_completion(&pc->pc_finishing);
+	spin_lock_init(&pc->pc_lock);
+	strlcpy(pc->pc_name, name, sizeof(pc->pc_name));
+	pc->pc_set = ptlrpc_prep_set();
+	if (pc->pc_set == NULL)
+		GOTO(out, rc = -ENOMEM);
+	/*
+	 * So far only "client" ptlrpcd uses an environment. In the future,
+	 * ptlrpcd thread (or a thread-set) has to be given an argument,
+	 * describing its "scope".
+	 */
+	rc = lu_context_init(&pc->pc_env.le_ctx, LCT_CL_THREAD|LCT_REMEMBER);
+	if (rc != 0)
+		GOTO(out, rc);
+
+	env = 1;
+	{
+		task_t *task;
+		if (index >= 0) {
+			rc = ptlrpcd_bind(index, max);
+			if (rc < 0)
+				GOTO(out, rc);
+		}
+
+		task = kthread_run(ptlrpcd, pc, pc->pc_name);
+		if (IS_ERR(task))
+			GOTO(out, rc = PTR_ERR(task));
+
+		rc = 0;
+		wait_for_completion(&pc->pc_starting);
+	}
+out:
+	if (rc) {
+		if (pc->pc_set != NULL) {
+			struct ptlrpc_request_set *set = pc->pc_set;
+
+			spin_lock(&pc->pc_lock);
+			pc->pc_set = NULL;
+			spin_unlock(&pc->pc_lock);
+			ptlrpc_set_destroy(set);
+		}
+		if (env != 0)
+			lu_context_fini(&pc->pc_env.le_ctx);
+		clear_bit(LIOD_BIND, &pc->pc_flags);
+		clear_bit(LIOD_START, &pc->pc_flags);
+	}
+	RETURN(rc);
+}
+
+void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force)
+{
+	ENTRY;
+
+	/* Only a started thread can be stopped: flag it with LIOD_STOP (and
+	 * LIOD_FORCE when \a force is set), then wake it from its set wait. */
+	if (test_bit(LIOD_START, &pc->pc_flags)) {
+		set_bit(LIOD_STOP, &pc->pc_flags);
+		if (force)
+			set_bit(LIOD_FORCE, &pc->pc_flags);
+		wake_up(&pc->pc_set->set_waitq);
+	} else {
+		CWARN("Thread for pc %p was not started\n", pc);
+	}
+
+	EXIT;
+}
+
+void ptlrpcd_free(struct ptlrpcd_ctl *pc)
+{
+	ENTRY;
+
+	if (test_bit(LIOD_START, &pc->pc_flags)) {
+		struct ptlrpc_request_set *old_set = pc->pc_set;
+
+		/* the thread completes pc_finishing right before it returns */
+		wait_for_completion(&pc->pc_finishing);
+		lu_context_fini(&pc->pc_env.le_ctx);
+		/* unhook the set under pc_lock, destroy it outside the lock */
+		spin_lock(&pc->pc_lock);
+		pc->pc_set = NULL;
+		spin_unlock(&pc->pc_lock);
+		ptlrpc_set_destroy(old_set);
+
+		clear_bit(LIOD_START, &pc->pc_flags);
+		clear_bit(LIOD_STOP, &pc->pc_flags);
+		clear_bit(LIOD_FORCE, &pc->pc_flags);
+		clear_bit(LIOD_BIND, &pc->pc_flags);
+	} else {
+		CWARN("Thread for pc %p was not started\n", pc);
+	}
+
+	/* the partner array is released whether or not the thread ran */
+	if (pc->pc_npartners > 0) {
+		LASSERT(pc->pc_partners != NULL);
+		OBD_FREE(pc->pc_partners,
+			 sizeof(struct ptlrpcd_ctl *) * pc->pc_npartners);
+		pc->pc_partners = NULL;
+	}
+	pc->pc_npartners = 0;
+	EXIT;
+}
+
+static void ptlrpcd_fini(void)
+{
+	int i;
+	ENTRY;
+	/* ask every worker to stop first, then wait for and free each one */
+	if (ptlrpcds != NULL) {
+		for (i = 0; i < ptlrpcds->pd_nthreads; i++)
+			ptlrpcd_stop(&ptlrpcds->pd_threads[i], 0);
+		for (i = 0; i < ptlrpcds->pd_nthreads; i++)
+			ptlrpcd_free(&ptlrpcds->pd_threads[i]);
+		ptlrpcd_stop(&ptlrpcds->pd_thread_rcv, 0);
+		ptlrpcd_free(&ptlrpcds->pd_thread_rcv);
+		OBD_FREE(ptlrpcds, ptlrpcds->pd_size);
+		ptlrpcds = NULL;
+	}
+
+	EXIT;
+}
+
+static int ptlrpcd_init(void)
+{
+	int nthreads = num_online_cpus();
+	char name[16];
+	int size, i = -1, j, rc = 0;
+	ENTRY;
+	/* Start the recovery daemon plus nthreads workers; 0 or -errno. */
+	if (max_ptlrpcds > 0 && max_ptlrpcds < nthreads)
+		nthreads = max_ptlrpcds;
+	if (nthreads < 2)
+		nthreads = 2;
+	if (nthreads < 3 && ptlrpcd_bind_policy == PDB_POLICY_NEIGHBOR)
+		ptlrpcd_bind_policy = PDB_POLICY_PAIR;
+	else if (nthreads % 2 != 0 && ptlrpcd_bind_policy == PDB_POLICY_PAIR)
+		nthreads &= ~1; /* make sure it is even */
+
+	size = offsetof(struct ptlrpcd, pd_threads[nthreads]);
+	OBD_ALLOC(ptlrpcds, size);
+	if (ptlrpcds == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	snprintf(name, 15, "ptlrpcd_rcv");
+	set_bit(LIOD_RECOVERY, &ptlrpcds->pd_thread_rcv.pc_flags);
+	rc = ptlrpcd_start(-1, nthreads, name, &ptlrpcds->pd_thread_rcv);
+	if (rc < 0)
+		GOTO(out, rc);
+
+	/* XXX: We start nthreads ptlrpc daemons. Each of them can process any
+	 *      non-recovery async RPC to improve overall async RPC efficiency.
+	 *
+	 *      But there are some issues with async I/O RPCs and async non-I/O
+	 *      RPCs processed in the same set under some cases. The ptlrpcd may
+	 *      be blocked by some async I/O RPC(s), then will cause other async
+	 *      non-I/O RPC(s) can not be processed in time.
+	 *
+	 *      Maybe we should distinguish blocked async RPCs from non-blocked
+	 *      async RPCs, and process them in different ptlrpcd sets to avoid
+	 *      unnecessary dependency. But how to distribute async RPCs load
+	 *      among all the ptlrpc daemons becomes another trouble. */
+	for (i = 0; i < nthreads; i++) {
+		snprintf(name, 15, "ptlrpcd_%d", i);
+		rc = ptlrpcd_start(i, nthreads, name, &ptlrpcds->pd_threads[i]);
+		if (rc < 0)
+			GOTO(out, rc);
+	}
+
+	ptlrpcds->pd_size = size;
+	ptlrpcds->pd_index = 0;
+	ptlrpcds->pd_nthreads = nthreads;
+
+out:
+	if (rc != 0 && ptlrpcds != NULL) {
+		for (j = 0; j <= i; j++)
+			ptlrpcd_stop(&ptlrpcds->pd_threads[j], 0);
+		for (j = 0; j <= i; j++)
+			ptlrpcd_free(&ptlrpcds->pd_threads[j]);
+		ptlrpcd_stop(&ptlrpcds->pd_thread_rcv, 0);
+		ptlrpcd_free(&ptlrpcds->pd_thread_rcv);
+		OBD_FREE(ptlrpcds, size);
+		ptlrpcds = NULL;
+	}
+	/* report a failed start-up to the caller instead of claiming success */
+	RETURN(rc);
+}
+
+int ptlrpcd_addref(void)
+{
+	int rc = 0;
+	ENTRY;
+	/* The first user starts the daemons; undo the count if that fails. */
+	mutex_lock(&ptlrpcd_mutex);
+	if (++ptlrpcd_users == 1 && (rc = ptlrpcd_init()) < 0)
+		ptlrpcd_users--;
+	mutex_unlock(&ptlrpcd_mutex);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpcd_addref);
+
+void ptlrpcd_decref(void)
+{
+	mutex_lock(&ptlrpcd_mutex);
+	if (--ptlrpcd_users == 0)
+		ptlrpcd_fini();	/* last user gone: tear the daemons down */
+	mutex_unlock(&ptlrpcd_mutex);
+}
+EXPORT_SYMBOL(ptlrpcd_decref);
+/** @} ptlrpcd */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/recover.c b/drivers/staging/lustre/lustre/ptlrpc/recover.c
new file mode 100644
index 000000000000..2960889834a2
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/recover.c
@@ -0,0 +1,357 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/recover.c
+ *
+ * Author: Mike Shaver <shaver@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+# include <linux/libcfs/libcfs.h>
+
+#include <obd_support.h>
+#include <lustre_ha.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_export.h>
+#include <obd.h>
+#include <obd_ost.h>
+#include <obd_class.h>
+#include <obd_lov.h> /* for IOC_LOV_SET_OSC_ACTIVE */
+#include <linux/list.h>
+
+#include "ptlrpc_internal.h"
+
+/**
+ * Start recovery on disconnected import.
+ * This is done by just attempting a connect via ptlrpc_connect_import().
+ */
+void ptlrpc_initiate_recovery(struct obd_import *imp)
+{
+	ENTRY;
+
+	CDEBUG(D_HA, "%s: starting recovery\n", obd2cli_tgt(imp->imp_obd));
+	ptlrpc_connect_import(imp);
+
+	EXIT;
+}
+
+/**
+ * Send the first request on the replay list whose transno is newer than
+ * imp_last_replay_transno. \a *inflight is set to 1 when one was sent.
+ */
+int ptlrpc_replay_next(struct obd_import *imp, int *inflight)
+{
+	int rc = 0;
+	struct list_head *tmp, *pos;
+	struct ptlrpc_request *req = NULL;
+	__u64 last_transno;
+	ENTRY;
+
+	*inflight = 0;
+
+	/* It might have committed some after we last spoke, so make sure we
+	 * get rid of them now.
+	 */
+	spin_lock(&imp->imp_lock);
+	imp->imp_last_transno_checked = 0;
+	ptlrpc_free_committed(imp);
+	last_transno = imp->imp_last_replay_transno;
+	spin_unlock(&imp->imp_lock);
+
+	CDEBUG(D_HA, "import %p from %s committed "LPU64" last "LPU64"\n",
+	       imp, obd2cli_tgt(imp->imp_obd),
+	       imp->imp_peer_committed_transno, last_transno);
+
+	/* Do I need to hold a lock across this iteration?  We shouldn't be
+	 * racing with any additions to the list, because we're in recovery
+	 * and are therefore not processing additional requests to add.  Calls
+	 * to ptlrpc_free_committed might commit requests, but nothing "newer"
+	 * than the one we're replaying (it can't be committed until it's
+	 * replayed, and we're doing that here).  l_f_e_safe protects against
+	 * problems with the current request being committed, in the unlikely
+	 * event of that race.  So, in conclusion, I think that it's safe to
+	 * perform this list-walk without the imp_lock held.
+	 *
+	 * But, the {mdc,osc}_replay_open callbacks both iterate
+	 * request lists, and have comments saying they assume the
+	 * imp_lock is being held by ptlrpc_replay, but it's not. it's
+	 * just a little race...
+	 */
+	list_for_each_safe(tmp, pos, &imp->imp_replay_list) {
+		req = list_entry(tmp, struct ptlrpc_request,
+				     rq_replay_list);
+
+		/* If we need to resend the last sent transno (because a
+		   reconnect has occurred), then stop on the matching
+		   req and send it again. If, however, the last sent
+		   transno has been committed then we continue replay
+		   from the next request. */
+		if (req->rq_transno > last_transno) {
+			if (imp->imp_resend_replay)
+				lustre_msg_add_flags(req->rq_reqmsg,
+						     MSG_RESENT);
+			break;
+		}
+		req = NULL;
+	}
+
+	spin_lock(&imp->imp_lock);
+	imp->imp_resend_replay = 0;
+	spin_unlock(&imp->imp_lock);
+
+	if (req != NULL) {
+		rc = ptlrpc_replay_req(req);
+		if (rc) {
+			CERROR("recovery replay error %d for req "
+			       LPU64"\n", rc, req->rq_xid);
+			RETURN(rc);
+		}
+		*inflight = 1;
+	}
+	RETURN(rc);
+}
+
+/**
+ * Schedule resending of requests on the sending list once replay of requests
+ * and locks is done. Returns -1 if the import is not in RECOVER state, else 0.
+ */
+int ptlrpc_resend(struct obd_import *imp)
+{
+	struct ptlrpc_request *req, *next;
+
+	ENTRY;
+
+	/* As long as we're in recovery, nothing should be added to the sending
+	 * list, so we don't need to hold the lock during this iteration and
+	 * resend process.
+	 */
+	/* Well... what if lctl recover is called twice at the same time?
+	 */
+	spin_lock(&imp->imp_lock);
+	if (imp->imp_state != LUSTRE_IMP_RECOVER) {
+		spin_unlock(&imp->imp_lock);
+		RETURN(-1);
+	}
+
+	list_for_each_entry_safe(req, next, &imp->imp_sending_list,
+				     rq_list) {
+		LASSERTF((long)req > PAGE_CACHE_SIZE && req != LP_POISON,
+			 "req %p bad\n", req);
+		LASSERTF(req->rq_type != LI_POISON, "req %p freed\n", req);
+		if (!ptlrpc_no_resend(req))
+			ptlrpc_resend_req(req);
+	}
+	spin_unlock(&imp->imp_lock);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_resend);
+
+/**
+ * Wake the thread behind every request parked on the import's delayed
+ * list so that each of them can go on to resend.
+ */
+void ptlrpc_wake_delayed(struct obd_import *imp)
+{
+	struct ptlrpc_request *req, *next;
+
+	/* imp_lock is held across the whole walk of imp_delayed_list */
+	spin_lock(&imp->imp_lock);
+	list_for_each_entry_safe(req, next, &imp->imp_delayed_list,
+				 rq_list) {
+		DEBUG_REQ(D_HA, req, "waking (set %p):", req->rq_set);
+
+		ptlrpc_client_wake_req(req);
+	}
+	spin_unlock(&imp->imp_lock);
+}
+EXPORT_SYMBOL(ptlrpc_wake_delayed);
+
+void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req)
+{
+	struct obd_import *imp = failed_req->rq_import;
+	ENTRY;
+
+	CDEBUG(D_HA, "import %s of %s@%s abruptly disconnected: reconnecting\n",
+	       imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
+	       imp->imp_connection->c_remote_uuid.uuid);
+	/* reconnect only if ptlrpc_set_import_discon() returns non-zero */
+	if (ptlrpc_set_import_discon(imp,
+			      lustre_msg_get_conn_cnt(failed_req->rq_reqmsg))) {
+		if (!imp->imp_replayable) {
+			CDEBUG(D_HA, "import %s@%s for %s not replayable, "
+			       "auto-deactivating\n",
+			       obd2cli_tgt(imp->imp_obd),
+			       imp->imp_connection->c_remote_uuid.uuid,
+			       imp->imp_obd->obd_name);
+			ptlrpc_deactivate_import(imp);
+		}
+		/* to control recovery via lctl {disable|enable}_recovery */
+		if (imp->imp_deactive == 0)
+			ptlrpc_connect_import(imp);
+	}
+
+	/* Wait for recovery to complete and resend. If evicted, then
+	   this request will be errored out later.*/
+	spin_lock(&failed_req->rq_lock);
+	if (!failed_req->rq_no_resend)
+		failed_req->rq_resend = 1;
+	spin_unlock(&failed_req->rq_lock);
+
+	EXIT;
+}
+
+/**
+ * Administratively activate or deactivate a client import; returns the
+ * result of ptlrpc_recover_import() when activating and 0 otherwise.
+ * This should only be called by the ioctl interface, currently
+ *  - the lctl deactivate and activate commands
+ *  - echo 0/1 >> /proc/osc/XXX/active
+ *  - client umount -f (ll_umount_begin)
+ */
+int ptlrpc_set_import_active(struct obd_import *imp, int active)
+{
+	struct obd_device *obd = imp->imp_obd;
+	int rc = 0;
+
+	ENTRY;
+	LASSERT(obd);
+
+	if (active) {
+		/* Activation: clear imp_deactive, raise the ACTIVATE event
+		 * and then attempt recovery. */
+		CDEBUG(D_HA, "setting import %s VALID\n",
+		       obd2cli_tgt(imp->imp_obd));
+
+		spin_lock(&imp->imp_lock);
+		imp->imp_deactive = 0;
+		spin_unlock(&imp->imp_lock);
+		obd_import_event(imp->imp_obd, imp, IMP_EVENT_ACTIVATE);
+
+		rc = ptlrpc_recover_import(imp, NULL, 0);
+	} else {
+		/* Deactivation: mark the import invalid and abort the
+		 * in-flight requests. */
+		LCONSOLE_WARN("setting import %s INACTIVE by administrator "
+			      "request\n", obd2cli_tgt(imp->imp_obd));
+
+		/* imp_deactive goes first so ptlrpc_import_delay_req does
+		 * not complain about imp_inval set without imp_deactive */
+		spin_lock(&imp->imp_lock);
+		imp->imp_deactive = 1;
+		spin_unlock(&imp->imp_lock);
+
+		obd_import_event(imp->imp_obd, imp, IMP_EVENT_DEACTIVATE);
+
+		ptlrpc_invalidate_import(imp);
+	}
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_set_import_active);
+
+/* Attempt to reconnect an import */
+int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async)
+{
+	int rc = 0;
+	ENTRY;
+
+	spin_lock(&imp->imp_lock);
+	if (imp->imp_state == LUSTRE_IMP_NEW || imp->imp_deactive ||
+	    atomic_read(&imp->imp_inval_count))
+		rc = -EINVAL;
+	spin_unlock(&imp->imp_lock);
+	if (rc)
+		GOTO(out, rc);
+
+	/* force import to be disconnected. */
+	ptlrpc_set_import_discon(imp, 0);
+
+	if (new_uuid) {
+		struct obd_uuid uuid;
+
+		/* intruct import to use new uuid */
+		obd_str2uuid(&uuid, new_uuid);
+		rc = import_set_conn_priority(imp, &uuid);
+		if (rc)
+			GOTO(out, rc);
+	}
+
+	/* Check if reconnect is already in progress */
+	spin_lock(&imp->imp_lock);
+	if (imp->imp_state != LUSTRE_IMP_DISCON) {
+		imp->imp_force_verify = 1;
+		rc = -EALREADY;
+	}
+	spin_unlock(&imp->imp_lock);
+	if (rc)
+		GOTO(out, rc);
+
+	rc = ptlrpc_connect_import(imp);
+	if (rc)
+		GOTO(out, rc);
+
+	if (!async) {
+		struct l_wait_info lwi;
+		int secs = cfs_time_seconds(obd_timeout);
+
+		CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n",
+		       obd2cli_tgt(imp->imp_obd), secs);
+
+		lwi = LWI_TIMEOUT(secs, NULL, NULL);
+		rc = l_wait_event(imp->imp_recovery_waitq,
+				  !ptlrpc_import_in_recovery(imp), &lwi);
+		CDEBUG(D_HA, "%s: recovery finished\n",
+		       obd2cli_tgt(imp->imp_obd));
+	}
+	EXIT;
+
+out:
+	return rc;
+}
+EXPORT_SYMBOL(ptlrpc_recover_import);
+
+int ptlrpc_import_in_recovery(struct obd_import *imp)
+{
+	int idle;	/* set for the states that are not "in recovery" */
+
+	spin_lock(&imp->imp_lock);
+	idle = imp->imp_state == LUSTRE_IMP_FULL ||
+	       imp->imp_state == LUSTRE_IMP_CLOSED ||
+	       imp->imp_state == LUSTRE_IMP_DISCON;
+	spin_unlock(&imp->imp_lock);
+	return !idle;
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec.c b/drivers/staging/lustre/lustre/ptlrpc/sec.c
new file mode 100644
index 000000000000..36e8bed5458a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/sec.c
@@ -0,0 +1,2465 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/crypto.h>
+#include <linux/key.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_dlm.h>
+#include <lustre_sec.h>
+
+#include "ptlrpc_internal.h"
+
+/***********************************************
+ * policy registers			    *
+ ***********************************************/
+
+static rwlock_t policy_lock;	/* protects policies[] */
+static struct ptlrpc_sec_policy *policies[SPTLRPC_POLICY_MAX] = {
+	NULL,
+};
+
+int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy)
+{
+	__u16 slot = policy->sp_policy;
+	int rc = -EALREADY;
+
+	LASSERT(policy->sp_name);
+	LASSERT(policy->sp_cops);
+	LASSERT(policy->sp_sops);
+	if (slot >= SPTLRPC_POLICY_MAX)
+		return -EINVAL;
+
+	/* claim the slot unless this policy number is already taken */
+	write_lock(&policy_lock);
+	if (likely(policies[slot] == NULL)) {
+		policies[slot] = policy;
+		rc = 0;
+	}
+	write_unlock(&policy_lock);
+	if (rc == 0)
+		CDEBUG(D_SEC, "%s: registered\n", policy->sp_name);
+	return rc;
+}
+EXPORT_SYMBOL(sptlrpc_register_policy);
+
+int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy)
+{
+	__u16 number = policy->sp_policy;
+
+	LASSERT(number < SPTLRPC_POLICY_MAX);
+
+	write_lock(&policy_lock);
+	if (unlikely(policies[number] == NULL)) {
+		write_unlock(&policy_lock);
+		CERROR("%s: already unregistered\n", policy->sp_name);
+		return -EINVAL;
+	}
+	/* the slot must hold exactly the policy being unregistered */
+	LASSERT(policies[number] == policy);
+	policies[number] = NULL;
+	write_unlock(&policy_lock);
+
+	CDEBUG(D_SEC, "%s: unregistered\n", policy->sp_name);
+	return 0;
+}
+EXPORT_SYMBOL(sptlrpc_unregister_policy);
+
+static
+struct ptlrpc_sec_policy * sptlrpc_wireflavor2policy(__u32 flavor)
+{
+	static DEFINE_MUTEX(load_mutex);
+	static atomic_t       loaded = ATOMIC_INIT(0);
+	struct ptlrpc_sec_policy *policy;
+	__u16		     number = SPTLRPC_FLVR_POLICY(flavor);
+	__u16		     flag = 0;
+
+	if (number >= SPTLRPC_POLICY_MAX)
+		return NULL;
+	/* look the policy up; for GSS, retry once after loading ptlrpc_gss */
+	while (1) {
+		read_lock(&policy_lock);
+		policy = policies[number];
+		if (policy && !try_module_get(policy->sp_owner))
+			policy = NULL;
+		if (policy == NULL)
+			flag = atomic_read(&loaded);
+		read_unlock(&policy_lock);
+
+		if (policy != NULL || flag != 0 ||
+		    number != SPTLRPC_POLICY_GSS)
+			break;
+
+		/* try to load gss module, once */
+		mutex_lock(&load_mutex);
+		if (atomic_read(&loaded) == 0) {
+			if (request_module("ptlrpc_gss") == 0)
+				CDEBUG(D_SEC,
+				       "module ptlrpc_gss loaded on demand\n");
+			else
+				CERROR("Unable to load module ptlrpc_gss\n");
+
+			atomic_set(&loaded, 1);
+		}
+		mutex_unlock(&load_mutex);
+	}
+
+	return policy;
+}
+
+__u32 sptlrpc_name2flavor_base(const char *name)
+{
+	if (!strcmp(name, "null"))
+		return SPTLRPC_FLVR_NULL;
+	if (!strcmp(name, "plain"))
+		return SPTLRPC_FLVR_PLAIN;
+	if (!strcmp(name, "krb5n"))
+		return SPTLRPC_FLVR_KRB5N;
+	if (!strcmp(name, "krb5a"))
+		return SPTLRPC_FLVR_KRB5A;
+	if (!strcmp(name, "krb5i"))
+		return SPTLRPC_FLVR_KRB5I;
+	if (!strcmp(name, "krb5p"))
+		return SPTLRPC_FLVR_KRB5P;
+	/* \a name matched none of the base flavor names above */
+	return SPTLRPC_FLVR_INVALID;
+}
+EXPORT_SYMBOL(sptlrpc_name2flavor_base);
+
+const char *sptlrpc_flavor2name_base(__u32 flvr)
+{
+	__u32   base = SPTLRPC_FLVR_BASE(flvr);
+	/* compare SPTLRPC_FLVR_BASE() of \a flvr with each known flavor's */
+	if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL))
+		return "null";
+	else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN))
+		return "plain";
+	else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5N))
+		return "krb5n";
+	else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5A))
+		return "krb5a";
+	else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5I))
+		return "krb5i";
+	else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5P))
+		return "krb5p";
+
+	CERROR("invalid wire flavor 0x%x\n", flvr);
+	return "invalid";
+}
+EXPORT_SYMBOL(sptlrpc_flavor2name_base);
+
+char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf,
+			       char *buf, int bufsize)
+{
+	if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN)
+		snprintf(buf, bufsize, "hash:%s",
+			 sptlrpc_get_hash_name(sf->u_bulk.hash.hash_alg));
+	else
+		snprintf(buf, bufsize, "%s",
+			 sptlrpc_flavor2name_base(sf->sf_rpc));
+	/* plain gets "hash:<alg>", the rest the base name; \a buf is returned */
+	buf[bufsize - 1] = '\0';
+	return buf;
+}
+EXPORT_SYMBOL(sptlrpc_flavor2name_bulk);
+
+char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize)
+{
+	snprintf(buf, bufsize, "%s", sptlrpc_flavor2name_base(sf->sf_rpc));
+
+	/*
+	 * Only plain flavors carry a customized bulk spec: append it as
+	 * "-<bulk>", with strlcat() bounding the total length by bufsize.
+	 */
+	if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN) {
+		char bspec[16];
+
+		bspec[0] = '-';
+		sptlrpc_flavor2name_bulk(sf, &bspec[1], sizeof(bspec) - 1);
+		strlcat(buf, bspec, bufsize);
+	}
+
+	buf[bufsize - 1] = '\0';
+	return buf;
+}
+EXPORT_SYMBOL(sptlrpc_flavor2name);
+
+char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize)
+{
+	buf[0] = '\0';
+	/* append one "name," token per flag set; "-," when none matched */
+	if (flags & PTLRPC_SEC_FL_REVERSE)
+		strlcat(buf, "reverse,", bufsize);
+	if (flags & PTLRPC_SEC_FL_ROOTONLY)
+		strlcat(buf, "rootonly,", bufsize);
+	if (flags & PTLRPC_SEC_FL_UDESC)
+		strlcat(buf, "udesc,", bufsize);
+	if (flags & PTLRPC_SEC_FL_BULK)
+		strlcat(buf, "bulk,", bufsize);
+	if (buf[0] == '\0')
+		strlcat(buf, "-,", bufsize);
+
+	return buf;
+}
+EXPORT_SYMBOL(sptlrpc_secflags2str);
+
+/**************************************************
+ * client context APIs			    *
+ **************************************************/
+
+/**
+ * Look up the client context in \a sec for the calling task.
+ *
+ * For reverse or root-only secs the lookup uses uid/gid 0, otherwise the
+ * uid/gid of the current task. For reverse secs the policy is additionally
+ * told neither to create a context nor to remove dead ones.
+ *
+ * \retval whatever the policy's lookup_ctx() returns
+ */
+static
+struct ptlrpc_cli_ctx *get_my_ctx(struct ptlrpc_sec *sec)
+{
+	struct vfs_cred cred;
+	int may_create = 1;
+	int drop_dead = 1;
+
+	LASSERT(sec);
+	LASSERT(sec->ps_policy->sp_cops->lookup_ctx);
+
+	if (!(sec->ps_flvr.sf_flags & (PTLRPC_SEC_FL_REVERSE |
+				       PTLRPC_SEC_FL_ROOTONLY))) {
+		/* ordinary sec: act on behalf of the current task */
+		cred.vc_uid = current_uid();
+		cred.vc_gid = current_gid();
+	} else {
+		cred.vc_uid = 0;
+		cred.vc_gid = 0;
+
+		if (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE) {
+			may_create = 0;
+			drop_dead = 0;
+		}
+	}
+
+	return sec->ps_policy->sp_cops->lookup_ctx(sec, &cred,
+						   may_create, drop_dead);
+}
+
+/**
+ * Take one more reference on \a ctx.
+ *
+ * \retval \a ctx, so the call can be used directly in an assignment
+ */
+struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx)
+{
+	atomic_t *refs = &ctx->cc_refcount;
+
+	atomic_inc(refs);
+	return ctx;
+}
+EXPORT_SYMBOL(sptlrpc_cli_ctx_get);
+
+/**
+ * Drop one reference on \a ctx.
+ *
+ * When the count reaches zero the context is handed to release_ctx() of the
+ * policy owning its sec, together with \a sync.
+ */
+void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync)
+{
+	struct ptlrpc_sec *sec = ctx->cc_sec;
+
+	LASSERT(sec);
+	LASSERT_ATOMIC_POS(&ctx->cc_refcount);
+
+	/* only the holder of the last reference releases the context */
+	if (atomic_dec_and_test(&ctx->cc_refcount))
+		sec->ps_policy->sp_cops->release_ctx(sec, ctx, sync);
+}
+EXPORT_SYMBOL(sptlrpc_cli_ctx_put);
+
+/**
+ * Expire the client context immediately.
+ *
+ * This invokes the context's die() operation with a second argument of 0.
+ * The operation must be provided by the context (asserted).
+ *
+ * \pre Caller must hold at least 1 reference on the \a ctx.
+ */
+void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx)
+{
+	LASSERT(ctx->cc_ops->die);
+	ctx->cc_ops->die(ctx, 0);
+}
+EXPORT_SYMBOL(sptlrpc_cli_ctx_expire);
+
+/**
+ * Wake up every request waiting on client context \a ctx. Meant to be called
+ * after the status of \a ctx has changed.
+ *
+ * Under cc_lock, each request is unlinked from the context's request list
+ * and then woken with ptlrpc_client_wake_req(); the list is empty on return.
+ */
+void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx)
+{
+	struct ptlrpc_request *waiter;
+
+	spin_lock(&ctx->cc_lock);
+	while (!list_empty(&ctx->cc_req_list)) {
+		waiter = list_entry(ctx->cc_req_list.next,
+				    struct ptlrpc_request, rq_ctx_chain);
+		list_del_init(&waiter->rq_ctx_chain);
+		ptlrpc_client_wake_req(waiter);
+	}
+	spin_unlock(&ctx->cc_lock);
+}
+EXPORT_SYMBOL(sptlrpc_cli_ctx_wakeup);
+
+/**
+ * Let the optional display() operation of \a ctx describe it in \a buf.
+ *
+ * \retval 0 if the context provides no display() operation
+ * \retval otherwise the value returned by display()
+ */
+int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize)
+{
+	LASSERT(ctx->cc_ops);
+
+	if (ctx->cc_ops->display != NULL)
+		return ctx->cc_ops->display(ctx, buf, bufsize);
+
+	return 0;
+}
+
+/**
+ * Carry out a delayed sec adaptation of \a imp if its deadline has passed.
+ *
+ * The deadline (imp_sec_expire) is tested and cleared under imp_lock; the
+ * adaptation itself runs after the lock has been dropped.
+ *
+ * \retval 0 nothing was due
+ * \retval otherwise the result of sptlrpc_import_sec_adapt()
+ */
+static int import_sec_check_expire(struct obd_import *imp)
+{
+	int	due;
+
+	spin_lock(&imp->imp_lock);
+	due = imp->imp_sec_expire &&
+	      imp->imp_sec_expire < cfs_time_current_sec();
+	if (due)
+		imp->imp_sec_expire = 0;
+	spin_unlock(&imp->imp_lock);
+
+	if (due) {
+		CDEBUG(D_SEC, "found delayed sec adapt expired, do it now\n");
+		return sptlrpc_import_sec_adapt(imp, NULL, 0);
+	}
+
+	return 0;
+}
+
+/**
+ * Get a referenced, usable sec of import \a imp.
+ *
+ * If a delayed sec adaptation is pending (imp_sec_expire is set) it is
+ * checked, and performed when due, before the sec is looked up.
+ *
+ * \retval 0 success; \a *sec carries a reference which the caller must drop
+ * with sptlrpc_sec_put()
+ * \retval -EACCES the import has no sec, or its sec is dying (no reference
+ * is left with the caller)
+ * \retval other error returned by import_sec_check_expire()
+ */
+static int import_sec_validate_get(struct obd_import *imp,
+				   struct ptlrpc_sec **sec)
+{
+	int     rc;
+
+	if (unlikely(imp->imp_sec_expire)) {
+		rc = import_sec_check_expire(imp);
+		if (rc)
+			return rc;
+	}
+
+	*sec = sptlrpc_import_sec_ref(imp);
+	if (*sec == NULL) {
+		CERROR("import %p (%s) with no sec\n",
+		       imp, ptlrpc_import_state_name(imp->imp_state));
+		return -EACCES;
+	}
+
+	if (unlikely((*sec)->ps_dying)) {
+		/* log the sec itself, not the address of the caller's pointer */
+		CERROR("attempt to use dying sec %p\n", *sec);
+		sptlrpc_sec_put(*sec);
+		return -EACCES;
+	}
+
+	return 0;
+}
+
+/**
+ * Find or allocate an appropriate client context for \a req.
+ * \pre req->rq_cli_ctx == NULL.
+ *
+ * \retval 0 success, req->rq_cli_ctx is set.
+ * \retval -ENOMEM no context could be obtained, req->rq_cli_ctx == NULL.
+ * \retval other error from validating the import's sec,
+ * req->rq_cli_ctx == NULL.
+ */
+int sptlrpc_req_get_ctx(struct ptlrpc_request *req)
+{
+	struct obd_import *imp = req->rq_import;
+	struct ptlrpc_sec *sec;
+	int		rc;
+	ENTRY;
+
+	LASSERT(!req->rq_cli_ctx);
+	LASSERT(imp);
+
+	rc = import_sec_validate_get(imp, &sec);
+	if (rc != 0)
+		RETURN(rc);
+
+	/* the sec reference is only needed while looking up the context */
+	req->rq_cli_ctx = get_my_ctx(sec);
+	sptlrpc_sec_put(sec);
+
+	if (req->rq_cli_ctx != NULL)
+		RETURN(0);
+
+	CERROR("req %p: fail to get context\n", req);
+	RETURN(-ENOMEM);
+}
+
+/**
+ * Detach \a req from its client context and drop the reference on it.
+ * \pre req->rq_cli_ctx != NULL.
+ * \post req->rq_cli_ctx == NULL.
+ *
+ * With \a sync == 0 this function should return quickly without sleeping;
+ * otherwise it might trigger, and wait for, the whole process of sending a
+ * context-destroying rpc to the server.
+ */
+void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync)
+{
+	struct ptlrpc_cli_ctx *ctx;
+	ENTRY;
+
+	LASSERT(req);
+	LASSERT(req->rq_cli_ctx);
+
+	ctx = req->rq_cli_ctx;
+
+	/*
+	 * The request may be released while it is still linked on the
+	 * context's waiting list; unlink it first.
+	 */
+	if (!list_empty(&req->rq_ctx_chain)) {
+		spin_lock(&ctx->cc_lock);
+		list_del_init(&req->rq_ctx_chain);
+		spin_unlock(&ctx->cc_lock);
+	}
+
+	sptlrpc_cli_ctx_put(ctx, sync);
+	req->rq_cli_ctx = NULL;
+	EXIT;
+}
+
+/**
+ * Move the buffers of \a req from \a oldctx over to \a newctx.
+ *
+ * The clear-text request message is saved in a temporary buffer, the request
+ * and reply buffers are freed while rq_cli_ctx points at \a oldctx, the
+ * flavor is recalculated with rq_cli_ctx pointing at \a newctx, and a new
+ * request buffer is allocated into which the saved message is copied back.
+ *
+ * \retval 0 success
+ * \retval -ENOMEM the temporary copy could not be allocated; nothing has
+ * been changed yet
+ * \retval other value returned by sptlrpc_cli_alloc_reqbuf(); the old
+ * buffers are already freed and the previous flavor is put back
+ */
+static
+int sptlrpc_req_ctx_switch(struct ptlrpc_request *req,
+			   struct ptlrpc_cli_ctx *oldctx,
+			   struct ptlrpc_cli_ctx *newctx)
+{
+	struct sptlrpc_flavor   old_flvr;
+	char		   *reqmsg = NULL; /* to workaround old gcc */
+	int		     reqmsg_size;
+	int		     rc = 0;
+
+	LASSERT(req->rq_reqmsg);
+	LASSERT(req->rq_reqlen);
+	LASSERT(req->rq_replen);
+
+	CDEBUG(D_SEC, "req %p: switch ctx %p(%u->%s) -> %p(%u->%s), "
+	       "switch sec %p(%s) -> %p(%s)\n", req,
+	       oldctx, oldctx->cc_vcred.vc_uid, sec2target_str(oldctx->cc_sec),
+	       newctx, newctx->cc_vcred.vc_uid, sec2target_str(newctx->cc_sec),
+	       oldctx->cc_sec, oldctx->cc_sec->ps_policy->sp_name,
+	       newctx->cc_sec, newctx->cc_sec->ps_policy->sp_name);
+
+	/* save flavor, so it can be put back if the new buffer can't be had */
+	old_flvr = req->rq_flvr;
+
+	/* save request message */
+	reqmsg_size = req->rq_reqlen;
+	if (reqmsg_size != 0) {
+		OBD_ALLOC_LARGE(reqmsg, reqmsg_size);
+		if (reqmsg == NULL)
+			return -ENOMEM;
+		memcpy(reqmsg, req->rq_reqmsg, reqmsg_size);
+	}
+
+	/*
+	 * release old req/rep buf; rq_cli_ctx is pointed at oldctx for the
+	 * duration of the two free calls and at newctx afterwards
+	 */
+	req->rq_cli_ctx = oldctx;
+	sptlrpc_cli_free_reqbuf(req);
+	sptlrpc_cli_free_repbuf(req);
+	req->rq_cli_ctx = newctx;
+
+	/* recalculate the flavor (opcode 0: request that changed ctx) */
+	sptlrpc_req_set_flavor(req, 0);
+
+	/* alloc new request buffer
+	 * we don't need to alloc reply buffer here, leave it to the
+	 * rest procedure of ptlrpc */
+	if (reqmsg_size != 0) {
+		rc = sptlrpc_cli_alloc_reqbuf(req, reqmsg_size);
+		if (!rc) {
+			LASSERT(req->rq_reqmsg);
+			memcpy(req->rq_reqmsg, reqmsg, reqmsg_size);
+		} else {
+			CWARN("failed to alloc reqbuf: %d\n", rc);
+			req->rq_flvr = old_flvr;
+		}
+
+		/* the temporary copy is freed on both outcomes */
+		OBD_FREE_LARGE(reqmsg, reqmsg_size);
+	}
+	return rc;
+}
+
+/**
+ * If the current context of \a req is dead somehow, e.g. we just switched
+ * flavor and thus marked the original contexts dead, find a new context for
+ * it. If no switch is needed, \a req will end up with the same context.
+ *
+ * \note a request must have a context, to keep other parts of code happy.
+ * In any case of failure during the switching, we must restore the old one.
+ *
+ * \retval 0 success; this includes the case where the same dead context was
+ * found again and the function merely slept before returning
+ * \retval other error from sptlrpc_req_get_ctx() or
+ * sptlrpc_req_ctx_switch(); req->rq_cli_ctx is \a oldctx again
+ */
+int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req)
+{
+	struct ptlrpc_cli_ctx *oldctx = req->rq_cli_ctx;
+	struct ptlrpc_cli_ctx *newctx;
+	int		    rc;
+	ENTRY;
+
+	LASSERT(oldctx);
+
+	/*
+	 * take a private reference on oldctx before the request drops its
+	 * own, so oldctx can still be restored on failure
+	 */
+	sptlrpc_cli_ctx_get(oldctx);
+	sptlrpc_req_put_ctx(req, 0);
+
+	rc = sptlrpc_req_get_ctx(req);
+	if (unlikely(rc)) {
+		LASSERT(!req->rq_cli_ctx);
+
+		/* restore old ctx; the private reference goes to the request */
+		req->rq_cli_ctx = oldctx;
+		RETURN(rc);
+	}
+
+	newctx = req->rq_cli_ctx;
+	LASSERT(newctx);
+
+	if (unlikely(newctx == oldctx &&
+		     test_bit(PTLRPC_CTX_DEAD_BIT, &oldctx->cc_flags))) {
+		/*
+		 * still get the old dead ctx, usually means system too busy
+		 */
+		CDEBUG(D_SEC,
+		       "ctx (%p, fl %lx) doesn't switch, relax a little bit\n",
+		       newctx, newctx->cc_flags);
+
+		schedule_timeout_and_set_state(TASK_INTERRUPTIBLE,
+						   HZ);
+	} else {
+		/*
+		 * it's possible newctx == oldctx if we're switching
+		 * subflavor with the same sec.
+		 */
+		rc = sptlrpc_req_ctx_switch(req, oldctx, newctx);
+		if (rc) {
+			/* restore old ctx: drop newctx, hand back oldctx */
+			sptlrpc_req_put_ctx(req, 0);
+			req->rq_cli_ctx = oldctx;
+			RETURN(rc);
+		}
+
+		LASSERT(req->rq_cli_ctx == newctx);
+	}
+
+	/* drop the private reference taken at the top */
+	sptlrpc_cli_ctx_put(oldctx, 1);
+	RETURN(0);
+}
+EXPORT_SYMBOL(sptlrpc_req_replace_dead_ctx);
+
+/**
+ * Wait condition for sptlrpc_req_refresh_ctx(): normalize the result of
+ * cli_ctx_is_refreshed() to 1 or 0.
+ */
+static
+int ctx_check_refresh(struct ptlrpc_cli_ctx *ctx)
+{
+	return cli_ctx_is_refreshed(ctx) ? 1 : 0;
+}
+
+/**
+ * Timeout callback used while waiting for a context refresh; \a data is the
+ * waiting request.
+ *
+ * \retval the value returned by ptlrpc_expire_one_request()
+ */
+static
+int ctx_refresh_timeout(void *data)
+{
+	struct ptlrpc_request *req = data;
+	int rc;
+
+	/* conn_cnt is needed in expire_one_request */
+	lustre_msg_set_conn_cnt(req->rq_reqmsg, req->rq_import->imp_conn_cnt);
+
+	rc = ptlrpc_expire_one_request(req, 1);
+	if (rc != 0)
+		return rc;
+
+	/*
+	 * If we started recovery, we should mark this ctx dead; otherwise,
+	 * in case lgssd died, nobody would retire this ctx, and the following
+	 * connect would still find the same ctx and thus deadlock.
+	 * There's an assumption that the expire time of the request should be
+	 * later than the context refresh expire time.
+	 */
+	req->rq_cli_ctx->cc_ops->die(req->rq_cli_ctx, 0);
+	return 0;
+}
+
+/**
+ * Interrupt callback used while waiting for a context refresh; \a data is
+ * the waiting request, which gets its rq_intr flag set under rq_lock.
+ */
+static
+void ctx_refresh_interrupt(void *data)
+{
+	struct ptlrpc_request *waiter = data;
+
+	spin_lock(&waiter->rq_lock);
+	waiter->rq_intr = 1;
+	spin_unlock(&waiter->rq_lock);
+}
+
+/**
+ * Unlink \a req from a context waiting list, if it is on one, while holding
+ * the cc_lock of \a ctx.
+ */
+static
+void req_off_ctx_list(struct ptlrpc_request *req, struct ptlrpc_cli_ctx *ctx)
+{
+	spin_lock(&ctx->cc_lock);
+	if (!list_empty(&req->rq_ctx_chain)) {
+		/* still linked: take it off and re-initialize the node */
+		list_del_init(&req->rq_ctx_chain);
+	}
+	spin_unlock(&ctx->cc_lock);
+}
+
+/**
+ * To refresh the context of \a req, if it's not up-to-date.
+ * \param timeout
+ * - < 0: don't wait
+ * - = 0: wait until success or fatal error occur
+ * - > 0: timeout value (in seconds)
+ *
+ * The status of the context could be subject to be changed by other threads
+ * at any time. We allow this race, but once we return with 0, the caller will
+ * suppose it's up to date and keep using it until the owning rpc is done.
+ *
+ * Requests flagged rq_ctx_init or rq_ctx_fini are never refreshed here and
+ * return 0 immediately.
+ *
+ * \retval 0 only if the context is up to date.
+ * \retval -EWOULDBLOCK the context is not ready and \a timeout < 0; the
+ * request is left on the context's waiting list.
+ * \retval -ev error number.
+ */
+int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout)
+{
+	struct ptlrpc_cli_ctx  *ctx = req->rq_cli_ctx;
+	struct ptlrpc_sec      *sec;
+	struct l_wait_info      lwi;
+	int		     rc;
+	ENTRY;
+
+	LASSERT(ctx);
+
+	if (req->rq_ctx_init || req->rq_ctx_fini)
+		RETURN(0);
+
+	/*
+	 * during the process a request's context might change type even
+	 * (e.g. from gss ctx to null ctx), so each loop we need to re-check
+	 * everything
+	 */
+again:
+	rc = import_sec_validate_get(req->rq_import, &sec);
+	if (rc)
+		RETURN(rc);
+
+	if (sec->ps_flvr.sf_rpc != req->rq_flvr.sf_rpc) {
+		CDEBUG(D_SEC, "req %p: flavor has changed %x -> %x\n",
+		      req, req->rq_flvr.sf_rpc, sec->ps_flvr.sf_rpc);
+		req_off_ctx_list(req, ctx);
+		/*
+		 * NOTE(review): the return value is not checked here; on
+		 * failure the old context is restored by the callee, so
+		 * ctx below is then unchanged -- confirm this is intended.
+		 */
+		sptlrpc_req_replace_dead_ctx(req);
+		ctx = req->rq_cli_ctx;
+	}
+	sptlrpc_sec_put(sec);
+
+	if (cli_ctx_is_eternal(ctx))
+		RETURN(0);
+
+	if (unlikely(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags))) {
+		LASSERT(ctx->cc_ops->refresh);
+		ctx->cc_ops->refresh(ctx);
+	}
+	LASSERT(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags) == 0);
+
+	LASSERT(ctx->cc_ops->validate);
+	if (ctx->cc_ops->validate(ctx) == 0) {
+		req_off_ctx_list(req, ctx);
+		RETURN(0);
+	}
+
+	if (unlikely(test_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags))) {
+		spin_lock(&req->rq_lock);
+		req->rq_err = 1;
+		spin_unlock(&req->rq_lock);
+		req_off_ctx_list(req, ctx);
+		RETURN(-EPERM);
+	}
+
+	/*
+	 * There's a subtle issue for resending RPCs, suppose following
+	 * situation:
+	 *  1. the request was sent to server.
+	 *  2. recovery was kicked start, after finished the request was
+	 *     marked as resent.
+	 *  3. resend the request.
+	 *  4. old reply from server received, we accept and verify the reply.
+	 *     this has to be success, otherwise the error will be aware
+	 *     by application.
+	 *  5. new reply from server received, dropped by LNet.
+	 *
+	 * Note the xid of old & new request is the same. We can't simply
+	 * change xid for the resent request because the server replies on
+	 * it for reply reconstruction.
+	 *
+	 * Commonly the original context should be uptodate because we
+	 * have a expiry nice time; server will keep its context because
+	 * we at least hold a ref of old context which prevent context
+	 * destroying RPC being sent. So server still can accept the request
+	 * and finish the RPC. But if that's not the case:
+	 *  1. If server side context has been trimmed, a NO_CONTEXT will
+	 *     be returned, gss_cli_ctx_verify/unseal will switch to new
+	 *     context by force.
+	 *  2. Current context never be refreshed, then we are fine: we
+	 *     never really send request with old context before.
+	 */
+	if (test_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags) &&
+	    unlikely(req->rq_reqmsg) &&
+	    lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) {
+		req_off_ctx_list(req, ctx);
+		RETURN(0);
+	}
+
+	if (unlikely(test_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags))) {
+		req_off_ctx_list(req, ctx);
+		/*
+		 * don't switch ctx if import was deactivated
+		 */
+		if (req->rq_import->imp_deactive) {
+			spin_lock(&req->rq_lock);
+			req->rq_err = 1;
+			spin_unlock(&req->rq_lock);
+			RETURN(-EINTR);
+		}
+
+		rc = sptlrpc_req_replace_dead_ctx(req);
+		if (rc) {
+			LASSERT(ctx == req->rq_cli_ctx);
+			CERROR("req %p: failed to replace dead ctx %p: %d\n",
+			       req, ctx, rc);
+			spin_lock(&req->rq_lock);
+			req->rq_err = 1;
+			spin_unlock(&req->rq_lock);
+			RETURN(rc);
+		}
+
+		/* start over with whatever context the request has now */
+		ctx = req->rq_cli_ctx;
+		goto again;
+	}
+
+	/*
+	 * Now we're sure this context is during upcall, add myself into
+	 * waiting list
+	 */
+	spin_lock(&ctx->cc_lock);
+	if (list_empty(&req->rq_ctx_chain))
+		list_add(&req->rq_ctx_chain, &ctx->cc_req_list);
+	spin_unlock(&ctx->cc_lock);
+
+	if (timeout < 0)
+		RETURN(-EWOULDBLOCK);
+
+	/* Clear any flags that may be present from previous sends */
+	LASSERT(req->rq_receiving_reply == 0);
+	spin_lock(&req->rq_lock);
+	req->rq_err = 0;
+	req->rq_timedout = 0;
+	req->rq_resend = 0;
+	req->rq_restart = 0;
+	spin_unlock(&req->rq_lock);
+
+	lwi = LWI_TIMEOUT_INTR(timeout * HZ, ctx_refresh_timeout,
+			       ctx_refresh_interrupt, req);
+	rc = l_wait_event(req->rq_reply_waitq, ctx_check_refresh(ctx), &lwi);
+
+	/*
+	 * following cases could lead us here:
+	 * - successfully refreshed;
+	 * - interrupted;
+	 * - timedout, and we don't want recover from the failure;
+	 * - timedout, and waked up upon recovery finished;
+	 * - someone else mark this ctx dead by force;
+	 * - someone invalidate the req and call ptlrpc_client_wake_req(),
+	 *   e.g. ptlrpc_abort_inflight();
+	 */
+	if (!cli_ctx_is_refreshed(ctx)) {
+		/* timed out or interrupted */
+		req_off_ctx_list(req, ctx);
+
+		LASSERT(rc != 0);
+		RETURN(rc);
+	}
+
+	/* refreshed: run all the checks again before reporting success */
+	goto again;
+}
+
+/**
+ * Set up the flavor and the security related flags of \a req, depending on
+ * \a opcode.
+ *
+ * \note this could be called in two situations:
+ * - new request from ptlrpc_pre_req(), with proper @opcode
+ * - old request which changed ctx in the middle, with @opcode == 0
+ *
+ * The flavor is copied from the sec of the request's context under ps_lock.
+ */
+void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode)
+{
+	struct ptlrpc_sec *sec;
+
+	LASSERT(req->rq_import);
+	LASSERT(req->rq_cli_ctx);
+	LASSERT(req->rq_cli_ctx->cc_sec);
+	LASSERT(req->rq_bulk_read == 0 || req->rq_bulk_write == 0);
+
+	sec = req->rq_cli_ctx->cc_sec;
+
+	/* special security flags according to opcode */
+	switch (opcode) {
+	case 0:
+		/* init/fini rpc won't be resend, so can't be here */
+		LASSERT(req->rq_ctx_init == 0);
+		LASSERT(req->rq_ctx_fini == 0);
+
+		/* cleanup flags, which should be recalculated */
+		req->rq_pack_udesc = 0;
+		req->rq_pack_bulk = 0;
+		break;
+	case SEC_CTX_INIT:
+		req->rq_ctx_init = 1;
+		break;
+	case SEC_CTX_FINI:
+		req->rq_ctx_fini = 1;
+		break;
+	case OST_WRITE:
+	case MDS_WRITEPAGE:
+		req->rq_bulk_write = 1;
+		break;
+	case OST_READ:
+	case MDS_READPAGE:
+	case MGS_CONFIG_READ:
+	case OBD_IDX_READ:
+		req->rq_bulk_read = 1;
+		break;
+	default:
+		/* any other opcode needs no special flag */
+		break;
+	}
+
+	spin_lock(&sec->ps_lock);
+	req->rq_flvr = sec->ps_flvr;
+	spin_unlock(&sec->ps_lock);
+
+	/* force SVC_NULL for context initiation rpc, SVC_INTG for context
+	 * destruction rpc */
+	if (unlikely(req->rq_ctx_init)) {
+		flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_NULL);
+	} else if (unlikely(req->rq_ctx_fini)) {
+		flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_INTG);
+	}
+
+	/* user descriptor flag, null security can't do it anyway */
+	if (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_UDESC) {
+		if (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL)
+			req->rq_pack_udesc = 1;
+	}
+
+	/* bulk security flag */
+	if (req->rq_bulk_read || req->rq_bulk_write) {
+		if (sptlrpc_flavor_has_bulk(&req->rq_flvr))
+			req->rq_pack_bulk = 1;
+	}
+}
+
+/**
+ * Free the request buffer of \a req once it is no longer needed for sending.
+ *
+ * Only requests using the privacy service are touched, and only when the
+ * buffer exists and the request has no rq_pool set.
+ */
+void sptlrpc_request_out_callback(struct ptlrpc_request *req)
+{
+	if (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_SVC_PRIV)
+		return;
+
+	LASSERT(req->rq_clrbuf);
+
+	if (!req->rq_pool && req->rq_reqbuf) {
+		OBD_FREE(req->rq_reqbuf, req->rq_reqbuf_len);
+		req->rq_reqbuf = NULL;
+		req->rq_reqbuf_len = 0;
+	}
+}
+
+/**
+ * Given an import \a imp, check whether the current user has a valid context
+ * or not. We may create a new context and try to refresh it, retrying in
+ * case of non-fatal errors.
+ *
+ * A temporary request is allocated only to drive sptlrpc_req_refresh_ctx();
+ * it is freed again before returning.
+ *
+ * \retval 0 the context is eternal, already valid, or was refreshed
+ * \retval -ENOMEM no context or no temporary request could be obtained
+ * \retval -EACCES the context is in error state
+ * \retval other error from sptlrpc_req_refresh_ctx()
+ */
+int sptlrpc_import_check_ctx(struct obd_import *imp)
+{
+	struct ptlrpc_sec     *sec;
+	struct ptlrpc_cli_ctx *ctx;
+	struct ptlrpc_request *req = NULL;
+	struct sptlrpc_flavor  flvr;
+	int rc;
+	ENTRY;
+
+	might_sleep();
+
+	sec = sptlrpc_import_sec_ref(imp);
+	ctx = get_my_ctx(sec);
+	/*
+	 * Copy the flavor while our reference is still held; sec must not
+	 * be touched once the reference has been dropped.
+	 */
+	flvr = sec->ps_flvr;
+	sptlrpc_sec_put(sec);
+
+	if (!ctx)
+		RETURN(-ENOMEM);
+
+	if (cli_ctx_is_eternal(ctx) ||
+	    ctx->cc_ops->validate(ctx) == 0) {
+		sptlrpc_cli_ctx_put(ctx, 1);
+		RETURN(0);
+	}
+
+	if (cli_ctx_is_error(ctx)) {
+		sptlrpc_cli_ctx_put(ctx, 1);
+		RETURN(-EACCES);
+	}
+
+	OBD_ALLOC_PTR(req);
+	if (!req) {
+		/* don't leak the context reference taken by get_my_ctx() */
+		sptlrpc_cli_ctx_put(ctx, 1);
+		RETURN(-ENOMEM);
+	}
+
+	spin_lock_init(&req->rq_lock);
+	atomic_set(&req->rq_refcount, 10000);
+	INIT_LIST_HEAD(&req->rq_ctx_chain);
+	init_waitqueue_head(&req->rq_reply_waitq);
+	init_waitqueue_head(&req->rq_set_waitq);
+	req->rq_import = imp;
+	req->rq_flvr = flvr;
+	req->rq_cli_ctx = ctx;
+
+	rc = sptlrpc_req_refresh_ctx(req, 0);
+	LASSERT(list_empty(&req->rq_ctx_chain));
+	/* the refresh may have replaced the context; put the current one */
+	sptlrpc_cli_ctx_put(req->rq_cli_ctx, 1);
+	OBD_FREE_PTR(req);
+
+	RETURN(rc);
+}
+
+/**
+ * Used by the ptlrpc client to apply the pre-defined security
+ * transformation to the request message of \a req. After this function has
+ * been called, req->rq_reqmsg is still accessible as clear text.
+ *
+ * \retval 0 success
+ * \retval other error from bulk wrapping or from the context's sign()/seal()
+ */
+int sptlrpc_cli_wrap_request(struct ptlrpc_request *req)
+{
+	struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(ctx);
+	LASSERT(ctx->cc_sec);
+	LASSERT(req->rq_reqbuf || req->rq_clrbuf);
+
+	/*
+	 * The bulk request is wrapped here because by now we can be sure
+	 * the context is up to date.
+	 */
+	if (req->rq_bulk) {
+		rc = sptlrpc_cli_wrap_bulk(req, req->rq_bulk);
+		if (rc != 0)
+			RETURN(rc);
+	}
+
+	switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
+	case SPTLRPC_SVC_PRIV:
+		LASSERT(ctx->cc_ops->seal);
+		rc = ctx->cc_ops->seal(ctx, req);
+		break;
+	case SPTLRPC_SVC_NULL:
+	case SPTLRPC_SVC_AUTH:
+	case SPTLRPC_SVC_INTG:
+		LASSERT(ctx->cc_ops->sign);
+		rc = ctx->cc_ops->sign(ctx, req);
+		break;
+	default:
+		LBUG();
+	}
+
+	if (rc != 0)
+		RETURN(rc);
+
+	/* sanity of the produced wire data */
+	LASSERT(req->rq_reqdata_len);
+	LASSERT(req->rq_reqdata_len % 8 == 0);
+	LASSERT(req->rq_reqdata_len <= req->rq_reqbuf_len);
+
+	RETURN(0);
+}
+
+/**
+ * Common part of reply unwrapping: unpack the message header found at
+ * req->rq_repdata, check its length and policy against the request, and let
+ * the context verify() or unseal() it depending on the service of the
+ * request flavor.
+ *
+ * \retval 0 success
+ * \retval -EPROTO header could not be unpacked, reply too short, or reply
+ * policy differs from request policy
+ * \retval other error from the context's verify()/unseal()
+ */
+static int do_cli_unwrap_reply(struct ptlrpc_request *req)
+{
+	struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+	int		    rc;
+	ENTRY;
+
+	LASSERT(ctx);
+	LASSERT(ctx->cc_sec);
+	LASSERT(req->rq_repbuf);
+	LASSERT(req->rq_repdata);
+	LASSERT(req->rq_repmsg == NULL);
+
+	req->rq_rep_swab_mask = 0;
+
+	rc = __lustre_unpack_msg(req->rq_repdata, req->rq_repdata_len);
+	if (rc == 1) {
+		/* record that the header has been swabbed */
+		lustre_set_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+	} else if (rc != 0) {
+		CERROR("failed unpack reply: x"LPU64"\n", req->rq_xid);
+		RETURN(-EPROTO);
+	}
+
+	if (req->rq_repdata_len < sizeof(struct lustre_msg)) {
+		CERROR("replied data length %d too small\n",
+		       req->rq_repdata_len);
+		RETURN(-EPROTO);
+	}
+
+	if (SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr) !=
+	    SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc)) {
+		CERROR("reply policy %u doesn't match request policy %u\n",
+		       SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr),
+		       SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc));
+		RETURN(-EPROTO);
+	}
+
+	switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
+	case SPTLRPC_SVC_PRIV:
+		LASSERT(ctx->cc_ops->unseal);
+		rc = ctx->cc_ops->unseal(ctx, req);
+		break;
+	case SPTLRPC_SVC_NULL:
+	case SPTLRPC_SVC_AUTH:
+	case SPTLRPC_SVC_INTG:
+		LASSERT(ctx->cc_ops->verify);
+		rc = ctx->cc_ops->verify(ctx, req);
+		break;
+	default:
+		LBUG();
+	}
+	LASSERT(rc || req->rq_repmsg || req->rq_resend);
+
+	/* the swab mask is reset again unless null policy or ctx init */
+	if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL &&
+	    !req->rq_ctx_init)
+		req->rq_rep_swab_mask = 0;
+
+	RETURN(rc);
+}
+
+/**
+ * Used by the ptlrpc client to perform the security transformation on the
+ * reply message of \a req. After a successful return, req->rq_repmsg points
+ * to the reply message in clear text.
+ *
+ * \pre the reply buffer should have been un-posted from LNet, so nothing is
+ * going to change.
+ *
+ * \retval -EPROTO the reply offset is 0 although the request advertises
+ * MSGHDR_AT_SUPPORT, or the offset is not a multiple of 8
+ * \retval otherwise the result of do_cli_unwrap_reply()
+ */
+int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req)
+{
+	LASSERT(req->rq_repbuf);
+	LASSERT(req->rq_repdata == NULL);
+	LASSERT(req->rq_repmsg == NULL);
+	LASSERT(req->rq_reply_off + req->rq_nob_received <= req->rq_repbuf_len);
+
+	if (!req->rq_reply_off &&
+	    (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) {
+		CERROR("real reply with offset 0\n");
+		return -EPROTO;
+	}
+
+	if (req->rq_reply_off % 8) {
+		CERROR("reply at odd offset %u\n", req->rq_reply_off);
+		return -EPROTO;
+	}
+
+	/* the wire reply starts rq_reply_off bytes into the reply buffer */
+	req->rq_repdata_len = req->rq_nob_received;
+	req->rq_repdata = (struct lustre_msg *)
+				(req->rq_repbuf + req->rq_reply_off);
+
+	return do_cli_unwrap_reply(req);
+}
+
+/**
+ * Used by ptlrpc client, to perform security transformation upon the early
+ * reply message of \a req. We expect the rq_reply_off is 0, and
+ * rq_nob_received is the early reply size.
+ *
+ * Because the receive buffer might be still posted, the reply data might be
+ * changed at any time, no matter we're holding rq_lock or not. For this reason
+ * we allocate a separate ptlrpc_request and reply buffer for early reply
+ * processing.
+ *
+ * \retval 0 success, \a req_ret is filled with a duplicated ptlrpc_request.
+ * Later the caller must call sptlrpc_cli_finish_early_reply() on the returned
+ * \a *req_ret to release it.
+ * \retval -ev error number, and \a req_ret will not be set.
+ */
+int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req,
+				   struct ptlrpc_request **req_ret)
+{
+	struct ptlrpc_request  *early_req;
+	char		   *early_buf;
+	int		     early_bufsz, early_size;
+	int		     rc;
+	ENTRY;
+
+	OBD_ALLOC_PTR(early_req);
+	if (early_req == NULL)
+		RETURN(-ENOMEM);
+
+	/*
+	 * the size is sampled before rq_lock is taken; it is compared with
+	 * rq_nob_received again under the lock below
+	 */
+	early_size = req->rq_nob_received;
+	early_bufsz = size_roundup_power2(early_size);
+	OBD_ALLOC_LARGE(early_buf, early_bufsz);
+	if (early_buf == NULL)
+		GOTO(err_req, rc = -ENOMEM);
+
+	/* sanity checkings and copy data out, do it inside spinlock */
+	spin_lock(&req->rq_lock);
+
+	if (req->rq_replied) {
+		spin_unlock(&req->rq_lock);
+		GOTO(err_buf, rc = -EALREADY);
+	}
+
+	LASSERT(req->rq_repbuf);
+	LASSERT(req->rq_repdata == NULL);
+	LASSERT(req->rq_repmsg == NULL);
+
+	if (req->rq_reply_off != 0) {
+		CERROR("early reply with offset %u\n", req->rq_reply_off);
+		spin_unlock(&req->rq_lock);
+		GOTO(err_buf, rc = -EPROTO);
+	}
+
+	if (req->rq_nob_received != early_size) {
+		/* even another early arrived the size should be the same */
+		CERROR("data size has changed from %u to %u\n",
+		       early_size, req->rq_nob_received);
+		spin_unlock(&req->rq_lock);
+		GOTO(err_buf, rc = -EINVAL);
+	}
+
+	if (req->rq_nob_received < sizeof(struct lustre_msg)) {
+		CERROR("early reply length %d too small\n",
+		       req->rq_nob_received);
+		spin_unlock(&req->rq_lock);
+		GOTO(err_buf, rc = -EALREADY);
+	}
+
+	memcpy(early_buf, req->rq_repbuf, early_size);
+	spin_unlock(&req->rq_lock);
+
+	/*
+	 * set up the duplicate: it shares the context (extra reference) and
+	 * the request message pointer with req, but owns its reply buffer
+	 */
+	spin_lock_init(&early_req->rq_lock);
+	early_req->rq_cli_ctx = sptlrpc_cli_ctx_get(req->rq_cli_ctx);
+	early_req->rq_flvr = req->rq_flvr;
+	early_req->rq_repbuf = early_buf;
+	early_req->rq_repbuf_len = early_bufsz;
+	early_req->rq_repdata = (struct lustre_msg *) early_buf;
+	early_req->rq_repdata_len = early_size;
+	early_req->rq_early = 1;
+	early_req->rq_reqmsg = req->rq_reqmsg;
+
+	rc = do_cli_unwrap_reply(early_req);
+	if (rc) {
+		DEBUG_REQ(D_ADAPTTO, early_req,
+			  "error %d unwrap early reply", rc);
+		GOTO(err_ctx, rc);
+	}
+
+	LASSERT(early_req->rq_repmsg);
+	*req_ret = early_req;
+	RETURN(0);
+
+	/* error unwinding, in reverse order of acquisition */
+err_ctx:
+	sptlrpc_cli_ctx_put(early_req->rq_cli_ctx, 1);
+err_buf:
+	OBD_FREE_LARGE(early_buf, early_bufsz);
+err_req:
+	OBD_FREE_PTR(early_req);
+	RETURN(rc);
+}
+
+/**
+ * Used by the ptlrpc client to release an early reply \a early_req after it
+ * has been processed: the context reference is dropped, then the reply
+ * buffer and the request itself are freed.
+ *
+ * \pre \a early_req was obtained from calling sptlrpc_cli_unwrap_early_reply().
+ */
+void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req)
+{
+	struct ptlrpc_cli_ctx *ctx;
+
+	LASSERT(early_req->rq_repbuf);
+	LASSERT(early_req->rq_repdata);
+	LASSERT(early_req->rq_repmsg);
+
+	ctx = early_req->rq_cli_ctx;
+	sptlrpc_cli_ctx_put(ctx, 1);
+
+	OBD_FREE_LARGE(early_req->rq_repbuf, early_req->rq_repbuf_len);
+	OBD_FREE_PTR(early_req);
+}
+
+/**************************************************
+ * sec ID					 *
+ **************************************************/
+
+/*
+ * Source of sec IDs. "fixed" sec (e.g. null) use sec_id < 0.
+ *
+ * The counter starts at 1 and is incremented before its value is handed
+ * out, so the first ID returned is 2.
+ */
+static atomic_t sptlrpc_sec_id = ATOMIC_INIT(1);
+
+/**
+ * Atomically increment the sec ID counter and return the new value.
+ */
+int sptlrpc_get_next_secid(void)
+{
+	int next_id = atomic_inc_return(&sptlrpc_sec_id);
+
+	return next_id;
+}
+EXPORT_SYMBOL(sptlrpc_get_next_secid);
+
+/**************************************************
+ * client side high-level security APIs	   *
+ **************************************************/
+
+/**
+ * Forward a context cache flush for \a sec to its policy.
+ *
+ * The policy must provide client operations including flush_ctx_cache()
+ * (asserted); \a uid, \a grace and \a force are passed through unchanged.
+ *
+ * \retval the value returned by the policy's flush_ctx_cache()
+ */
+static int sec_cop_flush_ctx_cache(struct ptlrpc_sec *sec, uid_t uid,
+				   int grace, int force)
+{
+	struct ptlrpc_sec_policy *policy = sec->ps_policy;
+	int rc;
+
+	LASSERT(policy->sp_cops);
+	LASSERT(policy->sp_cops->flush_ctx_cache);
+
+	rc = policy->sp_cops->flush_ctx_cache(sec, uid, grace, force);
+	return rc;
+}
+
+/**
+ * Destroy \a sec through its policy and drop the policy reference.
+ *
+ * \pre no reference and no context is left on \a sec (both counters are
+ * asserted to be zero).
+ */
+static void sec_cop_destroy_sec(struct ptlrpc_sec *sec)
+{
+	struct ptlrpc_sec_policy *policy = sec->ps_policy;
+
+	LASSERT_ATOMIC_ZERO(&sec->ps_refcount);
+	LASSERT_ATOMIC_ZERO(&sec->ps_nctx);
+	LASSERT(policy->sp_cops->destroy_sec);
+
+	CDEBUG(D_SEC, "%s@%p: being destroied\n", policy->sp_name, sec);
+
+	/* policy was read up front: sec must not be used after destroy_sec() */
+	policy->sp_cops->destroy_sec(sec);
+	sptlrpc_policy_put(policy);
+}
+
+/**
+ * Exported wrapper around sec_cop_destroy_sec(): destroy \a sec through its
+ * policy. The same preconditions apply (no references, no contexts left).
+ */
+void sptlrpc_sec_destroy(struct ptlrpc_sec *sec)
+{
+	sec_cop_destroy_sec(sec);
+}
+EXPORT_SYMBOL(sptlrpc_sec_destroy);
+
+/**
+ * Kill \a sec, if its policy supports that.
+ *
+ * When the policy implements kill_sec() it is called, and afterwards the
+ * context cache is flushed with uid -1, grace 1 and force 1. Policies
+ * without kill_sec() are left untouched.
+ *
+ * \pre caller holds a reference on \a sec.
+ */
+static void sptlrpc_sec_kill(struct ptlrpc_sec *sec)
+{
+	LASSERT_ATOMIC_POS(&sec->ps_refcount);
+
+	if (!sec->ps_policy->sp_cops->kill_sec)
+		return;
+
+	sec->ps_policy->sp_cops->kill_sec(sec);
+	sec_cop_flush_ctx_cache(sec, -1, 1, 1);
+}
+
+/**
+ * Take a reference on \a sec; a NULL \a sec is accepted and passed through.
+ *
+ * \retval \a sec
+ */
+struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec)
+{
+	if (sec == NULL)
+		return NULL;
+
+	atomic_inc(&sec->ps_refcount);
+	return sec;
+}
+EXPORT_SYMBOL(sptlrpc_sec_get);
+
+/**
+ * Drop a reference on \a sec; a NULL \a sec is ignored.
+ *
+ * When the last reference is dropped the sec is removed from garbage
+ * collection and destroyed through its policy.
+ */
+void sptlrpc_sec_put(struct ptlrpc_sec *sec)
+{
+	if (sec == NULL)
+		return;
+
+	LASSERT_ATOMIC_POS(&sec->ps_refcount);
+
+	if (!atomic_dec_and_test(&sec->ps_refcount))
+		return;
+
+	/* that was the last reference */
+	sptlrpc_gc_del_sec(sec);
+	sec_cop_destroy_sec(sec);
+}
+EXPORT_SYMBOL(sptlrpc_sec_put);
+
+/*
+ * Create a sec for \a imp with flavor \a sf.
+ *
+ * With \a svc_ctx set a reverse sec is created using the policy of that
+ * service context, and the REVERSE and ROOTONLY flags are added to \a sf
+ * (the caller's flavor is modified). Otherwise the policy is looked up from
+ * the wire flavor. On success the new sec carries one additional reference,
+ * has its part set to \a sp, and is registered for garbage collection if it
+ * has a gc interval and the policy implements gc_ctx(). Returns NULL if no
+ * policy matches the flavor or the policy fails to create the sec.
+ *
+ * policy module is responsible for taking reference of import
+ */
+static
+struct ptlrpc_sec * sptlrpc_sec_create(struct obd_import *imp,
+				       struct ptlrpc_svc_ctx *svc_ctx,
+				       struct sptlrpc_flavor *sf,
+				       enum lustre_sec_part sp)
+{
+	struct ptlrpc_sec_policy *policy;
+	struct ptlrpc_sec	*sec;
+	char		      name[32];
+	ENTRY;
+
+	if (!svc_ctx) {
+		LASSERT(imp->imp_dlm_fake == 0);
+
+		CDEBUG(D_SEC, "%s %s: select security flavor %s\n",
+		       imp->imp_obd->obd_type->typ_name,
+		       imp->imp_obd->obd_name,
+		       sptlrpc_flavor2name(sf, name, sizeof(name)));
+
+		policy = sptlrpc_wireflavor2policy(sf->sf_rpc);
+		if (!policy) {
+			CERROR("invalid flavor 0x%x\n", sf->sf_rpc);
+			RETURN(NULL);
+		}
+	} else {
+		LASSERT(imp->imp_dlm_fake == 1);
+
+		CDEBUG(D_SEC, "%s %s: reverse sec using flavor %s\n",
+		       imp->imp_obd->obd_type->typ_name,
+		       imp->imp_obd->obd_name,
+		       sptlrpc_flavor2name(sf, name, sizeof(name)));
+
+		policy = sptlrpc_policy_get(svc_ctx->sc_policy);
+		sf->sf_flags |= PTLRPC_SEC_FL_REVERSE | PTLRPC_SEC_FL_ROOTONLY;
+	}
+
+	sec = policy->sp_cops->create_sec(imp, svc_ctx, sf);
+	if (!sec) {
+		/* creation failed: give the policy reference back */
+		sptlrpc_policy_put(policy);
+		RETURN(NULL);
+	}
+
+	atomic_inc(&sec->ps_refcount);
+	sec->ps_part = sp;
+
+	if (sec->ps_gc_interval && policy->sp_cops->gc_ctx)
+		sptlrpc_gc_add_sec(sec);
+
+	RETURN(sec);
+}
+
+/**
+ * Return the sec currently installed on \a imp with an extra reference,
+ * taken while holding imp_lock.
+ *
+ * \retval NULL if the import has no sec
+ */
+struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp)
+{
+	struct ptlrpc_sec *cur;
+
+	spin_lock(&imp->imp_lock);
+	cur = sptlrpc_sec_get(imp->imp_sec);
+	spin_unlock(&imp->imp_lock);
+
+	return cur;
+}
+EXPORT_SYMBOL(sptlrpc_import_sec_ref);
+
+/**
+ * Make \a sec the sec of \a imp.
+ *
+ * The pointer is swapped under imp_lock. A previously installed sec is
+ * killed and the reference the import held on it is dropped, both outside
+ * the lock.
+ */
+static void sptlrpc_import_sec_install(struct obd_import *imp,
+				       struct ptlrpc_sec *sec)
+{
+	struct ptlrpc_sec *prev;
+
+	LASSERT_ATOMIC_POS(&sec->ps_refcount);
+
+	spin_lock(&imp->imp_lock);
+	prev = imp->imp_sec;
+	imp->imp_sec = sec;
+	spin_unlock(&imp->imp_lock);
+
+	if (!prev)
+		return;
+
+	sptlrpc_sec_kill(prev);
+
+	/* balance the ref taken by this import */
+	sptlrpc_sec_put(prev);
+}
+
+/**
+ * Byte-wise comparison of two flavors.
+ *
+ * \retval 1 if all sizeof(struct sptlrpc_flavor) bytes are identical
+ * \retval 0 otherwise
+ */
+static inline
+int flavor_equal(struct sptlrpc_flavor *sf1, struct sptlrpc_flavor *sf2)
+{
+	return !memcmp(sf1, sf2, sizeof(*sf1));
+}
+
+/**
+ * Copy flavor \a src into \a dst by structure assignment.
+ *
+ * NOTE(review): flavor_equal() compares the raw bytes of the structure; if
+ * struct sptlrpc_flavor contains padding, a structure assignment is not
+ * guaranteed to copy it -- confirm the layout has none.
+ */
+static inline
+void flavor_copy(struct sptlrpc_flavor *dst, struct sptlrpc_flavor *src)
+{
+	*dst = *src;
+}
+
+/**
+ * Switch the existing \a sec over to flavor \a sf without replacing it.
+ *
+ * A change of the flag word is logged; the flavor itself is overwritten
+ * under ps_lock. \a imp is not used here.
+ */
+static void sptlrpc_import_sec_adapt_inplace(struct obd_import *imp,
+					     struct ptlrpc_sec *sec,
+					     struct sptlrpc_flavor *sf)
+{
+	char    old_flags[32];
+	char    new_flags[32];
+
+	if (sec->ps_flvr.sf_flags != sf->sf_flags) {
+		CDEBUG(D_SEC, "changing sec flags: %s -> %s\n",
+		       sptlrpc_secflags2str(sec->ps_flvr.sf_flags,
+					    old_flags, sizeof(old_flags)),
+		       sptlrpc_secflags2str(sf->sf_flags,
+					    new_flags, sizeof(new_flags)));
+	}
+
+	spin_lock(&sec->ps_lock);
+	flavor_copy(&sec->ps_flvr, sf);
+	spin_unlock(&sec->ps_lock);
+}
+
+/**
+ * To get an appropriate ptlrpc_sec for the \a imp, according to the current
+ * configuration. Upon called, imp->imp_sec may or may not be NULL.
+ *
+ *  - regular import: \a svc_ctx should be NULL and \a flvr is ignored;
+ *  - reverse import: \a svc_ctx and \a flvr are obtained from incoming request.
+ *
+ * If the import already has a sec with exactly the wanted flavor nothing is
+ * done; if only parts other than policy and mechanism differ, the existing
+ * sec is adapted in place; otherwise a new sec is created and installed
+ * under imp_sec_mutex.
+ *
+ * \retval 0 success, or \a imp is NULL
+ * \retval -EPERM a new sec could not be created
+ */
+int sptlrpc_import_sec_adapt(struct obd_import *imp,
+			     struct ptlrpc_svc_ctx *svc_ctx,
+			     struct sptlrpc_flavor *flvr)
+{
+	struct ptlrpc_connection   *conn;
+	struct sptlrpc_flavor       sf;
+	struct ptlrpc_sec	  *sec, *newsec;
+	enum lustre_sec_part	sp;
+	char			str[24];
+	int			 rc = 0;
+	ENTRY;
+
+	might_sleep();
+
+	if (imp == NULL)
+		RETURN(0);
+
+	conn = imp->imp_connection;
+
+	if (svc_ctx == NULL) {
+		struct client_obd *cliobd = &imp->imp_obd->u.cli;
+		/*
+		 * normal import, determine flavor from rule set, except
+		 * for mgc the flavor is predetermined.
+		 */
+		if (cliobd->cl_sp_me == LUSTRE_SP_MGC)
+			sf = cliobd->cl_flvr_mgc;
+		else
+			sptlrpc_conf_choose_flavor(cliobd->cl_sp_me,
+						   cliobd->cl_sp_to,
+						   &cliobd->cl_target_uuid,
+						   conn->c_self, &sf);
+
+		sp = imp->imp_obd->u.cli.cl_sp_me;
+	} else {
+		/* reverse import, determine flavor from incoming request */
+		sf = *flvr;
+
+		if (sf.sf_rpc != SPTLRPC_FLVR_NULL)
+			sf.sf_flags = PTLRPC_SEC_FL_REVERSE |
+				      PTLRPC_SEC_FL_ROOTONLY;
+
+		sp = sptlrpc_target_sec_part(imp->imp_obd);
+	}
+
+	/* reference on the current sec (may be NULL), dropped at out: */
+	sec = sptlrpc_import_sec_ref(imp);
+	if (sec) {
+		char    str2[24];
+
+		/* same flavor as before: nothing to do */
+		if (flavor_equal(&sf, &sec->ps_flvr))
+			GOTO(out, rc);
+
+		CDEBUG(D_SEC, "import %s->%s: changing flavor %s -> %s\n",
+		       imp->imp_obd->obd_name,
+		       obd_uuid2str(&conn->c_remote_uuid),
+		       sptlrpc_flavor2name(&sec->ps_flvr, str, sizeof(str)),
+		       sptlrpc_flavor2name(&sf, str2, sizeof(str2)));
+
+		/* same policy and mechanism: keep the sec, update its flavor */
+		if (SPTLRPC_FLVR_POLICY(sf.sf_rpc) ==
+		    SPTLRPC_FLVR_POLICY(sec->ps_flvr.sf_rpc) &&
+		    SPTLRPC_FLVR_MECH(sf.sf_rpc) ==
+		    SPTLRPC_FLVR_MECH(sec->ps_flvr.sf_rpc)) {
+			sptlrpc_import_sec_adapt_inplace(imp, sec, &sf);
+			GOTO(out, rc);
+		}
+	} else if (SPTLRPC_FLVR_BASE(sf.sf_rpc) !=
+		   SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL)) {
+		CDEBUG(D_SEC, "import %s->%s netid %x: select flavor %s\n",
+		       imp->imp_obd->obd_name,
+		       obd_uuid2str(&conn->c_remote_uuid),
+		       LNET_NIDNET(conn->c_self),
+		       sptlrpc_flavor2name(&sf, str, sizeof(str)));
+	}
+
+	/* creating and installing the replacement sec is serialized */
+	mutex_lock(&imp->imp_sec_mutex);
+
+	newsec = sptlrpc_sec_create(imp, svc_ctx, &sf, sp);
+	if (newsec) {
+		sptlrpc_import_sec_install(imp, newsec);
+	} else {
+		CERROR("import %s->%s: failed to create new sec\n",
+		       imp->imp_obd->obd_name,
+		       obd_uuid2str(&conn->c_remote_uuid));
+		rc = -EPERM;
+	}
+
+	mutex_unlock(&imp->imp_sec_mutex);
+out:
+	/* sptlrpc_sec_put() accepts NULL, so no check is needed here */
+	sptlrpc_sec_put(sec);
+	RETURN(rc);
+}
+
+void sptlrpc_import_sec_put(struct obd_import *imp)
+{
+	if (imp->imp_sec) {
+		sptlrpc_sec_kill(imp->imp_sec);
+
+		sptlrpc_sec_put(imp->imp_sec);
+		imp->imp_sec = NULL;
+	}
+}
+
+static void import_flush_ctx_common(struct obd_import *imp,
+				    uid_t uid, int grace, int force)
+{
+	struct ptlrpc_sec *sec;
+
+	if (imp == NULL)
+		return;
+
+	sec = sptlrpc_import_sec_ref(imp);
+	if (sec == NULL)
+		return;
+
+	sec_cop_flush_ctx_cache(sec, uid, grace, force);
+	sptlrpc_sec_put(sec);
+}
+
+void sptlrpc_import_flush_root_ctx(struct obd_import *imp)
+{
+	/* flush uid 0 with grace = 1 and force = 1; using grace mode is
+	 * important, see the explanation in sptlrpc_req_refresh_ctx() */
+	import_flush_ctx_common(imp, 0, 1, 1);
+}
+
+void sptlrpc_import_flush_my_ctx(struct obd_import *imp)
+{
+	import_flush_ctx_common(imp, current_uid(), 1, 1);
+}
+EXPORT_SYMBOL(sptlrpc_import_flush_my_ctx);
+
+void sptlrpc_import_flush_all_ctx(struct obd_import *imp)
+{
+	import_flush_ctx_common(imp, -1, 1, 1);
+}
+EXPORT_SYMBOL(sptlrpc_import_flush_all_ctx);
+
+/**
+ * Used by ptlrpc client to allocate request buffer of \a req. Upon return
+ * successfully, req->rq_reqmsg points to a buffer with size \a msgsize.
+ */
+int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize)
+{
+	struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+	struct ptlrpc_sec_policy *policy;
+	int rc;
+
+	LASSERT(ctx);
+	LASSERT(ctx->cc_sec);
+	LASSERT(ctx->cc_sec->ps_policy);
+	LASSERT(req->rq_reqmsg == NULL);
+	LASSERT_ATOMIC_POS(&ctx->cc_refcount);
+
+	policy = ctx->cc_sec->ps_policy;
+	rc = policy->sp_cops->alloc_reqbuf(ctx->cc_sec, req, msgsize);
+	if (!rc) {
+		LASSERT(req->rq_reqmsg);
+		LASSERT(req->rq_reqbuf || req->rq_clrbuf);
+
+		/* zeroing preallocated buffer */
+		if (req->rq_pool)
+			memset(req->rq_reqmsg, 0, msgsize);
+	}
+
+	return rc;
+}
+
+/**
+ * Used by ptlrpc client to free request buffer of \a req. After this
+ * req->rq_reqmsg is set to NULL and should not be accessed anymore.
+ */
+void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req)
+{
+	struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+	struct ptlrpc_sec_policy *policy;
+
+	LASSERT(ctx);
+	LASSERT(ctx->cc_sec);
+	LASSERT(ctx->cc_sec->ps_policy);
+	LASSERT_ATOMIC_POS(&ctx->cc_refcount);
+
+	if (req->rq_reqbuf == NULL && req->rq_clrbuf == NULL)
+		return;
+
+	policy = ctx->cc_sec->ps_policy;
+	policy->sp_cops->free_reqbuf(ctx->cc_sec, req);
+	req->rq_reqmsg = NULL;
+}
+
+/* Grow \a segment of \a msg to \a newsize in place, moving later segments up.
+ * NOTE caller must guarantee the buffer size is enough for the enlargement
+ */
+void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg,
+				  int segment, int newsize)
+{
+	void   *src, *dst;
+	int     oldsize, oldmsg_size, movesize;
+
+	LASSERT(segment < msg->lm_bufcount);
+	LASSERT(msg->lm_buflens[segment] <= newsize);
+
+	if (msg->lm_buflens[segment] == newsize)
+		return;
+
+	/* no data to move when enlarging the last segment, just set its size */
+	if (segment == msg->lm_bufcount - 1) {
+		msg->lm_buflens[segment] = newsize;
+		return;
+	}
+
+	oldsize = msg->lm_buflens[segment];
+	/* locate segment + 1 with the old size (src) and the new size (dst) */
+	src = lustre_msg_buf(msg, segment + 1, 0);
+	msg->lm_buflens[segment] = newsize;
+	dst = lustre_msg_buf(msg, segment + 1, 0);
+	msg->lm_buflens[segment] = oldsize;
+
+	/* move from segment + 1 to end segment */
+	LASSERT(msg->lm_magic == LUSTRE_MSG_MAGIC_V2);
+	oldmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+	movesize = oldmsg_size - ((unsigned long) src - (unsigned long) msg);
+	LASSERT(movesize >= 0);
+
+	if (movesize)
+		memmove(dst, src, movesize);
+
+	/* note we don't clear the area where the old data lived, not secret */
+
+	/* finally set new segment size */
+	msg->lm_buflens[segment] = newsize;
+}
+EXPORT_SYMBOL(_sptlrpc_enlarge_msg_inplace);
+
+/**
+ * Used by ptlrpc client to enlarge the \a segment of request message pointed
+ * by req->rq_reqmsg to size \a newsize, all previously filled-in data will be
+ * preserved after the enlargement. this must be called after original request
+ * buffer being allocated.
+ *
+ * \note after this be called, rq_reqmsg and rq_reqlen might have been changed,
+ * so caller should refresh its local pointers if needed.
+ */
+int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req,
+			       int segment, int newsize)
+{
+	struct ptlrpc_cli_ctx    *ctx = req->rq_cli_ctx;
+	struct ptlrpc_sec_cops   *cops;
+	struct lustre_msg	*msg = req->rq_reqmsg;
+
+	LASSERT(ctx);
+	LASSERT(msg);
+	LASSERT(msg->lm_bufcount > segment);
+	LASSERT(msg->lm_buflens[segment] <= newsize);
+
+	if (msg->lm_buflens[segment] == newsize)
+		return 0;
+
+	cops = ctx->cc_sec->ps_policy->sp_cops;
+	LASSERT(cops->enlarge_reqbuf);
+	return cops->enlarge_reqbuf(ctx->cc_sec, req, segment, newsize);
+}
+EXPORT_SYMBOL(sptlrpc_cli_enlarge_reqbuf);
+
+/**
+ * Used by ptlrpc client to allocate reply buffer of \a req.
+ *
+ * \note After this, req->rq_repmsg is still not accessible.
+ */
+int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize)
+{
+	struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+	struct ptlrpc_sec_policy *policy;
+	ENTRY;
+
+	LASSERT(ctx);
+	LASSERT(ctx->cc_sec);
+	LASSERT(ctx->cc_sec->ps_policy);
+
+	if (req->rq_repbuf)
+		RETURN(0);
+
+	policy = ctx->cc_sec->ps_policy;
+	RETURN(policy->sp_cops->alloc_repbuf(ctx->cc_sec, req, msgsize));
+}
+
+/**
+ * Used by ptlrpc client to free reply buffer of \a req, if any. After a buffer
+ * is freed req->rq_repmsg is set to NULL and should not be accessed anymore.
+ */
+void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req)
+{
+	struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+	struct ptlrpc_sec_policy *policy;
+	ENTRY;
+
+	LASSERT(ctx);
+	LASSERT(ctx->cc_sec);
+	LASSERT(ctx->cc_sec->ps_policy);
+	LASSERT_ATOMIC_POS(&ctx->cc_refcount);
+
+	/* no early return when there is no buffer, so EXIT always pairs ENTRY */
+	if (req->rq_repbuf != NULL) {
+		LASSERT(req->rq_repbuf_len);
+		policy = ctx->cc_sec->ps_policy;
+		policy->sp_cops->free_repbuf(ctx->cc_sec, req);
+		req->rq_repmsg = NULL;
+	}
+	EXIT;
+}
+
+int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp,
+				struct ptlrpc_cli_ctx *ctx)
+{
+	struct ptlrpc_sec_policy *policy = ctx->cc_sec->ps_policy;
+
+	if (!policy->sp_cops->install_rctx)
+		return 0;
+	return policy->sp_cops->install_rctx(imp, ctx->cc_sec, ctx);
+}
+
+int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp,
+				struct ptlrpc_svc_ctx *ctx)
+{
+	struct ptlrpc_sec_policy *policy = ctx->sc_policy;
+
+	if (!policy->sp_sops->install_rctx)
+		return 0;
+	return policy->sp_sops->install_rctx(imp, ctx);
+}
+
+/****************************************
+ * server side security		 *
+ ****************************************/
+
+static int flavor_allowed(struct sptlrpc_flavor *exp,
+			  struct ptlrpc_request *req)
+{
+	struct sptlrpc_flavor *flvr = &req->rq_flvr;
+
+	if (exp->sf_rpc == SPTLRPC_FLVR_ANY || exp->sf_rpc == flvr->sf_rpc)
+		return 1;
+
+	if ((req->rq_ctx_init || req->rq_ctx_fini) &&
+	    SPTLRPC_FLVR_POLICY(exp->sf_rpc) ==
+	    SPTLRPC_FLVR_POLICY(flvr->sf_rpc) &&
+	    SPTLRPC_FLVR_MECH(exp->sf_rpc) == SPTLRPC_FLVR_MECH(flvr->sf_rpc))
+		return 1;
+
+	return 0;
+}
+
+#define EXP_FLVR_UPDATE_EXPIRE      (OBD_TIMEOUT_DEFAULT + 10)
+
+/**
+ * Given an export \a exp, check whether the flavor of incoming \a req
+ * is allowed by the export \a exp. Main logic is about taking care of
+ * changing configurations. Return 0 means success, -EACCES not allowed.
+ */
+int sptlrpc_target_export_check(struct obd_export *exp,
+				struct ptlrpc_request *req)
+{
+	struct sptlrpc_flavor   flavor;
+
+	if (exp == NULL)
+		return 0;
+
+	/* client side export has no imp_reverse, skip
+	 * FIXME maybe we should check flavor this as well??? */
+	if (exp->exp_imp_reverse == NULL)
+		return 0;
+
+	/* don't care about ctx fini rpc */
+	if (req->rq_ctx_fini)
+		return 0;
+
+	spin_lock(&exp->exp_lock);
+
+	/* if flavor just changed (exp->exp_flvr_changed != 0), we wait for
+	 * the first req with the new flavor, then treat it as current flavor,
+	 * adapt reverse sec according to it.
+	 * note the first rpc with new flavor might not be with root ctx, in
+	 * which case delay the sec_adapt by leaving exp_flvr_adapt == 1. */
+	if (unlikely(exp->exp_flvr_changed) &&
+	    flavor_allowed(&exp->exp_flvr_old[1], req)) {
+		/* make the new flavor as "current", and old ones as
+		 * about-to-expire */
+		CDEBUG(D_SEC, "exp %p: just changed: %x->%x\n", exp,
+		       exp->exp_flvr.sf_rpc, exp->exp_flvr_old[1].sf_rpc);
+		flavor = exp->exp_flvr_old[1];
+		exp->exp_flvr_old[1] = exp->exp_flvr_old[0];
+		exp->exp_flvr_expire[1] = exp->exp_flvr_expire[0];
+		exp->exp_flvr_old[0] = exp->exp_flvr;
+		exp->exp_flvr_expire[0] = cfs_time_current_sec() +
+					  EXP_FLVR_UPDATE_EXPIRE;
+		exp->exp_flvr = flavor;
+
+		/* flavor change finished */
+		exp->exp_flvr_changed = 0;
+		LASSERT(exp->exp_flvr_adapt == 1);
+
+		/* if it's gss, we only interested in root ctx init */
+		if (req->rq_auth_gss &&
+		    !(req->rq_ctx_init &&
+		      (req->rq_auth_usr_root || req->rq_auth_usr_mdt ||
+		       req->rq_auth_usr_ost))) {
+			spin_unlock(&exp->exp_lock);
+			CDEBUG(D_SEC, "is good but not root(%d:%d:%d:%d:%d)\n",
+			       req->rq_auth_gss, req->rq_ctx_init,
+			       req->rq_auth_usr_root, req->rq_auth_usr_mdt,
+			       req->rq_auth_usr_ost);
+			return 0;
+		}
+
+		exp->exp_flvr_adapt = 0;
+		spin_unlock(&exp->exp_lock);
+
+		return sptlrpc_import_sec_adapt(exp->exp_imp_reverse,
+						req->rq_svc_ctx, &flavor);
+	}
+
+	/* if it equals to the current flavor, we accept it, but need to
+	 * dealing with reverse sec/ctx */
+	if (likely(flavor_allowed(&exp->exp_flvr, req))) {
+		/* most cases should return here, we only interested in
+		 * gss root ctx init */
+		if (!req->rq_auth_gss || !req->rq_ctx_init ||
+		    (!req->rq_auth_usr_root && !req->rq_auth_usr_mdt &&
+		     !req->rq_auth_usr_ost)) {
+			spin_unlock(&exp->exp_lock);
+			return 0;
+		}
+
+		/* if flavor just changed, we should not proceed, just leave
+		 * it and current flavor will be discovered and replaced
+		 * shortly, and let _this_ rpc pass through */
+		if (exp->exp_flvr_changed) {
+			LASSERT(exp->exp_flvr_adapt);
+			spin_unlock(&exp->exp_lock);
+			return 0;
+		}
+
+		if (exp->exp_flvr_adapt) {
+			exp->exp_flvr_adapt = 0;
+			CDEBUG(D_SEC, "exp %p (%x|%x|%x): do delayed adapt\n",
+			       exp, exp->exp_flvr.sf_rpc,
+			       exp->exp_flvr_old[0].sf_rpc,
+			       exp->exp_flvr_old[1].sf_rpc);
+			flavor = exp->exp_flvr;
+			spin_unlock(&exp->exp_lock);
+
+			return sptlrpc_import_sec_adapt(exp->exp_imp_reverse,
+							req->rq_svc_ctx,
+							&flavor);
+		} else {
+			CDEBUG(D_SEC, "exp %p (%x|%x|%x): is current flavor, "
+			       "install rvs ctx\n", exp, exp->exp_flvr.sf_rpc,
+			       exp->exp_flvr_old[0].sf_rpc,
+			       exp->exp_flvr_old[1].sf_rpc);
+			spin_unlock(&exp->exp_lock);
+
+			return sptlrpc_svc_install_rvs_ctx(exp->exp_imp_reverse,
+							   req->rq_svc_ctx);
+		}
+	}
+
+	if (exp->exp_flvr_expire[0]) {
+		if (exp->exp_flvr_expire[0] >= cfs_time_current_sec()) {
+			if (flavor_allowed(&exp->exp_flvr_old[0], req)) {
+				CDEBUG(D_SEC, "exp %p (%x|%x|%x): match the "
+				       "middle one ("CFS_DURATION_T")\n", exp,
+				       exp->exp_flvr.sf_rpc,
+				       exp->exp_flvr_old[0].sf_rpc,
+				       exp->exp_flvr_old[1].sf_rpc,
+				       exp->exp_flvr_expire[0] -
+						cfs_time_current_sec());
+				spin_unlock(&exp->exp_lock);
+				return 0;
+			}
+		} else {
+			CDEBUG(D_SEC, "mark middle expired\n");
+			exp->exp_flvr_expire[0] = 0;
+		}
+		CDEBUG(D_SEC, "exp %p (%x|%x|%x): %x not match middle\n", exp,
+		       exp->exp_flvr.sf_rpc,
+		       exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc,
+		       req->rq_flvr.sf_rpc);
+	}
+
+	/* now it doesn't match the current flavor, the only chance we can
+	 * accept it is match the old flavors which is not expired. */
+	if (exp->exp_flvr_changed == 0 && exp->exp_flvr_expire[1]) {
+		if (exp->exp_flvr_expire[1] >= cfs_time_current_sec()) {
+			if (flavor_allowed(&exp->exp_flvr_old[1], req)) {
+				CDEBUG(D_SEC, "exp %p (%x|%x|%x): match the "
+				       "oldest one ("CFS_DURATION_T")\n", exp,
+				       exp->exp_flvr.sf_rpc,
+				       exp->exp_flvr_old[0].sf_rpc,
+				       exp->exp_flvr_old[1].sf_rpc,
+				       exp->exp_flvr_expire[1] -
+						cfs_time_current_sec());
+				spin_unlock(&exp->exp_lock);
+				return 0;
+			}
+		} else {
+			CDEBUG(D_SEC, "mark oldest expired\n");
+			exp->exp_flvr_expire[1] = 0;
+		}
+		CDEBUG(D_SEC, "exp %p (%x|%x|%x): %x not match found\n",
+		       exp, exp->exp_flvr.sf_rpc,
+		       exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc,
+		       req->rq_flvr.sf_rpc);
+	} else {
+		CDEBUG(D_SEC, "exp %p (%x|%x|%x): skip the last one\n",
+		       exp, exp->exp_flvr.sf_rpc, exp->exp_flvr_old[0].sf_rpc,
+		       exp->exp_flvr_old[1].sf_rpc);
+	}
+
+	spin_unlock(&exp->exp_lock);
+	/* remaining-time deltas can be negative, print them as signed long */
+	CWARN("exp %p(%s): req %p (%u|%u|%u|%u|%u|%u) with "
+	      "unauthorized flavor %x, expect %x|%x(%+ld)|%x(%+ld)\n",
+	      exp, exp->exp_obd->obd_name,
+	      req, req->rq_auth_gss, req->rq_ctx_init, req->rq_ctx_fini,
+	      req->rq_auth_usr_root, req->rq_auth_usr_mdt, req->rq_auth_usr_ost,
+	      req->rq_flvr.sf_rpc,
+	      exp->exp_flvr.sf_rpc,
+	      exp->exp_flvr_old[0].sf_rpc,
+	      exp->exp_flvr_expire[0] ?
+	      (long) (exp->exp_flvr_expire[0] -
+		      cfs_time_current_sec()) : 0,
+	      exp->exp_flvr_old[1].sf_rpc,
+	      exp->exp_flvr_expire[1] ?
+	      (long) (exp->exp_flvr_expire[1] -
+		      cfs_time_current_sec()) : 0);
+	return -EACCES;
+}
+EXPORT_SYMBOL(sptlrpc_target_export_check);
+
+void sptlrpc_target_update_exp_flavor(struct obd_device *obd,
+				      struct sptlrpc_rule_set *rset)
+{
+	struct obd_export       *exp;
+	struct sptlrpc_flavor    new_flvr;
+
+	LASSERT(obd);
+
+	spin_lock(&obd->obd_dev_lock);
+
+	list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) {
+		if (exp->exp_connection == NULL)
+			continue;
+
+		/* note if this export had just been updated flavor
+		 * (exp_flvr_changed == 1), this will override the
+		 * previous one. */
+		spin_lock(&exp->exp_lock);
+		sptlrpc_target_choose_flavor(rset, exp->exp_sp_peer,
+					     exp->exp_connection->c_peer.nid,
+					     &new_flvr);
+		if (exp->exp_flvr_changed ||
+		    !flavor_equal(&new_flvr, &exp->exp_flvr)) {
+			exp->exp_flvr_old[1] = new_flvr;
+			exp->exp_flvr_expire[1] = 0;
+			exp->exp_flvr_changed = 1;
+			exp->exp_flvr_adapt = 1;
+
+			CDEBUG(D_SEC, "exp %p (%s): updated flavor %x->%x\n",
+			       exp, sptlrpc_part2name(exp->exp_sp_peer),
+			       exp->exp_flvr.sf_rpc,
+			       exp->exp_flvr_old[1].sf_rpc);
+		}
+		spin_unlock(&exp->exp_lock);
+	}
+
+	spin_unlock(&obd->obd_dev_lock);
+}
+EXPORT_SYMBOL(sptlrpc_target_update_exp_flavor);
+
+static int sptlrpc_svc_check_from(struct ptlrpc_request *req, int svc_rc)
+{
+	/* peer's claim is unreliable unless gss is being used */
+	if (!req->rq_auth_gss || svc_rc == SECSVC_DROP)
+		return svc_rc;
+
+	switch (req->rq_sp_from) {
+	case LUSTRE_SP_CLI:
+		if (req->rq_auth_usr_mdt || req->rq_auth_usr_ost) {
+			DEBUG_REQ(D_ERROR, req, "faked source CLI");
+			svc_rc = SECSVC_DROP;
+		}
+		break;
+	case LUSTRE_SP_MDT:
+		if (!req->rq_auth_usr_mdt) {
+			DEBUG_REQ(D_ERROR, req, "faked source MDT");
+			svc_rc = SECSVC_DROP;
+		}
+		break;
+	case LUSTRE_SP_OST:
+		if (!req->rq_auth_usr_ost) {
+			DEBUG_REQ(D_ERROR, req, "faked source OST");
+			svc_rc = SECSVC_DROP;
+		}
+		break;
+	case LUSTRE_SP_MGS:
+	case LUSTRE_SP_MGC:
+		if (!req->rq_auth_usr_root && !req->rq_auth_usr_mdt &&
+		    !req->rq_auth_usr_ost) {
+			DEBUG_REQ(D_ERROR, req, "faked source MGC/MGS");
+			svc_rc = SECSVC_DROP;
+		}
+		break;
+	case LUSTRE_SP_ANY:
+	default:
+		DEBUG_REQ(D_ERROR, req, "invalid source %u", req->rq_sp_from);
+		svc_rc = SECSVC_DROP;
+	}
+
+	return svc_rc;
+}
+
+/**
+ * Used by ptlrpc server, to perform transformation upon request message of
+ * incoming \a req. This must be the first thing to do with an incoming
+ * request in ptlrpc layer.
+ *
+ * \retval SECSVC_OK success, and req->rq_reqmsg point to request message in
+ * clear text, size is req->rq_reqlen; also req->rq_svc_ctx is set.
+ * \retval SECSVC_COMPLETE success, the request has been fully processed, and
+ * reply message has been prepared.
+ * \retval SECSVC_DROP failed, this request should be dropped.
+ */
+int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req)
+{
+	struct ptlrpc_sec_policy *policy;
+	struct lustre_msg	*msg = req->rq_reqbuf;
+	int		       rc;
+	ENTRY;
+
+	LASSERT(msg);
+	LASSERT(req->rq_reqmsg == NULL);
+	LASSERT(req->rq_repmsg == NULL);
+	LASSERT(req->rq_svc_ctx == NULL);
+
+	req->rq_req_swab_mask = 0;
+
+	rc = __lustre_unpack_msg(msg, req->rq_reqdata_len);
+	switch (rc) {
+	case 1: /* mark header as swabbed, then fall through */
+		lustre_set_req_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+	case 0:
+		break;
+	default:
+		CERROR("error unpacking request from %s x"LPU64"\n",
+		       libcfs_id2str(req->rq_peer), req->rq_xid);
+		RETURN(SECSVC_DROP);
+	}
+
+	req->rq_flvr.sf_rpc = WIRE_FLVR(msg->lm_secflvr);
+	req->rq_sp_from = LUSTRE_SP_ANY;
+	req->rq_auth_uid = INVALID_UID;
+	req->rq_auth_mapped_uid = INVALID_UID;
+
+	policy = sptlrpc_wireflavor2policy(req->rq_flvr.sf_rpc);
+	if (!policy) {
+		CERROR("unsupported rpc flavor %x\n", req->rq_flvr.sf_rpc);
+		RETURN(SECSVC_DROP);
+	}
+
+	LASSERT(policy->sp_sops->accept);
+	rc = policy->sp_sops->accept(req);
+	sptlrpc_policy_put(policy);
+	LASSERT(req->rq_reqmsg || rc != SECSVC_OK);
+	LASSERT(req->rq_svc_ctx || rc == SECSVC_DROP);
+
+	/*
+	 * if it's not null flavor (which means embedded packing msg),
+	 * reset the swab mask for the coming inner msg unpacking.
+	 */
+	if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL)
+		req->rq_req_swab_mask = 0;
+
+	/* sanity check for the request source */
+	rc = sptlrpc_svc_check_from(req, rc);
+	RETURN(rc);
+}
+
+/**
+ * Used by ptlrpc server, to allocate reply buffer for \a req. If succeed,
+ * req->rq_reply_state is set, and req->rq_reply_state->rs_msg point to
+ * a buffer of \a msglen size.
+ */
+int sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen)
+{
+	struct ptlrpc_sec_policy *policy;
+	struct ptlrpc_reply_state *rs;
+	int rc;
+	ENTRY;
+
+	LASSERT(req->rq_svc_ctx);
+	LASSERT(req->rq_svc_ctx->sc_policy);
+
+	policy = req->rq_svc_ctx->sc_policy;
+	LASSERT(policy->sp_sops->alloc_rs);
+
+	rc = policy->sp_sops->alloc_rs(req, msglen);
+	if (unlikely(rc == -ENOMEM)) {
+		/* failed alloc, try emergency pool */
+		rs = lustre_get_emerg_rs(req->rq_rqbd->rqbd_svcpt);
+		if (rs == NULL)
+			RETURN(-ENOMEM);
+
+		req->rq_reply_state = rs;
+		rc = policy->sp_sops->alloc_rs(req, msglen);
+		if (rc) {
+			lustre_put_emerg_rs(rs);
+			req->rq_reply_state = NULL;
+		}
+	}
+
+	LASSERT(rc != 0 ||
+		(req->rq_reply_state && req->rq_reply_state->rs_msg));
+
+	RETURN(rc);
+}
+
+/**
+ * Used by ptlrpc server, to perform transformation upon reply message.
+ *
+ * \post req->rq_reply_off is set to appropriate server-controlled reply offset.
+ * \post req->rq_repmsg and req->rq_reply_state->rs_msg become inaccessible.
+ */
+int sptlrpc_svc_wrap_reply(struct ptlrpc_request *req)
+{
+	struct ptlrpc_sec_policy *policy;
+	int rc;
+	ENTRY;
+
+	LASSERT(req->rq_svc_ctx);
+	LASSERT(req->rq_svc_ctx->sc_policy);
+
+	policy = req->rq_svc_ctx->sc_policy;
+	LASSERT(policy->sp_sops->authorize);
+
+	rc = policy->sp_sops->authorize(req);
+	LASSERT(rc || req->rq_reply_state->rs_repdata_len);
+
+	RETURN(rc);
+}
+
+/**
+ * Used by ptlrpc server, to free reply_state.
+ */
+void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs)
+{
+	struct ptlrpc_sec_policy *policy;
+	unsigned int prealloc;
+	ENTRY;
+
+	LASSERT(rs->rs_svc_ctx);
+	LASSERT(rs->rs_svc_ctx->sc_policy);
+
+	policy = rs->rs_svc_ctx->sc_policy;
+	LASSERT(policy->sp_sops->free_rs);
+
+	prealloc = rs->rs_prealloc;
+	policy->sp_sops->free_rs(rs);
+
+	if (prealloc)
+		lustre_put_emerg_rs(rs);
+	EXIT;
+}
+
+void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req)
+{
+	struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx;
+
+	if (ctx != NULL)
+		atomic_inc(&ctx->sc_refcount);
+}
+
+void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req)
+{
+	struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx;
+
+	if (ctx == NULL)
+		return;
+
+	LASSERT_ATOMIC_POS(&ctx->sc_refcount);
+	if (atomic_dec_and_test(&ctx->sc_refcount)) {
+		if (ctx->sc_policy->sp_sops->free_ctx)
+			ctx->sc_policy->sp_sops->free_ctx(ctx);
+	}
+	req->rq_svc_ctx = NULL;
+}
+
+void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req)
+{
+	struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx;
+
+	if (ctx == NULL)
+		return;
+
+	LASSERT_ATOMIC_POS(&ctx->sc_refcount);
+	if (ctx->sc_policy->sp_sops->invalidate_ctx)
+		ctx->sc_policy->sp_sops->invalidate_ctx(ctx);
+}
+EXPORT_SYMBOL(sptlrpc_svc_ctx_invalidate);
+
+/****************************************
+ * bulk security			*
+ ****************************************/
+
+/**
+ * Perform transformation upon bulk data pointed by \a desc. This is called
+ * before transforming the request message.
+ */
+int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req,
+			  struct ptlrpc_bulk_desc *desc)
+{
+	struct ptlrpc_cli_ctx *ctx;
+
+	LASSERT(req->rq_bulk_read || req->rq_bulk_write);
+
+	if (!req->rq_pack_bulk)
+		return 0;
+
+	ctx = req->rq_cli_ctx;
+	if (ctx->cc_ops->wrap_bulk)
+		return ctx->cc_ops->wrap_bulk(ctx, req, desc);
+	return 0;
+}
+EXPORT_SYMBOL(sptlrpc_cli_wrap_bulk);
+
+/**
+ * Called after unwrapping the reply message; \a nob is currently unused.
+ * Return desc->bd_nob_transferred, or negative rc if unwrap_bulk fails.
+ */
+int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req,
+				 struct ptlrpc_bulk_desc *desc,
+				 int nob)
+{
+	struct ptlrpc_cli_ctx  *ctx;
+	int		     rc;
+
+	LASSERT(req->rq_bulk_read && !req->rq_bulk_write);
+
+	if (!req->rq_pack_bulk)
+		return desc->bd_nob_transferred;
+
+	ctx = req->rq_cli_ctx;
+	if (ctx->cc_ops->unwrap_bulk) {
+		rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc);
+		if (rc < 0)
+			return rc;
+	}
+	return desc->bd_nob_transferred;
+}
+EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_read);
+
+/**
+ * Called after unwrapping the reply message. Return 0 for success, negative
+ * rc from unwrap_bulk, or -EPROTO if bd_nob != bd_nob_transferred.
+ */
+int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req,
+				  struct ptlrpc_bulk_desc *desc)
+{
+	struct ptlrpc_cli_ctx  *ctx;
+	int		     rc;
+
+	LASSERT(!req->rq_bulk_read && req->rq_bulk_write);
+
+	if (!req->rq_pack_bulk)
+		return 0;
+
+	ctx = req->rq_cli_ctx;
+	if (ctx->cc_ops->unwrap_bulk) {
+		rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc);
+		if (rc < 0)
+			return rc;
+	}
+
+	/*
+	 * if everything is going right, nob should equals to nob_transferred.
+	 * in case of privacy mode, nob_transferred needs to be adjusted.
+	 */
+	if (desc->bd_nob != desc->bd_nob_transferred) {
+		CERROR("nob %d doesn't match transferred nob %d\n",
+		       desc->bd_nob, desc->bd_nob_transferred);
+		return -EPROTO;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_write);
+
+
+/****************************************
+ * user descriptor helpers	      *
+ ****************************************/
+
+int sptlrpc_current_user_desc_size(void)
+{
+	int ngroups;
+
+	ngroups = current_ngroups;
+
+	if (ngroups > LUSTRE_MAX_GROUPS)
+		ngroups = LUSTRE_MAX_GROUPS;
+	return sptlrpc_user_desc_size(ngroups);
+}
+EXPORT_SYMBOL(sptlrpc_current_user_desc_size);
+
+int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset)
+{
+	struct ptlrpc_user_desc *pud;
+
+	pud = lustre_msg_buf(msg, offset, 0);
+
+	pud->pud_uid = current_uid();
+	pud->pud_gid = current_gid();
+	pud->pud_fsuid = current_fsuid();
+	pud->pud_fsgid = current_fsgid();
+	pud->pud_cap = cfs_curproc_cap_pack();
+	pud->pud_ngroups = (msg->lm_buflens[offset] - sizeof(*pud)) / 4;
+
+	task_lock(current);
+	if (pud->pud_ngroups > current_ngroups)
+		pud->pud_ngroups = current_ngroups;
+	memcpy(pud->pud_groups, current_cred()->group_info->blocks[0],
+	       pud->pud_ngroups * sizeof(__u32));
+	task_unlock(current);
+
+	return 0;
+}
+EXPORT_SYMBOL(sptlrpc_pack_user_desc);
+
+int sptlrpc_unpack_user_desc(struct lustre_msg *msg, int offset, int swabbed)
+{
+	struct ptlrpc_user_desc *pud;
+	int		      i;
+
+	pud = lustre_msg_buf(msg, offset, sizeof(*pud));
+	if (!pud)
+		return -EINVAL;
+
+	if (swabbed) {
+		__swab32s(&pud->pud_uid);
+		__swab32s(&pud->pud_gid);
+		__swab32s(&pud->pud_fsuid);
+		__swab32s(&pud->pud_fsgid);
+		__swab32s(&pud->pud_cap);
+		__swab32s(&pud->pud_ngroups);
+	}
+
+	if (pud->pud_ngroups > LUSTRE_MAX_GROUPS) {
+		CERROR("%u groups is too large\n", pud->pud_ngroups);
+		return -EINVAL;
+	}
+
+	if (sizeof(*pud) + pud->pud_ngroups * sizeof(__u32) >
+	    msg->lm_buflens[offset]) {
+		CERROR("%u groups are claimed but bufsize only %u\n",
+		       pud->pud_ngroups, msg->lm_buflens[offset]);
+		return -EINVAL;
+	}
+
+	if (swabbed) {
+		for (i = 0; i < pud->pud_ngroups; i++)
+			__swab32s(&pud->pud_groups[i]);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(sptlrpc_unpack_user_desc);
+
+/****************************************
+ * misc helpers			 *
+ ****************************************/
+
+const char * sec2target_str(struct ptlrpc_sec *sec)
+{
+	if (!sec || !sec->ps_import || !sec->ps_import->imp_obd)
+		return "*";
+	if (sec_is_reverse(sec))
+		return "c";
+	return obd_uuid2str(&sec->ps_import->imp_obd->u.cli.cl_target_uuid);
+}
+EXPORT_SYMBOL(sec2target_str);
+
+/*
+ * return true if the bulk data is protected
+ */
+int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr)
+{
+	switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) {
+	case SPTLRPC_BULK_SVC_INTG:
+	case SPTLRPC_BULK_SVC_PRIV:
+		return 1;
+	default:
+		return 0;
+	}
+}
+EXPORT_SYMBOL(sptlrpc_flavor_has_bulk);
+
+/****************************************
+ * crypto API helper/alloc blkciper     *
+ ****************************************/
+
+/****************************************
+ * initialize/finalize		  *
+ ****************************************/
+
+int sptlrpc_init(void)
+{
+	int rc;
+
+	rwlock_init(&policy_lock);
+
+	rc = sptlrpc_gc_init();
+	if (rc)
+		goto out;
+
+	rc = sptlrpc_conf_init();
+	if (rc)
+		goto out_gc;
+
+	rc = sptlrpc_enc_pool_init();
+	if (rc)
+		goto out_conf;
+
+	rc = sptlrpc_null_init();
+	if (rc)
+		goto out_pool;
+
+	rc = sptlrpc_plain_init();
+	if (rc)
+		goto out_null;
+
+	rc = sptlrpc_lproc_init();
+	if (rc)
+		goto out_plain;
+
+	return 0;
+
+out_plain:
+	sptlrpc_plain_fini();
+out_null:
+	sptlrpc_null_fini();
+out_pool:
+	sptlrpc_enc_pool_fini();
+out_conf:
+	sptlrpc_conf_fini();
+out_gc:
+	sptlrpc_gc_fini();
+out:
+	return rc;
+}
+
+void sptlrpc_fini(void)
+{
+	sptlrpc_lproc_fini();
+	sptlrpc_plain_fini();
+	sptlrpc_null_fini();
+	sptlrpc_enc_pool_fini();
+	sptlrpc_conf_fini();
+	sptlrpc_gc_fini();
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c b/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c
new file mode 100644
index 000000000000..60ab2eaf6683
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c
@@ -0,0 +1,881 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec_bulk.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/crypto.h>
+
+#include <obd.h>
+#include <obd_cksum.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_dlm.h>
+#include <lustre_sec.h>
+
+#include "ptlrpc_internal.h"
+
+/****************************************
+ * bulk encryption page pools	   *
+ ****************************************/
+
+/* a pool is one page worth of page pointers */
+#define PTRS_PER_PAGE   (PAGE_CACHE_SIZE / sizeof(void *))
+#define PAGES_PER_POOL  (PTRS_PER_PAGE)
+/* idle index upper bound, and weight of the old value when averaging */
+#define IDLE_IDX_MAX	    (100)
+#define IDLE_IDX_WEIGHT	 (3)
+/* seconds without pool access before the pools count as fully idle */
+#define CACHE_QUIESCENT_PERIOD  (20)
+
+static struct ptlrpc_enc_page_pool {
+	/*
+	 * constants
+	 */
+	unsigned long    epp_max_pages;   /* max pages pools can hold, const */
+	unsigned int     epp_max_pools;   /* number of pools, const */
+
+	/*
+	 * wait queue in case of not enough free pages.
+	 */
+	wait_queue_head_t      epp_waitq;       /* waiting threads */
+	unsigned int     epp_waitqlen;    /* wait queue length */
+	unsigned long    epp_pages_short; /* # of pages wanted by in-q users */
+	unsigned int     epp_growing:1;   /* set while pages are being added */
+
+	/*
+	 * indicates how idle the pools are, from 0 to IDLE_IDX_MAX.
+	 * it is updated each time pages are taken from the pools, not
+	 * based on time, which means that after the system has been idle
+	 * for a while the idle_idx might still be low if no activity
+	 * happened in the pools.
+	 */
+	unsigned long    epp_idle_idx;
+
+	/* time of the last shrink and of the last access, in seconds */
+	long	     epp_last_shrink;
+	long	     epp_last_access;
+
+	/*
+	 * in-pool pages bookkeeping
+	 */
+	spinlock_t	 epp_lock;	   /* protect following fields */
+	unsigned long    epp_total_pages; /* total pages in pools */
+	unsigned long    epp_free_pages;  /* current pages available */
+
+	/*
+	 * statistics
+	 */
+	unsigned long    epp_st_max_pages;      /* # of pages ever reached */
+	unsigned int     epp_st_grows;	  /* # of grows */
+	unsigned int     epp_st_grow_fails;     /* # of add pages failures */
+	unsigned int     epp_st_shrinks;	/* # of shrinks */
+	unsigned long    epp_st_access;	 /* # of accesses */
+	unsigned long    epp_st_missings;       /* # of cache misses */
+	unsigned long    epp_st_lowfree;	/* lowest free pages reached */
+	unsigned int     epp_st_max_wqlen;      /* highest waitqueue length */
+	cfs_time_t       epp_st_max_wait;       /* in jiffies */
+	/*
+	 * pointers to pools
+	 */
+	struct page    ***epp_pools;
+} page_pools;
+
+/*
+ * memory shrinker: seek cost and handle used with set_shrinker() at init
+ */
+const int pools_shrinker_seeks = DEFAULT_SEEKS;
+static struct shrinker *pools_shrinker = NULL;
+
+
+/*
+ * /proc/fs/lustre/sptlrpc/encrypt_page_pools: pool limits, usage, statistics
+ */
+int sptlrpc_proc_read_enc_pool(char *page, char **start, off_t off, int count,
+			       int *eof, void *data)
+{
+	int     rc;
+
+	spin_lock(&page_pools.epp_lock);
+	/* @start, @off, @eof and @data are not used; output is cut at @count */
+	rc = snprintf(page, count,
+		      "physical pages:	  %lu\n"
+		      "pages per pool:	  %lu\n"
+		      "max pages:	       %lu\n"
+		      "max pools:	       %u\n"
+		      "total pages:	     %lu\n"
+		      "total free:	      %lu\n"
+		      "idle index:	      %lu/100\n"
+		      "last shrink:	     %lds\n"
+		      "last access:	     %lds\n"
+		      "max pages reached:       %lu\n"
+		      "grows:		   %u\n"
+		      "grows failure:	   %u\n"
+		      "shrinks:		 %u\n"
+		      "cache access:	    %lu\n"
+		      "cache missing:	   %lu\n"
+		      "low free mark:	   %lu\n"
+		      "max waitqueue depth:     %u\n"
+		      "max wait time:	   "CFS_TIME_T"/%u\n"
+		      ,
+		      num_physpages,
+		      PAGES_PER_POOL,
+		      page_pools.epp_max_pages,
+		      page_pools.epp_max_pools,
+		      page_pools.epp_total_pages,
+		      page_pools.epp_free_pages,
+		      page_pools.epp_idle_idx,
+		      cfs_time_current_sec() - page_pools.epp_last_shrink,
+		      cfs_time_current_sec() - page_pools.epp_last_access,
+		      page_pools.epp_st_max_pages,
+		      page_pools.epp_st_grows,
+		      page_pools.epp_st_grow_fails,
+		      page_pools.epp_st_shrinks,
+		      page_pools.epp_st_access,
+		      page_pools.epp_st_missings,
+		      page_pools.epp_st_lowfree,
+		      page_pools.epp_st_max_wqlen,
+		      page_pools.epp_st_max_wait, HZ
+		     );
+
+	spin_unlock(&page_pools.epp_lock);
+	return rc;
+}
+/* Free the @npages highest free pages; enc_pools_shrink holds epp_lock. */
+static void enc_pools_release_free_pages(long npages)
+{
+	int     p_idx, g_idx;
+	int     p_idx_max1, p_idx_max2;
+
+	LASSERT(npages > 0);
+	LASSERT(npages <= page_pools.epp_free_pages);
+	LASSERT(page_pools.epp_free_pages <= page_pools.epp_total_pages);
+
+	/* max pool index before the release */
+	p_idx_max2 = (page_pools.epp_total_pages - 1) / PAGES_PER_POOL;
+
+	page_pools.epp_free_pages -= npages;
+	page_pools.epp_total_pages -= npages;
+
+	/* max pool index after the release, -1 if no page is left at all */
+	p_idx_max1 = page_pools.epp_total_pages == 0 ? -1 :
+		     ((page_pools.epp_total_pages - 1) / PAGES_PER_POOL);
+	/* the pages to free sit in slots [free_pages, free_pages + npages) */
+	p_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
+	g_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
+	LASSERT(page_pools.epp_pools[p_idx]);
+
+	while (npages--) {
+		LASSERT(page_pools.epp_pools[p_idx]);
+		LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL);
+
+		__free_page(page_pools.epp_pools[p_idx][g_idx]);
+		page_pools.epp_pools[p_idx][g_idx] = NULL;
+
+		if (++g_idx == PAGES_PER_POOL) {
+			p_idx++;
+			g_idx = 0;
+		}
+	};
+
+	/* free the pool arrays beyond the new max pool index */
+	while (p_idx_max1 < p_idx_max2) {
+		LASSERT(page_pools.epp_pools[p_idx_max2]);
+		OBD_FREE(page_pools.epp_pools[p_idx_max2], PAGE_CACHE_SIZE);
+		page_pools.epp_pools[p_idx_max2] = NULL;
+		p_idx_max2--;
+	}
+}
+
+/*
+ * shrinker callback, called frequently as a mere query (@nr_to_scan == 0).
+ * we try to keep at least PTLRPC_MAX_BRW_PAGES free pages in the pool.
+ */
+static int enc_pools_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
+{
+	if (unlikely(shrink_param(sc, nr_to_scan) != 0)) {
+		spin_lock(&page_pools.epp_lock);
+		/* epp_free_pages is unsigned: "free - reserve" must not wrap */
+		if (page_pools.epp_free_pages <= PTLRPC_MAX_BRW_PAGES)
+			shrink_param(sc, nr_to_scan) = 0;
+		else
+			shrink_param(sc, nr_to_scan) = min_t(unsigned long,
+						   shrink_param(sc, nr_to_scan),
+						   page_pools.epp_free_pages -
+						   PTLRPC_MAX_BRW_PAGES);
+		if (shrink_param(sc, nr_to_scan) > 0) {
+			enc_pools_release_free_pages(shrink_param(sc,
+								  nr_to_scan));
+			CDEBUG(D_SEC, "released %ld pages, %ld left\n",
+			       (long)shrink_param(sc, nr_to_scan),
+			       page_pools.epp_free_pages);
+			page_pools.epp_st_shrinks++;
+			page_pools.epp_last_shrink = cfs_time_current_sec();
+		}
+		spin_unlock(&page_pools.epp_lock);
+	}
+
+	/* no pool access for a long time: consider it fully idle (racy, ok) */
+	if (unlikely(cfs_time_current_sec() - page_pools.epp_last_access >
+		     CACHE_QUIESCENT_PERIOD)) {
+		spin_lock(&page_pools.epp_lock);
+		page_pools.epp_idle_idx = IDLE_IDX_MAX;
+		spin_unlock(&page_pools.epp_lock);
+	}
+
+	LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX);
+	return max((int)page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES, 0) *
+		(IDLE_IDX_MAX - page_pools.epp_idle_idx) / IDLE_IDX_MAX;
+}
+
+static inline int npages_to_npools(unsigned long npages)
+{
+	/* number of pools needed to hold @npages pages, rounded up */
+	return (int)((npages + PAGES_PER_POOL - 1) / PAGES_PER_POOL);
+}
+
+/*
+ * free all pages and pool arrays found in @pools; return # of pages freed.
+ */
+static unsigned long enc_pools_cleanup(struct page ***pools, int npools)
+{
+	unsigned long nfreed = 0;
+	int pi, gi;
+
+	for (pi = 0; pi < npools; pi++) {
+		if (pools[pi] == NULL)
+			continue;
+		for (gi = 0; gi < PAGES_PER_POOL; gi++) {
+			if (pools[pi][gi] == NULL)
+				continue;
+			__free_page(pools[pi][gi]);
+			nfreed++;
+		}
+		OBD_FREE(pools[pi], PAGE_CACHE_SIZE);
+		pools[pi] = NULL;
+	}
+
+	return nfreed;
+}
+
+/*
+ * merge the @npools new pools in @pools, holding @npages new pages in
+ * total, into the current pools. entries taken over are set to NULL.
+ *
+ * most of the copying could be avoided with some tricks, but we choose
+ * the simplest way to avoid complexity. It's not frequently called.
+ */
+static void enc_pools_insert(struct page ***pools, int npools, int npages)
+{
+	int     freeslot;
+	int     op_idx, np_idx, og_idx, ng_idx;
+	int     cur_npools, end_npools;
+
+	LASSERT(npages > 0);
+	LASSERT(page_pools.epp_total_pages+npages <= page_pools.epp_max_pages);
+	LASSERT(npages_to_npools(npages) == npools);
+	LASSERT(page_pools.epp_growing);
+
+	spin_lock(&page_pools.epp_lock);
+
+	/*
+	 * (1) fill all the free slots of current pools.
+	 */
+	/* free slots are those left by rented-out pages, plus the extra ones
+	 * with index >= total_pages, located at the tail of the last pool. */
+	freeslot = page_pools.epp_total_pages % PAGES_PER_POOL;
+	if (freeslot != 0)
+		freeslot = PAGES_PER_POOL - freeslot;
+	freeslot += page_pools.epp_total_pages - page_pools.epp_free_pages;
+
+	op_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
+	og_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
+	np_idx = npools - 1;
+	ng_idx = (npages - 1) % PAGES_PER_POOL;
+
+	while (freeslot) {
+		LASSERT(page_pools.epp_pools[op_idx][og_idx] == NULL);
+		LASSERT(pools[np_idx][ng_idx] != NULL);
+
+		page_pools.epp_pools[op_idx][og_idx] = pools[np_idx][ng_idx];
+		pools[np_idx][ng_idx] = NULL;
+
+		freeslot--;
+
+		if (++og_idx == PAGES_PER_POOL) {
+			op_idx++;
+			og_idx = 0;
+		}
+		if (--ng_idx < 0) {
+			if (np_idx == 0)
+				break;
+			np_idx--;
+			ng_idx = PAGES_PER_POOL - 1;
+		}
+	}
+
+	/*
+	 * (2) append new pools, taken from the head of @pools, if needed.
+	 */
+	cur_npools = (page_pools.epp_total_pages + PAGES_PER_POOL - 1) /
+		     PAGES_PER_POOL;
+	end_npools = (page_pools.epp_total_pages + npages + PAGES_PER_POOL -1) /
+		     PAGES_PER_POOL;
+	LASSERT(end_npools <= page_pools.epp_max_pools);
+
+	np_idx = 0;
+	while (cur_npools < end_npools) {
+		LASSERT(page_pools.epp_pools[cur_npools] == NULL);
+		LASSERT(np_idx < npools);
+		LASSERT(pools[np_idx] != NULL);
+
+		page_pools.epp_pools[cur_npools++] = pools[np_idx];
+		pools[np_idx++] = NULL;
+	}
+	/* all new pages start out free; the low-free mark is reset as well */
+	page_pools.epp_total_pages += npages;
+	page_pools.epp_free_pages += npages;
+	page_pools.epp_st_lowfree = page_pools.epp_free_pages;
+
+	if (page_pools.epp_total_pages > page_pools.epp_st_max_pages)
+		page_pools.epp_st_max_pages = page_pools.epp_total_pages;
+
+	CDEBUG(D_SEC, "add %d pages to total %lu\n", npages,
+	       page_pools.epp_total_pages);
+
+	spin_unlock(&page_pools.epp_lock);
+}
+/* add @npages (at least PTLRPC_MAX_BRW_PAGES, at most the room left) pages */
+static int enc_pools_add_pages(int npages)
+{
+	static DEFINE_MUTEX(add_pages_mutex);
+	struct page   ***pools;
+	int	     npools, alloced = 0;
+	int	     i, j, rc = -ENOMEM;
+
+	if (npages < PTLRPC_MAX_BRW_PAGES)
+		npages = PTLRPC_MAX_BRW_PAGES;
+
+	mutex_lock(&add_pages_mutex);
+
+	if (npages + page_pools.epp_total_pages > page_pools.epp_max_pages)
+		npages = page_pools.epp_max_pages - page_pools.epp_total_pages;
+	LASSERT(npages > 0);
+
+	page_pools.epp_st_grows++;
+
+	npools = npages_to_npools(npages);
+	OBD_ALLOC(pools, npools * sizeof(*pools));
+	if (pools == NULL)
+		goto out;
+
+	for (i = 0; i < npools; i++) {
+		OBD_ALLOC(pools[i], PAGE_CACHE_SIZE);
+		if (pools[i] == NULL)
+			goto out_pools;
+
+		for (j = 0; j < PAGES_PER_POOL && alloced < npages; j++) {
+			pools[i][j] = alloc_page(__GFP_IO |
+						     __GFP_HIGHMEM);
+			if (pools[i][j] == NULL)
+				goto out_pools;
+
+			alloced++;
+		}
+	}
+	LASSERT(alloced == npages);
+
+	enc_pools_insert(pools, npools, npages);
+	CDEBUG(D_SEC, "added %d pages into pools\n", npages);
+	rc = 0;
+	/* reached on success too: insert NULLed whatever it took over */
+out_pools:
+	enc_pools_cleanup(pools, npools);
+	OBD_FREE(pools, npools * sizeof(*pools));
+out:
+	if (rc) {
+		page_pools.epp_st_grow_fails++;
+		CERROR("Failed to allocate %d enc pages\n", npages);
+	}
+
+	mutex_unlock(&add_pages_mutex);
+	return rc;
+}
+
+static inline void enc_pools_wakeup(void)
+{
+	/* caller must hold epp_lock; spin_is_locked() is always 0 on UP builds */
+	assert_spin_locked(&page_pools.epp_lock);
+
+	if (unlikely(page_pools.epp_waitqlen)) {
+		LASSERT(waitqueue_active(&page_pools.epp_waitq));
+		wake_up_all(&page_pools.epp_waitq);
+	}
+}
+
+static int enc_pools_should_grow(int page_needed, long now)
+{
+	/*
+	 * never grow while somebody else is growing the pools right now,
+	 * nor once the pools have reached their full capacity.
+	 */
+	if (page_pools.epp_growing)
+		return 0;
+	if (page_pools.epp_total_pages == page_pools.epp_max_pages)
+		return 0;
+
+	/* the pools can't even cover this request yet: they must grow */
+	if (page_pools.epp_total_pages < page_needed)
+		return 1;
+
+	/*
+	 * we wanted to return 0 here if a shrink happened just a moment
+	 * ago, i.e. if (now - page_pools.epp_last_shrink < 2), but this
+	 * may cause deadlock if both client and ost live on a single node.
+	 * that check is therefore left out (it used to sit here under
+	 * "#if 0") and @now is not consulted.
+	 */
+
+	/*
+	 * perhaps other factors like wait queue length, idle index, etc.
+	 * need to be considered here as well?
+	 */
+
+	/* grow the pools in any other case */
+	return 1;
+}
+
+/*
+ * take bd_iov_count pages for @desc, all or nothing; may grow pools or sleep.
+ */
+int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc)
+{
+	wait_queue_t  waitlink;
+	unsigned long   this_idle = -1;
+	cfs_time_t      tick = 0;
+	long	    now;
+	int	     p_idx, g_idx;
+	int	     i;
+
+	LASSERT(desc->bd_iov_count > 0);
+	LASSERT(desc->bd_iov_count <= page_pools.epp_max_pages);
+
+	/* resent bulk, enc iov might have been allocated previously */
+	if (desc->bd_enc_iov != NULL)
+		return 0;
+
+	OBD_ALLOC(desc->bd_enc_iov,
+		  desc->bd_iov_count * sizeof(*desc->bd_enc_iov));
+	if (desc->bd_enc_iov == NULL)
+		return -ENOMEM;
+
+	spin_lock(&page_pools.epp_lock);
+
+	page_pools.epp_st_access++;
+again:
+	if (unlikely(page_pools.epp_free_pages < desc->bd_iov_count)) {
+		if (tick == 0)
+			tick = cfs_time_current();
+
+		now = cfs_time_current_sec();
+
+		page_pools.epp_st_missings++;
+		page_pools.epp_pages_short += desc->bd_iov_count;
+
+		if (enc_pools_should_grow(desc->bd_iov_count, now)) {
+			page_pools.epp_growing = 1;
+			/* adding pages takes a mutex: drop the spinlock around it */
+			spin_unlock(&page_pools.epp_lock);
+			enc_pools_add_pages(page_pools.epp_pages_short / 2);
+			spin_lock(&page_pools.epp_lock);
+
+			page_pools.epp_growing = 0;
+
+			enc_pools_wakeup();
+		} else {
+			if (++page_pools.epp_waitqlen >
+			    page_pools.epp_st_max_wqlen)
+				page_pools.epp_st_max_wqlen =
+						page_pools.epp_waitqlen;
+			/* queue ourselves before dropping the lock, then wait */
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			init_waitqueue_entry_current(&waitlink);
+			add_wait_queue(&page_pools.epp_waitq, &waitlink);
+
+			spin_unlock(&page_pools.epp_lock);
+			waitq_wait(&waitlink, TASK_UNINTERRUPTIBLE);
+			remove_wait_queue(&page_pools.epp_waitq, &waitlink);
+			LASSERT(page_pools.epp_waitqlen > 0);
+			spin_lock(&page_pools.epp_lock);
+			page_pools.epp_waitqlen--;
+		}
+
+		LASSERT(page_pools.epp_pages_short >= desc->bd_iov_count);
+		page_pools.epp_pages_short -= desc->bd_iov_count;
+		/* we had to grow or wait: this access counts as not idle at all */
+		this_idle = 0;
+		goto again;
+	}
+
+	/* record max wait time */
+	if (unlikely(tick != 0)) {
+		tick = cfs_time_current() - tick;
+		if (tick > page_pools.epp_st_max_wait)
+			page_pools.epp_st_max_wait = tick;
+	}
+
+	/* enough free pages now: take the topmost ones out of the pools */
+	page_pools.epp_free_pages -= desc->bd_iov_count;
+
+	p_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
+	g_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
+
+	for (i = 0; i < desc->bd_iov_count; i++) {
+		LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL);
+		desc->bd_enc_iov[i].kiov_page =
+					page_pools.epp_pools[p_idx][g_idx];
+		page_pools.epp_pools[p_idx][g_idx] = NULL;
+
+		if (++g_idx == PAGES_PER_POOL) {
+			p_idx++;
+			g_idx = 0;
+		}
+	}
+
+	if (page_pools.epp_free_pages < page_pools.epp_st_lowfree)
+		page_pools.epp_st_lowfree = page_pools.epp_free_pages;
+
+	/*
+	 * new idle index = (old * weight + new) / (weight + 1)
+	 */
+	if (this_idle == -1) {
+		this_idle = page_pools.epp_free_pages * IDLE_IDX_MAX /
+			    page_pools.epp_total_pages;
+	}
+	page_pools.epp_idle_idx = (page_pools.epp_idle_idx * IDLE_IDX_WEIGHT +
+				   this_idle) /
+				  (IDLE_IDX_WEIGHT + 1);
+
+	page_pools.epp_last_access = cfs_time_current_sec();
+
+	spin_unlock(&page_pools.epp_lock);
+	return 0;
+}
+EXPORT_SYMBOL(sptlrpc_enc_pool_get_pages);
+/* Hand the encryption pages of @desc back to the pools; free bd_enc_iov. */
+void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc)
+{
+	int     p_idx, g_idx;
+	int     i;
+
+	if (desc->bd_enc_iov == NULL)
+		return;
+
+	LASSERT(desc->bd_iov_count > 0);
+
+	spin_lock(&page_pools.epp_lock);
+	/* returned pages refill the slots right above the current free ones */
+	p_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
+	g_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
+
+	LASSERT(page_pools.epp_free_pages + desc->bd_iov_count <=
+		page_pools.epp_total_pages);
+	LASSERT(page_pools.epp_pools[p_idx]);
+
+	for (i = 0; i < desc->bd_iov_count; i++) {
+		LASSERT(desc->bd_enc_iov[i].kiov_page != NULL);
+		LASSERT(g_idx != 0 || page_pools.epp_pools[p_idx]);
+		LASSERT(page_pools.epp_pools[p_idx][g_idx] == NULL);
+
+		page_pools.epp_pools[p_idx][g_idx] =
+					desc->bd_enc_iov[i].kiov_page;
+
+		if (++g_idx == PAGES_PER_POOL) {
+			p_idx++;
+			g_idx = 0;
+		}
+	}
+
+	page_pools.epp_free_pages += desc->bd_iov_count;
+
+	enc_pools_wakeup();
+
+	spin_unlock(&page_pools.epp_lock);
+
+	OBD_FREE(desc->bd_enc_iov,
+		 desc->bd_iov_count * sizeof(*desc->bd_enc_iov));
+	desc->bd_enc_iov = NULL;
+}
+EXPORT_SYMBOL(sptlrpc_enc_pool_put_pages);
+
+/*
+ * add_user/del_user don't do much anymore: add_user() only seeds the pools
+ * with some initial pages while they are still empty, the rest is handled
+ * by the pools' self-adaption.
+ */
+int sptlrpc_enc_pool_add_user(void)
+{
+	spin_lock(&page_pools.epp_lock);
+	if (page_pools.epp_growing != 0 || page_pools.epp_total_pages != 0) {
+		/* someone is growing the pools already, or they aren't empty */
+		spin_unlock(&page_pools.epp_lock);
+		return 0;
+	}
+	/* claim the growing flag before the lock is dropped */
+	page_pools.epp_growing = 1;
+	spin_unlock(&page_pools.epp_lock);
+
+	/* enc_pools_add_pages() takes a mutex: call it without the spinlock */
+	enc_pools_add_pages(PTLRPC_MAX_BRW_PAGES + PTLRPC_MAX_BRW_PAGES);
+
+	spin_lock(&page_pools.epp_lock);
+	page_pools.epp_growing = 0;
+	enc_pools_wakeup();
+	spin_unlock(&page_pools.epp_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(sptlrpc_enc_pool_add_user);
+/* Nothing to undo: sptlrpc_enc_pool_add_user() keeps no per-user state. */
+int sptlrpc_enc_pool_del_user(void)
+{
+	return 0;
+}
+EXPORT_SYMBOL(sptlrpc_enc_pool_del_user);
+
+static inline void enc_pools_alloc(void)
+{
+	/* array of epp_max_pools pool pointers; caller checks it for NULL */
+	LASSERT(page_pools.epp_max_pools);
+	OBD_ALLOC_LARGE(page_pools.epp_pools, page_pools.epp_max_pools *
+					      sizeof(*page_pools.epp_pools));
+}
+
+static inline void enc_pools_free(void)
+{
+	LASSERT(page_pools.epp_max_pools);
+	LASSERT(page_pools.epp_pools);
+	/* same size expression as the allocation in enc_pools_alloc() */
+	OBD_FREE_LARGE(page_pools.epp_pools,
+		       page_pools.epp_max_pools *
+		       sizeof(*page_pools.epp_pools));
+}
+
+int sptlrpc_enc_pool_init(void)
+{
+	/*
+	 * maximum capacity is 1/8 of total physical memory (num_physpages).
+	 * is the 1/8 a good number?
+	 */
+	page_pools.epp_max_pages = num_physpages / 8;
+	page_pools.epp_max_pools = npages_to_npools(page_pools.epp_max_pages);
+
+	init_waitqueue_head(&page_pools.epp_waitq);
+	page_pools.epp_waitqlen = 0;
+	page_pools.epp_pages_short = 0;
+
+	page_pools.epp_growing = 0;
+
+	page_pools.epp_idle_idx = 0;
+	page_pools.epp_last_shrink = cfs_time_current_sec();
+	page_pools.epp_last_access = cfs_time_current_sec();
+
+	spin_lock_init(&page_pools.epp_lock);
+	page_pools.epp_total_pages = 0;
+	page_pools.epp_free_pages = 0;
+
+	page_pools.epp_st_max_pages = 0;
+	page_pools.epp_st_grows = 0;
+	page_pools.epp_st_grow_fails = 0;
+	page_pools.epp_st_shrinks = 0;
+	page_pools.epp_st_access = 0;
+	page_pools.epp_st_missings = 0;
+	page_pools.epp_st_lowfree = 0;
+	page_pools.epp_st_max_wqlen = 0;
+	page_pools.epp_st_max_wait = 0;
+	/* only the table of pool pointers is allocated here, no pages yet */
+	enc_pools_alloc();
+	if (page_pools.epp_pools == NULL)
+		return -ENOMEM;
+	/* register enc_pools_shrink() as shrinker; undo the alloc on failure */
+	pools_shrinker = set_shrinker(pools_shrinker_seeks,
+					  enc_pools_shrink);
+	if (pools_shrinker == NULL) {
+		enc_pools_free();
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void sptlrpc_enc_pool_fini(void)
+{
+	unsigned long cleaned, npools;
+
+	LASSERT(pools_shrinker);
+	LASSERT(page_pools.epp_pools);
+	LASSERT(page_pools.epp_total_pages == page_pools.epp_free_pages);
+	/* asserted above: every page taken from the pools has been put back */
+	remove_shrinker(pools_shrinker);
+
+	npools = npages_to_npools(page_pools.epp_total_pages);
+	cleaned = enc_pools_cleanup(page_pools.epp_pools, npools);
+	LASSERT(cleaned == page_pools.epp_total_pages);
+
+	enc_pools_free();
+	/* only report the statistics if the pools were ever accessed */
+	if (page_pools.epp_st_access > 0) {
+		CDEBUG(D_SEC,
+		       "max pages %lu, grows %u, grow fails %u, shrinks %u, "
+		       "access %lu, missing %lu, max qlen %u, max wait "
+		       CFS_TIME_T"/%d\n",
+		       page_pools.epp_st_max_pages, page_pools.epp_st_grows,
+		       page_pools.epp_st_grow_fails,
+		       page_pools.epp_st_shrinks, page_pools.epp_st_access,
+		       page_pools.epp_st_missings, page_pools.epp_st_max_wqlen,
+		       page_pools.epp_st_max_wait, HZ);
+	}
+}
+
+/* BULK_HASH_ALG_* -> CFS_HASH_ALG_*; get_hash_name() indexes it unchecked */
+static int cfs_hash_alg_id[] = {
+	[BULK_HASH_ALG_NULL]	= CFS_HASH_ALG_NULL,
+	[BULK_HASH_ALG_ADLER32]	= CFS_HASH_ALG_ADLER32,
+	[BULK_HASH_ALG_CRC32]	= CFS_HASH_ALG_CRC32,
+	[BULK_HASH_ALG_MD5]	= CFS_HASH_ALG_MD5,
+	[BULK_HASH_ALG_SHA1]	= CFS_HASH_ALG_SHA1,
+	[BULK_HASH_ALG_SHA256]	= CFS_HASH_ALG_SHA256,
+	[BULK_HASH_ALG_SHA384]	= CFS_HASH_ALG_SHA384,
+	[BULK_HASH_ALG_SHA512]	= CFS_HASH_ALG_SHA512,
+};
+const char * sptlrpc_get_hash_name(__u8 hash_alg)
+{
+	return cfs_crypto_hash_name(cfs_hash_alg_id[hash_alg]);
+}
+EXPORT_SYMBOL(sptlrpc_get_hash_name);
+/* Look up the hash algorithm id for @algname with cfs_crypto_hash_alg(). */
+__u8 sptlrpc_get_hash_alg(const char *algname)
+{
+	return cfs_crypto_hash_alg(algname);
+}
+EXPORT_SYMBOL(sptlrpc_get_hash_alg);
+
+int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed)
+{
+	struct ptlrpc_bulk_sec_desc *desc;
+	int buflen = msg->lm_buflens[offset];
+
+	/* fetch the descriptor; a NULL buffer means the message is invalid */
+	desc = lustre_msg_buf(msg, offset, sizeof(*desc));
+	if (desc == NULL) {
+		CERROR("Invalid bulk sec desc: size %d\n", buflen);
+		return -EINVAL;
+	}
+
+	if (swabbed)
+		__swab32s(&desc->bsd_nob);
+
+	if (unlikely(desc->bsd_version != 0)) {
+		CERROR("Unexpected version %u\n", desc->bsd_version);
+		return -EPROTO;
+	}
+
+	if (unlikely(desc->bsd_type >= SPTLRPC_BULK_MAX)) {
+		CERROR("Invalid type %u\n", desc->bsd_type);
+		return -EPROTO;
+	}
+
+	/* FIXME more sanity check here */
+	switch (desc->bsd_svc) {
+	case SPTLRPC_BULK_SVC_NULL:
+	case SPTLRPC_BULK_SVC_INTG:
+	case SPTLRPC_BULK_SVC_PRIV:
+		return 0;
+	default:
+		CERROR("Invalid svc %u\n", desc->bsd_svc);
+		return -EPROTO;
+	}
+}
+EXPORT_SYMBOL(bulk_sec_desc_unpack);
+/* Hash the pages of @desc with @alg into @buf (at most @buflen bytes). */
+int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg,
+			      void *buf, int buflen)
+{
+	struct cfs_crypto_hash_desc	*hdesc;
+	int				hashsize;
+	char				hashbuf[64];
+	unsigned int			bufsize;
+	int				i, err;
+
+	LASSERT(alg > BULK_HASH_ALG_NULL && alg < BULK_HASH_ALG_MAX);
+	LASSERT(buflen >= 4);
+
+	hdesc = cfs_crypto_hash_init(cfs_hash_alg_id[alg], NULL, 0);
+	if (IS_ERR(hdesc)) {
+		CERROR("Unable to initialize checksum hash %s\n",
+		       cfs_crypto_hash_name(cfs_hash_alg_id[alg]));
+		return PTR_ERR(hdesc);
+	}
+	/* a digest longer than @buf is built in hashbuf and truncated below */
+	hashsize = cfs_crypto_hash_digestsize(cfs_hash_alg_id[alg]);
+
+	for (i = 0; i < desc->bd_iov_count; i++) {
+		cfs_crypto_hash_update_page(hdesc, desc->bd_iov[i].kiov_page,
+				  desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK,
+				  desc->bd_iov[i].kiov_len);
+	}
+	if (hashsize > buflen) {
+		bufsize = sizeof(hashbuf);
+		err = cfs_crypto_hash_final(hdesc, (unsigned char *)hashbuf,
+					    &bufsize);
+		memcpy(buf, hashbuf, buflen);
+	} else {
+		bufsize = buflen;
+		err = cfs_crypto_hash_final(hdesc, (unsigned char *)buf,
+					    &bufsize);
+	}
+	/* NOTE(review): presumably a NULL final() frees hdesc on error - confirm */
+	if (err)
+		cfs_crypto_hash_final(hdesc, NULL, NULL);
+	return err;
+}
+EXPORT_SYMBOL(sptlrpc_get_bulk_checksum);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_config.c b/drivers/staging/lustre/lustre/ptlrpc/sec_config.c
new file mode 100644
index 000000000000..a45a3929b59f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/sec_config.c
@@ -0,0 +1,1233 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/crypto.h>
+#include <linux/key.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_log.h>
+#include <lustre_disk.h>
+#include <lustre_dlm.h>
+#include <lustre_param.h>
+#include <lustre_sec.h>
+
+#include "ptlrpc_internal.h"
+
+const char *sptlrpc_part2name(enum lustre_sec_part part)
+{
+	const char *name = "err";	/* for anything unrecognized */
+
+	if (part == LUSTRE_SP_CLI)
+		name = "cli";
+	else if (part == LUSTRE_SP_MDT)
+		name = "mdt";
+	else if (part == LUSTRE_SP_OST)
+		name = "ost";
+	else if (part == LUSTRE_SP_MGC)
+		name = "mgc";
+	else if (part == LUSTRE_SP_MGS)
+		name = "mgs";
+	else if (part == LUSTRE_SP_ANY)
+		name = "any";
+
+	return name;
+}
+EXPORT_SYMBOL(sptlrpc_part2name);
+
+enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd)
+{
+	const char *tname = obd->obd_type->typ_name;
+
+	/* map the obd type name of a target onto its security part */
+	if (strcmp(tname, LUSTRE_MDT_NAME) == 0)
+		return LUSTRE_SP_MDT;
+	if (strcmp(tname, LUSTRE_OST_NAME) == 0)
+		return LUSTRE_SP_OST;
+	if (strcmp(tname, LUSTRE_MGS_NAME) == 0)
+		return LUSTRE_SP_MGS;
+	CERROR("unknown target %p(%s)\n", obd, tname);
+	return LUSTRE_SP_ANY;
+}
+EXPORT_SYMBOL(sptlrpc_target_sec_part);
+
+/****************************************
+ * user supplied flavor string parsing  *
+ ****************************************/
+
+/*
+ * parse @str into @flvr; format: <base_flavor>[-<bulk_type:alg_spec>]
+ */
+int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr)
+{
+	char  copy[32];
+	char *bulk_spec, *alg_name;
+
+	memset(flvr, 0, sizeof(*flvr));
+
+	/* no string at all is accepted and yields the INVALID rpc flavor */
+	if (str == NULL || *str == '\0') {
+		flvr->sf_rpc = SPTLRPC_FLVR_INVALID;
+		return 0;
+	}
+
+	/* cut up a private, always terminated copy; longer input is truncated */
+	strncpy(copy, str, sizeof(copy));
+	copy[sizeof(copy) - 1] = '\0';
+
+	bulk_spec = strchr(copy, '-');
+	if (bulk_spec != NULL)
+		*bulk_spec++ = '\0';
+
+	flvr->sf_rpc = sptlrpc_name2flavor_base(copy);
+	if (flvr->sf_rpc == SPTLRPC_FLVR_INVALID)
+		goto err_out;
+
+	/* currently only base flavor "plain" can have bulk specification. */
+	if (flvr->sf_rpc != SPTLRPC_FLVR_PLAIN) {
+		if (bulk_spec != NULL)
+			goto err_out;
+		flvr->sf_flags = 0;
+		return 0;
+	}
+
+	flvr->u_bulk.hash.hash_alg = BULK_HASH_ALG_ADLER32;
+	if (bulk_spec != NULL) {
+		/* format: plain-hash:<hash_alg> */
+		alg_name = strchr(bulk_spec, ':');
+		if (alg_name == NULL)
+			goto err_out;
+		*alg_name++ = '\0';
+
+		if (strcmp(bulk_spec, "hash") != 0)
+			goto err_out;
+
+		flvr->u_bulk.hash.hash_alg = sptlrpc_get_hash_alg(alg_name);
+		if (flvr->u_bulk.hash.hash_alg >= BULK_HASH_ALG_MAX)
+			goto err_out;
+	}
+
+	if (flvr->u_bulk.hash.hash_alg == BULK_HASH_ALG_NULL)
+		flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_NULL);
+	else
+		flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_INTG);
+
+	flvr->sf_flags = 0;
+	return 0;
+
+err_out:
+	CERROR("invalid flavor string: %s\n", str);
+	return -EINVAL;
+}
+EXPORT_SYMBOL(sptlrpc_parse_flavor);
+
+/****************************************
+ * configure rules		      *
+ ****************************************/
+
+static void get_default_flavor(struct sptlrpc_flavor *sf)
+{
+	/* the default is an all-zero flavor carrying the null rpc flavor */
+	memset(sf, 0, sizeof(*sf));
+	sf->sf_flags = 0;
+	sf->sf_rpc = SPTLRPC_FLVR_NULL;
+}
+
+static void sptlrpc_rule_init(struct sptlrpc_rule *rule)
+{
+	/* wildcard rule: any network, any direction, default flavor */
+	get_default_flavor(&rule->sr_flvr);
+	rule->sr_padding = 0;
+	rule->sr_to = LUSTRE_SP_ANY;
+	rule->sr_from = LUSTRE_SP_ANY;
+	rule->sr_netid = LNET_NIDNET(LNET_NID_ANY);
+}
+
+/* format: network[.direction]=flavor */
+int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule)
+{
+	static const struct {
+		const char		*name;
+		enum lustre_sec_part	 from, to;
+	} dirs[] = {
+		{ "mdt2ost", LUSTRE_SP_MDT, LUSTRE_SP_OST },
+		{ "mdt2mdt", LUSTRE_SP_MDT, LUSTRE_SP_MDT },
+		{ "cli2ost", LUSTRE_SP_CLI, LUSTRE_SP_OST },
+		{ "cli2mdt", LUSTRE_SP_CLI, LUSTRE_SP_MDT },
+		{ NULL, LUSTRE_SP_ANY, LUSTRE_SP_ANY }
+	};
+	char *flvr_str, *dir_str;
+	int   i;
+
+	sptlrpc_rule_init(rule);
+
+	flvr_str = strchr(param, '=');
+	if (flvr_str == NULL) {
+		CERROR("invalid param, no '='\n");
+		RETURN(-EINVAL);
+	}
+	*flvr_str++ = '\0';
+
+	dir_str = strchr(param, '.');
+	if (dir_str != NULL)
+		*dir_str++ = '\0';
+
+	/* 1.1 network: "default" keeps the wildcard net set by rule_init */
+	if (strcmp(param, "default") != 0) {
+		rule->sr_netid = libcfs_str2net(param);
+		if (rule->sr_netid == LNET_NIDNET(LNET_NID_ANY)) {
+			CERROR("invalid network name: %s\n", param);
+			RETURN(-EINVAL);
+		}
+	}
+
+	/* 1.2 direction: look the name up in dirs[], NULL name ends the table */
+	if (dir_str != NULL) {
+		for (i = 0; dirs[i].name != NULL; i++)
+			if (strcmp(dir_str, dirs[i].name) == 0)
+				break;
+		if (dirs[i].name == NULL) {
+			CERROR("invalid rule dir segment: %s\n", dir_str);
+			RETURN(-EINVAL);
+		}
+		rule->sr_from = dirs[i].from;
+		rule->sr_to = dirs[i].to;
+	}
+
+	/* 2.1 flavor */
+	if (sptlrpc_parse_flavor(flvr_str, &rule->sr_flvr))
+		RETURN(-EINVAL);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(sptlrpc_parse_rule);
+
+void sptlrpc_rule_set_free(struct sptlrpc_rule_set *rset)
+{
+	/* a set without slots must not hold any rule or rule array */
+	LASSERT(rset->srs_nslot ||
+		(rset->srs_nrule == 0 && rset->srs_rules == NULL));
+	if (rset->srs_nslot == 0)
+		return;
+
+	OBD_FREE(rset->srs_rules, rset->srs_nslot * sizeof(*rset->srs_rules));
+	sptlrpc_rule_set_init(rset);
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_free);
+
+/*
+ * return 0 if the rule set can accommodate one more rule, growing the slot
+ * array by 8 slots when it is full; -ENOMEM if that allocation fails.
+ */
+int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *rset)
+{
+	struct sptlrpc_rule *grown;
+	int new_nslot;
+
+	might_sleep();
+
+	if (rset->srs_nrule < rset->srs_nslot)
+		return 0;
+
+	/* better use realloc() if available */
+	new_nslot = rset->srs_nslot + 8;
+	OBD_ALLOC(grown, new_nslot * sizeof(*rset->srs_rules));
+	if (grown == NULL)
+		return -ENOMEM;
+
+	/* carry the existing rules over, then drop the old array */
+	if (rset->srs_nrule != 0) {
+		LASSERT(rset->srs_nslot && rset->srs_rules);
+		memcpy(grown, rset->srs_rules,
+		       rset->srs_nrule * sizeof(*rset->srs_rules));
+		OBD_FREE(rset->srs_rules,
+			 rset->srs_nslot * sizeof(*rset->srs_rules));
+	}
+
+	rset->srs_nslot = new_nslot;
+	rset->srs_rules = grown;
+	return 0;
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_expand);
+
+/* rule_spec_*: is that part not a wildcard; rule_match_*: are they equal */
+static inline int rule_spec_dir(struct sptlrpc_rule *rule)
+{
+	return rule->sr_from != LUSTRE_SP_ANY || rule->sr_to != LUSTRE_SP_ANY;
+}
+static inline int rule_spec_net(struct sptlrpc_rule *rule)
+{
+	return !(rule->sr_netid == LNET_NIDNET(LNET_NID_ANY));
+}
+static inline int rule_match_dir(struct sptlrpc_rule *r1,
+				 struct sptlrpc_rule *r2)
+{
+	return !(r1->sr_from != r2->sr_from || r1->sr_to != r2->sr_to);
+}
+static inline int rule_match_net(struct sptlrpc_rule *r1,
+				 struct sptlrpc_rule *r2)
+{
+	return r1->sr_netid == r2->sr_netid;
+}
+
+/*
+ * merge @rule into @rset: replace or remove the rule with the same net and
+ * dir, else insert @rule. the @rset slots might be expanded.
+ */
+int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *rset,
+			   struct sptlrpc_rule *rule)
+{
+	struct sptlrpc_rule      *p = rset->srs_rules;
+	int		       spec_dir, spec_net;
+	int		       rc, n, match = 0;
+
+	might_sleep();
+
+	spec_net = rule_spec_net(rule);
+	spec_dir = rule_spec_dir(rule);
+
+	for (n = 0; n < rset->srs_nrule; n++) {
+		p = &rset->srs_rules[n];
+
+		/* test network match, if it fails:
+		 * - spec rule: skip rules which are also spec rules, until
+		 *   we hit a wild rule, which means no more chance
+		 * - wild rule: skip until we reach one which is also wild
+		 *   and matches
+		 */
+		if (!rule_match_net(p, rule)) {
+			if (spec_net) {
+				if (rule_spec_net(p))
+					continue;
+				else
+					break;
+			} else {
+				continue;
+			}
+		}
+
+		/* test dir match, same logic as net matching */
+		if (!rule_match_dir(p, rule)) {
+			if (spec_dir) {
+				if (rule_spec_dir(p))
+					continue;
+				else
+					break;
+			} else {
+				continue;
+			}
+		}
+
+		/* found a match: n is its slot */
+		match = 1;
+		break;
+	}
+
+	if (match) {
+		LASSERT(n >= 0 && n < rset->srs_nrule);
+
+		if (rule->sr_flvr.sf_rpc == SPTLRPC_FLVR_INVALID) {
+			/* invalid flavor: remove the matched rule */
+			if (n < rset->srs_nrule - 1)
+				memmove(&rset->srs_rules[n],
+					&rset->srs_rules[n + 1],
+					(rset->srs_nrule - n - 1) *
+					sizeof(*rule));
+			rset->srs_nrule--;
+		} else {
+			/* override the matched rule in place */
+			memcpy(&rset->srs_rules[n], rule, sizeof(*rule));
+		}
+	} else {
+		LASSERT(n >= 0 && n <= rset->srs_nrule);
+		/* no match: insert at n, the slot where the scan stopped */
+		if (rule->sr_flvr.sf_rpc != SPTLRPC_FLVR_INVALID) {
+			rc = sptlrpc_rule_set_expand(rset);
+			if (rc)
+				return rc;
+
+			if (n < rset->srs_nrule)
+				memmove(&rset->srs_rules[n + 1],
+					&rset->srs_rules[n],
+					(rset->srs_nrule - n) * sizeof(*rule));
+			memcpy(&rset->srs_rules[n], rule, sizeof(*rule));
+			rset->srs_nrule++;
+		} else {
+			CDEBUG(D_CONFIG, "ignore the unmatched deletion\n");
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_merge);
+
+/**
+ * look up the first rule in \a rset compatible with \a from, \a to and the
+ * network of \a nid; a wildcard on either side of a comparison is compatible.
+ * on a hit copy the flavor into \a sf and return 1, otherwise return 0.
+ */
+int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset,
+			    enum lustre_sec_part from,
+			    enum lustre_sec_part to,
+			    lnet_nid_t nid,
+			    struct sptlrpc_flavor *sf)
+{
+	int idx;
+
+	for (idx = 0; idx < rset->srs_nrule; idx++) {
+		struct sptlrpc_rule *cand = &rset->srs_rules[idx];
+		int net_ok, from_ok, to_ok;
+
+		net_ok = LNET_NIDNET(nid) == LNET_NIDNET(LNET_NID_ANY) ||
+			 cand->sr_netid == LNET_NIDNET(LNET_NID_ANY) ||
+			 cand->sr_netid == LNET_NIDNET(nid);
+		from_ok = from == LUSTRE_SP_ANY ||
+			  cand->sr_from == LUSTRE_SP_ANY ||
+			  cand->sr_from == from;
+		to_ok = to == LUSTRE_SP_ANY ||
+			cand->sr_to == LUSTRE_SP_ANY ||
+			cand->sr_to == to;
+
+		if (net_ok && from_ok && to_ok) {
+			*sf = cand->sr_flvr;
+			return 1;
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_choose);
+
+void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *rset)
+{
+	int idx;
+
+	/* one D_SEC debug line per rule, in slot order */
+	for (idx = 0; idx < rset->srs_nrule; idx++)
+		CDEBUG(D_SEC, "<%02d> from %x to %x, net %x, rpc %x\n", idx,
+		       rset->srs_rules[idx].sr_from, rset->srs_rules[idx].sr_to,
+		       rset->srs_rules[idx].sr_netid,
+		       rset->srs_rules[idx].sr_flvr.sf_rpc);
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_dump);
+
+static int sptlrpc_rule_set_extract(struct sptlrpc_rule_set *gen,
+				    struct sptlrpc_rule_set *tgt,
+				    enum lustre_sec_part from,
+				    enum lustre_sec_part to,
+				    struct sptlrpc_rule_set *rset)
+{
+	struct sptlrpc_rule_set *sources[2];
+	int which, idx, rc;
+
+	might_sleep();
+
+	/* general rules go in first, target-specific rules after them */
+	sources[0] = gen;
+	sources[1] = tgt;
+
+	for (which = 0; which < 2; which++) {
+		struct sptlrpc_rule_set *cur = sources[which];
+
+		for (idx = 0; cur != NULL && idx < cur->srs_nrule; idx++) {
+			struct sptlrpc_rule *rule = &cur->srs_rules[idx];
+
+			/* rule is pinned to another source or target */
+			if ((from != LUSTRE_SP_ANY &&
+			     rule->sr_from != LUSTRE_SP_ANY &&
+			     rule->sr_from != from) ||
+			    (to != LUSTRE_SP_ANY &&
+			     rule->sr_to != LUSTRE_SP_ANY && rule->sr_to != to))
+				continue;
+
+			rc = sptlrpc_rule_set_merge(rset, rule);
+			if (rc == 0)
+				continue;
+			CERROR("can't merge: %d\n", rc);
+			return rc;
+		}
+	}
+
+	return 0;
+}
+
+/**********************************
+ * sptlrpc configuration support  *
+ **********************************/
+
+struct sptlrpc_conf_tgt {
+	struct list_head	      sct_list; /* on sptlrpc_conf::sc_tgts */
+	char		    sct_name[MAX_OBD_NAME]; /* target name */
+	struct sptlrpc_rule_set sct_rset; /* rules of this target */
+};
+/* sptlrpc configuration of one filesystem, linked on sptlrpc_confs */
+struct sptlrpc_conf {
+	struct list_head	      sc_list; /* on sptlrpc_confs */
+	char		    sc_fsname[MTI_NAME_MAXLEN]; /* lookup key */
+	unsigned int	    sc_modified;  /* modified during updating */
+	unsigned int	    sc_updated:1, /* updated copy from MGS */
+				sc_local:1;   /* local copy from target */
+	struct sptlrpc_rule_set sc_rset;      /* fs general rules */
+	struct list_head	      sc_tgts;      /* target-specific rules */
+};
+/* the conf lookups in this file are done with sptlrpc_conf_lock held */
+static struct mutex sptlrpc_conf_lock;
+static LIST_HEAD(sptlrpc_confs);
+
+static inline int is_hex(char c)
+{
+	/* decimal digits and lower-case a-f only */
+	return (c >= 'a' && c <= 'f') || (c >= '0' && c <= '9');
+}
+
+static void target2fsname(const char *tgt, char *fsname, int buflen)
+{
+	const char *dash = strrchr(tgt, '-');
+	int n;
+
+	/*
+	 * when the last '-' starts "-MDT" or "-OST" followed by four
+	 * (lower-case) hex digits, the fsname is what precedes that '-'.
+	 */
+	if (dash != NULL &&
+	    (strncmp(dash, "-MDT", 4) == 0 || strncmp(dash, "-OST", 4) == 0) &&
+	    is_hex(dash[4]) && is_hex(dash[5]) &&
+	    is_hex(dash[6]) && is_hex(dash[7])) {
+		n = dash - tgt;
+	} else {
+		/* no such suffix: treat the whole string as fsname */
+		n = strlen(tgt);
+	}
+
+	/* copy at most buflen - 1 bytes and always terminate */
+	n = min(n, buflen - 1);
+	memcpy(fsname, tgt, n);
+	fsname[n] = '\0';
+}
+
+static void sptlrpc_conf_free_rsets(struct sptlrpc_conf *conf)
+{
+	struct sptlrpc_conf_tgt *conf_tgt, *tmp;
+
+	/* general rules first, then every per-target set and its holder */
+	sptlrpc_rule_set_free(&conf->sc_rset);
+	list_for_each_entry_safe(conf_tgt, tmp, &conf->sc_tgts, sct_list) {
+		list_del(&conf_tgt->sct_list);
+		sptlrpc_rule_set_free(&conf_tgt->sct_rset);
+		OBD_FREE_PTR(conf_tgt);
+	}
+	LASSERT(list_empty(&conf->sc_tgts));
+
+	/* nothing is loaded any more: neither an MGS nor a local copy */
+	conf->sc_updated = 0;
+	conf->sc_local = 0;
+}
+
+static void sptlrpc_conf_free(struct sptlrpc_conf *conf)
+{
+	CDEBUG(D_SEC, "free sptlrpc conf %s\n", conf->sc_fsname);
+	/* unlink from sptlrpc_confs, drop all rule sets, then the conf */
+	list_del(&conf->sc_list);
+	sptlrpc_conf_free_rsets(conf);
+	OBD_FREE_PTR(conf);
+}
+
+static
+struct sptlrpc_conf_tgt *sptlrpc_conf_get_tgt(struct sptlrpc_conf *conf,
+					      const char *name,
+					      int create)
+{
+	struct sptlrpc_conf_tgt *conf_tgt;
+
+	/* an entry already registered under @name always wins */
+	list_for_each_entry(conf_tgt, &conf->sc_tgts, sct_list)
+		if (!strcmp(conf_tgt->sct_name, name))
+			return conf_tgt;
+
+	if (!create)
+		return NULL;
+
+	/* add an empty entry for @name; NULL if the allocation fails */
+	OBD_ALLOC_PTR(conf_tgt);
+	if (conf_tgt == NULL)
+		return NULL;
+	strlcpy(conf_tgt->sct_name, name, sizeof(conf_tgt->sct_name));
+	sptlrpc_rule_set_init(&conf_tgt->sct_rset);
+	list_add(&conf_tgt->sct_list, &conf->sc_tgts);
+	return conf_tgt;
+}
+
+/* look up the conf of @fsname, adding it if @create; NULL on failure */
+static struct sptlrpc_conf *sptlrpc_conf_get(const char *fsname, int create)
+{
+	struct sptlrpc_conf *conf;
+
+	list_for_each_entry(conf, &sptlrpc_confs, sc_list)
+		if (strcmp(conf->sc_fsname, fsname) == 0)
+			return conf;
+	if (!create)
+		return NULL;
+
+	OBD_ALLOC_PTR(conf);
+	if (conf == NULL)
+		return NULL;
+	/* bounded copy: refuse a name that would overrun sc_fsname */
+	if (strlcpy(conf->sc_fsname, fsname, sizeof(conf->sc_fsname)) >=
+	    sizeof(conf->sc_fsname)) {
+		OBD_FREE_PTR(conf);
+		return NULL;
+	}
+	sptlrpc_rule_set_init(&conf->sc_rset);
+	INIT_LIST_HEAD(&conf->sc_tgts);
+	list_add(&conf->sc_list, &sptlrpc_confs);
+	CDEBUG(D_SEC, "create sptlrpc conf %s\n", conf->sc_fsname);
+	return conf;
+}
+
+/**
+ * merge \a rule into the general rule set of \a conf when \a target is the
+ * fsname itself, else into the rule set kept for that target (created on
+ * demand). caller must hold conf_lock already. return the result of
+ * sptlrpc_rule_set_merge(), or -ENOMEM if no target entry can be had.
+ */
+static int sptlrpc_conf_merge_rule(struct sptlrpc_conf *conf,
+				   const char *target,
+				   struct sptlrpc_rule *rule)
+{
+	struct sptlrpc_conf_tgt *tgt;
+
+	/* fsname == target means general rules for the whole fs */
+	if (!strcmp(conf->sc_fsname, target))
+		return sptlrpc_rule_set_merge(&conf->sc_rset, rule);
+
+	/* anything else is taken as a target name */
+	tgt = sptlrpc_conf_get_tgt(conf, target, 1);
+	if (tgt == NULL) {
+		CERROR("out of memory, can't merge rule!\n");
+		return -ENOMEM;
+	}
+
+	return sptlrpc_rule_set_merge(&tgt->sct_rset, rule);
+}
+
+/**
+ * process one LCFG_SPTLRPC_CONF record (target in buf 1, rule in buf 2).
+ * if \a conf is NULL it is looked up by the target's fsname under conf_lock,
+ * otherwise caller already hold conf_lock. return 0 or a negative errno.
+ */
+static int __sptlrpc_process_config(struct lustre_cfg *lcfg,
+				    struct sptlrpc_conf *conf)
+{
+	char		   *target, *param;
+	char		    fsname[MTI_NAME_MAXLEN];
+	struct sptlrpc_rule     rule;
+	int		     rc, locked = 0;
+	ENTRY;
+
+	target = lustre_cfg_string(lcfg, 1);
+	if (target == NULL) {
+		CERROR("missing target name\n");
+		RETURN(-EINVAL);
+	}
+
+	param = lustre_cfg_string(lcfg, 2);
+	if (param == NULL) {
+		CERROR("missing parameter\n");
+		RETURN(-EINVAL);
+	}
+
+	CDEBUG(D_SEC, "processing rule: %s.%s\n", target, param);
+	/* parse rule to make sure the format is correct */
+	if (strncmp(param, PARAM_SRPC_FLVR, sizeof(PARAM_SRPC_FLVR) - 1) != 0) {
+		CERROR("Invalid sptlrpc parameter: %s\n", param);
+		RETURN(-EINVAL);
+	}
+	param += sizeof(PARAM_SRPC_FLVR) - 1;
+
+	rc = sptlrpc_parse_rule(param, &rule);
+	if (rc)
+		RETURN(-EINVAL);
+
+	if (conf == NULL) {
+		target2fsname(target, fsname, sizeof(fsname));
+		mutex_lock(&sptlrpc_conf_lock);
+		locked = 1;
+		conf = sptlrpc_conf_get(fsname, 0);
+	} else {
+		LASSERT(mutex_is_locked(&sptlrpc_conf_lock));
+	}
+
+	if (conf == NULL) {
+		CERROR("can't find conf\n");
+		rc = -ENOMEM;
+	} else {
+		rc = sptlrpc_conf_merge_rule(conf, target, &rule);
+		/* bump the counter while conf_lock still pins conf */
+		if (rc == 0)
+			conf->sc_modified++;
+	}
+
+	if (locked)
+		mutex_unlock(&sptlrpc_conf_lock);
+	RETURN(rc);
+}
+
+int sptlrpc_process_config(struct lustre_cfg *lcfg)
+{
+	return __sptlrpc_process_config(lcfg, NULL); /* NULL: look conf up */
+}
+EXPORT_SYMBOL(sptlrpc_process_config);
+
+static int logname2fsname(const char *logname, char *buf, int buflen)
+{
+	const char *suffix = strrchr(logname, '-');
+	int n;
+
+	/* the name must end in "-sptlrpc", starting at its last '-' */
+	if (suffix == NULL || strcmp(suffix, "-sptlrpc") != 0) {
+		CERROR("%s is not a sptlrpc config log\n", logname);
+		return -EINVAL;
+	}
+
+	/* keep the part before the suffix, cut to fit @buf */
+	n = min((int) (suffix - logname), buflen - 1);
+	memcpy(buf, logname, n);
+	buf[n] = '\0';
+	return 0;
+}
+
+/* begin a config log update: drop local-copy rules, reset sc_modified */
+void sptlrpc_conf_log_update_begin(const char *logname)
+{
+	struct sptlrpc_conf *conf;
+	char		 fsname[16];
+
+	if (logname2fsname(logname, fsname, sizeof(fsname)))
+		return;
+
+	mutex_lock(&sptlrpc_conf_lock);
+	conf = sptlrpc_conf_get(fsname, 0);
+	if (conf && conf->sc_local) {
+		LASSERT(conf->sc_updated == 0);
+		sptlrpc_conf_free_rsets(conf);
+	}
+	if (conf)	/* no conf for this fs: nothing to reset */
+		conf->sc_modified = 0;
+	mutex_unlock(&sptlrpc_conf_lock);
+}
+EXPORT_SYMBOL(sptlrpc_conf_log_update_begin);
+
+/**
+ * the update of config log \a logname is done: mark its conf as updated.
+ */
+void sptlrpc_conf_log_update_end(const char *logname)
+{
+	struct sptlrpc_conf *conf;
+	char		 fsname[16];
+
+	if (logname2fsname(logname, fsname, sizeof(fsname)) != 0)
+		return;
+
+	mutex_lock(&sptlrpc_conf_lock);
+	conf = sptlrpc_conf_get(fsname, 0);
+	if (conf == NULL)
+		goto unlock;
+
+	/*
+	 * a conf that was not in updated state before gets its modified
+	 * counter bumped, to enforce updating the local copy.
+	 */
+	if (!conf->sc_updated)
+		conf->sc_modified++;
+	conf->sc_updated = 1;
+
+unlock:
+	mutex_unlock(&sptlrpc_conf_lock);
+}
+EXPORT_SYMBOL(sptlrpc_conf_log_update_end);
+
+void sptlrpc_conf_log_start(const char *logname)
+{
+	char name[16];
+
+	/* make sure a conf exists for the fs this log belongs to */
+	if (logname2fsname(logname, name, sizeof(name)) == 0) {
+		mutex_lock(&sptlrpc_conf_lock);
+		sptlrpc_conf_get(name, 1);
+		mutex_unlock(&sptlrpc_conf_lock);
+	}
+}
+EXPORT_SYMBOL(sptlrpc_conf_log_start);
+
+void sptlrpc_conf_log_stop(const char *logname)
+{
+	struct sptlrpc_conf *found;
+	char name[16];
+
+	if (logname2fsname(logname, name, sizeof(name)) != 0)
+		return;
+	/* drop the conf of this fs, if there is one, with all its rules */
+	mutex_lock(&sptlrpc_conf_lock);
+	found = sptlrpc_conf_get(name, 0);
+	if (found != NULL)
+		sptlrpc_conf_free(found);
+	mutex_unlock(&sptlrpc_conf_lock);
+}
+EXPORT_SYMBOL(sptlrpc_conf_log_stop);
+
+static inline void flavor_set_flags(struct sptlrpc_flavor *sf,
+				    enum lustre_sec_part from,
+				    enum lustre_sec_part to,
+				    unsigned int fl_udesc)
+{
+	/*
+	 * leave the null flavor untouched: everybody shares a single null
+	 * sec, so we'd better not set any flag on it.
+	 */
+	if (sf->sf_rpc == SPTLRPC_FLVR_NULL)
+		return;
+
+	if (from == LUSTRE_SP_MDT) {
+		/* MDT->MDT; MDT->OST */
+		sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY;
+		return;
+	}
+	if (from != LUSTRE_SP_CLI)
+		return;
+	if (to == LUSTRE_SP_OST)			/* CLI->OST */
+		sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY | PTLRPC_SEC_FL_BULK;
+	else if (to == LUSTRE_SP_MDT && fl_udesc)	/* CLI->MDT */
+		sf->sf_flags |= PTLRPC_SEC_FL_UDESC;
+}
+
+/*
+ * choose into @sf the flavor for @from -> @to traffic to @target via @nid.
+ */
+void sptlrpc_conf_choose_flavor(enum lustre_sec_part from,
+				enum lustre_sec_part to,
+				struct obd_uuid *target,
+				lnet_nid_t nid,
+				struct sptlrpc_flavor *sf)
+{
+	struct sptlrpc_conf_tgt *tgt;
+	struct sptlrpc_conf *conf;
+	char name[MTI_NAME_MAXLEN];
+	int found = 0, len;
+
+	target2fsname(target->uuid, name, sizeof(name));
+
+	mutex_lock(&sptlrpc_conf_lock);
+	conf = sptlrpc_conf_get(name, 0);
+	if (conf != NULL) {
+		/* uuid name (supposed to end with _UUID) -> target name */
+		len = strlen(target->uuid);
+		LASSERT(len > 5);
+		memcpy(name, target->uuid, len - 5);
+		name[len - 5] = '\0';
+
+		/* rules of the target itself come before the fs-wide ones */
+		tgt = sptlrpc_conf_get_tgt(conf, name, 0);
+		if (tgt != NULL)
+			found = sptlrpc_rule_set_choose(&tgt->sct_rset,
+							from, to, nid, sf);
+		if (!found)
+			found = sptlrpc_rule_set_choose(&conf->sc_rset,
+							from, to, nid, sf);
+	}
+	mutex_unlock(&sptlrpc_conf_lock);
+
+	/* no conf or no matching rule: fall back to the default flavor */
+	if (!found)
+		get_default_flavor(sf);
+
+	flavor_set_flags(sf, from, to, 1);
+}
+
+/**
+ * called by target devices: put into \a sf the flavor expected from peer
+ * (\a from, \a nid) per \a rset, or the default flavor if no rule matches.
+ */
+void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset,
+				  enum lustre_sec_part from,
+				  lnet_nid_t nid,
+				  struct sptlrpc_flavor *sf)
+{
+	if (!sptlrpc_rule_set_choose(rset, from, LUSTRE_SP_ANY, nid, sf))
+		get_default_flavor(sf);
+}
+EXPORT_SYMBOL(sptlrpc_target_choose_flavor);
+
+/* delay, in cfs_time_current_sec() units, before a notified import adapts */
+#define SEC_ADAPT_DELAY	 (10)
+
+/**
+ * called by client devices, notify the sptlrpc config has changed: if the
+ * import has a sec, make it expire SEC_ADAPT_DELAY from now.
+ */
+void sptlrpc_conf_client_adapt(struct obd_device *obd)
+{
+	struct obd_import  *imp;
+	ENTRY;
+
+	LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 ||
+		strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) ==0);
+	CDEBUG(D_SEC, "obd %s\n", obd->u.cli.cl_target_uuid.uuid);
+
+	/* serialize with connect/disconnect import */
+	down_read(&obd->u.cli.cl_sem);
+	imp = obd->u.cli.cl_import;
+	if (imp != NULL) {
+		spin_lock(&imp->imp_lock);
+		if (imp->imp_sec)
+			imp->imp_sec_expire = SEC_ADAPT_DELAY +
+					      cfs_time_current_sec();
+		spin_unlock(&imp->imp_lock);
+	}
+	up_read(&obd->u.cli.cl_sem);
+
+	EXIT;
+}
+EXPORT_SYMBOL(sptlrpc_conf_client_adapt);
+
+
+/* format @r as "srpc.flavor.<net>[.<from>2<to>]=" plus its flavor name */
+static void rule2string(struct sptlrpc_rule *r, char *buf, int buflen)
+{
+	char    dirbuf[16];	/* ".cli2mdt" alone needs 9 bytes with the NUL */
+	char   *net;
+	int     len;
+
+	net = r->sr_netid == LNET_NIDNET(LNET_NID_ANY) ?
+	      "default" : libcfs_net2str(r->sr_netid);
+
+	if (r->sr_from == LUSTRE_SP_ANY && r->sr_to == LUSTRE_SP_ANY)
+		dirbuf[0] = '\0';
+	else
+		snprintf(dirbuf, sizeof(dirbuf), ".%s2%s",
+			 sptlrpc_part2name(r->sr_from),
+			 sptlrpc_part2name(r->sr_to));
+
+	/* snprintf() returns the untruncated length: keep the offset in @buf */
+	len = snprintf(buf, buflen, "srpc.flavor.%s%s=", net, dirbuf);
+	len = min(len, buflen - 1);
+	sptlrpc_flavor2name(&r->sr_flvr, buf + len, buflen - len);
+	buf[buflen - 1] = '\0';
+}
+
+static int sptlrpc_record_rule_set(struct llog_handle *llh,
+				   char *target,
+				   struct sptlrpc_rule_set *rset)
+{
+	struct lustre_cfg_bufs  bufs;
+	struct lustre_cfg      *lcfg;
+	struct llog_rec_hdr     rec;
+	int		     buflen;
+	char		    param[48];
+	int		     i, rc;
+	/* write each rule of @rset to @llh as an LCFG_SPTLRPC_CONF record */
+	for (i = 0; i < rset->srs_nrule; i++) {
+		rule2string(&rset->srs_rules[i], param, sizeof(param));
+		/* buf 1 = target name, buf 2 = the rule in string form */
+		lustre_cfg_bufs_reset(&bufs, NULL);
+		lustre_cfg_bufs_set_string(&bufs, 1, target);
+		lustre_cfg_bufs_set_string(&bufs, 2, param);
+		lcfg = lustre_cfg_new(LCFG_SPTLRPC_CONF, &bufs);
+		LASSERT(lcfg);
+		/* only lrh_len and lrh_type of the header are set here */
+		buflen = lustre_cfg_len(lcfg->lcfg_bufcount,
+					lcfg->lcfg_buflens);
+		rec.lrh_len = llog_data_len(buflen);
+		rec.lrh_type = OBD_CFG_REC;
+		rc = llog_write(NULL, llh, &rec, NULL, 0, (void *)lcfg, -1);
+		if (rc)
+			CERROR("failed to write a rec: rc = %d\n", rc);
+		lustre_cfg_free(lcfg);
+	}
+	return 0;	/* write errors are logged above, not returned */
+}
+
+static int sptlrpc_record_rules(struct llog_handle *llh,
+				struct sptlrpc_conf *conf)
+{
+	struct sptlrpc_conf_tgt *tgt;
+
+	/* fs-wide rules are recorded under the fsname, the rest per target */
+	sptlrpc_record_rule_set(llh, conf->sc_fsname, &conf->sc_rset);
+	list_for_each_entry(tgt, &conf->sc_tgts, sct_list) {
+		sptlrpc_record_rule_set(llh, tgt->sct_name, &tgt->sct_rset);
+	}
+
+	return 0;
+}
+
+#define LOG_SPTLRPC_TMP "sptlrpc.tmp"
+#define LOG_SPTLRPC     "sptlrpc"
+/* write @conf's rules to a temporary llog, then rename it to LOG_SPTLRPC */
+static
+int sptlrpc_target_local_copy_conf(struct obd_device *obd,
+				   struct sptlrpc_conf *conf)
+{
+	struct llog_handle   *llh = NULL;
+	struct llog_ctxt     *ctxt;
+	struct lvfs_run_ctxt  saved;
+	struct dentry	*dentry;
+	int		   rc;
+	ENTRY;
+	/* the context obtained here is put again at out_ctx */
+	ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+	if (ctxt == NULL)
+		RETURN(-EINVAL);
+
+	push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+	dentry = ll_lookup_one_len(MOUNT_CONFIGS_DIR, cfs_fs_pwd(current->fs),
+				   strlen(MOUNT_CONFIGS_DIR));
+	if (IS_ERR(dentry)) {
+		rc = PTR_ERR(dentry);
+		CERROR("cannot lookup %s directory: rc = %d\n",
+		       MOUNT_CONFIGS_DIR, rc);
+		GOTO(out_ctx, rc);
+	}
+
+	/* erase the old tmp log; it is fine if there is none (-ENOENT) */
+	rc = llog_erase(NULL, ctxt, NULL, LOG_SPTLRPC_TMP);
+	if (rc < 0 && rc != -ENOENT) {
+		CERROR("%s: cannot erase temporary sptlrpc log: rc = %d\n",
+		       obd->obd_name, rc);
+		GOTO(out_dput, rc);
+	}
+
+	/* write temporary log */
+	rc = llog_open_create(NULL, ctxt, &llh, NULL, LOG_SPTLRPC_TMP);
+	if (rc)
+		GOTO(out_dput, rc);
+	rc = llog_init_handle(NULL, llh, LLOG_F_IS_PLAIN, NULL);
+	if (rc)
+		GOTO(out_close, rc);
+
+	rc = sptlrpc_record_rules(llh, conf);
+	/* the rename below happens only when rc is 0 at this point */
+out_close:
+	llog_close(NULL, llh);
+	if (rc == 0)
+		rc = lustre_rename(dentry, obd->obd_lvfs_ctxt.pwdmnt,
+				   LOG_SPTLRPC_TMP, LOG_SPTLRPC);
+out_dput:
+	l_dput(dentry);
+out_ctx:
+	pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+	llog_ctxt_put(ctxt);
+	CDEBUG(D_SEC, "target %s: write local sptlrpc conf: rc = %d\n",
+	       obd->obd_name, rc);
+	RETURN(rc);
+}
+
+static int local_read_handler(const struct lu_env *env,
+			      struct llog_handle *llh,
+			      struct llog_rec_hdr *rec, void *data)
+{
+	struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1);
+	int payload, rc;
+	ENTRY;
+
+	if (rec->lrh_type != OBD_CFG_REC) {
+		CERROR("unhandled lrh_type: %#x\n", rec->lrh_type);
+		RETURN(-EINVAL);
+	}
+
+	/* the lustre_cfg sits between the record header and its tail */
+	payload = rec->lrh_len - sizeof(struct llog_rec_hdr) -
+		  sizeof(struct llog_rec_tail);
+	rc = lustre_cfg_sanity_check(lcfg, payload);
+	if (rc) {
+		CERROR("Insane cfg\n");
+		RETURN(rc);
+	}
+
+	if (lcfg->lcfg_command != LCFG_SPTLRPC_CONF) {
+		CERROR("invalid command (%x)\n", lcfg->lcfg_command);
+		RETURN(-EINVAL);
+	}
+
+	/* @data is the conf the record gets merged into */
+	RETURN(__sptlrpc_process_config(lcfg, (struct sptlrpc_conf *)data));
+}
+
+static
+int sptlrpc_target_local_read_conf(struct obd_device *obd,
+				   struct sptlrpc_conf *conf)
+{
+	struct llog_handle    *llh = NULL;
+	struct llog_ctxt      *ctxt;
+	struct lvfs_run_ctxt   saved;
+	int		    rc;
+	ENTRY;
+	/* load the rules of the local LOG_SPTLRPC llog, if any, into @conf */
+	LASSERT(conf->sc_updated == 0 && conf->sc_local == 0);
+
+	ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+	if (ctxt == NULL) {
+		CERROR("missing llog context\n");
+		RETURN(-EINVAL);
+	}
+
+	push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+	rc = llog_open(NULL, ctxt, &llh, NULL, LOG_SPTLRPC, LLOG_OPEN_EXISTS);
+	if (rc < 0) {
+		if (rc == -ENOENT)
+			rc = 0;	/* no local copy is not an error */
+		GOTO(out_pop, rc);
+	}
+
+	rc = llog_init_handle(NULL, llh, LLOG_F_IS_PLAIN, NULL);
+	if (rc)
+		GOTO(out_close, rc);
+
+	if (llog_get_size(llh) <= 1) {
+		CDEBUG(D_SEC, "no local sptlrpc copy found\n");
+		GOTO(out_close, rc = 0);
+	}
+
+	rc = llog_process(NULL, llh, local_read_handler, (void *)conf, NULL);
+	/* keep all of the local copy or none of it */
+	if (rc == 0) {
+		conf->sc_local = 1;
+	} else {
+		sptlrpc_conf_free_rsets(conf);
+	}
+
+out_close:
+	llog_close(NULL, llh);
+out_pop:
+	pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+	llog_ctxt_put(ctxt);
+	CDEBUG(D_SEC, "target %s: read local sptlrpc conf: rc = %d\n",
+	       obd->obd_name, rc);
+	RETURN(rc);
+}
+
+
+/**
+ * called by target devices (MDT or OST): merge into \a rset the rules of
+ * this target's fs that apply to it, for future rpc flavor checking.
+ */
+int sptlrpc_conf_target_get_rules(struct obd_device *obd,
+				  struct sptlrpc_rule_set *rset,
+				  int initial)
+{
+	struct sptlrpc_conf      *conf;
+	struct sptlrpc_conf_tgt  *conf_tgt;
+	enum lustre_sec_part      sp_dst;
+	char		      fsname[MTI_NAME_MAXLEN];
+	int		       rc = 0;
+	ENTRY;
+
+	if (strcmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME) == 0) {
+		sp_dst = LUSTRE_SP_MDT;
+	} else if (strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME) == 0) {
+		sp_dst = LUSTRE_SP_OST;
+	} else {
+		CERROR("unexpected obd type %s\n", obd->obd_type->typ_name);
+		RETURN(-EINVAL);
+	}
+	CDEBUG(D_SEC, "get rules for target %s\n", obd->obd_uuid.uuid);
+
+	target2fsname(obd->obd_uuid.uuid, fsname, sizeof(fsname));
+
+	mutex_lock(&sptlrpc_conf_lock);
+
+	conf = sptlrpc_conf_get(fsname, 0);
+	if (conf == NULL) {
+		CERROR("missing sptlrpc config log\n");
+		GOTO(out, rc);	/* NOTE(review): rc is still 0 here */
+	}
+
+	if (conf->sc_updated  == 0) {
+		/*
+		 * always read from local copy. here another option is
+		 * if we already have a local copy (read from another
+		 * target device hosted on the same node) we simply use that.
+		 */
+		if (conf->sc_local)
+			sptlrpc_conf_free_rsets(conf);
+		/* NOTE(review): the result of the read is not checked */
+		sptlrpc_target_local_read_conf(obd, conf);
+	} else {
+		LASSERT(conf->sc_local == 0);
+
+		/* write a local copy; its result is not checked either */
+		if (initial || conf->sc_modified)
+			sptlrpc_target_local_copy_conf(obd, conf);
+		else
+			CDEBUG(D_SEC, "unchanged, skip updating local copy\n");
+	}
+
+	/* extract rule set for this target */
+	conf_tgt = sptlrpc_conf_get_tgt(conf, obd->obd_name, 0);
+
+	rc = sptlrpc_rule_set_extract(&conf->sc_rset,
+				      conf_tgt ? &conf_tgt->sct_rset: NULL,
+				      LUSTRE_SP_ANY, sp_dst, rset);
+out:
+	mutex_unlock(&sptlrpc_conf_lock);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(sptlrpc_conf_target_get_rules);
+
+int  sptlrpc_conf_init(void)
+{
+	mutex_init(&sptlrpc_conf_lock);	/* taken around conf list lookups */
+	return 0;
+}
+
+void sptlrpc_conf_fini(void)
+{
+	struct sptlrpc_conf *conf, *tmp;
+
+	/* free every remaining conf; each one unlinks itself from the list */
+	mutex_lock(&sptlrpc_conf_lock);
+	list_for_each_entry_safe(conf, tmp, &sptlrpc_confs, sc_list)
+		sptlrpc_conf_free(conf);
+	LASSERT(list_empty(&sptlrpc_confs));
+	mutex_unlock(&sptlrpc_conf_lock);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c b/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c
new file mode 100644
index 000000000000..4c96a14a1bb6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c
@@ -0,0 +1,250 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec_gc.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_sec.h>
+
+#define SEC_GC_INTERVAL (30 * 60)	/* gc thread wait timeout, seconds */
+
+/* sec_gc_list: secs to be gc'ed; the gc thread walks it under sec_gc_mutex */
+static struct mutex sec_gc_mutex;
+static LIST_HEAD(sec_gc_list);
+static spinlock_t sec_gc_list_lock;
+/* contexts handed over by sptlrpc_gc_add_ctx() for the gc thread to put */
+static LIST_HEAD(sec_gc_ctx_list);
+static spinlock_t sec_gc_ctx_list_lock;
+/* the gc thread, and the count of deleters waiting for it to yield */
+static struct ptlrpc_thread sec_gc_thread;
+static atomic_t sec_gc_wait_del = ATOMIC_INIT(0);
+
+
+/* register @sec with the gc thread; first check is one interval from now */
+void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec)
+{
+	LASSERT(sec->ps_policy->sp_cops->gc_ctx);
+	LASSERT(sec->ps_gc_interval > 0);
+	LASSERT(list_empty(&sec->ps_gc_list));
+
+	sec->ps_gc_next = cfs_time_current_sec() + sec->ps_gc_interval;
+	/* list_add_tail(new, head): link the sec into the global gc list */
+	spin_lock(&sec_gc_list_lock);
+	list_add_tail(&sec->ps_gc_list, &sec_gc_list);
+	spin_unlock(&sec_gc_list_lock);
+	CDEBUG(D_SEC, "added sec %p(%s)\n", sec, sec->ps_policy->sp_name);
+}
+EXPORT_SYMBOL(sptlrpc_gc_add_sec);
+
+void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec)
+{
+	if (list_empty(&sec->ps_gc_list))
+		return;
+	/* @sec is on the gc list: take it off and wait for the gc thread */
+	might_sleep();
+
+	/* signal before list_del to make iteration in gc thread safe */
+	atomic_inc(&sec_gc_wait_del);
+
+	spin_lock(&sec_gc_list_lock);
+	list_del_init(&sec->ps_gc_list);
+	spin_unlock(&sec_gc_list_lock);
+
+	/* barrier: the gc thread holds sec_gc_mutex while walking the list */
+	mutex_lock(&sec_gc_mutex);
+	mutex_unlock(&sec_gc_mutex);
+
+	atomic_dec(&sec_gc_wait_del);
+
+	CDEBUG(D_SEC, "del sec %p(%s)\n", sec, sec->ps_policy->sp_name);
+}
+EXPORT_SYMBOL(sptlrpc_gc_del_sec);
+
+void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx)
+{
+	LASSERT(list_empty(&ctx->cc_gc_chain));
+	/* queue @ctx for the gc thread, which does the final put on it */
+	CDEBUG(D_SEC, "hand over ctx %p(%u->%s)\n",
+	       ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+	spin_lock(&sec_gc_ctx_list_lock);
+	list_add(&ctx->cc_gc_chain, &sec_gc_ctx_list);
+	spin_unlock(&sec_gc_ctx_list_lock);
+	/* SVC_SIGNAL ends the gc thread's wait before its timeout */
+	thread_add_flags(&sec_gc_thread, SVC_SIGNAL);
+	wake_up(&sec_gc_thread.t_ctl_waitq);
+}
+EXPORT_SYMBOL(sptlrpc_gc_add_ctx);
+
+static void sec_process_ctx_list(void)
+{
+	struct ptlrpc_cli_ctx *ctx;
+	/* put every ctx handed over through sptlrpc_gc_add_ctx() */
+	spin_lock(&sec_gc_ctx_list_lock);
+
+	while (!list_empty(&sec_gc_ctx_list)) {
+		ctx = list_entry(sec_gc_ctx_list.next,
+				     struct ptlrpc_cli_ctx, cc_gc_chain);
+		list_del_init(&ctx->cc_gc_chain);
+		spin_unlock(&sec_gc_ctx_list_lock);
+		/* the put is done with the list lock dropped */
+		LASSERT(ctx->cc_sec);
+		LASSERT(atomic_read(&ctx->cc_refcount) == 1);
+		CDEBUG(D_SEC, "gc pick up ctx %p(%u->%s)\n",
+		       ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+		sptlrpc_cli_ctx_put(ctx, 1);
+
+		spin_lock(&sec_gc_ctx_list_lock);
+	}
+
+	spin_unlock(&sec_gc_ctx_list_lock);
+}
+
+static void sec_do_gc(struct ptlrpc_sec *sec)
+{
+	LASSERT(sec->ps_policy->sp_cops->gc_ctx);
+
+	if (unlikely(sec->ps_gc_next == 0)) {
+		CDEBUG(D_SEC, "sec %p(%s) has 0 gc time\n",
+		      sec, sec->ps_policy->sp_name);
+		return;
+	}
+
+	CDEBUG(D_SEC, "check on sec %p(%s)\n", sec, sec->ps_policy->sp_name);
+
+	/* run the policy's gc once due, then schedule the next round */
+	if (!cfs_time_after(sec->ps_gc_next, cfs_time_current_sec())) {
+		sec->ps_policy->sp_cops->gc_ctx(sec);
+		sec->ps_gc_next = cfs_time_current_sec() + sec->ps_gc_interval;
+	}
+}
+
+static int sec_gc_main(void *arg)
+{
+	struct ptlrpc_thread *thread = (struct ptlrpc_thread *) arg;
+	struct l_wait_info    lwi;
+
+	unshare_fs_struct();
+
+	/* Record that the thread is running; sptlrpc_gc_init() waits for it */
+	thread_set_flags(thread, SVC_RUNNING);
+	wake_up(&thread->t_ctl_waitq);
+
+	while (1) {
+		struct ptlrpc_sec *sec;
+		/* consume a pending SVC_SIGNAL before serving the ctx list */
+		thread_clear_flags(thread, SVC_SIGNAL);
+		sec_process_ctx_list();
+again:
+		/* go through sec list do gc.
+		 * FIXME here we iterate through the whole list each time which
+		 * is not optimal. we perhaps want to use balanced binary tree
+		 * to trace each sec as order of expiry time.
+		 * another issue here is we wakeup as fixed interval instead of
+		 * according to each sec's expiry time */
+		mutex_lock(&sec_gc_mutex);
+		list_for_each_entry(sec, &sec_gc_list, ps_gc_list) {
+			/* if someone is waiting to be deleted, let it
+			 * proceed as soon as possible. */
+			if (atomic_read(&sec_gc_wait_del)) {
+				CDEBUG(D_SEC, "deletion pending, start over\n");
+				mutex_unlock(&sec_gc_mutex);
+				goto again;
+			}
+
+			sec_do_gc(sec);
+		}
+		mutex_unlock(&sec_gc_mutex);
+
+		/* check ctx list again before sleep */
+		sec_process_ctx_list();
+		/* wait for a stop request, a signal, or the gc interval */
+		lwi = LWI_TIMEOUT(SEC_GC_INTERVAL * HZ, NULL, NULL);
+		l_wait_event(thread->t_ctl_waitq,
+			     thread_is_stopping(thread) ||
+			     thread_is_signal(thread),
+			     &lwi);
+
+		if (thread_test_and_clear_flags(thread, SVC_STOPPING))
+			break;
+	}
+	/* sptlrpc_gc_fini() waits for the SVC_STOPPED flag */
+	thread_set_flags(thread, SVC_STOPPED);
+	wake_up(&thread->t_ctl_waitq);
+	return 0;
+}
+
+int sptlrpc_gc_init(void)
+{
+	struct l_wait_info lwi = { 0 };
+	task_t *task;
+	/* set up the locks, then start the gc thread and wait for it */
+	mutex_init(&sec_gc_mutex);
+	spin_lock_init(&sec_gc_list_lock);
+	spin_lock_init(&sec_gc_ctx_list_lock);
+
+	/* initialize thread control */
+	memset(&sec_gc_thread, 0, sizeof(sec_gc_thread));
+	init_waitqueue_head(&sec_gc_thread.t_ctl_waitq);
+
+	task = kthread_run(sec_gc_main, &sec_gc_thread, "sptlrpc_gc");
+	if (IS_ERR(task)) {
+		CERROR("can't start gc thread: %ld\n", PTR_ERR(task));
+		return PTR_ERR(task);
+	}
+	/* sec_gc_main() sets SVC_RUNNING as soon as it has started */
+	l_wait_event(sec_gc_thread.t_ctl_waitq,
+		     thread_is_running(&sec_gc_thread), &lwi);
+	return 0;
+}
+
+void sptlrpc_gc_fini(void)
+{
+	struct l_wait_info lwi = { 0 };
+	/* ask the gc thread to stop, then wait until it flags SVC_STOPPED */
+	thread_set_flags(&sec_gc_thread, SVC_STOPPING);
+	wake_up(&sec_gc_thread.t_ctl_waitq);
+
+	l_wait_event(sec_gc_thread.t_ctl_waitq,
+		     thread_is_stopped(&sec_gc_thread), &lwi);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c b/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c
new file mode 100644
index 000000000000..920591b3bb17
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c
@@ -0,0 +1,198 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec_lproc.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/crypto.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_dlm.h>
+#include <lustre_sec.h>
+
+#include "ptlrpc_internal.h"
+
+/* proc directory registered as "sptlrpc" by sptlrpc_lproc_init(), else NULL */
+struct proc_dir_entry *sptlrpc_proc_root = NULL;
+EXPORT_SYMBOL(sptlrpc_proc_root);
+
+/* Render the PTLRPC_SEC_FL_* bits set in @flags as "name," tokens in @buf. */
+char *sec_flags2str(unsigned long flags, char *buf, int bufsize)
+{
+	*buf = '\0';
+
+	if (flags & PTLRPC_SEC_FL_REVERSE)
+		strlcat(buf, "reverse,", bufsize);
+	if (flags & PTLRPC_SEC_FL_ROOTONLY)
+		strlcat(buf, "rootonly,", bufsize);
+	if (flags & PTLRPC_SEC_FL_UDESC)
+		strlcat(buf, "udesc,", bufsize);
+	if (flags & PTLRPC_SEC_FL_BULK)
+		strlcat(buf, "bulk,", bufsize);
+	if (*buf == '\0')	/* nothing was appended: emit a placeholder */
+		strlcat(buf, "-,", bufsize);
+	return buf;
+}
+
+/* "srpc_info" show handler: print flavor, flags, refcount and gc timing of
+ * the security referenced by the client import; prints nothing if there is
+ * no import or no security.  Always returns 0. */
+static int sptlrpc_info_lprocfs_seq_show(struct seq_file *seq, void *v)
+{
+	struct obd_device *dev = seq->private;
+	struct client_obd *cli = &dev->u.cli;
+	struct ptlrpc_sec *sec = NULL;
+	char	       str[32];	/* scratch, refilled by each formatter */
+
+	LASSERT(strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 ||
+		strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 ||
+		strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) == 0);
+
+	if (cli->cl_import)
+		sec = sptlrpc_import_sec_ref(cli->cl_import);
+	if (sec == NULL)
+		return 0;
+
+	seq_printf(seq, "rpc flavor:    %s\n",
+		   sptlrpc_flavor2name_base(sec->ps_flvr.sf_rpc));
+	seq_printf(seq, "bulk flavor:   %s\n",
+		   sptlrpc_flavor2name_bulk(&sec->ps_flvr, str, sizeof(str)));
+	seq_printf(seq, "flags:	 %s\n",
+		   sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str)));
+	seq_printf(seq, "id:	    %d\n", sec->ps_id);
+	seq_printf(seq, "refcount:      %d\n",
+		   atomic_read(&sec->ps_refcount));
+	seq_printf(seq, "nctx:	  %d\n", atomic_read(&sec->ps_nctx));
+	seq_printf(seq, "gc internal    %ld\n", sec->ps_gc_interval);
+	seq_printf(seq, "gc next	%ld\n",
+		   sec->ps_gc_interval ?
+		   sec->ps_gc_next - cfs_time_current_sec() : 0);
+
+	sptlrpc_sec_put(sec);
+	return 0;
+}
+LPROC_SEQ_FOPS_RO(sptlrpc_info_lprocfs);
+
+/* "srpc_contexts" show handler: let the policy's display hook print, if any. */
+static int sptlrpc_ctxs_lprocfs_seq_show(struct seq_file *seq, void *v)
+{
+	struct obd_device *dev = seq->private;
+	struct client_obd *cli = &dev->u.cli;
+	struct ptlrpc_sec *sec = NULL;
+
+	LASSERT(strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 ||
+		strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 ||
+		strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) == 0);
+
+	if (cli->cl_import)
+		sec = sptlrpc_import_sec_ref(cli->cl_import);
+	if (sec == NULL)
+		return 0;
+
+	if (sec->ps_policy->sp_cops->display)
+		sec->ps_policy->sp_cops->display(sec, seq);
+
+	sptlrpc_sec_put(sec);
+	return 0;
+}
+LPROC_SEQ_FOPS_RO(sptlrpc_ctxs_lprocfs);
+
+/* Create the "srpc_info" and "srpc_contexts" proc entries for an OSC, MDC or
+ * MGC device.  Returns 0, -EINVAL for other obd types, or the error from
+ * lprocfs_obd_seq_create(). */
+int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev)
+{
+	const char *type = dev->obd_type->typ_name;
+	int rc;
+
+	if (strcmp(type, LUSTRE_OSC_NAME) != 0 &&
+	    strcmp(type, LUSTRE_MDC_NAME) != 0 &&
+	    strcmp(type, LUSTRE_MGC_NAME) != 0) {
+		CERROR("can't register lproc for obd type %s\n", type);
+		return -EINVAL;
+	}
+
+	rc = lprocfs_obd_seq_create(dev, "srpc_info", 0444,
+				    &sptlrpc_info_lprocfs_fops, dev);
+	if (rc) {
+		CERROR("create proc entry srpc_info for %s: %d\n",
+		       dev->obd_name, rc);
+		return rc;
+	}
+
+	rc = lprocfs_obd_seq_create(dev, "srpc_contexts", 0444,
+				    &sptlrpc_ctxs_lprocfs_fops, dev);
+	if (rc)
+		CERROR("create proc entry srpc_contexts for %s: %d\n",
+		       dev->obd_name, rc);
+	return rc;
+}
+EXPORT_SYMBOL(sptlrpc_lprocfs_cliobd_attach);
+
+static struct lprocfs_vars sptlrpc_lprocfs_vars[] = {	/* for lprocfs_register() */
+	{ "encrypt_page_pools", sptlrpc_proc_read_enc_pool, NULL, NULL },
+	{ NULL }	/* NOTE(review): all-NULL entry, presumably the terminator */
+};
+
+/* Register "sptlrpc" under proc_lustre_root; returns 0 or the PTR_ERR code. */
+int sptlrpc_lproc_init(void)
+{
+	struct proc_dir_entry *root;
+
+	LASSERT(sptlrpc_proc_root == NULL);
+
+	root = lprocfs_register("sptlrpc", proc_lustre_root,
+				sptlrpc_lprocfs_vars, NULL);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+
+	sptlrpc_proc_root = root;
+	return 0;
+}
+
+void sptlrpc_lproc_fini(void)
+{
+	if (!sptlrpc_proc_root)		/* never registered, nothing to remove */
+		return;
+	lprocfs_remove(&sptlrpc_proc_root);
+	sptlrpc_proc_root = NULL;
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_null.c b/drivers/staging/lustre/lustre/ptlrpc/sec_null.c
new file mode 100644
index 000000000000..ff1137fe4dd6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/sec_null.c
@@ -0,0 +1,464 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec_null.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+
+#include <obd_support.h>
+#include <obd_cksum.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_sec.h>
+
+static struct ptlrpc_sec_policy null_policy;	/* initialized further down */
+static struct ptlrpc_sec	null_sec;	/* filled by null_init_internal() */
+static struct ptlrpc_cli_ctx    null_cli_ctx;	/* filled by null_init_internal() */
+static struct ptlrpc_svc_ctx    null_svc_ctx;	/* initialized further down */
+
+/*
+ * The null policy borrows the topmost 8 bits of lm_secflvr to carry the
+ * source sec part; this ORs the low byte of @sp into those bits.
+ */
+static inline void null_encode_sec_part(struct lustre_msg *msg,
+					enum lustre_sec_part sp)
+{
+	msg->lm_secflvr |= ((__u32)sp & 0xFF) << 24;
+}
+
+/* Read the source sec part back from the topmost 8 bits of lm_secflvr. */
+static inline enum lustre_sec_part null_decode_sec_part(struct lustre_msg *msg)
+{
+	return 0xFF & (msg->lm_secflvr >> 24);
+}
+
+static int null_ctx_refresh(struct ptlrpc_cli_ctx *ctx)
+{
+	/* refresh must never be requested for the null ctx: treat it as a bug */
+	LBUG();
+	return 0;
+}
+
+/* Stamp the request with the null flavor (plus our sec part); returns 0. */
+static int null_ctx_sign(struct ptlrpc_cli_ctx *ctx,
+			 struct ptlrpc_request *req)
+{
+	struct lustre_msg *msg = req->rq_reqbuf;
+
+	msg->lm_secflvr = SPTLRPC_FLVR_NULL;
+	if (!req->rq_import->imp_dlm_fake)
+		null_encode_sec_part(msg,
+				     req->rq_import->imp_obd->u.cli.cl_sp_me);
+	req->rq_reqdata_len = req->rq_reqlen;
+	return 0;
+}
+
+/* Take the reply as-is; an early reply must also pass a checksum check. */
+static int null_ctx_verify(struct ptlrpc_cli_ctx *ctx,
+			   struct ptlrpc_request *req)
+{
+	__u32 expected, computed;
+
+	LASSERT(req->rq_repdata);
+
+	req->rq_repmsg = req->rq_repdata;
+	req->rq_replen = req->rq_repdata_len;
+
+	if (!req->rq_early)
+		return 0;
+
+	expected = lustre_msg_get_cksum(req->rq_repdata);
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+	if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_CKSUM_INCOMPAT18)
+		computed = lustre_msg_calc_cksum(req->rq_repmsg, 0);
+	else
+		computed = lustre_msg_calc_cksum(req->rq_repmsg, 1);
+#else
+# warning "remove checksum compatibility support for b1_8"
+	computed = lustre_msg_calc_cksum(req->rq_repmsg);
+#endif
+	if (computed != expected) {
+		CDEBUG(D_SEC, "early reply checksum mismatch: %08x != %08x\n",
+		       computed, expected);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Hand out the single static null_sec; nothing is allocated per import. */
+static struct ptlrpc_sec *null_create_sec(struct obd_import *imp,
+					  struct ptlrpc_svc_ctx *svc_ctx,
+					  struct sptlrpc_flavor *sf)
+{
+	LASSERT(SPTLRPC_POLICY_NULL == SPTLRPC_FLVR_POLICY(sf->sf_rpc));
+
+	/* The general layer has taken a module reference for us; since this
+	 * sec is never really destroyed, simply drop that reference here.
+	 */
+	sptlrpc_policy_put(&null_policy);
+	return &null_sec;
+}
+
+/* The static null_sec is never freed; only check that it is the one passed. */
+static void null_destroy_sec(struct ptlrpc_sec *sec)
+{
+	LASSERT(&null_sec == sec);
+}
+
+/* Every lookup yields the one shared null_cli_ctx, with an extra reference. */
+static struct ptlrpc_cli_ctx *null_lookup_ctx(struct ptlrpc_sec *sec,
+					      struct vfs_cred *vcred,
+					      int create, int remove_dead)
+{
+	atomic_inc(&null_cli_ctx.cc_refcount);
+	return &null_cli_ctx;
+}
+
+/* The null policy only has the static null_cli_ctx (see null_lookup_ctx),
+ * so there is nothing to flush; always returns 0. */
+static int null_flush_ctx_cache(struct ptlrpc_sec *sec, uid_t uid,
+				int grace, int force)
+{
+	return 0;
+}
+
+/* Provide a request buffer of at least @msgsize bytes: zero the existing
+ * (pool) buffer if there is one, else allocate a power-of-two sized one.
+ * Returns 0 or -ENOMEM. */
+static int null_alloc_reqbuf(struct ptlrpc_sec *sec,
+			     struct ptlrpc_request *req, int msgsize)
+{
+	if (req->rq_reqbuf) {
+		LASSERT(req->rq_pool);
+		LASSERT(req->rq_reqbuf_len >= msgsize);
+		memset(req->rq_reqbuf, 0, msgsize);
+	} else {
+		int alloc_size = size_roundup_power2(msgsize);
+
+		LASSERT(!req->rq_pool);
+		OBD_ALLOC_LARGE(req->rq_reqbuf, alloc_size);
+		if (!req->rq_reqbuf)
+			return -ENOMEM;
+		req->rq_reqbuf_len = alloc_size;
+	}
+
+	req->rq_reqmsg = req->rq_reqbuf;
+	return 0;
+}
+
+/* Free the request buffer, unless the request has a pool (rq_pool set). */
+static void null_free_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req)
+{
+	if (req->rq_pool)
+		return;
+
+	LASSERTF(req->rq_reqmsg == req->rq_reqbuf,
+		 "req %p: reqmsg %p is not reqbuf %p in null sec\n",
+		 req, req->rq_reqmsg, req->rq_reqbuf);
+	LASSERTF(req->rq_reqbuf_len >= req->rq_reqlen,
+		 "req %p: reqlen %d should smaller than buflen %d\n",
+		 req, req->rq_reqlen, req->rq_reqbuf_len);
+
+	OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+	req->rq_reqbuf = NULL;
+	req->rq_reqbuf_len = 0;
+}
+
+/* Allocate a reply buffer for @msgsize bytes plus lustre_msg_early_size(),
+ * rounded up to a power of two.  Returns 0 or -ENOMEM. */
+static int null_alloc_repbuf(struct ptlrpc_sec *sec,
+			     struct ptlrpc_request *req, int msgsize)
+{
+	int alloc_size;
+
+	/* reserve extra room so that an early reply also fits */
+	alloc_size = size_roundup_power2(msgsize + lustre_msg_early_size());
+
+	OBD_ALLOC_LARGE(req->rq_repbuf, alloc_size);
+	if (!req->rq_repbuf)
+		return -ENOMEM;
+
+	req->rq_repbuf_len = alloc_size;
+	return 0;
+}
+
+/* Release the reply buffer, which must exist, and clear its bookkeeping. */
+static void null_free_repbuf(struct ptlrpc_sec *sec,
+			     struct ptlrpc_request *req)
+{
+	LASSERT(req->rq_repbuf);
+
+	OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len);
+	req->rq_repbuf_len = 0;
+	req->rq_repbuf = NULL;
+}
+
+/* Grow buffer @segment of the request message to @newsize, reallocating the
+ * request buffer when the enlarged message no longer fits.
+ * Returns 0 or -ENOMEM. */
+static int null_enlarge_reqbuf(struct ptlrpc_sec *sec,
+			       struct ptlrpc_request *req,
+			       int segment, int newsize)
+{
+	struct lustre_msg *msg = req->rq_reqmsg;
+	struct lustre_msg *bigger;
+	int saved_len, grown_size, alloc_size;
+
+	LASSERT(req->rq_reqbuf);
+	LASSERT(req->rq_reqbuf == req->rq_reqmsg);
+	LASSERT(req->rq_reqbuf_len >= req->rq_reqlen);
+	LASSERT(req->rq_reqlen == lustre_packed_msg_size(msg));
+
+	/* temporarily patch the segment length to size the enlarged message */
+	saved_len = msg->lm_buflens[segment];
+	msg->lm_buflens[segment] = newsize;
+	grown_size = lustre_packed_msg_size(msg);
+	msg->lm_buflens[segment] = saved_len;
+
+	/* request from pool should always have enough buffer */
+	LASSERT(!req->rq_pool || req->rq_reqbuf_len >= grown_size);
+
+	if (req->rq_reqbuf_len < grown_size) {
+		alloc_size = size_roundup_power2(grown_size);
+		OBD_ALLOC_LARGE(bigger, alloc_size);
+		if (bigger == NULL)
+			return -ENOMEM;
+
+		memcpy(bigger, req->rq_reqbuf, req->rq_reqlen);
+		OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+		req->rq_reqbuf = req->rq_reqmsg = bigger;
+		req->rq_reqbuf_len = alloc_size;
+	}
+
+	_sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize);
+	req->rq_reqlen = grown_size;
+
+	return 0;
+}
+
+static struct ptlrpc_svc_ctx null_svc_ctx = {	/* attached to requests by null_accept() */
+	.sc_refcount    = ATOMIC_INIT(1),
+	.sc_policy      = &null_policy,
+};
+
+/* Accept an incoming null-flavor request and attach null_svc_ctx to it.
+ * Returns SECSVC_OK, or SECSVC_DROP if the flavor is not SPTLRPC_FLVR_NULL. */
+static int null_accept(struct ptlrpc_request *req)
+{
+	LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) ==
+		SPTLRPC_POLICY_NULL);
+
+	if (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL) {
+		CERROR("Invalid rpc flavor 0x%x\n", req->rq_flvr.sf_rpc);
+		return SECSVC_DROP;
+	}
+
+	/* the received buffer is used directly as the request message */
+	req->rq_reqmsg = req->rq_reqbuf;
+	req->rq_reqlen = req->rq_reqdata_len;
+	req->rq_sp_from = null_decode_sec_part(req->rq_reqbuf);
+
+	atomic_inc(&null_svc_ctx.sc_refcount);
+	req->rq_svc_ctx = &null_svc_ctx;
+	return SECSVC_OK;
+}
+
+/* Prepare a reply state whose message buffer of @msgsize bytes directly
+ * follows the ptlrpc_reply_state header; an already attached state is reused.
+ * Returns 0 or -ENOMEM. */
+static int null_alloc_rs(struct ptlrpc_request *req, int msgsize)
+{
+	struct ptlrpc_reply_state *rs = req->rq_reply_state;
+	int rs_size = sizeof(*rs) + msgsize;
+
+	LASSERT(msgsize % 8 == 0);
+
+	if (rs == NULL) {
+		OBD_ALLOC_LARGE(rs, rs_size);
+		if (rs == NULL)
+			return -ENOMEM;
+
+		rs->rs_size = rs_size;
+	} else {
+		/* pre-allocated */
+		LASSERT(rs->rs_size >= rs_size);
+	}
+
+	rs->rs_svc_ctx = req->rq_svc_ctx;
+	atomic_inc(&req->rq_svc_ctx->sc_refcount);
+
+	rs->rs_repbuf = (struct lustre_msg *)(rs + 1);
+	rs->rs_repbuf_len = rs_size - sizeof(*rs);
+	rs->rs_msg = rs->rs_repbuf;
+
+	req->rq_reply_state = rs;
+	return 0;
+}
+
+/* Drop the svc ctx reference; free the state unless rs_prealloc is set. */
+static void null_free_rs(struct ptlrpc_reply_state *rs)
+{
+	LASSERT_ATOMIC_GT(&rs->rs_svc_ctx->sc_refcount, 1);
+	atomic_dec(&rs->rs_svc_ctx->sc_refcount);
+	if (rs->rs_prealloc)
+		return;
+	OBD_FREE_LARGE(rs, rs->rs_size);
+}
+
+/* Stamp the reply with the null flavor.  A final reply is placed at the
+ * early-reply offset when the request header has MSGHDR_AT_SUPPORT; any
+ * other reply gets a checksum and offset 0.  Always returns 0. */
+static int null_authorize(struct ptlrpc_request *req)
+{
+	struct ptlrpc_reply_state *rs = req->rq_reply_state;
+	__u32 cksum;
+
+	LASSERT(rs);
+
+	rs->rs_repbuf->lm_secflvr = SPTLRPC_FLVR_NULL;
+	rs->rs_repdata_len = req->rq_replen;
+
+	if (likely(req->rq_packed_final)) {
+		req->rq_reply_off =
+			(lustre_msghdr_get_flags(req->rq_reqmsg) &
+			 MSGHDR_AT_SUPPORT) ? lustre_msg_early_size() : 0;
+		return 0;
+	}
+
+	/* not the final reply: store a checksum of the reply buffer in it */
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+	if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_CKSUM_INCOMPAT18)
+		cksum = lustre_msg_calc_cksum(rs->rs_repbuf, 0);
+	else
+		cksum = lustre_msg_calc_cksum(rs->rs_repbuf, 1);
+#else
+# warning "remove checksum compatibility support for b1_8"
+	cksum = lustre_msg_calc_cksum(rs->rs_repbuf);
+#endif
+	lustre_msg_set_cksum(rs->rs_repbuf, cksum);
+	req->rq_reply_off = 0;
+	return 0;
+}
+
+static struct ptlrpc_ctx_ops null_ctx_ops = {	/* cc_ops of null_cli_ctx */
+	.sign		= null_ctx_sign,
+	.verify		= null_ctx_verify,
+	.refresh	= null_ctx_refresh,
+};
+
+static struct ptlrpc_sec_cops null_sec_cops = {	/* sp_cops of null_policy */
+	.create_sec		= null_create_sec,
+	.destroy_sec		= null_destroy_sec,
+	.lookup_ctx		= null_lookup_ctx,
+	.flush_ctx_cache	= null_flush_ctx_cache,
+	.alloc_reqbuf		= null_alloc_reqbuf,
+	.free_reqbuf		= null_free_reqbuf,
+	.enlarge_reqbuf		= null_enlarge_reqbuf,
+	.alloc_repbuf		= null_alloc_repbuf,
+	.free_repbuf		= null_free_repbuf,
+};
+
+static struct ptlrpc_sec_sops null_sec_sops = {	/* sp_sops of null_policy */
+	.accept		= null_accept,
+	.authorize	= null_authorize,
+	.alloc_rs	= null_alloc_rs,
+	.free_rs	= null_free_rs,
+};
+
+static struct ptlrpc_sec_policy null_policy = {	/* see sptlrpc_null_init() */
+	.sp_name	= "sec.null",
+	.sp_owner	= THIS_MODULE,
+	.sp_policy	= SPTLRPC_POLICY_NULL,
+	.sp_cops	= &null_sec_cops,
+	.sp_sops	= &null_sec_sops,
+};
+
+static void null_init_internal(void)
+{
+	static HLIST_HEAD(__list);
+	/* fill in the static null_sec and its only context, null_cli_ctx */
+	null_sec.ps_policy = &null_policy;
+	atomic_set(&null_sec.ps_refcount, 1);     /* always busy */
+	null_sec.ps_id = -1;
+	null_sec.ps_import = NULL;
+	null_sec.ps_flvr.sf_rpc = SPTLRPC_FLVR_NULL;
+	null_sec.ps_flvr.sf_flags = 0;
+	null_sec.ps_part = LUSTRE_SP_ANY;
+	null_sec.ps_dying = 0;
+	spin_lock_init(&null_sec.ps_lock);
+	atomic_set(&null_sec.ps_nctx, 1);	 /* for "null_cli_ctx" */
+	INIT_LIST_HEAD(&null_sec.ps_gc_list);
+	null_sec.ps_gc_interval = 0;
+	null_sec.ps_gc_next = 0;
+
+	hlist_add_head(&null_cli_ctx.cc_cache, &__list); /* private dummy list */
+	atomic_set(&null_cli_ctx.cc_refcount, 1);    /* for hash */
+	null_cli_ctx.cc_sec = &null_sec;
+	null_cli_ctx.cc_ops = &null_ctx_ops;
+	null_cli_ctx.cc_expire = 0;
+	null_cli_ctx.cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_ETERNAL |
+				PTLRPC_CTX_UPTODATE;
+	null_cli_ctx.cc_vcred.vc_uid = 0;
+	spin_lock_init(&null_cli_ctx.cc_lock);
+	INIT_LIST_HEAD(&null_cli_ctx.cc_req_list);
+	INIT_LIST_HEAD(&null_cli_ctx.cc_gc_chain);
+}
+
+/* Set up the static null sec/ctx, then register the null policy. */
+int sptlrpc_null_init(void)
+{
+	int rc;
+
+	null_init_internal();
+	rc = sptlrpc_register_policy(&null_policy);
+	if (rc != 0)
+		CERROR("failed to register %s: %d\n", null_policy.sp_name, rc);
+
+	return rc;
+}
+
+/* Unregister the null policy; a failure is only logged. */
+void sptlrpc_null_fini(void)
+{
+	int rc = sptlrpc_unregister_policy(&null_policy);
+
+	if (rc)
+		CERROR("failed to unregister %s: %d\n", null_policy.sp_name, rc);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c b/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c
new file mode 100644
index 000000000000..f552d2f182b1
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c
@@ -0,0 +1,1021 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec_plain.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+
+#include <obd_support.h>
+#include <obd_cksum.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_sec.h>
+
+struct plain_sec {
+	struct ptlrpc_sec       pls_base;	/* embedded sec, see sec2plsec() */
+	rwlock_t	    pls_lock;	/* taken around accesses to pls_ctx */
+	struct ptlrpc_cli_ctx  *pls_ctx;	/* the installed ctx, or NULL */
+};
+
+static inline struct plain_sec *sec2plsec(struct ptlrpc_sec *sec)
+{
+	return container_of(sec, struct plain_sec, pls_base); /* sec is a pls_base */
+}
+
+static struct ptlrpc_sec_policy plain_policy;	/* declared up front; */
+static struct ptlrpc_ctx_ops    plain_ctx_ops;	/* initializers presumably */
+static struct ptlrpc_svc_ctx    plain_svc_ctx;	/* follow later in the file */
+
+static unsigned int plain_at_offset;	/* NOTE(review): use not visible here */
+
+/*
+ * For simplicity, every plain policy rpc uses the same fixed 4-buffer layout.
+ */
+#define PLAIN_PACK_SEGMENTS	     (4)
+
+#define PLAIN_PACK_HDR_OFF	      (0)	/* struct plain_header */
+#define PLAIN_PACK_MSG_OFF	      (1)	/* the wrapped lustre message */
+#define PLAIN_PACK_USER_OFF	     (2)	/* user desc, if rq_pack_udesc */
+#define PLAIN_PACK_BULK_OFF	     (3)	/* bulk sec desc, if rq_pack_bulk */
+
+#define PLAIN_FL_USER		   (0x01)	/* ph_flags: user desc packed */
+#define PLAIN_FL_BULK		   (0x02)	/* ph_flags: bulk desc packed */
+
+struct plain_header {
+	__u8	    ph_ver;	    /* version; only 0 is written and accepted */
+	__u8	    ph_flags;	/* PLAIN_FL_* bits */
+	__u8	    ph_sp;	     /* source sec part */
+	__u8	    ph_bulk_hash_alg;  /* complete flavor desc */
+	__u8	    ph_pad[4];	/* brings the members to 8 bytes in total */
+};
+
+struct plain_bulk_token {
+	__u8	    pbt_hash[8];	/* bulk checksum bytes */
+};
+/* size of the bulk buffer: the generic bulk sec desc followed by our token */
+#define PLAIN_BSD_SIZE \
+	(sizeof(struct ptlrpc_bulk_sec_desc) + sizeof(struct plain_bulk_token))
+
+/****************************************
+ * bulk checksum helpers		*
+ ****************************************/
+
+/* Validate the bulk sec desc buffer of @msg; returns 0 or -EPROTO. */
+static int plain_unpack_bsd(struct lustre_msg *msg, int swabbed)
+{
+	struct ptlrpc_bulk_sec_desc *bsd;
+
+	if (bulk_sec_desc_unpack(msg, PLAIN_PACK_BULK_OFF, swabbed))
+		return -EPROTO;
+
+	bsd = lustre_msg_buf(msg, PLAIN_PACK_BULK_OFF, PLAIN_BSD_SIZE);
+	if (bsd == NULL) {
+		CERROR("bulk sec desc has short size %d\n",
+		       lustre_msg_buflen(msg, PLAIN_PACK_BULK_OFF));
+		return -EPROTO;
+	}
+
+	if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL ||
+	    bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG)
+		return 0;
+
+	CERROR("invalid bulk svc %u\n", bsd->bsd_svc);
+	return -EPROTO;
+}
+
+/* Compute the bulk checksum of @desc into @token; no-op for the NULL alg. */
+static int plain_generate_bulk_csum(struct ptlrpc_bulk_desc *desc,
+				    __u8 hash_alg,
+				    struct plain_bulk_token *token)
+{
+	if (BULK_HASH_ALG_NULL == hash_alg)
+		return 0;
+	memset(token->pbt_hash, 0, sizeof(token->pbt_hash));
+	return sptlrpc_get_bulk_checksum(desc, hash_alg, token->pbt_hash,
+					 sizeof(token->pbt_hash));
+}
+
+/* Recompute @desc's checksum; -EACCES if it differs from token @tokenr. */
+static int plain_verify_bulk_csum(struct ptlrpc_bulk_desc *desc,
+				  __u8 hash_alg,
+				  struct plain_bulk_token *tokenr)
+{
+	struct plain_bulk_token local;
+	int rc;
+
+	if (hash_alg == BULK_HASH_ALG_NULL)
+		return 0;
+
+	memset(local.pbt_hash, 0, sizeof(local.pbt_hash));
+	rc = sptlrpc_get_bulk_checksum(desc, hash_alg, local.pbt_hash,
+				       sizeof(local.pbt_hash));
+	if (rc)
+		return rc;
+	if (memcmp(tokenr->pbt_hash, local.pbt_hash, sizeof(local.pbt_hash)))
+		return -EACCES;
+	return 0;
+}
+
+/* Flip one bit in the first bulk fragment that has a non-zero length. */
+static void corrupt_bulk_data(struct ptlrpc_bulk_desc *desc)
+{
+	unsigned int idx = 0, off;
+	char *page;
+
+	while (idx < desc->bd_iov_count && desc->bd_iov[idx].kiov_len == 0)
+		idx++;
+	if (idx >= desc->bd_iov_count)
+		return;
+
+	page = kmap(desc->bd_iov[idx].kiov_page);
+	off = desc->bd_iov[idx].kiov_offset & ~CFS_PAGE_MASK;
+	page[off] ^= 0x1;
+	kunmap(desc->bd_iov[idx].kiov_page);
+}
+
+/****************************************
+ * cli_ctx apis			 *
+ ****************************************/
+
+/* A plain ctx is installed as PTLRPC_CTX_UPTODATE; refresh must not happen. */
+static int plain_ctx_refresh(struct ptlrpc_cli_ctx *ctx)
+{
+	/* should never reach here */
+	LBUG();
+	return 0;
+}
+
+/* Nothing is checked for a plain ctx: always returns 0. */
+static int plain_ctx_validate(struct ptlrpc_cli_ctx *ctx)
+{
+	return 0;
+}
+
+/* Fill the plain header of the request and set rq_reqdata_len; returns 0. */
+static int plain_ctx_sign(struct ptlrpc_cli_ctx *ctx,
+			  struct ptlrpc_request *req)
+{
+	struct lustre_msg   *msg = req->rq_reqbuf;
+	struct plain_header *phdr;
+	ENTRY;
+
+	msg->lm_secflvr = req->rq_flvr.sf_rpc;
+	phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0);
+	phdr->ph_ver = 0;
+	phdr->ph_sp = ctx->cc_sec->ps_part;
+	phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg;
+
+	phdr->ph_flags = 0;
+	if (req->rq_pack_udesc)
+		phdr->ph_flags |= PLAIN_FL_USER;
+	if (req->rq_pack_bulk)
+		phdr->ph_flags |= PLAIN_FL_BULK;
+
+	req->rq_reqdata_len = lustre_msg_size_v2(msg->lm_bufcount,
+						 msg->lm_buflens);
+	RETURN(0);
+}
+
+/* Check the plain header of a reply and locate the wrapped reply message.
+ * Returns 0, -EPROTO on a malformed reply, -EINVAL on a bad early cksum. */
+static int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx,
+			    struct ptlrpc_request *req)
+{
+	struct lustre_msg   *msg = req->rq_repdata;
+	struct plain_header *phdr;
+	__u32		cksum;
+	int		  swabbed;
+	ENTRY;
+
+	if (msg->lm_bufcount != PLAIN_PACK_SEGMENTS) {
+		CERROR("unexpected reply buf count %u\n", msg->lm_bufcount);
+		RETURN(-EPROTO);
+	}
+
+	swabbed = ptlrpc_rep_need_swab(req);
+
+	phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr));
+	if (phdr == NULL) {
+		CERROR("missing plain header\n");
+		RETURN(-EPROTO);
+	}
+
+	if (phdr->ph_ver != 0) {
+		CERROR("Invalid header version\n");
+		RETURN(-EPROTO);
+	}
+
+	/* expect no user desc in reply */
+	if (phdr->ph_flags & PLAIN_FL_USER) {
+		CERROR("Unexpected udesc flag in reply\n");
+		RETURN(-EPROTO);
+	}
+
+	if (phdr->ph_bulk_hash_alg != req->rq_flvr.u_bulk.hash.hash_alg) {
+		CERROR("reply bulk flavor %u != %u\n", phdr->ph_bulk_hash_alg,
+		       req->rq_flvr.u_bulk.hash.hash_alg);
+		RETURN(-EPROTO);
+	}
+
+	if (unlikely(req->rq_early)) {
+		unsigned int hsize = 4;	/* size of the cksum output buffer */
+
+		cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32,
+				lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0),
+				lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF),
+				NULL, 0, (unsigned char *)&cksum, &hsize);
+		if (cksum != msg->lm_cksum) {
+			CDEBUG(D_SEC,
+			       "early reply checksum mismatch: %08x != %08x\n",
+			       cpu_to_le32(cksum), msg->lm_cksum);
+			RETURN(-EINVAL);
+		}
+	} else {
+		/* whether we sent with bulk or not, we expect the same
+		 * in reply (early replies never reach this branch) */
+		if (!equi(req->rq_pack_bulk == 1,
+			  phdr->ph_flags & PLAIN_FL_BULK)) {
+			CERROR("%s bulk checksum in reply\n",
+			       req->rq_pack_bulk ? "Missing" : "Unexpected");
+			RETURN(-EPROTO);
+		}
+
+		if ((phdr->ph_flags & PLAIN_FL_BULK) &&
+		    plain_unpack_bsd(msg, swabbed))
+			RETURN(-EPROTO);
+	}
+
+	req->rq_repmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0);
+	req->rq_replen = lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF);
+	RETURN(0);
+}
+
+/* Fill the bulk sec desc of the request and, for bulk writes with a
+ * non-NULL bulk service, store the data checksum in its token.
+ * Returns 0 or the error from computing the checksum. */
+static int plain_cli_wrap_bulk(struct ptlrpc_cli_ctx *ctx,
+			       struct ptlrpc_request *req,
+			       struct ptlrpc_bulk_desc *desc)
+{
+	struct ptlrpc_bulk_sec_desc *bsd;
+	struct plain_bulk_token     *token;
+	int			  rc;
+
+	LASSERT(req->rq_pack_bulk);
+	LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS);
+
+	bsd = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0);
+	token = (struct plain_bulk_token *) bsd->bsd_data;
+
+	bsd->bsd_version = 0;
+	bsd->bsd_flags = 0;
+	bsd->bsd_type = SPTLRPC_BULK_DEFAULT;
+	bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc);
+
+	/* no checksum is generated for the NULL bulk service or for reads */
+	if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL || req->rq_bulk_read)
+		return 0;
+
+	rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg,
+				      token);
+	if (rc) {
+		CERROR("bulk write: failed to compute checksum: %d\n", rc);
+	} else {
+		/*
+		 * for sending we only compute the wrong checksum instead
+		 * of corrupting the data so it is still correct on a redo
+		 */
+		if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND) &&
+		    req->rq_flvr.u_bulk.hash.hash_alg != BULK_HASH_ALG_NULL)
+			token->pbt_hash[0] ^= 0x1;
+	}
+
+	return rc;
+}
+
+/* For a bulk write, report the BSD_FL_ERR flag found in the reply; for a
+ * bulk read, clip the iov to the bytes transferred and verify the checksum.
+ * Returns 0, -EIO, or the error from plain_verify_bulk_csum(). */
+static int plain_cli_unwrap_bulk(struct ptlrpc_cli_ctx *ctx,
+				 struct ptlrpc_request *req,
+				 struct ptlrpc_bulk_desc *desc)
+{
+	struct ptlrpc_bulk_sec_desc *bsdv;
+	struct plain_bulk_token     *tokenv;
+	int			  i, nob = 0;
+	int			  rc;
+
+	LASSERT(req->rq_pack_bulk);
+	LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS);
+	LASSERT(req->rq_repdata->lm_bufcount == PLAIN_PACK_SEGMENTS);
+
+	bsdv = lustre_msg_buf(req->rq_repdata, PLAIN_PACK_BULK_OFF, 0);
+	tokenv = (struct plain_bulk_token *) bsdv->bsd_data;
+
+	if (req->rq_bulk_write)
+		return (bsdv->bsd_flags & BSD_FL_ERR) ? -EIO : 0;
+
+	/* fix the actual data size: clip the fragment lengths so that they
+	 * add up to no more than bd_nob_transferred */
+	for (i = 0; i < desc->bd_iov_count; i++) {
+		if (desc->bd_iov[i].kiov_len + nob > desc->bd_nob_transferred) {
+			desc->bd_iov[i].kiov_len =
+				desc->bd_nob_transferred - nob;
+		}
+		nob += desc->bd_iov[i].kiov_len;
+	}
+
+	rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg,
+				    tokenv);
+	if (rc)
+		CERROR("bulk read: client verify failed: %d\n", rc);
+
+	return rc;
+}
+
+/****************************************
+ * sec apis			     *
+ ****************************************/
+
+/* Return the sec's ctx with a reference for the caller, creating and
+ * installing it first if none is installed.  Returns NULL only when no ctx
+ * is installed and allocating a new one failed. */
+static struct ptlrpc_cli_ctx *plain_sec_install_ctx(struct plain_sec *plsec)
+{
+	struct ptlrpc_cli_ctx  *ctx, *spare;
+
+	/* allocate before locking; freed again if a ctx is already installed */
+	OBD_ALLOC_PTR(spare);
+
+	write_lock(&plsec->pls_lock);
+	ctx = plsec->pls_ctx;
+	if (ctx) {
+		atomic_inc(&ctx->cc_refcount);
+		if (spare)
+			OBD_FREE_PTR(spare);
+	} else if (spare) {
+		ctx = spare;
+
+		atomic_set(&ctx->cc_refcount, 1); /* for cache */
+		ctx->cc_sec = &plsec->pls_base;
+		ctx->cc_ops = &plain_ctx_ops;
+		ctx->cc_expire = 0;
+		ctx->cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_UPTODATE;
+		ctx->cc_vcred.vc_uid = 0;
+		spin_lock_init(&ctx->cc_lock);
+		INIT_LIST_HEAD(&ctx->cc_req_list);
+		INIT_LIST_HEAD(&ctx->cc_gc_chain);
+
+		plsec->pls_ctx = ctx;
+		atomic_inc(&plsec->pls_base.ps_nctx);
+		atomic_inc(&plsec->pls_base.ps_refcount);
+
+		atomic_inc(&ctx->cc_refcount); /* for caller */
+	}
+	write_unlock(&plsec->pls_lock);
+
+	return ctx;
+}
+
+/* Free an unreferenced, ctx-less plain sec and drop its import reference. */
+static void plain_destroy_sec(struct ptlrpc_sec *sec)
+{
+	struct plain_sec *plsec = sec2plsec(sec);
+	ENTRY;
+
+	LASSERT(sec->ps_policy == &plain_policy);
+	LASSERT(sec->ps_import);
+	LASSERT(atomic_read(&sec->ps_refcount) == 0);
+	LASSERT(atomic_read(&sec->ps_nctx) == 0);
+	LASSERT(plsec->pls_ctx == NULL);
+
+	class_import_put(sec->ps_import);
+
+	OBD_FREE_PTR(plsec);
+	EXIT;
+}
+
+/* Only flag the sec as dying; nothing else is torn down here. */
+static void plain_kill_sec(struct ptlrpc_sec *sec)
+{
+	sec->ps_dying = 1;
+}
+
+/* Allocate and initialize a plain sec for @imp with flavor @sf.  When a
+ * @svc_ctx is given (reverse sec) the ctx is installed right away.
+ * Returns the new sec, or NULL if an allocation failed. */
+static struct ptlrpc_sec *plain_create_sec(struct obd_import *imp,
+					   struct ptlrpc_svc_ctx *svc_ctx,
+					   struct sptlrpc_flavor *sf)
+{
+	struct plain_sec       *plsec;
+	struct ptlrpc_sec      *sec;
+	struct ptlrpc_cli_ctx  *ctx;
+	ENTRY;
+
+	LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN);
+
+	OBD_ALLOC_PTR(plsec);
+	if (plsec == NULL)
+		RETURN(NULL);
+
+	/* plain-specific part first, then the embedded generic sec */
+	rwlock_init(&plsec->pls_lock);
+	plsec->pls_ctx = NULL;
+
+	sec = &plsec->pls_base;
+	sec->ps_policy = &plain_policy;
+	atomic_set(&sec->ps_refcount, 0);
+	atomic_set(&sec->ps_nctx, 0);
+	sec->ps_id = sptlrpc_get_next_secid();
+	sec->ps_import = class_import_get(imp);
+	sec->ps_flvr = *sf;
+	spin_lock_init(&sec->ps_lock);
+	INIT_LIST_HEAD(&sec->ps_gc_list);
+	sec->ps_gc_interval = 0;
+	sec->ps_gc_next = 0;
+
+	/* install ctx immediately if this is a reverse sec */
+	if (svc_ctx) {
+		ctx = plain_sec_install_ctx(plsec);
+		if (ctx == NULL) {
+			plain_destroy_sec(sec);
+			RETURN(NULL);
+		}
+		sptlrpc_cli_ctx_put(ctx, 1);
+	}
+
+	RETURN(sec);
+}
+
+/* Return the sec's ctx with a reference taken, installing one if missing. */
+static struct ptlrpc_cli_ctx *plain_lookup_ctx(struct ptlrpc_sec *sec,
+					       struct vfs_cred *vcred,
+					       int create, int remove_dead)
+{
+	struct plain_sec      *plsec = sec2plsec(sec);
+	struct ptlrpc_cli_ctx *found;
+	ENTRY;
+
+	read_lock(&plsec->pls_lock);
+	found = plsec->pls_ctx;
+	if (found)
+		atomic_inc(&found->cc_refcount);
+	read_unlock(&plsec->pls_lock);
+
+	if (unlikely(found == NULL))
+		found = plain_sec_install_ctx(plsec);
+
+	RETURN(found);
+}
+
+/* Free a ctx whose refcount is zero and drop its count and ref on the sec. */
+static void plain_release_ctx(struct ptlrpc_sec *sec,
+			      struct ptlrpc_cli_ctx *ctx, int sync)
+{
+	LASSERT(atomic_read(&sec->ps_refcount) > 0);
+	LASSERT(atomic_read(&sec->ps_nctx) > 0);
+	LASSERT(atomic_read(&ctx->cc_refcount) == 0);
+	LASSERT(sec == ctx->cc_sec);
+
+	OBD_FREE_PTR(ctx);
+
+	atomic_dec(&sec->ps_nctx);
+	sptlrpc_sec_put(sec);
+}
+
+/* Detach the installed ctx and drop its reference; only for uid == -1. */
+static int plain_flush_ctx_cache(struct ptlrpc_sec *sec,
+				 uid_t uid, int grace, int force)
+{
+	struct plain_sec      *plsec = sec2plsec(sec);
+	struct ptlrpc_cli_ctx *old;
+	ENTRY;
+
+	/* do nothing unless the caller wants to flush for 'all' */
+	if (uid != -1)
+		RETURN(0);
+
+	write_lock(&plsec->pls_lock);
+	old = plsec->pls_ctx;
+	plsec->pls_ctx = NULL;
+	write_unlock(&plsec->pls_lock);
+
+	if (old)
+		sptlrpc_cli_ctx_put(old, 1);
+	RETURN(0);
+}
+
+/*
+ * Prepare the request buffer of \a req for an embedded message of
+ * \a msgsize bytes.
+ *
+ * The wrapper message has PLAIN_PACK_SEGMENTS segments: plain header,
+ * embedded message, optional user descriptor and optional bulk security
+ * descriptor.  A request that already owns a buffer must come from a pool
+ * and has its buffer zeroed; otherwise a buffer rounded up to a power of
+ * two is allocated.
+ *
+ * \retval 0		success, rq_reqbuf/rq_reqmsg are set up
+ * \retval -ENOMEM	buffer allocation failed
+ */
+static
+int plain_alloc_reqbuf(struct ptlrpc_sec *sec,
+		       struct ptlrpc_request *req,
+		       int msgsize)
+{
+	__u32 seglens[PLAIN_PACK_SEGMENTS] = { 0, };
+	int   wrapper_len;
+	ENTRY;
+
+	seglens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header);
+	seglens[PLAIN_PACK_MSG_OFF] = msgsize;
+
+	if (req->rq_pack_udesc)
+		seglens[PLAIN_PACK_USER_OFF] = sptlrpc_current_user_desc_size();
+
+	if (req->rq_pack_bulk) {
+		LASSERT(req->rq_bulk_read || req->rq_bulk_write);
+		seglens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE;
+	}
+
+	wrapper_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, seglens);
+
+	if (req->rq_reqbuf) {
+		/* pre-allocated buffer: must be a pool request, just wipe it */
+		LASSERT(req->rq_pool);
+		LASSERT(req->rq_reqbuf_len >= wrapper_len);
+		memset(req->rq_reqbuf, 0, wrapper_len);
+	} else {
+		LASSERT(!req->rq_pool);
+
+		wrapper_len = size_roundup_power2(wrapper_len);
+		OBD_ALLOC_LARGE(req->rq_reqbuf, wrapper_len);
+		if (!req->rq_reqbuf)
+			RETURN(-ENOMEM);
+
+		req->rq_reqbuf_len = wrapper_len;
+	}
+
+	lustre_init_msg_v2(req->rq_reqbuf, PLAIN_PACK_SEGMENTS, seglens, NULL);
+	req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0);
+
+	if (req->rq_pack_udesc)
+		sptlrpc_pack_user_desc(req->rq_reqbuf, PLAIN_PACK_USER_OFF);
+
+	RETURN(0);
+}
+
+/*
+ * Free the request buffer of \a req unless the request belongs to a pool,
+ * in which case the buffer is left untouched.
+ */
+static
+void plain_free_reqbuf(struct ptlrpc_sec *sec,
+		       struct ptlrpc_request *req)
+{
+	ENTRY;
+
+	if (req->rq_pool) {
+		EXIT;
+		return;
+	}
+
+	OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+	req->rq_reqbuf = NULL;
+	req->rq_reqbuf_len = 0;
+	EXIT;
+}
+
+/*
+ * Allocate the reply buffer of \a req for an embedded reply of
+ * \a msgsize bytes.
+ *
+ * The size covers the plain header, the embedded message, an optional
+ * bulk security descriptor and plain_at_offset extra bytes for an early
+ * reply; the total is rounded up to a power of two.
+ *
+ * \retval 0		success, rq_repbuf/rq_repbuf_len are set
+ * \retval -ENOMEM	allocation failed
+ */
+static
+int plain_alloc_repbuf(struct ptlrpc_sec *sec,
+		       struct ptlrpc_request *req,
+		       int msgsize)
+{
+	__u32 seglens[PLAIN_PACK_SEGMENTS] = { 0, };
+	int repbuf_len;
+	ENTRY;
+
+	seglens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header);
+	seglens[PLAIN_PACK_MSG_OFF] = msgsize;
+
+	if (req->rq_pack_bulk) {
+		LASSERT(req->rq_bulk_read || req->rq_bulk_write);
+		seglens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE;
+	}
+
+	/* wrapper size plus room reserved for an early reply */
+	repbuf_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, seglens) +
+		     plain_at_offset;
+	repbuf_len = size_roundup_power2(repbuf_len);
+
+	OBD_ALLOC_LARGE(req->rq_repbuf, repbuf_len);
+	if (req->rq_repbuf == NULL)
+		RETURN(-ENOMEM);
+
+	req->rq_repbuf_len = repbuf_len;
+	RETURN(0);
+}
+
+/*
+ * Free the reply buffer of \a req and reset rq_repbuf/rq_repbuf_len.
+ * Unlike the request buffer, no pool check is made here.
+ */
+static
+void plain_free_repbuf(struct ptlrpc_sec *sec,
+		       struct ptlrpc_request *req)
+{
+	ENTRY;
+	OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len);
+	req->rq_repbuf = NULL;
+	req->rq_repbuf_len = 0;
+	EXIT;
+}
+
+/*
+ * Grow segment \a segment of the embedded request message to \a newsize
+ * bytes, reallocating the wrapper buffer when it is too small.
+ *
+ * Both the embedded message size and the wrapper size are first computed
+ * by temporarily patching the relevant lm_buflens[] entry and restoring it
+ * afterwards.  If the current buffer cannot hold the new wrapper, a larger
+ * one (rounded up to a power of two) is allocated, the old content copied
+ * and the old buffer freed; pool requests are asserted never to need this.
+ * The segments are then enlarged in place.
+ *
+ * \retval 0		success, rq_reqlen updated to the new message size
+ * \retval -ENOMEM	a larger buffer could not be allocated
+ */
+static
+int plain_enlarge_reqbuf(struct ptlrpc_sec *sec,
+			 struct ptlrpc_request *req,
+			 int segment, int newsize)
+{
+	struct lustre_msg      *newbuf;
+	int		     oldsize;
+	int		     newmsg_size, newbuf_size;
+	ENTRY;
+
+	LASSERT(req->rq_reqbuf);
+	LASSERT(req->rq_reqbuf_len >= req->rq_reqlen);
+	LASSERT(lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0) ==
+		req->rq_reqmsg);
+
+	/* compute new embedded msg size.  */
+	oldsize = req->rq_reqmsg->lm_buflens[segment];
+	req->rq_reqmsg->lm_buflens[segment] = newsize;
+	newmsg_size = lustre_msg_size_v2(req->rq_reqmsg->lm_bufcount,
+					 req->rq_reqmsg->lm_buflens);
+	req->rq_reqmsg->lm_buflens[segment] = oldsize;
+
+	/* compute new wrapper msg size.  */
+	oldsize = req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF];
+	req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = newmsg_size;
+	newbuf_size = lustre_msg_size_v2(req->rq_reqbuf->lm_bufcount,
+					 req->rq_reqbuf->lm_buflens);
+	req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = oldsize;
+
+	/* request from pool should always have enough buffer */
+	LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newbuf_size);
+
+	if (req->rq_reqbuf_len < newbuf_size) {
+		newbuf_size = size_roundup_power2(newbuf_size);
+
+		OBD_ALLOC_LARGE(newbuf, newbuf_size);
+		if (newbuf == NULL)
+			RETURN(-ENOMEM);
+
+		/* carry the whole old buffer over before releasing it */
+		memcpy(newbuf, req->rq_reqbuf, req->rq_reqbuf_len);
+
+		OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+		req->rq_reqbuf = newbuf;
+		req->rq_reqbuf_len = newbuf_size;
+		/* rq_reqmsg pointed into the old buffer: recompute it */
+		req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf,
+						PLAIN_PACK_MSG_OFF, 0);
+	}
+
+	/* grow the wrapper's message segment first, then the inner segment */
+	_sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, PLAIN_PACK_MSG_OFF,
+				     newmsg_size);
+	_sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize);
+
+	req->rq_reqlen = newmsg_size;
+	RETURN(0);
+}
+
+/****************************************
+ * service apis			 *
+ ****************************************/
+
+/*
+ * Statically allocated service context that plain_accept() attaches to
+ * every accepted request.  Its refcount starts at 1 and is incremented
+ * for each request and reply state that points at it.
+ */
+static struct ptlrpc_svc_ctx plain_svc_ctx = {
+	.sc_refcount    = ATOMIC_INIT(1),
+	.sc_policy      = &plain_policy,
+};
+
+/*
+ * Server side: validate an incoming plain-flavor request and unpack it.
+ *
+ * Checks the rpc flavor, the wrapper segment count and the plain header
+ * (version and bulk hash algorithm), then unpacks the optional user
+ * descriptor and bulk security descriptor as announced by ph_flags.  On
+ * success rq_reqmsg/rq_reqlen describe the embedded message and
+ * rq_svc_ctx points at the shared plain_svc_ctx with a reference taken.
+ *
+ * Every validation failure is reported as SECSVC_DROP, so the function
+ * only ever hands back SECSVC_* verdicts; a missing or invalid plain
+ * header used to leak a raw -EPROTO here while all sibling checks
+ * already returned SECSVC_DROP.
+ *
+ * \retval SECSVC_OK	request accepted
+ * \retval SECSVC_DROP	malformed request, to be dropped
+ */
+static
+int plain_accept(struct ptlrpc_request *req)
+{
+	struct lustre_msg   *msg = req->rq_reqbuf;
+	struct plain_header *phdr;
+	int		  swabbed;
+	ENTRY;
+
+	LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) ==
+		SPTLRPC_POLICY_PLAIN);
+
+	if (SPTLRPC_FLVR_BASE(req->rq_flvr.sf_rpc) !=
+	    SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN) ||
+	    SPTLRPC_FLVR_BULK_TYPE(req->rq_flvr.sf_rpc) !=
+	    SPTLRPC_FLVR_BULK_TYPE(SPTLRPC_FLVR_PLAIN)) {
+		CERROR("Invalid rpc flavor %x\n", req->rq_flvr.sf_rpc);
+		RETURN(SECSVC_DROP);
+	}
+
+	if (msg->lm_bufcount < PLAIN_PACK_SEGMENTS) {
+		CERROR("unexpected request buf count %u\n", msg->lm_bufcount);
+		RETURN(SECSVC_DROP);
+	}
+
+	swabbed = ptlrpc_req_need_swab(req);
+
+	phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr));
+	if (phdr == NULL) {
+		CERROR("missing plain header\n");
+		RETURN(SECSVC_DROP);
+	}
+
+	if (phdr->ph_ver != 0) {
+		CERROR("Invalid header version\n");
+		RETURN(SECSVC_DROP);
+	}
+
+	if (phdr->ph_bulk_hash_alg >= BULK_HASH_ALG_MAX) {
+		CERROR("invalid hash algorithm: %u\n", phdr->ph_bulk_hash_alg);
+		RETURN(SECSVC_DROP);
+	}
+
+	/* header is sane: record sender part and negotiated bulk hash */
+	req->rq_sp_from = phdr->ph_sp;
+	req->rq_flvr.u_bulk.hash.hash_alg = phdr->ph_bulk_hash_alg;
+
+	if (phdr->ph_flags & PLAIN_FL_USER) {
+		if (sptlrpc_unpack_user_desc(msg, PLAIN_PACK_USER_OFF,
+					     swabbed)) {
+			CERROR("Mal-formed user descriptor\n");
+			RETURN(SECSVC_DROP);
+		}
+
+		req->rq_pack_udesc = 1;
+		req->rq_user_desc = lustre_msg_buf(msg, PLAIN_PACK_USER_OFF, 0);
+	}
+
+	if (phdr->ph_flags & PLAIN_FL_BULK) {
+		if (plain_unpack_bsd(msg, swabbed))
+			RETURN(SECSVC_DROP);
+
+		req->rq_pack_bulk = 1;
+	}
+
+	req->rq_reqmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0);
+	req->rq_reqlen = msg->lm_buflens[PLAIN_PACK_MSG_OFF];
+
+	/* all plain requests share one static service context */
+	req->rq_svc_ctx = &plain_svc_ctx;
+	atomic_inc(&req->rq_svc_ctx->sc_refcount);
+
+	RETURN(SECSVC_OK);
+}
+
+/*
+ * Set up the reply state of \a req for a reply message of \a msgsize
+ * bytes (which must be a multiple of 8).
+ *
+ * The reply state and its wrapper message live in one allocation: the
+ * wrapper (plain header, message, optional bulk security descriptor)
+ * starts right after struct ptlrpc_reply_state.  A reply state already
+ * attached to the request is reused after asserting it is large enough;
+ * otherwise a new one is allocated.  A reference on the request's service
+ * context is taken on behalf of the reply state.
+ *
+ * \retval 0		success, req->rq_reply_state is set
+ * \retval -ENOMEM	allocation failed
+ */
+static
+int plain_alloc_rs(struct ptlrpc_request *req, int msgsize)
+{
+	struct ptlrpc_reply_state   *rs;
+	__u32			buflens[PLAIN_PACK_SEGMENTS] = { 0, };
+	int			  rs_size = sizeof(*rs);
+	ENTRY;
+
+	LASSERT(msgsize % 8 == 0);
+
+	buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header);
+	buflens[PLAIN_PACK_MSG_OFF] = msgsize;
+
+	if (req->rq_pack_bulk && (req->rq_bulk_read || req->rq_bulk_write))
+		buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE;
+
+	/* reply state header followed by the wrapper message */
+	rs_size += lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens);
+
+	rs = req->rq_reply_state;
+
+	if (rs) {
+		/* pre-allocated */
+		LASSERT(rs->rs_size >= rs_size);
+	} else {
+		OBD_ALLOC_LARGE(rs, rs_size);
+		if (rs == NULL)
+			RETURN(-ENOMEM);
+
+		rs->rs_size = rs_size;
+	}
+
+	rs->rs_svc_ctx = req->rq_svc_ctx;
+	atomic_inc(&req->rq_svc_ctx->sc_refcount);
+	/* the wrapper message starts immediately behind the reply state */
+	rs->rs_repbuf = (struct lustre_msg *) (rs + 1);
+	rs->rs_repbuf_len = rs_size - sizeof(*rs);
+
+	lustre_init_msg_v2(rs->rs_repbuf, PLAIN_PACK_SEGMENTS, buflens, NULL);
+	rs->rs_msg = lustre_msg_buf_v2(rs->rs_repbuf, PLAIN_PACK_MSG_OFF, 0);
+
+	req->rq_reply_state = rs;
+	RETURN(0);
+}
+
+/*
+ * Drop the reply state's reference on its service context and free the
+ * reply state unless it is flagged as pre-allocated (rs_prealloc).
+ *
+ * The refcount is asserted to be above 1 before the decrement, i.e. this
+ * is never expected to be the last reference.
+ */
+static
+void plain_free_rs(struct ptlrpc_reply_state *rs)
+{
+	ENTRY;
+
+	LASSERT(atomic_read(&rs->rs_svc_ctx->sc_refcount) > 1);
+	atomic_dec(&rs->rs_svc_ctx->sc_refcount);
+
+	if (!rs->rs_prealloc)
+		OBD_FREE_LARGE(rs, rs->rs_size);
+	EXIT;
+}
+
+/*
+ * Server side: finalize the wrapper of an outgoing reply.
+ *
+ * Shrinks the message segment to rq_replen if it differs from the
+ * allocated size, stamps the security flavor and plain header, and
+ * records the total length in rs_repdata_len.  For a final reply the
+ * reply offset is plain_at_offset when the request advertised
+ * MSGHDR_AT_SUPPORT and 0 otherwise; for a non-final reply a CRC32 of the
+ * embedded message is stored in lm_cksum and the offset is 0.
+ *
+ * The reply state is asserted before it is dereferenced.
+ *
+ * \retval 0	always
+ */
+static
+int plain_authorize(struct ptlrpc_request *req)
+{
+	struct ptlrpc_reply_state *rs = req->rq_reply_state;
+	struct lustre_msg_v2      *msg;
+	struct plain_header       *phdr;
+	int			len;
+	ENTRY;
+
+	LASSERT(rs);
+	msg = rs->rs_repbuf;
+	LASSERT(msg);
+
+	if (req->rq_replen != msg->lm_buflens[PLAIN_PACK_MSG_OFF])
+		len = lustre_shrink_msg(msg, PLAIN_PACK_MSG_OFF,
+					req->rq_replen, 1);
+	else
+		len = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+
+	msg->lm_secflvr = req->rq_flvr.sf_rpc;
+
+	phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0);
+	phdr->ph_ver = 0;
+	phdr->ph_flags = 0;
+	phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg;
+
+	if (req->rq_pack_bulk)
+		phdr->ph_flags |= PLAIN_FL_BULK;
+
+	rs->rs_repdata_len = len;
+
+	if (likely(req->rq_packed_final)) {
+		if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)
+			req->rq_reply_off = plain_at_offset;
+		else
+			req->rq_reply_off = 0;
+	} else {
+		/* digest size in bytes handed to the CRC32 hash */
+		unsigned int hsize = 4;
+
+		cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32,
+			lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0),
+			lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF),
+			NULL, 0, (unsigned char *)&msg->lm_cksum, &hsize);
+		req->rq_reply_off = 0;
+	}
+
+	RETURN(0);
+}
+
+/*
+ * Server side of a bulk write: verify the checksum token sent by the
+ * client against the received bulk data.
+ *
+ * The reply's bulk security descriptor is initialized to mirror the
+ * service type of the request's descriptor.  With SPTLRPC_BULK_SVC_NULL no
+ * verification is done.  On a verification failure BSD_FL_ERR is set in
+ * the reply descriptor.
+ *
+ * \retval 0	nothing to verify, or verification succeeded
+ * \retval rc	value returned by plain_verify_bulk_csum() on failure
+ */
+static
+int plain_svc_unwrap_bulk(struct ptlrpc_request *req,
+			  struct ptlrpc_bulk_desc *desc)
+{
+	struct ptlrpc_reply_state   *rs = req->rq_reply_state;
+	struct ptlrpc_bulk_sec_desc *bsdr, *bsdv;
+	struct plain_bulk_token     *tokenr;
+	int			  rc;
+
+	LASSERT(req->rq_bulk_write);
+	LASSERT(req->rq_pack_bulk);
+
+	/* bsdr: descriptor from the request; bsdv: descriptor in the reply */
+	bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0);
+	tokenr = (struct plain_bulk_token *) bsdr->bsd_data;
+	bsdv = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0);
+
+	bsdv->bsd_version = 0;
+	bsdv->bsd_type = SPTLRPC_BULK_DEFAULT;
+	bsdv->bsd_svc = bsdr->bsd_svc;
+	bsdv->bsd_flags = 0;
+
+	if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL)
+		return 0;
+
+	rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg,
+				    tokenr);
+	if (rc) {
+		bsdv->bsd_flags |= BSD_FL_ERR;
+		CERROR("bulk write: server verify failed: %d\n", rc);
+	}
+
+	return rc;
+}
+
+/*
+ * Server side of a bulk read: compute the checksum token for the bulk
+ * data about to be sent and store it in the reply's bulk security
+ * descriptor.
+ *
+ * The reply descriptor mirrors the service type requested by the client;
+ * with SPTLRPC_BULK_SVC_NULL no checksum is generated.  After a
+ * successful checksum the OBD_FAIL_OSC_CHECKSUM_RECEIVE fail point may
+ * corrupt the bulk data.
+ *
+ * \retval 0	nothing to do, or checksum generated
+ * \retval rc	value returned by plain_generate_bulk_csum() on failure
+ */
+static
+int plain_svc_wrap_bulk(struct ptlrpc_request *req,
+			struct ptlrpc_bulk_desc *desc)
+{
+	struct ptlrpc_reply_state   *rs = req->rq_reply_state;
+	struct ptlrpc_bulk_sec_desc *req_bsd, *rep_bsd;
+	struct plain_bulk_token     *rep_token;
+	int			  rc;
+
+	LASSERT(req->rq_bulk_read);
+	LASSERT(req->rq_pack_bulk);
+
+	req_bsd = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0);
+	rep_bsd = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0);
+	rep_token = (struct plain_bulk_token *) rep_bsd->bsd_data;
+
+	rep_bsd->bsd_version = 0;
+	rep_bsd->bsd_type = SPTLRPC_BULK_DEFAULT;
+	rep_bsd->bsd_svc = req_bsd->bsd_svc;
+	rep_bsd->bsd_flags = 0;
+
+	if (req_bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL)
+		return 0;
+
+	rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg,
+				      rep_token);
+	if (rc != 0) {
+		CERROR("bulk read: server failed to compute "
+		       "checksum: %d\n", rc);
+		return rc;
+	}
+
+	/* fault injection hook, only reached after a good checksum */
+	if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE))
+		corrupt_bulk_data(desc);
+
+	return rc;
+}
+
+/* Client context operations of the plain policy. */
+static struct ptlrpc_ctx_ops plain_ctx_ops = {
+	.refresh		= plain_ctx_refresh,
+	.validate	       = plain_ctx_validate,
+	.sign		   = plain_ctx_sign,
+	.verify		 = plain_ctx_verify,
+	.wrap_bulk	      = plain_cli_wrap_bulk,
+	.unwrap_bulk	    = plain_cli_unwrap_bulk,
+};
+
+/*
+ * Client-side sec operations of the plain policy: sec lifetime, context
+ * lookup/release/flush, and request/reply buffer management.
+ */
+static struct ptlrpc_sec_cops plain_sec_cops = {
+	.create_sec	     = plain_create_sec,
+	.destroy_sec	    = plain_destroy_sec,
+	.kill_sec	       = plain_kill_sec,
+	.lookup_ctx	     = plain_lookup_ctx,
+	.release_ctx	    = plain_release_ctx,
+	.flush_ctx_cache	= plain_flush_ctx_cache,
+	.alloc_reqbuf	   = plain_alloc_reqbuf,
+	.free_reqbuf	    = plain_free_reqbuf,
+	.alloc_repbuf	   = plain_alloc_repbuf,
+	.free_repbuf	    = plain_free_repbuf,
+	.enlarge_reqbuf	 = plain_enlarge_reqbuf,
+};
+
+/*
+ * Server-side operations of the plain policy: request acceptance, reply
+ * state allocation/release, reply authorization and bulk (un)wrapping.
+ */
+static struct ptlrpc_sec_sops plain_sec_sops = {
+	.accept		 = plain_accept,
+	.alloc_rs	       = plain_alloc_rs,
+	.authorize	      = plain_authorize,
+	.free_rs		= plain_free_rs,
+	.unwrap_bulk	    = plain_svc_unwrap_bulk,
+	.wrap_bulk	      = plain_svc_wrap_bulk,
+};
+
+/*
+ * The "plain" security policy descriptor, tying together the client and
+ * server operation tables.  Registered by sptlrpc_plain_init() and
+ * unregistered by sptlrpc_plain_fini().
+ */
+static struct ptlrpc_sec_policy plain_policy = {
+	.sp_owner	       = THIS_MODULE,
+	.sp_name		= "plain",
+	.sp_policy	      = SPTLRPC_POLICY_PLAIN,
+	.sp_cops		= &plain_sec_cops,
+	.sp_sops		= &plain_sec_sops,
+};
+
+/*
+ * Initialize the plain policy.
+ *
+ * Computes plain_at_offset as the size of a wrapper message whose only
+ * non-empty segment is an early-reply sized message, then registers the
+ * policy.
+ *
+ * \retval 0	success
+ * \retval rc	error returned by sptlrpc_register_policy()
+ */
+int sptlrpc_plain_init(void)
+{
+	__u32 seglens[PLAIN_PACK_SEGMENTS] = { 0, };
+	int err;
+
+	seglens[PLAIN_PACK_MSG_OFF] = lustre_msg_early_size();
+	plain_at_offset = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, seglens);
+
+	err = sptlrpc_register_policy(&plain_policy);
+	if (err == 0)
+		return 0;
+
+	CERROR("failed to register: %d\n", err);
+	return err;
+}
+
+/*
+ * Unregister the plain policy; a failure is only logged.
+ */
+void sptlrpc_plain_fini(void)
+{
+	int err = sptlrpc_unregister_policy(&plain_policy);
+
+	if (err != 0)
+		CERROR("cannot unregister: %d\n", err);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/service.c b/drivers/staging/lustre/lustre/ptlrpc/service.c
new file mode 100644
index 000000000000..80111273b2dc
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/service.c
@@ -0,0 +1,3128 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lu_object.h>
+#include <linux/lnet/types.h>
+#include "ptlrpc_internal.h"
+
+/*
+ * The following are visible through /sys/module/ptlrpc.
+ * test_req_buffer_pressure is registered with mode 0444, the adaptive
+ * timeout (at_*) tunables with mode 0644.
+ */
+int test_req_buffer_pressure = 0;
+CFS_MODULE_PARM(test_req_buffer_pressure, "i", int, 0444,
+		"set non-zero to put pressure on request buffer pools");
+CFS_MODULE_PARM(at_min, "i", int, 0644,
+		"Adaptive timeout minimum (sec)");
+CFS_MODULE_PARM(at_max, "i", int, 0644,
+		"Adaptive timeout maximum (sec)");
+CFS_MODULE_PARM(at_history, "i", int, 0644,
+		"Adaptive timeouts remember the slowest event that took place "
+		"within this period (sec)");
+CFS_MODULE_PARM(at_early_margin, "i", int, 0644,
+		"How soon before an RPC deadline to send an early reply");
+CFS_MODULE_PARM(at_extra, "i", int, 0644,
+		"How much extra time to give with each early reply");
+
+
+/* forward ref */
+static int ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt);
+static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req);
+static void ptlrpc_at_remove_timed(struct ptlrpc_request *req);
+
+/** Global list holding all PTLRPC services */
+LIST_HEAD(ptlrpc_all_services);
+/** Mutex used to protect the \e ptlrpc_all_services list */
+struct mutex ptlrpc_all_services_mutex;
+
+/**
+ * Allocate a request buffer descriptor (rqbd) and its data buffer for
+ * service partition \a svcpt, on the partition's CPT.
+ *
+ * The buffer is srv_buf_size bytes.  On success the new rqbd is linked on
+ * the partition's idle list and scp_nrqbds_total is incremented, both
+ * under scp_lock.
+ *
+ * \retval rqbd	the new descriptor
+ * \retval NULL	descriptor or buffer allocation failed
+ */
+struct ptlrpc_request_buffer_desc *
+ptlrpc_alloc_rqbd(struct ptlrpc_service_part *svcpt)
+{
+	struct ptlrpc_service		  *svc = svcpt->scp_service;
+	struct ptlrpc_request_buffer_desc *rqbd;
+
+	OBD_CPT_ALLOC_PTR(rqbd, svc->srv_cptable, svcpt->scp_cpt);
+	if (rqbd == NULL)
+		return NULL;
+
+	rqbd->rqbd_svcpt = svcpt;
+	rqbd->rqbd_refcount = 0;
+	/* the callback receives the rqbd itself as its argument */
+	rqbd->rqbd_cbid.cbid_fn = request_in_callback;
+	rqbd->rqbd_cbid.cbid_arg = rqbd;
+	INIT_LIST_HEAD(&rqbd->rqbd_reqs);
+	OBD_CPT_ALLOC_LARGE(rqbd->rqbd_buffer, svc->srv_cptable,
+			    svcpt->scp_cpt, svc->srv_buf_size);
+	if (rqbd->rqbd_buffer == NULL) {
+		OBD_FREE_PTR(rqbd);
+		return NULL;
+	}
+
+	spin_lock(&svcpt->scp_lock);
+	list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle);
+	svcpt->scp_nrqbds_total++;
+	spin_unlock(&svcpt->scp_lock);
+
+	return rqbd;
+}
+
+/**
+ * Free a request buffer descriptor and its data buffer.
+ *
+ * The rqbd must have no references and no requests attached.  It is
+ * unlinked from whatever list it is on and scp_nrqbds_total is
+ * decremented under scp_lock before the memory is released.
+ */
+void
+ptlrpc_free_rqbd(struct ptlrpc_request_buffer_desc *rqbd)
+{
+	struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt;
+
+	LASSERT(rqbd->rqbd_refcount == 0);
+	LASSERT(list_empty(&rqbd->rqbd_reqs));
+
+	spin_lock(&svcpt->scp_lock);
+	list_del(&rqbd->rqbd_list);
+	svcpt->scp_nrqbds_total--;
+	spin_unlock(&svcpt->scp_lock);
+
+	OBD_FREE_LARGE(rqbd->rqbd_buffer, svcpt->scp_service->srv_buf_size);
+	OBD_FREE_PTR(rqbd);
+}
+
+/**
+ * Allocate up to srv_nbuf_per_group additional request buffers for
+ * \a svcpt and, if \a post is non-zero, post all idle buffers.
+ *
+ * Only one thread allocates at a time: scp_rqbd_allocating is checked
+ * without the lock first and then again under scp_lock; a thread that
+ * finds it set skips the allocation and goes straight to posting.
+ *
+ * \retval 0		no error and nothing posted (or \a post is zero)
+ * \retval -ENOMEM	a buffer could not be allocated
+ * \retval other	result of ptlrpc_server_post_idle_rqbds() when
+ *			\a post is set and allocation did not fail
+ */
+int
+ptlrpc_grow_req_bufs(struct ptlrpc_service_part *svcpt, int post)
+{
+	struct ptlrpc_service		  *svc = svcpt->scp_service;
+	struct ptlrpc_request_buffer_desc *rqbd;
+	int				rc = 0;
+	int				i;
+
+	/* cheap unlocked check; repeated under the lock below */
+	if (svcpt->scp_rqbd_allocating)
+		goto try_post;
+
+	spin_lock(&svcpt->scp_lock);
+	/* check again with lock */
+	if (svcpt->scp_rqbd_allocating) {
+		/* NB: we might allow more than one thread in the future */
+		LASSERT(svcpt->scp_rqbd_allocating == 1);
+		spin_unlock(&svcpt->scp_lock);
+		goto try_post;
+	}
+
+	svcpt->scp_rqbd_allocating++;
+	spin_unlock(&svcpt->scp_lock);
+
+
+	for (i = 0; i < svc->srv_nbuf_per_group; i++) {
+		/* NB: another thread might have recycled enough rqbds, we
+		 * need to make sure it wouldn't over-allocate, see LU-1212. */
+		if (svcpt->scp_nrqbds_posted >= svc->srv_nbuf_per_group)
+			break;
+
+		rqbd = ptlrpc_alloc_rqbd(svcpt);
+
+		if (rqbd == NULL) {
+			CERROR("%s: Can't allocate request buffer\n",
+			       svc->srv_name);
+			rc = -ENOMEM;
+			break;
+		}
+	}
+
+	spin_lock(&svcpt->scp_lock);
+
+	/* release the "allocating" marker taken above */
+	LASSERT(svcpt->scp_rqbd_allocating == 1);
+	svcpt->scp_rqbd_allocating--;
+
+	spin_unlock(&svcpt->scp_lock);
+
+	CDEBUG(D_RPCTRACE,
+	       "%s: allocate %d new %d-byte reqbufs (%d/%d left), rc = %d\n",
+	       svc->srv_name, i, svc->srv_buf_size, svcpt->scp_nrqbds_posted,
+	       svcpt->scp_nrqbds_total, rc);
+
+ try_post:
+	if (post && rc == 0)
+		rc = ptlrpc_server_post_idle_rqbds(svcpt);
+
+	return rc;
+}
+
+/**
+ * Part of Rep-Ack logic.
+ * Records a lock handle and its mode in the reply state associated with
+ * the request's reply.  If the export is already disconnected the lock
+ * reference is dropped right away instead of being saved.
+ */
+void
+ptlrpc_save_lock(struct ptlrpc_request *req,
+		 struct lustre_handle *lock, int mode, int no_ack)
+{
+	struct ptlrpc_reply_state *rs = req->rq_reply_state;
+	int			slot;
+
+	LASSERT(rs != NULL);
+	LASSERT(rs->rs_nlocks < RS_MAX_LOCKS);
+
+	if (req->rq_export->exp_disconnected) {
+		ldlm_lock_decref(lock, mode);
+		return;
+	}
+
+	/* claim the next free slot and mark the reply as difficult */
+	slot = rs->rs_nlocks++;
+	rs->rs_locks[slot] = *lock;
+	rs->rs_modes[slot] = mode;
+	rs->rs_difficult = 1;
+	rs->rs_no_ack = !!no_ack;
+}
+EXPORT_SYMBOL(ptlrpc_save_lock);
+
+
+struct ptlrpc_hr_partition;
+
+/* One reply-handling (hr) thread: a locked queue of reply states. */
+struct ptlrpc_hr_thread {
+	int				hrt_id;		/* thread ID */
+	spinlock_t			hrt_lock;	/* protects hrt_queue */
+	wait_queue_head_t			hrt_waitq;	/* woken when work is queued */
+	struct list_head			hrt_queue;	/* RS queue */
+	struct ptlrpc_hr_partition	*hrt_partition;
+};
+
+/* Per CPU-partition set of reply-handling threads. */
+struct ptlrpc_hr_partition {
+	/* # of started threads */
+	atomic_t			hrp_nstarted;
+	/* # of stopped threads */
+	atomic_t			hrp_nstopped;
+	/* cpu partition id */
+	int				hrp_cpt;
+	/* round-robin rotor for choosing thread */
+	int				hrp_rotor;
+	/* total number of threads on this partition */
+	int				hrp_nthrs;
+	/* threads table */
+	struct ptlrpc_hr_thread		*hrp_thrs;
+};
+
+/* NOTE(review): presumably hr thread state values; no user is visible in
+ * this part of the file - confirm before relying on them. */
+#define HRT_RUNNING 0
+#define HRT_STOPPING 1
+
+/* Global state of the reply-handling service. */
+struct ptlrpc_hr_service {
+	/* CPU partition table, it's just cfs_cpt_table for now */
+	struct cfs_cpt_table		*hr_cpt_table;
+	/** controller sleep waitq */
+	wait_queue_head_t			hr_waitq;
+	unsigned int			hr_stopping;
+	/** roundrobin rotor for non-affinity service */
+	unsigned int			hr_rotor;
+	/* partition data, indexed by CPT id */
+	struct ptlrpc_hr_partition	**hr_partitions;
+};
+
+/*
+ * A batch of reply states collected for one service partition before
+ * being handed to a reply-handling thread in one go.
+ */
+struct rs_batch {
+	struct list_head			rsb_replies;	/* collected reply states */
+	unsigned int			rsb_n_replies;	/* # of entries collected */
+	struct ptlrpc_service_part	*rsb_svcpt;	/* partition being batched */
+};
+
+/** The single reply handling service instance. */
+static struct ptlrpc_hr_service		ptlrpc_hr;
+
+/**
+ * maximum number of replies scheduled in one batch
+ */
+#define MAX_SCHEDULED 256
+
+/**
+ * Initialize a reply batch: zero all fields and set up an empty reply
+ * list.
+ *
+ * \param b batch
+ */
+static void rs_batch_init(struct rs_batch *b)
+{
+	memset(b, 0, sizeof *b);
+	INIT_LIST_HEAD(&b->rsb_replies);
+}
+
+/**
+ * Choose an hr thread to dispatch requests to.
+ *
+ * A partition-bound service that shares the hr CPT table uses the hr
+ * partition of its own CPT; any other service picks a partition
+ * round-robin.  Within the partition the thread is chosen round-robin
+ * as well.
+ */
+static struct ptlrpc_hr_thread *
+ptlrpc_hr_select(struct ptlrpc_service_part *svcpt)
+{
+	struct ptlrpc_hr_partition	*part;
+	unsigned int			idx;
+
+	if (svcpt->scp_cpt < 0 ||
+	    svcpt->scp_service->srv_cptable != ptlrpc_hr.hr_cpt_table) {
+		/* no direct match: rotate over all hr partitions */
+		idx = ptlrpc_hr.hr_rotor++;
+		idx %= cfs_cpt_number(ptlrpc_hr.hr_cpt_table);
+
+		part = ptlrpc_hr.hr_partitions[idx];
+	} else {
+		/* directly match partition */
+		part = ptlrpc_hr.hr_partitions[svcpt->scp_cpt];
+	}
+
+	idx = part->hrp_rotor++;
+	return &part->hrp_thrs[idx % part->hrp_nthrs];
+}
+
+/**
+ * Hand all replies accumulated in the batch over to one of the
+ * dedicated reply handling threads and wake it up.  An empty batch is
+ * left alone.
+ *
+ * \param b batch
+ */
+static void rs_batch_dispatch(struct rs_batch *b)
+{
+	struct ptlrpc_hr_thread	*target;
+
+	if (b->rsb_n_replies == 0)
+		return;
+
+	target = ptlrpc_hr_select(b->rsb_svcpt);
+
+	/* move the whole batch onto the thread's queue in one splice */
+	spin_lock(&target->hrt_lock);
+	list_splice_init(&b->rsb_replies, &target->hrt_queue);
+	spin_unlock(&target->hrt_lock);
+
+	wake_up(&target->hrt_waitq);
+	b->rsb_n_replies = 0;
+}
+
+/**
+ * Add a reply to a batch.
+ * Add one reply object to a batch, schedule batched replies if overload.
+ *
+ * When the reply belongs to a different service partition than the batch,
+ * or the batch already holds MAX_SCHEDULED replies, the current batch is
+ * dispatched and its partition's scp_rep_lock released before the lock of
+ * the reply's partition is taken.  That lock is NOT released on return:
+ * it stays held until the next switch or rs_batch_fini().
+ *
+ * \param b batch
+ * \param rs reply
+ */
+static void rs_batch_add(struct rs_batch *b, struct ptlrpc_reply_state *rs)
+{
+	struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
+
+	if (svcpt != b->rsb_svcpt || b->rsb_n_replies >= MAX_SCHEDULED) {
+		if (b->rsb_svcpt != NULL) {
+			rs_batch_dispatch(b);
+			spin_unlock(&b->rsb_svcpt->scp_rep_lock);
+		}
+		spin_lock(&svcpt->scp_rep_lock);
+		b->rsb_svcpt = svcpt;
+	}
+	spin_lock(&rs->rs_lock);
+	rs->rs_scheduled_ever = 1;
+	/* only queue a reply that is not already scheduled */
+	if (rs->rs_scheduled == 0) {
+		list_move(&rs->rs_list, &b->rsb_replies);
+		rs->rs_scheduled = 1;
+		b->rsb_n_replies++;
+	}
+	rs->rs_committed = 1;
+	spin_unlock(&rs->rs_lock);
+}
+
+/**
+ * Reply batch finalization.
+ * Dispatches whatever is still in the batch and releases the
+ * scp_rep_lock of the batch's service partition.  A batch that never
+ * got a partition assigned needs no work.
+ *
+ * \param b batch
+ */
+static void rs_batch_fini(struct rs_batch *b)
+{
+	if (b->rsb_svcpt == NULL)
+		return;
+
+	rs_batch_dispatch(b);
+	spin_unlock(&b->rsb_svcpt->scp_rep_lock);
+}
+
+/* Declare a reply batch variable named \a b. */
+#define DECLARE_RS_BATCH(b)     struct rs_batch b
+
+
+/**
+ * Put reply state into a queue for processing because we received
+ * ACK from the client
+ *
+ * The reply state must not be on any list (rs_list empty).  It is
+ * appended to the queue of an hr thread chosen by ptlrpc_hr_select() and
+ * that thread is woken up.
+ */
+void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs)
+{
+	struct ptlrpc_hr_thread *hrt;
+	ENTRY;
+
+	LASSERT(list_empty(&rs->rs_list));
+
+	hrt = ptlrpc_hr_select(rs->rs_svcpt);
+
+	spin_lock(&hrt->hrt_lock);
+	list_add_tail(&rs->rs_list, &hrt->hrt_queue);
+	spin_unlock(&hrt->hrt_lock);
+
+	wake_up(&hrt->hrt_waitq);
+	EXIT;
+}
+
+/**
+ * Schedule a difficult reply for processing by a reply handling thread.
+ *
+ * The caller must hold both the service partition's scp_rep_lock and the
+ * reply state's rs_lock.  This is checked with assert_spin_locked()
+ * rather than LASSERT(spin_is_locked()): spin_is_locked() always reports
+ * "unlocked" on uniprocessor kernels built without spinlock debugging,
+ * which would make the assertion fire spuriously.
+ *
+ * Every call marks rs_scheduled_ever.  A reply that is already scheduled
+ * is left alone; otherwise it is marked scheduled, unlinked from its
+ * current list and dispatched.
+ */
+void
+ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs)
+{
+	ENTRY;
+
+	assert_spin_locked(&rs->rs_svcpt->scp_rep_lock);
+	assert_spin_locked(&rs->rs_lock);
+	LASSERT (rs->rs_difficult);
+	rs->rs_scheduled_ever = 1;  /* flag any notification attempt */
+
+	if (rs->rs_scheduled) {     /* being set up or already notified */
+		EXIT;
+		return;
+	}
+
+	rs->rs_scheduled = 1;
+	/* dispatch requires the reply state to be off any list */
+	list_del_init(&rs->rs_list);
+	ptlrpc_dispatch_difficult_reply(rs);
+	EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_schedule_difficult_reply);
+
+/**
+ * Schedule all uncommitted replies of export \a exp whose transaction
+ * number is not above the export's last committed transno.
+ *
+ * Matching reply states are removed from the export's uncommitted list
+ * and collected into a batch, which hands them to the reply handling
+ * threads.  The batch takes per-partition scp_rep_lock and per-reply
+ * rs_lock while exp_uncommitted_replies_lock is held.
+ */
+void ptlrpc_commit_replies(struct obd_export *exp)
+{
+	struct ptlrpc_reply_state *rs, *nxt;
+	DECLARE_RS_BATCH(batch);
+	ENTRY;
+
+	rs_batch_init(&batch);
+	/* Find any replies that have been committed and get their service
+	 * to attend to complete them. */
+
+	/* CAVEAT EMPTOR: spinlock ordering!!! */
+	spin_lock(&exp->exp_uncommitted_replies_lock);
+	list_for_each_entry_safe(rs, nxt, &exp->exp_uncommitted_replies,
+				     rs_obd_list) {
+		LASSERT (rs->rs_difficult);
+		/* VBR: per-export last_committed */
+		LASSERT(rs->rs_export);
+		if (rs->rs_transno <= exp->exp_last_committed) {
+			list_del_init(&rs->rs_obd_list);
+			rs_batch_add(&batch, rs);
+		}
+	}
+	spin_unlock(&exp->exp_uncommitted_replies_lock);
+	/* dispatch the remainder and drop the last scp_rep_lock */
+	rs_batch_fini(&batch);
+	EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_commit_replies);
+
+/**
+ * Post every idle request buffer of \a svcpt.
+ *
+ * Each idle rqbd is moved to the posted list (and counted as posted)
+ * before ptlrpc_register_rqbd() is called on it without scp_lock held.
+ * If registration fails, that rqbd is moved back to the tail of the idle
+ * list, the posted count is corrected and the loop stops.
+ *
+ * \retval 1	idle list drained, at least one buffer was posted
+ * \retval 0	idle list was already empty
+ * \retval -1	registering a buffer failed
+ */
+static int
+ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt)
+{
+	struct ptlrpc_request_buffer_desc *rqbd;
+	int				  rc;
+	int				  posted = 0;
+
+	for (;;) {
+		spin_lock(&svcpt->scp_lock);
+
+		if (list_empty(&svcpt->scp_rqbd_idle)) {
+			spin_unlock(&svcpt->scp_lock);
+			return posted;
+		}
+
+		rqbd = list_entry(svcpt->scp_rqbd_idle.next,
+				      struct ptlrpc_request_buffer_desc,
+				      rqbd_list);
+		list_del(&rqbd->rqbd_list);
+
+		/* assume we will post successfully */
+		svcpt->scp_nrqbds_posted++;
+		list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_posted);
+
+		spin_unlock(&svcpt->scp_lock);
+
+		rc = ptlrpc_register_rqbd(rqbd);
+		if (rc != 0)
+			break;
+
+		posted = 1;
+	}
+
+	/* registration failed: undo the optimistic accounting above */
+	spin_lock(&svcpt->scp_lock);
+
+	svcpt->scp_nrqbds_posted--;
+	list_del(&rqbd->rqbd_list);
+	list_add_tail(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle);
+
+	/* Don't complain if no request buffers are posted right now; LNET
+	 * won't drop requests because we set the portal lazy! */
+
+	spin_unlock(&svcpt->scp_lock);
+
+	return -1;
+}
+
+/**
+ * Adaptive-timeout timer callback.
+ *
+ * \a castmeharder carries the service partition pointer.  The partition
+ * is flagged for an AT check, the check time is stamped and the waiters
+ * on its wait queue are woken.
+ */
+static void ptlrpc_at_timer(unsigned long castmeharder)
+{
+	struct ptlrpc_service_part *part =
+		(struct ptlrpc_service_part *)castmeharder;
+
+	part->scp_at_check = 1;
+	part->scp_at_checktime = cfs_time_current();
+	wake_up(&part->scp_waitq);
+}
+
+/**
+ * Estimate and validate the per-partition thread numbers of \a svc from
+ * the thread configuration in \a conf.
+ *
+ * Stores the results in svc->srv_nthrs_cpt_limit (upper limit of threads
+ * per partition) and svc->srv_nthrs_cpt_init (threads to start initially
+ * per partition).  A user-supplied total (tc_nthrs_user) takes precedence
+ * and is capped at 8 * tc_nthrs_max; otherwise the limit is derived from
+ * tc_nthrs_max, tc_nthrs_base and tc_thr_factor.
+ */
+static void
+ptlrpc_server_nthreads_check(struct ptlrpc_service *svc,
+			     struct ptlrpc_service_conf *conf)
+{
+	struct ptlrpc_service_thr_conf	*tc = &conf->psc_thr;
+	unsigned			init;
+	unsigned			total;
+	unsigned			nthrs;
+	int				weight;
+
+	/*
+	 * Common code for estimating & validating threads number.
+	 * CPT affinity service could have percpt thread-pool instead
+	 * of a global thread-pool, which means user might not always
+	 * get the threads number they give it in conf::tc_nthrs_user
+	 * even they did set. It's because we need to validate threads
+	 * number for each CPT to guarantee each pool will have enough
+	 * threads to keep the service healthy.
+	 */
+	/* one extra initial thread when a high-priority handler exists */
+	init = PTLRPC_NTHRS_INIT + (svc->srv_ops.so_hpreq_handler != NULL);
+	init = max_t(int, init, tc->tc_nthrs_init);
+
+	/* NB: please see comments in lustre_lnet.h for definition
+	 * details of these members */
+	LASSERT(tc->tc_nthrs_max != 0);
+
+	if (tc->tc_nthrs_user != 0) {
+		/* In case there is a reason to test a service with many
+		 * threads, we give a less strict check here, it can
+		 * be up to 8 * nthrs_max */
+		total = min(tc->tc_nthrs_max * 8, tc->tc_nthrs_user);
+		nthrs = total / svc->srv_ncpts;
+		init  = max(init, nthrs);
+		goto out;
+	}
+
+	total = tc->tc_nthrs_max;
+	if (tc->tc_nthrs_base == 0) {
+		/* don't care about base threads number per partition,
+		 * this is most for non-affinity service */
+		nthrs = total / svc->srv_ncpts;
+		goto out;
+	}
+
+	nthrs = tc->tc_nthrs_base;
+	if (svc->srv_ncpts == 1) {
+		int	i;
+
+		/* NB: Increase the base number if it's single partition
+		 * and total number of cores/HTs is larger or equal to 4.
+		 * result will always < 2 * nthrs_base */
+		weight = cfs_cpt_weight(svc->srv_cptable, CFS_CPT_ANY);
+		for (i = 1; (weight >> (i + 1)) != 0 && /* >= 4 cores/HTs */
+			    (tc->tc_nthrs_base >> i) != 0; i++)
+			nthrs += tc->tc_nthrs_base >> i;
+	}
+
+	if (tc->tc_thr_factor != 0) {
+		int	  factor = tc->tc_thr_factor;
+		const int fade = 4;
+
+		/*
+		 * User wants to increase number of threads with for
+		 * each CPU core/HT, most likely the factor is larger than
+		 * one thread/core because service threads are supposed to
+		 * be blocked by lock or wait for IO.
+		 */
+		/*
+		 * Amdahl's law says that adding processors wouldn't give
+		 * a linear increasing of parallelism, so it's nonsense to
+		 * have too many threads no matter how many cores/HTs
+		 * there are.
+		 */
+		if (cfs_cpu_ht_nsiblings(0) > 1) { /* weight is # of HTs */
+			/* depress thread factor for hyper-thread */
+			factor = factor - (factor >> 1) + (factor >> 3);
+		}
+
+		weight = cfs_cpt_weight(svc->srv_cptable, 0);
+		LASSERT(weight > 0);
+
+		/* each further group of 'fade' cores/HTs contributes with a
+		 * factor that is one lower than the previous group's */
+		for (; factor > 0 && weight > 0; factor--, weight -= fade)
+			nthrs += min(weight, fade) * factor;
+	}
+
+	/* over the soft limit: fall back to base or the per-CPT share */
+	if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) {
+		nthrs = max(tc->tc_nthrs_base,
+			    tc->tc_nthrs_max / svc->srv_ncpts);
+	}
+ out:
+	nthrs = max(nthrs, tc->tc_nthrs_init);
+	svc->srv_nthrs_cpt_limit = nthrs;
+	svc->srv_nthrs_cpt_init = init;
+
+	if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) {
+		CDEBUG(D_OTHER, "%s: This service may have more threads (%d) "
+		       "than the given soft limit (%d)\n",
+		       svc->srv_name, nthrs * svc->srv_ncpts,
+		       tc->tc_nthrs_max);
+	}
+}
+
+/**
+ * Initialize percpt data for a service
+ *
+ * Sets up the locks, lists and wait queues of \a svcpt, allocates the
+ * adaptive-timeout arrays (sized by at_est2timeout(at_max)), initializes
+ * the AT timer and estimate, and allocates - without posting - the first
+ * group of request buffers.
+ *
+ * \retval 0		success
+ * \retval -ENOMEM	any allocation failed; the AT arrays are freed again
+ *
+ * NOTE(review): the failure path frees only the two AT arrays; request
+ * buffers already allocated by ptlrpc_grow_req_bufs() stay on the idle
+ * list - presumably the caller's cleanup releases them, confirm.
+ */
+static int
+ptlrpc_service_part_init(struct ptlrpc_service *svc,
+			 struct ptlrpc_service_part *svcpt, int cpt)
+{
+	struct ptlrpc_at_array	*array;
+	int			size;
+	int			index;
+	int			rc;
+
+	svcpt->scp_cpt = cpt;
+	INIT_LIST_HEAD(&svcpt->scp_threads);
+
+	/* rqbd and incoming request queue */
+	spin_lock_init(&svcpt->scp_lock);
+	INIT_LIST_HEAD(&svcpt->scp_rqbd_idle);
+	INIT_LIST_HEAD(&svcpt->scp_rqbd_posted);
+	INIT_LIST_HEAD(&svcpt->scp_req_incoming);
+	init_waitqueue_head(&svcpt->scp_waitq);
+	/* history request & rqbd list */
+	INIT_LIST_HEAD(&svcpt->scp_hist_reqs);
+	INIT_LIST_HEAD(&svcpt->scp_hist_rqbds);
+
+	/* active requests and hp requests */
+	spin_lock_init(&svcpt->scp_req_lock);
+
+	/* reply states */
+	spin_lock_init(&svcpt->scp_rep_lock);
+	INIT_LIST_HEAD(&svcpt->scp_rep_active);
+	INIT_LIST_HEAD(&svcpt->scp_rep_idle);
+	init_waitqueue_head(&svcpt->scp_rep_waitq);
+	atomic_set(&svcpt->scp_nreps_difficult, 0);
+
+	/* adaptive timeout */
+	spin_lock_init(&svcpt->scp_at_lock);
+	array = &svcpt->scp_at_array;
+
+	size = at_est2timeout(at_max);
+	array->paa_size     = size;
+	array->paa_count    = 0;
+	array->paa_deadline = -1;
+
+	/* allocate memory for scp_at_array (ptlrpc_at_array) */
+	OBD_CPT_ALLOC(array->paa_reqs_array,
+		      svc->srv_cptable, cpt, sizeof(struct list_head) * size);
+	if (array->paa_reqs_array == NULL)
+		return -ENOMEM;
+
+	for (index = 0; index < size; index++)
+		INIT_LIST_HEAD(&array->paa_reqs_array[index]);
+
+	OBD_CPT_ALLOC(array->paa_reqs_count,
+		      svc->srv_cptable, cpt, sizeof(__u32) * size);
+	if (array->paa_reqs_count == NULL)
+		goto failed;
+
+	cfs_timer_init(&svcpt->scp_at_timer, ptlrpc_at_timer, svcpt);
+	/* At SOW, service time should be quick; 10s seems generous. If client
+	 * timeout is less than this, we'll be sending an early reply. */
+	at_init(&svcpt->scp_at_estimate, 10, 0);
+
+	/* assign this before call ptlrpc_grow_req_bufs */
+	svcpt->scp_service = svc;
+	/* Now allocate the request buffers, but don't post them now */
+	rc = ptlrpc_grow_req_bufs(svcpt, 0);
+	/* We shouldn't be under memory pressure at startup, so
+	 * fail if we can't allocate all our buffers at this time. */
+	if (rc != 0)
+		goto failed;
+
+	return 0;
+
+ failed:
+	/* release whichever AT arrays were allocated so far */
+	if (array->paa_reqs_count != NULL) {
+		OBD_FREE(array->paa_reqs_count, sizeof(__u32) * size);
+		array->paa_reqs_count = NULL;
+	}
+
+	if (array->paa_reqs_array != NULL) {
+		OBD_FREE(array->paa_reqs_array,
+			 sizeof(struct list_head) * array->paa_size);
+		array->paa_reqs_array = NULL;
+	}
+
+	return -ENOMEM;
+}
+
+/**
+ * Initialize service on a given portal.
+ * This includes starting serving threads, allocating and posting rqbds and
+ * so on.
+ *
+ * One service partition is set up per selected CPT, or a single partition
+ * when conf->psc_thr.tc_cpu_affinity is not set. When a CPT pattern string
+ * is supplied it is parsed to pick the CPTs the service runs on.
+ *
+ * Once the service structure has been allocated, every later failure is
+ * unwound through ptlrpc_unregister_service().
+ *
+ * \param[in] conf		service configuration (buffers, threads,
+ *				CPT layout, operations)
+ * \param[in] proc_entry	procfs parent entry; registration is skipped
+ *				when it is NULL
+ *
+ * \retval			the new service on success
+ * \retval ERR_PTR(-errno)	on failure
+ */
+struct ptlrpc_service *
+ptlrpc_register_service(struct ptlrpc_service_conf *conf,
+			proc_dir_entry_t *proc_entry)
+{
+	struct ptlrpc_service_cpt_conf	*cconf = &conf->psc_cpt;
+	struct ptlrpc_service		*service;
+	struct ptlrpc_service_part	*svcpt;
+	struct cfs_cpt_table		*cptable;
+	__u32				*cpts = NULL;
+	int				ncpts;
+	int				cpt;
+	int				rc;
+	int				i;
+	ENTRY;
+
+	LASSERT(conf->psc_buf.bc_nbufs > 0);
+	LASSERT(conf->psc_buf.bc_buf_size >=
+		conf->psc_buf.bc_req_max_size + SPTLRPC_MAX_PAYLOAD);
+	LASSERT(conf->psc_thr.tc_ctx_tags != 0);
+
+	/* fall back to the global CPT table when none was configured */
+	cptable = cconf->cc_cptable;
+	if (cptable == NULL)
+		cptable = cfs_cpt_table;
+
+	if (!conf->psc_thr.tc_cpu_affinity) {
+		ncpts = 1;
+	} else {
+		ncpts = cfs_cpt_number(cptable);
+		if (cconf->cc_pattern != NULL) {
+			struct cfs_expr_list	*el;
+
+			rc = cfs_expr_list_parse(cconf->cc_pattern,
+						 strlen(cconf->cc_pattern),
+						 0, ncpts - 1, &el);
+			if (rc != 0) {
+				CERROR("%s: invalid CPT pattern string: %s\n",
+				       conf->psc_name, cconf->cc_pattern);
+				RETURN(ERR_PTR(-EINVAL));
+			}
+
+			rc = cfs_expr_list_values(el, ncpts, &cpts);
+			cfs_expr_list_free(el);
+			if (rc <= 0) {
+				CERROR("%s: failed to parse CPT array %s: %d\n",
+				       conf->psc_name, cconf->cc_pattern, rc);
+				if (cpts != NULL)
+					OBD_FREE(cpts, sizeof(*cpts) * ncpts);
+				RETURN(ERR_PTR(rc < 0 ? rc : -EINVAL));
+			}
+			/* a positive result is the number of CPTs selected */
+			ncpts = rc;
+		}
+	}
+
+	/* the service is allocated together with its srv_parts[] slots */
+	OBD_ALLOC(service, offsetof(struct ptlrpc_service, srv_parts[ncpts]));
+	if (service == NULL) {
+		if (cpts != NULL)
+			OBD_FREE(cpts, sizeof(*cpts) * ncpts);
+		RETURN(ERR_PTR(-ENOMEM));
+	}
+
+	service->srv_cptable		= cptable;
+	service->srv_cpts		= cpts;
+	service->srv_ncpts		= ncpts;
+
+	/* smallest number of bits able to index every CPT of the table */
+	service->srv_cpt_bits = 0; /* it's zero already, easy to read... */
+	while ((1 << service->srv_cpt_bits) < cfs_cpt_number(cptable))
+		service->srv_cpt_bits++;
+
+	/* public members */
+	spin_lock_init(&service->srv_lock);
+	service->srv_name		= conf->psc_name;
+	service->srv_watchdog_factor	= conf->psc_watchdog_factor;
+	INIT_LIST_HEAD(&service->srv_list); /* for safety of cleanup */
+
+	/* buffer configuration */
+	service->srv_nbuf_per_group	= test_req_buffer_pressure ?
+					  1 : conf->psc_buf.bc_nbufs;
+	service->srv_max_req_size	= conf->psc_buf.bc_req_max_size +
+					  SPTLRPC_MAX_PAYLOAD;
+	service->srv_buf_size		= conf->psc_buf.bc_buf_size;
+	service->srv_rep_portal		= conf->psc_buf.bc_rep_portal;
+	service->srv_req_portal		= conf->psc_buf.bc_req_portal;
+
+	/* Increase max reply size to next power of two */
+	service->srv_max_reply_size = 1;
+	while (service->srv_max_reply_size <
+	       conf->psc_buf.bc_rep_max_size + SPTLRPC_MAX_PAYLOAD)
+		service->srv_max_reply_size <<= 1;
+
+	service->srv_thread_name	= conf->psc_thr.tc_thr_name;
+	service->srv_ctx_tags		= conf->psc_thr.tc_ctx_tags;
+	service->srv_hpreq_ratio	= PTLRPC_SVC_HP_RATIO;
+	service->srv_ops		= conf->psc_ops;
+
+	for (i = 0; i < ncpts; i++) {
+		if (!conf->psc_thr.tc_cpu_affinity)
+			cpt = CFS_CPT_ANY;
+		else
+			cpt = cpts != NULL ? cpts[i] : i;
+
+		OBD_CPT_ALLOC(svcpt, cptable, cpt, sizeof(*svcpt));
+		if (svcpt == NULL)
+			GOTO(failed, rc = -ENOMEM);
+
+		/* publish the partition before initializing it, so that the
+		 * failure path can find it */
+		service->srv_parts[i] = svcpt;
+		rc = ptlrpc_service_part_init(service, svcpt, cpt);
+		if (rc != 0)
+			GOTO(failed, rc);
+	}
+
+	ptlrpc_server_nthreads_check(service, conf);
+
+	rc = LNetSetLazyPortal(service->srv_req_portal);
+	LASSERT(rc == 0);
+
+	mutex_lock(&ptlrpc_all_services_mutex);
+	list_add(&service->srv_list, &ptlrpc_all_services);
+	mutex_unlock(&ptlrpc_all_services_mutex);
+
+	if (proc_entry != NULL)
+		ptlrpc_lprocfs_register_service(proc_entry, service);
+
+	rc = ptlrpc_service_nrs_setup(service);
+	if (rc != 0)
+		GOTO(failed, rc);
+
+	CDEBUG(D_NET, "%s: Started, listening on portal %d\n",
+	       service->srv_name, service->srv_req_portal);
+
+	rc = ptlrpc_start_threads(service);
+	if (rc != 0) {
+		CERROR("Failed to start threads for service %s: %d\n",
+		       service->srv_name, rc);
+		GOTO(failed, rc);
+	}
+
+	RETURN(service);
+failed:
+	ptlrpc_unregister_service(service);
+	RETURN(ERR_PTR(rc));
+}
+EXPORT_SYMBOL(ptlrpc_register_service);
+
+/**
+ * Release a request whose last reference has been dropped.
+ * Must be called without holding svc_lock; unlinking req->rq_list is the
+ * caller's responsibility.
+ */
+static void ptlrpc_server_free_request(struct ptlrpc_request *req)
+{
+	LASSERT(atomic_read(&req->rq_refcount) == 0);
+	LASSERT(list_empty(&req->rq_timed_list));
+
+	/* The reply state is only dropped here: DEBUG_REQ() relies on it
+	 * staying around for as long as a valid request reference exists. */
+	ptlrpc_req_drop_rs(req);
+
+	sptlrpc_svc_ctx_decref(req);
+
+	/* The request embedded in the request buffer descriptor is part of
+	 * the rqbd itself, so only other requests are freed here. */
+	if (req == &req->rq_rqbd->rqbd_req)
+		return;
+
+	OBD_FREE(req, sizeof(*req));
+}
+
+/**
+ * drop a reference count of the request. if it reaches 0, we either
+ * put it into history list, or free it immediately.
+ *
+ * When the last reference goes away the request is unlinked from the
+ * adaptive-timeout list (if still linked), its export reference is released
+ * and it is queued on the rq list of its request buffer descriptor (rqbd).
+ *
+ * If that was also the last reference on the rqbd, the rqbd moves to the
+ * history list and the oldest history rqbds beyond
+ * srv_hist_nrqbds_cpt_max are culled: their requests are freed and the
+ * rqbd is put on scp_rqbd_idle.
+ *
+ * Otherwise, a request whose reply state is preallocated (rs_prealloc) is
+ * freed right away instead of being kept on its rqbd.
+ */
+void ptlrpc_server_drop_request(struct ptlrpc_request *req)
+{
+	struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd;
+	struct ptlrpc_service_part	  *svcpt = rqbd->rqbd_svcpt;
+	struct ptlrpc_service		  *svc = svcpt->scp_service;
+	int				refcount;
+	struct list_head			*tmp;
+	struct list_head			*nxt;
+
+	if (!atomic_dec_and_test(&req->rq_refcount))
+		return;
+
+	if (req->rq_at_linked) {
+		spin_lock(&svcpt->scp_at_lock);
+		/* recheck with lock, in case it's unlinked by
+		 * ptlrpc_at_check_timed() */
+		if (likely(req->rq_at_linked))
+			ptlrpc_at_remove_timed(req);
+		spin_unlock(&svcpt->scp_at_lock);
+	}
+
+	LASSERT(list_empty(&req->rq_timed_list));
+
+	/* finalize request */
+	if (req->rq_export) {
+		class_export_put(req->rq_export);
+		req->rq_export = NULL;
+	}
+
+	spin_lock(&svcpt->scp_lock);
+
+	list_add(&req->rq_list, &rqbd->rqbd_reqs);
+
+	refcount = --(rqbd->rqbd_refcount);
+	if (refcount == 0) {
+		/* request buffer is now idle: add to history */
+		list_del(&rqbd->rqbd_list);
+
+		list_add_tail(&rqbd->rqbd_list, &svcpt->scp_hist_rqbds);
+		svcpt->scp_hist_nrqbds++;
+
+		/* cull some history?
+		 * I expect only about 1 or 2 rqbds need to be recycled here.
+		 * NB: from here on "rqbd" and "req" are reused for the
+		 * entries being culled, not the ones passed in. */
+		while (svcpt->scp_hist_nrqbds > svc->srv_hist_nrqbds_cpt_max) {
+			rqbd = list_entry(svcpt->scp_hist_rqbds.next,
+					      struct ptlrpc_request_buffer_desc,
+					      rqbd_list);
+
+			list_del(&rqbd->rqbd_list);
+			svcpt->scp_hist_nrqbds--;
+
+			/* remove rqbd's reqs from svc's req history while
+			 * I've got the service lock */
+			list_for_each(tmp, &rqbd->rqbd_reqs) {
+				req = list_entry(tmp, struct ptlrpc_request,
+						     rq_list);
+				/* Track the highest culled req seq */
+				if (req->rq_history_seq >
+				    svcpt->scp_hist_seq_culled) {
+					svcpt->scp_hist_seq_culled =
+						req->rq_history_seq;
+				}
+				list_del(&req->rq_history_list);
+			}
+
+			/* scp_lock is released while the culled requests
+			 * are unlinked and freed, and re-taken afterwards */
+			spin_unlock(&svcpt->scp_lock);
+
+			list_for_each_safe(tmp, nxt, &rqbd->rqbd_reqs) {
+				req = list_entry(rqbd->rqbd_reqs.next,
+						     struct ptlrpc_request,
+						     rq_list);
+				list_del(&req->rq_list);
+				ptlrpc_server_free_request(req);
+			}
+
+			spin_lock(&svcpt->scp_lock);
+			/*
+			 * now all reqs including the embedded req has been
+			 * disposed, schedule request buffer for re-use.
+			 */
+			LASSERT(atomic_read(&rqbd->rqbd_req.rq_refcount) ==
+				0);
+			list_add_tail(&rqbd->rqbd_list,
+					  &svcpt->scp_rqbd_idle);
+		}
+
+		spin_unlock(&svcpt->scp_lock);
+	} else if (req->rq_reply_state && req->rq_reply_state->rs_prealloc) {
+		/* If we are low on memory, we are not interested in history */
+		list_del(&req->rq_list);
+		list_del_init(&req->rq_history_list);
+
+		/* Track the highest culled req seq */
+		if (req->rq_history_seq > svcpt->scp_hist_seq_culled)
+			svcpt->scp_hist_seq_culled = req->rq_history_seq;
+
+		spin_unlock(&svcpt->scp_lock);
+
+		ptlrpc_server_free_request(req);
+	} else {
+		/* keep the request queued on its rqbd */
+		spin_unlock(&svcpt->scp_lock);
+	}
+}
+
+/**
+ * Switch \a req over to \a export. A request that is queued on the old
+ * export's list is moved to the exp_hp_rpcs list of the new one, and the
+ * reference and RPC count held on the old export are handed back.
+ */
+void ptlrpc_request_change_export(struct ptlrpc_request *req,
+				  struct obd_export *export)
+{
+	struct obd_export *prev = req->rq_export;
+
+	if (prev != NULL) {
+		if (!list_empty(&req->rq_exp_list)) {
+			/* unhook the request from the previous export */
+			spin_lock_bh(&prev->exp_rpc_lock);
+			list_del_init(&req->rq_exp_list);
+			spin_unlock_bh(&prev->exp_rpc_lock);
+
+			/* The new export is already referenced once, so it is
+			 * safe to queue the request on it right away and take
+			 * the request's own reference further down. */
+			spin_lock_bh(&export->exp_rpc_lock);
+			list_add(&req->rq_exp_list, &export->exp_hp_rpcs);
+			spin_unlock_bh(&export->exp_rpc_lock);
+		}
+		class_export_rpc_dec(prev);
+		class_export_put(prev);
+	}
+
+	/* the request holds one export reference and one RPC count */
+	req->rq_export = class_export_get(export);
+	class_export_rpc_inc(export);
+}
+
+/**
+ * to finish a request: stop sending more early replies, and release
+ * the request.
+ *
+ * Runs the high-priority teardown (ptlrpc_server_hpreq_fini()) and then
+ * drops one reference with ptlrpc_server_drop_request().
+ * \a svcpt is not used by this function.
+ */
+static void ptlrpc_server_finish_request(struct ptlrpc_service_part *svcpt,
+					 struct ptlrpc_request *req)
+{
+	ptlrpc_server_hpreq_fini(req);
+
+	ptlrpc_server_drop_request(req);
+}
+
+/**
+ * to finish a active request: stop sending more early replies, and release
+ * the request. should be called after we finished handling the request.
+ *
+ * Under scp_req_lock the request is stopped in NRS and the active (and,
+ * for rq_hp requests, high-priority active) counters are decremented.
+ * After that the NRS state is finalized, the export RPC count is dropped
+ * when an export is attached, and the request is handed to
+ * ptlrpc_server_finish_request().
+ */
+static void ptlrpc_server_finish_active_request(
+					struct ptlrpc_service_part *svcpt,
+					struct ptlrpc_request *req)
+{
+	spin_lock(&svcpt->scp_req_lock);
+	ptlrpc_nrs_req_stop_nolock(req);
+	svcpt->scp_nreqs_active--;
+	if (req->rq_hp)
+		svcpt->scp_nhreqs_active--;
+	spin_unlock(&svcpt->scp_req_lock);
+
+	ptlrpc_nrs_req_finalize(req);
+
+	if (req->rq_export != NULL)
+		class_export_rpc_dec(req->rq_export);
+
+	ptlrpc_server_finish_request(svcpt, req);
+}
+
+/**
+ * Record that \a exp has just been heard from and make sure dead exports
+ * get evicted in a timely manner: when the export at the head of the timed
+ * list looks overdue, the eviction timer is armed, and once that timer has
+ * run out the ping evictor is woken.
+ * This function is only called when some export receives a message (i.e.,
+ * the network is up.)
+ */
+static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay)
+{
+	struct obd_export *oldest;
+	time_t stale_since, stamp;
+
+	ENTRY;
+
+	LASSERT(exp);
+
+	/* The request time is faked into the future to compensate for slow
+	 * machines and the like.  This may break the strict time ordering of
+	 * the list, but eviction does not have to happen at the exact right
+	 * moment: every silent export eventually reaches the top of the
+	 * list. */
+
+	/* Renewals of one second or less are ignored. */
+	stamp = cfs_time_current_sec() + extra_delay;
+	if (exp->exp_last_request_time + 1 /*second */ >= stamp)
+		RETURN_EXIT;
+
+	exp->exp_last_request_time = stamp;
+	CDEBUG(D_HA, "updating export %s at "CFS_TIME_T" exp %p\n",
+	       exp->exp_client_uuid.uuid,
+	       exp->exp_last_request_time, exp);
+
+	/* An export can be taken off the chain even while it is still
+	 * referenced, so the lists are only touched under the spin lock. */
+	spin_lock(&exp->exp_obd->obd_dev_lock);
+
+	if (list_empty(&exp->exp_obd_chain_timed)) {
+		/* this export is not on the timed list */
+		spin_unlock(&exp->exp_obd->obd_dev_lock);
+		RETURN_EXIT;
+	}
+
+	list_move_tail(&exp->exp_obd_chain_timed,
+			   &exp->exp_obd->obd_exports_timed);
+
+	oldest = list_entry(exp->exp_obd->obd_exports_timed.next,
+				struct obd_export, exp_obd_chain_timed);
+	stale_since = oldest->exp_last_request_time;
+	spin_unlock(&exp->exp_obd->obd_dev_lock);
+
+	if (exp->exp_obd->obd_recovering) {
+		/* nobody gets evicted during recovery */
+		EXIT;
+		return;
+	}
+
+	/* Note - racing to start/reset the obd_eviction timer is safe */
+	if (exp->exp_obd->obd_eviction_timer != 0) {
+		/* The evictor leaves alone anyone heard from recently, so
+		 * no further check is needed before starting it. */
+		if (cfs_time_current_sec() >
+		    (exp->exp_obd->obd_eviction_timer + extra_delay) &&
+		    !ping_evictor_wake(exp))
+			exp->exp_obd->obd_eviction_timer = 0;
+	} else if (cfs_time_current_sec() > (stale_since + PING_EVICT_TIMEOUT +
+					     extra_delay)) {
+		/* The oldest entry has expired.  A second timer is needed in
+		 * case the net was down and just came back.  Since the
+		 * pinger may skip every other PING_INTERVAL (see note in
+		 * ptlrpc_pinger_main), wait for 3 of them. */
+		exp->exp_obd->obd_eviction_timer =
+			cfs_time_current_sec() + 3 * PING_INTERVAL;
+		CDEBUG(D_HA, "%s: Think about evicting %s from "CFS_TIME_T"\n",
+		       exp->exp_obd->obd_name,
+		       obd_export_nid2str(oldest), stale_since);
+	}
+
+	EXIT;
+}
+
+/**
+ * Sanity check request \a req against the state of its export.
+ *
+ * \retval 0		request may be handled
+ * \retval -EEXIST	request comes from an old connection; it is dropped
+ *			without a reply
+ * \retval -ENODEV	obd is failing over, or a replay / transno request
+ *			arrived outside recovery; an error reply is sent
+ */
+static int ptlrpc_check_req(struct ptlrpc_request *req)
+{
+	struct obd_export *exp = req->rq_export;
+	struct lustre_msg *msg = req->rq_reqmsg;
+	int rc = 0;
+
+	if (unlikely(lustre_msg_get_conn_cnt(msg) < exp->exp_conn_cnt)) {
+		DEBUG_REQ(D_RPCTRACE, req,
+			  "DROPPING req from old connection %d < %d",
+			  lustre_msg_get_conn_cnt(msg), exp->exp_conn_cnt);
+		return -EEXIST;
+	}
+
+	if (unlikely(exp->exp_obd && exp->exp_obd->obd_fail)) {
+		/* Failing over: no more requests are handled, an error
+		 * response is sent instead. */
+		CDEBUG(D_RPCTRACE, "Dropping req %p for failed obd %s\n",
+		       req, exp->exp_obd->obd_name);
+		rc = -ENODEV;
+	} else if ((lustre_msg_get_flags(msg) &
+		    (MSG_REPLAY | MSG_REQ_REPLAY_DONE)) &&
+		   !(exp->exp_obd->obd_recovering)) {
+		DEBUG_REQ(D_ERROR, req, "Invalid replay without recovery");
+		class_fail_export(exp);
+		rc = -ENODEV;
+	} else if (lustre_msg_get_transno(msg) != 0 &&
+		   !(exp->exp_obd->obd_recovering)) {
+		DEBUG_REQ(D_ERROR, req, "Invalid req with transno "
+			  LPU64" without recovery",
+			  lustre_msg_get_transno(msg));
+		class_fail_export(exp);
+		rc = -ENODEV;
+	}
+
+	if (unlikely(rc < 0)) {
+		req->rq_status = rc;
+		ptlrpc_error(req);
+	}
+	return rc;
+}
+
+/**
+ * (Re)program the adaptive-timeout timer of \a svcpt for the closest
+ * deadline, at_early_margin seconds ahead of it.  The timer is disarmed
+ * when no request is being tracked, and the handler is invoked directly
+ * when that point in time has already been reached.
+ */
+static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt)
+{
+	struct ptlrpc_at_array *at = &svcpt->scp_at_array;
+	__s32 wait_secs;
+
+	if (at->paa_count == 0) {
+		cfs_timer_disarm(&svcpt->scp_at_timer);
+		return;
+	}
+
+	/* seconds left until the early-reply point of the closest deadline */
+	wait_secs = (__s32)(at->paa_deadline - cfs_time_current_sec() -
+			    at_early_margin);
+	if (wait_secs <= 0) {
+		ptlrpc_at_timer((unsigned long)svcpt);
+		return;
+	}
+
+	cfs_timer_arm(&svcpt->scp_at_timer, cfs_time_shift(wait_secs));
+	CDEBUG(D_INFO, "armed %s at %+ds\n",
+	       svcpt->scp_service->srv_name, wait_secs);
+}
+
+/* Add rpc to early reply check list
+ *
+ * The request is hashed into the slot (rq_deadline % paa_size) of the
+ * service partition's AT array and inserted into that slot's list after the
+ * last entry whose deadline is not later than its own. When it becomes the
+ * only or the earliest tracked request the AT timer is reprogrammed.
+ *
+ * Returns 0 when the request was added, or when nothing needs to be done
+ * (AT is off, or the request expects no reply), and -ENOSYS when the
+ * request message does not carry MSGHDR_AT_SUPPORT. */
+static int ptlrpc_at_add_timed(struct ptlrpc_request *req)
+{
+	struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
+	struct ptlrpc_at_array *array = &svcpt->scp_at_array;
+	struct ptlrpc_request *rq = NULL;
+	__u32 index;
+
+	if (AT_OFF)
+		return(0);
+
+	if (req->rq_no_reply)
+		return 0;
+
+	if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0)
+		return(-ENOSYS);
+
+	spin_lock(&svcpt->scp_at_lock);
+	LASSERT(list_empty(&req->rq_timed_list));
+
+	index = (unsigned long)req->rq_deadline % array->paa_size;
+	if (array->paa_reqs_count[index] > 0) {
+		/* latest rpcs will have the latest deadlines in the list,
+		 * so search backward. */
+		list_for_each_entry_reverse(rq,
+						&array->paa_reqs_array[index],
+						rq_timed_list) {
+			if (req->rq_deadline >= rq->rq_deadline) {
+				list_add(&req->rq_timed_list,
+					     &rq->rq_timed_list);
+				break;
+			}
+		}
+	}
+
+	/* Add the request at the head of the list (slot was empty, or every
+	 * entry in it has a later deadline) */
+	if (list_empty(&req->rq_timed_list))
+		list_add(&req->rq_timed_list,
+			     &array->paa_reqs_array[index]);
+
+	spin_lock(&req->rq_lock);
+	req->rq_at_linked = 1;
+	spin_unlock(&req->rq_lock);
+	req->rq_at_index = index;
+	array->paa_reqs_count[index]++;
+	array->paa_count++;
+	if (array->paa_count == 1 || array->paa_deadline > req->rq_deadline) {
+		array->paa_deadline = req->rq_deadline;
+		ptlrpc_at_set_timer(svcpt);
+	}
+	spin_unlock(&svcpt->scp_at_lock);
+
+	return 0;
+}
+
+/**
+ * Take \a req off the early reply check list and update the per-slot and
+ * total counters of the AT array.
+ * NB: the caller must hold svcpt::scp_at_lock.
+ */
+static void
+ptlrpc_at_remove_timed(struct ptlrpc_request *req)
+{
+	struct ptlrpc_at_array *at = &req->rq_rqbd->rqbd_svcpt->scp_at_array;
+
+	LASSERT(!list_empty(&req->rq_timed_list));
+	list_del_init(&req->rq_timed_list);
+
+	/* the linked flag is flipped under the request's own lock */
+	spin_lock(&req->rq_lock);
+	req->rq_at_linked = 0;
+	spin_unlock(&req->rq_lock);
+
+	at->paa_reqs_count[req->rq_at_index] -= 1;
+	at->paa_count -= 1;
+}
+
+/**
+ * Send an early reply for \a req, asking the client for more time.
+ *
+ * The service estimate is bumped first (at_measured()); the new deadline is
+ * the current time plus that estimate. The reply is packed and sent from a
+ * temporary copy of the request (with its own copy of the request message),
+ * which is freed again before returning. \a req itself is only updated
+ * (rq_deadline, rq_early_count) when the send succeeded.
+ *
+ * \retval 0		early reply sent, or AT is off
+ * \retval -ETIMEDOUT	already past the deadline, or no time could be added
+ * \retval -ENOSYS	request message lacks MSGHDR_AT_SUPPORT
+ * \retval -ENOMEM	allocating the request copy failed
+ * \retval -EINVAL	only one reference is left on \a req
+ * \retval -ENODEV	export not found, or its obd is failing
+ * \retval other	error from lustre_pack_reply_flags() or
+ *			ptlrpc_send_reply()
+ */
+static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req)
+{
+	struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
+	struct ptlrpc_request *reqcopy;
+	struct lustre_msg *reqmsg;
+	cfs_duration_t olddl = req->rq_deadline - cfs_time_current_sec();
+	time_t newdl;
+	int rc;
+	ENTRY;
+
+	/* deadline is when the client expects us to reply, margin is the
+	   difference between clients' and servers' expectations */
+	DEBUG_REQ(D_ADAPTTO, req,
+		  "%ssending early reply (deadline %+lds, margin %+lds) for "
+		  "%d+%d", AT_OFF ? "AT off - not " : "",
+		  olddl, olddl - at_get(&svcpt->scp_at_estimate),
+		  at_get(&svcpt->scp_at_estimate), at_extra);
+
+	if (AT_OFF)
+		RETURN(0);
+
+	if (olddl < 0) {
+		DEBUG_REQ(D_WARNING, req, "Already past deadline (%+lds), "
+			  "not sending early reply. Consider increasing "
+			  "at_early_margin (%d)?", olddl, at_early_margin);
+
+		/* Return an error so we're not re-added to the timed list. */
+		RETURN(-ETIMEDOUT);
+	}
+
+	if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0){
+		DEBUG_REQ(D_INFO, req, "Wanted to ask client for more time, "
+			  "but no AT support");
+		RETURN(-ENOSYS);
+	}
+
+	if (req->rq_export &&
+	    lustre_msg_get_flags(req->rq_reqmsg) &
+	    (MSG_REPLAY | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE)) {
+		/* During recovery, we don't want to send too many early
+		 * replies, but on the other hand we want to make sure the
+		 * client has enough time to resend if the rpc is lost. So
+		 * during the recovery period send at least 4 early replies,
+		 * spacing them every at_extra if we can. at_estimate should
+		 * always equal this fixed value during recovery. */
+		at_measured(&svcpt->scp_at_estimate, min(at_extra,
+			    req->rq_export->exp_obd->obd_recovery_timeout / 4));
+	} else {
+		/* Fake our processing time into the future to ask the clients
+		 * for some extra amount of time */
+		at_measured(&svcpt->scp_at_estimate, at_extra +
+			    cfs_time_current_sec() -
+			    req->rq_arrival_time.tv_sec);
+
+		/* Check to see if we've actually increased the deadline -
+		 * we may be past adaptive_max */
+		if (req->rq_deadline >= req->rq_arrival_time.tv_sec +
+		    at_get(&svcpt->scp_at_estimate)) {
+			DEBUG_REQ(D_WARNING, req, "Couldn't add any time "
+				  "(%ld/%ld), not sending early reply\n",
+				  olddl, req->rq_arrival_time.tv_sec +
+				  at_get(&svcpt->scp_at_estimate) -
+				  cfs_time_current_sec());
+			RETURN(-ETIMEDOUT);
+		}
+	}
+	newdl = cfs_time_current_sec() + at_get(&svcpt->scp_at_estimate);
+
+	OBD_ALLOC(reqcopy, sizeof *reqcopy);
+	if (reqcopy == NULL)
+		RETURN(-ENOMEM);
+	OBD_ALLOC_LARGE(reqmsg, req->rq_reqlen);
+	if (!reqmsg) {
+		OBD_FREE(reqcopy, sizeof *reqcopy);
+		RETURN(-ENOMEM);
+	}
+
+	/* Work on a private copy; the reply-related state copied over from
+	 * the original is reset below. */
+	*reqcopy = *req;
+	reqcopy->rq_reply_state = NULL;
+	reqcopy->rq_rep_swab_mask = 0;
+	reqcopy->rq_pack_bulk = 0;
+	reqcopy->rq_pack_udesc = 0;
+	reqcopy->rq_packed_final = 0;
+	sptlrpc_svc_ctx_addref(reqcopy);
+	/* We only need the reqmsg for the magic */
+	reqcopy->rq_reqmsg = reqmsg;
+	memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen);
+
+	LASSERT(atomic_read(&req->rq_refcount));
+	/** if it is last refcount then early reply isn't needed */
+	if (atomic_read(&req->rq_refcount) == 1) {
+		DEBUG_REQ(D_ADAPTTO, reqcopy, "Normal reply already sent out, "
+			  "abort sending early reply\n");
+		GOTO(out, rc = -EINVAL);
+	}
+
+	/* Connection ref */
+	reqcopy->rq_export = class_conn2export(
+				     lustre_msg_get_handle(reqcopy->rq_reqmsg));
+	if (reqcopy->rq_export == NULL)
+		GOTO(out, rc = -ENODEV);
+
+	/* RPC ref */
+	class_export_rpc_inc(reqcopy->rq_export);
+	if (reqcopy->rq_export->exp_obd &&
+	    reqcopy->rq_export->exp_obd->obd_fail)
+		GOTO(out_put, rc = -ENODEV);
+
+	rc = lustre_pack_reply_flags(reqcopy, 1, NULL, NULL, LPRFL_EARLY_REPLY);
+	if (rc)
+		GOTO(out_put, rc);
+
+	rc = ptlrpc_send_reply(reqcopy, PTLRPC_REPLY_EARLY);
+
+	if (!rc) {
+		/* Adjust our own deadline to what we told the client */
+		req->rq_deadline = newdl;
+		req->rq_early_count++; /* number sent, server side */
+	} else {
+		DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc);
+	}
+
+	/* Free the (early) reply state from lustre_pack_reply.
+	   (ptlrpc_send_reply takes it's own rs ref, so this is safe here) */
+	ptlrpc_req_drop_rs(reqcopy);
+
+out_put:
+	class_export_rpc_dec(reqcopy->rq_export);
+	class_export_put(reqcopy->rq_export);
+out:
+	sptlrpc_svc_ctx_decref(reqcopy);
+	OBD_FREE_LARGE(reqmsg, req->rq_reqlen);
+	OBD_FREE(reqcopy, sizeof *reqcopy);
+	RETURN(rc);
+}
+
+/* Send early replies to everybody expiring within at_early_margin
+   asking for at_extra time
+
+   Under scp_at_lock the AT array is walked slot by slot, starting at the
+   slot of the earliest deadline. Requests due within at_early_margin are
+   unlinked and, if a reference can still be taken, moved to a private work
+   list. The earliest remaining deadline is recorded and the timer
+   restarted. The early replies themselves are sent after the lock has been
+   released.
+
+   Returns 0 when there was nothing to do (no check pending, no tracked
+   request, or still plenty of time left) and 1 otherwise. */
+static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt)
+{
+	struct ptlrpc_at_array *array = &svcpt->scp_at_array;
+	struct ptlrpc_request *rq, *n;
+	struct list_head work_list;
+	__u32  index, count;
+	time_t deadline;
+	time_t now = cfs_time_current_sec();
+	cfs_duration_t delay;
+	int first, counter = 0;
+	ENTRY;
+
+	spin_lock(&svcpt->scp_at_lock);
+	if (svcpt->scp_at_check == 0) {
+		spin_unlock(&svcpt->scp_at_lock);
+		RETURN(0);
+	}
+	delay = cfs_time_sub(cfs_time_current(), svcpt->scp_at_checktime);
+	svcpt->scp_at_check = 0;
+
+	if (array->paa_count == 0) {
+		spin_unlock(&svcpt->scp_at_lock);
+		RETURN(0);
+	}
+
+	/* The timer went off, but maybe the nearest rpc already completed. */
+	first = array->paa_deadline - now;
+	if (first > at_early_margin) {
+		/* We've still got plenty of time.  Reset the timer. */
+		ptlrpc_at_set_timer(svcpt);
+		spin_unlock(&svcpt->scp_at_lock);
+		RETURN(0);
+	}
+
+	/* We're close to a timeout, and we don't know how much longer the
+	   server will take. Send early replies to everyone expiring soon. */
+	INIT_LIST_HEAD(&work_list);
+	deadline = -1;
+	index = (unsigned long)array->paa_deadline % array->paa_size;
+	count = array->paa_count;
+	while (count > 0) {
+		count -= array->paa_reqs_count[index];
+		list_for_each_entry_safe(rq, n,
+					     &array->paa_reqs_array[index],
+					     rq_timed_list) {
+			if (rq->rq_deadline > now + at_early_margin) {
+				/* update the earliest deadline */
+				if (deadline == -1 ||
+				    rq->rq_deadline < deadline)
+					deadline = rq->rq_deadline;
+				break;
+			}
+
+			ptlrpc_at_remove_timed(rq);
+			/**
+			 * ptlrpc_server_drop_request() may drop
+			 * refcount to 0 already. Let's check this and
+			 * don't add entry to work_list
+			 */
+			if (likely(atomic_inc_not_zero(&rq->rq_refcount)))
+				list_add(&rq->rq_timed_list, &work_list);
+			counter++;
+		}
+
+		if (++index >= array->paa_size)
+			index = 0;
+	}
+	array->paa_deadline = deadline;
+	/* we have a new earliest deadline, restart the timer */
+	ptlrpc_at_set_timer(svcpt);
+
+	spin_unlock(&svcpt->scp_at_lock);
+
+	CDEBUG(D_ADAPTTO, "timeout in %+ds, asking for %d secs on %d early "
+	       "replies\n", first, at_extra, counter);
+	if (first < 0) {
+		/* We're already past request deadlines before we even get a
+		   chance to send early replies */
+		LCONSOLE_WARN("%s: This server is not able to keep up with "
+			      "request traffic (cpu-bound).\n",
+			      svcpt->scp_service->srv_name);
+		CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, "
+		      "delay="CFS_DURATION_T"(jiff)\n",
+		      counter, svcpt->scp_nreqs_incoming,
+		      svcpt->scp_nreqs_active,
+		      at_get(&svcpt->scp_at_estimate), delay);
+	}
+
+	/* we took additional refcount so entries can't be deleted from list, no
+	 * locking is needed */
+	while (!list_empty(&work_list)) {
+		rq = list_entry(work_list.next, struct ptlrpc_request,
+				    rq_timed_list);
+		list_del_init(&rq->rq_timed_list);
+
+		/* only a request whose early reply went out is tracked again */
+		if (ptlrpc_at_send_early_reply(rq) == 0)
+			ptlrpc_at_add_timed(rq);
+
+		/* release the reference taken while building work_list */
+		ptlrpc_server_drop_request(rq);
+	}
+
+	RETURN(1); /* return "did_something" for liblustre */
+}
+
+/**
+ * Put the request to the export list if the request may become
+ * a high priority one.
+ *
+ * The service's so_hpreq_handler (if any) runs first, followed by the
+ * request's own hpreq_check when both an export and rq_ops are set. The
+ * request is then linked on the export's exp_hp_rpcs list and initialized
+ * for NRS.
+ *
+ * \retval <0	error from so_hpreq_handler or hpreq_check; the request is
+ *		neither linked nor initialized for NRS
+ * \retval 0/1	result of hpreq_check (0 when no check was run)
+ */
+static int ptlrpc_server_hpreq_init(struct ptlrpc_service_part *svcpt,
+				    struct ptlrpc_request *req)
+{
+	int rc = 0;
+	ENTRY;
+
+	if (svcpt->scp_service->srv_ops.so_hpreq_handler) {
+		rc = svcpt->scp_service->srv_ops.so_hpreq_handler(req);
+		if (rc < 0)
+			RETURN(rc);
+		LASSERT(rc == 0);
+	}
+	if (req->rq_export && req->rq_ops) {
+		/* Perform request specific check. We should do this check
+		 * before the request is added into exp_hp_rpcs list otherwise
+		 * it may hit swab race at LU-1044. */
+		if (req->rq_ops->hpreq_check) {
+			rc = req->rq_ops->hpreq_check(req);
+			/**
+			 * XXX: Out of all current
+			 * ptlrpc_hpreq_ops::hpreq_check(), only
+			 * ldlm_cancel_hpreq_check() can return an error code;
+			 * other functions assert in similar places, which seems
+			 * odd. What also does not seem right is that handlers
+			 * for those RPCs do not assert on the same checks, but
+			 * rather handle the error cases. e.g. see
+			 * ost_rw_hpreq_check(), and ost_brw_read(),
+			 * ost_brw_write().
+			 */
+			if (rc < 0)
+				RETURN(rc);
+			LASSERT(rc == 0 || rc == 1);
+		}
+
+		spin_lock_bh(&req->rq_export->exp_rpc_lock);
+		list_add(&req->rq_exp_list,
+			     &req->rq_export->exp_hp_rpcs);
+		spin_unlock_bh(&req->rq_export->exp_rpc_lock);
+	}
+
+	ptlrpc_nrs_req_initialize(svcpt, req, rc);
+
+	RETURN(rc);
+}
+
+/**
+ * Remove the request from the export list, after giving its hpreq_fini
+ * hook (if any) a chance to run. Nothing is done for a request without an
+ * export or without rq_ops.
+ */
+static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req)
+{
+	ENTRY;
+	if (req->rq_export == NULL || req->rq_ops == NULL)
+		goto out;
+
+	/* refresh the lock timeout once more so the client gets more room
+	 * to send its lock cancel RPC */
+	if (req->rq_ops->hpreq_fini)
+		req->rq_ops->hpreq_fini(req);
+
+	spin_lock_bh(&req->rq_export->exp_rpc_lock);
+	list_del_init(&req->rq_exp_list);
+	spin_unlock_bh(&req->rq_export->exp_rpc_lock);
+out:
+	EXIT;
+}
+
+/** hpreq_check hook of ptlrpc_hpreq_common: always returns 1. */
+static int ptlrpc_hpreq_check(struct ptlrpc_request *req)
+{
+	return 1;
+}
+
+/** Request ops installed by ptlrpc_hpreq_handler(); only the check hook is
+ * set. */
+static struct ptlrpc_hpreq_ops ptlrpc_hpreq_common = {
+	.hpreq_check       = ptlrpc_hpreq_check,
+};
+
+/* Hi-Priority RPC check by RPC operation code.
+ * Pings and connects get the common high-priority ops, but only when the
+ * request already has an export, so that only reconnects for a not yet
+ * evicted export can become a HP rpc. Always returns 0. */
+int ptlrpc_hpreq_handler(struct ptlrpc_request *req)
+{
+	int opc = lustre_msg_get_opc(req->rq_reqmsg);
+
+	if (req->rq_export == NULL)
+		return 0;
+
+	switch (opc) {
+	case OBD_PING:
+	case MDS_CONNECT:
+	case OST_CONNECT:
+		req->rq_ops = &ptlrpc_hpreq_common;
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(ptlrpc_hpreq_handler);
+
+/**
+ * Run the high-priority initialization for \a req and hand it over to NRS,
+ * flagged as high priority when that initialization returned non-zero.
+ *
+ * \retval 0	request queued
+ * \retval <0	error from ptlrpc_server_hpreq_init(); nothing was queued
+ */
+static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt,
+				     struct ptlrpc_request *req)
+{
+	int	hp;
+	ENTRY;
+
+	hp = ptlrpc_server_hpreq_init(svcpt, req);
+	if (hp >= 0) {
+		ptlrpc_nrs_req_add(svcpt, req, hp != 0);
+		hp = 0;
+	}
+
+	RETURN(hp);
+}
+
+/**
+ * Tell whether a high priority request may be handled right now.
+ * May be called w/o any lock, but ptlrpc_service_part::scp_req_lock has to
+ * be held to get a reliable result.
+ */
+static bool ptlrpc_server_allow_high(struct ptlrpc_service_part *svcpt,
+				     bool force)
+{
+	struct ptlrpc_service *svc = svcpt->scp_service;
+	int running = svcpt->scp_nthrs_running;
+
+	if (!nrs_svcpt_has_hp(svcpt))
+		return false;
+
+	if (force)
+		return true;
+
+	if (unlikely(svc->srv_req_portal == MDS_REQUEST_PORTAL &&
+		     CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) {
+		/* leave just 1 thread for normal RPCs */
+		running = PTLRPC_NTHRS_INIT +
+			  (svc->srv_ops.so_hpreq_handler != NULL ? 1 : 0);
+	}
+
+	if (svcpt->scp_nreqs_active >= running - 1)
+		return false;
+
+	if (svcpt->scp_nhreqs_active == 0)
+		return true;
+
+	/* high priority requests are already being handled: take another
+	 * one only if no normal request waits, or while the number taken
+	 * in a row stays below the service's ratio */
+	if (!ptlrpc_nrs_req_pending_nolock(svcpt, false))
+		return true;
+
+	return svcpt->scp_hreq_count < svc->srv_hpreq_ratio;
+}
+
+/** True when a high priority request is both allowed and queued. */
+static bool ptlrpc_server_high_pending(struct ptlrpc_service_part *svcpt,
+				       bool force)
+{
+	if (!ptlrpc_server_allow_high(svcpt, force))
+		return false;
+
+	return ptlrpc_nrs_req_pending_nolock(svcpt, true);
+}
+
+/**
+ * Tell whether a normal priority request may be handled right now.
+ * On a service that has a high-priority queue this is only the case if
+ * forced (i.e. cleanup), if other high priority requests are already being
+ * processed (i.e. those threads can service more high-priority requests),
+ * or if enough threads are idle for a later thread to take a high priority
+ * request.
+ * May be called w/o any lock, but ptlrpc_service_part::scp_req_lock has to
+ * be held to get a reliable result.
+ */
+static bool ptlrpc_server_allow_normal(struct ptlrpc_service_part *svcpt,
+				       bool force)
+{
+	struct ptlrpc_service *svc = svcpt->scp_service;
+	int running = svcpt->scp_nthrs_running;
+
+	if (unlikely(svc->srv_req_portal == MDS_REQUEST_PORTAL &&
+		     CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) {
+		/* leave just 1 thread for normal RPCs */
+		running = PTLRPC_NTHRS_INIT +
+			  (svc->srv_ops.so_hpreq_handler != NULL ? 1 : 0);
+	}
+
+	if (force)
+		return true;
+
+	/* at least two threads would remain idle */
+	if (svcpt->scp_nreqs_active < running - 2)
+		return true;
+
+	if (svcpt->scp_nreqs_active >= running - 1)
+		return false;
+
+	if (svcpt->scp_nhreqs_active > 0)
+		return true;
+
+	return !nrs_svcpt_has_hp(svcpt);
+}
+
+/** True when a normal priority request is both allowed and queued. */
+static bool ptlrpc_server_normal_pending(struct ptlrpc_service_part *svcpt,
+					 bool force)
+{
+	if (!ptlrpc_server_allow_normal(svcpt, force))
+		return false;
+
+	return ptlrpc_nrs_req_pending_nolock(svcpt, false);
+}
+
+/**
+ * Returns true if the incoming request queue holds requests that are
+ * allowed to be fetched for processing, high priority ones being checked
+ * first.
+ * May be called w/o any lock, but ptlrpc_service_part::scp_req_lock has to
+ * be held to get a reliable result.
+ * \see ptlrpc_server_allow_normal
+ * \see ptlrpc_server_allow_high
+ */
+static inline bool
+ptlrpc_server_request_pending(struct ptlrpc_service_part *svcpt, bool force)
+{
+	if (ptlrpc_server_high_pending(svcpt, force))
+		return true;
+
+	return ptlrpc_server_normal_pending(svcpt, force);
+}
+
+/**
+ * Fetch a request for processing from queue of unprocessed requests.
+ * High-priority requests are favoured; a normal one is only looked for
+ * when no high-priority request could be obtained.
+ * Returns the fetched request, accounted as active, or NULL if there is
+ * none to fetch.
+ */
+static struct ptlrpc_request *
+ptlrpc_server_request_get(struct ptlrpc_service_part *svcpt, bool force)
+{
+	struct ptlrpc_request *req = NULL;
+	ENTRY;
+
+	spin_lock(&svcpt->scp_req_lock);
+
+	if (ptlrpc_server_high_pending(svcpt, force)) {
+		req = ptlrpc_nrs_req_get_nolock(svcpt, true, force);
+		if (req != NULL)
+			svcpt->scp_hreq_count++;
+	}
+
+	if (req == NULL && ptlrpc_server_normal_pending(svcpt, force)) {
+		req = ptlrpc_nrs_req_get_nolock(svcpt, false, force);
+		/* a normal request resets the count of high-priority
+		 * requests taken in a row */
+		if (req != NULL)
+			svcpt->scp_hreq_count = 0;
+	}
+
+	if (req == NULL) {
+		spin_unlock(&svcpt->scp_req_lock);
+		RETURN(NULL);
+	}
+
+	svcpt->scp_nreqs_active++;
+	if (req->rq_hp)
+		svcpt->scp_nhreqs_active++;
+
+	spin_unlock(&svcpt->scp_req_lock);
+
+	if (likely(req->rq_export))
+		class_export_rpc_inc(req->rq_export);
+
+	RETURN(req);
+}
+
+/**
+ * Take one request off scp_req_incoming, unwrap and unpack it, set its
+ * deadline, add it to the timed early-reply list and pass it on to the
+ * regular request queue.
+ * Returns 0 if nothing was incoming, 1 otherwise (dropped requests too).
+ */
+static int
+ptlrpc_server_handle_req_in(struct ptlrpc_service_part *svcpt,
+			    struct ptlrpc_thread *thread)
+{
+	struct ptlrpc_service	*svc = svcpt->scp_service;
+	struct ptlrpc_request	*req;
+	__u32			deadline;
+	int			rc;
+	ENTRY;
+
+	spin_lock(&svcpt->scp_lock);
+	if (list_empty(&svcpt->scp_req_incoming)) {
+		spin_unlock(&svcpt->scp_lock);
+		RETURN(0);
+	}
+
+	req = list_entry(svcpt->scp_req_incoming.next,
+			     struct ptlrpc_request, rq_list);
+	list_del_init(&req->rq_list);
+	svcpt->scp_nreqs_incoming--;
+	/* Consider this still a "queued" request as far as stats are
+	 * concerned */
+	spin_unlock(&svcpt->scp_lock);
+
+	/* go through security check/transform */
+	rc = sptlrpc_svc_unwrap_request(req);
+	switch (rc) {
+	case SECSVC_OK:
+		break;
+	case SECSVC_COMPLETE:
+		target_send_reply(req, 0, OBD_FAIL_MDS_ALL_REPLY_NET);
+		goto err_req;
+	case SECSVC_DROP:
+		goto err_req;
+	default:
+		LBUG();
+	}
+
+	/*
+	 * for null-flavored rpc, msg has been unpacked by sptlrpc, although
+	 * redo it wouldn't be harmful.
+	 */
+	if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) {
+		rc = ptlrpc_unpack_req_msg(req, req->rq_reqlen);
+		if (rc != 0) {
+			CERROR("error unpacking request: ptl %d from %s "
+			       "x"LPU64"\n", svc->srv_req_portal,
+			       libcfs_id2str(req->rq_peer), req->rq_xid);
+			goto err_req;
+		}
+	}
+
+	rc = lustre_unpack_req_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
+	if (rc) {
+		CERROR ("error unpacking ptlrpc body: ptl %d from %s x"
+			LPU64"\n", svc->srv_req_portal,
+			libcfs_id2str(req->rq_peer), req->rq_xid);
+		goto err_req;
+	}
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_REQ_OPC) &&
+	    lustre_msg_get_opc(req->rq_reqmsg) == cfs_fail_val) {
+		CERROR("drop incoming rpc opc %u, x"LPU64"\n",
+		       cfs_fail_val, req->rq_xid);
+		goto err_req;
+	}
+
+	rc = -EINVAL;
+	if (lustre_msg_get_type(req->rq_reqmsg) != PTL_RPC_MSG_REQUEST) {
+		CERROR("wrong packet type received (type=%u) from %s\n",
+		       lustre_msg_get_type(req->rq_reqmsg),
+		       libcfs_id2str(req->rq_peer));
+		goto err_req;
+	}
+	/* mark the bulk direction of the request according to its opcode */
+	switch(lustre_msg_get_opc(req->rq_reqmsg)) {
+	case MDS_WRITEPAGE:
+	case OST_WRITE:
+		req->rq_bulk_write = 1;
+		break;
+	case MDS_READPAGE:
+	case OST_READ:
+	case MGS_CONFIG_READ:
+		req->rq_bulk_read = 1;
+		break;
+	}
+
+	CDEBUG(D_RPCTRACE, "got req x"LPU64"\n", req->rq_xid);
+	/* look up the export through the handle carried in the request */
+	req->rq_export = class_conn2export(
+		lustre_msg_get_handle(req->rq_reqmsg));
+	if (req->rq_export) {
+		rc = ptlrpc_check_req(req);
+		if (rc == 0) {
+			rc = sptlrpc_target_export_check(req->rq_export, req);
+			if (rc)
+				DEBUG_REQ(D_ERROR, req, "DROPPING req with "
+					  "illegal security flavor,");
+		}
+
+		if (rc)
+			goto err_req;
+		ptlrpc_update_export_timer(req->rq_export, 0);
+	}
+
+	/* req_in handling should/must be fast */
+	if (cfs_time_current_sec() - req->rq_arrival_time.tv_sec > 5)
+		DEBUG_REQ(D_WARNING, req, "Slow req_in handling "CFS_DURATION_T"s",
+			  cfs_time_sub(cfs_time_current_sec(),
+				       req->rq_arrival_time.tv_sec));
+
+	/* Set rpc server deadline and add it to the timed list */
+	deadline = (lustre_msghdr_get_flags(req->rq_reqmsg) &
+		    MSGHDR_AT_SUPPORT) ?
+		   /* The max time the client expects us to take */
+		   lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout;
+	req->rq_deadline = req->rq_arrival_time.tv_sec + deadline;
+	if (unlikely(deadline == 0)) {
+		DEBUG_REQ(D_ERROR, req, "Dropping request with 0 timeout");
+		goto err_req;
+	}
+
+	req->rq_svc_thread = thread;
+
+	ptlrpc_at_add_timed(req);
+
+	/* Move it over to the request processing queue */
+	rc = ptlrpc_server_request_add(svcpt, req);
+	if (rc)
+		GOTO(err_req, rc);
+
+	wake_up(&svcpt->scp_waitq);
+	RETURN(1);
+
+err_req:
+	ptlrpc_server_finish_request(svcpt, req);
+	/* a dropped request still counts as handled for the caller */
+	RETURN(1);
+}
+
+/**
+ * Fetch one queued request and, unless dropped, hand it to so_req_handler.
+ * Returns 0 if no request could be fetched, 1 otherwise.
+ */
+static int
+ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt,
+			     struct ptlrpc_thread *thread)
+{
+	struct ptlrpc_service *svc = svcpt->scp_service;
+	struct ptlrpc_request *request;
+	struct timeval	 work_start;
+	struct timeval	 work_end;
+	long		   timediff;
+	int		    rc;
+	int		    fail_opc = 0;
+	ENTRY;
+
+	request = ptlrpc_server_request_get(svcpt, false);
+	if (request == NULL)
+		RETURN(0);
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT))
+		fail_opc = OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT;
+	else if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT))
+		fail_opc = OBD_FAIL_PTLRPC_HPREQ_TIMEOUT;
+
+	if (unlikely(fail_opc)) {
+		if (request->rq_export && request->rq_ops)
+			OBD_FAIL_TIMEOUT(fail_opc, 4);
+	}
+
+	ptlrpc_rqphase_move(request, RQ_PHASE_INTERPRET);
+
+	if(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DUMP_LOG))
+		libcfs_debug_dumplog();
+	/* timediff: time between request arrival and the start of handling */
+	do_gettimeofday(&work_start);
+	timediff = cfs_timeval_sub(&work_start, &request->rq_arrival_time,NULL);
+	if (likely(svc->srv_stats != NULL)) {
+		lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR,
+				    timediff);
+		lprocfs_counter_add(svc->srv_stats, PTLRPC_REQQDEPTH_CNTR,
+				    svcpt->scp_nreqs_incoming);
+		lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR,
+				    svcpt->scp_nreqs_active);
+		lprocfs_counter_add(svc->srv_stats, PTLRPC_TIMEOUT,
+				    at_get(&svcpt->scp_at_estimate));
+	}
+
+	rc = lu_context_init(&request->rq_session, LCT_SESSION | LCT_NOREF);
+	if (rc) {
+		CERROR("Failure to initialize session: %d\n", rc);
+		goto out_req;
+	}
+	request->rq_session.lc_thread = thread;
+	request->rq_session.lc_cookie = 0x5;
+	lu_context_enter(&request->rq_session);
+
+	CDEBUG(D_NET, "got req "LPU64"\n", request->rq_xid);
+
+	request->rq_svc_thread = thread;
+	if (thread)
+		request->rq_svc_thread->t_env->le_ses = &request->rq_session;
+
+	if (likely(request->rq_export)) {
+		if (unlikely(ptlrpc_check_req(request)))
+			goto put_conn;
+		ptlrpc_update_export_timer(request->rq_export, timediff >> 19);
+	}
+
+	/* Discard requests queued for longer than the deadline.
+	   The deadline is increased if we send an early reply. */
+	if (cfs_time_current_sec() > request->rq_deadline) {
+		DEBUG_REQ(D_ERROR, request, "Dropping timed-out request from %s"
+			  ": deadline "CFS_DURATION_T":"CFS_DURATION_T"s ago\n",
+			  libcfs_id2str(request->rq_peer),
+			  cfs_time_sub(request->rq_deadline,
+			  request->rq_arrival_time.tv_sec),
+			  cfs_time_sub(cfs_time_current_sec(),
+			  request->rq_deadline));
+		goto put_conn;
+	}
+
+	CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:nid:opc "
+	       "%s:%s+%d:%d:x"LPU64":%s:%d\n", current_comm(),
+	       (request->rq_export ?
+		(char *)request->rq_export->exp_client_uuid.uuid : "0"),
+	       (request->rq_export ?
+		atomic_read(&request->rq_export->exp_refcount) : -99),
+	       lustre_msg_get_status(request->rq_reqmsg), request->rq_xid,
+	       libcfs_id2str(request->rq_peer),
+	       lustre_msg_get_opc(request->rq_reqmsg));
+
+	if (lustre_msg_get_opc(request->rq_reqmsg) != OBD_PING)
+		CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, cfs_fail_val);
+	/* the service-specific handler does the real work */
+	rc = svc->srv_ops.so_req_handler(request);
+
+	ptlrpc_rqphase_move(request, RQ_PHASE_COMPLETE);
+
+put_conn:
+	lu_context_exit(&request->rq_session);
+	lu_context_fini(&request->rq_session);
+
+	if (unlikely(cfs_time_current_sec() > request->rq_deadline)) {
+		     DEBUG_REQ(D_WARNING, request, "Request took longer "
+			       "than estimated ("CFS_DURATION_T":"CFS_DURATION_T"s);"
+			       " client may timeout.",
+			       cfs_time_sub(request->rq_deadline,
+					    request->rq_arrival_time.tv_sec),
+			       cfs_time_sub(cfs_time_current_sec(),
+					    request->rq_deadline));
+	}
+	/* from here on timediff is the time elapsed since work_start */
+	do_gettimeofday(&work_end);
+	timediff = cfs_timeval_sub(&work_end, &work_start, NULL);
+	CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc "
+	       "%s:%s+%d:%d:x"LPU64":%s:%d Request procesed in "
+	       "%ldus (%ldus total) trans "LPU64" rc %d/%d\n",
+		current_comm(),
+		(request->rq_export ?
+		 (char *)request->rq_export->exp_client_uuid.uuid : "0"),
+		(request->rq_export ?
+		 atomic_read(&request->rq_export->exp_refcount) : -99),
+		lustre_msg_get_status(request->rq_reqmsg),
+		request->rq_xid,
+		libcfs_id2str(request->rq_peer),
+		lustre_msg_get_opc(request->rq_reqmsg),
+		timediff,
+		cfs_timeval_sub(&work_end, &request->rq_arrival_time, NULL),
+		(request->rq_repmsg ?
+		 lustre_msg_get_transno(request->rq_repmsg) :
+		 request->rq_transno),
+		request->rq_status,
+		(request->rq_repmsg ?
+		 lustre_msg_get_status(request->rq_repmsg) : -999));
+	if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) {
+		__u32 op = lustre_msg_get_opc(request->rq_reqmsg);
+		int opc = opcode_offset(op);
+		if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) {
+			LASSERT(opc < LUSTRE_MAX_OPCODES);
+			lprocfs_counter_add(svc->srv_stats,
+					    opc + EXTRA_MAX_OPCODES,
+					    timediff);
+		}
+	}
+	if (unlikely(request->rq_early_count)) {
+		DEBUG_REQ(D_ADAPTTO, request,
+			  "sent %d early replies before finishing in "
+			  CFS_DURATION_T"s",
+			  request->rq_early_count,
+			  cfs_time_sub(work_end.tv_sec,
+			  request->rq_arrival_time.tv_sec));
+	}
+
+out_req:
+	ptlrpc_server_finish_active_request(svcpt, request);
+
+	RETURN(1);
+}
+
+/**
+ * Process one scheduled difficult reply state \a rs; always returns 1.
+ */
+static int
+ptlrpc_handle_rs(struct ptlrpc_reply_state *rs)
+{
+	struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
+	struct ptlrpc_service     *svc = svcpt->scp_service;
+	struct obd_export	 *exp;
+	int			nlocks;
+	int			been_handled;
+	ENTRY;
+
+	exp = rs->rs_export;
+
+	LASSERT (rs->rs_difficult);
+	LASSERT (rs->rs_scheduled);
+	LASSERT (list_empty(&rs->rs_list));
+
+	spin_lock(&exp->exp_lock);
+	/* Noop if removed already */
+	list_del_init (&rs->rs_exp_list);
+	spin_unlock(&exp->exp_lock);
+
+	/* The disk commit callback holds exp_uncommitted_replies_lock while it
+	 * iterates over newly committed replies, removing them from
+	 * exp_uncommitted_replies.  It then drops this lock and schedules the
+	 * replies it found for handling here.
+	 *
+	 * We can avoid contention for exp_uncommitted_replies_lock between the
+	 * HRT threads and further commit callbacks by checking rs_committed
+	 * which is set in the commit callback while it holds both
+	 * rs_lock and exp_uncommitted_replies_lock.
+	 *
+	 * If we see rs_committed clear, the commit callback _may_ not have
+	 * handled this reply yet and we race with it to grab
+	 * exp_uncommitted_replies_lock before removing the reply from
+	 * exp_uncommitted_replies.  Note that if we lose the race and the
+	 * reply has already been removed, list_del_init() is a noop.
+	 *
+	 * If we see rs_committed set, we know the commit callback is handling,
+	 * or has handled this reply since store reordering might allow us to
+	 * see rs_committed set out of sequence.  But since this is done
+	 * holding rs_lock, we can be sure it has all completed once we hold
+	 * rs_lock, which we do right next.
+	 */
+	if (!rs->rs_committed) {
+		spin_lock(&exp->exp_uncommitted_replies_lock);
+		list_del_init(&rs->rs_obd_list);
+		spin_unlock(&exp->exp_uncommitted_replies_lock);
+	}
+
+	spin_lock(&rs->rs_lock);
+
+	been_handled = rs->rs_handled;
+	rs->rs_handled = 1;
+
+	nlocks = rs->rs_nlocks;		 /* atomic "steal", but */
+	rs->rs_nlocks = 0;		      /* locks still on rs_locks! */
+
+	if (nlocks == 0 && !been_handled) {
+		/* If we see this, we should already have seen the warning
+		 * in mds_steal_ack_locks()  */
+		CDEBUG(D_HA, "All locks stolen from rs %p x"LPD64".t"LPD64
+		       " o%d NID %s\n",
+		       rs,
+		       rs->rs_xid, rs->rs_transno, rs->rs_opc,
+		       libcfs_nid2str(exp->exp_connection->c_peer.nid));
+	}
+
+	if ((!been_handled && rs->rs_on_net) || nlocks > 0) {
+		spin_unlock(&rs->rs_lock);
+
+		if (!been_handled && rs->rs_on_net) {
+			LNetMDUnlink(rs->rs_md_h);
+			/* Ignore return code; we're racing with completion */
+		}
+
+		while (nlocks-- > 0)
+			ldlm_lock_decref(&rs->rs_locks[nlocks],
+					 rs->rs_modes[nlocks]);
+
+		spin_lock(&rs->rs_lock);
+	}
+	/* rs_lock is held here whether or not it was dropped above */
+	rs->rs_scheduled = 0;
+
+	if (!rs->rs_on_net) {
+		/* Off the net */
+		spin_unlock(&rs->rs_lock);
+
+		class_export_put (exp);
+		rs->rs_export = NULL;
+		ptlrpc_rs_decref (rs);
+		if (atomic_dec_and_test(&svcpt->scp_nreps_difficult) &&
+		    svc->srv_is_stopping)
+			wake_up_all(&svcpt->scp_waitq);
+		RETURN(1);
+	}
+
+	/* still on the net; callback will schedule */
+	spin_unlock(&rs->rs_lock);
+	RETURN(1);
+}
+
+
+static void
+ptlrpc_check_rqbd_pool(struct ptlrpc_service_part *svcpt)
+{
+	struct ptlrpc_service *svc = svcpt->scp_service;
+	int posted = svcpt->scp_nrqbds_posted; /* NB unlocked peek */
+	int threshold = 0;
+
+	/* with test_req_buffer_pressure set, grow only when nothing is posted */
+	if (!test_req_buffer_pressure)
+		threshold = svc->srv_nbuf_per_group / 2;
+
+	/* CAVEAT EMPTOR: We might be allocating buffers here because we've
+	 * allowed the request history to grow out of control.  We could put a
+	 * sanity check on that here and cull some history if we need the
+	 * space. */
+	if (posted <= threshold)
+		ptlrpc_grow_req_bufs(svcpt, 1);
+
+	if (svc->srv_stats)
+		lprocfs_counter_add(svc->srv_stats,
+				    PTLRPC_REQBUF_AVAIL_CNTR, posted);
+}
+
+static int
+ptlrpc_retry_rqbds(void *arg)
+{
+	struct ptlrpc_service_part *part = arg;
+
+	part->scp_rqbd_timeout = 0;	/* clear the repost back-off */
+	return -ETIMEDOUT;
+}
+
+static inline int ptlrpc_threads_enough(struct ptlrpc_service_part *svcpt)
+{
+	/* running threads minus 1, minus 1 more if an HP handler is set */
+	int spare = 1 + (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL);
+
+	return svcpt->scp_nreqs_active < svcpt->scp_nthrs_running - spare;
+}
+
+/**
+ * True while running + starting threads stay below srv_nthrs_cpt_limit.
+ * May be called without any lock, but the result is only reliable when
+ * ptlrpc_service_part::scp_lock is held.
+ */
+static inline int
+ptlrpc_threads_increasable(struct ptlrpc_service_part *svcpt)
+{
+	int nthrs = svcpt->scp_nthrs_running + svcpt->scp_nthrs_starting;
+
+	return nthrs < svcpt->scp_service->srv_nthrs_cpt_limit;
+}
+
+/**
+ * Non-zero when threads are not enough and more may still be created.
+ */
+static inline int ptlrpc_threads_need_create(struct ptlrpc_service_part *svcpt)
+{
+	if (ptlrpc_threads_enough(svcpt))
+		return 0;
+	return ptlrpc_threads_increasable(svcpt) != 0;
+}
+
+static inline int ptlrpc_thread_stopping(struct ptlrpc_thread *thread)
+{
+	if (thread_is_stopping(thread))	/* this thread was told to stop */
+		return 1;
+	return thread->t_svcpt->scp_service->srv_is_stopping != 0;
+}
+
+static inline int ptlrpc_rqbd_pending(struct ptlrpc_service_part *svcpt)
+{
+	if (list_empty(&svcpt->scp_rqbd_idle))	/* nothing to repost */
+		return 0;
+	return svcpt->scp_rqbd_timeout == 0;
+}
+
+static inline int ptlrpc_at_check(struct ptlrpc_service_part *svcpt)
+{
+	int flagged = svcpt->scp_at_check;	/* read without locking */
+	return flagged;
+}
+
+/**
+ * Non-zero when requests are waiting on scp_req_incoming for preprocessing.
+ * May be called without any lock, but the result is only reliable when
+ * ptlrpc_service_part::scp_lock is held.
+ */
+static inline int
+ptlrpc_server_request_incoming(struct ptlrpc_service_part *svcpt)
+{
+	return list_empty(&svcpt->scp_req_incoming) ? 0 : 1;
+}
+
+static __attribute__((__noinline__)) int
+ptlrpc_wait_event(struct ptlrpc_service_part *svcpt,
+		  struct ptlrpc_thread *thread)
+{
+	/* Don't exit while there are replies to be handled */
+	struct l_wait_info lwi = LWI_TIMEOUT(svcpt->scp_rqbd_timeout,
+					     ptlrpc_retry_rqbds, svcpt);
+	/* the watchdog is disabled for as long as this thread waits */
+	lc_watchdog_disable(thread->t_watchdog);
+
+	cond_resched();
+	/* sleep until told to stop or until some kind of work shows up */
+	l_wait_event_exclusive_head(svcpt->scp_waitq,
+				ptlrpc_thread_stopping(thread) ||
+				ptlrpc_server_request_incoming(svcpt) ||
+				ptlrpc_server_request_pending(svcpt, false) ||
+				ptlrpc_rqbd_pending(svcpt) ||
+				ptlrpc_at_check(svcpt), &lwi);
+	/* returns -EINTR when stopping, 0 when there may be work to do */
+	if (ptlrpc_thread_stopping(thread))
+		return -EINTR;
+
+	lc_watchdog_touch(thread->t_watchdog,
+			  ptlrpc_server_get_timeout(svcpt));
+	return 0;
+}
+
+/**
+ * Main thread body for service threads.
+ * Sets up the per-thread environment, then loops waiting for work:
+ * incoming requests, queued requests, request buffers to repost and
+ * early-reply checks.  Returns the exit rc, also stored in thread->t_id.
+ */
+static int ptlrpc_main(void *arg)
+{
+	struct ptlrpc_thread		*thread = (struct ptlrpc_thread *)arg;
+	struct ptlrpc_service_part	*svcpt = thread->t_svcpt;
+	struct ptlrpc_service		*svc = svcpt->scp_service;
+	struct ptlrpc_reply_state	*rs;
+#ifdef WITH_GROUP_INFO
+	group_info_t *ginfo = NULL;
+#endif
+	struct lu_env *env;
+	int counter = 0, rc = 0;
+	ENTRY;
+
+	thread->t_pid = current_pid();
+	unshare_fs_struct();
+
+	/* NB: we will call cfs_cpt_bind() for all threads, because we
+	 * might want to run lustre server only on a subset of system CPUs,
+	 * in that case ->scp_cpt is CFS_CPT_ANY */
+	rc = cfs_cpt_bind(svc->srv_cptable, svcpt->scp_cpt);
+	if (rc != 0) {
+		CWARN("%s: failed to bind %s on CPT %d\n",
+		      svc->srv_name, thread->t_name, svcpt->scp_cpt);
+	}
+
+#ifdef WITH_GROUP_INFO
+	ginfo = groups_alloc(0);
+	if (!ginfo) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	set_current_groups(ginfo);
+	put_group_info(ginfo);
+#endif
+
+	if (svc->srv_ops.so_thr_init != NULL) {
+		rc = svc->srv_ops.so_thr_init(thread);
+		if (rc)
+			goto out;
+	}
+
+	OBD_ALLOC_PTR(env);
+	if (env == NULL) {
+		rc = -ENOMEM;
+		goto out_srv_fini;
+	}
+
+	rc = lu_context_init(&env->le_ctx,
+			     svc->srv_ctx_tags|LCT_REMEMBER|LCT_NOREF);
+	if (rc)
+		goto out_srv_fini;
+
+	thread->t_env = env;
+	env->le_ctx.lc_thread = thread;
+	env->le_ctx.lc_cookie = 0x6;
+	/* post every idle request buffer; a posting error aborts start-up */
+	while (!list_empty(&svcpt->scp_rqbd_idle)) {
+		rc = ptlrpc_server_post_idle_rqbds(svcpt);
+		if (rc >= 0)
+			continue;
+
+		CERROR("Failed to post rqbd for %s on CPT %d: %d\n",
+			svc->srv_name, svcpt->scp_cpt, rc);
+		goto out_srv_fini;
+	}
+
+	/* Alloc reply state structure for this one */
+	OBD_ALLOC_LARGE(rs, svc->srv_max_reply_size);
+	if (!rs) {
+		rc = -ENOMEM;
+		goto out_srv_fini;
+	}
+
+	spin_lock(&svcpt->scp_lock);
+
+	LASSERT(thread_is_starting(thread));
+	thread_clear_flags(thread, SVC_STARTING);
+
+	LASSERT(svcpt->scp_nthrs_starting == 1);
+	svcpt->scp_nthrs_starting--;
+
+	/* SVC_STOPPING may already be set here if someone else is trying
+	 * to stop the service while this new thread has been dynamically
+	 * forked. We still set SVC_RUNNING to let our creator know that
+	 * we are now running, however we will exit as soon as possible */
+	thread_add_flags(thread, SVC_RUNNING);
+	svcpt->scp_nthrs_running++;
+	spin_unlock(&svcpt->scp_lock);
+
+	/* wake up our creator in case he's still waiting. */
+	wake_up(&thread->t_ctl_waitq);
+
+	thread->t_watchdog = lc_watchdog_add(ptlrpc_server_get_timeout(svcpt),
+					     NULL, NULL);
+	/* put the reply state allocated above on the idle list */
+	spin_lock(&svcpt->scp_rep_lock);
+	list_add(&rs->rs_list, &svcpt->scp_rep_idle);
+	wake_up(&svcpt->scp_rep_waitq);
+	spin_unlock(&svcpt->scp_rep_lock);
+
+	CDEBUG(D_NET, "service thread %d (#%d) started\n", thread->t_id,
+	       svcpt->scp_nthrs_running);
+
+	/* XXX maintain a list of all managed devices: insert here */
+	while (!ptlrpc_thread_stopping(thread)) {
+		if (ptlrpc_wait_event(svcpt, thread))
+			break;
+
+		ptlrpc_check_rqbd_pool(svcpt);
+
+		if (ptlrpc_threads_need_create(svcpt)) {
+			/* Ignore return code - we tried... */
+			ptlrpc_start_thread(svcpt, 0);
+		}
+
+		/* Process all incoming reqs before handling any */
+		if (ptlrpc_server_request_incoming(svcpt)) {
+			lu_context_enter(&env->le_ctx);
+			ptlrpc_server_handle_req_in(svcpt, thread);
+			lu_context_exit(&env->le_ctx);
+
+			/* but limit ourselves in case of flood */
+			if (counter++ < 100)
+				continue;
+			counter = 0;
+		}
+
+		if (ptlrpc_at_check(svcpt))
+			ptlrpc_at_check_timed(svcpt);
+
+		if (ptlrpc_server_request_pending(svcpt, false)) {
+			lu_context_enter(&env->le_ctx);
+			ptlrpc_server_handle_request(svcpt, thread);
+			lu_context_exit(&env->le_ctx);
+		}
+
+		if (ptlrpc_rqbd_pending(svcpt) &&
+		    ptlrpc_server_post_idle_rqbds(svcpt) < 0) {
+			/* I just failed to repost request buffers.
+			 * Wait for a timeout (unless something else
+			 * happens) before I try again */
+			svcpt->scp_rqbd_timeout = cfs_time_seconds(1) / 10;
+			CDEBUG(D_RPCTRACE, "Posted buffers: %d\n",
+			       svcpt->scp_nrqbds_posted);
+		}
+	}
+
+	lc_watchdog_delete(thread->t_watchdog);
+	thread->t_watchdog = NULL;
+
+out_srv_fini:
+	/*
+	 * deconstruct service specific state created by ptlrpc_start_thread()
+	 */
+	if (svc->srv_ops.so_thr_done != NULL)
+		svc->srv_ops.so_thr_done(thread);
+
+	if (env != NULL) {
+		lu_context_fini(&env->le_ctx);
+		OBD_FREE_PTR(env);
+	}
+out:
+	CDEBUG(D_RPCTRACE, "service thread [ %p : %u ] %d exiting: rc %d\n",
+	       thread, thread->t_pid, thread->t_id, rc);
+
+	spin_lock(&svcpt->scp_lock);
+	if (thread_test_and_clear_flags(thread, SVC_STARTING))
+		svcpt->scp_nthrs_starting--;
+
+	if (thread_test_and_clear_flags(thread, SVC_RUNNING)) {
+		/* must know immediately */
+		svcpt->scp_nthrs_running--;
+	}
+	/* t_id is reused to carry the exit code once the thread has stopped */
+	thread->t_id = rc;
+	thread_add_flags(thread, SVC_STOPPED);
+
+	wake_up(&thread->t_ctl_waitq);
+	spin_unlock(&svcpt->scp_lock);
+
+	return rc;
+}
+
+static int hrt_dont_sleep(struct ptlrpc_hr_thread *hrt,
+			  struct list_head *replies)
+{
+	int wake;
+
+	spin_lock(&hrt->hrt_lock);
+	/* hand everything queued so far over to the caller's list */
+	list_splice_init(&hrt->hrt_queue, replies);
+	wake = !list_empty(replies) || ptlrpc_hr.hr_stopping;
+	spin_unlock(&hrt->hrt_lock);
+
+	return wake;
+}
+
+/**
+ * Thread body of a reply-handling ("hr") thread.
+ * Drains reply states queued on its ptlrpc_hr_thread until hr_stopping is set.
+ */
+static int ptlrpc_hr_main(void *arg)
+{
+	struct ptlrpc_hr_thread		*self = arg;
+	struct ptlrpc_hr_partition	*part = self->hrt_partition;
+	struct ptlrpc_reply_state	*reply;
+	LIST_HEAD			(pending);
+	char				name[20];
+	int				err;
+
+	snprintf(name, sizeof(name), "ptlrpc_hr%02d_%03d",
+		 part->hrp_cpt, self->hrt_id);
+	unshare_fs_struct();
+
+	err = cfs_cpt_bind(ptlrpc_hr.hr_cpt_table, part->hrp_cpt);
+	if (err != 0) {
+		CWARN("Failed to bind %s on CPT %d of CPT table %p: rc = %d\n",
+		      name, part->hrp_cpt, ptlrpc_hr.hr_cpt_table, err);
+	}
+
+	atomic_inc(&part->hrp_nstarted);
+	wake_up(&ptlrpc_hr.hr_waitq);
+
+	while (!ptlrpc_hr.hr_stopping) {
+		l_wait_condition(self->hrt_waitq,
+				 hrt_dont_sleep(self, &pending));
+
+		/* work from the tail of the private list until it is empty */
+		while (!list_empty(&pending)) {
+			reply = list_entry(pending.prev,
+					   struct ptlrpc_reply_state, rs_list);
+			list_del_init(&reply->rs_list);
+			ptlrpc_handle_rs(reply);
+		}
+	}
+
+	atomic_inc(&part->hrp_nstopped);
+	wake_up(&ptlrpc_hr.hr_waitq);
+
+	return 0;
+}
+
+static void ptlrpc_stop_hr_threads(void)
+{
+	struct ptlrpc_hr_partition	*part;
+	int				cpt;
+	int				n;
+
+	ptlrpc_hr.hr_stopping = 1;
+	/* wake every hr thread so that it can notice hr_stopping */
+	cfs_percpt_for_each(part, cpt, ptlrpc_hr.hr_partitions) {
+		if (part->hrp_thrs != NULL) {	/* else: uninitialized */
+			for (n = 0; n < part->hrp_nthrs; n++)
+				wake_up_all(&part->hrp_thrs[n].hrt_waitq);
+		}
+	}
+	/* then wait until every started thread has counted itself stopped */
+	cfs_percpt_for_each(part, cpt, ptlrpc_hr.hr_partitions) {
+		if (part->hrp_thrs != NULL) {	/* else: uninitialized */
+			wait_event(ptlrpc_hr.hr_waitq,
+				   atomic_read(&part->hrp_nstopped) ==
+				   atomic_read(&part->hrp_nstarted));
+		}
+	}
+}
+
+static int ptlrpc_start_hr_threads(void)
+{
+	struct ptlrpc_hr_partition	*hrp;
+	struct task_struct		*task;
+	int				rc = 0;
+	int				i;
+	int				j;
+	ENTRY;
+	/* start hrp_nthrs threads per partition; stop at the first failure */
+	cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
+		for (j = 0; j < hrp->hrp_nthrs; j++) {
+			struct ptlrpc_hr_thread *hrt = &hrp->hrp_thrs[j];
+			/* test the pointer, not an int-truncated copy of it */
+			task = kthread_run(ptlrpc_hr_main, hrt,
+					   "ptlrpc_hr%02d_%03d",
+					   hrp->hrp_cpt, hrt->hrt_id);
+			rc = IS_ERR(task) ? PTR_ERR(task) : 0;
+			if (rc != 0)
+				break;
+		}
+		wait_event(ptlrpc_hr.hr_waitq,	/* j threads were created */
+			   atomic_read(&hrp->hrp_nstarted) == j);
+		if (rc == 0)
+			continue;
+
+		CERROR("Reply handling thread %d:%d Failed on starting: "
+		       "rc = %d\n", i, j, rc);
+		ptlrpc_stop_hr_threads();
+		RETURN(rc);
+	}
+	RETURN(0);
+}
+
+static void ptlrpc_svcpt_stop_threads(struct ptlrpc_service_part *svcpt)
+{
+	struct l_wait_info	lwi = { 0 };
+	struct ptlrpc_thread	*thread;
+	LIST_HEAD		(zombie);
+
+	ENTRY;
+
+	CDEBUG(D_INFO, "Stopping threads for service %s\n",
+	       svcpt->scp_service->srv_name);
+
+	spin_lock(&svcpt->scp_lock);
+	/* let the thread know that we would like it to stop asap */
+	list_for_each_entry(thread, &svcpt->scp_threads, t_link) {
+		CDEBUG(D_INFO, "Stopping thread %s #%u\n",
+		       svcpt->scp_service->srv_thread_name, thread->t_id);
+		thread_add_flags(thread, SVC_STOPPING);
+	}
+
+	wake_up_all(&svcpt->scp_waitq);
+	/* move stopped threads to zombie; wait, unlocked, for the others */
+	while (!list_empty(&svcpt->scp_threads)) {
+		thread = list_entry(svcpt->scp_threads.next,
+					struct ptlrpc_thread, t_link);
+		if (thread_is_stopped(thread)) {
+			list_del(&thread->t_link);
+			list_add(&thread->t_link, &zombie);
+			continue;
+		}
+		spin_unlock(&svcpt->scp_lock);
+
+		CDEBUG(D_INFO, "waiting for stopping-thread %s #%u\n",
+		       svcpt->scp_service->srv_thread_name, thread->t_id);
+		l_wait_event(thread->t_ctl_waitq,
+			     thread_is_stopped(thread), &lwi);
+
+		spin_lock(&svcpt->scp_lock);
+	}
+
+	spin_unlock(&svcpt->scp_lock);
+	/* the zombie list is private to this function: free without lock */
+	while (!list_empty(&zombie)) {
+		thread = list_entry(zombie.next,
+					struct ptlrpc_thread, t_link);
+		list_del(&thread->t_link);
+		OBD_FREE_PTR(thread);
+	}
+	EXIT;
+}
+
+/**
+ * Stop the threads of every partition of service \a svc.
+ */
+void ptlrpc_stop_all_threads(struct ptlrpc_service *svc)
+{
+	struct ptlrpc_service_part *part;
+	int			   idx;
+	ENTRY;
+
+	ptlrpc_service_for_each_part(part, idx, svc) {
+		if (part->scp_service == NULL)
+			continue;	/* no owning service: skip */
+		ptlrpc_svcpt_stop_threads(part);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_stop_all_threads);
+
+int ptlrpc_start_threads(struct ptlrpc_service *svc)
+{
+	int	cpt;
+	int	nthr;
+	int	err = 0;
+	ENTRY;
+
+	/* We require 2 threads min, see note in ptlrpc_server_handle_request */
+	LASSERT(svc->srv_nthrs_cpt_init >= PTLRPC_NTHRS_INIT);
+
+	/* start srv_nthrs_cpt_init threads on each partition, waiting for
+	 * each start to complete before the next one is attempted */
+	for (cpt = 0; cpt < svc->srv_ncpts; cpt++) {
+		for (nthr = 0; nthr < svc->srv_nthrs_cpt_init; nthr++) {
+			err = ptlrpc_start_thread(svc->srv_parts[cpt], 1);
+			/* We have enough threads, don't start more. b=15759 */
+			if (err == -EMFILE)
+				break;
+			if (err != 0)
+				goto failed;
+		}
+	}
+	RETURN(0);
+
+ failed:
+	CERROR("cannot start %s thread #%d_%d: rc %d\n",
+	       svc->srv_thread_name, cpt, nthr, err);
+	ptlrpc_stop_all_threads(svc);
+	RETURN(err);
+}
+EXPORT_SYMBOL(ptlrpc_start_threads);
+
+/**
+ * Start one service thread on \a svcpt.  If \a wait is set, block until the
+ * thread is running or stopped.  Returns 0, or -ESRCH (service stopping),
+ * -EMFILE (thread limit), -EAGAIN (start race, !wait) or another -errno.
+ */
+int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait)
+{
+	struct l_wait_info	lwi = { 0 };
+	struct ptlrpc_thread	*thread;
+	struct ptlrpc_service	*svc;
+	struct task_struct	*task;
+	int			rc;
+	ENTRY;
+
+	LASSERT(svcpt != NULL);
+	svc = svcpt->scp_service;
+	CDEBUG(D_RPCTRACE, "%s[%d] started %d min %d max %d\n",
+	       svc->srv_name, svcpt->scp_cpt, svcpt->scp_nthrs_running,
+	       svc->srv_nthrs_cpt_init, svc->srv_nthrs_cpt_limit);
+
+ again:
+	if (unlikely(svc->srv_is_stopping))
+		RETURN(-ESRCH);
+
+	if (!ptlrpc_threads_increasable(svcpt) ||
+	    (OBD_FAIL_CHECK(OBD_FAIL_TGT_TOOMANY_THREADS) &&
+	     svcpt->scp_nthrs_running == svc->srv_nthrs_cpt_init - 1))
+		RETURN(-EMFILE);
+
+	OBD_CPT_ALLOC_PTR(thread, svc->srv_cptable, svcpt->scp_cpt);
+	if (thread == NULL)
+		RETURN(-ENOMEM);
+	init_waitqueue_head(&thread->t_ctl_waitq);
+	spin_lock(&svcpt->scp_lock);
+	if (!ptlrpc_threads_increasable(svcpt)) {
+		spin_unlock(&svcpt->scp_lock);
+		OBD_FREE_PTR(thread);
+		RETURN(-EMFILE);
+	}
+
+	if (svcpt->scp_nthrs_starting != 0) {
+		/* serialize starting because some modules (obdfilter)
+		 * might require unique and contiguous t_id */
+		LASSERT(svcpt->scp_nthrs_starting == 1);
+		spin_unlock(&svcpt->scp_lock);
+		OBD_FREE_PTR(thread);
+		if (wait) {
+			CDEBUG(D_INFO, "Waiting for creating thread %s #%d\n",
+			       svc->srv_thread_name, svcpt->scp_thr_nextid);
+			schedule();
+			goto again;
+		}
+
+		CDEBUG(D_INFO, "Creating thread %s #%d race, retry later\n",
+		       svc->srv_thread_name, svcpt->scp_thr_nextid);
+		RETURN(-EAGAIN);
+	}
+
+	svcpt->scp_nthrs_starting++;
+	thread->t_id = svcpt->scp_thr_nextid++;
+	thread_add_flags(thread, SVC_STARTING);
+	thread->t_svcpt = svcpt;
+	list_add(&thread->t_link, &svcpt->scp_threads);
+	spin_unlock(&svcpt->scp_lock);
+
+	if (svcpt->scp_cpt >= 0)
+		snprintf(thread->t_name, PTLRPC_THR_NAME_LEN, "%s%02d_%03d",
+			 svc->srv_thread_name, svcpt->scp_cpt, thread->t_id);
+	else
+		snprintf(thread->t_name, PTLRPC_THR_NAME_LEN, "%s_%04d",
+			 svc->srv_thread_name, thread->t_id);
+
+	CDEBUG(D_RPCTRACE, "starting thread '%s'\n", thread->t_name);
+	task = kthread_run(ptlrpc_main, thread, thread->t_name);
+	if (IS_ERR(task)) {	/* check the pointer, not a truncated int */
+		rc = PTR_ERR(task);
+		CERROR("cannot start thread '%s': rc %d\n",
+		       thread->t_name, rc);
+		spin_lock(&svcpt->scp_lock);
+		list_del(&thread->t_link);
+		--svcpt->scp_nthrs_starting;
+		spin_unlock(&svcpt->scp_lock);
+		OBD_FREE(thread, sizeof(*thread));
+		RETURN(rc);
+	}
+
+	if (!wait)
+		RETURN(0);
+
+	l_wait_event(thread->t_ctl_waitq,
+		     thread_is_running(thread) || thread_is_stopped(thread),
+		     &lwi);
+	rc = thread_is_stopped(thread) ? thread->t_id : 0;
+	RETURN(rc);
+}
+
+/**
+ * Set up the per-CPT reply-handling partitions and start their threads.
+ * Returns 0 or -errno; failures after allocation call ptlrpc_hr_fini().
+ */
+int ptlrpc_hr_init(void)
+{
+	struct ptlrpc_hr_partition	*hrp;
+	struct ptlrpc_hr_thread		*hrt;
+	int				rc;
+	int				i;
+	int				j;
+	ENTRY;
+
+	memset(&ptlrpc_hr, 0, sizeof(ptlrpc_hr));
+	ptlrpc_hr.hr_cpt_table = cfs_cpt_table;
+
+	ptlrpc_hr.hr_partitions = cfs_percpt_alloc(ptlrpc_hr.hr_cpt_table,
+						   sizeof(*hrp));
+	if (ptlrpc_hr.hr_partitions == NULL)
+		RETURN(-ENOMEM);
+
+	init_waitqueue_head(&ptlrpc_hr.hr_waitq);
+	cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
+		hrp->hrp_cpt = i;
+		atomic_set(&hrp->hrp_nstarted, 0);
+		atomic_set(&hrp->hrp_nstopped, 0);
+
+		hrp->hrp_nthrs = cfs_cpt_weight(ptlrpc_hr.hr_cpt_table, i);
+		hrp->hrp_nthrs /= cfs_cpu_ht_nsiblings(0);
+		if (hrp->hrp_nthrs < 1)	/* CPT smaller than one HT group */
+			hrp->hrp_nthrs = 1;
+		OBD_CPT_ALLOC(hrp->hrp_thrs, ptlrpc_hr.hr_cpt_table, i,
+			      hrp->hrp_nthrs * sizeof(*hrt));
+		if (hrp->hrp_thrs == NULL)
+			GOTO(out, rc = -ENOMEM);
+		for (j = 0; j < hrp->hrp_nthrs; j++) {
+			hrt = &hrp->hrp_thrs[j];
+			hrt->hrt_id = j;
+			hrt->hrt_partition = hrp;
+			init_waitqueue_head(&hrt->hrt_waitq);
+			spin_lock_init(&hrt->hrt_lock);
+			INIT_LIST_HEAD(&hrt->hrt_queue);
+		}
+	}
+
+	rc = ptlrpc_start_hr_threads();
+out:
+	if (rc != 0)
+		ptlrpc_hr_fini();
+	RETURN(rc);
+}
+
+void ptlrpc_hr_fini(void)
+{
+	struct ptlrpc_hr_partition	*part;
+	int				cpt;
+
+	/* no partition table: nothing to stop or free */
+	if (ptlrpc_hr.hr_partitions == NULL)
+		return;
+
+	ptlrpc_stop_hr_threads();
+	cfs_percpt_for_each(part, cpt, ptlrpc_hr.hr_partitions) {
+		if (part->hrp_thrs == NULL)
+			continue;
+		OBD_FREE(part->hrp_thrs,
+			 part->hrp_nthrs * sizeof(*part->hrp_thrs));
+	}
+
+	cfs_percpt_free(ptlrpc_hr.hr_partitions);
+	ptlrpc_hr.hr_partitions = NULL;
+}
+
+
+/**
+ * Wait until scp_nreps_difficult has dropped to zero.
+ */
+static void ptlrpc_wait_replies(struct ptlrpc_service_part *svcpt)
+{
+	for (;;) {
+		struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(10),
+						     NULL, NULL);
+		int rc = l_wait_event(svcpt->scp_waitq,
+		     atomic_read(&svcpt->scp_nreps_difficult) == 0, &lwi);
+
+		/* a non-zero rc means the wait ended early: warn and retry */
+		if (rc == 0)
+			return;
+		CWARN("Unexpectedly long timeout %s %p\n",
+		      svcpt->scp_service->srv_name, svcpt->scp_service);
+	}
+}
+
+static void ptlrpc_service_del_atimer(struct ptlrpc_service *svc)
+{
+	struct ptlrpc_service_part	*part;
+	int				idx;
+
+	/* early disarm AT timer... */
+	ptlrpc_service_for_each_part(part, idx, svc) {
+		if (part->scp_service == NULL)
+			continue;
+		cfs_timer_disarm(&part->scp_at_timer);
+	}
+}
+
+static void
+ptlrpc_service_unlink_rqbd(struct ptlrpc_service *svc)
+{
+	struct ptlrpc_service_part	  *svcpt;
+	struct ptlrpc_request_buffer_desc *rqbd;
+	struct l_wait_info		  lwi;
+	int				  rc;
+	int				  i;
+
+	/* All history will be culled when the next request buffer is
+	 * freed in ptlrpc_service_purge_all() */
+	svc->srv_hist_nrqbds_cpt_max = 0;
+
+	rc = LNetClearLazyPortal(svc->srv_req_portal);
+	LASSERT(rc == 0);
+	/* pass 1: unlink every posted buffer; stops at a part w/o service */
+	ptlrpc_service_for_each_part(svcpt, i, svc) {
+		if (svcpt->scp_service == NULL)
+			break;
+
+		/* Unlink all the request buffers.  This forces a 'final'
+		 * event with its 'unlink' flag set for each posted rqbd */
+		list_for_each_entry(rqbd, &svcpt->scp_rqbd_posted,
+					rqbd_list) {
+			rc = LNetMDUnlink(rqbd->rqbd_md_h);
+			LASSERT(rc == 0 || rc == -ENOENT);
+		}
+	}
+	/* pass 2: wait, per partition, until no buffer is posted any more */
+	ptlrpc_service_for_each_part(svcpt, i, svc) {
+		if (svcpt->scp_service == NULL)
+			break;
+
+		/* Wait for the network to release any buffers
+		 * it's currently filling */
+		spin_lock(&svcpt->scp_lock);
+		while (svcpt->scp_nrqbds_posted != 0) {
+			spin_unlock(&svcpt->scp_lock);
+			/* Network access will complete in finite time but
+			 * the HUGE timeout lets us CWARN for visibility
+			 * of sluggish NALs */
+			lwi = LWI_TIMEOUT_INTERVAL(
+					cfs_time_seconds(LONG_UNLINK),
+					cfs_time_seconds(1), NULL, NULL);
+			rc = l_wait_event(svcpt->scp_waitq,
+					  svcpt->scp_nrqbds_posted == 0, &lwi);
+			if (rc == -ETIMEDOUT) {
+				CWARN("Service %s waiting for "
+				      "request buffers\n",
+				      svcpt->scp_service->srv_name);
+			}
+			spin_lock(&svcpt->scp_lock);
+		}
+		spin_unlock(&svcpt->scp_lock);
+	}
+}
+/* Flush pending replies and requests of svc, then free its idle buffers. */
+static void
+ptlrpc_service_purge_all(struct ptlrpc_service *svc)
+{
+	struct ptlrpc_service_part		*svcpt;
+	struct ptlrpc_request_buffer_desc	*rqbd;
+	struct ptlrpc_request			*req;
+	struct ptlrpc_reply_state		*rs;
+	int					i;
+
+	ptlrpc_service_for_each_part(svcpt, i, svc) {
+		if (svcpt->scp_service == NULL)
+			break;
+		/* NOTE(review): assumes scheduling takes rs off the list */
+		spin_lock(&svcpt->scp_rep_lock);
+		while (!list_empty(&svcpt->scp_rep_active)) {
+			rs = list_entry(svcpt->scp_rep_active.next,
+					    struct ptlrpc_reply_state, rs_list);
+			spin_lock(&rs->rs_lock);
+			ptlrpc_schedule_difficult_reply(rs);
+			spin_unlock(&rs->rs_lock);
+		}
+		spin_unlock(&svcpt->scp_rep_lock);
+
+		/* Purge the request queue.  NB no new replies (rqbds
+		 * all unlinked) and no service threads, so this is the
+		 * only thread touching the request queue now */
+		while (!list_empty(&svcpt->scp_req_incoming)) {
+			req = list_entry(svcpt->scp_req_incoming.next,
+					     struct ptlrpc_request, rq_list);
+
+			list_del(&req->rq_list);
+			svcpt->scp_nreqs_incoming--;
+			ptlrpc_server_finish_request(svcpt, req);
+		}
+		/* Then drain the requests still reported as pending */
+		while (ptlrpc_server_request_pending(svcpt, true)) {
+			req = ptlrpc_server_request_get(svcpt, true);
+			ptlrpc_server_finish_active_request(svcpt, req);
+		}
+
+		LASSERT(list_empty(&svcpt->scp_rqbd_posted));
+		LASSERT(svcpt->scp_nreqs_incoming == 0);
+		LASSERT(svcpt->scp_nreqs_active == 0);
+		/* history should have been culled by
+		 * ptlrpc_server_finish_request */
+		LASSERT(svcpt->scp_hist_nrqbds == 0);
+
+		/* Now free all the request buffers since nothing
+		 * references them any more... */
+
+		while (!list_empty(&svcpt->scp_rqbd_idle)) {
+			rqbd = list_entry(svcpt->scp_rqbd_idle.next,
+					      struct ptlrpc_request_buffer_desc,
+					      rqbd_list);
+			ptlrpc_free_rqbd(rqbd);
+		}
+		ptlrpc_wait_replies(svcpt);
+
+		while (!list_empty(&svcpt->scp_rep_idle)) {
+			rs = list_entry(svcpt->scp_rep_idle.next,
+					    struct ptlrpc_reply_state,
+					    rs_list);
+			list_del(&rs->rs_list);
+			OBD_FREE_LARGE(rs, svc->srv_max_reply_size);
+		}
+	}
+}
+
+/* Free all memory of svc: the scp_at_array buffers of its partitions, the
+ * partitions themselves, the srv_cpts values and finally svc itself. */
+static void ptlrpc_service_free(struct ptlrpc_service *svc)
+{
+	struct ptlrpc_at_array		*array;
+	struct ptlrpc_service_part	*svcpt;
+	int				idx;
+
+	ptlrpc_service_for_each_part(svcpt, idx, svc) {
+		if (!svcpt->scp_service)
+			break;
+
+		/* Disarm again in case the timer was re-armed meanwhile */
+		cfs_timer_disarm(&svcpt->scp_at_timer);
+		array = &svcpt->scp_at_array;
+		if (array->paa_reqs_array) {
+			OBD_FREE(array->paa_reqs_array,
+				 array->paa_size * sizeof(struct list_head));
+			array->paa_reqs_array = NULL;
+		}
+		if (array->paa_reqs_count) {
+			OBD_FREE(array->paa_reqs_count,
+				 array->paa_size * sizeof(__u32));
+			array->paa_reqs_count = NULL;
+		}
+	}
+
+	/* No scp_service test: free every partition the iterator yields */
+	ptlrpc_service_for_each_part(svcpt, idx, svc)
+		OBD_FREE_PTR(svcpt);
+
+	if (svc->srv_cpts)
+		cfs_expr_list_values_free(svc->srv_cpts, svc->srv_ncpts);
+
+	OBD_FREE(svc, offsetof(struct ptlrpc_service,
+			       srv_parts[svc->srv_ncpts]));
+}
+/* Tear down and free service; the function's only exit is RETURN(0). */
+int ptlrpc_unregister_service(struct ptlrpc_service *service)
+{
+	ENTRY;
+
+	CDEBUG(D_NET, "%s: tearing down\n", service->srv_name);
+
+	service->srv_is_stopping = 1;
+
+	mutex_lock(&ptlrpc_all_services_mutex);
+	list_del_init(&service->srv_list);
+	mutex_unlock(&ptlrpc_all_services_mutex);
+
+	ptlrpc_service_del_atimer(service);
+	ptlrpc_stop_all_threads(service);
+	/* Order matters: purge_all assumes no threads and unlinked rqbds */
+	ptlrpc_service_unlink_rqbd(service);
+	ptlrpc_service_purge_all(service);
+	ptlrpc_service_nrs_cleanup(service);
+
+	ptlrpc_lprocfs_unregister_service(service);
+	/* Must come last: ptlrpc_service_free() frees service itself */
+	ptlrpc_service_free(service);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_unregister_service);
+
+/**
+ * Check whether the next queued request of svcpt has been waiting too long.
+ *
+ * The limit is at_max, or obd_timeout * 3 / 2 when AT_OFF.  The result may
+ * govern whether a node needs to be shot, so the check is intentionally
+ * non-aggressive.
+ *
+ * Returns 0 if no pending request is found or it is within the limit, and
+ * -1 (after logging an error) otherwise.
+ */
+int ptlrpc_svcpt_health_check(struct ptlrpc_service_part *svcpt)
+{
+	struct ptlrpc_request	*next = NULL;
+	struct timeval		now;
+	long			waited = 0;
+
+	do_gettimeofday(&now);
+
+	spin_lock(&svcpt->scp_req_lock);
+	if (ptlrpc_server_high_pending(svcpt, true))
+		next = ptlrpc_nrs_req_peek_nolock(svcpt, true);
+	else if (ptlrpc_server_normal_pending(svcpt, true))
+		next = ptlrpc_nrs_req_peek_nolock(svcpt, false);
+
+	/* Sample the arrival time while scp_req_lock is still held */
+	if (next)
+		waited = cfs_timeval_sub(&now, &next->rq_arrival_time, NULL);
+	spin_unlock(&svcpt->scp_req_lock);
+
+	if (!next)
+		return 0;
+	if ((waited / ONE_MILLION) <= (AT_OFF ? obd_timeout * 3 / 2 : at_max))
+		return 0;
+
+	CERROR("%s: unhealthy - request has been waiting %lds\n",
+	       svcpt->scp_service->srv_name, waited / ONE_MILLION);
+	return -1;
+}
+
+/* Run the per-partition health check over svc; returns the first non-zero
+ * result, or 0 when svc is NULL or no partition reports a problem. */
+int ptlrpc_service_health_check(struct ptlrpc_service *svc)
+{
+	struct ptlrpc_service_part	*part;
+	int				idx, rc = 0;
+
+	if (!svc)
+		return 0;
+
+	ptlrpc_service_for_each_part(part, idx, svc) {
+		rc = ptlrpc_svcpt_health_check(part);
+		if (rc)
+			break;
+	}
+	return rc;
+}
+EXPORT_SYMBOL(ptlrpc_service_health_check);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/wirehdr.c b/drivers/staging/lustre/lustre/ptlrpc/wirehdr.c
new file mode 100644
index 000000000000..93bc40b422ee
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/wirehdr.c
@@ -0,0 +1,47 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+# ifdef CONFIG_FS_POSIX_ACL
+#  include <linux/fs.h>
+#  include <linux/posix_acl_xattr.h>
+# endif
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_disk.h>
diff --git a/drivers/staging/lustre/lustre/ptlrpc/wiretest.c b/drivers/staging/lustre/lustre/ptlrpc/wiretest.c
new file mode 100644
index 000000000000..9890bd9cfb93
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/wiretest.c
@@ -0,0 +1,4474 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+# ifdef CONFIG_FS_POSIX_ACL
+#  include <linux/fs.h>
+#  include <linux/posix_acl_xattr.h>
+# endif
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_disk.h>
+void lustre_assert_wire_constants(void)
+{
+	 /* Wire protocol assertions generated by 'wirecheck'
+	  * (make -C lustre/utils newwiretest)
+	  * running on Linux deva 2.6.32.279.lustre #5 SMP Tue Apr 9 22:52:17 CST 2013 x86_64 x86_64 x
+	  * with gcc version 4.4.4 20100726 (Red Hat 4.4.4-13) (GCC)  */
+
+
+	/* Constants... */
+	LASSERTF(PTL_RPC_MSG_REQUEST == 4711, "found %lld\n",
+		 (long long)PTL_RPC_MSG_REQUEST);
+	LASSERTF(PTL_RPC_MSG_ERR == 4712, "found %lld\n",
+		 (long long)PTL_RPC_MSG_ERR);
+	LASSERTF(PTL_RPC_MSG_REPLY == 4713, "found %lld\n",
+		 (long long)PTL_RPC_MSG_REPLY);
+	LASSERTF(MDS_DIR_END_OFF == 0xfffffffffffffffeULL, "found 0x%.16llxULL\n",
+		 MDS_DIR_END_OFF);
+	LASSERTF(DEAD_HANDLE_MAGIC == 0xdeadbeefcafebabeULL, "found 0x%.16llxULL\n",
+		 DEAD_HANDLE_MAGIC);
+	CLASSERT(MTI_NAME_MAXLEN == 64);
+	LASSERTF(OST_REPLY == 0, "found %lld\n",
+		 (long long)OST_REPLY);
+	LASSERTF(OST_GETATTR == 1, "found %lld\n",
+		 (long long)OST_GETATTR);
+	LASSERTF(OST_SETATTR == 2, "found %lld\n",
+		 (long long)OST_SETATTR);
+	LASSERTF(OST_READ == 3, "found %lld\n",
+		 (long long)OST_READ);
+	LASSERTF(OST_WRITE == 4, "found %lld\n",
+		 (long long)OST_WRITE);
+	LASSERTF(OST_CREATE == 5, "found %lld\n",
+		 (long long)OST_CREATE);
+	LASSERTF(OST_DESTROY == 6, "found %lld\n",
+		 (long long)OST_DESTROY);
+	LASSERTF(OST_GET_INFO == 7, "found %lld\n",
+		 (long long)OST_GET_INFO);
+	LASSERTF(OST_CONNECT == 8, "found %lld\n",
+		 (long long)OST_CONNECT);
+	LASSERTF(OST_DISCONNECT == 9, "found %lld\n",
+		 (long long)OST_DISCONNECT);
+	LASSERTF(OST_PUNCH == 10, "found %lld\n",
+		 (long long)OST_PUNCH);
+	LASSERTF(OST_OPEN == 11, "found %lld\n",
+		 (long long)OST_OPEN);
+	LASSERTF(OST_CLOSE == 12, "found %lld\n",
+		 (long long)OST_CLOSE);
+	LASSERTF(OST_STATFS == 13, "found %lld\n",
+		 (long long)OST_STATFS);
+	LASSERTF(OST_SYNC == 16, "found %lld\n",
+		 (long long)OST_SYNC);
+	LASSERTF(OST_SET_INFO == 17, "found %lld\n",
+		 (long long)OST_SET_INFO);
+	LASSERTF(OST_QUOTACHECK == 18, "found %lld\n",
+		 (long long)OST_QUOTACHECK);
+	LASSERTF(OST_QUOTACTL == 19, "found %lld\n",
+		 (long long)OST_QUOTACTL);
+	LASSERTF(OST_QUOTA_ADJUST_QUNIT == 20, "found %lld\n",
+		 (long long)OST_QUOTA_ADJUST_QUNIT);
+	LASSERTF(OST_LAST_OPC == 21, "found %lld\n",
+		 (long long)OST_LAST_OPC);
+	LASSERTF(OBD_OBJECT_EOF == 0xffffffffffffffffULL, "found 0x%.16llxULL\n",
+		 OBD_OBJECT_EOF);
+	LASSERTF(OST_MIN_PRECREATE == 32, "found %lld\n",
+		 (long long)OST_MIN_PRECREATE);
+	LASSERTF(OST_MAX_PRECREATE == 20000, "found %lld\n",
+		 (long long)OST_MAX_PRECREATE);
+	LASSERTF(OST_LVB_ERR_INIT == 0xffbadbad80000000ULL, "found 0x%.16llxULL\n",
+		 OST_LVB_ERR_INIT);
+	LASSERTF(OST_LVB_ERR_MASK == 0xffbadbad00000000ULL, "found 0x%.16llxULL\n",
+		 OST_LVB_ERR_MASK);
+	LASSERTF(MDS_FIRST_OPC == 33, "found %lld\n",
+		 (long long)MDS_FIRST_OPC);
+	LASSERTF(MDS_GETATTR == 33, "found %lld\n",
+		 (long long)MDS_GETATTR);
+	LASSERTF(MDS_GETATTR_NAME == 34, "found %lld\n",
+		 (long long)MDS_GETATTR_NAME);
+	LASSERTF(MDS_CLOSE == 35, "found %lld\n",
+		 (long long)MDS_CLOSE);
+	LASSERTF(MDS_REINT == 36, "found %lld\n",
+		 (long long)MDS_REINT);
+	LASSERTF(MDS_READPAGE == 37, "found %lld\n",
+		 (long long)MDS_READPAGE);
+	LASSERTF(MDS_CONNECT == 38, "found %lld\n",
+		 (long long)MDS_CONNECT);
+	LASSERTF(MDS_DISCONNECT == 39, "found %lld\n",
+		 (long long)MDS_DISCONNECT);
+	LASSERTF(MDS_GETSTATUS == 40, "found %lld\n",
+		 (long long)MDS_GETSTATUS);
+	LASSERTF(MDS_STATFS == 41, "found %lld\n",
+		 (long long)MDS_STATFS);
+	LASSERTF(MDS_PIN == 42, "found %lld\n",
+		 (long long)MDS_PIN);
+	LASSERTF(MDS_UNPIN == 43, "found %lld\n",
+		 (long long)MDS_UNPIN);
+	LASSERTF(MDS_SYNC == 44, "found %lld\n",
+		 (long long)MDS_SYNC);
+	LASSERTF(MDS_DONE_WRITING == 45, "found %lld\n",
+		 (long long)MDS_DONE_WRITING);
+	LASSERTF(MDS_SET_INFO == 46, "found %lld\n",
+		 (long long)MDS_SET_INFO);
+	LASSERTF(MDS_QUOTACHECK == 47, "found %lld\n",
+		 (long long)MDS_QUOTACHECK);
+	LASSERTF(MDS_QUOTACTL == 48, "found %lld\n",
+		 (long long)MDS_QUOTACTL);
+	LASSERTF(MDS_GETXATTR == 49, "found %lld\n",
+		 (long long)MDS_GETXATTR);
+	LASSERTF(MDS_SETXATTR == 50, "found %lld\n",
+		 (long long)MDS_SETXATTR);
+	LASSERTF(MDS_WRITEPAGE == 51, "found %lld\n",
+		 (long long)MDS_WRITEPAGE);
+	LASSERTF(MDS_IS_SUBDIR == 52, "found %lld\n",
+		 (long long)MDS_IS_SUBDIR);
+	LASSERTF(MDS_GET_INFO == 53, "found %lld\n",
+		 (long long)MDS_GET_INFO);
+	LASSERTF(MDS_HSM_STATE_GET == 54, "found %lld\n",
+		 (long long)MDS_HSM_STATE_GET);
+	LASSERTF(MDS_HSM_STATE_SET == 55, "found %lld\n",
+		 (long long)MDS_HSM_STATE_SET);
+	LASSERTF(MDS_HSM_ACTION == 56, "found %lld\n",
+		 (long long)MDS_HSM_ACTION);
+	LASSERTF(MDS_HSM_PROGRESS == 57, "found %lld\n",
+		 (long long)MDS_HSM_PROGRESS);
+	LASSERTF(MDS_HSM_REQUEST == 58, "found %lld\n",
+		 (long long)MDS_HSM_REQUEST);
+	LASSERTF(MDS_HSM_CT_REGISTER == 59, "found %lld\n",
+		 (long long)MDS_HSM_CT_REGISTER);
+	LASSERTF(MDS_HSM_CT_UNREGISTER == 60, "found %lld\n",
+		 (long long)MDS_HSM_CT_UNREGISTER);
+	LASSERTF(MDS_SWAP_LAYOUTS == 61, "found %lld\n",
+		 (long long)MDS_SWAP_LAYOUTS);
+	LASSERTF(MDS_LAST_OPC == 62, "found %lld\n",
+		 (long long)MDS_LAST_OPC);
+	LASSERTF(REINT_SETATTR == 1, "found %lld\n",
+		 (long long)REINT_SETATTR);
+	LASSERTF(REINT_CREATE == 2, "found %lld\n",
+		 (long long)REINT_CREATE);
+	LASSERTF(REINT_LINK == 3, "found %lld\n",
+		 (long long)REINT_LINK);
+	LASSERTF(REINT_UNLINK == 4, "found %lld\n",
+		 (long long)REINT_UNLINK);
+	LASSERTF(REINT_RENAME == 5, "found %lld\n",
+		 (long long)REINT_RENAME);
+	LASSERTF(REINT_OPEN == 6, "found %lld\n",
+		 (long long)REINT_OPEN);
+	LASSERTF(REINT_SETXATTR == 7, "found %lld\n",
+		 (long long)REINT_SETXATTR);
+	LASSERTF(REINT_RMENTRY == 8, "found %lld\n",
+		 (long long)REINT_RMENTRY);
+	LASSERTF(REINT_MAX == 9, "found %lld\n",
+		 (long long)REINT_MAX);
+	LASSERTF(DISP_IT_EXECD == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)DISP_IT_EXECD);
+	LASSERTF(DISP_LOOKUP_EXECD == 0x00000002UL, "found 0x%.8xUL\n",
+		(unsigned)DISP_LOOKUP_EXECD);
+	LASSERTF(DISP_LOOKUP_NEG == 0x00000004UL, "found 0x%.8xUL\n",
+		(unsigned)DISP_LOOKUP_NEG);
+	LASSERTF(DISP_LOOKUP_POS == 0x00000008UL, "found 0x%.8xUL\n",
+		(unsigned)DISP_LOOKUP_POS);
+	LASSERTF(DISP_OPEN_CREATE == 0x00000010UL, "found 0x%.8xUL\n",
+		(unsigned)DISP_OPEN_CREATE);
+	LASSERTF(DISP_OPEN_OPEN == 0x00000020UL, "found 0x%.8xUL\n",
+		(unsigned)DISP_OPEN_OPEN);
+	LASSERTF(DISP_ENQ_COMPLETE == 0x00400000UL, "found 0x%.8xUL\n",
+		(unsigned)DISP_ENQ_COMPLETE);
+	LASSERTF(DISP_ENQ_OPEN_REF == 0x00800000UL, "found 0x%.8xUL\n",
+		(unsigned)DISP_ENQ_OPEN_REF);
+	LASSERTF(DISP_ENQ_CREATE_REF == 0x01000000UL, "found 0x%.8xUL\n",
+		(unsigned)DISP_ENQ_CREATE_REF);
+	LASSERTF(DISP_OPEN_LOCK == 0x02000000UL, "found 0x%.8xUL\n",
+		(unsigned)DISP_OPEN_LOCK);
+	LASSERTF(MDS_STATUS_CONN == 1, "found %lld\n",
+		 (long long)MDS_STATUS_CONN);
+	LASSERTF(MDS_STATUS_LOV == 2, "found %lld\n",
+		 (long long)MDS_STATUS_LOV);
+	LASSERTF(LUSTRE_BFLAG_UNCOMMITTED_WRITES == 1, "found %lld\n",
+		 (long long)LUSTRE_BFLAG_UNCOMMITTED_WRITES);
+	LASSERTF(MF_SOM_CHANGE == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)MF_SOM_CHANGE);
+	LASSERTF(MF_EPOCH_OPEN == 0x00000002UL, "found 0x%.8xUL\n",
+		(unsigned)MF_EPOCH_OPEN);
+	LASSERTF(MF_EPOCH_CLOSE == 0x00000004UL, "found 0x%.8xUL\n",
+		(unsigned)MF_EPOCH_CLOSE);
+	LASSERTF(MF_MDC_CANCEL_FID1 == 0x00000008UL, "found 0x%.8xUL\n",
+		(unsigned)MF_MDC_CANCEL_FID1);
+	LASSERTF(MF_MDC_CANCEL_FID2 == 0x00000010UL, "found 0x%.8xUL\n",
+		(unsigned)MF_MDC_CANCEL_FID2);
+	LASSERTF(MF_MDC_CANCEL_FID3 == 0x00000020UL, "found 0x%.8xUL\n",
+		(unsigned)MF_MDC_CANCEL_FID3);
+	LASSERTF(MF_MDC_CANCEL_FID4 == 0x00000040UL, "found 0x%.8xUL\n",
+		(unsigned)MF_MDC_CANCEL_FID4);
+	LASSERTF(MF_SOM_AU == 0x00000080UL, "found 0x%.8xUL\n",
+		(unsigned)MF_SOM_AU);
+	LASSERTF(MF_GETATTR_LOCK == 0x00000100UL, "found 0x%.8xUL\n",
+		(unsigned)MF_GETATTR_LOCK);
+	LASSERTF(MDS_ATTR_MODE == 0x0000000000000001ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_MODE);
+	LASSERTF(MDS_ATTR_UID == 0x0000000000000002ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_UID);
+	LASSERTF(MDS_ATTR_GID == 0x0000000000000004ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_GID);
+	LASSERTF(MDS_ATTR_SIZE == 0x0000000000000008ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_SIZE);
+	LASSERTF(MDS_ATTR_ATIME == 0x0000000000000010ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_ATIME);
+	LASSERTF(MDS_ATTR_MTIME == 0x0000000000000020ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_MTIME);
+	LASSERTF(MDS_ATTR_CTIME == 0x0000000000000040ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_CTIME);
+	LASSERTF(MDS_ATTR_ATIME_SET == 0x0000000000000080ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_ATIME_SET);
+	LASSERTF(MDS_ATTR_MTIME_SET == 0x0000000000000100ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_MTIME_SET);
+	LASSERTF(MDS_ATTR_FORCE == 0x0000000000000200ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_FORCE);
+	LASSERTF(MDS_ATTR_ATTR_FLAG == 0x0000000000000400ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_ATTR_FLAG);
+	LASSERTF(MDS_ATTR_KILL_SUID == 0x0000000000000800ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_KILL_SUID);
+	LASSERTF(MDS_ATTR_KILL_SGID == 0x0000000000001000ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_KILL_SGID);
+	LASSERTF(MDS_ATTR_CTIME_SET == 0x0000000000002000ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_CTIME_SET);
+	LASSERTF(MDS_ATTR_FROM_OPEN == 0x0000000000004000ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_FROM_OPEN);
+	LASSERTF(MDS_ATTR_BLOCKS == 0x0000000000008000ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_BLOCKS);
+	LASSERTF(FLD_QUERY == 900, "found %lld\n",
+		 (long long)FLD_QUERY);
+	LASSERTF(FLD_FIRST_OPC == 900, "found %lld\n",
+		 (long long)FLD_FIRST_OPC);
+	LASSERTF(FLD_LAST_OPC == 901, "found %lld\n",
+		 (long long)FLD_LAST_OPC);
+	LASSERTF(SEQ_QUERY == 700, "found %lld\n",
+		 (long long)SEQ_QUERY);
+	LASSERTF(SEQ_FIRST_OPC == 700, "found %lld\n",
+		 (long long)SEQ_FIRST_OPC);
+	LASSERTF(SEQ_LAST_OPC == 701, "found %lld\n",
+		 (long long)SEQ_LAST_OPC);
+	LASSERTF(SEQ_ALLOC_SUPER == 0, "found %lld\n",
+		 (long long)SEQ_ALLOC_SUPER);
+	LASSERTF(SEQ_ALLOC_META == 1, "found %lld\n",
+		 (long long)SEQ_ALLOC_META);
+	LASSERTF(LDLM_ENQUEUE == 101, "found %lld\n",
+		 (long long)LDLM_ENQUEUE);
+	LASSERTF(LDLM_CONVERT == 102, "found %lld\n",
+		 (long long)LDLM_CONVERT);
+	LASSERTF(LDLM_CANCEL == 103, "found %lld\n",
+		 (long long)LDLM_CANCEL);
+	LASSERTF(LDLM_BL_CALLBACK == 104, "found %lld\n",
+		 (long long)LDLM_BL_CALLBACK);
+	LASSERTF(LDLM_CP_CALLBACK == 105, "found %lld\n",
+		 (long long)LDLM_CP_CALLBACK);
+	LASSERTF(LDLM_GL_CALLBACK == 106, "found %lld\n",
+		 (long long)LDLM_GL_CALLBACK);
+	LASSERTF(LDLM_SET_INFO == 107, "found %lld\n",
+		 (long long)LDLM_SET_INFO);
+	LASSERTF(LDLM_LAST_OPC == 108, "found %lld\n",
+		 (long long)LDLM_LAST_OPC);
+	LASSERTF(LCK_MINMODE == 0, "found %lld\n",
+		 (long long)LCK_MINMODE);
+	LASSERTF(LCK_EX == 1, "found %lld\n",
+		 (long long)LCK_EX);
+	LASSERTF(LCK_PW == 2, "found %lld\n",
+		 (long long)LCK_PW);
+	LASSERTF(LCK_PR == 4, "found %lld\n",
+		 (long long)LCK_PR);
+	LASSERTF(LCK_CW == 8, "found %lld\n",
+		 (long long)LCK_CW);
+	LASSERTF(LCK_CR == 16, "found %lld\n",
+		 (long long)LCK_CR);
+	LASSERTF(LCK_NL == 32, "found %lld\n",
+		 (long long)LCK_NL);
+	LASSERTF(LCK_GROUP == 64, "found %lld\n",
+		 (long long)LCK_GROUP);
+	LASSERTF(LCK_COS == 128, "found %lld\n",
+		 (long long)LCK_COS);
+	LASSERTF(LCK_MAXMODE == 129, "found %lld\n",
+		 (long long)LCK_MAXMODE);
+	LASSERTF(LCK_MODE_NUM == 8, "found %lld\n",
+		 (long long)LCK_MODE_NUM);
+	CLASSERT(LDLM_PLAIN == 10);
+	CLASSERT(LDLM_EXTENT == 11);
+	CLASSERT(LDLM_FLOCK == 12);
+	CLASSERT(LDLM_IBITS == 13);
+	CLASSERT(LDLM_MAX_TYPE == 14);
+	CLASSERT(LUSTRE_RES_ID_SEQ_OFF == 0);
+	CLASSERT(LUSTRE_RES_ID_VER_OID_OFF == 1);
+	LASSERTF(UPDATE_OBJ == 1000, "found %lld\n",
+		 (long long)UPDATE_OBJ);
+	LASSERTF(UPDATE_LAST_OPC == 1001, "found %lld\n",
+		 (long long)UPDATE_LAST_OPC);
+	CLASSERT(LUSTRE_RES_ID_QUOTA_SEQ_OFF == 2);
+	CLASSERT(LUSTRE_RES_ID_QUOTA_VER_OID_OFF == 3);
+	CLASSERT(LUSTRE_RES_ID_HSH_OFF == 3);
+	CLASSERT(LQUOTA_TYPE_USR == 0);
+	CLASSERT(LQUOTA_TYPE_GRP == 1);
+	CLASSERT(LQUOTA_RES_MD == 1);
+	CLASSERT(LQUOTA_RES_DT == 2);
+	LASSERTF(OBD_PING == 400, "found %lld\n",
+		 (long long)OBD_PING);
+	LASSERTF(OBD_LOG_CANCEL == 401, "found %lld\n",
+		 (long long)OBD_LOG_CANCEL);
+	LASSERTF(OBD_QC_CALLBACK == 402, "found %lld\n",
+		 (long long)OBD_QC_CALLBACK);
+	LASSERTF(OBD_IDX_READ == 403, "found %lld\n",
+		 (long long)OBD_IDX_READ);
+	LASSERTF(OBD_LAST_OPC == 404, "found %lld\n",
+		 (long long)OBD_LAST_OPC);
+	LASSERTF(QUOTA_DQACQ == 601, "found %lld\n",
+		 (long long)QUOTA_DQACQ);
+	LASSERTF(QUOTA_DQREL == 602, "found %lld\n",
+		 (long long)QUOTA_DQREL);
+	LASSERTF(QUOTA_LAST_OPC == 603, "found %lld\n",
+		 (long long)QUOTA_LAST_OPC);
+	LASSERTF(MGS_CONNECT == 250, "found %lld\n",
+		 (long long)MGS_CONNECT);
+	LASSERTF(MGS_DISCONNECT == 251, "found %lld\n",
+		 (long long)MGS_DISCONNECT);
+	LASSERTF(MGS_EXCEPTION == 252, "found %lld\n",
+		 (long long)MGS_EXCEPTION);
+	LASSERTF(MGS_TARGET_REG == 253, "found %lld\n",
+		 (long long)MGS_TARGET_REG);
+	LASSERTF(MGS_TARGET_DEL == 254, "found %lld\n",
+		 (long long)MGS_TARGET_DEL);
+	LASSERTF(MGS_SET_INFO == 255, "found %lld\n",
+		 (long long)MGS_SET_INFO);
+	LASSERTF(MGS_LAST_OPC == 257, "found %lld\n",
+		 (long long)MGS_LAST_OPC);
+	LASSERTF(SEC_CTX_INIT == 801, "found %lld\n",
+		 (long long)SEC_CTX_INIT);
+	LASSERTF(SEC_CTX_INIT_CONT == 802, "found %lld\n",
+		 (long long)SEC_CTX_INIT_CONT);
+	LASSERTF(SEC_CTX_FINI == 803, "found %lld\n",
+		 (long long)SEC_CTX_FINI);
+	LASSERTF(SEC_LAST_OPC == 804, "found %lld\n",
+		 (long long)SEC_LAST_OPC);
+	/* Sizes and Offsets */
+
+	/* Checks for struct obd_uuid */
+	LASSERTF((int)sizeof(struct obd_uuid) == 40, "found %lld\n",
+		 (long long)(int)sizeof(struct obd_uuid));
+
+	/* Checks for struct lu_seq_range */
+	LASSERTF((int)sizeof(struct lu_seq_range) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct lu_seq_range));
+	LASSERTF((int)offsetof(struct lu_seq_range, lsr_start) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_seq_range, lsr_start));
+	LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_start) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_start));
+	LASSERTF((int)offsetof(struct lu_seq_range, lsr_end) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_seq_range, lsr_end));
+	LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_end) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_end));
+	LASSERTF((int)offsetof(struct lu_seq_range, lsr_index) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_seq_range, lsr_index));
+	LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_index) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_index));
+	LASSERTF((int)offsetof(struct lu_seq_range, lsr_flags) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_seq_range, lsr_flags));
+	LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_flags));
+	LASSERTF(LU_SEQ_RANGE_MDT == 0, "found %lld\n",
+		 (long long)LU_SEQ_RANGE_MDT);
+	LASSERTF(LU_SEQ_RANGE_OST == 1, "found %lld\n",
+		 (long long)LU_SEQ_RANGE_OST);
+
+	/* Checks for struct lustre_mdt_attrs */
+	LASSERTF((int)sizeof(struct lustre_mdt_attrs) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct lustre_mdt_attrs));
+	LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_compat) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_mdt_attrs, lma_compat));
+	LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat));
+	LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_incompat) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_mdt_attrs, lma_incompat));
+	LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat));
+	LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_self_fid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_mdt_attrs, lma_self_fid));
+	LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid));
+	LASSERTF(LMAI_RELEASED == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)LMAI_RELEASED);
+	LASSERTF(LMAC_HSM == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)LMAC_HSM);
+	LASSERTF(LMAC_SOM == 0x00000002UL, "found 0x%.8xUL\n",
+		(unsigned)LMAC_SOM);
+	LASSERTF(OBJ_CREATE == 1, "found %lld\n",
+		 (long long)OBJ_CREATE);
+	LASSERTF(OBJ_DESTROY == 2, "found %lld\n",
+		 (long long)OBJ_DESTROY);
+	LASSERTF(OBJ_REF_ADD == 3, "found %lld\n",
+		 (long long)OBJ_REF_ADD);
+	LASSERTF(OBJ_REF_DEL == 4, "found %lld\n",
+		 (long long)OBJ_REF_DEL);
+	LASSERTF(OBJ_ATTR_SET == 5, "found %lld\n",
+		 (long long)OBJ_ATTR_SET);
+	LASSERTF(OBJ_ATTR_GET == 6, "found %lld\n",
+		 (long long)OBJ_ATTR_GET);
+	LASSERTF(OBJ_XATTR_SET == 7, "found %lld\n",
+		 (long long)OBJ_XATTR_SET);
+	LASSERTF(OBJ_XATTR_GET == 8, "found %lld\n",
+		 (long long)OBJ_XATTR_GET);
+	LASSERTF(OBJ_INDEX_LOOKUP == 9, "found %lld\n",
+		 (long long)OBJ_INDEX_LOOKUP);
+	LASSERTF(OBJ_INDEX_LOOKUP == 9, "found %lld\n",
+		 (long long)OBJ_INDEX_LOOKUP);
+	LASSERTF(OBJ_INDEX_INSERT == 10, "found %lld\n",
+		 (long long)OBJ_INDEX_INSERT);
+	LASSERTF(OBJ_INDEX_DELETE == 11, "found %lld\n",
+		 (long long)OBJ_INDEX_DELETE);
+
+	/* Checks for struct som_attrs */
+	LASSERTF((int)sizeof(struct som_attrs) == 40, "found %lld\n",
+		 (long long)(int)sizeof(struct som_attrs));
+	LASSERTF((int)offsetof(struct som_attrs, som_compat) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct som_attrs, som_compat));
+	LASSERTF((int)sizeof(((struct som_attrs *)0)->som_compat) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct som_attrs *)0)->som_compat));
+	LASSERTF((int)offsetof(struct som_attrs, som_incompat) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct som_attrs, som_incompat));
+	LASSERTF((int)sizeof(((struct som_attrs *)0)->som_incompat) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct som_attrs *)0)->som_incompat));
+	LASSERTF((int)offsetof(struct som_attrs, som_ioepoch) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct som_attrs, som_ioepoch));
+	LASSERTF((int)sizeof(((struct som_attrs *)0)->som_ioepoch) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct som_attrs *)0)->som_ioepoch));
+	LASSERTF((int)offsetof(struct som_attrs, som_size) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct som_attrs, som_size));
+	LASSERTF((int)sizeof(((struct som_attrs *)0)->som_size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct som_attrs *)0)->som_size));
+	LASSERTF((int)offsetof(struct som_attrs, som_blocks) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct som_attrs, som_blocks));
+	LASSERTF((int)sizeof(((struct som_attrs *)0)->som_blocks) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct som_attrs *)0)->som_blocks));
+	LASSERTF((int)offsetof(struct som_attrs, som_mountid) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct som_attrs, som_mountid));
+	LASSERTF((int)sizeof(((struct som_attrs *)0)->som_mountid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct som_attrs *)0)->som_mountid));
+
+	/* Checks for struct hsm_attrs */
+	LASSERTF((int)sizeof(struct hsm_attrs) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_attrs));
+	LASSERTF((int)offsetof(struct hsm_attrs, hsm_compat) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_attrs, hsm_compat));
+	LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_compat) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_compat));
+	LASSERTF((int)offsetof(struct hsm_attrs, hsm_flags) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_attrs, hsm_flags));
+	LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_flags));
+	LASSERTF((int)offsetof(struct hsm_attrs, hsm_arch_id) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_attrs, hsm_arch_id));
+	LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_arch_id) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_arch_id));
+	LASSERTF((int)offsetof(struct hsm_attrs, hsm_arch_ver) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_attrs, hsm_arch_ver));
+	LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_arch_ver) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_arch_ver));
+
+	/* Checks for struct ost_id */
+	LASSERTF((int)sizeof(struct ost_id) == 16, "found %lld\n",
+		 (long long)(int)sizeof(struct ost_id));
+	LASSERTF((int)offsetof(struct ost_id, oi) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_id, oi));
+	LASSERTF((int)sizeof(((struct ost_id *)0)->oi) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_id *)0)->oi));
+	LASSERTF(LUSTRE_FID_INIT_OID == 1, "found %lld\n",
+		 (long long)LUSTRE_FID_INIT_OID);
+	LASSERTF(FID_SEQ_OST_MDT0 == 0, "found %lld\n",
+		 (long long)FID_SEQ_OST_MDT0);
+	LASSERTF(FID_SEQ_LLOG == 1, "found %lld\n",
+		 (long long)FID_SEQ_LLOG);
+	LASSERTF(FID_SEQ_ECHO == 2, "found %lld\n",
+		 (long long)FID_SEQ_ECHO);
+	LASSERTF(FID_SEQ_OST_MDT1 == 3, "found %lld\n",
+		 (long long)FID_SEQ_OST_MDT1);
+	LASSERTF(FID_SEQ_OST_MAX == 9, "found %lld\n",
+		 (long long)FID_SEQ_OST_MAX);
+	LASSERTF(FID_SEQ_RSVD == 11, "found %lld\n",
+		 (long long)FID_SEQ_RSVD);
+	LASSERTF(FID_SEQ_IGIF == 12, "found %lld\n",
+		 (long long)FID_SEQ_IGIF);
+	LASSERTF(FID_SEQ_IGIF_MAX == 0x00000000ffffffffULL, "found 0x%.16llxULL\n",
+			(long long)FID_SEQ_IGIF_MAX);
+	LASSERTF(FID_SEQ_IDIF == 0x0000000100000000ULL, "found 0x%.16llxULL\n",
+			(long long)FID_SEQ_IDIF);
+	LASSERTF(FID_SEQ_IDIF_MAX == 0x00000001ffffffffULL, "found 0x%.16llxULL\n",
+			(long long)FID_SEQ_IDIF_MAX);
+	LASSERTF(FID_SEQ_START == 0x0000000200000000ULL, "found 0x%.16llxULL\n",
+			(long long)FID_SEQ_START);
+	LASSERTF(FID_SEQ_LOCAL_FILE == 0x0000000200000001ULL, "found 0x%.16llxULL\n",
+			(long long)FID_SEQ_LOCAL_FILE);
+	LASSERTF(FID_SEQ_DOT_LUSTRE == 0x0000000200000002ULL, "found 0x%.16llxULL\n",
+			(long long)FID_SEQ_DOT_LUSTRE);
+	LASSERTF(FID_SEQ_SPECIAL == 0x0000000200000004ULL, "found 0x%.16llxULL\n",
+			(long long)FID_SEQ_SPECIAL);
+	LASSERTF(FID_SEQ_QUOTA == 0x0000000200000005ULL, "found 0x%.16llxULL\n",
+			(long long)FID_SEQ_QUOTA);
+	LASSERTF(FID_SEQ_QUOTA_GLB == 0x0000000200000006ULL, "found 0x%.16llxULL\n",
+			(long long)FID_SEQ_QUOTA_GLB);
+	LASSERTF(FID_SEQ_ROOT == 0x0000000200000007ULL, "found 0x%.16llxULL\n",
+			(long long)FID_SEQ_ROOT);
+	LASSERTF(FID_SEQ_NORMAL == 0x0000000200000400ULL, "found 0x%.16llxULL\n",
+			(long long)FID_SEQ_NORMAL);
+	LASSERTF(FID_SEQ_LOV_DEFAULT == 0xffffffffffffffffULL, "found 0x%.16llxULL\n",
+			(long long)FID_SEQ_LOV_DEFAULT);
+	LASSERTF(FID_OID_SPECIAL_BFL == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)FID_OID_SPECIAL_BFL);
+	LASSERTF(FID_OID_DOT_LUSTRE == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)FID_OID_DOT_LUSTRE);
+	LASSERTF(FID_OID_DOT_LUSTRE_OBF == 0x00000002UL, "found 0x%.8xUL\n",
+		(unsigned)FID_OID_DOT_LUSTRE_OBF);
+
+	/* Checks for struct lu_dirent */
+	LASSERTF((int)sizeof(struct lu_dirent) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct lu_dirent));
+	LASSERTF((int)offsetof(struct lu_dirent, lde_fid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_dirent, lde_fid));
+	LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_dirent *)0)->lde_fid));
+	LASSERTF((int)offsetof(struct lu_dirent, lde_hash) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_dirent, lde_hash));
+	LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_hash) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_dirent *)0)->lde_hash));
+	LASSERTF((int)offsetof(struct lu_dirent, lde_reclen) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_dirent, lde_reclen));
+	LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_reclen) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_dirent *)0)->lde_reclen));
+	LASSERTF((int)offsetof(struct lu_dirent, lde_namelen) == 26, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_dirent, lde_namelen));
+	LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_namelen) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_dirent *)0)->lde_namelen));
+	LASSERTF((int)offsetof(struct lu_dirent, lde_attrs) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_dirent, lde_attrs));
+	LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_attrs) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_dirent *)0)->lde_attrs));
+	LASSERTF((int)offsetof(struct lu_dirent, lde_name[0]) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_dirent, lde_name[0]));
+	LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_name[0]) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_dirent *)0)->lde_name[0]));
+	LASSERTF(LUDA_FID == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)LUDA_FID);
+	LASSERTF(LUDA_TYPE == 0x00000002UL, "found 0x%.8xUL\n",
+		(unsigned)LUDA_TYPE);
+	LASSERTF(LUDA_64BITHASH == 0x00000004UL, "found 0x%.8xUL\n",
+		(unsigned)LUDA_64BITHASH);
+
+	/* Checks for struct luda_type */
+	LASSERTF((int)sizeof(struct luda_type) == 2, "found %lld\n",
+		 (long long)(int)sizeof(struct luda_type));
+	LASSERTF((int)offsetof(struct luda_type, lt_type) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct luda_type, lt_type));
+	LASSERTF((int)sizeof(((struct luda_type *)0)->lt_type) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct luda_type *)0)->lt_type));
+
+	/* Checks for struct lu_dirpage */
+	LASSERTF((int)sizeof(struct lu_dirpage) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct lu_dirpage));
+	LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_start) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_dirpage, ldp_hash_start));
+	LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start));
+	LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_end) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_dirpage, ldp_hash_end));
+	LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end));
+	LASSERTF((int)offsetof(struct lu_dirpage, ldp_flags) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_dirpage, ldp_flags));
+	LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_flags));
+	LASSERTF((int)offsetof(struct lu_dirpage, ldp_pad0) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_dirpage, ldp_pad0));
+	LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_pad0) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_pad0));
+	LASSERTF((int)offsetof(struct lu_dirpage, ldp_entries[0]) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_dirpage, ldp_entries[0]));
+	LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_entries[0]) == 32, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_entries[0]));
+	LASSERTF(LDF_EMPTY == 1, "found %lld\n",
+		 (long long)LDF_EMPTY);
+	LASSERTF(LDF_COLLIDE == 2, "found %lld\n",
+		 (long long)LDF_COLLIDE);
+	LASSERTF(LU_PAGE_SIZE == 4096, "found %lld\n",
+		 (long long)LU_PAGE_SIZE);
+	/* Checks for union lu_page */
+	LASSERTF((int)sizeof(union lu_page) == 4096, "found %lld\n",
+		 (long long)(int)sizeof(union lu_page));
+
+	/* Checks for struct lustre_handle */
+	LASSERTF((int)sizeof(struct lustre_handle) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct lustre_handle));
+	LASSERTF((int)offsetof(struct lustre_handle, cookie) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_handle, cookie));
+	LASSERTF((int)sizeof(((struct lustre_handle *)0)->cookie) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_handle *)0)->cookie));
+
+	/* Checks for struct lustre_msg_v2 */
+	LASSERTF((int)sizeof(struct lustre_msg_v2) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct lustre_msg_v2));
+	LASSERTF((int)offsetof(struct lustre_msg_v2, lm_bufcount) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_msg_v2, lm_bufcount));
+	LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount));
+	LASSERTF((int)offsetof(struct lustre_msg_v2, lm_secflvr) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_msg_v2, lm_secflvr));
+	LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr));
+	LASSERTF((int)offsetof(struct lustre_msg_v2, lm_magic) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_msg_v2, lm_magic));
+	LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic));
+	LASSERTF((int)offsetof(struct lustre_msg_v2, lm_repsize) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_msg_v2, lm_repsize));
+	LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize));
+	LASSERTF((int)offsetof(struct lustre_msg_v2, lm_cksum) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_msg_v2, lm_cksum));
+	LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum));
+	LASSERTF((int)offsetof(struct lustre_msg_v2, lm_flags) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_msg_v2, lm_flags));
+	LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags));
+	LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_2) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_2));
+	LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2));
+	LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_3) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_3));
+	LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3));
+	LASSERTF((int)offsetof(struct lustre_msg_v2, lm_buflens[0]) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_msg_v2, lm_buflens[0]));
+	LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0]) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0]));
+	LASSERTF(LUSTRE_MSG_MAGIC_V1 == 0x0BD00BD0, "found 0x%.8x\n",
+		LUSTRE_MSG_MAGIC_V1);
+	LASSERTF(LUSTRE_MSG_MAGIC_V2 == 0x0BD00BD3, "found 0x%.8x\n",
+		LUSTRE_MSG_MAGIC_V2);
+	LASSERTF(LUSTRE_MSG_MAGIC_V1_SWABBED == 0xD00BD00B, "found 0x%.8x\n",
+		LUSTRE_MSG_MAGIC_V1_SWABBED);
+	LASSERTF(LUSTRE_MSG_MAGIC_V2_SWABBED == 0xD30BD00B, "found 0x%.8x\n",
+		LUSTRE_MSG_MAGIC_V2_SWABBED);
+
+	/* Checks for struct ptlrpc_body */
+	LASSERTF((int)sizeof(struct ptlrpc_body_v3) == 184, "found %lld\n",
+		 (long long)(int)sizeof(struct ptlrpc_body_v3));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_handle));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_type));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_version));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_opc));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_status));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_xid));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_seen) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_seen));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_committed));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_transno) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_transno));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_flags));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == 60, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_op_flags));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == 68, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_timeout));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_service_time));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == 76, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_limit));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_slv));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv));
+	CLASSERT(PTLRPC_NUM_VERSIONS == 4);
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_pre_versions));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == 32, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding) == 32, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding));
+	CLASSERT(JOBSTATS_JOBID_SIZE == 32);
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_jobid) == 152, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_jobid));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid) == 32, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == (int)offsetof(struct ptlrpc_body_v2, pb_handle), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_handle), (int)offsetof(struct ptlrpc_body_v2, pb_handle));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == (int)offsetof(struct ptlrpc_body_v2, pb_type), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_type), (int)offsetof(struct ptlrpc_body_v2, pb_type));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == (int)offsetof(struct ptlrpc_body_v2, pb_version), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_version), (int)offsetof(struct ptlrpc_body_v2, pb_version));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == (int)offsetof(struct ptlrpc_body_v2, pb_opc), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_opc), (int)offsetof(struct ptlrpc_body_v2, pb_opc));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_opc), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_opc));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == (int)offsetof(struct ptlrpc_body_v2, pb_status), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_status), (int)offsetof(struct ptlrpc_body_v2, pb_status));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == (int)offsetof(struct ptlrpc_body_v2, pb_last_xid), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_last_xid), (int)offsetof(struct ptlrpc_body_v2, pb_last_xid));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_xid), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_xid));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_seen) == (int)offsetof(struct ptlrpc_body_v2, pb_last_seen), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_last_seen), (int)offsetof(struct ptlrpc_body_v2, pb_last_seen));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_seen), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_seen));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == (int)offsetof(struct ptlrpc_body_v2, pb_last_committed), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_last_committed), (int)offsetof(struct ptlrpc_body_v2, pb_last_committed));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_transno) == (int)offsetof(struct ptlrpc_body_v2, pb_transno), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_transno), (int)offsetof(struct ptlrpc_body_v2, pb_transno));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_flags), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_flags), (int)offsetof(struct ptlrpc_body_v2, pb_flags));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_flags), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_flags));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_op_flags), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_op_flags), (int)offsetof(struct ptlrpc_body_v2, pb_op_flags));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt), (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == (int)offsetof(struct ptlrpc_body_v2, pb_timeout), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_timeout), (int)offsetof(struct ptlrpc_body_v2, pb_timeout));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == (int)offsetof(struct ptlrpc_body_v2, pb_service_time), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_service_time), (int)offsetof(struct ptlrpc_body_v2, pb_service_time));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == (int)offsetof(struct ptlrpc_body_v2, pb_limit), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_limit), (int)offsetof(struct ptlrpc_body_v2, pb_limit));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == (int)offsetof(struct ptlrpc_body_v2, pb_slv), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_slv), (int)offsetof(struct ptlrpc_body_v2, pb_slv));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == (int)offsetof(struct ptlrpc_body_v2, pb_pre_versions), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_pre_versions), (int)offsetof(struct ptlrpc_body_v2, pb_pre_versions));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding) == (int)offsetof(struct ptlrpc_body_v2, pb_padding), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_padding), (int)offsetof(struct ptlrpc_body_v2, pb_padding));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding));
+	LASSERTF(MSG_PTLRPC_BODY_OFF == 0, "found %lld\n",
+		 (long long)MSG_PTLRPC_BODY_OFF);
+	LASSERTF(REQ_REC_OFF == 1, "found %lld\n",
+		 (long long)REQ_REC_OFF);
+	LASSERTF(REPLY_REC_OFF == 1, "found %lld\n",
+		 (long long)REPLY_REC_OFF);
+	LASSERTF(DLM_LOCKREQ_OFF == 1, "found %lld\n",
+		 (long long)DLM_LOCKREQ_OFF);
+	LASSERTF(DLM_REQ_REC_OFF == 2, "found %lld\n",
+		 (long long)DLM_REQ_REC_OFF);
+	LASSERTF(DLM_INTENT_IT_OFF == 2, "found %lld\n",
+		 (long long)DLM_INTENT_IT_OFF);
+	LASSERTF(DLM_INTENT_REC_OFF == 3, "found %lld\n",
+		 (long long)DLM_INTENT_REC_OFF);
+	LASSERTF(DLM_LOCKREPLY_OFF == 1, "found %lld\n",
+		 (long long)DLM_LOCKREPLY_OFF);
+	LASSERTF(DLM_REPLY_REC_OFF == 2, "found %lld\n",
+		 (long long)DLM_REPLY_REC_OFF);
+	LASSERTF(MSG_PTLRPC_HEADER_OFF == 31, "found %lld\n",
+		 (long long)MSG_PTLRPC_HEADER_OFF);
+	LASSERTF(PTLRPC_MSG_VERSION == 0x00000003, "found 0x%.8x\n",
+		PTLRPC_MSG_VERSION);
+	LASSERTF(LUSTRE_VERSION_MASK == 0xffff0000, "found 0x%.8x\n",
+		LUSTRE_VERSION_MASK);
+	LASSERTF(LUSTRE_OBD_VERSION == 0x00010000, "found 0x%.8x\n",
+		LUSTRE_OBD_VERSION);
+	LASSERTF(LUSTRE_MDS_VERSION == 0x00020000, "found 0x%.8x\n",
+		LUSTRE_MDS_VERSION);
+	LASSERTF(LUSTRE_OST_VERSION == 0x00030000, "found 0x%.8x\n",
+		LUSTRE_OST_VERSION);
+	LASSERTF(LUSTRE_DLM_VERSION == 0x00040000, "found 0x%.8x\n",
+		LUSTRE_DLM_VERSION);
+	LASSERTF(LUSTRE_LOG_VERSION == 0x00050000, "found 0x%.8x\n",
+		LUSTRE_LOG_VERSION);
+	LASSERTF(LUSTRE_MGS_VERSION == 0x00060000, "found 0x%.8x\n",
+		LUSTRE_MGS_VERSION);
+	LASSERTF(MSGHDR_AT_SUPPORT == 1, "found %lld\n",
+		 (long long)MSGHDR_AT_SUPPORT);
+	LASSERTF(MSGHDR_CKSUM_INCOMPAT18 == 2, "found %lld\n",
+		 (long long)MSGHDR_CKSUM_INCOMPAT18);
+	LASSERTF(MSG_OP_FLAG_MASK == 0xffff0000UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_OP_FLAG_MASK);
+	LASSERTF(MSG_OP_FLAG_SHIFT == 16, "found %lld\n",
+		 (long long)MSG_OP_FLAG_SHIFT);
+	LASSERTF(MSG_GEN_FLAG_MASK == 0x0000ffffUL, "found 0x%.8xUL\n",
+		(unsigned)MSG_GEN_FLAG_MASK);
+	LASSERTF(MSG_LAST_REPLAY == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_LAST_REPLAY);
+	LASSERTF(MSG_RESENT == 0x00000002UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_RESENT);
+	LASSERTF(MSG_REPLAY == 0x00000004UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_REPLAY);
+	LASSERTF(MSG_DELAY_REPLAY == 0x00000010UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_DELAY_REPLAY);
+	LASSERTF(MSG_VERSION_REPLAY == 0x00000020UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_VERSION_REPLAY);
+	LASSERTF(MSG_REQ_REPLAY_DONE == 0x00000040UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_REQ_REPLAY_DONE);
+	LASSERTF(MSG_LOCK_REPLAY_DONE == 0x00000080UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_LOCK_REPLAY_DONE);
+	LASSERTF(MSG_CONNECT_RECOVERING == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_CONNECT_RECOVERING);
+	LASSERTF(MSG_CONNECT_RECONNECT == 0x00000002UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_CONNECT_RECONNECT);
+	LASSERTF(MSG_CONNECT_REPLAYABLE == 0x00000004UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_CONNECT_REPLAYABLE);
+	LASSERTF(MSG_CONNECT_LIBCLIENT == 0x00000010UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_CONNECT_LIBCLIENT);
+	LASSERTF(MSG_CONNECT_INITIAL == 0x00000020UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_CONNECT_INITIAL);
+	LASSERTF(MSG_CONNECT_ASYNC == 0x00000040UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_CONNECT_ASYNC);
+	LASSERTF(MSG_CONNECT_NEXT_VER == 0x00000080UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_CONNECT_NEXT_VER);
+	LASSERTF(MSG_CONNECT_TRANSNO == 0x00000100UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_CONNECT_TRANSNO);
+
+	/* Checks for struct obd_connect_data */
+	LASSERTF((int)sizeof(struct obd_connect_data) == 192, "found %lld\n",
+		 (long long)(int)sizeof(struct obd_connect_data));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_connect_flags) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_connect_flags));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_version) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_version));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_version) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_version));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_grant));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_index) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_index));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_index) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_index));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_brw_size) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_brw_size));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_ibits_known) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_ibits_known));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_blocksize) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_blocksize));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_blocksize) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_blocksize));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_inodespace) == 33, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_inodespace));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_inodespace) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_inodespace));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_extent) == 34, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_grant_extent));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_extent) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_extent));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_unused) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_unused));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_unused) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_unused));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_transno) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_transno));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_transno) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_transno));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_group) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_group));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_group) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_group));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_cksum_types) == 52, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_cksum_types));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_max_easize) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_max_easize));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_instance) == 60, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_instance));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_instance) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_instance));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_maxbytes) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_maxbytes));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes));
+	LASSERTF((int)offsetof(struct obd_connect_data, padding1) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, padding1));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding1) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->padding1));
+	LASSERTF((int)offsetof(struct obd_connect_data, padding2) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, padding2));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->padding2));
+	LASSERTF((int)offsetof(struct obd_connect_data, padding3) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, padding3));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding3) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->padding3));
+	LASSERTF((int)offsetof(struct obd_connect_data, padding4) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, padding4));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding4) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->padding4));
+	LASSERTF((int)offsetof(struct obd_connect_data, padding5) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, padding5));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding5) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->padding5));
+	LASSERTF((int)offsetof(struct obd_connect_data, padding6) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, padding6));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding6) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->padding6));
+	LASSERTF((int)offsetof(struct obd_connect_data, padding7) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, padding7));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding7) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->padding7));
+	LASSERTF((int)offsetof(struct obd_connect_data, padding8) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, padding8));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding8) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->padding8));
+	LASSERTF((int)offsetof(struct obd_connect_data, padding9) == 136, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, padding9));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding9) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->padding9));
+	LASSERTF((int)offsetof(struct obd_connect_data, paddingA) == 144, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, paddingA));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingA) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingA));
+	LASSERTF((int)offsetof(struct obd_connect_data, paddingB) == 152, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, paddingB));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingB) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingB));
+	LASSERTF((int)offsetof(struct obd_connect_data, paddingC) == 160, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, paddingC));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingC) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingC));
+	LASSERTF((int)offsetof(struct obd_connect_data, paddingD) == 168, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, paddingD));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingD) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingD));
+	LASSERTF((int)offsetof(struct obd_connect_data, paddingE) == 176, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, paddingE));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingE) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingE));
+	LASSERTF((int)offsetof(struct obd_connect_data, paddingF) == 184, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, paddingF));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingF) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingF));
+	LASSERTF(OBD_CONNECT_RDONLY == 0x1ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_RDONLY);
+	LASSERTF(OBD_CONNECT_INDEX == 0x2ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_INDEX);
+	LASSERTF(OBD_CONNECT_MDS == 0x4ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_MDS);
+	LASSERTF(OBD_CONNECT_GRANT == 0x8ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_GRANT);
+	LASSERTF(OBD_CONNECT_SRVLOCK == 0x10ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_SRVLOCK);
+	LASSERTF(OBD_CONNECT_VERSION == 0x20ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_VERSION);
+	LASSERTF(OBD_CONNECT_REQPORTAL == 0x40ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_REQPORTAL);
+	LASSERTF(OBD_CONNECT_ACL == 0x80ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_ACL);
+	LASSERTF(OBD_CONNECT_XATTR == 0x100ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_XATTR);
+	LASSERTF(OBD_CONNECT_CROW == 0x200ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_CROW);
+	LASSERTF(OBD_CONNECT_TRUNCLOCK == 0x400ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_TRUNCLOCK);
+	LASSERTF(OBD_CONNECT_TRANSNO == 0x800ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_TRANSNO);
+	LASSERTF(OBD_CONNECT_IBITS == 0x1000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_IBITS);
+	LASSERTF(OBD_CONNECT_JOIN == 0x2000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_JOIN);
+	LASSERTF(OBD_CONNECT_ATTRFID == 0x4000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_ATTRFID);
+	LASSERTF(OBD_CONNECT_NODEVOH == 0x8000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_NODEVOH);
+	LASSERTF(OBD_CONNECT_RMT_CLIENT == 0x10000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_RMT_CLIENT);
+	LASSERTF(OBD_CONNECT_RMT_CLIENT_FORCE == 0x20000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_RMT_CLIENT_FORCE);
+	LASSERTF(OBD_CONNECT_BRW_SIZE == 0x40000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_BRW_SIZE);
+	LASSERTF(OBD_CONNECT_QUOTA64 == 0x80000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_QUOTA64);
+	LASSERTF(OBD_CONNECT_MDS_CAPA == 0x100000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_MDS_CAPA);
+	LASSERTF(OBD_CONNECT_OSS_CAPA == 0x200000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_OSS_CAPA);
+	LASSERTF(OBD_CONNECT_CANCELSET == 0x400000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_CANCELSET);
+	LASSERTF(OBD_CONNECT_SOM == 0x800000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_SOM);
+	LASSERTF(OBD_CONNECT_AT == 0x1000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_AT);
+	LASSERTF(OBD_CONNECT_LRU_RESIZE == 0x2000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_LRU_RESIZE);
+	LASSERTF(OBD_CONNECT_MDS_MDS == 0x4000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_MDS_MDS);
+	LASSERTF(OBD_CONNECT_REAL == 0x8000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_REAL);
+	LASSERTF(OBD_CONNECT_CHANGE_QS == 0x10000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_CHANGE_QS);
+	LASSERTF(OBD_CONNECT_CKSUM == 0x20000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_CKSUM);
+	LASSERTF(OBD_CONNECT_FID == 0x40000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_FID);
+	LASSERTF(OBD_CONNECT_VBR == 0x80000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_VBR);
+	LASSERTF(OBD_CONNECT_LOV_V3 == 0x100000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_LOV_V3);
+	LASSERTF(OBD_CONNECT_GRANT_SHRINK == 0x200000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_GRANT_SHRINK);
+	LASSERTF(OBD_CONNECT_SKIP_ORPHAN == 0x400000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_SKIP_ORPHAN);
+	LASSERTF(OBD_CONNECT_MAX_EASIZE == 0x800000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_MAX_EASIZE);
+	LASSERTF(OBD_CONNECT_FULL20 == 0x1000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_FULL20);
+	LASSERTF(OBD_CONNECT_LAYOUTLOCK == 0x2000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_LAYOUTLOCK);
+	LASSERTF(OBD_CONNECT_64BITHASH == 0x4000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_64BITHASH);
+	LASSERTF(OBD_CONNECT_MAXBYTES == 0x8000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_MAXBYTES);
+	LASSERTF(OBD_CONNECT_IMP_RECOV == 0x10000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_IMP_RECOV);
+	LASSERTF(OBD_CONNECT_JOBSTATS == 0x20000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_JOBSTATS);
+	LASSERTF(OBD_CONNECT_UMASK == 0x40000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_UMASK);
+	LASSERTF(OBD_CONNECT_EINPROGRESS == 0x80000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_EINPROGRESS);
+	LASSERTF(OBD_CONNECT_GRANT_PARAM == 0x100000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_GRANT_PARAM);
+	LASSERTF(OBD_CONNECT_FLOCK_OWNER == 0x200000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_FLOCK_OWNER);
+	LASSERTF(OBD_CONNECT_LVB_TYPE == 0x400000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_LVB_TYPE);
+	LASSERTF(OBD_CONNECT_NANOSEC_TIME == 0x800000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_NANOSEC_TIME);
+	LASSERTF(OBD_CONNECT_LIGHTWEIGHT == 0x1000000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_LIGHTWEIGHT);
+	LASSERTF(OBD_CONNECT_SHORTIO == 0x2000000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_SHORTIO);
+	LASSERTF(OBD_CONNECT_PINGLESS == 0x4000000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_PINGLESS);
+	LASSERTF(OBD_CKSUM_CRC32 == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)OBD_CKSUM_CRC32);
+	LASSERTF(OBD_CKSUM_ADLER == 0x00000002UL, "found 0x%.8xUL\n",
+		(unsigned)OBD_CKSUM_ADLER);
+	LASSERTF(OBD_CKSUM_CRC32C == 0x00000004UL, "found 0x%.8xUL\n",
+		(unsigned)OBD_CKSUM_CRC32C);
+
+	/* Checks for struct obdo */
+	LASSERTF((int)sizeof(struct obdo) == 208, "found %lld\n",
+		 (long long)(int)sizeof(struct obdo));
+	LASSERTF((int)offsetof(struct obdo, o_valid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_valid));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_valid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_valid));
+	LASSERTF((int)offsetof(struct obdo, o_oi) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_oi));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_oi) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_oi));
+	LASSERTF((int)offsetof(struct obdo, o_parent_seq) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_parent_seq));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_seq) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_parent_seq));
+	LASSERTF((int)offsetof(struct obdo, o_size) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_size));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_size));
+	LASSERTF((int)offsetof(struct obdo, o_mtime) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_mtime));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_mtime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_mtime));
+	LASSERTF((int)offsetof(struct obdo, o_atime) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_atime));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_atime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_atime));
+	LASSERTF((int)offsetof(struct obdo, o_ctime) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_ctime));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_ctime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_ctime));
+	LASSERTF((int)offsetof(struct obdo, o_blocks) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_blocks));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_blocks) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_blocks));
+	LASSERTF((int)offsetof(struct obdo, o_grant) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_grant));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_grant) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_grant));
+	LASSERTF((int)offsetof(struct obdo, o_blksize) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_blksize));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_blksize) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_blksize));
+	LASSERTF((int)offsetof(struct obdo, o_mode) == 84, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_mode));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_mode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_mode));
+	LASSERTF((int)offsetof(struct obdo, o_uid) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_uid));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_uid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_uid));
+	LASSERTF((int)offsetof(struct obdo, o_gid) == 92, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_gid));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_gid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_gid));
+	LASSERTF((int)offsetof(struct obdo, o_flags) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_flags));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_flags));
+	LASSERTF((int)offsetof(struct obdo, o_nlink) == 100, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_nlink));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_nlink) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_nlink));
+	LASSERTF((int)offsetof(struct obdo, o_parent_oid) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_parent_oid));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_oid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_parent_oid));
+	LASSERTF((int)offsetof(struct obdo, o_misc) == 108, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_misc));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_misc) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_misc));
+	LASSERTF((int)offsetof(struct obdo, o_ioepoch) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_ioepoch));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_ioepoch) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_ioepoch));
+	LASSERTF((int)offsetof(struct obdo, o_stripe_idx) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_stripe_idx));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_stripe_idx) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_stripe_idx));
+	LASSERTF((int)offsetof(struct obdo, o_parent_ver) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_parent_ver));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_ver) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_parent_ver));
+	LASSERTF((int)offsetof(struct obdo, o_handle) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_handle));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_handle) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_handle));
+	LASSERTF((int)offsetof(struct obdo, o_lcookie) == 136, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_lcookie));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_lcookie) == 32, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_lcookie));
+	LASSERTF((int)offsetof(struct obdo, o_uid_h) == 168, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_uid_h));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_uid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_uid_h));
+	LASSERTF((int)offsetof(struct obdo, o_gid_h) == 172, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_gid_h));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_gid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_gid_h));
+	LASSERTF((int)offsetof(struct obdo, o_data_version) == 176, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_data_version));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_data_version) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_data_version));
+	LASSERTF((int)offsetof(struct obdo, o_padding_4) == 184, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_padding_4));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_4) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_padding_4));
+	LASSERTF((int)offsetof(struct obdo, o_padding_5) == 192, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_padding_5));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_5) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_padding_5));
+	LASSERTF((int)offsetof(struct obdo, o_padding_6) == 200, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_padding_6));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_6) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_padding_6));
+	LASSERTF(OBD_MD_FLID == (0x00000001ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLID);
+	LASSERTF(OBD_MD_FLATIME == (0x00000002ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLATIME);
+	LASSERTF(OBD_MD_FLMTIME == (0x00000004ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLMTIME);
+	LASSERTF(OBD_MD_FLCTIME == (0x00000008ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLCTIME);
+	LASSERTF(OBD_MD_FLSIZE == (0x00000010ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLSIZE);
+	LASSERTF(OBD_MD_FLBLOCKS == (0x00000020ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLBLOCKS);
+	LASSERTF(OBD_MD_FLBLKSZ == (0x00000040ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLBLKSZ);
+	LASSERTF(OBD_MD_FLMODE == (0x00000080ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLMODE);
+	LASSERTF(OBD_MD_FLTYPE == (0x00000100ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLTYPE);
+	LASSERTF(OBD_MD_FLUID == (0x00000200ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLUID);
+	LASSERTF(OBD_MD_FLGID == (0x00000400ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLGID);
+	LASSERTF(OBD_MD_FLFLAGS == (0x00000800ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLFLAGS);
+	LASSERTF(OBD_MD_FLNLINK == (0x00002000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLNLINK);
+	LASSERTF(OBD_MD_FLGENER == (0x00004000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLGENER);
+	LASSERTF(OBD_MD_FLRDEV == (0x00010000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLRDEV);
+	LASSERTF(OBD_MD_FLEASIZE == (0x00020000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLEASIZE);
+	LASSERTF(OBD_MD_LINKNAME == (0x00040000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_LINKNAME);
+	LASSERTF(OBD_MD_FLHANDLE == (0x00080000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLHANDLE);
+	LASSERTF(OBD_MD_FLCKSUM == (0x00100000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLCKSUM);
+	LASSERTF(OBD_MD_FLQOS == (0x00200000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLQOS);
+	LASSERTF(OBD_MD_FLCOOKIE == (0x00800000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLCOOKIE);
+	LASSERTF(OBD_MD_FLGROUP == (0x01000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLGROUP);
+	LASSERTF(OBD_MD_FLFID == (0x02000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLFID);
+	LASSERTF(OBD_MD_FLEPOCH == (0x04000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLEPOCH);
+	LASSERTF(OBD_MD_FLGRANT == (0x08000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLGRANT);
+	LASSERTF(OBD_MD_FLDIREA == (0x10000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLDIREA);
+	LASSERTF(OBD_MD_FLUSRQUOTA == (0x20000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLUSRQUOTA);
+	LASSERTF(OBD_MD_FLGRPQUOTA == (0x40000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLGRPQUOTA);
+	LASSERTF(OBD_MD_FLMODEASIZE == (0x80000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLMODEASIZE);
+	LASSERTF(OBD_MD_MDS == (0x0000000100000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_MDS);
+	LASSERTF(OBD_MD_REINT == (0x0000000200000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_REINT);
+	LASSERTF(OBD_MD_MEA == (0x0000000400000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_MEA);
+	LASSERTF(OBD_MD_FLXATTR == (0x0000001000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLXATTR);
+	LASSERTF(OBD_MD_FLXATTRLS == (0x0000002000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLXATTRLS);
+	LASSERTF(OBD_MD_FLXATTRRM == (0x0000004000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLXATTRRM);
+	LASSERTF(OBD_MD_FLACL == (0x0000008000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLACL);
+	LASSERTF(OBD_MD_FLRMTPERM == (0x0000010000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLRMTPERM);
+	LASSERTF(OBD_MD_FLMDSCAPA == (0x0000020000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLMDSCAPA);
+	LASSERTF(OBD_MD_FLOSSCAPA == (0x0000040000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLOSSCAPA);
+	LASSERTF(OBD_MD_FLCKSPLIT == (0x0000080000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLCKSPLIT);
+	LASSERTF(OBD_MD_FLCROSSREF == (0x0000100000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLCROSSREF);
+	LASSERTF(OBD_MD_FLGETATTRLOCK == (0x0000200000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLGETATTRLOCK);
+	LASSERTF(OBD_MD_FLRMTLSETFACL == (0x0001000000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLRMTLSETFACL);
+	LASSERTF(OBD_MD_FLRMTLGETFACL == (0x0002000000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLRMTLGETFACL);
+	LASSERTF(OBD_MD_FLRMTRSETFACL == (0x0004000000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLRMTRSETFACL);
+	LASSERTF(OBD_MD_FLRMTRGETFACL == (0x0008000000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLRMTRGETFACL);
+	LASSERTF(OBD_MD_FLDATAVERSION == (0x0010000000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLDATAVERSION);
+	CLASSERT(OBD_FL_INLINEDATA == 0x00000001);
+	CLASSERT(OBD_FL_OBDMDEXISTS == 0x00000002);
+	CLASSERT(OBD_FL_DELORPHAN == 0x00000004);
+	CLASSERT(OBD_FL_NORPC == 0x00000008);
+	CLASSERT(OBD_FL_IDONLY == 0x00000010);
+	CLASSERT(OBD_FL_RECREATE_OBJS == 0x00000020);
+	CLASSERT(OBD_FL_DEBUG_CHECK == 0x00000040);
+	CLASSERT(OBD_FL_NO_USRQUOTA == 0x00000100);
+	CLASSERT(OBD_FL_NO_GRPQUOTA == 0x00000200);
+	CLASSERT(OBD_FL_CREATE_CROW == 0x00000400);
+	CLASSERT(OBD_FL_SRVLOCK == 0x00000800);
+	CLASSERT(OBD_FL_CKSUM_CRC32 == 0x00001000);
+	CLASSERT(OBD_FL_CKSUM_ADLER == 0x00002000);
+	CLASSERT(OBD_FL_CKSUM_CRC32C == 0x00004000);
+	CLASSERT(OBD_FL_CKSUM_RSVD2 == 0x00008000);
+	CLASSERT(OBD_FL_CKSUM_RSVD3 == 0x00010000);
+	CLASSERT(OBD_FL_SHRINK_GRANT == 0x00020000);
+	CLASSERT(OBD_FL_MMAP == 0x00040000);
+	CLASSERT(OBD_FL_RECOV_RESEND == 0x00080000);
+	CLASSERT(OBD_FL_NOSPC_BLK == 0x00100000);
+	CLASSERT(OBD_FL_LOCAL_MASK == 0xf0000000);
+
+	/* Checks for struct lov_ost_data_v1 */
+	LASSERTF((int)sizeof(struct lov_ost_data_v1) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct lov_ost_data_v1));
+	LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_oi) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_oi));
+	LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_oi) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_oi));
+	LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_gen) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_gen));
+	LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen));
+	LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_idx) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_idx));
+	LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx));
+
+	/* Checks for struct lov_mds_md_v1 */
+	LASSERTF((int)sizeof(struct lov_mds_md_v1) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct lov_mds_md_v1));
+	LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v1, lmm_magic));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_magic));
+	LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_pattern) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v1, lmm_pattern));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern));
+	LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_oi) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v1, lmm_oi));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi));
+	LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_size) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_size));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_size) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_size));
+	LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_count) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_count));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count));
+	LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_layout_gen) == 30, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v1, lmm_layout_gen));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen));
+	LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_objects[0]) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v1, lmm_objects[0]));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0]) == 24, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0]));
+	CLASSERT(LOV_MAGIC_V1 == 0x0BD10BD0);
+
+	/* Checks for struct lov_mds_md_v3 */
+	LASSERTF((int)sizeof(struct lov_mds_md_v3) == 48, "found %lld\n",
+		 (long long)(int)sizeof(struct lov_mds_md_v3));
+	LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_magic));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic));
+	LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pattern) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pattern));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern));
+	LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_oi) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_oi));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi));
+	LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_size) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_size));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size));
+	LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_count) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_count));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count));
+	LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_layout_gen) == 30, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_layout_gen));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen));
+	CLASSERT(LOV_MAXPOOLNAME == 16);
+	LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pool_name[16]) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pool_name[16]));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[16]) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[16]));
+	LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_objects[0]) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_objects[0]));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0]) == 24, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0]));
+	CLASSERT(LOV_MAGIC_V3 == 0x0BD30BD0);
+	LASSERTF(LOV_PATTERN_RAID0 == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)LOV_PATTERN_RAID0);
+	LASSERTF(LOV_PATTERN_RAID1 == 0x00000002UL, "found 0x%.8xUL\n",
+		(unsigned)LOV_PATTERN_RAID1);
+	LASSERTF(LOV_PATTERN_FIRST == 0x00000100UL, "found 0x%.8xUL\n",
+		(unsigned)LOV_PATTERN_FIRST);
+	LASSERTF(LOV_PATTERN_CMOBD == 0x00000200UL, "found 0x%.8xUL\n",
+		(unsigned)LOV_PATTERN_CMOBD);
+
+	/* Checks for struct obd_statfs */
+	LASSERTF((int)sizeof(struct obd_statfs) == 144, "found %lld\n",
+		 (long long)(int)sizeof(struct obd_statfs));
+	LASSERTF((int)offsetof(struct obd_statfs, os_type) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_type));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_type) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_type));
+	LASSERTF((int)offsetof(struct obd_statfs, os_blocks) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_blocks));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_blocks) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_blocks));
+	LASSERTF((int)offsetof(struct obd_statfs, os_bfree) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_bfree));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bfree) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_bfree));
+	LASSERTF((int)offsetof(struct obd_statfs, os_bavail) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_bavail));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bavail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_bavail));
+	LASSERTF((int)offsetof(struct obd_statfs, os_ffree) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_ffree));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_ffree) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_ffree));
+	LASSERTF((int)offsetof(struct obd_statfs, os_fsid) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_fsid));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fsid) == 40, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_fsid));
+	LASSERTF((int)offsetof(struct obd_statfs, os_bsize) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_bsize));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bsize) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_bsize));
+	LASSERTF((int)offsetof(struct obd_statfs, os_namelen) == 92, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_namelen));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_namelen) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_namelen));
+	LASSERTF((int)offsetof(struct obd_statfs, os_state) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_state));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_state) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_state));
+	LASSERTF((int)offsetof(struct obd_statfs, os_fprecreated) == 108, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_fprecreated));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fprecreated) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_fprecreated));
+	LASSERTF((int)offsetof(struct obd_statfs, os_spare2) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_spare2));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare2));
+	LASSERTF((int)offsetof(struct obd_statfs, os_spare3) == 116, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_spare3));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare3) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare3));
+	LASSERTF((int)offsetof(struct obd_statfs, os_spare4) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_spare4));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare4) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare4));
+	LASSERTF((int)offsetof(struct obd_statfs, os_spare5) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_spare5));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare5) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare5));
+	LASSERTF((int)offsetof(struct obd_statfs, os_spare6) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_spare6));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare6) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare6));
+	LASSERTF((int)offsetof(struct obd_statfs, os_spare7) == 132, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_spare7));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare7) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare7));
+	LASSERTF((int)offsetof(struct obd_statfs, os_spare8) == 136, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_spare8));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare8) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare8));
+	LASSERTF((int)offsetof(struct obd_statfs, os_spare9) == 140, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_spare9));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare9) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare9));
+
+	/* Checks for struct obd_ioobj */
+	LASSERTF((int)sizeof(struct obd_ioobj) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct obd_ioobj));
+	LASSERTF((int)offsetof(struct obd_ioobj, ioo_oid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_ioobj, ioo_oid));
+	LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_oid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_oid));
+	LASSERTF((int)offsetof(struct obd_ioobj, ioo_max_brw) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_ioobj, ioo_max_brw));
+	LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw));
+	LASSERTF((int)offsetof(struct obd_ioobj, ioo_bufcnt) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_ioobj, ioo_bufcnt));
+	LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt));
+
+	/* Checks for union lquota_id */
+	LASSERTF((int)sizeof(union lquota_id) == 16, "found %lld\n",
+		 (long long)(int)sizeof(union lquota_id));
+
+	LASSERTF(QUOTABLOCK_BITS == 10, "found %lld\n",
+		 (long long)QUOTABLOCK_BITS);
+	LASSERTF(QUOTABLOCK_SIZE == 1024, "found %lld\n",
+		 (long long)QUOTABLOCK_SIZE);
+
+	/* Checks for struct obd_quotactl */
+	LASSERTF((int)sizeof(struct obd_quotactl) == 112, "found %lld\n",
+		 (long long)(int)sizeof(struct obd_quotactl));
+	LASSERTF((int)offsetof(struct obd_quotactl, qc_cmd) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_quotactl, qc_cmd));
+	LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_cmd) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_cmd));
+	LASSERTF((int)offsetof(struct obd_quotactl, qc_type) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_quotactl, qc_type));
+	LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_type) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_type));
+	LASSERTF((int)offsetof(struct obd_quotactl, qc_id) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_quotactl, qc_id));
+	LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_id));
+	LASSERTF((int)offsetof(struct obd_quotactl, qc_stat) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_quotactl, qc_stat));
+	LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_stat) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_stat));
+	LASSERTF((int)offsetof(struct obd_quotactl, qc_dqinfo) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_quotactl, qc_dqinfo));
+	LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_dqinfo) == 24, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_dqinfo));
+	LASSERTF((int)offsetof(struct obd_quotactl, qc_dqblk) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_quotactl, qc_dqblk));
+	LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_dqblk) == 72, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_dqblk));
+
+	/* Checks for struct obd_dqinfo */
+	LASSERTF((int)sizeof(struct obd_dqinfo) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct obd_dqinfo));
+	LASSERTF((int)offsetof(struct obd_dqinfo, dqi_bgrace) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqinfo, dqi_bgrace));
+	LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace));
+	LASSERTF((int)offsetof(struct obd_dqinfo, dqi_igrace) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqinfo, dqi_igrace));
+	LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace));
+	LASSERTF((int)offsetof(struct obd_dqinfo, dqi_flags) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqinfo, dqi_flags));
+	LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_flags));
+	LASSERTF((int)offsetof(struct obd_dqinfo, dqi_valid) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqinfo, dqi_valid));
+	LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_valid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_valid));
+
+	/* Checks for struct obd_dqblk */
+	LASSERTF((int)sizeof(struct obd_dqblk) == 72, "found %lld\n",
+		 (long long)(int)sizeof(struct obd_dqblk));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_bhardlimit) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_bhardlimit));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_bsoftlimit) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_bsoftlimit));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_curspace) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_curspace));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curspace) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curspace));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_ihardlimit) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_ihardlimit));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_ihardlimit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_ihardlimit));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_isoftlimit) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_isoftlimit));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_curinodes) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_curinodes));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_btime) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_btime));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_btime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_btime));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_itime) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_itime));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_itime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_itime));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_valid) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_valid));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_valid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_valid));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_padding) == 68, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_padding));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_padding));
+	LASSERTF(Q_QUOTACHECK == 0x800100, "found 0x%.8x\n",
+		Q_QUOTACHECK);
+	LASSERTF(Q_INITQUOTA == 0x800101, "found 0x%.8x\n",
+		Q_INITQUOTA);
+	LASSERTF(Q_GETOINFO == 0x800102, "found 0x%.8x\n",
+		Q_GETOINFO);
+	LASSERTF(Q_GETOQUOTA == 0x800103, "found 0x%.8x\n",
+		Q_GETOQUOTA);
+	LASSERTF(Q_FINVALIDATE == 0x800104, "found 0x%.8x\n",
+		Q_FINVALIDATE);
+
+	/* Checks for struct lquota_acct_rec */
+	LASSERTF((int)sizeof(struct lquota_acct_rec) == 16, "found %lld\n",
+		 (long long)(int)sizeof(struct lquota_acct_rec));
+	LASSERTF((int)offsetof(struct lquota_acct_rec, bspace) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_acct_rec, bspace));
+	LASSERTF((int)sizeof(((struct lquota_acct_rec *)0)->bspace) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_acct_rec *)0)->bspace));
+	LASSERTF((int)offsetof(struct lquota_acct_rec, ispace) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_acct_rec, ispace));
+	LASSERTF((int)sizeof(((struct lquota_acct_rec *)0)->ispace) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_acct_rec *)0)->ispace));
+
+	/* Checks for struct lquota_glb_rec */
+	LASSERTF((int)sizeof(struct lquota_glb_rec) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct lquota_glb_rec));
+	LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_hardlimit) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_glb_rec, qbr_hardlimit));
+	LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_hardlimit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_hardlimit));
+	LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_softlimit) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_glb_rec, qbr_softlimit));
+	LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_softlimit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_softlimit));
+	LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_time) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_glb_rec, qbr_time));
+	LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_time) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_time));
+	LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_granted) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_glb_rec, qbr_granted));
+	LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_granted) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_granted));
+
+	/* Checks for struct lquota_slv_rec */
+	LASSERTF((int)sizeof(struct lquota_slv_rec) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct lquota_slv_rec));
+	LASSERTF((int)offsetof(struct lquota_slv_rec, qsr_granted) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_slv_rec, qsr_granted));
+	LASSERTF((int)sizeof(((struct lquota_slv_rec *)0)->qsr_granted) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_slv_rec *)0)->qsr_granted));
+
+	/* Checks for struct idx_info */
+	LASSERTF((int)sizeof(struct idx_info) == 80, "found %lld\n",
+		 (long long)(int)sizeof(struct idx_info));
+	LASSERTF((int)offsetof(struct idx_info, ii_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_magic));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_magic));
+	LASSERTF((int)offsetof(struct idx_info, ii_flags) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_flags));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_flags));
+	LASSERTF((int)offsetof(struct idx_info, ii_count) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_count));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_count) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_count));
+	LASSERTF((int)offsetof(struct idx_info, ii_pad0) == 10, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_pad0));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad0) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_pad0));
+	LASSERTF((int)offsetof(struct idx_info, ii_attrs) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_attrs));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_attrs) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_attrs));
+	LASSERTF((int)offsetof(struct idx_info, ii_fid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_fid));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_fid));
+	LASSERTF((int)offsetof(struct idx_info, ii_version) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_version));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_version) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_version));
+	LASSERTF((int)offsetof(struct idx_info, ii_hash_start) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_hash_start));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_hash_start) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_hash_start));
+	LASSERTF((int)offsetof(struct idx_info, ii_hash_end) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_hash_end));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_hash_end) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_hash_end));
+	LASSERTF((int)offsetof(struct idx_info, ii_keysize) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_keysize));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_keysize) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_keysize));
+	LASSERTF((int)offsetof(struct idx_info, ii_recsize) == 58, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_recsize));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_recsize) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_recsize));
+	LASSERTF((int)offsetof(struct idx_info, ii_pad1) == 60, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_pad1));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_pad1));
+	LASSERTF((int)offsetof(struct idx_info, ii_pad2) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_pad2));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_pad2));
+	LASSERTF((int)offsetof(struct idx_info, ii_pad3) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_pad3));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad3) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_pad3));
+	CLASSERT(IDX_INFO_MAGIC == 0x3D37CC37);
+
+	/* Checks for struct lu_idxpage */
+	LASSERTF((int)sizeof(struct lu_idxpage) == 16, "found %lld\n",
+		 (long long)(int)sizeof(struct lu_idxpage));
+	LASSERTF((int)offsetof(struct lu_idxpage, lip_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_idxpage, lip_magic));
+	LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_magic));
+	LASSERTF((int)offsetof(struct lu_idxpage, lip_flags) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_idxpage, lip_flags));
+	LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_flags) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_flags));
+	LASSERTF((int)offsetof(struct lu_idxpage, lip_nr) == 6, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_idxpage, lip_nr));
+	LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_nr) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_nr));
+	LASSERTF((int)offsetof(struct lu_idxpage, lip_pad0) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_idxpage, lip_pad0));
+	LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_pad0) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_pad0));
+	CLASSERT(LIP_MAGIC == 0x8A6D6B6C);
+	LASSERTF(LIP_HDR_SIZE == 16, "found %lld\n",
+		 (long long)LIP_HDR_SIZE);
+	LASSERTF(II_FL_NOHASH == 1, "found %lld\n",
+		 (long long)II_FL_NOHASH);
+	LASSERTF(II_FL_VARKEY == 2, "found %lld\n",
+		 (long long)II_FL_VARKEY);
+	LASSERTF(II_FL_VARREC == 4, "found %lld\n",
+		 (long long)II_FL_VARREC);
+	LASSERTF(II_FL_NONUNQ == 8, "found %lld\n",
+		 (long long)II_FL_NONUNQ);
+
+	/* Checks for struct niobuf_remote */
+	LASSERTF((int)sizeof(struct niobuf_remote) == 16, "found %lld\n",
+		 (long long)(int)sizeof(struct niobuf_remote));
+	LASSERTF((int)offsetof(struct niobuf_remote, offset) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct niobuf_remote, offset));
+	LASSERTF((int)sizeof(((struct niobuf_remote *)0)->offset) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct niobuf_remote *)0)->offset));
+	LASSERTF((int)offsetof(struct niobuf_remote, len) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct niobuf_remote, len));
+	LASSERTF((int)sizeof(((struct niobuf_remote *)0)->len) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct niobuf_remote *)0)->len));
+	LASSERTF((int)offsetof(struct niobuf_remote, flags) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct niobuf_remote, flags));
+	LASSERTF((int)sizeof(((struct niobuf_remote *)0)->flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct niobuf_remote *)0)->flags));
+	LASSERTF(OBD_BRW_READ == 0x01, "found 0x%.8x\n",
+		OBD_BRW_READ);
+	LASSERTF(OBD_BRW_WRITE == 0x02, "found 0x%.8x\n",
+		OBD_BRW_WRITE);
+	LASSERTF(OBD_BRW_SYNC == 0x08, "found 0x%.8x\n",
+		OBD_BRW_SYNC);
+	LASSERTF(OBD_BRW_CHECK == 0x10, "found 0x%.8x\n",
+		OBD_BRW_CHECK);
+	LASSERTF(OBD_BRW_FROM_GRANT == 0x20, "found 0x%.8x\n",
+		OBD_BRW_FROM_GRANT);
+	LASSERTF(OBD_BRW_GRANTED == 0x40, "found 0x%.8x\n",
+		OBD_BRW_GRANTED);
+	LASSERTF(OBD_BRW_NOCACHE == 0x80, "found 0x%.8x\n",
+		OBD_BRW_NOCACHE);
+	LASSERTF(OBD_BRW_NOQUOTA == 0x100, "found 0x%.8x\n",
+		OBD_BRW_NOQUOTA);
+	LASSERTF(OBD_BRW_SRVLOCK == 0x200, "found 0x%.8x\n",
+		OBD_BRW_SRVLOCK);
+	LASSERTF(OBD_BRW_ASYNC == 0x400, "found 0x%.8x\n",
+		OBD_BRW_ASYNC);
+	LASSERTF(OBD_BRW_MEMALLOC == 0x800, "found 0x%.8x\n",
+		OBD_BRW_MEMALLOC);
+
+	/* Checks for struct ost_body */
+	LASSERTF((int)sizeof(struct ost_body) == 208, "found %lld\n",
+		 (long long)(int)sizeof(struct ost_body));
+	LASSERTF((int)offsetof(struct ost_body, oa) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_body, oa));
+	LASSERTF((int)sizeof(((struct ost_body *)0)->oa) == 208, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_body *)0)->oa));
+
+	/* Checks for struct ll_fid */
+	LASSERTF((int)sizeof(struct ll_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(struct ll_fid));
+	LASSERTF((int)offsetof(struct ll_fid, id) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_fid, id));
+	LASSERTF((int)sizeof(((struct ll_fid *)0)->id) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_fid *)0)->id));
+	LASSERTF((int)offsetof(struct ll_fid, generation) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_fid, generation));
+	LASSERTF((int)sizeof(((struct ll_fid *)0)->generation) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_fid *)0)->generation));
+	LASSERTF((int)offsetof(struct ll_fid, f_type) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_fid, f_type));
+	LASSERTF((int)sizeof(((struct ll_fid *)0)->f_type) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_fid *)0)->f_type));
+
+	/* Checks for struct mdt_body */
+	LASSERTF((int)sizeof(struct mdt_body) == 216, "found %lld\n",
+		 (long long)(int)sizeof(struct mdt_body));
+	LASSERTF((int)offsetof(struct mdt_body, fid1) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, fid1));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->fid1) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->fid1));
+	LASSERTF((int)offsetof(struct mdt_body, fid2) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, fid2));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->fid2) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->fid2));
+	LASSERTF((int)offsetof(struct mdt_body, handle) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, handle));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->handle) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->handle));
+	LASSERTF((int)offsetof(struct mdt_body, valid) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, valid));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->valid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->valid));
+	LASSERTF((int)offsetof(struct mdt_body, size) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, size));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->size));
+	LASSERTF((int)offsetof(struct mdt_body, mtime) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mtime));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mtime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mtime));
+	LASSERTF((int)offsetof(struct mdt_body, atime) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, atime));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->atime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->atime));
+	LASSERTF((int)offsetof(struct mdt_body, ctime) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, ctime));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->ctime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->ctime));
+	LASSERTF((int)offsetof(struct mdt_body, blocks) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, blocks));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->blocks) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->blocks));
+	LASSERTF((int)offsetof(struct mdt_body, unused1) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, unused1));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->unused1) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->unused1));
+	LASSERTF((int)offsetof(struct mdt_body, fsuid) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, fsuid));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->fsuid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->fsuid));
+	LASSERTF((int)offsetof(struct mdt_body, fsgid) == 108, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, fsgid));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->fsgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->fsgid));
+	LASSERTF((int)offsetof(struct mdt_body, capability) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, capability));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->capability) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->capability));
+	LASSERTF((int)offsetof(struct mdt_body, mode) == 116, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mode));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mode));
+	LASSERTF((int)offsetof(struct mdt_body, uid) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, uid));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->uid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->uid));
+	LASSERTF((int)offsetof(struct mdt_body, gid) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, gid));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->gid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->gid));
+	LASSERTF((int)offsetof(struct mdt_body, flags) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, flags));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->flags));
+	LASSERTF((int)offsetof(struct mdt_body, rdev) == 132, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, rdev));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->rdev) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->rdev));
+	LASSERTF((int)offsetof(struct mdt_body, nlink) == 136, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, nlink));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->nlink) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->nlink));
+	LASSERTF((int)offsetof(struct mdt_body, unused2) == 140, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, unused2));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->unused2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->unused2));
+	LASSERTF((int)offsetof(struct mdt_body, suppgid) == 144, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, suppgid));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->suppgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->suppgid));
+	LASSERTF((int)offsetof(struct mdt_body, eadatasize) == 148, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, eadatasize));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->eadatasize) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->eadatasize));
+	LASSERTF((int)offsetof(struct mdt_body, aclsize) == 152, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, aclsize));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->aclsize) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->aclsize));
+	LASSERTF((int)offsetof(struct mdt_body, max_mdsize) == 156, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, max_mdsize));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->max_mdsize) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->max_mdsize));
+	LASSERTF((int)offsetof(struct mdt_body, max_cookiesize) == 160, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, max_cookiesize));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->max_cookiesize) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->max_cookiesize));
+	LASSERTF((int)offsetof(struct mdt_body, uid_h) == 164, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, uid_h));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->uid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->uid_h));
+	LASSERTF((int)offsetof(struct mdt_body, gid_h) == 168, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, gid_h));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->gid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->gid_h));
+	LASSERTF((int)offsetof(struct mdt_body, padding_5) == 172, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, padding_5));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_5) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->padding_5));
+	LASSERTF((int)offsetof(struct mdt_body, padding_6) == 176, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, padding_6));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_6) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->padding_6));
+	LASSERTF((int)offsetof(struct mdt_body, padding_7) == 184, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, padding_7));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_7) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->padding_7));
+	LASSERTF((int)offsetof(struct mdt_body, padding_8) == 192, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, padding_8));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_8) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->padding_8));
+	LASSERTF((int)offsetof(struct mdt_body, padding_9) == 200, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, padding_9));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_9) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->padding_9));
+	LASSERTF((int)offsetof(struct mdt_body, padding_10) == 208, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, padding_10));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_10) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->padding_10));
+	LASSERTF(MDS_FMODE_CLOSED == 000000000000UL, "found 0%.11oUL\n",
+		MDS_FMODE_CLOSED);
+	LASSERTF(MDS_FMODE_EXEC == 000000000004UL, "found 0%.11oUL\n",
+		MDS_FMODE_EXEC);
+	LASSERTF(MDS_FMODE_EPOCH == 000001000000UL, "found 0%.11oUL\n",
+		MDS_FMODE_EPOCH);
+	LASSERTF(MDS_FMODE_TRUNC == 000002000000UL, "found 0%.11oUL\n",
+		MDS_FMODE_TRUNC);
+	LASSERTF(MDS_FMODE_SOM == 000004000000UL, "found 0%.11oUL\n",
+		MDS_FMODE_SOM);
+	LASSERTF(MDS_OPEN_CREATED == 000000000010UL, "found 0%.11oUL\n",
+		MDS_OPEN_CREATED);
+	LASSERTF(MDS_OPEN_CROSS == 000000000020UL, "found 0%.11oUL\n",
+		MDS_OPEN_CROSS);
+	LASSERTF(MDS_OPEN_CREAT == 000000000100UL, "found 0%.11oUL\n",
+		MDS_OPEN_CREAT);
+	LASSERTF(MDS_OPEN_EXCL == 000000000200UL, "found 0%.11oUL\n",
+		MDS_OPEN_EXCL);
+	LASSERTF(MDS_OPEN_TRUNC == 000000001000UL, "found 0%.11oUL\n",
+		MDS_OPEN_TRUNC);
+	LASSERTF(MDS_OPEN_APPEND == 000000002000UL, "found 0%.11oUL\n",
+		MDS_OPEN_APPEND);
+	LASSERTF(MDS_OPEN_SYNC == 000000010000UL, "found 0%.11oUL\n",
+		MDS_OPEN_SYNC);
+	LASSERTF(MDS_OPEN_DIRECTORY == 000000200000UL, "found 0%.11oUL\n",
+		MDS_OPEN_DIRECTORY);
+	LASSERTF(MDS_OPEN_BY_FID == 000040000000UL, "found 0%.11oUL\n",
+		MDS_OPEN_BY_FID);
+	LASSERTF(MDS_OPEN_DELAY_CREATE == 000100000000UL, "found 0%.11oUL\n",
+		MDS_OPEN_DELAY_CREATE);
+	LASSERTF(MDS_OPEN_OWNEROVERRIDE == 000200000000UL, "found 0%.11oUL\n",
+		MDS_OPEN_OWNEROVERRIDE);
+	LASSERTF(MDS_OPEN_JOIN_FILE == 000400000000UL, "found 0%.11oUL\n",
+		MDS_OPEN_JOIN_FILE);
+	LASSERTF(MDS_OPEN_LOCK == 004000000000UL, "found 0%.11oUL\n",
+		MDS_OPEN_LOCK);
+	LASSERTF(MDS_OPEN_HAS_EA == 010000000000UL, "found 0%.11oUL\n",
+		MDS_OPEN_HAS_EA);
+	LASSERTF(MDS_OPEN_HAS_OBJS == 020000000000UL, "found 0%.11oUL\n",
+		MDS_OPEN_HAS_OBJS);
+	LASSERTF(MDS_OPEN_NORESTORE == 00000000000100000000000ULL, "found 0%.22lloULL\n",
+			(long long)MDS_OPEN_NORESTORE);
+	LASSERTF(MDS_OPEN_NEWSTRIPE == 00000000000200000000000ULL, "found 0%.22lloULL\n",
+			(long long)MDS_OPEN_NEWSTRIPE);
+	LASSERTF(MDS_OPEN_VOLATILE == 00000000000400000000000ULL, "found 0%.22lloULL\n",
+			(long long)MDS_OPEN_VOLATILE);
+	LASSERTF(LUSTRE_SYNC_FL == 0x00000008, "found 0x%.8x\n",
+		LUSTRE_SYNC_FL);
+	LASSERTF(LUSTRE_IMMUTABLE_FL == 0x00000010, "found 0x%.8x\n",
+		LUSTRE_IMMUTABLE_FL);
+	LASSERTF(LUSTRE_APPEND_FL == 0x00000020, "found 0x%.8x\n",
+		LUSTRE_APPEND_FL);
+	LASSERTF(LUSTRE_NOATIME_FL == 0x00000080, "found 0x%.8x\n",
+		LUSTRE_NOATIME_FL);
+	LASSERTF(LUSTRE_DIRSYNC_FL == 0x00010000, "found 0x%.8x\n",
+		LUSTRE_DIRSYNC_FL);
+	LASSERTF(MDS_INODELOCK_LOOKUP == 0x000001, "found 0x%.8x\n",
+		MDS_INODELOCK_LOOKUP);
+	LASSERTF(MDS_INODELOCK_UPDATE == 0x000002, "found 0x%.8x\n",
+		MDS_INODELOCK_UPDATE);
+	LASSERTF(MDS_INODELOCK_OPEN == 0x000004, "found 0x%.8x\n",
+		MDS_INODELOCK_OPEN);
+	LASSERTF(MDS_INODELOCK_LAYOUT == 0x000008, "found 0x%.8x\n",
+		MDS_INODELOCK_LAYOUT);
+
+	/* Checks for struct mdt_ioepoch */
+	LASSERTF((int)sizeof(struct mdt_ioepoch) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct mdt_ioepoch));
+	LASSERTF((int)offsetof(struct mdt_ioepoch, handle) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_ioepoch, handle));
+	LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->handle) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_ioepoch *)0)->handle));
+	LASSERTF((int)offsetof(struct mdt_ioepoch, ioepoch) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_ioepoch, ioepoch));
+	LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->ioepoch) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_ioepoch *)0)->ioepoch));
+	LASSERTF((int)offsetof(struct mdt_ioepoch, flags) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_ioepoch, flags));
+	LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_ioepoch *)0)->flags));
+	LASSERTF((int)offsetof(struct mdt_ioepoch, padding) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_ioepoch, padding));
+	LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_ioepoch *)0)->padding));
+
+	/* Checks for struct mdt_remote_perm */
+	LASSERTF((int)sizeof(struct mdt_remote_perm) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct mdt_remote_perm));
+	LASSERTF((int)offsetof(struct mdt_remote_perm, rp_uid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_remote_perm, rp_uid));
+	LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_uid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_uid));
+	LASSERTF((int)offsetof(struct mdt_remote_perm, rp_gid) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_remote_perm, rp_gid));
+	LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_gid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_gid));
+	LASSERTF((int)offsetof(struct mdt_remote_perm, rp_fsuid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_remote_perm, rp_fsuid));
+	LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_fsuid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_fsuid));
+	LASSERTF((int)offsetof(struct mdt_remote_perm, rp_fsgid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_remote_perm, rp_fsgid));
+	LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_fsgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_fsgid));
+	LASSERTF((int)offsetof(struct mdt_remote_perm, rp_access_perm) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_remote_perm, rp_access_perm));
+	LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_access_perm) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_access_perm));
+	LASSERTF((int)offsetof(struct mdt_remote_perm, rp_padding) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_remote_perm, rp_padding));
+	LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_padding));
+	LASSERTF(CFS_SETUID_PERM == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)CFS_SETUID_PERM);
+	LASSERTF(CFS_SETGID_PERM == 0x00000002UL, "found 0x%.8xUL\n",
+		(unsigned)CFS_SETGID_PERM);
+	LASSERTF(CFS_SETGRP_PERM == 0x00000004UL, "found 0x%.8xUL\n",
+		(unsigned)CFS_SETGRP_PERM);
+	LASSERTF(CFS_RMTACL_PERM == 0x00000008UL, "found 0x%.8xUL\n",
+		(unsigned)CFS_RMTACL_PERM);
+	LASSERTF(CFS_RMTOWN_PERM == 0x00000010UL, "found 0x%.8xUL\n",
+		(unsigned)CFS_RMTOWN_PERM);
+
+	/* Checks for struct mdt_rec_setattr */
+	LASSERTF((int)sizeof(struct mdt_rec_setattr) == 136, "found %lld\n",
+		 (long long)(int)sizeof(struct mdt_rec_setattr));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_opcode) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_opcode));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_cap) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_cap));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid_h) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid_h) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid_h) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1_h) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fid) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_fid));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_valid) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_valid));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_uid) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_uid));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_gid) == 68, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_gid));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_size) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_size));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_size));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_blocks) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_blocks));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mtime) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_mtime));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_atime) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_atime));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_ctime) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_ctime));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_attr_flags) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_attr_flags));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mode) == 116, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_mode));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_bias) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_bias));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_3) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_3));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_3) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_3));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_4) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_4));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_5) == 132, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_5));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5));
+
+	/* Checks for struct mdt_rec_create */
+	LASSERTF((int)sizeof(struct mdt_rec_create) == 136, "found %lld\n",
+		 (long long)(int)sizeof(struct mdt_rec_create));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_opcode) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_opcode));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_opcode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_opcode));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_cap) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_cap));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_cap) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_cap));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid_h) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid_h) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid1));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1_h) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid1_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2_h) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid1) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_fid1));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid1) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid1));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid2) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_fid2));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid2) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid2));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_old_handle) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_old_handle));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_time) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_time));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_time) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_time));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_rdev) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_rdev));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_rdev) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_rdev));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_ioepoch) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_ioepoch));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_1) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_padding_1));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_mode) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_mode));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_mode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_mode));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_bias) == 116, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_bias));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_bias) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_bias));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_l) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_flags_l));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_h) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_flags_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_umask) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_umask));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_umask) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_umask));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_4) == 132, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_padding_4));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4));
+
+	/* Checks for struct mdt_rec_link */
+	LASSERTF((int)sizeof(struct mdt_rec_link) == 136, "found %lld\n",
+		 (long long)(int)sizeof(struct mdt_rec_link));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_opcode) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_opcode));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_opcode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_opcode));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_cap) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_cap));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_cap) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_cap));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid_h) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid_h) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1_h) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2_h) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid1) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_fid1));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid1) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid1));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid2) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_fid2));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid2) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid2));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_time) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_time));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_time) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_time));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_1) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_padding_1));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_2) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_padding_2));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_3) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_padding_3));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_4) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_padding_4));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_bias) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_bias));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_bias) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_bias));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_5) == 116, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_padding_5));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_6) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_padding_6));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_7) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_padding_7));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_8) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_padding_8));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_9) == 132, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_padding_9));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9));
+
+	/* Checks for struct mdt_rec_unlink */
+	LASSERTF((int)sizeof(struct mdt_rec_unlink) == 136, "found %lld\n",
+		 (long long)(int)sizeof(struct mdt_rec_unlink));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_opcode) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_opcode));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_cap) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_cap));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsuid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsuid_h) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid_h) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid1) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_fid1));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid2) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_fid2));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_time) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_time));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_time) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_time));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_2) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_2));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_3) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_3));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_4) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_4));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_5) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_5));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_bias) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_bias));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_mode) == 116, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_mode));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_6) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_6));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_7) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_7));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_8) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_8));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_9) == 132, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_9));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9));
+
+	/* Checks for struct mdt_rec_rename */
+	LASSERTF((int)sizeof(struct mdt_rec_rename) == 136, "found %lld\n",
+		 (long long)(int)sizeof(struct mdt_rec_rename));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_opcode) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_opcode));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_cap) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_cap));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_cap) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_cap));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid_h) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid_h) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid1));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1_h) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid1_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2_h) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid1) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_fid1));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid2) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_fid2));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_time) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_time));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_time) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_time));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_1) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_1));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_2) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_2));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_2));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_3) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_3));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_4) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_4));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_bias) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_bias));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_bias) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_bias));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_mode) == 116, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_mode));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_mode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_mode));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_5) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_5));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_6) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_6));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_7) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_7));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_8) == 132, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_8));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_8) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_8));
+
+	/* Checks for struct mdt_rec_setxattr */
+	LASSERTF((int)sizeof(struct mdt_rec_setxattr) == 136, "found %lld\n",
+		 (long long)(int)sizeof(struct mdt_rec_setxattr));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_opcode) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_opcode));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_cap) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_cap));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid1) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid1_h) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid2) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fid) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fid));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_1) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_1));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_2) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_2));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_3) == 68, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_3));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_valid) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_valid));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_time) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_time));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_5) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_5));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_6) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_6));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_7) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_7));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_size) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_size));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_flags) == 116, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_flags));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_8) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_8));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_9) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_9));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_10) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_10));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_11) == 132, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_11));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11));
+
+	/* Checks for struct mdt_rec_reint */
+	LASSERTF((int)sizeof(struct mdt_rec_reint) == 136, "found %lld\n",
+		 (long long)(int)sizeof(struct mdt_rec_reint));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_opcode) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_opcode));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_cap) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_cap));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_cap) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_cap));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_fsuid));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid_h) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_fsuid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_fsgid));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid_h) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_fsgid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid1) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid1_h) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2_h) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid1) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_fid1));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid2) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_fid2));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mtime) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_mtime));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_atime) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_atime));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_atime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_atime));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_ctime) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_ctime));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_size) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_size));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_size));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_blocks) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_blocks));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_bias) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_bias));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_bias) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_bias));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mode) == 116, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_mode));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mode));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_flags) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_flags));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_flags_h) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_flags_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_umask) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_umask));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_umask) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_umask));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_padding_4) == 132, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_padding_4));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4));
+
+	/* Checks for struct lmv_desc */
+	LASSERTF((int)sizeof(struct lmv_desc) == 88, "found %lld\n",
+		 (long long)(int)sizeof(struct lmv_desc));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_tgt_count) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_tgt_count));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_tgt_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_tgt_count));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_active_tgt_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_active_tgt_count));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_default_stripe_count) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_default_stripe_count));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_pattern) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_pattern));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_pattern) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_pattern));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_default_hash_size) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_default_hash_size));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_padding_1) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_padding_1));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_1) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_1));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_padding_2) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_padding_2));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_2));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_qos_maxage) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_qos_maxage));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_padding_3) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_padding_3));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_3) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_3));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_padding_4) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_padding_4));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_4) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_4));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_uuid) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_uuid));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_uuid) == 40, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_uuid));
+
+	/* Checks for struct lmv_stripe_md */
+	LASSERTF((int)sizeof(struct lmv_stripe_md) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct lmv_stripe_md));
+	LASSERTF((int)offsetof(struct lmv_stripe_md, mea_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_stripe_md, mea_magic));
+	LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_magic));
+	LASSERTF((int)offsetof(struct lmv_stripe_md, mea_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_stripe_md, mea_count));
+	LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_count));
+	LASSERTF((int)offsetof(struct lmv_stripe_md, mea_master) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_stripe_md, mea_master));
+	LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_master) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_master));
+	LASSERTF((int)offsetof(struct lmv_stripe_md, mea_padding) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_stripe_md, mea_padding));
+	LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_padding));
+	CLASSERT(LOV_MAXPOOLNAME == 16);
+	LASSERTF((int)offsetof(struct lmv_stripe_md, mea_pool_name[16]) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_stripe_md, mea_pool_name[16]));
+	LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_pool_name[16]) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_pool_name[16]));
+	LASSERTF((int)offsetof(struct lmv_stripe_md, mea_ids[0]) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_stripe_md, mea_ids[0]));
+	LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_ids[0]) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_ids[0]));
+
+	/* Checks for struct lov_desc */
+	LASSERTF((int)sizeof(struct lov_desc) == 88, "found %lld\n",
+		 (long long)(int)sizeof(struct lov_desc));
+	LASSERTF((int)offsetof(struct lov_desc, ld_tgt_count) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_tgt_count));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_tgt_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_tgt_count));
+	LASSERTF((int)offsetof(struct lov_desc, ld_active_tgt_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_active_tgt_count));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count));
+	LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_count) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_default_stripe_count));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count));
+	LASSERTF((int)offsetof(struct lov_desc, ld_pattern) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_pattern));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_pattern) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_pattern));
+	LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_size) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_default_stripe_size));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size));
+	LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_offset) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset));
+	LASSERTF((int)offsetof(struct lov_desc, ld_padding_0) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_padding_0));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_0) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_0));
+	LASSERTF((int)offsetof(struct lov_desc, ld_qos_maxage) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_qos_maxage));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_maxage) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_maxage));
+	LASSERTF((int)offsetof(struct lov_desc, ld_padding_1) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_padding_1));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_1));
+	LASSERTF((int)offsetof(struct lov_desc, ld_padding_2) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_padding_2));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_2));
+	LASSERTF((int)offsetof(struct lov_desc, ld_uuid) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_uuid));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_uuid) == 40, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_uuid));
+	CLASSERT(LOV_DESC_MAGIC == 0xB0CCDE5C);
+
+	/* Checks for struct ldlm_res_id */
+	LASSERTF((int)sizeof(struct ldlm_res_id) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_res_id));
+	CLASSERT(RES_NAME_SIZE == 4);
+	LASSERTF((int)offsetof(struct ldlm_res_id, name[4]) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_res_id, name[4]));
+	LASSERTF((int)sizeof(((struct ldlm_res_id *)0)->name[4]) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_res_id *)0)->name[4]));
+
+	/* Checks for struct ldlm_extent */
+	LASSERTF((int)sizeof(struct ldlm_extent) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_extent));
+	LASSERTF((int)offsetof(struct ldlm_extent, start) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_extent, start));
+	LASSERTF((int)sizeof(((struct ldlm_extent *)0)->start) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_extent *)0)->start));
+	LASSERTF((int)offsetof(struct ldlm_extent, end) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_extent, end));
+	LASSERTF((int)sizeof(((struct ldlm_extent *)0)->end) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_extent *)0)->end));
+	LASSERTF((int)offsetof(struct ldlm_extent, gid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_extent, gid));
+	LASSERTF((int)sizeof(((struct ldlm_extent *)0)->gid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_extent *)0)->gid));
+
+	/* Checks for struct ldlm_inodebits */
+	LASSERTF((int)sizeof(struct ldlm_inodebits) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_inodebits));
+	LASSERTF((int)offsetof(struct ldlm_inodebits, bits) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_inodebits, bits));
+	LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->bits) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_inodebits *)0)->bits));
+
+	/* Checks for struct ldlm_flock_wire */
+	LASSERTF((int)sizeof(struct ldlm_flock_wire) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_flock_wire));
+	LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_start) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_flock_wire, lfw_start));
+	LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start));
+	LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_end) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_flock_wire, lfw_end));
+	LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end));
+	LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_owner) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_flock_wire, lfw_owner));
+	LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner));
+	LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_padding) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_flock_wire, lfw_padding));
+	LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding));
+	LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_pid) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_flock_wire, lfw_pid));
+	LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid));
+
+	/* Checks for struct ldlm_intent */
+	LASSERTF((int)sizeof(struct ldlm_intent) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_intent));
+	LASSERTF((int)offsetof(struct ldlm_intent, opc) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_intent, opc));
+	LASSERTF((int)sizeof(((struct ldlm_intent *)0)->opc) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_intent *)0)->opc));
+
+	/* Checks for struct ldlm_resource_desc */
+	LASSERTF((int)sizeof(struct ldlm_resource_desc) == 40, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_resource_desc));
+	LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_type) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_resource_desc, lr_type));
+	LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_type) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_type));
+	LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_padding) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_resource_desc, lr_padding));
+	LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding));
+	LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_name) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_resource_desc, lr_name));
+	LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_name) == 32, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_name));
+
+	/* Checks for struct ldlm_lock_desc */
+	LASSERTF((int)sizeof(struct ldlm_lock_desc) == 80, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_lock_desc));
+	LASSERTF((int)offsetof(struct ldlm_lock_desc, l_resource) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_lock_desc, l_resource));
+	LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_resource) == 40, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_resource));
+	LASSERTF((int)offsetof(struct ldlm_lock_desc, l_req_mode) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_lock_desc, l_req_mode));
+	LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode));
+	LASSERTF((int)offsetof(struct ldlm_lock_desc, l_granted_mode) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_lock_desc, l_granted_mode));
+	LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode));
+	LASSERTF((int)offsetof(struct ldlm_lock_desc, l_policy_data) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_lock_desc, l_policy_data));
+	LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data) == 32, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data));
+
+	/* Checks for struct ldlm_request */
+	LASSERTF((int)sizeof(struct ldlm_request) == 104, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_request));
+	LASSERTF((int)offsetof(struct ldlm_request, lock_flags) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_request, lock_flags));
+	LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_request *)0)->lock_flags));
+	LASSERTF((int)offsetof(struct ldlm_request, lock_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_request, lock_count));
+	LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_request *)0)->lock_count));
+	LASSERTF((int)offsetof(struct ldlm_request, lock_desc) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_request, lock_desc));
+	LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_desc) == 80, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_request *)0)->lock_desc));
+	LASSERTF((int)offsetof(struct ldlm_request, lock_handle) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_request, lock_handle));
+	LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_handle) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_request *)0)->lock_handle));
+
+	/* Checks for struct ldlm_reply */
+	LASSERTF((int)sizeof(struct ldlm_reply) == 112, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_reply));
+	LASSERTF((int)offsetof(struct ldlm_reply, lock_flags) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_reply, lock_flags));
+	LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_flags));
+	LASSERTF((int)offsetof(struct ldlm_reply, lock_padding) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_reply, lock_padding));
+	LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_padding));
+	LASSERTF((int)offsetof(struct ldlm_reply, lock_desc) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_reply, lock_desc));
+	LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_desc) == 80, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_desc));
+	LASSERTF((int)offsetof(struct ldlm_reply, lock_handle) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_reply, lock_handle));
+	LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_handle) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_handle));
+	LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res1) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_reply, lock_policy_res1));
+	LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1));
+	LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res2) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_reply, lock_policy_res2));
+	LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2));
+
+	/* Checks for struct ost_lvb_v1 */
+	LASSERTF((int)sizeof(struct ost_lvb_v1) == 40, "found %lld\n",
+		 (long long)(int)sizeof(struct ost_lvb_v1));
+	LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_size) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb_v1, lvb_size));
+	LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size));
+	LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_mtime) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb_v1, lvb_mtime));
+	LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime));
+	LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_atime) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb_v1, lvb_atime));
+	LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime));
+	LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_ctime) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb_v1, lvb_ctime));
+	LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime));
+	LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_blocks) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb_v1, lvb_blocks));
+	LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks));
+
+	/* Checks for struct ost_lvb */
+	LASSERTF((int)sizeof(struct ost_lvb) == 56, "found %lld\n",
+		 (long long)(int)sizeof(struct ost_lvb));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_size) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_size));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_size));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_mtime));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_atime) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_atime));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_ctime));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_blocks) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_blocks));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_blocks) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_blocks));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime_ns) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_mtime_ns));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_atime_ns) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_atime_ns));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime_ns) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_ctime_ns));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_padding) == 52, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_padding));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_padding));
+
+	/* Checks for struct lquota_lvb */
+	LASSERTF((int)sizeof(struct lquota_lvb) == 40, "found %lld\n",
+		 (long long)(int)sizeof(struct lquota_lvb));
+	LASSERTF((int)offsetof(struct lquota_lvb, lvb_flags) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_lvb, lvb_flags));
+	LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_flags) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_flags));
+	LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_may_rel) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_lvb, lvb_id_may_rel));
+	LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel));
+	LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_rel) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_lvb, lvb_id_rel));
+	LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel));
+	LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_qunit) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_lvb, lvb_id_qunit));
+	LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit));
+	LASSERTF((int)offsetof(struct lquota_lvb, lvb_pad1) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_lvb, lvb_pad1));
+	LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_pad1) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_pad1));
+	LASSERTF(LQUOTA_FL_EDQUOT == 1, "found %lld\n",
+		 (long long)LQUOTA_FL_EDQUOT);
+
+	/* Checks for struct ldlm_gl_lquota_desc */
+	LASSERTF((int)sizeof(struct ldlm_gl_lquota_desc) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_gl_lquota_desc));
+	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_id) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_id));
+	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id));
+	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_flags) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_flags));
+	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags));
+	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_ver) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_ver));
+	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver));
+	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit));
+	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit));
+	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit));
+	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit));
+	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_time) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_time));
+	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time));
+	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2));
+	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_pad2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_pad2));
+
+	/* Checks for struct mgs_send_param */
+	LASSERTF((int)sizeof(struct mgs_send_param) == 1024, "found %lld\n",
+		 (long long)(int)sizeof(struct mgs_send_param));
+	CLASSERT(MGS_PARAM_MAXLEN == 1024);
+	LASSERTF((int)offsetof(struct mgs_send_param, mgs_param[1024]) == 1024, "found %lld\n",
+		 (long long)(int)offsetof(struct mgs_send_param, mgs_param[1024]));
+	LASSERTF((int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024]) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024]));
+
+	/* Checks for struct cfg_marker */
+	LASSERTF((int)sizeof(struct cfg_marker) == 160, "found %lld\n",
+		 (long long)(int)sizeof(struct cfg_marker));
+	LASSERTF((int)offsetof(struct cfg_marker, cm_step) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct cfg_marker, cm_step));
+	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_step) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_step));
+	LASSERTF((int)offsetof(struct cfg_marker, cm_flags) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct cfg_marker, cm_flags));
+	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_flags));
+	LASSERTF((int)offsetof(struct cfg_marker, cm_vers) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct cfg_marker, cm_vers));
+	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_vers) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_vers));
+	LASSERTF((int)offsetof(struct cfg_marker, cm_padding) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct cfg_marker, cm_padding));
+	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_padding));
+	LASSERTF((int)offsetof(struct cfg_marker, cm_createtime) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct cfg_marker, cm_createtime));
+	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_createtime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_createtime));
+	LASSERTF((int)offsetof(struct cfg_marker, cm_canceltime) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct cfg_marker, cm_canceltime));
+	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_canceltime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_canceltime));
+	LASSERTF((int)offsetof(struct cfg_marker, cm_tgtname) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct cfg_marker, cm_tgtname));
+	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_tgtname) == 64, "found %lld\n",
+		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_tgtname));
+	LASSERTF((int)offsetof(struct cfg_marker, cm_comment) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct cfg_marker, cm_comment));
+	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_comment) == 64, "found %lld\n",
+		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_comment));
+
+	/* Checks for struct llog_logid */
+	LASSERTF((int)sizeof(struct llog_logid) == 20, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_logid));
+	LASSERTF((int)offsetof(struct llog_logid, lgl_oi) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_logid, lgl_oi));
+	LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_oi) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_logid *)0)->lgl_oi));
+	LASSERTF((int)offsetof(struct llog_logid, lgl_ogen) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_logid, lgl_ogen));
+	LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_ogen) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_logid *)0)->lgl_ogen));
+	CLASSERT(OST_SZ_REC == 274730752);
+	CLASSERT(MDS_UNLINK_REC == 274801668);
+	CLASSERT(MDS_UNLINK64_REC == 275325956);
+	CLASSERT(MDS_SETATTR64_REC == 275325953);
+	CLASSERT(OBD_CFG_REC == 274857984);
+	CLASSERT(LLOG_GEN_REC == 274989056);
+	CLASSERT(CHANGELOG_REC == 275120128);
+	CLASSERT(CHANGELOG_USER_REC == 275185664);
+	CLASSERT(LLOG_HDR_MAGIC == 275010873);
+	CLASSERT(LLOG_LOGID_MAGIC == 275010875);
+
+	/* Checks for struct llog_catid */
+	LASSERTF((int)sizeof(struct llog_catid) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_catid));
+	LASSERTF((int)offsetof(struct llog_catid, lci_logid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_catid, lci_logid));
+	LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_logid) == 20, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_catid *)0)->lci_logid));
+	LASSERTF((int)offsetof(struct llog_catid, lci_padding1) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_catid, lci_padding1));
+	LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding1));
+	LASSERTF((int)offsetof(struct llog_catid, lci_padding2) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_catid, lci_padding2));
+	LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding2));
+	LASSERTF((int)offsetof(struct llog_catid, lci_padding3) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_catid, lci_padding3));
+	LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding3) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding3));
+
+	/* Checks for struct llog_rec_hdr */
+	LASSERTF((int)sizeof(struct llog_rec_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_rec_hdr));
+	LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_len) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_rec_hdr, lrh_len));
+	LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_len) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_len));
+	LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_index) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_rec_hdr, lrh_index));
+	LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_index) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_index));
+	LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_type) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_rec_hdr, lrh_type));
+	LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_type) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_type));
+	LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_id) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_rec_hdr, lrh_id));
+	LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_id));
+
+	/* Checks for struct llog_rec_tail */
+	LASSERTF((int)sizeof(struct llog_rec_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_rec_tail));
+	LASSERTF((int)offsetof(struct llog_rec_tail, lrt_len) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_rec_tail, lrt_len));
+	LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_len) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_len));
+	LASSERTF((int)offsetof(struct llog_rec_tail, lrt_index) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_rec_tail, lrt_index));
+	LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_index) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_index));
+
+	/* Checks for struct llog_logid_rec */
+	LASSERTF((int)sizeof(struct llog_logid_rec) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_logid_rec));
+	LASSERTF((int)offsetof(struct llog_logid_rec, lid_hdr) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_logid_rec, lid_hdr));
+	LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_hdr));
+	LASSERTF((int)offsetof(struct llog_logid_rec, lid_id) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_logid_rec, lid_id));
+	LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_id) == 20, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_id));
+	LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding1) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_logid_rec, lid_padding1));
+	LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding1));
+	LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding2) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_logid_rec, lid_padding2));
+	LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding2));
+	LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding3) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_logid_rec, lid_padding3));
+	LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding3) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding3));
+	LASSERTF((int)offsetof(struct llog_logid_rec, lid_tail) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_logid_rec, lid_tail));
+	LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_tail));
+
+	/* Checks for struct llog_unlink_rec */
+	LASSERTF((int)sizeof(struct llog_unlink_rec) == 40, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_unlink_rec));
+	LASSERTF((int)offsetof(struct llog_unlink_rec, lur_hdr) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink_rec, lur_hdr));
+	LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr));
+	LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink_rec, lur_oid));
+	LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oid));
+	LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oseq) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink_rec, lur_oseq));
+	LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq));
+	LASSERTF((int)offsetof(struct llog_unlink_rec, lur_count) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink_rec, lur_count));
+	LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_count));
+	LASSERTF((int)offsetof(struct llog_unlink_rec, lur_tail) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink_rec, lur_tail));
+	LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_tail));
+	/* Checks for struct llog_unlink64_rec */
+	LASSERTF((int)sizeof(struct llog_unlink64_rec) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_unlink64_rec));
+	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_hdr) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_hdr));
+	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr));
+	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_fid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_fid));
+	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid));
+	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_count) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_count));
+	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_count));
+	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_tail) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_tail));
+	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_tail));
+	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding1) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding1));
+	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1));
+	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding2) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding2));
+	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2));
+	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding3) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding3));
+	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3));
+
+	/* Checks for struct llog_setattr64_rec */
+	LASSERTF((int)sizeof(struct llog_setattr64_rec) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_setattr64_rec));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_hdr) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_hdr));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_oi) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_oi));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid_h) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid_h));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid_h) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid_h));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_padding) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_padding));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_padding) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_padding));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_tail) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_tail));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail));
+
+	/* Checks for struct llog_size_change_rec */
+	LASSERTF((int)sizeof(struct llog_size_change_rec) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_size_change_rec));
+	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_hdr) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_hdr));
+	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr));
+	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_fid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_fid));
+	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid));
+	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_ioepoch) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_ioepoch));
+	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch));
+	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding1) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding1));
+	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1));
+	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding2) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding2));
+	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding2));
+	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding3) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding3));
+	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3));
+	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_tail) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_tail));
+	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail));
+
+	/* Checks for struct changelog_rec */
+	LASSERTF((int)sizeof(struct changelog_rec) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct changelog_rec));
+	LASSERTF((int)offsetof(struct changelog_rec, cr_namelen) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_rec, cr_namelen));
+	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_namelen) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_namelen));
+	LASSERTF((int)offsetof(struct changelog_rec, cr_flags) == 2, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_rec, cr_flags));
+	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_flags) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_flags));
+	LASSERTF((int)offsetof(struct changelog_rec, cr_type) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_rec, cr_type));
+	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_type) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_type));
+	LASSERTF((int)offsetof(struct changelog_rec, cr_index) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_rec, cr_index));
+	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_index) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_index));
+	LASSERTF((int)offsetof(struct changelog_rec, cr_prev) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_rec, cr_prev));
+	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_prev) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_prev));
+	LASSERTF((int)offsetof(struct changelog_rec, cr_time) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_rec, cr_time));
+	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_time) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_time));
+	LASSERTF((int)offsetof(struct changelog_rec, cr_tfid) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_rec, cr_tfid));
+	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_tfid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_tfid));
+	LASSERTF((int)offsetof(struct changelog_rec, cr_pfid) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_rec, cr_pfid));
+	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_pfid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_pfid));
+
+	/* Checks for struct changelog_ext_rec */
+	LASSERTF((int)sizeof(struct changelog_ext_rec) == 96, "found %lld\n",
+		 (long long)(int)sizeof(struct changelog_ext_rec));
+	LASSERTF((int)offsetof(struct changelog_ext_rec, cr_namelen) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_ext_rec, cr_namelen));
+	LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_namelen) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_namelen));
+	LASSERTF((int)offsetof(struct changelog_ext_rec, cr_flags) == 2, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_ext_rec, cr_flags));
+	LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_flags) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_flags));
+	LASSERTF((int)offsetof(struct changelog_ext_rec, cr_type) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_ext_rec, cr_type));
+	LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_type) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_type));
+	LASSERTF((int)offsetof(struct changelog_ext_rec, cr_index) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_ext_rec, cr_index));
+	LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_index) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_index));
+	LASSERTF((int)offsetof(struct changelog_ext_rec, cr_prev) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_ext_rec, cr_prev));
+	LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_prev) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_prev));
+	LASSERTF((int)offsetof(struct changelog_ext_rec, cr_time) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_ext_rec, cr_time));
+	LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_time) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_time));
+	LASSERTF((int)offsetof(struct changelog_ext_rec, cr_tfid) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_ext_rec, cr_tfid));
+	LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_tfid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_tfid));
+	LASSERTF((int)offsetof(struct changelog_ext_rec, cr_pfid) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_ext_rec, cr_pfid));
+	LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_pfid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_pfid));
+	LASSERTF((int)offsetof(struct changelog_ext_rec, cr_sfid) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_ext_rec, cr_sfid));
+	LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_sfid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_sfid));
+	LASSERTF((int)offsetof(struct changelog_ext_rec, cr_spfid) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_ext_rec, cr_spfid));
+	LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_spfid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_spfid));
+
+	/* Checks for struct changelog_setinfo */
+	LASSERTF((int)sizeof(struct changelog_setinfo) == 12, "found %lld\n",
+		 (long long)(int)sizeof(struct changelog_setinfo));
+	LASSERTF((int)offsetof(struct changelog_setinfo, cs_recno) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_setinfo, cs_recno));
+	LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_recno) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_recno));
+	LASSERTF((int)offsetof(struct changelog_setinfo, cs_id) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_setinfo, cs_id));
+	LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_id));
+
+	/* Checks for struct llog_changelog_rec */
+	LASSERTF((int)sizeof(struct llog_changelog_rec) == 88, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_changelog_rec));
+	LASSERTF((int)offsetof(struct llog_changelog_rec, cr_hdr) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_changelog_rec, cr_hdr));
+	LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr));
+	LASSERTF((int)offsetof(struct llog_changelog_rec, cr) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_changelog_rec, cr));
+	LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr) == 64, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr));
+	LASSERTF((int)offsetof(struct llog_changelog_rec, cr_tail) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_changelog_rec, cr_tail));
+	LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_tail));
+
+	/* Checks for struct llog_changelog_user_rec */
+	LASSERTF((int)sizeof(struct llog_changelog_user_rec) == 40, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_changelog_user_rec));
+	LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_hdr) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_changelog_user_rec, cur_hdr));
+	LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr));
+	LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_id) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_changelog_user_rec, cur_id));
+	LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id));
+	LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_padding) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_changelog_user_rec, cur_padding));
+	LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding));
+	LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_endrec) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_changelog_user_rec, cur_endrec));
+	LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec));
+	LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_tail) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_changelog_user_rec, cur_tail));
+	LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail));
+
+	/* Checks for struct llog_gen */
+	LASSERTF((int)sizeof(struct llog_gen) == 16, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_gen));
+	LASSERTF((int)offsetof(struct llog_gen, mnt_cnt) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_gen, mnt_cnt));
+	LASSERTF((int)sizeof(((struct llog_gen *)0)->mnt_cnt) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_gen *)0)->mnt_cnt));
+	LASSERTF((int)offsetof(struct llog_gen, conn_cnt) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_gen, conn_cnt));
+	LASSERTF((int)sizeof(((struct llog_gen *)0)->conn_cnt) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_gen *)0)->conn_cnt));
+
+	/* Checks for struct llog_gen_rec */
+	LASSERTF((int)sizeof(struct llog_gen_rec) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_gen_rec));
+	LASSERTF((int)offsetof(struct llog_gen_rec, lgr_hdr) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_gen_rec, lgr_hdr));
+	LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr));
+	LASSERTF((int)offsetof(struct llog_gen_rec, lgr_gen) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_gen_rec, lgr_gen));
+	LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_gen) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_gen));
+	LASSERTF((int)offsetof(struct llog_gen_rec, lgr_tail) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_gen_rec, lgr_tail));
+	LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_tail));
+
+	/* Checks for struct llog_log_hdr */
+	LASSERTF((int)sizeof(struct llog_log_hdr) == 8192, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_log_hdr));
+	LASSERTF((int)offsetof(struct llog_log_hdr, llh_hdr) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_log_hdr, llh_hdr));
+	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_hdr));
+	LASSERTF((int)offsetof(struct llog_log_hdr, llh_timestamp) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_log_hdr, llh_timestamp));
+	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp));
+	LASSERTF((int)offsetof(struct llog_log_hdr, llh_count) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_log_hdr, llh_count));
+	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_count));
+	LASSERTF((int)offsetof(struct llog_log_hdr, llh_bitmap_offset) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_log_hdr, llh_bitmap_offset));
+	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset));
+	LASSERTF((int)offsetof(struct llog_log_hdr, llh_size) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_log_hdr, llh_size));
+	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_size) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_size));
+	LASSERTF((int)offsetof(struct llog_log_hdr, llh_flags) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_log_hdr, llh_flags));
+	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_flags));
+	LASSERTF((int)offsetof(struct llog_log_hdr, llh_cat_idx) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_log_hdr, llh_cat_idx));
+	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx));
+	LASSERTF((int)offsetof(struct llog_log_hdr, llh_tgtuuid) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_log_hdr, llh_tgtuuid));
+	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid) == 40, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid));
+	LASSERTF((int)offsetof(struct llog_log_hdr, llh_reserved) == 84, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_log_hdr, llh_reserved));
+	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_reserved) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_reserved));
+	LASSERTF((int)offsetof(struct llog_log_hdr, llh_bitmap) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_log_hdr, llh_bitmap));
+	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap) == 8096, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap));
+	LASSERTF((int)offsetof(struct llog_log_hdr, llh_tail) == 8184, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_log_hdr, llh_tail));
+	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_tail));
+
+	/* Checks for struct llog_cookie */
+	LASSERTF((int)sizeof(struct llog_cookie) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_cookie));
+	LASSERTF((int)offsetof(struct llog_cookie, lgc_lgl) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_cookie, lgc_lgl));
+	LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_lgl) == 20, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_lgl));
+	LASSERTF((int)offsetof(struct llog_cookie, lgc_subsys) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_cookie, lgc_subsys));
+	LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_subsys) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_subsys));
+	LASSERTF((int)offsetof(struct llog_cookie, lgc_index) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_cookie, lgc_index));
+	LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_index) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_index));
+	LASSERTF((int)offsetof(struct llog_cookie, lgc_padding) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_cookie, lgc_padding));
+	LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_padding));
+
+	/* Checks for struct llogd_body */
+	LASSERTF((int)sizeof(struct llogd_body) == 48, "found %lld\n",
+		 (long long)(int)sizeof(struct llogd_body));
+	LASSERTF((int)offsetof(struct llogd_body, lgd_logid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llogd_body, lgd_logid));
+	LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_logid) == 20, "found %lld\n",
+		 (long long)(int)sizeof(((struct llogd_body *)0)->lgd_logid));
+	LASSERTF((int)offsetof(struct llogd_body, lgd_ctxt_idx) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct llogd_body, lgd_ctxt_idx));
+	LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx));
+	LASSERTF((int)offsetof(struct llogd_body, lgd_llh_flags) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct llogd_body, lgd_llh_flags));
+	LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_llh_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llogd_body *)0)->lgd_llh_flags));
+	LASSERTF((int)offsetof(struct llogd_body, lgd_index) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct llogd_body, lgd_index));
+	LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_index) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llogd_body *)0)->lgd_index));
+	LASSERTF((int)offsetof(struct llogd_body, lgd_saved_index) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct llogd_body, lgd_saved_index));
+	LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_saved_index) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llogd_body *)0)->lgd_saved_index));
+	LASSERTF((int)offsetof(struct llogd_body, lgd_len) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct llogd_body, lgd_len));
+	LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_len) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llogd_body *)0)->lgd_len));
+	LASSERTF((int)offsetof(struct llogd_body, lgd_cur_offset) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct llogd_body, lgd_cur_offset));
+	LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_cur_offset) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llogd_body *)0)->lgd_cur_offset));
+	CLASSERT(LLOG_ORIGIN_HANDLE_CREATE == 501);
+	CLASSERT(LLOG_ORIGIN_HANDLE_NEXT_BLOCK == 502);
+	CLASSERT(LLOG_ORIGIN_HANDLE_READ_HEADER == 503);
+	CLASSERT(LLOG_ORIGIN_HANDLE_WRITE_REC == 504);
+	CLASSERT(LLOG_ORIGIN_HANDLE_CLOSE == 505);
+	CLASSERT(LLOG_ORIGIN_CONNECT == 506);
+	CLASSERT(LLOG_CATINFO == 507);
+	CLASSERT(LLOG_ORIGIN_HANDLE_PREV_BLOCK == 508);
+	CLASSERT(LLOG_ORIGIN_HANDLE_DESTROY == 509);
+	CLASSERT(LLOG_FIRST_OPC == 501);
+	CLASSERT(LLOG_LAST_OPC == 510);
+
+	/* Checks for struct llogd_conn_body */
+	LASSERTF((int)sizeof(struct llogd_conn_body) == 40, "found %lld\n",
+		 (long long)(int)sizeof(struct llogd_conn_body));
+	LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_gen) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llogd_conn_body, lgdc_gen));
+	LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen));
+	LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_logid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llogd_conn_body, lgdc_logid));
+	LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid) == 20, "found %lld\n",
+		 (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid));
+	LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx));
+	LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx));
+
+	/* Checks for struct ll_fiemap_info_key */
+	LASSERTF((int)sizeof(struct ll_fiemap_info_key) == 248, "found %lld\n",
+		 (long long)(int)sizeof(struct ll_fiemap_info_key));
+	LASSERTF((int)offsetof(struct ll_fiemap_info_key, name[8]) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_fiemap_info_key, name[8]));
+	LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->name[8]) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->name[8]));
+	LASSERTF((int)offsetof(struct ll_fiemap_info_key, oa) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_fiemap_info_key, oa));
+	LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->oa) == 208, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->oa));
+	LASSERTF((int)offsetof(struct ll_fiemap_info_key, fiemap) == 216, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_fiemap_info_key, fiemap));
+	LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->fiemap) == 32, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->fiemap));
+
+	/* Checks for struct quota_body */
+	LASSERTF((int)sizeof(struct quota_body) == 112, "found %lld\n",
+		 (long long)(int)sizeof(struct quota_body));
+	LASSERTF((int)offsetof(struct quota_body, qb_fid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct quota_body, qb_fid));
+	LASSERTF((int)sizeof(((struct quota_body *)0)->qb_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct quota_body *)0)->qb_fid));
+	LASSERTF((int)offsetof(struct quota_body, qb_id) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct quota_body, qb_id));
+	LASSERTF((int)sizeof(((struct quota_body *)0)->qb_id) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct quota_body *)0)->qb_id));
+	LASSERTF((int)offsetof(struct quota_body, qb_flags) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct quota_body, qb_flags));
+	LASSERTF((int)sizeof(((struct quota_body *)0)->qb_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct quota_body *)0)->qb_flags));
+	LASSERTF((int)offsetof(struct quota_body, qb_padding) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct quota_body, qb_padding));
+	LASSERTF((int)sizeof(((struct quota_body *)0)->qb_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct quota_body *)0)->qb_padding));
+	LASSERTF((int)offsetof(struct quota_body, qb_count) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct quota_body, qb_count));
+	LASSERTF((int)sizeof(((struct quota_body *)0)->qb_count) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct quota_body *)0)->qb_count));
+	LASSERTF((int)offsetof(struct quota_body, qb_usage) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct quota_body, qb_usage));
+	LASSERTF((int)sizeof(((struct quota_body *)0)->qb_usage) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct quota_body *)0)->qb_usage));
+	LASSERTF((int)offsetof(struct quota_body, qb_slv_ver) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct quota_body, qb_slv_ver));
+	LASSERTF((int)sizeof(((struct quota_body *)0)->qb_slv_ver) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct quota_body *)0)->qb_slv_ver));
+	LASSERTF((int)offsetof(struct quota_body, qb_lockh) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct quota_body, qb_lockh));
+	LASSERTF((int)sizeof(((struct quota_body *)0)->qb_lockh) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct quota_body *)0)->qb_lockh));
+	LASSERTF((int)offsetof(struct quota_body, qb_glb_lockh) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct quota_body, qb_glb_lockh));
+	LASSERTF((int)sizeof(((struct quota_body *)0)->qb_glb_lockh) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct quota_body *)0)->qb_glb_lockh));
+	LASSERTF((int)offsetof(struct quota_body, qb_padding1[4]) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct quota_body, qb_padding1[4]));
+	LASSERTF((int)sizeof(((struct quota_body *)0)->qb_padding1[4]) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct quota_body *)0)->qb_padding1[4]));
+
+	/* Checks for struct mgs_target_info */
+	LASSERTF((int)sizeof(struct mgs_target_info) == 4544, "found %lld\n",
+		 (long long)(int)sizeof(struct mgs_target_info));
+	LASSERTF((int)offsetof(struct mgs_target_info, mti_lustre_ver) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct mgs_target_info, mti_lustre_ver));
+	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver));
+	LASSERTF((int)offsetof(struct mgs_target_info, mti_stripe_index) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct mgs_target_info, mti_stripe_index));
+	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index));
+	LASSERTF((int)offsetof(struct mgs_target_info, mti_config_ver) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct mgs_target_info, mti_config_ver));
+	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_config_ver) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_config_ver));
+	LASSERTF((int)offsetof(struct mgs_target_info, mti_flags) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct mgs_target_info, mti_flags));
+	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_flags));
+	LASSERTF((int)offsetof(struct mgs_target_info, mti_nid_count) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct mgs_target_info, mti_nid_count));
+	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nid_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_nid_count));
+	LASSERTF((int)offsetof(struct mgs_target_info, mti_instance) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct mgs_target_info, mti_instance));
+	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_instance) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_instance));
+	LASSERTF((int)offsetof(struct mgs_target_info, mti_fsname) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct mgs_target_info, mti_fsname));
+	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_fsname) == 64, "found %lld\n",
+		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_fsname));
+	LASSERTF((int)offsetof(struct mgs_target_info, mti_svname) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct mgs_target_info, mti_svname));
+	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_svname) == 64, "found %lld\n",
+		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_svname));
+	LASSERTF((int)offsetof(struct mgs_target_info, mti_uuid) == 152, "found %lld\n",
+		 (long long)(int)offsetof(struct mgs_target_info, mti_uuid));
+	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_uuid) == 40, "found %lld\n",
+		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_uuid));
+	LASSERTF((int)offsetof(struct mgs_target_info, mti_nids) == 192, "found %lld\n",
+		 (long long)(int)offsetof(struct mgs_target_info, mti_nids));
+	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nids) == 256, "found %lld\n",
+		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_nids));
+	LASSERTF((int)offsetof(struct mgs_target_info, mti_params) == 448, "found %lld\n",
+		 (long long)(int)offsetof(struct mgs_target_info, mti_params));
+	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_params) == 4096, "found %lld\n",
+		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_params));
+
+	/* Checks for struct lustre_capa */
+	LASSERTF((int)sizeof(struct lustre_capa) == 120, "found %lld\n",
+		 (long long)(int)sizeof(struct lustre_capa));
+	LASSERTF((int)offsetof(struct lustre_capa, lc_fid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa, lc_fid));
+	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_fid));
+	LASSERTF((int)offsetof(struct lustre_capa, lc_opc) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa, lc_opc));
+	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_opc) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_opc));
+	LASSERTF((int)offsetof(struct lustre_capa, lc_uid) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa, lc_uid));
+	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_uid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_uid));
+	LASSERTF((int)offsetof(struct lustre_capa, lc_gid) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa, lc_gid));
+	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_gid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_gid));
+	LASSERTF((int)offsetof(struct lustre_capa, lc_flags) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa, lc_flags));
+	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_flags));
+	LASSERTF((int)offsetof(struct lustre_capa, lc_keyid) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa, lc_keyid));
+	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_keyid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_keyid));
+	LASSERTF((int)offsetof(struct lustre_capa, lc_timeout) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa, lc_timeout));
+	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_timeout) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_timeout));
+	LASSERTF((int)offsetof(struct lustre_capa, lc_expiry) == 52, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa, lc_expiry));
+	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_expiry) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_expiry));
+	CLASSERT(CAPA_HMAC_MAX_LEN == 64);
+	LASSERTF((int)offsetof(struct lustre_capa, lc_hmac[64]) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa, lc_hmac[64]));
+	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_hmac[64]) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_hmac[64]));
+
+	/* Checks for struct lustre_capa_key */
+	LASSERTF((int)sizeof(struct lustre_capa_key) == 72, "found %lld\n",
+		 (long long)(int)sizeof(struct lustre_capa_key));
+	LASSERTF((int)offsetof(struct lustre_capa_key, lk_seq) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa_key, lk_seq));
+	LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_seq) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_seq));
+	LASSERTF((int)offsetof(struct lustre_capa_key, lk_keyid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa_key, lk_keyid));
+	LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_keyid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_keyid));
+	LASSERTF((int)offsetof(struct lustre_capa_key, lk_padding) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa_key, lk_padding));
+	LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_padding));
+	CLASSERT(CAPA_HMAC_KEY_MAX_LEN == 56);
+	LASSERTF((int)offsetof(struct lustre_capa_key, lk_key[56]) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa_key, lk_key[56]));
+	LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_key[56]) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_key[56]));
+
+	/* Checks for struct getinfo_fid2path */
+	LASSERTF((int)sizeof(struct getinfo_fid2path) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct getinfo_fid2path));
+	LASSERTF((int)offsetof(struct getinfo_fid2path, gf_fid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct getinfo_fid2path, gf_fid));
+	LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_fid));
+	LASSERTF((int)offsetof(struct getinfo_fid2path, gf_recno) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct getinfo_fid2path, gf_recno));
+	LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_recno) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_recno));
+	LASSERTF((int)offsetof(struct getinfo_fid2path, gf_linkno) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct getinfo_fid2path, gf_linkno));
+	LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno));
+	LASSERTF((int)offsetof(struct getinfo_fid2path, gf_pathlen) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct getinfo_fid2path, gf_pathlen));
+	LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen));
+	LASSERTF((int)offsetof(struct getinfo_fid2path, gf_path[0]) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct getinfo_fid2path, gf_path[0]));
+	LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_path[0]) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_path[0]));
+
+	/* Checks for struct ll_user_fiemap */
+	LASSERTF((int)sizeof(struct ll_user_fiemap) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct ll_user_fiemap));
+	LASSERTF((int)offsetof(struct ll_user_fiemap, fm_start) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_user_fiemap, fm_start));
+	LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_start) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_start));
+	LASSERTF((int)offsetof(struct ll_user_fiemap, fm_length) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_user_fiemap, fm_length));
+	LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_length) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_length));
+	LASSERTF((int)offsetof(struct ll_user_fiemap, fm_flags) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_user_fiemap, fm_flags));
+	LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_flags));
+	LASSERTF((int)offsetof(struct ll_user_fiemap, fm_mapped_extents) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_user_fiemap, fm_mapped_extents));
+	LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_mapped_extents) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_mapped_extents));
+	LASSERTF((int)offsetof(struct ll_user_fiemap, fm_extent_count) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_user_fiemap, fm_extent_count));
+	LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_extent_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_extent_count));
+	LASSERTF((int)offsetof(struct ll_user_fiemap, fm_reserved) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_user_fiemap, fm_reserved));
+	LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_reserved) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_reserved));
+	LASSERTF((int)offsetof(struct ll_user_fiemap, fm_extents) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_user_fiemap, fm_extents));
+	LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_extents) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_extents));
+	CLASSERT(FIEMAP_FLAG_SYNC == 0x00000001);
+	CLASSERT(FIEMAP_FLAG_XATTR == 0x00000002);
+	CLASSERT(FIEMAP_FLAG_DEVICE_ORDER == 0x40000000);
+
+	/* Checks for struct ll_fiemap_extent */
+	LASSERTF((int)sizeof(struct ll_fiemap_extent) == 56, "found %lld\n",
+		 (long long)(int)sizeof(struct ll_fiemap_extent));
+	LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_logical) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_fiemap_extent, fe_logical));
+	LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_logical) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_logical));
+	LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_physical) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_fiemap_extent, fe_physical));
+	LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_physical) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_physical));
+	LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_length) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_fiemap_extent, fe_length));
+	LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_length) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_length));
+	LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_flags) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_fiemap_extent, fe_flags));
+	LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags));
+	LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_device) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_fiemap_extent, fe_device));
+	LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_device) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_device));
+	CLASSERT(FIEMAP_EXTENT_LAST == 0x00000001);
+	CLASSERT(FIEMAP_EXTENT_UNKNOWN == 0x00000002);
+	CLASSERT(FIEMAP_EXTENT_DELALLOC == 0x00000004);
+	CLASSERT(FIEMAP_EXTENT_ENCODED == 0x00000008);
+	CLASSERT(FIEMAP_EXTENT_DATA_ENCRYPTED == 0x00000080);
+	CLASSERT(FIEMAP_EXTENT_NOT_ALIGNED == 0x00000100);
+	CLASSERT(FIEMAP_EXTENT_DATA_INLINE == 0x00000200);
+	CLASSERT(FIEMAP_EXTENT_DATA_TAIL == 0x00000400);
+	CLASSERT(FIEMAP_EXTENT_UNWRITTEN == 0x00000800);
+	CLASSERT(FIEMAP_EXTENT_MERGED == 0x00001000);
+	CLASSERT(FIEMAP_EXTENT_NO_DIRECT == 0x40000000);
+	CLASSERT(FIEMAP_EXTENT_NET == 0x80000000);
+
+	/* Checks for type posix_acl_xattr_entry */
+	LASSERTF((int)sizeof(posix_acl_xattr_entry) == 8, "found %lld\n",
+		 (long long)(int)sizeof(posix_acl_xattr_entry));
+	LASSERTF((int)offsetof(posix_acl_xattr_entry, e_tag) == 0, "found %lld\n",
+		 (long long)(int)offsetof(posix_acl_xattr_entry, e_tag));
+	LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_tag) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_tag));
+	LASSERTF((int)offsetof(posix_acl_xattr_entry, e_perm) == 2, "found %lld\n",
+		 (long long)(int)offsetof(posix_acl_xattr_entry, e_perm));
+	LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_perm) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_perm));
+	LASSERTF((int)offsetof(posix_acl_xattr_entry, e_id) == 4, "found %lld\n",
+		 (long long)(int)offsetof(posix_acl_xattr_entry, e_id));
+	LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_id));
+
+	/* Checks for type posix_acl_xattr_header */
+	LASSERTF((int)sizeof(posix_acl_xattr_header) == 4, "found %lld\n",
+		 (long long)(int)sizeof(posix_acl_xattr_header));
+	LASSERTF((int)offsetof(posix_acl_xattr_header, a_version) == 0, "found %lld\n",
+		 (long long)(int)offsetof(posix_acl_xattr_header, a_version));
+	LASSERTF((int)sizeof(((posix_acl_xattr_header *)0)->a_version) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((posix_acl_xattr_header *)0)->a_version));
+	LASSERTF((int)offsetof(posix_acl_xattr_header, a_entries) == 4, "found %lld\n",
+		 (long long)(int)offsetof(posix_acl_xattr_header, a_entries));
+	LASSERTF((int)sizeof(((posix_acl_xattr_header *)0)->a_entries) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((posix_acl_xattr_header *)0)->a_entries));
+
+	/* Checks for struct link_ea_header */
+	LASSERTF((int)sizeof(struct link_ea_header) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct link_ea_header));
+	LASSERTF((int)offsetof(struct link_ea_header, leh_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_header, leh_magic));
+	LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_header *)0)->leh_magic));
+	LASSERTF((int)offsetof(struct link_ea_header, leh_reccount) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_header, leh_reccount));
+	LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_reccount) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_header *)0)->leh_reccount));
+	LASSERTF((int)offsetof(struct link_ea_header, leh_len) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_header, leh_len));
+	LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_len) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_header *)0)->leh_len));
+	LASSERTF((int)offsetof(struct link_ea_header, padding1) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_header, padding1));
+	LASSERTF((int)sizeof(((struct link_ea_header *)0)->padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_header *)0)->padding1));
+	LASSERTF((int)offsetof(struct link_ea_header, padding2) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_header, padding2));
+	LASSERTF((int)sizeof(((struct link_ea_header *)0)->padding2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_header *)0)->padding2));
+	CLASSERT(LINK_EA_MAGIC == 0x11EAF1DFUL);
+
+	/* Checks for struct link_ea_entry */
+	LASSERTF((int)sizeof(struct link_ea_entry) == 18, "found %lld\n",
+		 (long long)(int)sizeof(struct link_ea_entry));
+	LASSERTF((int)offsetof(struct link_ea_entry, lee_reclen) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_entry, lee_reclen));
+	LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_reclen) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_reclen));
+	LASSERTF((int)offsetof(struct link_ea_entry, lee_parent_fid) == 2, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_entry, lee_parent_fid));
+	LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid));
+	LASSERTF((int)offsetof(struct link_ea_entry, lee_name) == 18, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_entry, lee_name));
+	LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_name) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_name));
+
+	/* Checks for struct layout_intent */
+	LASSERTF((int)sizeof(struct layout_intent) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct layout_intent));
+	LASSERTF((int)offsetof(struct layout_intent, li_opc) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct layout_intent, li_opc));
+	LASSERTF((int)sizeof(((struct layout_intent *)0)->li_opc) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct layout_intent *)0)->li_opc));
+	LASSERTF((int)offsetof(struct layout_intent, li_flags) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct layout_intent, li_flags));
+	LASSERTF((int)sizeof(((struct layout_intent *)0)->li_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct layout_intent *)0)->li_flags));
+	LASSERTF((int)offsetof(struct layout_intent, li_start) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct layout_intent, li_start));
+	LASSERTF((int)sizeof(((struct layout_intent *)0)->li_start) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct layout_intent *)0)->li_start));
+	LASSERTF((int)offsetof(struct layout_intent, li_end) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct layout_intent, li_end));
+	LASSERTF((int)sizeof(((struct layout_intent *)0)->li_end) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct layout_intent *)0)->li_end));
+	LASSERTF(LAYOUT_INTENT_ACCESS == 0, "found %lld\n",
+		 (long long)LAYOUT_INTENT_ACCESS);
+	LASSERTF(LAYOUT_INTENT_READ == 1, "found %lld\n",
+		 (long long)LAYOUT_INTENT_READ);
+	LASSERTF(LAYOUT_INTENT_WRITE == 2, "found %lld\n",
+		 (long long)LAYOUT_INTENT_WRITE);
+	LASSERTF(LAYOUT_INTENT_GLIMPSE == 3, "found %lld\n",
+		 (long long)LAYOUT_INTENT_GLIMPSE);
+	LASSERTF(LAYOUT_INTENT_TRUNC == 4, "found %lld\n",
+		 (long long)LAYOUT_INTENT_TRUNC);
+	LASSERTF(LAYOUT_INTENT_RELEASE == 5, "found %lld\n",
+		 (long long)LAYOUT_INTENT_RELEASE);
+	LASSERTF(LAYOUT_INTENT_RESTORE == 6, "found %lld\n",
+		 (long long)LAYOUT_INTENT_RESTORE);
+
+	/* Checks for struct hsm_action_item */
+	LASSERTF((int)sizeof(struct hsm_action_item) == 72, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_action_item));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_len) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_len));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_len) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_len));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_action) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_action));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_action) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_action));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_fid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_fid));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_fid));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_dfid) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_dfid));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_dfid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_dfid));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_extent) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_extent));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_extent) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_extent));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_cookie) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_cookie));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_cookie) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_cookie));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_gid) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_gid));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_gid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_gid));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_data) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_data));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_data) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_data));
+
+	/* Checks for struct hsm_action_list */
+	LASSERTF((int)sizeof(struct hsm_action_list) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_action_list));
+	LASSERTF((int)offsetof(struct hsm_action_list, hal_version) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, hal_version));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_version) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_version));
+	LASSERTF((int)offsetof(struct hsm_action_list, hal_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, hal_count));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_count));
+	LASSERTF((int)offsetof(struct hsm_action_list, hal_compound_id) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, hal_compound_id));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_compound_id) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_compound_id));
+	LASSERTF((int)offsetof(struct hsm_action_list, hal_flags) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, hal_flags));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_flags) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_flags));
+	LASSERTF((int)offsetof(struct hsm_action_list, hal_archive_id) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, hal_archive_id));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_archive_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_archive_id));
+	LASSERTF((int)offsetof(struct hsm_action_list, padding1) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, padding1));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->padding1));
+	LASSERTF((int)offsetof(struct hsm_action_list, hal_fsname) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, hal_fsname));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_fsname) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_fsname));
+
+	/* Checks for struct hsm_progress */
+	LASSERTF((int)sizeof(struct hsm_progress) == 48, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_progress));
+	LASSERTF((int)offsetof(struct hsm_progress, hp_fid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, hp_fid));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->hp_fid));
+	LASSERTF((int)offsetof(struct hsm_progress, hp_cookie) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, hp_cookie));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_cookie) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->hp_cookie));
+	LASSERTF((int)offsetof(struct hsm_progress, hp_extent) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, hp_extent));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_extent) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->hp_extent));
+	LASSERTF((int)offsetof(struct hsm_progress, hp_flags) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, hp_flags));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_flags) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->hp_flags));
+	LASSERTF((int)offsetof(struct hsm_progress, hp_errval) == 42, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, hp_errval));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_errval) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->hp_errval));
+	LASSERTF((int)offsetof(struct hsm_progress, padding) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, padding));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->padding));
+	LASSERTF(HP_FLAG_COMPLETED == 0x01, "found 0x%.8x\n",
+		HP_FLAG_COMPLETED);
+	LASSERTF(HP_FLAG_RETRY == 0x02, "found 0x%.8x\n",
+		HP_FLAG_RETRY);
+
+	LASSERTF((int)offsetof(struct hsm_copy, hc_data_version) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_copy, hc_data_version));
+	LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_data_version) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_copy *)0)->hc_data_version));
+	LASSERTF((int)offsetof(struct hsm_copy, hc_flags) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_copy, hc_flags));
+	LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_flags) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_copy *)0)->hc_flags));
+	LASSERTF((int)offsetof(struct hsm_copy, hc_errval) == 10, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_copy, hc_errval));
+	LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_errval) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_copy *)0)->hc_errval));
+	LASSERTF((int)offsetof(struct hsm_copy, padding) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_copy, padding));
+	LASSERTF((int)sizeof(((struct hsm_copy *)0)->padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_copy *)0)->padding));
+	LASSERTF((int)offsetof(struct hsm_copy, hc_hai) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_copy, hc_hai));
+	LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_hai) == 72, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_copy *)0)->hc_hai));
+
+	/* Checks for struct hsm_progress_kernel */
+	LASSERTF((int)sizeof(struct hsm_progress_kernel) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_progress_kernel));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_fid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_fid));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_cookie) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_cookie));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_extent) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_extent));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_flags) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_flags));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_errval) == 42, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_errval));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding1) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding1));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_data_version) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_data_version));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding2) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding2));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding2));
+
+	/* Checks for struct hsm_user_item */
+	LASSERTF((int)sizeof(struct hsm_user_item) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_user_item));
+	LASSERTF((int)offsetof(struct hsm_user_item, hui_fid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_item, hui_fid));
+	LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_fid));
+	LASSERTF((int)offsetof(struct hsm_user_item, hui_extent) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_item, hui_extent));
+	LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_extent) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_extent));
+
+	/* Checks for struct hsm_user_state */
+	LASSERTF((int)sizeof(struct hsm_user_state) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_user_state));
+	LASSERTF((int)offsetof(struct hsm_user_state, hus_states) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_state, hus_states));
+	LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_states) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_states));
+	LASSERTF((int)offsetof(struct hsm_user_state, hus_archive_id) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_state, hus_archive_id));
+	LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_archive_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_archive_id));
+	LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_state) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_state));
+	LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state));
+	LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_action) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_action));
+	LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action));
+	LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_location) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_location));
+	LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location));
+
+	/* Checks for struct hsm_state_set */
+	LASSERTF((int)sizeof(struct hsm_state_set) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_state_set));
+	LASSERTF((int)offsetof(struct hsm_state_set, hss_valid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_state_set, hss_valid));
+	LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_valid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_valid));
+	LASSERTF((int)offsetof(struct hsm_state_set, hss_archive_id) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_state_set, hss_archive_id));
+	LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_archive_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_archive_id));
+	LASSERTF((int)offsetof(struct hsm_state_set, hss_setmask) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_state_set, hss_setmask));
+	LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_setmask) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_setmask));
+	LASSERTF((int)offsetof(struct hsm_state_set, hss_clearmask) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_state_set, hss_clearmask));
+	LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_clearmask) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_clearmask));
+
+	/* Checks for struct hsm_current_action */
+	LASSERTF((int)sizeof(struct hsm_current_action) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_current_action));
+	LASSERTF((int)offsetof(struct hsm_current_action, hca_state) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_current_action, hca_state));
+	LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_state) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_state));
+	LASSERTF((int)offsetof(struct hsm_current_action, hca_action) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_current_action, hca_action));
+	LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_action) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_action));
+	LASSERTF((int)offsetof(struct hsm_current_action, hca_location) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_current_action, hca_location));
+	LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_location) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_location));
+
+	/* Checks for struct hsm_request */
+	LASSERTF((int)sizeof(struct hsm_request) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_request));
+	LASSERTF((int)offsetof(struct hsm_request, hr_action) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_request, hr_action));
+	LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_action) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_request *)0)->hr_action));
+	LASSERTF((int)offsetof(struct hsm_request, hr_archive_id) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_request, hr_archive_id));
+	LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_archive_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_request *)0)->hr_archive_id));
+	LASSERTF((int)offsetof(struct hsm_request, hr_flags) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_request, hr_flags));
+	LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_flags) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_request *)0)->hr_flags));
+	LASSERTF((int)offsetof(struct hsm_request, hr_itemcount) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_request, hr_itemcount));
+	LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_itemcount) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_request *)0)->hr_itemcount));
+	LASSERTF((int)offsetof(struct hsm_request, hr_data_len) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_request, hr_data_len));
+	LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_data_len) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_request *)0)->hr_data_len));
+	LASSERTF(HSM_FORCE_ACTION == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)HSM_FORCE_ACTION);
+	LASSERTF(HSM_GHOST_COPY == 0x00000002UL, "found 0x%.8xUL\n",
+		(unsigned)HSM_GHOST_COPY);
+
+	/* Checks for struct hsm_user_request */
+	LASSERTF((int)sizeof(struct hsm_user_request) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_user_request));
+	LASSERTF((int)offsetof(struct hsm_user_request, hur_request) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_request, hur_request));
+	LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_request) == 24, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_request));
+	LASSERTF((int)offsetof(struct hsm_user_request, hur_user_item) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_request, hur_user_item));
+	LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_user_item) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_user_item));
+
+	/* Checks for struct update_buf */
+	LASSERTF((int)sizeof(struct update_buf) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct update_buf));
+	LASSERTF((int)offsetof(struct update_buf, ub_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct update_buf, ub_magic));
+	LASSERTF((int)sizeof(((struct update_buf *)0)->ub_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_buf *)0)->ub_magic));
+	LASSERTF((int)offsetof(struct update_buf, ub_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct update_buf, ub_count));
+	LASSERTF((int)sizeof(((struct update_buf *)0)->ub_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_buf *)0)->ub_count));
+	LASSERTF((int)offsetof(struct update_buf, ub_bufs) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct update_buf, ub_bufs));
+	LASSERTF((int)sizeof(((struct update_buf *)0)->ub_bufs) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_buf *)0)->ub_bufs));
+
+	/* Checks for struct update_reply */
+	LASSERTF((int)sizeof(struct update_reply) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct update_reply));
+	LASSERTF((int)offsetof(struct update_reply, ur_version) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct update_reply, ur_version));
+	LASSERTF((int)sizeof(((struct update_reply *)0)->ur_version) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_reply *)0)->ur_version));
+	LASSERTF((int)offsetof(struct update_reply, ur_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct update_reply, ur_count));
+	LASSERTF((int)sizeof(((struct update_reply *)0)->ur_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_reply *)0)->ur_count));
+	LASSERTF((int)offsetof(struct update_reply, ur_lens) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct update_reply, ur_lens));
+	LASSERTF((int)sizeof(((struct update_reply *)0)->ur_lens) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_reply *)0)->ur_lens));
+
+	/* Checks for struct update */
+	LASSERTF((int)sizeof(struct update) == 56, "found %lld\n",
+		 (long long)(int)sizeof(struct update));
+	LASSERTF((int)offsetof(struct update, u_type) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct update, u_type));
+	LASSERTF((int)sizeof(((struct update *)0)->u_type) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update *)0)->u_type));
+	LASSERTF((int)offsetof(struct update, u_batchid) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct update, u_batchid));
+	LASSERTF((int)sizeof(((struct update *)0)->u_batchid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update *)0)->u_batchid));
+	LASSERTF((int)offsetof(struct update, u_fid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct update, u_fid));
+	LASSERTF((int)sizeof(((struct update *)0)->u_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct update *)0)->u_fid));
+	LASSERTF((int)offsetof(struct update, u_lens) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct update, u_lens));
+	LASSERTF((int)sizeof(((struct update *)0)->u_lens) == 32, "found %lld\n",
+		 (long long)(int)sizeof(((struct update *)0)->u_lens));
+	LASSERTF((int)offsetof(struct update, u_bufs) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct update, u_bufs));
+	LASSERTF((int)sizeof(((struct update *)0)->u_bufs) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct update *)0)->u_bufs));
+}